Skip to content

Commit

Permalink
Change mutex to use logical_device_id (#416)
Browse files Browse the repository at this point in the history
### Issue
Related to #418

### Description
This makes working with this part of the code easier. Both
logical_chip_id and pci_device_id are fixed on a specific system, so there
is no functional change here.
The motivation is that this change makes it easier to work with and refactor
the locking mechanism.

### List of the changes
- Changed creation of mutexes to use logical_device_id
- Changed the way mutexes are fetched to use logical_device_id

### Testing
CI testing should be enough

### API Changes
There are no API changes in this PR.
  • Loading branch information
broskoTT authored Dec 18, 2024
1 parent e454c69 commit d54ffc7
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 22 deletions.
4 changes: 2 additions & 2 deletions device/api/umd/device/cluster.h
Original file line number Diff line number Diff line change
Expand Up @@ -633,7 +633,7 @@ class Cluster : public tt_device {
const uint32_t& num_host_mem_ch_per_mmio_device,
const bool skip_driver_allocs,
const bool clean_system_resources);
void initialize_interprocess_mutexes(int pci_interface_id, bool cleanup_mutexes_in_shm);
void initialize_interprocess_mutexes(int logical_device_id, bool cleanup_mutexes_in_shm);
void cleanup_shared_host_state();
void initialize_pcie_devices();
void broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const TensixSoftResetOptions& cores);
Expand Down Expand Up @@ -747,7 +747,7 @@ class Cluster : public tt_device {
std::map<chip_id_t, std::unordered_map<int32_t, uint64_t>> tlb_config_map = {};
std::unordered_map<chip_id_t, std::unordered_map<tt_xy_pair, std::int32_t>> map_core_to_tlb_per_chip = {};

std::shared_ptr<boost::interprocess::named_mutex> get_mutex(const std::string& tlb_name, int pci_interface_id);
std::shared_ptr<boost::interprocess::named_mutex> get_mutex(const std::string& tlb_name, int logical_device_id);
virtual uint32_t get_harvested_noc_rows_for_chip(
int logical_device_id); // Returns one-hot encoded harvesting mask for PCIe mapped chips
void generate_tensix_broadcast_grids_for_grayskull(
Expand Down
37 changes: 17 additions & 20 deletions device/cluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ bool Cluster::is_tlb_mapped(tt_cxy_pair target, uint64_t address, uint32_t size_
address_in_tlb_space(address, size_in_bytes, tlb_index, std::get<1>(tlb_description.value()), target.chip);
}

void Cluster::initialize_interprocess_mutexes(int pci_interface_id, bool cleanup_mutexes_in_shm) {
void Cluster::initialize_interprocess_mutexes(int logical_device_id, bool cleanup_mutexes_in_shm) {
// These mutexes are named after logical_device_id (not the physical pci interface id). Set these up ahead of time here
// (during device init) since it's unsafe to modify shared state during multithreaded runtime. cleanup_mutexes_in_shm
// is tied to clean_system_resources from the constructor. The main process is responsible for initializing the
Expand All @@ -192,7 +192,7 @@ void Cluster::initialize_interprocess_mutexes(int pci_interface_id, bool cleanup

// Initialize Dynamic TLB mutexes
for (auto& tlb : dynamic_tlb_config) {
mutex_name = tlb.first + std::to_string(pci_interface_id);
mutex_name = tlb.first + std::to_string(logical_device_id);
if (cleanup_mutexes_in_shm) {
named_mutex::remove(mutex_name.c_str());
}
Expand All @@ -201,15 +201,15 @@ void Cluster::initialize_interprocess_mutexes(int pci_interface_id, bool cleanup
}

// Initialize ARC core mutex
mutex_name = fmt::format("ARC_MSG{}", pci_interface_id);
mutex_name = fmt::format("ARC_MSG{}", logical_device_id);
if (cleanup_mutexes_in_shm) {
named_mutex::remove(mutex_name.c_str());
}
hardware_resource_mutex_map[mutex_name] =
std::make_shared<named_mutex>(open_or_create, mutex_name.c_str(), unrestricted_permissions);

if (arch_name == tt::ARCH::WORMHOLE_B0) {
mutex_name = NON_MMIO_MUTEX_NAME + std::to_string(pci_interface_id);
mutex_name = NON_MMIO_MUTEX_NAME + std::to_string(logical_device_id);
// Initialize non-MMIO mutexes for WH devices regardless of number of chips, since these may be used for
// ethernet broadcast
if (cleanup_mutexes_in_shm) {
Expand All @@ -220,7 +220,7 @@ void Cluster::initialize_interprocess_mutexes(int pci_interface_id, bool cleanup
}

// Initialize interprocess mutexes to make host -> device memory barriers atomic
mutex_name = MEM_BARRIER_MUTEX_NAME + std::to_string(pci_interface_id);
mutex_name = MEM_BARRIER_MUTEX_NAME + std::to_string(logical_device_id);
if (cleanup_mutexes_in_shm) {
named_mutex::remove(mutex_name.c_str());
}
Expand Down Expand Up @@ -266,7 +266,7 @@ void Cluster::create_device(
pci_device->get_device_num(),
pci_device->revision_id);

initialize_interprocess_mutexes(pci_device->get_device_num(), clean_system_resources);
initialize_interprocess_mutexes(logical_device_id, clean_system_resources);

// MT: Initial BH - hugepages will fail init
// For using silicon driver without workload to query mission mode params, no need for hugepage.
Expand Down Expand Up @@ -1110,7 +1110,7 @@ void Cluster::write_device_memory(
}
} else {
const auto tlb_index = dynamic_tlb_config.at(fallback_tlb);
const scoped_lock<named_mutex> lock(*get_mutex(fallback_tlb, dev->get_pci_device()->get_device_num()));
const scoped_lock<named_mutex> lock(*get_mutex(fallback_tlb, target.chip));

while (size_in_bytes > 0) {
auto [mapped_address, tlb_size] = dev->set_dynamic_tlb(
Expand Down Expand Up @@ -1158,7 +1158,7 @@ void Cluster::read_device_memory(
log_debug(LogSiliconDriver, " read_block called with tlb_offset: {}, tlb_size: {}", tlb_offset, tlb_size);
} else {
const auto tlb_index = dynamic_tlb_config.at(fallback_tlb);
const scoped_lock<named_mutex> lock(*get_mutex(fallback_tlb, dev->get_pci_device()->get_device_num()));
const scoped_lock<named_mutex> lock(*get_mutex(fallback_tlb, target.chip));
log_debug(LogSiliconDriver, " dynamic tlb_index: {}", tlb_index);
while (size_in_bytes > 0) {
auto [mapped_address, tlb_size] = dev->set_dynamic_tlb(
Expand Down Expand Up @@ -1524,7 +1524,7 @@ int Cluster::pcie_arc_msg(

// Exclusive access for a single process at a time. Keyed by logical_device_id.
std::string msg_type = "ARC_MSG";
const scoped_lock<named_mutex> lock(*get_mutex(msg_type, tt_device->get_pci_device()->get_device_num()));
const scoped_lock<named_mutex> lock(*get_mutex(msg_type, logical_device_id));
uint32_t fw_arg = arg0 | (arg1 << 16);
int exit_code = 0;

Expand Down Expand Up @@ -1718,8 +1718,8 @@ inline TTDevice* Cluster::get_tt_device(chip_id_t device_id) const {
}

std::shared_ptr<boost::interprocess::named_mutex> Cluster::get_mutex(
const std::string& tlb_name, int pci_interface_id) {
std::string mutex_name = tlb_name + std::to_string(pci_interface_id);
const std::string& tlb_name, int logical_device_id) {
std::string mutex_name = tlb_name + std::to_string(logical_device_id);
return hardware_resource_mutex_map.at(mutex_name);
}

Expand Down Expand Up @@ -1855,8 +1855,7 @@ void Cluster::write_to_non_mmio_device(
// MUTEX ACQUIRE (NON-MMIO)
// do not locate any ethernet core reads/writes before this acquire
//
const scoped_lock<named_mutex> lock(*get_mutex(
NON_MMIO_MUTEX_NAME, this->get_tt_device(mmio_capable_chip_logical)->get_pci_device()->get_device_num()));
const scoped_lock<named_mutex> lock(*get_mutex(NON_MMIO_MUTEX_NAME, mmio_capable_chip_logical));

int& active_core_for_txn =
non_mmio_transfer_cores_customized ? active_eth_core_idx_per_chip.at(mmio_capable_chip_logical) : active_core;
Expand Down Expand Up @@ -2081,8 +2080,7 @@ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_
// MUTEX ACQUIRE (NON-MMIO)
// do not locate any ethernet core reads/writes before this acquire
//
const scoped_lock<named_mutex> lock(*get_mutex(
NON_MMIO_MUTEX_NAME, this->get_tt_device(mmio_capable_chip_logical)->get_pci_device()->get_device_num()));
const scoped_lock<named_mutex> lock(*get_mutex(NON_MMIO_MUTEX_NAME, mmio_capable_chip_logical));
const tt_cxy_pair remote_transfer_ethernet_core = remote_transfer_ethernet_cores[mmio_capable_chip_logical].at(0);

read_device_memory(
Expand Down Expand Up @@ -2511,7 +2509,7 @@ void Cluster::pcie_broadcast_write(
TTDevice* tt_device = get_tt_device(chip);
const auto tlb_index = dynamic_tlb_config.at(fallback_tlb);
const uint8_t* buffer_addr = static_cast<const uint8_t*>(mem_ptr);
const scoped_lock<named_mutex> lock(*get_mutex(fallback_tlb, tt_device->get_pci_device()->get_device_num()));
const scoped_lock<named_mutex> lock(*get_mutex(fallback_tlb, chip));
while (size_in_bytes > 0) {
auto [mapped_address, tlb_size] = tt_device->set_dynamic_tlb_broadcast(
tlb_index,
Expand Down Expand Up @@ -2880,8 +2878,7 @@ void Cluster::insert_host_to_device_barrier(
const uint32_t barrier_addr,
const std::string& fallback_tlb) {
// Ensure that this memory barrier is atomic across processes/threads
const scoped_lock<named_mutex> lock(
*get_mutex(MEM_BARRIER_MUTEX_NAME, this->get_tt_device(chip)->get_pci_device()->get_device_num()));
const scoped_lock<named_mutex> lock(*get_mutex(MEM_BARRIER_MUTEX_NAME, chip));
set_membar_flag(chip, cores, tt_MemBarFlag::SET, barrier_addr, fallback_tlb);
set_membar_flag(chip, cores, tt_MemBarFlag::RESET, barrier_addr, fallback_tlb);
}
Expand Down Expand Up @@ -2995,7 +2992,7 @@ void Cluster::read_mmio_device_register(
TTDevice* tt_device = get_tt_device(core.chip);

const auto tlb_index = dynamic_tlb_config.at(fallback_tlb);
const scoped_lock<named_mutex> lock(*get_mutex(fallback_tlb, tt_device->get_pci_device()->get_device_num()));
const scoped_lock<named_mutex> lock(*get_mutex(fallback_tlb, core.chip));
log_debug(LogSiliconDriver, " dynamic tlb_index: {}", tlb_index);

auto [mapped_address, tlb_size] = tt_device->set_dynamic_tlb(
Expand All @@ -3015,7 +3012,7 @@ void Cluster::write_mmio_device_register(
TTDevice* tt_device = get_tt_device(core.chip);

const auto tlb_index = dynamic_tlb_config.at(fallback_tlb);
const scoped_lock<named_mutex> lock(*get_mutex(fallback_tlb, tt_device->get_pci_device()->get_device_num()));
const scoped_lock<named_mutex> lock(*get_mutex(fallback_tlb, core.chip));
log_debug(LogSiliconDriver, " dynamic tlb_index: {}", tlb_index);

auto [mapped_address, tlb_size] = tt_device->set_dynamic_tlb(
Expand Down

0 comments on commit d54ffc7

Please sign in to comment.