Skip to content

Commit

Permalink
Fix bug when calling CreateDevice in a loop on TG
Browse files Browse the repository at this point in the history
  • Loading branch information
aliuTT committed Dec 23, 2024
1 parent 91e61c0 commit bb659e2
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 10 deletions.
13 changes: 12 additions & 1 deletion tt_metal/impl/device/device_pool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -206,20 +206,31 @@ void DevicePool::initialize(

// Never skip for TG Cluster
bool skip = not tt::Cluster::instance().is_galaxy_cluster();
std::vector<chip_id_t> target_mmio_ids;
for (const auto& device_id : device_ids) {
TT_FATAL(
device_id < tt::Cluster::instance().number_of_devices(),
"Device index {} out of range. There are {} devices available.",
device_id,
tt::Cluster::instance().number_of_devices());
const auto& mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id);
if (std::find(target_mmio_ids.begin(), target_mmio_ids.end(), mmio_device_id) == target_mmio_ids.end()) {
target_mmio_ids.push_back(mmio_device_id);
}
skip &= (device_id == mmio_device_id);
}
if (target_mmio_ids.size() != tt::Cluster::instance().number_of_pci_devices()) {
log_warning(
tt::LogMetal,
"Opening subset of mmio devices slows down UMD read/write to remote chips. If opening more devices, "
"consider using CreateDevices API.");
}

_inst->skip_remote_devices = skip;

_inst->add_devices_to_pool(device_ids);
_inst->init_firmware_on_active_devices();
tt::Cluster::instance().set_internal_routing_info_for_ethernet_cores(true);
tt::Cluster::instance().set_internal_routing_info_for_ethernet_cores(true, target_mmio_ids);
_inst->init_profiler_devices();
}

Expand Down
18 changes: 10 additions & 8 deletions tt_metal/llrt/tt_cluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -973,18 +973,20 @@ std::tuple<tt_cxy_pair, tt_cxy_pair> Cluster::get_eth_tunnel_core(
}

// TODO: ALLAN Can change to write one bit
void Cluster::set_internal_routing_info_for_ethernet_cores(bool enable_internal_routing) const {
void Cluster::set_internal_routing_info_for_ethernet_cores(bool enable_internal_routing, std::vector<chip_id_t> target_mmio_devices) const {
log_debug(tt::LogDevice, "Set internal routing bit {}", enable_internal_routing);
const uint32_t routing_info_addr = eth_l1_mem::address_map::ERISC_APP_ROUTING_INFO_BASE;
// TODO: initialize devices if user does not
// Must initialize remote chips first, then mmio chips since once mmio chips are doing fd routing
// we do not always context switch to base FW
std::vector<chip_id_t> mmio_devices;
mmio_devices.reserve(this->devices_grouped_by_assoc_mmio_device_.size());
std::vector<chip_id_t> non_mmio_devices;
for (const auto &[assoc_mmio_device, devices] : this->devices_grouped_by_assoc_mmio_device_) {
mmio_devices.emplace_back(assoc_mmio_device);
for (const auto &chip_id : devices) {
if (target_mmio_devices.size() == 0) {
for (const auto &[assoc_mmio_device, devices] : this->devices_grouped_by_assoc_mmio_device_) {
target_mmio_devices.emplace_back(assoc_mmio_device);
}
}
for (const auto &mmio_chip_id : target_mmio_devices) {
for (const auto &chip_id : this->devices_grouped_by_assoc_mmio_device_.at(mmio_chip_id)) {
non_mmio_devices.emplace_back(chip_id);
}
}
Expand All @@ -1003,7 +1005,7 @@ void Cluster::set_internal_routing_info_for_ethernet_cores(bool enable_internal_
(void *)&routing_info_enabled, sizeof(routing_info_t), virtual_eth_core, routing_info_addr, false);
}
}
for (const auto &chip_id : mmio_devices) {
for (const auto &chip_id : target_mmio_devices) {
for (const auto &[eth_core, routing_info] : this->device_eth_routing_info_.at(chip_id)) {
tt_cxy_pair virtual_eth_core(chip_id, get_virtual_coordinate_from_logical_coordinates(chip_id, eth_core, CoreType::ETH));
// Enable internal ethernet routing for mmio devices
Expand All @@ -1017,7 +1019,7 @@ void Cluster::set_internal_routing_info_for_ethernet_cores(bool enable_internal_
.src_sent_valid_cmd = 0,
.dst_acked_valid_cmd = 0,
};
for (const auto &chip_id : mmio_devices) {
for (const auto &chip_id : target_mmio_devices) {
for (const auto &[eth_core, routing_info] : this->device_eth_routing_info_.at(chip_id)) {
tt_cxy_pair virtual_eth_core(chip_id, get_virtual_coordinate_from_logical_coordinates(chip_id, eth_core, CoreType::ETH));
// Disable internal ethernet routing for mmio devices
Expand Down
2 changes: 1 addition & 1 deletion tt_metal/llrt/tt_cluster.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ class Cluster {
// set_internal_routing_info_for_ethernet_cores(false);
// CloseDevice(0)
// CloseDevice(1)
void set_internal_routing_info_for_ethernet_cores(bool enable_internal_routing) const;
void set_internal_routing_info_for_ethernet_cores(bool enable_internal_routing, std::vector<chip_id_t> target_mmio_devices = {}) const;

// Returns the (logical) MMIO device ID that controls the given `device_id`. If `device_id` is itself an MMIO device, it is returned.
chip_id_t get_associated_mmio_device(chip_id_t device_id) const {
Expand Down

0 comments on commit bb659e2

Please sign in to comment.