/* core_descriptor.hpp: the static caches behind get_logical_storage_cores, get_logical_compute_cores and get_logical_dispatch_cores were keyed by device_id only; the added lines key them by device_id and then by num_hw_cqs, so each (device_id, num_hw_cqs) pair gets its own cached vector. */
diff --git a/tt_metal/common/core_descriptor.hpp b/tt_metal/common/core_descriptor.hpp index 90970df39630..c32c553b4f2f 100644 --- a/tt_metal/common/core_descriptor.hpp +++ b/tt_metal/common/core_descriptor.hpp @@ -88,12 +88,12 @@ inline uint32_t get_l1_bank_size(chip_id_t device_id, const uint8_t num_hw_cqs) inline const std::vector &get_logical_storage_cores(chip_id_t device_id, const uint8_t num_hw_cqs) { const core_descriptor_t &core_desc = get_core_descriptor_config(device_id, num_hw_cqs); - static std::unordered_map> logical_storage_cores_by_device; - if (logical_storage_cores_by_device.count(device_id)) { - return logical_storage_cores_by_device.at(device_id); + static std::unordered_map>> logical_storage_cores_by_device; /* outer key: device_id, inner key: num_hw_cqs */ + if (logical_storage_cores_by_device[device_id].count(num_hw_cqs)) { /* operator[] creates an empty inner map the first time a device_id is seen */ + return logical_storage_cores_by_device.at(device_id).at(num_hw_cqs); } CoreCoord grid_size = tt::Cluster::instance().get_soc_desc(device_id).worker_grid_size; - std::vector &logical_storage_cores = logical_storage_cores_by_device[device_id]; + std::vector &logical_storage_cores = logical_storage_cores_by_device[device_id][num_hw_cqs]; /* creates the empty cache entry that std::transform fills through back_inserter */ std::transform(core_desc.relative_storage_cores.cbegin(), core_desc.relative_storage_cores.cend(), std::back_inserter(logical_storage_cores), [&grid_size](RelativeCoreCoord rel_coord) { return get_core_coord_from_relative(rel_coord, grid_size); }); return logical_storage_cores; @@ -106,12 +106,12 @@ inline CoreCoord get_compute_grid_size(chip_id_t device_id, const uint8_t num_hw inline const std::vector &get_logical_compute_cores(chip_id_t device_id, const uint8_t num_hw_cqs) { const core_descriptor_t &core_desc = get_core_descriptor_config(device_id, num_hw_cqs); - static std::unordered_map> logical_compute_cores_by_device; - if (logical_compute_cores_by_device.count(device_id)) { - return logical_compute_cores_by_device.at(device_id); + static std::unordered_map>> logical_compute_cores_by_device; /* outer key: device_id, inner key: num_hw_cqs */ + if 
(logical_compute_cores_by_device[device_id].count(num_hw_cqs)) { + return logical_compute_cores_by_device.at(device_id).at(num_hw_cqs); } CoreCoord grid_size = tt::Cluster::instance().get_soc_desc(device_id).worker_grid_size; - std::vector &logical_compute_cores = logical_compute_cores_by_device[device_id]; + std::vector &logical_compute_cores = logical_compute_cores_by_device[device_id][num_hw_cqs]; /* empty entry for this (device_id, num_hw_cqs), filled by std::transform */ std::transform(core_desc.relative_compute_cores.cbegin(), core_desc.relative_compute_cores.cend(), std::back_inserter(logical_compute_cores), [&grid_size](RelativeCoreCoord rel_coord) { return get_core_coord_from_relative(rel_coord, grid_size); }); return logical_compute_cores; @@ -119,12 +119,12 @@ inline const std::vector &get_logical_compute_cores(chip_id_t device_ inline const std::vector &get_logical_dispatch_cores(chip_id_t device_id, const uint8_t num_hw_cqs) { const core_descriptor_t &core_desc = get_core_descriptor_config(device_id, num_hw_cqs); - static std::unordered_map> logical_dispatch_cores_by_device; - if (logical_dispatch_cores_by_device.count(device_id)) { - return logical_dispatch_cores_by_device.at(device_id); + static std::unordered_map>> logical_dispatch_cores_by_device; /* outer key: device_id, inner key: num_hw_cqs */ + if (logical_dispatch_cores_by_device[device_id].count(num_hw_cqs)) { + return logical_dispatch_cores_by_device.at(device_id).at(num_hw_cqs); } CoreCoord grid_size = tt::Cluster::instance().get_soc_desc(device_id).worker_grid_size; - std::vector &logical_dispatch_cores = logical_dispatch_cores_by_device[device_id]; + std::vector &logical_dispatch_cores = logical_dispatch_cores_by_device[device_id][num_hw_cqs]; /* empty entry for this (device_id, num_hw_cqs), filled by std::transform */ std::transform(core_desc.relative_dispatch_cores.cbegin(), core_desc.relative_dispatch_cores.cend(), std::back_inserter(logical_dispatch_cores), [&grid_size](RelativeCoreCoord rel_coord) { return get_core_coord_from_relative(rel_coord, grid_size); }); return logical_dispatch_cores; @@ -132,11 +132,11 @@ inline const std::vector &get_logical_dispatch_cores(chip_id_t device 
inline const CoreType get_dispatch_core_type(chip_id_t device_id, const uint8_t num_hw_cqs) { const core_descriptor_t &core_desc = get_core_descriptor_config(device_id, num_hw_cqs); - static std::unordered_map dispatch_core_type_by_device; - if (dispatch_core_type_by_device.count(device_id)) { - return dispatch_core_type_by_device.at(device_id); + static std::unordered_map> dispatch_core_type_by_device; /* outer key: device_id, inner key: num_hw_cqs */ + if (dispatch_core_type_by_device[device_id].count(num_hw_cqs)) { + return dispatch_core_type_by_device.at(device_id).at(num_hw_cqs); } - dispatch_core_type_by_device[device_id] = core_desc.dispatch_core_type; + dispatch_core_type_by_device[device_id][num_hw_cqs] = core_desc.dispatch_core_type; return core_desc.dispatch_core_type; } diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index 95f3c63582c8..74643e065e12 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -31,8 +31,6 @@ Device::Device( chip_id_t device_id, const uint8_t num_hw_cqs, size_t l1_small_size, size_t trace_region_size, const std::vector &l1_bank_remap, bool minimal, uint32_t worker_core) : id_(device_id), worker_thread_core(worker_core), work_executor(worker_core, device_id) { ZoneScoped; - TT_ASSERT(num_hw_cqs > 0 and num_hw_cqs <= Device::max_num_hw_cqs, "num_hw_cqs can be between 1 and {}", Device::max_num_hw_cqs); - this->build_key_ = tt::Cluster::instance().get_harvesting_mask(device_id); tunnel_device_dispatch_workers_ = {}; this->initialize(num_hw_cqs, l1_small_size, trace_region_size, l1_bank_remap, minimal); } @@ -1606,10 +1604,11 @@ void Device::initialize_synchronous_sw_cmd_queue() { bool Device::initialize(const uint8_t num_hw_cqs, size_t l1_small_size, size_t trace_region_size, const std::vector &l1_bank_remap, bool minimal) { ZoneScoped; log_info(tt::LogMetal, "Initializing device {}. Program cache is {}enabled", this->id_, this->program_cache.is_enabled() ? 
"": "NOT "); - TT_ASSERT(num_hw_cqs > 0 and num_hw_cqs < 3, "num_hw_cqs can be between 1 and 2"); + TT_FATAL(num_hw_cqs > 0 and num_hw_cqs <= Device::max_num_hw_cqs, "num_hw_cqs can be between 1 and {}", Device::max_num_hw_cqs); /* checked here because both the constructor and DevicePool re-initialization call initialize() */ this->using_fast_dispatch = false; - this->build_key_ = tt::Cluster::instance().get_harvesting_mask(this->id()); this->num_hw_cqs_ = num_hw_cqs; + constexpr uint32_t harvesting_map_bits = 12; /* num_hw_cqs is packed above the low 12 bits of the build key; assumes the harvesting mask fits in those 12 bits, TODO confirm */ + this->build_key_ = (static_cast<uint32_t>(this->num_hw_cqs_) << harvesting_map_bits) | tt::Cluster::instance().get_harvesting_mask(this->id()); this->initialize_cluster(); this->initialize_allocator(l1_small_size, trace_region_size, l1_bank_remap); this->initialize_build(); diff --git a/tt_metal/impl/device/device_pool.cpp b/tt_metal/impl/device/device_pool.cpp index 6a543a2f7e73..59b163c69ca2 100644 --- a/tt_metal/impl/device/device_pool.cpp +++ b/tt_metal/impl/device/device_pool.cpp @@ -136,13 +136,20 @@ void DevicePool::activate_device(chip_id_t id) { int core_assigned_to_device = this->device_to_core_map.at(id); auto dev = new Device(id, this->num_hw_cqs, this->l1_small_size, this->trace_region_size, this->l1_bank_remap, false, core_assigned_to_device); - dev->build_firmware(); + if (!this->firmware_built_keys.contains(dev->build_key())) { /* build firmware only once per build key */ + dev->build_firmware(); + this->firmware_built_keys.insert(dev->build_key()); /* record the key so this guard can skip the rebuild for later devices with the same key */ + } this->devices[id] = std::unique_ptr(dev); } else { const auto& dev = this->devices[id]; - log_debug(tt::LogMetal, "DevicePool re-initialize device {}", id); + log_info(tt::LogMetal, "DevicePool re-initialize device {}", id); if (not dev->is_initialized()) { dev->initialize(num_hw_cqs, this->l1_small_size, this->trace_region_size, this->l1_bank_remap); + if (!this->firmware_built_keys.contains(dev->build_key())) { /* initialize() recomputes the build key, so check it again here */ + dev->build_firmware(); + this->firmware_built_keys.insert(dev->build_key()); + } } else { TT_THROW("Cannot re-initialize device {}, must first call close()", id); } diff --git a/tt_metal/impl/device/device_pool.hpp b/tt_metal/impl/device/device_pool.hpp index 70c76128e130..564d9876aa08 100644 --- a/tt_metal/impl/device/device_pool.hpp 
+++ b/tt_metal/impl/device/device_pool.hpp @@ -72,6 +72,7 @@ class DevicePool { std::mutex lock; std::vector> devices; bool skip_remote_devices; + std::unordered_set firmware_built_keys; /* build keys that activate_device checks before calling build_firmware() */ // Determine which CPU cores the worker threads need to be placed on for each device std::unordered_map device_to_core_map; diff --git a/tt_metal/impl/dispatch/dispatch_core_manager.hpp b/tt_metal/impl/dispatch/dispatch_core_manager.hpp index a184c1083c49..97b361d886c8 100644 --- a/tt_metal/impl/dispatch/dispatch_core_manager.hpp +++ b/tt_metal/impl/dispatch/dispatch_core_manager.hpp @@ -84,8 +84,13 @@ class dispatch_core_manager { // Ugly to accept num HW CQs here but it is needed to pull the correct number of initially available dispatch cores for assignment static dispatch_core_manager &get(uint8_t num_hw_cqs) { - static dispatch_core_manager inst = dispatch_core_manager(num_hw_cqs); - return inst; + static std::unordered_map> dispatch_core_managers; /* one lazily created manager per num_hw_cqs value */ + auto &manager = dispatch_core_managers[num_hw_cqs]; /* single lookup; operator[] default-inserts a null pointer for a new key */ + if (manager == nullptr) { + // Need to do this since dispatch_core_manager constructor is private + manager.reset(new dispatch_core_manager(num_hw_cqs)); + } + return *manager; /* NOTE(review): this map is read and written without a lock in this function; confirm callers serialize access */ } /// @brief Gets the location of the kernel desginated to read from the issue queue region from a particular command queue