diff --git a/tt_metal/common/core_descriptor.hpp b/tt_metal/common/core_descriptor.hpp
index 90970df39630..c32c553b4f2f 100644
--- a/tt_metal/common/core_descriptor.hpp
+++ b/tt_metal/common/core_descriptor.hpp
@@ -88,12 +88,12 @@ inline uint32_t get_l1_bank_size(chip_id_t device_id, const uint8_t num_hw_cqs)
 
 inline const std::vector<CoreCoord> &get_logical_storage_cores(chip_id_t device_id, const uint8_t num_hw_cqs) {
     const core_descriptor_t &core_desc = get_core_descriptor_config(device_id, num_hw_cqs);
-    static std::unordered_map<chip_id_t, std::vector<CoreCoord>> logical_storage_cores_by_device;
-    if (logical_storage_cores_by_device.count(device_id)) {
-        return logical_storage_cores_by_device.at(device_id);
+    static std::unordered_map<chip_id_t, std::unordered_map<uint8_t, std::vector<CoreCoord>>> logical_storage_cores_by_device;  // cache keyed by device id, then by num_hw_cqs
+    if (auto &cached_by_num_cqs = logical_storage_cores_by_device[device_id]; cached_by_num_cqs.count(num_hw_cqs)) {
+        return cached_by_num_cqs.at(num_hw_cqs);  // already computed for this (device, num_hw_cqs) pair
     }
     CoreCoord grid_size = tt::Cluster::instance().get_soc_desc(device_id).worker_grid_size;
-    std::vector<CoreCoord> &logical_storage_cores = logical_storage_cores_by_device[device_id];
+    auto &logical_storage_cores = logical_storage_cores_by_device[device_id][num_hw_cqs];  // fresh empty slot, filled by the transform below
     std::transform(core_desc.relative_storage_cores.cbegin(), core_desc.relative_storage_cores.cend(), std::back_inserter(logical_storage_cores),
                 [&grid_size](RelativeCoreCoord rel_coord) { return get_core_coord_from_relative(rel_coord, grid_size); });
     return logical_storage_cores;
@@ -106,12 +106,12 @@ inline CoreCoord get_compute_grid_size(chip_id_t device_id, const uint8_t num_hw
 
 inline const std::vector<CoreCoord> &get_logical_compute_cores(chip_id_t device_id, const uint8_t num_hw_cqs) {
     const core_descriptor_t &core_desc = get_core_descriptor_config(device_id, num_hw_cqs);
-    static std::unordered_map<chip_id_t, std::vector<CoreCoord>> logical_compute_cores_by_device;
-    if (logical_compute_cores_by_device.count(device_id)) {
-        return logical_compute_cores_by_device.at(device_id);
+    static std::unordered_map<chip_id_t, std::unordered_map<uint8_t, std::vector<CoreCoord>>> logical_compute_cores_by_device;  // cache keyed by device id, then by num_hw_cqs
+    if (auto &cached_by_num_cqs = logical_compute_cores_by_device[device_id]; cached_by_num_cqs.count(num_hw_cqs)) {
+        return cached_by_num_cqs.at(num_hw_cqs);  // already computed for this (device, num_hw_cqs) pair
     }
     CoreCoord grid_size = tt::Cluster::instance().get_soc_desc(device_id).worker_grid_size;
-    std::vector<CoreCoord> &logical_compute_cores = logical_compute_cores_by_device[device_id];
+    auto &logical_compute_cores = logical_compute_cores_by_device[device_id][num_hw_cqs];  // fresh empty slot, filled by the transform below
     std::transform(core_desc.relative_compute_cores.cbegin(), core_desc.relative_compute_cores.cend(), std::back_inserter(logical_compute_cores),
                 [&grid_size](RelativeCoreCoord rel_coord) { return get_core_coord_from_relative(rel_coord, grid_size); });
     return logical_compute_cores;
@@ -119,12 +119,12 @@ inline const std::vector<CoreCoord> &get_logical_compute_cores(chip_id_t device_
 
 inline const std::vector<CoreCoord> &get_logical_dispatch_cores(chip_id_t device_id, const uint8_t num_hw_cqs) {
     const core_descriptor_t &core_desc = get_core_descriptor_config(device_id, num_hw_cqs);
-    static std::unordered_map<chip_id_t, std::vector<CoreCoord>> logical_dispatch_cores_by_device;
-    if (logical_dispatch_cores_by_device.count(device_id)) {
-        return logical_dispatch_cores_by_device.at(device_id);
+    static std::unordered_map<chip_id_t, std::unordered_map<uint8_t, std::vector<CoreCoord>>> logical_dispatch_cores_by_device;  // cache keyed by device id, then by num_hw_cqs
+    if (auto &cached_by_num_cqs = logical_dispatch_cores_by_device[device_id]; cached_by_num_cqs.count(num_hw_cqs)) {
+        return cached_by_num_cqs.at(num_hw_cqs);  // already computed for this (device, num_hw_cqs) pair
     }
     CoreCoord grid_size = tt::Cluster::instance().get_soc_desc(device_id).worker_grid_size;
-    std::vector<CoreCoord> &logical_dispatch_cores = logical_dispatch_cores_by_device[device_id];
+    auto &logical_dispatch_cores = logical_dispatch_cores_by_device[device_id][num_hw_cqs];  // fresh empty slot, filled by the transform below
     std::transform(core_desc.relative_dispatch_cores.cbegin(), core_desc.relative_dispatch_cores.cend(), std::back_inserter(logical_dispatch_cores),
                 [&grid_size](RelativeCoreCoord rel_coord) { return get_core_coord_from_relative(rel_coord, grid_size); });
     return logical_dispatch_cores;
@@ -132,11 +132,11 @@ inline const std::vector<CoreCoord> &get_logical_dispatch_cores(chip_id_t device
 
 inline const CoreType get_dispatch_core_type(chip_id_t device_id, const uint8_t num_hw_cqs) {
     const core_descriptor_t &core_desc = get_core_descriptor_config(device_id, num_hw_cqs);
-    static std::unordered_map<chip_id_t, CoreType> dispatch_core_type_by_device;
-    if (dispatch_core_type_by_device.count(device_id)) {
-        return dispatch_core_type_by_device.at(device_id);
+    static std::unordered_map<chip_id_t, std::unordered_map<uint8_t, CoreType>> dispatch_core_type_by_device;  // cache keyed by device id, then by num_hw_cqs
+    if (auto &cached_by_num_cqs = dispatch_core_type_by_device[device_id]; cached_by_num_cqs.count(num_hw_cqs)) {
+        return cached_by_num_cqs.at(num_hw_cqs);  // already recorded for this (device, num_hw_cqs) pair
     }
-    dispatch_core_type_by_device[device_id] = core_desc.dispatch_core_type;
+    dispatch_core_type_by_device[device_id].emplace(num_hw_cqs, core_desc.dispatch_core_type);  // key is absent here, so this always stores
     return core_desc.dispatch_core_type;
 }
 
diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp
index 95f3c63582c8..74643e065e12 100644
--- a/tt_metal/impl/device/device.cpp
+++ b/tt_metal/impl/device/device.cpp
@@ -31,8 +31,6 @@ Device::Device(
     chip_id_t device_id, const uint8_t num_hw_cqs, size_t l1_small_size, size_t trace_region_size, const std::vector<uint32_t> &l1_bank_remap, bool minimal, uint32_t worker_core) :
     id_(device_id), worker_thread_core(worker_core), work_executor(worker_core, device_id) {
     ZoneScoped;
-    TT_ASSERT(num_hw_cqs > 0 and num_hw_cqs <= Device::max_num_hw_cqs, "num_hw_cqs can be between 1 and {}", Device::max_num_hw_cqs);
-    this->build_key_ = tt::Cluster::instance().get_harvesting_mask(device_id);
     tunnel_device_dispatch_workers_ = {};
     this->initialize(num_hw_cqs, l1_small_size, trace_region_size, l1_bank_remap, minimal);
 }
@@ -1606,10 +1604,11 @@ void Device::initialize_synchronous_sw_cmd_queue() {
 bool Device::initialize(const uint8_t num_hw_cqs, size_t l1_small_size, size_t trace_region_size, const std::vector<uint32_t> &l1_bank_remap, bool minimal) {
     ZoneScoped;
     log_info(tt::LogMetal, "Initializing device {}. Program cache is {}enabled", this->id_, this->program_cache.is_enabled() ? "": "NOT ");
-    TT_ASSERT(num_hw_cqs > 0 and num_hw_cqs < 3, "num_hw_cqs can be between 1 and 2");
+    TT_FATAL(num_hw_cqs > 0 and num_hw_cqs <= Device::max_num_hw_cqs, "num_hw_cqs can be between 1 and {}", Device::max_num_hw_cqs);
     this->using_fast_dispatch = false;
-    this->build_key_ = tt::Cluster::instance().get_harvesting_mask(this->id());
     this->num_hw_cqs_ = num_hw_cqs;
+    constexpr uint32_t harvesting_map_bits = 12;  // shift applied to num_hw_cqs in the build key; assumes the harvesting mask fits below it - TODO confirm
+    this->build_key_ = (static_cast<uint32_t>(this->num_hw_cqs_) << harvesting_map_bits) | tt::Cluster::instance().get_harvesting_mask(this->id());
     this->initialize_cluster();
     this->initialize_allocator(l1_small_size, trace_region_size, l1_bank_remap);
     this->initialize_build();
diff --git a/tt_metal/impl/device/device_pool.cpp b/tt_metal/impl/device/device_pool.cpp
index 6a543a2f7e73..59b163c69ca2 100644
--- a/tt_metal/impl/device/device_pool.cpp
+++ b/tt_metal/impl/device/device_pool.cpp
@@ -136,13 +136,18 @@ void DevicePool::activate_device(chip_id_t id) {
         int core_assigned_to_device = this->device_to_core_map.at(id);
         auto dev =
             new Device(id, this->num_hw_cqs, this->l1_small_size, this->trace_region_size, this->l1_bank_remap, false, core_assigned_to_device);
-        dev->build_firmware();
+        if (this->firmware_built_keys.insert(dev->build_key()).second) {  // true only for a key not seen before; insert() also records it so later devices skip the build
+            dev->build_firmware();
+        }
         this->devices[id] = std::unique_ptr<Device>(dev);
     } else {
         const auto& dev = this->devices[id];
-        log_debug(tt::LogMetal, "DevicePool re-initialize device {}", id);
+        log_info(tt::LogMetal, "DevicePool re-initialize device {}", id);
         if (not dev->is_initialized()) {
             dev->initialize(num_hw_cqs, this->l1_small_size, this->trace_region_size, this->l1_bank_remap);
+            if (this->firmware_built_keys.insert(dev->build_key()).second) {  // initialize() recomputes the build key; build only when that key is new
+                dev->build_firmware();
+            }
         } else {
             TT_THROW("Cannot re-initialize device {}, must first call close()", id);
         }
diff --git a/tt_metal/impl/device/device_pool.hpp b/tt_metal/impl/device/device_pool.hpp
index 70c76128e130..564d9876aa08 100644
--- a/tt_metal/impl/device/device_pool.hpp
+++ b/tt_metal/impl/device/device_pool.hpp
@@ -72,6 +72,7 @@ class DevicePool {
     std::mutex lock;
     std::vector<std::unique_ptr<Device>> devices;
     bool skip_remote_devices;
+    std::unordered_set<uint32_t> firmware_built_keys;  // Device build keys; activate_device() consults this set to decide whether to call build_firmware()
 
     // Determine which CPU cores the worker threads need to be placed on for each device
     std::unordered_map<uint32_t, uint32_t> device_to_core_map;
diff --git a/tt_metal/impl/dispatch/dispatch_core_manager.hpp b/tt_metal/impl/dispatch/dispatch_core_manager.hpp
index a184c1083c49..97b361d886c8 100644
--- a/tt_metal/impl/dispatch/dispatch_core_manager.hpp
+++ b/tt_metal/impl/dispatch/dispatch_core_manager.hpp
@@ -84,8 +84,12 @@ class dispatch_core_manager {
 
     // Ugly to accept num HW CQs here but it is needed to pull the correct number of initially available dispatch cores for assignment
     static dispatch_core_manager &get(uint8_t num_hw_cqs) {
-        static dispatch_core_manager inst = dispatch_core_manager(num_hw_cqs);
-        return inst;
+        static std::unordered_map<uint8_t, std::unique_ptr<dispatch_core_manager>> dispatch_core_managers;  // one manager per num_hw_cqs value
+        std::unique_ptr<dispatch_core_manager> &manager = dispatch_core_managers[num_hw_cqs];  // default-inserts a null slot on first use
+        if (manager == nullptr) {
+            manager.reset(new dispatch_core_manager(num_hw_cqs));  // constructor is private, so the object is created with new here
+        }
+        return *manager;
     }
 
     /// @brief Gets the location of the kernel desginated to read from the issue queue region from a particular command queue