diff --git a/tests/tt_metal/tt_metal/eth/test_ring_gather_kernels.cpp b/tests/tt_metal/tt_metal/eth/test_ring_gather_kernels.cpp
index 227adf612e5..6072e34599f 100644
--- a/tests/tt_metal/tt_metal/eth/test_ring_gather_kernels.cpp
+++ b/tests/tt_metal/tt_metal/eth/test_ring_gather_kernels.cpp
@@ -85,7 +85,8 @@ std::vector<v1::DeviceHandle> get_device_ring(std::vector<tt::tt_metal::v1::Devi
     std::vector<std::vector<int>> adj(devices.size(), std::vector<int>(devices.size(), 0));
     for (uint32_t i = 0; i < devices.size(); ++i) {
         const auto& device = devices[i];
-        for (const auto& connected_device_id : device->get_ethernet_connected_device_ids()) {
+        auto ethernet_connected_device_ids = tt::Cluster::instance().get_ethernet_connected_device_ids(device->id());
+        for (const auto& connected_device_id : ethernet_connected_device_ids) {
             for (uint32_t j = 0; j < devices.size(); ++j) {
                 if (devices[j]->id() == connected_device_id) {
                     adj[i][j] = 1;
diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp
index cb49710dc63..39ef6314e6f 100644
--- a/tt_metal/impl/device/device.cpp
+++ b/tt_metal/impl/device/device.cpp
@@ -63,6 +63,18 @@ bool Device::is_inactive_ethernet_core(CoreCoord logical_core) const {
     return inactive_ethernet_cores.find(logical_core) != inactive_ethernet_cores.end();
 }
 
+std::tuple<chip_id_t, CoreCoord> Device::get_connected_ethernet_core(CoreCoord eth_core) const {
+    return tt::Cluster::instance().get_connected_ethernet_core(std::make_tuple(this->id_, eth_core));
+}
+
+std::vector<CoreCoord> Device::get_ethernet_sockets(chip_id_t connected_chip_id) const {
+    return tt::Cluster::instance().get_ethernet_sockets(this->id_, connected_chip_id);
+}
+
+bool Device::is_mmio_capable() const {
+    return tt::Cluster::instance().get_associated_mmio_device(this->id_) == this->id_;
+}
+
 CoreRangeSet Device::worker_cores(HalProgrammableCoreType core_type, SubDeviceId sub_device_id) const {
     return this->active_sub_device_manager_->sub_device(sub_device_id).cores(core_type);
 }
@@ -3223,15 +3235,15 @@ CoreCoord Device::logical_grid_size() const {
     return tt::Cluster::instance().get_soc_desc(id_).worker_grid_size;
 }
 
+CoreCoord Device::dram_grid_size() const {
+    return tt::Cluster::instance().get_soc_desc(id_).get_dram_grid_size();
+}
+
 CoreCoord Device::compute_with_storage_grid_size() const {
     const auto &dispatch_core_config = dispatch_core_manager::instance().get_dispatch_core_config(id_);
     return tt::get_compute_grid_size(id_, num_hw_cqs_, dispatch_core_config);
 }
 
-CoreCoord Device::dram_grid_size() const {
-    return tt::Cluster::instance().get_soc_desc(id_).get_dram_grid_size();
-}
-
 CoreType Device::core_type_from_physical_core(const CoreCoord &physical_coord) const {
     const metal_SocDescriptor &soc_desc = tt::Cluster::instance().get_soc_desc(this->id_);
     if (soc_desc.physical_cores.find(physical_coord) == soc_desc.physical_cores.end())
@@ -3249,7 +3261,6 @@ CoreType Device::core_type_from_virtual_core(const CoreCoord &virtual_coord) con
     return this->core_type_from_physical_core(virtual_coord);
 }
 
-
 CoreCoord Device::virtual_noc0_coordinate(uint8_t noc_index, CoreCoord coord) const {
     if (coord.x >= this->grid_size().x || coord.y >= this->grid_size().y) {
         // Coordinate already in virtual space: NOC0 and NOC1 are the same
@@ -3304,6 +3315,7 @@ std::vector<CoreCoord> Device::ethernet_cores_from_logical_cores(const std::vect
     }
     return eth_cores;
 }
+
 CoreCoord Device::virtual_core_from_logical_core(const CoreCoord &logical_coord, const CoreType& core_type) const {
     return tt::Cluster::instance().get_virtual_coordinate_from_logical_coordinates(this->id_, logical_coord, core_type);
 }
@@ -3679,14 +3691,18 @@ void Device::enable_async(bool enable) {
 }
 
 bool Device::using_slow_dispatch() const {
-    return not (this->using_fast_dispatch_);
+    return !using_fast_dispatch();
+}
+
+bool Device::using_fast_dispatch() const {
+    return using_fast_dispatch_;
 }
 
 void Device::begin_trace(const uint8_t cq_id, const uint32_t tid) {
     ZoneScoped;
     TracyTTMetalBeginTrace(this->id(), tid);
     TT_FATAL(!this->hw_command_queues_[cq_id]->tid.has_value(), "CQ {} is already being used for tracing tid {}", (uint32_t)cq_id, tid);
-    this->MarkAllocationsSafe();
+    this->mark_allocations_safe();
     // Create an empty trace buffer here. This will get initialized in end_trace
     TT_FATAL(this->active_sub_device_manager_->get_trace(tid) == nullptr, "Trace already exists for tid {} on device {}'s active sub-device manager {}", tid, this->id_, this->active_sub_device_manager_id_);
     auto &trace_buffer = this->active_sub_device_manager_->create_trace(tid);
@@ -3701,7 +3717,7 @@ void Device::end_trace(const uint8_t cq_id, const uint32_t tid) {
     TT_FATAL(trace_buffer != nullptr, "Trace instance {} must exist on device {}'s active sub-device manager {}", tid, this->id_, this->active_sub_device_manager_id_);
     this->hw_command_queues_[cq_id]->record_end();
     Trace::initialize_buffer(this->command_queue(cq_id), trace_buffer);
-    this->MarkAllocationsUnsafe();
+    this->mark_allocations_unsafe();
 }
 
 void Device::replay_trace(const uint8_t cq_id, const uint32_t tid, const bool blocking) {
@@ -3724,7 +3740,7 @@ void Device::release_trace(const uint32_t tid) {
 
     // Only enable allocations once all captured traces are released
     if (this->trace_buffers_size_ == 0) {
-        this->MarkAllocationsSafe();
+        this->mark_allocations_safe();
     }
 }
 
@@ -3750,11 +3766,11 @@ std::size_t Device::num_program_cache_entries() {
     return program_cache_.num_entries();
 }
 
-void Device::MarkAllocationsUnsafe() {
+void Device::mark_allocations_unsafe() {
     tt::tt_metal::allocator::mark_allocations_unsafe(*this->get_initialized_allocator());
 }
 
-void Device::MarkAllocationsSafe() {
+void Device::mark_allocations_safe() {
     tt::tt_metal::allocator::mark_allocations_safe(*this->get_initialized_allocator());
 }
 
@@ -3964,6 +3980,35 @@ std::vector<CoreCoord> Device::get_optimal_dram_bank_to_logical_worker_assignmen
     return this->optimal_dram_bank_to_logical_worker_assignment_;
 }
 
+HalProgrammableCoreType Device::get_programmable_core_type(CoreCoord virtual_core) const {
+    if (!tt::Cluster::instance().is_ethernet_core(virtual_core, this->id_)) {
+        return HalProgrammableCoreType::TENSIX;
+    }
+
+    // Eth pcores have a different address, but only active ones.
+    CoreCoord logical_core = this->logical_core_from_ethernet_core(virtual_core);
+    if (this->is_active_ethernet_core(logical_core)) {
+        return HalProgrammableCoreType::ACTIVE_ETH;
+    }
+
+    return HalProgrammableCoreType::IDLE_ETH;
+}
+
+// TODO: Find a better home for this function
+// Extracts all the pairs of noc multicast encodings given a set of core ranges
+std::vector<std::pair<transfer_info_cores, uint32_t>> Device::extract_dst_noc_multicast_info(const std::vector<CoreRange>& ranges, const CoreType core_type) {
+    std::vector<std::pair<transfer_info_cores, uint32_t>> dst_noc_multicast_info;
+    dst_noc_multicast_info.reserve(ranges.size());
+    for (const CoreRange& core_range : ranges) {
+        CoreCoord virtual_start = this->virtual_core_from_logical_core(core_range.start_coord, core_type);
+        CoreCoord virtual_end = this->virtual_core_from_logical_core(core_range.end_coord, core_type);
+
+        uint32_t num_receivers = core_range.size();
+        dst_noc_multicast_info.push_back(std::make_pair(CoreRange(virtual_start, virtual_end), num_receivers));
+    }
+    return dst_noc_multicast_info;
+}
+
 
 
 size_t v1::GetNumAvailableDevices() { return tt::Cluster::instance().number_of_user_devices(); }
diff --git a/tt_metal/impl/device/device.hpp b/tt_metal/impl/device/device.hpp
index d37cfa6f038..cbd7e6b96d7 100644
--- a/tt_metal/impl/device/device.hpp
+++ b/tt_metal/impl/device/device.hpp
@@ -61,16 +61,7 @@ inline namespace v0 {
 
 // A physical PCIexpress Tenstorrent device
 class Device {
-   private:
-    static_assert(detail::SubDeviceManager::MAX_NUM_SUB_DEVICES <= dispatch_constants::DISPATCH_MESSAGE_ENTRIES, "MAX_NUM_SUB_DEVICES must be less than or equal to dispatch_constants::DISPATCH_MESSAGE_ENTRIES");
-    static constexpr uint32_t DEFAULT_NUM_SUB_DEVICES = 1;
-
-    CoreCoord physical_worker_core_from_logical_core(const CoreCoord &logical_core) const;
-    CoreCoord dram_core_from_dram_channel(uint32_t dram_channel) const;
-    CoreType core_type_from_physical_core(const CoreCoord &physical_core) const;
-    CoreCoord virtual_core_from_physical_core(const CoreCoord &physical_coord, const CoreType& core_type) const;
-
-   public:
+public:
     // friend void tt_gdb(Device* device, int chip_id, const vector<CoreCoord> cores, vector<string> ops);
     Device () = delete;
     Device(
@@ -103,67 +94,45 @@ class Device {
     bool is_initialized() const { return this->initialized_; }
 
     int num_dram_channels() const;
-
     uint32_t l1_size_per_core() const;
     uint32_t dram_size_per_channel() const;
-
     CoreCoord grid_size() const;
-
     CoreCoord logical_grid_size() const;
-
-    CoreCoord compute_with_storage_grid_size() const;
-
     CoreCoord dram_grid_size() const;
-
     CoreType core_type_from_virtual_core(const CoreCoord& virtual_coord) const;
+
     // Given a Virtual coordinate in noc_index space, get the equivalent coordinate in Virtual NOC0 space
     CoreCoord virtual_noc_coordinate(uint8_t noc_index, CoreCoord coord) const;
     // Given a coordinate in Virtual NOC0 Space, get the equivalent coordinate in Virtual noc_index space
     CoreCoord virtual_noc0_coordinate(uint8_t noc_index, CoreCoord coord) const;
+
     std::vector<CoreCoord> worker_cores_from_logical_cores(const std::vector<CoreCoord> &logical_cores) const;
     std::vector<CoreCoord> ethernet_cores_from_logical_cores(const std::vector<CoreCoord> &logical_cores) const;
     std::vector<CoreCoord> get_optimal_dram_bank_to_logical_worker_assignment();
 
     CoreCoord virtual_core_from_logical_core(const CoreCoord &logical_coord, const CoreType& core_type) const;
-
     CoreCoord worker_core_from_logical_core(const CoreCoord &logical_core) const;
 
     // Ethernet API
     CoreCoord ethernet_core_from_logical_core(const CoreCoord &logical_core) const;
     CoreCoord logical_core_from_ethernet_core(const CoreCoord &ethernet_core) const;
-
-    std::unordered_set<chip_id_t> get_ethernet_connected_device_ids() const {
-        return tt::Cluster::instance().get_ethernet_connected_device_ids(this->id_);
-    }
-
     std::unordered_set<CoreCoord> get_active_ethernet_cores(bool skip_reserved_tunnel_cores=false) const;
-
-    bool is_active_ethernet_core(CoreCoord logical_core, bool skip_reserved_tunnel_cores=false) const;
-
     std::unordered_set<CoreCoord> get_inactive_ethernet_cores() const;
-
+    bool is_active_ethernet_core(CoreCoord logical_core, bool skip_reserved_tunnel_cores=false) const;
+    std::tuple<chip_id_t, CoreCoord> get_connected_ethernet_core(CoreCoord eth_core) const;
+    std::vector<CoreCoord> get_ethernet_sockets(chip_id_t connected_chip_id) const;
     bool is_inactive_ethernet_core(CoreCoord logical_core) const;
 
+    CoreCoord compute_with_storage_grid_size() const;
+
     CoreRangeSet worker_cores(HalProgrammableCoreType core_type, SubDeviceId sub_device_id) const;
     uint32_t num_worker_cores(HalProgrammableCoreType core_type, SubDeviceId sub_device_id) const;
 
-    std::tuple<chip_id_t, CoreCoord> get_connected_ethernet_core(CoreCoord eth_core) const {
-        return tt::Cluster::instance().get_connected_ethernet_core(std::make_tuple(this->id_, eth_core));
-    }
-
-    std::vector<CoreCoord> get_ethernet_sockets(chip_id_t connected_chip_id) const {
-        return tt::Cluster::instance().get_ethernet_sockets(this->id_, connected_chip_id);
-    }
-
-    bool is_mmio_capable() const {
-        return tt::Cluster::instance().get_associated_mmio_device(this->id_) == this->id_;
-    }
-
-    void setup_tunnel_for_remote_devices();
-
-    void update_workers_build_settings(std::vector<std::vector<std::tuple<tt_cxy_pair, dispatch_worker_build_settings_t>>> &device_worker_variants);
+    const std::unique_ptr<Allocator> &get_initialized_allocator() const;
+    const std::unique_ptr<Allocator> &get_initialized_allocator(SubDeviceId sub_device_id) const;
 
-    uint32_t num_sub_devices() const;
+    DeviceAddr get_base_allocator_addr(const HalMemType &mem_type) const;
+    DeviceAddr get_base_allocator_addr(const HalMemType &mem_type, SubDeviceId sub_device_id) const;
 
     uint32_t num_banks(const BufferType &buffer_type) const;
     uint32_t num_banks(const BufferType &buffer_type, SubDeviceId sub_device_id) const;
@@ -176,9 +145,6 @@ class Device {
     CoreCoord logical_core_from_dram_channel(uint32_t dram_channel) const;
     uint32_t dram_channel_from_logical_core(const CoreCoord& logical_core) const;
 
-    const std::unique_ptr<Allocator> &get_initialized_allocator() const;
-    const std::unique_ptr<Allocator> &get_initialized_allocator(SubDeviceId sub_device_id) const;
-
     int32_t bank_offset(BufferType buffer_type, uint32_t bank_id) const;
     int32_t bank_offset(BufferType buffer_type, uint32_t bank_id, SubDeviceId sub_device_id) const;
 
@@ -188,10 +154,8 @@ class Device {
     const std::vector<uint32_t> &bank_ids_from_dram_channel(uint32_t dram_channel) const;
     const std::vector<uint32_t> &bank_ids_from_dram_channel(uint32_t dram_channel, SubDeviceId sub_device_id) const;
 
-    const std::vector<uint32_t> &bank_ids_from_logical_core(
-        BufferType buffer_type, const CoreCoord &logical_core) const;
-    const std::vector<uint32_t> &bank_ids_from_logical_core(
-        BufferType buffer_type, const CoreCoord &logical_core, SubDeviceId sub_device_id) const;
+    const std::vector<uint32_t> &bank_ids_from_logical_core(BufferType buffer_type, const CoreCoord &logical_core) const;
+    const std::vector<uint32_t> &bank_ids_from_logical_core(BufferType buffer_type, const CoreCoord &logical_core, SubDeviceId sub_device_id) const;
 
     allocator::Statistics get_memory_allocation_statistics(const BufferType &buffer_type) const;
     allocator::Statistics get_memory_allocation_statistics(const BufferType &buffer_type, SubDeviceId sub_device_id) const;
@@ -199,35 +163,31 @@ class Device {
     uint32_t get_allocator_alignment() const;
     uint32_t get_allocator_alignment(SubDeviceId sub_device_id) const;
 
+    std::optional<DeviceAddr> lowest_occupied_compute_l1_address() const;
+    std::optional<DeviceAddr> lowest_occupied_compute_l1_address(tt::stl::Span<const SubDeviceId> sub_device_ids) const;
+
     size_t get_l1_small_size() const;
     size_t get_l1_small_size(SubDeviceId sub_device_id) const;
 
-    void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out) const;
-    void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out, SubDeviceId sub_device_id) const;
+    const std::unordered_set<Buffer *> &get_allocated_buffers() const;
+    const std::unordered_set<Buffer *> &get_allocated_buffers(SubDeviceId sub_device_id) const;
 
-    // Set of logical storage only core coordinates
-    const std::set<CoreCoord> &storage_only_cores() const { return this->storage_only_cores_; }
+    void deallocate_buffers();
+    void deallocate_buffers(SubDeviceId sub_device_id);
 
-    // Set of logical dispatch core coordinates
+    void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out) const;
+    void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out, SubDeviceId sub_device_id) const;
 
     // Set of logical ethernet core coordinates
     // core.x represents connectivity to one other chip, i.e. cores with <x> all connect to same chip
     // core.y represents different channels along one <x>
     const std::set<CoreCoord> &ethernet_cores() const { return this->ethernet_cores_; }
 
+    const std::set<CoreCoord> &storage_only_cores() const { return this->storage_only_cores_; }
 
     uint32_t get_noc_unicast_encoding(uint8_t noc_index, const CoreCoord& core) const;
     uint32_t get_noc_multicast_encoding(uint8_t noc_index, const CoreRange& cores) const;
 
-    const std::unordered_set<Buffer *> &get_allocated_buffers() const;
-    const std::unordered_set<Buffer *> &get_allocated_buffers(SubDeviceId sub_device_id) const;
-
-    void deallocate_buffers();
-    void deallocate_buffers(SubDeviceId sub_device_id);
-
-    std::optional<DeviceAddr> lowest_occupied_compute_l1_address() const;
-    std::optional<DeviceAddr> lowest_occupied_compute_l1_address(tt::stl::Span<const SubDeviceId> sub_device_ids) const;
-
     // machine epsilon
     float sfpu_eps() const;
 
@@ -237,13 +197,13 @@ class Device {
     // machine inf
     float sfpu_inf() const;
 
-    void generate_device_bank_to_noc_tables();
     const JitBuildEnv& build_env() const { return this->build_env_; }
     const string build_firmware_target_path(uint32_t programmable_core, uint32_t processor_class, int i) const;
     const string build_kernel_target_path(uint32_t programmable_core, uint32_t processor_class, int i, const string& kernel_name) const;
     const JitBuildState& build_firmware_state(uint32_t programmable_core, uint32_t processor_class, int i) const;
     const JitBuildState& build_kernel_state(uint32_t programmable_core, uint32_t processor_class, int i) const;
     const JitBuildStateSubset build_kernel_states(uint32_t programmable_core, uint32_t processor_class) const;
+
     SystemMemoryManager& sysmem_manager() { return *sysmem_manager_; }
     HWCommandQueue& hw_command_queue(size_t cq_id = 0);
     CommandQueue& command_queue(size_t cq_id = 0);
@@ -254,70 +214,57 @@ class Device {
     void replay_trace(const uint8_t cq_id, const uint32_t tid, const bool blocking);
     void release_trace(const uint32_t tid);
     std::shared_ptr<TraceBuffer> get_trace(uint32_t tid);
+    uint32_t get_trace_buffers_size() const { return trace_buffers_size_; }
+    void set_trace_buffers_size(uint32_t size) { trace_buffers_size_ = size; }
 
     bool using_slow_dispatch() const;
+    bool using_fast_dispatch() const;
 
     // Checks that the given arch is on the given pci_slot and that it's responding
     // Puts device into reset
     bool initialize(const uint8_t num_hw_cqs, size_t l1_small_size, size_t trace_region_size, tt::stl::Span<const std::uint32_t> l1_bank_remap = {}, bool minimal = false);
-    void initialize_cluster();
-    std::unique_ptr<Allocator> initialize_allocator(size_t l1_small_size, size_t trace_region_size, tt::stl::Span<const std::uint32_t> l1_bank_remap = {});
-    void initialize_build();
-    void initialize_device_kernel_defines();
     void build_firmware();
-    void initialize_device_bank_to_noc_tables(const HalProgrammableCoreType &core_type, CoreCoord virtual_core);
-    void initialize_firmware(const HalProgrammableCoreType &core_type, CoreCoord virtual_core, launch_msg_t *launch_msg, go_msg_t* go_msg);
     void reset_cores();
     void initialize_and_launch_firmware();
     void init_command_queue_host();
     void init_command_queue_device();
     void initialize_synchronous_sw_cmd_queue();
+    void update_dispatch_cores_for_multi_cq_eth_dispatch();
+
     void configure_kernel_variant(Program& program, const string& path, const std::vector<uint32_t>& compile_args, CoreCoord kernel_core, CoreCoord Kernel_virtual_core,
                                   CoreType dispatch_core_type, CoreCoord upstream_virtual_core, CoreCoord downstream_virtual_core, CoreCoord downstream_slave_virtual_core, std::map<string, string> defines_in, NOC my_noc_index, NOC upstream_noc_index, NOC downstream_noc_index, bool is_active_eth_core = false, bool send_to_brisc = false, bool force_watcher_no_inline = false);
-    void compile_command_queue_programs();
-    void compile_command_queue_programs_new();
-    void configure_command_queue_programs();
-    void configure_command_queue_programs_new();
-    void clear_l1_state();
-    void get_associated_dispatch_virtual_cores(
-        std::unordered_map<chip_id_t, std::unordered_set<CoreCoord>> &my_dispatch_cores,
-        std::unordered_map<chip_id_t, std::unordered_set<CoreCoord>> &other_dispatch_cores);
-    std::pair<int, int> build_processor_type_to_index(uint32_t programmable_core, uint32_t processor_class) const;
 
     // Puts device into reset
     bool close();
     friend bool CloseDevice(Device *device);
 
-    // APIs to access this device's work executor
-    bool can_use_passthrough_scheduling() const;
-    template<typename F>
-    void push_work(F&& work, bool blocking = false) {
-        this->work_executor_.push_work(std::forward<F>(work), blocking);
-    }
-    void synchronize();
-    void set_worker_mode(const WorkExecutorMode& mode);
     void enable_async(bool enable);
+    void synchronize();
     WorkExecutorMode get_worker_mode() { return work_executor_.get_worker_mode(); }
     void set_worker_queue_mode(const WorkerQueueMode& mode) { this->work_executor_.set_worker_queue_mode(mode); }
     WorkerQueueMode get_worker_queue_mode() { return this->work_executor_.get_worker_queue_mode(); }
+    bool is_worker_queue_empty() const { return work_executor_.worker_queue.empty(); }
+    bool can_use_passthrough_scheduling() const;
 
-    void update_dispatch_cores_for_multi_cq_eth_dispatch();
+    template<typename F>
+    void push_work(F&& work, bool blocking = false) {
+        this->work_executor_.push_work(std::forward<F>(work), blocking);
+    }
 
     // Program cache interface. Syncrhonize with worker worker threads before querying or
     // modifying this structure, since worker threads use this for compiling ops
     void enable_program_cache();
     void disable_and_clear_program_cache();
+    program_cache::detail::ProgramCache& get_program_cache() { return program_cache_; }
     std::size_t num_program_cache_entries();
 
     HalProgrammableCoreType get_programmable_core_type(CoreCoord virtual_core) const;
+
     template <typename T = DeviceAddr>
     T get_dev_addr(CoreCoord virtual_core, HalL1MemAddrType addr_type) const;
-    // Returns address where allocator starts allocating buffer
-    DeviceAddr get_base_allocator_addr(const HalMemType &mem_type) const;
-    DeviceAddr get_base_allocator_addr(const HalMemType &mem_type, SubDeviceId sub_device_id) const;
 
-    template <typename CoreRangeContainer>
-    std::vector<std::pair<transfer_info_cores, uint32_t>> extract_dst_noc_multicast_info(const CoreRangeContainer& ranges, const CoreType core_type);
+    std::vector<std::pair<transfer_info_cores, uint32_t>> extract_dst_noc_multicast_info(const std::vector<CoreRange>& ranges, const CoreType core_type);
+
     bool dispatch_s_enabled() const;
     bool distributed_dispatcher() const;
     NOC dispatch_go_signal_noc() const;
@@ -327,43 +274,68 @@ class Device {
     uint8_t num_noc_unicast_txns(SubDeviceId sub_device_id) const;
     uint8_t noc_data_start_index(SubDeviceId sub_device_id, bool mcast_data=true, bool unicast_data=true) const;
 
-    LaunchMessageRingBufferState& get_worker_launch_message_buffer_state(SubDeviceId sub_device_id);
-
     SubDeviceManagerId get_active_sub_device_manager_id() const;
     SubDeviceManagerId get_default_sub_device_manager_id() const;
     SubDeviceManagerId create_sub_device_manager(tt::stl::Span<const SubDevice> sub_devices, DeviceAddr local_l1_size);
+    void remove_sub_device_manager(SubDeviceManagerId sub_device_manager_id);
     void load_sub_device_manager(SubDeviceManagerId sub_device_manager_id);
     void clear_loaded_sub_device_manager();
-    void remove_sub_device_manager(SubDeviceManagerId sub_device_manager_id);
+    LaunchMessageRingBufferState& get_worker_launch_message_buffer_state(SubDeviceId sub_device_id);
     const std::vector<SubDeviceId> &get_sub_device_ids() const;
+    uint32_t num_sub_devices() const;
 
     // TODO #15944: Temporary api until migration to actual fabric is complete
     std::tuple<SubDeviceManagerId, SubDeviceId> create_sub_device_manager_with_fabric(tt::stl::Span<const SubDevice> sub_devices, DeviceAddr local_l1_size);
     std::optional<SubDeviceId> get_fabric_sub_device_id() const;
 
-    program_cache::detail::ProgramCache& get_program_cache() { return program_cache_; }
-
-    std::set<CoreCoord> get_compute_cores() const { return compute_cores_; }
-
     uint32_t get_completion_queue_reader_core() const { return completion_queue_reader_core_; }
 
-    bool is_worker_queue_empty() const { return work_executor_.worker_queue.empty(); }
-
-    uint32_t get_trace_buffers_size() const { return trace_buffers_size_; }
-    void set_trace_buffers_size(uint32_t size) { trace_buffers_size_ = size; }
-
-    bool using_fast_dispatch() const { return using_fast_dispatch_; }
-
+    bool is_mmio_capable() const;
     std::vector<std::vector<chip_id_t>> get_tunnels_from_mmio() const { return tunnels_from_mmio_; }
 
     static constexpr MemoryAllocator allocator_scheme_ = MemoryAllocator::L1_BANKING;
 
 private:
+    static_assert(detail::SubDeviceManager::MAX_NUM_SUB_DEVICES <= dispatch_constants::DISPATCH_MESSAGE_ENTRIES, "MAX_NUM_SUB_DEVICES must be less than or equal to dispatch_constants::DISPATCH_MESSAGE_ENTRIES");
+    static constexpr uint32_t DEFAULT_NUM_SUB_DEVICES = 1;
+
+    void initialize_cluster();
+    std::unique_ptr<Allocator> initialize_allocator(size_t l1_small_size, size_t trace_region_size, tt::stl::Span<const std::uint32_t> l1_bank_remap = {});
+    void initialize_build();
+    void initialize_device_kernel_defines();
+    void initialize_device_bank_to_noc_tables(const HalProgrammableCoreType &core_type, CoreCoord virtual_core);
+    void initialize_firmware(const HalProgrammableCoreType &core_type, CoreCoord virtual_core, launch_msg_t *launch_msg, go_msg_t* go_msg);
+
     void initialize_default_sub_device_state(size_t l1_small_size, size_t trace_region_size, tt::stl::Span<const std::uint32_t> l1_bank_remap);
+
+    void compile_command_queue_programs();
+    void compile_command_queue_programs_new();
+    void configure_command_queue_programs();
+    void configure_command_queue_programs_new();
+    void clear_l1_state();
+    void get_associated_dispatch_virtual_cores(
+        std::unordered_map<chip_id_t, std::unordered_set<CoreCoord>> &my_dispatch_cores,
+        std::unordered_map<chip_id_t, std::unordered_set<CoreCoord>> &other_dispatch_cores);
+    std::pair<int, int> build_processor_type_to_index(uint32_t programmable_core, uint32_t processor_class) const;
+
+    void set_worker_mode(const WorkExecutorMode& mode);
+
+    void generate_device_bank_to_noc_tables();
+
+    void setup_tunnel_for_remote_devices();
+
+    void update_workers_build_settings(std::vector<std::vector<std::tuple<tt_cxy_pair, dispatch_worker_build_settings_t>>> &device_worker_variants);
+
     SubDeviceManagerId get_next_sub_device_manager_id();
     void reset_sub_devices_state(const std::unique_ptr<detail::SubDeviceManager>& sub_device_manager);
-    void MarkAllocationsUnsafe();
-    void MarkAllocationsSafe();
+
+    void mark_allocations_unsafe();
+    void mark_allocations_safe();
+
+    CoreCoord physical_worker_core_from_logical_core(const CoreCoord &logical_core) const;
+    CoreCoord dram_core_from_dram_channel(uint32_t dram_channel) const;
+    CoreType core_type_from_physical_core(const CoreCoord &physical_core) const;
+    CoreCoord virtual_core_from_physical_core(const CoreCoord &physical_coord, const CoreType& core_type) const;
 
     chip_id_t id_;
     uint32_t build_key_ = 0;
@@ -372,7 +344,7 @@ class Device {
 
     // Leaving here for compatibility with current reacharounds
     // TODO: Replace with get_initialized_allocator()
-    Allocator * allocator_ = nullptr;
+    Allocator* allocator_ = nullptr;
     bool initialized_ = false;
 
     std::vector<std::unique_ptr<Program>> command_queue_programs_;
@@ -421,43 +393,11 @@ class Device {
 
 }  // namespace v0
 
-inline HalProgrammableCoreType Device::get_programmable_core_type(CoreCoord virtual_core) const {
-
-    HalProgrammableCoreType programmable_core_type = HalProgrammableCoreType::TENSIX;
-    if (tt::Cluster::instance().is_ethernet_core(virtual_core, this->id_)) {
-        // Eth pcores have a different address, but only active ones.
-        CoreCoord logical_core = this->logical_core_from_ethernet_core(virtual_core);
-        if (this->is_active_ethernet_core(logical_core)) {
-            programmable_core_type = HalProgrammableCoreType::ACTIVE_ETH;
-        } else {
-            programmable_core_type = HalProgrammableCoreType::IDLE_ETH;
-        }
-    }
-
-    return programmable_core_type;
-}
-
 template <typename T>
 inline T Device::get_dev_addr(CoreCoord virtual_core, HalL1MemAddrType addr_type) const {
     return hal.get_dev_addr<T>(this->get_programmable_core_type(virtual_core), addr_type);
 }
 
-// TODO: Find a better home for this function
-template <typename CoreRangeContainer>
-std::vector<std::pair<transfer_info_cores, uint32_t>> Device::extract_dst_noc_multicast_info(const CoreRangeContainer& ranges, const CoreType core_type) {
-    // This API extracts all the pairs of noc multicast encodings given a set of core ranges
-    std::vector<std::pair<transfer_info_cores, uint32_t>> dst_noc_multicast_info;
-    dst_noc_multicast_info.reserve(ranges.size());
-    for (const CoreRange& core_range : ranges) {
-        CoreCoord virtual_start = this->virtual_core_from_logical_core(core_range.start_coord, core_type);
-        CoreCoord virtual_end = this->virtual_core_from_logical_core(core_range.end_coord, core_type);
-
-        uint32_t num_receivers = core_range.size();
-        dst_noc_multicast_info.push_back(std::make_pair(CoreRange(virtual_start, virtual_end), num_receivers));
-    }
-    return dst_noc_multicast_info;
-}
-
 }  // namespace tt_metal
 
 }  // namespace tt
diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp
index 1eed5f60384..f6c4d500289 100644
--- a/tt_metal/impl/dispatch/command_queue.cpp
+++ b/tt_metal/impl/dispatch/command_queue.cpp
@@ -724,7 +724,7 @@ void EnqueueProgramCommand::assemble_runtime_args_commands(ProgramCommandSequenc
                         }
                     } else {
                         std::vector<std::pair<transfer_info_cores, uint32_t>> dst_noc_multicast_info =
-                            device->extract_dst_noc_multicast_info<std::vector<CoreRange>>(
+                            device->extract_dst_noc_multicast_info(
                                 kernel->logical_coreranges(), core_type);
                         common_sub_cmds.emplace<std::vector<CQDispatchWritePackedMulticastSubCmd>>(
                             std::vector<CQDispatchWritePackedMulticastSubCmd>());
diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp
index 59be426029b..fbbe914ac85 100644
--- a/tt_metal/impl/program/program.cpp
+++ b/tt_metal/impl/program/program.cpp
@@ -1019,7 +1019,7 @@ void detail::Program_::populate_dispatch_data(Device *device) {
         if (semaphore.core_type() == CoreType::WORKER) {
             uint32_t index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX);
             std::vector<std::pair<transfer_info_cores, uint32_t>> dst_noc_multicast_info =
-                device->extract_dst_noc_multicast_info<std::vector<CoreRange>>(
+                device->extract_dst_noc_multicast_info(
                     semaphore.core_range_set().ranges(), CoreType::WORKER);
             transfer_info transfer_info = {
                 .dst_base_addr = semaphore.offset(),
@@ -1111,7 +1111,7 @@ void detail::Program_::populate_dispatch_data(Device *device) {
             // TODO: add a bit in the hal that says if this core type is unicast/multicast
             if (core_type == CoreType::WORKER) {
                 std::vector<std::pair<transfer_info_cores, uint32_t>> dst_noc_multicast_info =
-                    device->extract_dst_noc_multicast_info<std::vector<CoreRange>>(
+                    device->extract_dst_noc_multicast_info(
                         kernel_group.core_ranges.ranges(), core_type);
                 std::vector<KernelHandle> kernel_ids;
                 for (int dispatch_class = 0; dispatch_class < kernel_group.kernel_ids.size(); dispatch_class++) {
diff --git a/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose.cpp b/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose.cpp
index 9f69e69b474..c5be2c09d7a 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose.cpp
@@ -36,11 +36,8 @@ inline uint32_t get_estimated_size_of_cbs(const Tensor& input_tensor_a) {
 }
 
 inline uint32_t get_max_l1_space(const Tensor& input_tensor_a) {
-    tt::tt_metal::Device* device = input_tensor_a.device();
-    const std::vector<uint32_t>& bank_ids =
-        device->bank_ids_from_logical_core(BufferType::L1, *device->get_compute_cores().begin());
-    std::optional<uint64_t> lowest_address =
-        allocator::lowest_occupied_l1_address(*device->get_initialized_allocator(), bank_ids[0]);
+    auto device = input_tensor_a.device();
+    auto lowest_address = device->lowest_occupied_compute_l1_address();
     uint32_t max_l1_space = lowest_address.has_value() ? lowest_address.value() : device->l1_size_per_core();
     max_l1_space = max_l1_space - device->get_base_allocator_addr(HalMemType::L1);
     return max_l1_space;
diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp
index bfb8350a644..2c0c62c0edc 100644
--- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp
+++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp
@@ -171,11 +171,8 @@ inline uint32_t get_estimated_size_of_cbs(
 }
 
 inline uint32_t get_max_l1_space(const Tensor& input_tensor_a) {
-    tt::tt_metal::Device* device = input_tensor_a.device();
-    const std::vector<uint32_t>& bank_ids =
-        device->bank_ids_from_logical_core(BufferType::L1, *device->get_compute_cores().begin());
-    std::optional<uint64_t> lowest_address =
-        allocator::lowest_occupied_l1_address(*device->get_initialized_allocator(), bank_ids[0]);
+    auto device = input_tensor_a.device();
+    auto lowest_address = device->lowest_occupied_compute_l1_address();
     uint32_t max_l1_space = lowest_address.has_value() ? lowest_address.value() : device->l1_size_per_core();
     max_l1_space = max_l1_space - device->get_base_allocator_addr(HalMemType::L1);
     return max_l1_space;