From d9304dd4acf235b03a3e8953b70a069802e0704c Mon Sep 17 00:00:00 2001 From: Raymond Kim Date: Wed, 1 Jan 2025 14:48:00 -0500 Subject: [PATCH] Revert "#16356: Program Dispatch Modifications for MeshWorkload" This reverts commit 3755428c7e5db16f11a0e7751c6277d0b5ffcb01. --- tt_metal/impl/CMakeLists.txt | 1 - tt_metal/impl/device/device.cpp | 3 +- tt_metal/impl/device/device_pool.cpp | 2 +- tt_metal/impl/dispatch/command_queue.cpp | 1255 +++++++++++++- tt_metal/impl/dispatch/command_queue.hpp | 11 + .../impl/dispatch/dispatch_core_common.hpp | 5 - .../impl/dispatch/kernel_config/dispatch.cpp | 3 + .../impl/dispatch/kernel_config/dispatch.hpp | 16 - .../dispatch/kernel_config/dispatch_s.cpp | 2 +- .../dispatch/kernel_config/dispatch_s.hpp | 5 +- .../dispatch/kernel_config/eth_tunneler.cpp | 4 +- .../impl/dispatch/kernel_config/prefetch.cpp | 6 + .../impl/dispatch/kernel_config/prefetch.hpp | 13 - tt_metal/impl/dispatch/topology.cpp | 37 +- tt_metal/impl/kernels/kernel_types.hpp | 2 +- tt_metal/impl/program/program.cpp | 337 +++- tt_metal/impl/program/program.hpp | 30 +- .../impl/program/program_dispatch_utils.cpp | 1472 ----------------- .../impl/program/program_dispatch_utils.hpp | 112 -- tt_metal/tt_metal.cpp | 12 +- 20 files changed, 1503 insertions(+), 1825 deletions(-) delete mode 100644 tt_metal/impl/program/program_dispatch_utils.cpp delete mode 100644 tt_metal/impl/program/program_dispatch_utils.hpp diff --git a/tt_metal/impl/CMakeLists.txt b/tt_metal/impl/CMakeLists.txt index 25384d6f267..97946d994d0 100644 --- a/tt_metal/impl/CMakeLists.txt +++ b/tt_metal/impl/CMakeLists.txt @@ -17,7 +17,6 @@ set(IMPL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/allocator/basic_allocator.cpp ${CMAKE_CURRENT_SOURCE_DIR}/allocator/l1_banking_allocator.cpp ${CMAKE_CURRENT_SOURCE_DIR}/program/program.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/program/program_dispatch_utils.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dispatch/debug_tools.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dispatch/command_queue.cpp 
${CMAKE_CURRENT_SOURCE_DIR}/dispatch/worker_config_buffer.cpp diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index 2c1f685bd9d..8f5afb8603b 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -938,7 +938,7 @@ void Device::init_command_queue_host() { this->sysmem_manager_ = std::make_unique(this->id_, this->num_hw_cqs()); hw_command_queues_.resize(num_hw_cqs()); for (size_t cq_id = 0; cq_id < num_hw_cqs(); cq_id++) { - hw_command_queues_[cq_id] = std::make_unique(this, cq_id, dispatch_downstream_noc); + hw_command_queues_[cq_id] = std::make_unique(this, cq_id, NOC::NOC_0); // Need to do this since CommandQueue constructor is private sw_command_queues_.push_back(std::unique_ptr(new CommandQueue(this, cq_id))); } @@ -1750,7 +1750,6 @@ LaunchMessageRingBufferState& Device::get_worker_launch_message_buffer_state(Sub return this->active_sub_device_manager_->get_worker_launch_message_buffer_state(sub_device_id); } -// Main source to get NOC idx for dispatch core NOC Device::dispatch_go_signal_noc() const { return this->dispatch_s_enabled() ? 
NOC::NOC_1 : NOC::NOC_0; } diff --git a/tt_metal/impl/device/device_pool.cpp b/tt_metal/impl/device/device_pool.cpp index 0ccadf286d5..06f3d807460 100644 --- a/tt_metal/impl/device/device_pool.cpp +++ b/tt_metal/impl/device/device_pool.cpp @@ -334,12 +334,12 @@ void DevicePool::add_devices_to_pool(const std::vector& device_ids) { } } + populate_fd_kernels(devices_to_activate, this->num_hw_cqs); for (const auto& device_id : devices_to_activate) { if (not this->is_device_active(device_id)) { this->activate_device(device_id); } } - populate_fd_kernels(devices_to_activate, this->num_hw_cqs); } void DevicePool::register_worker_thread_for_device(v1::DeviceHandle device, std::thread::id worker_thread_id) { diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index 7d26c02eff2..f6c4d500289 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -35,11 +35,12 @@ #include "tt_metal/impl/dispatch/dispatch_core_manager.hpp" #include "tt_metal/impl/event/event.hpp" #include "tt_metal/impl/kernels/kernel.hpp" -#include "tt_metal/impl/program/program_dispatch_utils.hpp" #include "umd/device/tt_xy_pair.h" #include "llrt/hal.hpp" +#define CQ_PREFETCH_CMD_BARE_MIN_SIZE tt::tt_metal::hal.get_alignment(tt::tt_metal::HalMemType::HOST) + using namespace tt::tt_metal; namespace tt::tt_metal { @@ -112,10 +113,10 @@ void EnqueueReadBufferCommand::process() { uint32_t num_worker_counters = this->sub_device_ids.size(); // accounts for padding uint32_t cmd_sequence_sizeB = - hal.get_alignment(HalMemType::HOST) * num_worker_counters + // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT - hal.get_alignment(HalMemType::HOST) + // CQ_PREFETCH_CMD_STALL - hal.get_alignment(HalMemType::HOST) + // CQ_PREFETCH_CMD_RELAY_INLINE_NOFLUSH + CQ_DISPATCH_CMD_WRITE_LINEAR_HOST - hal.get_alignment(HalMemType::HOST); // CQ_PREFETCH_CMD_RELAY_LINEAR or CQ_PREFETCH_CMD_RELAY_PAGED + CQ_PREFETCH_CMD_BARE_MIN_SIZE * 
num_worker_counters + // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT + CQ_PREFETCH_CMD_BARE_MIN_SIZE + // CQ_PREFETCH_CMD_STALL + CQ_PREFETCH_CMD_BARE_MIN_SIZE + // CQ_PREFETCH_CMD_RELAY_INLINE_NOFLUSH + CQ_DISPATCH_CMD_WRITE_LINEAR_HOST + CQ_PREFETCH_CMD_BARE_MIN_SIZE; // CQ_PREFETCH_CMD_RELAY_LINEAR or CQ_PREFETCH_CMD_RELAY_PAGED void* cmd_region = this->manager.issue_queue_reserve(cmd_sequence_sizeB, this->command_queue_id); @@ -291,7 +292,7 @@ void EnqueueWriteBufferCommand::process() { sizeof(CQDispatchCmd) + // CQ_DISPATCH_CMD_WRITE_PAGED or CQ_DISPATCH_CMD_WRITE_LINEAR data_size_bytes, pcie_alignment); if (this->issue_wait) { - cmd_sequence_sizeB += hal.get_alignment(HalMemType::HOST) * num_worker_counters; // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT + cmd_sequence_sizeB += CQ_PREFETCH_CMD_BARE_MIN_SIZE * num_worker_counters; // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT } void* cmd_region = this->manager.issue_queue_reserve(cmd_sequence_sizeB, this->command_queue_id); @@ -358,6 +359,1067 @@ EnqueueProgramCommand::EnqueueProgramCommand( dispatch_constants::get(this->dispatch_core_type).get_dispatch_message_offset(this->sub_device_id.to_index()); } +void EnqueueProgramCommand::assemble_preamble_commands( + ProgramCommandSequence& program_command_sequence, const tt::stl::Span kernel_config_addrs) { + uint32_t uncached_cmd_sequence_sizeB = + CQ_PREFETCH_CMD_BARE_MIN_SIZE; // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_SET_WRITE_OFFSET + + program_command_sequence.preamble_command_sequence = + HostMemDeviceCommand(uncached_cmd_sequence_sizeB); + + // Send write offsets + if (hal.get_programmable_core_type_count() >= 2) { + program_command_sequence.preamble_command_sequence.add_dispatch_set_write_offsets( + 0, + kernel_config_addrs[hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX)].addr, + kernel_config_addrs[hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH)].addr); + } else { + 
program_command_sequence.preamble_command_sequence.add_dispatch_set_write_offsets( + 0, kernel_config_addrs[hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX)].addr, 0); + } +} + +void EnqueueProgramCommand::assemble_stall_commands(ProgramCommandSequence& program_command_sequence, bool prefetch_stall) { + if (prefetch_stall) { + // Wait command so previous program finishes + // Wait command with barrier for binaries to commit to DRAM + // Prefetch stall to prevent prefetcher picking up incomplete binaries from DRAM + uint32_t uncached_cmd_sequence_sizeB = + CQ_PREFETCH_CMD_BARE_MIN_SIZE + // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT + CQ_PREFETCH_CMD_BARE_MIN_SIZE; // CQ_PREFETCH_CMD_STALL + + program_command_sequence.stall_command_sequences[UncachedStallSequenceIdx] = + HostMemDeviceCommand(uncached_cmd_sequence_sizeB); + + // Wait for Noc Write Barrier + // wait for binaries to commit to dram, also wait for previous program to be done + // Wait Noc Write Barrier, wait for binaries to be written to worker cores + // Stall to allow binaries to commit to DRAM first + // TODO: this can be removed for all but the first program run + program_command_sequence.stall_command_sequences[UncachedStallSequenceIdx].add_dispatch_wait_with_prefetch_stall( + true, this->dispatch_message_addr, this->expected_num_workers_completed); + } else { + // Wait command so previous program finishes + uint32_t cached_cmd_sequence_sizeB = + CQ_PREFETCH_CMD_BARE_MIN_SIZE; // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT + + program_command_sequence.stall_command_sequences[CachedStallSequenceIdx] = + HostMemDeviceCommand(cached_cmd_sequence_sizeB); + program_command_sequence.stall_command_sequences[CachedStallSequenceIdx].add_dispatch_wait( + false, this->dispatch_message_addr, this->expected_num_workers_completed); + } +} + +template +uint32_t get_max_write_packed_sub_cmds( + uint32_t data_size, uint32_t max_prefetch_cmd_size, uint32_t 
packed_write_max_unicast_sub_cmds, bool no_stride) { + static_assert( + std::is_same::value or + std::is_same::value); + constexpr bool is_unicast = std::is_same::value; + uint32_t sub_cmd_sizeB = + is_unicast ? sizeof(CQDispatchWritePackedUnicastSubCmd) : sizeof(CQDispatchWritePackedMulticastSubCmd); + // Approximate calculation due to alignment + uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + uint32_t max_prefetch_size = + max_prefetch_cmd_size - sizeof(CQPrefetchCmd) - hal.get_alignment(HalMemType::HOST) - sizeof(CQDispatchCmd) - l1_alignment; + uint32_t max_prefetch_num_packed_cmds = + no_stride ? (max_prefetch_size - align(data_size * sizeof(uint32_t), l1_alignment)) / sub_cmd_sizeB + : max_prefetch_size / (align(data_size * sizeof(uint32_t), l1_alignment) + sub_cmd_sizeB); + + uint32_t packed_write_max_multicast_sub_cmds = + get_packed_write_max_multicast_sub_cmds(packed_write_max_unicast_sub_cmds); + return std::min( + max_prefetch_num_packed_cmds, + is_unicast ? packed_write_max_unicast_sub_cmds : packed_write_max_multicast_sub_cmds); +}; + +template +uint32_t insert_write_packed_payloads( + const uint32_t num_sub_cmds, + const uint32_t sub_cmd_sizeB, + const uint32_t max_prefetch_command_size, + const uint32_t packed_write_max_unicast_sub_cmds, + std::vector>& packed_cmd_payloads) { + uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + const uint32_t aligned_sub_cmd_sizeB = align(sub_cmd_sizeB, l1_alignment); + const uint32_t max_packed_sub_cmds_per_cmd = get_max_write_packed_sub_cmds( + aligned_sub_cmd_sizeB, max_prefetch_command_size, packed_write_max_unicast_sub_cmds, false); + uint32_t rem_num_sub_cmds = num_sub_cmds; + uint32_t cmd_payload_sizeB = 0; + uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST); + while (rem_num_sub_cmds != 0) { + const uint32_t num_sub_cmds_in_cmd = std::min(max_packed_sub_cmds_per_cmd, rem_num_sub_cmds); + const uint32_t aligned_data_sizeB = aligned_sub_cmd_sizeB * num_sub_cmds_in_cmd; + 
const uint32_t dispatch_cmd_sizeB = + align(sizeof(CQDispatchCmd) + num_sub_cmds_in_cmd * sizeof(PackedSubCmd), l1_alignment); + packed_cmd_payloads.emplace_back(num_sub_cmds_in_cmd, dispatch_cmd_sizeB + aligned_data_sizeB); + cmd_payload_sizeB += align(sizeof(CQPrefetchCmd) + packed_cmd_payloads.back().second, pcie_alignment); + rem_num_sub_cmds -= num_sub_cmds_in_cmd; + } + return cmd_payload_sizeB; +} + +template +void generate_runtime_args_cmds( + std::vector& runtime_args_command_sequences, + const uint32_t& l1_arg_base_addr, + const std::vector& sub_cmds, + const std::vector>>& rt_data_and_sizes, + const uint32_t& max_runtime_args_len, + std::vector>>& rt_args_data, + const uint32_t max_prefetch_command_size, + const uint32_t packed_write_max_unicast_sub_cmds, + bool no_stride, + enum DispatchWriteOffsets write_offset_index) { + static_assert( + std::is_same::value or + std::is_same::value); + + thread_local static auto get_runtime_payload_sizeB = + [](uint32_t num_packed_cmds, uint32_t runtime_args_len, bool is_unicast, bool no_stride) { + uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + uint32_t sub_cmd_sizeB = + is_unicast ? sizeof(CQDispatchWritePackedUnicastSubCmd) : sizeof(CQDispatchWritePackedMulticastSubCmd); + uint32_t dispatch_cmd_sizeB = sizeof(CQDispatchCmd) + align(num_packed_cmds * sub_cmd_sizeB, l1_alignment); + uint32_t aligned_runtime_data_sizeB = + (no_stride ? 1 : num_packed_cmds) * align(runtime_args_len * sizeof(uint32_t), l1_alignment); + return dispatch_cmd_sizeB + aligned_runtime_data_sizeB; + }; + thread_local static auto get_runtime_args_data_offset = + [](uint32_t num_packed_cmds, uint32_t runtime_args_len, bool is_unicast) { + uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + uint32_t sub_cmd_sizeB = + is_unicast ? 
sizeof(CQDispatchWritePackedUnicastSubCmd) : sizeof(CQDispatchWritePackedMulticastSubCmd); + uint32_t dispatch_cmd_sizeB = sizeof(CQDispatchCmd) + align(num_packed_cmds * sub_cmd_sizeB, l1_alignment); + return sizeof(CQPrefetchCmd) + dispatch_cmd_sizeB; + }; + + constexpr bool unicast = std::is_same::value; + + uint32_t num_packed_cmds_in_seq = sub_cmds.size(); + uint32_t max_packed_cmds = get_max_write_packed_sub_cmds( + max_runtime_args_len, max_prefetch_command_size, packed_write_max_unicast_sub_cmds, no_stride); + uint32_t offset_idx = 0; + if (no_stride) { + TT_FATAL( + max_packed_cmds >= num_packed_cmds_in_seq, + "num_packed_cmds_in_seq {} cannot exceed max_packed_cmds {} when no_stride is true", + num_packed_cmds_in_seq, + max_packed_cmds); + } + uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST); + uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + while (num_packed_cmds_in_seq != 0) { + // Generate the device command + uint32_t num_packed_cmds = std::min(num_packed_cmds_in_seq, max_packed_cmds); + uint32_t rt_payload_sizeB = + get_runtime_payload_sizeB(num_packed_cmds, max_runtime_args_len, unicast, no_stride); + uint32_t cmd_sequence_sizeB = align(sizeof(CQPrefetchCmd) + rt_payload_sizeB, pcie_alignment); + runtime_args_command_sequences.emplace_back(cmd_sequence_sizeB); + runtime_args_command_sequences.back().add_dispatch_write_packed( + num_packed_cmds, + l1_arg_base_addr, + max_runtime_args_len * sizeof(uint32_t), + rt_payload_sizeB, + sub_cmds, + rt_data_and_sizes, + packed_write_max_unicast_sub_cmds, + offset_idx, + no_stride, + write_offset_index); + + // Update kernel RTA pointers to point into the generated command + // Future RTA updates through the API will update the command sequence directly + uint32_t data_offset = (uint32_t)get_runtime_args_data_offset(num_packed_cmds, max_runtime_args_len, unicast); + const uint32_t data_inc = align(max_runtime_args_len * sizeof(uint32_t), l1_alignment); + uint32_t num_data_copies = 
no_stride ? 1 : num_packed_cmds; + for (uint32_t i = offset_idx; i < offset_idx + num_data_copies; ++i) { + uint32_t offset = 0; + for (auto& data : rt_args_data[i]) { + data.get().rt_args_data = + (uint32_t*)((char*)runtime_args_command_sequences.back().data() + data_offset + offset); + offset += data.get().rt_args_count * sizeof(uint32_t); + } + data_offset += data_inc; + } + num_packed_cmds_in_seq -= num_packed_cmds; + offset_idx += num_packed_cmds; + } +} + +// Generate command sequence for unique (unicast) and common (multicast) runtime args +void EnqueueProgramCommand::assemble_runtime_args_commands(ProgramCommandSequence& program_command_sequence) { + CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(this->device->id()); + const uint32_t max_prefetch_command_size = dispatch_constants::get(dispatch_core_type).max_prefetch_command_size(); + + // Note: each sub_cmd contain data for multiple kernels (DM*, COMPUTE) + // the outer vector counts through the kernels, the inner vectors contains the data for each kernel + std::vector unique_sub_cmds; + std::vector>> unique_rt_data_and_sizes; + std::vector>> unique_rt_args_data; + + std::variant, std::vector> + common_sub_cmds; + std::vector>> common_rt_data_and_sizes; + std::vector>> common_rt_args_data; + + program_command_sequence.runtime_args_command_sequences = {}; + + uint32_t command_count = 0; + // Unique RTAs + for (uint32_t programmable_core_type_index = 0; + programmable_core_type_index < hal.get_programmable_core_type_count(); + programmable_core_type_index++) { + if (hal.get_programmable_core_type(programmable_core_type_index) == HalProgrammableCoreType::IDLE_ETH) { + // Fast dispatch not supported on IDLE_ETH yet + // TODO: can't just loop here as code below confuses ACTIVE/IDLE + continue; + } + for (auto& kg : program.get_kernel_groups(programmable_core_type_index)) { + if (kg.total_rta_size != 0) { + uint32_t num_sub_cmds = kg.core_ranges.num_cores(); + uint32_t 
max_runtime_args_len = kg.total_rta_size / sizeof(uint32_t); + uint32_t max_packed_cmds = get_max_write_packed_sub_cmds( + max_runtime_args_len, max_prefetch_command_size, packed_write_max_unicast_sub_cmds, false); + command_count += div_up(num_sub_cmds, max_packed_cmds); + } + } + } + // Common RTAs + for (size_t kernel_id = 0; kernel_id < program.num_kernels(); kernel_id++) { + auto kernel = detail::GetKernel(program, kernel_id); + auto programmable_core_type = kernel->get_kernel_programmable_core_type(); + if (programmable_core_type == HalProgrammableCoreType::IDLE_ETH) { + // Fast dispatch not supported on IDLE_ETH yet + // TODO: can't just loop here as code below confuses ACTIVE/IDLE + continue; + } + uint32_t programmable_core_type_index = hal.get_programmable_core_type_index(programmable_core_type); + uint32_t common_size = program.get_program_config(programmable_core_type_index) + .crta_sizes[kernel->dispatch_class()]; + if (common_size != 0) { + uint32_t max_runtime_args_len = common_size / sizeof(uint32_t); + const auto& common_rt_args = kernel->common_runtime_args(); + if (common_rt_args.size() > 0) { + CoreType core_type = hal.get_core_type(programmable_core_type_index); + if (core_type == CoreType::ETH) { + uint32_t num_sub_cmds = kernel->logical_cores().size(); + uint32_t max_packed_cmds = get_max_write_packed_sub_cmds( + max_runtime_args_len, max_prefetch_command_size, packed_write_max_unicast_sub_cmds, true); + command_count += div_up(num_sub_cmds, max_packed_cmds); + } else { + uint32_t num_sub_cmds = kernel->logical_coreranges().size(); + uint32_t max_packed_cmds = get_max_write_packed_sub_cmds( + max_runtime_args_len, max_prefetch_command_size, packed_write_max_unicast_sub_cmds, true); + command_count += div_up(num_sub_cmds, max_packed_cmds); + } + } + } + } + + program_command_sequence.runtime_args_command_sequences.reserve(command_count); + // Unique Runtime Args (Unicast) + for (uint32_t index = 0; index < 
hal.get_programmable_core_type_count(); index++) { + if (hal.get_programmable_core_type(index) == HalProgrammableCoreType::IDLE_ETH) { + // Fast dispatch not supported on IDLE_ETH yet + // TODO: can't just loop here as code below confuses ACTIVE/IDLE + continue; + } + CoreType core_type = hal.get_core_type(index); + uint32_t processor_classes = hal.get_processor_classes_count(index); + + for (auto& kg : program.get_kernel_groups(index)) { + if (kg.total_rta_size != 0) { + for (const CoreRange& core_range : kg.core_ranges.ranges()) { + for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) { + for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { + CoreCoord core_coord(x, y); + + unique_rt_args_data.resize(unique_rt_args_data.size() + 1); + unique_rt_data_and_sizes.resize(unique_rt_data_and_sizes.size() + 1); + for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { + auto& optional_id = kg.kernel_ids[dispatch_class]; + if (optional_id) { + auto kernel = detail::GetKernel(program, optional_id.value()); + if (!kernel->cores_with_runtime_args().empty()) { + const auto& runtime_args_data = kernel->runtime_args(core_coord); + unique_rt_args_data.back().emplace_back(kernel->runtime_args_data(core_coord)); + TT_ASSERT( + runtime_args_data.size() * sizeof(uint32_t) <= + kg.rta_sizes[dispatch_class]); + unique_rt_data_and_sizes.back().emplace_back( + runtime_args_data.data(), + runtime_args_data.size() * sizeof(uint32_t), + kg.rta_sizes[dispatch_class]); + } + } + } + CoreCoord virtual_core = device->virtual_core_from_logical_core(core_coord, core_type); + unique_sub_cmds.emplace_back(CQDispatchWritePackedUnicastSubCmd{ + .noc_xy_addr = this->device->get_noc_unicast_encoding(this->noc_index, virtual_core)}); + } + } + } + uint32_t rta_offset = program.get_program_config(index).rta_offset; + generate_runtime_args_cmds( + program_command_sequence.runtime_args_command_sequences, + rta_offset, + 
unique_sub_cmds, + unique_rt_data_and_sizes, + kg.total_rta_size / sizeof(uint32_t), + unique_rt_args_data, + max_prefetch_command_size, + packed_write_max_unicast_sub_cmds, + false, + core_type == CoreType::WORKER ? DISPATCH_WRITE_OFFSET_TENSIX_L1_CONFIG_BASE + : DISPATCH_WRITE_OFFSET_ETH_L1_CONFIG_BASE); + for (auto& data_per_kernel : unique_rt_data_and_sizes) { + for (auto& data_and_sizes : data_per_kernel) { + RecordDispatchData(program, DISPATCH_DATA_RTARGS, std::get<1>(data_and_sizes)); + } + } + unique_sub_cmds.clear(); + unique_rt_data_and_sizes.clear(); + unique_rt_args_data.clear(); + } + } + + for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { + uint32_t common_size = program.get_program_config(index).crta_sizes[dispatch_class]; + if (common_size == 0) { + continue; + } + for (size_t kernel_id = 0; kernel_id < program.num_kernels(); kernel_id++) { + auto kernel = detail::GetKernel(program, kernel_id); + if (kernel->get_kernel_core_type() != core_type) { + continue; // TODO: fixme, need list of kernels by core_typexdispatch_class + } + if (kernel->dispatch_class() != dispatch_class) { + continue; // TODO: fixme, need list of kernels by core_typexdispatch_class + } + + const auto& common_rt_args = kernel->common_runtime_args(); + if (common_rt_args.size() > 0) { + common_rt_args_data.resize(common_rt_args_data.size() + 1); + common_rt_data_and_sizes.resize(common_rt_data_and_sizes.size() + 1); + + TT_ASSERT(kernel->common_runtime_args_data().size() * sizeof(uint32_t) == common_size); + TT_ASSERT(common_rt_args.size() * sizeof(uint32_t) <= common_size); + common_rt_data_and_sizes.back().emplace_back( + common_rt_args.data(), common_rt_args.size() * sizeof(uint32_t), common_size); + common_rt_args_data.back().emplace_back(kernel->common_runtime_args_data()); + + if (core_type == CoreType::ETH) { + common_sub_cmds.emplace>( + std::vector()); + auto& unicast_sub_cmd = + std::get>(common_sub_cmds); + 
unicast_sub_cmd.reserve(kernel->logical_cores().size()); + for (auto& core_coord : kernel->logical_cores()) { + // can make a vector of unicast encodings here + CoreCoord virtual_core_coords = device->virtual_core_from_logical_core(core_coord, CoreType::ETH); + unicast_sub_cmd.emplace_back(CQDispatchWritePackedUnicastSubCmd{ + .noc_xy_addr = this->device->get_noc_unicast_encoding(this->noc_index, virtual_core_coords)}); + } + } else { + std::vector> dst_noc_multicast_info = + device->extract_dst_noc_multicast_info( + kernel->logical_coreranges(), core_type); + common_sub_cmds.emplace>( + std::vector()); + auto& multicast_sub_cmd = + std::get>(common_sub_cmds); + multicast_sub_cmd.reserve(dst_noc_multicast_info.size()); + for (const auto& mcast_dests : dst_noc_multicast_info) { + multicast_sub_cmd.emplace_back(CQDispatchWritePackedMulticastSubCmd{ + .noc_xy_addr = this->device->get_noc_multicast_encoding( + this->noc_index, std::get(mcast_dests.first)), + .num_mcast_dests = mcast_dests.second}); + } + } + } + } + + uint32_t crta_offset = program.get_program_config(index).crta_offsets[dispatch_class]; + + // Common rtas are always expected to fit in one prefetch cmd + // TODO: use a linear write instead of a packed-write + std::visit( + [&](auto&& sub_cmds) { + generate_runtime_args_cmds( + program_command_sequence.runtime_args_command_sequences, + crta_offset, + sub_cmds, + common_rt_data_and_sizes, + common_size / sizeof(uint32_t), + common_rt_args_data, + max_prefetch_command_size, + packed_write_max_unicast_sub_cmds, + true, + core_type == CoreType::WORKER ? 
DISPATCH_WRITE_OFFSET_TENSIX_L1_CONFIG_BASE + : DISPATCH_WRITE_OFFSET_ETH_L1_CONFIG_BASE); + sub_cmds.clear(); + }, + common_sub_cmds); + + for (auto& data_per_kernel : common_rt_data_and_sizes) { + for (auto& data_and_sizes : data_per_kernel) { + RecordDispatchData(program, DISPATCH_DATA_RTARGS, std::get<1>(data_and_sizes)); + } + } + common_rt_data_and_sizes.clear(); + common_rt_args_data.clear(); + } + } + TT_ASSERT( + command_count >= program_command_sequence.runtime_args_command_sequences.size(), + "Incorrect number of commands reserved {}, final size {}. Vector reallocation causes cached addresses to be incorrect.", + command_count, + program_command_sequence.runtime_args_command_sequences.size()); + + uint32_t runtime_args_fetch_size_bytes = 0; + for (const auto& cmds : program_command_sequence.runtime_args_command_sequences) { + // BRISC, NCRISC, TRISC... + runtime_args_fetch_size_bytes += cmds.size_bytes(); + } + program_command_sequence.runtime_args_fetch_size_bytes = runtime_args_fetch_size_bytes; +} + +void EnqueueProgramCommand::assemble_device_commands( + ProgramCommandSequence& program_command_sequence, const tt::stl::Span kernel_config_addrs) { + // Calculate size of command and fill program indices of data to update + // TODO: Would be nice if we could pull this out of program + uint32_t cmd_sequence_sizeB = 0; + const uint32_t max_prefetch_command_size = + dispatch_constants::get(dispatch_core_type).max_prefetch_command_size(); + + const auto &program_transfer_info = program.get_program_transfer_info(); + // Multicast Semaphore Cmd + uint32_t num_multicast_semaphores = program_transfer_info.multicast_semaphores.size(); + std::vector> multicast_sem_sub_cmds(num_multicast_semaphores); + std::vector>> multicast_sem_data(num_multicast_semaphores); + std::vector>> multicast_sem_payload(num_multicast_semaphores); + std::vector> multicast_sem_dst_size; + multicast_sem_dst_size.reserve(num_multicast_semaphores); + if (num_multicast_semaphores > 0) { + 
uint32_t i = 0; + for (const auto& [dst, transfer_info_vec] : program_transfer_info.multicast_semaphores) { + // TODO: loop over things inside transfer_info[i] + uint32_t write_packed_len = transfer_info_vec[0].data.size(); + multicast_sem_dst_size.emplace_back(std::make_pair(dst, write_packed_len * sizeof(uint32_t))); + + for (const auto& transfer_info : transfer_info_vec) { + for (const auto& dst_noc_info : transfer_info.dst_noc_info) { + TT_ASSERT( + transfer_info.data.size() == write_packed_len, + "Not all data std::vectors in write packed semaphore cmd equal in len"); + multicast_sem_sub_cmds[i].emplace_back(CQDispatchWritePackedMulticastSubCmd{ + .noc_xy_addr = this->device->get_noc_multicast_encoding( + this->noc_index, std::get(dst_noc_info.first)), + .num_mcast_dests = dst_noc_info.second}); + multicast_sem_data[i].emplace_back( + transfer_info.data.data(), transfer_info.data.size() * sizeof(uint32_t)); + } + } + cmd_sequence_sizeB += insert_write_packed_payloads( + multicast_sem_sub_cmds[i].size(), + multicast_sem_dst_size.back().second, + max_prefetch_command_size, + this->packed_write_max_unicast_sub_cmds, + multicast_sem_payload[i]); + i++; + } + } + + // Unicast Semaphore Cmd + uint32_t num_unicast_semaphores = program_transfer_info.unicast_semaphores.size(); + std::vector> unicast_sem_sub_cmds(num_unicast_semaphores); + std::vector>> unicast_sem_data(num_unicast_semaphores); + std::vector>> unicast_sem_payload(num_unicast_semaphores); + std::vector> unicast_sem_dst_size; + unicast_sem_dst_size.reserve(num_unicast_semaphores); + if (num_unicast_semaphores > 0) { + uint32_t i = 0; + for (const auto& [dst, transfer_info_vec] : program_transfer_info.unicast_semaphores) { + // TODO: loop over things inside transfer_info[i] + uint32_t write_packed_len = transfer_info_vec[0].data.size(); + unicast_sem_dst_size.emplace_back(std::make_pair(dst, write_packed_len * sizeof(uint32_t))); + + for (const auto& transfer_info : transfer_info_vec) { + for (const auto& 
dst_noc_info : transfer_info.dst_noc_info) { + TT_ASSERT( + transfer_info.data.size() == write_packed_len, + "Not all data std::vectors in write packed semaphore cmd equal in len"); + unicast_sem_sub_cmds[i].emplace_back(CQDispatchWritePackedUnicastSubCmd{ + .noc_xy_addr = this->device->get_noc_unicast_encoding( + this->noc_index, std::get(dst_noc_info.first))}); + unicast_sem_data[i].emplace_back( + transfer_info.data.data(), transfer_info.data.size() * sizeof(uint32_t)); + } + } + cmd_sequence_sizeB += insert_write_packed_payloads( + unicast_sem_sub_cmds[i].size(), + unicast_sem_dst_size.back().second, + max_prefetch_command_size, + this->packed_write_max_unicast_sub_cmds, + unicast_sem_payload[i]); + i++; + } + } + + uint32_t index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); + + const auto& circular_buffers_unique_coreranges = program.circular_buffers_unique_coreranges(); + const uint16_t num_multicast_cb_sub_cmds = circular_buffers_unique_coreranges.size(); + std::vector> mcast_cb_payload; + uint16_t cb_config_size_bytes = 0; + uint32_t aligned_cb_config_size_bytes = 0; + std::vector> cb_config_payloads( + num_multicast_cb_sub_cmds, + std::vector(program.get_program_config(index).cb_size / sizeof(uint32_t), 0)); + std::vector multicast_cb_config_sub_cmds; + std::vector> multicast_cb_config_data; + if (num_multicast_cb_sub_cmds > 0) { + multicast_cb_config_sub_cmds.reserve(num_multicast_cb_sub_cmds); + multicast_cb_config_data.reserve(num_multicast_cb_sub_cmds); + program_command_sequence.circular_buffers_on_core_ranges.resize(num_multicast_cb_sub_cmds); + uint32_t i = 0; + uint32_t max_overall_index = 0; + uint32_t remote_offset_index = program.get_program_config(index).local_cb_size / sizeof(uint32_t); + for (const CoreRange& core_range : circular_buffers_unique_coreranges) { + const CoreCoord virtual_start = device->virtual_core_from_logical_core(core_range.start_coord, CoreType::WORKER); + const CoreCoord virtual_end = 
device->virtual_core_from_logical_core(core_range.end_coord, CoreType::WORKER); + + const uint32_t num_receivers = core_range.size(); + auto& cb_config_payload = cb_config_payloads[i]; + uint32_t max_index = 0; + const auto& circular_buffers_on_corerange = program.circular_buffers_on_corerange(core_range); + program_command_sequence.circular_buffers_on_core_ranges[i].reserve( + circular_buffers_on_corerange.size()); + for (const std::shared_ptr& cb : circular_buffers_on_corerange) { + program_command_sequence.circular_buffers_on_core_ranges[i].emplace_back(cb); + const uint32_t cb_address = cb->address(); + const uint32_t cb_size = cb->size(); + for (const auto& buffer_index : cb->local_buffer_indices()) { + // 1 cmd for all 32 buffer indices, populate with real data for specified indices + // cb config payload + const uint32_t base_index = UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * buffer_index; + cb_config_payload[base_index] = cb_address; + cb_config_payload[base_index + 1] = cb_size; + cb_config_payload[base_index + 2] = cb->num_pages(buffer_index); + cb_config_payload[base_index + 3] = cb->page_size(buffer_index); + max_index = std::max(max_index, base_index + UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG); + } + for (const auto& buffer_index : cb->remote_buffer_indices()) { + const uint32_t base_index = + remote_offset_index + + (NUM_CIRCULAR_BUFFERS - 1 - buffer_index) * UINT32_WORDS_PER_REMOTE_CIRCULAR_BUFFER_CONFIG; + cb_config_payload[base_index] = cb->config_address(); + cb_config_payload[base_index + 1] = cb->page_size(buffer_index); + max_index = std::max(max_index, base_index + UINT32_WORDS_PER_REMOTE_CIRCULAR_BUFFER_CONFIG); + } + } + multicast_cb_config_sub_cmds.emplace_back(CQDispatchWritePackedMulticastSubCmd{ + .noc_xy_addr = this->device->get_noc_multicast_encoding( + this->noc_index, CoreRange(virtual_start, virtual_end)), + .num_mcast_dests = (uint32_t)core_range.size()}); + multicast_cb_config_data.emplace_back(cb_config_payload.data(), 
max_index * sizeof(uint32_t)); + max_overall_index = std::max(max_overall_index, max_index); + i++; + } + uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + cb_config_size_bytes = max_overall_index * sizeof(uint32_t); + aligned_cb_config_size_bytes = align(cb_config_size_bytes, l1_alignment); + cmd_sequence_sizeB += insert_write_packed_payloads( + num_multicast_cb_sub_cmds, + cb_config_size_bytes, + max_prefetch_command_size, + this->packed_write_max_unicast_sub_cmds, + mcast_cb_payload); + } + + // Program Binaries and Go Signals + // Get launch msg data while getting size of cmds + std::vector> kernel_bins_prefetch_subcmds; + std::vector> kernel_bins_dispatch_subcmds; + std::vector kernel_bins_write_packed_large_data_aligned_sizeB; + std::vector kernel_bins_unicast_cmds; + const uint32_t max_length_per_sub_cmd = dispatch_constants::get(this->dispatch_core_type).scratch_db_size() / 2; + const uint32_t max_paged_length_per_sub_cmd = + max_length_per_sub_cmd / HostMemDeviceCommand::PROGRAM_PAGE_SIZE * HostMemDeviceCommand::PROGRAM_PAGE_SIZE; + if (program_transfer_info.kernel_bins.size()) { + TT_FATAL(program.get_kernels_buffer(this->device).get(), "Expected Kernel Binary Buffer to be allocated for program."); + } + const auto kernels_buffer = program.get_kernels_buffer(this->device); + for (const auto& [cores, num_mcast_dests, kg_transfer_info] : program_transfer_info.kernel_bins) { + bool write_linear; + uint32_t noc_encoding; + std::visit( + [&](auto&& cores) { + using T = std::decay_t; + if constexpr (std::is_same_v) { + noc_encoding = this->device->get_noc_multicast_encoding(this->noc_index, cores); + write_linear = false; + } else { + noc_encoding = this->device->get_noc_unicast_encoding(this->noc_index, cores); + write_linear = true; + } + }, + cores); + for (uint32_t kernel_idx = 0; kernel_idx < kg_transfer_info.dst_base_addrs.size(); kernel_idx++) { + if (write_linear) { + kernel_bins_unicast_cmds.emplace_back(2 * CQ_PREFETCH_CMD_BARE_MIN_SIZE); + 
cmd_sequence_sizeB += 2 * CQ_PREFETCH_CMD_BARE_MIN_SIZE; + constexpr bool flush_prefetch = false; + kernel_bins_unicast_cmds.back().add_dispatch_write_linear( + num_mcast_dests, // num_mcast_dests + noc_encoding, // noc_xy_addr + kg_transfer_info.dst_base_addrs[kernel_idx], + kg_transfer_info.lengths[kernel_idx]); + RecordDispatchData( + program, + DISPATCH_DATA_BINARY, + kg_transfer_info.lengths[kernel_idx], + kg_transfer_info.riscvs[kernel_idx]); + // Difference between prefetch total relayed pages and dispatch write linear + uint32_t relayed_bytes = + align(kg_transfer_info.lengths[kernel_idx], HostMemDeviceCommand::PROGRAM_PAGE_SIZE); + uint16_t length_adjust = uint16_t(relayed_bytes - kg_transfer_info.lengths[kernel_idx]); + + uint32_t base_address, page_offset; + if (kg_transfer_info.page_offsets[kernel_idx] > CQ_PREFETCH_RELAY_PAGED_START_PAGE_MASK) { + const uint32_t num_banks = this->device->num_banks(kernels_buffer->buffer_type()); + page_offset = kg_transfer_info.page_offsets[kernel_idx] % num_banks; + uint32_t num_full_pages_written_per_bank = + kg_transfer_info.page_offsets[kernel_idx] / num_banks; + base_address = kernels_buffer->address() + + num_full_pages_written_per_bank * kernels_buffer->page_size(); + } else { + base_address = kernels_buffer->address(); + page_offset = kg_transfer_info.page_offsets[kernel_idx]; + } + + kernel_bins_unicast_cmds.back().add_prefetch_relay_paged( + true, // is_dram + page_offset, + base_address, + kernels_buffer->page_size(), + relayed_bytes / kernels_buffer->page_size(), + length_adjust); + } else { + uint32_t base_address = kernels_buffer->address(); + uint32_t page_offset = kg_transfer_info.page_offsets[kernel_idx]; + + // TODO: pack all these writes into 1 linear write + uint32_t kernel_config_buffer_offset = kg_transfer_info.dst_base_addrs[kernel_idx]; + uint32_t aligned_length = align(kg_transfer_info.lengths[kernel_idx], hal.get_alignment(HalMemType::DRAM)); + uint32_t padding = aligned_length - 
kg_transfer_info.lengths[kernel_idx]; + while (aligned_length != 0) { + if (kernel_bins_dispatch_subcmds.empty() || + kernel_bins_dispatch_subcmds.back().size() == + CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_MAX_SUB_CMDS) { + kernel_bins_dispatch_subcmds.push_back({}); + kernel_bins_prefetch_subcmds.push_back({}); + kernel_bins_write_packed_large_data_aligned_sizeB.push_back(0); + } + uint32_t write_length, read_length; + if (aligned_length <= max_length_per_sub_cmd) { + read_length = aligned_length; + write_length = read_length - padding; + } else { + read_length = max_paged_length_per_sub_cmd; + write_length = read_length; + } + if (!kernel_bins_dispatch_subcmds.back().empty()) { + auto& back = kernel_bins_dispatch_subcmds.back().back(); + if (back.noc_xy_addr != noc_encoding) { + back.flags = CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_UNLINK; + } + } + kernel_bins_dispatch_subcmds.back().emplace_back(CQDispatchWritePackedLargeSubCmd{ + .noc_xy_addr = noc_encoding, + .addr = kernel_config_buffer_offset, + .length = (uint16_t)write_length, + .num_mcast_dests = (uint8_t)num_mcast_dests, + .flags = CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_NONE}); + RecordDispatchData( + program, DISPATCH_DATA_BINARY, write_length, kg_transfer_info.riscvs[kernel_idx]); + kernel_config_buffer_offset += write_length; + + kernel_bins_prefetch_subcmds.back().emplace_back(CQPrefetchRelayPagedPackedSubCmd{ + .start_page = (uint16_t)page_offset, + .log_page_size = (uint16_t)HostMemDeviceCommand::LOG2_PROGRAM_PAGE_SIZE, + .base_addr = base_address, + .length = read_length}); + page_offset += read_length / HostMemDeviceCommand::PROGRAM_PAGE_SIZE; + aligned_length -= read_length; + kernel_bins_write_packed_large_data_aligned_sizeB.back() += read_length; + } + } + } + } + // Unlink the last subcmd of every dispatch, to ensure we don't hold the + // path reservation for an incredible long time. This also prevents a hang + // if the next mcast is to a different destination. 
+ for (auto& subcmd_list : kernel_bins_dispatch_subcmds) { + if (!subcmd_list.empty()) { + subcmd_list.back().flags |= CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_UNLINK; + } + } + uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST); + for (uint32_t i = 0; i < kernel_bins_dispatch_subcmds.size(); ++i) { + cmd_sequence_sizeB += align( + ((sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd))) + + kernel_bins_dispatch_subcmds[i].size() * sizeof(CQDispatchWritePackedLargeSubCmd), + pcie_alignment); + cmd_sequence_sizeB += align( + kernel_bins_prefetch_subcmds[i].size() * sizeof(CQPrefetchRelayPagedPackedSubCmd) + + sizeof(CQPrefetchCmd), + pcie_alignment); + } + + std::vector> multicast_go_signal_data; + std::vector> unicast_go_signal_data; + std::vector multicast_go_signal_sub_cmds; + std::vector unicast_go_signal_sub_cmds; + std::vector> multicast_go_signals_payload; + std::vector> unicast_go_signals_payload; + constexpr uint32_t go_signal_sizeB = sizeof(launch_msg_t); + uint32_t aligned_go_signal_sizeB = align(go_signal_sizeB, hal.get_alignment(HalMemType::L1)); + uint32_t go_signal_size_words = aligned_go_signal_sizeB / sizeof(uint32_t); + + // TODO: eventually the code below could be structured to loop over programmable_indices + // and check for mcast/unicast + uint32_t programmable_core_index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); + for (KernelGroup& kernel_group : program.get_kernel_groups(programmable_core_index)) { + kernel_group.launch_msg.kernel_config.mode = DISPATCH_MODE_DEV; + for (uint32_t i = 0; i < kernel_config_addrs.size(); i++) { + kernel_group.launch_msg.kernel_config.kernel_config_base[i] = kernel_config_addrs[i].addr; + } + kernel_group.launch_msg.kernel_config.host_assigned_id = program.get_runtime_id(); + const void* launch_message_data = (const void*)(&kernel_group.launch_msg); + for (const CoreRange& core_range : kernel_group.core_ranges.ranges()) { + CoreCoord virtual_start = 
device->virtual_core_from_logical_core(core_range.start_coord, kernel_group.get_core_type()); + CoreCoord virtual_end = device->virtual_core_from_logical_core(core_range.end_coord, kernel_group.get_core_type()); + + multicast_go_signal_sub_cmds.emplace_back(CQDispatchWritePackedMulticastSubCmd{ + .noc_xy_addr = this->device->get_noc_multicast_encoding( + this->noc_index, CoreRange(virtual_start, virtual_end)), + .num_mcast_dests = (uint32_t)core_range.size()}); + multicast_go_signal_data.emplace_back(launch_message_data, go_signal_sizeB); + } + } + if (multicast_go_signal_sub_cmds.size() > 0) { + cmd_sequence_sizeB += insert_write_packed_payloads( + multicast_go_signal_sub_cmds.size(), + go_signal_sizeB, + max_prefetch_command_size, + this->packed_write_max_unicast_sub_cmds, + multicast_go_signals_payload); + } + + programmable_core_index = hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH); + // TODO: ugly, can be fixed by looping over indices w/ some work + if (programmable_core_index != -1) { + for (KernelGroup& kernel_group : program.get_kernel_groups(programmable_core_index)) { + kernel_group.launch_msg.kernel_config.mode = DISPATCH_MODE_DEV; + for (uint32_t i = 0; i < kernel_config_addrs.size(); i++) { + kernel_group.launch_msg.kernel_config.kernel_config_base[i] = kernel_config_addrs[i].addr; + } + kernel_group.launch_msg.kernel_config.host_assigned_id = program.get_runtime_id(); + const void* launch_message_data = (const launch_msg_t*)(&kernel_group.launch_msg); + for (const CoreRange& core_range : kernel_group.core_ranges.ranges()) { + for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) { + for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { + CoreCoord virtual_coord = device->virtual_core_from_logical_core( + CoreCoord({x, y}), kernel_group.get_core_type()); + unicast_go_signal_sub_cmds.emplace_back(CQDispatchWritePackedUnicastSubCmd{ + .noc_xy_addr = + 
this->device->get_noc_unicast_encoding(this->noc_index, virtual_coord)}); + unicast_go_signal_data.emplace_back(launch_message_data, go_signal_sizeB); + } + } + } + } + } + + if (unicast_go_signal_sub_cmds.size() > 0) { + cmd_sequence_sizeB += insert_write_packed_payloads( + unicast_go_signal_sub_cmds.size(), + go_signal_sizeB, + max_prefetch_command_size, + this->packed_write_max_unicast_sub_cmds, + unicast_go_signals_payload); + } + // if dispatch_s is enabled have dispatch_d send a semaphore update to dispatch_s (this will include a write barrier on dispatch_d if program is active) + // if not, check if the program is active on workers. If active, have dispatch_d issue a write barrier + cmd_sequence_sizeB += (this->device->dispatch_s_enabled() || program_transfer_info.num_active_cores > 0) * CQ_PREFETCH_CMD_BARE_MIN_SIZE; + + // either dispatch_s or dispatch_d will send the go signal (go_signal_mcast command) + const auto& noc_data_start_idx = device->noc_data_start_index(this->sub_device_id, multicast_go_signal_sub_cmds.size() > 0, unicast_go_signal_sub_cmds.size() > 0); + const auto& num_noc_mcast_txns = multicast_go_signal_sub_cmds.size() > 0 ? device->num_noc_mcast_txns(this->sub_device_id) : 0; + const auto& num_noc_unicast_txns = unicast_go_signal_sub_cmds.size() > 0 ? 
device->num_noc_unicast_txns(this->sub_device_id) : 0; + cmd_sequence_sizeB += align(sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd), pcie_alignment); + + program_command_sequence.device_command_sequence = HostMemDeviceCommand(cmd_sequence_sizeB); + + auto& device_command_sequence = program_command_sequence.device_command_sequence; + + uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + + // Semaphores + // Multicast Semaphore Cmd + index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); + for (uint32_t i = 0; i < num_multicast_semaphores; ++i) { + uint32_t curr_sub_cmd_idx = 0; + for (const auto& [num_sub_cmds_in_cmd, multicast_sem_payload_sizeB] : multicast_sem_payload[i]) { + device_command_sequence.add_dispatch_write_packed( + num_sub_cmds_in_cmd, + multicast_sem_dst_size[i].first + program.get_program_config(index).sem_offset, + multicast_sem_dst_size[i].second, + multicast_sem_payload_sizeB, + multicast_sem_sub_cmds[i], + multicast_sem_data[i], + this->packed_write_max_unicast_sub_cmds, + curr_sub_cmd_idx, + false, + DISPATCH_WRITE_OFFSET_TENSIX_L1_CONFIG_BASE); + curr_sub_cmd_idx += num_sub_cmds_in_cmd; + for (auto& data_and_size : multicast_sem_data[i]) { + RecordDispatchData(program, DISPATCH_DATA_SEMAPHORE, data_and_size.second); + } + } + } + + // Unicast Semaphore Cmd + index = hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH); + for (uint32_t i = 0; i < num_unicast_semaphores; ++i) { + uint32_t curr_sub_cmd_idx = 0; + for (const auto& [num_sub_cmds_in_cmd, unicast_sem_payload_sizeB] : unicast_sem_payload[i]) { + device_command_sequence.add_dispatch_write_packed( + num_sub_cmds_in_cmd, + unicast_sem_dst_size[i].first + program.get_program_config(index).sem_offset, + unicast_sem_dst_size[i].second, + unicast_sem_payload_sizeB, + unicast_sem_sub_cmds[i], + unicast_sem_data[i], + this->packed_write_max_unicast_sub_cmds, + curr_sub_cmd_idx, + false, + DISPATCH_WRITE_OFFSET_ETH_L1_CONFIG_BASE); + 
curr_sub_cmd_idx += num_sub_cmds_in_cmd; + for (auto& data_and_size : unicast_sem_data[i]) { + RecordDispatchData(program, DISPATCH_DATA_SEMAPHORE, data_and_size.second); + } + } + } + + // CB Configs commands + index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); + if (num_multicast_cb_sub_cmds > 0) { + uint32_t curr_sub_cmd_idx = 0; + program_command_sequence.cb_configs_payloads.reserve(num_multicast_cb_sub_cmds); + const uint32_t cb_config_size_words = aligned_cb_config_size_bytes / sizeof(uint32_t); + for (const auto& [num_sub_cmds_in_cmd, mcast_cb_payload_sizeB] : mcast_cb_payload) { + uint32_t write_offset_bytes = device_command_sequence.write_offset_bytes(); + device_command_sequence.add_dispatch_write_packed( + num_sub_cmds_in_cmd, + program.get_program_config(index).cb_offset, + cb_config_size_bytes, + mcast_cb_payload_sizeB, + multicast_cb_config_sub_cmds, + multicast_cb_config_data, + this->packed_write_max_unicast_sub_cmds, + curr_sub_cmd_idx, + false, + DISPATCH_WRITE_OFFSET_TENSIX_L1_CONFIG_BASE); + for (auto& data_and_size : multicast_cb_config_data) { + RecordDispatchData(program, DISPATCH_DATA_CB_CONFIG, data_and_size.second); + } + curr_sub_cmd_idx += num_sub_cmds_in_cmd; + RecordDispatchData(program, DISPATCH_DATA_CB_CONFIG, mcast_cb_payload_sizeB); + uint32_t curr_sub_cmd_data_offset_words = + (write_offset_bytes + (sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd)) + + align(num_sub_cmds_in_cmd * sizeof(CQDispatchWritePackedMulticastSubCmd), l1_alignment)) / + sizeof(uint32_t); + for (uint32_t i = 0; i < num_sub_cmds_in_cmd; ++i) { + program_command_sequence.cb_configs_payloads.push_back( + (uint32_t*)device_command_sequence.data() + curr_sub_cmd_data_offset_words); + curr_sub_cmd_data_offset_words += cb_config_size_words; + } + } + } + + // All Previous Cmds Up to This Point Go Into the Kernel Config Buffer + program_command_sequence.program_config_buffer_data_size_bytes = + device_command_sequence.write_offset_bytes(); + 
+ // Program Binaries + for (const auto& kernel_bins_unicast_cmd : kernel_bins_unicast_cmds) { + device_command_sequence.add_data( + kernel_bins_unicast_cmd.data(), + kernel_bins_unicast_cmd.size_bytes(), + kernel_bins_unicast_cmd.size_bytes()); + } + uint32_t dram_alignment = hal.get_alignment(HalMemType::DRAM); + for (uint32_t i = 0; i < kernel_bins_dispatch_subcmds.size(); ++i) { + device_command_sequence.add_dispatch_write_packed_large( + dram_alignment, + kernel_bins_dispatch_subcmds[i].size(), + kernel_bins_dispatch_subcmds[i], + 0, + DISPATCH_WRITE_OFFSET_TENSIX_L1_CONFIG_BASE); + device_command_sequence.add_prefetch_relay_paged_packed( + kernel_bins_write_packed_large_data_aligned_sizeB[i], + kernel_bins_prefetch_subcmds[i], + kernel_bins_prefetch_subcmds[i].size()); + } + + // Go Signals + program_command_sequence.go_signals.reserve( + multicast_go_signal_sub_cmds.size() + unicast_go_signal_sub_cmds.size()); + + // Get the address for the slot this launch_message will be written to + uint32_t multicast_launch_msg_addr = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::LAUNCH) + this->multicast_cores_launch_message_wptr * sizeof(launch_msg_t); + + if (multicast_go_signal_sub_cmds.size() > 0) { + uint32_t curr_sub_cmd_idx = 0; + for (const auto& [num_sub_cmds_in_cmd, multicast_go_signal_payload_sizeB] : multicast_go_signals_payload) { + uint32_t write_offset_bytes = device_command_sequence.write_offset_bytes(); + device_command_sequence.add_dispatch_write_packed( + num_sub_cmds_in_cmd, + multicast_launch_msg_addr, + go_signal_sizeB, + multicast_go_signal_payload_sizeB, + multicast_go_signal_sub_cmds, + multicast_go_signal_data, + this->packed_write_max_unicast_sub_cmds, + curr_sub_cmd_idx); + curr_sub_cmd_idx += num_sub_cmds_in_cmd; + program_command_sequence.launch_msg_write_packed_cmd_ptrs.push_back(&((CQDispatchCmd*) ((uint32_t*)device_command_sequence.data() + (write_offset_bytes + sizeof(CQPrefetchCmd)) / 
sizeof(uint32_t)))->write_packed); + uint32_t curr_sub_cmd_data_offset_words = + (write_offset_bytes + (sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd)) + + align(num_sub_cmds_in_cmd * sizeof(CQDispatchWritePackedMulticastSubCmd), l1_alignment)) / + sizeof(uint32_t); + for (uint32_t i = 0; i < num_sub_cmds_in_cmd; ++i) { + program_command_sequence.go_signals.push_back( + (launch_msg_t*)((uint32_t*)device_command_sequence.data() + curr_sub_cmd_data_offset_words)); + curr_sub_cmd_data_offset_words += go_signal_size_words; + } + } + } + + if (unicast_go_signal_sub_cmds.size() > 0) { + uint32_t unicast_launch_msg_addr = hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalL1MemAddrType::LAUNCH) + this->unicast_cores_launch_message_wptr * sizeof(launch_msg_t); + uint32_t curr_sub_cmd_idx = 0; + for (const auto& [num_sub_cmds_in_cmd, unicast_go_signal_payload_sizeB] : unicast_go_signals_payload) { + uint32_t write_offset_bytes = device_command_sequence.write_offset_bytes(); + device_command_sequence.add_dispatch_write_packed( + num_sub_cmds_in_cmd, + unicast_launch_msg_addr, + go_signal_sizeB, + unicast_go_signal_payload_sizeB, + unicast_go_signal_sub_cmds, + unicast_go_signal_data, + this->packed_write_max_unicast_sub_cmds, + curr_sub_cmd_idx); + curr_sub_cmd_idx += num_sub_cmds_in_cmd; + program_command_sequence.unicast_launch_msg_write_packed_cmd_ptrs.push_back(&((CQDispatchCmd*) ((uint32_t*)device_command_sequence.data() + (write_offset_bytes + sizeof(CQPrefetchCmd)) / sizeof(uint32_t)))->write_packed); + uint32_t curr_sub_cmd_data_offset_words = + (write_offset_bytes + (sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd)) + + align(num_sub_cmds_in_cmd * sizeof(CQDispatchWritePackedUnicastSubCmd), l1_alignment)) / + sizeof(uint32_t); + for (uint32_t i = 0; i < num_sub_cmds_in_cmd; ++i) { + program_command_sequence.go_signals.push_back( + (launch_msg_t*)((uint32_t*)device_command_sequence.data() + curr_sub_cmd_data_offset_words)); + curr_sub_cmd_data_offset_words += 
go_signal_size_words; + } + } + } + + DispatcherSelect dispatcher_for_go_signal = DispatcherSelect::DISPATCH_MASTER; + auto sub_device_index = this->sub_device_id.to_index(); + if (this->device->dispatch_s_enabled()) { + // dispatch_d signals dispatch_s to send the go signal, use a barrier if there are cores active + uint16_t index_bitmask = 0; + index_bitmask |= 1 << sub_device_index; + device_command_sequence.add_notify_dispatch_s_go_signal_cmd(program_transfer_info.num_active_cores > 0, index_bitmask); + dispatcher_for_go_signal = DispatcherSelect::DISPATCH_SLAVE; + } else { + // Wait Noc Write Barrier, wait for binaries/configs and launch_msg to be written to worker cores + if (program_transfer_info.num_active_cores > 0) { + device_command_sequence.add_dispatch_wait(true, this->dispatch_message_addr, 0, 0, false, false); + } + } + go_msg_t run_program_go_signal; + run_program_go_signal.signal = RUN_MSG_GO; + run_program_go_signal.master_x = (uint8_t)this->dispatch_core.x; + run_program_go_signal.master_y = (uint8_t)this->dispatch_core.y; + run_program_go_signal.dispatch_message_offset = (uint8_t)dispatch_constants::get(this->dispatch_core_type).get_dispatch_message_offset(sub_device_index); + uint32_t write_offset_bytes = device_command_sequence.write_offset_bytes(); + device_command_sequence.add_dispatch_go_signal_mcast(this->expected_num_workers_completed, *reinterpret_cast(&run_program_go_signal), this->dispatch_message_addr, num_noc_mcast_txns, num_noc_unicast_txns, noc_data_start_idx, dispatcher_for_go_signal); + program_command_sequence.mcast_go_signal_cmd_ptr = &((CQDispatchCmd*) ((uint32_t*)device_command_sequence.data() + (write_offset_bytes + sizeof(CQPrefetchCmd)) / sizeof(uint32_t)))->mcast; +} + +void EnqueueProgramCommand::update_device_commands( + ProgramCommandSequence& cached_program_command_sequence, + const tt::stl::Span kernel_config_addrs) { + uint32_t i = 0; + ZoneScopedN("program_loaded_on_device"); + uint32_t index = 
hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); + uint32_t remote_offset_index = program.get_program_config(index).local_cb_size / sizeof(uint32_t); + for (const auto& cbs_on_core_range : cached_program_command_sequence.circular_buffers_on_core_ranges) { + uint32_t* cb_config_payload = cached_program_command_sequence.cb_configs_payloads[i]; + for (const std::shared_ptr& cb : cbs_on_core_range) { + const uint32_t cb_address = cb->address(); + const uint32_t cb_size = cb->size(); + for (const auto& buffer_index : cb->local_buffer_indices()) { + // 1 cmd for all 32 buffer indices, populate with real data for specified indices + + // cb config payload + uint32_t base_index = UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * buffer_index; + cb_config_payload[base_index] = cb_address; + cb_config_payload[base_index + 1] = cb_size; + cb_config_payload[base_index + 2] = cb->num_pages(buffer_index); + cb_config_payload[base_index + 3] = cb->page_size(buffer_index); + } + for (const auto& buffer_index : cb->remote_buffer_indices()) { + const uint32_t base_index = remote_offset_index + (NUM_CIRCULAR_BUFFERS - 1 - buffer_index) * + UINT32_WORDS_PER_REMOTE_CIRCULAR_BUFFER_CONFIG; + cb_config_payload[base_index] = cb->config_address(); + cb_config_payload[base_index + 1] = cb->page_size(buffer_index); + } + } + i++; + } + for (auto& go_signal : cached_program_command_sequence.go_signals) { + for (uint32_t i = 0; i < kernel_config_addrs.size(); i++) { + go_signal->kernel_config.kernel_config_base[i] = kernel_config_addrs[i].addr; + } + go_signal->kernel_config.host_assigned_id = program.get_runtime_id(); + } + // Update launch message addresses to reflect new launch_msg slot in ring buffer + uint32_t multicast_cores_launch_msg_addr = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::LAUNCH) + this->multicast_cores_launch_message_wptr * sizeof(launch_msg_t); + for (auto launch_msg_cmd_ptr : 
cached_program_command_sequence.launch_msg_write_packed_cmd_ptrs) { + launch_msg_cmd_ptr->addr = multicast_cores_launch_msg_addr; + } + if (cached_program_command_sequence.unicast_launch_msg_write_packed_cmd_ptrs.size()) { + uint32_t unicast_cores_launch_message_addr = hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalL1MemAddrType::LAUNCH) + this->unicast_cores_launch_message_wptr * sizeof(launch_msg_t); + for (auto launch_msg_cmd_ptr : cached_program_command_sequence.unicast_launch_msg_write_packed_cmd_ptrs) { + launch_msg_cmd_ptr->addr = unicast_cores_launch_message_addr; + } + } + // Update go signal to reflect potentially modified dispatch core and new wait count + go_msg_t run_program_go_signal; + run_program_go_signal.signal = RUN_MSG_GO; + run_program_go_signal.master_x = (uint8_t)this->dispatch_core.x; + run_program_go_signal.master_y = (uint8_t)this->dispatch_core.y; + run_program_go_signal.dispatch_message_offset = (uint8_t)dispatch_constants::get(this->dispatch_core_type).get_dispatch_message_offset(this->sub_device_id.to_index()); + cached_program_command_sequence.mcast_go_signal_cmd_ptr->go_signal = *reinterpret_cast(&run_program_go_signal); + cached_program_command_sequence.mcast_go_signal_cmd_ptr->wait_count = this->expected_num_workers_completed; +} + void EnqueueProgramCommand::write_program_command_sequence( const ProgramCommandSequence& program_command_sequence, bool stall_first, bool stall_before_program) { TT_ASSERT(!(stall_first && stall_before_program)); @@ -501,11 +1563,55 @@ void EnqueueProgramCommand::write_program_command_sequence( } void EnqueueProgramCommand::process() { - // Dispatch metadata contains runtime information based on - // the kernel config ring buffer state - program_utils::ProgramDispatchMetadata dispatch_metadata; - - // Compute the total number of workers this program uses + std::pair&> reservation = + this->config_buffer_mgr.reserve(program.get_program_config_sizes()); + uint32_t sync_count = 0; + bool 
stall_first = reservation.first.need_sync; + bool stall_before_program = false; + if (!program.kernel_binary_always_stored_in_ringbuffer()) { + // Wait for all existing commands to run before writing out the kernel binary. + sync_count = this->expected_num_workers_completed; + stall_before_program = !stall_first; + } else if (reservation.first.need_sync) { + // TODO: attempt to send RTA only without stalling. + sync_count = reservation.first.sync_count; + // Check if the launch message is the only thing preventing us from + // sending the program. If so, we can at least send the RTAs. Ideally we + // would also send the kernel binaries in this case, but the rest of the + // code isn't set up for that. + auto config_sizes = program.get_program_config_sizes(); + config_sizes[config_sizes.size() - 1] = 0; + const std::pair&> memory_reservation = + this->config_buffer_mgr.reserve(config_sizes); + if (!memory_reservation.first.need_sync) { + stall_first = false; + stall_before_program = true; + } + reservation = this->config_buffer_mgr.reserve(program.get_program_config_sizes()); + } + + // Access the program command cache + auto& cached_program_command_sequences = program.get_cached_program_command_sequences(); + // Start constructing the cache entry, using the build_key + uint64_t command_hash = device->build_key(); + if (not hal.is_coordinate_virtualization_enabled()) { + // When coordinate virtualization is not enabled, explicitly encode the device + // id into the command hash, to always assert on programs being reused across devices. 
+ command_hash = (command_hash << 32) | (device->id()); + } + bool is_cached = program.is_cached(); // Program is cached, its is assocated command stream was previously generated + auto cached_cmd_iter = cached_program_command_sequences.find(command_hash); + // Is the program was cached, the current command_hash, must match the one generated when the program was cached + // Finalizing the program multiple times/Regenerating program cache state is not currently supported + TT_FATAL((not is_cached) or cached_cmd_iter != cached_program_command_sequences.end(), "Enqueueing a Program across devices with different cores harvested is not supported, unless coordinate virtualization is enabled (only enabled on Wormhole and above)."); + if (program.get_program_binary_status(device->id()) == ProgramBinaryStatus::InFlight) { + // assemble_stall_commands is hardcoded to always wait for everything for now. + this->config_buffer_mgr.free(this->expected_num_workers_completed); + } else { + if (stall_first || stall_before_program) { + this->config_buffer_mgr.free(sync_count); + } + } uint32_t num_workers = 0; if (program.runs_on_noc_multicast_only_cores()) { num_workers += device->num_worker_cores(HalProgrammableCoreType::TENSIX, this->sub_device_id); @@ -513,41 +1619,70 @@ void EnqueueProgramCommand::process() { if (program.runs_on_noc_unicast_only_cores()) { num_workers += device->num_worker_cores(HalProgrammableCoreType::ACTIVE_ETH, this->sub_device_id); } - // Reserve space for this program in the kernel config ring buffer - program_utils::reserve_space_in_kernel_config_buffer( - this->config_buffer_mgr, - program.get_program_config_sizes(), - program.kernel_binary_always_stored_in_ringbuffer(), - program.get_program_binary_status(device->id()), - num_workers, - this->expected_num_workers_completed, - dispatch_metadata); + this->config_buffer_mgr.alloc(this->expected_num_workers_completed + num_workers); + std::vector& kernel_config_addrs_raw = reservation.second; // Remove 
launch buffer from config addrs, since it's not a real core. const tt::stl::Span kernel_config_addrs{ - dispatch_metadata.kernel_config_addrs.data(), dispatch_metadata.kernel_config_addrs.size() - 1}; + kernel_config_addrs_raw.data(), kernel_config_addrs_raw.size() - 1}; RecordProgramRun(program); - // Access the program dispatch-command cache - auto& cached_program_command_sequence = program.get_cached_program_command_sequences().begin()->second; - // Update the generated dispatch commands based on the state of the CQ and the ring buffer - program_utils::update_program_dispatch_commands( - program, - cached_program_command_sequence, - kernel_config_addrs, - this->multicast_cores_launch_message_wptr, - this->unicast_cores_launch_message_wptr, - this->expected_num_workers_completed, - this->dispatch_core, - this->dispatch_core_type, - this->sub_device_id, - dispatch_metadata, - program.get_program_binary_status(device->id())); - // Issue dispatch commands for this program - this->write_program_command_sequence(cached_program_command_sequence, dispatch_metadata.stall_first, dispatch_metadata.stall_before_program); - // Kernel Binaries are committed to DRAM, the first time the program runs on device. Reflect this on host. - program.set_program_binary_status(device->id(), ProgramBinaryStatus::Committed); + // Calculate all commands size and determine how many fetch q entries to use + // Preamble, some waits and stalls + // can be written directly to the issue queue + if (!is_cached) { + ProgramCommandSequence program_command_sequence; + this->assemble_preamble_commands(program_command_sequence, kernel_config_addrs); + this->assemble_stall_commands(program_command_sequence, true); + program_command_sequence.current_stall_seq_idx = UncachedStallSequenceIdx; + // Runtime Args Command Sequence + this->assemble_runtime_args_commands(program_command_sequence); + + // Record kernel groups in this program, only need to do it once. 
+ for (uint32_t index = 0; index < hal.get_programmable_core_type_count(); index++) { + CoreType core_type = hal.get_core_type(index); + RecordKernelGroups(program, core_type, program.get_kernel_groups(index)); + } + this->assemble_device_commands(program_command_sequence, kernel_config_addrs); + // Stall first for simplicity, because we don't use `sync_count` in assemble_stall_commands. + this->write_program_command_sequence( + program_command_sequence, /*stall_first=*/true, /*stall_before_program=*/false); + this->assemble_stall_commands(program_command_sequence, false); + cached_program_command_sequences.insert({command_hash, std::move(program_command_sequence)}); + program.set_cached(); + program.set_program_binary_status(device->id(), ProgramBinaryStatus::Committed); + } else { + static constexpr uint32_t wait_count_offset = (sizeof(CQPrefetchCmd) + offsetof(CQDispatchCmd, wait.count)); + static constexpr uint32_t tensix_l1_write_offset_offset = + (sizeof(CQPrefetchCmd) + offsetof(CQDispatchCmd, set_write_offset.offset1)); + static constexpr uint32_t eth_l1_write_offset_offset = + (sizeof(CQPrefetchCmd) + offsetof(CQDispatchCmd, set_write_offset.offset2)); + + auto& cached_program_command_sequence = cached_cmd_iter->second; + if (program.get_program_binary_status(device->id()) != ProgramBinaryStatus::Committed) { + cached_program_command_sequence.current_stall_seq_idx = UncachedStallSequenceIdx; + program.set_program_binary_status(device->id(), ProgramBinaryStatus::Committed); + } else { + cached_program_command_sequence.current_stall_seq_idx = CachedStallSequenceIdx; + } + auto& curr_stall_seq_idx = cached_program_command_sequence.current_stall_seq_idx; + cached_program_command_sequence.stall_command_sequences[curr_stall_seq_idx].update_cmd_sequence( + wait_count_offset, &sync_count, sizeof(uint32_t)); + + cached_program_command_sequence.preamble_command_sequence.update_cmd_sequence( + tensix_l1_write_offset_offset, + 
&kernel_config_addrs[hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX)], + sizeof(uint32_t)); + if (hal.get_programmable_core_type_count() >= 2) { + cached_program_command_sequence.preamble_command_sequence.update_cmd_sequence( + eth_l1_write_offset_offset, + &kernel_config_addrs[hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH)], + sizeof(uint32_t)); + } + this->update_device_commands(cached_program_command_sequence, kernel_config_addrs); + this->write_program_command_sequence(cached_program_command_sequence, stall_first, stall_before_program); + } } EnqueueRecordEventCommand::EnqueueRecordEventCommand( @@ -585,7 +1720,7 @@ void EnqueueRecordEventCommand::process() { uint32_t num_worker_counters = this->sub_device_ids.size(); uint32_t cmd_sequence_sizeB = - hal.get_alignment(HalMemType::HOST) * num_worker_counters + // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT + CQ_PREFETCH_CMD_BARE_MIN_SIZE * num_worker_counters + // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT packed_write_sizeB + // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WRITE_PACKED + unicast subcmds + event // payload align( @@ -675,7 +1810,7 @@ EnqueueWaitForEventCommand::EnqueueWaitForEventCommand( } void EnqueueWaitForEventCommand::process() { - uint32_t cmd_sequence_sizeB = hal.get_alignment(HalMemType::HOST); // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT + uint32_t cmd_sequence_sizeB = CQ_PREFETCH_CMD_BARE_MIN_SIZE; // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT void* cmd_region = this->manager.issue_queue_reserve(cmd_sequence_sizeB, this->command_queue_id); @@ -720,14 +1855,14 @@ void EnqueueTraceCommand::process() { uint32_t go_signals_cmd_size = align(sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd), pcie_alignment) * descriptor->descriptors.size(); uint32_t cmd_sequence_sizeB = - this->device->dispatch_s_enabled() * hal.get_alignment(HalMemType::HOST) + // dispatch_d -> dispatch_s sem update (send only if dispatch_s is 
running) + this->device->dispatch_s_enabled() * CQ_PREFETCH_CMD_BARE_MIN_SIZE + // dispatch_d -> dispatch_s sem update (send only if dispatch_s is running) go_signals_cmd_size + // go signal cmd - (hal.get_alignment(HalMemType::HOST) + // wait to ensure that reset go signal was processed (dispatch_d) + (CQ_PREFETCH_CMD_BARE_MIN_SIZE + // wait to ensure that reset go signal was processed (dispatch_d) // when dispatch_s and dispatch_d are running on 2 cores, workers update dispatch_s. dispatch_s is responsible for resetting worker count // and giving dispatch_d the latest worker state. This is encapsulated in the dispatch_s wait command (only to be sent when dispatch is distributed // on 2 cores) - (this->device->distributed_dispatcher()) * hal.get_alignment(HalMemType::HOST)) * num_sub_devices + - hal.get_alignment(HalMemType::HOST); // CQ_PREFETCH_CMD_EXEC_BUF + (this->device->distributed_dispatcher()) * CQ_PREFETCH_CMD_BARE_MIN_SIZE) * num_sub_devices + + CQ_PREFETCH_CMD_BARE_MIN_SIZE; // CQ_PREFETCH_CMD_EXEC_BUF void* cmd_region = this->manager.issue_queue_reserve(cmd_sequence_sizeB, this->command_queue_id); @@ -810,7 +1945,7 @@ EnqueueTerminateCommand::EnqueueTerminateCommand( void EnqueueTerminateCommand::process() { // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_TERMINATE // CQ_PREFETCH_CMD_TERMINATE - uint32_t cmd_sequence_sizeB = hal.get_alignment(HalMemType::HOST); + uint32_t cmd_sequence_sizeB = CQ_PREFETCH_CMD_BARE_MIN_SIZE; // dispatch and prefetch terminate commands each needs to be a separate fetch queue entry void* cmd_region = this->manager.issue_queue_reserve(cmd_sequence_sizeB, this->command_queue_id); @@ -892,7 +2027,7 @@ void HWCommandQueue::set_num_worker_sems_on_dispatch(uint32_t num_worker_sems) { if (!this->device->dispatch_s_enabled()) { return; } - uint32_t cmd_sequence_sizeB = hal.get_alignment(HalMemType::HOST); + uint32_t cmd_sequence_sizeB = CQ_PREFETCH_CMD_BARE_MIN_SIZE; void* cmd_region = 
this->manager.issue_queue_reserve(cmd_sequence_sizeB, this->id); HugepageDeviceCommand command_sequence(cmd_region, cmd_sequence_sizeB); command_sequence.add_dispatch_set_num_worker_sems(num_worker_sems, DispatcherSelect::DISPATCH_SLAVE); @@ -921,13 +2056,13 @@ void HWCommandQueue::reset_worker_state(bool reset_launch_msg_state) { go_signals_cmd_size = align(sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd), pcie_alignment) * num_sub_devices; } uint32_t cmd_sequence_sizeB = - reset_launch_msg_state * this->device->dispatch_s_enabled() * hal.get_alignment(HalMemType::HOST) + // dispatch_d -> dispatch_s sem update (send only if dispatch_s is running) + reset_launch_msg_state * this->device->dispatch_s_enabled() * CQ_PREFETCH_CMD_BARE_MIN_SIZE + // dispatch_d -> dispatch_s sem update (send only if dispatch_s is running) go_signals_cmd_size + // go signal cmd - (hal.get_alignment(HalMemType::HOST) + // wait to ensure that reset go signal was processed (dispatch_d) + (CQ_PREFETCH_CMD_BARE_MIN_SIZE + // wait to ensure that reset go signal was processed (dispatch_d) // when dispatch_s and dispatch_d are running on 2 cores, workers update dispatch_s. dispatch_s is responsible for resetting worker count // and giving dispatch_d the latest worker state. 
This is encapsulated in the dispatch_s wait command (only to be sent when dispatch is distributed // on 2 cores) - this->device->distributed_dispatcher() * hal.get_alignment(HalMemType::HOST)) * num_sub_devices; + this->device->distributed_dispatcher() * CQ_PREFETCH_CMD_BARE_MIN_SIZE) * num_sub_devices; void* cmd_region = this->manager.issue_queue_reserve(cmd_sequence_sizeB, this->id); HugepageDeviceCommand command_sequence(cmd_region, cmd_sequence_sizeB); bool clear_count = true; @@ -1179,7 +2314,7 @@ void HWCommandQueue::enqueue_write_buffer(Buffer& buffer, const void* src, bool CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(this->device->id()); const uint32_t max_prefetch_command_size = dispatch_constants::get(dispatch_core_type).max_prefetch_command_size(); uint32_t max_data_sizeB = - max_prefetch_command_size - (hal.get_alignment(HalMemType::HOST) * 2); // * 2 to account for issue + max_prefetch_command_size - (CQ_PREFETCH_CMD_BARE_MIN_SIZE * 2); // * 2 to account for issue uint32_t dst_page_index = 0; @@ -1303,7 +2438,7 @@ void HWCommandQueue::enqueue_write_buffer(Buffer& buffer, const void* src, bool uint32_t num_full_pages_written = 0; while (total_pages_to_write > 0) { - uint32_t data_offsetB = hal.get_alignment(HalMemType::HOST); // data appended after CQ_PREFETCH_CMD_RELAY_INLINE + uint32_t data_offsetB = CQ_PREFETCH_CMD_BARE_MIN_SIZE; // data appended after CQ_PREFETCH_CMD_RELAY_INLINE // + CQ_DISPATCH_CMD_WRITE_PAGED bool issue_wait = (dst_page_index == 0 and @@ -1369,12 +2504,11 @@ void HWCommandQueue::enqueue_program(Program& program, bool blocking) { ZoneScopedN("HWCommandQueue_enqueue_program"); std::vector sub_device_ids = {program.determine_sub_device_ids(device)}; TT_FATAL(sub_device_ids.size() == 1, "Programs must be executed on a single sub-device"); - // Finalize Program: Compute relative offsets for data structures (semaphores, kernel binaries, etc) in L1 if (not program.is_finalized()) { 
program.finalize(device); } if (program.get_program_binary_status(device->id()) == ProgramBinaryStatus::NotSent) { - // Write program binaries to device if it hasn't previously been cached + // Write program binaries to device program.allocate_kernel_bin_buf_on_device(device); if (program.get_program_transfer_info().binary_data.size()) { this->enqueue_write_buffer( @@ -1382,10 +2516,7 @@ void HWCommandQueue::enqueue_program(Program& program, bool blocking) { } program.set_program_binary_status(device->id(), ProgramBinaryStatus::InFlight); } - // Lower the program to device: Generate dispatch commands. - // Values in these commands will get updated based on kernel config ring - // buffer state at runtime. - program.generate_dispatch_commands(device); + program.set_last_used_command_queue_for_testing(this); #ifdef DEBUG @@ -1842,10 +2973,10 @@ volatile bool HWCommandQueue::is_noc_hung() { return illegal_noc_txn_hang; } void HWCommandQueue::record_begin(const uint32_t tid, std::shared_ptr ctx) { uint32_t num_sub_devices = this->device->num_sub_devices(); // Issue event as a barrier and a counter reset - uint32_t cmd_sequence_sizeB = hal.get_alignment(HalMemType::HOST); + uint32_t cmd_sequence_sizeB = CQ_PREFETCH_CMD_BARE_MIN_SIZE; if (this->device->distributed_dispatcher()) { // wait on dispatch_s before issuing counter reset - cmd_sequence_sizeB += hal.get_alignment(HalMemType::HOST); + cmd_sequence_sizeB += CQ_PREFETCH_CMD_BARE_MIN_SIZE; } cmd_sequence_sizeB *= num_sub_devices; void* cmd_region = this->manager.issue_queue_reserve(cmd_sequence_sizeB, this->id); @@ -1894,7 +3025,7 @@ void HWCommandQueue::record_end() { auto &trace_data = this->trace_ctx->data; trace_data = std::move(this->manager.get_bypass_data()); // Add command to terminate the trace buffer - DeviceCommand command_sequence(hal.get_alignment(HalMemType::HOST)); + DeviceCommand command_sequence(CQ_PREFETCH_CMD_BARE_MIN_SIZE); command_sequence.add_prefetch_exec_buf_end(); for (int i = 0; i < 
command_sequence.size_bytes() / sizeof(uint32_t); i++) { trace_data.push_back(((uint32_t*)command_sequence.data())[i]); diff --git a/tt_metal/impl/dispatch/command_queue.hpp b/tt_metal/impl/dispatch/command_queue.hpp index 5a3a77b786b..661e3d6d499 100644 --- a/tt_metal/impl/dispatch/command_queue.hpp +++ b/tt_metal/impl/dispatch/command_queue.hpp @@ -331,6 +331,16 @@ class EnqueueProgramCommand : public Command { uint32_t unicast_cores_launch_message_wptr, SubDeviceId sub_device_id); + void assemble_preamble_commands( + ProgramCommandSequence& program_command_sequence, const tt::stl::Span kernel_config_addrs); + void assemble_stall_commands(ProgramCommandSequence& program_command_sequence, bool prefetch_stall); + void assemble_runtime_args_commands(ProgramCommandSequence& program_command_sequence); + void assemble_device_commands( + ProgramCommandSequence& program_command_sequence, const tt::stl::Span kernel_config_addrs); + void update_device_commands( + ProgramCommandSequence& cached_program_command_sequence, + const tt::stl::Span kernel_config_addrs); + void write_program_command_sequence( const ProgramCommandSequence& program_command_sequence, bool stall_first, bool stall_before_program); @@ -517,6 +527,7 @@ class HWCommandQueue { void set_num_worker_sems_on_dispatch(uint32_t num_worker_sems); void set_go_signal_noc_data_on_dispatch(const vector_memcpy_aligned& go_signal_noc_data); void reset_worker_state(bool reset_launch_msg_state); + private: uint32_t id; uint32_t size_B; diff --git a/tt_metal/impl/dispatch/dispatch_core_common.hpp b/tt_metal/impl/dispatch/dispatch_core_common.hpp index 35c94d6e775..3336175e72b 100644 --- a/tt_metal/impl/dispatch/dispatch_core_common.hpp +++ b/tt_metal/impl/dispatch/dispatch_core_common.hpp @@ -6,7 +6,6 @@ #include "common/core_descriptor.hpp" #include "tt_metal/common/core_coord.hpp" -#include "tt_metal/impl/kernels/data_types.hpp" #include "tt_metal/llrt/get_platform_architecture.hpp" #include 
"tt_metal/tt_stl/reflection.hpp" @@ -33,10 +32,6 @@ enum DispatchWorkerType : uint32_t { COUNT = 17 }; -// NOC ID used by dispatch kernels to communicate with downstream cores. This parameter -// is required when setting up Command Queue objects on host. -static constexpr NOC dispatch_downstream_noc = NOC::NOC_0; - enum class DispatchCoreType : uint32_t { WORKER, ETH, COUNT }; enum class DispatchCoreAxis { ROW, COL, COUNT }; diff --git a/tt_metal/impl/dispatch/kernel_config/dispatch.cpp b/tt_metal/impl/dispatch/kernel_config/dispatch.cpp index 1bfbbd579a3..c97c5457c2d 100644 --- a/tt_metal/impl/dispatch/kernel_config/dispatch.cpp +++ b/tt_metal/impl/dispatch/kernel_config/dispatch.cpp @@ -26,6 +26,7 @@ void DispatchKernel::GenerateStaticConfigs() { uint32_t completion_queue_start_addr = issue_queue_start_addr + issue_queue_size; uint32_t completion_queue_size = device_->sysmem_manager().get_completion_queue_size(cq_id_); + logical_core_ = dispatch_core_manager::instance().dispatcher_core(device_->id(), channel, cq_id_); static_config_.dispatch_cb_base = my_dispatch_constants.dispatch_buffer_base(); static_config_.dispatch_cb_log_page_size = dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE; static_config_.dispatch_cb_pages = my_dispatch_constants.dispatch_buffer_pages(); @@ -76,6 +77,7 @@ void DispatchKernel::GenerateStaticConfigs() { uint32_t completion_queue_start_addr = issue_queue_start_addr + issue_queue_size; uint32_t completion_queue_size = device_->sysmem_manager().get_completion_queue_size(cq_id_); + logical_core_ = dispatch_core_manager::instance().dispatcher_core(servicing_device_id_, channel, cq_id_); static_config_.dispatch_cb_base = my_dispatch_constants.dispatch_buffer_base(); static_config_.dispatch_cb_log_page_size = dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE; static_config_.dispatch_cb_pages = my_dispatch_constants.dispatch_buffer_pages(); @@ -122,6 +124,7 @@ void DispatchKernel::GenerateStaticConfigs() { uint32_t 
completion_queue_start_addr = issue_queue_start_addr + issue_queue_size; uint32_t completion_queue_size = device_->sysmem_manager().get_completion_queue_size(cq_id_); + logical_core_ = dispatch_core_manager::instance().dispatcher_d_core(device_->id(), channel, cq_id_); static_config_.dispatch_cb_base = my_dispatch_constants.dispatch_buffer_base(); static_config_.dispatch_cb_log_page_size = dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE; static_config_.dispatch_cb_pages = my_dispatch_constants.dispatch_buffer_pages(); diff --git a/tt_metal/impl/dispatch/kernel_config/dispatch.hpp b/tt_metal/impl/dispatch/kernel_config/dispatch.hpp index c2a18bdfa3e..431d54c8605 100644 --- a/tt_metal/impl/dispatch/kernel_config/dispatch.hpp +++ b/tt_metal/impl/dispatch/kernel_config/dispatch.hpp @@ -65,24 +65,8 @@ class DispatchKernel : public FDKernel { bool h_variant, bool d_variant) : FDKernel(node_id, device_id, servicing_device_id, cq_id, noc_selection) { - TT_FATAL( - noc_selection.downstream_noc == dispatch_downstream_noc, - "Invalid downstream NOC specified for Dispatcher kernel"); - TT_FATAL( - noc_selection.upstream_noc != noc_selection.downstream_noc, - "Dispatcher kernel cannot have identical upstream and downstream NOCs."); static_config_.is_h_variant = h_variant; static_config_.is_d_variant = d_variant; - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); - if (h_variant && d_variant) { - this->logical_core_ = dispatch_core_manager::instance().dispatcher_core(device_id, channel, cq_id); - } else if (h_variant) { - channel = tt::Cluster::instance().get_assigned_channel_for_device(servicing_device_id); - this->logical_core_ = - dispatch_core_manager::instance().dispatcher_core(servicing_device_id, channel, cq_id); - } else if (d_variant) { - this->logical_core_ = dispatch_core_manager::instance().dispatcher_d_core(device_id, channel, cq_id); - } } void CreateKernel() override; void GenerateStaticConfigs() override; diff --git 
a/tt_metal/impl/dispatch/kernel_config/dispatch_s.cpp b/tt_metal/impl/dispatch/kernel_config/dispatch_s.cpp index 236cf78939e..9552dd2d5f4 100644 --- a/tt_metal/impl/dispatch/kernel_config/dispatch_s.cpp +++ b/tt_metal/impl/dispatch/kernel_config/dispatch_s.cpp @@ -27,7 +27,7 @@ void DispatchSKernel::GenerateStaticConfigs() { dispatch_s_buffer_base = dispatch_buffer_base; } } - + logical_core_ = dispatch_core_manager::instance().dispatcher_s_core(device_->id(), channel, cq_id_); static_config_.cb_base = dispatch_s_buffer_base; static_config_.cb_log_page_size = dispatch_constants::DISPATCH_S_BUFFER_LOG_PAGE_SIZE; static_config_.cb_size = my_dispatch_constants.dispatch_s_buffer_size(); diff --git a/tt_metal/impl/dispatch/kernel_config/dispatch_s.hpp b/tt_metal/impl/dispatch/kernel_config/dispatch_s.hpp index ba1760f53f8..a9293e7bd1f 100644 --- a/tt_metal/impl/dispatch/kernel_config/dispatch_s.hpp +++ b/tt_metal/impl/dispatch/kernel_config/dispatch_s.hpp @@ -29,10 +29,7 @@ class DispatchSKernel : public FDKernel { public: DispatchSKernel( int node_id, chip_id_t device_id, chip_id_t servicing_device_id, uint8_t cq_id, noc_selection_t noc_selection) : - FDKernel(node_id, device_id, servicing_device_id, cq_id, noc_selection) { - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); - this->logical_core_ = dispatch_core_manager::instance().dispatcher_s_core(device_id, channel, cq_id_); - } + FDKernel(node_id, device_id, servicing_device_id, cq_id, noc_selection) {} void CreateKernel() override; void GenerateStaticConfigs() override; void GenerateDependentConfigs() override; diff --git a/tt_metal/impl/dispatch/kernel_config/eth_tunneler.cpp b/tt_metal/impl/dispatch/kernel_config/eth_tunneler.cpp index bd991bdb349..5ffcb7939d0 100644 --- a/tt_metal/impl/dispatch/kernel_config/eth_tunneler.cpp +++ b/tt_metal/impl/dispatch/kernel_config/eth_tunneler.cpp @@ -54,7 +54,7 @@ void EthTunnelerKernel::GenerateDependentConfigs() { } else if (auto tk = 
dynamic_cast(k)) { tunneler_kernel = tk; } else { - TT_FATAL(false, "Unexpected kernel type upstream of TUNNELER"); + TT_FATAL(false, "Unexpected kernel type upstream of TUNNELER"); } } TT_ASSERT(tunneler_kernel && !tunneler_kernel->IsRemote()); @@ -175,7 +175,7 @@ void EthTunnelerKernel::GenerateDependentConfigs() { } else if (auto tk = dynamic_cast(k)) { ds_tunneler_kernel = tk; } else { - TT_FATAL(false, "Unexpected kernel type downstream of TUNNELER"); + TT_FATAL(false, "Unexpected kernel type downstream of TUNNELER"); } } TT_ASSERT(ds_tunneler_kernel && ds_tunneler_kernel == tunneler_kernel); diff --git a/tt_metal/impl/dispatch/kernel_config/prefetch.cpp b/tt_metal/impl/dispatch/kernel_config/prefetch.cpp index e7f86fe0581..28646120468 100644 --- a/tt_metal/impl/dispatch/kernel_config/prefetch.cpp +++ b/tt_metal/impl/dispatch/kernel_config/prefetch.cpp @@ -23,6 +23,8 @@ void PrefetchKernel::GenerateStaticConfigs() { uint32_t issue_queue_start_addr = command_queue_start_addr + cq_start; uint32_t issue_queue_size = device_->sysmem_manager().get_issue_queue_size(cq_id_); + logical_core_ = dispatch_core_manager::instance().prefetcher_core(device_->id(), channel, cq_id_); + dependent_config_.downstream_cb_base = my_dispatch_constants.dispatch_buffer_base(); static_config_.downstream_cb_log_page_size = dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE; static_config_.downstream_cb_pages = my_dispatch_constants.dispatch_buffer_pages(); @@ -81,6 +83,8 @@ void PrefetchKernel::GenerateStaticConfigs() { uint32_t issue_queue_start_addr = command_queue_start_addr + cq_start; uint32_t issue_queue_size = device_->sysmem_manager().get_issue_queue_size(cq_id_); + logical_core_ = dispatch_core_manager::instance().prefetcher_core(servicing_device_id_, channel, cq_id_); + static_config_.downstream_cb_log_page_size = dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE; if (tt::Cluster::instance().is_galaxy_cluster()) { // TODO: whys is this hard-coded for galaxy?
static_config_.downstream_cb_pages = my_dispatch_constants.mux_buffer_pages(1); @@ -119,6 +123,8 @@ void PrefetchKernel::GenerateStaticConfigs() { static_config_.dispatch_s_buffer_size = 0; static_config_.dispatch_s_cb_log_page_size = 0; } else if (static_config_.is_d_variant.value()) { + logical_core_ = dispatch_core_manager::instance().prefetcher_d_core(device_->id(), channel, cq_id_); + dependent_config_.downstream_cb_base = my_dispatch_constants.dispatch_buffer_base(); static_config_.downstream_cb_log_page_size = dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE; static_config_.downstream_cb_pages = my_dispatch_constants.dispatch_buffer_pages(); diff --git a/tt_metal/impl/dispatch/kernel_config/prefetch.hpp b/tt_metal/impl/dispatch/kernel_config/prefetch.hpp index a029049928e..2410a1ca275 100644 --- a/tt_metal/impl/dispatch/kernel_config/prefetch.hpp +++ b/tt_metal/impl/dispatch/kernel_config/prefetch.hpp @@ -66,19 +66,6 @@ class PrefetchKernel : public FDKernel { FDKernel(node_id, device_id, servicing_device_id, cq_id, noc_selection) { static_config_.is_h_variant = h_variant; static_config_.is_d_variant = d_variant; - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); - TT_FATAL( - noc_selection.downstream_noc == dispatch_downstream_noc, - "Invalid downstream NOC specified for Prefetcher kernel"); - if (h_variant && d_variant) { - this->logical_core_ = dispatch_core_manager::instance().prefetcher_core(device_id, channel, cq_id); - } else if (h_variant) { - channel = tt::Cluster::instance().get_assigned_channel_for_device(servicing_device_id); - this->logical_core_ = - dispatch_core_manager::instance().prefetcher_core(servicing_device_id, channel, cq_id); - } else if (d_variant) { - this->logical_core_ = dispatch_core_manager::instance().prefetcher_d_core(device_id, channel, cq_id); - } } void CreateKernel() override; void GenerateStaticConfigs() override; diff --git a/tt_metal/impl/dispatch/topology.cpp 
b/tt_metal/impl/dispatch/topology.cpp index 93bc8e3e7df..ce040d65b96 100644 --- a/tt_metal/impl/dispatch/topology.cpp +++ b/tt_metal/impl/dispatch/topology.cpp @@ -79,18 +79,15 @@ static const std::vector two_chip_arch_1cq = { {3, 0, 1, 0, PREFETCH_H, {x, x, x, x}, {5, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, {4, 0, 1, 0, DISPATCH_H, {6, x, x, x}, {3, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0}, - {5, 0, 1, 0, PACKET_ROUTER_MUX, {3, x, x, x}, {7, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, {6, 0, 1, 0, DEMUX, {7, x, x, x}, {4, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, - {7, 0, 1, 0, US_TUNNELER_REMOTE, {11, 5, x, x}, {11, 6, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, - - {8, 1, x, 0, PREFETCH_D, {13, x, x, x}, {9, 10, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, - {9, 1, x, 0, DISPATCH_D, {8, x, x, x}, {10, 12, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0}, - {10, 1, x, 0, DISPATCH_S, {8, x, x, x}, {9, x, x, x}, NOC::NOC_1, NOC::NOC_1, NOC::NOC_1}, - - {11, 1, x, 0, US_TUNNELER_LOCAL, {7, 12, x, x}, {7, 13, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, - {12, 1, x, 0, MUX_D, {9, x, x, x}, {11, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, - {13, 1, x, 0, PACKET_ROUTER_DEMUX, {11, x, x, x}, {8, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, + {7, 0, 1, 0, US_TUNNELER_REMOTE, {8, 5, x, x}, {8, 6, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, + {8, 1, x, 0, US_TUNNELER_LOCAL, {7, 9, x, x}, {7, 10, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, + {9, 1, x, 0, MUX_D, {12, x, x, x}, {8, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, + {10, 1, x, 0, PACKET_ROUTER_DEMUX, {8, x, x, x}, {11, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, + {11, 1, x, 0, PREFETCH_D, {10, x, x, x}, {12, 13, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, + {12, 1, x, 0, DISPATCH_D, {11, x, x, x}, {13, 9, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0}, + {13, 1, x, 0, DISPATCH_S, {11, x, x, x}, {12, x, x, x}, NOC::NOC_1, NOC::NOC_1, NOC::NOC_1}, }; static const std::vector two_chip_arch_2cq = { 
@@ -106,17 +103,15 @@ static const std::vector two_chip_arch_2cq = { {8, 0, 1, 0, PACKET_ROUTER_MUX, {4, 5, x, x}, {10, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, {9, 0, 1, 0, DEMUX, {10, x, x, x}, {6, 7, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, - {10, 0, 1, 0, US_TUNNELER_REMOTE, {15, 8, x, x}, {15, 9, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, - - {11, 1, x, 0, PREFETCH_D, {17, x, x, x}, {13, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, - {12, 1, x, 1, PREFETCH_D, {17, x, x, x}, {14, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, - {13, 1, x, 0, DISPATCH_D, {11, x, x, x}, {16, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0}, - {14, 1, x, 1, DISPATCH_D, {12, x, x, x}, {16, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0}, - - {15, 1, x, 0, US_TUNNELER_LOCAL, {10, 16, x, x}, {10, 17, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, - {16, 1, x, 0, MUX_D, {13, 14, x, x}, {15, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, - {17, 1, x, 0, PACKET_ROUTER_DEMUX, {15, x, x, x}, {11, 12, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, - + {10, 0, 1, 0, US_TUNNELER_REMOTE, {11, 8, x, x}, {11, 9, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, + {11, 1, x, 0, US_TUNNELER_LOCAL, {10, 12, x, x}, {10, 13, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, + {12, 1, x, 0, MUX_D, {16, 17, x, x}, {11, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, + {13, 1, x, 0, PACKET_ROUTER_DEMUX, {11, x, x, x}, {14, 15, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, + + {14, 1, x, 0, PREFETCH_D, {13, x, x, x}, {16, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, + {15, 1, x, 1, PREFETCH_D, {13, x, x, x}, {17, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, + {16, 1, x, 0, DISPATCH_D, {14, x, x, x}, {12, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0}, + {17, 1, x, 1, DISPATCH_D, {15, x, x, x}, {12, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0}, }; static const std::vector galaxy_nine_chip_arch_1cq = { diff --git a/tt_metal/impl/kernels/kernel_types.hpp b/tt_metal/impl/kernels/kernel_types.hpp index 
444f4619cee..195b592eb77 100644 --- a/tt_metal/impl/kernels/kernel_types.hpp +++ b/tt_metal/impl/kernels/kernel_types.hpp @@ -15,7 +15,7 @@ namespace tt::tt_metal { -using KernelHandle = std::uint32_t; +using KernelHandle = std::uint16_t; struct DataMovementConfig { DataMovementProcessor processor = DataMovementProcessor::RISCV_0; // For data transfer kernels: NCRISC & BRISC diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index b37424c7903..c2cb844b499 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -2,7 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "tt_metal/impl/program/program_dispatch_utils.hpp" +#include "tt_metal/impl/program/program.hpp" + #include #include @@ -106,8 +107,7 @@ class Program_ { const std::vector< Semaphore > & semaphores() const; KernelGroup * kernels_on_core(const CoreCoord &core, uint32_t programmable_core_type_index); - std::vector>& get_kernel_groups(uint32_t programmable_core_type_index); - std::unordered_map>& get_kernels(uint32_t programmable_core_type_index); + std::vector& get_kernel_groups(uint32_t programmable_core_type_index); void add_buffer(std::shared_ptr buf); void release_buffers(); std::vector> circular_buffers_on_core(const CoreCoord &core) const; @@ -155,9 +155,10 @@ class Program_ { uint32_t get_sem_size(Device *device, CoreCoord logical_core, CoreType core_type) const; uint32_t get_cb_size(Device *device, CoreCoord logical_core, CoreType core_type) const; void set_last_used_command_queue_for_testing(HWCommandQueue *queue); - void populate_dispatch_data(Device *device); private: + void populate_dispatch_data(Device *device); + HWCommandQueue *last_used_command_queue_for_testing = nullptr; // Buffers temporarily owned by the program @@ -220,8 +221,9 @@ class Program_ { bool local_circular_buffer_allocation_needed_; static constexpr uint8_t core_to_kernel_group_invalid_index = 0xff; - std::vector>> kernel_groups_; + std::vector> 
kernel_groups_; std::vector> core_to_kernel_group_index_table_; + uint32_t tensix_go_signal_count_; std::vector> config_buffers_; @@ -442,29 +444,21 @@ CoreType KernelGroup::get_core_type() const { return hal.get_core_type(this->programmable_core_type_index); }; -std::vector> &detail::Program_::get_kernel_groups(uint32_t programmable_core_type_index) { +std::vector &detail::Program_::get_kernel_groups(uint32_t programmable_core_type_index) { update_kernel_groups(programmable_core_type_index); return kernel_groups_[programmable_core_type_index]; } -std::vector> &Program::get_kernel_groups(uint32_t programmable_core_type_index) { +std::vector &Program::get_kernel_groups(uint32_t programmable_core_type_index) { return pimpl_->get_kernel_groups(programmable_core_type_index); } -std::unordered_map>& detail::Program_::get_kernels(uint32_t programmable_core_type_index) { - return this->kernels_.at(programmable_core_type_index); -} - -std::unordered_map>& Program::get_kernels(uint32_t programmable_core_type_index) { - return pimpl_->get_kernels(programmable_core_type_index); -} - KernelGroup *detail::Program_::kernels_on_core(const CoreCoord &core, uint32_t programmable_core_type_index) { update_kernel_groups(programmable_core_type_index); if (core.x >= grid_extent_[programmable_core_type_index].x || core.y >= grid_extent_[programmable_core_type_index].y) return nullptr; uint8_t index = core_to_kernel_group_index_table_[programmable_core_type_index].at(core.y * grid_extent_[programmable_core_type_index].x + core.x); - return (index == core_to_kernel_group_invalid_index) ? nullptr : kernel_groups_[programmable_core_type_index].at(index).get(); + return (index == core_to_kernel_group_invalid_index) ? 
nullptr : &kernel_groups_[programmable_core_type_index].at(index); } KernelGroup *Program::kernels_on_core(const CoreCoord &core, uint32_t programmable_core_type_index) { @@ -589,16 +583,14 @@ void detail::Program_::update_kernel_groups(uint32_t programmable_core_type_inde programmable_core_type_index, max_local_cb_end_index, min_remote_cb_start_index); - kernel_groups_[programmable_core_type_index].push_back( - std::make_shared( - *this, - programmable_core_type_index, - kg_to_cores.first.kernel_ids, - erisc_is_idle, - max_local_cb_end_index, - min_remote_cb_start_index, - kg_to_cores.second) - ); + kernel_groups_[programmable_core_type_index].push_back(KernelGroup( + *this, + programmable_core_type_index, + kg_to_cores.first.kernel_ids, + erisc_is_idle, + max_local_cb_end_index, + min_remote_cb_start_index, + kg_to_cores.second)); index++; } } @@ -1107,22 +1099,21 @@ void detail::Program_::populate_dispatch_data(Device *device) { std::uint32_t num_active_cores = 0; for (uint32_t index = 0; index < hal.get_programmable_core_type_count(); index++) { CoreType core_type = hal.get_core_type(index); - for (const auto& kernel_group : this->get_kernel_groups(index)) { + for (KernelGroup &kernel_group : this->get_kernel_groups(index)) { // TODO: add a bit in the hal that says if this core type is unicast/multicast if (core_type == CoreType::WORKER) { std::vector> dst_noc_multicast_info = device->extract_dst_noc_multicast_info( - kernel_group->core_ranges.ranges(), core_type); + kernel_group.core_ranges.ranges(), core_type); std::vector kernel_ids; - for (int dispatch_class = 0; dispatch_class < kernel_group->kernel_ids.size(); dispatch_class++) { - auto &optional_id = kernel_group->kernel_ids[dispatch_class]; + for (int dispatch_class = 0; dispatch_class < kernel_group.kernel_ids.size(); dispatch_class++) { + auto &optional_id = kernel_group.kernel_ids[dispatch_class]; if (optional_id) { - KernelHandle device_local_kernel_id = 
program_utils::get_device_local_kernel_handle(optional_id.value()); - kernel_ids.push_back(device_local_kernel_id); + kernel_ids.push_back(optional_id.value()); int proc_sub_class = 0; - for (uint32_t& dst_addr : kernel_transfer_info.at(device_local_kernel_id).dst_base_addrs) { + for (uint32_t& dst_addr : kernel_transfer_info.at(optional_id.value()).dst_base_addrs) { // TODO: ditch this w/ linear writes based on program config kernel_text_offset and size - dst_addr = kernel_group->kernel_text_offsets[dispatch_class + proc_sub_class]; + dst_addr = kernel_group.kernel_text_offsets[dispatch_class + proc_sub_class]; proc_sub_class++; } } @@ -1137,12 +1128,11 @@ void detail::Program_::populate_dispatch_data(Device *device) { } else { TT_ASSERT(core_type == CoreType::ETH); std::vector> dst_noc_unicast_info = - extract_dst_noc_unicast_info(kernel_group->core_ranges.ranges(), core_type); + extract_dst_noc_unicast_info(kernel_group.core_ranges.ranges(), core_type); std::vector kernel_ids; - if (kernel_group->kernel_ids[DISPATCH_CLASS_ETH_DM0]) { - KernelHandle device_local_kernel_id = program_utils::get_device_local_kernel_handle(kernel_group->kernel_ids[DISPATCH_CLASS_ETH_DM0].value()); - kernel_ids.push_back(device_local_kernel_id); + if (kernel_group.kernel_ids[DISPATCH_CLASS_ETH_DM0]) { + kernel_ids.push_back(kernel_group.kernel_ids[DISPATCH_CLASS_ETH_DM0].value()); } for (const auto &[cores, num_mcast_dsts] : dst_noc_unicast_info) { @@ -1162,18 +1152,110 @@ void detail::Program_::populate_dispatch_data(Device *device) { } uint32_t detail::Program_::finalize_rt_args(uint32_t programmable_core_type_index, uint32_t base_offset) { + // Iterate over kernels in the program and "level" the number of RTAs based on the max // Unique RTAs are packed across dispatch classes // Common RTAs come after unique RTAs - return program_utils::finalize_rt_args( - this->kernels_[programmable_core_type_index], - this->get_kernel_groups(programmable_core_type_index), - base_offset, - 
programmable_core_type_index, - this->get_program_config(programmable_core_type_index).rta_offset, - this->get_program_config(programmable_core_type_index).crta_offsets, - this->get_program_config(programmable_core_type_index).crta_sizes - ); + uint32_t processor_classes = hal.get_processor_classes_count(programmable_core_type_index); + + std::vector max_rtas(processor_classes); + std::vector max_crtas(processor_classes); + uint32_t max_unique_rta_size = 0; + uint32_t total_crta_size = 0; + + CoreType core_type = hal.get_core_type(programmable_core_type_index); + HalProgrammableCoreType programmable_core_type = hal.get_programmable_core_type(programmable_core_type_index); + + this->get_program_config(programmable_core_type_index).rta_offset = base_offset; + + uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + for (auto& kg : this->get_kernel_groups(programmable_core_type_index)) { + for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { + max_rtas[dispatch_class] = 0; + auto& optional_id = kg.kernel_ids[dispatch_class]; + if (optional_id) { + auto kernel = get_kernel(optional_id.value()); + for (const CoreRange &core_range : kg.core_ranges.ranges()) { + for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) { + for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { + CoreCoord core_coord(x, y); + max_rtas[dispatch_class] = + std::max(max_rtas[dispatch_class], (uint32_t)kernel->runtime_args(core_coord).size()); + } + } + } + } + } + + uint32_t offset = 0; + for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { + auto& optional_id = kg.kernel_ids[dispatch_class]; + kg.rta_sizes[dispatch_class] = max_rtas[dispatch_class] * sizeof(uint32_t); + if (optional_id) { + auto kernel = get_kernel(optional_id.value()); + kernel->set_runtime_args_count(kg.core_ranges, max_rtas[dispatch_class]); + kg.launch_msg.kernel_config.rta_offset[dispatch_class].rta_offset = 
base_offset + offset; + offset += max_rtas[dispatch_class] * sizeof(uint32_t); + } else { + kg.launch_msg.kernel_config.rta_offset[dispatch_class].rta_offset = 0; + } + } + + kg.total_rta_size = offset; + offset = align(offset, l1_alignment); + max_unique_rta_size = std::max(offset, max_unique_rta_size); + } + + for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { + max_crtas[dispatch_class] = 0; + } + // Find the max # common RTAs across all kernels for each dispatch class + for (size_t kernel_id = 0; kernel_id < this->num_kernels(); kernel_id++) { + auto kernel = get_kernel(kernel_id); + // TODO: kernels should be stored by programmable core type + if (core_type == kernel->get_kernel_core_type() && + (programmable_core_type == HalProgrammableCoreType::IDLE_ETH) == kernel->is_idle_eth()) { + uint32_t dispatch_class = kernel->dispatch_class(); + max_crtas[dispatch_class] = + std::max(max_crtas[dispatch_class], (uint32_t)kernel->common_runtime_args().size()); + } + } + + // Calculate the address offset and size for common RTAs for each dispatch class + uint32_t offset = 0; + for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { + uint32_t size = max_crtas[dispatch_class] * sizeof(uint32_t); + this->get_program_config(programmable_core_type_index).crta_offsets[dispatch_class] = base_offset + max_unique_rta_size + offset; + this->get_program_config(programmable_core_type_index).crta_sizes[dispatch_class] = size; + offset += size; + offset = align(offset, l1_alignment); + } + total_crta_size = offset; + + // Set the runtime_args_data sizing info based on the shared max + for (size_t kernel_id = 0; kernel_id < this->num_kernels(); kernel_id++) { + auto kernel = get_kernel(kernel_id); + // TODO: as above, fix when kernels are stored by programmable core type + if (core_type == kernel->get_kernel_core_type() && + (programmable_core_type == HalProgrammableCoreType::IDLE_ETH) == kernel->is_idle_eth()) { + 
uint32_t dispatch_class = kernel->dispatch_class(); + kernel->set_common_runtime_args_count(max_crtas[dispatch_class]); + } + } + + // Set the kernel group common runtime arg offsets use in the launch message + for (auto& kg : this->get_kernel_groups(programmable_core_type_index)) { + for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { + kg.launch_msg.kernel_config.rta_offset[dispatch_class].crta_offset = this->get_program_config(programmable_core_type_index).crta_offsets[dispatch_class]; + } + } + + // TODO: this is asserted here as the leveling above can break the limits enforced by the API + // Once we use a ring buffer, memory space will be dynamic and this assert won't matter + std::uint32_t l1_kernel_config_size = tt::tt_metal::hal.get_dev_size(tt::tt_metal::HalProgrammableCoreType::TENSIX, tt::tt_metal::HalL1MemAddrType::KERNEL_CONFIG); + TT_FATAL(offset <= l1_kernel_config_size, "offset {} cannot exceed config size {}", offset, l1_kernel_config_size); + + return max_unique_rta_size + total_crta_size; } ProgramConfig& detail::Program_::get_program_config(uint32_t programmable_core_type_index) { @@ -1185,7 +1267,21 @@ ProgramConfig& Program::get_program_config(uint32_t programmable_core_type_index } uint32_t detail::Program_::finalize_sems(uint32_t programmable_core_type_index, uint32_t base_offset) { - return program_utils::finalize_sems(programmable_core_type_index, base_offset, this->semaphores_, this->program_configs_[programmable_core_type_index].sem_offset, this->program_configs_[programmable_core_type_index].sem_size); + + int max_id = -1; + CoreType core_type = hal.get_core_type(programmable_core_type_index); + for (const auto & sem : this->semaphores_) { + if (sem.core_type() == core_type && (int)sem.id() > max_id) { + max_id = sem.id(); + } + } + + uint32_t sem_size = (max_id + 1) * hal.get_alignment(HalMemType::L1); + + this->program_configs_[programmable_core_type_index].sem_offset = base_offset; + 
this->program_configs_[programmable_core_type_index].sem_size = sem_size; + + return base_offset + sem_size; } void detail::Program_::set_launch_msg_sem_offsets() { @@ -1193,7 +1289,7 @@ void detail::Program_::set_launch_msg_sem_offsets() { for (uint32_t kg_type_index = 0; kg_type_index < hal.get_programmable_core_type_count(); kg_type_index++) { for (auto& kg : this->get_kernel_groups(kg_type_index)) { for (uint32_t sem_type_index = 0; sem_type_index < hal.get_programmable_core_type_count(); sem_type_index++) { - kg->launch_msg.kernel_config.sem_offset[sem_type_index] = + kg.launch_msg.kernel_config.sem_offset[sem_type_index] = this->program_configs_[sem_type_index].sem_offset; } } @@ -1201,11 +1297,104 @@ void detail::Program_::set_launch_msg_sem_offsets() { } uint32_t detail::Program_::finalize_cbs(uint32_t programmable_core_type_index, uint32_t base_offset) { - return program_utils::finalize_cbs(programmable_core_type_index, this->get_kernel_groups(programmable_core_type_index), base_offset, this->program_configs_[programmable_core_type_index].cb_offset, this->program_configs_[programmable_core_type_index].cb_size, this->program_configs_[programmable_core_type_index].local_cb_size); + uint32_t max_local_end_index = 0; + uint32_t min_remote_start_index = NUM_CIRCULAR_BUFFERS; + // TODO: has to be better way to do this and don't read from volatile + auto& kernel_groups = this->get_kernel_groups(programmable_core_type_index); + for (auto& kg : kernel_groups) { + max_local_end_index = + std::max(max_local_end_index, (uint32_t)kg.launch_msg.kernel_config.max_local_cb_end_index); + min_remote_start_index = + std::min(min_remote_start_index, (uint32_t)kg.launch_msg.kernel_config.min_remote_cb_start_index); + } + + uint32_t local_cb_size = max_local_end_index * UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * sizeof(uint32_t); + uint32_t remote_cb_offset = base_offset + local_cb_size; + for (auto& kg : kernel_groups) { + kg.launch_msg.kernel_config.local_cb_offset = 
base_offset; + kg.launch_msg.kernel_config.remote_cb_offset = remote_cb_offset; + } + + uint32_t remote_cb_size = (NUM_CIRCULAR_BUFFERS - min_remote_start_index) * + UINT32_WORDS_PER_REMOTE_CIRCULAR_BUFFER_CONFIG * sizeof(uint32_t); + uint32_t total_cb_size = local_cb_size + remote_cb_size; + this->program_configs_[programmable_core_type_index].cb_offset = base_offset; + this->program_configs_[programmable_core_type_index].cb_size = total_cb_size; + this->program_configs_[programmable_core_type_index].local_cb_size = local_cb_size; + + return align(base_offset + total_cb_size, hal.get_alignment(HalMemType::L1)); } uint32_t detail::Program_::finalize_kernel_bins(Device *device, uint32_t programmable_core_type_index, uint32_t base_offset) { - return program_utils::finalize_kernel_bins(device, programmable_core_type_index, this->kernels_[programmable_core_type_index], this->get_kernel_groups(programmable_core_type_index), base_offset, this->program_configs_[programmable_core_type_index].kernel_text_offset, this->program_configs_[programmable_core_type_index].kernel_text_size); + + uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + + uint32_t max_offset = 0; + for (auto& kg : this->get_kernel_groups(programmable_core_type_index)) { + uint32_t offset = base_offset; + + for (int class_id = 0; class_id < DISPATCH_CLASS_MAX; class_id++) { + auto& optional_id = kg.kernel_ids[class_id]; + if (optional_id) { + const auto kernel = this->get_kernel(optional_id.value()); + std::vector const& binaries = kernel->binaries(device->build_key()); + // TODO: this is really ugly, save me future-HAL! 
+ if (programmable_core_type_index == hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX)) { + uint32_t binary_packed_size = kernel->get_binary_packed_size(device, 0); + + if (class_id == DISPATCH_CLASS_TENSIX_DM0) { + kg.kernel_bin_sizes[0] = binary_packed_size; + kg.kernel_text_offsets[0] = offset; + kg.launch_msg.kernel_config.kernel_text_offset[0] = offset; + offset += binary_packed_size; + offset = align(offset, l1_alignment); + } else if (class_id == DISPATCH_CLASS_TENSIX_DM1) { + kg.kernel_bin_sizes[1] = binary_packed_size; + kg.kernel_text_offsets[1] = offset; + kg.launch_msg.kernel_config.kernel_text_offset[1] = offset; + offset += binary_packed_size; + offset = align(offset, l1_alignment); + + uint32_t binary_text_size = kernel->get_binary_text_size(device, 0); + TT_ASSERT(binary_text_size >> 4 <= std::numeric_limits::max()); + kg.launch_msg.kernel_config.ncrisc_kernel_size16 = (binary_text_size + 15) >> 4; + } else { + constexpr uint32_t max_math_processors_count = 3; + for (uint32_t proc_type_index = 0; proc_type_index < max_math_processors_count; proc_type_index++) { + uint32_t binary_packed_size = kernel->get_binary_packed_size(device, proc_type_index); + kg.kernel_bin_sizes[2 + proc_type_index] = binary_packed_size; + kg.kernel_text_offsets[2 + proc_type_index] = offset; + kg.launch_msg.kernel_config.kernel_text_offset[2 + proc_type_index] = offset; + offset += binary_packed_size; + offset = align(offset, l1_alignment); + } + } + } else { + uint32_t binary_packed_size = kernel->get_binary_packed_size(device, 0); + kg.kernel_bin_sizes[class_id] = binary_packed_size; + + // No kernel config buffer on active eth yet + if (hal.get_programmable_core_type(kg.programmable_core_type_index) == + HalProgrammableCoreType::IDLE_ETH) { + kg.kernel_text_offsets[class_id] = offset; + kg.launch_msg.kernel_config.kernel_text_offset[class_id] = offset; + offset += binary_packed_size; + offset = align(offset, l1_alignment); + } else { + 
kg.kernel_text_offsets[class_id] = binaries[0]->get_text_addr(); + kg.launch_msg.kernel_config.kernel_text_offset[class_id] = binaries[0]->get_text_addr(); + } + } + } + } + + max_offset = std::max(offset, max_offset); + } + + this->program_configs_[programmable_core_type_index].kernel_text_offset = base_offset; + this->program_configs_[programmable_core_type_index].kernel_text_size = max_offset - base_offset; + + return max_offset; } uint32_t& detail::Program_::get_program_config_size(uint32_t programmable_core_type_index) { @@ -1238,13 +1427,13 @@ const std::vector &detail::Program_::determine_sub_device_ids(const for (const auto& kg : program_kgs) { for (size_t i = 0; i < device->num_sub_devices(); ++i) { const auto& sub_device_cores = device->worker_cores(core_type, SubDeviceId{i}); - auto intersection = sub_device_cores.intersection(kg->core_ranges); + auto intersection = sub_device_cores.intersection(kg.core_ranges); if (intersection.size() > 0) { used_sub_device_ids.insert(SubDeviceId{i}); num_intersections += intersection.num_cores(); } } - num_cores += kg->core_ranges.num_cores(); + num_cores += kg.core_ranges.num_cores(); } TT_FATAL(num_intersections == num_cores, "Kernel group cores do not match sub device cores for programmable core type {}", @@ -1273,6 +1462,15 @@ void detail::Program_::finalize(Device *device) { // Store the number of tensix "go signals" for use by CQ // CQ iterates over these to update runtime addresses, needs to know when eth begins (after tensix) // TODO: should store all the counts + this->tensix_go_signal_count_ = 0; + for (uint32_t index = 0; index < hal.get_programmable_core_type_count(); index++) { + CoreType core_type = hal.get_core_type(index); + if (core_type == CoreType::WORKER) { + for (auto& kg : this->get_kernel_groups(index)) { + this->tensix_go_signal_count_ += kg.core_ranges.size(); + } + } + } for (uint32_t index = 0; index < hal.get_programmable_core_type_count(); index++) { HalProgrammableCoreType 
programmable_core_type = static_cast(index); @@ -1309,33 +1507,6 @@ void detail::Program_::finalize(Device *device) { finalized_ = true; } -void Program::set_launch_msg_sem_offsets() { pimpl_->set_launch_msg_sem_offsets(); } -void Program::populate_dispatch_data(Device* device) { pimpl_->populate_dispatch_data(device); } - -void Program::generate_dispatch_commands(Device* device) { - bool is_cached = this->is_cached(); - uint64_t command_hash = device->build_key(); - if (not hal.is_coordinate_virtualization_enabled()) { - // When coordinate virtualization is not enabled, explicitly encode the device - // id into the command hash, to always assert on programs being reused across devices. - command_hash = (command_hash << 32) | (device->id()); - } - auto& cached_program_command_sequences = this->get_cached_program_command_sequences(); - if (!is_cached) { - auto sub_device_id = this->determine_sub_device_ids(device)[0]; - ProgramCommandSequence program_command_sequence; - program_utils::insert_empty_program_dispatch_preamble_cmd(program_command_sequence); - program_utils::insert_stall_cmds(program_command_sequence, sub_device_id, device); - program_utils::assemble_runtime_args_commands(program_command_sequence, *this, device); - program_utils::assemble_device_commands(program_command_sequence, *this, device, sub_device_id); - cached_program_command_sequences.insert({command_hash, std::move(program_command_sequence)}); - this->set_cached(); - } else { - auto cached_cmd_iter = cached_program_command_sequences.find(command_hash); - TT_FATAL(cached_cmd_iter != cached_program_command_sequences.end(), "Enqueueing a Program across devices with different cores harvested is not supported, unless coordinate virtualization is enabled (only enabled on Wormhole and above)."); - } -} - void Program::allocate_kernel_bin_buf_on_device(Device* device) { pimpl_->allocate_kernel_bin_buf_on_device(device); } void Program::finalize(Device *device) { pimpl_->finalize(device); } @@ -1642,11 
+1813,7 @@ std::shared_ptr Program::get_kernels_buffer(Device* device) const noexce return nullptr; } -void Program::set_kernels_bin_buffer(const std::shared_ptr& buffer) { - pimpl_->kernels_buffer_.insert({buffer->device()->id(), buffer}); -} - -std::vector &Program::get_program_config_sizes() const noexcept { return pimpl_->program_config_sizes_; } +const std::vector &Program::get_program_config_sizes() const noexcept { return pimpl_->program_config_sizes_; } std::unordered_map &Program::get_cached_program_command_sequences() noexcept { return pimpl_->cached_program_command_sequences_; diff --git a/tt_metal/impl/program/program.hpp b/tt_metal/impl/program/program.hpp index b237860be15..aee260a805a 100644 --- a/tt_metal/impl/program/program.hpp +++ b/tt_metal/impl/program/program.hpp @@ -12,7 +12,6 @@ #include "tt_metal/impl/buffers/semaphore.hpp" #include "tt_metal/impl/dispatch/program_command_sequence.hpp" #include "tt_metal/impl/program/program_device_map.hpp" -#include "tt_metal/impl/dispatch/worker_config_buffer.hpp" #include "dev_msgs.h" namespace tt { @@ -43,11 +42,6 @@ CBHandle CreateCircularBuffer( } // namespace experimental } // namespace v1 -namespace program_utils { - void assemble_device_commands( - ProgramCommandSequence& program_command_sequence, Program& program, Device* device, SubDeviceId sub_device_id); -} // namespace program_utils - class EnqueueProgramCommand; class HWCommandQueue; class JitBuildOptions; @@ -138,8 +132,7 @@ class Program { const std::vector< Semaphore > & semaphores() const; KernelGroup * kernels_on_core(const CoreCoord &core, uint32_t programmable_core_type_index); - std::vector>& get_kernel_groups(uint32_t programmable_core_type_index); - std::unordered_map>& get_kernels(uint32_t programmable_core_type_index); + std::vector& get_kernel_groups(uint32_t programmable_core_type_index); void add_buffer(std::shared_ptr buf); void release_buffers(); std::vector> circular_buffers_on_core(const CoreCoord &core) const; @@ -158,8 
+151,6 @@ class Program { void compile(Device * device, bool fd_bootloader_mode = false); - void generate_dispatch_commands(Device* device); - void invalidate_circular_buffer_allocation(); void allocate_circular_buffers(const Device *device); @@ -183,7 +174,7 @@ class Program { void set_last_used_command_queue_for_testing(HWCommandQueue *queue); const std::vector &determine_sub_device_ids(const Device *device); - void set_kernels_bin_buffer(const std::shared_ptr& buffer); + private: std::unique_ptr pimpl_; @@ -209,23 +200,20 @@ class Program { void add_semaphore(const CoreRangeSet & crs, uint32_t semaphore_id, uint32_t init_value, CoreType core_type); - void set_launch_msg_sem_offsets(); - void populate_dispatch_data(Device* device); - const ProgramTransferInfo &get_program_transfer_info() const noexcept; - std::shared_ptr get_kernels_buffer(Device* device) const noexcept; - std::vector &get_program_config_sizes() const noexcept; + friend void detail::AddConfigBuffer(Program &program, const std::shared_ptr& config_buffer); + bool runs_on_noc_unicast_only_cores(); bool runs_on_noc_multicast_only_cores(); - std::unordered_map &get_cached_program_command_sequences() noexcept; bool kernel_binary_always_stored_in_ringbuffer(); - friend void detail::AddConfigBuffer(Program &program, const std::shared_ptr& config_buffer); - friend void program_utils::assemble_device_commands( - ProgramCommandSequence& program_command_sequence, Program& program, Device* device, SubDeviceId sub_device_id); - friend HWCommandQueue; friend EnqueueProgramCommand; friend detail::Internal_; + + const ProgramTransferInfo &get_program_transfer_info() const noexcept; + std::shared_ptr get_kernels_buffer(Device* device) const noexcept; + const std::vector &get_program_config_sizes() const noexcept; + std::unordered_map &get_cached_program_command_sequences() noexcept; }; } // namespace v0 diff --git a/tt_metal/impl/program/program_dispatch_utils.cpp b/tt_metal/impl/program/program_dispatch_utils.cpp 
deleted file mode 100644 index c59357cdf26..00000000000 --- a/tt_metal/impl/program/program_dispatch_utils.cpp +++ /dev/null @@ -1,1472 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include "program_dispatch_utils.hpp" -#include "tt_metal/impl/dispatch/command_queue.hpp" -#include "tt_metal/impl/dispatch/data_collection.hpp" - -namespace tt::tt_metal { -namespace program_utils { - -enum DispatchWriteOffsets { - DISPATCH_WRITE_OFFSET_ZERO = 0, - DISPATCH_WRITE_OFFSET_TENSIX_L1_CONFIG_BASE = 1, - DISPATCH_WRITE_OFFSET_ETH_L1_CONFIG_BASE = 2, -}; - -uint32_t configure_rta_offsets_for_kernel_groups( - uint32_t programmable_core_type_index, - std::unordered_map>& kernels, - std::vector>& kernel_groups, - uint32_t base_offset) { - uint32_t processor_classes = hal.get_processor_classes_count(programmable_core_type_index); - std::vector max_rtas(processor_classes); - uint32_t max_unique_rta_size = 0; - uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); - - for (auto& kg : kernel_groups) { - for (std::size_t dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { - max_rtas[dispatch_class] = 0; - auto& optional_id = kg->kernel_ids[dispatch_class]; - if (optional_id) { - auto kernel = kernels.at(optional_id.value()); - for (const CoreRange& core_range : kg->core_ranges.ranges()) { - for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) { - for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { - CoreCoord core_coord(x, y); - max_rtas[dispatch_class] = - std::max(max_rtas[dispatch_class], (uint32_t)kernel->runtime_args(core_coord).size()); - } - } - } - } - } - uint32_t offset = 0; - for (std::size_t dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { - auto& optional_id = kg->kernel_ids[dispatch_class]; - kg->rta_sizes[dispatch_class] = max_rtas[dispatch_class] * sizeof(uint32_t); - if (optional_id) { - auto kernel = 
kernels.at(optional_id.value()); - kernel->set_runtime_args_count(kg->core_ranges, max_rtas[dispatch_class]); - kg->launch_msg.kernel_config.rta_offset[dispatch_class].rta_offset = base_offset + offset; - offset += max_rtas[dispatch_class] * sizeof(uint32_t); - } else { - kg->launch_msg.kernel_config.rta_offset[dispatch_class].rta_offset = 0; - } - } - kg->total_rta_size = offset; - offset = align(offset, l1_alignment); - max_unique_rta_size = std::max(offset, max_unique_rta_size); - } - return max_unique_rta_size; -} - -uint32_t configure_crta_offsets_for_kernel_groups( - uint32_t programmable_core_type_index, - std::unordered_map>& kernels, - std::vector>& kernel_groups, - uint32_t crta_base_offset, - std::array& crta_offsets, - std::array& crta_sizes) { - uint32_t processor_classes = hal.get_processor_classes_count(programmable_core_type_index); - std::vector max_crtas(processor_classes); - - for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { - max_crtas[dispatch_class] = 0; - } - // Find the max # common RTAs across all kernels for each dispatch class - for (auto& kernel_info : kernels) { - auto kernel = kernel_info.second; - uint32_t dispatch_class = kernel->dispatch_class(); - max_crtas[dispatch_class] = std::max(max_crtas[dispatch_class], (uint32_t)kernel->common_runtime_args().size()); - } - - // Derive crta offsets and sizes per dispatch class - uint32_t offset = 0; - uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); - for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { - uint32_t size = max_crtas[dispatch_class] * sizeof(uint32_t); - crta_offsets[dispatch_class] = crta_base_offset + offset; - crta_sizes[dispatch_class] = size; - offset += size; - offset = align(offset, l1_alignment); - } - uint32_t total_crta_size = offset; - - // Set the runtime_args_data sizing info based on the shared max - for (auto& kernel_info : kernels) { - auto kernel = kernel_info.second; - uint32_t 
dispatch_class = kernel->dispatch_class(); - kernel->set_common_runtime_args_count(max_crtas[dispatch_class]); - } - // Set the kernel group common runtime arg offsets use in the launch message - for (auto& kg : kernel_groups) { - for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { - kg->launch_msg.kernel_config.rta_offset[dispatch_class].crta_offset = crta_offsets[dispatch_class]; - } - } - return total_crta_size; -} - -uint32_t finalize_rt_args( - std::unordered_map>& kernels, - std::vector>& kernel_groups, - uint32_t base_offset, - uint32_t programmable_core_type_index, - uint32_t& rta_offset, - std::array& crta_offsets, - std::array& crta_sizes) { - CoreType core_type = hal.get_core_type(programmable_core_type_index); - HalProgrammableCoreType programmable_core_type = hal.get_programmable_core_type(programmable_core_type_index); - - uint32_t max_unique_rta_size = program_utils::configure_rta_offsets_for_kernel_groups( - programmable_core_type_index, kernels, kernel_groups, base_offset); - uint32_t crta_base_offset = base_offset + max_unique_rta_size; - uint32_t total_crta_size = program_utils::configure_crta_offsets_for_kernel_groups( - programmable_core_type_index, kernels, kernel_groups, crta_base_offset, crta_offsets, crta_sizes); - - uint32_t offset = max_unique_rta_size + total_crta_size; - // TODO: this is asserted here as the leveling above can break the limits enforced by the API - // Once we use a ring buffer, memory space will be dynamic and this assert won't matter - std::uint32_t l1_kernel_config_size = tt::tt_metal::hal.get_dev_size( - tt::tt_metal::HalProgrammableCoreType::TENSIX, tt::tt_metal::HalL1MemAddrType::KERNEL_CONFIG); - TT_FATAL(offset <= l1_kernel_config_size, "offset {} cannot exceed config size {}", offset, l1_kernel_config_size); - - rta_offset = base_offset; - return offset; -} - -uint32_t finalize_sems( - uint32_t programmable_core_type_index, - uint32_t sem_base_offset, - const std::vector& 
semaphores, - uint32_t& semaphore_offset, - uint32_t& semaphore_size) { - int max_id = -1; - CoreType core_type = hal.get_core_type(programmable_core_type_index); - for (const auto& sem : semaphores) { - if (sem.core_type() == core_type && (int)sem.id() > max_id) { - max_id = sem.id(); - } - } - uint32_t sem_size = (max_id + 1) * hal.get_alignment(HalMemType::L1); - semaphore_offset = sem_base_offset; - semaphore_size = sem_size; - return sem_base_offset + sem_size; -} - -uint32_t finalize_cbs( - uint32_t programmable_core_type_index, - std::vector>& kernel_groups, - uint32_t base_offset, - uint32_t& cb_offset, - uint32_t& cb_size, - uint32_t& local_cb_size) { - uint32_t max_local_end_index = 0; - uint32_t min_remote_start_index = NUM_CIRCULAR_BUFFERS; - - for (auto& kg : kernel_groups) { - max_local_end_index = - std::max(max_local_end_index, (uint32_t)kg->launch_msg.kernel_config.max_local_cb_end_index); - min_remote_start_index = - std::min(min_remote_start_index, (uint32_t)kg->launch_msg.kernel_config.min_remote_cb_start_index); - } - - local_cb_size = max_local_end_index * UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * sizeof(uint32_t); - uint32_t remote_cb_offset = base_offset + local_cb_size; - for (auto& kg : kernel_groups) { - kg->launch_msg.kernel_config.local_cb_offset = base_offset; - kg->launch_msg.kernel_config.remote_cb_offset = remote_cb_offset; - } - - uint32_t remote_cb_size = (NUM_CIRCULAR_BUFFERS - min_remote_start_index) * - UINT32_WORDS_PER_REMOTE_CIRCULAR_BUFFER_CONFIG * sizeof(uint32_t); - uint32_t total_cb_size = local_cb_size + remote_cb_size; - cb_offset = base_offset; - cb_size = total_cb_size; - - return align(base_offset + total_cb_size, hal.get_alignment(HalMemType::L1)); -} - -uint32_t finalize_kernel_bins( - Device* device, - uint32_t programmable_core_type_index, - const std::unordered_map>& kernels, - std::vector>& kernel_groups, - uint32_t base_offset, - uint32_t& kernel_text_offset, - uint32_t& kernel_text_size) { - uint32_t 
l1_alignment = hal.get_alignment(HalMemType::L1); - - uint32_t max_offset = 0; - for (auto& kg : kernel_groups) { - uint32_t offset = base_offset; - - for (int class_id = 0; class_id < DISPATCH_CLASS_MAX; class_id++) { - auto& optional_id = kg->kernel_ids[class_id]; - if (optional_id) { - const auto kernel = kernels.at(optional_id.value()); - const std::vector& binaries = kernel->binaries(device->build_key()); - // TODO: this is really ugly, save me future-HAL! - if (programmable_core_type_index == - hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX)) { - uint32_t binary_packed_size = kernel->get_binary_packed_size(device, 0); - - if (class_id == DISPATCH_CLASS_TENSIX_DM0) { - kg->kernel_bin_sizes[0] = binary_packed_size; - kg->kernel_text_offsets[0] = offset; - kg->launch_msg.kernel_config.kernel_text_offset[0] = offset; - offset += binary_packed_size; - offset = align(offset, l1_alignment); - } else if (class_id == DISPATCH_CLASS_TENSIX_DM1) { - kg->kernel_bin_sizes[1] = binary_packed_size; - kg->kernel_text_offsets[1] = offset; - kg->launch_msg.kernel_config.kernel_text_offset[1] = offset; - offset += binary_packed_size; - offset = align(offset, l1_alignment); - - uint32_t binary_text_size = kernel->get_binary_text_size(device, 0); - TT_ASSERT(binary_text_size >> 4 <= std::numeric_limits::max()); - kg->launch_msg.kernel_config.ncrisc_kernel_size16 = (binary_text_size + 15) >> 4; - } else { - constexpr uint32_t max_math_processors_count = 3; - for (uint32_t proc_type_index = 0; proc_type_index < max_math_processors_count; - proc_type_index++) { - uint32_t binary_packed_size = kernel->get_binary_packed_size(device, proc_type_index); - kg->kernel_bin_sizes[2 + proc_type_index] = binary_packed_size; - kg->kernel_text_offsets[2 + proc_type_index] = offset; - kg->launch_msg.kernel_config.kernel_text_offset[2 + proc_type_index] = offset; - offset += binary_packed_size; - offset = align(offset, l1_alignment); - } - } - } else { - uint32_t 
binary_packed_size = kernel->get_binary_packed_size(device, 0); - kg->kernel_bin_sizes[class_id] = binary_packed_size; - - // No kernel config buffer on active eth yet - if (hal.get_programmable_core_type(kg->programmable_core_type_index) == - HalProgrammableCoreType::IDLE_ETH) { - kg->kernel_text_offsets[class_id] = offset; - kg->launch_msg.kernel_config.kernel_text_offset[class_id] = offset; - offset += binary_packed_size; - offset = align(offset, l1_alignment); - } else { - kg->kernel_text_offsets[class_id] = binaries[0]->get_text_addr(); - kg->launch_msg.kernel_config.kernel_text_offset[class_id] = binaries[0]->get_text_addr(); - } - } - } - } - - max_offset = std::max(offset, max_offset); - } - kernel_text_offset = base_offset; - kernel_text_size = max_offset - base_offset; - return max_offset; -} - -uint32_t get_packed_write_max_unicast_sub_cmds(Device* device) { - return device->compute_with_storage_grid_size().x * device->compute_with_storage_grid_size().y; -} - -void insert_empty_program_dispatch_preamble_cmd(ProgramCommandSequence& program_command_sequence) { - // Initialize an empty preamble command in the Program Dispatch Cmd Sequence, which will be - // updated with the correct write offsets when the program is enqueued - uint32_t preamble_cmd_sizeB = hal.get_alignment(HalMemType::HOST); - program_command_sequence.preamble_command_sequence = HostMemDeviceCommand(preamble_cmd_sizeB); - program_command_sequence.preamble_command_sequence.add_dispatch_set_write_offsets(0, 0, 0); -} - -void insert_stall_cmds(ProgramCommandSequence& program_command_sequence, SubDeviceId sub_device_id, Device* device) { - // Initialize stall command sequences for this program. 
- auto dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); - uint32_t dispatch_message_addr = - dispatch_constants::get(dispatch_core_type) - .get_device_command_queue_addr(CommandQueueDeviceAddrType::DISPATCH_MESSAGE) + - dispatch_constants::get(dispatch_core_type).get_dispatch_message_offset(sub_device_id.to_index()); - uint32_t uncached_stall_cmd_sizeB = hal.get_alignment(HalMemType::HOST) + hal.get_alignment(HalMemType::HOST); - uint32_t cached_stall_cmd_seqB = hal.get_alignment(HalMemType::HOST); - - program_command_sequence.stall_command_sequences[UncachedStallSequenceIdx] = - HostMemDeviceCommand(uncached_stall_cmd_sizeB); - // Empty wait command initialized here. Will get updated when program is enqueued. - program_command_sequence.stall_command_sequences[UncachedStallSequenceIdx].add_dispatch_wait_with_prefetch_stall( - true, dispatch_message_addr, 0); - // Empty wait command initialized here. Will get updated when program is enqueued. - program_command_sequence.stall_command_sequences[CachedStallSequenceIdx] = - HostMemDeviceCommand(cached_stall_cmd_seqB); - program_command_sequence.stall_command_sequences[CachedStallSequenceIdx].add_dispatch_wait( - false, dispatch_message_addr, 0); -} - -template -uint32_t get_max_write_packed_sub_cmds( - uint32_t data_size, uint32_t max_prefetch_cmd_size, uint32_t packed_write_max_unicast_sub_cmds, bool no_stride) { - static_assert( - std::is_same::value or - std::is_same::value); - constexpr bool is_unicast = std::is_same::value; - uint32_t sub_cmd_sizeB = - is_unicast ? sizeof(CQDispatchWritePackedUnicastSubCmd) : sizeof(CQDispatchWritePackedMulticastSubCmd); - // Approximate calculation due to alignment - uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); - uint32_t max_prefetch_size = max_prefetch_cmd_size - sizeof(CQPrefetchCmd) - hal.get_alignment(HalMemType::HOST) - - sizeof(CQDispatchCmd) - l1_alignment; - uint32_t max_prefetch_num_packed_cmds = - no_stride ? 
(max_prefetch_size - align(data_size * sizeof(uint32_t), l1_alignment)) / sub_cmd_sizeB - : max_prefetch_size / (align(data_size * sizeof(uint32_t), l1_alignment) + sub_cmd_sizeB); - - uint32_t packed_write_max_multicast_sub_cmds = - get_packed_write_max_multicast_sub_cmds(packed_write_max_unicast_sub_cmds); - return std::min( - max_prefetch_num_packed_cmds, - is_unicast ? packed_write_max_unicast_sub_cmds : packed_write_max_multicast_sub_cmds); -}; - -template -void generate_runtime_args_cmds( - std::vector& runtime_args_command_sequences, - const uint32_t& l1_arg_base_addr, - const std::vector& sub_cmds, - const std::vector>>& rt_data_and_sizes, - const uint32_t& max_runtime_args_len, - std::vector>>& rt_args_data, - const uint32_t max_prefetch_command_size, - const uint32_t packed_write_max_unicast_sub_cmds, - bool no_stride, - enum DispatchWriteOffsets write_offset_index) { - static_assert( - std::is_same::value or - std::is_same::value); - - thread_local static auto get_runtime_payload_sizeB = - [](uint32_t num_packed_cmds, uint32_t runtime_args_len, bool is_unicast, bool no_stride) { - uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); - uint32_t sub_cmd_sizeB = - is_unicast ? sizeof(CQDispatchWritePackedUnicastSubCmd) : sizeof(CQDispatchWritePackedMulticastSubCmd); - uint32_t dispatch_cmd_sizeB = sizeof(CQDispatchCmd) + align(num_packed_cmds * sub_cmd_sizeB, l1_alignment); - uint32_t aligned_runtime_data_sizeB = - (no_stride ? 1 : num_packed_cmds) * align(runtime_args_len * sizeof(uint32_t), l1_alignment); - return dispatch_cmd_sizeB + aligned_runtime_data_sizeB; - }; - thread_local static auto get_runtime_args_data_offset = - [](uint32_t num_packed_cmds, uint32_t runtime_args_len, bool is_unicast) { - uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); - uint32_t sub_cmd_sizeB = - is_unicast ? 
sizeof(CQDispatchWritePackedUnicastSubCmd) : sizeof(CQDispatchWritePackedMulticastSubCmd); - uint32_t dispatch_cmd_sizeB = sizeof(CQDispatchCmd) + align(num_packed_cmds * sub_cmd_sizeB, l1_alignment); - return sizeof(CQPrefetchCmd) + dispatch_cmd_sizeB; - }; - - constexpr bool unicast = std::is_same::value; - - uint32_t num_packed_cmds_in_seq = sub_cmds.size(); - uint32_t max_packed_cmds = get_max_write_packed_sub_cmds( - max_runtime_args_len, max_prefetch_command_size, packed_write_max_unicast_sub_cmds, no_stride); - uint32_t offset_idx = 0; - if (no_stride) { - TT_FATAL( - max_packed_cmds >= num_packed_cmds_in_seq, - "num_packed_cmds_in_seq {} cannot exceed max_packed_cmds {} when no_stride is true", - num_packed_cmds_in_seq, - max_packed_cmds); - } - uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST); - uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); - while (num_packed_cmds_in_seq != 0) { - // Generate the device command - uint32_t num_packed_cmds = std::min(num_packed_cmds_in_seq, max_packed_cmds); - uint32_t rt_payload_sizeB = - get_runtime_payload_sizeB(num_packed_cmds, max_runtime_args_len, unicast, no_stride); - uint32_t cmd_sequence_sizeB = align(sizeof(CQPrefetchCmd) + rt_payload_sizeB, pcie_alignment); - runtime_args_command_sequences.emplace_back(cmd_sequence_sizeB); - runtime_args_command_sequences.back().add_dispatch_write_packed( - num_packed_cmds, - l1_arg_base_addr, - max_runtime_args_len * sizeof(uint32_t), - rt_payload_sizeB, - sub_cmds, - rt_data_and_sizes, - packed_write_max_unicast_sub_cmds, - offset_idx, - no_stride, - write_offset_index); - - // Update kernel RTA pointers to point into the generated command - // Future RTA updates through the API will update the command sequence directly - uint32_t data_offset = (uint32_t)get_runtime_args_data_offset(num_packed_cmds, max_runtime_args_len, unicast); - const uint32_t data_inc = align(max_runtime_args_len * sizeof(uint32_t), l1_alignment); - uint32_t num_data_copies = 
no_stride ? 1 : num_packed_cmds; - for (uint32_t i = offset_idx; i < offset_idx + num_data_copies; ++i) { - uint32_t offset = 0; - for (auto& data : rt_args_data[i]) { - data.get().rt_args_data = - (uint32_t*)((char*)runtime_args_command_sequences.back().data() + data_offset + offset); - offset += data.get().rt_args_count * sizeof(uint32_t); - } - data_offset += data_inc; - } - num_packed_cmds_in_seq -= num_packed_cmds; - offset_idx += num_packed_cmds; - } -} - -void assemble_runtime_args_commands( - ProgramCommandSequence& program_command_sequence, Program& program, Device* device) { - static const uint32_t packed_write_max_unicast_sub_cmds = get_packed_write_max_unicast_sub_cmds(device); - NOC noc_index = dispatch_downstream_noc; - CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); - const uint32_t max_prefetch_command_size = dispatch_constants::get(dispatch_core_type).max_prefetch_command_size(); - - // Dispatch Commands to Unicast Unique Runtime Args to Workers - std::vector unique_sub_cmds; - std::vector>> unique_rt_data_and_sizes; - std::vector>> unique_rt_args_data; - // Dispatch Commands to Multicast Common Runtime Args to Workers - std::variant, std::vector> - common_sub_cmds; - std::vector>> common_rt_data_and_sizes; - std::vector>> common_rt_args_data; - - program_command_sequence.runtime_args_command_sequences = {}; - uint32_t command_count = 0; - - // Unique RTAs - for (uint32_t programmable_core_type_index = 0; - programmable_core_type_index < hal.get_programmable_core_type_count(); - programmable_core_type_index++) { - if (hal.get_programmable_core_type(programmable_core_type_index) == HalProgrammableCoreType::IDLE_ETH) { - // Fast dispatch not supported on IDLE_ETH yet - continue; - } - for (auto& kg : program.get_kernel_groups(programmable_core_type_index)) { - if (kg->total_rta_size != 0) { - uint32_t num_sub_cmds = kg->core_ranges.num_cores(); - uint32_t max_runtime_args_len = kg->total_rta_size 
/ sizeof(uint32_t); - uint32_t max_packed_cmds = get_max_write_packed_sub_cmds( - max_runtime_args_len, max_prefetch_command_size, packed_write_max_unicast_sub_cmds, false); - command_count += div_up(num_sub_cmds, max_packed_cmds); - } - } - } - // Common RTAs - for (size_t kernel_id = 0; kernel_id < program.num_kernels(); kernel_id++) { - auto kernel = detail::GetKernel(program, kernel_id); - auto programmable_core_type = kernel->get_kernel_programmable_core_type(); - if (programmable_core_type == HalProgrammableCoreType::IDLE_ETH) { - // Fast dispatch not supported on IDLE_ETH yet - continue; - } - uint32_t programmable_core_type_index = hal.get_programmable_core_type_index(programmable_core_type); - uint32_t common_size = - program.get_program_config(programmable_core_type_index).crta_sizes[kernel->dispatch_class()]; - if (common_size != 0) { - uint32_t max_runtime_args_len = common_size / sizeof(uint32_t); - const auto& common_rt_args = kernel->common_runtime_args(); - if (common_rt_args.size() > 0) { - CoreType core_type = hal.get_core_type(programmable_core_type_index); - if (core_type == CoreType::ETH) { - uint32_t num_sub_cmds = kernel->logical_cores().size(); - uint32_t max_packed_cmds = get_max_write_packed_sub_cmds( - max_runtime_args_len, max_prefetch_command_size, packed_write_max_unicast_sub_cmds, true); - command_count += div_up(num_sub_cmds, max_packed_cmds); - } else { - uint32_t num_sub_cmds = kernel->logical_coreranges().size(); - uint32_t max_packed_cmds = get_max_write_packed_sub_cmds( - max_runtime_args_len, max_prefetch_command_size, packed_write_max_unicast_sub_cmds, true); - command_count += div_up(num_sub_cmds, max_packed_cmds); - } - } - } - } - - program_command_sequence.runtime_args_command_sequences.reserve(command_count); - for (uint32_t index = 0; index < hal.get_programmable_core_type_count(); index++) { - if (hal.get_programmable_core_type(index) == HalProgrammableCoreType::IDLE_ETH) { - // Fast dispatch not supported on IDLE_ETH 
yet - // TODO: can't just loop here as code below confuses ACTIVE/IDLE - continue; - } - CoreType core_type = hal.get_core_type(index); - uint32_t processor_classes = hal.get_processor_classes_count(index); - - for (auto& kg : program.get_kernel_groups(index)) { - if (kg->total_rta_size != 0) { - for (const CoreRange& core_range : kg->core_ranges.ranges()) { - for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) { - for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { - CoreCoord core_coord(x, y); - - unique_rt_args_data.resize(unique_rt_args_data.size() + 1); - unique_rt_data_and_sizes.resize(unique_rt_data_and_sizes.size() + 1); - for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { - auto& optional_id = kg->kernel_ids[dispatch_class]; - if (optional_id) { - auto device_local_kernel_handle = - get_device_local_kernel_handle(optional_id.value()); - auto kernel = detail::GetKernel(program, device_local_kernel_handle); - if (!kernel->cores_with_runtime_args().empty()) { - const auto& runtime_args_data = kernel->runtime_args(core_coord); - unique_rt_args_data.back().emplace_back(kernel->runtime_args_data(core_coord)); - TT_ASSERT( - runtime_args_data.size() * sizeof(uint32_t) <= - kg->rta_sizes[dispatch_class]); - unique_rt_data_and_sizes.back().emplace_back( - runtime_args_data.data(), - runtime_args_data.size() * sizeof(uint32_t), - kg->rta_sizes[dispatch_class]); - } - } - } - CoreCoord virtual_core = device->virtual_core_from_logical_core(core_coord, core_type); - unique_sub_cmds.emplace_back(CQDispatchWritePackedUnicastSubCmd{ - .noc_xy_addr = device->get_noc_unicast_encoding(noc_index, virtual_core)}); - } - } - } - uint32_t rta_offset = program.get_program_config(index).rta_offset; - generate_runtime_args_cmds( - program_command_sequence.runtime_args_command_sequences, - rta_offset, - unique_sub_cmds, - unique_rt_data_and_sizes, - kg->total_rta_size / sizeof(uint32_t), - 
unique_rt_args_data, - max_prefetch_command_size, - packed_write_max_unicast_sub_cmds, - false, - core_type == CoreType::WORKER ? DISPATCH_WRITE_OFFSET_TENSIX_L1_CONFIG_BASE - : DISPATCH_WRITE_OFFSET_ETH_L1_CONFIG_BASE); - for (auto& data_per_kernel : unique_rt_data_and_sizes) { - for (auto& data_and_sizes : data_per_kernel) { - RecordDispatchData(program, DISPATCH_DATA_RTARGS, std::get<1>(data_and_sizes)); - } - } - unique_sub_cmds.clear(); - unique_rt_data_and_sizes.clear(); - unique_rt_args_data.clear(); - } - } - - for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { - uint32_t common_size = program.get_program_config(index).crta_sizes[dispatch_class]; - if (common_size == 0) { - continue; - } - for (size_t kernel_id = 0; kernel_id < program.num_kernels(); kernel_id++) { - auto kernel = detail::GetKernel(program, kernel_id); - if (kernel->get_kernel_core_type() != core_type) { - continue; // TODO: fixme, need list of kernels by core_typexdispatch_class - } - if (kernel->dispatch_class() != dispatch_class) { - continue; // TODO: fixme, need list of kernels by core_typexdispatch_class - } - - const auto& common_rt_args = kernel->common_runtime_args(); - if (common_rt_args.size() > 0) { - common_rt_args_data.resize(common_rt_args_data.size() + 1); - common_rt_data_and_sizes.resize(common_rt_data_and_sizes.size() + 1); - - TT_ASSERT(kernel->common_runtime_args_data().size() * sizeof(uint32_t) == common_size); - TT_ASSERT(common_rt_args.size() * sizeof(uint32_t) <= common_size); - common_rt_data_and_sizes.back().emplace_back( - common_rt_args.data(), common_rt_args.size() * sizeof(uint32_t), common_size); - common_rt_args_data.back().emplace_back(kernel->common_runtime_args_data()); - - if (core_type == CoreType::ETH) { - common_sub_cmds.emplace>( - std::vector()); - auto& unicast_sub_cmd = - std::get>(common_sub_cmds); - unicast_sub_cmd.reserve(kernel->logical_cores().size()); - for (auto& core_coord : kernel->logical_cores()) { - // 
can make a vector of unicast encodings here - CoreCoord virtual_core_coords = - device->virtual_core_from_logical_core(core_coord, CoreType::ETH); - unicast_sub_cmd.emplace_back(CQDispatchWritePackedUnicastSubCmd{ - .noc_xy_addr = device->get_noc_unicast_encoding(noc_index, virtual_core_coords)}); - } - } else { - std::vector> dst_noc_multicast_info = - device->extract_dst_noc_multicast_info( - kernel->logical_coreranges(), core_type); - common_sub_cmds.emplace>( - std::vector()); - auto& multicast_sub_cmd = - std::get>(common_sub_cmds); - multicast_sub_cmd.reserve(dst_noc_multicast_info.size()); - for (const auto& mcast_dests : dst_noc_multicast_info) { - multicast_sub_cmd.emplace_back(CQDispatchWritePackedMulticastSubCmd{ - .noc_xy_addr = device->get_noc_multicast_encoding( - noc_index, std::get(mcast_dests.first)), - .num_mcast_dests = mcast_dests.second}); - } - } - } - } - - uint32_t crta_offset = program.get_program_config(index).crta_offsets[dispatch_class]; - - // Common rtas are always expected to fit in one prefetch cmd - // TODO: use a linear write instead of a packed-write - std::visit( - [&](auto&& sub_cmds) { - generate_runtime_args_cmds( - program_command_sequence.runtime_args_command_sequences, - crta_offset, - sub_cmds, - common_rt_data_and_sizes, - common_size / sizeof(uint32_t), - common_rt_args_data, - max_prefetch_command_size, - packed_write_max_unicast_sub_cmds, - true, - core_type == CoreType::WORKER ? 
DISPATCH_WRITE_OFFSET_TENSIX_L1_CONFIG_BASE - : DISPATCH_WRITE_OFFSET_ETH_L1_CONFIG_BASE); - sub_cmds.clear(); - }, - common_sub_cmds); - - for (auto& data_per_kernel : common_rt_data_and_sizes) { - for (auto& data_and_sizes : data_per_kernel) { - RecordDispatchData(program, DISPATCH_DATA_RTARGS, std::get<1>(data_and_sizes)); - } - } - common_rt_data_and_sizes.clear(); - common_rt_args_data.clear(); - } - } - TT_ASSERT( - command_count >= program_command_sequence.runtime_args_command_sequences.size(), - "Incorrect number of commands reserved {}, final size {}. Vector reallocation causes cached addresses to be " - "incorrect.", - command_count, - program_command_sequence.runtime_args_command_sequences.size()); - - uint32_t runtime_args_fetch_size_bytes = 0; - for (const auto& cmds : program_command_sequence.runtime_args_command_sequences) { - // BRISC, NCRISC, TRISC... - runtime_args_fetch_size_bytes += cmds.size_bytes(); - } - program_command_sequence.runtime_args_fetch_size_bytes = runtime_args_fetch_size_bytes; -} - -template -uint32_t insert_write_packed_payloads( - const uint32_t num_sub_cmds, - const uint32_t sub_cmd_sizeB, - const uint32_t max_prefetch_command_size, - const uint32_t packed_write_max_unicast_sub_cmds, - std::vector>& packed_cmd_payloads) { - uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); - const uint32_t aligned_sub_cmd_sizeB = align(sub_cmd_sizeB, l1_alignment); - const uint32_t max_packed_sub_cmds_per_cmd = get_max_write_packed_sub_cmds( - aligned_sub_cmd_sizeB, max_prefetch_command_size, packed_write_max_unicast_sub_cmds, false); - uint32_t rem_num_sub_cmds = num_sub_cmds; - uint32_t cmd_payload_sizeB = 0; - uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST); - while (rem_num_sub_cmds != 0) { - const uint32_t num_sub_cmds_in_cmd = std::min(max_packed_sub_cmds_per_cmd, rem_num_sub_cmds); - const uint32_t aligned_data_sizeB = aligned_sub_cmd_sizeB * num_sub_cmds_in_cmd; - const uint32_t dispatch_cmd_sizeB = - 
align(sizeof(CQDispatchCmd) + num_sub_cmds_in_cmd * sizeof(PackedSubCmd), l1_alignment); - packed_cmd_payloads.emplace_back(num_sub_cmds_in_cmd, dispatch_cmd_sizeB + aligned_data_sizeB); - cmd_payload_sizeB += align(sizeof(CQPrefetchCmd) + packed_cmd_payloads.back().second, pcie_alignment); - rem_num_sub_cmds -= num_sub_cmds_in_cmd; - } - return cmd_payload_sizeB; -} - -void assemble_device_commands( - ProgramCommandSequence& program_command_sequence, Program& program, Device* device, SubDeviceId sub_device_id) { - uint32_t cmd_sequence_sizeB = 0; - CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); - NOC noc_index = dispatch_downstream_noc; - const uint32_t max_prefetch_command_size = dispatch_constants::get(dispatch_core_type).max_prefetch_command_size(); - static const uint32_t packed_write_max_unicast_sub_cmds = get_packed_write_max_unicast_sub_cmds(device); - const auto& program_transfer_info = program.get_program_transfer_info(); - // Multicast Semaphore Cmd - uint32_t num_multicast_semaphores = program_transfer_info.multicast_semaphores.size(); - std::vector> multicast_sem_sub_cmds(num_multicast_semaphores); - std::vector>> multicast_sem_data(num_multicast_semaphores); - std::vector>> multicast_sem_payload(num_multicast_semaphores); - std::vector> multicast_sem_dst_size; - multicast_sem_dst_size.reserve(num_multicast_semaphores); - - if (num_multicast_semaphores > 0) { - uint32_t i = 0; - for (const auto& [dst, transfer_info_vec] : program_transfer_info.multicast_semaphores) { - // TODO: loop over things inside transfer_info[i] - uint32_t write_packed_len = transfer_info_vec[0].data.size(); - multicast_sem_dst_size.emplace_back(std::make_pair(dst, write_packed_len * sizeof(uint32_t))); - - for (const auto& transfer_info : transfer_info_vec) { - for (const auto& dst_noc_info : transfer_info.dst_noc_info) { - TT_ASSERT( - transfer_info.data.size() == write_packed_len, - "Not all data std::vectors in write 
packed semaphore cmd equal in len"); - multicast_sem_sub_cmds[i].emplace_back(CQDispatchWritePackedMulticastSubCmd{ - .noc_xy_addr = - device->get_noc_multicast_encoding(noc_index, std::get(dst_noc_info.first)), - .num_mcast_dests = dst_noc_info.second}); - multicast_sem_data[i].emplace_back( - transfer_info.data.data(), transfer_info.data.size() * sizeof(uint32_t)); - } - } - cmd_sequence_sizeB += insert_write_packed_payloads( - multicast_sem_sub_cmds[i].size(), - multicast_sem_dst_size.back().second, - max_prefetch_command_size, - packed_write_max_unicast_sub_cmds, - multicast_sem_payload[i]); - i++; - } - } - - // Unicast Semaphore Cmd - uint32_t num_unicast_semaphores = program_transfer_info.unicast_semaphores.size(); - std::vector> unicast_sem_sub_cmds(num_unicast_semaphores); - std::vector>> unicast_sem_data(num_unicast_semaphores); - std::vector>> unicast_sem_payload(num_unicast_semaphores); - std::vector> unicast_sem_dst_size; - unicast_sem_dst_size.reserve(num_unicast_semaphores); - if (num_unicast_semaphores > 0) { - uint32_t i = 0; - for (const auto& [dst, transfer_info_vec] : program_transfer_info.unicast_semaphores) { - // TODO: loop over things inside transfer_info[i] - uint32_t write_packed_len = transfer_info_vec[0].data.size(); - unicast_sem_dst_size.emplace_back(std::make_pair(dst, write_packed_len * sizeof(uint32_t))); - - for (const auto& transfer_info : transfer_info_vec) { - for (const auto& dst_noc_info : transfer_info.dst_noc_info) { - TT_ASSERT( - transfer_info.data.size() == write_packed_len, - "Not all data std::vectors in write packed semaphore cmd equal in len"); - unicast_sem_sub_cmds[i].emplace_back(CQDispatchWritePackedUnicastSubCmd{ - .noc_xy_addr = - device->get_noc_unicast_encoding(noc_index, std::get(dst_noc_info.first))}); - unicast_sem_data[i].emplace_back( - transfer_info.data.data(), transfer_info.data.size() * sizeof(uint32_t)); - } - } - cmd_sequence_sizeB += insert_write_packed_payloads( - unicast_sem_sub_cmds[i].size(), - 
unicast_sem_dst_size.back().second, - max_prefetch_command_size, - packed_write_max_unicast_sub_cmds, - unicast_sem_payload[i]); - i++; - } - } - - uint32_t index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); - - const auto& circular_buffers_unique_coreranges = program.circular_buffers_unique_coreranges(); - const uint16_t num_multicast_cb_sub_cmds = circular_buffers_unique_coreranges.size(); - std::vector> mcast_cb_payload; - uint16_t cb_config_size_bytes = 0; - uint32_t aligned_cb_config_size_bytes = 0; - std::vector> cb_config_payloads( - num_multicast_cb_sub_cmds, - std::vector(program.get_program_config(index).cb_size / sizeof(uint32_t), 0)); - std::vector multicast_cb_config_sub_cmds; - std::vector> multicast_cb_config_data; - if (num_multicast_cb_sub_cmds > 0) { - multicast_cb_config_sub_cmds.reserve(num_multicast_cb_sub_cmds); - multicast_cb_config_data.reserve(num_multicast_cb_sub_cmds); - program_command_sequence.circular_buffers_on_core_ranges.resize(num_multicast_cb_sub_cmds); - uint32_t i = 0; - uint32_t max_overall_index = 0; - uint32_t remote_offset_index = program.get_program_config(index).local_cb_size / sizeof(uint32_t); - for (const CoreRange& core_range : circular_buffers_unique_coreranges) { - const CoreCoord virtual_start = - device->virtual_core_from_logical_core(core_range.start_coord, CoreType::WORKER); - const CoreCoord virtual_end = - device->virtual_core_from_logical_core(core_range.end_coord, CoreType::WORKER); - - const uint32_t num_receivers = core_range.size(); - auto& cb_config_payload = cb_config_payloads[i]; - uint32_t max_index = 0; - const auto& circular_buffers_on_corerange = program.circular_buffers_on_corerange(core_range); - program_command_sequence.circular_buffers_on_core_ranges[i].reserve(circular_buffers_on_corerange.size()); - for (const std::shared_ptr& cb : circular_buffers_on_corerange) { - program_command_sequence.circular_buffers_on_core_ranges[i].emplace_back(cb); - const uint32_t 
cb_address = cb->address(); - const uint32_t cb_size = cb->size(); - for (const auto& buffer_index : cb->local_buffer_indices()) { - // 1 cmd for all 32 buffer indices, populate with real data for specified indices - // cb config payload - const uint32_t base_index = UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * buffer_index; - cb_config_payload[base_index] = cb_address; - cb_config_payload[base_index + 1] = cb_size; - cb_config_payload[base_index + 2] = cb->num_pages(buffer_index); - cb_config_payload[base_index + 3] = cb->page_size(buffer_index); - max_index = std::max(max_index, base_index + UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG); - } - for (const auto& buffer_index : cb->remote_buffer_indices()) { - const uint32_t base_index = - remote_offset_index + - (NUM_CIRCULAR_BUFFERS - 1 - buffer_index) * UINT32_WORDS_PER_REMOTE_CIRCULAR_BUFFER_CONFIG; - cb_config_payload[base_index] = cb->config_address(); - cb_config_payload[base_index + 1] = cb->page_size(buffer_index); - max_index = std::max(max_index, base_index + UINT32_WORDS_PER_REMOTE_CIRCULAR_BUFFER_CONFIG); - } - } - multicast_cb_config_sub_cmds.emplace_back(CQDispatchWritePackedMulticastSubCmd{ - .noc_xy_addr = device->get_noc_multicast_encoding(noc_index, CoreRange(virtual_start, virtual_end)), - .num_mcast_dests = (uint32_t)core_range.size()}); - multicast_cb_config_data.emplace_back(cb_config_payload.data(), max_index * sizeof(uint32_t)); - max_overall_index = std::max(max_overall_index, max_index); - i++; - } - uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); - cb_config_size_bytes = max_overall_index * sizeof(uint32_t); - aligned_cb_config_size_bytes = align(cb_config_size_bytes, l1_alignment); - cmd_sequence_sizeB += insert_write_packed_payloads( - num_multicast_cb_sub_cmds, - cb_config_size_bytes, - max_prefetch_command_size, - packed_write_max_unicast_sub_cmds, - mcast_cb_payload); - } - - // Program Binaries and Go Signals - // Get launch msg data while getting size of cmds - 
std::vector> kernel_bins_prefetch_subcmds; - std::vector> kernel_bins_dispatch_subcmds; - std::vector kernel_bins_write_packed_large_data_aligned_sizeB; - std::vector kernel_bins_unicast_cmds; - const uint32_t max_length_per_sub_cmd = dispatch_constants::get(dispatch_core_type).scratch_db_size() / 2; - const uint32_t max_paged_length_per_sub_cmd = - max_length_per_sub_cmd / HostMemDeviceCommand::PROGRAM_PAGE_SIZE * HostMemDeviceCommand::PROGRAM_PAGE_SIZE; - if (program_transfer_info.kernel_bins.size()) { - TT_FATAL( - program.get_kernels_buffer(device).get(), "Expected Kernel Binary Buffer to be allocated for program."); - } - const auto kernels_buffer = program.get_kernels_buffer(device); - for (const auto& [cores, num_mcast_dests, kg_transfer_info] : program_transfer_info.kernel_bins) { - bool write_linear; - uint32_t noc_encoding; - std::visit( - [&](auto&& cores) { - using T = std::decay_t; - if constexpr (std::is_same_v) { - noc_encoding = device->get_noc_multicast_encoding(noc_index, cores); - write_linear = false; - } else { - noc_encoding = device->get_noc_unicast_encoding(noc_index, cores); - write_linear = true; - } - }, - cores); - for (uint32_t kernel_idx = 0; kernel_idx < kg_transfer_info.dst_base_addrs.size(); kernel_idx++) { - if (write_linear) { - kernel_bins_unicast_cmds.emplace_back(2 * hal.get_alignment(HalMemType::HOST)); - cmd_sequence_sizeB += 2 * hal.get_alignment(HalMemType::HOST); - constexpr bool flush_prefetch = false; - kernel_bins_unicast_cmds.back().add_dispatch_write_linear( - num_mcast_dests, // num_mcast_dests - noc_encoding, // noc_xy_addr - kg_transfer_info.dst_base_addrs[kernel_idx], - kg_transfer_info.lengths[kernel_idx]); - RecordDispatchData( - program, - DISPATCH_DATA_BINARY, - kg_transfer_info.lengths[kernel_idx], - kg_transfer_info.riscvs[kernel_idx]); - // Difference between prefetch total relayed pages and dispatch write linear - uint32_t relayed_bytes = - align(kg_transfer_info.lengths[kernel_idx], 
HostMemDeviceCommand::PROGRAM_PAGE_SIZE); - uint16_t length_adjust = uint16_t(relayed_bytes - kg_transfer_info.lengths[kernel_idx]); - - uint32_t base_address, page_offset; - if (kg_transfer_info.page_offsets[kernel_idx] > CQ_PREFETCH_RELAY_PAGED_START_PAGE_MASK) { - const uint32_t num_banks = device->num_banks(kernels_buffer->buffer_type()); - page_offset = kg_transfer_info.page_offsets[kernel_idx] % num_banks; - uint32_t num_full_pages_written_per_bank = kg_transfer_info.page_offsets[kernel_idx] / num_banks; - base_address = - kernels_buffer->address() + num_full_pages_written_per_bank * kernels_buffer->page_size(); - } else { - base_address = kernels_buffer->address(); - page_offset = kg_transfer_info.page_offsets[kernel_idx]; - } - - kernel_bins_unicast_cmds.back().add_prefetch_relay_paged( - true, // is_dram - page_offset, - base_address, - kernels_buffer->page_size(), - relayed_bytes / kernels_buffer->page_size(), - length_adjust); - } else { - uint32_t base_address = kernels_buffer->address(); - uint32_t page_offset = kg_transfer_info.page_offsets[kernel_idx]; - - // TODO: pack all these writes into 1 linear write - uint32_t kernel_config_buffer_offset = kg_transfer_info.dst_base_addrs[kernel_idx]; - uint32_t aligned_length = - align(kg_transfer_info.lengths[kernel_idx], hal.get_alignment(HalMemType::DRAM)); - uint32_t padding = aligned_length - kg_transfer_info.lengths[kernel_idx]; - while (aligned_length != 0) { - if (kernel_bins_dispatch_subcmds.empty() || - kernel_bins_dispatch_subcmds.back().size() == CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_MAX_SUB_CMDS) { - kernel_bins_dispatch_subcmds.push_back({}); - kernel_bins_prefetch_subcmds.push_back({}); - kernel_bins_write_packed_large_data_aligned_sizeB.push_back(0); - } - uint32_t write_length, read_length; - if (aligned_length <= max_length_per_sub_cmd) { - read_length = aligned_length; - write_length = read_length - padding; - } else { - read_length = max_paged_length_per_sub_cmd; - write_length = read_length; 
- } - if (!kernel_bins_dispatch_subcmds.back().empty()) { - auto& back = kernel_bins_dispatch_subcmds.back().back(); - if (back.noc_xy_addr != noc_encoding) { - back.flags = CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_UNLINK; - } - } - kernel_bins_dispatch_subcmds.back().emplace_back(CQDispatchWritePackedLargeSubCmd{ - .noc_xy_addr = noc_encoding, - .addr = kernel_config_buffer_offset, - .length = (uint16_t)write_length, - .num_mcast_dests = (uint8_t)num_mcast_dests, - .flags = CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_NONE}); - RecordDispatchData( - program, DISPATCH_DATA_BINARY, write_length, kg_transfer_info.riscvs[kernel_idx]); - kernel_config_buffer_offset += write_length; - - kernel_bins_prefetch_subcmds.back().emplace_back(CQPrefetchRelayPagedPackedSubCmd{ - .start_page = (uint16_t)page_offset, - .log_page_size = (uint16_t)HostMemDeviceCommand::LOG2_PROGRAM_PAGE_SIZE, - .base_addr = base_address, - .length = read_length}); - page_offset += read_length / HostMemDeviceCommand::PROGRAM_PAGE_SIZE; - aligned_length -= read_length; - kernel_bins_write_packed_large_data_aligned_sizeB.back() += read_length; - } - } - } - } - // Unlink the last subcmd of every dispatch, to ensure we don't hold the - // path reservation for an incredible long time. This also prevents a hang - // if the next mcast is to a different destination. 
- for (auto& subcmd_list : kernel_bins_dispatch_subcmds) { - if (!subcmd_list.empty()) { - subcmd_list.back().flags |= CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_UNLINK; - } - } - uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST); - for (uint32_t i = 0; i < kernel_bins_dispatch_subcmds.size(); ++i) { - cmd_sequence_sizeB += align( - ((sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd))) + - kernel_bins_dispatch_subcmds[i].size() * sizeof(CQDispatchWritePackedLargeSubCmd), - pcie_alignment); - cmd_sequence_sizeB += align( - kernel_bins_prefetch_subcmds[i].size() * sizeof(CQPrefetchRelayPagedPackedSubCmd) + sizeof(CQPrefetchCmd), - pcie_alignment); - } - std::vector> multicast_go_signal_data; - std::vector> unicast_go_signal_data; - std::vector multicast_go_signal_sub_cmds; - std::vector unicast_go_signal_sub_cmds; - std::vector> multicast_go_signals_payload; - std::vector> unicast_go_signals_payload; - constexpr uint32_t go_signal_sizeB = sizeof(launch_msg_t); - uint32_t aligned_go_signal_sizeB = align(go_signal_sizeB, hal.get_alignment(HalMemType::L1)); - uint32_t go_signal_size_words = aligned_go_signal_sizeB / sizeof(uint32_t); - - // TODO: eventually the code below could be structured to loop over programmable_indices - // and check for mcast/unicast - uint32_t programmable_core_index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); - for (auto& kernel_group : program.get_kernel_groups(programmable_core_index)) { - kernel_group->launch_msg.kernel_config.mode = DISPATCH_MODE_DEV; - for (uint32_t i = 0; i < NUM_PROGRAMMABLE_CORE_TYPES; i++) { - kernel_group->launch_msg.kernel_config.kernel_config_base[i] = 0; - } - kernel_group->launch_msg.kernel_config.host_assigned_id = program.get_runtime_id(); - const void* launch_message_data = (const void*)(&(kernel_group->launch_msg)); - for (const CoreRange& core_range : kernel_group->core_ranges.ranges()) { - CoreCoord virtual_start = - 
device->virtual_core_from_logical_core(core_range.start_coord, kernel_group->get_core_type()); - CoreCoord virtual_end = - device->virtual_core_from_logical_core(core_range.end_coord, kernel_group->get_core_type()); - - multicast_go_signal_sub_cmds.emplace_back(CQDispatchWritePackedMulticastSubCmd{ - .noc_xy_addr = device->get_noc_multicast_encoding(noc_index, CoreRange(virtual_start, virtual_end)), - .num_mcast_dests = (uint32_t)core_range.size()}); - multicast_go_signal_data.emplace_back(launch_message_data, go_signal_sizeB); - } - } - if (multicast_go_signal_sub_cmds.size() > 0) { - cmd_sequence_sizeB += insert_write_packed_payloads( - multicast_go_signal_sub_cmds.size(), - go_signal_sizeB, - max_prefetch_command_size, - packed_write_max_unicast_sub_cmds, - multicast_go_signals_payload); - } - - programmable_core_index = hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH); - // TODO: ugly, can be fixed by looping over indices w/ some work - if (programmable_core_index != -1) { - for (auto& kernel_group : program.get_kernel_groups(programmable_core_index)) { - kernel_group->launch_msg.kernel_config.mode = DISPATCH_MODE_DEV; - // Set the kernel_config_base addrs to 0 when generating the dispatch commands for the program - // Will be resolved at runtime - for (uint32_t i = 0; i < NUM_PROGRAMMABLE_CORE_TYPES; i++) { - kernel_group->launch_msg.kernel_config.kernel_config_base[i] = 0; - } - kernel_group->launch_msg.kernel_config.host_assigned_id = program.get_runtime_id(); - const void* launch_message_data = (const launch_msg_t*)(&(kernel_group->launch_msg)); - for (const CoreRange& core_range : kernel_group->core_ranges.ranges()) { - for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) { - for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { - CoreCoord virtual_coord = - device->virtual_core_from_logical_core(CoreCoord({x, y}), kernel_group->get_core_type()); - 
unicast_go_signal_sub_cmds.emplace_back(CQDispatchWritePackedUnicastSubCmd{ - .noc_xy_addr = device->get_noc_unicast_encoding(noc_index, virtual_coord)}); - unicast_go_signal_data.emplace_back(launch_message_data, go_signal_sizeB); - } - } - } - } - } - - if (unicast_go_signal_sub_cmds.size() > 0) { - cmd_sequence_sizeB += insert_write_packed_payloads( - unicast_go_signal_sub_cmds.size(), - go_signal_sizeB, - max_prefetch_command_size, - packed_write_max_unicast_sub_cmds, - unicast_go_signals_payload); - } - - // if dispatch_s is enabled have dispatch_d send a semaphore update to dispatch_s (this will include a write barrier - // on dispatch_d if program is active) if not, check if the program is active on workers. If active, have - // dispatch_d issue a write barrier - cmd_sequence_sizeB += (device->dispatch_s_enabled() || program_transfer_info.num_active_cores > 0) * - hal.get_alignment(HalMemType::HOST); - - // either dispatch_s or dispatch_d will send the go signal (go_signal_mcast command) - const auto& noc_data_start_idx = device->noc_data_start_index( - sub_device_id, multicast_go_signal_sub_cmds.size() > 0, unicast_go_signal_sub_cmds.size() > 0); - const auto& num_noc_mcast_txns = - multicast_go_signal_sub_cmds.size() > 0 ? device->num_noc_mcast_txns(sub_device_id) : 0; - const auto& num_noc_unicast_txns = - unicast_go_signal_sub_cmds.size() > 0 ? 
device->num_noc_unicast_txns(sub_device_id) : 0; - cmd_sequence_sizeB += align(sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd), pcie_alignment); - - program_command_sequence.device_command_sequence = HostMemDeviceCommand(cmd_sequence_sizeB); - - auto& device_command_sequence = program_command_sequence.device_command_sequence; - - uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); - - // Semaphores - // Multicast Semaphore Cmd - index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); - for (uint32_t i = 0; i < num_multicast_semaphores; ++i) { - uint32_t curr_sub_cmd_idx = 0; - for (const auto& [num_sub_cmds_in_cmd, multicast_sem_payload_sizeB] : multicast_sem_payload[i]) { - device_command_sequence.add_dispatch_write_packed( - num_sub_cmds_in_cmd, - multicast_sem_dst_size[i].first + program.get_program_config(index).sem_offset, - multicast_sem_dst_size[i].second, - multicast_sem_payload_sizeB, - multicast_sem_sub_cmds[i], - multicast_sem_data[i], - packed_write_max_unicast_sub_cmds, - curr_sub_cmd_idx, - false, - DISPATCH_WRITE_OFFSET_TENSIX_L1_CONFIG_BASE); - curr_sub_cmd_idx += num_sub_cmds_in_cmd; - for (auto& data_and_size : multicast_sem_data[i]) { - RecordDispatchData(program, DISPATCH_DATA_SEMAPHORE, data_and_size.second); - } - } - } - - // Unicast Semaphore Cmd - index = hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH); - for (uint32_t i = 0; i < num_unicast_semaphores; ++i) { - uint32_t curr_sub_cmd_idx = 0; - for (const auto& [num_sub_cmds_in_cmd, unicast_sem_payload_sizeB] : unicast_sem_payload[i]) { - device_command_sequence.add_dispatch_write_packed( - num_sub_cmds_in_cmd, - unicast_sem_dst_size[i].first + program.get_program_config(index).sem_offset, - unicast_sem_dst_size[i].second, - unicast_sem_payload_sizeB, - unicast_sem_sub_cmds[i], - unicast_sem_data[i], - packed_write_max_unicast_sub_cmds, - curr_sub_cmd_idx, - false, - DISPATCH_WRITE_OFFSET_ETH_L1_CONFIG_BASE); - curr_sub_cmd_idx += 
num_sub_cmds_in_cmd; - for (auto& data_and_size : unicast_sem_data[i]) { - RecordDispatchData(program, DISPATCH_DATA_SEMAPHORE, data_and_size.second); - } - } - } - - // CB Configs commands - index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); - if (num_multicast_cb_sub_cmds > 0) { - uint32_t curr_sub_cmd_idx = 0; - program_command_sequence.cb_configs_payloads.reserve(num_multicast_cb_sub_cmds); - const uint32_t cb_config_size_words = aligned_cb_config_size_bytes / sizeof(uint32_t); - for (const auto& [num_sub_cmds_in_cmd, mcast_cb_payload_sizeB] : mcast_cb_payload) { - uint32_t write_offset_bytes = device_command_sequence.write_offset_bytes(); - device_command_sequence.add_dispatch_write_packed( - num_sub_cmds_in_cmd, - program.get_program_config(index).cb_offset, - cb_config_size_bytes, - mcast_cb_payload_sizeB, - multicast_cb_config_sub_cmds, - multicast_cb_config_data, - packed_write_max_unicast_sub_cmds, - curr_sub_cmd_idx, - false, - DISPATCH_WRITE_OFFSET_TENSIX_L1_CONFIG_BASE); - for (auto& data_and_size : multicast_cb_config_data) { - RecordDispatchData(program, DISPATCH_DATA_CB_CONFIG, data_and_size.second); - } - curr_sub_cmd_idx += num_sub_cmds_in_cmd; - RecordDispatchData(program, DISPATCH_DATA_CB_CONFIG, mcast_cb_payload_sizeB); - uint32_t curr_sub_cmd_data_offset_words = - (write_offset_bytes + (sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd)) + - align(num_sub_cmds_in_cmd * sizeof(CQDispatchWritePackedMulticastSubCmd), l1_alignment)) / - sizeof(uint32_t); - for (uint32_t i = 0; i < num_sub_cmds_in_cmd; ++i) { - program_command_sequence.cb_configs_payloads.push_back( - (uint32_t*)device_command_sequence.data() + curr_sub_cmd_data_offset_words); - curr_sub_cmd_data_offset_words += cb_config_size_words; - } - } - } - // All Previous Cmds Up to This Point Go Into the Kernel Config Buffer - program_command_sequence.program_config_buffer_data_size_bytes = device_command_sequence.write_offset_bytes(); - - // Program Binaries - for 
(const auto& kernel_bins_unicast_cmd : kernel_bins_unicast_cmds) { - device_command_sequence.add_data( - kernel_bins_unicast_cmd.data(), kernel_bins_unicast_cmd.size_bytes(), kernel_bins_unicast_cmd.size_bytes()); - } - uint32_t dram_alignment = hal.get_alignment(HalMemType::DRAM); - for (uint32_t i = 0; i < kernel_bins_dispatch_subcmds.size(); ++i) { - device_command_sequence.add_dispatch_write_packed_large( - dram_alignment, - kernel_bins_dispatch_subcmds[i].size(), - kernel_bins_dispatch_subcmds[i], - 0, - DISPATCH_WRITE_OFFSET_TENSIX_L1_CONFIG_BASE); - device_command_sequence.add_prefetch_relay_paged_packed( - kernel_bins_write_packed_large_data_aligned_sizeB[i], - kernel_bins_prefetch_subcmds[i], - kernel_bins_prefetch_subcmds[i].size()); - } - - // Go Signals - program_command_sequence.go_signals.reserve( - multicast_go_signal_sub_cmds.size() + unicast_go_signal_sub_cmds.size()); - - // Launch Message address is resolved when the program is enqueued - uint32_t multicast_launch_msg_addr = 0; - - if (multicast_go_signal_sub_cmds.size() > 0) { - uint32_t curr_sub_cmd_idx = 0; - for (const auto& [num_sub_cmds_in_cmd, multicast_go_signal_payload_sizeB] : multicast_go_signals_payload) { - uint32_t write_offset_bytes = device_command_sequence.write_offset_bytes(); - device_command_sequence.add_dispatch_write_packed( - num_sub_cmds_in_cmd, - multicast_launch_msg_addr, - go_signal_sizeB, - multicast_go_signal_payload_sizeB, - multicast_go_signal_sub_cmds, - multicast_go_signal_data, - packed_write_max_unicast_sub_cmds, - curr_sub_cmd_idx); - curr_sub_cmd_idx += num_sub_cmds_in_cmd; - program_command_sequence.launch_msg_write_packed_cmd_ptrs.push_back( - &((CQDispatchCmd*)((uint32_t*)device_command_sequence.data() + - (write_offset_bytes + sizeof(CQPrefetchCmd)) / sizeof(uint32_t))) - ->write_packed); - uint32_t curr_sub_cmd_data_offset_words = - (write_offset_bytes + (sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd)) + - align(num_sub_cmds_in_cmd * 
sizeof(CQDispatchWritePackedMulticastSubCmd), l1_alignment)) / - sizeof(uint32_t); - for (uint32_t i = 0; i < num_sub_cmds_in_cmd; ++i) { - program_command_sequence.go_signals.push_back( - (launch_msg_t*)((uint32_t*)device_command_sequence.data() + curr_sub_cmd_data_offset_words)); - curr_sub_cmd_data_offset_words += go_signal_size_words; - } - } - } - - if (unicast_go_signal_sub_cmds.size() > 0) { - // Launch Message address is resolved when the program is enqueued - uint32_t unicast_launch_msg_addr = 0; - uint32_t curr_sub_cmd_idx = 0; - for (const auto& [num_sub_cmds_in_cmd, unicast_go_signal_payload_sizeB] : unicast_go_signals_payload) { - uint32_t write_offset_bytes = device_command_sequence.write_offset_bytes(); - device_command_sequence.add_dispatch_write_packed( - num_sub_cmds_in_cmd, - unicast_launch_msg_addr, - go_signal_sizeB, - unicast_go_signal_payload_sizeB, - unicast_go_signal_sub_cmds, - unicast_go_signal_data, - packed_write_max_unicast_sub_cmds, - curr_sub_cmd_idx); - curr_sub_cmd_idx += num_sub_cmds_in_cmd; - program_command_sequence.unicast_launch_msg_write_packed_cmd_ptrs.push_back( - &((CQDispatchCmd*)((uint32_t*)device_command_sequence.data() + - (write_offset_bytes + sizeof(CQPrefetchCmd)) / sizeof(uint32_t))) - ->write_packed); - uint32_t curr_sub_cmd_data_offset_words = - (write_offset_bytes + (sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd)) + - align(num_sub_cmds_in_cmd * sizeof(CQDispatchWritePackedUnicastSubCmd), l1_alignment)) / - sizeof(uint32_t); - for (uint32_t i = 0; i < num_sub_cmds_in_cmd; ++i) { - program_command_sequence.go_signals.push_back( - (launch_msg_t*)((uint32_t*)device_command_sequence.data() + curr_sub_cmd_data_offset_words)); - curr_sub_cmd_data_offset_words += go_signal_size_words; - } - } - } - - DispatcherSelect dispatcher_for_go_signal = DispatcherSelect::DISPATCH_MASTER; - auto sub_device_index = sub_device_id.to_index(); - uint32_t dispatch_message_addr = - dispatch_constants::get(dispatch_core_type) - 
.get_device_command_queue_addr(CommandQueueDeviceAddrType::DISPATCH_MESSAGE) + - dispatch_constants::get(dispatch_core_type).get_dispatch_message_offset(sub_device_index); - if (device->dispatch_s_enabled()) { - // dispatch_d signals dispatch_s to send the go signal, use a barrier if there are cores active - uint16_t index_bitmask = 0; - index_bitmask |= 1 << sub_device_index; - device_command_sequence.add_notify_dispatch_s_go_signal_cmd( - program_transfer_info.num_active_cores > 0, index_bitmask); - dispatcher_for_go_signal = DispatcherSelect::DISPATCH_SLAVE; - } else { - // Wait Noc Write Barrier, wait for binaries/configs and launch_msg to be written to worker cores - if (program_transfer_info.num_active_cores > 0) { - device_command_sequence.add_dispatch_wait(true, dispatch_message_addr, 0, 0, false, false); - } - } - go_msg_t run_program_go_signal; - run_program_go_signal.signal = RUN_MSG_GO; - // Dispatch X/Y resolved when the program is enqueued - run_program_go_signal.master_x = 0; - run_program_go_signal.master_y = 0; - run_program_go_signal.dispatch_message_offset = - (uint8_t)dispatch_constants::get(dispatch_core_type).get_dispatch_message_offset(sub_device_index); - uint32_t write_offset_bytes = device_command_sequence.write_offset_bytes(); - // Num Workers Resolved when the program is enqueued - device_command_sequence.add_dispatch_go_signal_mcast( - 0, - *reinterpret_cast(&run_program_go_signal), - dispatch_message_addr, - num_noc_mcast_txns, - num_noc_unicast_txns, - noc_data_start_idx, - dispatcher_for_go_signal); - program_command_sequence.mcast_go_signal_cmd_ptr = - &((CQDispatchCmd*)((uint32_t*)device_command_sequence.data() + - (write_offset_bytes + sizeof(CQPrefetchCmd)) / sizeof(uint32_t))) - ->mcast; -} - -void reserve_space_in_kernel_config_buffer( - WorkerConfigBufferMgr& config_buffer_mgr, - const std::vector& program_config_sizes, - bool kernel_binary_always_stored_in_ringbuffer, - ProgramBinaryStatus program_binary_status, - uint32_t 
num_program_workers, - uint32_t expected_num_workers_completed, - ProgramDispatchMetadata& dispatch_md) { - // Reserve space in kernel config ring buffer for the current program - std::pair&> reservation = - config_buffer_mgr.reserve(program_config_sizes); - // Determine where a sync (dispatch wait on workers) must be inserted in the program sequence and the number - // of workers to wait on - dispatch_md.sync_count = 0; - dispatch_md.stall_first = reservation.first.need_sync; - dispatch_md.stall_before_program = false; - - if (!kernel_binary_always_stored_in_ringbuffer) { - // Program runs on cores without ring buffer. Sync on all - // workers before dispatching program data - dispatch_md.sync_count = expected_num_workers_completed; - dispatch_md.stall_before_program = !dispatch_md.stall_first; - } else if (reservation.first.need_sync) { - // TODO: attempt to send RTA only without stalling. - dispatch_md.sync_count = reservation.first.sync_count; - // Check if the launch message is the only thing preventing us from - // sending the program. If so, we can at least send the RTAs. Ideally we - // would also send the kernel binaries in this case, but the rest of the - // code isn't set up for that. - auto config_sizes = program_config_sizes; - config_sizes[config_sizes.size() - 1] = 0; - const std::pair&> memory_reservation = - config_buffer_mgr.reserve(config_sizes); - if (!memory_reservation.first.need_sync) { - dispatch_md.stall_first = false; - dispatch_md.stall_before_program = true; - } - reservation = config_buffer_mgr.reserve(program_config_sizes); - } - - if (program_binary_status == ProgramBinaryStatus::InFlight) { - // Program binary not commited to DRAM. Sync on all workers before dispatching kernel - // binaries for this program. This requires freeing the entire kernel config buffer. 
- config_buffer_mgr.free(expected_num_workers_completed); - } else { - if (dispatch_md.stall_first || dispatch_md.stall_before_program) { - config_buffer_mgr.free(dispatch_md.sync_count); - } - } - config_buffer_mgr.alloc(expected_num_workers_completed + num_program_workers); - - if (program_binary_status != ProgramBinaryStatus::Committed) { - // Insert a stall before writing any program configs when binaries are in flight - dispatch_md.stall_first = true; - dispatch_md.stall_before_program = false; - // Wait on all previous workers before writing kernel binaries to workers - dispatch_md.sync_count = expected_num_workers_completed; - } - - dispatch_md.kernel_config_addrs = reservation.second; -} - -void update_program_dispatch_commands( - Program& program, - ProgramCommandSequence& cached_program_command_sequence, - const tt::stl::Span kernel_config_addrs, - uint32_t multicast_cores_launch_message_wptr, - uint32_t unicast_cores_launch_message_wptr, - uint32_t expected_num_workers_completed, - CoreCoord dispatch_core, - CoreType dispatch_core_type, - SubDeviceId sub_device_id, - const ProgramDispatchMetadata& dispatch_md, - ProgramBinaryStatus program_binary_status, - int num_unicast_txns) { - uint32_t i = 0; - ZoneScopedN("program_loaded_on_device"); - - static constexpr uint32_t wait_count_offset = (sizeof(CQPrefetchCmd) + offsetof(CQDispatchCmd, wait.count)); - static constexpr uint32_t tensix_l1_write_offset_offset = - (sizeof(CQPrefetchCmd) + offsetof(CQDispatchCmd, set_write_offset.offset1)); - static constexpr uint32_t eth_l1_write_offset_offset = - (sizeof(CQPrefetchCmd) + offsetof(CQDispatchCmd, set_write_offset.offset2)); - // Update Stall Command Sequence - if (program_binary_status != ProgramBinaryStatus::Committed) { - // Program binary is in flight. Issue a Prefetch Stall - cached_program_command_sequence.current_stall_seq_idx = UncachedStallSequenceIdx; - } else { - // Program Binary is in DRAM. 
Prefetcher does not need to stall before reading - // binary - cached_program_command_sequence.current_stall_seq_idx = CachedStallSequenceIdx; - } - - auto& curr_stall_seq_idx = cached_program_command_sequence.current_stall_seq_idx; - cached_program_command_sequence.stall_command_sequences[curr_stall_seq_idx].update_cmd_sequence( - wait_count_offset, &(dispatch_md.sync_count), sizeof(uint32_t)); - - // Update preamble based on kernel config ring buffer slot - cached_program_command_sequence.preamble_command_sequence.update_cmd_sequence( - tensix_l1_write_offset_offset, - &kernel_config_addrs[hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX)], - sizeof(uint32_t)); - if (hal.get_programmable_core_type_count() >= 2) { - cached_program_command_sequence.preamble_command_sequence.update_cmd_sequence( - eth_l1_write_offset_offset, - &kernel_config_addrs[hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH)], - sizeof(uint32_t)); - } - - // Update CB Configs - uint32_t index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); - uint32_t remote_offset_index = program.get_program_config(index).local_cb_size / sizeof(uint32_t); - for (const auto& cbs_on_core_range : cached_program_command_sequence.circular_buffers_on_core_ranges) { - uint32_t* cb_config_payload = cached_program_command_sequence.cb_configs_payloads[i]; - for (const std::shared_ptr& cb : cbs_on_core_range) { - const uint32_t cb_address = cb->address(); - const uint32_t cb_size = cb->size(); - for (const auto& buffer_index : cb->local_buffer_indices()) { - // 1 cmd for all 32 buffer indices, populate with real data for specified indices - - // cb config payload - uint32_t base_index = UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * buffer_index; - cb_config_payload[base_index] = cb_address; - cb_config_payload[base_index + 1] = cb_size; - cb_config_payload[base_index + 2] = cb->num_pages(buffer_index); - cb_config_payload[base_index + 3] = 
cb->page_size(buffer_index); - } - for (const auto& buffer_index : cb->remote_buffer_indices()) { - const uint32_t base_index = remote_offset_index + (NUM_CIRCULAR_BUFFERS - 1 - buffer_index) * - UINT32_WORDS_PER_REMOTE_CIRCULAR_BUFFER_CONFIG; - cb_config_payload[base_index] = cb->config_address(); - cb_config_payload[base_index + 1] = cb->page_size(buffer_index); - } - } - i++; - } - // Update launch messages - for (auto& go_signal : cached_program_command_sequence.go_signals) { - for (uint32_t i = 0; i < kernel_config_addrs.size(); i++) { - go_signal->kernel_config.kernel_config_base[i] = kernel_config_addrs[i].addr; - } - go_signal->kernel_config.host_assigned_id = program.get_runtime_id(); - } - // Update launch message addresses to reflect new launch_msg slot in ring buffer - uint32_t multicast_cores_launch_msg_addr = - hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::LAUNCH) + - multicast_cores_launch_message_wptr * sizeof(launch_msg_t); - for (auto launch_msg_cmd_ptr : cached_program_command_sequence.launch_msg_write_packed_cmd_ptrs) { - launch_msg_cmd_ptr->addr = multicast_cores_launch_msg_addr; - } - if (cached_program_command_sequence.unicast_launch_msg_write_packed_cmd_ptrs.size()) { - uint32_t unicast_cores_launch_message_addr = - hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalL1MemAddrType::LAUNCH) + - unicast_cores_launch_message_wptr * sizeof(launch_msg_t); - for (auto launch_msg_cmd_ptr : cached_program_command_sequence.unicast_launch_msg_write_packed_cmd_ptrs) { - launch_msg_cmd_ptr->addr = unicast_cores_launch_message_addr; - } - } - // Update go signal to reflect potentially modified dispatch core and new wait count - go_msg_t run_program_go_signal; - run_program_go_signal.signal = RUN_MSG_GO; - run_program_go_signal.master_x = (uint8_t)dispatch_core.x; - run_program_go_signal.master_y = (uint8_t)dispatch_core.y; - run_program_go_signal.dispatch_message_offset = - 
(uint8_t)dispatch_constants::get(dispatch_core_type).get_dispatch_message_offset(sub_device_id.to_index()); - cached_program_command_sequence.mcast_go_signal_cmd_ptr->go_signal = - *reinterpret_cast(&run_program_go_signal); - cached_program_command_sequence.mcast_go_signal_cmd_ptr->wait_count = expected_num_workers_completed; - // Update the number of unicast txns based on user provided parameter - // This is required when a MeshWorkload users ethernet cores on a set of devices - // where the number of active eth cores is heterogenous across devices. - // Update the number of unicast txns to eth cores to match the minimum number of cores - // across devices (specified by user) - if (num_unicast_txns >= 0 && cached_program_command_sequence.mcast_go_signal_cmd_ptr->num_unicast_txns) { - cached_program_command_sequence.mcast_go_signal_cmd_ptr->num_unicast_txns = num_unicast_txns; - } -} - -KernelHandle get_device_local_kernel_handle(KernelHandle kernel_handle) { - // Device local Kernel Handle/Kernel Ids are 16 bit. The top 16 bits of - // the Kernel Handle may encode device coordinates when MeshWorkloads are - // being dispatched. - return kernel_handle & 0xffff; -} - -} // namespace program_utils - -} // namespace tt::tt_metal diff --git a/tt_metal/impl/program/program_dispatch_utils.hpp b/tt_metal/impl/program/program_dispatch_utils.hpp deleted file mode 100644 index 68cd099f3d2..00000000000 --- a/tt_metal/impl/program/program_dispatch_utils.hpp +++ /dev/null @@ -1,112 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "tt_metal/impl/buffers/circular_buffer.hpp" -#include "tt_metal/impl/device/device.hpp" -#include "tt_metal/impl/kernels/kernel.hpp" -#include "tt_metal/impl/program/program.hpp" -#include "tt_metal/impl/dispatch/worker_config_buffer.hpp" - -namespace tt { - -namespace tt_metal { - -namespace program_utils { -#define CQ_PREFETCH_CMD_BARE_MIN_SIZE tt::tt_metal::hal.get_alignment(tt::tt_metal::HalMemType::HOST) - -struct ProgramDispatchMetadata { - std::vector kernel_config_addrs; - uint32_t sync_count; - uint32_t stall_first; - uint32_t stall_before_program; -}; - -uint32_t configure_rta_offsets_for_kernel_groups( - uint32_t programmable_core_type_index, - std::unordered_map>& kernels, - std::vector>& kernel_groups, - uint32_t base_offset); - -uint32_t configure_crta_offsets_for_kernel_groups( - uint32_t programmable_core_type_index, - std::unordered_map>& kernels, - std::vector>& kernel_groups, - uint32_t crta_base_offset, - std::array& crta_offsets, - std::array& crta_sizes); - -uint32_t finalize_rt_args( - std::unordered_map>& kernels, - std::vector>& kernel_groups, - uint32_t base_offset, - uint32_t programmable_core_type_index, - uint32_t& rta_offset, - std::array& crta_offsets, - std::array& crta_sizes); - -uint32_t finalize_sems( - uint32_t programmable_core_type_index, - uint32_t sem_base_offset, - const std::vector& semaphores, - uint32_t& semaphore_offset, - uint32_t& semaphore_size); - -uint32_t finalize_cbs( - uint32_t programmable_core_type_index, - std::vector>& kernel_groups, - uint32_t base_offset, - uint32_t& cb_offset, - uint32_t& cb_size, - uint32_t& local_cb_size); - -uint32_t finalize_kernel_bins( - Device* device, - uint32_t programmable_core_type_index, - const std::unordered_map>& kernels, - std::vector>& kernel_groups, - uint32_t base_offset, - uint32_t& kernel_text_offset, - uint32_t& kernel_text_size); - -void 
insert_empty_program_dispatch_preamble_cmd(ProgramCommandSequence& program_command_sequence); - -void insert_stall_cmds(ProgramCommandSequence& program_command_sequence, SubDeviceId sub_device_id, Device* device); - -void assemble_runtime_args_commands(ProgramCommandSequence& program_command_sequence, Program& program, Device* device); - -void assemble_device_commands( - ProgramCommandSequence& program_command_sequence, Program& program, Device* device, SubDeviceId sub_device_id); - -void reserve_space_in_kernel_config_buffer( - WorkerConfigBufferMgr& config_buffer_mgr, - const std::vector& program_config_sizes, - bool kernel_binary_always_stored_in_ringbuffer, - ProgramBinaryStatus program_binary_status, - uint32_t num_program_workers, - uint32_t expected_num_workers_completed, - ProgramDispatchMetadata& dispatch_md); - -void update_program_dispatch_commands( - Program& program, - ProgramCommandSequence& cached_program_command_sequence, - const tt::stl::Span kernel_config_addrs, - uint32_t multicast_cores_launch_message_wptr, - uint32_t unicast_cores_launch_message_wptr, - uint32_t expected_num_workers_completed, - CoreCoord dispatch_core, - CoreType dispatch_core_type, - SubDeviceId sub_device_id, - const ProgramDispatchMetadata& dispatch_md, - ProgramBinaryStatus program_binary_status, - int num_unicast_txns = -1); - -KernelHandle get_device_local_kernel_handle(KernelHandle kernel_handle); - -} // namespace program_utils - -} // namespace tt_metal - -} // namespace tt diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index d6ed7beab3d..38d17d6631f 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -801,15 +801,15 @@ void WriteRuntimeArgsToDevice(Device* device, Program& program) { for (uint32_t index = 0; index < hal.get_programmable_core_type_count(); index++) { CoreType core_type = hal.get_core_type(index); uint32_t processor_classes = hal.get_processor_classes_count(index); - for (const auto& kg : program.get_kernel_groups(index)) { - 
uint32_t kernel_config_base = kg->launch_msg.kernel_config.kernel_config_base[index]; - for (const CoreRange& core_range : kg->core_ranges.ranges()) { + for (auto& kg : program.get_kernel_groups(index)) { + uint32_t kernel_config_base = kg.launch_msg.kernel_config.kernel_config_base[index]; + for (const CoreRange& core_range : kg.core_ranges.ranges()) { for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) { for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { CoreCoord logical_core(x, y); auto physical_core = device->virtual_core_from_logical_core(logical_core, core_type); for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { - auto& optional_id = kg->kernel_ids[dispatch_class]; + auto& optional_id = kg.kernel_ids[dispatch_class]; if (optional_id) { const auto& kernel = detail::GetKernel(program, optional_id.value()); const auto& rt_args = kernel->runtime_args(logical_core); @@ -817,7 +817,7 @@ void WriteRuntimeArgsToDevice(Device* device, Program& program) { if (rt_args.size() > 0) { auto rt_args_addr = kernel_config_base + - kg->launch_msg.kernel_config.rta_offset[dispatch_class].rta_offset; + kg.launch_msg.kernel_config.rta_offset[dispatch_class].rta_offset; log_trace( tt::LogMetal, "{} - Writing {} unique rtargs to core {} (physical: {}) addr 0x{:x} => args: " @@ -835,7 +835,7 @@ void WriteRuntimeArgsToDevice(Device* device, Program& program) { if (common_rt_args.size() > 0) { auto common_rt_args_addr = kernel_config_base + - kg->launch_msg.kernel_config.rta_offset[dispatch_class].crta_offset; + kg.launch_msg.kernel_config.rta_offset[dispatch_class].crta_offset; log_trace( tt::LogMetal, "{} - Writing {} common rtargs to core {} (physical: {}) addr 0x{:x} => args: "