From 956e2bb7a77c14f8627a1b9aeec7a58373c6bf60 Mon Sep 17 00:00:00 2001
From: Austin Ho <aho@tenstorrent.com>
Date: Wed, 26 Jun 2024 21:29:06 +0000
Subject: [PATCH] #9044: Move dispatch core x y to be part of launch msg, which
 is set in assemble_device_commands. This enables running programs on any cq

---
 .../test_kernels/dataflow/dram_copy.cpp       |   3 +-
 .../test_kernels/misc/watcher_asserts.cpp     |   3 +-
 .../CMakeLists.txt                            |   1 +
 .../command_queue/test_EnqueueProgram.cpp     | 267 ++++++++++++++++++
 tt_metal/detail/tt_metal.hpp                  |  15 +-
 tt_metal/hw/firmware/src/brisc.cc             |   2 +-
 tt_metal/hw/firmware/src/erisck.cc            |   2 +-
 tt_metal/hw/firmware/src/idle_erisc.cc        |   2 +-
 tt_metal/hw/inc/dev_msgs.h                    |   2 +
 tt_metal/impl/device/device.cpp               |  31 +-
 tt_metal/impl/device/device.hpp               |   1 +
 tt_metal/impl/dispatch/command_queue.cpp      | 226 ++++++++++-----
 tt_metal/impl/dispatch/command_queue.hpp      |  16 +-
 tt_metal/jit_build/genfiles.cpp               |  11 +-
 tt_metal/jit_build/genfiles.hpp               |   3 +-
 15 files changed, 463 insertions(+), 122 deletions(-)
 create mode 100644 tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueProgram.cpp

diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp
index 1deac01e9236..8d63d1f80a3c 100644
--- a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp
@@ -29,7 +29,8 @@ void kernel_main() {
         //Need to signal completion to dispatcher before hanging so that
         //Dispatcher Kernel is able to finish.
         //Device Close () requires fast dispatch kernels to finish.
-        uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(DISPATCH_CORE_X), NOC_Y(DISPATCH_CORE_Y), DISPATCH_MESSAGE_ADDR);
+        tt_l1_ptr mailboxes_t* const mailboxes = (tt_l1_ptr mailboxes_t*)(MEM_MAILBOX_BASE);
+        uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(mailboxes->launch.dispatch_core_x), NOC_Y(mailboxes->launch.dispatch_core_y), DISPATCH_MESSAGE_ADDR);
         noc_fast_atomic_increment(noc_index, NCRISC_AT_CMD_BUF, dispatch_addr, NOC_UNICAST_WRITE_VC, 1, 31, false);
 #endif
 
diff --git a/tests/tt_metal/tt_metal/test_kernels/misc/watcher_asserts.cpp b/tests/tt_metal/tt_metal/test_kernels/misc/watcher_asserts.cpp
index c995df2b5ab7..745fb376f41f 100644
--- a/tests/tt_metal/tt_metal/test_kernels/misc/watcher_asserts.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/misc/watcher_asserts.cpp
@@ -35,7 +35,8 @@ void MAIN {
         //Need to signal completion to dispatcher before hanging so that
         //Dispatcher Kernel is able to finish.
         //Device Close () requires fast dispatch kernels to finish.
-        uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(DISPATCH_CORE_X), NOC_Y(DISPATCH_CORE_Y), DISPATCH_MESSAGE_ADDR);
+        tt_l1_ptr mailboxes_t* const mailboxes = (tt_l1_ptr mailboxes_t*)(MEM_MAILBOX_BASE);
+        uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(mailboxes->launch.dispatch_core_x), NOC_Y(mailboxes->launch.dispatch_core_y), DISPATCH_MESSAGE_ADDR);
         noc_fast_atomic_increment(noc_index, NCRISC_AT_CMD_BUF, dispatch_addr, NOC_UNICAST_WRITE_VC, 1, 31 /*wrap*/, false /*linked*/);
     }
 #else
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/CMakeLists.txt b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/CMakeLists.txt
index 3b3578ba3c67..154abb70d8d5 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/CMakeLists.txt
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/CMakeLists.txt
@@ -1,5 +1,6 @@
 
 set(UNIT_TESTS_FD_SINGLEC_MULTIQ_SRCS
+    ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_EnqueueProgram.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_EnqueueTrace.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_EnqueueWaitForEvent.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueProgram.cpp
new file mode 100644
index 000000000000..9b5e28d0ee0c
--- /dev/null
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueProgram.cpp
@@ -0,0 +1,267 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <memory>
+#include "command_queue_fixture.hpp"
+#include "command_queue_test_utils.hpp"
+#include "gtest/gtest.h"
+#include "impl/buffers/buffer.hpp"
+#include "tt_metal/common/bfloat16.hpp"
+#include "tt_metal/common/scoped_timer.hpp"
+#include "tt_metal/host_api.hpp"
+#include "tt_metal/detail/tt_metal.hpp"
+
+using namespace tt::tt_metal;
+
+struct CBConfig {
+    uint32_t cb_id;
+    uint32_t num_pages;
+    uint32_t page_size;
+    tt::DataFormat data_format;
+};
+
+struct DummyProgramConfig {
+    CoreRangeSet cr_set;
+    CBConfig cb_config;
+    uint32_t num_cbs;
+    uint32_t num_sems;
+};
+
+struct DummyProgramMultiCBConfig {
+    CoreRangeSet cr_set;
+    std::vector<CBConfig> cb_config_vector;
+    uint32_t num_sems;
+};
+
+
+namespace local_test_functions {
+
+// Create randomly sized pair of unique and common runtime args vectors, with careful not to exceed max between the two.
+// Optionally force the max size for one of the vectors.
+std::pair<std::vector<uint32_t>, std::vector<uint32_t>> create_runtime_args(bool force_max_size = false, uint32_t unique_base = 0, uint32_t common_base = 100){
+
+    constexpr uint32_t MAX_RUNTIME_ARGS = 255;
+
+    // Generate Unique Runtime Args. Common RT args starting address must be L1 Aligned, so account for that here via padding
+    uint32_t num_rt_args_unique = num_rt_args_unique = rand() % (MAX_RUNTIME_ARGS + 1);
+    uint32_t num_rt_args_unique_padded = align(num_rt_args_unique, L1_ALIGNMENT / sizeof(uint32_t));
+    uint32_t num_rt_args_common = num_rt_args_unique_padded < MAX_RUNTIME_ARGS ? rand() % (MAX_RUNTIME_ARGS - num_rt_args_unique_padded + 1) : 0;
+
+    if (force_max_size) {
+        if (rand() % 2) {
+            num_rt_args_unique = MAX_RUNTIME_ARGS;
+            num_rt_args_common = 0;
+        } else {
+            num_rt_args_common = MAX_RUNTIME_ARGS;
+            num_rt_args_unique = 0;
+        }
+    }
+
+    vector<uint32_t> rt_args_common;
+    for (uint32_t i = 0; i < num_rt_args_common; i++) {
+        rt_args_common.push_back(common_base + i);
+    }
+
+    vector<uint32_t> rt_args_unique;
+    for (uint32_t i = 0; i < num_rt_args_unique; i++) {
+        rt_args_unique.push_back(unique_base + i);
+    }
+
+    log_trace(tt::LogTest, "{} - num_rt_args_unique: {} num_rt_args_common: {} force_max_size: {}", __FUNCTION__, num_rt_args_unique, num_rt_args_common, force_max_size);
+    return std::make_pair(rt_args_unique, rt_args_common);
+}
+
+
+}  // namespace local_test_functions
+
+namespace stress_tests {
+
+TEST_F(MultiCommandQueueSingleDeviceFixture, TestRandomizedProgram) {
+    uint32_t NUM_PROGRAMS = 100;
+    uint32_t MAX_LOOP = 100;
+    uint32_t page_size = 1024;
+
+    // Make random
+    auto random_seed = 0; // (unsigned int)time(NULL);
+    uint32_t seed = tt::parse_env("SEED", random_seed);
+    log_info(tt::LogTest, "Using Test Seed: {}", seed);
+    srand(seed);
+
+    CoreCoord worker_grid_size = this->device_->compute_with_storage_grid_size();
+    CoreRange cr({0, 0}, {worker_grid_size.x - 1, worker_grid_size.y - 1});
+    CoreRangeSet cr_set({cr});
+
+    log_info(tt::LogTest, "Starting compile of {} programs now.", NUM_PROGRAMS);
+
+    vector<Program> programs;
+    for (uint32_t i = 0; i < NUM_PROGRAMS; i++) {
+        programs.push_back(Program());
+        Program& program = programs.back();
+
+        std::map<string, string> data_movement_defines = {{"DATA_MOVEMENT", "1"}};
+        std::map<string, string> compute_defines = {{"COMPUTE", "1"}};
+
+        // brisc
+        uint32_t BRISC_OUTER_LOOP, BRISC_MIDDLE_LOOP, BRISC_INNER_LOOP, NUM_CBS, NUM_SEMS;
+        bool USE_MAX_RT_ARGS;
+
+        if (i == 0) {
+            // Ensures that we get at least one compilation with the max amount to
+            // ensure it compiles and runs
+            BRISC_OUTER_LOOP = MAX_LOOP;
+            BRISC_MIDDLE_LOOP = MAX_LOOP;
+            BRISC_INNER_LOOP = MAX_LOOP;
+            NUM_CBS = NUM_CIRCULAR_BUFFERS;
+            NUM_SEMS = NUM_SEMAPHORES;
+            USE_MAX_RT_ARGS = true;
+        } else {
+            BRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1;
+            BRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1;
+            BRISC_INNER_LOOP = rand() % (MAX_LOOP) + 1;
+            NUM_CBS = rand() % (NUM_CIRCULAR_BUFFERS) + 1;
+            NUM_SEMS = rand() % (NUM_SEMAPHORES) + 1;
+            USE_MAX_RT_ARGS = false;
+        }
+
+        log_debug(tt::LogTest, "Compiling program {}/{} w/ BRISC_OUTER_LOOP: {} BRISC_MIDDLE_LOOP: {} BRISC_INNER_LOOP: {} NUM_CBS: {} NUM_SEMS: {} USE_MAX_RT_ARGS: {}",
+                 i+1, NUM_PROGRAMS, BRISC_OUTER_LOOP, BRISC_MIDDLE_LOOP, BRISC_INNER_LOOP, NUM_CBS, NUM_SEMS, USE_MAX_RT_ARGS);
+
+        for (uint32_t j = 0; j < NUM_CBS; j++) {
+            CircularBufferConfig cb_config = CircularBufferConfig(page_size * (j + 1), {{j, tt::DataFormat::Float16_b}}).set_page_size(j, page_size * (j + 1));
+            auto cb = CreateCircularBuffer(program, cr_set, cb_config);
+        }
+
+        for (uint32_t j = 0; j < NUM_SEMS; j++) {
+            CreateSemaphore(program, cr_set, j + 1);
+        }
+
+        auto [brisc_unique_rtargs, brisc_common_rtargs] = local_test_functions::create_runtime_args(USE_MAX_RT_ARGS);
+        uint32_t num_brisc_unique_rtargs = brisc_unique_rtargs.size();
+        uint32_t num_brisc_common_rtargs = brisc_common_rtargs.size();
+        vector<uint32_t> brisc_compile_args = {BRISC_OUTER_LOOP, BRISC_MIDDLE_LOOP, BRISC_INNER_LOOP, NUM_CBS, NUM_SEMS, num_brisc_unique_rtargs, num_brisc_common_rtargs, page_size};
+
+        // ncrisc
+        uint32_t NCRISC_OUTER_LOOP, NCRISC_MIDDLE_LOOP, NCRISC_INNER_LOOP;
+        if (i == 0) {
+            NCRISC_OUTER_LOOP = MAX_LOOP;
+            NCRISC_MIDDLE_LOOP = MAX_LOOP;
+            NCRISC_INNER_LOOP = MAX_LOOP;
+        } else {
+            NCRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1;
+            NCRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1;
+            NCRISC_INNER_LOOP = rand() % (MAX_LOOP) + 1;
+        }
+
+        auto [ncrisc_unique_rtargs, ncrisc_common_rtargs] = local_test_functions::create_runtime_args(USE_MAX_RT_ARGS);
+        uint32_t num_ncrisc_unique_rtargs = ncrisc_unique_rtargs.size();
+        uint32_t num_ncrisc_common_rtargs = ncrisc_common_rtargs.size();
+        vector<uint32_t> ncrisc_compile_args = {NCRISC_OUTER_LOOP, NCRISC_MIDDLE_LOOP, NCRISC_INNER_LOOP, NUM_CBS, NUM_SEMS, num_ncrisc_unique_rtargs, num_ncrisc_common_rtargs, page_size};
+
+        // trisc
+        uint32_t TRISC_OUTER_LOOP, TRISC_MIDDLE_LOOP, TRISC_INNER_LOOP;
+        if (i == 0) {
+            TRISC_OUTER_LOOP = MAX_LOOP;
+            TRISC_MIDDLE_LOOP = MAX_LOOP;
+            TRISC_INNER_LOOP = MAX_LOOP;
+        } else {
+            TRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1;
+            TRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1;
+            TRISC_INNER_LOOP = rand() % (MAX_LOOP) + 1;
+        }
+
+        auto [trisc_unique_rtargs, trisc_common_rtargs] = local_test_functions::create_runtime_args(USE_MAX_RT_ARGS);
+        uint32_t num_trisc_unique_rtargs = trisc_unique_rtargs.size();
+        uint32_t num_trisc_common_rtargs = trisc_common_rtargs.size();
+        vector<uint32_t> trisc_compile_args = {TRISC_OUTER_LOOP, TRISC_MIDDLE_LOOP, TRISC_INNER_LOOP, NUM_CBS, NUM_SEMS, num_trisc_unique_rtargs, num_trisc_common_rtargs, page_size};
+
+        bool at_least_one_kernel = false;
+        if (i == 0 or ((rand() % 2) == 0)) {
+            auto dummy_brisc_kernel = CreateKernel(
+                program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, DataMovementConfig{
+                    .processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default, .compile_args = brisc_compile_args, .defines = data_movement_defines});
+            SetRuntimeArgs(program, dummy_brisc_kernel, cr_set, brisc_unique_rtargs);
+            SetCommonRuntimeArgs(program, dummy_brisc_kernel, brisc_common_rtargs);
+            at_least_one_kernel = true;
+        }
+
+        if (i == 0 or ((rand() % 2) == 0)) {
+            auto dummy_ncrisc_kernel = CreateKernel(
+                program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, DataMovementConfig{
+                    .processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default, .compile_args = ncrisc_compile_args, .defines = data_movement_defines});
+            SetRuntimeArgs(program, dummy_ncrisc_kernel, cr_set, ncrisc_unique_rtargs);
+            SetCommonRuntimeArgs(program, dummy_ncrisc_kernel, ncrisc_common_rtargs);
+            at_least_one_kernel = true;
+        }
+
+        if (i == 0 or ((rand() % 2) == 0)) {
+            auto dummy_trisc_kernel = CreateKernel(
+                program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, ComputeConfig{
+                    .math_approx_mode = false,
+                    .compile_args = trisc_compile_args,
+                    .defines = compute_defines
+                });
+            SetRuntimeArgs(program, dummy_trisc_kernel, cr_set, trisc_unique_rtargs);
+            SetCommonRuntimeArgs(program, dummy_trisc_kernel, trisc_common_rtargs);
+            at_least_one_kernel = true;
+        }
+
+        if (not at_least_one_kernel) {
+            uint32_t random_risc = rand() % 3 + 1;
+            if (random_risc == 1) {
+                auto dummy_brisc_kernel = CreateKernel(
+                    program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, DataMovementConfig{
+                        .processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default, .compile_args = brisc_compile_args, .defines = data_movement_defines});
+                SetRuntimeArgs(program, dummy_brisc_kernel, cr_set, brisc_unique_rtargs);
+                SetCommonRuntimeArgs(program, dummy_brisc_kernel, brisc_common_rtargs);
+            } else if (random_risc == 2) {
+                auto dummy_ncrisc_kernel = CreateKernel(
+                    program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, DataMovementConfig{
+                        .processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default, .compile_args = ncrisc_compile_args, .defines = data_movement_defines});
+                SetRuntimeArgs(program, dummy_ncrisc_kernel, cr_set, ncrisc_unique_rtargs);
+                SetCommonRuntimeArgs(program, dummy_ncrisc_kernel, ncrisc_common_rtargs);
+            } else if (random_risc == 3) {
+                auto dummy_trisc_kernel = CreateKernel(
+                    program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, ComputeConfig{
+                        .math_approx_mode = false,
+                        .compile_args = trisc_compile_args,
+                        .defines = compute_defines
+                    });
+                SetRuntimeArgs(program, dummy_trisc_kernel, cr_set, trisc_unique_rtargs);
+                SetCommonRuntimeArgs(program, dummy_trisc_kernel, trisc_common_rtargs);
+            } else {
+                TT_ASSERT("Invalid");
+            }
+        }
+
+        tt::tt_metal::detail::CompileProgram(this->device_, program);
+    }
+
+    for (uint8_t cq_id = 0; cq_id < this->device_->num_hw_cqs(); ++cq_id) {
+        log_info(tt::LogTest, "Running {} programs on cq {} for cache warmup.", programs.size(), (uint32_t)cq_id);
+        // This loop caches program and runs
+        for (Program& program: programs) {
+            EnqueueProgram(this->device_->command_queue(cq_id), program, false);
+        }
+
+        // This loops assumes already cached
+        uint32_t NUM_ITERATIONS = 500; // TODO(agrebenisan): Bump this to 5000, saw hangs for very large number of iterations, need to come back to that
+
+        log_info(tt::LogTest, "Running {} programs on cq {} for {} iterations now.", programs.size(), (uint32_t)cq_id, NUM_ITERATIONS);
+        for (uint32_t i = 0; i < NUM_ITERATIONS; i++) {
+            auto rng = std::default_random_engine {};
+            std::shuffle(std::begin(programs), std::end(programs), rng);
+            if (i % 10 == 0) {
+                log_debug(tt::LogTest, "Enqueueing {} programs on cq {} for iter: {}/{} now.", programs.size(), (uint32_t)cq_id, i+1, NUM_ITERATIONS);
+            }
+            for (Program& program: programs) {
+                EnqueueProgram(this->device_->command_queue(cq_id), program, false);
+            }
+        }
+
+        log_info(tt::LogTest, "Calling Finish.");
+        Finish(this->device_->command_queue(cq_id));
+    }
+}
+
+}  // namespace stress_tests
diff --git a/tt_metal/detail/tt_metal.hpp b/tt_metal/detail/tt_metal.hpp
index 426d8a83647e..4cf6a69ce37c 100644
--- a/tt_metal/detail/tt_metal.hpp
+++ b/tt_metal/detail/tt_metal.hpp
@@ -399,18 +399,6 @@ namespace tt::tt_metal{
                 pcie_chan_end_addr += tt::Cluster::instance().get_host_channel_size(device->id(), pcie_chan);
             }
 
-            uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id());
-            const uint8_t cq_id = 0; // Currently, only the first command queue is responsible for enqueuing programs
-            tt_cxy_pair enqueue_program_dispatch_core;
-            if (device->is_mmio_capable()) {
-                enqueue_program_dispatch_core = dispatch_core_manager::get(device->num_hw_cqs()).dispatcher_core(device->id(), channel, cq_id);
-            } else {
-                enqueue_program_dispatch_core = dispatch_core_manager::get(device->num_hw_cqs()).dispatcher_d_core(device->id(), channel, cq_id);
-            }
-
-            CoreType core_type = dispatch_core_manager::get(device->num_hw_cqs()).get_dispatch_core_type(device->id());
-            CoreCoord physical_enqueue_program_dispatch_core = get_physical_core_coordinate(enqueue_program_dispatch_core, core_type);
-
             jit_build_genfiles_noc_addr_ranges_header(
                 path,
                 pcie_chan_base_addr,
@@ -421,8 +409,7 @@ namespace tt::tt_metal{
                 soc_d.get_dram_cores(),
                 soc_d.get_physical_ethernet_cores(),
                 soc_d.grid_size,
-                harvested_rows,
-                physical_enqueue_program_dispatch_core);
+                harvested_rows);
         }
 
         inline void CheckDataMovementConfig(Program &program, const std::string &file_name, const CoreRangeSet &core_ranges) {
diff --git a/tt_metal/hw/firmware/src/brisc.cc b/tt_metal/hw/firmware/src/brisc.cc
index baab7633ae33..d88ea516ceae 100644
--- a/tt_metal/hw/firmware/src/brisc.cc
+++ b/tt_metal/hw/firmware/src/brisc.cc
@@ -400,7 +400,7 @@ int main() {
             // Notify dispatcher core that it has completed
             if (mailboxes->launch.mode == DISPATCH_MODE_DEV) {
                 uint64_t dispatch_addr =
-                    NOC_XY_ADDR(NOC_X(DISPATCH_CORE_X), NOC_Y(DISPATCH_CORE_Y), DISPATCH_MESSAGE_ADDR);
+                    NOC_XY_ADDR(NOC_X(mailboxes->launch.dispatch_core_x), NOC_Y(mailboxes->launch.dispatch_core_y), DISPATCH_MESSAGE_ADDR);
                 DEBUG_SANITIZE_NOC_ADDR(dispatch_addr, 4);
                 noc_fast_atomic_increment(
                     noc_index,
diff --git a/tt_metal/hw/firmware/src/erisck.cc b/tt_metal/hw/firmware/src/erisck.cc
index 6a173069a91e..76fdb1645054 100644
--- a/tt_metal/hw/firmware/src/erisck.cc
+++ b/tt_metal/hw/firmware/src/erisck.cc
@@ -33,7 +33,7 @@ void __attribute__((section("erisc_l1_code"))) kernel_launch() {
 
     kernel_main();
     mailboxes->launch.run = RUN_MSG_DONE;
-    uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(DISPATCH_CORE_X), NOC_Y(DISPATCH_CORE_Y), DISPATCH_MESSAGE_ADDR);
+    uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(mailboxes->launch.dispatch_core_x), NOC_Y(mailboxes->launch.dispatch_core_y), DISPATCH_MESSAGE_ADDR);
     if (routing_info->routing_enabled and mailboxes->launch.mode == DISPATCH_MODE_DEV) {
         internal_::notify_dispatch_core_done(dispatch_addr);
     }
diff --git a/tt_metal/hw/firmware/src/idle_erisc.cc b/tt_metal/hw/firmware/src/idle_erisc.cc
index c09fb0c62e47..a5bacb80d1f9 100644
--- a/tt_metal/hw/firmware/src/idle_erisc.cc
+++ b/tt_metal/hw/firmware/src/idle_erisc.cc
@@ -132,7 +132,7 @@ int main() {
 
             // Notify dispatcher core that it has completed
             if (mailboxes->launch.mode == DISPATCH_MODE_DEV) {
-                uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(DISPATCH_CORE_X), NOC_Y(DISPATCH_CORE_Y), DISPATCH_MESSAGE_ADDR);
+                uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(mailboxes->launch.dispatch_core_x), NOC_Y(mailboxes->launch.dispatch_core_y), DISPATCH_MESSAGE_ADDR);
                 DEBUG_SANITIZE_NOC_ADDR(dispatch_addr, 4);
                 noc_fast_atomic_increment(noc_index, NCRISC_AT_CMD_BUF, dispatch_addr, NOC_UNICAST_WRITE_VC, 1, 31 /*wrap*/, false /*linked*/);
             }
diff --git a/tt_metal/hw/inc/dev_msgs.h b/tt_metal/hw/inc/dev_msgs.h
index f8e53124b785..c63ac7070507 100644
--- a/tt_metal/hw/inc/dev_msgs.h
+++ b/tt_metal/hw/inc/dev_msgs.h
@@ -73,6 +73,8 @@ struct launch_msg_t {  // must be cacheline aligned
     volatile uint8_t brisc_noc_id;
     volatile uint8_t enables[DISPATCH_CLASS_MAX_PROC];
     volatile uint8_t max_cb_index;
+    volatile uint8_t dispatch_core_x;
+    volatile uint8_t dispatch_core_y;
     volatile uint8_t run;  // must be in last cacheline of this msg
 };
 
diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp
index c129651eaba7..46a77c88e37d 100644
--- a/tt_metal/impl/device/device.cpp
+++ b/tt_metal/impl/device/device.cpp
@@ -31,14 +31,8 @@ Device::Device(
     chip_id_t device_id, const uint8_t num_hw_cqs, size_t l1_small_size, size_t trace_region_size, const std::vector<uint32_t> &l1_bank_remap, bool minimal, uint32_t worker_core) :
     id_(device_id), worker_thread_core(worker_core), work_executor(worker_core, device_id) {
     ZoneScoped;
-    TT_ASSERT(num_hw_cqs > 0 and num_hw_cqs < 3, "num_hw_cqs can be between 1 and 2");
+    TT_ASSERT(num_hw_cqs > 0 and num_hw_cqs <= Device::max_num_hw_cqs, "num_hw_cqs can be between 1 and {}", Device::max_num_hw_cqs);
     this->build_key_ = tt::Cluster::instance().get_harvesting_mask(device_id);
-    if (!this->is_mmio_capable()) {
-        //Remote device Vs MMIO Device need to be unique even if they have same harvesting mask because
-        //dispatch cores are allocated differently on two types of devices.
-        //harvest mask is 12-bit mask so adding 100000 should not alias with any possible harvest mask value.
-        this->build_key_ += 100000;
-    }
     tunnel_device_dispatch_workers_ = {};
     this->initialize(num_hw_cqs, l1_small_size, trace_region_size, l1_bank_remap, minimal);
 }
@@ -1900,12 +1894,23 @@ uint32_t Device::get_noc_unicast_encoding(uint8_t noc_index, const CoreCoord& ph
 
 uint32_t Device::get_noc_multicast_encoding(uint8_t noc_index, const CoreRange& physical_cores) const {
     const auto& grid_size = tt::Cluster::instance().get_soc_desc(this->id()).grid_size;
-    return NOC_MULTICAST_ENCODING(
-        NOC_0_X(noc_index, grid_size.x, physical_cores.start.x),
-        NOC_0_Y(noc_index, grid_size.y, physical_cores.start.y),
-        NOC_0_X(noc_index, grid_size.x, physical_cores.end.x),
-        NOC_0_Y(noc_index, grid_size.y, physical_cores.end.y)
-    );
+
+    // NOC 1 mcasts from bottom left to top right, so we need to reverse the coords
+    if (noc_index == 0) {
+        return NOC_MULTICAST_ENCODING(
+            NOC_0_X(noc_index, grid_size.x, physical_cores.start.x),
+            NOC_0_Y(noc_index, grid_size.y, physical_cores.start.y),
+            NOC_0_X(noc_index, grid_size.x, physical_cores.end.x),
+            NOC_0_Y(noc_index, grid_size.y, physical_cores.end.y)
+        );
+    } else {
+        return NOC_MULTICAST_ENCODING(
+            NOC_0_X(noc_index, grid_size.x, physical_cores.end.x),
+            NOC_0_Y(noc_index, grid_size.y, physical_cores.end.y),
+            NOC_0_X(noc_index, grid_size.x, physical_cores.start.x),
+            NOC_0_Y(noc_index, grid_size.y, physical_cores.start.y)
+        );
+    }
 }
 
 void Device::check_allocator_is_initialized() const {
diff --git a/tt_metal/impl/device/device.hpp b/tt_metal/impl/device/device.hpp
index 7d29a4ba3eee..35a962273e9b 100644
--- a/tt_metal/impl/device/device.hpp
+++ b/tt_metal/impl/device/device.hpp
@@ -244,6 +244,7 @@ class Device {
     friend class SystemMemoryManager;
 
     static constexpr MemoryAllocator allocator_scheme_ = MemoryAllocator::L1_BANKING;
+    static constexpr uint32_t max_num_hw_cqs = 2;
     chip_id_t id_;
     uint32_t build_key_;
     std::unique_ptr<Allocator> allocator_ = nullptr;
diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp
index 8d61a55281d9..9cca6e99760f 100644
--- a/tt_metal/impl/dispatch/command_queue.cpp
+++ b/tt_metal/impl/dispatch/command_queue.cpp
@@ -40,7 +40,9 @@ std::condition_variable finish_cv;
 namespace tt::tt_metal {
 
 // TODO: Delete entries when programs are deleted to save memory
-thread_local std::unordered_map<uint64_t, EnqueueProgramCommand::CachedProgramCommandSequence>
+thread_local std::unordered_map<
+    uint64_t,
+    std::array<std::optional<EnqueueProgramCommand::CachedProgramCommandSequence>, Device::max_num_hw_cqs>>
     EnqueueProgramCommand::cached_program_command_sequences = {};
 
 // EnqueueReadBufferCommandSection
@@ -160,8 +162,8 @@ void EnqueueWriteInterleavedBufferCommand::add_dispatch_write(HugepageDeviceComm
 void EnqueueWriteInterleavedBufferCommand::add_buffer_data(HugepageDeviceCommand& command_sequence) {
     uint32_t data_size_bytes = this->pages_to_write * this->padded_page_size;
 
-    uint32_t full_page_size = this->buffer.aligned_page_size(); // this->padded_page_size could be a partial page if buffer
-                                                             // page size > MAX_PREFETCH_CMD_SIZE
+    uint32_t full_page_size = this->buffer.aligned_page_size();  // this->padded_page_size could be a partial page if
+                                                                 // buffer page size > MAX_PREFETCH_CMD_SIZE
     bool write_partial_pages = this->padded_page_size < full_page_size;
 
     uint32_t buffer_addr_offset = this->bank_base_address - this->buffer.address();
@@ -186,7 +188,8 @@ void EnqueueWriteInterleavedBufferCommand::add_buffer_data(HugepageDeviceCommand
         uint32_t unpadded_src_offset =
             (((buffer_addr_offset / this->padded_page_size) * num_banks) + this->dst_page_index) *
             this->buffer.page_size();
-        if (this->buffer.page_size() % this->buffer.alignment() != 0 and this->buffer.page_size() != this->buffer.size()) {
+        if (this->buffer.page_size() % this->buffer.alignment() != 0 and
+            this->buffer.page_size() != this->buffer.size()) {
             // If page size is not 32B-aligned, we cannot do a contiguous write
             uint32_t src_address_offset = unpadded_src_offset;
             for (uint32_t sysmem_address_offset = 0; sysmem_address_offset < data_size_bytes;
@@ -268,8 +271,8 @@ void EnqueueWriteBufferCommand::process() {
 
     this->add_dispatch_write(command_sequence);
 
-    uint32_t full_page_size = this->buffer.aligned_page_size(); // this->padded_page_size could be a partial page if buffer
-                                                             // page size > MAX_PREFETCH_CMD_SIZE
+    uint32_t full_page_size = this->buffer.aligned_page_size();  // this->padded_page_size could be a partial page if
+                                                                 // buffer page size > MAX_PREFETCH_CMD_SIZE
     bool write_partial_pages = this->padded_page_size < full_page_size;
 
     this->add_buffer_data(command_sequence);
@@ -287,18 +290,21 @@ EnqueueProgramCommand::EnqueueProgramCommand(
     Device* device,
     NOC noc_index,
     Program& program,
+    CoreCoord& dispatch_core,
     SystemMemoryManager& manager,
     uint32_t expected_num_workers_completed) :
     command_queue_id(command_queue_id),
     noc_index(noc_index),
     manager(manager),
     expected_num_workers_completed(expected_num_workers_completed),
-    program(program) {
+    program(program),
+    dispatch_core(dispatch_core) {
     this->device = device;
     this->dispatch_core_type = dispatch_core_manager::get(device->num_hw_cqs()).get_dispatch_core_type(device->id());
 }
 
-void EnqueueProgramCommand::assemble_preamble_commands(bool prefetch_stall) {
+void EnqueueProgramCommand::assemble_preamble_commands(
+    CachedProgramCommandSequence& cached_program_command_sequence, bool prefetch_stall) {
     if (prefetch_stall) {
         // Wait command so previous program finishes
         // Wait command with barrier for binaries to commit to DRAM
@@ -307,24 +313,21 @@ void EnqueueProgramCommand::assemble_preamble_commands(bool prefetch_stall) {
             CQ_PREFETCH_CMD_BARE_MIN_SIZE +  // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT
             CQ_PREFETCH_CMD_BARE_MIN_SIZE;   // CQ_PREFETCH_CMD_STALL
 
-        this->cached_program_command_sequences[program.id].preamble_command_sequence =
-            HostMemDeviceCommand(uncached_cmd_sequence_sizeB);
+        cached_program_command_sequence.preamble_command_sequence = HostMemDeviceCommand(uncached_cmd_sequence_sizeB);
 
         // Wait for Noc Write Barrier
         // wait for binaries to commit to dram, also wait for previous program to be done
         // Wait Noc Write Barrier, wait for binaries to be written to worker cores
         // Stall to allow binaries to commit to DRAM first
         // TODO: this can be removed for all but the first program run
-        this->cached_program_command_sequences[program.id]
-            .preamble_command_sequence.add_dispatch_wait_with_prefetch_stall(
-                true, DISPATCH_MESSAGE_ADDR, this->expected_num_workers_completed);
+        cached_program_command_sequence.preamble_command_sequence.add_dispatch_wait_with_prefetch_stall(
+            true, DISPATCH_MESSAGE_ADDR, this->expected_num_workers_completed);
     } else {
         // Wait command so previous program finishes
         constexpr uint32_t cached_cmd_sequence_sizeB =
             CQ_PREFETCH_CMD_BARE_MIN_SIZE;  // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT
-        this->cached_program_command_sequences[program.id].preamble_command_sequence =
-            HostMemDeviceCommand(cached_cmd_sequence_sizeB);
-        this->cached_program_command_sequences[program.id].preamble_command_sequence.add_dispatch_wait(
+        cached_program_command_sequence.preamble_command_sequence = HostMemDeviceCommand(cached_cmd_sequence_sizeB);
+        cached_program_command_sequence.preamble_command_sequence.add_dispatch_wait(
             false, DISPATCH_MESSAGE_ADDR, this->expected_num_workers_completed);
     }
 }
@@ -338,25 +341,33 @@ uint32_t get_max_write_packed_sub_cmds(uint32_t data_size, uint32_t max_prefetch
     uint32_t sub_cmd_sizeB =
         is_unicast ? sizeof(CQDispatchWritePackedUnicastSubCmd) : sizeof(CQDispatchWritePackedMulticastSubCmd);
     // Approximate calculation due to alignment
-    uint32_t max_prefetch_size = max_prefetch_cmd_size - sizeof(CQPrefetchCmd) - PCIE_ALIGNMENT - sizeof(CQDispatchCmd) - L1_ALIGNMENT;
+    uint32_t max_prefetch_size =
+        max_prefetch_cmd_size - sizeof(CQPrefetchCmd) - PCIE_ALIGNMENT - sizeof(CQDispatchCmd) - L1_ALIGNMENT;
     uint32_t max_prefetch_num_packed_cmds =
         no_stride ? (max_prefetch_size - align(data_size * sizeof(uint32_t), L1_ALIGNMENT)) / sub_cmd_sizeB
-                : max_prefetch_size / (align(data_size * sizeof(uint32_t), L1_ALIGNMENT) + sub_cmd_sizeB);
-    return min(max_prefetch_num_packed_cmds, is_unicast ? CQ_DISPATCH_CMD_PACKED_WRITE_MAX_UNICAST_SUB_CMDS : CQ_DISPATCH_CMD_PACKED_WRITE_MAX_MULTICAST_SUB_CMDS);
+                  : max_prefetch_size / (align(data_size * sizeof(uint32_t), L1_ALIGNMENT) + sub_cmd_sizeB);
+    return min(
+        max_prefetch_num_packed_cmds,
+        is_unicast ? CQ_DISPATCH_CMD_PACKED_WRITE_MAX_UNICAST_SUB_CMDS
+                   : CQ_DISPATCH_CMD_PACKED_WRITE_MAX_MULTICAST_SUB_CMDS);
 };
 
 template <typename PackedSubCmd>
-uint32_t insert_write_packed_payloads(const uint32_t num_sub_cmds, const uint32_t sub_cmd_sizeB, const uint32_t max_prefetch_command_size, std::vector<std::pair<uint32_t, uint32_t>>& packed_cmd_payloads) {
+uint32_t insert_write_packed_payloads(
+    const uint32_t num_sub_cmds,
+    const uint32_t sub_cmd_sizeB,
+    const uint32_t max_prefetch_command_size,
+    std::vector<std::pair<uint32_t, uint32_t>>& packed_cmd_payloads) {
     const uint32_t aligned_sub_cmd_sizeB = align(sub_cmd_sizeB, L1_ALIGNMENT);
-    const uint32_t max_packed_sub_cmds_per_cmd = get_max_write_packed_sub_cmds<PackedSubCmd>(aligned_sub_cmd_sizeB, max_prefetch_command_size, false);
+    const uint32_t max_packed_sub_cmds_per_cmd =
+        get_max_write_packed_sub_cmds<PackedSubCmd>(aligned_sub_cmd_sizeB, max_prefetch_command_size, false);
     uint32_t rem_num_sub_cmds = num_sub_cmds;
     uint32_t cmd_payload_sizeB = 0;
     while (rem_num_sub_cmds != 0) {
         const uint32_t num_sub_cmds_in_cmd = std::min(max_packed_sub_cmds_per_cmd, rem_num_sub_cmds);
         const uint32_t aligned_data_sizeB = aligned_sub_cmd_sizeB * num_sub_cmds_in_cmd;
-        const uint32_t dispatch_cmd_sizeB = align(
-            sizeof(CQDispatchCmd) + num_sub_cmds_in_cmd * sizeof(PackedSubCmd),
-            L1_ALIGNMENT);
+        const uint32_t dispatch_cmd_sizeB =
+            align(sizeof(CQDispatchCmd) + num_sub_cmds_in_cmd * sizeof(PackedSubCmd), L1_ALIGNMENT);
         packed_cmd_payloads.emplace_back(num_sub_cmds_in_cmd, dispatch_cmd_sizeB + aligned_data_sizeB);
         cmd_payload_sizeB += align(sizeof(CQPrefetchCmd) + packed_cmd_payloads.back().second, PCIE_ALIGNMENT);
         rem_num_sub_cmds -= num_sub_cmds_in_cmd;
@@ -434,7 +445,8 @@ void generate_runtime_args_cmds(
 }
 
 // Generate command sequence for unique (unicast) and common (multicast) runtime args
-void EnqueueProgramCommand::assemble_runtime_args_commands() {
+void EnqueueProgramCommand::assemble_runtime_args_commands(
+    CachedProgramCommandSequence& cached_program_command_sequence) {
     // Maps to enum class RISCV, tt_backend_api_types.h
     thread_local static const std::vector<uint32_t> unique_processor_to_l1_arg_base_addr = {
         L1_KERNEL_CONFIG_BASE,
@@ -542,8 +554,8 @@ void EnqueueProgramCommand::assemble_runtime_args_commands() {
     }
     // Reserve 2x for unique rtas as we pontentially split the cmds due to not fitting in one prefetch cmd
     // Common rtas are always expected to fit in one prefetch cmd
-    this->cached_program_command_sequences[program.id].runtime_args_command_sequences = {};
-    this->cached_program_command_sequences[program.id].runtime_args_command_sequences.reserve(
+    cached_program_command_sequence.runtime_args_command_sequences = {};
+    cached_program_command_sequence.runtime_args_command_sequences.reserve(
         2 * unique_processors.size() + common_kernels.size());
     std::vector<std::pair<uint32_t, uint32_t>> runtime_args_data_index;
     runtime_args_data_index.reserve(2 * (unique_processors.size() + common_kernels.size()));
@@ -552,7 +564,7 @@ void EnqueueProgramCommand::assemble_runtime_args_commands() {
     // same for all splits
     for (const uint32_t& processor_idx : unique_processors) {
         generate_runtime_args_cmds(
-            this->cached_program_command_sequences[program.id].runtime_args_command_sequences,
+            cached_program_command_sequence.runtime_args_command_sequences,
             unique_processor_to_l1_arg_base_addr[processor_idx],
             unique_sub_cmds[processor_idx],
             unique_rt_data_and_sizes[processor_idx],
@@ -566,7 +578,7 @@ void EnqueueProgramCommand::assemble_runtime_args_commands() {
         std::visit(
             [&](auto&& sub_cmds) {
                 generate_runtime_args_cmds(
-                    this->cached_program_command_sequences[program.id].runtime_args_command_sequences,
+                    cached_program_command_sequence.runtime_args_command_sequences,
                     common_processor_to_l1_arg_base_addr[kernel_id],
                     sub_cmds,
                     common_rt_data_and_sizes[kernel_id],
@@ -579,20 +591,21 @@ void EnqueueProgramCommand::assemble_runtime_args_commands() {
             common_sub_cmds[kernel_id]);
     }
     uint32_t runtime_args_fetch_size_bytes = 0;
-    for (const auto& cmds : this->cached_program_command_sequences[program.id].runtime_args_command_sequences) {
+    for (const auto& cmds : cached_program_command_sequence.runtime_args_command_sequences) {
         // BRISC, NCRISC, TRISC...
         runtime_args_fetch_size_bytes += cmds.size_bytes();
     }
-    this->cached_program_command_sequences[program.id].runtime_args_fetch_size_bytes = runtime_args_fetch_size_bytes;
+    cached_program_command_sequence.runtime_args_fetch_size_bytes = runtime_args_fetch_size_bytes;
 }
 
-void EnqueueProgramCommand::assemble_device_commands() {
-    auto& cached_program_command_sequence = this->cached_program_command_sequences[this->program.id];
-    if (!program.loaded_onto_device) {
+void EnqueueProgramCommand::assemble_device_commands(
+    CachedProgramCommandSequence& cached_program_command_sequence, bool is_cached) {
+    if (not is_cached) {
         // Calculate size of command and fill program indices of data to update
         // TODO: Would be nice if we could pull this out of program
         uint32_t cmd_sequence_sizeB = 0;
-        const uint32_t max_prefetch_command_size = dispatch_constants::get(dispatch_core_type).max_prefetch_command_size();
+        const uint32_t max_prefetch_command_size =
+            dispatch_constants::get(dispatch_core_type).max_prefetch_command_size();
 
         for (const auto& [dst, transfer_info_vec] : program.program_transfer_info.multicast_semaphores) {
             uint32_t num_packed_cmds = 0;
@@ -688,9 +701,11 @@ void EnqueueProgramCommand::assemble_device_commands() {
                 max_overall_base_index = max(max_overall_base_index, max_base_index);
                 i++;
             }
-            cb_config_size_bytes = (max_overall_base_index + UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG) * sizeof(uint32_t);
+            cb_config_size_bytes =
+                (max_overall_base_index + UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG) * sizeof(uint32_t);
             aligned_cb_config_size_bytes = align(cb_config_size_bytes, L1_ALIGNMENT);
-            cmd_sequence_sizeB += insert_write_packed_payloads<CQDispatchWritePackedMulticastSubCmd>(num_multicast_cb_sub_cmds, cb_config_size_bytes, max_prefetch_command_size, mcast_cb_payload);
+            cmd_sequence_sizeB += insert_write_packed_payloads<CQDispatchWritePackedMulticastSubCmd>(
+                num_multicast_cb_sub_cmds, cb_config_size_bytes, max_prefetch_command_size, mcast_cb_payload);
         }
 
         // Program Binaries and Go Signals
@@ -821,9 +836,13 @@ void EnqueueProgramCommand::assemble_device_commands() {
         std::vector<CQDispatchWritePackedUnicastSubCmd> unicast_go_signal_sub_cmds;
         std::vector<std::pair<uint32_t, uint32_t>> multicast_go_signals_payload;
         std::vector<std::pair<uint32_t, uint32_t>> unicast_go_signals_payload;
-        const uint32_t go_signal_sizeB = sizeof(launch_msg_t);
+        constexpr uint32_t go_signal_sizeB = sizeof(launch_msg_t);
+        constexpr uint32_t aligned_go_signal_sizeB = align(go_signal_sizeB, L1_ALIGNMENT);
+        constexpr uint32_t go_signal_size_words = aligned_go_signal_sizeB / sizeof(uint32_t);
         for (KernelGroup& kernel_group : program.get_kernel_groups(CoreType::WORKER)) {
             kernel_group.launch_msg.mode = DISPATCH_MODE_DEV;
+            kernel_group.launch_msg.dispatch_core_x = this->dispatch_core.x;
+            kernel_group.launch_msg.dispatch_core_y = this->dispatch_core.y;
             const void* launch_message_data = (const void*)(&kernel_group.launch_msg);
             for (const CoreRange& core_range : kernel_group.core_ranges.ranges()) {
                 CoreCoord physical_start =
@@ -839,12 +858,18 @@ void EnqueueProgramCommand::assemble_device_commands() {
             }
         }
         if (multicast_go_signal_sub_cmds.size() > 0) {
-            cmd_sequence_sizeB += insert_write_packed_payloads<CQDispatchWritePackedMulticastSubCmd>(multicast_go_signal_sub_cmds.size(), sizeof(launch_msg_t), max_prefetch_command_size, multicast_go_signals_payload);
+            cmd_sequence_sizeB += insert_write_packed_payloads<CQDispatchWritePackedMulticastSubCmd>(
+                multicast_go_signal_sub_cmds.size(),
+                go_signal_sizeB,
+                max_prefetch_command_size,
+                multicast_go_signals_payload);
         }
 
         for (KernelGroup& kernel_group : program.get_kernel_groups(CoreType::ETH)) {
             kernel_group.launch_msg.mode = DISPATCH_MODE_DEV;
-            const void* launch_message_data = (const void*)(&kernel_group.launch_msg);
+            kernel_group.launch_msg.dispatch_core_x = this->dispatch_core.x;
+            kernel_group.launch_msg.dispatch_core_y = this->dispatch_core.y;
+            const void* launch_message_data = (const launch_msg_t*)(&kernel_group.launch_msg);
             for (const CoreRange& core_range : kernel_group.core_ranges.ranges()) {
                 for (auto x = core_range.start.x; x <= core_range.end.x; x++) {
                     for (auto y = core_range.start.y; y <= core_range.end.y; y++) {
@@ -858,7 +883,11 @@ void EnqueueProgramCommand::assemble_device_commands() {
             }
         }
         if (unicast_go_signal_sub_cmds.size() > 0) {
-            cmd_sequence_sizeB += insert_write_packed_payloads<CQDispatchWritePackedUnicastSubCmd>(unicast_go_signal_sub_cmds.size(), sizeof(launch_msg_t), max_prefetch_command_size, unicast_go_signals_payload);
+            cmd_sequence_sizeB += insert_write_packed_payloads<CQDispatchWritePackedUnicastSubCmd>(
+                unicast_go_signal_sub_cmds.size(),
+                go_signal_sizeB,
+                max_prefetch_command_size,
+                unicast_go_signals_payload);
         }
 
         cached_program_command_sequence.program_command_sequence = HostMemDeviceCommand(cmd_sequence_sizeB);
@@ -928,7 +957,7 @@ void EnqueueProgramCommand::assemble_device_commands() {
         if (num_multicast_cb_sub_cmds > 0) {
             uint32_t curr_sub_cmd_idx = 0;
             cached_program_command_sequence.cb_configs_payloads.reserve(num_multicast_cb_sub_cmds);
-            uint32_t cb_config_size_words = aligned_cb_config_size_bytes / sizeof(uint32_t);
+            const uint32_t cb_config_size_words = aligned_cb_config_size_bytes / sizeof(uint32_t);
             for (const auto& [num_sub_cmds_in_cmd, mcast_cb_payload_sizeB] : mcast_cb_payload) {
                 uint32_t write_offset_bytes = program_command_sequence.write_offset_bytes();
                 program_command_sequence.add_dispatch_write_packed<CQDispatchWritePackedMulticastSubCmd>(
@@ -940,9 +969,13 @@ void EnqueueProgramCommand::assemble_device_commands() {
                     multicast_cb_config_data,
                     curr_sub_cmd_idx);
                 curr_sub_cmd_idx += num_sub_cmds_in_cmd;
-                uint32_t curr_sub_cmd_data_offset_words = (write_offset_bytes + CQ_PREFETCH_CMD_BARE_MIN_SIZE + align(num_sub_cmds_in_cmd * sizeof(CQDispatchWritePackedMulticastSubCmd), L1_ALIGNMENT)) / sizeof(uint32_t);
+                uint32_t curr_sub_cmd_data_offset_words =
+                    (write_offset_bytes + CQ_PREFETCH_CMD_BARE_MIN_SIZE +
+                     align(num_sub_cmds_in_cmd * sizeof(CQDispatchWritePackedMulticastSubCmd), L1_ALIGNMENT)) /
+                    sizeof(uint32_t);
                 for (uint32_t i = 0; i < num_sub_cmds_in_cmd; ++i) {
-                    cached_program_command_sequence.cb_configs_payloads.push_back((uint32_t *)program_command_sequence.data() + curr_sub_cmd_data_offset_words);
+                    cached_program_command_sequence.cb_configs_payloads.push_back(
+                        (uint32_t*)program_command_sequence.data() + curr_sub_cmd_data_offset_words);
                     curr_sub_cmd_data_offset_words += cb_config_size_words;
                 }
             }
@@ -970,9 +1003,12 @@ void EnqueueProgramCommand::assemble_device_commands() {
         }
 
         // Go Signals
+        cached_program_command_sequence.go_signals.reserve(
+            multicast_go_signal_sub_cmds.size() + unicast_go_signal_sub_cmds.size());
         if (multicast_go_signal_sub_cmds.size() > 0) {
             uint32_t curr_sub_cmd_idx = 0;
             for (const auto& [num_sub_cmds_in_cmd, multicast_go_signal_payload_sizeB] : multicast_go_signals_payload) {
+                uint32_t write_offset_bytes = program_command_sequence.write_offset_bytes();
                 program_command_sequence.add_dispatch_write_packed<CQDispatchWritePackedMulticastSubCmd>(
                     num_sub_cmds_in_cmd,
                     GET_MAILBOX_ADDRESS_HOST(launch),
@@ -982,12 +1018,22 @@ void EnqueueProgramCommand::assemble_device_commands() {
                     multicast_go_signal_data,
                     curr_sub_cmd_idx);
                 curr_sub_cmd_idx += num_sub_cmds_in_cmd;
+                uint32_t curr_sub_cmd_data_offset_words =
+                    (write_offset_bytes + CQ_PREFETCH_CMD_BARE_MIN_SIZE +
+                     align(num_sub_cmds_in_cmd * sizeof(CQDispatchWritePackedMulticastSubCmd), L1_ALIGNMENT)) /
+                    sizeof(uint32_t);
+                for (uint32_t i = 0; i < num_sub_cmds_in_cmd; ++i) {
+                    cached_program_command_sequence.go_signals.push_back(
+                        (launch_msg_t*)((uint32_t*)program_command_sequence.data() + curr_sub_cmd_data_offset_words));
+                    curr_sub_cmd_data_offset_words += go_signal_size_words;
+                }
             }
         }
 
         if (unicast_go_signal_sub_cmds.size() > 0) {
             uint32_t curr_sub_cmd_idx = 0;
             for (const auto& [num_sub_cmds_in_cmd, unicast_go_signal_payload_sizeB] : unicast_go_signals_payload) {
+                uint32_t write_offset_bytes = program_command_sequence.write_offset_bytes();
                 program_command_sequence.add_dispatch_write_packed<CQDispatchWritePackedUnicastSubCmd>(
                     num_sub_cmds_in_cmd,
                     GET_ETH_MAILBOX_ADDRESS_HOST(launch),
@@ -997,9 +1043,19 @@ void EnqueueProgramCommand::assemble_device_commands() {
                     unicast_go_signal_data,
                     curr_sub_cmd_idx);
                 curr_sub_cmd_idx += num_sub_cmds_in_cmd;
+                uint32_t curr_sub_cmd_data_offset_words =
+                    (write_offset_bytes + CQ_PREFETCH_CMD_BARE_MIN_SIZE +
+                     align(num_sub_cmds_in_cmd * sizeof(CQDispatchWritePackedUnicastSubCmd), L1_ALIGNMENT)) /
+                    sizeof(uint32_t);
+                for (uint32_t i = 0; i < num_sub_cmds_in_cmd; ++i) {
+                    cached_program_command_sequence.go_signals.push_back(
+                        (launch_msg_t*)((uint32_t*)program_command_sequence.data() + curr_sub_cmd_data_offset_words));
+                    curr_sub_cmd_data_offset_words += go_signal_size_words;
+                }
             }
         }
     } else {
+        // CB Configs
         uint32_t i = 0;
         for (const auto& cbs_on_core_range : cached_program_command_sequence.circular_buffers_on_core_ranges) {
             uint32_t* cb_config_payload = cached_program_command_sequence.cb_configs_payloads[i];
@@ -1019,38 +1075,43 @@ void EnqueueProgramCommand::assemble_device_commands() {
             }
             i++;
         }
+        // Go Signals
+        // for (auto & go_signal : cached_program_command_sequence.go_signals) {
+        //     go_signal->dispatch_core_x = this->dispatch_core.x;
+        //     go_signal->dispatch_core_y = this->dispatch_core.y;
+        // }
     }
 }
 
 void EnqueueProgramCommand::process() {
-    // Calculate all commands size and determine how many fetch q entries to use
+    // Invalidate cache of all cqs for the program if it has been recompiled
+    if (not program.loaded_onto_device) {
+        this->cached_program_command_sequences[program.id].fill(std::nullopt);
+        program.loaded_onto_device = true;
+    }
 
     // Preamble, some waits and stalls
     // can be written directly to the issue queue
-    if (not program.loaded_onto_device) {
-        this->assemble_preamble_commands(true);
+    auto& cached_program_command_sequence = this->cached_program_command_sequences[program.id][this->command_queue_id];
+    bool cmd_is_cached = cached_program_command_sequence.has_value();
+    if (!cmd_is_cached) {
+        cached_program_command_sequence = {};
+        this->assemble_preamble_commands(*cached_program_command_sequence, true);
         // Runtime Args Command Sequence
-        this->assemble_runtime_args_commands();
+        this->assemble_runtime_args_commands(*cached_program_command_sequence);
         // Main Command Sequence
-        this->assemble_device_commands();
+        this->assemble_device_commands(*cached_program_command_sequence, false);
     } else {
         static constexpr uint32_t count_offset = (sizeof(CQPrefetchCmd) + offsetof(CQDispatchCmd, wait.count));
-        TT_ASSERT(
-            this->cached_program_command_sequences.find(program.id) != this->cached_program_command_sequences.end(),
-            "Program cache hit, but no stored command sequence");
-        this->cached_program_command_sequences[program.id].preamble_command_sequence.update_cmd_sequence(
+        cached_program_command_sequence->preamble_command_sequence.update_cmd_sequence(
             count_offset, &this->expected_num_workers_completed, sizeof(uint32_t));
-        this->assemble_device_commands();
+        this->assemble_device_commands(*cached_program_command_sequence, true);
     }
 
-    const auto& cached_program_command_sequence = this->cached_program_command_sequences[program.id];
-
-    uint32_t preamble_fetch_size_bytes = cached_program_command_sequence.preamble_command_sequence.size_bytes();
-
-    uint32_t runtime_args_fetch_size_bytes = cached_program_command_sequence.runtime_args_fetch_size_bytes;
-
-    uint32_t program_fetch_size_bytes = cached_program_command_sequence.program_command_sequence.size_bytes();
-
+    // Calculate all commands size and determine how many fetch q entries to use
+    uint32_t preamble_fetch_size_bytes = cached_program_command_sequence->preamble_command_sequence.size_bytes();
+    uint32_t runtime_args_fetch_size_bytes = cached_program_command_sequence->runtime_args_fetch_size_bytes;
+    uint32_t program_fetch_size_bytes = cached_program_command_sequence->program_command_sequence.size_bytes();
     uint32_t total_fetch_size_bytes =
         preamble_fetch_size_bytes + runtime_args_fetch_size_bytes + program_fetch_size_bytes;
 
@@ -1061,16 +1122,16 @@ void EnqueueProgramCommand::process() {
         uint32_t write_ptr = this->manager.get_issue_queue_write_ptr(this->command_queue_id);
 
         this->manager.cq_write(
-            cached_program_command_sequence.preamble_command_sequence.data(), preamble_fetch_size_bytes, write_ptr);
+            cached_program_command_sequence->preamble_command_sequence.data(), preamble_fetch_size_bytes, write_ptr);
         write_ptr += preamble_fetch_size_bytes;
 
-        for (const auto& cmds : cached_program_command_sequence.runtime_args_command_sequences) {
+        for (const auto& cmds : cached_program_command_sequence->runtime_args_command_sequences) {
             this->manager.cq_write(cmds.data(), cmds.size_bytes(), write_ptr);
             write_ptr += cmds.size_bytes();
         }
 
         this->manager.cq_write(
-            cached_program_command_sequence.program_command_sequence.data(), program_fetch_size_bytes, write_ptr);
+            cached_program_command_sequence->program_command_sequence.data(), program_fetch_size_bytes, write_ptr);
 
         this->manager.issue_queue_push_back(total_fetch_size_bytes, this->command_queue_id);
 
@@ -1081,14 +1142,14 @@ void EnqueueProgramCommand::process() {
         this->manager.issue_queue_reserve(preamble_fetch_size_bytes, this->command_queue_id);
         uint32_t write_ptr = this->manager.get_issue_queue_write_ptr(this->command_queue_id);
         this->manager.cq_write(
-            cached_program_command_sequence.preamble_command_sequence.data(), preamble_fetch_size_bytes, write_ptr);
+            cached_program_command_sequence->preamble_command_sequence.data(), preamble_fetch_size_bytes, write_ptr);
         this->manager.issue_queue_push_back(preamble_fetch_size_bytes, this->command_queue_id);
         // One fetch queue entry for just the wait and stall, very inefficient
         this->manager.fetch_queue_reserve_back(this->command_queue_id);
         this->manager.fetch_queue_write(preamble_fetch_size_bytes, this->command_queue_id);
 
         // TODO: We can pack multiple RT args into one fetch q entry
-        for (const auto& cmds : cached_program_command_sequence.runtime_args_command_sequences) {
+        for (const auto& cmds : cached_program_command_sequence->runtime_args_command_sequences) {
             uint32_t fetch_size_bytes = cmds.size_bytes();
             this->manager.issue_queue_reserve(fetch_size_bytes, this->command_queue_id);
             write_ptr = this->manager.get_issue_queue_write_ptr(this->command_queue_id);
@@ -1102,7 +1163,7 @@ void EnqueueProgramCommand::process() {
         this->manager.issue_queue_reserve(program_fetch_size_bytes, this->command_queue_id);
         write_ptr = this->manager.get_issue_queue_write_ptr(this->command_queue_id);
         this->manager.cq_write(
-            cached_program_command_sequence.program_command_sequence.data(), program_fetch_size_bytes, write_ptr);
+            cached_program_command_sequence->program_command_sequence.data(), program_fetch_size_bytes, write_ptr);
         this->manager.issue_queue_push_back(program_fetch_size_bytes, this->command_queue_id);
         // One fetch queue entry for rest of program commands
         this->manager.fetch_queue_reserve_back(this->command_queue_id);
@@ -1110,9 +1171,8 @@ void EnqueueProgramCommand::process() {
     }
 
     // Front load generating and caching preamble without stall during program loading stage
-    if (not program.loaded_onto_device) {
-        this->assemble_preamble_commands(false);
-        program.loaded_onto_device = true;
+    if (not cmd_is_cached) {
+        this->assemble_preamble_commands(*cached_program_command_sequence, false);
     }
 }
 
@@ -1319,6 +1379,19 @@ HWCommandQueue::HWCommandQueue(Device* device, uint32_t id, NOC noc_index) :
         // Galaxy puts 4 devices per host channel until umd can provide one channel per device.
         this->size_B = this->size_B / 4;
     }
+
+    CoreCoord enqueue_program_dispatch_core;
+    if (device->is_mmio_capable()) {
+        enqueue_program_dispatch_core =
+            dispatch_core_manager::get(device->num_hw_cqs()).dispatcher_core(device->id(), channel, id);
+    } else {
+        enqueue_program_dispatch_core =
+            dispatch_core_manager::get(device->num_hw_cqs()).dispatcher_d_core(device->id(), channel, id);
+    }
+    CoreType core_type = dispatch_core_manager::get(device->num_hw_cqs()).get_dispatch_core_type(device->id());
+    this->physical_enqueue_program_dispatch_core =
+        device->physical_core_from_logical_core(enqueue_program_dispatch_core, core_type);
+
     tt_cxy_pair completion_q_writer_location =
         dispatch_core_manager::get(device->num_hw_cqs()).completion_queue_writer_core(device->id(), channel, this->id);
 
@@ -1768,7 +1841,13 @@ void HWCommandQueue::enqueue_program(Program& program, bool blocking) {
     }
 
     auto command = EnqueueProgramCommand(
-        this->id, this->device, this->noc_index, program, this->manager, expected_workers_completed);
+        this->id,
+        this->device,
+        this->noc_index,
+        program,
+        this->physical_enqueue_program_dispatch_core,
+        this->manager,
+        expected_workers_completed);
     this->enqueue_command(command, blocking);
 
 #ifdef DEBUG
@@ -2394,9 +2473,6 @@ void EnqueueWriteBufferImpl(
 void EnqueueProgram(
     CommandQueue& cq, std::variant<std::reference_wrapper<Program>, std::shared_ptr<Program>> program, bool blocking) {
     detail::DispatchStateCheck(true);
-    if (cq.get_mode() != CommandQueue::CommandQueueMode::TRACE) {
-        TT_FATAL(cq.id() == 0, "EnqueueProgram only supported on first command queue on device for time being.");
-    }
     cq.run_command(
         CommandInterface{.type = EnqueueCommandType::ENQUEUE_PROGRAM, .blocking = blocking, .program = program});
 }
diff --git a/tt_metal/impl/dispatch/command_queue.hpp b/tt_metal/impl/dispatch/command_queue.hpp
index f874edb25247..960a7c7b8b0e 100644
--- a/tt_metal/impl/dispatch/command_queue.hpp
+++ b/tt_metal/impl/dispatch/command_queue.hpp
@@ -294,6 +294,7 @@ class EnqueueProgramCommand : public Command {
     NOC noc_index;
     Program& program;
     SystemMemoryManager& manager;
+    CoreCoord dispatch_core;
     CoreType dispatch_core_type;
     uint32_t expected_num_workers_completed;
 
@@ -303,22 +304,26 @@ class EnqueueProgramCommand : public Command {
         std::vector<HostMemDeviceCommand> runtime_args_command_sequences;
         uint32_t runtime_args_fetch_size_bytes;
         HostMemDeviceCommand program_command_sequence;
-        std::vector<uint32_t *> cb_configs_payloads;
+        std::vector<uint32_t*> cb_configs_payloads;
         std::vector<std::vector<std::shared_ptr<CircularBuffer>>> circular_buffers_on_core_ranges;
+        std::vector<launch_msg_t*> go_signals;
     };
-    thread_local static std::unordered_map<uint64_t, CachedProgramCommandSequence> cached_program_command_sequences;
+    thread_local static std::
+        unordered_map<uint64_t, std::array<std::optional<CachedProgramCommandSequence>, Device::max_num_hw_cqs>>
+            cached_program_command_sequences;
 
     EnqueueProgramCommand(
         uint32_t command_queue_id,
         Device* device,
         NOC noc_index,
         Program& program,
+        CoreCoord& dispatch_core,
         SystemMemoryManager& manager,
         uint32_t expected_num_workers_completed);
 
-    void assemble_preamble_commands(bool prefetch_stall);
-    void assemble_device_commands();
-    void assemble_runtime_args_commands();
+    void assemble_preamble_commands(CachedProgramCommandSequence& cached_program_command_sequence, bool prefetch_stall);
+    void assemble_device_commands(CachedProgramCommandSequence& cached_program_command_sequence, bool is_cached);
+    void assemble_runtime_args_commands(CachedProgramCommandSequence& cached_program_command_sequence);
 
     void process();
 
@@ -490,6 +495,7 @@ class HWCommandQueue {
 
     ~HWCommandQueue();
 
+    CoreCoord physical_enqueue_program_dispatch_core;
     CoreCoord completion_queue_writer_core;
     NOC noc_index;
     volatile bool is_dprint_server_hung();
diff --git a/tt_metal/jit_build/genfiles.cpp b/tt_metal/jit_build/genfiles.cpp
index a67d3a7da520..05e7fa738b4a 100644
--- a/tt_metal/jit_build/genfiles.cpp
+++ b/tt_metal/jit_build/genfiles.cpp
@@ -595,8 +595,7 @@ static string generate_noc_addr_ranges_string(
     const std::vector<CoreCoord>& dram_cores,
     const std::vector<CoreCoord>& ethernet_cores,
     CoreCoord grid_size,
-    const std::vector<uint32_t>& harvested_rows,
-    const CoreCoord& enqueue_program_physical_dispatch_core) {
+    const std::vector<uint32_t>& harvested_rows) {
 
     stringstream ss;
 
@@ -675,9 +674,6 @@ static string generate_noc_addr_ranges_string(
     ss << endl;
     ss << endl;
 
-    ss << "#define DISPATCH_CORE_X " << enqueue_program_physical_dispatch_core.x << endl;
-    ss << "#define DISPATCH_CORE_Y " << enqueue_program_physical_dispatch_core.y << endl;
-
     return ss.str();
 }
 
@@ -691,11 +687,10 @@ void jit_build_genfiles_noc_addr_ranges_header(
     const std::vector<CoreCoord>& dram_cores,
     const std::vector<CoreCoord>& ethernet_cores,
     CoreCoord grid_size,
-    const std::vector<uint32_t>& harvested_rows,
-    const CoreCoord& enqueue_program_physical_dispatch_core) {
+    const std::vector<uint32_t>& harvested_rows) {
 
     string output_string = generate_noc_addr_ranges_string(pcie_addr_base, pcie_addr_size, dram_addr_base, dram_addr_size,
-                                                           pcie_cores, dram_cores, ethernet_cores, grid_size, harvested_rows, enqueue_program_physical_dispatch_core);
+                                                           pcie_cores, dram_cores, ethernet_cores, grid_size, harvested_rows);
 
     ofstream file_stream_br(path + "/brisc/noc_addr_ranges_gen.h");
     file_stream_br << output_string;
diff --git a/tt_metal/jit_build/genfiles.hpp b/tt_metal/jit_build/genfiles.hpp
index 49d5876de2fe..8ede269f2ea1 100644
--- a/tt_metal/jit_build/genfiles.hpp
+++ b/tt_metal/jit_build/genfiles.hpp
@@ -42,8 +42,7 @@ void jit_build_genfiles_noc_addr_ranges_header(
     const std::vector<CoreCoord>& dram_cores,
     const std::vector<CoreCoord>& ethernet_cores,
     CoreCoord grid_size,
-    const std::vector<uint32_t>& harvested_rows,
-    const CoreCoord& enqueue_program_physical_dispatch_core);
+    const std::vector<uint32_t>& harvested_rows);
 
 void jit_build_genfiles_descriptors(const JitBuildEnv& env,
                                     JitBuildOptions& options);