From 0f2a9f8990d0eaf65997267bd38ff9d76b4c2f95 Mon Sep 17 00:00:00 2001 From: Michael Chiou <156848643+ttmchiou@users.noreply.github.com> Date: Thu, 16 May 2024 16:16:08 -0700 Subject: [PATCH 01/40] #7944: add convenience script that installs docker deps and compose (#8566) --- scripts/docker/build_docker_image.sh | 9 ++++ scripts/docker/run_docker_cmd.sh | 25 ++++++++++++ scripts/docker/run_docker_func.sh | 61 ++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+) create mode 100755 scripts/docker/build_docker_image.sh create mode 100755 scripts/docker/run_docker_cmd.sh create mode 100755 scripts/docker/run_docker_func.sh diff --git a/scripts/docker/build_docker_image.sh b/scripts/docker/build_docker_image.sh new file mode 100755 index 000000000000..82df50664e43 --- /dev/null +++ b/scripts/docker/build_docker_image.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +TT_METAL_DOCKER_IMAGE_TAG=${1:-ubuntu-20.04-amd64:latest} + +TT_METAL_HOME=$(git rev-parse --show-toplevel) +( + cd ${TT_METAL_HOME} || exit + docker build -f dockerfile/ubuntu-20.04-x86.Dockerfile -t ${TT_METAL_DOCKER_IMAGE_TAG} . +) \ No newline at end of file diff --git a/scripts/docker/run_docker_cmd.sh b/scripts/docker/run_docker_cmd.sh new file mode 100755 index 000000000000..e62ee4298031 --- /dev/null +++ b/scripts/docker/run_docker_cmd.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +set -e + +if [[ -z "${TT_METAL_DOCKER_IMAGE_TAG}" ]]; then + echo "TT_METAL_DOCKER_IMAGE_TAG is not set or is empty, setting to ubuntu-20.04-amd64:latest" + TT_METAL_DOCKER_IMAGE_TAG="ubuntu-20.04-amd64:latest" +else + echo "TT_METAL_DOCKER_IMAGE_TAG is set to ${TT_METAL_DOCKER_IMAGE_TAG}" +fi + +if [[ -z "${ARCH_NAME}" ]]; then + echo "Must provide ARCH_NAME in environment" 1>&2 + exit 1 +fi + +if [[ $# -eq 0 ]] ; then + echo 'You must provide an argument to run in docker!' 
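+    # Usage sketch (illustrative values only; adjust for your setup):
+    #   export ARCH_NAME=grayskull
+    #   scripts/docker/run_docker_cmd.sh -- make build
+    # Everything before a literal `--` is forwarded to `docker run` as extra
+    # options; everything after it is the command executed inside the
+    # container (the split is done by run_docker_common in run_docker_func.sh).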
+    exit 1
+fi
+TT_METAL_HOME=$(git rev-parse --show-toplevel)
+# Allows this script to be called anywhere in the tt-metal repo
+source $TT_METAL_HOME/scripts/docker/run_docker_func.sh
+
+run_docker_common "$@"
diff --git a/scripts/docker/run_docker_func.sh b/scripts/docker/run_docker_func.sh
new file mode 100755
index 000000000000..5db7f6133e01
--- /dev/null
+++ b/scripts/docker/run_docker_func.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+
+set -e
+
+if [[ -z "${TT_METAL_DOCKER_IMAGE_TAG}" ]]; then
+    echo "TT_METAL_DOCKER_IMAGE_TAG is not set or is empty, setting to ubuntu-20.04-amd64:latest"
+    TT_METAL_DOCKER_IMAGE_TAG="ubuntu-20.04-amd64:latest"
+else
+    echo "TT_METAL_DOCKER_IMAGE_TAG is set to ${TT_METAL_DOCKER_IMAGE_TAG}"
+fi
+
+GID=$(id -g "${USER}")
+
+if [[ -z "${TT_METAL_HOME}" ]]; then
+    TT_METAL_HOME=$(git rev-parse --show-toplevel)
+else
+    echo "TT_METAL_HOME is set to ${TT_METAL_HOME}"
+fi
+
+[ -d ${TT_METAL_HOME}/.pipcache ] || mkdir ${TT_METAL_HOME}/.pipcache
+
+function run_docker_common {
+    # Split the arguments into docker options and command
+    local docker_opts=()
+    local cmd=()
+    local append_cmd=false
+    for arg in "$@"; do
+        if $append_cmd; then
+            cmd+=("$arg")
+        elif [[ $arg == "--" && $append_cmd == false ]]; then
+            append_cmd=true
+        else
+            docker_opts+=("$arg")
+        fi
+    done
+
+    docker run \
+        --rm \
+        -v ${TT_METAL_HOME}:/${TT_METAL_HOME} \
+        -v /home:/home \
+        -v /dev/hugepages-1G:/dev/hugepages-1G \
+        -v /etc/group:/etc/group:ro \
+        -v /etc/passwd:/etc/passwd:ro \
+        -v /etc/shadow:/etc/shadow:ro \
+        -w ${TT_METAL_HOME} \
+        -e TT_METAL_HOME=${TT_METAL_HOME} \
+        -e TT_METAL_ENV=${TT_METAL_ENV} \
+        -e LOGURU_LEVEL=${LOGURU_LEVEL} \
+        -e LD_LIBRARY_PATH=${LD_LIBRARY_PATH} \
+        -e CONFIG=${CONFIG} \
+        -e ARCH_NAME=${ARCH_NAME} \
+        -e PYTHONPATH=${TT_METAL_HOME} \
+        -e XDG_CACHE_HOME=${TT_METAL_HOME}/.pipcache \
+        -e SILENT=${SILENT} \
+        -e VERBOSE=${VERBOSE} \
+        -u ${UID}:${GID} \
+        --net host \
+        "${docker_opts[@]}" \
+        ${TT_METAL_DOCKER_IMAGE_TAG} \
+        "${cmd[@]}"
+}

From 5d0ff822971bc21df5108fbaa66a0d4d386a2a66 Mon Sep 17 00:00:00 2001
From: Umair
Date: Thu, 9 May 2024 18:09:50 +0000
Subject: [PATCH 02/40] #8079: Clean up compile_command_queue_programs()

---
 tt_metal/impl/device/device.cpp | 778 ++++++++++++--------
 tt_metal/impl/device/device.hpp |   2 +
 2 files changed, 288 insertions(+), 492 deletions(-)

diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp
index 5f56184d6a75..b6146f64f5bb 100644
--- a/tt_metal/impl/device/device.cpp
+++ b/tt_metal/impl/device/device.cpp
@@ -334,9 +334,56 @@ void Device::clear_l1_state() {
     // TODO: clear idle eriscs as well
 }

-// TODO (abhullar): Refactor this with #2593 to allow each target fast dispatch (FD) device to program their associated FD cores regardless of whether they are on the target device or not.
-// Currently we have to program FD cores for the remote device when initializing the MMIO device because completion queue cores are on MMIO device
-// and we don't have handle on MMIO device when initializing the remote device
+void Device::configure_kernel_variant(
+    Program& program,
+    string path,
+    std::vector<uint32_t> compile_args,
+    CoreCoord kernel_core,
+    CoreCoord kernel_physical_core,
+    CoreType dispatch_core_type,
+    CoreCoord upstream_physical_core,
+    CoreCoord downstream_physical_core,
+    std::map<string, string> defines_in,
+    bool is_active_eth_core) {
+
+    std::map<string, string> defines = {
+        {"DISPATCH_KERNEL", "1"},
+        {"MY_NOC_X", std::to_string(kernel_physical_core.x)},
+        {"MY_NOC_Y", std::to_string(kernel_physical_core.y)},
+        {"UPSTREAM_NOC_X", std::to_string(upstream_physical_core.x)},
+        {"UPSTREAM_NOC_Y", std::to_string(upstream_physical_core.y)},
+        {"DOWNSTREAM_NOC_X", std::to_string(downstream_physical_core.x)},
+        {"DOWNSTREAM_NOC_Y", std::to_string(downstream_physical_core.y)},
+    };
+    defines.insert(defines_in.begin(), defines_in.end());
+
+    if (dispatch_core_type == CoreType::WORKER) {
+        tt::tt_metal::CreateKernel(
+            program,
+            path,
+            kernel_core,
+            tt::tt_metal::DataMovementConfig {
+                .processor = tt::tt_metal::DataMovementProcessor::RISCV_1,
+                .noc = NOC::NOC_0,
+                .compile_args = compile_args,
+                .defines = defines
+            }
+        );
+    } else {
+        tt::tt_metal::CreateKernel(
+            program,
+            path,
+            kernel_core,
+            tt::tt_metal::EthernetConfig{
+                .eth_mode = is_active_eth_core ? Eth::SENDER : Eth::IDLE,
+                .noc = NOC::NOC_0,
+                .compile_args = compile_args,
+                .defines = defines
+            }
+        );
+    }
+}
+
 void Device::compile_command_queue_programs() {
     ZoneScoped;
     unique_ptr<Program> command_queue_program_ptr(new Program);
@@ -348,16 +395,16 @@ void Device::compile_command_queue_programs() {
     // TODO: These are semaphore IDs, remove these when CreateSemaphore returns ID rather than address
     constexpr uint32_t prefetch_sync_sem = 0;
     constexpr uint32_t prefetch_downstream_cb_sem = 1;
-    constexpr uint32_t dispatch_sync_sem = 0;
-    constexpr uint32_t dispatch_cb_sem = 1;
+    constexpr uint32_t prefetch_sem = 1;
+    constexpr uint32_t dispatch_sem = 0;
+    constexpr uint32_t mux_sem = 0;
+    constexpr uint32_t demux_sem = 0;
     constexpr uint32_t prefetch_d_sync_sem = 0;
     constexpr uint32_t prefetch_d_upstream_cb_sem = 1;
     constexpr uint32_t prefetch_d_downstream_cb_sem = 2;
     constexpr uint32_t prefetch_h_exec_buf_sem = 2;
-    constexpr uint32_t mux_upstream_cb_sem = 1;
-    constexpr uint32_t demux_downstream_cb_sem = 1;
-    constexpr uint32_t dispatch_downstream_cb_sem = 2;
+    constexpr uint32_t dispatch_downstream_cb_sem = 1;

     if (this->is_mmio_capable()) {
         auto device_id = this->id();

        for (uint8_t cq_id = 0; cq_id < num_hw_cqs; cq_id++) {
            CoreType dispatch_core_type = dispatch_core_manager::get(num_hw_cqs).get_dispatch_core_type(device_id);

-           //add apis for dispatch_h/d prefetch_h
-           tt_cxy_pair prefetch_location = dispatch_core_manager::get(num_hw_cqs).prefetcher_core(device_id, channel, cq_id);
-           tt_cxy_pair completion_q_writer_location = dispatch_core_manager::get(num_hw_cqs).completion_queue_writer_core(device_id, channel, cq_id);
-           tt_cxy_pair dispatch_location = dispatch_core_manager::get(num_hw_cqs).dispatcher_core(device_id, channel, cq_id);
+           tt_cxy_pair prefetch_core = dispatch_core_manager::get(num_hw_cqs).prefetcher_core(device_id, channel, cq_id);
+           tt_cxy_pair dispatch_core = dispatch_core_manager::get(num_hw_cqs).dispatcher_core(device_id, channel, cq_id);

-
TT_ASSERT(prefetch_location.chip == this->id() and completion_q_writer_location.chip == this->id(), - "Issue queue interface is on device {} and completion queue interface is on device {} but they are expected to be on device {}", prefetch_location.chip, completion_q_writer_location.chip, this->id()); - - CoreCoord prefetch_physical_core = get_physical_core_coordinate(prefetch_location, dispatch_core_type); - CoreCoord completion_q_physical_core = get_physical_core_coordinate(completion_q_writer_location, dispatch_core_type); - CoreCoord dispatch_physical_core = get_physical_core_coordinate(dispatch_location, dispatch_core_type); + CoreCoord prefetch_physical_core = get_physical_core_coordinate(prefetch_core, dispatch_core_type); + CoreCoord dispatch_physical_core = get_physical_core_coordinate(dispatch_core, dispatch_core_type); log_debug(LogDevice, "Dispatching out of {} cores", magic_enum::enum_name(dispatch_core_type)); - log_debug(LogDevice, "Prefetch HD logical location: {} physical core: {}", prefetch_location.str(), prefetch_physical_core.str()); - log_debug(LogDevice, "Dispatch HD logical location: {} physical core {}", dispatch_location.str(), dispatch_physical_core.str()); + log_debug(LogDevice, "Prefetch HD logical location: {} physical core: {}", prefetch_core.str(), prefetch_physical_core.str()); + log_debug(LogDevice, "Dispatch HD logical location: {} physical core {}", dispatch_core.str(), dispatch_physical_core.str()); uint32_t command_queue_start_addr = get_absolute_cq_offset(channel, cq_id, cq_size); uint32_t issue_queue_start_addr = command_queue_start_addr + CQ_START; @@ -389,23 +430,12 @@ void Device::compile_command_queue_programs() { uint32_t completion_queue_start_addr = issue_queue_start_addr + issue_queue_size; uint32_t completion_queue_size = this->sysmem_manager_->get_completion_queue_size(cq_id); - - std::map prefetch_defines = { - {"DISPATCH_KERNEL", "1"}, - {"MY_NOC_X", std::to_string(prefetch_physical_core.x)}, - {"MY_NOC_Y", std::to_string(prefetch_physical_core.y)}, - {"UPSTREAM_NOC_X", std::to_string(0)}, - {"UPSTREAM_NOC_Y", std::to_string(0)}, - {"DOWNSTREAM_NOC_X", std::to_string(dispatch_physical_core.x)}, - {"DOWNSTREAM_NOC_Y", std::to_string(dispatch_physical_core.y)}, - }; - std::vector prefetch_compile_args = { dispatch_constants::DISPATCH_BUFFER_BASE, dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE, dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(), - prefetch_downstream_cb_sem, - dispatch_cb_sem, + prefetch_sem, + dispatch_sem, issue_queue_start_addr, issue_queue_size, dispatch_constants::PREFETCH_Q_BASE, @@ -417,8 +447,8 @@ void Device::compile_command_queue_programs() { dispatch_constants::get(dispatch_core_type).scratch_db_size(), prefetch_sync_sem, dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_pages(), // prefetch_d only - prefetch_d_upstream_cb_sem, // prefetch_d only - prefetch_downstream_cb_sem, // prefetch_d only + 0, //prefetch_d_upstream_cb_sem, // prefetch_d only + 0, //prefetch_downstream_cb_sem, // prefetch_d only dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE, dispatch_constants::PREFETCH_D_BUFFER_BLOCKS, // prefetch_d only prefetch_h_exec_buf_sem, @@ -426,43 +456,28 @@ void Device::compile_command_queue_programs() { true // is_host_variant }; - if (dispatch_core_type == CoreType::WORKER) { - tt::tt_metal::CreateKernel( - *command_queue_program_ptr, prefetch_kernel_path, prefetch_location, - DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_1, - .noc = NOC::NOC_0, - 
.compile_args = prefetch_compile_args, - .defines = prefetch_defines}); - } else { - tt::tt_metal::CreateKernel( - *command_queue_program_ptr, prefetch_kernel_path, prefetch_location, - EthernetConfig{ - .eth_mode = Eth::IDLE, - .noc = NOC::NOC_0, - .compile_args = prefetch_compile_args, - .defines = prefetch_defines}); - } + configure_kernel_variant( + *command_queue_program_ptr, + "tt_metal/impl/dispatch/kernels/cq_prefetch.cpp", + prefetch_compile_args, + prefetch_core, + prefetch_physical_core, + dispatch_core_type, + CoreCoord{0, 0}, + dispatch_physical_core, + std::map {} + ); + + tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, prefetch_core, 0, dispatch_core_type); // prefetch_sync_sem + tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, prefetch_core, dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(), dispatch_core_type); // prefetch_sem + tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, prefetch_core, 0, dispatch_core_type); // prefetch_h_exec_buf_sem - tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, prefetch_location, 0, dispatch_core_type); - tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, prefetch_location, dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(), dispatch_core_type); - tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, prefetch_location, 0, dispatch_core_type); - - std::map dispatch_defines = { - {"DISPATCH_KERNEL", "1"}, - {"MY_NOC_X", std::to_string(dispatch_physical_core.x)}, - {"MY_NOC_Y", std::to_string(dispatch_physical_core.y)}, - {"UPSTREAM_NOC_X", std::to_string(prefetch_physical_core.x)}, - {"UPSTREAM_NOC_Y", std::to_string(prefetch_physical_core.y)}, - {"DOWNSTREAM_NOC_X", std::to_string(0)}, - {"DOWNSTREAM_NOC_Y", std::to_string(0)}, - }; std::vector dispatch_compile_args = { dispatch_constants::DISPATCH_BUFFER_BASE, dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE, dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(), - dispatch_cb_sem, - prefetch_downstream_cb_sem, + dispatch_sem, + prefetch_sem, dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS, prefetch_sync_sem, command_queue_start_addr, @@ -470,34 +485,26 @@ void Device::compile_command_queue_programs() { completion_queue_size, dispatch_constants::DISPATCH_BUFFER_BASE, (1 << dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE) * dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(), - 0, // unused on hd, filled in below for h and d - 0, // unused on hd, filled in below for h and d - 0, // unused unless tunneler is between h and d + 0, // unused + 0, // unused + 0, // unused true, // is_dram_variant true // is_host_variant }; - if (dispatch_core_type == CoreType::WORKER) { - tt::tt_metal::CreateKernel( - *command_queue_program_ptr, dispatch_kernel_path, dispatch_location, - DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_1, - .noc = NOC::NOC_0, - .compile_args = dispatch_compile_args, - .defines = dispatch_defines}); - } else { - tt::tt_metal::CreateKernel( - *command_queue_program_ptr, dispatch_kernel_path, dispatch_location, - EthernetConfig{ - .eth_mode = Eth::IDLE, - .noc = NOC::NOC_0, - .compile_args = dispatch_compile_args, - .defines = dispatch_defines}); - } + configure_kernel_variant( + *command_queue_program_ptr, + "tt_metal/impl/dispatch/kernels/cq_dispatch.cpp", + dispatch_compile_args, + dispatch_core, + dispatch_physical_core, + dispatch_core_type, + prefetch_physical_core, + CoreCoord{0, 0}, + std::map {} + ); - 
tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, dispatch_location, 0, dispatch_core_type); - tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, dispatch_location, 0, dispatch_core_type); - tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, dispatch_location, dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(), dispatch_core_type); + tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, dispatch_core, 0, dispatch_core_type); // dispatch_sem } detail::CompileProgram(this, *command_queue_program_ptr); this->command_queue_programs.push_back(std::move(command_queue_program_ptr)); @@ -513,19 +520,19 @@ void Device::compile_command_queue_programs() { CoreType dispatch_core_type = dispatch_core_manager::get(num_hw_cqs).get_dispatch_core_type(mmio_device_id); - tt_cxy_pair prefetch_location = dispatch_core_manager::get(num_hw_cqs).prefetcher_core(device_id, channel, cq_id); - tt_cxy_pair dispatch_location = dispatch_core_manager::get(num_hw_cqs).dispatcher_core(device_id, channel, cq_id); + tt_cxy_pair prefetch_core = dispatch_core_manager::get(num_hw_cqs).prefetcher_core(device_id, channel, cq_id); + tt_cxy_pair dispatch_core = dispatch_core_manager::get(num_hw_cqs).dispatcher_core(device_id, channel, cq_id); bool dispatch_on_eth = dispatch_core_type == CoreType::ETH; - TT_ASSERT(prefetch_location.chip == mmio_device_id and dispatch_location.chip == mmio_device_id, - "Prefetcher is on device {} and Dispatcher is on device {} but they are expected to be on device {}", prefetch_location.chip, dispatch_location.chip, mmio_device_id); + TT_ASSERT(prefetch_core.chip == mmio_device_id and dispatch_core.chip == mmio_device_id, + "Prefetcher is on device {} and Dispatcher is on device {} but they are expected to be on device {}", prefetch_core.chip, dispatch_core.chip, mmio_device_id); - CoreCoord prefetch_physical_core = get_physical_core_coordinate(prefetch_location, dispatch_core_type); - CoreCoord dispatch_physical_core = get_physical_core_coordinate(dispatch_location, dispatch_core_type); + CoreCoord prefetch_physical_core = get_physical_core_coordinate(prefetch_core, dispatch_core_type); + CoreCoord dispatch_physical_core = get_physical_core_coordinate(dispatch_core, dispatch_core_type); log_debug(LogDevice, "Dispatching out of {} cores", magic_enum::enum_name(dispatch_core_type)); - log_debug(LogDevice, "Prefetch H logical location: {} physical core: {}", prefetch_location.str(), prefetch_physical_core.str()); - log_debug(LogDevice, "Dispatch H logical location: {} physical core {}", dispatch_location.str(), dispatch_physical_core.str()); + log_debug(LogDevice, "Prefetch H logical location: {} physical core: {}", prefetch_core.str(), prefetch_physical_core.str()); + log_debug(LogDevice, "Dispatch H logical location: {} physical core {}", dispatch_core.str(), dispatch_physical_core.str()); uint32_t command_queue_start_addr = get_absolute_cq_offset(channel, cq_id, cq_size); uint32_t issue_queue_start_addr = command_queue_start_addr + CQ_START; @@ -533,18 +540,18 @@ void Device::compile_command_queue_programs() { uint32_t completion_queue_start_addr = issue_queue_start_addr + issue_queue_size; uint32_t completion_queue_size = mmio_device->sysmem_manager_->get_completion_queue_size(cq_id); - tt_cxy_pair mux_location = dispatch_core_manager::get(num_hw_cqs).mux_core(device_id, channel, cq_id); - tt_cxy_pair demux_location = dispatch_core_manager::get(num_hw_cqs).demux_core(device_id, channel, cq_id); + tt_cxy_pair mux_core = 
dispatch_core_manager::get(num_hw_cqs).mux_core(device_id, channel, cq_id); + tt_cxy_pair demux_core = dispatch_core_manager::get(num_hw_cqs).demux_core(device_id, channel, cq_id); tt_cxy_pair tunneler_location = dispatch_core_manager::get(num_hw_cqs).tunneler_core(device_id, channel, cq_id); - CoreCoord tunneler_logical_core = CoreCoord(tunneler_location.x, tunneler_location.y); + CoreCoord tunneler_core = CoreCoord(tunneler_location.x, tunneler_location.y); TT_ASSERT(tunneler_location.chip == mmio_device_id, "Tunneler is on device {} but it is expected to be on device {}", tunneler_location.chip, mmio_device_id); - CoreCoord r_tunneler_logical_core = std::get<1>(tt::Cluster::instance().get_connected_ethernet_core(std::make_tuple(tunneler_location.chip, tunneler_logical_core))); - CoreCoord r_tunneler_physical_core = this->ethernet_core_from_logical_core(r_tunneler_logical_core); + CoreCoord r_tunneler_core = std::get<1>(tt::Cluster::instance().get_connected_ethernet_core(std::make_tuple(tunneler_location.chip, tunneler_core))); + CoreCoord r_tunneler_physical_core = this->ethernet_core_from_logical_core(r_tunneler_core); CoreCoord tunneler_physical_core = mmio_device->ethernet_core_from_logical_core(tunneler_location); - CoreCoord mux_physical_core = get_physical_core_coordinate(mux_location, dispatch_core_type); - CoreCoord demux_physical_core = get_physical_core_coordinate(demux_location, dispatch_core_type); + CoreCoord mux_physical_core = get_physical_core_coordinate(mux_core, dispatch_core_type); + CoreCoord demux_physical_core = get_physical_core_coordinate(demux_core, dispatch_core_type); uint32_t tunneler_queue_start_addr = 0x19000; uint32_t tunneler_queue_size_bytes = 0x10000; @@ -553,44 +560,27 @@ void Device::compile_command_queue_programs() { constexpr uint32_t packetized_path_test_results_addr = BRISC_L1_RESULT_BASE; constexpr uint32_t packetized_path_test_results_size = 1024; - // Packetized path buffer, can be at any available address. 
- constexpr uint32_t relay_demux_queue_start_addr = L1_UNRESERVED_BASE; - constexpr uint32_t relay_demux_queue_size_bytes = 0x10000; constexpr uint32_t src_endpoint_start_id = 0xaa; constexpr uint32_t dest_endpoint_start_id = 0xbb; - tt::tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, prefetch_location, 0, dispatch_core_type); // prefetch_sync_sem - tt::tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, prefetch_location, dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_pages(), dispatch_core_type); // prefetch_downstream_cb_sem - tt::tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, prefetch_location, 0, dispatch_core_type); - - tt::tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, mux_location, 0, dispatch_core_type); // unused mux semaphore - tt::tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, mux_location, 0, dispatch_core_type); // mux_upstream_cb_sem - - tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, demux_location, 0, dispatch_core_type); // unused - tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, demux_location, 0, dispatch_core_type); // unused - // for the unpacketize stage, we use rptr/wptr for flow control, and poll semaphore - // value only to update the rptr: - tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, demux_location, 0, dispatch_core_type); - - constexpr uint32_t dispatch_h_cb_sem = 0; - tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, dispatch_location, 0, dispatch_core_type); - - std::map prefetch_defines = { - {"DISPATCH_KERNEL", "1"}, - {"MY_NOC_X", std::to_string(prefetch_physical_core.x)}, - {"MY_NOC_Y", std::to_string(prefetch_physical_core.y)}, - {"UPSTREAM_NOC_X", std::to_string(0)}, - {"UPSTREAM_NOC_Y", std::to_string(0)}, - {"DOWNSTREAM_NOC_X", std::to_string(mux_physical_core.x)}, - {"DOWNSTREAM_NOC_Y", std::to_string(mux_physical_core.y)}, - }; + tt::tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, prefetch_core, 0, dispatch_core_type); // prefetch_sync_sem + tt::tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, prefetch_core, dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_pages(), dispatch_core_type); // prefetch_sem + tt::tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, prefetch_core, 0, dispatch_core_type); // prefetch_h_exec_buf_sem + + tt::tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, mux_core, 0, dispatch_core_type); // mux_sem + + tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, demux_core, 0, dispatch_core_type); //demux_sem + + constexpr uint32_t dispatch_h_cb_sem = 0; // remove it. 
+ constexpr uint32_t dispatch_sem = 0; + tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, dispatch_core, 0, dispatch_core_type); // dispatch_sem std::vector prefetch_compile_args = { dispatch_constants::DISPATCH_BUFFER_BASE, dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE, dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_pages(), - prefetch_downstream_cb_sem, - mux_upstream_cb_sem, + prefetch_sem, + mux_sem, issue_queue_start_addr, issue_queue_size, dispatch_constants::PREFETCH_Q_BASE, @@ -602,46 +592,34 @@ void Device::compile_command_queue_programs() { dispatch_constants::get(dispatch_core_type).scratch_db_size(), // unused for prefetch_h prefetch_sync_sem, // unused for prefetch_h dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_pages(), // prefetch_d only - prefetch_d_upstream_cb_sem, // prefetch_d only - prefetch_downstream_cb_sem, // prefetch_d only + 0, // prefetch_d only + 0, //prefetch_downstream_cb_sem, // prefetch_d only dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE, dispatch_constants::PREFETCH_D_BUFFER_BLOCKS, // prefetch_d only prefetch_h_exec_buf_sem, - false, // is_dram_variant + false, // is_dram_variant true // is_host_variant }; - if (dispatch_on_eth) { - tt::tt_metal::CreateKernel( - *mmio_command_queue_program_ptr, - "tt_metal/impl/dispatch/kernels/cq_prefetch.cpp", - prefetch_location, - EthernetConfig{ - .eth_mode = Eth::IDLE, - .noc = NOC::NOC_0, - .compile_args = prefetch_compile_args, - .defines = prefetch_defines}); - } else { - tt::tt_metal::CreateKernel( + configure_kernel_variant( *mmio_command_queue_program_ptr, - "tt_metal/impl/dispatch/kernels/cq_prefetch.cpp", // update this for remote device - prefetch_location, - tt::tt_metal::DataMovementConfig { - .processor = tt::tt_metal::DataMovementProcessor::RISCV_1, - .noc = tt::tt_metal::NOC::RISCV_0_default, - .compile_args = prefetch_compile_args, - .defines = prefetch_defines}); - } - log_debug(LogDevice, "run prefetch_h {}", prefetch_location.str()); + "tt_metal/impl/dispatch/kernels/cq_prefetch.cpp", + prefetch_compile_args, + prefetch_core, + prefetch_physical_core, + dispatch_core_type, + CoreCoord{0, 0}, + mux_physical_core, + std::map {} + ); + + log_debug(LogDevice, "run prefetch_h {}", prefetch_core.str()); - uint32_t relay_mux_queue_start_addr = dispatch_constants::DISPATCH_BUFFER_BASE; - uint32_t relay_mux_queue_size_bytes = dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_size(); - uint32_t timeout_mcycles = 0; std::vector mux_compile_args = { 0, // 0: reserved - (relay_mux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words - (relay_mux_queue_size_bytes >> 4), // 2: rx_queue_size_words + (dispatch_constants::DISPATCH_BUFFER_BASE >> 4), // 1: rx_queue_start_addr_words + (dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_size() >> 4), // 2: rx_queue_size_words 1, // 3: mux_fan_in packet_switch_4B_pack((uint32_t)prefetch_physical_core.x, (uint32_t)prefetch_physical_core.y, @@ -667,14 +645,14 @@ void Device::compile_command_queue_programs() { (uint32_t)DispatchRemoteNetworkType::NOC0, // 13: tx_network_type packetized_path_test_results_addr, // 14: test_results_addr packetized_path_test_results_size, // 15: test_results_size - timeout_mcycles * 1000 * 1000, // 16: timeout_cycles + 0, // 16: timeout_cycles 0x0,// 17: output_depacketize 0x0,// 18: output_depacketize info // 19: input 0 packetize info: packet_switch_4B_pack(0x1, dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE, - mux_upstream_cb_sem, // local sem - 
prefetch_downstream_cb_sem), // upstream sem + prefetch_sem, // upstream sem + mux_sem), // local sem packet_switch_4B_pack(0, 0, 0, 0), // 20: input 1 packetize info packet_switch_4B_pack(0, 0, 0, 0), // 21: input 2 packetize info packet_switch_4B_pack(0, 0, 0, 0), // 22: input 3 packetize info @@ -682,32 +660,19 @@ void Device::compile_command_queue_programs() { packet_switch_4B_pack(dest_endpoint_start_id, 0, 0, 0), // 24: packetized input dest id }; - log_debug(LogDevice, "run mux at {}", mux_location.str()); - if (dispatch_on_eth) { - tt::tt_metal::CreateKernel( - *mmio_command_queue_program_ptr, - "tt_metal/impl/dispatch/kernels/packet_mux.cpp", - mux_location, - EthernetConfig{ - .eth_mode = Eth::IDLE, - .noc = NOC::NOC_0, - .compile_args = mux_compile_args, - .defines = {{"SKIP_NOC_LOGGING", "1"}} - } - ); - } else { - tt_metal::CreateKernel( + log_debug(LogDevice, "run mux at {}", mux_core.str()); + + configure_kernel_variant( *mmio_command_queue_program_ptr, "tt_metal/impl/dispatch/kernels/packet_mux.cpp", - mux_location, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = mux_compile_args, - .defines = {{"SKIP_NOC_LOGGING", "1"}} - } + mux_compile_args, + mux_core, + CoreCoord{0, 0}, + dispatch_core_type, + CoreCoord{0, 0}, + CoreCoord{0, 0}, + std::map {{"SKIP_NOC_LOGGING", "1"}} ); - } std::vector tunneler_l_compile_args = { @@ -725,33 +690,34 @@ void Device::compile_command_queue_programs() { (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: remote_receiver_1_info tunneler_queue_start_addr >> 4, // 6: remote_receiver_queue_start_addr_words 0 tunneler_queue_size_bytes >> 4, // 7: remote_receiver_queue_size_words 0 - (relay_demux_queue_start_addr >> 4), // 8: remote_receiver_queue_start_addr_words 1 - (relay_demux_queue_size_bytes >> 4), // 9: remote_receiver_queue_size_words 1 + (L1_UNRESERVED_BASE >> 4), // 8: remote_receiver_queue_start_addr_words 1 + (0x10000 >> 4), // 9: remote_receiver_queue_size_words 1 packet_switch_4B_pack(mux_physical_core.x, mux_physical_core.y, - 1,//num_dest_endpoints, + 1, // mux output queue id (uint32_t)DispatchRemoteNetworkType::NOC0), // 10: remote_sender_0_info packet_switch_4B_pack(r_tunneler_physical_core.x, r_tunneler_physical_core.y, - 3, + 3, // r tunneler output queue id (uint32_t)DispatchRemoteNetworkType::ETH), // 11: remote_sender_1_info tunneler_test_results_addr, // 12: test_results_addr tunneler_test_results_size, // 13: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 14: timeout_cycles + 0, // 14: timeout_cycles }; - tt_metal::CreateKernel( + configure_kernel_variant( *mmio_command_queue_program_ptr, "tt_metal/impl/dispatch/kernels/eth_tunneler.cpp", - tunneler_logical_core, - tt_metal::EthernetConfig{ - .noc = tt_metal::NOC::NOC_0, - .compile_args = tunneler_l_compile_args, - // Skip noc logging for tunneling cores, since stopping the print server can hang - // the chip in this case. 
- .defines = {{"SKIP_NOC_LOGGING", "1"}} - } + tunneler_l_compile_args, + tunneler_core, + CoreCoord{0, 0}, + CoreType::ETH, + CoreCoord{0, 0}, + CoreCoord{0, 0}, + std::map {{"SKIP_NOC_LOGGING", "1"}}, + true ); + log_debug(LogDevice, "run tunneler at {}", tunneler_location.str()); uint32_t dest_map_array[4] = {0, 1, 2, 3}; @@ -759,8 +725,8 @@ void Device::compile_command_queue_programs() { std::vector demux_compile_args = { dest_endpoint_start_id, // 0: endpoint_id_start_index - (relay_demux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words - (relay_demux_queue_size_bytes >> 4), // 2: rx_queue_size_words + (L1_UNRESERVED_BASE >> 4), // 1: rx_queue_start_addr_words + (0x10000 >> 4), // 2: rx_queue_size_words 1, // 3: demux_fan_out packet_switch_4B_pack(dispatch_physical_core.x, dispatch_physical_core.y, @@ -786,9 +752,6 @@ void Device::compile_command_queue_programs() { 0, // 13: remote_tx_queue_size_words 2 0, // 14: remote_tx_queue_start_addr_words 3 0, // 15: remote_tx_queue_size_words 3 - //(uint32_t)phys_dispatch_relay_mux_core.x, // 16: remote_rx_x - //(uint32_t)phys_dispatch_relay_mux_core.y, // 17: remote_rx_y - //num_dest_endpoints, // 18: remote_rx_queue_id (uint32_t)tunneler_physical_core.x, // 16: remote_rx_x (uint32_t)tunneler_physical_core.y, // 17: remote_rx_y 3, // 18: remote_rx_queue_id @@ -797,52 +760,39 @@ void Device::compile_command_queue_programs() { (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF), // 21: dest_endpoint_output_map_lo packetized_path_test_results_addr, // 22: test_results_addr packetized_path_test_results_size, // 23: test_results_size - timeout_mcycles * 1000 * 1000, // 24: timeout_cycles + 0, // 24: timeout_cycles 0x1, // 25: output_depacketize_mask // 26: output 0 packetize info: packet_switch_4B_pack(dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE, - dispatch_h_cb_sem, // downstream sem - dispatch_downstream_cb_sem, // local sem + dispatch_sem, // downstream sem + demux_sem, // local sem 1), // remove header packet_switch_4B_pack(0, 0, 0, 0), // 27: output 1 packetize info packet_switch_4B_pack(0, 0, 0, 0), // 28: output 2 packetize info packet_switch_4B_pack(0, 0, 0, 0), // 29: output 3 packetize info }; - log_debug(LogDevice, "run dispatch demux at {}", demux_location.str()); - - if (dispatch_on_eth) { - tt::tt_metal::CreateKernel( - *mmio_command_queue_program_ptr, - "tt_metal/impl/dispatch/kernels/packet_demux.cpp", - demux_location, - EthernetConfig{ - .eth_mode = Eth::IDLE, - .noc = NOC::NOC_0, - .compile_args = demux_compile_args, - .defines = {{"SKIP_NOC_LOGGING", "1"}} - } - ); - } else { - tt_metal::CreateKernel( + + configure_kernel_variant( *mmio_command_queue_program_ptr, "tt_metal/impl/dispatch/kernels/packet_demux.cpp", - {demux_location}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = demux_compile_args, - .defines = {{"SKIP_NOC_LOGGING", "1"}} - } + demux_compile_args, + demux_core, + CoreCoord{0, 0}, + dispatch_core_type, + CoreCoord{0, 0}, + CoreCoord{0, 0}, + std::map {{"SKIP_NOC_LOGGING", "1"}} ); - } + + log_debug(LogDevice, "run dispatch demux at {}", demux_core.str()); std::vector dispatch_compile_args = { dispatch_constants::DISPATCH_BUFFER_BASE, dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE, dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(), - dispatch_h_cb_sem, // overridden below for h - prefetch_d_downstream_cb_sem, + dispatch_sem, + demux_sem, 
dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS, prefetch_sync_sem, command_queue_start_addr, @@ -850,53 +800,28 @@ void Device::compile_command_queue_programs() { completion_queue_size, dispatch_constants::DISPATCH_BUFFER_BASE, (1 << dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE) * dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(), - dispatch_h_cb_sem, // unused on hd, filled in below for h and d - dispatch_downstream_cb_sem, // unused on hd, filled in below for h and d + 0, // unused: local ds semaphore + 0, // unused: remote ds semaphore 0, // preamble size. unused unless tunneler is between h and d false, // is_dram_variant true // is_host_variant }; - std::map dispatch_defines = { - {"DISPATCH_KERNEL", "1"}, - {"MY_NOC_X", std::to_string(dispatch_physical_core.x)}, - {"MY_NOC_Y", std::to_string(dispatch_physical_core.y)}, - {"UPSTREAM_NOC_X", std::to_string(demux_physical_core.x)}, - {"UPSTREAM_NOC_Y", std::to_string(demux_physical_core.y)}, - {"DOWNSTREAM_NOC_X", std::to_string(0xffffffff)}, - {"DOWNSTREAM_NOC_Y", std::to_string(0xffffffff)}, - }; - - log_debug(LogDevice, "run dispatch_h at {}", dispatch_location.str()); - - if (dispatch_on_eth) { - tt::tt_metal::CreateKernel( - *mmio_command_queue_program_ptr, - "tt_metal/impl/dispatch/kernels/cq_dispatch.cpp", - dispatch_location, - EthernetConfig{ - .eth_mode = Eth::IDLE, - .noc = NOC::NOC_0, - .compile_args = dispatch_compile_args, - .defines = dispatch_defines - } - ); - } else { - tt::tt_metal::CreateKernel( + configure_kernel_variant( *mmio_command_queue_program_ptr, "tt_metal/impl/dispatch/kernels/cq_dispatch.cpp", - dispatch_location, - tt::tt_metal::DataMovementConfig { - .processor = tt::tt_metal::DataMovementProcessor::RISCV_1, - .noc = tt::tt_metal::NOC::RISCV_0_default, - .compile_args = dispatch_compile_args, - .defines = dispatch_defines}); - } + dispatch_compile_args, + dispatch_core, + dispatch_physical_core, + dispatch_core_type, + demux_physical_core, + CoreCoord{0xffffffff, 0xffffffff}, + std::map {} + ); + + log_debug(LogDevice, "run dispatch_h at {}", dispatch_core.str()); /////////////////Following section is for Remote Device - //auto device_id = this->id(); - //uint8_t num_hw_cqs = 1; - //uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); dispatch_core_type = dispatch_core_manager::get(num_hw_cqs).get_dispatch_core_type(device_id); dispatch_on_eth = dispatch_core_type == CoreType::ETH; @@ -907,63 +832,27 @@ void Device::compile_command_queue_programs() { constexpr uint32_t demux_queue_start_addr = L1_UNRESERVED_BASE; constexpr uint32_t demux_queue_size_bytes = 0x10000; - //uint32_t tunneler_queue_start_addr = 0x19000; - //uint32_t tunneler_queue_size_bytes = 0x10000; - //uint32_t tunneler_test_results_addr = 0x39000; - //uint32_t tunneler_test_results_size = 0x7000; - //constexpr uint32_t packetized_path_test_results_addr = BRISC_L1_RESULT_BASE; - //constexpr uint32_t packetized_path_test_results_size = 1024; - - // For tests with checkers enabled, packetized path may time out and - // cause the test to fail. - // To save inner loop cycles, presently the packetized components have - // a 32-bit timeout cycle counter so 4K cycles is the maximum timeout. - // Setting this to 0 disables the timeout. - //uint32_t timeout_mcycles = 0; - - // These could start from 0, but we assign values that are easy to - // identify for debug. 
- //constexpr uint32_t src_endpoint_start_id = 0xaa; - //constexpr uint32_t dest_endpoint_start_id = 0xbb; - - //uint32_t cq_id = num_hw_cqs - 1; - //tt_cxy_pair tunneler_location = dispatch_core_manager::get(num_hw_cqs).tunneler_core(device_id, channel, cq_id); - //CoreCoord tunneler_logical_core = CoreCoord(tunneler_location.x, tunneler_location.y); - //CoreCoord tunneler_physical_core = tt::Cluster::instance().ethernet_core_from_logical_core(tunneler_location.chip, tunneler_logical_core); + tt_cxy_pair mux_d_core = dispatch_core_manager::get(num_hw_cqs).mux_d_core(device_id, channel, cq_id); + CoreCoord mux_d_physical_core = get_physical_core_coordinate(mux_d_core, dispatch_core_type); + tt_cxy_pair demux_d_core = dispatch_core_manager::get(num_hw_cqs).demux_d_core(device_id, channel, cq_id); + CoreCoord demux_d_physical_core = get_physical_core_coordinate(demux_d_core, dispatch_core_type); - //std::tuple connected_eth_core = tt::Cluster::instance().get_connected_ethernet_core(std::make_tuple(tunneler_location.chip, tunneler_logical_core)); + tt_cxy_pair prefetch_d_core = dispatch_core_manager::get(num_hw_cqs).prefetcher_d_core(device_id, channel, cq_id); + CoreCoord prefetch_d_physical_core = get_physical_core_coordinate(prefetch_d_core, dispatch_core_type); - //CoreCoord r_tunneler_logical_core = std::get<1>(connected_eth_core); - //CoreCoord r_tunneler_physical_core = this->ethernet_core_from_logical_core(r_tunneler_logical_core); + dispatch_core = dispatch_core_manager::get(num_hw_cqs).dispatcher_d_core(device_id, channel, cq_id); + dispatch_physical_core = get_physical_core_coordinate(dispatch_core, dispatch_core_type); - tt_cxy_pair mux_d_location = dispatch_core_manager::get(num_hw_cqs).mux_d_core(device_id, channel, cq_id); - CoreCoord mux_d_physical_core = get_physical_core_coordinate(mux_d_location, dispatch_core_type); - tt_cxy_pair demux_d_location = dispatch_core_manager::get(num_hw_cqs).demux_d_core(device_id, channel, cq_id); - CoreCoord demux_d_physical_core = get_physical_core_coordinate(demux_d_location, dispatch_core_type); + tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, prefetch_d_core, 0, dispatch_core_type); // prefetch_d_sync_sem + tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, prefetch_d_core, 0, dispatch_core_type); // prefetch_d_upstream_cb_sem + tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, prefetch_d_core, dispatch_buffer_pages, dispatch_core_type); // prefetch_d_downstream_cb_sem - tt_cxy_pair prefetch_d_location = dispatch_core_manager::get(num_hw_cqs).prefetcher_d_core(device_id, channel, cq_id); - CoreCoord prefetch_d_physical_core = get_physical_core_coordinate(prefetch_d_location, dispatch_core_type); + tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, {demux_d_core}, 0, dispatch_core_type); // demux_sem - //tt_cxy_pair dispatch_location = dispatch_core_manager::get(num_hw_cqs).dispatcher_d_core(device_id, channel, cq_id); - //CoreCoord dispatch_physical_core = get_physical_core_coordinate(dispatch_location, dispatch_core_type); - dispatch_location = dispatch_core_manager::get(num_hw_cqs).dispatcher_d_core(device_id, channel, cq_id); - dispatch_physical_core = get_physical_core_coordinate(dispatch_location, dispatch_core_type); + tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, dispatch_core, 0, dispatch_core_type); // dispatch_sem + tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, dispatch_core, dispatch_buffer_pages, dispatch_core_type); // dispatch_downstream_cb_sem - 
tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, prefetch_d_location, 0, dispatch_core_type); // prefetch_d_sync_sem - tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, prefetch_d_location, 0, dispatch_core_type); // prefetch_d_upstream_cb_sem - tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, prefetch_d_location, dispatch_buffer_pages, dispatch_core_type); // prefetch_d_downstream_cb_sem - - tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, {demux_d_location}, 0, dispatch_core_type); // unused demux semaphore - tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, {demux_d_location}, 0, dispatch_core_type); // demux_downstream_cb_sem - - tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, dispatch_location, 0, dispatch_core_type); // dispatch_sync_sem - tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, dispatch_location, 0, dispatch_core_type); // dispatch_cb_sem - tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, dispatch_location, dispatch_buffer_pages, dispatch_core_type); // dispatch_downstream_cb_sem - - //constexpr uint32_t dispatch_h_cb_sem = 0; - tt_metal::CreateSemaphore(*command_queue_program_ptr, mux_d_location, 0, dispatch_core_type); - - uint32_t prefetch_d_buffer_base = dispatch_constants::DISPATCH_BUFFER_BASE; + tt_metal::CreateSemaphore(*command_queue_program_ptr, mux_d_core, 0, dispatch_core_type); // mux_sem std::vector tunneler_r_compile_args = { @@ -979,8 +868,8 @@ void Device::compile_command_queue_programs() { tunneler_physical_core.y, 1, (uint32_t)DispatchRemoteNetworkType::ETH), // 5: remote_receiver_1_info - (demux_queue_start_addr >> 4), // 6: remote_receiver_queue_start_addr_words 0 - (demux_queue_size_bytes >> 4), // 7: remote_receiver_queue_size_words 0 + (L1_UNRESERVED_BASE >> 4), // 6: remote_receiver_queue_start_addr_words 0 + (0x10000 >> 4), // 7: remote_receiver_queue_size_words 0 (tunneler_queue_start_addr + tunneler_queue_size_bytes) >> 4, // 8: remote_receiver_queue_start_addr_words 1 tunneler_queue_size_bytes >> 4, // 9: remote_receiver_queue_size_words 1 packet_switch_4B_pack(tunneler_physical_core.x, @@ -993,30 +882,29 @@ void Device::compile_command_queue_programs() { (uint32_t)DispatchRemoteNetworkType::NOC0), // 11: remote_sender_1_info tunneler_test_results_addr, // 12: test_results_addr tunneler_test_results_size, // 13: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 14: timeout_cycles + 0, // 14: timeout_cycles }; - tt_metal::CreateKernel( + configure_kernel_variant( *command_queue_program_ptr, "tt_metal/impl/dispatch/kernels/eth_tunneler.cpp", - r_tunneler_logical_core, - tt_metal::EthernetConfig{ - .noc = tt_metal::NOC::NOC_0, - .compile_args = tunneler_r_compile_args, - // Skip noc logging for tunneling cores, since stopping the print server can hang - // the chip in this case. 
- .defines = {{"SKIP_NOC_LOGGING", "1"}} - } + tunneler_r_compile_args, + r_tunneler_core, + CoreCoord{0, 0}, + CoreType::ETH, + CoreCoord{0, 0}, + CoreCoord{0, 0}, + std::map {{"SKIP_NOC_LOGGING", "1"}}, + true ); - log_debug(LogDevice, "run tunneler at device {} Core {}", this->id(), r_tunneler_logical_core.str()); - //uint32_t dest_map_array[4] = {0, 1, 2, 3}; - //uint64_t dest_endpoint_output_map = packet_switch_dest_pack(dest_map_array, 4); + log_debug(LogDevice, "run tunneler at device {} Core {}", this->id(), r_tunneler_core.str()); + std::vector demux_d_compile_args = { dest_endpoint_start_id, // 0: endpoint_id_start_index - (demux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words - (demux_queue_size_bytes >> 4), // 2: rx_queue_size_words + (L1_UNRESERVED_BASE >> 4), // 1: rx_queue_start_addr_words + (0x10000 >> 4), // 2: rx_queue_size_words 1, // 3: demux_fan_out packet_switch_4B_pack(prefetch_d_physical_core.x, prefetch_d_physical_core.y, @@ -1034,7 +922,7 @@ void Device::compile_command_queue_programs() { 0, 0, (uint32_t)DispatchRemoteNetworkType::NOC0), // 7: remote_tx_3_info - (prefetch_d_buffer_base >> 4), // 8: remote_tx_queue_start_addr_words 0 + (dispatch_constants::DISPATCH_BUFFER_BASE >> 4), // 8: remote_tx_queue_start_addr_words 0 dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_size() >> 4, // 9: remote_tx_queue_size_words 0 0, // 10: remote_tx_queue_start_addr_words 1 0, // 11: remote_tx_queue_size_words 1 @@ -1050,83 +938,59 @@ void Device::compile_command_queue_programs() { (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF), // 21: dest_endpoint_output_map_lo packetized_path_test_results_addr, // 22: test_results_addr packetized_path_test_results_size, // 23: test_results_size - timeout_mcycles * 1000 * 1000, // 24: timeout_cycles + 0, // 24: timeout_cycles 0x1, // 25: output_depacketize_mask // 26: output 0 packetize info: packet_switch_4B_pack(dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE, - demux_downstream_cb_sem, // local sem prefetch_d_upstream_cb_sem, // downstream sem + demux_sem, // local sem 0), packet_switch_4B_pack(0, 0, 0, 0), // 27: output 1 packetize info packet_switch_4B_pack(0, 0, 0, 0), // 28: output 2 packetize info packet_switch_4B_pack(0, 0, 0, 0), // 29: output 3 packetize info }; - log_debug(LogDevice, "run demux at {}", demux_d_location.str()); - - if (dispatch_on_eth) { - tt::tt_metal::CreateKernel( - *command_queue_program_ptr, - "tt_metal/impl/dispatch/kernels/packet_demux.cpp", - demux_d_location, - EthernetConfig{ - .eth_mode = Eth::IDLE, - .noc = NOC::NOC_0, - .compile_args = demux_d_compile_args, - .defines = {{"SKIP_NOC_LOGGING", "1"}} - } - ); - } else { - tt_metal::CreateKernel( + configure_kernel_variant( *command_queue_program_ptr, "tt_metal/impl/dispatch/kernels/packet_demux.cpp", - demux_d_location, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = demux_d_compile_args, - .defines = {{"SKIP_NOC_LOGGING", "1"}} - } + demux_d_compile_args, + demux_d_core, + CoreCoord{0, 0}, + dispatch_core_type, + CoreCoord{0, 0}, + CoreCoord{0, 0}, + std::map {{"SKIP_NOC_LOGGING", "1"}} ); - } + + log_debug(LogDevice, "run demux at {}", demux_d_core.str()); // prefetch_d - uint32_t scratch_db_base = (prefetch_d_buffer_base + dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_size() + uint32_t scratch_db_base = (dispatch_constants::DISPATCH_BUFFER_BASE + 
dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_size() + PCIE_ALIGNMENT - 1) & (~(PCIE_ALIGNMENT - 1));
        uint32_t scratch_db_size = dispatch_constants::get(dispatch_core_type).scratch_db_size();
        const uint32_t l1_size = dispatch_core_type == CoreType::WORKER ? MEM_L1_SIZE : MEM_ETH_SIZE;
        TT_ASSERT(scratch_db_base + scratch_db_size <= l1_size);

-        std::map<string, string> prefetch_d_defines = {
-            {"DISPATCH_KERNEL", "1"},
-            {"MY_NOC_X", std::to_string(prefetch_d_physical_core.x)},
-            {"MY_NOC_Y", std::to_string(prefetch_d_physical_core.y)},
-            {"UPSTREAM_NOC_X", std::to_string(demux_d_physical_core.x)},
-            {"UPSTREAM_NOC_Y", std::to_string(demux_d_physical_core.y)},
-            {"DOWNSTREAM_NOC_X", std::to_string(dispatch_physical_core.x)},
-            {"DOWNSTREAM_NOC_Y", std::to_string(dispatch_physical_core.y)},
-        };
-
        std::vector<uint32_t> prefetch_d_compile_args = {
            dispatch_constants::DISPATCH_BUFFER_BASE, // overridden below for prefetch_h
            dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE, // overridden below for prefetch_h
            dispatch_buffer_pages, // overridden below for prefetch_h
            prefetch_d_downstream_cb_sem, // overridden below for prefetch_d
-           dispatch_cb_sem, // overridden below for prefetch_h
+           dispatch_sem, // overridden below for prefetch_h
            0, //issue_queue_start_addr,
            0, //issue_queue_size,
            0, //prefetch_q_base,
            dispatch_constants::get(dispatch_core_type).prefetch_q_size(),
            CQ_PREFETCH_Q_RD_PTR,
-           prefetch_d_buffer_base, // overridden for split below
+           dispatch_constants::DISPATCH_BUFFER_BASE, // overridden for split below
            dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_size(), // overridden for split below
            scratch_db_base, // scratch_db_base filled in below if used
            scratch_db_size,
-           prefetch_sync_sem,
+           prefetch_d_sync_sem,
            dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_pages(), // prefetch_d only
            prefetch_d_upstream_cb_sem, // prefetch_d only my upstream
-           demux_downstream_cb_sem, // prefetch_d only upstream
+           demux_sem, // prefetch_d only upstream
            dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE,
            dispatch_constants::PREFETCH_D_BUFFER_BLOCKS, // prefetch_d only
            prefetch_h_exec_buf_sem,
            true,
            false
        };

-        if (dispatch_on_eth) {
-            tt::tt_metal::CreateKernel(
-                *command_queue_program_ptr,
-                "tt_metal/impl/dispatch/kernels/cq_prefetch.cpp",
-                prefetch_d_location,
-                EthernetConfig{
-                    .eth_mode = Eth::IDLE,
-                    .noc = NOC::NOC_0,
-                    .compile_args = prefetch_d_compile_args,
-                    .defines = prefetch_d_defines
-                }
-            );
-        } else {
-            tt::tt_metal::CreateKernel(
+        configure_kernel_variant(
             *command_queue_program_ptr,
-            "tt_metal/impl/dispatch/kernels/cq_prefetch.cpp", // update this for remote device
-            prefetch_d_location,
-            tt::tt_metal::DataMovementConfig {
-                .processor = tt::tt_metal::DataMovementProcessor::RISCV_1,
-                .noc = tt::tt_metal::NOC::RISCV_0_default,
-                .compile_args = prefetch_d_compile_args,
-                .defines = prefetch_d_defines});
-        }
+            "tt_metal/impl/dispatch/kernels/cq_prefetch.cpp",
+            prefetch_d_compile_args,
+            prefetch_d_core,
+            prefetch_d_physical_core,
+            dispatch_core_type,
+            demux_d_physical_core,
+            dispatch_physical_core,
+            std::map<string, string> {}
+        );

+        log_debug(LogDevice, "run prefetch_d at {}", prefetch_d_core.str());

-        std::map<string, string> dispatch_d_defines = {
-            {"DISPATCH_KERNEL", "1"},
-            {"MY_NOC_X", std::to_string(dispatch_physical_core.x)},
-            {"MY_NOC_Y", std::to_string(dispatch_physical_core.y)},
-            {"UPSTREAM_NOC_X",
std::to_string(prefetch_d_physical_core.x)}, - {"UPSTREAM_NOC_Y", std::to_string(prefetch_d_physical_core.y)}, - {"DOWNSTREAM_NOC_X", std::to_string(mux_d_physical_core.x)}, - {"DOWNSTREAM_NOC_Y", std::to_string(mux_d_physical_core.y)}, - }; std::vector dispatch_d_compile_args = { dispatch_constants::DISPATCH_BUFFER_BASE, dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE, dispatch_buffer_pages, - dispatch_cb_sem, + dispatch_sem, prefetch_d_downstream_cb_sem, dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS, - dispatch_sync_sem, + prefetch_d_sync_sem, 128, 128 + 256 * 1024 * 1024, 256 * 1024 * 1024, dispatch_constants::DISPATCH_BUFFER_BASE, (1 << dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE) * dispatch_buffer_pages, dispatch_downstream_cb_sem, // unused on hd, filled in below for h and d - dispatch_h_cb_sem, // unused on hd, filled in below for h and d + mux_sem, // unused on hd, filled in below for h and d sizeof(dispatch_packet_header_t), // unused unless tunneler is between h and d true, // is_dram_variant false // is_host_variant }; - if (dispatch_on_eth) { - tt::tt_metal::CreateKernel( - *command_queue_program_ptr, - "tt_metal/impl/dispatch/kernels/cq_dispatch.cpp", - dispatch_location, - EthernetConfig{ - .eth_mode = Eth::IDLE, - .noc = NOC::NOC_0, - .compile_args = dispatch_d_compile_args, - .defines = dispatch_d_defines - } - ); - } else { - tt::tt_metal::CreateKernel( + configure_kernel_variant( *command_queue_program_ptr, "tt_metal/impl/dispatch/kernels/cq_dispatch.cpp", - dispatch_location, - tt::tt_metal::DataMovementConfig { - .processor = tt::tt_metal::DataMovementProcessor::RISCV_1, - .noc = tt::tt_metal::NOC::RISCV_0_default, - .compile_args = dispatch_d_compile_args, - .defines = dispatch_d_defines}); - } + dispatch_d_compile_args, + dispatch_core, + dispatch_physical_core, + dispatch_core_type, + prefetch_d_physical_core, + mux_d_physical_core, + std::map {} + ); - log_debug(LogDevice, "run dispatch at {}", dispatch_location.str()); + log_debug(LogDevice, "run dispatch at {}", dispatch_core.str()); std::vector mux_d_compile_args = { @@ -1246,14 +1076,14 @@ void Device::compile_command_queue_programs() { (uint32_t)DispatchRemoteNetworkType::NOC0, // 13: tx_network_type packetized_path_test_results_addr, // 14: test_results_addr packetized_path_test_results_size, // 15: test_results_size - timeout_mcycles * 1000 * 1000, // 16: timeout_cycles + 0, // 16: timeout_cycles 0x0,// 17: output_depacketize 0x0,// 18: output_depacketize info // 19: input 0 packetize info: packet_switch_4B_pack(0x1, dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE, dispatch_downstream_cb_sem, // upstream sem - dispatch_h_cb_sem), // local sem + mux_sem), // local sem packet_switch_4B_pack(0, 0, 0, 0), // 20: input 1 packetize info packet_switch_4B_pack(0, 0, 0, 0), // 21: input 2 packetize info packet_switch_4B_pack(0, 0, 0, 0), // 22: input 3 packetize info @@ -1261,33 +1091,19 @@ void Device::compile_command_queue_programs() { packet_switch_4B_pack(dest_endpoint_start_id, 0, 0, 0), // 24: packetized input dest id }; - log_debug(LogDevice, "run mux at {}", mux_d_location.str()); - - if (dispatch_on_eth) { - tt::tt_metal::CreateKernel( - *command_queue_program_ptr, - "tt_metal/impl/dispatch/kernels/packet_mux.cpp", - mux_d_location, - EthernetConfig{ - .eth_mode = Eth::IDLE, - .noc = NOC::NOC_0, - .compile_args = mux_d_compile_args, - .defines = {{"SKIP_NOC_LOGGING", "1"}} - } - ); - } else { - tt_metal::CreateKernel( + configure_kernel_variant( *command_queue_program_ptr, 
"tt_metal/impl/dispatch/kernels/packet_mux.cpp", - mux_d_location, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = mux_d_compile_args, - .defines = {{"SKIP_NOC_LOGGING", "1"}} - } + mux_d_compile_args, + mux_d_core, + CoreCoord{0, 0}, + dispatch_core_type, + CoreCoord{0, 0}, + CoreCoord{0, 0}, + std::map {{"SKIP_NOC_LOGGING", "1"}} ); - } + + log_debug(LogDevice, "run mux at {}", mux_d_core.str()); detail::CompileProgram(this, *command_queue_program_ptr); this->command_queue_programs.push_back(std::move(command_queue_program_ptr)); @@ -1531,17 +1347,6 @@ bool Device::close() { cores_to_skip.insert(phys_core); log_debug(tt::LogMetal, "Remote Device Demux core: Logical: {} - Physical: {} will keep running on MMIO Device.", demux_location.str(), phys_core.str()); } - /* - tt_cxy_pair dispatch_location = dispatch_core_manager::get(curr_num_hw_cqs).dispatcher_core(device_id, curr_channel, cq_id); - tt_cxy_pair prefetch_location = dispatch_core_manager::get(curr_num_hw_cqs).prefetcher_core(device_id, curr_channel, cq_id); - tt_cxy_pair mux_location = dispatch_core_manager::get(curr_num_hw_cqs).mux_core(device_id, curr_channel, cq_id); - tt_cxy_pair demux_location = dispatch_core_manager::get(curr_num_hw_cqs).demux_core(device_id, curr_channel, cq_id); - cores_to_skip.insert(get_physical_core_coordinate(dispatch_location, dispatch_core_type)); - cores_to_skip.insert(get_physical_core_coordinate(prefetch_location, dispatch_core_type)); - cores_to_skip.insert(get_physical_core_coordinate(mux_location, dispatch_core_type)); - cores_to_skip.insert(get_physical_core_coordinate(demux_location, dispatch_core_type)); - log_debug(tt::LogMetal, "Remote Device dispatch cores: {} : {} : {} : {} will keep running on MMIO Device.", dispatch_location.str(), prefetch_location.str(), mux_location.str(), demux_location.str()); - */ } } } @@ -1576,17 +1381,6 @@ bool Device::close() { not_done_dispatch_cores.insert(phys_core); log_debug(tt::LogMetal, "Remote Device Demux core: Logical: {} - Physical: {} will be reset on MMIO Device.", demux_location.str(), phys_core.str()); } - /* - tt_cxy_pair dispatch_location = dispatch_core_manager::get(curr_num_hw_cqs).dispatcher_core(device_id, curr_channel, cq_id); - tt_cxy_pair prefetch_location = dispatch_core_manager::get(curr_num_hw_cqs).prefetcher_core(device_id, curr_channel, cq_id); - tt_cxy_pair mux_location = dispatch_core_manager::get(curr_num_hw_cqs).mux_core(device_id, curr_channel, cq_id); - tt_cxy_pair demux_location = dispatch_core_manager::get(curr_num_hw_cqs).demux_core(device_id, curr_channel, cq_id); - not_done_dispatch_cores.insert(get_physical_core_coordinate(dispatch_location, dispatch_core_type)); - not_done_dispatch_cores.insert(get_physical_core_coordinate(prefetch_location, dispatch_core_type)); - not_done_dispatch_cores.insert(get_physical_core_coordinate(mux_location, dispatch_core_type)); - not_done_dispatch_cores.insert(get_physical_core_coordinate(demux_location, dispatch_core_type)); - log_debug(tt::LogMetal, "Remote Device dispatch cores {} : {} : {} : {} will be reset on MMIO Device.", dispatch_location.str(), prefetch_location.str(), mux_location.str(), demux_location.str()); - */ } } diff --git a/tt_metal/impl/device/device.hpp b/tt_metal/impl/device/device.hpp index 16f2f3fe9368..07a2af34385d 100644 --- a/tt_metal/impl/device/device.hpp +++ b/tt_metal/impl/device/device.hpp @@ -219,6 +219,8 @@ class Device { void 
initialize_and_launch_firmware();
    void initialize_command_queue();
    void initialize_synchronous_sw_cmd_queue();
+    void configure_kernel_variant(Program& program, string path, std::vector compile_args, CoreCoord kernel_core, CoreCoord kernel_physical_core,
+        CoreType dispatch_core_type, CoreCoord upstream_physical_core, CoreCoord downstream_physical_core, std::map defines_in, bool is_active_eth_core = false);
    void compile_command_queue_programs();
    void configure_command_queue_programs();
    void clear_l1_state();

From 060b8c1d6f6cfd75fd09d8a4c4ab47f855c63032 Mon Sep 17 00:00:00 2001
From: Akhmed Rakhmati
Date: Thu, 16 May 2024 19:45:53 +0000
Subject: [PATCH 03/40] #8569: Handle static and dynamic OP validation performantly

---
 tt_eager/tt_dnn/op_library/operation.hpp     | 16 +++++
 tt_eager/tt_dnn/op_library/run_operation.cpp | 19 ++++--
 tt_eager/ttnn/config.hpp                     | 72 ++++++++++++++++++++
 ttnn/cpp/ttnn/core.hpp                       | 56 +--------------
 4 files changed, 103 insertions(+), 60 deletions(-)
 create mode 100644 tt_eager/ttnn/config.hpp

diff --git a/tt_eager/tt_dnn/op_library/operation.hpp b/tt_eager/tt_dnn/op_library/operation.hpp
index a4091a2699be..0c1acf9183ff 100644
--- a/tt_eager/tt_dnn/op_library/operation.hpp
+++ b/tt_eager/tt_dnn/op_library/operation.hpp
@@ -12,6 +12,7 @@
 #include "tt_metal/impl/program/program.hpp"
 #include "tt_stl/concepts.hpp"
 #include "tt_stl/reflection.hpp"
+#include "ttnn/config.hpp"

 namespace tt {

@@ -498,6 +499,8 @@ struct DeviceOperation final {
             output_tensors);
     }

+    inline bool uses_custom_program_hash() const { return this->uses_custom_program_hash_impl_(); }
+
     inline const Hash compute_program_hash(
         const Tensors& input_tensors, const OptionalConstTensors& optional_input_tensors) const {
         ZoneScoped;
@@ -536,6 +539,9 @@ struct DeviceOperation final {
                 const Tensors& input_tensors,
                 const OptionalConstTensors& optional_input_tensors,
                 const OptionalTensors& optional_output_tensors) -> void {
+                if (ttnn::CONFIG.enable_fast_runtime_mode) {
+                    return;
+                }
                 const auto& operation = *reinterpret_cast*>(&storage);
                 if constexpr (
                     (detail::implements_validate() or
@@ -663,6 +669,15 @@ struct DeviceOperation final {
                     static_assert(tt::stl::concepts::always_false_v, "Operation doesn't implement create_program");
                 }
             }},
+          uses_custom_program_hash_impl_{[]() -> bool {
+              if constexpr (detail::implements_compute_program_hash()) {
+                  return true;
+              } else if constexpr (detail::implements_compute_program_hash_with_optional_input_tensors()) {
+                  return true;
+              } else {
+                  return false;
+              }
+          }},
           create_profiler_info_impl_{[](const storage_t& storage, const Tensors& input_tensors) -> const ProfilerInfo {
               const auto& operation = *reinterpret_cast*>(&storage);
               std::optional preferred_name = tt::stl::get_type_name();
@@ -720,6 +735,7 @@ struct DeviceOperation final {
         const Tensors&,
         const std::vector>&,
         OutputTensors&);
+    bool (*uses_custom_program_hash_impl_)();
     const Hash (*compute_program_hash_impl_)(
         const storage_t& value, const Tensors&, const std::vector>&);
     const ProfilerInfo (*create_profiler_info_impl_)(const storage_t& value, const Tensors& input_tensors);

diff --git a/tt_eager/tt_dnn/op_library/run_operation.cpp b/tt_eager/tt_dnn/op_library/run_operation.cpp
index 05f7747ad5db..93c12c554221 100644
--- a/tt_eager/tt_dnn/op_library/run_operation.cpp
+++ b/tt_eager/tt_dnn/op_library/run_operation.cpp
@@ -146,7 +146,8 @@ OutputTensors run_device_operation(
         const DeviceOperation&,
         const Tensors&,
         const OptionalConstTensors&,
-        OutputTensors&)>
+        OutputTensors&,
+        const OptionalTensors&)>
get_or_create_program; auto& program_cache = input_tensors[0].device()->program_cache; @@ -157,12 +158,18 @@ OutputTensors run_device_operation( const DeviceOperation& operation, const Tensors& input_tensors, const OptionalConstTensors& optional_input_tensors, - OutputTensors& output_tensors) -> std::reference_wrapper { + OutputTensors& output_tensors, + const OptionalTensors& optional_output_tensors) -> std::reference_wrapper { program_hash = operation.compute_program_hash(input_tensors, optional_input_tensors); auto program_ptr = program_cache.find(program_hash); bool cache_hit = program_ptr.has_value(); log_debug(tt::LogOp, "Program Hash: {} ({})", program_hash, cache_hit ? "HIT" : "MISS"); + + if (not cache_hit or operation.uses_custom_program_hash()) { + operation.validate(input_tensors, optional_input_tensors, optional_output_tensors); + } + if (not cache_hit) { program_ptr = std::make_shared>(operation.create_program(input_tensors, optional_input_tensors, output_tensors)); program_cache.insert(program_hash, program_ptr.value()); @@ -196,16 +203,18 @@ OutputTensors run_device_operation( get_or_create_program = [](const DeviceOperation& operation, const Tensors& input_tensors, const OptionalConstTensors& optional_input_tensors, - OutputTensors& output_tensors) -> std::shared_ptr { + OutputTensors& output_tensors, + const OptionalTensors& optional_output_tensors) -> std::shared_ptr { + operation.validate(input_tensors, optional_input_tensors, optional_output_tensors); auto program_with_callbacks = operation.create_program(input_tensors, optional_input_tensors, output_tensors); return std::make_shared(std::move(program_with_callbacks.program)); }; } - operation.validate(input_tensors, optional_input_tensors, optional_output_tensors); auto output_tensors = operation.create_output_tensors(input_tensors, optional_output_tensors); - auto program = get_or_create_program(operation, input_tensors, optional_input_tensors, output_tensors); + auto program = get_or_create_program( + operation, input_tensors, optional_input_tensors, output_tensors, optional_output_tensors); uint32_t device_id = detail::get_device(input_tensors, optional_input_tensors)->id(); // Enqueue or Launch Program diff --git a/tt_eager/ttnn/config.hpp b/tt_eager/ttnn/config.hpp new file mode 100644 index 000000000000..e13635e127bc --- /dev/null +++ b/tt_eager/ttnn/config.hpp @@ -0,0 +1,72 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
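The hunks above change when `validate` runs: with the program cache enabled it now fires only on a cache miss or when the op defines a custom program hash (the default hash captures everything validation depends on; a user-supplied one may not), and `validate_impl` in operation.hpp returns immediately when `ttnn::CONFIG.enable_fast_runtime_mode` is set. A condensed sketch of that control flow, with a plain `unordered_map` and an invented `Op` struct standing in for the real `DeviceOperation` and program-cache types, and with both gates folded into one function:

```cpp
#include <cstddef>
#include <functional>
#include <memory>
#include <unordered_map>

struct Program {};

// Invented stand-in: the caller wires these up from the real operation.
struct Op {
    bool uses_custom_program_hash = false;
    bool fast_runtime_mode = false;           // ttnn::CONFIG.enable_fast_runtime_mode
    std::function<void()> validate;           // static + dynamic checks
    std::function<Program()> create_program;  // the expensive part being cached
};

using ProgramCache = std::unordered_map<std::size_t, std::shared_ptr<Program>>;

std::shared_ptr<Program> get_or_create_program(
    const Op& op, std::size_t program_hash, ProgramCache& cache) {
    auto it = cache.find(program_hash);
    const bool cache_hit = (it != cache.end());

    // Validate on a miss (a program is about to be built from these inputs)
    // or when the hash is user-defined, since a custom hash may not cover
    // everything validate() checks. Fast runtime mode skips checks entirely.
    if ((not cache_hit or op.uses_custom_program_hash) and not op.fast_runtime_mode) {
        op.validate();
    }
    if (not cache_hit) {
        it = cache.emplace(program_hash,
                           std::make_shared<Program>(op.create_program())).first;
    }
    return it->second;
}
```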
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include + +namespace ttnn { + +namespace core { + +struct Config { + std::string cache_path = "/home/.cache/ttnn"; + std::string model_cache_path = "/home/.cache/ttnn/models"; + std::string tmp_dir = "/tmp/ttnn"; + bool enable_model_cache = false; + bool enable_fast_runtime_mode = false; + bool throw_exception_on_fallback = false; + bool enable_logging = false; + bool enable_graph_report = false; + bool enable_detailed_buffer_report = false; + bool enable_detailed_tensor_report = false; + bool enable_comparison_mode = false; + float comparison_mode_pcc = 0.9999; + std::string root_report_path = "generated/ttnn/reports"; + std::optional report_name = std::nullopt; + + static constexpr auto attribute_names = std::make_tuple( + "cache_path", + "model_cache_path", + "tmp_dir", + "enable_model_cache", + "enable_fast_runtime_mode", + "throw_exception_on_fallback", + "enable_logging", + "enable_graph_report", + "enable_detailed_buffer_report", + "enable_detailed_tensor_report", + "enable_comparison_mode", + "comparison_mode_pcc", + "root_report_path", + "report_name"); + + const auto attribute_values() const { + return std::make_tuple( + std::cref(this->cache_path), + std::cref(this->model_cache_path), + std::cref(this->tmp_dir), + std::cref(this->enable_model_cache), + std::cref(this->enable_fast_runtime_mode), + std::cref(this->throw_exception_on_fallback), + std::cref(this->enable_logging), + std::cref(this->enable_graph_report), + std::cref(this->enable_detailed_buffer_report), + std::cref(this->enable_detailed_tensor_report), + std::cref(this->enable_comparison_mode), + std::cref(this->comparison_mode_pcc), + std::cref(this->root_report_path), + std::cref(this->report_name)); + } +}; + +inline Config CONFIG{}; + +} // namespace core + +using core::CONFIG; +using core::Config; +} // namespace ttnn diff --git a/ttnn/cpp/ttnn/core.hpp b/ttnn/cpp/ttnn/core.hpp index 1d40f720af04..4ba605f87694 100644 --- a/ttnn/cpp/ttnn/core.hpp +++ b/ttnn/cpp/ttnn/core.hpp @@ -11,6 +11,7 @@ #include "tt_eager/tensor/tensor_impl.hpp" // TTNN_TENSOR_PRINT_PROFILE #include "tt_eager/tensor/types.hpp" #include "tt_eager/tt_dnn/op_library/operation.hpp" +#include "ttnn/config.hpp" #include "ttnn/types.hpp" namespace ttnn { @@ -29,59 +30,6 @@ namespace ttnn { namespace core { -struct Config { - std::string cache_path = "/home/.cache/ttnn"; - std::string model_cache_path = "/home/.cache/ttnn/models"; - std::string tmp_dir = "/tmp/ttnn"; - bool enable_model_cache = false; - bool enable_fast_runtime_mode = false; - bool throw_exception_on_fallback = false; - bool enable_logging = false; - bool enable_graph_report = false; - bool enable_detailed_buffer_report = false; - bool enable_detailed_tensor_report = false; - bool enable_comparison_mode = false; - float comparison_mode_pcc = 0.9999; - std::string root_report_path = "generated/ttnn/reports"; - std::optional report_name = std::nullopt; - - static constexpr auto attribute_names = std::make_tuple( - "cache_path", - "model_cache_path", - "tmp_dir", - "enable_model_cache", - "enable_fast_runtime_mode", - "throw_exception_on_fallback", - "enable_logging", - "enable_graph_report", - "enable_detailed_buffer_report", - "enable_detailed_tensor_report", - "enable_comparison_mode", - "comparison_mode_pcc", - "root_report_path", - "report_name"); - - const auto attribute_values() const { - return std::make_tuple( - std::cref(this->cache_path), - std::cref(this->model_cache_path), - 
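The new `Config` carries the `attribute_names` / `attribute_values()` tuple pair that the `tt_stl/reflection.hpp` utilities consume for hashing and printing. A self-contained illustration of that reflection idiom (C++20; `Settings` and `dump` are invented for the example, and the index-sequence expansion stands in for the real tt_stl machinery):

```cpp
#include <cstddef>
#include <functional>
#include <iostream>
#include <string>
#include <tuple>
#include <utility>

struct Settings {
    // A subset of the Config fields above, for illustration.
    std::string cache_path = "/home/.cache/ttnn";
    bool enable_fast_runtime_mode = false;
    float comparison_mode_pcc = 0.9999f;

    static constexpr auto attribute_names =
        std::make_tuple("cache_path", "enable_fast_runtime_mode", "comparison_mode_pcc");

    auto attribute_values() const {
        return std::make_tuple(
            std::cref(this->cache_path),
            std::cref(this->enable_fast_runtime_mode),
            std::cref(this->comparison_mode_pcc));
    }
};

// Generic dump over any type exposing the name/value tuple pair:
// zip the two tuples index by index.
template <typename T>
void dump(const T& object) {
    constexpr auto n = std::tuple_size_v<decltype(T::attribute_names)>;
    const auto values = object.attribute_values();
    [&]<std::size_t... I>(std::index_sequence<I...>) {
        ((std::cout << std::get<I>(T::attribute_names) << " = "
                    << std::get<I>(values).get() << '\n'),
         ...);
    }(std::make_index_sequence<n>{});
}

int main() { dump(Settings{}); }
```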
std::cref(this->tmp_dir), - std::cref(this->enable_model_cache), - std::cref(this->enable_fast_runtime_mode), - std::cref(this->throw_exception_on_fallback), - std::cref(this->enable_logging), - std::cref(this->enable_graph_report), - std::cref(this->enable_detailed_buffer_report), - std::cref(this->enable_detailed_tensor_report), - std::cref(this->enable_comparison_mode), - std::cref(this->comparison_mode_pcc), - std::cref(this->root_report_path), - std::cref(this->report_name)); - } -}; - -inline Config CONFIG{}; - inline std::uint32_t pad_to_multiple_of_tile_size(std::uint32_t value) { return (value + (ttnn::TILE_SIZE - 1)) / ttnn::TILE_SIZE * ttnn::TILE_SIZE; } @@ -118,8 +66,6 @@ inline void dump_stack_trace_on_segfault() { } // namespace core -using core::CONFIG; -using core::Config; using core::get_memory_config; using core::has_storage_type_of; using core::pad_to_multiple_of_tile_size; From 9068f70e55f1328ff33ba51f8ecd162226dcde20 Mon Sep 17 00:00:00 2001 From: Vincent Tang Date: Fri, 17 May 2024 05:03:13 +0000 Subject: [PATCH 04/40] #8027: source python_env in build_metal --- build_metal.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/build_metal.sh b/build_metal.sh index 6c1854fd5e26..8aa47d49384a 100755 --- a/build_metal.sh +++ b/build_metal.sh @@ -1,6 +1,10 @@ #!/bin/bash set -eo pipefail +if [ -z "$PYTHON_ENV_DIR" ]; then + PYTHON_ENV_DIR=$(pwd)/python_env +fi + if [ -z "$CONFIG" ]; then echo "Build type defaulted to Release" else @@ -23,6 +27,7 @@ if [ "$CONFIG" != "ci" ]; then echo "Building cpp tests" cmake --build build --target tests -- -j`nproc` + source $PYTHON_ENV_DIR/bin/activate echo "Generating stubs" stubgen -m tt_lib -m tt_lib.device -m tt_lib.profiler -m tt_lib.tensor -m tt_lib.operations -m tt_lib.operations.primary -m tt_lib.operations.primary.transformers -o tt_eager stubgen -p ttnn._ttnn -o ttnn From db7d8ae1eaaa56851a15025a2639b7f95839cd74 Mon Sep 17 00:00:00 2001 From: Vincent Tang Date: Fri, 17 May 2024 02:13:35 +0000 Subject: [PATCH 05/40] #0: re-add `test_add.cpp` to ttnn unit tests --- tests/ttnn/unit_tests/gtests/CMakeLists.txt | 3 ++- tests/ttnn/unit_tests/{operations => gtests}/test_add.cpp | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) rename tests/ttnn/unit_tests/{operations => gtests}/test_add.cpp (96%) diff --git a/tests/ttnn/unit_tests/gtests/CMakeLists.txt b/tests/ttnn/unit_tests/gtests/CMakeLists.txt index 5b39ae4e4503..3c022b5e5e3a 100644 --- a/tests/ttnn/unit_tests/gtests/CMakeLists.txt +++ b/tests/ttnn/unit_tests/gtests/CMakeLists.txt @@ -1,6 +1,7 @@ set(TTNN_UNIT_TESTS_SRC - ${CMAKE_SOURCE_DIR}/tests/ttnn/unit_tests/gtests/test_async_runtime.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_add.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_async_runtime.cpp ) add_executable(unit_tests_ttnn ${TTNN_UNIT_TESTS_SRC}) diff --git a/tests/ttnn/unit_tests/operations/test_add.cpp b/tests/ttnn/unit_tests/gtests/test_add.cpp similarity index 96% rename from tests/ttnn/unit_tests/operations/test_add.cpp rename to tests/ttnn/unit_tests/gtests/test_add.cpp index a1df5fc1280a..53226c5cb49c 100644 --- a/tests/ttnn/unit_tests/operations/test_add.cpp +++ b/tests/ttnn/unit_tests/gtests/test_add.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" -#include "tests/ttnn/unit_tests/ttnn_test_fixtures.hpp" +#include "ttnn_test_fixtures.hpp" #include "ttnn/device.hpp" #include "ttnn/operations/binary.hpp" #include "ttnn/operations/core.hpp" From 
836bf20fe5418f1c3e544434fb1963304466606b Mon Sep 17 00:00:00 2001 From: umadevimcw Date: Tue, 7 May 2024 11:15:36 +0000 Subject: [PATCH 06/40] #7571: Support padding by user for argmax --- .../op_library/composite/composite_ops.cpp | 21 ++++++++-- tt_eager/tt_numpy/functions.hpp | 39 +++++++++++++++++++ 2 files changed, 57 insertions(+), 3 deletions(-) diff --git a/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp b/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp index d9b8cec98180..34d89247c4df 100644 --- a/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp +++ b/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp @@ -1566,15 +1566,30 @@ Tensor pow(const Tensor& input_a, int exponent, const MemoryConfig& output_mem_c return power(input_a, exponent, output_mem_config); } +Tensor create_mask(const Tensor& input_a, const MemoryConfig& output_mem_config) +{ + auto& padded_shape = input_a.get_legacy_shape(); + auto& unpadded_shape = padded_shape.without_padding(); + + if (padded_shape == unpadded_shape) + return input_a; + float t_inf = -std::numeric_limits::infinity(); + Tensor masked_input = tt::numpy::mask_padded_input(padded_shape, unpadded_shape, DataType::BFLOAT16); + masked_input = where(eqz(masked_input, output_mem_config), t_inf, input_a, output_mem_config); + return masked_input; +} // Argmax returns the index of maximum element in the tensor -Tensor _argmax(const Tensor& input_a, int64_t _dim, bool all, const MemoryConfig& output_mem_config) { - std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_a}))}; +Tensor _argmax(const Tensor& input, int64_t _dim, bool all, const MemoryConfig& output_mem_config) { + std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input}))}; operation::launch_with_autoformat( [_dim, all, output_mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { const auto& input_a = input_tensors.at(0); auto& input_shape = input_a.get_legacy_shape(); TT_FATAL(input_shape.rank() == 4, "supported for rank-4 tensors at this time"); + Tensor input_a = create_mask(input, output_mem_config); + + uint32_t dim = input_shape.get_normalized_index(_dim); int size = input_a.volume(); @@ -1672,7 +1687,7 @@ Tensor _argmax(const Tensor& input_a, int64_t _dim, bool all, const MemoryConfig max_indices.deallocate(); result = global_min(result, output_mem_config); return {result}; - }, {input_a}, output_tensors); + }, {input}, output_tensors); return output_tensors.at(0); } diff --git a/tt_eager/tt_numpy/functions.hpp b/tt_eager/tt_numpy/functions.hpp index 94bf502a9e2d..a63e529562e8 100644 --- a/tt_eager/tt_numpy/functions.hpp +++ b/tt_eager/tt_numpy/functions.hpp @@ -371,6 +371,45 @@ static Tensor index_all( return output; } +template +static Tensor mask_padded_input( + const Shape& padded_shape, + const Shape& unpadded_shape, + DataType data_type, + const Layout layout = Layout::ROW_MAJOR, + Device* device = nullptr, + const MemoryConfig& output_mem_config = MemoryConfig{ + .memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}) { + auto owned_buffer = tt_metal::owned_buffer::create(tt_metal::compute_volume(padded_shape)); + + auto index = 0; + //auto value = 0; + auto rank = padded_shape.rank(); + auto penultimate = rank - 2; + auto ultimate = rank - 1; + for (uint32_t b = 0; b < padded_shape[rank - 4]; b++) { + for (uint32_t c = 0; c < padded_shape[rank - 3]; c++) { + for (uint32_t y = 0; y < 
padded_shape[penultimate]; y++) { + for (uint32_t x = 0; x < padded_shape[ultimate]; x++) { + if (b < unpadded_shape[rank - 4] && c < unpadded_shape[rank - 3] && y < unpadded_shape[penultimate] && x < unpadded_shape[ultimate]) + { + owned_buffer[index++] = T(static_cast(1.0)); + } + else + { + owned_buffer[index++] = T(static_cast(0.0)); + } + } // dim W + } // dim H + } // dim C + } // dim N + auto output = Tensor(OwnedStorage{owned_buffer}, padded_shape, data_type, Layout::ROW_MAJOR).to(layout); + if (device != nullptr) { + output = output.to(device, output_mem_config); + } + return output; +} + template static Tensor fill_first_val_into_tensor(const Tensor& input_tensor, DataType data_type, const Layout layout , Device * device = nullptr, From b46b3991a050d0596bb659b00c9231cc2e0797d0 Mon Sep 17 00:00:00 2001 From: umadevimcw Date: Wed, 8 May 2024 11:00:32 +0000 Subject: [PATCH 07/40] #7571: Add test files for argmax padding --- .../grayskull/test_argmax_padding.py | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 tests/ttnn/python_api_testing/non_working_unit_tests/grayskull/test_argmax_padding.py diff --git a/tests/ttnn/python_api_testing/non_working_unit_tests/grayskull/test_argmax_padding.py b/tests/ttnn/python_api_testing/non_working_unit_tests/grayskull/test_argmax_padding.py new file mode 100644 index 000000000000..70d5119540b9 --- /dev/null +++ b/tests/ttnn/python_api_testing/non_working_unit_tests/grayskull/test_argmax_padding.py @@ -0,0 +1,65 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +import torch +import pytest +import tt_lib +from loguru import logger +from tests.tt_eager.python_api_testing.sweep_tests import comparison_funcs + + +@pytest.mark.parametrize( + "input_shapes", + ( + (torch.Size([1, 1, 1, 10])), + (torch.Size([1, 1, 1, 20])), + (torch.Size([1, 1, 1, 3])), + ), +) +@pytest.mark.parametrize( + "dim", + (3,), +) +@pytest.mark.parametrize("all", (True,)) +class TestArgmax: + def test_argmax(self, input_shapes, dim, all, device): + torch.manual_seed(10) + + # input_data = torch.tensor([5,3,2,1,10,9,6,7,4,8]).reshape(input_shapes).bfloat16() + input_data = torch.randn(input_shapes).bfloat16() + print("input_data", input_data) + input_tensor = ( + tt_lib.tensor.Tensor(input_data, tt_lib.tensor.DataType.BFLOAT16) + .pad_to_tile(100) + .to(tt_lib.tensor.Layout.TILE) + .to(device) + ) + tt_output_tensor_on_device = tt_lib.tensor.argmax(input_tensor, dim=dim, all=all) + + tt_out_tensor = tt_output_tensor_on_device.cpu().to(tt_lib.tensor.Layout.ROW_MAJOR).to_torch() + # tt_out_tensor = tt_output_tensor_on_device.cpu().to(tt_lib.tensor.Layout.ROW_MAJOR) + if all: + golden_tensor = torch.argmax(input_data) + tt_out_tensor = tt_out_tensor[0, 0, 0, 0] + else: + golden_tensor = torch.argmax(input_data, dim=dim) + if dim == 1 or dim == -3 or dim == 0 or dim == -4: + tt_out_tensor = tt_out_tensor[0] + else: + if input_shapes[1] != 1 or input_shapes[0] != 1: + tt_out_tensor = tt_out_tensor[0] + else: + tt_out_tensor = tt_out_tensor[0, 0, 0] + + pt_out_tensor = golden_tensor + comp_pass, comp_out = comparison_funcs.comp_pcc(pt_out_tensor, tt_out_tensor, pcc=0.99) + comp_all, _ = comparison_funcs.comp_allclose(pt_out_tensor, tt_out_tensor, atol=4, rtol=1e-1) + logger.info(comp_pass) + logger.info(comp_all) + logger.info(comp_out) + status = comp_pass | comp_all + + print("pt_out_tensor", pt_out_tensor) + print("tt_out_tensor", tt_out_tensor) + assert status From 0abfd91b98351088e3f722061c1f0cb888d4141a 
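The masking approach above: `mask_padded_input` writes 1 into every logical position and 0 into the padding, and `create_mask` then applies `where(mask, input, -inf)` so padded lanes can never win an argmax. A host-side sketch of the net effect on a flat row-major buffer, collapsed to 2-D and fusing the mask and the `where` into one pass (the real code builds a separate mask tensor and walks all four dimensions):

```cpp
#include <cstddef>
#include <limits>
#include <vector>

// Overwrite the padded region with -inf so that an argmax over the padded
// buffer can only select logical elements. padded_h/padded_w describe the
// storage shape, logical_h/logical_w the real extent of the data.
void mask_padding_with_neg_inf(std::vector<float>& buf,
                               std::size_t padded_h, std::size_t padded_w,
                               std::size_t logical_h, std::size_t logical_w) {
    const float neg_inf = -std::numeric_limits<float>::infinity();
    for (std::size_t y = 0; y < padded_h; ++y) {
        for (std::size_t x = 0; x < padded_w; ++x) {
            if (y >= logical_h or x >= logical_w) {
                buf[y * padded_w + x] = neg_inf;
            }
        }
    }
}
```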
Mon Sep 17 00:00:00 2001 From: umadevimcw Date: Thu, 9 May 2024 09:04:22 +0000 Subject: [PATCH 08/40] #7571: Remove using eqz op and test file prints --- .../non_working_unit_tests/grayskull/test_argmax_padding.py | 4 ---- tt_eager/tt_dnn/op_library/composite/composite_ops.cpp | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/ttnn/python_api_testing/non_working_unit_tests/grayskull/test_argmax_padding.py b/tests/ttnn/python_api_testing/non_working_unit_tests/grayskull/test_argmax_padding.py index 70d5119540b9..78b7134b1c81 100644 --- a/tests/ttnn/python_api_testing/non_working_unit_tests/grayskull/test_argmax_padding.py +++ b/tests/ttnn/python_api_testing/non_working_unit_tests/grayskull/test_argmax_padding.py @@ -28,7 +28,6 @@ def test_argmax(self, input_shapes, dim, all, device): # input_data = torch.tensor([5,3,2,1,10,9,6,7,4,8]).reshape(input_shapes).bfloat16() input_data = torch.randn(input_shapes).bfloat16() - print("input_data", input_data) input_tensor = ( tt_lib.tensor.Tensor(input_data, tt_lib.tensor.DataType.BFLOAT16) .pad_to_tile(100) @@ -59,7 +58,4 @@ def test_argmax(self, input_shapes, dim, all, device): logger.info(comp_all) logger.info(comp_out) status = comp_pass | comp_all - - print("pt_out_tensor", pt_out_tensor) - print("tt_out_tensor", tt_out_tensor) assert status diff --git a/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp b/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp index 34d89247c4df..7c7d01c684b1 100644 --- a/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp +++ b/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp @@ -1575,7 +1575,7 @@ Tensor create_mask(const Tensor& input_a, const MemoryConfig& output_mem_config) return input_a; float t_inf = -std::numeric_limits::infinity(); Tensor masked_input = tt::numpy::mask_padded_input(padded_shape, unpadded_shape, DataType::BFLOAT16); - masked_input = where(eqz(masked_input, output_mem_config), t_inf, input_a, output_mem_config); + masked_input = where(masked_input, input_a, t_inf, output_mem_config); return masked_input; } // Argmax returns the index of maximum element in the tensor From c54ee8f5f1a32af6e57083134bf74fbbafc02981 Mon Sep 17 00:00:00 2001 From: umadevimcw Date: Mon, 13 May 2024 10:14:35 +0000 Subject: [PATCH 09/40] #7571: Update index calculation module for all mode --- tt_eager/tt_numpy/functions.hpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tt_eager/tt_numpy/functions.hpp b/tt_eager/tt_numpy/functions.hpp index a63e529562e8..0cf2a51e6ffd 100644 --- a/tt_eager/tt_numpy/functions.hpp +++ b/tt_eager/tt_numpy/functions.hpp @@ -348,19 +348,21 @@ static Tensor index_all( const MemoryConfig& output_mem_config = MemoryConfig{ .memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}) { auto owned_buffer = tt_metal::owned_buffer::create(tt_metal::compute_volume(shape)); - + std::fill(owned_buffer.begin(), owned_buffer.end(), -std::numeric_limits::infinity()); + auto& up_shape = shape.without_padding(); auto index = 0; auto value = 0; - auto rank = shape.rank(); + auto rank = up_shape.rank(); auto penultimate = rank - 2; auto ultimate = rank - 1; - for (uint32_t b = 0; b < shape[rank - 4]; b++) { - for (uint32_t c = 0; c < shape[rank - 3]; c++) { - for (uint32_t y = 0; y < shape[penultimate]; y++) { - for (uint32_t x = 0; x < shape[ultimate]; x++) { + for (uint32_t b = 0; b < up_shape[rank - 4]; b++) { + for (uint32_t c = 0; c < up_shape[rank - 3]; c++) { + for (uint32_t y = 0; y < up_shape[penultimate]; y++) { + for 
(uint32_t x = 0; x < up_shape[ultimate]; x++) { owned_buffer[index++] = T(static_cast(value)); value = value + 1; } // dim W + index = index + (shape[ultimate] - up_shape[ultimate]); } // dim H } // dim C } // dim N From 43b9e07319f0dd3b1b5dd615db157e2a7aaf6dc3 Mon Sep 17 00:00:00 2001 From: umadevimcw Date: Mon, 13 May 2024 14:15:32 +0000 Subject: [PATCH 10/40] #7571: Update index module for dimension 3 and 2 --- .../grayskull/test_argmax_padding.py | 24 ++--- tt_eager/tt_numpy/functions.hpp | 87 ++++++++++--------- 2 files changed, 62 insertions(+), 49 deletions(-) diff --git a/tests/ttnn/python_api_testing/non_working_unit_tests/grayskull/test_argmax_padding.py b/tests/ttnn/python_api_testing/non_working_unit_tests/grayskull/test_argmax_padding.py index 78b7134b1c81..7cc6eac11e32 100644 --- a/tests/ttnn/python_api_testing/non_working_unit_tests/grayskull/test_argmax_padding.py +++ b/tests/ttnn/python_api_testing/non_working_unit_tests/grayskull/test_argmax_padding.py @@ -13,15 +13,14 @@ "input_shapes", ( (torch.Size([1, 1, 1, 10])), - (torch.Size([1, 1, 1, 20])), - (torch.Size([1, 1, 1, 3])), + (torch.Size([1, 1, 10, 20])), + (torch.Size([1, 1, 30, 3])), + (torch.Size([1, 4, 3, 5])), + (torch.Size([5, 4, 3, 20])), ), ) -@pytest.mark.parametrize( - "dim", - (3,), -) -@pytest.mark.parametrize("all", (True,)) +@pytest.mark.parametrize("dim", (3, 2, -1, -2)) +@pytest.mark.parametrize("all", (True, False)) class TestArgmax: def test_argmax(self, input_shapes, dim, all, device): torch.manual_seed(10) @@ -47,10 +46,15 @@ def test_argmax(self, input_shapes, dim, all, device): tt_out_tensor = tt_out_tensor[0] else: if input_shapes[1] != 1 or input_shapes[0] != 1: - tt_out_tensor = tt_out_tensor[0] + if dim == 2 or dim == -2: + tt_out_tensor = tt_out_tensor[0, :, :, 0 : input_shapes[3]] + else: + tt_out_tensor = tt_out_tensor[0, :, :, 0 : input_shapes[2]] else: - tt_out_tensor = tt_out_tensor[0, 0, 0] - + if dim == 2 or dim == -2: + tt_out_tensor = tt_out_tensor[0, 0, 0, 0 : input_shapes[3]] + else: + tt_out_tensor = tt_out_tensor[0, 0, 0, 0 : input_shapes[2]] pt_out_tensor = golden_tensor comp_pass, comp_out = comparison_funcs.comp_pcc(pt_out_tensor, tt_out_tensor, pcc=0.99) comp_all, _ = comparison_funcs.comp_allclose(pt_out_tensor, tt_out_tensor, atol=4, rtol=1e-1) diff --git a/tt_eager/tt_numpy/functions.hpp b/tt_eager/tt_numpy/functions.hpp index 0cf2a51e6ffd..d17d341b3712 100644 --- a/tt_eager/tt_numpy/functions.hpp +++ b/tt_eager/tt_numpy/functions.hpp @@ -277,26 +277,28 @@ static Tensor index_width( Device* device = nullptr, const MemoryConfig& output_mem_config = MemoryConfig{ .memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}) { - auto owned_buffer = tt_metal::owned_buffer::create(tt_metal::compute_volume(shape)); + auto owned_buffer = tt_metal::owned_buffer::create(tt_metal::compute_volume(shape)); + std::fill(owned_buffer.begin(), owned_buffer.end(), -std::numeric_limits::infinity()); + auto& up_shape = shape.without_padding(); auto index = 0; auto value = 0; - auto rank = shape.rank(); + auto rank = up_shape.rank(); auto penultimate = rank - 2; auto ultimate = rank - 1; - auto offset = shape[penultimate] * shape[ultimate]; - auto iterations = 1; - for (int itr = 0; itr < rank - 2; itr++) iterations *= shape[itr]; - for (uint32_t itr = 0; itr < iterations; itr++) { - for (int32_t y = 0; y < shape[penultimate]; y++) { - for (int32_t x = 0; x < shape[ultimate]; x++) { - owned_buffer[index + y * shape[ultimate] + x] = T(static_cast(value)); - value = value + 1; - } // dim X 
- value = 0; - } // dim Y - index += offset; - } // dim W + for (uint32_t b = 0; b < up_shape[rank - 4]; b++) { + for (uint32_t c = 0; c < up_shape[rank - 3]; c++) { + for (uint32_t y = 0; y < up_shape[penultimate]; y++) { + for (uint32_t x = 0; x < up_shape[ultimate]; x++) { + owned_buffer[index++] = T(static_cast(value)); + value = value + 1; + } // dim W + value = 0; + index = index + (shape[ultimate] - up_shape[ultimate]); + }// dim H + index = index + ((shape[penultimate] - up_shape[penultimate]) * TILE_WIDTH); + } //dim c + } // dim N auto output = Tensor(OwnedStorage{owned_buffer}, shape, data_type, Layout::ROW_MAJOR).to(layout); if (device != nullptr) { output = output.to(device, output_mem_config); @@ -313,25 +315,26 @@ static Tensor index_height( const MemoryConfig& output_mem_config = MemoryConfig{ .memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}) { auto owned_buffer = tt_metal::owned_buffer::create(tt_metal::compute_volume(shape)); - + std::fill(owned_buffer.begin(), owned_buffer.end(), -std::numeric_limits::infinity()); + auto& up_shape = shape.without_padding(); auto index = 0; auto value = 0; - auto rank = shape.rank(); + auto rank = up_shape.rank(); auto penultimate = rank - 2; auto ultimate = rank - 1; - auto offset = shape[penultimate] * shape[ultimate]; - auto iterations = 1; - for (int itr = 0; itr < rank - 2; itr++) iterations *= shape[itr]; - for (uint32_t itr = 0; itr < iterations; itr++) { - for (int32_t y = 0; y < shape[penultimate]; y++) { - for (int32_t x = 0; x < shape[ultimate]; x++) { - owned_buffer[index + y * shape[ultimate] + x] = T(static_cast(value)); - } // dim X - value = value + 1; - } // dim Y - value = 0; - index += offset; - } // dim W + for (uint32_t b = 0; b < up_shape[rank - 4]; b++) { + for (uint32_t c = 0; c < up_shape[rank - 3]; c++) { + for (uint32_t y = 0; y < up_shape[penultimate]; y++) { + for (uint32_t x = 0; x < up_shape[ultimate]; x++) { + owned_buffer[index++] = T(static_cast(value)); + } // dim W + value = value + 1; + index = index + (shape[ultimate] - up_shape[ultimate]); + } // dim H + value = 0; + index = index + ((shape[penultimate] - up_shape[penultimate]) * TILE_WIDTH); + } // dim C + } // dim N auto output = Tensor(OwnedStorage{owned_buffer}, shape, data_type, Layout::ROW_MAJOR).to(layout); if (device != nullptr) { output = output.to(device, output_mem_config); @@ -363,8 +366,10 @@ static Tensor index_all( value = value + 1; } // dim W index = index + (shape[ultimate] - up_shape[ultimate]); - } // dim H + }// dim H + index = index + ((shape[penultimate] - up_shape[penultimate]) * TILE_WIDTH); } // dim C + //index = index + ((shape[rank - 3] - up_shape[rank - 3]) * TILE_WIDTH * TILE_HEIGHT); } // dim N auto output = Tensor(OwnedStorage{owned_buffer}, shape, data_type, Layout::ROW_MAJOR).to(layout); if (device != nullptr) { @@ -534,24 +539,28 @@ static Tensor index_channel( Device* device = nullptr, const MemoryConfig& output_mem_config = MemoryConfig{ .memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}) { - auto owned_buffer = tt_metal::owned_buffer::create(tt_metal::compute_volume(shape)); + auto owned_buffer = tt_metal::owned_buffer::create(tt_metal::compute_volume(shape)); + std::fill(owned_buffer.begin(), owned_buffer.end(), -std::numeric_limits::infinity()); + auto& up_shape = shape.without_padding(); auto index = 0; auto value = 0; - auto rank = shape.rank(); + auto rank = up_shape.rank(); auto penultimate = rank - 2; auto ultimate = rank - 1; - for (uint32_t b = 0; b < shape[rank - 4]; b++) { - 
for (uint32_t c = 0; c < shape[rank - 3]; c++) { - for (uint32_t y = 0; y < shape[penultimate]; y++) { - for (uint32_t x = 0; x < shape[ultimate]; x++) { + for (uint32_t b = 0; b < up_shape[rank - 4]; b++) { + for (uint32_t c = 0; c < up_shape[rank - 3]; c++) { + for (uint32_t y = 0; y < up_shape[penultimate]; y++) { + for (uint32_t x = 0; x < up_shape[ultimate]; x++) { owned_buffer[index++] = T(static_cast(value)); } // dim W + index = index + (shape[ultimate] - up_shape[ultimate]); } // dim H value = value + 1; - } // dim C + index = index + ((shape[penultimate] - up_shape[penultimate]) * TILE_WIDTH); + } // dim C value = 0; - } // dim N + } // dim N auto output = Tensor(OwnedStorage{owned_buffer}, shape, data_type, Layout::ROW_MAJOR).to(layout); if (device != nullptr) { output = output.to(device, output_mem_config); From fd6379bfc49c4634be5199840d10da93e4512378 Mon Sep 17 00:00:00 2001 From: umadevimcw Date: Tue, 14 May 2024 10:18:19 +0000 Subject: [PATCH 11/40] #7571: Update index module for dim 1, 0 and test files --- .../grayskull/test_argmax_padding.py | 8 ++----- .../op_library/composite/composite_ops.cpp | 12 +++++------ tt_eager/tt_numpy/functions.hpp | 21 ++++++++++--------- 3 files changed, 18 insertions(+), 23 deletions(-) diff --git a/tests/ttnn/python_api_testing/non_working_unit_tests/grayskull/test_argmax_padding.py b/tests/ttnn/python_api_testing/non_working_unit_tests/grayskull/test_argmax_padding.py index 7cc6eac11e32..fc52f8357071 100644 --- a/tests/ttnn/python_api_testing/non_working_unit_tests/grayskull/test_argmax_padding.py +++ b/tests/ttnn/python_api_testing/non_working_unit_tests/grayskull/test_argmax_padding.py @@ -19,13 +19,11 @@ (torch.Size([5, 4, 3, 20])), ), ) -@pytest.mark.parametrize("dim", (3, 2, -1, -2)) +@pytest.mark.parametrize("dim", (3, 2, 1, 0, -1, -2, -3, -4)) @pytest.mark.parametrize("all", (True, False)) class TestArgmax: def test_argmax(self, input_shapes, dim, all, device): torch.manual_seed(10) - - # input_data = torch.tensor([5,3,2,1,10,9,6,7,4,8]).reshape(input_shapes).bfloat16() input_data = torch.randn(input_shapes).bfloat16() input_tensor = ( tt_lib.tensor.Tensor(input_data, tt_lib.tensor.DataType.BFLOAT16) @@ -34,16 +32,14 @@ def test_argmax(self, input_shapes, dim, all, device): .to(device) ) tt_output_tensor_on_device = tt_lib.tensor.argmax(input_tensor, dim=dim, all=all) - tt_out_tensor = tt_output_tensor_on_device.cpu().to(tt_lib.tensor.Layout.ROW_MAJOR).to_torch() - # tt_out_tensor = tt_output_tensor_on_device.cpu().to(tt_lib.tensor.Layout.ROW_MAJOR) if all: golden_tensor = torch.argmax(input_data) tt_out_tensor = tt_out_tensor[0, 0, 0, 0] else: golden_tensor = torch.argmax(input_data, dim=dim) if dim == 1 or dim == -3 or dim == 0 or dim == -4: - tt_out_tensor = tt_out_tensor[0] + tt_out_tensor = tt_out_tensor[0, :, 0 : input_shapes[2], 0 : input_shapes[3]] else: if input_shapes[1] != 1 or input_shapes[0] != 1: if dim == 2 or dim == -2: diff --git a/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp b/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp index 7c7d01c684b1..47cd6b74086f 100644 --- a/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp +++ b/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp @@ -1570,7 +1570,6 @@ Tensor create_mask(const Tensor& input_a, const MemoryConfig& output_mem_config) { auto& padded_shape = input_a.get_legacy_shape(); auto& unpadded_shape = padded_shape.without_padding(); - if (padded_shape == unpadded_shape) return input_a; float t_inf = -std::numeric_limits::infinity(); @@ 
-1579,17 +1578,16 @@ Tensor create_mask(const Tensor& input_a, const MemoryConfig& output_mem_config) return masked_input; } // Argmax returns the index of maximum element in the tensor -Tensor _argmax(const Tensor& input, int64_t _dim, bool all, const MemoryConfig& output_mem_config) { - std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input}))}; +Tensor _argmax(const Tensor& input_t, int64_t _dim, bool all, const MemoryConfig& output_mem_config) { + std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_t}))}; operation::launch_with_autoformat( [_dim, all, output_mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { - const auto& input_a = input_tensors.at(0); - auto& input_shape = input_a.get_legacy_shape(); + const auto& input = input_tensors.at(0); + auto& input_shape = input.get_legacy_shape(); TT_FATAL(input_shape.rank() == 4, "supported for rank-4 tensors at this time"); Tensor input_a = create_mask(input, output_mem_config); - uint32_t dim = input_shape.get_normalized_index(_dim); int size = input_a.volume(); @@ -1687,7 +1685,7 @@ Tensor _argmax(const Tensor& input, int64_t _dim, bool all, const MemoryConfig& max_indices.deallocate(); result = global_min(result, output_mem_config); return {result}; - }, {input}, output_tensors); + }, {input_t}, output_tensors); return output_tensors.at(0); } diff --git a/tt_eager/tt_numpy/functions.hpp b/tt_eager/tt_numpy/functions.hpp index d17d341b3712..7b6018e6074b 100644 --- a/tt_eager/tt_numpy/functions.hpp +++ b/tt_eager/tt_numpy/functions.hpp @@ -369,7 +369,6 @@ static Tensor index_all( }// dim H index = index + ((shape[penultimate] - up_shape[penultimate]) * TILE_WIDTH); } // dim C - //index = index + ((shape[rank - 3] - up_shape[rank - 3]) * TILE_WIDTH * TILE_HEIGHT); } // dim N auto output = Tensor(OwnedStorage{owned_buffer}, shape, data_type, Layout::ROW_MAJOR).to(layout); if (device != nullptr) { @@ -390,7 +389,6 @@ static Tensor mask_padded_input( auto owned_buffer = tt_metal::owned_buffer::create(tt_metal::compute_volume(padded_shape)); auto index = 0; - //auto value = 0; auto rank = padded_shape.rank(); auto penultimate = rank - 2; auto ultimate = rank - 1; @@ -576,24 +574,27 @@ static Tensor index_batch( Device* device = nullptr, const MemoryConfig& output_mem_config = MemoryConfig{ .memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}) { - auto owned_buffer = tt_metal::owned_buffer::create(tt_metal::compute_volume(shape)); + auto owned_buffer = tt_metal::owned_buffer::create(tt_metal::compute_volume(shape)); + std::fill(owned_buffer.begin(), owned_buffer.end(), -std::numeric_limits::infinity()); + auto& up_shape = shape.without_padding(); auto index = 0; auto value = 0; - auto rank = shape.rank(); + auto rank = up_shape.rank(); auto penultimate = rank - 2; auto ultimate = rank - 1; - for (uint32_t b = 0; b < shape[rank - 4]; b++) { - for (uint32_t c = 0; c < shape[rank - 3]; c++) { - for (uint32_t y = 0; y < shape[penultimate]; y++) { - for (uint32_t x = 0; x < shape[ultimate]; x++) { + for (uint32_t b = 0; b < up_shape[rank - 4]; b++) { + for (uint32_t c = 0; c < up_shape[rank - 3]; c++) { + for (uint32_t y = 0; y < up_shape[penultimate]; y++) { + for (uint32_t x = 0; x < up_shape[ultimate]; x++) { owned_buffer[index++] = T(static_cast(value)); } // dim W + index = index + (shape[ultimate] - up_shape[ultimate]); } // dim H + index = index + ((shape[penultimate] - 
up_shape[penultimate]) * TILE_WIDTH); } // dim C value = value + 1; - } // dim N - + } // dim N auto output = Tensor(OwnedStorage{owned_buffer}, shape, data_type, Layout::ROW_MAJOR).to(layout); if (device != nullptr) { output = output.to(device, output_mem_config); From b3da88912dc6f0112ee2dbcc9dc8974160d486e6 Mon Sep 17 00:00:00 2001 From: umadevimcw Date: Thu, 16 May 2024 06:28:41 +0000 Subject: [PATCH 12/40] #7571: Fix build issue in WHB0 --- .../sweep_tests/pytests/tt_dnn}/test_argmax_padding.py | 0 tt_eager/tt_numpy/functions.hpp | 10 +++++----- 2 files changed, 5 insertions(+), 5 deletions(-) rename tests/{ttnn/python_api_testing/non_working_unit_tests/grayskull => tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn}/test_argmax_padding.py (100%) diff --git a/tests/ttnn/python_api_testing/non_working_unit_tests/grayskull/test_argmax_padding.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_argmax_padding.py similarity index 100% rename from tests/ttnn/python_api_testing/non_working_unit_tests/grayskull/test_argmax_padding.py rename to tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_argmax_padding.py diff --git a/tt_eager/tt_numpy/functions.hpp b/tt_eager/tt_numpy/functions.hpp index 7b6018e6074b..104ae536814c 100644 --- a/tt_eager/tt_numpy/functions.hpp +++ b/tt_eager/tt_numpy/functions.hpp @@ -296,7 +296,7 @@ static Tensor index_width( value = 0; index = index + (shape[ultimate] - up_shape[ultimate]); }// dim H - index = index + ((shape[penultimate] - up_shape[penultimate]) * TILE_WIDTH); + index = index + ((shape[penultimate] - up_shape[penultimate]) * tt::constants::TILE_WIDTH); } //dim c } // dim N auto output = Tensor(OwnedStorage{owned_buffer}, shape, data_type, Layout::ROW_MAJOR).to(layout); @@ -332,7 +332,7 @@ static Tensor index_height( index = index + (shape[ultimate] - up_shape[ultimate]); } // dim H value = 0; - index = index + ((shape[penultimate] - up_shape[penultimate]) * TILE_WIDTH); + index = index + ((shape[penultimate] - up_shape[penultimate]) * tt::constants::TILE_WIDTH); } // dim C } // dim N auto output = Tensor(OwnedStorage{owned_buffer}, shape, data_type, Layout::ROW_MAJOR).to(layout); @@ -367,7 +367,7 @@ static Tensor index_all( } // dim W index = index + (shape[ultimate] - up_shape[ultimate]); }// dim H - index = index + ((shape[penultimate] - up_shape[penultimate]) * TILE_WIDTH); + index = index + ((shape[penultimate] - up_shape[penultimate]) * tt::constants::TILE_WIDTH); } // dim C } // dim N auto output = Tensor(OwnedStorage{owned_buffer}, shape, data_type, Layout::ROW_MAJOR).to(layout); @@ -555,7 +555,7 @@ static Tensor index_channel( index = index + (shape[ultimate] - up_shape[ultimate]); } // dim H value = value + 1; - index = index + ((shape[penultimate] - up_shape[penultimate]) * TILE_WIDTH); + index = index + ((shape[penultimate] - up_shape[penultimate]) * tt::constants::TILE_WIDTH); } // dim C value = 0; } // dim N @@ -591,7 +591,7 @@ static Tensor index_batch( } // dim W index = index + (shape[ultimate] - up_shape[ultimate]); } // dim H - index = index + ((shape[penultimate] - up_shape[penultimate]) * TILE_WIDTH); + index = index + ((shape[penultimate] - up_shape[penultimate]) * tt::constants::TILE_WIDTH); } // dim C value = value + 1; } // dim N From 3874da6b28b04149371bbcadfb175524598a5a61 Mon Sep 17 00:00:00 2001 From: Akhmed Rakhmati Date: Fri, 17 May 2024 09:02:05 -0400 Subject: [PATCH 13/40] #5389: revert ef62d9aceff1dc0e3df77006e705b16ca6798b4e (#8588) --- ttnn/cpp/pybind11/operations/unary.hpp | 
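Patches 09 through 12 give all of the `index_*` generators the same padding-aware structure: pre-fill the buffer with `-inf`, iterate only the unpadded (`up_shape`) extent, and advance the write cursor past the padded tail of each row (`shape[ultimate] - up_shape[ultimate]`) and of each face (`(shape[penultimate] - up_shape[penultimate]) * tt::constants::TILE_WIDTH`). A 2-D sketch of that cursor arithmetic for the `index_width` case (the helper name is invented, and the face-level skip only exists in the 4-D version, so it is noted in a comment):

```cpp
#include <cstdint>
#include <limits>
#include <vector>

// Fill a flat row-major [padded_h, padded_w] buffer with the x coordinate
// of each logical element; padded positions keep the -inf fill value.
std::vector<float> index_width_2d(std::uint32_t padded_h, std::uint32_t padded_w,
                                  std::uint32_t logical_h, std::uint32_t logical_w) {
    std::vector<float> buf(std::size_t(padded_h) * padded_w,
                           -std::numeric_limits<float>::infinity());
    std::size_t index = 0;
    for (std::uint32_t y = 0; y < logical_h; ++y) {
        for (std::uint32_t x = 0; x < logical_w; ++x) {
            buf[index++] = float(x);  // index_width: the value resets per row
        }
        index += padded_w - logical_w;  // skip the padded tail of this row
    }
    // The 4-D version additionally advances the cursor by
    // (padded_h - logical_h) * TILE_WIDTH after each face, as in the
    // hunks above.
    return buf;
}
```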
180 +++-------- ttnn/cpp/ttnn/operations/transformer.hpp | 2 +- ttnn/cpp/ttnn/operations/unary.hpp | 99 +----- ttnn/ttnn/__init__.py | 116 +++---- ttnn/ttnn/operations/relational.py | 103 ++++++ ttnn/ttnn/operations/unary.py | 395 +++++++++++------------ 6 files changed, 407 insertions(+), 488 deletions(-) create mode 100644 ttnn/ttnn/operations/relational.py diff --git a/ttnn/cpp/pybind11/operations/unary.hpp b/ttnn/cpp/pybind11/operations/unary.hpp index 56881f6d4a54..f98b4c1aecb4 100644 --- a/ttnn/cpp/pybind11/operations/unary.hpp +++ b/ttnn/cpp/pybind11/operations/unary.hpp @@ -24,22 +24,22 @@ void bind_unary(py::module& module, const unary_operation_t& operation) { auto doc = fmt::format( R"doc({0}(input_tensor: ttnn.Tensor, *, memory_config: Optional[ttnn.MemoryConfig] = None) -> ttnn.Tensor - Applies {0} to :attr:`input_tensor` element-wise. + Applies {0} to :attr:`input_tensor` element-wise. - .. math:: - {0}(\\mathrm{{input\\_tensor}}_i) + .. math:: + {0}(\\mathrm{{input\\_tensor}}_i) - Args: - * :attr:`input_tensor` + Args: + * :attr:`input_tensor` - Keyword Args: - * :attr:`memory_config` (Optional[ttnn.MemoryConfig]): Memory configuration for the operation. + Keyword Args: + * :attr:`memory_config` (Optional[ttnn.MemoryConfig]): Memory configuration for the operation. - Example:: + Example:: - >>> tensor = ttnn.from_torch(torch.tensor((1, 2), dtype=torch.bfloat16), device=device) - >>> output = {1}(tensor) - )doc", + >>> tensor = ttnn.from_torch(torch.tensor((1, 2), dtype=torch.bfloat16), device=device) + >>> output = {1}(tensor) + )doc", operation.name(), operation.python_fully_qualified_name()); @@ -51,28 +51,36 @@ void bind_unary(py::module& module, const unary_operation_t& operation) { } template -void bind_unary_with_fast_and_approximate_mode(py::module& module, const unary_operation_t& operation) { +void bind_unary_with_bool_parameter_set_to_false_by_default(py::module& module, const unary_operation_t& operation) { + std::string parameter_description; + if (operation.name() == "exp") { + parameter_description = "Use fast and approximate mode"; + } else { + TT_THROW("Unknown name!"); + } + auto doc = fmt::format( - R"doc({0}(input_tensor: ttnn.Tensor, *, fast_and_approximate_mode: bool = False, memory_config: Optional[ttnn.MemoryConfig] = None) -> ttnn.Tensor + R"doc({0}(input_tensor: ttnn.Tensor, *, parameter: bool = False, memory_config: Optional[ttnn.MemoryConfig] = None) -> ttnn.Tensor - Applies {0} to :attr:`input_tensor` element-wise. + Applies {0} to :attr:`input_tensor` element-wise. - .. math:: - {0}(\\mathrm{{input\\_tensor}}_i) + .. math:: + {0}(\\mathrm{{input\\_tensor}}_i) - Args: - * :attr:`input_tensor` + Args: + * :attr:`input_tensor` - Keyword Args: - * :attr:`fast_and_approximate_mode` (bool): "Use fast and approximate mode". - * :attr:`memory_config` (Optional[ttnn.MemoryConfig]): Memory configuration for the operation. + Keyword Args: + * :attr:`parameter` (bool): {2}. + * :attr:`memory_config` (Optional[ttnn.MemoryConfig]): Memory configuration for the operation. 
- Example:: + Example:: - >>> tensor = ttnn.from_torch(torch.tensor((1, 2), dtype=torch.bfloat16), device=device) - >>> output = {1}(tensor, fast_and_approximate_mode=true) - )doc", + >>> tensor = ttnn.from_torch(torch.tensor((1, 2), dtype=torch.bfloat16), device=device) + >>> output = {1}(tensor, parameter=true) + )doc", operation.name(), + parameter_description, operation.python_fully_qualified_name()); bind_registered_operation( @@ -82,71 +90,32 @@ void bind_unary_with_fast_and_approximate_mode(py::module& module, const unary_o ttnn::pybind_arguments_t{ py::arg("input_tensor"), py::kw_only(), - py::arg("fast_and_approximate_mode") = false, + py::arg("parameter") = false, py::arg("memory_config") = std::nullopt}); } -template -void bind_unary_with_float_parameter( - py::module& module, - const unary_operation_t& operation, - const std::string& parameter_name, - const std::string& parameter_doc) { - auto doc = fmt::format( - R"doc({0}(input_tensor: ttnn.Tensor, *, fast_and_approximate_mode: bool = False, memory_config: Optional[ttnn.MemoryConfig] = None) -> ttnn.Tensor - - Applies {0} to :attr:`input_tensor` element-wise. - - .. math:: - {0}(\\mathrm{{input\\_tensor}}_i) - - Args: - * :attr:`input_tensor` - - Keyword Args: - * :attr:`{2}` (bool): {3}. - * :attr:`memory_config` (Optional[ttnn.MemoryConfig]): Memory configuration for the operation. - - Example:: - - >>> tensor = ttnn.from_torch(torch.tensor((1, 2), dtype=torch.bfloat16), device=device) - >>> output = {1}(tensor, {2}=true) - )doc", - operation.name(), - operation.python_fully_qualified_name(), - parameter_name, - parameter_doc); - - bind_registered_operation( - module, - operation, - doc, - ttnn::pybind_arguments_t{ - py::arg("input_tensor"), py::arg(parameter_name.c_str()), py::kw_only(), py::arg("memory_config") = std::nullopt}); -} - void bind_softplus(py::module& module) { auto doc = fmt::format( R"doc({0}(input_tensor: ttnn.Tensor, *, beta: float = 1.0, threshold: float = 20.0, memory_config: Optional[ttnn.MemoryConfig] = None) -> ttnn.Tensor - Applies {0} to :attr:`input_tensor` element-wise. + Applies {0} to :attr:`input_tensor` element-wise. - .. math:: - {0}(\\mathrm{{input\\_tensor}}_i) + .. math:: + {0}(\\mathrm{{input\\_tensor}}_i) - Args: - * :attr:`input_tensor` + Args: + * :attr:`input_tensor` - Keyword Args: - * :attr:`beta` (float): Scales the input before applying the Softplus function. By modifying beta, you can adjust the steepness of the function. A higher beta value makes the function steeper, approaching a hard threshold like the ReLU function for large values of beta - * :attr:`threshold` (float): Used to switch to a linear function for large values to improve numerical stability. This avoids issues with floating-point representation for very large values - * :attr:`memory_config` (Optional[ttnn.MemoryConfig]): Memory configuration for the operation. + Keyword Args: + * :attr:`beta` (float): Scales the input before applying the Softplus function. By modifying beta, you can adjust the steepness of the function. A higher beta value makes the function steeper, approaching a hard threshold like the ReLU function for large values of beta + * :attr:`threshold` (float): Used to switch to a linear function for large values to improve numerical stability. This avoids issues with floating-point representation for very large values + * :attr:`memory_config` (Optional[ttnn.MemoryConfig]): Memory configuration for the operation. 
- Example:: + Example:: - >>> tensor = ttnn.from_torch(torch.tensor((1, 2), dtype=torch.bfloat16), device=device) - >>> output = {1}(tensor, parameter=true) - )doc", + >>> tensor = ttnn.from_torch(torch.tensor((1, 2), dtype=torch.bfloat16), device=device) + >>> output = {1}(tensor, parameter=true) + )doc", ttnn::softplus.name(), ttnn::softplus.python_fully_qualified_name()); @@ -165,61 +134,8 @@ void bind_softplus(py::module& module) { } // namespace detail void py_module(py::module& module) { - detail::bind_unary(module, ttnn::abs); - detail::bind_unary(module, ttnn::acos); - detail::bind_unary(module, ttnn::asin); - detail::bind_unary(module, ttnn::atan); - detail::bind_unary(module, ttnn::cos); - detail::bind_unary(module, ttnn::erfinv); - detail::bind_unary(module, ttnn::exp2); - detail::bind_unary(module, ttnn::expm1); - detail::bind_unary(module, ttnn::eqz); - detail::bind_unary(module, ttnn::gez); - detail::bind_unary(module, ttnn::gtz); - detail::bind_unary(module, ttnn::i0); - detail::bind_unary(module, ttnn::isfinite); - detail::bind_unary(module, ttnn::isinf); - detail::bind_unary(module, ttnn::isnan); - detail::bind_unary(module, ttnn::isneginf); - detail::bind_unary(module, ttnn::isposinf); - detail::bind_unary(module, ttnn::lez); - detail::bind_unary(module, ttnn::log); - detail::bind_unary(module, ttnn::log10); - detail::bind_unary(module, ttnn::log2); - detail::bind_unary(module, ttnn::logical_not); - detail::bind_unary(module, ttnn::ltz); - detail::bind_unary(module, ttnn::neg); - detail::bind_unary(module, ttnn::nez); - detail::bind_unary(module, ttnn::reciprocal); - detail::bind_unary(module, ttnn::relu); - detail::bind_unary(module, ttnn::relu6); - detail::bind_unary(module, ttnn::sigmoid); - detail::bind_unary(module, ttnn::sign); - detail::bind_unary(module, ttnn::signbit); + detail::bind_unary_with_bool_parameter_set_to_false_by_default(module, ttnn::exp); detail::bind_unary(module, ttnn::silu); - detail::bind_unary(module, ttnn::sin); - detail::bind_unary(module, ttnn::sqrt); - detail::bind_unary(module, ttnn::square); - detail::bind_unary(module, ttnn::tan); - detail::bind_unary(module, ttnn::tanh); - - // Unaries with fast_and_approximate_mode - detail::bind_unary_with_fast_and_approximate_mode(module, ttnn::exp); - detail::bind_unary_with_fast_and_approximate_mode(module, ttnn::erf); - detail::bind_unary_with_fast_and_approximate_mode(module, ttnn::erfc); - detail::bind_unary_with_fast_and_approximate_mode(module, ttnn::gelu); - detail::bind_unary_with_fast_and_approximate_mode(module, ttnn::rsqrt); - - // Unaries with float parameter - detail::bind_unary_with_float_parameter(module, ttnn::elu, "alpha", "The alpha parameter for the ELU function"); - detail::bind_unary_with_float_parameter( - module, ttnn::heaviside, "value", "The value parameter for the Heaviside function"); - detail::bind_unary_with_float_parameter( - module, ttnn::leaky_relu, "slope", "The slope parameter for the Leaky ReLU function"); - // detail::bind_unary_with_float_parameter(module, ttnn::prelu, "weight", "The weight parameter for the PReLU - // function"); - - // Other unaries (composite operations) detail::bind_softplus(module); } diff --git a/ttnn/cpp/ttnn/operations/transformer.hpp b/ttnn/cpp/ttnn/operations/transformer.hpp index 1767e8f84d02..29a4a78541b0 100644 --- a/ttnn/cpp/ttnn/operations/transformer.hpp +++ b/ttnn/cpp/ttnn/operations/transformer.hpp @@ -260,7 +260,7 @@ struct AttentionSoftmax : public tt::operations::primary::Softmax { 
tt::operations::primary::transformers::SoftmaxDefaultProgramConfig{}, const std::optional causal_mask = false, const std::optional& memory_config = std::nullopt) { - float head_size = head_size_arg.has_value() ? 1.0f / std::sqrt(head_size_arg.value()) : 1.0f; + float head_size = head_size_arg.has_value() ? 1.0f / ::sqrt(head_size_arg.value()) : 1.0f; if constexpr (in_place) { TT_FATAL(attention_mask.has_value(), "Cannot apply divide by sqrt(head_size) using in-place version!"); } else { diff --git a/ttnn/cpp/ttnn/operations/unary.hpp b/ttnn/cpp/ttnn/operations/unary.hpp index 0561cc096582..fdaa0a2f27a7 100644 --- a/ttnn/cpp/ttnn/operations/unary.hpp +++ b/ttnn/cpp/ttnn/operations/unary.hpp @@ -67,8 +67,7 @@ struct Unary : public EltwiseUnary { } }; -template -struct UnaryWithFastAndApproximateMode : public EltwiseUnary { +struct Exp : public EltwiseUnary { static const std::array input_tensor_schemas() { return detail::input_tensor_schemas(); } template @@ -81,29 +80,14 @@ struct UnaryWithFastAndApproximateMode : public EltwiseUnary { const bool parameter = false, const std::optional& memory_config = std::nullopt) { return detail::execute( - input_tensor, {UnaryWithParam{unary_op_type, static_cast(parameter)}}, memory_config); - } -}; - -template -struct UnaryWithFloatParameter : public EltwiseUnary { - static const std::array input_tensor_schemas() { return detail::input_tensor_schemas(); } - - template - static auto input_tensors_to_validate(const Tensor& input_tensor, Args&&... args) { - return detail::input_tensors_to_validate(input_tensor, std::forward(args)...); - } - - static Tensor execute( - const Tensor& input_tensor, - const float parameter, - const std::optional& memory_config = std::nullopt) { - return detail::execute( - input_tensor, {UnaryWithParam{unary_op_type, static_cast(parameter)}}, memory_config); + input_tensor, + {UnaryWithParam{ + ttnn::operations::unary::UnaryOpType::EXP, static_cast(parameter)}}, + memory_config); } }; -struct Softplus { +struct Softplus : public EltwiseUnary { static const std::array input_tensor_schemas() { return detail::input_tensor_schemas(); } template @@ -126,72 +110,11 @@ struct Softplus { } // namespace unary } // namespace operations -#define REGISTER_UNARY_OPERATION(operation_name, operation_type) \ - constexpr auto operation_name = ttnn::register_operation< \ - ttnn::operations::unary::Unary>( \ - "ttnn::" #operation_name); - -#define REGISTER_UNARY_OPERATION_WITH_FAST_AND_APPROXIMATE_MODE(operation_name, operation_type) \ - constexpr auto operation_name = ttnn::register_operation>("ttnn::" #operation_name); - -#define REGISTER_UNARY_OPERATION_WITH_FLOAT_PARAMETER(operation_name, operation_type) \ - constexpr auto operation_name = ttnn::register_operation< \ - ttnn::operations::unary::UnaryWithFloatParameter>( \ - "ttnn::" #operation_name); - -REGISTER_UNARY_OPERATION(abs, ABS); -REGISTER_UNARY_OPERATION(acos, ACOS); -REGISTER_UNARY_OPERATION(asin, ASIN); -REGISTER_UNARY_OPERATION(atan, ATAN); -REGISTER_UNARY_OPERATION(cos, COS); -REGISTER_UNARY_OPERATION(erfinv, ERFINV); -REGISTER_UNARY_OPERATION(exp2, EXP2); -REGISTER_UNARY_OPERATION(expm1, EXPM1); -REGISTER_UNARY_OPERATION(eqz, EQZ); -REGISTER_UNARY_OPERATION(gez, GEZ); -REGISTER_UNARY_OPERATION(gtz, GTZ); -REGISTER_UNARY_OPERATION(i0, I0); -REGISTER_UNARY_OPERATION(isfinite, ISFINITE); -REGISTER_UNARY_OPERATION(isinf, ISINF); -REGISTER_UNARY_OPERATION(isnan, ISNAN); -REGISTER_UNARY_OPERATION(isneginf, ISNEGINF); -REGISTER_UNARY_OPERATION(isposinf, ISPOSINF); 
-REGISTER_UNARY_OPERATION(lez, LEZ); -REGISTER_UNARY_OPERATION(log, LOG); -REGISTER_UNARY_OPERATION(log10, LOG10); -REGISTER_UNARY_OPERATION(log2, LOG2); -REGISTER_UNARY_OPERATION(logical_not, LOGICAL_NOT_UNARY); -REGISTER_UNARY_OPERATION(ltz, LTZ); -REGISTER_UNARY_OPERATION(neg, NEG); -REGISTER_UNARY_OPERATION(nez, NEZ); -REGISTER_UNARY_OPERATION(reciprocal, RECIP); -REGISTER_UNARY_OPERATION(relu, RELU); -REGISTER_UNARY_OPERATION(relu6, RELU6); -REGISTER_UNARY_OPERATION(sigmoid, SIGMOID); -REGISTER_UNARY_OPERATION(sign, SIGN); -REGISTER_UNARY_OPERATION(signbit, SIGNBIT); -REGISTER_UNARY_OPERATION(silu, SILU); -REGISTER_UNARY_OPERATION(sin, SIN); -REGISTER_UNARY_OPERATION(sqrt, SQRT); -REGISTER_UNARY_OPERATION(square, SQUARE); -REGISTER_UNARY_OPERATION(tan, TAN); -REGISTER_UNARY_OPERATION(tanh, TANH); - -// Unaries with fast_and_approximate_mode -REGISTER_UNARY_OPERATION_WITH_FAST_AND_APPROXIMATE_MODE(exp, EXP); -REGISTER_UNARY_OPERATION_WITH_FAST_AND_APPROXIMATE_MODE(erf, ERF); -REGISTER_UNARY_OPERATION_WITH_FAST_AND_APPROXIMATE_MODE(erfc, ERFC); -REGISTER_UNARY_OPERATION_WITH_FAST_AND_APPROXIMATE_MODE(gelu, GELU); -REGISTER_UNARY_OPERATION_WITH_FAST_AND_APPROXIMATE_MODE(rsqrt, RSQRT); - -// Unaries with float parameter -REGISTER_UNARY_OPERATION_WITH_FLOAT_PARAMETER(elu, ELU); -REGISTER_UNARY_OPERATION_WITH_FLOAT_PARAMETER(heaviside, HEAVISIDE); -REGISTER_UNARY_OPERATION_WITH_FLOAT_PARAMETER(leaky_relu, LEAKY_RELU); -auto prelu = leaky_relu; // Alias for leaky_relu. TODO(#8544): implement PReLU properly - -// Other unaries (composite operations) +constexpr auto exp = ttnn::register_operation("ttnn::exp"); + constexpr auto softplus = ttnn::register_operation("ttnn::softplus"); +constexpr auto silu = + ttnn::register_operation>("ttnn::silu"); + } // namespace ttnn diff --git a/ttnn/ttnn/__init__.py b/ttnn/ttnn/__init__.py index 47ec5fc67cf4..ee9f70bb26ad 100644 --- a/ttnn/ttnn/__init__.py +++ b/ttnn/ttnn/__init__.py @@ -311,88 +311,81 @@ def manage_config(name, value): ) from ttnn.operations.unary import ( - abs, - acos, - acosh, + exp, + tanh, + gelu, + rsqrt, + relu, + silu, + log, + sin, + cos, + tan, asin, - asinh, + acos, atan, - atanh, - cbrt, - celu, - clip, - cos, + sinh, cosh, - deg2rad, - digamma, - elu, - eqz, - erf, - erfc, - erfinv, - exp, - exp2, - expm1, - glu, - gelu, - geglu, - gez, - gtz, - hardshrink, - hardsigmoid, - hardswish, - hardtanh, - heaviside, + asinh, + acosh, + atanh, + logical_not, + logit, + signbit, + softplus, i0, isfinite, isinf, isnan, isneginf, isposinf, - leaky_relu, - lez, - logical_not, - ltz, lgamma, - log, log10, log1p, log2, - log_sigmoid, - logit, - log_sigmoid, - mish, multigammaln, neg, - nez, - polygamma, - prelu, + abs, + cbrt, + deg2rad, + digamma, + erf, + erfc, + erfinv, + exp2, + expm1, rad2deg, reciprocal, - relu, - reglu, + sqrt, + square, + tril, + triu, + hardsigmoid, + hardswish, + hardtanh, + log_sigmoid, + mish, relu6, - rsqrt, sigmoid, sigmoid_accurate, sign, - signbit, - silu, - sin, - sinh, - softplus, - softshrink, softsign, - sqrt, - square, - swiglu, swish, - tan, - tanh, tanhshrink, + polygamma, + clip, + elu, + hardshrink, + heaviside, + leaky_relu, + prelu, + softshrink, threshold, - tril, - triu, + glu, + geglu, + reglu, + swiglu, + celu, ) from ttnn.operations.binary import ( @@ -438,6 +431,15 @@ def manage_config(name, value): lerp, ) +from ttnn.operations.relational import ( + gtz, + ltz, + gez, + lez, + nez, + eqz, +) + from ttnn.operations.normalization import ( softmax, layer_norm, diff --git 
a/ttnn/ttnn/operations/relational.py b/ttnn/ttnn/operations/relational.py
new file mode 100644
index 000000000000..616bfedc5c46
--- /dev/null
+++ b/ttnn/ttnn/operations/relational.py
@@ -0,0 +1,103 @@
+# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+import sys
+import math
+
+from typing import Union
+
+import tt_lib as ttl
+
+import ttnn
+
+
+THIS_MODULE = sys.modules[__name__]
+
+__all__ = []
+
+
+def register_ttl_relational_function_zero(name, ttl_relational_function, op_function):
+    def _golden_function(input_tensor: ttnn.Tensor, **_):
+        import torch
+
+        name_to_golden_function_function = {
+            "gtz": torch.gt,
+            "ltz": torch.lt,
+            "gez": torch.ge,
+            "lez": torch.le,
+            "nez": torch.ne,
+            "eqz": torch.eq,
+        }
+        torch_function = name_to_golden_function_function[name]
+        return torch_function(input_tensor, 0)
+
+    def _relational_validate_input_tensors(operation_name, input_tensor, *args, **kwargs):
+        ttnn.validate_input_tensor(
+            operation_name,
+            input_tensor,
+            ranks=(2, 3, 4),
+            dtypes=(ttnn.bfloat16, ttnn.bfloat8_b),
+            layouts=(ttnn.TILE_LAYOUT,),
+            can_be_on_device=True,
+            can_be_on_cpu=False,
+        )
+
+    @ttnn.register_operation(
+        name=f"ttnn.{name}",
+        validate_input_tensors=_relational_validate_input_tensors,
+        golden_function=_golden_function,
+    )
+    def relational_function(
+        input_tensor: ttnn.Tensor, *, memory_config: ttnn.MemoryConfig = ttnn.DRAM_MEMORY_CONFIG
+    ) -> ttnn.Tensor:
+        if not isinstance(input_tensor, ttnn.Tensor):
+            raise TypeError("Expected first argument to be a ttnn.Tensor")
+
+        if not ttnn.is_tensor_storage_on_device(input_tensor):
+            raise RuntimeError("input_tensor must be on device!")
+
+        original_shape = input_tensor.shape
+        input_tensor = ttnn.unsqueeze_to_4D(input_tensor)
+
+        output_tensor = ttl_relational_function(input_tensor, output_mem_config=memory_config)
+        output_tensor = ttnn.reshape(output_tensor, original_shape)
+        return output_tensor
+
+    if isinstance(relational_function, ttnn.decorators.Operation):
+        relational_function.__name__ = f"ttnn.{name}"
+        relational_function.decorated_function.__doc__ = f"""{name}(input_tensor: ttnn.Tensor, *, memory_config: ttnn.MemoryConfig = ttnn.DRAM_MEMORY_CONFIG) -> ttnn.Tensor
+
+        Returns a tensor in which each element indicates whether the corresponding element of :attr:`input_tensor` is {op_function}.
+
+        ..
math:: + {name}(\\mathrm{{input\\_tensor}}_i) + + Args: + * :attr:`input_tensor` + + Example:: + + >>> tensor = ttnn.from_torch(torch.tensor((1, 2), dtype=torch.bfloat16), device=device) + >>> output = ttnn.{name}(tensor) + + """ + setattr(THIS_MODULE, name, relational_function) + + +def _is_scalar(value): + return isinstance(value, (int, float)) + + +TTL_RELATIONAL_FUNCTIONS_ZERO = [ + ("gtz", ttl.tensor.gtz, "greater than zero"), + ("ltz", ttl.tensor.ltz, "less than zero"), + ("gez", ttl.tensor.gez, "greater than or equal to zero"), + ("lez", ttl.tensor.lez, "less than or equal to zero"), + ("nez", ttl.tensor.nez, "not equal to zero"), + ("eqz", ttl.tensor.eqz, "equal to zero"), +] + + +for relational_function_name, ttl_relational_function, name in TTL_RELATIONAL_FUNCTIONS_ZERO: + register_ttl_relational_function_zero(relational_function_name, ttl_relational_function, name) diff --git a/ttnn/ttnn/operations/unary.py b/ttnn/ttnn/operations/unary.py index 32259cda06ba..d168c9cd0eab 100644 --- a/ttnn/ttnn/operations/unary.py +++ b/ttnn/ttnn/operations/unary.py @@ -12,146 +12,6 @@ THIS_MODULE = sys.modules[__name__] -def register_ttnn_cpp_unary_function(unary_function): - import torch - - def torch_heaviside(x, *args, **kwargs): - value = kwargs.pop("scalar") - result = torch.heaviside(x, torch.tensor(value, dtype=x.dtype)) - return result - - def torch_prelu(x, *args, **kwargs): - weight = kwargs.pop("scalar") - result = torch.nn.functional.prelu(x, torch.tensor(weight, dtype=x.dtype)) - return result - - name_to_golden_function = { - "abs": torch.abs, - "acos": torch.acos, - "asin": torch.asin, - "atan": torch.atan, - "cos": torch.cos, - "erfinv": torch.erfinv, - "exp2": torch.exp2, - "expm1": torch.expm1, - "eqz": lambda x: torch.eq(x, 0), - "gez": lambda x: torch.ge(x, 0), - "gtz": lambda x: torch.gt(x, 0), - "i0": torch.i0, - "isfinite": torch.isfinite, - "isinf": torch.inf, - "isnan": torch.isnan, - "isneginf": torch.isneginf, - "isposinf": torch.isposinf, - "lez": lambda x: torch.le(x, 0), - "log": torch.log, - "log10": torch.log10, - "log2": torch.log2, - "logical_not": torch.logical_not, - "ltz": lambda x: torch.lt(x, 0), - "neg": torch.neg, - "nez": lambda x: torch.ne(x, 0), - "reciprocal": torch.reciprocal, - "relu": torch.relu, - "relu6": torch.nn.functional.relu6, - "sigmoid": torch.sigmoid, - "sign": torch.sign, - "signbit": torch.signbit, - "silu": torch.nn.functional.silu, - "sin": torch.sin, - "sqrt": torch.sqrt, - "square": torch.square, - "tan": torch.tan, - "tanh": torch.tanh, - # Unaries with fast_and_approximate_mode - "exp": torch.exp, - "erf": torch.erf, - "erfc": torch.erfc, - "gelu": torch.nn.functional.gelu, - "rsqrt": torch.rsqrt, - # Unaries with float parameter - "elu": torch.nn.functional.elu, - "heaviside": torch_heaviside, - "leaky_relu": torch.nn.functional.leaky_relu, - # "prelu": torch_prelu, # Alias for leaky_relu. 
TODO(#8544): implement PReLU properly - # Other unaries (composite operations) - "softplus": torch.nn.functional.softplus, - } - - golden_keys = set(name_to_golden_function.keys()) - function_names = {function.name for function in TTNN_ELTWISE_UNARY_CPP_FUNCTIONS} - if golden_keys != function_names: - raise ImportError(f"Missing or extra golden functions:\n{golden_keys}\nshould be equal to\n{function_names}") - - def _golden_function(input_tensor: ttnn.Tensor, **_): - torch_function = name_to_golden_function[unary_function.name] - return torch_function(input_tensor) - - operation = ttnn.register_operation(golden_function=_golden_function)(unary_function) - setattr(THIS_MODULE, unary_function.name, operation) - - -TTNN_ELTWISE_UNARY_CPP_FUNCTIONS = [ - ttnn._ttnn.operations.unary.abs, - ttnn._ttnn.operations.unary.acos, - ttnn._ttnn.operations.unary.asin, - ttnn._ttnn.operations.unary.atan, - ttnn._ttnn.operations.unary.cos, - ttnn._ttnn.operations.unary.erfinv, - ttnn._ttnn.operations.unary.exp2, - ttnn._ttnn.operations.unary.expm1, - ttnn._ttnn.operations.unary.eqz, - ttnn._ttnn.operations.unary.gez, - ttnn._ttnn.operations.unary.gtz, - ttnn._ttnn.operations.unary.i0, - ttnn._ttnn.operations.unary.isfinite, - ttnn._ttnn.operations.unary.isinf, - ttnn._ttnn.operations.unary.isnan, - ttnn._ttnn.operations.unary.isneginf, - ttnn._ttnn.operations.unary.isposinf, - ttnn._ttnn.operations.unary.lez, - ttnn._ttnn.operations.unary.log, - ttnn._ttnn.operations.unary.log10, - ttnn._ttnn.operations.unary.log2, - ttnn._ttnn.operations.unary.logical_not, - ttnn._ttnn.operations.unary.ltz, - ttnn._ttnn.operations.unary.neg, - ttnn._ttnn.operations.unary.nez, - ttnn._ttnn.operations.unary.reciprocal, - ttnn._ttnn.operations.unary.relu, - ttnn._ttnn.operations.unary.relu6, - ttnn._ttnn.operations.unary.sigmoid, - ttnn._ttnn.operations.unary.sign, - ttnn._ttnn.operations.unary.signbit, - ttnn._ttnn.operations.unary.silu, - ttnn._ttnn.operations.unary.sin, - ttnn._ttnn.operations.unary.sqrt, - ttnn._ttnn.operations.unary.square, - ttnn._ttnn.operations.unary.tan, - ttnn._ttnn.operations.unary.tanh, - # Unaries with fast_and_approximate_mode - ttnn._ttnn.operations.unary.exp, - ttnn._ttnn.operations.unary.erf, - ttnn._ttnn.operations.unary.erfc, - ttnn._ttnn.operations.unary.gelu, - ttnn._ttnn.operations.unary.rsqrt, - # Unaries with float parameter - ttnn._ttnn.operations.unary.elu, - ttnn._ttnn.operations.unary.heaviside, - ttnn._ttnn.operations.unary.leaky_relu, - # ttnn._ttnn.operations.unary.prelu, # Alias for leaky_relu. TODO(#8544): implement PReLU properly - # Other unaries (composite operations) - ttnn._ttnn.operations.unary.softplus, -] -for unary_function in TTNN_ELTWISE_UNARY_CPP_FUNCTIONS: - register_ttnn_cpp_unary_function(unary_function) - - -def prelu(*args, **kwargs): # Alias for leaky_relu. 
TODO(#8544): implement PReLU properly
-    leaky_relu = getattr(THIS_MODULE, "leaky_relu")
-    return leaky_relu(*args, **kwargs)
-
-
 def torch_cbrt(x, *args, **kwargs):
     import torch
 
@@ -170,41 +30,70 @@ def torch_multigammaln(x, *args, **kwargs):
 
 
 def register_ttl_unary_function(name, ttl_unary_function):
-    import torch
-
-    name_to_golden_function = {
-        "acosh": torch.acosh,
-        "asinh": torch.asinh,
-        "atanh": torch.atanh,
-        "cbrt": torch_cbrt,
-        "cosh": torch.cosh,
-        "deg2rad": torch.deg2rad,
-        "digamma": torch.digamma,
-        "hardswish": torch.nn.functional.hardswish,
-        "hardsigmoid": torch.nn.functional.hardsigmoid,
-        "hardtanh": torch.nn.functional.hardtanh,
-        "lgamma": torch.lgamma,
-        "log1p": torch.log1p,
-        "log_sigmoid": torch.nn.functional.logsigmoid,
-        "mish": lambda _x: torch.nn.functional.mish(_x.to(torch.float)),
-        "multigammaln": torch_multigammaln,
-        "rad2deg": torch.rad2deg,
-        "sigmoid_accurate": torch.sigmoid,
-        "sinh": torch.sinh,
-        "softsign": torch.nn.functional.softsign,
-        "swish": torch.nn.functional.hardswish,
-        "tanhshrink": ttl.tensor.tanhshrink,
-        "tril": torch.tril,
-        "triu": torch.triu,
-    }
-
-    golden_keys = set(name_to_golden_function.keys())
-    function_names = {name for name, _ in TTL_UNARY_FUNCTIONS}
-    if golden_keys != function_names:
-        raise ImportError(f"Missing or extra golden functions:\n{golden_keys}\nshould be equal to\n{function_names}")
-
     def _golden_function(input_tensor: ttnn.Tensor, **_):
-        torch_function = name_to_golden_function[name]
+        import torch
+
+        name_to_golden_function_function = {
+            "tanh": torch.tanh,
+            "gelu": torch.nn.functional.gelu,
+            "rsqrt": torch.rsqrt,
+            "relu": torch.relu,
+            "log": torch.log,
+            "sin": torch.sin,
+            "cos": torch.cos,
+            "tan": torch.tan,
+            "asin": torch.asin,
+            "acos": torch.acos,
+            "atan": torch.atan,
+            "sinh": torch.sinh,
+            "cosh": torch.cosh,
+            "asinh": torch.asinh,
+            "acosh": torch.acosh,
+            "atanh": torch.atanh,
+            "logical_not": torch.logical_not,
+            "signbit": torch.signbit,
+            "i0": torch.i0,
+            "isfinite": torch.isfinite,
+            "isinf": torch.isinf,
+            "isnan": torch.isnan,
+            "isneginf": torch.isneginf,
+            "isposinf": torch.isposinf,
+            "lgamma": torch.lgamma,
+            "log10": torch.log10,
+            "log1p": torch.log1p,
+            "log2": torch.log2,
+            "multigammaln": torch_multigammaln,
+            "neg": torch.neg,
+            "abs": torch.abs,
+            "cbrt": torch_cbrt,
+            "deg2rad": torch.deg2rad,
+            "digamma": torch.digamma,
+            "erf": torch.erf,
+            "erfc": torch.erfc,
+            "erfinv": torch.erfinv,
+            "exp2": torch.exp2,
+            "expm1": torch.expm1,
+            "rad2deg": torch.rad2deg,
+            "reciprocal": torch.reciprocal,
+            "sqrt": torch.sqrt,
+            "square": torch.square,
+            "tril": torch.tril,
+            "triu": torch.triu,
+            "hardsigmoid": torch.nn.functional.hardsigmoid,
+            "hardswish": torch.nn.functional.hardswish,
+            "hardtanh": torch.nn.functional.hardtanh,
+            "log_sigmoid": torch.nn.functional.logsigmoid,
+            "mish": lambda _x: torch.nn.functional.mish(_x.to(torch.float)),
+            "relu6": torch.nn.functional.relu6,
+            "sigmoid": torch.sigmoid,
+            "sigmoid_accurate": torch.sigmoid,
+            "sign": torch.sign,
+            "celu": torch.nn.functional.celu,
+            "softsign": torch.nn.functional.softsign,
+            "swish": torch.nn.functional.hardswish,
+            "softplus": torch.nn.functional.softplus,
+        }
+        torch_function = name_to_golden_function_function[name]
         return torch_function(input_tensor)
 
     def _unary_validate_input_tensors(operation_name, input_tensor, *args, **kwargs):
@@ -263,29 +152,66 @@ def unary_function(
 
 
 TTL_UNARY_FUNCTIONS = [
-    ("acosh", ttl.tensor.acosh),  # composite
-    ("asinh", ttl.tensor.asinh),  # composite
-    ("atanh",
ttl.tensor.atanh), # composite - ("cbrt", ttl.tensor.cbrt), # composite - ("cosh", ttl.tensor.cosh), # composite - ("deg2rad", ttl.tensor.deg2rad), # composite - ("digamma", ttl.tensor.digamma), # composite - ("hardswish", ttl.tensor.hardswish), # composite - ("hardsigmoid", ttl.tensor.hardsigmoid), # composite - ("hardtanh", ttl.tensor.hardtanh), # composite - ("lgamma", ttl.tensor.lgamma), # composite - ("log1p", ttl.tensor.log1p), # composite - ("log_sigmoid", ttl.tensor.log_sigmoid), # composite - ("mish", ttl.tensor.mish), # composite - ("multigammaln", ttl.tensor.multigammaln), # composite - ("rad2deg", ttl.tensor.rad2deg), # composite - ("sigmoid_accurate", ttl.tensor.sigmoid_accurate), # composite - ("sinh", ttl.tensor.sinh), # composite - ("softsign", ttl.tensor.softsign), # composite - ("swish", ttl.tensor.swish), # composite - ("tanhshrink", ttl.tensor.tanhshrink), # composite - ("tril", ttl.tensor.tril), # composite - ("triu", ttl.tensor.triu), # composite + ("tanh", ttl.tensor.tanh), + ("gelu", ttl.tensor.gelu), + ("relu", ttl.tensor.relu), + ("rsqrt", ttl.tensor.rsqrt), + ("log", ttl.tensor.log), + ("sin", ttl.tensor.sin), + ("cos", ttl.tensor.cos), + ("tan", ttl.tensor.tan), + ("asin", ttl.tensor.asin), + ("acos", ttl.tensor.acos), + ("atan", ttl.tensor.atan), + ("sinh", ttl.tensor.sinh), + ("cosh", ttl.tensor.cosh), + ("asinh", ttl.tensor.asinh), + ("acosh", ttl.tensor.acosh), + ("atanh", ttl.tensor.atanh), + ("logical_not", ttl.tensor.logical_not_unary), + ("signbit", ttl.tensor.signbit), + ("i0", ttl.tensor.i0), + ("isfinite", ttl.tensor.isfinite), + ("isinf", ttl.tensor.isinf), + ("isnan", ttl.tensor.isnan), + ("isneginf", ttl.tensor.isneginf), + ("isposinf", ttl.tensor.isposinf), + ("lgamma", ttl.tensor.lgamma), + ("log10", ttl.tensor.log10), + ("log1p", ttl.tensor.log1p), + ("log2", ttl.tensor.log2), + ( + "multigammaln", + ttl.tensor.multigammaln, + ), + ("neg", ttl.tensor.neg), + ("abs", ttl.tensor.abs), + ("cbrt", ttl.tensor.cbrt), + ("deg2rad", ttl.tensor.deg2rad), + ("digamma", ttl.tensor.digamma), + ("erf", ttl.tensor.erf), + ("erfc", ttl.tensor.erfc), + ("erfinv", ttl.tensor.erfinv), + ("exp2", ttl.tensor.exp2), + ("expm1", ttl.tensor.expm1), + ("rad2deg", ttl.tensor.rad2deg), + ("reciprocal", ttl.tensor.recip), + ("sqrt", ttl.tensor.sqrt), + ("square", ttl.tensor.square), + ("tril", ttl.tensor.tril), + ("triu", ttl.tensor.triu), + ("hardsigmoid", ttl.tensor.hardsigmoid), + ("hardswish", ttl.tensor.hardswish), + ("hardtanh", ttl.tensor.hardtanh), + ("log_sigmoid", ttl.tensor.log_sigmoid), + ("mish", ttl.tensor.mish), + ("relu6", ttl.tensor.relu6), + ("sigmoid", ttl.tensor.sigmoid), + ("sigmoid_accurate", ttl.tensor.sigmoid_accurate), + ("sign", ttl.tensor.sign), + ("softsign", ttl.tensor.softsign), + ("swish", ttl.tensor.swish), + ("tanhshrink", ttl.tensor.tanhshrink), ] @@ -301,11 +227,11 @@ def register_ttl_unary_function_with_float(name, ttl_unary_function, param): def _golden_function(input_tensor: ttnn.Tensor, parameter, **_): import torch - name_to_golden_function = { + name_to_golden_function_function = { "logit": torch.logit, "polygamma": torch.special.polygamma, } - torch_function = name_to_golden_function[name] + torch_function = name_to_golden_function_function[name] return torch_function(input_tensor, parameter) def _unary_validate_input_tensors(operation_name, input_tensor, *args, **kwargs): @@ -366,24 +292,69 @@ def unary_function( TTL_UNARY_FUNCTIONS_WITH_FLOAT_PARAM = [ - ("logit", ttl.tensor.logit, "eps"), # composite - ("polygamma", 
ttl.tensor.polygamma, "parameter"), # composite + ("logit", ttl.tensor.logit, "eps"), + ("polygamma", ttl.tensor.polygamma, "parameter"), ] for unary_function_name, ttl_unary_function, param in TTL_UNARY_FUNCTIONS_WITH_FLOAT_PARAM: register_ttl_unary_function_with_float(unary_function_name, ttl_unary_function, param) +def register_eltwise_unary_cpp_function(unary_function): + def _golden_function(input_tensor: ttnn.Tensor, **_): + import torch + + ttnn_function_to_golden_function = { + ttnn._ttnn.operations.unary.exp: torch.exp, + ttnn._ttnn.operations.unary.silu: torch.nn.functional.silu, + ttnn._ttnn.operations.unary.softplus: torch.nn.functional.softplus, + } + torch_function = ttnn_function_to_golden_function[unary_function] + return torch_function(input_tensor) + + operation = ttnn.register_operation(golden_function=_golden_function)(unary_function) + setattr(THIS_MODULE, unary_function.name, operation) + + +TTNN_ELTWISE_UNARY_CPP_FUNCTIONS = [ + ttnn._ttnn.operations.unary.exp, + ttnn._ttnn.operations.unary.silu, + ttnn._ttnn.operations.unary.softplus, +] +for unary_function in TTNN_ELTWISE_UNARY_CPP_FUNCTIONS: + register_eltwise_unary_cpp_function(unary_function) + + def _is_scalar(value): return isinstance(value, (int, float)) +def torch_heaviside(x, *args, **kwargs): + import torch + + value = kwargs.pop("scalar") + result = torch.heaviside(x, torch.tensor(value, dtype=x.dtype)) + return result + + +def torch_prelu(x, *args, **kwargs): + import torch + + weight = kwargs.pop("scalar") + result = torch.nn.functional.prelu(x, torch.tensor(weight, dtype=x.dtype)) + return result + + def register_ttl_activation_function_with_float(name, ttl_activation_function, param): def _golden_function(input_tensor: ttnn.Tensor, parameter, **_): import torch name_to_torch_function = { "hardshrink": torch.nn.functional.hardshrink, + "heaviside": torch_heaviside, + "leaky_relu": torch.nn.functional.leaky_relu, + "prelu": torch_prelu, + "elu": torch.nn.functional.elu, "softshrink": torch.nn.functional.softshrink, "tanhshrink": torch.nn.functional.tanhshrink, } @@ -452,9 +423,13 @@ def activation_function( TTL_ACTIVATION_FUNCTIONS_WITH_FLOAT_PARAM = [ - ("hardshrink", ttl.tensor.hardshrink, "lambda"), # composite - ("celu", ttl.tensor.celu, "alpha"), # composite - ("softshrink", ttl.tensor.softshrink, "lambda"), # composite + ("hardshrink", ttl.tensor.hardshrink, "lambda"), + ("heaviside", ttl.tensor.heaviside, "value"), + ("leaky_relu", ttl.tensor.leaky_relu, "slope"), + ("prelu", ttl.tensor.prelu, "weight"), + ("elu", ttl.tensor.elu, "alpha"), + ("celu", ttl.tensor.celu, "alpha"), + ("softshrink", ttl.tensor.softshrink, "lambda"), ] for activation_function_name, ttl_activation_function, param in TTL_ACTIVATION_FUNCTIONS_WITH_FLOAT_PARAM: @@ -537,8 +512,8 @@ def activation_function( TTL_ACTIVATION_FUNCTIONS_WITH_TWO_FLOAT_PARAMS = [ - ("clip", ttl.tensor.clip, "min", "max"), # composite - ("threshold", ttl.tensor.threshold, "value", "threshold"), # composite + ("clip", ttl.tensor.clip, "min", "max"), + ("threshold", ttl.tensor.threshold, "value", "threshold"), ] for ( @@ -656,10 +631,10 @@ def activation_function( TTL_ACTIVATION_FUNCTIONS_GLU = [ - ("glu", ttl.tensor.glu, "dim"), # composite - ("reglu", ttl.tensor.reglu, "dim"), # composite - ("swiglu", ttl.tensor.swiglu, "dim"), # composite - ("geglu", ttl.tensor.geglu, "dim"), # composite + ("glu", ttl.tensor.glu, "dim"), + ("reglu", ttl.tensor.reglu, "dim"), + ("swiglu", ttl.tensor.swiglu, "dim"), + ("geglu", ttl.tensor.geglu, "dim"), ] From 
f1c31305df4232a9fd24aa417be777a7c3b1d618 Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Fri, 17 May 2024 03:10:40 +0000 Subject: [PATCH 14/40] #0: Clean up tt_dnn op enums --- tt_eager/tt_dnn/op_library/bcast/bcast_op.hpp | 6 ++-- tt_eager/tt_dnn/op_library/bmm/bmm_op.hpp | 19 +++++----- .../tt_dnn/op_library/concat/concat_op.hpp | 2 +- tt_eager/tt_dnn/op_library/conv/conv_op.hpp | 2 +- .../op_library/conv/optimized_conv_op.hpp | 2 +- tt_eager/tt_dnn/op_library/copy/copy_op.hpp | 2 +- .../eltwise_binary/eltwise_binary_op.hpp | 36 +++++++++---------- .../eltwise_unary/eltwise_unary_op.hpp | 10 +----- .../op_library/embeddings/embeddings_op.hpp | 4 +-- tt_eager/tt_dnn/op_library/fold/fold_op.hpp | 2 +- .../moreh_softmax/moreh_softmax_op.hpp | 18 +++++----- .../moreh_softmax_backward_op.hpp | 18 +++++----- tt_eager/tt_dnn/op_library/move/move_op.hpp | 2 +- .../tt_dnn/op_library/pool/average_pool.hpp | 2 +- .../tt_dnn/op_library/reduce/reduce_op.hpp | 6 ++-- .../tt_dnn/op_library/repeat/repeat_op.hpp | 2 +- .../rotary_embedding/rotary_embedding_op.hpp | 2 +- .../op_library/rotate_half/rotate_half_op.hpp | 2 +- tt_eager/tt_dnn/op_library/scan/scan_op.hpp | 2 +- .../tt_dnn/op_library/sharded/sharded_op.hpp | 2 +- .../sharded_partial/sharded_op_partial.hpp | 2 +- .../tt_dnn/op_library/tilize/tilize_op.hpp | 4 +-- .../op_library/transpose/transpose_op.hpp | 4 +-- tt_eager/tt_dnn/op_library/unpad/unpad_op.hpp | 2 +- .../op_library/untilize/untilize_op.hpp | 4 +-- .../update_cache/update_cache_op.hpp | 4 +-- .../op_library/upsample/upsample_op.hpp | 2 +- 27 files changed, 77 insertions(+), 86 deletions(-) diff --git a/tt_eager/tt_dnn/op_library/bcast/bcast_op.hpp b/tt_eager/tt_dnn/op_library/bcast/bcast_op.hpp index 0fb876f555eb..be53c3f3f586 100644 --- a/tt_eager/tt_dnn/op_library/bcast/bcast_op.hpp +++ b/tt_eager/tt_dnn/op_library/bcast/bcast_op.hpp @@ -14,12 +14,12 @@ namespace tt { namespace tt_metal { -enum class BcastOpMath { ADD = 0, SUB = 1, MUL = 2 }; +enum class BcastOpMath { ADD, SUB, MUL }; -enum class BcastOpDim { H = 0, W = 1, HW = 2 }; +enum class BcastOpDim { H, W, HW }; // TODO: Accept parallelization -enum class BcastOpParallelizationStrategy { MULTI_CORE_H = 0, MULTI_CORE_W = 1, MULTI_CORE_HW = 2, SINGLE_CORE = 3 }; +enum class BcastOpParallelizationStrategy { MULTI_CORE_H, MULTI_CORE_W, MULTI_CORE_HW, SINGLE_CORE }; operation::ProgramWithCallbacks bcast_single_core( const Tensor &input_tensor_a, diff --git a/tt_eager/tt_dnn/op_library/bmm/bmm_op.hpp b/tt_eager/tt_dnn/op_library/bmm/bmm_op.hpp index bad8eeaddfd0..2c25530f90f6 100644 --- a/tt_eager/tt_dnn/op_library/bmm/bmm_op.hpp +++ b/tt_eager/tt_dnn/op_library/bmm/bmm_op.hpp @@ -15,17 +15,16 @@ namespace tt { namespace tt_metal { -// TODO: Accept parallelization enum class MatmulParallelizationStrategy { - MULTI_CORE = 0, - MULTI_CORE_REUSE = 1, - MULTI_CORE_REUSE_PADDING = 2, - MULTI_CORE_REUSE_OPTIMIZED = 3, - MULTI_CORE_REUSE_MCAST_2D_OPTIMIZED = 4, - MULTI_CORE_REUSE_MCAST_2D_TRANSPOSED_OPTIMIZED = 5, - MULTI_CORE_REUSE_MCAST_1D_IN0_OPTIMIZED = 6, - MULTI_CORE_REUSE_MCAST_1D_IN1_OPTIMIZED = 7, - SINGLE_CORE = 8 + MULTI_CORE, + MULTI_CORE_REUSE, + MULTI_CORE_REUSE_PADDING, + MULTI_CORE_REUSE_OPTIMIZED, + MULTI_CORE_REUSE_MCAST_2D_OPTIMIZED, + MULTI_CORE_REUSE_MCAST_2D_TRANSPOSED_OPTIMIZED, + MULTI_CORE_REUSE_MCAST_1D_IN0_OPTIMIZED, + MULTI_CORE_REUSE_MCAST_1D_IN1_OPTIMIZED, + SINGLE_CORE }; diff --git a/tt_eager/tt_dnn/op_library/concat/concat_op.hpp b/tt_eager/tt_dnn/op_library/concat/concat_op.hpp index 
3124bb5ed1a5..bd44d8d8d7d6 100644 --- a/tt_eager/tt_dnn/op_library/concat/concat_op.hpp +++ b/tt_eager/tt_dnn/op_library/concat/concat_op.hpp @@ -11,7 +11,7 @@ namespace tt { namespace tt_metal { -enum class ConcatOpParallelizationStrategy { SINGLE_CORE = 0, MULTI_CORE = 1, SHARDED_MULTI_CORE = 2 }; +enum class ConcatOpParallelizationStrategy { SINGLE_CORE, MULTI_CORE, SHARDED_MULTI_CORE }; struct Concat { uint32_t dim; diff --git a/tt_eager/tt_dnn/op_library/conv/conv_op.hpp b/tt_eager/tt_dnn/op_library/conv/conv_op.hpp index 908c6f6c3341..961dbbf14d4e 100644 --- a/tt_eager/tt_dnn/op_library/conv/conv_op.hpp +++ b/tt_eager/tt_dnn/op_library/conv/conv_op.hpp @@ -13,7 +13,7 @@ namespace tt_metal { // TODO: Accept parallelization enum class ConvOpParallelizationStrategy { - MULTI_CORE = 0, MULTI_CORE_REUSE = 1, MULTI_CORE_REUSE_MCAST = 2, SINGLE_CORE = 3 + MULTI_CORE, MULTI_CORE_REUSE, MULTI_CORE_REUSE_MCAST, SINGLE_CORE }; struct Conv { diff --git a/tt_eager/tt_dnn/op_library/conv/optimized_conv_op.hpp b/tt_eager/tt_dnn/op_library/conv/optimized_conv_op.hpp index 9b602ebf10d6..0400917b38d7 100644 --- a/tt_eager/tt_dnn/op_library/conv/optimized_conv_op.hpp +++ b/tt_eager/tt_dnn/op_library/conv/optimized_conv_op.hpp @@ -14,7 +14,7 @@ namespace tt_metal { // TODO: Accept parallelization enum class OptimizedConvOpParallelizationStrategy { - MULTI_CORE = 0, MULTI_CORE_REUSE = 1, MULTI_CORE_REUSE_MCAST = 2, SINGLE_CORE = 3 + MULTI_CORE, MULTI_CORE_REUSE, MULTI_CORE_REUSE_MCAST, SINGLE_CORE }; struct OptimizedConvParallelizationConfig { diff --git a/tt_eager/tt_dnn/op_library/copy/copy_op.hpp b/tt_eager/tt_dnn/op_library/copy/copy_op.hpp index 7fd0ae54db46..92f03d19e04a 100644 --- a/tt_eager/tt_dnn/op_library/copy/copy_op.hpp +++ b/tt_eager/tt_dnn/op_library/copy/copy_op.hpp @@ -19,7 +19,7 @@ namespace tt { namespace tt_metal { enum class CopyOpParallelizationStrategy { - MULTI_CORE = 0, SINGLE_CORE = 1 + MULTI_CORE, SINGLE_CORE }; struct Copy { diff --git a/tt_eager/tt_dnn/op_library/eltwise_binary/eltwise_binary_op.hpp b/tt_eager/tt_dnn/op_library/eltwise_binary/eltwise_binary_op.hpp index 68f4ee08f8fb..c13a3a81670a 100644 --- a/tt_eager/tt_dnn/op_library/eltwise_binary/eltwise_binary_op.hpp +++ b/tt_eager/tt_dnn/op_library/eltwise_binary/eltwise_binary_op.hpp @@ -19,26 +19,26 @@ namespace tt_metal { enum class BinaryOpType { - ADD = 0, - SUB = 1, - MUL = 2, - GT = 3, - LT = 4, - LTE = 5, - GTE = 6, - EQ = 7, - NE = 8, - SQUARED_DIFFERENCE = 9, - BIAS_GELU = 10, - LOGADDEXP = 11, - LOGICAL_AND = 12, - LOGICAL_OR = 13, - LDEXP = 14, - LOGADDEXP2 = 15, - DIV_FAST = 16 + ADD, + SUB, + MUL, + GT, + LT, + LTE, + GTE, + EQ, + NE, + SQUARED_DIFFERENCE, + BIAS_GELU, + LOGADDEXP, + LOGICAL_AND, + LOGICAL_OR, + LDEXP, + LOGADDEXP2, + DIV_FAST }; -enum class BinaryOpParallelizationStrategy { MULTI_CORE = 0, SINGLE_CORE = 1 }; +enum class BinaryOpParallelizationStrategy { MULTI_CORE, SINGLE_CORE }; operation::ProgramWithCallbacks eltwise_binary_single_core( const Tensor &a, diff --git a/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.hpp b/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.hpp index f0ff9b15d306..80563193d13f 100644 --- a/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.hpp +++ b/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.hpp @@ -68,10 +68,6 @@ enum class UnaryOpType { SILU, IDENTITY, NEG, - ADD_UNARY, - SUB_UNARY, - MUL_UNARY, - DIV_UNARY, ADD_UNARY_SFPU, SUB_UNARY_SFPU, MUL_UNARY_SFPU, @@ -99,10 +95,6 @@ bool is_parametrized_type(T val) { case 
UnaryOpType::RSUB: case UnaryOpType::RDIV: case UnaryOpType::EXP: - case UnaryOpType::ADD_UNARY: - case UnaryOpType::SUB_UNARY: - case UnaryOpType::MUL_UNARY: - case UnaryOpType::DIV_UNARY: case UnaryOpType::ADD_UNARY_SFPU: case UnaryOpType::SUB_UNARY_SFPU: case UnaryOpType::MUL_UNARY_SFPU: @@ -165,7 +157,7 @@ inline UnaryWithParam string_to_unary_with_param(const std::string& name) { TT_THROW("Unknown unary op: " + name); } -enum class UnaryOpParallelizationStrategy { SINGLE_CORE = 0, MULTI_CORE = 1, SHARDED_MULTI_CORE = 2 }; +enum class UnaryOpParallelizationStrategy { SINGLE_CORE, MULTI_CORE, SHARDED_MULTI_CORE }; struct EltwiseUnary { const std::vector op_chain; diff --git a/tt_eager/tt_dnn/op_library/embeddings/embeddings_op.hpp b/tt_eager/tt_dnn/op_library/embeddings/embeddings_op.hpp index f18f663c0d42..f9bfb4c3a063 100644 --- a/tt_eager/tt_dnn/op_library/embeddings/embeddings_op.hpp +++ b/tt_eager/tt_dnn/op_library/embeddings/embeddings_op.hpp @@ -15,8 +15,8 @@ namespace tt { namespace tt_metal { -enum class EmbeddingsType { GENERIC = 0, PADDED = 1, BINARY = 2 }; -enum class EmbeddingsIndexType { UINT32 = 0, BFP16 = 1}; +enum class EmbeddingsType { GENERIC, PADDED, BINARY }; +enum class EmbeddingsIndexType { UINT32, BFP16}; struct Embeddings { const MemoryConfig output_mem_config; diff --git a/tt_eager/tt_dnn/op_library/fold/fold_op.hpp b/tt_eager/tt_dnn/op_library/fold/fold_op.hpp index 94e97cf110c4..7459345da769 100644 --- a/tt_eager/tt_dnn/op_library/fold/fold_op.hpp +++ b/tt_eager/tt_dnn/op_library/fold/fold_op.hpp @@ -11,7 +11,7 @@ namespace tt::tt_metal { -enum class FoldOpParallelizationStrategy { SINGLE_CORE = 0, SHARDED_MULTI_CORE = 1 }; +enum class FoldOpParallelizationStrategy { SINGLE_CORE, SHARDED_MULTI_CORE }; struct Fold { uint8_t stride_h; diff --git a/tt_eager/tt_dnn/op_library/moreh_softmax/moreh_softmax_op.hpp b/tt_eager/tt_dnn/op_library/moreh_softmax/moreh_softmax_op.hpp index 74d818ed1c22..2cadce0e39e1 100644 --- a/tt_eager/tt_dnn/op_library/moreh_softmax/moreh_softmax_op.hpp +++ b/tt_eager/tt_dnn/op_library/moreh_softmax/moreh_softmax_op.hpp @@ -18,18 +18,18 @@ namespace primary { using namespace tt_metal; enum class MorehSoftmaxOpParallelizationStrategy { - NONE = 0, - SMALL_W = 1, - SMALL_H = 2, - LARGE_W = 3, - LARGE_H = 4, - LARGE_C = 5, + NONE, + SMALL_W, + SMALL_H, + LARGE_W, + LARGE_H, + LARGE_C, }; enum class MorehSoftmaxOp { - SOFTMAX = 0, - SOFTMIN = 1, - LOGSOFTMAX = 2, + SOFTMAX, + SOFTMIN, + LOGSOFTMAX, }; bool is_moreh_softmax_w_small_available(const Tensor &tensor); diff --git a/tt_eager/tt_dnn/op_library/moreh_softmax_backward/moreh_softmax_backward_op.hpp b/tt_eager/tt_dnn/op_library/moreh_softmax_backward/moreh_softmax_backward_op.hpp index f82f77eac151..7c61dea7e118 100644 --- a/tt_eager/tt_dnn/op_library/moreh_softmax_backward/moreh_softmax_backward_op.hpp +++ b/tt_eager/tt_dnn/op_library/moreh_softmax_backward/moreh_softmax_backward_op.hpp @@ -17,18 +17,18 @@ namespace primary { using namespace tt_metal; enum class MorehSoftmaxBackwardOpParallelizationStrategy { - NONE = 0, - SMALL_W = 1, - SMALL_H = 2, - LARGE_W = 3, - LARGE_H = 4, - LARGE_C = 5 + NONE, + SMALL_W, + SMALL_H, + LARGE_W, + LARGE_H, + LARGE_C }; enum class MorehSoftmaxBackwardOp { - SOFTMAX = 0, - SOFTMIN = 1, - LOGSOFTMAX = 2, + SOFTMAX, + SOFTMIN, + LOGSOFTMAX, }; bool is_moreh_softmax_backward_w_small_available(const Tensor &tensor); diff --git a/tt_eager/tt_dnn/op_library/move/move_op.hpp b/tt_eager/tt_dnn/op_library/move/move_op.hpp index fbff9de5c5c9..7c02c1d0718d 
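Dropping the explicit = 0, = 1, ... initializers in these enums is behavior-preserving: the first enumerator of a scoped enum is implicitly zero and each subsequent one is one greater than its predecessor, so the underlying values, and anything serialized or logged from them, stay the same. A quick illustrative check; the static_asserts below are only a sketch and not part of the patch:

    enum class BcastOpMath { ADD, SUB, MUL };

    static_assert(static_cast<int>(BcastOpMath::ADD) == 0);
    static_assert(static_cast<int>(BcastOpMath::SUB) == 1);
    static_assert(static_cast<int>(BcastOpMath::MUL) == 2);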
100644 --- a/tt_eager/tt_dnn/op_library/move/move_op.hpp +++ b/tt_eager/tt_dnn/op_library/move/move_op.hpp @@ -26,7 +26,7 @@ namespace tt { namespace tt_metal { enum class MoveOpParallelizationStrategy { - MULTI_CORE = 0, SINGLE_CORE = 1, MULTI_CORE_OVERLAP = 2, MULTI_CORE_SHARDED = 3 + MULTI_CORE, SINGLE_CORE, MULTI_CORE_OVERLAP, MULTI_CORE_SHARDED }; struct Move { diff --git a/tt_eager/tt_dnn/op_library/pool/average_pool.hpp b/tt_eager/tt_dnn/op_library/pool/average_pool.hpp index 8be426cece97..93edb8da22be 100644 --- a/tt_eager/tt_dnn/op_library/pool/average_pool.hpp +++ b/tt_eager/tt_dnn/op_library/pool/average_pool.hpp @@ -13,7 +13,7 @@ namespace tt { namespace tt_metal { enum class PoolType { - AVG = 0 + AVG }; Tensor average_pool_2d(const Tensor& input, const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, const std::optional& output_dtype = std::nullopt); diff --git a/tt_eager/tt_dnn/op_library/reduce/reduce_op.hpp b/tt_eager/tt_dnn/op_library/reduce/reduce_op.hpp index 3a237cd7df50..f7a49ae440ef 100644 --- a/tt_eager/tt_dnn/op_library/reduce/reduce_op.hpp +++ b/tt_eager/tt_dnn/op_library/reduce/reduce_op.hpp @@ -13,15 +13,15 @@ namespace tt { namespace tt_metal { enum class ReduceOpMath { - SUM = 0, MAX = 1, MIN = 2 + SUM, MAX, MIN }; enum class ReduceOpDim { - H = 0, W = 1, HW = 2 + H, W, HW }; enum class ReduceOpParallelizationStrategy { - MULTI_CORE_H = 0, MULTI_CORE_W = 1, MULTI_CORE_HW = 2, SINGLE_CORE = 3 + MULTI_CORE_H, MULTI_CORE_W, MULTI_CORE_HW, SINGLE_CORE }; // TODO: Accept parallelization diff --git a/tt_eager/tt_dnn/op_library/repeat/repeat_op.hpp b/tt_eager/tt_dnn/op_library/repeat/repeat_op.hpp index 99b91b0edcb2..16eff985ff5b 100644 --- a/tt_eager/tt_dnn/op_library/repeat/repeat_op.hpp +++ b/tt_eager/tt_dnn/op_library/repeat/repeat_op.hpp @@ -11,7 +11,7 @@ namespace tt { namespace tt_metal { -enum class RepeatOpParallelizationStrategy { SINGLE_CORE = 0, MULTI_CORE = 1 }; +enum class RepeatOpParallelizationStrategy { SINGLE_CORE, MULTI_CORE }; struct Repeat { const uint32_t repeat_dim; diff --git a/tt_eager/tt_dnn/op_library/rotary_embedding/rotary_embedding_op.hpp b/tt_eager/tt_dnn/op_library/rotary_embedding/rotary_embedding_op.hpp index 8843a81aca7d..5bef302ff0a7 100644 --- a/tt_eager/tt_dnn/op_library/rotary_embedding/rotary_embedding_op.hpp +++ b/tt_eager/tt_dnn/op_library/rotary_embedding/rotary_embedding_op.hpp @@ -16,7 +16,7 @@ namespace tt { namespace tt_metal { -enum class RotaryEmbeddingOpParallelizationStrategy { MULTI_CORE = 0, SINGLE_CORE = 1 }; +enum class RotaryEmbeddingOpParallelizationStrategy { MULTI_CORE, SINGLE_CORE }; operation::ProgramWithCallbacks rotary_embedding_single_core( const Tensor &input, const Tensor &cos, const Tensor &sin, Tensor &output, std::optional token_idx, DeviceComputeKernelConfig compute_kernel_config); diff --git a/tt_eager/tt_dnn/op_library/rotate_half/rotate_half_op.hpp b/tt_eager/tt_dnn/op_library/rotate_half/rotate_half_op.hpp index 8250493bd71e..6f7e4b2435f9 100644 --- a/tt_eager/tt_dnn/op_library/rotate_half/rotate_half_op.hpp +++ b/tt_eager/tt_dnn/op_library/rotate_half/rotate_half_op.hpp @@ -16,7 +16,7 @@ namespace tt { namespace tt_metal { enum class RotateHalfOpParallelizationStrategy { - MULTI_CORE = 0, SINGLE_CORE = 1 + MULTI_CORE, SINGLE_CORE }; operation::ProgramWithCallbacks rotate_half_single_core(const Tensor &input_tensor, Tensor &output_tensor); diff --git a/tt_eager/tt_dnn/op_library/scan/scan_op.hpp b/tt_eager/tt_dnn/op_library/scan/scan_op.hpp index 
655a4cb7c645..cd1b99970e5e 100644 --- a/tt_eager/tt_dnn/op_library/scan/scan_op.hpp +++ b/tt_eager/tt_dnn/op_library/scan/scan_op.hpp @@ -11,7 +11,7 @@ namespace tt::tt_metal { -enum class ScanOpParallelizationStrategy { SHARDED_MULTI_CORE = 0 }; +enum class ScanOpParallelizationStrategy { SHARDED_MULTI_CORE }; enum class ScanOpDirection { ROWS, COLS, ROWS_REVERSED, COLS_REVERSED }; diff --git a/tt_eager/tt_dnn/op_library/sharded/sharded_op.hpp b/tt_eager/tt_dnn/op_library/sharded/sharded_op.hpp index cedb2ac34fc1..766c46c5f730 100644 --- a/tt_eager/tt_dnn/op_library/sharded/sharded_op.hpp +++ b/tt_eager/tt_dnn/op_library/sharded/sharded_op.hpp @@ -15,7 +15,7 @@ namespace tt { namespace tt_metal { -enum class ShardedOpParallelizationStrategy { MULTI_CORE = 0 }; +enum class ShardedOpParallelizationStrategy { MULTI_CORE }; enum class ShardedOpType { InterleavedToSharded, ShardedToInterleaved }; diff --git a/tt_eager/tt_dnn/op_library/sharded_partial/sharded_op_partial.hpp b/tt_eager/tt_dnn/op_library/sharded_partial/sharded_op_partial.hpp index 0143fc618582..60a7f001704d 100644 --- a/tt_eager/tt_dnn/op_library/sharded_partial/sharded_op_partial.hpp +++ b/tt_eager/tt_dnn/op_library/sharded_partial/sharded_op_partial.hpp @@ -17,7 +17,7 @@ namespace tt { namespace tt_metal { -enum class ShardedOpPartialParallelizationStrategy { MULTI_CORE = 0 }; +enum class ShardedOpPartialParallelizationStrategy { MULTI_CORE }; enum class ShardedOpPartialType { InterleavedToShardedPartial, ShardedToInterleavedPartial }; diff --git a/tt_eager/tt_dnn/op_library/tilize/tilize_op.hpp b/tt_eager/tt_dnn/op_library/tilize/tilize_op.hpp index 7ac9f827ee1e..f8a3733a8e17 100644 --- a/tt_eager/tt_dnn/op_library/tilize/tilize_op.hpp +++ b/tt_eager/tt_dnn/op_library/tilize/tilize_op.hpp @@ -11,7 +11,7 @@ namespace tt { namespace tt_metal { -enum class TilizeOpParallelizationStrategy { MULTI_CORE = 0, SINGLE_CORE = 1 }; +enum class TilizeOpParallelizationStrategy { MULTI_CORE, SINGLE_CORE }; struct Tilize { const MemoryConfig output_mem_config; @@ -32,7 +32,7 @@ struct Tilize { } }; -enum class TilizeWithValPaddingOpParallelizationStrategy { MULTI_CORE = 0, SINGLE_CORE = 1 }; +enum class TilizeWithValPaddingOpParallelizationStrategy { MULTI_CORE, SINGLE_CORE }; struct TilizeWithValPadding { const Shape output_tensor_shape; diff --git a/tt_eager/tt_dnn/op_library/transpose/transpose_op.hpp b/tt_eager/tt_dnn/op_library/transpose/transpose_op.hpp index 78d124d152a6..f07cbc9c52e5 100644 --- a/tt_eager/tt_dnn/op_library/transpose/transpose_op.hpp +++ b/tt_eager/tt_dnn/op_library/transpose/transpose_op.hpp @@ -12,11 +12,11 @@ namespace tt { namespace tt_metal { enum class TransposeOpDim { - WH = 0, HC = 1, CN = 2, NH = 3, NW = 4, CW = 5 + WH, HC, CN, NH, NW, CW }; enum class TransposeOpParallelizationStrategy { - MULTI_CORE_WH = 0, MULTI_CORE_HC = 1, MULTI_CORE_CN = 2, SINGLE_CORE = 3 + MULTI_CORE_WH, MULTI_CORE_HC, MULTI_CORE_CN, SINGLE_CORE }; struct Transpose { diff --git a/tt_eager/tt_dnn/op_library/unpad/unpad_op.hpp b/tt_eager/tt_dnn/op_library/unpad/unpad_op.hpp index b1b8d8f379b1..4a1de5200b3a 100644 --- a/tt_eager/tt_dnn/op_library/unpad/unpad_op.hpp +++ b/tt_eager/tt_dnn/op_library/unpad/unpad_op.hpp @@ -13,7 +13,7 @@ namespace tt { namespace tt_metal { enum class UnpadOpParallelizationStrategy { - MULTI_CORE = 0, SINGLE_CORE = 1 + MULTI_CORE, SINGLE_CORE }; uint32_t get_tiled_start_offset(const Tensor &input_tensor, const Shape &output_tensor_start); diff --git a/tt_eager/tt_dnn/op_library/untilize/untilize_op.hpp 
b/tt_eager/tt_dnn/op_library/untilize/untilize_op.hpp index 4fe808e2c835..08170215b9f3 100644 --- a/tt_eager/tt_dnn/op_library/untilize/untilize_op.hpp +++ b/tt_eager/tt_dnn/op_library/untilize/untilize_op.hpp @@ -15,7 +15,7 @@ namespace tt_metal { #define MAX_PACK_UNTILIZE_WIDTH 8 // pack untilize currently does not support > 8 width enum class UntilizeOpParallelizationStrategy { - MULTI_CORE = 0, SINGLE_CORE = 1 + MULTI_CORE, SINGLE_CORE }; struct Untilize { @@ -39,7 +39,7 @@ struct Untilize { }; enum class UntilizeWithUnpaddingOpParallelizationStrategy { - MULTI_CORE = 0, SINGLE_CORE = 1 + MULTI_CORE, SINGLE_CORE }; struct UntilizeWithUnpadding { diff --git a/tt_eager/tt_dnn/op_library/update_cache/update_cache_op.hpp b/tt_eager/tt_dnn/op_library/update_cache/update_cache_op.hpp index 79c4f26b77ce..1bf023e22069 100644 --- a/tt_eager/tt_dnn/op_library/update_cache/update_cache_op.hpp +++ b/tt_eager/tt_dnn/op_library/update_cache/update_cache_op.hpp @@ -17,11 +17,11 @@ namespace tt { namespace tt_metal { enum class UpdateCacheOpParallelizationStrategy { - MULTI_CORE = 0, SINGLE_CORE = 1 + MULTI_CORE, SINGLE_CORE }; enum class UpdateCacheOpType { - FILL = 0, UPDATE = 1 + FILL, UPDATE }; operation::ProgramWithCallbacks update_cache_multi_core(const Tensor& cache_tensor, const Tensor &input_tensor, const uint32_t update_idx, const uint32_t batch_offset, DeviceComputeKernelConfig compute_kernel_config); diff --git a/tt_eager/tt_dnn/op_library/upsample/upsample_op.hpp b/tt_eager/tt_dnn/op_library/upsample/upsample_op.hpp index 5581cb382107..92b77cf87450 100644 --- a/tt_eager/tt_dnn/op_library/upsample/upsample_op.hpp +++ b/tt_eager/tt_dnn/op_library/upsample/upsample_op.hpp @@ -11,7 +11,7 @@ namespace tt { namespace tt_metal { enum class UpSampleParallelizationStrategy { - MULTI_CORE = 0, SINGLE_CORE = 1 + MULTI_CORE, SINGLE_CORE }; struct UpSample{ From 8024d15b17d2e4c754a3d5948eb92d937c078f3f Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Fri, 17 May 2024 12:54:41 +0000 Subject: [PATCH 15/40] #0: Add mixed precision support for bcast --- .../multi_core_h/bcast_op_multi_core_h.cpp | 20 +++++++------ .../multi_core_hw/bcast_op_multi_core_hw.cpp | 28 +++++++++++-------- .../multi_core_w/bcast_op_multi_core_w.cpp | 20 +++++++------ .../single_core/bcast_op_single_core.cpp | 20 +++++++------ 4 files changed, 53 insertions(+), 35 deletions(-) diff --git a/tt_eager/tt_dnn/op_library/bcast/multi_core_h/bcast_op_multi_core_h.cpp b/tt_eager/tt_dnn/op_library/bcast/multi_core_h/bcast_op_multi_core_h.cpp index 16667d3598d9..3e15c7b3aa08 100644 --- a/tt_eager/tt_dnn/op_library/bcast/multi_core_h/bcast_op_multi_core_h.cpp +++ b/tt_eager/tt_dnn/op_library/bcast/multi_core_h/bcast_op_multi_core_h.cpp @@ -47,9 +47,13 @@ operation::ProgramWithCallbacks bcast_multi_core_h(const Tensor &a, const Tensor tt_metal::Device *device = a.device(); - tt::DataFormat cb_data_format = tt_metal::datatype_to_dataformat_converter(a.get_dtype()); + tt::DataFormat src0_cb_data_format = tt_metal::datatype_to_dataformat_converter(a.get_dtype()); + tt::DataFormat src1_cb_data_format = tt_metal::datatype_to_dataformat_converter(b.get_dtype()); + tt::DataFormat dst_cb_data_format = tt_metal::datatype_to_dataformat_converter(output.get_dtype()); - uint32_t single_tile_size = tt_metal::detail::TileSize(cb_data_format); + uint32_t src0_single_tile_size = tt_metal::detail::TileSize(src0_cb_data_format); + uint32_t src1_single_tile_size = tt_metal::detail::TileSize(src1_cb_data_format); + uint32_t dst_single_tile_size = 
tt_metal::detail::TileSize(dst_cb_data_format); auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); uint32_t num_cores_x = compute_with_storage_grid_size.x; @@ -69,19 +73,19 @@ operation::ProgramWithCallbacks bcast_multi_core_h(const Tensor &a, const Tensor uint32_t src0_cb_index = 0; uint32_t num_input_tiles = 2; - tt_metal::CircularBufferConfig src0_cb_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, cb_data_format}}) - .set_page_size(src0_cb_index, single_tile_size); + tt_metal::CircularBufferConfig src0_cb_config = tt_metal::CircularBufferConfig(num_input_tiles * src0_single_tile_size, {{src0_cb_index, src0_cb_data_format}}) + .set_page_size(src0_cb_index, src0_single_tile_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, all_device_cores, src0_cb_config); uint32_t src1_cb_index = 1; - tt_metal::CircularBufferConfig src1_cb_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src1_cb_index, cb_data_format}}) - .set_page_size(src1_cb_index, single_tile_size); + tt_metal::CircularBufferConfig src1_cb_config = tt_metal::CircularBufferConfig(num_input_tiles * src1_single_tile_size, {{src1_cb_index, src1_cb_data_format}}) + .set_page_size(src1_cb_index, src1_single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, all_device_cores, src1_cb_config); uint32_t output_cb_index = 16; // output operands start at index 16 uint32_t num_output_tiles = 2; - tt_metal::CircularBufferConfig output_cb_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{output_cb_index, cb_data_format}}) - .set_page_size(output_cb_index, single_tile_size); + tt_metal::CircularBufferConfig output_cb_config = tt_metal::CircularBufferConfig(num_output_tiles * dst_single_tile_size, {{output_cb_index, dst_cb_data_format}}) + .set_page_size(output_cb_index, dst_single_tile_size); auto cb_output = tt_metal::CreateCircularBuffer(program, all_device_cores, output_cb_config); bool src0_is_dram = src0_buffer->buffer_type() == tt_metal::BufferType::DRAM ? 
1 : 0; diff --git a/tt_eager/tt_dnn/op_library/bcast/multi_core_hw/bcast_op_multi_core_hw.cpp b/tt_eager/tt_dnn/op_library/bcast/multi_core_hw/bcast_op_multi_core_hw.cpp index e9e5b1565d01..16d9c543033f 100644 --- a/tt_eager/tt_dnn/op_library/bcast/multi_core_hw/bcast_op_multi_core_hw.cpp +++ b/tt_eager/tt_dnn/op_library/bcast/multi_core_hw/bcast_op_multi_core_hw.cpp @@ -58,9 +58,13 @@ operation::ProgramWithCallbacks bcast_multi_core_hw(const Tensor &a, const Tenso shard_spec = output.shard_spec().value(); } - tt::DataFormat cb_data_format = tt_metal::datatype_to_dataformat_converter(a.get_dtype()); + tt::DataFormat src0_cb_data_format = tt_metal::datatype_to_dataformat_converter(a.get_dtype()); + tt::DataFormat src1_cb_data_format = tt_metal::datatype_to_dataformat_converter(b.get_dtype()); + tt::DataFormat dst_cb_data_format = tt_metal::datatype_to_dataformat_converter(output.get_dtype()); - uint32_t single_tile_size = tt_metal::detail::TileSize(cb_data_format); + uint32_t src0_single_tile_size = tt_metal::detail::TileSize(src0_cb_data_format); + uint32_t src1_single_tile_size = tt_metal::detail::TileSize(src1_cb_data_format); + uint32_t dst_single_tile_size = tt_metal::detail::TileSize(dst_cb_data_format); auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); uint32_t num_cores_x = compute_with_storage_grid_size.x; @@ -91,22 +95,22 @@ operation::ProgramWithCallbacks bcast_multi_core_hw(const Tensor &a, const Tenso uint32_t num_input_tiles_cb0 = src0_sharded ? num_tiles_per_shard : num_input_tiles; - tt_metal::CircularBufferConfig src0_cb_config = tt_metal::CircularBufferConfig(num_input_tiles_cb0 * single_tile_size, {{src0_cb_index, cb_data_format}}) - .set_page_size(src0_cb_index, single_tile_size); + tt_metal::CircularBufferConfig src0_cb_config = tt_metal::CircularBufferConfig(num_input_tiles_cb0 * src0_single_tile_size, {{src0_cb_index, src0_cb_data_format}}) + .set_page_size(src0_cb_index, src0_single_tile_size); if (src0_sharded) { src0_cb_config = src0_cb_config.set_globally_allocated_address(*a.buffer()); } auto cb_src0 = tt_metal::CreateCircularBuffer(program, all_device_cores, src0_cb_config); uint32_t src1_cb_index = 1; - tt_metal::CircularBufferConfig src1_cb_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src1_cb_index, cb_data_format}}) - .set_page_size(src1_cb_index, single_tile_size); + tt_metal::CircularBufferConfig src1_cb_config = tt_metal::CircularBufferConfig(num_input_tiles * src1_single_tile_size, {{src1_cb_index, src1_cb_data_format}}) + .set_page_size(src1_cb_index, src1_single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, all_device_cores, src1_cb_config); uint32_t output_cb_index = 16; // output operands start at index 16 uint32_t num_output_tiles = output_sharded ? 
num_tiles_per_shard : 2; - tt_metal::CircularBufferConfig output_cb_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{output_cb_index, cb_data_format}}) - .set_page_size(output_cb_index, single_tile_size); + tt_metal::CircularBufferConfig output_cb_config = tt_metal::CircularBufferConfig(num_output_tiles * dst_single_tile_size, {{output_cb_index, dst_cb_data_format}}) + .set_page_size(output_cb_index, dst_single_tile_size); if (output_sharded) { output_cb_config = output_cb_config.set_globally_allocated_address(*output.buffer()); } @@ -211,7 +215,9 @@ operation::ProgramWithCallbacks bcast_multi_core_hw(const Tensor &a, const Tenso bcast_kernel_id, compute_with_storage_grid_size, cb_src0, - single_tile_size, + src0_single_tile_size, + src1_single_tile_size, + dst_single_tile_size, cb_output ] ( @@ -324,12 +330,12 @@ operation::ProgramWithCallbacks bcast_multi_core_hw(const Tensor &a, const Tenso if (src0_sharded) { UpdateDynamicCircularBufferAddress(program, cb_src0, *src_buffer_a); - UpdateCircularBufferTotalSize(program, cb_src0, num_tiles_per_core_group_1 * single_tile_size); + UpdateCircularBufferTotalSize(program, cb_src0, num_tiles_per_core_group_1 * src0_single_tile_size); } if (out_sharded) { UpdateDynamicCircularBufferAddress(program, cb_output, *dst_buffer); - UpdateCircularBufferTotalSize(program, cb_output, num_tiles_per_core_group_1 * single_tile_size); + UpdateCircularBufferTotalSize(program, cb_output, num_tiles_per_core_group_1 * dst_single_tile_size); } }; diff --git a/tt_eager/tt_dnn/op_library/bcast/multi_core_w/bcast_op_multi_core_w.cpp b/tt_eager/tt_dnn/op_library/bcast/multi_core_w/bcast_op_multi_core_w.cpp index 80bdf3e221f3..a617d247964d 100644 --- a/tt_eager/tt_dnn/op_library/bcast/multi_core_w/bcast_op_multi_core_w.cpp +++ b/tt_eager/tt_dnn/op_library/bcast/multi_core_w/bcast_op_multi_core_w.cpp @@ -47,9 +47,13 @@ operation::ProgramWithCallbacks bcast_multi_core_w(const Tensor &a, const Tensor tt_metal::Device *device = a.device(); - tt::DataFormat cb_data_format = tt_metal::datatype_to_dataformat_converter(a.get_dtype()); + tt::DataFormat src0_cb_data_format = tt_metal::datatype_to_dataformat_converter(a.get_dtype()); + tt::DataFormat src1_cb_data_format = tt_metal::datatype_to_dataformat_converter(b.get_dtype()); + tt::DataFormat dst_cb_data_format = tt_metal::datatype_to_dataformat_converter(output.get_dtype()); - uint32_t single_tile_size = tt_metal::detail::TileSize(cb_data_format); + uint32_t src0_single_tile_size = tt_metal::detail::TileSize(src0_cb_data_format); + uint32_t src1_single_tile_size = tt_metal::detail::TileSize(src1_cb_data_format); + uint32_t dst_single_tile_size = tt_metal::detail::TileSize(dst_cb_data_format); auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); uint32_t num_cores_x = compute_with_storage_grid_size.x; @@ -69,19 +73,19 @@ operation::ProgramWithCallbacks bcast_multi_core_w(const Tensor &a, const Tensor uint32_t src0_cb_index = 0; uint32_t num_input_tiles = 2; - tt_metal::CircularBufferConfig src0_cb_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, cb_data_format}}) - .set_page_size(src0_cb_index, single_tile_size); + tt_metal::CircularBufferConfig src0_cb_config = tt_metal::CircularBufferConfig(num_input_tiles * src0_single_tile_size, {{src0_cb_index, src0_cb_data_format}}) + .set_page_size(src0_cb_index, src0_single_tile_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, all_device_cores, src0_cb_config); uint32_t 
src1_cb_index = 1; - tt_metal::CircularBufferConfig src1_cb_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src1_cb_index, cb_data_format}}) - .set_page_size(src1_cb_index, single_tile_size); + tt_metal::CircularBufferConfig src1_cb_config = tt_metal::CircularBufferConfig(num_input_tiles * src1_single_tile_size, {{src1_cb_index, src1_cb_data_format}}) + .set_page_size(src1_cb_index, src1_single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, all_device_cores, src1_cb_config); uint32_t output_cb_index = 16; // output operands start at index 16 uint32_t num_output_tiles = 2; - tt_metal::CircularBufferConfig output_cb_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{output_cb_index, cb_data_format}}) - .set_page_size(output_cb_index, single_tile_size); + tt_metal::CircularBufferConfig output_cb_config = tt_metal::CircularBufferConfig(num_output_tiles * dst_single_tile_size, {{output_cb_index, dst_cb_data_format}}) + .set_page_size(output_cb_index, dst_single_tile_size); auto cb_output = tt_metal::CreateCircularBuffer(program, all_device_cores, output_cb_config); diff --git a/tt_eager/tt_dnn/op_library/bcast/single_core/bcast_op_single_core.cpp b/tt_eager/tt_dnn/op_library/bcast/single_core/bcast_op_single_core.cpp index ddaa195bea12..b27e1a11eff3 100644 --- a/tt_eager/tt_dnn/op_library/bcast/single_core/bcast_op_single_core.cpp +++ b/tt_eager/tt_dnn/op_library/bcast/single_core/bcast_op_single_core.cpp @@ -45,25 +45,29 @@ operation::ProgramWithCallbacks bcast_single_core(const Tensor &a, const Tensor // This should allocate a DRAM buffer on the device tt_metal::Device *device = a.device(); - tt::DataFormat cb_data_format = tt_metal::datatype_to_dataformat_converter(a.get_dtype()); + tt::DataFormat src0_cb_data_format = tt_metal::datatype_to_dataformat_converter(a.get_dtype()); + tt::DataFormat src1_cb_data_format = tt_metal::datatype_to_dataformat_converter(b.get_dtype()); + tt::DataFormat dst_cb_data_format = tt_metal::datatype_to_dataformat_converter(output.get_dtype()); - uint32_t single_tile_size = tt_metal::detail::TileSize(cb_data_format); + uint32_t src0_single_tile_size = tt_metal::detail::TileSize(src0_cb_data_format); + uint32_t src1_single_tile_size = tt_metal::detail::TileSize(src1_cb_data_format); + uint32_t dst_single_tile_size = tt_metal::detail::TileSize(dst_cb_data_format); uint32_t src0_cb_index = 0; uint32_t num_input_tiles = 2; - tt_metal::CircularBufferConfig src0_cb_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, cb_data_format}}) - .set_page_size(src0_cb_index, single_tile_size); + tt_metal::CircularBufferConfig src0_cb_config = tt_metal::CircularBufferConfig(num_input_tiles * src0_single_tile_size, {{src0_cb_index, src0_cb_data_format}}) + .set_page_size(src0_cb_index, src0_single_tile_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, src0_cb_config); uint32_t src1_cb_index = 1; - tt_metal::CircularBufferConfig src1_cb_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src1_cb_index, cb_data_format}}) - .set_page_size(src1_cb_index, single_tile_size); + tt_metal::CircularBufferConfig src1_cb_config = tt_metal::CircularBufferConfig(num_input_tiles * src1_single_tile_size, {{src1_cb_index, src1_cb_data_format}}) + .set_page_size(src1_cb_index, src1_single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, core, src1_cb_config); uint32_t output_cb_index = 16; // output operands start at 
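// The mixed-precision change applies one pattern uniformly across the bcast
// variants: every circular buffer is sized from its own operand's data format
// rather than from input a's format. Tile footprints differ between formats
// (a bfloat16 tile is 32 * 32 * 2 = 2048 bytes, while a bfp8_b tile is
// roughly half that once its shared-exponent sections are counted), so sizing
// all buffers from a single dtype under-allocates whenever b or the output
// uses a wider format. A condensed sketch of the per-operand derivation,
// assuming the converter and TileSize helpers used above behave as shown:
tt::DataFormat src1_fmt = tt_metal::datatype_to_dataformat_converter(b.get_dtype());
uint32_t src1_tile_bytes = tt_metal::detail::TileSize(src1_fmt);
tt_metal::CircularBufferConfig cb_config =
    tt_metal::CircularBufferConfig(num_input_tiles * src1_tile_bytes, {{src1_cb_index, src1_fmt}})
        .set_page_size(src1_cb_index, src1_tile_bytes);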
index 16 uint32_t num_output_tiles = 2; - tt_metal::CircularBufferConfig output_cb_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{output_cb_index, cb_data_format}}) - .set_page_size(output_cb_index, single_tile_size); + tt_metal::CircularBufferConfig output_cb_config = tt_metal::CircularBufferConfig(num_output_tiles * dst_single_tile_size, {{output_cb_index, dst_cb_data_format}}) + .set_page_size(output_cb_index, dst_single_tile_size); auto cb_output = tt_metal::CreateCircularBuffer(program, core, output_cb_config); uint32_t bnc1 = (bN*bC == 1); From 607faa1b2240281136f2eb719a3ae347fabf6e41 Mon Sep 17 00:00:00 2001 From: Tapasvi Patel Date: Thu, 16 May 2024 19:13:33 +0000 Subject: [PATCH 16/40] #8558: Refactor t3000, tg and tgg pipelines, workflows and run test scripts --- .../workflows/run-profiler-regression.yaml | 2 - ...o-end-demos.yaml => t3000-demo-tests.yaml} | 24 ++- ...requent.yaml => t3000-frequent-tests.yaml} | 30 ++-- ...odels.yaml => t3000-model-perf-tests.yaml} | 54 ++++--- .github/workflows/t3000-profiler-tests.yaml | 41 +++++ ...-unit-tests.yaml => t3000-unit-tests.yaml} | 25 ++-- .github/workflows/tg-unit-tests.yaml | 32 ++-- .github/workflows/tgg-unit-tests.yaml | 28 ++-- CODEOWNERS | 9 +- .../demos/falcon7b/tests/test_perf_falcon.py | 2 +- .../mixtral8x7b/tests/test_mixtral_perf.py | 2 +- pytest.ini | 2 +- .../multi_chip/run_end_to_end_demos.sh | 23 --- .../run_frequent_regressions_multi_device.sh | 35 ----- ...re_post_commit_regressions_multi_device.sh | 49 ------ .../multi_chip/run_unstable_multi_device.sh | 13 -- tests/scripts/run_performance.sh | 29 ---- tests/scripts/run_tests.sh | 141 ++++++++++-------- tests/scripts/t3000/run_t3000_demo_tests.sh | 46 ++++++ .../scripts/t3000/run_t3000_frequent_tests.sh | 121 +++++++++++++++ .../t3000/run_t3000_model_perf_tests.sh | 111 ++++++++++++++ tests/scripts/t3000/run_t3000_unit_tests.sh | 127 ++++++++++++++++ .../tg/run_pre_post_commit_regressions_tg.sh | 17 --- tests/scripts/tg/run_tg_unit_tests.sh | 28 ++++ .../run_pre_post_commit_regressions_tgg.sh | 17 --- tests/scripts/tgg/run_tgg_unit_tests.sh | 28 ++++ 26 files changed, 687 insertions(+), 349 deletions(-) rename .github/workflows/{multi-device-end-to-end-demos.yaml => t3000-demo-tests.yaml} (65%) rename .github/workflows/{multi-device-build-and-unit-tests-frequent.yaml => t3000-frequent-tests.yaml} (55%) rename .github/workflows/{multi-device-perf-models.yaml => t3000-model-perf-tests.yaml} (53%) create mode 100644 .github/workflows/t3000-profiler-tests.yaml rename .github/workflows/{multi-device-build-and-unit-tests.yaml => t3000-unit-tests.yaml} (62%) delete mode 100755 tests/scripts/multi_chip/run_end_to_end_demos.sh delete mode 100755 tests/scripts/multi_chip/run_frequent_regressions_multi_device.sh delete mode 100755 tests/scripts/multi_chip/run_pre_post_commit_regressions_multi_device.sh delete mode 100644 tests/scripts/multi_chip/run_unstable_multi_device.sh create mode 100755 tests/scripts/t3000/run_t3000_demo_tests.sh create mode 100755 tests/scripts/t3000/run_t3000_frequent_tests.sh create mode 100755 tests/scripts/t3000/run_t3000_model_perf_tests.sh create mode 100755 tests/scripts/t3000/run_t3000_unit_tests.sh delete mode 100755 tests/scripts/tg/run_pre_post_commit_regressions_tg.sh create mode 100755 tests/scripts/tg/run_tg_unit_tests.sh delete mode 100755 tests/scripts/tgg/run_pre_post_commit_regressions_tgg.sh create mode 100755 tests/scripts/tgg/run_tgg_unit_tests.sh diff --git 
a/.github/workflows/run-profiler-regression.yaml b/.github/workflows/run-profiler-regression.yaml index f975e54429e4..fbedb2b0d1f7 100644 --- a/.github/workflows/run-profiler-regression.yaml +++ b/.github/workflows/run-profiler-regression.yaml @@ -18,8 +18,6 @@ jobs: {arch: wormhole_b0, runs-on: ["wormhole_b0", "multi-chip-num-pcie-1", "multi-chip-num-chips-1"]}, # N300 {arch: wormhole_b0, runs-on: ["wormhole_b0", "multi-chip-num-pcie-1", "multi-chip-num-chips-2"]}, - # N300 2x4 - {name: "n300-2x4", arch: wormhole_b0, runs-on: ["wormhole_b0", "multi-chip-num-pcie-4", "multi-chip-num-chips-8"]}, ] env: TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} diff --git a/.github/workflows/multi-device-end-to-end-demos.yaml b/.github/workflows/t3000-demo-tests.yaml similarity index 65% rename from .github/workflows/multi-device-end-to-end-demos.yaml rename to .github/workflows/t3000-demo-tests.yaml index 11be7baeb77f..393b7859ecc0 100644 --- a/.github/workflows/multi-device-end-to-end-demos.yaml +++ b/.github/workflows/t3000-demo-tests.yaml @@ -1,9 +1,9 @@ -name: "Multi-chip demo tests" +name: "[T3K] T3000 demo tests" on: workflow_dispatch: schedule: - - cron: '0 0 * * *' # This cron schedule runs the workflow every day at 12am UTC + - cron: '0 0 * * 6' # This cron schedule runs the workflow every Saturday at 12am UTC jobs: build-artifact: @@ -11,28 +11,25 @@ jobs: with: arch: '["wormhole_b0"]' secrets: inherit - - multi-chip-unit-tests: + t3000-demo-tests: needs: build-artifact strategy: - # Do not fail-fast because we need to ensure all tests go to completion - # so we try not to get hanging machines fail-fast: false matrix: test-group: [ - # N300 2x4 { - name: "T3000 end to end demo tests", + name: "T3000 demo tests", arch: wormhole_b0, - runs-on: ["perf-t3000", "arch-wormhole_b0", "multi-chip-num-pcie-4", "multi-chip-num-chips-8"], - machine-type: "bare_metal", - cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type end_to_end_demos_multi_device --dispatch-mode ""' + runs-on: [arch-wormhole_b0, "config-t3000", "in-service", "runner-test", "bare-metal", "pipeline-functional"], + cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type demos_t3000_device --dispatch-mode ""' }, ] - name: ${{ matrix.test-group.name }} ${{ matrix.test-group.arch }} + name: ${{ matrix.test-group.name }} env: TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ matrix.test-group.arch }} + LOGURU_LEVEL: INFO + LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib environment: dev runs-on: ${{ matrix.test-group.runs-on }} steps: @@ -46,9 +43,10 @@ jobs: - name: Extract files run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar - uses: ./.github/actions/install-python-deps - - name: Run pre/post regression tests + - name: Run demo regression tests timeout-minutes: 180 run: | source ${{ github.workspace }}/python_env/bin/activate + cd $TT_METAL_HOME export PYTHONPATH=$TT_METAL_HOME ${{ matrix.test-group.cmd }} diff --git a/.github/workflows/multi-device-build-and-unit-tests-frequent.yaml b/.github/workflows/t3000-frequent-tests.yaml similarity index 55% rename from .github/workflows/multi-device-build-and-unit-tests-frequent.yaml rename to .github/workflows/t3000-frequent-tests.yaml index b108d0f2f674..f7b89b659887 100644 --- a/.github/workflows/multi-device-build-and-unit-tests-frequent.yaml +++ b/.github/workflows/t3000-frequent-tests.yaml @@ -1,4 +1,4 @@ -name: "Nightly multi-chip tests" +name: "[T3K] T3000 frequent tests" on: workflow_dispatch: @@ -11,23 +11,27 @@ jobs: with: arch: 
'["wormhole_b0"]' secrets: inherit - - multi-chip-nightly: + t3000-frequent-tests: needs: build-artifact strategy: - # Do not fail-fast because we need to ensure all tests go to completion - # so we try not to get hanging machines fail-fast: false matrix: - runner-info: [ - # N300 2x4 - {name: "n300-2x4", arch: wormhole_b0, runs-on: ["wormhole_b0", "multi-chip-num-pcie-4", "multi-chip-num-chips-8"]}, + test-group: [ + { + name: "T3000 frequent tests", + arch: wormhole_b0, + runs-on: [arch-wormhole_b0, "config-t3000", "in-service", "runner-test", "bare-metal", "pipeline-functional"], + cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_t3000_device --dispatch-mode ""' + }, ] + name: ${{ matrix.test-group.name }} env: TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} - ARCH_NAME: ${{ matrix.runner-info.arch }} + ARCH_NAME: ${{ matrix.test-group.arch }} + LOGURU_LEVEL: INFO + LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib environment: dev - runs-on: ${{ matrix.runner-info.runs-on }} + runs-on: ${{ matrix.test-group.runs-on }} steps: - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 - name: Set up dynamic env vars for build @@ -35,9 +39,9 @@ jobs: echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV - uses: actions/download-artifact@v4 with: - name: TTMetal_build_${{ matrix.runner-info.arch }} + name: TTMetal_build_${{ matrix.test-group.arch }} - name: Extract files - run: tar -xvf ttm_${{ matrix.runner-info.arch }}.tar + run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar - uses: ./.github/actions/install-python-deps - name: Run frequent regression tests timeout-minutes: 60 @@ -45,4 +49,4 @@ jobs: source ${{ github.workspace }}/python_env/bin/activate cd $TT_METAL_HOME export PYTHONPATH=$TT_METAL_HOME - ./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type frequent_multi_device --dispatch-mode "" + ${{ matrix.test-group.cmd }} diff --git a/.github/workflows/multi-device-perf-models.yaml b/.github/workflows/t3000-model-perf-tests.yaml similarity index 53% rename from .github/workflows/multi-device-perf-models.yaml rename to .github/workflows/t3000-model-perf-tests.yaml index 4a43bf1e4384..683158cbc62f 100644 --- a/.github/workflows/multi-device-perf-models.yaml +++ b/.github/workflows/t3000-model-perf-tests.yaml @@ -1,4 +1,4 @@ -name: "Multi-Nebula model perf regressions and output report" +name: "[T3K] T3000 model perf tests" on: workflow_dispatch: @@ -11,32 +11,38 @@ jobs: with: arch: '["wormhole_b0"]' secrets: inherit - - multi-device-models-perf: + t3000-model-perf-tests: needs: build-artifact strategy: - # Do not fail-fast because we need to ensure all tests go to completion - # so we try not to get hanging machines fail-fast: false matrix: - runner-info: [ - # N300 2x4 - # NOTE: Never use arch-wormhole_b0 tags, however we're using it here because this machine is used by devs during the day - # We don't want other CI runs to interrupt dev flows. 
However, we need to fix this once we have more 2x4 machines dedicated to CI - {name: "n300-2x4", arch: wormhole_b0, runs-on: ["perf-t3000", "arch-wormhole_b0", "multi-chip-num-pcie-4", "multi-chip-num-chips-8"], machine-type: "bare_metal"}, + test-group: [ + { + name: "T3000 LLM model perf tests", + model-type: "LLM", + arch: wormhole_b0, + runs-on: [arch-wormhole_b0, "config-t3000", "in-service", "runner-test", "bare-metal", "pipeline-perf"], + cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type llm_model_perf_t3000_device --dispatch-mode ""' + }, + { + name: "T3000 CNN model perf tests", + model-type: "CNN", + arch: wormhole_b0, + runs-on: [arch-wormhole_b0, "config-t3000", "in-service", "runner-test", "bare-metal", "pipeline-perf"], + cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type cnn_model_perf_t3000_device --dispatch-mode ""' + }, ] - model-type: [llm_javelin, cnn_javelin, other] - name: "${{ matrix.model-type }} ${{ matrix.runner-info.arch }}" + name: ${{ matrix.test-group.name }} env: TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} - ARCH_NAME: ${{ matrix.runner-info.arch }} + ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO - TTNN_CONFIG_OVERRIDES: '{"enable_fast_runtime_mode": true}' + LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib environment: dev - runs-on: ${{ matrix.runner-info.runs-on }} + runs-on: ${{ matrix.test-group.runs-on }} steps: - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 - - name: Enable Performance mode + - name: Enable performance mode run: | sudo cpupower frequency-set -g performance - name: Ensure weka mount is active @@ -50,15 +56,17 @@ echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV - uses: actions/download-artifact@v4 with: - name: TTMetal_build_${{ matrix.runner-info.arch }} + name: TTMetal_build_${{ matrix.test-group.arch }} - name: Extract files - run: tar -xvf ttm_${{ matrix.runner-info.arch }}.tar + run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar - uses: ./.github/actions/install-python-deps - - name: Run performance regressions + - name: Run model perf regression tests timeout-minutes: 60 run: | - source python_env/bin/activate - ./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type ${{ matrix.model-type }}_models_performance_${{ matrix.runner-info.machine-type }}_multi_device + source ${{ github.workspace }}/python_env/bin/activate + cd $TT_METAL_HOME + export PYTHONPATH=$TT_METAL_HOME + ${{ matrix.test-group.cmd }} - name: Check perf report exists id: check-perf-report if: ${{ !cancelled() }} @@ -71,9 +79,9 @@ if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }} uses: actions/upload-artifact@v4 with: - name: perf-report-csv-${{ matrix.model-type }}-${{ matrix.runner-info.arch }}-${{ matrix.runner-info.machine-type }} + name: perf-report-csv-${{ matrix.test-group.model-type }}-${{ matrix.test-group.arch }} path: "${{ steps.check-perf-report.outputs.perf_report_filename }}" - - name: Disable Performance mode + - name: Disable performance mode if: always() run: | sudo cpupower frequency-set -g ondemand diff --git a/.github/workflows/t3000-profiler-tests.yaml b/.github/workflows/t3000-profiler-tests.yaml new file mode 100644 index 000000000000..99942f933141 --- /dev/null +++ b/.github/workflows/t3000-profiler-tests.yaml @@ -0,0 +1,41 @@ +name: "[T3K] T3000 profiler tests" + +on: + workflow_dispatch: + workflow_call: + schedule: + - cron: "0 */8 * * *" # This cron schedule runs
the workflow every 8 hours + +jobs: + t3000-profiler-tests: + strategy: + fail-fast: false + matrix: + test-group: [ + { + name: "T3000 profiler tests", + arch: wormhole_b0, + runs-on: [arch-wormhole_b0, "config-t3000", "in-service", "runner-test", "bare-metal", "pipeline-perf"], + cmd: './tests/scripts/run_profiler_regressions.sh' + }, + ] + name: ${{ matrix.test-group.name }} + env: + TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} + ARCH_NAME: ${{ matrix.test-group.arch }} + LOGURU_LEVEL: INFO + LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib + environment: dev + runs-on: ${{ matrix.test-group.runs-on }} + steps: + - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 + - name: Set up dynamic env vars for build + run: | + echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV + - name: Build tt-metal and libs + run: | + ./scripts/build_scripts/build_with_profiler_opt.sh + - name: Run profiler regression tests + timeout-minutes: 30 + run: | + ./tests/scripts/run_profiler_regressions.sh diff --git a/.github/workflows/multi-device-build-and-unit-tests.yaml b/.github/workflows/t3000-unit-tests.yaml similarity index 62% rename from .github/workflows/multi-device-build-and-unit-tests.yaml rename to .github/workflows/t3000-unit-tests.yaml index 6c346077d983..935f4c93750e 100644 --- a/.github/workflows/multi-device-build-and-unit-tests.yaml +++ b/.github/workflows/t3000-unit-tests.yaml @@ -1,4 +1,4 @@ -name: "Multi-chip unit tests" +name: "[T3K] T3000 unit tests" on: workflow_dispatch: @@ -11,33 +11,25 @@ jobs: with: arch: '["wormhole_b0"]' secrets: inherit - - multi-chip-unit-tests: + t3000-unit-tests: needs: build-artifact strategy: - # Do not fail-fast because we need to ensure all tests go to completion - # so we try not to get hanging machines fail-fast: false matrix: test-group: [ - # N300 2x4 { name: "T3000 unit tests", arch: wormhole_b0, - runs-on: ["wormhole_b0", "multi-chip-num-pcie-4", "multi-chip-num-chips-8"], - cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type post_commit_multi_device --dispatch-mode ""' + runs-on: [arch-wormhole_b0, "config-t3000", "in-service", "runner-test", "bare-metal", "pipeline-functional"], + cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type unit_t3000_device --dispatch-mode ""' }, - # { - # name: "T3000 unstable tests", - # arch: wormhole_b0, - # runs-on: ["wormhole_b0", "multi-chip-num-pcie-4", "multi-chip-num-chips-8"], - # cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type post_commit_multi_device_unstable --dispatch-mode ""' - # }, ] - name: ${{ matrix.test-group.name }} ${{ matrix.test-group.arch }} + name: ${{ matrix.test-group.name }} env: TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ matrix.test-group.arch }} + LOGURU_LEVEL: INFO + LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib environment: dev runs-on: ${{ matrix.test-group.runs-on }} steps: @@ -51,9 +43,10 @@ jobs: - name: Extract files run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar - uses: ./.github/actions/install-python-deps - - name: Run pre/post regression tests + - name: Run unit regression tests timeout-minutes: 120 run: | source ${{ github.workspace }}/python_env/bin/activate + cd $TT_METAL_HOME export PYTHONPATH=$TT_METAL_HOME ${{ matrix.test-group.cmd }} diff --git a/.github/workflows/tg-unit-tests.yaml b/.github/workflows/tg-unit-tests.yaml index 89dcec33fd9f..12163a65d0d0 100644 --- a/.github/workflows/tg-unit-tests.yaml +++ b/.github/workflows/tg-unit-tests.yaml @@ -1,9 +1,6 @@ 
name: "[TG] TG unit tests" on: - push: - branches: - - galaxy/main schedule: - cron: '0 0 * * *' # Runs every day at 12am UTC workflow_dispatch: @@ -11,27 +8,30 @@ on: jobs: build-artifact: uses: ./.github/workflows/build-artifact.yaml + with: + arch: '["wormhole_b0"]' secrets: inherit TG-tests: + needs: build-artifact strategy: - # Do not fail-fast because we need to ensure all tests go to completion - # so we try not to get hanging machines fail-fast: false matrix: - runner-info: [ - # TG - {arch: wormhole_b0, runs-on: ["config-tg", "in-service"]}, - ] test-group: [ - {name: "TG Unit Tests", cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type post_commit_tg --dispatch-mode ""'}, + { + name: "TG unit tests", + arch: wormhole_b0, + runs-on: [arch-wormhole_b0, "config-tg", "in-service", "runner-test", "bare-metal", "pipeline-functional"], + cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type unit_tg_device --dispatch-mode ""' + }, ] - name: ${{ matrix.test-group.name }} ${{ matrix.runner-info.arch }} ${{ matrix.runner-info.name }} + name: ${{ matrix.test-group.name }} env: TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} - ARCH_NAME: ${{ matrix.runner-info.arch }} + ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib - runs-on: ${{ matrix.runner-info.runs-on }} + environment: dev + runs-on: ${{ matrix.test-group.runs-on }} steps: - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 - name: Set up dynamic env vars for build @@ -39,11 +39,11 @@ jobs: echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV - uses: actions/download-artifact@v4 with: - name: TTMetal_build_${{ matrix.runner-info.arch }} + name: TTMetal_build_${{ matrix.test-group.arch }} - name: Extract files - run: tar -xvf ttm_${{ matrix.runner-info.arch }}.tar + run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar - uses: ./.github/actions/install-python-deps - - name: ${{ matrix.test-group.name }} tests + - name: Run unit regression tests timeout-minutes: 45 run: | source ${{ github.workspace }}/python_env/bin/activate diff --git a/.github/workflows/tgg-unit-tests.yaml b/.github/workflows/tgg-unit-tests.yaml index eb3d6c23d182..5351b9ae824b 100644 --- a/.github/workflows/tgg-unit-tests.yaml +++ b/.github/workflows/tgg-unit-tests.yaml @@ -8,27 +8,29 @@ on: jobs: build-artifact: uses: ./.github/workflows/build-artifact.yaml + with: + arch: '["wormhole_b0"]' secrets: inherit TGG-tests: + needs: build-artifact strategy: - # Do not fail-fast because we need to ensure all tests go to completion - # so we try not to get hanging machines fail-fast: false matrix: - runner-info: [ - # TGG - {arch: wormhole_b0, runs-on: ["config-tgg", "in-service"]}, - ] test-group: [ - {name: "TGG Unit Tests", cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type post_commit_tgg --dispatch-mode ""'}, + { + name: "TGG unit tests", + arch: wormhole_b0, + runs-on: [arch-wormhole_b0, "config-tgg", "in-service", "runner-test", "bare-metal", "pipeline-functional"], + cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type unit_tgg_device --dispatch-mode ""' + }, ] - name: ${{ matrix.test-group.name }} ${{ matrix.runner-info.arch }} ${{ matrix.runner-info.name }} + name: ${{ matrix.test-group.name }} env: TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} - ARCH_NAME: ${{ matrix.runner-info.arch }} + ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib - runs-on: 
${{ matrix.runner-info.runs-on }} + runs-on: ${{ matrix.test-group.runs-on }} steps: - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 - name: Set up dynamic env vars for build @@ -36,11 +38,11 @@ jobs: echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV - uses: actions/download-artifact@v4 with: - name: TTMetal_build_${{ matrix.runner-info.arch }} + name: TTMetal_build_${{ matrix.test-group.arch }} - name: Extract files - run: tar -xvf ttm_${{ matrix.runner-info.arch }}.tar + run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar - uses: ./.github/actions/install-python-deps - - name: ${{ matrix.test-group.name }} tests + - name: Run unit regression tests timeout-minutes: 45 run: | source ${{ github.workspace }}/python_env/bin/activate diff --git a/CODEOWNERS b/CODEOWNERS index 4864b93c26a5..147b0e9a2463 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -2,6 +2,13 @@ # precedence. .github/ @tt-rkim +.github/t3000-unit-tests.yaml @tapspatel +.github/t3000-profiler-tests.yaml @tapspatel +.github/t3000-model-perf-tests.yaml @tapspatel +.github/t3000-frequent-tests.yaml @tapspatel +.github/t3000-demo-tests.yaml @tapspatel +.github/tg-unit-tests.yaml @tapspatel +.github/tgg-unit-tests.yaml @tapspatel /infra/ @tt-rkim @@ -39,7 +46,7 @@ tests/scripts/run_pre_post_commit_regressions_multi_device.sh @tt-rkim @aliuTT @ tests/scripts/run_pre_post_commit_regressions_fast_dispatch.sh @tt-rkim @TT-billteng @ttmchiou tests/scripts/run_models.sh @tt-rkim tests/scripts/nightly/ @tt-rkim @vtangTT -tests/scripts/multi_chip/ @tapspatel +tests/scripts/t3000/ @tapspatel tests/scripts/tg/ @tapspatel tests/scripts/tgg/ @tapspatel diff --git a/models/demos/falcon7b/tests/test_perf_falcon.py b/models/demos/falcon7b/tests/test_perf_falcon.py index 7e036c3b2391..0765ed994123 100644 --- a/models/demos/falcon7b/tests/test_perf_falcon.py +++ b/models/demos/falcon7b/tests/test_perf_falcon.py @@ -584,7 +584,7 @@ def test_perf_wh_bare_metal( async_mode, ) - @pytest.mark.models_performance_bare_metal_multi_device + @pytest.mark.model_perf_t3000 @pytest.mark.parametrize( "llm_mode, num_devices, num_layers, batch, seq_len, kv_cache_len, model_config_str, expected_output_pcc, expected_k_cache_pcc, expected_v_cache_pcc, expected_inference_time, async_mode", ( diff --git a/models/demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py b/models/demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py index 2fc1a67e1803..77cf593e60a4 100644 --- a/models/demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py +++ b/models/demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py @@ -36,7 +36,7 @@ def forward(self, x): return self.emb(x) -@pytest.mark.models_performance_bare_metal_multi_device +@pytest.mark.model_perf_t3000 @pytest.mark.parametrize( "generation_start_pos, expected_compile_time, expected_inference_time", ( diff --git a/pytest.ini b/pytest.ini index 10ee11deb4c3..b1f2bdbc22c0 100644 --- a/pytest.ini +++ b/pytest.ini @@ -13,4 +13,4 @@ markers = models_performance_bare_metal: mark model silicon tests for performance on bare metal models_performance_virtual_machine: mark model silicon tests for performance on virtual_machine models_device_performance_bare_metal: mark model silicon tests for device performance on bare metal - models_performance_bare_metal_multi_device: mark model silicon tests for performance on multi-chip bare metal + model_perf_t3000: mark model silicon tests for performance on t3000 bare metal diff --git a/tests/scripts/multi_chip/run_end_to_end_demos.sh 
b/tests/scripts/multi_chip/run_end_to_end_demos.sh deleted file mode 100755 index f15ee8d92563..000000000000 --- a/tests/scripts/multi_chip/run_end_to_end_demos.sh +++ /dev/null @@ -1,23 +0,0 @@ - -#/bin/bash - -set -eo pipefail - -if [[ -z "$TT_METAL_HOME" ]]; then - echo "Must provide TT_METAL_HOME in environment" 1>&2 - exit 1 -fi - -if [[ -z "$ARCH_NAME" ]]; then - echo "Must provide ARCH_NAME in environment" 1>&2 - exit 1 -fi - -cd $TT_METAL_HOME -export PYTHONPATH=$TT_METAL_HOME - -# Falcon40B prefill 60 layer end to end with 10 loops; we need 8x8 grid size -WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_60_layer_t3000_prefill_10_loops.py - -# Falcon40B end to end demo (prefill + decode) -WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_t3000_demo_5_loops.py diff --git a/tests/scripts/multi_chip/run_frequent_regressions_multi_device.sh b/tests/scripts/multi_chip/run_frequent_regressions_multi_device.sh deleted file mode 100755 index ac7e4fd1f128..000000000000 --- a/tests/scripts/multi_chip/run_frequent_regressions_multi_device.sh +++ /dev/null @@ -1,35 +0,0 @@ - -#/bin/bash - -set -eo pipefail - -if [[ -z "$TT_METAL_HOME" ]]; then - echo "Must provide TT_METAL_HOME in environment" 1>&2 - exit 1 -fi - -if [[ -z "$ARCH_NAME" ]]; then - echo "Must provide ARCH_NAME in environment" 1>&2 - exit 1 -fi - -cd $TT_METAL_HOME -export PYTHONPATH=$TT_METAL_HOME - -pytest tests/ttnn/unit_tests/test_multi_device.py - -pytest tests/tt_metal/microbenchmarks/ethernet/test_ethernet_bidirectional_bandwidth_microbenchmark.py -pytest tests/tt_metal/microbenchmarks/ethernet/test_ethernet_ring_latency_microbenchmark.py - -# Llama2_70b related cached files and tests (the test should parse env variables similar to these) -export LLAMA_CKPT_DIR=/mnt/MLPerf/tt_dnn-models/llama-2/llama-2-70b-repacked/ -export LLAMA_TOKENIZER_PATH=/mnt/MLPerf/tt_dnn-models/llama-2/tokenizer.model -export LLAMA_CACHE_PATH=/mnt/MLPerf/tt_dnn-models/llama-2/llama-data-cache/weights-cache-2 - -pytest models/demos/t3000/llama2_70b/tests/test_llama_mlp_t3000.py -pytest models/demos/t3000/llama2_70b/tests/test_llama_attention_t3000.py -pytest models/demos/t3000/llama2_70b/tests/test_llama_decoder_t3000.py -pytest models/demos/t3000/llama2_70b/tests/test_llama_model_t3000.py - -# Mistral8x7b 8 chip decode model test (env flags set inside the test) -pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_model.py::test_mixtral_model_inference[10-1-pcc] diff --git a/tests/scripts/multi_chip/run_pre_post_commit_regressions_multi_device.sh b/tests/scripts/multi_chip/run_pre_post_commit_regressions_multi_device.sh deleted file mode 100755 index a2081e36d58c..000000000000 --- a/tests/scripts/multi_chip/run_pre_post_commit_regressions_multi_device.sh +++ /dev/null @@ -1,49 +0,0 @@ - -#/bin/bash - -set -eo pipefail - -if [[ -z "$TT_METAL_HOME" ]]; then - echo "Must provide TT_METAL_HOME in environment" 1>&2 - exit 1 -fi - -if [[ -z "$ARCH_NAME" ]]; then - echo "Must provide ARCH_NAME in environment" 1>&2 - exit 1 -fi - -TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectSendAllConnectedChips" -TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsSendInterleavedBufferAllConnectedChips" -TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests 
--gtest_filter="DeviceFixture.EthKernelsDirectRingGatherAllChips" -TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsInterleavedRingGatherAllChips" - -TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueSingleCardFixture.*" -./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueMultiDeviceFixture.*" -./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="DPrintFixture.*:WatcherFixture.*" -pytest tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py -k post_commit - -# ttnn multi-chip apis unit tests -pytest tests/ttnn/unit_tests/test_multi_device.py - -# Falcon40b unit tests; prefill required 8x8 grids -WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_mlp.py -WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_attention.py -WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_decoder.py -WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_causallm.py -WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_1_layer_t3000.py - -# Mistral8x7b 8 chip decode tests (env flags set inside the tests) -pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_attention.py -pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_mlp.py -pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_rms_norm.py -pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_embedding.py -pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_moe.py -pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_decoder.py -pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_model.py::test_mixtral_model_inference[1-1-pcc] - -# Falcon7B data parallel tests -pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_mlp.py -pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_attention.py -pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_decoder.py -pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_causallm.py diff --git a/tests/scripts/multi_chip/run_unstable_multi_device.sh b/tests/scripts/multi_chip/run_unstable_multi_device.sh deleted file mode 100644 index bd08c570b9b1..000000000000 --- a/tests/scripts/multi_chip/run_unstable_multi_device.sh +++ /dev/null @@ -1,13 +0,0 @@ -#/bin/bash - -set -eo pipefail - -if [[ -z "$TT_METAL_HOME" ]]; then - echo "Must provide TT_METAL_HOME in environment" 1>&2 - exit 1 -fi - -if [[ -z "$ARCH_NAME" ]]; then - echo "Must provide ARCH_NAME in environment" 1>&2 - exit 1 -fi diff --git a/tests/scripts/run_performance.sh b/tests/scripts/run_performance.sh index 92b6e086ded3..0b0a0692c967 100755 --- a/tests/scripts/run_performance.sh +++ b/tests/scripts/run_performance.sh @@ -43,19 +43,6 @@ run_perf_models_llm_javelin() { env python models/perf/merge_perf_results.py } -run_perf_models_llm_javelin_multi_device() { - local tt_arch=$1 - local test_marker=$2 - - env pytest models/demos/falcon7b/tests -m $test_marker - - # Mistral8x7b env flags are set inside the tests - env pytest models/demos/t3000/mixtral8x7b/tests -m $test_marker - - ## Merge all the generated reports - env python models/perf/merge_perf_results.py -} - run_perf_models_cnn_javelin() { local tt_arch=$1 local test_marker=$2 @@ -70,16 +57,6 @@ 
run_perf_models_cnn_javelin() { env python models/perf/merge_perf_results.py } -run_perf_models_cnn_javelin_multi_device() { - local tt_arch=$1 - local test_marker=$2 - - # Add tests here - - ## Merge all the generated reports - env python models/perf/merge_perf_results.py -} - run_device_perf_models() { local test_marker=$1 @@ -153,8 +130,6 @@ main() { test_marker="models_performance_virtual_machine" elif [[ "$pipeline_type" == *"device_performance_bare_metal"* ]]; then test_marker="models_device_performance_bare_metal" - elif [[ "$pipeline_type" == *"_bare_metal_multi_device"* ]]; then - test_marker="models_performance_bare_metal_multi_device" elif [[ "$pipeline_type" == *"_bare_metal"* ]]; then test_marker="models_performance_bare_metal" else @@ -165,12 +140,8 @@ main() { if [[ "$pipeline_type" == *"device_performance"* ]]; then run_device_perf_models "$test_marker" run_device_perf_ops "$test_marker" - elif [[ "$pipeline_type" == "llm_javelin_models_performance_bare_metal_multi_device" ]]; then - run_perf_models_llm_javelin_multi_device "$tt_arch" "$test_marker" elif [[ "$pipeline_type" == "llm_javelin_models_performance"* ]]; then run_perf_models_llm_javelin "$tt_arch" "$test_marker" - elif [[ "$pipeline_type" == "cnn_javelin_models_performance_bare_metal_multi_device" ]]; then - run_perf_models_cnn_javelin_multi_device "$tt_arch" "$test_marker" elif [[ "$pipeline_type" == "cnn_javelin_models_performance"* ]]; then run_perf_models_cnn_javelin "$tt_arch" "$test_marker" elif [[ "$pipeline_type" == *"other_models_performance"* ]]; then diff --git a/tests/scripts/run_tests.sh b/tests/scripts/run_tests.sh index 818cf3d6327d..37580d883098 100755 --- a/tests/scripts/run_tests.sh +++ b/tests/scripts/run_tests.sh @@ -102,42 +102,6 @@ run_frequent_api_pipeline_tests() { fi } -# Run frequent multi device pipeline tests - these are the t3000 + 4xn300 tests -run_frequent_multi_device_pipeline_tests() { - local tt_arch=$1 - local pipeline_type=$2 - local dispatch_mode=$3 - - ./tests/scripts/multi_chip/run_frequent_regressions_multi_device.sh -} - -# Run end to end demos - these are the t3000 + 4xn300 tests -run_end_to_end_demos_multi_device() { - local tt_arch=$1 - local pipeline_type=$2 - local dispatch_mode=$3 - - ./tests/scripts/multi_chip/run_end_to_end_demos.sh -} - -# Run post commit TG tests - these are 4xn150 + galaxy tests -run_post_commit_tg_pipeline_tests() { - local tt_arch=$1 - local pipeline_type=$2 - local dispatch_mode=$3 - - ./tests/scripts/tg/run_pre_post_commit_regressions_tg.sh -} - -# Run post commit TGG tests - these are 8xn150 + 2xgalaxy tests -run_post_commit_tgg_pipeline_tests() { - local tt_arch=$1 - local pipeline_type=$2 - local dispatch_mode=$3 - - ./tests/scripts/tgg/run_pre_post_commit_regressions_tgg.sh -} - run_models_performance() { local tt_arch=$1 local pipeline_type=$2 @@ -160,14 +124,6 @@ run_models_performance_bare_metal_pipeline_tests() { run_models_performance "$tt_arch" "$pipeline_type" } -run_models_performance_bare_metal_multi_device_pipeline_tests() { - local tt_arch=$1 - local pipeline_type=$2 - local dispatch_mode=$3 - - run_models_performance_multi_device "$tt_arch" "$pipeline_type" -} - run_models_performance_virtual_machine_pipeline_tests() { local tt_arch=$1 local pipeline_type=$2 @@ -203,14 +159,6 @@ run_stress_post_commit_pipeline_tests() { done } -run_post_commit_multi_device_pipeline_tests() { - local tt_arch=$1 - local pipeline_type=$2 - local dispatch_mode=$3 - - ./tests/scripts/multi_chip/run_pre_post_commit_regressions_multi_device.sh -} - 
run_post_commit_multi_device_unstable_pipeline_tests() { local tt_arch=$1 local pipeline_type=$2 @@ -239,6 +187,66 @@ run_ttnn_sweeps_pipeline_tests() { ./tests/scripts/run_ttnn_sweeps.sh } +##########################T3000########################## +# Run t3000 unit tests +unit_t3000_device() { + local tt_arch=$1 + local pipeline_type=$2 + local dispatch_mode=$3 + + ./tests/scripts/t3000/run_t3000_unit_tests.sh +} + +# Run t3000 frequent tests +frequent_t3000_device() { + local tt_arch=$1 + local pipeline_type=$2 + local dispatch_mode=$3 + + ./tests/scripts/t3000/run_t3000_frequent_tests.sh +} + +# Run t3000 demo tests +demos_t3000_device() { + local tt_arch=$1 + local pipeline_type=$2 + local dispatch_mode=$3 + + ./tests/scripts/t3000/run_t3000_demo_tests.sh +} + +# Run t3000 model perf tests +model_perf_t3000_device() { + local tt_arch=$1 + local pipeline_type=$2 + local dispatch_mode=$3 + + ./tests/scripts/t3000/run_t3000_model_perf_tests.sh --pipeline-type "$pipeline_type" +} +##########################T3000########################## + +##########################TG########################## +# Run TG unit tests +unit_tg_device() { + local tt_arch=$1 + local pipeline_type=$2 + local dispatch_mode=$3 + + ./tests/scripts/tg/run_tg_unit_tests.sh +} +##########################TG########################## + +##########################TGG########################## +# Run TGG unit tests +unit_tgg_device() { + local tt_arch=$1 + local pipeline_type=$2 + local dispatch_mode=$3 + + ./tests/scripts/tgg/run_tgg_unit_tests.sh +} +##########################TGG########################## + run_pipeline_tests() { local tt_arch=$1 local pipeline_type=$2 @@ -257,28 +265,29 @@ run_pipeline_tests() { run_eager_package_end_to_end_pipeline_tests "$tt_arch" "$pipeline_type" elif [[ $pipeline_type == *"models_performance_bare_metal" || $pipeline_type == "models_device_performance_bare_metal" ]]; then run_models_performance_bare_metal_pipeline_tests "$tt_arch" "$pipeline_type" "$dispatch_mode" - elif [[ $pipeline_type == *"models_performance_bare_metal_multi_device" ]]; then - run_models_performance_bare_metal_multi_device_pipeline_tests "$tt_arch" "$pipeline_type" "$dispatch_mode" elif [[ $pipeline_type == "models_performance_virtual_machine" ]]; then run_models_performance_virtual_machine_pipeline_tests "$tt_arch" "$pipeline_type" elif [[ $pipeline_type == "stress_post_commit" ]]; then run_stress_post_commit_pipeline_tests "$tt_arch" "$pipeline_type" "$dispatch_mode" - elif [[ $pipeline_type == "post_commit_multi_device" ]]; then - run_post_commit_multi_device_pipeline_tests "$tt_arch" "$pipeline_type" "$dispatch_mode" - elif [[ $pipeline_type == "post_commit_multi_device_unstable" ]]; then - run_post_commit_multi_device_pipeline_tests "$tt_arch" "$pipeline_type" "$dispatch_mode" - elif [[ $pipeline_type == "frequent_multi_device" ]]; then - run_frequent_multi_device_pipeline_tests "$tt_arch" "$pipeline_type" "$dispatch_mode" - elif [[ $pipeline_type == "end_to_end_demos_multi_device" ]]; then - run_end_to_end_demos_multi_device "$tt_arch" "$pipeline_type" "$dispatch_mode" - elif [[ $pipeline_type == "post_commit_tg" ]]; then - run_post_commit_tg_pipeline_tests "$tt_arch" "$pipeline_type" "$dispatch_mode" - elif [[ $pipeline_type == "post_commit_tgg" ]]; then - run_post_commit_tgg_pipeline_tests "$tt_arch" "$pipeline_type" "$dispatch_mode" elif [[ $pipeline_type == "microbenchmarks" ]]; then run_microbenchmarks_pipeline_tests "$tt_arch" "$pipeline_type" "$dispatch_mode" elif [[ $pipeline_type == "ttnn_sweeps" 
]]; then run_ttnn_sweeps_pipeline_tests "$tt_arch" "$pipeline_type" "$dispatch_mode" + # T3000 pipelines + elif [[ $pipeline_type == "unit_t3000_device" ]]; then + unit_t3000_device "$tt_arch" "$pipeline_type" "$dispatch_mode" + elif [[ $pipeline_type == "frequent_t3000_device" ]]; then + frequent_t3000_device "$tt_arch" "$pipeline_type" "$dispatch_mode" + elif [[ $pipeline_type == "demos_t3000_device" ]]; then + demos_t3000_device "$tt_arch" "$pipeline_type" "$dispatch_mode" + elif [[ $pipeline_type == *"model_perf_t3000_device" ]]; then + model_perf_t3000_device "$tt_arch" "$pipeline_type" "$dispatch_mode" + # TG pipelines + elif [[ $pipeline_type == "unit_tg_device" ]]; then + unit_tg_device "$tt_arch" "$pipeline_type" "$dispatch_mode" + # TGG pipelines + elif [[ $pipeline_type == "unit_tgg_device" ]]; then + unit_tgg_device "$tt_arch" "$pipeline_type" "$dispatch_mode" else echo "Unknown pipeline: $pipeline_type" exit 1 diff --git a/tests/scripts/t3000/run_t3000_demo_tests.sh b/tests/scripts/t3000/run_t3000_demo_tests.sh new file mode 100755 index 000000000000..c7b7a2ad24f4 --- /dev/null +++ b/tests/scripts/t3000/run_t3000_demo_tests.sh @@ -0,0 +1,46 @@ + +#/bin/bash +set -eo pipefail + +run_t3000_falcon40b_tests() { + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_falcon40b_tests" + + # Falcon40B prefill 60 layer end to end with 10 loops; we need 8x8 grid size + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_60_layer_t3000_prefill_10_loops.py + + # Falcon40B end to end demo (prefill + decode) + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_t3000_demo_5_loops.py + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_falcon40b_tests $duration seconds to complete" +} + +run_t3000_tests() { + # Run falcon40b tests + run_t3000_falcon40b_tests +} + +main() { + if [[ -z "$TT_METAL_HOME" ]]; then + echo "Must provide TT_METAL_HOME in environment" 1>&2 + exit 1 + fi + + if [[ -z "$ARCH_NAME" ]]; then + echo "Must provide ARCH_NAME in environment" 1>&2 + exit 1 + fi + + # Run all tests + cd $TT_METAL_HOME + export PYTHONPATH=$TT_METAL_HOME + + run_t3000_tests +} + +main "$@" diff --git a/tests/scripts/t3000/run_t3000_frequent_tests.sh b/tests/scripts/t3000/run_t3000_frequent_tests.sh new file mode 100755 index 000000000000..719bda685d91 --- /dev/null +++ b/tests/scripts/t3000/run_t3000_frequent_tests.sh @@ -0,0 +1,121 @@ + +#/bin/bash +set -eo pipefail + +run_t3000_ethernet_tests() { + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_ethernet_tests" + + pytest tests/tt_metal/microbenchmarks/ethernet/test_ethernet_bidirectional_bandwidth_microbenchmark.py + pytest tests/tt_metal/microbenchmarks/ethernet/test_ethernet_ring_latency_microbenchmark.py + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_ethernet_tests $duration seconds to complete" +} + +run_t3000_llama2_70b_tests() { + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_llama2_70b_tests" + + # Llama2_70b related cached files and tests (the test should parse env variables similar to these) + export LLAMA_CKPT_DIR=/mnt/MLPerf/tt_dnn-models/llama-2/llama-2-70b-repacked/ + export LLAMA_TOKENIZER_PATH=/mnt/MLPerf/tt_dnn-models/llama-2/tokenizer.model + export
LLAMA_CACHE_PATH=/mnt/MLPerf/tt_dnn-models/llama-2/llama-data-cache/weights-cache-2 + + pytest models/demos/t3000/llama2_70b/tests/test_llama_mlp_t3000.py + pytest models/demos/t3000/llama2_70b/tests/test_llama_attention_t3000.py + pytest models/demos/t3000/llama2_70b/tests/test_llama_decoder_t3000.py + pytest models/demos/t3000/llama2_70b/tests/test_llama_model_t3000.py + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_llama2_70b_tests $duration seconds to complete" +} + +run_t3000_mixtral_tests() { + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_mixtral_tests" + + # mixtral8x7b 8 chip decode model test (env flags set inside the test) + pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_model.py::test_mixtral_model_inference[10-1-pcc] + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_mixtral_tests $duration seconds to complete" +} + +run_t3000_tteager_tests() { + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_tteager_tests" + + pytest tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py -k post_commit + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_tteager_tests $duration seconds to complete" +} + +run_t3000_falcon40b_tests() { + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_falcon40b_tests" + + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_causallm.py + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_1_layer_t3000.py + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_falcon40b_tests $duration seconds to complete" +} + +run_t3000_tests() { + # Run ethernet tests + #run_t3000_ethernet_tests + + # Run tteager tests + #run_t3000_tteager_tests + + # Run llama2-70b tests + run_t3000_llama2_70b_tests + + # Run mixtral tests + run_t3000_mixtral_tests + + # Run falcon40b tests + run_t3000_falcon40b_tests +} + +main() { + if [[ -z "$TT_METAL_HOME" ]]; then + echo "Must provide TT_METAL_HOME in environment" 1>&2 + exit 1 + fi + + if [[ -z "$ARCH_NAME" ]]; then + echo "Must provide ARCH_NAME in environment" 1>&2 + exit 1 + fi + + # Run all tests + cd $TT_METAL_HOME + export PYTHONPATH=$TT_METAL_HOME + + run_t3000_tests +} + +main "$@" diff --git a/tests/scripts/t3000/run_t3000_model_perf_tests.sh b/tests/scripts/t3000/run_t3000_model_perf_tests.sh new file mode 100755 index 000000000000..5e26d9c7de29 --- /dev/null +++ b/tests/scripts/t3000/run_t3000_model_perf_tests.sh @@ -0,0 +1,111 @@ + +#/bin/bash +set -eo pipefail + +run_t3000_falcon7b_tests() { + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_falcon7b_tests" + + env pytest models/demos/falcon7b/tests -m "model_perf_t3000" + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_falcon7b_tests $duration seconds to complete" +} + +run_t3000_mixtral_tests() { + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_mixtral_tests" + + env pytest models/demos/t3000/mixtral8x7b/tests -m "model_perf_t3000" + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - 
start_time)) + echo "LOG_METAL: run_t3000_mixtral_tests $duration seconds to complete" +} + +run_t3000_llama2_70b_tests() { + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_llama2_70b_tests" + + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/experimental/llama2_70b/tests/test_llama_perf_decode.py -m "model_perf_t3000" + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_llama2_70b_tests $duration seconds to complete" +} + +run_t3000_llm_tests() { + # Run falcon7b tests + run_t3000_falcon7b_tests + + # Run mixtral tests + run_t3000_mixtral_tests + + # Run llama2-70b tests + #run_t3000_llama2_70b_tests + + # Merge all the generated reports + env python models/perf/merge_perf_results.py +} + +run_t3000_cnn_tests() { + # Merge all the generated reports + env python models/perf/merge_perf_results.py +} + +main() { + # Parse the arguments + while [[ $# -gt 0 ]]; do + case $1 in + --pipeline-type) + pipeline_type=$2 + shift + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac + shift + done + + if [[ -z "$TT_METAL_HOME" ]]; then + echo "Must provide TT_METAL_HOME in environment" 1>&2 + exit 1 + fi + + if [[ -z "$ARCH_NAME" ]]; then + echo "Must provide ARCH_NAME in environment" 1>&2 + exit 1 + fi + + if [[ -z "$pipeline_type" ]]; then + echo "--pipeline-type cannot be empty" 1>&2 + exit 1 + fi + + # Run all tests + cd $TT_METAL_HOME + export PYTHONPATH=$TT_METAL_HOME + + if [[ "$pipeline_type" == "llm_model_perf_t3000_device" ]]; then + run_t3000_llm_tests + elif [[ "$pipeline_type" == "cnn_model_perf_t3000_device" ]]; then + run_t3000_cnn_tests + else + echo "$pipeline_type is invalid (supported: [llm_model_perf_t3000_device, cnn_model_perf_t3000_device])" 1>&2 + exit 1 + fi +} + +main "$@" diff --git a/tests/scripts/t3000/run_t3000_unit_tests.sh b/tests/scripts/t3000/run_t3000_unit_tests.sh new file mode 100755 index 000000000000..677b9d7cdf12 --- /dev/null +++ b/tests/scripts/t3000/run_t3000_unit_tests.sh @@ -0,0 +1,127 @@ + +#/bin/bash +set -eo pipefail + +run_t3000_ttmetal_tests() { + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_ttmetal_tests" + + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectSendAllConnectedChips" + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsSendInterleavedBufferAllConnectedChips" + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectRingGatherAllChips" + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsInterleavedRingGatherAllChips" + TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueSingleCardFixture.*" + ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueMultiDeviceFixture.*" + ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="DPrintFixture.*:WatcherFixture.*" + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_ttmetal_tests $duration seconds to complete" +} + +run_t3000_ttnn_tests() { + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_ttnn_tests" + + pytest tests/ttnn/unit_tests/test_multi_device.py + + # Record the end time + end_time=$(date +%s) + duration=$((end_time -
start_time)) + echo "LOG_METAL: run_t3000_ttnn_tests $duration seconds to complete" +} + +run_t3000_falcon7b_tests() { + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_falcon7b_tests" + + pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_mlp.py + pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_attention.py + pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_decoder.py + #pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_causallm.py + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_falcon7b_tests $duration seconds to complete" +} + +run_t3000_falcon40b_tests() { + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_falcon40b_tests" + + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_mlp.py + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_attention.py + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_decoder.py + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_falcon40b_tests $duration seconds to complete" +} + +run_t3000_mixtral_tests() { + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_mixtral_tests" + + pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_attention.py + pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_mlp.py + pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_rms_norm.py + pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_embedding.py + pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_moe.py + pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_decoder.py + pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_model.py::test_mixtral_model_inference[1-1-pcc] + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_mixtral_tests $duration seconds to complete" +} + +run_t3000_tests() { + # Run ttmetal tests + run_t3000_ttmetal_tests + + # Run ttnn tests + run_t3000_ttnn_tests + + # Run falcon7b tests + run_t3000_falcon7b_tests + + # Run falcon40b tests + run_t3000_falcon40b_tests + + # Run mixtral tests + run_t3000_mixtral_tests +} + +main() { + if [[ -z "$TT_METAL_HOME" ]]; then + echo "Must provide TT_METAL_HOME in environment" 1>&2 + exit 1 + fi + + if [[ -z "$ARCH_NAME" ]]; then + echo "Must provide ARCH_NAME in environment" 1>&2 + exit 1 + fi + + # Run all tests + cd $TT_METAL_HOME + export PYTHONPATH=$TT_METAL_HOME + + run_t3000_tests +} + +main "$@" diff --git a/tests/scripts/tg/run_pre_post_commit_regressions_tg.sh b/tests/scripts/tg/run_pre_post_commit_regressions_tg.sh deleted file mode 100755 index 3d8f32fdf8ef..000000000000 --- a/tests/scripts/tg/run_pre_post_commit_regressions_tg.sh +++ /dev/null @@ -1,17 +0,0 @@ - -#/bin/bash - -set -eo pipefail - -if [[ -z "$TT_METAL_HOME" ]]; then - echo "Must provide TT_METAL_HOME in environment" 1>&2 - exit 1 -fi - -if [[ -z "$ARCH_NAME" ]]; then - echo "Must provide ARCH_NAME in environment" 1>&2 - exit 1 -fi - -# Write tests here! -echo "Fill me!" 
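The new t3000 runner scripts above all repeat the same timing boilerplate around each suite: capture start_time, run the tests, then log the elapsed seconds with a LOG_METAL prefix. A minimal sketch of how that convention could be factored into a shared helper is shown below; run_timed is a hypothetical name used for illustration and is not defined anywhere in this patch series.

#!/bin/bash
# Sketch only: wraps a test suite in the start/end timing convention
# used by the run_t3000_*_tests.sh scripts introduced in this patch.
run_timed() {
    local name=$1
    shift
    local start_time end_time duration
    start_time=$(date +%s)
    echo "LOG_METAL: Running $name"
    "$@"    # run the suite command passed as the remaining arguments
    end_time=$(date +%s)
    duration=$((end_time - start_time))
    echo "LOG_METAL: $name $duration seconds to complete"
}

# Example usage, mirroring run_t3000_ttnn_tests:
# run_timed run_t3000_ttnn_tests pytest tests/ttnn/unit_tests/test_multi_device.py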
diff --git a/tests/scripts/tg/run_tg_unit_tests.sh b/tests/scripts/tg/run_tg_unit_tests.sh new file mode 100755 index 000000000000..5a5c93de2aef --- /dev/null +++ b/tests/scripts/tg/run_tg_unit_tests.sh @@ -0,0 +1,28 @@ + +#/bin/bash +set -eo pipefail + +run_tg_tests() { + # Write tests here + echo "LOG_METAL: Fill me!" +} + +main() { + if [[ -z "$TT_METAL_HOME" ]]; then + echo "Must provide TT_METAL_HOME in environment" 1>&2 + exit 1 + fi + + if [[ -z "$ARCH_NAME" ]]; then + echo "Must provide ARCH_NAME in environment" 1>&2 + exit 1 + fi + + # Run all tests + cd $TT_METAL_HOME + export PYTHONPATH=$TT_METAL_HOME + + run_tg_tests +} + +main "$@" diff --git a/tests/scripts/tgg/run_pre_post_commit_regressions_tgg.sh b/tests/scripts/tgg/run_pre_post_commit_regressions_tgg.sh deleted file mode 100755 index 3d8f32fdf8ef..000000000000 --- a/tests/scripts/tgg/run_pre_post_commit_regressions_tgg.sh +++ /dev/null @@ -1,17 +0,0 @@ - -#/bin/bash - -set -eo pipefail - -if [[ -z "$TT_METAL_HOME" ]]; then - echo "Must provide TT_METAL_HOME in environment" 1>&2 - exit 1 -fi - -if [[ -z "$ARCH_NAME" ]]; then - echo "Must provide ARCH_NAME in environment" 1>&2 - exit 1 -fi - -# Write tests here! -echo "Fill me!" diff --git a/tests/scripts/tgg/run_tgg_unit_tests.sh b/tests/scripts/tgg/run_tgg_unit_tests.sh new file mode 100755 index 000000000000..b8c209a22cb0 --- /dev/null +++ b/tests/scripts/tgg/run_tgg_unit_tests.sh @@ -0,0 +1,28 @@ + +#/bin/bash +set -eo pipefail + +run_tgg_tests() { + # Write tests here + echo "LOG_METAL: Fill me!" +} + +main() { + if [[ -z "$TT_METAL_HOME" ]]; then + echo "Must provide TT_METAL_HOME in environment" 1>&2 + exit 1 + fi + + if [[ -z "$ARCH_NAME" ]]; then + echo "Must provide ARCH_NAME in environment" 1>&2 + exit 1 + fi + + # Run all tests + cd $TT_METAL_HOME + export PYTHONPATH=$TT_METAL_HOME + + run_tgg_tests +} + +main "$@" From 22c258cbb825dba071dcd76db57e7fe19f2368c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CNenad?= <“npetrovic@tenstorrent.com”> Date: Wed, 15 May 2024 08:59:37 +0000 Subject: [PATCH 17/40] #7292: Add subtract and apply activation --- .../sweep_tests/pytorch_ops.py | 12 ++++- ...se_subtract_and_apply_activation_test.yaml | 28 +++++++++++ ...se_subtract_and_apply_activation_test.yaml | 28 +++++++++++ .../sweep_tests/ttnn_ops.py | 48 +++++++++++++++++++ 4 files changed, 115 insertions(+), 1 deletion(-) create mode 100644 tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/grayskull/ttnn_eltwise_subtract_and_apply_activation_test.yaml create mode 100644 tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/wormhole/ttnn_eltwise_subtract_and_apply_activation_test.yaml diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py b/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py index e445abb66791..8c03ef5dde2e 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py @@ -1865,10 +1865,20 @@ def max_pool2d(x, *args, **kwargs): def repeat_2(x, *args, shape, **kwargs): return x.repeat(*shape) - def power_2(x, y, *args, exponent=None, **kwargs): if exponent is None: result = torch.pow(x, y) else: result = x**exponent return result + +def subtract_and_apply_activation(x, y, *args, **kwargs): + activation = kwargs.pop("activation") + output = torch.sub(x, y) + + if activation == "relu": + output = torch.relu(output) + elif activation == "gelu": + output = torch.nn.functional.gelu(output) + + return output diff
--git a/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/grayskull/ttnn_eltwise_subtract_and_apply_activation_test.yaml b/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/grayskull/ttnn_eltwise_subtract_and_apply_activation_test.yaml new file mode 100644 index 000000000000..391b97a71b28 --- /dev/null +++ b/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/grayskull/ttnn_eltwise_subtract_and_apply_activation_test.yaml @@ -0,0 +1,28 @@ +--- +test-list: + - ttnn-eltwise-subtract_and_apply_activation: + shape: + start-shape: [1, 1, 32, 32] + end-shape: [6, 12, 256, 256] + interval: [1, 1, 32, 32] + num-dims: [4] + num-shapes: 2 + num-samples: 128 + args-sampling-strategy: "all" + env: + # TT_PCI_DMA_BUF_SIZE: "1048576" + datagen: + function: gen_rand + args: + low: -100 + high: 100 + comparison: + function: comp_pcc + args-gen: gen_activation_args + sanitize-args: False + output-file: eltwise_subtract_and_apply_activation_sweep.csv + args: + data-layout: ["TILE"] + data-type: ["BFLOAT16", "BFLOAT8_B"] + buffer-type: ["DRAM", "L1"] + out-buffer-type: ["DRAM", "L1"] diff --git a/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/wormhole/ttnn_eltwise_subtract_and_apply_activation_test.yaml b/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/wormhole/ttnn_eltwise_subtract_and_apply_activation_test.yaml new file mode 100644 index 000000000000..391b97a71b28 --- /dev/null +++ b/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/wormhole/ttnn_eltwise_subtract_and_apply_activation_test.yaml @@ -0,0 +1,28 @@ +--- +test-list: + - ttnn-eltwise-subtract_and_apply_activation: + shape: + start-shape: [1, 1, 32, 32] + end-shape: [6, 12, 256, 256] + interval: [1, 1, 32, 32] + num-dims: [4] + num-shapes: 2 + num-samples: 128 + args-sampling-strategy: "all" + env: + # TT_PCI_DMA_BUF_SIZE: "1048576" + datagen: + function: gen_rand + args: + low: -100 + high: 100 + comparison: + function: comp_pcc + args-gen: gen_activation_args + sanitize-args: False + output-file: eltwise_subtract_and_apply_activation_sweep.csv + args: + data-layout: ["TILE"] + data-type: ["BFLOAT16", "BFLOAT8_B"] + buffer-type: ["DRAM", "L1"] + out-buffer-type: ["DRAM", "L1"] diff --git a/tests/ttnn/python_api_testing/sweep_tests/ttnn_ops.py b/tests/ttnn/python_api_testing/sweep_tests/ttnn_ops.py index 59a3a3c8df0b..cbf4bda992a5 100644 --- a/tests/ttnn/python_api_testing/sweep_tests/ttnn_ops.py +++ b/tests/ttnn/python_api_testing/sweep_tests/ttnn_ops.py @@ -3207,3 +3207,51 @@ def repeat( t1 = ttnn.repeat(t0, ttnn.Shape(shape)) return ttnn_tensor_to_torch(t1) + + +def eltwise_subtract_and_apply_activation( + x, + y, + *args, + activation, + device, + dtype, + layout, + input_mem_config, + output_mem_config, + **kwargs, +): + if activation is not None: + activations = [activation] + else: + activations = None + + t0 = setup_ttnn_tensor(x, device, layout[0], input_mem_config[0], dtype[0]) + t1 = setup_ttnn_tensor(y, device, layout[1], input_mem_config[1], dtype[1]) + t2 = ttnn.subtract(t0, t1, activations=activations, memory_config=memory_config_to_ttnn(output_mem_config)) + + return ttnn_tensor_to_torch(t2) + + +def eltwise_subtract_and_apply_activation( + x, + y, + *args, + activation, + device, + dtype, + layout, + input_mem_config, + output_mem_config, + **kwargs, +): + if activation is not None: + activations = [activation] + else: + activations = None + + t0 = 
setup_ttnn_tensor(x, device, layout[0], input_mem_config[0], dtype[0]) + t1 = setup_ttnn_tensor(y, device, layout[1], input_mem_config[1], dtype[1]) + t2 = ttnn.subtract_(t0, t1, activations=activations, memory_config=memory_config_to_ttnn(output_mem_config)) + + return ttnn_tensor_to_torch(t2) From 4167daf2869271b52da4ebc19a3705269d52ac94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CNenad?= <“npetrovic@tenstorrent.com”> Date: Wed, 15 May 2024 10:01:37 +0000 Subject: [PATCH 18/40] #7292: Add sub and multiply activation sweeps --- .../sweep_tests/pytorch_ops.py | 12 +++++ ...e_multiply_and_apply_activation__test.yaml | 54 +++++++++++++++++++ ...se_multiply_and_apply_activation_test.yaml | 28 ++++++++++ ...e_subtract_and_apply_activation__test.yaml | 54 +++++++++++++++++++ ...e_multiply_and_apply_activation__test.yaml | 54 +++++++++++++++++++ ...se_multiply_and_apply_activation_test.yaml | 28 ++++++++++ ...e_subtract_and_apply_activation__test.yaml | 54 +++++++++++++++++++ ...btract_and_apply_activation_test copy.yaml | 28 ++++++++++ .../sweep_tests/ttnn_ops.py | 50 ++++++++++++++++- 9 files changed, 361 insertions(+), 1 deletion(-) create mode 100644 tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/grayskull/ttnn_eltwise_multiply_and_apply_activation__test.yaml create mode 100644 tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/grayskull/ttnn_eltwise_multiply_and_apply_activation_test.yaml create mode 100644 tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/grayskull/ttnn_eltwise_subtract_and_apply_activation__test.yaml create mode 100644 tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/wormhole/ttnn_eltwise_multiply_and_apply_activation__test.yaml create mode 100644 tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/wormhole/ttnn_eltwise_multiply_and_apply_activation_test.yaml create mode 100644 tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/wormhole/ttnn_eltwise_subtract_and_apply_activation__test.yaml create mode 100644 tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/wormhole/ttnn_eltwise_subtract_and_apply_activation_test copy.yaml diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py b/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py index 8c03ef5dde2e..9aaa1f58b300 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py @@ -1882,3 +1882,15 @@ def subtract_and_apply_activation(x, y, *args, **kwargs): output = torch.nn.functional.gelu(output) return output + + +def multiply_and_apply_activation(x, y, *args, **kwargs): + activation = kwargs.pop("activation") + output = torch.mul(x, y) + + if activation == "relu": + output = torch.relu(output) + elif activation == "gelu": + output = torch.nn.functional.gelu(output) + + return output diff --git a/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/grayskull/ttnn_eltwise_multiply_and_apply_activation__test.yaml b/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/grayskull/ttnn_eltwise_multiply_and_apply_activation__test.yaml new file mode 100644 index 000000000000..ef2588ba4eba --- /dev/null +++ b/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/grayskull/ttnn_eltwise_multiply_and_apply_activation__test.yaml @@ -0,0 +1,54 @@ +--- +test-list: + -
ttnn-eltwise-multiply_and_apply_activation_: + shape: + start-shape: [1, 1, 32, 32] + end-shape: [6, 12, 256, 256] + interval: [1, 1, 32, 32] + num-dims: [4] + num-shapes: 2 + num-samples: 64 + args-sampling-strategy: "all" + env: + # TT_PCI_DMA_BUF_SIZE: "1048576" + datagen: + function: gen_rand + args: + low: -100 + high: 100 + comparison: + function: comp_pcc + args-gen: gen_activation_args + sanitize-args: False + output-file: eltwise_multiply_and_apply_activation__sweep.csv + args: + data-layout: ["TILE"] + data-type: ["BFLOAT16", "BFLOAT8_B"] + buffer-type: ["DRAM"] + out-buffer-type: ["DRAM"] + - ttnn-eltwise-multiply_and_apply_activation_: + shape: + start-shape: [1, 1, 32, 32] + end-shape: [6, 12, 256, 256] + interval: [1, 1, 32, 32] + num-dims: [4] + num-shapes: 2 + num-samples: 64 + args-sampling-strategy: "all" + env: + # TT_PCI_DMA_BUF_SIZE: "1048576" + datagen: + function: gen_rand + args: + low: -100 + high: 100 + comparison: + function: comp_pcc + args-gen: gen_activation_args + sanitize-args: False + output-file: eltwise_multiply_and_apply_activation__sweep.csv + args: + data-layout: ["TILE"] + data-type: ["BFLOAT16", "BFLOAT8_B"] + buffer-type: ["L1"] + out-buffer-type: ["L1"] diff --git a/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/grayskull/ttnn_eltwise_multiply_and_apply_activation_test.yaml b/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/grayskull/ttnn_eltwise_multiply_and_apply_activation_test.yaml new file mode 100644 index 000000000000..3b8985a836b8 --- /dev/null +++ b/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/grayskull/ttnn_eltwise_multiply_and_apply_activation_test.yaml @@ -0,0 +1,28 @@ +--- +test-list: + - ttnn-eltwise-multiply_and_apply_activation: + shape: + start-shape: [1, 1, 32, 32] + end-shape: [6, 12, 256, 256] + interval: [1, 1, 32, 32] + num-dims: [4] + num-shapes: 2 + num-samples: 128 + args-sampling-strategy: "all" + env: + # TT_PCI_DMA_BUF_SIZE: "1048576" + datagen: + function: gen_rand + args: + low: -100 + high: 100 + comparison: + function: comp_pcc + args-gen: gen_activation_args + sanitize-args: False + output-file: eltwise_multiply_and_apply_activation_sweep.csv + args: + data-layout: ["TILE"] + data-type: ["BFLOAT16", "BFLOAT8_B"] + buffer-type: ["DRAM", "L1"] + out-buffer-type: ["DRAM", "L1"] diff --git a/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/grayskull/ttnn_eltwise_subtract_and_apply_activation__test.yaml b/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/grayskull/ttnn_eltwise_subtract_and_apply_activation__test.yaml new file mode 100644 index 000000000000..655e26d8f9e9 --- /dev/null +++ b/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/grayskull/ttnn_eltwise_subtract_and_apply_activation__test.yaml @@ -0,0 +1,54 @@ +--- +test-list: + - ttnn-eltwise-subtract_and_apply_activation_: + shape: + start-shape: [1, 1, 32, 32] + end-shape: [6, 12, 256, 256] + interval: [1, 1, 32, 32] + num-dims: [4] + num-shapes: 2 + num-samples: 64 + args-sampling-strategy: "all" + env: + # TT_PCI_DMA_BUF_SIZE: "1048576" + datagen: + function: gen_rand + args: + low: -100 + high: 100 + comparison: + function: comp_pcc + args-gen: gen_activation_args + sanitize-args: False + output-file: eltwise_subtract_and_apply_activation__sweep.csv + args: + data-layout: ["TILE"] + data-type: ["BFLOAT16", "BFLOAT8_B"] + buffer-type: ["DRAM"] + out-buffer-type: ["DRAM"] + - 
ttnn-eltwise-subtract_and_apply_activation_: + shape: + start-shape: [1, 1, 32, 32] + end-shape: [6, 12, 256, 256] + interval: [1, 1, 32, 32] + num-dims: [4] + num-shapes: 2 + num-samples: 64 + args-sampling-strategy: "all" + env: + # TT_PCI_DMA_BUF_SIZE: "1048576" + datagen: + function: gen_rand + args: + low: -100 + high: 100 + comparison: + function: comp_pcc + args-gen: gen_activation_args + sanitize-args: False + output-file: eltwise_subtract_and_apply_activation__sweep.csv + args: + data-layout: ["TILE"] + data-type: ["BFLOAT16", "BFLOAT8_B"] + buffer-type: ["L1"] + out-buffer-type: ["L1"] diff --git a/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/wormhole/ttnn_eltwise_multiply_and_apply_activation__test.yaml b/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/wormhole/ttnn_eltwise_multiply_and_apply_activation__test.yaml new file mode 100644 index 000000000000..ef2588ba4eba --- /dev/null +++ b/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/wormhole/ttnn_eltwise_multiply_and_apply_activation__test.yaml @@ -0,0 +1,54 @@ +--- +test-list: + - ttnn-eltwise-multiply_and_apply_activation_: + shape: + start-shape: [1, 1, 32, 32] + end-shape: [6, 12, 256, 256] + interval: [1, 1, 32, 32] + num-dims: [4] + num-shapes: 2 + num-samples: 64 + args-sampling-strategy: "all" + env: + # TT_PCI_DMA_BUF_SIZE: "1048576" + datagen: + function: gen_rand + args: + low: -100 + high: 100 + comparison: + function: comp_pcc + args-gen: gen_activation_args + sanitize-args: False + output-file: eltwise_multiply_and_apply_activation__sweep.csv + args: + data-layout: ["TILE"] + data-type: ["BFLOAT16", "BFLOAT8_B"] + buffer-type: ["DRAM"] + out-buffer-type: ["DRAM"] + - ttnn-eltwise-multiply_and_apply_activation_: + shape: + start-shape: [1, 1, 32, 32] + end-shape: [6, 12, 256, 256] + interval: [1, 1, 32, 32] + num-dims: [4] + num-shapes: 2 + num-samples: 64 + args-sampling-strategy: "all" + env: + # TT_PCI_DMA_BUF_SIZE: "1048576" + datagen: + function: gen_rand + args: + low: -100 + high: 100 + comparison: + function: comp_pcc + args-gen: gen_activation_args + sanitize-args: False + output-file: eltwise_multiply_and_apply_activation__sweep.csv + args: + data-layout: ["TILE"] + data-type: ["BFLOAT16", "BFLOAT8_B"] + buffer-type: ["L1"] + out-buffer-type: ["L1"] diff --git a/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/wormhole/ttnn_eltwise_multiply_and_apply_activation_test.yaml b/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/wormhole/ttnn_eltwise_multiply_and_apply_activation_test.yaml new file mode 100644 index 000000000000..3b8985a836b8 --- /dev/null +++ b/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/wormhole/ttnn_eltwise_multiply_and_apply_activation_test.yaml @@ -0,0 +1,28 @@ +--- +test-list: + - ttnn-eltwise-multiply_and_apply_activation: + shape: + start-shape: [1, 1, 32, 32] + end-shape: [6, 12, 256, 256] + interval: [1, 1, 32, 32] + num-dims: [4] + num-shapes: 2 + num-samples: 128 + args-sampling-strategy: "all" + env: + # TT_PCI_DMA_BUF_SIZE: "1048576" + datagen: + function: gen_rand + args: + low: -100 + high: 100 + comparison: + function: comp_pcc + args-gen: gen_activation_args + sanitize-args: False + output-file: eltwise_multiply_and_apply_activation_sweep.csv + args: + data-layout: ["TILE"] + data-type: ["BFLOAT16", "BFLOAT8_B"] + buffer-type: ["DRAM", "L1"] + out-buffer-type: ["DRAM", "L1"] diff --git 
a/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/wormhole/ttnn_eltwise_subtract_and_apply_activation__test.yaml b/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/wormhole/ttnn_eltwise_subtract_and_apply_activation__test.yaml new file mode 100644 index 000000000000..655e26d8f9e9 --- /dev/null +++ b/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/wormhole/ttnn_eltwise_subtract_and_apply_activation__test.yaml @@ -0,0 +1,54 @@ +--- +test-list: + - ttnn-eltwise-subtract_and_apply_activation_: + shape: + start-shape: [1, 1, 32, 32] + end-shape: [6, 12, 256, 256] + interval: [1, 1, 32, 32] + num-dims: [4] + num-shapes: 2 + num-samples: 64 + args-sampling-strategy: "all" + env: + # TT_PCI_DMA_BUF_SIZE: "1048576" + datagen: + function: gen_rand + args: + low: -100 + high: 100 + comparison: + function: comp_pcc + args-gen: gen_activation_args + sanitize-args: False + output-file: eltwise_subtract_and_apply_activation__sweep.csv + args: + data-layout: ["TILE"] + data-type: ["BFLOAT16", "BFLOAT8_B"] + buffer-type: ["DRAM"] + out-buffer-type: ["DRAM"] + - ttnn-eltwise-subtract_and_apply_activation_: + shape: + start-shape: [1, 1, 32, 32] + end-shape: [6, 12, 256, 256] + interval: [1, 1, 32, 32] + num-dims: [4] + num-shapes: 2 + num-samples: 64 + args-sampling-strategy: "all" + env: + # TT_PCI_DMA_BUF_SIZE: "1048576" + datagen: + function: gen_rand + args: + low: -100 + high: 100 + comparison: + function: comp_pcc + args-gen: gen_activation_args + sanitize-args: False + output-file: eltwise_subtract_and_apply_activation__sweep.csv + args: + data-layout: ["TILE"] + data-type: ["BFLOAT16", "BFLOAT8_B"] + buffer-type: ["L1"] + out-buffer-type: ["L1"] diff --git a/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/wormhole/ttnn_eltwise_subtract_and_apply_activation_test copy.yaml b/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/wormhole/ttnn_eltwise_subtract_and_apply_activation_test copy.yaml new file mode 100644 index 000000000000..391b97a71b28 --- /dev/null +++ b/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/wormhole/ttnn_eltwise_subtract_and_apply_activation_test copy.yaml @@ -0,0 +1,28 @@ +--- +test-list: + - ttnn-eltwise-subtract_and_apply_activation: + shape: + start-shape: [1, 1, 32, 32] + end-shape: [6, 12, 256, 256] + interval: [1, 1, 32, 32] + num-dims: [4] + num-shapes: 2 + num-samples: 128 + args-sampling-strategy: "all" + env: + # TT_PCI_DMA_BUF_SIZE: "1048576" + datagen: + function: gen_rand + args: + low: -100 + high: 100 + comparison: + function: comp_pcc + args-gen: gen_activation_args + sanitize-args: False + output-file: eltwise_subtract_and_apply_activation_sweep.csv + args: + data-layout: ["TILE"] + data-type: ["BFLOAT16", "BFLOAT8_B"] + buffer-type: ["DRAM", "L1"] + out-buffer-type: ["DRAM", "L1"] diff --git a/tests/ttnn/python_api_testing/sweep_tests/ttnn_ops.py b/tests/ttnn/python_api_testing/sweep_tests/ttnn_ops.py index cbf4bda992a5..7d7319857027 100644 --- a/tests/ttnn/python_api_testing/sweep_tests/ttnn_ops.py +++ b/tests/ttnn/python_api_testing/sweep_tests/ttnn_ops.py @@ -3233,7 +3233,7 @@ def eltwise_subtract_and_apply_activation( return ttnn_tensor_to_torch(t2) -def eltwise_subtract_and_apply_activation( +def eltwise_subtract_and_apply_activation_( x, y, *args, @@ -3255,3 +3255,51 @@ def eltwise_subtract_and_apply_activation( t2 = ttnn.subtract_(t0, t1, activations=activations, 
memory_config=memory_config_to_ttnn(output_mem_config)) return ttnn_tensor_to_torch(t2) + + +def eltwise_multiply_and_apply_activation( + x, + y, + *args, + activation, + device, + dtype, + layout, + input_mem_config, + output_mem_config, + **kwargs, +): + if activation is not None: + activations = [activation] + else: + activations = None + + t0 = setup_ttnn_tensor(x, device, layout[0], input_mem_config[0], dtype[0]) + t1 = setup_ttnn_tensor(y, device, layout[1], input_mem_config[1], dtype[1]) + t2 = ttnn.multiply(t0, t1, activations=activations, memory_config=memory_config_to_ttnn(output_mem_config)) + + return ttnn_tensor_to_torch(t2) + + +def eltwise_multiply_and_apply_activation_( + x, + y, + *args, + activation, + device, + dtype, + layout, + input_mem_config, + output_mem_config, + **kwargs, +): + if activation is not None: + activations = [activation] + else: + activations = None + + t0 = setup_ttnn_tensor(x, device, layout[0], input_mem_config[0], dtype[0]) + t1 = setup_ttnn_tensor(y, device, layout[1], input_mem_config[1], dtype[1]) + t2 = ttnn.multiply_(t0, t1, activations=activations, memory_config=memory_config_to_ttnn(output_mem_config)) + + return ttnn_tensor_to_torch(t2) From 0319be1b4a3582549077f35abcf53252709435a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CNenad?= <“npetrovic@tenstorrent.com”> Date: Wed, 15 May 2024 10:02:48 +0000 Subject: [PATCH 19/40] #7292: Add sub and multiply activation sweeps --- ...btract_and_apply_activation_test copy.yaml | 28 ------------------- 1 file changed, 28 deletions(-) delete mode 100644 tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/wormhole/ttnn_eltwise_subtract_and_apply_activation_test copy.yaml diff --git a/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/wormhole/ttnn_eltwise_subtract_and_apply_activation_test copy.yaml b/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/wormhole/ttnn_eltwise_subtract_and_apply_activation_test copy.yaml deleted file mode 100644 index 391b97a71b28..000000000000 --- a/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/wormhole/ttnn_eltwise_subtract_and_apply_activation_test copy.yaml +++ /dev/null @@ -1,28 +0,0 @@ ---- -test-list: - - ttnn-eltwise-subtract_and_apply_activation: - shape: - start-shape: [1, 1, 32, 32] - end-shape: [6, 12, 256, 256] - interval: [1, 1, 32, 32] - num-dims: [4] - num-shapes: 2 - num-samples: 128 - args-sampling-strategy: "all" - env: - # TT_PCI_DMA_BUF_SIZE: "1048576" - datagen: - function: gen_rand - args: - low: -100 - high: 100 - comparison: - function: comp_pcc - args-gen: gen_activation_args - sanitize-args: False - output-file: eltwise_subtract_and_apply_activation_sweep.csv - args: - data-layout: ["TILE"] - data-type: ["BFLOAT16", "BFLOAT8_B"] - buffer-type: ["DRAM", "L1"] - out-buffer-type: ["DRAM", "L1"] From a9ed21646a69c4d5b13f8c7e5c0e5e8da79c750c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CNenad?= <“npetrovic@tenstorrent.com”> Date: Thu, 16 May 2024 13:04:59 +0000 Subject: [PATCH 20/40] #7292: Fix op maps --- .../python_api_testing/sweep_tests/op_map.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/ttnn/python_api_testing/sweep_tests/op_map.py b/tests/ttnn/python_api_testing/sweep_tests/op_map.py index c8730a9695bf..2cfa3505f284 100644 --- a/tests/ttnn/python_api_testing/sweep_tests/op_map.py +++ b/tests/ttnn/python_api_testing/sweep_tests/op_map.py @@ -673,4 +673,20 @@ "tt_op": ttnn_ops.repeat, 
"pytorch_op": pytorch_ops.repeat_2, }, + "ttnn-eltwise-subtract_and_apply_activation": { + "tt_op": ttnn_ops.eltwise_subtract_and_apply_activation, + "pytorch_op": pytorch_ops.subtract_and_apply_activation, + }, + "ttnn-eltwise-subtract_and_apply_activation_": { + "tt_op": ttnn_ops.eltwise_subtract_and_apply_activation_, + "pytorch_op": pytorch_ops.subtract_and_apply_activation, + }, + "ttnn-eltwise-multiply_and_apply_activation": { + "tt_op": ttnn_ops.eltwise_multiply_and_apply_activation, + "pytorch_op": pytorch_ops.multiply_and_apply_activation, + }, + "ttnn-eltwise-multiply_and_apply_activation_": { + "tt_op": ttnn_ops.eltwise_multiply_and_apply_activation_, + "pytorch_op": pytorch_ops.multiply_and_apply_activation, + }, } From d29f73365345a85bd7785ed84505bf0179ad54ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CNenad?= <“npetrovic@tenstorrent.com”> Date: Fri, 17 May 2024 08:34:14 +0000 Subject: [PATCH 21/40] #7292: Linter applied --- tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py b/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py index 9aaa1f58b300..b2be7ec52c46 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py @@ -1865,6 +1865,7 @@ def max_pool2d(x, *args, **kwargs): def repeat_2(x, *args, shape, **kwargs): return x.repeat(*shape) + def power_2(x, y, *args, exponent=None, **kwargs): if exponent is None: result = torch.pow(x, y) @@ -1872,6 +1873,7 @@ def power_2(x, y, *args, exponent=None, **kwargs): result = x**exponent return result + def subtract_and_apply_activation(x, y, *args, **kwargs): activation = kwargs.pop("activation") output = torch.sub(x, y) From 1b6796c22c62bf029fd221ed4f1a31c706455847 Mon Sep 17 00:00:00 2001 From: Evan Smal Date: Fri, 17 May 2024 14:56:02 +0000 Subject: [PATCH 22/40] #0: Disable Mamba end-to-end perf test due to hangs in CI --- models/demos/mamba/tests/test_mamba_perf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/models/demos/mamba/tests/test_mamba_perf.py b/models/demos/mamba/tests/test_mamba_perf.py index cae6044e7d3c..1563a29d00bc 100644 --- a/models/demos/mamba/tests/test_mamba_perf.py +++ b/models/demos/mamba/tests/test_mamba_perf.py @@ -15,10 +15,11 @@ from models.perf.perf_utils import prep_perf_report from models.perf.device_perf_utils import run_device_perf, check_device_perf, prep_device_perf_report -from models.utility_functions import profiler, enable_persistent_kernel_cache, skip_for_grayskull +from models.utility_functions import profiler, enable_persistent_kernel_cache, skip_for_grayskull, skip_for_wormhole_b0 from tt_metal.tools.profiler.process_model_log import get_samples_per_s +@skip_for_wormhole_b0("Non-deterministic hang on CI (#8606)") @skip_for_grayskull("Requires eth connected devices to run") @pytest.mark.models_performance_bare_metal @pytest.mark.parametrize( From ddbdc9b1b5b374dd5d37ed58f3efe339f27b9be7 Mon Sep 17 00:00:00 2001 From: Tapasvi Patel Date: Fri, 17 May 2024 16:08:31 +0000 Subject: [PATCH 23/40] #8586: Add tg and tgg frequent, perf and demo pipelines --- .github/workflows/tg-demo-tests.yaml | 52 +++++++++++ .github/workflows/tg-frequent-tests.yaml | 52 +++++++++++ .github/workflows/tg-model-perf-tests.yaml | 87 +++++++++++++++++++ .github/workflows/tg-unit-tests.yaml | 4 +- .github/workflows/tgg-demo-tests.yaml | 52 +++++++++++ 
.github/workflows/tgg-frequent-tests.yaml | 52 +++++++++++ .github/workflows/tgg-model-perf-tests.yaml | 87 +++++++++++++++++++ .github/workflows/tgg-unit-tests.yaml | 2 +- CODEOWNERS | 12 +++ tests/scripts/run_tests.sh | 70 ++++++++++++++- tests/scripts/tg/run_tg_demo_tests.sh | 28 ++++++ tests/scripts/tg/run_tg_frequent_tests.sh | 28 ++++++ tests/scripts/tg/run_tg_model_perf_tests.sh | 60 +++++++++++++ tests/scripts/tg/run_tg_unit_tests.sh | 2 +- tests/scripts/tgg/run_tgg_demo_tests.sh | 28 ++++++ tests/scripts/tgg/run_tgg_frequent_tests.sh | 28 ++++++ tests/scripts/tgg/run_tgg_model_perf_tests.sh | 60 +++++++++++++ tests/scripts/tgg/run_tgg_unit_tests.sh | 2 +- 18 files changed, 699 insertions(+), 7 deletions(-) create mode 100644 .github/workflows/tg-demo-tests.yaml create mode 100644 .github/workflows/tg-frequent-tests.yaml create mode 100644 .github/workflows/tg-model-perf-tests.yaml create mode 100644 .github/workflows/tgg-demo-tests.yaml create mode 100644 .github/workflows/tgg-frequent-tests.yaml create mode 100644 .github/workflows/tgg-model-perf-tests.yaml create mode 100755 tests/scripts/tg/run_tg_demo_tests.sh create mode 100755 tests/scripts/tg/run_tg_frequent_tests.sh create mode 100755 tests/scripts/tg/run_tg_model_perf_tests.sh create mode 100755 tests/scripts/tgg/run_tgg_demo_tests.sh create mode 100755 tests/scripts/tgg/run_tgg_frequent_tests.sh create mode 100755 tests/scripts/tgg/run_tgg_model_perf_tests.sh diff --git a/.github/workflows/tg-demo-tests.yaml b/.github/workflows/tg-demo-tests.yaml new file mode 100644 index 000000000000..b8b51fcac5ee --- /dev/null +++ b/.github/workflows/tg-demo-tests.yaml @@ -0,0 +1,52 @@ +name: "[TG] TG demo tests" + +on: + workflow_dispatch: + schedule: + - cron: '0 0 * * 6' # This cron schedule runs the workflow every Saturday at 12am UTC + +jobs: + build-artifact: + uses: ./.github/workflows/build-artifact.yaml + with: + arch: '["wormhole_b0"]' + secrets: inherit + tg-demo-tests: + needs: build-artifact + strategy: + fail-fast: false + matrix: + test-group: [ + { + name: "TG demo tests", + arch: wormhole_b0, + runs-on: [arch-wormhole_b0, "config-tg", "in-service", "runner-test", "bare-metal", "pipeline-functional"], + cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type demos_tg_device --dispatch-mode ""' + }, + ] + name: ${{ matrix.test-group.name }} + env: + TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} + ARCH_NAME: ${{ matrix.test-group.arch }} + LOGURU_LEVEL: INFO + LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib + environment: dev + runs-on: ${{ matrix.test-group.runs-on }} + steps: + - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 + - name: Set up dynamic env vars for build + run: | + echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV + - uses: actions/download-artifact@v4 + with: + name: TTMetal_build_${{ matrix.test-group.arch }} + - name: Extract files + run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar + - uses: ./.github/actions/install-python-deps + - name: Run demo regression tests + timeout-minutes: 180 + run: | + source ${{ github.workspace }}/python_env/bin/activate + cd $TT_METAL_HOME + export PYTHONPATH=$TT_METAL_HOME + ${{ matrix.test-group.cmd }} diff --git a/.github/workflows/tg-frequent-tests.yaml b/.github/workflows/tg-frequent-tests.yaml new file mode 100644 index 000000000000..b044662e7da9 --- /dev/null +++ b/.github/workflows/tg-frequent-tests.yaml @@ -0,0 +1,52 @@ +name: "[TG] TG frequent tests" + +on: + workflow_dispatch: + schedule: + - cron: "0 */8 * 
* *" # This cron schedule runs the workflow every 8 hours + +jobs: + build-artifact: + uses: ./.github/workflows/build-artifact.yaml + with: + arch: '["wormhole_b0"]' + secrets: inherit + tg-frequent-tests: + needs: build-artifact + strategy: + fail-fast: false + matrix: + test-group: [ + { + name: "TG frequent tests", + arch: wormhole_b0, + runs-on: [arch-wormhole_b0, "config-tg", "in-service", "runner-test", "bare-metal", "pipeline-functional"], + cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_tg_device --dispatch-mode ""' + }, + ] + name: ${{ matrix.test-group.name }} + env: + TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} + ARCH_NAME: ${{ matrix.test-group.arch }} + LOGURU_LEVEL: INFO + LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib + environment: dev + runs-on: ${{ matrix.test-group.runs-on }} + steps: + - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 + - name: Set up dynamic env vars for build + run: | + echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV + - uses: actions/download-artifact@v4 + with: + name: TTMetal_build_${{ matrix.test-group.arch }} + - name: Extract files + run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar + - uses: ./.github/actions/install-python-deps + - name: Run frequent regression tests + timeout-minutes: 60 + run: | + source ${{ github.workspace }}/python_env/bin/activate + cd $TT_METAL_HOME + export PYTHONPATH=$TT_METAL_HOME + ${{ matrix.test-group.cmd }} diff --git a/.github/workflows/tg-model-perf-tests.yaml b/.github/workflows/tg-model-perf-tests.yaml new file mode 100644 index 000000000000..937e45db0acb --- /dev/null +++ b/.github/workflows/tg-model-perf-tests.yaml @@ -0,0 +1,87 @@ +name: "[TG] TG model perf tests" + +on: + workflow_dispatch: + schedule: + - cron: "0 */12 * * *" # This cron schedule runs the workflow every 12 hours + +jobs: + build-artifact: + uses: ./.github/workflows/build-artifact.yaml + with: + arch: '["wormhole_b0"]' + secrets: inherit + tg-model-perf-tests: + needs: build-artifact + strategy: + fail-fast: false + matrix: + test-group: [ + { + name: "TG LLM model perf tests", + model-type: "LLM", + arch: wormhole_b0, + runs-on: [arch-wormhole_b0, "config-tg", "in-service", "runner-test", "bare-metal", "pipeline-perf"], + cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type llm_model_perf_tg_device --dispatch-mode ""' + }, + { + name: "TG CNN model perf tests", + model-type: "CNN", + arch: wormhole_b0, + runs-on: [arch-wormhole_b0, "config-tg", "in-service", "runner-test", "bare-metal", "pipeline-perf"], + cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type cnn_model_perf_tg_device --dispatch-mode ""' + }, + ] + name: ${{ matrix.test-group.name }} + env: + TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} + ARCH_NAME: ${{ matrix.test-group.arch }} + LOGURU_LEVEL: INFO + LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib + environment: dev + runs-on: ${{ matrix.test-group.runs-on }} + steps: + - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 + - name: Enable performance mode + run: | + sudo cpupower frequency-set -g performance + - name: Ensure weka mount is active + run: | + sudo systemctl restart mnt-MLPerf.mount + sudo /etc/rc.local + ls -al /mnt/MLPerf/bit_error_tests + - name: Set up dynamic env vars for build + run: | + echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV + echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV + - uses: actions/download-artifact@v4 + with: + name: TTMetal_build_${{ 
matrix.test-group.arch }} + - name: Extract files + run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar + - uses: ./.github/actions/install-python-deps + - name: Run model perf regression tests + timeout-minutes: 60 + run: | + source ${{ github.workspace }}/python_env/bin/activate + cd $TT_METAL_HOME + export PYTHONPATH=$TT_METAL_HOME + ${{ matrix.test-group.cmd }} + - name: Check perf report exists + id: check-perf-report + if: ${{ !cancelled() }} + run: | + ls -hal + export PERF_REPORT_FILENAME=Models_Perf_$(date +%Y_%m_%d).csv + ls -hal $PERF_REPORT_FILENAME + echo "perf_report_filename=$PERF_REPORT_FILENAME" >> "$GITHUB_OUTPUT" + - name: Upload perf report + if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }} + uses: actions/upload-artifact@v4 + with: + name: perf-report-csv-${{ matrix.test-group.model-type }}-${{ matrix.test-group.arch }}-${{ matrix.test-group.machine-type }} + path: "${{ steps.check-perf-report.outputs.perf_report_filename }}" + - name: Disable performance mode + if: always() + run: | + sudo cpupower frequency-set -g ondemand diff --git a/.github/workflows/tg-unit-tests.yaml b/.github/workflows/tg-unit-tests.yaml index 12163a65d0d0..b3a8814a7b39 100644 --- a/.github/workflows/tg-unit-tests.yaml +++ b/.github/workflows/tg-unit-tests.yaml @@ -1,9 +1,9 @@ name: "[TG] TG unit tests" on: - schedule: - - cron: '0 0 * * *' # Runs every day at 12am UTC workflow_dispatch: + schedule: + - cron: "0 */3 * * *" # This cron schedule runs the workflow every 3 hours jobs: build-artifact: diff --git a/.github/workflows/tgg-demo-tests.yaml b/.github/workflows/tgg-demo-tests.yaml new file mode 100644 index 000000000000..16f0cc21ddb2 --- /dev/null +++ b/.github/workflows/tgg-demo-tests.yaml @@ -0,0 +1,52 @@ +name: "[TGG] TGG demo tests" + +on: + workflow_dispatch: + schedule: + - cron: '0 0 * * 6' # This cron schedule runs the workflow every Saturday at 12am UTC + +jobs: + build-artifact: + uses: ./.github/workflows/build-artifact.yaml + with: + arch: '["wormhole_b0"]' + secrets: inherit + tgg-demo-tests: + needs: build-artifact + strategy: + fail-fast: false + matrix: + test-group: [ + { + name: "TGG demo tests", + arch: wormhole_b0, + runs-on: [arch-wormhole_b0, "config-tgg", "in-service", "runner-test", "bare-metal", "pipeline-functional"], + cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type demos_tgg_device --dispatch-mode ""' + }, + ] + name: ${{ matrix.test-group.name }} + env: + TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} + ARCH_NAME: ${{ matrix.test-group.arch }} + LOGURU_LEVEL: INFO + LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib + environment: dev + runs-on: ${{ matrix.test-group.runs-on }} + steps: + - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 + - name: Set up dynamic env vars for build + run: | + echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV + - uses: actions/download-artifact@v4 + with: + name: TTMetal_build_${{ matrix.test-group.arch }} + - name: Extract files + run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar + - uses: ./.github/actions/install-python-deps + - name: Run demo regression tests + timeout-minutes: 180 + run: | + source ${{ github.workspace }}/python_env/bin/activate + cd $TT_METAL_HOME + export PYTHONPATH=$TT_METAL_HOME + ${{ matrix.test-group.cmd }} diff --git a/.github/workflows/tgg-frequent-tests.yaml b/.github/workflows/tgg-frequent-tests.yaml new file mode 100644 index 000000000000..31f0719a70de --- /dev/null +++ b/.github/workflows/tgg-frequent-tests.yaml @@ 
-0,0 +1,52 @@ +name: "[TGG] TGG frequent tests" + +on: + workflow_dispatch: + schedule: + - cron: "0 */8 * * *" # This cron schedule runs the workflow every 8 hours + +jobs: + build-artifact: + uses: ./.github/workflows/build-artifact.yaml + with: + arch: '["wormhole_b0"]' + secrets: inherit + tgg-frequent-tests: + needs: build-artifact + strategy: + fail-fast: false + matrix: + test-group: [ + { + name: "TGG frequent tests", + arch: wormhole_b0, + runs-on: [arch-wormhole_b0, "config-tgg", "in-service", "runner-test", "bare-metal", "pipeline-functional"], + cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_tgg_device --dispatch-mode ""' + }, + ] + name: ${{ matrix.test-group.name }} + env: + TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} + ARCH_NAME: ${{ matrix.test-group.arch }} + LOGURU_LEVEL: INFO + LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib + environment: dev + runs-on: ${{ matrix.test-group.runs-on }} + steps: + - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 + - name: Set up dynamic env vars for build + run: | + echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV + - uses: actions/download-artifact@v4 + with: + name: TTMetal_build_${{ matrix.test-group.arch }} + - name: Extract files + run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar + - uses: ./.github/actions/install-python-deps + - name: Run frequent regression tests + timeout-minutes: 60 + run: | + source ${{ github.workspace }}/python_env/bin/activate + cd $TT_METAL_HOME + export PYTHONPATH=$TT_METAL_HOME + ${{ matrix.test-group.cmd }} diff --git a/.github/workflows/tgg-model-perf-tests.yaml b/.github/workflows/tgg-model-perf-tests.yaml new file mode 100644 index 000000000000..15a7aad1d45c --- /dev/null +++ b/.github/workflows/tgg-model-perf-tests.yaml @@ -0,0 +1,87 @@ +name: "[TGG] TGG model perf tests" + +on: + workflow_dispatch: + schedule: + - cron: "0 */12 * * *" # This cron schedule runs the workflow every 12 hours + +jobs: + build-artifact: + uses: ./.github/workflows/build-artifact.yaml + with: + arch: '["wormhole_b0"]' + secrets: inherit + tgg-model-perf-tests: + needs: build-artifact + strategy: + fail-fast: false + matrix: + test-group: [ + { + name: "TGG LLM model perf tests", + model-type: "LLM", + arch: wormhole_b0, + runs-on: [arch-wormhole_b0, "config-tgg", "in-service", "runner-test", "bare-metal", "pipeline-perf"], + cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type llm_model_perf_tgg_device --dispatch-mode ""' + }, + { + name: "TGG CNN model perf tests", + model-type: "CNN", + arch: wormhole_b0, + runs-on: [arch-wormhole_b0, "config-tgg", "in-service", "runner-test", "bare-metal", "pipeline-perf"], + cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type cnn_model_perf_tgg_device --dispatch-mode ""' + }, + ] + name: ${{ matrix.test-group.name }} + env: + TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} + ARCH_NAME: ${{ matrix.test-group.arch }} + LOGURU_LEVEL: INFO + LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib + environment: dev + runs-on: ${{ matrix.test-group.runs-on }} + steps: + - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 + - name: Enable performance mode + run: | + sudo cpupower frequency-set -g performance + - name: Ensure weka mount is active + run: | + sudo systemctl restart mnt-MLPerf.mount + sudo /etc/rc.local + ls -al /mnt/MLPerf/bit_error_tests + - name: Set up dynamic env vars for build + run: | + echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV + 
echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV + - uses: actions/download-artifact@v4 + with: + name: TTMetal_build_${{ matrix.test-group.arch }} + - name: Extract files + run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar + - uses: ./.github/actions/install-python-deps + - name: Run model perf regression tests + timeout-minutes: 60 + run: | + source ${{ github.workspace }}/python_env/bin/activate + cd $TT_METAL_HOME + export PYTHONPATH=$TT_METAL_HOME + ${{ matrix.test-group.cmd }} + - name: Check perf report exists + id: check-perf-report + if: ${{ !cancelled() }} + run: | + ls -hal + export PERF_REPORT_FILENAME=Models_Perf_$(date +%Y_%m_%d).csv + ls -hal $PERF_REPORT_FILENAME + echo "perf_report_filename=$PERF_REPORT_FILENAME" >> "$GITHUB_OUTPUT" + - name: Upload perf report + if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }} + uses: actions/upload-artifact@v4 + with: + name: perf-report-csv-${{ matrix.test-group.model-type }}-${{ matrix.test-group.arch }}-${{ matrix.test-group.machine-type }} + path: "${{ steps.check-perf-report.outputs.perf_report_filename }}" + - name: Disable performance mode + if: always() + run: | + sudo cpupower frequency-set -g ondemand diff --git a/.github/workflows/tgg-unit-tests.yaml b/.github/workflows/tgg-unit-tests.yaml index 5351b9ae824b..d03a4b6d2112 100644 --- a/.github/workflows/tgg-unit-tests.yaml +++ b/.github/workflows/tgg-unit-tests.yaml @@ -3,7 +3,7 @@ name: "[TGG] TGG unit tests" on: workflow_dispatch: schedule: - - cron: '0 0 * * *' # This cron schedule runs the workflow every day at 12am UTC + - cron: "0 */3 * * *" # This cron schedule runs the workflow every 3 hours jobs: build-artifact: diff --git a/CODEOWNERS b/CODEOWNERS index 147b0e9a2463..e4e6c841c956 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -2,13 +2,25 @@ # precedence. 
.github/ @tt-rkim + +# T3000 workflows .github/t3000-unit-tests.yaml @tapspatel .github/t3000-profiler-tests.yaml @tapspatel .github/t3000-model-perf-tests.yaml @tapspatel .github/t3000-frequent-tests.yaml @tapspatel .github/t3000-demo-tests.yaml @tapspatel + +# TG workflows .github/tg-unit-tests.yaml @tapspatel +.github/tg-model-perf-tests.yaml @tapspatel +.github/tg-frequent-tests.yaml @tapspatel +.github/tg-demo-tests.yaml @tapspatel + +# TGG workflows .github/tgg-unit-tests.yaml @tapspatel +.github/tgg-model-perf-tests.yaml @tapspatel +.github/tgg-frequent-tests.yaml @tapspatel +.github/tgg-demo-tests.yaml @tapspatel /infra/ @tt-rkim diff --git a/tests/scripts/run_tests.sh b/tests/scripts/run_tests.sh index 37580d883098..fe6a4e5276a8 100755 --- a/tests/scripts/run_tests.sh +++ b/tests/scripts/run_tests.sh @@ -226,7 +226,7 @@ model_perf_t3000_device() { ##########################T3000########################## ##########################TG########################## -# Run TG unit tests +# Run tg unit tests unit_tg_device() { local tt_arch=$1 local pipeline_type=$2 @@ -234,10 +234,37 @@ unit_tg_device() { ./tests/scripts/tg/run_tg_unit_tests.sh } + +# Run tg frequent tests +frequent_tg_device() { + local tt_arch=$1 + local pipeline_type=$2 + local dispatch_mode=$3 + + ./tests/scripts/tg/run_tg_frequent_tests.sh +} + +# Run tg demo tests +demos_tg_device() { + local tt_arch=$1 + local pipeline_type=$2 + local dispatch_mode=$3 + + ./tests/scripts/tg/run_tg_demo_tests.sh +} + +# Run tg model perf tests +model_perf_tg_device() { + local tt_arch=$1 + local pipeline_type=$2 + local dispatch_mode=$3 + + ./tests/scripts/tg/run_tg_model_perf_tests.sh --pipeline-type "$pipeline_type" +} ##########################TG########################## ##########################TGG########################## -# Run TGG unit tests +# Run tgg unit tests unit_tgg_device() { local tt_arch=$1 local pipeline_type=$2 @@ -245,6 +272,33 @@ unit_tgg_device() { ./tests/scripts/tgg/run_tgg_unit_tests.sh } + +# Run tgg frequent tests +frequent_tgg_device() { + local tt_arch=$1 + local pipeline_type=$2 + local dispatch_mode=$3 + + ./tests/scripts/tgg/run_tgg_frequent_tests.sh +} + +# Run tgg demo tests +demos_tgg_device() { + local tt_arch=$1 + local pipeline_type=$2 + local dispatch_mode=$3 + + ./tests/scripts/tgg/run_tgg_demo_tests.sh +} + +# Run tgg model perf tests +model_perf_tgg_device() { + local tt_arch=$1 + local pipeline_type=$2 + local dispatch_mode=$3 + + ./tests/scripts/tgg/run_tgg_model_perf_tests.sh --pipeline-type "$pipeline_type" +} ##########################TGG########################## run_pipeline_tests() { @@ -285,9 +339,21 @@ run_pipeline_tests() { # TG pipelines elif [[ $pipeline_type == "unit_tg_device" ]]; then unit_tg_device "$tt_arch" "$pipeline_type" "$dispatch_mode" + elif [[ $pipeline_type == "frequent_tg_device" ]]; then + frequent_tg_device "$tt_arch" "$pipeline_type" "$dispatch_mode" + elif [[ $pipeline_type == "demos_tg_device" ]]; then + demos_tg_device "$tt_arch" "$pipeline_type" "$dispatch_mode" + elif [[ $pipeline_type == *"model_perf_tg_device" ]]; then + model_perf_tg_device "$tt_arch" "$pipeline_type" "$dispatch_mode" # TGG pipelines elif [[ $pipeline_type == "unit_tgg_device" ]]; then unit_tgg_device "$tt_arch" "$pipeline_type" "$dispatch_mode" + elif [[ $pipeline_type == "frequent_tgg_device" ]]; then + frequent_tgg_device "$tt_arch" "$pipeline_type" "$dispatch_mode" + elif [[ $pipeline_type == "demos_tgg_device" ]]; then + demos_tgg_device "$tt_arch" "$pipeline_type" 
"$dispatch_mode" + elif [[ $pipeline_type == *"model_perf_tgg_device" ]]; then + model_perf_tgg_device "$tt_arch" "$pipeline_type" "$dispatch_mode" else echo "Unknown pipeline: $pipeline_type" exit 1 diff --git a/tests/scripts/tg/run_tg_demo_tests.sh b/tests/scripts/tg/run_tg_demo_tests.sh new file mode 100755 index 000000000000..c10fb083f002 --- /dev/null +++ b/tests/scripts/tg/run_tg_demo_tests.sh @@ -0,0 +1,28 @@ + +#/bin/bash +set -eo pipefail + +run_tg_tests() { + # Add tests here + echo "Fill me!" +} + +main() { + if [[ -z "$TT_METAL_HOME" ]]; then + echo "Must provide TT_METAL_HOME in environment" 1>&2 + exit 1 + fi + + if [[ -z "$ARCH_NAME" ]]; then + echo "Must provide ARCH_NAME in environment" 1>&2 + exit 1 + fi + + # Run all tests + cd $TT_METAL_HOME + export PYTHONPATH=$TT_METAL_HOME + + run_tg_tests +} + +main "$@" \ No newline at end of file diff --git a/tests/scripts/tg/run_tg_frequent_tests.sh b/tests/scripts/tg/run_tg_frequent_tests.sh new file mode 100755 index 000000000000..c10fb083f002 --- /dev/null +++ b/tests/scripts/tg/run_tg_frequent_tests.sh @@ -0,0 +1,28 @@ + +#/bin/bash +set -eo pipefail + +run_tg_tests() { + # Add tests here + echo "Fill me!" +} + +main() { + if [[ -z "$TT_METAL_HOME" ]]; then + echo "Must provide TT_METAL_HOME in environment" 1>&2 + exit 1 + fi + + if [[ -z "$ARCH_NAME" ]]; then + echo "Must provide ARCH_NAME in environment" 1>&2 + exit 1 + fi + + # Run all tests + cd $TT_METAL_HOME + export PYTHONPATH=$TT_METAL_HOME + + run_tg_tests +} + +main "$@" \ No newline at end of file diff --git a/tests/scripts/tg/run_tg_model_perf_tests.sh b/tests/scripts/tg/run_tg_model_perf_tests.sh new file mode 100755 index 000000000000..5f35be30f2f7 --- /dev/null +++ b/tests/scripts/tg/run_tg_model_perf_tests.sh @@ -0,0 +1,60 @@ + +#/bin/bash +set -eo pipefail + +run_tg_llm_tests() { + # Merge all the generated reports + env python models/perf/merge_perf_results.py +} + +run_tg_cnn_tests() { + # Merge all the generated reports + env python models/perf/merge_perf_results.py +} + +main() { + # Parse the arguments + while [[ $# -gt 0 ]]; do + case $1 in + --pipeline-type) + pipeline_type=$2 + shift + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac + shift + done + + if [[ -z "$TT_METAL_HOME" ]]; then + echo "Must provide TT_METAL_HOME in environment" 1>&2 + exit 1 + fi + + if [[ -z "$ARCH_NAME" ]]; then + echo "Must provide ARCH_NAME in environment" 1>&2 + exit 1 + fi + + if [[ -z "$pipeline_type" ]]; then + echo "--pipeline-type cannot be empty" 1>&2 + exit 1 + fi + + # Run all tests + cd $TT_METAL_HOME + export PYTHONPATH=$TT_METAL_HOME + + if [[ "$pipeline_type" == "llm_model_perf_tg_device" ]]; then + run_tg_llm_tests + elif [[ "$pipeline_type" == "cnn_model_perf_tg_device" ]]; then + run_tg_cnn_tests + else + echo "$pipeline_type is invalid (supported: [cnn_model_perf_tg_device, cnn_model_perf_tg_device])" 2>&1 + exit 1 + fi +} + +main "$@" \ No newline at end of file diff --git a/tests/scripts/tg/run_tg_unit_tests.sh b/tests/scripts/tg/run_tg_unit_tests.sh index 5a5c93de2aef..dbcc07780683 100755 --- a/tests/scripts/tg/run_tg_unit_tests.sh +++ b/tests/scripts/tg/run_tg_unit_tests.sh @@ -25,4 +25,4 @@ main() { run_tg_tests } -main "$@" +main "$@" \ No newline at end of file diff --git a/tests/scripts/tgg/run_tgg_demo_tests.sh b/tests/scripts/tgg/run_tgg_demo_tests.sh new file mode 100755 index 000000000000..26c2f890f9cd --- /dev/null +++ b/tests/scripts/tgg/run_tgg_demo_tests.sh @@ -0,0 +1,28 @@ + +#/bin/bash +set -eo pipefail + +run_tgg_tests() { + # 
+    echo "Fill me!"
+}
+
+main() {
+    if [[ -z "$TT_METAL_HOME" ]]; then
+        echo "Must provide TT_METAL_HOME in environment" 1>&2
+        exit 1
+    fi
+
+    if [[ -z "$ARCH_NAME" ]]; then
+        echo "Must provide ARCH_NAME in environment" 1>&2
+        exit 1
+    fi
+
+    # Run all tests
+    cd $TT_METAL_HOME
+    export PYTHONPATH=$TT_METAL_HOME
+
+    run_tgg_tests
+}
+
+main "$@"
\ No newline at end of file
diff --git a/tests/scripts/tgg/run_tgg_frequent_tests.sh b/tests/scripts/tgg/run_tgg_frequent_tests.sh
new file mode 100755
index 000000000000..26c2f890f9cd
--- /dev/null
+++ b/tests/scripts/tgg/run_tgg_frequent_tests.sh
@@ -0,0 +1,28 @@
+
+#!/bin/bash
+set -eo pipefail
+
+run_tgg_tests() {
+    # Add tests here
+    echo "Fill me!"
+}
+
+main() {
+    if [[ -z "$TT_METAL_HOME" ]]; then
+        echo "Must provide TT_METAL_HOME in environment" 1>&2
+        exit 1
+    fi
+
+    if [[ -z "$ARCH_NAME" ]]; then
+        echo "Must provide ARCH_NAME in environment" 1>&2
+        exit 1
+    fi
+
+    # Run all tests
+    cd $TT_METAL_HOME
+    export PYTHONPATH=$TT_METAL_HOME
+
+    run_tgg_tests
+}
+
+main "$@"
\ No newline at end of file
diff --git a/tests/scripts/tgg/run_tgg_model_perf_tests.sh b/tests/scripts/tgg/run_tgg_model_perf_tests.sh
new file mode 100755
index 000000000000..ebf6aa946dba
--- /dev/null
+++ b/tests/scripts/tgg/run_tgg_model_perf_tests.sh
@@ -0,0 +1,60 @@
+
+#!/bin/bash
+set -eo pipefail
+
+run_tgg_llm_tests() {
+    # Merge all the generated reports
+    env python models/perf/merge_perf_results.py
+}
+
+run_tgg_cnn_tests() {
+    # Merge all the generated reports
+    env python models/perf/merge_perf_results.py
+}
+
+main() {
+    # Parse the arguments
+    while [[ $# -gt 0 ]]; do
+        case $1 in
+            --pipeline-type)
+                pipeline_type=$2
+                shift
+                ;;
+            *)
+                echo "Unknown option: $1"
+                exit 1
+                ;;
+        esac
+        shift
+    done
+
+    if [[ -z "$TT_METAL_HOME" ]]; then
+        echo "Must provide TT_METAL_HOME in environment" 1>&2
+        exit 1
+    fi
+
+    if [[ -z "$ARCH_NAME" ]]; then
+        echo "Must provide ARCH_NAME in environment" 1>&2
+        exit 1
+    fi
+
+    if [[ -z "$pipeline_type" ]]; then
+        echo "--pipeline-type cannot be empty" 1>&2
+        exit 1
+    fi
+
+    # Run all tests
+    cd $TT_METAL_HOME
+    export PYTHONPATH=$TT_METAL_HOME
+
+    if [[ "$pipeline_type" == "llm_model_perf_tgg_device" ]]; then
+        run_tgg_llm_tests
+    elif [[ "$pipeline_type" == "cnn_model_perf_tgg_device" ]]; then
+        run_tgg_cnn_tests
+    else
+        echo "$pipeline_type is invalid (supported: [llm_model_perf_tgg_device, cnn_model_perf_tgg_device])" 1>&2
+        exit 1
+    fi
+}
+
+main "$@"
\ No newline at end of file
diff --git a/tests/scripts/tgg/run_tgg_unit_tests.sh b/tests/scripts/tgg/run_tgg_unit_tests.sh
index b8c209a22cb0..8f5130ea8d03 100755
--- a/tests/scripts/tgg/run_tgg_unit_tests.sh
+++ b/tests/scripts/tgg/run_tgg_unit_tests.sh
@@ -25,4 +25,4 @@ main() {
     run_tgg_tests
 }
 
-main "$@"
+main "$@"
\ No newline at end of file

From d752ca1f5fcc75288b7069b8a6518030dedc9e2c Mon Sep 17 00:00:00 2001
From: Kyle Mabee
Date: Sun, 5 May 2024 18:53:52 +0000
Subject: [PATCH 24/40] #0: Change CQ_DISPATCH_CMD_WAIT(clear_count=1) impl in
 cq_dispatch.cpp

- Use noc_semaphore_inc(neg_val) + noc_async_atomic_barrier() as per Paul
  feedback, instead of the ~unsafe direct set to the ptr value

---
 tt_metal/impl/dispatch/kernels/cq_dispatch.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp
index fd9acf990054..9462dc87b211 100644
--- a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp
+++ b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp
@@ -673,7 +673,9 @@ static void process_wait() {
     DEBUG_STATUS("PWD");
 
     if (clear_count) {
-        *sem_addr = 0;
+        uint32_t neg_sem_val = -(*sem_addr);
+        noc_semaphore_inc(get_noc_addr_helper(my_noc_xy, addr), neg_sem_val);
+        noc_async_atomic_barrier();
     }
 
     if (notify_prefetch) {

From cc8a0b9e1c252f5dc2fd247f39fb74d0ccfd0189 Mon Sep 17 00:00:00 2001
From: Kyle Mabee
Date: Fri, 26 Apr 2024 20:23:20 +0000
Subject: [PATCH 25/40] #8062: Add Prefetcher Stall flag used by
 CQ_PREFETCH_CMD_EXEC_BUF (FetchQ MSB)

- Host sets MSB of FetchQ entry on ExecBuf cmd to denote that prefetcher
  should stall and not fetch any more cmds, since ExecBuf will read
  TraceBuffer data and write to CmdDataQ, which would clobber subsequently
  fetched cmds. Remove the previous "ugly hack" that was doing a similar
  thing.
- On STALL_NEXT, barrier/wait for the fetched cmd requesting a stall to
  return, and increase the fence, before moving to the STALLED state; exit
  early in fetch_q_get_cmds() when STALLED.
- PR feedback and fix for NOT_STALLED setting.
- Add assert to make sure ExecBuf comes with stall_flag=true; it's required
  now that the ugly hack is removed, otherwise hang.
- Update test_prefetcher.cpp to set the stall flag (FetchQ MSB) for ExecBuf,
  otherwise hang. Needed to change cmd_sizes from uint16_t to uint32_t
  through the code, since the FetchQ entry type
  dispatch_constants::prefetch_q_entry_type is currently uint32_t, to be
  able to carry the MSB bit through properly.

---
 .../dispatch/test_prefetcher.cpp              | 106 ++++++++----------
 tt_metal/impl/dispatch/command_queue.cpp      |   4 +-
 .../impl/dispatch/command_queue_interface.hpp |  11 +-
 .../impl/dispatch/kernels/cq_prefetch.cpp     |  49 ++++++--
 4 files changed, 97 insertions(+), 73 deletions(-)

diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp
index cc16a4cca56b..c6dae75b4460 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp
@@ -227,7 +227,7 @@ void add_bare_prefetcher_cmd(vector<uint32_t>& cmds,
 }
 
 void add_prefetcher_paged_read_cmd(vector<uint32_t>& cmds,
-                                   vector<uint16_t>& sizes,
+                                   vector<uint32_t>& sizes,
                                    uint32_t start_page,
                                    uint32_t base_addr,
                                    uint32_t page_size,
@@ -253,7 +253,7 @@ void add_prefetcher_paged_read_cmd(vector<uint32_t>& cmds,
 
 void add_prefetcher_linear_read_cmd(Device *device,
                                     vector<uint32_t>& cmds,
-                                    vector<uint16_t>& sizes,
+                                    vector<uint32_t>& sizes,
                                     CoreCoord worker_core,
                                     uint32_t addr,
                                     uint32_t length) {
@@ -296,7 +296,7 @@ void add_prefetcher_debug_epilogue(vector<uint32_t>& cmds,
 }
 
 void add_prefetcher_cmd_to_hostq(vector<uint32_t>& cmds,
-                                 vector<uint16_t>& sizes,
+                                 vector<uint32_t>& sizes,
                                  const vector<uint32_t>& payload,
                                  size_t prior_end) {
     uint32_t cmd_size_bytes = (cmds.size() - prior_end) * sizeof(uint32_t);
@@ -315,7 +315,7 @@ void add_prefetcher_cmd_to_hostq(vector<uint32_t>& cmds,
 }
 
 void add_prefetcher_cmd(vector<uint32_t>& cmds,
-                        vector<uint16_t>& sizes,
+                        vector<uint32_t>& sizes,
                         CQPrefetchCmd cmd) {
 
     vector<uint32_t> empty_payload;
@@ -330,7 +330,7 @@ void add_prefetcher_cmd(vector<uint32_t>& cmds,
 }
 
 void add_prefetcher_cmd(vector<uint32_t>& cmds,
-                        vector<uint16_t>& sizes,
+                        vector<uint32_t>& sizes,
                        CQPrefetchCmdId id,
                        vector<uint32_t>& payload) {
 
@@ -421,7 +421,7 @@ void add_paged_dram_data_to_device_data(Device *device,
 
 // Interleaved/Paged Read of DRAM to Worker L1
 void gen_dram_read_cmd(Device *device,
                        vector<uint32_t>& prefetch_cmds,
-                       vector<uint16_t>& cmd_sizes,
+                       vector<uint32_t>& cmd_sizes,
                        DeviceData& device_data,
                        CoreCoord worker_core,
                        uint32_t start_page,
@@ -458,7 +458,7 @@ void gen_dram_read_cmd(Device *device,
 
 // Interleaved/Paged Write to DRAM.
 void gen_dram_write_cmd(Device *device,
                         vector<uint32_t>& prefetch_cmds,
-                        vector<uint16_t>& cmd_sizes,
+                        vector<uint32_t>& cmd_sizes,
                         DeviceData& device_data,
                         uint32_t start_page,
                         uint32_t page_size,
@@ -475,7 +475,7 @@ void gen_dram_write_cmd(Device *device,
 
 // This is pretty much a blit: copies from worker core's start of data back to the end of data
 void gen_linear_read_cmd(Device *device,
                          vector<uint32_t>& prefetch_cmds,
-                         vector<uint16_t>& cmd_sizes,
+                         vector<uint32_t>& cmd_sizes,
                          DeviceData& device_data,
                          CoreCoord worker_core,
                          uint32_t length,
@@ -504,7 +504,7 @@ void gen_linear_read_cmd(Device *device,
 
 void gen_wait_and_stall_cmd(Device *device,
                             vector<uint32_t>& prefetch_cmds,
-                            vector<uint16_t>& cmd_sizes) {
+                            vector<uint32_t>& cmd_sizes) {
 
     vector<uint32_t> dispatch_cmds;
@@ -523,7 +523,7 @@ void gen_wait_and_stall_cmd(Device *device,
 
 void gen_dispatcher_delay_cmd(Device *device,
                               vector<uint32_t>& prefetch_cmds,
-                              vector<uint16_t>& cmd_sizes,
+                              vector<uint32_t>& cmd_sizes,
                               uint32_t count) {
 
     vector<uint32_t> dispatch_cmds;
@@ -537,7 +537,7 @@ void gen_dispatcher_delay_cmd(Device *device,
 
 void gen_paged_read_dram_test(Device *device,
                               vector<uint32_t>& prefetch_cmds,
-                              vector<uint16_t>& cmd_sizes,
+                              vector<uint32_t>& cmd_sizes,
                               DeviceData& device_data,
                               CoreCoord worker_core) {
 
@@ -567,7 +567,7 @@ void gen_paged_read_dram_test(Device *device,
 
 // 3. Do previous 2 steps in a loop, reading and writing new data until DEVICE_DATA_SIZE bytes is written to worker core.
 void gen_paged_write_read_dram_test(Device *device,
                                     vector<uint32_t>& prefetch_cmds,
-                                    vector<uint16_t>& cmd_sizes,
+                                    vector<uint32_t>& cmd_sizes,
                                     DeviceData& device_data,
                                     CoreCoord worker_core,
                                     uint32_t dst_addr) {
@@ -601,7 +601,7 @@ void gen_paged_write_read_dram_test(Device *device,
 
 void gen_pcie_test(Device *device,
                    vector<uint32_t>& prefetch_cmds,
-                   vector<uint16_t>& cmd_sizes,
+                   vector<uint32_t>& cmd_sizes,
                    DeviceData& device_data,
                    CoreCoord worker_core) {
 
@@ -632,7 +632,7 @@ static void pad_host_data(DeviceData& device_data) {
 
 void gen_host_test(Device *device,
                    vector<uint32_t>& prefetch_cmds,
-                   vector<uint16_t>& cmd_sizes,
+                   vector<uint32_t>& cmd_sizes,
                    DeviceData& device_data) {
 
     constexpr uint32_t data_size = 614400;
@@ -668,7 +668,7 @@ void gen_host_test(Device *device,
 
 void gen_rnd_dram_paged_cmd(Device *device,
                             vector<uint32_t>& prefetch_cmds,
-                            vector<uint16_t>& cmd_sizes,
+                            vector<uint32_t>& cmd_sizes,
                             DeviceData& device_data,
                             CoreCoord worker_core) {
 
@@ -704,7 +704,7 @@ void gen_rnd_dram_paged_cmd(Device *device,
 
 void gen_rnd_inline_cmd(Device *device,
                         vector<uint32_t>& prefetch_cmds,
-                        vector<uint16_t>& cmd_sizes,
+                        vector<uint32_t>& cmd_sizes,
                         DeviceData& device_data,
                         CoreCoord worker_core) {
 
@@ -742,7 +742,7 @@ void gen_rnd_inline_cmd(Device *device,
 }
 
 void gen_rnd_debug_cmd(vector<uint32_t>& prefetch_cmds,
-                       vector<uint16_t>& cmd_sizes,
+                       vector<uint32_t>& cmd_sizes,
                        DeviceData& device_data) {
 
     vector<uint32_t> rnd_payload;
@@ -752,7 +752,7 @@ void gen_rnd_debug_cmd(vector<uint32_t>& prefetch_cmds,
 
 void gen_rnd_test(Device *device,
                   vector<uint32_t>& prefetch_cmds,
-                  vector<uint16_t>& cmd_sizes,
+                  vector<uint32_t>& cmd_sizes,
                   DeviceData& device_data) {
 
     while (device_data.size() * sizeof(uint32_t) < DEVICE_DATA_SIZE) {
@@ -787,7 +787,7 @@ void gen_rnd_test(Device *device,
 
 void gen_prefetcher_exec_buf_cmd_and_write_to_dram(Device *device,
                                                    vector<uint32_t>& prefetch_cmds,
                                                    vector<uint32_t> buf_cmds,
-                                                   vector<uint16_t>& cmd_sizes) {
+                                                   vector<uint32_t>& cmd_sizes) {
 
     vector<uint32_t> empty_payload; // don't give me grief, it is just a test
@@ -805,7 +805,7 @@ void gen_prefetcher_exec_buf_cmd_and_write_to_dram(Device *device,
     dcmd.write_linear.length = 16;
 
     vector<uint32_t> dispatch_cmds;
-    vector<uint16_t> empty_sizes;
+    vector<uint32_t> empty_sizes;
     add_bare_dispatcher_cmd(dispatch_cmds, dcmd);
     dispatch_cmds.push_back(1);
     dispatch_cmds.push_back(0);
@@ -857,11 +857,17 @@ void gen_prefetcher_exec_buf_cmd_and_write_to_dram(Device *device,
     cmd.exec_buf.pages = pages;
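    // (Illustrative aside, not part of the diff: exec_buf addresses its DRAM-resident
    // command stream in pages, so e.g. a 6000 B stream with hypothetical 4096 B pages
    // gives pages = (6000 + 4096 - 1) / 4096 = 2, and the prefetcher later streams
    // exactly that many pages back from DRAM when it executes the buffer.)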
     add_prefetcher_cmd(prefetch_cmds, cmd_sizes, cmd);
+
+    // CQ_PREFETCH_CMD_EXEC_BUF command requires stall_prefetcher flag to be set. This is MSB on FetchQ entry
+    // Hacky, but set it here, on the last cmd_size (FetchQ entry write, later)
+    const bool stall_prefetcher = true;
+    cmd_sizes[cmd_sizes.size() - 1] |= (stall_prefetcher << ((sizeof(dispatch_constants::prefetch_q_entry_type) * 8) - 1));
+
 }
 
 void gen_smoke_test(Device *device,
                     vector<uint32_t>& prefetch_cmds,
-                    vector<uint16_t>& cmd_sizes,
+                    vector<uint32_t>& cmd_sizes,
                     DeviceData& device_data,
                     CoreCoord worker_core) {
 
@@ -1028,7 +1034,7 @@ void gen_smoke_test,
 
 void gen_prefetcher_cmds(Device *device,
                          vector<uint32_t>& prefetch_cmds,
-                         vector<uint16_t>& cmd_sizes,
+                         vector<uint32_t>& cmd_sizes,
                          DeviceData& device_data,
                          uint32_t dst_addr) {
 
@@ -1064,7 +1070,7 @@ void gen_prefetcher_cmds(Device *device,
 }
 
 void gen_terminate_cmds(vector<uint32_t>& prefetch_cmds,
-                        vector<uint16_t>& cmd_sizes) {
+                        vector<uint32_t>& cmd_sizes) {
 
     vector<uint32_t> empty_payload; // don't give me grief, it is just a test
     vector<uint32_t> dispatch_cmds;
@@ -1100,15 +1106,10 @@ void nt_memcpy(uint8_t *__restrict dst, const uint8_t * __restrict src, size_t n
     _mm_sfence();
 }
 
-// how_many is an experiment w/ writing multiple FetchQ entries w/ one PCIe write
-// code here is set up to let it be either 1 or 2 and presently relies on
-// FetchQ entries being 16bits, though should be generalized
-// disabled for now pending other changes/fixes
 void write_prefetcher_cmd(Device *device,
                           vector<uint32_t>& cmds,
                           uint32_t& cmd_offset,
-                          uint16_t* cmd_sizes16b,
-                          uint32_t& how_many,
+                          uint32_t& cmd_size16b,
                           uint32_t*& host_mem_ptr,
                           uint32_t& prefetch_q_dev_ptr,
                           uint32_t& prefetch_q_dev_fence,
@@ -1134,31 +1135,24 @@ void write_prefetcher_cmd(Device *device,
         }
     }
 
-    if (prefetch_q_dev_ptr == prefetch_q_base + (prefetch_q_entries_g - 1) * sizeof(dispatch_constants::prefetch_q_entry_type)) {
-        how_many = 1;
-    }
-
-    uint16_t* cmd_sizes_tmp = cmd_sizes16b;
-    for (int i = 0; i < how_many; i++) {
-        uint32_t cmd_size_bytes = (uint32_t)*cmd_sizes_tmp << dispatch_constants::PREFETCH_Q_LOG_MINSIZE;
-        cmd_sizes_tmp++;
-        uint32_t cmd_size_words = cmd_size_bytes / sizeof(uint32_t);
+    constexpr uint32_t prefetch_q_msb_mask = (1 << ((sizeof(dispatch_constants::prefetch_q_entry_type) * 8) - 1));
+    uint32_t cmd_size_bytes = (cmd_size16b & ~prefetch_q_msb_mask) << dispatch_constants::PREFETCH_Q_LOG_MINSIZE;
+    uint32_t cmd_size_words = cmd_size_bytes / sizeof(uint32_t);
 
-        nt_memcpy((uint8_t *)host_mem_ptr, (uint8_t *)&cmds[cmd_offset], cmd_size_bytes);
-        cmd_offset += cmd_size_words;
-        host_mem_ptr += cmd_size_words;
-    }
+    nt_memcpy((uint8_t *)host_mem_ptr, (uint8_t *)&cmds[cmd_offset], cmd_size_bytes);
+    cmd_offset += cmd_size_words;
+    host_mem_ptr += cmd_size_words;
 
-    uint32_t cmd_size16b = (how_many == 1) ? cmd_sizes16b[0] : ((cmd_sizes16b[1] << 16) | cmd_sizes16b[0]);
+    // This updates FetchQ where each entry of type prefetch_q_entry_type is size in 16B.
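    // (Worked example, illustrative: a 96 B command with the stall flag set encodes as
    // (96 >> dispatch_constants::PREFETCH_Q_LOG_MINSIZE) | (1u << 31) = 6 | 0x80000000
    // = 0x80000006; the prefetcher masks off the MSB to recover 6 * 16 B = 96 B and
    // latches STALL_NEXT before issuing the read.)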
     tt::Cluster::instance().write_reg(&cmd_size16b, tt_cxy_pair(device->id(), phys_prefetch_core), prefetch_q_dev_ptr);
-    prefetch_q_dev_ptr += sizeof(dispatch_constants::prefetch_q_entry_type) * how_many;
+    prefetch_q_dev_ptr += sizeof(dispatch_constants::prefetch_q_entry_type);
 }
 
 void write_prefetcher_cmds(uint32_t iterations,
                            Device *device,
                            vector<uint32_t> prefetch_cmds, // yes copy for dram_exec_buf
-                           vector<uint16_t>& cmd_sizes,
+                           vector<uint32_t>& cmd_sizes,
                            void * host_hugepage_base,
                            uint32_t dev_hugepage_base,
                            uint32_t prefetch_q_base,
@@ -1194,19 +1188,7 @@ void write_prefetcher_cmds(uint32_t iterations,
     for (uint32_t i = 0; i < iterations; i++) {
         uint32_t cmd_ptr = 0;
-        uint32_t how_many = 0;
-        for (uint32_t j = 0; j < cmd_sizes.size(); j += how_many) {
-            // how_many = (j == cmd_sizes.size() - 1) ? 1 : 2;
-            how_many = 1; // see comment above, experiment on hold
-            if (how_many == 2) {
-                uint32_t cmd_size = cmd_sizes[j] + cmd_sizes[j+1];
-                uint32_t cmd_size_words = ((uint32_t)cmd_size << dispatch_constants::PREFETCH_Q_LOG_MINSIZE) / sizeof(uint32_t);
-
-                if ((void *)(host_mem_ptr + cmd_size_words) > (void *)((uint8_t *)host_hugepage_base + hugepage_issue_buffer_size_g)) {
-                    how_many = 1;
-                }
-            }
-
+        for (uint32_t j = 0; j < cmd_sizes.size(); j++) {
             uint32_t cmd_size_words = ((uint32_t)cmd_sizes[j] << dispatch_constants::PREFETCH_Q_LOG_MINSIZE) / sizeof(uint32_t);
             uint32_t space_at_end_for_wrap_words = CQ_PREFETCH_CMD_BARE_MIN_SIZE / sizeof(uint32_t);
             if ((void *)(host_mem_ptr + cmd_size_words) > (void *)((uint8_t *)host_hugepage_base + hugepage_issue_buffer_size_g)) {
@@ -1215,7 +1197,7 @@ void write_prefetcher_cmds(uint32_t iterations,
                 host_mem_ptr = (uint32_t *)host_hugepage_base;
             }
 
-            write_prefetcher_cmd(device, prefetch_cmds, cmd_ptr, &cmd_sizes[j], how_many,
+            write_prefetcher_cmd(device, prefetch_cmds, cmd_ptr, cmd_sizes[j],
                                  host_mem_ptr, prefetch_q_dev_ptr, prefetch_q_dev_fence, prefetch_q_base, prefetch_q_rd_ptr_addr, phys_prefetch_core);
         }
     }
@@ -1244,8 +1226,8 @@ std::chrono::duration<double> run_test(uint32_t iterations,
                                        Program& program,
                                        Device *device_r,
                                        Program& program_r,
-                                       vector<uint16_t>& cmd_sizes,
-                                       vector<uint16_t>& terminate_sizes,
+                                       vector<uint32_t>& cmd_sizes,
+                                       vector<uint32_t>& terminate_sizes,
                                        vector<uint32_t>& cmds,
                                        vector<uint32_t>& terminate_cmds,
                                        void * host_hugepage_base,
@@ -2640,7 +2622,7 @@ int main(int argc, char **argv) {
     log_info(LogTest, "Iterations: {}", iterations_g);
 
     vector<uint32_t> cmds, terminate_cmds;
-    vector<uint16_t> cmd_sizes, terminate_sizes;
+    vector<uint32_t> cmd_sizes, terminate_sizes;
     DeviceData device_data(device, all_workers_g, l1_buf_base_g, DRAM_DATA_BASE_ADDR, (uint32_t*)host_hugepage_completion_buffer_base_g, false, DRAM_DATA_SIZE_WORDS);
     num_dram_banks_g = device->num_banks(BufferType::DRAM);
 
diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp
index 27b4e03d929b..9cb033ef51f9 100644
--- a/tt_metal/impl/dispatch/command_queue.cpp
+++ b/tt_metal/impl/dispatch/command_queue.cpp
@@ -1115,7 +1115,9 @@ void EnqueueTraceCommand::process() {
 
     this->manager.fetch_queue_reserve_back(this->command_queue_id);
 
-    this->manager.fetch_queue_write(cmd_sequence_sizeB, this->command_queue_id);
+    const bool stall_prefetcher = true;
+    this->manager.fetch_queue_write(cmd_sequence_sizeB, this->command_queue_id, stall_prefetcher);
+
     // log_trace(LogDispatch, "EnqueueTraceCommand issued write_ptr={}, fetch_size={}, commands={}", write_ptr, fetch_size_bytes, this->commands);
 }
 
diff --git a/tt_metal/impl/dispatch/command_queue_interface.hpp b/tt_metal/impl/dispatch/command_queue_interface.hpp
index 411c01eda21c..d3dfad5cc801 100644
--- a/tt_metal/impl/dispatch/command_queue_interface.hpp
+++ b/tt_metal/impl/dispatch/command_queue_interface.hpp
@@ -590,7 +590,7 @@ class SystemMemoryManager {
         }
     }
 
-    void fetch_queue_write(uint32_t command_size_B, const uint8_t cq_id) {
+    void fetch_queue_write(uint32_t command_size_B, const uint8_t cq_id, bool stall_prefetcher = false) {
         CoreType dispatch_core_type = dispatch_core_manager::get(this->num_hw_cqs).get_dispatch_core_type(this->device_id);
         uint32_t max_command_size_B = dispatch_constants::get(dispatch_core_type).max_prefetch_command_size();
         TT_ASSERT(command_size_B <= max_command_size_B, "Generated prefetcher command of size {} B exceeds max command size {} B", command_size_B, max_command_size_B);
@@ -598,6 +598,15 @@ class SystemMemoryManager {
         if (this->bypass_enable) return;
         tt_driver_atomics::sfence();
         uint32_t command_size_16B = command_size_B >> dispatch_constants::PREFETCH_Q_LOG_MINSIZE;
+
+        // stall_prefetcher is used when enqueuing traces: replaying a trace will hijack the cmd_data_q,
+        // so if the prefetcher fetched more cmds beyond the trace cmd, they would be corrupted by the
+        // trace pulling data from DRAM. The stall flag prevents pulling prefetch q entries that occur
+        // after the stall entry. The stall flag for the prefetcher is the MSB of the FetchQ entry.
+        if (stall_prefetcher) {
+            command_size_16B |= (1 << ((sizeof(dispatch_constants::prefetch_q_entry_type) * 8) - 1));
+        }
+
         tt::Cluster::instance().write_reg(&command_size_16B, this->prefetcher_cores[cq_id], this->prefetch_q_dev_ptrs[cq_id]);
         this->prefetch_q_dev_ptrs[cq_id] += sizeof(dispatch_constants::prefetch_q_entry_type);
     }
 
diff --git a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp
index 5e3fc910beeb..8210278c729f 100644
--- a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp
+++ b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp
@@ -81,6 +81,9 @@ static struct PrefetchExecBufState {
     uint32_t length;
 } exec_buf_state;
 
+// Feature to stall the prefetcher, mainly for ExecBuf impl which reuses CmdDataQ
+static enum StallState { STALL_NEXT = 2, STALLED = 1, NOT_STALLED = 0 } stall_state = NOT_STALLED;
+
 static_assert((downstream_cb_base & (downstream_cb_page_size - 1)) == 0);
 
 template FORCE_INLINE
 void read_from_pcie(volatile tt_l1_ptr uint32_t *& prefetch_q_rd_ptr,
@@ -176,6 +188,12 @@ void fetch_q_get_cmds(uint32_t& fence, uint32_t& cmd_ptr, uint32_t& pcie_read_pt
     static uint32_t pending_read_size = 0;
     static volatile tt_l1_ptr uint32_t* prefetch_q_rd_ptr = (volatile tt_l1_ptr uint32_t*)prefetch_q_base;
+    constexpr uint32_t prefetch_q_msb_mask = 1u << 31; // dispatch_constants::prefetch_q_entry_type is 32 bit.
+
+    if (stall_state == STALLED) {
+        ASSERT(pending_read_size == 0); // Before stalling, fetch must have been completed.
+        return;
+    }

     DPRINT << "fetch_q_get_cmds: " << cmd_ptr << " " << fence << ENDL();
     if (fence < cmd_ptr) {
@@ -184,11 +202,18 @@ void fetch_q_get_cmds(uint32_t& fence, uint32_t& cmd_ptr, uint32_t& pcie_read_pt
     }

     bool cmd_ready = (cmd_ptr != fence);
-    uint32_t fetch_size = (uint32_t)*prefetch_q_rd_ptr << prefetch_q_log_minsize;
+
+    uint32_t prefetch_q_rd_ptr_local = *prefetch_q_rd_ptr;
+    uint32_t fetch_size = (prefetch_q_rd_ptr_local & ~prefetch_q_msb_mask) << prefetch_q_log_minsize;
+    bool stall_flag = (prefetch_q_rd_ptr_local & prefetch_q_msb_mask) != 0;
+    stall_state = static_cast<StallState>(stall_flag << 1); // NOT_STALLED -> STALL_NEXT if stall_flag is set

     if (fetch_size != 0 && pending_read_size == 0) {
         read_from_pcie
             (prefetch_q_rd_ptr, pending_read_size, fence, pcie_read_ptr, cmd_ptr, fetch_size);
+        if (stall_state == STALL_NEXT) {
+            barrier_and_stall(pending_read_size, fence); // STALL_NEXT -> STALLED
+        }
     }
     if (!cmd_ready) {
         if (pending_read_size != 0) {
@@ -203,14 +228,18 @@ void fetch_q_get_cmds(uint32_t& fence, uint32_t& cmd_ptr, uint32_t& pcie_read_pt
             fence += pending_read_size;
             pending_read_size = 0;

-            // Ugly hack for now. Snoops the command, don't fetch the next if we are doing an exec_buf
-            volatile CQPrefetchCmd tt_l1_ptr *cmd = (volatile CQPrefetchCmd tt_l1_ptr *)cmd_ptr;
-            if (cmd->base.cmd_id != CQ_PREFETCH_CMD_EXEC_BUF) {
-                // After the stall, re-check the host
-                fetch_size = (uint32_t)*prefetch_q_rd_ptr << prefetch_q_log_minsize;
-                if (fetch_size != 0) {
-                    read_from_pcie
-                        (prefetch_q_rd_ptr, pending_read_size, fence, pcie_read_ptr, cmd_ptr, fetch_size);
+            // After the stall, re-check the host
+            prefetch_q_rd_ptr_local = *prefetch_q_rd_ptr;
+            fetch_size = (prefetch_q_rd_ptr_local & ~prefetch_q_msb_mask) << prefetch_q_log_minsize;
+
+            if (fetch_size != 0) {
+                stall_flag = (prefetch_q_rd_ptr_local & prefetch_q_msb_mask) != 0;
+                stall_state = static_cast<StallState>(stall_flag << 1); // NOT_STALLED -> STALL_NEXT if stall_flag is set
+
+                read_from_pcie
+                    (prefetch_q_rd_ptr, pending_read_size, fence, pcie_read_ptr, cmd_ptr, fetch_size);
+                if (stall_state == STALL_NEXT) {
+                    barrier_and_stall(pending_read_size, fence); // STALL_NEXT -> STALLED
                 }
             }
         } else {
@@ -817,7 +846,9 @@ bool process_cmd(uint32_t& cmd_ptr,
     case CQ_PREFETCH_CMD_EXEC_BUF:
         DPRINT << "exec buf: " << cmd_ptr << ENDL();
         ASSERT(!exec_buf);
+        ASSERT(stall_state == STALLED); // ExecBuf must be preceded by a stall
         stride = process_exec_buf_cmd(cmd_ptr, downstream_data_ptr);
+        stall_state = NOT_STALLED; // Stall is no longer required after ExecBuf finishd.
         break;

     case CQ_PREFETCH_CMD_EXEC_BUF_END:

From a34576419ddb3c1947250d67bdd8a4d36fab9021 Mon Sep 17 00:00:00 2001
From: Kyle Mabee
Date: Wed, 8 May 2024 01:13:22 +0000
Subject: [PATCH 26/40] #8062: Fix for test_prefetcher split_prefetcher ExecBuf
 issues

- Hang (assert with watcher) because the STALL state wasn't seen when handling ExecBuf
- Don't know if this is the correct fix, but it seems to work...
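
For context, the stall handshake that the previous patch introduced (and this
patch relaxes for the split prefetcher) reduces to roughly the following
host/device pair. This is a minimal illustrative sketch, not the literal
tt-metal code; encode_entry and consume_entry are invented names:

    #include <cstdint>

    // A FetchQ entry is the command size in 16 B units; the MSB is the stall flag.
    enum StallState : uint32_t { NOT_STALLED = 0, STALLED = 1, STALL_NEXT = 2 };
    constexpr uint32_t kStallMask = 1u << 31;  // MSB of a 32-bit FetchQ entry

    // Host side: what fetch_queue_write does to the entry when stall_prefetcher is set.
    uint32_t encode_entry(uint32_t command_size_16B, bool stall_prefetcher) {
        return stall_prefetcher ? (command_size_16B | kStallMask) : command_size_16B;
    }

    // Device side: decode one entry and advance the state machine. While STALLED,
    // fetch_q_get_cmds returns early and fetches nothing more; the
    // CQ_PREFETCH_CMD_EXEC_BUF handler clears the state back to NOT_STALLED.
    void consume_entry(uint32_t entry, StallState& stall_state) {
        uint32_t fetch_size_16B = entry & ~kStallMask;
        bool stall_flag = (entry & kStallMask) != 0;
        stall_state = static_cast<StallState>(stall_flag << 1);  // 0 -> NOT_STALLED, 1 -> STALL_NEXT
        // ... read fetch_size_16B * 16 bytes from PCIe ...
        if (stall_state == STALL_NEXT) {
            stall_state = STALLED;  // entered after barrier_and_stall completes
        }
        (void)fetch_size_16B;
    }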
---
 tt_metal/impl/dispatch/kernels/cq_prefetch.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp
index 8210278c729f..ef200e75630c 100644
--- a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp
+++ b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp
@@ -846,9 +846,11 @@ bool process_cmd(uint32_t& cmd_ptr,
     case CQ_PREFETCH_CMD_EXEC_BUF:
         DPRINT << "exec buf: " << cmd_ptr << ENDL();
         ASSERT(!exec_buf);
-        ASSERT(stall_state == STALLED); // ExecBuf must be preceded by a stall
+        if (is_h_variant) {
+            ASSERT(stall_state == STALLED); // ExecBuf must be preceded by a prefetcher stall
+        }
         stride = process_exec_buf_cmd(cmd_ptr, downstream_data_ptr);
-        stall_state = NOT_STALLED; // Stall is no longer required after ExecBuf finishd.
+        stall_state = NOT_STALLED; // Stall is no longer required after ExecBuf finished.
         break;

     case CQ_PREFETCH_CMD_EXEC_BUF_END:
@@ -1014,6 +1016,7 @@ void kernel_main_h() {
         if (cmd_id == CQ_PREFETCH_CMD_EXEC_BUF) {
             DPRINT << "exec buf\n";
             process_exec_buf_cmd_h();
+            stall_state = NOT_STALLED; // Stall is no longer required after ExecBuf finished
         } else if (cmd_id == CQ_PREFETCH_CMD_TERMINATE) {
             DPRINT << "prefetch terminating_" << is_h_variant << is_d_variant << ENDL();;
             done = true;

From 4518ebc9487a83f9c1ea2dbb85e8a8327c655b5c Mon Sep 17 00:00:00 2001
From: Austin Ho
Date: Thu, 9 May 2024 16:08:50 +0000
Subject: [PATCH 27/40] #0: Preallocate trace buffer and remove trace
 owned_pool

Update device trace cmds to take in cq_id; remove the multi-device APIs.
Add tracing tests for metal Resnet50. TODO: clean up/reuse code.
Disable allocations after capturing trace.
Update trace APIs to return/take in a trace id. Make the device own the
TraceBuffer mapping.
Remove the trace APIs that allowed users to create Trace objects directly.

#8383: End any active traces during device close and assert tracing is not
enabled for the terminate cmd
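
For reference, the reworked capture/replay lifecycle condenses to the
following sketch, drawn from the updated common_fixture.hpp and
test_EnqueueTrace.cpp below (not a complete program; it assumes an already
compiled Program, and the 2048-byte trace buffer size is illustrative):

    // Capture once, replay many times; the device owns the trace buffer.
    void trace_roundtrip(tt::tt_metal::Device* device, tt::tt_metal::Program& program) {
        CommandQueue& cq = device->command_queue();
        EnqueueProgram(cq, program, false);                       // compile/cache the program first
        uint32_t tid = BeginTraceCapture(device, cq.id(), 2048);  // returns a trace id
        EnqueueProgram(cq, program, false);                       // recorded into the trace
        EndTraceCapture(device, cq.id(), tid);
        ReplayTrace(device, cq.id(), tid, /*blocking=*/false);    // or: EnqueueTrace(cq, tid, false)
        Finish(cq);
        ReleaseTrace(device, tid);                                // free the preallocated buffer
    }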
---
 docs/aspell-dictionary.pws                    |   7 +-
 .../host_apis/command_queue/BeginTrace.rst    |   4 -
 .../command_queue/BeginTraceCapture.rst       |   4 +
 .../apis/host_apis/command_queue/EndTrace.rst |   4 -
 .../command_queue/EndTraceCapture.rst         |   4 +
 .../command_queue/InstantiateTrace.rst        |   4 -
 .../host_apis/command_queue/ReleaseTrace.rst  |   4 +
 .../host_apis/command_queue/ReplayTrace.rst   |   4 +
 .../host_apis/command_queue/command_queue.rst |   7 +-
 .../demos/resnet/tests/test_metal_resnet50.py | 120 +++
 models/demos/resnet/tests/test_perf_resnet.py | 163 +++-
 models/demos/resnet/tt/metalResnetBlock50.py  |  14 +
 tests/scripts/nightly/run_gs_only.sh          |   6 +-
 .../trace_testing/misc/test_average_pool.py   |  50 +-
 .../trace_testing/misc/test_bert_ops.py       | 706 ++----------------
 .../common/common_fixture.hpp                 |  17 +-
 .../command_queue/test_EnqueueTrace.cpp       | 242 +-----
 .../command_queue/test_EnqueueTrace.cpp       |  49 +-
 tt_eager/tt_lib/csrc/tt_lib_bindings.cpp      |  10 +-
 tt_metal/detail/tt_metal.hpp                  |  16 +-
 tt_metal/host_api.hpp                         |  60 +-
 tt_metal/impl/allocator/allocator.cpp         |   9 +
 tt_metal/impl/allocator/allocator.hpp         |   6 +
 tt_metal/impl/device/device.cpp               |  93 +--
 tt_metal/impl/device/device.hpp               |  14 +-
 tt_metal/impl/dispatch/command_queue.cpp      |  97 +--
 tt_metal/impl/dispatch/command_queue.hpp      |  25 +-
 tt_metal/impl/program/program.hpp             |   1 +
 tt_metal/impl/trace/trace.cpp                 | 155 +---
 tt_metal/impl/trace/trace.hpp                 | 112 +--
 tt_metal/impl/trace/trace_buffer.hpp          |  32 +
 tt_metal/tt_metal.cpp                         |  60 +-
 32 files changed, 738 insertions(+), 1361 deletions(-)
 delete mode 100644 docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/BeginTrace.rst
 create mode 100644 docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/BeginTraceCapture.rst
 delete mode 100644 docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/EndTrace.rst
 create mode 100644 docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/EndTraceCapture.rst
 delete mode 100644 docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/InstantiateTrace.rst
 create mode 100644 docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/ReleaseTrace.rst
 create mode 100644 docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/ReplayTrace.rst
 create mode 100644 tt_metal/impl/trace/trace_buffer.hpp

diff --git a/docs/aspell-dictionary.pws b/docs/aspell-dictionary.pws
index b1206699cb20..bc614816e2a5 100644
--- a/docs/aspell-dictionary.pws
+++ b/docs/aspell-dictionary.pws
@@ -12,7 +12,7 @@ BRISC
 BRISCs
 BUF
 BUFs
-BeginTrace
+BeginTraceCapture
 BertIntermediate
 BinaryOpType
 BorrowedStorage
@@ -62,7 +62,7 @@ DumpDeviceProfileResults
 ENDL
 ETH
 EltwiseUnary
-EndTrace
+EndTraceCapture
 EnqueueProgram
 EnqueueReadBuffer
 EnqueueRecordEvent
@@ -84,7 +84,6 @@ Grayskull
 HW
 HiFi
 HostDataType
-InstantiateTrace
 InterleavedBufferConfig
 Jupyter
 KernelHandle
@@ -136,6 +135,8 @@ RISCVs
 RISCs
 ReadFromDevice
 ReduceDim
+RepeatTrace
+ReplayTrace
 ResNet
 RuntimeArgs
 SETPRECISION
diff --git a/docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/BeginTrace.rst b/docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/BeginTrace.rst
deleted file mode 100644
index 363760a71222..000000000000
--- a/docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/BeginTrace.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-BeginTrace
-==========
-
-.. 
doxygenfunction:: BeginTrace(Trace &trace) diff --git a/docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/BeginTraceCapture.rst b/docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/BeginTraceCapture.rst new file mode 100644 index 000000000000..2b09d801459c --- /dev/null +++ b/docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/BeginTraceCapture.rst @@ -0,0 +1,4 @@ +BeginTraceCapture +================= + +.. doxygenfunction:: BeginTraceCapture diff --git a/docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/EndTrace.rst b/docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/EndTrace.rst deleted file mode 100644 index 4d28c71e3d65..000000000000 --- a/docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/EndTrace.rst +++ /dev/null @@ -1,4 +0,0 @@ -EndTrace -======== - -.. doxygenfunction:: EndTrace(Trace &trace) diff --git a/docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/EndTraceCapture.rst b/docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/EndTraceCapture.rst new file mode 100644 index 000000000000..c00dc574e46b --- /dev/null +++ b/docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/EndTraceCapture.rst @@ -0,0 +1,4 @@ +EndTraceCapture +=============== + +.. doxygenfunction:: EndTraceCapture diff --git a/docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/InstantiateTrace.rst b/docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/InstantiateTrace.rst deleted file mode 100644 index 0ce3019a44dc..000000000000 --- a/docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/InstantiateTrace.rst +++ /dev/null @@ -1,4 +0,0 @@ -InstantiateTrace -================ - -.. doxygenfunction:: InstantiateTrace(Trace &trace, CommandQueue &cq) diff --git a/docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/ReleaseTrace.rst b/docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/ReleaseTrace.rst new file mode 100644 index 000000000000..e61728730bc4 --- /dev/null +++ b/docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/ReleaseTrace.rst @@ -0,0 +1,4 @@ +ReleaseTrace +============ + +.. doxygenfunction:: ReleaseTrace diff --git a/docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/ReplayTrace.rst b/docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/ReplayTrace.rst new file mode 100644 index 000000000000..457f5d42c4c8 --- /dev/null +++ b/docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/ReplayTrace.rst @@ -0,0 +1,4 @@ +ReplayTrace +=========== + +.. 
doxygenfunction:: ReplayTrace diff --git a/docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/command_queue.rst b/docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/command_queue.rst index d8ecc3f7736f..a42994d884f9 100644 --- a/docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/command_queue.rst +++ b/docs/source/tt-metalium/tt_metal/apis/host_apis/command_queue/command_queue.rst @@ -9,8 +9,9 @@ CommandQueue EnqueueWaitForEvent EventQuery EventSynchronize - BeginTrace - EndTrace - InstantiateTrace + BeginTraceCapture + EndTraceCapture + ReplayTrace + ReleaseTrace EnqueueTrace Finish diff --git a/models/demos/resnet/tests/test_metal_resnet50.py b/models/demos/resnet/tests/test_metal_resnet50.py index 525bb86e6bbb..b1791bbf0ee8 100644 --- a/models/demos/resnet/tests/test_metal_resnet50.py +++ b/models/demos/resnet/tests/test_metal_resnet50.py @@ -219,3 +219,123 @@ def test_run_resnet50_inference( passing_pcc, _ = comp_pcc(torch_output, tt_output, pcc=valid_pcc) assert passing_pcc # assert passing # fails because of torch.allclose + + +@skip_for_wormhole_b0("This test is not supported on WHB0, please use the TTNN version.") +@pytest.mark.parametrize("device_l1_small_size", [24576], indirect=True) +@pytest.mark.parametrize("batch_size", [1, 2, 16, 20], ids=["batch_1", "batch_2", "batch_16", "batch_20"]) +@pytest.mark.parametrize( + "weights_dtype", + [tt_lib.tensor.DataType.BFLOAT16, tt_lib.tensor.DataType.BFLOAT8_B], + ids=["weights_BFLOAT16", "weights_BFLOAT8_B"], +) +@pytest.mark.parametrize( + "activations_dtype", + [tt_lib.tensor.DataType.BFLOAT16, tt_lib.tensor.DataType.BFLOAT8_B], + ids=["activations_BFLOAT16", "activations_BFLOAT8_B"], +) +@pytest.mark.parametrize( + "math_fidelity", + [tt_lib.tensor.MathFidelity.HiFi4, tt_lib.tensor.MathFidelity.HiFi2, tt_lib.tensor.MathFidelity.LoFi], + ids=["HiFi4", "HiFi2", "LoFi"], +) +def test_run_resnet50_trace_inference( + device, use_program_cache, batch_size, weights_dtype, activations_dtype, math_fidelity, imagenet_sample_input +): + if is_e75(device): + pytest.skip("Resnet50 is not supported on E75") + + if batch_size > 8 and ( + activations_dtype != tt_lib.tensor.DataType.BFLOAT8_B or weights_dtype != tt_lib.tensor.DataType.BFLOAT8_B + ): + pytest.skip("Batch > 8 must be run fully bfp8") + if batch_size <= 2: + pytest.skip("batch 1 and 2 are not supported with sharded data") + image1 = imagenet_sample_input + image = image1 + model_config = { + "MATH_FIDELITY": math_fidelity, + "WEIGHTS_DTYPE": weights_dtype, + "ACTIVATIONS_DTYPE": activations_dtype, + } + for i in range(batch_size - 1): + image = torch.cat((image, image1), dim=0) + with torch.no_grad(): + torch.manual_seed(1234) + + tt_lib.device.EnableMemoryReports() + + torch_resnet50 = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1) + torch_resnet50.eval() + + state_dict = torch_resnet50.state_dict() + storage_in_dram = False + sharded = False + if batch_size >= 8: + sharded = True + # run once to compile ops + tt_resnet50 = ResNet( + Bottleneck, + [3, 4, 6, 3], + device=device, + state_dict=state_dict, + base_address="", + fold_batchnorm=True, + storage_in_dram=storage_in_dram, + batch_size=batch_size, + model_config=model_config, + sharded=sharded, + ) + + torch_output = torch_resnet50(image).unsqueeze(1).unsqueeze(1) + interleaved_mem_config_DRAM = tt_lib.tensor.MemoryConfig( + memory_layout=tt_lib.tensor.TensorMemoryLayout.INTERLEAVED, + buffer_type=tt_lib.tensor.BufferType.DRAM, + ) + + tt_image_res = 
tt_resnet50.preprocessing(image).to(device, interleaved_mem_config_DRAM) + + # Compile + tt_resnet50(tt_image_res) + # Trace + tid = tt_lib.device.BeginTraceCapture(device, 0, 1304576) + tt_output_res = tt_resnet50(tt_image_res) + tt_lib.device.EndTraceCapture(device, 0, tid) + + tt_lib.device.ReplayTrace(device, 0, tid, True) + + tt_output = tt_output_res.cpu().to_torch().to(torch.float) + + # # run again to measure end to end perf + # start_time = datetime.now() + # tt_output = tt_resnet50(image) + # end_time = datetime.now() + # diff = end_time - start_time + # logger.info("End to end time (microseconds))", diff.microseconds) + # throughput_fps = (float) (1000000 / diff.microseconds) + # logger.info("Throughput (fps)", throughput_fps) + + _, _, _, info = get_atol_rtol_pcc(torch_output, tt_output) + logger.info(info) + + valid_pcc = 1.0 + if batch_size >= 8: + valid_pcc = golden_pcc[batch_size][ + (model_config["MATH_FIDELITY"], model_config["WEIGHTS_DTYPE"], model_config["ACTIVATIONS_DTYPE"]) + ] + else: + if model_config["ACTIVATIONS_DTYPE"] == tt_lib.tensor.DataType.BFLOAT8_B: + if model_config["MATH_FIDELITY"] == tt_lib.tensor.MathFidelity.LoFi: + valid_pcc = 0.87 + else: + valid_pcc = 0.94 + else: + if model_config["MATH_FIDELITY"] == tt_lib.tensor.MathFidelity.LoFi: + valid_pcc = 0.93 + else: + valid_pcc = 0.982 + passing_pcc, _ = comp_pcc(torch_output, tt_output, pcc=valid_pcc) + assert passing_pcc + # assert passing # fails because of torch.allclose + # Done with the trace, can deallocate the buffers now. + tt_lib.device.ReleaseTrace(device, tid) diff --git a/models/demos/resnet/tests/test_perf_resnet.py b/models/demos/resnet/tests/test_perf_resnet.py index f6ddb18a7751..ac3f54cc9cbd 100644 --- a/models/demos/resnet/tests/test_perf_resnet.py +++ b/models/demos/resnet/tests/test_perf_resnet.py @@ -88,22 +88,19 @@ def run_perf_resnet( warm_end = warm_start + num_warm_iterations outputs = [] - inference_time_sum = 0 + profiler.start(f"run") for iter in range(warm_start, warm_end): - profiler.start(f"run") outputs.append(tt_resnet50(tt_inputs).cpu(blocking=False)) - profiler.end(f"run") - inference_time_sum += profiler.get("run") - tt_lib.device.DumpDeviceProfiler(device) - tt_lib.device.Synchronize(device) + profiler.end(f"run") + tt_lib.device.DumpDeviceProfiler(device) # enable_persistent_kernel_cache() first_iter_time = profiler.get(f"{0}_key") # ensuring inference time fluctuations is not noise - inference_time_avg = inference_time_sum / num_warm_iterations + inference_time_avg = profiler.get("run") / num_warm_iterations cpu_time = profiler.get(cpu_key) compile_time = first_iter_time - inference_time_avg @@ -152,3 +149,155 @@ def test_perf_bare_metal( hf_cat_image_sample_input, device, ) + + +def run_perf_resnet_trace( + batch_size, + expected_inference_time, + expected_compile_time, + hf_cat_image_sample_input, + device, +): + disable_persistent_kernel_cache() + if batch_size <= 2: + pytest.skip("Batch size 1 and 2 are not supported with sharded data") + first_key = f"first_iter_batchsize{batch_size}" + second_key = f"second_iter_batchsize{batch_size}" + cpu_key = f"ref_key_batchsize{batch_size}" + model_name = "microsoft/resnet-50" + + image = hf_cat_image_sample_input + image_processor = AutoImageProcessor.from_pretrained(model_name) + inputs = image_processor(image, return_tensors="pt") + + inputs = inputs["pixel_values"] + comments = f"{list(inputs.shape)[-2]}x{list(inputs.shape)[-1]}_batchsize{batch_size}" + + inputs1 = inputs + for i in range(batch_size - 1): + inputs = 
torch.cat((inputs, inputs1), dim=0) + + torch_resnet50 = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1) + torch_resnet50.eval() + + state_dict = torch_resnet50.state_dict() + sharded = False + if batch_size >= 8: + sharded = True + tt_resnet50 = ResNet( + Bottleneck, + [3, 4, 6, 3], + device=device, + state_dict=state_dict, + base_address="", + fold_batchnorm=True, + storage_in_dram=False, + batch_size=batch_size, + model_config=model_config, + sharded=sharded, + ) + + with torch.no_grad(): + profiler.start(cpu_key) + logits = torch_resnet50(inputs) + profiler.end(cpu_key) + + tt_inputs = tt_resnet50.preprocessing(inputs) + interleaved_mem_config_DRAM = tt_lib.tensor.MemoryConfig( + memory_layout=tt_lib.tensor.TensorMemoryLayout.INTERLEAVED, + buffer_type=tt_lib.tensor.BufferType.DRAM, + ) + tt_image_res = tt_inputs.to(device, interleaved_mem_config_DRAM) + # Compile + profiler.start(f"{0}_key") + tt_lib.tensor.write_tensor(tt_inputs, tt_image_res) + tt_resnet50(tt_image_res).cpu(blocking=True) + profiler.end(f"{0}_key") + tt_lib.device.DumpDeviceProfiler(device) + + # Capture + tid = tt_lib.device.BeginTraceCapture(device, 0, 1304576) + tt_output_res = tt_resnet50(tt_image_res) + tt_lib.device.EndTraceCapture(device, 0, tid) + tt_lib.device.DumpDeviceProfiler(device) + + warmup_end = 6 + for iter in range(1, warmup_end): + profiler.start(f"{iter}_key") + tt_lib.tensor.write_tensor(tt_inputs, tt_image_res) + tt_lib.device.ReplayTrace(device, 0, tid, False) + _ = tt_output_res.cpu(blocking=True) + profiler.end(f"{iter}_key") + tt_lib.device.DumpDeviceProfiler(device) + + num_warm_iterations = 15 + warm_start = warmup_end + warm_end = warm_start + num_warm_iterations + + outputs = [] + profiler.start(f"run") + for iter in range(warm_start, warm_end): + tt_lib.tensor.write_tensor(tt_inputs, tt_image_res) + tt_lib.device.ReplayTrace(device, 0, tid, False) + outputs.append(tt_output_res.cpu(blocking=False)) + tt_lib.device.Synchronize(device) + profiler.end(f"run") + tt_lib.device.DumpDeviceProfiler(device) + + # enable_persistent_kernel_cache() + + first_iter_time = profiler.get(f"{0}_key") + + # ensuring inference time fluctuations is not noise + inference_time_avg = profiler.get("run") / num_warm_iterations + + cpu_time = profiler.get(cpu_key) + compile_time = first_iter_time - inference_time_avg + prep_perf_report( + model_name=f"resnet50_trace_batch_size{batch_size}", + batch_size=batch_size, + inference_and_compile_time=first_iter_time, + inference_time=inference_time_avg, + expected_compile_time=expected_compile_time, + expected_inference_time=expected_inference_time, + comments=comments, + inference_time_cpu=cpu_time, + ) + + logger.info(f"resnet50 {comments} inference time (avg): {inference_time_avg}") + logger.info(f"resnet50 compile time: {compile_time}") + + tt_lib.device.ReleaseTrace(device, tid) + + assert inference_time_avg < expected_inference_time, f"resnet50 {comments} inference is too slow" + assert compile_time < expected_compile_time, f"resnet50 {comments} compilation is too slow" + + +@skip_for_wormhole_b0(reason_str="Not tested on single WH") +@pytest.mark.parametrize("device_l1_small_size", [32768], indirect=True) +@pytest.mark.models_performance_bare_metal +@pytest.mark.parametrize( + "batch_size, expected_inference_time, expected_compile_time", + ( + (16, 0.04, 25), + (20, 0.04, 25), + ), +) +def test_perf_trace_bare_metal( + device, + use_program_cache, + batch_size, + expected_inference_time, + expected_compile_time, + hf_cat_image_sample_input, +): + 
if is_e75(device): + pytest.skip("Resnet is not supported on E75") + + run_perf_resnet_trace( + batch_size, + expected_inference_time, + expected_compile_time, + hf_cat_image_sample_input, + device, + ) diff --git a/models/demos/resnet/tt/metalResnetBlock50.py b/models/demos/resnet/tt/metalResnetBlock50.py index 85849a5b502b..219b68433761 100644 --- a/models/demos/resnet/tt/metalResnetBlock50.py +++ b/models/demos/resnet/tt/metalResnetBlock50.py @@ -2127,6 +2127,20 @@ def forward(self, x: tt_lib.tensor) -> tt_lib.tensor: tt_lib.tensor.TensorMemoryLayout.HEIGHT_SHARDED, tt_lib.tensor.BufferType.L1, shard_spec ) x = x.to(self.device, mem_config) + else: + shard_spec = tt_lib.tensor.ShardSpec( + self.shard_grid, + [ + x.get_legacy_shape()[2] // self.first_conv_num_cores_nhw, + x.get_legacy_shape()[3], + ], + tt_lib.tensor.ShardOrientation.ROW_MAJOR, + False, + ) + mem_config = tt_lib.tensor.MemoryConfig( + tt_lib.tensor.TensorMemoryLayout.HEIGHT_SHARDED, tt_lib.tensor.BufferType.L1, shard_spec + ) + x = tt_lib.tensor.interleaved_to_sharded(x, mem_config) x = self.conv1(x) # Relu is fused with conv1 diff --git a/tests/scripts/nightly/run_gs_only.sh b/tests/scripts/nightly/run_gs_only.sh index 66462b516257..67d287ab0518 100755 --- a/tests/scripts/nightly/run_gs_only.sh +++ b/tests/scripts/nightly/run_gs_only.sh @@ -11,6 +11,6 @@ echo "Running model nightly tests for GS only" env pytest models/demos/metal_BERT_large_11/tests/test_demo.py -# why is this not in test_perf_device_resnet.py, also these parameters are specifically skipped inside the test -# env pytest models/demos/resnet/tests/test_metal_resnet50.py::test_run_resnet50_inference[HiFi4-activations_BFLOAT16-weights_BFLOAT16-batch_1] -# env pytest models/demos/resnet/tests/test_metal_resnet50.py::test_run_resnet50_inference[HiFi4-activations_BFLOAT16-weights_BFLOAT16-batch_2] +env pytest models/demos/resnet/tests/test_metal_resnet50.py::test_run_resnet50_inference[HiFi2-activations_BFLOAT8_B-weights_BFLOAT8_B-batch_20-24576] + +env pytest models/demos/resnet/tests/test_metal_resnet50.py::test_run_resnet50_trace_inference -k "HiFi2-activations_BFLOAT8_B-weights_BFLOAT8_B-batch_20-24576" diff --git a/tests/tt_eager/python_api_testing/trace_testing/misc/test_average_pool.py b/tests/tt_eager/python_api_testing/trace_testing/misc/test_average_pool.py index 53b1118d61f1..a8ec7b13742c 100644 --- a/tests/tt_eager/python_api_testing/trace_testing/misc/test_average_pool.py +++ b/tests/tt_eager/python_api_testing/trace_testing/misc/test_average_pool.py @@ -34,34 +34,54 @@ def shape_padded(shape): "BFLOAT16", ], ) -def test_run_average_pool(act_shape, dtype, device): +def test_run_average_pool(act_shape, dtype, device, use_program_cache): batch_size, _, _, channels = act_shape torch.manual_seed(0) - trace_captured = False + interleaved_mem_config_L1 = ttl.tensor.MemoryConfig( + memory_layout=ttl.tensor.TensorMemoryLayout.INTERLEAVED, + buffer_type=ttl.tensor.BufferType.L1, + ) + trace_loops = 10 + out_shape = [1] * len(act_shape) + out_shape[-1] = act_shape[-1] + out_shape_padded = shape_padded(out_shape) + act = torch.randn(act_shape, dtype=torch.bfloat16).float() ttact = ttl.tensor.Tensor(act, ttl.tensor.DataType.BFLOAT16) act_shape_padded = shape_padded(act_shape) if act_shape != act_shape_padded: ttact = ttact.pad_to_tile(0.0) - for iter in range(trace_loops): - ttact = ttact.to(device) + ttact_res = ttact.to(device) - if not trace_captured: - ttl.device.BeginTraceCapture(device) - out = ttl.tensor.average_pool_2d(ttact) - 
ttl.device.EndTraceCapture(device) - trace_captured = True - logger.info("Trace captured") + def run_ops(ttact_res): + return ttl.tensor.average_pool_2d(ttact_res) + + # Compile + run_ops(ttact_res) + # Trace + logger.info("Start Trace capture") + tid = ttl.device.BeginTraceCapture(device, 0, 11264) + out_res = run_ops(ttact_res) + ttl.device.EndTraceCapture(device, 0, tid) + logger.info("Trace captured") + + for iter in range(trace_loops): + act = torch.randn(act_shape, dtype=torch.bfloat16).float() + ttact_updated = ttl.tensor.Tensor(act, ttl.tensor.DataType.BFLOAT16) + act_shape_padded = shape_padded(act_shape) + if act_shape != act_shape_padded: + ttact_updated = ttact_updated.pad_to_tile(0.0) + ttl.tensor.write_tensor(ttact_updated, ttact_res) logger.info(f"Running iteration {iter}") - ttl.device.ExecuteLastTrace(device, True) + ttl.device.ReplayTrace(device, 0, tid, True) - out = out.cpu().to(ttl.tensor.Layout.ROW_MAJOR) + out = out_res.cpu().to(ttl.tensor.Layout.ROW_MAJOR) out_shape = [batch_size, 1, 1, channels] out_shape_padded = shape_padded(out_shape) if out_shape != out_shape_padded: @@ -76,10 +96,10 @@ def test_run_average_pool(act_shape, dtype, device): ## test for equivalance passing_pcc, output_pcc = comp_pcc(golden_pytorch, out_pytorch) - print(f"Passing PCC = {passing_pcc}") - print(f"Output PCC = {output_pcc}") + logger.debug(f"Passing PCC = {passing_pcc}") + logger.debug(f"Output PCC = {output_pcc}") assert passing_pcc # Done with the trace, can deallocate the buffers now. - ttl.device.ReleaseLastTrace(device) + ttl.device.ReleaseTrace(device, tid) diff --git a/tests/tt_eager/python_api_testing/trace_testing/misc/test_bert_ops.py b/tests/tt_eager/python_api_testing/trace_testing/misc/test_bert_ops.py index 987fff4d014e..4893e71dfa91 100644 --- a/tests/tt_eager/python_api_testing/trace_testing/misc/test_bert_ops.py +++ b/tests/tt_eager/python_api_testing/trace_testing/misc/test_bert_ops.py @@ -7,12 +7,8 @@ import math import tt_lib as ttl -from tests.tt_eager.python_api_testing.sweep_tests import ( - pytorch_ops, - tt_lib_ops, -) + from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import ( - comp_equal, comp_pcc, ) from models.utility_functions import is_wormhole_b0, is_grayskull, skip_for_wormhole_b0 @@ -40,12 +36,23 @@ ], ) def test_bert_linear( - device, fidelity, in0_sharded, out_sharded, in1_in_dram, M, K, N, activation, function_level_defaults + device, + fidelity, + in0_sharded, + out_sharded, + in1_in_dram, + M, + K, + N, + activation, + use_program_cache, + function_level_defaults, ): has_bias = False in0_shape = [1, 1, M, K] in1_shape = [1, 1, K, N] bias_shape = [1, 1, N] + out_shape = [1, 1, M, N] grid_size = (12, 8) # grid_size = (2, 2) shard_shape = [M // grid_size[0], K // grid_size[1]] # shard height, width @@ -90,14 +97,9 @@ def test_bert_linear( in1 = torch.randn(in1_shape).bfloat16().float() bias = torch.randn(bias_shape).bfloat16().float() - if in0_sharded: - in0_t = torch2tt_tensor( - in0, device, tt_memory_config=interleaved_mem_config_DRAM, tt_dtype=ttl.tensor.DataType.BFLOAT8_B - ) - else: - in0_t = torch2tt_tensor( - in0, device, tt_memory_config=interleaved_mem_config_L1, tt_dtype=ttl.tensor.DataType.BFLOAT8_B - ) + in0_t_res = torch2tt_tensor( + in0, device, tt_memory_config=interleaved_mem_config_DRAM, tt_dtype=ttl.tensor.DataType.BFLOAT8_B + ) if in1_in_dram: in1_t = torch2tt_tensor( @@ -109,233 +111,11 @@ def test_bert_linear( ) output_mem_config = sharded_mem_config if out_sharded else interleaved_mem_config_L1 - bias_t = 
pad_by_zero( - bias, device, tt_memory_config=interleaved_mem_config_L1, tt_dtype=ttl.tensor.DataType.BFLOAT8_B - )[0] - - trace_captured = False - trace_loops = 4 - - for iter in range(trace_loops): - if not trace_captured: - ttl.device.BeginTraceCapture(device) - - if in0_sharded: - in0_t = ttl.tensor.interleaved_to_sharded( - in0_t, - grid_size, - [M // grid_size[0], K // grid_size[1]], - ttl.tensor.TensorMemoryLayout.BLOCK_SHARDED, - ttl.tensor.ShardOrientation.COL_MAJOR, - ) - - program_config = ttl.operations.primary.MatmulMultiCoreReuseMultiCastProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=in0_block_w, - out_subblock_h=out_subblock_h, - out_subblock_w=out_subblock_w, - per_core_M=out_block_h, - per_core_N=out_block_w, - transpose_mcast=True, - # transpose_mcast=False, - fused_activation=activation, - ) - - compute_kernel_config = ttl.tensor.GrayskullComputeKernelConfig( - math_fidelity=fidelity, math_approx_mode=True - ) - - if has_bias: - output_t = ttl.operations.primary.matmul( - in0_t, - in1_t, - bias=bias_t, - program_config=program_config, - output_mem_config=output_mem_config, - compute_kernel_config=compute_kernel_config, - ) - else: - output_t = ttl.operations.primary.matmul( - in0_t, - in1_t, - program_config=program_config, - output_mem_config=output_mem_config, - compute_kernel_config=compute_kernel_config, - ) - - if out_sharded: - output_t = ttl.tensor.sharded_to_interleaved(output_t, interleaved_mem_config_L1) - - ttl.device.EndTraceCapture(device) - trace_captured = True - logger.info("Trace captured") - - logger.info(f"Running iteration {iter}") - ttl.device.ExecuteLastTrace(device, True) - - pt_out = in0 @ in1 - - if has_bias: - pt_out = pt_out + bias - - if activation != None: - pt_out = torch.nn.functional.gelu(pt_out) - tt_out = tt2torch_tensor(output_t) - - passing, output = comp_pcc(pt_out, tt_out) - logger.info(output) - assert passing - ttl.device.ReleaseLastTrace(device) - - -@pytest.mark.skipif(is_grayskull(), reason="GS does not support fp32") -@pytest.mark.parametrize("packer_l1_acc", [True, False], ids=["pack_l1", "no_pack_l1"]) -@pytest.mark.parametrize("fp32_acc_mode", [True, False], ids=["fp32", "no_fp32"]) -@pytest.mark.parametrize( - "fidelity", - [ - ttl.tensor.MathFidelity.LoFi, - ], - ids=["LoFi"], -) -@pytest.mark.parametrize("has_bias", [True, False], ids=["bias", "no_bias"]) -@pytest.mark.parametrize( - "in1_in_dram, out_sharded, in0_sharded, M, K, N, activation", - [ - # # in1-L1-fusedQKV - (False, True, True, 2688, 1024, 3072, None), # both sharded - (False, True, False, 2688, 1024, 3072, None), # out sharded, in0 interleaved - (False, False, True, 2688, 1024, 3072, None), # out interleaved, in0 sharded - (False, False, False, 2688, 1024, 3072, None), # out interleaved, in0 interleaved - # # # # in1-dram-fusedQKV - (True, True, True, 2688, 1024, 3072, None), - (True, True, False, 2688, 1024, 3072, None), - (True, False, True, 2688, 1024, 3072, None), - (True, False, False, 2688, 1024, 3072, None), - # # # # in1-L1-selfout - (False, True, True, 2688, 1024, 1024, None), - (False, True, False, 2688, 1024, 1024, None), - (False, False, True, 2688, 1024, 1024, None), - (False, False, False, 2688, 1024, 1024, None), - # # # # in1-dram-selfout - (True, True, True, 2688, 1024, 1024, None), - (True, True, False, 2688, 1024, 1024, None), - (True, False, True, 2688, 1024, 1024, None), - (True, False, False, 2688, 1024, 1024, None), - # # # # in1-L1-ff1 - (False, True, True, 2688, 1024, 4096, 
(ttl.tensor.FusibleActivation.GELU, True)), - (False, True, False, 2688, 1024, 4096, (ttl.tensor.FusibleActivation.GELU, True)), - (False, False, True, 2688, 1024, 4096, (ttl.tensor.FusibleActivation.GELU, True)), - (False, False, False, 2688, 1024, 4096, (ttl.tensor.FusibleActivation.GELU, True)), - # # # # in1-dram-ff1 - (True, True, True, 2688, 1024, 4096, (ttl.tensor.FusibleActivation.GELU, True)), - (True, True, False, 2688, 1024, 4096, (ttl.tensor.FusibleActivation.GELU, True)), - (True, False, True, 2688, 1024, 4096, (ttl.tensor.FusibleActivation.GELU, True)), - (True, False, False, 2688, 1024, 4096, (ttl.tensor.FusibleActivation.GELU, True)), - # # # # # in1-L1-ff1 - no Gelu - (False, True, True, 2688, 1024, 4096, None), - (False, True, False, 2688, 1024, 4096, None), - (False, False, True, 2688, 1024, 4096, None), - (False, False, False, 2688, 1024, 4096, None), - # # # # in1-dram-ff1 - no Gelu - (True, True, True, 2688, 1024, 4096, None), - (True, True, False, 2688, 1024, 4096, None), - (True, False, True, 2688, 1024, 4096, None), - (True, False, False, 2688, 1024, 4096, None), - # # # # in1-L1-ff2 - (False, True, True, 2688, 4096, 1024, None), - (False, True, False, 2688, 4096, 1024, None), - (False, False, True, 2688, 4096, 1024, None), - (False, False, False, 2688, 4096, 1024, None), - # # # # in1-dram-ff2 - (True, True, True, 2688, 4096, 1024, None), - (True, True, False, 2688, 4096, 1024, None), - (True, False, True, 2688, 4096, 1024, None), - (True, False, False, 2688, 4096, 1024, None), - ], -) -@skip_for_wormhole_b0("WH ND hang, see issue #4392") -def test_bert_linear_batch7( - device, - fidelity, - in0_sharded, - out_sharded, - in1_in_dram, - has_bias, - fp32_acc_mode, - packer_l1_acc, - M, - K, - N, - activation, - function_level_defaults, -): - in0_shape = [1, 1, M, K] - in1_shape = [1, 1, K, N] - bias_shape = [1, 1, N] - grid_size = (8, 7) - - in0_block_h = M // grid_size[1] // 32 - in0_block_w = K // grid_size[0] // 32 - out_block_h = M // grid_size[1] // 32 - out_block_w = N // grid_size[0] // 32 - if fp32_acc_mode == True: - out_subblock_w = 4 - out_subblock_h = 1 - else: - if out_block_w <= 8: - out_subblock_w = out_block_w - out_subblock_h = 8 // out_subblock_w - else: - out_subblock_h = 1 - out_subblock_w = 8 // out_subblock_h - while out_block_w % out_subblock_w != 0: - out_subblock_w = out_block_w // 2 - - logger.debug("in0 block w h " + str(in0_block_w * 32) + " " + str(in0_block_h * 32)) - logger.debug("in1 block w h " + str(out_block_w * 32) + " " + str(in0_block_w * 32)) - logger.debug("out block w h " + str(out_block_w * 32) + " " + str(out_block_h * 32)) - logger.debug("out subblock w h " + str(out_subblock_w * 32) + " " + str(out_subblock_h * 32)) - - interleaved_mem_config_L1 = ttl.tensor.MemoryConfig( - memory_layout=ttl.tensor.TensorMemoryLayout.INTERLEAVED, - buffer_type=ttl.tensor.BufferType.L1, - ) - interleaved_mem_config_DRAM = ttl.tensor.MemoryConfig( - memory_layout=ttl.tensor.TensorMemoryLayout.INTERLEAVED, - buffer_type=ttl.tensor.BufferType.DRAM, - ) - sharded_mem_config = ttl.tensor.MemoryConfig( - memory_layout=ttl.tensor.TensorMemoryLayout.BLOCK_SHARDED, - buffer_type=ttl.tensor.BufferType.L1, - ) - - in0 = torch.randn(in0_shape).bfloat16().float() - in1 = torch.randn(in1_shape).bfloat16().float() - bias = torch.randn(bias_shape).bfloat16().float() - - in0_t = torch2tt_tensor( - in0, device, tt_memory_config=interleaved_mem_config_DRAM, tt_dtype=ttl.tensor.DataType.BFLOAT8_B - ) - in1_t = torch2tt_tensor( - in1, device, 
tt_memory_config=interleaved_mem_config_DRAM, tt_dtype=ttl.tensor.DataType.BFLOAT8_B - ) - - output_mem_config = sharded_mem_config if out_sharded else interleaved_mem_config_L1 bias_t = pad_by_zero( bias, device, tt_memory_config=interleaved_mem_config_L1, tt_dtype=ttl.tensor.DataType.BFLOAT8_B )[0] - if in0_sharded: - in0_t = ttl.tensor.interleaved_to_sharded( - in0_t, - grid_size, - [M // grid_size[1], K // grid_size[0]], - ttl.tensor.TensorMemoryLayout.BLOCK_SHARDED, - ttl.tensor.ShardOrientation.ROW_MAJOR, - ) - program_config = ttl.operations.primary.MatmulMultiCoreReuseMultiCastProgramConfig( compute_with_storage_grid_size=grid_size, in0_block_w=in0_block_w, @@ -343,411 +123,79 @@ def test_bert_linear_batch7( out_subblock_w=out_subblock_w, per_core_M=out_block_h, per_core_N=out_block_w, - transpose_mcast=False, + transpose_mcast=True, + # transpose_mcast=False, fused_activation=activation, ) - compute_kernel_config = ttl.tensor.WormholeComputeKernelConfig( - math_fidelity=fidelity, - math_approx_mode=True, - fp32_dest_acc_en=fp32_acc_mode, - packer_l1_acc=packer_l1_acc, - ) - - if has_bias: - output_t = ttl.operations.primary.matmul( - in0_t, - in1_t, - bias=bias_t, - program_config=program_config, - output_mem_config=output_mem_config, - compute_kernel_config=compute_kernel_config, - ) - else: - output_t = ttl.operations.primary.matmul( - in0_t, - in1_t, - program_config=program_config, - output_mem_config=output_mem_config, - compute_kernel_config=compute_kernel_config, - ) - - if out_sharded: - output_t = ttl.tensor.sharded_to_interleaved(output_t, interleaved_mem_config_L1) - - pt_out = in0 @ in1 - - if has_bias: - pt_out = pt_out + bias - - if activation != None: - pt_out = torch.nn.functional.gelu(pt_out) - tt_out = tt2torch_tensor(output_t) + compute_kernel_config = ttl.tensor.GrayskullComputeKernelConfig(math_fidelity=fidelity, math_approx_mode=True) - passing, output = comp_pcc(pt_out, tt_out) - logger.info(output) - assert passing - - -def run_bert_linear_batch4( - device, - in0_sharded, - out_sharded, - in1_in_dram, - M, - K, - N, - fidelity, - has_bias, - activation, - packer_l1_acc, - fp32_acc_mode, - function_level_defaults, -): - in0_shape = [1, 1, M, K] - in1_shape = [1, 1, K, N] - bias_shape = [1, 1, N] - grid_size = (8, 4) - - in0_block_h = M // grid_size[1] // 32 - in0_block_w = K // grid_size[0] // 32 - out_block_h = M // grid_size[1] // 32 - out_block_w = N // grid_size[0] // 32 + trace_loops = 4 - if fp32_acc_mode == True: - out_subblock_w = 4 - out_subblock_h = 1 - else: - if out_block_w <= 8: - out_subblock_w = out_block_w - out_subblock_h = 8 // out_subblock_w + def run_ops(in0_t_res): + if in0_sharded: + in0_t = ttl.tensor.interleaved_to_sharded( + in0_t_res, + grid_size, + [M // grid_size[0], K // grid_size[1]], + ttl.tensor.TensorMemoryLayout.BLOCK_SHARDED, + ttl.tensor.ShardOrientation.COL_MAJOR, + ) else: - out_subblock_h = 1 - out_subblock_w = 8 // out_subblock_h - while out_block_w % out_subblock_w != 0: - out_subblock_w = out_block_w // 2 - - logger.debug("in0 block w h " + str(in0_block_w * 32) + " " + str(in0_block_h * 32)) - logger.debug("in1 block w h " + str(out_block_w * 32) + " " + str(in0_block_w * 32)) - logger.debug("out block w h " + str(out_block_w * 32) + " " + str(out_block_h * 32)) - logger.debug("out subblock w h " + str(out_subblock_w * 32) + " " + str(out_subblock_h * 32)) - - interleaved_mem_config_L1 = ttl.tensor.MemoryConfig( - memory_layout=ttl.tensor.TensorMemoryLayout.INTERLEAVED, - buffer_type=ttl.tensor.BufferType.L1, - ) - 
interleaved_mem_config_DRAM = ttl.tensor.MemoryConfig( - memory_layout=ttl.tensor.TensorMemoryLayout.INTERLEAVED, - buffer_type=ttl.tensor.BufferType.DRAM, - ) - sharded_mem_config = ttl.tensor.MemoryConfig( - memory_layout=ttl.tensor.TensorMemoryLayout.BLOCK_SHARDED, - buffer_type=ttl.tensor.BufferType.L1, - ) - - in0 = torch.randn(in0_shape).bfloat16().float() - in1 = torch.randn(in1_shape).bfloat16().float() - bias = torch.randn(bias_shape).bfloat16().float() - - in0_t = torch2tt_tensor( - in0, device, tt_memory_config=interleaved_mem_config_DRAM, tt_dtype=ttl.tensor.DataType.BFLOAT8_B - ) - in1_t = torch2tt_tensor( - in1, device, tt_memory_config=interleaved_mem_config_DRAM, tt_dtype=ttl.tensor.DataType.BFLOAT8_B - ) - - output_mem_config = sharded_mem_config if out_sharded else interleaved_mem_config_L1 - bias_t = pad_by_zero( - bias, device, tt_memory_config=interleaved_mem_config_L1, tt_dtype=ttl.tensor.DataType.BFLOAT8_B - )[0] - - if in0_sharded: - in0_t = ttl.tensor.interleaved_to_sharded( - in0_t, - grid_size, - [M // grid_size[1], K // grid_size[0]], - ttl.tensor.TensorMemoryLayout.BLOCK_SHARDED, - ttl.tensor.ShardOrientation.ROW_MAJOR, - ) - - program_config = ttl.operations.primary.MatmulMultiCoreReuseMultiCastProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=in0_block_w, - out_subblock_h=out_subblock_h, - out_subblock_w=out_subblock_w, - per_core_M=out_block_h, - per_core_N=out_block_w, - transpose_mcast=False, - fused_activation=activation, - ) - - compute_kernel_config = ttl.tensor.WormholeComputeKernelConfig( - math_fidelity=fidelity, - math_approx_mode=True, - fp32_dest_acc_en=fp32_acc_mode, - packer_l1_acc=packer_l1_acc, - ) - - if has_bias: - output_t = ttl.operations.primary.matmul( - in0_t, - in1_t, - bias=bias_t, - program_config=program_config, - output_mem_config=output_mem_config, - compute_kernel_config=compute_kernel_config, - ) - else: - output_t = ttl.operations.primary.matmul( - in0_t, - in1_t, - program_config=program_config, - output_mem_config=output_mem_config, - compute_kernel_config=compute_kernel_config, - ) - - if out_sharded: - output_t = ttl.tensor.sharded_to_interleaved(output_t, interleaved_mem_config_L1) - - pt_out = in0 @ in1 - - if has_bias: - pt_out = pt_out + bias - - if activation != None: - pt_out = torch.nn.functional.gelu(pt_out) - tt_out = tt2torch_tensor(output_t) + in0_t = ttl.tensor.clone(in0_t_res, interleaved_mem_config_L1) - passing, output = comp_pcc(pt_out, tt_out) - logger.info(output) - assert passing - - -def not_fit_l1(M, K, N, fp32): - return (M * K + K * N > 5000000) and (fp32 == True) - - -@pytest.mark.skipif(is_grayskull(), reason="GS does not support fp32") -@pytest.mark.parametrize("packer_l1_acc", [True, False], ids=["pack_l1", "no_pack_l1"]) -@pytest.mark.parametrize("fp32_acc_mode", [True, False], ids=["fp32", "no_fp32"]) -@pytest.mark.parametrize( - "fidelity", - [ - ttl.tensor.MathFidelity.LoFi, - ], - ids=["LoFi"], -) -@pytest.mark.parametrize("has_bias", [True, False], ids=["bias", "no_bias"]) -@pytest.mark.parametrize( - "in1_in_dram, out_sharded, in0_sharded, M, K, N, activation", - [ - # in1-L1-fusedQKV - (False, True, True, 1536, 1024, 3072, None), # both sharded - # in1-dram-fusedQKV - (True, True, True, 1536, 1024, 3072, None), - # in1-L1-selfout - (False, True, True, 1536, 1024, 1024, None), - # in1-dram-selfout - (True, True, True, 1536, 1024, 1024, None), - # in1-L1-ff1 - (False, True, True, 1536, 1024, 4096, (ttl.tensor.FusibleActivation.GELU, True)), - # in1-dram-ff1 - (True, 
True, True, 1536, 1024, 4096, (ttl.tensor.FusibleActivation.GELU, True)), - # in1-L1-ff1 - no Gelu - (False, True, True, 1536, 1024, 4096, None), - # in1-dram-ff1 - no Gelu - (True, True, True, 1536, 1024, 4096, None), - # in1-L1-ff2 - (False, True, True, 1536, 4096, 1024, None), - # in1-dram-ff2 - (True, True, True, 1536, 4096, 1024, None), - ], -) -def test_bert_linear_batch4( - device, - in0_sharded, - out_sharded, - in1_in_dram, - M, - K, - N, - fidelity, - has_bias, - activation, - packer_l1_acc, - fp32_acc_mode, - function_level_defaults, -): - for i in range(1): - logger.info(i) - if not not_fit_l1(M, K, N, fp32_acc_mode): - run_bert_linear_batch4( - device, - in0_sharded, - out_sharded, - in1_in_dram, - M, - K, - N, - fidelity, - has_bias, - activation, - packer_l1_acc, - fp32_acc_mode, - function_level_defaults, + if has_bias: + output_t = ttl.operations.primary.matmul( + in0_t, + in1_t, + bias=bias_t, + program_config=program_config, + output_mem_config=output_mem_config, + compute_kernel_config=compute_kernel_config, ) else: - logger.warning("L1 cannot fit large tensors in fp32 mode") - - -@pytest.mark.skipif(is_grayskull(), reason="not tested for GS") -@pytest.mark.parametrize("packer_l1_acc", [True, False], ids=["pack_l1", "no_pack_l1"]) -@pytest.mark.parametrize( - "fp32_acc_mode", - [ - True, - ], - ids=["fp32"], -) -@pytest.mark.parametrize( - "fidelity", - [ - ttl.tensor.MathFidelity.LoFi, - ttl.tensor.MathFidelity.HiFi4, - ], - ids=["LoFi", "HiFi4"], -) -@pytest.mark.parametrize("has_bias", [True, False], ids=["bias", "no_bias"]) -@pytest.mark.parametrize( - "M, K, N, activation", - [ - # not sharded due to L1 size - # small tensor test - (128, 256, 256, None), - (256, 256, 256, None), - (256, 512, 512, None), - # in1-dram-fusedQKV - (1536, 1024, 3072, None), - # in1-dram-selfout - (1536, 1024, 1024, None), - # in1-dram-ff1 - (1536, 1024, 4096, (ttl.tensor.FusibleActivation.GELU, True)), - # in1-dram-ff1 - no Gelu - (1536, 1024, 4096, None), - # in1-dram-ff2 - (1536, 4096, 1024, None), - ], -) -def test_bert_linear_batch4_fp32_input_output( - device, - fidelity, - has_bias, - fp32_acc_mode, - packer_l1_acc, - M, - K, - N, - activation, - function_level_defaults, -): - in0_shape = [1, 1, M, K] - in1_shape = [1, 1, K, N] - bias_shape = [1, 1, N] - grid_size = (8, 4) - - in0_block_h = M // grid_size[1] // 32 - in0_block_w = K // grid_size[0] // 32 - out_block_h = M // grid_size[1] // 32 - out_block_w = N // grid_size[0] // 32 - - # full block too large to fit in L1 - if in0_block_h * in0_block_w >= 48 or in0_block_w * out_block_w >= 48: - in0_block_w = in0_block_w // 2 - - if out_block_w < 4: - out_subblock_w = out_block_w - out_subblock_h = out_block_h // out_subblock_w - else: - out_subblock_w = 4 - out_subblock_h = 1 - - logger.debug("in0 block w h " + str(in0_block_w * 32) + " " + str(in0_block_h * 32)) - logger.debug("in1 block w h " + str(out_block_w * 32) + " " + str(in0_block_w * 32)) - logger.debug("out block w h " + str(out_block_w * 32) + " " + str(out_block_h * 32)) - logger.debug("out subblock w h " + str(out_subblock_w * 32) + " " + str(out_subblock_h * 32)) - - interleaved_mem_config_L1 = ttl.tensor.MemoryConfig( - memory_layout=ttl.tensor.TensorMemoryLayout.INTERLEAVED, - buffer_type=ttl.tensor.BufferType.L1, - ) - interleaved_mem_config_DRAM = ttl.tensor.MemoryConfig( - memory_layout=ttl.tensor.TensorMemoryLayout.INTERLEAVED, - buffer_type=ttl.tensor.BufferType.DRAM, - ) - sharded_mem_config = ttl.tensor.MemoryConfig( - 
memory_layout=ttl.tensor.TensorMemoryLayout.BLOCK_SHARDED, - buffer_type=ttl.tensor.BufferType.L1, - ) - - in0 = torch.rand(in0_shape).float() * 1000.0 - in1 = torch.rand(in1_shape).float() * 1000.0 - bias = torch.rand(bias_shape).float() * 1000.0 - - in0_t = torch2tt_tensor( - in0, device, tt_memory_config=interleaved_mem_config_DRAM, tt_dtype=ttl.tensor.DataType.FLOAT32 - ) - in1_t = torch2tt_tensor( - in1, device, tt_memory_config=interleaved_mem_config_DRAM, tt_dtype=ttl.tensor.DataType.FLOAT32 - ) + output_t = ttl.operations.primary.matmul( + in0_t, + in1_t, + program_config=program_config, + output_mem_config=output_mem_config, + compute_kernel_config=compute_kernel_config, + ) + if out_sharded: + output_t = ttl.tensor.sharded_to_interleaved(output_t, interleaved_mem_config_L1) + return output_t + + # Compile + run_ops(in0_t_res) + # Capture + logger.info("Start Trace capture") + tid = ttl.device.BeginTraceCapture(device, 0, 34816) + output_t_res = run_ops(in0_t_res) + ttl.device.EndTraceCapture(device, 0, tid) + logger.info("Trace captured") - output_mem_config = interleaved_mem_config_DRAM - bias_t = pad_by_zero( - bias, device, tt_memory_config=interleaved_mem_config_DRAM, tt_dtype=ttl.tensor.DataType.FLOAT32 - )[0] + for iter in range(trace_loops): + in0 = torch.randn(in0_shape).bfloat16().float() + in0_t_updated = torch2tt_tensor( + in0, None, tt_memory_config=interleaved_mem_config_DRAM, tt_dtype=ttl.tensor.DataType.BFLOAT8_B + ) + ttl.tensor.write_tensor(in0_t_updated, in0_t_res) + logger.info(f"Running iteration {iter}") + ttl.device.ReplayTrace(device, 0, tid, True) - program_config = ttl.operations.primary.MatmulMultiCoreReuseMultiCastProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=in0_block_w, - out_subblock_h=out_subblock_h, - out_subblock_w=out_subblock_w, - per_core_M=out_block_h, - per_core_N=out_block_w, - transpose_mcast=False, - fused_activation=activation, - ) + pt_out = in0 @ in1 - compute_kernel_config = ttl.tensor.WormholeComputeKernelConfig( - math_fidelity=fidelity, - math_approx_mode=True, - fp32_dest_acc_en=fp32_acc_mode, - packer_l1_acc=packer_l1_acc, - ) + if has_bias: + pt_out = pt_out + bias - if has_bias: - output_t = ttl.operations.primary.matmul( - in0_t, - in1_t, - bias=bias_t, - program_config=program_config, - output_mem_config=output_mem_config, - compute_kernel_config=compute_kernel_config, - ) - else: - output_t = ttl.operations.primary.matmul( - in0_t, - in1_t, - program_config=program_config, - output_mem_config=output_mem_config, - compute_kernel_config=compute_kernel_config, - ) + if activation != None: + pt_out = torch.nn.functional.gelu(pt_out) + tt_out = tt2torch_tensor(output_t_res) - pt_out = in0 @ in1 - if has_bias: - pt_out = pt_out + bias - if activation != None: - pt_out = torch.nn.functional.gelu(pt_out) - tt_out = tt2torch_tensor(output_t) + passing, output = comp_pcc(pt_out, tt_out) + logger.info(output) + assert passing + ttl.device.ReleaseLastTrace(device) - passing, output = comp_pcc(pt_out, tt_out) - logger.info(output) - assert passing + # Done with the trace, can deallocate the buffers now. 
+    ttl.device.ReleaseTrace(device, tid)
diff --git a/tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp
index 5cb65e328f0c..57c15a10a277 100644
--- a/tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp
+++ b/tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp
@@ -14,20 +14,21 @@ class CommonFixture: public ::testing::Test {
   public:
     // A function to run a program, according to which dispatch mode is set.
     void RunProgram(tt::tt_metal::Device* device, Program& program) {
-        static std::unordered_set<uint64_t> trace_caputured;
+        static std::unordered_map<uint64_t, uint32_t> trace_captured;
         uint64_t program_id = program.get_id();
         if (this->slow_dispatch_) {
             tt::tt_metal::detail::LaunchProgram(device, program);
         } else if (this->metal_trace_) {
-            if (trace_caputured.find(program_id) == trace_caputured.end()) {
-                tt::tt_metal::detail::BeginTraceCapture(device);
-                EnqueueProgram(device->command_queue(), program, false);
-                tt::tt_metal::detail::EndTraceCapture(device);
-                trace_caputured.insert(program_id);
+            CommandQueue& cq = device->command_queue();
+            if (trace_captured.find(program_id) == trace_captured.end()) {
+                uint32_t tid = tt::tt_metal::BeginTraceCapture(device, cq.id(), 2048);
+                EnqueueProgram(cq, program, false);
+                tt::tt_metal::EndTraceCapture(device, cq.id(), tid);
+                trace_captured[program_id] = tid;
             }
             log_debug(tt::LogTest, "Executing trace for program {}", program_id);
-            tt::tt_metal::detail::ExecuteLastTrace(device, false);
-            Finish(device->command_queue());
+            tt::tt_metal::ReplayTrace(device, cq.id(), trace_captured[program_id], false);
+            Finish(cq);
         } else {
             CommandQueue& cq = device->command_queue();
             EnqueueProgram(cq, program, false);
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp
index 38bd2d9bb90c..c68b56bf44fb 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp
@@ -91,83 +91,6 @@ constexpr bool kBlocking = true;
 constexpr bool kNonBlocking = false;
 vector<bool> blocking_flags = {kBlocking, kNonBlocking};

-TEST_F(CommandQueueFixture, TraceInstanceManagement) {
-    CommandQueue& cq = this->device_->command_queue();
-    vector<uint32_t> trace_size = {32*1024, 32};
-    vector<uint32_t> page_size = {HostMemDeviceCommand::PROGRAM_PAGE_SIZE, 32};
-    vector<uint32_t> buf_size_per_bank;
-
-    for (int i=0; i<trace_size.size(); i++) {
-        int banks = cq.device()->num_banks(BufferType::DRAM);
-        int pages = trace_size.at(i) / page_size.at(i);
-        int pages_per_bank = pages / banks + (pages % banks ? 
1 : 0); - buf_size_per_bank.push_back(pages_per_bank * page_size.at(i)); - } - - auto mem_idle = cq.device()->get_memory_allocation_statistics(BufferType::DRAM); - log_debug(LogTest, "DRAM usage before trace buffer allocation: {}, {}, {}", - mem_idle.total_allocatable_size_bytes, - mem_idle.total_free_bytes, - mem_idle.total_allocated_bytes); - - // Add instances scope, trace buffers go out of scope yet remain cached in memory - { - TraceBuffer trace_buffer0 = {{}, std::make_shared( - cq.device(), trace_size.at(0), page_size.at(0), BufferType::DRAM, TensorMemoryLayout::INTERLEAVED)}; - TraceBuffer trace_buffer1 = {{}, std::make_shared( - cq.device(), trace_size.at(1), page_size.at(1), BufferType::DRAM, TensorMemoryLayout::INTERLEAVED)}; - auto mem_multi_trace = cq.device()->get_memory_allocation_statistics(BufferType::DRAM); - log_debug( - LogTest, - "DRAM usage post trace buffer allocation: {}, {}, {}", - mem_multi_trace.total_allocatable_size_bytes, - mem_multi_trace.total_free_bytes, - mem_multi_trace.total_allocated_bytes); - - // Cache the trace buffer in memory via instance pinning calls - Trace::add_instance(0, trace_buffer0); - Trace::add_instance(1, trace_buffer1); - } - - // Some user interaction with traces, unimportant... check that traces are still cached - auto mem_multi_trace = cq.device()->get_memory_allocation_statistics(BufferType::DRAM); - EXPECT_EQ(mem_idle.total_allocated_bytes, mem_multi_trace.total_allocated_bytes - buf_size_per_bank.at(0) - buf_size_per_bank.at(1)); - EXPECT_EQ(mem_idle.total_free_bytes, mem_multi_trace.total_free_bytes + buf_size_per_bank.at(0) + buf_size_per_bank.at(1)); - - // Release instances scope, trace buffers remain cached in memory until released by user - { - ReleaseTrace(1); - auto mem_release_one = cq.device()->get_memory_allocation_statistics(BufferType::DRAM); - EXPECT_EQ(mem_idle.total_allocated_bytes, mem_release_one.total_allocated_bytes - buf_size_per_bank.at(0)); - EXPECT_EQ(mem_idle.total_free_bytes, mem_release_one.total_free_bytes + buf_size_per_bank.at(0)); - - ReleaseTrace(0); - auto mem_release_two = cq.device()->get_memory_allocation_statistics(BufferType::DRAM); - EXPECT_EQ(mem_idle.total_allocatable_size_bytes, mem_release_two.total_allocatable_size_bytes); - EXPECT_EQ(mem_idle.total_free_bytes, mem_release_two.total_free_bytes); - EXPECT_EQ(mem_idle.total_allocated_bytes, mem_release_two.total_allocated_bytes); - } - - // Add instances scope, trace buffers go out of scope yet remain cached in memory - { - TraceBuffer trace_buffer0 = {{}, std::make_shared( - cq.device(), trace_size.at(0), page_size.at(0), BufferType::DRAM, TensorMemoryLayout::INTERLEAVED)}; - TraceBuffer trace_buffer1 = {{}, std::make_shared( - cq.device(), trace_size.at(1), page_size.at(1), BufferType::DRAM, TensorMemoryLayout::INTERLEAVED)}; - auto mem_multi_trace = cq.device()->get_memory_allocation_statistics(BufferType::DRAM); - - // Cache the trace buffer in memory via instance pinning calls - Trace::add_instance(0, trace_buffer0); - Trace::add_instance(1, trace_buffer1); - } - - ReleaseTrace(-1); - auto mem_release_all = cq.device()->get_memory_allocation_statistics(BufferType::DRAM); - EXPECT_EQ(mem_idle.total_allocatable_size_bytes, mem_release_all.total_allocatable_size_bytes); - EXPECT_EQ(mem_idle.total_free_bytes, mem_release_all.total_free_bytes); - EXPECT_EQ(mem_idle.total_allocated_bytes, mem_release_all.total_allocated_bytes); -} - TEST_F(CommandQueueFixture, InstantiateTraceSanity) { CommandQueue& command_queue = 
this->device_->command_queue(); @@ -176,112 +99,27 @@ TEST_F(CommandQueueFixture, InstantiateTraceSanity) { for (uint32_t i = 0; i < input_data.size(); i++) { input_data[i] = i; } - - // Capture trace on a trace queue - Trace trace; - BeginTrace(trace); - EnqueueWriteBuffer(trace.queue(), input, input_data.data(), kNonBlocking); - EnqueueWriteBuffer(trace.queue(), input, input_data.data(), kNonBlocking); - EndTrace(trace); + Buffer output(this->device_, 2048, 2048, BufferType::DRAM); + auto simple_program = std::make_shared(create_simple_unary_program(input, output)); + EnqueueProgram(command_queue, simple_program, true); + uint32_t tid = BeginTraceCapture(this->device_, command_queue.id(), 1280); + EnqueueProgram(command_queue, simple_program, kNonBlocking); + EndTraceCapture(this->device_, command_queue.id(), tid); // Instantiate a trace on a device bound command queue - uint32_t trace_id = InstantiateTrace(trace, command_queue); - auto trace_inst = Trace::get_instance(trace_id); + auto trace_inst = this->device_->get_trace(tid); vector data_fd, data_bd; // Backdoor read the trace buffer - ::detail::ReadFromBuffer(trace_inst.buffer, data_bd); + ::detail::ReadFromBuffer(trace_inst->buffer, data_bd); // Frontdoor reaad the trace buffer - data_fd.resize(trace_inst.buffer->size() / sizeof(uint32_t)); - EnqueueReadBuffer(command_queue, trace_inst.buffer, data_fd.data(), kBlocking); + data_fd.resize(trace_inst->buffer->size() / sizeof(uint32_t)); + EnqueueReadBuffer(command_queue, trace_inst->buffer, data_fd.data(), kBlocking); EXPECT_EQ(data_fd, data_bd); - // Check for content correctness in the trace buffer - // The following commands are expected based on the trace capture - CQPrefetchCmd* p_cmd; - CQDispatchCmd* d_cmd; - size_t p_size = (sizeof(CQPrefetchCmd) / sizeof(uint32_t)); - size_t d_size = (sizeof(CQDispatchCmd) / sizeof(uint32_t)); - size_t offset = 0; - p_cmd = (CQPrefetchCmd*)(data_fd.data() + offset); - offset += p_size; - EXPECT_EQ(p_cmd->base.cmd_id, CQ_PREFETCH_CMD_RELAY_INLINE); - - d_cmd = (CQDispatchCmd*)(data_fd.data() + offset); - offset += d_size; - EXPECT_EQ(d_cmd->base.cmd_id, CQ_DISPATCH_CMD_WAIT); - - p_cmd = (CQPrefetchCmd*)(data_fd.data() + offset); - offset += p_size; - EXPECT_EQ(p_cmd->base.cmd_id, CQ_PREFETCH_CMD_RELAY_INLINE); - - d_cmd = (CQDispatchCmd*)(data_fd.data() + offset); - offset += d_size; - EXPECT_EQ(d_cmd->base.cmd_id, CQ_DISPATCH_CMD_WRITE_PAGED); - EXPECT_EQ(d_cmd->write_paged.is_dram, true); - EXPECT_EQ(d_cmd->write_paged.page_size, 2048); - log_trace(LogTest, "Trace buffer content: {}", data_fd); - ReleaseTrace(trace_id); -} - -TEST_F(CommandQueueFixture, EnqueueTraceWriteBufferCommand) { - CommandQueue& command_queue = this->device_->command_queue(); - - Buffer input(this->device_, 2048, 2048, BufferType::DRAM); - vector input_first(input.size() / sizeof(uint32_t), 0xfaceface); - vector input_last(input.size() / sizeof(uint32_t), 0); - for (uint32_t i = 0; i < input_last.size(); i++) { - input_last[i] = i; - } - - // TRACE CAPTURE & INSTANTIATE MODE - // Capture trace on a trace queue - Trace trace; - BeginTrace(trace); - EnqueueWriteBuffer(trace.queue(), input, input_first.data(), kNonBlocking); - EnqueueWriteBuffer(trace.queue(), input, input_last.data(), kNonBlocking); - EndTrace(trace); - - // Instantiate a trace on a device bound command queue - uint32_t trace_id = InstantiateTrace(trace, command_queue); - - // Repeat traces, check that last write occurs correctly during each iteration - vector readback(input.size() / sizeof(uint32_t), 
0); - for (int i = 0; i < 10; i++) { - EnqueueTrace(command_queue, trace_id, true); - EnqueueReadBuffer(command_queue, input, readback.data(), kBlocking); - EXPECT_EQ(input_last, readback); - } - - ReleaseTrace(trace_id); -} - -TEST_F(CommandQueueFixture, EnqueueTraceWriteBufferCommandViaDevice) { - CommandQueue& command_queue = this->device_->command_queue(); - - Buffer input(this->device_, 2048, 2048, BufferType::DRAM); - vector input_first(input.size() / sizeof(uint32_t), 0xfaceface); - vector input_last(input.size() / sizeof(uint32_t), 0); - for (uint32_t i = 0; i < input_last.size(); i++) { - input_last[i] = i; - } - - // DEVICE CAPTURE AND REPLAY MODE - // Capture trace on a device rather than a trace objet - detail::BeginTraceCapture(this->device_); - EnqueueWriteBuffer(command_queue, input, input_first.data(), kNonBlocking); - EnqueueWriteBuffer(command_queue, input, input_last.data(), kNonBlocking); - detail::EndTraceCapture(this->device_); - - // Repeat traces, check that last write occurs correctly during each iteration - vector readback(input.size() / sizeof(uint32_t), 0); - for (int i = 0; i < 10; i++) { - detail::ExecuteLastTrace(this->device_, true); - EnqueueReadBuffer(command_queue, input, readback.data(), kBlocking); - EXPECT_EQ(input_last, readback); - } + ReleaseTrace(this->device_, tid); } TEST_F(CommandQueueFixture, EnqueueProgramTraceCapture) { @@ -305,23 +143,19 @@ TEST_F(CommandQueueFixture, EnqueueProgramTraceCapture) { EnqueueProgram(command_queue, simple_program, true); EnqueueReadBuffer(command_queue, output, eager_output_data.data(), true); - // TRACE CAPTURE & INSTANTIATE MODE - Trace trace; EnqueueWriteBuffer(command_queue, input, input_data.data(), true); - BeginTrace(trace); - EnqueueProgram(trace.queue(), simple_program, false); - EndTrace(trace); + uint32_t tid = BeginTraceCapture(this->device_, command_queue.id(), 2048); + EnqueueProgram(command_queue, simple_program, false); + EndTraceCapture(this->device_, command_queue.id(), tid); - // Instantiate a trace on a device queue - uint32_t trace_id = InstantiateTrace(trace, command_queue); - - EnqueueTrace(command_queue, trace_id, true); + EnqueueTrace(command_queue, tid, true); EnqueueReadBuffer(command_queue, output, trace_output_data.data(), true); EXPECT_TRUE(eager_output_data == trace_output_data); // Done Finish(command_queue); + ReleaseTrace(this->device_, tid); } TEST_F(CommandQueueFixture, EnqueueProgramDeviceCapture) { @@ -341,9 +175,10 @@ TEST_F(CommandQueueFixture, EnqueueProgramDeviceCapture) { trace_output_data.resize(input_data.size()); bool has_eager = true; + std::shared_ptr simple_program; // EAGER MODE EXECUTION if (has_eager) { - Program simple_program = create_simple_unary_program(input, output); + simple_program = std::make_shared(create_simple_unary_program(input, output)); EnqueueWriteBuffer(command_queue, input, input_data.data(), true); EnqueueProgram(command_queue, simple_program, true); EnqueueReadBuffer(command_queue, output, eager_output_data.data(), true); @@ -351,16 +186,18 @@ TEST_F(CommandQueueFixture, EnqueueProgramDeviceCapture) { // DEVICE CAPTURE AND REPLAY MODE bool has_trace = false; + uint32_t tid = 0; for (int i = 0; i < 1; i++) { EnqueueWriteBuffer(command_queue, input, input_data.data(), true); if (!has_trace) { - detail::BeginTraceCapture(this->device_); - EnqueueProgram(command_queue, std::make_shared(create_simple_unary_program(input, output)), true); - detail::EndTraceCapture(this->device_); + // Program must be cached first + tid = 
BeginTraceCapture(this->device_, command_queue.id(), 2048); + EnqueueProgram(command_queue, simple_program, false); + EndTraceCapture(this->device_, command_queue.id(), tid); has_trace = true; } - detail::ExecuteLastTrace(this->device_, true); + ReplayTrace(this->device_, command_queue.id(), tid, true); EnqueueReadBuffer(command_queue, output, trace_output_data.data(), true); if (has_eager) EXPECT_TRUE(eager_output_data == trace_output_data); @@ -368,6 +205,7 @@ TEST_F(CommandQueueFixture, EnqueueProgramDeviceCapture) { // Done Finish(command_queue); + ReleaseTrace(this->device_, tid); } TEST_F(CommandQueueFixture, EnqueueTwoProgramTrace) { @@ -424,24 +262,20 @@ TEST_F(CommandQueueFixture, EnqueueTwoProgramTrace) { } // Capture trace on a trace queue - Trace trace; - CommandQueue& trace_queue = BeginTrace(trace); - EnqueueProgram(trace_queue, op0, kNonBlocking); - EnqueueProgram(trace_queue, op1, kNonBlocking); - EndTrace(trace); - - // Instantiate a trace on a device bound command queue - uint32_t trace_id = InstantiateTrace(trace, command_queue); + uint32_t tid = BeginTraceCapture(this->device_, command_queue.id(), 4096); + EnqueueProgram(command_queue, op0, kNonBlocking); + EnqueueProgram(command_queue, op1, kNonBlocking); + EndTraceCapture(this->device_, command_queue.id(), tid); // Trace mode execution for (auto i = 0; i < num_loops; i++) { ScopedTimer timer("Trace loop " + std::to_string(i)); EnqueueWriteBuffer(command_queue, input, input_data.data(), kNonBlocking); - EnqueueTrace(command_queue, trace_id, kNonBlocking); + EnqueueTrace(command_queue, tid, kNonBlocking); EnqueueReadBuffer(command_queue, output, trace_outputs[i].data(), kNonBlocking); } Finish(command_queue); - ReleaseTrace(trace_id); + ReleaseTrace(this->device_, tid); // Expect same output across all loops for (auto i = 0; i < num_loops; i++) { @@ -506,25 +340,21 @@ TEST_F(CommandQueueFixture, EnqueueMultiProgramTraceBenchmark) { } // Capture trace on a trace queue - Trace trace; - CommandQueue& trace_queue = BeginTrace(trace); + uint32_t tid = BeginTraceCapture(this->device_, command_queue.id(), 6144); for (uint32_t i = 0; i < num_programs; i++) { - EnqueueProgram(trace_queue, programs[i], kNonBlocking); + EnqueueProgram(command_queue, programs[i], kNonBlocking); } - EndTrace(trace); - - // Instantiate a trace on a device bound command queue - uint32_t trace_id = InstantiateTrace(trace, command_queue); + EndTraceCapture(this->device_, command_queue.id(), tid); // Trace mode execution for (auto i = 0; i < num_loops; i++) { ScopedTimer timer("Trace loop " + std::to_string(i)); EnqueueWriteBuffer(command_queue, input, input_data.data(), kNonBlocking); - EnqueueTrace(command_queue, trace_id, kNonBlocking); + EnqueueTrace(command_queue, tid, kNonBlocking); EnqueueReadBuffer(command_queue, output, trace_outputs[i].data(), kNonBlocking); } Finish(command_queue); - ReleaseTrace(trace_id); + ReleaseTrace(this->device_, tid); } } // end namespace basic_tests diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp index 22471c72c216..7a974adc63d9 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp @@ -76,8 +76,6 @@ Program create_simple_unary_program(const Buffer& input, 
const Buffer& output) { namespace basic_tests { TEST_F(MultiCommandQueueSingleDeviceFixture, EnqueueOneProgramTrace) { - // TODO: Re-enable when Trace brought up - GTEST_SKIP(); Buffer input(this->device_, 2048, 2048, BufferType::DRAM); Buffer output(this->device_, 2048, 2048, BufferType::DRAM); @@ -103,26 +101,22 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, EnqueueOneProgramTrace) { vector trace_output_data; trace_output_data.resize(input_data.size()); - Trace trace; EnqueueWriteBuffer(data_movement_queue, input, input_data.data(), true); - BeginTrace(trace); - EnqueueProgram(trace.queue(), simple_program, false); - EndTrace(trace); - // Instantiate a trace on a device queue - uint32_t trace_id = InstantiateTrace(trace, command_queue); + uint32_t tid = BeginTraceCapture(this->device_, command_queue.id(), 2048); + EnqueueProgram(command_queue, simple_program, false); + EndTraceCapture(this->device_, command_queue.id(), tid); - EnqueueTrace(command_queue, trace_id, true); + EnqueueTrace(command_queue, tid, true); EnqueueReadBuffer(data_movement_queue, output, trace_output_data.data(), true); EXPECT_TRUE(eager_output_data == trace_output_data); // Done Finish(command_queue); + ReleaseTrace(this->device_, tid); } TEST_F(MultiCommandQueueSingleDeviceFixture, EnqueueOneProgramTraceLoops) { - // TODO: Re-enable when Trace brought up - GTEST_SKIP(); Buffer input(this->device_, 2048, 2048, BufferType::DRAM); Buffer output(this->device_, 2048, 2048, BufferType::DRAM); @@ -145,19 +139,19 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, EnqueueOneProgramTraceLoops) { trace_outputs[i].resize(input_data.size()); } + // Compile + EnqueueProgram(command_queue, simple_program, true); + // Trace mode execution - Trace trace; - uint32_t trace_id; + uint32_t trace_id = 0; bool trace_captured = false; for (auto i = 0; i < num_loops; i++) { EnqueueWriteBuffer(data_movement_queue, input, input_data.data(), true); if (not trace_captured) { - BeginTrace(trace); - EnqueueProgram(trace.queue(), simple_program, false); - EndTrace(trace); - // Instantiate a trace on a device queue - trace_id = InstantiateTrace(trace, command_queue); + trace_id = BeginTraceCapture(this->device_, command_queue.id(), 4096); + EnqueueProgram(command_queue, simple_program, false); + EndTraceCapture(this->device_, command_queue.id(), trace_id); trace_captured = true; } @@ -170,11 +164,11 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, EnqueueOneProgramTraceLoops) { // Done Finish(command_queue); + ReleaseTrace(this->device_, trace_id); } TEST_F(MultiCommandQueueSingleDeviceFixture, EnqueueOneProgramTraceBenchmark) { - // TODO: Re-enable when Trace brought up - GTEST_SKIP(); + Buffer input(this->device_, 2048, 2048, BufferType::DRAM); Buffer output(this->device_, 2048, 2048, BufferType::DRAM); @@ -186,7 +180,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, EnqueueOneProgramTraceBenchmark) { // Keep this queue in passthrough mode for now CommandQueue& command_queue = this->device_->command_queue(0); - Program simple_program = create_simple_unary_program(input, output); + auto simple_program = create_simple_unary_program(input, output); vector input_data(input.size() / sizeof(uint32_t), 0); for (uint32_t i = 0; i < input_data.size(); i++) { input_data[i] = i; @@ -229,19 +223,15 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, EnqueueOneProgramTraceBenchmark) { } // Capture trace on a trace queue - Trace trace; - CommandQueue& trace_queue = BeginTrace(trace); - EnqueueProgram(trace_queue, simple_program, false); - EndTrace(trace); - - // 
Instantiate a trace on a device bound command queue - uint32_t trace_id = InstantiateTrace(trace, command_queue); + uint32_t tid = BeginTraceCapture(this->device_, command_queue.id(), 6144); + EnqueueProgram(command_queue, simple_program, false); + EndTraceCapture(this->device_, command_queue.id(), tid); // Trace mode execution for (auto i = 0; i < num_loops; i++) { tt::ScopedTimer timer("Trace loop " + std::to_string(i)); EnqueueWriteBuffer(command_queue, input, input_data.data(), kNonBlocking); - EnqueueTrace(command_queue, trace_id, kNonBlocking); + EnqueueTrace(command_queue, tid, kNonBlocking); EnqueueReadBuffer(command_queue, output, trace_outputs[i].data(), kNonBlocking); } Finish(command_queue); @@ -250,6 +240,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, EnqueueOneProgramTraceBenchmark) { for (auto i = 0; i < num_loops; i++) { EXPECT_TRUE(trace_outputs[i] == trace_outputs[0]); } + ReleaseTrace(this->device_, tid); } } // end namespace basic_tests diff --git a/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp b/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp index 16b6e308e975..e555c1adaf6c 100644 --- a/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp +++ b/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp @@ -204,16 +204,16 @@ void DeviceModule(py::module &m_device) { m_device.def("DeallocateBuffers", &detail::DeallocateBuffers, R"doc( Deallocate all buffers associated with Device handle )doc"); - m_device.def("BeginTraceCapture", &detail::BeginTraceCapture, R"doc( + m_device.def("BeginTraceCapture", &BeginTraceCapture, R"doc( Begin trace capture on Device handle )doc"); - m_device.def("EndTraceCapture", &detail::EndTraceCapture, R"doc( + m_device.def("EndTraceCapture", &EndTraceCapture, R"doc( End trace capture on Device handle )doc"); - m_device.def("ExecuteLastTrace", &detail::ExecuteLastTrace, R"doc( - Execute last captured trace on Device handle + m_device.def("ReplayTrace", &ReplayTrace, R"doc( + Replay last captured trace on Device handle )doc"); - m_device.def("ReleaseLastTrace", &detail::ReleaseLastTrace, R"doc( + m_device.def("ReleaseTrace", &ReleaseTrace, R"doc( Release last captured Trace on Device handle )doc"); diff --git a/tt_metal/detail/tt_metal.hpp b/tt_metal/detail/tt_metal.hpp index b668221ce6a2..48629c5d14e9 100644 --- a/tt_metal/detail/tt_metal.hpp +++ b/tt_metal/detail/tt_metal.hpp @@ -25,7 +25,7 @@ namespace tt::tt_metal{ namespace device_pool { - // Definition of the global device vector + // Definition of the global device vector extern std::vector devices; } // device_pool @@ -47,15 +47,6 @@ namespace tt::tt_metal{ void CloseDevices(std::map devices); Device *GetDeviceHandle(chip_id_t device_id); - void BeginTraceCapture(Device *device); - void EndTraceCapture(Device *device); - void ExecuteLastTrace(Device *device, bool blocking); - void ReleaseLastTrace(Device *device); - - void BeginTraceCaptures(std::map devices); - void EndTraceCaptures(std::map devices); - void ExecuteLastTraces(std::map devices, bool blocking); - /** * Copies data from a host buffer into the specified buffer * @@ -163,7 +154,7 @@ namespace tt::tt_metal{ * |---------------|---------------------------------------------------|-----------------|---------------------------|----------| * | device | The device holding the program being profiled. 
| Device * | | True |
 * */
- void InitDeviceProfiler(Device *device);
+ void InitDeviceProfiler(Device *device);
 /**
 * Read device side profiler data and dump results into device side CSV log
@@ -358,6 +349,9 @@ namespace tt::tt_metal{
 device->deallocate_buffers();
 }
+ void DisableAllocs(Device *device);
+ void EnableAllocs(Device *device);
+
 inline void GenerateDeviceHeaders(Device *device, const std::string &path)
 {
diff --git a/tt_metal/host_api.hpp b/tt_metal/host_api.hpp
index bf1188d56035..5f91cbbd8854 100644
--- a/tt_metal/host_api.hpp
+++ b/tt_metal/host_api.hpp
@@ -465,68 +465,76 @@ void Finish(CommandQueue& cq);
 /**
 * Begins capture on a trace, when the trace is in capture mode all programs pushed into the trace queue will have their execution delayed until the trace is instantiated and enqueued.
- * The capture must be later ended via EndTrace, and can be instantiated via InstantiateTrace on a device command queue, and finally scheduled to be executed via EnqueueTrace.
+ * The capture must be later ended via EndTraceCapture, and finally scheduled to be executed via ReplayTrace.
+ * Beginning a trace capture enables buffer allocations until capture has ended.
 *
- * Return value: CommandQueue&
+ * Return value: Trace ID
 *
- * | Argument | Description | Type | Valid Range | Required |
- * |--------------|------------------------------------------------------------------------|-------------------------------|------------------------------------|----------|
- * | trace | Trace in which to initiate the capture | Trace & | | Yes |
+ * | Argument | Description | Type | Valid Range | Required |
+ * |-----------------|------------------------------------------------------------------------|-------------------------------|------------------------------------|----------|
+ * | device | The device being traced. | Device * | | Yes |
+ * | cq_id | The command queue id associated with the trace. | uint8_t | | Yes |
+ * | trace_buff_size | The size of the trace buffer to pre-allocate. | uint32_t | | Yes |
 */
-CommandQueue& BeginTrace(Trace &trace);
+uint32_t BeginTraceCapture(Device *device, const uint8_t cq_id, const uint32_t trace_buff_size);
 /**
 * Completes capture on a trace, if captured commands do not conform to the rules of the trace, the trace will be invalidated.
- * This trace can later be instantiated via InstantiateTrace on a device command queue, and enqueued for execution via EnqueueTrace on the same device command queue.
+ * This trace can be enqueued for execution via ReplayTrace on the same device command queue.
+ * After ending a trace capture, buffer allocations on device are disabled until either a new trace begins capture,
+ * or all traces on the device are released.
 *
 * Return value: void
 *
 * | Argument | Description | Type | Valid Range | Required |
 * |--------------|------------------------------------------------------------------------|-------------------------------|------------------------------------|----------|
- * | trace | Trace in which to end the capture | Trace & | | Yes |
+ * | device | The device being traced. | Device * | | Yes |
+ * | cq_id | The command queue id associated with the trace. | uint8_t | | Yes |
+ * | tid | A unique id from BeginTraceCapture for the trace being captured | uint32_t | | Yes |
 */
-void EndTrace(Trace &trace);
+void EndTraceCapture(Device *device, const uint8_t cq_id, const uint32_t tid);
 /**
- * Instantiates a trace on a device command queue, triggering the staging of traced commands and data to the device.
- * Staging is a blocking operation and must be completed before the trace can be enqueued for exeuction. A unique trace instance id is returned
+ * Replays a trace of previously generated commands and data.
 *
- * Return value: uint32_t
+ * Return value: void
 *
 * | Argument | Description | Type | Valid Range | Required |
 * |--------------|------------------------------------------------------------------------|-------------------------------|------------------------------------|----------|
- * | trace | The trace object to instantiate | Trace & | | Yes |
- * | cq | The device command queue on which to instantiate the trace | CommandQueue & | | Yes |
-*/
-uint32_t InstantiateTrace(Trace &trace, CommandQueue &cq);
+ * | device | The device holding the trace. | Device * | | Yes |
+ * | cq_id | The command queue id associated with the trace. | uint8_t | | Yes |
+ * | tid | A unique id representing an existing captured trace. | uint32_t | | Yes |
+ * | blocking | Whether or not this is a blocking operation | bool | | Yes |
+ */
+void ReplayTrace(Device *device, const uint8_t cq_id, const uint32_t tid, const bool blocking);
 /**
- * Enqueues a trace of previously generated commands and data.
+ * Releases a previously captured trace, deallocating the associated trace buffers on device.
+ * This operation is not thread-safe; the user must ensure that the trace being released is no longer needed by device threads.
+ * If this releases the last trace on a device, then buffer allocations are re-enabled.
 *
 * Return value: void
 *
 * | Argument | Description | Type | Valid Range | Required |
 * |--------------|------------------------------------------------------------------------|-------------------------------|------------------------------------|----------|
- * | cq | The command queue object which dispatches the command to the hardware | CommandQueue & | | Yes |
- * | trace_id | A unique id representing an existing on-device trace, which has been | uint32_t | | Yes |
- * | | instantiated via InstantiateTrace where the trace_id is returned | | | |
- * | blocking | Whether or not this is a blocking operation | bool | | Yes |
+ * | device | The device holding the trace. | Device * | | Yes |
+ * | tid | A unique id representing an existing captured trace. | uint32_t | | Yes |
 */
-void EnqueueTrace(CommandQueue &cq, uint32_t trace_id, bool blocking);
-
+void ReleaseTrace(Device *device, const uint32_t tid);
 /**
- * Release a previously instantiated trace, deallocating the associated trace buffers on device
- * This operation is not thread-safe, user must ensure that the trace being released is no longer needed by device threads
+ * Enqueues a trace of previously generated commands and data.
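The entry points above compose into a capture-and-replay loop. What follows is a minimal sketch mirroring the updated unit tests earlier in this patch, not a normative example: the device and buffer setup, the 2048-byte trace buffer size, and create_simple_unary_program (a test helper) are illustrative.

    // Sketch: lifecycle of the new trace API (sizes and helpers illustrative).
    #include <cstdint>
    #include <memory>
    #include <vector>
    #include "tt_metal/host_api.hpp"
    using namespace tt::tt_metal;

    void trace_example(Device* device) {
        CommandQueue& cq = device->command_queue();
        Buffer input(device, 2048, 2048, BufferType::DRAM);
        Buffer output(device, 2048, 2048, BufferType::DRAM);
        std::vector<uint32_t> data(input.size() / sizeof(uint32_t), 0xfaceface);
        auto program = std::make_shared<Program>(create_simple_unary_program(input, output));

        // Programs must be compiled and cached by an eager run before capture;
        // enqueueing an uncached program in bypass mode now asserts.
        EnqueueWriteBuffer(cq, input, data.data(), /*blocking=*/true);
        EnqueueProgram(cq, program, /*blocking=*/true);

        uint32_t tid = BeginTraceCapture(device, cq.id(), 2048); // pre-allocates the trace buffer
        EnqueueProgram(cq, program, /*blocking=*/false);         // captured, not executed
        EndTraceCapture(device, cq.id(), tid);                   // commits commands to DRAM

        for (int i = 0; i < 10; i++) {
            EnqueueWriteBuffer(cq, input, data.data(), /*blocking=*/true);
            EnqueueTrace(cq, tid, /*blocking=*/true); // or ReplayTrace(device, cq.id(), tid, true)
            EnqueueReadBuffer(cq, output, data.data(), /*blocking=*/true);
        }
        Finish(cq);
        ReleaseTrace(device, tid); // frees the trace buffer on device
    }

Note that EnqueueTrace and ReplayTrace issue the same ENQUEUE_TRACE command; the former targets a CommandQueue object, the latter addresses the queue by id on the device.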
* * Return value: void * * | Argument | Description | Type | Valid Range | Required | * |--------------|------------------------------------------------------------------------|-------------------------------|------------------------------------|----------| + * | cq | The command queue object which dispatches the command to the hardware | CommandQueue & | | Yes | * | trace_id | A unique id representing an existing on-device trace, which has been | uint32_t | | Yes | * | | instantiated via InstantiateTrace where the trace_id is returned | | | | + * | blocking | Whether or not this is a blocking operation | bool | | Yes | */ -void ReleaseTrace(uint32_t trace_id); +void EnqueueTrace(CommandQueue &cq, uint32_t trace_id, bool blocking); /** * Read device side profiler data and dump results into device side CSV log diff --git a/tt_metal/impl/allocator/allocator.cpp b/tt_metal/impl/allocator/allocator.cpp index 67923f25b50a..e79629d6d99e 100644 --- a/tt_metal/impl/allocator/allocator.cpp +++ b/tt_metal/impl/allocator/allocator.cpp @@ -306,8 +306,17 @@ uint64_t base_alloc(const AllocatorConfig &config, BankManager &bank_manager, ui return bank_manager.allocate_buffer(size, page_size, bottom_up, config.compute_grid_size, num_shards); } +void disable_allocs(Allocator &allocator) { + allocator.disabled_allocs = true; +} + +void enable_allocs(Allocator &allocator) { + allocator.disabled_allocs = false; +} + uint64_t allocate_buffer(Allocator &allocator, uint32_t size, uint32_t page_size, const BufferType &buffer_type, bool bottom_up, std::optional num_shards) { uint64_t address = 0; + TT_FATAL(!allocator.disabled_allocs, "Allocation of new buffers has been disabled"); switch (buffer_type) { case BufferType::DRAM: return allocator.descriptor.dram.alloc(allocator.config, allocator.dram_manager, size, page_size, bottom_up, std::nullopt); case BufferType::L1: return allocator.descriptor.l1.alloc(allocator.config, allocator.l1_manager, size, page_size, bottom_up, num_shards); diff --git a/tt_metal/impl/allocator/allocator.hpp b/tt_metal/impl/allocator/allocator.hpp index f59a537b7d03..f8dd28d16ca0 100644 --- a/tt_metal/impl/allocator/allocator.hpp +++ b/tt_metal/impl/allocator/allocator.hpp @@ -99,6 +99,10 @@ uint64_t base_alloc(const AllocatorConfig & config, BankManager &bank_manager, u uint64_t allocate_buffer(Allocator &allocator, uint32_t size, uint32_t page_size, const BufferType &buffer_type, bool bottom_up, std::optional num_shards = std::nullopt); +void disable_allocs(Allocator &allocator); + +void enable_allocs(Allocator &allocator); + void deallocate_buffer(Allocator &allocator, uint64_t address, const BufferType &buffer_type); void deallocate_buffers(Allocator &allocator); @@ -109,6 +113,8 @@ void clear(Allocator &allocatator); struct Allocator { Allocator(const AllocatorConfig &alloc_config, const allocator::AllocDescriptor &alloc_descriptor); + bool disabled_allocs = false; + allocator::BankManager dram_manager; allocator::BankManager l1_manager; allocator::BankManager l1_small_manager; diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index b6146f64f5bb..68faeab5a66e 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -1293,8 +1293,13 @@ bool Device::close() { watcher_detach(this); for (const std::unique_ptr &hw_command_queue : hw_command_queues_) { + if (hw_command_queue->manager.get_bypass_mode()) { + hw_command_queue->record_end(); + } hw_command_queue->terminate(); } + this->trace_buffer_pool_.clear(); + 
detail::EnableAllocs(this); std::unordered_set not_done_dispatch_cores; std::unordered_set cores_to_skip; @@ -1674,58 +1679,58 @@ bool Device::using_slow_dispatch() const { return not (this->using_fast_dispatch); } -void Device::begin_trace() { - this->trace_contexts_.clear(); - for (size_t cq_id = 0; cq_id < num_hw_cqs(); cq_id++) { - trace_contexts_.push_back(std::make_shared()); - hw_command_queues_[cq_id]->record_begin(trace_contexts_.at(cq_id)); +void Device::begin_trace(const uint8_t cq_id, const uint32_t tid, const uint32_t trace_buff_size) { + TT_FATAL(this->trace_buffer_pool_.count(tid) == 0, "Trace already exists for tid {} on device", tid); + TT_FATAL(!this->hw_command_queues_[cq_id]->tid.has_value(), "CQ {} is already being used for tracing tid {}", (uint32_t)cq_id, tid); + auto desc = std::make_shared(); + detail::EnableAllocs(this); + this->trace_buffer_pool_.insert({tid, Trace::create_trace_buffer(this->command_queue(cq_id), desc, trace_buff_size)}); + this->hw_command_queues_[cq_id]->record_begin(tid, desc); +} + +void Device::end_trace(const uint8_t cq_id, const uint32_t tid) { + TT_FATAL(this->hw_command_queues_[cq_id]->tid == tid, "CQ {} is not being used for tracing tid {}", (uint32_t)cq_id, tid); + TT_FATAL(this->trace_buffer_pool_.count(tid) > 0, "Trace instance " + std::to_string(tid) + " must exist on device"); + this->hw_command_queues_[cq_id]->record_end(); + auto &data = this->trace_buffer_pool_[tid]->desc->data; + data = std::move(this->sysmem_manager().get_bypass_data()); + // Add command to terminate the trace buffer + DeviceCommand command_sequence(CQ_PREFETCH_CMD_BARE_MIN_SIZE); + command_sequence.add_prefetch_exec_buf_end(); + for (int i = 0; i < command_sequence.size_bytes() / sizeof(uint32_t); i++) { + data.push_back(((uint32_t*)command_sequence.data())[i]); } + Trace::initialize_buffer(this->command_queue(cq_id), this->trace_buffer_pool_[tid]); + detail::DisableAllocs(this); } -void Device::end_trace() { - - // Currently only supports one trace at a time per CQ, so release last trace - // before instantiating new ones. 
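The EnableAllocs/DisableAllocs pairing above encodes a simple invariant: new device buffers may be allocated only while no completed trace is pinned in DRAM, presumably so the addresses baked into the captured commands stay valid across replays. A sketch of the observable behavior under the guard this patch adds to allocate_buffer(); sizes are illustrative:

    // Sketch: allocation gating across the trace lifecycle (illustrative sizes).
    uint32_t tid = BeginTraceCapture(device, cq.id(), 2048); // EnableAllocs: capture may allocate
    EnqueueProgram(cq, program, /*blocking=*/false);
    EndTraceCapture(device, cq.id(), tid);                   // DisableAllocs: trace buffer now pinned

    // Any allocation here would trip the new allocator guard:
    //   TT_FATAL(!allocator.disabled_allocs, "Allocation of new buffers has been disabled");
    // Buffer scratch(device, 2048, 2048, BufferType::DRAM); // would throw

    ReleaseTrace(device, tid); // pool is empty again, so EnableAllocs runs
    Buffer scratch(device, 2048, 2048, BufferType::DRAM);    // allowed once more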
- this->release_last_trace(); - - for (size_t cq_id = 0; cq_id < num_hw_cqs(); cq_id++) { - hw_command_queues_[cq_id]->record_end(); - trace_contexts_.at(cq_id)->data = std::move(this->sysmem_manager().get_bypass_data()); - uint32_t tid = Trace::instantiate(this->command_queue(cq_id), trace_contexts_.at(cq_id)); - trace_insts_.push_back(tid); - } -} - -void Device::execute_last_trace(bool blocking) { +void Device::replay_trace(const uint8_t cq_id, const uint32_t tid, const bool blocking) { constexpr bool check = false; - for (size_t cq_id = 0; cq_id < num_hw_cqs(); cq_id++) { - if (this->trace_insts_.at(cq_id).has_value()) { - uint32_t tid = this->trace_insts_.at(cq_id).value(); - TT_FATAL(Trace::has_instance(tid), "Trace instance " + std::to_string(tid) + " must exist on device"); - if constexpr (check) { - Trace::validate_instance(tid); - } - this->command_queue(cq_id).run_command(CommandInterface{ - .type = EnqueueCommandType::ENQUEUE_TRACE, - .blocking = blocking, - .trace_id = tid - }); - } + TT_FATAL(this->trace_buffer_pool_.count(tid) > 0, "Trace instance " + std::to_string(tid) + " must exist on device"); + if constexpr (check) { + Trace::validate_instance(*this->trace_buffer_pool_[tid]); + } + this->command_queue(cq_id).run_command(CommandInterface{ + .type = EnqueueCommandType::ENQUEUE_TRACE, + .blocking = blocking, + .trace_id = tid + }); +} + +void Device::release_trace(const uint32_t tid) { + uint32_t erased = this->trace_buffer_pool_.erase(tid); + // Only enable allocations once all captured traces are released + if (this->trace_buffer_pool_.empty()) { + detail::EnableAllocs(this); } } -void Device::release_last_trace() { - for (size_t cq_id = 0; cq_id < num_hw_cqs(); cq_id++) { - if (this->trace_insts_.size() > cq_id) { - if (this->trace_insts_.at(cq_id).has_value()) { - uint32_t tid = this->trace_insts_.at(cq_id).value(); - if (Trace::has_instance(tid)) { - Trace::remove_instance(tid); - } - } - } +std::shared_ptr Device::get_trace(const uint32_t tid) { + if (auto trace = this->trace_buffer_pool_.find(tid); trace != this->trace_buffer_pool_.end()) { + return trace->second; + } else { + return nullptr; } - this->trace_insts_.clear(); } } // namespace tt_metal diff --git a/tt_metal/impl/device/device.hpp b/tt_metal/impl/device/device.hpp index 07a2af34385d..8f922d198e89 100644 --- a/tt_metal/impl/device/device.hpp +++ b/tt_metal/impl/device/device.hpp @@ -11,7 +11,7 @@ #include "impl/dispatch/work_executor.hpp" #include "tt_metal/impl/allocator/basic_allocator.hpp" #include "tt_metal/impl/allocator/l1_banking_allocator.hpp" -// #include "tt_metal/impl/trace/trace.hpp" +#include "tt_metal/impl/trace/trace_buffer.hpp" #include "tt_metal/jit_build/build.hpp" #include "llrt/tt_cluster.hpp" #include "dev_msgs.h" @@ -200,10 +200,11 @@ class Device { CommandQueue& command_queue(size_t cq_id = 0); // Metal trace device capture mode - void begin_trace(); - void end_trace(); - void execute_last_trace(bool blocking); - void release_last_trace(); + void begin_trace(const uint8_t cq_id, const uint32_t tid, const uint32_t trace_buff_size); + void end_trace(const uint8_t cq_id, const uint32_t tid); + void replay_trace(const uint8_t cq_id, const uint32_t tid, const bool blocking); + void release_trace(const uint32_t tid); + std::shared_ptr get_trace(const uint32_t tid); bool using_slow_dispatch() const; void check_allocator_is_initialized() const; @@ -298,8 +299,7 @@ class Device { } private: - std::vector> trace_insts_; - std::vector> trace_contexts_; + std::unordered_map> 
trace_buffer_pool_; }; } // namespace tt_metal diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index 9cb033ef51f9..30cde282927d 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -1117,8 +1117,6 @@ void EnqueueTraceCommand::process() { const bool stall_prefetcher = true; this->manager.fetch_queue_write(cmd_sequence_sizeB, this->command_queue_id, stall_prefetcher); - - // log_trace(LogDispatch, "EnqueueTraceCommand issued write_ptr={}, fetch_size={}, commands={}", write_ptr, fetch_size_bytes, this->commands); } EnqueueTerminateCommand::EnqueueTerminateCommand( @@ -1224,6 +1222,8 @@ void HWCommandQueue::enqueue_read_buffer(std::shared_ptr buffer, void* d // Read buffer command is enqueued in the issue region and device writes requested buffer data into the completion region void HWCommandQueue::enqueue_read_buffer(Buffer& buffer, void* dst, bool blocking) { ZoneScopedN("HWCommandQueue_read_buffer"); + TT_FATAL(!this->manager.get_bypass_mode(), "Enqueue Read Buffer cannot be used with tracing"); + chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->device->id()); uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device->id()); CoreType dispatch_core_type = dispatch_core_manager::get(this->device->num_hw_cqs()).get_dispatch_core_type(this->device->id()); @@ -1310,6 +1310,7 @@ CoreType HWCommandQueue::get_dispatch_core_type() { void HWCommandQueue::enqueue_write_buffer(const Buffer& buffer, const void* src, bool blocking) { ZoneScopedN("HWCommandQueue_write_buffer"); + TT_FATAL(!this->manager.get_bypass_mode(), "Enqueue Write Buffer cannot be used with tracing"); uint32_t padded_page_size = align(buffer.page_size(), ADDRESS_ALIGNMENT); @@ -1441,15 +1442,14 @@ void HWCommandQueue::enqueue_write_buffer(const Buffer& buffer, const void* src, void HWCommandQueue::enqueue_program( Program& program, bool blocking) { ZoneScopedN("HWCommandQueue_enqueue_program"); - - this->force_commands([&]() { - if (not program.loaded_onto_device) { - TT_ASSERT(program.program_transfer_info.kernel_bins.size() == program.kg_buffers.size()); - for (int buffer_idx = 0; buffer_idx < program.program_transfer_info.kernel_bins.size(); buffer_idx++) { - this->enqueue_write_buffer(*program.kg_buffers[buffer_idx], program.program_transfer_info.kernel_bins[buffer_idx].data.data(), false); - } + if (not program.loaded_onto_device) { + TT_FATAL(!this->manager.get_bypass_mode(), "Tracing should only be used when programs have been cached"); + TT_ASSERT(program.program_transfer_info.kernel_bins.size() == program.kg_buffers.size()); + for (int buffer_idx = 0; buffer_idx < program.program_transfer_info.kernel_bins.size(); buffer_idx++) { + this->enqueue_write_buffer(*program.kg_buffers[buffer_idx], program.program_transfer_info.kernel_bins[buffer_idx].data.data(), false); } - }); + } + // Snapshot of expected workers from previous programs, used for dispatch_wait cmd generation. uint32_t expected_workers_completed = this->manager.get_bypass_mode() ? 
this->trace_ctx->num_completion_worker_cores : this->expected_num_workers_completed; @@ -1461,8 +1461,6 @@ void HWCommandQueue::enqueue_program( if (this->manager.get_bypass_mode()) { this->trace_ctx->num_completion_worker_cores += program.program_transfer_info.num_active_cores; - this->trace_ctx->owned_buffer_pool.insert(this->trace_ctx->owned_buffer_pool.end(), program.kg_buffers.begin(), program.kg_buffers.end()); - this->trace_ctx->owned_buffer_pool.insert(this->trace_ctx->owned_buffer_pool.end(), program.owned_buffer_pool.begin(), program.owned_buffer_pool.end()); } else { this->expected_num_workers_completed += program.program_transfer_info.num_active_cores; } @@ -1471,6 +1469,8 @@ void HWCommandQueue::enqueue_program( void HWCommandQueue::enqueue_record_event(std::shared_ptr event, bool clear_count) { ZoneScopedN("HWCommandQueue_enqueue_record_event"); + TT_FATAL(!this->manager.get_bypass_mode(), "Enqueue Record Event cannot be used with tracing"); + // Populate event struct for caller. When async queues are enabled, this is in child thread, so consumers // of the event must wait for it to be ready (ie. populated) here. Set ready flag last. This couldn't be // in main thread otherwise event_id selection would get out of order due to main/worker thread timing. @@ -1479,23 +1479,14 @@ void HWCommandQueue::enqueue_record_event(std::shared_ptr event, bool cle event->device = this->device; event->ready = true; // what does this mean??? - if (this->manager.get_bypass_mode()) { - TT_FATAL(this->trace_ctx != nullptr, "A trace context must be present in bypass mode!"); - event->event_id = this->trace_ctx->relative_event_id(event->event_id); - } auto command = EnqueueRecordEventCommand(this->id, this->device, this->manager, event->event_id, this->expected_num_workers_completed, clear_count); this->enqueue_command(command, false); if (clear_count) { this->expected_num_workers_completed = 0; } - if (this->manager.get_bypass_mode()) { - this->trace_ctx->traced_completion_q_reads.push(detail::ReadEventDescriptor(event->event_id)); - this->trace_ctx->num_completion_q_reads++; - } else { - this->issued_completion_q_reads.push(detail::ReadEventDescriptor(event->event_id)); - this->increment_num_entries_in_completion_q(); - } + this->issued_completion_q_reads.push(detail::ReadEventDescriptor(event->event_id)); + this->increment_num_entries_in_completion_q(); } void HWCommandQueue::enqueue_wait_for_event(std::shared_ptr sync_event, bool clear_count) { @@ -1514,36 +1505,13 @@ void HWCommandQueue::enqueue_wait_for_event(std::shared_ptr sync_event, b void HWCommandQueue::enqueue_trace(const uint32_t trace_id, bool blocking) { ZoneScopedN("HWCommandQueue_enqueue_trace"); - auto trace_inst = Trace::get_instance(trace_id); - auto command = EnqueueTraceCommand(this->id, this->device, this->manager, *trace_inst.buffer, this->expected_num_workers_completed); - - // Emit the completion queue entries from the trace - auto& cmpl_q = trace_inst.desc->traced_completion_q_reads; - uint32_t num_events = 0; - uint32_t event_id = this->manager.get_next_event(this->id); - for (auto read_descriptor : cmpl_q) { - std::visit( - [&](auto&& read_descriptor) { - using T = std::decay_t; - if constexpr (std::is_same_v) { - TT_THROW("Device trace does not support ReadBuffer commands, please perform on Host instead!"); - } else if constexpr (std::is_same_v) { - read_descriptor.set_global_offset(event_id); - this->issued_completion_q_reads.push(read_descriptor); - this->increment_num_entries_in_completion_q(); - num_events++; - } 
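With completion-queue reads no longer recorded into traces (reads, writes, and events now assert in bypass mode), the only aggregate a replay has to account for is how many worker cores will report completion. A standalone model of that bookkeeping, with illustrative core counts:

    // Sketch: worker-completion accounting around trace capture and replay.
    #include <cassert>
    #include <cstdint>

    struct TraceDesc { uint32_t num_completion_worker_cores = 0; };

    struct QueueModel {
        uint32_t expected_num_workers_completed = 0;
        // In bypass (capture) mode a program adds its active cores to the
        // trace descriptor instead of the live counter:
        void capture_program(TraceDesc& d, uint32_t active_cores) {
            d.num_completion_worker_cores += active_cores;
        }
        // On replay the whole captured batch is accounted at once:
        void enqueue_trace(const TraceDesc& d) {
            expected_num_workers_completed += d.num_completion_worker_cores;
        }
    };

    int main() {
        QueueModel q;
        TraceDesc d;
        q.capture_program(d, 8); // op0 active on 8 cores
        q.capture_program(d, 4); // op1 active on 4 cores
        q.enqueue_trace(d);
        assert(q.expected_num_workers_completed == 12);
        return 0;
    }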
- }, - read_descriptor); + auto trace_inst = this->device->get_trace(trace_id); + auto command = EnqueueTraceCommand(this->id, this->device, this->manager, *trace_inst->buffer, this->expected_num_workers_completed); - } this->enqueue_command(command, false); - // Increment the global event counter due to trace emitting events in a batch - this->manager.increment_event_id(this->id, num_events); - // Increment the exepected worker cores counter due to trace programs completions - this->expected_num_workers_completed += trace_inst.desc->num_completion_worker_cores; + this->expected_num_workers_completed += trace_inst->desc->num_completion_worker_cores; if (blocking) { this->finish(); @@ -1777,7 +1745,6 @@ void HWCommandQueue::finish() { tt::log_debug(tt::LogDispatch, "Finish for command queue {}", this->id); std::shared_ptr event = std::make_shared(); this->enqueue_record_event(event); - if (tt::llrt::OptionsG.get_test_mode_enabled()) { while (this->num_entries_in_completion_q > this->num_completed_completion_q_reads) { if (DPrintServerHangDetected()) { @@ -1805,22 +1772,25 @@ volatile bool HWCommandQueue::is_noc_hung() { return illegal_noc_txn_hang; } -void HWCommandQueue::record_begin(std::shared_ptr ctx) { +void HWCommandQueue::record_begin(const uint32_t tid, std::shared_ptr ctx) { // Issue event as a barrier and a counter reset std::shared_ptr event = std::make_shared(); this->enqueue_record_event(event, true); // Record commands using bypass mode + this->tid = tid; this->trace_ctx = ctx; this->manager.set_bypass_mode(true, true); // start } void HWCommandQueue::record_end() { + this->tid = std::nullopt; this->trace_ctx = nullptr; this->manager.set_bypass_mode(false, false); // stop } void HWCommandQueue::terminate() { ZoneScopedN("HWCommandQueue_terminate"); + TT_FATAL(!this->manager.get_bypass_mode(), "Terminate cannot be used with tracing"); tt::log_debug(tt::LogDispatch, "Terminating dispatch kernels for command queue {}", this->id); auto command = EnqueueTerminateCommand(this->id, this->device, this->manager); this->enqueue_command(command, false); @@ -2114,33 +2084,10 @@ void FinishImpl(CommandQueue& cq) { cq.hw_command_queue().finish(); } -CommandQueue& BeginTrace(Trace& trace) { - log_debug(LogMetalTrace, "Begin trace capture"); - trace.begin_capture(); - return trace.queue(); -} - -void EndTrace(Trace& trace) { - trace.end_capture(); - log_debug(LogMetalTrace, "End trace capture"); -} - -uint32_t InstantiateTrace(Trace& trace, CommandQueue& cq) { - uint32_t trace_id = trace.instantiate(cq); - return trace_id; -} - -void ReleaseTrace(uint32_t trace_id) { - if (trace_id == -1) { - Trace::release_all(); - } else if (Trace::has_instance(trace_id)) { - Trace::remove_instance(trace_id); - } -} void EnqueueTrace(CommandQueue& cq, uint32_t trace_id, bool blocking) { detail::DispatchStateCheck(true); - TT_FATAL(Trace::has_instance(trace_id), "Trace instance " + std::to_string(trace_id) + " must exist on device"); + TT_FATAL(cq.device()->get_trace(trace_id) != nullptr, "Trace instance " + std::to_string(trace_id) + " must exist on device"); cq.run_command(CommandInterface{ .type = EnqueueCommandType::ENQUEUE_TRACE, .blocking = blocking, diff --git a/tt_metal/impl/dispatch/command_queue.hpp b/tt_metal/impl/dispatch/command_queue.hpp index 6acfe40a4470..3a643b8969a6 100644 --- a/tt_metal/impl/dispatch/command_queue.hpp +++ b/tt_metal/impl/dispatch/command_queue.hpp @@ -15,7 +15,7 @@ #include "tt_metal/impl/dispatch/command_queue_interface.hpp" #include 
"tt_metal/impl/dispatch/device_command.hpp" #include "tt_metal/impl/dispatch/lock_free_queue.hpp" -#include "tt_metal/impl/trace/trace.hpp" +#include "tt_metal/impl/trace/trace_buffer.hpp" #include "tt_metal/common/base.hpp" #include "tt_metal/impl/program/program.hpp" #include "common/env_lib.hpp" @@ -59,7 +59,6 @@ string EnqueueCommandTypeToString(EnqueueCommandType ctype); uint32_t get_noc_unicast_encoding(const CoreCoord &coord); uint32_t get_noc_multcast_encoding(const CoreCoord &start, const CoreCoord &end); -class Trace; class CommandQueue; class CommandInterface; @@ -389,7 +388,6 @@ class EnqueueTerminateCommand : public Command { }; namespace detail { -class TraceDescriptor; inline bool LAZY_COMMAND_QUEUE_MODE = false; /* @@ -455,30 +453,13 @@ class HWCommandQueue { volatile bool is_dprint_server_hung(); volatile bool is_noc_hung(); - void record_begin(std::shared_ptr ctx); + void record_begin(const uint32_t tid, std::shared_ptr ctx); void record_end(); - // Record all commands and metadata from run_commands function - template - inline std::vector record_commands(std::shared_ptr ctx, Func run_commands) { - this->record_begin(ctx); - run_commands(); - this->record_end(); - return std::move(this->manager.get_bypass_data()); - } - - // Force commands to be issued, overrides tracing if this called within record_commands - template - inline void force_commands(Func run_commands) { - bool bypass = this->manager.get_bypass_mode(); - this->manager.set_bypass_mode(false, false); // pause - run_commands(); - this->manager.set_bypass_mode(bypass, false); // resume - } - private: uint32_t id; uint32_t size_B; + std::optional tid; std::shared_ptr trace_ctx; std::thread completion_queue_thread; SystemMemoryManager& manager; diff --git a/tt_metal/impl/program/program.hpp b/tt_metal/impl/program/program.hpp index 882140ef0d64..29353e9978ee 100644 --- a/tt_metal/impl/program/program.hpp +++ b/tt_metal/impl/program/program.hpp @@ -56,6 +56,7 @@ template vector> extract_dst_noc_multicast_info(Device* device, const CoreRangeContainer& ranges, const CoreType core_type) { // This API extracts all the pairs of noc multicast encodings given a set of core ranges vector> dst_noc_multicast_info; + dst_noc_multicast_info.reserve(ranges.size()); for (const CoreRange& core_range : ranges) { CoreCoord physical_start = device->physical_core_from_logical_core(core_range.start, core_type); CoreCoord physical_end = device->physical_core_from_logical_core(core_range.end, core_type); diff --git a/tt_metal/impl/trace/trace.cpp b/tt_metal/impl/trace/trace.cpp index 53af006a66b0..05bf650b9cbc 100644 --- a/tt_metal/impl/trace/trace.cpp +++ b/tt_metal/impl/trace/trace.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "impl/trace/trace.hpp" + #include #include @@ -24,7 +25,8 @@ static constexpr uint32_t kExecBufPageMin = 1024; static constexpr uint32_t kExecBufPageMax = 4096; // Assumes pages are interleaved across all banks starting at 0 -size_t interleaved_page_size(const uint32_t buf_size, const uint32_t num_banks, const uint32_t min_size, const uint32_t max_size) { +size_t interleaved_page_size( + const uint32_t buf_size, const uint32_t num_banks, const uint32_t min_size, const uint32_t max_size) { // Populate power of 2 numbers within min and max as candidates TT_FATAL(min_size > 0 and min_size <= max_size); vector candidates; @@ -50,149 +52,62 @@ size_t interleaved_page_size(const uint32_t buf_size, const uint32_t num_banks, TT_FATAL(pick >= min_size and pick <= max_size); return pick; } -} +} // 
namespace namespace tt::tt_metal { -unordered_map Trace::buffer_pool; -std::mutex Trace::pool_mutex; - -Trace::Trace() { - this->reset(); -} - -void Trace::reset() { - this->state = TraceState::EMPTY; - this->tq = std::make_unique(*this); -} - -void Trace::begin_capture() { - TT_FATAL(this->state == TraceState::EMPTY, "Cannot begin capture in a non-empty state"); - TT_FATAL(this->queue().empty(), "Cannot begin trace on one that already captured commands"); - this->state = TraceState::CAPTURING; -} - -void Trace::end_capture() { - TT_FATAL(this->state == TraceState::CAPTURING, "Cannot end capture that has not begun"); - this->validate(); - this->state = TraceState::CAPTURED; -} - -void Trace::validate() { - for (const auto& cmd : this->queue().worker_queue) { - if (cmd.blocking.has_value()) { - // The workload being traced needs to be self-contained and not require any host interaction - // Blocking by definition yields control back to the host, consider breaking it into multiple traces - TT_FATAL(cmd.blocking.value() == false, "Only non-blocking commands can be captured in Metal Trace!"); - } - } -} +std::atomic Trace::global_trace_id = 0; uint32_t Trace::next_id() { - static uint32_t global_trace_id = 0; return global_trace_id++; } -// Stage the trace commands into device DRAM as an interleaved buffer for execution -uint32_t Trace::instantiate(CommandQueue& cq) { - this->state = TraceState::INSTANTIATING; - auto desc = std::make_shared(); - - // Record the captured Host API as commands via trace_commands, - desc->data = cq.hw_command_queue().record_commands(desc, [&]() { - for (auto cmd : this->queue().worker_queue) { - cq.run_command(cmd); - } - cq.wait_until_empty(); - }); +std::shared_ptr Trace::create_trace_buffer( + const CommandQueue& cq, shared_ptr desc, uint32_t unpadded_size) { + size_t page_size = interleaved_page_size( + unpadded_size, cq.device()->num_banks(BufferType::DRAM), kExecBufPageMin, kExecBufPageMax); + uint64_t padded_size = round_up(unpadded_size, page_size); - uint32_t tid = Trace::instantiate(cq, desc); - this->state = TraceState::READY; - return tid; + // Commit the trace buffer to device DRAM + return std::make_shared( + desc, + std::make_shared( + cq.device(), padded_size, page_size, BufferType::DRAM, TensorMemoryLayout::INTERLEAVED)); } -uint32_t Trace::instantiate(CommandQueue& cq, shared_ptr desc) { - uint32_t tid = Trace::next_id(); - TT_FATAL(Trace::has_instance(tid) == false, "Trace ID " + std::to_string(tid) + " already exists"); +void Trace::initialize_buffer(CommandQueue& cq, std::shared_ptr trace_buffer) { + vector& data = trace_buffer->desc->data; - vector& data = desc->data; - - // Add command to terminate the trace buffer - DeviceCommand command_sequence(CQ_PREFETCH_CMD_BARE_MIN_SIZE); - command_sequence.add_prefetch_exec_buf_end(); - for (int i = 0; i < command_sequence.size_bytes() / sizeof(uint32_t); i++) { - data.push_back(((uint32_t*)command_sequence.data())[i]); - } - - // Pad the trace buffer to the next fully banked page uint64_t unpadded_size = data.size() * sizeof(uint32_t); - size_t page_size = interleaved_page_size( - unpadded_size, cq.device()->num_banks(BufferType::DRAM), kExecBufPageMin, kExecBufPageMax); - size_t numel_page = page_size / sizeof(uint32_t); - size_t numel_padding = numel_page - data.size() % numel_page; + TT_FATAL( + unpadded_size <= trace_buffer->buffer->size(), + "Trace data size {} is larger than specified trace buffer size {}. 
Increase specified buffer size.", + unpadded_size, + trace_buffer->buffer->size()); + size_t numel_padding = (trace_buffer->buffer->size() - unpadded_size) / sizeof(uint32_t); if (numel_padding > 0) { - data.resize(data.size() + numel_padding, 0/*padding value*/); + data.resize(data.size() + numel_padding, 0 /*padding value*/); } uint64_t padded_size = data.size() * sizeof(uint32_t); + EnqueueWriteBuffer(cq, trace_buffer->buffer, data, kBlocking); - // Commit the trace buffer to device DRAM - auto buffer = std::make_shared(cq.device(), padded_size, page_size, BufferType::DRAM, TensorMemoryLayout::INTERLEAVED); - - EnqueueWriteBuffer(cq, buffer, data, kBlocking); - Finish(cq); // clear side effects flag - - // Pin the trace buffer in memory until explicitly released by the user - Trace::add_instance(tid, {desc, buffer}); - log_trace(LogMetalTrace, - "Trace {} instantiated with completion buffer num_entries={}, issue buffer unpadded size={}, padded size={}, num_pages={}", - tid, desc->num_completion_q_reads, unpadded_size, padded_size, padded_size / page_size); - return tid; -} - -bool Trace::has_instance(const uint32_t tid) { - return _safe_pool([&] { - return Trace::buffer_pool.find(tid) != Trace::buffer_pool.end(); - }); -} - -void Trace::add_instance(const uint32_t tid, TraceBuffer buf) { - _safe_pool([&] { - TT_FATAL(Trace::buffer_pool.find(tid) == Trace::buffer_pool.end()); - Trace::buffer_pool.insert({tid, buf}); - }); -} - -void Trace::remove_instance(const uint32_t tid) { - _safe_pool([&] { - TT_FATAL(Trace::buffer_pool.find(tid) != Trace::buffer_pool.end()); - Trace::buffer_pool.erase(tid); - }); -} - -void Trace::release_all() { - _safe_pool([&] { - Trace::buffer_pool.clear(); - }); + log_trace( + LogMetalTrace, + "Trace issue buffer unpadded size={}, padded size={}, num_pages={}", + unpadded_size, + padded_size, + trace_buffer->buffer->num_pages()); } // there is a cost to validation, please use it judiciously -void Trace::validate_instance(const uint32_t tid) { +void Trace::validate_instance(const TraceBuffer& trace_buffer) { vector backdoor_data; - auto trace_inst = Trace::get_instance(tid); - detail::ReadFromBuffer(trace_inst.buffer, backdoor_data); - if (backdoor_data != trace_inst.desc->data) { - log_info(LogMetalTrace, "Trace buffer expected: {}", trace_inst.desc->data); + detail::ReadFromBuffer(trace_buffer.buffer, backdoor_data); + if (backdoor_data != trace_buffer.desc->data) { + log_info(LogMetalTrace, "Trace buffer expected: {}", trace_buffer.desc->data); log_info(LogMetalTrace, "Trace buffer observed: {}", backdoor_data); - TT_THROW("Trace buffer data mismatch for instance {}", tid); } // add more checks } -TraceBuffer Trace::get_instance(const uint32_t tid) { - return _safe_pool([&] { - TT_FATAL(Trace::buffer_pool.find(tid) != Trace::buffer_pool.end()); - return Trace::buffer_pool[tid]; - }); -} - } // namespace tt::tt_metal diff --git a/tt_metal/impl/trace/trace.hpp b/tt_metal/impl/trace/trace.hpp index 7653ee06bf21..a03ff2d7a67a 100644 --- a/tt_metal/impl/trace/trace.hpp +++ b/tt_metal/impl/trace/trace.hpp @@ -10,122 +10,26 @@ #include #include -#include "tt_metal/host_api.hpp" #include "tt_metal/impl/buffers/buffer.hpp" #include "tt_metal/impl/dispatch/command_queue.hpp" -#include "tt_metal/impl/dispatch/lock_free_queue.hpp" -#include "tt_metal/impl/program/program.hpp" +#include "tt_metal/impl/trace/trace_buffer.hpp" namespace tt::tt_metal { -using std::shared_ptr; -using std::unique_ptr; -using std::unordered_map; -using std::vector; - -class CommandQueue; - 
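To make the sizing in create_trace_buffer/initialize_buffer concrete: the user-specified size is rounded up to a whole number of interleaved DRAM pages at capture begin, and at capture end the serialized command stream is zero-padded up to that buffer size before the blocking write. A standalone sketch of the arithmetic, assuming interleaved_page_size() picked the 1024-byte kExecBufPageMin:

    // Sketch: trace buffer padding arithmetic (illustrative numbers).
    #include <cstdint>
    #include <cstdio>

    int main() {
        uint32_t trace_buff_size = 2048; // from BeginTraceCapture
        uint32_t page_size = 1024;       // assumed pick of interleaved_page_size()
        uint64_t buffer_size = (trace_buff_size + page_size - 1) / page_size * page_size; // round_up

        uint32_t captured_words = 420;   // serialized commands incl. exec_buf_end
        uint64_t unpadded_size = captured_words * sizeof(uint32_t); // 1680 bytes
        // initialize_buffer asserts unpadded_size <= buffer_size, then zero-pads:
        uint64_t numel_padding = (buffer_size - unpadded_size) / sizeof(uint32_t);

        std::printf("buffer=%llu B, padding=%llu words\n",
                    (unsigned long long)buffer_size,
                    (unsigned long long)numel_padding); // buffer=2048 B, padding=92 words
        return 0;
    }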
-namespace detail { -struct ReadBufferDescriptor; -struct ReadEventDescriptor; -typedef LockFreeQueue> CompletionReaderQueue; - -struct TraceDescriptor { - std::optional initial_event_id; - CompletionReaderQueue traced_completion_q_reads; - uint32_t num_completion_q_reads; - uint32_t num_completion_worker_cores; - std::vector data; - - std::vector> owned_buffer_pool; - - TraceDescriptor() { - this->reset(); - } - - void reset() { - this->initial_event_id.reset(); - this->traced_completion_q_reads.clear(); - this->num_completion_q_reads = 0; - this->num_completion_worker_cores = 0; - this->owned_buffer_pool.clear(); - } - - // Calculate relative offset to the initial event ID of the trace - uint32_t relative_event_id(uint32_t event_id) { - if (not this->initial_event_id.has_value()) { - initial_event_id = event_id; - } - TT_FATAL(event_id >= initial_event_id.value(), "Traced event ID must be greater or equal to initial event ID"); - return event_id - initial_event_id.value(); - } -}; -} - -struct TraceBuffer { - shared_ptr desc; - shared_ptr buffer; -}; - -enum class TraceState { - EMPTY, - CAPTURING, - CAPTURED, - INSTANTIATING, - READY -}; - class Trace { private: - friend class EnqueueProgramCommand; - friend void EnqueueTrace(CommandQueue& cq, uint32_t tid, bool blocking); - - TraceState state; - - // Trace queue used to capture commands - unique_ptr tq; + static std::atomic global_trace_id; - // Trace instance id to buffer mapping mananged via instantiate and release calls - // a static map keeps trace buffers alive until explicitly released by the user - static unordered_map buffer_pool; - - // Thread safe accessor to trace::buffer_pool - static std::mutex pool_mutex; - template - static inline auto _safe_pool(Func func) { - std::lock_guard lock(Trace::pool_mutex); - return func(); - } + public: + Trace() = delete; static uint32_t next_id(); - public : - Trace(); - ~Trace() { - TT_FATAL(this->state != TraceState::CAPTURING, "Trace capture incomplete before destruction!"); - TT_FATAL(this->state != TraceState::INSTANTIATING, "Trace instantiation incomplete before destruction!"); - } - - // Return the captured trace queue - CommandQueue& queue() const { return *tq; }; - - // Stages a trace buffer into device DRAM via the CQ passed in and returns a unique trace id - uint32_t instantiate(CommandQueue& cq); - - // Trace capture, validation, and query methods - void begin_capture(); - void end_capture(); - void validate(); - void reset(); - // Thread-safe accessors to manage trace instances - static bool has_instance(const uint32_t tid); - static void add_instance(const uint32_t tid, TraceBuffer buf); - static void remove_instance(const uint32_t tid); - static void validate_instance(const uint32_t tid); - static void release_all(); // note all instances across all devices are released - static TraceBuffer get_instance(const uint32_t tid); - static uint32_t instantiate(CommandQueue& cq, shared_ptr desc); + static void validate_instance(const TraceBuffer& trace_buffer); + static void initialize_buffer(CommandQueue& cq, std::shared_ptr trace_buffer); + static std::shared_ptr create_trace_buffer( + const CommandQueue& cq, shared_ptr desc, uint32_t unpadded_size); }; } // namespace tt::tt_metal diff --git a/tt_metal/impl/trace/trace_buffer.hpp b/tt_metal/impl/trace/trace_buffer.hpp new file mode 100644 index 000000000000..c3fb503e65d4 --- /dev/null +++ b/tt_metal/impl/trace/trace_buffer.hpp @@ -0,0 +1,32 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include +#include + +#include "tt_metal/impl/buffers/buffer.hpp" + +namespace tt::tt_metal { + +namespace detail { +struct TraceDescriptor { + uint32_t num_completion_worker_cores = 0; + std::vector data; +}; +} // namespace detail + +struct TraceBuffer { + std::shared_ptr desc; + std::shared_ptr buffer; + + TraceBuffer(std::shared_ptr desc, std::shared_ptr buffer) : desc(desc), buffer(buffer) { + } +}; + +} // namespace tt::tt_metal diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index 18752e7da23e..ce31d1557f0e 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -18,6 +18,7 @@ #include "tools/profiler/profiler.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/detail/program.hpp" +#include "tt_metal/impl/trace/trace.hpp" #include "tt_metal/third_party/tracy/public/tracy/Tracy.hpp" @@ -696,44 +697,21 @@ void CloseDevices(std::map devices) { EnqueueGetBufferAddr(buffer->device()->command_queue(), address_on_host, buffer, false); } - void BeginTraceCapture(Device *device) { - device->begin_trace(); - } - - void EndTraceCapture(Device *device) { - device->end_trace(); - } - - void ExecuteLastTrace(Device *device, bool blocking) { - device->execute_last_trace(blocking); - } - - void ReleaseLastTrace(Device *device) { - device->release_last_trace(); - } - - void BeginTraceCaptures(std::map devices) { - for (const auto &[device_id, dev] : devices) { - dev->begin_trace(); - } - } - void EndTraceCaptures(std::map devices) { - for (const auto &[device_id, dev] : devices) { - dev->end_trace(); - } - } - void ExecuteLastTraces(std::map devices, bool blocking) { - for (const auto &[device_id, dev] : devices) { - dev->execute_last_trace(blocking); - } - } - Device *GetDeviceHandle(chip_id_t device_id) { ZoneScoped; TT_ASSERT(device_id < device_pool::devices.size()); TT_ASSERT(device_pool::devices[device_id] != nullptr); return device_pool::devices[device_id]; } + + void DisableAllocs(Device *device) { + tt::tt_metal::allocator::disable_allocs(*(device->allocator_)); + } + + void EnableAllocs(Device *device) { + tt::tt_metal::allocator::enable_allocs(*(device->allocator_)); + } + } // namespace detail size_t GetNumAvailableDevices() { @@ -973,6 +951,24 @@ RuntimeArgsData & GetCommonRuntimeArgs(const Program &program, KernelHandle kern return detail::GetKernel(program, kernel_id)->common_runtime_args_data().at(0); } +uint32_t BeginTraceCapture(Device *device, const uint8_t cq_id, const uint32_t trace_buff_size) { + const uint32_t tid = Trace::next_id(); + device->begin_trace(cq_id, tid, trace_buff_size); + return tid; +} + +void EndTraceCapture(Device *device, const uint8_t cq_id, const uint32_t tid) { + device->end_trace(cq_id, tid); +} + +void ReplayTrace(Device *device, const uint8_t cq_id, const uint32_t tid, const bool blocking) { + device->replay_trace(cq_id, tid, blocking); +} + +void ReleaseTrace(Device *device, const uint32_t tid) { + device->release_trace(tid); +} + } // namespace tt_metal } // namespace tt From c55435efa5e55c7a094523f43d275c1e3e34d499 Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Fri, 10 May 2024 16:54:59 +0000 Subject: [PATCH 28/40] #0: Improve cq_dispatch using one_packet apis --- tt_metal/hw/inc/dataflow_api.h | 105 ++++++++++++------ .../impl/dispatch/kernels/cq_dispatch.cpp | 36 +++--- 2 files changed, 92 insertions(+), 49 deletions(-) diff --git a/tt_metal/hw/inc/dataflow_api.h b/tt_metal/hw/inc/dataflow_api.h index 
ec3fa3a3184e..c2c3e78deea9 100644 --- a/tt_metal/hw/inc/dataflow_api.h +++ b/tt_metal/hw/inc/dataflow_api.h @@ -687,7 +687,40 @@ void noc_async_write_one_packet(std::uint32_t src_local_l1_addr, std::uint64_t d NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_WR_CMD_BUF, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); noc_nonposted_writes_num_issued[noc_index] += 1; noc_nonposted_writes_acked[noc_index] += 1; // num_dests - } +} + +// TODO: write docs +// this issues only a single packet with size <= NOC_MAX_BURST_SIZE (ie maximum packet size) +FORCE_INLINE +void noc_async_write_multicast_one_packet( + std::uint32_t src_local_l1_addr, + std::uint64_t dst_noc_addr_multicast, + std::uint32_t size, + std::uint32_t num_dests, + bool linked = false, + bool multicast_path_reserve = true) { + DEBUG_STATUS("NMPW"); + DEBUG_SANITIZE_NOC_MULTI_WRITE_TRANSACTION(dst_noc_addr_multicast, src_local_l1_addr, size); + while (!noc_cmd_buf_ready(noc_index, NCRISC_WR_CMD_BUF)); + DEBUG_STATUS("NWPD"); + + uint32_t noc_cmd_field = + NOC_CMD_CPY | NOC_CMD_WR | + NOC_CMD_VC_STATIC | + NOC_CMD_STATIC_VC(NOC_MULTICAST_WRITE_VC) | + (linked ? NOC_CMD_VC_LINKED : 0x0) | + ((multicast_path_reserve ? NOC_CMD_PATH_RESERVE : 0) | NOC_CMD_BRCST_PACKET) | + NOC_CMD_RESP_MARKED; + + NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_WR_CMD_BUF, NOC_CTRL, noc_cmd_field); + NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_WR_CMD_BUF, NOC_TARG_ADDR_LO, src_local_l1_addr); + NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_WR_CMD_BUF, NOC_RET_ADDR_LO, (uint32_t)dst_noc_addr_multicast); + NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_WR_CMD_BUF, NOC_RET_ADDR_MID, dst_noc_addr_multicast >> 32); + NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_WR_CMD_BUF, NOC_AT_LEN_BE, size); + NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_WR_CMD_BUF, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); + noc_nonposted_writes_num_issued[noc_index] += 1; + noc_nonposted_writes_acked[noc_index] += num_dests; +} // TODO: write docs // this sets the state for issuing a single packet with size <= NOC_MAX_BURST_SIZE (ie maximum packet size) @@ -707,7 +740,7 @@ void noc_async_write_one_packet_set_state(std::uint64_t dst_noc_addr, std::uint3 NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_WR_CMD_BUF, NOC_CTRL, noc_cmd_field); NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_WR_CMD_BUF, NOC_RET_ADDR_MID, dst_noc_addr >> 32); NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_WR_CMD_BUF, NOC_AT_LEN_BE, size); - } +} // TODO: write docs // this issues only a single packet with cmd buf state with size <= NOC_MAX_BURST_SIZE (ie maximum packet size) @@ -730,7 +763,7 @@ void noc_async_write_one_packet_with_state(std::uint32_t src_local_l1_addr, std: noc_nonposted_writes_num_issued[noc_index] += 1; noc_nonposted_writes_acked[noc_index] += 1; // num_dests } - } +} template struct InterleavedAddrGen { @@ -1187,22 +1220,27 @@ FORCE_INLINE void noc_async_read_tile( * | dst_noc_addr | Encoding of the destination DRAM location (x,y)+address | uint64_t | DOX-TODO(insert a reference to what constitutes valid coords) | True | * | size | Size of data transfer in bytes | uint32_t | 0..1MB | True | */ +template inline void noc_async_write(std::uint32_t src_local_l1_addr, std::uint64_t dst_noc_addr, std::uint32_t size) { - DEBUG_STATUS("NAWW"); - DEBUG_SANITIZE_NOC_WRITE_TRANSACTION(dst_noc_addr, src_local_l1_addr,size); - ncrisc_noc_fast_write_any_len( - noc_index, - NCRISC_WR_CMD_BUF, - src_local_l1_addr, - dst_noc_addr, - size, - NOC_UNICAST_WRITE_VC, - false, - false, - 1, - true); - DEBUG_STATUS("NAWD"); + if constexpr (max_page_size <= NOC_MAX_BURST_SIZE) { + 
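// Fast path: the compile-time bound max_page_size <= NOC_MAX_BURST_SIZE
// guarantees this transfer fits in a single NOC packet, so the general
// ncrisc_noc_fast_write_any_len path (which also handles transfers larger
// than one packet) can be skipped.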
noc_async_write_one_packet(src_local_l1_addr, dst_noc_addr, size); + } else { + DEBUG_STATUS("NAWW"); + DEBUG_SANITIZE_NOC_WRITE_TRANSACTION(dst_noc_addr, src_local_l1_addr,size); + ncrisc_noc_fast_write_any_len( + noc_index, + NCRISC_WR_CMD_BUF, + src_local_l1_addr, + dst_noc_addr, + size, + NOC_UNICAST_WRITE_VC, + false, + false, + 1, + true); + DEBUG_STATUS("NAWD"); + } } template @@ -1270,6 +1308,7 @@ void noc_semaphore_set_remote(std::uint32_t src_local_l1_addr, std::uint64_t dst * | size | Size of data transfer in bytes | uint32_t | 0..1MB | True | * | num_dests | Number of destinations that the multicast source is targetting | uint32_t | 0..119 | True | */ +template inline void noc_async_write_multicast( std::uint32_t src_local_l1_addr, @@ -1278,20 +1317,24 @@ void noc_async_write_multicast( std::uint32_t num_dests, bool linked = false, bool multicast_path_reserve = true) { - DEBUG_STATUS("NMWW"); - DEBUG_SANITIZE_NOC_MULTI_WRITE_TRANSACTION(dst_noc_addr_multicast, src_local_l1_addr,size); - ncrisc_noc_fast_write_any_len( - noc_index, - NCRISC_WR_CMD_BUF, - src_local_l1_addr, - dst_noc_addr_multicast, - size, - NOC_MULTICAST_WRITE_VC, - true, - linked, - num_dests, - multicast_path_reserve); - DEBUG_STATUS("NMWD"); + if constexpr (max_page_size <= NOC_MAX_BURST_SIZE) { + noc_async_write_multicast_one_packet(src_local_l1_addr, dst_noc_addr_multicast, size, num_dests, linked, multicast_path_reserve); + } else { + DEBUG_STATUS("NMWW"); + DEBUG_SANITIZE_NOC_MULTI_WRITE_TRANSACTION(dst_noc_addr_multicast, src_local_l1_addr,size); + ncrisc_noc_fast_write_any_len( + noc_index, + NCRISC_WR_CMD_BUF, + src_local_l1_addr, + dst_noc_addr_multicast, + size, + NOC_MULTICAST_WRITE_VC, + true, + linked, + num_dests, + multicast_path_reserve); + DEBUG_STATUS("NMWD"); + } } /** diff --git a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp index 9462dc87b211..e2690f264be6 100644 --- a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp @@ -114,7 +114,7 @@ void notify_host_of_completion_queue_write_pointer() { uint32_t completion_wr_ptr_and_toggle = cq_write_interface.completion_fifo_wr_ptr | (cq_write_interface.completion_fifo_wr_toggle << 31); volatile tt_l1_ptr uint32_t* completion_wr_ptr_addr = get_cq_completion_write_ptr(); completion_wr_ptr_addr[0] = completion_wr_ptr_and_toggle; - noc_async_write(CQ_COMPLETION_WRITE_PTR, pcie_address, 4); + noc_async_write_one_packet(CQ_COMPLETION_WRITE_PTR, pcie_address, 4); block_noc_writes_to_clear[rd_block_idx]++; } @@ -255,7 +255,7 @@ void relay_to_next_cb(uint32_t data_ptr, ASSERT(dispatch_cb_end - data_ptr == preamble_size); if (orphan_size != 0) { cb_acquire_pages(1); // XXXX optimize, take all availabl - noc_async_write(data_ptr, dst, orphan_size); + noc_async_write(data_ptr, dst, orphan_size); block_noc_writes_to_clear[rd_block_idx]++; page_acquired = true; length -= orphan_size; @@ -295,7 +295,7 @@ void relay_to_next_cb(uint32_t data_ptr, if (page_acquired == false) { cb_acquire_pages(1); // XXXX optimize, take all available } - noc_async_write(data_ptr, dst, xfer_size); + noc_async_write(data_ptr, dst, xfer_size); block_noc_writes_to_clear[rd_block_idx]++; // XXXXX maybe just write the noc internal api counter cb_release_pages(1); // XXXX optimize, take all available @@ -364,9 +364,9 @@ void process_write_linear(uint32_t num_mcast_dests) { uint32_t orphan_size = dispatch_cb_end - data_ptr; if (orphan_size != 0) { if constexpr (multicast){ - 
noc_async_write_multicast(data_ptr, dst, orphan_size, num_mcast_dests); + noc_async_write_multicast(data_ptr, dst, orphan_size, num_mcast_dests); } else { - noc_async_write(data_ptr, dst, orphan_size); + noc_async_write(data_ptr, dst, orphan_size); } block_noc_writes_to_clear[rd_block_idx]++; length -= orphan_size; @@ -400,9 +400,9 @@ void process_write_linear(uint32_t num_mcast_dests) { } if constexpr (multicast){ - noc_async_write_multicast(data_ptr, dst, xfer_size, num_mcast_dests); + noc_async_write_multicast(data_ptr, dst, xfer_size, num_mcast_dests); } else { - noc_async_write(data_ptr, dst, xfer_size); + noc_async_write(data_ptr, dst, xfer_size); } block_noc_writes_to_clear[rd_block_idx]++; // XXXXX maybe just write the noc internal api counter @@ -456,7 +456,7 @@ void process_write_paged() { if (rd_block_idx == dispatch_cb_blocks - 1) { uint32_t orphan_size = dispatch_cb_end - data_ptr; if (orphan_size != 0) { - noc_async_write(data_ptr, dst, orphan_size); + noc_async_write(data_ptr, dst, orphan_size); block_noc_writes_to_clear[rd_block_idx]++; write_length -= orphan_size; xfer_size -= orphan_size; @@ -487,7 +487,7 @@ void process_write_paged() { wr_block_idx); } - noc_async_write(data_ptr, dst, xfer_size); + noc_async_write(data_ptr, dst, xfer_size); block_noc_writes_to_clear[rd_block_idx]++; // XXXXX maybe just write the noc internal api counter // If paged write is not completed for a page (dispatch_cb_page_size < page_size) then add offset, otherwise incr page_id. @@ -556,10 +556,10 @@ void process_write_packed() { if (rd_block_idx == dispatch_cb_blocks - 1) { orphan_size = dispatch_cb_end - data_ptr; if (orphan_size != 0) { - if (mcast) { - noc_async_write_multicast(data_ptr, dst, remainder_xfer_size, num_dests); + if constexpr (mcast) { + noc_async_write_multicast(data_ptr, dst, remainder_xfer_size, num_dests); } else { - noc_async_write(data_ptr, dst, orphan_size); + noc_async_write(data_ptr, dst, orphan_size); } block_noc_writes_to_clear[rd_block_idx]++; remainder_xfer_size = xfer_size - orphan_size; @@ -584,10 +584,10 @@ void process_write_packed() { // This is done here so the common case doesn't have to restore the pointers if (remainder_xfer_size != 0) { uint64_t dst = get_noc_addr_helper(dst_noc, remainder_dst_addr); - if (mcast) { - noc_async_write_multicast(data_ptr, dst, remainder_xfer_size, num_dests); + if constexpr (mcast) { + noc_async_write_multicast(data_ptr, dst, remainder_xfer_size, num_dests); } else { - noc_async_write(data_ptr, dst, remainder_xfer_size); + noc_async_write(data_ptr, dst, remainder_xfer_size); } block_noc_writes_to_clear[rd_block_idx]++; @@ -598,10 +598,10 @@ void process_write_packed() { } } - if (mcast) { - noc_async_write_multicast(data_ptr, dst, xfer_size, num_dests); + if constexpr (mcast) { + noc_async_write_multicast(data_ptr, dst, xfer_size, num_dests); } else { - noc_async_write(data_ptr, dst, xfer_size); + noc_async_write(data_ptr, dst, xfer_size); } block_noc_writes_to_clear[rd_block_idx]++; // XXXXX maybe just write the noc internal api counter From 117499a202200e9a3a6ac3667868f9ebf60f8220 Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Thu, 9 May 2024 15:33:05 +0000 Subject: [PATCH 29/40] #0: Unify create_device_tensor and create_sharded_device_tensor --- .../tensors/test_raw_host_memory_pointer.cpp | 1 - tt_eager/tensor/tensor.cpp | 72 ++++++++----------- tt_eager/tensor/tensor.hpp | 2 - .../op_library/all_gather/all_gather_op.cpp | 2 +- tt_eager/tt_dnn/op_library/bcast/bcast_op.cpp | 2 +- 
tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp | 6 +- .../tt_dnn/op_library/concat/concat_op.cpp | 2 +- .../op_library/conv/optimized_conv_op.cpp | 4 +- .../op_library/downsample/downsample_op.cpp | 2 +- .../eltwise_binary/eltwise_binary_op.cpp | 2 +- .../eltwise_unary/eltwise_unary_op.cpp | 2 +- tt_eager/tt_dnn/op_library/fold/fold_op.cpp | 2 +- .../op_library/groupnorm/groupnorm_op.cpp | 2 +- .../op_library/layernorm/layernorm_op.cpp | 2 +- tt_eager/tt_dnn/op_library/move/move_op.hpp | 2 +- .../tt_dnn/op_library/nlp_tms/nlp_tms.cpp | 28 ++++---- tt_eager/tt_dnn/op_library/pool/max_pool.cpp | 2 +- .../tt_dnn/op_library/reduce/reduce_op.cpp | 2 +- .../rotary_embedding/rotary_embedding_op.cpp | 2 +- .../tt_dnn/op_library/sharded/sharded_op.cpp | 4 +- .../sharded_partial/sharded_op_partial.cpp | 2 +- .../tt_dnn/op_library/tilize/tilize_op.cpp | 4 +- .../transformer_tms/transformer_tms.cpp | 8 +-- .../op_library/transpose/transpose_op.cpp | 2 +- .../op_library/untilize/untilize_op.cpp | 6 +- .../untilize/untilize_with_halo_op.cpp | 2 +- .../untilize/untilize_with_halo_op_v2.cpp | 2 +- .../op_library/upsample/upsample_op.cpp | 4 +- ttnn/cpp/ttnn/op_library/binary/binary_op.cpp | 4 +- ttnn/cpp/ttnn/operations/transformer.hpp | 2 +- 30 files changed, 82 insertions(+), 97 deletions(-) diff --git a/tests/tt_eager/tensors/test_raw_host_memory_pointer.cpp b/tests/tt_eager/tensors/test_raw_host_memory_pointer.cpp index 787c404e1aae..bcb145e24ebc 100644 --- a/tests/tt_eager/tensors/test_raw_host_memory_pointer.cpp +++ b/tests/tt_eager/tensors/test_raw_host_memory_pointer.cpp @@ -175,7 +175,6 @@ void test_raw_host_memory_pointer() { Tensor e_dev = tt::tt_metal::add(c_dev, d_dev); tt::tt_metal::memcpy(tensor_for_printing, e_dev); - tensor_for_printing.print(); for (auto& element : owned_buffer::get_as(tensor_for_printing)) { TT_ASSERT(element == bfloat16(10.0f)); diff --git a/tt_eager/tensor/tensor.cpp b/tt_eager/tensor/tensor.cpp index 2515e21d5095..555aa91fc21b 100644 --- a/tt_eager/tensor/tensor.cpp +++ b/tt_eager/tensor/tensor.cpp @@ -795,38 +795,36 @@ uint32_t Tensor::volume() const { return tt::tt_metal::compute_volume(this->get_ Tensor create_device_tensor(const Shape& shape, DataType data_type, Layout layout, Device *device, const MemoryConfig& memory_config) { ZoneScoped; - uint32_t packed_size_in_bytes = tensor_impl::packed_buffer_size_bytes_wrapper(data_type, compute_buffer_size(shape, data_type)); - auto device_buffer = tensor_impl::allocate_buffer_on_device(packed_size_in_bytes, device, shape, data_type, layout, memory_config); - return Tensor(DeviceStorage{device_buffer}, shape, data_type, layout); -} + if (memory_config.is_sharded()) { + TT_ASSERT(memory_config.shard_spec.has_value()); + TT_ASSERT(memory_config.is_l1()); -Tensor create_sharded_device_tensor(const Shape& shape, DataType data_type, Layout layout, Device *device, const MemoryConfig& memory_config) { - ZoneScoped; - TT_ASSERT(memory_config.is_sharded()); - TT_ASSERT(memory_config.shard_spec.has_value()); - TT_ASSERT(memory_config.is_l1()); + auto shard_spec = memory_config.shard_spec.value(); + auto& shard_shape = shard_spec.shape; - auto shard_spec = memory_config.shard_spec.value(); - auto& shard_shape = shard_spec.shape; + auto width = shape[-1]; + auto other_dims = 1; + for (int i = 0; i < shape.rank() - 1; i++) { + other_dims *= shape[i]; + } - auto width = shape[-1]; - auto other_dims = 1; - for (int i = 0; i < shape.rank() - 1; i++) { - other_dims *= shape[i]; + auto element_size = 
tensor_impl::element_size_bytes_wrapper(data_type); + auto page_shape = tensor_impl::get_sharded_page_shape(layout, data_type, shard_spec.shape); + std::array tensor2d_size = {other_dims/page_shape[0], width/page_shape[1]}; + ShardSpecBuffer shard_spec_buffer(shard_spec, page_shape, tensor2d_size); + uint32_t packed_size_in_bytes; + + packed_size_in_bytes = tensor_impl::packed_buffer_size_bytes_wrapper(data_type, compute_buffer_size(shape, data_type)); + auto device_buffer = tensor_impl::allocate_buffer_on_device(packed_size_in_bytes, device, shape, + data_type, layout, memory_config, + std::make_optional(shard_spec_buffer) + ); + return Tensor(DeviceStorage{device_buffer}, shape, data_type, layout); + } else { + uint32_t packed_size_in_bytes = tensor_impl::packed_buffer_size_bytes_wrapper(data_type, compute_buffer_size(shape, data_type)); + auto device_buffer = tensor_impl::allocate_buffer_on_device(packed_size_in_bytes, device, shape, data_type, layout, memory_config); + return Tensor(DeviceStorage{device_buffer}, shape, data_type, layout); } - - auto element_size = tensor_impl::element_size_bytes_wrapper(data_type); - auto page_shape = tensor_impl::get_sharded_page_shape(layout, data_type, shard_spec.shape); - std::array tensor2d_size = {other_dims/page_shape[0], width/page_shape[1]}; - ShardSpecBuffer shard_spec_buffer(shard_spec, page_shape, tensor2d_size); - uint32_t packed_size_in_bytes; - - packed_size_in_bytes = tensor_impl::packed_buffer_size_bytes_wrapper(data_type, compute_buffer_size(shape, data_type)); - auto device_buffer = tensor_impl::allocate_buffer_on_device(packed_size_in_bytes, device, shape, - data_type, layout, memory_config, - std::make_optional(shard_spec_buffer) - ); - return Tensor(DeviceStorage{device_buffer}, shape, data_type, layout); } void* get_raw_host_data_ptr(const Tensor& tensor) { @@ -909,13 +907,8 @@ Tensor allocate_tensor_on_device(const Shape& shape, DataType data_type, Layout uint32_t device_tensor_ref_count = device_tensor.tensor_attributes->record_main_thread_ref_count(); device->push_work( [shape, data_type, layout, device, memory_config, device_tensor] () mutable { - if (memory_config.is_sharded()) { - auto local_tensor = create_sharded_device_tensor(shape, data_type, layout, device, memory_config); - device_tensor.populate_buffers_and_metadata(local_tensor); - } else { - auto local_tensor = create_device_tensor(shape, data_type, layout, device, memory_config); - device_tensor.populate_buffers_and_metadata(local_tensor); - } + auto local_tensor = create_device_tensor(shape, data_type, layout, device, memory_config); + device_tensor.populate_buffers_and_metadata(local_tensor); } ); device_tensor.tensor_attributes->update_main_thread_ref_count(device, device_tensor_ref_count); @@ -933,13 +926,8 @@ Tensor allocate_tensor_on_device(const Shape& shape, DataType data_type, Layout auto& worker = workers[worker_index]; worker->push_work( [shape, data_type, layout, worker, memory_config, device_tensor, worker_index] () mutable { - if (memory_config.is_sharded()) { - auto local_tensor = create_sharded_device_tensor(shape, data_type, layout, worker, memory_config); - insert_buffer_and_shape_for_device(worker, local_tensor, device_tensor, worker_index); - } else { - auto local_tensor = create_device_tensor(shape, data_type, layout, worker, memory_config); - insert_buffer_and_shape_for_device(worker, local_tensor, device_tensor, worker_index); - } + auto local_tensor = create_device_tensor(shape, data_type, layout, worker, memory_config); + 
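// create_device_tensor now branches on memory_config.is_sharded() internally,
// so the worker no longer needs a separate create_sharded_device_tensor call
// for sharded memory configs.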
insert_buffer_and_shape_for_device(worker, local_tensor, device_tensor, worker_index); if (not worker->id()) { device_tensor.set_shape(ttnn::Shape(shape)); device_tensor.set_dtype(data_type); diff --git a/tt_eager/tensor/tensor.hpp b/tt_eager/tensor/tensor.hpp index 8603b560c185..bef6f2e2b62d 100644 --- a/tt_eager/tensor/tensor.hpp +++ b/tt_eager/tensor/tensor.hpp @@ -339,8 +339,6 @@ struct Tensor { Tensor create_device_tensor(const Shape& shape, DataType dtype, Layout layout, Device *device, const MemoryConfig& memory_config = {.memory_layout=tt::tt_metal::TensorMemoryLayout::INTERLEAVED}); -Tensor create_sharded_device_tensor(const Shape& shape, DataType data_type, Layout layout, Device *device, const MemoryConfig& memory_config); - // template // void *get_host_buffer(const Tensor &tensor); void *get_raw_host_data_ptr(const Tensor &tensor); diff --git a/tt_eager/tt_dnn/op_library/all_gather/all_gather_op.cpp b/tt_eager/tt_dnn/op_library/all_gather/all_gather_op.cpp index f3dd5879205a..babc461ddd60 100644 --- a/tt_eager/tt_dnn/op_library/all_gather/all_gather_op.cpp +++ b/tt_eager/tt_dnn/op_library/all_gather/all_gather_op.cpp @@ -73,7 +73,7 @@ std::vector AllGather::compute_output_shapes(const std::vector &i std::vector AllGather::create_output_tensors(const std::vector &input_tensors) const { const auto& input_tensor = input_tensors[0]; if(this->output_mem_config.is_sharded()) { - return {create_sharded_device_tensor( + return {create_device_tensor( this->compute_output_shapes(input_tensors).at(0), input_tensor.get_dtype(), input_tensor.get_layout(), diff --git a/tt_eager/tt_dnn/op_library/bcast/bcast_op.cpp b/tt_eager/tt_dnn/op_library/bcast/bcast_op.cpp index 702ce34e1113..3ee9e134ef29 100644 --- a/tt_eager/tt_dnn/op_library/bcast/bcast_op.cpp +++ b/tt_eager/tt_dnn/op_library/bcast/bcast_op.cpp @@ -152,7 +152,7 @@ std::vector EltwiseBinaryBroadcast::create_output_tensors(const std::vec } auto mem_config = this->output_mem_config; mem_config.shard_spec = shard_spec; - return {create_sharded_device_tensor(this->compute_output_shapes(input_tensors).at(0), input_tensor.get_dtype(), Layout::TILE, input_tensor.device(), mem_config)}; + return {create_device_tensor(this->compute_output_shapes(input_tensors).at(0), input_tensor.get_dtype(), Layout::TILE, input_tensor.device(), mem_config)}; } else { return operation::generic_create_output_tensors(*this, input_tensors, input_tensor.get_dtype(), Layout::TILE, this->output_mem_config); } diff --git a/tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp b/tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp index 72013283fcff..92d20d955f29 100644 --- a/tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp +++ b/tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp @@ -932,7 +932,7 @@ std::vector Matmul::create_output_tensors(const std::vector& inp ShardSpec shard_spec = ShardSpec{all_cores, {per_core_M * TILE_HEIGHT, per_core_N * TILE_WIDTH}, ShardOrientation::ROW_MAJOR}; auto mem_config = this->output_mem_config; mem_config.shard_spec = shard_spec; - return {create_sharded_device_tensor(this->compute_output_shapes(input_tensors).at(0), this->output_dtype, output_layout, input_tensor_a.device(), mem_config)}; + return {create_device_tensor(this->compute_output_shapes(input_tensors).at(0), this->output_dtype, output_layout, input_tensor_a.device(), mem_config)}; } else if constexpr ( std::is_same_v ) { @@ -957,7 +957,7 @@ std::vector Matmul::create_output_tensors(const std::vector& inp ShardSpec shard_spec = ShardSpec{all_cores, {per_core_M * TILE_HEIGHT, per_core_N * TILE_WIDTH}, 
shard_orientation}; auto mem_config = this->output_mem_config; mem_config.shard_spec = shard_spec; - return {create_sharded_device_tensor(this->compute_output_shapes(input_tensors).at(0), this->output_dtype, output_layout, input_tensor_a.device(), mem_config)}; + return {create_device_tensor(this->compute_output_shapes(input_tensors).at(0), this->output_dtype, output_layout, input_tensor_a.device(), mem_config)}; } else if constexpr ( std::is_same_v ) { @@ -981,7 +981,7 @@ std::vector Matmul::create_output_tensors(const std::vector& inp ShardSpec shard_spec = ShardSpec{all_cores, {per_core_M * TILE_HEIGHT, per_core_N * TILE_WIDTH}, shard_orientation}; auto mem_config = this->output_mem_config; mem_config.shard_spec = shard_spec; - return {create_sharded_device_tensor(this->compute_output_shapes(input_tensors).at(0), this->output_dtype, output_layout, input_tensor_a.device(), mem_config)}; + return {create_device_tensor(this->compute_output_shapes(input_tensors).at(0), this->output_dtype, output_layout, input_tensor_a.device(), mem_config)}; } else { TT_FATAL(false, "Unsupported op for output sharding"); return {}; diff --git a/tt_eager/tt_dnn/op_library/concat/concat_op.cpp b/tt_eager/tt_dnn/op_library/concat/concat_op.cpp index 6c2c7107317b..8f82d78c3f5e 100644 --- a/tt_eager/tt_dnn/op_library/concat/concat_op.cpp +++ b/tt_eager/tt_dnn/op_library/concat/concat_op.cpp @@ -81,7 +81,7 @@ std::vector Concat::create_output_tensors(const std::vector &inp const Tensor &ref_in_tensor = input_tensors.at(0); if (this->output_mem_config.is_sharded()) { - return {create_sharded_device_tensor( + return {create_device_tensor( this->compute_output_shapes(input_tensors).at(0), ref_in_tensor.get_dtype(), ref_in_tensor.get_layout(), diff --git a/tt_eager/tt_dnn/op_library/conv/optimized_conv_op.cpp b/tt_eager/tt_dnn/op_library/conv/optimized_conv_op.cpp index 45ac4461db58..4c1de9d03be1 100644 --- a/tt_eager/tt_dnn/op_library/conv/optimized_conv_op.cpp +++ b/tt_eager/tt_dnn/op_library/conv/optimized_conv_op.cpp @@ -165,7 +165,7 @@ std::vector OptimizedConv::create_output_tensors(const std::vectoroutput_mem_config; mem_config.shard_spec = shard_spec; - return {create_sharded_device_tensor(output_shape, this->output_dtype, output_layout, input_tensor.device(), mem_config)}; + return {create_device_tensor(output_shape, this->output_dtype, output_layout, input_tensor.device(), mem_config)}; } else { auto [act_matrix_shape, act_matrix_shape_unpadded] = optimized_conv_op_utils::compute_opt_conv_activation_as_mm_shape(this->input_tensor_shape, conv_params, this->parallelization_config.per_core_out_matrix_height_ntiles, extra_padding_for_32B_alignment); uint32_t act_matrix_height = (uint32_t) act_matrix_shape[1]; @@ -180,7 +180,7 @@ std::vector OptimizedConv::create_output_tensors(const std::vectoroutput_mem_config; mem_config.shard_spec = shard_spec; - return {create_sharded_device_tensor(output_shape, this->output_dtype, output_layout, input_tensor.device(), mem_config)}; + return {create_device_tensor(output_shape, this->output_dtype, output_layout, input_tensor.device(), mem_config)}; } } diff --git a/tt_eager/tt_dnn/op_library/downsample/downsample_op.cpp b/tt_eager/tt_dnn/op_library/downsample/downsample_op.cpp index 544d6593b1ae..8a8988782ed2 100644 --- a/tt_eager/tt_dnn/op_library/downsample/downsample_op.cpp +++ b/tt_eager/tt_dnn/op_library/downsample/downsample_op.cpp @@ -70,7 +70,7 @@ std::vector Downsample::create_output_tensors(const std::vector uint32_t output_shard_width = round_up(output_shape[3], 
num_cores_width_sliced * TILE_WIDTH) / num_cores_width_sliced; auto mem_config = input_tensor.memory_config(); mem_config.shard_spec = ShardSpec {input_tensor.shard_spec().value().grid, std::array{{output_shard_height, output_shard_width}}, input_tensor.shard_spec().value().orientation}; - return {create_sharded_device_tensor(output_shape, this->output_dtype, Layout::TILE, input_tensor.device(), mem_config)}; + return {create_device_tensor(output_shape, this->output_dtype, Layout::TILE, input_tensor.device(), mem_config)}; } operation::ProgramWithCallbacks Downsample::create_program(const std::vector& input_tensors, std::vector &output_tensors) const { diff --git a/tt_eager/tt_dnn/op_library/eltwise_binary/eltwise_binary_op.cpp b/tt_eager/tt_dnn/op_library/eltwise_binary/eltwise_binary_op.cpp index dfa13e352cdf..9d7b75ecb52e 100644 --- a/tt_eager/tt_dnn/op_library/eltwise_binary/eltwise_binary_op.cpp +++ b/tt_eager/tt_dnn/op_library/eltwise_binary/eltwise_binary_op.cpp @@ -199,7 +199,7 @@ std::vector EltwiseBinary::create_output_tensors( } auto mem_config = this->output_mem_config; mem_config.shard_spec = shard_spec; - return {create_sharded_device_tensor(this->compute_output_shapes(input_tensors).at(0), this->output_dtype, Layout::TILE, input_tensor_a.device(), mem_config)}; + return {create_device_tensor(this->compute_output_shapes(input_tensors).at(0), this->output_dtype, Layout::TILE, input_tensor_a.device(), mem_config)}; } return operation::generic_create_output_tensors(*this, input_tensors, this->output_dtype, Layout::TILE, this->output_mem_config); } diff --git a/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.cpp b/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.cpp index 5fc6c892ae77..c0e29951d561 100644 --- a/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.cpp +++ b/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.cpp @@ -299,7 +299,7 @@ std::vector EltwiseUnary::create_output_tensors(const std::vectoroutput_mem_config.is_sharded()) { Shape output_shape = compute_output_shapes(input_tensors).at(0); - return {create_sharded_device_tensor( + return {create_device_tensor( output_shape, input_tensor.get_dtype(), input_tensor.get_layout(), diff --git a/tt_eager/tt_dnn/op_library/fold/fold_op.cpp b/tt_eager/tt_dnn/op_library/fold/fold_op.cpp index 01efa6015e83..9f69e8cc4c17 100644 --- a/tt_eager/tt_dnn/op_library/fold/fold_op.cpp +++ b/tt_eager/tt_dnn/op_library/fold/fold_op.cpp @@ -60,7 +60,7 @@ std::vector Fold::create_output_tensors(const std::vector &input mem_config.shard_spec->shape[0] /= stride_h * stride_w; mem_config.shard_spec->shape[1] *= stride_h * stride_w; - return {create_sharded_device_tensor( + return {create_device_tensor( compute_output_shapes(input_tensors).at(0), output_dtype, input_tensor.get_layout(), diff --git a/tt_eager/tt_dnn/op_library/groupnorm/groupnorm_op.cpp b/tt_eager/tt_dnn/op_library/groupnorm/groupnorm_op.cpp index 24067e7fab7e..e96450bb6e3a 100644 --- a/tt_eager/tt_dnn/op_library/groupnorm/groupnorm_op.cpp +++ b/tt_eager/tt_dnn/op_library/groupnorm/groupnorm_op.cpp @@ -1841,7 +1841,7 @@ std::vector GroupNorm::create_output_tensors(const std::vector & } else { auto mem_config = this->output_mem_config; mem_config.shard_spec = input_tensor.shard_spec(); - return {create_sharded_device_tensor(this->compute_output_shapes(input_tensors).at(0), program_config.out_data_format, Layout::ROW_MAJOR, input_tensor.device(), mem_config)}; + return {create_device_tensor(this->compute_output_shapes(input_tensors).at(0), 
program_config.out_data_format, Layout::ROW_MAJOR, input_tensor.device(), mem_config)}; } } operation::ProgramWithCallbacks GroupNorm::create_program( diff --git a/tt_eager/tt_dnn/op_library/layernorm/layernorm_op.cpp b/tt_eager/tt_dnn/op_library/layernorm/layernorm_op.cpp index 59ab71b20c22..f94963dc24f9 100644 --- a/tt_eager/tt_dnn/op_library/layernorm/layernorm_op.cpp +++ b/tt_eager/tt_dnn/op_library/layernorm/layernorm_op.cpp @@ -154,7 +154,7 @@ std::vector LayerNorm::create_output_tensors(const std::vector & } else { auto mem_config = this->output_mem_config; mem_config.shard_spec = input_tensor.shard_spec().value(); - return {create_sharded_device_tensor(this->compute_output_shapes(input_tensors).at(0), input_tensors.at(0).get_dtype(), Layout::TILE, input_tensor.device(), mem_config)}; + return {create_device_tensor(this->compute_output_shapes(input_tensors).at(0), input_tensors.at(0).get_dtype(), Layout::TILE, input_tensor.device(), mem_config)}; } } else { return operation::generic_create_output_tensors(*this, input_tensors, input_tensor.get_dtype(), Layout::TILE, this->output_mem_config); diff --git a/tt_eager/tt_dnn/op_library/move/move_op.hpp b/tt_eager/tt_dnn/op_library/move/move_op.hpp index 7c02c1d0718d..2b8ab0709a1b 100644 --- a/tt_eager/tt_dnn/op_library/move/move_op.hpp +++ b/tt_eager/tt_dnn/op_library/move/move_op.hpp @@ -136,7 +136,7 @@ inline Tensor move_sharded(const Tensor& input_tensor, const std::optionaladdress() == output_tensor.buffer()->address()) { tt::log_debug(tt::LogOp, "WARNING: No space to move the tensor. Move op's input address and output address are equal: {}", input_address); return output_tensor; diff --git a/tt_eager/tt_dnn/op_library/nlp_tms/nlp_tms.cpp b/tt_eager/tt_dnn/op_library/nlp_tms/nlp_tms.cpp index a75ecd456ac9..cfa35d0ccf31 100644 --- a/tt_eager/tt_dnn/op_library/nlp_tms/nlp_tms.cpp +++ b/tt_eager/tt_dnn/op_library/nlp_tms/nlp_tms.cpp @@ -132,9 +132,9 @@ std::vector NlpCreateHeadsDecode::create_output_tensors(const std::vecto kv_mem_config.shard_spec = kv_shard_spec; auto output_shapes = this->compute_output_shapes(input_tensors); return { - create_sharded_device_tensor(output_shapes[0], input_tensor.get_dtype(), input_tensor.get_layout(), input_tensor.device(), q_mem_config), - create_sharded_device_tensor(output_shapes[1], input_tensor.get_dtype(), input_tensor.get_layout(), input_tensor.device(), kv_mem_config), - create_sharded_device_tensor(output_shapes[2], input_tensor.get_dtype(), input_tensor.get_layout(), input_tensor.device(), kv_mem_config) + create_device_tensor(output_shapes[0], input_tensor.get_dtype(), input_tensor.get_layout(), input_tensor.device(), q_mem_config), + create_device_tensor(output_shapes[1], input_tensor.get_dtype(), input_tensor.get_layout(), input_tensor.device(), kv_mem_config), + create_device_tensor(output_shapes[2], input_tensor.get_dtype(), input_tensor.get_layout(), input_tensor.device(), kv_mem_config) }; } @@ -253,9 +253,9 @@ std::vector NlpCreateHeads::create_output_tensors(const std::vectorcompute_output_shapes(input_tensors); return { - create_sharded_device_tensor(output_shapes[0], input_tensor.get_dtype(), input_tensor.get_layout(), input_tensor.device(), q_mem_config), - create_sharded_device_tensor(output_shapes[1], input_tensor.get_dtype(), input_tensor.get_layout(), input_tensor.device(), kv_mem_config), - create_sharded_device_tensor(output_shapes[2], input_tensor.get_dtype(), input_tensor.get_layout(), input_tensor.device(), kv_mem_config) + create_device_tensor(output_shapes[0], 
input_tensor.get_dtype(), input_tensor.get_layout(), input_tensor.device(), q_mem_config), + create_device_tensor(output_shapes[1], input_tensor.get_dtype(), input_tensor.get_layout(), input_tensor.device(), kv_mem_config), + create_device_tensor(output_shapes[2], input_tensor.get_dtype(), input_tensor.get_layout(), input_tensor.device(), kv_mem_config) }; } else { @@ -333,7 +333,7 @@ std::vector NlpConcatHeads::create_output_tensors(const std::vectoroutput_mem_config; mem_config.shard_spec = shard_spec; - return {create_sharded_device_tensor(this->compute_output_shapes(input_tensors).at(0), input_tensor.get_dtype(), Layout::TILE, input_tensor.device(), mem_config)}; + return {create_device_tensor(this->compute_output_shapes(input_tensors).at(0), input_tensor.get_dtype(), Layout::TILE, input_tensor.device(), mem_config)}; } else { return operation::generic_create_output_tensors(*this, input_tensors, input_tensor.get_dtype(), Layout::TILE, this->output_mem_config); } @@ -409,7 +409,7 @@ std::vector NlpConcatHeadsDecode::create_output_tensors(const std::vecto auto mem_config = tt::tt_metal::MemoryConfig{TensorMemoryLayout::WIDTH_SHARDED, BufferType::L1}; mem_config.shard_spec = shard_spec; - return {create_sharded_device_tensor(this->compute_output_shapes(input_tensors).at(0), input_tensor.get_dtype(), Layout::TILE, input_tensor.device(), mem_config)}; + return {create_device_tensor(this->compute_output_shapes(input_tensors).at(0), input_tensor.get_dtype(), Layout::TILE, input_tensor.device(), mem_config)}; } operation::ProgramWithCallbacks NlpConcatHeadsDecode::create_program(const std::vector& input_tensors, std::vector &output_tensors) const { @@ -592,9 +592,9 @@ std::vector CreateQKVHeads::create_output_tensors(const std::vectoroutput_mem_config; mem_config_v.shard_spec = v_spec; - auto out_tensor_q = create_sharded_device_tensor(q_shape, input_tensor.get_dtype(), Layout::TILE, input_tensor.device(), mem_config_q); - auto out_tensor_k = create_sharded_device_tensor(k_shape, input_tensor.get_dtype(), Layout::TILE, input_tensor.device(), mem_config_k); - auto out_tensor_v = create_sharded_device_tensor(v_shape, input_tensor.get_dtype(), Layout::TILE, input_tensor.device(), mem_config_v); + auto out_tensor_q = create_device_tensor(q_shape, input_tensor.get_dtype(), Layout::TILE, input_tensor.device(), mem_config_q); + auto out_tensor_k = create_device_tensor(k_shape, input_tensor.get_dtype(), Layout::TILE, input_tensor.device(), mem_config_k); + auto out_tensor_v = create_device_tensor(v_shape, input_tensor.get_dtype(), Layout::TILE, input_tensor.device(), mem_config_v); return {out_tensor_q, out_tensor_k, out_tensor_v}; } @@ -731,9 +731,9 @@ std::vector CreateQKVHeadsSeparateTensors::create_output_tensors(const s auto mem_config_v = this->output_mem_config; mem_config_v.shard_spec = v_spec; - auto out_tensor_q = create_sharded_device_tensor(q_shape, input_tensor.get_dtype(), Layout::TILE, input_tensor.device(), mem_config_q); - auto out_tensor_k = create_sharded_device_tensor(k_shape, input_tensor.get_dtype(), Layout::TILE, input_tensor.device(), mem_config_k); - auto out_tensor_v = create_sharded_device_tensor(v_shape, input_tensor.get_dtype(), Layout::TILE, input_tensor.device(), mem_config_v); + auto out_tensor_q = create_device_tensor(q_shape, input_tensor.get_dtype(), Layout::TILE, input_tensor.device(), mem_config_q); + auto out_tensor_k = create_device_tensor(k_shape, input_tensor.get_dtype(), Layout::TILE, input_tensor.device(), mem_config_k); + auto out_tensor_v = 
create_device_tensor(v_shape, input_tensor.get_dtype(), Layout::TILE, input_tensor.device(), mem_config_v); return {out_tensor_q, out_tensor_k, out_tensor_v}; } diff --git a/tt_eager/tt_dnn/op_library/pool/max_pool.cpp b/tt_eager/tt_dnn/op_library/pool/max_pool.cpp index 68d6ee025f12..cb8ed3b1a208 100644 --- a/tt_eager/tt_dnn/op_library/pool/max_pool.cpp +++ b/tt_eager/tt_dnn/op_library/pool/max_pool.cpp @@ -108,7 +108,7 @@ std::vector MaxPool::create_output_tensors(const std::vector &in std::array shard_shape = {out_nhw_per_core, input.get_legacy_shape()[-1]}; mem_config.shard_spec = ShardSpec{shard_grid, shard_shape, ShardOrientation::ROW_MAJOR, false}; } - return {create_sharded_device_tensor( + return {create_device_tensor( output_shape, input.get_dtype(), input.get_layout(), input.device(), mem_config)}; } else { return operation::generic_create_output_tensors(*this, inputs, input.get_dtype(), input.get_layout(), out_mem_config_); diff --git a/tt_eager/tt_dnn/op_library/reduce/reduce_op.cpp b/tt_eager/tt_dnn/op_library/reduce/reduce_op.cpp index 56fa3d869219..20f0e9cc9daf 100644 --- a/tt_eager/tt_dnn/op_library/reduce/reduce_op.cpp +++ b/tt_eager/tt_dnn/op_library/reduce/reduce_op.cpp @@ -104,7 +104,7 @@ std::vector Reduce::create_output_tensors(const std::vector &inp shard_spec.shape[0] = tt_metal::compute_volume(output_shape) / output_shape[-1]; auto mem_config = this->output_mem_config; mem_config.shard_spec = shard_spec; - return {create_sharded_device_tensor(output_shape, this->output_dtype, Layout::TILE, input_tensor.device(), mem_config)}; + return {create_device_tensor(output_shape, this->output_dtype, Layout::TILE, input_tensor.device(), mem_config)}; } else { return operation::generic_create_output_tensors(*this, input_tensors, this->output_dtype, Layout::TILE, this->output_mem_config); } diff --git a/tt_eager/tt_dnn/op_library/rotary_embedding/rotary_embedding_op.cpp b/tt_eager/tt_dnn/op_library/rotary_embedding/rotary_embedding_op.cpp index 5fafd9af479c..16cc9ac2f24f 100644 --- a/tt_eager/tt_dnn/op_library/rotary_embedding/rotary_embedding_op.cpp +++ b/tt_eager/tt_dnn/op_library/rotary_embedding/rotary_embedding_op.cpp @@ -91,7 +91,7 @@ std::vector RotaryEmbedding::create_output_tensors(const std::vectoroutput_mem_config; mem_config.shard_spec = shard_spec; - return {create_sharded_device_tensor( + return {create_device_tensor( output_shape, input_tensor.get_dtype(), input_tensor.get_layout(), diff --git a/tt_eager/tt_dnn/op_library/sharded/sharded_op.cpp b/tt_eager/tt_dnn/op_library/sharded/sharded_op.cpp index c6fc3014d3f6..e4650f0fa60c 100644 --- a/tt_eager/tt_dnn/op_library/sharded/sharded_op.cpp +++ b/tt_eager/tt_dnn/op_library/sharded/sharded_op.cpp @@ -47,7 +47,7 @@ std::vector Sharded::create_output_tensors(const std::vector& in if (this->sharded_op_type == ShardedOpType::InterleavedToSharded) { auto mem_config = this->output_mem_config; mem_config.shard_spec = this->shard_spec; - return {create_sharded_device_tensor( + return {create_device_tensor( this->compute_output_shapes(input_tensors).at(0), this->output_dtype, input_tensor.get_layout(), @@ -109,7 +109,7 @@ std::vector Reshard::create_output_tensors(const std::vector& in const auto& input_tensor = input_tensors.at(0); auto mem_config = this->output_mem_config; - return {create_sharded_device_tensor( + return {create_device_tensor( this->compute_output_shapes(input_tensors).at(0), input_tensor.get_dtype(), input_tensor.get_layout(), diff --git 
a/tt_eager/tt_dnn/op_library/sharded_partial/sharded_op_partial.cpp b/tt_eager/tt_dnn/op_library/sharded_partial/sharded_op_partial.cpp index b2115e896a33..5037f89d600a 100644 --- a/tt_eager/tt_dnn/op_library/sharded_partial/sharded_op_partial.cpp +++ b/tt_eager/tt_dnn/op_library/sharded_partial/sharded_op_partial.cpp @@ -73,7 +73,7 @@ std::vector ShardedPartial::create_output_tensors(const std::vectorsharded_op_type == ShardedOpPartialType::InterleavedToShardedPartial) { auto mem_config = this->output_mem_config; mem_config.shard_spec = this->shard_spec; - return {create_sharded_device_tensor( + return {create_device_tensor( this->compute_output_shapes(input_tensors).at(0), this->output_dtype, input_tensor.get_layout(), diff --git a/tt_eager/tt_dnn/op_library/tilize/tilize_op.cpp b/tt_eager/tt_dnn/op_library/tilize/tilize_op.cpp index 2e13f5e5874d..72f2e039ae22 100644 --- a/tt_eager/tt_dnn/op_library/tilize/tilize_op.cpp +++ b/tt_eager/tt_dnn/op_library/tilize/tilize_op.cpp @@ -58,7 +58,7 @@ std::vector Tilize::create_output_tensors(const std::vector& inp if (input_tensor.memory_config().is_sharded()) { auto mem_config = this->output_mem_config; mem_config.shard_spec = input_tensor.memory_config().shard_spec; - return {create_sharded_device_tensor( + return {create_device_tensor( this->compute_output_shapes(input_tensors).at(0), this->output_dtype, Layout::TILE, @@ -157,7 +157,7 @@ std::vector TilizeWithValPadding::create_output_tensors(const std::vecto shard_spec.shape[0] = tt_metal::compute_volume(output_shape) / output_shape[-1]; auto mem_config = this->output_mem_config; mem_config.shard_spec = shard_spec; - return {create_sharded_device_tensor( + return {create_device_tensor( output_shape, this->output_dtype, Layout::TILE, input_tensor_a.device(), mem_config)}; } else { return operation::generic_create_output_tensors( diff --git a/tt_eager/tt_dnn/op_library/transformer_tms/transformer_tms.cpp b/tt_eager/tt_dnn/op_library/transformer_tms/transformer_tms.cpp index f31aa3d574f0..904bfa1eecfb 100644 --- a/tt_eager/tt_dnn/op_library/transformer_tms/transformer_tms.cpp +++ b/tt_eager/tt_dnn/op_library/transformer_tms/transformer_tms.cpp @@ -78,15 +78,15 @@ std::vector SplitFusedQKVAndSplitHeads::create_output_tensors(const std: mem_config_qv.shard_spec = shard_spec_qv; auto mem_config_k = this->output_mem_config; mem_config_k.shard_spec = shard_spec_k; - auto out_tensor_q = create_sharded_device_tensor( + auto out_tensor_q = create_device_tensor( Shape{batch, num_heads, M, K}, input_tensor.get_dtype(), Layout::TILE, input_tensor.device(), mem_config_qv); - auto out_tensor_k = create_sharded_device_tensor( + auto out_tensor_k = create_device_tensor( Shape{batch, num_heads, K, M}, input_tensor.get_dtype(), Layout::TILE, input_tensor.device(), mem_config_k); - auto out_tensor_v = create_sharded_device_tensor( + auto out_tensor_v = create_device_tensor( Shape{batch, num_heads, M, K}, input_tensor.get_dtype(), Layout::TILE, @@ -443,7 +443,7 @@ std::vector GroupAttnMatmul::create_output_tensors(const std::vectorcompute_output_shapes(input_tensors).at(0), this->output_dtype, Layout::TILE, diff --git a/tt_eager/tt_dnn/op_library/transpose/transpose_op.cpp b/tt_eager/tt_dnn/op_library/transpose/transpose_op.cpp index 908532d4ccb4..2b7c5b970caa 100644 --- a/tt_eager/tt_dnn/op_library/transpose/transpose_op.cpp +++ b/tt_eager/tt_dnn/op_library/transpose/transpose_op.cpp @@ -100,7 +100,7 @@ std::vector Transpose::create_output_tensors(const std::vector & const auto output_shape = 
this->compute_output_shapes(input_tensors)[0]; auto mem_config = this->output_mem_config; mem_config.shard_spec = shard_spec; - return {create_sharded_device_tensor( + return {create_device_tensor( output_shape, input_tensor.get_dtype(), input_tensor.get_layout(), diff --git a/tt_eager/tt_dnn/op_library/untilize/untilize_op.cpp b/tt_eager/tt_dnn/op_library/untilize/untilize_op.cpp index 6a247053b9c4..54ff45280652 100644 --- a/tt_eager/tt_dnn/op_library/untilize/untilize_op.cpp +++ b/tt_eager/tt_dnn/op_library/untilize/untilize_op.cpp @@ -63,7 +63,7 @@ std::vector Untilize::create_output_tensors(const std::vector &i if (input_tensor.memory_config().is_sharded()) { auto mem_config = this->output_mem_config; mem_config.shard_spec = input_tensor.memory_config().shard_spec; - return {create_sharded_device_tensor(this->compute_output_shapes(input_tensors).at(0), output_dtype, Layout::ROW_MAJOR, input_tensor.device(), mem_config)}; + return {create_device_tensor(this->compute_output_shapes(input_tensors).at(0), output_dtype, Layout::ROW_MAJOR, input_tensor.device(), mem_config)}; } else { uint32_t ntiles = input_tensor.volume() / TILE_HW; uint32_t ntiles_per_block = input_tensor.get_legacy_shape()[-1] / TILE_WIDTH; @@ -75,7 +75,7 @@ std::vector Untilize::create_output_tensors(const std::vector &i ShardSpec shard_spec{shard_grid, shard_shape, ShardOrientation::ROW_MAJOR}; auto mem_config = this->output_mem_config; mem_config.shard_spec = shard_spec; - return {create_sharded_device_tensor(this->compute_output_shapes(input_tensors).at(0), output_dtype, Layout::ROW_MAJOR, input_tensor.device(), mem_config)}; + return {create_device_tensor(this->compute_output_shapes(input_tensors).at(0), output_dtype, Layout::ROW_MAJOR, input_tensor.device(), mem_config)}; } } else { return operation::generic_create_output_tensors(*this, input_tensors, output_dtype, Layout::ROW_MAJOR, this->output_mem_config); @@ -201,7 +201,7 @@ std::vector UntilizeWithUnpadding::create_output_tensors(const std::vect shard_spec.shape = shard_shape; auto mem_config = this->output_mem_config; mem_config.shard_spec = shard_spec; - return {create_sharded_device_tensor(this->compute_output_shapes(input_tensors).at(0), output_dtype, Layout::ROW_MAJOR, input_tensor_a.device(), mem_config)}; + return {create_device_tensor(this->compute_output_shapes(input_tensors).at(0), output_dtype, Layout::ROW_MAJOR, input_tensor_a.device(), mem_config)}; } else { return operation::generic_create_output_tensors(*this, input_tensors, output_dtype, Layout::ROW_MAJOR, this->output_mem_config); } diff --git a/tt_eager/tt_dnn/op_library/untilize/untilize_with_halo_op.cpp b/tt_eager/tt_dnn/op_library/untilize/untilize_with_halo_op.cpp index 183be80c2d4b..1c84def04aaa 100644 --- a/tt_eager/tt_dnn/op_library/untilize/untilize_with_halo_op.cpp +++ b/tt_eager/tt_dnn/op_library/untilize/untilize_with_halo_op.cpp @@ -1359,7 +1359,7 @@ std::vector UntilizeWithHalo::create_output_tensors(const std::vectoroutput_mem_config; mem_config.shard_spec = shard_spec; - return {create_sharded_device_tensor(output_shape, output_dtype, Layout::ROW_MAJOR, input_tensor.device(), mem_config)}; + return {create_device_tensor(output_shape, output_dtype, Layout::ROW_MAJOR, input_tensor.device(), mem_config)}; } operation::ProgramWithCallbacks UntilizeWithHalo::create_program(const std::vector& input_tensors, std::vector &output_tensors) const { diff --git a/tt_eager/tt_dnn/op_library/untilize/untilize_with_halo_op_v2.cpp 
b/tt_eager/tt_dnn/op_library/untilize/untilize_with_halo_op_v2.cpp index 8186457bac2c..b151ef322612 100644 --- a/tt_eager/tt_dnn/op_library/untilize/untilize_with_halo_op_v2.cpp +++ b/tt_eager/tt_dnn/op_library/untilize/untilize_with_halo_op_v2.cpp @@ -313,7 +313,7 @@ std::vector UntilizeWithHaloV2::create_output_tensors(const std::vector< out_mem_config.shard_spec->shape[0] = div_up(output_shape[0] * output_shape[2], ncores_nhw_); out_mem_config.shard_spec->shape[1] = input_tensor.memory_config().shard_spec->shape[1]; out_mem_config.shard_spec->halo = true; - return {create_sharded_device_tensor( + return {create_device_tensor( output_shape, output_dtype, Layout::ROW_MAJOR, input_tensor.device(), out_mem_config)}; } diff --git a/tt_eager/tt_dnn/op_library/upsample/upsample_op.cpp b/tt_eager/tt_dnn/op_library/upsample/upsample_op.cpp index 734a1e277d52..81d80de759f6 100644 --- a/tt_eager/tt_dnn/op_library/upsample/upsample_op.cpp +++ b/tt_eager/tt_dnn/op_library/upsample/upsample_op.cpp @@ -63,7 +63,7 @@ std::vector UpSample::create_output_tensors(const std::vector &i mem_config.shard_spec = output_shard_spec; log_debug(LogOp, "output_shard_shape: {}", output_shard_shape); log_debug(LogOp, "output_shard_spec: {}", output_shard_spec); - return {create_sharded_device_tensor(output_shape, input.get_dtype(), input.get_layout(), input.device(), mem_config)}; + return {create_device_tensor(output_shape, input.get_dtype(), input.get_layout(), input.device(), mem_config)}; } else if (input.memory_config().memory_layout == TensorMemoryLayout::BLOCK_SHARDED) { auto shard_grid = input_shard_spec.grid.ranges(); TT_FATAL(shard_grid.size() == 1, "Block sharded input should have only one CoreRange"); @@ -78,7 +78,7 @@ std::vector UpSample::create_output_tensors(const std::vector &i auto output_shard_shape = output_shard_spec.shape; log_debug(LogOp, "ncores_w, ncores_h: {} {}", ncores_w, ncores_h); log_debug(LogOp, "output_shard_shape: {}", output_shard_shape); - return {create_sharded_device_tensor(output_shape, input.get_dtype(), input.get_layout(), input.device(), mem_config)}; + return {create_device_tensor(output_shape, input.get_dtype(), input.get_layout(), input.device(), mem_config)}; } else { TT_FATAL(false, "input memory config is not HEIGHT or BLOCK sharded"); } diff --git a/ttnn/cpp/ttnn/op_library/binary/binary_op.cpp b/ttnn/cpp/ttnn/op_library/binary/binary_op.cpp index bf34e66321bf..ab63e57f7752 100644 --- a/ttnn/cpp/ttnn/op_library/binary/binary_op.cpp +++ b/ttnn/cpp/ttnn/op_library/binary/binary_op.cpp @@ -226,7 +226,7 @@ std::vector Binary::create_output_tensors( } auto memory_config = this->program_config.memory_config; memory_config.shard_spec = shard_spec; - return {create_sharded_device_tensor( + return {create_device_tensor( this->compute_output_shapes(input_tensors).at(0), this->program_config.dtype, Layout::TILE, @@ -242,7 +242,7 @@ std::vector Binary::create_output_tensors( } auto memory_config = this->program_config.memory_config; memory_config.shard_spec = shard_spec; - return {create_sharded_device_tensor( + return {create_device_tensor( this->compute_output_shapes(input_tensors).at(0), this->program_config.dtype, Layout::TILE, diff --git a/ttnn/cpp/ttnn/operations/transformer.hpp b/ttnn/cpp/ttnn/operations/transformer.hpp index 29a4a78541b0..b43939c33aee 100644 --- a/ttnn/cpp/ttnn/operations/transformer.hpp +++ b/ttnn/cpp/ttnn/operations/transformer.hpp @@ -206,7 +206,7 @@ struct ConcatenateHeads : public tt::tt_metal::NlpConcatHeads { shard_spec.shape = {shard_spec.shape[0] 
/ heads_per_shard, shard_spec.shape[1] * heads_per_shard}; auto mem_config = this->output_mem_config; mem_config.shard_spec = shard_spec; - return {create_sharded_device_tensor( + return {create_device_tensor( this->compute_output_shapes(input_tensors).at(0), input_tensor.get_dtype(), Layout::TILE, From c1a891ae9c680225a7013dbaacdcab8a83d97a68 Mon Sep 17 00:00:00 2001 From: asaigal Date: Sat, 11 May 2024 21:51:37 +0000 Subject: [PATCH 30/40] #0: TTNN Async and Multi Device Trace Support - Add async safe ttnn and tt_lib trace APIs - Single and multi-chip trace tests added to ttnn post commit - Resnet50 Async Trace tests added (after porting the model over to async) - Certain multichip tests with all-gather currently disabled since they hang with trace --- .../demos/resnet/tests/test_metal_resnet50.py | 13 +- models/demos/resnet/tests/test_perf_resnet.py | 5 +- tests/scripts/t3000/run_t3000_unit_tests.sh | 4 +- .../trace_testing/misc/test_average_pool.py | 6 +- .../trace_testing/misc/test_bert_ops.py | 6 +- .../misc/test_tensor_prealloc_and_write.py | 3 +- .../unit_tests/test_multi_device_trace.py | 222 ++++++++++++++++++ .../unit_tests/test_single_device_trace.py | 159 +++++++++++++ tt_eager/tensor/tensor.cpp | 12 +- tt_eager/tensor/tensor.hpp | 4 +- tt_eager/tensor/types.hpp | 4 +- .../op_library/conv/optimized_conv_op.cpp | 53 +++-- tt_eager/tt_dnn/op_library/copy/copy_op.cpp | 14 +- .../op_library/downsample/downsample_op.cpp | 7 +- tt_eager/tt_dnn/op_library/move/move_op.hpp | 65 ++--- .../tt_dnn/op_library/nlp_tms/nlp_tms.cpp | 2 +- tt_eager/tt_dnn/op_library/pool/max_pool.cpp | 38 +-- .../tt_dnn/op_library/tilize/tilize_op.cpp | 34 ++- .../op_library/untilize/untilize_op.cpp | 38 +-- .../untilize/untilize_with_halo_op_v2.cpp | 36 +-- tt_eager/tt_lib/csrc/tt_lib_bindings.cpp | 40 +++- .../tt_lib/csrc/tt_lib_bindings_tensor.cpp | 4 +- ttnn/cpp/pybind11/device.hpp | 1 + ttnn/cpp/pybind11/operations/core.hpp | 100 ++++++++ ttnn/cpp/ttnn/operations/core.hpp | 104 ++++++++ ttnn/ttnn/__init__.py | 6 + ttnn/ttnn/operations/core.py | 23 ++ 27 files changed, 851 insertions(+), 152 deletions(-) create mode 100644 tests/ttnn/unit_tests/test_multi_device_trace.py create mode 100644 tests/ttnn/unit_tests/test_single_device_trace.py diff --git a/models/demos/resnet/tests/test_metal_resnet50.py b/models/demos/resnet/tests/test_metal_resnet50.py index b1791bbf0ee8..e5ade5e7802c 100644 --- a/models/demos/resnet/tests/test_metal_resnet50.py +++ b/models/demos/resnet/tests/test_metal_resnet50.py @@ -239,12 +239,20 @@ def test_run_resnet50_inference( [tt_lib.tensor.MathFidelity.HiFi4, tt_lib.tensor.MathFidelity.HiFi2, tt_lib.tensor.MathFidelity.LoFi], ids=["HiFi4", "HiFi2", "LoFi"], ) +@pytest.mark.parametrize("enable_async", [True, False]) def test_run_resnet50_trace_inference( - device, use_program_cache, batch_size, weights_dtype, activations_dtype, math_fidelity, imagenet_sample_input + device, + use_program_cache, + batch_size, + weights_dtype, + activations_dtype, + math_fidelity, + imagenet_sample_input, + enable_async, ): if is_e75(device): pytest.skip("Resnet50 is not supported on E75") - + device.enable_async(enable_async) if batch_size > 8 and ( activations_dtype != tt_lib.tensor.DataType.BFLOAT8_B or weights_dtype != tt_lib.tensor.DataType.BFLOAT8_B ): @@ -339,3 +347,4 @@ def test_run_resnet50_trace_inference( # assert passing # fails because of torch.allclose # Done with the trace, can deallocate the buffers now. 
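    # tid is the trace handle returned when capture began; releasing it frees the
    # device DRAM buffer that backs the captured trace.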
tt_lib.device.ReleaseTrace(device, tid) + device.enable_async(False) diff --git a/models/demos/resnet/tests/test_perf_resnet.py b/models/demos/resnet/tests/test_perf_resnet.py index ac3f54cc9cbd..f8811cd01cce 100644 --- a/models/demos/resnet/tests/test_perf_resnet.py +++ b/models/demos/resnet/tests/test_perf_resnet.py @@ -283,6 +283,7 @@ def run_perf_resnet_trace( (20, 0.04, 25), ), ) +@pytest.mark.parametrize("enable_async", [True, False]) def test_perf_trace_bare_metal( device, use_program_cache, @@ -290,10 +291,11 @@ def test_perf_trace_bare_metal( expected_inference_time, expected_compile_time, hf_cat_image_sample_input, + enable_async, ): if is_e75(device): pytest.skip("Resnet is not supported on E75") - + device.enable_async(enable_async) run_perf_resnet_trace( batch_size, expected_inference_time, @@ -301,3 +303,4 @@ def test_perf_trace_bare_metal( hf_cat_image_sample_input, device, ) + device.enable_async(False) diff --git a/tests/scripts/t3000/run_t3000_unit_tests.sh b/tests/scripts/t3000/run_t3000_unit_tests.sh index 677b9d7cdf12..c491fc0f898c 100755 --- a/tests/scripts/t3000/run_t3000_unit_tests.sh +++ b/tests/scripts/t3000/run_t3000_unit_tests.sh @@ -27,9 +27,9 @@ run_t3000_ttnn_tests() { start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_ttnn_tests" - + pytest tests/ttnn/unit_tests/test_multi_device_trace.py pytest tests/ttnn/unit_tests/test_multi_device.py - + pytest tests/ttnn/unit_tests/test_multi_device_async.py # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) diff --git a/tests/tt_eager/python_api_testing/trace_testing/misc/test_average_pool.py b/tests/tt_eager/python_api_testing/trace_testing/misc/test_average_pool.py index a8ec7b13742c..0ea2a8c5e00f 100644 --- a/tests/tt_eager/python_api_testing/trace_testing/misc/test_average_pool.py +++ b/tests/tt_eager/python_api_testing/trace_testing/misc/test_average_pool.py @@ -34,7 +34,10 @@ def shape_padded(shape): "BFLOAT16", ], ) -def test_run_average_pool(act_shape, dtype, device, use_program_cache): +@pytest.mark.parametrize("enable_async", [True, False]) +def test_run_average_pool(act_shape, dtype, device, use_program_cache, enable_async): + device.enable_async(enable_async) + batch_size, _, _, channels = act_shape torch.manual_seed(0) @@ -103,3 +106,4 @@ def run_ops(ttact_res): # Done with the trace, can deallocate the buffers now. 
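    # Unlike the old ReleaseLastTrace API, ReleaseTrace takes an explicit trace id,
    # so several captured traces can be kept alive and freed independently.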
ttl.device.ReleaseTrace(device, tid) + device.enable_async(False) diff --git a/tests/tt_eager/python_api_testing/trace_testing/misc/test_bert_ops.py b/tests/tt_eager/python_api_testing/trace_testing/misc/test_bert_ops.py index 4893e71dfa91..820d6782083b 100644 --- a/tests/tt_eager/python_api_testing/trace_testing/misc/test_bert_ops.py +++ b/tests/tt_eager/python_api_testing/trace_testing/misc/test_bert_ops.py @@ -35,6 +35,7 @@ (False, False, False, 4608, 1024, 3072, None), # out interleaved, in0 interleaved ], ) +@pytest.mark.parametrize("enable_async", [True, False]) def test_bert_linear( device, fidelity, @@ -47,7 +48,9 @@ def test_bert_linear( activation, use_program_cache, function_level_defaults, + enable_async, ): + device.enable_async(enable_async) has_bias = False in0_shape = [1, 1, M, K] in1_shape = [1, 1, K, N] @@ -96,7 +99,6 @@ def test_bert_linear( in0 = torch.randn(in0_shape).bfloat16().float() in1 = torch.randn(in1_shape).bfloat16().float() bias = torch.randn(bias_shape).bfloat16().float() - in0_t_res = torch2tt_tensor( in0, device, tt_memory_config=interleaved_mem_config_DRAM, tt_dtype=ttl.tensor.DataType.BFLOAT8_B ) @@ -195,7 +197,7 @@ def run_ops(in0_t_res): passing, output = comp_pcc(pt_out, tt_out) logger.info(output) assert passing - ttl.device.ReleaseLastTrace(device) # Done with the trace, can deallocate the buffers now. ttl.device.ReleaseTrace(device, tid) + device.enable_async(False) diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_tensor_prealloc_and_write.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_tensor_prealloc_and_write.py index a1929ab34390..9c7333cbc809 100644 --- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_tensor_prealloc_and_write.py +++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_tensor_prealloc_and_write.py @@ -7,6 +7,7 @@ import torch import tt_lib as ttl +import ttnn from models.utility_functions import comp_pcc from models.utility_functions import is_grayskull @@ -31,7 +32,7 @@ def test_tensor_preallocation_and_write_apis( for tensor_shape in shapes: # Preallocate tensor on device preallocated_tensor = ttl.tensor.allocate_tensor_on_device( - tensor_shape, + ttnn.Shape(tensor_shape), in_dtype, tensor_layout, device, diff --git a/tests/ttnn/unit_tests/test_multi_device_trace.py b/tests/ttnn/unit_tests/test_multi_device_trace.py new file mode 100644 index 000000000000..e75279713485 --- /dev/null +++ b/tests/ttnn/unit_tests/test_multi_device_trace.py @@ -0,0 +1,222 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +import torch +import typing +import pytest +import ttnn +import tempfile +from loguru import logger +from tests.ttnn.utils_for_testing import assert_with_pcc +from ttnn import ShardTensorToMesh, ReplicateTensorToMesh, ConcatMeshToTensor, ListMeshToTensor + + +@pytest.mark.parametrize("shape", [(1, 1, 512, 512), (1, 1, 32, 32), (1, 3, 512, 512), (1, 3, 32, 32)]) +@pytest.mark.parametrize("use_all_gather", [True, False]) +@pytest.mark.parametrize("enable_async", [True, False]) +def test_multi_device_single_trace(pcie_device_mesh, shape, use_all_gather, enable_async): + # Trace requires program cache to be enabled + for device_id in pcie_device_mesh.get_device_ids(): + pcie_device_mesh.get_device(device_id).enable_async(enable_async) + pcie_device_mesh.get_device(device_id).enable_program_cache() + + # Preallocate activation tensors. 
These will be used when capturing and executing the trace + input_0_dev = ttnn.allocate_tensor_on_device(ttnn.Shape(shape), ttnn.bfloat16, ttnn.TILE_LAYOUT, pcie_device_mesh) + input_1_dev = ttnn.allocate_tensor_on_device(ttnn.Shape(shape), ttnn.bfloat16, ttnn.TILE_LAYOUT, pcie_device_mesh) + + # Op chains to be traced + def run_op_chain(input_0, input_1): + single_dev_output = ttnn.neg(ttnn.add(ttnn.mul(input_1, ttnn.neg(ttnn.gelu(input_0))), ttnn.relu(input_1))) + if use_all_gather: + return ttnn.all_gather(single_dev_output, dim=0, num_links=1) + return single_dev_output + + # Compile program binaries + run_op_chain(input_0_dev, input_1_dev) + + # Capture Trace + logger.info("Capture Trace") + tid = ttnn.begin_trace_capture(pcie_device_mesh, trace_buffer_size=106496, cq_id=0) + output_tensor = run_op_chain(input_0_dev, input_1_dev) + ttnn.end_trace_capture(pcie_device_mesh, tid, cq_id=0) + + for i in range(50): + # Create torch inputs + torch_input_tensor_0 = torch.rand( + (pcie_device_mesh.get_num_devices(), shape[1], shape[2], shape[3]), dtype=torch.bfloat16 + ) + torch_input_tensor_1 = torch.rand( + (pcie_device_mesh.get_num_devices(), shape[1], shape[2], shape[3]), dtype=torch.bfloat16 + ) + # Compute PT Golden + torch_output_golden = torch.neg( + torch.add( + torch.mul(torch_input_tensor_1, torch.neg(torch.nn.functional.gelu(torch_input_tensor_0))), + torch.relu(torch_input_tensor_1), + ) + ) + # Convert torch tensors to TTNN Multi-Device Host Tensors + ttnn_input_tensor_0 = ttnn.from_torch( + torch_input_tensor_0, layout=ttnn.TILE_LAYOUT, mesh_mapper=ShardTensorToMesh(pcie_device_mesh, dim=0) + ) + ttnn_input_tensor_1 = ttnn.from_torch( + torch_input_tensor_1, layout=ttnn.TILE_LAYOUT, mesh_mapper=ShardTensorToMesh(pcie_device_mesh, dim=0) + ) + + # Copy TTNN host tensors into preallocated Multi-Device tensors + logger.info("Send Inputs to Device") + ttnn.copy_host_to_device_tensor(ttnn_input_tensor_0, input_0_dev) + ttnn.copy_host_to_device_tensor(ttnn_input_tensor_1, input_1_dev) + logger.info("Execute Trace") + # Execute trace + ttnn.execute_trace(pcie_device_mesh, tid, cq_id=0, blocking=False) + + if use_all_gather: + # Device All-Gather: Iterate through tensors on all devices.
Ensure they match the full tensor + logger.info("Read Back Trace Outputs") + device_tensors: typing.List[ttnn.Tensor] = ttnn.get_device_tensors(output_tensor) + for device_tensor in device_tensors: + device_tensor_torch = ttnn.to_torch(device_tensor) + assert_with_pcc(device_tensor_torch, torch_output_golden, pcc=0.99) + + else: + # Perform host All-Gather + ttnn_torch_output_tensor = ttnn.to_torch( + output_tensor, mesh_composer=ConcatMeshToTensor(pcie_device_mesh, dim=0) + ) + assert_with_pcc(ttnn_torch_output_tensor, torch_output_golden, pcc=0.99) + + # Release trace buffer once workload is complete + ttnn.release_trace(pcie_device_mesh, tid) + + for device_id in pcie_device_mesh.get_device_ids(): + pcie_device_mesh.get_device(device_id).enable_async(False) + + +@pytest.mark.parametrize("shape", [(1, 1, 512, 512), (1, 1, 32, 32), (1, 3, 512, 512), (1, 3, 32, 32)]) +@pytest.mark.parametrize("use_all_gather", [True, False]) +@pytest.mark.parametrize("enable_async", [True, False]) +def test_multi_device_multi_trace(pcie_device_mesh, shape, use_all_gather, enable_async): + if use_all_gather: + # Currently all-gather tests pass only if blocking == False + if shape == (1, 1, 32, 32) or shape == (1, 3, 512, 512) or shape == (1, 3, 32, 32): + pytest.skip("This configuration is not working with all-gather") + + # Trace requires program cache to be enabled + for device_id in pcie_device_mesh.get_device_ids(): + pcie_device_mesh.get_device(device_id).enable_async(enable_async) + pcie_device_mesh.get_device(device_id).enable_program_cache() + + # Preallocate activation tensors. These will be used when capturing and executing the trace + input_0_dev = ttnn.allocate_tensor_on_device(ttnn.Shape(shape), ttnn.bfloat16, ttnn.TILE_LAYOUT, pcie_device_mesh) + input_1_dev = ttnn.allocate_tensor_on_device(ttnn.Shape(shape), ttnn.bfloat16, ttnn.TILE_LAYOUT, pcie_device_mesh) + weight_dev = ttnn.allocate_tensor_on_device(ttnn.Shape(shape), ttnn.bfloat16, ttnn.TILE_LAYOUT, pcie_device_mesh) + + # Op chains to be traced + def run_op_chain(input_0, input_1, weight): + single_dev_output = ttnn.neg( + ttnn.add(ttnn.mul(input_1, ttnn.neg(ttnn.gelu(input_0))), ttnn.relu(input_1)) + ) @ ttnn.silu(weight) + if use_all_gather: + return ttnn.all_gather(single_dev_output, dim=0, num_links=1) + return single_dev_output + + def run_op_chain_1(input_0, input_1, weight): + single_dev_output = ttnn.tanh(ttnn.mul(ttnn.sub(input_0, input_1), weight)) + if use_all_gather: + return ttnn.all_gather(single_dev_output, dim=0, num_links=1) + return single_dev_output + + # Compile program binaries + run_op_chain(input_0_dev, input_1_dev, weight_dev) + run_op_chain_1(input_0_dev, input_1_dev, weight_dev) + + # Capture Trace 0 + logger.info("Capture Trace 0") + tid = ttnn.begin_trace_capture(pcie_device_mesh, trace_buffer_size=106496, cq_id=0) + output_tensor = run_op_chain(input_0_dev, input_1_dev, weight_dev) + ttnn.end_trace_capture(pcie_device_mesh, tid, cq_id=0) + + # Capture Trace 1 + logger.info("Capture Trace 1") + tid_1 = ttnn.begin_trace_capture(pcie_device_mesh, trace_buffer_size=26624, cq_id=0) + output_tensor_1 = run_op_chain_1(input_0_dev, input_1_dev, weight_dev) + ttnn.end_trace_capture(pcie_device_mesh, tid_1, cq_id=0) + + # Execute and verify trace against pytorch + torch_silu = torch.nn.SiLU() + for i in range(50): + # Create torch inputs + torch_input_tensor_0 = torch.rand( + (pcie_device_mesh.get_num_devices(), shape[1], shape[2], shape[3]), dtype=torch.bfloat16 + ) + torch_input_tensor_1 = torch.rand( + 
(pcie_device_mesh.get_num_devices(), shape[1], shape[2], shape[3]), dtype=torch.bfloat16 + ) + torch_weight = torch.rand(shape, dtype=torch.bfloat16) + # Compute PT Golden + torch_output_golden = torch.neg( + torch.add( + torch.mul(torch_input_tensor_1, torch.neg(torch.nn.functional.gelu(torch_input_tensor_0))), + torch.relu(torch_input_tensor_1), + ) + ) @ torch_silu(torch_weight) + + torch_output_golden_1 = torch.tanh( + torch.mul(torch.sub(torch_input_tensor_0, torch_input_tensor_1), torch_weight) + ) + + # Convert torch tensors to TTNN Multi-Device Host Tensors + ttnn_input_tensor_0 = ttnn.from_torch( + torch_input_tensor_0, layout=ttnn.TILE_LAYOUT, mesh_mapper=ShardTensorToMesh(pcie_device_mesh, dim=0) + ) + ttnn_input_tensor_1 = ttnn.from_torch( + torch_input_tensor_1, layout=ttnn.TILE_LAYOUT, mesh_mapper=ShardTensorToMesh(pcie_device_mesh, dim=0) + ) + ttnn_weight = ttnn.from_torch( + torch_weight, layout=ttnn.TILE_LAYOUT, mesh_mapper=ReplicateTensorToMesh(pcie_device_mesh) + ) + + # Copy TTNN host tensors into preallocated Multi-Device tensors + logger.info("Send Inputs to Device") + ttnn.copy_host_to_device_tensor(ttnn_input_tensor_0, input_0_dev) + ttnn.copy_host_to_device_tensor(ttnn_input_tensor_1, input_1_dev) + ttnn.copy_host_to_device_tensor(ttnn_weight, weight_dev) + + logger.info("Execute Trace 0") + # Execute trace + ttnn.execute_trace(pcie_device_mesh, tid, cq_id=0, blocking=False) + logger.info("Execute Trace 1") + ttnn.execute_trace(pcie_device_mesh, tid_1, cq_id=0, blocking=False) + if use_all_gather: + # Device All-Gather: Iterate through tensors on all devices. Ensure they match the full tensor + logger.info("Read Back Trace 0 Outputs") + device_tensors: typing.List[ttnn.Tensor] = ttnn.get_device_tensors(output_tensor) + for device_tensor in device_tensors: + device_tensor_torch = ttnn.to_torch(device_tensor) + assert_with_pcc(device_tensor_torch, torch_output_golden, pcc=0.99) + + logger.info("Read Back Trace 1 Outputs") + device_tensors: typing.List[ttnn.Tensor] = ttnn.get_device_tensors(output_tensor_1) + for device_tensor in device_tensors: + device_tensor_torch = ttnn.to_torch(device_tensor) + assert_with_pcc(device_tensor_torch, torch_output_golden_1, pcc=0.99) + else: + # Perform host All-Gather + ttnn_torch_output_tensor = ttnn.to_torch( + output_tensor, mesh_composer=ConcatMeshToTensor(pcie_device_mesh, dim=0) + ) + assert_with_pcc(ttnn_torch_output_tensor, torch_output_golden, pcc=0.99) + + ttnn_torch_output_tensor = ttnn.to_torch( + output_tensor_1, mesh_composer=ConcatMeshToTensor(pcie_device_mesh, dim=0) + ) + assert_with_pcc(ttnn_torch_output_tensor, torch_output_golden_1, pcc=0.99) + + # Release trace buffer once workload is complete + ttnn.release_trace(pcie_device_mesh, tid) + ttnn.release_trace(pcie_device_mesh, tid_1) + + for device_id in pcie_device_mesh.get_device_ids(): + pcie_device_mesh.get_device(device_id).enable_async(False)
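The two multi-device tests above exercise a small API surface. As a quick reference, the capture/replay lifecycle reduces to the sketch below: a minimal, hypothetical example assuming a `mesh` handle (such as the `pcie_device_mesh` fixture here), a device-resident tensor `input_dev`, and a single op standing in for the traced chain; the 106496-byte trace buffer size is copied from the tests and is workload-dependent:

    import ttnn

    # Warm-up run compiles program binaries; trace capture itself does not compile
    output = ttnn.gelu(input_dev)
    # Record the op chain into a device-side trace buffer
    tid = ttnn.begin_trace_capture(mesh, trace_buffer_size=106496, cq_id=0)
    output = ttnn.gelu(input_dev)
    ttnn.end_trace_capture(mesh, tid, cq_id=0)
    # Replay the recorded commands without re-dispatching ops from host
    ttnn.execute_trace(mesh, tid, cq_id=0, blocking=False)
    # Free the trace buffer once the workload is complete
    ttnn.release_trace(mesh, tid)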
diff --git a/tests/ttnn/unit_tests/test_single_device_trace.py new file mode 100644 index 000000000000..1ce721d2073e --- /dev/null +++ b/tests/ttnn/unit_tests/test_single_device_trace.py @@ -0,0 +1,159 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +import torch +import typing +import pytest +import ttnn +import tempfile +from loguru import logger +from tests.ttnn.utils_for_testing import assert_with_pcc + + +@pytest.mark.parametrize("shape", [(1, 1, 512, 512), (1, 1, 32, 32), (1, 3, 512, 512), (1, 3, 32, 32)]) +@pytest.mark.parametrize("enable_async", [True, False]) +@pytest.mark.parametrize("blocking", [True, False]) +def test_single_device_single_trace(device, shape, enable_async, blocking): + device.enable_async(enable_async) + device.enable_program_cache() + + # Preallocate activation tensors. These will be used when capturing and executing the trace + input_0_dev = ttnn.allocate_tensor_on_device(ttnn.Shape(shape), ttnn.bfloat16, ttnn.TILE_LAYOUT, device) + input_1_dev = ttnn.allocate_tensor_on_device(ttnn.Shape(shape), ttnn.bfloat16, ttnn.TILE_LAYOUT, device) + + # Op chain to be traced + def run_op_chain(input_0, input_1): + return ttnn.neg(ttnn.add(ttnn.mul(input_1, ttnn.neg(ttnn.gelu(input_0))), ttnn.relu(input_1))) + + # Compile program binaries + run_op_chain(input_0_dev, input_1_dev) + + # Capture Trace + logger.info("Capture Trace") + tid = ttnn.begin_trace_capture(device, trace_buffer_size=106496, cq_id=0) + output_tensor = run_op_chain(input_0_dev, input_1_dev) + ttnn.end_trace_capture(device, tid, cq_id=0) + + for i in range(50): + # Create torch inputs + torch_input_tensor_0 = torch.rand(shape, dtype=torch.bfloat16) + torch_input_tensor_1 = torch.rand(shape, dtype=torch.bfloat16) + # Compute PT Golden + torch_output_golden = torch.neg( + torch.add( + torch.mul(torch_input_tensor_1, torch.neg(torch.nn.functional.gelu(torch_input_tensor_0))), + torch.relu(torch_input_tensor_1), + ) + ) + + # Convert torch tensors to TTNN host tensors + ttnn_input_tensor_0 = ttnn.from_torch(torch_input_tensor_0, layout=ttnn.TILE_LAYOUT) + ttnn_input_tensor_1 = ttnn.from_torch(torch_input_tensor_1, layout=ttnn.TILE_LAYOUT) + + # Copy TTNN host tensors into preallocated device tensors + logger.info("Send Inputs to Device") + ttnn.copy_host_to_device_tensor(ttnn_input_tensor_0, input_0_dev) + ttnn.copy_host_to_device_tensor(ttnn_input_tensor_1, input_1_dev) + + if blocking: + ttnn.synchronize_device(device) + logger.info("Execute Trace") + # Execute trace + ttnn.execute_trace(device, tid, cq_id=0, blocking=blocking) + # Readback data + logger.info("Read Back Trace Outputs") + ttnn_torch_output_tensor = ttnn.to_torch(output_tensor) + assert_with_pcc(ttnn_torch_output_tensor, torch_output_golden, pcc=0.99) + + ttnn.release_trace(device, tid) + device.enable_async(False)
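The `blocking` parametrization above comes down to who waits for the replay: with `blocking=True` the call itself waits for the device to finish, while with `blocking=False` it returns as soon as the replay is enqueued and a later readback synchronizes. A minimal sketch of the two modes, reusing `device`, `tid`, and `output_tensor` from a capture like the one above:

    # Non-blocking: returns immediately; the readback below waits for completion
    ttnn.execute_trace(device, tid, cq_id=0, blocking=False)
    result = ttnn.to_torch(output_tensor)

    # Blocking: execute_trace waits until the device has finished replaying
    ttnn.execute_trace(device, tid, cq_id=0, blocking=True)
    result = ttnn.to_torch(output_tensor)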
+ + +@pytest.mark.parametrize("shape", [(1, 1, 512, 512), (1, 1, 32, 32), (1, 3, 512, 512), (1, 3, 32, 32)]) +@pytest.mark.parametrize("enable_async", [True, False]) +@pytest.mark.parametrize("blocking", [True, False]) +def test_single_device_multi_trace(device, shape, enable_async, blocking): + device.enable_async(enable_async) + device.enable_program_cache() + + # Preallocate activation tensors. These will be used when capturing and executing the trace + input_0_dev = ttnn.allocate_tensor_on_device(ttnn.Shape(shape), ttnn.bfloat16, ttnn.TILE_LAYOUT, device) + input_1_dev = ttnn.allocate_tensor_on_device(ttnn.Shape(shape), ttnn.bfloat16, ttnn.TILE_LAYOUT, device) + weight_dev = ttnn.allocate_tensor_on_device(ttnn.Shape(shape), ttnn.bfloat16, ttnn.TILE_LAYOUT, device) + + # Op chains to be traced + def run_op_chain(input_0, input_1, weight): + return ttnn.neg(ttnn.add(ttnn.mul(input_1, ttnn.neg(ttnn.gelu(input_0))), ttnn.relu(input_1))) @ ttnn.silu( + weight + ) + + def run_op_chain_1(input_0, input_1, weight): + return ttnn.tanh(ttnn.mul(ttnn.sub(input_0, input_1), weight)) + + # Compile program binaries + run_op_chain(input_0_dev, input_1_dev, weight_dev) + run_op_chain_1(input_0_dev, input_1_dev, weight_dev) + + # Capture Trace 0 + logger.info("Capture Trace 0") + tid = ttnn.begin_trace_capture(device, trace_buffer_size=106496, cq_id=0) + output_tensor = run_op_chain(input_0_dev, input_1_dev, weight_dev) + ttnn.end_trace_capture(device, tid, cq_id=0) + + # Capture Trace 1 + logger.info("Capture Trace 1") + tid_1 = ttnn.begin_trace_capture(device, trace_buffer_size=26624, cq_id=0) + output_tensor_1 = run_op_chain_1(input_0_dev, input_1_dev, weight_dev) + ttnn.end_trace_capture(device, tid_1, cq_id=0) + + # Execute and verify trace against pytorch + torch_silu = torch.nn.SiLU() + for i in range(50): + # Create torch inputs + torch_input_tensor_0 = torch.rand(shape, dtype=torch.bfloat16) + torch_input_tensor_1 = torch.rand(shape, dtype=torch.bfloat16) + torch_weight = torch.rand(shape, dtype=torch.bfloat16) + # Compute PT Golden + torch_output_golden = torch.neg( + torch.add( + torch.mul(torch_input_tensor_1, torch.neg(torch.nn.functional.gelu(torch_input_tensor_0))), + torch.relu(torch_input_tensor_1), + ) + ) @ torch_silu(torch_weight) + + torch_output_golden_1 = torch.tanh( + torch.mul(torch.sub(torch_input_tensor_0, torch_input_tensor_1), torch_weight) + ) + + # Convert torch tensors to TTNN host tensors + ttnn_input_tensor_0 = ttnn.from_torch(torch_input_tensor_0, layout=ttnn.TILE_LAYOUT) + ttnn_input_tensor_1 = ttnn.from_torch(torch_input_tensor_1, layout=ttnn.TILE_LAYOUT) + ttnn_weight = ttnn.from_torch(torch_weight, layout=ttnn.TILE_LAYOUT) + + # Copy TTNN host tensors into preallocated device tensors + logger.info("Send Inputs to Device") + ttnn.copy_host_to_device_tensor(ttnn_input_tensor_0, input_0_dev) + ttnn.copy_host_to_device_tensor(ttnn_input_tensor_1, input_1_dev) + ttnn.copy_host_to_device_tensor(ttnn_weight, weight_dev) + + if blocking: + ttnn.synchronize_device(device) + logger.info("Execute Trace 0") + # Execute trace + ttnn.execute_trace(device, tid, cq_id=0, blocking=blocking) + logger.info("Execute Trace 1") + ttnn.execute_trace(device, tid_1, cq_id=0, blocking=blocking) + + logger.info("Read Back Trace 0 Outputs") + ttnn_torch_output_tensor = ttnn.to_torch(output_tensor) + assert_with_pcc(ttnn_torch_output_tensor, torch_output_golden, pcc=0.99) + logger.info("Read Back Trace 1 Outputs") + ttnn_torch_output_tensor = ttnn.to_torch(output_tensor_1) + assert_with_pcc(ttnn_torch_output_tensor, torch_output_golden_1, pcc=0.99) + + # Release trace buffer once workload is complete + ttnn.release_trace(device, tid) + ttnn.release_trace(device, tid_1) + + device.enable_async(False)
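The tensor.cpp change that follows is what the tests above lean on: `allocate_tensor_on_device` now accepts a `ttnn::Shape`, so Python callers wrap plain tuples explicitly. A minimal sketch, assuming an open `device` handle:

    import ttnn

    # Wrap the plain tuple in ttnn.Shape before preallocating on device
    dev_tensor = ttnn.allocate_tensor_on_device(
        ttnn.Shape((1, 1, 32, 32)), ttnn.bfloat16, ttnn.TILE_LAYOUT, device
    )

diff --git a/tt_eager/tensor/tensor.cpp index 555aa91fc21b..ce56ef79a68b 100644 --- a/tt_eager/tensor/tensor.cpp +++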
b/tt_eager/tensor/tensor.cpp @@ -901,13 +901,13 @@ void memcpy(Tensor& dst, const Tensor& src, const std::optional tra } } -Tensor allocate_tensor_on_device(const Shape& shape, DataType data_type, Layout layout, Device *device, const MemoryConfig& memory_config) { +Tensor allocate_tensor_on_device(const ttnn::Shape& shape, DataType data_type, Layout layout, Device *device, const MemoryConfig& memory_config) { // Top level wrapper to asynchronously create a device tensor (single device) Tensor device_tensor = Tensor({device}); uint32_t device_tensor_ref_count = device_tensor.tensor_attributes->record_main_thread_ref_count(); device->push_work( [shape, data_type, layout, device, memory_config, device_tensor] () mutable { - auto local_tensor = create_device_tensor(shape, data_type, layout, device, memory_config); + auto local_tensor = create_device_tensor(shape.value(), data_type, layout, device, memory_config); device_tensor.populate_buffers_and_metadata(local_tensor); } ); @@ -915,7 +915,7 @@ Tensor allocate_tensor_on_device(const Shape& shape, DataType data_type, Layout return device_tensor; } -Tensor allocate_tensor_on_device(const Shape& shape, DataType data_type, Layout layout, DeviceMesh *device_mesh, const MemoryConfig& memory_config) { +Tensor allocate_tensor_on_device(const ttnn::Shape& shape, DataType data_type, Layout layout, DeviceMesh *device_mesh, const MemoryConfig& memory_config) { // Top level wrapper to asynchronously create a device tensor (multi-device) Tensor device_tensor = Tensor(device_mesh->get_devices()); uint32_t device_tensor_ref_count = device_tensor.tensor_attributes->record_main_thread_ref_count(); @@ -926,7 +926,7 @@ Tensor allocate_tensor_on_device(const Shape& shape, DataType data_type, Layout auto& worker = workers[worker_index]; worker->push_work( [shape, data_type, layout, worker, memory_config, device_tensor, worker_index] () mutable { - auto local_tensor = create_device_tensor(shape, data_type, layout, worker, memory_config); + auto local_tensor = create_device_tensor(shape.value(), data_type, layout, worker, memory_config); insert_buffer_and_shape_for_device(worker, local_tensor, device_tensor, worker_index); if (not worker->id()) { device_tensor.set_shape(ttnn::Shape(shape)); @@ -971,10 +971,10 @@ void write_tensor(Tensor host_tensor, Tensor device_tensor, uint8_t cq_id) { std::visit([&host_data] (auto&& b) { host_data = b.begin(); }, host_storage.get_buffer()); } EnqueueWriteBuffer(worker->command_queue(cq_id), s.get_buffer(), host_data, false); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { auto host_storage = std::get(async_safe_tensor.get_storage()); std::visit([worker_index, &host_data] (auto&& b) { host_data = b.begin(); }, host_storage.get_buffer(worker_index)); - EnqueueWriteBuffer(worker->command_queue(cq_id), s.get_buffer(worker), host_data, false); + EnqueueWriteBuffer(worker->command_queue(cq_id), s.get_buffer_for_device(worker), host_data, false); } }, device_tensor.get_storage()); } diff --git a/tt_eager/tensor/tensor.hpp b/tt_eager/tensor/tensor.hpp index bef6f2e2b62d..afad4bab1d9f 100644 --- a/tt_eager/tensor/tensor.hpp +++ b/tt_eager/tensor/tensor.hpp @@ -354,8 +354,8 @@ void memcpy(void *dst, const Tensor &src, const std::optional trans void memcpy(Tensor &dst, const void *src, const std::optional transfer_size = std::nullopt); void memcpy(Tensor &dst, const Tensor &src, const std::optional transfer_size = std::nullopt); -Tensor allocate_tensor_on_device(const Shape& shape, DataType data_type, 
Layout layout, Device *device, const MemoryConfig& memory_config = {.memory_layout=tt::tt_metal::TensorMemoryLayout::INTERLEAVED}); -Tensor allocate_tensor_on_device(const Shape& shape, DataType data_type, Layout layout, DeviceMesh *device_mesh, const MemoryConfig& memory_config = {.memory_layout=tt::tt_metal::TensorMemoryLayout::INTERLEAVED}); +Tensor allocate_tensor_on_device(const ttnn::Shape& shape, DataType data_type, Layout layout, Device *device, const MemoryConfig& memory_config = {.memory_layout=tt::tt_metal::TensorMemoryLayout::INTERLEAVED}); +Tensor allocate_tensor_on_device(const ttnn::Shape& shape, DataType data_type, Layout layout, DeviceMesh *device_mesh, const MemoryConfig& memory_config = {.memory_layout=tt::tt_metal::TensorMemoryLayout::INTERLEAVED}); void write_tensor(Tensor host_tensor, Tensor device_tensor, uint8_t cq_id = 0); } // namespace tt_metal diff --git a/tt_eager/tensor/types.hpp b/tt_eager/tensor/types.hpp index 0332081201ad..2a6ff9c64c35 100644 --- a/tt_eager/tensor/types.hpp +++ b/tt_eager/tensor/types.hpp @@ -426,7 +426,7 @@ struct MultiDeviceHostStorage { return shapes[shape_index]; } - uint32_t num_buffers() { + uint32_t num_buffers() const { std::lock_guard lock(mtx); return buffers.size(); } @@ -524,7 +524,7 @@ struct MultiDeviceHostStorage { return shapes.at(device->id()); } - uint32_t num_buffers() { + uint32_t num_buffers() const { std::lock_guard lock(mtx); return buffers.size(); } diff --git a/tt_eager/tt_dnn/op_library/conv/optimized_conv_op.cpp b/tt_eager/tt_dnn/op_library/conv/optimized_conv_op.cpp index 4c1de9d03be1..c14aee74ad6c 100644 --- a/tt_eager/tt_dnn/op_library/conv/optimized_conv_op.cpp +++ b/tt_eager/tt_dnn/op_library/conv/optimized_conv_op.cpp @@ -72,28 +72,37 @@ Tensor optimized_conv(const Tensor& a, bool transpose_mcast, std::optional compute_kernel_config ) { - //TT_ASSERT(!untilize_out, "Optimized conv only supports tiled out"); - TT_ASSERT(b.get_layout() == Layout::TILE); // Weights should already be formatted - const auto& ashape = input_tensor_shape.has_value() ? Shape(input_tensor_shape.value()) : a.get_legacy_shape(); - auto padded_a_shape = Shape({ashape[0], ashape[1], ashape[2], round_up(ashape[3], 16)}); - FormatParams input_a_format_params = {.pad_shape=padded_a_shape, .pad_value=0.0, .target_layout=Layout::ROW_MAJOR}; - FormatParams input_b_format_params = {.pad_shape=b.get_legacy_shape(), .pad_value=0.0, .target_layout=Layout::TILE}; - FormatParams input_bias_format_params = {}; - if (has_bias) { - input_bias_format_params = {.pad_shape=bias.value().get_legacy_shape(), .pad_value=0, .target_layout=Layout::TILE}; - } - auto output_layout = untilize_out ? Layout::ROW_MAJOR : Layout::TILE; - if (output_mem_config.has_value()) { - TT_ASSERT((output_mem_config.value().is_sharded() || output_mem_config.value().memory_layout == TensorMemoryLayout::INTERLEAVED)); - } - auto arch = a.storage_type() == StorageType::DEVICE ? a.device()->arch() : AutoFormat::GetDefaultDevice()->arch(); - bool fp32_accum = a.device()->arch() == ARCH::WORMHOLE_B0; // && compute_kernel_config.has_value()) ? 
compute_kernel_config.value().fp32_dest_acc_en : false; - auto kernel_config_val = init_device_compute_kernel_config(arch, compute_kernel_config, MathFidelity::LoFi, true, fp32_accum, false); - return operation::run_without_autoformat( - OptimizedConv(conv_params, output_channels, untilize_out, has_bias, fuse_relu, math_fidelity, parallelization_config, block_config, extra_padding_for_32B_alignment, output_mem_config.value_or(a.memory_config()), output_dtype.value_or(a.get_dtype()), ashape, use_shallow_conv_variant, transpose_mcast, kernel_config_val - ), - {a, b}, - {bias, conv_reader_indices}).at(0); + std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({a, b}))}; + operation::launch_op( + [conv_params, output_channels, untilize_out, has_bias, fuse_relu, math_fidelity, parallelization_config, block_config, extra_padding_for_32B_alignment, output_mem_config, output_dtype, input_tensor_shape, use_shallow_conv_variant, transpose_mcast, compute_kernel_config] + (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { + auto& a = input_tensors.at(0); + auto& b = input_tensors.at(1); + auto& bias = optional_input_tensors.at(0); + //TT_ASSERT(!untilize_out, "Optimized conv only supports tiled out"); + TT_ASSERT(b.get_layout() == Layout::TILE); // Weights should already be formatted + const auto& ashape = input_tensor_shape.has_value() ? Shape(input_tensor_shape.value()) : a.get_legacy_shape(); + auto padded_a_shape = Shape({ashape[0], ashape[1], ashape[2], round_up(ashape[3], 16)}); + FormatParams input_a_format_params = {.pad_shape=padded_a_shape, .pad_value=0.0, .target_layout=Layout::ROW_MAJOR}; + FormatParams input_b_format_params = {.pad_shape=b.get_legacy_shape(), .pad_value=0.0, .target_layout=Layout::TILE}; + FormatParams input_bias_format_params = {}; + if (has_bias) { + input_bias_format_params = {.pad_shape=bias.value().get_legacy_shape(), .pad_value=0, .target_layout=Layout::TILE}; + } + auto output_layout = untilize_out ? Layout::ROW_MAJOR : Layout::TILE; + if (output_mem_config.has_value()) { + TT_ASSERT((output_mem_config.value().is_sharded() || output_mem_config.value().memory_layout == TensorMemoryLayout::INTERLEAVED)); + } + auto arch = a.storage_type() == StorageType::DEVICE ? a.device()->arch() : AutoFormat::GetDefaultDevice()->arch(); + bool fp32_accum = a.device()->arch() == ARCH::WORMHOLE_B0; // && compute_kernel_config.has_value()) ? 
compute_kernel_config.value().fp32_dest_acc_en : false; + auto kernel_config_val = init_device_compute_kernel_config(arch, compute_kernel_config, MathFidelity::LoFi, true, fp32_accum, false); + return operation::run_without_autoformat( + OptimizedConv(conv_params, output_channels, untilize_out, has_bias, fuse_relu, math_fidelity, parallelization_config, block_config, extra_padding_for_32B_alignment, output_mem_config.value_or(a.memory_config()), output_dtype.value_or(a.get_dtype()), ashape, use_shallow_conv_variant, transpose_mcast, kernel_config_val + ), + input_tensors, + optional_input_tensors); + }, {a, b}, output_tensors, {bias, conv_reader_indices}); + return output_tensors.at(0); } void OptimizedConv::validate(const std::vector& input_tensors, const std::vector>& optional_input_tensors) const { diff --git a/tt_eager/tt_dnn/op_library/copy/copy_op.cpp b/tt_eager/tt_dnn/op_library/copy/copy_op.cpp index 03816e5bb8a6..afcb50acf2c5 100644 --- a/tt_eager/tt_dnn/op_library/copy/copy_op.cpp +++ b/tt_eager/tt_dnn/op_library/copy/copy_op.cpp @@ -82,14 +82,14 @@ tt::stl::reflection::Attributes Copy::attributes() const { } Tensor copy(const Tensor& src_tensor, const Tensor& dst_tensor) { - std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({src_tensor}))}; + std::vector dummy_outputs = {Tensor(operation::get_workers_for_op_output({src_tensor}))}; operation::launch_op( - [] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { - const auto& src_tensor = input_tensors.at(0); - const auto& dst_tensor = input_tensors.at(1); - operation::run(Copy{dst_tensor.memory_config(), dst_tensor.get_dtype()}, {src_tensor, dst_tensor}); - return {dst_tensor}; - }, {src_tensor, dst_tensor}, output_tensors); + [] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { + auto& src_tensor = input_tensors.at(0); + auto& dst_tensor = optional_output_tensors.at(0).value(); + operation::run(Copy{dst_tensor.memory_config(), dst_tensor.get_dtype()}, {src_tensor, dst_tensor}); + return {}; + }, {src_tensor}, dummy_outputs, {}, {dst_tensor}); return dst_tensor; } diff --git a/tt_eager/tt_dnn/op_library/downsample/downsample_op.cpp b/tt_eager/tt_dnn/op_library/downsample/downsample_op.cpp index 8a8988782ed2..01279831d4e1 100644 --- a/tt_eager/tt_dnn/op_library/downsample/downsample_op.cpp +++ b/tt_eager/tt_dnn/op_library/downsample/downsample_op.cpp @@ -80,7 +80,12 @@ operation::ProgramWithCallbacks Downsample::create_program(const std::vector downsample_params, std::optional output_dtype) { - return operation::run_without_autoformat(Downsample{downsample_params, output_dtype.value_or(input_tensor_a.get_dtype())}, {input_tensor_a}).at(0); + std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor_a}))}; + operation::launch_op( + [downsample_params, output_dtype] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { + return operation::run_without_autoformat(Downsample{downsample_params, output_dtype.value_or(input_tensors.at(0).get_dtype())}, input_tensors); + }, {input_tensor_a}, output_tensors); + return output_tensors.at(0); } struct DownsampleReadPatternParams { diff --git a/tt_eager/tt_dnn/op_library/move/move_op.hpp b/tt_eager/tt_dnn/op_library/move/move_op.hpp index 
2b8ab0709a1b..eaebf156f0a8 100644 --- a/tt_eager/tt_dnn/op_library/move/move_op.hpp +++ b/tt_eager/tt_dnn/op_library/move/move_op.hpp @@ -114,36 +114,41 @@ inline Tensor move(const Tensor& input_tensor, const std::optional } inline Tensor move_sharded(const Tensor& input_tensor, const std::optional& mem_config) { - TT_ASSERT(input_tensor.is_allocated(), "Expected input tensor to be allocated"); - auto input_mem_config = input_tensor.memory_config(); - TT_FATAL(input_mem_config.is_sharded(), "Expected input tensor to be sharded"); - auto input_address = input_tensor.buffer()->address(); - auto output_mem_config = mem_config.value_or(input_mem_config); - TT_FATAL(output_mem_config.is_sharded(), "Expected output tensor memory config to be sharded"); - if (not move_op_utils::can_deallocate(input_tensor)) { - TT_FATAL(false, "Expect input tensor to be deallocated after move op. Cannot deallocate before there is probably another consumer."); - // TODO: Should this throw error? - return input_tensor; - } - auto shard_spec = input_tensor.shard_spec().value(); - auto shard_shape = shard_spec.shape; - auto shard_grid = shard_spec.grid; - auto input_shape = input_tensor.get_legacy_shape(); - auto input_dtype = input_tensor.get_dtype(); - auto input_layout = input_tensor.get_layout(); - - DeallocateBuffer(*input_tensor.buffer()); - // log_debug(LogOp, "OUTPUT SHARD SPEC: {}", out_shard_spec); - auto shard_mem_config = output_mem_config; - shard_mem_config.shard_spec = shard_spec; - auto output_tensor = create_device_tensor(input_shape, input_dtype, input_layout, input_tensor.device(), shard_mem_config); - if (input_tensor.buffer()->address() == output_tensor.buffer()->address()) { - tt::log_debug(tt::LogOp, "WARNING: No space to move the tensor. Move op's input address and output address are equal: {}", input_address); - return output_tensor; - } - MoveOpParallelizationStrategy move_op_parallelization_strategy = MoveOpParallelizationStrategy::MULTI_CORE_SHARDED; - auto output = operation::run(Move{output_mem_config, move_op_parallelization_strategy}, {input_tensor, output_tensor}).at(0); - return output; + std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; + operation::launch_op( + [mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { + auto& input_tensor = input_tensors.at(0); + TT_ASSERT(input_tensor.is_allocated(), "Expected input tensor to be allocated"); + auto input_mem_config = input_tensor.memory_config(); + TT_FATAL(input_mem_config.is_sharded(), "Expected input tensor to be sharded"); + auto input_address = input_tensor.buffer()->address(); + auto output_mem_config = mem_config.value_or(input_mem_config); + TT_FATAL(output_mem_config.is_sharded(), "Expected output tensor memory config to be sharded"); + if (not move_op_utils::can_deallocate(input_tensor)) { + TT_FATAL(false, "Expect input tensor to be deallocated after move op. Cannot deallocate before there is probably another consumer."); + // TODO: Should this throw error? 
+ return {input_tensor}; + } + auto shard_spec = input_tensor.shard_spec().value(); + auto shard_shape = shard_spec.shape; + auto shard_grid = shard_spec.grid; + auto input_shape = input_tensor.get_legacy_shape(); + auto input_dtype = input_tensor.get_dtype(); + auto input_layout = input_tensor.get_layout(); + + DeallocateBuffer(*input_tensor.buffer()); + // log_debug(LogOp, "OUTPUT SHARD SPEC: {}", out_shard_spec); + auto shard_mem_config = output_mem_config; + shard_mem_config.shard_spec = shard_spec; + auto output_tensor = create_device_tensor(input_shape, input_dtype, input_layout, input_tensor.device(), shard_mem_config); + if (input_tensor.buffer()->address() == output_tensor.buffer()->address()) { + tt::log_debug(tt::LogOp, "WARNING: No space to move the tensor. Move op's input address and output address are equal: {}", input_address); + return {output_tensor}; + } + MoveOpParallelizationStrategy move_op_parallelization_strategy = MoveOpParallelizationStrategy::MULTI_CORE_SHARDED; + return operation::run(Move{output_mem_config, move_op_parallelization_strategy}, {input_tensor, output_tensor}); + }, {input_tensor}, output_tensors); + return output_tensors.at(0); } } // namespace tt_metal diff --git a/tt_eager/tt_dnn/op_library/nlp_tms/nlp_tms.cpp b/tt_eager/tt_dnn/op_library/nlp_tms/nlp_tms.cpp index cfa35d0ccf31..0be4d0d73dfc 100644 --- a/tt_eager/tt_dnn/op_library/nlp_tms/nlp_tms.cpp +++ b/tt_eager/tt_dnn/op_library/nlp_tms/nlp_tms.cpp @@ -485,7 +485,7 @@ std::vector NlpKVCacheLoadSlice::create_output_tensors(const std::vector auto mem_config = tt::tt_metal::MemoryConfig{TensorMemoryLayout::HEIGHT_SHARDED, BufferType::L1}; mem_config.shard_spec = shard_spec; - return {create_sharded_device_tensor( + return {create_device_tensor( this->compute_output_shapes(input_tensors).at(0), input_tensor_a.get_dtype(), input_tensor_a.get_layout(), diff --git a/tt_eager/tt_dnn/op_library/pool/max_pool.cpp b/tt_eager/tt_dnn/op_library/pool/max_pool.cpp index cb8ed3b1a208..241cfe13fe41 100644 --- a/tt_eager/tt_dnn/op_library/pool/max_pool.cpp +++ b/tt_eager/tt_dnn/op_library/pool/max_pool.cpp @@ -199,22 +199,28 @@ Tensor max_pool2d_v2(const Tensor &input, const MemoryConfig& out_mem_config, uint32_t nblocks, bool use_multicore) { - TT_ASSERT(dilation_h == 1 && dilation_w == 1 && "Dilation not yet supported in max_pool2d."); - TT_ASSERT(pad_h < 2 && pad_w < 2 && "Padding > 1 not yet supported."); - TT_ASSERT(stride_h == stride_w && "Stride should be equal for both H and W for now."); - // calculate the H and W dims for output - uint32_t out_h = ((in_h + 2 * pad_h - (dilation_h * kernel_size_h - 1) - 1) / stride_h) + 1; // floor - uint32_t out_w = ((in_w + 2 * pad_w - (dilation_w * kernel_size_w - 1) - 1) / stride_w) + 1; // floor - return operation::run_without_autoformat(MaxPool{in_n, in_h, in_w, - out_h, out_w, - kernel_size_h, kernel_size_w, - stride_h, stride_w, - pad_h, pad_w, - dilation_h, dilation_w, - out_mem_config, - nblocks, - use_multicore}, - {input, reader_indices}).at(0); + std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input, reader_indices}))}; + operation::launch_op( + [in_n, in_h, in_w, kernel_size_h, kernel_size_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, out_mem_config, nblocks, use_multicore] + (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { + TT_ASSERT(dilation_h == 1 && dilation_w == 1 && "Dilation not yet supported in 
max_pool2d."); + TT_ASSERT(pad_h < 2 && pad_w < 2 && "Padding > 1 not yet supported."); + TT_ASSERT(stride_h == stride_w && "Stride should be equal for both H and W for now."); + // calculate the H and W dims for output + uint32_t out_h = ((in_h + 2 * pad_h - (dilation_h * kernel_size_h - 1) - 1) / stride_h) + 1; // floor + uint32_t out_w = ((in_w + 2 * pad_w - (dilation_w * kernel_size_w - 1) - 1) / stride_w) + 1; // floor + return operation::run_without_autoformat(MaxPool{in_n, in_h, in_w, + out_h, out_w, + kernel_size_h, kernel_size_w, + stride_h, stride_w, + pad_h, pad_w, + dilation_h, dilation_w, + out_mem_config, + nblocks, + use_multicore}, + input_tensors); + }, {input, reader_indices}, output_tensors); + return output_tensors.at(0); } operation::OpPerformanceModel MaxPool::create_op_performance_model(const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors, const std::vector &output_tensors) const { diff --git a/tt_eager/tt_dnn/op_library/tilize/tilize_op.cpp b/tt_eager/tt_dnn/op_library/tilize/tilize_op.cpp index 72f2e039ae22..6b2280136f63 100644 --- a/tt_eager/tt_dnn/op_library/tilize/tilize_op.cpp +++ b/tt_eager/tt_dnn/op_library/tilize/tilize_op.cpp @@ -197,30 +197,28 @@ Tensor tilize_with_val_padding( bool use_multicore) { // No-op (Will do a tensor copy) // TODO: We need to run asserts before this - if (input_tensor_a.get_layout() == Layout::TILE) { - if (output_tensor_shape == input_tensor_a.get_legacy_shape()) { - log_warning("Perf warning: tilize with padding called on already tilized tensor of target shape."); - return input_tensor_a; - } else { - TT_FATAL(false, "Cannot tilize and pad tensor that is already tilized"); - } - } - if (is_multi_device_tensor(input_tensor_a)) { - return transform(input_tensor_a, [&](const Tensor& tensor) { - return tilize_with_val_padding( - tensor, output_tensor_shape, pad_value, output_mem_config, output_dtype, use_multicore); - }); - } - - return operation::run_without_autoformat( + std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor_a}))}; + operation::launch_op( + [output_tensor_shape, pad_value, output_mem_config, output_dtype, use_multicore] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { + auto& input_tensor_a = input_tensors.at(0); + if (input_tensor_a.get_layout() == Layout::TILE) { + if (output_tensor_shape == input_tensor_a.get_legacy_shape()) { + log_warning("Perf warning: tilize with padding called on already tilized tensor of target shape."); + return {input_tensor_a}; + } else { + TT_FATAL(false, "Cannot tilize and pad tensor that is already tilized"); + } + } + return operation::run_without_autoformat( TilizeWithValPadding{ output_tensor_shape, pad_value, output_mem_config, output_dtype.value_or(input_tensor_a.get_dtype()), use_multicore}, - {input_tensor_a}) - .at(0); + {input_tensor_a}); + }, {input_tensor_a}, output_tensors); + return output_tensors.at(0); } Tensor tilize_with_zero_padding( diff --git a/tt_eager/tt_dnn/op_library/untilize/untilize_op.cpp b/tt_eager/tt_dnn/op_library/untilize/untilize_op.cpp index 54ff45280652..7ca06831cc5c 100644 --- a/tt_eager/tt_dnn/op_library/untilize/untilize_op.cpp +++ b/tt_eager/tt_dnn/op_library/untilize/untilize_op.cpp @@ -230,22 +230,28 @@ UntilizeWithUnpaddingOpParallelizationStrategy UntilizeWithUnpadding::get_parall Tensor untilize_with_unpadding(const Tensor &input_tensor_a, 
const Shape &output_tensor_start, const Shape &output_tensor_end, const MemoryConfig& output_mem_config, bool use_pack_untilize) { // No-op (Will do a tensor copy) // TODO: We need to run asserts before this - const Shape output_tensor_shape = { - output_tensor_end[0] - output_tensor_start[0] + 1, - output_tensor_end[1] - output_tensor_start[1] + 1, - output_tensor_end[2] - output_tensor_start[2] + 1, - output_tensor_end[3] - output_tensor_start[3] + 1, - }; - if (input_tensor_a.get_layout() != Layout::TILE) { - if (input_tensor_a.get_legacy_shape() == output_tensor_shape) { - log_warning("Perf warning: Untilize with unpadding called on already untilized tensor of target shape"); - return AutoFormat::move_tensor_to_mem_config(input_tensor_a, output_mem_config); - } else { - TT_FATAL(false, "Cannot untilize and unpad input which is not tilized"); - } - } - bool fp32_dest_acc_en = input_tensor_a.get_dtype() == DataType::UINT32; // MT: Currently only uint32 is moved to DST directly, fp32 is converted to fp16b - return operation::run_without_autoformat(UntilizeWithUnpadding{output_tensor_start, output_tensor_end, output_mem_config, use_pack_untilize, fp32_dest_acc_en}, {input_tensor_a}).at(0); + std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor_a}))}; + operation::launch_op( + [output_tensor_start, output_tensor_end, output_mem_config, use_pack_untilize] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { + auto& input_tensor_a = input_tensors.at(0); + const Shape output_tensor_shape = { + output_tensor_end[0] - output_tensor_start[0] + 1, + output_tensor_end[1] - output_tensor_start[1] + 1, + output_tensor_end[2] - output_tensor_start[2] + 1, + output_tensor_end[3] - output_tensor_start[3] + 1, + }; + if (input_tensor_a.get_layout() != Layout::TILE) { + if (input_tensor_a.get_legacy_shape() == output_tensor_shape) { + log_warning("Perf warning: Untilize with unpadding called on already untilized tensor of target shape"); + return {AutoFormat::move_tensor_to_mem_config(input_tensor_a, output_mem_config)}; + } else { + TT_FATAL(false, "Cannot untilize and unpad input which is not tilized"); + } + } + bool fp32_dest_acc_en = input_tensor_a.get_dtype() == DataType::UINT32; // MT: Currently only uint32 is moved to DST directly, fp32 is converted to fp16b + return operation::run_without_autoformat(UntilizeWithUnpadding{output_tensor_start, output_tensor_end, output_mem_config, use_pack_untilize, fp32_dest_acc_en}, {input_tensor_a}); + }, {input_tensor_a}, output_tensors); + return output_tensors.at(0); } } // namespace tt_metal diff --git a/tt_eager/tt_dnn/op_library/untilize/untilize_with_halo_op_v2.cpp b/tt_eager/tt_dnn/op_library/untilize/untilize_with_halo_op_v2.cpp index b151ef322612..a81d5ec19c7e 100644 --- a/tt_eager/tt_dnn/op_library/untilize/untilize_with_halo_op_v2.cpp +++ b/tt_eager/tt_dnn/op_library/untilize/untilize_with_halo_op_v2.cpp @@ -348,20 +348,28 @@ Tensor untilize_with_halo_v2( const MemoryConfig& mem_config, const bool remote_read, const bool transpose_mcast) { - TT_ASSERT(input_tensor.memory_config().is_sharded()); - TT_ASSERT(input_tensor.memory_config().memory_layout == TensorMemoryLayout::HEIGHT_SHARDED || input_tensor.memory_config().memory_layout == TensorMemoryLayout::BLOCK_SHARDED); - // NOTE: for HEIGHT_SHARDED, ncores_nhw == ncores - // for BLOCK_SHARDED, ncores_nhw is just the ncores along height dim (last tensor dim is split along 
width) - - return operation::run_without_autoformat( - UntilizeWithHaloV2{pad_val, ncores_nhw, max_out_nsticks_per_core, mem_config, remote_read, transpose_mcast}, - { - input_tensor, - padding_config, - local_config, - remote_config, - }) - .at(0); + std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor, padding_config, local_config, remote_config}))}; + operation::launch_op( + [pad_val, ncores_nhw, max_out_nsticks_per_core, mem_config, remote_read, transpose_mcast] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { + auto& input_tensor = input_tensors.at(0); + auto& padding_config = input_tensors.at(1); + auto& local_config = input_tensors.at(2); + auto& remote_config = input_tensors.at(3); + TT_ASSERT(input_tensor.memory_config().is_sharded()); + TT_ASSERT(input_tensor.memory_config().memory_layout == TensorMemoryLayout::HEIGHT_SHARDED || input_tensor.memory_config().memory_layout == TensorMemoryLayout::BLOCK_SHARDED); + // NOTE: for HEIGHT_SHARDED, ncores_nhw == ncores + // for BLOCK_SHARDED, ncores_nhw is just the ncores along height dim (last tensor dim is split along width) + + return operation::run_without_autoformat( + UntilizeWithHaloV2{pad_val, ncores_nhw, max_out_nsticks_per_core, mem_config, remote_read, transpose_mcast}, + { + input_tensor, + padding_config, + local_config, + remote_config, + }); + }, {input_tensor, padding_config, local_config, remote_config}, output_tensors); + return output_tensors.at(0); } } // namespace tt_metal diff --git a/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp b/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp index e555c1adaf6c..778f9b1bbae0 100644 --- a/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp +++ b/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp @@ -12,6 +12,7 @@ #include "tt_metal/detail/reports/compilation_reporter.hpp" #include "tt_metal/detail/reports/memory_reporter.hpp" #include "tt_metal/detail/tt_metal.hpp" +#include "tt_metal/impl/trace/trace.hpp" #include "tt_metal/tools/profiler/op_profiler.hpp" #include "type_caster.hpp" @@ -204,17 +205,44 @@ void DeviceModule(py::module &m_device) { m_device.def("DeallocateBuffers", &detail::DeallocateBuffers, R"doc( Deallocate all buffers associated with Device handle )doc"); - m_device.def("BeginTraceCapture", &BeginTraceCapture, R"doc( + m_device.def("BeginTraceCapture", + [] (Device* device, const uint8_t cq_id, const uint32_t trace_buff_size) { + uint32_t tid = Trace::next_id(); + device->push_work([device, cq_id, tid, trace_buff_size] () mutable { + device->begin_trace(cq_id, tid, trace_buff_size); + }); + return tid; + }, R"doc( Begin trace capture on Device handle )doc"); - m_device.def("EndTraceCapture", &EndTraceCapture, R"doc( + m_device.def("EndTraceCapture", + [] (Device* device, const uint8_t cq_id, const uint32_t tid) { + device->push_work([device, cq_id, tid] () mutable { + device->end_trace(cq_id, tid); + }); + }, R"doc( End trace capture on Device handle )doc"); - m_device.def("ReplayTrace", &ReplayTrace, R"doc( - Replay last captured trace on Device handle + m_device.def("ReplayTrace", + [] (Device* device, const uint8_t cq_id, const uint32_t tid, bool blocking) { + // If blocking, ensure that worker thread blocks until trace is completed + device->push_work([device, cq_id, tid, blocking] { + device->replay_trace(cq_id, tid, blocking); + }); + // If blocking, wait until worker threads have completed + if (blocking) { + device->synchronize(); + } + }, R"doc( + Replay 
captured trace on Device handle )doc"); - m_device.def("ReleaseTrace", &ReleaseTrace, R"doc( - Release last captured Trace on Device handle + m_device.def("ReleaseTrace", + [] (Device* device, const uint32_t tid) { + device->push_work([device, tid] { + device->release_trace(tid); + }); + }, R"doc( + Release captured Trace on Device handle )doc"); m_device.attr("DEFAULT_L1_SMALL_SIZE") = py::int_(DEFAULT_L1_SMALL_SIZE); diff --git a/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor.cpp b/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor.cpp index a7b2ec5b2636..c764a866220f 100644 --- a/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor.cpp +++ b/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor.cpp @@ -772,7 +772,7 @@ void TensorModule(py::module &m_tensor) { m_tensor.def( "allocate_tensor_on_device", - py::overload_cast(&allocate_tensor_on_device), + py::overload_cast(&allocate_tensor_on_device), py::arg("shape"), py::arg("dtype"), py::arg("layout"), py::arg("device"), py::arg("memory_config") = MemoryConfig{.memory_layout=TensorMemoryLayout::INTERLEAVED}, R"doc( Allocate a tensor with specified attributes on a device. @@ -781,7 +781,7 @@ void TensorModule(py::module &m_tensor) { m_tensor.def( "allocate_tensor_on_device", - py::overload_cast(&allocate_tensor_on_device), + py::overload_cast(&allocate_tensor_on_device), py::arg("shape"), py::arg("dtype"), py::arg("layout"), py::arg("device"), py::arg("memory_config") = MemoryConfig{.memory_layout=TensorMemoryLayout::INTERLEAVED}, R"doc( Allocate a tensor with specified attributes on a device. diff --git a/ttnn/cpp/pybind11/device.hpp b/ttnn/cpp/pybind11/device.hpp index 99e1db36730d..a81f07d5506f 100644 --- a/ttnn/cpp/pybind11/device.hpp +++ b/ttnn/cpp/pybind11/device.hpp @@ -27,6 +27,7 @@ void py_module(py::module& module) { module.def("enable_program_cache", &ttnn::enable_program_cache, py::arg("device"), py::kw_only()); module.def("disable_and_clear_program_cache", &ttnn::disable_and_clear_program_cache, py::arg("device"), py::kw_only()); + } } // namespace device diff --git a/ttnn/cpp/pybind11/operations/core.hpp b/ttnn/cpp/pybind11/operations/core.hpp index d045816bdc7b..76b9985305e2 100644 --- a/ttnn/cpp/pybind11/operations/core.hpp +++ b/ttnn/cpp/pybind11/operations/core.hpp @@ -130,6 +130,106 @@ Deallocates device tensor and returns a reallocated tensor * :attr:`input_tensor`: Input Tensor )doc"); + module.def( + "allocate_tensor_on_device", + py::overload_cast&>( + &ttnn::operations::core::allocate_tensor_on_device), + py::arg("shape"), + py::arg("dtype"), + py::arg("layout"), + py::arg("device"), + py::arg("memory_config") = std::nullopt); + + module.def( + "allocate_tensor_on_device", + py::overload_cast&>( + &ttnn::operations::core::allocate_tensor_on_device), + py::arg("shape"), + py::arg("dtype"), + py::arg("layout"), + py::arg("device_mesh"), + py::arg("memory_config") = std::nullopt); + + module.def("copy_host_to_device_tensor", &ttnn::operations::core::copy_host_to_device_tensor, py::arg("host_tensor"), py::arg("device_tensor"), py::arg("cq_id") = 0); + + module.def( + "begin_trace_capture", + py::overload_cast( + &ttnn::operations::core::begin_trace_capture + ), + py::arg("device"), + py::kw_only(), + py::arg("trace_buffer_size"), + py::arg("cq_id") = 0); + + module.def( + "end_trace_capture", + py::overload_cast( + &ttnn::operations::core::end_trace_capture + ), + py::arg("device"), + py::arg("trace_id"), + py::kw_only(), + py::arg("cq_id") = 0); + + module.def( + "execute_trace", + py::overload_cast( + &ttnn::operations::core::execute_trace + ), 
+ py::arg("device"), + py::arg("trace_id"), + py::kw_only(), + py::arg("cq_id") = 0, + py::arg("blocking") = true); + + module.def( + "release_trace", + py::overload_cast( + &ttnn::operations::core::release_trace + ), + py::arg("device"), + py::arg("trace_id")); + + module.def( + "begin_trace_capture", + py::overload_cast( + &ttnn::operations::core::begin_trace_capture + ), + py::arg("device_mesh"), + py::kw_only(), + py::arg("trace_buffer_size"), + py::arg("cq_id") = 0); + + module.def( + "end_trace_capture", + py::overload_cast( + &ttnn::operations::core::end_trace_capture + ), + py::arg("device_mesh"), + py::arg("trace_id"), + py::kw_only(), + py::arg("cq_id") = 0); + + module.def( + "execute_trace", + py::overload_cast( + &ttnn::operations::core::execute_trace + ), + py::arg("device_mesh"), + py::arg("trace_id"), + py::kw_only(), + py::arg("cq_id") = 0, + py::arg("blocking") = true); + + module.def( + "release_trace", + py::overload_cast( + &ttnn::operations::core::release_trace + ), + py::arg("device_mesh"), + py::arg("trace_id")); + bind_registered_operation( module, ttnn::to_layout, diff --git a/ttnn/cpp/ttnn/operations/core.hpp b/ttnn/cpp/ttnn/operations/core.hpp index 911dd40b3b04..9eaf1e9d6cbd 100644 --- a/ttnn/cpp/ttnn/operations/core.hpp +++ b/ttnn/cpp/ttnn/operations/core.hpp @@ -14,6 +14,7 @@ #include "tt_eager/tt_dnn/op_library/tilize/tilize_op.hpp" #include "tt_eager/tt_dnn/op_library/untilize/untilize_op.hpp" #include "tt_metal/impl/dispatch/command_queue.hpp" +#include "tt_metal/impl/trace/trace.hpp" #include "ttnn/core.hpp" #include "ttnn/decorators.hpp" #include "ttnn/op_library/to_layout/to_layout_op.hpp" @@ -165,6 +166,20 @@ inline ttnn::Tensor to_device( return tensor.to(device_mesh, memory_config.value_or(ttnn::DRAM_MEMORY_CONFIG)); } +inline ttnn::Tensor allocate_tensor_on_device( + const Shape& shape, DataType data_type, Layout layout, Device *device, const std::optional& memory_config) { + return tt::tt_metal::allocate_tensor_on_device(shape, data_type, layout, device, memory_config.value_or(ttnn::DRAM_MEMORY_CONFIG)); +} + +inline ttnn::Tensor allocate_tensor_on_device( + const Shape& shape, DataType data_type, Layout layout, DeviceMesh *device_mesh, const std::optional& memory_config) { + return tt::tt_metal::allocate_tensor_on_device(shape, data_type, layout, device_mesh, memory_config.value_or(ttnn::DRAM_MEMORY_CONFIG)); +} + +inline void copy_host_to_device_tensor(ttnn::Tensor host_tensor, ttnn::Tensor device_tensor, uint8_t cq_id = 0) { + tt::tt_metal::write_tensor(host_tensor, device_tensor, cq_id); +} + inline ttnn::Tensor from_device(const ttnn::Tensor& tensor, bool blocking = true) { return tensor.cpu(blocking); } inline void deallocate(Tensor& tensor, bool force = true) { tensor.deallocate(force); } @@ -177,6 +192,95 @@ inline Tensor reallocate(const Tensor& input_tensor, const std::optionalpush_work( + [device, trace_buff_size, cq_id, tid] () mutable { + device->begin_trace(cq_id, tid, trace_buff_size); + }); + return tid; +} + +inline void end_trace_capture(Device* device, const uint32_t tid, const uint8_t cq_id) { + device->push_work( + [device, cq_id, tid] () mutable { + device->end_trace(cq_id, tid); + } + ); +} + +inline void execute_trace(Device* device, const uint32_t tid, const uint8_t cq_id, bool blocking) { + // If blocking, ensure that worker thread blocks until trace is completed + device->push_work( + [device, cq_id, tid, blocking] () mutable { + device->replay_trace(cq_id, tid, blocking); + } + ); + // If blocking, wait until worker 
threads have completed + if (blocking) { + device->synchronize(); + } +} + +inline void release_trace(Device* device, const uint32_t tid) { + device->push_work( + [device, tid] () mutable { + device->release_trace(tid); + } + ); +} + +// Trace APIs - Multi Device +inline uint32_t begin_trace_capture(DeviceMesh* device, const uint32_t trace_buff_size, const uint8_t cq_id = 0) { + auto workers = device->get_devices(); + uint32_t tid = Trace::next_id(); + for (auto& worker : workers) { + worker->push_work( + [worker, trace_buff_size, cq_id, tid] () mutable { + worker->begin_trace(cq_id, tid, trace_buff_size); + }); + } + return tid; +} + +inline void end_trace_capture(DeviceMesh* device, const uint32_t tid, const uint8_t cq_id = 0) { + auto workers = device->get_devices(); + for (auto& worker : workers) { + worker->push_work( + [worker, cq_id, tid] () mutable { + worker->end_trace(cq_id, tid); + }); + } +} + +inline void execute_trace(DeviceMesh* device, const uint32_t tid, const uint8_t cq_id = 0, bool blocking = true) { + auto workers = device->get_devices(); + // If blocking, ensure that each worker thread blocks until device-local trace is completed + for (auto& worker : workers) { + worker->push_work( + [worker, cq_id, tid, blocking] () mutable { + worker->replay_trace(cq_id, tid, blocking); + }); + } + // If blocking, wait until worker threads have completed + if (blocking) { + for (auto& worker : workers) { + worker->synchronize(); + } + } +} + +inline void release_trace(DeviceMesh* device, const uint32_t tid) { + auto workers = device->get_devices(); + for (auto& worker : workers) { + worker->push_work( + [worker, tid] () mutable { + worker->release_trace(tid); + }); + } +} + } // namespace core } // namespace operations diff --git a/ttnn/ttnn/__init__.py b/ttnn/ttnn/__init__.py index ee9f70bb26ad..ba4336e11285 100644 --- a/ttnn/ttnn/__init__.py +++ b/ttnn/ttnn/__init__.py @@ -260,6 +260,12 @@ def manage_config(name, value): squeeze, clone, as_tensor, + allocate_tensor_on_device, + copy_host_to_device_tensor, + begin_trace_capture, + end_trace_capture, + execute_trace, + release_trace, ) from ttnn.operations.matmul import ( diff --git a/ttnn/ttnn/operations/core.py b/ttnn/ttnn/operations/core.py index aea2c429fff2..386c169d227c 100644 --- a/ttnn/ttnn/operations/core.py +++ b/ttnn/ttnn/operations/core.py @@ -454,6 +454,13 @@ def _golden_function(tensor, *args, **kwargs): doc=doc, )(ttnn._ttnn.operations.core.from_device) +allocate_tensor_on_device = ttnn.register_operation( + name="ttnn.allocate_tensor_on_device", +)(ttnn._ttnn.operations.core.allocate_tensor_on_device) + +copy_host_to_device_tensor = ttnn.register_operation( + name="ttnn.copy_host_to_device_tensor", +)(ttnn._ttnn.operations.core.copy_host_to_device_tensor) doc = """ deallocate(tensor: ttnn.Tensor, force: bool = True) -> None @@ -661,4 +668,20 @@ def dispatch_to_device_on_load(device) -> bool: return tensor +begin_trace_capture = ttnn.register_operation( + name="ttnn.begin_trace_capture", +)(ttnn._ttnn.operations.core.begin_trace_capture) + +end_trace_capture = ttnn.register_operation( + name="ttnn.end_trace_capture", +)(ttnn._ttnn.operations.core.end_trace_capture) + +execute_trace = ttnn.register_operation( + name="ttnn.execute_trace", +)(ttnn._ttnn.operations.core.execute_trace) + +release_trace = ttnn.register_operation( + name="ttnn.release_trace", +)(ttnn._ttnn.operations.core.release_trace) + __all__ = [] From b55742263f97ffeae90395f51c1efe4afc67c128 Mon Sep 17 00:00:00 2001 From: asaigal Date: Fri, 17 May 2024 
02:39:51 +0000 Subject: [PATCH 31/40] #0: Move ttnn trace APIs to separate trace.py include --- ttnn/ttnn/__init__.py | 11 +++++++---- ttnn/ttnn/operations/core.py | 16 ---------------- ttnn/ttnn/operations/trace.py | 22 ++++++++++++++++++++++ 3 files changed, 29 insertions(+), 20 deletions(-) create mode 100644 ttnn/ttnn/operations/trace.py diff --git a/ttnn/ttnn/__init__.py b/ttnn/ttnn/__init__.py index ba4336e11285..52d3c3f409b8 100644 --- a/ttnn/ttnn/__init__.py +++ b/ttnn/ttnn/__init__.py @@ -262,10 +262,6 @@ def manage_config(name, value): as_tensor, allocate_tensor_on_device, copy_host_to_device_tensor, - begin_trace_capture, - end_trace_capture, - execute_trace, - release_trace, ) from ttnn.operations.matmul import ( @@ -457,6 +453,13 @@ def manage_config(name, value): get_group_norm_cores_accross_channel, ) +from ttnn.operations.trace import ( + begin_trace_capture, + end_trace_capture, + execute_trace, + release_trace, +) + from ttnn.operations.ccl import all_gather from ttnn.operations import transformer diff --git a/ttnn/ttnn/operations/core.py b/ttnn/ttnn/operations/core.py index 386c169d227c..8ef9fc5ca6f6 100644 --- a/ttnn/ttnn/operations/core.py +++ b/ttnn/ttnn/operations/core.py @@ -668,20 +668,4 @@ def dispatch_to_device_on_load(device) -> bool: return tensor -begin_trace_capture = ttnn.register_operation( - name="ttnn.begin_trace_capture", -)(ttnn._ttnn.operations.core.begin_trace_capture) - -end_trace_capture = ttnn.register_operation( - name="ttnn.end_trace_capture", -)(ttnn._ttnn.operations.core.end_trace_capture) - -execute_trace = ttnn.register_operation( - name="ttnn.execute_trace", -)(ttnn._ttnn.operations.core.execute_trace) - -release_trace = ttnn.register_operation( - name="ttnn.release_trace", -)(ttnn._ttnn.operations.core.release_trace) - __all__ = [] diff --git a/ttnn/ttnn/operations/trace.py b/ttnn/ttnn/operations/trace.py new file mode 100644 index 000000000000..fa57a9219c44 --- /dev/null +++ b/ttnn/ttnn/operations/trace.py @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+ +# SPDX-License-Identifier: Apache-2.0 + +import sys + +import ttnn + +THIS_MODULE = sys.modules[__name__] + +__all__ = [] + + +begin_trace_capture = ttnn.register_operation(name="ttnn.begin_trace_capture")( + ttnn._ttnn.operations.core.begin_trace_capture +) + +end_trace_capture = ttnn.register_operation(name="ttnn.end_trace_capture")(ttnn._ttnn.operations.core.end_trace_capture) + +execute_trace = ttnn.register_operation(name="ttnn.execute_trace")(ttnn._ttnn.operations.core.execute_trace) + +release_trace = ttnn.register_operation(name="ttnn.release_trace")(ttnn._ttnn.operations.core.release_trace) From bba24633b7f8e2b87e11c09dfccf525dd7f37a2a Mon Sep 17 00:00:00 2001 From: Almeet Bhullar Date: Wed, 15 May 2024 21:27:10 +0000 Subject: [PATCH 32/40] #8530: Add blackhole folders in tt_metal/hw + compile fixes for ARCH_NAME=blackhole --- module.mk | 4 - tt_metal/common/tt_backend_api_types.cpp | 4 + tt_metal/common/tt_backend_api_types.hpp | 11 - tt_metal/hw/firmware/src/blackhole/noc.c | 485 ++ tt_metal/hw/inc/blackhole/c_tensix_core.h | 448 ++ tt_metal/hw/inc/blackhole/cfg_defines.h | 4159 +++++++++++++++++ tt_metal/hw/inc/blackhole/cmd_params.h | 1015 ++++ tt_metal/hw/inc/blackhole/cmds.def | 43 + tt_metal/hw/inc/blackhole/dev_mem_map.h | 101 + .../hw/inc/blackhole/dev_mem_map_versim.h | 11 + tt_metal/hw/inc/blackhole/dram_address_map.h | 40 + .../hw/inc/blackhole/eth_l1_address_map.h | 96 + tt_metal/hw/inc/blackhole/noc/noc.h | 472 ++ .../blackhole/noc/noc_overlay_parameters.cpp | 3471 ++++++++++++++ .../blackhole/noc/noc_overlay_parameters.h | 872 ++++ .../blackhole/noc/noc_overlay_parameters.hpp | 321 ++ .../hw/inc/blackhole/noc/noc_parameters.h | 362 ++ .../hw/inc/blackhole/noc_nonblocking_api.h | 472 ++ .../hw/inc/blackhole/risc_chip_specific.h | 142 + tt_metal/hw/inc/blackhole/stream_interface.h | 518 ++ tt_metal/hw/inc/blackhole/stream_io_map.h | 203 + tt_metal/hw/inc/blackhole/tdma_xmov.h | 14 + tt_metal/hw/inc/blackhole/tensix.h | 735 +++ tt_metal/hw/inc/blackhole/tensix_types.h | 372 ++ tt_metal/hw/inc/wormhole/epoch_q.h | 55 - tt_metal/hw/inc/wormhole/noc/noc_overlay.c | 22 - tt_metal/hw/inc/wormhole/noc/noc_overlay.h | 476 -- .../wormhole/noc/noc_overlay_parameters.erb | 1458 ------ tt_metal/jit_build/build.cpp | 7 + tt_metal/llrt/tt_cluster.cpp | 15 + tt_metal/llrt/tt_cluster.hpp | 1 - 31 files changed, 14378 insertions(+), 2027 deletions(-) create mode 100644 tt_metal/hw/firmware/src/blackhole/noc.c create mode 100644 tt_metal/hw/inc/blackhole/c_tensix_core.h create mode 100644 tt_metal/hw/inc/blackhole/cfg_defines.h create mode 100644 tt_metal/hw/inc/blackhole/cmd_params.h create mode 100644 tt_metal/hw/inc/blackhole/cmds.def create mode 100644 tt_metal/hw/inc/blackhole/dev_mem_map.h create mode 100644 tt_metal/hw/inc/blackhole/dev_mem_map_versim.h create mode 100644 tt_metal/hw/inc/blackhole/dram_address_map.h create mode 100644 tt_metal/hw/inc/blackhole/eth_l1_address_map.h create mode 100644 tt_metal/hw/inc/blackhole/noc/noc.h create mode 100644 tt_metal/hw/inc/blackhole/noc/noc_overlay_parameters.cpp create mode 100644 tt_metal/hw/inc/blackhole/noc/noc_overlay_parameters.h create mode 100644 tt_metal/hw/inc/blackhole/noc/noc_overlay_parameters.hpp create mode 100644 tt_metal/hw/inc/blackhole/noc/noc_parameters.h create mode 100644 tt_metal/hw/inc/blackhole/noc_nonblocking_api.h create mode 100644 tt_metal/hw/inc/blackhole/risc_chip_specific.h create mode 100644 tt_metal/hw/inc/blackhole/stream_interface.h create mode 100644 tt_metal/hw/inc/blackhole/stream_io_map.h 
create mode 100644 tt_metal/hw/inc/blackhole/tdma_xmov.h create mode 100644 tt_metal/hw/inc/blackhole/tensix.h create mode 100644 tt_metal/hw/inc/blackhole/tensix_types.h delete mode 100644 tt_metal/hw/inc/wormhole/epoch_q.h delete mode 100644 tt_metal/hw/inc/wormhole/noc/noc_overlay.c delete mode 100644 tt_metal/hw/inc/wormhole/noc/noc_overlay.h delete mode 100644 tt_metal/hw/inc/wormhole/noc/noc_overlay_parameters.erb diff --git a/module.mk b/module.mk index e619405ce829..95125643bed2 100644 --- a/module.mk +++ b/module.mk @@ -35,10 +35,6 @@ else $(error Unknown value for CONFIG "$(CONFIG)") endif -ifeq ("$(ARCH_NAME)", "blackhole") -$(error Blackhole support not yet added!) -endif - ifeq ($(TT_METAL_VERSIM_DISABLED),0) UMD_VERSIM_STUB = 0 else diff --git a/tt_metal/common/tt_backend_api_types.cpp b/tt_metal/common/tt_backend_api_types.cpp index b7f7eaddf026..75903e6f9b8d 100644 --- a/tt_metal/common/tt_backend_api_types.cpp +++ b/tt_metal/common/tt_backend_api_types.cpp @@ -10,6 +10,7 @@ std::string tt::get_string(tt::ARCH arch) { case tt::ARCH::GRAYSKULL: return "GRAYSKULL"; break; case tt::ARCH::WORMHOLE: return "WORMHOLE"; break; case tt::ARCH::WORMHOLE_B0: return "WORMHOLE_B0"; break; + case tt::ARCH::BLACKHOLE: return "BLACKHOLE"; break; case tt::ARCH::Invalid: return "Invalid"; break; default: return "Invalid"; break; } @@ -21,6 +22,7 @@ std::string tt::get_string_lowercase(tt::ARCH arch) { case tt::ARCH::GRAYSKULL: return "grayskull"; break; case tt::ARCH::WORMHOLE: return "wormhole"; break; case tt::ARCH::WORMHOLE_B0: return "wormhole_b0"; break; + case tt::ARCH::BLACKHOLE: return "blackhole"; break; case tt::ARCH::Invalid: return "invalid"; break; default: return "invalid"; break; } @@ -37,6 +39,8 @@ tt::ARCH tt::get_arch_from_string(const std::string &arch_str) { arch = tt::ARCH::WORMHOLE; } else if ((arch_str == "wormhole_b0") || (arch_str == "WORMHOLE_B0")) { arch = tt::ARCH::WORMHOLE_B0; + } else if ((arch_str == "blackhole") || (arch_str == "BLACKHOLE")) { + arch = tt::ARCH::BLACKHOLE; } else if ((arch_str == "Invalid") || (arch_str == "INVALID")) { arch = tt::ARCH::Invalid; } else { diff --git a/tt_metal/common/tt_backend_api_types.hpp b/tt_metal/common/tt_backend_api_types.hpp index 963a9e18d027..0d08ad497c3d 100644 --- a/tt_metal/common/tt_backend_api_types.hpp +++ b/tt_metal/common/tt_backend_api_types.hpp @@ -134,17 +134,6 @@ inline constexpr static uint32_t tile_size(const DataFormat &format) { } } -/** - * @brief Device Enums - */ -enum class DEVICE { - JAWBRIDGE = 0, - GRAYSKULL = 1, - WORMHOLE = 2, - WORMHOLE_B0 = 3, - Invalid = 0xFF, -}; - std::string get_string(ARCH arch); std::string get_string_lowercase(ARCH arch); ARCH get_arch_from_string(const std::string &arch_str); diff --git a/tt_metal/hw/firmware/src/blackhole/noc.c b/tt_metal/hw/firmware/src/blackhole/noc.c new file mode 100644 index 000000000000..ce1599e08e34 --- /dev/null +++ b/tt_metal/hw/firmware/src/blackhole/noc.c @@ -0,0 +1,485 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "noc.h"
+#include <stdint.h>
+#include <stdbool.h>
+#include "noc_parameters.h"
+
+////
+
+#ifdef TB_NOC
+
+#include "noc_api_dpi.h"
+
+#else
+
+#define NOC_WRITE_REG(addr, val) ((*((volatile uint32_t*)(noc_get_cmd_buf()*NOC_CMD_BUF_OFFSET+noc_get_active_instance()*NOC_INSTANCE_OFFSET+(addr)))) = (val))
+#define NOC_READ_REG(addr) (*((volatile uint32_t*)(noc_get_cmd_buf()*NOC_CMD_BUF_OFFSET+noc_get_active_instance()*NOC_INSTANCE_OFFSET+(addr))))
+
+#endif
+
+#ifdef ARC_FW_NOC
+#include "arc_fw_noc.h"
+#endif
+
+///
+
+
+static uint32_t active_cmd_buf = 0;
+static uint32_t active_noc_instance = 0;
+
+void noc_set_cmd_buf(uint32_t cmd_buf_id) {
+#ifdef TB_NOC
+    api_set_active_cmd_buf(cmd_buf_id);
+#else
+    active_cmd_buf = cmd_buf_id;
+#endif
+}
+
+uint32_t noc_get_cmd_buf() {
+#ifdef TB_NOC
+    return api_get_active_cmd_buf();
+#else
+    return active_cmd_buf;
+#endif
+}
+
+void noc_set_active_instance(uint32_t noc_id) {
+#ifdef TB_NOC
+    api_set_active_noc_instance(noc_id);
+#else
+    active_noc_instance = noc_id;
+#endif
+}
+
+uint32_t noc_get_active_instance() {
+#ifdef TB_NOC
+    return api_get_active_noc_instance();
+#else
+    return active_noc_instance;
+#endif
+}
+
+
+static void noc_transfer(uint32_t src_coordinate, uint64_t src_addr, uint32_t dst_coordinate, uint64_t dst_addr, uint32_t size, bool linked, bool posted, bool static_vc_alloc, uint32_t static_vc, bool multicast, uint32_t multicast_mode, uint32_t multicast_exclude, bool src_local, uint32_t vc_arb_priority, bool src_include, uint8_t transaction_id) {
+    while (!noc_command_ready());
+    NOC_WRITE_REG(NOC_TARG_ADDR_LO, (uint32_t)(src_addr & 0xFFFFFFFF));
+    NOC_WRITE_REG(NOC_TARG_ADDR_MID, (uint32_t)(src_addr >> 32));
+    NOC_WRITE_REG(NOC_TARG_ADDR_HI, src_coordinate);
+    NOC_WRITE_REG(NOC_RET_ADDR_LO, (uint32_t)(dst_addr & 0xFFFFFFFF));
+    NOC_WRITE_REG(NOC_RET_ADDR_MID, (uint32_t)(dst_addr >> 32));
+    NOC_WRITE_REG(NOC_RET_ADDR_HI, dst_coordinate);
+    NOC_WRITE_REG(NOC_AT_LEN_BE, size);
+    NOC_WRITE_REG(NOC_AT_LEN_BE_1, 0x0);
+    NOC_WRITE_REG(NOC_PACKET_TAG, NOC_PACKET_TAG_TRANSACTION_ID(transaction_id));
+    if (multicast) {
+        NOC_WRITE_REG(NOC_CTRL, (linked ? NOC_CMD_VC_LINKED : 0x0) |
+                      (static_vc_alloc ? NOC_CMD_VC_STATIC : 0x0) |
+                      NOC_CMD_STATIC_VC(static_vc) |
+                      NOC_CMD_PATH_RESERVE |
+                      NOC_CMD_CPY |
+                      (src_local ? NOC_CMD_WR : NOC_CMD_RD) |
+                      NOC_CMD_BRCST_PACKET |
+                      (posted ? 0x0 : NOC_CMD_RESP_MARKED) |
+                      (src_include ? NOC_CMD_BRCST_SRC_INCLUDE : 0x0) |
+                      NOC_CMD_BRCST_XY(multicast_mode));
+        NOC_WRITE_REG(NOC_BRCST_EXCLUDE, multicast_exclude);
+    } else {
+        NOC_WRITE_REG(NOC_CTRL, (linked ? NOC_CMD_VC_LINKED : 0x0) |
+                      (static_vc_alloc ? NOC_CMD_VC_STATIC : 0x0) |
+                      NOC_CMD_STATIC_VC(static_vc) |
+                      NOC_CMD_CPY |
+                      (src_local ? NOC_CMD_WR : NOC_CMD_RD) |
+                      (posted ? 0x0 : NOC_CMD_RESP_MARKED) |
+                      NOC_CMD_ARB_PRIORITY(vc_arb_priority));
+    }
+    NOC_WRITE_REG(NOC_CMD_CTRL, 0x1);
+}
+
+static bool unicast_addr_local(uint32_t noc_coordinate) {
+    bool noc_id_translate_en = (noc_get_cfg_reg(NIU_CFG_0) >> NIU_CFG_0_NOC_ID_TRANSLATE_EN) & 0x1;
+    uint32_t local_node_id = noc_id_translate_en ?
noc_get_cfg_reg(NOC_ID_LOGICAL) : noc_local_node_id(); + uint32_t local_x = (local_node_id & NOC_NODE_ID_MASK); + uint32_t local_y = ((local_node_id>>NOC_ADDR_NODE_ID_BITS) & NOC_NODE_ID_MASK); + return (NOC_UNICAST_COORDINATE_X(noc_coordinate) == local_x) && (NOC_UNICAST_COORDINATE_Y(noc_coordinate) == local_y); +} + +//// + +void noc_copy(uint32_t src_coordinate, uint64_t src_addr, uint32_t dst_coordinate, uint64_t dst_addr, uint32_t size, bool linked, bool posted, bool static_vc_alloc, uint32_t static_vc, uint32_t vc_arb_priority, uint8_t transaction_id) { + bool src_local = unicast_addr_local(src_coordinate); + if (!src_local) { + posted = true; + } + noc_transfer(src_coordinate, src_addr, dst_coordinate, dst_addr, size, linked, posted, static_vc_alloc, static_vc, false, 0, 0, src_local, vc_arb_priority, false, transaction_id); +} + +void noc_accumulate(uint32_t src_coordinate, uint64_t src_addr, uint32_t dst_coordinate, uint64_t dst_addr, uint32_t size, bool linked, bool posted, bool static_vc_alloc, uint32_t static_vc, bool multicast, uint32_t multicast_mode, uint32_t vc_arb_priority, uint8_t transaction_id, uint8_t data_format, bool disable_saturation) { + bool src_local = unicast_addr_local(src_coordinate); + if (!src_local) { + posted = true; + } + while (!noc_command_ready()); + NOC_WRITE_REG(NOC_TARG_ADDR_LO, (uint32_t)(src_addr & 0xFFFFFFFF)); + NOC_WRITE_REG(NOC_TARG_ADDR_MID, (uint32_t)(src_addr >> 32)); + NOC_WRITE_REG(NOC_TARG_ADDR_HI, src_coordinate); + NOC_WRITE_REG(NOC_RET_ADDR_LO, (uint32_t)(dst_addr & 0xFFFFFFFF)); + NOC_WRITE_REG(NOC_RET_ADDR_MID, (uint32_t)(dst_addr >> 32)); + NOC_WRITE_REG(NOC_RET_ADDR_HI, dst_coordinate); + NOC_WRITE_REG(NOC_AT_LEN_BE, size); + NOC_WRITE_REG(NOC_AT_LEN_BE_1, 0x0); + NOC_WRITE_REG(NOC_L1_ACC_AT_INSTRN, NOC_AT_INS(NOC_AT_INS_ACC) | NOC_AT_ACC_FORMAT(data_format) | NOC_AT_ACC_SAT_DIS(disable_saturation)); + NOC_WRITE_REG(NOC_PACKET_TAG, NOC_PACKET_TAG_TRANSACTION_ID(transaction_id)); + NOC_WRITE_REG(NOC_BRCST_EXCLUDE, 0x0); + if (multicast) { + NOC_WRITE_REG(NOC_CTRL, (linked ? NOC_CMD_VC_LINKED : 0x0) | + (static_vc_alloc ? NOC_CMD_VC_STATIC : 0x0) | + NOC_CMD_STATIC_VC(static_vc) | + NOC_CMD_PATH_RESERVE | + NOC_CMD_CPY | + NOC_CMD_L1_ACC_AT_EN | + (src_local ? NOC_CMD_WR : NOC_CMD_RD) | + NOC_CMD_BRCST_PACKET | + (posted ? 0x0 : NOC_CMD_RESP_MARKED) | + NOC_CMD_BRCST_XY(multicast_mode)); + NOC_WRITE_REG(NOC_BRCST_EXCLUDE, 0x0); + } else { + NOC_WRITE_REG(NOC_CTRL, (linked ? NOC_CMD_VC_LINKED : 0x0) | + (static_vc_alloc ? NOC_CMD_VC_STATIC : 0x0) | + NOC_CMD_STATIC_VC(static_vc) | + NOC_CMD_CPY | + NOC_CMD_L1_ACC_AT_EN | + (src_local ? NOC_CMD_WR : NOC_CMD_RD) | + (posted ? 
0x0 : NOC_CMD_RESP_MARKED) | + NOC_CMD_ARB_PRIORITY(vc_arb_priority)); + } + NOC_WRITE_REG(NOC_CMD_CTRL, 0x1); +} + +static void transfer_word_be(uint32_t src_coordinate, uint64_t src_addr, uint32_t dst_coordinate, uint64_t dst_addr, uint64_t be, bool linked, bool posted, bool static_vc_alloc, uint32_t static_vc, bool multicast, uint32_t multicast_mode, uint8_t transaction_id) { + while (!noc_command_ready()); + NOC_WRITE_REG(NOC_TARG_ADDR_LO, (uint32_t)(src_addr & 0xFFFFFFFF)); + NOC_WRITE_REG(NOC_TARG_ADDR_MID, (uint32_t)(src_addr >> 32)); + NOC_WRITE_REG(NOC_TARG_ADDR_HI, src_coordinate); + NOC_WRITE_REG(NOC_RET_ADDR_LO, (uint32_t)(dst_addr & 0xFFFFFFFF)); + NOC_WRITE_REG(NOC_RET_ADDR_MID, (uint32_t)(dst_addr >> 32)); + NOC_WRITE_REG(NOC_RET_ADDR_HI, dst_coordinate); + NOC_WRITE_REG(NOC_AT_LEN_BE, (uint32_t)(be & 0xFFFFFFFF)); + NOC_WRITE_REG(NOC_AT_LEN_BE_1, (uint32_t)(be >> 32)); + NOC_WRITE_REG(NOC_PACKET_TAG, NOC_PACKET_TAG_TRANSACTION_ID(transaction_id)); + NOC_WRITE_REG(NOC_BRCST_EXCLUDE, 0x0); + if (multicast) { + NOC_WRITE_REG(NOC_CTRL, (linked ? NOC_CMD_VC_LINKED : 0x0) | + (static_vc_alloc ? NOC_CMD_VC_STATIC : 0x0) | + NOC_CMD_STATIC_VC(static_vc) | + NOC_CMD_PATH_RESERVE | + NOC_CMD_WR | + NOC_CMD_WR_BE | + NOC_CMD_BRCST_PACKET | + (posted ? 0x0 : NOC_CMD_RESP_MARKED) | + NOC_CMD_BRCST_XY(multicast_mode)); + NOC_WRITE_REG(NOC_BRCST_EXCLUDE, 0x0); + } else { + NOC_WRITE_REG(NOC_CTRL, (linked ? NOC_CMD_VC_LINKED : 0x0) | + (static_vc_alloc ? NOC_CMD_VC_STATIC : 0x0) | + NOC_CMD_STATIC_VC(static_vc) | + NOC_CMD_WR | + NOC_CMD_WR_BE | + (posted ? 0x0 : NOC_CMD_RESP_MARKED)); + } + NOC_WRITE_REG(NOC_CMD_CTRL, 0x1); +} + +static void noc_transfer_dw_inline(uint32_t dst_coordinate, uint64_t dst_addr, uint32_t val, uint8_t be, bool linked, bool posted, bool static_vc_alloc, uint32_t static_vc, bool multicast, uint32_t multicast_mode, uint8_t transaction_id) { + + while (!noc_command_ready()); + NOC_WRITE_REG(NOC_TARG_ADDR_LO, (uint32_t)(dst_addr & 0xFFFFFFFF)); + NOC_WRITE_REG(NOC_TARG_ADDR_MID, (uint32_t)(dst_addr >> 32)); + NOC_WRITE_REG(NOC_TARG_ADDR_HI, dst_coordinate); + + uint64_t be64 = be; + uint32_t be_shift = (dst_addr & (NOC_WORD_BYTES-1)); + be64 = (be64 << be_shift); + NOC_WRITE_REG(NOC_AT_LEN_BE, (uint32_t)(be64 & 0xFFFFFFFF)); + NOC_WRITE_REG(NOC_AT_LEN_BE_1, (uint32_t)(be64 >> 32)); + + NOC_WRITE_REG(NOC_AT_DATA, val); + NOC_WRITE_REG(NOC_PACKET_TAG, NOC_PACKET_TAG_TRANSACTION_ID(transaction_id)); + NOC_WRITE_REG(NOC_BRCST_EXCLUDE, 0x0); + if (multicast) { + NOC_WRITE_REG(NOC_CTRL, + (linked ? NOC_CMD_VC_LINKED : 0x0) | + (static_vc_alloc ? NOC_CMD_VC_STATIC : 0x0) | + NOC_CMD_STATIC_VC(static_vc) | + NOC_CMD_PATH_RESERVE | + NOC_CMD_WR | + NOC_CMD_WR_INLINE | + NOC_CMD_BRCST_PACKET | + (posted ? 0x0 : NOC_CMD_RESP_MARKED) | + NOC_CMD_BRCST_XY(multicast_mode)); + NOC_WRITE_REG(NOC_BRCST_EXCLUDE, 0x0); + } else { + NOC_WRITE_REG(NOC_CTRL, + (linked ? NOC_CMD_VC_LINKED : 0x0) | + (static_vc_alloc ? NOC_CMD_VC_STATIC : 0x0) | + NOC_CMD_STATIC_VC(static_vc) | + NOC_CMD_WR | + NOC_CMD_WR_INLINE | + (posted ? 
0x0 : NOC_CMD_RESP_MARKED)); + } + NOC_WRITE_REG(NOC_CMD_CTRL, 0x1); +} + + +void noc_write_dw_inline(uint32_t dst_coordinate, uint64_t dst_addr, uint32_t val, uint8_t be, bool linked, bool posted, bool static_vc_alloc, uint32_t static_vc, uint8_t transaction_id) { + noc_transfer_dw_inline(dst_coordinate, dst_addr, val, be, linked, posted, static_vc_alloc, static_vc, false, 0, transaction_id); +} + +void noc_multicast_write_dw_inline(uint32_t dst_coordinate, uint64_t dst_addr, uint32_t val, uint32_t multicast_mode, uint8_t be, bool linked, bool posted, bool static_vc_alloc, uint32_t static_vc, uint8_t transaction_id) { + noc_transfer_dw_inline(dst_coordinate, dst_addr, val, be, linked, posted, static_vc_alloc, static_vc, true, multicast_mode, transaction_id); +} + + +void noc_copy_word_be(uint32_t src_coordinate, uint64_t src_addr, uint32_t dst_coordinate, uint64_t dst_addr, uint64_t be, bool linked, bool posted, bool static_vc_alloc, uint32_t static_vc, uint8_t transaction_id) { + transfer_word_be(src_coordinate, src_addr, dst_coordinate, dst_addr, be, linked, posted, static_vc_alloc, static_vc, false, 0, transaction_id); +} + + +void noc_multicast_copy_word_be(uint32_t src_coordinate, uint64_t src_addr, uint32_t dst_coordinate, uint64_t dst_addr, uint32_t multicast_mode, uint64_t be, bool linked, bool posted, bool static_vc_alloc, uint32_t static_vc, uint8_t transaction_id) { + transfer_word_be(src_coordinate, src_addr, dst_coordinate, dst_addr, be, linked, posted, static_vc_alloc, static_vc, true, multicast_mode, transaction_id); +} + + +void noc_multicast_copy(uint32_t src_coordinate, uint64_t src_addr, uint32_t dst_coordinate, uint64_t dst_addr, uint32_t multicast_mode, uint32_t size, bool linked, bool posted, bool static_vc_alloc, uint32_t static_vc, uint8_t transaction_id) { + noc_transfer(src_coordinate, src_addr, dst_coordinate, dst_addr, size, linked, posted, static_vc_alloc, static_vc, true, multicast_mode, 0, unicast_addr_local(src_coordinate), 0, false, transaction_id); +} + +void noc_multicast_copy_src_include(uint32_t src_coordinate, uint64_t src_addr, uint32_t dst_coordinate, uint64_t dst_addr, uint32_t multicast_mode, uint32_t size, bool linked, bool posted, bool static_vc_alloc, uint32_t static_vc, uint8_t transaction_id) { + noc_transfer(src_coordinate, src_addr, dst_coordinate, dst_addr, size, linked, posted, static_vc_alloc, static_vc, true, multicast_mode, 0, unicast_addr_local(src_coordinate), 0, true, transaction_id); +} + +void noc_multicast_copy_exclude(uint32_t src_coordinate, uint64_t src_addr, uint32_t dst_coordinate, uint64_t dst_addr, uint32_t multicast_mode, uint32_t multicast_exclude, uint32_t size, bool linked, bool posted, bool static_vc_alloc, uint32_t static_vc, uint8_t transaction_id) { + noc_transfer(src_coordinate, src_addr, dst_coordinate, dst_addr, size, linked, posted, static_vc_alloc, static_vc, true, multicast_mode, multicast_exclude, unicast_addr_local(src_coordinate), 0, false, transaction_id); +} + + +void noc_atomic_increment(uint32_t noc_coordinate, uint64_t addr, uint32_t incr, uint32_t wrap, bool linked) { + while (!noc_command_ready()); + NOC_WRITE_REG(NOC_TARG_ADDR_LO, (uint32_t)(addr & 0xFFFFFFFF)); + NOC_WRITE_REG(NOC_TARG_ADDR_MID, (uint32_t)(addr >> 32)); + NOC_WRITE_REG(NOC_TARG_ADDR_HI, noc_coordinate); + // NOC_WRITE_REG(NOC_RET_ADDR_LO, 0); + // NOC_WRITE_REG(NOC_RET_ADDR_MID, 0); + NOC_WRITE_REG(NOC_CTRL, (linked ? 
NOC_CMD_VC_LINKED : 0x0) | NOC_CMD_AT); + NOC_WRITE_REG(NOC_AT_LEN_BE, NOC_AT_INS(NOC_AT_INS_INCR_GET) | NOC_AT_WRAP(wrap) | NOC_AT_IND_32((addr>>2) & 0x3) | NOC_AT_IND_32_SRC(0)); + NOC_WRITE_REG(NOC_AT_LEN_BE_1, 0x0); + NOC_WRITE_REG(NOC_AT_DATA, incr); + NOC_WRITE_REG(NOC_CMD_CTRL, 0x1); +} + + +void noc_atomic_read_and_increment(uint32_t noc_coordinate, uint64_t addr, uint32_t incr, uint32_t wrap, uint32_t read_coordinate, uint64_t read_addr, bool linked, uint8_t transaction_id) { + while (!noc_command_ready()); + NOC_WRITE_REG(NOC_TARG_ADDR_LO, (uint32_t)(addr & 0xFFFFFFFF)); + NOC_WRITE_REG(NOC_TARG_ADDR_MID, (uint32_t)(addr >> 32)); + NOC_WRITE_REG(NOC_TARG_ADDR_HI, noc_coordinate); + NOC_WRITE_REG(NOC_PACKET_TAG, NOC_PACKET_TAG_TRANSACTION_ID(transaction_id)); + NOC_WRITE_REG(NOC_RET_ADDR_LO, (uint32_t)(read_addr & 0xFFFFFFFF)); + NOC_WRITE_REG(NOC_RET_ADDR_MID, (uint32_t)(read_addr >> 32)); + NOC_WRITE_REG(NOC_RET_ADDR_HI, read_coordinate); + NOC_WRITE_REG(NOC_CTRL, (linked ? NOC_CMD_VC_LINKED : 0x0) | + NOC_CMD_AT | + NOC_CMD_RESP_MARKED); + NOC_WRITE_REG(NOC_AT_LEN_BE, NOC_AT_INS(NOC_AT_INS_INCR_GET) | NOC_AT_WRAP(wrap) | NOC_AT_IND_32((addr>>2) & 0x3) | NOC_AT_IND_32_SRC(0)); + NOC_WRITE_REG(NOC_AT_LEN_BE_1, 0x0); + NOC_WRITE_REG(NOC_AT_DATA, incr); + NOC_WRITE_REG(NOC_CMD_CTRL, 0x1); +} + + +void noc_multicast_atomic_increment(uint32_t noc_coordinate, uint64_t addr, uint32_t multicast_mode, uint32_t incr, uint32_t wrap, bool linked) { + while (!noc_command_ready()); + NOC_WRITE_REG(NOC_TARG_ADDR_LO, (uint32_t)(addr & 0xFFFFFFFF)); + NOC_WRITE_REG(NOC_TARG_ADDR_MID, (uint32_t)(addr >> 32)); + NOC_WRITE_REG(NOC_TARG_ADDR_HI, noc_coordinate); + // NOC_WRITE_REG(NOC_RET_ADDR_LO, 0); + // NOC_WRITE_REG(NOC_RET_ADDR_MID, 0); + NOC_WRITE_REG(NOC_CTRL, (linked ? NOC_CMD_VC_LINKED : 0x0) | + NOC_CMD_PATH_RESERVE | + NOC_CMD_AT | + NOC_CMD_BRCST_PACKET | + NOC_CMD_BRCST_XY(multicast_mode)); + NOC_WRITE_REG(NOC_BRCST_EXCLUDE, 0x0); + NOC_WRITE_REG(NOC_AT_LEN_BE, NOC_AT_INS(NOC_AT_INS_INCR_GET) | NOC_AT_WRAP(wrap) | NOC_AT_IND_32((addr>>2) & 0x3) | NOC_AT_IND_32_SRC(0)); + NOC_WRITE_REG(NOC_AT_LEN_BE_1, 0x0); + NOC_WRITE_REG(NOC_AT_DATA, incr); + NOC_WRITE_REG(NOC_CMD_CTRL, 0x1); +} + + +void noc_multicast_atomic_read_and_increment(uint32_t noc_coordinate, uint64_t addr, uint32_t multicast_mode, uint32_t incr, uint32_t wrap, uint32_t read_coordinate, uint64_t read_addr, bool linked, uint8_t transaction_id) { + while (!noc_command_ready()); + NOC_WRITE_REG(NOC_TARG_ADDR_LO, (uint32_t)(addr & 0xFFFFFFFF)); + NOC_WRITE_REG(NOC_TARG_ADDR_MID, (uint32_t)(addr >> 32)); + NOC_WRITE_REG(NOC_TARG_ADDR_HI, noc_coordinate); + NOC_WRITE_REG(NOC_PACKET_TAG, NOC_PACKET_TAG_TRANSACTION_ID(transaction_id)); + NOC_WRITE_REG(NOC_RET_ADDR_LO, (uint32_t)(read_addr & 0xFFFFFFFF)); + NOC_WRITE_REG(NOC_RET_ADDR_MID, (uint32_t)(read_addr >> 32)); + NOC_WRITE_REG(NOC_RET_ADDR_HI, read_coordinate); + NOC_WRITE_REG(NOC_CTRL, (linked ? 
NOC_CMD_VC_LINKED : 0x0) | + NOC_CMD_PATH_RESERVE | + NOC_CMD_AT | + NOC_CMD_RESP_MARKED | + NOC_CMD_BRCST_PACKET | + NOC_CMD_BRCST_XY(multicast_mode)); + NOC_WRITE_REG(NOC_BRCST_EXCLUDE, 0x0); + NOC_WRITE_REG(NOC_AT_LEN_BE, NOC_AT_INS(NOC_AT_INS_INCR_GET) | NOC_AT_WRAP(wrap) | NOC_AT_IND_32((addr>>2) & 0x3) | NOC_AT_IND_32_SRC(0)); + NOC_WRITE_REG(NOC_AT_LEN_BE_1, 0x0); + NOC_WRITE_REG(NOC_AT_DATA, incr); + NOC_WRITE_REG(NOC_CMD_CTRL, 0x1); +} + + +bool noc_command_ready() { + return (NOC_READ_REG(NOC_CMD_CTRL) == NOC_CTRL_STATUS_READY); +} + +uint32_t noc_atomic_read_updates_completed() { + uint32_t save_cmd_buf = noc_get_cmd_buf(); + noc_set_cmd_buf(0); + uint32_t result =noc_status_reg(NIU_MST_ATOMIC_RESP_RECEIVED); + noc_set_cmd_buf(save_cmd_buf); + return result; +} + +volatile uint32_t noc_wr_ack_received() { + uint32_t save_cmd_buf = noc_get_cmd_buf(); + noc_set_cmd_buf(0); + uint32_t result =noc_status_reg(NIU_MST_WR_ACK_RECEIVED); + noc_set_cmd_buf(save_cmd_buf); + return result; +} + +volatile uint32_t noc_rd_resp_received() { + uint32_t save_cmd_buf = noc_get_cmd_buf(); + noc_set_cmd_buf(0); + uint32_t result = noc_status_reg(NIU_MST_RD_RESP_RECEIVED); + noc_set_cmd_buf(save_cmd_buf); + return result; +} + +uint32_t noc_local_node_id() { + uint32_t save_cmd_buf = noc_get_cmd_buf(); + noc_set_cmd_buf(0); + uint32_t result = NOC_READ_REG(NOC_NODE_ID); + noc_set_cmd_buf(save_cmd_buf); + return result; +} + +uint32_t noc_status_reg(uint32_t status_reg_id) { + uint32_t save_cmd_buf = noc_get_cmd_buf(); + noc_set_cmd_buf(0); + uint32_t result = NOC_READ_REG(NOC_STATUS(status_reg_id)); + noc_set_cmd_buf(save_cmd_buf); + return result; +} + +void noc_set_cfg_reg(uint32_t cfg_reg_id, uint32_t val) { + uint32_t save_cmd_buf = noc_get_cmd_buf(); + noc_set_cmd_buf(0); + NOC_WRITE_REG(NOC_CFG(cfg_reg_id), val); + noc_set_cmd_buf(save_cmd_buf); +} + + +uint32_t noc_get_cfg_reg(uint32_t cfg_reg_id) { + uint32_t save_cmd_buf = noc_get_cmd_buf(); + noc_set_cmd_buf(0); + uint32_t result = NOC_READ_REG(NOC_CFG(cfg_reg_id)); + noc_set_cmd_buf(save_cmd_buf); + return result; +} + +////////////////////////////////////////////////////////////////// +//////////////////////// ECC Functions /////////////////////////// +////////////////////////////////////////////////////////////////// + +void noc_ecc_cfg_stage_1(bool header_ckh_bits_en) +{ + uint32_t mask; + uint32_t cfg_reg; + + cfg_reg = noc_get_cfg_reg(ROUTER_CFG_0); + mask = (1 << ROUTER_CFG_0_ECC_HEADER_CHKBITS_EN); + cfg_reg = (cfg_reg & ~mask) | (header_ckh_bits_en << ROUTER_CFG_0_ECC_HEADER_CHKBITS_EN); + noc_set_cfg_reg(ROUTER_CFG_0, cfg_reg); +} + +void noc_ecc_cfg_stage_2(bool niu_mem_parity_en, bool router_mem_parity_en, bool header_secded_en, bool mem_parity_int_en, bool header_sec_int_en, bool header_ded_int_en) +{ + uint32_t mask; + uint32_t cfg_reg; + + cfg_reg = noc_get_cfg_reg(NIU_CFG_0); + mask = (1 << NIU_CFG_0_ECC_NIU_MEM_PARITY_EN) | (1 << NIU_CFG_0_ECC_MEM_PARITY_INT_EN) | (1 << NIU_CFG_0_ECC_HEADER_1B_INT_EN) | (1 << NIU_CFG_0_ECC_HEADER_2B_INT_EN); + cfg_reg = (cfg_reg & ~mask) | (niu_mem_parity_en << NIU_CFG_0_ECC_NIU_MEM_PARITY_EN) | (mem_parity_int_en << NIU_CFG_0_ECC_MEM_PARITY_INT_EN) | (header_sec_int_en << NIU_CFG_0_ECC_HEADER_1B_INT_EN) | (header_ded_int_en << NIU_CFG_0_ECC_HEADER_2B_INT_EN); + noc_set_cfg_reg(NIU_CFG_0, cfg_reg); + + cfg_reg = noc_get_cfg_reg(ROUTER_CFG_0); + mask = (1 << ROUTER_CFG_0_ECC_ROUTER_MEM_PARITY_EN) | (1 << ROUTER_CFG_0_ECC_HEADER_SECDED_EN); + cfg_reg = (cfg_reg & ~mask) | 
(router_mem_parity_en << ROUTER_CFG_0_ECC_ROUTER_MEM_PARITY_EN) | (header_secded_en << ROUTER_CFG_0_ECC_HEADER_SECDED_EN); + noc_set_cfg_reg(ROUTER_CFG_0, cfg_reg); +} + +void noc_ecc_clear_err(bool clear_mem_parity_err, bool clear_header_sec, bool clear_header_ded) +{ + uint32_t save_cmd_buf = noc_get_cmd_buf(); + noc_set_cmd_buf(0); + NOC_WRITE_REG(ECC_CTRL, ((clear_mem_parity_err | (clear_header_sec << 1) | (clear_header_ded << 2)) & 0x7)); + noc_set_cmd_buf(save_cmd_buf); +} + +void noc_ecc_force_err(bool force_mem_parity_err, bool force_header_sec, bool force_header_ded) +{ + uint32_t save_cmd_buf = noc_get_cmd_buf(); + noc_set_cmd_buf(0); + NOC_WRITE_REG(ECC_CTRL, (((force_mem_parity_err | (force_header_sec << 1) | (force_header_ded << 2)) & 0x7) << 3)); + noc_set_cmd_buf(save_cmd_buf); +} + +uint32_t noc_ecc_get_num_mem_parity_errs() +{ + uint32_t save_cmd_buf = noc_get_cmd_buf(); + noc_set_cmd_buf(0); + uint32_t result = NOC_READ_REG(NUM_MEM_PARITY_ERR); + noc_set_cmd_buf(save_cmd_buf); + return result; +} + +uint32_t noc_ecc_get_num_header_sec() +{ + uint32_t save_cmd_buf = noc_get_cmd_buf(); + noc_set_cmd_buf(0); + uint32_t result = NOC_READ_REG(NUM_HEADER_1B_ERR); + noc_set_cmd_buf(save_cmd_buf); + return result; +} + +uint32_t noc_ecc_get_num_header_ded() +{ + uint32_t save_cmd_buf = noc_get_cmd_buf(); + noc_set_cmd_buf(0); + uint32_t result = NOC_READ_REG(NUM_HEADER_2B_ERR); + noc_set_cmd_buf(save_cmd_buf); + return result; +} + + +void noc_clear_req_id_cnt(uint32_t id_mask) { + uint32_t save_cmd_buf = noc_get_cmd_buf(); + noc_set_cmd_buf(0); + NOC_WRITE_REG(NOC_CLEAR_OUTSTANDING_REQ_CNT, id_mask); + noc_set_cmd_buf(save_cmd_buf); +} diff --git a/tt_metal/hw/inc/blackhole/c_tensix_core.h b/tt_metal/hw/inc/blackhole/c_tensix_core.h new file mode 100644 index 000000000000..cfe7cee195e9 --- /dev/null +++ b/tt_metal/hw/inc/blackhole/c_tensix_core.h @@ -0,0 +1,448 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <cstdint>
+#include <initializer_list>
+#include <type_traits>
+
+#include "fw_debug.h"
+#include "tensix.h"
+#include "tensix_functions.h"
+#include "atomic_rwptr.h"
+#include "noc_overlay_parameters.h"
+
+class c_tensix_core {
+
+public:
+    static const bool is_emulated = false;
+
+    static vptr_uint instrn_buf_base(uint32_t thread_id)
+    {
+        const uint32_t addr[] = { INSTRN_BUF_BASE, INSTRN1_BUF_BASE, INSTRN2_BUF_BASE };
+        return reinterpret_cast<vptr_uint>(addr[thread_id]);
+    }
+    static vptr_pc_buf pc_buf_base(uint32_t thread_id)
+    {
+        const uint32_t addr[] = { PC_BUF_BASE, PC1_BUF_BASE, PC2_BUF_BASE };
+        return reinterpret_cast<vptr_pc_buf>(addr[thread_id]);
+    }
+    static vptr_uint regfile_base() { return reinterpret_cast<vptr_uint>(REGFILE_BASE); }
+    static vptr_uint cfg_regs_base(uint state_id = 0)
+    {
+        if (state_id == 0)
+            return reinterpret_cast<vptr_uint>(TENSIX_CFG_BASE);
+
+        return reinterpret_cast<vptr_uint>(TENSIX_CFG_BASE + CFG_STATE_SIZE * 4 * 4);
+    }
+    static vptr_mailbox mailbox_base(uint32_t thread_id)
+    {
+        const uint32_t addr[] = { TENSIX_MAILBOX1_BASE, TENSIX_MAILBOX2_BASE, TENSIX_MAILBOX3_BASE };
+        return reinterpret_cast<vptr_mailbox>(addr[thread_id]);
+    }
+    static volatile uint32_t &test_mailbox() { extern volatile std::uint32_t TEST_MAILBOX; return TEST_MAILBOX; }
+
+    static volatile uint64_t *wall_clock_mailbox()
+    {
+        extern volatile std::uint64_t WALL_CLOCK_MAILBOX[];
+        return WALL_CLOCK_MAILBOX;
+    }
+
+    static volatile uint32_t *debug_mailbox()
+    {
+        extern volatile std::uint32_t DEBUG_MAILBOX[];
+        return DEBUG_MAILBOX;
+    }
+
+    static volatile uint32_t &cq_mailbox() { extern volatile std::uint32_t CQ_MAILBOX; return CQ_MAILBOX; }
+
+    static void set_cq_mailbox(std::uint32_t value) {
+        auto &cq_mb = cq_mailbox();
+        cq_mb = value;
+    }
+
+    static volatile uint32_t *get_io_queue_pointer_base(uint32_t base_addr, uint32_t id)
+    {
+        return reinterpret_cast<volatile uint32_t *>(base_addr) + (id << 2) + id;
+    }
+
+    // These are used to track dynamic allocation/deallocations for perf analysis. They don't do anything by default, but writes to perf scratch area could be added.
+ static void record_dynamic_allocation(int buffer_id, int loc, std::intptr_t ptr, uint32_t size) {} + static void record_dynamic_deallocation(int buffer_id) {} + + //static void ex_sync_kernel(vptr_mailbox mailbox) { ::ex_sync_kernel(mailbox); } + //static void ex_sync_instrn(vptr_uint instrn_buf, vptr_mailbox mailbox) { ::ex_sync_instrn(instrn_buf, mailbox); } + static void ex_stallwait(vptr_uint instrn_buf, uint wait_res, uint stall_res ) { :: ex_stallwait(wait_res, stall_res, instrn_buf); } + static void ex_setc16(uint addr, uint val, vptr_uint instrn_buf) { ::ex_setc16(addr, val, instrn_buf); } + static void ex_instrn_wrcfg(uint gpr, uint cfg_addr, vptr_uint instrn_buf) { ::ex_instrn_wrcfg(gpr, cfg_addr, instrn_buf); } + static void ex_instrn_rdcfg(uint gpr, uint cfg_addr, vptr_uint instrn_buf) { ::ex_instrn_rdcfg(gpr, cfg_addr, instrn_buf); } + static void ex_rmw_cfg_gpr(uint state_id, uint cfg_addr32, uint cfg_shamt, uint32_t cfg_mask, uint gpr_index) + { ::ex_rmw_cfg_gpr(cfg_addr32, cfg_shamt, cfg_mask, gpr_index, regfile_base(), cfg_regs_base(state_id)); } + static void ex_rmw_cfg(uint8_t state_id, uint cfg_addr32, uint cfg_shamt, uint32_t cfg_mask, uint wr_val) + { ::ex_rmw_cfg(cfg_addr32, cfg_shamt, cfg_mask, wr_val, cfg_regs_base(state_id)); } + + static void ex_nop(vptr_uint instrn_buf) { :: ex_nop(instrn_buf); } + + //static void ex_set_stride_prepacked(cnt_id_t cntset_ind, uint chan_ind, uint xy_stride, uint zw_stride, vptr_uint instrn_buf); + static void ex_setadc(cnt_id_t cnt_ind, uint chan_ind, uint dim_ind, uint val, vptr_uint instrn_buf); + static void ex_setpkedgof(uint edge_mask, vptr_uint instrn_buf); + static void ex_clear_dvalid(uint clear_ab, uint reset, vptr_uint instrn_buffer); + static void ex_sem_init(uint semaphore, uint max_value, uint init_value, vptr_uint instrn_buffer); + static void ex_zeroacc(vptr_uint instrn_buf, uint clear_mode = 3, uint dest_register = 0, uint addressing_mode = 0); + static void ex_encc(vptr_uint instrn_buf); + static void ex_load_const(vptr_uint instrn_buf); + + static void ex_instrn(vptr_uint instrn_buffer, unsigned int instruction) { ::execute_instruction(instrn_buffer, instruction); } + static void thcon_load_ind(vptr_uint instrn_buffer, std::uint32_t base_addr_index, std::uint32_t dst_data_index, std::uint32_t offset_index, std::uint32_t autoinc, std::uint32_t size); + static void thcon_incr_get_ptr(vptr_uint instrn_buffer, std::uint32_t mem_addr_index, std::uint32_t data_reg_index, std::uint32_t incr_val, std::uint32_t wrap_val, bool rd_wr, bool l0_l1_sel); + static void thcon_incr_get_ptr_noinc(vptr_uint instrn_buffer, std::uint32_t mem_addr_index, std::uint32_t data_reg_index, std::uint32_t incr_val, std::uint32_t wrap_val, bool rd_wr, bool l0_l1_sel); + static void thcon_reg_to_flops(vptr_uint instrn_buffer, uint32_t mode_32b_16B, uint32_t reg_index, uint32_t flop_index, uint32_t target_select=0, uint32_t byte_offset=0); + static void thcon_set_descriptor(vptr_uint instrn_buf,uint reg_index, uint unpacker_id); + + static uint read_packed_size(uint thread); // Return size in bytes of last packer output for a thread. 
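+    // Note: both TDMA packed-size registers appear to hold two 16-bit
+    // counters, thread 0 in the low halfword and thread 1 in the high
+    // halfword; see the mask/shift in the implementations further down.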
+    static uint read_accumulated_packed_size(uint thread); // Return accumulated size of tiles processed by the packer
+
+    static uint wait(int cycles);
+
+    static uint64_t read_wall_clock();
+    static uint32_t read_wall_clock_l();
+
+    static atomic_rwptr& fifo_wptr(uint *addr);
+    static atomic_rwptr& fifo_rdptr(uint *addr);
+    static atomic_rwptr& fifo_endptr(uint *addr);
+    static atomic_rwptr& fifo_wptr(uint addr);
+    static atomic_rwptr& fifo_rdptr(uint addr);
+    static atomic_rwptr& fifo_endptr(uint addr);
+
+    template <typename T, std::enable_if_t<std::is_pointer<T>::value, int> = 0>
+    static T l1_cast(uint32_t l1_offset)
+    {
+        return reinterpret_cast<T>(l1_offset);
+    }
+
+    template <typename T>
+    static std::uint32_t l1_cast(T *l1_pointer)
+    {
+        return reinterpret_cast<std::uint32_t>(l1_pointer);
+    }
+
+    static std::uint32_t l1_size() { return SIM_L1_SIZE; }
+
+//MM July 19 2022: In a desperate bid to fix compiler errors, I just
+//copy-pasted the version of these NOC functions directly from c_tensix_core.h
+//in blackhole. Fingers crossed...
+/*
+    static void noc_copy(uint64_t src_addr, uint64_t dst_addr, uint32_t size, bool linked, bool posted, bool wr_blocking = false, bool rd_blocking = false, uint16_t be = 0xffff);
+    static void noc_atomic_increment(uint64_t addr, uint32_t incr, uint32_t wrap, bool linked);
+    // if blocking copy is requested, set num_blocking_cores to the number of receiving cores
+    static void noc_multicast_copy(uint64_t src_addr, uint64_t dst_addr, uint32_t multicast_mode, uint32_t size, bool linked, bool posted, uint32_t num_blocking_cores = 0);
+    static void noc_multicast_atomic_increment(uint64_t addr, uint32_t multicast_mode, uint32_t incr, uint32_t wrap, bool linked);
+
+    static std::uint32_t noc_id();
+*/
+
+    static void noc_copy(uint32_t src_coordinate, uint64_t src_addr, uint32_t dst_coordinate, uint64_t dst_addr, uint32_t size, bool linked, bool posted, bool wr_blocking = false, bool rd_blocking = false, uint16_t be = 0xffff);
+    static void noc_atomic_increment(uint32_t noc_coordinate, uint64_t addr, uint32_t incr, uint32_t wrap, bool linked);
+    // if blocking copy is requested, set num_blocking_cores to the number of receiving cores
+    static void noc_multicast_copy(uint32_t noc_coordinate, uint64_t src_addr, uint32_t dst_coordinate, uint64_t dst_addr, uint32_t multicast_mode, uint32_t size, bool linked, bool posted, uint32_t num_blocking_cores = 0);
+    static void noc_multicast_atomic_increment(uint32_t noc_coordinate, uint64_t addr, uint32_t multicast_mode, uint32_t incr, uint32_t wrap, bool linked);
+
+    static std::uint32_t noc_id();
+
+
+
+
+    static inline void write_stream_register(uint32_t stream_id, uint32_t index, uint32_t value);
+    static inline uint32_t read_stream_register(uint32_t stream_id, uint32_t index);
+    static inline uint32_t read_stream_register_field(uint32_t stream_id, uint32_t index, uint32_t shift, uint32_t width);
+
+    static inline void ExtraKernelParams(uint /*thread_id*/, uint /*kernel_id*/, std::initializer_list<uint> /*params*/) { }
+
+    static inline void check_l1_address_range(std::uint32_t byte_addr, std::size_t length);
+
+private:
+    static inline volatile uint32_t* noc_stream_registers(uint32_t stream_id);
+};
+
+
+/*inline void c_tensix_core::ex_set_stride_prepacked(cnt_id_t cntset_ind, uint chan_ind, uint xy_stride, uint zw_stride, volatile uint * instrn_buf)
+{
+    ::ex_set_stride_prepacked(cntset_ind, chan_ind, xy_stride, zw_stride, instrn_buf);
+}*/
+
+inline void c_tensix_core::ex_setpkedgof(uint edge_mask, vptr_uint instrn_buf)
+{
+    ::ex_setpkedgof(edge_mask, instrn_buf);
+}
+
+inline void
c_tensix_core::ex_clear_dvalid(uint clear_ab, uint reset, vptr_uint instrn_buffer)
+{
+    ::ex_clear_dvalid(clear_ab, reset, instrn_buffer);
+}
+
+inline void c_tensix_core::ex_sem_init(uint semaphore, uint max_value, uint init_value, vptr_uint instrn_buffer)
+{
+    ::ex_sem_init(semaphore, max_value, init_value, instrn_buffer);
+}
+
+inline void c_tensix_core::ex_zeroacc(vptr_uint instrn_buf, uint clear_mode, uint dest_register, uint addressing_mode)
+{
+    ::ex_zeroacc(instrn_buf, clear_mode, dest_register, addressing_mode);
+}
+
+inline void c_tensix_core::ex_encc(vptr_uint instrn_buf)
+{
+    ::ex_encc(instrn_buf);
+}
+
+inline void c_tensix_core::ex_load_const(vptr_uint instrn_buf)
+{
+    // Load LREG11 w/ -1.0f by convention
+    uint instrn;
+    instrn = (0xbf80 << 0); // Load LREG0 w/ -1.0f
+    ex_push_insn(instrn_buf, INSTRN_SFPLOADI(instrn));
+    instrn = (11 << 4); // Set LREG11 to LREG0
+    ex_push_insn(instrn_buf, INSTRN_SFPCONFIG(instrn));
+}
+
+inline void c_tensix_core::ex_setadc(cnt_id_t cnt_ind, uint chan_ind, uint dim_ind, uint val, vptr_uint instrn_buf)
+{
+    ::ex_setadc(cnt_ind, chan_ind, dim_ind, val, instrn_buf);
+}
+
+inline void c_tensix_core::thcon_load_ind(vptr_uint instrn_buffer, uint base_addr_index, uint dst_data_index, uint offset_index, uint autoinc, uint size)
+{
+    ::thcon_load_ind(instrn_buffer, base_addr_index, dst_data_index, offset_index, autoinc, size);
+}
+
+inline void c_tensix_core::thcon_incr_get_ptr(vptr_uint instrn_buffer,uint mem_addr_index, uint data_reg_index, uint incr_val, uint wrap_val, bool rd_wr, bool l0_l1_sel)
+{
+    ::thcon_incr_get_ptr(instrn_buffer, mem_addr_index, data_reg_index, incr_val, wrap_val, rd_wr, l0_l1_sel);
+}
+
+inline void c_tensix_core::thcon_incr_get_ptr_noinc(vptr_uint instrn_buffer,uint mem_addr_index, uint data_reg_index, uint incr_val, uint wrap_val, bool rd_wr, bool l0_l1_sel)
+{
+    ::thcon_incr_get_ptr_noinc(instrn_buffer, mem_addr_index, data_reg_index, incr_val, wrap_val, rd_wr, l0_l1_sel);
+}
+
+inline void c_tensix_core::thcon_reg_to_flops(vptr_uint instrn_buffer,uint mode_32b_16B, uint reg_index, uint flop_index, uint target_select, uint byte_offset)
+{
+    ::thcon_reg_to_flops(instrn_buffer, mode_32b_16B, reg_index, flop_index, target_select, byte_offset);
+}
+
+inline void c_tensix_core::thcon_set_descriptor(vptr_uint instrn_buf,uint reg_index, uint unpacker_id)
+{
+    ::thcon_set_descriptor(instrn_buf, reg_index, unpacker_id);
+}
+
+inline uint c_tensix_core::read_packed_size(uint thread)
+{
+    uint packed_size = memory_read(RISCV_TDMA_REG_PACKED_SIZE);
+    if (thread == 0) {
+        packed_size &= 0xFFFF;
+    }
+    else {
+        packed_size >>= 16;
+    }
+
+    return packed_size;
+}
+
+inline uint c_tensix_core::read_accumulated_packed_size(uint thread)
+{
+    uint packed_size = memory_read(RISCV_TDMA_REG_ACC_PACKED_SIZE);
+    if (thread == 0) {
+        packed_size &= 0xFFFF;
+    }
+    else {
+        packed_size >>= 16;
+    }
+
+    return packed_size;
+}
+
+inline uint c_tensix_core::wait(int cycles)
+{
+    int count = 0;
+    uint bla = 0;
+
+    volatile uint * mailbox = mailbox_base(0);
+    while (count < cycles) {
+        bla = mailbox[0];
+        count++;
+    }
+    return bla;
+}
+
+inline atomic_rwptr& c_tensix_core::fifo_wptr(uint *addr)
+{
+    return make_atomic_rwptr(addr - 3);
+}
+
+inline atomic_rwptr& c_tensix_core::fifo_rdptr(uint *addr)
+{
+    return make_atomic_rwptr(addr - 4);
+}
+
+inline atomic_rwptr& c_tensix_core::fifo_endptr(uint *addr)
+{
+    return make_atomic_rwptr(addr - 1);
+}
+
+inline atomic_rwptr& c_tensix_core::fifo_wptr(uint addr)
+{
+    return fifo_wptr(l1_cast<uint *>(addr));
+}
+
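+// Note on the fifo accessors: the control words appear to live at fixed
+// 32-bit word offsets just below the fifo pointer passed in (rdptr at -4,
+// wptr at -3, endptr at -1); this layout is inferred from the offsets in
+// the pointer overloads above, not from a hardware spec.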
+inline atomic_rwptr& c_tensix_core::fifo_rdptr(uint addr)
+{
+    return fifo_rdptr(l1_cast<uint *>(addr));
+}
+
+inline atomic_rwptr& c_tensix_core::fifo_endptr(uint addr)
+{
+    return fifo_endptr(l1_cast<uint *>(addr));
+}
+
+// NOC API
+//MM July 19 2022: In a desperate bid to fix compiler errors, I just
+//copy-pasted the version of these NOC functions directly from c_tensix_core.h
+//in blackhole. Fingers crossed...
+/*
+inline void c_tensix_core::noc_copy(uint64_t src_addr, uint64_t dst_addr, uint32_t size, bool linked, bool posted, bool wr_blocking, bool rd_blocking, uint16_t be) {
+
+    FWASSERT("Write-Blocking behaviour is only supported when posted=false", wr_blocking == false || posted == false);
+    FWASSERT("Byte-enable is only supported for a word copy", ( be == 0xffff || size <= 16 ));
+
+    uint32_t acks = wr_blocking ? noc_wr_ack_received() : noc_rd_resp_received();
+    uint32_t num_acks = size / NOC_MAX_BURST_SIZE + ((size % NOC_MAX_BURST_SIZE) ? 1 : 0);
+
+    if(be != 0xffff)
+    {
+        ::noc_copy_word_be(src_addr, dst_addr, be, linked, posted, false, 0, 0);
+    }
+    else
+    {
+        ::noc_copy(src_addr, dst_addr, size, linked, posted, false, 0, 0, 0);
+    }
+
+    // if blocking copy, wait until all the wacks have been received
+    while((wr_blocking && (acks + num_acks != noc_wr_ack_received())) || // block on wacks
+          (rd_blocking && (acks + num_acks != noc_rd_resp_received()))); // block on read-responses
+}
+
+inline void c_tensix_core::noc_atomic_increment(uint64_t addr, uint32_t incr, uint32_t wrap, bool linked) {
+    ::noc_atomic_increment(addr, incr, wrap, linked);
+ }
+
+inline void c_tensix_core::noc_multicast_copy(uint64_t src_addr, uint64_t dst_addr, uint32_t multicast_mode, uint32_t size, bool linked, bool posted, uint32_t num_blocking_cores) {
+
+    uint32_t wacks = noc_wr_ack_received();
+    uint32_t num_wacks = size / NOC_MAX_BURST_SIZE + ((size % NOC_MAX_BURST_SIZE) ? 1 : 0);
+    num_wacks *= num_blocking_cores;
+
+    FWASSERT("Blocking behaviour is only supported when posted=false", num_blocking_cores == 0 || posted == false);
+
+    ::noc_multicast_copy(src_addr, dst_addr, multicast_mode, size, linked, posted, false, 0, 0);
+
+    // if blocking copy, wait until all the wacks have been received
+    while(num_blocking_cores && (wacks + num_wacks != noc_wr_ack_received()));
+}
+
+inline void c_tensix_core::noc_multicast_atomic_increment(uint64_t addr, uint32_t multicast_mode, uint32_t incr, uint32_t wrap, bool linked) {
+    ::noc_multicast_atomic_increment(addr, multicast_mode, incr, wrap, linked);
+}
+
+inline std::uint32_t c_tensix_core::noc_id()
+{
+    std::uint32_t id = ::noc_local_node_id();
+    return (id & 0xFFF);
+}
+*/
+inline void c_tensix_core::noc_copy(uint32_t src_coordinate, uint64_t src_addr, uint32_t dst_coordinate, uint64_t dst_addr, uint32_t size, bool linked, bool posted, bool wr_blocking, bool rd_blocking, uint16_t be) {
+
+    FWASSERT("Write-Blocking behaviour is only supported when posted=false", wr_blocking == false || posted == false);
+    FWASSERT("Byte-enable is only supported for a word copy", ( be == 0xffff || size <= 16 ));
+
+    uint32_t acks = wr_blocking ? noc_wr_ack_received() : noc_rd_resp_received();
+    uint32_t num_acks = size / NOC_MAX_BURST_SIZE + ((size % NOC_MAX_BURST_SIZE) ?
1 : 0); + + if(be != 0xffff) + { + ::noc_copy_word_be(src_coordinate, src_addr, dst_coordinate, dst_addr, be, linked, posted, false, 0, 0); + } + else + { + ::noc_copy(src_coordinate, src_addr, dst_coordinate, dst_addr, size, linked, posted, false, 0, 0, 0); + } + + // if blocking copy, wait until all the wacks have been received + while((wr_blocking && (acks + num_acks != noc_wr_ack_received())) || // block on wacks + (rd_blocking && (acks + num_acks != noc_rd_resp_received()))); // block on read-responses +} + +inline void c_tensix_core::noc_atomic_increment(uint32_t noc_coordinate, uint64_t addr, uint32_t incr, uint32_t wrap, bool linked) { + ::noc_atomic_increment(noc_coordinate, addr, incr, wrap, linked); + } + +inline void c_tensix_core::noc_multicast_copy(uint32_t src_coordinate, uint64_t src_addr, uint32_t dst_coordinate, uint64_t dst_addr, uint32_t multicast_mode, uint32_t size, bool linked, bool posted, uint32_t num_blocking_cores) { + + uint32_t wacks = noc_wr_ack_received(); + uint32_t num_wacks = size / NOC_MAX_BURST_SIZE + ((size % NOC_MAX_BURST_SIZE) ? 1 : 0); + num_wacks *= num_blocking_cores; + + FWASSERT("Blocking behaviour is only supported when posted=false", num_blocking_cores == 0 || posted == false); + + ::noc_multicast_copy(src_coordinate, src_addr, dst_coordinate, dst_addr, multicast_mode, size, linked, posted, false, 0, 0); + + // if blocking copy, wait until all the wacks have been received + while(num_blocking_cores && (wacks + num_wacks != noc_wr_ack_received())); +} + +inline void c_tensix_core::noc_multicast_atomic_increment(uint32_t noc_coordinate, uint64_t addr, uint32_t multicast_mode, uint32_t incr, uint32_t wrap, bool linked) { + ::noc_multicast_atomic_increment(noc_coordinate, addr, multicast_mode, incr, wrap, linked); +} + +inline std::uint32_t c_tensix_core::noc_id() +{ + std::uint32_t id = ::noc_local_node_id(); + return (id & 0xFFF); +} + + + + +inline void c_tensix_core::write_stream_register(uint32_t stream_id, uint32_t index, uint32_t value) +{ + NOC_STREAM_WRITE_REG(stream_id, index, value); +} + +inline uint32_t c_tensix_core::read_stream_register(uint32_t stream_id, uint32_t index) +{ + return NOC_STREAM_READ_REG(stream_id, index); +} + +inline uint32_t c_tensix_core::read_stream_register_field(uint32_t stream_id, uint32_t index, uint32_t shift, uint32_t width) +{ + return ( read_stream_register(stream_id, index) >> shift ) & ((1 << width)-1); +} + +inline uint32_t c_tensix_core::read_wall_clock_l() +{ + return memory_read(RISCV_DEBUG_REG_WALL_CLOCK_L); +} + +inline uint64_t c_tensix_core::read_wall_clock() +{ + uint32_t low = memory_read(RISCV_DEBUG_REG_WALL_CLOCK_L); // latches high + uint32_t high = memory_read(RISCV_DEBUG_REG_WALL_CLOCK_H); + + return ((uint64_t)high << 32) | low; +} + +inline void c_tensix_core::check_l1_address_range(std::uint32_t byte_addr, std::size_t length) +{ + FWASSERT("Exceeded L1 of 1MB!!", ((byte_addr + length) <= (1U << 20))); +} diff --git a/tt_metal/hw/inc/blackhole/cfg_defines.h b/tt_metal/hw/inc/blackhole/cfg_defines.h new file mode 100644 index 000000000000..1781c1d47d80 --- /dev/null +++ b/tt_metal/hw/inc/blackhole/cfg_defines.h @@ -0,0 +1,4159 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +//////////////////////////////////////////////////////////////////////// +// File generated by +// ${ROOT}/src/meta/regspecs/src/genCfgRegs.py +// from +// ${ROOT}/src/meta/regspecs/yaml/local_regs_per_exu.yaml +// +// Constants for Tensix registers +// +//////////////////////////////////////////////////////////////////////// + +#pragma once + +#define CFG_STATE_SIZE 56 +#define THD_STATE_SIZE 68 + +//////////////////////////////////////////////////////////////////////// +// Registers for THREAD +//////////////////////////////////////////////////////////////////////// + +#define THREAD_CFGREG_BASE_ADDR32 0 + +#define CFG_STATE_ID_StateID_ADDR32 0 +#define CFG_STATE_ID_StateID_SHAMT 0 +#define CFG_STATE_ID_StateID_MASK 0x1 +#define CFG_STATE_ID_StateID_RMW CFG_STATE_ID_StateID_ADDR32, CFG_STATE_ID_StateID_SHAMT, CFG_STATE_ID_StateID_MASK + +#define DEST_TARGET_REG_CFG_MATH_Offset_ADDR32 1 +#define DEST_TARGET_REG_CFG_MATH_Offset_SHAMT 0 +#define DEST_TARGET_REG_CFG_MATH_Offset_MASK 0xfff +#define DEST_TARGET_REG_CFG_MATH_Offset_RMW DEST_TARGET_REG_CFG_MATH_Offset_ADDR32, DEST_TARGET_REG_CFG_MATH_Offset_SHAMT, DEST_TARGET_REG_CFG_MATH_Offset_MASK + +#define DISABLE_IMPLIED_SRCA_FMT_Base_ADDR32 2 +#define DISABLE_IMPLIED_SRCA_FMT_Base_SHAMT 0 +#define DISABLE_IMPLIED_SRCA_FMT_Base_MASK 0x1 +#define DISABLE_IMPLIED_SRCA_FMT_Base_RMW DISABLE_IMPLIED_SRCA_FMT_Base_ADDR32, DISABLE_IMPLIED_SRCA_FMT_Base_SHAMT, DISABLE_IMPLIED_SRCA_FMT_Base_MASK + +#define DISABLE_IMPLIED_SRCB_FMT_Base_ADDR32 3 +#define DISABLE_IMPLIED_SRCB_FMT_Base_SHAMT 0 +#define DISABLE_IMPLIED_SRCB_FMT_Base_MASK 0x1 +#define DISABLE_IMPLIED_SRCB_FMT_Base_RMW DISABLE_IMPLIED_SRCB_FMT_Base_ADDR32, DISABLE_IMPLIED_SRCB_FMT_Base_SHAMT, DISABLE_IMPLIED_SRCB_FMT_Base_MASK + +#define SFPU_DEST_FMT_Enable_ADDR32 4 +#define SFPU_DEST_FMT_Enable_SHAMT 0 +#define SFPU_DEST_FMT_Enable_MASK 0x1 +#define SFPU_DEST_FMT_Enable_RMW SFPU_DEST_FMT_Enable_ADDR32, SFPU_DEST_FMT_Enable_SHAMT, SFPU_DEST_FMT_Enable_MASK + +#define SFPU_DEST_FMT_Base_ADDR32 4 +#define SFPU_DEST_FMT_Base_SHAMT 1 +#define SFPU_DEST_FMT_Base_MASK 0x1e +#define SFPU_DEST_FMT_Base_RMW SFPU_DEST_FMT_Base_ADDR32, SFPU_DEST_FMT_Base_SHAMT, SFPU_DEST_FMT_Base_MASK + +#define SRCA_SET_Base_ADDR32 5 +#define SRCA_SET_Base_SHAMT 0 +#define SRCA_SET_Base_MASK 0x3 +#define SRCA_SET_Base_RMW SRCA_SET_Base_ADDR32, SRCA_SET_Base_SHAMT, SRCA_SET_Base_MASK + +#define SRCA_SET_SetOvrdWithAddr_ADDR32 5 +#define SRCA_SET_SetOvrdWithAddr_SHAMT 2 +#define SRCA_SET_SetOvrdWithAddr_MASK 0x4 +#define SRCA_SET_SetOvrdWithAddr_RMW SRCA_SET_SetOvrdWithAddr_ADDR32, SRCA_SET_SetOvrdWithAddr_SHAMT, SRCA_SET_SetOvrdWithAddr_MASK + +#define SRCB_SET_Base_ADDR32 6 +#define SRCB_SET_Base_SHAMT 0 +#define SRCB_SET_Base_MASK 0x3 +#define SRCB_SET_Base_RMW SRCB_SET_Base_ADDR32, SRCB_SET_Base_SHAMT, SRCB_SET_Base_MASK + +#define CLR_DVALID_SrcA_Disable_ADDR32 7 +#define CLR_DVALID_SrcA_Disable_SHAMT 0 +#define CLR_DVALID_SrcA_Disable_MASK 0x1 +#define CLR_DVALID_SrcA_Disable_RMW CLR_DVALID_SrcA_Disable_ADDR32, CLR_DVALID_SrcA_Disable_SHAMT, CLR_DVALID_SrcA_Disable_MASK + +#define CLR_DVALID_SrcB_Disable_ADDR32 7 +#define CLR_DVALID_SrcB_Disable_SHAMT 1 +#define CLR_DVALID_SrcB_Disable_MASK 0x2 +#define CLR_DVALID_SrcB_Disable_RMW CLR_DVALID_SrcB_Disable_ADDR32, CLR_DVALID_SrcB_Disable_SHAMT, CLR_DVALID_SrcB_Disable_MASK + +#define SCBD_BANK_MASK_32b_Enable_ADDR32 8 +#define SCBD_BANK_MASK_32b_Enable_SHAMT 0 +#define SCBD_BANK_MASK_32b_Enable_MASK 0x1 +#define 
SCBD_BANK_MASK_32b_Enable_RMW SCBD_BANK_MASK_32b_Enable_ADDR32, SCBD_BANK_MASK_32b_Enable_SHAMT, SCBD_BANK_MASK_32b_Enable_MASK + +#define PACK_SCBD_BANK_MASK_32b_Enable_ADDR32 9 +#define PACK_SCBD_BANK_MASK_32b_Enable_SHAMT 0 +#define PACK_SCBD_BANK_MASK_32b_Enable_MASK 0x1 +#define PACK_SCBD_BANK_MASK_32b_Enable_RMW PACK_SCBD_BANK_MASK_32b_Enable_ADDR32, PACK_SCBD_BANK_MASK_32b_Enable_SHAMT, PACK_SCBD_BANK_MASK_32b_Enable_MASK + +#define UNPACK_SCBD_BANK_MASK_32b_Enable_ADDR32 10 +#define UNPACK_SCBD_BANK_MASK_32b_Enable_SHAMT 0 +#define UNPACK_SCBD_BANK_MASK_32b_Enable_MASK 0x1 +#define UNPACK_SCBD_BANK_MASK_32b_Enable_RMW UNPACK_SCBD_BANK_MASK_32b_Enable_ADDR32, UNPACK_SCBD_BANK_MASK_32b_Enable_SHAMT, UNPACK_SCBD_BANK_MASK_32b_Enable_MASK + +#define FIDELITY_BASE_Phase_ADDR32 11 +#define FIDELITY_BASE_Phase_SHAMT 0 +#define FIDELITY_BASE_Phase_MASK 0x3 +#define FIDELITY_BASE_Phase_RMW FIDELITY_BASE_Phase_ADDR32, FIDELITY_BASE_Phase_SHAMT, FIDELITY_BASE_Phase_MASK + +#define ADDR_MOD_AB_SEC0_SrcAIncr_ADDR32 12 +#define ADDR_MOD_AB_SEC0_SrcAIncr_SHAMT 0 +#define ADDR_MOD_AB_SEC0_SrcAIncr_MASK 0x3f +#define ADDR_MOD_AB_SEC0_SrcAIncr_RMW ADDR_MOD_AB_SEC0_SrcAIncr_ADDR32, ADDR_MOD_AB_SEC0_SrcAIncr_SHAMT, ADDR_MOD_AB_SEC0_SrcAIncr_MASK + +#define ADDR_MOD_AB_SEC0_SrcACR_ADDR32 12 +#define ADDR_MOD_AB_SEC0_SrcACR_SHAMT 6 +#define ADDR_MOD_AB_SEC0_SrcACR_MASK 0x40 +#define ADDR_MOD_AB_SEC0_SrcACR_RMW ADDR_MOD_AB_SEC0_SrcACR_ADDR32, ADDR_MOD_AB_SEC0_SrcACR_SHAMT, ADDR_MOD_AB_SEC0_SrcACR_MASK + +#define ADDR_MOD_AB_SEC0_SrcAClear_ADDR32 12 +#define ADDR_MOD_AB_SEC0_SrcAClear_SHAMT 7 +#define ADDR_MOD_AB_SEC0_SrcAClear_MASK 0x80 +#define ADDR_MOD_AB_SEC0_SrcAClear_RMW ADDR_MOD_AB_SEC0_SrcAClear_ADDR32, ADDR_MOD_AB_SEC0_SrcAClear_SHAMT, ADDR_MOD_AB_SEC0_SrcAClear_MASK + +#define ADDR_MOD_AB_SEC0_SrcBIncr_ADDR32 12 +#define ADDR_MOD_AB_SEC0_SrcBIncr_SHAMT 8 +#define ADDR_MOD_AB_SEC0_SrcBIncr_MASK 0x3f00 +#define ADDR_MOD_AB_SEC0_SrcBIncr_RMW ADDR_MOD_AB_SEC0_SrcBIncr_ADDR32, ADDR_MOD_AB_SEC0_SrcBIncr_SHAMT, ADDR_MOD_AB_SEC0_SrcBIncr_MASK + +#define ADDR_MOD_AB_SEC0_SrcBCR_ADDR32 12 +#define ADDR_MOD_AB_SEC0_SrcBCR_SHAMT 14 +#define ADDR_MOD_AB_SEC0_SrcBCR_MASK 0x4000 +#define ADDR_MOD_AB_SEC0_SrcBCR_RMW ADDR_MOD_AB_SEC0_SrcBCR_ADDR32, ADDR_MOD_AB_SEC0_SrcBCR_SHAMT, ADDR_MOD_AB_SEC0_SrcBCR_MASK + +#define ADDR_MOD_AB_SEC0_SrcBClear_ADDR32 12 +#define ADDR_MOD_AB_SEC0_SrcBClear_SHAMT 15 +#define ADDR_MOD_AB_SEC0_SrcBClear_MASK 0x8000 +#define ADDR_MOD_AB_SEC0_SrcBClear_RMW ADDR_MOD_AB_SEC0_SrcBClear_ADDR32, ADDR_MOD_AB_SEC0_SrcBClear_SHAMT, ADDR_MOD_AB_SEC0_SrcBClear_MASK + +#define ADDR_MOD_AB_SEC1_SrcAIncr_ADDR32 13 +#define ADDR_MOD_AB_SEC1_SrcAIncr_SHAMT 0 +#define ADDR_MOD_AB_SEC1_SrcAIncr_MASK 0x3f +#define ADDR_MOD_AB_SEC1_SrcAIncr_RMW ADDR_MOD_AB_SEC1_SrcAIncr_ADDR32, ADDR_MOD_AB_SEC1_SrcAIncr_SHAMT, ADDR_MOD_AB_SEC1_SrcAIncr_MASK + +#define ADDR_MOD_AB_SEC1_SrcACR_ADDR32 13 +#define ADDR_MOD_AB_SEC1_SrcACR_SHAMT 6 +#define ADDR_MOD_AB_SEC1_SrcACR_MASK 0x40 +#define ADDR_MOD_AB_SEC1_SrcACR_RMW ADDR_MOD_AB_SEC1_SrcACR_ADDR32, ADDR_MOD_AB_SEC1_SrcACR_SHAMT, ADDR_MOD_AB_SEC1_SrcACR_MASK + +#define ADDR_MOD_AB_SEC1_SrcAClear_ADDR32 13 +#define ADDR_MOD_AB_SEC1_SrcAClear_SHAMT 7 +#define ADDR_MOD_AB_SEC1_SrcAClear_MASK 0x80 +#define ADDR_MOD_AB_SEC1_SrcAClear_RMW ADDR_MOD_AB_SEC1_SrcAClear_ADDR32, ADDR_MOD_AB_SEC1_SrcAClear_SHAMT, ADDR_MOD_AB_SEC1_SrcAClear_MASK + +#define ADDR_MOD_AB_SEC1_SrcBIncr_ADDR32 13 +#define ADDR_MOD_AB_SEC1_SrcBIncr_SHAMT 8 +#define ADDR_MOD_AB_SEC1_SrcBIncr_MASK 0x3f00 
+#define ADDR_MOD_AB_SEC1_SrcBIncr_RMW ADDR_MOD_AB_SEC1_SrcBIncr_ADDR32, ADDR_MOD_AB_SEC1_SrcBIncr_SHAMT, ADDR_MOD_AB_SEC1_SrcBIncr_MASK + +#define ADDR_MOD_AB_SEC1_SrcBCR_ADDR32 13 +#define ADDR_MOD_AB_SEC1_SrcBCR_SHAMT 14 +#define ADDR_MOD_AB_SEC1_SrcBCR_MASK 0x4000 +#define ADDR_MOD_AB_SEC1_SrcBCR_RMW ADDR_MOD_AB_SEC1_SrcBCR_ADDR32, ADDR_MOD_AB_SEC1_SrcBCR_SHAMT, ADDR_MOD_AB_SEC1_SrcBCR_MASK + +#define ADDR_MOD_AB_SEC1_SrcBClear_ADDR32 13 +#define ADDR_MOD_AB_SEC1_SrcBClear_SHAMT 15 +#define ADDR_MOD_AB_SEC1_SrcBClear_MASK 0x8000 +#define ADDR_MOD_AB_SEC1_SrcBClear_RMW ADDR_MOD_AB_SEC1_SrcBClear_ADDR32, ADDR_MOD_AB_SEC1_SrcBClear_SHAMT, ADDR_MOD_AB_SEC1_SrcBClear_MASK + +#define ADDR_MOD_AB_SEC2_SrcAIncr_ADDR32 14 +#define ADDR_MOD_AB_SEC2_SrcAIncr_SHAMT 0 +#define ADDR_MOD_AB_SEC2_SrcAIncr_MASK 0x3f +#define ADDR_MOD_AB_SEC2_SrcAIncr_RMW ADDR_MOD_AB_SEC2_SrcAIncr_ADDR32, ADDR_MOD_AB_SEC2_SrcAIncr_SHAMT, ADDR_MOD_AB_SEC2_SrcAIncr_MASK + +#define ADDR_MOD_AB_SEC2_SrcACR_ADDR32 14 +#define ADDR_MOD_AB_SEC2_SrcACR_SHAMT 6 +#define ADDR_MOD_AB_SEC2_SrcACR_MASK 0x40 +#define ADDR_MOD_AB_SEC2_SrcACR_RMW ADDR_MOD_AB_SEC2_SrcACR_ADDR32, ADDR_MOD_AB_SEC2_SrcACR_SHAMT, ADDR_MOD_AB_SEC2_SrcACR_MASK + +#define ADDR_MOD_AB_SEC2_SrcAClear_ADDR32 14 +#define ADDR_MOD_AB_SEC2_SrcAClear_SHAMT 7 +#define ADDR_MOD_AB_SEC2_SrcAClear_MASK 0x80 +#define ADDR_MOD_AB_SEC2_SrcAClear_RMW ADDR_MOD_AB_SEC2_SrcAClear_ADDR32, ADDR_MOD_AB_SEC2_SrcAClear_SHAMT, ADDR_MOD_AB_SEC2_SrcAClear_MASK + +#define ADDR_MOD_AB_SEC2_SrcBIncr_ADDR32 14 +#define ADDR_MOD_AB_SEC2_SrcBIncr_SHAMT 8 +#define ADDR_MOD_AB_SEC2_SrcBIncr_MASK 0x3f00 +#define ADDR_MOD_AB_SEC2_SrcBIncr_RMW ADDR_MOD_AB_SEC2_SrcBIncr_ADDR32, ADDR_MOD_AB_SEC2_SrcBIncr_SHAMT, ADDR_MOD_AB_SEC2_SrcBIncr_MASK + +#define ADDR_MOD_AB_SEC2_SrcBCR_ADDR32 14 +#define ADDR_MOD_AB_SEC2_SrcBCR_SHAMT 14 +#define ADDR_MOD_AB_SEC2_SrcBCR_MASK 0x4000 +#define ADDR_MOD_AB_SEC2_SrcBCR_RMW ADDR_MOD_AB_SEC2_SrcBCR_ADDR32, ADDR_MOD_AB_SEC2_SrcBCR_SHAMT, ADDR_MOD_AB_SEC2_SrcBCR_MASK + +#define ADDR_MOD_AB_SEC2_SrcBClear_ADDR32 14 +#define ADDR_MOD_AB_SEC2_SrcBClear_SHAMT 15 +#define ADDR_MOD_AB_SEC2_SrcBClear_MASK 0x8000 +#define ADDR_MOD_AB_SEC2_SrcBClear_RMW ADDR_MOD_AB_SEC2_SrcBClear_ADDR32, ADDR_MOD_AB_SEC2_SrcBClear_SHAMT, ADDR_MOD_AB_SEC2_SrcBClear_MASK + +#define ADDR_MOD_AB_SEC3_SrcAIncr_ADDR32 15 +#define ADDR_MOD_AB_SEC3_SrcAIncr_SHAMT 0 +#define ADDR_MOD_AB_SEC3_SrcAIncr_MASK 0x3f +#define ADDR_MOD_AB_SEC3_SrcAIncr_RMW ADDR_MOD_AB_SEC3_SrcAIncr_ADDR32, ADDR_MOD_AB_SEC3_SrcAIncr_SHAMT, ADDR_MOD_AB_SEC3_SrcAIncr_MASK + +#define ADDR_MOD_AB_SEC3_SrcACR_ADDR32 15 +#define ADDR_MOD_AB_SEC3_SrcACR_SHAMT 6 +#define ADDR_MOD_AB_SEC3_SrcACR_MASK 0x40 +#define ADDR_MOD_AB_SEC3_SrcACR_RMW ADDR_MOD_AB_SEC3_SrcACR_ADDR32, ADDR_MOD_AB_SEC3_SrcACR_SHAMT, ADDR_MOD_AB_SEC3_SrcACR_MASK + +#define ADDR_MOD_AB_SEC3_SrcAClear_ADDR32 15 +#define ADDR_MOD_AB_SEC3_SrcAClear_SHAMT 7 +#define ADDR_MOD_AB_SEC3_SrcAClear_MASK 0x80 +#define ADDR_MOD_AB_SEC3_SrcAClear_RMW ADDR_MOD_AB_SEC3_SrcAClear_ADDR32, ADDR_MOD_AB_SEC3_SrcAClear_SHAMT, ADDR_MOD_AB_SEC3_SrcAClear_MASK + +#define ADDR_MOD_AB_SEC3_SrcBIncr_ADDR32 15 +#define ADDR_MOD_AB_SEC3_SrcBIncr_SHAMT 8 +#define ADDR_MOD_AB_SEC3_SrcBIncr_MASK 0x3f00 +#define ADDR_MOD_AB_SEC3_SrcBIncr_RMW ADDR_MOD_AB_SEC3_SrcBIncr_ADDR32, ADDR_MOD_AB_SEC3_SrcBIncr_SHAMT, ADDR_MOD_AB_SEC3_SrcBIncr_MASK + +#define ADDR_MOD_AB_SEC3_SrcBCR_ADDR32 15 +#define ADDR_MOD_AB_SEC3_SrcBCR_SHAMT 14 +#define ADDR_MOD_AB_SEC3_SrcBCR_MASK 0x4000 +#define 
ADDR_MOD_AB_SEC3_SrcBCR_RMW ADDR_MOD_AB_SEC3_SrcBCR_ADDR32, ADDR_MOD_AB_SEC3_SrcBCR_SHAMT, ADDR_MOD_AB_SEC3_SrcBCR_MASK + +#define ADDR_MOD_AB_SEC3_SrcBClear_ADDR32 15 +#define ADDR_MOD_AB_SEC3_SrcBClear_SHAMT 15 +#define ADDR_MOD_AB_SEC3_SrcBClear_MASK 0x8000 +#define ADDR_MOD_AB_SEC3_SrcBClear_RMW ADDR_MOD_AB_SEC3_SrcBClear_ADDR32, ADDR_MOD_AB_SEC3_SrcBClear_SHAMT, ADDR_MOD_AB_SEC3_SrcBClear_MASK + +#define ADDR_MOD_AB_SEC4_SrcAIncr_ADDR32 16 +#define ADDR_MOD_AB_SEC4_SrcAIncr_SHAMT 0 +#define ADDR_MOD_AB_SEC4_SrcAIncr_MASK 0x3f +#define ADDR_MOD_AB_SEC4_SrcAIncr_RMW ADDR_MOD_AB_SEC4_SrcAIncr_ADDR32, ADDR_MOD_AB_SEC4_SrcAIncr_SHAMT, ADDR_MOD_AB_SEC4_SrcAIncr_MASK + +#define ADDR_MOD_AB_SEC4_SrcACR_ADDR32 16 +#define ADDR_MOD_AB_SEC4_SrcACR_SHAMT 6 +#define ADDR_MOD_AB_SEC4_SrcACR_MASK 0x40 +#define ADDR_MOD_AB_SEC4_SrcACR_RMW ADDR_MOD_AB_SEC4_SrcACR_ADDR32, ADDR_MOD_AB_SEC4_SrcACR_SHAMT, ADDR_MOD_AB_SEC4_SrcACR_MASK + +#define ADDR_MOD_AB_SEC4_SrcAClear_ADDR32 16 +#define ADDR_MOD_AB_SEC4_SrcAClear_SHAMT 7 +#define ADDR_MOD_AB_SEC4_SrcAClear_MASK 0x80 +#define ADDR_MOD_AB_SEC4_SrcAClear_RMW ADDR_MOD_AB_SEC4_SrcAClear_ADDR32, ADDR_MOD_AB_SEC4_SrcAClear_SHAMT, ADDR_MOD_AB_SEC4_SrcAClear_MASK + +#define ADDR_MOD_AB_SEC4_SrcBIncr_ADDR32 16 +#define ADDR_MOD_AB_SEC4_SrcBIncr_SHAMT 8 +#define ADDR_MOD_AB_SEC4_SrcBIncr_MASK 0x3f00 +#define ADDR_MOD_AB_SEC4_SrcBIncr_RMW ADDR_MOD_AB_SEC4_SrcBIncr_ADDR32, ADDR_MOD_AB_SEC4_SrcBIncr_SHAMT, ADDR_MOD_AB_SEC4_SrcBIncr_MASK + +#define ADDR_MOD_AB_SEC4_SrcBCR_ADDR32 16 +#define ADDR_MOD_AB_SEC4_SrcBCR_SHAMT 14 +#define ADDR_MOD_AB_SEC4_SrcBCR_MASK 0x4000 +#define ADDR_MOD_AB_SEC4_SrcBCR_RMW ADDR_MOD_AB_SEC4_SrcBCR_ADDR32, ADDR_MOD_AB_SEC4_SrcBCR_SHAMT, ADDR_MOD_AB_SEC4_SrcBCR_MASK + +#define ADDR_MOD_AB_SEC4_SrcBClear_ADDR32 16 +#define ADDR_MOD_AB_SEC4_SrcBClear_SHAMT 15 +#define ADDR_MOD_AB_SEC4_SrcBClear_MASK 0x8000 +#define ADDR_MOD_AB_SEC4_SrcBClear_RMW ADDR_MOD_AB_SEC4_SrcBClear_ADDR32, ADDR_MOD_AB_SEC4_SrcBClear_SHAMT, ADDR_MOD_AB_SEC4_SrcBClear_MASK + +#define ADDR_MOD_AB_SEC5_SrcAIncr_ADDR32 17 +#define ADDR_MOD_AB_SEC5_SrcAIncr_SHAMT 0 +#define ADDR_MOD_AB_SEC5_SrcAIncr_MASK 0x3f +#define ADDR_MOD_AB_SEC5_SrcAIncr_RMW ADDR_MOD_AB_SEC5_SrcAIncr_ADDR32, ADDR_MOD_AB_SEC5_SrcAIncr_SHAMT, ADDR_MOD_AB_SEC5_SrcAIncr_MASK + +#define ADDR_MOD_AB_SEC5_SrcACR_ADDR32 17 +#define ADDR_MOD_AB_SEC5_SrcACR_SHAMT 6 +#define ADDR_MOD_AB_SEC5_SrcACR_MASK 0x40 +#define ADDR_MOD_AB_SEC5_SrcACR_RMW ADDR_MOD_AB_SEC5_SrcACR_ADDR32, ADDR_MOD_AB_SEC5_SrcACR_SHAMT, ADDR_MOD_AB_SEC5_SrcACR_MASK + +#define ADDR_MOD_AB_SEC5_SrcAClear_ADDR32 17 +#define ADDR_MOD_AB_SEC5_SrcAClear_SHAMT 7 +#define ADDR_MOD_AB_SEC5_SrcAClear_MASK 0x80 +#define ADDR_MOD_AB_SEC5_SrcAClear_RMW ADDR_MOD_AB_SEC5_SrcAClear_ADDR32, ADDR_MOD_AB_SEC5_SrcAClear_SHAMT, ADDR_MOD_AB_SEC5_SrcAClear_MASK + +#define ADDR_MOD_AB_SEC5_SrcBIncr_ADDR32 17 +#define ADDR_MOD_AB_SEC5_SrcBIncr_SHAMT 8 +#define ADDR_MOD_AB_SEC5_SrcBIncr_MASK 0x3f00 +#define ADDR_MOD_AB_SEC5_SrcBIncr_RMW ADDR_MOD_AB_SEC5_SrcBIncr_ADDR32, ADDR_MOD_AB_SEC5_SrcBIncr_SHAMT, ADDR_MOD_AB_SEC5_SrcBIncr_MASK + +#define ADDR_MOD_AB_SEC5_SrcBCR_ADDR32 17 +#define ADDR_MOD_AB_SEC5_SrcBCR_SHAMT 14 +#define ADDR_MOD_AB_SEC5_SrcBCR_MASK 0x4000 +#define ADDR_MOD_AB_SEC5_SrcBCR_RMW ADDR_MOD_AB_SEC5_SrcBCR_ADDR32, ADDR_MOD_AB_SEC5_SrcBCR_SHAMT, ADDR_MOD_AB_SEC5_SrcBCR_MASK + +#define ADDR_MOD_AB_SEC5_SrcBClear_ADDR32 17 +#define ADDR_MOD_AB_SEC5_SrcBClear_SHAMT 15 +#define ADDR_MOD_AB_SEC5_SrcBClear_MASK 0x8000 +#define ADDR_MOD_AB_SEC5_SrcBClear_RMW 
ADDR_MOD_AB_SEC5_SrcBClear_ADDR32, ADDR_MOD_AB_SEC5_SrcBClear_SHAMT, ADDR_MOD_AB_SEC5_SrcBClear_MASK + +#define ADDR_MOD_AB_SEC6_SrcAIncr_ADDR32 18 +#define ADDR_MOD_AB_SEC6_SrcAIncr_SHAMT 0 +#define ADDR_MOD_AB_SEC6_SrcAIncr_MASK 0x3f +#define ADDR_MOD_AB_SEC6_SrcAIncr_RMW ADDR_MOD_AB_SEC6_SrcAIncr_ADDR32, ADDR_MOD_AB_SEC6_SrcAIncr_SHAMT, ADDR_MOD_AB_SEC6_SrcAIncr_MASK + +#define ADDR_MOD_AB_SEC6_SrcACR_ADDR32 18 +#define ADDR_MOD_AB_SEC6_SrcACR_SHAMT 6 +#define ADDR_MOD_AB_SEC6_SrcACR_MASK 0x40 +#define ADDR_MOD_AB_SEC6_SrcACR_RMW ADDR_MOD_AB_SEC6_SrcACR_ADDR32, ADDR_MOD_AB_SEC6_SrcACR_SHAMT, ADDR_MOD_AB_SEC6_SrcACR_MASK + +#define ADDR_MOD_AB_SEC6_SrcAClear_ADDR32 18 +#define ADDR_MOD_AB_SEC6_SrcAClear_SHAMT 7 +#define ADDR_MOD_AB_SEC6_SrcAClear_MASK 0x80 +#define ADDR_MOD_AB_SEC6_SrcAClear_RMW ADDR_MOD_AB_SEC6_SrcAClear_ADDR32, ADDR_MOD_AB_SEC6_SrcAClear_SHAMT, ADDR_MOD_AB_SEC6_SrcAClear_MASK + +#define ADDR_MOD_AB_SEC6_SrcBIncr_ADDR32 18 +#define ADDR_MOD_AB_SEC6_SrcBIncr_SHAMT 8 +#define ADDR_MOD_AB_SEC6_SrcBIncr_MASK 0x3f00 +#define ADDR_MOD_AB_SEC6_SrcBIncr_RMW ADDR_MOD_AB_SEC6_SrcBIncr_ADDR32, ADDR_MOD_AB_SEC6_SrcBIncr_SHAMT, ADDR_MOD_AB_SEC6_SrcBIncr_MASK + +#define ADDR_MOD_AB_SEC6_SrcBCR_ADDR32 18 +#define ADDR_MOD_AB_SEC6_SrcBCR_SHAMT 14 +#define ADDR_MOD_AB_SEC6_SrcBCR_MASK 0x4000 +#define ADDR_MOD_AB_SEC6_SrcBCR_RMW ADDR_MOD_AB_SEC6_SrcBCR_ADDR32, ADDR_MOD_AB_SEC6_SrcBCR_SHAMT, ADDR_MOD_AB_SEC6_SrcBCR_MASK + +#define ADDR_MOD_AB_SEC6_SrcBClear_ADDR32 18 +#define ADDR_MOD_AB_SEC6_SrcBClear_SHAMT 15 +#define ADDR_MOD_AB_SEC6_SrcBClear_MASK 0x8000 +#define ADDR_MOD_AB_SEC6_SrcBClear_RMW ADDR_MOD_AB_SEC6_SrcBClear_ADDR32, ADDR_MOD_AB_SEC6_SrcBClear_SHAMT, ADDR_MOD_AB_SEC6_SrcBClear_MASK + +#define ADDR_MOD_AB_SEC7_SrcAIncr_ADDR32 19 +#define ADDR_MOD_AB_SEC7_SrcAIncr_SHAMT 0 +#define ADDR_MOD_AB_SEC7_SrcAIncr_MASK 0x3f +#define ADDR_MOD_AB_SEC7_SrcAIncr_RMW ADDR_MOD_AB_SEC7_SrcAIncr_ADDR32, ADDR_MOD_AB_SEC7_SrcAIncr_SHAMT, ADDR_MOD_AB_SEC7_SrcAIncr_MASK + +#define ADDR_MOD_AB_SEC7_SrcACR_ADDR32 19 +#define ADDR_MOD_AB_SEC7_SrcACR_SHAMT 6 +#define ADDR_MOD_AB_SEC7_SrcACR_MASK 0x40 +#define ADDR_MOD_AB_SEC7_SrcACR_RMW ADDR_MOD_AB_SEC7_SrcACR_ADDR32, ADDR_MOD_AB_SEC7_SrcACR_SHAMT, ADDR_MOD_AB_SEC7_SrcACR_MASK + +#define ADDR_MOD_AB_SEC7_SrcAClear_ADDR32 19 +#define ADDR_MOD_AB_SEC7_SrcAClear_SHAMT 7 +#define ADDR_MOD_AB_SEC7_SrcAClear_MASK 0x80 +#define ADDR_MOD_AB_SEC7_SrcAClear_RMW ADDR_MOD_AB_SEC7_SrcAClear_ADDR32, ADDR_MOD_AB_SEC7_SrcAClear_SHAMT, ADDR_MOD_AB_SEC7_SrcAClear_MASK + +#define ADDR_MOD_AB_SEC7_SrcBIncr_ADDR32 19 +#define ADDR_MOD_AB_SEC7_SrcBIncr_SHAMT 8 +#define ADDR_MOD_AB_SEC7_SrcBIncr_MASK 0x3f00 +#define ADDR_MOD_AB_SEC7_SrcBIncr_RMW ADDR_MOD_AB_SEC7_SrcBIncr_ADDR32, ADDR_MOD_AB_SEC7_SrcBIncr_SHAMT, ADDR_MOD_AB_SEC7_SrcBIncr_MASK + +#define ADDR_MOD_AB_SEC7_SrcBCR_ADDR32 19 +#define ADDR_MOD_AB_SEC7_SrcBCR_SHAMT 14 +#define ADDR_MOD_AB_SEC7_SrcBCR_MASK 0x4000 +#define ADDR_MOD_AB_SEC7_SrcBCR_RMW ADDR_MOD_AB_SEC7_SrcBCR_ADDR32, ADDR_MOD_AB_SEC7_SrcBCR_SHAMT, ADDR_MOD_AB_SEC7_SrcBCR_MASK + +#define ADDR_MOD_AB_SEC7_SrcBClear_ADDR32 19 +#define ADDR_MOD_AB_SEC7_SrcBClear_SHAMT 15 +#define ADDR_MOD_AB_SEC7_SrcBClear_MASK 0x8000 +#define ADDR_MOD_AB_SEC7_SrcBClear_RMW ADDR_MOD_AB_SEC7_SrcBClear_ADDR32, ADDR_MOD_AB_SEC7_SrcBClear_SHAMT, ADDR_MOD_AB_SEC7_SrcBClear_MASK + +#define ADDR_MOD_AB2_SEC0_SrcAIncr_ADDR32 20 +#define ADDR_MOD_AB2_SEC0_SrcAIncr_SHAMT 0 +#define ADDR_MOD_AB2_SEC0_SrcAIncr_MASK 0x1 +#define ADDR_MOD_AB2_SEC0_SrcAIncr_RMW 
ADDR_MOD_AB2_SEC0_SrcAIncr_ADDR32, ADDR_MOD_AB2_SEC0_SrcAIncr_SHAMT, ADDR_MOD_AB2_SEC0_SrcAIncr_MASK + +#define ADDR_MOD_AB2_SEC0_SrcBIncr_ADDR32 20 +#define ADDR_MOD_AB2_SEC0_SrcBIncr_SHAMT 1 +#define ADDR_MOD_AB2_SEC0_SrcBIncr_MASK 0x2 +#define ADDR_MOD_AB2_SEC0_SrcBIncr_RMW ADDR_MOD_AB2_SEC0_SrcBIncr_ADDR32, ADDR_MOD_AB2_SEC0_SrcBIncr_SHAMT, ADDR_MOD_AB2_SEC0_SrcBIncr_MASK + +#define ADDR_MOD_AB2_SEC1_SrcAIncr_ADDR32 21 +#define ADDR_MOD_AB2_SEC1_SrcAIncr_SHAMT 0 +#define ADDR_MOD_AB2_SEC1_SrcAIncr_MASK 0x1 +#define ADDR_MOD_AB2_SEC1_SrcAIncr_RMW ADDR_MOD_AB2_SEC1_SrcAIncr_ADDR32, ADDR_MOD_AB2_SEC1_SrcAIncr_SHAMT, ADDR_MOD_AB2_SEC1_SrcAIncr_MASK + +#define ADDR_MOD_AB2_SEC1_SrcBIncr_ADDR32 21 +#define ADDR_MOD_AB2_SEC1_SrcBIncr_SHAMT 1 +#define ADDR_MOD_AB2_SEC1_SrcBIncr_MASK 0x2 +#define ADDR_MOD_AB2_SEC1_SrcBIncr_RMW ADDR_MOD_AB2_SEC1_SrcBIncr_ADDR32, ADDR_MOD_AB2_SEC1_SrcBIncr_SHAMT, ADDR_MOD_AB2_SEC1_SrcBIncr_MASK + +#define ADDR_MOD_AB2_SEC2_SrcAIncr_ADDR32 22 +#define ADDR_MOD_AB2_SEC2_SrcAIncr_SHAMT 0 +#define ADDR_MOD_AB2_SEC2_SrcAIncr_MASK 0x1 +#define ADDR_MOD_AB2_SEC2_SrcAIncr_RMW ADDR_MOD_AB2_SEC2_SrcAIncr_ADDR32, ADDR_MOD_AB2_SEC2_SrcAIncr_SHAMT, ADDR_MOD_AB2_SEC2_SrcAIncr_MASK + +#define ADDR_MOD_AB2_SEC2_SrcBIncr_ADDR32 22 +#define ADDR_MOD_AB2_SEC2_SrcBIncr_SHAMT 1 +#define ADDR_MOD_AB2_SEC2_SrcBIncr_MASK 0x2 +#define ADDR_MOD_AB2_SEC2_SrcBIncr_RMW ADDR_MOD_AB2_SEC2_SrcBIncr_ADDR32, ADDR_MOD_AB2_SEC2_SrcBIncr_SHAMT, ADDR_MOD_AB2_SEC2_SrcBIncr_MASK + +#define ADDR_MOD_AB2_SEC3_SrcAIncr_ADDR32 23 +#define ADDR_MOD_AB2_SEC3_SrcAIncr_SHAMT 0 +#define ADDR_MOD_AB2_SEC3_SrcAIncr_MASK 0x1 +#define ADDR_MOD_AB2_SEC3_SrcAIncr_RMW ADDR_MOD_AB2_SEC3_SrcAIncr_ADDR32, ADDR_MOD_AB2_SEC3_SrcAIncr_SHAMT, ADDR_MOD_AB2_SEC3_SrcAIncr_MASK + +#define ADDR_MOD_AB2_SEC3_SrcBIncr_ADDR32 23 +#define ADDR_MOD_AB2_SEC3_SrcBIncr_SHAMT 1 +#define ADDR_MOD_AB2_SEC3_SrcBIncr_MASK 0x2 +#define ADDR_MOD_AB2_SEC3_SrcBIncr_RMW ADDR_MOD_AB2_SEC3_SrcBIncr_ADDR32, ADDR_MOD_AB2_SEC3_SrcBIncr_SHAMT, ADDR_MOD_AB2_SEC3_SrcBIncr_MASK + +#define ADDR_MOD_AB2_SEC4_SrcAIncr_ADDR32 24 +#define ADDR_MOD_AB2_SEC4_SrcAIncr_SHAMT 0 +#define ADDR_MOD_AB2_SEC4_SrcAIncr_MASK 0x1 +#define ADDR_MOD_AB2_SEC4_SrcAIncr_RMW ADDR_MOD_AB2_SEC4_SrcAIncr_ADDR32, ADDR_MOD_AB2_SEC4_SrcAIncr_SHAMT, ADDR_MOD_AB2_SEC4_SrcAIncr_MASK + +#define ADDR_MOD_AB2_SEC4_SrcBIncr_ADDR32 24 +#define ADDR_MOD_AB2_SEC4_SrcBIncr_SHAMT 1 +#define ADDR_MOD_AB2_SEC4_SrcBIncr_MASK 0x2 +#define ADDR_MOD_AB2_SEC4_SrcBIncr_RMW ADDR_MOD_AB2_SEC4_SrcBIncr_ADDR32, ADDR_MOD_AB2_SEC4_SrcBIncr_SHAMT, ADDR_MOD_AB2_SEC4_SrcBIncr_MASK + +#define ADDR_MOD_AB2_SEC5_SrcAIncr_ADDR32 25 +#define ADDR_MOD_AB2_SEC5_SrcAIncr_SHAMT 0 +#define ADDR_MOD_AB2_SEC5_SrcAIncr_MASK 0x1 +#define ADDR_MOD_AB2_SEC5_SrcAIncr_RMW ADDR_MOD_AB2_SEC5_SrcAIncr_ADDR32, ADDR_MOD_AB2_SEC5_SrcAIncr_SHAMT, ADDR_MOD_AB2_SEC5_SrcAIncr_MASK + +#define ADDR_MOD_AB2_SEC5_SrcBIncr_ADDR32 25 +#define ADDR_MOD_AB2_SEC5_SrcBIncr_SHAMT 1 +#define ADDR_MOD_AB2_SEC5_SrcBIncr_MASK 0x2 +#define ADDR_MOD_AB2_SEC5_SrcBIncr_RMW ADDR_MOD_AB2_SEC5_SrcBIncr_ADDR32, ADDR_MOD_AB2_SEC5_SrcBIncr_SHAMT, ADDR_MOD_AB2_SEC5_SrcBIncr_MASK + +#define ADDR_MOD_AB2_SEC6_SrcAIncr_ADDR32 26 +#define ADDR_MOD_AB2_SEC6_SrcAIncr_SHAMT 0 +#define ADDR_MOD_AB2_SEC6_SrcAIncr_MASK 0x1 +#define ADDR_MOD_AB2_SEC6_SrcAIncr_RMW ADDR_MOD_AB2_SEC6_SrcAIncr_ADDR32, ADDR_MOD_AB2_SEC6_SrcAIncr_SHAMT, ADDR_MOD_AB2_SEC6_SrcAIncr_MASK + +#define ADDR_MOD_AB2_SEC6_SrcBIncr_ADDR32 26 +#define ADDR_MOD_AB2_SEC6_SrcBIncr_SHAMT 1 +#define 
ADDR_MOD_AB2_SEC6_SrcBIncr_MASK 0x2 +#define ADDR_MOD_AB2_SEC6_SrcBIncr_RMW ADDR_MOD_AB2_SEC6_SrcBIncr_ADDR32, ADDR_MOD_AB2_SEC6_SrcBIncr_SHAMT, ADDR_MOD_AB2_SEC6_SrcBIncr_MASK + +#define ADDR_MOD_AB2_SEC7_SrcAIncr_ADDR32 27 +#define ADDR_MOD_AB2_SEC7_SrcAIncr_SHAMT 0 +#define ADDR_MOD_AB2_SEC7_SrcAIncr_MASK 0x1 +#define ADDR_MOD_AB2_SEC7_SrcAIncr_RMW ADDR_MOD_AB2_SEC7_SrcAIncr_ADDR32, ADDR_MOD_AB2_SEC7_SrcAIncr_SHAMT, ADDR_MOD_AB2_SEC7_SrcAIncr_MASK + +#define ADDR_MOD_AB2_SEC7_SrcBIncr_ADDR32 27 +#define ADDR_MOD_AB2_SEC7_SrcBIncr_SHAMT 1 +#define ADDR_MOD_AB2_SEC7_SrcBIncr_MASK 0x2 +#define ADDR_MOD_AB2_SEC7_SrcBIncr_RMW ADDR_MOD_AB2_SEC7_SrcBIncr_ADDR32, ADDR_MOD_AB2_SEC7_SrcBIncr_SHAMT, ADDR_MOD_AB2_SEC7_SrcBIncr_MASK + +#define ADDR_MOD_DST_SEC0_DestIncr_ADDR32 28 +#define ADDR_MOD_DST_SEC0_DestIncr_SHAMT 0 +#define ADDR_MOD_DST_SEC0_DestIncr_MASK 0x3ff +#define ADDR_MOD_DST_SEC0_DestIncr_RMW ADDR_MOD_DST_SEC0_DestIncr_ADDR32, ADDR_MOD_DST_SEC0_DestIncr_SHAMT, ADDR_MOD_DST_SEC0_DestIncr_MASK + +#define ADDR_MOD_DST_SEC0_DestCR_ADDR32 28 +#define ADDR_MOD_DST_SEC0_DestCR_SHAMT 10 +#define ADDR_MOD_DST_SEC0_DestCR_MASK 0x400 +#define ADDR_MOD_DST_SEC0_DestCR_RMW ADDR_MOD_DST_SEC0_DestCR_ADDR32, ADDR_MOD_DST_SEC0_DestCR_SHAMT, ADDR_MOD_DST_SEC0_DestCR_MASK + +#define ADDR_MOD_DST_SEC0_DestClear_ADDR32 28 +#define ADDR_MOD_DST_SEC0_DestClear_SHAMT 11 +#define ADDR_MOD_DST_SEC0_DestClear_MASK 0x800 +#define ADDR_MOD_DST_SEC0_DestClear_RMW ADDR_MOD_DST_SEC0_DestClear_ADDR32, ADDR_MOD_DST_SEC0_DestClear_SHAMT, ADDR_MOD_DST_SEC0_DestClear_MASK + +#define ADDR_MOD_DST_SEC0_DestCToCR_ADDR32 28 +#define ADDR_MOD_DST_SEC0_DestCToCR_SHAMT 12 +#define ADDR_MOD_DST_SEC0_DestCToCR_MASK 0x1000 +#define ADDR_MOD_DST_SEC0_DestCToCR_RMW ADDR_MOD_DST_SEC0_DestCToCR_ADDR32, ADDR_MOD_DST_SEC0_DestCToCR_SHAMT, ADDR_MOD_DST_SEC0_DestCToCR_MASK + +#define ADDR_MOD_DST_SEC0_FidelityIncr_ADDR32 28 +#define ADDR_MOD_DST_SEC0_FidelityIncr_SHAMT 13 +#define ADDR_MOD_DST_SEC0_FidelityIncr_MASK 0x6000 +#define ADDR_MOD_DST_SEC0_FidelityIncr_RMW ADDR_MOD_DST_SEC0_FidelityIncr_ADDR32, ADDR_MOD_DST_SEC0_FidelityIncr_SHAMT, ADDR_MOD_DST_SEC0_FidelityIncr_MASK + +#define ADDR_MOD_DST_SEC0_FidelityClear_ADDR32 28 +#define ADDR_MOD_DST_SEC0_FidelityClear_SHAMT 15 +#define ADDR_MOD_DST_SEC0_FidelityClear_MASK 0x8000 +#define ADDR_MOD_DST_SEC0_FidelityClear_RMW ADDR_MOD_DST_SEC0_FidelityClear_ADDR32, ADDR_MOD_DST_SEC0_FidelityClear_SHAMT, ADDR_MOD_DST_SEC0_FidelityClear_MASK + +#define ADDR_MOD_DST_SEC1_DestIncr_ADDR32 29 +#define ADDR_MOD_DST_SEC1_DestIncr_SHAMT 0 +#define ADDR_MOD_DST_SEC1_DestIncr_MASK 0x3ff +#define ADDR_MOD_DST_SEC1_DestIncr_RMW ADDR_MOD_DST_SEC1_DestIncr_ADDR32, ADDR_MOD_DST_SEC1_DestIncr_SHAMT, ADDR_MOD_DST_SEC1_DestIncr_MASK + +#define ADDR_MOD_DST_SEC1_DestCR_ADDR32 29 +#define ADDR_MOD_DST_SEC1_DestCR_SHAMT 10 +#define ADDR_MOD_DST_SEC1_DestCR_MASK 0x400 +#define ADDR_MOD_DST_SEC1_DestCR_RMW ADDR_MOD_DST_SEC1_DestCR_ADDR32, ADDR_MOD_DST_SEC1_DestCR_SHAMT, ADDR_MOD_DST_SEC1_DestCR_MASK + +#define ADDR_MOD_DST_SEC1_DestClear_ADDR32 29 +#define ADDR_MOD_DST_SEC1_DestClear_SHAMT 11 +#define ADDR_MOD_DST_SEC1_DestClear_MASK 0x800 +#define ADDR_MOD_DST_SEC1_DestClear_RMW ADDR_MOD_DST_SEC1_DestClear_ADDR32, ADDR_MOD_DST_SEC1_DestClear_SHAMT, ADDR_MOD_DST_SEC1_DestClear_MASK + +#define ADDR_MOD_DST_SEC1_DestCToCR_ADDR32 29 +#define ADDR_MOD_DST_SEC1_DestCToCR_SHAMT 12 +#define ADDR_MOD_DST_SEC1_DestCToCR_MASK 0x1000 +#define ADDR_MOD_DST_SEC1_DestCToCR_RMW ADDR_MOD_DST_SEC1_DestCToCR_ADDR32, 
ADDR_MOD_DST_SEC1_DestCToCR_SHAMT, ADDR_MOD_DST_SEC1_DestCToCR_MASK + +#define ADDR_MOD_DST_SEC1_FidelityIncr_ADDR32 29 +#define ADDR_MOD_DST_SEC1_FidelityIncr_SHAMT 13 +#define ADDR_MOD_DST_SEC1_FidelityIncr_MASK 0x6000 +#define ADDR_MOD_DST_SEC1_FidelityIncr_RMW ADDR_MOD_DST_SEC1_FidelityIncr_ADDR32, ADDR_MOD_DST_SEC1_FidelityIncr_SHAMT, ADDR_MOD_DST_SEC1_FidelityIncr_MASK + +#define ADDR_MOD_DST_SEC1_FidelityClear_ADDR32 29 +#define ADDR_MOD_DST_SEC1_FidelityClear_SHAMT 15 +#define ADDR_MOD_DST_SEC1_FidelityClear_MASK 0x8000 +#define ADDR_MOD_DST_SEC1_FidelityClear_RMW ADDR_MOD_DST_SEC1_FidelityClear_ADDR32, ADDR_MOD_DST_SEC1_FidelityClear_SHAMT, ADDR_MOD_DST_SEC1_FidelityClear_MASK + +#define ADDR_MOD_DST_SEC2_DestIncr_ADDR32 30 +#define ADDR_MOD_DST_SEC2_DestIncr_SHAMT 0 +#define ADDR_MOD_DST_SEC2_DestIncr_MASK 0x3ff +#define ADDR_MOD_DST_SEC2_DestIncr_RMW ADDR_MOD_DST_SEC2_DestIncr_ADDR32, ADDR_MOD_DST_SEC2_DestIncr_SHAMT, ADDR_MOD_DST_SEC2_DestIncr_MASK + +#define ADDR_MOD_DST_SEC2_DestCR_ADDR32 30 +#define ADDR_MOD_DST_SEC2_DestCR_SHAMT 10 +#define ADDR_MOD_DST_SEC2_DestCR_MASK 0x400 +#define ADDR_MOD_DST_SEC2_DestCR_RMW ADDR_MOD_DST_SEC2_DestCR_ADDR32, ADDR_MOD_DST_SEC2_DestCR_SHAMT, ADDR_MOD_DST_SEC2_DestCR_MASK + +#define ADDR_MOD_DST_SEC2_DestClear_ADDR32 30 +#define ADDR_MOD_DST_SEC2_DestClear_SHAMT 11 +#define ADDR_MOD_DST_SEC2_DestClear_MASK 0x800 +#define ADDR_MOD_DST_SEC2_DestClear_RMW ADDR_MOD_DST_SEC2_DestClear_ADDR32, ADDR_MOD_DST_SEC2_DestClear_SHAMT, ADDR_MOD_DST_SEC2_DestClear_MASK + +#define ADDR_MOD_DST_SEC2_DestCToCR_ADDR32 30 +#define ADDR_MOD_DST_SEC2_DestCToCR_SHAMT 12 +#define ADDR_MOD_DST_SEC2_DestCToCR_MASK 0x1000 +#define ADDR_MOD_DST_SEC2_DestCToCR_RMW ADDR_MOD_DST_SEC2_DestCToCR_ADDR32, ADDR_MOD_DST_SEC2_DestCToCR_SHAMT, ADDR_MOD_DST_SEC2_DestCToCR_MASK + +#define ADDR_MOD_DST_SEC2_FidelityIncr_ADDR32 30 +#define ADDR_MOD_DST_SEC2_FidelityIncr_SHAMT 13 +#define ADDR_MOD_DST_SEC2_FidelityIncr_MASK 0x6000 +#define ADDR_MOD_DST_SEC2_FidelityIncr_RMW ADDR_MOD_DST_SEC2_FidelityIncr_ADDR32, ADDR_MOD_DST_SEC2_FidelityIncr_SHAMT, ADDR_MOD_DST_SEC2_FidelityIncr_MASK + +#define ADDR_MOD_DST_SEC2_FidelityClear_ADDR32 30 +#define ADDR_MOD_DST_SEC2_FidelityClear_SHAMT 15 +#define ADDR_MOD_DST_SEC2_FidelityClear_MASK 0x8000 +#define ADDR_MOD_DST_SEC2_FidelityClear_RMW ADDR_MOD_DST_SEC2_FidelityClear_ADDR32, ADDR_MOD_DST_SEC2_FidelityClear_SHAMT, ADDR_MOD_DST_SEC2_FidelityClear_MASK + +#define ADDR_MOD_DST_SEC3_DestIncr_ADDR32 31 +#define ADDR_MOD_DST_SEC3_DestIncr_SHAMT 0 +#define ADDR_MOD_DST_SEC3_DestIncr_MASK 0x3ff +#define ADDR_MOD_DST_SEC3_DestIncr_RMW ADDR_MOD_DST_SEC3_DestIncr_ADDR32, ADDR_MOD_DST_SEC3_DestIncr_SHAMT, ADDR_MOD_DST_SEC3_DestIncr_MASK + +#define ADDR_MOD_DST_SEC3_DestCR_ADDR32 31 +#define ADDR_MOD_DST_SEC3_DestCR_SHAMT 10 +#define ADDR_MOD_DST_SEC3_DestCR_MASK 0x400 +#define ADDR_MOD_DST_SEC3_DestCR_RMW ADDR_MOD_DST_SEC3_DestCR_ADDR32, ADDR_MOD_DST_SEC3_DestCR_SHAMT, ADDR_MOD_DST_SEC3_DestCR_MASK + +#define ADDR_MOD_DST_SEC3_DestClear_ADDR32 31 +#define ADDR_MOD_DST_SEC3_DestClear_SHAMT 11 +#define ADDR_MOD_DST_SEC3_DestClear_MASK 0x800 +#define ADDR_MOD_DST_SEC3_DestClear_RMW ADDR_MOD_DST_SEC3_DestClear_ADDR32, ADDR_MOD_DST_SEC3_DestClear_SHAMT, ADDR_MOD_DST_SEC3_DestClear_MASK + +#define ADDR_MOD_DST_SEC3_DestCToCR_ADDR32 31 +#define ADDR_MOD_DST_SEC3_DestCToCR_SHAMT 12 +#define ADDR_MOD_DST_SEC3_DestCToCR_MASK 0x1000 +#define ADDR_MOD_DST_SEC3_DestCToCR_RMW ADDR_MOD_DST_SEC3_DestCToCR_ADDR32, ADDR_MOD_DST_SEC3_DestCToCR_SHAMT, 
ADDR_MOD_DST_SEC3_DestCToCR_MASK + +#define ADDR_MOD_DST_SEC3_FidelityIncr_ADDR32 31 +#define ADDR_MOD_DST_SEC3_FidelityIncr_SHAMT 13 +#define ADDR_MOD_DST_SEC3_FidelityIncr_MASK 0x6000 +#define ADDR_MOD_DST_SEC3_FidelityIncr_RMW ADDR_MOD_DST_SEC3_FidelityIncr_ADDR32, ADDR_MOD_DST_SEC3_FidelityIncr_SHAMT, ADDR_MOD_DST_SEC3_FidelityIncr_MASK + +#define ADDR_MOD_DST_SEC3_FidelityClear_ADDR32 31 +#define ADDR_MOD_DST_SEC3_FidelityClear_SHAMT 15 +#define ADDR_MOD_DST_SEC3_FidelityClear_MASK 0x8000 +#define ADDR_MOD_DST_SEC3_FidelityClear_RMW ADDR_MOD_DST_SEC3_FidelityClear_ADDR32, ADDR_MOD_DST_SEC3_FidelityClear_SHAMT, ADDR_MOD_DST_SEC3_FidelityClear_MASK + +#define ADDR_MOD_DST_SEC4_DestIncr_ADDR32 32 +#define ADDR_MOD_DST_SEC4_DestIncr_SHAMT 0 +#define ADDR_MOD_DST_SEC4_DestIncr_MASK 0x3ff +#define ADDR_MOD_DST_SEC4_DestIncr_RMW ADDR_MOD_DST_SEC4_DestIncr_ADDR32, ADDR_MOD_DST_SEC4_DestIncr_SHAMT, ADDR_MOD_DST_SEC4_DestIncr_MASK + +#define ADDR_MOD_DST_SEC4_DestCR_ADDR32 32 +#define ADDR_MOD_DST_SEC4_DestCR_SHAMT 10 +#define ADDR_MOD_DST_SEC4_DestCR_MASK 0x400 +#define ADDR_MOD_DST_SEC4_DestCR_RMW ADDR_MOD_DST_SEC4_DestCR_ADDR32, ADDR_MOD_DST_SEC4_DestCR_SHAMT, ADDR_MOD_DST_SEC4_DestCR_MASK + +#define ADDR_MOD_DST_SEC4_DestClear_ADDR32 32 +#define ADDR_MOD_DST_SEC4_DestClear_SHAMT 11 +#define ADDR_MOD_DST_SEC4_DestClear_MASK 0x800 +#define ADDR_MOD_DST_SEC4_DestClear_RMW ADDR_MOD_DST_SEC4_DestClear_ADDR32, ADDR_MOD_DST_SEC4_DestClear_SHAMT, ADDR_MOD_DST_SEC4_DestClear_MASK + +#define ADDR_MOD_DST_SEC4_DestCToCR_ADDR32 32 +#define ADDR_MOD_DST_SEC4_DestCToCR_SHAMT 12 +#define ADDR_MOD_DST_SEC4_DestCToCR_MASK 0x1000 +#define ADDR_MOD_DST_SEC4_DestCToCR_RMW ADDR_MOD_DST_SEC4_DestCToCR_ADDR32, ADDR_MOD_DST_SEC4_DestCToCR_SHAMT, ADDR_MOD_DST_SEC4_DestCToCR_MASK + +#define ADDR_MOD_DST_SEC4_FidelityIncr_ADDR32 32 +#define ADDR_MOD_DST_SEC4_FidelityIncr_SHAMT 13 +#define ADDR_MOD_DST_SEC4_FidelityIncr_MASK 0x6000 +#define ADDR_MOD_DST_SEC4_FidelityIncr_RMW ADDR_MOD_DST_SEC4_FidelityIncr_ADDR32, ADDR_MOD_DST_SEC4_FidelityIncr_SHAMT, ADDR_MOD_DST_SEC4_FidelityIncr_MASK + +#define ADDR_MOD_DST_SEC4_FidelityClear_ADDR32 32 +#define ADDR_MOD_DST_SEC4_FidelityClear_SHAMT 15 +#define ADDR_MOD_DST_SEC4_FidelityClear_MASK 0x8000 +#define ADDR_MOD_DST_SEC4_FidelityClear_RMW ADDR_MOD_DST_SEC4_FidelityClear_ADDR32, ADDR_MOD_DST_SEC4_FidelityClear_SHAMT, ADDR_MOD_DST_SEC4_FidelityClear_MASK + +#define ADDR_MOD_DST_SEC5_DestIncr_ADDR32 33 +#define ADDR_MOD_DST_SEC5_DestIncr_SHAMT 0 +#define ADDR_MOD_DST_SEC5_DestIncr_MASK 0x3ff +#define ADDR_MOD_DST_SEC5_DestIncr_RMW ADDR_MOD_DST_SEC5_DestIncr_ADDR32, ADDR_MOD_DST_SEC5_DestIncr_SHAMT, ADDR_MOD_DST_SEC5_DestIncr_MASK + +#define ADDR_MOD_DST_SEC5_DestCR_ADDR32 33 +#define ADDR_MOD_DST_SEC5_DestCR_SHAMT 10 +#define ADDR_MOD_DST_SEC5_DestCR_MASK 0x400 +#define ADDR_MOD_DST_SEC5_DestCR_RMW ADDR_MOD_DST_SEC5_DestCR_ADDR32, ADDR_MOD_DST_SEC5_DestCR_SHAMT, ADDR_MOD_DST_SEC5_DestCR_MASK + +#define ADDR_MOD_DST_SEC5_DestClear_ADDR32 33 +#define ADDR_MOD_DST_SEC5_DestClear_SHAMT 11 +#define ADDR_MOD_DST_SEC5_DestClear_MASK 0x800 +#define ADDR_MOD_DST_SEC5_DestClear_RMW ADDR_MOD_DST_SEC5_DestClear_ADDR32, ADDR_MOD_DST_SEC5_DestClear_SHAMT, ADDR_MOD_DST_SEC5_DestClear_MASK + +#define ADDR_MOD_DST_SEC5_DestCToCR_ADDR32 33 +#define ADDR_MOD_DST_SEC5_DestCToCR_SHAMT 12 +#define ADDR_MOD_DST_SEC5_DestCToCR_MASK 0x1000 +#define ADDR_MOD_DST_SEC5_DestCToCR_RMW ADDR_MOD_DST_SEC5_DestCToCR_ADDR32, ADDR_MOD_DST_SEC5_DestCToCR_SHAMT, ADDR_MOD_DST_SEC5_DestCToCR_MASK + +#define 
ADDR_MOD_DST_SEC5_FidelityIncr_ADDR32 33 +#define ADDR_MOD_DST_SEC5_FidelityIncr_SHAMT 13 +#define ADDR_MOD_DST_SEC5_FidelityIncr_MASK 0x6000 +#define ADDR_MOD_DST_SEC5_FidelityIncr_RMW ADDR_MOD_DST_SEC5_FidelityIncr_ADDR32, ADDR_MOD_DST_SEC5_FidelityIncr_SHAMT, ADDR_MOD_DST_SEC5_FidelityIncr_MASK + +#define ADDR_MOD_DST_SEC5_FidelityClear_ADDR32 33 +#define ADDR_MOD_DST_SEC5_FidelityClear_SHAMT 15 +#define ADDR_MOD_DST_SEC5_FidelityClear_MASK 0x8000 +#define ADDR_MOD_DST_SEC5_FidelityClear_RMW ADDR_MOD_DST_SEC5_FidelityClear_ADDR32, ADDR_MOD_DST_SEC5_FidelityClear_SHAMT, ADDR_MOD_DST_SEC5_FidelityClear_MASK + +#define ADDR_MOD_DST_SEC6_DestIncr_ADDR32 34 +#define ADDR_MOD_DST_SEC6_DestIncr_SHAMT 0 +#define ADDR_MOD_DST_SEC6_DestIncr_MASK 0x3ff +#define ADDR_MOD_DST_SEC6_DestIncr_RMW ADDR_MOD_DST_SEC6_DestIncr_ADDR32, ADDR_MOD_DST_SEC6_DestIncr_SHAMT, ADDR_MOD_DST_SEC6_DestIncr_MASK + +#define ADDR_MOD_DST_SEC6_DestCR_ADDR32 34 +#define ADDR_MOD_DST_SEC6_DestCR_SHAMT 10 +#define ADDR_MOD_DST_SEC6_DestCR_MASK 0x400 +#define ADDR_MOD_DST_SEC6_DestCR_RMW ADDR_MOD_DST_SEC6_DestCR_ADDR32, ADDR_MOD_DST_SEC6_DestCR_SHAMT, ADDR_MOD_DST_SEC6_DestCR_MASK + +#define ADDR_MOD_DST_SEC6_DestClear_ADDR32 34 +#define ADDR_MOD_DST_SEC6_DestClear_SHAMT 11 +#define ADDR_MOD_DST_SEC6_DestClear_MASK 0x800 +#define ADDR_MOD_DST_SEC6_DestClear_RMW ADDR_MOD_DST_SEC6_DestClear_ADDR32, ADDR_MOD_DST_SEC6_DestClear_SHAMT, ADDR_MOD_DST_SEC6_DestClear_MASK + +#define ADDR_MOD_DST_SEC6_DestCToCR_ADDR32 34 +#define ADDR_MOD_DST_SEC6_DestCToCR_SHAMT 12 +#define ADDR_MOD_DST_SEC6_DestCToCR_MASK 0x1000 +#define ADDR_MOD_DST_SEC6_DestCToCR_RMW ADDR_MOD_DST_SEC6_DestCToCR_ADDR32, ADDR_MOD_DST_SEC6_DestCToCR_SHAMT, ADDR_MOD_DST_SEC6_DestCToCR_MASK + +#define ADDR_MOD_DST_SEC6_FidelityIncr_ADDR32 34 +#define ADDR_MOD_DST_SEC6_FidelityIncr_SHAMT 13 +#define ADDR_MOD_DST_SEC6_FidelityIncr_MASK 0x6000 +#define ADDR_MOD_DST_SEC6_FidelityIncr_RMW ADDR_MOD_DST_SEC6_FidelityIncr_ADDR32, ADDR_MOD_DST_SEC6_FidelityIncr_SHAMT, ADDR_MOD_DST_SEC6_FidelityIncr_MASK + +#define ADDR_MOD_DST_SEC6_FidelityClear_ADDR32 34 +#define ADDR_MOD_DST_SEC6_FidelityClear_SHAMT 15 +#define ADDR_MOD_DST_SEC6_FidelityClear_MASK 0x8000 +#define ADDR_MOD_DST_SEC6_FidelityClear_RMW ADDR_MOD_DST_SEC6_FidelityClear_ADDR32, ADDR_MOD_DST_SEC6_FidelityClear_SHAMT, ADDR_MOD_DST_SEC6_FidelityClear_MASK + +#define ADDR_MOD_DST_SEC7_DestIncr_ADDR32 35 +#define ADDR_MOD_DST_SEC7_DestIncr_SHAMT 0 +#define ADDR_MOD_DST_SEC7_DestIncr_MASK 0x3ff +#define ADDR_MOD_DST_SEC7_DestIncr_RMW ADDR_MOD_DST_SEC7_DestIncr_ADDR32, ADDR_MOD_DST_SEC7_DestIncr_SHAMT, ADDR_MOD_DST_SEC7_DestIncr_MASK + +#define ADDR_MOD_DST_SEC7_DestCR_ADDR32 35 +#define ADDR_MOD_DST_SEC7_DestCR_SHAMT 10 +#define ADDR_MOD_DST_SEC7_DestCR_MASK 0x400 +#define ADDR_MOD_DST_SEC7_DestCR_RMW ADDR_MOD_DST_SEC7_DestCR_ADDR32, ADDR_MOD_DST_SEC7_DestCR_SHAMT, ADDR_MOD_DST_SEC7_DestCR_MASK + +#define ADDR_MOD_DST_SEC7_DestClear_ADDR32 35 +#define ADDR_MOD_DST_SEC7_DestClear_SHAMT 11 +#define ADDR_MOD_DST_SEC7_DestClear_MASK 0x800 +#define ADDR_MOD_DST_SEC7_DestClear_RMW ADDR_MOD_DST_SEC7_DestClear_ADDR32, ADDR_MOD_DST_SEC7_DestClear_SHAMT, ADDR_MOD_DST_SEC7_DestClear_MASK + +#define ADDR_MOD_DST_SEC7_DestCToCR_ADDR32 35 +#define ADDR_MOD_DST_SEC7_DestCToCR_SHAMT 12 +#define ADDR_MOD_DST_SEC7_DestCToCR_MASK 0x1000 +#define ADDR_MOD_DST_SEC7_DestCToCR_RMW ADDR_MOD_DST_SEC7_DestCToCR_ADDR32, ADDR_MOD_DST_SEC7_DestCToCR_SHAMT, ADDR_MOD_DST_SEC7_DestCToCR_MASK + +#define ADDR_MOD_DST_SEC7_FidelityIncr_ADDR32 35 +#define 
ADDR_MOD_DST_SEC7_FidelityIncr_SHAMT 13 +#define ADDR_MOD_DST_SEC7_FidelityIncr_MASK 0x6000 +#define ADDR_MOD_DST_SEC7_FidelityIncr_RMW ADDR_MOD_DST_SEC7_FidelityIncr_ADDR32, ADDR_MOD_DST_SEC7_FidelityIncr_SHAMT, ADDR_MOD_DST_SEC7_FidelityIncr_MASK + +#define ADDR_MOD_DST_SEC7_FidelityClear_ADDR32 35 +#define ADDR_MOD_DST_SEC7_FidelityClear_SHAMT 15 +#define ADDR_MOD_DST_SEC7_FidelityClear_MASK 0x8000 +#define ADDR_MOD_DST_SEC7_FidelityClear_RMW ADDR_MOD_DST_SEC7_FidelityClear_ADDR32, ADDR_MOD_DST_SEC7_FidelityClear_SHAMT, ADDR_MOD_DST_SEC7_FidelityClear_MASK + +#define SFPU_STACK_Incr_ADDR32 36 +#define SFPU_STACK_Incr_SHAMT 0 +#define SFPU_STACK_Incr_MASK 0x3ff +#define SFPU_STACK_Incr_RMW SFPU_STACK_Incr_ADDR32, SFPU_STACK_Incr_SHAMT, SFPU_STACK_Incr_MASK + +#define ADDR_MOD_PACK_SEC0_YsrcIncr_ADDR32 37 +#define ADDR_MOD_PACK_SEC0_YsrcIncr_SHAMT 0 +#define ADDR_MOD_PACK_SEC0_YsrcIncr_MASK 0xf +#define ADDR_MOD_PACK_SEC0_YsrcIncr_RMW ADDR_MOD_PACK_SEC0_YsrcIncr_ADDR32, ADDR_MOD_PACK_SEC0_YsrcIncr_SHAMT, ADDR_MOD_PACK_SEC0_YsrcIncr_MASK + +#define ADDR_MOD_PACK_SEC0_YsrcCR_ADDR32 37 +#define ADDR_MOD_PACK_SEC0_YsrcCR_SHAMT 4 +#define ADDR_MOD_PACK_SEC0_YsrcCR_MASK 0x10 +#define ADDR_MOD_PACK_SEC0_YsrcCR_RMW ADDR_MOD_PACK_SEC0_YsrcCR_ADDR32, ADDR_MOD_PACK_SEC0_YsrcCR_SHAMT, ADDR_MOD_PACK_SEC0_YsrcCR_MASK + +#define ADDR_MOD_PACK_SEC0_YsrcClear_ADDR32 37 +#define ADDR_MOD_PACK_SEC0_YsrcClear_SHAMT 5 +#define ADDR_MOD_PACK_SEC0_YsrcClear_MASK 0x20 +#define ADDR_MOD_PACK_SEC0_YsrcClear_RMW ADDR_MOD_PACK_SEC0_YsrcClear_ADDR32, ADDR_MOD_PACK_SEC0_YsrcClear_SHAMT, ADDR_MOD_PACK_SEC0_YsrcClear_MASK + +#define ADDR_MOD_PACK_SEC0_YdstIncr_ADDR32 37 +#define ADDR_MOD_PACK_SEC0_YdstIncr_SHAMT 6 +#define ADDR_MOD_PACK_SEC0_YdstIncr_MASK 0x3c0 +#define ADDR_MOD_PACK_SEC0_YdstIncr_RMW ADDR_MOD_PACK_SEC0_YdstIncr_ADDR32, ADDR_MOD_PACK_SEC0_YdstIncr_SHAMT, ADDR_MOD_PACK_SEC0_YdstIncr_MASK + +#define ADDR_MOD_PACK_SEC0_YdstCR_ADDR32 37 +#define ADDR_MOD_PACK_SEC0_YdstCR_SHAMT 10 +#define ADDR_MOD_PACK_SEC0_YdstCR_MASK 0x400 +#define ADDR_MOD_PACK_SEC0_YdstCR_RMW ADDR_MOD_PACK_SEC0_YdstCR_ADDR32, ADDR_MOD_PACK_SEC0_YdstCR_SHAMT, ADDR_MOD_PACK_SEC0_YdstCR_MASK + +#define ADDR_MOD_PACK_SEC0_YdstClear_ADDR32 37 +#define ADDR_MOD_PACK_SEC0_YdstClear_SHAMT 11 +#define ADDR_MOD_PACK_SEC0_YdstClear_MASK 0x800 +#define ADDR_MOD_PACK_SEC0_YdstClear_RMW ADDR_MOD_PACK_SEC0_YdstClear_ADDR32, ADDR_MOD_PACK_SEC0_YdstClear_SHAMT, ADDR_MOD_PACK_SEC0_YdstClear_MASK + +#define ADDR_MOD_PACK_SEC0_ZsrcIncr_ADDR32 37 +#define ADDR_MOD_PACK_SEC0_ZsrcIncr_SHAMT 12 +#define ADDR_MOD_PACK_SEC0_ZsrcIncr_MASK 0x1000 +#define ADDR_MOD_PACK_SEC0_ZsrcIncr_RMW ADDR_MOD_PACK_SEC0_ZsrcIncr_ADDR32, ADDR_MOD_PACK_SEC0_ZsrcIncr_SHAMT, ADDR_MOD_PACK_SEC0_ZsrcIncr_MASK + +#define ADDR_MOD_PACK_SEC0_ZsrcClear_ADDR32 37 +#define ADDR_MOD_PACK_SEC0_ZsrcClear_SHAMT 13 +#define ADDR_MOD_PACK_SEC0_ZsrcClear_MASK 0x2000 +#define ADDR_MOD_PACK_SEC0_ZsrcClear_RMW ADDR_MOD_PACK_SEC0_ZsrcClear_ADDR32, ADDR_MOD_PACK_SEC0_ZsrcClear_SHAMT, ADDR_MOD_PACK_SEC0_ZsrcClear_MASK + +#define ADDR_MOD_PACK_SEC0_ZdstIncr_ADDR32 37 +#define ADDR_MOD_PACK_SEC0_ZdstIncr_SHAMT 14 +#define ADDR_MOD_PACK_SEC0_ZdstIncr_MASK 0x4000 +#define ADDR_MOD_PACK_SEC0_ZdstIncr_RMW ADDR_MOD_PACK_SEC0_ZdstIncr_ADDR32, ADDR_MOD_PACK_SEC0_ZdstIncr_SHAMT, ADDR_MOD_PACK_SEC0_ZdstIncr_MASK + +#define ADDR_MOD_PACK_SEC0_ZdstClear_ADDR32 37 +#define ADDR_MOD_PACK_SEC0_ZdstClear_SHAMT 15 +#define ADDR_MOD_PACK_SEC0_ZdstClear_MASK 0x8000 +#define ADDR_MOD_PACK_SEC0_ZdstClear_RMW 
ADDR_MOD_PACK_SEC0_ZdstClear_ADDR32, ADDR_MOD_PACK_SEC0_ZdstClear_SHAMT, ADDR_MOD_PACK_SEC0_ZdstClear_MASK + +#define ADDR_MOD_PACK_SEC1_YsrcIncr_ADDR32 38 +#define ADDR_MOD_PACK_SEC1_YsrcIncr_SHAMT 0 +#define ADDR_MOD_PACK_SEC1_YsrcIncr_MASK 0xf +#define ADDR_MOD_PACK_SEC1_YsrcIncr_RMW ADDR_MOD_PACK_SEC1_YsrcIncr_ADDR32, ADDR_MOD_PACK_SEC1_YsrcIncr_SHAMT, ADDR_MOD_PACK_SEC1_YsrcIncr_MASK + +#define ADDR_MOD_PACK_SEC1_YsrcCR_ADDR32 38 +#define ADDR_MOD_PACK_SEC1_YsrcCR_SHAMT 4 +#define ADDR_MOD_PACK_SEC1_YsrcCR_MASK 0x10 +#define ADDR_MOD_PACK_SEC1_YsrcCR_RMW ADDR_MOD_PACK_SEC1_YsrcCR_ADDR32, ADDR_MOD_PACK_SEC1_YsrcCR_SHAMT, ADDR_MOD_PACK_SEC1_YsrcCR_MASK + +#define ADDR_MOD_PACK_SEC1_YsrcClear_ADDR32 38 +#define ADDR_MOD_PACK_SEC1_YsrcClear_SHAMT 5 +#define ADDR_MOD_PACK_SEC1_YsrcClear_MASK 0x20 +#define ADDR_MOD_PACK_SEC1_YsrcClear_RMW ADDR_MOD_PACK_SEC1_YsrcClear_ADDR32, ADDR_MOD_PACK_SEC1_YsrcClear_SHAMT, ADDR_MOD_PACK_SEC1_YsrcClear_MASK + +#define ADDR_MOD_PACK_SEC1_YdstIncr_ADDR32 38 +#define ADDR_MOD_PACK_SEC1_YdstIncr_SHAMT 6 +#define ADDR_MOD_PACK_SEC1_YdstIncr_MASK 0x3c0 +#define ADDR_MOD_PACK_SEC1_YdstIncr_RMW ADDR_MOD_PACK_SEC1_YdstIncr_ADDR32, ADDR_MOD_PACK_SEC1_YdstIncr_SHAMT, ADDR_MOD_PACK_SEC1_YdstIncr_MASK + +#define ADDR_MOD_PACK_SEC1_YdstCR_ADDR32 38 +#define ADDR_MOD_PACK_SEC1_YdstCR_SHAMT 10 +#define ADDR_MOD_PACK_SEC1_YdstCR_MASK 0x400 +#define ADDR_MOD_PACK_SEC1_YdstCR_RMW ADDR_MOD_PACK_SEC1_YdstCR_ADDR32, ADDR_MOD_PACK_SEC1_YdstCR_SHAMT, ADDR_MOD_PACK_SEC1_YdstCR_MASK + +#define ADDR_MOD_PACK_SEC1_YdstClear_ADDR32 38 +#define ADDR_MOD_PACK_SEC1_YdstClear_SHAMT 11 +#define ADDR_MOD_PACK_SEC1_YdstClear_MASK 0x800 +#define ADDR_MOD_PACK_SEC1_YdstClear_RMW ADDR_MOD_PACK_SEC1_YdstClear_ADDR32, ADDR_MOD_PACK_SEC1_YdstClear_SHAMT, ADDR_MOD_PACK_SEC1_YdstClear_MASK + +#define ADDR_MOD_PACK_SEC1_ZsrcIncr_ADDR32 38 +#define ADDR_MOD_PACK_SEC1_ZsrcIncr_SHAMT 12 +#define ADDR_MOD_PACK_SEC1_ZsrcIncr_MASK 0x1000 +#define ADDR_MOD_PACK_SEC1_ZsrcIncr_RMW ADDR_MOD_PACK_SEC1_ZsrcIncr_ADDR32, ADDR_MOD_PACK_SEC1_ZsrcIncr_SHAMT, ADDR_MOD_PACK_SEC1_ZsrcIncr_MASK + +#define ADDR_MOD_PACK_SEC1_ZsrcClear_ADDR32 38 +#define ADDR_MOD_PACK_SEC1_ZsrcClear_SHAMT 13 +#define ADDR_MOD_PACK_SEC1_ZsrcClear_MASK 0x2000 +#define ADDR_MOD_PACK_SEC1_ZsrcClear_RMW ADDR_MOD_PACK_SEC1_ZsrcClear_ADDR32, ADDR_MOD_PACK_SEC1_ZsrcClear_SHAMT, ADDR_MOD_PACK_SEC1_ZsrcClear_MASK + +#define ADDR_MOD_PACK_SEC1_ZdstIncr_ADDR32 38 +#define ADDR_MOD_PACK_SEC1_ZdstIncr_SHAMT 14 +#define ADDR_MOD_PACK_SEC1_ZdstIncr_MASK 0x4000 +#define ADDR_MOD_PACK_SEC1_ZdstIncr_RMW ADDR_MOD_PACK_SEC1_ZdstIncr_ADDR32, ADDR_MOD_PACK_SEC1_ZdstIncr_SHAMT, ADDR_MOD_PACK_SEC1_ZdstIncr_MASK + +#define ADDR_MOD_PACK_SEC1_ZdstClear_ADDR32 38 +#define ADDR_MOD_PACK_SEC1_ZdstClear_SHAMT 15 +#define ADDR_MOD_PACK_SEC1_ZdstClear_MASK 0x8000 +#define ADDR_MOD_PACK_SEC1_ZdstClear_RMW ADDR_MOD_PACK_SEC1_ZdstClear_ADDR32, ADDR_MOD_PACK_SEC1_ZdstClear_SHAMT, ADDR_MOD_PACK_SEC1_ZdstClear_MASK + +#define ADDR_MOD_PACK_SEC2_YsrcIncr_ADDR32 39 +#define ADDR_MOD_PACK_SEC2_YsrcIncr_SHAMT 0 +#define ADDR_MOD_PACK_SEC2_YsrcIncr_MASK 0xf +#define ADDR_MOD_PACK_SEC2_YsrcIncr_RMW ADDR_MOD_PACK_SEC2_YsrcIncr_ADDR32, ADDR_MOD_PACK_SEC2_YsrcIncr_SHAMT, ADDR_MOD_PACK_SEC2_YsrcIncr_MASK + +#define ADDR_MOD_PACK_SEC2_YsrcCR_ADDR32 39 +#define ADDR_MOD_PACK_SEC2_YsrcCR_SHAMT 4 +#define ADDR_MOD_PACK_SEC2_YsrcCR_MASK 0x10 +#define ADDR_MOD_PACK_SEC2_YsrcCR_RMW ADDR_MOD_PACK_SEC2_YsrcCR_ADDR32, ADDR_MOD_PACK_SEC2_YsrcCR_SHAMT, ADDR_MOD_PACK_SEC2_YsrcCR_MASK + +#define 
ADDR_MOD_PACK_SEC2_YsrcClear_ADDR32 39 +#define ADDR_MOD_PACK_SEC2_YsrcClear_SHAMT 5 +#define ADDR_MOD_PACK_SEC2_YsrcClear_MASK 0x20 +#define ADDR_MOD_PACK_SEC2_YsrcClear_RMW ADDR_MOD_PACK_SEC2_YsrcClear_ADDR32, ADDR_MOD_PACK_SEC2_YsrcClear_SHAMT, ADDR_MOD_PACK_SEC2_YsrcClear_MASK + +#define ADDR_MOD_PACK_SEC2_YdstIncr_ADDR32 39 +#define ADDR_MOD_PACK_SEC2_YdstIncr_SHAMT 6 +#define ADDR_MOD_PACK_SEC2_YdstIncr_MASK 0x3c0 +#define ADDR_MOD_PACK_SEC2_YdstIncr_RMW ADDR_MOD_PACK_SEC2_YdstIncr_ADDR32, ADDR_MOD_PACK_SEC2_YdstIncr_SHAMT, ADDR_MOD_PACK_SEC2_YdstIncr_MASK + +#define ADDR_MOD_PACK_SEC2_YdstCR_ADDR32 39 +#define ADDR_MOD_PACK_SEC2_YdstCR_SHAMT 10 +#define ADDR_MOD_PACK_SEC2_YdstCR_MASK 0x400 +#define ADDR_MOD_PACK_SEC2_YdstCR_RMW ADDR_MOD_PACK_SEC2_YdstCR_ADDR32, ADDR_MOD_PACK_SEC2_YdstCR_SHAMT, ADDR_MOD_PACK_SEC2_YdstCR_MASK + +#define ADDR_MOD_PACK_SEC2_YdstClear_ADDR32 39 +#define ADDR_MOD_PACK_SEC2_YdstClear_SHAMT 11 +#define ADDR_MOD_PACK_SEC2_YdstClear_MASK 0x800 +#define ADDR_MOD_PACK_SEC2_YdstClear_RMW ADDR_MOD_PACK_SEC2_YdstClear_ADDR32, ADDR_MOD_PACK_SEC2_YdstClear_SHAMT, ADDR_MOD_PACK_SEC2_YdstClear_MASK + +#define ADDR_MOD_PACK_SEC2_ZsrcIncr_ADDR32 39 +#define ADDR_MOD_PACK_SEC2_ZsrcIncr_SHAMT 12 +#define ADDR_MOD_PACK_SEC2_ZsrcIncr_MASK 0x1000 +#define ADDR_MOD_PACK_SEC2_ZsrcIncr_RMW ADDR_MOD_PACK_SEC2_ZsrcIncr_ADDR32, ADDR_MOD_PACK_SEC2_ZsrcIncr_SHAMT, ADDR_MOD_PACK_SEC2_ZsrcIncr_MASK + +#define ADDR_MOD_PACK_SEC2_ZsrcClear_ADDR32 39 +#define ADDR_MOD_PACK_SEC2_ZsrcClear_SHAMT 13 +#define ADDR_MOD_PACK_SEC2_ZsrcClear_MASK 0x2000 +#define ADDR_MOD_PACK_SEC2_ZsrcClear_RMW ADDR_MOD_PACK_SEC2_ZsrcClear_ADDR32, ADDR_MOD_PACK_SEC2_ZsrcClear_SHAMT, ADDR_MOD_PACK_SEC2_ZsrcClear_MASK + +#define ADDR_MOD_PACK_SEC2_ZdstIncr_ADDR32 39 +#define ADDR_MOD_PACK_SEC2_ZdstIncr_SHAMT 14 +#define ADDR_MOD_PACK_SEC2_ZdstIncr_MASK 0x4000 +#define ADDR_MOD_PACK_SEC2_ZdstIncr_RMW ADDR_MOD_PACK_SEC2_ZdstIncr_ADDR32, ADDR_MOD_PACK_SEC2_ZdstIncr_SHAMT, ADDR_MOD_PACK_SEC2_ZdstIncr_MASK + +#define ADDR_MOD_PACK_SEC2_ZdstClear_ADDR32 39 +#define ADDR_MOD_PACK_SEC2_ZdstClear_SHAMT 15 +#define ADDR_MOD_PACK_SEC2_ZdstClear_MASK 0x8000 +#define ADDR_MOD_PACK_SEC2_ZdstClear_RMW ADDR_MOD_PACK_SEC2_ZdstClear_ADDR32, ADDR_MOD_PACK_SEC2_ZdstClear_SHAMT, ADDR_MOD_PACK_SEC2_ZdstClear_MASK + +#define ADDR_MOD_PACK_SEC3_YsrcIncr_ADDR32 40 +#define ADDR_MOD_PACK_SEC3_YsrcIncr_SHAMT 0 +#define ADDR_MOD_PACK_SEC3_YsrcIncr_MASK 0xf +#define ADDR_MOD_PACK_SEC3_YsrcIncr_RMW ADDR_MOD_PACK_SEC3_YsrcIncr_ADDR32, ADDR_MOD_PACK_SEC3_YsrcIncr_SHAMT, ADDR_MOD_PACK_SEC3_YsrcIncr_MASK + +#define ADDR_MOD_PACK_SEC3_YsrcCR_ADDR32 40 +#define ADDR_MOD_PACK_SEC3_YsrcCR_SHAMT 4 +#define ADDR_MOD_PACK_SEC3_YsrcCR_MASK 0x10 +#define ADDR_MOD_PACK_SEC3_YsrcCR_RMW ADDR_MOD_PACK_SEC3_YsrcCR_ADDR32, ADDR_MOD_PACK_SEC3_YsrcCR_SHAMT, ADDR_MOD_PACK_SEC3_YsrcCR_MASK + +#define ADDR_MOD_PACK_SEC3_YsrcClear_ADDR32 40 +#define ADDR_MOD_PACK_SEC3_YsrcClear_SHAMT 5 +#define ADDR_MOD_PACK_SEC3_YsrcClear_MASK 0x20 +#define ADDR_MOD_PACK_SEC3_YsrcClear_RMW ADDR_MOD_PACK_SEC3_YsrcClear_ADDR32, ADDR_MOD_PACK_SEC3_YsrcClear_SHAMT, ADDR_MOD_PACK_SEC3_YsrcClear_MASK + +#define ADDR_MOD_PACK_SEC3_YdstIncr_ADDR32 40 +#define ADDR_MOD_PACK_SEC3_YdstIncr_SHAMT 6 +#define ADDR_MOD_PACK_SEC3_YdstIncr_MASK 0x3c0 +#define ADDR_MOD_PACK_SEC3_YdstIncr_RMW ADDR_MOD_PACK_SEC3_YdstIncr_ADDR32, ADDR_MOD_PACK_SEC3_YdstIncr_SHAMT, ADDR_MOD_PACK_SEC3_YdstIncr_MASK + +#define ADDR_MOD_PACK_SEC3_YdstCR_ADDR32 40 +#define ADDR_MOD_PACK_SEC3_YdstCR_SHAMT 10 +#define 
ADDR_MOD_PACK_SEC3_YdstCR_MASK 0x400 +#define ADDR_MOD_PACK_SEC3_YdstCR_RMW ADDR_MOD_PACK_SEC3_YdstCR_ADDR32, ADDR_MOD_PACK_SEC3_YdstCR_SHAMT, ADDR_MOD_PACK_SEC3_YdstCR_MASK + +#define ADDR_MOD_PACK_SEC3_YdstClear_ADDR32 40 +#define ADDR_MOD_PACK_SEC3_YdstClear_SHAMT 11 +#define ADDR_MOD_PACK_SEC3_YdstClear_MASK 0x800 +#define ADDR_MOD_PACK_SEC3_YdstClear_RMW ADDR_MOD_PACK_SEC3_YdstClear_ADDR32, ADDR_MOD_PACK_SEC3_YdstClear_SHAMT, ADDR_MOD_PACK_SEC3_YdstClear_MASK + +#define ADDR_MOD_PACK_SEC3_ZsrcIncr_ADDR32 40 +#define ADDR_MOD_PACK_SEC3_ZsrcIncr_SHAMT 12 +#define ADDR_MOD_PACK_SEC3_ZsrcIncr_MASK 0x1000 +#define ADDR_MOD_PACK_SEC3_ZsrcIncr_RMW ADDR_MOD_PACK_SEC3_ZsrcIncr_ADDR32, ADDR_MOD_PACK_SEC3_ZsrcIncr_SHAMT, ADDR_MOD_PACK_SEC3_ZsrcIncr_MASK + +#define ADDR_MOD_PACK_SEC3_ZsrcClear_ADDR32 40 +#define ADDR_MOD_PACK_SEC3_ZsrcClear_SHAMT 13 +#define ADDR_MOD_PACK_SEC3_ZsrcClear_MASK 0x2000 +#define ADDR_MOD_PACK_SEC3_ZsrcClear_RMW ADDR_MOD_PACK_SEC3_ZsrcClear_ADDR32, ADDR_MOD_PACK_SEC3_ZsrcClear_SHAMT, ADDR_MOD_PACK_SEC3_ZsrcClear_MASK + +#define ADDR_MOD_PACK_SEC3_ZdstIncr_ADDR32 40 +#define ADDR_MOD_PACK_SEC3_ZdstIncr_SHAMT 14 +#define ADDR_MOD_PACK_SEC3_ZdstIncr_MASK 0x4000 +#define ADDR_MOD_PACK_SEC3_ZdstIncr_RMW ADDR_MOD_PACK_SEC3_ZdstIncr_ADDR32, ADDR_MOD_PACK_SEC3_ZdstIncr_SHAMT, ADDR_MOD_PACK_SEC3_ZdstIncr_MASK + +#define ADDR_MOD_PACK_SEC3_ZdstClear_ADDR32 40 +#define ADDR_MOD_PACK_SEC3_ZdstClear_SHAMT 15 +#define ADDR_MOD_PACK_SEC3_ZdstClear_MASK 0x8000 +#define ADDR_MOD_PACK_SEC3_ZdstClear_RMW ADDR_MOD_PACK_SEC3_ZdstClear_ADDR32, ADDR_MOD_PACK_SEC3_ZdstClear_SHAMT, ADDR_MOD_PACK_SEC3_ZdstClear_MASK + +#define UNPACK_MISC_CFG_CfgContextOffset_0_ADDR32 41 +#define UNPACK_MISC_CFG_CfgContextOffset_0_SHAMT 0 +#define UNPACK_MISC_CFG_CfgContextOffset_0_MASK 0xf +#define UNPACK_MISC_CFG_CfgContextOffset_0_RMW UNPACK_MISC_CFG_CfgContextOffset_0_ADDR32, UNPACK_MISC_CFG_CfgContextOffset_0_SHAMT, UNPACK_MISC_CFG_CfgContextOffset_0_MASK + +#define UNPACK_MISC_CFG_CfgContextCntReset_0_ADDR32 41 +#define UNPACK_MISC_CFG_CfgContextCntReset_0_SHAMT 4 +#define UNPACK_MISC_CFG_CfgContextCntReset_0_MASK 0x10 +#define UNPACK_MISC_CFG_CfgContextCntReset_0_RMW UNPACK_MISC_CFG_CfgContextCntReset_0_ADDR32, UNPACK_MISC_CFG_CfgContextCntReset_0_SHAMT, UNPACK_MISC_CFG_CfgContextCntReset_0_MASK + +#define UNPACK_MISC_CFG_CfgContextCntInc_0_ADDR32 41 +#define UNPACK_MISC_CFG_CfgContextCntInc_0_SHAMT 5 +#define UNPACK_MISC_CFG_CfgContextCntInc_0_MASK 0x20 +#define UNPACK_MISC_CFG_CfgContextCntInc_0_RMW UNPACK_MISC_CFG_CfgContextCntInc_0_ADDR32, UNPACK_MISC_CFG_CfgContextCntInc_0_SHAMT, UNPACK_MISC_CFG_CfgContextCntInc_0_MASK + +#define UNPACK_MISC_CFG_CfgContextOffset_1_ADDR32 41 +#define UNPACK_MISC_CFG_CfgContextOffset_1_SHAMT 8 +#define UNPACK_MISC_CFG_CfgContextOffset_1_MASK 0xf00 +#define UNPACK_MISC_CFG_CfgContextOffset_1_RMW UNPACK_MISC_CFG_CfgContextOffset_1_ADDR32, UNPACK_MISC_CFG_CfgContextOffset_1_SHAMT, UNPACK_MISC_CFG_CfgContextOffset_1_MASK + +#define UNPACK_MISC_CFG_CfgContextCntReset_1_ADDR32 41 +#define UNPACK_MISC_CFG_CfgContextCntReset_1_SHAMT 12 +#define UNPACK_MISC_CFG_CfgContextCntReset_1_MASK 0x1000 +#define UNPACK_MISC_CFG_CfgContextCntReset_1_RMW UNPACK_MISC_CFG_CfgContextCntReset_1_ADDR32, UNPACK_MISC_CFG_CfgContextCntReset_1_SHAMT, UNPACK_MISC_CFG_CfgContextCntReset_1_MASK + +#define UNPACK_MISC_CFG_CfgContextCntInc_1_ADDR32 41 +#define UNPACK_MISC_CFG_CfgContextCntInc_1_SHAMT 13 +#define UNPACK_MISC_CFG_CfgContextCntInc_1_MASK 0x2000 +#define 
UNPACK_MISC_CFG_CfgContextCntInc_1_RMW UNPACK_MISC_CFG_CfgContextCntInc_1_ADDR32, UNPACK_MISC_CFG_CfgContextCntInc_1_SHAMT, UNPACK_MISC_CFG_CfgContextCntInc_1_MASK + +#define UNPACK_MISC_CFG_CfgContextCntReset_metadata_ADDR32 41 +#define UNPACK_MISC_CFG_CfgContextCntReset_metadata_SHAMT 14 +#define UNPACK_MISC_CFG_CfgContextCntReset_metadata_MASK 0x4000 +#define UNPACK_MISC_CFG_CfgContextCntReset_metadata_RMW UNPACK_MISC_CFG_CfgContextCntReset_metadata_ADDR32, UNPACK_MISC_CFG_CfgContextCntReset_metadata_SHAMT, UNPACK_MISC_CFG_CfgContextCntReset_metadata_MASK + +#define UNPACK_MISC_CFG_CfgContextCntReset_metadata_zstart_ADDR32 41 +#define UNPACK_MISC_CFG_CfgContextCntReset_metadata_zstart_SHAMT 15 +#define UNPACK_MISC_CFG_CfgContextCntReset_metadata_zstart_MASK 0x8000 +#define UNPACK_MISC_CFG_CfgContextCntReset_metadata_zstart_RMW UNPACK_MISC_CFG_CfgContextCntReset_metadata_zstart_ADDR32, UNPACK_MISC_CFG_CfgContextCntReset_metadata_zstart_SHAMT, UNPACK_MISC_CFG_CfgContextCntReset_metadata_zstart_MASK + +#define NOC_OVERLAY_MSG_CLEAR_StreamId_0_ADDR32 42 +#define NOC_OVERLAY_MSG_CLEAR_StreamId_0_SHAMT 0 +#define NOC_OVERLAY_MSG_CLEAR_StreamId_0_MASK 0x3f +#define NOC_OVERLAY_MSG_CLEAR_StreamId_0_RMW NOC_OVERLAY_MSG_CLEAR_StreamId_0_ADDR32, NOC_OVERLAY_MSG_CLEAR_StreamId_0_SHAMT, NOC_OVERLAY_MSG_CLEAR_StreamId_0_MASK + +#define NOC_OVERLAY_MSG_CLEAR_MsgNum_0_ADDR32 42 +#define NOC_OVERLAY_MSG_CLEAR_MsgNum_0_SHAMT 8 +#define NOC_OVERLAY_MSG_CLEAR_MsgNum_0_MASK 0x700 +#define NOC_OVERLAY_MSG_CLEAR_MsgNum_0_RMW NOC_OVERLAY_MSG_CLEAR_MsgNum_0_ADDR32, NOC_OVERLAY_MSG_CLEAR_MsgNum_0_SHAMT, NOC_OVERLAY_MSG_CLEAR_MsgNum_0_MASK + +#define NOC_OVERLAY_MSG_CLEAR_StreamId_1_ADDR32 43 +#define NOC_OVERLAY_MSG_CLEAR_StreamId_1_SHAMT 0 +#define NOC_OVERLAY_MSG_CLEAR_StreamId_1_MASK 0x3f +#define NOC_OVERLAY_MSG_CLEAR_StreamId_1_RMW NOC_OVERLAY_MSG_CLEAR_StreamId_1_ADDR32, NOC_OVERLAY_MSG_CLEAR_StreamId_1_SHAMT, NOC_OVERLAY_MSG_CLEAR_StreamId_1_MASK + +#define NOC_OVERLAY_MSG_CLEAR_MsgNum_1_ADDR32 43 +#define NOC_OVERLAY_MSG_CLEAR_MsgNum_1_SHAMT 8 +#define NOC_OVERLAY_MSG_CLEAR_MsgNum_1_MASK 0x700 +#define NOC_OVERLAY_MSG_CLEAR_MsgNum_1_RMW NOC_OVERLAY_MSG_CLEAR_MsgNum_1_ADDR32, NOC_OVERLAY_MSG_CLEAR_MsgNum_1_SHAMT, NOC_OVERLAY_MSG_CLEAR_MsgNum_1_MASK + +#define PERF_CNT_CMD_Cmd0Start_ADDR32 44 +#define PERF_CNT_CMD_Cmd0Start_SHAMT 0 +#define PERF_CNT_CMD_Cmd0Start_MASK 0x1 +#define PERF_CNT_CMD_Cmd0Start_RMW PERF_CNT_CMD_Cmd0Start_ADDR32, PERF_CNT_CMD_Cmd0Start_SHAMT, PERF_CNT_CMD_Cmd0Start_MASK + +#define PERF_CNT_CMD_Cmd0Stop_ADDR32 44 +#define PERF_CNT_CMD_Cmd0Stop_SHAMT 1 +#define PERF_CNT_CMD_Cmd0Stop_MASK 0x2 +#define PERF_CNT_CMD_Cmd0Stop_RMW PERF_CNT_CMD_Cmd0Stop_ADDR32, PERF_CNT_CMD_Cmd0Stop_SHAMT, PERF_CNT_CMD_Cmd0Stop_MASK + +#define PERF_CNT_CMD_Cmd1Start_ADDR32 44 +#define PERF_CNT_CMD_Cmd1Start_SHAMT 2 +#define PERF_CNT_CMD_Cmd1Start_MASK 0x4 +#define PERF_CNT_CMD_Cmd1Start_RMW PERF_CNT_CMD_Cmd1Start_ADDR32, PERF_CNT_CMD_Cmd1Start_SHAMT, PERF_CNT_CMD_Cmd1Start_MASK + +#define PERF_CNT_CMD_Cmd1Stop_ADDR32 44 +#define PERF_CNT_CMD_Cmd1Stop_SHAMT 3 +#define PERF_CNT_CMD_Cmd1Stop_MASK 0x8 +#define PERF_CNT_CMD_Cmd1Stop_RMW PERF_CNT_CMD_Cmd1Stop_ADDR32, PERF_CNT_CMD_Cmd1Stop_SHAMT, PERF_CNT_CMD_Cmd1Stop_MASK + +#define PERF_CNT_CMD_Cmd2Start_ADDR32 44 +#define PERF_CNT_CMD_Cmd2Start_SHAMT 4 +#define PERF_CNT_CMD_Cmd2Start_MASK 0x10 +#define PERF_CNT_CMD_Cmd2Start_RMW PERF_CNT_CMD_Cmd2Start_ADDR32, PERF_CNT_CMD_Cmd2Start_SHAMT, PERF_CNT_CMD_Cmd2Start_MASK + +#define PERF_CNT_CMD_Cmd2Stop_ADDR32 44 
+#define PERF_CNT_CMD_Cmd2Stop_SHAMT 5 +#define PERF_CNT_CMD_Cmd2Stop_MASK 0x20 +#define PERF_CNT_CMD_Cmd2Stop_RMW PERF_CNT_CMD_Cmd2Stop_ADDR32, PERF_CNT_CMD_Cmd2Stop_SHAMT, PERF_CNT_CMD_Cmd2Stop_MASK + +#define PERF_CNT_CMD_Cmd3Start_ADDR32 44 +#define PERF_CNT_CMD_Cmd3Start_SHAMT 6 +#define PERF_CNT_CMD_Cmd3Start_MASK 0x40 +#define PERF_CNT_CMD_Cmd3Start_RMW PERF_CNT_CMD_Cmd3Start_ADDR32, PERF_CNT_CMD_Cmd3Start_SHAMT, PERF_CNT_CMD_Cmd3Start_MASK + +#define PERF_CNT_CMD_Cmd3Stop_ADDR32 44 +#define PERF_CNT_CMD_Cmd3Stop_SHAMT 7 +#define PERF_CNT_CMD_Cmd3Stop_MASK 0x80 +#define PERF_CNT_CMD_Cmd3Stop_RMW PERF_CNT_CMD_Cmd3Stop_ADDR32, PERF_CNT_CMD_Cmd3Stop_SHAMT, PERF_CNT_CMD_Cmd3Stop_MASK + +#define ENABLE_ACC_STATS_Enable_ADDR32 45 +#define ENABLE_ACC_STATS_Enable_SHAMT 0 +#define ENABLE_ACC_STATS_Enable_MASK 0x1 +#define ENABLE_ACC_STATS_Enable_RMW ENABLE_ACC_STATS_Enable_ADDR32, ENABLE_ACC_STATS_Enable_SHAMT, ENABLE_ACC_STATS_Enable_MASK + +#define FPU_BIAS_SEL_Pointer_ADDR32 46 +#define FPU_BIAS_SEL_Pointer_SHAMT 0 +#define FPU_BIAS_SEL_Pointer_MASK 0x1 +#define FPU_BIAS_SEL_Pointer_RMW FPU_BIAS_SEL_Pointer_ADDR32, FPU_BIAS_SEL_Pointer_SHAMT, FPU_BIAS_SEL_Pointer_MASK + +#define ADDR_MOD_BIAS_SEC0_BiasIncr_ADDR32 47 +#define ADDR_MOD_BIAS_SEC0_BiasIncr_SHAMT 0 +#define ADDR_MOD_BIAS_SEC0_BiasIncr_MASK 0xf +#define ADDR_MOD_BIAS_SEC0_BiasIncr_RMW ADDR_MOD_BIAS_SEC0_BiasIncr_ADDR32, ADDR_MOD_BIAS_SEC0_BiasIncr_SHAMT, ADDR_MOD_BIAS_SEC0_BiasIncr_MASK + +#define ADDR_MOD_BIAS_SEC0_BiasClear_ADDR32 47 +#define ADDR_MOD_BIAS_SEC0_BiasClear_SHAMT 4 +#define ADDR_MOD_BIAS_SEC0_BiasClear_MASK 0x10 +#define ADDR_MOD_BIAS_SEC0_BiasClear_RMW ADDR_MOD_BIAS_SEC0_BiasClear_ADDR32, ADDR_MOD_BIAS_SEC0_BiasClear_SHAMT, ADDR_MOD_BIAS_SEC0_BiasClear_MASK + +#define ADDR_MOD_BIAS_SEC1_BiasIncr_ADDR32 48 +#define ADDR_MOD_BIAS_SEC1_BiasIncr_SHAMT 0 +#define ADDR_MOD_BIAS_SEC1_BiasIncr_MASK 0xf +#define ADDR_MOD_BIAS_SEC1_BiasIncr_RMW ADDR_MOD_BIAS_SEC1_BiasIncr_ADDR32, ADDR_MOD_BIAS_SEC1_BiasIncr_SHAMT, ADDR_MOD_BIAS_SEC1_BiasIncr_MASK + +#define ADDR_MOD_BIAS_SEC1_BiasClear_ADDR32 48 +#define ADDR_MOD_BIAS_SEC1_BiasClear_SHAMT 4 +#define ADDR_MOD_BIAS_SEC1_BiasClear_MASK 0x10 +#define ADDR_MOD_BIAS_SEC1_BiasClear_RMW ADDR_MOD_BIAS_SEC1_BiasClear_ADDR32, ADDR_MOD_BIAS_SEC1_BiasClear_SHAMT, ADDR_MOD_BIAS_SEC1_BiasClear_MASK + +#define ADDR_MOD_BIAS_SEC2_BiasIncr_ADDR32 49 +#define ADDR_MOD_BIAS_SEC2_BiasIncr_SHAMT 0 +#define ADDR_MOD_BIAS_SEC2_BiasIncr_MASK 0xf +#define ADDR_MOD_BIAS_SEC2_BiasIncr_RMW ADDR_MOD_BIAS_SEC2_BiasIncr_ADDR32, ADDR_MOD_BIAS_SEC2_BiasIncr_SHAMT, ADDR_MOD_BIAS_SEC2_BiasIncr_MASK + +#define ADDR_MOD_BIAS_SEC2_BiasClear_ADDR32 49 +#define ADDR_MOD_BIAS_SEC2_BiasClear_SHAMT 4 +#define ADDR_MOD_BIAS_SEC2_BiasClear_MASK 0x10 +#define ADDR_MOD_BIAS_SEC2_BiasClear_RMW ADDR_MOD_BIAS_SEC2_BiasClear_ADDR32, ADDR_MOD_BIAS_SEC2_BiasClear_SHAMT, ADDR_MOD_BIAS_SEC2_BiasClear_MASK + +#define ADDR_MOD_BIAS_SEC3_BiasIncr_ADDR32 50 +#define ADDR_MOD_BIAS_SEC3_BiasIncr_SHAMT 0 +#define ADDR_MOD_BIAS_SEC3_BiasIncr_MASK 0xf +#define ADDR_MOD_BIAS_SEC3_BiasIncr_RMW ADDR_MOD_BIAS_SEC3_BiasIncr_ADDR32, ADDR_MOD_BIAS_SEC3_BiasIncr_SHAMT, ADDR_MOD_BIAS_SEC3_BiasIncr_MASK + +#define ADDR_MOD_BIAS_SEC3_BiasClear_ADDR32 50 +#define ADDR_MOD_BIAS_SEC3_BiasClear_SHAMT 4 +#define ADDR_MOD_BIAS_SEC3_BiasClear_MASK 0x10 +#define ADDR_MOD_BIAS_SEC3_BiasClear_RMW ADDR_MOD_BIAS_SEC3_BiasClear_ADDR32, ADDR_MOD_BIAS_SEC3_BiasClear_SHAMT, ADDR_MOD_BIAS_SEC3_BiasClear_MASK + +#define ADDR_MOD_BIAS_SEC4_BiasIncr_ADDR32 51 
+#define ADDR_MOD_BIAS_SEC4_BiasIncr_SHAMT 0 +#define ADDR_MOD_BIAS_SEC4_BiasIncr_MASK 0xf +#define ADDR_MOD_BIAS_SEC4_BiasIncr_RMW ADDR_MOD_BIAS_SEC4_BiasIncr_ADDR32, ADDR_MOD_BIAS_SEC4_BiasIncr_SHAMT, ADDR_MOD_BIAS_SEC4_BiasIncr_MASK + +#define ADDR_MOD_BIAS_SEC4_BiasClear_ADDR32 51 +#define ADDR_MOD_BIAS_SEC4_BiasClear_SHAMT 4 +#define ADDR_MOD_BIAS_SEC4_BiasClear_MASK 0x10 +#define ADDR_MOD_BIAS_SEC4_BiasClear_RMW ADDR_MOD_BIAS_SEC4_BiasClear_ADDR32, ADDR_MOD_BIAS_SEC4_BiasClear_SHAMT, ADDR_MOD_BIAS_SEC4_BiasClear_MASK + +#define ADDR_MOD_BIAS_SEC5_BiasIncr_ADDR32 52 +#define ADDR_MOD_BIAS_SEC5_BiasIncr_SHAMT 0 +#define ADDR_MOD_BIAS_SEC5_BiasIncr_MASK 0xf +#define ADDR_MOD_BIAS_SEC5_BiasIncr_RMW ADDR_MOD_BIAS_SEC5_BiasIncr_ADDR32, ADDR_MOD_BIAS_SEC5_BiasIncr_SHAMT, ADDR_MOD_BIAS_SEC5_BiasIncr_MASK + +#define ADDR_MOD_BIAS_SEC5_BiasClear_ADDR32 52 +#define ADDR_MOD_BIAS_SEC5_BiasClear_SHAMT 4 +#define ADDR_MOD_BIAS_SEC5_BiasClear_MASK 0x10 +#define ADDR_MOD_BIAS_SEC5_BiasClear_RMW ADDR_MOD_BIAS_SEC5_BiasClear_ADDR32, ADDR_MOD_BIAS_SEC5_BiasClear_SHAMT, ADDR_MOD_BIAS_SEC5_BiasClear_MASK + +#define ADDR_MOD_BIAS_SEC6_BiasIncr_ADDR32 53 +#define ADDR_MOD_BIAS_SEC6_BiasIncr_SHAMT 0 +#define ADDR_MOD_BIAS_SEC6_BiasIncr_MASK 0xf +#define ADDR_MOD_BIAS_SEC6_BiasIncr_RMW ADDR_MOD_BIAS_SEC6_BiasIncr_ADDR32, ADDR_MOD_BIAS_SEC6_BiasIncr_SHAMT, ADDR_MOD_BIAS_SEC6_BiasIncr_MASK + +#define ADDR_MOD_BIAS_SEC6_BiasClear_ADDR32 53 +#define ADDR_MOD_BIAS_SEC6_BiasClear_SHAMT 4 +#define ADDR_MOD_BIAS_SEC6_BiasClear_MASK 0x10 +#define ADDR_MOD_BIAS_SEC6_BiasClear_RMW ADDR_MOD_BIAS_SEC6_BiasClear_ADDR32, ADDR_MOD_BIAS_SEC6_BiasClear_SHAMT, ADDR_MOD_BIAS_SEC6_BiasClear_MASK + +#define ADDR_MOD_BIAS_SEC7_BiasIncr_ADDR32 54 +#define ADDR_MOD_BIAS_SEC7_BiasIncr_SHAMT 0 +#define ADDR_MOD_BIAS_SEC7_BiasIncr_MASK 0xf +#define ADDR_MOD_BIAS_SEC7_BiasIncr_RMW ADDR_MOD_BIAS_SEC7_BiasIncr_ADDR32, ADDR_MOD_BIAS_SEC7_BiasIncr_SHAMT, ADDR_MOD_BIAS_SEC7_BiasIncr_MASK + +#define ADDR_MOD_BIAS_SEC7_BiasClear_ADDR32 54 +#define ADDR_MOD_BIAS_SEC7_BiasClear_SHAMT 4 +#define ADDR_MOD_BIAS_SEC7_BiasClear_MASK 0x10 +#define ADDR_MOD_BIAS_SEC7_BiasClear_RMW ADDR_MOD_BIAS_SEC7_BiasClear_ADDR32, ADDR_MOD_BIAS_SEC7_BiasClear_SHAMT, ADDR_MOD_BIAS_SEC7_BiasClear_MASK + +#define FP16A_FORCE_Enable_ADDR32 55 +#define FP16A_FORCE_Enable_SHAMT 0 +#define FP16A_FORCE_Enable_MASK 0x1 +#define FP16A_FORCE_Enable_RMW FP16A_FORCE_Enable_ADDR32, FP16A_FORCE_Enable_SHAMT, FP16A_FORCE_Enable_MASK + +#define TENSIX_TRISC_SYNC_TrackGlobalCfg_ADDR32 56 +#define TENSIX_TRISC_SYNC_TrackGlobalCfg_SHAMT 0 +#define TENSIX_TRISC_SYNC_TrackGlobalCfg_MASK 0x1 +#define TENSIX_TRISC_SYNC_TrackGlobalCfg_RMW TENSIX_TRISC_SYNC_TrackGlobalCfg_ADDR32, TENSIX_TRISC_SYNC_TrackGlobalCfg_SHAMT, TENSIX_TRISC_SYNC_TrackGlobalCfg_MASK + +#define TENSIX_TRISC_SYNC_EnSubdividedCfgForUnpacr_ADDR32 56 +#define TENSIX_TRISC_SYNC_EnSubdividedCfgForUnpacr_SHAMT 1 +#define TENSIX_TRISC_SYNC_EnSubdividedCfgForUnpacr_MASK 0x2 +#define TENSIX_TRISC_SYNC_EnSubdividedCfgForUnpacr_RMW TENSIX_TRISC_SYNC_EnSubdividedCfgForUnpacr_ADDR32, TENSIX_TRISC_SYNC_EnSubdividedCfgForUnpacr_SHAMT, TENSIX_TRISC_SYNC_EnSubdividedCfgForUnpacr_MASK + +#define TENSIX_TRISC_SYNC_TrackGPR_ADDR32 56 +#define TENSIX_TRISC_SYNC_TrackGPR_SHAMT 2 +#define TENSIX_TRISC_SYNC_TrackGPR_MASK 0x4 +#define TENSIX_TRISC_SYNC_TrackGPR_RMW TENSIX_TRISC_SYNC_TrackGPR_ADDR32, TENSIX_TRISC_SYNC_TrackGPR_SHAMT, TENSIX_TRISC_SYNC_TrackGPR_MASK + +#define TENSIX_TRISC_SYNC_TrackTDMARegs_ADDR32 56 +#define 
TENSIX_TRISC_SYNC_TrackTDMARegs_SHAMT 3 +#define TENSIX_TRISC_SYNC_TrackTDMARegs_MASK 0x8 +#define TENSIX_TRISC_SYNC_TrackTDMARegs_RMW TENSIX_TRISC_SYNC_TrackTDMARegs_ADDR32, TENSIX_TRISC_SYNC_TrackTDMARegs_SHAMT, TENSIX_TRISC_SYNC_TrackTDMARegs_MASK + +#define TENSIX_TRISC_SYNC_TrackTensixInstructions_ADDR32 56 +#define TENSIX_TRISC_SYNC_TrackTensixInstructions_SHAMT 4 +#define TENSIX_TRISC_SYNC_TrackTensixInstructions_MASK 0x10 +#define TENSIX_TRISC_SYNC_TrackTensixInstructions_RMW TENSIX_TRISC_SYNC_TrackTensixInstructions_ADDR32, TENSIX_TRISC_SYNC_TrackTensixInstructions_SHAMT, TENSIX_TRISC_SYNC_TrackTensixInstructions_MASK + +#define STREAMWAIT_PHASE_HI_Val_ADDR32 57 +#define STREAMWAIT_PHASE_HI_Val_SHAMT 0 +#define STREAMWAIT_PHASE_HI_Val_MASK 0x3ff +#define STREAMWAIT_PHASE_HI_Val_RMW STREAMWAIT_PHASE_HI_Val_ADDR32, STREAMWAIT_PHASE_HI_Val_SHAMT, STREAMWAIT_PHASE_HI_Val_MASK + +#define STREAMWAIT_NUM_MSGS_HI_Val_ADDR32 58 +#define STREAMWAIT_NUM_MSGS_HI_Val_SHAMT 0 +#define STREAMWAIT_NUM_MSGS_HI_Val_MASK 0x7f +#define STREAMWAIT_NUM_MSGS_HI_Val_RMW STREAMWAIT_NUM_MSGS_HI_Val_ADDR32, STREAMWAIT_NUM_MSGS_HI_Val_SHAMT, STREAMWAIT_NUM_MSGS_HI_Val_MASK + +#define STREAM_ID_SYNC_SEC0_BankSel_ADDR32 59 +#define STREAM_ID_SYNC_SEC0_BankSel_SHAMT 0 +#define STREAM_ID_SYNC_SEC0_BankSel_MASK 0x3f +#define STREAM_ID_SYNC_SEC0_BankSel_RMW STREAM_ID_SYNC_SEC0_BankSel_ADDR32, STREAM_ID_SYNC_SEC0_BankSel_SHAMT, STREAM_ID_SYNC_SEC0_BankSel_MASK + +#define STREAM_ID_SYNC_SEC1_BankSel_ADDR32 60 +#define STREAM_ID_SYNC_SEC1_BankSel_SHAMT 0 +#define STREAM_ID_SYNC_SEC1_BankSel_MASK 0x3f +#define STREAM_ID_SYNC_SEC1_BankSel_RMW STREAM_ID_SYNC_SEC1_BankSel_ADDR32, STREAM_ID_SYNC_SEC1_BankSel_SHAMT, STREAM_ID_SYNC_SEC1_BankSel_MASK + +#define STREAM_ID_SYNC_SEC2_BankSel_ADDR32 61 +#define STREAM_ID_SYNC_SEC2_BankSel_SHAMT 0 +#define STREAM_ID_SYNC_SEC2_BankSel_MASK 0x3f +#define STREAM_ID_SYNC_SEC2_BankSel_RMW STREAM_ID_SYNC_SEC2_BankSel_ADDR32, STREAM_ID_SYNC_SEC2_BankSel_SHAMT, STREAM_ID_SYNC_SEC2_BankSel_MASK + +#define STREAM_ID_SYNC_SEC3_BankSel_ADDR32 62 +#define STREAM_ID_SYNC_SEC3_BankSel_SHAMT 0 +#define STREAM_ID_SYNC_SEC3_BankSel_MASK 0x3f +#define STREAM_ID_SYNC_SEC3_BankSel_RMW STREAM_ID_SYNC_SEC3_BankSel_ADDR32, STREAM_ID_SYNC_SEC3_BankSel_SHAMT, STREAM_ID_SYNC_SEC3_BankSel_MASK + +#define STREAM_ID_TRISC_SEC0_BankSel_ADDR32 63 +#define STREAM_ID_TRISC_SEC0_BankSel_SHAMT 0 +#define STREAM_ID_TRISC_SEC0_BankSel_MASK 0x3f +#define STREAM_ID_TRISC_SEC0_BankSel_RMW STREAM_ID_TRISC_SEC0_BankSel_ADDR32, STREAM_ID_TRISC_SEC0_BankSel_SHAMT, STREAM_ID_TRISC_SEC0_BankSel_MASK + +#define STREAM_ID_TRISC_SEC1_BankSel_ADDR32 64 +#define STREAM_ID_TRISC_SEC1_BankSel_SHAMT 0 +#define STREAM_ID_TRISC_SEC1_BankSel_MASK 0x3f +#define STREAM_ID_TRISC_SEC1_BankSel_RMW STREAM_ID_TRISC_SEC1_BankSel_ADDR32, STREAM_ID_TRISC_SEC1_BankSel_SHAMT, STREAM_ID_TRISC_SEC1_BankSel_MASK + +#define STREAM_ID_TRISC_SEC2_BankSel_ADDR32 65 +#define STREAM_ID_TRISC_SEC2_BankSel_SHAMT 0 +#define STREAM_ID_TRISC_SEC2_BankSel_MASK 0x3f +#define STREAM_ID_TRISC_SEC2_BankSel_RMW STREAM_ID_TRISC_SEC2_BankSel_ADDR32, STREAM_ID_TRISC_SEC2_BankSel_SHAMT, STREAM_ID_TRISC_SEC2_BankSel_MASK + +#define STREAM_ID_TRISC_SEC3_BankSel_ADDR32 66 +#define STREAM_ID_TRISC_SEC3_BankSel_SHAMT 0 +#define STREAM_ID_TRISC_SEC3_BankSel_MASK 0x3f +#define STREAM_ID_TRISC_SEC3_BankSel_RMW STREAM_ID_TRISC_SEC3_BankSel_ADDR32, STREAM_ID_TRISC_SEC3_BankSel_SHAMT, STREAM_ID_TRISC_SEC3_BankSel_MASK + +#define TENSIX_CSR_CONFIG_RawBusyStatus_ADDR32 67 +#define 
TENSIX_CSR_CONFIG_RawBusyStatus_SHAMT 0 +#define TENSIX_CSR_CONFIG_RawBusyStatus_MASK 0x1 +#define TENSIX_CSR_CONFIG_RawBusyStatus_RMW TENSIX_CSR_CONFIG_RawBusyStatus_ADDR32, TENSIX_CSR_CONFIG_RawBusyStatus_SHAMT, TENSIX_CSR_CONFIG_RawBusyStatus_MASK + +//////////////////////////////////////////////////////////////////////// +// Registers for ALU +//////////////////////////////////////////////////////////////////////// + +#define ALU_CFGREG_BASE_ADDR32 0 + +#define ALU_FORMAT_SPEC_REG_SrcA_val_ADDR32 0 +#define ALU_FORMAT_SPEC_REG_SrcA_val_SHAMT 0 +#define ALU_FORMAT_SPEC_REG_SrcA_val_MASK 0xf +#define ALU_FORMAT_SPEC_REG_SrcA_val_RMW ALU_FORMAT_SPEC_REG_SrcA_val_ADDR32, ALU_FORMAT_SPEC_REG_SrcA_val_SHAMT, ALU_FORMAT_SPEC_REG_SrcA_val_MASK + +#define ALU_FORMAT_SPEC_REG_SrcA_override_ADDR32 0 +#define ALU_FORMAT_SPEC_REG_SrcA_override_SHAMT 4 +#define ALU_FORMAT_SPEC_REG_SrcA_override_MASK 0x10 +#define ALU_FORMAT_SPEC_REG_SrcA_override_RMW ALU_FORMAT_SPEC_REG_SrcA_override_ADDR32, ALU_FORMAT_SPEC_REG_SrcA_override_SHAMT, ALU_FORMAT_SPEC_REG_SrcA_override_MASK + +#define ALU_FORMAT_SPEC_REG_SrcB_val_ADDR32 0 +#define ALU_FORMAT_SPEC_REG_SrcB_val_SHAMT 5 +#define ALU_FORMAT_SPEC_REG_SrcB_val_MASK 0x1e0 +#define ALU_FORMAT_SPEC_REG_SrcB_val_RMW ALU_FORMAT_SPEC_REG_SrcB_val_ADDR32, ALU_FORMAT_SPEC_REG_SrcB_val_SHAMT, ALU_FORMAT_SPEC_REG_SrcB_val_MASK + +#define ALU_FORMAT_SPEC_REG_SrcB_override_ADDR32 0 +#define ALU_FORMAT_SPEC_REG_SrcB_override_SHAMT 9 +#define ALU_FORMAT_SPEC_REG_SrcB_override_MASK 0x200 +#define ALU_FORMAT_SPEC_REG_SrcB_override_RMW ALU_FORMAT_SPEC_REG_SrcB_override_ADDR32, ALU_FORMAT_SPEC_REG_SrcB_override_SHAMT, ALU_FORMAT_SPEC_REG_SrcB_override_MASK + +#define ALU_FORMAT_SPEC_REG_Dstacc_val_ADDR32 0 +#define ALU_FORMAT_SPEC_REG_Dstacc_val_SHAMT 10 +#define ALU_FORMAT_SPEC_REG_Dstacc_val_MASK 0x3c00 +#define ALU_FORMAT_SPEC_REG_Dstacc_val_RMW ALU_FORMAT_SPEC_REG_Dstacc_val_ADDR32, ALU_FORMAT_SPEC_REG_Dstacc_val_SHAMT, ALU_FORMAT_SPEC_REG_Dstacc_val_MASK + +#define ALU_FORMAT_SPEC_REG_Dstacc_override_ADDR32 0 +#define ALU_FORMAT_SPEC_REG_Dstacc_override_SHAMT 14 +#define ALU_FORMAT_SPEC_REG_Dstacc_override_MASK 0x4000 +#define ALU_FORMAT_SPEC_REG_Dstacc_override_RMW ALU_FORMAT_SPEC_REG_Dstacc_override_ADDR32, ALU_FORMAT_SPEC_REG_Dstacc_override_SHAMT, ALU_FORMAT_SPEC_REG_Dstacc_override_MASK + +#define ALU_ROUNDING_MODE_Fpu_srnd_en_ADDR32 1 +#define ALU_ROUNDING_MODE_Fpu_srnd_en_SHAMT 0 +#define ALU_ROUNDING_MODE_Fpu_srnd_en_MASK 0x1 +#define ALU_ROUNDING_MODE_Fpu_srnd_en_RMW ALU_ROUNDING_MODE_Fpu_srnd_en_ADDR32, ALU_ROUNDING_MODE_Fpu_srnd_en_SHAMT, ALU_ROUNDING_MODE_Fpu_srnd_en_MASK + +#define ALU_ROUNDING_MODE_Gasket_srnd_en_ADDR32 1 +#define ALU_ROUNDING_MODE_Gasket_srnd_en_SHAMT 1 +#define ALU_ROUNDING_MODE_Gasket_srnd_en_MASK 0x2 +#define ALU_ROUNDING_MODE_Gasket_srnd_en_RMW ALU_ROUNDING_MODE_Gasket_srnd_en_ADDR32, ALU_ROUNDING_MODE_Gasket_srnd_en_SHAMT, ALU_ROUNDING_MODE_Gasket_srnd_en_MASK + +#define ALU_ROUNDING_MODE_Packer_srnd_en_ADDR32 1 +#define ALU_ROUNDING_MODE_Packer_srnd_en_SHAMT 2 +#define ALU_ROUNDING_MODE_Packer_srnd_en_MASK 0x4 +#define ALU_ROUNDING_MODE_Packer_srnd_en_RMW ALU_ROUNDING_MODE_Packer_srnd_en_ADDR32, ALU_ROUNDING_MODE_Packer_srnd_en_SHAMT, ALU_ROUNDING_MODE_Packer_srnd_en_MASK + +#define ALU_ROUNDING_MODE_Padding_ADDR32 1 +#define ALU_ROUNDING_MODE_Padding_SHAMT 3 +#define ALU_ROUNDING_MODE_Padding_MASK 0x1ff8 +#define ALU_ROUNDING_MODE_Padding_RMW ALU_ROUNDING_MODE_Padding_ADDR32, ALU_ROUNDING_MODE_Padding_SHAMT, 
ALU_ROUNDING_MODE_Padding_MASK + +#define ALU_ROUNDING_MODE_GS_LF_ADDR32 1 +#define ALU_ROUNDING_MODE_GS_LF_SHAMT 13 +#define ALU_ROUNDING_MODE_GS_LF_MASK 0x2000 +#define ALU_ROUNDING_MODE_GS_LF_RMW ALU_ROUNDING_MODE_GS_LF_ADDR32, ALU_ROUNDING_MODE_GS_LF_SHAMT, ALU_ROUNDING_MODE_GS_LF_MASK + +#define ALU_ROUNDING_MODE_Bfp8_HF_ADDR32 1 +#define ALU_ROUNDING_MODE_Bfp8_HF_SHAMT 14 +#define ALU_ROUNDING_MODE_Bfp8_HF_MASK 0x4000 +#define ALU_ROUNDING_MODE_Bfp8_HF_RMW ALU_ROUNDING_MODE_Bfp8_HF_ADDR32, ALU_ROUNDING_MODE_Bfp8_HF_SHAMT, ALU_ROUNDING_MODE_Bfp8_HF_MASK + +#define ALU_FORMAT_SPEC_REG0_SrcAUnsigned_ADDR32 1 +#define ALU_FORMAT_SPEC_REG0_SrcAUnsigned_SHAMT 15 +#define ALU_FORMAT_SPEC_REG0_SrcAUnsigned_MASK 0x8000 +#define ALU_FORMAT_SPEC_REG0_SrcAUnsigned_RMW ALU_FORMAT_SPEC_REG0_SrcAUnsigned_ADDR32, ALU_FORMAT_SPEC_REG0_SrcAUnsigned_SHAMT, ALU_FORMAT_SPEC_REG0_SrcAUnsigned_MASK + +#define ALU_FORMAT_SPEC_REG0_SrcBUnsigned_ADDR32 1 +#define ALU_FORMAT_SPEC_REG0_SrcBUnsigned_SHAMT 16 +#define ALU_FORMAT_SPEC_REG0_SrcBUnsigned_MASK 0x10000 +#define ALU_FORMAT_SPEC_REG0_SrcBUnsigned_RMW ALU_FORMAT_SPEC_REG0_SrcBUnsigned_ADDR32, ALU_FORMAT_SPEC_REG0_SrcBUnsigned_SHAMT, ALU_FORMAT_SPEC_REG0_SrcBUnsigned_MASK + +#define ALU_FORMAT_SPEC_REG0_SrcA_ADDR32 1 +#define ALU_FORMAT_SPEC_REG0_SrcA_SHAMT 17 +#define ALU_FORMAT_SPEC_REG0_SrcA_MASK 0x1e0000 +#define ALU_FORMAT_SPEC_REG0_SrcA_RMW ALU_FORMAT_SPEC_REG0_SrcA_ADDR32, ALU_FORMAT_SPEC_REG0_SrcA_SHAMT, ALU_FORMAT_SPEC_REG0_SrcA_MASK + +#define ALU_FORMAT_SPEC_REG1_SrcB_ADDR32 1 +#define ALU_FORMAT_SPEC_REG1_SrcB_SHAMT 21 +#define ALU_FORMAT_SPEC_REG1_SrcB_MASK 0x1e00000 +#define ALU_FORMAT_SPEC_REG1_SrcB_RMW ALU_FORMAT_SPEC_REG1_SrcB_ADDR32, ALU_FORMAT_SPEC_REG1_SrcB_SHAMT, ALU_FORMAT_SPEC_REG1_SrcB_MASK + +#define ALU_FORMAT_SPEC_REG2_Dstacc_ADDR32 1 +#define ALU_FORMAT_SPEC_REG2_Dstacc_SHAMT 25 +#define ALU_FORMAT_SPEC_REG2_Dstacc_MASK 0x1e000000 +#define ALU_FORMAT_SPEC_REG2_Dstacc_RMW ALU_FORMAT_SPEC_REG2_Dstacc_ADDR32, ALU_FORMAT_SPEC_REG2_Dstacc_SHAMT, ALU_FORMAT_SPEC_REG2_Dstacc_MASK + +#define ALU_ACC_CTRL_Fp32_enabled_ADDR32 1 +#define ALU_ACC_CTRL_Fp32_enabled_SHAMT 29 +#define ALU_ACC_CTRL_Fp32_enabled_MASK 0x20000000 +#define ALU_ACC_CTRL_Fp32_enabled_RMW ALU_ACC_CTRL_Fp32_enabled_ADDR32, ALU_ACC_CTRL_Fp32_enabled_SHAMT, ALU_ACC_CTRL_Fp32_enabled_MASK + +#define ALU_ACC_CTRL_SFPU_Fp32_enabled_ADDR32 1 +#define ALU_ACC_CTRL_SFPU_Fp32_enabled_SHAMT 30 +#define ALU_ACC_CTRL_SFPU_Fp32_enabled_MASK 0x40000000 +#define ALU_ACC_CTRL_SFPU_Fp32_enabled_RMW ALU_ACC_CTRL_SFPU_Fp32_enabled_ADDR32, ALU_ACC_CTRL_SFPU_Fp32_enabled_SHAMT, ALU_ACC_CTRL_SFPU_Fp32_enabled_MASK + +#define ALU_ACC_CTRL_INT8_math_enabled_ADDR32 1 +#define ALU_ACC_CTRL_INT8_math_enabled_SHAMT 31 +#define ALU_ACC_CTRL_INT8_math_enabled_MASK 0x80000000 +#define ALU_ACC_CTRL_INT8_math_enabled_RMW ALU_ACC_CTRL_INT8_math_enabled_ADDR32, ALU_ACC_CTRL_INT8_math_enabled_SHAMT, ALU_ACC_CTRL_INT8_math_enabled_MASK + +#define ALU_ACC_CTRL_Zero_Flag_disabled_src_ADDR32 2 +#define ALU_ACC_CTRL_Zero_Flag_disabled_src_SHAMT 0 +#define ALU_ACC_CTRL_Zero_Flag_disabled_src_MASK 0x1 +#define ALU_ACC_CTRL_Zero_Flag_disabled_src_RMW ALU_ACC_CTRL_Zero_Flag_disabled_src_ADDR32, ALU_ACC_CTRL_Zero_Flag_disabled_src_SHAMT, ALU_ACC_CTRL_Zero_Flag_disabled_src_MASK + +#define ALU_ACC_CTRL_Zero_Flag_disabled_dst_ADDR32 2 +#define ALU_ACC_CTRL_Zero_Flag_disabled_dst_SHAMT 1 +#define ALU_ACC_CTRL_Zero_Flag_disabled_dst_MASK 0x2 +#define ALU_ACC_CTRL_Zero_Flag_disabled_dst_RMW 
ALU_ACC_CTRL_Zero_Flag_disabled_dst_ADDR32, ALU_ACC_CTRL_Zero_Flag_disabled_dst_SHAMT, ALU_ACC_CTRL_Zero_Flag_disabled_dst_MASK + +#define STACC_RELU_ApplyRelu_ADDR32 2 +#define STACC_RELU_ApplyRelu_SHAMT 2 +#define STACC_RELU_ApplyRelu_MASK 0x3c +#define STACC_RELU_ApplyRelu_RMW STACC_RELU_ApplyRelu_ADDR32, STACC_RELU_ApplyRelu_SHAMT, STACC_RELU_ApplyRelu_MASK + +#define STACC_RELU_ReluThreshold_ADDR32 2 +#define STACC_RELU_ReluThreshold_SHAMT 6 +#define STACC_RELU_ReluThreshold_MASK 0x3fffc0 +#define STACC_RELU_ReluThreshold_RMW STACC_RELU_ReluThreshold_ADDR32, STACC_RELU_ReluThreshold_SHAMT, STACC_RELU_ReluThreshold_MASK + +#define DISABLE_RISC_BP_Disable_main_ADDR32 2 +#define DISABLE_RISC_BP_Disable_main_SHAMT 22 +#define DISABLE_RISC_BP_Disable_main_MASK 0x400000 +#define DISABLE_RISC_BP_Disable_main_RMW DISABLE_RISC_BP_Disable_main_ADDR32, DISABLE_RISC_BP_Disable_main_SHAMT, DISABLE_RISC_BP_Disable_main_MASK + +#define DISABLE_RISC_BP_Disable_trisc_ADDR32 2 +#define DISABLE_RISC_BP_Disable_trisc_SHAMT 23 +#define DISABLE_RISC_BP_Disable_trisc_MASK 0x3800000 +#define DISABLE_RISC_BP_Disable_trisc_RMW DISABLE_RISC_BP_Disable_trisc_ADDR32, DISABLE_RISC_BP_Disable_trisc_SHAMT, DISABLE_RISC_BP_Disable_trisc_MASK + +#define DISABLE_RISC_BP_Disable_ncrisc_ADDR32 2 +#define DISABLE_RISC_BP_Disable_ncrisc_SHAMT 26 +#define DISABLE_RISC_BP_Disable_ncrisc_MASK 0x4000000 +#define DISABLE_RISC_BP_Disable_ncrisc_RMW DISABLE_RISC_BP_Disable_ncrisc_ADDR32, DISABLE_RISC_BP_Disable_ncrisc_SHAMT, DISABLE_RISC_BP_Disable_ncrisc_MASK + +#define DISABLE_RISC_BP_Disable_bmp_clear_main_ADDR32 2 +#define DISABLE_RISC_BP_Disable_bmp_clear_main_SHAMT 27 +#define DISABLE_RISC_BP_Disable_bmp_clear_main_MASK 0x8000000 +#define DISABLE_RISC_BP_Disable_bmp_clear_main_RMW DISABLE_RISC_BP_Disable_bmp_clear_main_ADDR32, DISABLE_RISC_BP_Disable_bmp_clear_main_SHAMT, DISABLE_RISC_BP_Disable_bmp_clear_main_MASK + +#define DISABLE_RISC_BP_Disable_bmp_clear_trisc_ADDR32 2 +#define DISABLE_RISC_BP_Disable_bmp_clear_trisc_SHAMT 28 +#define DISABLE_RISC_BP_Disable_bmp_clear_trisc_MASK 0x70000000 +#define DISABLE_RISC_BP_Disable_bmp_clear_trisc_RMW DISABLE_RISC_BP_Disable_bmp_clear_trisc_ADDR32, DISABLE_RISC_BP_Disable_bmp_clear_trisc_SHAMT, DISABLE_RISC_BP_Disable_bmp_clear_trisc_MASK + +#define DISABLE_RISC_BP_Disable_bmp_clear_ncrisc_ADDR32 2 +#define DISABLE_RISC_BP_Disable_bmp_clear_ncrisc_SHAMT 31 +#define DISABLE_RISC_BP_Disable_bmp_clear_ncrisc_MASK 0x80000000 +#define DISABLE_RISC_BP_Disable_bmp_clear_ncrisc_RMW DISABLE_RISC_BP_Disable_bmp_clear_ncrisc_ADDR32, DISABLE_RISC_BP_Disable_bmp_clear_ncrisc_SHAMT, DISABLE_RISC_BP_Disable_bmp_clear_ncrisc_MASK + +#define ECC_SCRUBBER_Enable_ADDR32 3 +#define ECC_SCRUBBER_Enable_SHAMT 0 +#define ECC_SCRUBBER_Enable_MASK 0x1 +#define ECC_SCRUBBER_Enable_RMW ECC_SCRUBBER_Enable_ADDR32, ECC_SCRUBBER_Enable_SHAMT, ECC_SCRUBBER_Enable_MASK + +#define ECC_SCRUBBER_Scrub_On_Error_ADDR32 3 +#define ECC_SCRUBBER_Scrub_On_Error_SHAMT 1 +#define ECC_SCRUBBER_Scrub_On_Error_MASK 0x2 +#define ECC_SCRUBBER_Scrub_On_Error_RMW ECC_SCRUBBER_Scrub_On_Error_ADDR32, ECC_SCRUBBER_Scrub_On_Error_SHAMT, ECC_SCRUBBER_Scrub_On_Error_MASK + +#define ECC_SCRUBBER_Scrub_On_Error_Immediately_ADDR32 3 +#define ECC_SCRUBBER_Scrub_On_Error_Immediately_SHAMT 2 +#define ECC_SCRUBBER_Scrub_On_Error_Immediately_MASK 0x4 +#define ECC_SCRUBBER_Scrub_On_Error_Immediately_RMW ECC_SCRUBBER_Scrub_On_Error_Immediately_ADDR32, ECC_SCRUBBER_Scrub_On_Error_Immediately_SHAMT, ECC_SCRUBBER_Scrub_On_Error_Immediately_MASK + 
+#define ECC_SCRUBBER_Delay_ADDR32 3 +#define ECC_SCRUBBER_Delay_SHAMT 3 +#define ECC_SCRUBBER_Delay_MASK 0x3ff8 +#define ECC_SCRUBBER_Delay_RMW ECC_SCRUBBER_Delay_ADDR32, ECC_SCRUBBER_Delay_SHAMT, ECC_SCRUBBER_Delay_MASK + +#define RISC_DEST_ACCESS_CTRL_SEC0_no_swizzle_ADDR32 3 +#define RISC_DEST_ACCESS_CTRL_SEC0_no_swizzle_SHAMT 14 +#define RISC_DEST_ACCESS_CTRL_SEC0_no_swizzle_MASK 0x4000 +#define RISC_DEST_ACCESS_CTRL_SEC0_no_swizzle_RMW RISC_DEST_ACCESS_CTRL_SEC0_no_swizzle_ADDR32, RISC_DEST_ACCESS_CTRL_SEC0_no_swizzle_SHAMT, RISC_DEST_ACCESS_CTRL_SEC0_no_swizzle_MASK + +#define RISC_DEST_ACCESS_CTRL_SEC0_unsigned_int_ADDR32 3 +#define RISC_DEST_ACCESS_CTRL_SEC0_unsigned_int_SHAMT 15 +#define RISC_DEST_ACCESS_CTRL_SEC0_unsigned_int_MASK 0x8000 +#define RISC_DEST_ACCESS_CTRL_SEC0_unsigned_int_RMW RISC_DEST_ACCESS_CTRL_SEC0_unsigned_int_ADDR32, RISC_DEST_ACCESS_CTRL_SEC0_unsigned_int_SHAMT, RISC_DEST_ACCESS_CTRL_SEC0_unsigned_int_MASK + +#define RISC_DEST_ACCESS_CTRL_SEC0_fmt_ADDR32 3 +#define RISC_DEST_ACCESS_CTRL_SEC0_fmt_SHAMT 16 +#define RISC_DEST_ACCESS_CTRL_SEC0_fmt_MASK 0x70000 +#define RISC_DEST_ACCESS_CTRL_SEC0_fmt_RMW RISC_DEST_ACCESS_CTRL_SEC0_fmt_ADDR32, RISC_DEST_ACCESS_CTRL_SEC0_fmt_SHAMT, RISC_DEST_ACCESS_CTRL_SEC0_fmt_MASK + +#define RISC_DEST_ACCESS_CTRL_SEC1_no_swizzle_ADDR32 3 +#define RISC_DEST_ACCESS_CTRL_SEC1_no_swizzle_SHAMT 19 +#define RISC_DEST_ACCESS_CTRL_SEC1_no_swizzle_MASK 0x80000 +#define RISC_DEST_ACCESS_CTRL_SEC1_no_swizzle_RMW RISC_DEST_ACCESS_CTRL_SEC1_no_swizzle_ADDR32, RISC_DEST_ACCESS_CTRL_SEC1_no_swizzle_SHAMT, RISC_DEST_ACCESS_CTRL_SEC1_no_swizzle_MASK + +#define RISC_DEST_ACCESS_CTRL_SEC1_unsigned_int_ADDR32 3 +#define RISC_DEST_ACCESS_CTRL_SEC1_unsigned_int_SHAMT 20 +#define RISC_DEST_ACCESS_CTRL_SEC1_unsigned_int_MASK 0x100000 +#define RISC_DEST_ACCESS_CTRL_SEC1_unsigned_int_RMW RISC_DEST_ACCESS_CTRL_SEC1_unsigned_int_ADDR32, RISC_DEST_ACCESS_CTRL_SEC1_unsigned_int_SHAMT, RISC_DEST_ACCESS_CTRL_SEC1_unsigned_int_MASK + +#define RISC_DEST_ACCESS_CTRL_SEC1_fmt_ADDR32 3 +#define RISC_DEST_ACCESS_CTRL_SEC1_fmt_SHAMT 21 +#define RISC_DEST_ACCESS_CTRL_SEC1_fmt_MASK 0xe00000 +#define RISC_DEST_ACCESS_CTRL_SEC1_fmt_RMW RISC_DEST_ACCESS_CTRL_SEC1_fmt_ADDR32, RISC_DEST_ACCESS_CTRL_SEC1_fmt_SHAMT, RISC_DEST_ACCESS_CTRL_SEC1_fmt_MASK + +#define RISC_DEST_ACCESS_CTRL_SEC2_no_swizzle_ADDR32 3 +#define RISC_DEST_ACCESS_CTRL_SEC2_no_swizzle_SHAMT 24 +#define RISC_DEST_ACCESS_CTRL_SEC2_no_swizzle_MASK 0x1000000 +#define RISC_DEST_ACCESS_CTRL_SEC2_no_swizzle_RMW RISC_DEST_ACCESS_CTRL_SEC2_no_swizzle_ADDR32, RISC_DEST_ACCESS_CTRL_SEC2_no_swizzle_SHAMT, RISC_DEST_ACCESS_CTRL_SEC2_no_swizzle_MASK + +#define RISC_DEST_ACCESS_CTRL_SEC2_unsigned_int_ADDR32 3 +#define RISC_DEST_ACCESS_CTRL_SEC2_unsigned_int_SHAMT 25 +#define RISC_DEST_ACCESS_CTRL_SEC2_unsigned_int_MASK 0x2000000 +#define RISC_DEST_ACCESS_CTRL_SEC2_unsigned_int_RMW RISC_DEST_ACCESS_CTRL_SEC2_unsigned_int_ADDR32, RISC_DEST_ACCESS_CTRL_SEC2_unsigned_int_SHAMT, RISC_DEST_ACCESS_CTRL_SEC2_unsigned_int_MASK + +#define RISC_DEST_ACCESS_CTRL_SEC2_fmt_ADDR32 3 +#define RISC_DEST_ACCESS_CTRL_SEC2_fmt_SHAMT 26 +#define RISC_DEST_ACCESS_CTRL_SEC2_fmt_MASK 0x1c000000 +#define RISC_DEST_ACCESS_CTRL_SEC2_fmt_RMW RISC_DEST_ACCESS_CTRL_SEC2_fmt_ADDR32, RISC_DEST_ACCESS_CTRL_SEC2_fmt_SHAMT, RISC_DEST_ACCESS_CTRL_SEC2_fmt_MASK + +#define STATE_RESET_EN_ADDR32 4 +#define STATE_RESET_EN_SHAMT 0 +#define STATE_RESET_EN_MASK 0x1 +#define STATE_RESET_EN_RMW STATE_RESET_EN_ADDR32, STATE_RESET_EN_SHAMT, STATE_RESET_EN_MASK + 
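+// --------------------------------------------------------------------
+// Illustration only (not part of the generated register list): every
+// field above is described by an (ADDR32, SHAMT, MASK) triple, and its
+// *_RMW macro expands to exactly that triple, so it can be splatted
+// into a read-modify-write helper.  A minimal sketch of such a helper,
+// assuming a hypothetical `cfg_regs` pointer to the 32-bit config
+// register file (the real accessor in this codebase may differ):
+//
+//   static inline void cfg_rmw(volatile uint32_t *cfg_regs,
+//                              uint32_t addr32, uint32_t shamt,
+//                              uint32_t mask, uint32_t val) {
+//       uint32_t word = cfg_regs[addr32];  // read the 32-bit word
+//       word &= ~mask;                     // clear the target field
+//       word |= (val << shamt) & mask;     // insert the new value
+//       cfg_regs[addr32] = word;           // write the word back
+//   }
+//
+//   // e.g. turn on the ECC scrubber:
+//   //   cfg_rmw(cfg_regs, ECC_SCRUBBER_Enable_RMW, 1);
+// --------------------------------------------------------------------
+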
+#define DEST_OFFSET_Enable_ADDR32 5 +#define DEST_OFFSET_Enable_SHAMT 0 +#define DEST_OFFSET_Enable_MASK 0x1 +#define DEST_OFFSET_Enable_RMW DEST_OFFSET_Enable_ADDR32, DEST_OFFSET_Enable_SHAMT, DEST_OFFSET_Enable_MASK + +#define DEST_REGW_BASE_Base_ADDR32 6 +#define DEST_REGW_BASE_Base_SHAMT 0 +#define DEST_REGW_BASE_Base_MASK 0xffff +#define DEST_REGW_BASE_Base_RMW DEST_REGW_BASE_Base_ADDR32, DEST_REGW_BASE_Base_SHAMT, DEST_REGW_BASE_Base_MASK + +#define DEST_SP_BASE_Base_ADDR32 7 +#define DEST_SP_BASE_Base_SHAMT 0 +#define DEST_SP_BASE_Base_MASK 0xffff +#define DEST_SP_BASE_Base_RMW DEST_SP_BASE_Base_ADDR32, DEST_SP_BASE_Base_SHAMT, DEST_SP_BASE_Base_MASK + +#define INT_DESCALE_Enable_ADDR32 8 +#define INT_DESCALE_Enable_SHAMT 0 +#define INT_DESCALE_Enable_MASK 0x1 +#define INT_DESCALE_Enable_RMW INT_DESCALE_Enable_ADDR32, INT_DESCALE_Enable_SHAMT, INT_DESCALE_Enable_MASK + +#define INT_DESCALE_Mode_ADDR32 8 +#define INT_DESCALE_Mode_SHAMT 1 +#define INT_DESCALE_Mode_MASK 0x2 +#define INT_DESCALE_Mode_RMW INT_DESCALE_Mode_ADDR32, INT_DESCALE_Mode_SHAMT, INT_DESCALE_Mode_MASK + +//////////////////////////////////////////////////////////////////////// +// Registers for PACK0 +//////////////////////////////////////////////////////////////////////// + +#define PACK0_CFGREG_BASE_ADDR32 12 + +#define PCK0_ADDR_CTRL_XY_REG_0_Xstride_ADDR32 12 +#define PCK0_ADDR_CTRL_XY_REG_0_Xstride_SHAMT 0 +#define PCK0_ADDR_CTRL_XY_REG_0_Xstride_MASK 0xffff +#define PCK0_ADDR_CTRL_XY_REG_0_Xstride_RMW PCK0_ADDR_CTRL_XY_REG_0_Xstride_ADDR32, PCK0_ADDR_CTRL_XY_REG_0_Xstride_SHAMT, PCK0_ADDR_CTRL_XY_REG_0_Xstride_MASK + +#define PCK0_ADDR_CTRL_XY_REG_0_Ystride_ADDR32 12 +#define PCK0_ADDR_CTRL_XY_REG_0_Ystride_SHAMT 16 +#define PCK0_ADDR_CTRL_XY_REG_0_Ystride_MASK 0xffff0000 +#define PCK0_ADDR_CTRL_XY_REG_0_Ystride_RMW PCK0_ADDR_CTRL_XY_REG_0_Ystride_ADDR32, PCK0_ADDR_CTRL_XY_REG_0_Ystride_SHAMT, PCK0_ADDR_CTRL_XY_REG_0_Ystride_MASK + +#define PCK0_ADDR_CTRL_ZW_REG_0_Zstride_ADDR32 13 +#define PCK0_ADDR_CTRL_ZW_REG_0_Zstride_SHAMT 0 +#define PCK0_ADDR_CTRL_ZW_REG_0_Zstride_MASK 0xffff +#define PCK0_ADDR_CTRL_ZW_REG_0_Zstride_RMW PCK0_ADDR_CTRL_ZW_REG_0_Zstride_ADDR32, PCK0_ADDR_CTRL_ZW_REG_0_Zstride_SHAMT, PCK0_ADDR_CTRL_ZW_REG_0_Zstride_MASK + +#define PCK0_ADDR_CTRL_ZW_REG_0_Wstride_ADDR32 13 +#define PCK0_ADDR_CTRL_ZW_REG_0_Wstride_SHAMT 16 +#define PCK0_ADDR_CTRL_ZW_REG_0_Wstride_MASK 0xffff0000 +#define PCK0_ADDR_CTRL_ZW_REG_0_Wstride_RMW PCK0_ADDR_CTRL_ZW_REG_0_Wstride_ADDR32, PCK0_ADDR_CTRL_ZW_REG_0_Wstride_SHAMT, PCK0_ADDR_CTRL_ZW_REG_0_Wstride_MASK + +#define PCK0_ADDR_CTRL_XY_REG_1_Xstride_ADDR32 14 +#define PCK0_ADDR_CTRL_XY_REG_1_Xstride_SHAMT 0 +#define PCK0_ADDR_CTRL_XY_REG_1_Xstride_MASK 0xffff +#define PCK0_ADDR_CTRL_XY_REG_1_Xstride_RMW PCK0_ADDR_CTRL_XY_REG_1_Xstride_ADDR32, PCK0_ADDR_CTRL_XY_REG_1_Xstride_SHAMT, PCK0_ADDR_CTRL_XY_REG_1_Xstride_MASK + +#define PCK0_ADDR_CTRL_XY_REG_1_Ystride_ADDR32 14 +#define PCK0_ADDR_CTRL_XY_REG_1_Ystride_SHAMT 16 +#define PCK0_ADDR_CTRL_XY_REG_1_Ystride_MASK 0xffff0000 +#define PCK0_ADDR_CTRL_XY_REG_1_Ystride_RMW PCK0_ADDR_CTRL_XY_REG_1_Ystride_ADDR32, PCK0_ADDR_CTRL_XY_REG_1_Ystride_SHAMT, PCK0_ADDR_CTRL_XY_REG_1_Ystride_MASK + +#define PCK0_ADDR_CTRL_ZW_REG_1_Zstride_ADDR32 15 +#define PCK0_ADDR_CTRL_ZW_REG_1_Zstride_SHAMT 0 +#define PCK0_ADDR_CTRL_ZW_REG_1_Zstride_MASK 0xffff +#define PCK0_ADDR_CTRL_ZW_REG_1_Zstride_RMW PCK0_ADDR_CTRL_ZW_REG_1_Zstride_ADDR32, PCK0_ADDR_CTRL_ZW_REG_1_Zstride_SHAMT, PCK0_ADDR_CTRL_ZW_REG_1_Zstride_MASK + +#define 
PCK0_ADDR_CTRL_ZW_REG_1_Wstride_ADDR32 15 +#define PCK0_ADDR_CTRL_ZW_REG_1_Wstride_SHAMT 16 +#define PCK0_ADDR_CTRL_ZW_REG_1_Wstride_MASK 0xffff0000 +#define PCK0_ADDR_CTRL_ZW_REG_1_Wstride_RMW PCK0_ADDR_CTRL_ZW_REG_1_Wstride_ADDR32, PCK0_ADDR_CTRL_ZW_REG_1_Wstride_SHAMT, PCK0_ADDR_CTRL_ZW_REG_1_Wstride_MASK + +#define PCK0_ADDR_BASE_REG_0_Base_ADDR32 16 +#define PCK0_ADDR_BASE_REG_0_Base_SHAMT 0 +#define PCK0_ADDR_BASE_REG_0_Base_MASK 0x3ffff +#define PCK0_ADDR_BASE_REG_0_Base_RMW PCK0_ADDR_BASE_REG_0_Base_ADDR32, PCK0_ADDR_BASE_REG_0_Base_SHAMT, PCK0_ADDR_BASE_REG_0_Base_MASK + +#define PCK0_ADDR_BASE_REG_1_Base_ADDR32 17 +#define PCK0_ADDR_BASE_REG_1_Base_SHAMT 0 +#define PCK0_ADDR_BASE_REG_1_Base_MASK 0x3ffff +#define PCK0_ADDR_BASE_REG_1_Base_RMW PCK0_ADDR_BASE_REG_1_Base_ADDR32, PCK0_ADDR_BASE_REG_1_Base_SHAMT, PCK0_ADDR_BASE_REG_1_Base_MASK + +#define PCK_DEST_RD_CTRL_Read_32b_data_ADDR32 18 +#define PCK_DEST_RD_CTRL_Read_32b_data_SHAMT 0 +#define PCK_DEST_RD_CTRL_Read_32b_data_MASK 0x1 +#define PCK_DEST_RD_CTRL_Read_32b_data_RMW PCK_DEST_RD_CTRL_Read_32b_data_ADDR32, PCK_DEST_RD_CTRL_Read_32b_data_SHAMT, PCK_DEST_RD_CTRL_Read_32b_data_MASK + +#define PCK_DEST_RD_CTRL_Read_unsigned_ADDR32 18 +#define PCK_DEST_RD_CTRL_Read_unsigned_SHAMT 1 +#define PCK_DEST_RD_CTRL_Read_unsigned_MASK 0x2 +#define PCK_DEST_RD_CTRL_Read_unsigned_RMW PCK_DEST_RD_CTRL_Read_unsigned_ADDR32, PCK_DEST_RD_CTRL_Read_unsigned_SHAMT, PCK_DEST_RD_CTRL_Read_unsigned_MASK + +#define PCK_DEST_RD_CTRL_Read_int8_ADDR32 18 +#define PCK_DEST_RD_CTRL_Read_int8_SHAMT 2 +#define PCK_DEST_RD_CTRL_Read_int8_MASK 0x4 +#define PCK_DEST_RD_CTRL_Read_int8_RMW PCK_DEST_RD_CTRL_Read_int8_ADDR32, PCK_DEST_RD_CTRL_Read_int8_SHAMT, PCK_DEST_RD_CTRL_Read_int8_MASK + +#define PCK_DEST_RD_CTRL_Round_10b_mant_ADDR32 18 +#define PCK_DEST_RD_CTRL_Round_10b_mant_SHAMT 3 +#define PCK_DEST_RD_CTRL_Round_10b_mant_MASK 0x8 +#define PCK_DEST_RD_CTRL_Round_10b_mant_RMW PCK_DEST_RD_CTRL_Round_10b_mant_ADDR32, PCK_DEST_RD_CTRL_Round_10b_mant_SHAMT, PCK_DEST_RD_CTRL_Round_10b_mant_MASK + +#define PCK_EDGE_TILE_FACE_SET_SELECT_select_ADDR32 19 +#define PCK_EDGE_TILE_FACE_SET_SELECT_select_SHAMT 0 +#define PCK_EDGE_TILE_FACE_SET_SELECT_select_MASK 0xff +#define PCK_EDGE_TILE_FACE_SET_SELECT_select_RMW PCK_EDGE_TILE_FACE_SET_SELECT_select_ADDR32, PCK_EDGE_TILE_FACE_SET_SELECT_select_SHAMT, PCK_EDGE_TILE_FACE_SET_SELECT_select_MASK + +#define PCK_EDGE_TILE_FACE_SET_SELECT_enable_ADDR32 19 +#define PCK_EDGE_TILE_FACE_SET_SELECT_enable_SHAMT 8 +#define PCK_EDGE_TILE_FACE_SET_SELECT_enable_MASK 0x100 +#define PCK_EDGE_TILE_FACE_SET_SELECT_enable_RMW PCK_EDGE_TILE_FACE_SET_SELECT_enable_ADDR32, PCK_EDGE_TILE_FACE_SET_SELECT_enable_SHAMT, PCK_EDGE_TILE_FACE_SET_SELECT_enable_MASK + +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_0_ADDR32 20 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_0_SHAMT 0 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_0_MASK 0x3 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_0_RMW TILE_ROW_SET_MAPPING_0_row_set_mapping_0_ADDR32, TILE_ROW_SET_MAPPING_0_row_set_mapping_0_SHAMT, TILE_ROW_SET_MAPPING_0_row_set_mapping_0_MASK + +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_1_ADDR32 20 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_1_SHAMT 2 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_1_MASK 0xc +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_1_RMW TILE_ROW_SET_MAPPING_0_row_set_mapping_1_ADDR32, TILE_ROW_SET_MAPPING_0_row_set_mapping_1_SHAMT, TILE_ROW_SET_MAPPING_0_row_set_mapping_1_MASK + +#define 
TILE_ROW_SET_MAPPING_0_row_set_mapping_2_ADDR32 20 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_2_SHAMT 4 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_2_MASK 0x30 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_2_RMW TILE_ROW_SET_MAPPING_0_row_set_mapping_2_ADDR32, TILE_ROW_SET_MAPPING_0_row_set_mapping_2_SHAMT, TILE_ROW_SET_MAPPING_0_row_set_mapping_2_MASK + +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_3_ADDR32 20 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_3_SHAMT 6 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_3_MASK 0xc0 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_3_RMW TILE_ROW_SET_MAPPING_0_row_set_mapping_3_ADDR32, TILE_ROW_SET_MAPPING_0_row_set_mapping_3_SHAMT, TILE_ROW_SET_MAPPING_0_row_set_mapping_3_MASK + +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_4_ADDR32 20 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_4_SHAMT 8 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_4_MASK 0x300 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_4_RMW TILE_ROW_SET_MAPPING_0_row_set_mapping_4_ADDR32, TILE_ROW_SET_MAPPING_0_row_set_mapping_4_SHAMT, TILE_ROW_SET_MAPPING_0_row_set_mapping_4_MASK + +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_5_ADDR32 20 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_5_SHAMT 10 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_5_MASK 0xc00 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_5_RMW TILE_ROW_SET_MAPPING_0_row_set_mapping_5_ADDR32, TILE_ROW_SET_MAPPING_0_row_set_mapping_5_SHAMT, TILE_ROW_SET_MAPPING_0_row_set_mapping_5_MASK + +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_6_ADDR32 20 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_6_SHAMT 12 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_6_MASK 0x3000 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_6_RMW TILE_ROW_SET_MAPPING_0_row_set_mapping_6_ADDR32, TILE_ROW_SET_MAPPING_0_row_set_mapping_6_SHAMT, TILE_ROW_SET_MAPPING_0_row_set_mapping_6_MASK + +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_7_ADDR32 20 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_7_SHAMT 14 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_7_MASK 0xc000 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_7_RMW TILE_ROW_SET_MAPPING_0_row_set_mapping_7_ADDR32, TILE_ROW_SET_MAPPING_0_row_set_mapping_7_SHAMT, TILE_ROW_SET_MAPPING_0_row_set_mapping_7_MASK + +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_8_ADDR32 20 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_8_SHAMT 16 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_8_MASK 0x30000 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_8_RMW TILE_ROW_SET_MAPPING_0_row_set_mapping_8_ADDR32, TILE_ROW_SET_MAPPING_0_row_set_mapping_8_SHAMT, TILE_ROW_SET_MAPPING_0_row_set_mapping_8_MASK + +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_9_ADDR32 20 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_9_SHAMT 18 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_9_MASK 0xc0000 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_9_RMW TILE_ROW_SET_MAPPING_0_row_set_mapping_9_ADDR32, TILE_ROW_SET_MAPPING_0_row_set_mapping_9_SHAMT, TILE_ROW_SET_MAPPING_0_row_set_mapping_9_MASK + +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_10_ADDR32 20 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_10_SHAMT 20 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_10_MASK 0x300000 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_10_RMW TILE_ROW_SET_MAPPING_0_row_set_mapping_10_ADDR32, TILE_ROW_SET_MAPPING_0_row_set_mapping_10_SHAMT, TILE_ROW_SET_MAPPING_0_row_set_mapping_10_MASK + +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_11_ADDR32 20 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_11_SHAMT 22 +#define 
TILE_ROW_SET_MAPPING_0_row_set_mapping_11_MASK 0xc00000 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_11_RMW TILE_ROW_SET_MAPPING_0_row_set_mapping_11_ADDR32, TILE_ROW_SET_MAPPING_0_row_set_mapping_11_SHAMT, TILE_ROW_SET_MAPPING_0_row_set_mapping_11_MASK + +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_12_ADDR32 20 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_12_SHAMT 24 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_12_MASK 0x3000000 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_12_RMW TILE_ROW_SET_MAPPING_0_row_set_mapping_12_ADDR32, TILE_ROW_SET_MAPPING_0_row_set_mapping_12_SHAMT, TILE_ROW_SET_MAPPING_0_row_set_mapping_12_MASK + +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_13_ADDR32 20 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_13_SHAMT 26 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_13_MASK 0xc000000 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_13_RMW TILE_ROW_SET_MAPPING_0_row_set_mapping_13_ADDR32, TILE_ROW_SET_MAPPING_0_row_set_mapping_13_SHAMT, TILE_ROW_SET_MAPPING_0_row_set_mapping_13_MASK + +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_14_ADDR32 20 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_14_SHAMT 28 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_14_MASK 0x30000000 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_14_RMW TILE_ROW_SET_MAPPING_0_row_set_mapping_14_ADDR32, TILE_ROW_SET_MAPPING_0_row_set_mapping_14_SHAMT, TILE_ROW_SET_MAPPING_0_row_set_mapping_14_MASK + +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_15_ADDR32 20 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_15_SHAMT 30 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_15_MASK 0xc0000000 +#define TILE_ROW_SET_MAPPING_0_row_set_mapping_15_RMW TILE_ROW_SET_MAPPING_0_row_set_mapping_15_ADDR32, TILE_ROW_SET_MAPPING_0_row_set_mapping_15_SHAMT, TILE_ROW_SET_MAPPING_0_row_set_mapping_15_MASK + +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_0_ADDR32 21 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_0_SHAMT 0 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_0_MASK 0x3 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_0_RMW TILE_ROW_SET_MAPPING_1_row_set_mapping_0_ADDR32, TILE_ROW_SET_MAPPING_1_row_set_mapping_0_SHAMT, TILE_ROW_SET_MAPPING_1_row_set_mapping_0_MASK + +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_1_ADDR32 21 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_1_SHAMT 2 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_1_MASK 0xc +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_1_RMW TILE_ROW_SET_MAPPING_1_row_set_mapping_1_ADDR32, TILE_ROW_SET_MAPPING_1_row_set_mapping_1_SHAMT, TILE_ROW_SET_MAPPING_1_row_set_mapping_1_MASK + +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_2_ADDR32 21 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_2_SHAMT 4 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_2_MASK 0x30 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_2_RMW TILE_ROW_SET_MAPPING_1_row_set_mapping_2_ADDR32, TILE_ROW_SET_MAPPING_1_row_set_mapping_2_SHAMT, TILE_ROW_SET_MAPPING_1_row_set_mapping_2_MASK + +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_3_ADDR32 21 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_3_SHAMT 6 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_3_MASK 0xc0 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_3_RMW TILE_ROW_SET_MAPPING_1_row_set_mapping_3_ADDR32, TILE_ROW_SET_MAPPING_1_row_set_mapping_3_SHAMT, TILE_ROW_SET_MAPPING_1_row_set_mapping_3_MASK + +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_4_ADDR32 21 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_4_SHAMT 8 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_4_MASK 0x300 +#define 
TILE_ROW_SET_MAPPING_1_row_set_mapping_4_RMW TILE_ROW_SET_MAPPING_1_row_set_mapping_4_ADDR32, TILE_ROW_SET_MAPPING_1_row_set_mapping_4_SHAMT, TILE_ROW_SET_MAPPING_1_row_set_mapping_4_MASK + +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_5_ADDR32 21 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_5_SHAMT 10 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_5_MASK 0xc00 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_5_RMW TILE_ROW_SET_MAPPING_1_row_set_mapping_5_ADDR32, TILE_ROW_SET_MAPPING_1_row_set_mapping_5_SHAMT, TILE_ROW_SET_MAPPING_1_row_set_mapping_5_MASK + +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_6_ADDR32 21 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_6_SHAMT 12 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_6_MASK 0x3000 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_6_RMW TILE_ROW_SET_MAPPING_1_row_set_mapping_6_ADDR32, TILE_ROW_SET_MAPPING_1_row_set_mapping_6_SHAMT, TILE_ROW_SET_MAPPING_1_row_set_mapping_6_MASK + +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_7_ADDR32 21 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_7_SHAMT 14 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_7_MASK 0xc000 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_7_RMW TILE_ROW_SET_MAPPING_1_row_set_mapping_7_ADDR32, TILE_ROW_SET_MAPPING_1_row_set_mapping_7_SHAMT, TILE_ROW_SET_MAPPING_1_row_set_mapping_7_MASK + +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_8_ADDR32 21 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_8_SHAMT 16 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_8_MASK 0x30000 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_8_RMW TILE_ROW_SET_MAPPING_1_row_set_mapping_8_ADDR32, TILE_ROW_SET_MAPPING_1_row_set_mapping_8_SHAMT, TILE_ROW_SET_MAPPING_1_row_set_mapping_8_MASK + +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_9_ADDR32 21 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_9_SHAMT 18 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_9_MASK 0xc0000 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_9_RMW TILE_ROW_SET_MAPPING_1_row_set_mapping_9_ADDR32, TILE_ROW_SET_MAPPING_1_row_set_mapping_9_SHAMT, TILE_ROW_SET_MAPPING_1_row_set_mapping_9_MASK + +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_10_ADDR32 21 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_10_SHAMT 20 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_10_MASK 0x300000 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_10_RMW TILE_ROW_SET_MAPPING_1_row_set_mapping_10_ADDR32, TILE_ROW_SET_MAPPING_1_row_set_mapping_10_SHAMT, TILE_ROW_SET_MAPPING_1_row_set_mapping_10_MASK + +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_11_ADDR32 21 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_11_SHAMT 22 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_11_MASK 0xc00000 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_11_RMW TILE_ROW_SET_MAPPING_1_row_set_mapping_11_ADDR32, TILE_ROW_SET_MAPPING_1_row_set_mapping_11_SHAMT, TILE_ROW_SET_MAPPING_1_row_set_mapping_11_MASK + +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_12_ADDR32 21 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_12_SHAMT 24 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_12_MASK 0x3000000 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_12_RMW TILE_ROW_SET_MAPPING_1_row_set_mapping_12_ADDR32, TILE_ROW_SET_MAPPING_1_row_set_mapping_12_SHAMT, TILE_ROW_SET_MAPPING_1_row_set_mapping_12_MASK + +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_13_ADDR32 21 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_13_SHAMT 26 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_13_MASK 0xc000000 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_13_RMW TILE_ROW_SET_MAPPING_1_row_set_mapping_13_ADDR32, 
TILE_ROW_SET_MAPPING_1_row_set_mapping_13_SHAMT, TILE_ROW_SET_MAPPING_1_row_set_mapping_13_MASK + +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_14_ADDR32 21 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_14_SHAMT 28 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_14_MASK 0x30000000 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_14_RMW TILE_ROW_SET_MAPPING_1_row_set_mapping_14_ADDR32, TILE_ROW_SET_MAPPING_1_row_set_mapping_14_SHAMT, TILE_ROW_SET_MAPPING_1_row_set_mapping_14_MASK + +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_15_ADDR32 21 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_15_SHAMT 30 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_15_MASK 0xc0000000 +#define TILE_ROW_SET_MAPPING_1_row_set_mapping_15_RMW TILE_ROW_SET_MAPPING_1_row_set_mapping_15_ADDR32, TILE_ROW_SET_MAPPING_1_row_set_mapping_15_SHAMT, TILE_ROW_SET_MAPPING_1_row_set_mapping_15_MASK + +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_0_ADDR32 22 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_0_SHAMT 0 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_0_MASK 0x3 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_0_RMW TILE_ROW_SET_MAPPING_2_row_set_mapping_0_ADDR32, TILE_ROW_SET_MAPPING_2_row_set_mapping_0_SHAMT, TILE_ROW_SET_MAPPING_2_row_set_mapping_0_MASK + +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_1_ADDR32 22 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_1_SHAMT 2 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_1_MASK 0xc +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_1_RMW TILE_ROW_SET_MAPPING_2_row_set_mapping_1_ADDR32, TILE_ROW_SET_MAPPING_2_row_set_mapping_1_SHAMT, TILE_ROW_SET_MAPPING_2_row_set_mapping_1_MASK + +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_2_ADDR32 22 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_2_SHAMT 4 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_2_MASK 0x30 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_2_RMW TILE_ROW_SET_MAPPING_2_row_set_mapping_2_ADDR32, TILE_ROW_SET_MAPPING_2_row_set_mapping_2_SHAMT, TILE_ROW_SET_MAPPING_2_row_set_mapping_2_MASK + +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_3_ADDR32 22 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_3_SHAMT 6 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_3_MASK 0xc0 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_3_RMW TILE_ROW_SET_MAPPING_2_row_set_mapping_3_ADDR32, TILE_ROW_SET_MAPPING_2_row_set_mapping_3_SHAMT, TILE_ROW_SET_MAPPING_2_row_set_mapping_3_MASK + +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_4_ADDR32 22 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_4_SHAMT 8 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_4_MASK 0x300 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_4_RMW TILE_ROW_SET_MAPPING_2_row_set_mapping_4_ADDR32, TILE_ROW_SET_MAPPING_2_row_set_mapping_4_SHAMT, TILE_ROW_SET_MAPPING_2_row_set_mapping_4_MASK + +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_5_ADDR32 22 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_5_SHAMT 10 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_5_MASK 0xc00 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_5_RMW TILE_ROW_SET_MAPPING_2_row_set_mapping_5_ADDR32, TILE_ROW_SET_MAPPING_2_row_set_mapping_5_SHAMT, TILE_ROW_SET_MAPPING_2_row_set_mapping_5_MASK + +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_6_ADDR32 22 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_6_SHAMT 12 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_6_MASK 0x3000 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_6_RMW TILE_ROW_SET_MAPPING_2_row_set_mapping_6_ADDR32, TILE_ROW_SET_MAPPING_2_row_set_mapping_6_SHAMT, TILE_ROW_SET_MAPPING_2_row_set_mapping_6_MASK + +#define 
TILE_ROW_SET_MAPPING_2_row_set_mapping_7_ADDR32 22 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_7_SHAMT 14 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_7_MASK 0xc000 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_7_RMW TILE_ROW_SET_MAPPING_2_row_set_mapping_7_ADDR32, TILE_ROW_SET_MAPPING_2_row_set_mapping_7_SHAMT, TILE_ROW_SET_MAPPING_2_row_set_mapping_7_MASK + +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_8_ADDR32 22 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_8_SHAMT 16 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_8_MASK 0x30000 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_8_RMW TILE_ROW_SET_MAPPING_2_row_set_mapping_8_ADDR32, TILE_ROW_SET_MAPPING_2_row_set_mapping_8_SHAMT, TILE_ROW_SET_MAPPING_2_row_set_mapping_8_MASK + +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_9_ADDR32 22 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_9_SHAMT 18 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_9_MASK 0xc0000 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_9_RMW TILE_ROW_SET_MAPPING_2_row_set_mapping_9_ADDR32, TILE_ROW_SET_MAPPING_2_row_set_mapping_9_SHAMT, TILE_ROW_SET_MAPPING_2_row_set_mapping_9_MASK + +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_10_ADDR32 22 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_10_SHAMT 20 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_10_MASK 0x300000 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_10_RMW TILE_ROW_SET_MAPPING_2_row_set_mapping_10_ADDR32, TILE_ROW_SET_MAPPING_2_row_set_mapping_10_SHAMT, TILE_ROW_SET_MAPPING_2_row_set_mapping_10_MASK + +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_11_ADDR32 22 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_11_SHAMT 22 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_11_MASK 0xc00000 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_11_RMW TILE_ROW_SET_MAPPING_2_row_set_mapping_11_ADDR32, TILE_ROW_SET_MAPPING_2_row_set_mapping_11_SHAMT, TILE_ROW_SET_MAPPING_2_row_set_mapping_11_MASK + +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_12_ADDR32 22 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_12_SHAMT 24 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_12_MASK 0x3000000 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_12_RMW TILE_ROW_SET_MAPPING_2_row_set_mapping_12_ADDR32, TILE_ROW_SET_MAPPING_2_row_set_mapping_12_SHAMT, TILE_ROW_SET_MAPPING_2_row_set_mapping_12_MASK + +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_13_ADDR32 22 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_13_SHAMT 26 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_13_MASK 0xc000000 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_13_RMW TILE_ROW_SET_MAPPING_2_row_set_mapping_13_ADDR32, TILE_ROW_SET_MAPPING_2_row_set_mapping_13_SHAMT, TILE_ROW_SET_MAPPING_2_row_set_mapping_13_MASK + +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_14_ADDR32 22 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_14_SHAMT 28 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_14_MASK 0x30000000 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_14_RMW TILE_ROW_SET_MAPPING_2_row_set_mapping_14_ADDR32, TILE_ROW_SET_MAPPING_2_row_set_mapping_14_SHAMT, TILE_ROW_SET_MAPPING_2_row_set_mapping_14_MASK + +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_15_ADDR32 22 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_15_SHAMT 30 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_15_MASK 0xc0000000 +#define TILE_ROW_SET_MAPPING_2_row_set_mapping_15_RMW TILE_ROW_SET_MAPPING_2_row_set_mapping_15_ADDR32, TILE_ROW_SET_MAPPING_2_row_set_mapping_15_SHAMT, TILE_ROW_SET_MAPPING_2_row_set_mapping_15_MASK + +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_0_ADDR32 23 +#define 
TILE_ROW_SET_MAPPING_3_row_set_mapping_0_SHAMT 0 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_0_MASK 0x3 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_0_RMW TILE_ROW_SET_MAPPING_3_row_set_mapping_0_ADDR32, TILE_ROW_SET_MAPPING_3_row_set_mapping_0_SHAMT, TILE_ROW_SET_MAPPING_3_row_set_mapping_0_MASK + +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_1_ADDR32 23 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_1_SHAMT 2 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_1_MASK 0xc +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_1_RMW TILE_ROW_SET_MAPPING_3_row_set_mapping_1_ADDR32, TILE_ROW_SET_MAPPING_3_row_set_mapping_1_SHAMT, TILE_ROW_SET_MAPPING_3_row_set_mapping_1_MASK + +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_2_ADDR32 23 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_2_SHAMT 4 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_2_MASK 0x30 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_2_RMW TILE_ROW_SET_MAPPING_3_row_set_mapping_2_ADDR32, TILE_ROW_SET_MAPPING_3_row_set_mapping_2_SHAMT, TILE_ROW_SET_MAPPING_3_row_set_mapping_2_MASK + +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_3_ADDR32 23 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_3_SHAMT 6 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_3_MASK 0xc0 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_3_RMW TILE_ROW_SET_MAPPING_3_row_set_mapping_3_ADDR32, TILE_ROW_SET_MAPPING_3_row_set_mapping_3_SHAMT, TILE_ROW_SET_MAPPING_3_row_set_mapping_3_MASK + +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_4_ADDR32 23 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_4_SHAMT 8 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_4_MASK 0x300 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_4_RMW TILE_ROW_SET_MAPPING_3_row_set_mapping_4_ADDR32, TILE_ROW_SET_MAPPING_3_row_set_mapping_4_SHAMT, TILE_ROW_SET_MAPPING_3_row_set_mapping_4_MASK + +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_5_ADDR32 23 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_5_SHAMT 10 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_5_MASK 0xc00 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_5_RMW TILE_ROW_SET_MAPPING_3_row_set_mapping_5_ADDR32, TILE_ROW_SET_MAPPING_3_row_set_mapping_5_SHAMT, TILE_ROW_SET_MAPPING_3_row_set_mapping_5_MASK + +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_6_ADDR32 23 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_6_SHAMT 12 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_6_MASK 0x3000 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_6_RMW TILE_ROW_SET_MAPPING_3_row_set_mapping_6_ADDR32, TILE_ROW_SET_MAPPING_3_row_set_mapping_6_SHAMT, TILE_ROW_SET_MAPPING_3_row_set_mapping_6_MASK + +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_7_ADDR32 23 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_7_SHAMT 14 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_7_MASK 0xc000 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_7_RMW TILE_ROW_SET_MAPPING_3_row_set_mapping_7_ADDR32, TILE_ROW_SET_MAPPING_3_row_set_mapping_7_SHAMT, TILE_ROW_SET_MAPPING_3_row_set_mapping_7_MASK + +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_8_ADDR32 23 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_8_SHAMT 16 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_8_MASK 0x30000 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_8_RMW TILE_ROW_SET_MAPPING_3_row_set_mapping_8_ADDR32, TILE_ROW_SET_MAPPING_3_row_set_mapping_8_SHAMT, TILE_ROW_SET_MAPPING_3_row_set_mapping_8_MASK + +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_9_ADDR32 23 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_9_SHAMT 18 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_9_MASK 0xc0000 +#define 
TILE_ROW_SET_MAPPING_3_row_set_mapping_9_RMW TILE_ROW_SET_MAPPING_3_row_set_mapping_9_ADDR32, TILE_ROW_SET_MAPPING_3_row_set_mapping_9_SHAMT, TILE_ROW_SET_MAPPING_3_row_set_mapping_9_MASK + +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_10_ADDR32 23 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_10_SHAMT 20 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_10_MASK 0x300000 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_10_RMW TILE_ROW_SET_MAPPING_3_row_set_mapping_10_ADDR32, TILE_ROW_SET_MAPPING_3_row_set_mapping_10_SHAMT, TILE_ROW_SET_MAPPING_3_row_set_mapping_10_MASK + +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_11_ADDR32 23 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_11_SHAMT 22 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_11_MASK 0xc00000 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_11_RMW TILE_ROW_SET_MAPPING_3_row_set_mapping_11_ADDR32, TILE_ROW_SET_MAPPING_3_row_set_mapping_11_SHAMT, TILE_ROW_SET_MAPPING_3_row_set_mapping_11_MASK + +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_12_ADDR32 23 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_12_SHAMT 24 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_12_MASK 0x3000000 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_12_RMW TILE_ROW_SET_MAPPING_3_row_set_mapping_12_ADDR32, TILE_ROW_SET_MAPPING_3_row_set_mapping_12_SHAMT, TILE_ROW_SET_MAPPING_3_row_set_mapping_12_MASK + +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_13_ADDR32 23 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_13_SHAMT 26 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_13_MASK 0xc000000 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_13_RMW TILE_ROW_SET_MAPPING_3_row_set_mapping_13_ADDR32, TILE_ROW_SET_MAPPING_3_row_set_mapping_13_SHAMT, TILE_ROW_SET_MAPPING_3_row_set_mapping_13_MASK + +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_14_ADDR32 23 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_14_SHAMT 28 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_14_MASK 0x30000000 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_14_RMW TILE_ROW_SET_MAPPING_3_row_set_mapping_14_ADDR32, TILE_ROW_SET_MAPPING_3_row_set_mapping_14_SHAMT, TILE_ROW_SET_MAPPING_3_row_set_mapping_14_MASK + +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_15_ADDR32 23 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_15_SHAMT 30 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_15_MASK 0xc0000000 +#define TILE_ROW_SET_MAPPING_3_row_set_mapping_15_RMW TILE_ROW_SET_MAPPING_3_row_set_mapping_15_ADDR32, TILE_ROW_SET_MAPPING_3_row_set_mapping_15_SHAMT, TILE_ROW_SET_MAPPING_3_row_set_mapping_15_MASK + +#define PCK_EDGE_OFFSET_SEC0_mask_ADDR32 24 +#define PCK_EDGE_OFFSET_SEC0_mask_SHAMT 0 +#define PCK_EDGE_OFFSET_SEC0_mask_MASK 0xffff +#define PCK_EDGE_OFFSET_SEC0_mask_RMW PCK_EDGE_OFFSET_SEC0_mask_ADDR32, PCK_EDGE_OFFSET_SEC0_mask_SHAMT, PCK_EDGE_OFFSET_SEC0_mask_MASK + +#define PCK_EDGE_MODE_mode_ADDR32 24 +#define PCK_EDGE_MODE_mode_SHAMT 16 +#define PCK_EDGE_MODE_mode_MASK 0x10000 +#define PCK_EDGE_MODE_mode_RMW PCK_EDGE_MODE_mode_ADDR32, PCK_EDGE_MODE_mode_SHAMT, PCK_EDGE_MODE_mode_MASK + +#define PCK_EDGE_TILE_ROW_SET_SELECT_select_ADDR32 24 +#define PCK_EDGE_TILE_ROW_SET_SELECT_select_SHAMT 17 +#define PCK_EDGE_TILE_ROW_SET_SELECT_select_MASK 0x1fe0000 +#define PCK_EDGE_TILE_ROW_SET_SELECT_select_RMW PCK_EDGE_TILE_ROW_SET_SELECT_select_ADDR32, PCK_EDGE_TILE_ROW_SET_SELECT_select_SHAMT, PCK_EDGE_TILE_ROW_SET_SELECT_select_MASK + +#define PCK_EDGE_OFFSET_SEC1_mask_ADDR32 25 +#define PCK_EDGE_OFFSET_SEC1_mask_SHAMT 0 +#define PCK_EDGE_OFFSET_SEC1_mask_MASK 0xffff +#define PCK_EDGE_OFFSET_SEC1_mask_RMW 
PCK_EDGE_OFFSET_SEC1_mask_ADDR32, PCK_EDGE_OFFSET_SEC1_mask_SHAMT, PCK_EDGE_OFFSET_SEC1_mask_MASK + +#define PCK_EDGE_OFFSET_SEC2_mask_ADDR32 26 +#define PCK_EDGE_OFFSET_SEC2_mask_SHAMT 0 +#define PCK_EDGE_OFFSET_SEC2_mask_MASK 0xffff +#define PCK_EDGE_OFFSET_SEC2_mask_RMW PCK_EDGE_OFFSET_SEC2_mask_ADDR32, PCK_EDGE_OFFSET_SEC2_mask_SHAMT, PCK_EDGE_OFFSET_SEC2_mask_MASK + +#define PCK_EDGE_OFFSET_SEC3_mask_ADDR32 27 +#define PCK_EDGE_OFFSET_SEC3_mask_SHAMT 0 +#define PCK_EDGE_OFFSET_SEC3_mask_MASK 0xffff +#define PCK_EDGE_OFFSET_SEC3_mask_RMW PCK_EDGE_OFFSET_SEC3_mask_ADDR32, PCK_EDGE_OFFSET_SEC3_mask_SHAMT, PCK_EDGE_OFFSET_SEC3_mask_MASK + +#define PACK_COUNTERS_SEC0_pack_per_xy_plane_ADDR32 28 +#define PACK_COUNTERS_SEC0_pack_per_xy_plane_SHAMT 0 +#define PACK_COUNTERS_SEC0_pack_per_xy_plane_MASK 0xff +#define PACK_COUNTERS_SEC0_pack_per_xy_plane_RMW PACK_COUNTERS_SEC0_pack_per_xy_plane_ADDR32, PACK_COUNTERS_SEC0_pack_per_xy_plane_SHAMT, PACK_COUNTERS_SEC0_pack_per_xy_plane_MASK + +#define PACK_COUNTERS_SEC0_pack_reads_per_xy_plane_ADDR32 28 +#define PACK_COUNTERS_SEC0_pack_reads_per_xy_plane_SHAMT 8 +#define PACK_COUNTERS_SEC0_pack_reads_per_xy_plane_MASK 0xff00 +#define PACK_COUNTERS_SEC0_pack_reads_per_xy_plane_RMW PACK_COUNTERS_SEC0_pack_reads_per_xy_plane_ADDR32, PACK_COUNTERS_SEC0_pack_reads_per_xy_plane_SHAMT, PACK_COUNTERS_SEC0_pack_reads_per_xy_plane_MASK + +#define PACK_COUNTERS_SEC0_pack_xys_per_tile_ADDR32 28 +#define PACK_COUNTERS_SEC0_pack_xys_per_tile_SHAMT 16 +#define PACK_COUNTERS_SEC0_pack_xys_per_tile_MASK 0x7f0000 +#define PACK_COUNTERS_SEC0_pack_xys_per_tile_RMW PACK_COUNTERS_SEC0_pack_xys_per_tile_ADDR32, PACK_COUNTERS_SEC0_pack_xys_per_tile_SHAMT, PACK_COUNTERS_SEC0_pack_xys_per_tile_MASK + +#define PACK_COUNTERS_SEC0_pack_yz_transposed_ADDR32 28 +#define PACK_COUNTERS_SEC0_pack_yz_transposed_SHAMT 23 +#define PACK_COUNTERS_SEC0_pack_yz_transposed_MASK 0x800000 +#define PACK_COUNTERS_SEC0_pack_yz_transposed_RMW PACK_COUNTERS_SEC0_pack_yz_transposed_ADDR32, PACK_COUNTERS_SEC0_pack_yz_transposed_SHAMT, PACK_COUNTERS_SEC0_pack_yz_transposed_MASK + +#define PACK_COUNTERS_SEC0_auto_ctxt_inc_xys_cnt_ADDR32 28 +#define PACK_COUNTERS_SEC0_auto_ctxt_inc_xys_cnt_SHAMT 24 +#define PACK_COUNTERS_SEC0_auto_ctxt_inc_xys_cnt_MASK 0xff000000 +#define PACK_COUNTERS_SEC0_auto_ctxt_inc_xys_cnt_RMW PACK_COUNTERS_SEC0_auto_ctxt_inc_xys_cnt_ADDR32, PACK_COUNTERS_SEC0_auto_ctxt_inc_xys_cnt_SHAMT, PACK_COUNTERS_SEC0_auto_ctxt_inc_xys_cnt_MASK + +#define PACK_COUNTERS_SEC1_pack_per_xy_plane_ADDR32 29 +#define PACK_COUNTERS_SEC1_pack_per_xy_plane_SHAMT 0 +#define PACK_COUNTERS_SEC1_pack_per_xy_plane_MASK 0xff +#define PACK_COUNTERS_SEC1_pack_per_xy_plane_RMW PACK_COUNTERS_SEC1_pack_per_xy_plane_ADDR32, PACK_COUNTERS_SEC1_pack_per_xy_plane_SHAMT, PACK_COUNTERS_SEC1_pack_per_xy_plane_MASK + +#define PACK_COUNTERS_SEC1_pack_reads_per_xy_plane_ADDR32 29 +#define PACK_COUNTERS_SEC1_pack_reads_per_xy_plane_SHAMT 8 +#define PACK_COUNTERS_SEC1_pack_reads_per_xy_plane_MASK 0xff00 +#define PACK_COUNTERS_SEC1_pack_reads_per_xy_plane_RMW PACK_COUNTERS_SEC1_pack_reads_per_xy_plane_ADDR32, PACK_COUNTERS_SEC1_pack_reads_per_xy_plane_SHAMT, PACK_COUNTERS_SEC1_pack_reads_per_xy_plane_MASK + +#define PACK_COUNTERS_SEC1_pack_xys_per_tile_ADDR32 29 +#define PACK_COUNTERS_SEC1_pack_xys_per_tile_SHAMT 16 +#define PACK_COUNTERS_SEC1_pack_xys_per_tile_MASK 0x7f0000 +#define PACK_COUNTERS_SEC1_pack_xys_per_tile_RMW PACK_COUNTERS_SEC1_pack_xys_per_tile_ADDR32, PACK_COUNTERS_SEC1_pack_xys_per_tile_SHAMT, 
PACK_COUNTERS_SEC1_pack_xys_per_tile_MASK + +#define PACK_COUNTERS_SEC1_pack_yz_transposed_ADDR32 29 +#define PACK_COUNTERS_SEC1_pack_yz_transposed_SHAMT 23 +#define PACK_COUNTERS_SEC1_pack_yz_transposed_MASK 0x800000 +#define PACK_COUNTERS_SEC1_pack_yz_transposed_RMW PACK_COUNTERS_SEC1_pack_yz_transposed_ADDR32, PACK_COUNTERS_SEC1_pack_yz_transposed_SHAMT, PACK_COUNTERS_SEC1_pack_yz_transposed_MASK + +#define PACK_COUNTERS_SEC1_auto_ctxt_inc_xys_cnt_ADDR32 29 +#define PACK_COUNTERS_SEC1_auto_ctxt_inc_xys_cnt_SHAMT 24 +#define PACK_COUNTERS_SEC1_auto_ctxt_inc_xys_cnt_MASK 0xff000000 +#define PACK_COUNTERS_SEC1_auto_ctxt_inc_xys_cnt_RMW PACK_COUNTERS_SEC1_auto_ctxt_inc_xys_cnt_ADDR32, PACK_COUNTERS_SEC1_auto_ctxt_inc_xys_cnt_SHAMT, PACK_COUNTERS_SEC1_auto_ctxt_inc_xys_cnt_MASK + +#define PACK_COUNTERS_SEC2_pack_per_xy_plane_ADDR32 30 +#define PACK_COUNTERS_SEC2_pack_per_xy_plane_SHAMT 0 +#define PACK_COUNTERS_SEC2_pack_per_xy_plane_MASK 0xff +#define PACK_COUNTERS_SEC2_pack_per_xy_plane_RMW PACK_COUNTERS_SEC2_pack_per_xy_plane_ADDR32, PACK_COUNTERS_SEC2_pack_per_xy_plane_SHAMT, PACK_COUNTERS_SEC2_pack_per_xy_plane_MASK + +#define PACK_COUNTERS_SEC2_pack_reads_per_xy_plane_ADDR32 30 +#define PACK_COUNTERS_SEC2_pack_reads_per_xy_plane_SHAMT 8 +#define PACK_COUNTERS_SEC2_pack_reads_per_xy_plane_MASK 0xff00 +#define PACK_COUNTERS_SEC2_pack_reads_per_xy_plane_RMW PACK_COUNTERS_SEC2_pack_reads_per_xy_plane_ADDR32, PACK_COUNTERS_SEC2_pack_reads_per_xy_plane_SHAMT, PACK_COUNTERS_SEC2_pack_reads_per_xy_plane_MASK + +#define PACK_COUNTERS_SEC2_pack_xys_per_tile_ADDR32 30 +#define PACK_COUNTERS_SEC2_pack_xys_per_tile_SHAMT 16 +#define PACK_COUNTERS_SEC2_pack_xys_per_tile_MASK 0x7f0000 +#define PACK_COUNTERS_SEC2_pack_xys_per_tile_RMW PACK_COUNTERS_SEC2_pack_xys_per_tile_ADDR32, PACK_COUNTERS_SEC2_pack_xys_per_tile_SHAMT, PACK_COUNTERS_SEC2_pack_xys_per_tile_MASK + +#define PACK_COUNTERS_SEC2_pack_yz_transposed_ADDR32 30 +#define PACK_COUNTERS_SEC2_pack_yz_transposed_SHAMT 23 +#define PACK_COUNTERS_SEC2_pack_yz_transposed_MASK 0x800000 +#define PACK_COUNTERS_SEC2_pack_yz_transposed_RMW PACK_COUNTERS_SEC2_pack_yz_transposed_ADDR32, PACK_COUNTERS_SEC2_pack_yz_transposed_SHAMT, PACK_COUNTERS_SEC2_pack_yz_transposed_MASK + +#define PACK_COUNTERS_SEC2_auto_ctxt_inc_xys_cnt_ADDR32 30 +#define PACK_COUNTERS_SEC2_auto_ctxt_inc_xys_cnt_SHAMT 24 +#define PACK_COUNTERS_SEC2_auto_ctxt_inc_xys_cnt_MASK 0xff000000 +#define PACK_COUNTERS_SEC2_auto_ctxt_inc_xys_cnt_RMW PACK_COUNTERS_SEC2_auto_ctxt_inc_xys_cnt_ADDR32, PACK_COUNTERS_SEC2_auto_ctxt_inc_xys_cnt_SHAMT, PACK_COUNTERS_SEC2_auto_ctxt_inc_xys_cnt_MASK + +#define PACK_COUNTERS_SEC3_pack_per_xy_plane_ADDR32 31 +#define PACK_COUNTERS_SEC3_pack_per_xy_plane_SHAMT 0 +#define PACK_COUNTERS_SEC3_pack_per_xy_plane_MASK 0xff +#define PACK_COUNTERS_SEC3_pack_per_xy_plane_RMW PACK_COUNTERS_SEC3_pack_per_xy_plane_ADDR32, PACK_COUNTERS_SEC3_pack_per_xy_plane_SHAMT, PACK_COUNTERS_SEC3_pack_per_xy_plane_MASK + +#define PACK_COUNTERS_SEC3_pack_reads_per_xy_plane_ADDR32 31 +#define PACK_COUNTERS_SEC3_pack_reads_per_xy_plane_SHAMT 8 +#define PACK_COUNTERS_SEC3_pack_reads_per_xy_plane_MASK 0xff00 +#define PACK_COUNTERS_SEC3_pack_reads_per_xy_plane_RMW PACK_COUNTERS_SEC3_pack_reads_per_xy_plane_ADDR32, PACK_COUNTERS_SEC3_pack_reads_per_xy_plane_SHAMT, PACK_COUNTERS_SEC3_pack_reads_per_xy_plane_MASK + +#define PACK_COUNTERS_SEC3_pack_xys_per_tile_ADDR32 31 +#define PACK_COUNTERS_SEC3_pack_xys_per_tile_SHAMT 16 +#define PACK_COUNTERS_SEC3_pack_xys_per_tile_MASK 0x7f0000 +#define 
PACK_COUNTERS_SEC3_pack_xys_per_tile_RMW PACK_COUNTERS_SEC3_pack_xys_per_tile_ADDR32, PACK_COUNTERS_SEC3_pack_xys_per_tile_SHAMT, PACK_COUNTERS_SEC3_pack_xys_per_tile_MASK + +#define PACK_COUNTERS_SEC3_pack_yz_transposed_ADDR32 31 +#define PACK_COUNTERS_SEC3_pack_yz_transposed_SHAMT 23 +#define PACK_COUNTERS_SEC3_pack_yz_transposed_MASK 0x800000 +#define PACK_COUNTERS_SEC3_pack_yz_transposed_RMW PACK_COUNTERS_SEC3_pack_yz_transposed_ADDR32, PACK_COUNTERS_SEC3_pack_yz_transposed_SHAMT, PACK_COUNTERS_SEC3_pack_yz_transposed_MASK + +#define PACK_COUNTERS_SEC3_auto_ctxt_inc_xys_cnt_ADDR32 31 +#define PACK_COUNTERS_SEC3_auto_ctxt_inc_xys_cnt_SHAMT 24 +#define PACK_COUNTERS_SEC3_auto_ctxt_inc_xys_cnt_MASK 0xff000000 +#define PACK_COUNTERS_SEC3_auto_ctxt_inc_xys_cnt_RMW PACK_COUNTERS_SEC3_auto_ctxt_inc_xys_cnt_ADDR32, PACK_COUNTERS_SEC3_auto_ctxt_inc_xys_cnt_SHAMT, PACK_COUNTERS_SEC3_auto_ctxt_inc_xys_cnt_MASK + +#define PACK_CONCAT_MASK_SEC0_pack_concat_mask_ADDR32 32 +#define PACK_CONCAT_MASK_SEC0_pack_concat_mask_SHAMT 0 +#define PACK_CONCAT_MASK_SEC0_pack_concat_mask_MASK 0xffff +#define PACK_CONCAT_MASK_SEC0_pack_concat_mask_RMW PACK_CONCAT_MASK_SEC0_pack_concat_mask_ADDR32, PACK_CONCAT_MASK_SEC0_pack_concat_mask_SHAMT, PACK_CONCAT_MASK_SEC0_pack_concat_mask_MASK + +#define PACK_CONCAT_MASK_SEC1_pack_concat_mask_ADDR32 33 +#define PACK_CONCAT_MASK_SEC1_pack_concat_mask_SHAMT 0 +#define PACK_CONCAT_MASK_SEC1_pack_concat_mask_MASK 0xffff +#define PACK_CONCAT_MASK_SEC1_pack_concat_mask_RMW PACK_CONCAT_MASK_SEC1_pack_concat_mask_ADDR32, PACK_CONCAT_MASK_SEC1_pack_concat_mask_SHAMT, PACK_CONCAT_MASK_SEC1_pack_concat_mask_MASK + +#define PACK_CONCAT_MASK_SEC2_pack_concat_mask_ADDR32 34 +#define PACK_CONCAT_MASK_SEC2_pack_concat_mask_SHAMT 0 +#define PACK_CONCAT_MASK_SEC2_pack_concat_mask_MASK 0xffff +#define PACK_CONCAT_MASK_SEC2_pack_concat_mask_RMW PACK_CONCAT_MASK_SEC2_pack_concat_mask_ADDR32, PACK_CONCAT_MASK_SEC2_pack_concat_mask_SHAMT, PACK_CONCAT_MASK_SEC2_pack_concat_mask_MASK + +#define PACK_CONCAT_MASK_SEC3_pack_concat_mask_ADDR32 35 +#define PACK_CONCAT_MASK_SEC3_pack_concat_mask_SHAMT 0 +#define PACK_CONCAT_MASK_SEC3_pack_concat_mask_MASK 0xffff +#define PACK_CONCAT_MASK_SEC3_pack_concat_mask_RMW PACK_CONCAT_MASK_SEC3_pack_concat_mask_ADDR32, PACK_CONCAT_MASK_SEC3_pack_concat_mask_SHAMT, PACK_CONCAT_MASK_SEC3_pack_concat_mask_MASK + +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_0_ADDR32 36 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_0_SHAMT 0 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_0_MASK 0x3 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_0_RMW TILE_FACE_SET_MAPPING_0_face_set_mapping_0_ADDR32, TILE_FACE_SET_MAPPING_0_face_set_mapping_0_SHAMT, TILE_FACE_SET_MAPPING_0_face_set_mapping_0_MASK + +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_1_ADDR32 36 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_1_SHAMT 2 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_1_MASK 0xc +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_1_RMW TILE_FACE_SET_MAPPING_0_face_set_mapping_1_ADDR32, TILE_FACE_SET_MAPPING_0_face_set_mapping_1_SHAMT, TILE_FACE_SET_MAPPING_0_face_set_mapping_1_MASK + +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_2_ADDR32 36 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_2_SHAMT 4 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_2_MASK 0x30 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_2_RMW TILE_FACE_SET_MAPPING_0_face_set_mapping_2_ADDR32, TILE_FACE_SET_MAPPING_0_face_set_mapping_2_SHAMT, TILE_FACE_SET_MAPPING_0_face_set_mapping_2_MASK + 
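+// --------------------------------------------------------------------
+// Illustration only: the TILE_ROW_SET_MAPPING_n words above and the
+// TILE_FACE_SET_MAPPING_n words this group continues with pack sixteen
+// 2-bit mappings into each 32-bit register, so mapping i sits at
+// SHAMT = 2*i with MASK = 0x3 << (2*i).  A hedged sketch of pulling
+// mapping i out of a previously read word:
+//
+//   static inline uint32_t mapping_field(uint32_t word, unsigned i) {
+//       return (word >> (2u * i)) & 0x3u;  // i in [0, 15]
+//   }
+// --------------------------------------------------------------------
+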
+#define TILE_FACE_SET_MAPPING_0_face_set_mapping_3_ADDR32 36 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_3_SHAMT 6 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_3_MASK 0xc0 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_3_RMW TILE_FACE_SET_MAPPING_0_face_set_mapping_3_ADDR32, TILE_FACE_SET_MAPPING_0_face_set_mapping_3_SHAMT, TILE_FACE_SET_MAPPING_0_face_set_mapping_3_MASK + +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_4_ADDR32 36 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_4_SHAMT 8 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_4_MASK 0x300 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_4_RMW TILE_FACE_SET_MAPPING_0_face_set_mapping_4_ADDR32, TILE_FACE_SET_MAPPING_0_face_set_mapping_4_SHAMT, TILE_FACE_SET_MAPPING_0_face_set_mapping_4_MASK + +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_5_ADDR32 36 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_5_SHAMT 10 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_5_MASK 0xc00 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_5_RMW TILE_FACE_SET_MAPPING_0_face_set_mapping_5_ADDR32, TILE_FACE_SET_MAPPING_0_face_set_mapping_5_SHAMT, TILE_FACE_SET_MAPPING_0_face_set_mapping_5_MASK + +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_6_ADDR32 36 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_6_SHAMT 12 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_6_MASK 0x3000 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_6_RMW TILE_FACE_SET_MAPPING_0_face_set_mapping_6_ADDR32, TILE_FACE_SET_MAPPING_0_face_set_mapping_6_SHAMT, TILE_FACE_SET_MAPPING_0_face_set_mapping_6_MASK + +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_7_ADDR32 36 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_7_SHAMT 14 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_7_MASK 0xc000 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_7_RMW TILE_FACE_SET_MAPPING_0_face_set_mapping_7_ADDR32, TILE_FACE_SET_MAPPING_0_face_set_mapping_7_SHAMT, TILE_FACE_SET_MAPPING_0_face_set_mapping_7_MASK + +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_8_ADDR32 36 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_8_SHAMT 16 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_8_MASK 0x30000 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_8_RMW TILE_FACE_SET_MAPPING_0_face_set_mapping_8_ADDR32, TILE_FACE_SET_MAPPING_0_face_set_mapping_8_SHAMT, TILE_FACE_SET_MAPPING_0_face_set_mapping_8_MASK + +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_9_ADDR32 36 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_9_SHAMT 18 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_9_MASK 0xc0000 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_9_RMW TILE_FACE_SET_MAPPING_0_face_set_mapping_9_ADDR32, TILE_FACE_SET_MAPPING_0_face_set_mapping_9_SHAMT, TILE_FACE_SET_MAPPING_0_face_set_mapping_9_MASK + +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_10_ADDR32 36 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_10_SHAMT 20 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_10_MASK 0x300000 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_10_RMW TILE_FACE_SET_MAPPING_0_face_set_mapping_10_ADDR32, TILE_FACE_SET_MAPPING_0_face_set_mapping_10_SHAMT, TILE_FACE_SET_MAPPING_0_face_set_mapping_10_MASK + +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_11_ADDR32 36 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_11_SHAMT 22 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_11_MASK 0xc00000 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_11_RMW TILE_FACE_SET_MAPPING_0_face_set_mapping_11_ADDR32, TILE_FACE_SET_MAPPING_0_face_set_mapping_11_SHAMT, 
TILE_FACE_SET_MAPPING_0_face_set_mapping_11_MASK + +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_12_ADDR32 36 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_12_SHAMT 24 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_12_MASK 0x3000000 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_12_RMW TILE_FACE_SET_MAPPING_0_face_set_mapping_12_ADDR32, TILE_FACE_SET_MAPPING_0_face_set_mapping_12_SHAMT, TILE_FACE_SET_MAPPING_0_face_set_mapping_12_MASK + +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_13_ADDR32 36 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_13_SHAMT 26 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_13_MASK 0xc000000 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_13_RMW TILE_FACE_SET_MAPPING_0_face_set_mapping_13_ADDR32, TILE_FACE_SET_MAPPING_0_face_set_mapping_13_SHAMT, TILE_FACE_SET_MAPPING_0_face_set_mapping_13_MASK + +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_14_ADDR32 36 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_14_SHAMT 28 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_14_MASK 0x30000000 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_14_RMW TILE_FACE_SET_MAPPING_0_face_set_mapping_14_ADDR32, TILE_FACE_SET_MAPPING_0_face_set_mapping_14_SHAMT, TILE_FACE_SET_MAPPING_0_face_set_mapping_14_MASK + +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_15_ADDR32 36 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_15_SHAMT 30 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_15_MASK 0xc0000000 +#define TILE_FACE_SET_MAPPING_0_face_set_mapping_15_RMW TILE_FACE_SET_MAPPING_0_face_set_mapping_15_ADDR32, TILE_FACE_SET_MAPPING_0_face_set_mapping_15_SHAMT, TILE_FACE_SET_MAPPING_0_face_set_mapping_15_MASK + +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_0_ADDR32 37 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_0_SHAMT 0 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_0_MASK 0x3 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_0_RMW TILE_FACE_SET_MAPPING_1_face_set_mapping_0_ADDR32, TILE_FACE_SET_MAPPING_1_face_set_mapping_0_SHAMT, TILE_FACE_SET_MAPPING_1_face_set_mapping_0_MASK + +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_1_ADDR32 37 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_1_SHAMT 2 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_1_MASK 0xc +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_1_RMW TILE_FACE_SET_MAPPING_1_face_set_mapping_1_ADDR32, TILE_FACE_SET_MAPPING_1_face_set_mapping_1_SHAMT, TILE_FACE_SET_MAPPING_1_face_set_mapping_1_MASK + +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_2_ADDR32 37 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_2_SHAMT 4 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_2_MASK 0x30 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_2_RMW TILE_FACE_SET_MAPPING_1_face_set_mapping_2_ADDR32, TILE_FACE_SET_MAPPING_1_face_set_mapping_2_SHAMT, TILE_FACE_SET_MAPPING_1_face_set_mapping_2_MASK + +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_3_ADDR32 37 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_3_SHAMT 6 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_3_MASK 0xc0 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_3_RMW TILE_FACE_SET_MAPPING_1_face_set_mapping_3_ADDR32, TILE_FACE_SET_MAPPING_1_face_set_mapping_3_SHAMT, TILE_FACE_SET_MAPPING_1_face_set_mapping_3_MASK + +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_4_ADDR32 37 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_4_SHAMT 8 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_4_MASK 0x300 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_4_RMW TILE_FACE_SET_MAPPING_1_face_set_mapping_4_ADDR32, 
TILE_FACE_SET_MAPPING_1_face_set_mapping_4_SHAMT, TILE_FACE_SET_MAPPING_1_face_set_mapping_4_MASK + +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_5_ADDR32 37 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_5_SHAMT 10 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_5_MASK 0xc00 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_5_RMW TILE_FACE_SET_MAPPING_1_face_set_mapping_5_ADDR32, TILE_FACE_SET_MAPPING_1_face_set_mapping_5_SHAMT, TILE_FACE_SET_MAPPING_1_face_set_mapping_5_MASK + +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_6_ADDR32 37 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_6_SHAMT 12 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_6_MASK 0x3000 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_6_RMW TILE_FACE_SET_MAPPING_1_face_set_mapping_6_ADDR32, TILE_FACE_SET_MAPPING_1_face_set_mapping_6_SHAMT, TILE_FACE_SET_MAPPING_1_face_set_mapping_6_MASK + +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_7_ADDR32 37 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_7_SHAMT 14 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_7_MASK 0xc000 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_7_RMW TILE_FACE_SET_MAPPING_1_face_set_mapping_7_ADDR32, TILE_FACE_SET_MAPPING_1_face_set_mapping_7_SHAMT, TILE_FACE_SET_MAPPING_1_face_set_mapping_7_MASK + +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_8_ADDR32 37 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_8_SHAMT 16 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_8_MASK 0x30000 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_8_RMW TILE_FACE_SET_MAPPING_1_face_set_mapping_8_ADDR32, TILE_FACE_SET_MAPPING_1_face_set_mapping_8_SHAMT, TILE_FACE_SET_MAPPING_1_face_set_mapping_8_MASK + +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_9_ADDR32 37 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_9_SHAMT 18 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_9_MASK 0xc0000 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_9_RMW TILE_FACE_SET_MAPPING_1_face_set_mapping_9_ADDR32, TILE_FACE_SET_MAPPING_1_face_set_mapping_9_SHAMT, TILE_FACE_SET_MAPPING_1_face_set_mapping_9_MASK + +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_10_ADDR32 37 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_10_SHAMT 20 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_10_MASK 0x300000 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_10_RMW TILE_FACE_SET_MAPPING_1_face_set_mapping_10_ADDR32, TILE_FACE_SET_MAPPING_1_face_set_mapping_10_SHAMT, TILE_FACE_SET_MAPPING_1_face_set_mapping_10_MASK + +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_11_ADDR32 37 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_11_SHAMT 22 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_11_MASK 0xc00000 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_11_RMW TILE_FACE_SET_MAPPING_1_face_set_mapping_11_ADDR32, TILE_FACE_SET_MAPPING_1_face_set_mapping_11_SHAMT, TILE_FACE_SET_MAPPING_1_face_set_mapping_11_MASK + +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_12_ADDR32 37 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_12_SHAMT 24 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_12_MASK 0x3000000 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_12_RMW TILE_FACE_SET_MAPPING_1_face_set_mapping_12_ADDR32, TILE_FACE_SET_MAPPING_1_face_set_mapping_12_SHAMT, TILE_FACE_SET_MAPPING_1_face_set_mapping_12_MASK + +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_13_ADDR32 37 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_13_SHAMT 26 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_13_MASK 0xc000000 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_13_RMW 
TILE_FACE_SET_MAPPING_1_face_set_mapping_13_ADDR32, TILE_FACE_SET_MAPPING_1_face_set_mapping_13_SHAMT, TILE_FACE_SET_MAPPING_1_face_set_mapping_13_MASK + +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_14_ADDR32 37 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_14_SHAMT 28 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_14_MASK 0x30000000 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_14_RMW TILE_FACE_SET_MAPPING_1_face_set_mapping_14_ADDR32, TILE_FACE_SET_MAPPING_1_face_set_mapping_14_SHAMT, TILE_FACE_SET_MAPPING_1_face_set_mapping_14_MASK + +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_15_ADDR32 37 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_15_SHAMT 30 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_15_MASK 0xc0000000 +#define TILE_FACE_SET_MAPPING_1_face_set_mapping_15_RMW TILE_FACE_SET_MAPPING_1_face_set_mapping_15_ADDR32, TILE_FACE_SET_MAPPING_1_face_set_mapping_15_SHAMT, TILE_FACE_SET_MAPPING_1_face_set_mapping_15_MASK + +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_0_ADDR32 38 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_0_SHAMT 0 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_0_MASK 0x3 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_0_RMW TILE_FACE_SET_MAPPING_2_face_set_mapping_0_ADDR32, TILE_FACE_SET_MAPPING_2_face_set_mapping_0_SHAMT, TILE_FACE_SET_MAPPING_2_face_set_mapping_0_MASK + +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_1_ADDR32 38 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_1_SHAMT 2 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_1_MASK 0xc +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_1_RMW TILE_FACE_SET_MAPPING_2_face_set_mapping_1_ADDR32, TILE_FACE_SET_MAPPING_2_face_set_mapping_1_SHAMT, TILE_FACE_SET_MAPPING_2_face_set_mapping_1_MASK + +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_2_ADDR32 38 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_2_SHAMT 4 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_2_MASK 0x30 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_2_RMW TILE_FACE_SET_MAPPING_2_face_set_mapping_2_ADDR32, TILE_FACE_SET_MAPPING_2_face_set_mapping_2_SHAMT, TILE_FACE_SET_MAPPING_2_face_set_mapping_2_MASK + +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_3_ADDR32 38 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_3_SHAMT 6 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_3_MASK 0xc0 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_3_RMW TILE_FACE_SET_MAPPING_2_face_set_mapping_3_ADDR32, TILE_FACE_SET_MAPPING_2_face_set_mapping_3_SHAMT, TILE_FACE_SET_MAPPING_2_face_set_mapping_3_MASK + +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_4_ADDR32 38 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_4_SHAMT 8 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_4_MASK 0x300 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_4_RMW TILE_FACE_SET_MAPPING_2_face_set_mapping_4_ADDR32, TILE_FACE_SET_MAPPING_2_face_set_mapping_4_SHAMT, TILE_FACE_SET_MAPPING_2_face_set_mapping_4_MASK + +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_5_ADDR32 38 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_5_SHAMT 10 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_5_MASK 0xc00 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_5_RMW TILE_FACE_SET_MAPPING_2_face_set_mapping_5_ADDR32, TILE_FACE_SET_MAPPING_2_face_set_mapping_5_SHAMT, TILE_FACE_SET_MAPPING_2_face_set_mapping_5_MASK + +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_6_ADDR32 38 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_6_SHAMT 12 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_6_MASK 0x3000 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_6_RMW 
TILE_FACE_SET_MAPPING_2_face_set_mapping_6_ADDR32, TILE_FACE_SET_MAPPING_2_face_set_mapping_6_SHAMT, TILE_FACE_SET_MAPPING_2_face_set_mapping_6_MASK + +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_7_ADDR32 38 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_7_SHAMT 14 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_7_MASK 0xc000 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_7_RMW TILE_FACE_SET_MAPPING_2_face_set_mapping_7_ADDR32, TILE_FACE_SET_MAPPING_2_face_set_mapping_7_SHAMT, TILE_FACE_SET_MAPPING_2_face_set_mapping_7_MASK + +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_8_ADDR32 38 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_8_SHAMT 16 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_8_MASK 0x30000 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_8_RMW TILE_FACE_SET_MAPPING_2_face_set_mapping_8_ADDR32, TILE_FACE_SET_MAPPING_2_face_set_mapping_8_SHAMT, TILE_FACE_SET_MAPPING_2_face_set_mapping_8_MASK + +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_9_ADDR32 38 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_9_SHAMT 18 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_9_MASK 0xc0000 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_9_RMW TILE_FACE_SET_MAPPING_2_face_set_mapping_9_ADDR32, TILE_FACE_SET_MAPPING_2_face_set_mapping_9_SHAMT, TILE_FACE_SET_MAPPING_2_face_set_mapping_9_MASK + +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_10_ADDR32 38 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_10_SHAMT 20 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_10_MASK 0x300000 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_10_RMW TILE_FACE_SET_MAPPING_2_face_set_mapping_10_ADDR32, TILE_FACE_SET_MAPPING_2_face_set_mapping_10_SHAMT, TILE_FACE_SET_MAPPING_2_face_set_mapping_10_MASK + +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_11_ADDR32 38 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_11_SHAMT 22 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_11_MASK 0xc00000 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_11_RMW TILE_FACE_SET_MAPPING_2_face_set_mapping_11_ADDR32, TILE_FACE_SET_MAPPING_2_face_set_mapping_11_SHAMT, TILE_FACE_SET_MAPPING_2_face_set_mapping_11_MASK + +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_12_ADDR32 38 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_12_SHAMT 24 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_12_MASK 0x3000000 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_12_RMW TILE_FACE_SET_MAPPING_2_face_set_mapping_12_ADDR32, TILE_FACE_SET_MAPPING_2_face_set_mapping_12_SHAMT, TILE_FACE_SET_MAPPING_2_face_set_mapping_12_MASK + +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_13_ADDR32 38 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_13_SHAMT 26 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_13_MASK 0xc000000 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_13_RMW TILE_FACE_SET_MAPPING_2_face_set_mapping_13_ADDR32, TILE_FACE_SET_MAPPING_2_face_set_mapping_13_SHAMT, TILE_FACE_SET_MAPPING_2_face_set_mapping_13_MASK + +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_14_ADDR32 38 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_14_SHAMT 28 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_14_MASK 0x30000000 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_14_RMW TILE_FACE_SET_MAPPING_2_face_set_mapping_14_ADDR32, TILE_FACE_SET_MAPPING_2_face_set_mapping_14_SHAMT, TILE_FACE_SET_MAPPING_2_face_set_mapping_14_MASK + +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_15_ADDR32 38 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_15_SHAMT 30 +#define TILE_FACE_SET_MAPPING_2_face_set_mapping_15_MASK 0xc0000000 
+#define TILE_FACE_SET_MAPPING_2_face_set_mapping_15_RMW TILE_FACE_SET_MAPPING_2_face_set_mapping_15_ADDR32, TILE_FACE_SET_MAPPING_2_face_set_mapping_15_SHAMT, TILE_FACE_SET_MAPPING_2_face_set_mapping_15_MASK + +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_0_ADDR32 39 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_0_SHAMT 0 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_0_MASK 0x3 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_0_RMW TILE_FACE_SET_MAPPING_3_face_set_mapping_0_ADDR32, TILE_FACE_SET_MAPPING_3_face_set_mapping_0_SHAMT, TILE_FACE_SET_MAPPING_3_face_set_mapping_0_MASK + +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_1_ADDR32 39 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_1_SHAMT 2 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_1_MASK 0xc +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_1_RMW TILE_FACE_SET_MAPPING_3_face_set_mapping_1_ADDR32, TILE_FACE_SET_MAPPING_3_face_set_mapping_1_SHAMT, TILE_FACE_SET_MAPPING_3_face_set_mapping_1_MASK + +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_2_ADDR32 39 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_2_SHAMT 4 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_2_MASK 0x30 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_2_RMW TILE_FACE_SET_MAPPING_3_face_set_mapping_2_ADDR32, TILE_FACE_SET_MAPPING_3_face_set_mapping_2_SHAMT, TILE_FACE_SET_MAPPING_3_face_set_mapping_2_MASK + +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_3_ADDR32 39 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_3_SHAMT 6 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_3_MASK 0xc0 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_3_RMW TILE_FACE_SET_MAPPING_3_face_set_mapping_3_ADDR32, TILE_FACE_SET_MAPPING_3_face_set_mapping_3_SHAMT, TILE_FACE_SET_MAPPING_3_face_set_mapping_3_MASK + +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_4_ADDR32 39 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_4_SHAMT 8 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_4_MASK 0x300 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_4_RMW TILE_FACE_SET_MAPPING_3_face_set_mapping_4_ADDR32, TILE_FACE_SET_MAPPING_3_face_set_mapping_4_SHAMT, TILE_FACE_SET_MAPPING_3_face_set_mapping_4_MASK + +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_5_ADDR32 39 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_5_SHAMT 10 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_5_MASK 0xc00 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_5_RMW TILE_FACE_SET_MAPPING_3_face_set_mapping_5_ADDR32, TILE_FACE_SET_MAPPING_3_face_set_mapping_5_SHAMT, TILE_FACE_SET_MAPPING_3_face_set_mapping_5_MASK + +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_6_ADDR32 39 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_6_SHAMT 12 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_6_MASK 0x3000 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_6_RMW TILE_FACE_SET_MAPPING_3_face_set_mapping_6_ADDR32, TILE_FACE_SET_MAPPING_3_face_set_mapping_6_SHAMT, TILE_FACE_SET_MAPPING_3_face_set_mapping_6_MASK + +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_7_ADDR32 39 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_7_SHAMT 14 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_7_MASK 0xc000 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_7_RMW TILE_FACE_SET_MAPPING_3_face_set_mapping_7_ADDR32, TILE_FACE_SET_MAPPING_3_face_set_mapping_7_SHAMT, TILE_FACE_SET_MAPPING_3_face_set_mapping_7_MASK + +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_8_ADDR32 39 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_8_SHAMT 16 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_8_MASK 0x30000 +#define 
TILE_FACE_SET_MAPPING_3_face_set_mapping_8_RMW TILE_FACE_SET_MAPPING_3_face_set_mapping_8_ADDR32, TILE_FACE_SET_MAPPING_3_face_set_mapping_8_SHAMT, TILE_FACE_SET_MAPPING_3_face_set_mapping_8_MASK + +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_9_ADDR32 39 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_9_SHAMT 18 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_9_MASK 0xc0000 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_9_RMW TILE_FACE_SET_MAPPING_3_face_set_mapping_9_ADDR32, TILE_FACE_SET_MAPPING_3_face_set_mapping_9_SHAMT, TILE_FACE_SET_MAPPING_3_face_set_mapping_9_MASK + +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_10_ADDR32 39 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_10_SHAMT 20 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_10_MASK 0x300000 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_10_RMW TILE_FACE_SET_MAPPING_3_face_set_mapping_10_ADDR32, TILE_FACE_SET_MAPPING_3_face_set_mapping_10_SHAMT, TILE_FACE_SET_MAPPING_3_face_set_mapping_10_MASK + +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_11_ADDR32 39 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_11_SHAMT 22 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_11_MASK 0xc00000 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_11_RMW TILE_FACE_SET_MAPPING_3_face_set_mapping_11_ADDR32, TILE_FACE_SET_MAPPING_3_face_set_mapping_11_SHAMT, TILE_FACE_SET_MAPPING_3_face_set_mapping_11_MASK + +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_12_ADDR32 39 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_12_SHAMT 24 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_12_MASK 0x3000000 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_12_RMW TILE_FACE_SET_MAPPING_3_face_set_mapping_12_ADDR32, TILE_FACE_SET_MAPPING_3_face_set_mapping_12_SHAMT, TILE_FACE_SET_MAPPING_3_face_set_mapping_12_MASK + +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_13_ADDR32 39 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_13_SHAMT 26 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_13_MASK 0xc000000 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_13_RMW TILE_FACE_SET_MAPPING_3_face_set_mapping_13_ADDR32, TILE_FACE_SET_MAPPING_3_face_set_mapping_13_SHAMT, TILE_FACE_SET_MAPPING_3_face_set_mapping_13_MASK + +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_14_ADDR32 39 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_14_SHAMT 28 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_14_MASK 0x30000000 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_14_RMW TILE_FACE_SET_MAPPING_3_face_set_mapping_14_ADDR32, TILE_FACE_SET_MAPPING_3_face_set_mapping_14_SHAMT, TILE_FACE_SET_MAPPING_3_face_set_mapping_14_MASK + +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_15_ADDR32 39 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_15_SHAMT 30 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_15_MASK 0xc0000000 +#define TILE_FACE_SET_MAPPING_3_face_set_mapping_15_RMW TILE_FACE_SET_MAPPING_3_face_set_mapping_15_ADDR32, TILE_FACE_SET_MAPPING_3_face_set_mapping_15_SHAMT, TILE_FACE_SET_MAPPING_3_face_set_mapping_15_MASK + +#define PACK_GLOBAL_CFG_CTL_pack_disable_fast_tile_end_drain_ADDR32 40 +#define PACK_GLOBAL_CFG_CTL_pack_disable_fast_tile_end_drain_SHAMT 0 +#define PACK_GLOBAL_CFG_CTL_pack_disable_fast_tile_end_drain_MASK 0x1 +#define PACK_GLOBAL_CFG_CTL_pack_disable_fast_tile_end_drain_RMW PACK_GLOBAL_CFG_CTL_pack_disable_fast_tile_end_drain_ADDR32, PACK_GLOBAL_CFG_CTL_pack_disable_fast_tile_end_drain_SHAMT, PACK_GLOBAL_CFG_CTL_pack_disable_fast_tile_end_drain_MASK + +//////////////////////////////////////////////////////////////////////// +// 
Registers for UNPACK0 +//////////////////////////////////////////////////////////////////////// + +#define UNPACK0_CFGREG_BASE_ADDR32 44 + +#define UNP0_ADDR_CTRL_XY_REG_0_Xstride_ADDR32 44 +#define UNP0_ADDR_CTRL_XY_REG_0_Xstride_SHAMT 0 +#define UNP0_ADDR_CTRL_XY_REG_0_Xstride_MASK 0xffff +#define UNP0_ADDR_CTRL_XY_REG_0_Xstride_RMW UNP0_ADDR_CTRL_XY_REG_0_Xstride_ADDR32, UNP0_ADDR_CTRL_XY_REG_0_Xstride_SHAMT, UNP0_ADDR_CTRL_XY_REG_0_Xstride_MASK + +#define UNP0_ADDR_CTRL_XY_REG_0_Ystride_ADDR32 44 +#define UNP0_ADDR_CTRL_XY_REG_0_Ystride_SHAMT 16 +#define UNP0_ADDR_CTRL_XY_REG_0_Ystride_MASK 0xffff0000 +#define UNP0_ADDR_CTRL_XY_REG_0_Ystride_RMW UNP0_ADDR_CTRL_XY_REG_0_Ystride_ADDR32, UNP0_ADDR_CTRL_XY_REG_0_Ystride_SHAMT, UNP0_ADDR_CTRL_XY_REG_0_Ystride_MASK + +#define UNP0_ADDR_CTRL_ZW_REG_0_Zstride_ADDR32 45 +#define UNP0_ADDR_CTRL_ZW_REG_0_Zstride_SHAMT 0 +#define UNP0_ADDR_CTRL_ZW_REG_0_Zstride_MASK 0xffff +#define UNP0_ADDR_CTRL_ZW_REG_0_Zstride_RMW UNP0_ADDR_CTRL_ZW_REG_0_Zstride_ADDR32, UNP0_ADDR_CTRL_ZW_REG_0_Zstride_SHAMT, UNP0_ADDR_CTRL_ZW_REG_0_Zstride_MASK + +#define UNP0_ADDR_CTRL_ZW_REG_0_Wstride_ADDR32 45 +#define UNP0_ADDR_CTRL_ZW_REG_0_Wstride_SHAMT 16 +#define UNP0_ADDR_CTRL_ZW_REG_0_Wstride_MASK 0xffff0000 +#define UNP0_ADDR_CTRL_ZW_REG_0_Wstride_RMW UNP0_ADDR_CTRL_ZW_REG_0_Wstride_ADDR32, UNP0_ADDR_CTRL_ZW_REG_0_Wstride_SHAMT, UNP0_ADDR_CTRL_ZW_REG_0_Wstride_MASK + +#define UNP1_ADDR_CTRL_XY_REG_0_Xstride_ADDR32 46 +#define UNP1_ADDR_CTRL_XY_REG_0_Xstride_SHAMT 0 +#define UNP1_ADDR_CTRL_XY_REG_0_Xstride_MASK 0xffff +#define UNP1_ADDR_CTRL_XY_REG_0_Xstride_RMW UNP1_ADDR_CTRL_XY_REG_0_Xstride_ADDR32, UNP1_ADDR_CTRL_XY_REG_0_Xstride_SHAMT, UNP1_ADDR_CTRL_XY_REG_0_Xstride_MASK + +#define UNP1_ADDR_CTRL_XY_REG_0_Ystride_ADDR32 46 +#define UNP1_ADDR_CTRL_XY_REG_0_Ystride_SHAMT 16 +#define UNP1_ADDR_CTRL_XY_REG_0_Ystride_MASK 0xffff0000 +#define UNP1_ADDR_CTRL_XY_REG_0_Ystride_RMW UNP1_ADDR_CTRL_XY_REG_0_Ystride_ADDR32, UNP1_ADDR_CTRL_XY_REG_0_Ystride_SHAMT, UNP1_ADDR_CTRL_XY_REG_0_Ystride_MASK + +#define UNP1_ADDR_CTRL_ZW_REG_0_Zstride_ADDR32 47 +#define UNP1_ADDR_CTRL_ZW_REG_0_Zstride_SHAMT 0 +#define UNP1_ADDR_CTRL_ZW_REG_0_Zstride_MASK 0xffff +#define UNP1_ADDR_CTRL_ZW_REG_0_Zstride_RMW UNP1_ADDR_CTRL_ZW_REG_0_Zstride_ADDR32, UNP1_ADDR_CTRL_ZW_REG_0_Zstride_SHAMT, UNP1_ADDR_CTRL_ZW_REG_0_Zstride_MASK + +#define UNP1_ADDR_CTRL_ZW_REG_0_Wstride_ADDR32 47 +#define UNP1_ADDR_CTRL_ZW_REG_0_Wstride_SHAMT 16 +#define UNP1_ADDR_CTRL_ZW_REG_0_Wstride_MASK 0xffff0000 +#define UNP1_ADDR_CTRL_ZW_REG_0_Wstride_RMW UNP1_ADDR_CTRL_ZW_REG_0_Wstride_ADDR32, UNP1_ADDR_CTRL_ZW_REG_0_Wstride_SHAMT, UNP1_ADDR_CTRL_ZW_REG_0_Wstride_MASK + +#define UNP0_ADDR_BASE_REG_0_Base_ADDR32 48 +#define UNP0_ADDR_BASE_REG_0_Base_SHAMT 0 +#define UNP0_ADDR_BASE_REG_0_Base_MASK 0x3ffff +#define UNP0_ADDR_BASE_REG_0_Base_RMW UNP0_ADDR_BASE_REG_0_Base_ADDR32, UNP0_ADDR_BASE_REG_0_Base_SHAMT, UNP0_ADDR_BASE_REG_0_Base_MASK + +#define UNP0_ADDR_BASE_REG_1_Base_ADDR32 49 +#define UNP0_ADDR_BASE_REG_1_Base_SHAMT 0 +#define UNP0_ADDR_BASE_REG_1_Base_MASK 0x3ffff +#define UNP0_ADDR_BASE_REG_1_Base_RMW UNP0_ADDR_BASE_REG_1_Base_ADDR32, UNP0_ADDR_BASE_REG_1_Base_SHAMT, UNP0_ADDR_BASE_REG_1_Base_MASK + +#define UNP0_FORCED_SHARED_EXP_shared_exp_ADDR32 50 +#define UNP0_FORCED_SHARED_EXP_shared_exp_SHAMT 0 +#define UNP0_FORCED_SHARED_EXP_shared_exp_MASK 0xff +#define UNP0_FORCED_SHARED_EXP_shared_exp_RMW UNP0_FORCED_SHARED_EXP_shared_exp_ADDR32, UNP0_FORCED_SHARED_EXP_shared_exp_SHAMT, 
UNP0_FORCED_SHARED_EXP_shared_exp_MASK + +#define UNP0_ADD_DEST_ADDR_CNTR_add_dest_addr_cntr_ADDR32 50 +#define UNP0_ADD_DEST_ADDR_CNTR_add_dest_addr_cntr_SHAMT 8 +#define UNP0_ADD_DEST_ADDR_CNTR_add_dest_addr_cntr_MASK 0x100 +#define UNP0_ADD_DEST_ADDR_CNTR_add_dest_addr_cntr_RMW UNP0_ADD_DEST_ADDR_CNTR_add_dest_addr_cntr_ADDR32, UNP0_ADD_DEST_ADDR_CNTR_add_dest_addr_cntr_SHAMT, UNP0_ADD_DEST_ADDR_CNTR_add_dest_addr_cntr_MASK + +#define UNP0_BLOBS_Y_START_CNTX_01_blobs_y_start_ADDR32 51 +#define UNP0_BLOBS_Y_START_CNTX_01_blobs_y_start_SHAMT 0 +#define UNP0_BLOBS_Y_START_CNTX_01_blobs_y_start_MASK 0xffffffff +#define UNP0_BLOBS_Y_START_CNTX_01_blobs_y_start_RMW UNP0_BLOBS_Y_START_CNTX_01_blobs_y_start_ADDR32, UNP0_BLOBS_Y_START_CNTX_01_blobs_y_start_SHAMT, UNP0_BLOBS_Y_START_CNTX_01_blobs_y_start_MASK + +#define UNP0_BLOBS_Y_START_CNTX_23_blobs_y_start_ADDR32 52 +#define UNP0_BLOBS_Y_START_CNTX_23_blobs_y_start_SHAMT 0 +#define UNP0_BLOBS_Y_START_CNTX_23_blobs_y_start_MASK 0xffffffff +#define UNP0_BLOBS_Y_START_CNTX_23_blobs_y_start_RMW UNP0_BLOBS_Y_START_CNTX_23_blobs_y_start_ADDR32, UNP0_BLOBS_Y_START_CNTX_23_blobs_y_start_SHAMT, UNP0_BLOBS_Y_START_CNTX_23_blobs_y_start_MASK + +#define UNP0_NOP_REG_CLR_VAL_nop_reg_clr_val_ADDR32 53 +#define UNP0_NOP_REG_CLR_VAL_nop_reg_clr_val_SHAMT 0 +#define UNP0_NOP_REG_CLR_VAL_nop_reg_clr_val_MASK 0xffffffff +#define UNP0_NOP_REG_CLR_VAL_nop_reg_clr_val_RMW UNP0_NOP_REG_CLR_VAL_nop_reg_clr_val_ADDR32, UNP0_NOP_REG_CLR_VAL_nop_reg_clr_val_SHAMT, UNP0_NOP_REG_CLR_VAL_nop_reg_clr_val_MASK + +//////////////////////////////////////////////////////////////////////// +// Registers for UNPACK1 +//////////////////////////////////////////////////////////////////////// + +#define UNPACK1_CFGREG_BASE_ADDR32 56 + +#define UNP0_ADDR_CTRL_XY_REG_1_Xstride_ADDR32 56 +#define UNP0_ADDR_CTRL_XY_REG_1_Xstride_SHAMT 0 +#define UNP0_ADDR_CTRL_XY_REG_1_Xstride_MASK 0xffff +#define UNP0_ADDR_CTRL_XY_REG_1_Xstride_RMW UNP0_ADDR_CTRL_XY_REG_1_Xstride_ADDR32, UNP0_ADDR_CTRL_XY_REG_1_Xstride_SHAMT, UNP0_ADDR_CTRL_XY_REG_1_Xstride_MASK + +#define UNP0_ADDR_CTRL_XY_REG_1_Ystride_ADDR32 56 +#define UNP0_ADDR_CTRL_XY_REG_1_Ystride_SHAMT 16 +#define UNP0_ADDR_CTRL_XY_REG_1_Ystride_MASK 0xffff0000 +#define UNP0_ADDR_CTRL_XY_REG_1_Ystride_RMW UNP0_ADDR_CTRL_XY_REG_1_Ystride_ADDR32, UNP0_ADDR_CTRL_XY_REG_1_Ystride_SHAMT, UNP0_ADDR_CTRL_XY_REG_1_Ystride_MASK + +#define UNP0_ADDR_CTRL_ZW_REG_1_Zstride_ADDR32 57 +#define UNP0_ADDR_CTRL_ZW_REG_1_Zstride_SHAMT 0 +#define UNP0_ADDR_CTRL_ZW_REG_1_Zstride_MASK 0xffff +#define UNP0_ADDR_CTRL_ZW_REG_1_Zstride_RMW UNP0_ADDR_CTRL_ZW_REG_1_Zstride_ADDR32, UNP0_ADDR_CTRL_ZW_REG_1_Zstride_SHAMT, UNP0_ADDR_CTRL_ZW_REG_1_Zstride_MASK + +#define UNP0_ADDR_CTRL_ZW_REG_1_Wstride_ADDR32 57 +#define UNP0_ADDR_CTRL_ZW_REG_1_Wstride_SHAMT 16 +#define UNP0_ADDR_CTRL_ZW_REG_1_Wstride_MASK 0xffff0000 +#define UNP0_ADDR_CTRL_ZW_REG_1_Wstride_RMW UNP0_ADDR_CTRL_ZW_REG_1_Wstride_ADDR32, UNP0_ADDR_CTRL_ZW_REG_1_Wstride_SHAMT, UNP0_ADDR_CTRL_ZW_REG_1_Wstride_MASK + +#define UNP1_ADDR_CTRL_XY_REG_1_Xstride_ADDR32 58 +#define UNP1_ADDR_CTRL_XY_REG_1_Xstride_SHAMT 0 +#define UNP1_ADDR_CTRL_XY_REG_1_Xstride_MASK 0xffff +#define UNP1_ADDR_CTRL_XY_REG_1_Xstride_RMW UNP1_ADDR_CTRL_XY_REG_1_Xstride_ADDR32, UNP1_ADDR_CTRL_XY_REG_1_Xstride_SHAMT, UNP1_ADDR_CTRL_XY_REG_1_Xstride_MASK + +#define UNP1_ADDR_CTRL_XY_REG_1_Ystride_ADDR32 58 +#define UNP1_ADDR_CTRL_XY_REG_1_Ystride_SHAMT 16 +#define UNP1_ADDR_CTRL_XY_REG_1_Ystride_MASK 0xffff0000 +#define 
UNP1_ADDR_CTRL_XY_REG_1_Ystride_RMW UNP1_ADDR_CTRL_XY_REG_1_Ystride_ADDR32, UNP1_ADDR_CTRL_XY_REG_1_Ystride_SHAMT, UNP1_ADDR_CTRL_XY_REG_1_Ystride_MASK + +#define UNP1_ADDR_CTRL_ZW_REG_1_Zstride_ADDR32 59 +#define UNP1_ADDR_CTRL_ZW_REG_1_Zstride_SHAMT 0 +#define UNP1_ADDR_CTRL_ZW_REG_1_Zstride_MASK 0xffff +#define UNP1_ADDR_CTRL_ZW_REG_1_Zstride_RMW UNP1_ADDR_CTRL_ZW_REG_1_Zstride_ADDR32, UNP1_ADDR_CTRL_ZW_REG_1_Zstride_SHAMT, UNP1_ADDR_CTRL_ZW_REG_1_Zstride_MASK + +#define UNP1_ADDR_CTRL_ZW_REG_1_Wstride_ADDR32 59 +#define UNP1_ADDR_CTRL_ZW_REG_1_Wstride_SHAMT 16 +#define UNP1_ADDR_CTRL_ZW_REG_1_Wstride_MASK 0xffff0000 +#define UNP1_ADDR_CTRL_ZW_REG_1_Wstride_RMW UNP1_ADDR_CTRL_ZW_REG_1_Wstride_ADDR32, UNP1_ADDR_CTRL_ZW_REG_1_Wstride_SHAMT, UNP1_ADDR_CTRL_ZW_REG_1_Wstride_MASK + +#define UNP1_ADDR_BASE_REG_0_Base_ADDR32 60 +#define UNP1_ADDR_BASE_REG_0_Base_SHAMT 0 +#define UNP1_ADDR_BASE_REG_0_Base_MASK 0x3ffff +#define UNP1_ADDR_BASE_REG_0_Base_RMW UNP1_ADDR_BASE_REG_0_Base_ADDR32, UNP1_ADDR_BASE_REG_0_Base_SHAMT, UNP1_ADDR_BASE_REG_0_Base_MASK + +#define UNP1_ADDR_BASE_REG_1_Base_ADDR32 61 +#define UNP1_ADDR_BASE_REG_1_Base_SHAMT 0 +#define UNP1_ADDR_BASE_REG_1_Base_MASK 0x3ffff +#define UNP1_ADDR_BASE_REG_1_Base_RMW UNP1_ADDR_BASE_REG_1_Base_ADDR32, UNP1_ADDR_BASE_REG_1_Base_SHAMT, UNP1_ADDR_BASE_REG_1_Base_MASK + +#define UNP1_FORCED_SHARED_EXP_shared_exp_ADDR32 62 +#define UNP1_FORCED_SHARED_EXP_shared_exp_SHAMT 0 +#define UNP1_FORCED_SHARED_EXP_shared_exp_MASK 0xff +#define UNP1_FORCED_SHARED_EXP_shared_exp_RMW UNP1_FORCED_SHARED_EXP_shared_exp_ADDR32, UNP1_FORCED_SHARED_EXP_shared_exp_SHAMT, UNP1_FORCED_SHARED_EXP_shared_exp_MASK + +#define UNP1_ADD_DEST_ADDR_CNTR_add_dest_addr_cntr_ADDR32 62 +#define UNP1_ADD_DEST_ADDR_CNTR_add_dest_addr_cntr_SHAMT 8 +#define UNP1_ADD_DEST_ADDR_CNTR_add_dest_addr_cntr_MASK 0x100 +#define UNP1_ADD_DEST_ADDR_CNTR_add_dest_addr_cntr_RMW UNP1_ADD_DEST_ADDR_CNTR_add_dest_addr_cntr_ADDR32, UNP1_ADD_DEST_ADDR_CNTR_add_dest_addr_cntr_SHAMT, UNP1_ADD_DEST_ADDR_CNTR_add_dest_addr_cntr_MASK + +#define UNP1_NOP_REG_CLR_VAL_nop_reg_clr_val_ADDR32 63 +#define UNP1_NOP_REG_CLR_VAL_nop_reg_clr_val_SHAMT 0 +#define UNP1_NOP_REG_CLR_VAL_nop_reg_clr_val_MASK 0xffffffff +#define UNP1_NOP_REG_CLR_VAL_nop_reg_clr_val_RMW UNP1_NOP_REG_CLR_VAL_nop_reg_clr_val_ADDR32, UNP1_NOP_REG_CLR_VAL_nop_reg_clr_val_SHAMT, UNP1_NOP_REG_CLR_VAL_nop_reg_clr_val_MASK + +//////////////////////////////////////////////////////////////////////// +// Registers for THCON +//////////////////////////////////////////////////////////////////////// + +#define THCON_CFGREG_BASE_ADDR32 64 + +#define THCON_SEC0_REG0_TileDescriptor_ADDR32 64 +#define THCON_SEC0_REG0_TileDescriptor_SHAMT 0 +#define THCON_SEC0_REG0_TileDescriptor_MASK 0xffffffffffffffffffffffffffffffff +#define THCON_SEC0_REG0_TileDescriptor_RMW THCON_SEC0_REG0_TileDescriptor_ADDR32, THCON_SEC0_REG0_TileDescriptor_SHAMT, THCON_SEC0_REG0_TileDescriptor_MASK + +#define THCON_SEC0_REG1_Row_start_section_size_ADDR32 68 +#define THCON_SEC0_REG1_Row_start_section_size_SHAMT 0 +#define THCON_SEC0_REG1_Row_start_section_size_MASK 0xffff +#define THCON_SEC0_REG1_Row_start_section_size_RMW THCON_SEC0_REG1_Row_start_section_size_ADDR32, THCON_SEC0_REG1_Row_start_section_size_SHAMT, THCON_SEC0_REG1_Row_start_section_size_MASK + +#define THCON_SEC0_REG1_Exp_section_size_ADDR32 68 +#define THCON_SEC0_REG1_Exp_section_size_SHAMT 16 +#define THCON_SEC0_REG1_Exp_section_size_MASK 0xffff0000 +#define THCON_SEC0_REG1_Exp_section_size_RMW 
THCON_SEC0_REG1_Exp_section_size_ADDR32, THCON_SEC0_REG1_Exp_section_size_SHAMT, THCON_SEC0_REG1_Exp_section_size_MASK + +#define THCON_SEC0_REG1_L1_Dest_addr_ADDR32 69 +#define THCON_SEC0_REG1_L1_Dest_addr_SHAMT 0 +#define THCON_SEC0_REG1_L1_Dest_addr_MASK 0xffffffff +#define THCON_SEC0_REG1_L1_Dest_addr_RMW THCON_SEC0_REG1_L1_Dest_addr_ADDR32, THCON_SEC0_REG1_L1_Dest_addr_SHAMT, THCON_SEC0_REG1_L1_Dest_addr_MASK + +#define THCON_SEC0_REG1_Disable_zero_compress_ADDR32 70 +#define THCON_SEC0_REG1_Disable_zero_compress_SHAMT 0 +#define THCON_SEC0_REG1_Disable_zero_compress_MASK 0x1 +#define THCON_SEC0_REG1_Disable_zero_compress_RMW THCON_SEC0_REG1_Disable_zero_compress_ADDR32, THCON_SEC0_REG1_Disable_zero_compress_SHAMT, THCON_SEC0_REG1_Disable_zero_compress_MASK + +#define THCON_SEC0_REG1_Add_l1_dest_addr_offset_ADDR32 70 +#define THCON_SEC0_REG1_Add_l1_dest_addr_offset_SHAMT 1 +#define THCON_SEC0_REG1_Add_l1_dest_addr_offset_MASK 0x2 +#define THCON_SEC0_REG1_Add_l1_dest_addr_offset_RMW THCON_SEC0_REG1_Add_l1_dest_addr_offset_ADDR32, THCON_SEC0_REG1_Add_l1_dest_addr_offset_SHAMT, THCON_SEC0_REG1_Add_l1_dest_addr_offset_MASK + +#define THCON_SEC0_REG1_Disable_pack_zero_flags_ADDR32 70 +#define THCON_SEC0_REG1_Disable_pack_zero_flags_SHAMT 2 +#define THCON_SEC0_REG1_Disable_pack_zero_flags_MASK 0x4 +#define THCON_SEC0_REG1_Disable_pack_zero_flags_RMW THCON_SEC0_REG1_Disable_pack_zero_flags_ADDR32, THCON_SEC0_REG1_Disable_pack_zero_flags_SHAMT, THCON_SEC0_REG1_Disable_pack_zero_flags_MASK + +#define THCON_SEC0_REG1_ovrd_default_throttle_mode_ADDR32 70 +#define THCON_SEC0_REG1_ovrd_default_throttle_mode_SHAMT 3 +#define THCON_SEC0_REG1_ovrd_default_throttle_mode_MASK 0x8 +#define THCON_SEC0_REG1_ovrd_default_throttle_mode_RMW THCON_SEC0_REG1_ovrd_default_throttle_mode_ADDR32, THCON_SEC0_REG1_ovrd_default_throttle_mode_SHAMT, THCON_SEC0_REG1_ovrd_default_throttle_mode_MASK + +#define THCON_SEC0_REG1_Out_data_format_ADDR32 70 +#define THCON_SEC0_REG1_Out_data_format_SHAMT 4 +#define THCON_SEC0_REG1_Out_data_format_MASK 0xf0 +#define THCON_SEC0_REG1_Out_data_format_RMW THCON_SEC0_REG1_Out_data_format_ADDR32, THCON_SEC0_REG1_Out_data_format_SHAMT, THCON_SEC0_REG1_Out_data_format_MASK + +#define THCON_SEC0_REG1_In_data_format_ADDR32 70 +#define THCON_SEC0_REG1_In_data_format_SHAMT 8 +#define THCON_SEC0_REG1_In_data_format_MASK 0xf00 +#define THCON_SEC0_REG1_In_data_format_RMW THCON_SEC0_REG1_In_data_format_ADDR32, THCON_SEC0_REG1_In_data_format_SHAMT, THCON_SEC0_REG1_In_data_format_MASK + +#define THCON_SEC0_REG1_Dis_shared_exp_assembler_ADDR32 70 +#define THCON_SEC0_REG1_Dis_shared_exp_assembler_SHAMT 12 +#define THCON_SEC0_REG1_Dis_shared_exp_assembler_MASK 0x1000 +#define THCON_SEC0_REG1_Dis_shared_exp_assembler_RMW THCON_SEC0_REG1_Dis_shared_exp_assembler_ADDR32, THCON_SEC0_REG1_Dis_shared_exp_assembler_SHAMT, THCON_SEC0_REG1_Dis_shared_exp_assembler_MASK + +#define THCON_SEC0_REG1_Auto_set_last_pacr_intf_sel_ADDR32 70 +#define THCON_SEC0_REG1_Auto_set_last_pacr_intf_sel_SHAMT 13 +#define THCON_SEC0_REG1_Auto_set_last_pacr_intf_sel_MASK 0x2000 +#define THCON_SEC0_REG1_Auto_set_last_pacr_intf_sel_RMW THCON_SEC0_REG1_Auto_set_last_pacr_intf_sel_ADDR32, THCON_SEC0_REG1_Auto_set_last_pacr_intf_sel_SHAMT, THCON_SEC0_REG1_Auto_set_last_pacr_intf_sel_MASK + +#define THCON_SEC0_REG1_Enable_out_fifo_ADDR32 70 +#define THCON_SEC0_REG1_Enable_out_fifo_SHAMT 14 +#define THCON_SEC0_REG1_Enable_out_fifo_MASK 0x4000 +#define THCON_SEC0_REG1_Enable_out_fifo_RMW THCON_SEC0_REG1_Enable_out_fifo_ADDR32, 
THCON_SEC0_REG1_Enable_out_fifo_SHAMT, THCON_SEC0_REG1_Enable_out_fifo_MASK + +#define THCON_SEC0_REG1_Sub_l1_tile_header_size_ADDR32 70 +#define THCON_SEC0_REG1_Sub_l1_tile_header_size_SHAMT 15 +#define THCON_SEC0_REG1_Sub_l1_tile_header_size_MASK 0x8000 +#define THCON_SEC0_REG1_Sub_l1_tile_header_size_RMW THCON_SEC0_REG1_Sub_l1_tile_header_size_ADDR32, THCON_SEC0_REG1_Sub_l1_tile_header_size_SHAMT, THCON_SEC0_REG1_Sub_l1_tile_header_size_MASK + +#define THCON_SEC0_REG1_Source_interface_selection_ADDR32 70 +#define THCON_SEC0_REG1_Source_interface_selection_SHAMT 16 +#define THCON_SEC0_REG1_Source_interface_selection_MASK 0x10000 +#define THCON_SEC0_REG1_Source_interface_selection_RMW THCON_SEC0_REG1_Source_interface_selection_ADDR32, THCON_SEC0_REG1_Source_interface_selection_SHAMT, THCON_SEC0_REG1_Source_interface_selection_MASK + +#define THCON_SEC0_REG1_pack_start_intf_pos_ADDR32 70 +#define THCON_SEC0_REG1_pack_start_intf_pos_SHAMT 17 +#define THCON_SEC0_REG1_pack_start_intf_pos_MASK 0x1e0000 +#define THCON_SEC0_REG1_pack_start_intf_pos_RMW THCON_SEC0_REG1_pack_start_intf_pos_ADDR32, THCON_SEC0_REG1_pack_start_intf_pos_SHAMT, THCON_SEC0_REG1_pack_start_intf_pos_MASK + +#define THCON_SEC0_REG1_All_pack_disable_zero_compress_ovrd_ADDR32 70 +#define THCON_SEC0_REG1_All_pack_disable_zero_compress_ovrd_SHAMT 21 +#define THCON_SEC0_REG1_All_pack_disable_zero_compress_ovrd_MASK 0x200000 +#define THCON_SEC0_REG1_All_pack_disable_zero_compress_ovrd_RMW THCON_SEC0_REG1_All_pack_disable_zero_compress_ovrd_ADDR32, THCON_SEC0_REG1_All_pack_disable_zero_compress_ovrd_SHAMT, THCON_SEC0_REG1_All_pack_disable_zero_compress_ovrd_MASK + +#define THCON_SEC0_REG1_Add_tile_header_size_ADDR32 70 +#define THCON_SEC0_REG1_Add_tile_header_size_SHAMT 22 +#define THCON_SEC0_REG1_Add_tile_header_size_MASK 0x400000 +#define THCON_SEC0_REG1_Add_tile_header_size_RMW THCON_SEC0_REG1_Add_tile_header_size_ADDR32, THCON_SEC0_REG1_Add_tile_header_size_SHAMT, THCON_SEC0_REG1_Add_tile_header_size_MASK + +#define THCON_SEC0_REG1_pack_dis_y_pos_start_offset_ADDR32 70 +#define THCON_SEC0_REG1_pack_dis_y_pos_start_offset_SHAMT 23 +#define THCON_SEC0_REG1_pack_dis_y_pos_start_offset_MASK 0x800000 +#define THCON_SEC0_REG1_pack_dis_y_pos_start_offset_RMW THCON_SEC0_REG1_pack_dis_y_pos_start_offset_ADDR32, THCON_SEC0_REG1_pack_dis_y_pos_start_offset_SHAMT, THCON_SEC0_REG1_pack_dis_y_pos_start_offset_MASK + +#define THCON_SEC0_REG1_L1_source_addr_ADDR32 70 +#define THCON_SEC0_REG1_L1_source_addr_SHAMT 24 +#define THCON_SEC0_REG1_L1_source_addr_MASK 0xff000000 +#define THCON_SEC0_REG1_L1_source_addr_RMW THCON_SEC0_REG1_L1_source_addr_ADDR32, THCON_SEC0_REG1_L1_source_addr_SHAMT, THCON_SEC0_REG1_L1_source_addr_MASK + +#define THCON_SEC0_REG1_Downsample_mask_ADDR32 71 +#define THCON_SEC0_REG1_Downsample_mask_SHAMT 0 +#define THCON_SEC0_REG1_Downsample_mask_MASK 0xffff +#define THCON_SEC0_REG1_Downsample_mask_RMW THCON_SEC0_REG1_Downsample_mask_ADDR32, THCON_SEC0_REG1_Downsample_mask_SHAMT, THCON_SEC0_REG1_Downsample_mask_MASK + +#define THCON_SEC0_REG1_Downsample_rate_ADDR32 71 +#define THCON_SEC0_REG1_Downsample_rate_SHAMT 16 +#define THCON_SEC0_REG1_Downsample_rate_MASK 0x70000 +#define THCON_SEC0_REG1_Downsample_rate_RMW THCON_SEC0_REG1_Downsample_rate_ADDR32, THCON_SEC0_REG1_Downsample_rate_SHAMT, THCON_SEC0_REG1_Downsample_rate_MASK + +#define THCON_SEC0_REG1_Pack_L1_Acc_ADDR32 71 +#define THCON_SEC0_REG1_Pack_L1_Acc_SHAMT 19 +#define THCON_SEC0_REG1_Pack_L1_Acc_MASK 0x80000 +#define THCON_SEC0_REG1_Pack_L1_Acc_RMW 
THCON_SEC0_REG1_Pack_L1_Acc_ADDR32, THCON_SEC0_REG1_Pack_L1_Acc_SHAMT, THCON_SEC0_REG1_Pack_L1_Acc_MASK + +#define THCON_SEC0_REG1_Exp_threshold_en_ADDR32 71 +#define THCON_SEC0_REG1_Exp_threshold_en_SHAMT 20 +#define THCON_SEC0_REG1_Exp_threshold_en_MASK 0x100000 +#define THCON_SEC0_REG1_Exp_threshold_en_RMW THCON_SEC0_REG1_Exp_threshold_en_ADDR32, THCON_SEC0_REG1_Exp_threshold_en_SHAMT, THCON_SEC0_REG1_Exp_threshold_en_MASK + +#define THCON_SEC0_REG1_Unp_LF8_4b_exp_ADDR32 71 +#define THCON_SEC0_REG1_Unp_LF8_4b_exp_SHAMT 22 +#define THCON_SEC0_REG1_Unp_LF8_4b_exp_MASK 0x400000 +#define THCON_SEC0_REG1_Unp_LF8_4b_exp_RMW THCON_SEC0_REG1_Unp_LF8_4b_exp_ADDR32, THCON_SEC0_REG1_Unp_LF8_4b_exp_SHAMT, THCON_SEC0_REG1_Unp_LF8_4b_exp_MASK + +#define THCON_SEC0_REG1_Pac_LF8_4b_exp_ADDR32 71 +#define THCON_SEC0_REG1_Pac_LF8_4b_exp_SHAMT 23 +#define THCON_SEC0_REG1_Pac_LF8_4b_exp_MASK 0x800000 +#define THCON_SEC0_REG1_Pac_LF8_4b_exp_RMW THCON_SEC0_REG1_Pac_LF8_4b_exp_ADDR32, THCON_SEC0_REG1_Pac_LF8_4b_exp_SHAMT, THCON_SEC0_REG1_Pac_LF8_4b_exp_MASK + +#define THCON_SEC0_REG1_Exp_threshold_ADDR32 71 +#define THCON_SEC0_REG1_Exp_threshold_SHAMT 24 +#define THCON_SEC0_REG1_Exp_threshold_MASK 0xff000000 +#define THCON_SEC0_REG1_Exp_threshold_RMW THCON_SEC0_REG1_Exp_threshold_ADDR32, THCON_SEC0_REG1_Exp_threshold_SHAMT, THCON_SEC0_REG1_Exp_threshold_MASK + +#define THCON_SEC0_REG2_Out_data_format_ADDR32 72 +#define THCON_SEC0_REG2_Out_data_format_SHAMT 0 +#define THCON_SEC0_REG2_Out_data_format_MASK 0xf +#define THCON_SEC0_REG2_Out_data_format_RMW THCON_SEC0_REG2_Out_data_format_ADDR32, THCON_SEC0_REG2_Out_data_format_SHAMT, THCON_SEC0_REG2_Out_data_format_MASK + +#define THCON_SEC0_REG2_Throttle_mode_ADDR32 72 +#define THCON_SEC0_REG2_Throttle_mode_SHAMT 4 +#define THCON_SEC0_REG2_Throttle_mode_MASK 0x30 +#define THCON_SEC0_REG2_Throttle_mode_RMW THCON_SEC0_REG2_Throttle_mode_ADDR32, THCON_SEC0_REG2_Throttle_mode_SHAMT, THCON_SEC0_REG2_Throttle_mode_MASK + +#define THCON_SEC0_REG2_Context_count_ADDR32 72 +#define THCON_SEC0_REG2_Context_count_SHAMT 6 +#define THCON_SEC0_REG2_Context_count_MASK 0xc0 +#define THCON_SEC0_REG2_Context_count_RMW THCON_SEC0_REG2_Context_count_ADDR32, THCON_SEC0_REG2_Context_count_SHAMT, THCON_SEC0_REG2_Context_count_MASK + +#define THCON_SEC0_REG2_Haloize_mode_ADDR32 72 +#define THCON_SEC0_REG2_Haloize_mode_SHAMT 8 +#define THCON_SEC0_REG2_Haloize_mode_MASK 0x100 +#define THCON_SEC0_REG2_Haloize_mode_RMW THCON_SEC0_REG2_Haloize_mode_ADDR32, THCON_SEC0_REG2_Haloize_mode_SHAMT, THCON_SEC0_REG2_Haloize_mode_MASK + +#define THCON_SEC0_REG2_Tileize_mode_ADDR32 72 +#define THCON_SEC0_REG2_Tileize_mode_SHAMT 9 +#define THCON_SEC0_REG2_Tileize_mode_MASK 0x200 +#define THCON_SEC0_REG2_Tileize_mode_RMW THCON_SEC0_REG2_Tileize_mode_ADDR32, THCON_SEC0_REG2_Tileize_mode_SHAMT, THCON_SEC0_REG2_Tileize_mode_MASK + +#define THCON_SEC0_REG2_Unpack_Src_Reg_Set_Upd_ADDR32 72 +#define THCON_SEC0_REG2_Unpack_Src_Reg_Set_Upd_SHAMT 10 +#define THCON_SEC0_REG2_Unpack_Src_Reg_Set_Upd_MASK 0x400 +#define THCON_SEC0_REG2_Unpack_Src_Reg_Set_Upd_RMW THCON_SEC0_REG2_Unpack_Src_Reg_Set_Upd_ADDR32, THCON_SEC0_REG2_Unpack_Src_Reg_Set_Upd_SHAMT, THCON_SEC0_REG2_Unpack_Src_Reg_Set_Upd_MASK + +#define THCON_SEC0_REG2_Unpack_If_Sel_ADDR32 72 +#define THCON_SEC0_REG2_Unpack_If_Sel_SHAMT 11 +#define THCON_SEC0_REG2_Unpack_If_Sel_MASK 0x800 +#define THCON_SEC0_REG2_Unpack_If_Sel_RMW THCON_SEC0_REG2_Unpack_If_Sel_ADDR32, THCON_SEC0_REG2_Unpack_If_Sel_SHAMT, THCON_SEC0_REG2_Unpack_If_Sel_MASK + +#define 
THCON_SEC0_REG2_Upsample_rate_ADDR32 72 +#define THCON_SEC0_REG2_Upsample_rate_SHAMT 12 +#define THCON_SEC0_REG2_Upsample_rate_MASK 0x3000 +#define THCON_SEC0_REG2_Upsample_rate_RMW THCON_SEC0_REG2_Upsample_rate_ADDR32, THCON_SEC0_REG2_Upsample_rate_SHAMT, THCON_SEC0_REG2_Upsample_rate_MASK + +#define THCON_SEC0_REG2_Ovrd_data_format_ADDR32 72 +#define THCON_SEC0_REG2_Ovrd_data_format_SHAMT 14 +#define THCON_SEC0_REG2_Ovrd_data_format_MASK 0x4000 +#define THCON_SEC0_REG2_Ovrd_data_format_RMW THCON_SEC0_REG2_Ovrd_data_format_ADDR32, THCON_SEC0_REG2_Ovrd_data_format_SHAMT, THCON_SEC0_REG2_Ovrd_data_format_MASK + +#define THCON_SEC0_REG2_Upsample_and_interleave_ADDR32 72 +#define THCON_SEC0_REG2_Upsample_and_interleave_SHAMT 15 +#define THCON_SEC0_REG2_Upsample_and_interleave_MASK 0x8000 +#define THCON_SEC0_REG2_Upsample_and_interleave_RMW THCON_SEC0_REG2_Upsample_and_interleave_ADDR32, THCON_SEC0_REG2_Upsample_and_interleave_SHAMT, THCON_SEC0_REG2_Upsample_and_interleave_MASK + +#define THCON_SEC0_REG2_Shift_amount_cntx0_ADDR32 72 +#define THCON_SEC0_REG2_Shift_amount_cntx0_SHAMT 16 +#define THCON_SEC0_REG2_Shift_amount_cntx0_MASK 0xf0000 +#define THCON_SEC0_REG2_Shift_amount_cntx0_RMW THCON_SEC0_REG2_Shift_amount_cntx0_ADDR32, THCON_SEC0_REG2_Shift_amount_cntx0_SHAMT, THCON_SEC0_REG2_Shift_amount_cntx0_MASK + +#define THCON_SEC0_REG2_Shift_amount_cntx1_ADDR32 72 +#define THCON_SEC0_REG2_Shift_amount_cntx1_SHAMT 20 +#define THCON_SEC0_REG2_Shift_amount_cntx1_MASK 0xf00000 +#define THCON_SEC0_REG2_Shift_amount_cntx1_RMW THCON_SEC0_REG2_Shift_amount_cntx1_ADDR32, THCON_SEC0_REG2_Shift_amount_cntx1_SHAMT, THCON_SEC0_REG2_Shift_amount_cntx1_MASK + +#define THCON_SEC0_REG2_Shift_amount_cntx2_ADDR32 72 +#define THCON_SEC0_REG2_Shift_amount_cntx2_SHAMT 24 +#define THCON_SEC0_REG2_Shift_amount_cntx2_MASK 0xf000000 +#define THCON_SEC0_REG2_Shift_amount_cntx2_RMW THCON_SEC0_REG2_Shift_amount_cntx2_ADDR32, THCON_SEC0_REG2_Shift_amount_cntx2_SHAMT, THCON_SEC0_REG2_Shift_amount_cntx2_MASK + +#define THCON_SEC0_REG2_Shift_amount_cntx3_ADDR32 72 +#define THCON_SEC0_REG2_Shift_amount_cntx3_SHAMT 28 +#define THCON_SEC0_REG2_Shift_amount_cntx3_MASK 0xf0000000 +#define THCON_SEC0_REG2_Shift_amount_cntx3_RMW THCON_SEC0_REG2_Shift_amount_cntx3_ADDR32, THCON_SEC0_REG2_Shift_amount_cntx3_SHAMT, THCON_SEC0_REG2_Shift_amount_cntx3_MASK + +#define THCON_SEC0_REG2_Disable_zero_compress_cntx0_ADDR32 73 +#define THCON_SEC0_REG2_Disable_zero_compress_cntx0_SHAMT 0 +#define THCON_SEC0_REG2_Disable_zero_compress_cntx0_MASK 0x1 +#define THCON_SEC0_REG2_Disable_zero_compress_cntx0_RMW THCON_SEC0_REG2_Disable_zero_compress_cntx0_ADDR32, THCON_SEC0_REG2_Disable_zero_compress_cntx0_SHAMT, THCON_SEC0_REG2_Disable_zero_compress_cntx0_MASK + +#define THCON_SEC0_REG2_Disable_zero_compress_cntx1_ADDR32 73 +#define THCON_SEC0_REG2_Disable_zero_compress_cntx1_SHAMT 1 +#define THCON_SEC0_REG2_Disable_zero_compress_cntx1_MASK 0x2 +#define THCON_SEC0_REG2_Disable_zero_compress_cntx1_RMW THCON_SEC0_REG2_Disable_zero_compress_cntx1_ADDR32, THCON_SEC0_REG2_Disable_zero_compress_cntx1_SHAMT, THCON_SEC0_REG2_Disable_zero_compress_cntx1_MASK + +#define THCON_SEC0_REG2_Disable_zero_compress_cntx2_ADDR32 73 +#define THCON_SEC0_REG2_Disable_zero_compress_cntx2_SHAMT 2 +#define THCON_SEC0_REG2_Disable_zero_compress_cntx2_MASK 0x4 +#define THCON_SEC0_REG2_Disable_zero_compress_cntx2_RMW THCON_SEC0_REG2_Disable_zero_compress_cntx2_ADDR32, THCON_SEC0_REG2_Disable_zero_compress_cntx2_SHAMT, THCON_SEC0_REG2_Disable_zero_compress_cntx2_MASK + +#define 
THCON_SEC0_REG2_Disable_zero_compress_cntx3_ADDR32 73 +#define THCON_SEC0_REG2_Disable_zero_compress_cntx3_SHAMT 3 +#define THCON_SEC0_REG2_Disable_zero_compress_cntx3_MASK 0x8 +#define THCON_SEC0_REG2_Disable_zero_compress_cntx3_RMW THCON_SEC0_REG2_Disable_zero_compress_cntx3_ADDR32, THCON_SEC0_REG2_Disable_zero_compress_cntx3_SHAMT, THCON_SEC0_REG2_Disable_zero_compress_cntx3_MASK + +#define THCON_SEC0_REG2_Unpack_if_sel_cntx0_ADDR32 73 +#define THCON_SEC0_REG2_Unpack_if_sel_cntx0_SHAMT 4 +#define THCON_SEC0_REG2_Unpack_if_sel_cntx0_MASK 0x10 +#define THCON_SEC0_REG2_Unpack_if_sel_cntx0_RMW THCON_SEC0_REG2_Unpack_if_sel_cntx0_ADDR32, THCON_SEC0_REG2_Unpack_if_sel_cntx0_SHAMT, THCON_SEC0_REG2_Unpack_if_sel_cntx0_MASK + +#define THCON_SEC0_REG2_Unpack_if_sel_cntx1_ADDR32 73 +#define THCON_SEC0_REG2_Unpack_if_sel_cntx1_SHAMT 5 +#define THCON_SEC0_REG2_Unpack_if_sel_cntx1_MASK 0x20 +#define THCON_SEC0_REG2_Unpack_if_sel_cntx1_RMW THCON_SEC0_REG2_Unpack_if_sel_cntx1_ADDR32, THCON_SEC0_REG2_Unpack_if_sel_cntx1_SHAMT, THCON_SEC0_REG2_Unpack_if_sel_cntx1_MASK + +#define THCON_SEC0_REG2_Unpack_if_sel_cntx2_ADDR32 73 +#define THCON_SEC0_REG2_Unpack_if_sel_cntx2_SHAMT 6 +#define THCON_SEC0_REG2_Unpack_if_sel_cntx2_MASK 0x40 +#define THCON_SEC0_REG2_Unpack_if_sel_cntx2_RMW THCON_SEC0_REG2_Unpack_if_sel_cntx2_ADDR32, THCON_SEC0_REG2_Unpack_if_sel_cntx2_SHAMT, THCON_SEC0_REG2_Unpack_if_sel_cntx2_MASK + +#define THCON_SEC0_REG2_Unpack_if_sel_cntx3_ADDR32 73 +#define THCON_SEC0_REG2_Unpack_if_sel_cntx3_SHAMT 7 +#define THCON_SEC0_REG2_Unpack_if_sel_cntx3_MASK 0x80 +#define THCON_SEC0_REG2_Unpack_if_sel_cntx3_RMW THCON_SEC0_REG2_Unpack_if_sel_cntx3_ADDR32, THCON_SEC0_REG2_Unpack_if_sel_cntx3_SHAMT, THCON_SEC0_REG2_Unpack_if_sel_cntx3_MASK + +#define THCON_SEC0_REG2_Force_shared_exp_ADDR32 73 +#define THCON_SEC0_REG2_Force_shared_exp_SHAMT 8 +#define THCON_SEC0_REG2_Force_shared_exp_MASK 0x100 +#define THCON_SEC0_REG2_Force_shared_exp_RMW THCON_SEC0_REG2_Force_shared_exp_ADDR32, THCON_SEC0_REG2_Force_shared_exp_SHAMT, THCON_SEC0_REG2_Force_shared_exp_MASK + +#define THCON_SEC0_REG2_Context_count_non_log2_ADDR32 73 +#define THCON_SEC0_REG2_Context_count_non_log2_SHAMT 9 +#define THCON_SEC0_REG2_Context_count_non_log2_MASK 0xe00 +#define THCON_SEC0_REG2_Context_count_non_log2_RMW THCON_SEC0_REG2_Context_count_non_log2_ADDR32, THCON_SEC0_REG2_Context_count_non_log2_SHAMT, THCON_SEC0_REG2_Context_count_non_log2_MASK + +#define THCON_SEC0_REG2_Context_count_non_log2_en_ADDR32 73 +#define THCON_SEC0_REG2_Context_count_non_log2_en_SHAMT 12 +#define THCON_SEC0_REG2_Context_count_non_log2_en_MASK 0x1000 +#define THCON_SEC0_REG2_Context_count_non_log2_en_RMW THCON_SEC0_REG2_Context_count_non_log2_en_ADDR32, THCON_SEC0_REG2_Context_count_non_log2_en_SHAMT, THCON_SEC0_REG2_Context_count_non_log2_en_MASK + +#define THCON_SEC0_REG2_Disable_zero_compress_cntx4_ADDR32 73 +#define THCON_SEC0_REG2_Disable_zero_compress_cntx4_SHAMT 16 +#define THCON_SEC0_REG2_Disable_zero_compress_cntx4_MASK 0x10000 +#define THCON_SEC0_REG2_Disable_zero_compress_cntx4_RMW THCON_SEC0_REG2_Disable_zero_compress_cntx4_ADDR32, THCON_SEC0_REG2_Disable_zero_compress_cntx4_SHAMT, THCON_SEC0_REG2_Disable_zero_compress_cntx4_MASK + +#define THCON_SEC0_REG2_Disable_zero_compress_cntx5_ADDR32 73 +#define THCON_SEC0_REG2_Disable_zero_compress_cntx5_SHAMT 17 +#define THCON_SEC0_REG2_Disable_zero_compress_cntx5_MASK 0x20000 +#define THCON_SEC0_REG2_Disable_zero_compress_cntx5_RMW THCON_SEC0_REG2_Disable_zero_compress_cntx5_ADDR32, 
THCON_SEC0_REG2_Disable_zero_compress_cntx5_SHAMT, THCON_SEC0_REG2_Disable_zero_compress_cntx5_MASK + +#define THCON_SEC0_REG2_Disable_zero_compress_cntx6_ADDR32 73 +#define THCON_SEC0_REG2_Disable_zero_compress_cntx6_SHAMT 18 +#define THCON_SEC0_REG2_Disable_zero_compress_cntx6_MASK 0x40000 +#define THCON_SEC0_REG2_Disable_zero_compress_cntx6_RMW THCON_SEC0_REG2_Disable_zero_compress_cntx6_ADDR32, THCON_SEC0_REG2_Disable_zero_compress_cntx6_SHAMT, THCON_SEC0_REG2_Disable_zero_compress_cntx6_MASK + +#define THCON_SEC0_REG2_Disable_zero_compress_cntx7_ADDR32 73 +#define THCON_SEC0_REG2_Disable_zero_compress_cntx7_SHAMT 19 +#define THCON_SEC0_REG2_Disable_zero_compress_cntx7_MASK 0x80000 +#define THCON_SEC0_REG2_Disable_zero_compress_cntx7_RMW THCON_SEC0_REG2_Disable_zero_compress_cntx7_ADDR32, THCON_SEC0_REG2_Disable_zero_compress_cntx7_SHAMT, THCON_SEC0_REG2_Disable_zero_compress_cntx7_MASK + +#define THCON_SEC0_REG2_Unpack_if_sel_cntx4_ADDR32 73 +#define THCON_SEC0_REG2_Unpack_if_sel_cntx4_SHAMT 20 +#define THCON_SEC0_REG2_Unpack_if_sel_cntx4_MASK 0x100000 +#define THCON_SEC0_REG2_Unpack_if_sel_cntx4_RMW THCON_SEC0_REG2_Unpack_if_sel_cntx4_ADDR32, THCON_SEC0_REG2_Unpack_if_sel_cntx4_SHAMT, THCON_SEC0_REG2_Unpack_if_sel_cntx4_MASK + +#define THCON_SEC0_REG2_Unpack_if_sel_cntx5_ADDR32 73 +#define THCON_SEC0_REG2_Unpack_if_sel_cntx5_SHAMT 21 +#define THCON_SEC0_REG2_Unpack_if_sel_cntx5_MASK 0x200000 +#define THCON_SEC0_REG2_Unpack_if_sel_cntx5_RMW THCON_SEC0_REG2_Unpack_if_sel_cntx5_ADDR32, THCON_SEC0_REG2_Unpack_if_sel_cntx5_SHAMT, THCON_SEC0_REG2_Unpack_if_sel_cntx5_MASK + +#define THCON_SEC0_REG2_Unpack_if_sel_cntx6_ADDR32 73 +#define THCON_SEC0_REG2_Unpack_if_sel_cntx6_SHAMT 22 +#define THCON_SEC0_REG2_Unpack_if_sel_cntx6_MASK 0x400000 +#define THCON_SEC0_REG2_Unpack_if_sel_cntx6_RMW THCON_SEC0_REG2_Unpack_if_sel_cntx6_ADDR32, THCON_SEC0_REG2_Unpack_if_sel_cntx6_SHAMT, THCON_SEC0_REG2_Unpack_if_sel_cntx6_MASK + +#define THCON_SEC0_REG2_Unpack_if_sel_cntx7_ADDR32 73 +#define THCON_SEC0_REG2_Unpack_if_sel_cntx7_SHAMT 23 +#define THCON_SEC0_REG2_Unpack_if_sel_cntx7_MASK 0x800000 +#define THCON_SEC0_REG2_Unpack_if_sel_cntx7_RMW THCON_SEC0_REG2_Unpack_if_sel_cntx7_ADDR32, THCON_SEC0_REG2_Unpack_if_sel_cntx7_SHAMT, THCON_SEC0_REG2_Unpack_if_sel_cntx7_MASK + +#define THCON_SEC0_REG2_Metadata_x_end_ADDR32 73 +#define THCON_SEC0_REG2_Metadata_x_end_SHAMT 24 +#define THCON_SEC0_REG2_Metadata_x_end_MASK 0xff000000 +#define THCON_SEC0_REG2_Metadata_x_end_RMW THCON_SEC0_REG2_Metadata_x_end_ADDR32, THCON_SEC0_REG2_Metadata_x_end_SHAMT, THCON_SEC0_REG2_Metadata_x_end_MASK + +#define THCON_SEC0_REG2_Unpack_limit_address_ADDR32 74 +#define THCON_SEC0_REG2_Unpack_limit_address_SHAMT 0 +#define THCON_SEC0_REG2_Unpack_limit_address_MASK 0x1ffff +#define THCON_SEC0_REG2_Unpack_limit_address_RMW THCON_SEC0_REG2_Unpack_limit_address_ADDR32, THCON_SEC0_REG2_Unpack_limit_address_SHAMT, THCON_SEC0_REG2_Unpack_limit_address_MASK + +#define THCON_SEC0_REG2_Unpack_fifo_size_ADDR32 75 +#define THCON_SEC0_REG2_Unpack_fifo_size_SHAMT 0 +#define THCON_SEC0_REG2_Unpack_fifo_size_MASK 0x1ffff +#define THCON_SEC0_REG2_Unpack_fifo_size_RMW THCON_SEC0_REG2_Unpack_fifo_size_ADDR32, THCON_SEC0_REG2_Unpack_fifo_size_SHAMT, THCON_SEC0_REG2_Unpack_fifo_size_MASK + +#define THCON_SEC0_REG3_Base_address_ADDR32 76 +#define THCON_SEC0_REG3_Base_address_SHAMT 0 +#define THCON_SEC0_REG3_Base_address_MASK 0xffffffff +#define THCON_SEC0_REG3_Base_address_RMW THCON_SEC0_REG3_Base_address_ADDR32, THCON_SEC0_REG3_Base_address_SHAMT, 
THCON_SEC0_REG3_Base_address_MASK + +#define THCON_SEC0_REG3_Base_cntx1_address_ADDR32 77 +#define THCON_SEC0_REG3_Base_cntx1_address_SHAMT 0 +#define THCON_SEC0_REG3_Base_cntx1_address_MASK 0xffffffff +#define THCON_SEC0_REG3_Base_cntx1_address_RMW THCON_SEC0_REG3_Base_cntx1_address_ADDR32, THCON_SEC0_REG3_Base_cntx1_address_SHAMT, THCON_SEC0_REG3_Base_cntx1_address_MASK + +#define THCON_SEC0_REG3_Base_cntx2_address_ADDR32 78 +#define THCON_SEC0_REG3_Base_cntx2_address_SHAMT 0 +#define THCON_SEC0_REG3_Base_cntx2_address_MASK 0xffffffff +#define THCON_SEC0_REG3_Base_cntx2_address_RMW THCON_SEC0_REG3_Base_cntx2_address_ADDR32, THCON_SEC0_REG3_Base_cntx2_address_SHAMT, THCON_SEC0_REG3_Base_cntx2_address_MASK + +#define THCON_SEC0_REG3_Base_cntx3_address_ADDR32 79 +#define THCON_SEC0_REG3_Base_cntx3_address_SHAMT 0 +#define THCON_SEC0_REG3_Base_cntx3_address_MASK 0xffffffff +#define THCON_SEC0_REG3_Base_cntx3_address_RMW THCON_SEC0_REG3_Base_cntx3_address_ADDR32, THCON_SEC0_REG3_Base_cntx3_address_SHAMT, THCON_SEC0_REG3_Base_cntx3_address_MASK + +#define THCON_SEC0_REG4_Base_cntx4_address_ADDR32 80 +#define THCON_SEC0_REG4_Base_cntx4_address_SHAMT 0 +#define THCON_SEC0_REG4_Base_cntx4_address_MASK 0xffffffff +#define THCON_SEC0_REG4_Base_cntx4_address_RMW THCON_SEC0_REG4_Base_cntx4_address_ADDR32, THCON_SEC0_REG4_Base_cntx4_address_SHAMT, THCON_SEC0_REG4_Base_cntx4_address_MASK + +#define THCON_SEC0_REG4_Base_cntx5_address_ADDR32 81 +#define THCON_SEC0_REG4_Base_cntx5_address_SHAMT 0 +#define THCON_SEC0_REG4_Base_cntx5_address_MASK 0xffffffff +#define THCON_SEC0_REG4_Base_cntx5_address_RMW THCON_SEC0_REG4_Base_cntx5_address_ADDR32, THCON_SEC0_REG4_Base_cntx5_address_SHAMT, THCON_SEC0_REG4_Base_cntx5_address_MASK + +#define THCON_SEC0_REG4_Base_cntx6_address_ADDR32 82 +#define THCON_SEC0_REG4_Base_cntx6_address_SHAMT 0 +#define THCON_SEC0_REG4_Base_cntx6_address_MASK 0xffffffff +#define THCON_SEC0_REG4_Base_cntx6_address_RMW THCON_SEC0_REG4_Base_cntx6_address_ADDR32, THCON_SEC0_REG4_Base_cntx6_address_SHAMT, THCON_SEC0_REG4_Base_cntx6_address_MASK + +#define THCON_SEC0_REG4_Base_cntx7_address_ADDR32 83 +#define THCON_SEC0_REG4_Base_cntx7_address_SHAMT 0 +#define THCON_SEC0_REG4_Base_cntx7_address_MASK 0xffffffff +#define THCON_SEC0_REG4_Base_cntx7_address_RMW THCON_SEC0_REG4_Base_cntx7_address_ADDR32, THCON_SEC0_REG4_Base_cntx7_address_SHAMT, THCON_SEC0_REG4_Base_cntx7_address_MASK + +#define THCON_SEC0_REG5_Dest_cntx0_address_ADDR32 84 +#define THCON_SEC0_REG5_Dest_cntx0_address_SHAMT 0 +#define THCON_SEC0_REG5_Dest_cntx0_address_MASK 0xffff +#define THCON_SEC0_REG5_Dest_cntx0_address_RMW THCON_SEC0_REG5_Dest_cntx0_address_ADDR32, THCON_SEC0_REG5_Dest_cntx0_address_SHAMT, THCON_SEC0_REG5_Dest_cntx0_address_MASK + +#define THCON_SEC0_REG5_Dest_cntx1_address_ADDR32 84 +#define THCON_SEC0_REG5_Dest_cntx1_address_SHAMT 16 +#define THCON_SEC0_REG5_Dest_cntx1_address_MASK 0xffff0000 +#define THCON_SEC0_REG5_Dest_cntx1_address_RMW THCON_SEC0_REG5_Dest_cntx1_address_ADDR32, THCON_SEC0_REG5_Dest_cntx1_address_SHAMT, THCON_SEC0_REG5_Dest_cntx1_address_MASK + +#define THCON_SEC0_REG5_Dest_cntx2_address_ADDR32 85 +#define THCON_SEC0_REG5_Dest_cntx2_address_SHAMT 0 +#define THCON_SEC0_REG5_Dest_cntx2_address_MASK 0xffff +#define THCON_SEC0_REG5_Dest_cntx2_address_RMW THCON_SEC0_REG5_Dest_cntx2_address_ADDR32, THCON_SEC0_REG5_Dest_cntx2_address_SHAMT, THCON_SEC0_REG5_Dest_cntx2_address_MASK + +#define THCON_SEC0_REG5_Dest_cntx3_address_ADDR32 85 +#define THCON_SEC0_REG5_Dest_cntx3_address_SHAMT 16 +#define 
THCON_SEC0_REG5_Dest_cntx3_address_MASK 0xffff0000 +#define THCON_SEC0_REG5_Dest_cntx3_address_RMW THCON_SEC0_REG5_Dest_cntx3_address_ADDR32, THCON_SEC0_REG5_Dest_cntx3_address_SHAMT, THCON_SEC0_REG5_Dest_cntx3_address_MASK + +#define THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32 86 +#define THCON_SEC0_REG5_Tile_x_dim_cntx0_SHAMT 0 +#define THCON_SEC0_REG5_Tile_x_dim_cntx0_MASK 0xffff +#define THCON_SEC0_REG5_Tile_x_dim_cntx0_RMW THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32, THCON_SEC0_REG5_Tile_x_dim_cntx0_SHAMT, THCON_SEC0_REG5_Tile_x_dim_cntx0_MASK + +#define THCON_SEC0_REG5_Tile_x_dim_cntx1_ADDR32 86 +#define THCON_SEC0_REG5_Tile_x_dim_cntx1_SHAMT 16 +#define THCON_SEC0_REG5_Tile_x_dim_cntx1_MASK 0xffff0000 +#define THCON_SEC0_REG5_Tile_x_dim_cntx1_RMW THCON_SEC0_REG5_Tile_x_dim_cntx1_ADDR32, THCON_SEC0_REG5_Tile_x_dim_cntx1_SHAMT, THCON_SEC0_REG5_Tile_x_dim_cntx1_MASK + +#define THCON_SEC0_REG5_Tile_x_dim_cntx2_ADDR32 87 +#define THCON_SEC0_REG5_Tile_x_dim_cntx2_SHAMT 0 +#define THCON_SEC0_REG5_Tile_x_dim_cntx2_MASK 0xffff +#define THCON_SEC0_REG5_Tile_x_dim_cntx2_RMW THCON_SEC0_REG5_Tile_x_dim_cntx2_ADDR32, THCON_SEC0_REG5_Tile_x_dim_cntx2_SHAMT, THCON_SEC0_REG5_Tile_x_dim_cntx2_MASK + +#define THCON_SEC0_REG5_Tile_x_dim_cntx3_ADDR32 87 +#define THCON_SEC0_REG5_Tile_x_dim_cntx3_SHAMT 16 +#define THCON_SEC0_REG5_Tile_x_dim_cntx3_MASK 0xffff0000 +#define THCON_SEC0_REG5_Tile_x_dim_cntx3_RMW THCON_SEC0_REG5_Tile_x_dim_cntx3_ADDR32, THCON_SEC0_REG5_Tile_x_dim_cntx3_SHAMT, THCON_SEC0_REG5_Tile_x_dim_cntx3_MASK + +#define THCON_SEC0_REG6_Source_address_ADDR32 88 +#define THCON_SEC0_REG6_Source_address_SHAMT 0 +#define THCON_SEC0_REG6_Source_address_MASK 0xffffffff +#define THCON_SEC0_REG6_Source_address_RMW THCON_SEC0_REG6_Source_address_ADDR32, THCON_SEC0_REG6_Source_address_SHAMT, THCON_SEC0_REG6_Source_address_MASK + +#define THCON_SEC0_REG6_Destination_address_ADDR32 89 +#define THCON_SEC0_REG6_Destination_address_SHAMT 0 +#define THCON_SEC0_REG6_Destination_address_MASK 0xffffffff +#define THCON_SEC0_REG6_Destination_address_RMW THCON_SEC0_REG6_Destination_address_ADDR32, THCON_SEC0_REG6_Destination_address_SHAMT, THCON_SEC0_REG6_Destination_address_MASK + +#define THCON_SEC0_REG6_Buffer_size_ADDR32 90 +#define THCON_SEC0_REG6_Buffer_size_SHAMT 0 +#define THCON_SEC0_REG6_Buffer_size_MASK 0x3fffffff +#define THCON_SEC0_REG6_Buffer_size_RMW THCON_SEC0_REG6_Buffer_size_ADDR32, THCON_SEC0_REG6_Buffer_size_SHAMT, THCON_SEC0_REG6_Buffer_size_MASK + +#define THCON_SEC0_REG6_Transfer_direction_ADDR32 90 +#define THCON_SEC0_REG6_Transfer_direction_SHAMT 30 +#define THCON_SEC0_REG6_Transfer_direction_MASK 0xc0000000 +#define THCON_SEC0_REG6_Transfer_direction_RMW THCON_SEC0_REG6_Transfer_direction_ADDR32, THCON_SEC0_REG6_Transfer_direction_SHAMT, THCON_SEC0_REG6_Transfer_direction_MASK + +#define THCON_SEC0_REG6_Metadata_misc_ADDR32 91 +#define THCON_SEC0_REG6_Metadata_misc_SHAMT 0 +#define THCON_SEC0_REG6_Metadata_misc_MASK 0xffffffff +#define THCON_SEC0_REG6_Metadata_misc_RMW THCON_SEC0_REG6_Metadata_misc_ADDR32, THCON_SEC0_REG6_Metadata_misc_SHAMT, THCON_SEC0_REG6_Metadata_misc_MASK + +#define THCON_SEC0_REG7_Offset_address_ADDR32 92 +#define THCON_SEC0_REG7_Offset_address_SHAMT 0 +#define THCON_SEC0_REG7_Offset_address_MASK 0xffff +#define THCON_SEC0_REG7_Offset_address_RMW THCON_SEC0_REG7_Offset_address_ADDR32, THCON_SEC0_REG7_Offset_address_SHAMT, THCON_SEC0_REG7_Offset_address_MASK + +#define THCON_SEC0_REG7_Unpack_data_format_cntx0_ADDR32 92 +#define THCON_SEC0_REG7_Unpack_data_format_cntx0_SHAMT 
16 +#define THCON_SEC0_REG7_Unpack_data_format_cntx0_MASK 0xf0000 +#define THCON_SEC0_REG7_Unpack_data_format_cntx0_RMW THCON_SEC0_REG7_Unpack_data_format_cntx0_ADDR32, THCON_SEC0_REG7_Unpack_data_format_cntx0_SHAMT, THCON_SEC0_REG7_Unpack_data_format_cntx0_MASK + +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx0_ADDR32 92 +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx0_SHAMT 20 +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx0_MASK 0xf00000 +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx0_RMW THCON_SEC0_REG7_Unpack_out_data_format_cntx0_ADDR32, THCON_SEC0_REG7_Unpack_out_data_format_cntx0_SHAMT, THCON_SEC0_REG7_Unpack_out_data_format_cntx0_MASK + +#define THCON_SEC0_REG7_Unpack_data_format_cntx4_ADDR32 92 +#define THCON_SEC0_REG7_Unpack_data_format_cntx4_SHAMT 24 +#define THCON_SEC0_REG7_Unpack_data_format_cntx4_MASK 0xf000000 +#define THCON_SEC0_REG7_Unpack_data_format_cntx4_RMW THCON_SEC0_REG7_Unpack_data_format_cntx4_ADDR32, THCON_SEC0_REG7_Unpack_data_format_cntx4_SHAMT, THCON_SEC0_REG7_Unpack_data_format_cntx4_MASK + +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx4_ADDR32 92 +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx4_SHAMT 28 +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx4_MASK 0xf0000000 +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx4_RMW THCON_SEC0_REG7_Unpack_out_data_format_cntx4_ADDR32, THCON_SEC0_REG7_Unpack_out_data_format_cntx4_SHAMT, THCON_SEC0_REG7_Unpack_out_data_format_cntx4_MASK + +#define THCON_SEC0_REG7_Offset_cntx1_address_ADDR32 93 +#define THCON_SEC0_REG7_Offset_cntx1_address_SHAMT 0 +#define THCON_SEC0_REG7_Offset_cntx1_address_MASK 0xffff +#define THCON_SEC0_REG7_Offset_cntx1_address_RMW THCON_SEC0_REG7_Offset_cntx1_address_ADDR32, THCON_SEC0_REG7_Offset_cntx1_address_SHAMT, THCON_SEC0_REG7_Offset_cntx1_address_MASK + +#define THCON_SEC0_REG7_Unpack_data_format_cntx1_ADDR32 93 +#define THCON_SEC0_REG7_Unpack_data_format_cntx1_SHAMT 16 +#define THCON_SEC0_REG7_Unpack_data_format_cntx1_MASK 0xf0000 +#define THCON_SEC0_REG7_Unpack_data_format_cntx1_RMW THCON_SEC0_REG7_Unpack_data_format_cntx1_ADDR32, THCON_SEC0_REG7_Unpack_data_format_cntx1_SHAMT, THCON_SEC0_REG7_Unpack_data_format_cntx1_MASK + +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx1_ADDR32 93 +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx1_SHAMT 20 +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx1_MASK 0xf00000 +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx1_RMW THCON_SEC0_REG7_Unpack_out_data_format_cntx1_ADDR32, THCON_SEC0_REG7_Unpack_out_data_format_cntx1_SHAMT, THCON_SEC0_REG7_Unpack_out_data_format_cntx1_MASK + +#define THCON_SEC0_REG7_Unpack_data_format_cntx5_ADDR32 93 +#define THCON_SEC0_REG7_Unpack_data_format_cntx5_SHAMT 24 +#define THCON_SEC0_REG7_Unpack_data_format_cntx5_MASK 0xf000000 +#define THCON_SEC0_REG7_Unpack_data_format_cntx5_RMW THCON_SEC0_REG7_Unpack_data_format_cntx5_ADDR32, THCON_SEC0_REG7_Unpack_data_format_cntx5_SHAMT, THCON_SEC0_REG7_Unpack_data_format_cntx5_MASK + +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx5_ADDR32 93 +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx5_SHAMT 28 +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx5_MASK 0xf0000000 +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx5_RMW THCON_SEC0_REG7_Unpack_out_data_format_cntx5_ADDR32, THCON_SEC0_REG7_Unpack_out_data_format_cntx5_SHAMT, THCON_SEC0_REG7_Unpack_out_data_format_cntx5_MASK + +#define THCON_SEC0_REG7_Offset_cntx2_address_ADDR32 94 +#define THCON_SEC0_REG7_Offset_cntx2_address_SHAMT 0 +#define 
THCON_SEC0_REG7_Offset_cntx2_address_MASK 0xffff +#define THCON_SEC0_REG7_Offset_cntx2_address_RMW THCON_SEC0_REG7_Offset_cntx2_address_ADDR32, THCON_SEC0_REG7_Offset_cntx2_address_SHAMT, THCON_SEC0_REG7_Offset_cntx2_address_MASK + +#define THCON_SEC0_REG7_Unpack_data_format_cntx2_ADDR32 94 +#define THCON_SEC0_REG7_Unpack_data_format_cntx2_SHAMT 16 +#define THCON_SEC0_REG7_Unpack_data_format_cntx2_MASK 0xf0000 +#define THCON_SEC0_REG7_Unpack_data_format_cntx2_RMW THCON_SEC0_REG7_Unpack_data_format_cntx2_ADDR32, THCON_SEC0_REG7_Unpack_data_format_cntx2_SHAMT, THCON_SEC0_REG7_Unpack_data_format_cntx2_MASK + +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx2_ADDR32 94 +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx2_SHAMT 20 +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx2_MASK 0xf00000 +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx2_RMW THCON_SEC0_REG7_Unpack_out_data_format_cntx2_ADDR32, THCON_SEC0_REG7_Unpack_out_data_format_cntx2_SHAMT, THCON_SEC0_REG7_Unpack_out_data_format_cntx2_MASK + +#define THCON_SEC0_REG7_Unpack_data_format_cntx6_ADDR32 94 +#define THCON_SEC0_REG7_Unpack_data_format_cntx6_SHAMT 24 +#define THCON_SEC0_REG7_Unpack_data_format_cntx6_MASK 0xf000000 +#define THCON_SEC0_REG7_Unpack_data_format_cntx6_RMW THCON_SEC0_REG7_Unpack_data_format_cntx6_ADDR32, THCON_SEC0_REG7_Unpack_data_format_cntx6_SHAMT, THCON_SEC0_REG7_Unpack_data_format_cntx6_MASK + +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx6_ADDR32 94 +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx6_SHAMT 28 +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx6_MASK 0xf0000000 +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx6_RMW THCON_SEC0_REG7_Unpack_out_data_format_cntx6_ADDR32, THCON_SEC0_REG7_Unpack_out_data_format_cntx6_SHAMT, THCON_SEC0_REG7_Unpack_out_data_format_cntx6_MASK + +#define THCON_SEC0_REG7_Offset_cntx3_address_ADDR32 95 +#define THCON_SEC0_REG7_Offset_cntx3_address_SHAMT 0 +#define THCON_SEC0_REG7_Offset_cntx3_address_MASK 0xffff +#define THCON_SEC0_REG7_Offset_cntx3_address_RMW THCON_SEC0_REG7_Offset_cntx3_address_ADDR32, THCON_SEC0_REG7_Offset_cntx3_address_SHAMT, THCON_SEC0_REG7_Offset_cntx3_address_MASK + +#define THCON_SEC0_REG7_Unpack_data_format_cntx3_ADDR32 95 +#define THCON_SEC0_REG7_Unpack_data_format_cntx3_SHAMT 16 +#define THCON_SEC0_REG7_Unpack_data_format_cntx3_MASK 0xf0000 +#define THCON_SEC0_REG7_Unpack_data_format_cntx3_RMW THCON_SEC0_REG7_Unpack_data_format_cntx3_ADDR32, THCON_SEC0_REG7_Unpack_data_format_cntx3_SHAMT, THCON_SEC0_REG7_Unpack_data_format_cntx3_MASK + +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx3_ADDR32 95 +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx3_SHAMT 20 +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx3_MASK 0xf00000 +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx3_RMW THCON_SEC0_REG7_Unpack_out_data_format_cntx3_ADDR32, THCON_SEC0_REG7_Unpack_out_data_format_cntx3_SHAMT, THCON_SEC0_REG7_Unpack_out_data_format_cntx3_MASK + +#define THCON_SEC0_REG7_Unpack_data_format_cntx7_ADDR32 95 +#define THCON_SEC0_REG7_Unpack_data_format_cntx7_SHAMT 24 +#define THCON_SEC0_REG7_Unpack_data_format_cntx7_MASK 0xf000000 +#define THCON_SEC0_REG7_Unpack_data_format_cntx7_RMW THCON_SEC0_REG7_Unpack_data_format_cntx7_ADDR32, THCON_SEC0_REG7_Unpack_data_format_cntx7_SHAMT, THCON_SEC0_REG7_Unpack_data_format_cntx7_MASK + +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx7_ADDR32 95 +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx7_SHAMT 28 +#define THCON_SEC0_REG7_Unpack_out_data_format_cntx7_MASK 0xf0000000 +#define 
THCON_SEC0_REG7_Unpack_out_data_format_cntx7_RMW THCON_SEC0_REG7_Unpack_out_data_format_cntx7_ADDR32, THCON_SEC0_REG7_Unpack_out_data_format_cntx7_SHAMT, THCON_SEC0_REG7_Unpack_out_data_format_cntx7_MASK + +#define THCON_SEC0_REG8_Row_start_section_size_ADDR32 96 +#define THCON_SEC0_REG8_Row_start_section_size_SHAMT 0 +#define THCON_SEC0_REG8_Row_start_section_size_MASK 0xffff +#define THCON_SEC0_REG8_Row_start_section_size_RMW THCON_SEC0_REG8_Row_start_section_size_ADDR32, THCON_SEC0_REG8_Row_start_section_size_SHAMT, THCON_SEC0_REG8_Row_start_section_size_MASK + +#define THCON_SEC0_REG8_Exp_section_size_ADDR32 96 +#define THCON_SEC0_REG8_Exp_section_size_SHAMT 16 +#define THCON_SEC0_REG8_Exp_section_size_MASK 0xffff0000 +#define THCON_SEC0_REG8_Exp_section_size_RMW THCON_SEC0_REG8_Exp_section_size_ADDR32, THCON_SEC0_REG8_Exp_section_size_SHAMT, THCON_SEC0_REG8_Exp_section_size_MASK + +#define THCON_SEC0_REG8_L1_Dest_addr_ADDR32 97 +#define THCON_SEC0_REG8_L1_Dest_addr_SHAMT 0 +#define THCON_SEC0_REG8_L1_Dest_addr_MASK 0xffffffff +#define THCON_SEC0_REG8_L1_Dest_addr_RMW THCON_SEC0_REG8_L1_Dest_addr_ADDR32, THCON_SEC0_REG8_L1_Dest_addr_SHAMT, THCON_SEC0_REG8_L1_Dest_addr_MASK + +#define THCON_SEC0_REG8_Disable_zero_compress_ADDR32 98 +#define THCON_SEC0_REG8_Disable_zero_compress_SHAMT 0 +#define THCON_SEC0_REG8_Disable_zero_compress_MASK 0x1 +#define THCON_SEC0_REG8_Disable_zero_compress_RMW THCON_SEC0_REG8_Disable_zero_compress_ADDR32, THCON_SEC0_REG8_Disable_zero_compress_SHAMT, THCON_SEC0_REG8_Disable_zero_compress_MASK + +#define THCON_SEC0_REG8_Add_l1_dest_addr_offset_ADDR32 98 +#define THCON_SEC0_REG8_Add_l1_dest_addr_offset_SHAMT 1 +#define THCON_SEC0_REG8_Add_l1_dest_addr_offset_MASK 0x2 +#define THCON_SEC0_REG8_Add_l1_dest_addr_offset_RMW THCON_SEC0_REG8_Add_l1_dest_addr_offset_ADDR32, THCON_SEC0_REG8_Add_l1_dest_addr_offset_SHAMT, THCON_SEC0_REG8_Add_l1_dest_addr_offset_MASK + +#define THCON_SEC0_REG8_Disable_pack_zero_flags_ADDR32 98 +#define THCON_SEC0_REG8_Disable_pack_zero_flags_SHAMT 2 +#define THCON_SEC0_REG8_Disable_pack_zero_flags_MASK 0x4 +#define THCON_SEC0_REG8_Disable_pack_zero_flags_RMW THCON_SEC0_REG8_Disable_pack_zero_flags_ADDR32, THCON_SEC0_REG8_Disable_pack_zero_flags_SHAMT, THCON_SEC0_REG8_Disable_pack_zero_flags_MASK + +#define THCON_SEC0_REG8_Unused1_ADDR32 98 +#define THCON_SEC0_REG8_Unused1_SHAMT 3 +#define THCON_SEC0_REG8_Unused1_MASK 0x8 +#define THCON_SEC0_REG8_Unused1_RMW THCON_SEC0_REG8_Unused1_ADDR32, THCON_SEC0_REG8_Unused1_SHAMT, THCON_SEC0_REG8_Unused1_MASK + +#define THCON_SEC0_REG8_Out_data_format_ADDR32 98 +#define THCON_SEC0_REG8_Out_data_format_SHAMT 4 +#define THCON_SEC0_REG8_Out_data_format_MASK 0xf0 +#define THCON_SEC0_REG8_Out_data_format_RMW THCON_SEC0_REG8_Out_data_format_ADDR32, THCON_SEC0_REG8_Out_data_format_SHAMT, THCON_SEC0_REG8_Out_data_format_MASK + +#define THCON_SEC0_REG8_In_data_format_ADDR32 98 +#define THCON_SEC0_REG8_In_data_format_SHAMT 8 +#define THCON_SEC0_REG8_In_data_format_MASK 0xf00 +#define THCON_SEC0_REG8_In_data_format_RMW THCON_SEC0_REG8_In_data_format_ADDR32, THCON_SEC0_REG8_In_data_format_SHAMT, THCON_SEC0_REG8_In_data_format_MASK + +#define THCON_SEC0_REG8_Dis_shared_exp_assembler_ADDR32 98 +#define THCON_SEC0_REG8_Dis_shared_exp_assembler_SHAMT 12 +#define THCON_SEC0_REG8_Dis_shared_exp_assembler_MASK 0x1000 +#define THCON_SEC0_REG8_Dis_shared_exp_assembler_RMW THCON_SEC0_REG8_Dis_shared_exp_assembler_ADDR32, THCON_SEC0_REG8_Dis_shared_exp_assembler_SHAMT, THCON_SEC0_REG8_Dis_shared_exp_assembler_MASK + 
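+// Each configuration field above and below is described by an (ADDR32,
+// SHAMT, MASK) triple, and its *_RMW macro expands to exactly those three
+// values so a single macro argument can feed a read-modify-write helper.
+// A minimal sketch of such a helper, assuming a hypothetical cfg[] array
+// of 32-bit configuration words indexed by ADDR32 (illustrative only; this
+// header defines no such function):
+//
+//   #include <stdint.h>
+//
+//   static inline void cfg_rmw(volatile uint32_t *cfg, uint32_t addr32,
+//                              uint32_t shamt, uint32_t mask, uint32_t val) {
+//       /* clear the field, then OR in the new value, shifted and masked */
+//       cfg[addr32] = (cfg[addr32] & ~mask) | ((val << shamt) & mask);
+//   }
+//
+//   /* the _RMW macro supplies addr32, shamt, and mask as three arguments: */
+//   cfg_rmw(cfg, THCON_SEC0_REG8_In_data_format_RMW, 5);
+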
+#define THCON_SEC0_REG8_Auto_set_last_pacr_intf_sel_ADDR32 98 +#define THCON_SEC0_REG8_Auto_set_last_pacr_intf_sel_SHAMT 13 +#define THCON_SEC0_REG8_Auto_set_last_pacr_intf_sel_MASK 0x2000 +#define THCON_SEC0_REG8_Auto_set_last_pacr_intf_sel_RMW THCON_SEC0_REG8_Auto_set_last_pacr_intf_sel_ADDR32, THCON_SEC0_REG8_Auto_set_last_pacr_intf_sel_SHAMT, THCON_SEC0_REG8_Auto_set_last_pacr_intf_sel_MASK + +#define THCON_SEC0_REG8_Enable_out_fifo_ADDR32 98 +#define THCON_SEC0_REG8_Enable_out_fifo_SHAMT 14 +#define THCON_SEC0_REG8_Enable_out_fifo_MASK 0x4000 +#define THCON_SEC0_REG8_Enable_out_fifo_RMW THCON_SEC0_REG8_Enable_out_fifo_ADDR32, THCON_SEC0_REG8_Enable_out_fifo_SHAMT, THCON_SEC0_REG8_Enable_out_fifo_MASK + +#define THCON_SEC0_REG8_Sub_l1_tile_header_size_ADDR32 98 +#define THCON_SEC0_REG8_Sub_l1_tile_header_size_SHAMT 15 +#define THCON_SEC0_REG8_Sub_l1_tile_header_size_MASK 0x8000 +#define THCON_SEC0_REG8_Sub_l1_tile_header_size_RMW THCON_SEC0_REG8_Sub_l1_tile_header_size_ADDR32, THCON_SEC0_REG8_Sub_l1_tile_header_size_SHAMT, THCON_SEC0_REG8_Sub_l1_tile_header_size_MASK + +#define THCON_SEC0_REG8_Source_interface_selection_ADDR32 98 +#define THCON_SEC0_REG8_Source_interface_selection_SHAMT 16 +#define THCON_SEC0_REG8_Source_interface_selection_MASK 0x10000 +#define THCON_SEC0_REG8_Source_interface_selection_RMW THCON_SEC0_REG8_Source_interface_selection_ADDR32, THCON_SEC0_REG8_Source_interface_selection_SHAMT, THCON_SEC0_REG8_Source_interface_selection_MASK + +#define THCON_SEC0_REG8_Add_tile_header_size_ADDR32 98 +#define THCON_SEC0_REG8_Add_tile_header_size_SHAMT 17 +#define THCON_SEC0_REG8_Add_tile_header_size_MASK 0x20000 +#define THCON_SEC0_REG8_Add_tile_header_size_RMW THCON_SEC0_REG8_Add_tile_header_size_ADDR32, THCON_SEC0_REG8_Add_tile_header_size_SHAMT, THCON_SEC0_REG8_Add_tile_header_size_MASK + +#define THCON_SEC0_REG8_pack_dis_y_pos_start_offset_ADDR32 98 +#define THCON_SEC0_REG8_pack_dis_y_pos_start_offset_SHAMT 18 +#define THCON_SEC0_REG8_pack_dis_y_pos_start_offset_MASK 0x40000 +#define THCON_SEC0_REG8_pack_dis_y_pos_start_offset_RMW THCON_SEC0_REG8_pack_dis_y_pos_start_offset_ADDR32, THCON_SEC0_REG8_pack_dis_y_pos_start_offset_SHAMT, THCON_SEC0_REG8_pack_dis_y_pos_start_offset_MASK + +#define THCON_SEC0_REG8_unpack_tile_offset_ADDR32 98 +#define THCON_SEC0_REG8_unpack_tile_offset_SHAMT 19 +#define THCON_SEC0_REG8_unpack_tile_offset_MASK 0xf80000 +#define THCON_SEC0_REG8_unpack_tile_offset_RMW THCON_SEC0_REG8_unpack_tile_offset_ADDR32, THCON_SEC0_REG8_unpack_tile_offset_SHAMT, THCON_SEC0_REG8_unpack_tile_offset_MASK + +#define THCON_SEC0_REG8_L1_source_addr_ADDR32 98 +#define THCON_SEC0_REG8_L1_source_addr_SHAMT 24 +#define THCON_SEC0_REG8_L1_source_addr_MASK 0xff000000 +#define THCON_SEC0_REG8_L1_source_addr_RMW THCON_SEC0_REG8_L1_source_addr_ADDR32, THCON_SEC0_REG8_L1_source_addr_SHAMT, THCON_SEC0_REG8_L1_source_addr_MASK + +#define THCON_SEC0_REG8_Downsample_mask_ADDR32 99 +#define THCON_SEC0_REG8_Downsample_mask_SHAMT 0 +#define THCON_SEC0_REG8_Downsample_mask_MASK 0xffff +#define THCON_SEC0_REG8_Downsample_mask_RMW THCON_SEC0_REG8_Downsample_mask_ADDR32, THCON_SEC0_REG8_Downsample_mask_SHAMT, THCON_SEC0_REG8_Downsample_mask_MASK + +#define THCON_SEC0_REG8_Downsample_rate_ADDR32 99 +#define THCON_SEC0_REG8_Downsample_rate_SHAMT 16 +#define THCON_SEC0_REG8_Downsample_rate_MASK 0x70000 +#define THCON_SEC0_REG8_Downsample_rate_RMW THCON_SEC0_REG8_Downsample_rate_ADDR32, THCON_SEC0_REG8_Downsample_rate_SHAMT, THCON_SEC0_REG8_Downsample_rate_MASK + +#define 
THCON_SEC0_REG8_Pack_L1_Acc_ADDR32 99 +#define THCON_SEC0_REG8_Pack_L1_Acc_SHAMT 19 +#define THCON_SEC0_REG8_Pack_L1_Acc_MASK 0x80000 +#define THCON_SEC0_REG8_Pack_L1_Acc_RMW THCON_SEC0_REG8_Pack_L1_Acc_ADDR32, THCON_SEC0_REG8_Pack_L1_Acc_SHAMT, THCON_SEC0_REG8_Pack_L1_Acc_MASK + +#define THCON_SEC0_REG8_Exp_threshold_en_ADDR32 99 +#define THCON_SEC0_REG8_Exp_threshold_en_SHAMT 20 +#define THCON_SEC0_REG8_Exp_threshold_en_MASK 0x100000 +#define THCON_SEC0_REG8_Exp_threshold_en_RMW THCON_SEC0_REG8_Exp_threshold_en_ADDR32, THCON_SEC0_REG8_Exp_threshold_en_SHAMT, THCON_SEC0_REG8_Exp_threshold_en_MASK + +#define THCON_SEC0_REG8_Exp_threshold_ADDR32 99 +#define THCON_SEC0_REG8_Exp_threshold_SHAMT 24 +#define THCON_SEC0_REG8_Exp_threshold_MASK 0xff000000 +#define THCON_SEC0_REG8_Exp_threshold_RMW THCON_SEC0_REG8_Exp_threshold_ADDR32, THCON_SEC0_REG8_Exp_threshold_SHAMT, THCON_SEC0_REG8_Exp_threshold_MASK + +#define THCON_SEC0_REG9_Pack_0_2_limit_address_ADDR32 100 +#define THCON_SEC0_REG9_Pack_0_2_limit_address_SHAMT 0 +#define THCON_SEC0_REG9_Pack_0_2_limit_address_MASK 0x1ffff +#define THCON_SEC0_REG9_Pack_0_2_limit_address_RMW THCON_SEC0_REG9_Pack_0_2_limit_address_ADDR32, THCON_SEC0_REG9_Pack_0_2_limit_address_SHAMT, THCON_SEC0_REG9_Pack_0_2_limit_address_MASK + +#define THCON_SEC0_REG9_Pack_0_2_fifo_size_ADDR32 101 +#define THCON_SEC0_REG9_Pack_0_2_fifo_size_SHAMT 0 +#define THCON_SEC0_REG9_Pack_0_2_fifo_size_MASK 0x1ffff +#define THCON_SEC0_REG9_Pack_0_2_fifo_size_RMW THCON_SEC0_REG9_Pack_0_2_fifo_size_ADDR32, THCON_SEC0_REG9_Pack_0_2_fifo_size_SHAMT, THCON_SEC0_REG9_Pack_0_2_fifo_size_MASK + +#define THCON_SEC0_REG9_Pack_1_3_limit_address_ADDR32 102 +#define THCON_SEC0_REG9_Pack_1_3_limit_address_SHAMT 0 +#define THCON_SEC0_REG9_Pack_1_3_limit_address_MASK 0x1ffff +#define THCON_SEC0_REG9_Pack_1_3_limit_address_RMW THCON_SEC0_REG9_Pack_1_3_limit_address_ADDR32, THCON_SEC0_REG9_Pack_1_3_limit_address_SHAMT, THCON_SEC0_REG9_Pack_1_3_limit_address_MASK + +#define THCON_SEC0_REG9_Pack_1_3_fifo_size_ADDR32 103 +#define THCON_SEC0_REG9_Pack_1_3_fifo_size_SHAMT 0 +#define THCON_SEC0_REG9_Pack_1_3_fifo_size_MASK 0x1ffff +#define THCON_SEC0_REG9_Pack_1_3_fifo_size_RMW THCON_SEC0_REG9_Pack_1_3_fifo_size_ADDR32, THCON_SEC0_REG9_Pack_1_3_fifo_size_SHAMT, THCON_SEC0_REG9_Pack_1_3_fifo_size_MASK + +#define THCON_SEC0_REG10_Unpack_limit_address_ADDR32 104 +#define THCON_SEC0_REG10_Unpack_limit_address_SHAMT 0 +#define THCON_SEC0_REG10_Unpack_limit_address_MASK 0x1ffff +#define THCON_SEC0_REG10_Unpack_limit_address_RMW THCON_SEC0_REG10_Unpack_limit_address_ADDR32, THCON_SEC0_REG10_Unpack_limit_address_SHAMT, THCON_SEC0_REG10_Unpack_limit_address_MASK + +#define THCON_SEC0_REG10_Unpack_fifo_size_ADDR32 105 +#define THCON_SEC0_REG10_Unpack_fifo_size_SHAMT 0 +#define THCON_SEC0_REG10_Unpack_fifo_size_MASK 0x1ffff +#define THCON_SEC0_REG10_Unpack_fifo_size_RMW THCON_SEC0_REG10_Unpack_fifo_size_ADDR32, THCON_SEC0_REG10_Unpack_fifo_size_SHAMT, THCON_SEC0_REG10_Unpack_fifo_size_MASK + +#define THCON_SEC0_REG10_Unpack_limit_address_en_ADDR32 105 +#define THCON_SEC0_REG10_Unpack_limit_address_en_SHAMT 17 +#define THCON_SEC0_REG10_Unpack_limit_address_en_MASK 0x20000 +#define THCON_SEC0_REG10_Unpack_limit_address_en_RMW THCON_SEC0_REG10_Unpack_limit_address_en_ADDR32, THCON_SEC0_REG10_Unpack_limit_address_en_SHAMT, THCON_SEC0_REG10_Unpack_limit_address_en_MASK + +#define THCON_SEC0_REG10_Unpacker_Reg_Wr_Addr_ADDR32 106 +#define THCON_SEC0_REG10_Unpacker_Reg_Wr_Addr_SHAMT 0 +#define 
THCON_SEC0_REG10_Unpacker_Reg_Wr_Addr_MASK 0xffffff +#define THCON_SEC0_REG10_Unpacker_Reg_Wr_Addr_RMW THCON_SEC0_REG10_Unpacker_Reg_Wr_Addr_ADDR32, THCON_SEC0_REG10_Unpacker_Reg_Wr_Addr_SHAMT, THCON_SEC0_REG10_Unpacker_Reg_Wr_Addr_MASK + +#define THCON_SEC0_REG10_Packer_Reg_Wr_Addr_ADDR32 107 +#define THCON_SEC0_REG10_Packer_Reg_Wr_Addr_SHAMT 0 +#define THCON_SEC0_REG10_Packer_Reg_Wr_Addr_MASK 0xffffff +#define THCON_SEC0_REG10_Packer_Reg_Wr_Addr_RMW THCON_SEC0_REG10_Packer_Reg_Wr_Addr_ADDR32, THCON_SEC0_REG10_Packer_Reg_Wr_Addr_SHAMT, THCON_SEC0_REG10_Packer_Reg_Wr_Addr_MASK + +#define THCON_SEC0_REG11_Metadata_l1_addr_ADDR32 108 +#define THCON_SEC0_REG11_Metadata_l1_addr_SHAMT 0 +#define THCON_SEC0_REG11_Metadata_l1_addr_MASK 0xffffffff +#define THCON_SEC0_REG11_Metadata_l1_addr_RMW THCON_SEC0_REG11_Metadata_l1_addr_ADDR32, THCON_SEC0_REG11_Metadata_l1_addr_SHAMT, THCON_SEC0_REG11_Metadata_l1_addr_MASK + +#define THCON_SEC0_REG11_Metadata_limit_addr_ADDR32 109 +#define THCON_SEC0_REG11_Metadata_limit_addr_SHAMT 0 +#define THCON_SEC0_REG11_Metadata_limit_addr_MASK 0xffffffff +#define THCON_SEC0_REG11_Metadata_limit_addr_RMW THCON_SEC0_REG11_Metadata_limit_addr_ADDR32, THCON_SEC0_REG11_Metadata_limit_addr_SHAMT, THCON_SEC0_REG11_Metadata_limit_addr_MASK + +#define THCON_SEC0_REG11_Metadata_fifo_size_ADDR32 110 +#define THCON_SEC0_REG11_Metadata_fifo_size_SHAMT 0 +#define THCON_SEC0_REG11_Metadata_fifo_size_MASK 0xffffffff +#define THCON_SEC0_REG11_Metadata_fifo_size_RMW THCON_SEC0_REG11_Metadata_fifo_size_ADDR32, THCON_SEC0_REG11_Metadata_fifo_size_SHAMT, THCON_SEC0_REG11_Metadata_fifo_size_MASK + +#define THCON_SEC0_REG11_Metadata_z_cntr_rst_unpacr_count_ADDR32 111 +#define THCON_SEC0_REG11_Metadata_z_cntr_rst_unpacr_count_SHAMT 0 +#define THCON_SEC0_REG11_Metadata_z_cntr_rst_unpacr_count_MASK 0xff +#define THCON_SEC0_REG11_Metadata_z_cntr_rst_unpacr_count_RMW THCON_SEC0_REG11_Metadata_z_cntr_rst_unpacr_count_ADDR32, THCON_SEC0_REG11_Metadata_z_cntr_rst_unpacr_count_SHAMT, THCON_SEC0_REG11_Metadata_z_cntr_rst_unpacr_count_MASK + +#define THCON_SEC0_REG11_Metadata_cntxt_switch_unpacr_count_ADDR32 111 +#define THCON_SEC0_REG11_Metadata_cntxt_switch_unpacr_count_SHAMT 8 +#define THCON_SEC0_REG11_Metadata_cntxt_switch_unpacr_count_MASK 0xff00 +#define THCON_SEC0_REG11_Metadata_cntxt_switch_unpacr_count_RMW THCON_SEC0_REG11_Metadata_cntxt_switch_unpacr_count_ADDR32, THCON_SEC0_REG11_Metadata_cntxt_switch_unpacr_count_SHAMT, THCON_SEC0_REG11_Metadata_cntxt_switch_unpacr_count_MASK + +#define THCON_SEC1_REG0_TileDescriptor_ADDR32 112 +#define THCON_SEC1_REG0_TileDescriptor_SHAMT 0 +#define THCON_SEC1_REG0_TileDescriptor_MASK 0xffffffffffffffffffffffffffffffff +#define THCON_SEC1_REG0_TileDescriptor_RMW THCON_SEC1_REG0_TileDescriptor_ADDR32, THCON_SEC1_REG0_TileDescriptor_SHAMT, THCON_SEC1_REG0_TileDescriptor_MASK + +#define THCON_SEC1_REG1_Row_start_section_size_ADDR32 116 +#define THCON_SEC1_REG1_Row_start_section_size_SHAMT 0 +#define THCON_SEC1_REG1_Row_start_section_size_MASK 0xffff +#define THCON_SEC1_REG1_Row_start_section_size_RMW THCON_SEC1_REG1_Row_start_section_size_ADDR32, THCON_SEC1_REG1_Row_start_section_size_SHAMT, THCON_SEC1_REG1_Row_start_section_size_MASK + +#define THCON_SEC1_REG1_Exp_section_size_ADDR32 116 +#define THCON_SEC1_REG1_Exp_section_size_SHAMT 16 +#define THCON_SEC1_REG1_Exp_section_size_MASK 0xffff0000 +#define THCON_SEC1_REG1_Exp_section_size_RMW THCON_SEC1_REG1_Exp_section_size_ADDR32, THCON_SEC1_REG1_Exp_section_size_SHAMT, THCON_SEC1_REG1_Exp_section_size_MASK 
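(Again an editor's sketch, not part of the generated header.) Reading a field back uses the same triple in reverse: mask the word, then shift right by SHAMT. Reusing the hypothetical cfg_regs pointer from the sketch above:

    // Extract one field from its 32-bit config word; the parameters again
    // match the ADDR32 / SHAMT / MASK expansion of a *_RMW macro.
    inline uint32_t cfg_read_field(uint32_t addr32, uint32_t shamt,
                                   uint32_t mask) {
        return (cfg_regs[addr32] & mask) >> shamt;
    }

    // Usage: read the 16-bit Exp_section_size field out of word 116.
    // Fields whose mask spans the whole word (MASK 0xffffffff, SHAMT 0),
    // such as the base-address registers below, come back unshifted:
    //   uint32_t exp_size =
    //       cfg_read_field(THCON_SEC1_REG1_Exp_section_size_RMW);
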
+ +#define THCON_SEC1_REG1_L1_Dest_addr_ADDR32 117 +#define THCON_SEC1_REG1_L1_Dest_addr_SHAMT 0 +#define THCON_SEC1_REG1_L1_Dest_addr_MASK 0xffffffff +#define THCON_SEC1_REG1_L1_Dest_addr_RMW THCON_SEC1_REG1_L1_Dest_addr_ADDR32, THCON_SEC1_REG1_L1_Dest_addr_SHAMT, THCON_SEC1_REG1_L1_Dest_addr_MASK + +#define THCON_SEC1_REG1_Disable_zero_compress_ADDR32 118 +#define THCON_SEC1_REG1_Disable_zero_compress_SHAMT 0 +#define THCON_SEC1_REG1_Disable_zero_compress_MASK 0x1 +#define THCON_SEC1_REG1_Disable_zero_compress_RMW THCON_SEC1_REG1_Disable_zero_compress_ADDR32, THCON_SEC1_REG1_Disable_zero_compress_SHAMT, THCON_SEC1_REG1_Disable_zero_compress_MASK + +#define THCON_SEC1_REG1_Add_l1_dest_addr_offset_ADDR32 118 +#define THCON_SEC1_REG1_Add_l1_dest_addr_offset_SHAMT 1 +#define THCON_SEC1_REG1_Add_l1_dest_addr_offset_MASK 0x2 +#define THCON_SEC1_REG1_Add_l1_dest_addr_offset_RMW THCON_SEC1_REG1_Add_l1_dest_addr_offset_ADDR32, THCON_SEC1_REG1_Add_l1_dest_addr_offset_SHAMT, THCON_SEC1_REG1_Add_l1_dest_addr_offset_MASK + +#define THCON_SEC1_REG1_Disable_pack_zero_flags_ADDR32 118 +#define THCON_SEC1_REG1_Disable_pack_zero_flags_SHAMT 2 +#define THCON_SEC1_REG1_Disable_pack_zero_flags_MASK 0x4 +#define THCON_SEC1_REG1_Disable_pack_zero_flags_RMW THCON_SEC1_REG1_Disable_pack_zero_flags_ADDR32, THCON_SEC1_REG1_Disable_pack_zero_flags_SHAMT, THCON_SEC1_REG1_Disable_pack_zero_flags_MASK + +#define THCON_SEC1_REG1_ovrd_default_throttle_mode_ADDR32 118 +#define THCON_SEC1_REG1_ovrd_default_throttle_mode_SHAMT 3 +#define THCON_SEC1_REG1_ovrd_default_throttle_mode_MASK 0x8 +#define THCON_SEC1_REG1_ovrd_default_throttle_mode_RMW THCON_SEC1_REG1_ovrd_default_throttle_mode_ADDR32, THCON_SEC1_REG1_ovrd_default_throttle_mode_SHAMT, THCON_SEC1_REG1_ovrd_default_throttle_mode_MASK + +#define THCON_SEC1_REG1_Out_data_format_ADDR32 118 +#define THCON_SEC1_REG1_Out_data_format_SHAMT 4 +#define THCON_SEC1_REG1_Out_data_format_MASK 0xf0 +#define THCON_SEC1_REG1_Out_data_format_RMW THCON_SEC1_REG1_Out_data_format_ADDR32, THCON_SEC1_REG1_Out_data_format_SHAMT, THCON_SEC1_REG1_Out_data_format_MASK + +#define THCON_SEC1_REG1_In_data_format_ADDR32 118 +#define THCON_SEC1_REG1_In_data_format_SHAMT 8 +#define THCON_SEC1_REG1_In_data_format_MASK 0xf00 +#define THCON_SEC1_REG1_In_data_format_RMW THCON_SEC1_REG1_In_data_format_ADDR32, THCON_SEC1_REG1_In_data_format_SHAMT, THCON_SEC1_REG1_In_data_format_MASK + +#define THCON_SEC1_REG1_Dis_shared_exp_assembler_ADDR32 118 +#define THCON_SEC1_REG1_Dis_shared_exp_assembler_SHAMT 12 +#define THCON_SEC1_REG1_Dis_shared_exp_assembler_MASK 0x1000 +#define THCON_SEC1_REG1_Dis_shared_exp_assembler_RMW THCON_SEC1_REG1_Dis_shared_exp_assembler_ADDR32, THCON_SEC1_REG1_Dis_shared_exp_assembler_SHAMT, THCON_SEC1_REG1_Dis_shared_exp_assembler_MASK + +#define THCON_SEC1_REG1_Auto_set_last_pacr_intf_sel_ADDR32 118 +#define THCON_SEC1_REG1_Auto_set_last_pacr_intf_sel_SHAMT 13 +#define THCON_SEC1_REG1_Auto_set_last_pacr_intf_sel_MASK 0x2000 +#define THCON_SEC1_REG1_Auto_set_last_pacr_intf_sel_RMW THCON_SEC1_REG1_Auto_set_last_pacr_intf_sel_ADDR32, THCON_SEC1_REG1_Auto_set_last_pacr_intf_sel_SHAMT, THCON_SEC1_REG1_Auto_set_last_pacr_intf_sel_MASK + +#define THCON_SEC1_REG1_Enable_out_fifo_ADDR32 118 +#define THCON_SEC1_REG1_Enable_out_fifo_SHAMT 14 +#define THCON_SEC1_REG1_Enable_out_fifo_MASK 0x4000 +#define THCON_SEC1_REG1_Enable_out_fifo_RMW THCON_SEC1_REG1_Enable_out_fifo_ADDR32, THCON_SEC1_REG1_Enable_out_fifo_SHAMT, THCON_SEC1_REG1_Enable_out_fifo_MASK + +#define 
THCON_SEC1_REG1_Sub_l1_tile_header_size_ADDR32 118 +#define THCON_SEC1_REG1_Sub_l1_tile_header_size_SHAMT 15 +#define THCON_SEC1_REG1_Sub_l1_tile_header_size_MASK 0x8000 +#define THCON_SEC1_REG1_Sub_l1_tile_header_size_RMW THCON_SEC1_REG1_Sub_l1_tile_header_size_ADDR32, THCON_SEC1_REG1_Sub_l1_tile_header_size_SHAMT, THCON_SEC1_REG1_Sub_l1_tile_header_size_MASK + +#define THCON_SEC1_REG1_Source_interface_selection_ADDR32 118 +#define THCON_SEC1_REG1_Source_interface_selection_SHAMT 16 +#define THCON_SEC1_REG1_Source_interface_selection_MASK 0x10000 +#define THCON_SEC1_REG1_Source_interface_selection_RMW THCON_SEC1_REG1_Source_interface_selection_ADDR32, THCON_SEC1_REG1_Source_interface_selection_SHAMT, THCON_SEC1_REG1_Source_interface_selection_MASK + +#define THCON_SEC1_REG1_pack_start_intf_pos_ADDR32 118 +#define THCON_SEC1_REG1_pack_start_intf_pos_SHAMT 17 +#define THCON_SEC1_REG1_pack_start_intf_pos_MASK 0x1e0000 +#define THCON_SEC1_REG1_pack_start_intf_pos_RMW THCON_SEC1_REG1_pack_start_intf_pos_ADDR32, THCON_SEC1_REG1_pack_start_intf_pos_SHAMT, THCON_SEC1_REG1_pack_start_intf_pos_MASK + +#define THCON_SEC1_REG1_All_pack_disable_zero_compress_ovrd_ADDR32 118 +#define THCON_SEC1_REG1_All_pack_disable_zero_compress_ovrd_SHAMT 21 +#define THCON_SEC1_REG1_All_pack_disable_zero_compress_ovrd_MASK 0x200000 +#define THCON_SEC1_REG1_All_pack_disable_zero_compress_ovrd_RMW THCON_SEC1_REG1_All_pack_disable_zero_compress_ovrd_ADDR32, THCON_SEC1_REG1_All_pack_disable_zero_compress_ovrd_SHAMT, THCON_SEC1_REG1_All_pack_disable_zero_compress_ovrd_MASK + +#define THCON_SEC1_REG1_Add_tile_header_size_ADDR32 118 +#define THCON_SEC1_REG1_Add_tile_header_size_SHAMT 22 +#define THCON_SEC1_REG1_Add_tile_header_size_MASK 0x400000 +#define THCON_SEC1_REG1_Add_tile_header_size_RMW THCON_SEC1_REG1_Add_tile_header_size_ADDR32, THCON_SEC1_REG1_Add_tile_header_size_SHAMT, THCON_SEC1_REG1_Add_tile_header_size_MASK + +#define THCON_SEC1_REG1_pack_dis_y_pos_start_offset_ADDR32 118 +#define THCON_SEC1_REG1_pack_dis_y_pos_start_offset_SHAMT 23 +#define THCON_SEC1_REG1_pack_dis_y_pos_start_offset_MASK 0x800000 +#define THCON_SEC1_REG1_pack_dis_y_pos_start_offset_RMW THCON_SEC1_REG1_pack_dis_y_pos_start_offset_ADDR32, THCON_SEC1_REG1_pack_dis_y_pos_start_offset_SHAMT, THCON_SEC1_REG1_pack_dis_y_pos_start_offset_MASK + +#define THCON_SEC1_REG1_L1_source_addr_ADDR32 118 +#define THCON_SEC1_REG1_L1_source_addr_SHAMT 24 +#define THCON_SEC1_REG1_L1_source_addr_MASK 0xff000000 +#define THCON_SEC1_REG1_L1_source_addr_RMW THCON_SEC1_REG1_L1_source_addr_ADDR32, THCON_SEC1_REG1_L1_source_addr_SHAMT, THCON_SEC1_REG1_L1_source_addr_MASK + +#define THCON_SEC1_REG1_Downsample_mask_ADDR32 119 +#define THCON_SEC1_REG1_Downsample_mask_SHAMT 0 +#define THCON_SEC1_REG1_Downsample_mask_MASK 0xffff +#define THCON_SEC1_REG1_Downsample_mask_RMW THCON_SEC1_REG1_Downsample_mask_ADDR32, THCON_SEC1_REG1_Downsample_mask_SHAMT, THCON_SEC1_REG1_Downsample_mask_MASK + +#define THCON_SEC1_REG1_Downsample_rate_ADDR32 119 +#define THCON_SEC1_REG1_Downsample_rate_SHAMT 16 +#define THCON_SEC1_REG1_Downsample_rate_MASK 0x70000 +#define THCON_SEC1_REG1_Downsample_rate_RMW THCON_SEC1_REG1_Downsample_rate_ADDR32, THCON_SEC1_REG1_Downsample_rate_SHAMT, THCON_SEC1_REG1_Downsample_rate_MASK + +#define THCON_SEC1_REG1_Pack_L1_Acc_ADDR32 119 +#define THCON_SEC1_REG1_Pack_L1_Acc_SHAMT 19 +#define THCON_SEC1_REG1_Pack_L1_Acc_MASK 0x80000 +#define THCON_SEC1_REG1_Pack_L1_Acc_RMW THCON_SEC1_REG1_Pack_L1_Acc_ADDR32, THCON_SEC1_REG1_Pack_L1_Acc_SHAMT, 
THCON_SEC1_REG1_Pack_L1_Acc_MASK + +#define THCON_SEC1_REG1_Exp_threshold_en_ADDR32 119 +#define THCON_SEC1_REG1_Exp_threshold_en_SHAMT 20 +#define THCON_SEC1_REG1_Exp_threshold_en_MASK 0x100000 +#define THCON_SEC1_REG1_Exp_threshold_en_RMW THCON_SEC1_REG1_Exp_threshold_en_ADDR32, THCON_SEC1_REG1_Exp_threshold_en_SHAMT, THCON_SEC1_REG1_Exp_threshold_en_MASK + +#define THCON_SEC1_REG1_Unp_LF8_4b_exp_ADDR32 119 +#define THCON_SEC1_REG1_Unp_LF8_4b_exp_SHAMT 22 +#define THCON_SEC1_REG1_Unp_LF8_4b_exp_MASK 0x400000 +#define THCON_SEC1_REG1_Unp_LF8_4b_exp_RMW THCON_SEC1_REG1_Unp_LF8_4b_exp_ADDR32, THCON_SEC1_REG1_Unp_LF8_4b_exp_SHAMT, THCON_SEC1_REG1_Unp_LF8_4b_exp_MASK + +#define THCON_SEC1_REG1_Pac_LF8_4b_exp_ADDR32 119 +#define THCON_SEC1_REG1_Pac_LF8_4b_exp_SHAMT 23 +#define THCON_SEC1_REG1_Pac_LF8_4b_exp_MASK 0x800000 +#define THCON_SEC1_REG1_Pac_LF8_4b_exp_RMW THCON_SEC1_REG1_Pac_LF8_4b_exp_ADDR32, THCON_SEC1_REG1_Pac_LF8_4b_exp_SHAMT, THCON_SEC1_REG1_Pac_LF8_4b_exp_MASK + +#define THCON_SEC1_REG1_Exp_threshold_ADDR32 119 +#define THCON_SEC1_REG1_Exp_threshold_SHAMT 24 +#define THCON_SEC1_REG1_Exp_threshold_MASK 0xff000000 +#define THCON_SEC1_REG1_Exp_threshold_RMW THCON_SEC1_REG1_Exp_threshold_ADDR32, THCON_SEC1_REG1_Exp_threshold_SHAMT, THCON_SEC1_REG1_Exp_threshold_MASK + +#define THCON_SEC1_REG2_Out_data_format_ADDR32 120 +#define THCON_SEC1_REG2_Out_data_format_SHAMT 0 +#define THCON_SEC1_REG2_Out_data_format_MASK 0xf +#define THCON_SEC1_REG2_Out_data_format_RMW THCON_SEC1_REG2_Out_data_format_ADDR32, THCON_SEC1_REG2_Out_data_format_SHAMT, THCON_SEC1_REG2_Out_data_format_MASK + +#define THCON_SEC1_REG2_Throttle_mode_ADDR32 120 +#define THCON_SEC1_REG2_Throttle_mode_SHAMT 4 +#define THCON_SEC1_REG2_Throttle_mode_MASK 0x30 +#define THCON_SEC1_REG2_Throttle_mode_RMW THCON_SEC1_REG2_Throttle_mode_ADDR32, THCON_SEC1_REG2_Throttle_mode_SHAMT, THCON_SEC1_REG2_Throttle_mode_MASK + +#define THCON_SEC1_REG2_Context_count_ADDR32 120 +#define THCON_SEC1_REG2_Context_count_SHAMT 6 +#define THCON_SEC1_REG2_Context_count_MASK 0xc0 +#define THCON_SEC1_REG2_Context_count_RMW THCON_SEC1_REG2_Context_count_ADDR32, THCON_SEC1_REG2_Context_count_SHAMT, THCON_SEC1_REG2_Context_count_MASK + +#define THCON_SEC1_REG2_Haloize_mode_ADDR32 120 +#define THCON_SEC1_REG2_Haloize_mode_SHAMT 8 +#define THCON_SEC1_REG2_Haloize_mode_MASK 0x100 +#define THCON_SEC1_REG2_Haloize_mode_RMW THCON_SEC1_REG2_Haloize_mode_ADDR32, THCON_SEC1_REG2_Haloize_mode_SHAMT, THCON_SEC1_REG2_Haloize_mode_MASK + +#define THCON_SEC1_REG2_Tileize_mode_ADDR32 120 +#define THCON_SEC1_REG2_Tileize_mode_SHAMT 9 +#define THCON_SEC1_REG2_Tileize_mode_MASK 0x200 +#define THCON_SEC1_REG2_Tileize_mode_RMW THCON_SEC1_REG2_Tileize_mode_ADDR32, THCON_SEC1_REG2_Tileize_mode_SHAMT, THCON_SEC1_REG2_Tileize_mode_MASK + +#define THCON_SEC1_REG2_Unpack_Src_Reg_Set_Upd_ADDR32 120 +#define THCON_SEC1_REG2_Unpack_Src_Reg_Set_Upd_SHAMT 10 +#define THCON_SEC1_REG2_Unpack_Src_Reg_Set_Upd_MASK 0x400 +#define THCON_SEC1_REG2_Unpack_Src_Reg_Set_Upd_RMW THCON_SEC1_REG2_Unpack_Src_Reg_Set_Upd_ADDR32, THCON_SEC1_REG2_Unpack_Src_Reg_Set_Upd_SHAMT, THCON_SEC1_REG2_Unpack_Src_Reg_Set_Upd_MASK + +#define THCON_SEC1_REG2_Unpack_If_Sel_ADDR32 120 +#define THCON_SEC1_REG2_Unpack_If_Sel_SHAMT 11 +#define THCON_SEC1_REG2_Unpack_If_Sel_MASK 0x800 +#define THCON_SEC1_REG2_Unpack_If_Sel_RMW THCON_SEC1_REG2_Unpack_If_Sel_ADDR32, THCON_SEC1_REG2_Unpack_If_Sel_SHAMT, THCON_SEC1_REG2_Unpack_If_Sel_MASK + +#define THCON_SEC1_REG2_Upsample_rate_ADDR32 120 +#define 
THCON_SEC1_REG2_Upsample_rate_SHAMT 12 +#define THCON_SEC1_REG2_Upsample_rate_MASK 0x3000 +#define THCON_SEC1_REG2_Upsample_rate_RMW THCON_SEC1_REG2_Upsample_rate_ADDR32, THCON_SEC1_REG2_Upsample_rate_SHAMT, THCON_SEC1_REG2_Upsample_rate_MASK + +#define THCON_SEC1_REG2_Ovrd_data_format_ADDR32 120 +#define THCON_SEC1_REG2_Ovrd_data_format_SHAMT 14 +#define THCON_SEC1_REG2_Ovrd_data_format_MASK 0x4000 +#define THCON_SEC1_REG2_Ovrd_data_format_RMW THCON_SEC1_REG2_Ovrd_data_format_ADDR32, THCON_SEC1_REG2_Ovrd_data_format_SHAMT, THCON_SEC1_REG2_Ovrd_data_format_MASK + +#define THCON_SEC1_REG2_Upsample_and_interleave_ADDR32 120 +#define THCON_SEC1_REG2_Upsample_and_interleave_SHAMT 15 +#define THCON_SEC1_REG2_Upsample_and_interleave_MASK 0x8000 +#define THCON_SEC1_REG2_Upsample_and_interleave_RMW THCON_SEC1_REG2_Upsample_and_interleave_ADDR32, THCON_SEC1_REG2_Upsample_and_interleave_SHAMT, THCON_SEC1_REG2_Upsample_and_interleave_MASK + +#define THCON_SEC1_REG2_Shift_amount_cntx0_ADDR32 120 +#define THCON_SEC1_REG2_Shift_amount_cntx0_SHAMT 16 +#define THCON_SEC1_REG2_Shift_amount_cntx0_MASK 0xf0000 +#define THCON_SEC1_REG2_Shift_amount_cntx0_RMW THCON_SEC1_REG2_Shift_amount_cntx0_ADDR32, THCON_SEC1_REG2_Shift_amount_cntx0_SHAMT, THCON_SEC1_REG2_Shift_amount_cntx0_MASK + +#define THCON_SEC1_REG2_Shift_amount_cntx1_ADDR32 120 +#define THCON_SEC1_REG2_Shift_amount_cntx1_SHAMT 20 +#define THCON_SEC1_REG2_Shift_amount_cntx1_MASK 0xf00000 +#define THCON_SEC1_REG2_Shift_amount_cntx1_RMW THCON_SEC1_REG2_Shift_amount_cntx1_ADDR32, THCON_SEC1_REG2_Shift_amount_cntx1_SHAMT, THCON_SEC1_REG2_Shift_amount_cntx1_MASK + +#define THCON_SEC1_REG2_Shift_amount_cntx2_ADDR32 120 +#define THCON_SEC1_REG2_Shift_amount_cntx2_SHAMT 24 +#define THCON_SEC1_REG2_Shift_amount_cntx2_MASK 0xf000000 +#define THCON_SEC1_REG2_Shift_amount_cntx2_RMW THCON_SEC1_REG2_Shift_amount_cntx2_ADDR32, THCON_SEC1_REG2_Shift_amount_cntx2_SHAMT, THCON_SEC1_REG2_Shift_amount_cntx2_MASK + +#define THCON_SEC1_REG2_Shift_amount_cntx3_ADDR32 120 +#define THCON_SEC1_REG2_Shift_amount_cntx3_SHAMT 28 +#define THCON_SEC1_REG2_Shift_amount_cntx3_MASK 0xf0000000 +#define THCON_SEC1_REG2_Shift_amount_cntx3_RMW THCON_SEC1_REG2_Shift_amount_cntx3_ADDR32, THCON_SEC1_REG2_Shift_amount_cntx3_SHAMT, THCON_SEC1_REG2_Shift_amount_cntx3_MASK + +#define THCON_SEC1_REG2_Disable_zero_compress_cntx0_ADDR32 121 +#define THCON_SEC1_REG2_Disable_zero_compress_cntx0_SHAMT 0 +#define THCON_SEC1_REG2_Disable_zero_compress_cntx0_MASK 0x1 +#define THCON_SEC1_REG2_Disable_zero_compress_cntx0_RMW THCON_SEC1_REG2_Disable_zero_compress_cntx0_ADDR32, THCON_SEC1_REG2_Disable_zero_compress_cntx0_SHAMT, THCON_SEC1_REG2_Disable_zero_compress_cntx0_MASK + +#define THCON_SEC1_REG2_Disable_zero_compress_cntx1_ADDR32 121 +#define THCON_SEC1_REG2_Disable_zero_compress_cntx1_SHAMT 1 +#define THCON_SEC1_REG2_Disable_zero_compress_cntx1_MASK 0x2 +#define THCON_SEC1_REG2_Disable_zero_compress_cntx1_RMW THCON_SEC1_REG2_Disable_zero_compress_cntx1_ADDR32, THCON_SEC1_REG2_Disable_zero_compress_cntx1_SHAMT, THCON_SEC1_REG2_Disable_zero_compress_cntx1_MASK + +#define THCON_SEC1_REG2_Disable_zero_compress_cntx2_ADDR32 121 +#define THCON_SEC1_REG2_Disable_zero_compress_cntx2_SHAMT 2 +#define THCON_SEC1_REG2_Disable_zero_compress_cntx2_MASK 0x4 +#define THCON_SEC1_REG2_Disable_zero_compress_cntx2_RMW THCON_SEC1_REG2_Disable_zero_compress_cntx2_ADDR32, THCON_SEC1_REG2_Disable_zero_compress_cntx2_SHAMT, THCON_SEC1_REG2_Disable_zero_compress_cntx2_MASK + +#define 
THCON_SEC1_REG2_Disable_zero_compress_cntx3_ADDR32 121 +#define THCON_SEC1_REG2_Disable_zero_compress_cntx3_SHAMT 3 +#define THCON_SEC1_REG2_Disable_zero_compress_cntx3_MASK 0x8 +#define THCON_SEC1_REG2_Disable_zero_compress_cntx3_RMW THCON_SEC1_REG2_Disable_zero_compress_cntx3_ADDR32, THCON_SEC1_REG2_Disable_zero_compress_cntx3_SHAMT, THCON_SEC1_REG2_Disable_zero_compress_cntx3_MASK + +#define THCON_SEC1_REG2_Unpack_if_sel_cntx0_ADDR32 121 +#define THCON_SEC1_REG2_Unpack_if_sel_cntx0_SHAMT 4 +#define THCON_SEC1_REG2_Unpack_if_sel_cntx0_MASK 0x10 +#define THCON_SEC1_REG2_Unpack_if_sel_cntx0_RMW THCON_SEC1_REG2_Unpack_if_sel_cntx0_ADDR32, THCON_SEC1_REG2_Unpack_if_sel_cntx0_SHAMT, THCON_SEC1_REG2_Unpack_if_sel_cntx0_MASK + +#define THCON_SEC1_REG2_Unpack_if_sel_cntx1_ADDR32 121 +#define THCON_SEC1_REG2_Unpack_if_sel_cntx1_SHAMT 5 +#define THCON_SEC1_REG2_Unpack_if_sel_cntx1_MASK 0x20 +#define THCON_SEC1_REG2_Unpack_if_sel_cntx1_RMW THCON_SEC1_REG2_Unpack_if_sel_cntx1_ADDR32, THCON_SEC1_REG2_Unpack_if_sel_cntx1_SHAMT, THCON_SEC1_REG2_Unpack_if_sel_cntx1_MASK + +#define THCON_SEC1_REG2_Unpack_if_sel_cntx2_ADDR32 121 +#define THCON_SEC1_REG2_Unpack_if_sel_cntx2_SHAMT 6 +#define THCON_SEC1_REG2_Unpack_if_sel_cntx2_MASK 0x40 +#define THCON_SEC1_REG2_Unpack_if_sel_cntx2_RMW THCON_SEC1_REG2_Unpack_if_sel_cntx2_ADDR32, THCON_SEC1_REG2_Unpack_if_sel_cntx2_SHAMT, THCON_SEC1_REG2_Unpack_if_sel_cntx2_MASK + +#define THCON_SEC1_REG2_Unpack_if_sel_cntx3_ADDR32 121 +#define THCON_SEC1_REG2_Unpack_if_sel_cntx3_SHAMT 7 +#define THCON_SEC1_REG2_Unpack_if_sel_cntx3_MASK 0x80 +#define THCON_SEC1_REG2_Unpack_if_sel_cntx3_RMW THCON_SEC1_REG2_Unpack_if_sel_cntx3_ADDR32, THCON_SEC1_REG2_Unpack_if_sel_cntx3_SHAMT, THCON_SEC1_REG2_Unpack_if_sel_cntx3_MASK + +#define THCON_SEC1_REG2_Force_shared_exp_ADDR32 121 +#define THCON_SEC1_REG2_Force_shared_exp_SHAMT 8 +#define THCON_SEC1_REG2_Force_shared_exp_MASK 0x100 +#define THCON_SEC1_REG2_Force_shared_exp_RMW THCON_SEC1_REG2_Force_shared_exp_ADDR32, THCON_SEC1_REG2_Force_shared_exp_SHAMT, THCON_SEC1_REG2_Force_shared_exp_MASK + +#define THCON_SEC1_REG2_Context_count_non_log2_ADDR32 121 +#define THCON_SEC1_REG2_Context_count_non_log2_SHAMT 9 +#define THCON_SEC1_REG2_Context_count_non_log2_MASK 0xe00 +#define THCON_SEC1_REG2_Context_count_non_log2_RMW THCON_SEC1_REG2_Context_count_non_log2_ADDR32, THCON_SEC1_REG2_Context_count_non_log2_SHAMT, THCON_SEC1_REG2_Context_count_non_log2_MASK + +#define THCON_SEC1_REG2_Context_count_non_log2_en_ADDR32 121 +#define THCON_SEC1_REG2_Context_count_non_log2_en_SHAMT 12 +#define THCON_SEC1_REG2_Context_count_non_log2_en_MASK 0x1000 +#define THCON_SEC1_REG2_Context_count_non_log2_en_RMW THCON_SEC1_REG2_Context_count_non_log2_en_ADDR32, THCON_SEC1_REG2_Context_count_non_log2_en_SHAMT, THCON_SEC1_REG2_Context_count_non_log2_en_MASK + +#define THCON_SEC1_REG2_Disable_zero_compress_cntx4_ADDR32 121 +#define THCON_SEC1_REG2_Disable_zero_compress_cntx4_SHAMT 16 +#define THCON_SEC1_REG2_Disable_zero_compress_cntx4_MASK 0x10000 +#define THCON_SEC1_REG2_Disable_zero_compress_cntx4_RMW THCON_SEC1_REG2_Disable_zero_compress_cntx4_ADDR32, THCON_SEC1_REG2_Disable_zero_compress_cntx4_SHAMT, THCON_SEC1_REG2_Disable_zero_compress_cntx4_MASK + +#define THCON_SEC1_REG2_Disable_zero_compress_cntx5_ADDR32 121 +#define THCON_SEC1_REG2_Disable_zero_compress_cntx5_SHAMT 17 +#define THCON_SEC1_REG2_Disable_zero_compress_cntx5_MASK 0x20000 +#define THCON_SEC1_REG2_Disable_zero_compress_cntx5_RMW THCON_SEC1_REG2_Disable_zero_compress_cntx5_ADDR32, 
THCON_SEC1_REG2_Disable_zero_compress_cntx5_SHAMT, THCON_SEC1_REG2_Disable_zero_compress_cntx5_MASK + +#define THCON_SEC1_REG2_Disable_zero_compress_cntx6_ADDR32 121 +#define THCON_SEC1_REG2_Disable_zero_compress_cntx6_SHAMT 18 +#define THCON_SEC1_REG2_Disable_zero_compress_cntx6_MASK 0x40000 +#define THCON_SEC1_REG2_Disable_zero_compress_cntx6_RMW THCON_SEC1_REG2_Disable_zero_compress_cntx6_ADDR32, THCON_SEC1_REG2_Disable_zero_compress_cntx6_SHAMT, THCON_SEC1_REG2_Disable_zero_compress_cntx6_MASK + +#define THCON_SEC1_REG2_Disable_zero_compress_cntx7_ADDR32 121 +#define THCON_SEC1_REG2_Disable_zero_compress_cntx7_SHAMT 19 +#define THCON_SEC1_REG2_Disable_zero_compress_cntx7_MASK 0x80000 +#define THCON_SEC1_REG2_Disable_zero_compress_cntx7_RMW THCON_SEC1_REG2_Disable_zero_compress_cntx7_ADDR32, THCON_SEC1_REG2_Disable_zero_compress_cntx7_SHAMT, THCON_SEC1_REG2_Disable_zero_compress_cntx7_MASK + +#define THCON_SEC1_REG2_Unpack_if_sel_cntx4_ADDR32 121 +#define THCON_SEC1_REG2_Unpack_if_sel_cntx4_SHAMT 20 +#define THCON_SEC1_REG2_Unpack_if_sel_cntx4_MASK 0x100000 +#define THCON_SEC1_REG2_Unpack_if_sel_cntx4_RMW THCON_SEC1_REG2_Unpack_if_sel_cntx4_ADDR32, THCON_SEC1_REG2_Unpack_if_sel_cntx4_SHAMT, THCON_SEC1_REG2_Unpack_if_sel_cntx4_MASK + +#define THCON_SEC1_REG2_Unpack_if_sel_cntx5_ADDR32 121 +#define THCON_SEC1_REG2_Unpack_if_sel_cntx5_SHAMT 21 +#define THCON_SEC1_REG2_Unpack_if_sel_cntx5_MASK 0x200000 +#define THCON_SEC1_REG2_Unpack_if_sel_cntx5_RMW THCON_SEC1_REG2_Unpack_if_sel_cntx5_ADDR32, THCON_SEC1_REG2_Unpack_if_sel_cntx5_SHAMT, THCON_SEC1_REG2_Unpack_if_sel_cntx5_MASK + +#define THCON_SEC1_REG2_Unpack_if_sel_cntx6_ADDR32 121 +#define THCON_SEC1_REG2_Unpack_if_sel_cntx6_SHAMT 22 +#define THCON_SEC1_REG2_Unpack_if_sel_cntx6_MASK 0x400000 +#define THCON_SEC1_REG2_Unpack_if_sel_cntx6_RMW THCON_SEC1_REG2_Unpack_if_sel_cntx6_ADDR32, THCON_SEC1_REG2_Unpack_if_sel_cntx6_SHAMT, THCON_SEC1_REG2_Unpack_if_sel_cntx6_MASK + +#define THCON_SEC1_REG2_Unpack_if_sel_cntx7_ADDR32 121 +#define THCON_SEC1_REG2_Unpack_if_sel_cntx7_SHAMT 23 +#define THCON_SEC1_REG2_Unpack_if_sel_cntx7_MASK 0x800000 +#define THCON_SEC1_REG2_Unpack_if_sel_cntx7_RMW THCON_SEC1_REG2_Unpack_if_sel_cntx7_ADDR32, THCON_SEC1_REG2_Unpack_if_sel_cntx7_SHAMT, THCON_SEC1_REG2_Unpack_if_sel_cntx7_MASK + +#define THCON_SEC1_REG2_Metadata_x_end_ADDR32 121 +#define THCON_SEC1_REG2_Metadata_x_end_SHAMT 24 +#define THCON_SEC1_REG2_Metadata_x_end_MASK 0xff000000 +#define THCON_SEC1_REG2_Metadata_x_end_RMW THCON_SEC1_REG2_Metadata_x_end_ADDR32, THCON_SEC1_REG2_Metadata_x_end_SHAMT, THCON_SEC1_REG2_Metadata_x_end_MASK + +#define THCON_SEC1_REG2_Unpack_limit_address_ADDR32 122 +#define THCON_SEC1_REG2_Unpack_limit_address_SHAMT 0 +#define THCON_SEC1_REG2_Unpack_limit_address_MASK 0x1ffff +#define THCON_SEC1_REG2_Unpack_limit_address_RMW THCON_SEC1_REG2_Unpack_limit_address_ADDR32, THCON_SEC1_REG2_Unpack_limit_address_SHAMT, THCON_SEC1_REG2_Unpack_limit_address_MASK + +#define THCON_SEC1_REG2_Unpack_fifo_size_ADDR32 123 +#define THCON_SEC1_REG2_Unpack_fifo_size_SHAMT 0 +#define THCON_SEC1_REG2_Unpack_fifo_size_MASK 0x1ffff +#define THCON_SEC1_REG2_Unpack_fifo_size_RMW THCON_SEC1_REG2_Unpack_fifo_size_ADDR32, THCON_SEC1_REG2_Unpack_fifo_size_SHAMT, THCON_SEC1_REG2_Unpack_fifo_size_MASK + +#define THCON_SEC1_REG3_Base_address_ADDR32 124 +#define THCON_SEC1_REG3_Base_address_SHAMT 0 +#define THCON_SEC1_REG3_Base_address_MASK 0xffffffff +#define THCON_SEC1_REG3_Base_address_RMW THCON_SEC1_REG3_Base_address_ADDR32, 
THCON_SEC1_REG3_Base_address_SHAMT, THCON_SEC1_REG3_Base_address_MASK + +#define THCON_SEC1_REG3_Base_cntx1_address_ADDR32 125 +#define THCON_SEC1_REG3_Base_cntx1_address_SHAMT 0 +#define THCON_SEC1_REG3_Base_cntx1_address_MASK 0xffffffff +#define THCON_SEC1_REG3_Base_cntx1_address_RMW THCON_SEC1_REG3_Base_cntx1_address_ADDR32, THCON_SEC1_REG3_Base_cntx1_address_SHAMT, THCON_SEC1_REG3_Base_cntx1_address_MASK + +#define THCON_SEC1_REG3_Base_cntx2_address_ADDR32 126 +#define THCON_SEC1_REG3_Base_cntx2_address_SHAMT 0 +#define THCON_SEC1_REG3_Base_cntx2_address_MASK 0xffffffff +#define THCON_SEC1_REG3_Base_cntx2_address_RMW THCON_SEC1_REG3_Base_cntx2_address_ADDR32, THCON_SEC1_REG3_Base_cntx2_address_SHAMT, THCON_SEC1_REG3_Base_cntx2_address_MASK + +#define THCON_SEC1_REG3_Base_cntx3_address_ADDR32 127 +#define THCON_SEC1_REG3_Base_cntx3_address_SHAMT 0 +#define THCON_SEC1_REG3_Base_cntx3_address_MASK 0xffffffff +#define THCON_SEC1_REG3_Base_cntx3_address_RMW THCON_SEC1_REG3_Base_cntx3_address_ADDR32, THCON_SEC1_REG3_Base_cntx3_address_SHAMT, THCON_SEC1_REG3_Base_cntx3_address_MASK + +#define THCON_SEC1_REG4_Base_cntx4_address_ADDR32 128 +#define THCON_SEC1_REG4_Base_cntx4_address_SHAMT 0 +#define THCON_SEC1_REG4_Base_cntx4_address_MASK 0xffffffff +#define THCON_SEC1_REG4_Base_cntx4_address_RMW THCON_SEC1_REG4_Base_cntx4_address_ADDR32, THCON_SEC1_REG4_Base_cntx4_address_SHAMT, THCON_SEC1_REG4_Base_cntx4_address_MASK + +#define THCON_SEC1_REG4_Base_cntx5_address_ADDR32 129 +#define THCON_SEC1_REG4_Base_cntx5_address_SHAMT 0 +#define THCON_SEC1_REG4_Base_cntx5_address_MASK 0xffffffff +#define THCON_SEC1_REG4_Base_cntx5_address_RMW THCON_SEC1_REG4_Base_cntx5_address_ADDR32, THCON_SEC1_REG4_Base_cntx5_address_SHAMT, THCON_SEC1_REG4_Base_cntx5_address_MASK + +#define THCON_SEC1_REG4_Base_cntx6_address_ADDR32 130 +#define THCON_SEC1_REG4_Base_cntx6_address_SHAMT 0 +#define THCON_SEC1_REG4_Base_cntx6_address_MASK 0xffffffff +#define THCON_SEC1_REG4_Base_cntx6_address_RMW THCON_SEC1_REG4_Base_cntx6_address_ADDR32, THCON_SEC1_REG4_Base_cntx6_address_SHAMT, THCON_SEC1_REG4_Base_cntx6_address_MASK + +#define THCON_SEC1_REG4_Base_cntx7_address_ADDR32 131 +#define THCON_SEC1_REG4_Base_cntx7_address_SHAMT 0 +#define THCON_SEC1_REG4_Base_cntx7_address_MASK 0xffffffff +#define THCON_SEC1_REG4_Base_cntx7_address_RMW THCON_SEC1_REG4_Base_cntx7_address_ADDR32, THCON_SEC1_REG4_Base_cntx7_address_SHAMT, THCON_SEC1_REG4_Base_cntx7_address_MASK + +#define THCON_SEC1_REG5_Dest_cntx0_address_ADDR32 132 +#define THCON_SEC1_REG5_Dest_cntx0_address_SHAMT 0 +#define THCON_SEC1_REG5_Dest_cntx0_address_MASK 0xffff +#define THCON_SEC1_REG5_Dest_cntx0_address_RMW THCON_SEC1_REG5_Dest_cntx0_address_ADDR32, THCON_SEC1_REG5_Dest_cntx0_address_SHAMT, THCON_SEC1_REG5_Dest_cntx0_address_MASK + +#define THCON_SEC1_REG5_Dest_cntx1_address_ADDR32 132 +#define THCON_SEC1_REG5_Dest_cntx1_address_SHAMT 16 +#define THCON_SEC1_REG5_Dest_cntx1_address_MASK 0xffff0000 +#define THCON_SEC1_REG5_Dest_cntx1_address_RMW THCON_SEC1_REG5_Dest_cntx1_address_ADDR32, THCON_SEC1_REG5_Dest_cntx1_address_SHAMT, THCON_SEC1_REG5_Dest_cntx1_address_MASK + +#define THCON_SEC1_REG5_Dest_cntx2_address_ADDR32 133 +#define THCON_SEC1_REG5_Dest_cntx2_address_SHAMT 0 +#define THCON_SEC1_REG5_Dest_cntx2_address_MASK 0xffff +#define THCON_SEC1_REG5_Dest_cntx2_address_RMW THCON_SEC1_REG5_Dest_cntx2_address_ADDR32, THCON_SEC1_REG5_Dest_cntx2_address_SHAMT, THCON_SEC1_REG5_Dest_cntx2_address_MASK + +#define THCON_SEC1_REG5_Dest_cntx3_address_ADDR32 133 +#define 
THCON_SEC1_REG5_Dest_cntx3_address_SHAMT 16 +#define THCON_SEC1_REG5_Dest_cntx3_address_MASK 0xffff0000 +#define THCON_SEC1_REG5_Dest_cntx3_address_RMW THCON_SEC1_REG5_Dest_cntx3_address_ADDR32, THCON_SEC1_REG5_Dest_cntx3_address_SHAMT, THCON_SEC1_REG5_Dest_cntx3_address_MASK + +#define THCON_SEC1_REG5_Tile_x_dim_cntx0_ADDR32 134 +#define THCON_SEC1_REG5_Tile_x_dim_cntx0_SHAMT 0 +#define THCON_SEC1_REG5_Tile_x_dim_cntx0_MASK 0xffff +#define THCON_SEC1_REG5_Tile_x_dim_cntx0_RMW THCON_SEC1_REG5_Tile_x_dim_cntx0_ADDR32, THCON_SEC1_REG5_Tile_x_dim_cntx0_SHAMT, THCON_SEC1_REG5_Tile_x_dim_cntx0_MASK + +#define THCON_SEC1_REG5_Tile_x_dim_cntx1_ADDR32 134 +#define THCON_SEC1_REG5_Tile_x_dim_cntx1_SHAMT 16 +#define THCON_SEC1_REG5_Tile_x_dim_cntx1_MASK 0xffff0000 +#define THCON_SEC1_REG5_Tile_x_dim_cntx1_RMW THCON_SEC1_REG5_Tile_x_dim_cntx1_ADDR32, THCON_SEC1_REG5_Tile_x_dim_cntx1_SHAMT, THCON_SEC1_REG5_Tile_x_dim_cntx1_MASK + +#define THCON_SEC1_REG5_Tile_x_dim_cntx2_ADDR32 135 +#define THCON_SEC1_REG5_Tile_x_dim_cntx2_SHAMT 0 +#define THCON_SEC1_REG5_Tile_x_dim_cntx2_MASK 0xffff +#define THCON_SEC1_REG5_Tile_x_dim_cntx2_RMW THCON_SEC1_REG5_Tile_x_dim_cntx2_ADDR32, THCON_SEC1_REG5_Tile_x_dim_cntx2_SHAMT, THCON_SEC1_REG5_Tile_x_dim_cntx2_MASK + +#define THCON_SEC1_REG5_Tile_x_dim_cntx3_ADDR32 135 +#define THCON_SEC1_REG5_Tile_x_dim_cntx3_SHAMT 16 +#define THCON_SEC1_REG5_Tile_x_dim_cntx3_MASK 0xffff0000 +#define THCON_SEC1_REG5_Tile_x_dim_cntx3_RMW THCON_SEC1_REG5_Tile_x_dim_cntx3_ADDR32, THCON_SEC1_REG5_Tile_x_dim_cntx3_SHAMT, THCON_SEC1_REG5_Tile_x_dim_cntx3_MASK + +#define THCON_SEC1_REG6_Source_address_ADDR32 136 +#define THCON_SEC1_REG6_Source_address_SHAMT 0 +#define THCON_SEC1_REG6_Source_address_MASK 0xffffffff +#define THCON_SEC1_REG6_Source_address_RMW THCON_SEC1_REG6_Source_address_ADDR32, THCON_SEC1_REG6_Source_address_SHAMT, THCON_SEC1_REG6_Source_address_MASK + +#define THCON_SEC1_REG6_Destination_address_ADDR32 137 +#define THCON_SEC1_REG6_Destination_address_SHAMT 0 +#define THCON_SEC1_REG6_Destination_address_MASK 0xffffffff +#define THCON_SEC1_REG6_Destination_address_RMW THCON_SEC1_REG6_Destination_address_ADDR32, THCON_SEC1_REG6_Destination_address_SHAMT, THCON_SEC1_REG6_Destination_address_MASK + +#define THCON_SEC1_REG6_Buffer_size_ADDR32 138 +#define THCON_SEC1_REG6_Buffer_size_SHAMT 0 +#define THCON_SEC1_REG6_Buffer_size_MASK 0x3fffffff +#define THCON_SEC1_REG6_Buffer_size_RMW THCON_SEC1_REG6_Buffer_size_ADDR32, THCON_SEC1_REG6_Buffer_size_SHAMT, THCON_SEC1_REG6_Buffer_size_MASK + +#define THCON_SEC1_REG6_Transfer_direction_ADDR32 138 +#define THCON_SEC1_REG6_Transfer_direction_SHAMT 30 +#define THCON_SEC1_REG6_Transfer_direction_MASK 0xc0000000 +#define THCON_SEC1_REG6_Transfer_direction_RMW THCON_SEC1_REG6_Transfer_direction_ADDR32, THCON_SEC1_REG6_Transfer_direction_SHAMT, THCON_SEC1_REG6_Transfer_direction_MASK + +#define THCON_SEC1_REG6_Metadata_misc_ADDR32 139 +#define THCON_SEC1_REG6_Metadata_misc_SHAMT 0 +#define THCON_SEC1_REG6_Metadata_misc_MASK 0xffffffff +#define THCON_SEC1_REG6_Metadata_misc_RMW THCON_SEC1_REG6_Metadata_misc_ADDR32, THCON_SEC1_REG6_Metadata_misc_SHAMT, THCON_SEC1_REG6_Metadata_misc_MASK + +#define THCON_SEC1_REG7_Offset_address_ADDR32 140 +#define THCON_SEC1_REG7_Offset_address_SHAMT 0 +#define THCON_SEC1_REG7_Offset_address_MASK 0xffff +#define THCON_SEC1_REG7_Offset_address_RMW THCON_SEC1_REG7_Offset_address_ADDR32, THCON_SEC1_REG7_Offset_address_SHAMT, THCON_SEC1_REG7_Offset_address_MASK + +#define 
THCON_SEC1_REG7_Unpack_data_format_cntx0_ADDR32 140 +#define THCON_SEC1_REG7_Unpack_data_format_cntx0_SHAMT 16 +#define THCON_SEC1_REG7_Unpack_data_format_cntx0_MASK 0xf0000 +#define THCON_SEC1_REG7_Unpack_data_format_cntx0_RMW THCON_SEC1_REG7_Unpack_data_format_cntx0_ADDR32, THCON_SEC1_REG7_Unpack_data_format_cntx0_SHAMT, THCON_SEC1_REG7_Unpack_data_format_cntx0_MASK + +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx0_ADDR32 140 +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx0_SHAMT 20 +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx0_MASK 0xf00000 +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx0_RMW THCON_SEC1_REG7_Unpack_out_data_format_cntx0_ADDR32, THCON_SEC1_REG7_Unpack_out_data_format_cntx0_SHAMT, THCON_SEC1_REG7_Unpack_out_data_format_cntx0_MASK + +#define THCON_SEC1_REG7_Unpack_data_format_cntx4_ADDR32 140 +#define THCON_SEC1_REG7_Unpack_data_format_cntx4_SHAMT 24 +#define THCON_SEC1_REG7_Unpack_data_format_cntx4_MASK 0xf000000 +#define THCON_SEC1_REG7_Unpack_data_format_cntx4_RMW THCON_SEC1_REG7_Unpack_data_format_cntx4_ADDR32, THCON_SEC1_REG7_Unpack_data_format_cntx4_SHAMT, THCON_SEC1_REG7_Unpack_data_format_cntx4_MASK + +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx4_ADDR32 140 +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx4_SHAMT 28 +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx4_MASK 0xf0000000 +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx4_RMW THCON_SEC1_REG7_Unpack_out_data_format_cntx4_ADDR32, THCON_SEC1_REG7_Unpack_out_data_format_cntx4_SHAMT, THCON_SEC1_REG7_Unpack_out_data_format_cntx4_MASK + +#define THCON_SEC1_REG7_Offset_cntx1_address_ADDR32 141 +#define THCON_SEC1_REG7_Offset_cntx1_address_SHAMT 0 +#define THCON_SEC1_REG7_Offset_cntx1_address_MASK 0xffff +#define THCON_SEC1_REG7_Offset_cntx1_address_RMW THCON_SEC1_REG7_Offset_cntx1_address_ADDR32, THCON_SEC1_REG7_Offset_cntx1_address_SHAMT, THCON_SEC1_REG7_Offset_cntx1_address_MASK + +#define THCON_SEC1_REG7_Unpack_data_format_cntx1_ADDR32 141 +#define THCON_SEC1_REG7_Unpack_data_format_cntx1_SHAMT 16 +#define THCON_SEC1_REG7_Unpack_data_format_cntx1_MASK 0xf0000 +#define THCON_SEC1_REG7_Unpack_data_format_cntx1_RMW THCON_SEC1_REG7_Unpack_data_format_cntx1_ADDR32, THCON_SEC1_REG7_Unpack_data_format_cntx1_SHAMT, THCON_SEC1_REG7_Unpack_data_format_cntx1_MASK + +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx1_ADDR32 141 +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx1_SHAMT 20 +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx1_MASK 0xf00000 +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx1_RMW THCON_SEC1_REG7_Unpack_out_data_format_cntx1_ADDR32, THCON_SEC1_REG7_Unpack_out_data_format_cntx1_SHAMT, THCON_SEC1_REG7_Unpack_out_data_format_cntx1_MASK + +#define THCON_SEC1_REG7_Unpack_data_format_cntx5_ADDR32 141 +#define THCON_SEC1_REG7_Unpack_data_format_cntx5_SHAMT 24 +#define THCON_SEC1_REG7_Unpack_data_format_cntx5_MASK 0xf000000 +#define THCON_SEC1_REG7_Unpack_data_format_cntx5_RMW THCON_SEC1_REG7_Unpack_data_format_cntx5_ADDR32, THCON_SEC1_REG7_Unpack_data_format_cntx5_SHAMT, THCON_SEC1_REG7_Unpack_data_format_cntx5_MASK + +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx5_ADDR32 141 +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx5_SHAMT 28 +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx5_MASK 0xf0000000 +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx5_RMW THCON_SEC1_REG7_Unpack_out_data_format_cntx5_ADDR32, THCON_SEC1_REG7_Unpack_out_data_format_cntx5_SHAMT, THCON_SEC1_REG7_Unpack_out_data_format_cntx5_MASK + +#define 
THCON_SEC1_REG7_Offset_cntx2_address_ADDR32 142 +#define THCON_SEC1_REG7_Offset_cntx2_address_SHAMT 0 +#define THCON_SEC1_REG7_Offset_cntx2_address_MASK 0xffff +#define THCON_SEC1_REG7_Offset_cntx2_address_RMW THCON_SEC1_REG7_Offset_cntx2_address_ADDR32, THCON_SEC1_REG7_Offset_cntx2_address_SHAMT, THCON_SEC1_REG7_Offset_cntx2_address_MASK + +#define THCON_SEC1_REG7_Unpack_data_format_cntx2_ADDR32 142 +#define THCON_SEC1_REG7_Unpack_data_format_cntx2_SHAMT 16 +#define THCON_SEC1_REG7_Unpack_data_format_cntx2_MASK 0xf0000 +#define THCON_SEC1_REG7_Unpack_data_format_cntx2_RMW THCON_SEC1_REG7_Unpack_data_format_cntx2_ADDR32, THCON_SEC1_REG7_Unpack_data_format_cntx2_SHAMT, THCON_SEC1_REG7_Unpack_data_format_cntx2_MASK + +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx2_ADDR32 142 +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx2_SHAMT 20 +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx2_MASK 0xf00000 +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx2_RMW THCON_SEC1_REG7_Unpack_out_data_format_cntx2_ADDR32, THCON_SEC1_REG7_Unpack_out_data_format_cntx2_SHAMT, THCON_SEC1_REG7_Unpack_out_data_format_cntx2_MASK + +#define THCON_SEC1_REG7_Unpack_data_format_cntx6_ADDR32 142 +#define THCON_SEC1_REG7_Unpack_data_format_cntx6_SHAMT 24 +#define THCON_SEC1_REG7_Unpack_data_format_cntx6_MASK 0xf000000 +#define THCON_SEC1_REG7_Unpack_data_format_cntx6_RMW THCON_SEC1_REG7_Unpack_data_format_cntx6_ADDR32, THCON_SEC1_REG7_Unpack_data_format_cntx6_SHAMT, THCON_SEC1_REG7_Unpack_data_format_cntx6_MASK + +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx6_ADDR32 142 +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx6_SHAMT 28 +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx6_MASK 0xf0000000 +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx6_RMW THCON_SEC1_REG7_Unpack_out_data_format_cntx6_ADDR32, THCON_SEC1_REG7_Unpack_out_data_format_cntx6_SHAMT, THCON_SEC1_REG7_Unpack_out_data_format_cntx6_MASK + +#define THCON_SEC1_REG7_Offset_cntx3_address_ADDR32 143 +#define THCON_SEC1_REG7_Offset_cntx3_address_SHAMT 0 +#define THCON_SEC1_REG7_Offset_cntx3_address_MASK 0xffff +#define THCON_SEC1_REG7_Offset_cntx3_address_RMW THCON_SEC1_REG7_Offset_cntx3_address_ADDR32, THCON_SEC1_REG7_Offset_cntx3_address_SHAMT, THCON_SEC1_REG7_Offset_cntx3_address_MASK + +#define THCON_SEC1_REG7_Unpack_data_format_cntx3_ADDR32 143 +#define THCON_SEC1_REG7_Unpack_data_format_cntx3_SHAMT 16 +#define THCON_SEC1_REG7_Unpack_data_format_cntx3_MASK 0xf0000 +#define THCON_SEC1_REG7_Unpack_data_format_cntx3_RMW THCON_SEC1_REG7_Unpack_data_format_cntx3_ADDR32, THCON_SEC1_REG7_Unpack_data_format_cntx3_SHAMT, THCON_SEC1_REG7_Unpack_data_format_cntx3_MASK + +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx3_ADDR32 143 +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx3_SHAMT 20 +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx3_MASK 0xf00000 +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx3_RMW THCON_SEC1_REG7_Unpack_out_data_format_cntx3_ADDR32, THCON_SEC1_REG7_Unpack_out_data_format_cntx3_SHAMT, THCON_SEC1_REG7_Unpack_out_data_format_cntx3_MASK + +#define THCON_SEC1_REG7_Unpack_data_format_cntx7_ADDR32 143 +#define THCON_SEC1_REG7_Unpack_data_format_cntx7_SHAMT 24 +#define THCON_SEC1_REG7_Unpack_data_format_cntx7_MASK 0xf000000 +#define THCON_SEC1_REG7_Unpack_data_format_cntx7_RMW THCON_SEC1_REG7_Unpack_data_format_cntx7_ADDR32, THCON_SEC1_REG7_Unpack_data_format_cntx7_SHAMT, THCON_SEC1_REG7_Unpack_data_format_cntx7_MASK + +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx7_ADDR32 143 +#define 
THCON_SEC1_REG7_Unpack_out_data_format_cntx7_SHAMT 28 +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx7_MASK 0xf0000000 +#define THCON_SEC1_REG7_Unpack_out_data_format_cntx7_RMW THCON_SEC1_REG7_Unpack_out_data_format_cntx7_ADDR32, THCON_SEC1_REG7_Unpack_out_data_format_cntx7_SHAMT, THCON_SEC1_REG7_Unpack_out_data_format_cntx7_MASK + +#define THCON_SEC1_REG8_Row_start_section_size_ADDR32 144 +#define THCON_SEC1_REG8_Row_start_section_size_SHAMT 0 +#define THCON_SEC1_REG8_Row_start_section_size_MASK 0xffff +#define THCON_SEC1_REG8_Row_start_section_size_RMW THCON_SEC1_REG8_Row_start_section_size_ADDR32, THCON_SEC1_REG8_Row_start_section_size_SHAMT, THCON_SEC1_REG8_Row_start_section_size_MASK + +#define THCON_SEC1_REG8_Exp_section_size_ADDR32 144 +#define THCON_SEC1_REG8_Exp_section_size_SHAMT 16 +#define THCON_SEC1_REG8_Exp_section_size_MASK 0xffff0000 +#define THCON_SEC1_REG8_Exp_section_size_RMW THCON_SEC1_REG8_Exp_section_size_ADDR32, THCON_SEC1_REG8_Exp_section_size_SHAMT, THCON_SEC1_REG8_Exp_section_size_MASK + +#define THCON_SEC1_REG8_L1_Dest_addr_ADDR32 145 +#define THCON_SEC1_REG8_L1_Dest_addr_SHAMT 0 +#define THCON_SEC1_REG8_L1_Dest_addr_MASK 0xffffffff +#define THCON_SEC1_REG8_L1_Dest_addr_RMW THCON_SEC1_REG8_L1_Dest_addr_ADDR32, THCON_SEC1_REG8_L1_Dest_addr_SHAMT, THCON_SEC1_REG8_L1_Dest_addr_MASK + +#define THCON_SEC1_REG8_Disable_zero_compress_ADDR32 146 +#define THCON_SEC1_REG8_Disable_zero_compress_SHAMT 0 +#define THCON_SEC1_REG8_Disable_zero_compress_MASK 0x1 +#define THCON_SEC1_REG8_Disable_zero_compress_RMW THCON_SEC1_REG8_Disable_zero_compress_ADDR32, THCON_SEC1_REG8_Disable_zero_compress_SHAMT, THCON_SEC1_REG8_Disable_zero_compress_MASK + +#define THCON_SEC1_REG8_Add_l1_dest_addr_offset_ADDR32 146 +#define THCON_SEC1_REG8_Add_l1_dest_addr_offset_SHAMT 1 +#define THCON_SEC1_REG8_Add_l1_dest_addr_offset_MASK 0x2 +#define THCON_SEC1_REG8_Add_l1_dest_addr_offset_RMW THCON_SEC1_REG8_Add_l1_dest_addr_offset_ADDR32, THCON_SEC1_REG8_Add_l1_dest_addr_offset_SHAMT, THCON_SEC1_REG8_Add_l1_dest_addr_offset_MASK + +#define THCON_SEC1_REG8_Disable_pack_zero_flags_ADDR32 146 +#define THCON_SEC1_REG8_Disable_pack_zero_flags_SHAMT 2 +#define THCON_SEC1_REG8_Disable_pack_zero_flags_MASK 0x4 +#define THCON_SEC1_REG8_Disable_pack_zero_flags_RMW THCON_SEC1_REG8_Disable_pack_zero_flags_ADDR32, THCON_SEC1_REG8_Disable_pack_zero_flags_SHAMT, THCON_SEC1_REG8_Disable_pack_zero_flags_MASK + +#define THCON_SEC1_REG8_Unused1_ADDR32 146 +#define THCON_SEC1_REG8_Unused1_SHAMT 3 +#define THCON_SEC1_REG8_Unused1_MASK 0x8 +#define THCON_SEC1_REG8_Unused1_RMW THCON_SEC1_REG8_Unused1_ADDR32, THCON_SEC1_REG8_Unused1_SHAMT, THCON_SEC1_REG8_Unused1_MASK + +#define THCON_SEC1_REG8_Out_data_format_ADDR32 146 +#define THCON_SEC1_REG8_Out_data_format_SHAMT 4 +#define THCON_SEC1_REG8_Out_data_format_MASK 0xf0 +#define THCON_SEC1_REG8_Out_data_format_RMW THCON_SEC1_REG8_Out_data_format_ADDR32, THCON_SEC1_REG8_Out_data_format_SHAMT, THCON_SEC1_REG8_Out_data_format_MASK + +#define THCON_SEC1_REG8_In_data_format_ADDR32 146 +#define THCON_SEC1_REG8_In_data_format_SHAMT 8 +#define THCON_SEC1_REG8_In_data_format_MASK 0xf00 +#define THCON_SEC1_REG8_In_data_format_RMW THCON_SEC1_REG8_In_data_format_ADDR32, THCON_SEC1_REG8_In_data_format_SHAMT, THCON_SEC1_REG8_In_data_format_MASK + +#define THCON_SEC1_REG8_Dis_shared_exp_assembler_ADDR32 146 +#define THCON_SEC1_REG8_Dis_shared_exp_assembler_SHAMT 12 +#define THCON_SEC1_REG8_Dis_shared_exp_assembler_MASK 0x1000 +#define THCON_SEC1_REG8_Dis_shared_exp_assembler_RMW 
THCON_SEC1_REG8_Dis_shared_exp_assembler_ADDR32, THCON_SEC1_REG8_Dis_shared_exp_assembler_SHAMT, THCON_SEC1_REG8_Dis_shared_exp_assembler_MASK + +#define THCON_SEC1_REG8_Auto_set_last_pacr_intf_sel_ADDR32 146 +#define THCON_SEC1_REG8_Auto_set_last_pacr_intf_sel_SHAMT 13 +#define THCON_SEC1_REG8_Auto_set_last_pacr_intf_sel_MASK 0x2000 +#define THCON_SEC1_REG8_Auto_set_last_pacr_intf_sel_RMW THCON_SEC1_REG8_Auto_set_last_pacr_intf_sel_ADDR32, THCON_SEC1_REG8_Auto_set_last_pacr_intf_sel_SHAMT, THCON_SEC1_REG8_Auto_set_last_pacr_intf_sel_MASK + +#define THCON_SEC1_REG8_Enable_out_fifo_ADDR32 146 +#define THCON_SEC1_REG8_Enable_out_fifo_SHAMT 14 +#define THCON_SEC1_REG8_Enable_out_fifo_MASK 0x4000 +#define THCON_SEC1_REG8_Enable_out_fifo_RMW THCON_SEC1_REG8_Enable_out_fifo_ADDR32, THCON_SEC1_REG8_Enable_out_fifo_SHAMT, THCON_SEC1_REG8_Enable_out_fifo_MASK + +#define THCON_SEC1_REG8_Sub_l1_tile_header_size_ADDR32 146 +#define THCON_SEC1_REG8_Sub_l1_tile_header_size_SHAMT 15 +#define THCON_SEC1_REG8_Sub_l1_tile_header_size_MASK 0x8000 +#define THCON_SEC1_REG8_Sub_l1_tile_header_size_RMW THCON_SEC1_REG8_Sub_l1_tile_header_size_ADDR32, THCON_SEC1_REG8_Sub_l1_tile_header_size_SHAMT, THCON_SEC1_REG8_Sub_l1_tile_header_size_MASK + +#define THCON_SEC1_REG8_Source_interface_selection_ADDR32 146 +#define THCON_SEC1_REG8_Source_interface_selection_SHAMT 16 +#define THCON_SEC1_REG8_Source_interface_selection_MASK 0x10000 +#define THCON_SEC1_REG8_Source_interface_selection_RMW THCON_SEC1_REG8_Source_interface_selection_ADDR32, THCON_SEC1_REG8_Source_interface_selection_SHAMT, THCON_SEC1_REG8_Source_interface_selection_MASK + +#define THCON_SEC1_REG8_Add_tile_header_size_ADDR32 146 +#define THCON_SEC1_REG8_Add_tile_header_size_SHAMT 17 +#define THCON_SEC1_REG8_Add_tile_header_size_MASK 0x20000 +#define THCON_SEC1_REG8_Add_tile_header_size_RMW THCON_SEC1_REG8_Add_tile_header_size_ADDR32, THCON_SEC1_REG8_Add_tile_header_size_SHAMT, THCON_SEC1_REG8_Add_tile_header_size_MASK + +#define THCON_SEC1_REG8_pack_dis_y_pos_start_offset_ADDR32 146 +#define THCON_SEC1_REG8_pack_dis_y_pos_start_offset_SHAMT 18 +#define THCON_SEC1_REG8_pack_dis_y_pos_start_offset_MASK 0x40000 +#define THCON_SEC1_REG8_pack_dis_y_pos_start_offset_RMW THCON_SEC1_REG8_pack_dis_y_pos_start_offset_ADDR32, THCON_SEC1_REG8_pack_dis_y_pos_start_offset_SHAMT, THCON_SEC1_REG8_pack_dis_y_pos_start_offset_MASK + +#define THCON_SEC1_REG8_unpack_tile_offset_ADDR32 146 +#define THCON_SEC1_REG8_unpack_tile_offset_SHAMT 19 +#define THCON_SEC1_REG8_unpack_tile_offset_MASK 0xf80000 +#define THCON_SEC1_REG8_unpack_tile_offset_RMW THCON_SEC1_REG8_unpack_tile_offset_ADDR32, THCON_SEC1_REG8_unpack_tile_offset_SHAMT, THCON_SEC1_REG8_unpack_tile_offset_MASK + +#define THCON_SEC1_REG8_L1_source_addr_ADDR32 146 +#define THCON_SEC1_REG8_L1_source_addr_SHAMT 24 +#define THCON_SEC1_REG8_L1_source_addr_MASK 0xff000000 +#define THCON_SEC1_REG8_L1_source_addr_RMW THCON_SEC1_REG8_L1_source_addr_ADDR32, THCON_SEC1_REG8_L1_source_addr_SHAMT, THCON_SEC1_REG8_L1_source_addr_MASK + +#define THCON_SEC1_REG8_Downsample_mask_ADDR32 147 +#define THCON_SEC1_REG8_Downsample_mask_SHAMT 0 +#define THCON_SEC1_REG8_Downsample_mask_MASK 0xffff +#define THCON_SEC1_REG8_Downsample_mask_RMW THCON_SEC1_REG8_Downsample_mask_ADDR32, THCON_SEC1_REG8_Downsample_mask_SHAMT, THCON_SEC1_REG8_Downsample_mask_MASK + +#define THCON_SEC1_REG8_Downsample_rate_ADDR32 147 +#define THCON_SEC1_REG8_Downsample_rate_SHAMT 16 +#define THCON_SEC1_REG8_Downsample_rate_MASK 0x70000 +#define 
THCON_SEC1_REG8_Downsample_rate_RMW THCON_SEC1_REG8_Downsample_rate_ADDR32, THCON_SEC1_REG8_Downsample_rate_SHAMT, THCON_SEC1_REG8_Downsample_rate_MASK + +#define THCON_SEC1_REG8_Pack_L1_Acc_ADDR32 147 +#define THCON_SEC1_REG8_Pack_L1_Acc_SHAMT 19 +#define THCON_SEC1_REG8_Pack_L1_Acc_MASK 0x80000 +#define THCON_SEC1_REG8_Pack_L1_Acc_RMW THCON_SEC1_REG8_Pack_L1_Acc_ADDR32, THCON_SEC1_REG8_Pack_L1_Acc_SHAMT, THCON_SEC1_REG8_Pack_L1_Acc_MASK + +#define THCON_SEC1_REG8_Exp_threshold_en_ADDR32 147 +#define THCON_SEC1_REG8_Exp_threshold_en_SHAMT 20 +#define THCON_SEC1_REG8_Exp_threshold_en_MASK 0x100000 +#define THCON_SEC1_REG8_Exp_threshold_en_RMW THCON_SEC1_REG8_Exp_threshold_en_ADDR32, THCON_SEC1_REG8_Exp_threshold_en_SHAMT, THCON_SEC1_REG8_Exp_threshold_en_MASK + +#define THCON_SEC1_REG8_Exp_threshold_ADDR32 147 +#define THCON_SEC1_REG8_Exp_threshold_SHAMT 24 +#define THCON_SEC1_REG8_Exp_threshold_MASK 0xff000000 +#define THCON_SEC1_REG8_Exp_threshold_RMW THCON_SEC1_REG8_Exp_threshold_ADDR32, THCON_SEC1_REG8_Exp_threshold_SHAMT, THCON_SEC1_REG8_Exp_threshold_MASK + +#define THCON_SEC1_REG9_Pack_0_2_limit_address_ADDR32 148 +#define THCON_SEC1_REG9_Pack_0_2_limit_address_SHAMT 0 +#define THCON_SEC1_REG9_Pack_0_2_limit_address_MASK 0x1ffff +#define THCON_SEC1_REG9_Pack_0_2_limit_address_RMW THCON_SEC1_REG9_Pack_0_2_limit_address_ADDR32, THCON_SEC1_REG9_Pack_0_2_limit_address_SHAMT, THCON_SEC1_REG9_Pack_0_2_limit_address_MASK + +#define THCON_SEC1_REG9_Pack_0_2_fifo_size_ADDR32 149 +#define THCON_SEC1_REG9_Pack_0_2_fifo_size_SHAMT 0 +#define THCON_SEC1_REG9_Pack_0_2_fifo_size_MASK 0x1ffff +#define THCON_SEC1_REG9_Pack_0_2_fifo_size_RMW THCON_SEC1_REG9_Pack_0_2_fifo_size_ADDR32, THCON_SEC1_REG9_Pack_0_2_fifo_size_SHAMT, THCON_SEC1_REG9_Pack_0_2_fifo_size_MASK + +#define THCON_SEC1_REG9_Pack_1_3_limit_address_ADDR32 150 +#define THCON_SEC1_REG9_Pack_1_3_limit_address_SHAMT 0 +#define THCON_SEC1_REG9_Pack_1_3_limit_address_MASK 0x1ffff +#define THCON_SEC1_REG9_Pack_1_3_limit_address_RMW THCON_SEC1_REG9_Pack_1_3_limit_address_ADDR32, THCON_SEC1_REG9_Pack_1_3_limit_address_SHAMT, THCON_SEC1_REG9_Pack_1_3_limit_address_MASK + +#define THCON_SEC1_REG9_Pack_1_3_fifo_size_ADDR32 151 +#define THCON_SEC1_REG9_Pack_1_3_fifo_size_SHAMT 0 +#define THCON_SEC1_REG9_Pack_1_3_fifo_size_MASK 0x1ffff +#define THCON_SEC1_REG9_Pack_1_3_fifo_size_RMW THCON_SEC1_REG9_Pack_1_3_fifo_size_ADDR32, THCON_SEC1_REG9_Pack_1_3_fifo_size_SHAMT, THCON_SEC1_REG9_Pack_1_3_fifo_size_MASK + +#define THCON_SEC1_REG10_Unpack_limit_address_ADDR32 152 +#define THCON_SEC1_REG10_Unpack_limit_address_SHAMT 0 +#define THCON_SEC1_REG10_Unpack_limit_address_MASK 0x1ffff +#define THCON_SEC1_REG10_Unpack_limit_address_RMW THCON_SEC1_REG10_Unpack_limit_address_ADDR32, THCON_SEC1_REG10_Unpack_limit_address_SHAMT, THCON_SEC1_REG10_Unpack_limit_address_MASK + +#define THCON_SEC1_REG10_Unpack_fifo_size_ADDR32 153 +#define THCON_SEC1_REG10_Unpack_fifo_size_SHAMT 0 +#define THCON_SEC1_REG10_Unpack_fifo_size_MASK 0x1ffff +#define THCON_SEC1_REG10_Unpack_fifo_size_RMW THCON_SEC1_REG10_Unpack_fifo_size_ADDR32, THCON_SEC1_REG10_Unpack_fifo_size_SHAMT, THCON_SEC1_REG10_Unpack_fifo_size_MASK + +#define THCON_SEC1_REG10_Unpack_limit_address_en_ADDR32 153 +#define THCON_SEC1_REG10_Unpack_limit_address_en_SHAMT 17 +#define THCON_SEC1_REG10_Unpack_limit_address_en_MASK 0x20000 +#define THCON_SEC1_REG10_Unpack_limit_address_en_RMW THCON_SEC1_REG10_Unpack_limit_address_en_ADDR32, THCON_SEC1_REG10_Unpack_limit_address_en_SHAMT, 
THCON_SEC1_REG10_Unpack_limit_address_en_MASK + +#define THCON_SEC1_REG10_Unpacker_Reg_Wr_Addr_ADDR32 154 +#define THCON_SEC1_REG10_Unpacker_Reg_Wr_Addr_SHAMT 0 +#define THCON_SEC1_REG10_Unpacker_Reg_Wr_Addr_MASK 0xffffff +#define THCON_SEC1_REG10_Unpacker_Reg_Wr_Addr_RMW THCON_SEC1_REG10_Unpacker_Reg_Wr_Addr_ADDR32, THCON_SEC1_REG10_Unpacker_Reg_Wr_Addr_SHAMT, THCON_SEC1_REG10_Unpacker_Reg_Wr_Addr_MASK + +#define THCON_SEC1_REG10_Packer_Reg_Wr_Addr_ADDR32 155 +#define THCON_SEC1_REG10_Packer_Reg_Wr_Addr_SHAMT 0 +#define THCON_SEC1_REG10_Packer_Reg_Wr_Addr_MASK 0xffffff +#define THCON_SEC1_REG10_Packer_Reg_Wr_Addr_RMW THCON_SEC1_REG10_Packer_Reg_Wr_Addr_ADDR32, THCON_SEC1_REG10_Packer_Reg_Wr_Addr_SHAMT, THCON_SEC1_REG10_Packer_Reg_Wr_Addr_MASK + +#define THCON_SEC1_REG11_Metadata_l1_addr_ADDR32 156 +#define THCON_SEC1_REG11_Metadata_l1_addr_SHAMT 0 +#define THCON_SEC1_REG11_Metadata_l1_addr_MASK 0xffffffff +#define THCON_SEC1_REG11_Metadata_l1_addr_RMW THCON_SEC1_REG11_Metadata_l1_addr_ADDR32, THCON_SEC1_REG11_Metadata_l1_addr_SHAMT, THCON_SEC1_REG11_Metadata_l1_addr_MASK + +#define THCON_SEC1_REG11_Metadata_limit_addr_ADDR32 157 +#define THCON_SEC1_REG11_Metadata_limit_addr_SHAMT 0 +#define THCON_SEC1_REG11_Metadata_limit_addr_MASK 0xffffffff +#define THCON_SEC1_REG11_Metadata_limit_addr_RMW THCON_SEC1_REG11_Metadata_limit_addr_ADDR32, THCON_SEC1_REG11_Metadata_limit_addr_SHAMT, THCON_SEC1_REG11_Metadata_limit_addr_MASK + +#define THCON_SEC1_REG11_Metadata_fifo_size_ADDR32 158 +#define THCON_SEC1_REG11_Metadata_fifo_size_SHAMT 0 +#define THCON_SEC1_REG11_Metadata_fifo_size_MASK 0xffffffff +#define THCON_SEC1_REG11_Metadata_fifo_size_RMW THCON_SEC1_REG11_Metadata_fifo_size_ADDR32, THCON_SEC1_REG11_Metadata_fifo_size_SHAMT, THCON_SEC1_REG11_Metadata_fifo_size_MASK + +#define THCON_SEC1_REG11_Metadata_z_cntr_rst_unpacr_count_ADDR32 159 +#define THCON_SEC1_REG11_Metadata_z_cntr_rst_unpacr_count_SHAMT 0 +#define THCON_SEC1_REG11_Metadata_z_cntr_rst_unpacr_count_MASK 0xff +#define THCON_SEC1_REG11_Metadata_z_cntr_rst_unpacr_count_RMW THCON_SEC1_REG11_Metadata_z_cntr_rst_unpacr_count_ADDR32, THCON_SEC1_REG11_Metadata_z_cntr_rst_unpacr_count_SHAMT, THCON_SEC1_REG11_Metadata_z_cntr_rst_unpacr_count_MASK + +#define THCON_SEC1_REG11_Metadata_cntxt_switch_unpacr_count_ADDR32 159 +#define THCON_SEC1_REG11_Metadata_cntxt_switch_unpacr_count_SHAMT 8 +#define THCON_SEC1_REG11_Metadata_cntxt_switch_unpacr_count_MASK 0xff00 +#define THCON_SEC1_REG11_Metadata_cntxt_switch_unpacr_count_RMW THCON_SEC1_REG11_Metadata_cntxt_switch_unpacr_count_ADDR32, THCON_SEC1_REG11_Metadata_cntxt_switch_unpacr_count_SHAMT, THCON_SEC1_REG11_Metadata_cntxt_switch_unpacr_count_MASK + +//////////////////////////////////////////////////////////////////////// +// Registers for GLOBAL +//////////////////////////////////////////////////////////////////////// + +#define GLOBAL_CFGREG_BASE_ADDR32 180 + +#define DEST_TARGET_REG_CFG_PACK_SEC0_Offset_ADDR32 180 +#define DEST_TARGET_REG_CFG_PACK_SEC0_Offset_SHAMT 0 +#define DEST_TARGET_REG_CFG_PACK_SEC0_Offset_MASK 0xfff +#define DEST_TARGET_REG_CFG_PACK_SEC0_Offset_RMW DEST_TARGET_REG_CFG_PACK_SEC0_Offset_ADDR32, DEST_TARGET_REG_CFG_PACK_SEC0_Offset_SHAMT, DEST_TARGET_REG_CFG_PACK_SEC0_Offset_MASK + +#define DEST_TARGET_REG_CFG_PACK_SEC0_ZOffset_ADDR32 180 +#define DEST_TARGET_REG_CFG_PACK_SEC0_ZOffset_SHAMT 12 +#define DEST_TARGET_REG_CFG_PACK_SEC0_ZOffset_MASK 0x3f000 +#define DEST_TARGET_REG_CFG_PACK_SEC0_ZOffset_RMW DEST_TARGET_REG_CFG_PACK_SEC0_ZOffset_ADDR32, 
DEST_TARGET_REG_CFG_PACK_SEC0_ZOffset_SHAMT, DEST_TARGET_REG_CFG_PACK_SEC0_ZOffset_MASK + +#define DEST_TARGET_REG_CFG_PACK_SEC1_Offset_ADDR32 181 +#define DEST_TARGET_REG_CFG_PACK_SEC1_Offset_SHAMT 0 +#define DEST_TARGET_REG_CFG_PACK_SEC1_Offset_MASK 0xfff +#define DEST_TARGET_REG_CFG_PACK_SEC1_Offset_RMW DEST_TARGET_REG_CFG_PACK_SEC1_Offset_ADDR32, DEST_TARGET_REG_CFG_PACK_SEC1_Offset_SHAMT, DEST_TARGET_REG_CFG_PACK_SEC1_Offset_MASK + +#define DEST_TARGET_REG_CFG_PACK_SEC1_ZOffset_ADDR32 181 +#define DEST_TARGET_REG_CFG_PACK_SEC1_ZOffset_SHAMT 12 +#define DEST_TARGET_REG_CFG_PACK_SEC1_ZOffset_MASK 0x3f000 +#define DEST_TARGET_REG_CFG_PACK_SEC1_ZOffset_RMW DEST_TARGET_REG_CFG_PACK_SEC1_ZOffset_ADDR32, DEST_TARGET_REG_CFG_PACK_SEC1_ZOffset_SHAMT, DEST_TARGET_REG_CFG_PACK_SEC1_ZOffset_MASK + +#define DEST_TARGET_REG_CFG_PACK_SEC2_Offset_ADDR32 182 +#define DEST_TARGET_REG_CFG_PACK_SEC2_Offset_SHAMT 0 +#define DEST_TARGET_REG_CFG_PACK_SEC2_Offset_MASK 0xfff +#define DEST_TARGET_REG_CFG_PACK_SEC2_Offset_RMW DEST_TARGET_REG_CFG_PACK_SEC2_Offset_ADDR32, DEST_TARGET_REG_CFG_PACK_SEC2_Offset_SHAMT, DEST_TARGET_REG_CFG_PACK_SEC2_Offset_MASK + +#define DEST_TARGET_REG_CFG_PACK_SEC2_ZOffset_ADDR32 182 +#define DEST_TARGET_REG_CFG_PACK_SEC2_ZOffset_SHAMT 12 +#define DEST_TARGET_REG_CFG_PACK_SEC2_ZOffset_MASK 0x3f000 +#define DEST_TARGET_REG_CFG_PACK_SEC2_ZOffset_RMW DEST_TARGET_REG_CFG_PACK_SEC2_ZOffset_ADDR32, DEST_TARGET_REG_CFG_PACK_SEC2_ZOffset_SHAMT, DEST_TARGET_REG_CFG_PACK_SEC2_ZOffset_MASK + +#define DEST_TARGET_REG_CFG_PACK_SEC3_Offset_ADDR32 183 +#define DEST_TARGET_REG_CFG_PACK_SEC3_Offset_SHAMT 0 +#define DEST_TARGET_REG_CFG_PACK_SEC3_Offset_MASK 0xfff +#define DEST_TARGET_REG_CFG_PACK_SEC3_Offset_RMW DEST_TARGET_REG_CFG_PACK_SEC3_Offset_ADDR32, DEST_TARGET_REG_CFG_PACK_SEC3_Offset_SHAMT, DEST_TARGET_REG_CFG_PACK_SEC3_Offset_MASK + +#define DEST_TARGET_REG_CFG_PACK_SEC3_ZOffset_ADDR32 183 +#define DEST_TARGET_REG_CFG_PACK_SEC3_ZOffset_SHAMT 12 +#define DEST_TARGET_REG_CFG_PACK_SEC3_ZOffset_MASK 0x3f000 +#define DEST_TARGET_REG_CFG_PACK_SEC3_ZOffset_RMW DEST_TARGET_REG_CFG_PACK_SEC3_ZOffset_ADDR32, DEST_TARGET_REG_CFG_PACK_SEC3_ZOffset_SHAMT, DEST_TARGET_REG_CFG_PACK_SEC3_ZOffset_MASK + +#define CG_SRC_PIPELINE_GateSrcAPipeEn_ADDR32 184 +#define CG_SRC_PIPELINE_GateSrcAPipeEn_SHAMT 0 +#define CG_SRC_PIPELINE_GateSrcAPipeEn_MASK 0x1 +#define CG_SRC_PIPELINE_GateSrcAPipeEn_RMW CG_SRC_PIPELINE_GateSrcAPipeEn_ADDR32, CG_SRC_PIPELINE_GateSrcAPipeEn_SHAMT, CG_SRC_PIPELINE_GateSrcAPipeEn_MASK + +#define CG_SRC_PIPELINE_GateSrcBPipeEn_ADDR32 184 +#define CG_SRC_PIPELINE_GateSrcBPipeEn_SHAMT 1 +#define CG_SRC_PIPELINE_GateSrcBPipeEn_MASK 0x2 +#define CG_SRC_PIPELINE_GateSrcBPipeEn_RMW CG_SRC_PIPELINE_GateSrcBPipeEn_ADDR32, CG_SRC_PIPELINE_GateSrcBPipeEn_SHAMT, CG_SRC_PIPELINE_GateSrcBPipeEn_MASK + +#define RISCV_IC_INVALIDATE_InvalidateAll_ADDR32 185 +#define RISCV_IC_INVALIDATE_InvalidateAll_SHAMT 0 +#define RISCV_IC_INVALIDATE_InvalidateAll_MASK 0x1f +#define RISCV_IC_INVALIDATE_InvalidateAll_RMW RISCV_IC_INVALIDATE_InvalidateAll_ADDR32, RISCV_IC_INVALIDATE_InvalidateAll_SHAMT, RISCV_IC_INVALIDATE_InvalidateAll_MASK + +#define PRNG_SEED_Seed_Val_ADDR32 186 +#define PRNG_SEED_Seed_Val_SHAMT 0 +#define PRNG_SEED_Seed_Val_MASK 0xffffffff +#define PRNG_SEED_Seed_Val_RMW PRNG_SEED_Seed_Val_ADDR32, PRNG_SEED_Seed_Val_SHAMT, PRNG_SEED_Seed_Val_MASK + +#define INT_DESCALE_VALUES_SEC0_Value_ADDR32 187 +#define INT_DESCALE_VALUES_SEC0_Value_SHAMT 0 +#define INT_DESCALE_VALUES_SEC0_Value_MASK 0xffffffff 
+#define INT_DESCALE_VALUES_SEC0_Value_RMW INT_DESCALE_VALUES_SEC0_Value_ADDR32, INT_DESCALE_VALUES_SEC0_Value_SHAMT, INT_DESCALE_VALUES_SEC0_Value_MASK + +#define INT_DESCALE_VALUES_SEC1_Value_ADDR32 188 +#define INT_DESCALE_VALUES_SEC1_Value_SHAMT 0 +#define INT_DESCALE_VALUES_SEC1_Value_MASK 0xffffffff +#define INT_DESCALE_VALUES_SEC1_Value_RMW INT_DESCALE_VALUES_SEC1_Value_ADDR32, INT_DESCALE_VALUES_SEC1_Value_SHAMT, INT_DESCALE_VALUES_SEC1_Value_MASK + +#define INT_DESCALE_VALUES_SEC2_Value_ADDR32 189 +#define INT_DESCALE_VALUES_SEC2_Value_SHAMT 0 +#define INT_DESCALE_VALUES_SEC2_Value_MASK 0xffffffff +#define INT_DESCALE_VALUES_SEC2_Value_RMW INT_DESCALE_VALUES_SEC2_Value_ADDR32, INT_DESCALE_VALUES_SEC2_Value_SHAMT, INT_DESCALE_VALUES_SEC2_Value_MASK + +#define INT_DESCALE_VALUES_SEC3_Value_ADDR32 190 +#define INT_DESCALE_VALUES_SEC3_Value_SHAMT 0 +#define INT_DESCALE_VALUES_SEC3_Value_MASK 0xffffffff +#define INT_DESCALE_VALUES_SEC3_Value_RMW INT_DESCALE_VALUES_SEC3_Value_ADDR32, INT_DESCALE_VALUES_SEC3_Value_SHAMT, INT_DESCALE_VALUES_SEC3_Value_MASK + +#define INT_DESCALE_VALUES_SEC4_Value_ADDR32 191 +#define INT_DESCALE_VALUES_SEC4_Value_SHAMT 0 +#define INT_DESCALE_VALUES_SEC4_Value_MASK 0xffffffff +#define INT_DESCALE_VALUES_SEC4_Value_RMW INT_DESCALE_VALUES_SEC4_Value_ADDR32, INT_DESCALE_VALUES_SEC4_Value_SHAMT, INT_DESCALE_VALUES_SEC4_Value_MASK + +#define INT_DESCALE_VALUES_SEC5_Value_ADDR32 192 +#define INT_DESCALE_VALUES_SEC5_Value_SHAMT 0 +#define INT_DESCALE_VALUES_SEC5_Value_MASK 0xffffffff +#define INT_DESCALE_VALUES_SEC5_Value_RMW INT_DESCALE_VALUES_SEC5_Value_ADDR32, INT_DESCALE_VALUES_SEC5_Value_SHAMT, INT_DESCALE_VALUES_SEC5_Value_MASK + +#define INT_DESCALE_VALUES_SEC6_Value_ADDR32 193 +#define INT_DESCALE_VALUES_SEC6_Value_SHAMT 0 +#define INT_DESCALE_VALUES_SEC6_Value_MASK 0xffffffff +#define INT_DESCALE_VALUES_SEC6_Value_RMW INT_DESCALE_VALUES_SEC6_Value_ADDR32, INT_DESCALE_VALUES_SEC6_Value_SHAMT, INT_DESCALE_VALUES_SEC6_Value_MASK + +#define INT_DESCALE_VALUES_SEC7_Value_ADDR32 194 +#define INT_DESCALE_VALUES_SEC7_Value_SHAMT 0 +#define INT_DESCALE_VALUES_SEC7_Value_MASK 0xffffffff +#define INT_DESCALE_VALUES_SEC7_Value_RMW INT_DESCALE_VALUES_SEC7_Value_ADDR32, INT_DESCALE_VALUES_SEC7_Value_SHAMT, INT_DESCALE_VALUES_SEC7_Value_MASK + +#define INT_DESCALE_VALUES_SEC8_Value_ADDR32 195 +#define INT_DESCALE_VALUES_SEC8_Value_SHAMT 0 +#define INT_DESCALE_VALUES_SEC8_Value_MASK 0xffffffff +#define INT_DESCALE_VALUES_SEC8_Value_RMW INT_DESCALE_VALUES_SEC8_Value_ADDR32, INT_DESCALE_VALUES_SEC8_Value_SHAMT, INT_DESCALE_VALUES_SEC8_Value_MASK + +#define INT_DESCALE_VALUES_SEC9_Value_ADDR32 196 +#define INT_DESCALE_VALUES_SEC9_Value_SHAMT 0 +#define INT_DESCALE_VALUES_SEC9_Value_MASK 0xffffffff +#define INT_DESCALE_VALUES_SEC9_Value_RMW INT_DESCALE_VALUES_SEC9_Value_ADDR32, INT_DESCALE_VALUES_SEC9_Value_SHAMT, INT_DESCALE_VALUES_SEC9_Value_MASK + +#define INT_DESCALE_VALUES_SEC10_Value_ADDR32 197 +#define INT_DESCALE_VALUES_SEC10_Value_SHAMT 0 +#define INT_DESCALE_VALUES_SEC10_Value_MASK 0xffffffff +#define INT_DESCALE_VALUES_SEC10_Value_RMW INT_DESCALE_VALUES_SEC10_Value_ADDR32, INT_DESCALE_VALUES_SEC10_Value_SHAMT, INT_DESCALE_VALUES_SEC10_Value_MASK + +#define INT_DESCALE_VALUES_SEC11_Value_ADDR32 198 +#define INT_DESCALE_VALUES_SEC11_Value_SHAMT 0 +#define INT_DESCALE_VALUES_SEC11_Value_MASK 0xffffffff +#define INT_DESCALE_VALUES_SEC11_Value_RMW INT_DESCALE_VALUES_SEC11_Value_ADDR32, INT_DESCALE_VALUES_SEC11_Value_SHAMT, INT_DESCALE_VALUES_SEC11_Value_MASK + +#define 
INT_DESCALE_VALUES_SEC12_Value_ADDR32 199 +#define INT_DESCALE_VALUES_SEC12_Value_SHAMT 0 +#define INT_DESCALE_VALUES_SEC12_Value_MASK 0xffffffff +#define INT_DESCALE_VALUES_SEC12_Value_RMW INT_DESCALE_VALUES_SEC12_Value_ADDR32, INT_DESCALE_VALUES_SEC12_Value_SHAMT, INT_DESCALE_VALUES_SEC12_Value_MASK + +#define INT_DESCALE_VALUES_SEC13_Value_ADDR32 200 +#define INT_DESCALE_VALUES_SEC13_Value_SHAMT 0 +#define INT_DESCALE_VALUES_SEC13_Value_MASK 0xffffffff +#define INT_DESCALE_VALUES_SEC13_Value_RMW INT_DESCALE_VALUES_SEC13_Value_ADDR32, INT_DESCALE_VALUES_SEC13_Value_SHAMT, INT_DESCALE_VALUES_SEC13_Value_MASK + +#define INT_DESCALE_VALUES_SEC14_Value_ADDR32 201 +#define INT_DESCALE_VALUES_SEC14_Value_SHAMT 0 +#define INT_DESCALE_VALUES_SEC14_Value_MASK 0xffffffff +#define INT_DESCALE_VALUES_SEC14_Value_RMW INT_DESCALE_VALUES_SEC14_Value_ADDR32, INT_DESCALE_VALUES_SEC14_Value_SHAMT, INT_DESCALE_VALUES_SEC14_Value_MASK + +#define INT_DESCALE_VALUES_SEC15_Value_ADDR32 202 +#define INT_DESCALE_VALUES_SEC15_Value_SHAMT 0 +#define INT_DESCALE_VALUES_SEC15_Value_MASK 0xffffffff +#define INT_DESCALE_VALUES_SEC15_Value_RMW INT_DESCALE_VALUES_SEC15_Value_ADDR32, INT_DESCALE_VALUES_SEC15_Value_SHAMT, INT_DESCALE_VALUES_SEC15_Value_MASK + +#define TRISC_END_PC_SEC0_PC_ADDR32 203 +#define TRISC_END_PC_SEC0_PC_SHAMT 0 +#define TRISC_END_PC_SEC0_PC_MASK 0xffffffff +#define TRISC_END_PC_SEC0_PC_RMW TRISC_END_PC_SEC0_PC_ADDR32, TRISC_END_PC_SEC0_PC_SHAMT, TRISC_END_PC_SEC0_PC_MASK + +#define TRISC_END_PC_SEC1_PC_ADDR32 204 +#define TRISC_END_PC_SEC1_PC_SHAMT 0 +#define TRISC_END_PC_SEC1_PC_MASK 0xffffffff +#define TRISC_END_PC_SEC1_PC_RMW TRISC_END_PC_SEC1_PC_ADDR32, TRISC_END_PC_SEC1_PC_SHAMT, TRISC_END_PC_SEC1_PC_MASK + +#define TRISC_END_PC_SEC2_PC_ADDR32 205 +#define TRISC_END_PC_SEC2_PC_SHAMT 0 +#define TRISC_END_PC_SEC2_PC_MASK 0xffffffff +#define TRISC_END_PC_SEC2_PC_RMW TRISC_END_PC_SEC2_PC_ADDR32, TRISC_END_PC_SEC2_PC_SHAMT, TRISC_END_PC_SEC2_PC_MASK + +#define BRISC_END_PC_PC_ADDR32 206 +#define BRISC_END_PC_PC_SHAMT 0 +#define BRISC_END_PC_PC_MASK 0xffffffff +#define BRISC_END_PC_PC_RMW BRISC_END_PC_PC_ADDR32, BRISC_END_PC_PC_SHAMT, BRISC_END_PC_PC_MASK + +#define NOC_RISC_END_PC_PC_ADDR32 207 +#define NOC_RISC_END_PC_PC_SHAMT 0 +#define NOC_RISC_END_PC_PC_MASK 0xffffffff +#define NOC_RISC_END_PC_PC_RMW NOC_RISC_END_PC_PC_ADDR32, NOC_RISC_END_PC_PC_SHAMT, NOC_RISC_END_PC_PC_MASK + +#define RISC_PREFETCH_CTRL_Enable_Trisc_ADDR32 208 +#define RISC_PREFETCH_CTRL_Enable_Trisc_SHAMT 0 +#define RISC_PREFETCH_CTRL_Enable_Trisc_MASK 0x7 +#define RISC_PREFETCH_CTRL_Enable_Trisc_RMW RISC_PREFETCH_CTRL_Enable_Trisc_ADDR32, RISC_PREFETCH_CTRL_Enable_Trisc_SHAMT, RISC_PREFETCH_CTRL_Enable_Trisc_MASK + +#define RISC_PREFETCH_CTRL_Enable_Brisc_ADDR32 208 +#define RISC_PREFETCH_CTRL_Enable_Brisc_SHAMT 3 +#define RISC_PREFETCH_CTRL_Enable_Brisc_MASK 0x8 +#define RISC_PREFETCH_CTRL_Enable_Brisc_RMW RISC_PREFETCH_CTRL_Enable_Brisc_ADDR32, RISC_PREFETCH_CTRL_Enable_Brisc_SHAMT, RISC_PREFETCH_CTRL_Enable_Brisc_MASK + +#define RISC_PREFETCH_CTRL_Enable_NocRisc_ADDR32 208 +#define RISC_PREFETCH_CTRL_Enable_NocRisc_SHAMT 4 +#define RISC_PREFETCH_CTRL_Enable_NocRisc_MASK 0x10 +#define RISC_PREFETCH_CTRL_Enable_NocRisc_RMW RISC_PREFETCH_CTRL_Enable_NocRisc_ADDR32, RISC_PREFETCH_CTRL_Enable_NocRisc_SHAMT, RISC_PREFETCH_CTRL_Enable_NocRisc_MASK + +#define RISC_PREFETCH_CTRL_Max_Req_Count_ADDR32 208 +#define RISC_PREFETCH_CTRL_Max_Req_Count_SHAMT 5 +#define RISC_PREFETCH_CTRL_Max_Req_Count_MASK 0x1fe0 +#define 
RISC_PREFETCH_CTRL_Max_Req_Count_RMW RISC_PREFETCH_CTRL_Max_Req_Count_ADDR32, RISC_PREFETCH_CTRL_Max_Req_Count_SHAMT, RISC_PREFETCH_CTRL_Max_Req_Count_MASK + +#define SCRATCH_SEC0_val_ADDR32 209 +#define SCRATCH_SEC0_val_SHAMT 0 +#define SCRATCH_SEC0_val_MASK 0xffffffff +#define SCRATCH_SEC0_val_RMW SCRATCH_SEC0_val_ADDR32, SCRATCH_SEC0_val_SHAMT, SCRATCH_SEC0_val_MASK + +#define SCRATCH_SEC1_val_ADDR32 210 +#define SCRATCH_SEC1_val_SHAMT 0 +#define SCRATCH_SEC1_val_MASK 0xffffffff +#define SCRATCH_SEC1_val_RMW SCRATCH_SEC1_val_ADDR32, SCRATCH_SEC1_val_SHAMT, SCRATCH_SEC1_val_MASK + +#define SCRATCH_SEC2_val_ADDR32 211 +#define SCRATCH_SEC2_val_SHAMT 0 +#define SCRATCH_SEC2_val_MASK 0xffffffff +#define SCRATCH_SEC2_val_RMW SCRATCH_SEC2_val_ADDR32, SCRATCH_SEC2_val_SHAMT, SCRATCH_SEC2_val_MASK + +#define L1_CACHE_TAG_SEARCH_ACCEL_Search_Enable_ADDR32 212 +#define L1_CACHE_TAG_SEARCH_ACCEL_Search_Enable_SHAMT 0 +#define L1_CACHE_TAG_SEARCH_ACCEL_Search_Enable_MASK 0x1 +#define L1_CACHE_TAG_SEARCH_ACCEL_Search_Enable_RMW L1_CACHE_TAG_SEARCH_ACCEL_Search_Enable_ADDR32, L1_CACHE_TAG_SEARCH_ACCEL_Search_Enable_SHAMT, L1_CACHE_TAG_SEARCH_ACCEL_Search_Enable_MASK + +#define L1_CACHE_TAG_SEARCH_ACCEL_Start_Addr_ADDR32 212 +#define L1_CACHE_TAG_SEARCH_ACCEL_Start_Addr_SHAMT 1 +#define L1_CACHE_TAG_SEARCH_ACCEL_Start_Addr_MASK 0x3fffe +#define L1_CACHE_TAG_SEARCH_ACCEL_Start_Addr_RMW L1_CACHE_TAG_SEARCH_ACCEL_Start_Addr_ADDR32, L1_CACHE_TAG_SEARCH_ACCEL_Start_Addr_SHAMT, L1_CACHE_TAG_SEARCH_ACCEL_Start_Addr_MASK + +#define L1_CACHE_TAG_SEARCH_ACCEL_End_Addr_ADDR32 213 +#define L1_CACHE_TAG_SEARCH_ACCEL_End_Addr_SHAMT 0 +#define L1_CACHE_TAG_SEARCH_ACCEL_End_Addr_MASK 0x1ffff +#define L1_CACHE_TAG_SEARCH_ACCEL_End_Addr_RMW L1_CACHE_TAG_SEARCH_ACCEL_End_Addr_ADDR32, L1_CACHE_TAG_SEARCH_ACCEL_End_Addr_SHAMT, L1_CACHE_TAG_SEARCH_ACCEL_End_Addr_MASK + +#define L1_CACHE_TAG_SEARCH_ACCEL_Tag_Value_low_ADDR32 214 +#define L1_CACHE_TAG_SEARCH_ACCEL_Tag_Value_low_SHAMT 0 +#define L1_CACHE_TAG_SEARCH_ACCEL_Tag_Value_low_MASK 0xffffffff +#define L1_CACHE_TAG_SEARCH_ACCEL_Tag_Value_low_RMW L1_CACHE_TAG_SEARCH_ACCEL_Tag_Value_low_ADDR32, L1_CACHE_TAG_SEARCH_ACCEL_Tag_Value_low_SHAMT, L1_CACHE_TAG_SEARCH_ACCEL_Tag_Value_low_MASK + +#define L1_CACHE_TAG_SEARCH_ACCEL_Tag_Value_high_ADDR32 215 +#define L1_CACHE_TAG_SEARCH_ACCEL_Tag_Value_high_SHAMT 0 +#define L1_CACHE_TAG_SEARCH_ACCEL_Tag_Value_high_MASK 0xffffffff +#define L1_CACHE_TAG_SEARCH_ACCEL_Tag_Value_high_RMW L1_CACHE_TAG_SEARCH_ACCEL_Tag_Value_high_ADDR32, L1_CACHE_TAG_SEARCH_ACCEL_Tag_Value_high_SHAMT, L1_CACHE_TAG_SEARCH_ACCEL_Tag_Value_high_MASK + +#define L1_CACHE_TAG_SEARCH_ACCEL_Tag_Width_ADDR32 216 +#define L1_CACHE_TAG_SEARCH_ACCEL_Tag_Width_SHAMT 0 +#define L1_CACHE_TAG_SEARCH_ACCEL_Tag_Width_MASK 0x3 +#define L1_CACHE_TAG_SEARCH_ACCEL_Tag_Width_RMW L1_CACHE_TAG_SEARCH_ACCEL_Tag_Width_ADDR32, L1_CACHE_TAG_SEARCH_ACCEL_Tag_Width_SHAMT, L1_CACHE_TAG_SEARCH_ACCEL_Tag_Width_MASK + +#define L1_CACHE_TAG_SEARCH_ACCEL_Valid_bit_section_start_addr_ADDR32 216 +#define L1_CACHE_TAG_SEARCH_ACCEL_Valid_bit_section_start_addr_SHAMT 2 +#define L1_CACHE_TAG_SEARCH_ACCEL_Valid_bit_section_start_addr_MASK 0x7fffc +#define L1_CACHE_TAG_SEARCH_ACCEL_Valid_bit_section_start_addr_RMW L1_CACHE_TAG_SEARCH_ACCEL_Valid_bit_section_start_addr_ADDR32, L1_CACHE_TAG_SEARCH_ACCEL_Valid_bit_section_start_addr_SHAMT, L1_CACHE_TAG_SEARCH_ACCEL_Valid_bit_section_start_addr_MASK + +#define L1_CACHE_TAG_SEARCH_ACCEL_Valid_bit_section_end_addr_ADDR32 217 +#define 
L1_CACHE_TAG_SEARCH_ACCEL_Valid_bit_section_end_addr_SHAMT 0 +#define L1_CACHE_TAG_SEARCH_ACCEL_Valid_bit_section_end_addr_MASK 0x1ffff +#define L1_CACHE_TAG_SEARCH_ACCEL_Valid_bit_section_end_addr_RMW L1_CACHE_TAG_SEARCH_ACCEL_Valid_bit_section_end_addr_ADDR32, L1_CACHE_TAG_SEARCH_ACCEL_Valid_bit_section_end_addr_SHAMT, L1_CACHE_TAG_SEARCH_ACCEL_Valid_bit_section_end_addr_MASK + +#define L1_CACHE_TAG_SEARCH_ACCEL_Data_Valid_bit_section_start_addr_ADDR32 218 +#define L1_CACHE_TAG_SEARCH_ACCEL_Data_Valid_bit_section_start_addr_SHAMT 0 +#define L1_CACHE_TAG_SEARCH_ACCEL_Data_Valid_bit_section_start_addr_MASK 0x1ffff +#define L1_CACHE_TAG_SEARCH_ACCEL_Data_Valid_bit_section_start_addr_RMW L1_CACHE_TAG_SEARCH_ACCEL_Data_Valid_bit_section_start_addr_ADDR32, L1_CACHE_TAG_SEARCH_ACCEL_Data_Valid_bit_section_start_addr_SHAMT, L1_CACHE_TAG_SEARCH_ACCEL_Data_Valid_bit_section_start_addr_MASK + +#define L1_CACHE_TAG_SEARCH_ACCEL_Data_Valid_chk_ADDR32 218 +#define L1_CACHE_TAG_SEARCH_ACCEL_Data_Valid_chk_SHAMT 17 +#define L1_CACHE_TAG_SEARCH_ACCEL_Data_Valid_chk_MASK 0x20000 +#define L1_CACHE_TAG_SEARCH_ACCEL_Data_Valid_chk_RMW L1_CACHE_TAG_SEARCH_ACCEL_Data_Valid_chk_ADDR32, L1_CACHE_TAG_SEARCH_ACCEL_Data_Valid_chk_SHAMT, L1_CACHE_TAG_SEARCH_ACCEL_Data_Valid_chk_MASK + +#define L1_CACHE_TAG_SEARCH_ACCEL_Data_Valid_offset_ADDR32 219 +#define L1_CACHE_TAG_SEARCH_ACCEL_Data_Valid_offset_SHAMT 0 +#define L1_CACHE_TAG_SEARCH_ACCEL_Data_Valid_offset_MASK 0xffffff +#define L1_CACHE_TAG_SEARCH_ACCEL_Data_Valid_offset_RMW L1_CACHE_TAG_SEARCH_ACCEL_Data_Valid_offset_ADDR32, L1_CACHE_TAG_SEARCH_ACCEL_Data_Valid_offset_SHAMT, L1_CACHE_TAG_SEARCH_ACCEL_Data_Valid_offset_MASK + +#define L1_CACHE_TAG_SEARCH_ACCEL_Tag_inv_ADDR32 219 +#define L1_CACHE_TAG_SEARCH_ACCEL_Tag_inv_SHAMT 24 +#define L1_CACHE_TAG_SEARCH_ACCEL_Tag_inv_MASK 0x1000000 +#define L1_CACHE_TAG_SEARCH_ACCEL_Tag_inv_RMW L1_CACHE_TAG_SEARCH_ACCEL_Tag_inv_ADDR32, L1_CACHE_TAG_SEARCH_ACCEL_Tag_inv_SHAMT, L1_CACHE_TAG_SEARCH_ACCEL_Tag_inv_MASK + +#define L1_CACHE_TAG_SEARCH_ACCEL_Tag_inv_all_ADDR32 219 +#define L1_CACHE_TAG_SEARCH_ACCEL_Tag_inv_all_SHAMT 25 +#define L1_CACHE_TAG_SEARCH_ACCEL_Tag_inv_all_MASK 0x2000000 +#define L1_CACHE_TAG_SEARCH_ACCEL_Tag_inv_all_RMW L1_CACHE_TAG_SEARCH_ACCEL_Tag_inv_all_ADDR32, L1_CACHE_TAG_SEARCH_ACCEL_Tag_inv_all_SHAMT, L1_CACHE_TAG_SEARCH_ACCEL_Tag_inv_all_MASK + +#define L1_CACHE_TAG_SEARCH_ACCEL_Tag_alloc_ADDR32 219 +#define L1_CACHE_TAG_SEARCH_ACCEL_Tag_alloc_SHAMT 26 +#define L1_CACHE_TAG_SEARCH_ACCEL_Tag_alloc_MASK 0x4000000 +#define L1_CACHE_TAG_SEARCH_ACCEL_Tag_alloc_RMW L1_CACHE_TAG_SEARCH_ACCEL_Tag_alloc_ADDR32, L1_CACHE_TAG_SEARCH_ACCEL_Tag_alloc_SHAMT, L1_CACHE_TAG_SEARCH_ACCEL_Tag_alloc_MASK + +#define DEST_ACCESS_CFG_swizzle_32b_ADDR32 220 +#define DEST_ACCESS_CFG_swizzle_32b_SHAMT 0 +#define DEST_ACCESS_CFG_swizzle_32b_MASK 0x1 +#define DEST_ACCESS_CFG_swizzle_32b_RMW DEST_ACCESS_CFG_swizzle_32b_ADDR32, DEST_ACCESS_CFG_swizzle_32b_SHAMT, DEST_ACCESS_CFG_swizzle_32b_MASK + +#define DEST_ACCESS_CFG_remap_addrs_ADDR32 220 +#define DEST_ACCESS_CFG_remap_addrs_SHAMT 1 +#define DEST_ACCESS_CFG_remap_addrs_MASK 0x2 +#define DEST_ACCESS_CFG_remap_addrs_RMW DEST_ACCESS_CFG_remap_addrs_ADDR32, DEST_ACCESS_CFG_remap_addrs_SHAMT, DEST_ACCESS_CFG_remap_addrs_MASK + +#define DEST_ACCESS_CFG_disable_full_write_dest_q_bypass_ADDR32 220 +#define DEST_ACCESS_CFG_disable_full_write_dest_q_bypass_SHAMT 2 +#define DEST_ACCESS_CFG_disable_full_write_dest_q_bypass_MASK 0x4 +#define 
DEST_ACCESS_CFG_disable_full_write_dest_q_bypass_RMW DEST_ACCESS_CFG_disable_full_write_dest_q_bypass_ADDR32, DEST_ACCESS_CFG_disable_full_write_dest_q_bypass_SHAMT, DEST_ACCESS_CFG_disable_full_write_dest_q_bypass_MASK + +#define DEST_ACCESS_CFG_zeroacc_absolute_tile_mode_ADDR32 220 +#define DEST_ACCESS_CFG_zeroacc_absolute_tile_mode_SHAMT 3 +#define DEST_ACCESS_CFG_zeroacc_absolute_tile_mode_MASK 0x8 +#define DEST_ACCESS_CFG_zeroacc_absolute_tile_mode_RMW DEST_ACCESS_CFG_zeroacc_absolute_tile_mode_ADDR32, DEST_ACCESS_CFG_zeroacc_absolute_tile_mode_SHAMT, DEST_ACCESS_CFG_zeroacc_absolute_tile_mode_MASK + +#define SRC_ACCESS_CFG_math_view_srca_as_one_bank_ADDR32 221 +#define SRC_ACCESS_CFG_math_view_srca_as_one_bank_SHAMT 0 +#define SRC_ACCESS_CFG_math_view_srca_as_one_bank_MASK 0x1 +#define SRC_ACCESS_CFG_math_view_srca_as_one_bank_RMW SRC_ACCESS_CFG_math_view_srca_as_one_bank_ADDR32, SRC_ACCESS_CFG_math_view_srca_as_one_bank_SHAMT, SRC_ACCESS_CFG_math_view_srca_as_one_bank_MASK + +#define SRC_ACCESS_CFG_math_view_srcb_as_one_bank_ADDR32 221 +#define SRC_ACCESS_CFG_math_view_srcb_as_one_bank_SHAMT 1 +#define SRC_ACCESS_CFG_math_view_srcb_as_one_bank_MASK 0x2 +#define SRC_ACCESS_CFG_math_view_srcb_as_one_bank_RMW SRC_ACCESS_CFG_math_view_srcb_as_one_bank_ADDR32, SRC_ACCESS_CFG_math_view_srcb_as_one_bank_SHAMT, SRC_ACCESS_CFG_math_view_srcb_as_one_bank_MASK + +#define SRC_ACCESS_CFG_disable_contig_srca_dvalid_phase_ADDR32 221 +#define SRC_ACCESS_CFG_disable_contig_srca_dvalid_phase_SHAMT 2 +#define SRC_ACCESS_CFG_disable_contig_srca_dvalid_phase_MASK 0x4 +#define SRC_ACCESS_CFG_disable_contig_srca_dvalid_phase_RMW SRC_ACCESS_CFG_disable_contig_srca_dvalid_phase_ADDR32, SRC_ACCESS_CFG_disable_contig_srca_dvalid_phase_SHAMT, SRC_ACCESS_CFG_disable_contig_srca_dvalid_phase_MASK + +#define SRC_ACCESS_CFG_disable_contig_srcb_dvalid_phase_ADDR32 221 +#define SRC_ACCESS_CFG_disable_contig_srcb_dvalid_phase_SHAMT 3 +#define SRC_ACCESS_CFG_disable_contig_srcb_dvalid_phase_MASK 0x8 +#define SRC_ACCESS_CFG_disable_contig_srcb_dvalid_phase_RMW SRC_ACCESS_CFG_disable_contig_srcb_dvalid_phase_ADDR32, SRC_ACCESS_CFG_disable_contig_srcb_dvalid_phase_SHAMT, SRC_ACCESS_CFG_disable_contig_srcb_dvalid_phase_MASK + +#define CHICKEN_BITS_sfpu_scbd_disable_ADDR32 222 +#define CHICKEN_BITS_sfpu_scbd_disable_SHAMT 0 +#define CHICKEN_BITS_sfpu_scbd_disable_MASK 0x1 +#define CHICKEN_BITS_sfpu_scbd_disable_RMW CHICKEN_BITS_sfpu_scbd_disable_ADDR32, CHICKEN_BITS_sfpu_scbd_disable_SHAMT, CHICKEN_BITS_sfpu_scbd_disable_MASK diff --git a/tt_metal/hw/inc/blackhole/cmd_params.h b/tt_metal/hw/inc/blackhole/cmd_params.h new file mode 100644 index 000000000000..d01837b049c0 --- /dev/null +++ b/tt_metal/hw/inc/blackhole/cmd_params.h @@ -0,0 +1,1015 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include <array>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#ifndef DISABLE_CMD_DEBUG
+#include <iostream>
+#endif
+
+#include "tensix.h"
+#include "tensix_types.h"
+
+#include "cmd_defs.h"
+
+// [[deprecated("There should be no more traditional fifos.")]]
+inline std::uint32_t unpack_fifo_address(std::uint32_t fifo_address)
+{
+    return (fifo_address << FIFO_BASE_ADDRESS_ALIGN_BITS);
+}
+
+inline std::uint32_t unpack_address(std::uint32_t address)
+{
+    return (address << FIFO_BASE_ADDRESS_ALIGN_BITS);
+}
+
+inline std::uint16_t pack_address(std::uint32_t address)
+{
+#ifdef ASSERT
+    ASSERT(!(address & bitmask(FIFO_BASE_ADDRESS_ALIGN_BITS)), "Address not aligned and cannot be packed");
+#else
+    assert(!(address & bitmask(FIFO_BASE_ADDRESS_ALIGN_BITS)) && "Address not aligned and cannot be packed");
+#endif
+    return (address >> FIFO_BASE_ADDRESS_ALIGN_BITS);
+}
+
+inline std::uint32_t pack_32b_field(std::uint32_t x, unsigned int bits, unsigned int to_shift)
+{
+    assert(bits + to_shift <= std::numeric_limits<std::uint32_t>::digits);
+    assert((x & ~bitmask(bits)) == 0);
+
+    return x << to_shift;
+}
+
+inline std::uint32_t unpack_field(std::uint32_t x, unsigned int bits, unsigned int to_shift)
+{
+    return ((x >> to_shift) & bitmask(bits));
+}
+
+constexpr int MAX_NUM_PACKS = 4;
+
+struct PackOperation {
+    std::uint32_t config_blob[8]; // This is really a FirmwareCommand<8>
+
+    std::uint8_t stream_ids[4];
+    std::uint8_t y_start;
+    std::uint8_t y_dim;
+    std::uint8_t strip_y_dim;
+    std::uint8_t strip_mask : 4;  // Which strips / stream IDs are valid
+    bool skip_strip_setup : 1;    // Workaround for tb_tensix tests which don't use ExtendedMegaConfig
+    bool force_max_xy : 1;
+    bool strip_yz_transposed : 1; // Only used for fused pack-pack_z top/bottom pack
+    bool align_rowsetmap_y_start : 1;
+
+    std::uint32_t row_mask_select;
+    std::uint32_t dest_start_offset;
+
+    // last bit of each phase id (20-bits) is lplc (LocalProducer&LocalConsumer)
+    std::uint32_t phase_ids[4];
+
+    void print() const {
+        #ifndef DISABLE_CMD_DEBUG
+        std::cout << "\tPackOperation:" << std::endl;
+        std::cout << "\t\tconfig_blob=";
+        for (unsigned i = 0; i < 8; i++) {
+            std::cout << config_blob[i] << ", ";
+        }
+        std::cout << std::endl;
+
+        std::cout << "\t\tstream_ids=";
+        for (unsigned i = 0; i < 4; i++) {
+            std::cout << (unsigned)stream_ids[i] << ", ";
+        }
+        std::cout << std::endl;
+
+        std::cout << "\t\ty_start=" << (unsigned)y_start << std::endl;
+        std::cout << "\t\ty_dim=" << (unsigned)y_dim << std::endl;
+        std::cout << "\t\tstrip_y_dim=" << (unsigned)strip_y_dim << std::endl;
+        std::cout << "\t\tstrip_mask=" << (unsigned)strip_mask << std::endl;
+        std::cout << "\t\trow_mask_select=" << row_mask_select << std::endl;
+        std::cout << "\t\tdest_start_offset=" << dest_start_offset << std::endl;
+
+        std::cout << "\t\tphase_ids=";
+        for (unsigned i = 0; i < 4; i++) {
+            std::cout << (phase_ids[i] & 0x7FFFFFFF) << ", ";
+        }
+        std::cout << std::endl;
+
+        std::cout << "\t\tlplc=";
+        for (unsigned i = 0; i < 4; i++) {
+            std::cout << (phase_ids[i] >> 31) << ", ";
+        }
+        std::cout << std::endl;
+        #endif
+    }
+};
+static_assert(sizeof(PackOperation) % 16 == 0,
+    "PackOperation must be packable into a 16-byte aligned array");
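+
+// A minimal round-trip sketch for the field helpers above (illustrative only;
+// the layout below is made up and `word` is a hypothetical local):
+//
+//     std::uint32_t word = pack_32b_field(0x3, 4, 28)   // 4-bit field at bit 28
+//                        | pack_32b_field(0x1f, 5, 8)   // 5-bit field at bit 8
+//                        | pack_32b_field(0xa, 4, 0);   // 4-bit field at bit 0
+//     assert(unpack_field(word, 5, 8) == 0x1f);         // exact inverse per field
+//
+// The same shift/mask convention underlies the generated *_ADDR32/_SHAMT/_MASK
+// triplets in the cfg_defines header: each *_RMW macro expands to exactly those
+// three values, presumably consumed as the argument list of a read-modify-write
+// helper.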
+struct SfpuParams {
+    std::uint16_t leaky_slope;
+    std::uint32_t sfpu_scale_val;
+    std::uint8_t data_format_src;
+    std::uint8_t sfpu_rnd_fmt;
+    bool fp32_acc;
+    bool sfpu_rnd_unsigned_int;
+    bool sfpu_stoch_rnd;
+    bool sfpu_sort_rand_indx;
+    bool sfpu_signed_int16;
+    bool z_first_layout;
+    bool sfpu_sort_col_major;
+    std::uint32_t num_output_faces;
+    bool sfpu_half_rows;
+    std::uint32_t log_mult_const;
+    std::uint32_t dropout_prob;
+};
+
+struct PackParams {
+    PackOperation pack_ops[MAX_NUM_PACKS];
+
+    bool output_tile_id_passthrough;
+    bool unpack_to_dest;
+    std::uint8_t num_packs;
+    std::uint16_t kernel_id_packer;
+    std::uint16_t kernel_id_sfunc;
+    std::uint16_t output_tile_id_offset;
+    std::uint16_t num_output_tiles;
+    std::uint16_t tile_id_offset_by_packer;
+
+    std::uint32_t bias_section_addr;
+    std::uint16_t result32b;
+    std::uint8_t data_size_mult_log2;
+
+    SfpuParams sfpu_params;
+
+    bool pool_idx;
+    bool pool_chk_data;
+    std::uint32_t start_row;
+
+    bool risc_dest_perm_test;
+    bool risc_dest_vector_perm_test;
+
+    PackParams() {
+        std::memset(this, 0, sizeof(*this));
+    }
+
+    void SetPackConfigBlob(int idx, std::array<std::uint32_t, 8> pack_config) {
+        const auto size = sizeof(PackOperation::config_blob);
+        std::memcpy(&pack_ops[idx].config_blob, pack_config.data(), size);
+    }
+
+    void parse(const std::uint32_t *command_data) {
+        num_packs = unpack_field(command_data[0], 8, 16);
+        kernel_id_packer = unpack_field(command_data[0], 16, 0);
+
+        output_tile_id_offset = unpack_field(command_data[1], 16, 16);
+        num_output_tiles = unpack_field(command_data[1], 16, 0);
+
+        bias_section_addr = unpack_field(command_data[2], 16, 0);
+
+        output_tile_id_passthrough = unpack_field(command_data[3], 1, 0);
+        unpack_to_dest = unpack_field(command_data[3], 1, 1);
+        result32b = unpack_field(command_data[3], 1, 2);
+        data_size_mult_log2 = unpack_field(command_data[3], 8, 3);
+        tile_id_offset_by_packer = unpack_field(command_data[3], 16, 16);
+
+        sfpu_params.leaky_slope = unpack_field(command_data[4], 16, 0);
+
+        sfpu_params.sfpu_scale_val = unpack_field(command_data[5], 32, 0);
+
+        risc_dest_vector_perm_test = unpack_field(command_data[6], 1, 27);
+        risc_dest_perm_test = unpack_field(command_data[6], 1, 26);
+        sfpu_params.sfpu_half_rows = unpack_field(command_data[6], 1, 25);
+        pool_chk_data = unpack_field(command_data[6], 1, 24);
+        pool_idx = unpack_field(command_data[6], 1, 23);
+        sfpu_params.sfpu_sort_col_major = unpack_field(command_data[6], 1, 22);
+        sfpu_params.z_first_layout = unpack_field(command_data[6], 1, 21);
+        sfpu_params.sfpu_signed_int16 = unpack_field(command_data[6], 1, 20);
+        sfpu_params.sfpu_sort_rand_indx = unpack_field(command_data[6], 1, 19);
+        sfpu_params.sfpu_stoch_rnd = unpack_field(command_data[6], 1, 18);
+        sfpu_params.sfpu_rnd_unsigned_int = unpack_field(command_data[6], 1, 17);
+        sfpu_params.fp32_acc = unpack_field(command_data[6], 1, 16);
+        sfpu_params.sfpu_rnd_fmt = unpack_field(command_data[6], 8, 8);
+        sfpu_params.data_format_src = unpack_field(command_data[6], 8, 0);
+
+        sfpu_params.num_output_faces = unpack_field(command_data[7], 9, 0);
+        sfpu_params.log_mult_const = unpack_field(command_data[8], 32, 0);
+        sfpu_params.dropout_prob = unpack_field(command_data[9], 32, 0);
+        start_row = unpack_field(command_data[10], 32, 0);
+    }
+
+    // 96 words = 384 bytes, matching the CMD_PACK entry in cmds.def and the
+    // size assertions below
+    std::array<std::uint32_t, 96> create_pack_cmd() const {
+        std::array<std::uint32_t, 96> cmd;
+        cmd.fill(0);
+
+        cmd[0] = pack_32b_field(CMD_PACK, 8, 24) |
+                 pack_32b_field(num_packs, 8, 16) |
+                 pack_32b_field(kernel_id_packer, 16, 0);
+
+        cmd[1] = pack_32b_field(output_tile_id_offset, 16, 16) |
+                 pack_32b_field(num_output_tiles, 16, 0);
+
+        cmd[2] = bias_section_addr;
+
+        cmd[3] = pack_32b_field(tile_id_offset_by_packer, 16, 16) |
+                 pack_32b_field(data_size_mult_log2, 8, 3) |
+                 pack_32b_field(result32b, 1, 2) |
+                 pack_32b_field(unpack_to_dest, 1, 1) |
+                 pack_32b_field(output_tile_id_passthrough, 1, 0);
+
+        cmd[4] = pack_32b_field(sfpu_params.leaky_slope, 16, 0);
+
+        cmd[5] = sfpu_params.sfpu_scale_val;
+
+        cmd[6] = pack_32b_field(risc_dest_vector_perm_test, 1, 27) |
+                 pack_32b_field(risc_dest_perm_test, 1, 26) |
+                 pack_32b_field(sfpu_params.sfpu_half_rows, 1, 25) |
+                 pack_32b_field(pool_chk_data, 1, 24) |
+                 pack_32b_field(pool_idx, 1, 23) |
+                 pack_32b_field(sfpu_params.sfpu_sort_col_major, 1, 22) |
+                 pack_32b_field(sfpu_params.z_first_layout, 1, 21) |
+                 pack_32b_field(sfpu_params.sfpu_signed_int16, 1, 20) |
+                 pack_32b_field(sfpu_params.sfpu_sort_rand_indx, 1, 19) |
+                 pack_32b_field(sfpu_params.sfpu_stoch_rnd, 1, 18) |
+                 pack_32b_field(sfpu_params.sfpu_rnd_unsigned_int, 1, 17) |
+                 pack_32b_field(sfpu_params.fp32_acc, 1, 16) |
+                 pack_32b_field(sfpu_params.sfpu_rnd_fmt, 8, 8) |
+                 pack_32b_field(sfpu_params.data_format_src, 8, 0);
+
+        cmd[7] = pack_32b_field(sfpu_params.num_output_faces, 9, 0);
+        cmd[8] = sfpu_params.log_mult_const;
+        cmd[9] = sfpu_params.dropout_prob;
+        cmd[10] = start_row;
+
+        //cmd[31:8] - Reserved
+
+        std::memcpy(cmd.data() + 32, pack_ops, sizeof(pack_ops));
+        assert(sizeof(pack_ops) == MAX_NUM_PACKS * sizeof(PackOperation));
+        assert(128 + sizeof(pack_ops) == cmd.size() * 4);
+
+        return cmd;
+    }
+
+    void print() const {
+        #ifndef DISABLE_CMD_DEBUG
+        std::cout << "PackParams:" << std::endl;
+        for (unsigned i = 0; i < MAX_NUM_PACKS; i++) {
+            pack_ops[i].print();
+        }
+
+        std::cout << "\toutput_tile_id_passthrough=" << output_tile_id_passthrough << std::endl;
+        std::cout << "\tunpack_to_dest=" << unpack_to_dest << std::endl;
+        std::cout << "\tnum_packs=" << (unsigned)num_packs << std::endl;
+        std::cout << "\tkernel_id_packer=" << kernel_id_packer << std::endl;
+        std::cout << "\toutput_tile_id_offset=" << output_tile_id_offset << std::endl;
+        std::cout << "\tnum_output_tiles=" << num_output_tiles << std::endl;
+        std::cout << "\tbias_section_addr=" << bias_section_addr << std::endl;
+        std::cout << "\ttile_id_offset_by_packer=" << std::hex << tile_id_offset_by_packer << std::endl;
+        std::cout << "\tresult32b=" << std::hex << result32b << std::endl;
+        std::cout << "\tstart_row=" << start_row << std::endl;
+
+        std::cout << "\tleaky_slope=" << std::hex << sfpu_params.leaky_slope << std::endl;
+
+        std::cout << "\tsfpu_scale_val=" << std::hex << sfpu_params.sfpu_scale_val << std::endl;
+
+        std::cout << "\tsfpu_params.sfpu_sort_col_major=" << sfpu_params.sfpu_sort_col_major << std::endl;
+        std::cout << "\tsfpu_params.sfpu_sort_rand_indx=" << sfpu_params.sfpu_sort_rand_indx << std::endl;
+        std::cout << "\tsfpu_params.sfpu_stoch_rnd=" << sfpu_params.sfpu_stoch_rnd << std::endl;
+        std::cout << "\tsfpu_params.sfpu_rnd_unsigned_int=" << sfpu_params.sfpu_rnd_unsigned_int << std::endl;
+        std::cout << "\tsfpu_params.fp32_acc=" << sfpu_params.fp32_acc << std::endl;
+        std::cout << "\tsfpu_params.sfpu_half_rows=" << sfpu_params.sfpu_half_rows << std::endl;
+        std::cout << "\tsfpu_params.sfpu_rnd_fmt=" << std::hex << sfpu_params.sfpu_rnd_fmt << std::endl;
+        std::cout << "\tsfpu_params.data_format_src=" << std::hex << sfpu_params.data_format_src << std::endl;
+        std::cout << "\tsfpu_params.num_output_faces=" << std::hex << sfpu_params.num_output_faces << std::endl;
+        #endif
+    }
+};
+
+struct StreamConvParams {
+    std::uint16_t kernel_id_unpacker;
+    std::uint16_t kernel_id_math;
+    std::uint16_t kernel_id_packer;
+
+    std::uint16_t input_stream_id;
+    std::uint32_t input_phase_id;
+    std::uint32_t weight_section_addr;
+    std::uint32_t bias_section_addr;
+
+ bool unpack_halo_strips[4]; + + std::uint16_t unpack_weights_offset; + std::uint16_t math_Z_dim_ratio_log2; + + std::uint16_t num_input_tiles; + std::uint16_t num_output_tiles; + std::uint16_t output_tile_id_offset; + + std::uint16_t math_fidelity; + + std::uint16_t y_start; + std::uint16_t halo_dim; + std::uint8_t halo_y_top = 0; + std::uint8_t halo_y_bot = 0; + std::uint8_t halo_y_offset = 0; + + StreamConvParams() { std::memset(this, 0, sizeof(StreamConvParams));} + + void parse(std::uint32_t* command_data) { + kernel_id_unpacker = unpack_field(command_data[0], 8, 16); + kernel_id_math = unpack_field(command_data[0], 8, 8); + kernel_id_packer = unpack_field(command_data[0], 8, 0); + + input_stream_id = unpack_field(command_data[1], 8, 24); + + unpack_halo_strips[3] = unpack_field(command_data[1], 1, 23); + unpack_halo_strips[2] = unpack_field(command_data[1], 1, 22); + unpack_halo_strips[1] = unpack_field(command_data[1], 1, 21); + unpack_halo_strips[0] = unpack_field(command_data[1], 1, 20); + + math_fidelity = unpack_field(command_data[1], 4, 12); + + weight_section_addr = unpack_address(unpack_field(command_data[3], 16, 0)); + bias_section_addr = unpack_address(unpack_field(command_data[3], 16, 16)); + + unpack_weights_offset = unpack_field(command_data[4], 16, 0); + math_Z_dim_ratio_log2 = unpack_field(command_data[4], 16, 16); + + num_output_tiles = unpack_field(command_data[5], 16, 16); + output_tile_id_offset = unpack_field(command_data[5], 16, 0); + + y_start = unpack_field(command_data[6], 16, 0); + num_input_tiles = unpack_field(command_data[6], 16, 16); + + halo_y_top = unpack_field(command_data[6], 6, 0); + halo_y_bot = unpack_field(command_data[6], 6, 6); + halo_y_offset = unpack_field(command_data[6], 4, 12); + + halo_dim = unpack_field(command_data[7], 4, 20); + input_phase_id = unpack_field(command_data[7], 20, 0); + + if (halo_y_bot == 0) { + halo_y_top = 16 - y_start; + halo_y_bot = 2 * halo_dim + y_start; + } + } + + void print() const { + #ifndef DISABLE_CMD_DEBUG + std::cout << "StreamConvParams:" << std::endl; + std::cout << "\tkernel_id_unpacker=" << kernel_id_unpacker << std::endl; + std::cout << "\tkernel_id_math=" << kernel_id_math << std::endl; + std::cout << "\tkernel_id_packer=" << kernel_id_packer << std::endl; + + std::cout << "\tinput_stream_id=" << input_stream_id << std::endl; + std::cout << "\tweight_section_addr=" << weight_section_addr << std::endl; + std::cout << "\tbias_section_addr=" << bias_section_addr << std::endl; + + std::cout << "\tunpack_halo_strips[0]=" << (unsigned)unpack_halo_strips[0] << std::endl; + std::cout << "\tunpack_halo_strips[1]=" << (unsigned)unpack_halo_strips[1] << std::endl; + std::cout << "\tunpack_halo_strips[2]=" << (unsigned)unpack_halo_strips[2] << std::endl; + std::cout << "\tunpack_halo_strips[3]=" << (unsigned)unpack_halo_strips[3] << std::endl; + + std::cout << "\tunpack_weights_offset=" << unpack_weights_offset << std::endl; + std::cout << "\tmath_Z_dim_ratio_log2=" << math_Z_dim_ratio_log2 << std::endl; + + std::cout << "\tnum_input_tiles=" << num_input_tiles << std::endl; + std::cout << "\tnum_output_tiles=" << num_output_tiles << std::endl; + std::cout << "\toutput_tile_id_offset=" << output_tile_id_offset << std::endl; + + std::cout << "\tmath_fidelity=" << math_fidelity << std::endl; + + std::cout << "\ty_start=" << y_start << std::endl; + std::cout << "\thalo_y_top=" << halo_y_top << std::endl; + std::cout << "\thalo_y_bot=" << halo_y_bot << std::endl; + std::cout << "\thalo_y_offset=" << halo_y_offset << 
std::endl;
+        std::cout << "\thalo_dim=" << halo_dim << std::endl;
+        #endif
+    }
+
+    std::uint32_t halo_y_spec()
+    {
+        if ((halo_y_top + halo_y_bot) > 0)
+            return pack_32b_field((uint32_t)halo_y_top, 6, 0) |
+                   pack_32b_field((uint32_t)halo_y_bot, 6, 6) |
+                   pack_32b_field((uint32_t)halo_y_offset, 4, 12);
+
+        return pack_32b_field((uint32_t)y_start, 12, 0) |
+               pack_32b_field((uint32_t)halo_y_offset, 4, 12);
+    }
+
+    // 8 words = 32 bytes, matching the CMD_STREAM_CONV_2 entry in cmds.def
+    std::array<std::uint32_t, 8> create_cmd_stream_conv()
+    {
+        std::array<std::uint32_t, 8> cmd;
+        cmd.fill(0);
+
+        cmd[0] = pack_32b_field(CMD_STREAM_CONV_2, 8, 24) |
+                 pack_32b_field(kernel_id_unpacker, 16, 0);
+
+        // XXX: Unclear if casting the bool to uint32_t is the best way
+        cmd[1] = pack_32b_field(input_stream_id, 8, 24) |
+                 pack_32b_field((uint32_t)unpack_halo_strips[3], 1, 23) |
+                 pack_32b_field((uint32_t)unpack_halo_strips[2], 1, 22) |
+                 pack_32b_field((uint32_t)unpack_halo_strips[1], 1, 21) |
+                 pack_32b_field((uint32_t)unpack_halo_strips[0], 1, 20) |
+                 pack_32b_field(math_fidelity, 4, 12);
+
+        cmd[2] = pack_32b_field(kernel_id_math, 16, 16) |
+                 pack_32b_field(kernel_id_packer, 16, 0);
+
+        cmd[3] = pack_32b_field(pack_address(bias_section_addr), 16, 16) |
+                 pack_32b_field(pack_address(weight_section_addr), 16, 0);
+
+        cmd[4] = pack_32b_field(unpack_weights_offset, 16, 0) |
+                 pack_32b_field(math_Z_dim_ratio_log2, 16, 16);
+
+        cmd[5] = pack_32b_field(num_output_tiles, 16, 16) |
+                 pack_32b_field(output_tile_id_offset, 16, 0);
+
+        cmd[6] = pack_32b_field(num_input_tiles, 16, 16) |
+                 pack_32b_field(halo_y_spec(), 16, 0);
+
+        cmd[7] = pack_32b_field(halo_dim, 4, 20) |
+                 pack_32b_field(input_phase_id, 20, 0);
+
+        return cmd;
+    }
+};
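+
+// Round-trip sketch (illustrative; the example values are made up, and the
+// eight-word width assumes the 32-byte CMD_STREAM_CONV_2 entry in cmds.def):
+//
+//     StreamConvParams conv;
+//     conv.math_fidelity = 3;
+//     std::array<std::uint32_t, 8> cmd = conv.create_cmd_stream_conv();
+//     StreamConvParams decoded;
+//     decoded.parse(cmd.data());
+//     assert(decoded.math_fidelity == conv.math_fidelity);
+//
+// Note that parse() and create_cmd_stream_conv() are not symmetric for every
+// field: parse() reads the three kernel ids as 8-bit fields of word 0, while
+// create_cmd_stream_conv() writes them as 16-bit fields of words 0 and 2, so
+// only fields packed at matching positions (e.g. math_fidelity) round-trip.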
+
+struct NopClrOperationParams
+{
+    std::uint32_t math_kernel_address;
+    std::uint32_t math_fidelity;
+    std::uint32_t result32b;
+    std::uint32_t repack_Z_dim_ratio_log2;
+    std::uint32_t unpacker_kernel_address;
+    std::uint32_t packer_kernel_address;
+    std::uint32_t input_A_section_id;
+    std::uint32_t input_A_fifo_address;
+    std::uint32_t output_section_id;
+    std::uint32_t output_fifo_address;
+
+    std::uint32_t imm0;
+    std::uint32_t imm1;
+    std::uint32_t banks;
+    std::uint32_t stall;
+    std::uint32_t fmt;
+    std::uint32_t sel;
+    std::uint32_t unpackr_no;
+
+    std::uint32_t stream;
+
+    NopClrOperationParams() { std::memset(this, 0, sizeof(NopClrOperationParams)); }
+
+    void parse(std::uint32_t* command_data)
+    {
+        // Decoding is not implemented yet; the intended field extraction is
+        // kept below as a reference.
+        // imm0 = unpack_field(command_data[0], 16, 0);
+        // imm1 = unpack_field(command_data[0], 16, 0);
+        // banks = unpack_field(command_data[0], 16, 0);
+        // stall = unpack_field(command_data[0], 16, 0);
+        // fmt = unpack_field(command_data[0], 16, 0);
+        // sel = unpack_field(command_data[0], 16, 0);
+        // unpackr_no = unpack_field(command_data[0], 16, 0);
+    }
+};
+
+struct UnaryOperationParams
+{
+    UnaryOperationParams() { std::memset(this, 0, sizeof(UnaryOperationParams)); }
+    std::uint32_t math_fidelity;
+    std::uint32_t result32b;
+    std::uint32_t repack_Z_dim_ratio_log2;
+
+    std::uint32_t kernel_id_unpacker;
+    std::uint32_t kernel_id_math;
+    std::uint32_t kernel_id_packer;
+
+    std::uint32_t num_activation_tiles;
+
+    std::uint32_t input_A_stream_id;
+    //std::uint32_t input_A_phase_id;
+
+    std::uint32_t output_stream_id[4];
+    //std::uint32_t output_phase_id[4];
+
+    // TODO: The following fields are deprecated
+    std::uint32_t unpacker_kernel_address;
+    std::uint32_t math_kernel_address;
+    std::uint32_t packer_kernel_address;
+
+    std::uint32_t input_A_section_id;
+    std::uint32_t input_A_fifo_address;
+
+    std::uint32_t output_section_id;
+    std::uint32_t output_fifo_address;
+
+    void parse(std::uint32_t* command_data)
+    {
+        kernel_id_math = unpack_field(command_data[0], 16, 0);
+        math_fidelity = unpack_field(command_data[0], 3, 16);
+        result32b = unpack_field(command_data[0], 1, 19);
+        repack_Z_dim_ratio_log2 = unpack_field(command_data[0], 3, 20);
+
+        kernel_id_unpacker = unpack_field(command_data[1], 16, 0);
+        kernel_id_packer = unpack_field(command_data[1], 16, 16);
+
+        output_stream_id[0] = unpack_field(command_data[2], 8, 0);
+        output_stream_id[1] = unpack_field(command_data[2], 8, 8);
+        output_stream_id[2] = unpack_field(command_data[2], 8, 16);
+        output_stream_id[3] = unpack_field(command_data[2], 8, 24);
+
+        input_A_stream_id = unpack_field(command_data[3], 8, 24);
+        num_activation_tiles = unpack_field(command_data[3], 16, 0);
+    }
+};
+
+struct StreamUnaryBinaryCommonParams {
+    StreamUnaryBinaryCommonParams() { std::memset(this, 0, sizeof(StreamUnaryBinaryCommonParams)); }
+    std::uint32_t math_fidelity;
+    std::uint32_t result32b = 0x0;
+    std::uint32_t repack_Z_dim_ratio_log2;
+
+    std::uint32_t kernel_id_unpacker;
+    std::uint32_t kernel_id_math;
+    std::uint32_t kernel_id_packer;
+
+    std::uint32_t num_activation_tiles;
+
+    std::uint32_t input_A_stream_id;
+    std::uint32_t input_A_phase_id;
+
+    std::uint32_t realign_start_y;
+    std::uint32_t realign_strip_mask;
+    std::uint8_t realign_y_top = 0;
+    std::uint8_t realign_y_bot = 0;
+    std::uint8_t realign_y_offset = 0;
+
+    bool tile_id_passthrough = false;
+
+    // Used by all math kernels that concatenate in Z whenever there are
+    // multiple tiles unpacked per math iteration.
+    std::uint16_t math_Z_dim_ratio_log2;
+
+  public:
+    std::uint32_t realign_y_spec()
+    {
+        if ((realign_y_top + realign_y_bot) > 0)
+            return pack_32b_field((uint32_t)realign_y_top, 6, 0) |
+                   pack_32b_field((uint32_t)realign_y_bot, 6, 6) |
+                   pack_32b_field((uint32_t)realign_y_offset, 4, 12);
+
+        return pack_32b_field((uint32_t)realign_start_y, 12, 0) |
+               pack_32b_field((uint32_t)realign_y_offset, 4, 12);
+    }
+
+  protected:
+    void parse(std::uint32_t* command_data)
+    {
+        kernel_id_math = unpack_field(command_data[0], 16, 0);
+        math_fidelity = unpack_field(command_data[0], 3, 16);
+        result32b = unpack_field(command_data[0], 1, 19);
+        repack_Z_dim_ratio_log2 = unpack_field(command_data[0], 3, 20);
+
+        kernel_id_unpacker = unpack_field(command_data[1], 16, 0);
+        kernel_id_packer = unpack_field(command_data[1], 16, 16);
+
+        input_A_stream_id = unpack_field(command_data[2], 8, 24);
+        num_activation_tiles = unpack_field(command_data[2], 16, 0);
+
+        realign_start_y = unpack_field(command_data[3], 4, 28);
+        realign_strip_mask = unpack_field(command_data[3], 4, 24);
+
+        realign_y_top = unpack_field(command_data[3], 6, 0);
+        realign_y_bot = unpack_field(command_data[3], 6, 6);
+        realign_y_offset = unpack_field(command_data[3], 4, 12);
+
+        tile_id_passthrough = unpack_field(command_data[3], 1, 20);
+
+        input_A_phase_id = unpack_field(command_data[4], 16, 0);
+        math_Z_dim_ratio_log2 = unpack_field(command_data[4], 16, 16);
+    }
+
+    // 8 words = 32 bytes, matching the CMD_STREAM_UNARY_OPERATION entry in cmds.def
+    std::array<std::uint32_t, 8> create_cmd_stream_unary_binary_common()
+    {
+        std::array<std::uint32_t, 8> cmd;
+        cmd.fill(0);
+
+        cmd[0] = pack_32b_field(kernel_id_math, 16, 0) |
+                 pack_32b_field(math_fidelity, 3, 16) |
+                 pack_32b_field(result32b, 1, 19) |
+                 pack_32b_field(repack_Z_dim_ratio_log2, 3, 20) |
+                 pack_32b_field(CMD_STREAM_UNARY_OPERATION, 8, 24);
+
+        cmd[1] = pack_32b_field(kernel_id_unpacker, 16, 0) |
+                 pack_32b_field(kernel_id_packer, 16, 16);
+
+        cmd[2] = pack_32b_field(input_A_stream_id, 8, 24) |
+                 pack_32b_field(num_activation_tiles, 16, 0);
+
+        cmd[3] = pack_32b_field(realign_y_spec(), 16, 0) |
+                 pack_32b_field(realign_start_y, 4, 28) |
+                 pack_32b_field(realign_strip_mask, 4, 24) |
+                 pack_32b_field(tile_id_passthrough, 1, 20);
+
+        cmd[4] = pack_32b_field(input_A_phase_id, 16, 0) |
+                 pack_32b_field(math_Z_dim_ratio_log2, 16, 16);
+        // cmd[5] and beyond are filled in by StreamUnaryParams or StreamBinaryParams
+
+        return cmd;
+    }
+};
+
+struct StreamUnaryParams : StreamUnaryBinaryCommonParams
+{
+    std::uint32_t math_kernel_parameter = 0;
+
+    void parse(std::uint32_t* command_data)
+    {
+        StreamUnaryBinaryCommonParams::parse(command_data);
+        math_kernel_parameter = command_data[5];
+    }
+
+    std::array<std::uint32_t, 8> create_cmd_stream_unary()
+    {
+        std::array<std::uint32_t, 8> cmd = create_cmd_stream_unary_binary_common();
+        cmd[5] = math_kernel_parameter;
+        return cmd;
+    }
+};
+
+struct StreamBinaryParams : StreamUnaryBinaryCommonParams
+{
+    std::uint32_t input_B_stream_id;
+    std::uint32_t input_B_phase_id;
+    std::uint32_t scaler_section_addr;
+    std::uint32_t scaler_config;
+    std::uint32_t math_kernel_parameter;
+    std::uint32_t sfpu_scale_val;
+    std::uint8_t sfpu_rnd_fmt;
+    bool sfpu_rnd_unsigned_int;
+    bool sfpu_stoch_rnd;
+    bool sfpu_sort_rand_indx;
+    bool sfpu_signed_int16;
+    bool sfpu_sort_col_major;
+    std::uint32_t log_mult_const;
+    std::uint32_t dropout_prob;
+
+    StreamBinaryParams() :
+        // MM Feb 15 2022: Changed std::memset to this member initialization list.
+        // If you check the assembly it does actually boil down to a memset, and
+        // it's actually more efficient (in general) because it won't try to
+        // default-construct each member before calling the constructor body.
+        // (Of course, for POD types, there is no default construction of members,
+        // but you get the point)
+        input_B_stream_id(0), input_B_phase_id(0),
+        scaler_section_addr(0), scaler_config(0),
+        math_kernel_parameter(0), sfpu_scale_val(0),
+        sfpu_rnd_fmt(0),
+        sfpu_rnd_unsigned_int(false), sfpu_stoch_rnd(false),
+        sfpu_sort_rand_indx(false), sfpu_signed_int16(false),
+        sfpu_sort_col_major(false), log_mult_const(0),
+        dropout_prob(0)
+    {}
+
+    void parse(std::uint32_t* command_data)
+    {
+        StreamUnaryBinaryCommonParams::parse(command_data);
+        input_B_stream_id = unpack_field(command_data[5], 8, 0);
+        scaler_config = unpack_field(command_data[5], 24, 8);
+        input_B_phase_id = unpack_field(command_data[6], 16, 0);
+        scaler_section_addr = unpack_address(unpack_field(command_data[6], 16, 16));
+        math_kernel_parameter = command_data[7];
+    }
+
+    // 16 words = 64 bytes, matching the CMD_STREAM_BINARY_OPERATION entry in cmds.def
+    std::array<std::uint32_t, 16> create_cmd_stream_binary()
+    {
+        std::array<std::uint32_t, 16> cmd;
+        cmd.fill(0); // zero the tail words not covered by the common header
+        std::array<std::uint32_t, 8> cmd_unary_binary_common = create_cmd_stream_unary_binary_common();
+        for (int i = 0; i < 8; i++) {
+            cmd[i] = cmd_unary_binary_common[i];
+        }
+
+        cmd[0] &= ~(pack_32b_field(0xFF, 8, 24));
+        cmd[0] |= pack_32b_field(CMD_STREAM_BINARY_OPERATION, 8, 24);
+
+        cmd[5] = pack_32b_field(input_B_stream_id, 8, 0) |
+                 pack_32b_field(scaler_config, 24, 8);
+        cmd[6] = pack_32b_field(input_B_phase_id, 16, 0) |
+                 pack_32b_field(pack_address(scaler_section_addr), 16, 16);
+        cmd[7] = math_kernel_parameter;
+        cmd[8] = sfpu_scale_val;
+        cmd[9] = pack_32b_field(sfpu_sort_col_major, 1, 12) |
+                 pack_32b_field(sfpu_signed_int16, 1, 11) |
+                 pack_32b_field(sfpu_sort_rand_indx, 1, 10) |
+                 pack_32b_field(sfpu_stoch_rnd, 1, 9) |
+                 pack_32b_field(sfpu_rnd_unsigned_int, 1, 8) |
+                 pack_32b_field(sfpu_rnd_fmt, 8, 0);
+        cmd[10] = log_mult_const;
+        cmd[11] = dropout_prob;
+
+        return cmd;
+    }
+};
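+
+// Sketch of the packed-address convention used here (illustrative; `addr` is
+// a hypothetical value): L1 addresses travel in 16-bit fields by dropping the
+// FIFO_BASE_ADDRESS_ALIGN_BITS low bits, so only aligned addresses survive:
+//
+//     std::uint32_t addr = 0x1000;               // assumed suitably aligned
+//     std::uint16_t packed = pack_address(addr); // asserts the alignment
+//     assert(unpack_address(packed) == addr);
+//
+// StreamBinaryParams uses this in both directions: parse() runs the packed
+// scaler_section_addr field through unpack_address(), and
+// create_cmd_stream_binary() re-packs it with pack_address().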
+
+struct BinaryOperationParams : UnaryOperationParams
+{
+    std::uint32_t math_fidelity;
+    std::uint32_t repack_Z_dim_ratio_log2;
+
+    std::uint32_t kernel_id_unpacker;
+    std::uint32_t kernel_id_math;
+    std::uint32_t kernel_id_packer;
+
+    std::uint32_t num_activation_tiles;
+
+    std::uint32_t input_A_stream_id;
+
+    std::uint32_t output_stream_id[4];
+
+    std::uint32_t input_B_stream_id;
+
+    BinaryOperationParams() :
+        // MM Feb 15 2022: Changed std::memset to this member initialization list.
+        // If you check the assembly it does actually boil down to a memset, and
+        // it's actually more efficient (in general) because it won't try to
+        // default-construct each member before calling the constructor body.
+        // (Of course, for POD types, there is no default construction of members,
+        // but you get the point)
+        math_fidelity(0), repack_Z_dim_ratio_log2(0),
+        kernel_id_unpacker(0), kernel_id_math(0),
+        kernel_id_packer(0), num_activation_tiles(0),
+        input_A_stream_id(0), output_stream_id{0,0,0,0},
+        input_B_stream_id(0)
+    {}
+
+    void parse(std::uint32_t* command_data)
+    {
+        kernel_id_math = unpack_field(command_data[0], 16, 0);
+        math_fidelity = unpack_field(command_data[0], 4, 16);
+        repack_Z_dim_ratio_log2 = unpack_field(command_data[0], 3, 20);
+
+        kernel_id_unpacker = unpack_field(command_data[1], 16, 0);
+        kernel_id_packer = unpack_field(command_data[1], 16, 16);
+
+        output_stream_id[0] = unpack_field(command_data[2], 8, 0);
+        output_stream_id[1] = unpack_field(command_data[2], 8, 8);
+        output_stream_id[2] = unpack_field(command_data[2], 8, 16);
+        output_stream_id[3] = unpack_field(command_data[2], 8, 24);
+
+        input_A_stream_id = unpack_field(command_data[3], 8, 24);
+        input_B_stream_id = unpack_field(command_data[3], 8, 16);
+        num_activation_tiles = unpack_field(command_data[3], 16, 0);
+    }
+
+    // 16 words = 64 bytes, matching the CMD_STREAM_BINARY_OPERATION entry in cmds.def
+    std::array<std::uint32_t, 16> create_cmd_stream_binary()
+    {
+        std::array<std::uint32_t, 16> cmd;
+        cmd.fill(0);
+
+        cmd[0] = pack_32b_field(kernel_id_math, 16, 0) |
+                 pack_32b_field(math_fidelity, 4, 16) |
+                 pack_32b_field(repack_Z_dim_ratio_log2, 3, 20) |
+                 pack_32b_field(CMD_STREAM_BINARY_OPERATION, 8, 24);
+
+        cmd[1] = pack_32b_field(kernel_id_unpacker, 16, 0) |
+                 pack_32b_field(kernel_id_packer, 16, 16);
+
+        cmd[2] = pack_32b_field(output_stream_id[0], 8, 0) |
+                 pack_32b_field(output_stream_id[1], 8, 8) |
+                 pack_32b_field(output_stream_id[2], 8, 16) |
+                 pack_32b_field(output_stream_id[3], 8, 24);
+
+        cmd[3] = pack_32b_field(input_A_stream_id, 8, 24) |
+                 pack_32b_field(input_B_stream_id, 8, 16) |
+                 pack_32b_field(num_activation_tiles, 16, 0);
+
+        return cmd;
+    }
+};
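+
+// As an aside to the initialization note above: for plain aggregates without
+// user-provided constructors (such as the *_added helper structs below), the
+// same zeroing falls out of value-initialization at the call site, e.g.
+// (hypothetical usage):
+//
+//     TernaryOperationParams_added extra{};   // all members zero-initialized
+//
+// with neither a memset nor a hand-maintained initializer list to keep in sync.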
+
+struct StreamPoolParams {
+    StreamPoolParams() { std::memset(this, 0, sizeof(StreamPoolParams)); }
+    std::uint32_t kernel_id_unpacker;
+    std::uint32_t kernel_id_math;
+    std::uint32_t kernel_id_packer;
+
+    std::uint16_t math_fidelity;
+    std::uint16_t result32b;
+    std::uint32_t repack_Z_dim_ratio_log2;
+    std::uint16_t neginf_srca;
+    std::uint16_t avg_pool_coeff;
+    std::uint8_t int8_mode;
+    std::uint8_t rnd_unsigned_int8;
+
+    std::uint32_t input_A_stream_id;
+    std::uint32_t input_A_phase_id;
+
+    std::uint32_t num_activation_tiles;
+
+    bool unpack_halo_strips[4];
+
+    std::uint32_t input_B_l1_addr;
+
+    std::uint16_t y_start;
+    std::uint16_t halo_dim;
+    std::uint8_t halo_y_top = 0;
+    std::uint8_t halo_y_bot = 0;
+    std::uint8_t halo_y_offset = 0;
+
+    std::uint32_t divide_factor = 0;
+
+    void parse(std::uint32_t* command_data) {
+        kernel_id_math = unpack_field(command_data[0], 16, 0);
+        math_fidelity = unpack_field(command_data[0], 4, 16);
+        result32b = unpack_field(command_data[0], 1, 19);
+        repack_Z_dim_ratio_log2 = unpack_field(command_data[0], 3, 20);
+        neginf_srca = unpack_field(command_data[0], 1, 23);
+
+        kernel_id_unpacker = unpack_field(command_data[1], 16, 0);
+        kernel_id_packer = unpack_field(command_data[1], 16, 16);
+
+        input_A_stream_id = unpack_field(command_data[2], 16, 0);
+        num_activation_tiles = unpack_field(command_data[2], 16, 16);
+
+        unpack_halo_strips[3] = unpack_field(command_data[3], 1, 23);
+        unpack_halo_strips[2] = unpack_field(command_data[3], 1, 22);
+        unpack_halo_strips[1] = unpack_field(command_data[3], 1, 21);
+        unpack_halo_strips[0] = unpack_field(command_data[3], 1, 20);
+        halo_dim = unpack_field(command_data[3], 4, 28);
+
+        input_B_l1_addr = unpack_address(unpack_field(command_data[3], 16, 0));
+        input_A_phase_id = command_data[4];
+
+        y_start = unpack_field(command_data[5], 16, 0);
+        halo_y_top = unpack_field(command_data[5], 6, 0);
+        halo_y_bot = unpack_field(command_data[5], 6, 6);
+        halo_y_offset = unpack_field(command_data[5], 4, 12);
+
+        avg_pool_coeff = unpack_field(command_data[6], 16, 0);
+        int8_mode = unpack_field(command_data[6], 1, 16);
+        rnd_unsigned_int8 = unpack_field(command_data[6], 1, 17);
+
+        if (halo_y_bot == 0) {
+            halo_y_top = 16 - y_start;
+            halo_y_bot = 2 * halo_dim + y_start;
+        }
+
+        divide_factor = command_data[7];
+    }
+
+    std::uint32_t halo_y_spec()
+    {
+        if ((halo_y_top + halo_y_bot) > 0)
+            return pack_32b_field((uint32_t)halo_y_top, 6, 0) |
+                   pack_32b_field((uint32_t)halo_y_bot, 6, 6) |
+                   pack_32b_field((uint32_t)halo_y_offset, 4, 12);
+
+        return pack_32b_field((uint32_t)y_start, 12, 0) |
+               pack_32b_field((uint32_t)halo_y_offset, 4, 12);
+    }
+
+    // 8 words = 32 bytes, matching the CMD_STREAM_POOL entry in cmds.def
+    std::array<std::uint32_t, 8> create_cmd_stream_pool() {
+        std::array<std::uint32_t, 8> cmd;
+        cmd.fill(0);
+
+        cmd[0] = pack_32b_field(kernel_id_math, 16, 0) |
+                 pack_32b_field(math_fidelity, 3, 16) |
+                 pack_32b_field(result32b, 1, 19) |
+                 pack_32b_field(repack_Z_dim_ratio_log2, 3, 20) |
+                 pack_32b_field(neginf_srca, 1, 23) |
+                 pack_32b_field(CMD_STREAM_POOL, 8, 24);
+
+        cmd[1] = pack_32b_field(kernel_id_unpacker, 16, 0) |
+                 pack_32b_field(kernel_id_packer, 16, 16);
+
+        cmd[2] = pack_32b_field(input_A_stream_id, 16, 0) |
+                 pack_32b_field(num_activation_tiles, 16, 16);
+
+        cmd[3] = pack_32b_field(halo_dim, 4, 28) |
+                 pack_32b_field((uint32_t)unpack_halo_strips[3], 1, 23) |
+                 pack_32b_field((uint32_t)unpack_halo_strips[2], 1, 22) |
+                 pack_32b_field((uint32_t)unpack_halo_strips[1], 1, 21) |
+                 pack_32b_field((uint32_t)unpack_halo_strips[0], 1, 20) |
+                 pack_32b_field(pack_address(input_B_l1_addr), 16, 0);
+
+        cmd[4] = pack_32b_field(input_A_phase_id, 32, 0);
+
+        cmd[5] = pack_32b_field(halo_y_spec(), 16, 0);
+
+        cmd[6] = pack_32b_field(avg_pool_coeff, 16, 0) |
+                 pack_32b_field(int8_mode, 1, 16) |
+                 pack_32b_field(rnd_unsigned_int8, 1, 17);
+
+        cmd[7] = pack_32b_field(divide_factor, 32, 0);
+
+        return cmd;
+    }
+};
+
+struct BinaryOperationParams_added
+{
+    std::uint32_t input_B_fifo_address;
+    std::uint16_t int_avg_pool_coeff;
+    std::uint32_t divide_factor;
+    bool int8_mode;
+};
+
+struct TernaryOperationParams_added
+{
+    std::uint32_t input_B_section_id;
+    std::uint32_t input_C_section_id; // Used for Bias, for example
+    std::uint32_t input_C_fifo_address;
+};
+
+struct TernaryOperationWithBiasParams_added
+{
+    std::uint32_t bias_kernel_address;
+};
+
+struct QuinaryOperationParams_added
+{
+    std::uint32_t input_D_section_id;
+    std::uint32_t input_D_fifo_address;
+
+    std::uint32_t input_E_section_id;
+    std::uint32_t input_E_fifo_address;
+};
+
+struct SliceZParams_added
+{
+    std::uint32_t start_index;
+    std::uint32_t length;
+    std::uint32_t output_size_16B;
+};
+
+struct StreamFullConnParams
+{
+    std::uint32_t kernel_id_unpacker;
+    std::uint32_t kernel_id_math;
+    std::uint32_t kernel_id_packer;
+
+    std::uint8_t activation_stream_id;
+    std::uint32_t activation_phase_id;
+    std::uint32_t weight_l1_address;
+    std::uint32_t weight_offset;
+
+    std::uint8_t iterations;
+    std::uint32_t batch_size;
+
+    void parse(const std::uint32_t *command_data) {
+        kernel_id_math = unpack_field(command_data[0], 16, 0);
+        kernel_id_unpacker = unpack_field(command_data[1], 16, 0);
+        kernel_id_packer = unpack_field(command_data[1], 16, 16);
+
+        weight_l1_address = unpack_address(unpack_field(command_data[2], 16, 0));
+        activation_stream_id = unpack_field(command_data[2], 8, 16);
+        activation_phase_id = command_data[3];
+
+        weight_offset = command_data[4];
+
+        iterations = unpack_field(command_data[0], 8, 16);
+        batch_size = unpack_field(command_data[2], 8, 24);
+    }
+
+    // 8 words = 32 bytes, matching the CMD_STREAM_FULLCONN entry in cmds.def
+    std::array<std::uint32_t, 8> create_cmd_stream_fullconn() {
+        std::array<std::uint32_t, 8> cmd;
+        cmd.fill(0);
+
+        cmd[0] = pack_32b_field(kernel_id_math, 16, 0)
+               | pack_32b_field(iterations, 8, 16)
+               | pack_32b_field(CMD_STREAM_FULLCONN, 8, 24);
+
+        cmd[1] = pack_32b_field(kernel_id_unpacker, 16, 0)
+               | pack_32b_field(kernel_id_packer, 16, 16);
+
+        cmd[2] = pack_32b_field(pack_address(weight_l1_address), 16, 0)
+               | pack_32b_field(activation_stream_id, 8, 16)
+               | pack_32b_field(batch_size, 8, 24);
+
+        cmd[3] = pack_32b_field(activation_phase_id, 32, 0);
+
+        cmd[4] = pack_32b_field(weight_offset, 32, 0);
+
+        return cmd;
+    }
+};
diff --git a/tt_metal/hw/inc/blackhole/cmds.def b/tt_metal/hw/inc/blackhole/cmds.def
new file mode 100644
index 000000000000..fb4f14504d9b
--- /dev/null
+++ b/tt_metal/hw/inc/blackhole/cmds.def
@@ -0,0 +1,43 @@
+CMD_DECODE_DEFINE(CMD_REMOTE_FIFO_PUSH              , 32)
+CMD_DECODE_DEFINE(CMD_REMOTE_FIFO_BROADCAST_PUSH    , 32)
+CMD_DECODE_DEFINE(CMD_WAIT_FOR_SECTION              , 32)
+CMD_DECODE_DEFINE(CMD_POP_SECTION                   , 16)
+CMD_DECODE_DEFINE(CMD_INIT_CONV                     , 32)
+CMD_DECODE_DEFINE(CMD_INIT_FIFO                     , 32)
+CMD_DECODE_DEFINE(CMD_UNPACKER_CONFIG               , 32)
+CMD_DECODE_DEFINE(CMD_PACKER_CONFIG                 , 32)
+CMD_DECODE_DEFINE(CMD_LOOP_CONFIG                   , 32)
+CMD_DECODE_DEFINE(CMD_RUN_CONV                      , 16)
+CMD_DECODE_DEFINE(CMD_RUN_HALOIZE                   , 16)
+CMD_DECODE_DEFINE(CMD_RUN_UNARY_OPERATION           , 16)
+CMD_DECODE_DEFINE(CMD_RUN_BINARY_OPERATION          , 16)
+CMD_DECODE_DEFINE(CMD_RUN_CONCAT_Z                  , 32)
+CMD_DECODE_DEFINE(CMD_RUN_HALOIZE9                  , 32)
+CMD_DECODE_DEFINE(CMD_RUN_TERNARY_OPERATION         , 32)
+CMD_DECODE_DEFINE(CMD_RUN_FULLCONN                  , 32)
+CMD_DECODE_DEFINE(CMD_RUN_SLICE_Z                   , 32)
+CMD_DECODE_DEFINE(CMD_RECONFIGURE_NOC               , 16)
+CMD_DECODE_DEFINE(CMD_TEST_MESSAGE                  , 32)
+CMD_DECODE_DEFINE(CMD_INIT_BINARY_OPERATION         , 16)
+CMD_DECODE_DEFINE(CMD_INIT_BINARY_OPERATION8        , 32)
+CMD_DECODE_DEFINE(CMD_INIT_HALOIZE                  , 32)
+CMD_DECODE_DEFINE(CMD_CREATE_SECTION_FROM_SS        , 16)
+CMD_DECODE_DEFINE(CMD_CREATE_SS_FROM_SECTION        , 16)
+CMD_DECODE_DEFINE(CMD_STREAM_FROM_SS                , 16)
+CMD_DECODE_DEFINE(CMD_FLIP_STATE_ID                 , 16)
+CMD_DECODE_DEFINE(CMD_RUN_T6_INSTRUCTIONS           , 32)
+CMD_DECODE_DEFINE(CMD_TRISC_SYNC                    , 16)
+CMD_DECODE_DEFINE(CMD_EDGE_MASK_CONFIG              , 32)
+CMD_DECODE_DEFINE(CMD_MEGA_CONFIG                   , 5*32+16)
+CMD_DECODE_DEFINE(CMD_BASE_ALIGN_STREAM_BUFFER_DATA , 16)
+CMD_DECODE_DEFINE(CMD_STREAM_UNARY_OPERATION        , 32)
+CMD_DECODE_DEFINE(CMD_STREAM_BINARY_OPERATION       , 64)
+CMD_DECODE_DEFINE(CMD_STREAM_CONV_2                 , 32)
+CMD_DECODE_DEFINE(CMD_STREAM_POOL                   , 32)
+CMD_DECODE_DEFINE(CMD_RESEND_STREAM                 , 16)
+CMD_DECODE_DEFINE(CMD_STREAM_FULLCONN               , 32)
+CMD_DECODE_DEFINE(CMD_PACK                          , 384)
+CMD_DECODE_DEFINE(CMD_MEGA_CONFIG_EXTENDED          , 9*32)
+CMD_DECODE_DEFINE(CMD_LOOP_CMD_QUEUE                , 16)
+CMD_DECODE_DEFINE(CMD_NOC_COPY                      , 32)
+CMD_DECODE_DEFINE(CMD_LOAD_NCRISC_FW                , 20*4)
diff --git a/tt_metal/hw/inc/blackhole/dev_mem_map.h b/tt_metal/hw/inc/blackhole/dev_mem_map.h
new file mode 100644
index 000000000000..2f0f7910a165
--- /dev/null
+++ b/tt_metal/hw/inc/blackhole/dev_mem_map.h
@@ -0,0 +1,101 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+// This file contains the memory map for the tensix device
+//
+// It is included on the device, on the host and in linker scripts to
+// serve as a single source of truth for memory layout for both hw design and
+// sw convention. The requirement of including this in linker scripts
+// requires that everything within be handled by the C pre-processor, hence
+// the use of #define
+//
+// Before adding a define here, read the following:
+// 1) Any "truly global" address must be specified explicitly here. Truly
+// global addresses are addresses that are referenced on both the host and
+// device
+// 2) Memory section sizes must be specified here, these are used in the
+// linker scripts
+// 3) Device static/global variables generally should NOT be listed here. If
+// they are global to a core, declare them in that core's source code and
+// tag them if needed with a section (e.g., "l1_data")
+//
+
+/////////////
+// RISC-V Address map definition (hardware)
+#define MEM_L1_BASE 0x0
+#define MEM_L1_SIZE (1464 * 1024)
+
+#define MEM_ETH_BASE 0x0
+#define MEM_ETH_SIZE (256 * 1024)
+
+#define MEM_LOCAL_BASE 0xFFB00000
+#define MEM_BRISC_LOCAL_SIZE (4 * 1024)
+#define MEM_NCRISC_LOCAL_SIZE (4 * 1024)
+#define MEM_IERISC_LOCAL_SIZE (4 * 1024)
+#define MEM_TRISC_LOCAL_SIZE (2 * 1024)
+
+#define MEM_NCRISC_IRAM_BASE 0xFFC00000
+#define MEM_NCRISC_IRAM_SIZE (16 * 1024)
+
+/////////////
+// Firmware/kernel code holes
+#define MEM_BOOT_CODE_SIZE 4
+#define MEM_BRISC_FIRMWARE_SIZE (10 * 1024)
+#define MEM_NCRISC_FIRMWARE_SIZE (16 * 1024)
+#define MEM_IERISC_FIRMWARE_SIZE (16 * 1024)
+#define MEM_TRISC0_SIZE (16 * 1024)
+#define MEM_TRISC1_SIZE (16 * 1024)
+#define MEM_TRISC2_SIZE (16 * 1024)
+#define MEM_ZEROS_SIZE 512
+
+#define MEM_BOOT_CODE_BASE 0
+#define MEM_MAILBOX_BASE 20
+#define MEM_MAILBOX_END (MEM_MAILBOX_BASE + 128)
+#define MEM_IERISC_MAILBOX_BASE 1024
+#define MEM_IERISC_MAILBOX_END (MEM_IERISC_MAILBOX_BASE + 128)
+#define MEM_ZEROS_BASE 2048
+#define MEM_BRISC_FIRMWARE_BASE (MEM_ZEROS_BASE + MEM_ZEROS_SIZE)
+#define MEM_NCRISC_FIRMWARE_BASE MEM_NCRISC_IRAM_BASE
+#define MEM_IERISC_FIRMWARE_BASE 8192
+#define MEM_TRISC0_BASE (MEM_BRISC_FIRMWARE_BASE + MEM_BRISC_FIRMWARE_SIZE)
+#define MEM_TRISC1_BASE (MEM_TRISC0_BASE + MEM_TRISC0_SIZE)
+#define MEM_TRISC2_BASE (MEM_TRISC1_BASE + MEM_TRISC1_SIZE)
+
+// These are used in ncrisc-halt.S, asserted in ncrisc.cc to be valid
+// Better way to do this would be to generate a file w/ these addresses
+#define MEM_NCRISC_HALT_STACK_MAILBOX_ADDRESS MEM_MAILBOX_BASE + 4
+#define MEM_SLAVE_RUN_MAILBOX_ADDRESS MEM_MAILBOX_BASE + 28
+
+/////////////
+// Initialization relocation L1 memory
+// Host downloads to these addresses, fw copies to destination
+// Note: using xmov to copy ncrisc to addresses above 1M hangs the chip
+#define MEM_BRISC_INIT_LOCAL_L1_BASE (MEM_TRISC2_BASE + MEM_TRISC2_SIZE)
+#define MEM_NCRISC_INIT_LOCAL_L1_BASE (MEM_BRISC_INIT_LOCAL_L1_BASE + MEM_BRISC_LOCAL_SIZE)
+#define MEM_TRISC0_INIT_LOCAL_L1_BASE (MEM_NCRISC_INIT_LOCAL_L1_BASE + MEM_NCRISC_LOCAL_SIZE)
+#define MEM_TRISC1_INIT_LOCAL_L1_BASE (MEM_TRISC0_INIT_LOCAL_L1_BASE + MEM_TRISC_LOCAL_SIZE)
+#define MEM_TRISC2_INIT_LOCAL_L1_BASE (MEM_TRISC1_INIT_LOCAL_L1_BASE + MEM_TRISC_LOCAL_SIZE)
+
+#define MEM_NCRISC_INIT_IRAM_L1_BASE (MEM_TRISC2_INIT_LOCAL_L1_BASE + MEM_TRISC_LOCAL_SIZE)
+
+#define MEM_IERISC_INIT_LOCAL_L1_BASE (MEM_IERISC_FIRMWARE_BASE + MEM_IERISC_FIRMWARE_SIZE)
+
+/////////////
+// Stack info
+// Increasing the stack size comes at the expense of less local memory for globals
+#define MEM_BRISC_STACK_SIZE 1024
+#define MEM_NCRISC_STACK_SIZE 1024
+#define MEM_IERISC_STACK_SIZE 1024
+#define MEM_TRISC0_STACK_SIZE 256
+#define MEM_TRISC1_STACK_SIZE 256
+#define MEM_TRISC2_STACK_SIZE 768
+
+#define MEM_BRISC_STACK_BASE (MEM_LOCAL_BASE + MEM_BRISC_LOCAL_SIZE - MEM_BRISC_STACK_SIZE)
+#define MEM_NCRISC_STACK_BASE (MEM_LOCAL_BASE + MEM_NCRISC_LOCAL_SIZE - MEM_NCRISC_STACK_SIZE)
+#define MEM_IERISC_STACK_BASE (MEM_LOCAL_BASE + MEM_IERISC_LOCAL_SIZE - MEM_IERISC_STACK_SIZE)
+#define MEM_TRISC0_STACK_BASE (MEM_LOCAL_BASE + MEM_TRISC_LOCAL_SIZE - MEM_TRISC0_STACK_SIZE)
+#define MEM_TRISC1_STACK_BASE (MEM_LOCAL_BASE + MEM_TRISC_LOCAL_SIZE - MEM_TRISC1_STACK_SIZE)
+#define MEM_TRISC2_STACK_BASE (MEM_LOCAL_BASE + MEM_TRISC_LOCAL_SIZE - MEM_TRISC2_STACK_SIZE)
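+
+// Worked example of how the bases above compose (illustrative arithmetic only,
+// derived from the #defines in this file):
+//   MEM_BRISC_FIRMWARE_BASE = MEM_ZEROS_BASE + MEM_ZEROS_SIZE = 2048 + 512 = 2560
+//   MEM_TRISC0_BASE = 2560 + MEM_BRISC_FIRMWARE_SIZE = 2560 + 10240 = 12800
+//   MEM_TRISC1_BASE = 12800 + MEM_TRISC0_SIZE = 12800 + 16384 = 29184
+//   MEM_BRISC_STACK_BASE = 0xFFB00000 + 4096 - 1024 = 0xFFB00C00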
diff --git a/tt_metal/hw/inc/blackhole/dev_mem_map_versim.h b/tt_metal/hw/inc/blackhole/dev_mem_map_versim.h
new file mode 100644
index 000000000000..a3243bad9225
--- /dev/null
+++ b/tt_metal/hw/inc/blackhole/dev_mem_map_versim.h
@@ -0,0 +1,11 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "dev_mem_map.h"
+// This is to support some deprecated mappings for versim to get the build up
+#ifndef TT_METAL_VERSIM_DISABLED
+#define TEST_MAILBOX_ADDRESS MEM_MAILBOX_BASE
+#endif
diff --git a/tt_metal/hw/inc/blackhole/dram_address_map.h b/tt_metal/hw/inc/blackhole/dram_address_map.h
new file mode 100644
index 000000000000..7438fe4d9f75
--- /dev/null
+++ b/tt_metal/hw/inc/blackhole/dram_address_map.h
@@ -0,0 +1,40 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <cstdint>
+
+#include "dev_mem_map.h"
+
+namespace dram_mem {
+
+struct address_map {
+    // Sizes
+
+    // Actual memory allocated to each bank for perf is 39896 * 1024.
+    // It is reduced below for faster dram perf dump.
+    // This can be increased to a maximum of 39896 * 1024 if more space is needed.
+    static constexpr std::int32_t DRAM_EACH_BANK_PERF_BUFFER_SIZE = 4800 * 1024;
+    static constexpr std::int32_t FW_DRAM_BLOCK_SIZE =
+        MEM_TRISC0_SIZE + MEM_TRISC1_SIZE + MEM_TRISC2_SIZE;
+
+    // Ensure values are in sync until l1_mem::address_map::FW_DRAM_BLOCK_SIZE is retired
+    static_assert(l1_mem::address_map::FW_DRAM_BLOCK_SIZE == FW_DRAM_BLOCK_SIZE);
+
+    // Base addresses
+
+    static constexpr std::int32_t DRAM_EACH_BANK_PERF_BUFFER_BASE = 1024 * 1024;
+
+    static constexpr std::int32_t TRISC_BASE = 0;
+    static constexpr std::int32_t TRISC0_BASE = TRISC_BASE;
+    static constexpr std::int32_t TRISC1_BASE = TRISC0_BASE + MEM_TRISC0_SIZE;
+    static constexpr std::int32_t TRISC2_BASE = TRISC1_BASE + MEM_TRISC1_SIZE;
+    static constexpr std::int32_t OVERLAY_BLOB_BASE = TRISC2_BASE + MEM_TRISC2_SIZE;
+
+    static_assert((TRISC0_BASE + MEM_TRISC0_SIZE) < FW_DRAM_BLOCK_SIZE);
+    static_assert((TRISC1_BASE + MEM_TRISC1_SIZE) < FW_DRAM_BLOCK_SIZE);
+    static_assert((TRISC2_BASE + MEM_TRISC2_SIZE) < FW_DRAM_BLOCK_SIZE);
+};
+} // namespace dram_mem
diff --git a/tt_metal/hw/inc/blackhole/eth_l1_address_map.h b/tt_metal/hw/inc/blackhole/eth_l1_address_map.h
new file mode 100644
index 000000000000..8fe6757d4326
--- /dev/null
+++ b/tt_metal/hw/inc/blackhole/eth_l1_address_map.h
@@ -0,0 +1,96 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <cstdint>
+
+#include "tt_metal/hostdevcommon/common_runtime_address_map.h"
+
+namespace eth_l1_mem {
+
+
+struct address_map {
+
+    static constexpr std::int32_t MAX_SIZE = 256 * 1024;
+    static constexpr std::int32_t MAX_L1_LOADING_SIZE = 1 * 256 * 1024;
+
+    // Sizes
+    static constexpr std::int32_t FIRMWARE_SIZE = 32 * 1024;
+    static constexpr std::int32_t COMMAND_Q_SIZE = 4 * 1024;
+    static constexpr std::int32_t DATA_BUFFER_SIZE_HOST = 4 * 1024;
+    static constexpr std::int32_t DATA_BUFFER_SIZE_ETH = 4 * 1024;
+    static constexpr std::int32_t DATA_BUFFER_SIZE_NOC = 16 * 1024;
+    static constexpr std::int32_t DATA_BUFFER_SIZE = 24 * 1024;
+    static constexpr std::int32_t TOOLS_MAX_L1_SIZE = 3 * 1024;
+    static constexpr std::int32_t ERISC_L1_ARGS_SIZE = 96 * 4;
+
+    // Base addresses
+    static constexpr std::int32_t FIRMWARE_BASE = 0x9040;
+    static constexpr std::int32_t L1_EPOCH_Q_BASE = 0x9000; // Epoch Q start in L1.
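+    // Worked example of the derived bases (illustrative arithmetic only, using
+    // the constants above and below):
+    //   COMMAND_Q_BASE          = 0x9000  + 32 KB (0x8000) = 0x11000
+    //   DATA_BUFFER_BASE        = 0x11000 + 4 KB  (0x1000) = 0x12000
+    //   TILE_HEADER_BUFFER_BASE = 0x12000 + 24 KB (0x6000) = 0x18000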
+    static constexpr std::int32_t COMMAND_Q_BASE = L1_EPOCH_Q_BASE + FIRMWARE_SIZE;
+    static constexpr std::int32_t DATA_BUFFER_BASE = COMMAND_Q_BASE + COMMAND_Q_SIZE;
+    static constexpr std::int32_t TILE_HEADER_BUFFER_BASE = DATA_BUFFER_BASE + DATA_BUFFER_SIZE;
+
+    // TT Metal Specific
+    static constexpr std::int32_t ERISC_FIRMWARE_SIZE = 2 * 1024;
+    // Total 160 * 1024 L1 starting from TILE_HEADER_BUFFER_BASE
+    // - 1 * 1024 misc args
+    // - 53 * 1024 eth app reserved buffer space
+    // - 106 * 1024 L1 unreserved buffer space
+    static constexpr std::int32_t MAX_NUM_CONCURRENT_TRANSACTIONS = 8;
+    static constexpr std::int32_t ERISC_BARRIER_SIZE = 32;
+    static constexpr std::int32_t ERISC_APP_ROUTING_INFO_SIZE = 48;
+    static constexpr std::int32_t ERISC_APP_SYNC_INFO_SIZE = 160 + 16 * MAX_NUM_CONCURRENT_TRANSACTIONS;
+
+    static constexpr std::int32_t ERISC_BARRIER_BASE = TILE_HEADER_BUFFER_BASE;
+    static constexpr std::int32_t ERISC_APP_ROUTING_INFO_BASE = ERISC_BARRIER_BASE + ERISC_BARRIER_SIZE;
+    static constexpr std::int32_t ERISC_APP_SYNC_INFO_BASE = ERISC_APP_ROUTING_INFO_BASE + ERISC_APP_ROUTING_INFO_SIZE;
+    static constexpr std::uint32_t SEMAPHORE_BASE = ERISC_APP_SYNC_INFO_BASE + ERISC_APP_SYNC_INFO_SIZE;
+
+    static constexpr uint32_t ISSUE_CQ_CB_BASE = SEMAPHORE_BASE + SEMAPHORE_SIZE; // SIZE from shared common addr
+    static constexpr uint32_t COMPLETION_CQ_CB_BASE = ISSUE_CQ_CB_BASE + 7 * L1_ALIGNMENT;
+
+    // Extra 4 bytes is to make sure that the launch message is properly aligned.
+    static constexpr std::int32_t ERISC_MEM_MAILBOX_BASE = COMPLETION_CQ_CB_BASE + 7 * L1_ALIGNMENT + 4;
+    // erisc early exit functionality re-uses mailboxes_t::ncrisc_halt_msg_t::stack_save memory
+    static constexpr std::int32_t ERISC_MEM_MAILBOX_STACK_SAVE = ERISC_MEM_MAILBOX_BASE + 4;
+
+    static constexpr std::int32_t ERISC_RING_BUFFER_ADDR = ERISC_MEM_MAILBOX_BASE + 128 + 12;
+    static constexpr std::int32_t PRINT_BUFFER_ER = ERISC_RING_BUFFER_ADDR + RING_BUFFER_SIZE;
+    static constexpr std::uint32_t PROFILER_L1_BUFFER_ER = PRINT_BUFFER_ER + 256;
+    static constexpr std::uint32_t PROFILER_L1_BUFFER_CONTROL = PROFILER_L1_BUFFER_ER + PROFILER_L1_BUFFER_SIZE;
+
+    static constexpr std::int32_t ERISC_L1_ARG_BASE = PROFILER_L1_BUFFER_CONTROL + PROFILER_L1_CONTROL_BUFFER_SIZE;
+
+    static_assert(ERISC_L1_ARG_BASE - ERISC_RING_BUFFER_ADDR <= TOOLS_MAX_L1_SIZE);
+    static_assert((ERISC_RING_BUFFER_ADDR % 32) == 0);
+    static_assert((PRINT_BUFFER_ER % 32) == 0);
+    static_assert((PROFILER_L1_BUFFER_ER % 32) == 0);
+    static_assert((PROFILER_L1_BUFFER_CONTROL % 32) == 0);
+
+    static constexpr std::int32_t ERISC_L1_UNRESERVED_BASE = ERISC_L1_ARG_BASE + ERISC_L1_ARGS_SIZE;
+    static constexpr std::int32_t ERISC_L1_UNRESERVED_SIZE = MAX_L1_LOADING_SIZE - ERISC_L1_UNRESERVED_BASE;
+
+    static_assert((ERISC_L1_UNRESERVED_BASE % 32) == 0);
+
+    static constexpr std::int32_t LAUNCH_ERISC_APP_FLAG = L1_EPOCH_Q_BASE + 4;
+
+    // BIDIR Tunneling Kernel Space
+    static constexpr std::int32_t ERISC_L1_TUNNEL_BUFFER_SIZE = ERISC_L1_UNRESERVED_SIZE / 2;
+
+    template <std::int32_t A, std::int32_t B>
+    struct TAssertEquality {
+        static_assert(A == B, "Not equal");
+        static constexpr bool _cResult = (A == B);
+    };
+
+    static constexpr std::int32_t RISC_LOCAL_MEM_BASE = 0xffb00000; // Actual local memory address as seen from risc firmware
+    // As part of the init risc firmware will copy local memory data from
+    // l1 locations listed above into internal local memory that starts
+    // at RISC_LOCAL_MEM_BASE address
+
+    static constexpr std::uint32_t FW_VERSION_ADDR = 0x210;
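+
+    // Illustrative (hypothetical) use of TAssertEquality above: instantiating
+    //   TAssertEquality<ERISC_BARRIER_BASE % 32, 0>
+    // fails to compile unless ERISC_BARRIER_BASE is 32-byte aligned, mirroring
+    // the static_asserts earlier in this struct.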
+};
+} // namespace eth_l1_mem
diff --git a/tt_metal/hw/inc/blackhole/noc/noc.h b/tt_metal/hw/inc/blackhole/noc/noc.h
new file mode 100644
index 000000000000..dd33c37a1fad
--- /dev/null
+++ b/tt_metal/hw/inc/blackhole/noc/noc.h
@@ -0,0 +1,472 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef _NOC_H_
+#define _NOC_H_
+
+#include <stdint.h>
+#include <stdbool.h>
+
+//////
+
+#include "noc_parameters.h"
+
+
+/*
+
+  Basic NOC API
+
+  Common arguments:
+
+  * bool linked: all sequences of function calls with “linked” set to true
+    targeting the same destination will manifest on the NoC as a single
+    multi-command packet, guaranteeing they complete in-order. For commands
+    targeting different destinations, it is not possible to provide this
+    guarantee.
+
+    Moreover, it is not possible to provide linked ordering between different
+    unicast/multicast classes of NOC virtual channels.
+
+  * unicast addresses: all unicast address arguments are given as 40-bit
+    addresses (type uint64_t):
+      - bits [31:0] = byte offset in local L1 memory,
+      - bits [35:32]/[39:36] = X/Y coordinate prefixes.
+
+    The addresses can be provided using the above macro NOC_XY_ADDR. For
+    example, address 0x1000 in L1 of Tensix (1, 2) can be given as
+    NOC_XY_ADDR(1, 2, 0x1000).
+
+  * multicast addresses: all multicast address arguments are given as a 48-bit
+    combination of 32-bit local address and coordinates of the upper left and
+    lower right corners of the multicast rectangle.
+
+    The address can be provided using the macro NOC_MULTICAST_ADDR above.
+    For example, using NOC_MULTICAST_ADDR(1, 4, 6, 5) will multicast to
+    12 destinations, i.e. all those with X coordinates between 1 and 6 and
+    Y-coordinates between 4 and 5 (inclusive).
+
+  All addresses are in the form of byte offsets, regardless of the minimal
+  access granularity. Address bits below the minimal access granularity are
+  ignored.
+
+*/
+
+
+/*
+  Copy data from source to destination address. Supports narrow transfers
+  (size not a multiple of 16 bytes). However, the alignment of source and
+  destination start addresses (i.e., bits [3:0]) must be identical. If the
+  alignment is not identical, the one from destination address is assumed.
+
+  If copying from local memory to a remote destination, set posted=false to
+  request an ack that will increment the NIU_MST_WR_ACK_RECEIVED counter.
+  This value can be compared with NIU_MST_NONPOSTED_WR_REQ_STARTED to ensure
+  the writes are flushed. (Note that copying more than NOC_MAX_BURST_SIZE
+  triggers multiple underlying NOC requests.)
+
+  If src_addr is remote, the request is always posted and the parameter is
+  ignored.
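+
+  Illustrative call (a sketch; src_xy/dst_xy stand for packed NOC coordinates
+  and the addresses/sizes are placeholders):
+
+    noc_copy(src_xy, 0x10000, dst_xy, 0x20000, 1024,
+             false /*linked*/, false /*posted*/, false /*static_vc_alloc*/,
+             0 /*static_vc*/, 0 /*round-robin*/, 0 /*transaction_id*/);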
+
+  <src_coordinate>  => NOC ID portion of source address (unicast)
+  <src_addr>        => source address (unicast)
+  <dst_coordinate>  => NOC ID portion of destination address (unicast)
+  <dst_addr>        => destination address (unicast)
+  <size>            => number of bytes to copy
+  <linked>          => link with previous call for ordering
+  <posted>          => if copying from a local address, avoid sending ack on the
+                       response channel
+  <static_vc_alloc> => use static VC allocation
+  <static_vc>       => use VC 0/1 for static request; don't-care if static_vc_alloc=0
+  <vc_arb_priority> => arbitration priority for VC allocation;
+                       set to 0 to disable arbitration priority & use round-robin always
+  <transaction_id   => optional ID tag for the outgoing request (0-15, used for
+                       selective transaction flush)>
+*/
+void noc_copy(uint32_t src_coordinate, uint64_t src_addr, uint32_t dst_coordinate, uint64_t dst_addr, uint32_t size, bool linked, bool posted, bool static_vc_alloc, uint32_t static_vc, uint32_t vc_arb_priority, uint8_t transaction_id);
+
+
+/*
+  Copy data from source to destination address and accumulate at destination.
+  Supports narrow transfers (size not a multiple of 16 bytes).
+  However, the alignment of source and destination start addresses (i.e., bits [3:0])
+  must be identical. If the alignment is not identical, the one from destination
+  address is assumed.
+
+  If copying from local memory to a remote destination, set posted=false to
+  request an ack that will increment the NIU_MST_WR_ACK_RECEIVED counter.
+  This value can be compared with NIU_MST_NONPOSTED_WR_REQ_STARTED to ensure
+  the writes are flushed. (Note that copying more than NOC_MAX_BURST_SIZE
+  triggers multiple underlying NOC requests.)
+
+  If src_addr is remote, the request is always posted and the parameter is
+  ignored.
+
+  <src_coordinate>  => NOC ID portion of source address (unicast)
+  <src_addr>        => source address (unicast)
+  <dst_coordinate>  => NOC ID portion of destination address (unicast)
+  <dst_addr>        => destination address (unicast)
+  <size>            => number of bytes to copy
+  <linked>          => link with previous call for ordering
+  <posted>          => if copying from a local address, avoid sending ack on the
+                       response channel
+  <static_vc_alloc> => use static VC allocation
+  <static_vc>       => use VC 0/1 for static request; don't-care if static_vc_alloc=0
+  <vc_arb_priority> => arbitration priority for VC allocation;
+                       set to 0 to disable arbitration priority & use round-robin always
+  <transaction_id   => optional ID tag for the outgoing request (0-15, used for
+                       selective transaction flush)>
+  <data_format      => The format of the data used for accumulation: NOC_AT_ACC_*, e.g. NOC_AT_ACC_FP32>
+  <disable_saturation => Set to disable accumulation saturation, such that values will wrap around if the addition result cannot fit.>
+*/
+void noc_accumulate(uint32_t src_coordinate, uint64_t src_addr, uint32_t dst_coordinate, uint64_t dst_addr, uint32_t size, bool linked, bool posted, bool static_vc_alloc, uint32_t static_vc, bool multicast, uint32_t multicast_mode, uint32_t vc_arb_priority, uint8_t transaction_id, uint8_t data_format, bool disable_saturation);
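+
+/*
+  Illustrative noc_accumulate call (a sketch; placeholder coordinates and
+  addresses, with NOC_AT_ACC_FP32 as the accumulation format named above):
+  accumulate 256 bytes of FP32 partials into a remote buffer, saturation on:
+
+    noc_accumulate(src_xy, 0x10000, dst_xy, 0x20000, 256,
+                   false, false, false, 0, false /*multicast*/, 0 /*multicast_mode*/,
+                   0, 0, NOC_AT_ACC_FP32, false /*disable_saturation*/);
+*/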
+
+/*
+  Copy a single word with byte-enables from source to destination address.
+  Works similarly to noc_copy, except limited to a single-word transfer and
+  provides the option to specify partial byte-enables.
+
+  This call works only with transfers from local memory.
+
+  <src_coordinate>  => NOC ID portion of source address (unicast)
+  <src_addr>        => source address (unicast, must be local memory)
+  <dst_coordinate>  => NOC ID portion of destination address (unicast)
+  <dst_addr>        => destination address (unicast)
+  <be>              => byte enable mask
+  <linked>          => link with previous call for ordering
+  <posted>          => if copying from a local address, avoid sending ack on the
+                       response channel
+  <static_vc_alloc> => use static VC allocation
+  <static_vc>       => use VC 0/1 for static request; don't-care if static_vc_alloc=0
+  <transaction_id   => optional ID tag for the outgoing request (0-15, used for
+                       selective transaction flush)>
+*/
+void noc_copy_word_be(uint32_t src_coordinate, uint64_t src_addr, uint32_t dst_coordinate, uint64_t dst_addr, uint64_t be, bool linked, bool posted, bool static_vc_alloc, uint32_t static_vc, uint8_t transaction_id);
+
+
+/*
+  Write a single 32-bit value using inline header data. (The effect is the
+  same as when writing with noc_copy, however such writes save the bandwidth
+  of an additional flit for register access.)
+
+  <dst_coordinate>  => NOC ID portion of destination address (unicast)
+  <dst_addr>        => destination address (unicast)
+  <be>              => byte enable mask
+  <linked>          => link with previous call for ordering
+  <posted>          => if copying from a local address, avoid sending ack on the
+                       response channel
+  <static_vc_alloc> => use static VC allocation
+  <static_vc>       => use VC 0/1 for static request; don't-care if static_vc_alloc=0
+  <transaction_id   => optional ID tag for the outgoing request (0-15, used for
+                       selective transaction flush)>
+*/
+void noc_write_dw_inline(uint32_t dst_coordinate, uint64_t dst_addr, uint32_t val, uint8_t be, bool linked, bool posted, bool static_vc_alloc, uint32_t static_vc, uint8_t transaction_id);
+
+
+/*
+  Copy data from source to multiple destinations via multicast. Supports
+  narrow transfers (size not a multiple of 16 bytes). However, the alignment
+  of source and destination start addresses (i.e., bits [3:0]) must be identical.
+  If the alignment is not identical, the one from destination address is assumed.
+
+  If copying from local memory and posted=false, a separate ack is received from
+  each destination.
+
+  <src_coordinate>  => NOC ID portion of source address (unicast)
+  <src_addr>        => source address (unicast)
+  <dst_coordinate>  => NOC ID portion of destination address (multicast)
+  <dst_addr>        => destination address (multicast)
+  <multicast_mode>  => multicast direction (0 = x-direction, 1 = y-direction)
+  <size>            => number of bytes to copy
+  <linked>          => link with previous call for ordering
+  <posted>          => if copying from a local address, avoid sending ack on the
+                       response channel
+  <static_vc_alloc> => use static VC allocation
+  <static_vc>       => use VC 0/1 for static request; don't-care if static_vc_alloc=0
+  <transaction_id   => optional ID tag for the outgoing request (0-15, used for
+                       selective transaction flush)>
+*/
+void noc_multicast_copy(uint32_t src_coordinate, uint64_t src_addr, uint32_t dst_coordinate, uint64_t dst_addr, uint32_t multicast_mode, uint32_t size, bool linked, bool posted, bool static_vc_alloc, uint32_t static_vc, uint8_t transaction_id);
+
+// support multicast ability to exclude nodes
+void noc_multicast_copy_exclude(uint32_t src_coordinate, uint64_t src_addr, uint32_t dst_coordinate, uint64_t dst_addr, uint32_t multicast_mode, uint32_t multicast_exclude, uint32_t size, bool linked, bool posted, bool static_vc_alloc, uint32_t static_vc, uint8_t transaction_id);
+
+// support src include
+void noc_multicast_copy_src_include(uint32_t src_coordinate, uint64_t src_addr, uint32_t dst_coordinate, uint64_t dst_addr, uint32_t multicast_mode, uint32_t size, bool linked, bool posted, bool static_vc_alloc, uint32_t static_vc, uint8_t transaction_id);
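+
+/*
+  Illustrative multicast (a sketch; placeholder values): replicate 2 KB from
+  local L1 to the rectangle x in [1, 6], y in [4, 5] (12 destinations, matching
+  the NOC_MULTICAST_ADDR example in the header comment above):
+
+    noc_multicast_copy(src_xy, 0x10000, mcast_rect_xy, 0x20000,
+                       0 /*multicast_mode: x-direction*/, 2048,
+                       false, false, false, 0, 0);
+*/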
+
+
+/*
+  Multicast version of noc_copy_word_be.
+
+  Like noc_copy_word_be, this call works only with transfers from local memory,
+  and is limited to single-word transfers.
+
+  <src_coordinate>  => NOC ID portion of source address (unicast)
+  <src_addr>        => source address (unicast)
+  <dst_coordinate>  => NOC ID portion of destination address (multicast)
+  <dst_addr>        => destination address (multicast)
+  <multicast_mode>  => multicast direction (0 = x-direction, 1 = y-direction)
+  <be>              => byte enable mask
+  <linked>          => link with previous call for ordering
+  <posted>          => if copying from a local address, avoid sending ack on the
+                       response channel
+  <static_vc_alloc> => use static VC allocation
+  <static_vc>       => use VC 0/1 for static request; don't-care if static_vc_alloc=0
+  <transaction_id   => optional ID tag for the outgoing request (0-15, used for
+                       selective transaction flush)>
+*/
+void noc_multicast_copy_word_be(uint32_t src_coordinate, uint64_t src_addr, uint32_t dst_coordinate, uint64_t dst_addr, uint32_t multicast_mode, uint64_t be, bool linked, bool posted, bool static_vc_alloc, uint32_t static_vc, uint8_t transaction_id);
+
+
+/*
+  Multicast version of noc_write_dw_inline.
+
+  <dst_coordinate>  => NOC ID portion of destination address (unicast)
+  <dst_addr>        => destination address (unicast)
+  <multicast_mode>  => multicast direction (0 = x-direction, 1 = y-direction)
+  <be>              => byte enable mask
+  <linked>          => link with previous call for ordering
+  <posted>          => if copying from a local address, avoid sending ack on the
+                       response channel
+  <static_vc_alloc> => use static VC allocation
+  <transaction_id   => optional ID tag for the outgoing request (0-15, used for
+                       selective transaction flush)>
+*/
+void noc_multicast_write_dw_inline(uint32_t dst_coordinate, uint64_t dst_addr, uint32_t val, uint32_t multicast_mode, uint8_t be, bool linked, bool posted, bool static_vc_alloc, uint32_t static_vc, uint8_t transaction_id);
+
+
+/*
+  Atomic wrapping increment of 32-bit value at destination address. The address has
+  4-byte granularity. The increment result wraps around the address aligned relative
+  to the specified wrapping size. Increment is an arbitrary value, while the wrapping
+  limit is calculated from the given <wrap> argument as 2^(<wrap>+1). (Therefore, for 32-bit
+  values, setting <wrap>=31 implies no wrapping except at the 32-bit integer maximum.)
+
+  For example, if:
+     wrap = 7 (wrap to 0x100),
+     incr = 0x80 (increase by 0x80),
+     current value = 0x21C0,
+
+  then the aligned value is 0x2100, and the new value is:
+     0x2100 + ((0x1C0 + 0x80) % 0x100) = 0x2140.
+
+  <noc_coordinate>  => NOC ID portion of addr (unicast)
+  <addr>            => counter address (unicast)
+  <incr>            => increment
+  <wrap>            => log2(wrapping limit)-1
+  <linked>          => link with previous call for ordering
+*/
+void noc_atomic_increment(uint32_t noc_coordinate, uint64_t addr, uint32_t incr, uint32_t wrap, bool linked);
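+
+/*
+  A plain-C sketch of the wrapping rule above (illustrative only, not the
+  hardware implementation):
+
+    uint32_t wrap_mask = (2u << wrap) - 1;                 // 2^(wrap+1) - 1
+    uint32_t aligned   = cur & ~wrap_mask;                 // aligned value
+    uint32_t next      = aligned | ((cur + incr) & wrap_mask);
+
+  With wrap=7, incr=0x80, cur=0x21C0: wrap_mask=0xFF, aligned=0x2100, and
+  next = 0x2100 | (0x240 & 0xFF) = 0x2140, matching the example above.
+*/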
+
+
+/*
+  Performs the same operation as noc_atomic_increment and reads the previous value from the
+  destination address to <read_addr>. The address also has 4-byte granularity,
+  and the return value updates only the corresponding 32 bits in local memory.
+
+  There is no alignment requirement between <addr> and <read_addr>.
+
+  This function can be used to reserve space in a remote buffer by operating on the write
+  pointer.
+
+  The status of the returned read can be determined by calling noc_atomic_read_updates_completed
+  (see below).
+
+  <noc_coordinate>  => NOC ID portion of addr (unicast)
+  <addr>            => counter address (unicast)
+  <incr>            => increment
+  <wrap>            => log2(wrapping limit)-1
+  <read_coordinate> => NOC ID portion of <read_addr> address
+  <read_addr>       => address to store the previous value
+  <linked>          => link with previous call for ordering
+  <transaction_id   => optional ID tag for the outgoing request (0-15, used for
+                       selective transaction flush)>
+*/
+void noc_atomic_read_and_increment(uint32_t noc_coordinate, uint64_t addr, uint32_t incr, uint32_t wrap, uint32_t read_coordinate, uint64_t read_addr, bool linked, uint8_t transaction_id);
+
+
+/*
+  Performs the same operation as noc_atomic_increment on multiple multicast destinations.
+
+  <noc_coordinate>  => NOC ID portion of addr (multicast)
+  <addr>            => counter address (multicast)
+  <multicast_mode>  => multicast direction (0 = x-direction, 1 = y-direction)
+  <incr>            => increment
+  <wrap>            => log2(wrapping limit)-1
+  <linked>          => link with previous call for ordering
+*/
+void noc_multicast_atomic_increment(uint32_t noc_coordinate, uint64_t addr, uint32_t multicast_mode, uint32_t incr, uint32_t wrap, bool linked);
+
+
+/*
+  Performs the same operation as noc_atomic_read_and_increment on multiple multicast destinations.
+
+  Each destination returns the previous value, and the final value written to read_addr is undefined,
+  depending on the order in which updates are delivered. Therefore, the 32-bit value at this address
+  must be reserved to be modified by this call, but its final value should be ignored.
+
+  The value returned by noc_atomic_read_updates_completed will increment with each returned response.
+  The intended use case for this function is to perform atomic increments at multiple destinations, and
+  subsequently call noc_atomic_read_updates_completed to ensure all the updates have completed.
+
+  <noc_coordinate>  => NOC ID portion of addr (multicast)
+  <addr>            => counter address (multicast)
+  <multicast_mode>  => multicast direction (0 = x-direction, 1 = y-direction)
+  <incr>            => increment
+  <wrap>            => log2(wrapping limit)-1
+  <read_coordinate> => NOC ID portion of read_addr (multicast)
+  <read_addr>       => address to store the previous value
+  <linked>          => link with previous call for ordering
+  <transaction_id   => optional ID tag for the outgoing request (0-15, used for
+                       selective transaction flush)>
+*/
+void noc_multicast_atomic_read_and_increment(uint32_t noc_coordinate, uint64_t addr, uint32_t multicast_mode, uint32_t incr, uint32_t wrap, uint32_t read_coordinate, uint64_t read_addr, bool linked, uint8_t transaction_id);
+
+
+/*
+  Set command buffer ID (0-3) to use for the next command issued.
+*/
+void noc_set_cmd_buf(uint32_t cmd_buf_id);
+
+/*
+  Get current setting for command buffer ID (0-3).
+*/
+uint32_t noc_get_cmd_buf();
+
+/*
+  Set NOC instance (0-1) to use for the next command issued.
+*/
+void noc_set_active_instance(uint32_t noc_id);
+
+/*
+  Get current setting for NOC instance (0-1) to use.
+*/
+uint32_t noc_get_active_instance();
+
+/*
+  Returns the number of atomic operations that return a value (such as
+  atomic_read_and_increment) that have completed since the last reset event.
+
+  The counter is 32-bit and wraps when the number exceeds 2^32-1.
+
+  By tracking this value and polling for its increase, firmware can determine
+  that the remote pointer updates have completed and the responses have been
+  committed to local memory.
+ */
+uint32_t noc_atomic_read_updates_completed();
+
+/*
+  Returns the number of write acks received.
+
+  The counter is 32-bit and wraps when the number exceeds 2^32-1.
+
+  By tracking this value and polling for its increase, firmware can determine
+  that the noc copy operations have completed.
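+
+  Illustrative flush pattern (a sketch):
+
+    uint32_t acks_before = noc_wr_ack_received();
+    noc_copy(...);                                   // non-posted write
+    while (noc_wr_ack_received() == acks_before) {}  // spin until the ack lands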
+ */ + +volatile uint32_t noc_wr_ack_received(); + +/* + Returns the number of read responses received + + The counter is 32-bit and wraps when the number exceeds 2^32-1. + + By tracking this value and polling for its increase, firmware can determine + that the noc copy operations (from remote to local) have completed. + */ + +volatile uint32_t noc_rd_resp_received(); +/* + Returns true if the active command buffer is presently available (i.e., no pending + request that is being backpressured by the NOC). + + Issuing a command while the command buffer is busy results in undefined behavior. + + All above functions use command buffer 0 and spin on noc_command_ready() at entry, + so there is no need to call it explicitly before these calls. + +*/ +bool noc_command_ready(); + + +/* + Returns ID & dateline info of the local node in the format: + {10'b0, i_dateline_node_y[0:0], i_dateline_node_x[0:0], + i_local_nocid[3:0], + i_noc_y_size[3:0], i_noc_x_size[3:0], + i_local_nodeid_y[3:0], i_local_nodeid_x[3:0]} +*/ +uint32_t noc_local_node_id(); + + +/* + Returns value of specific status register (see noc_parameters.h for the list). +*/ +uint32_t noc_status_reg(uint32_t status_reg_id); + + +/* + Sets value of specific NOC config register (see noc_parameters.h for the list). +*/ +void noc_set_cfg_reg(uint32_t cfg_reg_id, uint32_t val); + +/* + Gets value of specific NOC config register (see noc_parameters.h for the list). +*/ +uint32_t noc_get_cfg_reg(uint32_t cfg_reg_id); + + +/* + Reset to 0 each transaction ID outstanding request counter for which the corresponding + id_mask bit is set. +*/ +void noc_clear_req_id_cnt(uint32_t id_mask); + + +////////////////////////////////////////////////////////////////// +//////////////////////// ECC Functions /////////////////////////// +////////////////////////////////////////////////////////////////// + +/* + Allows for the enabling/disabling of ECC features in the NIU and Routers + Enabling full ECC is a two stage process. First you must call noc_ecc_cfg_stage_1 for all tensix, sync (ensuring all writes went through), + and then call noc_ecc_cfg_stage_2 for all tensix. +*/ +void noc_ecc_cfg_stage_1(bool header_ckh_bits_en); + +/* + Allows for the enabling/disabling of ECC features in the NIU and Routers + Enabling full ECC is a two stage process. First you must call noc_ecc_cfg_stage_1 for all tensix, sync (ensuring all writes went through), + and then call noc_ecc_cfg_stage_2 for all tensix. +*/ +void noc_ecc_cfg_stage_2(bool niu_mem_parity_en, bool router_mem_parity_en, bool header_secded_en, bool mem_parity_int_en, bool header_sec_int_en, bool header_ded_int_en); + +/* + Clears the corresponding ECC error interrupt and number of errors register +*/ +void noc_ecc_clear_err(bool clear_mem_parity_err, bool clear_header_sec, bool clear_header_ded); + +/* + Increments the corresponding number of errors register by 1. + Debug use only. +*/ +void noc_ecc_force_err(bool force_mem_parity_err, bool force_header_sec, bool force_header_ded); + +/* + Gets the number of memory parity errors. This is the sum of the number of parity errors in the router and niu memories (if enabled in noc_ecc_cfg()). This register indicates a fatal error in the system. +*/ +uint32_t noc_ecc_get_num_mem_parity_errs(); + +/* + Gets the number of single errors that were corrected in the header. This register should be treated as a warning of system instability. +*/ +uint32_t noc_ecc_get_num_header_sec(); + +/* + Gets the number of double errors detected in the header. 
This register indicates a fatal error in the system. +*/ +uint32_t noc_ecc_get_num_header_ded(); + +////// + +#endif //ndef _NOC_H_ diff --git a/tt_metal/hw/inc/blackhole/noc/noc_overlay_parameters.cpp b/tt_metal/hw/inc/blackhole/noc/noc_overlay_parameters.cpp new file mode 100644 index 000000000000..82d709cd7a4c --- /dev/null +++ b/tt_metal/hw/inc/blackhole/noc/noc_overlay_parameters.cpp @@ -0,0 +1,3471 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "noc_overlay_parameters.hpp" +#include "noc_overlay_parameters.h" + +using namespace Noc; + +const std::vector OLP::registers = { + { + "STREAM_SOURCE_ENDPOINT_NEW_MSG_INFO", + 0, + { + {"SOURCE_ENDPOINT_NEW_MSG_ADDR", 0} + ,{"SOURCE_ENDPOINT_NEW_MSG_SIZE", 1} + ,{"SOURCE_ENDPOINT_NEW_MSG_LAST_TILE", 2} + + }, + { + {0, 0} + ,{(SOURCE_ENDPOINT_NEW_MSG_ADDR+SOURCE_ENDPOINT_NEW_MSG_ADDR_WIDTH), 1} + ,{(SOURCE_ENDPOINT_NEW_MSG_SIZE+SOURCE_ENDPOINT_NEW_MSG_SIZE_WIDTH), 2} + + }, + { + { + "SOURCE_ENDPOINT_NEW_MSG_ADDR", + 0, + MEM_WORD_ADDR_WIDTH, + "" + } + ,{ + "SOURCE_ENDPOINT_NEW_MSG_SIZE", + (SOURCE_ENDPOINT_NEW_MSG_ADDR+SOURCE_ENDPOINT_NEW_MSG_ADDR_WIDTH), + (32-MEM_WORD_ADDR_WIDTH-1), + "" + } + ,{ + "SOURCE_ENDPOINT_NEW_MSG_LAST_TILE", + (SOURCE_ENDPOINT_NEW_MSG_SIZE+SOURCE_ENDPOINT_NEW_MSG_SIZE_WIDTH), + (1), + "" + } + + }, + "// For endpoints with SOURCE_ENDPOINT == 1, this register is for firmware \n// to register new message for sending. \n// This updates the msg_info register structure directly, rather than writing to the message info\n// buffer in memory.\n// Must not be written when the message info register structure is full, or if\n// there are message info entries in the memory buffer. (This would cause a race\n// condition.)\n" + } + ,{ + "STREAM_NUM_MSGS_RECEIVED_INC", + 1, + { + {"SOURCE_ENDPOINT_NEW_MSGS_NUM", 0} + ,{"SOURCE_ENDPOINT_NEW_MSGS_TOTAL_SIZE", 1} + ,{"SOURCE_ENDPOINT_NEW_MSGS_LAST_TILE", 2} + + }, + { + {0, 0} + ,{(SOURCE_ENDPOINT_NEW_MSGS_NUM+SOURCE_ENDPOINT_NEW_MSGS_NUM_WIDTH), 1} + ,{(SOURCE_ENDPOINT_NEW_MSGS_TOTAL_SIZE+SOURCE_ENDPOINT_NEW_MSGS_TOTAL_SIZE_WIDTH), 2} + + }, + { + { + "SOURCE_ENDPOINT_NEW_MSGS_NUM", + 0, + 12, + "" + } + ,{ + "SOURCE_ENDPOINT_NEW_MSGS_TOTAL_SIZE", + (SOURCE_ENDPOINT_NEW_MSGS_NUM+SOURCE_ENDPOINT_NEW_MSGS_NUM_WIDTH), + MEM_WORD_ADDR_WIDTH, + "" + } + ,{ + "SOURCE_ENDPOINT_NEW_MSGS_LAST_TILE", + (SOURCE_ENDPOINT_NEW_MSGS_TOTAL_SIZE+SOURCE_ENDPOINT_NEW_MSGS_TOTAL_SIZE_WIDTH), + 1, + "" + } + + }, + "// For endpoints with SOURCE_ENDPOINT == 1, this register is for firmware \n// to update the number of messages whose data & header are available in the memory buffer.\n// Hardware register is incremented atomically if sending of previous messages is in progress.\n" + } + ,{ + "STREAM_ONETIME_MISC_CFG", + 2, + { + {"PHASE_AUTO_CONFIG", 0} + ,{"PHASE_AUTO_ADVANCE", 1} + ,{"REG_UPDATE_VC_REG", 2} + ,{"GLOBAL_OFFSET_TABLE_RD_SRC_INDEX", 3} + ,{"GLOBAL_OFFSET_TABLE_RD_DEST_INDEX", 4} + + }, + { + {0, 0} + ,{(PHASE_AUTO_CONFIG+PHASE_AUTO_CONFIG_WIDTH), 1} + ,{(PHASE_AUTO_ADVANCE+PHASE_AUTO_ADVANCE_WIDTH), 2} + ,{(REG_UPDATE_VC_REG+REG_UPDATE_VC_REG_WIDTH), 3} + ,{(GLOBAL_OFFSET_TABLE_RD_SRC_INDEX+GLOBAL_OFFSET_TABLE_RD_SRC_INDEX_WIDTH), 4} + + }, + { + { + "PHASE_AUTO_CONFIG", + 0, + 1, + "" + } + ,{ + "PHASE_AUTO_ADVANCE", + (PHASE_AUTO_CONFIG+PHASE_AUTO_CONFIG_WIDTH), + 1, + "" + } + ,{ + "REG_UPDATE_VC_REG", + (PHASE_AUTO_ADVANCE+PHASE_AUTO_ADVANCE_WIDTH), + 3, + "// set to one of the values (0-5) to select which VC control 
flow updates will be sent on\n" + } + ,{ + "GLOBAL_OFFSET_TABLE_RD_SRC_INDEX", + (REG_UPDATE_VC_REG+REG_UPDATE_VC_REG_WIDTH), + GLOBAL_OFFSET_TABLE_SIZE_WIDTH, + "// Read index of global offset table, which will offset o_data_fwd_src_addr by entry value.\n" + } + ,{ + "GLOBAL_OFFSET_TABLE_RD_DEST_INDEX", + (GLOBAL_OFFSET_TABLE_RD_SRC_INDEX+GLOBAL_OFFSET_TABLE_RD_SRC_INDEX_WIDTH), + GLOBAL_OFFSET_TABLE_SIZE_WIDTH, + "// Read index of global offset table, which will offset o_data_fwd_dest_addr by entry value.\n" + } + + }, + "// Registers that need to be programmed once per blob. (Can apply to multiple phases.)\n// * Phase/data forward options:\n// PHASE_AUTO_CONFIG = set to 1 for stream to fetch next phase configuration automatically.\n// PHASE_AUTO_ADVANCE = set to 1 for stream to advance to next phase automatically \n// (otherwise need to write STREAM_PHASE_ADVANCE below)\n" + } + ,{ + "STREAM_MISC_CFG", + 3, + { + {"INCOMING_DATA_NOC", 0} + ,{"OUTGOING_DATA_NOC", 1} + ,{"REMOTE_SRC_UPDATE_NOC", 2} + ,{"LOCAL_SOURCES_CONNECTED", 3} + ,{"SOURCE_ENDPOINT", 4} + ,{"REMOTE_SOURCE", 5} + ,{"RECEIVER_ENDPOINT", 6} + ,{"LOCAL_RECEIVER", 7} + ,{"REMOTE_RECEIVER", 8} + ,{"TOKEN_MODE", 9} + ,{"COPY_MODE", 10} + ,{"NEXT_PHASE_SRC_CHANGE", 11} + ,{"NEXT_PHASE_DEST_CHANGE", 12} + ,{"DATA_BUF_NO_FLOW_CTRL", 13} + ,{"DEST_DATA_BUF_NO_FLOW_CTRL", 14} + ,{"MSG_INFO_BUF_FLOW_CTRL", 15} + ,{"DEST_MSG_INFO_BUF_FLOW_CTRL", 16} + ,{"REMOTE_SRC_IS_MCAST", 17} + ,{"NO_PREV_PHASE_OUTGOING_DATA_FLUSH", 18} + ,{"SRC_FULL_CREDIT_FLUSH_EN", 19} + ,{"DST_FULL_CREDIT_FLUSH_EN", 20} + ,{"INFINITE_PHASE_EN", 21} + ,{"OOO_PHASE_EXECUTION_EN", 22} + + }, + { + {0, 0} + ,{(INCOMING_DATA_NOC+INCOMING_DATA_NOC_WIDTH), 1} + ,{(OUTGOING_DATA_NOC+OUTGOING_DATA_NOC_WIDTH), 2} + ,{(REMOTE_SRC_UPDATE_NOC+REMOTE_SRC_UPDATE_NOC_WIDTH), 3} + ,{(LOCAL_SOURCES_CONNECTED+LOCAL_SOURCES_CONNECTED_WIDTH), 4} + ,{(SOURCE_ENDPOINT+SOURCE_ENDPOINT_WIDTH), 5} + ,{(REMOTE_SOURCE+REMOTE_SOURCE_WIDTH), 6} + ,{(RECEIVER_ENDPOINT+RECEIVER_ENDPOINT_WIDTH), 7} + ,{(LOCAL_RECEIVER+LOCAL_RECEIVER_WIDTH), 8} + ,{(REMOTE_RECEIVER+REMOTE_RECEIVER_WIDTH), 9} + ,{(TOKEN_MODE+TOKEN_MODE_WIDTH), 10} + ,{(COPY_MODE+COPY_MODE_WIDTH), 11} + ,{(NEXT_PHASE_SRC_CHANGE+NEXT_PHASE_SRC_CHANGE_WIDTH), 12} + ,{(NEXT_PHASE_DEST_CHANGE+NEXT_PHASE_DEST_CHANGE_WIDTH), 13} + ,{(DATA_BUF_NO_FLOW_CTRL+DATA_BUF_NO_FLOW_CTRL_WIDTH), 14} + ,{(DEST_DATA_BUF_NO_FLOW_CTRL+DEST_DATA_BUF_NO_FLOW_CTRL_WIDTH), 15} + ,{(MSG_INFO_BUF_FLOW_CTRL+MSG_INFO_BUF_FLOW_CTRL_WIDTH), 16} + ,{(DEST_MSG_INFO_BUF_FLOW_CTRL+DEST_MSG_INFO_BUF_FLOW_CTRL_WIDTH), 17} + ,{(REMOTE_SRC_IS_MCAST+REMOTE_SRC_IS_MCAST_WIDTH), 18} + ,{(NO_PREV_PHASE_OUTGOING_DATA_FLUSH+NO_PREV_PHASE_OUTGOING_DATA_FLUSH_WIDTH), 19} + ,{(SRC_FULL_CREDIT_FLUSH_EN+SRC_FULL_CREDIT_FLUSH_EN_WIDTH), 20} + ,{(DST_FULL_CREDIT_FLUSH_EN+DST_FULL_CREDIT_FLUSH_EN_WIDTH), 21} + ,{(INFINITE_PHASE_EN+INFINITE_PHASE_EN_WIDTH), 22} + + }, + { + { + "INCOMING_DATA_NOC", + 0, + NOC_NUM_WIDTH, + "" + } + ,{ + "OUTGOING_DATA_NOC", + (INCOMING_DATA_NOC+INCOMING_DATA_NOC_WIDTH), + NOC_NUM_WIDTH, + "" + } + ,{ + "REMOTE_SRC_UPDATE_NOC", + (OUTGOING_DATA_NOC+OUTGOING_DATA_NOC_WIDTH), + NOC_NUM_WIDTH, + "" + } + ,{ + "LOCAL_SOURCES_CONNECTED", + (REMOTE_SRC_UPDATE_NOC+REMOTE_SRC_UPDATE_NOC_WIDTH), + 1, + "" + } + ,{ + "SOURCE_ENDPOINT", + (LOCAL_SOURCES_CONNECTED+LOCAL_SOURCES_CONNECTED_WIDTH), + 1, + "" + } + ,{ + "REMOTE_SOURCE", + (SOURCE_ENDPOINT+SOURCE_ENDPOINT_WIDTH), + 1, + "" + } + ,{ + "RECEIVER_ENDPOINT", + (REMOTE_SOURCE+REMOTE_SOURCE_WIDTH), + 1, 
+ "" + } + ,{ + "LOCAL_RECEIVER", + (RECEIVER_ENDPOINT+RECEIVER_ENDPOINT_WIDTH), + 1, + "" + } + ,{ + "REMOTE_RECEIVER", + (LOCAL_RECEIVER+LOCAL_RECEIVER_WIDTH), + 1, + "" + } + ,{ + "TOKEN_MODE", + (REMOTE_RECEIVER+REMOTE_RECEIVER_WIDTH), + 1, + "" + } + ,{ + "COPY_MODE", + (TOKEN_MODE+TOKEN_MODE_WIDTH), + 1, + "" + } + ,{ + "NEXT_PHASE_SRC_CHANGE", + (COPY_MODE+COPY_MODE_WIDTH), + 1, + "" + } + ,{ + "NEXT_PHASE_DEST_CHANGE", + (NEXT_PHASE_SRC_CHANGE+NEXT_PHASE_SRC_CHANGE_WIDTH), + 1, + "" + } + ,{ + "DATA_BUF_NO_FLOW_CTRL", + (NEXT_PHASE_DEST_CHANGE+NEXT_PHASE_DEST_CHANGE_WIDTH), + 1, + "// set if REMOTE_SOURCE==1 and the buffer is large enough to accept full phase data without wrapping:\n" + } + ,{ + "DEST_DATA_BUF_NO_FLOW_CTRL", + (DATA_BUF_NO_FLOW_CTRL+DATA_BUF_NO_FLOW_CTRL_WIDTH), + 1, + "// set if REMOTE_RECEIVER==1 and the destination buffer is large enough to accept full phase data without wrapping:\n" + } + ,{ + "MSG_INFO_BUF_FLOW_CTRL", + (DEST_DATA_BUF_NO_FLOW_CTRL+DEST_DATA_BUF_NO_FLOW_CTRL_WIDTH), + 1, + "// set if REMOTE_SOURCE==1 and you want the buffer to have wrapping:\n" + } + ,{ + "DEST_MSG_INFO_BUF_FLOW_CTRL", + (MSG_INFO_BUF_FLOW_CTRL+MSG_INFO_BUF_FLOW_CTRL_WIDTH), + 1, + "// set if REMOTE_RECEIVER==1 and you want the destination buffer to have wrapping:\n" + } + ,{ + "REMOTE_SRC_IS_MCAST", + (DEST_MSG_INFO_BUF_FLOW_CTRL+DEST_MSG_INFO_BUF_FLOW_CTRL_WIDTH), + 1, + "// set if REMOTE_SOURCE==1 and has mulicast enabled (i.e. this stream is part of a multicast group)\n" + } + ,{ + "NO_PREV_PHASE_OUTGOING_DATA_FLUSH", + (REMOTE_SRC_IS_MCAST+REMOTE_SRC_IS_MCAST_WIDTH), + 1, + "// set if no need to flush outgoing remote data from previous phase\n" + } + ,{ + "SRC_FULL_CREDIT_FLUSH_EN", + (NO_PREV_PHASE_OUTGOING_DATA_FLUSH+NO_PREV_PHASE_OUTGOING_DATA_FLUSH_WIDTH), + 1, + "// Set to one to enable full credit flushing on src side\n" + } + ,{ + "DST_FULL_CREDIT_FLUSH_EN", + (SRC_FULL_CREDIT_FLUSH_EN+SRC_FULL_CREDIT_FLUSH_EN_WIDTH), + 1, + "// Set to one to enable full credit flushing on dest side\n" + } + ,{ + "INFINITE_PHASE_EN", + (DST_FULL_CREDIT_FLUSH_EN+DST_FULL_CREDIT_FLUSH_EN_WIDTH), + 1, + "// Set to one to enable infinite messages per phase, accompanied by a last tile header bit which will end the phase\n" + } + ,{ + "OOO_PHASE_EXECUTION_EN", + (INFINITE_PHASE_EN+INFINITE_PHASE_EN_WIDTH), + 1, + "// Enables out-of-order phase execution by providing an array of size num_tiles at the end of phase blob, with order in which each tile should be sent. Each array entry contains a 17-bit tile address and a 15-bit tile size.\n" + } + + }, + "// The ID of NOCs used for incoming and outgoing data, followed by misc. 
stream configuration options:\n// * Source - set exactly one of these to 1:\n// SOURCE_ENDPOINT = source is local math/packer\n// REMOTE_SOURCE = source is remote sender stream\n// LOCAL_SOURCES_CONNECTED = source is one or more local connected streams\n// * Destination - set one or zero of these to 1:\n// RECEIVER_ENDPOINT = stream is read by local unpacker/math\n// REMOTE_RECEIVER = stream forwards data to a remote destination or multicast group\n// LOCAL_RECEIVER = stream is connected to a local destination stream\n// None set = stream just stores data in a local buffer, without forwarding/clearing, and \n// finishes the phase once all messages have been received\n" + } + ,{ + "STREAM_REMOTE_SRC", + 4, + { + {"STREAM_REMOTE_SRC_X", 0} + ,{"STREAM_REMOTE_SRC_Y", 1} + ,{"REMOTE_SRC_STREAM_ID", 2} + ,{"STREAM_REMOTE_SRC_DEST_INDEX", 3} + ,{"DRAM_READS__TRANS_SIZE_WORDS_LO", 4} + + }, + { + {0, 0} + ,{(STREAM_REMOTE_SRC_X+STREAM_REMOTE_SRC_X_WIDTH), 1} + ,{(STREAM_REMOTE_SRC_Y+STREAM_REMOTE_SRC_Y_WIDTH), 2} + ,{(REMOTE_SRC_STREAM_ID+REMOTE_SRC_STREAM_ID_WIDTH), 3} + + }, + { + { + "STREAM_REMOTE_SRC_X", + 0, + NOC_ID_WIDTH, + "" + } + ,{ + "STREAM_REMOTE_SRC_Y", + (STREAM_REMOTE_SRC_X+STREAM_REMOTE_SRC_X_WIDTH), + NOC_ID_WIDTH, + "" + } + ,{ + "REMOTE_SRC_STREAM_ID", + (STREAM_REMOTE_SRC_Y+STREAM_REMOTE_SRC_Y_WIDTH), + STREAM_ID_WIDTH, + "" + } + ,{ + "STREAM_REMOTE_SRC_DEST_INDEX", + (REMOTE_SRC_STREAM_ID+REMOTE_SRC_STREAM_ID_WIDTH), + STREAM_ID_WIDTH, + "" + } + ,{ + "DRAM_READS__TRANS_SIZE_WORDS_LO", + (STREAM_REMOTE_SRC_Y+STREAM_REMOTE_SRC_Y_WIDTH), + 12, + "" + } + + }, + "// Properties of the remote source stream (coorindates, stream ID, and this streams destination index).\n// Dont-care unless REMOTE_SOURCE == 1.\n" + } + ,{ + "STREAM_REMOTE_SRC_PHASE", + 5, + { + {"DRAM_READS__SCRATCH_1_PTR", 0} + ,{"DRAM_READS__TRANS_SIZE_WORDS_HI", 1} + + }, + { + {0, 0} + ,{(DRAM_READS__SCRATCH_1_PTR+DRAM_READS__SCRATCH_1_PTR_WIDTH), 1} + + }, + { + { + "DRAM_READS__SCRATCH_1_PTR", + 0, + 19, + "" + } + ,{ + "DRAM_READS__TRANS_SIZE_WORDS_HI", + (DRAM_READS__SCRATCH_1_PTR+DRAM_READS__SCRATCH_1_PTR_WIDTH), + 1, + "" + } + + }, + "// Remote source phase (may be different from the destination stream phase.)\n// We use 20-bit phase ID, so phase count doesnt wrap until 1M phases. \n// Dont-care unless REMOTE_SOURCE == 1.\n" + } + ,{ + "STREAM_MEM_BUF_SPACE_AVAILABLE_ACK_THRESHOLD", + 6, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// 4-bit wide register that determines the threshold at which a stream\n// with remote source sends an update message to STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE.\n// Dont-care unless REMOTE_SOURCE==1. \n// Values:\n// value[3:0] == 0 => disable threshold. Acks send as soon as any data are cleared/forwarded. \n// value[3:0] > 0 => threshold calculated according to the following formula:\n// if (value[3])\n// threshold = buf_size - (buf_size >> value[2:0])\n// else \n// threshold = (buf_size >> value[2:0])\n//\n// This enables setting thresholds of buf_size/2, buf_size/4, buf_size/8, ... 
buf_size/256, \n// as well as 3*buf_size/4, 7*buf_size/8, etc.\n" + } + ,{ + "STREAM_REMOTE_DEST", + 7, + { + {"STREAM_REMOTE_DEST_X", 0} + ,{"STREAM_REMOTE_DEST_Y", 1} + ,{"STREAM_REMOTE_DEST_STREAM_ID", 2} + + }, + { + {0, 0} + ,{(STREAM_REMOTE_DEST_X+STREAM_REMOTE_DEST_X_WIDTH), 1} + ,{(STREAM_REMOTE_DEST_Y+STREAM_REMOTE_DEST_Y_WIDTH), 2} + + }, + { + { + "STREAM_REMOTE_DEST_X", + 0, + NOC_ID_WIDTH, + "" + } + ,{ + "STREAM_REMOTE_DEST_Y", + (STREAM_REMOTE_DEST_X+STREAM_REMOTE_DEST_X_WIDTH), + NOC_ID_WIDTH, + "" + } + ,{ + "STREAM_REMOTE_DEST_STREAM_ID", + (STREAM_REMOTE_DEST_Y+STREAM_REMOTE_DEST_Y_WIDTH), + STREAM_ID_WIDTH, + "" + } + + }, + "// Properties of the remote destination stream (coorindates, stream ID). Dont-care unless REMOTE_RECEIVER == 1.\n// If destination is multicast, this register specifies the starting coordinates of the destination\n// multicast group/rectangle. (The end coordinates are in STREAM_MCAST_DEST below.)\n" + } + ,{ + "STREAM_LOCAL_DEST", + 7, + { + {"STREAM_LOCAL_DEST_MSG_CLEAR_NUM", 0} + ,{"STREAM_LOCAL_DEST_STREAM_ID", 1} + + }, + { + {0, 0} + ,{(STREAM_LOCAL_DEST_MSG_CLEAR_NUM+STREAM_LOCAL_DEST_MSG_CLEAR_NUM_WIDTH), 1} + + }, + { + { + "STREAM_LOCAL_DEST_MSG_CLEAR_NUM", + 0, + 12, + "" + } + ,{ + "STREAM_LOCAL_DEST_STREAM_ID", + (STREAM_LOCAL_DEST_MSG_CLEAR_NUM+STREAM_LOCAL_DEST_MSG_CLEAR_NUM_WIDTH), + STREAM_ID_WIDTH, + "" + } + + }, + "// Properties of the local destination gather stream connection.\n// Dont-care unless LOCAL_RECEIVER == 1.\n// Shares register space with STREAM_REMOTE_DEST_REG_INDEX.\n" + } + ,{ + "STREAM_REMOTE_DEST_BUF_START", + 8, + { + {"DRAM_WRITES__SCRATCH_1_PTR_LO", 0} + + }, + { + {0, 0} + + }, + { + { + "DRAM_WRITES__SCRATCH_1_PTR_LO", + 0, + 16, + "" + } + + }, + "// Start address (in words) of the remote destination stream memory buffer.\n" + } + ,{ + "STREAM_REMOTE_DEST_BUF_START_HI", + 9, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// High bits for STREAM_REMOTE_DEST_BUF_START\n" + } + ,{ + "STREAM_REMOTE_DEST_BUF_SIZE", + 10, + { + {"REMOTE_DEST_BUF_SIZE_WORDS", 0} + ,{"DRAM_WRITES__SCRATCH_1_PTR_HI", 1} + + }, + { + {0, 0} + + }, + { + { + "REMOTE_DEST_BUF_SIZE_WORDS", + 0, + MEM_WORD_ADDR_WIDTH, + "" + } + ,{ + "DRAM_WRITES__SCRATCH_1_PTR_HI", + 0, + 3, + "" + } + + }, + "// Size (in words) of the remote destination stream memory buffer.\n" + } + ,{ + "STREAM_REMOTE_DEST_WR_PTR", + 11, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Write pointer for the remote destination stream memory buffer. \n// Can be written directly; automatically reset to 0 when \n// STREAM_REMOTE_DEST_BUF_START is written.\n" + } + ,{ + "STREAM_REMOTE_DEST_MSG_INFO_BUF_SIZE", + 12, + { + {"REMOTE_DEST_MSG_INFO_BUF_SIZE_POW2", 0} + + }, + { + {0, 0} + + }, + { + { + "REMOTE_DEST_MSG_INFO_BUF_SIZE_POW2", + 0, + MSG_INFO_BUF_SIZE_POW_BITS, + "" + } + + }, + "// Size (in power2) of the remote destination stream memory buffer. \n// Bits encode powers of 2 sizes in words (2^(x+1)), e.g. 0 -> 2 words, 1 -> 4 words, 7 -> 256 words\n// Max 256 word size.\n// Only used when DEST_MSG_INFO_BUF_FLOW_CTRL is true\n" + } + ,{ + "STREAM_REMOTE_DEST_MSG_INFO_BUF_START", + 13, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Start address (in words) of the remote destination stream memory buffer. 
\n// Only used when DEST_MSG_INFO_BUF_FLOW_CTRL is true\n" + } + ,{ + "STREAM_REMOTE_DEST_MSG_INFO_WR_PTR", + 13, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Write pointer for the remote destination message info buffer. \n// Dont-care unless REMOTE_RECEIVER==1. \n// Needs to be initialized to the start of the message info buffer of the remote destination\n// at phase start, if destination is changed. \n// Subsequently its incremented automatically as messages are forwarded. \n// When DEST_MSG_INFO_BUF_FLOW_CTRL is true this pointer is the one above\n" + } + ,{ + "STREAM_REMOTE_DEST_MSG_INFO_BUF_START_HI", + 14, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// High bits for STREAM_REMOTE_DEST_MSG_INFO_BUF_START\n// Only used when DEST_MSG_INFO_BUF_FLOW_CTRL is true\n" + } + ,{ + "STREAM_REMOTE_DEST_MSG_INFO_WR_PTR_HI", + 14, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// High bits for STREAM_REMOTE_DEST_MSG_INFO_WR_PTR\n// When DEST_MSG_INFO_BUF_FLOW_CTRL is true this pointer is the one above\n" + } + ,{ + "STREAM_REMOTE_DEST_MSG_INFO_WRAP_WR_PTR", + 15, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Only used when DEST_MSG_INFO_BUF_FLOW_CTRL is true\n// Write pointer for the remote destination message info buffer. \n// Dont-care unless REMOTE_RECEIVER==1. \n// Subsequently its incremented automatically as messages are forwarded.\n" + } + ,{ + "STREAM_REMOTE_DEST_TRAFFIC", + 16, + { + {"NOC_PRIORITY", 0} + ,{"UNICAST_VC_REG", 1} + + }, + { + {0, 0} + ,{(NOC_PRIORITY+NOC_PRIORITY_WIDTH), 1} + + }, + { + { + "NOC_PRIORITY", + 0, + 4, + "" + } + ,{ + "UNICAST_VC_REG", + (NOC_PRIORITY+NOC_PRIORITY_WIDTH), + 3, + "// set to one of the values (0-5) to select which VC unicast requests will be sent on\n" + } + + }, + "// Priority for traffic sent to remote destination. \n// Valid only for streams capable of remote sending. \n// 4-bit value. \n// Set to 0 to send traffic under round-robin arbitration. \n// Set to 1-15 for priority arbitration (higher values are higher priority).\n" + } + ,{ + "STREAM_BUF_START", + 17, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Start address (in words) of the memory buffer associated with this stream.\n" + } + ,{ + "STREAM_BUF_SIZE", + 18, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Stream buffer size (in words).\n" + } + ,{ + "STREAM_RD_PTR", + 19, + { + {"STREAM_RD_PTR_VAL", 0} + ,{"STREAM_RD_PTR_WRAP", 1} + + }, + { + {0, 0} + ,{(STREAM_RD_PTR_VAL+STREAM_RD_PTR_VAL_WIDTH), 1} + + }, + { + { + "STREAM_RD_PTR_VAL", + 0, + MEM_WORD_ADDR_WIDTH, + "" + } + ,{ + "STREAM_RD_PTR_WRAP", + (STREAM_RD_PTR_VAL+STREAM_RD_PTR_VAL_WIDTH), + 1, + "" + } + + }, + "// Read pointer value (word offset relative to buffer start).\n// Can be updated by writing the register. \n// Value does not guarantee that all data up to the current value have been sent\n// off (forwarding command may be ongoing). To find out free space in the buffer,\n// read STREAM_BUF_SPACE_AVAILABLE. 
\n// Automatically reset to 0 when STREAM_BUF_START_REG is updated.\n" + } + ,{ + "STREAM_WR_PTR", + 20, + { + {"STREAM_WR_PTR_VAL", 0} + ,{"STREAM_WR_PTR_WRAP", 1} + + }, + { + {0, 0} + ,{(STREAM_WR_PTR_VAL+STREAM_WR_PTR_VAL_WIDTH), 1} + + }, + { + { + "STREAM_WR_PTR_VAL", + 0, + MEM_WORD_ADDR_WIDTH, + "" + } + ,{ + "STREAM_WR_PTR_WRAP", + (STREAM_WR_PTR_VAL+STREAM_WR_PTR_VAL_WIDTH), + 1, + "" + } + + }, + "// Write pointer value (word offset relative to buffer start). \n// Can be read to determine the location at which to write new data. \n// Can be updated by writing the register. \n// In normal operation, should be updated only by writing \n// STREAM_NUM_MSGS_RECEIVED_INC_REG or STREAM_SOURCE_ENDPOINT_NEW_MSG_INFO_REG.\n" + } + ,{ + "STREAM_MSG_INFO_BUF_SIZE", + 21, + { + {"MSG_INFO_BUF_SIZE_POW2", 0} + + }, + { + {0, 0} + + }, + { + { + "MSG_INFO_BUF_SIZE_POW2", + 0, + MSG_INFO_BUF_SIZE_POW_BITS, + "" + } + + }, + "// Size (in power2) of the remote destination stream memory buffer. \n// Bits encode powers of 2 sizes in words (2^(x+1)), e.g. 0 -> 2 words, 1 -> 4 words, 7 -> 256 words\n// Max 256 word size.\n// Only used when MSG_INFO_BUF_FLOW_CTRL is true\n" + } + ,{ + "STREAM_MSG_INFO_BUF_START", + 22, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Start address (in words) of the msg info buffer. \n// Only used when MSG_INFO_BUF_FLOW_CTRL is true\n" + } + ,{ + "STREAM_MSG_INFO_PTR", + 22, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Stream message info buffer address. \n//\n// This register needs to be initialized to the start of the message info buffer during \n// phase configuration. Subsequently it will be incremented by hardware as data are read\n// from the buffer, thus doubling as the read pointer during phase execution. \n//\n// Stream hardware will assume that this buffer is large enough to hold info for all messages\n// within a phase, so unlike the buffer, it never needs to wrap.\n// \n// The buffer is filled automatically by snooping for streams with remote source. \n// For source enpoints, the buffer is written explicitly (along with the data buffer), after which \n// STREAM_NUM_MSGS_RECEIVED_INC is written to notify the stream that messages are available for\n// sending. \n// \n// Write pointer is also managed automatically by hardware, but can be read or reset using \n// STREAM_MSG_INFO_WR_PTR_REG. Write pointer is also reset when writing this register. 
\n// When MSG_INFO_BUF_FLOW_CTRL is true this pointer is the one above\n" + } + ,{ + "STREAM_MSG_INFO_WRAP_RD_WR_PTR", + 23, + { + {"STREAM_MSG_INFO_WRAP_RD_PTR", 0} + ,{"STREAM_MSG_INFO_WRAP_RD_PTR_WRAP", 1} + ,{"STREAM_MSG_INFO_WRAP_WR_PTR", 2} + ,{"STREAM_MSG_INFO_WRAP_WR_PTR_WRAP", 3} + + }, + { + {0, 0} + ,{(STREAM_MSG_INFO_WRAP_RD_PTR+STREAM_MSG_INFO_WRAP_RD_PTR_WIDTH), 1} + ,{(STREAM_MSG_INFO_WRAP_RD_PTR_WRAP+STREAM_MSG_INFO_WRAP_RD_PTR_WRAP_WIDTH), 2} + ,{(STREAM_MSG_INFO_WRAP_WR_PTR+STREAM_MSG_INFO_WRAP_WR_PTR_WIDTH), 3} + + }, + { + { + "STREAM_MSG_INFO_WRAP_RD_PTR", + 0, + MSG_INFO_BUF_SIZE_BITS, + "" + } + ,{ + "STREAM_MSG_INFO_WRAP_RD_PTR_WRAP", + (STREAM_MSG_INFO_WRAP_RD_PTR+STREAM_MSG_INFO_WRAP_RD_PTR_WIDTH), + 1, + "" + } + ,{ + "STREAM_MSG_INFO_WRAP_WR_PTR", + (STREAM_MSG_INFO_WRAP_RD_PTR_WRAP+STREAM_MSG_INFO_WRAP_RD_PTR_WRAP_WIDTH), + MSG_INFO_BUF_SIZE_BITS, + "" + } + ,{ + "STREAM_MSG_INFO_WRAP_WR_PTR_WRAP", + (STREAM_MSG_INFO_WRAP_WR_PTR+STREAM_MSG_INFO_WRAP_WR_PTR_WIDTH), + 1, + "" + } + + }, + "// The read and write pointers for the msg info buffer when the message info buffer is in wrapping mode.\n// Only used when MSG_INFO_BUF_FLOW_CTRL is true\n" + } + ,{ + "STREAM_MSG_INFO_WR_PTR", + 23, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Write pointer value for message info buffer (absolute word address). \n// In normal operation, should be updated only by writing \n// STREAM_NUM_MSGS_RECEIVED_INC_REG or STREAM_SOURCE_ENDPOINT_NEW_MSG_INFO_REG.\n// When MSG_INFO_BUF_FLOW_CTRL is true this pointer is the one above\n" + } + ,{ + "STREAM_MCAST_DEST", + 24, + { + {"STREAM_MCAST_END_X", 0} + ,{"STREAM_MCAST_END_Y", 1} + ,{"STREAM_MCAST_EN", 2} + ,{"STREAM_MCAST_LINKED", 3} + ,{"STREAM_MCAST_VC", 4} + ,{"STREAM_MCAST_NO_PATH_RES", 5} + ,{"STREAM_MCAST_XY", 6} + ,{"STREAM_MCAST_SRC_SIDE_DYNAMIC_LINKED", 7} + ,{"STREAM_MCAST_DEST_SIDE_DYNAMIC_LINKED", 8} + + }, + { + {0, 0} + ,{(STREAM_MCAST_END_X+STREAM_MCAST_END_X_WIDTH), 1} + ,{(STREAM_MCAST_END_Y+STREAM_MCAST_END_Y_WIDTH), 2} + ,{(STREAM_MCAST_EN+STREAM_MCAST_EN_WIDTH), 3} + ,{(STREAM_MCAST_LINKED+STREAM_MCAST_LINKED_WIDTH), 4} + ,{(STREAM_MCAST_VC+STREAM_MCAST_VC_WIDTH), 5} + ,{(STREAM_MCAST_NO_PATH_RES+STREAM_MCAST_NO_PATH_RES_WIDTH), 6} + ,{(STREAM_MCAST_XY+STREAM_MCAST_XY_WIDTH), 7} + ,{(STREAM_MCAST_SRC_SIDE_DYNAMIC_LINKED+STREAM_MCAST_SRC_SIDE_DYNAMIC_LINKED_WIDTH), 8} + + }, + { + { + "STREAM_MCAST_END_X", + 0, + NOC_ID_WIDTH, + "" + } + ,{ + "STREAM_MCAST_END_Y", + (STREAM_MCAST_END_X+STREAM_MCAST_END_X_WIDTH), + NOC_ID_WIDTH, + "" + } + ,{ + "STREAM_MCAST_EN", + (STREAM_MCAST_END_Y+STREAM_MCAST_END_Y_WIDTH), + 1, + "" + } + ,{ + "STREAM_MCAST_LINKED", + (STREAM_MCAST_EN+STREAM_MCAST_EN_WIDTH), + 1, + "" + } + ,{ + "STREAM_MCAST_VC", + (STREAM_MCAST_LINKED+STREAM_MCAST_LINKED_WIDTH), + 1, + "// Set to 0 to select VC 4, and 1 to select VC 5 (default 0)\n" + } + ,{ + "STREAM_MCAST_NO_PATH_RES", + (STREAM_MCAST_VC+STREAM_MCAST_VC_WIDTH), + 1, + "" + } + ,{ + "STREAM_MCAST_XY", + (STREAM_MCAST_NO_PATH_RES+STREAM_MCAST_NO_PATH_RES_WIDTH), + 1, + "" + } + ,{ + "STREAM_MCAST_SRC_SIDE_DYNAMIC_LINKED", + (STREAM_MCAST_XY+STREAM_MCAST_XY_WIDTH), + 1, + "" + } + ,{ + "STREAM_MCAST_DEST_SIDE_DYNAMIC_LINKED", + (STREAM_MCAST_SRC_SIDE_DYNAMIC_LINKED+STREAM_MCAST_SRC_SIDE_DYNAMIC_LINKED_WIDTH), + 1, + "" + } + + }, + "// Destination spec for multicasting streams. 
STREAM_MCAST_END_X/Y are\n// the end coordinate for the multicast rectangle, with the ones from \n// STREAM_REMOTE_DEST taken as start. \n// Dont-care if STREAM_MCAST_EN == 0.\n" + } + ,{ + "STREAM_MCAST_DEST_NUM", + 25, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Number of multicast destinations (dont-care for non-multicast streams)\n" + } + ,{ + "STREAM_GATHER", + 26, + { + {"MSG_LOCAL_STREAM_CLEAR_NUM", 0} + ,{"MSG_GROUP_STREAM_CLEAR_TYPE", 1} + ,{"MSG_ARB_GROUP_SIZE", 2} + ,{"MSG_SRC_IN_ORDER_FWD", 3} + ,{"MSG_SRC_ARBITRARY_CLEAR_NUM_EN", 4} + + }, + { + {0, 0} + ,{(MSG_LOCAL_STREAM_CLEAR_NUM+MSG_LOCAL_STREAM_CLEAR_NUM_WIDTH), 1} + ,{(MSG_GROUP_STREAM_CLEAR_TYPE+MSG_GROUP_STREAM_CLEAR_TYPE_WIDTH), 2} + ,{(MSG_ARB_GROUP_SIZE+MSG_ARB_GROUP_SIZE_WIDTH), 3} + ,{(MSG_SRC_IN_ORDER_FWD+MSG_SRC_IN_ORDER_FWD_WIDTH), 4} + + }, + { + { + "MSG_LOCAL_STREAM_CLEAR_NUM", + 0, + 12, + "" + } + ,{ + "MSG_GROUP_STREAM_CLEAR_TYPE", + (MSG_LOCAL_STREAM_CLEAR_NUM+MSG_LOCAL_STREAM_CLEAR_NUM_WIDTH), + 1, + "" + } + ,{ + "MSG_ARB_GROUP_SIZE", + (MSG_GROUP_STREAM_CLEAR_TYPE+MSG_GROUP_STREAM_CLEAR_TYPE_WIDTH), + 3, + "" + } + ,{ + "MSG_SRC_IN_ORDER_FWD", + (MSG_ARB_GROUP_SIZE+MSG_ARB_GROUP_SIZE_WIDTH), + 1, + "" + } + ,{ + "MSG_SRC_ARBITRARY_CLEAR_NUM_EN", + (MSG_SRC_IN_ORDER_FWD+MSG_SRC_IN_ORDER_FWD_WIDTH), + 1, + "" + } + + }, + "// Specifies MSG_ARB_GROUP_SIZE. Valid values are 1 (round-robin\n// arbitration between each incoming stream) or 4 (round-robin arbitration\n// between groups of 4 incoming streams). \n// Msg_LOCAL_STREAM_CLEAR_NUM specifies the number of messages that should \n// be cleared from a gather stream before moving onto the next stream. \n// When MSG_ARB_GROUP_SIZE > 1, the order of clearing the streams can be selected\n// with MSG_GROUP_STREAM_CLEAR_TYPE. 0 = clear the whole group MSG_LOCAL_STREAM_CLEAR_NUM times,\n// 1 = clear each stream of the group MSG_LOCAL_STREAM_CLEAR_NUM times before\n// moving onto the next stream in the group.\n" + } + ,{ + "STREAM_MSG_SRC_IN_ORDER_FWD_NUM_MSGS", + 27, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// When using in-order message forwarding, number of messages after which the source\n// pointer goes back to zero (without phase change).\n// Dont-care if STREAM_MCAST_EN == 0 or MSG_SRC_IN_ORDER_FWD == 0.\n" + } + ,{ + "STREAM_CURR_PHASE_BASE", + 28, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Actual phase number executed is STREAM_CURR_PHASE_BASE_REG_INDEX + STREAM_CURR_PHASE_REG_INDEX\n// When reprogramming this register you must also reprogram STREAM_CURR_PHASE_REG_INDEX and STREAM_REMOTE_SRC_PHASE_REG_INDEX\n" + } + ,{ + "STREAM_CURR_PHASE", + 29, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Current phase number executed by the stream.\n" + } + ,{ + "STREAM_PHASE_AUTO_CFG_PTR_BASE", + 30, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Actual address accessed will be STREAM_PHASE_AUTO_CFG_PTR_BASE_REG_INDEX + STREAM_PHASE_AUTO_CFG_PTR_REG_INDEX\n// When reprogramming this register you must also reprogram STREAM_PHASE_AUTO_CFG_PTR_REG_INDEX\n" + } + ,{ + "STREAM_PHASE_AUTO_CFG_PTR", + 31, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Pointer to the stream auto-config data. 
Initialized to the start of\n// the auto-config structure at workload start, automatically updated\n// subsequenty. \n// Specified as byte address, needs to be multiple of 4B.\n" + } + ,{ + "STREAM_RELOAD_PHASE_BLOB", + 32, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// This register acts as indirection to execute a phase that already exists somewhere in the blob.\n// It can be used to compress the blob when many phases need to be repeated.\n// When this register is written with a signed offset, the blob at address (auto_cfg pointer + offset) will be loaded.\n// The loaded blob must manually set its phase (using STREAM_CURR_PHASE) for this feature to work correctly.\n// Furthermore the phase after the reload blob phase must also set its current phase manually.\n" + } + ,{ + "STREAM_MSG_HEADER_FORMAT", + 33, + { + {"MSG_HEADER_WORD_CNT_OFFSET", 0} + ,{"MSG_HEADER_WORD_CNT_BITS", 1} + ,{"MSG_HEADER_INFINITE_PHASE_LAST_TILE_OFFSET", 2} + + }, + { + {0, 0} + ,{(MSG_HEADER_WORD_CNT_OFFSET+MSG_HEADER_WORD_CNT_OFFSET_WIDTH), 1} + ,{(MSG_HEADER_WORD_CNT_BITS+MSG_HEADER_WORD_CNT_BITS_WIDTH), 2} + + }, + { + { + "MSG_HEADER_WORD_CNT_OFFSET", + 0, + MEM_WORD_BIT_OFFSET_WIDTH, + "" + } + ,{ + "MSG_HEADER_WORD_CNT_BITS", + (MSG_HEADER_WORD_CNT_OFFSET+MSG_HEADER_WORD_CNT_OFFSET_WIDTH), + MEM_WORD_BIT_OFFSET_WIDTH, + "" + } + ,{ + "MSG_HEADER_INFINITE_PHASE_LAST_TILE_OFFSET", + (MSG_HEADER_WORD_CNT_BITS+MSG_HEADER_WORD_CNT_BITS_WIDTH), + MEM_WORD_BIT_OFFSET_WIDTH, + "" + } + + }, + "// Offset & size of the size field in the message header. Only valid offsets are multiples of 8\n// (i.e. byte-aligned).\n" + } + ,{ + "STREAM_PHASE_AUTO_CFG_HEADER", + 34, + { + {"PHASE_NUM_INCR", 0} + ,{"CURR_PHASE_NUM_MSGS", 1} + ,{"NEXT_PHASE_NUM_CFG_REG_WRITES", 2} + + }, + { + {0, 0} + ,{(PHASE_NUM_INCR+PHASE_NUM_INCR_WIDTH), 1} + ,{(CURR_PHASE_NUM_MSGS+CURR_PHASE_NUM_MSGS_WIDTH), 2} + + }, + { + { + "PHASE_NUM_INCR", + 0, + 12, + "" + } + ,{ + "CURR_PHASE_NUM_MSGS", + (PHASE_NUM_INCR+PHASE_NUM_INCR_WIDTH), + 12, + "" + } + ,{ + "NEXT_PHASE_NUM_CFG_REG_WRITES", + (CURR_PHASE_NUM_MSGS+CURR_PHASE_NUM_MSGS_WIDTH), + 8, + "" + } + + }, + "// Register corresponding to the auto-configuration header. Written by each auto-config access\n// at phase start, can be also written by software for initial configuration or if auto-config\n// is disabled. \n// PHASE_NUM_INCR is phase number increment relative to the previous executed phase (or 0 right\n// after reset). The increment happens after auto-config is done, and before the phase is executed.\n// (Therefore reading STREAM_CURR_PHASE_REG while auto-config is ongoing, or if it hasnt started\n// yet, may return the old phase number.)\n// This enables up to 2^12-1 phases to be skipped. 
If more phases need to be skipped, it is\n// necessary to insert an intermediate phase with zero messages, whose only purpose is to provide\n// an additional skip offset.\n" + } + ,{ + "STREAM_PERF_CONFIG", + 35, + { + {"CLOCK_GATING_EN", 0} + ,{"CLOCK_GATING_HYST", 1} + ,{"PARTIAL_SEND_WORDS_THR", 2} + + }, + { + {0, 0} + ,{(CLOCK_GATING_EN+CLOCK_GATING_EN_WIDTH), 1} + ,{(CLOCK_GATING_HYST+CLOCK_GATING_HYST_WIDTH), 2} + + }, + { + { + "CLOCK_GATING_EN", + 0, + 1, + "" + } + ,{ + "CLOCK_GATING_HYST", + (CLOCK_GATING_EN+CLOCK_GATING_EN_WIDTH), + 7, + "" + } + ,{ + "PARTIAL_SEND_WORDS_THR", + (CLOCK_GATING_HYST+CLOCK_GATING_HYST_WIDTH), + 8, + "// PARTIAL_SEND_WORDS_THR controls the minimum number of 16-byte words of a tile to accumulate in a relay stream before sending it off to the destination.\n// If the size of the tile is less than or equal to PARTIAL_SEND_WORDS_THR, then this field is ignored.\n// Default is 16 words\n" + } + + }, + "// Should be written only for stream 0, applies to all streams.\n" + } + ,{ + "STREAM_SCRATCH", + 36, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Scratch registers\n// Exists only in streams 0-3 and 8-11\n// Data can be stored at [23:0] from STREAM_SCRATCH_REG_INDEX + 0 to STREAM_SCRATCH_REG_INDEX + 5\n// Can be loaded through overlay blobs.\n" + } + ,{ + "STREAM_SCRATCH_0", + 36, + { + {"NCRISC_TRANS_EN", 0} + ,{"NCRISC_TRANS_EN_IRQ_ON_BLOB_END", 1} + ,{"NCRISC_CMD_ID", 2} + ,{"NEXT_NRISC_PIC_INT_ON_PHASE", 3} + + }, + { + {0, 0} + ,{(NCRISC_TRANS_EN + NCRISC_TRANS_EN_WIDTH), 1} + ,{(NCRISC_TRANS_EN_IRQ_ON_BLOB_END + NCRISC_TRANS_EN_IRQ_ON_BLOB_END_WIDTH), 2} + ,{(NCRISC_CMD_ID + NCRISC_CMD_ID_WIDTH), 3} + + }, + { + { + "NCRISC_TRANS_EN", + 0, + 1, + "" + } + ,{ + "NCRISC_TRANS_EN_IRQ_ON_BLOB_END", + (NCRISC_TRANS_EN + NCRISC_TRANS_EN_WIDTH), + 1, + "" + } + ,{ + "NCRISC_CMD_ID", + (NCRISC_TRANS_EN_IRQ_ON_BLOB_END + NCRISC_TRANS_EN_IRQ_ON_BLOB_END_WIDTH), + 3, + "" + } + ,{ + "NEXT_NRISC_PIC_INT_ON_PHASE", + (NCRISC_CMD_ID + NCRISC_CMD_ID_WIDTH), + 19, + "// Kept for compatibility with grayskull, but does not exist anymore in wormhole\n" + } + + }, + "" + } + ,{ + "STREAM_SCRATCH_1", + 37, + { + {"DRAM_FIFO_RD_PTR_WORDS_LO", 0} + ,{"NCRISC_LOOP_COUNT", 1} + ,{"NCRISC_INIT_ENABLE_BLOB_DONE_IRQ", 2} + ,{"NCRISC_INIT_DISABLE_BLOB_DONE_IRQ", 3} + + }, + { + {0, 0} + ,{(NCRISC_INIT_ENABLE_BLOB_DONE_IRQ + NCRISC_INIT_ENABLE_BLOB_DONE_IRQ_WIDTH), 3} + + }, + { + { + "DRAM_FIFO_RD_PTR_WORDS_LO", + 0, + 24, + "" + } + ,{ + "NCRISC_LOOP_COUNT", + 0, + 24, + "" + } + ,{ + "NCRISC_INIT_ENABLE_BLOB_DONE_IRQ", + 0, + 1, + "" + } + ,{ + "NCRISC_INIT_DISABLE_BLOB_DONE_IRQ", + (NCRISC_INIT_ENABLE_BLOB_DONE_IRQ + NCRISC_INIT_ENABLE_BLOB_DONE_IRQ_WIDTH), + 1, + "" + } + + }, + "" + } + ,{ + "STREAM_SCRATCH_2", + 38, + { + {"DRAM_FIFO_RD_PTR_WORDS_HI", 0} + ,{"DRAM_FIFO_WR_PTR_WORDS_LO", 1} + ,{"NCRISC_TOTAL_LOOP_ITER", 2} + + }, + { + {0, 0} + ,{(DRAM_FIFO_RD_PTR_WORDS_HI + DRAM_FIFO_RD_PTR_WORDS_HI_WIDTH), 1} + + }, + { + { + "DRAM_FIFO_RD_PTR_WORDS_HI", + 0, + 4, + "" + } + ,{ + "DRAM_FIFO_WR_PTR_WORDS_LO", + (DRAM_FIFO_RD_PTR_WORDS_HI + DRAM_FIFO_RD_PTR_WORDS_HI_WIDTH), + 20, + "" + } + ,{ + "NCRISC_TOTAL_LOOP_ITER", + 0, + 24, + "" + } + + }, + "" + } + ,{ + "STREAM_SCRATCH_3", + 39, + { + {"DRAM_FIFO_WR_PTR_WORDS_HI", 0} + ,{"DRAM_FIFO_CAPACITY_PTR_WORDS_LO", 1} + ,{"NCRISC_LOOP_INCR", 2} + ,{"NCRISC_LOOP_BACK_NUM_CFG_REG_WRITES", 3} + + }, + { + {0, 0} + ,{(DRAM_FIFO_WR_PTR_WORDS_HI +
DRAM_FIFO_WR_PTR_WORDS_HI_WIDTH), 1} + ,{(NCRISC_LOOP_INCR+NCRISC_LOOP_INCR_WIDTH), 3} + + }, + { + { + "DRAM_FIFO_WR_PTR_WORDS_HI", + 0, + 8, + "" + } + ,{ + "DRAM_FIFO_CAPACITY_PTR_WORDS_LO", + (DRAM_FIFO_WR_PTR_WORDS_HI + DRAM_FIFO_WR_PTR_WORDS_HI_WIDTH), + 16, + "" + } + ,{ + "NCRISC_LOOP_INCR", + 0, + 16, + "" + } + ,{ + "NCRISC_LOOP_BACK_NUM_CFG_REG_WRITES", + (NCRISC_LOOP_INCR+NCRISC_LOOP_INCR_WIDTH), + 8, + "" + } + + }, + "" + } + ,{ + "STREAM_SCRATCH_4", + 40, + { + {"DRAM_FIFO_CAPACITY_PTR_WORDS_HI", 0} + ,{"DRAM_FIFO_BASE_ADDR_WORDS_LO", 1} + ,{"NCRISC_LOOP_BACK_AUTO_CFG_PTR", 2} + + }, + { + {0, 0} + ,{(DRAM_FIFO_CAPACITY_PTR_WORDS_HI + DRAM_FIFO_CAPACITY_PTR_WORDS_HI_WIDTH), 1} + + }, + { + { + "DRAM_FIFO_CAPACITY_PTR_WORDS_HI", + 0, + 12, + "" + } + ,{ + "DRAM_FIFO_BASE_ADDR_WORDS_LO", + (DRAM_FIFO_CAPACITY_PTR_WORDS_HI + DRAM_FIFO_CAPACITY_PTR_WORDS_HI_WIDTH), + 12, + "" + } + ,{ + "NCRISC_LOOP_BACK_AUTO_CFG_PTR", + 0, + 24, + "" + } + + }, + "" + } + ,{ + "STREAM_SCRATCH_5", + 41, + { + {"DRAM_FIFO_BASE_ADDR_WORDS_HI", 0} + ,{"DRAM_EN_BLOCKING", 1} + ,{"DRAM_DATA_STRUCTURE_IS_LUT", 2} + ,{"DRAM_RESET_RD_PTR_TO_BASE_ON_EMPTY", 3} + ,{"DRAM_RESET_WR_PTR_TO_BASE_ON_FULL", 4} + ,{"DRAM_NO_PTR_UPDATE_ON_PHASE_END", 5} + ,{"DRAM_WR_BUFFER_FLUSH_AND_RST_PTRS", 6} + ,{"NCRISC_LOOP_NEXT_PIC_INT_ON_PHASE", 7} + + }, + { + {0, 0} + ,{(DRAM_FIFO_BASE_ADDR_WORDS_HI + DRAM_FIFO_BASE_ADDR_WORDS_HI_WIDTH), 1} + ,{(DRAM_EN_BLOCKING + DRAM_EN_BLOCKING_WIDTH), 2} + ,{(DRAM_DATA_STRUCTURE_IS_LUT + DRAM_DATA_STRUCTURE_IS_LUT_WIDTH), 3} + ,{(DRAM_RESET_RD_PTR_TO_BASE_ON_EMPTY + DRAM_RESET_RD_PTR_TO_BASE_ON_EMPTY_WIDTH), 4} + ,{(DRAM_RESET_WR_PTR_TO_BASE_ON_FULL + DRAM_RESET_WR_PTR_TO_BASE_ON_FULL_WIDTH), 5} + ,{(DRAM_NO_PTR_UPDATE_ON_PHASE_END + DRAM_NO_PTR_UPDATE_ON_PHASE_END_WIDTH), 6} + + }, + { + { + "DRAM_FIFO_BASE_ADDR_WORDS_HI", + 0, + 16, + "" + } + ,{ + "DRAM_EN_BLOCKING", + (DRAM_FIFO_BASE_ADDR_WORDS_HI + DRAM_FIFO_BASE_ADDR_WORDS_HI_WIDTH), + 1, + "// Processes the read or write operation to completeion without processing other dram streams in the meantime\n" + } + ,{ + "DRAM_DATA_STRUCTURE_IS_LUT", + (DRAM_EN_BLOCKING + DRAM_EN_BLOCKING_WIDTH), + 1, + "// Fifo structure in dram holds a dram pointer and size that is used as indirection to a tile in dram\n" + } + ,{ + "DRAM_RESET_RD_PTR_TO_BASE_ON_EMPTY", + (DRAM_DATA_STRUCTURE_IS_LUT + DRAM_DATA_STRUCTURE_IS_LUT_WIDTH), + 1, + "// During a dram read, if its detected that the fifo is empty the ncrisc will reset the read pointer back to base\n// Its expected that there is no host interaction\n" + } + ,{ + "DRAM_RESET_WR_PTR_TO_BASE_ON_FULL", + (DRAM_RESET_RD_PTR_TO_BASE_ON_EMPTY + DRAM_RESET_RD_PTR_TO_BASE_ON_EMPTY_WIDTH), + 1, + "// During a dram write, if its detected that the fifo is full the ncrisc will reset the write pointer back to base. 
Old data will be overwritten.\n// Its expected that there is no host interaction\n" + } + ,{ + "DRAM_NO_PTR_UPDATE_ON_PHASE_END", + (DRAM_RESET_WR_PTR_TO_BASE_ON_FULL + DRAM_RESET_WR_PTR_TO_BASE_ON_FULL_WIDTH), + 1, + "// The internal ncrisc rd/wr pointers will not be updated at phase end\n// Its expected that there is no host interaction\n" + } + ,{ + "DRAM_WR_BUFFER_FLUSH_AND_RST_PTRS", + (DRAM_NO_PTR_UPDATE_ON_PHASE_END + DRAM_NO_PTR_UPDATE_ON_PHASE_END_WIDTH), + 1, + "// Before ending the phase the ncrisc will wait until the host has emptied the write buffer and then reset the read and write pointers to base\n// This can be used for hosts that do not want to track wrapping\n// The host must be aware of this behaviour for this functionality to work\n" + } + ,{ + "NCRISC_LOOP_NEXT_PIC_INT_ON_PHASE", + 0, + 20, + "" + } + + }, + "" + } + ,{ + "STREAM_MSG_BLOB_BUF_START", + 206, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Start address (in words) of the message blob buffer. \n// Only used when out-of-order execution is enabled. Read value consists of this register + current message blob offset.\n" + } + ,{ + "STREAM_GLOBAL_OFFSET_TABLE", + 207, + { + {"GLOBAL_OFFSET_VAL", 0} + ,{"GLOBAL_OFFSET_TABLE_INDEX_SEL", 1} + ,{"GLOBAL_OFFSET_TABLE_CLEAR", 2} + + }, + { + {0, 0} + ,{(GLOBAL_OFFSET_VAL+GLOBAL_OFFSET_VAL_WIDTH), 1} + ,{(GLOBAL_OFFSET_TABLE_INDEX_SEL+GLOBAL_OFFSET_TABLE_INDEX_SEL_WIDTH), 2} + + }, + { + { + "GLOBAL_OFFSET_VAL", + 0, + MEM_WORD_ADDR_WIDTH, + "" + } + ,{ + "GLOBAL_OFFSET_TABLE_INDEX_SEL", + (GLOBAL_OFFSET_VAL+GLOBAL_OFFSET_VAL_WIDTH), + GLOBAL_OFFSET_TABLE_SIZE_WIDTH, + "" + } + ,{ + "GLOBAL_OFFSET_TABLE_CLEAR", + (GLOBAL_OFFSET_TABLE_INDEX_SEL+GLOBAL_OFFSET_TABLE_INDEX_SEL_WIDTH), + 1, + "" + } + + }, + "// Global offset table write entry interface.\n" + } + ,{ + "FIRMWARE_SCRATCH", + 208, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Scratch location for firmware usage\n// Guarantees that no side-effects occur in Overlay hardware\n// Does not map to any actual registers in streams\n" + } + ,{ + "STREAM_LOCAL_SRC_MASK", + 224, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Bit mask of connnected local source. Dont care if LOCAL_SOURCES_CONNECTED == 0.\n// Mask segments [23:0], [47:24], and [63:48] are at indexes STREAM_LOCAL_SRC_MASK_REG_INDEX, \n// STREAM_LOCAL_SRC_MASK_REG_INDEX+1, STREAM_LOCAL_SRC_MASK_REG_INDEX+2.\n" + } + ,{ + "STREAM_MSG_HEADER_FETCH", + 254, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Reserved for msg header fetch interface\n" + } + ,{ + "RESERVED1", + 255, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Reserved for legacy reasons. 
This range appears not to be used in rtl anymore.\n" + } + ,{ + "STREAM_SCRATCH32", + 256, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Only in receiver endpoint/dram streams\n// A 32 bit scratch register\n" + } + ,{ + "STREAM_WAIT_STATUS", + 257, + { + {"WAIT_SW_PHASE_ADVANCE_SIGNAL", 0} + ,{"WAIT_PREV_PHASE_DATA_FLUSH", 1} + ,{"MSG_FWD_ONGOING", 2} + ,{"STREAM_CURR_STATE", 3} + ,{"TOKEN_GOTTEN", 4} + ,{"INFINITE_PHASE_END_DETECTED", 5} + ,{"INFINITE_PHASE_END_HEADER_BUFFER_DETECTED", 6} + + }, + { + {0, 0} + ,{(WAIT_SW_PHASE_ADVANCE_SIGNAL+WAIT_SW_PHASE_ADVANCE_SIGNAL_WIDTH), 1} + ,{(WAIT_PREV_PHASE_DATA_FLUSH+WAIT_PREV_PHASE_DATA_FLUSH_WIDTH), 2} + ,{(MSG_FWD_ONGOING+MSG_FWD_ONGOING_WIDTH), 3} + ,{(STREAM_CURR_STATE+STREAM_CURR_STATE_WIDTH), 4} + ,{(TOKEN_GOTTEN+TOKEN_GOTTEN_WIDTH), 5} + ,{(INFINITE_PHASE_END_DETECTED+INFINITE_PHASE_END_DETECTED_WIDTH), 6} + + }, + { + { + "WAIT_SW_PHASE_ADVANCE_SIGNAL", + 0, + 1, + "// Set when stream is in START state with auto-config disabled, or if auto-config is enabled\n// but PHASE_AUTO_ADVANCE=0\n" + } + ,{ + "WAIT_PREV_PHASE_DATA_FLUSH", + (WAIT_SW_PHASE_ADVANCE_SIGNAL+WAIT_SW_PHASE_ADVANCE_SIGNAL_WIDTH), + 1, + "// Set when stream has configured the current phase, but waits data from the previous one to be flushed.\n" + } + ,{ + "MSG_FWD_ONGOING", + (WAIT_PREV_PHASE_DATA_FLUSH+WAIT_PREV_PHASE_DATA_FLUSH_WIDTH), + 1, + "// Set when stream is in data forwarding state.\n" + } + ,{ + "STREAM_CURR_STATE", + (MSG_FWD_ONGOING+MSG_FWD_ONGOING_WIDTH), + 4, + "" + } + ,{ + "TOKEN_GOTTEN", + (STREAM_CURR_STATE+STREAM_CURR_STATE_WIDTH), + 1, + "" + } + ,{ + "INFINITE_PHASE_END_DETECTED", + (TOKEN_GOTTEN+TOKEN_GOTTEN_WIDTH), + 1, + "" + } + ,{ + "INFINITE_PHASE_END_HEADER_BUFFER_DETECTED", + (INFINITE_PHASE_END_DETECTED+INFINITE_PHASE_END_DETECTED_WIDTH), + 1, + "" + } + + }, + "// Status info for the stream.\n" + } + ,{ + "STREAM_NUM_MSGS_RECEIVED_IN_BUF_AND_MEM", + 258, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Only in receiver endpoint streams (stream 4 and 5)\n// Read-only. Tells you the number of tiles that have arrived in L1\n" + } + ,{ + "STREAM_NUM_MSGS_RECEIVED", + 259, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Number of received & stored messages (read-only). \n// To get the total number of messages penidng in memory read \n// STREAM_NUM_MSGS_RECEIVED_IN_BUF_AND_MEM_REG_INDEX\n" + } + ,{ + "STREAM_BUF_SPACE_AVAILABLE", + 260, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Available buffer space at the stream (in 16B words). \n// Source cant send data unless available space > 0.\n" + } + ,{ + "STREAM_MSG_INFO_BUF_SPACE_AVAILABLE", + 261, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Available msg info buffer space at the stream (in 16B words). \n// Source cant send data unless available space > 0. 
\n// Only valid when MSG_INFO_BUF_FLOW_CTRL is true\n" + } + ,{ + "STREAM_NEXT_RECEIVED_MSG_ADDR", + 262, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Memory address (in words) of the next in line received message (read-only).\n" + } + ,{ + "STREAM_NEXT_RECEIVED_MSG_SIZE", + 263, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Size in words of the next in line received message (read-only).\n" + } + ,{ + "STREAM_MULTI_MSG_CLEAR", + 264, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Clear message info, move read pointer, and reclaim buffer space for one or more stored messages.\n// This is a special case of STREAM_MSG_INFO_CLEAR/STREAM_MSG_DATA_CLEAR where we arent streaming data\n// and instead we just want to clear a bunch of messages after we have used them.\n// If you are using streaming it is better to use STREAM_MSG_INFO_CLEAR/STREAM_MSG_DATA_CLEAR instead.\n// You should not use both STREAM_MSG_INFO_CLEAR/STREAM_MSG_DATA_CLEAR and STREAM_MULTI_MSG_CLEAR at the same time\n// Must be used only for streams where RECEIVER_ENDPOINT == 1.\n" + } + ,{ + "STREAM_MSG_INFO_CLEAR", + 265, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Clear message info for one or more stored messages. Only valid values are 1, 2, or 4. \n// No effect on the read pointer. \n// Should be used only for streams where RECEIVER_ENDPOINT == 1.\n" + } + ,{ + "STREAM_MSG_DATA_CLEAR", + 266, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Move read pointer & reclaim buffer space for one or more stored messages. \n// Sends flow control update to the source if REMOTE_SOURCE==1. \n// Only valid values are 1, 2, or 4. \n// Should be used only for streams where RECEIVER_ENDPOINT == 1, after \n// STREAM_MSG_INFO_CLEAR_REG has been written with the same value.\n" + } + ,{ + "STREAM_PHASE_ADVANCE", + 267, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Write-only. Write 1 to advance to the next phase if PHASE_AUTO_ADVANCE == 0.\n" + } + ,{ + "STREAM_DEST_PHASE_READY_UPDATE", + 268, + { + {"PHASE_READY_DEST_NUM", 0} + ,{"PHASE_READY_NUM", 1} + ,{"PHASE_READY_MCAST", 2} + ,{"PHASE_READY_TWO_WAY_RESP", 3} + + }, + { + {0, 0} + ,{(PHASE_READY_DEST_NUM+PHASE_READY_DEST_NUM_WIDTH), 1} + ,{(PHASE_READY_NUM+PHASE_READY_NUM_WIDTH), 2} + ,{(PHASE_READY_MCAST+PHASE_READY_MCAST_WIDTH), 3} + + }, + { + { + "PHASE_READY_DEST_NUM", + 0, + 6, + "" + } + ,{ + "PHASE_READY_NUM", + (PHASE_READY_DEST_NUM+PHASE_READY_DEST_NUM_WIDTH), + 20, + "" + } + ,{ + "PHASE_READY_MCAST", + (PHASE_READY_NUM+PHASE_READY_NUM_WIDTH), + 1, + "// set if this stream is part of multicast group (i.e. if REMOTE_SRC_IS_MCAST==1)\n" + } + ,{ + "PHASE_READY_TWO_WAY_RESP", + (PHASE_READY_MCAST+PHASE_READY_MCAST_WIDTH), + 1, + "// set if the message is in response to 2-way handshake\n" + } + + }, + "// Write phase number to indicate destination ready for the given phase. \n// (This is done automatically by stream hardware when starting a phase with REMOTE_SOURCE=1.)\n// The phase number is the one indicated by STREAM_REMOTE_SRC_PHASE_REG at destination. 
\n// This register is mapped to the shared destination ready table, not a per-stream register.\n// (Stream index is taken from the register address, and stored into the table along with the\n// phase number.)\n" + } + ,{ + "STREAM_SRC_READY_UPDATE", + 269, + { + {"STREAM_REMOTE_RDY_SRC_X", 0} + ,{"STREAM_REMOTE_RDY_SRC_Y", 1} + ,{"REMOTE_RDY_SRC_STREAM_ID", 2} + ,{"IS_TOKEN_UPDATE", 3} + + }, + { + {0, 0} + ,{(STREAM_REMOTE_RDY_SRC_X+STREAM_REMOTE_RDY_SRC_X_WIDTH), 1} + ,{(STREAM_REMOTE_RDY_SRC_Y+STREAM_REMOTE_RDY_SRC_Y_WIDTH), 2} + ,{(REMOTE_RDY_SRC_STREAM_ID+REMOTE_RDY_SRC_STREAM_ID_WIDTH), 3} + + }, + { + { + "STREAM_REMOTE_RDY_SRC_X", + 0, + NOC_ID_WIDTH, + "" + } + ,{ + "STREAM_REMOTE_RDY_SRC_Y", + (STREAM_REMOTE_RDY_SRC_X+STREAM_REMOTE_RDY_SRC_X_WIDTH), + NOC_ID_WIDTH, + "" + } + ,{ + "REMOTE_RDY_SRC_STREAM_ID", + (STREAM_REMOTE_RDY_SRC_Y+STREAM_REMOTE_RDY_SRC_Y_WIDTH), + STREAM_ID_WIDTH, + "" + } + ,{ + "IS_TOKEN_UPDATE", + (REMOTE_RDY_SRC_STREAM_ID+REMOTE_RDY_SRC_STREAM_ID_WIDTH), + 1, + "" + } + + }, + "// Source ready message register for two-way handshake (sent by source in \n// case destination ready entry is not found in the table). \n// If received by a stream that already sent its ready update, it prompts resending.\n" + } + ,{ + "STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE", + 270, + { + {"REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_DEST_NUM", 0} + ,{"REMOTE_DEST_BUF_WORDS_FREE_INC", 1} + ,{"REMOTE_DEST_MSG_INFO_BUF_WORDS_FREE_INC", 2} + + }, + { + {0, 0} + ,{(REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_DEST_NUM+REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_DEST_NUM_WIDTH), 1} + ,{(REMOTE_DEST_BUF_WORDS_FREE_INC+REMOTE_DEST_BUF_WORDS_FREE_INC_WIDTH), 2} + + }, + { + { + "REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_DEST_NUM", + 0, + 6, + "" + } + ,{ + "REMOTE_DEST_BUF_WORDS_FREE_INC", + (REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_DEST_NUM+REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_DEST_NUM_WIDTH), + MEM_WORD_ADDR_WIDTH, + "" + } + ,{ + "REMOTE_DEST_MSG_INFO_BUF_WORDS_FREE_INC", + (REMOTE_DEST_BUF_WORDS_FREE_INC+REMOTE_DEST_BUF_WORDS_FREE_INC_WIDTH), + MSG_INFO_BUF_SIZE_WORDS_WIDTH, + "" + } + + }, + "// Update available buffer space at remote destination stream. \n// this is rd_ptr increment issued when a message is forwarded\n" + } + ,{ + "STREAM_RESET", + 271, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Write to reset & stop stream.\n" + } + ,{ + "STREAM_MSG_GROUP_ZERO_MASK_AND", + 272, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// AND value of zero masks for the pending message group. \n// (Header bits [95:64].)\n// Read-only. Valid only for receiver endpoint streams.\n" + } + ,{ + "STREAM_MSG_INFO_FULL", + 273, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Returns 1 if the message info register is full (read-only).\n" + } + ,{ + "STREAM_MSG_INFO_FULLY_LOADED", + 274, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Returns 1 if the message info register is full (read-only), and there are no outstanding loads in progress.\n" + } + ,{ + "STREAM_MSG_INFO_CAN_PUSH_NEW_MSG", + 275, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Returns 1 if the message info register can accept new message push (read-only). 
\n// Equivalent to checking the condition:\n// (STREAM_MSG_INFO_FULL_REG_INDEX == 0) && (STREAM_MSG_INFO_PTR_REG_INDEX == STREAM_MSG_INFO_WR_PTR_REG_INDEX)\n// (I.e. there is free space in the msg info register, and we dont have any message info headers in the\n// memory buffer about to be fetched.)\n" + } + ,{ + "STREAM_MSG_GROUP_COMPRESS", + 276, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Concat compress flags from 4 tiles in the pending message group.\n// (Header bit 52.)\n// Read-only. Valid only for receiver endpoint streams.\n" + } + ,{ + "STREAM_PHASE_ALL_MSGS_PUSHED", + 277, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Returns 1 if all msgs that the phase can accept have been pushed into the stream. 0 otherwise.\n" + } + ,{ + "STREAM_READY_FOR_MSG_PUSH", + 278, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Returns 1 if the stream is in a state where it can accept msgs.\n" + } + ,{ + "STREAM_GLOBAL_OFFSET_TABLE_RD", + 279, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Returns global offset table entry 0. The rest of the table entries can be read at index\n// STREAM_GLOBAL_OFFSET_TABLE_RD_REG_INDEX+i, up to maximum entry size.\n" + } + ,{ + "STREAM_BLOB_AUTO_CFG_DONE", + 288, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// 32 bit register. Each bit denotes whether the corresponding stream has completed its blob run and is in idle state.\n// Resets to 0 upon starting a new stream run. Initially all are 0 to exclude streams that might not be used.\n// Can be manually reset to 0 by writing 1 to the corresponding bit.\n// Exists only in stream 0\n" + } + ,{ + "STREAM_BLOB_NEXT_AUTO_CFG_DONE", + 290, + { + {"BLOB_NEXT_AUTO_CFG_DONE_STREAM_ID", 0} + ,{"BLOB_NEXT_AUTO_CFG_DONE_VALID", 1} + + }, + { + {0, 0} + ,{16, 1} + + }, + { + { + "BLOB_NEXT_AUTO_CFG_DONE_STREAM_ID", + 0, + STREAM_ID_WIDTH, + "" + } + ,{ + "BLOB_NEXT_AUTO_CFG_DONE_VALID", + 16, + 1, + "" + } + + }, + "// Reading this register will give you a stream id of a stream that finished its blob (according to STREAM_BLOB_AUTO_CFG_DONE_REG_INDEX)\n// Subsequent reads will give you the next stream, until all streams are read, after which it will loop\n// This register is only valid if BLOB_NEXT_AUTO_CFG_DONE_VALID is set (i.e. if STREAM_BLOB_AUTO_CFG_DONE_REG_INDEX non-zero)\n// Exists only in stream 0\n" + } + ,{ + "STREAM_RECEIVER_ENDPOINT_SET_MSG_HEADER", + 291, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// For receiver endpoint streams that expose the full message header bus to unpacker,\n// write this register to specify the full header in case the stream is not snooping\n// a remote source but instead also works as a source endpoint.
\n// Write (STREAM_RECEIVER_ENDPOINT_SET_MSG_HEADER_REG_INDEX+i) to set bits [i*32 +: 32]\n// of the message header for the next message, prior to writing STREAM_SOURCE_ENDPOINT_NEW_MSG_INFO_REG_INDEX.\n" + } + ,{ + "STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE", + 297, + { + {"REMOTE_DEST_WORDS_FREE", 0} + ,{"REMOTE_DEST_MSG_INFO_WORDS_FREE", 1} + + }, + { + {0, 0} + ,{(REMOTE_DEST_WORDS_FREE+REMOTE_DEST_WORDS_FREE_WIDTH), 1} + + }, + { + { + "REMOTE_DEST_WORDS_FREE", + 0, + MEM_WORD_ADDR_WIDTH, + "" + } + ,{ + "REMOTE_DEST_MSG_INFO_WORDS_FREE", + (REMOTE_DEST_WORDS_FREE+REMOTE_DEST_WORDS_FREE_WIDTH), + MSG_INFO_BUF_SIZE_WORDS_WIDTH, + "" + } + + }, + "// Available buffer space at remote destination stream(s) for both the data buffer and msg info buffer. \n// Dont care unless REMOTE_RECEIVER == 1. \n// Source cant send data unless WORDS_FREE > 0. \n// Read-only; updated automatically to maximum value when \n// STREAM_REMOTE_DEST_BUF_SIZE_REG/STREAM_REMOTE_DEST_MSG_INFO_BUF_SIZE_REG is updated. \n// For multicast streams, values for successive destinations are at \n// subsequent indexes (STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_REG_INDEX+1, \n// STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_REG_INDEX+2, etc.).\n// REMOTE_DEST_MSG_INFO_WORDS_FREE is only valid when DEST_MSG_INFO_BUF_FLOW_CTRL is true\n" + } + ,{ + "STREAM_RECEIVER_MSG_INFO", + 329, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Read-only register view of the bits on the o_full_msg_info bus. \n// Exposed as 32-bit read-only registers starting at this index.\n" + } + ,{ + "STREAM_DEBUG_STATUS_SEL", + 499, + { + {"DEBUG_STATUS_STREAM_ID_SEL", 0} + ,{"DISABLE_DEST_READY_TABLE", 1} + ,{"DISABLE_GLOBAL_OFFSET_TABLE", 2} + + }, + { + {0, 0} + ,{(DEBUG_STATUS_STREAM_ID_SEL+DEBUG_STATUS_STREAM_ID_SEL_WIDTH), 1} + ,{(DISABLE_DEST_READY_TABLE+DISABLE_DEST_READY_TABLE_WIDTH), 2} + + }, + { + { + "DEBUG_STATUS_STREAM_ID_SEL", + 0, + STREAM_ID_WIDTH, + "" + } + ,{ + "DISABLE_DEST_READY_TABLE", + (DEBUG_STATUS_STREAM_ID_SEL+DEBUG_STATUS_STREAM_ID_SEL_WIDTH), + 1, + "" + } + ,{ + "DISABLE_GLOBAL_OFFSET_TABLE", + (DISABLE_DEST_READY_TABLE+DISABLE_DEST_READY_TABLE_WIDTH), + 1, + "" + } + + }, + "// Debug bus stream selection. Write the stream id for the stream that you want exposed on the debug bus\n// This register only exists in stream 0.\n" + } + ,{ + "STREAM_DEBUG_ASSERTIONS", + 500, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Debugging: Non-zero value indicates an invalid stream operation occured.\n// Sticky, write 1 to clear.\n" + } + ,{ + "STREAM_DEBUG_STATUS", + 501, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Read-only register that exposes internal states of the stream.\n// Useful for debugging. Valid 32-bit data from STREAM_DEBUG_STATUS_REG_INDEX + 0 to STREAM_DEBUG_STATUS_REG_INDEX + 9\n" + } + ,{ + "RESERVED2", + 511, + { + std::unordered_map() + }, + { + std::unordered_map() + }, + { + std::vector() + }, + "// Reserved for legacy reasons. 
This range appears not to be used in rtl anymore.\n" + } +}; + +const std::unordered_map OLP::registers_by_name = { + {"STREAM_SOURCE_ENDPOINT_NEW_MSG_INFO", 0} + ,{"STREAM_NUM_MSGS_RECEIVED_INC", 1} + ,{"STREAM_ONETIME_MISC_CFG", 2} + ,{"STREAM_MISC_CFG", 3} + ,{"STREAM_REMOTE_SRC", 4} + ,{"STREAM_REMOTE_SRC_PHASE", 5} + ,{"STREAM_MEM_BUF_SPACE_AVAILABLE_ACK_THRESHOLD", 6} + ,{"STREAM_REMOTE_DEST", 7} + ,{"STREAM_LOCAL_DEST", 8} + ,{"STREAM_REMOTE_DEST_BUF_START", 9} + ,{"STREAM_REMOTE_DEST_BUF_START_HI", 10} + ,{"STREAM_REMOTE_DEST_BUF_SIZE", 11} + ,{"STREAM_REMOTE_DEST_WR_PTR", 12} + ,{"STREAM_REMOTE_DEST_MSG_INFO_BUF_SIZE", 13} + ,{"STREAM_REMOTE_DEST_MSG_INFO_BUF_START", 14} + ,{"STREAM_REMOTE_DEST_MSG_INFO_WR_PTR", 15} + ,{"STREAM_REMOTE_DEST_MSG_INFO_BUF_START_HI", 16} + ,{"STREAM_REMOTE_DEST_MSG_INFO_WR_PTR_HI", 17} + ,{"STREAM_REMOTE_DEST_MSG_INFO_WRAP_WR_PTR", 18} + ,{"STREAM_REMOTE_DEST_TRAFFIC", 19} + ,{"STREAM_BUF_START", 20} + ,{"STREAM_BUF_SIZE", 21} + ,{"STREAM_RD_PTR", 22} + ,{"STREAM_WR_PTR", 23} + ,{"STREAM_MSG_INFO_BUF_SIZE", 24} + ,{"STREAM_MSG_INFO_BUF_START", 25} + ,{"STREAM_MSG_INFO_PTR", 26} + ,{"STREAM_MSG_INFO_WRAP_RD_WR_PTR", 27} + ,{"STREAM_MSG_INFO_WR_PTR", 28} + ,{"STREAM_MCAST_DEST", 29} + ,{"STREAM_MCAST_DEST_NUM", 30} + ,{"STREAM_GATHER", 31} + ,{"STREAM_MSG_SRC_IN_ORDER_FWD_NUM_MSGS", 32} + ,{"STREAM_CURR_PHASE_BASE", 33} + ,{"STREAM_CURR_PHASE", 34} + ,{"STREAM_PHASE_AUTO_CFG_PTR_BASE", 35} + ,{"STREAM_PHASE_AUTO_CFG_PTR", 36} + ,{"STREAM_RELOAD_PHASE_BLOB", 37} + ,{"STREAM_MSG_HEADER_FORMAT", 38} + ,{"STREAM_PHASE_AUTO_CFG_HEADER", 39} + ,{"STREAM_PERF_CONFIG", 40} + ,{"STREAM_SCRATCH", 41} + ,{"STREAM_SCRATCH_0", 42} + ,{"STREAM_SCRATCH_1", 43} + ,{"STREAM_SCRATCH_2", 44} + ,{"STREAM_SCRATCH_3", 45} + ,{"STREAM_SCRATCH_4", 46} + ,{"STREAM_SCRATCH_5", 47} + ,{"STREAM_MSG_BLOB_BUF_START", 48} + ,{"STREAM_GLOBAL_OFFSET_TABLE", 49} + ,{"FIRMWARE_SCRATCH", 50} + ,{"STREAM_LOCAL_SRC_MASK", 51} + ,{"STREAM_MSG_HEADER_FETCH", 52} + ,{"RESERVED1", 53} + ,{"STREAM_SCRATCH32", 54} + ,{"STREAM_WAIT_STATUS", 55} + ,{"STREAM_NUM_MSGS_RECEIVED_IN_BUF_AND_MEM", 56} + ,{"STREAM_NUM_MSGS_RECEIVED", 57} + ,{"STREAM_BUF_SPACE_AVAILABLE", 58} + ,{"STREAM_MSG_INFO_BUF_SPACE_AVAILABLE", 59} + ,{"STREAM_NEXT_RECEIVED_MSG_ADDR", 60} + ,{"STREAM_NEXT_RECEIVED_MSG_SIZE", 61} + ,{"STREAM_MULTI_MSG_CLEAR", 62} + ,{"STREAM_MSG_INFO_CLEAR", 63} + ,{"STREAM_MSG_DATA_CLEAR", 64} + ,{"STREAM_PHASE_ADVANCE", 65} + ,{"STREAM_DEST_PHASE_READY_UPDATE", 66} + ,{"STREAM_SRC_READY_UPDATE", 67} + ,{"STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE", 68} + ,{"STREAM_RESET", 69} + ,{"STREAM_MSG_GROUP_ZERO_MASK_AND", 70} + ,{"STREAM_MSG_INFO_FULL", 71} + ,{"STREAM_MSG_INFO_FULLY_LOADED", 72} + ,{"STREAM_MSG_INFO_CAN_PUSH_NEW_MSG", 73} + ,{"STREAM_MSG_GROUP_COMPRESS", 74} + ,{"STREAM_PHASE_ALL_MSGS_PUSHED", 75} + ,{"STREAM_READY_FOR_MSG_PUSH", 76} + ,{"STREAM_GLOBAL_OFFSET_TABLE_RD", 77} + ,{"STREAM_BLOB_AUTO_CFG_DONE", 78} + ,{"STREAM_BLOB_NEXT_AUTO_CFG_DONE", 79} + ,{"STREAM_RECEIVER_ENDPOINT_SET_MSG_HEADER", 80} + ,{"STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE", 81} + ,{"STREAM_RECEIVER_MSG_INFO", 82} + ,{"STREAM_DEBUG_STATUS_SEL", 83} + ,{"STREAM_DEBUG_ASSERTIONS", 84} + ,{"STREAM_DEBUG_STATUS", 85} + ,{"RESERVED2", 86} +}; + +const std::unordered_map OLP::registers_by_index = { + {0, 0} + ,{1, 1} + ,{2, 2} + ,{3, 3} + ,{4, 4} + ,{5, 5} + ,{6, 6} + ,{7, 7} + ,{8, 9} + ,{9, 10} + ,{10, 11} + ,{11, 12} + ,{12, 13} + ,{13, 14} + ,{14, 16} + ,{15, 18} + ,{16, 19} + ,{17, 20} + ,{18, 21} + ,{19, 22} + 
,{20, 23} + ,{21, 24} + ,{22, 25} + ,{23, 27} + ,{24, 29} + ,{25, 30} + ,{26, 31} + ,{27, 32} + ,{28, 33} + ,{29, 34} + ,{30, 35} + ,{31, 36} + ,{32, 37} + ,{33, 38} + ,{34, 39} + ,{35, 40} + ,{36, 41} + ,{37, 43} + ,{38, 44} + ,{39, 45} + ,{40, 46} + ,{41, 47} + ,{206, 48} + ,{207, 49} + ,{208, 50} + ,{224, 51} + ,{254, 52} + ,{255, 53} + ,{256, 54} + ,{257, 55} + ,{258, 56} + ,{259, 57} + ,{260, 58} + ,{261, 59} + ,{262, 60} + ,{263, 61} + ,{264, 62} + ,{265, 63} + ,{266, 64} + ,{267, 65} + ,{268, 66} + ,{269, 67} + ,{270, 68} + ,{271, 69} + ,{272, 70} + ,{273, 71} + ,{274, 72} + ,{275, 73} + ,{276, 74} + ,{277, 75} + ,{278, 76} + ,{279, 77} + ,{288, 78} + ,{290, 79} + ,{291, 80} + ,{297, 81} + ,{329, 82} + ,{499, 83} + ,{500, 84} + ,{501, 85} + ,{511, 86} +}; + +const std::vector OLP::fields = { + { + "SOURCE_ENDPOINT_NEW_MSG_ADDR", + 0, + MEM_WORD_ADDR_WIDTH, + "" + } + ,{ + "SOURCE_ENDPOINT_NEW_MSG_SIZE", + (SOURCE_ENDPOINT_NEW_MSG_ADDR+SOURCE_ENDPOINT_NEW_MSG_ADDR_WIDTH), + (32-MEM_WORD_ADDR_WIDTH-1), + "" + } + ,{ + "SOURCE_ENDPOINT_NEW_MSG_LAST_TILE", + (SOURCE_ENDPOINT_NEW_MSG_SIZE+SOURCE_ENDPOINT_NEW_MSG_SIZE_WIDTH), + (1), + "" + } + ,{ + "SOURCE_ENDPOINT_NEW_MSGS_NUM", + 0, + 12, + "" + } + ,{ + "SOURCE_ENDPOINT_NEW_MSGS_TOTAL_SIZE", + (SOURCE_ENDPOINT_NEW_MSGS_NUM+SOURCE_ENDPOINT_NEW_MSGS_NUM_WIDTH), + MEM_WORD_ADDR_WIDTH, + "" + } + ,{ + "SOURCE_ENDPOINT_NEW_MSGS_LAST_TILE", + (SOURCE_ENDPOINT_NEW_MSGS_TOTAL_SIZE+SOURCE_ENDPOINT_NEW_MSGS_TOTAL_SIZE_WIDTH), + 1, + "" + } + ,{ + "PHASE_AUTO_CONFIG", + 0, + 1, + "" + } + ,{ + "PHASE_AUTO_ADVANCE", + (PHASE_AUTO_CONFIG+PHASE_AUTO_CONFIG_WIDTH), + 1, + "" + } + ,{ + "REG_UPDATE_VC_REG", + (PHASE_AUTO_ADVANCE+PHASE_AUTO_ADVANCE_WIDTH), + 3, + "// set to one of the values (0-5) to select which VC control flow updates will be sent on\n" + } + ,{ + "GLOBAL_OFFSET_TABLE_RD_SRC_INDEX", + (REG_UPDATE_VC_REG+REG_UPDATE_VC_REG_WIDTH), + GLOBAL_OFFSET_TABLE_SIZE_WIDTH, + "// Read index of global offset table, which will offset o_data_fwd_src_addr by entry value.\n" + } + ,{ + "GLOBAL_OFFSET_TABLE_RD_DEST_INDEX", + (GLOBAL_OFFSET_TABLE_RD_SRC_INDEX+GLOBAL_OFFSET_TABLE_RD_SRC_INDEX_WIDTH), + GLOBAL_OFFSET_TABLE_SIZE_WIDTH, + "// Read index of global offset table, which will offset o_data_fwd_dest_addr by entry value.\n" + } + ,{ + "INCOMING_DATA_NOC", + 0, + NOC_NUM_WIDTH, + "" + } + ,{ + "OUTGOING_DATA_NOC", + (INCOMING_DATA_NOC+INCOMING_DATA_NOC_WIDTH), + NOC_NUM_WIDTH, + "" + } + ,{ + "REMOTE_SRC_UPDATE_NOC", + (OUTGOING_DATA_NOC+OUTGOING_DATA_NOC_WIDTH), + NOC_NUM_WIDTH, + "" + } + ,{ + "LOCAL_SOURCES_CONNECTED", + (REMOTE_SRC_UPDATE_NOC+REMOTE_SRC_UPDATE_NOC_WIDTH), + 1, + "" + } + ,{ + "SOURCE_ENDPOINT", + (LOCAL_SOURCES_CONNECTED+LOCAL_SOURCES_CONNECTED_WIDTH), + 1, + "" + } + ,{ + "REMOTE_SOURCE", + (SOURCE_ENDPOINT+SOURCE_ENDPOINT_WIDTH), + 1, + "" + } + ,{ + "RECEIVER_ENDPOINT", + (REMOTE_SOURCE+REMOTE_SOURCE_WIDTH), + 1, + "" + } + ,{ + "LOCAL_RECEIVER", + (RECEIVER_ENDPOINT+RECEIVER_ENDPOINT_WIDTH), + 1, + "" + } + ,{ + "REMOTE_RECEIVER", + (LOCAL_RECEIVER+LOCAL_RECEIVER_WIDTH), + 1, + "" + } + ,{ + "TOKEN_MODE", + (REMOTE_RECEIVER+REMOTE_RECEIVER_WIDTH), + 1, + "" + } + ,{ + "COPY_MODE", + (TOKEN_MODE+TOKEN_MODE_WIDTH), + 1, + "" + } + ,{ + "NEXT_PHASE_SRC_CHANGE", + (COPY_MODE+COPY_MODE_WIDTH), + 1, + "" + } + ,{ + "NEXT_PHASE_DEST_CHANGE", + (NEXT_PHASE_SRC_CHANGE+NEXT_PHASE_SRC_CHANGE_WIDTH), + 1, + "" + } + ,{ + "DATA_BUF_NO_FLOW_CTRL", + (NEXT_PHASE_DEST_CHANGE+NEXT_PHASE_DEST_CHANGE_WIDTH), + 1, + "// set if 
REMOTE_SOURCE==1 and the buffer is large enough to accept full phase data without wrapping:\n" + } + ,{ + "DEST_DATA_BUF_NO_FLOW_CTRL", + (DATA_BUF_NO_FLOW_CTRL+DATA_BUF_NO_FLOW_CTRL_WIDTH), + 1, + "// set if REMOTE_RECEIVER==1 and the destination buffer is large enough to accept full phase data without wrapping:\n" + } + ,{ + "MSG_INFO_BUF_FLOW_CTRL", + (DEST_DATA_BUF_NO_FLOW_CTRL+DEST_DATA_BUF_NO_FLOW_CTRL_WIDTH), + 1, + "// set if REMOTE_SOURCE==1 and you want the buffer to have wrapping:\n" + } + ,{ + "DEST_MSG_INFO_BUF_FLOW_CTRL", + (MSG_INFO_BUF_FLOW_CTRL+MSG_INFO_BUF_FLOW_CTRL_WIDTH), + 1, + "// set if REMOTE_RECEIVER==1 and you want the destination buffer to have wrapping:\n" + } + ,{ + "REMOTE_SRC_IS_MCAST", + (DEST_MSG_INFO_BUF_FLOW_CTRL+DEST_MSG_INFO_BUF_FLOW_CTRL_WIDTH), + 1, + "// set if REMOTE_SOURCE==1 and has mulicast enabled (i.e. this stream is part of a multicast group)\n" + } + ,{ + "NO_PREV_PHASE_OUTGOING_DATA_FLUSH", + (REMOTE_SRC_IS_MCAST+REMOTE_SRC_IS_MCAST_WIDTH), + 1, + "// set if no need to flush outgoing remote data from previous phase\n" + } + ,{ + "SRC_FULL_CREDIT_FLUSH_EN", + (NO_PREV_PHASE_OUTGOING_DATA_FLUSH+NO_PREV_PHASE_OUTGOING_DATA_FLUSH_WIDTH), + 1, + "// Set to one to enable full credit flushing on src side\n" + } + ,{ + "DST_FULL_CREDIT_FLUSH_EN", + (SRC_FULL_CREDIT_FLUSH_EN+SRC_FULL_CREDIT_FLUSH_EN_WIDTH), + 1, + "// Set to one to enable full credit flushing on dest side\n" + } + ,{ + "INFINITE_PHASE_EN", + (DST_FULL_CREDIT_FLUSH_EN+DST_FULL_CREDIT_FLUSH_EN_WIDTH), + 1, + "// Set to one to enable infinite messages per phase, accompanied by a last tile header bit which will end the phase\n" + } + ,{ + "OOO_PHASE_EXECUTION_EN", + (INFINITE_PHASE_EN+INFINITE_PHASE_EN_WIDTH), + 1, + "// Enables out-of-order phase execution by providing an array of size num_tiles at the end of phase blob, with order in which each tile should be sent. 
Each array entry contains a 17-bit tile address and a 15-bit tile size.\n" + } + ,{ + "STREAM_REMOTE_SRC_X", + 0, + NOC_ID_WIDTH, + "" + } + ,{ + "STREAM_REMOTE_SRC_Y", + (STREAM_REMOTE_SRC_X+STREAM_REMOTE_SRC_X_WIDTH), + NOC_ID_WIDTH, + "" + } + ,{ + "REMOTE_SRC_STREAM_ID", + (STREAM_REMOTE_SRC_Y+STREAM_REMOTE_SRC_Y_WIDTH), + STREAM_ID_WIDTH, + "" + } + ,{ + "STREAM_REMOTE_SRC_DEST_INDEX", + (REMOTE_SRC_STREAM_ID+REMOTE_SRC_STREAM_ID_WIDTH), + STREAM_ID_WIDTH, + "" + } + ,{ + "DRAM_READS__TRANS_SIZE_WORDS_LO", + (STREAM_REMOTE_SRC_Y+STREAM_REMOTE_SRC_Y_WIDTH), + 12, + "" + } + ,{ + "DRAM_READS__SCRATCH_1_PTR", + 0, + 19, + "" + } + ,{ + "DRAM_READS__TRANS_SIZE_WORDS_HI", + (DRAM_READS__SCRATCH_1_PTR+DRAM_READS__SCRATCH_1_PTR_WIDTH), + 1, + "" + } + ,{ + "STREAM_REMOTE_DEST_X", + 0, + NOC_ID_WIDTH, + "" + } + ,{ + "STREAM_REMOTE_DEST_Y", + (STREAM_REMOTE_DEST_X+STREAM_REMOTE_DEST_X_WIDTH), + NOC_ID_WIDTH, + "" + } + ,{ + "STREAM_REMOTE_DEST_STREAM_ID", + (STREAM_REMOTE_DEST_Y+STREAM_REMOTE_DEST_Y_WIDTH), + STREAM_ID_WIDTH, + "" + } + ,{ + "STREAM_LOCAL_DEST_MSG_CLEAR_NUM", + 0, + 12, + "" + } + ,{ + "STREAM_LOCAL_DEST_STREAM_ID", + (STREAM_LOCAL_DEST_MSG_CLEAR_NUM+STREAM_LOCAL_DEST_MSG_CLEAR_NUM_WIDTH), + STREAM_ID_WIDTH, + "" + } + ,{ + "DRAM_WRITES__SCRATCH_1_PTR_LO", + 0, + 16, + "" + } + ,{ + "REMOTE_DEST_BUF_SIZE_WORDS", + 0, + MEM_WORD_ADDR_WIDTH, + "" + } + ,{ + "DRAM_WRITES__SCRATCH_1_PTR_HI", + 0, + 3, + "" + } + ,{ + "REMOTE_DEST_MSG_INFO_BUF_SIZE_POW2", + 0, + MSG_INFO_BUF_SIZE_POW_BITS, + "" + } + ,{ + "NOC_PRIORITY", + 0, + 4, + "" + } + ,{ + "UNICAST_VC_REG", + (NOC_PRIORITY+NOC_PRIORITY_WIDTH), + 3, + "// set to one of the values (0-5) to select which VC unicast requests will be sent on\n" + } + ,{ + "STREAM_RD_PTR_VAL", + 0, + MEM_WORD_ADDR_WIDTH, + "" + } + ,{ + "STREAM_RD_PTR_WRAP", + (STREAM_RD_PTR_VAL+STREAM_RD_PTR_VAL_WIDTH), + 1, + "" + } + ,{ + "STREAM_WR_PTR_VAL", + 0, + MEM_WORD_ADDR_WIDTH, + "" + } + ,{ + "STREAM_WR_PTR_WRAP", + (STREAM_WR_PTR_VAL+STREAM_WR_PTR_VAL_WIDTH), + 1, + "" + } + ,{ + "MSG_INFO_BUF_SIZE_POW2", + 0, + MSG_INFO_BUF_SIZE_POW_BITS, + "" + } + ,{ + "STREAM_MSG_INFO_WRAP_RD_PTR", + 0, + MSG_INFO_BUF_SIZE_BITS, + "" + } + ,{ + "STREAM_MSG_INFO_WRAP_RD_PTR_WRAP", + (STREAM_MSG_INFO_WRAP_RD_PTR+STREAM_MSG_INFO_WRAP_RD_PTR_WIDTH), + 1, + "" + } + ,{ + "STREAM_MSG_INFO_WRAP_WR_PTR", + (STREAM_MSG_INFO_WRAP_RD_PTR_WRAP+STREAM_MSG_INFO_WRAP_RD_PTR_WRAP_WIDTH), + MSG_INFO_BUF_SIZE_BITS, + "" + } + ,{ + "STREAM_MSG_INFO_WRAP_WR_PTR_WRAP", + (STREAM_MSG_INFO_WRAP_WR_PTR+STREAM_MSG_INFO_WRAP_WR_PTR_WIDTH), + 1, + "" + } + ,{ + "STREAM_MCAST_END_X", + 0, + NOC_ID_WIDTH, + "" + } + ,{ + "STREAM_MCAST_END_Y", + (STREAM_MCAST_END_X+STREAM_MCAST_END_X_WIDTH), + NOC_ID_WIDTH, + "" + } + ,{ + "STREAM_MCAST_EN", + (STREAM_MCAST_END_Y+STREAM_MCAST_END_Y_WIDTH), + 1, + "" + } + ,{ + "STREAM_MCAST_LINKED", + (STREAM_MCAST_EN+STREAM_MCAST_EN_WIDTH), + 1, + "" + } + ,{ + "STREAM_MCAST_VC", + (STREAM_MCAST_LINKED+STREAM_MCAST_LINKED_WIDTH), + 1, + "// Set to 0 to select VC 4, and 1 to select VC 5 (default 0)\n" + } + ,{ + "STREAM_MCAST_NO_PATH_RES", + (STREAM_MCAST_VC+STREAM_MCAST_VC_WIDTH), + 1, + "" + } + ,{ + "STREAM_MCAST_XY", + (STREAM_MCAST_NO_PATH_RES+STREAM_MCAST_NO_PATH_RES_WIDTH), + 1, + "" + } + ,{ + "STREAM_MCAST_SRC_SIDE_DYNAMIC_LINKED", + (STREAM_MCAST_XY+STREAM_MCAST_XY_WIDTH), + 1, + "" + } + ,{ + "STREAM_MCAST_DEST_SIDE_DYNAMIC_LINKED", + (STREAM_MCAST_SRC_SIDE_DYNAMIC_LINKED+STREAM_MCAST_SRC_SIDE_DYNAMIC_LINKED_WIDTH), + 1, + "" + } + ,{ + 
"MSG_LOCAL_STREAM_CLEAR_NUM", + 0, + 12, + "" + } + ,{ + "MSG_GROUP_STREAM_CLEAR_TYPE", + (MSG_LOCAL_STREAM_CLEAR_NUM+MSG_LOCAL_STREAM_CLEAR_NUM_WIDTH), + 1, + "" + } + ,{ + "MSG_ARB_GROUP_SIZE", + (MSG_GROUP_STREAM_CLEAR_TYPE+MSG_GROUP_STREAM_CLEAR_TYPE_WIDTH), + 3, + "" + } + ,{ + "MSG_SRC_IN_ORDER_FWD", + (MSG_ARB_GROUP_SIZE+MSG_ARB_GROUP_SIZE_WIDTH), + 1, + "" + } + ,{ + "MSG_SRC_ARBITRARY_CLEAR_NUM_EN", + (MSG_SRC_IN_ORDER_FWD+MSG_SRC_IN_ORDER_FWD_WIDTH), + 1, + "" + } + ,{ + "MSG_HEADER_WORD_CNT_OFFSET", + 0, + MEM_WORD_BIT_OFFSET_WIDTH, + "" + } + ,{ + "MSG_HEADER_WORD_CNT_BITS", + (MSG_HEADER_WORD_CNT_OFFSET+MSG_HEADER_WORD_CNT_OFFSET_WIDTH), + MEM_WORD_BIT_OFFSET_WIDTH, + "" + } + ,{ + "MSG_HEADER_INFINITE_PHASE_LAST_TILE_OFFSET", + (MSG_HEADER_WORD_CNT_BITS+MSG_HEADER_WORD_CNT_BITS_WIDTH), + MEM_WORD_BIT_OFFSET_WIDTH, + "" + } + ,{ + "PHASE_NUM_INCR", + 0, + 12, + "" + } + ,{ + "CURR_PHASE_NUM_MSGS", + (PHASE_NUM_INCR+PHASE_NUM_INCR_WIDTH), + 12, + "" + } + ,{ + "NEXT_PHASE_NUM_CFG_REG_WRITES", + (CURR_PHASE_NUM_MSGS+CURR_PHASE_NUM_MSGS_WIDTH), + 8, + "" + } + ,{ + "CLOCK_GATING_EN", + 0, + 1, + "" + } + ,{ + "CLOCK_GATING_HYST", + (CLOCK_GATING_EN+CLOCK_GATING_EN_WIDTH), + 7, + "" + } + ,{ + "PARTIAL_SEND_WORDS_THR", + (CLOCK_GATING_HYST+CLOCK_GATING_HYST_WIDTH), + 8, + "// PARTIAL_SEND_WORDS_THR contols the minimum number of 16-byte words of a tile to accumulate in a relay stream before sending it off to the destination.\n// If the size of the tile is less than or equal to PARTIAL_SEND_WORDS_THR, then this feild is ignored.\n// Default is 16 words\n" + } + ,{ + "NCRISC_TRANS_EN", + 0, + 1, + "" + } + ,{ + "NCRISC_TRANS_EN_IRQ_ON_BLOB_END", + (NCRISC_TRANS_EN + NCRISC_TRANS_EN_WIDTH), + 1, + "" + } + ,{ + "NCRISC_CMD_ID", + (NCRISC_TRANS_EN_IRQ_ON_BLOB_END + NCRISC_TRANS_EN_IRQ_ON_BLOB_END_WIDTH), + 3, + "" + } + ,{ + "NEXT_NRISC_PIC_INT_ON_PHASE", + (NCRISC_CMD_ID + NCRISC_CMD_ID_WIDTH), + 19, + "// Kept for compatibility with grayskull, but doesnt not exist anymore in wormhole\n" + } + ,{ + "DRAM_FIFO_RD_PTR_WORDS_LO", + 0, + 24, + "" + } + ,{ + "NCRISC_LOOP_COUNT", + 0, + 24, + "" + } + ,{ + "NCRISC_INIT_ENABLE_BLOB_DONE_IRQ", + 0, + 1, + "" + } + ,{ + "NCRISC_INIT_DISABLE_BLOB_DONE_IRQ", + (NCRISC_INIT_ENABLE_BLOB_DONE_IRQ + NCRISC_INIT_ENABLE_BLOB_DONE_IRQ_WIDTH), + 1, + "" + } + ,{ + "DRAM_FIFO_RD_PTR_WORDS_HI", + 0, + 4, + "" + } + ,{ + "DRAM_FIFO_WR_PTR_WORDS_LO", + (DRAM_FIFO_RD_PTR_WORDS_HI + DRAM_FIFO_RD_PTR_WORDS_HI_WIDTH), + 20, + "" + } + ,{ + "NCRISC_TOTAL_LOOP_ITER", + 0, + 24, + "" + } + ,{ + "DRAM_FIFO_WR_PTR_WORDS_HI", + 0, + 8, + "" + } + ,{ + "DRAM_FIFO_CAPACITY_PTR_WORDS_LO", + (DRAM_FIFO_WR_PTR_WORDS_HI + DRAM_FIFO_WR_PTR_WORDS_HI_WIDTH), + 16, + "" + } + ,{ + "NCRISC_LOOP_INCR", + 0, + 16, + "" + } + ,{ + "NCRISC_LOOP_BACK_NUM_CFG_REG_WRITES", + (NCRISC_LOOP_INCR+NCRISC_LOOP_INCR_WIDTH), + 8, + "" + } + ,{ + "DRAM_FIFO_CAPACITY_PTR_WORDS_HI", + 0, + 12, + "" + } + ,{ + "DRAM_FIFO_BASE_ADDR_WORDS_LO", + (DRAM_FIFO_CAPACITY_PTR_WORDS_HI + DRAM_FIFO_CAPACITY_PTR_WORDS_HI_WIDTH), + 12, + "" + } + ,{ + "NCRISC_LOOP_BACK_AUTO_CFG_PTR", + 0, + 24, + "" + } + ,{ + "DRAM_FIFO_BASE_ADDR_WORDS_HI", + 0, + 16, + "" + } + ,{ + "DRAM_EN_BLOCKING", + (DRAM_FIFO_BASE_ADDR_WORDS_HI + DRAM_FIFO_BASE_ADDR_WORDS_HI_WIDTH), + 1, + "// Processes the read or write operation to completeion without processing other dram streams in the meantime\n" + } + ,{ + "DRAM_DATA_STRUCTURE_IS_LUT", + (DRAM_EN_BLOCKING + DRAM_EN_BLOCKING_WIDTH), + 1, + "// Fifo structure in dram holds a dram 
pointer and size that is used as indirection to a tile in dram\n" + } + ,{ + "DRAM_RESET_RD_PTR_TO_BASE_ON_EMPTY", + (DRAM_DATA_STRUCTURE_IS_LUT + DRAM_DATA_STRUCTURE_IS_LUT_WIDTH), + 1, + "// During a dram read, if its detected that the fifo is empty the ncrisc will reset the read pointer back to base\n// Its expected that there is no host interaction\n" + } + ,{ + "DRAM_RESET_WR_PTR_TO_BASE_ON_FULL", + (DRAM_RESET_RD_PTR_TO_BASE_ON_EMPTY + DRAM_RESET_RD_PTR_TO_BASE_ON_EMPTY_WIDTH), + 1, + "// During a dram write, if its detected that the fifo is full the ncrisc will reset the write pointer back to base. Old data will be overwritten.\n// Its expected that there is no host interaction\n" + } + ,{ + "DRAM_NO_PTR_UPDATE_ON_PHASE_END", + (DRAM_RESET_WR_PTR_TO_BASE_ON_FULL + DRAM_RESET_WR_PTR_TO_BASE_ON_FULL_WIDTH), + 1, + "// The internal ncrisc rd/wr pointers will not be updated at phase end\n// Its expected that there is no host interaction\n" + } + ,{ + "DRAM_WR_BUFFER_FLUSH_AND_RST_PTRS", + (DRAM_NO_PTR_UPDATE_ON_PHASE_END + DRAM_NO_PTR_UPDATE_ON_PHASE_END_WIDTH), + 1, + "// Before ending the phase the ncrisc will wait until the host has emptied the write buffer and then reset the read and write pointers to base\n// This can be used for hosts that do not want to track wrapping\n// The host must be aware of this behaviour for this functionality to work\n" + } + ,{ + "NCRISC_LOOP_NEXT_PIC_INT_ON_PHASE", + 0, + 20, + "" + } + ,{ + "GLOBAL_OFFSET_VAL", + 0, + MEM_WORD_ADDR_WIDTH, + "" + } + ,{ + "GLOBAL_OFFSET_TABLE_INDEX_SEL", + (GLOBAL_OFFSET_VAL+GLOBAL_OFFSET_VAL_WIDTH), + GLOBAL_OFFSET_TABLE_SIZE_WIDTH, + "" + } + ,{ + "GLOBAL_OFFSET_TABLE_CLEAR", + (GLOBAL_OFFSET_TABLE_INDEX_SEL+GLOBAL_OFFSET_TABLE_INDEX_SEL_WIDTH), + 1, + "" + } + ,{ + "WAIT_SW_PHASE_ADVANCE_SIGNAL", + 0, + 1, + "// Set when stream is in START state with auto-config disabled, or if auto-config is enabled\n// but PHASE_AUTO_ADVANCE=0\n" + } + ,{ + "WAIT_PREV_PHASE_DATA_FLUSH", + (WAIT_SW_PHASE_ADVANCE_SIGNAL+WAIT_SW_PHASE_ADVANCE_SIGNAL_WIDTH), + 1, + "// Set when stream has configured the current phase, but waits data from the previous one to be flushed.\n" + } + ,{ + "MSG_FWD_ONGOING", + (WAIT_PREV_PHASE_DATA_FLUSH+WAIT_PREV_PHASE_DATA_FLUSH_WIDTH), + 1, + "// Set when stream is in data forwarding state.\n" + } + ,{ + "STREAM_CURR_STATE", + (MSG_FWD_ONGOING+MSG_FWD_ONGOING_WIDTH), + 4, + "" + } + ,{ + "TOKEN_GOTTEN", + (STREAM_CURR_STATE+STREAM_CURR_STATE_WIDTH), + 1, + "" + } + ,{ + "INFINITE_PHASE_END_DETECTED", + (TOKEN_GOTTEN+TOKEN_GOTTEN_WIDTH), + 1, + "" + } + ,{ + "INFINITE_PHASE_END_HEADER_BUFFER_DETECTED", + (INFINITE_PHASE_END_DETECTED+INFINITE_PHASE_END_DETECTED_WIDTH), + 1, + "" + } + ,{ + "PHASE_READY_DEST_NUM", + 0, + 6, + "" + } + ,{ + "PHASE_READY_NUM", + (PHASE_READY_DEST_NUM+PHASE_READY_DEST_NUM_WIDTH), + 20, + "" + } + ,{ + "PHASE_READY_MCAST", + (PHASE_READY_NUM+PHASE_READY_NUM_WIDTH), + 1, + "// set if this stream is part of multicast group (i.e. 
if REMOTE_SRC_IS_MCAST==1)\n" + } + ,{ + "PHASE_READY_TWO_WAY_RESP", + (PHASE_READY_MCAST+PHASE_READY_MCAST_WIDTH), + 1, + "// set if the message is in response to 2-way handshake\n" + } + ,{ + "STREAM_REMOTE_RDY_SRC_X", + 0, + NOC_ID_WIDTH, + "" + } + ,{ + "STREAM_REMOTE_RDY_SRC_Y", + (STREAM_REMOTE_RDY_SRC_X+STREAM_REMOTE_RDY_SRC_X_WIDTH), + NOC_ID_WIDTH, + "" + } + ,{ + "REMOTE_RDY_SRC_STREAM_ID", + (STREAM_REMOTE_RDY_SRC_Y+STREAM_REMOTE_RDY_SRC_Y_WIDTH), + STREAM_ID_WIDTH, + "" + } + ,{ + "IS_TOKEN_UPDATE", + (REMOTE_RDY_SRC_STREAM_ID+REMOTE_RDY_SRC_STREAM_ID_WIDTH), + 1, + "" + } + ,{ + "REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_DEST_NUM", + 0, + 6, + "" + } + ,{ + "REMOTE_DEST_BUF_WORDS_FREE_INC", + (REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_DEST_NUM+REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_DEST_NUM_WIDTH), + MEM_WORD_ADDR_WIDTH, + "" + } + ,{ + "REMOTE_DEST_MSG_INFO_BUF_WORDS_FREE_INC", + (REMOTE_DEST_BUF_WORDS_FREE_INC+REMOTE_DEST_BUF_WORDS_FREE_INC_WIDTH), + MSG_INFO_BUF_SIZE_WORDS_WIDTH, + "" + } + ,{ + "BLOB_NEXT_AUTO_CFG_DONE_STREAM_ID", + 0, + STREAM_ID_WIDTH, + "" + } + ,{ + "BLOB_NEXT_AUTO_CFG_DONE_VALID", + 16, + 1, + "" + } + ,{ + "REMOTE_DEST_WORDS_FREE", + 0, + MEM_WORD_ADDR_WIDTH, + "" + } + ,{ + "REMOTE_DEST_MSG_INFO_WORDS_FREE", + (REMOTE_DEST_WORDS_FREE+REMOTE_DEST_WORDS_FREE_WIDTH), + MSG_INFO_BUF_SIZE_WORDS_WIDTH, + "" + } + ,{ + "DEBUG_STATUS_STREAM_ID_SEL", + 0, + STREAM_ID_WIDTH, + "" + } + ,{ + "DISABLE_DEST_READY_TABLE", + (DEBUG_STATUS_STREAM_ID_SEL+DEBUG_STATUS_STREAM_ID_SEL_WIDTH), + 1, + "" + } + ,{ + "DISABLE_GLOBAL_OFFSET_TABLE", + (DISABLE_DEST_READY_TABLE+DISABLE_DEST_READY_TABLE_WIDTH), + 1, + "" + } +}; + +const std::unordered_map OLP::fields_by_name = { + {"SOURCE_ENDPOINT_NEW_MSG_ADDR", 0} + ,{"SOURCE_ENDPOINT_NEW_MSG_SIZE", 1} + ,{"SOURCE_ENDPOINT_NEW_MSG_LAST_TILE", 2} + ,{"SOURCE_ENDPOINT_NEW_MSGS_NUM", 3} + ,{"SOURCE_ENDPOINT_NEW_MSGS_TOTAL_SIZE", 4} + ,{"SOURCE_ENDPOINT_NEW_MSGS_LAST_TILE", 5} + ,{"PHASE_AUTO_CONFIG", 6} + ,{"PHASE_AUTO_ADVANCE", 7} + ,{"REG_UPDATE_VC_REG", 8} + ,{"GLOBAL_OFFSET_TABLE_RD_SRC_INDEX", 9} + ,{"GLOBAL_OFFSET_TABLE_RD_DEST_INDEX", 10} + ,{"INCOMING_DATA_NOC", 11} + ,{"OUTGOING_DATA_NOC", 12} + ,{"REMOTE_SRC_UPDATE_NOC", 13} + ,{"LOCAL_SOURCES_CONNECTED", 14} + ,{"SOURCE_ENDPOINT", 15} + ,{"REMOTE_SOURCE", 16} + ,{"RECEIVER_ENDPOINT", 17} + ,{"LOCAL_RECEIVER", 18} + ,{"REMOTE_RECEIVER", 19} + ,{"TOKEN_MODE", 20} + ,{"COPY_MODE", 21} + ,{"NEXT_PHASE_SRC_CHANGE", 22} + ,{"NEXT_PHASE_DEST_CHANGE", 23} + ,{"DATA_BUF_NO_FLOW_CTRL", 24} + ,{"DEST_DATA_BUF_NO_FLOW_CTRL", 25} + ,{"MSG_INFO_BUF_FLOW_CTRL", 26} + ,{"DEST_MSG_INFO_BUF_FLOW_CTRL", 27} + ,{"REMOTE_SRC_IS_MCAST", 28} + ,{"NO_PREV_PHASE_OUTGOING_DATA_FLUSH", 29} + ,{"SRC_FULL_CREDIT_FLUSH_EN", 30} + ,{"DST_FULL_CREDIT_FLUSH_EN", 31} + ,{"INFINITE_PHASE_EN", 32} + ,{"OOO_PHASE_EXECUTION_EN", 33} + ,{"STREAM_REMOTE_SRC_X", 34} + ,{"STREAM_REMOTE_SRC_Y", 35} + ,{"REMOTE_SRC_STREAM_ID", 36} + ,{"STREAM_REMOTE_SRC_DEST_INDEX", 37} + ,{"DRAM_READS__TRANS_SIZE_WORDS_LO", 38} + ,{"DRAM_READS__SCRATCH_1_PTR", 39} + ,{"DRAM_READS__TRANS_SIZE_WORDS_HI", 40} + ,{"STREAM_REMOTE_DEST_X", 41} + ,{"STREAM_REMOTE_DEST_Y", 42} + ,{"STREAM_REMOTE_DEST_STREAM_ID", 43} + ,{"STREAM_LOCAL_DEST_MSG_CLEAR_NUM", 44} + ,{"STREAM_LOCAL_DEST_STREAM_ID", 45} + ,{"DRAM_WRITES__SCRATCH_1_PTR_LO", 46} + ,{"REMOTE_DEST_BUF_SIZE_WORDS", 47} + ,{"DRAM_WRITES__SCRATCH_1_PTR_HI", 48} + ,{"REMOTE_DEST_MSG_INFO_BUF_SIZE_POW2", 49} + ,{"NOC_PRIORITY", 50} + ,{"UNICAST_VC_REG", 51} + ,{"STREAM_RD_PTR_VAL", 52} + 
,{"STREAM_RD_PTR_WRAP", 53} + ,{"STREAM_WR_PTR_VAL", 54} + ,{"STREAM_WR_PTR_WRAP", 55} + ,{"MSG_INFO_BUF_SIZE_POW2", 56} + ,{"STREAM_MSG_INFO_WRAP_RD_PTR", 57} + ,{"STREAM_MSG_INFO_WRAP_RD_PTR_WRAP", 58} + ,{"STREAM_MSG_INFO_WRAP_WR_PTR", 59} + ,{"STREAM_MSG_INFO_WRAP_WR_PTR_WRAP", 60} + ,{"STREAM_MCAST_END_X", 61} + ,{"STREAM_MCAST_END_Y", 62} + ,{"STREAM_MCAST_EN", 63} + ,{"STREAM_MCAST_LINKED", 64} + ,{"STREAM_MCAST_VC", 65} + ,{"STREAM_MCAST_NO_PATH_RES", 66} + ,{"STREAM_MCAST_XY", 67} + ,{"STREAM_MCAST_SRC_SIDE_DYNAMIC_LINKED", 68} + ,{"STREAM_MCAST_DEST_SIDE_DYNAMIC_LINKED", 69} + ,{"MSG_LOCAL_STREAM_CLEAR_NUM", 70} + ,{"MSG_GROUP_STREAM_CLEAR_TYPE", 71} + ,{"MSG_ARB_GROUP_SIZE", 72} + ,{"MSG_SRC_IN_ORDER_FWD", 73} + ,{"MSG_SRC_ARBITRARY_CLEAR_NUM_EN", 74} + ,{"MSG_HEADER_WORD_CNT_OFFSET", 75} + ,{"MSG_HEADER_WORD_CNT_BITS", 76} + ,{"MSG_HEADER_INFINITE_PHASE_LAST_TILE_OFFSET", 77} + ,{"PHASE_NUM_INCR", 78} + ,{"CURR_PHASE_NUM_MSGS", 79} + ,{"NEXT_PHASE_NUM_CFG_REG_WRITES", 80} + ,{"CLOCK_GATING_EN", 81} + ,{"CLOCK_GATING_HYST", 82} + ,{"PARTIAL_SEND_WORDS_THR", 83} + ,{"NCRISC_TRANS_EN", 84} + ,{"NCRISC_TRANS_EN_IRQ_ON_BLOB_END", 85} + ,{"NCRISC_CMD_ID", 86} + ,{"NEXT_NRISC_PIC_INT_ON_PHASE", 87} + ,{"DRAM_FIFO_RD_PTR_WORDS_LO", 88} + ,{"NCRISC_LOOP_COUNT", 89} + ,{"NCRISC_INIT_ENABLE_BLOB_DONE_IRQ", 90} + ,{"NCRISC_INIT_DISABLE_BLOB_DONE_IRQ", 91} + ,{"DRAM_FIFO_RD_PTR_WORDS_HI", 92} + ,{"DRAM_FIFO_WR_PTR_WORDS_LO", 93} + ,{"NCRISC_TOTAL_LOOP_ITER", 94} + ,{"DRAM_FIFO_WR_PTR_WORDS_HI", 95} + ,{"DRAM_FIFO_CAPACITY_PTR_WORDS_LO", 96} + ,{"NCRISC_LOOP_INCR", 97} + ,{"NCRISC_LOOP_BACK_NUM_CFG_REG_WRITES", 98} + ,{"DRAM_FIFO_CAPACITY_PTR_WORDS_HI", 99} + ,{"DRAM_FIFO_BASE_ADDR_WORDS_LO", 100} + ,{"NCRISC_LOOP_BACK_AUTO_CFG_PTR", 101} + ,{"DRAM_FIFO_BASE_ADDR_WORDS_HI", 102} + ,{"DRAM_EN_BLOCKING", 103} + ,{"DRAM_DATA_STRUCTURE_IS_LUT", 104} + ,{"DRAM_RESET_RD_PTR_TO_BASE_ON_EMPTY", 105} + ,{"DRAM_RESET_WR_PTR_TO_BASE_ON_FULL", 106} + ,{"DRAM_NO_PTR_UPDATE_ON_PHASE_END", 107} + ,{"DRAM_WR_BUFFER_FLUSH_AND_RST_PTRS", 108} + ,{"NCRISC_LOOP_NEXT_PIC_INT_ON_PHASE", 109} + ,{"GLOBAL_OFFSET_VAL", 110} + ,{"GLOBAL_OFFSET_TABLE_INDEX_SEL", 111} + ,{"GLOBAL_OFFSET_TABLE_CLEAR", 112} + ,{"WAIT_SW_PHASE_ADVANCE_SIGNAL", 113} + ,{"WAIT_PREV_PHASE_DATA_FLUSH", 114} + ,{"MSG_FWD_ONGOING", 115} + ,{"STREAM_CURR_STATE", 116} + ,{"TOKEN_GOTTEN", 117} + ,{"INFINITE_PHASE_END_DETECTED", 118} + ,{"INFINITE_PHASE_END_HEADER_BUFFER_DETECTED", 119} + ,{"PHASE_READY_DEST_NUM", 120} + ,{"PHASE_READY_NUM", 121} + ,{"PHASE_READY_MCAST", 122} + ,{"PHASE_READY_TWO_WAY_RESP", 123} + ,{"STREAM_REMOTE_RDY_SRC_X", 124} + ,{"STREAM_REMOTE_RDY_SRC_Y", 125} + ,{"REMOTE_RDY_SRC_STREAM_ID", 126} + ,{"IS_TOKEN_UPDATE", 127} + ,{"REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_DEST_NUM", 128} + ,{"REMOTE_DEST_BUF_WORDS_FREE_INC", 129} + ,{"REMOTE_DEST_MSG_INFO_BUF_WORDS_FREE_INC", 130} + ,{"BLOB_NEXT_AUTO_CFG_DONE_STREAM_ID", 131} + ,{"BLOB_NEXT_AUTO_CFG_DONE_VALID", 132} + ,{"REMOTE_DEST_WORDS_FREE", 133} + ,{"REMOTE_DEST_MSG_INFO_WORDS_FREE", 134} + ,{"DEBUG_STATUS_STREAM_ID_SEL", 135} + ,{"DISABLE_DEST_READY_TABLE", 136} + ,{"DISABLE_GLOBAL_OFFSET_TABLE", 137} +}; diff --git a/tt_metal/hw/inc/blackhole/noc/noc_overlay_parameters.h b/tt_metal/hw/inc/blackhole/noc/noc_overlay_parameters.h new file mode 100644 index 000000000000..1561baa14dfe --- /dev/null +++ b/tt_metal/hw/inc/blackhole/noc/noc_overlay_parameters.h @@ -0,0 +1,872 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// AUTO_GENERATED! DO NOT MODIFY! // +// // +// Please run // +// // +// (echo '<% type=:svh_header %>' && cat noc_overlay_parameters.erb) | erb -T - > ../rtl/overlay/tt_noc_overlay_params.svh // +// (echo '<% type=:c_header %>' && cat noc_overlay_parameters.erb) | erb -T - > noc_overlay_parameters.h // +// (echo '<% type=:cpp_header %>' && cat noc_overlay_parameters.erb) | erb -T - > noc_overlay_parameters.hpp // +// (echo '<% type=:rb_header %>' && cat noc_overlay_parameters.erb) | erb -T - > noc_overlay_parameters.rb // +// Open noc_overlay_parameters.hpp and move static class varaible definitions to noc_overlay_parameters.cpp // +// overriding existing ones. // +// // +// to regenerate // // +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifndef NOC_OVERLAY_PARAMETERS_H +#define NOC_OVERLAY_PARAMETERS_H + +#ifndef NOC_OVERLAY_PARAMETERS_BASIC_H +#define NOC_OVERLAY_PARAMETERS_BASIC_H + +#define NOC_NUM_STREAMS 64 +#define ETH_NOC_NUM_STREAMS 32 + +#define NUM_MCAST_STREAM_ID_START 0 +#define NUM_MCAST_STREAM_ID_END 3 +#define NUM_RECEIVER_ENDPOINT_STREAM_ID_START 4 +#define NUM_RECEIVER_ENDPOINT_STREAM_ID_END 5 +#define NUM_REMOTE_RECEIVER_STREAM_ID_START 0 +#define NUM_REMOTE_RECEIVER_STREAM_ID_END 63 +#define RECEIVER_ENDPOINT_STREAM_MSG_GROUP_SIZE 4 +#define RECEIVER_ENDPOINT_STREAM_MSG_INFO_FIFO_GROUPS 4 +#define NON_RECEIVER_ENDPOINT_STREAM_MSG_INFO_FIFO_GROUPS 2 +#define DEST_READY_COMMON_CACHE_NUM_ENTRIES 24 +#define DEST_READY_MCAST_CACHE_NUM_ENTRIES 8 + +#define NOC_OVERLAY_START_ADDR 0xFFB40000 +#define NOC_STREAM_REG_SPACE_SIZE 0x1000 + +#define STREAM_REG_ADDR(stream_id, reg_id) ((NOC_OVERLAY_START_ADDR) + (((uint32_t)(stream_id))*(NOC_STREAM_REG_SPACE_SIZE)) + (((uint32_t)(reg_id)) << 2)) + +#define NUM_NOCS 2 +#define NOC0_REGS_START_ADDR 0xFFB20000 +#define NOC1_REGS_START_ADDR 0xFFB30000 + +#define NCRISC_STREAM_RANGE_1_START 0 +#define NCRISC_STREAM_RANGE_1_END 3 +#define NCRISC_STREAM_RANGE_2_START 8 +#define NCRISC_STREAM_RANGE_2_END 11 +#define NCRISC_PIC_CONFIG_PHASE_DEFAULT 0 + +#ifdef TB_NOC + +extern "C" { +#include "noc.h" +#include "noc_api_dpi.h" +} + +#else + +#define NOC_STREAM_WRITE_REG(stream_id, reg_id, val) ((*((volatile uint32_t*)(STREAM_REG_ADDR(stream_id, reg_id)))) = (val)) +#define NOC_STREAM_READ_REG(stream_id, reg_id) (*((volatile uint32_t*)(STREAM_REG_ADDR(stream_id, reg_id)))) + +#define NOC_STREAM_WRITE_REG_FIELD(stream_id, reg_id, field, val) (NOC_STREAM_WRITE_REG(stream_id, reg_id, ((NOC_STREAM_READ_REG(stream_id, reg_id) & ~((1 << field##_WIDTH) - 1)) | ((val & ((1 << field##_WIDTH) - 1)) << field)))) +#define NOC_STREAM_READ_REG_FIELD(stream_id, reg_id, field) ((NOC_STREAM_READ_REG(stream_id, reg_id) >> field) & ((1 << field##_WIDTH) - 1)) + +#define NOC_WRITE_REG(addr, val) ((*((volatile uint32_t*)(addr)))) = (val) +#define NOC_READ_REG(addr) (*((volatile uint32_t*)(addr))) + +#endif + + +#define NOC_ID_WIDTH 6 +#define STREAM_ID_WIDTH 6 + +#define DEST_CNT_WIDTH 6 +#define NOC_NUM_WIDTH 1 + +#define STREAM_REG_INDEX_WIDTH 9 +#define STREAM_REG_CFG_DATA_WIDTH 24 + +#define MEM_WORD_WIDTH 16 +#define MEM_WORD_ADDR_WIDTH 17 + +#define MEM_WORD_BIT_OFFSET_WIDTH 7 + +#define MSG_INFO_BUF_SIZE_WORDS 256 +#define MSG_INFO_BUF_SIZE_BITS 8 +#define MSG_INFO_BUF_SIZE_POW_BITS 3 +#define 
MSG_INFO_BUF_SIZE_WORDS_WIDTH (MSG_INFO_BUF_SIZE_BITS + 1) + +#define GLOBAL_OFFSET_TABLE_SIZE 8 +#define GLOBAL_OFFSET_TABLE_SIZE_WIDTH 3 + +#endif + +// For endpoints with SOURCE_ENDPOINT == 1, this register is for firmware +// to register new message for sending. +// This updates the msg_info register structure directly, rather than writing to the message info +// buffer in memory. +// Must not be written when the message info register structure is full, or if +// there are message info entries in the memory buffer. (This would cause a race +// condition.) +#define STREAM_SOURCE_ENDPOINT_NEW_MSG_INFO_REG_INDEX 0 +#define SOURCE_ENDPOINT_NEW_MSG_ADDR 0 +#define SOURCE_ENDPOINT_NEW_MSG_ADDR_WIDTH MEM_WORD_ADDR_WIDTH +#define SOURCE_ENDPOINT_NEW_MSG_SIZE (SOURCE_ENDPOINT_NEW_MSG_ADDR+SOURCE_ENDPOINT_NEW_MSG_ADDR_WIDTH) +#define SOURCE_ENDPOINT_NEW_MSG_SIZE_WIDTH (32-MEM_WORD_ADDR_WIDTH-1) +#define SOURCE_ENDPOINT_NEW_MSG_LAST_TILE (SOURCE_ENDPOINT_NEW_MSG_SIZE+SOURCE_ENDPOINT_NEW_MSG_SIZE_WIDTH) +#define SOURCE_ENDPOINT_NEW_MSG_LAST_TILE_WIDTH (1) + +// For endpoints with SOURCE_ENDPOINT == 1, this register is for firmware +// to update the number of messages whose data & header are available in the memory buffer. +// Hardware register is incremented atomically if sending of previous messages is in progress. +#define STREAM_NUM_MSGS_RECEIVED_INC_REG_INDEX 1 +#define SOURCE_ENDPOINT_NEW_MSGS_NUM 0 +#define SOURCE_ENDPOINT_NEW_MSGS_NUM_WIDTH 12 +#define SOURCE_ENDPOINT_NEW_MSGS_TOTAL_SIZE (SOURCE_ENDPOINT_NEW_MSGS_NUM+SOURCE_ENDPOINT_NEW_MSGS_NUM_WIDTH) +#define SOURCE_ENDPOINT_NEW_MSGS_TOTAL_SIZE_WIDTH MEM_WORD_ADDR_WIDTH +#define SOURCE_ENDPOINT_NEW_MSGS_LAST_TILE (SOURCE_ENDPOINT_NEW_MSGS_TOTAL_SIZE+SOURCE_ENDPOINT_NEW_MSGS_TOTAL_SIZE_WIDTH) +#define SOURCE_ENDPOINT_NEW_MSGS_LAST_TILE_WIDTH 1 + +// Registers that need to be programmed once per blob. (Can apply to multiple phases.) +// * Phase/data forward options: +// PHASE_AUTO_CONFIG = set to 1 for stream to fetch next phase configuration automatically. +// PHASE_AUTO_ADVANCE = set to 1 for stream to advance to next phase automatically +// (otherwise need to write STREAM_PHASE_ADVANCE below) +#define STREAM_ONETIME_MISC_CFG_REG_INDEX 2 +#define PHASE_AUTO_CONFIG 0 +#define PHASE_AUTO_CONFIG_WIDTH 1 +#define PHASE_AUTO_ADVANCE (PHASE_AUTO_CONFIG+PHASE_AUTO_CONFIG_WIDTH) +#define PHASE_AUTO_ADVANCE_WIDTH 1 +// set to one of the values (0-5) to select which VC control flow updates will be sent on +#define REG_UPDATE_VC_REG (PHASE_AUTO_ADVANCE+PHASE_AUTO_ADVANCE_WIDTH) +#define REG_UPDATE_VC_REG_WIDTH 3 +// Read index of global offset table, which will offset o_data_fwd_src_addr by entry value. +#define GLOBAL_OFFSET_TABLE_RD_SRC_INDEX (REG_UPDATE_VC_REG+REG_UPDATE_VC_REG_WIDTH) +#define GLOBAL_OFFSET_TABLE_RD_SRC_INDEX_WIDTH GLOBAL_OFFSET_TABLE_SIZE_WIDTH +// Read index of global offset table, which will offset o_data_fwd_dest_addr by entry value. +#define GLOBAL_OFFSET_TABLE_RD_DEST_INDEX (GLOBAL_OFFSET_TABLE_RD_SRC_INDEX+GLOBAL_OFFSET_TABLE_RD_SRC_INDEX_WIDTH) +#define GLOBAL_OFFSET_TABLE_RD_DEST_INDEX_WIDTH GLOBAL_OFFSET_TABLE_SIZE_WIDTH + +// The ID of NOCs used for incoming and outgoing data, followed by misc. 
+// The ID of NOCs used for incoming and outgoing data, followed by misc. stream configuration options:
+// * Source - set exactly one of these to 1:
+// SOURCE_ENDPOINT = source is local math/packer
+// REMOTE_SOURCE = source is remote sender stream
+// LOCAL_SOURCES_CONNECTED = source is one or more local connected streams
+// * Destination - set one or zero of these to 1:
+// RECEIVER_ENDPOINT = stream is read by local unpacker/math
+// REMOTE_RECEIVER = stream forwards data to a remote destination or multicast group
+// LOCAL_RECEIVER = stream is connected to a local destination stream
+// None set = stream just stores data in a local buffer, without forwarding/clearing, and
+// finishes the phase once all messages have been received
+#define STREAM_MISC_CFG_REG_INDEX 3
+#define INCOMING_DATA_NOC 0
+#define INCOMING_DATA_NOC_WIDTH NOC_NUM_WIDTH
+#define OUTGOING_DATA_NOC (INCOMING_DATA_NOC+INCOMING_DATA_NOC_WIDTH)
+#define OUTGOING_DATA_NOC_WIDTH NOC_NUM_WIDTH
+#define REMOTE_SRC_UPDATE_NOC (OUTGOING_DATA_NOC+OUTGOING_DATA_NOC_WIDTH)
+#define REMOTE_SRC_UPDATE_NOC_WIDTH NOC_NUM_WIDTH
+#define LOCAL_SOURCES_CONNECTED (REMOTE_SRC_UPDATE_NOC+REMOTE_SRC_UPDATE_NOC_WIDTH)
+#define LOCAL_SOURCES_CONNECTED_WIDTH 1
+#define SOURCE_ENDPOINT (LOCAL_SOURCES_CONNECTED+LOCAL_SOURCES_CONNECTED_WIDTH)
+#define SOURCE_ENDPOINT_WIDTH 1
+#define REMOTE_SOURCE (SOURCE_ENDPOINT+SOURCE_ENDPOINT_WIDTH)
+#define REMOTE_SOURCE_WIDTH 1
+#define RECEIVER_ENDPOINT (REMOTE_SOURCE+REMOTE_SOURCE_WIDTH)
+#define RECEIVER_ENDPOINT_WIDTH 1
+#define LOCAL_RECEIVER (RECEIVER_ENDPOINT+RECEIVER_ENDPOINT_WIDTH)
+#define LOCAL_RECEIVER_WIDTH 1
+#define REMOTE_RECEIVER (LOCAL_RECEIVER+LOCAL_RECEIVER_WIDTH)
+#define REMOTE_RECEIVER_WIDTH 1
+#define TOKEN_MODE (REMOTE_RECEIVER+REMOTE_RECEIVER_WIDTH)
+#define TOKEN_MODE_WIDTH 1
+#define COPY_MODE (TOKEN_MODE+TOKEN_MODE_WIDTH)
+#define COPY_MODE_WIDTH 1
+#define NEXT_PHASE_SRC_CHANGE (COPY_MODE+COPY_MODE_WIDTH)
+#define NEXT_PHASE_SRC_CHANGE_WIDTH 1
+#define NEXT_PHASE_DEST_CHANGE (NEXT_PHASE_SRC_CHANGE+NEXT_PHASE_SRC_CHANGE_WIDTH)
+#define NEXT_PHASE_DEST_CHANGE_WIDTH 1
+// set if REMOTE_SOURCE==1 and the buffer is large enough to accept full phase data without wrapping:
+#define DATA_BUF_NO_FLOW_CTRL (NEXT_PHASE_DEST_CHANGE+NEXT_PHASE_DEST_CHANGE_WIDTH)
+#define DATA_BUF_NO_FLOW_CTRL_WIDTH 1
+// set if REMOTE_RECEIVER==1 and the destination buffer is large enough to accept full phase data without wrapping:
+#define DEST_DATA_BUF_NO_FLOW_CTRL (DATA_BUF_NO_FLOW_CTRL+DATA_BUF_NO_FLOW_CTRL_WIDTH)
+#define DEST_DATA_BUF_NO_FLOW_CTRL_WIDTH 1
+// set if REMOTE_SOURCE==1 and you want the buffer to have wrapping:
+#define MSG_INFO_BUF_FLOW_CTRL (DEST_DATA_BUF_NO_FLOW_CTRL+DEST_DATA_BUF_NO_FLOW_CTRL_WIDTH)
+#define MSG_INFO_BUF_FLOW_CTRL_WIDTH 1
+// set if REMOTE_RECEIVER==1 and you want the destination buffer to have wrapping:
+#define DEST_MSG_INFO_BUF_FLOW_CTRL (MSG_INFO_BUF_FLOW_CTRL+MSG_INFO_BUF_FLOW_CTRL_WIDTH)
+#define DEST_MSG_INFO_BUF_FLOW_CTRL_WIDTH 1
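// Sketch (ours): a minimal STREAM_MISC_CFG value for a local source endpoint
// forwarding to a single remote receiver over NOC 0. The field selection is
// illustrative only, not a recommended configuration.
inline void configure_source_to_remote(uint32_t stream_id) {
    uint32_t cfg = (1u << SOURCE_ENDPOINT)        // data produced locally
                 | (1u << REMOTE_RECEIVER)        // forwarded to a remote stream
                 | (0u << OUTGOING_DATA_NOC)      // send on NOC 0
                 | (1u << NEXT_PHASE_SRC_CHANGE)
                 | (1u << NEXT_PHASE_DEST_CHANGE);
    NOC_STREAM_WRITE_REG(stream_id, STREAM_MISC_CFG_REG_INDEX, cfg);
}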
+// set if REMOTE_SOURCE==1 and multicast is enabled (i.e. this stream is part of a multicast group)
+#define REMOTE_SRC_IS_MCAST (DEST_MSG_INFO_BUF_FLOW_CTRL+DEST_MSG_INFO_BUF_FLOW_CTRL_WIDTH)
+#define REMOTE_SRC_IS_MCAST_WIDTH 1
+// set if no need to flush outgoing remote data from previous phase
+#define NO_PREV_PHASE_OUTGOING_DATA_FLUSH (REMOTE_SRC_IS_MCAST+REMOTE_SRC_IS_MCAST_WIDTH)
+#define NO_PREV_PHASE_OUTGOING_DATA_FLUSH_WIDTH 1
+// Set to one to enable full credit flushing on src side
+#define SRC_FULL_CREDIT_FLUSH_EN (NO_PREV_PHASE_OUTGOING_DATA_FLUSH+NO_PREV_PHASE_OUTGOING_DATA_FLUSH_WIDTH)
+#define SRC_FULL_CREDIT_FLUSH_EN_WIDTH 1
+// Set to one to enable full credit flushing on dest side
+#define DST_FULL_CREDIT_FLUSH_EN (SRC_FULL_CREDIT_FLUSH_EN+SRC_FULL_CREDIT_FLUSH_EN_WIDTH)
+#define DST_FULL_CREDIT_FLUSH_EN_WIDTH 1
+// Set to one to enable infinite messages per phase, accompanied by a last tile header bit which will end the phase
+#define INFINITE_PHASE_EN (DST_FULL_CREDIT_FLUSH_EN+DST_FULL_CREDIT_FLUSH_EN_WIDTH)
+#define INFINITE_PHASE_EN_WIDTH 1
+// Enables out-of-order phase execution by providing an array of size num_tiles at the end of phase blob, with order in which each tile should be sent. Each array entry contains a 17-bit tile address and a 15-bit tile size.
+#define OOO_PHASE_EXECUTION_EN (INFINITE_PHASE_EN+INFINITE_PHASE_EN_WIDTH)
+#define OOO_PHASE_EXECUTION_EN_WIDTH 1
+
+// Properties of the remote source stream (coordinates, stream ID, and this stream's destination index).
+// Don't-care unless REMOTE_SOURCE == 1.
+#define STREAM_REMOTE_SRC_REG_INDEX 4
+#define STREAM_REMOTE_SRC_X 0
+#define STREAM_REMOTE_SRC_X_WIDTH NOC_ID_WIDTH
+#define STREAM_REMOTE_SRC_Y (STREAM_REMOTE_SRC_X+STREAM_REMOTE_SRC_X_WIDTH)
+#define STREAM_REMOTE_SRC_Y_WIDTH NOC_ID_WIDTH
+#define REMOTE_SRC_STREAM_ID (STREAM_REMOTE_SRC_Y+STREAM_REMOTE_SRC_Y_WIDTH)
+#define REMOTE_SRC_STREAM_ID_WIDTH STREAM_ID_WIDTH
+#define STREAM_REMOTE_SRC_DEST_INDEX (REMOTE_SRC_STREAM_ID+REMOTE_SRC_STREAM_ID_WIDTH)
+#define STREAM_REMOTE_SRC_DEST_INDEX_WIDTH STREAM_ID_WIDTH
+#define DRAM_READS__TRANS_SIZE_WORDS_LO (STREAM_REMOTE_SRC_Y+STREAM_REMOTE_SRC_Y_WIDTH)
+#define DRAM_READS__TRANS_SIZE_WORDS_LO_WIDTH 12
+
+// Remote source phase (may be different from the destination stream phase.)
+// We use a 20-bit phase ID, so the phase count doesn't wrap until 1M phases.
+// Don't-care unless REMOTE_SOURCE == 1.
+#define STREAM_REMOTE_SRC_PHASE_REG_INDEX 5
+#define DRAM_READS__SCRATCH_1_PTR 0
+#define DRAM_READS__SCRATCH_1_PTR_WIDTH 19
+#define DRAM_READS__TRANS_SIZE_WORDS_HI (DRAM_READS__SCRATCH_1_PTR+DRAM_READS__SCRATCH_1_PTR_WIDTH)
+#define DRAM_READS__TRANS_SIZE_WORDS_HI_WIDTH 1
+
+// 4-bit wide register that determines the threshold at which a stream
+// with remote source sends an update message to STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE.
+// Don't-care unless REMOTE_SOURCE==1.
+// Values:
+// value[3:0] == 0 => disable threshold. Acks are sent as soon as any data are cleared/forwarded.
+// value[3:0] > 0 => threshold calculated according to the following formula:
+// if (value[3])
+// threshold = buf_size - (buf_size >> value[2:0])
+// else
+// threshold = (buf_size >> value[2:0])
+//
+// This enables setting thresholds of buf_size/2, buf_size/4, buf_size/8, ... buf_size/256,
+// as well as 3*buf_size/4, 7*buf_size/8, etc.
+#define STREAM_MEM_BUF_SPACE_AVAILABLE_ACK_THRESHOLD_REG_INDEX 6
+
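// Worked decode of the threshold encoding above (ours). For example, with a
// 1024-word buffer: value 0b0010 -> 1024 >> 2 = 256 words, while value 0b1010
// -> 1024 - 256 = 768 words (i.e. 3/4 of the buffer).
inline uint32_t ack_threshold_words(uint32_t value, uint32_t buf_size) {
    if ((value & 0xF) == 0) return 0;             // thresholding disabled
    uint32_t shift = value & 0x7;                 // value[2:0]
    return (value & 0x8) ? buf_size - (buf_size >> shift)   // value[3] set
                         : (buf_size >> shift);
}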
+// Properties of the remote destination stream (coordinates, stream ID). Don't-care unless REMOTE_RECEIVER == 1.
+// If destination is multicast, this register specifies the starting coordinates of the destination
+// multicast group/rectangle. (The end coordinates are in STREAM_MCAST_DEST below.)
+#define STREAM_REMOTE_DEST_REG_INDEX 7
+#define STREAM_REMOTE_DEST_X 0
+#define STREAM_REMOTE_DEST_X_WIDTH NOC_ID_WIDTH
+#define STREAM_REMOTE_DEST_Y (STREAM_REMOTE_DEST_X+STREAM_REMOTE_DEST_X_WIDTH)
+#define STREAM_REMOTE_DEST_Y_WIDTH NOC_ID_WIDTH
+#define STREAM_REMOTE_DEST_STREAM_ID (STREAM_REMOTE_DEST_Y+STREAM_REMOTE_DEST_Y_WIDTH)
+#define STREAM_REMOTE_DEST_STREAM_ID_WIDTH STREAM_ID_WIDTH
+
+// Properties of the local destination gather stream connection.
+// Don't-care unless LOCAL_RECEIVER == 1.
+// Shares register space with STREAM_REMOTE_DEST_REG_INDEX.
+#define STREAM_LOCAL_DEST_REG_INDEX 7
+#define STREAM_LOCAL_DEST_MSG_CLEAR_NUM 0
+#define STREAM_LOCAL_DEST_MSG_CLEAR_NUM_WIDTH 12
+#define STREAM_LOCAL_DEST_STREAM_ID (STREAM_LOCAL_DEST_MSG_CLEAR_NUM+STREAM_LOCAL_DEST_MSG_CLEAR_NUM_WIDTH)
+#define STREAM_LOCAL_DEST_STREAM_ID_WIDTH STREAM_ID_WIDTH
+
+// Start address (in words) of the remote destination stream memory buffer.
+#define STREAM_REMOTE_DEST_BUF_START_REG_INDEX 8
+#define DRAM_WRITES__SCRATCH_1_PTR_LO 0
+#define DRAM_WRITES__SCRATCH_1_PTR_LO_WIDTH 16
+
+// High bits for STREAM_REMOTE_DEST_BUF_START
+#define STREAM_REMOTE_DEST_BUF_START_HI_REG_INDEX 9
+
+// Size (in words) of the remote destination stream memory buffer.
+#define STREAM_REMOTE_DEST_BUF_SIZE_REG_INDEX 10
+#define REMOTE_DEST_BUF_SIZE_WORDS 0
+#define REMOTE_DEST_BUF_SIZE_WORDS_WIDTH MEM_WORD_ADDR_WIDTH
+#define DRAM_WRITES__SCRATCH_1_PTR_HI 0
+#define DRAM_WRITES__SCRATCH_1_PTR_HI_WIDTH 3
+
+// Write pointer for the remote destination stream memory buffer.
+// Can be written directly; automatically reset to 0 when
+// STREAM_REMOTE_DEST_BUF_START is written.
+#define STREAM_REMOTE_DEST_WR_PTR_REG_INDEX 11
+
+// Size (in power2) of the remote destination stream memory buffer.
+// Bits encode powers of 2 sizes in words (2^(x+1)), e.g. 0 -> 2 words, 1 -> 4 words, 7 -> 256 words
+// Max 256 word size.
+// Only used when DEST_MSG_INFO_BUF_FLOW_CTRL is true
+#define STREAM_REMOTE_DEST_MSG_INFO_BUF_SIZE_REG_INDEX 12
+#define REMOTE_DEST_MSG_INFO_BUF_SIZE_POW2 0
+#define REMOTE_DEST_MSG_INFO_BUF_SIZE_POW2_WIDTH MSG_INFO_BUF_SIZE_POW_BITS
+
+// Start address (in words) of the remote destination stream memory buffer.
+// Only used when DEST_MSG_INFO_BUF_FLOW_CTRL is true
+#define STREAM_REMOTE_DEST_MSG_INFO_BUF_START_REG_INDEX 13
+
+// Write pointer for the remote destination message info buffer.
+// Don't-care unless REMOTE_RECEIVER==1.
+// Needs to be initialized to the start of the message info buffer of the remote destination
+// at phase start, if destination is changed.
+// Subsequently it is incremented automatically as messages are forwarded.
+// When DEST_MSG_INFO_BUF_FLOW_CTRL is true this pointer is the one above
+#define STREAM_REMOTE_DEST_MSG_INFO_WR_PTR_REG_INDEX 13
+
+#ifdef RISC_B0_HW
+// On WH B0, this register aliases STREAM_REMOTE_DEST_MSG_INFO_WR_PTR_REG_INDEX.
+// It can be used to clear multiple tiles at once, even if they haven't been loaded into the msg info
+// buffer. (Which is required for using STREAM_MSG_INFO/DATA_CLEAR_REG_INDEX.) This way, we
+// can clear however many pending messages have actually been received in L1.
+// To clear N messages, we need to:
+// - Poll to ensure that the register value is 0 (indicating that previous clearing is done).
+// - Write (-2*N) in 2's complement (i.e. ~(2*N) + 1) into the register.
+// This performs both info and data clear steps at once.
+// The register should be used only in RECEIVER_ENDPOINT mode.
+#define STREAM_RECEIVER_ENDPOINT_MULTI_TILE_CLEAR_REG_INDEX 13
+#endif
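// Sketch (ours) of the clear sequence described above (WH B0 only, since the
// register exists under RISC_B0_HW): wait for any previous clear to finish,
// then clear `n` received messages in one shot.
inline void multi_tile_clear(uint32_t stream_id, uint32_t n) {
    while (NOC_STREAM_READ_REG(stream_id, STREAM_RECEIVER_ENDPOINT_MULTI_TILE_CLEAR_REG_INDEX) != 0) {
        // previous clear still in progress
    }
    // -(2*n) in two's complement, i.e. ~(2*n) + 1
    NOC_STREAM_WRITE_REG(stream_id, STREAM_RECEIVER_ENDPOINT_MULTI_TILE_CLEAR_REG_INDEX,
                         (uint32_t)(-(int32_t)(2 * n)));
}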
+
+// High bits for STREAM_REMOTE_DEST_MSG_INFO_BUF_START
+// Only used when DEST_MSG_INFO_BUF_FLOW_CTRL is true
+#define STREAM_REMOTE_DEST_MSG_INFO_BUF_START_HI_REG_INDEX 14
+
+// High bits for STREAM_REMOTE_DEST_MSG_INFO_WR_PTR
+// When DEST_MSG_INFO_BUF_FLOW_CTRL is true this pointer is the one above
+#define STREAM_REMOTE_DEST_MSG_INFO_WR_PTR_HI_REG_INDEX 14
+
+// Only used when DEST_MSG_INFO_BUF_FLOW_CTRL is true
+// Write pointer for the remote destination message info buffer.
+// Don't-care unless REMOTE_RECEIVER==1.
+// Subsequently it is incremented automatically as messages are forwarded.
+#define STREAM_REMOTE_DEST_MSG_INFO_WRAP_WR_PTR_REG_INDEX 15
+
+// Priority for traffic sent to remote destination.
+// Valid only for streams capable of remote sending.
+// 4-bit value.
+// Set to 0 to send traffic under round-robin arbitration.
+// Set to 1-15 for priority arbitration (higher values are higher priority).
+#define STREAM_REMOTE_DEST_TRAFFIC_REG_INDEX 16
+#define NOC_PRIORITY 0
+#define NOC_PRIORITY_WIDTH 4
+// set to one of the values (0-5) to select which VC unicast requests will be sent on
+#define UNICAST_VC_REG (NOC_PRIORITY+NOC_PRIORITY_WIDTH)
+#define UNICAST_VC_REG_WIDTH 3
+
+// Start address (in words) of the memory buffer associated with this stream.
+#define STREAM_BUF_START_REG_INDEX 17
+
+// Stream buffer size (in words).
+#define STREAM_BUF_SIZE_REG_INDEX 18
+
+// Read pointer value (word offset relative to buffer start).
+// Can be updated by writing the register.
+// Value does not guarantee that all data up to the current value have been sent
+// off (forwarding command may be ongoing). To find out free space in the buffer,
+// read STREAM_BUF_SPACE_AVAILABLE.
+// Automatically reset to 0 when STREAM_BUF_START_REG is updated.
+#define STREAM_RD_PTR_REG_INDEX 19
+#define STREAM_RD_PTR_VAL 0
+#define STREAM_RD_PTR_VAL_WIDTH MEM_WORD_ADDR_WIDTH
+#define STREAM_RD_PTR_WRAP (STREAM_RD_PTR_VAL+STREAM_RD_PTR_VAL_WIDTH)
+#define STREAM_RD_PTR_WRAP_WIDTH 1
+
+// Write pointer value (word offset relative to buffer start).
+// Can be read to determine the location at which to write new data.
+// Can be updated by writing the register.
+// In normal operation, should be updated only by writing
+// STREAM_NUM_MSGS_RECEIVED_INC_REG or STREAM_SOURCE_ENDPOINT_NEW_MSG_INFO_REG.
+#define STREAM_WR_PTR_REG_INDEX 20
+#define STREAM_WR_PTR_VAL 0
+#define STREAM_WR_PTR_VAL_WIDTH MEM_WORD_ADDR_WIDTH
+#define STREAM_WR_PTR_WRAP (STREAM_WR_PTR_VAL+STREAM_WR_PTR_VAL_WIDTH)
+#define STREAM_WR_PTR_WRAP_WIDTH 1
+
+// Size (in power2) of the remote destination stream memory buffer.
+// Bits encode powers of 2 sizes in words (2^(x+1)), e.g. 0 -> 2 words, 1 -> 4 words, 7 -> 256 words
+// Max 256 word size.
+// Only used when MSG_INFO_BUF_FLOW_CTRL is true
+#define STREAM_MSG_INFO_BUF_SIZE_REG_INDEX 21
+#define MSG_INFO_BUF_SIZE_POW2 0
+#define MSG_INFO_BUF_SIZE_POW2_WIDTH MSG_INFO_BUF_SIZE_POW_BITS
+
+// Start address (in words) of the msg info buffer.
+// Only used when MSG_INFO_BUF_FLOW_CTRL is true
+#define STREAM_MSG_INFO_BUF_START_REG_INDEX 22
+
+// Stream message info buffer address.
+//
+// This register needs to be initialized to the start of the message info buffer during
+// phase configuration. Subsequently it will be incremented by hardware as data are read
+// from the buffer, thus doubling as the read pointer during phase execution.
+//
+// Stream hardware will assume that this buffer is large enough to hold info for all messages
+// within a phase, so unlike the buffer, it never needs to wrap.
+//
+// The buffer is filled automatically by snooping for streams with remote source.
+// For source endpoints, the buffer is written explicitly (along with the data buffer), after which
+// STREAM_NUM_MSGS_RECEIVED_INC is written to notify the stream that messages are available for
+// sending.
+//
+// Write pointer is also managed automatically by hardware, but can be read or reset using
+// STREAM_MSG_INFO_WR_PTR_REG. Write pointer is also reset when writing this register.
+// When MSG_INFO_BUF_FLOW_CTRL is true this pointer is the one above
+#define STREAM_MSG_INFO_PTR_REG_INDEX 22
+
+// The read and write pointers for the msg info buffer when the message info buffer is in wrapping mode.
+// Only used when MSG_INFO_BUF_FLOW_CTRL is true
+#define STREAM_MSG_INFO_WRAP_RD_WR_PTR_REG_INDEX 23
+#define STREAM_MSG_INFO_WRAP_RD_PTR 0
+#define STREAM_MSG_INFO_WRAP_RD_PTR_WIDTH MSG_INFO_BUF_SIZE_BITS
+#define STREAM_MSG_INFO_WRAP_RD_PTR_WRAP (STREAM_MSG_INFO_WRAP_RD_PTR+STREAM_MSG_INFO_WRAP_RD_PTR_WIDTH)
+#define STREAM_MSG_INFO_WRAP_RD_PTR_WRAP_WIDTH 1
+#define STREAM_MSG_INFO_WRAP_WR_PTR (STREAM_MSG_INFO_WRAP_RD_PTR_WRAP+STREAM_MSG_INFO_WRAP_RD_PTR_WRAP_WIDTH)
+#define STREAM_MSG_INFO_WRAP_WR_PTR_WIDTH MSG_INFO_BUF_SIZE_BITS
+#define STREAM_MSG_INFO_WRAP_WR_PTR_WRAP (STREAM_MSG_INFO_WRAP_WR_PTR+STREAM_MSG_INFO_WRAP_WR_PTR_WIDTH)
+#define STREAM_MSG_INFO_WRAP_WR_PTR_WRAP_WIDTH 1
+
+// Write pointer value for message info buffer (absolute word address).
+// In normal operation, should be updated only by writing
+// STREAM_NUM_MSGS_RECEIVED_INC_REG or STREAM_SOURCE_ENDPOINT_NEW_MSG_INFO_REG.
+// When MSG_INFO_BUF_FLOW_CTRL is true this pointer is the one above
+#define STREAM_MSG_INFO_WR_PTR_REG_INDEX 23
+
+// Destination spec for multicasting streams. STREAM_MCAST_END_X/Y are
+// the end coordinate for the multicast rectangle, with the ones from
+// STREAM_REMOTE_DEST taken as start.
+// Don't-care if STREAM_MCAST_EN == 0.
+#define STREAM_MCAST_DEST_REG_INDEX 24
+#define STREAM_MCAST_END_X 0
+#define STREAM_MCAST_END_X_WIDTH NOC_ID_WIDTH
+#define STREAM_MCAST_END_Y (STREAM_MCAST_END_X+STREAM_MCAST_END_X_WIDTH)
+#define STREAM_MCAST_END_Y_WIDTH NOC_ID_WIDTH
+#define STREAM_MCAST_EN (STREAM_MCAST_END_Y+STREAM_MCAST_END_Y_WIDTH)
+#define STREAM_MCAST_EN_WIDTH 1
+#define STREAM_MCAST_LINKED (STREAM_MCAST_EN+STREAM_MCAST_EN_WIDTH)
+#define STREAM_MCAST_LINKED_WIDTH 1
+// Set to 0 to select VC 4, and 1 to select VC 5 (default 0)
+#define STREAM_MCAST_VC (STREAM_MCAST_LINKED+STREAM_MCAST_LINKED_WIDTH)
+#define STREAM_MCAST_VC_WIDTH 1
+#define STREAM_MCAST_NO_PATH_RES (STREAM_MCAST_VC+STREAM_MCAST_VC_WIDTH)
+#define STREAM_MCAST_NO_PATH_RES_WIDTH 1
+#define STREAM_MCAST_XY (STREAM_MCAST_NO_PATH_RES+STREAM_MCAST_NO_PATH_RES_WIDTH)
+#define STREAM_MCAST_XY_WIDTH 1
+#define STREAM_MCAST_SRC_SIDE_DYNAMIC_LINKED (STREAM_MCAST_XY+STREAM_MCAST_XY_WIDTH)
+#define STREAM_MCAST_SRC_SIDE_DYNAMIC_LINKED_WIDTH 1
+#define STREAM_MCAST_DEST_SIDE_DYNAMIC_LINKED (STREAM_MCAST_SRC_SIDE_DYNAMIC_LINKED+STREAM_MCAST_SRC_SIDE_DYNAMIC_LINKED_WIDTH)
+#define STREAM_MCAST_DEST_SIDE_DYNAMIC_LINKED_WIDTH 1
+
+// Number of multicast destinations (don't-care for non-multicast streams)
+#define STREAM_MCAST_DEST_NUM_REG_INDEX 25
+
+// Specifies MSG_ARB_GROUP_SIZE. Valid values are 1 (round-robin
+// arbitration between each incoming stream) or 4 (round-robin arbitration
+// between groups of 4 incoming streams).
+// MSG_LOCAL_STREAM_CLEAR_NUM specifies the number of messages that should
+// be cleared from a gather stream before moving onto the next stream.
+// When MSG_ARB_GROUP_SIZE > 1, the order of clearing the streams can be selected
+// with MSG_GROUP_STREAM_CLEAR_TYPE. 0 = clear the whole group MSG_LOCAL_STREAM_CLEAR_NUM times,
+// 1 = clear each stream of the group MSG_LOCAL_STREAM_CLEAR_NUM times before
+// moving onto the next stream in the group.
+#define STREAM_GATHER_REG_INDEX 26
+#define MSG_LOCAL_STREAM_CLEAR_NUM 0
+#define MSG_LOCAL_STREAM_CLEAR_NUM_WIDTH 12
+#define MSG_GROUP_STREAM_CLEAR_TYPE (MSG_LOCAL_STREAM_CLEAR_NUM+MSG_LOCAL_STREAM_CLEAR_NUM_WIDTH)
+#define MSG_GROUP_STREAM_CLEAR_TYPE_WIDTH 1
+#define MSG_ARB_GROUP_SIZE (MSG_GROUP_STREAM_CLEAR_TYPE+MSG_GROUP_STREAM_CLEAR_TYPE_WIDTH)
+#define MSG_ARB_GROUP_SIZE_WIDTH 3
+#define MSG_SRC_IN_ORDER_FWD (MSG_ARB_GROUP_SIZE+MSG_ARB_GROUP_SIZE_WIDTH)
+#define MSG_SRC_IN_ORDER_FWD_WIDTH 1
+#define MSG_SRC_ARBITRARY_CLEAR_NUM_EN (MSG_SRC_IN_ORDER_FWD+MSG_SRC_IN_ORDER_FWD_WIDTH)
+#define MSG_SRC_ARBITRARY_CLEAR_NUM_EN_WIDTH 1
+
+// When using in-order message forwarding, number of messages after which the source
+// pointer goes back to zero (without phase change).
+// Don't-care if STREAM_MCAST_EN == 0 or MSG_SRC_IN_ORDER_FWD == 0.
+#define STREAM_MSG_SRC_IN_ORDER_FWD_NUM_MSGS_REG_INDEX 27
+
+// Actual phase number executed is STREAM_CURR_PHASE_BASE_REG_INDEX + STREAM_CURR_PHASE_REG_INDEX
+// When reprogramming this register you must also reprogram STREAM_CURR_PHASE_REG_INDEX and STREAM_REMOTE_SRC_PHASE_REG_INDEX
+#define STREAM_CURR_PHASE_BASE_REG_INDEX 28
+
+// Current phase number executed by the stream.
+#define STREAM_CURR_PHASE_REG_INDEX 29
+
+// Actual address accessed will be STREAM_PHASE_AUTO_CFG_PTR_BASE_REG_INDEX + STREAM_PHASE_AUTO_CFG_PTR_REG_INDEX
+// When reprogramming this register you must also reprogram STREAM_PHASE_AUTO_CFG_PTR_REG_INDEX
+#define STREAM_PHASE_AUTO_CFG_PTR_BASE_REG_INDEX 30
+
+// Pointer to the stream auto-config data. Initialized to the start of
+// the auto-config structure at workload start, automatically updated
+// subsequently.
+// Specified as byte address, needs to be multiple of 4B.
+#define STREAM_PHASE_AUTO_CFG_PTR_REG_INDEX 31
+
+// This register acts as indirection to execute a phase that already exists somewhere in the blob.
+// It can be used to compress the blob when many phases need to be repeated.
+// When this register is written with a signed offset, the blob at address (auto_cfg pointer + offset) will be loaded.
+// The loaded blob must manually set its phase (using STREAM_CURR_PHASE) for this feature to work correctly.
+// Furthermore the phase after the reload blob phase must also set its current phase manually.
+#define STREAM_RELOAD_PHASE_BLOB_REG_INDEX 32
+
+// Offset & size of the size field in the message header. Only valid offsets are multiples of 8
+// (i.e. byte-aligned).
+#define STREAM_MSG_HEADER_FORMAT_REG_INDEX 33
+#define MSG_HEADER_WORD_CNT_OFFSET 0
+#define MSG_HEADER_WORD_CNT_OFFSET_WIDTH MEM_WORD_BIT_OFFSET_WIDTH
+#define MSG_HEADER_WORD_CNT_BITS (MSG_HEADER_WORD_CNT_OFFSET+MSG_HEADER_WORD_CNT_OFFSET_WIDTH)
+#define MSG_HEADER_WORD_CNT_BITS_WIDTH MEM_WORD_BIT_OFFSET_WIDTH
+#define MSG_HEADER_INFINITE_PHASE_LAST_TILE_OFFSET (MSG_HEADER_WORD_CNT_BITS+MSG_HEADER_WORD_CNT_BITS_WIDTH)
+#define MSG_HEADER_INFINITE_PHASE_LAST_TILE_OFFSET_WIDTH MEM_WORD_BIT_OFFSET_WIDTH
+
+// Register corresponding to the auto-configuration header. Written by each auto-config access
+// at phase start, can be also written by software for initial configuration or if auto-config
+// is disabled.
+// PHASE_NUM_INCR is phase number increment relative to the previous executed phase (or 0 right
+// after reset). The increment happens after auto-config is done, and before the phase is executed.
+// (Therefore reading STREAM_CURR_PHASE_REG while auto-config is ongoing, or if it hasn't started
+// yet, may return the old phase number.)
+// This enables up to 2^12-1 phases to be skipped. If more phases need to be skipped, it is
+// necessary to insert an intermediate phase with zero messages, whose only purpose is to provide
+// an additional skip offset.
+#define STREAM_PHASE_AUTO_CFG_HEADER_REG_INDEX 34
+#define PHASE_NUM_INCR 0
+#define PHASE_NUM_INCR_WIDTH 12
+#define CURR_PHASE_NUM_MSGS (PHASE_NUM_INCR+PHASE_NUM_INCR_WIDTH)
+#define CURR_PHASE_NUM_MSGS_WIDTH 12
+#define NEXT_PHASE_NUM_CFG_REG_WRITES (CURR_PHASE_NUM_MSGS+CURR_PHASE_NUM_MSGS_WIDTH)
+#define NEXT_PHASE_NUM_CFG_REG_WRITES_WIDTH 8
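// Sketch (ours): composing the auto-config header word described above, for a
// phase `incr` ahead of the previous one, carrying `num_msgs` messages, whose
// successor blob performs `cfg_writes` register writes.
inline uint32_t make_auto_cfg_header(uint32_t incr, uint32_t num_msgs, uint32_t cfg_writes) {
    return (incr       << PHASE_NUM_INCR)
         | (num_msgs   << CURR_PHASE_NUM_MSGS)
         | (cfg_writes << NEXT_PHASE_NUM_CFG_REG_WRITES);
}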
+
+// Should be written only for stream 0, applies to all streams.
+#define STREAM_PERF_CONFIG_REG_INDEX 35
+#define CLOCK_GATING_EN 0
+#define CLOCK_GATING_EN_WIDTH 1
+#define CLOCK_GATING_HYST (CLOCK_GATING_EN+CLOCK_GATING_EN_WIDTH)
+#define CLOCK_GATING_HYST_WIDTH 7
+// PARTIAL_SEND_WORDS_THR controls the minimum number of 16-byte words of a tile to accumulate in a relay stream before sending it off to the destination.
+// If the size of the tile is less than or equal to PARTIAL_SEND_WORDS_THR, then this field is ignored.
+// Default is 16 words
+#define PARTIAL_SEND_WORDS_THR (CLOCK_GATING_HYST+CLOCK_GATING_HYST_WIDTH)
+#define PARTIAL_SEND_WORDS_THR_WIDTH 8
+
+// Scratch registers
+// Exists only in streams 0-3 and 8-11
+// Data can be stored at [23:0] from STREAM_SCRATCH_REG_INDEX + 0 to STREAM_SCRATCH_REG_INDEX + 5
+// Can be loaded through overlay blobs.
+#define STREAM_SCRATCH_REG_INDEX 36
+
+#define STREAM_SCRATCH_0_REG_INDEX 36
+#define NCRISC_TRANS_EN 0
+#define NCRISC_TRANS_EN_WIDTH 1
+#define NCRISC_TRANS_EN_IRQ_ON_BLOB_END (NCRISC_TRANS_EN + NCRISC_TRANS_EN_WIDTH)
+#define NCRISC_TRANS_EN_IRQ_ON_BLOB_END_WIDTH 1
+#define NCRISC_CMD_ID (NCRISC_TRANS_EN_IRQ_ON_BLOB_END + NCRISC_TRANS_EN_IRQ_ON_BLOB_END_WIDTH)
+#define NCRISC_CMD_ID_WIDTH 3
+// Kept for compatibility with grayskull, but does not exist anymore in wormhole
+#define NEXT_NRISC_PIC_INT_ON_PHASE (NCRISC_CMD_ID + NCRISC_CMD_ID_WIDTH)
+#define NEXT_NRISC_PIC_INT_ON_PHASE_WIDTH 19
+
+#define STREAM_SCRATCH_1_REG_INDEX 37
+#define DRAM_FIFO_RD_PTR_WORDS_LO 0
+#define DRAM_FIFO_RD_PTR_WORDS_LO_WIDTH 24
+#define NCRISC_LOOP_COUNT 0
+#define NCRISC_LOOP_COUNT_WIDTH 24
+#define NCRISC_INIT_ENABLE_BLOB_DONE_IRQ 0
+#define NCRISC_INIT_ENABLE_BLOB_DONE_IRQ_WIDTH 1
+#define NCRISC_INIT_DISABLE_BLOB_DONE_IRQ (NCRISC_INIT_ENABLE_BLOB_DONE_IRQ + NCRISC_INIT_ENABLE_BLOB_DONE_IRQ_WIDTH)
+#define NCRISC_INIT_DISABLE_BLOB_DONE_IRQ_WIDTH 1
+
+#define STREAM_SCRATCH_2_REG_INDEX 38
+#define DRAM_FIFO_RD_PTR_WORDS_HI 0
+#define DRAM_FIFO_RD_PTR_WORDS_HI_WIDTH 4
+#define DRAM_FIFO_WR_PTR_WORDS_LO (DRAM_FIFO_RD_PTR_WORDS_HI + DRAM_FIFO_RD_PTR_WORDS_HI_WIDTH)
+#define DRAM_FIFO_WR_PTR_WORDS_LO_WIDTH 20
+#define NCRISC_TOTAL_LOOP_ITER 0
+#define NCRISC_TOTAL_LOOP_ITER_WIDTH 24
+
+#define STREAM_SCRATCH_3_REG_INDEX 39
+#define DRAM_FIFO_WR_PTR_WORDS_HI 0
+#define DRAM_FIFO_WR_PTR_WORDS_HI_WIDTH 8
+#define DRAM_FIFO_CAPACITY_PTR_WORDS_LO (DRAM_FIFO_WR_PTR_WORDS_HI + DRAM_FIFO_WR_PTR_WORDS_HI_WIDTH)
+#define DRAM_FIFO_CAPACITY_PTR_WORDS_LO_WIDTH 16
+#define NCRISC_LOOP_INCR 0
+#define NCRISC_LOOP_INCR_WIDTH 16
+#define NCRISC_LOOP_BACK_NUM_CFG_REG_WRITES (NCRISC_LOOP_INCR+NCRISC_LOOP_INCR_WIDTH)
+#define NCRISC_LOOP_BACK_NUM_CFG_REG_WRITES_WIDTH 8
+
+#define STREAM_SCRATCH_4_REG_INDEX 40
+#define DRAM_FIFO_CAPACITY_PTR_WORDS_HI 0
+#define DRAM_FIFO_CAPACITY_PTR_WORDS_HI_WIDTH 12
+#define DRAM_FIFO_BASE_ADDR_WORDS_LO (DRAM_FIFO_CAPACITY_PTR_WORDS_HI + DRAM_FIFO_CAPACITY_PTR_WORDS_HI_WIDTH)
+#define DRAM_FIFO_BASE_ADDR_WORDS_LO_WIDTH 12
+#define NCRISC_LOOP_BACK_AUTO_CFG_PTR 0
+#define NCRISC_LOOP_BACK_AUTO_CFG_PTR_WIDTH 24
+
+#define STREAM_SCRATCH_5_REG_INDEX 41
+#define DRAM_FIFO_BASE_ADDR_WORDS_HI 0
+#define DRAM_FIFO_BASE_ADDR_WORDS_HI_WIDTH 16
+// Processes the read or write operation to completion without processing other dram streams in the meantime
+#define DRAM_EN_BLOCKING (DRAM_FIFO_BASE_ADDR_WORDS_HI + DRAM_FIFO_BASE_ADDR_WORDS_HI_WIDTH)
+#define DRAM_EN_BLOCKING_WIDTH 1
+// Fifo structure in dram holds a dram pointer and size that is used as indirection to a tile in dram
+#define DRAM_DATA_STRUCTURE_IS_LUT (DRAM_EN_BLOCKING + DRAM_EN_BLOCKING_WIDTH)
+#define DRAM_DATA_STRUCTURE_IS_LUT_WIDTH 1
+// During a dram read, if it's detected that the fifo is empty the ncrisc will reset the read pointer back to base
+// It's expected that there is no host interaction
+#define DRAM_RESET_RD_PTR_TO_BASE_ON_EMPTY (DRAM_DATA_STRUCTURE_IS_LUT + DRAM_DATA_STRUCTURE_IS_LUT_WIDTH)
+#define DRAM_RESET_RD_PTR_TO_BASE_ON_EMPTY_WIDTH 1
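// Sketch (ours): in DRAM mode the FIFO read pointer is split across scratch
// registers 1 and 2 (24 low bits + 4 high bits, per the widths above);
// reassembling the 28-bit word pointer:
inline uint32_t dram_fifo_rd_ptr_words(uint32_t stream_id) {
    uint32_t lo = NOC_STREAM_READ_REG(stream_id, STREAM_SCRATCH_1_REG_INDEX)
                  & ((1u << DRAM_FIFO_RD_PTR_WORDS_LO_WIDTH) - 1);
    uint32_t hi = NOC_STREAM_READ_REG(stream_id, STREAM_SCRATCH_2_REG_INDEX)
                  & ((1u << DRAM_FIFO_RD_PTR_WORDS_HI_WIDTH) - 1);
    return (hi << DRAM_FIFO_RD_PTR_WORDS_LO_WIDTH) | lo;
}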
+// During a dram write, if it's detected that the fifo is full the ncrisc will reset the write pointer back to base. Old data will be overwritten.
+// It's expected that there is no host interaction
+#define DRAM_RESET_WR_PTR_TO_BASE_ON_FULL (DRAM_RESET_RD_PTR_TO_BASE_ON_EMPTY + DRAM_RESET_RD_PTR_TO_BASE_ON_EMPTY_WIDTH)
+#define DRAM_RESET_WR_PTR_TO_BASE_ON_FULL_WIDTH 1
+// The internal ncrisc rd/wr pointers will not be updated at phase end
+// It's expected that there is no host interaction
+#define DRAM_NO_PTR_UPDATE_ON_PHASE_END (DRAM_RESET_WR_PTR_TO_BASE_ON_FULL + DRAM_RESET_WR_PTR_TO_BASE_ON_FULL_WIDTH)
+#define DRAM_NO_PTR_UPDATE_ON_PHASE_END_WIDTH 1
+// Before ending the phase the ncrisc will wait until the host has emptied the write buffer and then reset the read and write pointers to base
+// This can be used for hosts that do not want to track wrapping
+// The host must be aware of this behaviour for this functionality to work
+#define DRAM_WR_BUFFER_FLUSH_AND_RST_PTRS (DRAM_NO_PTR_UPDATE_ON_PHASE_END + DRAM_NO_PTR_UPDATE_ON_PHASE_END_WIDTH)
+#define DRAM_WR_BUFFER_FLUSH_AND_RST_PTRS_WIDTH 1
+#define NCRISC_LOOP_NEXT_PIC_INT_ON_PHASE 0
+#define NCRISC_LOOP_NEXT_PIC_INT_ON_PHASE_WIDTH 20
+
+// Start address (in words) of the message blob buffer.
+// Only used when out-of-order execution is enabled. Read value consists of this register + current message blob offset.
+#define STREAM_MSG_BLOB_BUF_START_REG_INDEX 206
+
+// Global offset table write entry interface.
+#define STREAM_GLOBAL_OFFSET_TABLE_REG_INDEX 207
+#define GLOBAL_OFFSET_VAL 0
+#define GLOBAL_OFFSET_VAL_WIDTH MEM_WORD_ADDR_WIDTH
+#define GLOBAL_OFFSET_TABLE_INDEX_SEL (GLOBAL_OFFSET_VAL+GLOBAL_OFFSET_VAL_WIDTH)
+#define GLOBAL_OFFSET_TABLE_INDEX_SEL_WIDTH GLOBAL_OFFSET_TABLE_SIZE_WIDTH
+#define GLOBAL_OFFSET_TABLE_CLEAR (GLOBAL_OFFSET_TABLE_INDEX_SEL+GLOBAL_OFFSET_TABLE_INDEX_SEL_WIDTH)
+#define GLOBAL_OFFSET_TABLE_CLEAR_WIDTH 1
+
+// Scratch location for firmware usage
+// Guarantees that no side-effects occur in Overlay hardware
+// Does not map to any actual registers in streams
+#define FIRMWARE_SCRATCH_REG_INDEX 208
+
+// Bit mask of connected local sources. Don't care if LOCAL_SOURCES_CONNECTED == 0.
+// Mask segments [23:0], [47:24], and [63:48] are at indexes STREAM_LOCAL_SRC_MASK_REG_INDEX,
+// STREAM_LOCAL_SRC_MASK_REG_INDEX+1, STREAM_LOCAL_SRC_MASK_REG_INDEX+2.
+#define STREAM_LOCAL_SRC_MASK_REG_INDEX 224
+
+// Reserved for msg header fetch interface
+#define STREAM_MSG_HEADER_FETCH_REG_INDEX 254
+
+// Reserved for legacy reasons. This range appears not to be used in rtl anymore.
+#define RESERVED1_REG_INDEX 255
+
+// Only in receiver endpoint/dram streams
+// A 32 bit scratch register
+#define STREAM_SCRATCH32_REG_INDEX 256
+
+// Status info for the stream.
+#define STREAM_WAIT_STATUS_REG_INDEX 257
+// Set when stream is in START state with auto-config disabled, or if auto-config is enabled
+// but PHASE_AUTO_ADVANCE=0
+#define WAIT_SW_PHASE_ADVANCE_SIGNAL 0
+#define WAIT_SW_PHASE_ADVANCE_SIGNAL_WIDTH 1
+// Set when stream has configured the current phase, but waits for data from the previous one to be flushed.
+#define WAIT_PREV_PHASE_DATA_FLUSH (WAIT_SW_PHASE_ADVANCE_SIGNAL+WAIT_SW_PHASE_ADVANCE_SIGNAL_WIDTH)
+#define WAIT_PREV_PHASE_DATA_FLUSH_WIDTH 1
+// Set when stream is in data forwarding state.
+#define MSG_FWD_ONGOING (WAIT_PREV_PHASE_DATA_FLUSH+WAIT_PREV_PHASE_DATA_FLUSH_WIDTH)
+#define MSG_FWD_ONGOING_WIDTH 1
+#define STREAM_CURR_STATE (MSG_FWD_ONGOING+MSG_FWD_ONGOING_WIDTH)
+#define STREAM_CURR_STATE_WIDTH 4
+#define TOKEN_GOTTEN (STREAM_CURR_STATE+STREAM_CURR_STATE_WIDTH)
+#define TOKEN_GOTTEN_WIDTH 1
+#define INFINITE_PHASE_END_DETECTED (TOKEN_GOTTEN+TOKEN_GOTTEN_WIDTH)
+#define INFINITE_PHASE_END_DETECTED_WIDTH 1
+#define INFINITE_PHASE_END_HEADER_BUFFER_DETECTED (INFINITE_PHASE_END_DETECTED+INFINITE_PHASE_END_DETECTED_WIDTH)
+#define INFINITE_PHASE_END_HEADER_BUFFER_DETECTED_WIDTH 1
+
+// Only in receiver endpoint streams (stream 4 and 5)
+// Read-only. Tells you the number of tiles that have arrived in L1
+#define STREAM_NUM_MSGS_RECEIVED_IN_BUF_AND_MEM_REG_INDEX 258
+
+// Number of received & stored messages (read-only).
+// To get the total number of messages pending in memory read
+// STREAM_NUM_MSGS_RECEIVED_IN_BUF_AND_MEM_REG_INDEX
+#define STREAM_NUM_MSGS_RECEIVED_REG_INDEX 259
+
+// Available buffer space at the stream (in 16B words).
+// Source can't send data unless available space > 0.
+#define STREAM_BUF_SPACE_AVAILABLE_REG_INDEX 260
+
+// Available msg info buffer space at the stream (in 16B words).
+// Source can't send data unless available space > 0.
+// Only valid when MSG_INFO_BUF_FLOW_CTRL is true
+#define STREAM_MSG_INFO_BUF_SPACE_AVAILABLE_REG_INDEX 261
+
+// Memory address (in words) of the next in line received message (read-only).
+#define STREAM_NEXT_RECEIVED_MSG_ADDR_REG_INDEX 262
+
+// Size in words of the next in line received message (read-only).
+#define STREAM_NEXT_RECEIVED_MSG_SIZE_REG_INDEX 263
+
+// Clear message info, move read pointer, and reclaim buffer space for one or more stored messages.
+// This is a special case of STREAM_MSG_INFO_CLEAR/STREAM_MSG_DATA_CLEAR where we aren't streaming data
+// and instead we just want to clear a bunch of messages after we have used them.
+// If you are using streaming it is better to use STREAM_MSG_INFO_CLEAR/STREAM_MSG_DATA_CLEAR instead.
+// You should not use both STREAM_MSG_INFO_CLEAR/STREAM_MSG_DATA_CLEAR and STREAM_MULTI_MSG_CLEAR at the same time
+// Must be used only for streams where RECEIVER_ENDPOINT == 1.
+#define STREAM_MULTI_MSG_CLEAR_REG_INDEX 264
+
+// Clear message info for one or more stored messages. Only valid values are 1, 2, or 4.
+// No effect on the read pointer.
+// Should be used only for streams where RECEIVER_ENDPOINT == 1.
+#define STREAM_MSG_INFO_CLEAR_REG_INDEX 265
+
+// Move read pointer & reclaim buffer space for one or more stored messages.
+// Sends flow control update to the source if REMOTE_SOURCE==1.
+// Only valid values are 1, 2, or 4.
+// Should be used only for streams where RECEIVER_ENDPOINT == 1, after
+// STREAM_MSG_INFO_CLEAR_REG has been written with the same value.
+#define STREAM_MSG_DATA_CLEAR_REG_INDEX 266
+
+// Write-only. Write 1 to advance to the next phase if PHASE_AUTO_ADVANCE == 0.
+#define STREAM_PHASE_ADVANCE_REG_INDEX 267
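// Sketch (ours) of a receiver-endpoint drain step using the registers above:
// inspect the next message, consume it, then clear info and data (one message
// at a time here; 2 and 4 are also legal clear counts).
inline void drain_one_message(uint32_t stream_id) {
    if (NOC_STREAM_READ_REG(stream_id, STREAM_NUM_MSGS_RECEIVED_REG_INDEX) == 0) return;
    uint32_t addr_words = NOC_STREAM_READ_REG(stream_id, STREAM_NEXT_RECEIVED_MSG_ADDR_REG_INDEX);
    uint32_t size_words = NOC_STREAM_READ_REG(stream_id, STREAM_NEXT_RECEIVED_MSG_SIZE_REG_INDEX);
    (void)addr_words; (void)size_words;   // ... process the message in L1 here ...
    NOC_STREAM_WRITE_REG(stream_id, STREAM_MSG_INFO_CLEAR_REG_INDEX, 1);
    NOC_STREAM_WRITE_REG(stream_id, STREAM_MSG_DATA_CLEAR_REG_INDEX, 1);
}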
+
+// Write phase number to indicate destination ready for the given phase.
+// (This is done automatically by stream hardware when starting a phase with REMOTE_SOURCE=1.)
+// The phase number is the one indicated by STREAM_REMOTE_SRC_PHASE_REG at destination.
+// This register is mapped to the shared destination ready table, not a per-stream register.
+// (Stream index is taken from the register address, and stored into the table along with the
+// phase number.)
+#define STREAM_DEST_PHASE_READY_UPDATE_REG_INDEX 268
+#define PHASE_READY_DEST_NUM 0
+#define PHASE_READY_DEST_NUM_WIDTH 6
+#define PHASE_READY_NUM (PHASE_READY_DEST_NUM+PHASE_READY_DEST_NUM_WIDTH)
+#define PHASE_READY_NUM_WIDTH 20
+// set if this stream is part of multicast group (i.e. if REMOTE_SRC_IS_MCAST==1)
+#define PHASE_READY_MCAST (PHASE_READY_NUM+PHASE_READY_NUM_WIDTH)
+#define PHASE_READY_MCAST_WIDTH 1
+// set if the message is in response to 2-way handshake
+#define PHASE_READY_TWO_WAY_RESP (PHASE_READY_MCAST+PHASE_READY_MCAST_WIDTH)
+#define PHASE_READY_TWO_WAY_RESP_WIDTH 1
+
+// Source ready message register for two-way handshake (sent by source in
+// case destination ready entry is not found in the table).
+// If received by a stream that already sent its ready update, it prompts resending.
+#define STREAM_SRC_READY_UPDATE_REG_INDEX 269
+#define STREAM_REMOTE_RDY_SRC_X 0
+#define STREAM_REMOTE_RDY_SRC_X_WIDTH NOC_ID_WIDTH
+#define STREAM_REMOTE_RDY_SRC_Y (STREAM_REMOTE_RDY_SRC_X+STREAM_REMOTE_RDY_SRC_X_WIDTH)
+#define STREAM_REMOTE_RDY_SRC_Y_WIDTH NOC_ID_WIDTH
+#define REMOTE_RDY_SRC_STREAM_ID (STREAM_REMOTE_RDY_SRC_Y+STREAM_REMOTE_RDY_SRC_Y_WIDTH)
+#define REMOTE_RDY_SRC_STREAM_ID_WIDTH STREAM_ID_WIDTH
+#define IS_TOKEN_UPDATE (REMOTE_RDY_SRC_STREAM_ID+REMOTE_RDY_SRC_STREAM_ID_WIDTH)
+#define IS_TOKEN_UPDATE_WIDTH 1
+
+// Update available buffer space at remote destination stream.
+// This is the rd_ptr increment issued when a message is forwarded
+#define STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_REG_INDEX 270
+#define REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_DEST_NUM 0
+#define REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_DEST_NUM_WIDTH 6
+#define REMOTE_DEST_BUF_WORDS_FREE_INC (REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_DEST_NUM+REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_DEST_NUM_WIDTH)
+#define REMOTE_DEST_BUF_WORDS_FREE_INC_WIDTH MEM_WORD_ADDR_WIDTH
+#define REMOTE_DEST_MSG_INFO_BUF_WORDS_FREE_INC (REMOTE_DEST_BUF_WORDS_FREE_INC+REMOTE_DEST_BUF_WORDS_FREE_INC_WIDTH)
+#define REMOTE_DEST_MSG_INFO_BUF_WORDS_FREE_INC_WIDTH MSG_INFO_BUF_SIZE_WORDS_WIDTH
+
+// Write to reset & stop stream.
+#define STREAM_RESET_REG_INDEX 271
+
+// AND value of zero masks for the pending message group.
+// (Header bits [95:64].)
+// Read-only. Valid only for receiver endpoint streams.
+#define STREAM_MSG_GROUP_ZERO_MASK_AND_REG_INDEX 272
+
+// Returns 1 if the message info register is full (read-only).
+#define STREAM_MSG_INFO_FULL_REG_INDEX 273
+
+// Returns 1 if the message info register is full (read-only), and there are no outstanding loads in progress.
+#define STREAM_MSG_INFO_FULLY_LOADED_REG_INDEX 274
+
+// Returns 1 if the message info register can accept new message push (read-only).
+// Equivalent to checking the condition:
+// (STREAM_MSG_INFO_FULL_REG_INDEX == 0) && (STREAM_MSG_INFO_PTR_REG_INDEX == STREAM_MSG_INFO_WR_PTR_REG_INDEX)
+// (I.e. there is free space in the msg info register, and we don't have any message info headers in the
+// memory buffer about to be fetched.)
+#define STREAM_MSG_INFO_CAN_PUSH_NEW_MSG_REG_INDEX 275
+
+// Concat compress flags from 4 tiles in the pending message group.
+// (Header bit 52.)
+// Read-only. Valid only for receiver endpoint streams.
+#define STREAM_MSG_GROUP_COMPRESS_REG_INDEX 276
+
+// Returns 1 if all msgs that the phase can accept have been pushed into the stream. 0 otherwise.
+#define STREAM_PHASE_ALL_MSGS_PUSHED_REG_INDEX 277
+
+// Returns 1 if the stream is in a state where it can accept msgs.
+#define STREAM_READY_FOR_MSG_PUSH_REG_INDEX 278
+
+// Returns global offset table entry 0. The rest of the table entries can be read at index
+// STREAM_GLOBAL_OFFSET_TABLE_RD_REG_INDEX+i, up to maximum entry size.
+#define STREAM_GLOBAL_OFFSET_TABLE_RD_REG_INDEX 279
+
+// 32 bit register. Each bit denotes whether the corresponding stream has completed its blob run and is in idle state.
+// Resets to 0 upon starting a new stream run. Initially all are 0 to exclude streams that might not be used.
+// Can be manually reset to 0 by writing 1 to the corresponding bit.
+// Exists only in stream 0
+#define STREAM_BLOB_AUTO_CFG_DONE_REG_INDEX 288
+
+// Reading this register will give you a stream id of a stream that finished its blob (according to STREAM_BLOB_AUTO_CFG_DONE_REG_INDEX)
+// Subsequent reads will give you the next stream, until all streams are read, after which it will loop
+// This register is only valid if BLOB_NEXT_AUTO_CFG_DONE_VALID is set (i.e. if STREAM_BLOB_AUTO_CFG_DONE_REG_INDEX non-zero)
+// Exists only in stream 0
+#define STREAM_BLOB_NEXT_AUTO_CFG_DONE_REG_INDEX 290
+#define BLOB_NEXT_AUTO_CFG_DONE_STREAM_ID 0
+#define BLOB_NEXT_AUTO_CFG_DONE_STREAM_ID_WIDTH STREAM_ID_WIDTH
+#define BLOB_NEXT_AUTO_CFG_DONE_VALID 16
+#define BLOB_NEXT_AUTO_CFG_DONE_VALID_WIDTH 1
+
+// For receiver endpoint streams that expose the full message header bus to unpacker,
+// write this register to specify the full header in case the stream is not snooping
+// a remote source but instead also works as a source endpoint.
+// Write (STREAM_RECEIVER_ENDPOINT_SET_MSG_HEADER_REG_INDEX+i) to set bits [i*32 +: 32]
+// of the message header for the next message, prior to writing STREAM_SOURCE_ENDPOINT_NEW_MSG_INFO_REG_INDEX.
+#define STREAM_RECEIVER_ENDPOINT_SET_MSG_HEADER_REG_INDEX 291
+
+// Available buffer space at remote destination stream(s) for both the data buffer and msg info buffer.
+// Don't care unless REMOTE_RECEIVER == 1.
+// Source can't send data unless WORDS_FREE > 0.
+// Read-only; updated automatically to maximum value when
+// STREAM_REMOTE_DEST_BUF_SIZE_REG/STREAM_REMOTE_DEST_MSG_INFO_BUF_SIZE_REG is updated.
+// For multicast streams, values for successive destinations are at
+// subsequent indexes (STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_REG_INDEX+1,
+// STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_REG_INDEX+2, etc.).
+// REMOTE_DEST_MSG_INFO_WORDS_FREE is only valid when DEST_MSG_INFO_BUF_FLOW_CTRL is true
+#define STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_REG_INDEX 297
+#define REMOTE_DEST_WORDS_FREE 0
+#define REMOTE_DEST_WORDS_FREE_WIDTH MEM_WORD_ADDR_WIDTH
+#define REMOTE_DEST_MSG_INFO_WORDS_FREE (REMOTE_DEST_WORDS_FREE+REMOTE_DEST_WORDS_FREE_WIDTH)
+#define REMOTE_DEST_MSG_INFO_WORDS_FREE_WIDTH MSG_INFO_BUF_SIZE_WORDS_WIDTH
+
+// Read-only register view of the bits on the o_full_msg_info bus.
+// Exposed as 32-bit read-only registers starting at this index.
+#define STREAM_RECEIVER_MSG_INFO_REG_INDEX 329
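// Sketch (ours): a sender-side check of remote destination space using the
// register above (destination 0 of a possibly-multicast stream).
inline bool dest_has_space(uint32_t stream_id) {
    uint32_t v = NOC_STREAM_READ_REG(stream_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_REG_INDEX);
    uint32_t words_free = (v >> REMOTE_DEST_WORDS_FREE) & ((1u << REMOTE_DEST_WORDS_FREE_WIDTH) - 1);
    return words_free > 0;
}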
+
+// Debug bus stream selection. Write the stream id for the stream that you want exposed on the debug bus
+// This register only exists in stream 0.
+#define STREAM_DEBUG_STATUS_SEL_REG_INDEX 499
+#define DEBUG_STATUS_STREAM_ID_SEL 0
+#define DEBUG_STATUS_STREAM_ID_SEL_WIDTH STREAM_ID_WIDTH
+#define DISABLE_DEST_READY_TABLE (DEBUG_STATUS_STREAM_ID_SEL+DEBUG_STATUS_STREAM_ID_SEL_WIDTH)
+#define DISABLE_DEST_READY_TABLE_WIDTH 1
+#define DISABLE_GLOBAL_OFFSET_TABLE (DISABLE_DEST_READY_TABLE+DISABLE_DEST_READY_TABLE_WIDTH)
+#define DISABLE_GLOBAL_OFFSET_TABLE_WIDTH 1
+
+// Debugging: Non-zero value indicates an invalid stream operation occurred.
+// Sticky, write 1 to clear.
+#define STREAM_DEBUG_ASSERTIONS_REG_INDEX 500
+
+// Read-only register that exposes internal states of the stream.
+// Useful for debugging. Valid 32-bit data from STREAM_DEBUG_STATUS_REG_INDEX + 0 to STREAM_DEBUG_STATUS_REG_INDEX + 9
+#define STREAM_DEBUG_STATUS_REG_INDEX 501
+
+// Reserved for legacy reasons. This range appears not to be used in rtl anymore.
+#define RESERVED2_REG_INDEX 511
+
+#endif // def NOC_OVERLAY_PARAMETERS_H
diff --git a/tt_metal/hw/inc/blackhole/noc/noc_overlay_parameters.hpp b/tt_metal/hw/inc/blackhole/noc/noc_overlay_parameters.hpp
new file mode 100644
index 000000000000..9de6c55378fb
--- /dev/null
+++ b/tt_metal/hw/inc/blackhole/noc/noc_overlay_parameters.hpp
@@ -0,0 +1,321 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// AUTO_GENERATED! DO NOT MODIFY!                                                                                               //
+//                                                                                                                             //
+// Please run                                                                                                                  //
+//                                                                                                                             //
+// (echo '<% type=:svh_header %>' && cat noc_overlay_parameters.erb) | erb -T - > ../rtl/overlay/tt_noc_overlay_params.svh     //
+// (echo '<% type=:c_header %>' && cat noc_overlay_parameters.erb) | erb -T - > noc_overlay_parameters.h                       //
+// (echo '<% type=:cpp_header %>' && cat noc_overlay_parameters.erb) | erb -T - > noc_overlay_parameters.hpp                   //
+// (echo '<% type=:rb_header %>' && cat noc_overlay_parameters.erb) | erb -T - > noc_overlay_parameters.rb                     //
+// Open noc_overlay_parameters.hpp and move static class variable definitions to noc_overlay_parameters.cpp,                   //
+// overriding existing ones.                                                                                                   //
+//                                                                                                                             //
+// to regenerate                                                                                                               //
+//                                                                                                                             //
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include <stdexcept>
+
+#ifndef NOC_OVERLAY_PARAMETERS_BASIC_H
+#define NOC_OVERLAY_PARAMETERS_BASIC_H
+
+#define NOC_NUM_STREAMS 64
+#define ETH_NOC_NUM_STREAMS 32
+
+#define NUM_MCAST_STREAM_ID_START 0
+#define NUM_MCAST_STREAM_ID_END 3
+#define NUM_RECEIVER_ENDPOINT_STREAM_ID_START 4
+#define NUM_RECEIVER_ENDPOINT_STREAM_ID_END 5
+#define NUM_REMOTE_RECEIVER_STREAM_ID_START 0
+#define NUM_REMOTE_RECEIVER_STREAM_ID_END 63
+#define RECEIVER_ENDPOINT_STREAM_MSG_GROUP_SIZE 4
+#define RECEIVER_ENDPOINT_STREAM_MSG_INFO_FIFO_GROUPS 4
+#define NON_RECEIVER_ENDPOINT_STREAM_MSG_INFO_FIFO_GROUPS 2
+#define DEST_READY_COMMON_CACHE_NUM_ENTRIES 24
+#define DEST_READY_MCAST_CACHE_NUM_ENTRIES 8
+
+#define NOC_OVERLAY_START_ADDR 0xFFB40000
+#define NOC_STREAM_REG_SPACE_SIZE 0x1000
+
+#define STREAM_REG_ADDR(stream_id, reg_id) ((NOC_OVERLAY_START_ADDR) + (((uint32_t)(stream_id))*(NOC_STREAM_REG_SPACE_SIZE)) + (((uint32_t)(reg_id)) << 2))
+
+#define NUM_NOCS 2
+#define NOC0_REGS_START_ADDR 0xFFB20000
+#define NOC1_REGS_START_ADDR 0xFFB30000
+
+#define NCRISC_STREAM_RANGE_1_START 0
+#define NCRISC_STREAM_RANGE_1_END 3
+#define NCRISC_STREAM_RANGE_2_START 8
+#define NCRISC_STREAM_RANGE_2_END 11
+#define NCRISC_PIC_CONFIG_PHASE_DEFAULT 0
+
+#ifdef TB_NOC
+
+extern "C" {
+#include "noc.h"
+#include "noc_api_dpi.h"
+}
+
+#else
+
+#define NOC_STREAM_WRITE_REG(stream_id, reg_id, val) ((*((volatile uint32_t*)(STREAM_REG_ADDR(stream_id, reg_id)))) = (val))
+#define NOC_STREAM_READ_REG(stream_id, reg_id) (*((volatile uint32_t*)(STREAM_REG_ADDR(stream_id, reg_id))))
+
+#define NOC_STREAM_WRITE_REG_FIELD(stream_id, reg_id, field, val) (NOC_STREAM_WRITE_REG(stream_id, reg_id, ((NOC_STREAM_READ_REG(stream_id, reg_id) & ~((1 << field##_WIDTH) - 1)) | ((val & ((1 << field##_WIDTH) - 1)) << field))))
+#define NOC_STREAM_READ_REG_FIELD(stream_id, reg_id, field) ((NOC_STREAM_READ_REG(stream_id, reg_id) >> field) & ((1 << field##_WIDTH) - 1))
+
+#define NOC_WRITE_REG(addr, val) ((*((volatile uint32_t*)(addr)))) = (val)
+#define NOC_READ_REG(addr) (*((volatile uint32_t*)(addr)))
+
+#endif
+
+
+#define NOC_ID_WIDTH 6
+#define STREAM_ID_WIDTH 6
+
+#define DEST_CNT_WIDTH 6
+#define NOC_NUM_WIDTH 1
+
+#define STREAM_REG_INDEX_WIDTH 9
+#define STREAM_REG_CFG_DATA_WIDTH 24
+
+#define MEM_WORD_WIDTH 16
+#define MEM_WORD_ADDR_WIDTH 17
+
+#define MEM_WORD_BIT_OFFSET_WIDTH 7
+
+#define MSG_INFO_BUF_SIZE_WORDS 256
+#define MSG_INFO_BUF_SIZE_BITS 8
+#define MSG_INFO_BUF_SIZE_POW_BITS 3
+#define MSG_INFO_BUF_SIZE_WORDS_WIDTH (MSG_INFO_BUF_SIZE_BITS + 1)
+
+#define GLOBAL_OFFSET_TABLE_SIZE 8
+#define GLOBAL_OFFSET_TABLE_SIZE_WIDTH 3
+
+#endif
+
+namespace Noc {
+
+typedef struct OverlayField_ {
+    std::string name;
+    std::uint32_t offset;
+    std::uint32_t width;
+    std::string description;
+} OverlayField;
+
+typedef struct OverlayReg_ {
+    std::string name;
+    std::uint32_t index;
+    std::unordered_map<std::string, std::uint32_t> fields_by_name;
+    std::unordered_map<std::uint32_t, std::uint32_t> fields_by_offset;
+    std::vector<OverlayField> fields;
+    std::string description;
+} OverlayReg;
+
+// OverLayParams
+class OLP {
+  private:
+    static const std::unordered_map<std::string, std::uint32_t> registers_by_name;
+    static const std::unordered_map<std::uint32_t, std::uint32_t> registers_by_index;
+    static const std::vector<OverlayReg> registers;
+    static const std::unordered_map<std::string, std::uint32_t> fields_by_name;
+    static const std::vector<OverlayField> fields;
+
+  private:
+    // Disallow creating an instance of this object
+    OLP() {}
+
+  public:
+    static bool HasReg(std::string label)
+    {
+        return registers_by_name.count(label) >= 1;
+    }
+
+    // There might be multiple registers with the same index
+    // If so a register you didn't intend to access might be accessed.
+    // Use accessor based on label if possible
+    static bool HasReg(std::uint32_t index)
+    {
+        return registers_by_index.count(index) >= 1;
+    }
+
+    static const std::vector<OverlayReg>& GetAllRegs()
+    {
+        return registers;
+    }
+
+    // There might be multiple registers with the same index
+    // If so a register you didn't intend to access might be accessed.
+    // Use accessor based on label if possible
+    static std::string RegName(std::uint32_t index)
+    {
+        if (HasReg(index))
+            return registers[registers_by_index.at(index)].name;
+        else
+            throw std::runtime_error("Non-existent overlay register index: " + std::to_string(index));
+    }
+
+    static std::uint32_t RegIdx(std::string label)
+    {
+        if (HasReg(label))
+            return registers[registers_by_name.at(label)].index;
+        else
+            throw std::runtime_error("Non-existent overlay register: " + std::string(label));
+    }
+
+    static std::string RegInfo(std::string label)
+    {
+        if (HasReg(label))
+            return registers[registers_by_name.at(label)].description;
+        else
+            throw std::runtime_error("Non-existent overlay register: " + std::string(label));
+    }
+
+    ////////////////////////////////////
+
+    static bool HasFld(std::string label)
+    {
+        return fields_by_name.count(label) >= 1;
+    }
+
+    static const std::vector<OverlayField>& GetAllFlds()
+    {
+        return fields;
+    }
+
+    static std::uint32_t FldOff(std::string label)
+    {
+        if (HasFld(label))
+            return fields[fields_by_name.at(label)].offset;
+        else
+            throw std::runtime_error("Non-existent overlay field: " + std::string(label));
+    }
+
+    static std::uint32_t FldW(std::string label)
+    {
+        if (HasFld(label))
+            return fields[fields_by_name.at(label)].width;
+        else
+            throw std::runtime_error("Non-existent overlay field: " + std::string(label));
+    }
+
+    static std::string FldInfo(std::string label)
+    {
+        if (HasFld(label))
+            return fields[fields_by_name.at(label)].description;
+        else
+            throw std::runtime_error("Non-existent overlay field: " + std::string(label));
+    }
+
+    ////////////////////////////////////
+
+    static bool HasFld(std::string reg_label, std::string field_label)
+    {
+        return HasReg(reg_label) &&
+               (registers[registers_by_name.at(reg_label)].fields_by_name.count(field_label) >= 1);
+    }
+
+    // There might be multiple registers(fields) with the same index(offset)
+    // If so a register(field) you didn't intend to access might be accessed.
+    // Use accessor based on label if possible
+    static bool HasFld(std::uint32_t reg_index, std::uint32_t field_offset)
+    {
+        return HasReg(reg_index) &&
+               (registers[registers_by_index.at(reg_index)].fields_by_offset.count(field_offset) >= 1);
+    }
+
+    static const std::vector<OverlayField>& GetAllFlds(std::string reg_label)
+    {
+        if (HasReg(reg_label)) {
+            return registers[registers_by_name.at(reg_label)].fields;
+        } else {
+            throw std::runtime_error("Non-existent overlay register: " + std::string(reg_label));
+        }
+    }
+
+    // There might be multiple registers(fields) with the same index(offset)
+    // If so a register(field) you didn't intend to access might be accessed.
+    // Use accessor based on label if possible
+    static const std::vector<OverlayField>& GetAllFlds(std::uint32_t reg_index)
+    {
+        if (HasReg(reg_index)) {
+            return registers[registers_by_index.at(reg_index)].fields;
+        } else {
+            throw std::runtime_error("Non-existent overlay register index: " + std::to_string(reg_index));
+        }
+    }
+
+    // There might be multiple registers(fields) with the same index(offset)
+    // If so a register(field) you didn't intend to access might be accessed.
+    // Use accessor based on label if possible
+    static std::string FldName(std::uint32_t reg_index, std::uint32_t field_offset)
+    {
+        if (HasFld(reg_index, field_offset)) {
+            auto field_tmp = registers[registers_by_index.at(reg_index)].fields;
+            auto index_field_temp = registers[registers_by_index.at(reg_index)].fields_by_offset.at(field_offset);
+            return field_tmp[index_field_temp].name;
+        } else {
+            throw std::runtime_error("Non-existent overlay register field (index, offset): " + std::to_string(reg_index) + ", " + std::to_string(field_offset));
+        }
+    }
+
+    static std::uint32_t FldOff(std::string reg_label, std::string field_label)
+    {
+        if (HasFld(reg_label, field_label)) {
+            auto field_tmp = registers[registers_by_name.at(reg_label)].fields;
+            auto index_field_temp = registers[registers_by_name.at(reg_label)].fields_by_name.at(field_label);
+            return field_tmp[index_field_temp].offset;
+        } else {
+            throw std::runtime_error("Non-existent overlay register field: " + std::string(reg_label) + ", " + std::string(field_label));
+        }
+    }
+
+    static std::uint32_t FldW(std::string reg_label, std::string field_label)
+    {
+        if (HasFld(reg_label, field_label)) {
+            auto field_tmp = registers[registers_by_name.at(reg_label)].fields;
+            auto index_field_temp = registers[registers_by_name.at(reg_label)].fields_by_name.at(field_label);
+            return field_tmp[index_field_temp].width;
+        } else {
+            throw std::runtime_error("Non-existent overlay register field: " + std::string(reg_label) + ", " + std::string(field_label));
+        }
+    }
+
+    // There might be multiple registers(fields) with the same index(offset)
+    // If so a register(field) you didn't intend to access might be accessed.
+    // Use accessor based on label if possible
+    static std::uint32_t FldW(std::uint32_t reg_index, std::uint32_t field_offset)
+    {
+        if (HasFld(reg_index, field_offset)) {
+            auto field_tmp = registers[registers_by_index.at(reg_index)].fields;
+            auto index_field_temp = registers[registers_by_index.at(reg_index)].fields_by_offset.at(field_offset);
+            return field_tmp[index_field_temp].width;
+        } else {
+            throw std::runtime_error("Non-existent overlay register field (index, offset): " + std::to_string(reg_index) + ", " + std::to_string(field_offset));
+        }
+    }
+
+    static std::string FldInfo(std::string reg_label, std::string field_label)
+    {
+        if (HasFld(reg_label, field_label)) {
+            auto field_tmp = registers[registers_by_name.at(reg_label)].fields;
+            auto index_field_temp = registers[registers_by_name.at(reg_label)].fields_by_name.at(field_label);
+            return field_tmp[index_field_temp].description;
+        } else {
+            throw std::runtime_error("Non-existent overlay register field: " + std::string(reg_label) + ", " + std::string(field_label));
+        }
+    }
+
+};
+
+}
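// Usage sketch (ours): the name-based OLP accessors are the safer path, since
// register indices are not unique. The label strings below are hypothetical;
// the exact names come from the generated tables in noc_overlay_parameters.cpp.
inline std::uint32_t lookup_field_offset_example() {
    if (Noc::OLP::HasFld("STREAM_MISC_CFG", "SOURCE_ENDPOINT"))
        return Noc::OLP::FldOff("STREAM_MISC_CFG", "SOURCE_ENDPOINT");
    return 0;  // label not present in the generated tables
}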
diff --git a/tt_metal/hw/inc/blackhole/noc/noc_parameters.h b/tt_metal/hw/inc/blackhole/noc/noc_parameters.h
new file mode 100644
index 000000000000..12b1e774e64e
--- /dev/null
+++ b/tt_metal/hw/inc/blackhole/noc/noc_parameters.h
@@ -0,0 +1,362 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef _NOC_PARAMETERS_H_
+#define _NOC_PARAMETERS_H_
+
+#define NUM_NOCS 2
+#define NUM_TENSIXES 140
+
+#define NOC_MAX_TRANSACTION_ID 0xF
+#define NOC_MAX_TRANSACTION_ID_COUNT 255
+
+#define NOC_REG_SPACE_START_ADDR 0xFF000000
+#define NOC_REGS_START_ADDR 0xFFB20000
+#define NOC_CMD_BUF_OFFSET 0x00000800
+#define NOC_CMD_BUF_OFFSET_BIT 11
+#define NOC_INSTANCE_OFFSET 0x00010000
+#define NOC_INSTANCE_OFFSET_BIT 16
+#define NOC_CMD_BUF_INSTANCE_OFFSET(noc, buf) ((buf << NOC_CMD_BUF_OFFSET_BIT) + (noc << NOC_INSTANCE_OFFSET_BIT))
+
+////
+// NIU master IF control registers:
+
+#define NOC_TARG_ADDR_LO (NOC_REGS_START_ADDR+0x0)
+#define NOC_TARG_ADDR_MID (NOC_REGS_START_ADDR+0x4)
+#define NOC_TARG_ADDR_HI (NOC_REGS_START_ADDR+0x8)
+
+#define NOC_RET_ADDR_LO (NOC_REGS_START_ADDR+0xC)
+#define NOC_RET_ADDR_MID (NOC_REGS_START_ADDR+0x10)
+#define NOC_RET_ADDR_HI (NOC_REGS_START_ADDR+0x14)
+
+#define NOC_PACKET_TAG (NOC_REGS_START_ADDR+0x18)
+#define NOC_CTRL (NOC_REGS_START_ADDR+0x1C)
+#define NOC_AT_LEN_BE (NOC_REGS_START_ADDR+0x20)
+#define NOC_AT_LEN_BE_1 (NOC_REGS_START_ADDR+0x24)
+#define NOC_AT_DATA (NOC_REGS_START_ADDR+0x28)
+#define NOC_BRCST_EXCLUDE (NOC_REGS_START_ADDR+0x2C)
+#define NOC_L1_ACC_AT_INSTRN (NOC_REGS_START_ADDR+0x30)
+#define NOC_SEC_CTRL (NOC_REGS_START_ADDR+0x34)
+
+#define NOC_CMD_CTRL (NOC_REGS_START_ADDR+0x40)
+#define NOC_NODE_ID (NOC_REGS_START_ADDR+0x44)
+#define NOC_ENDPOINT_ID (NOC_REGS_START_ADDR+0x48)
+
+#define NUM_MEM_PARITY_ERR (NOC_REGS_START_ADDR+0x50)
+#define NUM_HEADER_1B_ERR (NOC_REGS_START_ADDR+0x54)
+#define NUM_HEADER_2B_ERR (NOC_REGS_START_ADDR+0x58)
+#define ECC_CTRL (NOC_REGS_START_ADDR+0x5C) // [2:0] = clear ECC interrupts, [5:3] = force ECC error
+
+#define NOC_CLEAR_OUTSTANDING_REQ_CNT (NOC_REGS_START_ADDR+0x60)
+#define CMD_BUF_AVAIL (NOC_REGS_START_ADDR+0x64) // [28:24], [20:16], [12:8], [4:0]
+#define CMD_BUF_OVFL (NOC_REGS_START_ADDR+0x68)
+
+#define NOC_SEC_FENCE_RANGE(cnt) (NOC_REGS_START_ADDR+0x400+((cnt)*4)) // 32 inst
+#define NOC_SEC_FENCE_ATTRIBUTE(cnt) (NOC_REGS_START_ADDR+0x480+((cnt)*4)) // 8 inst
+#define NOC_SEC_FENCE_MASTER_LEVEL (NOC_REGS_START_ADDR+0x4A0)
+#define NOC_SEC_FENCE_FIFO_STATUS (NOC_REGS_START_ADDR+0x4A4)
+#define NOC_SEC_FENCE_FIFO_RDDATA (NOC_REGS_START_ADDR+0x4A8)
+
+// 16 VC, 64 bit registers, 2 ports
+#define PORT1_FLIT_COUNTER_LOWER(vc) (NOC_REGS_START_ADDR+0x500+((vc)*8))
+#define PORT1_FLIT_COUNTER_UPPER(vc) (NOC_REGS_START_ADDR+0x504+((vc)*8))
+
+#define PORT2_FLIT_COUNTER_LOWER(vc) (NOC_REGS_START_ADDR+0x580+((vc)*8))
+#define PORT2_FLIT_COUNTER_UPPER(vc) (NOC_REGS_START_ADDR+0x584+((vc)*8))
+
+////
+
+#define NOC_STATUS(cnt) (NOC_REGS_START_ADDR+0x200+((cnt)*4))
+
+// status/performance counter registers
+// IMPROVE: add offsets for misc. debug status registers
+
+// from noc/rtl/tt_noc_params.svh
+//parameter TOTAL_STATUS_REGS = NIU_STATUS_REGS + MST_IF_INTP_STATUS_REGS + ROUTER_STATUS_REGS + SLV_IF_STATUS_REGS + MST_IF_STATUS_REGS; // 32+2+30+16+48=128
+// NIU_STATUS : 0x60-0x7F
+// MST_IF_INTP_STATUS: 0x5E-0x5F
+// ROUTER_STATUS : 0x40-0x5D
+// SLV_IF_STATUS : 0x30-0x3F
+// MST_IF_STATUS : 0x 0-0x2F
+
+#define NIU_TRANS_COUNT_RTZ_NUM 0x5E
+#define NIU_TRANS_COUNT_RTZ_SOURCE 0x5F
+
+
+#define NIU_SLV_POSTED_WR_REQ_STARTED 0x3D
+#define NIU_SLV_NONPOSTED_WR_REQ_STARTED 0x3C
+#define NIU_SLV_POSTED_WR_REQ_RECEIVED 0x3B
+#define NIU_SLV_NONPOSTED_WR_REQ_RECEIVED 0x3A
+#define NIU_SLV_POSTED_WR_DATA_WORD_RECEIVED 0x39
+#define NIU_SLV_NONPOSTED_WR_DATA_WORD_RECEIVED 0x38
+#define NIU_SLV_POSTED_ATOMIC_RECEIVED 0x37
+#define NIU_SLV_NONPOSTED_ATOMIC_RECEIVED 0x36
+#define NIU_SLV_RD_REQ_RECEIVED 0x35
+
+#define NIU_SLV_REQ_ACCEPTED 0x34
+#define NIU_SLV_RD_DATA_WORD_SENT 0x33
+#define NIU_SLV_RD_RESP_SENT 0x32
+#define NIU_SLV_WR_ACK_SENT 0x31
+#define NIU_SLV_ATOMIC_RESP_SENT 0x30
+
+#define NIU_MST_WRITE_REQS_OUTGOING_ID(id) (0x20 + (id))
+#define NIU_MST_REQS_OUTSTANDING_ID(id) (0x10 + (id))
+
+#define NIU_MST_NONPOSTED_ATOMIC_STARTED 0xF
+#define NIU_MST_RD_REQ_STARTED 0xE
+#define NIU_MST_POSTED_WR_REQ_STARTED 0xD
+#define NIU_MST_NONPOSTED_WR_REQ_STARTED 0xC
+#define NIU_MST_POSTED_WR_REQ_SENT 0xB
+#define NIU_MST_NONPOSTED_WR_REQ_SENT 0xA
+#define NIU_MST_POSTED_WR_DATA_WORD_SENT 0x9
+#define NIU_MST_NONPOSTED_WR_DATA_WORD_SENT 0x8
+#define NIU_MST_POSTED_ATOMIC_SENT 0x7
+#define NIU_MST_NONPOSTED_ATOMIC_SENT 0x6
+#define NIU_MST_RD_REQ_SENT 0x5
+
+#define NIU_MST_CMD_ACCEPTED 0x4
+#define NIU_MST_RD_DATA_WORD_RECEIVED 0x3
+#define NIU_MST_RD_RESP_RECEIVED 0x2
+#define NIU_MST_WR_ACK_RECEIVED 0x1
+#define NIU_MST_ATOMIC_RESP_RECEIVED 0x0
+
+
+/////
+
+#define NOC_CFG(cnt) (NOC_REGS_START_ADDR+0x100+((cnt)*4))
+
+
+// 0 = clk gt enable
+// [7:1] = clk gt hysteresis
+// [8] = NIU mem parity enable
+// [11:9] = ECC interrupts enable
+// [12] = tile clock disable
+// [13] = (noc2axi only) header double store disable
+// [14] = enable coordinate translation
+#define NIU_CFG_0 0x0
+#define NIU_CFG_0_ECC_NIU_MEM_PARITY_EN 8
+#define NIU_CFG_0_ECC_MEM_PARITY_INT_EN 9
+#define NIU_CFG_0_ECC_HEADER_1B_INT_EN 10
+#define NIU_CFG_0_ECC_HEADER_2B_INT_EN 11
+#define NIU_CFG_0_TILE_CLK_OFF 12
+#define NIU_CFG_0_TILE_HEADER_STORE_OFF 13 // NOC2AXI only
+#define NIU_CFG_0_NOC_ID_TRANSLATE_EN 14
+#define NIU_CFG_0_AXI_SLAVE_ENABLE 15
+#define NIU_CFG_0_CMD_BUFFER_FIFO_EN 16
+// NCRISC is using NIU_CFG_0[31:24] to store debug postcodes; if you need these bits for hardware, move the ncrisc postcode write location in ncrisc.cc.
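// Sketch (ours): enabling coordinate translation by setting NIU_CFG_0 bit 14
// through the NOC_CFG() index above, for NOC instance 0. We assume the
// volatile-pointer NOC_READ_REG/NOC_WRITE_REG accessors from the overlay
// header are in scope.
inline void enable_noc_id_translation() {
    uint32_t cfg = NOC_READ_REG(NOC_CFG(NIU_CFG_0));
    NOC_WRITE_REG(NOC_CFG(NIU_CFG_0), cfg | (1u << NIU_CFG_0_NOC_ID_TRANSLATE_EN));
}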
+
+#define ROUTER_CFG_0 0x1 // 0 = clk gt enable, [7:1] = clk gt hysteresis, [11:8] = max_backoff_exp, [15:12] = log2_basic_timeout_delay, [16] = router mem parity enable, [17] = packet header chk bits enable, [18] = packet header SECDED enable
+#define ROUTER_CFG_0_ECC_ROUTER_MEM_PARITY_EN 16
+#define ROUTER_CFG_0_ECC_HEADER_CHKBITS_EN 17
+#define ROUTER_CFG_0_ECC_HEADER_SECDED_EN 18
+#define ROUTER_CFG_1 0x2 // broadcast disable row
+#define ROUTER_CFG_2 0x3
+#define ROUTER_CFG_3 0x4 // broadcast disable column
+#define ROUTER_CFG_4 0x5
+
+#define NOC_TRANSLATE_ID_WIDTH 5
+#define NOC_TRANSLATE_TABLE_XY_SIZE (32/NOC_TRANSLATE_ID_WIDTH)
+
+#define NOC_X_ID_TRANSLATE_TABLE_0 0x6 // entries 0-5 in the X ID translation table (total 32 x 5 bit entries)
+#define NOC_X_ID_TRANSLATE_TABLE_1 0x7 // entries 6-11 in the X ID translation table (total 32 x 5 bit entries)
+#define NOC_X_ID_TRANSLATE_TABLE_2 0x8 // entries 12-17 in the X ID translation table (total 32 x 5 bit entries)
+#define NOC_X_ID_TRANSLATE_TABLE_3 0x9 // entries 18-23 in the X ID translation table (total 32 x 5 bit entries)
+#define NOC_X_ID_TRANSLATE_TABLE_4 0xA // entries 24-29 in the X ID translation table (total 32 x 5 bit entries)
+#define NOC_X_ID_TRANSLATE_TABLE_5 0xB // entries 30-31 in the X ID translation table (total 32 x 5 bit entries)
+
+#define NOC_Y_ID_TRANSLATE_TABLE_0 0xC // entries 0-5 in the Y ID translation table (total 32 x 5 bit entries)
+#define NOC_Y_ID_TRANSLATE_TABLE_1 0xD // entries 6-11 in the Y ID translation table (total 32 x 5 bit entries)
+#define NOC_Y_ID_TRANSLATE_TABLE_2 0xE // entries 12-17 in the Y ID translation table (total 32 x 5 bit entries)
+#define NOC_Y_ID_TRANSLATE_TABLE_3 0xF // entries 18-23 in the Y ID translation table (total 32 x 5 bit entries)
+#define NOC_Y_ID_TRANSLATE_TABLE_4 0x10 // entries 24-29 in the Y ID translation table (total 32 x 5 bit entries)
+#define NOC_Y_ID_TRANSLATE_TABLE_5 0x11 // entries 30-31 in the Y ID translation table (total 32 x 5 bit entries)
+
+#define NOC_ID_LOGICAL 0x12 // logical coordinates of the local NOC NIU if ID translation is enabled (format = {logical_y[5:0], logical_x[5:0]})
+#define MEMORY_SHUTDOWN_CONTROL 0x13 // controls Shutdown (bit0), Deepsleep (bit1), Retention Disable for Deepsleep (bit2)
+#define MEMORY_SD_BIT 0
+#define MEMORY_DSLP_BIT 1
+#define MEMORY_DSLPLV_BIT 2
+#define NOC_ID_TRANSLATE_COL_MASK 0x14 // mask indicating which columns ignore ID translation
+#define NOC_ID_TRANSLATE_ROW_MASK 0x15 // mask indicating which rows ignore ID translation
+#define DDR_COORD_TRANSLATE_TABLE_0 0x16 // entries 0- 5 in the DDR translation table (total 32 x 5 bit entries)
+#define DDR_COORD_TRANSLATE_TABLE_1 0x17 // entries 6-11 in the DDR translation table (total 32 x 5 bit entries)
+#define DDR_COORD_TRANSLATE_TABLE_2 0x18 // entries 12-17 in the DDR translation table (total 32 x 5 bit entries)
+#define DDR_COORD_TRANSLATE_TABLE_3 0x19 // entries 18-23 in the DDR translation table (total 32 x 5 bit entries)
+#define DDR_COORD_TRANSLATE_TABLE_4 0x1A // entries 24-29 in the DDR translation table (total 32 x 5 bit entries)
+#define DDR_COORD_TRANSLATE_TABLE_5 0x1B // entries 30-31 in the DDR translation table (total 32 x 5 bit entries)
+#define DDR_COORD_TRANSLATE_COL_SEL_WIDTH 2 //
+#define DDR_COORD_TRANSLATE_COL_SEL_EAST 10 // if bit is set, DDR translation applies to column 0.
+#define DDR_COORD_TRANSLATE_COL_SEL_WEST 11 // if bit is set, DDR translation applies to column 9.
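+
+// Worked example (added commentary): each translation-table config register
+// packs NOC_TRANSLATE_TABLE_XY_SIZE = 32/5 = 6 entries of
+// NOC_TRANSLATE_ID_WIDTH = 5 bits, so entry k of a table lives in config
+// register (base + k/6) at bit offset 5*(k%6); e.g. entry 13 of the X table
+// is bits [9:5] of NOC_X_ID_TRANSLATE_TABLE_2. A lookup sketch, assuming the
+// NOC_CFG_READ_REG helper from noc_nonblocking_api.h:
+//
+//   uint32_t noc_translate_entry(uint32_t noc, uint32_t table_base, uint32_t k) {
+//     uint32_t reg = NOC_CFG_READ_REG(noc, table_base + (k / NOC_TRANSLATE_TABLE_XY_SIZE));
+//     uint32_t shift = NOC_TRANSLATE_ID_WIDTH * (k % NOC_TRANSLATE_TABLE_XY_SIZE);
+//     return (reg >> shift) & ((0x1 << NOC_TRANSLATE_ID_WIDTH) - 1);
+//   }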
+#define DDR_COORD_TRANSLATE_COL_SWAP 0x1C // entries 30-31 in the DDR translation table (total 32 x 5 bit entries)
+
+#define DEBUG_COUNTER_RESET 0x1D // write 1 to reset counter; self-clearing, as a reset pulse is generated when written.
+                                 // bit 0 - resets ROUTER_OUTGOING_FLIT_COUNTER
                                 // bit 4 - clears CMD_BUFFER_FIFO_OVFL_FLAG
+#define ROUTER_OUTGOING_FLIT_COUNTER_BIT 0
+#define CMD_BUFFER_FIFO_OVFL_CLEAR_BIT 4
+#define NIU_TRANS_COUNT_RTZ_CFG 0x1E
+#define NIU_TRANS_COUNT_RTZ_CLR 0x1F
+
+/////
+
+// Flit types
+#define NOC_HEAD_FLIT 0x1
+#define NOC_BODY_FLIT 0x2
+#define NOC_TAIL_FLIT 0x4
+#define NOC_FLIT_TYPE_WIDTH 3
+
+// addr fields
+//MM Jul 21 2022: For backwards compatibility, all the BH NoC API functions
+//will accept a 36-bit address and left-pad it to 64-bits within the function
+#define NOC_ADDR_LOCAL_BITS /*64*/ 36
+#define NOC_ADDR_NODE_ID_BITS 6
+
+// NOC CMD fields
+#define NOC_CMD_AT (0x1 << 0)
+#define NOC_CMD_CPY (0x0 << 0)
+#define NOC_CMD_RD (0x0 << 1)
+#define NOC_CMD_WR (0x1 << 1)
+#define NOC_CMD_WR_BE (0x1 << 2)
+#define NOC_CMD_WR_INLINE (0x1 << 3)
+#define NOC_CMD_RESP_MARKED (0x1 << 4)
+#define NOC_CMD_BRCST_PACKET (0x1 << 5)
+#define NOC_CMD_VC_LINKED (0x1 << 6)
+#define NOC_CMD_VC_STATIC (0x1 << 7)
+#define NOC_CMD_PATH_RESERVE (0x1 << 8)
+#define NOC_CMD_MEM_RD_DROP_ACK (0x1 << 9)
+#define NOC_CMD_STATIC_VC(vc) (((uint32_t)(vc)) << 13)
+
+#define NOC_CMD_BRCST_XY(y) (((uint32_t)(y)) << 16)
+#define NOC_CMD_BRCST_SRC_INCLUDE (0x1 << 17)
+#define NOC_CMD_ARB_PRIORITY(p) (((uint32_t)(p)) << 27)
+#define NOC_CMD_L1_ACC_AT_EN (0x1 << 31)
+
+//
+// NOC CTRL fields
+#define NOC_CTRL_SEND_REQ (0x1 << 0)
+//
+#define NOC_CTRL_STATUS_READY 0x0
+// Atomic command codes
+#define NOC_AT_INS_NOP 0x0
+#define NOC_AT_INS_INCR_GET 0x1
+#define NOC_AT_INS_INCR_GET_PTR 0x2
+#define NOC_AT_INS_SWAP 0x3
+#define NOC_AT_INS_CAS 0x4
+#define NOC_AT_INS_GET_TILE_MAP 0x5
+#define NOC_AT_INS_STORE_IND 0x6
+#define NOC_AT_INS_SWAP_4B 0x7
+#define NOC_AT_INS_ACC 0x9
+
+#define NOC_AT_IND_32(index) ((index) << 0)
+#define NOC_AT_IND_32_SRC(index) ((index) << 10)
+#define NOC_AT_WRAP(wrap) ((wrap) << 2)
+//#define NOC_AT_INCR(incr) ((incr) << 6)
+#define NOC_AT_INS(ins) ((ins) << 12)
+#define NOC_AT_TILE_MAP_IND(ind) ((ind) << 2)
+#define NOC_AT_ACC_FORMAT(format) (((format) << 0) & 0x7)
+#define NOC_AT_ACC_SAT_DIS(dis) ((dis) << 3)
+
+///
+
+#define NOC_AT_ACC_FP32 0x0
+#define NOC_AT_ACC_FP16_A 0x1
+#define NOC_AT_ACC_FP16_B 0x2
+#define NOC_AT_ACC_INT32 0x3
+#define NOC_AT_ACC_INT32_COMPL 0x4
+#define NOC_AT_ACC_INT32_UNS 0x5
+#define NOC_AT_ACC_INT8 0x6
+
+///
+
+#define NOC_PACKET_TAG_TRANSACTION_ID(id) ((id) << 10)
+#define NOC_PACKET_TAG_HEADER_STORE (0x1 << 9)
+
+///
+
+#define NOC_DATA_WIDTH 512+3
+#define NOC_PAYLOAD_WIDTH 512
+#define NOC_WORD_BYTES (NOC_PAYLOAD_WIDTH/8)
+#define NOC_MAX_BURST_WORDS 256
+#define NOC_MAX_BURST_SIZE (NOC_MAX_BURST_WORDS*NOC_WORD_BYTES)
+//#define MEM_WORD_BYTES 16
+#define NOC_WORD_OFFSET_MASK (NOC_WORD_BYTES-1)
+
+#define MEM_DATA_WIDTH 128
+#define MEM_WORD_BYTES (MEM_DATA_WIDTH/8)
+#define MEM_WORD_OFFSET_MASK (MEM_WORD_BYTES-1)
+
+#define NOC_VCS 16
+
+#define NOC_BCAST_VC_START 4
+
+#define NOC_ROUTER_PORTS 3
+#define NOC_PORT_NIU 0
+#define NOC_PORT_X 1
+#define NOC_PORT_Y 2
+
+////
+
+#define NOC_NODE_ID_MASK ((((uint64_t)0x1) << NOC_ADDR_NODE_ID_BITS) - 1)
+#define NOC_LOCAL_ADDR_MASK ((((uint64_t)0x1) << NOC_ADDR_LOCAL_BITS) - 1)
+
+#define NOC_LOCAL_ADDR_OFFSET(addr) ((addr) & NOC_LOCAL_ADDR_MASK)
+
+#define NOC_UNICAST_ADDR_X(addr) (((addr) >> NOC_ADDR_LOCAL_BITS) & NOC_NODE_ID_MASK)
+#define NOC_UNICAST_ADDR_Y(addr) (((addr) >> (NOC_ADDR_LOCAL_BITS+NOC_ADDR_NODE_ID_BITS)) & NOC_NODE_ID_MASK)
+
+#define NOC_MCAST_ADDR_END_X(addr) (((addr) >> NOC_ADDR_LOCAL_BITS) & NOC_NODE_ID_MASK)
+#define NOC_MCAST_ADDR_END_Y(addr) (((addr) >> (NOC_ADDR_LOCAL_BITS+NOC_ADDR_NODE_ID_BITS)) & NOC_NODE_ID_MASK)
+#define NOC_MCAST_ADDR_START_X(addr) (((addr) >> (NOC_ADDR_LOCAL_BITS+2*NOC_ADDR_NODE_ID_BITS)) & NOC_NODE_ID_MASK)
+#define NOC_MCAST_ADDR_START_Y(addr) (((addr) >> (NOC_ADDR_LOCAL_BITS+3*NOC_ADDR_NODE_ID_BITS)) & NOC_NODE_ID_MASK)
+
+#define NOC_UNICAST_COORDINATE_Y(noc_coordinate) (((noc_coordinate) >> (1*NOC_ADDR_NODE_ID_BITS)) & NOC_NODE_ID_MASK)
+#define NOC_UNICAST_COORDINATE_X(noc_coordinate) (((noc_coordinate) >> (0*NOC_ADDR_NODE_ID_BITS)) & NOC_NODE_ID_MASK)
+
+#define NOC_MCAST_COORDINATE_START_Y(noc_coordinate) (((noc_coordinate) >> (3*NOC_ADDR_NODE_ID_BITS)) & NOC_NODE_ID_MASK)
+#define NOC_MCAST_COORDINATE_START_X(noc_coordinate) (((noc_coordinate) >> (2*NOC_ADDR_NODE_ID_BITS)) & NOC_NODE_ID_MASK)
+#define NOC_MCAST_COORDINATE_END_Y(noc_coordinate) (((noc_coordinate) >> (1*NOC_ADDR_NODE_ID_BITS)) & NOC_NODE_ID_MASK)
+#define NOC_MCAST_COORDINATE_END_X(noc_coordinate) (((noc_coordinate) >> (0*NOC_ADDR_NODE_ID_BITS)) & NOC_NODE_ID_MASK)
+
+// Address formats
+
+#define NOC_XY_ADDR(x, y, addr) \
+  ((((uint64_t)(y)) << (NOC_ADDR_LOCAL_BITS+NOC_ADDR_NODE_ID_BITS)) | \
+   (((uint64_t)(x)) << NOC_ADDR_LOCAL_BITS) | \
+   ((uint64_t)(addr)))
+
+#define NOC_XY_ENCODING(x, y) \
+  ((((uint64_t)(y)) << (NOC_ADDR_LOCAL_BITS+NOC_ADDR_NODE_ID_BITS)) | \
+   (((uint64_t)(x)) << NOC_ADDR_LOCAL_BITS))
+
+#define NOC_MULTICAST_ADDR(x_start, y_start, x_end, y_end, addr) \
+  ((((uint64_t)(x_start)) << (NOC_ADDR_LOCAL_BITS+2*NOC_ADDR_NODE_ID_BITS)) | \
+   (((uint64_t)(y_start)) << (NOC_ADDR_LOCAL_BITS+3*NOC_ADDR_NODE_ID_BITS)) | \
+   (((uint64_t)(x_end)) << NOC_ADDR_LOCAL_BITS) | \
+   (((uint64_t)(y_end)) << (NOC_ADDR_LOCAL_BITS+NOC_ADDR_NODE_ID_BITS)) | \
+   ((uint64_t)(addr)))
+
+#define NOC_MULTICAST_ENCODING(x_start, y_start, x_end, y_end) \
+  ((((uint64_t)(x_start)) << (NOC_ADDR_LOCAL_BITS+2*NOC_ADDR_NODE_ID_BITS)) | \
+   (((uint64_t)(y_start)) << (NOC_ADDR_LOCAL_BITS+3*NOC_ADDR_NODE_ID_BITS)) | \
+   (((uint64_t)(x_end)) << NOC_ADDR_LOCAL_BITS) | \
+   (((uint64_t)(y_end)) << (NOC_ADDR_LOCAL_BITS+NOC_ADDR_NODE_ID_BITS)))
+
+#define NOC_XY_COORD(x, y) \
+  ((((uint32_t)(y)) << NOC_ADDR_NODE_ID_BITS) | \
+   ((uint32_t)(x)))
+
+#define NOC_MULTICAST_COORD(x_start, y_start, x_end, y_end) \
+  ((((uint32_t)(y_start)) << (3*NOC_ADDR_NODE_ID_BITS)) | \
+   (((uint32_t)(x_start)) << (2*NOC_ADDR_NODE_ID_BITS)) | \
+   (((uint32_t)(y_end)) << (1*NOC_ADDR_NODE_ID_BITS)) | \
+   ((uint32_t)(x_end)))
+
+
+// Alignment restrictions
+// Should these be split for reads vs writes?
+#define NOC_L1_ALIGNMENT_BYTES 16
+#define NOC_PCIE_ALIGNMENT_BYTES 32
+#define NOC_DRAM_ALIGNMENT_BYTES 64
+
+#endif
diff --git a/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h b/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h
new file mode 100644
index 000000000000..20275347ffc5
--- /dev/null
+++ b/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h
@@ -0,0 +1,472 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <stdint.h>
+
+#include "noc_parameters.h"
+#include "risc_attribs.h"
+
+#ifdef RISC_B0_HW
+const uint32_t NCRISC_WR_CMD_BUF = 3;
+const uint32_t NCRISC_WR_CMD_BUF_0 = 0;
+const uint32_t NCRISC_WR_CMD_BUF_1 = 1;
+const uint32_t NCRISC_SMALL_TXN_CMD_BUF = 3;
+#else
+const uint32_t NCRISC_WR_CMD_BUF = 0;
+const uint32_t NCRISC_WR_CMD_BUF_0 = 0;
+const uint32_t NCRISC_WR_CMD_BUF_1 = 1;
+const uint32_t NCRISC_WR_CMD_BUF_2 = 2;
+const uint32_t NCRISC_SMALL_TXN_CMD_BUF = 3;
+#endif
+
+const uint32_t NCRISC_WR_DEF_TRID = 0;
+const uint32_t NCRISC_WR_LOCAL_TRID = 1;
+const uint32_t NCRISC_RD_DEF_TRID = 2;
+const uint32_t NCRISC_HEADER_RD_TRID = 3;
+const uint32_t NCRISC_RD_START_TRID = 4;
+const uint32_t NCRISC_RD_END_TRID = 13;
+const uint32_t NCRISC_ETH_START_TRID = 14;
+const uint32_t NCRISC_ETH_END_TRID = 15;
+
+extern uint32_t noc_reads_num_issued[NUM_NOCS];
+extern uint32_t noc_nonposted_writes_num_issued[NUM_NOCS];
+extern uint32_t noc_nonposted_writes_acked[NUM_NOCS];
+extern uint32_t noc_xy_local_addr[NUM_NOCS];
+
+inline void NOC_CMD_BUF_WRITE_REG(uint32_t noc, uint32_t buf, uint32_t addr, uint32_t val) {
+  uint32_t offset = (buf << NOC_CMD_BUF_OFFSET_BIT) + (noc << NOC_INSTANCE_OFFSET_BIT) + addr;
+  volatile uint32_t* ptr = (volatile uint32_t*)offset;
+  *ptr = val;
+}
+
+
+inline uint32_t NOC_CMD_BUF_READ_REG(uint32_t noc, uint32_t buf, uint32_t addr) {
+  uint32_t offset = (buf << NOC_CMD_BUF_OFFSET_BIT) + (noc << NOC_INSTANCE_OFFSET_BIT) + addr;
+  volatile uint32_t tt_reg_ptr * ptr = (volatile uint32_t tt_reg_ptr *)offset;
+  return *ptr;
+}
+
+
+inline uint32_t NOC_STATUS_READ_REG(uint32_t noc, uint32_t reg_id) {
+  uint32_t offset = (noc << NOC_INSTANCE_OFFSET_BIT) + NOC_STATUS(reg_id);
+  volatile uint32_t tt_reg_ptr * ptr = (volatile uint32_t tt_reg_ptr *)offset;
+  return *ptr;
+}
+
+inline __attribute__((section("code_l1"))) void NOC_CMD_BUF_WRITE_REG_L1(uint32_t noc, uint32_t buf, uint32_t addr, uint32_t val) {
+  uint32_t offset = (buf << NOC_CMD_BUF_OFFSET_BIT) + (noc << NOC_INSTANCE_OFFSET_BIT) + addr;
+  volatile uint32_t* ptr = (volatile uint32_t*)offset;
+  *ptr = val;
+}
+
+
+inline __attribute__((section("code_l1"))) uint32_t NOC_CMD_BUF_READ_REG_L1(uint32_t noc, uint32_t buf, uint32_t addr) {
+  uint32_t offset = (buf << NOC_CMD_BUF_OFFSET_BIT) + (noc << NOC_INSTANCE_OFFSET_BIT) + addr;
+  volatile uint32_t* ptr = (volatile uint32_t*)offset;
+  return *ptr;
+}
+
+
+inline __attribute__((section("code_l1"))) uint32_t NOC_STATUS_READ_REG_L1(uint32_t noc, uint32_t reg_id) {
+  uint32_t offset = (noc << NOC_INSTANCE_OFFSET_BIT) + NOC_STATUS(reg_id);
+  volatile uint32_t tt_reg_ptr * ptr = (volatile uint32_t tt_reg_ptr *)offset;
+  return *ptr;
+}
+
+
+inline uint32_t NOC_CFG_READ_REG(uint32_t noc, uint32_t reg_id) {
+  uint32_t offset = (noc << NOC_INSTANCE_OFFSET_BIT) + NOC_CFG(reg_id);
+  volatile uint32_t tt_reg_ptr * ptr = (volatile uint32_t tt_reg_ptr *)offset;
+  return *ptr;
+}
+
+inline void ncrisc_noc_fast_read(uint32_t noc, uint32_t cmd_buf, uint64_t src_addr, uint32_t dest_addr, uint32_t len_bytes, uint32_t transaction_id) {
+  while (NOC_STATUS_READ_REG(noc, NIU_MST_REQS_OUTSTANDING_ID(transaction_id)) > ((NOC_MAX_TRANSACTION_ID_COUNT+1)/2));
+
+  if (len_bytes > 0) {
+    //word offset noc cmd interface
+    uint32_t noc_rd_cmd_field = NOC_CMD_CPY | NOC_CMD_RD | NOC_CMD_RESP_MARKED | NOC_CMD_VC_STATIC | NOC_CMD_STATIC_VC(1);
+    uint32_t offset = (cmd_buf << NOC_CMD_BUF_OFFSET_BIT) + (noc << NOC_INSTANCE_OFFSET_BIT);
+    volatile uint32_t* ptr = (volatile uint32_t*)offset;
+
+    ptr[NOC_RET_ADDR_LO >> 2] = dest_addr;
+    ptr[NOC_RET_ADDR_MID >> 2] = 0x0;
+    ptr[NOC_RET_ADDR_HI >> 2] = noc_xy_local_addr[noc];
+    ptr[NOC_CTRL >> 2] = noc_rd_cmd_field;
+    ptr[NOC_TARG_ADDR_LO >> 2] = (uint32_t)src_addr;
+    ptr[NOC_TARG_ADDR_MID >> 2] = (uint32_t)(src_addr >> 32) & 0xF;
+    ptr[NOC_TARG_ADDR_HI >> 2] = (uint32_t)(src_addr >> 36) & 0xFFFFFF;
+    ptr[NOC_PACKET_TAG >> 2] = NOC_PACKET_TAG_TRANSACTION_ID(transaction_id);
+    ptr[NOC_AT_LEN_BE >> 2] = len_bytes;
+    ptr[NOC_CMD_CTRL >> 2] = NOC_CTRL_SEND_REQ;
+  }
+}
+
+inline __attribute__((always_inline)) void ncrisc_noc_fast_read_scatter(uint32_t noc, uint32_t cmd_buf, uint64_t src_addr, uint32_t dest_addr, uint32_t len_bytes, uint32_t transaction_id) {
+  while (NOC_STATUS_READ_REG(noc, NIU_MST_REQS_OUTSTANDING_ID(transaction_id)) > ((NOC_MAX_TRANSACTION_ID_COUNT+1)/2));
+
+  if (len_bytes > 0) {
+    //word offset noc cmd interface
+    uint32_t noc_rd_cmd_field = NOC_CMD_CPY | NOC_CMD_RD | NOC_CMD_RESP_MARKED | NOC_CMD_VC_STATIC | NOC_CMD_STATIC_VC(1);
+    uint32_t offset = (cmd_buf << NOC_CMD_BUF_OFFSET_BIT) + (noc << NOC_INSTANCE_OFFSET_BIT);
+    volatile uint32_t* ptr = (volatile uint32_t*)offset;
+
+    ptr[NOC_RET_ADDR_LO >> 2] = dest_addr;
+    ptr[NOC_RET_ADDR_MID >> 2] = 0x0;
+    ptr[NOC_RET_ADDR_HI >> 2] = noc_xy_local_addr[noc];
+    ptr[NOC_CTRL >> 2] = noc_rd_cmd_field;
+    ptr[NOC_TARG_ADDR_LO >> 2] = (uint32_t)src_addr;
+    ptr[NOC_TARG_ADDR_MID >> 2] = (uint32_t)(src_addr >> 32) & 0xF;
+    ptr[NOC_TARG_ADDR_HI >> 2] = (uint32_t)(src_addr >> 36) & 0xFFFFFF;
+    ptr[NOC_PACKET_TAG >> 2] = NOC_PACKET_TAG_TRANSACTION_ID(transaction_id);
+    ptr[NOC_AT_LEN_BE >> 2] = len_bytes;
+    ptr[NOC_CMD_CTRL >> 2] = NOC_CTRL_SEND_REQ;
+  }
+}
+
+
+void __attribute__((section("code_l1"))) ncrisc_noc_fast_read_l1(uint32_t noc, uint32_t cmd_buf, uint64_t src_addr, uint32_t dest_addr, uint32_t len_bytes, uint32_t transaction_id);
+
+inline bool ncrisc_noc_reads_flushed(uint32_t noc, uint32_t transaction_id) {
+  return (NOC_STATUS_READ_REG(noc, NIU_MST_REQS_OUTSTANDING_ID(transaction_id)) == 0);
+}
+
+inline __attribute__((always_inline)) __attribute__((section("code_l1"))) bool ncrisc_noc_reads_flushed_l1(uint32_t noc, uint32_t transaction_id) {
+  return (NOC_STATUS_READ_REG_L1(noc, NIU_MST_REQS_OUTSTANDING_ID(transaction_id)) == 0);
+}
+
+inline bool ncrisc_noc_all_reads_flushed(uint32_t noc) {
+  bool all_flushed = true;
+  for (uint32_t id = NCRISC_RD_DEF_TRID; id <= NCRISC_RD_END_TRID; id++) {
+    all_flushed &= NOC_STATUS_READ_REG(noc, NIU_MST_REQS_OUTSTANDING_ID(id)) == 0;
+  }
+  return all_flushed;
+}
+
+inline __attribute__((always_inline)) __attribute__((section("code_l1"))) bool ncrisc_noc_all_reads_flushed_l1(uint32_t noc) {
+  bool all_flushed = true;
+  for (uint32_t id = NCRISC_RD_DEF_TRID; id <= NCRISC_RD_END_TRID; id++) {
+    all_flushed &= NOC_STATUS_READ_REG_L1(noc, NIU_MST_REQS_OUTSTANDING_ID(id)) == 0;
+  }
+  return all_flushed;
+}
+
+inline bool ncrisc_noc_fast_read_ok(uint32_t noc, uint32_t cmd_buf) {
+  return (NOC_CMD_BUF_READ_REG(noc, cmd_buf, NOC_CMD_CTRL) == NOC_CTRL_STATUS_READY);
+}
+
+inline __attribute__((always_inline)) __attribute__((section("code_l1"))) bool ncrisc_noc_fast_read_ok_l1(uint32_t noc, uint32_t cmd_buf) {
+  return (NOC_CMD_BUF_READ_REG_L1(noc, cmd_buf, NOC_CMD_CTRL) == NOC_CTRL_STATUS_READY);
+}
+
+inline __attribute__((always_inline)) uint32_t ncrisc_rd_data_word_recv(uint32_t noc) {
+  return NOC_STATUS_READ_REG(noc, NIU_MST_RD_DATA_WORD_RECEIVED);
+}
+
+inline void ncrisc_noc_clear_outstanding_reqs(uint32_t noc, uint32_t transaction_id) {
+  NOC_CMD_BUF_WRITE_REG(noc, 0, NOC_CLEAR_OUTSTANDING_REQ_CNT, 0x1 << transaction_id);
+}
+
+inline void ncrisc_noc_fast_write(uint32_t noc, uint32_t cmd_buf, uint32_t src_addr, uint64_t dest_addr, uint32_t len_bytes, uint32_t vc, bool mcast, bool linked, uint32_t num_dests, uint32_t transaction_id) {
+  while (NOC_STATUS_READ_REG(noc, NIU_MST_REQS_OUTSTANDING_ID(transaction_id)) > ((NOC_MAX_TRANSACTION_ID_COUNT+1)/2));
+
+  if (len_bytes > 0) {
+    uint32_t noc_cmd_field =
+      NOC_CMD_CPY | NOC_CMD_WR |
+      NOC_CMD_VC_STATIC |
+      NOC_CMD_STATIC_VC(vc) |
+      (linked ? NOC_CMD_VC_LINKED : 0x0) |
+      (mcast ? (NOC_CMD_PATH_RESERVE | NOC_CMD_BRCST_PACKET) : 0x0) |
+      NOC_CMD_RESP_MARKED;
+
+    //word offset noc cmd interface
+    uint32_t offset = (cmd_buf << NOC_CMD_BUF_OFFSET_BIT) + (noc << NOC_INSTANCE_OFFSET_BIT);
+    volatile uint32_t* ptr = (volatile uint32_t*)offset;
+    ptr[NOC_CTRL >> 2] = noc_cmd_field;
+    ptr[NOC_TARG_ADDR_LO >> 2] = src_addr;
+    ptr[NOC_TARG_ADDR_MID >> 2] = 0x0;
+    ptr[NOC_TARG_ADDR_HI >> 2] = noc_xy_local_addr[noc];
+    ptr[NOC_RET_ADDR_LO >> 2] = (uint32_t)dest_addr;
+    ptr[NOC_RET_ADDR_MID >> 2] = (uint32_t)(dest_addr >> 32) & 0xF;
+    ptr[NOC_RET_ADDR_HI >> 2] = (uint32_t)(dest_addr >> 36) & 0xFFFFFF;
+    ptr[NOC_PACKET_TAG >> 2] = NOC_PACKET_TAG_TRANSACTION_ID(transaction_id);
+    ptr[NOC_AT_LEN_BE >> 2] = len_bytes;
+    ptr[NOC_CMD_CTRL >> 2] = NOC_CTRL_SEND_REQ;
+  }
+}
+
+void __attribute__((section("code_l1"))) ncrisc_noc_fast_write_l1(uint32_t noc, uint32_t cmd_buf, uint32_t src_addr, uint64_t dest_addr, uint32_t len_bytes, uint32_t vc, bool mcast, bool linked, uint32_t num_dests, uint32_t transaction_id);
+
+inline bool ncrisc_noc_fast_write_ok(uint32_t noc, uint32_t cmd_buf) {
+  return (NOC_CMD_BUF_READ_REG(noc, cmd_buf, NOC_CMD_CTRL) == NOC_CTRL_STATUS_READY);
+}
+
+#ifdef RISC_B0_HW
+inline bool ncrisc_noc_fast_write_bufs_ok(uint32_t noc) {
+  return (NOC_CMD_BUF_READ_REG(noc, NCRISC_WR_CMD_BUF, NOC_CMD_CTRL) == NOC_CTRL_STATUS_READY);
+}
+#else
+inline bool ncrisc_noc_fast_write_bufs_ok(uint32_t noc) {
+  //word offset between cmd buffers
+  uint32_t cmd_buf_offset = 0x1 << (NOC_CMD_BUF_OFFSET_BIT - 2);
+  uint32_t offset = (noc << NOC_INSTANCE_OFFSET_BIT) + NOC_CMD_CTRL;
+  uint32_t* ptr = (uint32_t*)offset;
+
+  uint32_t a = ptr[0];
+  ptr += cmd_buf_offset;
+  uint32_t ok = a;
+  uint32_t b = ptr[0];
+  ptr += cmd_buf_offset;
+  ok += b;
+  uint32_t c = ptr[0];
+  ok += c;
+
+  return (ok == NOC_CTRL_STATUS_READY);
+}
+#endif
+
+inline __attribute__((always_inline)) __attribute__((section("code_l1"))) bool ncrisc_noc_fast_write_ok_l1(uint32_t noc, uint32_t cmd_buf) {
+  return (NOC_CMD_BUF_READ_REG_L1(noc, cmd_buf, NOC_CMD_CTRL) == NOC_CTRL_STATUS_READY);
+}
+
+inline void ncrisc_noc_blitz_write_setup(uint32_t noc, uint32_t cmd_buf, uint64_t dest_addr, uint32_t len_bytes, uint32_t vc, uint32_t num_times_to_write, uint32_t transaction_id) {
+  uint32_t noc_cmd_field =
+    NOC_CMD_CPY | NOC_CMD_WR |
+    NOC_CMD_VC_STATIC |
+    NOC_CMD_STATIC_VC(vc) |
+    NOC_CMD_RESP_MARKED;
+
+  while (!ncrisc_noc_fast_write_ok(noc, cmd_buf));
+  NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CTRL, noc_cmd_field);
+  NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_AT_LEN_BE, len_bytes);
+  NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_MID, (uint32_t)(dest_addr >> 32) & 0xF);
+  NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_HI, (uint32_t)(dest_addr >> 36) & 0xFFFFFF);
+  NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_PACKET_TAG, NOC_PACKET_TAG_TRANSACTION_ID(transaction_id));
+}
+
+inline __attribute__((always_inline)) void ncrisc_noc_blitz_write(uint32_t noc, uint32_t cmd_buf, uint32_t src_addr, uint32_t dest_addr) {
+  NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_TARG_ADDR_LO, src_addr);
+  NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_LO, dest_addr);
+  NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ);
+}
+
+inline bool ncrisc_noc_nonposted_writes_sent(uint32_t noc, uint32_t transaction_id) {
+  return (NOC_STATUS_READ_REG(noc, NIU_MST_WRITE_REQS_OUTGOING_ID(transaction_id)) == 0);
+}
+
+inline __attribute__((always_inline)) __attribute__((section("code_l1"))) bool ncrisc_noc_nonposted_writes_sent_l1(uint32_t noc, uint32_t transaction_id) {
+  return (NOC_STATUS_READ_REG_L1(noc, NIU_MST_WRITE_REQS_OUTGOING_ID(transaction_id)) == 0);
+}
+
+inline bool ncrisc_noc_nonposted_all_writes_sent(uint32_t noc) {
+  bool all_sent = true;
+  for (uint32_t id = NCRISC_WR_DEF_TRID; id <= NCRISC_WR_LOCAL_TRID; id++) {
+    all_sent &= NOC_STATUS_READ_REG(noc, NIU_MST_WRITE_REQS_OUTGOING_ID(id)) == 0;
+  }
+  return all_sent;
+}
+
+inline __attribute__((always_inline)) __attribute__((section("code_l1"))) bool ncrisc_noc_nonposted_all_writes_sent_l1(uint32_t noc) {
+  bool all_sent = true;
+  for (uint32_t id = NCRISC_WR_DEF_TRID; id <= NCRISC_WR_LOCAL_TRID; id++) {
+    all_sent &= NOC_STATUS_READ_REG_L1(noc, NIU_MST_WRITE_REQS_OUTGOING_ID(id)) == 0;
+  }
+  return all_sent;
+}
+
+inline bool ncrisc_noc_nonposted_writes_flushed(uint32_t noc, uint32_t transaction_id) {
+  return (NOC_STATUS_READ_REG(noc, NIU_MST_REQS_OUTSTANDING_ID(transaction_id)) == 0);
+}
+
+inline __attribute__((always_inline)) __attribute__((section("code_l1"))) bool ncrisc_noc_nonposted_writes_flushed_l1(uint32_t noc, uint32_t transaction_id) {
+  return (NOC_STATUS_READ_REG_L1(noc, NIU_MST_REQS_OUTSTANDING_ID(transaction_id)) == 0);
+}
+
+inline bool ncrisc_noc_nonposted_all_writes_flushed(uint32_t noc) {
+  bool all_flushed = true;
+  for (uint32_t id = NCRISC_WR_DEF_TRID; id <= NCRISC_WR_LOCAL_TRID; id++) {
+    all_flushed &= NOC_STATUS_READ_REG(noc, NIU_MST_REQS_OUTSTANDING_ID(id)) == 0;
+  }
+  return all_flushed;
+}
+
+inline __attribute__((always_inline)) __attribute__((section("code_l1"))) bool ncrisc_noc_nonposted_all_writes_flushed_l1(uint32_t noc) {
+  bool all_flushed = true;
+  for (uint32_t id = NCRISC_WR_DEF_TRID; id <= NCRISC_WR_LOCAL_TRID; id++) {
+    all_flushed &= NOC_STATUS_READ_REG_L1(noc, NIU_MST_REQS_OUTSTANDING_ID(id)) == 0;
+  }
+  return all_flushed;
+}
+
+
+inline void ncrisc_noc_init() {
+  for (int noc = 0; noc < NUM_NOCS; noc++) {
+    uint32_t noc_id_reg = NOC_CMD_BUF_READ_REG(noc, 0, NOC_NODE_ID);
+    uint32_t my_x = noc_id_reg & NOC_NODE_ID_MASK;
+    uint32_t my_y = (noc_id_reg >> NOC_ADDR_NODE_ID_BITS) & NOC_NODE_ID_MASK;
+    uint32_t xy_coord = NOC_XY_COORD(my_x, my_y);
+
+    noc_xy_local_addr[noc] = xy_coord;
+    NOC_CMD_BUF_WRITE_REG(noc, NCRISC_WR_CMD_BUF_0, NOC_TARG_ADDR_HI, xy_coord);
+    NOC_CMD_BUF_WRITE_REG(noc, NCRISC_WR_CMD_BUF_1, NOC_TARG_ADDR_HI, xy_coord);
+#ifndef RISC_B0_HW
+    NOC_CMD_BUF_WRITE_REG(noc, NCRISC_WR_CMD_BUF_2, NOC_TARG_ADDR_HI, xy_coord);
+#endif
+  }
+}
+
+inline void ncrisc_noc_counters_init() {
+}
+
+inline bool ncrisc_noc_all_flushed(uint32_t noc) {
+  bool all_flushed = true;
+  for (uint32_t id = 0; id <= NOC_MAX_TRANSACTION_ID; id++) {
+    all_flushed &= NOC_STATUS_READ_REG(noc, NIU_MST_REQS_OUTSTANDING_ID(id)) == 0;
+  }
+  return all_flushed;
+}
+
+inline void ncrisc_noc_full_sync() {
+  for (uint32_t n = 0; n < NUM_NOCS; n++) {
+    while (!ncrisc_noc_all_flushed(n));
+  }
+}
+
+#ifdef RISC_B0_HW
+inline void ncrisc_noc_fast_read_any_len(uint32_t noc, uint32_t cmd_buf, uint64_t src_addr, uint32_t dest_addr, uint32_t len_bytes, uint32_t transaction_id) {
+  while (!ncrisc_noc_fast_read_ok(noc, cmd_buf));
+  ncrisc_noc_fast_read(noc, cmd_buf, src_addr, dest_addr, len_bytes, transaction_id);
+}
+
+inline __attribute__((always_inline)) void ncrisc_noc_fast_read_any_len_scatter(uint32_t noc, uint32_t cmd_buf, uint64_t src_addr, uint32_t dest_addr, uint32_t len_bytes, uint32_t transaction_id) {
+  while (NOC_CMD_BUF_READ_REG(noc, cmd_buf, NOC_CMD_CTRL) != NOC_CTRL_STATUS_READY); //while (!ncrisc_noc_fast_read_ok(noc, cmd_buf));
+  ncrisc_noc_fast_read_scatter(noc, cmd_buf, src_addr, dest_addr, len_bytes, transaction_id);
+}
+
+void __attribute__((section("code_l1"))) ncrisc_noc_fast_read_any_len_l1(uint32_t noc, uint32_t cmd_buf, uint64_t src_addr, uint32_t dest_addr, uint32_t len_bytes, uint32_t transaction_id);
+#else
+inline void ncrisc_noc_fast_read_any_len(uint32_t noc, uint32_t cmd_buf, uint64_t src_addr, uint32_t dest_addr, uint32_t len_bytes, uint32_t transaction_id) {
+  while (len_bytes > NOC_MAX_BURST_SIZE) {
+    while (!ncrisc_noc_fast_read_ok(noc, cmd_buf));
+    ncrisc_noc_fast_read(noc, cmd_buf, src_addr, dest_addr, NOC_MAX_BURST_SIZE, transaction_id);
+    src_addr += NOC_MAX_BURST_SIZE;
+    dest_addr += NOC_MAX_BURST_SIZE;
+    len_bytes -= NOC_MAX_BURST_SIZE;
+  }
+  while (!ncrisc_noc_fast_read_ok(noc, cmd_buf));
+  ncrisc_noc_fast_read(noc, cmd_buf, src_addr, dest_addr, len_bytes, transaction_id);
+}
+
+inline __attribute__((always_inline)) void ncrisc_noc_fast_read_any_len_scatter(uint32_t noc, uint32_t cmd_buf, uint64_t src_addr, uint32_t dest_addr, uint32_t len_bytes, uint32_t transaction_id) {
+  while (len_bytes > NOC_MAX_BURST_SIZE) {
+    while (!ncrisc_noc_fast_read_ok(noc, cmd_buf));
+    ncrisc_noc_fast_read_scatter(noc, cmd_buf, src_addr, dest_addr, NOC_MAX_BURST_SIZE, transaction_id);
+    src_addr += NOC_MAX_BURST_SIZE;
+    dest_addr += NOC_MAX_BURST_SIZE;
+    len_bytes -= NOC_MAX_BURST_SIZE;
+  }
+  while (!ncrisc_noc_fast_read_ok(noc, cmd_buf));
+  ncrisc_noc_fast_read_scatter(noc, cmd_buf, src_addr, dest_addr, len_bytes, transaction_id);
+}
+
+void __attribute__((section("code_l1"))) ncrisc_noc_fast_read_any_len_l1(uint32_t noc, uint32_t cmd_buf, uint64_t src_addr, uint32_t dest_addr, uint32_t len_bytes, uint32_t transaction_id);
+#endif
+
+inline void ncrisc_noc_fast_write_any_len(uint32_t noc, uint32_t cmd_buf, uint32_t src_addr, uint64_t dest_addr, uint32_t len_bytes, uint32_t vc, bool mcast, bool linked, uint32_t num_dests, uint32_t transaction_id) {
+  while (len_bytes > NOC_MAX_BURST_SIZE) {
+    while (!ncrisc_noc_fast_write_ok(noc, cmd_buf));
+    ncrisc_noc_fast_write(noc, cmd_buf, src_addr, dest_addr, NOC_MAX_BURST_SIZE, vc, mcast, linked, num_dests, transaction_id);
+    src_addr += NOC_MAX_BURST_SIZE;
+    dest_addr += NOC_MAX_BURST_SIZE;
+    len_bytes -= NOC_MAX_BURST_SIZE;
+    if (!ncrisc_noc_fast_write_ok(noc, cmd_buf)) {
+      cmd_buf++;
+      if (cmd_buf >= NCRISC_SMALL_TXN_CMD_BUF) cmd_buf = NCRISC_WR_CMD_BUF;
+    }
+  }
+  while (!ncrisc_noc_fast_write_ok(noc, cmd_buf));
+  ncrisc_noc_fast_write(noc, cmd_buf, src_addr, dest_addr, len_bytes, vc, mcast, linked, num_dests, transaction_id);
+}
+
+void __attribute__((section("code_l1"))) ncrisc_noc_fast_write_any_len_l1(uint32_t noc, uint32_t cmd_buf, uint32_t src_addr, uint64_t dest_addr, uint32_t len_bytes, uint32_t vc, bool mcast, bool linked, uint32_t num_dests, uint32_t transaction_id);
+
+inline void noc_fast_posted_write_dw_inline(uint32_t noc, uint32_t cmd_buf, uint32_t val, uint64_t dest_addr, uint32_t be, uint32_t static_vc, bool mcast) {
+  bool posted = true;
+  bool static_vc_alloc = true;
+  uint32_t noc_cmd_field =
+    (static_vc_alloc ? NOC_CMD_VC_STATIC : 0x0) |
+    NOC_CMD_STATIC_VC(static_vc) |
+    NOC_CMD_CPY | NOC_CMD_WR |
+    NOC_CMD_WR_INLINE |
+    (mcast ? (NOC_CMD_PATH_RESERVE | NOC_CMD_BRCST_PACKET) : 0x0) |
+    (posted ? 0x0 : NOC_CMD_RESP_MARKED);
+
+  uint32_t be32 = be;
+  uint32_t be_shift = (dest_addr & (NOC_WORD_BYTES-1));
+  be32 = (be32 << be_shift);
+
+  while (NOC_CMD_BUF_READ_REG(noc, cmd_buf, NOC_CMD_CTRL) != NOC_CTRL_STATUS_READY);
+  NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_AT_DATA, val);
+  NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CTRL, noc_cmd_field);
+  NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_TARG_ADDR_LO, (uint32_t)(dest_addr & ~(NOC_WORD_BYTES-1)));
+  NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_TARG_ADDR_MID, (dest_addr >> 32) & 0xF);
+  NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_TARG_ADDR_HI, (dest_addr >> 36) & 0xFFFFFF);
+  NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_AT_LEN_BE, be32);
+  NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_AT_LEN_BE_1, 0x0);
+  NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ);
+}
+
+inline void noc_atomic_read_and_increment(uint32_t noc, uint32_t cmd_buf, uint64_t addr, uint32_t incr, uint32_t wrap, uint64_t read_addr, bool linked, uint32_t transaction_id) {
+
+  while (NOC_STATUS_READ_REG(noc, NIU_MST_REQS_OUTSTANDING_ID(transaction_id)) > ((NOC_MAX_TRANSACTION_ID_COUNT+1)/2));
+
+  uint32_t offset = (cmd_buf << NOC_CMD_BUF_OFFSET_BIT) + (noc << NOC_INSTANCE_OFFSET_BIT);
+  volatile uint32_t* ptr = (volatile uint32_t*)offset;
+  uint32_t atomic_resp = NOC_STATUS_READ_REG(noc, NIU_MST_ATOMIC_RESP_RECEIVED);
+
+  ptr[NOC_TARG_ADDR_LO >> 2] = (uint32_t)(addr & 0xFFFFFFFF);
+  ptr[NOC_TARG_ADDR_MID >> 2] = (uint32_t)(addr >> 32) & 0xF;
+  ptr[NOC_TARG_ADDR_HI >> 2] = (uint32_t)(addr >> 36) & 0xFFFFFF;
+  ptr[NOC_PACKET_TAG >> 2] = NOC_PACKET_TAG_TRANSACTION_ID(transaction_id);
+  ptr[NOC_RET_ADDR_LO >> 2] = (uint32_t)(read_addr & 0xFFFFFFFF);
+  ptr[NOC_RET_ADDR_MID >> 2] = (uint32_t)(read_addr >> 32) & 0xF;
+  ptr[NOC_RET_ADDR_HI >> 2] = (uint32_t)(read_addr >> 36) & 0xFFFFFF;
+  ptr[NOC_CTRL >> 2] = (linked ? NOC_CMD_VC_LINKED : 0x0) |
+                       NOC_CMD_AT |
+                       NOC_CMD_RESP_MARKED;
+  ptr[NOC_AT_LEN_BE >> 2] = NOC_AT_INS(NOC_AT_INS_INCR_GET) | NOC_AT_WRAP(wrap) | NOC_AT_IND_32((addr>>2) & 0x3) | NOC_AT_IND_32_SRC(0);
+  ptr[NOC_AT_DATA >> 2] = incr;
+  ptr[NOC_CMD_CTRL >> 2] = NOC_CTRL_SEND_REQ;
+
+  atomic_resp++;
+  while (atomic_resp != NOC_STATUS_READ_REG(noc, NIU_MST_ATOMIC_RESP_RECEIVED));
+}
+
+void __attribute__((section("code_l1"))) noc_atomic_read_and_increment_l1(uint32_t noc, uint32_t cmd_buf, uint64_t addr, uint32_t incr, uint32_t wrap, uint64_t read_addr, bool linked, uint32_t transaction_id);
+
+/*
+inline void noc_fast_atomic_increment(uint32_t noc, uint32_t cmd_buf, uint64_t addr, uint32_t incr, uint32_t wrap, bool linked) {
+  NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_TARG_ADDR_LO, (uint32_t)(addr & 0xFFFFFFFF));
+  NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_TARG_ADDR_MID, (uint32_t)(addr >> 32) & 0xF);
+  NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_TARG_ADDR_HI, (uint32_t)(addr >> 36) & 0xFFFFFF);
+  NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CTRL, (linked ? NOC_CMD_VC_LINKED : 0x0) | NOC_CMD_AT);
+  NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_AT_LEN_BE, NOC_AT_INS(NOC_AT_INS_INCR_GET) | NOC_AT_WRAP(wrap) | NOC_AT_IND_32((addr>>2) & 0x3) | NOC_AT_IND_32_SRC(0));
+  NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_AT_DATA, incr);
+  NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CMD_CTRL, 0x1);
+}
+*/
+
+/*
+inline void noc_fast_atomic_increment_l1(uint32_t noc, uint32_t cmd_buf, uint64_t addr, uint32_t incr, uint32_t wrap, bool linked) {
+  NOC_CMD_BUF_WRITE_REG_L1(noc, cmd_buf, NOC_TARG_ADDR_LO, (uint32_t)(addr & 0xFFFFFFFF));
+  NOC_CMD_BUF_WRITE_REG_L1(noc, cmd_buf, NOC_TARG_ADDR_MID, (uint32_t)(addr >> 32) & 0xF);
+  NOC_CMD_BUF_WRITE_REG_L1(noc, cmd_buf, NOC_TARG_ADDR_HI, (uint32_t)(addr >> 36) & 0xFFFFFF);
+  NOC_CMD_BUF_WRITE_REG_L1(noc, cmd_buf, NOC_CTRL, (linked ? NOC_CMD_VC_LINKED : 0x0) | NOC_CMD_AT);
+  NOC_CMD_BUF_WRITE_REG_L1(noc, cmd_buf, NOC_AT_LEN_BE, NOC_AT_INS(NOC_AT_INS_INCR_GET) | NOC_AT_WRAP(wrap) | NOC_AT_IND_32((addr>>2) & 0x3) | NOC_AT_IND_32_SRC(0));
+  NOC_CMD_BUF_WRITE_REG_L1(noc, cmd_buf, NOC_AT_DATA, incr);
+  NOC_CMD_BUF_WRITE_REG_L1(noc, cmd_buf, NOC_CMD_CTRL, 0x1);
+}
+*/
diff --git a/tt_metal/hw/inc/blackhole/risc_chip_specific.h b/tt_metal/hw/inc/blackhole/risc_chip_specific.h
new file mode 100644
index 000000000000..3a16b33c1312
--- /dev/null
+++ b/tt_metal/hw/inc/blackhole/risc_chip_specific.h
@@ -0,0 +1,142 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <stdint.h>
+#include "noc_parameters.h"
+#include "noc_nonblocking_api.h"
+#include "risc.h"
+#include "unpack_pack_stream_intf.h"
+#include "dram_stream_intf.h"
+#include "risc_common.h"
+#include "epoch.h"
+
+////
+
+const uint32_t PTR_UPDATE_TYPE_WR_PTR_UPDATE = 1 << 23;
+const uint32_t PTR_UPDATE_TYPE_EPOCH_W_STRIDE = 1 << 23;
+const uint32_t PTR_UPDATE_TYPE_EPOCH = 1 << 22;
+const uint32_t PTR_UPDATE_TYPE_STRIDE = 1 << 21;
+const uint32_t PTR_UPDATE_TYPE_DRAM_OUTPUT_STREAM_STATE = 1 << 23;
+
+const uint32_t PTR_UPDATE_REG_WR_PTR_UPDATE = 1;
+const uint32_t PTR_UPDATE_REG_TYPE = 2;
+const uint32_t PTR_UPDATE_REG_STRIDE_WRAP = 3;
+const uint32_t PTR_UPDATE_REG_STRIDE = 4;
+const uint32_t PTR_UPDATE_REG_DRAM_OUTPUT_STREAM_STATE = 5;
+
+const uint32_t CYCLES_SINCE_LAST_STREAM_DRAM_WRITE_THRESH = 650;
+
+const uint32_t DRAM_HEADER_LAST = 7; // last byte of the header
+const uint32_t PACKET_END_MARKER = 0xabcd1234;
+
+const uint32_t DRAM_STREAM_1 = 8;
+const uint32_t DRAM_STREAM_2 = 9;
+
+void init_tile_clear();
+void wait_till_tile_clear_done(uint32_t stream_id);
+void process_tile_clearing(kernel_input_stream_state_t* input_stream_state, uint32_t streams_to_clear);
+
+int get_epoch_table_x(int my_x, int my_y) __attribute__((const));
+int get_epoch_table_y(int my_x, int my_y) __attribute__((const));
+int get_epoch_index_x(int my_x) __attribute__((const));
+int get_epoch_index_y(int my_y) __attribute__((const));
+
+inline __attribute__((always_inline)) uint16_t op_pack_tiles_ptr_add(uint16_t a, uint16_t b) {
+//#ifdef RISC_B0_HW // FIXME: This change isn't supported in kernels yet; re-enable when supported by kernels
+//  return (a + b) & 0x3FF;
+//#else
+  return a + b;
+//#endif
+}
+
+inline __attribute__((always_inline)) uint16_t op_pack_tiles_ptr_sub(uint16_t a, uint16_t b) {
+//#ifdef RISC_B0_HW // FIXME: This change isn't supported in kernels yet; re-enable when supported by kernels
+//  return (a - b) & 0x3FF;
+//#else
+  return a - b;
+//#endif
+}
+
+inline __attribute__((always_inline)) bool addr_is_pcie(uint64_t dram_ptr_addr) {
+  uint32_t x = NOC_UNICAST_ADDR_X(dram_ptr_addr);
+  uint32_t y = NOC_UNICAST_ADDR_Y(dram_ptr_addr);
+  return x == 0 && y == 3;
+}
+
+inline void set_noc_trans_table(uint32_t noc, uint8_t& noc_trans_table_en, uint8_t& my_logical_x, uint8_t& my_logical_y) {
+  noc_trans_table_en = (NOC_CFG_READ_REG(noc, NIU_CFG_0) >> NIU_CFG_0_NOC_ID_TRANSLATE_EN) & 0x1;
+
+  uint32_t noc_id_logical_reg = NOC_CFG_READ_REG(noc, NOC_ID_LOGICAL);
+  my_logical_x = noc_id_logical_reg & NOC_NODE_ID_MASK;
+  my_logical_y = (noc_id_logical_reg >> NOC_ADDR_NODE_ID_BITS) & NOC_NODE_ID_MASK;
+}
+
+inline __attribute__((always_inline)) bool check_packet_end_marker(uint32_t l1_addr) {
+  return false;
+}
+
+inline __attribute__((always_inline)) void set_packet_end_marker(uint32_t l1_addr) {
+}
+
+inline __attribute__((always_inline)) bool header_reads_flushed(uint32_t noc, uint32_t transaction_id, volatile uint32_t tt_l1_ptr * l1_ptr_addr) {
+  return (ncrisc_noc_reads_flushed(noc, transaction_id) || check_packet_end_marker((uint32_t)(&(l1_ptr_addr[DRAM_HEADER_LAST]))));
+}
+
+inline __attribute__((always_inline)) void dram_input_stream_issue_scatter_read_init(uint32_t data_rec_chunk_size_tiles, uint32_t dram_io_scatter_chunk_size_tiles, uint32_t dram_io_scatter_chunk_size_bytes, uint32_t stream_dest_addr, uint32_t& transaction_id) {
+  if (transaction_id == NCRISC_RD_END_TRID) {
+    transaction_id = NCRISC_RD_START_TRID;
+  } else {
+    transaction_id += 1;
+  }
+}
+
+inline __attribute__((always_inline)) bool dram_input_stream_check_next_chunk_flushed(uint32_t input_noc, uint32_t chunk_pending_start_addr, uint32_t chunk_size_bytes, uint32_t scatter_chunk_size_bytes, uint32_t& transaction_id) {
+  uint32_t transaction_id_temp = transaction_id;
+  if (transaction_id_temp == NCRISC_RD_END_TRID) {
+    transaction_id_temp = NCRISC_RD_START_TRID;
+  } else {
+    transaction_id_temp += 1;
+  }
+  bool reads_flushed = ncrisc_noc_reads_flushed(input_noc, transaction_id_temp);
+  if (reads_flushed) {
+    transaction_id = transaction_id_temp;
+  }
+  return reads_flushed;
+}
+
+inline __attribute__((always_inline)) uint32_t get_total_in_flight_tiles(dram_output_stream_state_t* curr_dram_output_stream_state) {
+#ifdef RISC_B0_HW
+  uint32_t total_in_flight_tiles = 0;
+  if (curr_dram_output_stream_state->moves_raw_data) {
+    total_in_flight_tiles = curr_dram_output_stream_state->in_flight_tiles;
+  } else {
+    total_in_flight_tiles = curr_dram_output_stream_state->in_flight_tiles + curr_dram_output_stream_state->in_flight_tiles_2;
+  }
+#else
+  uint32_t total_in_flight_tiles = curr_dram_output_stream_state->in_flight_tiles;
+#endif
+
+  return total_in_flight_tiles;
+}
+
+void risc_wait_for_cmd_buf(uint32_t noc, uint32_t cmd_buf);
+void risc_dram_write_init(uint32_t dram_stream);
+void risc_dram_write (uint32_t dram_writes_with_cmd_buf, uint32_t dram_stream, uint32_t noc, uint32_t src_addr, uint64_t dest_addr, uint32_t len_bytes, uint32_t len_tiles, uint32_t vc, uint32_t stream_msg_info_buf_addr, uint32_t transaction_id);
+bool risc_dram_write_ok(uint32_t dram_writes_with_cmd_buf, uint32_t dram_stream, uint32_t output_noc);
+bool risc_dram_writes_sent(uint32_t dram_writes_with_cmd_buf, uint32_t dram_stream);
+void replicate(uint32_t noc_id, uint32_t src_addr, uint64_t dest_addr, uint32_t chunk_size_bytes, uint32_t times_to_replicate, uint32_t transaction_id);
+void __attribute__((section("code_l1"))) replicate_l1(uint32_t noc_id, uint32_t src_addr, uint64_t dest_addr, uint32_t chunk_size_bytes, uint32_t times_to_replicate, uint32_t transaction_id);
+bool has_pending_dram_write_ptrs(uint32_t dram_stream);
+void write_pending_dram_write_ptrs(uint32_t dram_stream, dram_output_stream_state_t *dram_output_stream_state_base);
+void set_pending_dram_write_ptrs(uint32_t dram_stream, uint32_t dram_writes_with_cmd_buf, bool is_ram, bool is_strided_write, uint32_t write_stride, uint32_t total_write_strides, uint32_t dram_wrptr_q_slots, uint32_t output_noc, uint32_t output_vc,
+                                 uint64_t dram_buf_addr, dram_output_stream_state_t* curr_dram_output_stream_state, uint32_t curr_dram_output_stream_state_idx, volatile dram_io_state_t tt_l1_ptr * l1_ptrs, uint32_t curr_stride_wrap, uint32_t next_stride_wrap);
+void process_dram_write(
+  uint32_t &num_dram_output_streams, dram_output_stream_state_t *dram_output_stream_state, uint32_t &dram_ptr_update_cnt, uint32_t &total_tiles_to_clear
+);
+void process_dram_write_clear(uint32_t &num_dram_output_streams, dram_output_stream_state_t *dram_output_stream_state, uint32_t& total_tiles_to_clear);
+void __attribute__((section("code_l1"))) __attribute__((noinline)) process_dram_write_moves_raw_data_l1(dram_output_stream_state_t* curr_dram_output_stream_state, dram_q_state_t tt_l1_ptr * next_dram_q_issue, uint32_t stream_id,
+                                                                                                        uint16_t data_send_chunk_size_tiles, uint32_t output_vc, uint32_t data_send_chunk_size_bytes, uint64_t dram_buf_addr,
+                                                                                                        uint32_t& stream_rd_ptr_byte, uint32_t dram_buf_size_bytes, bool& full_q_slot_sent);
diff --git a/tt_metal/hw/inc/blackhole/stream_interface.h b/tt_metal/hw/inc/blackhole/stream_interface.h
new file mode 100644
index 000000000000..e891760e4584
--- /dev/null
+++ b/tt_metal/hw/inc/blackhole/stream_interface.h
@@ -0,0 +1,518 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef _STREAM_INTERFACE_H_
+#define _STREAM_INTERFACE_H_
+
+#include "noc_overlay_parameters.h"
+#include "noc_parameters.h"
+#include "noc_nonblocking_api.h"
+
+// Low-level chip-dependent stream/NOC functions
+
+#define STREAM_PTR_REG_MASK ((uint32_t)0xFFFF)
+#define EPOCH_SHIFT 15
+#define MAX_TILES_MSG_INFO_BUF_PER_PHASE 2048
+#define USE_2K_TILE_HEADER_BUFFER_RESET
+#define MULTI_MSG_TILES_STREAM_THESH 12 // Note: streams 6 and 7 are not capable of multi-msg tiles, so don't use them for inputs
+
+
+inline __attribute__((always_inline)) void stream_phase_blob_run(uint32_t stream_id, uint32_t blob_start_addr, uint32_t start_phase_num_cfg_regs) {
+  NOC_STREAM_WRITE_REG(stream_id, STREAM_MULTI_MSG_CLEAR_REG_INDEX, 0); // Prevent accidental clearing
+  NOC_STREAM_WRITE_REG(stream_id, STREAM_PHASE_AUTO_CFG_PTR_REG_INDEX, blob_start_addr);
+  NOC_STREAM_WRITE_REG(stream_id, STREAM_PHASE_AUTO_CFG_HEADER_REG_INDEX, start_phase_num_cfg_regs << NEXT_PHASE_NUM_CFG_REG_WRITES);
+
+  NOC_STREAM_WRITE_REG(stream_id, STREAM_MISC_CFG_REG_INDEX, (1 << NEXT_PHASE_SRC_CHANGE) | (1 << NEXT_PHASE_DEST_CHANGE));
+  NOC_STREAM_WRITE_REG(stream_id, STREAM_ONETIME_MISC_CFG_REG_INDEX, (0x1 << PHASE_AUTO_CONFIG));
+}
+
+inline __attribute__((always_inline)) void stream_phase_blob_run_offset(uint32_t stream_id, uint32_t blob_base_addr, uint32_t blob_start_addr, uint32_t blob_size) {
+  NOC_STREAM_WRITE_REG(stream_id, STREAM_MULTI_MSG_CLEAR_REG_INDEX, 0); // Prevent accidental clearing
+  uint32_t blob_offset = NOC_STREAM_READ_REG(stream_id, STREAM_PHASE_AUTO_CFG_PTR_REG_INDEX) - blob_base_addr;
+  while (blob_offset >= blob_size)
+    blob_offset -= blob_size;
+  NOC_STREAM_WRITE_REG(stream_id, STREAM_PHASE_AUTO_CFG_PTR_REG_INDEX, blob_start_addr + blob_offset);
+  uint32_t misc_cfg_reg = NOC_STREAM_READ_REG(stream_id, STREAM_ONETIME_MISC_CFG_REG_INDEX);
+  NOC_STREAM_WRITE_REG(stream_id, STREAM_ONETIME_MISC_CFG_REG_INDEX, (0x1 << PHASE_AUTO_CONFIG) | misc_cfg_reg);
+}
+
+inline __attribute__((always_inline)) uint32_t stream_get_auto_cfg_ptr(uint32_t stream_id) {
+  return NOC_STREAM_READ_REG(stream_id, STREAM_PHASE_AUTO_CFG_PTR_REG_INDEX);
+}
+
+inline __attribute__((always_inline)) uint32_t stream_get_auto_cfg_header(uint32_t stream_id) {
+  return NOC_STREAM_READ_REG(stream_id, STREAM_PHASE_AUTO_CFG_HEADER_REG_INDEX);
+}
+
+inline __attribute__((always_inline)) uint32_t stream_get_auto_cfg_header_phase_num_cfg_regs(uint32_t stream_id) {
+  return NOC_STREAM_READ_REG(stream_id, STREAM_PHASE_AUTO_CFG_HEADER_REG_INDEX) >> NEXT_PHASE_NUM_CFG_REG_WRITES;
+}
+
+inline __attribute__((always_inline)) void stream_set_auto_cfg_header(uint32_t stream_id, uint32_t val) {
+  NOC_STREAM_WRITE_REG(stream_id, STREAM_PHASE_AUTO_CFG_HEADER_REG_INDEX, val);
+}
+
+inline __attribute__((always_inline)) uint32_t stream_phase_is_active(uint32_t stream_id) {
+  return NOC_STREAM_READ_REG_FIELD(stream_id, STREAM_WAIT_STATUS_REG_INDEX, MSG_FWD_ONGOING);
+}
+
+inline __attribute__((always_inline)) uint32_t stream_get_curr_phase(uint32_t stream_id) {
+  return NOC_STREAM_READ_REG(stream_id, STREAM_CURR_PHASE_REG_INDEX);
+}
+
+inline __attribute__((always_inline)) void set_fork_scatter_inner_loop_count(uint32_t stream_id, uint32_t val) {
+  NOC_STREAM_WRITE_REG(stream_id, STREAM_REMOTE_SRC_REG_INDEX, val);
+}
+
+inline __attribute__((always_inline)) uint32_t get_fork_scatter_inner_loop_count(uint32_t stream_id) {
+  return NOC_STREAM_READ_REG(stream_id, STREAM_REMOTE_SRC_REG_INDEX);
+}
+
+inline __attribute__((always_inline)) void set_fork_num_msgs_in_block(uint32_t stream_id, uint32_t val) {
+  NOC_STREAM_WRITE_REG(stream_id, STREAM_REMOTE_SRC_PHASE_REG_INDEX, val);
+}
+
+inline __attribute__((always_inline)) uint32_t get_fork_num_msgs_in_block(uint32_t stream_id) {
+  return NOC_STREAM_READ_REG(stream_id, STREAM_REMOTE_SRC_PHASE_REG_INDEX);
+}
+
+inline __attribute__((always_inline)) bool stream_phase_id_is_active(uint32_t stream_id, uint32_t phase_id) {
+  uint32_t curr_phase = stream_get_curr_phase(stream_id);
+  bool phase_active = stream_phase_is_active(stream_id);
+  return (curr_phase == phase_id) && phase_active;
+}
+
+inline __attribute__((always_inline)) uint32_t stream_phase_advance_wait(uint32_t stream_id) {
+  uint32_t advance_wait = NOC_STREAM_READ_REG_FIELD(stream_id, STREAM_WAIT_STATUS_REG_INDEX, WAIT_SW_PHASE_ADVANCE_SIGNAL);
+  uint32_t num_tiles_pending = NOC_STREAM_READ_REG(stream_id, STREAM_DEBUG_STATUS_REG_INDEX+9) >> MEM_WORD_ADDR_WIDTH;
+  return advance_wait && (num_tiles_pending == 0);
+}
+
+inline __attribute__((always_inline)) uint32_t stream_get_input_noc(uint32_t stream_id) {
+  return NOC_STREAM_READ_REG_FIELD(stream_id, STREAM_MISC_CFG_REG_INDEX, INCOMING_DATA_NOC);
+}
+
+inline __attribute__((always_inline)) void stream_get_remote_src_coord(uint32_t stream_id, uint32_t& x, uint32_t& y) {
+  x = NOC_STREAM_READ_REG_FIELD(stream_id, STREAM_REMOTE_SRC_REG_INDEX, STREAM_REMOTE_SRC_X);
+  y = NOC_STREAM_READ_REG_FIELD(stream_id, STREAM_REMOTE_SRC_REG_INDEX, STREAM_REMOTE_SRC_Y);
+}
+
+inline __attribute__((always_inline)) uint32_t stream_get_output_noc(uint32_t stream_id) {
+  return NOC_STREAM_READ_REG_FIELD(stream_id, STREAM_MISC_CFG_REG_INDEX, OUTGOING_DATA_NOC);
+}
+
+inline __attribute__((always_inline)) uint32_t stream_get_output_unicast_vc(uint32_t stream_id) {
+  return NOC_STREAM_READ_REG_FIELD(stream_id, STREAM_MISC_CFG_REG_INDEX, UNICAST_VC_REG);
+}
+
+inline __attribute__((always_inline)) void stream_get_remote_dest_coord(uint32_t stream_id, uint32_t& x, uint32_t& y) {
+  x = NOC_STREAM_READ_REG_FIELD(stream_id, STREAM_REMOTE_DEST_REG_INDEX, STREAM_REMOTE_DEST_X);
+  y = NOC_STREAM_READ_REG_FIELD(stream_id, STREAM_REMOTE_DEST_REG_INDEX, STREAM_REMOTE_DEST_Y);
+}
+
+inline __attribute__((always_inline)) uint32_t stream_get_msg_info_rd_ptr(uint32_t stream_id) {
+  return NOC_STREAM_READ_REG(stream_id, STREAM_MSG_INFO_PTR_REG_INDEX);
+}
+
+inline __attribute__((always_inline)) uint32_t stream_get_data_buf_addr(uint32_t stream_id) {
+  return NOC_STREAM_READ_REG(stream_id, STREAM_BUF_START_REG_INDEX);
+}
+
+inline __attribute__((always_inline)) uint32_t stream_get_remote_data_buf_addr(uint32_t stream_id) {
+  return NOC_STREAM_READ_REG(stream_id, STREAM_REMOTE_DEST_BUF_START_REG_INDEX);
+}
+
+inline __attribute__((always_inline)) uint32_t stream_get_data_buf_size(uint32_t stream_id) {
+  return NOC_STREAM_READ_REG(stream_id, STREAM_BUF_SIZE_REG_INDEX);
+}
+
+inline __attribute__((always_inline)) uint32_t stream_get_remote_data_buf_size(uint32_t stream_id) {
+  return NOC_STREAM_READ_REG(stream_id, STREAM_REMOTE_DEST_BUF_SIZE_REG_INDEX);
+}
+
+inline __attribute__((always_inline)) uint32_t stream_get_remote_data_buf_space_available(uint32_t stream_id) {
+  return NOC_STREAM_READ_REG(stream_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_REG_INDEX);
+}
+
+inline __attribute__((always_inline)) uint32_t stream_phase_next_recved_tile_addr(uint32_t stream_id) {
+  return NOC_STREAM_READ_REG(stream_id, STREAM_NEXT_RECEIVED_MSG_ADDR_REG_INDEX);
+}
+
+inline __attribute__((always_inline)) uint32_t stream_phase_next_recved_tile_size(uint32_t stream_id) {
+  return NOC_STREAM_READ_REG(stream_id, STREAM_NEXT_RECEIVED_MSG_SIZE_REG_INDEX);
+}
+
+inline __attribute__((always_inline)) uint32_t stream_phase_tiles_received(uint32_t stream_id, uint32_t msg_info_buf_start_addr) {
+  return NOC_STREAM_READ_REG(stream_id, STREAM_MSG_INFO_WR_PTR_REG_INDEX) - msg_info_buf_start_addr;
+}
+
+
+inline __attribute__((always_inline)) uint32_t stream_rec_endpoint_get_phase_tiles_count(uint32_t stream_id) {
+  return NOC_STREAM_READ_REG(stream_id, STREAM_REMOTE_DEST_REG_INDEX) & 0xffff; // used as scratch reg for receiver endpoint streams
+}
+
+
+inline __attribute__((always_inline)) void stream_rec_endpoint_set_phase_tiles_count(uint32_t stream_id, uint32_t val) {
+  uint32_t rmw = NOC_STREAM_READ_REG(stream_id, STREAM_REMOTE_DEST_REG_INDEX) & ~0xffff;
+  NOC_STREAM_WRITE_REG(stream_id, STREAM_REMOTE_DEST_REG_INDEX, (rmw | val)); // used as scratch reg for receiver endpoint streams
+}
+
+
+inline __attribute__((always_inline)) uint32_t stream_src_endpoint_get_phase_tiles_count(uint32_t stream_id) {
+  return NOC_STREAM_READ_REG(stream_id, STREAM_REMOTE_SRC_PHASE_REG_INDEX) & 0xffff; // used as scratch reg for source endpoint streams
+}
+
+
+inline __attribute__((always_inline)) void stream_src_endpoint_set_phase_tiles_count(uint32_t stream_id, uint32_t val) {
+  uint32_t rmw = NOC_STREAM_READ_REG(stream_id, STREAM_REMOTE_SRC_PHASE_REG_INDEX) & ~0xffff;
+  NOC_STREAM_WRITE_REG(stream_id, STREAM_REMOTE_SRC_PHASE_REG_INDEX, (rmw | val)); // used as scratch reg for source endpoint streams
+}
+
+
+inline __attribute__((always_inline)) uint32_t stream_get_buf_space_available_words(uint32_t stream_id) {
+  return NOC_STREAM_READ_REG(stream_id, STREAM_BUF_SPACE_AVAILABLE_REG_INDEX);
+}
+
+inline __attribute__((always_inline)) void stream_signal_flushed_tiles(uint32_t stream_id, uint32_t num_tiles, uint32_t num_words) {
+  NOC_STREAM_WRITE_REG(stream_id, STREAM_NUM_MSGS_RECEIVED_INC_REG_INDEX, (num_words << SOURCE_ENDPOINT_NEW_MSGS_TOTAL_SIZE) | num_tiles);
+}
+
+inline __attribute__((always_inline)) bool stream_is_dram_read_opt_enabled(uint32_t stream_id) {
+  return !NOC_STREAM_READ_REG_FIELD(stream_id, STREAM_MISC_CFG_REG_INDEX, NEXT_PHASE_SRC_CHANGE);
+}
+
+inline __attribute__((always_inline)) bool stream_next_phase_src_change(uint32_t stream_id) {
+  return NOC_STREAM_READ_REG_FIELD(stream_id, STREAM_MISC_CFG_REG_INDEX, NEXT_PHASE_SRC_CHANGE) || !NOC_STREAM_READ_REG_FIELD(stream_id, STREAM_ONETIME_MISC_CFG_REG_INDEX, PHASE_AUTO_CONFIG);
+}
+
+inline __attribute__((always_inline)) int stream_get_curr_phase_num_msgs(uint32_t stream_id) {
+  return NOC_STREAM_READ_REG_FIELD(stream_id, STREAM_PHASE_AUTO_CFG_HEADER_REG_INDEX, CURR_PHASE_NUM_MSGS);
+}
+
+inline __attribute__((always_inline)) void stream_set_curr_phase_num_msgs(uint32_t stream_id, uint32_t val) {
+  uint32_t rmw = NOC_STREAM_READ_REG(stream_id, STREAM_PHASE_AUTO_CFG_HEADER_REG_INDEX) & ~0xFFFFFF;
+  NOC_STREAM_WRITE_REG(stream_id, STREAM_PHASE_AUTO_CFG_HEADER_REG_INDEX, (rmw | (val << CURR_PHASE_NUM_MSGS)));
+}
+
+// used by unpacker fracture
+inline __attribute__((always_inline)) void stream_relay_tiles(uint32_t stream_id, uint32_t num_tiles, uint32_t num_words) {
+  NOC_STREAM_WRITE_REG(stream_id, STREAM_NUM_MSGS_RECEIVED_INC_REG_INDEX, (num_words << SOURCE_ENDPOINT_NEW_MSGS_TOTAL_SIZE) | num_tiles);
+}
+
+// used by packer
+inline uint32_t stream_get_free_words(uint32_t stream_id) {
+  uint32_t wait_status = NOC_STREAM_READ_REG(stream_id, STREAM_WAIT_STATUS_REG_INDEX);
+  uint32_t tiles_left_in_phase = NOC_STREAM_READ_REG(stream_id, STREAM_REMOTE_SRC_PHASE_REG_INDEX) & 0xffff; // used as scratch reg for source endpoint streams
+  uint32_t buf_space_available = NOC_STREAM_READ_REG(stream_id, STREAM_BUF_SPACE_AVAILABLE_REG_INDEX);
+  wait_status &= (0x1 << MSG_FWD_ONGOING);
+  return (wait_status && tiles_left_in_phase) ? buf_space_available : 0;
+}
+
+inline uint32_t stream_should_packer_reset_pointers(uint32_t stream_id) {
+  uint32_t should_packer_reset_pointers = NOC_STREAM_READ_REG(stream_id, STREAM_REMOTE_SRC_REG_INDEX);
+  if (should_packer_reset_pointers)
+    NOC_STREAM_WRITE_REG(stream_id, STREAM_REMOTE_SRC_REG_INDEX, 0); // used as scratch reg for source endpoint streams
+  return should_packer_reset_pointers;
+}
+
+inline uint32_t stream_dram_write_should_reset_pointers(uint32_t stream_id) {
+  uint32_t rmw = NOC_STREAM_READ_REG(stream_id, STREAM_REMOTE_DEST_REG_INDEX);
+  uint32_t should_reset_pointers = rmw >> 16;
+  if (should_reset_pointers)
+    NOC_STREAM_WRITE_REG(stream_id, STREAM_REMOTE_DEST_REG_INDEX, (rmw & 0xffff)); // used as scratch reg for receiver endpoint streams
+  return should_reset_pointers;
+}
+
+inline uint32_t stream_dram_read_should_reset_pointers(uint32_t stream_id) {
+  uint32_t rmw = NOC_STREAM_READ_REG(stream_id, STREAM_REMOTE_SRC_PHASE_REG_INDEX);
+  uint32_t should_reset_pointers = rmw >> 16;
+  if (should_reset_pointers)
+    NOC_STREAM_WRITE_REG(stream_id, STREAM_REMOTE_SRC_PHASE_REG_INDEX, (rmw & 0xffff)); // used as scratch reg for receiver endpoint streams
+  return should_reset_pointers;
+}
+
+template <bool fracture = false, bool with_rd_ptr = false>
+static __attribute__((unused)) __attribute__((noinline)) bool stream_get_push_flushed(uint32_t stream_id, uint32_t exp_rd_ptr=0) {
+  uint32_t prev_phase = stream_get_curr_phase(stream_id);
+  uint32_t wait_status = NOC_STREAM_READ_REG(stream_id, STREAM_WAIT_STATUS_REG_INDEX);
+  wait_status &= (0x1 << MSG_FWD_ONGOING);
+
+  if (wait_status) {
+    uint32_t buf_size = NOC_STREAM_READ_REG(stream_id, STREAM_BUF_SIZE_REG_INDEX);
+    uint32_t buf_space_available = NOC_STREAM_READ_REG(stream_id, STREAM_BUF_SPACE_AVAILABLE_REG_INDEX);
+    uint32_t num_tiles;
+    if constexpr (fracture)
+      num_tiles = 0;
+    else
+      num_tiles = stream_get_curr_phase_num_msgs(stream_id);
+    uint32_t rd_ptr;
+    if constexpr (with_rd_ptr)
+      rd_ptr = NOC_STREAM_READ_REG(stream_id, STREAM_RD_PTR_REG_INDEX);
+    uint32_t cur_phase = stream_get_curr_phase(stream_id);
+    if (cur_phase == prev_phase) {
+      if constexpr (with_rd_ptr)
+        return (buf_space_available != 0 && rd_ptr == exp_rd_ptr) || (buf_size == buf_space_available && num_tiles > 0); // For this case we might be resending next phase so we need the num_tiles > 0 check
+      else if constexpr (fracture)
+        return buf_size == buf_space_available; // We don't need num_tiles > 0 as there is no resend concept for fracture
+      else
+        return buf_size == buf_space_available && num_tiles > 0; // For this case we might be resending next phase so we need the num_tiles > 0 check
+    }
+  }
+
+  return stream_phase_advance_wait(stream_id);
+}
+
+inline __attribute__((always_inline)) uint32_t stream_get_buf_space_available(uint32_t stream_id) {
+  uint32_t buf_space_available = NOC_STREAM_READ_REG(stream_id, STREAM_BUF_SPACE_AVAILABLE_REG_INDEX);
+  return buf_space_available;
+}
+
+// used by packer
+inline __attribute__((always_inline)) void stream_push_tiles(uint32_t stream_id, uint32_t num_tiles, uint32_t num_words) {
+  uint32_t rmw = NOC_STREAM_READ_REG(stream_id, STREAM_REMOTE_SRC_PHASE_REG_INDEX);
+  uint32_t tiles_left_in_phase = rmw & 0xffff;
+  rmw = rmw & ~0xffff;
+  tiles_left_in_phase -= num_tiles;
+  NOC_STREAM_WRITE_REG(stream_id, STREAM_REMOTE_SRC_PHASE_REG_INDEX, (rmw | tiles_left_in_phase)); // used as scratch reg for source endpoint streams
+  NOC_STREAM_WRITE_REG(stream_id, STREAM_NUM_MSGS_RECEIVED_INC_REG_INDEX, (num_words << SOURCE_ENDPOINT_NEW_MSGS_TOTAL_SIZE) | num_tiles);
+}
+
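+// Usage sketch (added commentary): a packer-side producer typically gates
+// stream_push_tiles() on stream_get_free_words(), which returns nonzero only
+// while a phase is active and tiles remain in it:
+//
+//   bool try_push(uint32_t stream_id, uint32_t num_tiles, uint32_t num_words) {
+//     if (stream_get_free_words(stream_id) < num_words)
+//       return false;  // no active phase, phase exhausted, or buffer full
+//     stream_push_tiles(stream_id, num_tiles, num_words);
+//     return true;
+//   }
+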
+inline void stream_set_tiles_left_in_phase(uint32_t stream_id, uint32_t num_tiles) {
+  uint32_t rmw = NOC_STREAM_READ_REG(stream_id, STREAM_REMOTE_SRC_PHASE_REG_INDEX);
+  uint32_t tiles_left_in_phase = rmw & 0xffff;
+  rmw = rmw & ~0xffff;
+  tiles_left_in_phase -= num_tiles;
+  NOC_STREAM_WRITE_REG(stream_id, STREAM_REMOTE_SRC_PHASE_REG_INDEX, (rmw | tiles_left_in_phase)); // used as scratch reg for source endpoint streams
+}
+
+#define STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE(dest_num, words_free_inc) \
+  (((dest_num) << REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_DEST_NUM) | ((words_free_inc) << REMOTE_DEST_BUF_WORDS_FREE_INC))
+
+#define STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_ADDR(stream_id) \
+  (STREAM_REG_ADDR(stream_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_REG_INDEX))
+
+inline __attribute__((always_inline)) void stream_update_remote_dest_buf_space_available(uint32_t stream_id, uint32_t dest_num, uint32_t inc_val) {
+  NOC_STREAM_WRITE_REG(stream_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_REG_INDEX, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE(dest_num, inc_val));
+}
+
+inline __attribute__((always_inline)) bool stream_is_receiver_endpoint(uint32_t stream_id) {
+  return NOC_STREAM_READ_REG_FIELD(stream_id, STREAM_MISC_CFG_REG_INDEX, RECEIVER_ENDPOINT);
+}
+
+inline __attribute__((always_inline)) void stream_receiver_endpoint_single_clear_op(uint32_t stream_id, uint32_t num_tiles) {
+  NOC_STREAM_WRITE_REG(stream_id, STREAM_MSG_INFO_CLEAR_REG_INDEX, num_tiles);
+  NOC_STREAM_WRITE_REG(stream_id, STREAM_MSG_DATA_CLEAR_REG_INDEX, num_tiles);
+}
+
+inline __attribute__((always_inline)) uint32_t stream_tiles_outstanding(uint32_t stream_id) {
+  return NOC_STREAM_READ_REG(stream_id, STREAM_NUM_MSGS_RECEIVED_REG_INDEX);
+}
+
+inline void stream_receiver_endpoint_tiles_clear(uint32_t stream_id, uint32_t num_tiles) {
+  while (num_tiles > 0) {
+    uint32_t num_to_clear = (num_tiles == 1) ? 1 : 2;
+
+    // Bug fix for streams: flow-control messages are sent out of order, so one message must be cleared at the end of the phase.
+    int32_t num_msgs_left_in_phase = stream_get_curr_phase_num_msgs(stream_id);
+    if (num_msgs_left_in_phase <= 2)
+      num_to_clear = 1;
+
+    stream_receiver_endpoint_single_clear_op(stream_id, num_to_clear);
+    num_tiles -= num_to_clear;
+  }
+}
+
+inline bool stream_receiver_endpoint_tile_clearing_finished(uint32_t stream_id) {
+  return (NOC_STREAM_READ_REG(stream_id, STREAM_MULTI_MSG_CLEAR_REG_INDEX) == 0);
+}
+
+inline void stream_receiver_endpoint_tiles_clear_b0(uint32_t stream_id, uint32_t num_tiles) {
+  NOC_STREAM_WRITE_REG(stream_id, STREAM_MULTI_MSG_CLEAR_REG_INDEX, num_tiles);
+}
+
+inline __attribute__((always_inline)) void stream_reset(uint32_t stream_id) {
+  uint32_t val = NOC_STREAM_READ_REG(stream_id, STREAM_ONETIME_MISC_CFG_REG_INDEX);
+  val &= (~(1<<PHASE_AUTO_CONFIG));
+  NOC_STREAM_WRITE_REG(stream_id, STREAM_ONETIME_MISC_CFG_REG_INDEX, val);
+}
+
+inline bool assert_check(uint32_t stream_id, bool hang) {
+  uint32_t debug_assert = NOC_STREAM_READ_REG(stream_id, STREAM_DEBUG_ASSERTIONS_REG_INDEX);
+  if (debug_assert > 0 && hang) {
+    while(true){};
+  }
+  return debug_assert > 0;
+}
+
+inline bool stream_done_hint() {
+  uint32_t stream_done = NOC_STREAM_READ_REG(0, STREAM_BLOB_AUTO_CFG_DONE_REG_INDEX) | NOC_STREAM_READ_REG(0, STREAM_BLOB_AUTO_CFG_DONE_REG_INDEX + 1);
+  if (stream_done) {
+    NOC_STREAM_WRITE_REG(0, STREAM_BLOB_AUTO_CFG_DONE_REG_INDEX, 0xFFFFFFFF);
+    NOC_STREAM_WRITE_REG(0, STREAM_BLOB_AUTO_CFG_DONE_REG_INDEX + 1, 0xFFFFFFFF);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+inline bool should_stall_for_tile_header_buffer_reset(uint32_t stream_id, uint32_t msg_info_buf_addr, uint32_t buf_size_tiles, uint32_t &prev_ack_thresh) {
+  uint32_t is_remote_src = NOC_STREAM_READ_REG_FIELD(stream_id, STREAM_MISC_CFG_REG_INDEX, REMOTE_SOURCE);
+  uint32_t msg_info_wr_ptr = NOC_STREAM_READ_REG(stream_id, STREAM_MSG_INFO_WR_PTR_REG_INDEX);
+
+  if (is_remote_src && (msg_info_wr_ptr - msg_info_buf_addr >= MAX_TILES_MSG_INFO_BUF_PER_PHASE - 2*buf_size_tiles)) {
+    prev_ack_thresh = NOC_STREAM_READ_REG(stream_id, STREAM_MEM_BUF_SPACE_AVAILABLE_ACK_THRESHOLD_REG_INDEX);
+    NOC_STREAM_WRITE_REG(stream_id, STREAM_MEM_BUF_SPACE_AVAILABLE_ACK_THRESHOLD_REG_INDEX, 0);
+    return true;
+  }
+
+  return false;
+}
+
+inline bool reset_tile_header_buffer(uint32_t stream_id, uint32_t msg_info_buf_addr, uint32_t buf_size_tiles, uint32_t &prev_phases_tiles_received_inc, uint32_t &prev_ack_thresh, uint32_t num_iter_tiles) {
+  uint32_t msg_info_full = NOC_STREAM_READ_REG(stream_id, STREAM_MSG_INFO_FULL_REG_INDEX);
+  uint32_t num_msgs_recv = NOC_STREAM_READ_REG(stream_id, STREAM_NUM_MSGS_RECEIVED_REG_INDEX);
+
+  if (msg_info_full || (num_msgs_recv == buf_size_tiles)) {
+    uint32_t buf_space_available = NOC_STREAM_READ_REG(stream_id, STREAM_BUF_SPACE_AVAILABLE_REG_INDEX);
+
+    if (buf_space_available == 0) {
+      uint32_t msg_info_rd_ptr = NOC_STREAM_READ_REG(stream_id, STREAM_MSG_INFO_PTR_REG_INDEX);
+      uint32_t msg_info_wr_ptr = NOC_STREAM_READ_REG(stream_id, STREAM_MSG_INFO_WR_PTR_REG_INDEX);
+      num_msgs_recv = NOC_STREAM_READ_REG(stream_id, STREAM_NUM_MSGS_RECEIVED_REG_INDEX);
+      uint32_t msg_info_num_tiles = msg_info_wr_ptr - msg_info_rd_ptr + num_msgs_recv;
+      prev_phases_tiles_received_inc = msg_info_rd_ptr - num_msgs_recv - msg_info_buf_addr;
+      NOC_STREAM_WRITE_REG(stream_id, STREAM_MSG_INFO_WR_PTR_REG_INDEX, msg_info_buf_addr);
+      NOC_STREAM_WRITE_REG(stream_id, STREAM_MSG_INFO_PTR_REG_INDEX, msg_info_buf_addr + num_msgs_recv);
+      NOC_STREAM_WRITE_REG(stream_id, STREAM_MSG_INFO_WR_PTR_REG_INDEX, msg_info_buf_addr + msg_info_num_tiles);
+      NOC_STREAM_WRITE_REG(stream_id, STREAM_MEM_BUF_SPACE_AVAILABLE_ACK_THRESHOLD_REG_INDEX, prev_ack_thresh);
+      return true;
+    }
+  }
+
+  if (num_iter_tiles <= buf_size_tiles) {
+    prev_phases_tiles_received_inc = 0;
+
NOC_STREAM_WRITE_REG(stream_id, STREAM_MEM_BUF_SPACE_AVAILABLE_ACK_THRESHOLD_REG_INDEX, prev_ack_thresh); + return true; + } + + return false; +} + +inline void check_dummy_phase(uint32_t stream_id) { + if (stream_phase_is_active(stream_id)) { + uint32_t cur_phase = stream_get_curr_phase(stream_id) >> EPOCH_SHIFT; + if (cur_phase == 0x1F) { + if (NOC_STREAM_READ_REG_FIELD(stream_id, STREAM_MISC_CFG_REG_INDEX, SOURCE_ENDPOINT)) { + uint32_t buf_size = NOC_STREAM_READ_REG(stream_id, STREAM_BUF_SIZE_REG_INDEX); + uint32_t buf_space_available = NOC_STREAM_READ_REG(stream_id, STREAM_BUF_SPACE_AVAILABLE_REG_INDEX); + uint32_t num_tiles = stream_get_curr_phase_num_msgs(stream_id); + + if (buf_space_available == buf_size && num_tiles > 0) { + stream_relay_tiles(stream_id, 1, 1); + } + } + } + } +} + +inline bool is_dummy_phase(uint32_t stream_id) { + uint32_t cur_phase = stream_get_curr_phase(stream_id) >> EPOCH_SHIFT; + return cur_phase == 0x1F; +} + +inline void stream_dram_write_init(uint32_t stream_id, uint32_t tile_header_buffer_addr) { + NOC_STREAM_WRITE_REG(stream_id, STREAM_CURR_PHASE_REG_INDEX, 0); + NOC_STREAM_WRITE_REG(stream_id, STREAM_REMOTE_DEST_MSG_INFO_WR_PTR_HI_REG_INDEX, tile_header_buffer_addr >> 21); // todo remove this when noc0/noc1 NIU_CFG_0_TILE_HEADER_STORE_OFF is set for all dram cores + NOC_STREAM_WRITE_REG(stream_id, STREAM_REMOTE_DEST_MSG_INFO_WR_PTR_REG_INDEX, tile_header_buffer_addr >> 4); // todo remove this when noc0/noc1 NIU_CFG_0_TILE_HEADER_STORE_OFF is set for all dram cores + NOC_STREAM_WRITE_REG(stream_id, STREAM_MCAST_DEST_REG_INDEX, 0); + NOC_STREAM_WRITE_REG(stream_id, STREAM_MCAST_DEST_NUM_REG_INDEX, 1); + NOC_STREAM_WRITE_REG(stream_id, STREAM_SCRATCH_0_REG_INDEX, (1 << 0) | (1 << 2)); + NOC_STREAM_WRITE_REG(stream_id, STREAM_SCRATCH_1_REG_INDEX, 0); + NOC_STREAM_WRITE_REG(stream_id, STREAM_SCRATCH_2_REG_INDEX, 0); + NOC_STREAM_WRITE_REG(stream_id, STREAM_SCRATCH_3_REG_INDEX, 0); + NOC_STREAM_WRITE_REG(stream_id, STREAM_SCRATCH_4_REG_INDEX, 0); + NOC_STREAM_WRITE_REG(stream_id, STREAM_SCRATCH_5_REG_INDEX, 0); + NOC_STREAM_WRITE_REG(stream_id, STREAM_REMOTE_DEST_TRAFFIC_REG_INDEX, 0); + NOC_STREAM_WRITE_REG(stream_id, STREAM_MEM_BUF_SPACE_AVAILABLE_ACK_THRESHOLD_REG_INDEX, 0); + NOC_STREAM_WRITE_REG(stream_id, STREAM_GATHER_REG_INDEX, 0); +} + +inline void stream_dram_write(uint32_t stream_id, uint32_t noc, uint32_t src_addr, uint64_t dest_addr, uint32_t len_bytes, uint32_t len_tiles, uint32_t vc, uint32_t tile_header_buf_addr_word) { + if (len_bytes > 0) { + uint32_t dest_buf_addr = NOC_LOCAL_ADDR_OFFSET(dest_addr); + + NOC_STREAM_WRITE_REG(stream_id, STREAM_PHASE_AUTO_CFG_HEADER_REG_INDEX, 1 << CURR_PHASE_NUM_MSGS); + + NOC_STREAM_WRITE_REG(stream_id, STREAM_REMOTE_DEST_REG_INDEX, dest_addr >> NOC_ADDR_LOCAL_BITS); + NOC_STREAM_WRITE_REG(stream_id, STREAM_REMOTE_DEST_BUF_START_HI_REG_INDEX, (dest_buf_addr/MEM_WORD_WIDTH) >> MEM_WORD_ADDR_WIDTH); + NOC_STREAM_WRITE_REG(stream_id, STREAM_REMOTE_DEST_BUF_START_REG_INDEX, dest_buf_addr/MEM_WORD_WIDTH); + NOC_STREAM_WRITE_REG(stream_id, STREAM_REMOTE_DEST_BUF_SIZE_REG_INDEX, len_bytes/MEM_WORD_WIDTH); + + NOC_STREAM_WRITE_REG(stream_id, STREAM_BUF_START_REG_INDEX, src_addr/MEM_WORD_WIDTH); + NOC_STREAM_WRITE_REG(stream_id, STREAM_BUF_SIZE_REG_INDEX, len_bytes/MEM_WORD_WIDTH); + NOC_STREAM_WRITE_REG(stream_id, STREAM_MSG_INFO_PTR_REG_INDEX, tile_header_buf_addr_word); + NOC_STREAM_WRITE_REG(stream_id, STREAM_MSG_INFO_WR_PTR_REG_INDEX, tile_header_buf_addr_word); + + NOC_STREAM_WRITE_REG(stream_id, 
STREAM_REMOTE_DEST_TRAFFIC_REG_INDEX, (vc << UNICAST_VC_REG));
+
+    uint32_t misc_cfg_reg = (noc << OUTGOING_DATA_NOC) | ((1-noc) << REMOTE_SRC_UPDATE_NOC) |
+                            (1 << SOURCE_ENDPOINT) | (1 << REMOTE_RECEIVER) |
+                            (1 << NEXT_PHASE_SRC_CHANGE) | (1 << NEXT_PHASE_DEST_CHANGE) |
+                            (1 << DEST_DATA_BUF_NO_FLOW_CTRL);
+    NOC_STREAM_WRITE_REG(stream_id, STREAM_MISC_CFG_REG_INDEX, misc_cfg_reg);
+    misc_cfg_reg = (1 << PHASE_AUTO_ADVANCE) | (3 << REG_UPDATE_VC_REG);
+    NOC_STREAM_WRITE_REG(stream_id, STREAM_ONETIME_MISC_CFG_REG_INDEX, misc_cfg_reg);
+
+    NOC_STREAM_WRITE_REG(stream_id, STREAM_PHASE_ADVANCE_REG_INDEX, 1);
+
+    uint32_t src_ready_state;
+    do {
+      src_ready_state = (NOC_STREAM_READ_REG(stream_id, STREAM_DEBUG_STATUS_REG_INDEX+8) >> 4) & 0x7;
+    } while (src_ready_state != 4); // SRC_READY_WAIT_ALL_DESTS
+
+    NOC_STREAM_WRITE_REG(stream_id, STREAM_DEST_PHASE_READY_UPDATE_REG_INDEX, 1<<PHASE_READY_TWO_WAY_RESP);
+  }
+}
+
+inline void stream_clear_all_tiles(uint32_t stream_id) {
+  uint32_t num_msgs_recv_in_bufs_and_mem;
+  do {
+    num_msgs_recv_in_bufs_and_mem = NOC_STREAM_READ_REG(stream_id, STREAM_NUM_MSGS_RECEIVED_IN_BUF_AND_MEM_REG_INDEX);
+    uint32_t num_msgs_recv = NOC_STREAM_READ_REG(stream_id, STREAM_NUM_MSGS_RECEIVED_REG_INDEX);
+    if (num_msgs_recv > 0) {
+      stream_receiver_endpoint_single_clear_op(stream_id, 1);
+    }
+  } while (num_msgs_recv_in_bufs_and_mem > 0);
+}
+
+
+#endif //ndef _STREAM_INTERFACE_H_
diff --git a/tt_metal/hw/inc/blackhole/stream_io_map.h b/tt_metal/hw/inc/blackhole/stream_io_map.h
new file mode 100644
index 000000000000..820dae572c42
--- /dev/null
+++ b/tt_metal/hw/inc/blackhole/stream_io_map.h
@@ -0,0 +1,203 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef _STREAM_IO_MAP_
+#define _STREAM_IO_MAP_
+
+#include <stdint.h>
+#include <stdbool.h>
+
+
+/*
+  Kernel operand mapping scheme:
+  - ID 0-7 (inputs, unpacker-only) => streams 4,5,10-15
+  - ID 8-15 (params, unpacker-only) => streams 16-23
+  - ID 16-23 (outputs, packer-only) => streams 24-31
+  - ID 24-31 (intermediates, packer/unpacker) => streams 32-39
+*/
+
+const uint32_t MCAST_START_STREAM = 0;
+const uint32_t MCAST_END_STREAM = 3;
+const uint32_t OPERAND_START_STREAM = 4;
+const uint32_t INPUT_START_STREAM_1 = 4;
+const uint32_t INPUT_START_STREAM_1_SIZE = 2;
+const uint32_t INPUT_START_STREAM_2 = 10;
+const uint32_t INPUT_PARAMS_START_STREAM = 16;
+const uint32_t OUTPUT_START_STREAM = 24;
+const uint32_t INTERMEDIATES_START_STREAM = 32;
+const uint32_t END_IO_STREAM = 39;
+
+const int OPERAND_INPUT_START_INDEX = 0;
+const int OPERAND_INPUT_PARAMS_START_INDEX = 8;
+const int OPERAND_OUTPUT_START_INDEX = 16;
+const int OPERAND_INTERMEDIATES_START_INDEX = 24;
+const int OPERAND_RELAY_START_INDEX = 32;
+const int MAX_NUM_OPERANDS = 64;
+
+#ifdef TENSIX_FIRMWARE
+#include "risc_attribs.h"
+#define MCAST_PACKER_OPT_EN ((volatile uint32_t tt_reg_ptr *)(uintptr_t)(STREAM_REG_ADDR(MCAST_END_STREAM, STREAM_SCRATCH_5_REG_INDEX)))
+#endif
+
+// Indexed with operand = kernel operand ID (0-31) per the table above
+// Used for tile push/pop operations.
+inline uint32_t get_operand_stream_id(int operand) {
+#ifdef TENSIX_FIRMWARE
+  if (*MCAST_PACKER_OPT_EN && operand >= OPERAND_OUTPUT_START_INDEX && operand < OPERAND_INTERMEDIATES_START_INDEX) {
+    return MCAST_END_STREAM - (operand - OPERAND_OUTPUT_START_INDEX);
+  }
+#endif
+
+  return ((uint32_t)operand) >= INPUT_START_STREAM_1_SIZE ? INPUT_START_STREAM_2 + operand - INPUT_START_STREAM_1_SIZE : OPERAND_START_STREAM + operand;
+}
+
+inline int stream_id_to_operand(uint32_t stream_id) {
+#ifdef TENSIX_FIRMWARE
+  if (*MCAST_PACKER_OPT_EN && stream_id >= MCAST_START_STREAM && stream_id <= MCAST_END_STREAM) {
+    return OPERAND_OUTPUT_START_INDEX + (MCAST_END_STREAM - stream_id);
+  }
+#endif
+
+  return stream_id >= INPUT_START_STREAM_2 ?
(stream_id - INPUT_START_STREAM_2 + INPUT_START_STREAM_1_SIZE) : (stream_id - OPERAND_START_STREAM); +} + +inline int stream_id_to_output(uint32_t stream_id) { +#ifdef TENSIX_FIRMWARE + if (*MCAST_PACKER_OPT_EN && stream_id >= MCAST_START_STREAM && stream_id <= MCAST_END_STREAM) { + return MCAST_END_STREAM - stream_id; + } +#endif + + return (stream_id - OUTPUT_START_STREAM); +} + +// This should only be used by llk tb, this is meant only as a hack +inline __attribute__((always_inline)) uint32_t old_stream_id_to_new_stream_id(uint32_t stream_id) { + if (stream_id == 8) + return 4; + else if (stream_id == 9) + return 5; + else + return stream_id; +} + +// Functions below convert between kernel operand indexes (per the above table) +// and input/output indexes that can be used to iterate separately through +// streams that have kernel input (stream->unpacker) or kernel output +// (packer->stream) functionality. +inline __attribute__((always_inline)) int operand_to_input_index(int operand) { + return (operand >= OPERAND_INTERMEDIATES_START_INDEX) ? (operand - (OPERAND_INTERMEDIATES_START_INDEX - OPERAND_OUTPUT_START_INDEX)) : operand; +} + +inline __attribute__((always_inline)) int input_to_operand_index(int input) { + return (input >= OPERAND_OUTPUT_START_INDEX) ? (input + (OPERAND_INTERMEDIATES_START_INDEX - OPERAND_OUTPUT_START_INDEX)) : input; +} + +inline __attribute__((always_inline)) int operand_to_output_index(int operand) { + return operand - OPERAND_OUTPUT_START_INDEX; +} + +inline __attribute__((always_inline)) int output_to_operand_index(int output) { + return output + OPERAND_OUTPUT_START_INDEX; +} + +inline __attribute__((always_inline)) bool operand_is_intermediate(int operand) { + return (operand>=OPERAND_INTERMEDIATES_START_INDEX); +} + + +// Pointers to scratch registers (implemented using don't-care functional registers) for input +// stream tile sync operations: +#ifdef TENSIX_FIRMWARE + +// XXXXX FIXME: separate interface for use by pipegen and loader from +// implementation below for firmware + +#ifdef PERF_DUMP + + // Must be the same values as in perf_lib/perf_base.hpp + static constexpr uint8_t PERF_MAX_NUM_INPUTS = 8; + static constexpr uint8_t PERF_MAX_NUM_OUTPUTS = 1; + + #define PERF_RISC_MAILBOX_INPUT_DECOUPLE_MASK_PTR ((volatile uint32_t tt_l1_ptr *) (l1_mem::address_map::PERF_RISC_MAILBOX_ADDR)) + #define PERF_RISC_MAILBOX_OUTPUT_DECOUPLE_MASK_PTR ((volatile uint32_t tt_l1_ptr *) (l1_mem::address_map::PERF_RISC_MAILBOX_ADDR + 4)) + #define PERF_DRAM_BUFFER_RESET_MAILBOX_PTR ((volatile uint32_t tt_l1_ptr *) (l1_mem::address_map::PERF_RESET_PTR_MAILBOX_ADDR)) + + #if OVERLAY_DECOUPLE == 1 + #define PERF_ANALYZER_COMMAND_START_PTR ((volatile uint32_t tt_l1_ptr *) (l1_mem::address_map::PERF_ANALYZER_COMMAND_START_PTR_ADDR)) + #define PERF_ANALYZER_COMMAND_START_VAL ((volatile uint32_t tt_l1_ptr *) (l1_mem::address_map::PERF_ANALYZER_COMMAND_START_VAL_ADDR)) + + inline bool is_input_operand_decoupled(int operand) { + if (operand >= OPERAND_INPUT_PARAMS_START_INDEX) { + return false; + } + uint32_t overlay_input_decouple_mask = *PERF_RISC_MAILBOX_INPUT_DECOUPLE_MASK_PTR; + const uint8_t operand_mask = 1 << (operand & 0xff); + return (overlay_input_decouple_mask & 0xff) & operand_mask; + } + inline bool is_output_operand_decoupled(int operand, uint8_t overlay_output_decouple_mask) { + if (operand < OPERAND_OUTPUT_START_INDEX || operand >= OPERAND_INTERMEDIATES_START_INDEX) { + return false; + } + const uint8_t operand_mask = 1 << ((operand-OPERAND_OUTPUT_START_INDEX) 
& 0xff); + return overlay_output_decouple_mask & operand_mask; + } + + #endif +#endif + +inline __attribute__((always_inline)) volatile uint32_t * tt_reg_ptr get_operand_tiles_received_ptr(int operand) { + return (volatile uint32_t tt_reg_ptr *)(uintptr_t)(STREAM_REG_ADDR(get_operand_stream_id(operand), STREAM_REMOTE_DEST_BUF_SIZE_REG_INDEX)); +} + +#if defined(PERF_DUMP) && (OVERLAY_DECOUPLE == 1) +inline __attribute__((always_inline)) volatile uint32_t * tt_reg_ptr get_operand_tiles_acked_ptr(int operand) { + if (is_input_operand_decoupled(operand)) { + return (volatile uint32_t tt_reg_ptr *)(uintptr_t)(STREAM_REG_ADDR(get_operand_stream_id(operand), STREAM_REMOTE_DEST_BUF_SIZE_REG_INDEX)); + } else { + return (volatile uint32_t tt_reg_ptr *)(uintptr_t)(STREAM_REG_ADDR(get_operand_stream_id(operand), STREAM_REMOTE_DEST_BUF_START_REG_INDEX)); + } +#else +inline __attribute__((always_inline)) volatile uint32_t * tt_reg_ptr get_operand_tiles_acked_ptr(int operand) { + return (volatile uint32_t tt_reg_ptr *)(uintptr_t)(STREAM_REG_ADDR(get_operand_stream_id(operand), STREAM_REMOTE_DEST_BUF_START_REG_INDEX)); +#endif +} + +inline __attribute__((always_inline)) volatile uint32_t * tt_reg_ptr get_operand_phase_changed_ptr(int operand) { + return (volatile uint32_t tt_reg_ptr *)(uintptr_t)(STREAM_REG_ADDR(get_operand_stream_id(operand), STREAM_REMOTE_DEST_MSG_INFO_BUF_SIZE_REG_INDEX)); +} + +inline __attribute__((always_inline)) volatile uint32_t * tt_reg_ptr get_packer_tiles_received_ptr(int operand) { + return (volatile uint32_t tt_reg_ptr *)(uintptr_t)(STREAM_REG_ADDR(get_operand_stream_id(operand), STREAM_REMOTE_SRC_PHASE_REG_INDEX)); +} + +inline __attribute__((always_inline)) volatile uint32_t * tt_reg_ptr get_packer_tiles_acked_ptr(int operand) { + return (volatile uint32_t tt_reg_ptr *)(uintptr_t)(STREAM_REG_ADDR(get_operand_stream_id(operand), STREAM_REMOTE_SRC_REG_INDEX)); +} + +//Ethernet FW managed streams use inputs/outputs differently from Tensix cores. +//We can have 24 Input or 24 Output ethernet streams that are managed by FW. +//Mapping to Kernel operand is not necessary so the following routines use the stream id directly +//to return respective stream's tiles received/acked pointer. +//FW managed streams are: 4 - 7, 12 - 31. +//HW ethernet streams are: 0 - 3, 8 - 11. 
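As a reading aid for the FW-managed Ethernet stream routines defined just below, the received/acked register pair is typically read as a pair of free-running counters whose difference is the number of tiles still in flight. A minimal sketch; the helper name eth_stream_tiles_in_flight and the 16-bit wraparound are assumptions of this example, not guarantees made by this header:

inline uint32_t eth_stream_tiles_in_flight(uint32_t stream_id) {
    // Illustrative only: assumes received/acked behave as wrapping 16-bit counters.
    uint32_t received = *get_operand_stream_tiles_received_ptr(stream_id);
    uint32_t acked    = *get_operand_stream_tiles_acked_ptr(stream_id);
    return (received - acked) & 0xffff;
}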
+inline __attribute__((always_inline)) volatile uint32_t * tt_reg_ptr get_operand_stream_tiles_received_ptr(uint32_t stream_id) {
+  return (volatile uint32_t tt_reg_ptr *)(uintptr_t)(STREAM_REG_ADDR(stream_id, STREAM_REMOTE_DEST_BUF_SIZE_REG_INDEX));
+}
+
+inline __attribute__((always_inline)) volatile uint32_t * tt_reg_ptr get_operand_stream_tiles_acked_ptr(uint32_t stream_id) {
+  return (volatile uint32_t tt_reg_ptr *)(uintptr_t)(STREAM_REG_ADDR(stream_id, STREAM_REMOTE_DEST_BUF_START_REG_INDEX));
+}
+
+inline __attribute__((always_inline)) volatile uint32_t * tt_reg_ptr get_packer_stream_tiles_received_ptr(uint32_t stream_id) {
+  return (volatile uint32_t tt_reg_ptr *)(uintptr_t)(STREAM_REG_ADDR(stream_id, STREAM_REMOTE_SRC_PHASE_REG_INDEX));
+}
+
+inline __attribute__((always_inline)) volatile uint32_t * tt_reg_ptr get_packer_stream_tiles_acked_ptr(uint32_t stream_id) {
+  return (volatile uint32_t tt_reg_ptr *)(uintptr_t)(STREAM_REG_ADDR(stream_id, STREAM_REMOTE_SRC_REG_INDEX));
+}
+
+#endif
+
+#endif
diff --git a/tt_metal/hw/inc/blackhole/tdma_xmov.h b/tt_metal/hw/inc/blackhole/tdma_xmov.h
new file mode 100644
index 000000000000..c33d748c2c10
--- /dev/null
+++ b/tt_metal/hw/inc/blackhole/tdma_xmov.h
@@ -0,0 +1,14 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// TDMA XMOV functions for the firmware
+//
+
+#pragma once
+
+#include "tensix_types.h"
+
+// Write TDMA registers and initiate a move
+void tdma_xmov(uint mover_number, uint source_addr, uint dest_addr, uint size, xmov_direction_t direction);
+void wait_tdma_movers_done(uint mover_busy_mask);
diff --git a/tt_metal/hw/inc/blackhole/tensix.h b/tt_metal/hw/inc/blackhole/tensix.h
new file mode 100644
index 000000000000..9c48c0d11af0
--- /dev/null
+++ b/tt_metal/hw/inc/blackhole/tensix.h
@@ -0,0 +1,735 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TENSIX_H_INCLUDED
+#define TENSIX_H_INCLUDED
+
+//#include "tensix_prototypes.h"
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+#include <cassert>
+#include "cfg_defines.h"
+
+// Convenience and type defines
+typedef std::uint32_t uint;
+typedef std::uint8_t byte;
+
+#define MAX_THREADS 3 // max number of threads supported by a single core
+
+#define MAX_PACKERS 4 // number of packers in the design
+
+#define TEST_MAILBOX_ADDRESS ( 4 )
+#define WALL_CLOCK_MAILBOX_ADDRESS 96
+#define DEBUG_MAILBOX_ADDRESS 112
+#define DEBUG_MAILBOX_SIZE 64
+#define CQ_MAILBOX_ADDRESS 368 // byte address
+#define CQ_MAILBOX_SIZE 4 // byte size
+
+// TODO: use this in firmware.cc
+#define MEMORY_WORD_SIZE_IN_BYTES ( 16 )
+#define MEMORY_WORD_SHIFT_BITS ( 4 ) // log2(MEMORY_WORD_SIZE_IN_BYTES)
+
+
+
+#define STALLWAIT_COMPUTE (0x0)
+#define STALLWAIT_TDMA (0x1)
+#define STALLWAIT_FOR_TC (0x1 << 0)
+#define STALLWAIT_FOR_UNP0 (0x1 << 1)
+#define STALLWAIT_FOR_UNP1 (0x1 << 2)
+#define STALLWAIT_FOR_PACK (0x1 << 3)
+
+
+/////////////
+// RISC-V Address map definition (hardware)
+
+// TODO: Consider redefining these as uint32_t rather than #defines
+
+#define L0_BASE 0xFFC00000 // 0xFFC00000 - 0xFFDFFFFF
+#define L1_BASE 0x0 // 0x00000000 - 0xFFBFFFFF
+
+#define LOCAL_MEM_SIZE 4096
+
+// Reads and writes here access the tensix core register set. Each register is four bytes, but subword reads are supported through byte enables.
+// Register indices and contents are defined in local_regs.yaml.
+#define REGFILE_BASE 0xFFE00000 // 0xFFE00000 - 0xFFE3FFFF
+
+// Writes here are appended to the tensix core instruction FIFO.
// This has priority over incoming instruction fetch returns, which are simply dropped.
+// The instruction will stay in the queue if a loop instruction is in progress.
+// If the FIFO gets overfull, writes are dropped? Additionally, the instruction queue is flushed in some cases.
+#define INSTRN_BUF_BASE 0xFFE40000 // 0xFFE40000 - 0xFFE7FFFF
+#define INSTRN1_BUF_BASE 0xFFE50000 // 0xFFE40000 - 0xFFE7FFFF
+#define INSTRN2_BUF_BASE 0xFFE60000
+
+// PC buffer is used to pass kernel IDs and parameters from Brisc to Triscs, and also as a sync point -- a read from pc buffer+1 address
+// will not return until that thread is idle.
+#define PC_BUF_BASE 0xFFE80000 // 0xFFE80000 - 0xFFEBFFFF
+#define PC1_BUF_BASE 0xFFE90000 // 0xFFE80000 - 0xFFEBFFFF
+#define PC2_BUF_BASE 0xFFEA0000
+
+// Reads from here retrieve a value written by the tensix code, or 0 if the mailbox FIFO is empty.
+#define TENSIX_MAILBOX0_BASE 0xFFEC0000 // Brisc
+#define TENSIX_MAILBOX1_BASE 0xFFEC1000 // Trisc0
+#define TENSIX_MAILBOX2_BASE 0xFFEC2000 // Trisc1
+#define TENSIX_MAILBOX3_BASE 0xFFEC3000 // Trisc2
+
+//Config registers
+#define TENSIX_CFG_BASE 0xFFEF0000 //0xFFEF0000 - 0xFFF00000
+
+// MOP config registers
+#define TENSIX_MOP_CFG_BASE 0xFFB80000 // 0xFFB8000 - 0xFFB8100
+
+// These addresses are defined by software convention
+#define L1_KERNEL_BASE 0x1F000 // This is a 128-bit address
+const static uint32_t L1_MATH_KERNEL_BASE = 0x1E000; // This is a 128-bit address
+#define L1_L0_DUMP 0x1D000 // This is a 128-bit address
+
+#define LOCAL_MEM_BASE_ADDR 0xFFB00000
+
+// TDMA register base
+#define RISCV_TDMA_REGS_START_ADDR 0xFFB11000
+#define RISCV_TDMA_REG_XMOV_SRC_ADDR 0xFFB11000
+#define RISCV_TDMA_REG_XMOV_DST_ADDR 0xFFB11004
+#define RISCV_TDMA_REG_XMOV_SIZE 0xFFB11008
+#define RISCV_TDMA_REG_XMOV_DIRECTION 0xFFB1100C
+#define RISCV_TDMA_REG_COMMAND_ADDR 0xFFB11010
+#define RISCV_TDMA_REG_STATUS 0xFFB11014
+#define RISCV_TDMA_REG_PACKED_SIZE 0xFFB11018
+#define RISCV_TDMA_REG_ACC_PACKED_SIZE 0xFFB1101C // read only
+#define RISCV_TDMA_REG_INITIAL_PACK_ACC 0xFFB1101C // write only
+#define RISCV_TDMA_REG_CLK_GATE_EN 0xFFB11024
+#define RISCV_TDMA_REG_CLK_GATE_HYST 0xFFB11028
+#define RISCV_TDMA_REG_XMOV_L1_BASE_ADDR 0xFFB1102C
+#define RISCV_TDMA_REG_FIFO_PACKED_TILE_SIZE(packer) (0xFFB11030 | (packer<<8))
+#define RISCV_TDMA_REG_FIFO_PACKED_TILE_ZEROMASK(packer) (0xFFB11034 | (packer<<8))
+#define RISCV_TDMA_REG_FIFO_PACKED_TILE_STATUS (0xFFB11038)
+
+#define RISCV_TDMA_PACKED_TILE_FIFO_EMPTY(status,packer) ((status>>(packer*2))&0x1)
+#define RISCV_TDMA_PACKED_TILE_FIFO_FULL(status,packer) ((status>>(packer*2+1))&0x1)
+#define RISCV_TDMA_STATUS_FLAG_MOVER0_BUSY_MASK 0x01
+#define RISCV_TDMA_STATUS_FLAG_MOVER1_BUSY_MASK 0x02
+#define RISCV_TDMA_STATUS_FLAG_FIFO_FULL_MASK 0x04
+#define RISCV_TDMA_STATUS_FLAG_FIFO_EMPTY_MASK 0x08
+#define RISCV_TDMA_STATUS_FLAG_ERROR_MASK 0x10
+
+// Debug registers
+/*
+#!/usr/bin/env lua
+
+-- Stupid little helper to generate the defines shown below
+
+root = assert(os.getenv("ROOT"), "Must provide ROOT env var pointing to repo root")
+f = assert(io.open(root ..
"/src/hardware/tensix/rtl/tt_risc_debug_regs.sv")) + +f = f:read("*a") + +for nm,addr in f:gmatch("localparam%s*([a-zA-Z0-9_]*)%s*=%s*32'h(%x*)%s*;") do + io.write"#define RISCV_DEBUG_REG_" + io.write(nm) + local num = 40 - #nm + if num < 0 then num = 0 end + local pad = string.rep(" ", num) + io.write(pad) + io.write"(RISCV_DEBUG_REGS_START_ADDR | 0x" + io.write(addr) + io.write")\n" +end +*/ +#define RISCV_DEBUG_REGS_START_ADDR 0xFFB12000 +#define RISCV_DEBUG_REG_PERF_CNT_INSTRN_THREAD0 (RISCV_DEBUG_REGS_START_ADDR | 0x0) +#define RISCV_DEBUG_REG_PERF_CNT_INSTRN_THREAD1 (RISCV_DEBUG_REGS_START_ADDR | 0x4) +#define RISCV_DEBUG_REG_PERF_CNT_INSTRN_THREAD2 (RISCV_DEBUG_REGS_START_ADDR | 0x8) +#define RISCV_DEBUG_REG_PERF_CNT_TDMA_UNPACK0 (RISCV_DEBUG_REGS_START_ADDR | 0xC) +#define RISCV_DEBUG_REG_PERF_CNT_TDMA_UNPACK1 (RISCV_DEBUG_REGS_START_ADDR | 0x10) +#define RISCV_DEBUG_REG_PERF_CNT_TDMA_UNPACK2 (RISCV_DEBUG_REGS_START_ADDR | 0x14) +#define RISCV_DEBUG_REG_PERF_CNT_FPU0 (RISCV_DEBUG_REGS_START_ADDR | 0x18) +#define RISCV_DEBUG_REG_PERF_CNT_FPU1 (RISCV_DEBUG_REGS_START_ADDR | 0x1C) +#define RISCV_DEBUG_REG_PERF_CNT_FPU2 (RISCV_DEBUG_REGS_START_ADDR | 0x20) +#define RISCV_DEBUG_REG_PERF_CNT_L1_0 (RISCV_DEBUG_REGS_START_ADDR | 0x30) +#define RISCV_DEBUG_REG_PERF_CNT_L1_1 (RISCV_DEBUG_REGS_START_ADDR | 0x34) +#define RISCV_DEBUG_REG_PERF_CNT_L1_2 (RISCV_DEBUG_REGS_START_ADDR | 0x38) +#define RISCV_DEBUG_REG_PERF_CNT_ALL (RISCV_DEBUG_REGS_START_ADDR | 0x3C) +#define RISCV_DEBUG_REG_DBG_L1_MEM_REG0 (RISCV_DEBUG_REGS_START_ADDR | 0x48) +#define RISCV_DEBUG_REG_DBG_L1_MEM_REG1 (RISCV_DEBUG_REGS_START_ADDR | 0x4C) +#define RISCV_DEBUG_REG_DBG_L1_MEM_REG2 (RISCV_DEBUG_REGS_START_ADDR | 0x50) +#define RISCV_DEBUG_REG_DBG_BUS_CTRL (RISCV_DEBUG_REGS_START_ADDR | 0x54) +#define RISCV_DEBUG_REG_TENSIX_CREG_READ (RISCV_DEBUG_REGS_START_ADDR | 0x58) +#define RISCV_DEBUG_REG_DBG_RD_DATA (RISCV_DEBUG_REGS_START_ADDR | 0x5C) +#define RISCV_DEBUG_REG_THREAD1_CREG_READ (RISCV_DEBUG_REGS_START_ADDR | 0x5C) +#define RISCV_DEBUG_REG_DBG_ARRAY_RD_EN (RISCV_DEBUG_REGS_START_ADDR | 0x60) +#define RISCV_DEBUG_REG_DBG_ARRAY_RD_CMD (RISCV_DEBUG_REGS_START_ADDR | 0x64) +#define RISCV_DEBUG_REG_DBG_FEATURE_DISABLE (RISCV_DEBUG_REGS_START_ADDR | 0x68) +#define RISCV_DEBUG_REG_DBG_ARRAY_RD_DATA (RISCV_DEBUG_REGS_START_ADDR | 0x6C) +#define RISCV_DEBUG_REG_CG_CTRL_HYST0 (RISCV_DEBUG_REGS_START_ADDR | 0x70) +#define RISCV_DEBUG_REG_CG_CTRL_HYST1 (RISCV_DEBUG_REGS_START_ADDR | 0x74) +#define RISCV_DEBUG_REG_TENSIX_CREG_RDDATA (RISCV_DEBUG_REGS_START_ADDR | 0x78) +#define RISCV_DEBUG_REG_CG_CTRL_HYST2 (RISCV_DEBUG_REGS_START_ADDR | 0x7C) +#define RISCV_DEBUG_REG_THREAD1_CREG_RDDATA (RISCV_DEBUG_REGS_START_ADDR | 0x7C) +#define RISCV_DEBUG_REG_RISC_DBG_CNTL_0 (RISCV_DEBUG_REGS_START_ADDR | 0x80) +#define RISCV_DEBUG_REG_RISC_DBG_CNTL_1 (RISCV_DEBUG_REGS_START_ADDR | 0x84) +#define RISCV_DEBUG_REG_RISC_DBG_STATUS_0 (RISCV_DEBUG_REGS_START_ADDR | 0x88) +#define RISCV_DEBUG_REG_RISC_DBG_STATUS_1 (RISCV_DEBUG_REGS_START_ADDR | 0x8C) +#define RISCV_DEBUG_REG_TRISC_PC_BUF_OVERRIDE (RISCV_DEBUG_REGS_START_ADDR | 0x90) +#define RISCV_DEBUG_REG_DBG_INVALID_INSTRN (RISCV_DEBUG_REGS_START_ADDR | 0x94) +#define RISCV_DEBUG_REG_DBG_INSTRN_BUF_CTRL0 (RISCV_DEBUG_REGS_START_ADDR | 0xA0) +#define RISCV_DEBUG_REG_DBG_INSTRN_BUF_CTRL1 (RISCV_DEBUG_REGS_START_ADDR | 0xA4) +#define RISCV_DEBUG_REG_DBG_INSTRN_BUF_STATUS (RISCV_DEBUG_REGS_START_ADDR | 0xA8) +#define RISCV_DEBUG_REG_STOCH_RND_MASK0 (RISCV_DEBUG_REGS_START_ADDR | 0xAC) +#define 
RISCV_DEBUG_REG_STOCH_RND_MASK1 (RISCV_DEBUG_REGS_START_ADDR | 0xB0) +#define RISCV_DEBUG_REG_FPU_STICKY_BITS (RISCV_DEBUG_REGS_START_ADDR | 0xB4) +#define RISCV_DEBUG_REG_ETH_RISC_PREFECTH_CTRL (RISCV_DEBUG_REGS_START_ADDR | 0xB8) +#define RISCV_DEBUG_REG_ETH_RISC_PREFECTH_PC (RISCV_DEBUG_REGS_START_ADDR | 0xBC) +#define RISCV_DEBUG_REG_PERF_CNT_TDMA_PACK0 (RISCV_DEBUG_REGS_START_ADDR | 0xF0) +#define RISCV_DEBUG_REG_PERF_CNT_TDMA_PACK1 (RISCV_DEBUG_REGS_START_ADDR | 0xF4) +#define RISCV_DEBUG_REG_PERF_CNT_TDMA_PACK2 (RISCV_DEBUG_REGS_START_ADDR | 0xF8) +#define RISCV_DEBUG_REG_PERF_CNT_OUT_L_INSTRN_THREAD (RISCV_DEBUG_REGS_START_ADDR | 0x100) +#define RISCV_DEBUG_REG_PERF_CNT_OUT_H_INSTRN_THREAD (RISCV_DEBUG_REGS_START_ADDR | 0x104) +#define RISCV_DEBUG_REG_PERF_CNT_OUT_L_TDMA_UNPACK (RISCV_DEBUG_REGS_START_ADDR | 0x108) +#define RISCV_DEBUG_REG_PERF_CNT_OUT_H_TDMA_UNPACK (RISCV_DEBUG_REGS_START_ADDR | 0x10C) +#define RISCV_DEBUG_REG_PERF_CNT_OUT_L_TDMA_PACK (RISCV_DEBUG_REGS_START_ADDR | 0x110) +#define RISCV_DEBUG_REG_PERF_CNT_OUT_H_TDMA_PACK (RISCV_DEBUG_REGS_START_ADDR | 0x114) +#define RISCV_DEBUG_REG_PERF_CNT_OUT_L_DBG_L1 (RISCV_DEBUG_REGS_START_ADDR | 0x118) +#define RISCV_DEBUG_REG_PERF_CNT_OUT_H_DBG_L1 (RISCV_DEBUG_REGS_START_ADDR | 0x11C) +#define RISCV_DEBUG_REG_PERF_CNT_OUT_L_FPU (RISCV_DEBUG_REGS_START_ADDR | 0x120) +#define RISCV_DEBUG_REG_PERF_CNT_OUT_H_FPU (RISCV_DEBUG_REGS_START_ADDR | 0x124) +#define RISCV_DEBUG_REG_SOFT_RESET_0 (RISCV_DEBUG_REGS_START_ADDR | 0x1B0) +#define RISCV_DEBUG_REG_ECC_CTRL (RISCV_DEBUG_REGS_START_ADDR | 0x1D0) +#define RISCV_DEBUG_REG_ECC_STATUS (RISCV_DEBUG_REGS_START_ADDR | 0x1D4) +#define RISCV_DEBUG_REG_WATCHDOG_TIMER (RISCV_DEBUG_REGS_START_ADDR | 0x1E0) +#define RISCV_DEBUG_REG_WDT_CNTL (RISCV_DEBUG_REGS_START_ADDR | 0x1E4) +#define RISCV_DEBUG_REG_WDT_STATUS (RISCV_DEBUG_REGS_START_ADDR | 0x1E8) +#define RISCV_DEBUG_REG_WALL_CLOCK_0 (RISCV_DEBUG_REGS_START_ADDR | 0x1F0) +#define RISCV_DEBUG_REG_WALL_CLOCK_1 (RISCV_DEBUG_REGS_START_ADDR | 0x1F4) +#define RISCV_DEBUG_REG_WALL_CLOCK_1_AT (RISCV_DEBUG_REGS_START_ADDR | 0x1F8) +#define RISCV_DEBUG_REG_TIMESTAMP_DUMP_CMD (RISCV_DEBUG_REGS_START_ADDR | 0x1FC) +#define RISCV_DEBUG_REG_TIMESTAMP_DUMP_CNTL (RISCV_DEBUG_REGS_START_ADDR | 0x200) +#define RISCV_DEBUG_REG_TIMESTAMP_DUMP_STATUS (RISCV_DEBUG_REGS_START_ADDR | 0x204) +#define RISCV_DEBUG_REG_TIMESTAMP_DUMP_BUF0_START_ADDR (RISCV_DEBUG_REGS_START_ADDR | 0x208) +#define RISCV_DEBUG_REG_TIMESTAMP_DUMP_BUF0_END_ADDR (RISCV_DEBUG_REGS_START_ADDR | 0x20C) +#define RISCV_DEBUG_REG_TIMESTAMP_DUMP_BUF1_START_ADDR (RISCV_DEBUG_REGS_START_ADDR | 0x210) +#define RISCV_DEBUG_REG_TIMESTAMP_DUMP_BUF1_END_ADDR (RISCV_DEBUG_REGS_START_ADDR | 0x214) +#define RISCV_DEBUG_REG_PERF_CNT_MUX_CTRL (RISCV_DEBUG_REGS_START_ADDR | 0x218) +#define RISCV_DEBUG_REG_DBG_L1_READBACK_OFFSET (RISCV_DEBUG_REGS_START_ADDR | 0x21C) +#define RISCV_DEBUG_REG_LFSR_HIT_MASK (RISCV_DEBUG_REGS_START_ADDR | 0x220) +#define RISCV_DEBUG_REG_DISABLE_RESET (RISCV_DEBUG_REGS_START_ADDR | 0x224) +#define RISCV_DEBUG_REG_TRISC0_RESET_PC (RISCV_DEBUG_REGS_START_ADDR | 0x228) +#define RISCV_DEBUG_REG_TRISC1_RESET_PC (RISCV_DEBUG_REGS_START_ADDR | 0x22C) +#define RISCV_DEBUG_REG_TRISC2_RESET_PC (RISCV_DEBUG_REGS_START_ADDR | 0x230) +#define RISCV_DEBUG_REG_TRISC_RESET_PC_OVERRIDE (RISCV_DEBUG_REGS_START_ADDR | 0x234) +#define RISCV_DEBUG_REG_NCRISC_RESET_PC (RISCV_DEBUG_REGS_START_ADDR | 0x238) +#define RISCV_DEBUG_REG_NCRISC_RESET_PC_OVERRIDE (RISCV_DEBUG_REGS_START_ADDR | 0x23C) +#define 
RISCV_DEBUG_REG_DEST_CG_CTRL (RISCV_DEBUG_REGS_START_ADDR | 0x240) +#define RISCV_DEBUG_REG_CG_CTRL_EN (RISCV_DEBUG_REGS_START_ADDR | 0x244) +#define RISCV_DEBUG_REG_CG_KICK (RISCV_DEBUG_REGS_START_ADDR | 0x248) + +//Here are the old manually-written defines that weren't covered by the +//generator script, or are being depended on by legacy code: +#define RISCV_DEBUG_REG_BREAKPOINT_CTRL (RISCV_DEBUG_REGS_START_ADDR | 0x1C0) +#define RISCV_DEBUG_REG_BREAKPOINT_STATUS (RISCV_DEBUG_REGS_START_ADDR | 0x1C4) +#define RISCV_DEBUG_REG_BREAKPOINT_DATA (RISCV_DEBUG_REGS_START_ADDR | 0x1C8) +#define RISCV_DEBUG_REG_INSTRN_BUF_CTRL0 (RISCV_DEBUG_REGS_START_ADDR | 0x0A0) +#define RISCV_DEBUG_REG_INSTRN_BUF_CTRL1 (RISCV_DEBUG_REGS_START_ADDR | 0x0A4) +#define RISCV_DEBUG_REG_INSTRN_BUF_STATUS (RISCV_DEBUG_REGS_START_ADDR | 0x0A8) +#define RISCV_DEBUG_REG_THREAD0_CREG_RDDATA (RISCV_DEBUG_REGS_START_ADDR | 0x078) +#define RISCV_DEBUG_REG_WALL_CLOCK_L (RISCV_DEBUG_REGS_START_ADDR | 0x1F0) +#define RISCV_DEBUG_REG_WALL_CLOCK_H (RISCV_DEBUG_REGS_START_ADDR | 0x1F8) +#define RISCV_DEBUG_REG_WDT (RISCV_DEBUG_REGS_START_ADDR | 0x1E0) +#define RISCV_DEBUG_REG_WDT_CNTL (RISCV_DEBUG_REGS_START_ADDR | 0x1E4) +#define RISCV_DEBUG_REG_WDT_STATUS (RISCV_DEBUG_REGS_START_ADDR | 0x1E8) + +typedef struct { + uint dbg_sig_sel:16; + uint dbg_daisy_sel:8; + uint dbg_rd_sel:4; + uint dbg_reg_ovrd_en:1; + uint dbg_daisy_en:1; + uint dbg_reserved:2; +} riscv_debug_reg_dbg_dbus_cntl_t; + +typedef union { + uint val; + riscv_debug_reg_dbg_dbus_cntl_t f; +} riscv_debug_reg_dbg_dbus_cntl_u; + +typedef struct { + uint mem_dump_mode:4; + uint skip_cycles:8; + uint mem_write:1; + uint mem_read:1; + uint reserved:18; +} riscv_debug_reg_dbg_l1_mem_reg2_t; + +typedef union { + uint val; + riscv_debug_reg_dbg_l1_mem_reg2_t f; +} riscv_debug_reg_dbg_l1_mem_reg2_u; + +#define SOFT_RESET_UNPACKER(arg) ((arg&0x3)<<0) +#define SOFT_RESET_PACKER(arg) ((arg&0xf)<<2) +#define SOFT_RESET_MOVER ((0x1)<<6) +#define SOFT_RESET_SEARCH ((0x1)<<7) +#define SOFT_RESET_GLUE ((0x1)<<8) +#define SOFT_RESET_THCON ((0x1)<<9) +#define SOFT_RESET_FPU ((0x1)<<10) +#define SOFT_RESET_RISC_CTRL(arg) ((arg&0xf)<<11) // Soft reset for RISCV cores. Bit 0 - Brisc, Bit 1+ - Trisc +#define SOFT_RESET_SRCA_REG ((0x1)<<15) +#define SOFT_RESET_SRCB_REG ((0x1)<<16) +#define SOFT_RESET_DEST_REG ((0x1)<<17) + +// TDMA flop register index offset +#define TDMA_FLOPREG_IDX_BASE(arg) ((arg) * 32) + +///////////// +// Interrupt controller definitions +#define RISC_PIC_BASE 0xFFB1'3000 +#define RISC_PIC_BASE_PTR ((uint32_t volatile *)0xFFB1'3000) +#define RISC_PIC_BRISC_SW_INT_EN (RISC_PIC_BASE_PTR + 0) +#define RISC_PIC_BRISC_HW_INT_EN (RISC_PIC_BASE_PTR + 1) +#define RISC_PIC_BRISC_INT_NO (RISC_PIC_BASE_PTR + 2) +#define RISC_PIC_NCRISC_SW_INT_EN (RISC_PIC_BASE_PTR + 3) +#define RISC_PIC_NCRISC_HW_INT_EN (RISC_PIC_BASE_PTR + 4) +#define RISC_PIC_NCRISC_INT_NO (RISC_PIC_BASE_PTR + 5) +#define RISC_PIC_SW_INT_REGS (RISC_PIC_BASE_PTR + 6) +#define RISC_PIC_HW_INTS (RISC_PIC_BASE_PTR + 38) +#define RISC_PIC_INT_PCS (RISC_PIC_BASE_PTR + 42) + + +///////////// +// Instruction macro definitions +// Consult instruction documentation in assembly.yaml +#define INSTRN_GETDESC(arg) (0x40000000 | (arg)) // Unimplemented. +#define INSTRN_PACRNL(arg) (0x41000000 | (arg)) // Pack row from DST to L0/L1 +#define INSTRN_UNPACR(arg) (0x42000000 | (arg)) // Unpack row from tile in L0 to SRCA/SRCB +#define INSTRN_SEARCHX(arg) (0x43000000 | (arg)) // Search for start of selected row within tile. 
To be invoked prior to each invocation of UNPACR.
+#define INSTRN_RSTDMA 0x44000000 // Soft reset of TDMA engine
+#define INSTRN_SET_DMA_REG(arg) (0x45000000 | (arg)) // Set TDMA register file register with 16b immediate value provided with instruction
+#define INSTRN_FLUSH_DMA(arg) (0x46000000 | (arg)) // Flush TDMA engine or some subset of it as specified by instruction argument
+#define INSTRN_MV_REG_TO_FLOPS(arg) (0x48000000 | (arg)) // Move data from TDMA register file into flip flops driving actual config signals. Used for certain TDMA configuration signal setting.
+#define INSTRN_LOAD_IND(arg) (0x49000000 | (arg)) // Load indirect from address specified in a TDMA register, with offset specified in TDMA register to a TDMA register. Supports autoincrementing offset
+#define INSTRN_AT_INCR_GET(arg) (0x61000000 | (arg)) // Atomic increment and get - will read value in targeted memory location and return it to TDMA register and post-increment it atomically
+#define INSTRN_AT_INCR_GET_PTR(arg) (0x62000000 | (arg)) // Atomic increment and get pointer - will access a memory location designated as a FIFO pointer location (contains a 32b read pointer and a 32b write pointer), return the pointer value to TDMA register and post-increment it unless the FIFO condition precludes that. For example, write pointer will not be incremented if FIFO is full. Read pointer will not be incremented if FIFO is empty. FIFO full or empty conditions are returned as an unsuccessful return condition code, so that the thread controller can retry until success (retry reads if FIFO empty, retry writes if FIFO full).
+#define INSTRN_AT_SWAP(arg) (0x63000000 | (arg)) // Atomic unconditional SWAP. Swaps selected 16b chunks of memory location with new ones provided on write data bus.
+#define INSTRN_AT_CAS(arg) (0x64000000 | (arg)) // Atomic compare-and-swap. If value at selected memory location matches that provided by programmer it is swapped to a new one, also provided by programmer. This instruction is implemented to support mutual exclusion between Tensix cores and threads
+#define INSTRN_STORE_IND(arg) (0x66000000 | (arg)) // Store indirect. Stores data from TDMA register to memory location specified by a combination of base+offset provided in other TDMA registers. Supports auto-increment on offset value.
+
+#define INSTRN_SETC16(arg) (0xb2000000 | (arg)) // Sets thread specific control register to the value stored in the slot argument. 32-bit instruction. Register index (bits 16-23), value (bits 15-0).
+#define INSTRN_WRCFG(arg) (0xb0000000 | (arg))
+#define INSTRN_RDCFG(arg) (0xb1000000 | (arg))
+
+#define INSTRN_SETC(arg) (0x80000000 | (arg)) // Sets thread specific control register to the value stored in the slot argument. 64-bit instruction. Register index in low 11 bits of first word, register value in second word. **Deprecated**
+#define INSTRN_SETRWC(arg) (0x38000000 | (arg)) //
+#define INSTRN_SETADC(arg) (0x50000000 | (arg)) // Set address counter for one channel and one dimension.
+#define INSTRN_SETADCXY(arg) (0x51000000 | (arg)) // Set address counters for X and Y dimensions for all channels
+#define INSTRN_SETADCZW(arg) (0x54000000 | (arg)) // Set address counters for Z and W dimensions for all channels
+#define INSTRN_FLUSH(arg) (0x81000000 | (arg)) // Flush all buffers of outstanding instructions, reads/writes.
+#define INSTRN_NOP(arg) (0x02000000 | (arg)) // Do nothing and consume an instruction slot and a cycle +#define INSTRN_MOVA2D(arg) (0x1a000000 | (arg)) // Move SRCA register to DST +#define INSTRN_ZEROSRC(arg) (0x1b000000 | (arg)) // Clear SRC registers +#define INSTRN_SETPKEDGEOF(arg) (0x1d000000 | (arg)) // Set packer edge masking offsets +#define INSTRN_STALLWAIT(arg) (0xa2000000 | (arg)) // Stall resource until condition is met +#define INSTRN_CLEAR_DVALID(arg) (0x37000000 | (arg)) // Clear dvalid bits +#define INSTRN_SEMINIT(arg) (0xa3000000 | (arg)) // Initialize a semaphore +#define INSTRN_ZEROACC(arg) (0x10000000 | (arg)) // Zero out the accumulator +#define INSTRN_SFPENCC(arg) (0x8a000000 | (arg)) // Enable the SFPU CC state +#define INSTRN_SFPLOADI(arg) (0x71000000 | (arg)) // Load an SFPU register +#define INSTRN_SFPCONFIG(arg) (0x91000000 | (arg)) // Set SFPU config register state + +#define TENSIX_UNHALT_VAL 0x40000000 // When written into PC_BUF_BASE, tensix core will unhalt and continue execution at the previous PC. +#define TENSIX_NEWPC_VAL(arg) (0x80000000 | (arg)) // Format a PC into a value that will unhalt the tensix core and jump to that PC. This value can be written into PC_BUF_BASE. +#define TENSIX_LOOP_PC_VAL(arg) (0x00000000 | (arg)) // Start a PC buffer loop +#define TENSIX_PC_SYNC(arg) (0xC0000000 | (arg)) // Sync - block until all kernels are done + +#define INSTRN_HALTF(arg) (0x90000000 | (arg)) // Final Halt PC, it will stop the thread in question from executing and only tensix reset can unhalt, can't be unhalted by usual register write. + +// Instruction modes (i.e., selection) definitions +#define INSTRN_SEL_L0 0 +#define INSTRN_SEL_L1 1 + +#define INSTRN_SEL_SIZE_16B 0 +#define INSTRN_SEL_SIZE_4B 1 +#define INSTRN_SEL_SIZE_2B 2 +#define INSTRN_SEL_SIZE_1B 3 + +#define INSTRN_SEL_AUTO_INC_NONE 0 +#define INSTRN_SEL_AUTO_INC_2B 1 +#define INSTRN_SEL_AUTO_INC_4B 2 +#define INSTRN_SEL_AUTO_INC_16B 3 + +#define INSTRN_SEL_RD_PTR 0 +#define INSTRN_SEL_WR_PTR 1 + +#define REG2FLOP_TARGET_TDMA 0 +#define REG2FLOP_TARGET_LOCAL_REGS 1 +#define REG2FLOP_TARGET_ADDR_CNTRS 2 + +#define BYTE_OFFSET_ZERO 0 +#define BYTE_OFFSET_ONE 1 +#define BYTE_OFFSET_TWO 2 +#define BYTE_OFFSET_THREE 3 + +// Address defines for "SETC registers" aka "Local registers" -- see src/meta/regspecs/local_regs.yaml +// FIXME: This needs to be generated from that yaml file... 
it went out of date without anyone noticing :( +/* +#define ALU_FORMAT_SPEC_REG 1 +#define DEST_TARGET_REG_CFG 2 +//#define MISC_CFG 3 +#define ALU_FORMAT_SPEC_REG0 4 +#define ALU_FORMAT_SPEC_REG1 5 +#define ALU_FORMAT_SPEC_REG2 6 +#define UNP0_ADDR_CTRL_XY_REG_0 8 +#define UNP0_ADDR_CTRL_ZW_REG_0 9 +#define UNP0_ADDR_BASE_REG_0 10 +#define UNP0_ADDR_CTRL_XY_REG_1 11 +#define UNP0_ADDR_CTRL_ZW_REG_1 12 +#define UNP0_ADDR_BASE_REG_1 13 +#define UNP1_ADDR_CTRL_XY_REG_0 14 +#define UNP1_ADDR_CTRL_ZW_REG_0 15 +#define UNP1_ADDR_BASE_REG_0 16 +#define UNP1_ADDR_CTRL_XY_REG_1 17 +#define UNP1_ADDR_CTRL_ZW_REG_1 18 +#define UNP1_ADDR_BASE_REG_1 19 +#define PCK0_ADDR_CTRL_XY_REG_0 20 +#define PCK0_ADDR_CTRL_ZW_REG_0 21 +#define PCK0_ADDR_BASE_REG_0 22 +#define PCK0_ADDR_CTRL_XY_REG_1 23 +#define PCK0_ADDR_CTRL_ZW_REG_1 24 +#define PCK0_ADDR_BASE_REG_1 25 +#define SRCA_REGW_BASE 26 +#define SRCB_REGW_BASE 27 +#define DEST_REGW_BASE 28 +#define MATH_FIDELITY_CTRL 29 +#define LOOP_CNT_REG0 32 +#define LOOP_CNT_REG1 33 +#define LOOP_CNT_REG2 34 +#define LOOP_CNT_REG3 35 +#define LOOP_CNT_REG4 36 +#define LOOP_CNT_REG5 37 +#define LOOP_CNT_REG6 38 +#define LOOP_CNT_REG7 39 +#define LOOP_CNT_REG8 40 +#define LOOP_CNT_REG9 41 +#define LOOP_CNT_REG10 42 +#define LOOP_CNT_REG11 43 +#define MATH_FIDELITY 44 +#define TXC_IC_INVALIDATE 45 +#define RISCV_IC_INVALIDATE 46 +#define STACC_RELU 47 +#define PCK_EDGE_OFFSET 48 +#define DEST_OFFSET 49 +#define DEBUG_MUX_CTRL 50 +#define DEBUG_MUX_RD 51 +#define SET_ADDRCNT_PROG_INC 52 +#define SET_REGWCNT_PROG_INC 53 +#define DEBUG_ARRAY_RD_CMD 54 +#define DEBUG_ARRAY_RD_EN 55 +#define CG_CTRL_EN 56 +#define CG_CTRL_KICK 57 +#define PERF_CNT_CMD0 58 +#define PERF_CNT_CMD1 59 +#define PERF_CNT_CMD2 60 +#define PERF_CNT_CMD3 61 +#define ENABLE_ACC_STATS 62 +#define DISABLE_RISC_BP 63 +*/ + +#define ADDR_16K 0x4000 +#define ADDR_32K 0x8000 +#define ADDR_64K 0x10000 +#define ADDR_128K 0x20000 +#define ADDR_256K 0x40000 +#define ADDR_512K 0x80000 +#define ONE_16B 16 +#define ONE_32b 4 + +#define GET_16B_ADDR(arg) ((arg) >> 4) + +// Tensix general purpose register file, 64 32-bit registers +static constexpr unsigned int R0 = 0; +static constexpr unsigned int R1 = 1; +static constexpr unsigned int R2 = 2; +static constexpr unsigned int R3 = 3; +static constexpr unsigned int R4 = 4; +static constexpr unsigned int R5 = 5; +static constexpr unsigned int R6 = 6; +static constexpr unsigned int R7 = 7; +static constexpr unsigned int R8 = 8; +static constexpr unsigned int R9 = 9; +static constexpr unsigned int R10 = 10; +static constexpr unsigned int R11 = 11; +static constexpr unsigned int R12 = 12; +static constexpr unsigned int R13 = 13; +static constexpr unsigned int R14 = 14; +static constexpr unsigned int R15 = 15; +static constexpr unsigned int R16 = 16; +static constexpr unsigned int R17 = 17; +static constexpr unsigned int R18 = 18; +static constexpr unsigned int R19 = 19; +static constexpr unsigned int R20 = 20; +static constexpr unsigned int R21 = 21; +static constexpr unsigned int R22 = 22; +static constexpr unsigned int R23 = 23; +static constexpr unsigned int R24 = 24; +static constexpr unsigned int R25 = 25; +static constexpr unsigned int R26 = 26; +static constexpr unsigned int R27 = 27; +static constexpr unsigned int R28 = 28; +static constexpr unsigned int R29 = 29; +static constexpr unsigned int R30 = 30; +static constexpr unsigned int R31 = 31; +static constexpr unsigned int R32 = 32; +static constexpr unsigned int R33 = 33; +static constexpr unsigned int R34 = 
34; +static constexpr unsigned int R35 = 35; +static constexpr unsigned int R36 = 36; +static constexpr unsigned int R37 = 37; +static constexpr unsigned int R38 = 38; +static constexpr unsigned int R39 = 39; +static constexpr unsigned int R40 = 40; +static constexpr unsigned int R41 = 41; +static constexpr unsigned int R42 = 42; +static constexpr unsigned int R43 = 43; +static constexpr unsigned int R44 = 44; +static constexpr unsigned int R45 = 45; +static constexpr unsigned int R46 = 46; +static constexpr unsigned int R47 = 47; +static constexpr unsigned int R48 = 48; +static constexpr unsigned int R49 = 49; +static constexpr unsigned int R50 = 50; +static constexpr unsigned int R51 = 51; +static constexpr unsigned int R52 = 52; +static constexpr unsigned int R53 = 53; +static constexpr unsigned int R54 = 54; +static constexpr unsigned int R55 = 55; +static constexpr unsigned int R56 = 56; +static constexpr unsigned int R57 = 57; +static constexpr unsigned int R58 = 58; +static constexpr unsigned int R59 = 59; +static constexpr unsigned int R60 = 60; +static constexpr unsigned int R61 = 61; +static constexpr unsigned int R62 = 62; +static constexpr unsigned int R63 = 63; + +// this is a "short" (i.e., 16-bit) interface to the 32-bit Tensix registers +// we can access LO or HI 16-bits of each 32-bit register +#define R0_LO 0 +#define R0_HI 1 +#define R1_LO 2 +#define R1_HI 3 +#define R2_LO 4 +#define R2_HI 5 +#define R3_LO 6 +#define R3_HI 7 +#define R4_LO 8 +#define R4_HI 9 +#define R5_LO 10 +#define R5_HI 11 +#define R6_LO 12 +#define R6_HI 13 +#define R7_LO 14 +#define R7_HI 15 +#define R8_LO 16 +#define R8_HI 17 +#define R9_LO 18 +#define R9_HI 19 +#define R10_LO 20 +#define R10_HI 21 +#define R11_LO 22 +#define R11_HI 23 +#define R12_LO 24 +#define R12_HI 25 +#define R13_LO 26 +#define R13_HI 27 +#define R14_LO 28 +#define R14_HI 29 +#define R15_LO 30 +#define R15_HI 31 +#define R16_LO 32 +#define R16_HI 33 +#define R17_LO 34 +#define R17_HI 35 +#define R18_LO 36 +#define R18_HI 37 +#define R19_LO 38 +#define R19_HI 39 +#define R20_LO 40 +#define R20_HI 41 +#define R21_LO 42 +#define R21_HI 43 +#define R22_LO 44 +#define R22_HI 45 +#define R23_LO 46 +#define R23_HI 47 +#define R24_LO 48 +#define R24_HI 49 +#define R25_LO 50 +#define R25_HI 51 +#define R26_LO 52 +#define R26_HI 53 +#define R27_LO 54 +#define R27_HI 55 +#define R28_LO 56 +#define R28_HI 57 +#define R29_LO 58 +#define R29_HI 59 +#define R30_LO 60 +#define R30_HI 61 +#define R31_LO 62 +#define R31_HI 63 +#define R32_LO 64 +#define R32_HI 65 +#define R33_LO 66 +#define R33_HI 67 +#define R34_LO 68 +#define R34_HI 69 +#define R35_LO 70 +#define R35_HI 71 +#define R36_LO 72 +#define R36_HI 73 +#define R37_LO 74 +#define R37_HI 75 +#define R38_LO 76 +#define R38_HI 77 +#define R39_LO 78 +#define R39_HI 79 +#define R40_LO 80 +#define R40_HI 81 +#define R41_LO 82 +#define R41_HI 83 +#define R42_LO 84 +#define R42_HI 85 +#define R43_LO 86 +#define R43_HI 87 +#define R44_LO 88 +#define R44_HI 89 +#define R45_LO 90 +#define R45_HI 91 +#define R46_LO 92 +#define R46_HI 93 +#define R47_LO 94 +#define R47_HI 95 +#define R48_LO 96 +#define R48_HI 97 +#define R49_LO 98 +#define R49_HI 99 +#define R50_LO 100 +#define R50_HI 101 +#define R51_LO 102 +#define R51_HI 103 +#define R52_LO 104 +#define R52_HI 105 +#define R53_LO 106 +#define R53_HI 107 +#define R54_LO 108 +#define R54_HI 109 +#define R55_LO 110 +#define R55_HI 111 +#define R56_LO 112 +#define R56_HI 113 +#define R57_LO 114 +#define R57_HI 115 +#define R58_LO 116 +#define 
R58_HI 117
+#define R59_LO 118
+#define R59_HI 119
+#define R60_LO 120
+#define R60_HI 121
+#define R61_LO 122
+#define R61_HI 123
+#define R62_LO 124
+#define R62_HI 125
+#define R63_LO 126
+#define R63_HI 127
+
+typedef enum {
+    UNP0 = 1,
+    UNP1 = 2,
+    PCK0 = 4
+} cnt_id_t;
+
+#ifdef CPU_JAWBRIDGE
+#define TENSIX_MAX_KERNEL_LOOP_COUNT 128u
+#else
+#define TENSIX_MAX_KERNEL_LOOP_COUNT 65535u
+#endif
+
+/////////////
+
+
+template <class T>
+inline T bitmask(unsigned int bits)
+{
+    static_assert(!std::numeric_limits<T>::is_signed, "bitmask type must be unsigned");
+
+    // just a limitation of the implementation:
+    // we use digits() to see if 1 << bits is representable
+    static_assert(std::numeric_limits<T>::radix == 2, "bitmask type must be radix 2");
+
+    return (bits == std::numeric_limits<T>::digits) ? std::numeric_limits<T>::max() : (T(1) << bits) - 1;
+}
+
+template <class T>
+inline typename std::make_unsigned<T>::type pack_field(T x, unsigned int to_shift)
+{
+    typedef typename std::make_unsigned<T>::type u_T;
+    u_T u_x(x);
+
+    // verify that no bits are shifted away
+    //assert((u_x & (std::numeric_limits<u_T>::max() << (std::numeric_limits<u_T>::digits - to_shift))) == 0);
+
+    return u_x << to_shift;
+}
+
+template <class T>
+inline typename std::make_unsigned<T>::type pack_field(T x, unsigned int bits, unsigned int to_shift)
+{
+    typename std::make_unsigned<T>::type u_x(x);
+
+    // assert((u_x & ~bitmask(bits)) == 0);
+    // assert(bits + to_shift <= std::numeric_limits<T>::digits);
+
+    return u_x << to_shift;
+}
+
+template <class T>
+inline typename std::make_unsigned<T>::type pack_field(T x, unsigned int bits, unsigned int from_shift, unsigned int to_shift)
+{
+    typename std::make_unsigned<T>::type u_x(x);
+
+    // assert(bits + to_shift <= std::numeric_limits<T>::digits);
+    // assert(bits + from_shift <= std::numeric_limits<T>::digits);
+
+    return ((u_x >> from_shift) & bitmask<typename std::make_unsigned<T>::type>(bits)) << to_shift;
+}
+
+#define IRQ_HANDLER __attribute__ ((interrupt("machine"), noinline, used))
+
+#define ADC_FLOP_ADDR(addr, counter_id, channel_index, dimension_index) do { \
+    if ((channel_index == 0) && (counter_id == UNP0) && (dimension_index == 0)) addr = 0; \
+    else if ((channel_index == 0) && (counter_id == UNP0) && (dimension_index == 1)) addr = 1; \
+    else if ((channel_index == 0) && (counter_id == UNP0) && (dimension_index == 2)) addr = 2; \
+    else if ((channel_index == 0) && (counter_id == UNP0) && (dimension_index == 3)) addr = 3; \
+    else if ((channel_index == 0) && (counter_id == UNP1) && (dimension_index == 0)) addr = 8; \
+    else if ((channel_index == 0) && (counter_id == UNP1) && (dimension_index == 1)) addr = 9; \
+    else if ((channel_index == 0) && (counter_id == UNP1) && (dimension_index == 2)) addr = 10; \
+    else if ((channel_index == 0) && (counter_id == UNP1) && (dimension_index == 3)) addr = 11; \
+    else if ((channel_index == 0) && (counter_id == PCK0) && (dimension_index == 0)) addr = 16; \
+    else if ((channel_index == 0) && (counter_id == PCK0) && (dimension_index == 1)) addr = 17; \
+    else if ((channel_index == 0) && (counter_id == PCK0) && (dimension_index == 2)) addr = 18; \
+    else if ((channel_index == 0) && (counter_id == PCK0) && (dimension_index == 3)) addr = 19; \
+    else if ((channel_index == 1) && (counter_id == UNP0) && (dimension_index == 0)) addr = 32; \
+    else if ((channel_index == 1) && (counter_id == UNP0) && (dimension_index == 1)) addr = 33; \
+    else if ((channel_index == 1) && (counter_id == UNP0) && (dimension_index == 2)) addr = 34; \
+    else if ((channel_index == 1) && (counter_id == UNP0) && (dimension_index == 3)) addr = 35; \
+    else if ((channel_index == 1) && (counter_id == UNP1) && (dimension_index == 0)) addr = 40; \
+    else if ((channel_index == 1) && (counter_id == UNP1) && (dimension_index == 1)) addr = 41; \
+    else if ((channel_index == 1) && (counter_id == UNP1) && (dimension_index == 2)) addr = 42; \
+    else if ((channel_index == 1) && (counter_id == UNP1) && (dimension_index == 3)) addr = 43; \
+    else if ((channel_index == 1) && (counter_id == PCK0) && (dimension_index == 0)) addr = 48; \
+    else if ((channel_index == 1) && (counter_id == PCK0) && (dimension_index == 1)) addr = 49; \
+    else if ((channel_index == 1) && (counter_id == PCK0) && (dimension_index == 2)) addr = 50; \
+    else if ((channel_index == 1) && (counter_id == PCK0) && (dimension_index == 3)) addr = 51; \
+    else addr = 0; \
+} while (0)
+
+#endif
diff --git a/tt_metal/hw/inc/blackhole/tensix_types.h b/tt_metal/hw/inc/blackhole/tensix_types.h
new file mode 100644
index 000000000000..879aab6c574d
--- /dev/null
+++ b/tt_metal/hw/inc/blackhole/tensix_types.h
@@ -0,0 +1,372 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0

+#ifndef TENSIX_TYPES_H_INCLUDED
+#define TENSIX_TYPES_H_INCLUDED
+
+#include <cstdint>
+
+#ifndef TENSIX_FIRMWARE
+#include <boost/format.hpp>
+#endif
+
+//
+// tensix_types.h
+// This file contains tensix structures used by RISCV firmware and test-bench/tests
+//
+// Copyright © 2018 Tenstorrent. All rights reserved.
+//
+
+/////////////
+// Global enums and defines
+////////////
+typedef enum
+{
+    XMOV_L0_TO_L1 = 0,
+    XMOV_L1_TO_L0 = 1,
+    XMOV_L0_TO_L0 = 2,
+    XMOV_L1_TO_L1 = 3,
+} xmov_direction_t;
+
+typedef enum
+{
+    TDMA_MOVER0 = 0,
+    TDMA_MOVER1 = 1
+} tdma_mover_id_t;
+
+typedef enum {
+    MATH_HF = 1,
+    MATH_AUTO = 2,
+    MATH_LF = 4
+} math_fidelity_t;
+
+typedef enum {
+    RELU_NONE = 0,
+    RELU_PLAIN = 1,
+    RELU_THRESH = 2,
+    RELU_MAX = 3
+} relu_mode_t;
+
+typedef enum {
+    STOCH_RND_NONE = 0,
+    STOCH_RND_FPU = 1,
+    STOCH_RND_GASKET = 2,
+    STOCH_RND_PACKER = 4
+} stochastic_round_settings_t;
+
+/////////////
+// TDMA Registers
+////////////
+typedef struct {
+    uint32_t row_section_size : 16;
+    uint32_t exp_section_size : 16;
+    uint32_t tile_dst_addr : 32;
+    uint32_t uncompressed : 1;
+    uint32_t reserved_0 : 3;
+    uint32_t out_data_format : 2;
+    uint32_t reserved_1 : 2;
+    uint32_t in_data_format : 2;
+    uint32_t reserved_2 : 22;
+    uint32_t reserved_3 : 32;
+} packer_config_t; //16B
+
+typedef struct {
+    uint32_t rd_ptr ;
+    uint32_t wr_ptr ;
+    uint32_t rsvd0 ;
+    uint32_t rsvd1 ;
+#ifndef TENSIX_FIRMWARE
+    operator std::string() const {
+        return (boost::format("Fifo Control: rd_ptr(0x%08x) wr_ptr(0x%08x)")
+            % rd_ptr
+            % wr_ptr).str() ;
+    }
+#endif
+} fifo_ctl_t ;
+
+typedef struct {
+    uint32_t val[4];
+    packer_config_t f;
+} packer_config_u;
+
+typedef struct {
+    uint32_t src_addr : 32;
+    uint32_t dst_addr : 32;
+    uint32_t xfer_size : 32;
+    uint32_t xfer_dir : 2;
+    uint32_t reserved_0 : 30;
+} mover_config_t; //16B
+
+typedef struct {
+    uint32_t val[4];
+    mover_config_t f;
+} mover_config_u;
+
+/////////////
+// Data section structures
+/////////////
+
+// Tile descriptor
+typedef struct {
+    uint32_t data_format : 4;
+    uint32_t uncompressed: 1;
+    uint32_t reserved_0 : 3;
+    uint32_t blobs_per_xy_plane : 4;
+    uint32_t reserved_1 : 4;
+    uint32_t x_dim : 16;
+    uint32_t y_dim : 16;
+    uint32_t z_dim : 16;
+    uint32_t w_dim : 16;
+    uint32_t blobs_y_start : 32;
+    uint32_t digest_type : 8; // Not used
+    uint32_t digest_size : 8; // Not used
+} tile_descriptor_t; // Unpack configuration
+
+typedef union {
+    uint32_t
val[4]; + tile_descriptor_t f; +} tile_descriptor_u; + +struct TileHeader +{ + // occupied part of the 16B line + std::uint16_t tile_size_16B = 0; + std::uint16_t reserved_0_mbz : 1; + std::uint16_t tile_id : 15; + + std::uint8_t metadata_size_16B = 0; + std::uint8_t reserved_1 = 0; + std::uint16_t format = 0x10; // [3:0] format, 4-uncompress flag. + + std::uint32_t zero_mask = 0; + std::uint32_t reserved_3 = 0; + + TileHeader() + : reserved_0_mbz(0), tile_id(0) {} + + bool IsCompressed() const { return ((format & 0x10) == 0); } + +#ifndef TENSIX_FIRMWARE + operator std::string() const { + return (boost::format("TileHeader:tile_id(0x%04x) size16B(0x%04x)") + % tile_id + % tile_size_16B).str() ; + } + + std::size_t size() const { return 16; } + const void *data() const { return this; } + typedef std::uint8_t value_type; + + bool operator!=(const TileHeader& rhs) const + { + bool result = tile_size_16B != rhs.tile_size_16B + || tile_id != rhs.tile_id + || metadata_size_16B != rhs.metadata_size_16B; + return result; + } + +#endif +}; + +union TileHeader_u { + uint32_t val[4]; + TileHeader header; + TileHeader_u() { }; +}; + +static_assert(sizeof(TileHeader) == 16, "TileHeader must be 16B"); + +struct SectionHeader +{ + // occupied part of the 16B line + std::uint16_t section_id; + std::uint16_t section_size; + std::uint16_t tile_count; + + // unoccupied part of the 16B line + std::uint16_t reserved[5]; + +#ifndef TENSIX_FIRMWARE + operator std::string() const { + return (boost::format("SectionHeader: id(0x%04x) size(0x%04x) tile_count(0x%04x)") + % section_id + % section_size + % tile_count).str() ; + } +#endif +}; + +// Actually it only has to be a multiple of 16B +static_assert(sizeof(SectionHeader) == 16, "struct section_header must be 16 bytes"); + +static constexpr std::uint32_t TEST_MSG_EN_TENSIX_PM = 0; +static constexpr std::uint32_t TEST_MSG_DBG_DISABLE = 1; +static constexpr std::uint32_t TEST_MSG_SET_MAX_EXP_THRESH = 2; +static constexpr std::uint32_t TEST_MSG_RISC_BP_DISABLE = 3; +static constexpr std::uint32_t TEST_MSG_SET_RELU_PARAMS = 4; +static constexpr std::uint32_t TEST_MSG_SET_PRNG_SEED = 5; +static constexpr std::uint32_t TEST_MSG_RISC_PREFETCHER_CTRL = 6; +static constexpr std::uint32_t TEST_MSG_SYNTH_CKERNEL = 10; + +static constexpr std::uint32_t COMMAND_QUEUE_SIZE_BYTES_LOG2 = 16; +static constexpr std::uint32_t COMMAND_QUEUE_SIZE_BYTES = 1 << COMMAND_QUEUE_SIZE_BYTES_LOG2; +static constexpr std::uint32_t COMMAND_SIZE_BYTES_LOG2 = 5; +static constexpr std::uint32_t COMMAND_SIZE_BYTES = 1 << COMMAND_SIZE_BYTES_LOG2; + +static constexpr std::uint32_t DEST_FACE_WIDTH = 16; +static constexpr std::uint32_t DEST_FACE_WIDTH_LOG2 = 4; +static constexpr std::uint32_t DEST_FACE_HEIGHT = 16; +static constexpr std::uint32_t DEST_FACE_HEIGHT_LOG2 = 4; +static constexpr std::uint32_t DEST_REGISTER_FULL_SIZE = 64 * DEST_FACE_HEIGHT; +static constexpr std::uint32_t DEST_REGISTER_FULL_SIZE_LOG2 = 12; +static constexpr std::uint32_t DEST_REGISTER_HALF_SIZE = DEST_REGISTER_FULL_SIZE / 2; +static constexpr std::uint32_t BIT32_DEST_REGISTER_HALF_SIZE = DEST_REGISTER_HALF_SIZE / 2; + +static constexpr std::uint32_t DEST_REGISTER_FULL_SIZE_BYTES = DEST_REGISTER_FULL_SIZE * 2 * 16; +static constexpr std::uint32_t DEST_REGISTER_HALF_SIZE_BYTES = DEST_REGISTER_FULL_SIZE_BYTES / 2; + +static constexpr std::uint32_t SIM_L1_SIZE = 0x16E000; // 1.5MB - 72KB +#ifdef TENSIX_FIRMWARE +static constexpr std::uint32_t L1_SIZE = 0x16E000; // 1.5MB - 72KB +#else +static constexpr std::uint32_t L1_SIZE = 
0x16E000; // 1.5MB - 72KB +#endif + +// Voluntary FIFO alignment so that we can pack fifo address down to 16 bits in the command. +// At 8, we can cover 16MB. The upper limit is 15 because the command queue is at 32K (by default) +// Even though the command queue address never goes into a command, assertions complain if it is misaligned. +// (Hardware only requires 16B alignment.) +static constexpr std::uint32_t FIFO_BASE_ADDRESS_ALIGN_BITS = 9; +static constexpr std::uint32_t FIFO_BASE_ADDRESS_ALIGN = 1 << FIFO_BASE_ADDRESS_ALIGN_BITS; + +enum class DataFormat : std::uint8_t +{ + Float32 = 0, + Float16 = 1, + Bfp8 = 2, + Bfp4 = 3, + Bfp2 = 11, + Float16_b = 5, + Bfp8_b = 6, + Bfp4_b = 7, + Bfp2_b = 15, + Lf8 = 10, + Int8 = 14, + Int32 = 8, + Int16 = 9, + Tf32 = 4, + Fp8_e4m3 = 26, //Not a valid HW encoding, it is Lf8 encoding + extra 5th bit set to specify Lf8 with E4M3 + Uint8 = 129, // Not a valid HW enum value, but useful to have it here for SW + testMan7 = 0x82, // intermediate format for testing: 7bit mantissa (6+hidden) + testMan2 = 0x8A, // intermediate format for testing: 2bit mantissa (2+hidden) + Invalid = 0xff +}; + +struct io_queue_pointers_t { + + static constexpr std::uint32_t INVALID_IO_QUEUE_POINTER = 0xfeedface; + static constexpr std::uint32_t WRAP_MASK = 0x80000000; + static constexpr std::uint32_t MAX_IO_QUEUES = 256; + static constexpr std::uint32_t INPUT_IO_QUEUES = 64; + + std::uint32_t rdptr; + std::uint32_t wrptr; + std::uint32_t base_addr; + std::uint32_t data_size_16B; + std::uint32_t buffer_size_16B; + + + inline void init_input_queue(std::uint32_t buffer_start, std::uint32_t buffer_end, std::uint32_t data_size) volatile { + base_addr = buffer_start; + rdptr = buffer_start; + data_size_16B = data_size >> 4; + buffer_size_16B = (buffer_end - buffer_start) >> 4; + } + + inline void init_output_queue(std::uint32_t buffer_start, std::uint32_t buffer_end, std::uint32_t data_size) volatile { + base_addr = buffer_start; + wrptr = buffer_start; + data_size_16B = data_size >> 4; + buffer_size_16B = (buffer_end - buffer_start) >> 4; + } + + inline void reset() volatile { + rdptr = INVALID_IO_QUEUE_POINTER; + wrptr = INVALID_IO_QUEUE_POINTER; + } + + inline bool valid() volatile { + return (rdptr != INVALID_IO_QUEUE_POINTER); + } + + inline std::uint32_t get_buffer_end() const volatile + { + return base_addr + (buffer_size_16B << 4); + } + + inline void increment_rd_pointer() volatile { + if (!valid()) + return; + std::uint32_t new_rdptr = rdptr + (data_size_16B << 4); + if ((new_rdptr & ~WRAP_MASK) >= get_buffer_end()) { + if (wrap_bit(new_rdptr)) { + new_rdptr = base_addr; + } else { + new_rdptr = WRAP_MASK | base_addr; + } + } + rdptr = new_rdptr; + } + + inline bool wrap_bit(std::uint32_t ptr) volatile + { + return (ptr & WRAP_MASK) != 0; + } + + inline void increment_wr_pointer() volatile { + if (wrptr == INVALID_IO_QUEUE_POINTER) + return; + std::uint32_t new_wrptr = wrptr + (data_size_16B << 4); + if ((new_wrptr & ~WRAP_MASK) >= get_buffer_end()) { + if (wrap_bit(new_wrptr)) { + new_wrptr = base_addr; + } else { + new_wrptr = WRAP_MASK | base_addr; + } + } + wrptr = new_wrptr; + } + + inline void set_wr_pointer(std::uint32_t value) volatile { + wrptr = value; + } + + inline void set_rd_pointer(std::uint32_t value) volatile { + rdptr = value; + } + + inline bool empty() volatile { + return rdptr == wrptr; + } + + inline bool full() volatile { + auto wrapped_rdptr = rdptr ^ WRAP_MASK; + return wrapped_rdptr == wrptr; + } + + inline bool has_data() volatile { + return 
(rdptr != INVALID_IO_QUEUE_POINTER) and (wrptr != INVALID_IO_QUEUE_POINTER) and (not empty());
+    }
+
+    inline std::uint32_t unwrap_ptr(std::uint32_t value) const volatile
+    {
+        if (value == INVALID_IO_QUEUE_POINTER) {
+            return value;
+        }
+        return value & ~WRAP_MASK;
+    }
+
+};
+
+#endif
diff --git a/tt_metal/hw/inc/wormhole/epoch_q.h b/tt_metal/hw/inc/wormhole/epoch_q.h
deleted file mode 100644
index d7cc0a2944e1..000000000000
--- a/tt_metal/hw/inc/wormhole/epoch_q.h
+++ /dev/null
@@ -1,55 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include <cstdint>
-
-namespace epoch_queue {
-/**
- * @brief Configuration parameters of the Epoch Queue on the "Silicon" device.
-*/
-    static constexpr std::int32_t EPOCH_Q_NUM_SLOTS = 32; // needs to match param with same name in ncrisc - parametrized by arch
-    static constexpr std::int32_t EPOCH_Q_SLOT_SIZE = 32; // needs to match param with same name in ncrisc - parametrized by arch
-    static constexpr std::int32_t GridSizeRow = 16;
-    static constexpr std::int32_t GridSizeCol = 16;
-    static constexpr std::int32_t EpochEndReached = 0xFFFFFFFF;
-
-
-/**
- * @brief Silicon device epoch queue command interpreted by NCRISC/ERISC FW.
- */
-    enum EpochQueueCmd
-    {
-        EpochCmdValid = 0x1,
-        EpochCmdNotValid = 0x2,
-        EpochCmdIOQueueUpdate = 0x3,
-        EpochCmdEndProgram = 0xF,
-    };
-
-    struct IOQueueUpdateCmdInfo {
-        uint64_t queue_header_addr;
-
-        uint8_t num_buffers;
-        uint8_t reader_index;
-        uint8_t num_readers;
-        // In full update mode: update_mask = 0xff
-        uint8_t update_mask;
-
-        uint32_t header[5]; // The first 5 words of the header
-    };
-
-    static constexpr std::int32_t EPOCH_Q_WRPTR_OFFSET = 4;
-    static constexpr std::int32_t EPOCH_Q_RDPTR_OFFSET = 0;
-    static constexpr std::int32_t EPOCH_Q_SLOTS_OFFSET = 32;
-
-    static constexpr std::int32_t EPOCH_TABLE_ENTRY_SIZE_BYTES = EPOCH_Q_NUM_SLOTS*EPOCH_Q_SLOT_SIZE+EPOCH_Q_SLOTS_OFFSET;
-    static constexpr std::int32_t QUEUE_UPDATE_BLOB_SIZE_BYTES = 120 * 8;
-
-    static constexpr std::int32_t DRAM_PERF_SCRATCH_SIZE_BYTES = 8 * 1024 * 1024;
-    // Starting from address 0, epoch queues start at 40MByte - sizeof(All epoch queues on the chip)
-    // i.e top of epoch q table is @ 40MByte.
-    static constexpr std::int32_t EPOCH_TABLE_DRAM_ADDR = DRAM_PERF_SCRATCH_SIZE_BYTES-GridSizeCol*GridSizeRow*EPOCH_TABLE_ENTRY_SIZE_BYTES;
-
-} // namespace epoch_queue
diff --git a/tt_metal/hw/inc/wormhole/noc/noc_overlay.c b/tt_metal/hw/inc/wormhole/noc/noc_overlay.c
deleted file mode 100644
index 26a27038c674..000000000000
--- a/tt_metal/hw/inc/wormhole/noc/noc_overlay.c
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-
-#include "noc_overlay.h"
-
-#include <stdint.h>
-#include <stdbool.h>
-#include "noc_overlay_parameters.h"
-
-
-#ifdef TB_NOC
-
-#include "noc_api_dpi.h"
-
-#else
-
-#define STREAM_WRITE_REG(id, reg_idx, val) ( ( *( (volatile uint32_t*)( OVERLAY_REGS_START_ADDR + ((id)*STREAM_REG_SPACE_SIZE) + ((reg_idx)*4)) ) ) = (val) )
-#define STREAM_READ_REG(id, reg_idx ) ( *( (volatile uint32_t*)( OVERLAY_REGS_START_ADDR + ((id)*STREAM_REG_SPACE_SIZE) + ((reg_idx)*4)) ) )
-
-#endif
diff --git a/tt_metal/hw/inc/wormhole/noc/noc_overlay.h b/tt_metal/hw/inc/wormhole/noc/noc_overlay.h
deleted file mode 100644
index a76243afe220..000000000000
--- a/tt_metal/hw/inc/wormhole/noc/noc_overlay.h
+++ /dev/null
@@ -1,476 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
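The wrap-bit convention used by io_queue_pointers_t above is easy to misread, so here is a minimal, self-contained host-side sketch of the same full/empty logic (illustrative only, not part of the patch; all names are made up): bit 31 of each pointer is a wrap flag, "empty" is exact pointer equality, and "full" is equal addresses with opposite wrap bits.

    #include <cassert>
    #include <cstdint>

    static constexpr std::uint32_t WRAP_MASK = 0x80000000;

    struct WrapPtrFifo {
        std::uint32_t base = 0, end = 0, step = 0;  // byte addresses; step = entry size
        std::uint32_t rdptr = 0, wrptr = 0;         // address | wrap flag in bit 31

        bool empty() const { return rdptr == wrptr; }
        bool full() const { return (rdptr ^ WRAP_MASK) == wrptr; }

        // Mirrors increment_rd_pointer/increment_wr_pointer above: step forward,
        // and on passing the end wrap back to base with the wrap flag flipped.
        std::uint32_t advance(std::uint32_t p) const {
            std::uint32_t n = p + step;
            if ((n & ~WRAP_MASK) >= end) n = (p & WRAP_MASK) ? base : (WRAP_MASK | base);
            return n;
        }
        void push() { wrptr = advance(wrptr); }
        void pop() { rdptr = advance(rdptr); }
    };

    int main() {
        WrapPtrFifo q{0x1000, 0x1040, 0x10, 0x1000, 0x1000};  // 4 entries of 16B
        assert(q.empty());
        for (int i = 0; i < 4; ++i) q.push();  // fill all four entries
        assert(q.full());                      // same address, wrap bits differ
        q.pop();
        assert(!q.full() && !q.empty());
    }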
-//
-// SPDX-License-Identifier: Apache-2.0
-
-#ifndef _NOC_OVERLAY_H_
-#define _NOC_OVERLAY_H_
-
-#include <stdint.h>
-#include <stdbool.h>
-
-
-/*
-
-  Basic stream semantics:
-  =======================
-
-
-  1. A stream is associated with storage and a flow-control mechanism in
-     both directions (i.e. possibility of backpressure on both sender
-     and receiver).  A stream ID is unique per processor, and can be
-     arbitrarily assigned in the course of initializing an overlay.
-
-
-  2. A stream data source can be:
-
-       (a) Output of the local processor (i.e. math/packer).
-
-       (b) Data sent by a stream on a remote processor.
-
-       (c) Data sent by another local stream.
-
-     A stream can have at most one source of type (a) or (b), or multiple
-     [limited by hardware resources - TBD] sources of type (c).
-
-     For streams that gather data from N sources, it is necessary to set up
-     N point-to-point streams, and then set up another stream with N sources
-     of type (c).
-
-
-  3. A stream destination can be:
-
-       (a) Input to the local processor (i.e., unpacker/math).
-
-       (b) A stream on a single remote processor.
-
-       (c) Streams (each with the same local ID) in a group of
-           remote processors that is addressable as a multicast
-           destination.  (This may include the local processor
-           as well.)  All destinations must have the receiving
-           buffer at the same local memory address, and the same
-           buffer size for wraparound.
-
-       (d) Another local stream.  (As discussed under 2(c) above.)
-
-     A stream can have up to one destination of type (a), up to
-     one destination of type (b) or (c), and up to one destination
-     of type (d).  Flow control is determined by the behavior of
-     the slowest-receiving destination.
-
-
-  4. After the initialization phase, software needs to deal with only those
-     streams that directly consume the output of the local processor (i.e.,
-     math/packer) or directly supply the input of the local processor (i.e.,
-     math/unpacker).  Everything else runs automatically, providing a simple
-     abstraction.
-
-
-  Example:
-  ========
-
-  Suppose we have the following setup for a layer, with 32 cores involved
-  (x-dimension size = 4, y-dimension size = 8):
-
-    1. Two clusters, with 16 cores each.  Cluster 0 includes cores with
-       coordinates from (0, 0) to (3, 3).  Cluster 1 includes cores with
-       coordinates from (0, 4) to (3, 7).
-
-    2. We use z-parallelism only, so ultimately each core's output needs
-       to be sent to every other core in both clusters.
-
-    3. Cluster 0 master is at (3, 3).  Cluster 1 master is at (0, 4).
-
-    4. In each cluster, all cores (including master) send their output
-       activations to the local cluster's master (the gather operation).
-
-    5. The master multicasts the gathered activations to its own cluster (incl.
-       to itself, with data looping back through NOC).  It also needs to send
-       the same gathered data to the other master, and to multicast the data
-       received from the other master to its local cluster.  (This can't be
-       done simply by including the other master into the same multicast that
-       delivers data to the local cluster, since the multicast set would be
-       non-rectangular.)
-
-  Suppose also that NOC0 is directed so that the shortest path is between nodes
-  (x, y)->(x+1, y) or (x, y)->(x, y+1), while NOC1 has the opposite direction.
-
-  We can use the following overlay configuration:
-
-    * Cluster 0 cores use NOC0 unicast to gather data at Master 0.
-    * Cluster 1 cores use NOC1 unicast to gather data at Master 1.
-    * Master 0 uses NOC1 multicast for output to Cluster 0.
-    * Master 1 uses NOC0 multicast for output to Cluster 1.
-    * Cluster 0 core (0, 3) uses NOC0 unicast to forward data received
-      from Master 0 to Master 1 at (0, 4).
-    * Cluster 1 core (3, 4) uses NOC1 unicast to forward data received
-      from Master 1 to Master 0 at (3, 3).
-
-  For the N->1 gather from each core to master, the master needs N input streams,
-  which are then interleaved into a single output stream.  We need N separate
-  stream IDs for these, since each is associated with a separate buffer and a
-  separate flow-control mechanism.  Another stream at master node is then used
-  to interleave them and output them as multicast.
-
-  In the example, we use stream IDs:
-     0 = local math/packer output (at each core)
-     1 = local unpacker/math input for the next layer (at each core)
-         (this is also forwarded to the other cluster's master from cores (0, 3) and (3, 4))
-     2...17 = master gather inputs (from each core)
-     18 = master multicast output
-     19 = master unicast input forwarded from the other cluster
-
-  The initialization API calls would go as follows (shown for Cluster 0, similar for Cluster 1).
-
-  All non-master cluster 0 cores:
-
-      // Stream 0 = local math/packer output, sent to master
-      noc_overlay_stream_set_buf(0, packer_output_buf, packer_output_buf_size, true);
-      noc_overlay_stream_set_source_local_producer(0);
-      my_index = my_y*4 + my_x;
-      my_dest_buf_at_master = master_gather_buf_start + my_index*master_buf_size_per_gather_source;
-      noc_overlay_stream_set_dest_remote(0, NOC_XY_ADDR(3, 3, my_dest_buf_at_master), master_buf_size_per_gather_source, false, 2+my_index);
-
-      // Stream 1 = local math/unpacker input, received from master
-      noc_overlay_stream_set_buf(1, unpacker_input_buf, unpacker_input_buf_size, true);
-      noc_overlay_stream_set_source_remote(1, 3, 3, 18, my_index);
-      noc_overlay_stream_set_dest_local_consumer(1);
-
-  Core (0, 3) only:
-
-      // this adds an extra destination (Master 1) to stream 1, giving it a fanout of 2:
-      noc_overlay_stream_set_dest_remote(1, NOC_XY_ADDR(0, 4, dest_buf_at_master1), master1_cluster0_data_buf_size, false, 19);
-
-  Master 0:
-
-      // Stream 18 (master multicast output) is the destination for all input streams, including the local stream 0
-      for each (gather_src_x, gather_src_y)
-          src_index = gather_src_y*4 + gather_src_x;
-          stream_id = 2 + src_index
-          noc_overlay_stream_set_buf(stream_id, master_gather_buf_start + src_index*master_buf_size_per_gather_source, gather_input_buf_size, true);
-          noc_overlay_stream_set_source_remote(stream_id, gather_src_x, gather_src_y);
-          noc_overlay_connect_local_streams(stream_id, 18);
-
-      // Local math/packer output at Master 0 - also connect as one of stream 18 inputs:
-      noc_overlay_stream_set_buf(0, packer_output_buf, packer_output_buf_size, true);
-      noc_overlay_stream_set_source_local_producer(0);
-      noc_overlay_connect_local_streams(0, 18);
-
-      // Remote data from Master 1 via (3, 4) - also connect as one of stream 18 inputs:
-      noc_overlay_stream_set_buf(19, master0_cluster1_data_buf, master0_cluster1_data_buf_size, true);
-      noc_overlay_stream_set_source_remote(19, 3, 4, 1, 1);
-      noc_overlay_connect_local_streams(19, 18);
-
-      // Set multicast destination for stream 18
-      noc_overlay_stream_set_dest_remote(18, NOC_MULTICAST_ADDR(0, 0, 3, 3), unpacker_input_buf_size, true, 1);
-
-      // Stream 1 = local math/unpacker input, works the same as on non-master cores (data loops back through NOC router)
-      noc_overlay_stream_set_buf(1, unpacker_input_buf, unpacker_input_buf_size, true);
-
noc_overlay_stream_set_source_remote(1, 3, 3, 18, 3*4+3); - noc_overlay_stream_set_dest_local_consumer(1); - - - After initialization, everyone calls noc_overlay_stream_start(S) for each initialized stream S. - This will wait for "init complete" message from all downstream destinations (if any), and then send - one upstream to all sources (if any). - - - During layer execution, the only code that needs to be executed by software is the handling of - packer data sources and unpacker data destinations (i.e., those streams initialized with the - set_source_local_producer and set_dest_local_consumer calls). Everything else runs automatically. - - Local producers/consumers need to use the polling functions below to establish buffer free space/ - data availability and then call data send/receive functions. - - - Once the layer is finished, everyone calls noc_overlay_stream_dealloc(S) for each initialized - stream S. This will wait for "dealloc" message from all upstream sources (if any), and then - flush all data from the local buffer and send a "dealloc" message to all downstream destinations - (if any). The function is non-blocking, so a polling function is available to test if deallocation - of each stream's resources is complete. - -*/ - -// [TBD - API for performance-related settings] -// [TBD - DRAM destination interleaving mode] - - -/////////////////////////////// -// (1) Initialization functions - - -/* - Associate stream ID with a buffer starting address and size. - - Set message_flow_ctrl to true if flow control takes place at message (i.e. tile) level, - and to false for a raw byte stream. If true, set message_size_index to the offset of the - message size field in the message header. - -*/ -void noc_overlay_stream_set_buf(uint32_t stream_id, uint8_t* buf_start_addr, uint32_t buf_size, bool message_flow_ctrl = true, uint32_t message_size_index = 0); - - -/* - Set remote source for the stream ID. - - We need (x, y) coordinates and the (remote) stream ID of the source so the overlay - logic knows where to send read pointer updates. - - If the source stream is multicast or fans out to a local consumer and remote - destinations, each remote destination needs to call this function with a unique - destination index. (This is because the source must maintain separate read pointers, - so this index is also necessary to calculate the remote read pointer address). The - dest_index argument is don't-care if the source stream has fanout of 1. - - */ -void noc_overlay_stream_set_source_remote(uint32_t stream_id, uint32_t source_x, uint32_t source_y, uint32_t source_stream_id, uint32_t dest_index = 0); - - -/* - Set local (math/packer) source for the stream ID. - - If this function is called on a stream, it means that data need to be written into - memory by the local processor, and the local software needs to poll for free buffer - space and send data using the functions below. - - */ -void noc_overlay_stream_set_source_local_producer(uint32_t stream_id); - - -/* - Connect two local streams so they act as source and destination. - - A local stream can have an arbitrary [determined by HW resources - TBD] number - of local streams as input, and only one as the output. - - [TBD - functions to set arbitration policy] - - */ -void noc_overlay_connect_local_streams(uint32_t src_stream_id, uint32_t dest_stream_id); - - -/* - Set circular arbitration policy for a stream that has multiple connected input - streams. (Applicable only to such streams, i.e. 
to which multiple source streams
-  have been connected by calling noc_overlay_connect_local_streams.)
-
-  This function is applicable only to streams working in message mode.
-
-  A stream with circular arbitration and inputs S_1 ... S_N will forward messages only when
-  each S_i, i=1...N has a ready message.  Its output data will consist of the messages read
-  from S_1...S_N, in that same order.  The downstream receiver can therefore expect to see
-  the messages from each source S_i, placed consecutively and in order in its input buffer.
-
- */
-void noc_overlay_stream_circular_arbitration(uint32_t stream_id);
-
-
-/*
-  Set destination for the stream ID.  Destination can be either unicast or multicast.
-
-  We need the remote stream ID so the overlay logic knows where to send write pointer updates,
-  as well as destination buffer size so it knows how to wrap around the address.
-
- */
-void noc_overlay_stream_set_dest_remote(uint32_t stream_id, uint64_t dest_noc_addr, uint32_t dest_buf_size, bool multicast, uint32_t dest_stream_id = 0);
-
-
-/*
-  Set local (math/unpacker) destination for the stream ID.
-
-  If this function is called on a stream, it means that data need to be read from L1
-  by the local processor, and the local software needs to poll for data ready indications
-  and receive data when available using the functions below.
-
- */
-void noc_overlay_stream_set_dest_local_consumer(uint32_t stream_id);
-
-
-/*
-  Signal that initialization is done for a stream.  During the overlay initialization
-  phase, each processor should initialize all local streams using the above functions,
-  and then call the function below with each stream ID in use.
-
-  After this function has been called on all local streams that are used for a layer,
-  it is safe to start sending/receiving data.
-
- */
-void noc_overlay_stream_start(uint32_t stream_id);
-
-
-
-//////////////////////////
-// (2) Data send functions
-
-/*
-  There are two mechanisms for data sending:
-
-  (1) For a stream with a dedicated buffer, we can use the following sequence:
-
-      - Call noc_overlay_stream_buf_free_space to find out how much buffer space
-        is currently free for sending data.  (Poll if not enough.)
-
-      - Call noc_overlay_stream_buf_curr_write_ptr to get the write pointer where
-        data should be written.  Write data to this location.
-
-      - Call noc_overlay_stream_send_data to send data down the stream.
-
-
-  (2) Sender can also provide its own buffer with data to be sent:
-
-      - Call noc_overlay_stream_send_data_buf to initiate sending data from a given
-        address.
-
-      - Poll on noc_overlay_stream_data_buf_send_done to ensure data have been sent
-        before the sender can discard/overwrite the buffer.
-
-      It is not possible to overlap multiple noc_overlay_stream_send_data_buf operations.
-      I.e., each time noc_overlay_stream_send_data_buf is called, the caller must wait
-      for the entire buffer to be sent before the memory space can be reused.
-
-*/
-
-
-/*
-  How much free space is there in local data sender stream buffer?
-  Needs to be called by software before starting packer.
-
-*/
-uint32_t noc_overlay_stream_buf_free_space(uint32_t stream_id);
-
-
-/*
-  Returns the current write pointer for a local outgoing stream.
-  Needs to be called by software before starting packer to provide the
-  memory address where data should be written.
-
-  (Note: packer needs to handle wraparound, based on the buffer start
-  address and size, as specified previously in the noc_overlay_stream_set_buf
-  call.)
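For concreteness, the mechanism-(1) send sequence described above can be written out as the following sketch (illustrative only; memcpy stands in for the packer producing output, and buffer wraparound handling is elided):

    #include <string.h>

    void send_one_message(uint32_t stream_id, const uint8_t* msg, uint32_t msg_size) {
        // 1. Poll until the stream buffer has enough free space for the message.
        while (noc_overlay_stream_buf_free_space(stream_id) < msg_size) { /* spin */ }
        // 2. Write the payload at the stream's current write pointer.
        uint8_t* dst = noc_overlay_stream_buf_curr_write_ptr(stream_id);
        memcpy(dst, msg, msg_size);
        // 3. Hand the bytes to the overlay for forwarding.
        noc_overlay_stream_send_data(stream_id, msg_size);
    }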
- -*/ -uint8_t* noc_overlay_stream_buf_curr_write_ptr(uint32_t stream_id); - - -/* - Signals that data_size bytes have been written into the buffer and can - be sent off. - -*/ -void noc_overlay_stream_send_data(uint32_t stream_id, uint32_t data_size); - - -/* - Alternative sending mechanism: send data from a buffer maintained by the - sender software. - - */ -void noc_overlay_stream_send_data_buf(uint32_t stream_id, uint8_t* data_ptr, uint32_t data_size); - -/* - Returns the number of bytes sent with data sent by noc_overlay_stream_send_data_buf - that have been forwarded so far, so that the corresponding buffer space can be discarded - or overwritten. - - */ -uint32_t noc_overlay_stream_data_buf_send_done(uint32_t stream_id); - - -///////////////////////////// -// (3) Data receive functions - -/* - Number of received data bytes currently in the buffer. - -*/ -uint32_t noc_overlay_stream_buf_data_bytes_received(uint32_t stream_id); - - -/* - Number of received messages currently in the buffer. - -*/ -uint32_t noc_overlay_stream_buf_messages_received(uint32_t stream_id); - - -/* - Read a word (2 bytes) from the header of the message at the head of the buffer, at the - given index (number of words) from the start of the message. - -*/ -uint16_t noc_overlay_stream_buf_message_header_get_word(uint32_t stream_id, uint32_t index); - -/* - Read the size of the message at the head of the buffer. - -*/ -uint32_t noc_overlay_stream_buf_message_size(uint32_t stream_id); - -/* - Get pointer to the received data/message. - -*/ -uint8_t* noc_overlay_stream_buf_curr_read_ptr(uint32_t stream_id); - - -/* - Signals that data_size bytes have been consumed from the buffer and can - be discarded. - -*/ -void noc_overlay_stream_receive_data(uint32_t stream_id, uint32_t data_size); - - -/* - Signals that num_msgs messages have been consumed from the buffer and can - be discarded. - -*/ -void noc_overlay_stream_receive_messages(uint32_t stream_id, uint32_t num_msgs); - - - -///////////////////////////////////// -// (4) Overlay deallocation functions - - -/* - These functions will ensure that all upstream remote sources (if any) have been flushed - and deallocated, and will subsequently flush all local source streams and propagate the - deallocation message downstream to all remote destinations (if any). - - The function noc_overlay_stream_dealloc needs to be called for each local stream at the - end of a layer. It is non-blocking, and software needs to poll noc_overlay_stream_finished - to ensure that data are flushed and resources deallocated before starting to initialize a - new overlay (if it reuses any of the same resources). - -*/ -void noc_overlay_stream_dealloc(uint32_t stream_id); -bool noc_overlay_stream_finished(uint32_t stream_id); - - -////// - -/* - - Structures for formatting NOC stream auto config. 
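The receive side mirrors this; a sketch of a consumer loop over the functions declared above (illustrative only; unpack is a hypothetical stand-in for the local consumer):

    void consume_one_message(uint32_t stream_id) {
        // 1. Wait until at least one complete message has arrived.
        while (noc_overlay_stream_buf_messages_received(stream_id) == 0) { /* spin */ }
        // 2. Locate and size the message at the head of the buffer.
        uint8_t* src = noc_overlay_stream_buf_curr_read_ptr(stream_id);
        uint32_t size = noc_overlay_stream_buf_message_size(stream_id);
        unpack(src, size);
        // 3. Reclaim the buffer space; this also triggers the upstream flow-control update.
        noc_overlay_stream_receive_messages(stream_id, 1);
    }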
- -*/ - - -struct noc_stream_cfg_header_struct { - unsigned phase_num_incr : 12; - unsigned curr_phase_num_msgs: 12; - unsigned num_cfg_reg_writes: 8; -}; - -struct noc_stream_cfg_reg_write_struct { - unsigned reg_index : 8; - unsigned reg_val: 24; -}; - -typedef struct noc_stream_cfg_header_struct noc_stream_cfg_header; - -typedef struct noc_stream_cfg_reg_write_struct noc_stream_cfg_reg_write; - - -#endif //ndef _NOC_OVERLAY_H_ diff --git a/tt_metal/hw/inc/wormhole/noc/noc_overlay_parameters.erb b/tt_metal/hw/inc/wormhole/noc/noc_overlay_parameters.erb deleted file mode 100644 index f7dae7a9ce40..000000000000 --- a/tt_metal/hw/inc/wormhole/noc/noc_overlay_parameters.erb +++ /dev/null @@ -1,1458 +0,0 @@ -<%- - -header = "\ -///////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// AUTO_GENERATED! DO NOT MODIFY! // -// Please run // -// // -// (echo '<\% type=:c_header %\>' && cat noc_overlay_parameters.erb) | erb -T - > noc_overlay_parameters.h // -// (echo '<\% type=:cpp_header %\>' && cat noc_overlay_parameters.erb) | erb -T - > noc_overlay_parameters.hpp // -// Open noc_overlay_parameters.hpp and move static class varaible definitions to noc_overlay_parameters.cpp // -// overriding existing ones. // -// // -// to regenerate // -///////////////////////////////////////////////////////////////////////////////////////////////////////////////// -" - -c_header_basic = "\ -#ifndef NOC_OVERLAY_PARAMETERS_BASIC_H -#define NOC_OVERLAY_PARAMETERS_BASIC_H - -#define NOC_NUM_STREAMS 64 - -#define NUM_MCAST_STREAM_ID_START 0 -#define NUM_MCAST_STREAM_ID_END 3 -#define NUM_RECEIVER_ENDPOINT_STREAM_ID_START 4 -#define NUM_RECEIVER_ENDPOINT_STREAM_ID_END 5 -#define NUM_REMOTE_RECEIVER_STREAM_ID_START 0 -#define NUM_REMOTE_RECEIVER_STREAM_ID_END 63 -#define RECEIVER_ENDPOINT_STREAM_MSG_GROUP_SIZE 4 -#define RECEIVER_ENDPOINT_STREAM_MSG_INFO_FIFO_GROUPS 2 -#define NON_RECEIVER_ENDPOINT_STREAM_MSG_INFO_FIFO_GROUPS 2 -#define DEST_READY_COMMON_CACHE_NUM_ENTRIES 24 -#define DEST_READY_MCAST_CACHE_NUM_ENTRIES 8 - -#define NOC_OVERLAY_START_ADDR 0xFFB40000 -#define NOC_STREAM_REG_SPACE_SIZE 0x1000 - -#define STREAM_REG_ADDR(stream_id, reg_id) ((NOC_OVERLAY_START_ADDR) + (((uint32_t)(stream_id))*(NOC_STREAM_REG_SPACE_SIZE)) + (((uint32_t)(reg_id)) << 2)) - -#define NOC0_REGS_START_ADDR 0xFFB20000 -#define NOC1_REGS_START_ADDR 0xFFB30000 - -#define NCRISC_STREAM_RANGE_1_START 0 -#define NCRISC_STREAM_RANGE_1_END 3 -#define NCRISC_STREAM_RANGE_2_START 8 -#define NCRISC_STREAM_RANGE_2_END 11 -#define NCRISC_PIC_CONFIG_PHASE_DEFAULT 0 - -#define NOC_STREAM_WRITE_REG(stream_id, reg_id, val) ((*((volatile uint32_t*)(STREAM_REG_ADDR(stream_id, reg_id)))) = (val)) -#define NOC_STREAM_READ_REG(stream_id, reg_id) (*((volatile uint32_t*)(STREAM_REG_ADDR(stream_id, reg_id)))) - -#define NOC_STREAM_WRITE_REG_FIELD(stream_id, reg_id, field, val) (NOC_STREAM_WRITE_REG(stream_id, reg_id, ((NOC_STREAM_READ_REG(stream_id, reg_id) & ~((1 << field##_WIDTH) - 1)) | ((val & ((1 << field##_WIDTH) - 1)) << field)))) -#define NOC_STREAM_READ_REG_FIELD(stream_id, reg_id, field) ((NOC_STREAM_READ_REG(stream_id, reg_id) >> field) & ((1 << field##_WIDTH) - 1)) -#define NOC_STREAM_GET_REG_FIELD(reg_val, field) (((reg_val) >> field) & ((1 << field##_WIDTH) - 1)) - -#define NOC_WRITE_REG(addr, val) ((*((volatile uint32_t*)(addr)))) = (val) -#define NOC_READ_REG(addr) (*((volatile uint32_t*)(addr))) - - -#define NOC_ID_WIDTH 6 -#define STREAM_ID_WIDTH 6 - -#define DEST_CNT_WIDTH 6 
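As a worked instance of the addressing macros above (the stream and register numbers are arbitrary, and STREAM_MISC_CFG_REG_INDEX / OUTGOING_DATA_NOC are the generated-header names this .erb emits): stream 5, register 3 resolves to 0xFFB40000 + 5*0x1000 + (3 << 2) = 0xFFB4500C, and a field read shifts and masks with the generated offset/width pair:

    uint32_t cfg = NOC_STREAM_READ_REG(5, STREAM_MISC_CFG_REG_INDEX);      // volatile 32-bit read
    uint32_t out_noc = NOC_STREAM_GET_REG_FIELD(cfg, OUTGOING_DATA_NOC);   // (cfg >> OUTGOING_DATA_NOC) & 1
    NOC_STREAM_WRITE_REG(5, STREAM_MISC_CFG_REG_INDEX, cfg);               // volatile 32-bit write back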
-#define NOC_NUM_WIDTH 1 - -#define STREAM_REG_INDEX_WIDTH 8 -#define STREAM_REG_CFG_DATA_WIDTH 24 - -#define MEM_WORD_WIDTH 16 -#define MEM_WORD_ADDR_WIDTH 17 - -#define MEM_WORD_BIT_OFFSET_WIDTH 7 - -#endif -" - -registers = [ - { - :index => 0, - :name => "STREAM_REMOTE_SRC", - :description => "\ - // Properties of the remote source stream (coorindates, stream ID, and this streams destination index). - // Dont-care unless REMOTE_SOURCE == 1. - ", - :fields => [ - { :name => "STREAM_REMOTE_SRC_X", :offset => "0", :width => "NOC_ID_WIDTH" }, - { :name => "STREAM_REMOTE_SRC_Y", :offset => "(STREAM_REMOTE_SRC_X+STREAM_REMOTE_SRC_X_WIDTH)", :width => "NOC_ID_WIDTH" }, - { :name => "REMOTE_SRC_STREAM_ID", :offset => "(STREAM_REMOTE_SRC_Y+STREAM_REMOTE_SRC_Y_WIDTH)", :width => "STREAM_ID_WIDTH" }, - { :name => "STREAM_REMOTE_SRC_DEST_INDEX", :offset => "(REMOTE_SRC_STREAM_ID+REMOTE_SRC_STREAM_ID_WIDTH)", :width => "STREAM_ID_WIDTH" }, - { :name => "DRAM_READS__TRANS_SIZE_WORDS_LO", :offset => "(STREAM_REMOTE_SRC_Y+STREAM_REMOTE_SRC_Y_WIDTH)", :width => "12" }, - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_REMOTE_SRC_PHASE", - :description => "\ - // Remote source phase (may be different from the destination stream phase.) - // We use 20-bit phase ID, so phase count doesnt wrap until 1M phases. - // Dont-care unless REMOTE_SOURCE == 1. - ", - :fields => [ - { :name => "DRAM_READS__SCRATCH_1_PTR", :offset => "0", :width => "19" }, - { :name => "DRAM_READS__TRANS_SIZE_WORDS_HI", :offset => "(DRAM_READS__SCRATCH_1_PTR+DRAM_READS__SCRATCH_1_PTR_WIDTH)", :width => "1" }, - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_REMOTE_DEST", - :description => "\ - // Properties of the remote destination stream (coorindates, stream ID). Dont-care unless REMOTE_RECEIVER == 1. - // If destination is multicast, this register specifies the starting coordinates of the destination - // multicast group/rectangle. (The end coordinates are in STREAM_MCAST_DEST below.) - ", - :fields => [ - { :name => "STREAM_REMOTE_DEST_X", :offset => "0", :width => "NOC_ID_WIDTH" }, - { :name => "STREAM_REMOTE_DEST_Y", :offset => "(STREAM_REMOTE_DEST_X+STREAM_REMOTE_DEST_X_WIDTH)", :width => "NOC_ID_WIDTH" }, - { :name => "STREAM_REMOTE_DEST_STREAM_ID", :offset => "(STREAM_REMOTE_DEST_Y+STREAM_REMOTE_DEST_Y_WIDTH)", :width => "STREAM_ID_WIDTH" }, - ], - }, - { - :index => :same_as_prev, - :name => "STREAM_LOCAL_DEST", - :description => "\ - // Properties of the local destination gather stream connection. - // Dont-care unless LOCAL_RECEIVER == 1. - // Shares register space with STREAM_REMOTE_DEST_REG_INDEX. - ", - :fields => [ - { :name => "STREAM_LOCAL_DEST_MSG_CLEAR_NUM", :offset => "0", :width => "12" }, - { :name => "STREAM_LOCAL_DEST_STREAM_ID", :offset => "(STREAM_LOCAL_DEST_MSG_CLEAR_NUM+STREAM_LOCAL_DEST_MSG_CLEAR_NUM_WIDTH)", :width => "STREAM_ID_WIDTH" }, - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_REMOTE_DEST_BUF_START", - :description => "\ - // Start address (in words) of the remote destination stream memory buffer. - ", - :fields => [ - { :name => "DRAM_WRITES__SCRATCH_1_PTR_LO", :offset => "0", :width => "16" }, - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_REMOTE_DEST_BUF_SIZE", - :description => "\ - // Size (in words) of the remote destination stream memory buffer. 
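Because each field's offset is defined as the previous field's offset plus its width, the generated constants compose directly. A sketch of packing STREAM_REMOTE_SRC with them (with NOC_ID_WIDTH = STREAM_ID_WIDTH = 6 this yields x[5:0], y[11:6], stream id[17:12], dest index[23:18]; src_x and friends are illustrative locals):

    uint32_t remote_src =
        (src_x         << STREAM_REMOTE_SRC_X) |
        (src_y         << STREAM_REMOTE_SRC_Y) |
        (src_stream_id << REMOTE_SRC_STREAM_ID) |
        (dest_index    << STREAM_REMOTE_SRC_DEST_INDEX);
    NOC_STREAM_WRITE_REG(stream_id, STREAM_REMOTE_SRC_REG_INDEX, remote_src);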
- ", - :fields => [ - { :name => "REMOTE_DEST_BUF_SIZE_WORDS", :offset => "0", :width => "MEM_WORD_ADDR_WIDTH" }, - { :name => "DRAM_WRITES__SCRATCH_1_PTR_HI", :offset => "0", :width => "3" }, - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_REMOTE_DEST_WR_PTR", - :description => "\ - // Write pointer for the remote destination stream memory buffer. - // Can be written directly; automatically reset to 0 when - // STREAM_REMOTE_DEST_BUF_START is written. - ", - :fields => [ - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_BUF_START", - :description => "\ - // Start address (in words) of the memory buffer associated with this stream. - ", - :fields => [ - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_BUF_SIZE", - :description => "\ - // Stream buffer size (in words). - ", - :fields => [ - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_MSG_INFO_PTR", - :description => "\ - // Stream message info buffer address. - // - // This register needs to be initialized to the start of the message info buffer during - // phase configuration. Subsequently it will be incremented by hardware as data are read - // from the buffer, thus doubling as the read pointer during phase execution. - // - // Stream hardware will assume that this buffer is large enough to hold info for all messages - // within a phase, so unlike the buffer, it never needs to wrap. - // - // The buffer is filled automatically by snooping for streams with remote source. - // For source enpoints, the buffer is written explicitly (along with the data buffer), after which - // STREAM_NUM_MSGS_RECEIVED_INC is written to notify the stream that messages are available for - // sending. - // - // Write pointer is also managed automatically by hardware, but can be read or reset using - // STREAM_MSG_INFO_WR_PTR_REG. Write pointer is also reset when writing this register. - ", - :fields => [ - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_REMOTE_DEST_MSG_INFO_WR_PTR", - :description => "\ - // Write pointer for the remote destination message info buffer. - // Dont-care unless REMOTE_RECEIVER==1. - // Needs to be initialized to the start of the message info buffer of the remote destination - // at phase start, if destination is changed. - // Subsequently its incremented automatically as messages are forwarded. - ", - :fields => [ - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_MISC_CFG", - :description => "\ - // The ID of NOCs used for incoming and outgoing data, followed by misc. stream configuration options: - // * Source - set exactly one of these to 1: - // SOURCE_ENDPOINT = source is local math/packer - // REMOTE_SOURCE = source is remote sender stream - // LOCAL_SOURCES_CONNECTED = source is one or more local connected streams - // * Destination - set one or zero of these to 1: - // RECEIVER_ENDPOINT = stream is read by local unpacker/math - // REMOTE_RECEIVER = stream forwards data to a remote destination or multicast group - // LOCAL_RECEIVER = stream is connected to a local destination stream - // None set = stream just stores data in a local buffer, without forwarding/clearing, and - // finishes the phase once all messages have been received - // * Phase/data forward options: - // PHASE_AUTO_CONFIG = set to 1 for stream to fetch next phase configuration automatically. 
- // PHASE_AUTO_ADVANCE = set to 1 for stream to advance to next phase automatically - // (otherwise need to write STREAM_PHASE_ADVANCE below) - // DATA_AUTO_SEND = set to 1 to forward data automatically based on read/write pointers; - // set to 0 to forward data only when STREAM_NEXT_MSG_SEND is written - ", - :fields => [ - { :name => "INCOMING_DATA_NOC", :offset => "0", :width => "NOC_NUM_WIDTH" }, - { :name => "OUTGOING_DATA_NOC", :offset => "(INCOMING_DATA_NOC+INCOMING_DATA_NOC_WIDTH)", :width => "NOC_NUM_WIDTH" }, - { :name => "REMOTE_SRC_UPDATE_NOC", :offset => "(OUTGOING_DATA_NOC+OUTGOING_DATA_NOC_WIDTH)", :width => "NOC_NUM_WIDTH" }, - { :name => "LOCAL_SOURCES_CONNECTED", :offset => "(REMOTE_SRC_UPDATE_NOC+REMOTE_SRC_UPDATE_NOC_WIDTH)", :width => "1" }, - { :name => "SOURCE_ENDPOINT", :offset => "(LOCAL_SOURCES_CONNECTED+LOCAL_SOURCES_CONNECTED_WIDTH)", :width => "1" }, - { :name => "REMOTE_SOURCE", :offset => "(SOURCE_ENDPOINT+SOURCE_ENDPOINT_WIDTH)", :width => "1" }, - { :name => "RECEIVER_ENDPOINT", :offset => "(REMOTE_SOURCE+REMOTE_SOURCE_WIDTH)", :width => "1" }, - { :name => "LOCAL_RECEIVER", :offset => "(RECEIVER_ENDPOINT+RECEIVER_ENDPOINT_WIDTH)", :width => "1" }, - { :name => "REMOTE_RECEIVER", :offset => "(LOCAL_RECEIVER+LOCAL_RECEIVER_WIDTH)", :width => "1" }, - { :name => "PHASE_AUTO_CONFIG", :offset => "(REMOTE_RECEIVER+REMOTE_RECEIVER_WIDTH)", :width => "1" }, - { :name => "PHASE_AUTO_ADVANCE", :offset => "(PHASE_AUTO_CONFIG+PHASE_AUTO_CONFIG_WIDTH)", :width => "1" }, - { :name => "DATA_AUTO_SEND", :offset => "(PHASE_AUTO_ADVANCE+PHASE_AUTO_ADVANCE_WIDTH)", :width => "1" }, - { :name => "NEXT_PHASE_SRC_CHANGE", :offset => "(DATA_AUTO_SEND+DATA_AUTO_SEND_WIDTH)", :width => "1" }, - { :name => "NEXT_PHASE_DEST_CHANGE", :offset => "(NEXT_PHASE_SRC_CHANGE+NEXT_PHASE_SRC_CHANGE_WIDTH)", :width => "1" }, - { - :name => "DATA_BUF_NO_FLOW_CTRL", :offset => "(NEXT_PHASE_DEST_CHANGE+NEXT_PHASE_DEST_CHANGE_WIDTH)", :width => "1", - :description => "\ - // set if REMOTE_SOURCE==1 and the buffer is large enough to accept full phase data without wrapping: - ", - }, - { - :name => "DEST_DATA_BUF_NO_FLOW_CTRL", :offset => "(DATA_BUF_NO_FLOW_CTRL+DATA_BUF_NO_FLOW_CTRL_WIDTH)", :width => "1", - :description => "\ - // set if REMOTE_RECEIVER==1 and the destination buffer is large enough to accept full phase data without wrapping: - ", - - }, - { - :name => "REMOTE_SRC_IS_MCAST", :offset => "(DEST_DATA_BUF_NO_FLOW_CTRL+DEST_DATA_BUF_NO_FLOW_CTRL_WIDTH)", :width => "1", - :description => "\ - // set if REMOTE_SOURCE==1 and has mulicast enabled (i.e. this stream is part of a multicast group) - ", - - }, - { - :name => "NO_PREV_PHASE_OUTGOING_DATA_FLUSH", :offset => "(REMOTE_SRC_IS_MCAST+REMOTE_SRC_IS_MCAST_WIDTH)", :width => "1", - :description => "\ - // set if no need to flush outgoing remote data from previous phase - ", - - }, - { - :name => "UNICAST_VC_REG", :offset => "(NO_PREV_PHASE_OUTGOING_DATA_FLUSH+NO_PREV_PHASE_OUTGOING_DATA_FLUSH_WIDTH)", :width => "3", - :description => "\ - // set to one of the values (0-5) to select which VC unicast requests will be sent on - ", - - }, - { - :name => "REG_UPDATE_VC_REG", :offset => "(UNICAST_VC_REG+UNICAST_VC_REG_WIDTH)", :width => "3", - :description => "\ - // set to one of the values (0-5) to select which VC control flow updates will be sent on - ", - - }, - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_CURR_PHASE", - :description => "\ - // Current phase number executed by the stream. 
- ", - :fields => [ - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_PHASE_AUTO_CFG_PTR", - :description => "\ - // Pointer to the stream auto-config data. Initialized to the start of - // the auto-config structure at workload start, automatically updated - // subsequenty. - // Specified as byte address, needs to be multiple of 4B. - ", - :fields => [ - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_MCAST_DEST", - :description => "\ - // Destination spec for multicasting streams. STREAM_MCAST_END_X/Y are - // the end coordinate for the multicast rectangle, with the ones from - // STREAM_REMOTE_DEST taken as start. - // Dont-care if STREAM_MCAST_EN == 0. - ", - :fields => [ - { :name => "STREAM_MCAST_END_X", :offset => "0", :width => "NOC_ID_WIDTH" }, - { :name => "STREAM_MCAST_END_Y", :offset => "(STREAM_MCAST_END_X+STREAM_MCAST_END_X_WIDTH)", :width => "NOC_ID_WIDTH" }, - { :name => "STREAM_MCAST_EN", :offset => "(STREAM_MCAST_END_Y+STREAM_MCAST_END_Y_WIDTH)", :width => "1" }, - { :name => "STREAM_MCAST_LINKED", :offset => "(STREAM_MCAST_EN+STREAM_MCAST_EN_WIDTH)", :width => "1" }, - { - :name => "STREAM_MCAST_VC", :offset => "(STREAM_MCAST_LINKED+STREAM_MCAST_LINKED_WIDTH)", :width => "1", - :description => "\ - // Set to 0 to select VC 4, and 1 to select VC 5 (default 0) - ", - }, - { :name => "STREAM_MCAST_NO_PATH_RES", :offset => "(STREAM_MCAST_VC+STREAM_MCAST_VC_WIDTH)", :width => "1" }, - { :name => "STREAM_MCAST_XY", :offset => "(STREAM_MCAST_NO_PATH_RES+STREAM_MCAST_NO_PATH_RES_WIDTH)", :width => "1" }, - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_MCAST_DEST_NUM", - :description => "\ - // Number of multicast destinations (dont-care for non-multicast streams) - ", - :fields => [ - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_GATHER", - :description => "\ - // Specifies MSG_ARB_GROUP_SIZE. Valid values are 1 (round-robin - // arbitration between each incoming stream) or 4 (round-robin arbitration - // between groups of 4 incoming streams). - ", - :fields => [ - { :name => "MSG_ARB_GROUP_SIZE", :offset => "0", :width => "3" }, - { :name => "MSG_SRC_IN_ORDER_FWD", :offset => "(MSG_ARB_GROUP_SIZE+MSG_ARB_GROUP_SIZE_WIDTH)", :width => "1" }, - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_MSG_SRC_IN_ORDER_FWD_NUM_MSGS", - :description => "\ - // When using in-order message forwarding, number of messages after which the source - // pointer goes back to zero (without phase change). - // Dont-care if STREAM_MCAST_EN == 0 or MSG_SRC_IN_ORDER_FWD == 0. - ", - :fields => [ - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_MSG_HEADER_FORMAT", - :description => "\ - // Offset & size of the size field in the message header. Only valid offsets are multiples of 8 - // (i.e. byte-aligned). - ", - :fields => [ - { :name => "MSG_HEADER_WORD_CNT_OFFSET", :offset => "0", :width => "MEM_WORD_BIT_OFFSET_WIDTH" }, - { :name => "MSG_HEADER_WORD_CNT_BITS", :offset => "(MSG_HEADER_WORD_CNT_OFFSET+MSG_HEADER_WORD_CNT_OFFSET_WIDTH)", :width => "MEM_WORD_BIT_OFFSET_WIDTH" }, - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_NUM_MSGS_RECEIVED", - :description => "\ - // Number of received & stored messages (read-only). - // To get the total number of messages penidng in memory read - // STREAM_NUM_MSGS_RECEIVED_IN_BUF_AND_MEM_REG_INDEX - ", - :fields => [ - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_NEXT_RECEIVED_MSG_ADDR", - :description => "\ - // Memory address (in words) of the next in line received message (read-only). 
- ", - :fields => [ - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_NEXT_RECEIVED_MSG_SIZE", - :description => "\ - // Size in words of the next in line received message (read-only). - ", - :fields => [ - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_MSG_INFO_CLEAR", - :description => "\ - // Clear message info for one or more stored messages. Only valid values are 1, 2, or 4. - // No effect on the read pointer. - // Should be used only for streams where RECEIVER_ENDPOINT == 1. - ", - :fields => [ - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_MSG_DATA_CLEAR", - :description => "\ - // Move read pointer & reclaim buffer space for one or more stored messages. - // Sends flow control update to the source if REMOTE_SOURCE==1. - // Only valid values are 1, 2, or 4. - // Should be used only for streams where RECEIVER_ENDPOINT == 1, after - // STREAM_MSG_INFO_CLEAR_REG has been written with the same value. - ", - :fields => [ - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_NEXT_MSG_SEND", - :description => "\ - // Write to send the next in line stored message. Used when DATA_AUTO_SEND == 0. - ", - :fields => [ - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_RD_PTR", - :description => "\ - // Read pointer value (word offset relative to buffer start). Can be updated by - // writing the register (e.g. to force resend). - // Value does not guarantee that all data up to the current value have been sent - // off (forwarding command may be ongoing). To find out free space in the buffer, - // read STREAM_BUF_SPACE_AVAILABLE. - // Automatically reset to 0 when STREAM_BUF_START_REG is updated. - ", - :fields => [ - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_WR_PTR", - :description => "\ - // Write pointer value (word offset relative to buffer start). - // Can be read to determine the location at which to write new data. - // In normal operation, should be updated only by writing - // STREAM_NUM_MSGS_RECEIVED_INC_REG or STREAM_SOURCE_ENDPOINT_NEW_MSG_INFO_REG. - ", - :fields => [ - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_MSG_INFO_WR_PTR", - :description => "\ - // Write pointer value for message info buffer (absolute word address). - // In normal operation, should be updated only by writing - // STREAM_NUM_MSGS_RECEIVED_INC_REG or STREAM_SOURCE_ENDPOINT_NEW_MSG_INFO_REG. - ", - :fields => [ - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_PHASE_ADVANCE", - :description => "\ - // Write-only. Write 1 to advance to the next phase if PHASE_AUTO_ADVANCE == 0. - ", - :fields => [ - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_BUF_SPACE_AVAILABLE", - :description => "\ - // Available buffer space at the stream (in 16B words). - // Source cant send data unless available space > 0. - ", - :fields => [ - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_SOURCE_ENDPOINT_NEW_MSG_INFO", - :description => "\ - // For endpoints with SOURCE_ENDPOINT == 1, this register is for firmware - // to register new message for sending. - // This updates the msg_info register structure directly, rather than writing to the message info - // buffer in memory. - // Must not be written when the message info register structure is full, or if - // there are message info entries in the memory buffer. (This would cause a race - // condition.) 
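A sketch of how a source endpoint might honor that constraint when registering a message (illustrative only; msg_word_addr and msg_size_words are 16B-word quantities, and STREAM_MSG_INFO_CAN_PUSH_NEW_MSG is the read-only gate defined further below):

    while (!NOC_STREAM_READ_REG(stream_id, STREAM_MSG_INFO_CAN_PUSH_NEW_MSG_REG_INDEX)) {
        /* spin: msg info register structure is full, or entries remain in the memory buffer */
    }
    uint32_t info = (msg_word_addr  << SOURCE_ENDPOINT_NEW_MSG_ADDR) |
                    (msg_size_words << SOURCE_ENDPOINT_NEW_MSG_SIZE);
    NOC_STREAM_WRITE_REG(stream_id, STREAM_SOURCE_ENDPOINT_NEW_MSG_INFO_REG_INDEX, info);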
- ", - :fields => [ - { :name => "SOURCE_ENDPOINT_NEW_MSG_ADDR", :offset => "0", :width => "MEM_WORD_ADDR_WIDTH" }, - { :name => "SOURCE_ENDPOINT_NEW_MSG_SIZE", :offset => "(SOURCE_ENDPOINT_NEW_MSG_ADDR+SOURCE_ENDPOINT_NEW_MSG_ADDR_WIDTH)", :width => "(32-MEM_WORD_ADDR_WIDTH)" }, - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_NUM_MSGS_RECEIVED_INC", - :description => "\ - // For endpoints with SOURCE_ENDPOINT == 1, this register is for firmware - // to update the number of messages whose data & header are available in the memory buffer. - // Hardware register is incremented atomically if sending of previous messages is in progress. - ", - :fields => [ - { :name => "SOURCE_ENDPOINT_NEW_MSGS_NUM", :offset => "0", :width => "12" }, - { :name => "SOURCE_ENDPOINT_NEW_MSGS_TOTAL_SIZE", :offset => "(SOURCE_ENDPOINT_NEW_MSGS_NUM+SOURCE_ENDPOINT_NEW_MSGS_NUM_WIDTH)", :width => "MEM_WORD_ADDR_WIDTH" }, - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_RESET", - :description => "\ - // Write to reset & stop stream. - ", - :fields => [ - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_DEST_PHASE_READY_UPDATE", - :description => "\ - // Write phase number to indicate destination ready for the given phase. - // (This is done automatically by stream hardware when starting a phase with REMOTE_SOURCE=1.) - // The phase number is the one indicated by STREAM_REMOTE_SRC_PHASE_REG at destination. - // This register is mapped to the shared destination ready table, not a per-stream register. - // (Stream index is taken from the register address, and stored into the table along with the - // phase number.) - ", - :fields => [ - { :name => "PHASE_READY_DEST_NUM", :offset => "0", :width => "6" }, - { :name => "PHASE_READY_NUM", :offset => "(PHASE_READY_DEST_NUM+PHASE_READY_DEST_NUM_WIDTH)", :width => "20" }, - { - :name => "PHASE_READY_MCAST", :offset => "(PHASE_READY_NUM+PHASE_READY_NUM_WIDTH)", :width => "1", - :description => "\ - // set if this stream is part of multicast group (i.e. if REMOTE_SRC_IS_MCAST==1) - ", - }, - { - :name => "PHASE_READY_TWO_WAY_RESP", :offset => "(PHASE_READY_MCAST+PHASE_READY_MCAST_WIDTH)", :width => "1", - :description => "\ - // set if the message is in response to 2-way handshake - ", - }, - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_SRC_READY_UPDATE", - :description => "\ - // Source ready message register for two-way handshake (sent by source in - // case destination ready entry is not found in the table). - // If received by a stream that already sent its ready update, it prompts resending. - ", - :fields => [ - { :name => "STREAM_REMOTE_RDY_SRC_X", :offset => "0", :width => "NOC_ID_WIDTH" }, - { :name => "STREAM_REMOTE_RDY_SRC_Y", :offset => "(STREAM_REMOTE_RDY_SRC_X+STREAM_REMOTE_RDY_SRC_X_WIDTH)", :width => "NOC_ID_WIDTH" }, - { :name => "REMOTE_RDY_SRC_STREAM_ID", :offset => "(STREAM_REMOTE_RDY_SRC_Y+STREAM_REMOTE_RDY_SRC_Y_WIDTH)", :width => "STREAM_ID_WIDTH" }, - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE", - :description => "\ - // Update available buffer space at remote destination stream. 
- // this is rd_ptr increment issued when a message is forwarded - ", - :fields => [ - { :name => "REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_DEST_NUM", :offset => "0", :width => "6" }, - { :name => "REMOTE_DEST_BUF_WORDS_FREE_INC", :offset => "(REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_DEST_NUM+REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_DEST_NUM_WIDTH)", :width => "MEM_WORD_ADDR_WIDTH" }, - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_WAIT_STATUS", - :description => "\ - // Status info for the stream. - ", - :fields => [ - { - :name => "WAIT_SW_PHASE_ADVANCE_SIGNAL", :offset => "0", :width => "1", - :description => "\ - // Set when stream is in START state with auto-config disabled, or if auto-config is enabled - // but PHASE_AUTO_ADVANCE=0 - ", - }, - { - :name => "WAIT_PREV_PHASE_DATA_FLUSH", :offset => "(WAIT_SW_PHASE_ADVANCE_SIGNAL+WAIT_SW_PHASE_ADVANCE_SIGNAL_WIDTH)", :width => "1", - :description => "\ - // Set when stream has configured the current phase, but waits data from the previous one to be flushed. - ", - }, - { - :name => "MSG_FWD_ONGOING", :offset => "(WAIT_PREV_PHASE_DATA_FLUSH+WAIT_PREV_PHASE_DATA_FLUSH_WIDTH)", :width => "1", - :description => "\ - // Set when stream is in data forwarding state. - ", - }, - { :name => "STREAM_CURR_STATE", :offset => "(MSG_FWD_ONGOING+MSG_FWD_ONGOING_WIDTH)", :width => "4" }, - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_PHASE_AUTO_CFG_HEADER", - :description => "\ - // Register corresponding to the auto-configuration header. Written by each auto-config access - // at phase start, can be also written by software for initial configuration or if auto-config - // is disabled. - // PHASE_NUM_INCR is phase number increment relative to the previous executed phase (or 0 right - // after reset). The increment happens after auto-config is done, and before the phase is executed. - // (Therefore reading STREAM_CURR_PHASE_REG while auto-config is ongoing, or if it hasnt started - // yet, may return the old phase number.) - // This enables up to 2^12-1 phases to be skipped. If more phases need to be skipped, it is - // necessary to insert an intermediate phase with zero messages, whose only purpose is to provide - // an additional skip offset. - ", - :fields => [ - { :name => "PHASE_NUM_INCR", :offset => "0", :width => "12" }, - { :name => "CURR_PHASE_NUM_MSGS", :offset => "(PHASE_NUM_INCR+PHASE_NUM_INCR_WIDTH)", :width => "12" }, - { :name => "NEXT_PHASE_NUM_CFG_REG_WRITES", :offset => "(CURR_PHASE_NUM_MSGS+CURR_PHASE_NUM_MSGS_WIDTH)", :width => "8" }, - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_PERF_CONFIG", - :description => "\ - // Should be written only for stream 0, applies to all streams. - ", - :fields => [ - { :name => "CLOCK_GATING_EN", :offset => "0", :width => "1" }, - { :name => "CLOCK_GATING_HYST", :offset => "(CLOCK_GATING_EN+CLOCK_GATING_EN_WIDTH)", :width => "7" }, - { - :name => "PARTIAL_SEND_WORDS_THR", :offset => "(CLOCK_GATING_HYST+CLOCK_GATING_HYST_WIDTH)", :width => "8", - :description => "\ - // PARTIAL_SEND_WORDS_THR contols the minimum number of 16-byte words of a tile to accumulate in a relay stream before sending it off to the destination. - // If the size of the tile is less than or equal to PARTIAL_SEND_WORDS_THR, then this feild is ignored. - // Default is 16 words - ", - }, - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_MSG_GROUP_ZERO_MASK_AND", - :description => "\ - // AND value of zero masks for the pending message group. - // (Header bits [95:64].) - // Read-only. 
Valid only for receiver endpoint streams. - ", - :fields => [ - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_MSG_INFO_FULL", - :description => "\ - // Returns 1 if the message info register is full (read-only). - ", - :fields => [ - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_MEM_BUF_SPACE_AVAILABLE_ACK_THRESHOLD", - :description => "\ - // 4-bit wide register that determines the threshold at which a stream - // with remote source sends an update message to STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE. - // Dont-care unless REMOTE_SOURCE==1. - // Values: - // value[3:0] == 0 => disable threshold. Acks send as soon as any data are cleared/forwarded. - // value[3:0] > 0 => threshold calculated according to the following formula: - // if (value[3]) - // threshold = buf_size - (buf_size >> value[2:0]) - // else - // threshold = (buf_size >> value[2:0]) - // - // This enables setting thresholds of buf_size/2, buf_size/4, buf_size/8, ... buf_size/256, - // as well as 3*buf_size/4, 7*buf_size/8, etc. - ", - :fields => [ - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_MSG_INFO_CAN_PUSH_NEW_MSG", - :description => "\ - // Returns 1 if the message info register can accept new message push (read-only). - // Equivalent to checking the condition: - // (STREAM_MSG_INFO_FULL_REG_INDEX == 0) && (STREAM_MSG_INFO_PTR_REG_INDEX == STREAM_MSG_INFO_WR_PTR_REG_INDEX) - // (I.e. ther is free space in the msg info register, and we dont have any message info headers in the - // memory buffer about to be fetched.) - ", - :fields => [ - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_MSG_GROUP_COMPRESS", - :description => "\ - // Concat compress flags from 4 tiles in the pending message group. - // (Header bit 52.) - // Read-only. Valid only for receiver endpoint streams. - ", - :fields => [ - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_GATHER_CLEAR", - :description => "\ - // Msg_LOCAL_STREAM_CLEAR_NUM specifies the number of messages that should - // be cleared from a gather stream before moving onto the next stream. - // When MSG_ARB_GROUP_SIZE > 1, the order of clearing the streams can be selected - // with MSG_GROUP_STREAM_CLEAR_TYPE. 0 = clear the whole group MSG_LOCAL_STREAM_CLEAR_NUM times, - // 1 = clear each stream of the group MSG_LOCAL_STREAM_CLEAR_NUM times before - // moving onto the next stream in the group. - ", - :fields => [ - { :name => "MSG_LOCAL_STREAM_CLEAR_NUM", :offset => "0", :width => "16" }, - { :name => "MSG_GROUP_STREAM_CLEAR_TYPE", :offset => "(MSG_LOCAL_STREAM_CLEAR_NUM+MSG_LOCAL_STREAM_CLEAR_NUM_WIDTH)", :width => "1" }, - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_REMOTE_DEST_TRAFFIC_PRIORITY", - :description => "\ - // Priority for traffic sent to remote destination. - // Valid only for streams capable of remote sending. - // 4-bit value. - // Set to 0 to send traffic under round-robin arbitration. - // Set to 1-15 for priority arbitration (higher values are higher priority). - ", - :fields => [ - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_DEBUG_STATUS_SEL", - :description => "\ - // Debug bus stream selection. Write the stream id for the stream that you want exposed on the debug bus - // This register only exists in stream 0. 
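A small decode helper makes the threshold formula above concrete (sketch only; buf_size is in 16B words):

    uint32_t ack_threshold(uint32_t buf_size, uint32_t value) {
        uint32_t shift = value & 0x7;                              // value[2:0]
        if (value == 0) return 0;                                  // disabled: ack immediately
        if (value & 0x8) return buf_size - (buf_size >> shift);    // 3/4, 7/8, ... of buf_size
        return buf_size >> shift;                                  // 1/2, 1/4, ... of buf_size
    }
    // e.g. ack_threshold(1024, 0x1) == 512, ack_threshold(1024, 0xA) == 768.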
- ", - :fields => [ - { :name => "DEBUG_STATUS_STREAM_ID_SEL", :offset => "0", :width => "STREAM_ID_WIDTH" }, - { :name => "DISABLE_DEST_READY_TABLE", :offset => "(DEBUG_STATUS_STREAM_ID_SEL+DEBUG_STATUS_STREAM_ID_SEL_WIDTH)", :width => "1" }, - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_DEBUG_ASSERTIONS", - :description => "\ - // Debugging: Non-zero value indicates an invalid stream operation occured. - // Sticky, write 1 to clear. - ", - :fields => [ - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_NUM_MSGS_RECEIVED_IN_BUF_AND_MEM", - :description => "\ - // Only in receiver endpoint streams (stream 4 and 5) - // Read-only. Tells you the number of tiles that have arrived in L1 - ", - :fields => [ - ], - }, - { - :index => 48, - :name => "STREAM_LOCAL_SRC_MASK", - :description => "\ - // Bit mask of connnected local source. Dont care if LOCAL_SOURCES_CONNECTED == 0. - // Mask segments [23:0], [47:24], and [63:48] are at indexes STREAM_LOCAL_SRC_MASK_REG_INDEX, - // STREAM_LOCAL_SRC_MASK_REG_INDEX+1, STREAM_LOCAL_SRC_MASK_REG_INDEX+2. - ", - :fields => [ - ], - }, - { - :index => 60, - :name => "STREAM_RECEIVER_ENDPOINT_SET_MSG_HEADER", - :description => "\ - // For receiver endpoint streams that expose the full message header bus to unpacker, - // write this register to specify the full header in case the stream is not snooping - // a remote source but instead also works as a source endpoint. - // Write (STREAM_RECEIVER_ENDPOINT_SET_MSG_HEADER_REG_INDEX+i) to set bits [i*32 +: 32] - // of the message header for the next message, prior to writing STREAM_SOURCE_ENDPOINT_NEW_MSG_INFO_REG_INDEX. - ", - :fields => [ - ], - }, - { - :index => 64, - :name => "STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE", - :description => "\ - // Available buffer space at remote destination stream(s). - // Dont care unless REMOTE_RECEIVER == 1. - // Source cant send data unless WORDS_FREE > 0. - // Read-only; updated automatically to maximum value when - // STREAM_REMOTE_DEST_BUF_SIZE_REG is updated. - // For multicast streams, values for successive destinations are at - // subsequent indexes (STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_REG_INDEX+1, - // STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_REG_INDEX+2, etc.). - ", - :fields => [ - { :name => "REMOTE_DEST_WORDS_FREE", :offset => "0", :width => "MEM_WORD_ADDR_WIDTH" }, - ], - }, - { - :index => 128, - :name => "STREAM_RECEIVER_MSG_INFO", - :description => "\ - // Read-only register view of the bits on the o_full_msg_info bus. - // Exposed as 32-bit read-only registers starting at this index. - ", - :fields => [ - ], - }, - { - :index => 224, - :name => "STREAM_DEBUG_STATUS", - :description => "\ - // Read-only register that exposes internal states of the stream. - // Useful for debugging. Valid 32-bit data from STREAM_DEBUG_STATUS_REG_INDEX + 0 to STREAM_DEBUG_STATUS_REG_INDEX + 9 - ", - :fields => [ - ], - }, - { - :index => 234, - :name => "STREAM_BLOB_AUTO_CFG_DONE", - :description => "\ - // 32 bit register. Each bit denotes whether the corresponding stream has completed its blob run and is in idle state. - // Resets to 0 upon starting a new stream run. Initially all are 0 to exclude streams that might not be used. - // Can be manually reset to 0 by writing 1 to the corresponding bit. 
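Programming the full 64-bit local-source mask described above takes three writes, one per 24-/24-/16-bit segment (sketch; mask is an illustrative local and sid an arbitrary stream id):

    NOC_STREAM_WRITE_REG(sid, STREAM_LOCAL_SRC_MASK_REG_INDEX,     (uint32_t)(mask & 0xFFFFFF));         // [23:0]
    NOC_STREAM_WRITE_REG(sid, STREAM_LOCAL_SRC_MASK_REG_INDEX + 1, (uint32_t)((mask >> 24) & 0xFFFFFF)); // [47:24]
    NOC_STREAM_WRITE_REG(sid, STREAM_LOCAL_SRC_MASK_REG_INDEX + 2, (uint32_t)((mask >> 48) & 0xFFFF));   // [63:48]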
- // Exists only in stream 0 - ", - :fields => [ - ], - }, - { - :index => 242, - :name => "STREAM_REMOTE_DEST_BUF_START_HI", - :description => "\ - // High bits for STREAM_REMOTE_DEST_BUF_START - ", - :fields => [ - ], - }, - { - :index => 243, - :name => "STREAM_REMOTE_DEST_MSG_INFO_WR_PTR_HI", - :description => "\ - // High bits for STREAM_REMOTE_DEST_MSG_INFO_WR_PTR - ", - :fields => [ - ], - }, - { - :index => 244, - :name => "STREAM_CURR_PHASE_BASE", - :description => "\ - // Actual phase number executed is STREAM_CURR_PHASE_BASE_REG_INDEX + STREAM_CURR_PHASE_REG_INDEX - // When reprogramming this register you must also reprogram STREAM_CURR_PHASE_REG_INDEX and STREAM_REMOTE_SRC_PHASE_REG_INDEX - ", - :fields => [ - ], - }, - { - :index => 245, - :name => "STREAM_PHASE_AUTO_CFG_PTR_BASE", - :description => "\ - // Actual address accessed will be STREAM_PHASE_AUTO_CFG_PTR_BASE_REG_INDEX + STREAM_PHASE_AUTO_CFG_PTR_REG_INDEX - // When reprogramming this register you must also reprogram STREAM_PHASE_AUTO_CFG_PTR_REG_INDEX - ", - :fields => [ - ], - }, - { - :index => 246, - :name => "STREAM_BLOB_NEXT_AUTO_CFG_DONE", - :description => "\ - // Reading this register will give you a stream id of a stream that finished its blob (according to STREAM_BLOB_AUTO_CFG_DONE_REG_INDEX) - // Subsequent reads will give you the next stream, untill all streams are read, after which it will loop - // This register is only valid if BLOB_NEXT_AUTO_CFG_DONE_VALID is set (i.e. if STREAM_BLOB_AUTO_CFG_DONE_REG_INDEX non-zero) - // Exists only in stream 0 - ", - :fields => [ - { :name => "BLOB_NEXT_AUTO_CFG_DONE_STREAM_ID", :offset => "0", :width => "STREAM_ID_WIDTH" }, - { :name => "BLOB_NEXT_AUTO_CFG_DONE_VALID", :offset => "16", :width => "1" }, - ], - }, - { - :index => 247, - :name => "FIRMWARE_SCRATCH", - :description => "\ - // Scratch location for firmware usage - // Guarantees that no side-effects occur in Overlay hardware - ", - :fields => [ - ], - }, - { - :index => 248, - :name => "STREAM_SCRATCH", - :description => "\ - // Scratch registers - // Exists only in streams 0-3 and 8-11 - // Data can be stored at [23:0] from STREAM_SCRATCH_REG_INDEX + 0 to STREAM_SCRATCH_REG_INDEX + 5 - // Can be loaded through overlay blobs. 
- ", - :fields => [ - ], - }, - { - :index => 248, - :name => "STREAM_SCRATCH_0", - :description => "", - :fields => [ - { :name => "NCRISC_TRANS_EN", :offset => "0", :width => "1" }, - { :name => "NCRISC_TRANS_EN_IRQ_ON_BLOB_END", :offset => "(NCRISC_TRANS_EN + NCRISC_TRANS_EN_WIDTH)", :width => "1" }, - { :name => "NCRISC_CMD_ID", :offset => "(NCRISC_TRANS_EN_IRQ_ON_BLOB_END + NCRISC_TRANS_EN_IRQ_ON_BLOB_END_WIDTH)", :width => "3" }, - { - :name => "NEXT_NRISC_PIC_INT_ON_PHASE", :offset => "(NCRISC_CMD_ID + NCRISC_CMD_ID_WIDTH)", :width => "19", - :description => "\ - // Kept for compatibility with grayskull, but doesnt not exist anymore in wormhole - ", - }, - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_SCRATCH_1", - :description => "", - :fields => [ - { :name => "DRAM_FIFO_RD_PTR_WORDS_LO", :offset => "0", :width => "24" }, - { :name => "NCRISC_LOOP_COUNT", :offset => "0", :width => "24" }, - { :name => "NCRISC_INIT_ENABLE_BLOB_DONE_IRQ", :offset => "0", :width => "1" }, - { :name => "NCRISC_INIT_DISABLE_BLOB_DONE_IRQ", :offset => "(NCRISC_INIT_ENABLE_BLOB_DONE_IRQ + NCRISC_INIT_ENABLE_BLOB_DONE_IRQ_WIDTH)", :width => "1" }, - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_SCRATCH_2", - :description => "", - :fields => [ - { :name => "DRAM_FIFO_RD_PTR_WORDS_HI", :offset => "0", :width => "4" }, - { :name => "DRAM_FIFO_WR_PTR_WORDS_LO", :offset => "(DRAM_FIFO_RD_PTR_WORDS_HI + DRAM_FIFO_RD_PTR_WORDS_HI_WIDTH)", :width => "20" }, - { :name => "NCRISC_TOTAL_LOOP_ITER", :offset => "0", :width => "24" }, - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_SCRATCH_3", - :description => "", - :fields => [ - { :name => "DRAM_FIFO_WR_PTR_WORDS_HI", :offset => "0", :width => "8" }, - { :name => "DRAM_FIFO_CAPACITY_PTR_WORDS_LO", :offset => "(DRAM_FIFO_WR_PTR_WORDS_HI + DRAM_FIFO_WR_PTR_WORDS_HI_WIDTH)", :width => "16" }, - { :name => "NCRISC_LOOP_INCR", :offset => "0", :width => "16" }, - { :name => "NCRISC_LOOP_BACK_NUM_CFG_REG_WRITES", :offset => "(NCRISC_LOOP_INCR+NCRISC_LOOP_INCR_WIDTH)", :width => "8" }, - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_SCRATCH_4", - :description => "", - :fields => [ - { :name => "DRAM_FIFO_CAPACITY_PTR_WORDS_HI", :offset => "0", :width => "12" }, - { :name => "DRAM_FIFO_BASE_ADDR_WORDS_LO", :offset => "(DRAM_FIFO_CAPACITY_PTR_WORDS_HI + DRAM_FIFO_CAPACITY_PTR_WORDS_HI_WIDTH)", :width => "12" }, - { :name => "NCRISC_LOOP_BACK_AUTO_CFG_PTR", :offset => "0", :width => "24" }, - ], - }, - { - :index => :prev_plus_1, - :name => "STREAM_SCRATCH_5", - :description => "", - :fields => [ - { :name => "DRAM_FIFO_BASE_ADDR_WORDS_HI", :offset => "0", :width => "16" }, - { - :name => "DRAM_EN_BLOCKING", :offset => "(DRAM_FIFO_BASE_ADDR_WORDS_HI + DRAM_FIFO_BASE_ADDR_WORDS_HI_WIDTH)", :width => "1", - :description => "\ - // Processes the read or write operation to completeion without processing other dram streams in the meantime - ", - }, - { - :name => "DRAM_DATA_STRUCTURE_IS_LUT", :offset => "(DRAM_EN_BLOCKING + DRAM_EN_BLOCKING_WIDTH)", :width => "1", - :description => "\ - // Fifo structure in dram holds a dram pointer and size that is used as indirection to a tile in dram - ", - }, - { - :name => "DRAM_RESET_RD_PTR_TO_BASE_ON_EMPTY", :offset => "(DRAM_DATA_STRUCTURE_IS_LUT + DRAM_DATA_STRUCTURE_IS_LUT_WIDTH)", :width => "1", - :description => "\ - // During a dram read, if its detected that the fifo is empty the ncrisc will reset the read pointer back to base - // Its expected that there is no host interaction - ", - }, - { 
- :name => "DRAM_RESET_WR_PTR_TO_BASE_ON_FULL", :offset => "(DRAM_RESET_RD_PTR_TO_BASE_ON_EMPTY + DRAM_RESET_RD_PTR_TO_BASE_ON_EMPTY_WIDTH)", :width => "1", - :description => "\ - // During a dram write, if its detected that the fifo is full the ncrisc will reset the write pointer back to base. Old data will be overwritten. - // Its expected that there is no host interaction - ", - }, - { - :name => "DRAM_NO_PTR_UPDATE_ON_PHASE_END", :offset => "(DRAM_RESET_WR_PTR_TO_BASE_ON_FULL + DRAM_RESET_WR_PTR_TO_BASE_ON_FULL_WIDTH)", :width => "1", - :description => "\ - // The internal ncrisc rd/wr pointers will not be updated at phase end - // Its expected that there is no host interaction - ", - }, - { - :name => "DRAM_WR_BUFFER_FLUSH_AND_RST_PTRS", :offset => "(DRAM_NO_PTR_UPDATE_ON_PHASE_END + DRAM_NO_PTR_UPDATE_ON_PHASE_END_WIDTH)", :width => "1", - :description => "\ - // Before ending the phase the ncrisc will wait until the host has emptied the write buffer and then reset the read and write pointers to base - // This can be used for hosts that do not want to track wrapping - // The host must be aware of this behaviour for this functionality to work - ", - }, - { :name => "NCRISC_LOOP_NEXT_PIC_INT_ON_PHASE", :offset => "0", :width => "20" }, - ], - }, -] - -prev_register = nil -registers.each do |register| - # Process :prev_plus_1 and :same_as_prev - if register[:index] == :prev_plus_1 - register[:index] = prev_register[:index] + 1 - end - if register[:index] == :same_as_prev - register[:index] = prev_register[:index] - end - - # Add :width_name to fields - register[:fields].each do |field| - field[:width_name] = field[:name] + "_WIDTH" - end - - # Make fields printing pretty (align columns) - max_len = nil - register[:fields].each do |field| - # :width_name will always be larger than :fields, so we dont need to check :fields - if max_len == nil || max_len < field[:width_name].length - max_len = field[:width_name].length - end - end - - register[:fields].each do |field| - field[:name] = field[:name].ljust(max_len) - field[:width_name] = field[:width_name].ljust(max_len) - end - - # Remove whitespace after newline and last newline in :description - register[:description] = register[:description].strip - register[:description] = register[:description].gsub(/\n\s+/, "\n") - if register[:description] != "" - register[:description] = register[:description] + "\n" - end - - register[:fields].each do |field| - if field[:description] - field[:description] = field[:description].strip - field[:description] = field[:description].gsub(/\n\s+/, "\n") - else - field[:description] = "" - end - - if field[:description] != "" - field[:description] = field[:description] + "\n" - end - end - - prev_register = register -end - --%> -<%- if type == :c_header -%> -<%= header -%> - -#ifndef NOC_OVERLAY_PARAMETERS_H -#define NOC_OVERLAY_PARAMETERS_H - -<%= c_header_basic -%> - -<%- registers.each do |register| -%> -<%= register[:description] -%> -#define <%= register[:name].upcase %>_REG_INDEX <%= register[:index] %> -<%- register[:fields].each do |field| -%> -<%= field[:description] -%> -#define <%= field[:name].upcase %> <%= field[:offset] %> -#define <%= field[:width_name].upcase %> <%= field[:width] %> -<%- end -%> - -<%- end -%> -#endif // def NOC_OVERLAY_PARAMETERS_H - -<%- elsif type == :cpp_header -%> -<%= header -%> - -#pragma once - -#include -#include -#include -#include -#include - -<%= c_header_basic -%> - -namespace Noc { - -typedef struct OverlayField_ { - std::string name; - std::uint32_t offset; - 
std::uint32_t width; - std::string description; -} OverlayField; - -typedef struct OverlayReg_ { - std::string name; - std::uint32_t index; - std::unordered_map fields_by_name; - std::unordered_map fields_by_offset; - std::vector fields; - std::string description; -} OverlayReg; - -// OverLayParams -class OLP { - private: - static const std::unordered_map registers_by_name; - static const std::unordered_map registers_by_index; - static const std::vector registers; - static const std::unordered_map fields_by_name; - static const std::vector fields; - - private: - // Disallow creating an instance of this object - OLP() {} - - public: - static bool HasReg(std::string label) - { - return registers_by_name.count(label) >= 1; - } - - // There might be multiple registers with the same index - // If so a register you didnt intend to access might be accessed. - // Use accessor based on label if possible - static bool HasReg(std::uint32_t index) - { - return registers_by_index.count(index) >= 1; - } - - static const std::vector& GetAllRegs() - { - return registers; - } - - // There might be multiple registers with the same index - // If so a register you didnt intend to access might be accessed. - // Use accessor based on label if possible - static std::string RegName(std::uint32_t index) - { - if (HasReg(index)) - return registers[registers_by_index.at(index)].name; - else - throw std::runtime_error("Non-existant overlay register index: " + std::to_string(index)); - } - - static std::uint32_t RegIdx(std::string label) - { - if (HasReg(label)) - return registers[registers_by_name.at(label)].index; - else - throw std::runtime_error("Non-existant overlay register: " + std::string(label)); - } - - static std::string RegInfo(std::string label) - { - if (HasReg(label)) - return registers[registers_by_name.at(label)].description; - else - throw std::runtime_error("Non-existant overlay register: " + std::string(label)); - } - - //////////////////////////////////// - - static bool HasFld(std::string label) - { - return fields_by_name.count(label) >= 1; - } - - static const std::vector& GetAllFlds() - { - return fields; - } - - static std::uint32_t FldOff(std::string label) - { - if (HasFld(label)) - return fields[fields_by_name.at(label)].offset; - else - throw std::runtime_error("Non-existant overlay field: " + std::string(label)); - } - - static std::uint32_t FldW(std::string label) - { - if (HasFld(label)) - return fields[fields_by_name.at(label)].width; - else - throw std::runtime_error("Non-existant overlay field: " + std::string(label)); - } - - static std::string FldInfo(std::string label) - { - if (HasFld(label)) - return fields[fields_by_name.at(label)].description; - else - throw std::runtime_error("Non-existant overlay field: " + std::string(label)); - } - - //////////////////////////////////// - - static bool HasFld(std::string reg_label, std::string field_label) - { - return HasReg(reg_label) && - (registers[registers_by_name.at(reg_label)].fields_by_name.count(field_label) >= 1); - } - - // There might be multiple registers(fields) with the same index(offset) - // If so a register(field) you didnt intend to access might be accessed. 
- // Use accessor based on label if possible - static bool HasFld(std::uint32_t reg_index, std::uint32_t field_offset) - { - return HasReg(reg_index) && - (registers[registers_by_index.at(reg_index)].fields_by_offset.count(field_offset) >= 1); - } - - static const std::vector& GetAllFlds(std::string reg_label) - { - if (HasReg(reg_label)) { - return registers[registers_by_name.at(reg_label)].fields; - } else { - throw std::runtime_error("Non-existant overlay register: " + std::string(reg_label)); - } - } - - // There might be multiple registers(fields) with the same index(offset) - // If so a register(field) you didnt intend to access might be accessed. - // Use accessor based on label if possible - static const std::vector& GetAllFlds(std::uint32_t reg_index) - { - if (HasReg(reg_index)) { - return registers[registers_by_index.at(reg_index)].fields; - } else { - throw std::runtime_error("Non-existant overlay register index: " + std::to_string(reg_index)); - } - } - - // There might be multiple registers(fields) with the same index(offset) - // If so a register(field) you didnt intend to access might be accessed. - // Use accessor based on label if possible - static std::string FldName(std::uint32_t reg_index, std::uint32_t field_offset) - { - if (HasFld(reg_index, field_offset)) { - auto field_tmp = registers[registers_by_index.at(reg_index)].fields; - auto index_field_temp = registers[registers_by_index.at(reg_index)].fields_by_offset.at(field_offset); - return field_tmp[index_field_temp].name; - } else { - throw std::runtime_error("Non-existant overlay register field (index, offset): " + std::to_string(reg_index) + ", " + std::to_string(field_offset)); - } - } - - static std::uint32_t FldOff(std::string reg_label, std::string field_label) - { - if (HasFld(reg_label, field_label)) { - auto field_tmp = registers[registers_by_name.at(reg_label)].fields; - auto index_field_temp = registers[registers_by_name.at(reg_label)].fields_by_name.at(field_label); - return field_tmp[index_field_temp].offset; - } else { - throw std::runtime_error("Non-existant overlay register field: " + std::string(reg_label) + ", " + std::string(field_label)); - } - } - - static std::uint32_t FldW(std::string reg_label, std::string field_label) - { - if (HasFld(reg_label, field_label)) { - auto field_tmp = registers[registers_by_name.at(reg_label)].fields; - auto index_field_temp = registers[registers_by_name.at(reg_label)].fields_by_name.at(field_label); - return field_tmp[index_field_temp].width; - } else { - throw std::runtime_error("Non-existant overlay register field: " + std::string(reg_label) + ", " + std::string(field_label)); - } - } - - // There might be multiple registers(fields) with the same index(offset) - // If so a register(field) you didnt intend to access might be accessed. 
- // Use accessor based on label if possible - static std::uint32_t FldW(std::uint32_t reg_index, std::uint32_t field_offset) - { - if (HasFld(reg_index, field_offset)) { - auto field_tmp = registers[registers_by_index.at(reg_index)].fields; - auto index_field_temp = registers[registers_by_index.at(reg_index)].fields_by_offset.at(field_offset); - return field_tmp[index_field_temp].width; - } else { - throw std::runtime_error("Non-existant overlay register field (index, offset): " + std::to_string(reg_index) + ", " + std::to_string(field_offset)); - } - } - - static std::string FldInfo(std::string reg_label, std::string field_label) - { - if (HasFld(reg_label, field_label)) { - auto field_tmp = registers[registers_by_name.at(reg_label)].fields; - auto index_field_temp = registers[registers_by_name.at(reg_label)].fields_by_name.at(field_label); - return field_tmp[index_field_temp].description; - } else { - throw std::runtime_error("Non-existant overlay register field: " + std::string(reg_label) + ", " + std::string(field_label)); - } - } - -}; - -const std::vector OLP::registers = { -<%- first_line_placed_outer = false -%> -<%- registers.each_with_index do |register, index| -%> - <% if first_line_placed_outer -%>,<%- end -%>{ - "<%= register[:name].upcase.strip %>", - <%= register[:index] %>, - { -<%- first_line_placed = false -%> -<%- register[:fields].each_with_index do |field, field_index| -%> - <% if first_line_placed -%>,<%- end -%>{"<%= field[:name].upcase.strip %>", <%= field_index %>} -<%- first_line_placed = true -%> -<%- end -%> - }, - { -<%- index_exists = {} -%> -<%- first_line_placed = false -%> -<%- register[:fields].each_with_index do |field, field_index| -%> -<%- if !index_exists[field[:offset]] -%> - <% if first_line_placed -%>,<%- end -%>{<%= field[:offset] %>, <%= field_index %>} -<%- first_line_placed = true -%> -<%- index_exists[field[:offset]] = true -%> -<%- end -%> -<%- end -%> - }, - { -<%- first_line_placed = false -%> -<%- register[:fields].each_with_index do |field, field_index| -%> - <% if first_line_placed -%>,<%- end -%>{ - "<%= field[:name].upcase.strip %>", - <%= field[:offset] %>, - <%= field[:width] %>, - "<%= field[:description].gsub(/\n/, '\\n') %>" - } -<%- first_line_placed = true -%> -<%- end -%> - }, - "<%= register[:description].gsub(/\n/, '\\n') %>" - } -<%- first_line_placed_outer = true -%> -<%- end -%> -}; - -const std::unordered_map OLP::registers_by_name = { -<%- first_line_placed = false -%> -<%- registers.each_with_index do |register, index| -%> - <% if first_line_placed -%>,<%- end -%>{"<%= register[:name].upcase.strip %>", <%= index %>} -<%- first_line_placed = true -%> -<%- end -%> -}; - -const std::unordered_map OLP::registers_by_index = { -<%- index_exists = {} -%> -<%- first_line_placed = false -%> -<%- registers.each_with_index do |register, index| -%> -<%- if !index_exists[register[:index]] -%> - <% if first_line_placed -%>,<%- end -%>{<%= register[:index] %>, <%= index %>} -<%- first_line_placed = true -%> -<%- index_exists[register[:index]] = true -%> -<%- end -%> -<%- end -%> -}; - -const std::vector OLP::fields = { -<%- first_line_placed = false -%> -<%- registers.each_with_index do |register, index| -%> -<%- register[:fields].each_with_index do |field, field_index| -%> - <% if first_line_placed -%>,<%- end -%>{ - "<%= field[:name].upcase.strip %>", - <%= field[:offset] %>, - <%= field[:width] %>, - "<%= field[:description].gsub(/\n/, '\\n') %>" - } -<%- first_line_placed = true -%> -<%- end -%> -<%- end -%> -}; - -const 
std::unordered_map OLP::fields_by_name = { -<%- first_line_placed = false -%> -<%- unrolled_index = 0 -%> -<%- registers.each_with_index do |register, index| -%> -<%- register[:fields].each_with_index do |field, field_index| -%> - <% if first_line_placed -%>,<%- end -%>{"<%= field[:name].upcase.strip %>", <%= unrolled_index %>} -<%- unrolled_index = unrolled_index + 1 -%> -<%- first_line_placed = true -%> -<%- end -%> -<%- end -%> -}; - -} - -<%- elsif type == :sed -%> - -<%- registers.each_with_index do |register, index| -%> -find . -type f | xargs grep -sl <%= register[:name].upcase.strip %>_REG_INDEX | xargs sed -i 's/\(^\|[^a-zA-Z_]\)<%= register[:name].upcase.strip %>_REG_INDEX\($\|[^a-zA-Z0-9_]\)/\1OLP::RegIdx("<%= register[:name].upcase.strip %>")\2/g' -<%- register[:fields].each_with_index do |field, field_index| -%> -find . -type f | xargs grep -sl <%= field[:name].upcase.strip %> | xargs sed -i 's/\(^\|[^a-zA-Z_]\)<%= field[:name].upcase.strip %>\($\|[^a-zA-Z0-9_]\)/\1OLP::FldOff("<%= field[:name].upcase.strip %>")\2/g' -find . -type f | xargs grep -sl <%= field[:width_name].upcase.strip %> | xargs sed -i 's/\(^\|[^a-zA-Z_]\)<%= field[:width_name].upcase.strip %>\($\|[^a-zA-Z0-9_]\)/\1OLP::FldW("<%= field[:name].upcase.strip %>")\2/g' -<%- end -%> -<%- end -%> - -find . -type f | xargs grep -sl WRITE_FIELD | xargs sed -i 's/WRITE_FIELD\(\s*\)(\(\s*\)OLP::RegIdx("\([a-zA-Z0-9_]*\)")\(\s*\),\(\s*\)OLP::FldOff("\([a-zA-Z0-9_]*\)")/WRITE_FIELD\1(\2"\3"\4,\5"\6"/g' -find . -type f | xargs grep -sl READ_FIELD | xargs sed -i 's/READ_FIELD\(\s*\)(\(\s*\)OLP::RegIdx("\([a-zA-Z0-9_]*\)")\(\s*\),\(\s*\)OLP::FldOff("\([a-zA-Z0-9_]*\)")/READ_FIELD\1(\2"\3"\4,\5"\6"/g' -find . -type f | xargs grep -sl COPY_FIELD | xargs sed -i 's/COPY_FIELD\(\s*\)(\(\s*\)OLP::RegIdx("\([a-zA-Z0-9_]*\)")\(\s*\),\(\s*\)OLP::FldOff("\([a-zA-Z0-9_]*\)")/COPY_FIELD\1(\2"\3"\4,\5"\6"/g' - -<%- end -%> diff --git a/tt_metal/jit_build/build.cpp b/tt_metal/jit_build/build.cpp index 9e82bf0da0b5..00028c9486a5 100644 --- a/tt_metal/jit_build/build.cpp +++ b/tt_metal/jit_build/build.cpp @@ -31,6 +31,7 @@ static std::string get_string_aliased_arch_lowercase(tt::ARCH arch) { case tt::ARCH::GRAYSKULL: return "grayskull"; break; case tt::ARCH::WORMHOLE: return "wormhole"; break; case tt::ARCH::WORMHOLE_B0: return "wormhole"; break; + case tt::ARCH::BLACKHOLE: return "blackhole"; break; default: return "invalid"; break; } } @@ -64,6 +65,9 @@ void JitBuildEnv::init(uint32_t build_key, tt::ARCH arch) case ARCH::WORMHOLE_B0: common_flags = "-mwormhole -march=rv32imw -mtune=rvtt-b1 -mabi=ilp32 "; break; + case ARCH::BLACKHOLE: + common_flags = "-mblackhole -march=rv32iml -mtune=rvtt-b1 -mabi=ilp32 "; + break; default: TT_ASSERT(false, "Invalid arch"); break; @@ -89,6 +93,9 @@ void JitBuildEnv::init(uint32_t build_key, tt::ARCH arch) case ARCH::WORMHOLE_B0: this->defines_ = "-DARCH_WORMHOLE "; break; + case ARCH::BLACKHOLE: + this->defines_ = "-DARCH_BLACKHOLE "; + break; default: break; } diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp index e7f1587509d0..97e25527dadc 100644 --- a/tt_metal/llrt/tt_cluster.cpp +++ b/tt_metal/llrt/tt_cluster.cpp @@ -88,6 +88,12 @@ void Cluster::detect_arch_and_target() { "Arch={} doesn't match compile-time build for WORMHOLE", get_string(this->arch_)); #endif +#ifdef ARCH_BLACKHOLE + TT_FATAL( + this->arch_ == tt::ARCH::BLACKHOLE, + "Arch={} doesn't match compile-time build for BLACKHOLE", + get_string(this->arch_)); +#endif TT_FATAL(this->target_type_ == 
TargetDevice::Versim or this->target_type_ == TargetDevice::Silicon); } @@ -314,6 +320,13 @@ std::int32_t get_static_tlb_index(CoreCoord target) { } #endif +// TODO: pull tlb config into sep file similar to BBE +#ifdef ARCH_BLACKHOLE +std::int32_t get_static_tlb_index(CoreCoord target) { + return -1; +} +#endif + void Cluster::configure_static_tlbs(chip_id_t mmio_device_id) const { auto sdesc = get_soc_desc(mmio_device_id); auto statically_mapped_cores = sdesc.workers; @@ -332,7 +345,9 @@ void Cluster::configure_static_tlbs(chip_id_t mmio_device_id) const { this->get_driver(mmio_device_id).configure_tlb( mmio_device_id, CoreCoord(DEVICE_DATA.DRAM_CHANNEL_0_X, DEVICE_DATA.DRAM_CHANNEL_0_Y), tlb_id, peer_dram_offset); // Align address space of 16MB TLB to 16MB boundary +#ifndef ARCH_BLACKHOLE // TODO (abhullar): clean this up peer_dram_offset += DEVICE_DATA.DYNAMIC_TLB_16M_SIZE; +#endif } this->get_driver(mmio_device_id).setup_core_to_tlb_map([](CoreCoord core) { return get_static_tlb_index(core); }); } diff --git a/tt_metal/llrt/tt_cluster.hpp b/tt_metal/llrt/tt_cluster.hpp index 0363b76b02b9..dc0a5eddf0ac 100644 --- a/tt_metal/llrt/tt_cluster.hpp +++ b/tt_metal/llrt/tt_cluster.hpp @@ -29,7 +29,6 @@ static constexpr std::uint32_t SW_VERSION = 0x00020000; using tt_target_dram = std::tuple; -using tt::DEVICE; using tt::TargetDevice; enum EthRouterMode : uint32_t { From 5b632e5dc25b9d0e8fa209d3dc33c0406d747a3f Mon Sep 17 00:00:00 2001 From: Almeet Bhullar Date: Thu, 16 May 2024 16:04:20 +0000 Subject: [PATCH 33/40] #8530: Uplift UMD to get Blackhole support --- tt_metal/third_party/umd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tt_metal/third_party/umd b/tt_metal/third_party/umd index d851ac2c75db..49038c8f9246 160000 --- a/tt_metal/third_party/umd +++ b/tt_metal/third_party/umd @@ -1 +1 @@ -Subproject commit d851ac2c75db1a876ac22701874c95eb9bd4ce81 +Subproject commit 49038c8f9246391cbf4edfb6ffef929435588b19 From 60331a61b49bbe881c0458b9f99a5df214933ca4 Mon Sep 17 00:00:00 2001 From: Almeet Bhullar Date: Thu, 16 May 2024 17:15:59 +0000 Subject: [PATCH 34/40] #8530: Add build jobs for blackhole --- .github/workflows/build-artifact.yaml | 2 +- .github/workflows/build-cmake.yaml | 2 +- .github/workflows/build.yaml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-artifact.yaml b/.github/workflows/build-artifact.yaml index 6e76bba84b84..4742cc225adb 100644 --- a/.github/workflows/build-artifact.yaml +++ b/.github/workflows/build-artifact.yaml @@ -13,7 +13,7 @@ jobs: build-artifact: strategy: matrix: - arch: ${{ fromJson(inputs.arch || '["grayskull", "wormhole_b0"]') }} + arch: ${{ fromJson(inputs.arch || '["grayskull", "wormhole_b0", "blackhole"]') }} env: TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} diff --git a/.github/workflows/build-cmake.yaml b/.github/workflows/build-cmake.yaml index e8fb37fbb9ad..a67d5d48a4a8 100644 --- a/.github/workflows/build-cmake.yaml +++ b/.github/workflows/build-cmake.yaml @@ -10,7 +10,7 @@ jobs: build-cmake: strategy: matrix: - arch: [grayskull, wormhole_b0] + arch: [grayskull, wormhole_b0, blackhole] name: CMake Build env: TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 43f3fe4a4144..24b97b931da2 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -13,7 +13,7 @@ jobs: { type: Release, runs-on: ubuntu-20.04 }, { type: RelWithDebInfo, runs-on: build }, ] - arch: [grayskull, wormhole_b0] + arch: [grayskull, 
wormhole_b0, blackhole] os: [ubuntu-20.04] env: ARCH_NAME: ${{ matrix.arch }} @@ -54,7 +54,7 @@ jobs: strategy: matrix: config: [Debug, Release, RelWithDebInfo] - arch: [grayskull, wormhole_b0] + arch: [grayskull, wormhole_b0, blackhole] os: [ubuntu-20.04] needs: build-lib name: cmake build cpptest ${{ matrix.config }} ${{ matrix.arch }} From b59ea50709343c79ee81695b6f32219a244a4b27 Mon Sep 17 00:00:00 2001 From: Almeet Bhullar Date: Thu, 16 May 2024 17:17:47 +0000 Subject: [PATCH 35/40] #8560: add blackhole soc desc and prelim blackhome core desc yamls --- tt_metal/common/core_descriptor.hpp | 6 +- tt_metal/common/test_common.hpp | 2 + .../core_descriptors/blackhole_140_arch.yaml | 41 +++++++++ .../blackhole_versim_1x1_arch.yaml | 21 +++++ .../soc_descriptors/blackhole_140_arch.yaml | 86 +++++++++++++++++++ .../blackhole_versim_1x1_arch.yaml | 67 +++++++++++++++ .../soc_descriptors/wormhole_b0_80_arch.yaml | 2 +- 7 files changed, 223 insertions(+), 2 deletions(-) create mode 100644 tt_metal/core_descriptors/blackhole_140_arch.yaml create mode 100644 tt_metal/core_descriptors/blackhole_versim_1x1_arch.yaml create mode 100644 tt_metal/soc_descriptors/blackhole_140_arch.yaml create mode 100644 tt_metal/soc_descriptors/blackhole_versim_1x1_arch.yaml diff --git a/tt_metal/common/core_descriptor.hpp b/tt_metal/common/core_descriptor.hpp index 128661fe4ca9..66c0c92e0b61 100644 --- a/tt_metal/common/core_descriptor.hpp +++ b/tt_metal/common/core_descriptor.hpp @@ -48,6 +48,7 @@ inline std::string get_core_descriptor_file(const tt::ARCH &arch) { case tt::ARCH::GRAYSKULL: return tt_metal_home + "tt_metal/core_descriptors/grayskull_versim_1x1_arch.yaml"; case tt::ARCH::WORMHOLE: throw std::runtime_error("WORMHOLE arch not supported"); case tt::ARCH::WORMHOLE_B0: return tt_metal_home + "tt_metal/core_descriptors/wormhole_b0_versim_1x1_arch.yaml"; + case tt::ARCH::BLACKHOLE: return tt_metal_home + "tt_metal/core_descriptors/blackhole_versim_1x1_arch.yaml"; default: throw std::runtime_error("Unsupported device arch"); }; } else { @@ -57,6 +58,7 @@ inline std::string get_core_descriptor_file(const tt::ARCH &arch) { case tt::ARCH::GRAYSKULL: return tt_metal_home + "tt_metal/core_descriptors/grayskull_120_arch.yaml"; case tt::ARCH::WORMHOLE: throw std::runtime_error("WORMHOLE arch not supported"); case tt::ARCH::WORMHOLE_B0: return tt_metal_home + wh_arch; + case tt::ARCH::BLACKHOLE: return tt_metal_home + "tt_metal/core_descriptors/blackhole_140_arch.yaml"; default: throw std::runtime_error("Unsupported device arch"); }; } @@ -66,7 +68,9 @@ inline std::string get_core_descriptor_file(const tt::ARCH &arch) { inline const std::string get_product_name(tt::ARCH arch, uint32_t num_harvested_rows) { const static std::map> product_name = { {tt::ARCH::GRAYSKULL, {{0, "E150"}, {2, "E75"}}}, - {tt::ARCH::WORMHOLE_B0, {{0, "galaxy"}, {1, "nebula_x1"}, {2, "nebula_x2"}}}}; + {tt::ARCH::WORMHOLE_B0, {{0, "galaxy"}, {1, "nebula_x1"}, {2, "nebula_x2"}}}, + {tt::ARCH::BLACKHOLE, {{0, "blackhole"}}} // TODO (abhullar): revisit blackhole product names + }; return product_name.at(arch).at(num_harvested_rows); } diff --git a/tt_metal/common/test_common.hpp b/tt_metal/common/test_common.hpp index 9b53e4425d7e..b738460f425a 100644 --- a/tt_metal/common/test_common.hpp +++ b/tt_metal/common/test_common.hpp @@ -40,6 +40,7 @@ inline std::string get_soc_description_file(const tt::ARCH &arch, tt::TargetDevi case tt::ARCH::GRAYSKULL: return tt_metal_home + "tt_metal/soc_descriptors/grayskull_versim_1x1_arch.yaml"; case 
tt::ARCH::WORMHOLE: throw std::runtime_error("WORMHOLE arch not supported"); case tt::ARCH::WORMHOLE_B0: return tt_metal_home + "tt_metal/soc_descriptors/wormhole_b0_versim_1x1_arch.yaml"; + case tt::ARCH::BLACKHOLE: return tt_metal_home + "tt_metal/soc_descriptors/blackhole_versim_1x1_arch.yaml"; default: throw std::runtime_error("Unsupported device arch"); }; } else { @@ -49,6 +50,7 @@ inline std::string get_soc_description_file(const tt::ARCH &arch, tt::TargetDevi case tt::ARCH::GRAYSKULL: return tt_metal_home + "tt_metal/soc_descriptors/grayskull_120_arch.yaml"; case tt::ARCH::WORMHOLE: throw std::runtime_error("WORMHOLE arch not supported"); case tt::ARCH::WORMHOLE_B0: return tt_metal_home + "tt_metal/soc_descriptors/wormhole_b0_80_arch.yaml"; + case tt::ARCH::BLACKHOLE: return tt_metal_home + "tt_metal/soc_descriptors/blackhole_140_arch.yaml"; default: throw std::runtime_error("Unsupported device arch"); }; } diff --git a/tt_metal/core_descriptors/blackhole_140_arch.yaml b/tt_metal/core_descriptors/blackhole_140_arch.yaml new file mode 100644 index 000000000000..b47085ceae81 --- /dev/null +++ b/tt_metal/core_descriptors/blackhole_140_arch.yaml @@ -0,0 +1,41 @@ +# Anything using [[#, #]] is logical coordinates (Can be relative) +# relative index: 0 means first row, -1 means last row of functional grid... + +# product name: +# num of HW command queues: +# core descriptor config + +blackhole: + 1: + l1_bank_size: + 1376256 + + compute_with_storage_grid_range: # Logical only start and end [x, y] + start: [0, 0] + end: [7, 6] + + storage_cores: # Relative to grid of tensix cores + [] + + dispatch_cores: + [] + + dispatch_core_type: + "tensix" + + 2: + l1_bank_size: + 1376256 + + compute_with_storage_grid_range: # Logical only start and end [x, y] + start: [0, 0] + end: [7, 6] + + storage_cores: # Relative to grid of tensix cores + [] + + dispatch_cores: + [] + + dispatch_core_type: + "tensix" diff --git a/tt_metal/core_descriptors/blackhole_versim_1x1_arch.yaml b/tt_metal/core_descriptors/blackhole_versim_1x1_arch.yaml new file mode 100644 index 000000000000..0301126f86bc --- /dev/null +++ b/tt_metal/core_descriptors/blackhole_versim_1x1_arch.yaml @@ -0,0 +1,21 @@ +# Anything using [[#, #]] is logical coordinates (Can be relative) +# relative index: 0 means first row, -1 means last row of functional grid... + +# product name: +# num of HW command queues: +# core descriptor config + +blackhole: + 1: + l1_bank_size: + 1376256 + + compute_with_storage_grid_range: + start: [0, 0] + end: [0, 0] + + storage_cores: + [] + + dispatch_cores: + [] diff --git a/tt_metal/soc_descriptors/blackhole_140_arch.yaml b/tt_metal/soc_descriptors/blackhole_140_arch.yaml new file mode 100644 index 000000000000..6113b2b80776 --- /dev/null +++ b/tt_metal/soc_descriptors/blackhole_140_arch.yaml @@ -0,0 +1,86 @@ +# soc-descriptor yaml +# Anything using [#-#] is noc coordinates +# Anything using [[#, #]] is logical coordinates (Can be relative) +# relative index: 0 means first row, -1 means last row of functional grid... 
+grid: + x_size: 17 + y_size: 12 + +arc: + [ 8-0 ] + +pcie: + [ 11-0 ] + +dram: + [ + [0-0, 0-1, 0-11], + [0-2, 0-10, 0-3], + [0-9, 0-4, 0-8], + [0-5, 0-7, 0-6], + [9-0, 9-1, 9-11], + [9-2, 9-10, 9-3], + [9-9, 9-4, 9-8], + [9-5, 9-7, 9-6], + ] + +dram_preferred_eth_endpoint: + [ 0-1, 0-2, 0-4, 0-5, 9-1, 9-2, 9-4, 9-5 ] + +dram_preferred_worker_endpoint: + [ 0-11, 0-3, 0-8, 0-6, 9-11, 9-3, 9-8, 9-6 ] + +dram_address_offsets: + [ 0, 0, 0, 0, 0, 0, 0, 0 ] + +eth: + [ + 1-1, 2-1, 3-1, 4-1, 5-1, 6-1, 7-1, 10-1, 11-1, 12-1, 13-1, 14-1, 15-1, 16-1, + ] + +functional_workers: + [ + 1-2, 2-2, 3-2, 4-2, 5-2, 6-2, 7-2, 10-2, 11-2, 12-2, 13-2, 14-2, 15-2, 16-2, + 1-3, 2-3, 3-3, 4-3, 5-3, 6-3, 7-3, 10-3, 11-3, 12-3, 13-3, 14-3, 15-3, 16-3, + 1-4, 2-4, 3-4, 4-4, 5-4, 6-4, 7-4, 10-4, 11-4, 12-4, 13-4, 14-4, 15-4, 16-4, + 1-5, 2-5, 3-5, 4-5, 5-5, 6-5, 7-5, 10-5, 11-5, 12-5, 13-5, 14-5, 15-5, 16-5, + 1-6, 2-6, 3-6, 4-6, 5-6, 6-6, 7-6, 10-6, 11-6, 12-6, 13-6, 14-6, 15-6, 16-6, + 1-7, 2-7, 3-7, 4-7, 5-7, 6-7, 7-7, 10-7, 11-7, 12-7, 13-7, 14-7, 15-7, 16-7, + 1-8, 2-8, 3-8, 4-8, 5-8, 6-8, 7-8, 10-8, 11-8, 12-8, 13-8, 14-8, 15-8, 16-8, + 1-9, 2-9, 3-9, 4-9, 5-9, 6-9, 7-9, 10-9, 11-9, 12-9, 13-9, 14-9, 15-9, 16-9, + 1-10, 2-10, 3-10, 4-10, 5-10, 6-10, 7-10, 10-10, 11-10, 12-10, 13-10, 14-10, 15-10, 16-10, + 1-11, 2-11, 3-11, 4-11, 5-11, 6-11, 7-11, 10-11, 11-11, 12-11, 13-11, 14-11, 15-11, 16-11, + ] + +harvested_workers: + [] + +router_only: + [ + 1-0, 2-0, 3-0, 4-0, 5-0, 6-0, 7-0, 10-0, 12-0, 13-0, 14-0, 15-0, 16-0, + 8-1, 8-2, 8-3, 8-4, 8-5, 8-6, 8-7, 8-8, 8-9, 8-10, 8-11 + ] + +worker_l1_size: + 1499136 + +dram_bank_size: + 4294967296 + +eth_l1_size: + 262144 + +arch_name: BLACKHOLE + +features: + noc: + translation_id_enabled: True + unpacker: + version: 2 + inline_srca_trans_without_srca_trans_instr: True + math: + dst_size_alignment: 32768 + packer: + version: 2 + overlay: + version: 2 diff --git a/tt_metal/soc_descriptors/blackhole_versim_1x1_arch.yaml b/tt_metal/soc_descriptors/blackhole_versim_1x1_arch.yaml new file mode 100644 index 000000000000..aca55c5fc49f --- /dev/null +++ b/tt_metal/soc_descriptors/blackhole_versim_1x1_arch.yaml @@ -0,0 +1,67 @@ +# soc-descriptor yaml +# Anything using [#-#] is noc coordinates +# Anything using [[#, #]] is logical coordinates (Can be relative) +# relative index: 0 means first row, -1 means last row of functional grid... +grid: + x_size: 17 + y_size: 12 + +arc: + [ 8-0 ] + +pcie: + [ 11-0 ] + +dram: + [ + [0-0, 0-1], + ] + +dram_preferred_eth_endpoint: + [ 0-0 ] + +dram_preferred_worker_endpoint: + [ 0-1 ] + +dram_address_offsets: + [ 0 ] + +eth: + [ ] + +functional_workers: + [ + 1-2 + ] + +harvested_workers: + [] + +router_only: + [ + 1-0 + ] + +worker_l1_size: + 1499136 + +dram_bank_size: + 4294967296 + +eth_l1_size: + 262144 + +arch_name: BLACKHOLE + +features: + noc: + translation_id_enabled: True + unpacker: + version: 2 + inline_srca_trans_without_srca_trans_instr: True + math: + dst_size_alignment: 32768 + packer: + version: 2 + overlay: + version: 2 diff --git a/tt_metal/soc_descriptors/wormhole_b0_80_arch.yaml b/tt_metal/soc_descriptors/wormhole_b0_80_arch.yaml index 3e65525ce0e7..4cee2c265056 100644 --- a/tt_metal/soc_descriptors/wormhole_b0_80_arch.yaml +++ b/tt_metal/soc_descriptors/wormhole_b0_80_arch.yaml @@ -1,4 +1,4 @@ - # soc-descriptor yaml +# soc-descriptor yaml # Anything using [#-#] is noc coordinates # Anything using [[#, #]] is logical coordinates (Can be relative) # relative index: 0 means first row, -1 means last row of functional grid... 
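
[Editor's note: illustrative aside, not part of the patch series. Patch 35 wires the new Blackhole descriptors into the same per-arch descriptor lookup already used for Grayskull and Wormhole. A quick hand check of the soc descriptor is sketched below in plain shell; the expected count of 140 (14 worker columns x 10 worker rows once the DRAM, ethernet, and router rows/columns are excluded from the full 17x12 NOC grid) is inferred from the functional_workers list above and is not produced by an existing tt-metal tool.]

    cd "$(git rev-parse --show-toplevel)"
    # count only the coordinate pairs listed under functional_workers
    sed -n '/^functional_workers:/,/^harvested_workers:/p' \
        tt_metal/soc_descriptors/blackhole_140_arch.yaml \
      | grep -Eo '[0-9]+-[0-9]+' | wc -l    # expect 140 on an unharvested part

Note that the core descriptor above exposes a smaller 8x7 compute-with-storage grid (start [0, 0], end [7, 6], i.e. 56 cores), presumably a conservative subset of the 140 workers for initial bring-up.
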
From 40ea9ecadbaf0914e144c8d5cee8b64a63687ef1 Mon Sep 17 00:00:00 2001 From: Almeet Bhullar Date: Thu, 16 May 2024 19:49:18 +0000 Subject: [PATCH 36/40] #8530: Add Blackhole bring-up specific workarounds to tt::Cluster --- tt_metal/llrt/tt_cluster.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp index 97e25527dadc..d350c03ca535 100644 --- a/tt_metal/llrt/tt_cluster.cpp +++ b/tt_metal/llrt/tt_cluster.cpp @@ -131,7 +131,8 @@ std::filesystem::path get_cluster_desc_yaml() { void Cluster::generate_cluster_descriptor() { this->cluster_desc_path_ = (this->target_type_ == TargetDevice::Silicon and this->arch_ == tt::ARCH::WORMHOLE_B0) ? get_cluster_desc_yaml().string() : ""; - if (this->arch_ == tt::ARCH::GRAYSKULL) { + // create-eth-map not available for Blackhole bring up + if (this->arch_ == tt::ARCH::GRAYSKULL or this->arch_ == tt::ARCH::BLACKHOLE) { // Cannot use tt_SiliconDevice::detect_available_device_ids because that returns physical device IDs std::vector physical_mmio_device_ids = tt_SiliconDevice::detect_available_device_ids(); std::set logical_mmio_device_ids; @@ -357,7 +358,8 @@ void Cluster::start_driver(chip_id_t mmio_device_id, tt_device_params &device_pa TT_FATAL(this->sdesc_per_chip_.size(), "Descriptor must be loaded. Try open_driver()"); - if (this->target_type_ == TargetDevice::Silicon && device_params.init_device) { + // static TLBs avoided for Blackhole bring up + if (this->target_type_ == TargetDevice::Silicon && device_params.init_device && this->arch_ != tt::ARCH::BLACKHOLE) { configure_static_tlbs(mmio_device_id); } @@ -408,6 +410,11 @@ void Cluster::verify_eth_fw() const { } int Cluster::get_device_aiclk(const chip_id_t &chip_id) const { + if (this->arch_ == tt::ARCH::BLACKHOLE) { + // For Blackhole bring up remove AICLK query due to lack of ARC message support + log_info(tt::LogDevice, "For Blackhole remove AICLK query due to lack of ARC message support"); + return 0; + } if (this->device_to_mmio_device_.find(chip_id) != this->device_to_mmio_device_.end()) { // get_clocks returns MMIO device ID -> clock frequency // There is one driver per MMIO device, so we use that to index returned map From b5d4b970def95f6680e5da2cf17630462ad89e8b Mon Sep 17 00:00:00 2001 From: Almeet Bhullar Date: Thu, 16 May 2024 23:09:51 +0000 Subject: [PATCH 37/40] #8530: Pull static/dynamic tlb config out of tt_cluster.cpp and into tlb_config.cpp --- tt_metal/llrt/CMakeLists.txt | 1 + tt_metal/llrt/tlb_config.cpp | 175 +++++++++++++++++++++++++++++++++++ tt_metal/llrt/tlb_config.hpp | 19 ++++ tt_metal/llrt/tt_cluster.cpp | 122 +----------------------- tt_metal/llrt/tt_cluster.hpp | 1 - 5 files changed, 198 insertions(+), 120 deletions(-) create mode 100644 tt_metal/llrt/tlb_config.cpp create mode 100644 tt_metal/llrt/tlb_config.hpp diff --git a/tt_metal/llrt/CMakeLists.txt b/tt_metal/llrt/CMakeLists.txt index 2122f16e1bfa..62e097abc6ac 100644 --- a/tt_metal/llrt/CMakeLists.txt +++ b/tt_metal/llrt/CMakeLists.txt @@ -2,6 +2,7 @@ set(LLRT_SRC ${CMAKE_CURRENT_SOURCE_DIR}/llrt.cpp ${CMAKE_CURRENT_SOURCE_DIR}/rtoptions.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tlb_config.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tt_cluster.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tt_hexfile.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tt_memory.cpp) diff --git a/tt_metal/llrt/tlb_config.cpp b/tt_metal/llrt/tlb_config.cpp new file mode 100644 index 000000000000..925def9d5233 --- /dev/null +++ b/tt_metal/llrt/tlb_config.cpp @@ -0,0 +1,175 @@ +// 
SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "tlb_config.hpp" +#include "device_data.hpp" + +#include "third_party/umd/device/blackhole_implementation.h" +#include "third_party/umd/device/grayskull_implementation.h" +#include "third_party/umd/device/wormhole_implementation.h" + +namespace ll_api { + +namespace grayskull { + +static constexpr uint32_t DYNAMIC_TLB_COUNT = 16; +static constexpr unsigned int MEM_SMALL_READ_WRITE_TLB = DEVICE_DATA.TLB_BASE_INDEX_2M + 1; +static constexpr unsigned int DYNAMIC_TLB_BASE_INDEX = DEVICE_DATA.MEM_LARGE_READ_TLB + 1; +static constexpr uint32_t DYNAMIC_TLB_2M_SIZE = 0; +static constexpr uint32_t DYNAMIC_TLB_16M_SIZE = tt::umd::grayskull::DYNAMIC_TLB_16M_SIZE; + +int32_t get_static_tlb_index(CoreCoord target) { + // Special handling for DRAM TLBs : return a 2MB TLB pointing to the start of the Epoch Cmd Queue Table + // The default 1MB TLB is not used for DRAM cores + // auto DRAM_TLB_IDX = std::find(DEVICE_DATA.DRAM_LOCATIONS.begin(), DEVICE_DATA.DRAM_LOCATIONS.end(), target); + // if (DRAM_TLB_IDX != DEVICE_DATA.DRAM_LOCATIONS.end()) { + // return EPOCH_CMD_QUEUE_TLBS.at(DRAM_TLB_IDX - DEVICE_DATA.DRAM_LOCATIONS.begin()); + // } + int flat_index = target.y * DEVICE_DATA.GRID_SIZE_X + target.x; + if (flat_index == 0) { + return -1; + } + return flat_index; +} + +} // namespace grayskull + +namespace wormhole { + +static constexpr uint32_t DYNAMIC_TLB_COUNT = 16; +static constexpr unsigned int MEM_SMALL_READ_WRITE_TLB = DEVICE_DATA.TLB_BASE_INDEX_2M + 1; +static constexpr uint32_t DYNAMIC_TLB_BASE_INDEX = DEVICE_DATA.MEM_LARGE_READ_TLB + 1; +static constexpr uint32_t DYNAMIC_TLB_2M_SIZE = 0; +static constexpr uint32_t DYNAMIC_TLB_16M_SIZE = tt::umd::wormhole::DYNAMIC_TLB_16M_SIZE; + +int32_t get_static_tlb_index(CoreCoord target) { + bool is_eth_location = + std::find(std::cbegin(DEVICE_DATA.ETH_LOCATIONS), std::cend(DEVICE_DATA.ETH_LOCATIONS), target) != + std::cend(DEVICE_DATA.ETH_LOCATIONS); + bool is_tensix_location = + std::find(std::cbegin(DEVICE_DATA.T6_X_LOCATIONS), std::cend(DEVICE_DATA.T6_X_LOCATIONS), target.x) != + std::cend(DEVICE_DATA.T6_X_LOCATIONS) && + std::find(std::cbegin(DEVICE_DATA.T6_Y_LOCATIONS), std::cend(DEVICE_DATA.T6_Y_LOCATIONS), target.y) != + std::cend(DEVICE_DATA.T6_Y_LOCATIONS); + // implementation migrated from wormhole.py in `src/t6ifc/t6py/packages/tenstorrent/chip/wormhole.py` from tensix + // repo (t6py-wormhole-bringup branch) + + // Special handling for DRAM TLBs : return a 2MB TLB pointing to the start of the Epoch Cmd Queue Table + // The default 1MB TLB is not used for DRAM cores + // auto DRAM_TLB_IDX = std::find(DEVICE_DATA.DRAM_LOCATIONS.begin(), DEVICE_DATA.DRAM_LOCATIONS.end(), target); + // if (DRAM_TLB_IDX != DEVICE_DATA.DRAM_LOCATIONS.end()) { + // return EPOCH_CMD_QUEUE_TLBS.at(DRAM_TLB_IDX - DEVICE_DATA.DRAM_LOCATIONS.begin()); + // } + + if (is_eth_location) { + if (target.y == 6) { + target.y = 1; + } + + if (target.x >= 5) { + target.x -= 1; + } + target.x -= 1; + + int flat_index = target.y * 8 + target.x; + int tlb_index = flat_index; + return tlb_index; + + } else if (is_tensix_location) { + if (target.x >= 5) { + target.x -= 1; + } + target.x -= 1; + + if (target.y >= 6) { + target.y -= 1; + } + target.y -= 1; + + int flat_index = target.y * 8 + target.x; + + // All 80 get single 1MB TLB. 
+ int tlb_index = DEVICE_DATA.ETH_LOCATIONS.size() + flat_index; + + return tlb_index; + } else { + return -1; + } +} + +} // namespace wormhole + +namespace blackhole { + +static constexpr uint32_t DYNAMIC_TLB_COUNT = 16; +static constexpr unsigned int MEM_SMALL_READ_WRITE_TLB = DEVICE_DATA.TLB_BASE_INDEX_2M + 1; +static constexpr uint32_t DYNAMIC_TLB_BASE_INDEX = DEVICE_DATA.MEM_LARGE_READ_TLB + 1; +static constexpr uint32_t DYNAMIC_TLB_2M_SIZE = tt::umd::blackhole::DYNAMIC_TLB_2M_SIZE; +static constexpr uint32_t DYNAMIC_TLB_16M_SIZE = 0; + +int32_t get_static_tlb_index(CoreCoord target) { + return -1; +} + +} // namespace blackhole + +void configure_static_tlbs(tt::ARCH arch, chip_id_t mmio_device_id, const metal_SocDescriptor &sdesc, tt_device &device_driver) { + using get_static_tlb_index_ptr = std::int32_t (*)(tt_xy_pair); + get_static_tlb_index_ptr get_static_tlb_index; + uint32_t DYNAMIC_TLB_BASE_INDEX, DYNAMIC_TLB_COUNT, DYNAMIC_TLB_16M_SIZE, DYNAMIC_TLB_2M_SIZE; + + switch (arch) { + case tt::ARCH::GRAYSKULL: + get_static_tlb_index = grayskull::get_static_tlb_index; + DYNAMIC_TLB_BASE_INDEX = grayskull::DYNAMIC_TLB_BASE_INDEX; + DYNAMIC_TLB_COUNT = grayskull::DYNAMIC_TLB_COUNT; + DYNAMIC_TLB_16M_SIZE = grayskull::DYNAMIC_TLB_16M_SIZE; + DYNAMIC_TLB_2M_SIZE = grayskull::DYNAMIC_TLB_2M_SIZE; + break; + case tt::ARCH::WORMHOLE: + case tt::ARCH::WORMHOLE_B0: + get_static_tlb_index = wormhole::get_static_tlb_index; + DYNAMIC_TLB_BASE_INDEX = wormhole::DYNAMIC_TLB_BASE_INDEX; + DYNAMIC_TLB_COUNT = wormhole::DYNAMIC_TLB_COUNT; + DYNAMIC_TLB_16M_SIZE = wormhole::DYNAMIC_TLB_16M_SIZE; + DYNAMIC_TLB_2M_SIZE = wormhole::DYNAMIC_TLB_2M_SIZE; + break; + case tt::ARCH::BLACKHOLE: + get_static_tlb_index = blackhole::get_static_tlb_index; + DYNAMIC_TLB_BASE_INDEX = blackhole::DYNAMIC_TLB_BASE_INDEX; + DYNAMIC_TLB_COUNT = blackhole::DYNAMIC_TLB_COUNT; + DYNAMIC_TLB_2M_SIZE = blackhole::DYNAMIC_TLB_2M_SIZE; + DYNAMIC_TLB_16M_SIZE = blackhole::DYNAMIC_TLB_16M_SIZE; + break; + default: TT_THROW("Configuring static TLBs is not supported for {}", tt::get_string(arch)); + } + + auto statically_mapped_cores = sdesc.workers; + statically_mapped_cores.insert( + statically_mapped_cores.end(), sdesc.ethernet_cores.begin(), sdesc.ethernet_cores.end()); + std::int32_t address = 0; + + // Setup static TLBs for all worker cores + for (auto &core : statically_mapped_cores) { + auto tlb_index = get_static_tlb_index(core); + device_driver.configure_tlb(mmio_device_id, core, tlb_index, address); + } + // Setup static TLBs for MMIO mapped data space + uint64_t peer_dram_offset = DEVICE_DATA.DRAM_CHANNEL_0_PEER2PEER_REGION_START; + for (uint32_t tlb_id = DYNAMIC_TLB_BASE_INDEX; tlb_id < DYNAMIC_TLB_BASE_INDEX + DYNAMIC_TLB_COUNT; tlb_id++) { + device_driver.configure_tlb( + mmio_device_id, CoreCoord(DEVICE_DATA.DRAM_CHANNEL_0_X, DEVICE_DATA.DRAM_CHANNEL_0_Y), tlb_id, peer_dram_offset); + // Align address space of 16MB TLB to 16MB boundary + peer_dram_offset += DYNAMIC_TLB_16M_SIZE; + } + device_driver.setup_core_to_tlb_map([get_static_tlb_index](CoreCoord core) { return get_static_tlb_index(core); }); +} + +std::unordered_map get_dynamic_tlb_config() { + std::unordered_map dynamic_tlb_config; + dynamic_tlb_config["REG_TLB"] = DEVICE_DATA.REG_TLB; + return dynamic_tlb_config; +} + +} // namespace ll_api diff --git a/tt_metal/llrt/tlb_config.hpp b/tt_metal/llrt/tlb_config.hpp new file mode 100644 index 000000000000..b7b8d589f736 --- /dev/null +++ b/tt_metal/llrt/tlb_config.hpp @@ -0,0 +1,19 @@ +// 
SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "third_party/umd/device/device_api.h" +#include "tt_metal/common/tt_backend_api_types.hpp" +#include "tt_metal/common/metal_soc_descriptor.h" + +#include + +namespace ll_api { + +void configure_static_tlbs(tt::ARCH arch, chip_id_t mmio_device_id, const metal_SocDescriptor &sdesc, tt_device &device_driver); + +std::unordered_map get_dynamic_tlb_config(); + +} // namespace ll_api diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp index d350c03ca535..1e59341b9af3 100644 --- a/tt_metal/llrt/tt_cluster.cpp +++ b/tt_metal/llrt/tt_cluster.cpp @@ -18,17 +18,7 @@ #include "tools/profiler/profiler.hpp" #include "tt_metal/impl/debug/sanitize_noc_host.hpp" #include "tt_metal/llrt/rtoptions.hpp" - -#ifdef ARCH_GRAYSKULL -static constexpr uint32_t DYNAMIC_TLB_COUNT = 16; -static constexpr unsigned int MEM_SMALL_READ_WRITE_TLB = DEVICE_DATA.TLB_BASE_INDEX_2M + 1; -static constexpr unsigned int DYNAMIC_TLB_BASE_INDEX = DEVICE_DATA.MEM_LARGE_READ_TLB + 1; - -#else -static constexpr uint32_t DYNAMIC_TLB_COUNT = 16; -static constexpr unsigned int MEM_SMALL_READ_WRITE_TLB = DEVICE_DATA.TLB_BASE_INDEX_2M + 1; -static constexpr uint32_t DYNAMIC_TLB_BASE_INDEX = DEVICE_DATA.MEM_LARGE_READ_TLB + 1; -#endif +#include "tt_metal/llrt/tlb_config.hpp" namespace tt { @@ -218,8 +208,7 @@ void Cluster::open_driver(chip_id_t mmio_device_id, const std::set &c // Silicon driver will attempt to open this many hugepages as channels, and assert if workload uses more than available. // Metal currently uses assigns 1 channel per device uint32_t num_host_mem_ch_per_mmio_device = controlled_device_ids.size(); - std::unordered_map dynamic_tlb_config = {}; - dynamic_tlb_config["REG_TLB"] = DEVICE_DATA.REG_TLB; + std::unordered_map dynamic_tlb_config = ll_api::get_dynamic_tlb_config(); // This will remove harvested rows from the soc descriptor const bool perform_harvesting = true; const bool clean_system_resources = true; @@ -248,111 +237,6 @@ void Cluster::open_driver(chip_id_t mmio_device_id, const std::set &c this->mmio_device_id_to_driver_[mmio_device_id] = std::move(device_driver); } -#ifdef ARCH_WORMHOLE -std::int32_t get_static_tlb_index(CoreCoord target) { - bool is_eth_location = - std::find(std::cbegin(DEVICE_DATA.ETH_LOCATIONS), std::cend(DEVICE_DATA.ETH_LOCATIONS), target) != - std::cend(DEVICE_DATA.ETH_LOCATIONS); - bool is_tensix_location = - std::find(std::cbegin(DEVICE_DATA.T6_X_LOCATIONS), std::cend(DEVICE_DATA.T6_X_LOCATIONS), target.x) != - std::cend(DEVICE_DATA.T6_X_LOCATIONS) && - std::find(std::cbegin(DEVICE_DATA.T6_Y_LOCATIONS), std::cend(DEVICE_DATA.T6_Y_LOCATIONS), target.y) != - std::cend(DEVICE_DATA.T6_Y_LOCATIONS); - // implementation migrated from wormhole.py in `src/t6ifc/t6py/packages/tenstorrent/chip/wormhole.py` from tensix - // repo (t6py-wormhole-bringup branch) - - // Special handling for DRAM TLBs : return a 2MB TLB pointing to the start of the Epoch Cmd Queue Table - // The default 1MB TLB is not used for DRAM cores - // auto DRAM_TLB_IDX = std::find(DEVICE_DATA.DRAM_LOCATIONS.begin(), DEVICE_DATA.DRAM_LOCATIONS.end(), target); - // if (DRAM_TLB_IDX != DEVICE_DATA.DRAM_LOCATIONS.end()) { - // return EPOCH_CMD_QUEUE_TLBS.at(DRAM_TLB_IDX - DEVICE_DATA.DRAM_LOCATIONS.begin()); - // } - - if (is_eth_location) { - if (target.y == 6) { - target.y = 1; - } - - if (target.x >= 5) { - target.x -= 1; - } - target.x -= 1; - - int flat_index = target.y * 8 + target.x; 
- int tlb_index = flat_index; - return tlb_index; - - } else if (is_tensix_location) { - if (target.x >= 5) { - target.x -= 1; - } - target.x -= 1; - - if (target.y >= 6) { - target.y -= 1; - } - target.y -= 1; - - int flat_index = target.y * 8 + target.x; - - // All 80 get single 1MB TLB. - int tlb_index = DEVICE_DATA.ETH_LOCATIONS.size() + flat_index; - - return tlb_index; - } else { - return -1; - } -} -#endif - -#ifdef ARCH_GRAYSKULL -std::int32_t get_static_tlb_index(CoreCoord target) { - // Special handling for DRAM TLBs : return a 2MB TLB pointing to the start of the Epoch Cmd Queue Table - // The default 1MB TLB is not used for DRAM cores - // auto DRAM_TLB_IDX = std::find(DEVICE_DATA.DRAM_LOCATIONS.begin(), DEVICE_DATA.DRAM_LOCATIONS.end(), target); - // if (DRAM_TLB_IDX != DEVICE_DATA.DRAM_LOCATIONS.end()) { - // return EPOCH_CMD_QUEUE_TLBS.at(DRAM_TLB_IDX - DEVICE_DATA.DRAM_LOCATIONS.begin()); - // } - int flat_index = target.y * DEVICE_DATA.GRID_SIZE_X + target.x; - if (flat_index == 0) { - return -1; - } - return flat_index; -} -#endif - -// TODO: pull tlb config into sep file similar to BBE -#ifdef ARCH_BLACKHOLE -std::int32_t get_static_tlb_index(CoreCoord target) { - return -1; -} -#endif - -void Cluster::configure_static_tlbs(chip_id_t mmio_device_id) const { - auto sdesc = get_soc_desc(mmio_device_id); - auto statically_mapped_cores = sdesc.workers; - statically_mapped_cores.insert( - statically_mapped_cores.end(), sdesc.ethernet_cores.begin(), sdesc.ethernet_cores.end()); - std::int32_t address = 0; - - // Setup static TLBs for all worker cores - for (auto &core : statically_mapped_cores) { - auto tlb_index = get_static_tlb_index(core); - this->get_driver(mmio_device_id).configure_tlb(mmio_device_id, core, tlb_index, address); - } - // Setup static TLBs for MMIO mapped data space - uint64_t peer_dram_offset = DEVICE_DATA.DRAM_CHANNEL_0_PEER2PEER_REGION_START; - for (uint32_t tlb_id = DYNAMIC_TLB_BASE_INDEX; tlb_id < DYNAMIC_TLB_BASE_INDEX + DYNAMIC_TLB_COUNT; tlb_id++) { - this->get_driver(mmio_device_id).configure_tlb( - mmio_device_id, CoreCoord(DEVICE_DATA.DRAM_CHANNEL_0_X, DEVICE_DATA.DRAM_CHANNEL_0_Y), tlb_id, peer_dram_offset); - // Align address space of 16MB TLB to 16MB boundary -#ifndef ARCH_BLACKHOLE // TODO (abhullar): clean this up - peer_dram_offset += DEVICE_DATA.DYNAMIC_TLB_16M_SIZE; -#endif - } - this->get_driver(mmio_device_id).setup_core_to_tlb_map([](CoreCoord core) { return get_static_tlb_index(core); }); -} - void Cluster::start_driver(chip_id_t mmio_device_id, tt_device_params &device_params) const { device_params.init_device = true; @@ -360,7 +244,7 @@ void Cluster::start_driver(chip_id_t mmio_device_id, tt_device_params &device_pa // static TLBs avoided for Blackhole bring up if (this->target_type_ == TargetDevice::Silicon && device_params.init_device && this->arch_ != tt::ARCH::BLACKHOLE) { - configure_static_tlbs(mmio_device_id); + ll_api::configure_static_tlbs(this->arch_, mmio_device_id, this->get_soc_desc(mmio_device_id), this->get_driver(mmio_device_id)); } this->mmio_device_id_to_driver_.at(mmio_device_id)->start_device(device_params); diff --git a/tt_metal/llrt/tt_cluster.hpp b/tt_metal/llrt/tt_cluster.hpp index dc0a5eddf0ac..a17662ffffbc 100644 --- a/tt_metal/llrt/tt_cluster.hpp +++ b/tt_metal/llrt/tt_cluster.hpp @@ -185,7 +185,6 @@ class Cluster { tt_device &get_driver(chip_id_t device_id) const; void get_metal_desc_from_tt_desc(const std::unordered_map &input, const std::unordered_map &per_chip_id_harvesting_masks); tt_cxy_pair 
convert_physical_cxy_to_virtual(const tt_cxy_pair &physical_cxy) const; - void configure_static_tlbs(chip_id_t mmio_device_id) const; // Returns map of connected chip ids to active ethernet cores std::unordered_map> get_ethernet_cores_grouped_by_connected_chips( From 71efe319772b330c9d5238b8a1d56c8a4e621420 Mon Sep 17 00:00:00 2001 From: Michael Chiou Date: Thu, 16 May 2024 11:21:09 -0700 Subject: [PATCH 38/40] #7944: Add dockerfile and scripts to bringup docker instance of tt-metal --- dockerfile/ubuntu-20.04-x86.Dockerfile | 40 ++++++++++++++++++++++++++ scripts/docker/install_test_deps.sh | 28 ++++++++++++++++++ scripts/docker/requirements.txt | 27 +++++++++++++++++ scripts/docker/requirements_dev.txt | 5 ++++ 4 files changed, 100 insertions(+) create mode 100644 dockerfile/ubuntu-20.04-x86.Dockerfile create mode 100755 scripts/docker/install_test_deps.sh create mode 100644 scripts/docker/requirements.txt create mode 100644 scripts/docker/requirements_dev.txt diff --git a/dockerfile/ubuntu-20.04-x86.Dockerfile b/dockerfile/ubuntu-20.04-x86.Dockerfile new file mode 100644 index 000000000000..bdb5cb7d8697 --- /dev/null +++ b/dockerfile/ubuntu-20.04-x86.Dockerfile @@ -0,0 +1,40 @@ +# Second stage: the actual image +FROM ubuntu:20.04 + +ARG DEBIAN_FRONTEND=noninteractive +ENV GTEST_VERSION=1.13.0 +ENV DOXYGEN_VERSION=1.9.6 + +# Install build and runtime deps +COPY /scripts/docker/requirements.txt /opt/tt_metal_infra/scripts/docker/requirements.txt +RUN apt-get -y update \ + && xargs -a /opt/tt_metal_infra/scripts/docker/requirements.txt apt-get install -y --no-install-recommends \ + && rm -rf /var/lib/apt/lists/* + +# Install dev deps +COPY /scripts/docker/requirements_dev.txt /opt/tt_metal_infra/scripts/docker/requirements_dev.txt +RUN apt-get -y update \ + && xargs -a /opt/tt_metal_infra/scripts/docker/requirements_dev.txt apt-get install -y --no-install-recommends \ + && rm -rf /var/lib/apt/lists/* + +## Test Related Dependencies +COPY /scripts/docker/install_test_deps.sh /opt/tt_metal_infra/scripts/docker/install_test_deps.sh +RUN /bin/bash /opt/tt_metal_infra/scripts/docker/install_test_deps.sh ${GTEST_VERSION} ${DOXYGEN_VERSION} + +# Copy remaining convenience scripts +COPY /scripts /opt/tt_metal_infra/scripts +COPY build_metal.sh /scripts/build_metal.sh + +# ENV TT_METAL_INFRA_DIR=/opt/tt_metal_infra +# ENV PYTHON_ENV_DIR=${TT_METAL_INFRA_DIR}/tt-metal/python_env +# RUN python3 -m venv $PYTHON_ENV_DIR + +# COPY /docs/requirements-docs.txt ${TT_METAL_INFRA_DIR}/tt-metal/docs/. +# COPY /tt_metal/python_env/* ${TT_METAL_INFRA_DIR}/tt-metal/tt_metal/python_env/. 
+# ENV PATH="$PYTHON_ENV_DIR/bin:$PATH"
+# RUN python3 -m pip config set global.extra-index-url https://download.pytorch.org/whl/cpu \
+# && python3 -m pip install setuptools wheel
+
+# RUN python3 -m pip install -r ${TT_METAL_INFRA_DIR}/tt-metal/tt_metal/python_env/requirements-dev.txt
+
+CMD ["tail", "-f", "/dev/null"]
diff --git a/scripts/docker/install_test_deps.sh b/scripts/docker/install_test_deps.sh
new file mode 100755
index 000000000000..2f6c3fedde2b
--- /dev/null
+++ b/scripts/docker/install_test_deps.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# Check if two arguments are provided
+if [ "$#" -ne 2 ]; then
+    echo "Usage: $0 <GTEST_VERSION> <DOXYGEN_VERSION>"
+    exit 1
+fi
+
+GTEST_VERSION=$1
+DOXYGEN_VERSION=$2
+
+# Installs Google test
+mkdir -p /opt/tt_metal_infra/googletest
+chmod ugo+w /opt/tt_metal_infra/googletest
+wget -O /opt/tt_metal_infra/googletest/googletest-release-${GTEST_VERSION}.tar.gz https://github.com/google/googletest/archive/refs/tags/v${GTEST_VERSION}.tar.gz
+tar -xzf /opt/tt_metal_infra/googletest/googletest-release-${GTEST_VERSION}.tar.gz -C /opt/tt_metal_infra/googletest/
+cd /opt/tt_metal_infra/googletest/googletest-${GTEST_VERSION}
+cmake -DCMAKE_INSTALL_PREFIX=/usr -DBUILD_SHARED_LIBS=ON .
+make
+make install
+
+# Install doxygen
+mkdir -p /opt/tt_metal_infra/doxygen
+wget -O /opt/tt_metal_infra/doxygen/doxygen-${DOXYGEN_VERSION}.linux.bin.tar.gz "https://www.doxygen.nl/files/doxygen-${DOXYGEN_VERSION}.linux.bin.tar.gz"
+tar -xzf /opt/tt_metal_infra/doxygen/doxygen-${DOXYGEN_VERSION}.linux.bin.tar.gz -C /opt/tt_metal_infra/doxygen/
+rm /opt/tt_metal_infra/doxygen/doxygen-${DOXYGEN_VERSION}.linux.bin.tar.gz
+cd /opt/tt_metal_infra/doxygen/doxygen-${DOXYGEN_VERSION}
+make install
diff --git a/scripts/docker/requirements.txt b/scripts/docker/requirements.txt
new file mode 100644
index 000000000000..9404aefa4bda
--- /dev/null
+++ b/scripts/docker/requirements.txt
@@ -0,0 +1,27 @@
+apt-utils
+dialog
+software-properties-common=0.99.9.12
+build-essential=12.8ubuntu1.1
+libgoogle-glog-dev=0.4.0-1build1
+libyaml-cpp-dev=0.6.2-4ubuntu1
+git
+git-lfs
+clang-6.0=1:6.0.1-14
+libboost-all-dev=1.71.0.0ubuntu2
+libsndfile1=1.0.28-7ubuntu0.2
+pandoc
+libtbb-dev
+libcapstone-dev
+pkg-config
+cmake=3.16.3-1ubuntu1.20.04.1
+curl
+wget
+python3-pip
+libhwloc-dev
+
+libhdf5-serial-dev
+ruby=1:2.7+1
+python3.8-venv=3.8.10-0ubuntu1~20.04.9
+
+cargo
+ninja-build
\ No newline at end of file
diff --git a/scripts/docker/requirements_dev.txt b/scripts/docker/requirements_dev.txt
new file mode 100644
index 000000000000..da9014e02c4b
--- /dev/null
+++ b/scripts/docker/requirements_dev.txt
@@ -0,0 +1,5 @@
+sudo
+nano
+acl=2.2.53-6
+jq=1.6-1ubuntu0.20.04.1
+openssh-server
From d7faf7d4411d5e5f54100ce895328af4495cb689 Mon Sep 17 00:00:00 2001
From: Michael Chiou <156848643+ttmchiou@users.noreply.github.com>
Date: Mon, 6 May 2024 22:52:32 -0700
Subject: [PATCH 39/40] #7944: Updated Codeowners with install and dockerfile
 files

---
 CODEOWNERS | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CODEOWNERS b/CODEOWNERS
index e4e6c841c956..7807fa5d3a49 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -181,3 +181,6 @@ tests/python_api_testing/conv/ @tt-nshanker
 tests/python_api_testing/unit_testing/fallback_ops @tt-aho
 scripts/profiler/ @mo-tenstorrent
 scripts/install @tapspatel
+scripts/docker @ttmchiou @TT-billteng @tt-rkim
+
+dockerfile @ttmchiou @TT-billteng @tt-rkim
\ No newline at end of file
From c560a262a8a913780fe49c6cb7628cab718604d6 Mon Sep 17 00:00:00 2001
From: Michael Chiou <156848643+ttmchiou@users.noreply.github.com>
Date: Fri, 17 May 2024 13:02:29 -0700
Subject: [PATCH 40/40] #7944: Maintain posix compliance,
 add vim, remove pinned dep versions

removing pinned version of jq and acl
posix lines for CODEOWNERS
adding vim to requirements_dev.txt
---
 CODEOWNERS                          | 2 +-
 scripts/docker/requirements.txt     | 2 +-
 scripts/docker/requirements_dev.txt | 5 +++--
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/CODEOWNERS b/CODEOWNERS
index 7807fa5d3a49..ac7466c7f4e7 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -183,4 +183,4 @@ scripts/profiler/ @mo-tenstorrent
 scripts/install @tapspatel
 scripts/docker @ttmchiou @TT-billteng @tt-rkim
 
-dockerfile @ttmchiou @TT-billteng @tt-rkim
\ No newline at end of file
+dockerfile @ttmchiou @TT-billteng @tt-rkim
diff --git a/scripts/docker/requirements.txt b/scripts/docker/requirements.txt
index 9404aefa4bda..a16e8ed49e05 100644
--- a/scripts/docker/requirements.txt
+++ b/scripts/docker/requirements.txt
@@ -24,4 +24,4 @@ ruby=1:2.7+1
 python3.8-venv=3.8.10-0ubuntu1~20.04.9
 
 cargo
-ninja-build
\ No newline at end of file
+ninja-build
diff --git a/scripts/docker/requirements_dev.txt b/scripts/docker/requirements_dev.txt
index da9014e02c4b..70c5648fde34 100644
--- a/scripts/docker/requirements_dev.txt
+++ b/scripts/docker/requirements_dev.txt
@@ -1,5 +1,6 @@
 sudo
 nano
-acl=2.2.53-6
-jq=1.6-1ubuntu0.20.04.1
+acl
+jq
 openssh-server
+vim
\ No newline at end of file
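
[Editor's note: illustrative aside, not part of the patch series. Two small points in this final patch are easy to miss. First, POSIX defines a line as ending in a newline, so the "\ No newline at end of file" fixes make CODEOWNERS and requirements.txt proper text files that line-oriented tools handle predictably. Second, dropping exact apt pins matters because Ubuntu archives rotate package builds: once a pinned build such as jq=1.6-1ubuntu0.20.04.1 is superseded on the mirrors, the pinned install starts failing, while the bare package name keeps resolving. A minimal shell sketch of the difference, assuming an Ubuntu 20.04 image; the xargs line mirrors how the Dockerfile added in patch 38 consumes these files.]

    # pinned: installs exactly this build, and fails once the archive drops it
    apt-get install -y jq=1.6-1ubuntu0.20.04.1
    # unpinned: installs whatever candidate the configured mirror currently ships
    apt-get install -y jq
    # the requirements files list one package per line and are consumed the same
    # way the Dockerfile installs requirements.txt:
    apt-get update \
      && xargs -a scripts/docker/requirements_dev.txt apt-get install -y --no-install-recommends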