diff --git a/device/CMakeLists.txt b/device/CMakeLists.txt index cecc88db..36b34679 100644 --- a/device/CMakeLists.txt +++ b/device/CMakeLists.txt @@ -1,20 +1,18 @@ set(UMD_DEVICE_SRCS architecture_implementation.cpp - blackhole_implementation.cpp cpuset_lib.cpp - grayskull_implementation.cpp tlb.cpp tt_cluster_descriptor.cpp tt_device.cpp - tt_emulation_stub.cpp tt_silicon_driver.cpp tt_silicon_driver_common.cpp tt_soc_descriptor.cpp - tt_versim_stub.cpp - wormhole_implementation.cpp simulation/tt_simulation_device.cpp simulation/tt_simulation_host.cpp + blackhole/blackhole_implementation.cpp + grayskull/grayskull_implementation.cpp + wormhole/wormhole_implementation.cpp ) add_library(umd_device SHARED ${UMD_DEVICE_SRCS}) target_link_libraries(umd_device diff --git a/device/architecture_implementation.cpp b/device/architecture_implementation.cpp index 96117d96..d55d3e29 100644 --- a/device/architecture_implementation.cpp +++ b/device/architecture_implementation.cpp @@ -4,9 +4,9 @@ #include "device/architecture_implementation.h" -#include "device/blackhole_implementation.h" -#include "device/grayskull_implementation.h" -#include "device/wormhole_implementation.h" +#include "device/blackhole/blackhole_implementation.h" +#include "device/grayskull/grayskull_implementation.h" +#include "device/wormhole/wormhole_implementation.h" namespace tt::umd { diff --git a/device/blackhole_implementation.cpp b/device/blackhole/blackhole_implementation.cpp similarity index 98% rename from device/blackhole_implementation.cpp rename to device/blackhole/blackhole_implementation.cpp index 4c36838c..eda2f140 100644 --- a/device/blackhole_implementation.cpp +++ b/device/blackhole/blackhole_implementation.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "device/blackhole_implementation.h" +#include "blackhole_implementation.h" namespace tt::umd { diff --git a/device/blackhole_implementation.h b/device/blackhole/blackhole_implementation.h similarity index 100% rename from device/blackhole_implementation.h rename to device/blackhole/blackhole_implementation.h diff --git a/device/blackhole/impl_device.hpp b/device/blackhole/impl_device.hpp deleted file mode 100644 index afb4091c..00000000 --- a/device/blackhole/impl_device.hpp +++ /dev/null @@ -1,76 +0,0 @@ -#pragma once - -#include "device/tt_silicon_driver_common.hpp" - -// See src/t6ifc/t6py/packages/tenstorrent/data/wormhole/pci/tlb.yaml -// local_offset: [ 0, 15, 0, "36-bit address prefix, prepended to the 20 LSBs of issued address to form a 56-bit NOC address. The 1MB TLB #n corresponds to the 1MB MMIO range starting at (0x0 + N*0x100000)."] -// x_end : [ 0, 21, 16, "" ] -// y_end : [ 0, 27, 22, "" ] -// x_start : [ 0, 33, 28, "" ] -// y_start : [ 0, 39, 34, "" ] -// noc_sel: [ 0, 40, 40, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 41, 41, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 43, 42, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 44, 44, "linked"] - -// local_offset: [ 0, 14, 0, "35-bit address prefix, prepended to the 21 LSBs of issued address to form a 56-bit NOC address. 
The 2MB TLB #n corresponds to the 2MB MMIO range starting at (0x9C00000 + N*0x200000)."] -// x_end : [ 0, 20, 15, "" ] -// y_end : [ 0, 26, 21, "" ] -// x_start : [ 0, 32, 27, "" ] -// y_start : [ 0, 38, 33, "" ] -// noc_sel: [ 0, 39, 39, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 40, 40, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 42, 41, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 43, 43, "linked"] - -// local_offset: [ 0, 11, 0, "32-bit address prefix, prepended to the 24 LSBs of issued address to form a 56-bit NOC address. The 16MB TLB #n corresponds to the 16MB MMIO range starting at (0xB000000 + N*0x1000000)."] -// x_end : [ 0, 17, 12, "" ] -// y_end : [ 0, 23, 18, "" ] -// x_start : [ 0, 29, 24, "" ] -// y_start : [ 0, 35, 30, "" ] -// noc_sel: [ 0, 36, 36, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 37, 37, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 39, 38, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 40, 40, "linked"] - -const auto TLB_1M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 16, - .y_end = 22, - .x_start = 28, - .y_start = 34, - .noc_sel = 40, - .mcast = 41, - .ordering = 42, - .linked = 44, - .static_vc = 45, - .static_vc_end = 46 -}; - -const auto TLB_2M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 15, - .y_end = 21, - .x_start = 27, - .y_start = 33, - .noc_sel = 39, - .mcast = 40, - .ordering = 41, - .linked = 43, - .static_vc = 44, - .static_vc_end = 45 -}; - -const auto TLB_16M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 12, - .y_end = 18, - .x_start = 24, - .y_start = 30, - .noc_sel = 36, - .mcast = 37, - .ordering = 38, - .linked = 40, - .static_vc = 41, - .static_vc_end = 42 -}; diff --git a/device/cpuset_lib.cpp b/device/cpuset_lib.cpp index 803ee8eb..123b5fd0 100644 --- a/device/cpuset_lib.cpp +++ b/device/cpuset_lib.cpp @@ -7,39 +7,13 @@ #include "cpuset_lib.hpp" #include "common/logger.hpp" #include -#include "device/device_api.h" +#include "device/tt_device.h" #include namespace tt { namespace fs = std::filesystem; namespace cpuset { -// Unrelated to hwloc binding of threads, instead to query cpu affinity to find reasonable number of threads to parallelize over. 
-int get_allowed_num_threads(){ - unsigned int num_pus_in_system = sysconf(_SC_NPROCESSORS_ONLN); - unsigned int num_threads = num_pus_in_system; - - cpu_set_t mask; - if (sched_getaffinity(0, sizeof(cpu_set_t), &mask) == -1) { - log_warning(LogSiliconDriver, "Could not detect current process cpu id affinity for calculating num_threads, will use default num_threads: {}.", num_threads); - } else{ - unsigned int visible_pu_count = CPU_COUNT(&mask); - if (visible_pu_count < num_pus_in_system){ - num_threads = visible_pu_count; - } - log_trace(LogSiliconDriver, "Detected (allowed) visible_pu_count: {}, setting num_threads: {}", visible_pu_count, num_threads); - } - - char const* override_thread_count = std::getenv("TT_BACKEND_COMPILE_THREADS"); - if (override_thread_count != nullptr && std::atoi(override_thread_count) > 0){ - num_threads = std::atoi(override_thread_count); - log_trace(LogSiliconDriver, "Overriding via env-var to num_threads: {}", num_threads); - } - - return num_threads; -} - - ///////////////////////////////////////////////////////////////////////// // Initialization Functions ///////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// @@ -49,7 +23,6 @@ tt_cpuset_allocator::tt_cpuset_allocator() { m_pid = getpid(); m_debug = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_DEBUG") ? true : false; - m_skip_singlify = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_SKIP_SINGLIFY") ? true : false; // Chicken bit to disable this entire feature for debug/comparison. bool cpuset_allocator_enable_env = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_ENABLE") ? true : false; @@ -72,7 +45,6 @@ tt_cpuset_allocator::tt_cpuset_allocator() { if (is_cpu_supported){ m_enable_cpuset_allocator &= init_determine_cpuset_allocations(); - m_enable_cpuset_allocator &= init_populate_physical_mmio_device_id_map(); }else{ m_enable_cpuset_allocator = false; } @@ -351,206 +323,10 @@ bool tt_cpuset_allocator::init_determine_cpuset_allocations(){ } -// Step 6 - Populate map of logical to physical mmio device map. -bool tt_cpuset_allocator::init_populate_physical_mmio_device_id_map(){ - - if (!m_enable_cpuset_allocator){ - return false; - } - - log_debug(LogSiliconDriver,"Starting tt_cpuset_allocator::populate_physical_mmio_device_id_map()"); - - // Get map of logical to physical device ids - FIXME: This is not accurate for some WHB0 clusters. - std::vector available_device_ids = tt_SiliconDevice::detect_available_device_ids(); - m_logical_to_physical_mmio_device_id_map = tt_SiliconDevice::get_logical_to_physical_mmio_device_id_map(available_device_ids); - - for (auto &d: m_logical_to_physical_mmio_device_id_map){ - auto logical_device_id = d.first; - auto physical_device_id = d.second; - log_debug(LogSiliconDriver, "populate_physical_mmio_device_id_map() -- available_devices: {} logical_device_id: {} => physical_device_id: {}", available_device_ids.size(), (int) logical_device_id, (int) physical_device_id); - m_num_threads_pinned_per_tt_device.insert({physical_device_id, 0}); - } - - return true; // Success -} - - ///////////////////////////////////////////////////////////////////////// // Runtime Functions //////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// -// Idea - Something to compare cpuset from Slurm to cpuset picked by this function. 
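For reference, the get_allowed_num_threads() heuristic deleted above reduces to the following standalone sketch (assuming Linux/glibc, where cpu_set_t and CPU_COUNT come from sched.h; the free-function name here is illustrative, not part of UMD):

#include <sched.h>   // sched_getaffinity, cpu_set_t, CPU_COUNT (glibc)
#include <unistd.h>  // sysconf
#include <cstdlib>   // std::getenv, std::atoi

static int allowed_num_threads() {
    // Start from the number of online processors.
    int num_threads = static_cast<int>(sysconf(_SC_NPROCESSORS_ONLN));

    // Shrink to the process' affinity mask when restricted (taskset, cgroups, Slurm).
    cpu_set_t mask;
    if (sched_getaffinity(0, sizeof(mask), &mask) == 0) {
        int visible = CPU_COUNT(&mask);
        if (visible < num_threads) num_threads = visible;
    }

    // Same env-var override the removed code honored.
    if (const char* env = std::getenv("TT_BACKEND_COMPILE_THREADS")) {
        if (std::atoi(env) > 0) num_threads = std::atoi(env);
    }
    return num_threads;
}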
-hwloc_cpuset_t tt_cpuset_allocator::allocate_cpu_set_for_thread(chip_id_t physical_device_id, bool skip_singlify){ - - // To prevent races on read/modify/write to m_num_threads_pinned_per_tt_device across threads to same device. - const std::lock_guard lock(allocate_cpu_id_mutex); - - int num_alloc_slots_for_tt_device = m_physical_device_id_to_cpusets_map.at(physical_device_id).size(); - int tt_device_alloc_idx = m_num_threads_pinned_per_tt_device.at(physical_device_id) % num_alloc_slots_for_tt_device; - - // Check if 2CCX-PER-CCD Optimization can be enabled. For AMD EPYC models : There is 1 L3Cache per CCX and 2 CCX per CCD. - // Better perf to first allocate to unique CCD's if we have enough per device. Expand to other CPU types? - bool enable_special_case = true; - auto package_id = m_physical_device_id_to_package_id_map.at(physical_device_id); - auto num_l3_per_ccx = m_package_id_to_num_l3_per_ccx_map.at(package_id); - auto num_ccx_per_ccd = m_package_id_to_num_ccx_per_ccd_map.at(package_id); - - if (enable_special_case && num_l3_per_ccx == 1 && num_ccx_per_ccd == 2 && num_alloc_slots_for_tt_device > num_ccx_per_ccd && m_object_per_alloc_slot == HWLOC_OBJ_L3CACHE){ - int alloc_idx_for_device = m_num_threads_pinned_per_tt_device.at(physical_device_id); - int ccx_in_ccd = (alloc_idx_for_device % num_alloc_slots_for_tt_device) < num_alloc_slots_for_tt_device/num_ccx_per_ccd ? 0 : 1; - tt_device_alloc_idx = (ccx_in_ccd + (alloc_idx_for_device * num_ccx_per_ccd)) % num_alloc_slots_for_tt_device; - log_debug(LogSiliconDriver,"Special L3Cache case physical_device_id: {} alloc_idx_for_device: {} ccx_in_ccd: {} tt_device_alloc_idx: {}", physical_device_id, alloc_idx_for_device, ccx_in_ccd, tt_device_alloc_idx); - } - - - // Get the desired cpuset and prevent migration between different PU's in set by singlifying to single PU. - hwloc_cpuset_t cpuset = hwloc_bitmap_dup(m_physical_device_id_to_cpusets_map.at(physical_device_id).at(tt_device_alloc_idx)); - if (!m_skip_singlify && !skip_singlify){ - hwloc_bitmap_singlify(cpuset); - } - - // Debug - auto tid = std::this_thread::get_id(); - log_debug(LogSiliconDriver,"Allocating for physical_device_id: {} num_alloc_slots: {} num_threads_pinned: {} alloc_idx: {} skip_singlify: {} (pid: {} tid: {}) => {} PU's {}", - physical_device_id, num_alloc_slots_for_tt_device, m_num_threads_pinned_per_tt_device.at(physical_device_id), tt_device_alloc_idx, skip_singlify, - m_pid, tid, hwloc_bitmap_weight(cpuset), get_hwloc_bitmap_vector(cpuset)); - - // Increment counter to keep track of number of pinned thread per device, to get unique cpuset per thread. 
- m_num_threads_pinned_per_tt_device.at(physical_device_id)++; - - return cpuset; -} - -void tt_cpuset_allocator::store_thread_original_cpuset(){ - - auto tid = std::this_thread::get_id(); - hwloc_cpuset_t orig_cpuset = hwloc_bitmap_alloc(); - - if (hwloc_get_cpubind(m_topology, orig_cpuset, HWLOC_CPUBIND_THREAD)){ - log_warning(LogSiliconDriver,"store_thread_original_cpuset() calling hwloc_get_cpubind() failed with errno: {} (pid: {} tid:{})", strerror(errno), m_pid, tid); - }else{ - auto orig_cpuset_vector = get_hwloc_bitmap_vector(orig_cpuset); - log_debug(LogSiliconDriver, "store_thread_original_cpuset() success - got orig cpuset: {} PU's: {} (pid: {} tid: {})", orig_cpuset_vector.size(), orig_cpuset_vector, m_pid, tid); - m_global_thread_id_to_original_cpuset_map.insert({tid, hwloc_bitmap_dup(orig_cpuset)}); - } - hwloc_bitmap_free(orig_cpuset); -} - - - -// Given a logical device_id, determine the right cpu_ids associated with it and pin this thread to them. -void tt_cpuset_allocator::bind_thread_cpuset(tt_cluster_description *ndesc, chip_id_t logical_device_id, bool skip_singlify){ - - auto tid = std::this_thread::get_id(); - - // This needed to be protected by not-empty otherwise arithmetic error. - if ((!m_global_thread_ids_pinned.empty() && m_global_thread_ids_pinned.count(tid)) || (!m_enable_cpuset_allocator)){ - return; - }else{ - - if (!ndesc->is_chip_mmio_capable(logical_device_id)){ - logical_device_id = ndesc->get_closest_mmio_capable_chip(logical_device_id); - } - - log_debug(LogSiliconDriver,"bind_thread_cpuset_cpuset() for logical_device_id: {} m_logical_to_physical_mmio_device_id_map.size(): {}", logical_device_id, m_logical_to_physical_mmio_device_id_map.size()); - - // If a main thread ID was captured, make sure it is not attempted to be pinned. Only IO API sub threads are expected to be pinned today. - if (m_stored_main_thread_id && tid == m_main_thread_id){ - log_warning(LogSiliconDriver, "bind_thread_cpuset() - Skipping cpubind for runtime main thread_id: {} to prevent undesired inheritence. Consider moving device IO (ie. push/pop/get) to sub-threads for binding to be supported.", m_main_thread_id); - return; - } - - if (m_logical_to_physical_mmio_device_id_map.count(logical_device_id) > 0){ - - auto physical_device_id = m_logical_to_physical_mmio_device_id_map.at(logical_device_id); - auto package_id = m_physical_device_id_to_package_id_map.at(physical_device_id); - - store_thread_original_cpuset(); // Store original cpuset for later unbinding if necessary. - - // Get the cpuset, and attempt to bind thread to it. - hwloc_cpuset_t cpuset = allocate_cpu_set_for_thread(physical_device_id, skip_singlify); - auto cpuset_vector = get_hwloc_bitmap_vector(cpuset); - - if (hwloc_set_cpubind(m_topology, cpuset, HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT )){; // HWLOC_CPUBIND_NOMEMBIND - log_warning(LogSiliconDriver,"bind_thread_cpuset() binding failed (errno: {}) for physical_device_id: {} on package_id: {} to {} PU's: {} (pid: {} tid: {})", - strerror(errno), physical_device_id, package_id, cpuset_vector.size(), cpuset_vector, m_pid, tid); - }else{ - log_debug(LogSiliconDriver,"bind_thread_cpuset() binding success skip: {} for physical_device_id: {} on package_id: {} to {} PU's: {} (pid: {} tid: {})", - skip_singlify, physical_device_id, package_id, cpuset_vector.size(), cpuset_vector, m_pid, tid); - // Record that this thread is pinned, no need to repeat on subsequent IO API calls. 
- m_global_thread_ids_pinned.insert(tid); - m_global_thread_id_to_physical_device_id_map.insert({tid, physical_device_id}); - } - - }else{ - log_warning(LogSiliconDriver,"Could not find logical_device_id: {} in m_logical_to_physical_mmio_device_id_map. This shouldn't happen.", logical_device_id); - } - } -} - - -// Restore thread's original cpubind. Perhaps could be simplified to not require physical_device_id or previous binding, and just always bind to MACHINE cpuset. -void tt_cpuset_allocator::unbind_thread_cpuset(){ - - if (m_enable_cpuset_allocator){ - auto tid = std::this_thread::get_id(); - - // Make sure this thread was successfully and previously binded to a cpuset. - if (!m_global_thread_id_to_original_cpuset_map.count(tid)){ - log_warning(LogSiliconDriver,"unbind_thread_cpuset() called for tid: {} but no original cpuset for this thread found. Previous cpu binding skipped or failed?", tid); - return; - } - - if (!m_global_thread_id_to_physical_device_id_map.count(tid)){ - log_warning(LogSiliconDriver,"unbind_thread_cpuset() called for tid: {} but no physical_device_id this thread found. Previous cpu binding skipped or failed?", tid); - return; - } - - // Handle the case where something goes wrong during original binding above, don't want to error out. - auto cpuset = m_global_thread_id_to_original_cpuset_map.at(tid); - auto physical_device_id = m_global_thread_id_to_physical_device_id_map.at(tid); - auto cpuset_vector = get_hwloc_bitmap_vector(cpuset); // Can tighten this up and remove, it's purely for debug anyways. - - if (hwloc_set_cpubind(m_topology, cpuset, HWLOC_CPUBIND_THREAD)){ - log_warning(LogSiliconDriver,"unbind_thread_cpuset() binding failed (errno: {}) for physical_device_id: {} to original {} PU's: {} (pid: {} tid: {})", - strerror(errno), physical_device_id, cpuset_vector.size(), cpuset_vector, m_pid, tid); - }else{ - log_debug(LogSiliconDriver,"unbind_thread_cpuset() binding success for physical_device_id: {} to original {} PU's: {} (pid: {} tid: {})", - physical_device_id, cpuset_vector.size(), cpuset_vector, m_pid, tid); - - // To prevent races on read/modify/write to m_num_threads_pinned_per_tt_device across threads to same device. - const std::lock_guard lock(allocate_cpu_id_mutex); - - // Update book-keeping by removing entry, so this thread can be re-pinned in the future. - m_num_threads_pinned_per_tt_device.at(physical_device_id)--; - m_global_thread_ids_pinned.erase(tid); - m_global_thread_id_to_physical_device_id_map.erase(tid); - } - } -} - -// Teardown/Cleanup for end of process. Don't do anything if feature disabled. Probably don't even need this if process is going to be ended. -void tt_cpuset_allocator::clear_state(){ - if (m_enable_cpuset_allocator){ - - auto tid = std::this_thread::get_id(); - log_debug(LogSiliconDriver,"Clearing state and unbinding entire process' cpuset (pid: {} tid: {}).", m_pid, tid); - - // Reset state variables so that next time the thread can be freshly pinned - m_global_thread_ids_pinned.clear(); - for (auto &device: m_num_threads_pinned_per_tt_device){ - device.second = 0; - } - - // Undo previous pinning, by binding to full machine cpuset. Alternatively could have saved and restored orig cpuset per thread. 
- auto machine_obj = hwloc_get_obj_by_type(m_topology, HWLOC_OBJ_MACHINE, 0); - if (hwloc_set_cpubind(m_topology, machine_obj->cpuset, HWLOC_CPUBIND_PROCESS)){ - log_warning(LogSiliconDriver,"clear_state() binding failed (errno: {}) to Machine cpuset (pid: {} tid: {})", strerror(errno), m_pid, tid); - } - } -} - - // Given a physical device_id, determine the right numa nodes associated with it and attempt to membind a previously allocated memory region to it. bool tt_cpuset_allocator::bind_area_memory_nodeset(chip_id_t physical_device_id, const void * addr, size_t len){ @@ -580,14 +356,6 @@ bool tt_cpuset_allocator::bind_area_memory_nodeset(chip_id_t physical_device_id, return true; // Success } - -// For checking purposes, to make sure main thread is not cpubinded accidentally. -void tt_cpuset_allocator::_set_main_thread_id(){ - m_main_thread_id = std::this_thread::get_id(); - m_stored_main_thread_id = true; - log_debug(LogSiliconDriver,"Captured main_thread_id: {}", m_main_thread_id); -} - int tt_cpuset_allocator::_get_num_tt_pci_devices() { for (auto &d : m_physical_device_id_to_package_id_map) { diff --git a/device/cpuset_lib.hpp b/device/cpuset_lib.hpp index 65e31eaa..a14a4f33 100644 --- a/device/cpuset_lib.hpp +++ b/device/cpuset_lib.hpp @@ -24,8 +24,6 @@ namespace tt { //! Utility functions for various backend paramsf namespace cpuset { -int get_allowed_num_threads(); - // CPU ID allocator for pinning threads to cpu_ids // It's a singleton that should be retrieved via get() struct tt_cpuset_allocator { @@ -34,39 +32,12 @@ struct tt_cpuset_allocator { tt_cpuset_allocator(tt_cpuset_allocator const&) = delete; void operator=(tt_cpuset_allocator const&) = delete; - static void bind_thread_to_cpuset(tt_cluster_description *ndesc, chip_id_t device_id, bool skip_singlify=false){ - auto& instance = tt_cpuset_allocator::get(); - instance.bind_thread_cpuset(ndesc, device_id, skip_singlify); - } - - static void unbind_thread_from_cpuset(){ - auto& instance = tt_cpuset_allocator::get(); - instance.unbind_thread_cpuset(); - } - - static void clear_state_and_cpuset_pins(){ - auto& instance = tt_cpuset_allocator::get(); - instance.clear_state(); - } - // Bind an already allocated memory region to particular numa nodes static bool bind_area_to_memory_nodeset(chip_id_t physical_device_id, const void * addr, size_t len){ auto& instance = tt_cpuset_allocator::get(); return instance.bind_area_memory_nodeset(physical_device_id, addr, len); } - // Store process' main thread_id (not required, mainly for checking purposes to ensure no cpubinds on it occur). - static void set_main_thread_id(){ - auto& instance = tt_cpuset_allocator::get(); - instance._set_main_thread_id(); - } - - static int get_num_cpu_cores_allocated_to_device(chip_id_t physical_device_id){ - auto& instance = tt_cpuset_allocator::get(); - auto num_cores = instance.m_enable_cpuset_allocator ? 
instance.m_num_cpu_cores_allocated_per_tt_device.at(physical_device_id) : get_allowed_num_threads(); - return num_cores; - } - static int get_num_tt_pci_devices(){ auto& instance = tt_cpuset_allocator::get(); return instance._get_num_tt_pci_devices(); @@ -88,17 +59,10 @@ struct tt_cpuset_allocator { int TENSTORRENT_VENDOR_ID = 0x1e52; - void bind_thread_cpuset(tt_cluster_description *ndesc, chip_id_t device_id, bool skip_singlify); - void unbind_thread_cpuset(); - void store_thread_original_cpuset(); bool bind_area_memory_nodeset(chip_id_t physical_device_id, const void * addr, size_t len); - void _set_main_thread_id(); int _get_num_tt_pci_devices(); int _get_num_tt_pci_devices_by_pci_device_id(uint16_t device_id, uint16_t revision_id); - void clear_state(); - hwloc_cpuset_t allocate_cpu_set_for_thread(chip_id_t physical_device_id, bool skip_singlify); - // Series of init functions, must be called in this order. Seperated out to support // early exit in case of errors. bool init_topology_init_and_load(); @@ -106,7 +70,6 @@ struct tt_cpuset_allocator { bool init_get_number_of_packages(); bool init_is_cpu_model_supported(); bool init_determine_cpuset_allocations(); - bool init_populate_physical_mmio_device_id_map(); // Helper Functions std::string get_pci_bus_id(hwloc_obj_t pci_device_obj); @@ -122,11 +85,8 @@ struct tt_cpuset_allocator { std::vector get_hwloc_nodeset_vector(hwloc_obj_t &obj); hwloc_topology_t m_topology; bool m_debug; - bool m_skip_singlify; pid_t m_pid; - std::unordered_map m_logical_to_physical_mmio_device_id_map; - // Items calculated by parsing system info, used by allocation algorithm: std::map> m_package_id_to_devices_map; std::map m_physical_device_id_to_pci_bus_id_map; // Debug/Info @@ -135,30 +95,16 @@ struct tt_cpuset_allocator { std::map> m_physical_device_id_to_cpusets_map; std::map m_physical_device_id_to_package_id_map; - std::mutex allocate_cpu_id_mutex; - bool m_enable_cpuset_allocator = true; // Enable feature, otherwise do nothing. int m_num_packages = 0; std::vector m_all_tt_devices = {}; hwloc_obj_type_t m_object_per_alloc_slot = HWLOC_OBJ_L3CACHE; // Default - // For 2CCX-PER-CCD Optimization detection. std::map m_package_id_to_num_l3_per_ccx_map; std::map m_package_id_to_num_ccx_per_ccd_map; - std::map m_num_threads_pinned_per_tt_device; - std::unordered_set m_global_thread_ids_pinned = {}; - std::thread::id m_main_thread_id; - bool m_stored_main_thread_id = false; - - // For quicker unbinding of threads, record the physical_device_id during binding. - std::map m_global_thread_id_to_physical_device_id_map = {}; - - // For storing original cpuset during binding, to restore during unbinding. - std::map m_global_thread_id_to_original_cpuset_map = {}; - // Memory Binding std::map m_physical_device_id_to_numa_nodeset_map; diff --git a/device/device_api.h b/device/device_api.h deleted file mode 100644 index a2728e7a..00000000 --- a/device/device_api.h +++ /dev/null @@ -1,10 +0,0 @@ -/* - * SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. 
- * - * SPDX-License-Identifier: Apache-2.0 - */ - -#pragma once -#include "device/tt_device.h" -#include "device/driver_atomics.h" -#include "device/tt_emulation_device.h" diff --git a/device/device_api_metal.h b/device/device_api_metal.h index a2728e7a..0fc7820c 100644 --- a/device/device_api_metal.h +++ b/device/device_api_metal.h @@ -7,4 +7,3 @@ #pragma once #include "device/tt_device.h" #include "device/driver_atomics.h" -#include "device/tt_emulation_device.h" diff --git a/device/grayskull_implementation.cpp b/device/grayskull/grayskull_implementation.cpp similarity index 98% rename from device/grayskull_implementation.cpp rename to device/grayskull/grayskull_implementation.cpp index 9d773166..6ed7aaaf 100644 --- a/device/grayskull_implementation.cpp +++ b/device/grayskull/grayskull_implementation.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "device/grayskull_implementation.h" +#include "grayskull_implementation.h" namespace tt::umd { diff --git a/device/grayskull_implementation.h b/device/grayskull/grayskull_implementation.h similarity index 99% rename from device/grayskull_implementation.h rename to device/grayskull/grayskull_implementation.h index 79bdfdee..c014350a 100644 --- a/device/grayskull_implementation.h +++ b/device/grayskull/grayskull_implementation.h @@ -99,7 +99,6 @@ enum class arc_message_type { ARC_GO_LONG_IDLE = 0x54, ARC_GET_HARVESTING = 0x57, TEST = 0x90, - NOC_DMA_TRANSFER = 0x9A, SETUP_IATU_FOR_PEER_TO_PEER = 0x97, DEASSERT_RISCV_RESET = 0xba }; diff --git a/device/grayskull/impl_device.hpp b/device/grayskull/impl_device.hpp deleted file mode 100644 index 21a18125..00000000 --- a/device/grayskull/impl_device.hpp +++ /dev/null @@ -1,85 +0,0 @@ -/* - * SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. - * - * SPDX-License-Identifier: Apache-2.0 - */ - -#pragma once - -#include "device/tt_silicon_driver_common.hpp" - -// See src/t6ifc/t6py/packages/tenstorrent/data/grayskull/pci/tlb.yaml -// 1M -// local_offset: [ 0, 11, 0, "36-bit address prefix, prepended to the 20 LSBs of issued address to form a 56-bit NOC address. The 1MB TLB #n corresponds to the 1MB MMIO range starting at (0x0 + N*0x100000)."] -// x_end : [ 0, 17, 12, "" ] -// y_end : [ 0, 23, 18, "" ] -// x_start : [ 0, 29, 24, "" ] -// y_start : [ 0, 35, 30, "" ] -// noc_sel: [ 0, 36, 36, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 37, 37, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 39, 38, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 40, 40, "linked"] - -// 2M -// local_offset: [ 0, 10, 0, "35-bit address prefix, prepended to the 21 LSBs of issued address to form a 56-bit NOC address. The 2MB TLB #n corresponds to the 2MB MMIO range starting at (0x9C00000 + N*0x200000)."] -// x_end : [ 0, 16, 11, "" ] -// y_end : [ 0, 22, 17, "" ] -// x_start : [ 0, 28, 23, "" ] -// y_start : [ 0, 34, 29, "" ] -// noc_sel: [ 0, 35, 35, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 36, 36, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 38, 37, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 39, 39, "linked"] - -// 16M -// local_offset: [ 0, 7 , 0, "32-bit address prefix, prepended to the 24 LSBs of issued address to form a 56-bit NOC address. 
The 16MB TLB #n corresponds to the 16MB MMIO range starting at (0xB000000 + N*0x1000000)."] -// x_end : [ 0, 13, 8, "" ] -// y_end : [ 0, 19, 14, "" ] -// x_start : [ 0, 25, 20, "" ] -// y_start : [ 0, 31, 26, "" ] -// noc_sel: [ 0, 32, 32, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 33, 33, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 35, 34, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 36, 36, "linked"] - -const auto TLB_1M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 12, - .y_end = 18, - .x_start = 24, - .y_start = 30, - .noc_sel = 36, - .mcast = 37, - .ordering = 38, - .linked = 40, - .static_vc = 41, - .static_vc_end = 42 -}; - -const auto TLB_2M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 11, - .y_end = 17, - .x_start = 23, - .y_start = 29, - .noc_sel = 35, - .mcast = 36, - .ordering = 37, - .linked = 39, - .static_vc = 40, - .static_vc_end = 41 -}; - -const auto TLB_16M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 8, - .y_end = 14, - .x_start = 20, - .y_start = 26, - .noc_sel = 32, - .mcast = 33, - .ordering = 34, - .linked = 36, - .static_vc = 37, - .static_vc_end = 38 -}; diff --git a/device/kmdif.h b/device/kmdif.h index 32596d55..c013202b 100644 --- a/device/kmdif.h +++ b/device/kmdif.h @@ -9,15 +9,6 @@ typedef std::uint32_t DWORD; -const uint32_t MAX_DMA_BYTES = 4*1024*1024; - -// DMA -struct DMAbuffer { - void *pBuf = NULL; - std::uint64_t pDma = 0; - std::uint64_t size; -}; - struct TTDevice; struct PCIdevice { diff --git a/device/tt_emulation_device.cpp b/device/simulation/deprecated/tt_emulation_device.cpp similarity index 100% rename from device/tt_emulation_device.cpp rename to device/simulation/deprecated/tt_emulation_device.cpp diff --git a/device/tt_emulation_device.h b/device/simulation/deprecated/tt_emulation_device.h similarity index 97% rename from device/tt_emulation_device.h rename to device/simulation/deprecated/tt_emulation_device.h index 259841c4..fb2b5e0d 100644 --- a/device/tt_emulation_device.h +++ b/device/simulation/deprecated/tt_emulation_device.h @@ -1,3 +1,9 @@ +/* + * SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. + * + * SPDX-License-Identifier: Apache-2.0 + */ + #pragma once #include diff --git a/device/tt_emulation_stub.cpp b/device/simulation/deprecated/tt_emulation_stub.cpp similarity index 100% rename from device/tt_emulation_stub.cpp rename to device/simulation/deprecated/tt_emulation_stub.cpp diff --git a/device/tt_versim_device.cpp b/device/simulation/deprecated/tt_versim_device.cpp similarity index 100% rename from device/tt_versim_device.cpp rename to device/simulation/deprecated/tt_versim_device.cpp diff --git a/device/simulation/deprecated/tt_versim_device.h b/device/simulation/deprecated/tt_versim_device.h new file mode 100644 index 00000000..087b7336 --- /dev/null +++ b/device/simulation/deprecated/tt_versim_device.h @@ -0,0 +1,72 @@ +/* + * SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include "tt_device.h" +#include "tt_soc_descriptor.h" +#include "tt_xy_pair.h" + +class c_versim_core; +namespace nuapi {namespace device {template class Simulator;}} +namespace versim { + struct VersimSimulatorState; + using VersimSimulator = nuapi::device::Simulator; +} + +/** + * @brief Versim Backend Class, derived from the tt_device class + * Implements APIs to communicate with a simulated (using Verilator) Tenstorrent Device. 
+*/ +class tt_VersimDevice: public tt_device +{ + public: + virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); + virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_); + tt_VersimDevice(const std::string &sdesc_path, const std::string &ndesc_path); + virtual std::unordered_map<chip_id_t, tt_SocDescriptor>& get_virtual_soc_descriptors(); + virtual void start(std::vector<std::string> plusargs, std::vector<std::string> dump_cores, bool no_checkers, bool init_device, bool skip_driver_allocs); + virtual void start_device(const tt_device_params &device_params); + virtual void close_device(); + virtual void deassert_risc_reset(); + virtual void deassert_risc_reset_at_core(tt_cxy_pair core); + virtual void assert_risc_reset(); + virtual void assert_risc_reset_at_core(tt_cxy_pair core); + virtual void write_to_device(std::vector<uint32_t> &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); + virtual void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set<chip_id_t>& chips_to_exclude, std::set<uint32_t>& rows_to_exclude, std::set<uint32_t>& columns_to_exclude, const std::string& fallback_tlb); + virtual void rolled_write_to_device(std::vector<uint32_t> &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); + virtual void read_from_device(std::vector<uint32_t> &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); + virtual void rolled_write_to_device(uint32_t* mem_ptr, uint32_t size_in_bytes, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb); + virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); + virtual void read_from_device(void *mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); + virtual void wait_for_non_mmio_flush(); + void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores = {}); + void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<uint32_t>& channels); + void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores = {}); + virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c); + virtual bool using_harvested_soc_descriptors(); + virtual std::unordered_map<chip_id_t, uint32_t> get_harvesting_masks_for_soc_descriptors(); + virtual bool noc_translation_en(); + virtual std::set<chip_id_t> get_target_mmio_device_ids(); + virtual std::set<chip_id_t> get_target_remote_device_ids(); + virtual ~tt_VersimDevice(); + virtual tt_ClusterDescriptor* get_cluster_description(); + virtual int get_number_of_chips_in_cluster(); + virtual std::unordered_set<chip_id_t> get_all_chips_in_cluster(); + static int detect_number_of_chips(); + virtual std::map<int, int> get_clocks(); + virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id); + virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel); + virtual std::uint32_t get_num_host_channels(std::uint32_t device_id); + virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel); + private: + bool stop(); + tt_device_l1_address_params l1_address_params; + tt_device_dram_address_params
dram_address_params; + versim::VersimSimulator* versim; + std::shared_ptr<tt_ClusterDescriptor> ndesc; + void* p_ca_soc_manager; +}; diff --git a/device/tt_versim_stub.cpp b/device/simulation/deprecated/tt_versim_stub.cpp similarity index 100% rename from device/tt_versim_stub.cpp rename to device/simulation/deprecated/tt_versim_stub.cpp diff --git a/device/simulation/tt_simulation_device.h b/device/simulation/tt_simulation_device.h index c57bc1da..27a5fdc2 100644 --- a/device/simulation/tt_simulation_device.h +++ b/device/simulation/tt_simulation_device.h @@ -36,13 +36,7 @@ class tt_SimulationDevice: public tt_device { // Runtime Functions virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - - // void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set<chip_id_t>& chips_to_exclude, std::set<uint32_t>& rows_to_exclude, std::set<uint32_t>& columns_to_exclude, const std::string& fallback_tlb); - - // virtual void rolled_write_to_device(uint32_t* mem_ptr, uint32_t size_in_bytes, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb); - // virtual void rolled_write_to_device(std::vector<uint32_t> &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); virtual void read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); - virtual void write_to_sysmem(std::vector<uint32_t>& vec, uint64_t addr, uint16_t channel, chip_id_t src_device_id); virtual void write_to_sysmem(const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id); virtual void read_from_sysmem(std::vector<uint32_t> &vec, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id); @@ -57,26 +51,16 @@ class tt_SimulationDevice: public tt_device { // Misc.
Functions to Query/Set Device State // virtual bool using_harvested_soc_descriptors(); virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); - // virtual bool noc_translation_en(); - // virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c); - // virtual int get_number_of_chips_in_cluster(); - // virtual std::unordered_set get_all_chips_in_cluster(); - // virtual tt_ClusterDescriptor* get_cluster_description(); static std::vector detect_available_device_ids(); - // static std::unordered_map get_logical_to_physical_mmio_device_id_map(std::vector physical_device_ids); virtual std::set get_target_remote_device_ids(); virtual std::map get_clocks(); - // virtual uint32_t dma_allocation_size(chip_id_t src_device_id = -1); - // virtual void *channel_0_address(std::uint32_t offset, std::uint32_t device_id) const; virtual void *host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; virtual std::uint64_t get_pcie_base_addr_from_device() const; virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id); virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel); virtual std::uint32_t get_num_host_channels(std::uint32_t device_id); virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel); - // virtual std::uint32_t get_pcie_speed(std::uint32_t device_id); virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id); - // virtual tt_version get_ethernet_fw_version() const; private: // State variables diff --git a/device/tt_cluster_descriptor.cpp b/device/tt_cluster_descriptor.cpp index 2ef5ec64..08a15f18 100644 --- a/device/tt_cluster_descriptor.cpp +++ b/device/tt_cluster_descriptor.cpp @@ -52,29 +52,6 @@ std::vector> tt_ClusterDescri return directly_connected_channels; } -bool tt_ClusterDescriptor::channels_are_directly_connected(const chip_id_t &first, const ethernet_channel_t &first_channel, const chip_id_t &second, const ethernet_channel_t &second_channel) const { - if (this->enabled_active_chips.find(first) == this->enabled_active_chips.end() || this->enabled_active_chips.find(second) == this->enabled_active_chips.end()) { - return false; - } - - if (this->ethernet_connections.at(first).find(first_channel) == this->ethernet_connections.at(first).end()) { - return false; - } - - const auto &[connected_chip, connected_channel] = this->ethernet_connections.at(first).at(first_channel); - return connected_chip == second && connected_channel == second_channel; -} - -// const eth_coord_t tt_ClusterDescriptor::get_chip_xy(const chip_id_t &chip_id) const { -// // For now we only support a 1D cluster, so the mapping is trivial (where the chip ID is the x value of the xy -// location) return eth_coord_t(chip_id, 0, 0, 0); -// } - -// const chip_id_t tt_ClusterDescriptor::get_chip_id_at_location(const eth_coord_t &chip_location) const { -// // For now we only support a 1D cluster, so the mapping is trivial (where the chip ID is the x value of the xy -// location) return chip_location.x; -// } - bool tt_ClusterDescriptor::is_chip_mmio_capable(const chip_id_t &chip_id) const { return this->chips_with_mmio.find(chip_id) != this->chips_with_mmio.end(); } @@ -367,14 +344,6 @@ std::unique_ptr tt_ClusterDescriptor::create_for_grayskull return desc; } -std::set get_sequential_chip_id_set(int num_chips) { - std::set chip_ids; - for (int i = 0; i < num_chips; ++i) { - chip_ids.insert(static_cast(i)); - } - return chip_ids; -} - void 
tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc) { log_assert(yaml["ethernet_connections"].IsSequence(), "Invalid YAML"); for (YAML::Node &connected_endpoints : yaml["ethernet_connections"].as>()) { @@ -594,22 +563,10 @@ void tt_ClusterDescriptor::load_harvesting_information(YAML::Node &yaml, tt_Clus } } -void tt_ClusterDescriptor::specify_enabled_devices(const std::vector &chip_ids) { - this->enabled_active_chips.clear(); - for (auto chip_id : chip_ids) { - this->enabled_active_chips.insert(chip_id); - } -} - void tt_ClusterDescriptor::enable_all_devices() { this->enabled_active_chips = this->all_chips; } -bool tt_ClusterDescriptor::chips_have_ethernet_connectivity() const { - return ethernet_connections.size() > 0; -} - - std::unordered_map > > tt_ClusterDescriptor::get_ethernet_connections() const { auto eth_connections = std::unordered_map > >(); diff --git a/device/tt_cluster_descriptor.h b/device/tt_cluster_descriptor.h index 1a923a8b..bbb8a796 100644 --- a/device/tt_cluster_descriptor.h +++ b/device/tt_cluster_descriptor.h @@ -76,7 +76,6 @@ class tt_ClusterDescriptor { */ std::vector> get_directly_connected_ethernet_channels_between_chips(const chip_id_t &first, const chip_id_t &second) const; - bool channels_are_directly_connected(const chip_id_t &first, const ethernet_channel_t &first_channel, const chip_id_t &second, const ethernet_channel_t &second_channel) const; bool is_chip_mmio_capable(const chip_id_t &chip_id) const; chip_id_t get_closest_mmio_capable_chip(const chip_id_t &chip); chip_id_t get_shelf_local_physical_chip_coords(chip_id_t virtual_coord); @@ -84,10 +83,7 @@ class tt_ClusterDescriptor { static std::unique_ptr create_for_grayskull_cluster( const std::set &logical_mmio_device_ids, const std::vector &physical_mmio_device_ids); - // const eth_coord_t get_chip_xy(const chip_id_t &chip_id) const; - // const chip_id_t get_chip_id_at_location(const eth_coord_t &chip_location) const; - bool chips_have_ethernet_connectivity() const; std::unordered_map get_harvesting_info() const; std::unordered_map get_noc_translation_table_en() const; std::unordered_map get_chip_locations() const; @@ -103,9 +99,6 @@ class tt_ClusterDescriptor { bool ethernet_core_has_active_ethernet_link(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const; std::tuple get_chip_and_channel_of_remote_ethernet_core(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const; - void specify_enabled_devices(const std::vector &chip_ids); void enable_all_devices(); }; - -std::set get_sequential_chip_id_set(int num_chips); diff --git a/device/tt_device.h b/device/tt_device.h index f3064cd5..22117ad2 100644 --- a/device/tt_device.h +++ b/device/tt_device.h @@ -20,7 +20,6 @@ #include "device/tlb.h" #include "device/tt_io.hpp" -using TLB_OFFSETS = tt::umd::tlb_offsets; using TLB_DATA = tt::umd::tlb_data; @@ -37,13 +36,6 @@ enum tt_DevicePowerState { LONG_IDLE }; -enum tt_MutexType { - LARGE_READ_TLB, - LARGE_WRITE_TLB, - SMALL_READ_WRITE_TLB, - ARC_MSG -}; - enum tt_MemBarFlag { SET = 0xaa, RESET = 0xbb, @@ -353,10 +345,8 @@ class tt_device * \param core chip-x-y struct specifying device and core * \param addr Address to write to * \param tlb_to_use Specifies fallback/dynamic TLB to use for transaction, if this core does not have static TLBs mapped to this address (dynamic TLBs were initialized in driver constructor) - * \param send_epoch_cmd Specifies that this is an epoch_cmd write, forcing runtime to take a 
faster write path (Buda only) - * \param last_send_epoch_cmd Specifies that this is the last epoch command being written, which requires metadata to be updated (Buda only) */ - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false) { + virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { // Only implement this for Silicon Backend throw std::runtime_error("---- tt_device::write_to_device is not implemented\n"); } @@ -369,38 +359,11 @@ class tt_device * \param core chip-x-y struct specifying device and core * \param addr Address to write to * \param tlb_to_use Specifies fallback/dynamic TLB to use for transaction, if this core does not have static TLBs mapped to this address (dynamic TLBs were initialized in driver constructor) - * \param send_epoch_cmd Specifies that this is an epoch_cmd write, forcing runtime to take a faster write path (Buda only) - * \param last_send_epoch_cmd Specifies that this is the last epoch command being written, which requires metadata to be updated (Buda only) */ - virtual void write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false) { + virtual void write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { throw std::runtime_error("---- tt_device::write_to_device is not implemented\n"); } - /** - * @brief Unroll/replicate uint32_t data (as specified by ptr + len pair) and write it to specified device, core and address (defined for Silicon). - * \param mem_ptr src data address - * \param len src data size (specified for uint32_t) - * \param unroll_count Number of times vector should be unrolled - * \param core chip-x-y struct specifying device and core - * \param addr Address to write to - * \param fallback_tlb Specifies fallback/dynamic TLB to use for transaction, if this core does not have static TLBs mapped to this address (dynamic TLBs were initialized in driver constructor) - */ - virtual void rolled_write_to_device(uint32_t* mem_ptr, uint32_t size_in_bytes, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) { - // Only implement this for Silicon Backend - throw std::runtime_error("---- tt_device::rolled_write_to_device is not implemented\n"); - } - /** - * @brief Unroll/replicate a uint32_t vector and write it to specified device, core and address (defined for Silicon and Versim). - * \param vec Vector to write - * \param unroll_count Number of times vector should be unrolled - * \param core chip-x-y struct specifying device and core - * \param addr Address to write to - * \param tlb_to_use Specifies fallback/dynamic TLB to use for transaction, if this core does not have static TLBs mapped to this address (dynamic TLBs were initialized in driver constructor) - */ - virtual void rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { - throw std::runtime_error("---- tt_device::rolled_write_to_device is not implemented\n"); - } - /** * @brief Read uint32_t data from a specified device, core and address to host memory (defined for Silicon). 
* \param mem_ptr dest data address on host (expected to be preallocated, depending on transfer size) @@ -491,13 +454,7 @@ class tt_device virtual std::unordered_map get_harvesting_masks_for_soc_descriptors() { throw std::runtime_error("---- tt_device:get_harvesting_masks_for_soc_descriptors is not implemented\n"); } - /** - * @brief Get Hardware Translation Table state - * \returns true if translation tables are enabled (WH only) - */ - virtual bool noc_translation_en() { - throw std::runtime_error("---- tt_device:noc_translation_en is not implemented\n"); - } + /** * @brief Issue message to device, meant to be picked up by ARC Firmare * \param logical_device_id Chip to target @@ -566,14 +523,6 @@ class tt_device return std::map(); } - /** - * @brief Get the PCIe speed for a specific device based on link width and link speed - * \returns Bandwidth in Gbps - */ - virtual std::uint32_t get_pcie_speed(std::uint32_t device_id) { - return 8 * 16; // default to x8 at 16 GT/s - } - virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id) { throw std::runtime_error("---- tt_device::get_numa_node_for_pcie_device is not implemented\n"); } @@ -585,30 +534,6 @@ class tt_device virtual tt_version get_ethernet_fw_version() const { throw std::runtime_error("---- tt_device::get_ethernet_fw_version is not implemented \n"); } - - /** - * @brief Get the total hugepage (host memory) size allocated for a device. - * This memory is not entirely accessible by device. To query the number of channels - * or memory per channel that is accessbile, see get_host_channel_size or get_num_host_channels - * \param src_device_id Device for which allocated host memory is being queried - * \returns Total memory allocated on host for a specific device - * - */ - virtual uint32_t dma_allocation_size(chip_id_t src_device_id = -1) { - throw std::runtime_error("---- tt_device::dma_allocation_size is not implemented\n"); - return 0; - } - - /** - * Get the address for the MMIO mapped region on Channel (as seen from host memory) - * \param offset Address in DRAM - * \param target chip-x-y struct specifying device and core of target DRAM - * \returns Host interpretation of MMIO mapped channel 0 address - */ - virtual void *channel_address(std::uint32_t offset, const tt_cxy_pair& target) { - throw std::runtime_error("---- tt_device::channel_address is not implemented\n"); - return nullptr; - } /** * @brief Query number of DRAM channels on a specific device * \param device_id Logical device id to query @@ -676,67 +601,6 @@ class tt_device std::unordered_map soc_descriptor_per_chip = {}; }; -class c_versim_core; -namespace nuapi {namespace device {template class Simulator;}} -namespace versim { - struct VersimSimulatorState; - using VersimSimulator = nuapi::device::Simulator; -} - -/** - * @brief Versim Backend Class, derived from the tt_device class - * Implements APIs to communicate with a simulated (using Verilator) Tenstorrent Device. 
-*/ -class tt_VersimDevice: public tt_device -{ - public: - virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); - virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_); - tt_VersimDevice(const std::string &sdesc_path, const std::string &ndesc_path); - virtual std::unordered_map& get_virtual_soc_descriptors(); - virtual void start(std::vector plusargs, std::vector dump_cores, bool no_checkers, bool init_device, bool skip_driver_allocs); - virtual void start_device(const tt_device_params &device_params); - virtual void close_device(); - virtual void deassert_risc_reset(); - virtual void deassert_risc_reset_at_core(tt_cxy_pair core); - virtual void assert_risc_reset(); - virtual void assert_risc_reset_at_core(tt_cxy_pair core); - virtual void write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& columns_to_exclude, const std::string& fallback_tlb); - virtual void rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); - virtual void read_from_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); - virtual void rolled_write_to_device(uint32_t* mem_ptr, uint32_t size_in_bytes, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb); - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void read_from_device(void *mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); - virtual void wait_for_non_mmio_flush(); - void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c); - virtual bool using_harvested_soc_descriptors(); - virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); - virtual bool noc_translation_en(); - virtual std::set get_target_mmio_device_ids(); - virtual std::set get_target_remote_device_ids(); - virtual ~tt_VersimDevice(); - virtual tt_ClusterDescriptor* get_cluster_description(); - virtual int get_number_of_chips_in_cluster(); - virtual std::unordered_set get_all_chips_in_cluster(); - static int detect_number_of_chips(); - virtual std::map get_clocks(); - virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id); - virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel); - virtual std::uint32_t get_num_host_channels(std::uint32_t device_id); - virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel); - private: - bool stop(); - tt_device_l1_address_params l1_address_params; - tt_device_dram_address_params 
dram_address_params; - versim::VersimSimulator* versim; - std::shared_ptr ndesc; - void* p_ca_soc_manager; -}; - #include "device/architecture_implementation.h" /** @@ -781,14 +645,10 @@ class tt_SiliconDevice: public tt_device virtual void close_device(); // Runtime Functions - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); + virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); + virtual void write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& columns_to_exclude, const std::string& fallback_tlb); - virtual void write_epoch_cmd_to_device(const uint32_t *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write); - virtual void write_epoch_cmd_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write); - virtual void rolled_write_to_device(uint32_t* mem_ptr, uint32_t size_in_bytes, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb); - virtual void rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); virtual void read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); virtual void read_from_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); virtual void write_to_sysmem(std::vector& vec, uint64_t addr, uint16_t channel, chip_id_t src_device_id); @@ -809,7 +669,7 @@ class tt_SiliconDevice: public tt_device /** * @brief This API allows you to write directly to device memory that is addressable by a static TLB */ - std::function get_fast_pcie_static_tlb_write_callable(int device_id); + std::function get_fast_pcie_static_tlb_write_callable(int device_id); /** * @brief Provide fast write access to a statically-mapped TLB. @@ -824,40 +684,30 @@ class tt_SiliconDevice: public tt_device */ tt::Writer get_static_tlb_writer(tt_cxy_pair target); - /** - * @brief Returns the DMA buf size - */ - uint32_t get_m_dma_buf_size() const; // Misc. 
Functions to Query/Set Device State virtual int arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr); virtual bool using_harvested_soc_descriptors(); virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); - virtual bool noc_translation_en(); virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c); virtual int get_number_of_chips_in_cluster(); virtual std::unordered_set get_all_chips_in_cluster(); virtual tt_ClusterDescriptor* get_cluster_description(); static int detect_number_of_chips(); static std::vector detect_available_device_ids(); - static std::unordered_map get_logical_to_physical_mmio_device_id_map(std::vector physical_device_ids); virtual std::set get_target_mmio_device_ids(); virtual std::set get_target_remote_device_ids(); virtual std::map get_clocks(); - virtual uint32_t dma_allocation_size(chip_id_t src_device_id = -1); - virtual void *channel_address(std::uint32_t offset, const tt_cxy_pair& target); virtual void *host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; virtual std::uint64_t get_pcie_base_addr_from_device() const; static std::vector extract_rows_to_remove(const tt::ARCH &arch, const int worker_grid_rows, const int harvested_rows); static void remove_worker_row_from_descriptor(tt_SocDescriptor& full_soc_descriptor, const std::vector& row_coordinates_to_remove); static void harvest_rows_in_soc_descriptor(tt::ARCH arch, tt_SocDescriptor& sdesc, uint32_t harvested_rows); static std::unordered_map create_harvested_coord_translation(const tt::ARCH arch, bool identity_map); - static std::unordered_map get_harvesting_masks_from_harvested_rows(std::unordered_map> harvested_rows); std::unordered_map get_harvested_coord_translation_map(chip_id_t logical_device_id); virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id); virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel); virtual std::uint32_t get_num_host_channels(std::uint32_t device_id); virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel); - virtual std::uint32_t get_pcie_speed(std::uint32_t device_id); virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id); virtual tt_version get_ethernet_fw_version() const; @@ -877,14 +727,9 @@ class tt_SiliconDevice: public tt_device void send_tensix_risc_reset_to_core(const tt_cxy_pair &core, const TensixSoftResetOptions &soft_resets); void perform_harvesting_and_populate_soc_descriptors(const std::string& sdesc_path, const bool perform_harvesting); void populate_cores(); - void init_pcie_iatus(); - void init_pcie_iatus_no_p2p(); + void init_pcie_iatus(); // No more p2p support. 
bool init_hugepage(chip_id_t device_id); - bool init_dmabuf(chip_id_t device_id); void check_pcie_device_initialized(int device_id); - bool init_dma_turbo_buf(struct PCIdevice* pci_device); - bool uninit_dma_turbo_buf(struct PCIdevice* pci_device); - static std::map get_physical_device_id_to_bus_id_map(std::vector physical_device_ids); void set_pcie_power_state(tt_DevicePowerState state); int set_remote_power_state(const chip_id_t &chip, tt_DevicePowerState device_state); void set_power_state(tt_DevicePowerState state); @@ -900,13 +745,11 @@ class tt_SiliconDevice: public tt_device int get_clock(int logical_device_id); // Communication Functions - void read_dma_buffer(void* mem_ptr, std::uint32_t address, std::uint16_t channel, std::uint32_t size_in_bytes, chip_id_t src_device_id); - void write_dma_buffer(const void *mem_ptr, std::uint32_t size, std::uint32_t address, std::uint16_t channel, chip_id_t src_device_id); + void read_buffer(void* mem_ptr, std::uint32_t address, std::uint16_t channel, std::uint32_t size_in_bytes, chip_id_t src_device_id); + void write_buffer(const void *mem_ptr, std::uint32_t size, std::uint32_t address, std::uint16_t channel, chip_id_t src_device_id); void write_device_memory(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair target, std::uint32_t address, const std::string& fallback_tlb); void write_to_non_mmio_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t address, bool broadcast = false, std::vector broadcast_header = {}); void read_device_memory(void *mem_ptr, tt_cxy_pair target, std::uint32_t address, std::uint32_t size_in_bytes, const std::string& fallback_tlb); - void write_to_non_mmio_device_send_epoch_cmd(const uint32_t *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t address, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write); - void rolled_write_to_non_mmio_device(const uint32_t *mem_ptr, uint32_t len, tt_cxy_pair core, uint64_t address, uint32_t unroll_count); void read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_t address, uint32_t size_in_bytes); void read_mmio_device_register(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); void write_mmio_device_register(const void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); @@ -930,9 +773,7 @@ class tt_SiliconDevice: public tt_device // Test functions void verify_eth_fw(); void verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector &fw_versions); - int test_pcie_tlb_setup (struct PCIdevice* pci_device); int test_setup_interface (); - int test_broadcast (int logical_device_id); // State variables tt_device_dram_address_params dram_address_params; @@ -962,17 +803,10 @@ class tt_SiliconDevice: public tt_device static constexpr std::uint32_t EPOCH_ETH_CORES_MASK = (EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS-1); int active_core = NON_EPOCH_ETH_CORES_START_ID; - int active_core_epoch = EPOCH_ETH_CORES_START_ID; - bool erisc_q_ptrs_initialized = false; - std::vector erisc_q_ptrs_epoch[NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS]; - bool erisc_q_wrptr_updated[NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS]; std::vector< std::vector > remote_transfer_ethernet_cores; bool flush_non_mmio = false; bool non_mmio_transfer_cores_customized = false; std::unordered_map active_eth_core_idx_per_chip = {}; - // Size of the PCIE DMA buffer - // The setting should not exceed MAX_DMA_BYTES - std::uint32_t m_dma_buf_size; std::unordered_map 
noc_translation_enabled_for_chip = {}; std::map> hardware_resource_mutex_map = {}; std::unordered_map> harvested_coord_translation = {}; @@ -991,9 +825,6 @@ class tt_SiliconDevice: public tt_device std::unordered_map dynamic_tlb_config = {}; std::unordered_map dynamic_tlb_ordering_modes = {}; std::map, std::unordered_map>>> bcast_header_cache = {}; - std::uint64_t buf_physical_addr = 0; - void * buf_mapping = nullptr; - int driver_id; bool perform_harvesting_on_sdesc = false; bool use_ethernet_ordered_writes = true; bool use_ethernet_broadcast = true; diff --git a/device/tt_silicon_driver.cpp b/device/tt_silicon_driver.cpp index a147f3e0..f406fc6f 100644 --- a/device/tt_silicon_driver.cpp +++ b/device/tt_silicon_driver.cpp @@ -84,17 +84,6 @@ void clr_printf(const char *clr, const char *fmt, ...) { int g_DEBUG_LEVEL; // /src/t6ifc/t6py/packages/tenstorrent/jlink/jtag_comm.cpp bool g_READ_CHECKING_ENABLED = true; -bool g_USE_MSI_FOR_DMA = false; // Whether to wait for MSI after DMA transfer, or poll a variable -uint32_t g_DMA_BLOCK_SIZE_READ_THRESHOLD_BYTES = 0; // 0 - never use DMA. Otherwise use DMA for all blocks larger than this size -uint32_t g_DMA_BLOCK_SIZE_WRITE_THRESHOLD_BYTES = 0; // 0 - never use DMA. Otherwise use DMA for all blocks larger than this size - -// Address in CSM where the DMA request structure resides -uint32_t c_CSM_PCIE_CTRL_DMA_REQUEST_OFFSET = 0; -// Address where the trigger for transfer resides -uint32_t c_DMA_TRIGGER_ADDRESS = 0; -// To trigger arc interrupt -uint32_t c_ARC_MISC_CNTL_ADDRESS = 0; - // Print all buffers smaller than this number of bytes uint32_t g_NUM_BYTES_TO_PRINT = 8; @@ -102,24 +91,15 @@ uint32_t g_NUM_BYTES_TO_PRINT = 8; const bool g_SINGLE_PIN_PAGE_PER_FD_WORKAROND = true; const uint32_t g_MAX_HOST_MEM_CHANNELS = 4; -volatile bool msi_interrupt_received = false; - const char device_name_pattern[] = "/dev/tenstorrent/%u"; -const std::string tlb_large_read_mutex_name_prefix = "mem_tlb_large_read_mutex_pci_interface_id_"; -const std::string tlb_large_write_mutex_name_prefix = "mem_tlb_large_write_mutex_pci_interface_id_"; -const std::string tlb_small_read_write_mutex_name_prefix = "mem_tlb_small_read_write_mutex_pci_interface_id_"; -const std::string arc_msg_mutex_name_prefix = "arc_msg_mutex_pci_interface_id_"; - static uint32_t GS_BAR0_WC_MAPPING_SIZE = (156<<20) + (10<<21) + (18<<24); static uint32_t BH_BAR0_WC_MAPPING_SIZE = 188<<21; // Defines the address for WC region. addresses 0 to BH_BAR0_WC_MAPPING_SIZE are in WC, above that are UC static const uint32_t GS_WH_ARC_SCRATCH_6_OFFSET = 0x1FF30078; static const uint32_t BH_NOC_NODE_ID_OFFSET = 0x1FD04044; -const uint32_t DMA_BUF_REGION_SIZE = 4 << 20; const uint32_t HUGEPAGE_REGION_SIZE = 1 << 30; // 1GB -const uint32_t DMA_MAP_MASK = DMA_BUF_REGION_SIZE - 1; const uint32_t HUGEPAGE_MAP_MASK = HUGEPAGE_REGION_SIZE - 1; static const uint32_t MSG_ERROR_REPLY = 0xFFFFFFFF; @@ -141,10 +121,6 @@ const uint64_t UNROLL_ATU_OFFSET_BAR = 0x1200; PCIdevice ttkmd_open(DWORD device_id, bool sharable /* = false */); int ttkmd_close(struct PCIdevice &device); -uint32_t pcie_dma_transfer_turbo (TTDevice *dev, uint32_t chip_addr, uint32_t host_phys_addr, uint32_t size_bytes, bool write); -DMAbuffer pci_allocate_dma_buffer(TTDevice *dev, uint32_t size); -void pcie_init_dma_transfer_turbo (PCIdevice* dev); - void write_regs(volatile uint32_t *dest, const uint32_t *src, uint32_t word_len); // Stash all the fields of TTDevice in TTDeviceBase to make moving simpler. 
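With the 4 MB DMA-buffer region gone, HUGEPAGE_MAP_MASK is the only mask left for reducing a host-memory address to an offset inside a mapped region. A minimal sketch of the masking arithmetic, where channel_base is a hypothetical stand-in for one entry of hugepage_mapping:

#include <cassert>
#include <cstdint>

constexpr uint32_t kHugepageRegionSize = 1u << 30;       // 1 GB, as above
constexpr uint32_t kHugepageMapMask = kHugepageRegionSize - 1;

// Reduce a sysmem address to a pointer inside one 1 GB hugepage mapping.
inline uint8_t* hugepage_scratchspace(uint8_t* channel_base, uint32_t address) {
    return channel_base + (address & kHugepageMapMask);  // keep only the low 30 bits
}

int main() {
    static uint8_t fake_channel[64];                     // tiny stand-in for the 1 GB map
    // 0x40000010 and 0x10 resolve to the same offset: bit 30 is masked away.
    assert(hugepage_scratchspace(fake_channel, 0x40000010u) ==
           hugepage_scratchspace(fake_channel, 0x10u));
    return 0;
}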
@@ -182,17 +158,8 @@ struct TTDeviceBase std::uint8_t pci_device; std::uint8_t pci_function; - unsigned int next_dma_buf = 0; - - DMAbuffer dma_completion_flag_buffer; // When DMA completes, it writes to this buffer - DMAbuffer dma_transfer_buffer; // Buffer for large DMA transfers - - std::uint32_t max_dma_buf_size_log2; - tenstorrent_get_device_info_out device_info; - std::vector dma_buffer_mappings; - std::uint32_t read_checking_offset; }; @@ -256,10 +223,6 @@ struct TTDevice : TTDeviceBase munmap(system_reg_mapping, system_reg_mapping_size); } - for (auto &&buf : dma_buffer_mappings) { - munmap(buf.pBuf, buf.size); - } - if (sysfs_config_fd != -1) { close(sysfs_config_fd); } @@ -274,7 +237,6 @@ struct TTDevice : TTDeviceBase bar2_uc = nullptr; bar4_wc = nullptr; system_reg_mapping = nullptr; - dma_buffer_mappings.clear(); sysfs_config_fd = -1; } @@ -469,8 +431,6 @@ void TTDevice::do_open() { this->device_info = device_info.out; - max_dma_buf_size_log2 = device_info.out.max_dma_buf_size_log2; - struct { tenstorrent_query_mappings query_mappings; tenstorrent_mapping mapping_array[8]; @@ -621,23 +581,6 @@ void TTDevice::do_open() { this->read_checking_offset = is_blackhole(device_info.out) ? BH_NOC_NODE_ID_OFFSET : GS_WH_ARC_SCRATCH_6_OFFSET; } -void set_debug_level(int dl) { - g_DEBUG_LEVEL = dl; -} - -std::uint64_t pci_dma_buffer_get_physical_addr(DMAbuffer &dma_buffer) { - log_assert (dma_buffer.pDma, "DMA Buffer not initialized"); - return reinterpret_cast(dma_buffer.pDma); -} - -std::uint64_t pci_dma_buffer_get_user_addr(DMAbuffer &dma_buffer) { - log_assert (dma_buffer.pBuf, "DMA Buffer not initialized"); - return reinterpret_cast(dma_buffer.pBuf); -} - -DWORD ttkmd_init() { return 0; } // 0 on success -DWORD ttkmd_uninit() { return 0; } // 0 on success - bool is_char_dev(const dirent *ent, const char *parent_dir) { if (ent->d_type == DT_UNKNOWN || ent->d_type == DT_LNK) { char name[2 * NAME_MAX + 2]; @@ -731,39 +674,6 @@ int get_revision_id(TTDevice *dev) { } } -int get_link_width(TTDevice *dev) { - - static const char pattern[] = "/sys/bus/pci/devices/%04x:%02x:%02x.%u/current_link_width"; - char buf[sizeof(pattern)]; - std::snprintf(buf, sizeof(buf), pattern, - (unsigned int)dev->pci_domain, (unsigned int)dev->pci_bus, (unsigned int)dev->pci_device, (unsigned int)dev->pci_function); - - std::ifstream linkwidth_file(buf); - std::string linkwidth_string; - if (std::getline(linkwidth_file, linkwidth_string)) { - return std::stoi(linkwidth_string, nullptr, 0); - } else { - throw std::runtime_error("Link width read failed for device"); - } -} - -int get_link_speed(TTDevice *dev) { - - static const char pattern[] = "/sys/bus/pci/devices/%04x:%02x:%02x.%u/current_link_speed"; - char buf[sizeof(pattern)]; - std::snprintf(buf, sizeof(buf), pattern, - (unsigned int)dev->pci_domain, (unsigned int)dev->pci_bus, (unsigned int)dev->pci_device, (unsigned int)dev->pci_function); - - std::ifstream linkspeed_file(buf); - std::string linkspeed_string; - int linkspeed; - if (std::getline(linkspeed_file, linkspeed_string) && sscanf(linkspeed_string.c_str(), "%d", &linkspeed) == 1) { - return linkspeed; - } else { - throw std::runtime_error("Link speed read failed for device"); - } -} - int get_numa_node(TTDevice *dev) { static const char pattern[] = "/sys/bus/pci/devices/%04x:%02x:%02x.%u/numa_node"; @@ -792,41 +702,6 @@ std::uint64_t read_bar0_base(TTDevice *dev) { return bar01 & bar_address_mask; } -DMAbuffer allocate_dma_buffer(TTDevice *ttdev, unsigned int buffer_index, std::size_t size) { - 
tenstorrent_allocate_dma_buf allocate_dma_buf; - - if (size > std::numeric_limits::max()) { - throw std::runtime_error(std::string("Requested DMA buffer size (" + std::to_string(allocate_dma_buf.in.requested_size) - + ") bytes exceeds interface size limit for device " + std::to_string(ttdev->index) + ", with error: " + std::strerror(errno))); - } - - memset(&allocate_dma_buf, 0, sizeof(allocate_dma_buf)); - allocate_dma_buf.in.requested_size = std::max(size, getpagesize()); - allocate_dma_buf.in.buf_index = buffer_index; - - if (ioctl(ttdev->device_fd, TENSTORRENT_IOCTL_ALLOCATE_DMA_BUF, &allocate_dma_buf) == -1) { - throw std::runtime_error(std::string("DMA buffer allocation failed (") + std::to_string(allocate_dma_buf.in.requested_size) - + " bytes) for device " + std::to_string(ttdev->index) + "."); - } - - void *mapping = mmap(NULL, allocate_dma_buf.out.size, PROT_READ | PROT_WRITE, MAP_SHARED, ttdev->device_fd, allocate_dma_buf.out.mapping_offset); - - log_trace(tt::LogSiliconDriver, "DMA buffer succeeded with size {} offset {} phy_addr {}", allocate_dma_buf.out.size, allocate_dma_buf.out.mapping_offset, allocate_dma_buf.out.physical_address); - - if (mapping == MAP_FAILED) { - throw std::runtime_error(std::string("DMA buffer memory mapping failed for device ") + std::to_string(ttdev->index) + "."); - } - - DMAbuffer dmabuf; - dmabuf.pBuf = mapping; - dmabuf.pDma = allocate_dma_buf.out.physical_address; - dmabuf.size = allocate_dma_buf.out.size; - - ttdev->dma_buffer_mappings.push_back(dmabuf); - - return dmabuf; -} - PCIdevice ttkmd_open(DWORD device_id, bool sharable /* = false */) { (void)sharable; // presently ignored @@ -1053,24 +928,7 @@ void memcpy_from_device(void *dest, const void *src, std::size_t num_bytes) { } } -void read_block(TTDevice *dev, uint64_t byte_addr, uint64_t num_bytes, uint8_t* buffer_addr, uint32_t dma_buf_size) { - if (num_bytes >= g_DMA_BLOCK_SIZE_READ_THRESHOLD_BYTES && g_DMA_BLOCK_SIZE_READ_THRESHOLD_BYTES > 0) { - record_access ("read_block_a", byte_addr, num_bytes, true, false, true, true); // addr, size, turbo, write, block, endline - - DMAbuffer &transfer_buffer = dev->dma_transfer_buffer; - - uint64_t host_phys_addr = pci_dma_buffer_get_physical_addr (transfer_buffer); - uint64_t host_user_addr = pci_dma_buffer_get_user_addr (transfer_buffer); - while (num_bytes > 0) { - uint32_t transfered_bytes = std::min(num_bytes, dma_buf_size); - pcie_dma_transfer_turbo (dev, byte_addr, host_phys_addr, transfered_bytes, false); - memcpy (buffer_addr, (void*)host_user_addr, transfered_bytes); - num_bytes -= transfered_bytes; - byte_addr += transfered_bytes; - buffer_addr += transfered_bytes; - } - return; - } +void read_block(TTDevice *dev, uint64_t byte_addr, uint64_t num_bytes, uint8_t* buffer_addr) { record_access("read_block_b", byte_addr, num_bytes, false, false, true, false); // addr, size, turbo, write, block, endline @@ -1116,24 +974,7 @@ void read_block(TTDevice *dev, uint64_t byte_addr, uint64_t num_bytes, uint8_t* print_buffer (buffer_addr, std::min((uint64_t)g_NUM_BYTES_TO_PRINT, num_bytes), true); } -void write_block(TTDevice *dev, uint64_t byte_addr, uint64_t num_bytes, const uint8_t* buffer_addr, uint32_t dma_buf_size) { - if (num_bytes >= g_DMA_BLOCK_SIZE_WRITE_THRESHOLD_BYTES && g_DMA_BLOCK_SIZE_WRITE_THRESHOLD_BYTES > 0) { - record_access ("write_block_a", byte_addr, num_bytes, true, true, true, true); // addr, size, turbo, write, block, endline - - DMAbuffer &transfer_buffer = dev->dma_transfer_buffer; - - uint64_t host_phys_addr = 
pci_dma_buffer_get_physical_addr (transfer_buffer); - uint64_t host_user_addr = pci_dma_buffer_get_user_addr (transfer_buffer); - while (num_bytes > 0) { - uint32_t transfered_bytes = std::min(num_bytes, dma_buf_size); - memcpy ( (void*)host_user_addr, buffer_addr, transfered_bytes); - pcie_dma_transfer_turbo (dev, byte_addr, host_phys_addr, transfered_bytes, true); - num_bytes -= transfered_bytes; - byte_addr += transfered_bytes; - buffer_addr += transfered_bytes; - } - return; - } +void write_block(TTDevice *dev, uint64_t byte_addr, uint64_t num_bytes, const uint8_t* buffer_addr) { record_access("write_block_b", byte_addr, num_bytes, false, true, true, false); // addr, size, turbo, write, block, endline @@ -1174,57 +1015,6 @@ void write_block(TTDevice *dev, uint64_t byte_addr, uint64_t num_bytes, const ui print_buffer (buffer_addr, std::min((uint64_t)g_NUM_BYTES_TO_PRINT, num_bytes), true); } -void read_checking_enable(bool enable = true) { - g_READ_CHECKING_ENABLED = enable; -} - -// Read/write to the configuration space of the device -// pData is a pointer to a buffer (see memory module) -DWORD read_cfg(TTDevice *dev, DWORD byte_offset, uint64_t pData, DWORD num_bytes) { - - if (pread(get_config_space_fd(dev), reinterpret_cast(pData), num_bytes, byte_offset) != num_bytes) { - throw std::runtime_error("Config space read failed for device "); - } - - return 0; -} - -DWORD write_cfg(TTDevice *dev, DWORD byte_offset, uint64_t pData, DWORD num_bytes) { - - if (pwrite(get_config_space_fd(dev), reinterpret_cast(pData), num_bytes, byte_offset) != num_bytes) { - throw std::runtime_error("Config space read failed for device "); - } - - return 0; -} - -DMAbuffer pci_allocate_dma_buffer(TTDevice *dev, uint32_t size) { - - uint32_t page_size = getpagesize(); - uint32_t page_aligned_size = (size + page_size - 1) & ~(page_size - 1); - - DMAbuffer ret_val = allocate_dma_buffer(dev, dev->next_dma_buf++, page_aligned_size); - LOG1 ("Allocated DMA buffer at 0x%lx 0x%lx size: %u\n", ret_val.pBuf, ret_val.pDma, size); - return ret_val; -} - -void pcie_init_dma_transfer_turbo (PCIdevice* dev) { - // From SHA 8cf7ff1bc7b3886a: - if (detect_arch(dev) == tt::ARCH::WORMHOLE_B0) { - c_CSM_PCIE_CTRL_DMA_REQUEST_OFFSET = 0x1fef84c8; // chip.AXI.get_path_info("ARC_CSM.ARC_PCIE_DMA_REQUEST") - } else { - c_CSM_PCIE_CTRL_DMA_REQUEST_OFFSET = 0x1fef84c0; // chip.AXI.get_path_info("ARC_CSM.ARC_PCIE_DMA_REQUEST") - } - c_DMA_TRIGGER_ADDRESS = 0x1ff30074; // chip.AXI.get_path_info("ARC_RESET.SCRATCH[5]") - c_ARC_MISC_CNTL_ADDRESS = 0x1ff30100; // chip.AXI.get_path_info("ARC_RESET.ARC_MISC_CNTL") -} - -void set_use_dma(bool msi, uint32_t dma_block_size_read_threshold_bytes, uint32_t dma_block_size_write_threshold_bytes) { - g_USE_MSI_FOR_DMA = msi; - g_DMA_BLOCK_SIZE_READ_THRESHOLD_BYTES = dma_block_size_read_threshold_bytes; - g_DMA_BLOCK_SIZE_WRITE_THRESHOLD_BYTES = dma_block_size_write_threshold_bytes; -} - void write_regs(volatile uint32_t *dest, const uint32_t *src, uint32_t word_len) { while (word_len-- != 0) { *dest++ = *src++; @@ -1286,66 +1076,6 @@ void read_regs(TTDevice *dev, uint32_t byte_addr, uint32_t word_len, void *data) print_buffer (data, std::min(g_NUM_BYTES_TO_PRINT, word_len * 4), true); } -void handle_dma_timeout(TTDevice *dev, uint32_t size_bytes, bool write) { - detect_ffffffff_read(dev); - throw std::runtime_error(std::string("DMA transfer timeout: ") - + std::to_string(size_bytes) - + (write ? " byte write." 
: " byte read.")); -} -uint32_t pcie_dma_transfer_turbo (TTDevice *dev, uint32_t chip_addr, uint32_t host_phys_addr, uint32_t size_bytes, bool write) { - // c_timer t (""); - - // t.now_in ("1. DMA setup"); - - if (c_CSM_PCIE_CTRL_DMA_REQUEST_OFFSET == 0) { - throw std::runtime_error ("pcie_init_dma_transfer_turbo must be called before pcie_dma_transfer_turbo"); - } - - arc_pcie_ctrl_dma_request_t req = { - .chip_addr = chip_addr, - .host_phys_addr = host_phys_addr, - .completion_flag_phys_addr = static_cast(pci_dma_buffer_get_physical_addr(dev->dma_completion_flag_buffer)), - .size_bytes = size_bytes, - .write = (write ? 1U : 0U), - .pcie_msi_on_done = g_USE_MSI_FOR_DMA ? 1U : 0U, - .pcie_write_on_done = g_USE_MSI_FOR_DMA ? 0U : 1U, - .trigger = 1U, - .repeat = 1 - }; - - volatile uint32_t *complete_flag = (uint32_t *)pci_dma_buffer_get_user_addr(dev->dma_completion_flag_buffer); - *complete_flag = 0; - - // Configure the DMA engine - msi_interrupt_received = false; - write_regs (dev, c_CSM_PCIE_CTRL_DMA_REQUEST_OFFSET, sizeof(req) / sizeof(uint32_t), &req); - - // Trigger ARC interrupt 0 on core 0 - int arc_misc_cntl_value = 0; - - // NOTE: Ideally, we should read the state of this register before writing to it, but that - // casues a lot of delay (reads have huge latencies) - arc_misc_cntl_value |= (1 << 16); // Cause IRQ0 on core 0 - write_regs (dev, c_ARC_MISC_CNTL_ADDRESS, 1, &arc_misc_cntl_value); - - if (!g_USE_MSI_FOR_DMA) { - // t.now_in ("2. DMA poll"); - int wait_loops = 0; - while (true) { - // The complete flag is set ty by ARC (see src/hardware/soc/tb/arc_fw/lib/pcie_dma.c) - if (*complete_flag == 0xfaca) break; - wait_loops++; - } - // LOG2 ("Waited %d iterations\n", wait_loops); - } else { - // t.now_in ("2. DMA wait for MSI"); - while (msi_interrupt_received == false) - ; - } - - return 0; // TODO: status -} - void print_device_info (struct PCIdevice &d) { LOG1("PCIEIntfId 0x%x\n", d.id); LOG1("VID:DID 0x%x:0x%x\n", d.vendor_id, d.device_id); @@ -1519,25 +1249,16 @@ void tt_SiliconDevice::initialize_interprocess_mutexes(int pci_interface_id, boo void tt_SiliconDevice::create_device(const std::unordered_set &target_mmio_device_ids, const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs, const bool clean_system_resources) { m_pci_log_level = 0; - m_dma_buf_size = 0; LOG1("---- tt_SiliconDevice::tt_SiliconDevice\n"); - static int unique_driver_id = 0; - driver_id = unique_driver_id++; // Set the log level for debugging const char* pci_log_level = std::getenv("TT_PCI_LOG_LEVEL"); if (pci_log_level) { m_pci_log_level = atoi (pci_log_level); } - set_debug_level(m_pci_log_level); + g_DEBUG_LEVEL = m_pci_log_level; LOG1 ("TT_PCI_LOG_LEVEL=%d\n", m_pci_log_level); - const char* dma_buf_size = std::getenv("TT_PCI_DMA_BUF_SIZE"); - if (dma_buf_size) { - m_dma_buf_size = atoi (dma_buf_size); - } - LOG1 ("TT_PCI_DMA_BUF_SIZE=%d\n", m_dma_buf_size); - // Don't buffer stdout. setbuf(stdout, NULL); @@ -1584,7 +1305,7 @@ void tt_SiliconDevice::create_device(const std::unordered_set &target print_device_info (*pci_device); // MT: Initial BH - hugepages will fail init - // For using silicon driver without workload to query mission mode params, no need for hugepage/dmabuf. + // For using silicon driver without workload to query mission mode params, no need for hugepage. if (!skip_driver_allocs){ bool hugepages_initialized = init_hugepage(logical_device_id); // Large writes to remote chips require hugepages to be initialized. 
@@ -1592,9 +1313,8 @@ void tt_SiliconDevice::create_device(const std::unordered_set &target if(target_remote_chips.size()) { log_assert(hugepages_initialized, "Hugepages must be successfully initialized if workload contains remote chips!"); } - uint16_t channel = 0; // Single channel sufficient for this? - if (not hugepage_mapping.at(logical_device_id).at(channel)) { - init_dmabuf(logical_device_id); + if (not hugepage_mapping.at(logical_device_id).at(0)) { + log_warning(LogSiliconDriver, "No hugepage mapping at device {}", logical_device_id); } } harvested_coord_translation.insert({logical_device_id, create_harvested_coord_translation(arch_name, true)}); //translation layer for harvested coords. Default is identity map @@ -1609,9 +1329,6 @@ void tt_SiliconDevice::create_device(const std::unordered_set &target } } -bool tt_SiliconDevice::noc_translation_en() { - return translation_tables_en; -} bool tt_SiliconDevice::using_harvested_soc_descriptors() { return perform_harvesting_on_sdesc && performed_harvesting; } @@ -1811,17 +1528,6 @@ void tt_SiliconDevice::populate_cores() { } } -std::unordered_map tt_SiliconDevice::get_harvesting_masks_from_harvested_rows(std::unordered_map> harvested_rows) { - std::unordered_map harvesting_masks = {}; - for(const auto& chip : harvested_rows) { - uint32_t harvesting_mask_per_chip = 0; - harvesting_masks.insert({chip.first, 0}); - for(const auto& row : chip.second) { - harvesting_masks.at(chip.first) |= (1 << row); - } - } - return harvesting_masks; -} std::vector tt_SiliconDevice::extract_rows_to_remove(const tt::ARCH &arch, const int worker_grid_rows, const int harvested_rows) { // Check if harvesting config is legal for GS and WH log_assert(!((harvested_rows & 1) || (harvested_rows & 64) || (harvested_rows & 0xFFFFF000)), "For grayskull and wormhole, only rows 1-5 and 7-11 can be harvested"); @@ -2031,47 +1737,11 @@ void tt_SiliconDevice::initialize_pcie_devices() { check_pcie_device_initialized(device_it.first); } - // If requires multi-channel or doesn't support mmio-p2p, init iatus without p2p. - if (m_num_host_mem_channels <= 1 && arch_name == tt::ARCH::GRAYSKULL) { - init_pcie_iatus(); - } else { - // TODO: Implement support for multiple host channels on BLACKHOLE. - log_assert(!(arch_name == tt::ARCH::BLACKHOLE && m_num_host_mem_channels > 1), - "More channels are not yet supported for Blackhole"); - init_pcie_iatus_no_p2p(); - } + // TODO: Implement support for multiple host channels on BLACKHOLE. + log_assert(!(arch_name == tt::ARCH::BLACKHOLE && m_num_host_mem_channels > 1), "More channels are not yet supported for Blackhole"); + init_pcie_iatus(); init_membars(); - - // https://yyz-gitlab.local.tenstorrent.com/ihamer/ll-sw/issues/25 - // Note: using pcie dma while device is idle is safe, mixing p2p is unsafe, see issue above - // TODO: disable pcie dma if p2p traffic is present, ie. 
chip-to-chip or chip-to-host - - for (auto &device_it : m_pci_device_map){ - struct PCIdevice* pci_device = device_it.second; - auto device_id = pci_device->device_id; - // MT Initial BH - Don't use PCIe DMA - bool enable_pcie_dma; - if (arch_name == tt::ARCH::BLACKHOLE) { - enable_pcie_dma = false; - } else { - enable_pcie_dma = m_dma_buf_size>0; - } - // Use DMA only for transfers that cross the size thresholds (empirically determined) - if (enable_pcie_dma) { - try { - log_trace(LogSiliconDriver, "Enable PCIE DMA with bufsize {}", m_dma_buf_size); - set_use_dma (false, 128, 0); // use dma for reads only - init_dma_turbo_buf(pci_device); - } catch (const std::exception &e) { - log_trace(LogSiliconDriver, "Disable PCIE DMA, fallback to MMIO transfers due to exepction {}", e.what()); - set_use_dma (false, 0, 0); - uninit_dma_turbo_buf(pci_device); - } - } else { - log_trace(LogSiliconDriver, "Disable PCIE DMA"); - } - } } void tt_SiliconDevice::broadcast_pcie_tensix_risc_reset(struct PCIdevice *device, const TensixSoftResetOptions &soft_resets) { @@ -2169,24 +1839,12 @@ std::vector tt_SiliconDevice::detect_available_device_ids() { return detected_device_ids; } -static bool check_dram_core_exists(const std::vector<std::vector<tt_xy_pair>> &all_dram_cores, tt_xy_pair target_core) { - bool dram_core_exists = false; - for (const auto &dram_cores_in_channel : all_dram_cores) { - for (auto dram_core : dram_cores_in_channel) { - if (dram_core.x == target_core.x && dram_core.y == target_core.y) { - return true; - } - } - } - return false; -} - -std::function<void(uint32_t, uint32_t, const uint8_t*, uint32_t)> tt_SiliconDevice::get_fast_pcie_static_tlb_write_callable(int device_id) { +std::function<void(uint32_t, uint32_t, const uint8_t*)> tt_SiliconDevice::get_fast_pcie_static_tlb_write_callable(int device_id) { struct PCIdevice* pci_device = get_pci_device(device_id); TTDevice* dev = pci_device->hdev; - const auto callable = [dev](uint32_t byte_addr, uint32_t num_bytes, const uint8_t* buffer_addr, uint32_t dma_buf_size) { - write_block(dev, byte_addr, num_bytes, buffer_addr, dma_buf_size); + const auto callable = [dev](uint32_t byte_addr, uint32_t num_bytes, const uint8_t* buffer_addr) { + write_block(dev, byte_addr, num_bytes, buffer_addr); }; return callable; @@ -2242,9 +1900,9 @@ void tt_SiliconDevice::write_device_memory(const void *mem_ptr, uint32_t size_in if (dev->bar4_wc != nullptr && tlb_size == BH_4GB_TLB_SIZE) { // This is only for Blackhole.
If we want to write to DRAM (BAR4 space), we add offset // to which we write so write_block knows it needs to target BAR4 - write_block(dev, (tlb_offset + address % tlb_size) + BAR0_BH_SIZE, size_in_bytes, buffer_addr, m_dma_buf_size); + write_block(dev, (tlb_offset + address % tlb_size) + BAR0_BH_SIZE, size_in_bytes, buffer_addr); } else { const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); @@ -2254,7 +1912,7 @@ void tt_SiliconDevice::write_device_memory(const void *mem_ptr, uint32_t size_in auto [mapped_address, tlb_size] = set_dynamic_tlb(pci_device, tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb)); uint32_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size); - write_block(dev, mapped_address, transfer_size, buffer_addr, m_dma_buf_size); + write_block(dev, mapped_address, transfer_size, buffer_addr); size_in_bytes -= transfer_size; address += transfer_size; @@ -2285,9 +1943,9 @@ void tt_SiliconDevice::read_device_memory(void *mem_ptr, tt_cxy_pair target, std if (dev->bar4_wc != nullptr && tlb_size == BH_4GB_TLB_SIZE) { // This is only for Blackhole. If we want to read from DRAM (BAR4 space), we add offset // from which we read so read_block knows it needs to target BAR4 - read_block(dev, (tlb_offset + address % tlb_size) + BAR0_BH_SIZE, size_in_bytes, buffer_addr, m_dma_buf_size); + read_block(dev, (tlb_offset + address % tlb_size) + BAR0_BH_SIZE, size_in_bytes, buffer_addr); } else { - read_block(dev, tlb_offset + address % tlb_size, size_in_bytes, buffer_addr, m_dma_buf_size); + read_block(dev, tlb_offset + address % tlb_size, size_in_bytes, buffer_addr); } LOG1 (" read_block called with tlb_offset: %d, tlb_size: %d\n", tlb_offset, tlb_size); } else { @@ -2298,7 +1956,7 @@ void tt_SiliconDevice::read_device_memory(void *mem_ptr, tt_cxy_pair target, std auto [mapped_address, tlb_size] = set_dynamic_tlb(pci_device, tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb)); uint32_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size); - read_block(dev, mapped_address, transfer_size, buffer_addr, m_dma_buf_size); + read_block(dev, mapped_address, transfer_size, buffer_addr); size_in_bytes -= transfer_size; address += transfer_size; @@ -2308,7 +1966,7 @@ void tt_SiliconDevice::read_device_memory(void *mem_ptr, tt_cxy_pair target, std } } -void tt_SiliconDevice::read_dma_buffer( +void tt_SiliconDevice::read_buffer( void* mem_ptr, std::uint32_t address, std::uint16_t channel, @@ -2321,20 +1979,18 @@ void tt_SiliconDevice::read_dma_buffer( if(hugepage_mapping.at(src_device_id).at(channel)) { user_scratchspace = static_cast(hugepage_mapping.at(src_device_id).at(channel)) + (address & HUGEPAGE_MAP_MASK); - } else if (buf_mapping) { - user_scratchspace = static_cast(buf_mapping) + (address & DMA_MAP_MASK); } else { - std::string err_msg = "write_dma_buffer: Hugepage or DMAbuffer are not allocated for src_device_id: " + std::to_string(src_device_id) + " ch: " + std::to_string(channel); + std::string err_msg = "read_buffer: Hugepages are not allocated for src_device_id: " + std::to_string(src_device_id) + " ch: " + std::to_string(channel); err_msg += " - Ensure sufficient number of Hugepages installed per device (1 per host mem ch, per device)"; throw std::runtime_error(err_msg); } -
LOG1("---- tt_SiliconDevice::read_dma_buffer (src_device_id: %d, ch: %d) from 0x%lx\n", src_device_id, channel, user_scratchspace); + LOG1("---- tt_SiliconDevice::read_buffer (src_device_id: %d, ch: %d) from 0x%lx\n", src_device_id, channel, user_scratchspace); memcpy(mem_ptr, user_scratchspace, size_in_bytes); } -void tt_SiliconDevice::write_dma_buffer( +void tt_SiliconDevice::write_buffer( const void *mem_ptr, std::uint32_t size, std::uint32_t address, @@ -2343,24 +1999,15 @@ void tt_SiliconDevice::write_dma_buffer( void * user_scratchspace = nullptr; if(hugepage_mapping.at(src_device_id).at(channel)) { - log_assert(size <= HUGEPAGE_REGION_SIZE, "write_dma_buffer data has larger size {} than destination buffer {}", size, HUGEPAGE_REGION_SIZE); + log_assert(size <= HUGEPAGE_REGION_SIZE, "write_buffer data has larger size {} than destination buffer {}", size, HUGEPAGE_REGION_SIZE); log_debug(LogSiliconDriver, "Using hugepage mapping at address {} offset {} chan {} size {}", hugepage_mapping.at(src_device_id).at(channel), (address & HUGEPAGE_MAP_MASK), channel, size); user_scratchspace = static_cast(hugepage_mapping.at(src_device_id).at(channel)) + (address & HUGEPAGE_MAP_MASK); - } - else if(buf_mapping) { - log_assert(size <= DMA_BUF_REGION_SIZE, "write_dma_buffer data has larger size {} than destination buffer {}", size, DMA_BUF_REGION_SIZE); - log_debug(LogSiliconDriver, "Using DMA Buffer at address {} offset {} size {}", - buf_mapping, - address, - size); - // we failed when initializing huge pages, we are using a 1MB DMA buffer as a stand-in - user_scratchspace = reinterpret_cast(buf_mapping); } else { - std::string err_msg = "write_dma_buffer: Hugepage or DMAbuffer are not allocated for src_device_id: " + std::to_string(src_device_id) + " ch: " + std::to_string(channel); + std::string err_msg = "write_buffer: Hugepage are not allocated for src_device_id: " + std::to_string(src_device_id) + " ch: " + std::to_string(channel); throw std::runtime_error(err_msg); } memcpy(user_scratchspace, mem_ptr, size); @@ -2435,46 +2082,6 @@ std::map tt_SiliconDevice::get_clocks() { return clock_freq_map; } -//! Simple test of communication to device/target. true if it passes. 
-// bool tt_SiliconDevice::test_write_read(tt_cxy_pair target) { -// WARN("---- tt_SiliconDevice::test_write_read not implemented\n"); -// return true; -// } - -// bool tt_SiliconDevice::test_write_speed (struct PCIdevice* pci_device) { -// TTDevice *dev = pci_device->hdev; - -// if (dev->bar0_uc == dev->bar0_wc) { -// WARN("---- tt_SiliconDevice::test_write_speed WC not configured\n"); -// } - -// std::byte fill_value{0x42}; -// std::vector write_buf(architecture_implementation->get_static_tlb_size(), fill_value); - -// auto before = std::chrono::high_resolution_clock::now(); -// for (std::uint32_t y = 1; y < architecture_implementation->get_grid_size_y(); y++) -// { -// for (std::uint32_t x = 1; x < architecture_implementation->get_grid_size_x(); x++) -// { -// auto tlb_index = map_core_to_tlb(tt_xy_pair(x, y)); -// if (tlb_index < 0) { continue; } - -// auto offset = tlb_index * architecture_implementation->get_static_tlb_size(); - -// memcpy(static_cast(dev->bar0_wc) + offset, write_buf.data(), write_buf.size()); -// } -// } -// auto after = std::chrono::high_resolution_clock::now(); - -// std::chrono::duration interval = after - before; - -// unsigned int write_bw = 120 * std::milli::den / interval.count(); - -// LOG1("---- tt_SiliconDevice::test_write_speed Wrote 120MB @ %u MB/s\n", write_bw); - -// return (write_bw >= 512); // L1 write BW scales with AICLK, for low AICLK it will be very slow. -// } - tt_SiliconDevice::~tt_SiliconDevice () { LOG1 ("---- tt_SiliconDevice::~tt_SiliconDevice\n"); @@ -2535,10 +2142,6 @@ std::optional> tt_SiliconDevice::get_tlb_data_fro return tlb_data; } -uint32_t tt_SiliconDevice::get_m_dma_buf_size() const { - return m_dma_buf_size; -} - void tt_SiliconDevice::configure_tlb(chip_id_t logical_device_id, tt_xy_pair core, std::int32_t tlb_index, std::int32_t address, uint64_t ordering) { log_assert(ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || ordering == TLB_DATA::Relaxed, "Invalid ordering specified in tt_SiliconDevice::configure_tlb"); struct PCIdevice* pci_device = get_pci_device(logical_device_id); @@ -2554,118 +2157,11 @@ void tt_SiliconDevice::set_fallback_tlb_ordering_mode(const std::string& fallbac log_assert(fallback_tlb != "LARGE_READ_TLB" && fallback_tlb != "LARGE_WRITE_TLB", "Ordering modes for LARGE_READ_TLB and LARGE_WRITE_TLB cannot be modified."); dynamic_tlb_ordering_modes.at(fallback_tlb) = ordering; } -// This function checks that all TLBs are properly setup. It should return 0 if all is good (i.e. if init_pcie_tlb is called prior) -// int tt_SiliconDevice::test_pcie_tlb_setup (struct PCIdevice* pci_device) { - // LOG1("---- tt_SiliconDevice::test_pcie_tlb_setup\n"); - // uint64_t tlb_data; - // int ret_val; - // // Check static TLBs (only active Tensix cores for GS ... 
Active tensix cores + ethernet cores for WH) - // for (uint32_t y = 0; y < architecture_implementation->get_grid_size_y() - num_rows_harvested; y++) { - // for (uint32_t x = 0; x < architecture_implementation->get_grid_size_x(); x++) { - // int tlb_index = get_static_tlb_index(tt_xy_pair(x, y)); - // auto translated_coords = harvested_coord_translation.at(pci_device -> id).at(tt_xy_pair(x, y)); - // if (tlb_index < 0) { continue; } - - // auto tlb_data_attempt = architecture_implementation->get_tlb_data(tlb_index, TLB_DATA { - // .x_end = translated_coords.x, - // .y_end = translated_coords.y, - // }); - // if (!tlb_data_attempt.has_value()) { - // throw std::runtime_error("Error setting up (" + std::to_string(x) + ", " + std::to_string(y) + ") in pcie_tlb_test."); - // } - // uint64_t expected_tlb_data = tlb_data_attempt.value(); - - // uint32_t tlb_setup_addr = architecture_implementation->get_static_tlb_cfg_addr() + 8 * tlb_index; // Each tlb setup takes 2 dwords, hence 8 bytes - // read_regs(pci_device->hdev, tlb_setup_addr, 2, &tlb_data); - - // } - // } - - // // Check 16MB TLBs 1-16 for peer-to-peer communication with DRAM channel 0 - // uint64_t peer_dram_offset = architecture_implementation->get_dram_channel_0_peer2peer_region_start(); - // for (uint32_t tlb_id = 1; tlb_id < 17; tlb_id++) { - // auto tlb_data_expected = architecture_implementation->get_tlb_data(architecture_implementation->get_tlb_base_index_16m() + tlb_id, TLB_DATA { - // .local_offset = peer_dram_offset / architecture_implementation->get_dynamic_tlb_16m_size(), - // .x_end = architecture_implementation->get_dram_channel_0_x(), - // .y_end = architecture_implementation->get_dram_channel_0_y(), - // .ordering = TLB_DATA::Posted, - // .static_vc = true, - // }); - // uint64_t tlb_data_observed; - // uint32_t tlb_setup_addr = architecture_implementation->get_dynamic_tlb_16m_cfg_addr() + 8 * tlb_id; // Each tlb setup takes 2 dwords, hence 8 bytes - // read_regs(pci_device->hdev, tlb_setup_addr, 2, &tlb_data_observed); - // ret_val = (tlb_data_expected == tlb_data_observed) ? 0 : 1; - // if (ret_val != 0) return ret_val; - // peer_dram_offset += architecture_implementation->get_dynamic_tlb_16m_size(); - // } - // return ret_val; -//} - -// Set up IATU for peer2peer -// Consider changing this function -void tt_SiliconDevice::init_pcie_iatus() { - - int starting_device_id = m_pci_device_map.begin()->first; - int ending_device_id = m_pci_device_map.rbegin()->first; - int num_enabled_devices = m_pci_device_map.size(); - - LOG1("---- tt_SiliconDevice::init_pcie_iatus() num_enabled_devices: %d starting_device_id: %d ending_device_id: %d\n", num_enabled_devices, starting_device_id, ending_device_id); - log_assert(m_num_host_mem_channels <= 1, "Maximum of 1x 1GB Host memory channels supported."); - - // Requirement for ring topology in GS, but since WH can share below code, check it again here for mmio mapped devices, - // otherwise us/ds device calculations will not be correct. Don't expect to see this for Wormhole today. 
- log_assert((starting_device_id + num_enabled_devices - 1) == ending_device_id, "The set of workload mmio-mapped target_device_id's must be sequential, without gaps."); - - for (auto &src_device_it : m_pci_device_map){ - int src_pci_id = src_device_it.first; - struct PCIdevice* src_pci_device = src_device_it.second; - - uint32_t current_peer_region = 0; - const int num_peer_ids = 3; // 0=HOST, 1=UPSTREAM Device, 2=DOWNSTREAM Device, 3=Unused - for (int peer_id = 0; peer_id < num_peer_ids; peer_id++) { - - //TODO: migrate this to huge pages when that support is in - if (peer_id == 0){ - LOG2 ("Setting up src_pci_id: %d peer_id: %d to Host. current_peer_region: %d\n", src_pci_id, peer_id, current_peer_region); - // Device to Host (peer_id==0) - const uint16_t host_memory_channel = 0; // Only single channel supported. - if (hugepage_mapping.at(src_pci_id).at(host_memory_channel)) { - iatu_configure_peer_region(src_pci_id, current_peer_region, hugepage_physical_address.at(src_pci_id).at(host_memory_channel), HUGEPAGE_REGION_SIZE); - host_channel_size.insert({(int)src_pci_device->logical_id, {HUGEPAGE_REGION_SIZE}}); - } else if(buf_mapping) { - // we failed when initializing huge pages, we are using a 1MB DMA buffer as a stand-in - iatu_configure_peer_region(src_pci_id, current_peer_region, buf_physical_addr, DMA_BUF_REGION_SIZE); - } - } else if (peer_id == 1 || peer_id == 2){ - // Device to Device (peer_id==1 : Upstream, peer_id==2 : Downstream) - // For determining upstream/downstream peers in ring topology - this matches is_target_device_downstream() in net2pipe - int upstream_peer_device_id = src_pci_id > starting_device_id ? src_pci_id - 1 : ending_device_id; - int downstream_peer_device_id = src_pci_id < (ending_device_id) ? src_pci_id + 1 : starting_device_id; - - int peer_device_id = peer_id == 1 ? upstream_peer_device_id : downstream_peer_device_id; - - struct PCIdevice* peer_pci_device = m_pci_device_map.at(peer_device_id); - uint64_t peer_BAR_addr = peer_pci_device->BAR_addr; - uint32_t peer_pci_interface_id = peer_pci_device->id; - uint32_t TLB1_16MB_OFFSET = 0; // Was 192MB offset to DRAM, now added by net2pipe since ATU maps to base of 512MB PCI Bar. - uint32_t PEER_REGION_SIZE = 1024 * 1024 * 1024; // Was 256MB. Want 512MB. Updated to 1024MB to match net2pipe more easily. - // FIXME - How to reduce PEER_REGION_SIZE=256 again, and make this still work? Need to make the ATU mappings non-contiguous 256MB chunks (every 1GB?) to match net2pipe? - - LOG2 ("Setting up src_pci_id: %d peer_id: %d to Device (upstream_peer_device_id: %d downstream_peer_device_id: %d) gives peer_device_id: %d (peer_pci_interface_id: %d) current_peer_region: %d\n", - src_pci_id, peer_id, upstream_peer_device_id, downstream_peer_device_id, peer_device_id, peer_pci_interface_id, current_peer_region ); - - iatu_configure_peer_region (src_pci_id, current_peer_region, peer_BAR_addr + TLB1_16MB_OFFSET, PEER_REGION_SIZE); - } - current_peer_region ++; - } - } -} // TT<->TT P2P support removed in favor of increased Host memory. 
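The replacement init_pcie_iatus below maps each host-memory channel straight to its hugepage region, and channel 3 is truncated to 768 MB (805306368 bytes) because of the iATU limitation noted in the hunk. A small sketch of just that sizing rule; the helper name here is invented, while iatu_configure_peer_region remains the driver's:

#include <cstdint>
#include <cstdio>

constexpr uint32_t kHugepageRegionSize = 1u << 30;    // 1 GB per host memory channel
constexpr uint32_t kChannel3RegionSize = 805306368u;  // 1 GB minus 256 MB for channel 3

// Hypothetical helper mirroring the per-channel sizing in init_pcie_iatus.
uint32_t iatu_region_size_for_channel(int channel_id) {
    return channel_id == 3 ? kChannel3RegionSize : kHugepageRegionSize;
}

int main() {
    for (int ch = 0; ch < 4; ch++) {  // up to g_MAX_HOST_MEM_CHANNELS = 4 channels
        std::printf("channel %d -> iATU region size 0x%08x\n", ch, iatu_region_size_for_channel(ch));
    }
    return 0;
}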
-void tt_SiliconDevice::init_pcie_iatus_no_p2p() { - +void tt_SiliconDevice::init_pcie_iatus() { int num_enabled_devices = m_pci_device_map.size(); - LOG1("---- tt_SiliconDevice::init_pcie_iatus_no_p2p() num_enabled_devices: %d\n", num_enabled_devices); + LOG1("---- tt_SiliconDevice::init_pcie_iatus() num_enabled_devices: %d\n", num_enabled_devices); log_assert(m_num_host_mem_channels <= g_MAX_HOST_MEM_CHANNELS, "Maximum of {} 1GB Host memory channels supported.", g_MAX_HOST_MEM_CHANNELS); for (auto &src_device_it : m_pci_device_map){ @@ -2674,7 +2170,6 @@ void tt_SiliconDevice::init_pcie_iatus_no_p2p() { // Device to Host (multiple channels) for (int channel_id = 0; channel_id < m_num_host_mem_channels; channel_id++) { - // TODO - Try to remove DMA buffer support. if (hugepage_mapping.at(src_pci_id).at(channel_id)) { std::uint32_t region_size = HUGEPAGE_REGION_SIZE; if(channel_id == 3) region_size = 805306368; // Remove 256MB from full 1GB for channel 3 (iATU limitation) @@ -2684,34 +2179,14 @@ void tt_SiliconDevice::init_pcie_iatus_no_p2p() { host_channel_size.insert({(int)src_pci_device->logical_id, {}}); } host_channel_size.at(src_pci_device -> logical_id).push_back(region_size); - } else if(buf_mapping) { - log_debug(LogSiliconDriver, "Configuring ATU channel {} to point to DMA buffer.", channel_id); - // we failed when initializing huge pages, we are using a 1MB DMA buffer as a stand-in - iatu_configure_peer_region(src_pci_id, channel_id, buf_physical_addr, DMA_BUF_REGION_SIZE); + } else { + std::string err_msg = "init_pcie_iatus: Hugepages are not allocated for src_pci_id: " + std::to_string(src_pci_id) + " ch: " + std::to_string(channel_id); + throw std::runtime_error(err_msg); } } } } -uint32_t tt_SiliconDevice::dma_allocation_size(chip_id_t src_device_id) -{ - - // Fall back to first device if no src_device_id is provided. Assumes all devices have the same size, which is true. - chip_id_t device_index = src_device_id == -1 ? 
m_pci_device_map.begin()->first : src_device_id; - - if (hugepage_mapping.at(device_index).at(0)) { - return HUGEPAGE_REGION_SIZE; - } else if (buf_mapping) { - return DMA_BUF_REGION_SIZE; - } else { - log_fatal("Nothing has been allocated yet"); - return 0; - } -} - - - - // Looks for hugetlbfs inside /proc/mounts matching desired pagesize (typically 1G) std::string find_hugepage_dir(std::size_t pagesize) { @@ -2799,52 +2274,6 @@ int tt_SiliconDevice::open_hugepage_file(const std::string &dir, chip_id_t physi return fd; } -bool tt_SiliconDevice::init_dmabuf(chip_id_t device_id) { - if (buf_mapping == nullptr) { - - TTDevice *dev = m_pci_device_map.begin()->second->hdev; - - DMAbuffer buf = pci_allocate_dma_buffer(dev, DMA_BUF_REGION_SIZE); - buf_mapping = static_cast(reinterpret_cast(pci_dma_buffer_get_user_addr(buf))); - buf_physical_addr= pci_dma_buffer_get_physical_addr(buf); - } - return true; -} - -bool tt_SiliconDevice::init_dma_turbo_buf (struct PCIdevice* pci_device) { - // Allocate buffers for DMA transfer data and flag - pci_device->hdev->dma_completion_flag_buffer = pci_allocate_dma_buffer(pci_device->hdev, sizeof(uint64_t)); - pci_device->hdev->dma_transfer_buffer = pci_allocate_dma_buffer(pci_device->hdev, m_dma_buf_size); - pcie_init_dma_transfer_turbo(pci_device); - return true; -} - -bool tt_SiliconDevice::uninit_dma_turbo_buf (struct PCIdevice* pci_device) { - struct DMAbuffer &flag_buffer = pci_device->hdev->dma_completion_flag_buffer; - struct DMAbuffer &xfer_buffer = pci_device->hdev->dma_transfer_buffer; - if (flag_buffer.pBuf) { - for (auto it = pci_device->hdev->dma_buffer_mappings.begin(); it != pci_device->hdev->dma_buffer_mappings.end();) { - if (it->pBuf == flag_buffer.pBuf) { - it = pci_device->hdev->dma_buffer_mappings.erase(it); - } else { - ++it; - } - } - munmap(flag_buffer.pBuf, flag_buffer.size); - } - if (xfer_buffer.pBuf) { - for (auto it = pci_device->hdev->dma_buffer_mappings.begin(); it != pci_device->hdev->dma_buffer_mappings.end();) { - if (it->pBuf == xfer_buffer.pBuf) { - it = pci_device->hdev->dma_buffer_mappings.erase(it); - } else { - ++it; - } - } - munmap(xfer_buffer.pBuf, xfer_buffer.size); - } - return true; -} - // For debug purposes when various stages fails. void print_file_contents(std::string filename, std::string hint = ""){ if (std::filesystem::exists(filename)){ @@ -2975,67 +2404,11 @@ int tt_SiliconDevice::test_setup_interface () { } } -// Code used to test non existent broadcast TLB -// Keep for now, in case we need to test broadcast TLB again. -// int tt_SiliconDevice::test_broadcast (int logical_device_id) { -// LOG1("---- tt_SiliconDevice::test_broadcast\n"); - -// int ret_val = 0; -// struct PCIdevice* pci_device = get_pci_device(logical_device_id); - -// assert (test_pcie_tlb_setup(pci_device) == 0); - -// std::vector fill_array (1024, 0); -// uint32_t broadcast_bar_offset = architecture_implementation->get_broadcast_tlb_index() * architecture_implementation->get_static_tlb_size(); -// LOG2 ("broadcast_bar_offset = 0x%x\n", broadcast_bar_offset); - -// uint64_t fill_array_ptr = (uint64_t)(&fill_array[0]); - -// // a. 
Fill with increasing numbers -// // -// for (size_t i = 0; i < fill_array.size(); i++) { -// fill_array[i] = i; -// } -// write_block(pci_device->hdev, broadcast_bar_offset, fill_array.size() * sizeof (std::uint32_t), fill_array_ptr, m_dma_buf_size); - -// // Check individual locations -// for (uint32_t xi = 0; xi < architecture_implementation->get_t6_x_locations().size(); xi++) { -// for (uint32_t yi = 0; yi < architecture_implementation->get_t6_y_locations().size(); yi++) { -// tt_cxy_pair read_loc(logical_device_id, architecture_implementation->get_t6_x_locations()[xi], architecture_implementation->get_t6_y_locations()[yi]); -// read_vector (fill_array, read_loc, 0, fill_array.size() * sizeof (fill_array[0]) ); -// for (size_t i = 0; i < fill_array.size(); i++) { -// ret_val = (fill_array[i] == i) ? 0 : 1; -// if (ret_val) return ret_val; -// } -// } -// } - -// // b. Test with zeroes -// // -// std::vector fill_array_zeroes (1024, 0); -// uint64_t fill_array_zeroes_ptr = (uint64_t)(&fill_array_zeroes[0]); -// write_block(pci_device->hdev, broadcast_bar_offset, fill_array.size() * sizeof (std::uint32_t), fill_array_zeroes_ptr, m_dma_buf_size); - -// // Check individual locations -// for (uint32_t xi = 0; xi < architecture_implementation->get_t6_x_locations().size(); xi++) { -// for (uint32_t yi = 0; yi < architecture_implementation->get_t6_y_locations().size(); yi++) { -// tt_cxy_pair read_loc(logical_device_id, architecture_implementation->get_t6_x_locations()[xi], architecture_implementation->get_t6_y_locations()[yi]); -// read_vector (fill_array, read_loc, 0, fill_array.size() * sizeof (fill_array_zeroes[0]) ); -// for (size_t i = 0; i < fill_array.size(); i++) { -// ret_val = (fill_array_zeroes[i] == 0) ? 0 : 1; -// if (ret_val) return ret_val; -// } -// } -// } - -// return ret_val; -// } - void tt_SiliconDevice::bar_write32 (int logical_device_id, uint32_t addr, uint32_t data) { TTDevice* dev = get_pci_device(logical_device_id)->hdev; if (addr < dev->bar0_uc_offset) { - write_block (dev, addr, sizeof(data), reinterpret_cast(&data), m_dma_buf_size); + write_block (dev, addr, sizeof(data), reinterpret_cast(&data)); } else { write_regs (dev, addr, 1, &data); } @@ -3046,7 +2419,7 @@ uint32_t tt_SiliconDevice::bar_read32 (int logical_device_id, uint32_t addr) { uint32_t data; if (addr < dev->bar0_uc_offset) { - read_block (dev, addr, sizeof(data), reinterpret_cast(&data), m_dma_buf_size); + read_block (dev, addr, sizeof(data), reinterpret_cast(&data)); } else { read_regs (dev, addr, 1, &data); } @@ -3228,37 +2601,9 @@ void tt_SiliconDevice::enable_local_ethernet_queue(const chip_id_t &device_id, i } } -void *tt_SiliconDevice::channel_address(std::uint32_t offset, const tt_cxy_pair& target) { - log_assert(ndesc->is_chip_mmio_capable(target.chip), "Cannot call channel_address for non-MMIO device"); - struct PCIdevice* pci_device = get_pci_device(target.chip); - auto architecture_implementation = pci_device->hdev->get_architecture_implementation(); - std::uint64_t bar0_offset; - - // Temporary hack for blackhole bringup. - if (arch_name == tt::ARCH::BLACKHOLE) { - // We use BAR4 segment for mapping for Blackhole. 
- log_assert(tlbs_init, "TLBs were not initialized."); - std::int32_t tlb_index = map_core_to_tlb(tt_xy_pair(target.x, target.y)); - auto [tlb_offset, tlb_size] = pci_device->hdev->get_architecture_implementation()->describe_tlb(tlb_index).value(); - - log_assert(pci_device->hdev->bar4_wc != nullptr && tlb_size == BH_4GB_TLB_SIZE, "BAR4 not initialized, or TLBs not initialized properly."); - return static_cast(pci_device->hdev->bar4_wc) + tlb_offset + offset; - } else { - // This hard-codes that we use 16MB TLB #1 onwards for the mapping. - bar0_offset = offset - architecture_implementation->get_dram_channel_0_peer2peer_region_start() - + architecture_implementation->get_dynamic_tlb_16m_base() + architecture_implementation->get_dynamic_tlb_16m_size(); - } - - return static_cast(pci_device->hdev->bar0_wc) + bar0_offset; -} - void *tt_SiliconDevice::host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const { - if (hugepage_mapping.at(src_device_id).at(channel) != nullptr) { return static_cast(hugepage_mapping.at(src_device_id).at(channel)) + offset; - } else if(buf_mapping) { - // we failed when initializing huge pages, we are using a 1MB DMA buffer as a stand-in - return static_cast(buf_mapping) + offset; } else { return nullptr; } @@ -3277,46 +2622,6 @@ std::shared_ptr tt_SiliconDevice::get_mutex(co return hardware_resource_mutex_map.at(mutex_name); } - -std::unordered_map tt_SiliconDevice::get_logical_to_physical_mmio_device_id_map(std::vector physical_device_ids){ - - std::unordered_map logical_to_physical_mmio_device_id_map; - - LOG1("get_logical_to_physical_mmio_device_id_map() -- num_physical_devices: %d\n", physical_device_ids.size()); - - for (int logical_device_idx=0; logical_device_idx < physical_device_ids.size(); logical_device_idx++){ - logical_to_physical_mmio_device_id_map.insert({logical_device_idx, physical_device_ids.at(logical_device_idx)}); - } - - return logical_to_physical_mmio_device_id_map; - -} - - -// Get PCI bus_id info for looking up TT devices in hwloc to find associated CPU package. 
-std::map<chip_id_t, std::string> tt_SiliconDevice::get_physical_device_id_to_bus_id_map(std::vector<chip_id_t> physical_device_ids){
-
-    std::map<chip_id_t, std::string> physical_device_id_to_bus_id_map;
-
-    for (auto &pci_interface_id : physical_device_ids){
-
-        auto ttdev = std::make_unique<TTDevice>(TTDevice::open(pci_interface_id));
-
-        std::ostringstream pci_bsf;
-        pci_bsf << std::hex << std::setw(2) << std::setfill('0') << (int) ttdev->pci_bus << ":";
-        pci_bsf << std::hex << std::setw(2) << std::setfill('0') << (int) ttdev->pci_device << ".";
-        pci_bsf << std::hex << (int) ttdev->pci_function;
-
-        std::string pci_bsf_str = pci_bsf.str();
-        LOG2("get_physical_device_id_to_bus_id_map() -- pci_interface_id: %d BSF: %s\n", pci_interface_id, pci_bsf_str.c_str());
-        physical_device_id_to_bus_id_map.insert({pci_interface_id, pci_bsf_str});
-
-    }
-
-    return physical_device_id_to_bus_id_map;
-
-}
-
 uint64_t tt_SiliconDevice::get_sys_addr(uint32_t chip_x, uint32_t chip_y, uint32_t noc_x, uint32_t noc_y, uint64_t offset) {
     uint64_t result = chip_y;
     uint64_t noc_addr_local_bits_mask = (1UL << eth_interface_params.noc_addr_local_bits) - 1;
@@ -3349,7 +2654,6 @@ bool tt_SiliconDevice::is_non_mmio_cmd_q_full(uint32_t curr_wptr, uint32_t curr_
 *
 * Relevant functions:
 * - write_to_non_mmio_device
-* - rolled_write_to_non_mmio_device
 * - read_from_non_mmio_device
 *
 * The non-MMIO read/write functions (excluding the `*_epoch_cmd` variants) are responsible for the
@@ -3583,282 +2887,6 @@ void tt_SiliconDevice::write_to_non_mmio_device(
     }
 }
-
-
-// Specialized function for small epoch commands:
-// 1) uses separate eth cores than other non-mmio transfers hence does not require mutex
-// 2) does not have the code paths for transfers larger than 32kB (1024 cmds)
-// 3) only reads erisc_q_ptrs_epoch once, or when the queues are full
-// 4) only updates wptr on eth command queues for the last epoch command or when the queue is full or when switching eth cores based on eth-ordered-writes policy, or when
-//    eth-ordered-writes are not supported but current write must be ordered (flush prev wrptr).
-// 5) When eth-ordered-write not supported, allow flush to be used as ordering mechanism when ordering is requested via arg. When eth-ordered-write is supported, always use it
-//    and ensure ordering to same remote chip destinations by always using same remote xfer eth core for a given destination based on noc xy. Must ensure wrptr is flushed on
-//    switch of eth cores, and copy of rdptr/wrptr maintained on host for each eth xfer core.
-void tt_SiliconDevice::write_to_non_mmio_device_send_epoch_cmd(const uint32_t *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t address, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {
-    log_assert(!non_mmio_transfer_cores_customized, "{} cannot be used if ethernet cores for host->cluster transfers are customized. The default Ethernet Core configuration must be used.", __FUNCTION__);
-    using data_word_t = uint32_t;
-    constexpr int DATA_WORD_SIZE = sizeof(data_word_t);
-
-    const auto &mmio_capable_chip = ndesc->get_closest_mmio_capable_chip(core.chip);
-    const auto target_chip = ndesc->get_chip_locations().at(core.chip);
-
-    std::string write_tlb = "LARGE_WRITE_TLB";
-    std::string read_tlb = "LARGE_READ_TLB";
-    std::string empty_tlb = "";
-    translate_to_noc_table_coords(core.chip, core.y, core.x);
-
-    const auto &mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(core.chip);
-    tt_cxy_pair remote_transfer_ethernet_core = remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_epoch];
-
-    // read all eth queue ptrs for the first time, and initialize wrptr_updated bool for strict ordering.
-    if (!erisc_q_ptrs_initialized) {
-        for (int core_epoch = EPOCH_ETH_CORES_START_ID; core_epoch < EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS + EPOCH_ETH_CORES_START_ID; core_epoch++) {
-            erisc_q_ptrs_epoch[core_epoch].reserve(eth_interface_params.remote_update_ptr_size_bytes*2/sizeof(uint32_t));
-            read_device_memory(erisc_q_ptrs_epoch[core_epoch].data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb);
-            erisc_q_wrptr_updated[core_epoch] = false;
-            erisc_q_ptrs_initialized = true;
-        }
-    }
-
-    std::vector<uint32_t> erisc_command(sizeof(routing_cmd_t)/DATA_WORD_SIZE);
-    routing_cmd_t *new_cmd = (routing_cmd_t *)&erisc_command[0];
-    std::vector<uint32_t> data_block;
-
-    // Two mechanisms for ordering depending on eth fw version.
-    if (use_ethernet_ordered_writes) {
-        // Feature in this function to ensure ordering via eth-ordered-writes by using same eth core for all epoch writes to same dest noc xy.
-        auto &soc_desc = get_soc_descriptor(mmio_capable_chip);
-        int core_id = core.x * soc_desc.grid_size.y + core.y;
-        int new_active_core_epoch = (core_id % EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS) + EPOCH_ETH_CORES_START_ID;
-
-        // Switch eth cores, and if wrptr was not flushed to device for previous eth core, do it now.
-        if (new_active_core_epoch != active_core_epoch) {
-            if (!erisc_q_wrptr_updated[active_core_epoch]) {
-                std::vector<uint32_t> erisc_q_wptr = { erisc_q_ptrs_epoch[active_core_epoch][0] };
-                write_device_memory(erisc_q_wptr.data(), erisc_q_wptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, write_tlb);
-                tt_driver_atomics::sfence();
-                erisc_q_wrptr_updated[active_core_epoch] = true;
-            }
-            active_core_epoch = new_active_core_epoch;
-            remote_transfer_ethernet_core = remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_epoch];
-        }
-    } else if (ordered_with_prev_remote_write) {
-        // Flush used as ordering mechanism when eth ordered writes are unsupported. If previous write requires flush,
-        // handle it here before setting flush_non_mmio for the current write.
-        if (!erisc_q_wrptr_updated[active_core_epoch]) {
-            std::vector<uint32_t> erisc_q_wptr = { erisc_q_ptrs_epoch[active_core_epoch][0] };
-            write_device_memory(erisc_q_wptr.data(), erisc_q_wptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, write_tlb);
-            tt_driver_atomics::sfence();
-            erisc_q_wrptr_updated[active_core_epoch] = true;
-        }
-        wait_for_non_mmio_flush();
-    }
-
-    flush_non_mmio = true;
-    uint32_t timestamp = 0; //CMD_TIMESTAMP;
-
-    bool use_dram = size_in_bytes > 256 * DATA_WORD_SIZE ? true : false;
-    uint32_t max_block_size = use_dram ? host_address_params.eth_routing_block_size : eth_interface_params.max_block_size;
-    uint32_t block_size;
-
-    // Ethernet ordered writes must originate from same erisc core, so prevent updating active core here.
-    while (is_non_mmio_cmd_q_full(erisc_q_ptrs_epoch[active_core_epoch][0], erisc_q_ptrs_epoch[active_core_epoch][4])) {
-        if (!use_ethernet_ordered_writes){
-            active_core_epoch++;
-            log_assert(active_core_epoch - EPOCH_ETH_CORES_START_ID >= 0, "Invalid ERISC core for sending epoch commands");
-            active_core_epoch = ((active_core_epoch - EPOCH_ETH_CORES_START_ID) % EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS) + EPOCH_ETH_CORES_START_ID;
-            remote_transfer_ethernet_core = remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_epoch];
-        }
-        read_device_memory(erisc_q_ptrs_epoch[active_core_epoch].data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb);
-    }
-
-    uint32_t req_wr_ptr = erisc_q_ptrs_epoch[active_core_epoch][0] & eth_interface_params.cmd_buf_size_mask;
-    if (address & 0x1F) { // address not 32-byte aligned
-        // can send it in one transfer, no need to break it up
-        log_assert(size_in_bytes == DATA_WORD_SIZE, "Non-mmio cmd queue update is too big");
-        block_size = DATA_WORD_SIZE;
-    } else {
-        // can send it in one transfer, no need to break it up
-        log_assert(size_in_bytes <= max_block_size, "Non-mmio cmd queue update is too big. size_in_bytes: {} exceeds max_block_size: {}", size_in_bytes, max_block_size);
-        block_size = size_in_bytes;
-    }
-    uint32_t req_flags = block_size > DATA_WORD_SIZE ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_req | timestamp) : eth_interface_params.cmd_wr_req;
-    if (use_ethernet_ordered_writes) {
-        req_flags |= eth_interface_params.cmd_ordered;
-    }
-
-    uint32_t resp_flags = block_size > DATA_WORD_SIZE ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_ack) : eth_interface_params.cmd_wr_ack;
-    timestamp = 0;
-
-    uint32_t host_dram_block_addr = host_address_params.eth_routing_buffers_start + (active_core_epoch * eth_interface_params.cmd_buf_size + req_wr_ptr) * max_block_size;
-    uint16_t host_dram_channel = 0; // This needs to be 0, since WH can only map ETH buffers to chan 0.
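The polling loop above and the wptr updates below both depend on is_non_mmio_cmd_q_full(), whose body sits outside this hunk. A minimal sketch of the check it is assumed to perform, given how cmd_buf_size_mask and cmd_buf_ptr_mask are used in this function (pointers range over twice the queue depth so that the full and empty states stay distinguishable):

#include <cstdint>

// Sketch only: pointers advance as ptr = (ptr + 1) & cmd_buf_ptr_mask, where
// cmd_buf_ptr_mask (== 2*depth - 1) carries one wrap bit beyond
// cmd_buf_size_mask (== depth - 1).
static bool cmd_q_full(uint32_t wptr, uint32_t rptr, uint32_t size_mask) {
    // Same slot but opposite wrap bit -> full; exactly equal -> empty.
    return (wptr != rptr) && ((wptr & size_mask) == (rptr & size_mask));
}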
- - // send the data - if (req_flags & eth_interface_params.cmd_data_block) { - // Copy data to sysmem or device DRAM for Block mode - if (use_dram) { - req_flags |= eth_interface_params.cmd_data_block_dram; - resp_flags |= eth_interface_params.cmd_data_block_dram; - size_buffer_to_capacity(data_block, block_size); - memcpy(&data_block[0], mem_ptr, block_size); - write_to_sysmem(data_block, host_dram_block_addr, host_dram_channel, mmio_capable_chip_logical); - } else { - uint32_t buf_address = eth_interface_params.eth_routing_data_buffer_addr + req_wr_ptr * max_block_size; - size_buffer_to_capacity(data_block, block_size); - memcpy(&data_block[0], mem_ptr, block_size); - write_device_memory(data_block.data(), data_block.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, buf_address, write_tlb); - } - tt_driver_atomics::sfence(); - } - - // send the write request - log_assert((req_flags & eth_interface_params.cmd_data_block) ? (address & 0x1F) == 0 : true, "Block mode address must be 32-byte aligned."); - - new_cmd->sys_addr = get_sys_addr(std::get<0>(target_chip), std::get<1>(target_chip), core.x, core.y, address); - new_cmd->rack = get_sys_rack(std::get<2>(target_chip), std::get<3>(target_chip)); - new_cmd->data = req_flags & eth_interface_params.cmd_data_block ? block_size : *mem_ptr; - new_cmd->flags = req_flags; - if (use_dram) { - new_cmd->src_addr_tag = host_dram_block_addr; - } - - write_device_memory(erisc_command.data(), erisc_command.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_routing_cmd_queue_base + (sizeof(routing_cmd_t) * req_wr_ptr), write_tlb); - tt_driver_atomics::sfence(); - - // update the wptr only if the eth queue is full or for the last command - erisc_q_ptrs_epoch[active_core_epoch][0] = (erisc_q_ptrs_epoch[active_core_epoch][0] + 1) & eth_interface_params.cmd_buf_ptr_mask; - if (last_send_epoch_cmd || is_non_mmio_cmd_q_full(erisc_q_ptrs_epoch[active_core_epoch][0], erisc_q_ptrs_epoch[active_core_epoch][4])) { - std::vector erisc_q_wptr = { erisc_q_ptrs_epoch[active_core_epoch][0] }; - write_device_memory(erisc_q_wptr.data(), erisc_q_wptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, write_tlb); - tt_driver_atomics::sfence(); - erisc_q_wrptr_updated[active_core_epoch] = true; - } else { - erisc_q_wrptr_updated[active_core_epoch] = false; - } -} - -/* - * Note that this function is required to acquire the `NON_MMIO_MUTEX_NAME` mutex for interacting with the ethernet core (host) command queue - * DO NOT issue any pcie reads/writes to the ethernet core prior to acquiring the mutex. 
For extra information, see the "NON_MMIO_MUTEX Usage" above - */ -void tt_SiliconDevice::rolled_write_to_non_mmio_device(const uint32_t *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t address, uint32_t unroll_count) { - using data_word_t = uint32_t; - constexpr int DATA_WORD_SIZE = sizeof(data_word_t); - - std::string write_tlb = "LARGE_WRITE_TLB"; - std::string read_tlb = "LARGE_READ_TLB"; - std::string empty_tlb = ""; - translate_to_noc_table_coords(core.chip, core.y, core.x); - - const eth_coord_t target_chip = ndesc->get_chip_locations().at(core.chip); - - - std::vector erisc_command; - std::vector erisc_q_rptr = std::vector(1); - std::vector erisc_q_ptrs = std::vector(eth_interface_params.remote_update_ptr_size_bytes*2 / sizeof(uint32_t)); - - std::vector data_block = std::vector(size_in_bytes / DATA_WORD_SIZE); - - routing_cmd_t *new_cmd; - - flush_non_mmio = true; - uint32_t transfer_size = size_in_bytes * unroll_count; - uint32_t buffer_id = 0; - uint32_t timestamp = 0; //CMD_TIMESTAMP; - - // - // MUTEX ACQUIRE (NON-MMIO) - // do not locate any ethernet core reads/writes before this acquire - // - const auto &mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(core.chip); - - if (non_mmio_transfer_cores_customized) { - log_assert(active_eth_core_idx_per_chip.find(mmio_capable_chip_logical) != active_eth_core_idx_per_chip.end(), "Ethernet Cores for Host to Cluster communication were not initialized for all MMIO devices."); - } - - const scoped_lock lock( - *get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->id)); - - erisc_command.resize(sizeof(routing_cmd_t)/DATA_WORD_SIZE); - new_cmd = (routing_cmd_t *)&erisc_command[0]; - int& active_core_for_txn = non_mmio_transfer_cores_customized ? active_eth_core_idx_per_chip.at(mmio_capable_chip_logical) : active_core; - read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn], eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); - - uint32_t offset = 0; - - bool full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_ptrs[4]); - erisc_q_rptr.resize(1); - erisc_q_rptr[0] = erisc_q_ptrs[4]; - - uint32_t unroll_offset = 0; - - while (offset < transfer_size) { - while (full) { - read_device_memory(erisc_q_rptr.data(), remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn], eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes + eth_interface_params.remote_update_ptr_size_bytes, DATA_WORD_SIZE, read_tlb); - full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0],erisc_q_rptr[0]); - } - //full = true; - // set full only if this command will make the q full. - // otherwise full stays false so that we do not poll the rd pointer in next iteration. - // As long as current command push does not fill up the queue completely, we do not want - // to poll rd pointer in every iteration. 
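Throughout these writers, the counter block read from request_cmd_queue_base + cmd_counters_size_bytes (remote_update_ptr_size_bytes*2 bytes) is indexed at word 0 for the write pointer and word 4 for the remote read pointer, i.e. erisc_q_ptrs[0] and erisc_q_ptrs[4]. A sketch of the device-side layout this indexing assumes; the struct and field names here are illustrative, not from the source:

#include <cstdint>

// Two 16-byte pointer entries, read back-to-back by read_device_memory() above.
struct remote_update_ptr_t {
    uint32_t ptr;      // live counter value
    uint32_t pad[3];   // pads the entry to remote_update_ptr_size_bytes (16)
};
struct eth_cmd_q_counters_t {
    remote_update_ptr_t wptr;  // word 0 -> erisc_q_ptrs[0]
    remote_update_ptr_t rptr;  // word 4 -> erisc_q_ptrs[4]
};
static_assert(sizeof(eth_cmd_q_counters_t) == 32, "matches the 32-byte read");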
- //full = is_non_mmio_cmd_q_full((erisc_q_ptrs[0] + 1) & CMD_BUF_PTR_MASK, erisc_q_rptr[0]); - - log_assert(((address + offset) & 0x1F) == 0, "Base address + offset in incorrect range!"); - - uint32_t req_wr_ptr = erisc_q_ptrs[0] & eth_interface_params.cmd_buf_size_mask; - - uint32_t req_flags = eth_interface_params.cmd_data_block_dram | eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_req; - timestamp = 0; - - uint32_t host_dram_block_addr = host_address_params.eth_routing_buffers_start + (active_core_for_txn * eth_interface_params.cmd_buf_size + req_wr_ptr) * host_address_params.eth_routing_block_size; - uint16_t host_dram_channel = 0; // This needs to be 0, since WH can only map ETH buffers to chan 0. - - memcpy(data_block.data(), mem_ptr, size_in_bytes); - uint32_t byte_increment = data_block.size() * DATA_WORD_SIZE; - uint32_t host_mem_offset = 0; - uint32_t i = 0; - for (i = 0; (i + unroll_offset) < unroll_count; i++) { - if ((host_mem_offset + byte_increment) > host_address_params.eth_routing_block_size) { - break; - } - data_block[0] = i + unroll_offset; - write_to_sysmem(data_block, host_dram_block_addr + host_mem_offset, host_dram_channel, mmio_capable_chip_logical); - host_mem_offset += byte_increment; - } - unroll_offset += i; - tt_driver_atomics::sfence(); - new_cmd->sys_addr = get_sys_addr(std::get<0>(target_chip), std::get<1>(target_chip), core.x, core.y, address + offset); - new_cmd->rack = get_sys_rack(std::get<2>(target_chip), std::get<3>(target_chip)); - new_cmd->data = host_mem_offset; - new_cmd->flags = req_flags; - new_cmd->src_addr_tag = host_dram_block_addr; - - write_device_memory(erisc_command.data(), erisc_command.size() * DATA_WORD_SIZE, remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn], eth_interface_params.request_routing_cmd_queue_base + (sizeof(routing_cmd_t) * req_wr_ptr), write_tlb); - tt_driver_atomics::sfence(); - erisc_q_ptrs[0] = (erisc_q_ptrs[0] + 1) & eth_interface_params.cmd_buf_ptr_mask; - std::vector erisc_q_wptr; - erisc_q_wptr.resize(1); - erisc_q_wptr[0] = erisc_q_ptrs[0]; - write_device_memory(erisc_q_wptr.data(), erisc_q_wptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn], eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, write_tlb); - tt_driver_atomics::sfence(); - offset += host_mem_offset; - - // If there is more data to send and this command will make the q full, switch to next Q. - // otherwise full stays false so that we do not poll the rd pointer in next iteration. - // As long as current command push does not fill up the queue completely, we do not want - // to poll rd pointer in every iteration. - - if (is_non_mmio_cmd_q_full((erisc_q_ptrs[0]) & eth_interface_params.cmd_buf_ptr_mask, erisc_q_rptr[0])) { - active_core_for_txn++; - uint32_t update_mask_for_chip = (remote_transfer_ethernet_cores[mmio_capable_chip_logical].size() - 1); - active_core_for_txn = non_mmio_transfer_cores_customized ? 
(active_core_for_txn & update_mask_for_chip) : ((active_core_for_txn & NON_EPOCH_ETH_CORES_MASK) + NON_EPOCH_ETH_CORES_START_ID); - read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn], eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); - full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_ptrs[4]); - erisc_q_rptr[0] = erisc_q_ptrs[4]; - } - } -} - /* * Note that this function is required to acquire the `NON_MMIO_MUTEX_NAME` mutex for interacting with the ethernet core (host) command queue * DO NOT use `active_core` or issue any pcie reads/writes to the ethernet core prior to acquiring the mutex. For extra information, see the "NON_MMIO_MUTEX Usage" above @@ -4198,7 +3226,7 @@ void tt_SiliconDevice::pcie_broadcast_write(chip_id_t chip, const void* mem_ptr, while(size_in_bytes > 0) { auto [mapped_address, tlb_size] = set_dynamic_tlb_broadcast(pci_device, tlb_index, addr, harvested_coord_translation, start, end, dynamic_tlb_ordering_modes.at(fallback_tlb)); uint64_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size); - write_block(dev, mapped_address, transfer_size, buffer_addr, m_dma_buf_size); + write_block(dev, mapped_address, transfer_size, buffer_addr); size_in_bytes -= transfer_size; addr += transfer_size; @@ -4423,18 +3451,18 @@ int tt_SiliconDevice::remote_arc_msg(int chip, uint32_t msg_code, bool wait_for_ } void tt_SiliconDevice::write_to_sysmem(const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id) { - write_dma_buffer(mem_ptr, size, addr, channel, src_device_id); + write_buffer(mem_ptr, size, addr, channel, src_device_id); } void tt_SiliconDevice::write_to_sysmem(std::vector& vec, uint64_t addr, uint16_t channel, chip_id_t src_device_id) { - write_dma_buffer(vec.data(), vec.size() * sizeof(uint32_t), addr, channel, src_device_id); + write_buffer(vec.data(), vec.size() * sizeof(uint32_t), addr, channel, src_device_id); } void tt_SiliconDevice::read_from_sysmem(void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id) { - read_dma_buffer(mem_ptr, addr, channel, size, src_device_id); + read_buffer(mem_ptr, addr, channel, size, src_device_id); } void tt_SiliconDevice::read_from_sysmem(std::vector &vec, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id) { size_buffer_to_capacity(vec, size); - read_dma_buffer(vec.data(), addr, channel, size, src_device_id); + read_buffer(vec.data(), addr, channel, size, src_device_id); } void tt_SiliconDevice::set_membar_flag(const chip_id_t chip, const std::unordered_set& cores, const uint32_t barrier_value, const uint32_t barrier_addr, const std::string& fallback_tlb) { @@ -4548,7 +3576,7 @@ void tt_SiliconDevice::dram_membar(const chip_id_t chip, const std::string& fall } } -void tt_SiliconDevice::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { +void tt_SiliconDevice::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) { bool target_is_mmio_capable = ndesc -> is_chip_mmio_capable(core.chip); if(target_is_mmio_capable) { if (fallback_tlb == "REG_TLB") { @@ -4556,60 +3584,16 @@ void tt_SiliconDevice::write_to_device(const void *mem_ptr, uint32_t size, tt_cx 
} else { write_device_memory(mem_ptr, size, core, addr, fallback_tlb); } - } - else if (!send_epoch_cmd) { + } else { log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO targets not supported in Blackhole"); log_assert((get_soc_descriptor(core.chip).ethernet_cores).size() > 0 && get_number_of_chips_in_cluster() > 1, "Cannot issue ethernet writes to a single chip cluster!"); write_to_non_mmio_device(mem_ptr, size, core, addr); - } else { - log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO targets not supported in Blackhole"); - // as long as epoch commands are sent single-threaded, no need to acquire mutex - log_assert(!(size % 4), "Epoch commands must be 4 byte aligned!"); - write_to_non_mmio_device_send_epoch_cmd((uint32_t*)mem_ptr, size, core, addr, last_send_epoch_cmd, ordered_with_prev_remote_write); } } - -void tt_SiliconDevice::write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - // Overloaded device writer that accepts a vector - write_to_device(vec.data(), vec.size() * sizeof(uint32_t), core, addr, fallback_tlb, send_epoch_cmd, last_send_epoch_cmd, ordered_with_prev_remote_write); -} - - -void tt_SiliconDevice::write_epoch_cmd_to_device(const uint32_t *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - bool target_is_mmio_capable = ndesc -> is_chip_mmio_capable(core.chip); - if(target_is_mmio_capable) { - write_device_memory(mem_ptr, size_in_bytes, core, addr, fallback_tlb); - } else { - log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO targets not supported in Blackhole"); // MT: Use only dynamic TLBs and never program static - write_to_non_mmio_device_send_epoch_cmd(mem_ptr, size_in_bytes, core, addr, last_send_epoch_cmd, ordered_with_prev_remote_write); - } -} - -void tt_SiliconDevice::write_epoch_cmd_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { +void tt_SiliconDevice::write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) { // Overloaded device writer that accepts a vector - write_epoch_cmd_to_device(vec.data(), vec.size() * sizeof(uint32_t), core, addr, fallback_tlb, last_send_epoch_cmd, ordered_with_prev_remote_write); -} - -void tt_SiliconDevice::rolled_write_to_device(uint32_t* mem_ptr, uint32_t size_in_bytes, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) { - log_assert(!(size_in_bytes % 4), "{} only supports 4-byte aligned data", __FUNCTION__); - bool target_is_mmio_capable = ndesc->is_chip_mmio_capable(core.chip); - - if (target_is_mmio_capable) { - for (int i=0; i 0 && get_number_of_chips_in_cluster() > 1, "Cannot issue ethernet writes to a single chip cluster!"); - rolled_write_to_non_mmio_device(mem_ptr, size_in_bytes, core, addr, unroll_count); - } -} - -void tt_SiliconDevice::rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) { - rolled_write_to_device(vec.data(), vec.size() * sizeof(uint32_t), unroll_count, core, addr, fallback_tlb); + write_to_device(vec.data(), vec.size() * sizeof(uint32_t), core, addr, fallback_tlb); } void tt_SiliconDevice::read_mmio_device_register(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, 
const std::string& fallback_tlb) {
@@ -4909,20 +3893,6 @@ std::uint32_t tt_SiliconDevice::get_host_channel_size(std::uint32_t device_id, s
     return host_channel_size.at(device_id).at(channel);
 }
 
-std::uint32_t tt_SiliconDevice::get_pcie_speed(std::uint32_t device_id) {
-    int link_width = 0;
-    int link_speed = 0;
-    if (ndesc->is_chip_mmio_capable(device_id)) {
-        PCIdevice *pci_device = get_pci_device(device_id);
-        link_width = get_link_width(pci_device->hdev);
-        link_speed = get_link_speed(pci_device->hdev);
-        log_debug(LogSiliconDriver, "Device {} PCIe link width: x{}, speed: {} Gb/s", device_id, link_width, link_speed);
-    } else {
-        log_debug(LogSiliconDriver, "Device {} is NOT a PCIe device, width: x{}, speed: {} Gb/s", device_id, link_width, link_speed);
-    }
-    return (link_width * link_speed);
-}
-
 std::uint32_t tt_SiliconDevice::get_numa_node_for_pcie_device(std::uint32_t device_id) {
     return get_numa_node(get_pci_device(device_id)->hdev);
 }
diff --git a/device/tt_silicon_driver_common.hpp b/device/tt_silicon_driver_common.hpp
index 1649bf70..9f275668 100644
--- a/device/tt_silicon_driver_common.hpp
+++ b/device/tt_silicon_driver_common.hpp
@@ -9,19 +9,6 @@
 #include
 #include
-
-typedef struct {
-    uint32_t chip_addr;
-    uint32_t host_phys_addr;
-    uint32_t completion_flag_phys_addr;
-    uint32_t size_bytes : 28;
-    uint32_t write : 1;
-    uint32_t pcie_msi_on_done : 1;
-    uint32_t pcie_write_on_done : 1;
-    uint32_t trigger : 1;
-    uint32_t repeat;
-} arc_pcie_ctrl_dma_request_t; // 5 * 4 = 20B
-
 enum class TensixSoftResetOptions: std::uint32_t {
     NONE = 0,
     BRISC = ((std::uint32_t) 1 << 11),
diff --git a/device/tt_soc_descriptor.cpp b/device/tt_soc_descriptor.cpp
index 4320b3ef..60958372 100644
--- a/device/tt_soc_descriptor.cpp
+++ b/device/tt_soc_descriptor.cpp
@@ -199,98 +199,20 @@ int tt_SocDescriptor::get_num_dram_channels() const {
     return num_channels;
 }
 
-std::vector<int> tt_SocDescriptor::get_dram_chan_map() {
-    std::vector<int> chan_map;
-    for (unsigned int i = 0; i < dram_cores.size(); i++) {
-        chan_map.push_back(i);
-    }
-    return chan_map;
-};
-
 bool tt_SocDescriptor::is_worker_core(const tt_xy_pair &core) const {
     return (
         routing_x_to_worker_x.find(core.x) != routing_x_to_worker_x.end() &&
         routing_y_to_worker_y.find(core.y) != routing_y_to_worker_y.end());
 }
 
-tt_xy_pair tt_SocDescriptor::get_worker_core(const tt_xy_pair &core) const {
-    tt_xy_pair worker_xy = {
-        static_cast<size_t>(routing_x_to_worker_x.at(core.x)), static_cast<size_t>(routing_y_to_worker_y.at(core.y))};
-    return worker_xy;
-}
-
-tt_xy_pair tt_SocDescriptor::get_routing_core(const tt_xy_pair& core) const {
-    tt_xy_pair routing_xy = {
-        static_cast<size_t>(worker_log_to_routing_x.at(core.x)), static_cast<size_t>(worker_log_to_routing_y.at(core.y))};
-    return routing_xy;
-}
-
 tt_xy_pair tt_SocDescriptor::get_core_for_dram_channel(int dram_chan, int subchannel) const {
     return this->dram_cores.at(dram_chan).at(subchannel);
 };
 
-tt_xy_pair tt_SocDescriptor::get_pcie_core(int pcie_id) const {
-    return this->pcie_cores.at(pcie_id);
-};
-
 bool tt_SocDescriptor::is_ethernet_core(const tt_xy_pair &core) const {
     return this->ethernet_core_channel_map.find(core) != ethernet_core_channel_map.end();
 }
 
-bool tt_SocDescriptor::is_dram_core(const tt_xy_pair &core) const {
-    static std::unordered_set<tt_xy_pair> cores = {};
-    if (cores.empty()) {
-        for (const std::vector<tt_xy_pair> &dram_chan : this->dram_cores) {
-            for (const tt_xy_pair &subchannel : dram_chan) {
-                cores.insert(subchannel);
-            }
-        }
-    }
-    return cores.find(core) != cores.end();
-}
-
-int
tt_SocDescriptor::get_channel_of_ethernet_core(const tt_xy_pair &core) const { - return this->ethernet_core_channel_map.at(core); -} - -int tt_SocDescriptor::get_num_dram_subchans() const { - int num_chan = 0; - for (const std::vector &core : this->dram_cores) { - num_chan += core.size(); - } - return num_chan; -} - -int tt_SocDescriptor::get_num_dram_blocks_per_channel() const { - int num_blocks = 0; - if (arch == tt::ARCH::GRAYSKULL) { - num_blocks = 1; - } else if (arch == tt::ARCH::WORMHOLE) { - num_blocks = 2; - } else if (arch == tt::ARCH::WORMHOLE_B0) { - num_blocks = 2; - } else if (arch == tt::ARCH::BLACKHOLE) { - num_blocks = 2; - } - return num_blocks; -} - -// Note: same as t_SiliconDevice::get_pcie_base_addr_from_device -uint64_t tt_SocDescriptor::get_noc2host_offset(uint16_t host_channel) const { - - const std::uint64_t PEER_REGION_SIZE = (1024 * 1024 * 1024); - - if (arch == tt::ARCH::GRAYSKULL) { - return (host_channel * PEER_REGION_SIZE); - }else if (arch == tt::ARCH::WORMHOLE || arch == tt::ARCH::WORMHOLE_B0) { - return (host_channel * PEER_REGION_SIZE) + 0x800000000; - } else if (arch == tt::ARCH::BLACKHOLE) { - return (host_channel * PEER_REGION_SIZE) + (1ULL << 60); - } else { - throw std::runtime_error("Unsupported architecture"); - } -} - std::ostream &operator<<(std::ostream &out, const tt::ARCH &arch_name) { if (arch_name == tt::ARCH::JAWBRIDGE) { out << "jawbridge"; diff --git a/device/tt_soc_descriptor.h b/device/tt_soc_descriptor.h index 2be98749..87ea1799 100644 --- a/device/tt_soc_descriptor.h +++ b/device/tt_soc_descriptor.h @@ -23,8 +23,6 @@ namespace YAML { class Node; } -static constexpr std::size_t DEFAULT_DRAM_SIZE_PER_CORE = 8 * 1024 * 1024; - std::ostream &operator<<(std::ostream &out, const tt::ARCH &arch_name); static inline std::string get_arch_str(const tt::ARCH arch_name){ @@ -132,18 +130,9 @@ class tt_SocDescriptor { uint64_t dram_bank_size; int get_num_dram_channels() const; - std::vector get_dram_chan_map(); bool is_worker_core(const tt_xy_pair &core) const; - tt_xy_pair get_worker_core(const tt_xy_pair& core) const; - tt_xy_pair get_routing_core(const tt_xy_pair& core) const; tt_xy_pair get_core_for_dram_channel(int dram_chan, int subchannel) const; - tt_xy_pair get_pcie_core(int pcie_id = 0) const; - bool is_dram_core(const tt_xy_pair& core) const; bool is_ethernet_core(const tt_xy_pair& core) const; - int get_channel_of_ethernet_core(const tt_xy_pair &core) const; - int get_num_dram_subchans() const; - int get_num_dram_blocks_per_channel() const; - uint64_t get_noc2host_offset(uint16_t host_channel) const; // Default constructor. Creates uninitialized object with public access to all of its attributes. tt_SocDescriptor() = default; diff --git a/device/wormhole/impl_device.hpp b/device/wormhole/impl_device.hpp deleted file mode 100644 index 227cac48..00000000 --- a/device/wormhole/impl_device.hpp +++ /dev/null @@ -1,82 +0,0 @@ -/* - * SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. - * - * SPDX-License-Identifier: Apache-2.0 - */ - -#pragma once - -#include "device/tt_silicon_driver_common.hpp" - -// See src/t6ifc/t6py/packages/tenstorrent/data/wormhole/pci/tlb.yaml -// local_offset: [ 0, 15, 0, "36-bit address prefix, prepended to the 20 LSBs of issued address to form a 56-bit NOC address. 
The 1MB TLB #n corresponds to the 1MB MMIO range starting at (0x0 + N*0x100000)."] -// x_end : [ 0, 21, 16, "" ] -// y_end : [ 0, 27, 22, "" ] -// x_start : [ 0, 33, 28, "" ] -// y_start : [ 0, 39, 34, "" ] -// noc_sel: [ 0, 40, 40, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 41, 41, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 43, 42, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 44, 44, "linked"] - -// local_offset: [ 0, 14, 0, "35-bit address prefix, prepended to the 21 LSBs of issued address to form a 56-bit NOC address. The 2MB TLB #n corresponds to the 2MB MMIO range starting at (0x9C00000 + N*0x200000)."] -// x_end : [ 0, 20, 15, "" ] -// y_end : [ 0, 26, 21, "" ] -// x_start : [ 0, 32, 27, "" ] -// y_start : [ 0, 38, 33, "" ] -// noc_sel: [ 0, 39, 39, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 40, 40, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 42, 41, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 43, 43, "linked"] - -// local_offset: [ 0, 11, 0, "32-bit address prefix, prepended to the 24 LSBs of issued address to form a 56-bit NOC address. The 16MB TLB #n corresponds to the 16MB MMIO range starting at (0xB000000 + N*0x1000000)."] -// x_end : [ 0, 17, 12, "" ] -// y_end : [ 0, 23, 18, "" ] -// x_start : [ 0, 29, 24, "" ] -// y_start : [ 0, 35, 30, "" ] -// noc_sel: [ 0, 36, 36, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 37, 37, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 39, 38, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 40, 40, "linked"] - -const auto TLB_1M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 16, - .y_end = 22, - .x_start = 28, - .y_start = 34, - .noc_sel = 40, - .mcast = 41, - .ordering = 42, - .linked = 44, - .static_vc = 45, - .static_vc_end = 46 -}; - -const auto TLB_2M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 15, - .y_end = 21, - .x_start = 27, - .y_start = 33, - .noc_sel = 39, - .mcast = 40, - .ordering = 41, - .linked = 43, - .static_vc = 44, - .static_vc_end = 45 -}; - -const auto TLB_16M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 12, - .y_end = 18, - .x_start = 24, - .y_start = 30, - .noc_sel = 36, - .mcast = 37, - .ordering = 38, - .linked = 40, - .static_vc = 41, - .static_vc_end = 42 -}; diff --git a/device/wormhole_implementation.cpp b/device/wormhole/wormhole_implementation.cpp similarity index 98% rename from device/wormhole_implementation.cpp rename to device/wormhole/wormhole_implementation.cpp index 9295e2de..96722311 100644 --- a/device/wormhole_implementation.cpp +++ b/device/wormhole/wormhole_implementation.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "device/wormhole_implementation.h" +#include "wormhole_implementation.h" namespace tt::umd { diff --git a/device/wormhole_implementation.h b/device/wormhole/wormhole_implementation.h similarity index 100% rename from device/wormhole_implementation.h rename to device/wormhole/wormhole_implementation.h diff --git a/tests/blackhole/test_silicon_driver_bh.cpp b/tests/blackhole/test_silicon_driver_bh.cpp index d6c938aa..23816841 100644 --- a/tests/blackhole/test_silicon_driver_bh.cpp +++ b/tests/blackhole/test_silicon_driver_bh.cpp @@ -12,7 +12,7 @@ #include #include -#include "device/blackhole_implementation.h" +#include 
"device/blackhole/blackhole_implementation.h" #include "device/tt_cluster_descriptor.h" #include "tests/test_utils/generate_cluster_desc.hpp" diff --git a/tests/emulation/test_emulation_device.cpp b/tests/emulation/test_emulation_device.cpp index e54fa8f0..aef96112 100644 --- a/tests/emulation/test_emulation_device.cpp +++ b/tests/emulation/test_emulation_device.cpp @@ -3,6 +3,8 @@ #include "device/tt_device.h" #include "device/tt_emulation_device.h" +// DEPRECATED TEST SUITE !!! + TEST(EmulationDeviceGS, BasicEmuTest) { tt_emulation_device device = tt_emulation_device("../../tests/soc_descs/grayskull_10x12.yaml"); tt_device_params default_params; diff --git a/tests/galaxy/test_umd_remote_api_stability.cpp b/tests/galaxy/test_umd_remote_api_stability.cpp index f6bd28e8..ecf99862 100644 --- a/tests/galaxy/test_umd_remote_api_stability.cpp +++ b/tests/galaxy/test_umd_remote_api_stability.cpp @@ -76,13 +76,11 @@ TEST_F(WormholeGalaxyStabilityTestFixture, MixedRemoteTransfers) { 100000 * scale_number_of_tests, seed, - transfer_type_weights_t{.write = 0.40, .rolled_write = 0.2, .read = 0.4, .epoch_cmd_write = 0.0}, + transfer_type_weights_t{.write = 0.40, .read = 0.4}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 30000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 30000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 30000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -108,13 +106,11 @@ TEST_F(WormholeGalaxyStabilityTestFixture, DISABLED_MultithreadedMixedRemoteTran 50000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0.50, .rolled_write = 0., .read = 0.50, .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 0.50, .read = 0.50}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 30000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 30000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 30000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -129,13 +125,11 @@ TEST_F(WormholeGalaxyStabilityTestFixture, DISABLED_MultithreadedMixedRemoteTran 50000 * scale_number_of_tests, 100, - transfer_type_weights_t{.write = 0.25, .rolled_write = 0.25, .read = 0.50, .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 0.25, .read = 0.50}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 30000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 30000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 
0.75, std::uniform_int_distribution(0x4, 30000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -150,13 +144,11 @@ TEST_F(WormholeGalaxyStabilityTestFixture, DISABLED_MultithreadedMixedRemoteTran 50000 * scale_number_of_tests, 23, - transfer_type_weights_t{.write = 0.5, .rolled_write = 0.25, .read = 0.25, .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 0.5, .read = 0.25}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 30000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 30000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 30000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -171,13 +163,11 @@ TEST_F(WormholeGalaxyStabilityTestFixture, DISABLED_MultithreadedMixedRemoteTran 100000 * scale_number_of_tests, 99, - transfer_type_weights_t{.write = 0.1, .rolled_write = 0, .read = 0.1, .epoch_cmd_write = 0.8}, + transfer_type_weights_t{.write = 0.1, .read = 0.1}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, diff --git a/tests/grayskull/test_silicon_driver.cpp b/tests/grayskull/test_silicon_driver.cpp index d8324f13..d890d8a9 100644 --- a/tests/grayskull/test_silicon_driver.cpp +++ b/tests/grayskull/test_silicon_driver.cpp @@ -7,7 +7,7 @@ #include "gtest/gtest.h" #include "tt_device.h" #include "device/tt_soc_descriptor.h" -#include "device/wormhole_implementation.h" +#include "device/wormhole/wormhole_implementation.h" #include "l1_address_map.h" #include "tests/test_utils/generate_cluster_desc.hpp" diff --git a/tests/test_utils/stimulus_generators.hpp b/tests/test_utils/stimulus_generators.hpp index 094f06cb..6d35afb8 100644 --- a/tests/test_utils/stimulus_generators.hpp +++ b/tests/test_utils/stimulus_generators.hpp @@ -36,7 +36,7 @@ namespace tt::umd::test::utils { static const std::string SOC_DESC_PATH = "tests/soc_descs/wormhole_b0_8x10.yaml"; -enum RemoteTransferType : uint8_t { WRITE = 0, ROLLED_WRITE, READ, EPOCH_CMD_WRITE }; +enum RemoteTransferType : uint8_t { WRITE = 0, READ }; template < typename SAMPLE_T, @@ -102,14 +102,6 @@ struct write_transfer_sample_t { std::string tlb_to_use; // (payload.data(), size, destination, address, tlb_to_use, false, false); }; -struct rolled_write_transfer_sample_t { - destination_t destination; - address_t address; - transfer_size_t size_in_bytes; - int unroll_count; - std::string tlb_to_use; - // (payload, 2, destination, address, tlb_to_use); -}; struct read_transfer_sample_t { destination_t destination; address_t address; @@ -117,17 +109,8 @@ struct read_transfer_sample_t { std::string tlb_to_use; // (payload.data(), destination, address, 
size, tlb_to_use); }; -struct write_epoch_cmd_sample_t { - destination_t destination; - address_t address; - transfer_size_t size_in_bytes; - std::string tlb_to_use; - bool last_epoch_command; - bool ordered_with_prev_remote_write; - // (payload.data(), size, destination, address, tlb_to_use, last_epoch_command, ordered_with_prev_remote_write); -}; -using remote_transfer_sample_t = std::tuple>; +using remote_transfer_sample_t = std::tuple>; template < template @@ -267,25 +250,6 @@ template < template class WRITE_SIZE_DISTR_T, - template - class WRITE_EPOCH_CMD_DEST_DISTR_T, - template - class WRITE_EPOCH_CMD_ADDR_DISTR_T, - template - class WRITE_EPOCH_CMD_SIZE_DISTR_T, - class WRITE_EPOCH_CMD_LAST_CMD_DISTR_T, - class WRITE_EPOCH_CMD_ORDERED_DISTR_T, - - template - class ROLLED_WRITE_DEST_DISTR_T, - template - class ROLLED_WRITE_ADDR_DISTR_T, - class ROLLED_WRITE_SIZE_DISTR_OUT_T, - template - class ROLLED_WRITE_SIZE_DISTR_T, - template - class ROLLED_WRITE_UNROLL_DISTR_T, - template class READ_DEST_DISTR_T, template @@ -299,8 +263,6 @@ class TestGenerator { using transfer_type_generator_t = DefaultTransferTypeGenerator; // ConstrainedTemplateTemplateGenerator; using write_command_generator_t = WriteCommandGenerator; - using write_epoch_cmd_command_generator_t = WriteEpochCmdCommandGenerator; - using rolled_write_command_generator_t = RolledWriteCommandGenerator; using read_command_generator_t = ReadCommandGenerator; public: @@ -308,14 +270,10 @@ class TestGenerator { int seed, transfer_type_generator_t const& transfer_type_distribution, write_command_generator_t const& write_command_generator, - rolled_write_command_generator_t const& rolled_write_command_generator, - write_epoch_cmd_command_generator_t const& write_epoch_cmd_command_generator, read_command_generator_t const& read_command_generator) : generator(seed), transfer_type_distribution(transfer_type_distribution), write_command_generator(write_command_generator), - rolled_write_command_generator(rolled_write_command_generator), - write_epoch_cmd_command_generator(write_epoch_cmd_command_generator), read_command_generator(read_command_generator) { } @@ -338,34 +296,6 @@ class TestGenerator { .tlb_to_use = "LARGE_WRITE_TLB"}}; } break; - case RemoteTransferType::ROLLED_WRITE: { - destination_t const& destination = rolled_write_command_generator.destination_generator.generate(); - address_t const& address = rolled_write_command_generator.address_generator.generate(); - transfer_size_t const& size_in_bytes = rolled_write_command_generator.size_generator.generate(); - int unroll_count = rolled_write_command_generator.unroll_generator.generate(); - return {transfer_type, rolled_write_transfer_sample_t{ - .destination = destination, - .address = address, - .size_in_bytes = size_in_bytes, - .unroll_count = unroll_count, - .tlb_to_use = "LARGE_WRITE_TLB"}}; - } break; - - case RemoteTransferType::EPOCH_CMD_WRITE: { - destination_t const& destination = write_epoch_cmd_command_generator.destination_generator.generate(); - address_t const& address = write_epoch_cmd_command_generator.address_generator.generate(); - transfer_size_t const& size_in_bytes = write_epoch_cmd_command_generator.size_generator.generate(); - bool last_epoch_cmd = write_epoch_cmd_command_generator.last_cmd_generator.generate(); - bool ordered_with_prev_remote_write = write_epoch_cmd_command_generator.ordered_generator.generate(); - return {transfer_type, write_epoch_cmd_sample_t{ - .destination = destination, - .address = address, - .size_in_bytes = size_in_bytes, - 
.tlb_to_use = "LARGE_WRITE_TLB", - .last_epoch_command = last_epoch_cmd, - .ordered_with_prev_remote_write = ordered_with_prev_remote_write}}; - } break; - case RemoteTransferType::READ: { destination_t const& destination = read_command_generator.destination_generator.generate(); address_t const& address = read_command_generator.address_generator.generate(); @@ -388,22 +318,17 @@ class TestGenerator { transfer_type_generator_t transfer_type_distribution; write_command_generator_t write_command_generator; - rolled_write_command_generator_t rolled_write_command_generator; - write_epoch_cmd_command_generator_t write_epoch_cmd_command_generator; read_command_generator_t read_command_generator; }; struct transfer_type_weights_t { double write; - double rolled_write; double read; - double epoch_cmd_write; }; static auto address_aligner = [](address_t addr) -> address_t { addr = (((addr - 1) / 32) + 1) * 32; assert(addr % 32 == 0); return addr;}; static auto transfer_size_aligner = [](transfer_size_t size) -> transfer_size_t { size = (((size - 1) / 4) + 1) * 4; assert(size > 0); assert(size % 4 == 0); return size; }; -static auto rolled_write_transfer_size_aligner = [](transfer_size_t size) -> transfer_size_t { size = (((size - 1) / 32) + 1) * 32; assert(size > 0); return size;}; static auto address_aligner_32B = [](transfer_size_t size) -> transfer_size_t { size = (((size - 1) / 32) + 1) * 32; assert(size > 0); return size;}; static auto size_aligner_32B = [](transfer_size_t size) -> transfer_size_t { size = (((size - 1) / 32) + 1) * 32; assert(size > 0); return size;}; template @@ -433,28 +358,12 @@ static void print_command(remote_transfer_sample_t const& command) { << ", y=" << command_args.destination.y << ", x=" << command_args.destination.x << "), address: " << command_args.address << ", size_in_bytes: " << command_args.size_in_bytes << std::endl; } break; - case RemoteTransferType::ROLLED_WRITE: { - rolled_write_transfer_sample_t const& command_args = - std::get(std::get<1>(command)); - std::cout << "Transfer type: ROLLED_WRITE, destination: (c=" << command_args.destination.chip - << ", y=" << command_args.destination.y << ", x=" << command_args.destination.x - << "), address: " << command_args.address << ", size_in_bytes: " << command_args.size_in_bytes - << ", unroll_count: " << command_args.unroll_count << std::endl; - } break; case RemoteTransferType::READ: { read_transfer_sample_t const& command_args = std::get(std::get<1>(command)); std::cout << "Transfer type: READ, destination: (c=" << command_args.destination.chip << ", y=" << command_args.destination.y << ", x=" << command_args.destination.x << "), address: " << command_args.address << ", size_in_bytes: " << command_args.size_in_bytes << std::endl; } break; - case RemoteTransferType::EPOCH_CMD_WRITE: { - write_epoch_cmd_sample_t const& command_args = std::get(std::get<1>(command)); - std::cout << "Transfer type: EPOCH_CMD_WRITE, destination: (c=" << command_args.destination.chip - << ", y=" << command_args.destination.y << ", x=" << command_args.destination.x - << "), address: " << command_args.address << ", size_in_bytes: " << command_args.size_in_bytes - << ", last_cmd: " << (command_args.last_epoch_command ? " True" : "False") - << ", ordered_w_prev_remote_write: " << (command_args.ordered_with_prev_remote_write ? 
" True" : "False") << std::endl; - } break; default: throw std::runtime_error("Invalid transfer type"); }; } @@ -479,14 +388,7 @@ static inline void dispatch_remote_transfer_command( write_transfer_sample_t const& command_args = std::get(std::get<1>(command)); assert(command_args.size_in_bytes >= sizeof(uint32_t)); resize_payload(payload,command_args.size_in_bytes); - driver.write_to_device(payload.data(), bytes_to_words(command_args.size_in_bytes), command_args.destination, command_args.address, command_args. - tlb_to_use, false, false); - } break; - case RemoteTransferType::ROLLED_WRITE: { - rolled_write_transfer_sample_t const& command_args = std::get(std::get<1>(command)); - assert(command_args.size_in_bytes >= sizeof(uint32_t)); - resize_payload(payload,command_args.size_in_bytes); - driver.rolled_write_to_device(payload, command_args.unroll_count, command_args.destination, command_args.address, command_args.tlb_to_use); + driver.write_to_device(payload.data(), bytes_to_words(command_args.size_in_bytes), command_args.destination, command_args.address, command_args.tlb_to_use); } break; case RemoteTransferType::READ: { read_transfer_sample_t const& command_args = std::get(std::get<1>(command)); @@ -494,12 +396,6 @@ static inline void dispatch_remote_transfer_command( resize_payload(payload,command_args.size_in_bytes); driver.read_from_device(payload.data(), command_args.destination, command_args.address, command_args.size_in_bytes, command_args.tlb_to_use); } break; - case RemoteTransferType::EPOCH_CMD_WRITE: { - write_epoch_cmd_sample_t const& command_args = std::get(std::get<1>(command)); - assert(command_args.size_in_bytes >= sizeof(uint32_t)); - resize_payload(payload,command_args.size_in_bytes); - driver.write_epoch_cmd_to_device(payload.data(), bytes_to_words(command_args.size_in_bytes), command_args.destination, command_args.address, command_args.tlb_to_use, command_args.last_epoch_command, command_args.ordered_with_prev_remote_write); - } break; default: throw std::runtime_error("Invalid transfer type"); }; @@ -524,16 +420,9 @@ static void print_command_executable_code(remote_transfer_sample_t const& comman std::cout << "assert(" << command_args.size_in_bytes << " >= sizeof(uint32_t));" << std::endl; emit_bytes_to_words_len_string("len", command_args.size_in_bytes, sizeof(uint32_t)); emit_payload_resize_string(command_args.size_in_bytes, sizeof(uint32_t)); - std::cout << "device->write_to_device(payload.data(), len, destination, " << command_args.address << ", \"" << command_args.tlb_to_use << "\", false, false);" << std::endl; + std::cout << "device->write_to_device(payload.data(), len, destination, " << command_args.address << ", \"" << command_args.tlb_to_use << "\");" << std::endl; // driver.write_to_device(payload.data(), command_args.size, command_args.destination, command_args.address, command_args.tlb_to_use, false, false); } break; - case RemoteTransferType::ROLLED_WRITE: { - rolled_write_transfer_sample_t const& command_args = std::get(std::get<1>(command)); - std::cout << "tt_cxy_pair const& destination = tt_cxy_pair(" << command_args.destination.chip << ", " << command_args.destination.x << ", " << command_args.destination.y << ");" << std::endl; - emit_payload_resize_string(command_args.size_in_bytes, sizeof(uint32_t)); - std::cout << "device->rolled_write_to_device(payload, " << command_args.unroll_count << ", destination, " << command_args.address << ", \"" << command_args.tlb_to_use << "\");" << std::endl; - // driver.rolled_write_to_device(payload, 
command_args.unroll_count, command_args.destination, command_args.address, command_args.tlb_to_use); - } break; case RemoteTransferType::READ: { read_transfer_sample_t const& command_args = std::get(std::get<1>(command)); std::cout << "tt_cxy_pair const& destination = tt_cxy_pair(" << command_args.destination.chip << ", " << command_args.destination.x << ", " << command_args.destination.y << ");" << std::endl; @@ -541,15 +430,6 @@ static void print_command_executable_code(remote_transfer_sample_t const& comman std::cout << "device->read_from_device(payload.data(), destination, " << command_args.address << ", " << command_args.size_in_bytes << ", \"" << command_args.tlb_to_use << "\");" << std::endl; // driver.read_from_device(payload.data(), command_args.destination, command_args.address, command_args.size, command_args.tlb_to_use); } break; - case RemoteTransferType::EPOCH_CMD_WRITE: { - write_epoch_cmd_sample_t const& command_args = std::get(std::get<1>(command)); - std::cout << "tt_cxy_pair const& destination = tt_cxy_pair(" << command_args.destination.chip << ", " << command_args.destination.x << ", " << command_args.destination.y << ");" << std::endl; - emit_payload_resize_string(command_args.size_in_bytes, sizeof(uint32_t)); - emit_bytes_to_words_len_string("len", command_args.size_in_bytes, sizeof(uint32_t)); - std::cout << "device->write_epoch_cmd_to_device(payload.data(), len, destination, " << command_args.address << ", \"" << command_args.tlb_to_use << "\", " << (command_args.last_epoch_command ? "true":"false") - << "\", " << (command_args.ordered_with_prev_remote_write ? "true":"false") << ");" << std::endl; - // driver.write_epoch_cmd_to_device(payload.data(), command_args.size, command_args.destination, command_args.address, command_args.tlb_to_use, command_args.last_epoch_command, command_args.ordered_with_prev_remote_write); - } break; default: throw std::runtime_error("Invalid transfer type"); }; @@ -572,18 +452,6 @@ template< template class WRITE_ADDR_DISTR_T, class WRITE_SIZE_DISTR_OUT_T, template class WRITE_SIZE_DISTR_T, - - template class ROLLED_WRITE_DEST_DISTR_T, - template class ROLLED_WRITE_ADDR_DISTR_T, - class ROLLED_WRITE_SIZE_DISTR_OUT_T, - template class ROLLED_WRITE_SIZE_DISTR_T, - template class ROLLED_WRITE_UNROLL_COUNT_DISTR_T, - - template class WRITE_EPOCH_CMD_DEST_DISTR_T, - template class WRITE_EPOCH_CMD_ADDR_DISTR_T, - template class WRITE_EPOCH_CMD_SIZE_DISTR_T, - class WRITE_EPOCH_CMD_LAST_CMD_DISTR_T, - class WRITE_EPOCH_CMD_ORDERED_DISTR_T, template class READ_DEST_DISTR_T, template class READ_ADDR_DISTR_T, @@ -598,8 +466,6 @@ void RunMixedTransfers( transfer_type_weights_t const& transfer_type_weights, WriteCommandGenerator const& write_command_generator, - RolledWriteCommandGenerator const& rolled_write_command_generator, - WriteEpochCmdCommandGenerator const& write_epoch_cmd_command_generator, ReadCommandGenerator const& read_command_generator, bool record_command_history = false, @@ -609,14 +475,12 @@ void RunMixedTransfers( auto test_generator = TestGenerator( seed, {seed, - {transfer_type_weights.write, transfer_type_weights.rolled_write, transfer_type_weights.read, transfer_type_weights.epoch_cmd_write}, + {transfer_type_weights.write, transfer_type_weights.read}, [](int transfer_type) -> RemoteTransferType { assert(transfer_type < 4); return static_cast(transfer_type); }}, write_command_generator, - rolled_write_command_generator, - write_epoch_cmd_command_generator, read_command_generator); if (record_command_history) { @@ -663,58 
+527,6 @@ static ConstrainedTemplateTemplateGenerator destination_t { return core_index_to_location.at(dest); }); } - -static RolledWriteCommandGenerator < - std::uniform_int_distribution, - std::uniform_int_distribution, - transfer_size_t, - std::uniform_int_distribution, - std::uniform_int_distribution -> - build_dummy_rolled_write_command_generator(tt_SiliconDevice &device) { - tt_ClusterDescriptor *cluster_desc = device.get_cluster_description(); - tt_SocDescriptor const& soc_desc = device.get_virtual_soc_descriptors().at(0); - std::vector core_index_to_location = generate_core_index_locations(*cluster_desc, soc_desc); - auto dest_generator = ConstrainedTemplateTemplateGenerator( - 0, - std::uniform_int_distribution(0, core_index_to_location.size() - 1), - [core_index_to_location](int dest) -> destination_t { return core_index_to_location.at(dest); }); - auto addr_generator_32B_aligned = ConstrainedTemplateTemplateGenerator(0, std::uniform_int_distribution(0,0), address_aligner_32B); - auto rolled_write_size_generator = ConstrainedTemplateTemplateGenerator( - 0, std::uniform_int_distribution(0,0), rolled_write_transfer_size_aligner); - auto unroll_count_generator = ConstrainedTemplateTemplateGenerator( - 0, std::uniform_int_distribution(0,0), [](int unroll_count) -> int { return unroll_count; }); - - return RolledWriteCommandGenerator( - dest_generator, addr_generator_32B_aligned, rolled_write_size_generator, unroll_count_generator); -} - -static WriteEpochCmdCommandGenerator < - std::uniform_int_distribution, - std::uniform_int_distribution, - std::uniform_int_distribution, - std::bernoulli_distribution, - std::bernoulli_distribution -> build_dummy_write_epoch_cmd_command_generator(tt_SiliconDevice &device) { - tt_ClusterDescriptor *cluster_desc = device.get_cluster_description(); - tt_SocDescriptor const& soc_desc = device.get_virtual_soc_descriptors().at(0); - std::vector core_index_to_location = generate_core_index_locations(*cluster_desc, soc_desc); - auto dest_generator = ConstrainedTemplateTemplateGenerator( - 0, - std::uniform_int_distribution(0, core_index_to_location.size() - 1), - [core_index_to_location](int dest) -> destination_t { return core_index_to_location.at(dest); }); - auto addr_generator_32B_aligned = ConstrainedTemplateTemplateGenerator(0, std::uniform_int_distribution(0,0), address_aligner_32B); - auto write_epoch_cmd_generator = ConstrainedTemplateTemplateGenerator( - 0, std::uniform_int_distribution(0,0), transfer_size_aligner); - auto last_epoch_cmd_generator = ConstrainedTemplateGenerator( - 0, std::bernoulli_distribution(1), [](bool last_epoch_cmd) -> bool { return last_epoch_cmd; }); - auto ordered_generator = ConstrainedTemplateGenerator( - 0, std::bernoulli_distribution(1), [](bool ordered_with_prev_remote_write) -> bool { return ordered_with_prev_remote_write; }); - - return WriteEpochCmdCommandGenerator( - dest_generator, addr_generator_32B_aligned, write_epoch_cmd_generator, last_epoch_cmd_generator, ordered_generator); -} - static WriteCommandGenerator< std::uniform_int_distribution, std::uniform_int_distribution, @@ -764,10 +576,6 @@ template< template class WRITE_SIZE_GENERATOR_T, template - class ROLLED_WRITE_SIZE_GENERATOR_T, - template - class WRITE_EPOCH_CMD_SIZE_GENERATOR_T, - template class READ_SIZE_GENERATOR_T, template class UNROLL_COUNT_GENERATOR_T @@ -780,9 +588,7 @@ void RunMixedTransfersUniformDistributions( transfer_type_weights_t const& transfer_type_weights, ADDR_GENERATOR_T const& address_distribution, WRITE_SIZE_GENERATOR_T const& 
write_size_distribution, - ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution, - WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, float percent_not_last_epoch_cmd, float percent_not_remote_ordered, READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -802,12 +608,8 @@ void RunMixedTransfersUniformDistributions( auto addr_generator_32B_aligned = ConstrainedTemplateTemplateGenerator(seed + 1, address_distribution, address_aligner_32B); auto write_size_generator = ConstrainedTemplateTemplateGenerator( seed + 2, write_size_distribution, transfer_size_aligner); - auto rolled_write_size_generator = ConstrainedTemplateTemplateGenerator( - seed + 2, rolled_write_size_distribution, rolled_write_transfer_size_aligner); auto read_size_generator = ConstrainedTemplateTemplateGenerator( seed + 2, read_size_distribution, transfer_size_aligner); - auto write_epoch_cmd_generator = ConstrainedTemplateTemplateGenerator( - seed + 2, write_epoch_cmd_size_distribution, transfer_size_aligner); auto last_epoch_cmd_generator = ConstrainedTemplateGenerator( seed + 3, std::bernoulli_distribution(percent_not_last_epoch_cmd), [](bool last_epoch_cmd) -> bool { return last_epoch_cmd; }); auto ordered_generator = ConstrainedTemplateGenerator( @@ -823,9 +625,6 @@ void RunMixedTransfersUniformDistributions( transfer_type_weights, WriteCommandGenerator(dest_generator, addr_generator, write_size_generator), - RolledWriteCommandGenerator(dest_generator, addr_generator_32B_aligned, rolled_write_size_generator, unroll_count_generator), - WriteEpochCmdCommandGenerator( - dest_generator, addr_generator_32B_aligned, write_epoch_cmd_generator, last_epoch_cmd_generator, ordered_generator), ReadCommandGenerator(dest_generator, addr_generator, read_size_generator), record_command_history, diff --git a/tests/wormhole/test_silicon_driver_wh.cpp b/tests/wormhole/test_silicon_driver_wh.cpp index df686dfa..6551b3cc 100644 --- a/tests/wormhole/test_silicon_driver_wh.cpp +++ b/tests/wormhole/test_silicon_driver_wh.cpp @@ -13,7 +13,7 @@ #include "host_mem_address_map.h" #include "device/tt_cluster_descriptor.h" -#include "device/wormhole_implementation.h" +#include "device/wormhole/wormhole_implementation.h" #include "tests/test_utils/generate_cluster_desc.hpp" void set_params_for_remote_txn(tt_SiliconDevice& device) { diff --git a/tests/wormhole/test_umd_remote_api_stability.cpp b/tests/wormhole/test_umd_remote_api_stability.cpp index 36c02914..96fef09a 100644 --- a/tests/wormhole/test_umd_remote_api_stability.cpp +++ b/tests/wormhole/test_umd_remote_api_stability.cpp @@ -73,13 +73,11 @@ TEST_F(WormholeNebulaX2TestFixture, MixedRemoteTransfersMediumSmall) { 100000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0.25, .rolled_write = 0.25, .read = 0.25, .epoch_cmd_write = 0.25}, + transfer_type_weights_t{.write = 0.25, .read = 0.25}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& 
diff --git a/tests/wormhole/test_umd_remote_api_stability.cpp b/tests/wormhole/test_umd_remote_api_stability.cpp
index 36c02914..96fef09a 100644
--- a/tests/wormhole/test_umd_remote_api_stability.cpp
+++ b/tests/wormhole/test_umd_remote_api_stability.cpp
@@ -73,13 +73,11 @@ TEST_F(WormholeNebulaX2TestFixture, MixedRemoteTransfersMediumSmall) {
         100000 * scale_number_of_tests,
         0,
-        transfer_type_weights_t{.write = 0.25, .rolled_write = 0.25, .read = 0.25, .epoch_cmd_write = 0.25},
+        transfer_type_weights_t{.write = 0.25, .read = 0.25},
         std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution
         std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution,
-        std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution,
         std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution
-        std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution,
         0.75,
         0.75,
         std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution,
@@ -108,13 +106,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersMediumSmall
         100000 * scale_number_of_tests,
         0,
-        transfer_type_weights_t{.write = 0.50, .rolled_write = 0., .read = 0.50, .epoch_cmd_write = 0.},
+        transfer_type_weights_t{.write = 0.50, .read = 0.50},
         std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution
         std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution,
-        std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution,
         std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution
-        std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution,
         0.75,
         0.75,
         std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution,
@@ -129,13 +125,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersMediumSmall
         100000 * scale_number_of_tests,
         100,
-        transfer_type_weights_t{.write = 0.25, .rolled_write = 0.25, .read = 0.50, .epoch_cmd_write = 0.},
+        transfer_type_weights_t{.write = 0.25, .read = 0.50},
         std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution
         std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution,
-        std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution,
         std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution
-        std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution,
         0.75,
         0.75,
         std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution,
@@ -150,13 +144,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersMediumSmall
         100000 * scale_number_of_tests,
         23,
-        transfer_type_weights_t{.write = 0.5, .rolled_write = 0.25, .read = 0.25, .epoch_cmd_write = 0.},
+        transfer_type_weights_t{.write = 0.5, .read = 0.25},
         std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution
         std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution,
-        std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution,
         std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution
-        std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution,
         0.75,
         0.75,
         std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution,
@@ -171,13 +163,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersMediumSmall
         100000 * scale_number_of_tests,
         99,
-        transfer_type_weights_t{.write = 1.0, .rolled_write = 0, .read = 0.0, .epoch_cmd_write = 0.0},
+        transfer_type_weights_t{.write = 1.0, .read = 0.0},
         std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution
         std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution,
-        std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution,
         std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution
-        std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution,
         0.75,
         0.75,
         std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution,
@@ -206,13 +196,11 @@ TEST_F(WormholeNebulaX2TestFixture, MixedRemoteTransfersLarge) {
         10000 * scale_number_of_tests,
         0,
-        transfer_type_weights_t{.write = 0.15, .rolled_write = 0, .read = 0.15, .epoch_cmd_write = 0.7},
+        transfer_type_weights_t{.write = 0.15, .read = 0.15},
         std::uniform_int_distribution(0x10000, 0x200000), // address generator distribution
         std::uniform_int_distribution(0x4, 300000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution,
-        std::uniform_int_distribution(0x4, 300000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution,
         std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution
-        std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution,
         0.75,
         0.75,
         std::uniform_int_distribution(0x4, 300000), //READ_SIZE_GENERATOR_T const& read_size_distribution,
@@ -247,11 +235,9 @@ TEST_F(WormholeNebulaX2TestFixture, WritesOnlyNormalDistributionMean10kStd3kMinS
         10000 * scale_number_of_tests,
         0,
-        transfer_type_weights_t{.write = 1., .rolled_write = 0., .read = 0., .epoch_cmd_write = 0.},
+        transfer_type_weights_t{.write = 1., .read = 0.},
         WriteCommandGenerator(dest_generator, address_generator, write_size_generator),
-        build_dummy_rolled_write_command_generator(*device),
-        build_dummy_write_epoch_cmd_command_generator(*device),
         build_dummy_read_command_generator(*device),
         false, // Set to true if you want to emit the command history code to command line
@@ -279,13 +265,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLMS) {
         100000 * scale_number_of_tests,
         0,
-        transfer_type_weights_t{.write = 0.50, .rolled_write = 0., .read = 0.50, .epoch_cmd_write = 0.},
+        transfer_type_weights_t{.write = 0.50, .read = 0.50},
         std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution
         std::uniform_int_distribution(4, 300000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution,
-        std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution,
         std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution
-        std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution,
         0.75,
         0.75,
         std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution,
@@ -300,13 +284,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLMS) {
         100000 * scale_number_of_tests,
         100,
-        transfer_type_weights_t{.write = 0.25, .rolled_write = 0.25, .read = 0.50, .epoch_cmd_write = 0.},
+        transfer_type_weights_t{.write = 0.25, .read = 0.50},
         std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution
         std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution,
-        std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution,
         std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution
-        std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution,
         0.75,
         0.75,
         std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution,
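One thing the surviving hunks make visible: the weights are relative rather than normalized (.write = 0.25 vs .read = 0.50 above). Assuming transfer_type_weights_t now carries only the two remaining members — the struct's own definition is presumably changed elsewhere in this patch and is not shown in this excerpt — the selection the weights drive can be pictured with std::discrete_distribution. This is an illustration of the semantics only, not the harness's actual selection code:

#include <iostream>
#include <random>

// Sketch only: a two-member weights struct matching the designated
// initializers used by the tests; the real definition is not in this excerpt.
struct transfer_type_weights_t {
    double write = 0.;
    double read = 0.;
};

int main() {
    transfer_type_weights_t w{.write = 0.25, .read = 0.50};  // weights from the seed-100 hunk above
    std::mt19937 rng(100);                                   // the tests seed each thread explicitly
    std::discrete_distribution<int> pick({w.write, w.read}); // normalizes the relative weights
    int writes = 0, reads = 0;
    for (int i = 0; i < 90000; ++i) (pick(rng) == 0 ? writes : reads)++;
    std::cout << "writes=" << writes << " reads=" << reads << "\n";  // roughly 1:2
    return 0;
}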
@@ -321,13 +303,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLMS) {
         100000 * scale_number_of_tests,
         23,
-        transfer_type_weights_t{.write = 0.5, .rolled_write = 0.25, .read = 0.25, .epoch_cmd_write = 0.},
+        transfer_type_weights_t{.write = 0.5, .read = 0.25},
         std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution
         std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution,
-        std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution,
         std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution
-        std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution,
         0.75,
         0.75,
         std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution,
@@ -342,13 +322,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLMS) {
         100000 * scale_number_of_tests,
         99,
-        transfer_type_weights_t{.write = 1.0, .rolled_write = 0, .read = 0.0, .epoch_cmd_write = 0.0},
+        transfer_type_weights_t{.write = 1.0, .read = 0.0},
         std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution
         std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution,
-        std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution,
         std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution
-        std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution,
         0.75,
         0.75,
         std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution,
@@ -387,11 +365,9 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLargeWrites
         10000 * scale_number_of_tests,
         0,
-        transfer_type_weights_t{.write = 1., .rolled_write = 0., .read = 0., .epoch_cmd_write = 0.},
+        transfer_type_weights_t{.write = 1., .read = 0.},
         WriteCommandGenerator(dest_generator, address_generator, write_size_generator),
-        build_dummy_rolled_write_command_generator(*device),
-        build_dummy_write_epoch_cmd_command_generator(*device),
         build_dummy_read_command_generator(*device),
         false, // Set to true if you want to emit the command history code to command line
@@ -404,11 +380,9 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLargeWrites
         10000 * scale_number_of_tests,
         0,
-        transfer_type_weights_t{.write = 1., .rolled_write = 0., .read = 0., .epoch_cmd_write = 0.},
+        transfer_type_weights_t{.write = 1., .read = 0.},
         WriteCommandGenerator(dest_generator, address_generator, write_size_generator),
-        build_dummy_rolled_write_command_generator(*device),
-        build_dummy_write_epoch_cmd_command_generator(*device),
         build_dummy_read_command_generator(*device),
         false, // Set to true if you want to emit the command history code to command line
@@ -421,11 +395,9 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLargeWrites
         10000 * scale_number_of_tests,
         0,
-        transfer_type_weights_t{.write = 0, .rolled_write = 0., .read = 1., .epoch_cmd_write = 0.},
+        transfer_type_weights_t{.write = 0, .read = 1.},
         build_dummy_write_command_generator(*device),
-        build_dummy_rolled_write_command_generator(*device),
-        build_dummy_write_epoch_cmd_command_generator(*device),
         ReadCommandGenerator(dest_generator, address_generator, read_size_generator),
         false, // Set to true if you want to emit the command history code to command line
@@ -438,11 +410,9 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLargeWrites
         10000 * scale_number_of_tests,
         0,
-        transfer_type_weights_t{.write = 0, .rolled_write = 0., .read = 1., .epoch_cmd_write = 0.},
+        transfer_type_weights_t{.write = 0, .read = 1.},
         build_dummy_write_command_generator(*device),
-        build_dummy_rolled_write_command_generator(*device),
-        build_dummy_write_epoch_cmd_command_generator(*device),
         ReadCommandGenerator(dest_generator, address_generator, read_size_generator),
         false, // Set to true if you want to emit the command history code to command line
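The two bare 0.75 arguments that survive every call are percent_not_last_epoch_cmd and percent_not_remote_ordered; the @@ -802,12 +608,8 @@ hunk keeps last_epoch_cmd_generator, a std::bernoulli_distribution over percent_not_last_epoch_cmd, as unchanged context, so those probabilities are still drawn after this change. A standalone illustration of that draw, with names copied from the hunk and everything else assumed:

#include <iostream>
#include <random>

int main() {
    std::mt19937 rng(3);  // mirrors the `seed + 3` offset used for last_epoch_cmd_generator above
    std::bernoulli_distribution not_last_epoch_cmd(0.75);  // p = percent_not_last_epoch_cmd
    int hits = 0;
    for (int i = 0; i < 10000; ++i) hits += not_last_epoch_cmd(rng) ? 1 : 0;
    std::cout << "true fraction: " << hits / 10000.0 << "\n";  // converges near 0.75
    return 0;
}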