diff --git a/.github/workflows/on-pr-opt.yml b/.github/workflows/build-and-run-all-tests.yml similarity index 90% rename from .github/workflows/on-pr-opt.yml rename to .github/workflows/build-and-run-all-tests.yml index 18364734..ff75e6c2 100644 --- a/.github/workflows/on-pr-opt.yml +++ b/.github/workflows/build-and-run-all-tests.yml @@ -1,10 +1,12 @@ -# Optional PR checks -name: On PR - Optional +# Build and then run all tests, on all supported archs. +name: Build and run all tests on: workflow_dispatch: pull_request: branches: ["main"] + push: + branches: ["main"] jobs: build-tests: diff --git a/.github/workflows/build-clients.yml b/.github/workflows/build-clients.yml new file mode 100644 index 00000000..a2502367 --- /dev/null +++ b/.github/workflows/build-clients.yml @@ -0,0 +1,56 @@ +name: Build clients on newest UMD + +on: + workflow_dispatch: + inputs: + timeout: + required: true + description: 'The timeout for the job in minutes' + type: number + default: 30 + pull_request: + branches: ["main"] + push: + branches: ["main"] + +jobs: + build-tt-metal: + # Due to parsing bug, fromJSON is used to convert string to number. + # In pull_request or push events, the input context is not available, stating the default again here. 
+ timeout-minutes: ${{ fromJSON(inputs.timeout || '30') }} + strategy: + fail-fast: false + matrix: + arch_name: [grayskull, wormhole_b0, blackhole] + + name: Build tt-metal for ${{ matrix.arch_name }} with newest UMD + runs-on: ubuntu-20.04 + container: + image: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:latest + options: --user root + + steps: + - name: Checkout client repo + uses: actions/checkout@v4 + with: + # Clone under tt-metal directory + path: tt-metal + repository: tenstorrent/tt-metal + submodules: recursive + lfs: 'true' + + - name: Checkout UMD + uses: actions/checkout@v4 + with: + # Clone directly into tt-metal directory for umd + path: tt-metal/tt_metal/third_party/umd + submodules: recursive + lfs: 'true' + + - name: Build tt-metal + run: | + cd tt-metal + export ARCH_NAME=${{ matrix.arch_name }} + export TT_METAL_HOME=$(pwd) + export PYTHONPATH=$(pwd) + ./build_metal.sh diff --git a/.github/workflows/build-device.yml b/.github/workflows/build-device.yml index 5a8c0648..335dd2c0 100644 --- a/.github/workflows/build-device.yml +++ b/.github/workflows/build-device.yml @@ -1,19 +1,19 @@ # Builds device. # Build is performed on all supported OS versions. -name: Build Target +name: Build Device on: - workflow_call: - inputs: - timeout: - required: true - type: number workflow_dispatch: inputs: timeout: required: true - description: 'The timeout for the build job in minutes' + description: 'The timeout for the job in minutes' type: number + default: 15 + pull_request: + branches: ["main"] + push: + branches: ["main"] env: BUILD_TARGET: device @@ -25,7 +25,9 @@ env: jobs: build: - timeout-minutes: ${{ inputs.timeout }} + # Due to parsing bug, fromJSON is used to convert string to number. + # In pull_request or push events, the input context is not available, stating the default again here. 
+ timeout-minutes: ${{ fromJSON(inputs.timeout || '15') }} strategy: fail-fast: false matrix: diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml index 3c21f65d..5affd5c2 100644 --- a/.github/workflows/build-image.yml +++ b/.github/workflows/build-image.yml @@ -3,11 +3,17 @@ name: Build and Publish Docker Image on: workflow_dispatch: - workflow_call: + inputs: + timeout: + required: true + description: 'The timeout for the job in minutes' + type: number + default: 15 jobs: build: - timeout-minutes: 15 + # Due to parsing bug, fromJSON is used to convert string to number + timeout-minutes: ${{ fromJSON(inputs.timeout) }} strategy: fail-fast: false matrix: diff --git a/.github/workflows/build-tests.yml b/.github/workflows/build-tests.yml index 5edd35eb..3916e4bf 100644 --- a/.github/workflows/build-tests.yml +++ b/.github/workflows/build-tests.yml @@ -33,10 +33,12 @@ env: LIB_OUTPUT_DIR: ./build/lib DEPS_OUTPUT_DIR: ./build/_deps TEST_OUTPUT_DIR: ./build/test + CLUSTER_DESCRIPTORS_DIR: ./tests/api/cluster_descriptor_examples jobs: build: - timeout-minutes: ${{ inputs.timeout }} + # Due to parsing bug, fromJSON is used to convert string to number + timeout-minutes: ${{ fromJSON(inputs.timeout) }} strategy: fail-fast: false matrix: @@ -74,7 +76,8 @@ jobs: run: | tar cvf artifact.tar ${{ env.TEST_OUTPUT_DIR }} \ ${{ env.LIB_OUTPUT_DIR }} \ - ${{ env.DEPS_OUTPUT_DIR }} + ${{ env.DEPS_OUTPUT_DIR }} \ + ${{ env.CLUSTER_DESCRIPTORS_DIR }} - name: Upload build artifacts archive uses: actions/upload-artifact@v4 diff --git a/.github/workflows/on-pr.yml b/.github/workflows/on-pr.yml deleted file mode 100644 index 158026dd..00000000 --- a/.github/workflows/on-pr.yml +++ /dev/null @@ -1,18 +0,0 @@ -# Mandatory PR checks -name: On PR - -on: - workflow_dispatch: - pull_request: - branches: ["main"] - -jobs: - build-all: - secrets: inherit - uses: ./.github/workflows/build-device.yml - with: - timeout: 15 - - pre-commit: - secrets: inherit - uses: 
./.github/workflows/pre-commit.yml diff --git a/.github/workflows/on-push.yml b/.github/workflows/on-push.yml deleted file mode 100644 index 673be510..00000000 --- a/.github/workflows/on-push.yml +++ /dev/null @@ -1,54 +0,0 @@ -name: On Push - -on: - workflow_dispatch: - push: - branches: ["main"] - -jobs: - build-all: - secrets: inherit - uses: ./.github/workflows/build-device.yml - with: - timeout: 15 - - pre-commit: - secrets: inherit - uses: ./.github/workflows/pre-commit.yml - - build-tests: - secrets: inherit - strategy: - fail-fast: false - matrix: - test-group: [ - # Enable once we have functional cards with specified architecture. - {arch: grayskull}, - {arch: wormhole_b0}, - # {arch: blackhole}, - ] - uses: ./.github/workflows/build-tests.yml - with: - arch: ${{ matrix.test-group.arch }} - timeout: 15 - - test-all: - secrets: inherit - needs: build-tests - strategy: - fail-fast: false - matrix: - test-group: [ - # Enable once we have functional cards. - {arch: grayskull, card: e75, timeout: 10}, - {arch: grayskull, card: e150, timeout: 10}, - {arch: grayskull, card: e300, timeout: 10}, - {arch: wormhole_b0, card: n150, timeout: 5}, - {arch: wormhole_b0, card: n300, timeout: 15}, - # {arch: blackhole}, - ] - uses: ./.github/workflows/run-tests.yml - with: - arch: ${{ matrix.test-group.arch }} - card: ${{ matrix.test-group.card }} - timeout: ${{ matrix.test-group.timeout }} diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index a4ecb678..c4b2b9a0 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -2,8 +2,11 @@ name: Run Pre-commit Hooks on: - workflow_call: workflow_dispatch: + pull_request: + branches: ["main"] + push: + branches: ["main"] jobs: pre-commit: diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 98b2a526..e9d1e6b2 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -45,7 +45,8 @@ env: jobs: test: - 
timeout-minutes: ${{ inputs.timeout }} + # Due to parsing bug, fromJSON is used to convert string to number + timeout-minutes: ${{ fromJSON(inputs.timeout) }} strategy: fail-fast: false matrix: diff --git a/.github/workflows/test-runner.yaml b/.github/workflows/test-runner.yaml index 74a4d6bf..c871c773 100644 --- a/.github/workflows/test-runner.yaml +++ b/.github/workflows/test-runner.yaml @@ -2,10 +2,17 @@ name: Check runner on: workflow_dispatch: + inputs: + timeout: + required: true + description: 'The timeout for the job in minutes' + type: number + default: 10 jobs: check-runners-host: - timeout-minutes: 10 + # Due to parsing bug, fromJSON is used to convert string to number + timeout-minutes: ${{ fromJSON(inputs.timeout) }} strategy: fail-fast: false matrix: @@ -52,7 +59,8 @@ jobs: du -h --max-depth=1 | sort -rh check-runners-docker: - timeout-minutes: 10 + # Due to parsing bug, fromJSON is used to convert string to number + timeout-minutes: ${{ fromJSON(inputs.timeout) }} strategy: fail-fast: false matrix: diff --git a/common/disjoint_set.hpp b/common/disjoint_set.hpp new file mode 100644 index 00000000..b2187173 --- /dev/null +++ b/common/disjoint_set.hpp @@ -0,0 +1,42 @@ +/* + * SPDX-FileCopyrightText: (c) 2024 Tenstorrent Inc. + * + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include +#include + +// A standard disjoint set data structure to track connected components. 
+template +class DisjointSet { +public: + void add_item(T item) { parent[item] = item; } + + int get_set(T item) { + while (parent[item] != item) { + item = parent[item]; + } + return item; + } + + void merge(T item1, T item2) { + T set1 = get_set(item1); + T set2 = get_set(item2); + parent[set1] = set2; + } + + bool are_same_set(T item1, T item2) { return get_set(item1) == get_set(item2); } + + int get_num_sets() { + std::unordered_set sets; + for (auto [item, _] : parent) { + sets.insert(get_set(item)); + } + return sets.size(); + } + +private: + std::unordered_map parent; +}; diff --git a/common/utils.hpp b/common/utils.hpp new file mode 100644 index 00000000..b8cba9f5 --- /dev/null +++ b/common/utils.hpp @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: (c) 2024 Tenstorrent Inc. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include +#include + +namespace tt::umd::utils { + +std::string get_abs_path(std::string path) { + // Note that __FILE__ might be resolved at compile time to an absolute or relative address, depending on the + // compiler. 
+ std::filesystem::path current_file_path = std::filesystem::path(__FILE__); + std::filesystem::path umd_root; + if (current_file_path.is_absolute()) { + umd_root = current_file_path.parent_path().parent_path(); + } else { + std::filesystem::path umd_root_relative = + std::filesystem::relative(std::filesystem::path(__FILE__).parent_path().parent_path(), "../"); + umd_root = std::filesystem::canonical(umd_root_relative); + } + std::filesystem::path abs_path = umd_root / path; + return abs_path.string(); +} + +} // namespace tt::umd::utils diff --git a/device/.clang-format b/device/.clang-format deleted file mode 100644 index 9d159247..00000000 --- a/device/.clang-format +++ /dev/null @@ -1,2 +0,0 @@ -DisableFormat: true -SortIncludes: false diff --git a/device/api/umd/device/architecture_implementation.h b/device/api/umd/device/architecture_implementation.h index f157dc8e..60ce5368 100644 --- a/device/api/umd/device/architecture_implementation.h +++ b/device/api/umd/device/architecture_implementation.h @@ -12,8 +12,8 @@ #include #include "umd/device/tlb.h" -#include "umd/device/xy_pair.h" #include "umd/device/tt_arch_types.h" +#include "umd/device/xy_pair.h" struct tt_driver_host_address_params; struct tt_driver_eth_interface_params; @@ -22,7 +22,7 @@ struct tt_driver_noc_params; namespace tt::umd { class architecture_implementation { - public: +public: virtual ~architecture_implementation() = default; virtual tt::ARCH get_architecture() const = 0; @@ -65,7 +65,8 @@ class architecture_implementation { virtual std::tuple multicast_workaround(xy_pair start, xy_pair end) const = 0; virtual tlb_configuration get_tlb_configuration(uint32_t tlb_index) const = 0; virtual std::optional> describe_tlb(std::int32_t tlb_index) const = 0; - virtual std::pair get_tlb_data(std::uint32_t tlb_index, const tlb_data& data) const = 0; + virtual std::pair get_tlb_data( + std::uint32_t tlb_index, const tlb_data& data) const = 0; virtual tt_driver_host_address_params 
get_host_address_params() const = 0; virtual tt_driver_eth_interface_params get_eth_interface_params() const = 0; diff --git a/device/api/umd/device/blackhole_implementation.h b/device/api/umd/device/blackhole_implementation.h index 9fdf819c..3ff3493b 100644 --- a/device/api/umd/device/blackhole_implementation.h +++ b/device/api/umd/device/blackhole_implementation.h @@ -7,10 +7,10 @@ #pragma once #include +#include #include "umd/device/architecture_implementation.h" #include "umd/device/tlb.h" -#include namespace tt::umd { @@ -59,30 +59,8 @@ enum class arc_message_type { // DEVICE_DATA static constexpr std::array DRAM_LOCATIONS = { - {{0, 0}, - {0, 1}, - {0, 11}, - {0, 2}, - {0, 10}, - {0, 3}, - {0, 9}, - {0, 4}, - {0, 8}, - {0, 5}, - {0, 7}, - {0, 6}, - {9, 0}, - {9, 1}, - {9, 11}, - {9, 2}, - {9, 10}, - {9, 3}, - {9, 9}, - {9, 4}, - {9, 8}, - {9, 5}, - {9, 7}, - {9, 6}}}; + {{0, 0}, {0, 1}, {0, 11}, {0, 2}, {0, 10}, {0, 3}, {0, 9}, {0, 4}, {0, 8}, {0, 5}, {0, 7}, {0, 6}, + {9, 0}, {9, 1}, {9, 11}, {9, 2}, {9, 10}, {9, 3}, {9, 9}, {9, 4}, {9, 8}, {9, 5}, {9, 7}, {9, 6}}}; static constexpr std::array ARC_LOCATIONS = {{{8, 0}}}; static constexpr std::array PCI_LOCATIONS = {{{11, 0}}}; @@ -113,14 +91,14 @@ static constexpr uint32_t BROADCAST_TLB_INDEX = 0; // TODO: Copied from worm static constexpr uint32_t STATIC_TLB_CFG_ADDR = 0x1fc00000; static constexpr uint32_t TLB_COUNT_2M = 202; -static constexpr uint32_t TLB_BASE_2M = 0; // 0 in BAR0 +static constexpr uint32_t TLB_BASE_2M = 0; // 0 in BAR0 static constexpr uint32_t TLB_BASE_INDEX_2M = 0; static constexpr uint32_t TLB_2M_SIZE = 2 * 1024 * 1024; static constexpr uint32_t TLB_CFG_REG_SIZE_BYTES = 12; static constexpr uint32_t TLB_COUNT_4G = 8; -static constexpr uint32_t TLB_BASE_4G = 0; // 0 in BAR4 +static constexpr uint32_t TLB_BASE_4G = 0; // 0 in BAR4 static constexpr uint32_t TLB_BASE_INDEX_4G = TLB_COUNT_2M; static constexpr uint64_t TLB_4G_SIZE = 4ULL * 1024ULL * 1024ULL * 1024ULL; static constexpr 
uint64_t DYNAMIC_TLB_4G_SIZE = TLB_4G_SIZE; @@ -168,59 +146,108 @@ static constexpr uint32_t MSG_TYPE_SETUP_IATU_FOR_PEER_TO_PEER = 0x97; } // namespace blackhole class blackhole_implementation : public architecture_implementation { - public: +public: tt::ARCH get_architecture() const override { return tt::ARCH::BLACKHOLE; } + uint32_t get_arc_message_arc_get_harvesting() const override { return static_cast(blackhole::arc_message_type::ARC_GET_HARVESTING); } + uint32_t get_arc_message_arc_go_busy() const override { return static_cast(blackhole::arc_message_type::ARC_GO_BUSY); } + uint32_t get_arc_message_arc_go_long_idle() const override { return static_cast(blackhole::arc_message_type::ARC_GO_LONG_IDLE); } + uint32_t get_arc_message_arc_go_short_idle() const override { return static_cast(blackhole::arc_message_type::ARC_GO_SHORT_IDLE); } + uint32_t get_arc_message_deassert_riscv_reset() const override { return static_cast(blackhole::arc_message_type::DEASSERT_RISCV_RESET); } + uint32_t get_arc_message_get_aiclk() const override { return static_cast(blackhole::arc_message_type::GET_AICLK); } + uint32_t get_arc_message_setup_iatu_for_peer_to_peer() const override { return static_cast(blackhole::arc_message_type::SETUP_IATU_FOR_PEER_TO_PEER); } + uint32_t get_arc_message_test() const override { return static_cast(blackhole::arc_message_type::TEST); } - uint32_t get_arc_csm_mailbox_offset() const override { throw std::runtime_error("Not supported for Blackhole arch"); return 0; } + + uint32_t get_arc_csm_mailbox_offset() const override { + throw std::runtime_error("Not supported for Blackhole arch"); + return 0; + } + uint32_t get_arc_reset_arc_misc_cntl_offset() const override { return blackhole::ARC_RESET_ARC_MISC_CNTL_OFFSET; } + uint32_t get_arc_reset_scratch_offset() const override { return blackhole::ARC_RESET_SCRATCH_OFFSET; } + uint32_t get_dram_channel_0_peer2peer_region_start() const override { return blackhole::DRAM_CHANNEL_0_PEER2PEER_REGION_START; } + 
uint32_t get_dram_channel_0_x() const override { return blackhole::DRAM_CHANNEL_0_X; } + uint32_t get_dram_channel_0_y() const override { return blackhole::DRAM_CHANNEL_0_Y; } + uint32_t get_broadcast_tlb_index() const override { return blackhole::BROADCAST_TLB_INDEX; } + uint32_t get_dynamic_tlb_2m_base() const override { return blackhole::DYNAMIC_TLB_2M_BASE; } + uint32_t get_dynamic_tlb_2m_size() const override { return blackhole::DYNAMIC_TLB_2M_SIZE; } - uint32_t get_dynamic_tlb_16m_base() const override { throw std::runtime_error("No 16MB TLBs for Blackhole arch"); return 0; } - uint32_t get_dynamic_tlb_16m_size() const override { throw std::runtime_error("No 16MB TLBs for Blackhole arch"); return 0; } - uint32_t get_dynamic_tlb_16m_cfg_addr() const override { throw std::runtime_error("No 16MB TLBs for Blackhole arch"); return 0; } + + uint32_t get_dynamic_tlb_16m_base() const override { + throw std::runtime_error("No 16MB TLBs for Blackhole arch"); + return 0; + } + + uint32_t get_dynamic_tlb_16m_size() const override { + throw std::runtime_error("No 16MB TLBs for Blackhole arch"); + return 0; + } + + uint32_t get_dynamic_tlb_16m_cfg_addr() const override { + throw std::runtime_error("No 16MB TLBs for Blackhole arch"); + return 0; + } + uint32_t get_mem_large_read_tlb() const override { return blackhole::MEM_LARGE_READ_TLB; } + uint32_t get_mem_large_write_tlb() const override { return blackhole::MEM_LARGE_WRITE_TLB; } + uint32_t get_static_tlb_cfg_addr() const override { return blackhole::STATIC_TLB_CFG_ADDR; } - uint32_t get_static_tlb_size() const override { return blackhole::STATIC_TLB_SIZE; } + + uint32_t get_static_tlb_size() const override { return blackhole::STATIC_TLB_SIZE; } + uint32_t get_reg_tlb() const override { return blackhole::REG_TLB; } - uint32_t get_tlb_base_index_16m() const override { throw std::runtime_error("No 16MB TLBs for Blackhole arch"); return 0; } + + uint32_t get_tlb_base_index_16m() const override { + throw 
std::runtime_error("No 16MB TLBs for Blackhole arch"); + return 0; + } + uint32_t get_tensix_soft_reset_addr() const override { return blackhole::TENSIX_SOFT_RESET_ADDR; } + uint32_t get_grid_size_x() const override { return blackhole::GRID_SIZE_X; } + uint32_t get_grid_size_y() const override { return blackhole::GRID_SIZE_Y; } + uint32_t get_tlb_cfg_reg_size_bytes() const override { return blackhole::TLB_CFG_REG_SIZE_BYTES; } + uint32_t get_small_read_write_tlb() const override { return blackhole::MEM_SMALL_READ_WRITE_TLB; } + const std::vector& get_harvesting_noc_locations() const override { return blackhole::HARVESTING_NOC_LOCATIONS; } + const std::vector& get_t6_x_locations() const override { return blackhole::T6_X_LOCATIONS; } + const std::vector& get_t6_y_locations() const override { return blackhole::T6_Y_LOCATIONS; } std::tuple multicast_workaround(xy_pair start, xy_pair end) const override; @@ -231,7 +258,6 @@ class blackhole_implementation : public architecture_implementation { tt_driver_host_address_params get_host_address_params() const override; tt_driver_eth_interface_params get_eth_interface_params() const override; tt_driver_noc_params get_noc_params() const override; - }; } // namespace tt::umd diff --git a/device/api/umd/device/cluster.h b/device/api/umd/device/cluster.h index b387ea15..8e97fc79 100644 --- a/device/api/umd/device/cluster.h +++ b/device/api/umd/device/cluster.h @@ -8,21 +8,20 @@ #include #include #include +#include #include #include #include -#include -#include "umd/device/tt_soc_descriptor.h" -#include "umd/device/tt_xy_pair.h" -#include "umd/device/tt_silicon_driver_common.hpp" -#include "umd/device/tt_cluster_descriptor_types.h" +#include "fmt/core.h" +#include "tt_silicon_driver_common.hpp" +#include "tt_soc_descriptor.h" +#include "tt_xy_pair.h" +#include "umd/device/pci_device.hpp" #include "umd/device/tlb.h" +#include "umd/device/tt_cluster_descriptor_types.h" #include "umd/device/tt_io.hpp" -#include 
"umd/device/pci_device.hpp" -#include "fmt/core.h" - using TLB_DATA = tt::umd::tlb_data; // TODO: Remove this - it's here for Metal backwards compatibility. @@ -30,29 +29,32 @@ using TLB_DATA = tt::umd::tlb_data; tt::ARCH detect_arch(int pci_device_num); tt::ARCH detect_arch(); -namespace boost::interprocess{ - class named_mutex; +namespace boost::interprocess { +class named_mutex; } class tt_ClusterDescriptor; -enum tt_DevicePowerState { - BUSY, - SHORT_IDLE, - LONG_IDLE -}; +enum tt_DevicePowerState { BUSY, SHORT_IDLE, LONG_IDLE }; enum tt_MemBarFlag { SET = 0xaa, RESET = 0xbb, }; -inline std::ostream &operator <<(std::ostream &os, const tt_DevicePowerState power_state) { +inline std::ostream& operator<<(std::ostream& os, const tt_DevicePowerState power_state) { switch (power_state) { - case tt_DevicePowerState::BUSY: os << "Busy"; break; - case tt_DevicePowerState::SHORT_IDLE: os << "SHORT_IDLE"; break; - case tt_DevicePowerState::LONG_IDLE: os << "LONG_IDLE"; break; - default: throw ("Unknown DevicePowerState"); + case tt_DevicePowerState::BUSY: + os << "Busy"; + break; + case tt_DevicePowerState::SHORT_IDLE: + os << "SHORT_IDLE"; + break; + case tt_DevicePowerState::LONG_IDLE: + os << "LONG_IDLE"; + break; + default: + throw("Unknown DevicePowerState"); } return os; } @@ -116,20 +118,22 @@ struct tt_version { std::uint16_t major = 0xffff; std::uint8_t minor = 0xff; std::uint8_t patch = 0xff; + tt_version() {} + tt_version(std::uint16_t major_, std::uint8_t minor_, std::uint8_t patch_) { major = major_; minor = minor_; patch = patch_; } + tt_version(std::uint32_t version) { major = (version >> 16) & 0xff; minor = (version >> 12) & 0xf; patch = version & 0xfff; } - std::string str() const { - return fmt::format("{}.{}.{}", major, minor, patch); - } + + std::string str() const { return fmt::format("{}.{}.{}", major, minor, patch); } }; struct tt_device_params { @@ -140,29 +144,32 @@ struct tt_device_params { bool init_device = true; bool early_open_device = 
false; int aiclk = 0; + // The command-line input for vcd_dump_cores can have the following format: // {"*-2", "1-*", "*-*", "1-2"} // '*' indicates we must dump all the cores in that dimension. // This function takes the vector above and unrolles the coords with '*' in one or both dimensions. std::vector unroll_vcd_dump_cores(tt_xy_pair grid_size) const { std::vector unrolled_dump_core; - for (auto &dump_core: vcd_dump_cores) { + for (auto& dump_core : vcd_dump_cores) { // If the input is a single *, then dump all cores. if (dump_core == "*") { for (size_t x = 0; x < grid_size.x; x++) { - for (size_t y = 0; y < grid_size.y; y++) { - std::string current_core_coord = fmt::format("{}-{}", x, y); - if (std::find(std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == std::end(unrolled_dump_core)) { - unrolled_dump_core.push_back(current_core_coord); + for (size_t y = 0; y < grid_size.y; y++) { + std::string current_core_coord = fmt::format("{}-{}", x, y); + if (std::find( + std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == + std::end(unrolled_dump_core)) { + unrolled_dump_core.push_back(current_core_coord); + } } } - } continue; } // Each core coordinate must contain three characters: "core.x-core.y". assert(dump_core.size() <= 5); size_t delimiter_pos = dump_core.find('-'); - assert (delimiter_pos != std::string::npos); // y-dim should exist in core coord. + assert(delimiter_pos != std::string::npos); // y-dim should exist in core coord. 
std::string core_dim_x = dump_core.substr(0, delimiter_pos); size_t core_dim_y_start = delimiter_pos + 1; @@ -172,7 +179,9 @@ struct tt_device_params { for (size_t x = 0; x < grid_size.x; x++) { for (size_t y = 0; y < grid_size.y; y++) { std::string current_core_coord = fmt::format("{}-{}", x, y); - if (std::find(std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == std::end(unrolled_dump_core)) { + if (std::find( + std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == + std::end(unrolled_dump_core)) { unrolled_dump_core.push_back(current_core_coord); } } @@ -180,14 +189,16 @@ struct tt_device_params { } else if (core_dim_x == "*") { for (size_t x = 0; x < grid_size.x; x++) { std::string current_core_coord = fmt::format("{}-{}", x, core_dim_y); - if (std::find(std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == std::end(unrolled_dump_core)) { + if (std::find(std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == + std::end(unrolled_dump_core)) { unrolled_dump_core.push_back(current_core_coord); } } } else if (core_dim_y == "*") { for (size_t y = 0; y < grid_size.y; y++) { std::string current_core_coord = fmt::format("{}-{}", core_dim_x, y); - if (std::find(std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == std::end(unrolled_dump_core)) { + if (std::find(std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == + std::end(unrolled_dump_core)) { unrolled_dump_core.push_back(current_core_coord); } } @@ -199,10 +210,9 @@ struct tt_device_params { } std::vector expand_plusargs() const { - std::vector all_plusargs { + std::vector all_plusargs{ fmt::format("+enable_perf_scoreboard={}", enable_perf_scoreboard), - fmt::format("+register_monitor={}", register_monitor) - }; + fmt::format("+register_monitor={}", register_monitor)}; all_plusargs.insert(all_plusargs.end(), plusargs.begin(), 
plusargs.end()); @@ -216,18 +226,18 @@ struct tt_device_params { * Exposes a generic interface to callers, providing declarations for virtual functions defined differently for Silicon. * Valid usage consists of declaring a tt_device object and initializing it to Silicon backend. * Using tt_device itself will throw errors, since its APIs are undefined. - */ -class tt_device -{ - public: - tt_device(const std::string& sdesc_path); + */ +class tt_device { +public: + tt_device(); virtual ~tt_device(); + // Setup/Teardown Functions /** * Set L1 Address Map parameters used by UMD to communicate with the TT Device. * * @param l1_address_params_ All the L1 parameters required by UMD - */ + */ virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) { throw std::runtime_error("---- tt_device::set_device_l1_address_params is not implemented\n"); } @@ -240,9 +250,9 @@ class tt_device * Set Host Address Map parameters used by UMD to communicate with the TT Device (used for remote transactions). * * @param host_address_params_ All the Host Address space parameters required by UMD. - */ - [[deprecated("Using unnecessary function.")]] - virtual void set_driver_host_address_params(const tt_driver_host_address_params& host_address_params_) { + */ + [[deprecated("Using unnecessary function.")]] virtual void set_driver_host_address_params( + const tt_driver_host_address_params& host_address_params_) { throw std::runtime_error("---- tt_device::set_driver_host_address_params is not implemented\n"); } @@ -250,9 +260,9 @@ class tt_device * Set ERISC Firmware parameters used by UMD to communicate with the TT Device (used for remote transactions). * * @param eth_interface_params_ All the Ethernet Firmware parameters required by UMD. 
- */ - [[deprecated("Using unnecessary function.")]] - virtual void set_driver_eth_interface_params(const tt_driver_eth_interface_params& eth_interface_params_) { + */ + [[deprecated("Using unnecessary function.")]] virtual void set_driver_eth_interface_params( + const tt_driver_eth_interface_params& eth_interface_params_) { throw std::runtime_error("---- tt_device::set_driver_eth_interface_params is not implemented\n"); } @@ -264,8 +274,13 @@ class tt_device * @param tlb_index TLB id that will be programmed. * @param address Start address TLB is mapped to. * @param ordering Ordering mode for the TLB. - */ - virtual void configure_tlb(chip_id_t logical_device_id, tt_xy_pair core, std::int32_t tlb_index, std::int32_t address, uint64_t ordering = TLB_DATA::Relaxed) { + */ + virtual void configure_tlb( + chip_id_t logical_device_id, + tt_xy_pair core, + std::int32_t tlb_index, + std::int32_t address, + uint64_t ordering = TLB_DATA::Relaxed) { throw std::runtime_error("---- tt_device::configure_tlb is not implemented\n"); } @@ -274,45 +289,51 @@ class tt_device * * @param fallback_tlb Dynamic TLB being targeted. * @param ordering Ordering mode for the TLB. - */ + */ virtual void set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, uint64_t ordering = TLB_DATA::Posted) { throw std::runtime_error("---- tt_device::set_fallback_tlb_ordering_mode is not implemented\n"); } - + /** - * Give UMD a 1:1 function mapping a core to its appropriate static TLB (currently only support a single TLB per core). + * Give UMD a 1:1 function mapping a core to its appropriate static TLB (currently only support a single TLB per + * core). * * @param logical_device_id MMIO chip being targeted. * @param mapping_function Function which maps core to TLB index. 
*/ - virtual void setup_core_to_tlb_map(const chip_id_t logical_device_id, std::function mapping_function) { + virtual void setup_core_to_tlb_map( + const chip_id_t logical_device_id, std::function mapping_function) { throw std::runtime_error("---- tt_device::setup_core_to_tlb_map is not implemented\n"); } /** - * Pass in ethernet cores with active links for a specific MMIO chip. When called, this function will force UMD to use a subset of cores from the active_eth_cores_per_chip set for all host->cluster - * non-MMIO transfers. If this function is not called, UMD will use a default set of ethernet core indices for these transfers (0 through 5). - * If default behaviour is not desired, this function must be called for all MMIO devices. + * Pass in ethernet cores with active links for a specific MMIO chip. When called, this function will force UMD to + * use a subset of cores from the active_eth_cores_per_chip set for all host->cluster non-MMIO transfers. If this + * function is not called, UMD will use a default set of ethernet core indices for these transfers (0 through 5). If + * default behaviour is not desired, this function must be called for all MMIO devices. * * @param mmio_chip Device being targeted. * @param active_eth_cores_per_chip The active ethernet cores for this chip. 
*/ - virtual void configure_active_ethernet_cores_for_mmio_device(chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip) { - throw std::runtime_error("---- tt_device::configure_active_ethernet_cores_for_mmio_device is not implemented\n"); + virtual void configure_active_ethernet_cores_for_mmio_device( + chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip) { + throw std::runtime_error( + "---- tt_device::configure_active_ethernet_cores_for_mmio_device is not implemented\n"); } /** - * On Silicon: Assert soft Tensix reset, deassert RiscV reset, set power state to busy (ramp up AICLK), initialize iATUs for PCIe devices and ethernet queues for remote chips. + * On Silicon: Assert soft Tensix reset, deassert RiscV reset, set power state to busy (ramp up AICLK), initialize + * iATUs for PCIe devices and ethernet queues for remote chips. * * @param device_params Object specifying initialization configuration. */ - virtual void start_device(const tt_device_params &device_params) { + virtual void start_device(const tt_device_params& device_params) { throw std::runtime_error("---- tt_device::start_device is not implemented\n"); } /** * Broadcast deassert soft Tensix Reset to the entire device (to be done after start_device is called). - */ + */ virtual void deassert_risc_reset() { throw std::runtime_error("---- tt_device::deassert_risc_reset is not implemented\n"); } @@ -321,14 +342,15 @@ class tt_device * Send a soft deassert reset signal to a single tensix core. * * @param core Chip and core being targeted. 
- */ - virtual void deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets = TENSIX_DEASSERT_SOFT_RESET) { + */ + virtual void deassert_risc_reset_at_core( + tt_cxy_pair core, const TensixSoftResetOptions& soft_resets = TENSIX_DEASSERT_SOFT_RESET) { throw std::runtime_error("---- tt_device::deassert_risc_reset_at_core is not implemented\n"); } /** * Broadcast assert soft Tensix Reset to the entire device. - */ + */ virtual void assert_risc_reset() { throw std::runtime_error("---- tt_device::assert_risc_reset is not implemented\n"); } @@ -337,7 +359,7 @@ class tt_device * Send a soft assert reset signal to a single tensix core. * * @param core Chip and core being targeted. - */ + */ virtual void assert_risc_reset_at_core(tt_cxy_pair core) { throw std::runtime_error("---- tt_device::assert_risc_reset_at_core is not implemented\n"); } @@ -345,17 +367,15 @@ class tt_device /** * To be called at the end of a run. * Set power state to idle, assert tensix reset at all cores. - */ - virtual void close_device() { - throw std::runtime_error("---- tt_device::close_device is not implemented\n"); - } + */ + virtual void close_device() { throw std::runtime_error("---- tt_device::close_device is not implemented\n"); } // Runtime functions /** * Non-MMIO (ethernet) barrier. - * Similar to an mfence for host -> host transfers. Will flush all in-flight ethernet transactions before proceeding with the next one. - * This will be applied to all chips in the cluster. - */ + * Similar to an mfence for host -> host transfers. Will flush all in-flight ethernet transactions before proceeding + * with the next one. This will be applied to all chips in the cluster. + */ virtual void wait_for_non_mmio_flush() { throw std::runtime_error("---- tt_device::wait_for_non_mmio_flush is not implemented\n"); } @@ -377,12 +397,20 @@ class tt_device * @param addr Address to write to. * @param tlb_to_use Specifies fallback/dynamic TLB to use. 
*/ - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { + virtual void write_to_device( + const void* mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { // Only implement this for Silicon Backend throw std::runtime_error("---- tt_device::write_to_device is not implemented\n"); } - virtual void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& columns_to_exclude, const std::string& fallback_tlb) { + virtual void broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& columns_to_exclude, + const std::string& fallback_tlb) { throw std::runtime_error("---- tt_device::broadcast_write_to_cluster is not implemented\n"); } @@ -395,44 +423,54 @@ class tt_device * @param size Number of bytes to read. * @param fallback_tlb Specifies fallback/dynamic TLB to use. */ - virtual void read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { + virtual void read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { // Only implement this for Silicon Backend throw std::runtime_error("---- tt_device::read_from_device is not implemented\n"); } /** * Write uint32_t vector to specified address and channel on host (defined for Silicon). - * + * * @param vec Data to write. * @param addr Address to write to. * @param channel Host channel to target. * @param src_device_id Chip to target. 
*/ - virtual void write_to_sysmem(const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id) { + virtual void write_to_sysmem( + const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id) { throw std::runtime_error("---- tt_device::write_to_sysmem is not implemented\n"); } - virtual void read_from_sysmem(void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id) { + + virtual void read_from_sysmem( + void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id) { throw std::runtime_error("---- tt_device::read_from_sysmem is not implemented\n"); } - virtual void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}) { + + virtual void l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}) { throw std::runtime_error("---- tt_device::l1_membar is not implemented\n"); } - virtual void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels = {}) { + + virtual void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels = {}) { throw std::runtime_error("---- tt_device::dram_membar is not implemented\n"); } - virtual void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}) { + + virtual void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}) { throw std::runtime_error("---- tt_device::dram_membar is not implemented\n"); } // Misc. Functions to Query/Set Device State /** - * Query post harvesting SOC descriptors from UMD in virtual coordinates. + * Query post harvesting SOC descriptors from UMD in virtual coordinates. * These descriptors should be used for looking up cores that are passed into UMD APIs. 
*/ virtual std::unordered_map& get_virtual_soc_descriptors() { throw std::runtime_error("---- tt_device:get_virtual_soc_descriptors is not implemented\n"); } - + /** * Determine if UMD performed harvesting on SOC descriptors. */ @@ -440,18 +478,18 @@ class tt_device throw std::runtime_error("---- tt_device:using_harvested_soc_descriptors is not implemented\n"); return 0; } - + /** * Get harvesting masks for all chips/SOC Descriptors in the cluster. * Each mask represents a map of enabled (0) and disabled (1) rows on a specific chip (in NOC0 Coordinateds). - */ + */ virtual std::unordered_map get_harvesting_masks_for_soc_descriptors() { throw std::runtime_error("---- tt_device:get_harvesting_masks_for_soc_descriptors is not implemented\n"); } /** * Issue message to device, meant to be picked up by ARC firmware. - * + * * @param logical_device_id Chip to target. * @param msg_code Specifies type of ARC message. * @param wait_for_done Block until ARC responds. @@ -460,8 +498,16 @@ class tt_device * @param timeout Timeout on ARC. * @param return3 Return value from ARC. * @param return4 Return value from ARC. - */ - virtual int arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr) { + */ + virtual int arc_msg( + int logical_device_id, + uint32_t msg_code, + bool wait_for_done = true, + uint32_t arg0 = 0, + uint32_t arg1 = 0, + int timeout = 1, + uint32_t* return_3 = nullptr, + uint32_t* return_4 = nullptr) { throw std::runtime_error("---- tt_device::arc_msg is not implemented\n"); } @@ -471,28 +517,28 @@ class tt_device * @param device_id Chip to target. * @param r Row coordinate. * @param c Column coordinate. 
- */ - virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c) { + */ + virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) { throw std::runtime_error("---- tt_device::translate_to_noc_table_coords is not implemented\n"); } /** * Get the total number of chips in the cluster based on the network descriptor. - */ + */ virtual int get_number_of_chips_in_cluster() { throw std::runtime_error("---- tt_device::get_number_of_chips_in_cluster is not implemented\n"); } /** * Get the logical ids for all chips in the cluster - */ + */ virtual std::unordered_set get_all_chips_in_cluster() { throw std::runtime_error("---- tt_device::get_all_chips_in_cluster is not implemented\n"); } /** * Get cluster descriptor object being used in UMD instance. - */ + */ virtual tt_ClusterDescriptor* get_cluster_description() { throw std::runtime_error("---- tt_device::get_cluster_description is not implemented\n"); } @@ -514,9 +560,9 @@ class tt_device /** * Get clock frequencies for all MMIO devices targeted by UMD. */ - virtual std::map get_clocks() { + virtual std::map get_clocks() { throw std::runtime_error("---- tt_device::get_clocks is not implemented\n"); - return std::map(); + return std::map(); } virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id) { @@ -534,7 +580,7 @@ class tt_device * Query number of DRAM channels on a specific device. * * @param device_id Logical device id to query. - */ + */ virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id) { throw std::runtime_error("---- tt_device::get_num_dram_channels is not implemented\n"); return 0; @@ -542,10 +588,10 @@ class tt_device /** * Get size for a specific DRAM channel on a device. - * + * * @param device_id Device to target. * @param channel DRAM channel to target. 
- */ + */ virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) { throw std::runtime_error("---- tt_device::get_dram_channel_size is not implemented\n"); return 0; @@ -555,7 +601,7 @@ class tt_device * Query number of Host channels (hugepages) allocated for a specific device. * * @param device_id Logical device id to target. - */ + */ virtual std::uint32_t get_num_host_channels(std::uint32_t device_id) { throw std::runtime_error("---- tt_device::get_num_host_channels is not implemented\n"); return 0; @@ -566,20 +612,21 @@ class tt_device * * @param device_id Logical device id to target. * @param channel Logical host channel to target. - */ + */ virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) { throw std::runtime_error("---- tt_device::get_host_channel_size is not implemented\n"); return 0; } /** - * Get absolute address corresponding to a zero based offset into a specific host memory channel for a specific device. - * + * Get absolute address corresponding to a zero based offset into a specific host memory channel for a specific + * device. + * * @param offset Offset wrt the start of the channel's address space. - * @param src_device_id Device to target. + * @param src_device_id Device to target. * @param channel Host memory channel. 
 */ - virtual void *host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const { + virtual void* host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const { throw std::runtime_error("---- tt_device::host_dma_address is not implemented\n"); return nullptr; } @@ -588,28 +635,28 @@ class tt_device throw std::runtime_error("---- tt_device::get_pcie_base_addr_from_device is not implemented\n"); return 0; } + const tt_SocDescriptor& get_soc_descriptor(chip_id_t chip_id) const; bool performed_harvesting = false; std::unordered_map harvested_rows_per_target = {}; bool translation_tables_en = false; - protected: +protected: std::unordered_map soc_descriptor_per_chip = {}; }; namespace tt::umd { /** -* Silicon Driver Class, derived from the tt_device class + * Cluster class, derived from the tt_device class * Implements APIs to communicate with a physical Tenstorrent Device. -*/ -class Cluster: public tt_device -{ - public: + */ +class Cluster : public tt_device { +public: // Constructor /** - * Silicon Driver constructor. + * Cluster constructor. * * @param sdesc_path SOC descriptor specifying single chip. * @param ndesc_path Network Descriptor specifying the network topology of the system. @@ -619,43 +666,106 @@ class Cluster: public tt_device * @param clean_system_resource Specifies if host state from previous runs needs to be cleaned up. * @param perform_harvesting Allow the driver to modify the SOC descriptors per chip. 
* @param simulated_harvesting_masks - */ - Cluster(const std::string &sdesc_path, const std::string &ndesc_path, const std::set &target_devices, - const uint32_t &num_host_mem_ch_per_mmio_device = 1, const bool skip_driver_allocs = false, - const bool clean_system_resources = false, bool perform_harvesting = true, std::unordered_map simulated_harvesting_masks = {}); - - //Setup/Teardown Functions + */ + Cluster( + const std::string& sdesc_path, + const std::string& ndesc_path, + const std::set& target_devices, + const uint32_t& num_host_mem_ch_per_mmio_device = 1, + const bool skip_driver_allocs = false, + const bool clean_system_resources = false, + bool perform_harvesting = true, + std::unordered_map simulated_harvesting_masks = {}); + + /** + * Cluster constructor. This constructor should be used to work towards removing all + * of the params from the constructor of tt_SiliconDevice (to become Cluster). + * + * @param num_host_mem_ch_per_mmio_device Requested number of host channels (hugepages). + * @param skip_driver_allocs + * @param clean_system_resource Specifies if host state from previous runs needs to be cleaned up. + * @param perform_harvesting Allow the driver to modify the SOC descriptors per chip. + * @param simulated_harvesting_masks + */ + Cluster( + const uint32_t& num_host_mem_ch_per_mmio_device = 1, + const bool skip_driver_allocs = false, + const bool clean_system_resources = false, + bool perform_harvesting = true, + std::unordered_map simulated_harvesting_masks = {}); + + /** + * Cluster constructor. This constructor should be used to target specific devices in a cluster. + * + * @param target_devices Devices to target. + * @param num_host_mem_ch_per_mmio_device Requested number of host channels (hugepages). + * @param skip_driver_allocs + * @param clean_system_resource Specifies if host state from previous runs needs to be cleaned up. + * @param perform_harvesting Allow the driver to modify the SOC descriptors per chip. 
+ * @param simulated_harvesting_masks + */ + Cluster( + const std::set& target_devices, + const uint32_t& num_host_mem_ch_per_mmio_device = 1, + const bool skip_driver_allocs = false, + const bool clean_system_resources = false, + bool perform_harvesting = true, + std::unordered_map simulated_harvesting_masks = {}); + + // Setup/Teardown Functions virtual std::unordered_map& get_virtual_soc_descriptors(); virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_); virtual void set_driver_host_address_params(const tt_driver_host_address_params& host_address_params_); virtual void set_driver_eth_interface_params(const tt_driver_eth_interface_params& eth_interface_params_); - virtual void configure_tlb(chip_id_t logical_device_id, tt_xy_pair core, std::int32_t tlb_index, std::int32_t address, uint64_t ordering = TLB_DATA::Posted); + virtual void configure_tlb( + chip_id_t logical_device_id, + tt_xy_pair core, + std::int32_t tlb_index, + std::int32_t address, + uint64_t ordering = TLB_DATA::Posted); virtual void set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, uint64_t ordering = TLB_DATA::Posted); - virtual void setup_core_to_tlb_map(const chip_id_t logical_device_id, std::function mapping_function); - virtual void configure_active_ethernet_cores_for_mmio_device(chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip); - virtual void start_device(const tt_device_params &device_params); + virtual void setup_core_to_tlb_map( + const chip_id_t logical_device_id, std::function mapping_function); + virtual void configure_active_ethernet_cores_for_mmio_device( + chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip); + virtual void start_device(const tt_device_params& device_params); virtual void assert_risc_reset(); virtual void deassert_risc_reset(); - virtual void 
deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets = TENSIX_DEASSERT_SOFT_RESET); + virtual void deassert_risc_reset_at_core( + tt_cxy_pair core, const TensixSoftResetOptions& soft_resets = TENSIX_DEASSERT_SOFT_RESET); virtual void assert_risc_reset_at_core(tt_cxy_pair core); virtual void close_device(); // Runtime Functions - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); - void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& columns_to_exclude, const std::string& fallback_tlb); - - virtual void read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); - virtual void write_to_sysmem(const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id); - virtual void read_from_sysmem(void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id); + virtual void write_to_device( + const void* mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); + void broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& columns_to_exclude, + const std::string& fallback_tlb); + + virtual void read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); + virtual void write_to_sysmem( + const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id); + virtual void read_from_sysmem( + void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id); virtual void wait_for_non_mmio_flush(); virtual void wait_for_non_mmio_flush(const chip_id_t chip_id); - void 
l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + void l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); // These functions are used by Debuda, so make them public - void bar_write32 (int logical_device_id, uint32_t addr, uint32_t data); - uint32_t bar_read32 (int logical_device_id, uint32_t addr); + void bar_write32(int logical_device_id, uint32_t addr, uint32_t data); + uint32_t bar_read32(int logical_device_id, uint32_t addr); /** * If the tlbs are initialized, returns a tuple with the TLB base address and its size @@ -673,16 +783,24 @@ class Cluster: public tt_device * - the mapping is unchanged during the lifetime of the returned object. * - the Cluster instance outlives the returned object. * - use of the returned object is congruent with the target's TLB setup. - * + * * @param target The target chip and core to write to. */ tt::Writer get_static_tlb_writer(tt_cxy_pair target); // Misc. 
Functions to Query/Set Device State - virtual int arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr); + virtual int arc_msg( + int logical_device_id, + uint32_t msg_code, + bool wait_for_done = true, + uint32_t arg0 = 0, + uint32_t arg1 = 0, + int timeout = 1, + uint32_t* return_3 = nullptr, + uint32_t* return_4 = nullptr); virtual bool using_harvested_soc_descriptors(); virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); - virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c); + virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c); virtual int get_number_of_chips_in_cluster(); virtual std::unordered_set get_all_chips_in_cluster(); virtual tt_ClusterDescriptor* get_cluster_description(); @@ -690,13 +808,16 @@ class Cluster: public tt_device static std::vector detect_available_device_ids(); virtual std::set get_target_mmio_device_ids(); virtual std::set get_target_remote_device_ids(); - virtual std::map get_clocks(); - virtual void *host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; + virtual std::map get_clocks(); + virtual void* host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; virtual std::uint64_t get_pcie_base_addr_from_device(const chip_id_t chip_id) const; - static std::vector extract_rows_to_remove(const tt::ARCH &arch, const int worker_grid_rows, const int harvested_rows); - static void remove_worker_row_from_descriptor(tt_SocDescriptor& full_soc_descriptor, const std::vector& row_coordinates_to_remove); + static std::vector extract_rows_to_remove( + const tt::ARCH& arch, const int worker_grid_rows, const int harvested_rows); + static void remove_worker_row_from_descriptor( + tt_SocDescriptor& full_soc_descriptor, const std::vector& 
row_coordinates_to_remove); static void harvest_rows_in_soc_descriptor(tt::ARCH arch, tt_SocDescriptor& sdesc, uint32_t harvested_rows); - static std::unordered_map create_harvested_coord_translation(const tt::ARCH arch, bool identity_map); + static std::unordered_map create_harvested_coord_translation( + const tt::ARCH arch, bool identity_map); std::unordered_map get_harvested_coord_translation_map(chip_id_t logical_device_id); virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id); virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel); @@ -705,72 +826,154 @@ class Cluster: public tt_device virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id); virtual tt_version get_ethernet_fw_version() const; // TODO: This should be accessible through public API, probably to be moved to tt_device. - PCIDevice *get_pci_device(int device_id) const; + PCIDevice* get_pci_device(int device_id) const; // Destructor - virtual ~Cluster (); + virtual ~Cluster(); - private: +private: // Helper functions // Startup + teardown - void create_device(const std::unordered_set &target_mmio_device_ids, const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs, const bool clean_system_resources); + void create_device( + const std::unordered_set& target_mmio_device_ids, + const uint32_t& num_host_mem_ch_per_mmio_device, + const bool skip_driver_allocs, + const bool clean_system_resources); void initialize_interprocess_mutexes(int pci_interface_id, bool cleanup_mutexes_in_shm); void cleanup_shared_host_state(); void initialize_pcie_devices(); - void broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const TensixSoftResetOptions &cores); - void broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOptions &soft_resets); - void send_remote_tensix_risc_reset_to_core(const tt_cxy_pair &core, const TensixSoftResetOptions &soft_resets); - void send_tensix_risc_reset_to_core(const tt_cxy_pair &core, 
const TensixSoftResetOptions &soft_resets); + void broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const TensixSoftResetOptions& cores); + void broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOptions& soft_resets); + void send_remote_tensix_risc_reset_to_core(const tt_cxy_pair& core, const TensixSoftResetOptions& soft_resets); + void send_tensix_risc_reset_to_core(const tt_cxy_pair& core, const TensixSoftResetOptions& soft_resets); void perform_harvesting_and_populate_soc_descriptors(const std::string& sdesc_path, const bool perform_harvesting); void populate_cores(); - void init_pcie_iatus(); // No more p2p support. + void init_pcie_iatus(); // No more p2p support. void check_pcie_device_initialized(int device_id); void set_pcie_power_state(tt_DevicePowerState state); - int set_remote_power_state(const chip_id_t &chip, tt_DevicePowerState device_state); + int set_remote_power_state(const chip_id_t& chip, tt_DevicePowerState device_state); void set_power_state(tt_DevicePowerState state); uint32_t get_power_state_arc_msg(chip_id_t chip_id, tt_DevicePowerState state); void enable_local_ethernet_queue(const chip_id_t& chip, int timeout); void enable_ethernet_queue(int timeout); void enable_remote_ethernet_queue(const chip_id_t& chip, int timeout); void deassert_resets_and_set_power_state(); - int iatu_configure_peer_region (int logical_device_id, uint32_t peer_region_id, uint64_t bar_addr_64, uint32_t region_size); - uint32_t get_harvested_noc_rows (uint32_t harvesting_mask); - uint32_t get_harvested_rows (int logical_device_id); + int iatu_configure_peer_region( + int logical_device_id, uint32_t peer_region_id, uint64_t bar_addr_64, uint32_t region_size); + uint32_t get_harvested_noc_rows(uint32_t harvesting_mask); + uint32_t get_harvested_rows(int logical_device_id); int get_clock(int logical_device_id); // Communication Functions - void read_buffer(void* mem_ptr, std::uint32_t address, std::uint16_t channel, std::uint32_t size_in_bytes, chip_id_t 
src_device_id); - void write_buffer(const void *mem_ptr, std::uint32_t size, std::uint32_t address, std::uint16_t channel, chip_id_t src_device_id); - void write_device_memory(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair target, std::uint32_t address, const std::string& fallback_tlb); - void write_to_non_mmio_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t address, bool broadcast = false, std::vector broadcast_header = {}); - void read_device_memory(void *mem_ptr, tt_cxy_pair target, std::uint32_t address, std::uint32_t size_in_bytes, const std::string& fallback_tlb); + void read_buffer( + void* mem_ptr, + std::uint32_t address, + std::uint16_t channel, + std::uint32_t size_in_bytes, + chip_id_t src_device_id); + void write_buffer( + const void* mem_ptr, std::uint32_t size, std::uint32_t address, std::uint16_t channel, chip_id_t src_device_id); + void write_device_memory( + const void* mem_ptr, + uint32_t size_in_bytes, + tt_cxy_pair target, + std::uint32_t address, + const std::string& fallback_tlb); + void write_to_non_mmio_device( + const void* mem_ptr, + uint32_t size_in_bytes, + tt_cxy_pair core, + uint64_t address, + bool broadcast = false, + std::vector broadcast_header = {}); + void read_device_memory( + void* mem_ptr, + tt_cxy_pair target, + std::uint32_t address, + std::uint32_t size_in_bytes, + const std::string& fallback_tlb); void read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_t address, uint32_t size_in_bytes); - void read_mmio_device_register(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); - void write_mmio_device_register(const void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); - void pcie_broadcast_write(chip_id_t chip, const void* mem_ptr, uint32_t size_in_bytes, std::uint32_t addr, const tt_xy_pair& start, const tt_xy_pair& end, const std::string& fallback_tlb); - void 
ethernet_broadcast_write(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, const std::set& rows_to_exclude, - std::set& cols_to_exclude, const std::string& fallback_tlb, bool use_virtual_coords); - void set_membar_flag(const chip_id_t chip, const std::unordered_set& cores, const uint32_t barrier_value, const uint32_t barrier_addr, const std::string& fallback_tlb); - void insert_host_to_device_barrier(const chip_id_t chip, const std::unordered_set& cores, const uint32_t barrier_addr, const std::string& fallback_tlb); + void read_mmio_device_register( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); + void write_mmio_device_register( + const void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); + void pcie_broadcast_write( + chip_id_t chip, + const void* mem_ptr, + uint32_t size_in_bytes, + std::uint32_t addr, + const tt_xy_pair& start, + const tt_xy_pair& end, + const std::string& fallback_tlb); + void ethernet_broadcast_write( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + const std::set& rows_to_exclude, + std::set& cols_to_exclude, + const std::string& fallback_tlb, + bool use_virtual_coords); + void set_membar_flag( + const chip_id_t chip, + const std::unordered_set& cores, + const uint32_t barrier_value, + const uint32_t barrier_addr, + const std::string& fallback_tlb); + void insert_host_to_device_barrier( + const chip_id_t chip, + const std::unordered_set& cores, + const uint32_t barrier_addr, + const std::string& fallback_tlb); void init_membars(); uint64_t get_sys_addr(uint32_t chip_x, uint32_t chip_y, uint32_t noc_x, uint32_t noc_y, uint64_t offset); uint16_t get_sys_rack(uint32_t rack_x, uint32_t rack_y); bool is_non_mmio_cmd_q_full(uint32_t curr_wptr, uint32_t curr_rptr); - int pcie_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, 
uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr); - int remote_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr); - bool address_in_tlb_space(uint32_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, uint32_t chip); + int pcie_arc_msg( + int logical_device_id, + uint32_t msg_code, + bool wait_for_done = true, + uint32_t arg0 = 0, + uint32_t arg1 = 0, + int timeout = 1, + uint32_t* return_3 = nullptr, + uint32_t* return_4 = nullptr); + int remote_arc_msg( + int logical_device_id, + uint32_t msg_code, + bool wait_for_done = true, + uint32_t arg0 = 0, + uint32_t arg1 = 0, + int timeout = 1, + uint32_t* return_3 = nullptr, + uint32_t* return_4 = nullptr); + bool address_in_tlb_space( + uint32_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, uint32_t chip); std::shared_ptr get_mutex(const std::string& tlb_name, int pci_interface_id); - virtual uint32_t get_harvested_noc_rows_for_chip(int logical_device_id); // Returns one-hot encoded harvesting mask for PCIe mapped chips - void generate_tensix_broadcast_grids_for_grayskull( std::set>& broadcast_grids, std::set& rows_to_exclude, std::set& cols_to_exclude); - std::unordered_map>>& get_ethernet_broadcast_headers(const std::set& chips_to_exclude); + virtual uint32_t get_harvested_noc_rows_for_chip( + int logical_device_id); // Returns one-hot encoded harvesting mask for PCIe mapped chips + void generate_tensix_broadcast_grids_for_grayskull( + std::set>& broadcast_grids, + std::set& rows_to_exclude, + std::set& cols_to_exclude); + std::unordered_map>>& get_ethernet_broadcast_headers( + const std::set& chips_to_exclude); // Test functions void verify_eth_fw(); - void verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector &fw_versions); - int 
test_setup_interface (); + void verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector& fw_versions); + int test_setup_interface(); // This functions has to be called for local chip, and then it will wait for all connected remote chips to flush. void wait_for_connected_non_mmio_flush(chip_id_t chip_id); + void construct_cluster( + const std::string& sdesc_path, + const uint32_t& num_host_mem_ch_per_mmio_device, + const bool skip_driver_allocs, + const bool clean_system_resources, + bool perform_harvesting, + std::unordered_map simulated_harvesting_masks); + // State variables tt_device_dram_address_params dram_address_params; tt_device_l1_address_params l1_address_params; @@ -781,22 +984,24 @@ class Cluster: public tt_device std::set target_devices_in_cluster = {}; std::set target_remote_chips = {}; tt::ARCH arch_name; - std::unordered_map> m_pci_device_map; // Map of enabled pci devices - int m_num_pci_devices; // Number of pci devices in system (enabled or disabled) + std::unordered_map> m_pci_device_map; // Map of enabled pci devices + int m_num_pci_devices; // Number of pci devices in system (enabled or disabled) std::shared_ptr ndesc; // remote eth transfer setup static constexpr std::uint32_t NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS = 6; static constexpr std::uint32_t NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS = 4; static constexpr std::uint32_t NON_EPOCH_ETH_CORES_START_ID = 0; - static constexpr std::uint32_t NON_EPOCH_ETH_CORES_MASK = (NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS-1); + static constexpr std::uint32_t NON_EPOCH_ETH_CORES_MASK = (NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS - 1); - static constexpr std::uint32_t EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS = NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS - NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS; - static constexpr std::uint32_t EPOCH_ETH_CORES_START_ID = NON_EPOCH_ETH_CORES_START_ID + NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS; - static constexpr std::uint32_t EPOCH_ETH_CORES_MASK = 
(EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS-1); + static constexpr std::uint32_t EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS = + NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS - NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS; + static constexpr std::uint32_t EPOCH_ETH_CORES_START_ID = + NON_EPOCH_ETH_CORES_START_ID + NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS; + static constexpr std::uint32_t EPOCH_ETH_CORES_MASK = (EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS - 1); int active_core = NON_EPOCH_ETH_CORES_START_ID; - std::vector< std::vector > remote_transfer_ethernet_cores; + std::vector> remote_transfer_ethernet_cores; std::unordered_map flush_non_mmio_per_chip = {}; bool non_mmio_transfer_cores_customized = false; std::unordered_map active_eth_core_idx_per_chip = {}; @@ -821,7 +1026,7 @@ class Cluster: public tt_device bool use_ethernet_ordered_writes = true; bool use_ethernet_broadcast = true; bool use_virtual_coords_for_eth_broadcast = true; - tt_version eth_fw_version; // Ethernet FW the driver is interfacing with + tt_version eth_fw_version; // Ethernet FW the driver is interfacing with // Named Mutexes static constexpr char NON_MMIO_MUTEX_NAME[] = "NON_MMIO"; static constexpr char ARC_MSG_MUTEX_NAME[] = "ARC_MSG"; @@ -830,13 +1035,13 @@ class Cluster: public tt_device static constexpr std::uint32_t SW_VERSION = 0x06060000; }; -} +} // namespace tt::umd -constexpr inline bool operator==(const tt_version &a, const tt_version &b) { +constexpr inline bool operator==(const tt_version& a, const tt_version& b) { return a.major == b.major && a.minor == b.minor && a.patch == b.patch; } -constexpr inline bool operator>=(const tt_version &a, const tt_version &b) { +constexpr inline bool operator>=(const tt_version& a, const tt_version& b) { bool fw_major_greater = a.major > b.major; bool fw_minor_greater = (a.major == b.major) && (a.minor > b.minor); bool patch_greater_or_equal = (a.major == b.major) && (a.minor == b.minor) && (a.patch >= b.patch); diff --git 
a/device/api/umd/device/coordinate_manager.h b/device/api/umd/device/coordinate_manager.h index e24e370b..bf98be70 100644 --- a/device/api/umd/device/coordinate_manager.h +++ b/device/api/umd/device/coordinate_manager.h @@ -7,17 +7,17 @@ #pragma once #include -#include #include +#include -#include "umd/device/tt_xy_pair.h" #include "umd/device/tt_arch_types.h" +#include "umd/device/tt_xy_pair.h" class CoordinateManager { - public: - CoordinateManager(const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) - : worker_grid_size(worker_grid_size), workers(workers), harvesting_mask(harvesting_mask) {} + CoordinateManager( + const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) : + worker_grid_size(worker_grid_size), workers(workers), harvesting_mask(harvesting_mask) {} virtual void perform_harvesting(std::size_t harvesting_mask); @@ -49,14 +49,17 @@ class CoordinateManager { protected: virtual void clear_harvesting_structures(); - + virtual std::set get_x_coordinates_to_harvest(std::size_t harvesting_mask); virtual std::set get_y_coordinates_to_harvest(std::size_t harvesting_mask); virtual void fill_logical_to_physical_mapping( - const std::set& x_to_harvest, const std::set& y_to_harvest, + const std::set& x_to_harvest, + const std::set& y_to_harvest, + const std::set& physical_x_unharvested, + const std::set& physical_y_unharvested); + virtual void fill_logical_to_virtual_mapping( const std::set& physical_x_unharvested, const std::set& physical_y_unharvested); - virtual void fill_logical_to_virtual_mapping(const std::set& physical_x_unharvested, const std::set& physical_y_unharvested); std::map physical_y_to_logical_y; std::map physical_x_to_logical_x; diff --git a/device/api/umd/device/driver_atomics.h b/device/api/umd/device/driver_atomics.h index ec213438..4ed3e7a6 100644 --- a/device/api/umd/device/driver_atomics.h +++ b/device/api/umd/device/driver_atomics.h @@ -12,54 +12,44 @@ namespace 
tt_driver_atomics { #if defined(__x86_64__) || defined(__i386__) // Store-Any barrier. -static inline __attribute__((always_inline)) void sfence() { - _mm_sfence(); -} +static inline __attribute__((always_inline)) void sfence() { _mm_sfence(); } + // Load-Any barrier. -static inline __attribute__((always_inline)) void lfence() { - _mm_lfence(); -} +static inline __attribute__((always_inline)) void lfence() { _mm_lfence(); } + // Any-Any barrier. -static inline __attribute__((always_inline)) void mfence() { - _mm_mfence(); -} +static inline __attribute__((always_inline)) void mfence() { _mm_mfence(); } #elif defined(__ARM_ARCH) static inline __attribute__((always_inline)) void sfence() { // Full memory barrier (full system). ARM does not have a Store-Any barrier. // https://developer.arm.com/documentation/100941/0101/Barriers - asm volatile ("DMB SY" : : : "memory"); + asm volatile("DMB SY" : : : "memory"); } static inline __attribute__((always_inline)) void lfence() { // Load-Any barrier (full system) // https://developer.arm.com/documentation/100941/0101/Barriers - asm volatile ("DMB LD" : : : "memory"); + asm volatile("DMB LD" : : : "memory"); } static inline __attribute__((always_inline)) void mfence() { // Full memory barrier (full system). 
// https://developer.arm.com/documentation/100941/0101/Barriers - asm volatile ("DMB SY" : : : "memory"); + asm volatile("DMB SY" : : : "memory"); } #elif defined(__riscv) -static inline __attribute__((always_inline)) void sfence() { - asm volatile ("fence ow, ow" : : : "memory"); -} +static inline __attribute__((always_inline)) void sfence() { asm volatile("fence ow, ow" : : : "memory"); } -static inline __attribute__((always_inline)) void lfence() { - asm volatile ("fence ir, ir" : : : "memory"); -} +static inline __attribute__((always_inline)) void lfence() { asm volatile("fence ir, ir" : : : "memory"); } -static inline __attribute__((always_inline)) void mfence() { - asm volatile ("fence iorw, iorw" : : : "memory"); -} +static inline __attribute__((always_inline)) void mfence() { asm volatile("fence iorw, iorw" : : : "memory"); } #else #error "Unsupported architecture" #endif -} // namespace tt_driver_atomics +} // namespace tt_driver_atomics diff --git a/device/api/umd/device/grayskull_implementation.h b/device/api/umd/device/grayskull_implementation.h index 8f476ade..0a93e9b2 100644 --- a/device/api/umd/device/grayskull_implementation.h +++ b/device/api/umd/device/grayskull_implementation.h @@ -104,7 +104,8 @@ enum class arc_message_type { }; // DEVICE_DATA -static const std::array DRAM_LOCATIONS = {{{1, 6}, {4, 6}, {7, 6}, {10, 6}, {1, 0}, {4, 0}, {7, 0}, {10, 0}}}; +static const std::array DRAM_LOCATIONS = { + {{1, 6}, {4, 6}, {7, 6}, {10, 6}, {1, 0}, {4, 0}, {7, 0}, {10, 0}}}; static const std::array ARC_LOCATIONS = {{{0, 2}}}; static const std::array PCI_LOCATIONS = {{{0, 4}}}; static const std::array ETH_LOCATIONS = {}; @@ -134,7 +135,8 @@ static constexpr uint32_t STATIC_TLB_CFG_ADDR = 0x1fc00000; static constexpr uint32_t TLB_CFG_REG_SIZE_BYTES = 8; static constexpr uint32_t DYNAMIC_TLB_16M_SIZE = 16 * 1024 * 1024; -static constexpr uint32_t DYNAMIC_TLB_16M_CFG_ADDR = STATIC_TLB_CFG_ADDR + (TLB_BASE_INDEX_16M * TLB_CFG_REG_SIZE_BYTES); +static 
constexpr uint32_t DYNAMIC_TLB_16M_CFG_ADDR = + STATIC_TLB_CFG_ADDR + (TLB_BASE_INDEX_16M * TLB_CFG_REG_SIZE_BYTES); static constexpr uint32_t DYNAMIC_TLB_16M_BASE = TLB_BASE_16M; static constexpr uint32_t DYNAMIC_TLB_2M_SIZE = 2 * 1024 * 1024; @@ -171,59 +173,93 @@ static constexpr uint32_t TENSIX_SOFT_RESET_ADDR = 0xFFB121B0; } // namespace grayskull class grayskull_implementation : public architecture_implementation { - public: +public: tt::ARCH get_architecture() const override { return tt::ARCH::GRAYSKULL; } + uint32_t get_arc_message_arc_get_harvesting() const override { return static_cast(grayskull::arc_message_type::ARC_GET_HARVESTING); } + uint32_t get_arc_message_arc_go_busy() const override { return static_cast(grayskull::arc_message_type::ARC_GO_BUSY); } + uint32_t get_arc_message_arc_go_long_idle() const override { return static_cast(grayskull::arc_message_type::ARC_GO_LONG_IDLE); } + uint32_t get_arc_message_arc_go_short_idle() const override { return static_cast(grayskull::arc_message_type::ARC_GO_SHORT_IDLE); } + uint32_t get_arc_message_deassert_riscv_reset() const override { return static_cast(grayskull::arc_message_type::DEASSERT_RISCV_RESET); } + uint32_t get_arc_message_get_aiclk() const override { return static_cast(grayskull::arc_message_type::GET_AICLK); } + uint32_t get_arc_message_setup_iatu_for_peer_to_peer() const override { return static_cast(grayskull::arc_message_type::SETUP_IATU_FOR_PEER_TO_PEER); } + uint32_t get_arc_message_test() const override { return static_cast(grayskull::arc_message_type::TEST); } + uint32_t get_arc_csm_mailbox_offset() const override { return grayskull::ARC_CSM_MAILBOX_OFFSET; } + uint32_t get_arc_reset_arc_misc_cntl_offset() const override { return grayskull::ARC_RESET_ARC_MISC_CNTL_OFFSET; } + uint32_t get_arc_reset_scratch_offset() const override { return grayskull::ARC_RESET_SCRATCH_OFFSET; } + uint32_t get_dram_channel_0_peer2peer_region_start() const override { return 
grayskull::DRAM_CHANNEL_0_PEER2PEER_REGION_START; } + uint32_t get_dram_channel_0_x() const override { return grayskull::DRAM_CHANNEL_0_X; } + uint32_t get_dram_channel_0_y() const override { return grayskull::DRAM_CHANNEL_0_Y; } + uint32_t get_broadcast_tlb_index() const override { return grayskull::BROADCAST_TLB_INDEX; } + uint32_t get_dynamic_tlb_2m_base() const override { return grayskull::DYNAMIC_TLB_2M_BASE; } + uint32_t get_dynamic_tlb_2m_size() const override { return grayskull::DYNAMIC_TLB_2M_SIZE; } + uint32_t get_dynamic_tlb_16m_base() const override { return grayskull::DYNAMIC_TLB_16M_BASE; } + uint32_t get_dynamic_tlb_16m_size() const override { return grayskull::DYNAMIC_TLB_16M_SIZE; } + uint32_t get_dynamic_tlb_16m_cfg_addr() const override { return grayskull::DYNAMIC_TLB_16M_CFG_ADDR; } + uint32_t get_mem_large_read_tlb() const override { return grayskull::MEM_LARGE_READ_TLB; } + uint32_t get_mem_large_write_tlb() const override { return grayskull::MEM_LARGE_WRITE_TLB; } + uint32_t get_static_tlb_cfg_addr() const override { return grayskull::STATIC_TLB_CFG_ADDR; } + uint32_t get_static_tlb_size() const override { return grayskull::STATIC_TLB_SIZE; } + uint32_t get_reg_tlb() const override { return grayskull::REG_TLB; } + uint32_t get_tlb_base_index_16m() const override { return grayskull::TLB_BASE_INDEX_16M; } + uint32_t get_tensix_soft_reset_addr() const override { return grayskull::TENSIX_SOFT_RESET_ADDR; } + uint32_t get_grid_size_x() const override { return grayskull::GRID_SIZE_X; } + uint32_t get_grid_size_y() const override { return grayskull::GRID_SIZE_Y; } + uint32_t get_tlb_cfg_reg_size_bytes() const override { return grayskull::TLB_CFG_REG_SIZE_BYTES; } + uint32_t get_small_read_write_tlb() const override { return grayskull::MEM_SMALL_READ_WRITE_TLB; } + const std::vector& get_harvesting_noc_locations() const override { return grayskull::HARVESTING_NOC_LOCATIONS; } + const std::vector& get_t6_x_locations() const override { return 
grayskull::T6_X_LOCATIONS; } + const std::vector& get_t6_y_locations() const override { return grayskull::T6_Y_LOCATIONS; } std::tuple multicast_workaround(xy_pair start, xy_pair end) const override; @@ -234,7 +270,6 @@ class grayskull_implementation : public architecture_implementation { tt_driver_host_address_params get_host_address_params() const override; tt_driver_eth_interface_params get_eth_interface_params() const override; tt_driver_noc_params get_noc_params() const override; - }; } // namespace tt::umd diff --git a/device/api/umd/device/hugepage.h b/device/api/umd/device/hugepage.h index 18840ec5..1bf37dac 100644 --- a/device/api/umd/device/hugepage.h +++ b/device/api/umd/device/hugepage.h @@ -6,10 +6,10 @@ #pragma once -#include "umd/device/tt_cluster_descriptor_types.h" - -#include #include +#include + +#include "umd/device/tt_cluster_descriptor_types.h" namespace tt::umd { @@ -17,7 +17,8 @@ namespace tt::umd { uint32_t get_num_hugepages(); // Dynamically figure out how many host memory channels (based on hugepages installed) for each device, based on arch. -uint32_t get_available_num_host_mem_channels(const uint32_t num_channels_per_device_target, const uint16_t device_id, const uint16_t revision_id); +uint32_t get_available_num_host_mem_channels( + const uint32_t num_channels_per_device_target, const uint16_t device_id, const uint16_t revision_id); // Looks for hugetlbfs inside /proc/mounts matching desired pagesize (typically 1G) std::string find_hugepage_dir(std::size_t pagesize); @@ -27,4 +28,4 @@ std::string find_hugepage_dir(std::size_t pagesize); // Today we assume there's only one pipeline running within the system. // One hugepage per device such that each device gets unique memory. 
int open_hugepage_file(const std::string &dir, chip_id_t physical_device_id, uint16_t channel); -} +} // namespace tt::umd diff --git a/device/api/umd/device/pci_device.hpp b/device/api/umd/device/pci_device.hpp index c0c2c20d..914663d8 100644 --- a/device/api/umd/device/pci_device.hpp +++ b/device/api/umd/device/pci_device.hpp @@ -12,28 +12,30 @@ #include #include -#include "umd/device/tt_xy_pair.h" +#include "umd/device/tlb.h" #include "umd/device/tt_arch_types.h" #include "umd/device/tt_cluster_descriptor_types.h" -#include "umd/device/tlb.h" +#include "umd/device/tt_xy_pair.h" // TODO: this is used up in cluster.cpp but that logic ought to be // lowered into the PCIDevice class since it is specific to PCIe cards. // See /vendor_ip/synopsys/052021/bh_pcie_ctl_gen5/export/configuration/DWC_pcie_ctl.h static const uint64_t UNROLL_ATU_OFFSET_BAR = 0x1200; -// TODO: this is a bit of a hack... something to revisit when we formalize an +// TODO: this is a bit of a hack... something to revisit when we formalize an // abstraction for IO. // BAR0 size for Blackhole, used to determine whether write block should use BAR0 or BAR4 static const uint64_t BAR0_BH_SIZE = 512 * 1024 * 1024; constexpr unsigned int c_hang_read_value = 0xffffffffu; -namespace tt::umd { class architecture_implementation; } +namespace tt::umd { +class architecture_implementation; +} struct dynamic_tlb { - uint64_t bar_offset; // Offset that address is mapped to, within the PCI BAR. - uint64_t remaining_size; // Bytes remaining between bar_offset and end of the TLB. + uint64_t bar_offset; // Offset that address is mapped to, within the PCI BAR. + uint64_t remaining_size; // Bytes remaining between bar_offset and end of the TLB. 
}; struct hugepage_mapping { @@ -42,8 +44,7 @@ struct hugepage_mapping { uint64_t physical_address = 0; }; -struct PciDeviceInfo -{ +struct PciDeviceInfo { uint16_t vendor_id; uint16_t device_id; uint16_t pci_domain; @@ -57,14 +58,14 @@ struct PciDeviceInfo }; class PCIDevice { - const std::string device_path; // Path to character device: /dev/tenstorrent/N - const int pci_device_num; // N in /dev/tenstorrent/N - const int logical_id; // Unique identifier for each device in entire network topology - const int pci_device_file_desc; // Character device file descriptor - const PciDeviceInfo info; // PCI device info - const int numa_node; // -1 if non-NUMA - const int revision; // PCI revision value from sysfs - const tt::ARCH arch; // e.g. Grayskull, Wormhole, Blackhole + const std::string device_path; // Path to character device: /dev/tenstorrent/N + const int pci_device_num; // N in /dev/tenstorrent/N + const int logical_id; // Unique identifier for each device in entire network topology + const int pci_device_file_desc; // Character device file descriptor + const PciDeviceInfo info; // PCI device info + const int numa_node; // -1 if non-NUMA + const int revision; // PCI revision value from sysfs + const tt::ARCH arch; // e.g. Grayskull, Wormhole, Blackhole std::unique_ptr architecture_implementation; public: @@ -83,7 +84,7 @@ class PCIDevice { * * Opens the character device file descriptor, reads device information from * sysfs, and maps device memory region(s) into the process address space. 
- * + * * @param pci_device_number N in /dev/tenstorrent/N * @param logical_device_id unique identifier for this device in the network topology */ @@ -95,8 +96,8 @@ class PCIDevice { */ ~PCIDevice(); - PCIDevice(const PCIDevice&) = delete; // copy - void operator=(const PCIDevice&) = delete; // copy assignment + PCIDevice(const PCIDevice &) = delete; // copy + void operator=(const PCIDevice &) = delete; // copy assignment /** * @return PCI device info @@ -155,21 +156,39 @@ class PCIDevice { // NOC endpoints. Probably worth waiting for the KMD to start owning the // resource management aspect of these PCIe->NOC mappings (the "TLBs") // before doing too much work here... - void write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t* buffer_addr); - void read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t* buffer_addr); + void write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t *buffer_addr); + void read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t *buffer_addr); void write_regs(uint32_t byte_addr, uint32_t word_len, const void *data); void write_regs(volatile uint32_t *dest, const uint32_t *src, uint32_t word_len); void read_regs(uint32_t byte_addr, uint32_t word_len, void *data); // TLB related functions. // TODO: These are architecture specific, and will be moved out of the class. 
- void write_tlb_reg(uint32_t byte_addr, std::uint64_t value_lower, std::uint64_t value_upper, std::uint32_t tlb_cfg_reg_size); - dynamic_tlb set_dynamic_tlb(unsigned int tlb_index, tt_xy_pair start, tt_xy_pair end, - std::uint64_t address, bool multicast, std::unordered_map>& harvested_coord_translation, std::uint64_t ordering); - dynamic_tlb set_dynamic_tlb(unsigned int tlb_index, tt_xy_pair target, std::uint64_t address, std::unordered_map>& harvested_coord_translation, std::uint64_t ordering = tt::umd::tlb_data::Relaxed); - dynamic_tlb set_dynamic_tlb_broadcast(unsigned int tlb_index, std::uint64_t address, std::unordered_map>& harvested_coord_translation, tt_xy_pair start, tt_xy_pair end, std::uint64_t ordering = tt::umd::tlb_data::Relaxed); - - tt::umd::architecture_implementation* get_architecture_implementation() const; + void write_tlb_reg( + uint32_t byte_addr, std::uint64_t value_lower, std::uint64_t value_upper, std::uint32_t tlb_cfg_reg_size); + dynamic_tlb set_dynamic_tlb( + unsigned int tlb_index, + tt_xy_pair start, + tt_xy_pair end, + std::uint64_t address, + bool multicast, + std::unordered_map> &harvested_coord_translation, + std::uint64_t ordering); + dynamic_tlb set_dynamic_tlb( + unsigned int tlb_index, + tt_xy_pair target, + std::uint64_t address, + std::unordered_map> &harvested_coord_translation, + std::uint64_t ordering = tt::umd::tlb_data::Relaxed); + dynamic_tlb set_dynamic_tlb_broadcast( + unsigned int tlb_index, + std::uint64_t address, + std::unordered_map> &harvested_coord_translation, + tt_xy_pair start, + tt_xy_pair end, + std::uint64_t ordering = tt::umd::tlb_data::Relaxed); + + tt::umd::architecture_implementation *get_architecture_implementation() const; void detect_hang_read(uint32_t data_read = c_hang_read_value); // TODO: this also probably has more sense to live in the future TTDevice class. @@ -197,8 +216,8 @@ class PCIDevice { // and simplify the code. 
void *system_reg_mapping = nullptr; size_t system_reg_mapping_size; - uint32_t system_reg_start_offset; // Registers >= this are system regs, use the mapping. - uint32_t system_reg_offset_adjust; // This is the offset of the first reg in the system reg mapping. + uint32_t system_reg_start_offset; // Registers >= this are system regs, use the mapping. + uint32_t system_reg_offset_adjust; // This is the offset of the first reg in the system reg mapping. uint32_t read_checking_offset; @@ -206,7 +225,7 @@ class PCIDevice { bool is_hardware_hung(); template - T* get_register_address(uint32_t register_offset); + T *get_register_address(uint32_t register_offset); // For debug purposes when various stages fails. void print_file_contents(std::string filename, std::string hint = ""); diff --git a/device/api/umd/device/tlb.h b/device/api/umd/device/tlb.h index 3e8fb826..30094202 100644 --- a/device/api/umd/device/tlb.h +++ b/device/api/umd/device/tlb.h @@ -8,8 +8,8 @@ #include #include -#include #include +#include namespace tt::umd { @@ -41,10 +41,10 @@ struct tlb_data { // Orderings static constexpr uint64_t Relaxed = 0; - static constexpr uint64_t Strict = 1; - static constexpr uint64_t Posted = 2; + static constexpr uint64_t Strict = 1; + static constexpr uint64_t Posted = 2; - bool check(const tlb_offsets & offset) const; + bool check(const tlb_offsets &offset) const; std::pair apply_offset(const tlb_offsets &offset) const; }; diff --git a/device/api/umd/device/tt_arch_types.h b/device/api/umd/device/tt_arch_types.h index 8a7c5dba..c165bf1b 100644 --- a/device/api/umd/device/tt_arch_types.h +++ b/device/api/umd/device/tt_arch_types.h @@ -17,4 +17,4 @@ enum class ARCH { BLACKHOLE = 3, Invalid = 0xFF, }; -} +} // namespace tt diff --git a/device/api/umd/device/tt_cluster_descriptor.h b/device/api/umd/device/tt_cluster_descriptor.h index 85d62c33..c39bdd93 100644 --- a/device/api/umd/device/tt_cluster_descriptor.h +++ b/device/api/umd/device/tt_cluster_descriptor.h @@ -4,23 
+4,24 @@ * SPDX-License-Identifier: Apache-2.0 */ - #pragma once -#include "umd/device/tt_xy_pair.h" - #include -#include -#include -#include #include -#include +#include +#include #include +#include +#include +#include #include -#include + #include "umd/device/tt_cluster_descriptor_types.h" +#include "umd/device/tt_xy_pair.h" -namespace YAML { class Node; } +namespace YAML { +class Node; +} enum BoardType : uint32_t { N150 = 0, @@ -32,88 +33,93 @@ enum BoardType : uint32_t { }; class tt_ClusterDescriptor { - - private: - int get_ethernet_link_coord_distance(const eth_coord_t &location_a, const eth_coord_t &location_b) const; - - protected: - - std::unordered_map > > ethernet_connections; - std::unordered_map chip_locations; - // reverse map: rack/shelf/y/x -> chip_id - std::map > > > coords_to_chip_ids; - std::unordered_map chips_with_mmio; - std::unordered_set all_chips; - std::unordered_map noc_translation_enabled = {}; - std::unordered_map harvesting_masks = {}; - std::unordered_set enabled_active_chips; - std::unordered_map closest_mmio_chip_cache = {}; - std::unordered_map chip_board_type = {}; - std::unordered_map> chips_grouped_by_closest_mmio; - - // one-to-many chip connections - struct Chip2ChipConnection { - eth_coord_t source_chip_coord; - std::unordered_set destination_chip_coords; - }; - - // shelf_id -> y dim -> list of chip2chip connections between different shelves - // assumption is that on every row of the shelf there is a chip that is connected to the other shelf - // there could be one-to-many connections between shelves, i.e. 
one chip is connected to multiple chips on the other shelf (in case of nebula->galaxy) - std::unordered_map > galaxy_shelves_exit_chip_coords_per_y_dim = {}; - // rack_id -> x dim -> list of chip2chip connections between different racks - // assumption is that on every row of the rack there is a chip that is connected to the other rack - std::unordered_map > galaxy_racks_exit_chip_coords_per_x_dim = {}; - - static void load_ethernet_connections_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc); - static void load_chips_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc); - static void load_harvesting_information(YAML::Node &yaml, tt_ClusterDescriptor &desc); - - void fill_chips_grouped_by_closest_mmio(); - - public: - tt_ClusterDescriptor() = default; - tt_ClusterDescriptor(const tt_ClusterDescriptor&) = default; - - /* - * Returns the pairs of channels that are connected where the first entry in the pair corresponds to the argument ordering when calling the function - * An empty result implies that the two chips do not share any direct connection - */ - std::vector> get_directly_connected_ethernet_channels_between_chips(const chip_id_t &first, const chip_id_t &second) const; - - bool is_chip_mmio_capable(const chip_id_t chip_id) const; - bool is_chip_remote(const chip_id_t chip_id) const; - chip_id_t get_closest_mmio_capable_chip(const chip_id_t chip); - chip_id_t get_shelf_local_physical_chip_coords(chip_id_t virtual_coord); - - // TODO: These following functions will be removed, and ClusterDescriptor will be created without any parameters. - // get_cluster_descriptor_file_path will create ethernet map in the background. - static std::string get_cluster_descriptor_file_path(); - static std::unique_ptr create_from_yaml(const std::string &cluster_descriptor_file_path); - - // TODO: This function is used to create mock cluster descriptor yaml files, for example for simulation. 
- // The name of the function is kept to not gate the changes regarding create-ethernet-map. - // It should be renamed to something like create_mock_cluster_descriptor and changed in tt-metal/tt-debuda. - static std::unique_ptr create_for_grayskull_cluster( - const std::set &logical_mmio_device_ids, - const std::vector &physical_mmio_device_ids); - - const std::unordered_map& get_harvesting_info() const; - const std::unordered_map& get_noc_translation_table_en() const; - const std::unordered_map& get_chip_locations() const; - const std::unordered_map > > get_ethernet_connections() const; - const std::unordered_map get_chips_with_mmio() const; - const std::unordered_set& get_all_chips() const; - const std::unordered_map>& get_chips_grouped_by_closest_mmio() const; - std::size_t get_number_of_chips() const; - - int get_ethernet_link_distance(chip_id_t chip_a, chip_id_t chip_b) const; - - BoardType get_board_type(chip_id_t chip_id) const; - - bool ethernet_core_has_active_ethernet_link(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const; - std::tuple get_chip_and_channel_of_remote_ethernet_core(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const; - - void enable_all_devices(); - +private: + int get_ethernet_link_coord_distance(const eth_coord_t &location_a, const eth_coord_t &location_b) const; + +protected: + std::unordered_map>> + ethernet_connections; + std::unordered_map chip_locations; + // reverse map: rack/shelf/y/x -> chip_id + std::map>>> coords_to_chip_ids; + std::unordered_map chips_with_mmio; + std::unordered_set all_chips; + std::unordered_map noc_translation_enabled = {}; + std::unordered_map harvesting_masks = {}; + std::unordered_set enabled_active_chips; + std::unordered_map closest_mmio_chip_cache = {}; + std::unordered_map chip_board_type = {}; + std::unordered_map> chips_grouped_by_closest_mmio; + + // one-to-many chip connections + struct Chip2ChipConnection { + eth_coord_t source_chip_coord; + 
std::unordered_set destination_chip_coords; + }; + + // shelf_id -> y dim -> list of chip2chip connections between different shelves + // assumption is that on every row of the shelf there is a chip that is connected to the other shelf + // there could be one-to-many connections between shelves, i.e. one chip is connected to multiple chips on the other + // shelf (in case of nebula->galaxy) + std::unordered_map> galaxy_shelves_exit_chip_coords_per_y_dim = + {}; + // rack_id -> x dim -> list of chip2chip connections between different racks + // assumption is that on every row of the rack there is a chip that is connected to the other rack + std::unordered_map> galaxy_racks_exit_chip_coords_per_x_dim = {}; + + static void load_ethernet_connections_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc); + static void fill_galaxy_connections(tt_ClusterDescriptor &desc); + static void load_chips_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc); + static void merge_cluster_ids(tt_ClusterDescriptor &desc); + static void load_harvesting_information(YAML::Node &yaml, tt_ClusterDescriptor &desc); + + void fill_chips_grouped_by_closest_mmio(); + +public: + tt_ClusterDescriptor() = default; + tt_ClusterDescriptor(const tt_ClusterDescriptor &) = default; + + /* + * Returns the pairs of channels that are connected where the first entry in the pair corresponds to the argument + * ordering when calling the function An empty result implies that the two chips do not share any direct connection + */ + std::vector> + get_directly_connected_ethernet_channels_between_chips(const chip_id_t &first, const chip_id_t &second) const; + + bool is_chip_mmio_capable(const chip_id_t chip_id) const; + bool is_chip_remote(const chip_id_t chip_id) const; + chip_id_t get_closest_mmio_capable_chip(const chip_id_t chip); + chip_id_t get_shelf_local_physical_chip_coords(chip_id_t virtual_coord); + + // TODO: These following functions will be removed, and 
ClusterDescriptor will be created without any parameters. + // get_cluster_descriptor_file_path will create ethernet map in the background. + static std::string get_cluster_descriptor_file_path(); + static std::unique_ptr create_from_yaml(const std::string &cluster_descriptor_file_path); + + // TODO: This function is used to create mock cluster descriptor yaml files, for example for simulation. + // The name of the function is kept to not gate the changes regarding create-ethernet-map. + // It should be renamed to something like create_mock_cluster_descriptor and changed in tt-metal/tt-debuda. + static std::unique_ptr create_for_grayskull_cluster( + const std::set &logical_mmio_device_ids, const std::vector &physical_mmio_device_ids); + + const std::unordered_map &get_harvesting_info() const; + const std::unordered_map &get_noc_translation_table_en() const; + const std::unordered_map &get_chip_locations() const; + const std:: + unordered_map>> + get_ethernet_connections() const; + const std::unordered_map get_chips_with_mmio() const; + const std::unordered_set &get_all_chips() const; + const std::unordered_map> &get_chips_grouped_by_closest_mmio() const; + std::size_t get_number_of_chips() const; + + int get_ethernet_link_distance(chip_id_t chip_a, chip_id_t chip_b) const; + + BoardType get_board_type(chip_id_t chip_id) const; + + bool ethernet_core_has_active_ethernet_link(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const; + std::tuple get_chip_and_channel_of_remote_ethernet_core( + chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const; + + void enable_all_devices(); }; diff --git a/device/api/umd/device/tt_cluster_descriptor_types.h b/device/api/umd/device/tt_cluster_descriptor_types.h index 142c9fef..81b652f5 100644 --- a/device/api/umd/device/tt_cluster_descriptor_types.h +++ b/device/api/umd/device/tt_cluster_descriptor_types.h @@ -4,25 +4,47 @@ * SPDX-License-Identifier: Apache-2.0 */ -#pragma once +#pragma once #include 
#include using chip_id_t = int; using ethernet_channel_t = int; -using eth_coord_t = std::tuple; // x, y, rack, shelf + +struct eth_coord_t { + int cluster_id; // This is the same for connected chips. + int x; + int y; + int rack; + int shelf; + + // in C++20 this should be defined as: + // constexpr bool operator==(const eth_coord_t &other) const noexcept = default; + constexpr bool operator==(const eth_coord_t &other) const noexcept { + return ( + cluster_id == other.cluster_id and x == other.x and y == other.y and rack == other.rack and + shelf == other.shelf); + } +}; + +// Small performant hash combiner taken from boost library. +// Not using boost::hash_combine due to dependency complications. +inline void boost_hash_combine(std::size_t &seed, const int value) { + seed ^= value + 0x9e3779b9 + (seed << 6) + (seed >> 2); +} namespace std { template <> struct hash { - std::size_t operator()(eth_coord_t const &c) const { - std::size_t seed = 0; - seed = std::hash()(std::get<0>(c)) << 48 | - std::hash()(std::get<1>(c)) << 32 | - std::hash()(std::get<2>(c)) << 16 | - std::hash()(std::get<3>(c)); - return seed; - } + std::size_t operator()(eth_coord_t const &c) const { + std::size_t seed = 0; + boost_hash_combine(seed, c.cluster_id); + boost_hash_combine(seed, c.x); + boost_hash_combine(seed, c.y); + boost_hash_combine(seed, c.rack); + boost_hash_combine(seed, c.shelf); + return seed; + } }; -} +} // namespace std diff --git a/device/api/umd/device/tt_io.hpp b/device/api/umd/device/tt_io.hpp index 8d0203e3..174903cb 100644 --- a/device/api/umd/device/tt_io.hpp +++ b/device/api/umd/device/tt_io.hpp @@ -11,7 +11,7 @@ namespace tt { namespace umd { - class Cluster; +class Cluster; } /** @@ -22,20 +22,18 @@ namespace umd { * * It is the caller's responsibility to manage the lifetime of Writer objects. */ -class Writer -{ +class Writer { friend class tt::umd::Cluster; public: /** * @brief Write to a SoC core. 
- * + * * @param address must be aligned to the size of T - * @param value + * @param value */ template - void write(uint32_t address, T value) - { + void write(uint32_t address, T value) { auto dst = reinterpret_cast(base) + address; if (address >= tlb_size) { @@ -46,27 +44,23 @@ class Writer throw std::runtime_error("Unaligned write"); } - *reinterpret_cast(dst) = value; + *reinterpret_cast(dst) = value; } private: /** * @brief tt::umd::Cluster interface to construct a new Writer object. - * + * * @param base pointer to the base address of a mapped TLB. * @param tlb_size size of the mapped TLB. */ - Writer(void *base, size_t tlb_size) - : base(base) - , tlb_size(tlb_size) - { + Writer(void *base, size_t tlb_size) : base(base), tlb_size(tlb_size) { assert(base); assert(tlb_size > 0); } - void *base{ nullptr }; - size_t tlb_size{ 0 }; + void *base{nullptr}; + size_t tlb_size{0}; }; - -} // namespace tt +} // namespace tt diff --git a/device/api/umd/device/tt_silicon_driver_common.hpp b/device/api/umd/device/tt_silicon_driver_common.hpp index 9f275668..6dc6d7f4 100644 --- a/device/api/umd/device/tt_silicon_driver_common.hpp +++ b/device/api/umd/device/tt_silicon_driver_common.hpp @@ -9,53 +9,42 @@ #include #include -enum class TensixSoftResetOptions: std::uint32_t { +enum class TensixSoftResetOptions : std::uint32_t { NONE = 0, - BRISC = ((std::uint32_t) 1 << 11), - TRISC0 = ((std::uint32_t) 1 << 12), - TRISC1 = ((std::uint32_t) 1 << 13), - TRISC2 = ((std::uint32_t) 1 << 14), - NCRISC = ((std::uint32_t) 1 << 18), - STAGGERED_START = ((std::uint32_t) 1 << 31) + BRISC = ((std::uint32_t)1 << 11), + TRISC0 = ((std::uint32_t)1 << 12), + TRISC1 = ((std::uint32_t)1 << 13), + TRISC2 = ((std::uint32_t)1 << 14), + NCRISC = ((std::uint32_t)1 << 18), + STAGGERED_START = ((std::uint32_t)1 << 31) }; std::string TensixSoftResetOptionsToString(TensixSoftResetOptions value); + constexpr TensixSoftResetOptions operator|(TensixSoftResetOptions lhs, TensixSoftResetOptions rhs) { - 
return static_cast( - static_cast(lhs) | - static_cast(rhs) - ); + return static_cast(static_cast(lhs) | static_cast(rhs)); } constexpr TensixSoftResetOptions operator&(TensixSoftResetOptions lhs, TensixSoftResetOptions rhs) { - return static_cast( - static_cast(lhs) & - static_cast(rhs) - ); + return static_cast(static_cast(lhs) & static_cast(rhs)); } constexpr bool operator!=(TensixSoftResetOptions lhs, TensixSoftResetOptions rhs) { - return - static_cast(lhs) != - static_cast(rhs); + return static_cast(lhs) != static_cast(rhs); } -static constexpr TensixSoftResetOptions ALL_TRISC_SOFT_RESET = TensixSoftResetOptions::TRISC0 | - TensixSoftResetOptions::TRISC1 | - TensixSoftResetOptions::TRISC2; +static constexpr TensixSoftResetOptions ALL_TRISC_SOFT_RESET = + TensixSoftResetOptions::TRISC0 | TensixSoftResetOptions::TRISC1 | TensixSoftResetOptions::TRISC2; -static constexpr TensixSoftResetOptions ALL_TENSIX_SOFT_RESET = TensixSoftResetOptions::BRISC | - TensixSoftResetOptions::NCRISC | - TensixSoftResetOptions::STAGGERED_START | - ALL_TRISC_SOFT_RESET; +static constexpr TensixSoftResetOptions ALL_TENSIX_SOFT_RESET = + TensixSoftResetOptions::BRISC | TensixSoftResetOptions::NCRISC | TensixSoftResetOptions::STAGGERED_START | + ALL_TRISC_SOFT_RESET; -static constexpr TensixSoftResetOptions TENSIX_ASSERT_SOFT_RESET = TensixSoftResetOptions::BRISC | - TensixSoftResetOptions::NCRISC | - ALL_TRISC_SOFT_RESET; +static constexpr TensixSoftResetOptions TENSIX_ASSERT_SOFT_RESET = + TensixSoftResetOptions::BRISC | TensixSoftResetOptions::NCRISC | ALL_TRISC_SOFT_RESET; -static constexpr TensixSoftResetOptions TENSIX_DEASSERT_SOFT_RESET = TensixSoftResetOptions::NCRISC | - ALL_TRISC_SOFT_RESET | - TensixSoftResetOptions::STAGGERED_START; +static constexpr TensixSoftResetOptions TENSIX_DEASSERT_SOFT_RESET = + TensixSoftResetOptions::NCRISC | ALL_TRISC_SOFT_RESET | TensixSoftResetOptions::STAGGERED_START; -static constexpr TensixSoftResetOptions 
TENSIX_DEASSERT_SOFT_RESET_NO_STAGGER = TensixSoftResetOptions::NCRISC | - ALL_TRISC_SOFT_RESET; +static constexpr TensixSoftResetOptions TENSIX_DEASSERT_SOFT_RESET_NO_STAGGER = + TensixSoftResetOptions::NCRISC | ALL_TRISC_SOFT_RESET; diff --git a/device/api/umd/device/tt_simulation_device.h b/device/api/umd/device/tt_simulation_device.h index 955dd288..9b4778aa 100644 --- a/device/api/umd/device/tt_simulation_device.h +++ b/device/api/umd/device/tt_simulation_device.h @@ -13,43 +13,49 @@ #include "umd/device/cluster.h" #include "umd/device/tt_simulation_host.hpp" -class tt_SimulationDevice: public tt_device { - public: - tt_SimulationDevice(const std::string &sdesc_path); +class tt_SimulationDevice : public tt_device { +public: + tt_SimulationDevice(const std::string& sdesc_path); ~tt_SimulationDevice(); tt_SimulationHost host; - //Setup/Teardown Functions + // Setup/Teardown Functions virtual std::unordered_map& get_virtual_soc_descriptors(); virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_); virtual void set_driver_host_address_params(const tt_driver_host_address_params& host_address_params_); virtual void set_driver_eth_interface_params(const tt_driver_eth_interface_params& eth_interface_params_); - virtual void start_device(const tt_device_params &device_params); + virtual void start_device(const tt_device_params& device_params); virtual void assert_risc_reset(); virtual void deassert_risc_reset(); - virtual void deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets = TENSIX_DEASSERT_SOFT_RESET); + virtual void deassert_risc_reset_at_core( + tt_cxy_pair core, const TensixSoftResetOptions& soft_resets = TENSIX_DEASSERT_SOFT_RESET); virtual void assert_risc_reset_at_core(tt_cxy_pair core); virtual void close_device(); // Runtime Functions - virtual void write_to_device(const void 
*mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); - virtual void read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); + virtual void write_to_device( + const void* mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); + virtual void read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); virtual void wait_for_non_mmio_flush(); virtual void wait_for_non_mmio_flush(const chip_id_t chip); - void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + void l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); // Misc. 
Functions to Query/Set Device State // virtual bool using_harvested_soc_descriptors(); virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); static std::vector detect_available_device_ids(); virtual std::set get_target_remote_device_ids(); - virtual std::map get_clocks(); - virtual void *host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; + virtual std::map get_clocks(); + virtual void* host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; virtual std::uint64_t get_pcie_base_addr_from_device(const chip_id_t chip_id) const; virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id); virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel); @@ -57,7 +63,7 @@ class tt_SimulationDevice: public tt_device { virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel); virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id); - private: +private: // State variables tt_device_dram_address_params dram_address_params; tt_device_l1_address_params l1_address_params; diff --git a/device/api/umd/device/tt_simulation_host.hpp b/device/api/umd/device/tt_simulation_host.hpp index 636c51e8..2db54394 100644 --- a/device/api/umd/device/tt_simulation_host.hpp +++ b/device/api/umd/device/tt_simulation_host.hpp @@ -1,9 +1,9 @@ // SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. 
// // SPDX-License-Identifier: Apache-2.0 -#include #include #include +#include #include "umd/device/tt_xy_pair.h" @@ -20,6 +20,7 @@ class tt_SimulationHost { void start_host(); void send_to_device(uint8_t *buf, size_t buf_size); size_t recv_from_device(void **data_ptr); + private: std::unique_ptr host_socket; std::unique_ptr host_dialer; diff --git a/device/api/umd/device/tt_soc_descriptor.h b/device/api/umd/device/tt_soc_descriptor.h index 3a284d0a..e59d4416 100644 --- a/device/api/umd/device/tt_soc_descriptor.h +++ b/device/api/umd/device/tt_soc_descriptor.h @@ -7,29 +7,26 @@ #pragma once #include -#include +#include +#include #include +#include #include #include -#include -#include -#include - -#include "umd/device/tt_xy_pair.h" -#include "umd/device/tt_arch_types.h" - -#include "umd/device/coordinate_manager.h" - #include "fmt/core.h" +#include "tt_xy_pair.h" +#include "umd/device/coordinate_manager.h" +#include "umd/device/tt_arch_types.h" +#include "umd/device/tt_xy_pair.h" namespace YAML { - class Node; +class Node; } std::ostream &operator<<(std::ostream &out, const tt::ARCH &arch_name); -static inline std::string get_arch_str(const tt::ARCH arch_name){ +static inline std::string get_arch_str(const tt::ARCH arch_name) { std::string arch_name_str; if (arch_name == tt::ARCH::GRAYSKULL) { @@ -45,16 +42,18 @@ static inline std::string get_arch_str(const tt::ARCH arch_name){ return arch_name_str; } -static inline tt::ARCH get_arch_name(const std::string &arch_str){ +static inline tt::ARCH get_arch_name(const std::string &arch_str) { tt::ARCH arch; if ((arch_str == "grayskull") || (arch_str == "GRAYSKULL")) { arch = tt::ARCH::GRAYSKULL; - } else if ((arch_str == "wormhole") || (arch_str == "WORMHOLE") || (arch_str == "wormhole_b0") || (arch_str == "WORMHOLE_B0")){ + } else if ( + (arch_str == "wormhole") || (arch_str == "WORMHOLE") || (arch_str == "wormhole_b0") || + (arch_str == "WORMHOLE_B0")) { arch = tt::ARCH::WORMHOLE_B0; - } else if ((arch_str == 
"blackhole") || (arch_str == "BLACKHOLE")){ + } else if ((arch_str == "blackhole") || (arch_str == "BLACKHOLE")) { arch = tt::ARCH::BLACKHOLE; - }else { + } else { throw std::runtime_error( fmt::format("At LoadSocDescriptorFromYaml: \"{}\" is not recognized as tt::ARCH.", arch_str)); } @@ -69,13 +68,13 @@ tt_xy_pair format_node(std::string str); //! SocCore type enumerations /*! Superset for all chip generations */ enum class CoreType { - ARC, - DRAM, - ETH, - PCIE, - WORKER, - HARVESTED, - ROUTER_ONLY, + ARC, + DRAM, + ETH, + PCIE, + WORKER, + HARVESTED, + ROUTER_ONLY, }; @@ -84,10 +83,10 @@ enum class CoreType { Should only contain relevant configuration for SOC */ struct CoreDescriptor { - tt_xy_pair coord = tt_xy_pair(0, 0); - CoreType type; + tt_xy_pair coord = tt_xy_pair(0, 0); + CoreType type; - std::size_t l1_size = 0; + std::size_t l1_size = 0; }; //! tt_SocDescriptor contains information regarding the SOC configuration targetted. @@ -95,7 +94,6 @@ struct CoreDescriptor { Should only contain relevant configuration for SOC */ class tt_SocDescriptor { - public: tt::ARCH arch; tt_xy_pair grid_size; @@ -110,13 +108,15 @@ class tt_SocDescriptor { std::unordered_map worker_log_to_routing_y; std::unordered_map routing_x_to_worker_x; std::unordered_map routing_y_to_worker_y; - std::vector> dram_cores; // per channel list of dram cores + std::vector> dram_cores; // per channel list of dram cores std::unordered_map> dram_core_channel_map; // map dram core to chan/subchan - std::vector ethernet_cores; // ethernet cores (index == channel id) - std::unordered_map ethernet_core_channel_map; + std::vector ethernet_cores; // ethernet cores (index == channel id) + std::unordered_map ethernet_core_channel_map; std::vector trisc_sizes; // Most of software stack assumes same trisc size for whole chip.. 
std::string device_descriptor_file_path = std::string(""); + bool has(tt_xy_pair input) { return cores.find(input) != cores.end(); } + int overlay_version; int unpacker_version; int dst_size_alignment; @@ -129,15 +129,15 @@ class tt_SocDescriptor { int get_num_dram_channels() const; bool is_worker_core(const tt_xy_pair &core) const; tt_xy_pair get_core_for_dram_channel(int dram_chan, int subchannel) const; - bool is_ethernet_core(const tt_xy_pair& core) const; + bool is_ethernet_core(const tt_xy_pair &core) const; // Default constructor. Creates uninitialized object with public access to all of its attributes. tt_SocDescriptor() = default; - // Constructor used to build object from device descriptor file. + // Constructor used to build object from device descriptor file. tt_SocDescriptor(std::string device_descriptor_path, std::size_t harvesting_mask = 0); // Copy constructor - tt_SocDescriptor(const tt_SocDescriptor& other) : + tt_SocDescriptor(const tt_SocDescriptor &other) : arch(other.arch), grid_size(other.grid_size), physical_grid_size(other.physical_grid_size), @@ -167,7 +167,7 @@ class tt_SocDescriptor { dram_bank_size(other.dram_bank_size) { coordinate_manager.reset(new CoordinateManager(*other.coordinate_manager)); } - + // Coordinate conversions. // Conversions from logical coordinates should be used just for worker cores. @@ -189,11 +189,14 @@ class tt_SocDescriptor { void perform_harvesting(std::size_t harvesting_mask); + static std::string get_soc_descriptor_path(tt::ARCH arch); + private: - std::unique_ptr coordinate_manager = nullptr; void create_coordinate_manager(std::size_t harvesting_mask); void load_core_descriptors_from_device_descriptor(YAML::Node &device_descriptor_yaml); void load_soc_features_from_device_descriptor(YAML::Node &device_descriptor_yaml); + + std::unique_ptr coordinate_manager = nullptr; }; // Allocates a new soc descriptor on the heap. Returns an owning pointer. 
diff --git a/device/api/umd/device/tt_xy_pair.h b/device/api/umd/device/tt_xy_pair.h index b86d568e..9375182f 100644 --- a/device/api/umd/device/tt_xy_pair.h +++ b/device/api/umd/device/tt_xy_pair.h @@ -15,44 +15,56 @@ using tt_cxy_pair = tt::umd::cxy_pair; struct tt_physical_coords : public tt_xy_pair { tt_physical_coords() : tt_xy_pair() {} + tt_physical_coords(std::size_t x, std::size_t y) : tt_xy_pair(x, y) {} }; struct tt_chip_physical_coords : public tt_cxy_pair { tt_chip_physical_coords() : tt_cxy_pair() {} + tt_chip_physical_coords(std::size_t ichip, xy_pair pair) : tt_cxy_pair(ichip, pair) {} + tt_chip_physical_coords(std::size_t ichip, std::size_t x, std::size_t y) : tt_cxy_pair(ichip, x, y) {} }; struct tt_logical_coords : public tt_xy_pair { tt_logical_coords() : tt_xy_pair() {} + tt_logical_coords(std::size_t x, std::size_t y) : tt_xy_pair(x, y) {} }; struct tt_chip_logical_coords : public tt_cxy_pair { tt_chip_logical_coords() : tt_cxy_pair() {} + tt_chip_logical_coords(std::size_t ichip, xy_pair pair) : tt_cxy_pair(ichip, pair) {} + tt_chip_logical_coords(std::size_t ichip, std::size_t x, std::size_t y) : tt_cxy_pair(ichip, x, y) {} }; struct tt_virtual_coords : public tt_xy_pair { tt_virtual_coords() : tt_xy_pair() {} + tt_virtual_coords(std::size_t x, std::size_t y) : tt_xy_pair(x, y) {} }; struct tt_chip_virtual_coords : public tt_cxy_pair { tt_chip_virtual_coords() : tt_cxy_pair() {} + tt_chip_virtual_coords(std::size_t ichip, xy_pair pair) : tt_cxy_pair(ichip, pair) {} + tt_chip_virtual_coords(std::size_t ichip, std::size_t x, std::size_t y) : tt_cxy_pair(ichip, x, y) {} }; struct tt_translated_coords : public tt_xy_pair { tt_translated_coords() : tt_xy_pair() {} + tt_translated_coords(std::size_t x, std::size_t y) : tt_xy_pair(x, y) {} }; struct tt_chip_translated_coords : public tt_cxy_pair { tt_chip_translated_coords() : tt_cxy_pair() {} + tt_chip_translated_coords(std::size_t ichip, xy_pair pair) : tt_cxy_pair(ichip, pair) {} + 
tt_chip_translated_coords(std::size_t ichip, std::size_t x, std::size_t y) : tt_cxy_pair(ichip, x, y) {} }; diff --git a/device/api/umd/device/wormhole_implementation.h b/device/api/umd/device/wormhole_implementation.h index 7bef1e9e..3dfebb96 100644 --- a/device/api/umd/device/wormhole_implementation.h +++ b/device/api/umd/device/wormhole_implementation.h @@ -167,7 +167,8 @@ static constexpr uint32_t TLB_BASE_INDEX_16M = TLB_BASE_INDEX_2M + TLB_COUNT_2M; static constexpr uint32_t DYNAMIC_TLB_COUNT = 16; static constexpr uint32_t DYNAMIC_TLB_16M_SIZE = 16 * 1024 * 1024; -static constexpr uint32_t DYNAMIC_TLB_16M_CFG_ADDR = STATIC_TLB_CFG_ADDR + (TLB_BASE_INDEX_16M * TLB_CFG_REG_SIZE_BYTES); +static constexpr uint32_t DYNAMIC_TLB_16M_CFG_ADDR = + STATIC_TLB_CFG_ADDR + (TLB_BASE_INDEX_16M * TLB_CFG_REG_SIZE_BYTES); static constexpr uint32_t DYNAMIC_TLB_16M_BASE = TLB_BASE_16M; static constexpr uint32_t DYNAMIC_TLB_2M_SIZE = 2 * 1024 * 1024; @@ -205,59 +206,93 @@ static constexpr uint32_t TENSIX_SOFT_RESET_ADDR = 0xFFB121B0; } // namespace wormhole class wormhole_implementation : public architecture_implementation { - public: +public: tt::ARCH get_architecture() const override { return tt::ARCH::WORMHOLE_B0; } + uint32_t get_arc_message_arc_get_harvesting() const override { return static_cast(wormhole::arc_message_type::ARC_GET_HARVESTING); } + uint32_t get_arc_message_arc_go_busy() const override { return static_cast(wormhole::arc_message_type::ARC_GO_BUSY); } + uint32_t get_arc_message_arc_go_long_idle() const override { return static_cast(wormhole::arc_message_type::ARC_GO_LONG_IDLE); } + uint32_t get_arc_message_arc_go_short_idle() const override { return static_cast(wormhole::arc_message_type::ARC_GO_SHORT_IDLE); } + uint32_t get_arc_message_deassert_riscv_reset() const override { return static_cast(wormhole::arc_message_type::DEASSERT_RISCV_RESET); } + uint32_t get_arc_message_get_aiclk() const override { return 
static_cast(wormhole::arc_message_type::GET_AICLK); } + uint32_t get_arc_message_setup_iatu_for_peer_to_peer() const override { return static_cast(wormhole::arc_message_type::SETUP_IATU_FOR_PEER_TO_PEER); } + uint32_t get_arc_message_test() const override { return static_cast(wormhole::arc_message_type::TEST); } + uint32_t get_arc_csm_mailbox_offset() const override { return wormhole::ARC_CSM_MAILBOX_OFFSET; } + uint32_t get_arc_reset_arc_misc_cntl_offset() const override { return wormhole::ARC_RESET_ARC_MISC_CNTL_OFFSET; } + uint32_t get_arc_reset_scratch_offset() const override { return wormhole::ARC_RESET_SCRATCH_OFFSET; } + uint32_t get_dram_channel_0_peer2peer_region_start() const override { return wormhole::DRAM_CHANNEL_0_PEER2PEER_REGION_START; } + uint32_t get_dram_channel_0_x() const override { return wormhole::DRAM_CHANNEL_0_X; } + uint32_t get_dram_channel_0_y() const override { return wormhole::DRAM_CHANNEL_0_Y; } + uint32_t get_broadcast_tlb_index() const override { return wormhole::BROADCAST_TLB_INDEX; } + uint32_t get_dynamic_tlb_2m_base() const override { return wormhole::DYNAMIC_TLB_2M_BASE; } + uint32_t get_dynamic_tlb_2m_size() const override { return wormhole::DYNAMIC_TLB_2M_SIZE; } + uint32_t get_dynamic_tlb_16m_base() const override { return wormhole::DYNAMIC_TLB_16M_BASE; } + uint32_t get_dynamic_tlb_16m_size() const override { return wormhole::DYNAMIC_TLB_16M_SIZE; } + uint32_t get_dynamic_tlb_16m_cfg_addr() const override { return wormhole::DYNAMIC_TLB_16M_CFG_ADDR; } + uint32_t get_mem_large_read_tlb() const override { return wormhole::MEM_LARGE_READ_TLB; } + uint32_t get_mem_large_write_tlb() const override { return wormhole::MEM_LARGE_WRITE_TLB; } + uint32_t get_static_tlb_cfg_addr() const override { return wormhole::STATIC_TLB_CFG_ADDR; } + uint32_t get_static_tlb_size() const override { return wormhole::STATIC_TLB_SIZE; } + uint32_t get_reg_tlb() const override { return wormhole::REG_TLB; } + uint32_t get_tlb_base_index_16m() const 
override { return wormhole::TLB_BASE_INDEX_16M; } + uint32_t get_tensix_soft_reset_addr() const override { return wormhole::TENSIX_SOFT_RESET_ADDR; } + uint32_t get_grid_size_x() const override { return wormhole::GRID_SIZE_X; } + uint32_t get_grid_size_y() const override { return wormhole::GRID_SIZE_Y; } + uint32_t get_tlb_cfg_reg_size_bytes() const override { return wormhole::TLB_CFG_REG_SIZE_BYTES; } + uint32_t get_small_read_write_tlb() const override { return wormhole::MEM_SMALL_READ_WRITE_TLB; } + const std::vector& get_harvesting_noc_locations() const override { return wormhole::HARVESTING_NOC_LOCATIONS; } + const std::vector& get_t6_x_locations() const override { return wormhole::T6_X_LOCATIONS; } + const std::vector& get_t6_y_locations() const override { return wormhole::T6_Y_LOCATIONS; } std::tuple multicast_workaround(xy_pair start, xy_pair end) const override; @@ -268,7 +303,6 @@ class wormhole_implementation : public architecture_implementation { tt_driver_host_address_params get_host_address_params() const override; tt_driver_eth_interface_params get_eth_interface_params() const override; tt_driver_noc_params get_noc_params() const override; - }; } // namespace tt::umd diff --git a/device/api/umd/device/xy_pair.h b/device/api/umd/device/xy_pair.h index ca717052..b989b31e 100644 --- a/device/api/umd/device/xy_pair.h +++ b/device/api/umd/device/xy_pair.h @@ -12,6 +12,7 @@ namespace tt::umd { struct xy_pair { constexpr xy_pair() : x{}, y{} {} + constexpr xy_pair(std::size_t x, std::size_t y) : x(x), y(y) {} std::size_t x; @@ -30,7 +31,9 @@ constexpr inline bool operator<(const xy_pair &left, const xy_pair &right) { struct cxy_pair : public xy_pair { cxy_pair() : xy_pair{}, chip{} {} + cxy_pair(std::size_t ichip, xy_pair pair) : xy_pair(pair.x, pair.y), chip(ichip) {} + cxy_pair(std::size_t ichip, std::size_t x, std::size_t y) : xy_pair(x, y), chip(ichip) {} std::size_t chip; diff --git a/device/architecture_implementation.cpp 
b/device/architecture_implementation.cpp index 4b7b7faf..dc0c0b00 100644 --- a/device/architecture_implementation.cpp +++ b/device/architecture_implementation.cpp @@ -12,10 +12,14 @@ namespace tt::umd { std::unique_ptr architecture_implementation::create(tt::ARCH architecture) { switch (architecture) { - case tt::ARCH::BLACKHOLE: return std::make_unique(); - case tt::ARCH::GRAYSKULL: return std::make_unique(); - case tt::ARCH::WORMHOLE_B0: return std::make_unique(); - default: return nullptr; + case tt::ARCH::BLACKHOLE: + return std::make_unique(); + case tt::ARCH::GRAYSKULL: + return std::make_unique(); + case tt::ARCH::WORMHOLE_B0: + return std::make_unique(); + default: + return nullptr; } } diff --git a/device/blackhole/blackhole_coordinate_manager.h b/device/blackhole/blackhole_coordinate_manager.h index 7491f272..9a00b46d 100644 --- a/device/blackhole/blackhole_coordinate_manager.h +++ b/device/blackhole/blackhole_coordinate_manager.h @@ -9,15 +9,15 @@ #include "umd/device/coordinate_manager.h" class BlackholeCoordinateManager : public CoordinateManager { - public: - BlackholeCoordinateManager(const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) - : CoordinateManager(worker_grid_size, workers, harvesting_mask) {} + BlackholeCoordinateManager( + const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) : + CoordinateManager(worker_grid_size, workers, harvesting_mask) {} tt_translated_coords to_translated_coords(tt_logical_coords logical_coords) override; tt_logical_coords to_logical_coords(tt_translated_coords translated_coords) override; -protected: +protected: std::set get_x_coordinates_to_harvest(std::size_t harvesting_mask) override; }; diff --git a/device/blackhole/blackhole_implementation.cpp b/device/blackhole/blackhole_implementation.cpp index 0421e2d1..14de739e 100644 --- a/device/blackhole/blackhole_implementation.cpp +++ b/device/blackhole/blackhole_implementation.cpp @@ 
-4,13 +4,12 @@ #include "umd/device/blackhole_implementation.h" -#include "blackhole/host_mem_address_map.h" #include "blackhole/eth_interface.h" - +#include "blackhole/host_mem_address_map.h" #include "umd/device/cluster.h" -constexpr std::uint32_t NOC_ADDR_LOCAL_BITS = 36; // source: noc_parameters.h, common for WH && BH -constexpr std::uint32_t NOC_ADDR_NODE_ID_BITS = 6; // source: noc_parameters.h, common for WH && BH +constexpr std::uint32_t NOC_ADDR_LOCAL_BITS = 36; // source: noc_parameters.h, common for WH && BH +constexpr std::uint32_t NOC_ADDR_NODE_ID_BITS = 6; // source: noc_parameters.h, common for WH && BH namespace tt::umd { @@ -26,10 +25,9 @@ std::tuple blackhole_implementation::multicast_workaround(xy_p } tlb_configuration blackhole_implementation::get_tlb_configuration(uint32_t tlb_index) const { - // If TLB index is in range for 4GB tlbs (8 TLBs after 202 TLBs for 2MB) if (tlb_index >= blackhole::TLB_COUNT_2M && tlb_index < blackhole::TLB_COUNT_2M + blackhole::TLB_COUNT_4G) { - return tlb_configuration { + return tlb_configuration{ .size = blackhole::DYNAMIC_TLB_4G_SIZE, .base = blackhole::DYNAMIC_TLB_4G_BASE, .cfg_addr = blackhole::DYNAMIC_TLB_4G_CFG_ADDR, @@ -37,7 +35,7 @@ tlb_configuration blackhole_implementation::get_tlb_configuration(uint32_t tlb_i .offset = blackhole::TLB_4G_OFFSET, }; } - + return tlb_configuration{ .size = blackhole::DYNAMIC_TLB_2M_SIZE, .base = blackhole::DYNAMIC_TLB_2M_BASE, @@ -73,17 +71,17 @@ std::optional> blackhole_implementation std::pair blackhole_implementation::get_tlb_data( std::uint32_t tlb_index, const tlb_data& data) const { - if (tlb_index < blackhole::TLB_COUNT_2M) { return data.apply_offset(blackhole::TLB_2M_OFFSET); } else { throw std::runtime_error("Invalid TLB index for Blackhole arch"); } - } tt_driver_host_address_params blackhole_implementation::get_host_address_params() const { - return {::blackhole::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE, 
::blackhole::host_mem::address_map::ETH_ROUTING_BUFFERS_START}; + return { + ::blackhole::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE, + ::blackhole::host_mem::address_map::ETH_ROUTING_BUFFERS_START}; } tt_driver_eth_interface_params blackhole_implementation::get_eth_interface_params() const { diff --git a/device/cluster.cpp b/device/cluster.cpp index be574bfe..7163223d 100644 --- a/device/cluster.cpp +++ b/device/cluster.cpp @@ -3,61 +3,59 @@ // SPDX-License-Identifier: Apache-2.0 #include "umd/device/cluster.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#include #include -#include #include - +#include +#include +#include +#include +#include +#include +#include #include #include #include #include -#include #include #include +#include +#include #include #include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include +#include -#include "yaml-cpp/yaml.h" #include "logger.hpp" - -#include "umd/device/tt_cluster_descriptor.h" +#include "umd/device/architecture_implementation.h" #include "umd/device/driver_atomics.h" #include "umd/device/hugepage.h" -#include "umd/device/architecture_implementation.h" #include "umd/device/tlb.h" #include "umd/device/tt_arch_types.h" +#include "umd/device/tt_cluster_descriptor.h" +#include "yaml-cpp/yaml.h" using namespace boost::interprocess; using namespace tt; using namespace tt::umd; - static const uint32_t MSG_ERROR_REPLY = 0xFFFFFFFF; // TLB size for DRAM on blackhole - 4GB const uint64_t BH_4GB_TLB_SIZE = 4ULL * 1024 * 1024 * 1024; -static constexpr uint32_t HUGEPAGE_CHANNEL_3_SIZE_LIMIT = 805306368; // Remove 256MB from full 1GB for channel 3 (iATU limitation) +// Remove 256MB from full 1GB for channel 3 (iATU limitation) +static constexpr uint32_t HUGEPAGE_CHANNEL_3_SIZE_LIMIT = 805306368; // TODO: Remove in favor of cluster descriptor method, when it 
becomes available. // Metal uses this function to determine the architecture of the first PCIe chip @@ -93,7 +91,7 @@ tt::ARCH detect_arch() { } template -void size_buffer_to_capacity(std::vector &data_buf, std::size_t size_in_bytes) { +void size_buffer_to_capacity(std::vector& data_buf, std::size_t size_in_bytes) { std::size_t target_size = 0; if (size_in_bytes > 0) { target_size = ((size_in_bytes - 1) / sizeof(T)) + 1; @@ -103,11 +101,9 @@ void size_buffer_to_capacity(std::vector &data_buf, std::size_t size_in_bytes // TODO: To be removed when tt_device is removed -tt_device::tt_device(const std::string& sdesc_path) : soc_descriptor_per_chip({}) { -} +tt_device::tt_device() : soc_descriptor_per_chip({}) {} -tt_device::~tt_device() { -} +tt_device::~tt_device() {} const tt_SocDescriptor& tt_device::get_soc_descriptor(chip_id_t chip_id) const { return soc_descriptor_per_chip.at(chip_id); @@ -117,12 +113,12 @@ const tt_SocDescriptor& tt_device::get_soc_descriptor(chip_id_t chip_id) const { // -------------------------------------------------------------------------------------------------------------- // -------------------------------------------------------------------------------------------------------------- -#include "umd/device/tt_silicon_driver_common.hpp" -#include "umd/device/tt_xy_pair.h" -#include #include #include +#include +#include "umd/device/tt_silicon_driver_common.hpp" +#include "umd/device/tt_xy_pair.h" struct routing_cmd_t { uint64_t sys_addr; @@ -131,49 +127,53 @@ struct routing_cmd_t { uint16_t rack; uint16_t src_resp_buf_index; uint32_t local_buf_index; - uint8_t src_resp_q_id; - uint8_t host_mem_txn_id; + uint8_t src_resp_q_id; + uint8_t host_mem_txn_id; uint16_t padding; - uint32_t src_addr_tag; //upper 32-bits of request source address. + uint32_t src_addr_tag; // upper 32-bits of request source address. 
}; -struct remote_update_ptr_t{ - uint32_t ptr; - uint32_t pad[3]; +struct remote_update_ptr_t { + uint32_t ptr; + uint32_t pad[3]; }; namespace { - struct tt_4_byte_aligned_buffer { - // Stores a 4 byte aligned buffer - // If the input buffer is already 4 byte aligned, this is a nop - std::uint32_t* local_storage = nullptr; - std::uint32_t input_size = 0; - std::uint32_t block_size = 0; - - tt_4_byte_aligned_buffer(const void* mem_ptr, uint32_t size_in_bytes) { - input_size = size_in_bytes; - local_storage = (uint32_t*)mem_ptr; - uint32_t alignment_mask = sizeof(uint32_t) - 1; - uint32_t aligned_size = (size_in_bytes + alignment_mask) & ~alignment_mask; +struct tt_4_byte_aligned_buffer { + // Stores a 4 byte aligned buffer + // If the input buffer is already 4 byte aligned, this is a nop + std::uint32_t* local_storage = nullptr; + std::uint32_t input_size = 0; + std::uint32_t block_size = 0; - if(size_in_bytes < aligned_size) { - local_storage = new uint32_t[aligned_size / sizeof(uint32_t)]; - } - block_size = aligned_size; + tt_4_byte_aligned_buffer(const void* mem_ptr, uint32_t size_in_bytes) { + input_size = size_in_bytes; + local_storage = (uint32_t*)mem_ptr; + uint32_t alignment_mask = sizeof(uint32_t) - 1; + uint32_t aligned_size = (size_in_bytes + alignment_mask) & ~alignment_mask; + + if (size_in_bytes < aligned_size) { + local_storage = new uint32_t[aligned_size / sizeof(uint32_t)]; } + block_size = aligned_size; + } - ~tt_4_byte_aligned_buffer() { - if(block_size > input_size) { - delete [] local_storage; - } + ~tt_4_byte_aligned_buffer() { + if (block_size > input_size) { + delete[] local_storage; } - }; -} + } +}; +} // namespace namespace tt::umd { -bool Cluster::address_in_tlb_space(uint32_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, std::uint32_t chip) { - return ((tlb_config_map.at(chip).find(tlb_index) != tlb_config_map.at(chip).end()) && address >= tlb_config_map.at(chip).at(tlb_index) && (address + size_in_bytes <= 
tlb_config_map.at(chip).at(tlb_index) + tlb_size)); +bool Cluster::address_in_tlb_space( + uint32_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, std::uint32_t chip) { + return ( + (tlb_config_map.at(chip).find(tlb_index) != tlb_config_map.at(chip).end()) && + address >= tlb_config_map.at(chip).at(tlb_index) && + (address + size_in_bytes <= tlb_config_map.at(chip).at(tlb_index) + tlb_size)); } std::unordered_map& Cluster::get_virtual_soc_descriptors() { @@ -181,10 +181,10 @@ std::unordered_map& Cluster::get_virtual_soc_descri } void Cluster::initialize_interprocess_mutexes(int pci_interface_id, bool cleanup_mutexes_in_shm) { - // These mutexes are intended to be based on physical devices/pci-intf not logical. Set these up ahead of time here (during device init) - // since its unsafe to modify shared state during multithreaded runtime. - // cleanup_mutexes_in_shm is tied to clean_system_resources from the constructor. The main process is responsible for initializing the driver with this - // field set to cleanup after an aborted process. + // These mutexes are intended to be based on physical devices/pci-intf not logical. Set these up ahead of time here + // (during device init) since its unsafe to modify shared state during multithreaded runtime. cleanup_mutexes_in_shm + // is tied to clean_system_resources from the constructor. The main process is responsible for initializing the + // driver with this field set to cleanup after an aborted process. 
// Store old mask and clear processes umask auto old_umask = umask(0); @@ -193,236 +193,292 @@ void Cluster::initialize_interprocess_mutexes(int pci_interface_id, bool cleanup std::string mutex_name = ""; // Initialize Dynamic TLB mutexes - for(auto &tlb : dynamic_tlb_config) { + for (auto& tlb : dynamic_tlb_config) { mutex_name = tlb.first + std::to_string(pci_interface_id); - if (cleanup_mutexes_in_shm) named_mutex::remove(mutex_name.c_str()); - hardware_resource_mutex_map[mutex_name] = std::make_shared(open_or_create, mutex_name.c_str(), unrestricted_permissions); + if (cleanup_mutexes_in_shm) { + named_mutex::remove(mutex_name.c_str()); + } + hardware_resource_mutex_map[mutex_name] = + std::make_shared(open_or_create, mutex_name.c_str(), unrestricted_permissions); } // Initialize ARC core mutex mutex_name = fmt::format("ARC_MSG{}", pci_interface_id); - if (cleanup_mutexes_in_shm) named_mutex::remove(mutex_name.c_str()); - hardware_resource_mutex_map[mutex_name] = std::make_shared(open_or_create, mutex_name.c_str(), unrestricted_permissions); + if (cleanup_mutexes_in_shm) { + named_mutex::remove(mutex_name.c_str()); + } + hardware_resource_mutex_map[mutex_name] = + std::make_shared(open_or_create, mutex_name.c_str(), unrestricted_permissions); if (arch_name == tt::ARCH::WORMHOLE_B0) { mutex_name = NON_MMIO_MUTEX_NAME + std::to_string(pci_interface_id); - // Initialize non-MMIO mutexes for WH devices regardless of number of chips, since these may be used for ethernet broadcast - if (cleanup_mutexes_in_shm) named_mutex::remove(mutex_name.c_str()); - hardware_resource_mutex_map[mutex_name] = std::make_shared(open_or_create, mutex_name.c_str(), unrestricted_permissions); + // Initialize non-MMIO mutexes for WH devices regardless of number of chips, since these may be used for + // ethernet broadcast + if (cleanup_mutexes_in_shm) { + named_mutex::remove(mutex_name.c_str()); + } + hardware_resource_mutex_map[mutex_name] = + std::make_shared(open_or_create, 
mutex_name.c_str(), unrestricted_permissions); } // Initialize interprocess mutexes to make host -> device memory barriers atomic mutex_name = MEM_BARRIER_MUTEX_NAME + std::to_string(pci_interface_id); - if (cleanup_mutexes_in_shm) named_mutex::remove(mutex_name.c_str()); - hardware_resource_mutex_map[mutex_name] = std::make_shared(open_or_create, mutex_name.c_str(), unrestricted_permissions); - + if (cleanup_mutexes_in_shm) { + named_mutex::remove(mutex_name.c_str()); + } + hardware_resource_mutex_map[mutex_name] = + std::make_shared(open_or_create, mutex_name.c_str(), unrestricted_permissions); + // Restore old mask umask(old_umask); } -void Cluster::create_device(const std::unordered_set &target_mmio_device_ids, const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs, const bool clean_system_resources) { +void Cluster::create_device( + const std::unordered_set& target_mmio_device_ids, + const uint32_t& num_host_mem_ch_per_mmio_device, + const bool skip_driver_allocs, + const bool clean_system_resources) { log_debug(LogSiliconDriver, "Cluster::Cluster"); // Don't buffer stdout. setbuf(stdout, NULL); - // Just use PCI interface id from physical_device_id given by cluster desc mmio map. For GS, already virtualized to use available devices. + // Just use PCI interface id from physical_device_id given by cluster desc mmio map. For GS, already virtualized to + // use available devices. 
auto logical_to_physical_device_id_map = ndesc->get_chips_with_mmio(); - log_assert(target_mmio_device_ids.size() > 0, "Must provide set of target_mmio_device_ids to Cluster constructor now."); + log_assert( + target_mmio_device_ids.size() > 0, "Must provide set of target_mmio_device_ids to Cluster constructor now."); - for (const chip_id_t &logical_device_id : target_mmio_device_ids) { - log_assert(logical_to_physical_device_id_map.count(logical_device_id) != 0, "Cannot find logical mmio device_id: {} in cluster desc / logical-to-physical-map", logical_device_id); + for (const chip_id_t& logical_device_id : target_mmio_device_ids) { + log_assert( + logical_to_physical_device_id_map.count(logical_device_id) != 0, + "Cannot find logical mmio device_id: {} in cluster desc / logical-to-physical-map", + logical_device_id); int pci_interface_id = logical_to_physical_device_id_map.at(logical_device_id); if (!m_pci_device_map.count(logical_device_id)) { - log_debug(LogSiliconDriver, "Opening TT_PCI_INTERFACE_ID {} for netlist target_device_id: {}", pci_interface_id, logical_device_id); - m_pci_device_map.insert({logical_device_id, std::make_unique(pci_interface_id, logical_device_id)}); + log_debug( + LogSiliconDriver, + "Opening TT_PCI_INTERFACE_ID {} for netlist target_device_id: {}", + pci_interface_id, + logical_device_id); + m_pci_device_map.insert( + {logical_device_id, std::make_unique(pci_interface_id, logical_device_id)}); } auto dev = m_pci_device_map.at(logical_device_id).get(); uint16_t pcie_device_id = dev->get_pci_device_id(); uint32_t pcie_revision = dev->get_pci_revision(); // TODO: get rid of this, it doesn't make any sense. 
- int num_host_mem_channels = get_available_num_host_mem_channels(num_host_mem_ch_per_mmio_device, pcie_device_id, pcie_revision); + int num_host_mem_channels = + get_available_num_host_mem_channels(num_host_mem_ch_per_mmio_device, pcie_device_id, pcie_revision); if (dev->get_arch() == tt::ARCH::BLACKHOLE && num_host_mem_channels > 1) { // TODO: Implement support for multiple host channels on BLACKHOLE. - log_warning(LogSiliconDriver, "Forcing a single channel for Blackhole device. Multiple host channels not supported."); + log_warning( + LogSiliconDriver, + "Forcing a single channel for Blackhole device. Multiple host channels not supported."); num_host_mem_channels = 1; } - log_debug(LogSiliconDriver, "Using {} Hugepages/NumHostMemChannels for PCIDevice (logical_device_id: {} pci_interface_id: {} device_id: 0x{:x} revision: {})", - num_host_mem_channels, logical_device_id, pci_interface_id, pci_device->get_device_num(), pci_device->revision_id); + log_debug( + LogSiliconDriver, + "Using {} Hugepages/NumHostMemChannels for PCIDevice (logical_device_id: {} pci_interface_id: {} " + "device_id: 0x{:x} revision: {})", + num_host_mem_channels, + logical_device_id, + pci_interface_id, + pci_device->get_device_num(), + pci_device->revision_id); initialize_interprocess_mutexes(pci_interface_id, clean_system_resources); // MT: Initial BH - hugepages will fail init // For using silicon driver without workload to query mission mode params, no need for hugepage. - if (!skip_driver_allocs){ + if (!skip_driver_allocs) { // TODO: Implement support for multiple host channels on BLACKHOLE. 
- log_assert(!(arch_name == tt::ARCH::BLACKHOLE && num_host_mem_channels > 1), "More channels are not yet supported for Blackhole"); - bool hugepages_initialized = m_pci_device_map.at(logical_device_id)->init_hugepage(num_host_mem_channels); // Same number of host channels per device for now + log_assert( + !(arch_name == tt::ARCH::BLACKHOLE && num_host_mem_channels > 1), + "More channels are not yet supported for Blackhole"); + // Same number of host channels per device for now + bool hugepages_initialized = m_pci_device_map.at(logical_device_id)->init_hugepage(num_host_mem_channels); // Large writes to remote chips require hugepages to be initialized. - // Conservative assert - end workload if remote chips present but hugepages not initialized (failures caused if using remote only for small transactions) - if(target_remote_chips.size()) { - log_assert(hugepages_initialized, "Hugepages must be successfully initialized if workload contains remote chips!"); + // Conservative assert - end workload if remote chips present but hugepages not initialized (failures caused + // if using remote only for small transactions) + if (target_remote_chips.size()) { + log_assert( + hugepages_initialized, + "Hugepages must be successfully initialized if workload contains remote chips!"); } if (not m_pci_device_map.at(logical_device_id)->get_hugepage_mapping(0).mapping) { log_warning(LogSiliconDriver, "No hugepage mapping at device {}.", logical_device_id); } } - harvested_coord_translation.insert({logical_device_id, create_harvested_coord_translation(arch_name, true)}); //translation layer for harvested coords. Default is identity map + // translation layer for harvested coords. 
Default is identity map + harvested_coord_translation.insert({logical_device_id, create_harvested_coord_translation(arch_name, true)}); } - for(const chip_id_t& chip : target_devices_in_cluster) { + for (const chip_id_t& chip : target_devices_in_cluster) { // Initialize identity mapping for Non-MMIO chips as well - if(!ndesc -> is_chip_mmio_capable(chip)) { + if (!ndesc->is_chip_mmio_capable(chip)) { harvested_coord_translation.insert({chip, create_harvested_coord_translation(arch_name, true)}); flush_non_mmio_per_chip[chip] = false; } } } -bool Cluster::using_harvested_soc_descriptors() { - return perform_harvesting_on_sdesc && performed_harvesting; -} +bool Cluster::using_harvested_soc_descriptors() { return perform_harvesting_on_sdesc && performed_harvesting; } std::unordered_map Cluster::get_harvested_coord_translation_map(chip_id_t logical_device_id) { return harvested_coord_translation.at(logical_device_id); } std::unordered_map Cluster::get_harvesting_masks_for_soc_descriptors() { - if(using_harvested_soc_descriptors()) { + if (using_harvested_soc_descriptors()) { return harvested_rows_per_target; } std::unordered_map default_harvesting_masks = {}; - for(const auto chip : target_devices_in_cluster) default_harvesting_masks.insert({chip, 0}); + for (const auto chip : target_devices_in_cluster) { + default_harvesting_masks.insert({chip, 0}); + } return default_harvesting_masks; } -Cluster::Cluster(const std::string &sdesc_path, const std::string &ndesc_path, const std::set &target_devices, - const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs, - const bool clean_system_resources, bool perform_harvesting, std::unordered_map simulated_harvesting_masks) : tt_device(sdesc_path) { +void Cluster::construct_cluster( + const std::string& sdesc_path, + const uint32_t& num_host_mem_ch_per_mmio_device, + const bool skip_driver_allocs, + const bool clean_system_resources, + bool perform_harvesting, + std::unordered_map 
simulated_harvesting_masks) { std::unordered_set target_mmio_device_ids; - target_devices_in_cluster = target_devices; - arch_name = tt_SocDescriptor(sdesc_path).arch; - perform_harvesting_on_sdesc = perform_harvesting; - - auto available_device_ids = detect_available_device_ids(); - m_num_pci_devices = available_device_ids.size(); - - if (!skip_driver_allocs) { - log_info(LogSiliconDriver, "Detected {} PCI device{} : {}", m_num_pci_devices, (m_num_pci_devices > 1) ? "s":"", available_device_ids); - log_debug(LogSiliconDriver, "Passed target devices: {}", target_devices); - } - - std::string cluster_descriptor_path = ndesc_path; - if (cluster_descriptor_path == "") { - cluster_descriptor_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); - } - - ndesc = tt_ClusterDescriptor::create_from_yaml(cluster_descriptor_path); - - for (auto &d: target_devices){ - if (ndesc->is_chip_mmio_capable(d)){ + for (auto& d : target_devices_in_cluster) { + log_assert( + ndesc->get_all_chips().find(d) != ndesc->get_all_chips().end(), + "Target device {} not present in current cluster!", + d); + if (ndesc->is_chip_mmio_capable(d)) { target_mmio_device_ids.insert(d); - } - else { + } else { target_remote_chips.insert(d); } } - // It is mandatory for all devices to have these TLBs set aside, as the driver needs them to issue remote reads and writes. + // It is mandatory for all devices to have these TLBs set aside, as the driver needs them to issue remote reads and + // writes. 
auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name); - dynamic_tlb_config["LARGE_READ_TLB"] = architecture_implementation->get_mem_large_read_tlb(); + dynamic_tlb_config["LARGE_READ_TLB"] = architecture_implementation->get_mem_large_read_tlb(); dynamic_tlb_config["LARGE_WRITE_TLB"] = architecture_implementation->get_mem_large_write_tlb(); dynamic_tlb_config["REG_TLB"] = architecture_implementation->get_reg_tlb(); dynamic_tlb_config["SMALL_READ_WRITE_TLB"] = architecture_implementation->get_small_read_write_tlb(); - for(const auto& tlb : dynamic_tlb_config) { - dynamic_tlb_ordering_modes.insert({tlb.first, TLB_DATA::Relaxed}); // All dynamic TLBs use Relaxed Ordering by default; MT: Good for BH + // All dynamic TLBs use Relaxed Ordering by default + for (const auto& tlb : dynamic_tlb_config) { + dynamic_tlb_ordering_modes.insert({tlb.first, TLB_DATA::Relaxed}); } create_device(target_mmio_device_ids, num_host_mem_ch_per_mmio_device, skip_driver_allocs, clean_system_resources); // MT: Initial BH - Disable dependency to ethernet firmware - if(arch_name == tt::ARCH::BLACKHOLE) { + if (arch_name == tt::ARCH::BLACKHOLE) { use_ethernet_ordered_writes = false; use_ethernet_broadcast = false; use_virtual_coords_for_eth_broadcast = false; } - if(arch_name == tt::ARCH::WORMHOLE_B0) { - const auto& harvesting_masks = ndesc -> get_harvesting_info(); - const auto& noc_translation_enabled = ndesc -> get_noc_translation_table_en(); + if (arch_name == tt::ARCH::WORMHOLE_B0) { + const auto& harvesting_masks = ndesc->get_harvesting_info(); + const auto& noc_translation_enabled = ndesc->get_noc_translation_table_en(); translation_tables_en = false; - for(auto& masks : harvesting_masks) { - if(target_devices.find(masks.first) != target_devices.end()) { + for (auto& masks : harvesting_masks) { + if (target_devices_in_cluster.find(masks.first) != target_devices_in_cluster.end()) { harvested_rows_per_target[masks.first] = 
get_harvested_noc_rows(masks.second); noc_translation_enabled_for_chip[masks.first] = noc_translation_enabled.at(masks.first); num_rows_harvested.insert({masks.first, std::bitset<32>(masks.second).count()}); - if(harvested_rows_per_target[masks.first]) { + if (harvested_rows_per_target[masks.first]) { performed_harvesting = true; } } } - if(noc_translation_enabled_for_chip.size() > 0) { - auto const consistent_translation_table_state = [&] (std::pair const& i) { - return noc_translation_enabled_for_chip.begin() -> second == i.second; + if (noc_translation_enabled_for_chip.size() > 0) { + auto const consistent_translation_table_state = [&](std::pair const& i) { + return noc_translation_enabled_for_chip.begin()->second == i.second; }; - bool translation_tables_match_on_all_chips = std::all_of(noc_translation_enabled_for_chip.begin(), noc_translation_enabled_for_chip.end(), consistent_translation_table_state); - log_assert(translation_tables_match_on_all_chips, "Cluster uses NOC translation tables inconsistently across chips."); - translation_tables_en = noc_translation_enabled_for_chip.begin() -> second; + bool translation_tables_match_on_all_chips = std::all_of( + noc_translation_enabled_for_chip.begin(), + noc_translation_enabled_for_chip.end(), + consistent_translation_table_state); + log_assert( + translation_tables_match_on_all_chips, + "Cluster uses NOC translation tables inconsistently across chips."); + translation_tables_en = noc_translation_enabled_for_chip.begin()->second; } - if(translation_tables_en) { + if (translation_tables_en) { harvested_coord_translation.clear(); - for(const chip_id_t& chip : target_devices_in_cluster) { + for (const chip_id_t& chip : target_devices_in_cluster) { harvested_coord_translation.insert({chip, create_harvested_coord_translation(arch_name, false)}); } } - log_assert(performed_harvesting ? 
translation_tables_en : true, "Using a harvested WH cluster with NOC translation disabled."); - } - else if(arch_name == tt::ARCH::BLACKHOLE) { + log_assert( + performed_harvesting ? translation_tables_en : true, + "Using a harvested WH cluster with NOC translation disabled."); + } else if (arch_name == tt::ARCH::BLACKHOLE) { // Default harvesting info for Blackhole, describing no harvesting - for(auto chip_id = target_devices.begin(); chip_id != target_devices.end(); chip_id++){ - harvested_rows_per_target[*chip_id] = 0; //get_harvested_noc_rows_for_chip(*chip_id); - num_rows_harvested.insert({*chip_id, 0}); // Only set for broadcast TLB to get RISCS out of reset. We want all rows to have a reset signal sent. - if(harvested_rows_per_target[*chip_id]) { + for (auto chip_id = target_devices_in_cluster.begin(); chip_id != target_devices_in_cluster.end(); chip_id++) { + harvested_rows_per_target[*chip_id] = 0; // get_harvested_noc_rows_for_chip(*chip_id); + num_rows_harvested.insert({*chip_id, 0}); // Only set for broadcast TLB to get RISCS out of reset. We want + // all rows to have a reset signal sent. + if (harvested_rows_per_target[*chip_id]) { performed_harvesting = true; } } - } - else if(arch_name == tt::ARCH::GRAYSKULL) { + } else if (arch_name == tt::ARCH::GRAYSKULL) { // Multichip harvesting is supported for GS. - for(auto chip_id = target_devices.begin(); chip_id != target_devices.end(); chip_id++){ - harvested_rows_per_target[*chip_id] = get_harvested_noc_rows_for_chip(*chip_id); - num_rows_harvested.insert({*chip_id, 0}); // Only set for broadcast TLB to get RISCS out of reset. We want all rows to have a reset signal sent. 
- if(harvested_rows_per_target[*chip_id]) { + for (auto chip_id = target_devices_in_cluster.begin(); chip_id != target_devices_in_cluster.end(); chip_id++) { + harvested_rows_per_target[*chip_id] = get_harvested_noc_rows_for_chip(*chip_id); + num_rows_harvested.insert({*chip_id, 0}); // Only set for broadcast TLB to get RISCS out of reset. We want + // all rows to have a reset signal sent. + if (harvested_rows_per_target[*chip_id]) { performed_harvesting = true; } } } - if(simulated_harvesting_masks.size()) { + if (simulated_harvesting_masks.size()) { performed_harvesting = true; - for (auto device_id = target_devices.begin(); device_id != target_devices.end(); device_id++) { - log_assert(simulated_harvesting_masks.find(*device_id) != simulated_harvesting_masks.end(), "Could not find harvesting mask for device_id {}", *device_id); - if(arch_name == tt::ARCH::GRAYSKULL) { - if ((simulated_harvesting_masks.at(*device_id) & harvested_rows_per_target[*device_id]) != harvested_rows_per_target[*device_id]) { - log_warning(LogSiliconDriver, - "Simulated harvesting config for device {} does not include the actual harvesting config. Simulated harvesting mask will be added to the real harvesting mask. Actual Harvested Rows : {} Simulated Harvested Rows : {}", - *device_id, harvested_rows_per_target[*device_id], simulated_harvesting_masks.at(*device_id)); + for (auto device_id = target_devices_in_cluster.begin(); device_id != target_devices_in_cluster.end(); + device_id++) { + log_assert( + simulated_harvesting_masks.find(*device_id) != simulated_harvesting_masks.end(), + "Could not find harvesting mask for device_id {}", + *device_id); + if (arch_name == tt::ARCH::GRAYSKULL) { + if ((simulated_harvesting_masks.at(*device_id) & harvested_rows_per_target[*device_id]) != + harvested_rows_per_target[*device_id]) { + log_warning( + LogSiliconDriver, + "Simulated harvesting config for device {} does not include the actual harvesting config. 
" + "Simulated harvesting mask will be added to the real harvesting mask. Actual Harvested Rows : " + "{} Simulated Harvested Rows : {}", + *device_id, + harvested_rows_per_target[*device_id], + simulated_harvesting_masks.at(*device_id)); } simulated_harvesting_masks.at(*device_id) |= harvested_rows_per_target[*device_id]; - } - else if(arch_name == tt::ARCH::WORMHOLE_B0) { - log_assert(std::bitset<32>(simulated_harvesting_masks.at(*device_id)).count() >= std::bitset<32>(harvested_rows_per_target[*device_id]).count(), - "Simulated Harvesting for WH must contain at least as many rows as the actual harvesting config. Actual Harvested Rows : {} Simulated Harvested Rows : {}", - harvested_rows_per_target[*device_id], simulated_harvesting_masks.at(*device_id)); - num_rows_harvested.at(*device_id) = std::bitset<32>(simulated_harvesting_masks.at(*device_id)).count(); - log_assert(performed_harvesting ? translation_tables_en : true, "Using a harvested WH cluster with NOC translation disabled."); + } else if (arch_name == tt::ARCH::WORMHOLE_B0) { + log_assert( + std::bitset<32>(simulated_harvesting_masks.at(*device_id)).count() >= + std::bitset<32>(harvested_rows_per_target[*device_id]).count(), + "Simulated Harvesting for WH must contain at least as many rows as the actual harvesting config. " + "Actual Harvested Rows : {} Simulated Harvested Rows : {}", + harvested_rows_per_target[*device_id], + simulated_harvesting_masks.at(*device_id)); + num_rows_harvested.at(*device_id) = std::bitset<32>(simulated_harvesting_masks.at(*device_id)).count(); + log_assert( + performed_harvesting ? 
translation_tables_en : true, + "Using a harvested WH cluster with NOC translation disabled."); } harvested_rows_per_target[*device_id] = simulated_harvesting_masks.at(*device_id); } @@ -432,18 +488,18 @@ Cluster::Cluster(const std::string &sdesc_path, const std::string &ndesc_path, c populate_cores(); // MT: Initial BH - skip this for BH - if(arch_name == tt::ARCH::WORMHOLE_B0) { + if (arch_name == tt::ARCH::WORMHOLE_B0) { remote_transfer_ethernet_cores.resize(target_mmio_device_ids.size()); - for (const auto &logical_mmio_chip_id : target_mmio_device_ids) { + for (const auto& logical_mmio_chip_id : target_mmio_device_ids) { const tt_SocDescriptor& soc_desc = get_soc_descriptor(logical_mmio_chip_id); // 4-5 is for send_epoch_commands, 0-3 are for everything else for (std::uint32_t i = 0; i < NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS; i++) { - if(remote_transfer_ethernet_cores.size() <= logical_mmio_chip_id) { + if (remote_transfer_ethernet_cores.size() <= logical_mmio_chip_id) { remote_transfer_ethernet_cores.resize(logical_mmio_chip_id + 1); } - remote_transfer_ethernet_cores.at(logical_mmio_chip_id).push_back( - tt_cxy_pair(logical_mmio_chip_id, soc_desc.ethernet_cores.at(i).x, soc_desc.ethernet_cores.at(i).y) - ); + remote_transfer_ethernet_cores.at(logical_mmio_chip_id) + .push_back(tt_cxy_pair( + logical_mmio_chip_id, soc_desc.ethernet_cores.at(i).x, soc_desc.ethernet_cores.at(i).y)); } } } @@ -456,20 +512,164 @@ Cluster::Cluster(const std::string &sdesc_path, const std::string &ndesc_path, c // Default initialize noc_params based on detected arch noc_params = architecture_implementation->get_noc_params(); +} + +Cluster::Cluster( + const uint32_t& num_host_mem_ch_per_mmio_device, + const bool skip_driver_allocs, + const bool clean_system_resources, + bool perform_harvesting, + std::unordered_map simulated_harvesting_masks) : + tt_device() { + // TODO: this should be fetched through ClusterDescriptor + auto available_device_ids = detect_available_device_ids(); + 
m_num_pci_devices = available_device_ids.size(); + + int physical_device_id = available_device_ids[0]; + // TODO: remove logical_device_id + PCIDevice pci_device(physical_device_id, 0); + tt::ARCH device_arch = pci_device.get_arch(); + + std::string sdesc_path = tt_SocDescriptor::get_soc_descriptor_path(device_arch); + + arch_name = tt_SocDescriptor(sdesc_path).arch; + perform_harvesting_on_sdesc = perform_harvesting; + + if (!skip_driver_allocs) { + log_info( + LogSiliconDriver, + "Detected {} PCI device{} : {}", + m_num_pci_devices, + (m_num_pci_devices > 1) ? "s" : "", + available_device_ids); + log_debug(LogSiliconDriver, "Passed target devices: {}", target_devices); + } + + std::string ndesc_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); + ndesc = tt_ClusterDescriptor::create_from_yaml(ndesc_path); + + std::set target_devices; + for (const chip_id_t& d : ndesc->get_all_chips()) { + target_devices.insert(d); + } + target_devices_in_cluster = target_devices; + + construct_cluster( + sdesc_path, + num_host_mem_ch_per_mmio_device, + skip_driver_allocs, + clean_system_resources, + perform_harvesting, + simulated_harvesting_masks); +} + +Cluster::Cluster( + const std::set& target_devices, + const uint32_t& num_host_mem_ch_per_mmio_device, + const bool skip_driver_allocs, + const bool clean_system_resources, + bool perform_harvesting, + std::unordered_map simulated_harvesting_masks) : + tt_device() { + // TODO: this should be fetched through ClusterDescriptor + auto available_device_ids = detect_available_device_ids(); + m_num_pci_devices = available_device_ids.size(); + + int physical_device_id = available_device_ids[0]; + // TODO: remove logical_device_id + PCIDevice pci_device(physical_device_id, 0); + tt::ARCH device_arch = pci_device.get_arch(); + + std::string sdesc_path = tt_SocDescriptor::get_soc_descriptor_path(device_arch); + arch_name = tt_SocDescriptor(sdesc_path).arch; + perform_harvesting_on_sdesc = perform_harvesting; + + if 
(!skip_driver_allocs) { + log_info( + LogSiliconDriver, + "Detected {} PCI device{} : {}", + m_num_pci_devices, + (m_num_pci_devices > 1) ? "s" : "", + available_device_ids); + log_debug(LogSiliconDriver, "Passed target devices: {}", target_devices); + } + + std::string ndesc_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); + ndesc = tt_ClusterDescriptor::create_from_yaml(ndesc_path); + + target_devices_in_cluster = target_devices; + + construct_cluster( + sdesc_path, + num_host_mem_ch_per_mmio_device, + skip_driver_allocs, + clean_system_resources, + perform_harvesting, + simulated_harvesting_masks); +} + +Cluster::Cluster( + const std::string& sdesc_path, + const std::string& ndesc_path, + const std::set& target_devices, + const uint32_t& num_host_mem_ch_per_mmio_device, + const bool skip_driver_allocs, + const bool clean_system_resources, + bool perform_harvesting, + std::unordered_map simulated_harvesting_masks) : + tt_device() { + // TODO: this should be fetched through ClusterDescriptor + auto available_device_ids = detect_available_device_ids(); + m_num_pci_devices = available_device_ids.size(); + + target_devices_in_cluster = target_devices; + arch_name = tt_SocDescriptor(sdesc_path).arch; + perform_harvesting_on_sdesc = perform_harvesting; + + if (!skip_driver_allocs) { + log_info( + LogSiliconDriver, + "Detected {} PCI device{} : {}", + m_num_pci_devices, + (m_num_pci_devices > 1) ? 
"s" : "", + available_device_ids); + log_debug(LogSiliconDriver, "Passed target devices: {}", target_devices); + } + + std::string cluster_descriptor_path = ndesc_path; + if (cluster_descriptor_path == "") { + cluster_descriptor_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); + } + + ndesc = tt_ClusterDescriptor::create_from_yaml(cluster_descriptor_path); + + construct_cluster( + sdesc_path, + num_host_mem_ch_per_mmio_device, + skip_driver_allocs, + clean_system_resources, + perform_harvesting, + simulated_harvesting_masks); } -void Cluster::configure_active_ethernet_cores_for_mmio_device(chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip) { +void Cluster::configure_active_ethernet_cores_for_mmio_device( + chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip) { // Makes UMD aware of which ethernet cores have active links. // Based on this information, UMD determines which ethernet cores can be used for host->cluster non-MMIO transfers. - // This overrides the default ethernet cores tagged for host to cluster routing in the constructor and must be called for all MMIO devices, if default behaviour - // is not desired. - log_assert(get_soc_descriptor(mmio_chip).arch == tt::ARCH::WORMHOLE_B0, "{} can only be called for Wormhole arch", __FUNCTION__); + // This overrides the default ethernet cores tagged for host to cluster routing in the constructor and must be + // called for all MMIO devices, if default behaviour is not desired. 
+ log_assert( + get_soc_descriptor(mmio_chip).arch == tt::ARCH::WORMHOLE_B0, + "{} can only be called for Wormhole arch", + __FUNCTION__); auto& eth_cores = get_soc_descriptor(mmio_chip).ethernet_cores; // Cores 0, 1, 6, 7 are only available if in the active set - static std::unordered_set eth_cores_available_if_active = {eth_cores.at(0), eth_cores.at(1), eth_cores.at(6), eth_cores.at(7)}; + static std::unordered_set eth_cores_available_if_active = { + eth_cores.at(0), eth_cores.at(1), eth_cores.at(6), eth_cores.at(7)}; // Eth cores 8 and 9 are always available - std::vector non_mmio_access_cores_for_chip = {tt_cxy_pair(mmio_chip, eth_cores.at(8)), tt_cxy_pair(mmio_chip, eth_cores.at(9))}; + std::vector non_mmio_access_cores_for_chip = { + tt_cxy_pair(mmio_chip, eth_cores.at(8)), tt_cxy_pair(mmio_chip, eth_cores.at(9))}; for (const auto& active_eth_core : active_eth_cores_per_chip) { if (eth_cores_available_if_active.find(active_eth_core) != eth_cores_available_if_active.end()) { non_mmio_access_cores_for_chip.push_back(tt_cxy_pair(mmio_chip, active_eth_core)); @@ -483,27 +683,33 @@ void Cluster::configure_active_ethernet_cores_for_mmio_device(chip_id_t mmio_chi void Cluster::populate_cores() { std::uint32_t count = 0; - for(const auto chip : soc_descriptor_per_chip) { - workers_per_chip.insert({chip.first, std::unordered_set(chip.second.workers.begin(), chip.second.workers.end())}); - if(count == 0) { - eth_cores = std::unordered_set(chip.second.ethernet_cores.begin(), chip.second.ethernet_cores.end()); - for(std::uint32_t dram_idx = 0; dram_idx < chip.second.get_num_dram_channels(); dram_idx++) { - dram_cores.insert(chip.second.get_core_for_dram_channel(dram_idx, 0)) ; + for (const auto chip : soc_descriptor_per_chip) { + workers_per_chip.insert( + {chip.first, std::unordered_set(chip.second.workers.begin(), chip.second.workers.end())}); + if (count == 0) { + eth_cores = + std::unordered_set(chip.second.ethernet_cores.begin(), chip.second.ethernet_cores.end()); + 
for (std::uint32_t dram_idx = 0; dram_idx < chip.second.get_num_dram_channels(); dram_idx++) { + dram_cores.insert(chip.second.get_core_for_dram_channel(dram_idx, 0)); } } count++; } } -std::vector Cluster::extract_rows_to_remove(const tt::ARCH &arch, const int worker_grid_rows, const int harvested_rows) { +std::vector Cluster::extract_rows_to_remove( + const tt::ARCH& arch, const int worker_grid_rows, const int harvested_rows) { // Check if harvesting config is legal for GS and WH - log_assert(!((harvested_rows & 1) || (harvested_rows & 64) || (harvested_rows & 0xFFFFF000)), "For grayskull and wormhole, only rows 1-5 and 7-11 can be harvested"); + log_assert( + !((harvested_rows & 1) || (harvested_rows & 64) || (harvested_rows & 0xFFFFF000)), + "For grayskull and wormhole, only rows 1-5 and 7-11 can be harvested"); std::vector row_coordinates_to_remove; int row_coordinate = 0; int tmp = harvested_rows; while (tmp) { - if (tmp & 1) + if (tmp & 1) { row_coordinates_to_remove.push_back(row_coordinate); + } tmp = tmp >> 1; row_coordinate++; @@ -517,13 +723,14 @@ std::vector Cluster::extract_rows_to_remove(const tt::ARCH &arch, const int return row_coordinates_to_remove; } -void Cluster::remove_worker_row_from_descriptor(tt_SocDescriptor& full_soc_descriptor, const std::vector& row_coordinates_to_remove) { +void Cluster::remove_worker_row_from_descriptor( + tt_SocDescriptor& full_soc_descriptor, const std::vector& row_coordinates_to_remove) { std::vector workers_to_keep; - for(auto worker = (full_soc_descriptor.workers).begin(); worker != (full_soc_descriptor.workers).end(); worker++){ - if(find(row_coordinates_to_remove.begin(), row_coordinates_to_remove.end(), (*worker).y) == row_coordinates_to_remove.end()){ + for (auto worker = (full_soc_descriptor.workers).begin(); worker != (full_soc_descriptor.workers).end(); worker++) { + if (find(row_coordinates_to_remove.begin(), row_coordinates_to_remove.end(), (*worker).y) == + row_coordinates_to_remove.end()) { 
workers_to_keep.push_back(*worker); - } - else{ + } else { (full_soc_descriptor.harvested_workers).push_back(*worker); full_soc_descriptor.cores.at(*worker).type = CoreType::HARVESTED; } @@ -535,28 +742,32 @@ void Cluster::remove_worker_row_from_descriptor(tt_SocDescriptor& full_soc_descr std::set modified_y_coords = {}; - for(const auto& core : full_soc_descriptor.workers) { + for (const auto& core : full_soc_descriptor.workers) { modified_y_coords.insert(core.y); } int logical_y_coord = 0; - for(const auto& y_coord : modified_y_coords) { + for (const auto& y_coord : modified_y_coords) { full_soc_descriptor.routing_y_to_worker_y.insert({y_coord, logical_y_coord}); - full_soc_descriptor.worker_log_to_routing_y.insert({logical_y_coord, y_coord}); + full_soc_descriptor.worker_log_to_routing_y.insert({logical_y_coord, y_coord}); logical_y_coord++; } } void Cluster::harvest_rows_in_soc_descriptor(tt::ARCH arch, tt_SocDescriptor& sdesc, uint32_t harvested_rows) { - std::uint32_t max_row_to_remove = (*std::max_element((sdesc.workers).begin(), (sdesc.workers).end(), [] (const auto& a, const auto& b) { return a.y < b.y; })).y; + std::uint32_t max_row_to_remove = + (*std::max_element((sdesc.workers).begin(), (sdesc.workers).end(), [](const auto& a, const auto& b) { + return a.y < b.y; + })).y; std::vector row_coordinates_to_remove = extract_rows_to_remove(arch, max_row_to_remove, harvested_rows); remove_worker_row_from_descriptor(sdesc, row_coordinates_to_remove); } -void Cluster::perform_harvesting_and_populate_soc_descriptors(const std::string& sdesc_path, const bool perform_harvesting) { +void Cluster::perform_harvesting_and_populate_soc_descriptors( + const std::string& sdesc_path, const bool perform_harvesting) { const auto default_sdesc = tt_SocDescriptor(sdesc_path); - for(const auto& chip : harvested_rows_per_target) { + for (const auto& chip : harvested_rows_per_target) { auto temp_sdesc = default_sdesc; - if(perform_harvesting) { + if (perform_harvesting) { 
harvest_rows_in_soc_descriptor(arch_name, temp_sdesc, chip.second); } soc_descriptor_per_chip.insert({chip.first, temp_sdesc}); @@ -564,25 +775,24 @@ void Cluster::perform_harvesting_and_populate_soc_descriptors(const std::string& } void Cluster::check_pcie_device_initialized(int device_id) { - - PCIDevice *pci_device = get_pci_device(device_id); + PCIDevice* pci_device = get_pci_device(device_id); tt::ARCH device_arch = pci_device->get_arch(); if (arch_name == tt::ARCH::GRAYSKULL) { if (device_arch != tt::ARCH::GRAYSKULL) { - throw std::runtime_error(fmt::format("Attempted to run grayskull configured tt_device on {}", get_arch_str(device_arch))); + throw std::runtime_error( + fmt::format("Attempted to run grayskull configured tt_device on {}", get_arch_str(device_arch))); } - } - else if (arch_name == tt::ARCH::WORMHOLE_B0) { + } else if (arch_name == tt::ARCH::WORMHOLE_B0) { if (device_arch != tt::ARCH::WORMHOLE_B0) { - throw std::runtime_error(fmt::format("Attempted to run wormhole configured tt_device on {}", get_arch_str(device_arch))); + throw std::runtime_error( + fmt::format("Attempted to run wormhole configured tt_device on {}", get_arch_str(device_arch))); } - } - else if (arch_name == tt::ARCH::BLACKHOLE) { + } else if (arch_name == tt::ARCH::BLACKHOLE) { if (device_arch != tt::ARCH::BLACKHOLE) { - throw std::runtime_error(fmt::format("Attempted to run blackhole configured tt_device on {}", get_arch_str(device_arch))); + throw std::runtime_error( + fmt::format("Attempted to run blackhole configured tt_device on {}", get_arch_str(device_arch))); } - } - else { + } else { throw std::runtime_error(fmt::format("Unsupported architecture: {}", get_arch_str(arch_name))); } auto architecture_implementation = pci_device->get_architecture_implementation(); @@ -590,29 +800,36 @@ void Cluster::check_pcie_device_initialized(int device_id) { // MT Initial BH - Add check for blackhole once access to ARC registers is setup through TLBs if (arch_name != 
tt::ARCH::BLACKHOLE) { log_debug(LogSiliconDriver, "== Check if device_id: {} is initialized", device_id); - uint32_t bar_read_initial = bar_read32(device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4); + uint32_t bar_read_initial = + bar_read32(device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4); uint32_t arg = bar_read_initial == 500 ? 325 : 500; uint32_t bar_read_again; - uint32_t arc_msg_return = arc_msg(device_id, 0xaa00 | architecture_implementation->get_arc_message_test(), true, arg, 0, 1, &bar_read_again); + uint32_t arc_msg_return = arc_msg( + device_id, 0xaa00 | architecture_implementation->get_arc_message_test(), true, arg, 0, 1, &bar_read_again); if (arc_msg_return != 0 || bar_read_again != arg + 1) { auto postcode = bar_read32(device_id, architecture_implementation->get_arc_reset_scratch_offset()); - throw std::runtime_error(fmt::format("Device is not initialized: arc_fw postcode: {} arc_msg_return: {} arg: {} bar_read_initial: {} bar_read_again: {}", - postcode, - arc_msg_return, - arg, - bar_read_initial, - bar_read_again)); + throw std::runtime_error(fmt::format( + "Device is not initialized: arc_fw postcode: {} arc_msg_return: {} arg: {} bar_read_initial: {} " + "bar_read_again: {}", + postcode, + arc_msg_return, + arg, + bar_read_initial, + bar_read_again)); } } - if (test_setup_interface()) { - throw std::runtime_error("Device is incorrectly initialized. If this is a harvested Wormhole machine, it is likely that NOC Translation Tables are not enabled on device. These need to be enabled for the silicon driver to run."); + throw std::runtime_error( + "Device is incorrectly initialized. If this is a harvested Wormhole machine, it is likely that NOC " + "Translation Tables are not enabled on device. These need to be enabled for the silicon driver to run."); } } -std::unordered_map Cluster::create_harvested_coord_translation(const tt::ARCH arch, bool identity_map) { - log_assert(identity_map ? 
true : (arch != tt::ARCH::GRAYSKULL), "NOC Translation can only be performed for WH devices"); +std::unordered_map Cluster::create_harvested_coord_translation( + const tt::ARCH arch, bool identity_map) { + log_assert( + identity_map ? true : (arch != tt::ARCH::GRAYSKULL), "NOC Translation can only be performed for WH devices"); std::unordered_map translation_table = {}; tt_xy_pair grid_size; @@ -620,29 +837,29 @@ std::unordered_map Cluster::create_harvested_coord_trans std::vector T6_y = {}; std::vector ethernet = {}; // Store device specific data for GS and WH depending on arch - if(arch == tt::ARCH::GRAYSKULL) { + if (arch == tt::ARCH::GRAYSKULL) { grid_size = tt_xy_pair(13, 12); T6_x = {12, 1, 11, 2, 10, 3, 9, 4, 8, 5, 7, 6}; T6_y = {11, 1, 10, 2, 9, 3, 8, 4, 7, 5}; - } - else if (arch == tt::ARCH::BLACKHOLE) { + } else if (arch == tt::ARCH::BLACKHOLE) { grid_size = tt_xy_pair(17, 12); T6_x = {16, 1, 15, 2, 14, 3, 13, 4, 12, 5, 11, 6, 10, 7}; T6_y = {11, 2, 10, 3, 9, 4, 8, 5, 7, 6}; - } - else { + } else { grid_size = tt_xy_pair(10, 12); T6_x = {1, 2, 3, 4, 6, 7, 8, 9}; T6_y = {1, 2, 3, 4, 5, 7, 8, 9, 10, 11}; - ethernet = {{1, 0}, {2, 0}, {3, 0}, {4, 0}, {6, 0}, {7, 0}, {8, 0}, {9, 0}, {1, 6}, {2, 6}, {3, 6}, {4, 6}, {6, 6}, {7, 6}, {8, 6}, {9, 6}}; + // clang-format off + ethernet = {{1, 0}, {2, 0}, {3, 0}, {4, 0}, {6, 0}, {7, 0}, {8, 0}, {9, 0}, + {1, 6}, {2, 6}, {3, 6}, {4, 6}, {6, 6}, {7, 6}, {8, 6}, {9, 6}}; + // clang-format on } - - if(identity_map) { + if (identity_map) { // When device is initialized, assume no harvesting and create an identity map for cores // This flow is always used for GS, since there is no hardware harvesting - for(int x = 0; x < grid_size.x; x++) { - for(int y = 0; y < grid_size.y; y++) { + for (int x = 0; x < grid_size.x; x++) { + for (int y = 0; y < grid_size.y; y++) { tt_xy_pair curr_core = tt_xy_pair(x, y); translation_table.insert({curr_core, curr_core}); } @@ -653,34 +870,50 @@ std::unordered_map 
Cluster::create_harvested_coord_trans // If this function is called with identity_map = false, we have perform NOC translation // This can only happen for WH devices // Setup coord translation for workers. Map all worker cores - for(int x = 0; x < grid_size.x; x++) { - for(int y = 0; y < grid_size.y; y++) { + for (int x = 0; x < grid_size.x; x++) { + for (int y = 0; y < grid_size.y; y++) { tt_xy_pair curr_core = tt_xy_pair(x, y); - if(std::find(T6_x.begin(), T6_x.end(), x) != T6_x.end() && - std::find(T6_y.begin(), T6_y.end(), y) != T6_y.end()) { + if (std::find(T6_x.begin(), T6_x.end(), x) != T6_x.end() && + std::find(T6_y.begin(), T6_y.end(), y) != T6_y.end()) { // This is a worker core. Apply translation for WH. tt_xy_pair harvested_worker; - if(x >= 1 && x <= 4) harvested_worker.x = x + 17; - else if(x <= 9 && x > 5) harvested_worker.x = x + 16; - else log_assert(false, "Invalid WH worker x coord {} when creating translation tables.", x); + if (x >= 1 && x <= 4) { + harvested_worker.x = x + 17; + } else if (x <= 9 && x > 5) { + harvested_worker.x = x + 16; + } else { + log_assert(false, "Invalid WH worker x coord {} when creating translation tables.", x); + } - if(y >= 1 && y <= 5) harvested_worker.y = y + 17; - else if(y <= 11 && y > 6) harvested_worker.y = y + 16; - else log_assert(false, "Invalid WH worker y coord {} when creating translation tables.", y); + if (y >= 1 && y <= 5) { + harvested_worker.y = y + 17; + } else if (y <= 11 && y > 6) { + harvested_worker.y = y + 16; + } else { + log_assert(false, "Invalid WH worker y coord {} when creating translation tables.", y); + } translation_table.insert({curr_core, harvested_worker}); } - else if(std::find(ethernet.begin(), ethernet.end(), curr_core) != ethernet.end()){ + else if (std::find(ethernet.begin(), ethernet.end(), curr_core) != ethernet.end()) { // This is an eth core. Apply translation for WH. 
tt_xy_pair harvested_eth_core; - if(x >= 1 && x <= 4) harvested_eth_core.x = x + 17; - else if(x <= 9 && x > 5) harvested_eth_core.x = x + 16; - else log_assert(false, "Invalid WH eth_core x coord {} when creating translation tables.", x); + if (x >= 1 && x <= 4) { + harvested_eth_core.x = x + 17; + } else if (x <= 9 && x > 5) { + harvested_eth_core.x = x + 16; + } else { + log_assert(false, "Invalid WH eth_core x coord {} when creating translation tables.", x); + } - if(y == 0) harvested_eth_core.y = y + 16; - else if(y == 6) harvested_eth_core.y = y + 11; - else log_assert(false, "Invalid WH eth_core y coord {} when creating translation tables.", y); + if (y == 0) { + harvested_eth_core.y = y + 16; + } else if (y == 6) { + harvested_eth_core.y = y + 11; + } else { + log_assert(false, "Invalid WH eth_core y coord {} when creating translation tables.", y); + } translation_table.insert({curr_core, harvested_eth_core}); } @@ -693,7 +926,7 @@ std::unordered_map Cluster::create_harvested_coord_trans return translation_table; } -void Cluster::translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c) { +void Cluster::translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) { auto translated_coords = harvested_coord_translation[device_id].at(tt_xy_pair(c, r)); c = translated_coords.x; r = translated_coords.y; @@ -702,7 +935,7 @@ void Cluster::translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, void Cluster::initialize_pcie_devices() { log_debug(LogSiliconDriver, "Cluster::start"); - for (auto &device_it : m_pci_device_map){ + for (auto& device_it : m_pci_device_map) { check_pcie_device_initialized(device_it.first); } @@ -711,7 +944,7 @@ void Cluster::initialize_pcie_devices() { init_membars(); } -void Cluster::broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const TensixSoftResetOptions &soft_resets) { +void Cluster::broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const TensixSoftResetOptions& 
soft_resets) { log_debug(LogSiliconDriver, "Cluster::broadcast_tensix_risc_reset"); PCIDevice* device = get_pci_device(chip_id); @@ -719,7 +952,10 @@ void Cluster::broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const TensixSo auto valid = soft_resets & ALL_TENSIX_SOFT_RESET; auto logical_id = device->get_logical_id(); - log_debug(LogSiliconDriver, "== For all tensix set soft-reset for {} risc cores.", TensixSoftResetOptionsToString(valid).c_str()); + log_debug( + LogSiliconDriver, + "== For all tensix set soft-reset for {} risc cores.", + TensixSoftResetOptionsToString(valid).c_str()); auto architecture_implementation = device->get_architecture_implementation(); @@ -738,77 +974,87 @@ void Cluster::broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const TensixSo } std::set Cluster::get_target_mmio_device_ids() { - if(!all_target_mmio_devices.size()) { - for (const auto &it: m_pci_device_map) { + if (!all_target_mmio_devices.size()) { + for (const auto& it : m_pci_device_map) { all_target_mmio_devices.insert(it.first); } } return all_target_mmio_devices; } -void Cluster::assert_risc_reset() { - broadcast_tensix_risc_reset_to_cluster(TENSIX_ASSERT_SOFT_RESET); -} +void Cluster::assert_risc_reset() { broadcast_tensix_risc_reset_to_cluster(TENSIX_ASSERT_SOFT_RESET); } -void Cluster::deassert_risc_reset() { - broadcast_tensix_risc_reset_to_cluster(TENSIX_DEASSERT_SOFT_RESET); -} +void Cluster::deassert_risc_reset() { broadcast_tensix_risc_reset_to_cluster(TENSIX_DEASSERT_SOFT_RESET); } -void Cluster::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets) { - std::uint32_t target_device = core.chip; // Get Target Device to query soc descriptor and determine location in cluster - log_assert(std::find(get_soc_descriptor(target_device).workers.begin(), get_soc_descriptor(target_device).workers.end(), core) != get_soc_descriptor(target_device).workers.end() || - std::find(get_soc_descriptor(target_device).ethernet_cores.begin(), 
get_soc_descriptor(target_device).ethernet_cores.end(), core) != get_soc_descriptor(target_device).ethernet_cores.end(), - "Cannot deassert reset on a non-tensix or harvested core"); - bool target_is_mmio_capable = ndesc -> is_chip_mmio_capable(target_device); - if(target_is_mmio_capable) { - log_assert(m_pci_device_map.find(target_device) != m_pci_device_map.end(), "Could not find MMIO mapped device in devices connected over PCIe"); +void Cluster::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions& soft_resets) { + // Get Target Device to query soc descriptor and determine location in cluster + std::uint32_t target_device = core.chip; + log_assert( + std::find( + get_soc_descriptor(target_device).workers.begin(), get_soc_descriptor(target_device).workers.end(), core) != + get_soc_descriptor(target_device).workers.end() || + std::find( + get_soc_descriptor(target_device).ethernet_cores.begin(), + get_soc_descriptor(target_device).ethernet_cores.end(), + core) != get_soc_descriptor(target_device).ethernet_cores.end(), + "Cannot deassert reset on a non-tensix or harvested core"); + bool target_is_mmio_capable = ndesc->is_chip_mmio_capable(target_device); + if (target_is_mmio_capable) { + log_assert( + m_pci_device_map.find(target_device) != m_pci_device_map.end(), + "Could not find MMIO mapped device in devices connected over PCIe"); send_tensix_risc_reset_to_core(core, soft_resets); - } - else { + } else { log_assert(arch_name != tt::ARCH::BLACKHOLE, "Can't issue access to remote core in BH"); send_remote_tensix_risc_reset_to_core(core, soft_resets); } } void Cluster::assert_risc_reset_at_core(tt_cxy_pair core) { - std::uint32_t target_device = core.chip; // Get Target Device to query soc descriptor and determine location in cluster - log_assert(std::find(get_soc_descriptor(target_device).workers.begin(), get_soc_descriptor(target_device).workers.end(), core) != get_soc_descriptor(target_device).workers.end() || - 
std::find(get_soc_descriptor(target_device).ethernet_cores.begin(), get_soc_descriptor(target_device).ethernet_cores.end(), core) != get_soc_descriptor(target_device).ethernet_cores.end(), - "Cannot assert reset on a non-tensix or harvested core"); - bool target_is_mmio_capable = ndesc -> is_chip_mmio_capable(target_device); - if(target_is_mmio_capable) { - log_assert(m_pci_device_map.find(target_device) != m_pci_device_map.end(), "Could not find MMIO mapped device in devices connected over PCIe"); + // Get Target Device to query soc descriptor and determine location in cluster + std::uint32_t target_device = core.chip; + log_assert( + std::find( + get_soc_descriptor(target_device).workers.begin(), get_soc_descriptor(target_device).workers.end(), core) != + get_soc_descriptor(target_device).workers.end() || + std::find( + get_soc_descriptor(target_device).ethernet_cores.begin(), + get_soc_descriptor(target_device).ethernet_cores.end(), + core) != get_soc_descriptor(target_device).ethernet_cores.end(), + "Cannot assert reset on a non-tensix or harvested core"); + bool target_is_mmio_capable = ndesc->is_chip_mmio_capable(target_device); + if (target_is_mmio_capable) { + log_assert( + m_pci_device_map.find(target_device) != m_pci_device_map.end(), + "Could not find MMIO mapped device in devices connected over PCIe"); send_tensix_risc_reset_to_core(core, TENSIX_ASSERT_SOFT_RESET); - } - else { + } else { send_remote_tensix_risc_reset_to_core(core, TENSIX_ASSERT_SOFT_RESET); } } // Free memory during teardown, and remove (clean/unlock) from any leftover mutexes. 
void Cluster::cleanup_shared_host_state() { - for(auto &mutex : hardware_resource_mutex_map) { + for (auto& mutex : hardware_resource_mutex_map) { mutex.second.reset(); mutex.second = nullptr; named_mutex::remove(mutex.first.c_str()); } } -std::unordered_set Cluster::get_all_chips_in_cluster() { - return ndesc -> get_all_chips(); -} +std::unordered_set Cluster::get_all_chips_in_cluster() { return ndesc->get_all_chips(); } + int Cluster::get_number_of_chips_in_cluster() { // Returns the number of chips seen in the network descriptor - return ndesc -> get_all_chips().size(); + return ndesc->get_all_chips().size(); } -tt_ClusterDescriptor* Cluster::get_cluster_description() {return ndesc.get();} +tt_ClusterDescriptor* Cluster::get_cluster_description() { return ndesc.get(); } + // Can be used before instantiating a silicon device int Cluster::detect_number_of_chips() { - auto available_device_ids = detect_available_device_ids(); return available_device_ids.size(); - } // Can be used before instantiating a silicon device @@ -822,7 +1068,8 @@ std::vector Cluster::detect_available_device_ids() { return PCIDevice::enumerate_devices(); } -std::function Cluster::get_fast_pcie_static_tlb_write_callable(int device_id) { +std::function Cluster::get_fast_pcie_static_tlb_write_callable( + int device_id) { PCIDevice* dev = get_pci_device(device_id); const auto callable = [dev](uint32_t byte_addr, uint32_t num_bytes, const uint8_t* buffer_addr) { @@ -841,7 +1088,7 @@ tt::Writer Cluster::get_static_tlb_writer(tt_cxy_pair target) { throw std::runtime_error("TLBs not initialized"); } - auto *dev = get_pci_device(target.chip); + auto* dev = get_pci_device(target.chip); if (!dev->bar0_wc) { throw std::runtime_error("No write-combined mapping for BAR0"); @@ -855,26 +1102,39 @@ tt::Writer Cluster::get_static_tlb_writer(tt_cxy_pair target) { } auto [tlb_offset, tlb_size] = tlb_data.value(); - auto *base = reinterpret_cast(dev->bar0_wc); + auto* base = reinterpret_cast(dev->bar0_wc); 
return tt::Writer(base + tlb_offset, tlb_size); } -void Cluster::write_device_memory(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair target, std::uint32_t address, const std::string& fallback_tlb) { - PCIDevice *dev = get_pci_device(target.chip); +void Cluster::write_device_memory( + const void* mem_ptr, + uint32_t size_in_bytes, + tt_cxy_pair target, + std::uint32_t address, + const std::string& fallback_tlb) { + PCIDevice* dev = get_pci_device(target.chip); const uint8_t* buffer_addr = static_cast(mem_ptr); - log_debug(LogSiliconDriver, "Cluster::write_device_memory to chip:{} {}-{} at 0x{:x} size_in_bytes: {} small_access: {}", - target.chip, target.x, target.y, address, size_in_bytes, small_access); + log_debug( + LogSiliconDriver, + "Cluster::write_device_memory to chip:{} {}-{} at 0x{:x} size_in_bytes: {} small_access: {}", + target.chip, + target.x, + target.y, + address, + size_in_bytes, + small_access); std::int32_t tlb_index = 0; std::optional> tlb_data = std::nullopt; - if(tlbs_init_per_chip[target.chip]) { + if (tlbs_init_per_chip[target.chip]) { tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y)); tlb_data = dev->get_architecture_implementation()->describe_tlb(tlb_index); } - if (tlb_data.has_value() && address_in_tlb_space(address, size_in_bytes, tlb_index, std::get<1>(tlb_data.value()), target.chip)) { + if (tlb_data.has_value() && + address_in_tlb_space(address, size_in_bytes, tlb_index, std::get<1>(tlb_data.value()), target.chip)) { auto [tlb_offset, tlb_size] = tlb_data.value(); if (dev->bar4_wc != nullptr && tlb_size == BH_4GB_TLB_SIZE) { // This is only for Blackhole. 
If we want to write to DRAM (BAR4 space), we add offset @@ -887,9 +1147,9 @@ void Cluster::write_device_memory(const void *mem_ptr, uint32_t size_in_bytes, t const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); const scoped_lock lock(*get_mutex(fallback_tlb, dev->get_device_num())); - while(size_in_bytes > 0) { - - auto [mapped_address, tlb_size] = dev->set_dynamic_tlb(tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb)); + while (size_in_bytes > 0) { + auto [mapped_address, tlb_size] = dev->set_dynamic_tlb( + tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb)); uint32_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size); dev->write_block(mapped_address, transfer_size, buffer_addr); @@ -901,22 +1161,36 @@ void Cluster::write_device_memory(const void *mem_ptr, uint32_t size_in_bytes, t } } -void Cluster::read_device_memory(void *mem_ptr, tt_cxy_pair target, std::uint32_t address, std::uint32_t size_in_bytes, const std::string& fallback_tlb) { - // Assume that mem_ptr has been allocated adequate memory on host when this function is called. Otherwise, this function will cause a segfault. - log_debug(LogSiliconDriver, "Cluster::read_device_memory to chip:{} {}-{} at 0x{:x} size_in_bytes: {}", target.chip, target.x, target.y, address, size_in_bytes); - PCIDevice *dev = get_pci_device(target.chip); +void Cluster::read_device_memory( + void* mem_ptr, + tt_cxy_pair target, + std::uint32_t address, + std::uint32_t size_in_bytes, + const std::string& fallback_tlb) { + // Assume that mem_ptr has been allocated adequate memory on host when this function is called. Otherwise, this + // function will cause a segfault. 
+ log_debug( + LogSiliconDriver, + "Cluster::read_device_memory to chip:{} {}-{} at 0x{:x} size_in_bytes: {}", + target.chip, + target.x, + target.y, + address, + size_in_bytes); + PCIDevice* dev = get_pci_device(target.chip); uint8_t* buffer_addr = static_cast(mem_ptr); std::int32_t tlb_index = 0; std::optional> tlb_data = std::nullopt; - if(tlbs_init_per_chip[target.chip]) { + if (tlbs_init_per_chip[target.chip]) { tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y)); tlb_data = dev->get_architecture_implementation()->describe_tlb(tlb_index); } log_debug(LogSiliconDriver, " tlb_index: {}, tlb_data.has_value(): {}", tlb_index, tlb_data.has_value()); - if (tlb_data.has_value() && address_in_tlb_space(address, size_in_bytes, tlb_index, std::get<1>(tlb_data.value()), target.chip)) { + if (tlb_data.has_value() && + address_in_tlb_space(address, size_in_bytes, tlb_index, std::get<1>(tlb_data.value()), target.chip)) { auto [tlb_offset, tlb_size] = tlb_data.value(); if (dev->bar4_wc != nullptr && tlb_size == BH_4GB_TLB_SIZE) { // This is only for Blackhole. 
If we want to read from DRAM (BAR4 space), we add offset @@ -930,9 +1204,9 @@ void Cluster::read_device_memory(void *mem_ptr, tt_cxy_pair target, std::uint32_ const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); const scoped_lock lock(*get_mutex(fallback_tlb, dev->get_device_num())); log_debug(LogSiliconDriver, " dynamic tlb_index: {}", tlb_index); - while(size_in_bytes > 0) { - - auto [mapped_address, tlb_size] = dev->set_dynamic_tlb(tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb)); + while (size_in_bytes > 0) { + auto [mapped_address, tlb_size] = dev->set_dynamic_tlb( + tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb)); uint32_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size); dev->read_block(mapped_address, transfer_size, buffer_addr); @@ -945,55 +1219,61 @@ void Cluster::read_device_memory(void *mem_ptr, tt_cxy_pair target, std::uint32_ } void Cluster::read_buffer( - void* mem_ptr, - std::uint32_t address, - std::uint16_t channel, - std::uint32_t size_in_bytes, - chip_id_t src_device_id) { - + void* mem_ptr, std::uint32_t address, std::uint16_t channel, std::uint32_t size_in_bytes, chip_id_t src_device_id) { log_assert(src_device_id != -1, "Must provide src_device_id for host_resident read/write"); - log_assert(m_pci_device_map.find(src_device_id) != m_pci_device_map.end(), "read_buffer: Device id is not a MMIO device"); + log_assert( + m_pci_device_map.find(src_device_id) != m_pci_device_map.end(), "read_buffer: Device id is not a MMIO device"); hugepage_mapping hugepage_map = m_pci_device_map.at(src_device_id)->get_hugepage_mapping(channel); - log_assert(hugepage_map.mapping, "read_buffer: Hugepages are not allocated for src_device_id: {} ch: {}." 
- " - Ensure sufficient number of Hugepages installed per device (1 per host mem ch, per device)", - src_device_id, - channel); + log_assert( + hugepage_map.mapping, + "read_buffer: Hugepages are not allocated for src_device_id: {} ch: {}." + " - Ensure sufficient number of Hugepages installed per device (1 per host mem ch, per device)", + src_device_id, + channel); + + void* user_scratchspace = static_cast(hugepage_map.mapping) + (address % hugepage_map.mapping_size); - void * user_scratchspace = static_cast(hugepage_map.mapping) + (address % hugepage_map.mapping_size); + log_debug( + LogSiliconDriver, + "Cluster::read_buffer (src_device_id: {}, ch: {}) from 0x{:x}", + src_device_id, + channel, + user_scratchspace); - log_debug(LogSiliconDriver, "Cluster::read_buffer (src_device_id: {}, ch: {}) from 0x{:x}", src_device_id, channel, user_scratchspace); - memcpy(mem_ptr, user_scratchspace, size_in_bytes); } void Cluster::write_buffer( - const void *mem_ptr, - std::uint32_t size, - std::uint32_t address, - std::uint16_t channel, - chip_id_t src_device_id) { - - log_assert(m_pci_device_map.find(src_device_id) != m_pci_device_map.end(), "write_buffer: Device id is not a MMIO device"); + const void* mem_ptr, std::uint32_t size, std::uint32_t address, std::uint16_t channel, chip_id_t src_device_id) { + log_assert( + m_pci_device_map.find(src_device_id) != m_pci_device_map.end(), "write_buffer: Device id is not a MMIO device"); hugepage_mapping hugepage_map = m_pci_device_map.at(src_device_id)->get_hugepage_mapping(channel); - log_assert(hugepage_map.mapping, "write_buffer: Hugepages are not allocated for src_device_id: {} ch: {}." 
- " - Ensure sufficient number of Hugepages installed per device (1 per host mem ch, per device)", - src_device_id, - channel); - - log_assert(size <= hugepage_map.mapping_size, "write_buffer data has larger size {} than destination buffer {}", size, hugepage_map.mapping_size); - log_debug(LogSiliconDriver, "Using hugepage mapping at address {} offset {} chan {} size {}", + log_assert( + hugepage_map.mapping, + "write_buffer: Hugepages are not allocated for src_device_id: {} ch: {}." + " - Ensure sufficient number of Hugepages installed per device (1 per host mem ch, per device)", + src_device_id, + channel); + + log_assert( + size <= hugepage_map.mapping_size, + "write_buffer data has larger size {} than destination buffer {}", + size, + hugepage_map.mapping_size); + log_debug( + LogSiliconDriver, + "Using hugepage mapping at address {} offset {} chan {} size {}", hugepage_map.mapping, (address % hugepage_map.mapping_size), channel, size); - void * user_scratchspace = static_cast(hugepage_map.mapping) + (address % hugepage_map.mapping_size); + void* user_scratchspace = static_cast(hugepage_map.mapping) + (address % hugepage_map.mapping_size); memcpy(user_scratchspace, mem_ptr, size); } - uint32_t Cluster::get_power_state_arc_msg(chip_id_t chip_id, tt_DevicePowerState state) { PCIDevice* pci_device = get_pci_device(chip_id); uint32_t msg = 0xaa00; @@ -1010,34 +1290,37 @@ uint32_t Cluster::get_power_state_arc_msg(chip_id_t chip_id, tt_DevicePowerState msg |= pci_device->get_architecture_implementation()->get_arc_message_arc_go_short_idle(); break; } - default: throw std::runtime_error("Unrecognized power state."); + default: + throw std::runtime_error("Unrecognized power state."); } return msg; } void Cluster::set_pcie_power_state(tt_DevicePowerState state) { - - for (auto &device_it : m_pci_device_map){ + for (auto& device_it : m_pci_device_map) { int chip_id = device_it.first; uint32_t msg = get_power_state_arc_msg(chip_id, state); std::stringstream ss; ss << 
state; auto exit_code = arc_msg(chip_id, 0xaa00 | msg, true, 0, 0); if (exit_code != 0) { - throw std::runtime_error(fmt::format("Failed to set power state to {} with exit code {}", ss.str(), exit_code)); + throw std::runtime_error( + fmt::format("Failed to set power state to {} with exit code {}", ss.str(), exit_code)); } } } int Cluster::get_clock(int logical_device_id) { - // TODO: remove this once ARC messages work. // This is currently used only for testing and bringing up Blackhole on Buda. if (arch_name == tt::ARCH::BLACKHOLE) { char* clk_env_var = getenv("TT_SILICON_DRIVER_AICLK"); if (clk_env_var != nullptr) { - log_warning(LogSiliconDriver, "ARC messages are not enabled on Blackhole. " - "Using AICLK value from environment variable TT_SILICON_DRIVER_AICLK: {}" , clk_env_var); + log_warning( + LogSiliconDriver, + "ARC messages are not enabled on Blackhole. " + "Using AICLK value from environment variable TT_SILICON_DRIVER_AICLK: {}", + clk_env_var); return std::stoi(clk_env_var); } } @@ -1045,7 +1328,14 @@ int Cluster::get_clock(int logical_device_id) { uint32_t clock; auto mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(logical_device_id); PCIDevice* pci_device = get_pci_device(mmio_capable_chip_logical); - auto exit_code = arc_msg(logical_device_id, 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_get_aiclk(), true, 0xFFFF, 0xFFFF, 1, &clock); + auto exit_code = arc_msg( + logical_device_id, + 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_get_aiclk(), + true, + 0xFFFF, + 0xFFFF, + 1, + &clock); if (exit_code != 0) { throw std::runtime_error(fmt::format("Failed to get aiclk value with exit code {}", exit_code)); } @@ -1053,16 +1343,15 @@ int Cluster::get_clock(int logical_device_id) { } std::map Cluster::get_clocks() { - std::map clock_freq_map; - for (auto &device_it : m_pci_device_map){ + std::map clock_freq_map; + for (auto& device_it : m_pci_device_map) { int d = device_it.first; 
clock_freq_map.insert({d, get_clock(d)}); } return clock_freq_map; } -Cluster::~Cluster () { - +Cluster::~Cluster() { log_debug(LogSiliconDriver, "Cluster::~Cluster"); cleanup_shared_host_state(); @@ -1083,23 +1372,34 @@ std::optional> Cluster::get_tlb_data_from_target( tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y)); auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name); tlb_data = architecture_implementation->describe_tlb(tlb_index); - } + } return tlb_data; } -void Cluster::configure_tlb(chip_id_t logical_device_id, tt_xy_pair core, std::int32_t tlb_index, std::int32_t address, uint64_t ordering) { - log_assert(ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || ordering == TLB_DATA::Relaxed, "Invalid ordering specified in Cluster::configure_tlb"); - PCIDevice *pci_device = get_pci_device(logical_device_id); +void Cluster::configure_tlb( + chip_id_t logical_device_id, tt_xy_pair core, std::int32_t tlb_index, std::int32_t address, uint64_t ordering) { + log_assert( + ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || ordering == TLB_DATA::Relaxed, + "Invalid ordering specified in Cluster::configure_tlb"); + PCIDevice* pci_device = get_pci_device(logical_device_id); pci_device->set_dynamic_tlb(tlb_index, core, address, harvested_coord_translation, ordering); auto tlb_size = std::get<1>(pci_device->get_architecture_implementation()->describe_tlb(tlb_index).value()); - if(tlb_config_map.find(logical_device_id) == tlb_config_map.end()) tlb_config_map.insert({logical_device_id, {}}); + if (tlb_config_map.find(logical_device_id) == tlb_config_map.end()) { + tlb_config_map.insert({logical_device_id, {}}); + } tlb_config_map[logical_device_id].insert({tlb_index, (address / tlb_size) * tlb_size}); } void Cluster::set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, uint64_t ordering) { - log_assert(ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || 
ordering == TLB_DATA::Relaxed, "Invalid ordering specified in Cluster::configure_tlb."); - log_assert(dynamic_tlb_ordering_modes.find(fallback_tlb) != dynamic_tlb_ordering_modes.end(), "Invalid TLB specified in Cluster::set_fallback_tlb_ordering_mode."); - log_assert(fallback_tlb != "LARGE_READ_TLB" && fallback_tlb != "LARGE_WRITE_TLB", "Ordering modes for LARGE_READ_TLB and LARGE_WRITE_TLB cannot be modified."); + log_assert( + ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || ordering == TLB_DATA::Relaxed, + "Invalid ordering specified in Cluster::configure_tlb."); + log_assert( + dynamic_tlb_ordering_modes.find(fallback_tlb) != dynamic_tlb_ordering_modes.end(), + "Invalid TLB specified in Cluster::set_fallback_tlb_ordering_mode."); + log_assert( + fallback_tlb != "LARGE_READ_TLB" && fallback_tlb != "LARGE_WRITE_TLB", + "Ordering modes for LARGE_READ_TLB and LARGE_WRITE_TLB cannot be modified."); dynamic_tlb_ordering_modes.at(fallback_tlb) = ordering; } @@ -1109,7 +1409,7 @@ void Cluster::init_pcie_iatus() { int num_enabled_devices = m_pci_device_map.size(); log_debug(LogSiliconDriver, "Cluster::init_pcie_iatus() num_enabled_devices: {}", num_enabled_devices); - for (auto &src_device_it : m_pci_device_map){ + for (auto& src_device_it : m_pci_device_map) { int logical_id = src_device_it.first; PCIDevice* src_pci_device = src_device_it.second.get(); @@ -1119,72 +1419,86 @@ void Cluster::init_pcie_iatus() { if (hugepage_map.mapping) { std::uint32_t region_size = hugepage_map.mapping_size; if (channel_id == 3) { - region_size = HUGEPAGE_CHANNEL_3_SIZE_LIMIT; + region_size = HUGEPAGE_CHANNEL_3_SIZE_LIMIT; } // This log message doesn't look right. 
- log_debug(LogSiliconDriver, "Configuring ATU channel {} to point to hugepage {}.", channel_id, logical_id); + log_debug( + LogSiliconDriver, "Configuring ATU channel {} to point to hugepage {}.", channel_id, logical_id); iatu_configure_peer_region(logical_id, channel_id, hugepage_map.physical_address, region_size); } else { - throw std::runtime_error(fmt::format("init_pcie_iatus: Hugepages are not allocated for logical device id: {} ch: {}", logical_id, channel_id)); + throw std::runtime_error(fmt::format( + "init_pcie_iatus: Hugepages are not allocated for logical device id: {} ch: {}", + logical_id, + channel_id)); } } } } -int Cluster::test_setup_interface () { +int Cluster::test_setup_interface() { if (arch_name == tt::ARCH::GRAYSKULL) { int ret_val = 0; - PCIDevice *dev = m_pci_device_map.begin()->second.get(); + PCIDevice* dev = m_pci_device_map.begin()->second.get(); - uint32_t mapped_reg = dev->set_dynamic_tlb(dev->get_architecture_implementation()->get_reg_tlb(), tt_xy_pair(0, 0), 0xffb20108, harvested_coord_translation).bar_offset; + uint32_t mapped_reg = dev->set_dynamic_tlb( + dev->get_architecture_implementation()->get_reg_tlb(), + tt_xy_pair(0, 0), + 0xffb20108, + harvested_coord_translation) + .bar_offset; uint32_t regval = 0; dev->read_regs(mapped_reg, 1, ®val); ret_val = (regval != 0xffffffff && ((regval & 0x1) == 1)) ? 
0 : 1; return ret_val; - } - else if (arch_name == tt::ARCH::WORMHOLE_B0) { + } else if (arch_name == tt::ARCH::WORMHOLE_B0) { int ret_val = 0; - PCIDevice *dev = m_pci_device_map.begin()->second.get(); + PCIDevice* dev = m_pci_device_map.begin()->second.get(); - uint32_t mapped_reg = dev->set_dynamic_tlb(dev->get_architecture_implementation()->get_reg_tlb(), tt_xy_pair(1, 0), 0xffb20108, harvested_coord_translation).bar_offset; + uint32_t mapped_reg = dev->set_dynamic_tlb( + dev->get_architecture_implementation()->get_reg_tlb(), + tt_xy_pair(1, 0), + 0xffb20108, + harvested_coord_translation) + .bar_offset; uint32_t regval = 0; dev->read_regs(mapped_reg, 1, ®val); ret_val = (regval != 0xffffffff && (regval == 33)) ? 0 : 1; return ret_val; - } - else if (arch_name == tt::ARCH::BLACKHOLE) { + } else if (arch_name == tt::ARCH::BLACKHOLE) { // MT Inital BH - Try to enable this, but double check "regval == 33" // int ret_val = 0; // PCIDevice *dev = m_pci_device_map.begin()->second->hdev; - // uint32_t mapped_reg = dev->set_dynamic_tlb(m_pci_device_map.begin()->second, dev->get_architecture_implementation()->get_reg_tlb(), tt_xy_pair(1, 0), 0xffb20108, harvested_coord_translation).bar_offset; + // uint32_t mapped_reg = dev->set_dynamic_tlb(m_pci_device_map.begin()->second, + // dev->get_architecture_implementation()->get_reg_tlb(), tt_xy_pair(1, 0), 0xffb20108, + // harvested_coord_translation).bar_offset; // uint32_t regval = 0; // read_regs(dev, mapped_reg, 1, ®val); // ret_val = (regval != 0xffffffff && (regval == 33)) ? 
0 : 1; // return ret_val; return 0; - } - else { + } else { throw std::runtime_error(fmt::format("Unsupported architecture: {}", get_arch_str(arch_name))); } } -void Cluster::bar_write32 (int logical_device_id, uint32_t addr, uint32_t data) { - PCIDevice *dev = get_pci_device(logical_device_id); +void Cluster::bar_write32(int logical_device_id, uint32_t addr, uint32_t data) { + PCIDevice* dev = get_pci_device(logical_device_id); if (addr < dev->bar0_uc_offset) { - dev->write_block(addr, sizeof(data), reinterpret_cast(&data)); // do we have to reinterpret_cast? + dev->write_block( + addr, sizeof(data), reinterpret_cast(&data)); // do we have to reinterpret_cast? } else { dev->write_regs(addr, 1, &data); } } -uint32_t Cluster::bar_read32 (int logical_device_id, uint32_t addr) { +uint32_t Cluster::bar_read32(int logical_device_id, uint32_t addr) { PCIDevice* dev = get_pci_device(logical_device_id); uint32_t data; @@ -1197,32 +1511,39 @@ uint32_t Cluster::bar_read32 (int logical_device_id, uint32_t addr) { } // Returns 0 if everything was OK -int Cluster::pcie_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done, uint32_t arg0, uint32_t arg1, int timeout, uint32_t *return_3, uint32_t *return_4) { - - +int Cluster::pcie_arc_msg( + int logical_device_id, + uint32_t msg_code, + bool wait_for_done, + uint32_t arg0, + uint32_t arg1, + int timeout, + uint32_t* return_3, + uint32_t* return_4) { if ((msg_code & 0xff00) != 0xaa00) { log_error("Malformed message. 
msg_code is 0x{:x} but should be 0xaa..", msg_code); } - log_assert(arg0 <= 0xffff and arg1 <= 0xffff, "Only 16 bits allowed in arc_msg args"); // Only 16 bits are allowed + log_assert(arg0 <= 0xffff and arg1 <= 0xffff, "Only 16 bits allowed in arc_msg args"); // Only 16 bits are allowed - PCIDevice *pci_device = get_pci_device(logical_device_id); + PCIDevice* pci_device = get_pci_device(logical_device_id); auto architecture_implementation = pci_device->get_architecture_implementation(); // Exclusive access for a single process at a time. Based on physical pci interface id. std::string msg_type = "ARC_MSG"; const scoped_lock lock(*get_mutex(msg_type, pci_device->get_device_num())); - uint32_t fw_arg = arg0 | (arg1<<16); + uint32_t fw_arg = arg0 | (arg1 << 16); int exit_code = 0; - bar_write32 (logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4, fw_arg); - bar_write32 (logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 5 * 4, msg_code); + bar_write32(logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4, fw_arg); + bar_write32(logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 5 * 4, msg_code); - uint32_t misc = bar_read32 (logical_device_id, architecture_implementation->get_arc_reset_arc_misc_cntl_offset()); + uint32_t misc = bar_read32(logical_device_id, architecture_implementation->get_arc_reset_arc_misc_cntl_offset()); if (misc & (1 << 16)) { log_error("trigger_fw_int failed on device {}", logical_device_id); return 1; } else { - bar_write32(logical_device_id, architecture_implementation->get_arc_reset_arc_misc_cntl_offset(), misc | (1 << 16)); + bar_write32( + logical_device_id, architecture_implementation->get_arc_reset_arc_misc_cntl_offset(), misc | (1 << 16)); } if (wait_for_done) { @@ -1231,24 +1552,31 @@ int Cluster::pcie_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_fo auto start = std::chrono::system_clock::now(); 
while (true) { if (std::chrono::system_clock::now() - start > timeout_seconds) { - throw std::runtime_error(fmt::format("Timed out after waiting {} seconds for device {} ARC to respond", timeout, logical_device_id)); + throw std::runtime_error(fmt::format( + "Timed out after waiting {} seconds for device {} ARC to respond", timeout, logical_device_id)); } status = bar_read32(logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 5 * 4); if ((status & 0xffff) == (msg_code & 0xff)) { if (return_3 != nullptr) { - *return_3 = bar_read32(logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4); + *return_3 = bar_read32( + logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4); } if (return_4 != nullptr) { - *return_4 = bar_read32(logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 4 * 4); + *return_4 = bar_read32( + logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 4 * 4); } exit_code = (status & 0xffff0000) >> 16; break; } else if (status == MSG_ERROR_REPLY) { - log_warning(LogSiliconDriver, "On device {}, message code 0x{:x} not recognized by FW", logical_device_id, msg_code); + log_warning( + LogSiliconDriver, + "On device {}, message code 0x{:x} not recognized by FW", + logical_device_id, + msg_code); exit_code = MSG_ERROR_REPLY; break; } @@ -1259,12 +1587,16 @@ int Cluster::pcie_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_fo return exit_code; } -int Cluster::iatu_configure_peer_region (int logical_device_id, uint32_t peer_region_id, uint64_t bar_addr_64, uint32_t region_size) { +int Cluster::iatu_configure_peer_region( + int logical_device_id, uint32_t peer_region_id, uint64_t bar_addr_64, uint32_t region_size) { uint32_t dest_bar_lo = bar_addr_64 & 0xffffffff; uint32_t dest_bar_hi = (bar_addr_64 >> 32) & 0xffffffff; std::uint32_t region_id_to_use = peer_region_id; - if(peer_region_id == 3) 
region_id_to_use = 4; // Hack use region 4 for channel 3..this ensures that we have a smaller chan 3 address space with the correct start offset - PCIDevice *pci_device = get_pci_device(logical_device_id); + if (peer_region_id == 3) { + region_id_to_use = 4; // Hack use region 4 for channel 3..this ensures that we have a smaller chan 3 address + // space with the correct start offset + } + PCIDevice* pci_device = get_pci_device(logical_device_id); auto architecture_implementation = pci_device->get_architecture_implementation(); // BR: ARC doesn't work yet on Blackhole, so programming ATU directly. Should be removed when arc starts working. @@ -1274,8 +1606,8 @@ int Cluster::iatu_configure_peer_region (int logical_device_id, uint32_t peer_re uint64_t base_size = (region_id_to_use + 1) * region_size; uint64_t limit_address = base_addr + base_size - 1; - uint32_t region_ctrl_1 = 1 << 13; // INCREASE_REGION_SIZE = 1 - uint32_t region_ctrl_2 = 1 << 31; // REGION_EN = 1 + uint32_t region_ctrl_1 = 1 << 13; // INCREASE_REGION_SIZE = 1 + uint32_t region_ctrl_2 = 1 << 31; // REGION_EN = 1 uint32_t region_ctrl_3 = 0; uint32_t base_addr_lo = base_addr & 0xffffffff; uint32_t base_addr_hi = (base_addr >> 32) & 0xffffffff; @@ -1285,43 +1617,83 @@ int Cluster::iatu_configure_peer_region (int logical_device_id, uint32_t peer_re uint64_t iatu_index = 0; uint64_t iatu_base = UNROLL_ATU_OFFSET_BAR + iatu_index * 0x200; - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x00), ®ion_ctrl_1, 1); - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x04), ®ion_ctrl_2, 1); - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x08), &base_addr_lo, 1); - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x0c), &base_addr_hi, 1); - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x10), &limit_address_lo, 1); - 
pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x14), &dest_bar_lo, 1); - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x18), &dest_bar_hi, 1); - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x1c), ®ion_ctrl_3, 1); - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x20), &limit_address_hi, 1); - } - else { - bar_write32(logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 0 * 4, region_id_to_use); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x00), + ®ion_ctrl_1, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x04), + ®ion_ctrl_2, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x08), + &base_addr_lo, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x0c), + &base_addr_hi, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x10), + &limit_address_lo, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x14), + &dest_bar_lo, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x18), + &dest_bar_hi, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x1c), + ®ion_ctrl_3, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x20), + &limit_address_hi, + 1); + } else { + bar_write32( + logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 0 * 4, region_id_to_use); bar_write32(logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 1 * 4, dest_bar_lo); bar_write32(logical_device_id, 
architecture_implementation->get_arc_csm_mailbox_offset() + 2 * 4, dest_bar_hi); bar_write32(logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 3 * 4, region_size); - arc_msg(logical_device_id, 0xaa00 | architecture_implementation->get_arc_message_setup_iatu_for_peer_to_peer(), true, 0, 0); + arc_msg( + logical_device_id, + 0xaa00 | architecture_implementation->get_arc_message_setup_iatu_for_peer_to_peer(), + true, + 0, + 0); } // Print what just happened - uint32_t peer_region_start = region_id_to_use*region_size; - uint32_t peer_region_end = (region_id_to_use+1)*region_size - 1; - log_debug(LogSiliconDriver, " [region id {}] NOC to PCI address range 0x{:x}-0x{:x} mapped to addr 0x{:x}", peer_region_id, peer_region_start, peer_region_end, bar_addr_64); + uint32_t peer_region_start = region_id_to_use * region_size; + uint32_t peer_region_end = (region_id_to_use + 1) * region_size - 1; + log_debug( + LogSiliconDriver, + " [region id {}] NOC to PCI address range 0x{:x}-0x{:x} mapped to addr 0x{:x}", + peer_region_id, + peer_region_start, + peer_region_end, + bar_addr_64); return 0; } // Returns broken rows as bits set to 1 in 'memory' and 'logic' uint32_t Cluster::get_harvested_noc_rows(uint32_t harvesting_mask) { auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name); - const std::vector &harv_to_noc_loc = architecture_implementation->get_harvesting_noc_locations(); + const std::vector& harv_to_noc_loc = architecture_implementation->get_harvesting_noc_locations(); uint32_t harv_noc_rows = 0; std::string harv_noc_rows_str = ""; - for (int pos=0; pos> 1; @@ -1332,36 +1704,45 @@ uint32_t Cluster::get_harvested_noc_rows(uint32_t harvesting_mask) { return harv_noc_rows; } -uint32_t Cluster::get_harvested_rows (int logical_device_id) { +uint32_t Cluster::get_harvested_rows(int logical_device_id) { const char* harv_override = std::getenv("T6PY_HARVESTING_OVERRIDE"); uint32_t harv = 0xffffffff; if 
(harv_override) { harv = std::stoul(harv_override, nullptr, 16); } else { auto mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(logical_device_id); - PCIDevice *pci_device = get_pci_device(mmio_capable_chip_logical); - int harvesting_msg_code = arc_msg(logical_device_id, 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_arc_get_harvesting(), true, 0, 0, 1, &harv); - log_assert(harvesting_msg_code != MSG_ERROR_REPLY, "Failed to read harvested rows from device {}", logical_device_id); + PCIDevice* pci_device = get_pci_device(mmio_capable_chip_logical); + int harvesting_msg_code = arc_msg( + logical_device_id, + 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_arc_get_harvesting(), + true, + 0, + 0, + 1, + &harv); + log_assert( + harvesting_msg_code != MSG_ERROR_REPLY, "Failed to read harvested rows from device {}", logical_device_id); } log_assert(harv != 0xffffffff, "Readback 0xffffffff for harvesting info. Chip is fused incorrectly!"); - log_debug(LogSiliconDriver, "HARVESTING {}, 0x{:x}", (harv==0) ? "DISABLED":"ENABLED", harv); - + log_debug(LogSiliconDriver, "HARVESTING {}, 0x{:x}", (harv == 0) ? 
"DISABLED" : "ENABLED", harv); + uint32_t memory = harv & 0x3ff; uint32_t logic = (harv >> 10) & 0x3ff; - return (memory|logic); + return (memory | logic); } -uint32_t Cluster::get_harvested_noc_rows_for_chip (int logical_device_id) { +uint32_t Cluster::get_harvested_noc_rows_for_chip(int logical_device_id) { return get_harvested_noc_rows(get_harvested_rows(logical_device_id)); } -void Cluster::enable_local_ethernet_queue(const chip_id_t &device_id, int timeout) { +void Cluster::enable_local_ethernet_queue(const chip_id_t& device_id, int timeout) { uint32_t msg_success = 0x0; auto timeout_seconds = std::chrono::seconds(timeout); auto start = std::chrono::system_clock::now(); while (msg_success != 1) { if (std::chrono::system_clock::now() - start > timeout_seconds) { - throw std::runtime_error(fmt::format("Timed out after waiting {} seconds for for DRAM to finish training", timeout)); + throw std::runtime_error( + fmt::format("Timed out after waiting {} seconds for for DRAM to finish training", timeout)); } if (arc_msg(device_id, 0xaa58, true, 0xFFFF, 0xFFFF, 1, &msg_success) == MSG_ERROR_REPLY) { @@ -1370,7 +1751,7 @@ void Cluster::enable_local_ethernet_queue(const chip_id_t &device_id, int timeou } } -void *Cluster::host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const { +void* Cluster::host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const { hugepage_mapping hugepage_map = m_pci_device_map.at(src_device_id)->get_hugepage_mapping(channel); if (hugepage_map.mapping != nullptr) { return static_cast(hugepage_map.mapping) + offset; @@ -1381,13 +1762,14 @@ void *Cluster::host_dma_address(std::uint64_t offset, chip_id_t src_device_id, u // Wrapper for throwing more helpful exception when not-enabled pci intf is accessed. 
inline PCIDevice* Cluster::get_pci_device(int device_id) const { - if (!m_pci_device_map.count(device_id)){ + if (!m_pci_device_map.count(device_id)) { throw std::runtime_error(fmt::format("device_id: {} attempted to be accessed, but is not enabled.", device_id)); } return m_pci_device_map.at(device_id).get(); } -std::shared_ptr Cluster::get_mutex(const std::string& tlb_name, int pci_interface_id) { +std::shared_ptr Cluster::get_mutex( + const std::string& tlb_name, int pci_interface_id) { std::string mutex_name = tlb_name + std::to_string(pci_interface_id); return hardware_resource_mutex_map.at(mutex_name); } @@ -1415,7 +1797,8 @@ uint16_t Cluster::get_sys_rack(uint32_t rack_x, uint32_t rack_y) { } bool Cluster::is_non_mmio_cmd_q_full(uint32_t curr_wptr, uint32_t curr_rptr) { - return (curr_wptr != curr_rptr) && ((curr_wptr & eth_interface_params.cmd_buf_size_mask) == (curr_rptr & eth_interface_params.cmd_buf_size_mask)); + return (curr_wptr != curr_rptr) && ((curr_wptr & eth_interface_params.cmd_buf_size_mask) == + (curr_rptr & eth_interface_params.cmd_buf_size_mask)); } /* @@ -1464,35 +1847,37 @@ bool Cluster::is_non_mmio_cmd_q_full(uint32_t curr_wptr, uint32_t curr_rptr) { * Other schemes may be more performant. */ - /* * Note that this function is required to acquire the `NON_MMIO_MUTEX_NAME` mutex for interacting with the * ethernet core (host) command queue DO NOT issue any pcie reads/writes to the ethernet core prior to acquiring the * mutex. 
For extra information, see the "NON_MMIO_MUTEX Usage" above */ - void Cluster::write_to_non_mmio_device( - const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t address, - bool broadcast, std::vector broadcast_header) { - + const void* mem_ptr, + uint32_t size_in_bytes, + tt_cxy_pair core, + uint64_t address, + bool broadcast, + std::vector broadcast_header) { chip_id_t mmio_capable_chip_logical; - - if(broadcast) { + + if (broadcast) { mmio_capable_chip_logical = core.chip; - } - else { + } else { mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(core.chip); } flush_non_mmio_per_chip[ndesc->get_closest_mmio_capable_chip(core.chip)] = true; if (non_mmio_transfer_cores_customized) { - log_assert(active_eth_core_idx_per_chip.find(mmio_capable_chip_logical) != active_eth_core_idx_per_chip.end(), "Ethernet Cores for Host to Cluster communication were not initialized for all MMIO devices."); + log_assert( + active_eth_core_idx_per_chip.find(mmio_capable_chip_logical) != active_eth_core_idx_per_chip.end(), + "Ethernet Cores for Host to Cluster communication were not initialized for all MMIO devices."); } using data_word_t = uint32_t; constexpr int DATA_WORD_SIZE = sizeof(data_word_t); - constexpr int BROADCAST_HEADER_SIZE = sizeof(data_word_t) * 8; // Broadcast header is 8 words + constexpr int BROADCAST_HEADER_SIZE = sizeof(data_word_t) * 8; // Broadcast header is 8 words const auto target_chip = ndesc->get_chip_locations().at(core.chip); std::string write_tlb = "LARGE_WRITE_TLB"; @@ -1501,14 +1886,15 @@ void Cluster::write_to_non_mmio_device( translate_to_noc_table_coords(core.chip, core.y, core.x); std::vector erisc_command; std::vector erisc_q_rptr = std::vector(1); - std::vector erisc_q_ptrs = std::vector(eth_interface_params.remote_update_ptr_size_bytes*2 / sizeof(uint32_t)); + std::vector erisc_q_ptrs = + std::vector(eth_interface_params.remote_update_ptr_size_bytes * 2 / sizeof(uint32_t)); std::vector data_block; - routing_cmd_t 
*new_cmd; + routing_cmd_t* new_cmd; uint32_t buffer_id = 0; - uint32_t timestamp = 0; //CMD_TIMESTAMP; + uint32_t timestamp = 0; // CMD_TIMESTAMP; bool use_dram; uint32_t max_block_size; @@ -1520,14 +1906,22 @@ void Cluster::write_to_non_mmio_device( // MUTEX ACQUIRE (NON-MMIO) // do not locate any ethernet core reads/writes before this acquire // - const scoped_lock lock(*get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->get_device_num())); - - int& active_core_for_txn = non_mmio_transfer_cores_customized ? active_eth_core_idx_per_chip.at(mmio_capable_chip_logical) : active_core; - tt_cxy_pair remote_transfer_ethernet_core = remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn]; - - erisc_command.resize(sizeof(routing_cmd_t)/DATA_WORD_SIZE); - new_cmd = (routing_cmd_t *)&erisc_command[0]; - read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); + const scoped_lock lock( + *get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->get_device_num())); + + int& active_core_for_txn = + non_mmio_transfer_cores_customized ? 
active_eth_core_idx_per_chip.at(mmio_capable_chip_logical) : active_core; + tt_cxy_pair remote_transfer_ethernet_core = + remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn]; + + erisc_command.resize(sizeof(routing_cmd_t) / DATA_WORD_SIZE); + new_cmd = (routing_cmd_t*)&erisc_command[0]; + read_device_memory( + erisc_q_ptrs.data(), + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + eth_interface_params.remote_update_ptr_size_bytes * 2, + read_tlb); uint32_t full_count = 0; uint32_t offset = 0; uint32_t block_size; @@ -1537,40 +1931,55 @@ void Cluster::write_to_non_mmio_device( erisc_q_rptr[0] = erisc_q_ptrs[4]; while (offset < size_in_bytes) { while (full) { - read_device_memory(erisc_q_rptr.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes + eth_interface_params.remote_update_ptr_size_bytes, DATA_WORD_SIZE, read_tlb); - full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0],erisc_q_rptr[0]); + read_device_memory( + erisc_q_rptr.data(), + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes + + eth_interface_params.remote_update_ptr_size_bytes, + DATA_WORD_SIZE, + read_tlb); + full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_rptr[0]); full_count++; } - //full = true; - // set full only if this command will make the q full. - // otherwise full stays false so that we do not poll the rd pointer in next iteration. - // As long as current command push does not fill up the queue completely, we do not want - // to poll rd pointer in every iteration. - //full = is_non_mmio_cmd_q_full((erisc_q_ptrs[0] + 1) & CMD_BUF_PTR_MASK, erisc_q_rptr[0]); + // full = true; + // set full only if this command will make the q full. + // otherwise full stays false so that we do not poll the rd pointer in next iteration. 
+ // As long as current command push does not fill up the queue completely, we do not want + // to poll rd pointer in every iteration. + // full = is_non_mmio_cmd_q_full((erisc_q_ptrs[0] + 1) & CMD_BUF_PTR_MASK, erisc_q_rptr[0]); uint32_t req_wr_ptr = erisc_q_ptrs[0] & eth_interface_params.cmd_buf_size_mask; - if ((address + offset) & 0x1F) { // address not 32-byte aligned - block_size = DATA_WORD_SIZE; // 4 byte aligned + if ((address + offset) & 0x1F) { // address not 32-byte aligned + block_size = DATA_WORD_SIZE; // 4 byte aligned } else { // For broadcast we prepend a 32byte header. Decrease block size (size of payload) by this amount. - block_size = offset + max_block_size > size_in_bytes + 32 * broadcast ? size_in_bytes - offset : max_block_size - 32 * broadcast; + block_size = offset + max_block_size > size_in_bytes + 32 * broadcast ? size_in_bytes - offset + : max_block_size - 32 * broadcast; // Explictly align block_size to 4 bytes, in case the input buffer is not uint32_t aligned uint32_t alignment_mask = sizeof(uint32_t) - 1; block_size = (block_size + alignment_mask) & ~alignment_mask; } - // For 4 byte aligned data, transfer_size always == block_size. For unaligned data, transfer_size < block_size in the last block - uint64_t transfer_size = std::min(block_size, size_in_bytes - offset); // Host side data size that needs to be copied + // For 4 byte aligned data, transfer_size always == block_size. For unaligned data, transfer_size < block_size + // in the last block + uint64_t transfer_size = + std::min(block_size, size_in_bytes - offset); // Host side data size that needs to be copied // Use block mode for broadcast - uint32_t req_flags = (broadcast || (block_size > DATA_WORD_SIZE)) ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_req | timestamp) : eth_interface_params.cmd_wr_req; - uint32_t resp_flags = block_size > DATA_WORD_SIZE ? 
(eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_ack) : eth_interface_params.cmd_wr_ack; + uint32_t req_flags = (broadcast || (block_size > DATA_WORD_SIZE)) + ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_req | timestamp) + : eth_interface_params.cmd_wr_req; + uint32_t resp_flags = block_size > DATA_WORD_SIZE + ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_ack) + : eth_interface_params.cmd_wr_ack; timestamp = 0; - - if(broadcast) { + + if (broadcast) { req_flags |= eth_interface_params.cmd_broadcast; } - uint32_t host_dram_block_addr = host_address_params.eth_routing_buffers_start + (active_core_for_txn * eth_interface_params.cmd_buf_size + req_wr_ptr) * max_block_size; - uint16_t host_dram_channel = 0; // This needs to be 0, since WH can only map ETH buffers to chan 0. + uint32_t host_dram_block_addr = + host_address_params.eth_routing_buffers_start + + (active_core_for_txn * eth_interface_params.cmd_buf_size + req_wr_ptr) * max_block_size; + uint16_t host_dram_channel = 0; // This needs to be 0, since WH can only map ETH buffers to chan 0. 
if (req_flags & eth_interface_params.cmd_data_block) { // Copy data to sysmem or device DRAM for Block mode @@ -1579,46 +1988,60 @@ void Cluster::write_to_non_mmio_device( resp_flags |= eth_interface_params.cmd_data_block_dram; size_buffer_to_capacity(data_block, block_size); memcpy(&data_block[0], (uint8_t*)mem_ptr + offset, transfer_size); - if(broadcast) { + if (broadcast) { // Write broadcast header to sysmem - write_to_sysmem(broadcast_header.data(), broadcast_header.size() * sizeof(uint32_t), host_dram_block_addr, host_dram_channel, mmio_capable_chip_logical); + write_to_sysmem( + broadcast_header.data(), + broadcast_header.size() * sizeof(uint32_t), + host_dram_block_addr, + host_dram_channel, + mmio_capable_chip_logical); } // Write payload to sysmem - write_to_sysmem(data_block.data(), data_block.size() * DATA_WORD_SIZE, host_dram_block_addr + BROADCAST_HEADER_SIZE * broadcast, host_dram_channel, mmio_capable_chip_logical); + write_to_sysmem( + data_block.data(), + data_block.size() * DATA_WORD_SIZE, + host_dram_block_addr + BROADCAST_HEADER_SIZE * broadcast, + host_dram_channel, + mmio_capable_chip_logical); } else { uint32_t buf_address = eth_interface_params.eth_routing_data_buffer_addr + req_wr_ptr * max_block_size; size_buffer_to_capacity(data_block, block_size); memcpy(&data_block[0], (uint8_t*)mem_ptr + offset, transfer_size); - write_device_memory(data_block.data(), data_block.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, buf_address, write_tlb); + write_device_memory( + data_block.data(), + data_block.size() * DATA_WORD_SIZE, + remote_transfer_ethernet_core, + buf_address, + write_tlb); } tt_driver_atomics::sfence(); } // Send the read request - log_assert(broadcast || (req_flags == eth_interface_params.cmd_wr_req) || (((address + offset) % 32) == 0), "Block mode address must be 32-byte aligned."); // Block mode address must be 32-byte aligned. 
- - if(broadcast) { + log_assert( + broadcast || (req_flags == eth_interface_params.cmd_wr_req) || (((address + offset) % 32) == 0), + "Block mode address must be 32-byte aligned."); // Block mode address must be 32-byte aligned. + + if (broadcast) { // Only specify endpoint local address for broadcast new_cmd->sys_addr = address + offset; + } else { + new_cmd->sys_addr = get_sys_addr(target_chip.x, target_chip.y, core.x, core.y, address + offset); + new_cmd->rack = get_sys_rack(target_chip.rack, target_chip.shelf); } - else { - new_cmd->sys_addr = get_sys_addr(std::get<0>(target_chip), std::get<1>(target_chip), core.x, core.y, address + offset); - new_cmd->rack = get_sys_rack(std::get<2>(target_chip), std::get<3>(target_chip)); - } - - if(req_flags & eth_interface_params.cmd_data_block) { + + if (req_flags & eth_interface_params.cmd_data_block) { // Block mode new_cmd->data = block_size + BROADCAST_HEADER_SIZE * broadcast; - } - else { - if(size_in_bytes - offset < sizeof(uint32_t)) { + } else { + if (size_in_bytes - offset < sizeof(uint32_t)) { // Handle misalignment at the end of the buffer: // Assemble a padded uint32_t from single bytes, in case we have less than 4 bytes remaining memcpy(&new_cmd->data, static_cast(mem_ptr) + offset, size_in_bytes - offset); - } - else { - new_cmd->data = *((uint32_t*)mem_ptr + offset/DATA_WORD_SIZE); + } else { + new_cmd->data = *((uint32_t*)mem_ptr + offset / DATA_WORD_SIZE); } } @@ -1626,14 +2049,24 @@ void Cluster::write_to_non_mmio_device( if (use_dram) { new_cmd->src_addr_tag = host_dram_block_addr; } - write_device_memory(erisc_command.data(), erisc_command.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_routing_cmd_queue_base + (sizeof(routing_cmd_t) * req_wr_ptr), write_tlb); + write_device_memory( + erisc_command.data(), + erisc_command.size() * DATA_WORD_SIZE, + remote_transfer_ethernet_core, + eth_interface_params.request_routing_cmd_queue_base + (sizeof(routing_cmd_t) * 
req_wr_ptr), + write_tlb); tt_driver_atomics::sfence(); erisc_q_ptrs[0] = (erisc_q_ptrs[0] + 1) & eth_interface_params.cmd_buf_ptr_mask; std::vector erisc_q_wptr; erisc_q_wptr.resize(1); erisc_q_wptr[0] = erisc_q_ptrs[0]; - write_device_memory(erisc_q_wptr.data(), erisc_q_wptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, write_tlb); + write_device_memory( + erisc_q_wptr.data(), + erisc_q_wptr.size() * DATA_WORD_SIZE, + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + write_tlb); tt_driver_atomics::sfence(); offset += transfer_size; @@ -1646,10 +2079,19 @@ void Cluster::write_to_non_mmio_device( if (is_non_mmio_cmd_q_full((erisc_q_ptrs[0]) & eth_interface_params.cmd_buf_ptr_mask, erisc_q_rptr[0])) { active_core_for_txn++; uint32_t update_mask_for_chip = remote_transfer_ethernet_cores[mmio_capable_chip_logical].size() - 1; - active_core_for_txn = non_mmio_transfer_cores_customized ? (active_core_for_txn & update_mask_for_chip) : ((active_core_for_txn & NON_EPOCH_ETH_CORES_MASK) + NON_EPOCH_ETH_CORES_START_ID); + active_core_for_txn = + non_mmio_transfer_cores_customized + ? 
(active_core_for_txn & update_mask_for_chip) + : ((active_core_for_txn & NON_EPOCH_ETH_CORES_MASK) + NON_EPOCH_ETH_CORES_START_ID); // active_core = (active_core & NON_EPOCH_ETH_CORES_MASK) + NON_EPOCH_ETH_CORES_START_ID; - remote_transfer_ethernet_core = remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn]; - read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); + remote_transfer_ethernet_core = + remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn]; + read_device_memory( + erisc_q_ptrs.data(), + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + eth_interface_params.remote_update_ptr_size_bytes * 2, + read_tlb); full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_ptrs[4]); erisc_q_rptr[0] = erisc_q_ptrs[4]; } @@ -1657,11 +2099,11 @@ void Cluster::write_to_non_mmio_device( } /* - * Note that this function is required to acquire the `NON_MMIO_MUTEX_NAME` mutex for interacting with the ethernet core (host) command queue - * DO NOT use `active_core` or issue any pcie reads/writes to the ethernet core prior to acquiring the mutex. For extra information, see the "NON_MMIO_MUTEX Usage" above + * Note that this function is required to acquire the `NON_MMIO_MUTEX_NAME` mutex for interacting with the ethernet core + * (host) command queue DO NOT use `active_core` or issue any pcie reads/writes to the ethernet core prior to acquiring + * the mutex. 
For extra information, see the "NON_MMIO_MUTEX Usage" above */ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_t address, uint32_t size_in_bytes) { - using data_word_t = uint32_t; constexpr int DATA_WORD_SIZE = sizeof(data_word_t); std::string write_tlb = "LARGE_WRITE_TLB"; @@ -1669,33 +2111,50 @@ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_ std::string empty_tlb = ""; translate_to_noc_table_coords(core.chip, core.y, core.x); - const auto &mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(core.chip); + const auto& mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(core.chip); const eth_coord_t target_chip = ndesc->get_chip_locations().at(core.chip); std::vector erisc_command; std::vector erisc_q_rptr; - std::vector erisc_q_ptrs = std::vector(eth_interface_params.remote_update_ptr_size_bytes*2 / DATA_WORD_SIZE); + std::vector erisc_q_ptrs = + std::vector(eth_interface_params.remote_update_ptr_size_bytes * 2 / DATA_WORD_SIZE); std::vector erisc_resp_q_wptr = std::vector(1); std::vector erisc_resp_q_rptr = std::vector(1); - std::vector data_block; - routing_cmd_t *new_cmd; + routing_cmd_t* new_cmd; - erisc_command.resize(sizeof(routing_cmd_t)/DATA_WORD_SIZE); - new_cmd = (routing_cmd_t *)&erisc_command[0]; + erisc_command.resize(sizeof(routing_cmd_t) / DATA_WORD_SIZE); + new_cmd = (routing_cmd_t*)&erisc_command[0]; // // MUTEX ACQUIRE (NON-MMIO) // do not locate any ethernet core reads/writes before this acquire // - const scoped_lock lock(*get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->get_device_num())); + const scoped_lock lock( + *get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->get_device_num())); const tt_cxy_pair remote_transfer_ethernet_core = remote_transfer_ethernet_cores[mmio_capable_chip_logical].at(0); - read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_core, 
eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); - read_device_memory(erisc_resp_q_wptr.data(), remote_transfer_ethernet_core, eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, DATA_WORD_SIZE, read_tlb); - read_device_memory(erisc_resp_q_rptr.data(), remote_transfer_ethernet_core, eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes + eth_interface_params.remote_update_ptr_size_bytes, DATA_WORD_SIZE, read_tlb); + read_device_memory( + erisc_q_ptrs.data(), + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + eth_interface_params.remote_update_ptr_size_bytes * 2, + read_tlb); + read_device_memory( + erisc_resp_q_wptr.data(), + remote_transfer_ethernet_core, + eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + DATA_WORD_SIZE, + read_tlb); + read_device_memory( + erisc_resp_q_rptr.data(), + remote_transfer_ethernet_core, + eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes + + eth_interface_params.remote_update_ptr_size_bytes, + DATA_WORD_SIZE, + read_tlb); bool full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_ptrs[4]); erisc_q_rptr.resize(1); @@ -1713,25 +2172,34 @@ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_ while (offset < size_in_bytes) { while (full) { - read_device_memory(erisc_q_rptr.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes + eth_interface_params.remote_update_ptr_size_bytes, DATA_WORD_SIZE, read_tlb); - full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0],erisc_q_rptr[0]); + read_device_memory( + erisc_q_rptr.data(), + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base 
+ eth_interface_params.cmd_counters_size_bytes + + eth_interface_params.remote_update_ptr_size_bytes, + DATA_WORD_SIZE, + read_tlb); + full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_rptr[0]); } uint32_t req_wr_ptr = erisc_q_ptrs[0] & eth_interface_params.cmd_buf_size_mask; - if ((address + offset) & 0x1F) { // address not 32-byte aligned - block_size = DATA_WORD_SIZE; // 4 byte aligned block + if ((address + offset) & 0x1F) { // address not 32-byte aligned + block_size = DATA_WORD_SIZE; // 4 byte aligned block } else { block_size = offset + max_block_size > size_in_bytes ? size_in_bytes - offset : max_block_size; // Align up to 4 bytes. uint32_t alignment_mask = sizeof(uint32_t) - 1; block_size = (block_size + alignment_mask) & ~alignment_mask; - } - uint32_t req_flags = block_size > DATA_WORD_SIZE ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_rd_req) : eth_interface_params.cmd_rd_req; - uint32_t resp_flags = block_size > DATA_WORD_SIZE ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_rd_data) : eth_interface_params.cmd_rd_data; + uint32_t req_flags = block_size > DATA_WORD_SIZE + ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_rd_req) + : eth_interface_params.cmd_rd_req; + uint32_t resp_flags = block_size > DATA_WORD_SIZE + ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_rd_data) + : eth_interface_params.cmd_rd_data; uint32_t resp_rd_ptr = erisc_resp_q_rptr[0] & eth_interface_params.cmd_buf_size_mask; uint32_t host_dram_block_addr = host_address_params.eth_routing_buffers_start + resp_rd_ptr * max_block_size; - uint16_t host_dram_channel = 0; // This needs to be 0, since WH can only map ETH buffers to chan 0. + uint16_t host_dram_channel = 0; // This needs to be 0, since WH can only map ETH buffers to chan 0. 
if (use_dram && block_size > DATA_WORD_SIZE) { req_flags |= eth_interface_params.cmd_data_block_dram; @@ -1739,22 +2207,35 @@ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_ } // Send the read request - log_assert((req_flags == eth_interface_params.cmd_rd_req) || (((address + offset) & 0x1F) == 0), "Block mode offset must be 32-byte aligned."); // Block mode offset must be 32-byte aligned. - new_cmd->sys_addr = get_sys_addr(std::get<0>(target_chip), std::get<1>(target_chip), core.x, core.y, address + offset); - new_cmd->rack = get_sys_rack(std::get<2>(target_chip), std::get<3>(target_chip)); + log_assert( + (req_flags == eth_interface_params.cmd_rd_req) || (((address + offset) & 0x1F) == 0), + "Block mode offset must be 32-byte aligned."); // Block mode offset must be 32-byte aligned. + new_cmd->sys_addr = get_sys_addr(target_chip.x, target_chip.y, core.x, core.y, address + offset); + new_cmd->rack = get_sys_rack(target_chip.rack, target_chip.shelf); new_cmd->data = block_size; new_cmd->flags = req_flags; if (use_dram) { new_cmd->src_addr_tag = host_dram_block_addr; } - write_device_memory(erisc_command.data(), erisc_command.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_routing_cmd_queue_base + (sizeof(routing_cmd_t) * req_wr_ptr), write_tlb);; + write_device_memory( + erisc_command.data(), + erisc_command.size() * DATA_WORD_SIZE, + remote_transfer_ethernet_core, + eth_interface_params.request_routing_cmd_queue_base + (sizeof(routing_cmd_t) * req_wr_ptr), + write_tlb); + ; tt_driver_atomics::sfence(); erisc_q_ptrs[0] = (erisc_q_ptrs[0] + 1) & eth_interface_params.cmd_buf_ptr_mask; std::vector erisc_q_wptr; erisc_q_wptr.resize(1); erisc_q_wptr[0] = erisc_q_ptrs[0]; - write_device_memory(erisc_q_wptr.data(), erisc_q_wptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, write_tlb); + 
write_device_memory( + erisc_q_wptr.data(), + erisc_q_wptr.size() * DATA_WORD_SIZE, + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + write_tlb); tt_driver_atomics::sfence(); // If there is more data to read and this command will make the q full, set full to 1. // otherwise full stays false so that we do not poll the rd pointer in next iteration. @@ -1762,7 +2243,12 @@ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_ // to poll rd pointer in every iteration. if (is_non_mmio_cmd_q_full((erisc_q_ptrs[0]), erisc_q_rptr[0])) { - read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); + read_device_memory( + erisc_q_ptrs.data(), + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + eth_interface_params.remote_update_ptr_size_bytes * 2, + read_tlb); full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_ptrs[4]); erisc_q_rptr[0] = erisc_q_ptrs[4]; } @@ -1778,13 +2264,23 @@ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_ // So we have to wait for wrptr to advance, then wait for flags to be nonzero, then read data. 
do { - read_device_memory(erisc_resp_q_wptr.data(), remote_transfer_ethernet_core, eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, DATA_WORD_SIZE, read_tlb); + read_device_memory( + erisc_resp_q_wptr.data(), + remote_transfer_ethernet_core, + eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + DATA_WORD_SIZE, + read_tlb); } while (erisc_resp_q_rptr[0] == erisc_resp_q_wptr[0]); tt_driver_atomics::lfence(); uint32_t flags_offset = 12 + sizeof(routing_cmd_t) * resp_rd_ptr; std::vector erisc_resp_flags = std::vector(1); do { - read_device_memory(erisc_resp_flags.data(), remote_transfer_ethernet_core, eth_interface_params.response_routing_cmd_queue_base + flags_offset, DATA_WORD_SIZE, read_tlb); + read_device_memory( + erisc_resp_flags.data(), + remote_transfer_ethernet_core, + eth_interface_params.response_routing_cmd_queue_base + flags_offset, + DATA_WORD_SIZE, + read_tlb); } while (erisc_resp_flags[0] == 0); if (erisc_resp_flags[0] == resp_flags) { @@ -1792,27 +2288,40 @@ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_ uint32_t data_offset = 8 + sizeof(routing_cmd_t) * resp_rd_ptr; if (block_size == DATA_WORD_SIZE) { std::vector erisc_resp_data = std::vector(1); - read_device_memory(erisc_resp_data.data(), remote_transfer_ethernet_core, eth_interface_params.response_routing_cmd_queue_base + data_offset, DATA_WORD_SIZE, read_tlb); - if(size_in_bytes - offset < 4) { + read_device_memory( + erisc_resp_data.data(), + remote_transfer_ethernet_core, + eth_interface_params.response_routing_cmd_queue_base + data_offset, + DATA_WORD_SIZE, + read_tlb); + if (size_in_bytes - offset < 4) { // Handle misaligned (4 bytes) data at the end of the block. 
// Only read remaining bytes into the host buffer, instead of reading the full uint32_t std::memcpy((uint8_t*)mem_ptr + offset, erisc_resp_data.data(), size_in_bytes - offset); - } - else { - *((uint32_t*)mem_ptr + offset/DATA_WORD_SIZE) = erisc_resp_data[0]; + } else { + *((uint32_t*)mem_ptr + offset / DATA_WORD_SIZE) = erisc_resp_data[0]; } } else { // Read 4 byte aligned block from device/sysmem if (use_dram) { size_buffer_to_capacity(data_block, block_size); - read_from_sysmem(data_block.data(), host_dram_block_addr, host_dram_channel, block_size, mmio_capable_chip_logical); + read_from_sysmem( + data_block.data(), + host_dram_block_addr, + host_dram_channel, + block_size, + mmio_capable_chip_logical); } else { - uint32_t buf_address = eth_interface_params.eth_routing_data_buffer_addr + resp_rd_ptr * max_block_size; + uint32_t buf_address = + eth_interface_params.eth_routing_data_buffer_addr + resp_rd_ptr * max_block_size; size_buffer_to_capacity(data_block, block_size); - read_device_memory(data_block.data(), remote_transfer_ethernet_core, buf_address, block_size, read_tlb); + read_device_memory( + data_block.data(), remote_transfer_ethernet_core, buf_address, block_size, read_tlb); } // assert(mem_ptr.size() - (offset/DATA_WORD_SIZE) >= (block_size * DATA_WORD_SIZE)); - log_assert((data_block.size() * DATA_WORD_SIZE) >= block_size, "Incorrect data size read back from sysmem/device"); + log_assert( + (data_block.size() * DATA_WORD_SIZE) >= block_size, + "Incorrect data size read back from sysmem/device"); // Account for misalignment by skipping any padding bytes in the copied data_block memcpy((uint8_t*)mem_ptr + offset, data_block.data(), std::min(block_size, size_in_bytes - offset)); } @@ -1820,40 +2329,53 @@ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_ // Finally increment the rdptr for the response command q erisc_resp_q_rptr[0] = (erisc_resp_q_rptr[0] + 1) & eth_interface_params.cmd_buf_ptr_mask; - 
write_device_memory(erisc_resp_q_rptr.data(), erisc_resp_q_rptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.response_cmd_queue_base + sizeof(remote_update_ptr_t) + eth_interface_params.cmd_counters_size_bytes, write_tlb); + write_device_memory( + erisc_resp_q_rptr.data(), + erisc_resp_q_rptr.size() * DATA_WORD_SIZE, + remote_transfer_ethernet_core, + eth_interface_params.response_cmd_queue_base + sizeof(remote_update_ptr_t) + + eth_interface_params.cmd_counters_size_bytes, + write_tlb); tt_driver_atomics::sfence(); log_assert(erisc_resp_flags[0] == resp_flags, "Unexpected ERISC Response Flags."); offset += block_size; } - } void Cluster::wait_for_connected_non_mmio_flush(const chip_id_t chip_id) { - if(flush_non_mmio_per_chip[chip_id]) { + if (flush_non_mmio_per_chip[chip_id]) { log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO flush not supported in Blackhole"); std::string read_tlb = "LARGE_READ_TLB"; auto chips_with_mmio = this->get_target_mmio_device_ids(); if (chips_with_mmio.find(chip_id) == chips_with_mmio.end()) { - log_debug(LogSiliconDriver, "Chip {} is not an MMIO chip, skipping wait_for_connected_non_mmio_flush", chip_id); + log_debug( + LogSiliconDriver, "Chip {} is not an MMIO chip, skipping wait_for_connected_non_mmio_flush", chip_id); return; } if (arch_name == tt::ARCH::WORMHOLE_B0) { std::vector erisc_txn_counters = std::vector(2); - std::vector erisc_q_ptrs = std::vector(eth_interface_params.remote_update_ptr_size_bytes*2 / sizeof(uint32_t)); + std::vector erisc_q_ptrs = + std::vector(eth_interface_params.remote_update_ptr_size_bytes * 2 / sizeof(uint32_t)); - //wait for all queues to be empty. - for (tt_cxy_pair &cxy : remote_transfer_ethernet_cores.at(chip_id)) { + // wait for all queues to be empty. 
+ for (tt_cxy_pair& cxy : remote_transfer_ethernet_cores.at(chip_id)) { do { - read_device_memory(erisc_q_ptrs.data(), cxy, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); + read_device_memory( + erisc_q_ptrs.data(), + cxy, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + eth_interface_params.remote_update_ptr_size_bytes * 2, + read_tlb); } while (erisc_q_ptrs[0] != erisc_q_ptrs[4]); } - //wait for all write responses to come back. - for (tt_cxy_pair &cxy : remote_transfer_ethernet_cores.at(chip_id)) { + // wait for all write responses to come back. + for (tt_cxy_pair& cxy : remote_transfer_ethernet_cores.at(chip_id)) { do { - read_device_memory(erisc_txn_counters.data(), cxy, eth_interface_params.request_cmd_queue_base, 8, read_tlb); + read_device_memory( + erisc_txn_counters.data(), cxy, eth_interface_params.request_cmd_queue_base, 8, read_tlb); } while (erisc_txn_counters[0] != erisc_txn_counters[1]); } } @@ -1861,7 +2383,6 @@ void Cluster::wait_for_connected_non_mmio_flush(const chip_id_t chip_id) { } } - void Cluster::wait_for_non_mmio_flush(const chip_id_t chip_id) { log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO flush not supported in Blackhole"); std::string read_tlb = "LARGE_READ_TLB"; @@ -1882,39 +2403,48 @@ void Cluster::wait_for_non_mmio_flush() { } // Broadcast Functions -void Cluster::generate_tensix_broadcast_grids_for_grayskull(std::set>& broadcast_grids, std::set& rows_to_exclude, std::set& cols_to_exclude) { +void Cluster::generate_tensix_broadcast_grids_for_grayskull( + std::set>& broadcast_grids, + std::set& rows_to_exclude, + std::set& cols_to_exclude) { // If row 0 is not explicitly excluded, exclude it here since its non-tensix rows_to_exclude.insert(0); // If row 11 is excluded, we can close the SOC grid. If not, exclude row 12 to close grid. 
- if(rows_to_exclude.find(11) == rows_to_exclude.end()) { + if (rows_to_exclude.find(11) == rows_to_exclude.end()) { rows_to_exclude.insert(12); } // If col 0 is not explicitly excluded, exclude it here since its non-tensix cols_to_exclude.insert(0); // If col 12 is excluded, we can close the SOC grid. If not, exclude col 13 to close grid. - if(cols_to_exclude.find(12) == cols_to_exclude.end()) { + if (cols_to_exclude.find(12) == cols_to_exclude.end()) { cols_to_exclude.insert(13); } std::vector> bb_x_coords = {}; std::vector> bb_y_coords = {}; // Generate starting and ending x coordinates of each bounding box/grid - for(auto x_it = cols_to_exclude.begin(); x_it != cols_to_exclude.end(); x_it++) { - if(x_it == std::prev(cols_to_exclude.end(), 1)) continue; - if(cols_to_exclude.find(*(x_it) + 1) == cols_to_exclude.end() and cols_to_exclude.find(*(std::next(x_it, 1)) - 1) == cols_to_exclude.end()) { + for (auto x_it = cols_to_exclude.begin(); x_it != cols_to_exclude.end(); x_it++) { + if (x_it == std::prev(cols_to_exclude.end(), 1)) { + continue; + } + if (cols_to_exclude.find(*(x_it) + 1) == cols_to_exclude.end() and + cols_to_exclude.find(*(std::next(x_it, 1)) - 1) == cols_to_exclude.end()) { bb_x_coords.push_back({*(x_it) + 1, *(std::next(x_it, 1)) - 1}); } } - for(auto y_it = rows_to_exclude.begin(); y_it != rows_to_exclude.end(); y_it++) { - if(y_it == std::prev(rows_to_exclude.end(), 1)) continue; - if(rows_to_exclude.find((*y_it) + 1) == rows_to_exclude.end() and rows_to_exclude.find(*std::next(y_it, 1) - 1) == rows_to_exclude.end()) { + for (auto y_it = rows_to_exclude.begin(); y_it != rows_to_exclude.end(); y_it++) { + if (y_it == std::prev(rows_to_exclude.end(), 1)) { + continue; + } + if (rows_to_exclude.find((*y_it) + 1) == rows_to_exclude.end() and + rows_to_exclude.find(*std::next(y_it, 1) - 1) == rows_to_exclude.end()) { bb_y_coords.push_back({*(y_it) + 1, *(std::next(y_it, 1)) - 1}); } } // Assemble x and y coordinates into bounding box vertices - 
for(const auto& x_pair : bb_x_coords) { - for(const auto& y_pair : bb_y_coords) { + for (const auto& x_pair : bb_x_coords) { + for (const auto& y_pair : bb_y_coords) { tt_xy_pair top_left = tt_xy_pair(x_pair.first, y_pair.first); tt_xy_pair bot_right = tt_xy_pair(x_pair.second, y_pair.second); broadcast_grids.insert({top_left, bot_right}); @@ -1922,81 +2452,94 @@ void Cluster::generate_tensix_broadcast_grids_for_grayskull(std::set>>& Cluster::get_ethernet_broadcast_headers(const std::set& chips_to_exclude) { +std::unordered_map>>& Cluster::get_ethernet_broadcast_headers( + const std::set& chips_to_exclude) { // Generate headers for Ethernet Broadcast (WH) only. Each header corresponds to a unique broadcast "grid". - if(bcast_header_cache.find(chips_to_exclude) == bcast_header_cache.end()) { + if (bcast_header_cache.find(chips_to_exclude) == bcast_header_cache.end()) { bcast_header_cache[chips_to_exclude] = {}; - std::unordered_map>> broadcast_mask_for_target_chips_per_group = {}; + std::unordered_map>> + broadcast_mask_for_target_chips_per_group = {}; std::map, std::tuple>> broadcast_header_union_per_group = {}; chip_id_t first_mmio_chip = *(get_target_mmio_device_ids().begin()); - for(const auto& chip : target_devices_in_cluster) { - if(chips_to_exclude.find(chip) == chips_to_exclude.end()) { + for (const auto& chip : target_devices_in_cluster) { + if (chips_to_exclude.find(chip) == chips_to_exclude.end()) { // Get shelf local physical chip id included in broadcast - chip_id_t physical_chip_id = ndesc -> get_shelf_local_physical_chip_coords(chip); - eth_coord_t eth_coords = ndesc -> get_chip_locations().at(chip); + chip_id_t physical_chip_id = ndesc->get_shelf_local_physical_chip_coords(chip); + eth_coord_t eth_coords = ndesc->get_chip_locations().at(chip); // Rack word to be set in header - uint32_t rack_word = std::get<2>(eth_coords) >> 2; + uint32_t rack_word = eth_coords.rack >> 2; // Rack byte to be set in header - uint32_t rack_byte = std::get<2>(eth_coords) 
% 4; + uint32_t rack_byte = eth_coords.rack % 4; // 1st level grouping: Group broadcasts based on the MMIO chip they must go through - // Nebula + Galaxy Topology assumption: Disjoint sets can only be present in the first shelf, with each set connected to host through its closest MMIO chip - // For the first shelf, pass broadcasts to specific chips through their closest MMIO chip - // All other shelves are fully connected galaxy grids. These are connected to all MMIO devices. Use any (or the first) MMIO device in the list. + // Nebula + Galaxy Topology assumption: Disjoint sets can only be present in the first shelf, with each + // set connected to host through its closest MMIO chip For the first shelf, pass broadcasts to specific + // chips through their closest MMIO chip All other shelves are fully connected galaxy grids. These are + // connected to all MMIO devices. Use any (or the first) MMIO device in the list. chip_id_t closest_mmio_chip = 0; - if (std::get<2>(eth_coords) == 0 && std::get<3>(eth_coords) == 0) { - // Shelf 0 + Rack 0: Either an MMIO chip or a remote chip potentially connected to host through its own MMIO counterpart. - closest_mmio_chip = ndesc -> get_closest_mmio_capable_chip(chip); - } - else { - // All other shelves: Group these under the same/first MMIO chip, since all MMIO chips are connected. + if (eth_coords.rack == 0 && eth_coords.shelf == 0) { + // Shelf 0 + Rack 0: Either an MMIO chip or a remote chip potentially connected to host through its + // own MMIO counterpart. + closest_mmio_chip = ndesc->get_closest_mmio_capable_chip(chip); + } else { + // All other shelves: Group these under the same/first MMIO chip, since all MMIO chips are + // connected. 
closest_mmio_chip = first_mmio_chip; } - if(broadcast_mask_for_target_chips_per_group.find(closest_mmio_chip) == broadcast_mask_for_target_chips_per_group.end()) { + if (broadcast_mask_for_target_chips_per_group.find(closest_mmio_chip) == + broadcast_mask_for_target_chips_per_group.end()) { broadcast_mask_for_target_chips_per_group.insert({closest_mmio_chip, {}}); } - // For each target physical chip id (local to a shelf), generate headers based on all racks and shelves that contain this physical id. - if(broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).find(physical_chip_id) == broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).end()) { + // For each target physical chip id (local to a shelf), generate headers based on all racks and shelves + // that contain this physical id. + if (broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).find(physical_chip_id) == + broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).end()) { // Target seen for the first time. 
std::vector broadcast_mask(8, 0); - broadcast_mask.at(rack_word) |= (1 << std::get<3>(eth_coords)) << rack_byte; + broadcast_mask.at(rack_word) |= (1 << eth_coords.shelf) << rack_byte; broadcast_mask.at(3) |= 1 << physical_chip_id; - broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).insert({physical_chip_id, broadcast_mask}); + broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip) + .insert({physical_chip_id, broadcast_mask}); - } - else { + } else { // Target was seen before -> include curr rack and shelf in header - broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).at(physical_chip_id).at(rack_word) |= static_cast(1 << std::get<3>(eth_coords)) << rack_byte; + broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip) + .at(physical_chip_id) + .at(rack_word) |= static_cast(1 << eth_coords.shelf) << rack_byte; } } } - // 2nd level grouping: For each MMIO group, further group the chips based on their rack and shelf headers. The number of groups after this step represent the final set of broadcast grids. - for(auto& mmio_group : broadcast_mask_for_target_chips_per_group) { - for(auto& chip : mmio_group.second) { + // 2nd level grouping: For each MMIO group, further group the chips based on their rack and shelf headers. The + // number of groups after this step represent the final set of broadcast grids. 
+ for (auto& mmio_group : broadcast_mask_for_target_chips_per_group) { + for (auto& chip : mmio_group.second) { // Generate a hash for this MMIO Chip + Rack + Shelf group - std::vector header_hash = {mmio_group.first, chip.second.at(0), chip.second.at(1), chip.second.at(2)}; - if(broadcast_header_union_per_group.find(header_hash) == broadcast_header_union_per_group.end()) { - broadcast_header_union_per_group.insert({header_hash, std::make_tuple(mmio_group.first, chip.second)}); - } - else { + std::vector header_hash = { + mmio_group.first, chip.second.at(0), chip.second.at(1), chip.second.at(2)}; + if (broadcast_header_union_per_group.find(header_hash) == broadcast_header_union_per_group.end()) { + broadcast_header_union_per_group.insert( + {header_hash, std::make_tuple(mmio_group.first, chip.second)}); + } else { // If group found, update chip header entry std::get<1>(broadcast_header_union_per_group.at(header_hash)).at(3) |= chip.second.at(3); } } } // Get all broadcast headers per MMIO group - for(const auto& header : broadcast_header_union_per_group) { + for (const auto& header : broadcast_header_union_per_group) { chip_id_t mmio_chip = std::get<0>(header.second); - if(bcast_header_cache[chips_to_exclude].find(mmio_chip) == bcast_header_cache[chips_to_exclude].end()) { + if (bcast_header_cache[chips_to_exclude].find(mmio_chip) == bcast_header_cache[chips_to_exclude].end()) { bcast_header_cache[chips_to_exclude].insert({mmio_chip, {}}); } bcast_header_cache[chips_to_exclude].at(mmio_chip).push_back(std::get<1>(header.second)); } // Invert headers (FW convention) - for(auto& bcast_group : bcast_header_cache[chips_to_exclude]) { - for(auto& header : bcast_group.second) { + for (auto& bcast_group : bcast_header_cache[chips_to_exclude]) { + for (auto& header : bcast_group.second) { int header_idx = 0; - for(auto& header_entry : header) { - if(header_idx == 4) break; + for (auto& header_entry : header) { + if (header_idx == 4) { + break; + } header_entry = 
~header_entry; header_idx++; } @@ -2006,14 +2549,23 @@ std::unordered_map>>& Cluster::get_ether return bcast_header_cache[chips_to_exclude]; } -void Cluster::pcie_broadcast_write(chip_id_t chip, const void* mem_ptr, uint32_t size_in_bytes, std::uint32_t addr, const tt_xy_pair& start, const tt_xy_pair& end, const std::string& fallback_tlb) { - // Use the specified TLB to broadcast data to all cores included in the [start, end] grid -> GS Only. Use Ethernet Broadcast for WH. - PCIDevice *pci_device = get_pci_device(chip); +void Cluster::pcie_broadcast_write( + chip_id_t chip, + const void* mem_ptr, + uint32_t size_in_bytes, + std::uint32_t addr, + const tt_xy_pair& start, + const tt_xy_pair& end, + const std::string& fallback_tlb) { + // Use the specified TLB to broadcast data to all cores included in the [start, end] grid -> GS Only. Use Ethernet + // Broadcast for WH. + PCIDevice* pci_device = get_pci_device(chip); const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); const uint8_t* buffer_addr = static_cast(mem_ptr); const scoped_lock lock(*get_mutex(fallback_tlb, pci_device->get_device_num())); - while(size_in_bytes > 0) { - auto [mapped_address, tlb_size] = pci_device->set_dynamic_tlb_broadcast(tlb_index, addr, harvested_coord_translation, start, end, dynamic_tlb_ordering_modes.at(fallback_tlb)); + while (size_in_bytes > 0) { + auto [mapped_address, tlb_size] = pci_device->set_dynamic_tlb_broadcast( + tlb_index, addr, harvested_coord_translation, start, end, dynamic_tlb_ordering_modes.at(fallback_tlb)); uint64_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size); pci_device->write_block(mapped_address, transfer_size, buffer_addr); @@ -2023,155 +2575,235 @@ void Cluster::pcie_broadcast_write(chip_id_t chip, const void* mem_ptr, uint32_t } } -inline bool tensix_or_eth_in_broadcast(const std::set& cols_to_exclude, const tt::umd::architecture_implementation* architecture_implementation) { +inline bool tensix_or_eth_in_broadcast( + const std::set& 
cols_to_exclude, + const tt::umd::architecture_implementation* architecture_implementation) { bool found_tensix_or_eth = false; - for(const auto& col : architecture_implementation->get_t6_x_locations()) { + for (const auto& col : architecture_implementation->get_t6_x_locations()) { found_tensix_or_eth |= (cols_to_exclude.find(col) == cols_to_exclude.end()); } return found_tensix_or_eth; } -inline bool valid_tensix_broadcast_grid(const std::set& rows_to_exclude, const std::set& cols_to_exclude, const tt::umd::architecture_implementation* architecture_implementation) { +inline bool valid_tensix_broadcast_grid( + const std::set& rows_to_exclude, + const std::set& cols_to_exclude, + const tt::umd::architecture_implementation* architecture_implementation) { bool t6_bcast_rows_complete = true; bool t6_bcast_rows_empty = true; - - for(const auto& row : architecture_implementation->get_t6_y_locations()) { + + for (const auto& row : architecture_implementation->get_t6_y_locations()) { t6_bcast_rows_complete &= (rows_to_exclude.find(row) == rows_to_exclude.end()); t6_bcast_rows_empty &= (rows_to_exclude.find(row) != rows_to_exclude.end()); } return t6_bcast_rows_complete || t6_bcast_rows_empty; } - -void Cluster::ethernet_broadcast_write(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, - const std::set& chips_to_exclude, const std::set& rows_to_exclude, - std::set& cols_to_exclude, const std::string& fallback_tlb, bool use_virtual_coords) { - if(use_ethernet_broadcast) { +void Cluster::ethernet_broadcast_write( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + const std::set& rows_to_exclude, + std::set& cols_to_exclude, + const std::string& fallback_tlb, + bool use_virtual_coords) { + if (use_ethernet_broadcast) { // Broadcast through ERISC core supported - std::unordered_map>>& broadcast_headers = get_ethernet_broadcast_headers(chips_to_exclude); - // Apply row and column exclusion mask explictly. 
Placing this here if we want to cache the higher level broadcast headers on future/ + std::unordered_map>>& broadcast_headers = + get_ethernet_broadcast_headers(chips_to_exclude); + // Apply row and column exclusion mask explictly. Placing this here if we want to cache the higher level + // broadcast headers on future/ std::uint32_t row_exclusion_mask = 0; std::uint32_t col_exclusion_mask = 0; - for(const auto& row : rows_to_exclude) { + for (const auto& row : rows_to_exclude) { row_exclusion_mask |= 1 << row; } - for(const auto& col : cols_to_exclude) { + for (const auto& col : cols_to_exclude) { col_exclusion_mask |= 1 << (16 + col); } // Write broadcast block to device. - for(auto& mmio_group : broadcast_headers) { - for(auto& header : mmio_group.second) { - header.at(4) = use_virtual_coords * 0x8000; // Reset row/col exclusion masks + for (auto& mmio_group : broadcast_headers) { + for (auto& header : mmio_group.second) { + header.at(4) = use_virtual_coords * 0x8000; // Reset row/col exclusion masks header.at(4) |= row_exclusion_mask; header.at(4) |= col_exclusion_mask; // Write Target: x-y endpoint is a don't care. Initialize to tt_xy_pair(1, 1) - write_to_non_mmio_device(mem_ptr, size_in_bytes, tt_cxy_pair(mmio_group.first, tt_xy_pair(1, 1)), address, true, header); + write_to_non_mmio_device( + mem_ptr, size_in_bytes, tt_cxy_pair(mmio_group.first, tt_xy_pair(1, 1)), address, true, header); } } - } - else { + } else { // Broadcast not supported. 
Implement this at the software level as a for loop std::vector cores_to_write = {}; - for(const auto& chip : target_devices_in_cluster) { - if(chips_to_exclude.find(chip) != chips_to_exclude.end()) continue; - for(const auto& core : get_soc_descriptor(chip).cores) { - if(cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and rows_to_exclude.find(core.first.y) == rows_to_exclude.end() and core.second.type != CoreType::HARVESTED) { - write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(chip, core.first.x, core.first.y), address, fallback_tlb); + for (const auto& chip : target_devices_in_cluster) { + if (chips_to_exclude.find(chip) != chips_to_exclude.end()) { + continue; + } + for (const auto& core : get_soc_descriptor(chip).cores) { + if (cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and + rows_to_exclude.find(core.first.y) == rows_to_exclude.end() and + core.second.type != CoreType::HARVESTED) { + write_to_device( + mem_ptr, size_in_bytes, tt_cxy_pair(chip, core.first.x, core.first.y), address, fallback_tlb); } } } } } -void Cluster::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, - const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& cols_to_exclude, const std::string& fallback_tlb) { +void Cluster::broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& cols_to_exclude, + const std::string& fallback_tlb) { if (arch_name == tt::ARCH::GRAYSKULL) { // Device FW disables broadcasts to all non tensix cores. 
std::vector dram_cores_to_write = {}; std::vector dram_rows = {0, 6}; std::vector dram_cols = {1, 4, 7, 10}; - for(const auto& row : dram_rows) { - for(const auto& col : dram_cols) { - if(rows_to_exclude.find(row) == rows_to_exclude.end() and cols_to_exclude.find(col) == cols_to_exclude.end()) { + for (const auto& row : dram_rows) { + for (const auto& col : dram_cols) { + if (rows_to_exclude.find(row) == rows_to_exclude.end() and + cols_to_exclude.find(col) == cols_to_exclude.end()) { dram_cores_to_write.push_back(tt_xy_pair(col, row)); } } } - + std::set> broadcast_grids = {}; generate_tensix_broadcast_grids_for_grayskull(broadcast_grids, rows_to_exclude, cols_to_exclude); - for(const auto& chip : target_devices_in_cluster) { - if(chips_to_exclude.find(chip) != chips_to_exclude.end()) continue; - for(const auto& dram : dram_cores_to_write) { + for (const auto& chip : target_devices_in_cluster) { + if (chips_to_exclude.find(chip) != chips_to_exclude.end()) { + continue; + } + for (const auto& dram : dram_cores_to_write) { write_device_memory(mem_ptr, size_in_bytes, tt_cxy_pair(chip, dram), address, fallback_tlb); } - for(const auto& grid : broadcast_grids) { + for (const auto& grid : broadcast_grids) { pcie_broadcast_write(chip, mem_ptr, size_in_bytes, address, grid.first, grid.second, fallback_tlb); } - } - } - else if (arch_name == tt::ARCH::BLACKHOLE) { + } + } else if (arch_name == tt::ARCH::BLACKHOLE) { auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name); - if(cols_to_exclude.find(0) == cols_to_exclude.end() or cols_to_exclude.find(9) == cols_to_exclude.end()) { - log_assert(!tensix_or_eth_in_broadcast(cols_to_exclude, architecture_implementation.get()), "Cannot broadcast to tensix/ethernet and DRAM simultaneously on Blackhole."); - if(cols_to_exclude.find(0) == cols_to_exclude.end()) { + if (cols_to_exclude.find(0) == cols_to_exclude.end() or cols_to_exclude.find(9) == cols_to_exclude.end()) { + log_assert( + 
!tensix_or_eth_in_broadcast(cols_to_exclude, architecture_implementation.get()), + "Cannot broadcast to tensix/ethernet and DRAM simultaneously on Blackhole."); + if (cols_to_exclude.find(0) == cols_to_exclude.end()) { // When broadcast includes column zero do not exclude anything std::set unsafe_rows = {}; std::set cols_to_exclude_for_col_0_bcast = cols_to_exclude; std::set rows_to_exclude_for_col_0_bcast = rows_to_exclude; cols_to_exclude_for_col_0_bcast.insert(9); rows_to_exclude_for_col_0_bcast.insert(unsafe_rows.begin(), unsafe_rows.end()); - ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude, - rows_to_exclude_for_col_0_bcast, cols_to_exclude_for_col_0_bcast, fallback_tlb, false); + ethernet_broadcast_write( + mem_ptr, + size_in_bytes, + address, + chips_to_exclude, + rows_to_exclude_for_col_0_bcast, + cols_to_exclude_for_col_0_bcast, + fallback_tlb, + false); } - if(cols_to_exclude.find(9) == cols_to_exclude.end()) { + if (cols_to_exclude.find(9) == cols_to_exclude.end()) { std::set cols_to_exclude_for_col_9_bcast = cols_to_exclude; cols_to_exclude_for_col_9_bcast.insert(0); - ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude, - rows_to_exclude, cols_to_exclude_for_col_9_bcast, fallback_tlb, false); + ethernet_broadcast_write( + mem_ptr, + size_in_bytes, + address, + chips_to_exclude, + rows_to_exclude, + cols_to_exclude_for_col_9_bcast, + fallback_tlb, + false); } + } else { + log_assert( + use_virtual_coords_for_eth_broadcast or + valid_tensix_broadcast_grid(rows_to_exclude, cols_to_exclude, architecture_implementation.get()), + "Must broadcast to all tensix rows when ERISC FW is < 6.8.0."); + ethernet_broadcast_write( + mem_ptr, + size_in_bytes, + address, + chips_to_exclude, + rows_to_exclude, + cols_to_exclude, + fallback_tlb, + use_virtual_coords_for_eth_broadcast); } - else { - log_assert(use_virtual_coords_for_eth_broadcast or valid_tensix_broadcast_grid(rows_to_exclude, cols_to_exclude, 
architecture_implementation.get()), - "Must broadcast to all tensix rows when ERISC FW is < 6.8.0."); - ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude, - rows_to_exclude, cols_to_exclude, fallback_tlb, use_virtual_coords_for_eth_broadcast); - } - } - else { + } else { auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name); - if(cols_to_exclude.find(0) == cols_to_exclude.end() or cols_to_exclude.find(5) == cols_to_exclude.end()) { - log_assert(!tensix_or_eth_in_broadcast(cols_to_exclude, architecture_implementation.get()), "Cannot broadcast to tensix/ethernet and DRAM simultaneously on Wormhole."); - if(cols_to_exclude.find(0) == cols_to_exclude.end()) { - // When broadcast includes column zero Exclude PCIe, ARC and router cores from broadcast explictly, since writing to these is unsafe - // ERISC FW does not exclude these. + if (cols_to_exclude.find(0) == cols_to_exclude.end() or cols_to_exclude.find(5) == cols_to_exclude.end()) { + log_assert( + !tensix_or_eth_in_broadcast(cols_to_exclude, architecture_implementation.get()), + "Cannot broadcast to tensix/ethernet and DRAM simultaneously on Wormhole."); + if (cols_to_exclude.find(0) == cols_to_exclude.end()) { + // When broadcast includes column zero Exclude PCIe, ARC and router cores from broadcast explictly, + // since writing to these is unsafe ERISC FW does not exclude these. 
std::set unsafe_rows = {2, 3, 4, 8, 9, 10}; std::set cols_to_exclude_for_col_0_bcast = cols_to_exclude; std::set rows_to_exclude_for_col_0_bcast = rows_to_exclude; cols_to_exclude_for_col_0_bcast.insert(5); rows_to_exclude_for_col_0_bcast.insert(unsafe_rows.begin(), unsafe_rows.end()); - ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude, - rows_to_exclude_for_col_0_bcast, cols_to_exclude_for_col_0_bcast, fallback_tlb, false); + ethernet_broadcast_write( + mem_ptr, + size_in_bytes, + address, + chips_to_exclude, + rows_to_exclude_for_col_0_bcast, + cols_to_exclude_for_col_0_bcast, + fallback_tlb, + false); } - if(cols_to_exclude.find(5) == cols_to_exclude.end()) { + if (cols_to_exclude.find(5) == cols_to_exclude.end()) { std::set cols_to_exclude_for_col_5_bcast = cols_to_exclude; cols_to_exclude_for_col_5_bcast.insert(0); - ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude, - rows_to_exclude, cols_to_exclude_for_col_5_bcast, fallback_tlb, false); + ethernet_broadcast_write( + mem_ptr, + size_in_bytes, + address, + chips_to_exclude, + rows_to_exclude, + cols_to_exclude_for_col_5_bcast, + fallback_tlb, + false); } - } - else { - log_assert(use_virtual_coords_for_eth_broadcast or valid_tensix_broadcast_grid(rows_to_exclude, cols_to_exclude, architecture_implementation.get()), - "Must broadcast to all tensix rows when ERISC FW is < 6.8.0."); - ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude, - rows_to_exclude, cols_to_exclude, fallback_tlb, use_virtual_coords_for_eth_broadcast); - } - } -} - -int Cluster::remote_arc_msg(int chip, uint32_t msg_code, bool wait_for_done, uint32_t arg0, uint32_t arg1, int timeout, uint32_t *return_3, uint32_t *return_4) { + } else { + log_assert( + use_virtual_coords_for_eth_broadcast or + valid_tensix_broadcast_grid(rows_to_exclude, cols_to_exclude, architecture_implementation.get()), + "Must broadcast to all tensix rows when ERISC FW is < 6.8.0."); + 
ethernet_broadcast_write( + mem_ptr, + size_in_bytes, + address, + chips_to_exclude, + rows_to_exclude, + cols_to_exclude, + fallback_tlb, + use_virtual_coords_for_eth_broadcast); + } + } +} + +int Cluster::remote_arc_msg( + int chip, + uint32_t msg_code, + bool wait_for_done, + uint32_t arg0, + uint32_t arg1, + int timeout, + uint32_t* return_3, + uint32_t* return_4) { constexpr uint64_t ARC_RESET_SCRATCH_ADDR = 0x880030060; constexpr uint64_t ARC_RESET_MISC_CNTL_ADDR = 0x880030100; @@ -2180,18 +2812,14 @@ int Cluster::remote_arc_msg(int chip, uint32_t msg_code, bool wait_for_done, uin if ((msg_code & 0xff00) != 0xaa00) { log_error("Malformed message. msg_code is 0x{:x} but should be 0xaa..", msg_code); } - log_assert (arg0 <= 0xffff and arg1 <= 0xffff, "Only 16 bits allowed in arc_msg args"); // Only 16 bits are allowed + log_assert(arg0 <= 0xffff and arg1 <= 0xffff, "Only 16 bits allowed in arc_msg args"); // Only 16 bits are allowed - uint32_t fw_arg = arg0 | (arg1<<16); + uint32_t fw_arg = arg0 | (arg1 << 16); int exit_code = 0; - { - write_to_non_mmio_device(&fw_arg, sizeof(fw_arg), core, ARC_RESET_SCRATCH_ADDR + 3 * 4); - } + { write_to_non_mmio_device(&fw_arg, sizeof(fw_arg), core, ARC_RESET_SCRATCH_ADDR + 3 * 4); } - { - write_to_non_mmio_device(&msg_code, sizeof(fw_arg), core, ARC_RESET_SCRATCH_ADDR + 5 * 4); - } + { write_to_non_mmio_device(&msg_code, sizeof(fw_arg), core, ARC_RESET_SCRATCH_ADDR + 5 * 4); } wait_for_non_mmio_flush(); uint32_t misc = 0; @@ -2213,7 +2841,11 @@ int Cluster::remote_arc_msg(int chip, uint32_t msg_code, bool wait_for_done, uin if (std::chrono::system_clock::now() - start > timeout_seconds) { std::stringstream ss; ss << std::hex << msg_code; - throw std::runtime_error(fmt::format("Timed out after waiting {} seconds for device {} ARC to respond to message 0x{}", timeout, chip, ss.str())); + throw std::runtime_error(fmt::format( + "Timed out after waiting {} seconds for device {} ARC to respond to message 0x{}", + timeout, + 
chip, + ss.str())); } uint32_t status = 0; @@ -2239,7 +2871,8 @@ int Cluster::remote_arc_msg(int chip, uint32_t msg_code, bool wait_for_done, uin return exit_code; } -void Cluster::write_to_sysmem(const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id) { +void Cluster::write_to_sysmem( + const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id) { write_buffer(mem_ptr, size, addr, channel, src_device_id); } @@ -2247,58 +2880,86 @@ void Cluster::read_from_sysmem(void* mem_ptr, uint64_t addr, uint16_t channel, u read_buffer(mem_ptr, addr, channel, size, src_device_id); } -void Cluster::set_membar_flag(const chip_id_t chip, const std::unordered_set& cores, const uint32_t barrier_value, const uint32_t barrier_addr, const std::string& fallback_tlb) { - tt_driver_atomics::sfence(); // Ensure that writes before this do not get reordered +void Cluster::set_membar_flag( + const chip_id_t chip, + const std::unordered_set& cores, + const uint32_t barrier_value, + const uint32_t barrier_addr, + const std::string& fallback_tlb) { + tt_driver_atomics::sfence(); // Ensure that writes before this do not get reordered std::unordered_set cores_synced = {}; std::vector barrier_val_vec = {barrier_value}; for (const auto& core : cores) { - write_to_device(barrier_val_vec.data(), barrier_val_vec.size() * sizeof(uint32_t), tt_cxy_pair(chip, core), barrier_addr, fallback_tlb); - } - tt_driver_atomics::sfence(); // Ensure that all writes in the Host WC buffer are flushed + write_to_device( + barrier_val_vec.data(), + barrier_val_vec.size() * sizeof(uint32_t), + tt_cxy_pair(chip, core), + barrier_addr, + fallback_tlb); + } + tt_driver_atomics::sfence(); // Ensure that all writes in the Host WC buffer are flushed while (cores_synced.size() != cores.size()) { - for(const auto& core : cores) { + for (const auto& core : cores) { if (cores_synced.find(core) == cores_synced.end()) { uint32_t readback_val; - 
read_from_device(&readback_val, tt_cxy_pair(chip, core), barrier_addr, sizeof(std::uint32_t), fallback_tlb); + read_from_device( + &readback_val, tt_cxy_pair(chip, core), barrier_addr, sizeof(std::uint32_t), fallback_tlb); if (readback_val == barrier_value) { cores_synced.insert(core); - } - else { - log_trace(LogSiliconDriver, "Waiting for core {} to recieve mem bar flag {} in function", core.str(), barrier_value); + } else { + log_trace( + LogSiliconDriver, + "Waiting for core {} to recieve mem bar flag {} in function", + core.str(), + barrier_value); } } } } // Ensure that reads or writes after this do not get reordered. // Reordering can cause races where data gets transferred before the barrier has returned - tt_driver_atomics::mfence(); + tt_driver_atomics::mfence(); } -void Cluster::insert_host_to_device_barrier(const chip_id_t chip, const std::unordered_set& cores, const uint32_t barrier_addr, const std::string& fallback_tlb) { +void Cluster::insert_host_to_device_barrier( + const chip_id_t chip, + const std::unordered_set& cores, + const uint32_t barrier_addr, + const std::string& fallback_tlb) { // Ensure that this memory barrier is atomic across processes/threads - const scoped_lock lock(*get_mutex(MEM_BARRIER_MUTEX_NAME, this->get_pci_device(chip)->get_device_num())); + const scoped_lock lock( + *get_mutex(MEM_BARRIER_MUTEX_NAME, this->get_pci_device(chip)->get_device_num())); set_membar_flag(chip, cores, tt_MemBarFlag::SET, barrier_addr, fallback_tlb); set_membar_flag(chip, cores, tt_MemBarFlag::RESET, barrier_addr, fallback_tlb); } void Cluster::init_membars() { - for(const auto& chip : target_devices_in_cluster) { - if (ndesc -> is_chip_mmio_capable(chip)) { - set_membar_flag(chip, workers_per_chip.at(chip), tt_MemBarFlag::RESET, l1_address_params.tensix_l1_barrier_base, "LARGE_WRITE_TLB"); - set_membar_flag(chip, eth_cores, tt_MemBarFlag::RESET, l1_address_params.eth_l1_barrier_base, "LARGE_WRITE_TLB"); - set_membar_flag(chip, dram_cores, 
tt_MemBarFlag::RESET, dram_address_params.DRAM_BARRIER_BASE, "LARGE_WRITE_TLB"); + for (const auto& chip : target_devices_in_cluster) { + if (ndesc->is_chip_mmio_capable(chip)) { + set_membar_flag( + chip, + workers_per_chip.at(chip), + tt_MemBarFlag::RESET, + l1_address_params.tensix_l1_barrier_base, + "LARGE_WRITE_TLB"); + set_membar_flag( + chip, eth_cores, tt_MemBarFlag::RESET, l1_address_params.eth_l1_barrier_base, "LARGE_WRITE_TLB"); + set_membar_flag( + chip, dram_cores, tt_MemBarFlag::RESET, dram_address_params.DRAM_BARRIER_BASE, "LARGE_WRITE_TLB"); } } } -void Cluster::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { - if (ndesc -> is_chip_mmio_capable(chip)) { + +void Cluster::l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { + if (ndesc->is_chip_mmio_capable(chip)) { const auto& all_workers = workers_per_chip.at(chip); const auto& all_eth = eth_cores; if (cores.size()) { // Insert barrier on specific cores with L1 std::unordered_set workers_to_sync = {}; std::unordered_set eth_to_sync = {}; - + for (const auto& core : cores) { if (all_workers.find(core) != all_workers.end()) { workers_to_sync.insert(core); @@ -2308,59 +2969,60 @@ void Cluster::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, c log_fatal("Can only insert an L1 Memory barrier on Tensix or Ethernet cores."); } } - insert_host_to_device_barrier(chip, workers_to_sync, l1_address_params.tensix_l1_barrier_base, fallback_tlb); + insert_host_to_device_barrier( + chip, workers_to_sync, l1_address_params.tensix_l1_barrier_base, fallback_tlb); insert_host_to_device_barrier(chip, eth_to_sync, l1_address_params.eth_l1_barrier_base, fallback_tlb); } else { // Insert barrier on all cores with L1 insert_host_to_device_barrier(chip, all_workers, l1_address_params.tensix_l1_barrier_base, fallback_tlb); insert_host_to_device_barrier(chip, all_eth, l1_address_params.eth_l1_barrier_base, 
fallback_tlb); } - } - else { + } else { wait_for_non_mmio_flush(); } } -void Cluster::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { - if (ndesc -> is_chip_mmio_capable(chip)) { +void Cluster::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { + if (ndesc->is_chip_mmio_capable(chip)) { if (cores.size()) { - for(const auto& core : cores) { - log_assert(dram_cores.find(core) != dram_cores.end(), "Can only insert a DRAM Memory barrier on DRAM cores."); + for (const auto& core : cores) { + log_assert( + dram_cores.find(core) != dram_cores.end(), "Can only insert a DRAM Memory barrier on DRAM cores."); } insert_host_to_device_barrier(chip, cores, dram_address_params.DRAM_BARRIER_BASE, fallback_tlb); - } - else { + } else { // Insert Barrier on all DRAM Cores insert_host_to_device_barrier(chip, dram_cores, dram_address_params.DRAM_BARRIER_BASE, fallback_tlb); } - } - else { + } else { wait_for_non_mmio_flush(); } } -void Cluster::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) { - if (ndesc -> is_chip_mmio_capable(chip)) { +void Cluster::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) { + if (ndesc->is_chip_mmio_capable(chip)) { if (channels.size()) { std::unordered_set dram_cores_to_sync = {}; - for(const auto& chan : channels) { + for (const auto& chan : channels) { dram_cores_to_sync.insert(get_soc_descriptor(chip).get_core_for_dram_channel(chan, 0)); } - insert_host_to_device_barrier(chip, dram_cores_to_sync, dram_address_params.DRAM_BARRIER_BASE, fallback_tlb); - } - else { + insert_host_to_device_barrier( + chip, dram_cores_to_sync, dram_address_params.DRAM_BARRIER_BASE, fallback_tlb); + } else { // Insert Barrier on all DRAM Cores insert_host_to_device_barrier(chip, dram_cores, dram_address_params.DRAM_BARRIER_BASE, fallback_tlb); } - } - else { 
+ } else { wait_for_non_mmio_flush(); } } -void Cluster::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) { - bool target_is_mmio_capable = ndesc -> is_chip_mmio_capable(core.chip); - if(target_is_mmio_capable) { +void Cluster::write_to_device( + const void* mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) { + bool target_is_mmio_capable = ndesc->is_chip_mmio_capable(core.chip); + if (target_is_mmio_capable) { if (fallback_tlb == "REG_TLB") { write_mmio_device_register(mem_ptr, core, addr, size, fallback_tlb); } else { @@ -2368,100 +3030,118 @@ void Cluster::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair co } } else { log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO targets not supported in Blackhole"); - log_assert((get_soc_descriptor(core.chip).ethernet_cores).size() > 0 && get_number_of_chips_in_cluster() > 1, "Cannot issue ethernet writes to a single chip cluster!"); + log_assert( + (get_soc_descriptor(core.chip).ethernet_cores).size() > 0 && get_number_of_chips_in_cluster() > 1, + "Cannot issue ethernet writes to a single chip cluster!"); write_to_non_mmio_device(mem_ptr, size, core, addr); } } -void Cluster::read_mmio_device_register(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { - PCIDevice *pci_device = get_pci_device(core.chip); +void Cluster::read_mmio_device_register( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { + PCIDevice* pci_device = get_pci_device(core.chip); const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); const scoped_lock lock(*get_mutex(fallback_tlb, pci_device->get_device_num())); log_debug(LogSiliconDriver, " dynamic tlb_index: {}", tlb_index); - auto [mapped_address, tlb_size] = pci_device->set_dynamic_tlb(tlb_index, core, addr, harvested_coord_translation, TLB_DATA::Strict); - // Align block to 
4bytes if needed. + auto [mapped_address, tlb_size] = + pci_device->set_dynamic_tlb(tlb_index, core, addr, harvested_coord_translation, TLB_DATA::Strict); + // Align block to 4bytes if needed. auto aligned_buf = tt_4_byte_aligned_buffer(mem_ptr, size); pci_device->read_regs(mapped_address, aligned_buf.block_size / sizeof(std::uint32_t), aligned_buf.local_storage); - if(aligned_buf.input_size != aligned_buf.block_size) { + if (aligned_buf.input_size != aligned_buf.block_size) { // Copy value from aligned buffer to main buffer. std::memcpy(mem_ptr, aligned_buf.local_storage, size); } } - -void Cluster::write_mmio_device_register(const void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { - PCIDevice *pci_device = get_pci_device(core.chip); +void Cluster::write_mmio_device_register( + const void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { + PCIDevice* pci_device = get_pci_device(core.chip); const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); const scoped_lock lock(*get_mutex(fallback_tlb, pci_device->get_device_num())); log_debug(LogSiliconDriver, " dynamic tlb_index: {}", tlb_index); - auto [mapped_address, tlb_size] = pci_device->set_dynamic_tlb(tlb_index, core, addr, harvested_coord_translation, TLB_DATA::Strict); - // Align block to 4bytes if needed. + auto [mapped_address, tlb_size] = + pci_device->set_dynamic_tlb(tlb_index, core, addr, harvested_coord_translation, TLB_DATA::Strict); + // Align block to 4bytes if needed. 
auto aligned_buf = tt_4_byte_aligned_buffer(mem_ptr, size); - if(aligned_buf.input_size != aligned_buf.block_size) { + if (aligned_buf.input_size != aligned_buf.block_size) { // Copy value from main buffer to aligned buffer std::memcpy(aligned_buf.local_storage, mem_ptr, size); } pci_device->write_regs(mapped_address, aligned_buf.block_size / sizeof(uint32_t), aligned_buf.local_storage); } -void Cluster::read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { - bool target_is_mmio_capable = ndesc -> is_chip_mmio_capable(core.chip); +void Cluster::read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { + bool target_is_mmio_capable = ndesc->is_chip_mmio_capable(core.chip); if (target_is_mmio_capable) { if (fallback_tlb == "REG_TLB") { read_mmio_device_register(mem_ptr, core, addr, size, fallback_tlb); } else { read_device_memory(mem_ptr, core, addr, size, fallback_tlb); } - } - else { - log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO targets not supported in Blackhole"); // MT: Use only dynamic TLBs and never program static - log_assert((get_soc_descriptor(core.chip).ethernet_cores).size() > 0 && get_number_of_chips_in_cluster() > 1, "Cannot issue ethernet reads from a single chip cluster!"); + } else { + log_assert( + arch_name != tt::ARCH::BLACKHOLE, + "Non-MMIO targets not supported in Blackhole"); // MT: Use only dynamic TLBs and never program static + log_assert( + (get_soc_descriptor(core.chip).ethernet_cores).size() > 0 && get_number_of_chips_in_cluster() > 1, + "Cannot issue ethernet reads from a single chip cluster!"); read_from_non_mmio_device(mem_ptr, core, addr, size); } } -int Cluster::arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done, uint32_t arg0, uint32_t arg1, int timeout, uint32_t *return_3, uint32_t *return_4) { +int Cluster::arc_msg( + int logical_device_id, + uint32_t msg_code, + bool wait_for_done, 
+ uint32_t arg0, + uint32_t arg1, + int timeout, + uint32_t* return_3, + uint32_t* return_4) { log_assert(arch_name != tt::ARCH::BLACKHOLE, "ARC messages not supported in Blackhole"); - if(ndesc -> is_chip_mmio_capable(logical_device_id)) { + if (ndesc->is_chip_mmio_capable(logical_device_id)) { return pcie_arc_msg(logical_device_id, msg_code, wait_for_done, arg0, arg1, timeout, return_3, return_4); - } - else { + } else { return remote_arc_msg(logical_device_id, msg_code, wait_for_done, arg0, arg1, timeout, return_3, return_4); } } -void Cluster::send_tensix_risc_reset_to_core(const tt_cxy_pair &core, const TensixSoftResetOptions &soft_resets) { +void Cluster::send_tensix_risc_reset_to_core(const tt_cxy_pair& core, const TensixSoftResetOptions& soft_resets) { auto valid = soft_resets & ALL_TENSIX_SOFT_RESET; - uint32_t valid_val = (std::underlying_type::type) valid; + uint32_t valid_val = (std::underlying_type::type)valid; write_to_device(&valid_val, sizeof(uint32_t), core, 0xFFB121B0, "REG_TLB"); tt_driver_atomics::sfence(); } -void Cluster::send_remote_tensix_risc_reset_to_core(const tt_cxy_pair &core, const TensixSoftResetOptions &soft_resets) { +void Cluster::send_remote_tensix_risc_reset_to_core( + const tt_cxy_pair& core, const TensixSoftResetOptions& soft_resets) { auto valid = soft_resets & ALL_TENSIX_SOFT_RESET; - uint32_t valid_val = (std::underlying_type::type) valid; + uint32_t valid_val = (std::underlying_type::type)valid; write_to_non_mmio_device(&valid_val, sizeof(uint32_t), core, 0xFFB121B0); tt_driver_atomics::sfence(); } -int Cluster::set_remote_power_state(const chip_id_t &chip, tt_DevicePowerState device_state) { +int Cluster::set_remote_power_state(const chip_id_t& chip, tt_DevicePowerState device_state) { auto mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(chip); - return remote_arc_msg(chip, get_power_state_arc_msg(mmio_capable_chip_logical, device_state), true, 0, 0, 1, NULL, NULL); + return remote_arc_msg( + chip, 
get_power_state_arc_msg(mmio_capable_chip_logical, device_state), true, 0, 0, 1, NULL, NULL); } - void Cluster::enable_remote_ethernet_queue(const chip_id_t& chip, int timeout) { uint32_t msg_success = 0x0; auto timeout_seconds = std::chrono::seconds(timeout); auto start = std::chrono::system_clock::now(); while (msg_success != 1) { if (std::chrono::system_clock::now() - start > timeout_seconds) { - throw std::runtime_error(fmt::format("Timed out after waiting {} seconds for DRAM to finish training", timeout)); + throw std::runtime_error( + fmt::format("Timed out after waiting {} seconds for DRAM to finish training", timeout)); } int msg_rt = remote_arc_msg(chip, 0xaa58, true, 0xFFFF, 0xFFFF, 1, &msg_success, NULL); if (msg_rt == MSG_ERROR_REPLY) { @@ -2470,16 +3150,14 @@ void Cluster::enable_remote_ethernet_queue(const chip_id_t& chip, int timeout) { } } - -void Cluster::broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOptions &soft_resets) { - if(arch_name == tt::ARCH::GRAYSKULL) { - for (auto &device_it : m_pci_device_map) { +void Cluster::broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOptions& soft_resets) { + if (arch_name == tt::ARCH::GRAYSKULL) { + for (auto& device_it : m_pci_device_map) { broadcast_pcie_tensix_risc_reset(device_it.first, soft_resets); } - } - else { + } else { auto valid = soft_resets & ALL_TENSIX_SOFT_RESET; - uint32_t valid_val = (std::underlying_type::type) valid; + uint32_t valid_val = (std::underlying_type::type)valid; std::set chips_to_exclude = {}; std::set rows_to_exclude; std::set columns_to_exclude; @@ -2491,7 +3169,14 @@ void Cluster::broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOption columns_to_exclude = {0, 5}; } std::string fallback_tlb = "LARGE_WRITE_TLB"; - broadcast_write_to_cluster(&valid_val, sizeof(uint32_t), 0xFFB121B0, chips_to_exclude, rows_to_exclude, columns_to_exclude, fallback_tlb); + broadcast_write_to_cluster( + &valid_val, + sizeof(uint32_t), + 0xFFB121B0, + 
chips_to_exclude, + rows_to_exclude, + columns_to_exclude, + fallback_tlb); // Ensure that reset signal is globally visible wait_for_non_mmio_flush(); } @@ -2500,22 +3185,23 @@ void Cluster::broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOption void Cluster::set_power_state(tt_DevicePowerState device_state) { // MT Initial BH - ARC messages not supported in Blackhole if (arch_name != tt::ARCH::BLACKHOLE) { - for(auto& chip : target_devices_in_cluster) { - if(ndesc -> is_chip_mmio_capable(chip)) { + for (auto& chip : target_devices_in_cluster) { + if (ndesc->is_chip_mmio_capable(chip)) { set_pcie_power_state(device_state); } else { int exit_code = set_remote_power_state(chip, device_state); - log_assert(exit_code == 0, "Failed to set power state to {} with exit code: {}", (int)device_state, exit_code); + log_assert( + exit_code == 0, "Failed to set power state to {} with exit code: {}", (int)device_state, exit_code); } } } } void Cluster::enable_ethernet_queue(int timeout) { - for (const chip_id_t &chip : target_devices_in_cluster) { + for (const chip_id_t& chip : target_devices_in_cluster) { auto arch = get_soc_descriptor(chip).arch; - switch (arch) { + switch (arch) { case tt::ARCH::WORMHOLE_B0: { if (ndesc->is_chip_mmio_capable(chip)) { enable_local_ethernet_queue(chip, timeout); @@ -2524,20 +3210,17 @@ void Cluster::enable_ethernet_queue(int timeout) { } break; - case tt::ARCH::BLACKHOLE: - log_assert(false, "Arch BLACKHOLE doesn't support ethernet queues yet"); + case tt::ARCH::BLACKHOLE: + log_assert(false, "Arch BLACKHOLE doesn't support ethernet queues yet"); } default: { break; } } - } } -std::set Cluster::get_target_remote_device_ids() { - return target_remote_chips; -} +std::set Cluster::get_target_remote_device_ids() { return target_remote_chips; } void Cluster::deassert_resets_and_set_power_state() { // Assert tensix resets on all chips in cluster @@ -2546,15 +3229,29 @@ void Cluster::deassert_resets_and_set_power_state() { // MT Initial BH 
- ARC messages not supported in Blackhole if (arch_name != tt::ARCH::BLACKHOLE) { // Send ARC Messages to deassert RISCV resets - for (auto &device_it : m_pci_device_map){ - arc_msg(device_it.first, 0xaa00 | device_it.second.get()->get_architecture_implementation()->get_arc_message_deassert_riscv_reset(), true, 0, 0); - } - if(ndesc != nullptr) { - for(const chip_id_t& chip : target_devices_in_cluster) { - if(!ndesc -> is_chip_mmio_capable(chip)) { + for (auto& device_it : m_pci_device_map) { + arc_msg( + device_it.first, + 0xaa00 | + device_it.second.get()->get_architecture_implementation()->get_arc_message_deassert_riscv_reset(), + true, + 0, + 0); + } + if (ndesc != nullptr) { + for (const chip_id_t& chip : target_devices_in_cluster) { + if (!ndesc->is_chip_mmio_capable(chip)) { auto mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(chip); auto pci_device = get_pci_device(mmio_capable_chip_logical); - remote_arc_msg(chip, 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_deassert_riscv_reset(), true, 0x0, 0x0, 1, NULL, NULL); + remote_arc_msg( + chip, + 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_deassert_riscv_reset(), + true, + 0x0, + 0x0, + 1, + NULL, + NULL); } } enable_ethernet_queue(30); @@ -2565,11 +3262,16 @@ void Cluster::deassert_resets_and_set_power_state() { } void Cluster::verify_eth_fw() { - for(const auto& chip : target_devices_in_cluster) { + for (const auto& chip : target_devices_in_cluster) { uint32_t fw_version; std::vector fw_versions; - for (const tt_xy_pair ð_core : get_soc_descriptor(chip).ethernet_cores) { - read_from_device(&fw_version, tt_cxy_pair(chip, eth_core), l1_address_params.fw_version_addr, sizeof(uint32_t), "LARGE_READ_TLB"); + for (const tt_xy_pair& eth_core : get_soc_descriptor(chip).ethernet_cores) { + read_from_device( + &fw_version, + tt_cxy_pair(chip, eth_core), + l1_address_params.fw_version_addr, + sizeof(uint32_t), + "LARGE_READ_TLB"); 
fw_versions.push_back(fw_version); } verify_sw_fw_versions(chip, SW_VERSION, fw_versions); @@ -2577,7 +3279,7 @@ void Cluster::verify_eth_fw() { } } -void Cluster::verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector &fw_versions) { +void Cluster::verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector& fw_versions) { tt_version sw(sw_version), fw_first_eth_core(fw_versions.at(0)); log_info( LogSiliconDriver, @@ -2585,7 +3287,7 @@ void Cluster::verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std sw.str(), fw_first_eth_core.str(), device_id); - for (std::uint32_t &fw_version : fw_versions) { + for (std::uint32_t& fw_version : fw_versions) { tt_version fw(fw_version); log_assert(fw == fw_first_eth_core, "FW versions are not the same across different ethernet cores"); log_assert(sw.major == fw.major, "SW/FW major version number out of sync"); @@ -2598,14 +3300,16 @@ void Cluster::verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std use_ethernet_broadcast &= fw_first_eth_core >= tt_version(6, 5, 0); // Virtual coordinates can be used for broadcast headers if ERISC FW >= 6.8.0 and NOC translation is enabled // Temporarily enable this feature for 6.7.241 as well for testing. 
- use_virtual_coords_for_eth_broadcast &= (fw_first_eth_core >= tt_version(6, 8, 0) || fw_first_eth_core == tt_version(6, 7, 241)) && translation_tables_en; + use_virtual_coords_for_eth_broadcast &= + (fw_first_eth_core >= tt_version(6, 8, 0) || fw_first_eth_core == tt_version(6, 7, 241)) && + translation_tables_en; } -void Cluster::start_device(const tt_device_params &device_params) { - if(device_params.init_device) { +void Cluster::start_device(const tt_device_params& device_params) { + if (device_params.init_device) { initialize_pcie_devices(); // MT Initial BH - Ethernet firmware not present in Blackhole - if(arch_name == tt::ARCH::WORMHOLE_B0) { + if (arch_name == tt::ARCH::WORMHOLE_B0) { verify_eth_fw(); } deassert_resets_and_set_power_state(); @@ -2617,7 +3321,6 @@ void Cluster::close_device() { broadcast_tensix_risc_reset_to_cluster(TENSIX_ASSERT_SOFT_RESET); } - void Cluster::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) { l1_address_params = l1_address_params_; } @@ -2634,24 +3337,29 @@ void Cluster::set_driver_eth_interface_params(const tt_driver_eth_interface_para eth_interface_params = eth_interface_params_; } -void Cluster::setup_core_to_tlb_map(const chip_id_t logical_device_id, std::function mapping_function) { +void Cluster::setup_core_to_tlb_map( + const chip_id_t logical_device_id, std::function mapping_function) { map_core_to_tlb_per_chip[logical_device_id] = mapping_function; tlbs_init_per_chip[logical_device_id] = true; } std::uint32_t Cluster::get_num_dram_channels(std::uint32_t device_id) { - log_assert(target_devices_in_cluster.find(device_id) != target_devices_in_cluster.end(), "Querying DRAM parameters for a device that does not exist."); + log_assert( + target_devices_in_cluster.find(device_id) != target_devices_in_cluster.end(), + "Querying DRAM parameters for a device that does not exist."); return get_soc_descriptor(device_id).get_num_dram_channels(); } std::uint64_t 
Cluster::get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) { log_assert(channel < get_num_dram_channels(device_id), "Querying size for a device channel that does not exist."); - return get_soc_descriptor(device_id).dram_bank_size; // Space per channel is identical for now + return get_soc_descriptor(device_id).dram_bank_size; // Space per channel is identical for now } std::uint32_t Cluster::get_num_host_channels(std::uint32_t device_id) { auto devices = get_target_mmio_device_ids(); - log_assert(devices.find(device_id) != devices.end(), "Querying Host Address parameters for a non-mmio device or a device does not exist."); + log_assert( + devices.find(device_id) != devices.end(), + "Querying Host Address parameters for a non-mmio device or a device does not exist."); return m_pci_device_map.at(device_id)->get_num_host_mem_channels(); } @@ -2669,22 +3377,22 @@ std::uint32_t Cluster::get_numa_node_for_pcie_device(std::uint32_t device_id) { std::uint64_t Cluster::get_pcie_base_addr_from_device(const chip_id_t chip_id) const { // TODO: Should probably be lowered to TTDevice. tt::ARCH arch = get_soc_descriptor(chip_id).arch; - if(arch == tt::ARCH::WORMHOLE_B0) { + if (arch == tt::ARCH::WORMHOLE_B0) { return 0x800000000; - } - else if (arch == tt::ARCH::BLACKHOLE) { + } else if (arch == tt::ARCH::BLACKHOLE) { // Enable 4th ATU window. 
return 1ULL << 60; - } - else { + } else { return 0; } } tt_version Cluster::get_ethernet_fw_version() const { log_assert(arch_name == tt::ARCH::WORMHOLE_B0, "Can only get Ethernet FW version for Wormhole architectures."); - log_assert(eth_fw_version.major != 0xffff and eth_fw_version.minor != 0xff and eth_fw_version.patch != 0xff, "Device must be started before querying Ethernet FW version."); + log_assert( + eth_fw_version.major != 0xffff and eth_fw_version.minor != 0xff and eth_fw_version.patch != 0xff, + "Device must be started before querying Ethernet FW version."); return eth_fw_version; } -} +} // namespace tt::umd diff --git a/device/coordinate_manager.cpp b/device/coordinate_manager.cpp index 330eb864..eb3bda7e 100644 --- a/device/coordinate_manager.cpp +++ b/device/coordinate_manager.cpp @@ -4,9 +4,11 @@ * SPDX-License-Identifier: Apache-2.0 */ #include "umd/device/coordinate_manager.h" + #include -#include "umd/device/coordinate_manager.h" + #include "grayskull/grayskull_coordinate_manager.h" +#include "umd/device/coordinate_manager.h" tt_physical_coords CoordinateManager::to_physical_coords(tt_logical_coords logical_coords) { return tt_physical_coords(logical_x_to_physical_x[logical_coords.x], logical_y_to_physical_y[logical_coords.y]); @@ -71,13 +73,9 @@ void CoordinateManager::clear_harvesting_structures() { virtual_y_to_logical_y.clear(); } -std::set CoordinateManager::get_x_coordinates_to_harvest(std::size_t harvesting_mask) { - return {}; -} +std::set CoordinateManager::get_x_coordinates_to_harvest(std::size_t harvesting_mask) { return {}; } -std::set CoordinateManager::get_y_coordinates_to_harvest(std::size_t harvesting_mask) { - return {}; -} +std::set CoordinateManager::get_y_coordinates_to_harvest(std::size_t harvesting_mask) { return {}; } void CoordinateManager::perform_harvesting(std::size_t harvesting_mask) { clear_harvesting_structures(); @@ -104,14 +102,16 @@ void CoordinateManager::perform_harvesting(std::size_t harvesting_mask) { 
logical_x_to_virtual_x.resize(grid_size_x - num_harvested_x); logical_y_to_virtual_y.resize(grid_size_y - num_harvested_y); - fill_logical_to_physical_mapping(x_coordinates_to_harvest, y_coordinates_to_harvest, physical_x_unharvested, physical_y_unharvested); + fill_logical_to_physical_mapping( + x_coordinates_to_harvest, y_coordinates_to_harvest, physical_x_unharvested, physical_y_unharvested); fill_logical_to_virtual_mapping(physical_x_unharvested, physical_y_unharvested); } void CoordinateManager::fill_logical_to_physical_mapping( - const std::set& x_to_harvest, const std::set& y_to_harvest, - const std::set& physical_x_unharvested, const std::set& physical_y_unharvested) { - + const std::set& x_to_harvest, + const std::set& y_to_harvest, + const std::set& physical_x_unharvested, + const std::set& physical_y_unharvested) { auto physical_y_it = physical_y_unharvested.begin(); std::size_t logical_y = 0; for (size_t y = 0; y < worker_grid_size.y; y++) { @@ -130,7 +130,7 @@ void CoordinateManager::fill_logical_to_physical_mapping( auto physical_x_it = physical_x_unharvested.begin(); std::size_t logical_x = 0; - for(std::size_t x = 0; x < worker_grid_size.x; x++) { + for (std::size_t x = 0; x < worker_grid_size.x; x++) { if (x_to_harvest.find(x) == x_to_harvest.end()) { logical_x_to_physical_x[logical_x] = *physical_x_it; if (physical_x_to_logical_x.find(*physical_x_it) != physical_x_to_logical_x.end()) { @@ -145,7 +145,8 @@ void CoordinateManager::fill_logical_to_physical_mapping( } } -void CoordinateManager::fill_logical_to_virtual_mapping(const std::set& physical_x_unharvested, const std::set& physical_y_unharvested) { +void CoordinateManager::fill_logical_to_virtual_mapping( + const std::set& physical_x_unharvested, const std::set& physical_y_unharvested) { auto physical_y_it = physical_y_unharvested.begin(); for (std::size_t y = 0; y < logical_y_to_virtual_y.size(); y++) { logical_y_to_virtual_y[y] = *physical_y_it; @@ -176,7 +177,6 @@ std::unique_ptr 
CoordinateManager::get_coordinate_manager( const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) { - switch (arch) { case tt::ARCH::GRAYSKULL: return std::make_unique(worker_grid_size, workers, harvesting_mask); diff --git a/device/cpuset_lib.cpp b/device/cpuset_lib.cpp index b51a26cc..5c9f278b 100644 --- a/device/cpuset_lib.cpp +++ b/device/cpuset_lib.cpp @@ -2,17 +2,21 @@ // // SPDX-License-Identifier: Apache-2.0 +#include "cpuset_lib.hpp" + #include +#include +#include #include "cpuset_lib.hpp" +#include "fmt/core.h" #include "logger.hpp" -#include #include "umd/device/cluster.h" -#include -#include "fmt/core.h" + namespace tt { namespace fs = std::filesystem; + namespace cpuset { ///////////////////////////////////////////////////////////////////////// @@ -21,15 +25,18 @@ namespace cpuset { // Constructor for singleton class cpu id allocator tt_cpuset_allocator::tt_cpuset_allocator() { - - m_pid = getpid(); - m_debug = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_DEBUG") ? true : false; + m_pid = getpid(); + m_debug = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_DEBUG") ? true : false; // Chicken bit to disable this entire feature for debug/comparison. bool cpuset_allocator_enable_env = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_ENABLE") ? 
true : false; auto system_tid = std::this_thread::get_id(); - log_debug(LogSiliconDriver, "Starting tt_cpuset_allocator constructor now for process_id: {} thread_id: {}", m_pid, system_tid); + log_debug( + LogSiliconDriver, + "Starting tt_cpuset_allocator constructor now for process_id: {} thread_id: {}", + m_pid, + system_tid); m_enable_cpuset_allocator = true; @@ -38,86 +45,102 @@ tt_cpuset_allocator::tt_cpuset_allocator() { m_enable_cpuset_allocator &= init_get_number_of_packages(); m_enable_cpuset_allocator &= init_find_tt_pci_devices_packages_numanodes(); - if (!cpuset_allocator_enable_env){ + if (!cpuset_allocator_enable_env) { m_enable_cpuset_allocator = false; - }else{ - - bool is_cpu_supported = init_is_cpu_model_supported(); + } else { + bool is_cpu_supported = init_is_cpu_model_supported(); - if (is_cpu_supported){ + if (is_cpu_supported) { m_enable_cpuset_allocator &= init_determine_cpuset_allocations(); - }else{ + } else { m_enable_cpuset_allocator = false; } - log_debug(LogSiliconDriver,"Finished tt_cpuset_allocator constructor now with m_enable_cpuset_allocator: {} for process_id: {} thread_id: {} ", m_enable_cpuset_allocator, m_pid, system_tid); + log_debug( + LogSiliconDriver, + "Finished tt_cpuset_allocator constructor now with m_enable_cpuset_allocator: {} for process_id: {} " + "thread_id: {} ", + m_enable_cpuset_allocator, + m_pid, + system_tid); } } // Step 1 : Initialize and perform m_topology detection -bool tt_cpuset_allocator::init_topology_init_and_load(){ - log_debug(LogSiliconDriver,"Inside tt_cpuset_allocator::topology_init_and_load()"); +bool tt_cpuset_allocator::init_topology_init_and_load() { + log_debug(LogSiliconDriver, "Inside tt_cpuset_allocator::topology_init_and_load()"); - if (!m_enable_cpuset_allocator){ + if (!m_enable_cpuset_allocator) { return false; } - if (hwloc_topology_init(&m_topology)){ + if (hwloc_topology_init(&m_topology)) { log_warning(LogSiliconDriver, "Problem initializing topology"); return false; } - 
hwloc_topology_set_type_filter(m_topology, HWLOC_OBJ_PCI_DEVICE, HWLOC_TYPE_FILTER_KEEP_ALL); // Need to find PCI devices. + hwloc_topology_set_type_filter( + m_topology, HWLOC_OBJ_PCI_DEVICE, HWLOC_TYPE_FILTER_KEEP_ALL); // Need to find PCI devices. - if (hwloc_topology_load(m_topology)){ + if (hwloc_topology_load(m_topology)) { log_warning(LogSiliconDriver, "Problem loading topology"); return false; } - return true; // Success + return true; // Success } -// Step 2 - Find TT PCI devices in topology by vendor_id to get their PCI bus_id and physical device_id, and package and numamode. -bool tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes(){ - - if (!m_enable_cpuset_allocator){ +// Step 2 - Find TT PCI devices in topology by vendor_id to get their PCI bus_id and physical device_id, and package and +// numamode. +bool tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes() { + if (!m_enable_cpuset_allocator) { return false; } - log_debug(LogSiliconDriver,"Starting tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes()"); + log_debug(LogSiliconDriver, "Starting tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes()"); m_num_tt_device_by_pci_device_id_map.clear(); hwloc_obj_t pci_device_obj = NULL; const std::regex tt_device_re("tenstorrent!([0-9]+)"); - while ((pci_device_obj = hwloc_get_next_pcidev(m_topology, pci_device_obj))){ - - if (hwloc_obj_type_is_io(pci_device_obj->type) && (pci_device_obj->attr->pcidev.vendor_id == TENSTORRENT_VENDOR_ID)) { - - std::pair device_id_revision = std::make_pair(pci_device_obj->attr->pcidev.device_id, pci_device_obj->attr->pcidev.revision); + while ((pci_device_obj = hwloc_get_next_pcidev(m_topology, pci_device_obj))) { + if (hwloc_obj_type_is_io(pci_device_obj->type) && + (pci_device_obj->attr->pcidev.vendor_id == TENSTORRENT_VENDOR_ID)) { + std::pair device_id_revision = + std::make_pair(pci_device_obj->attr->pcidev.device_id, pci_device_obj->attr->pcidev.revision); 
m_num_tt_device_by_pci_device_id_map[device_id_revision] += 1; - std::string pci_bus_id_str = get_pci_bus_id(pci_device_obj); + std::string pci_bus_id_str = get_pci_bus_id(pci_device_obj); std::string pci_device_dir = fmt::format("/sys/bus/pci/devices/{}/tenstorrent/", pci_bus_id_str); int physical_device_id = -1; - log_trace(LogSiliconDriver, "Found TT device with pci_bus_id_str: {} num_devices_by_pci_device_id: {}", pci_bus_id_str, m_num_tt_device_by_pci_device_id_map[device_id_revision]); + log_trace( + LogSiliconDriver, + "Found TT device with pci_bus_id_str: {} num_devices_by_pci_device_id: {}", + pci_bus_id_str, + m_num_tt_device_by_pci_device_id_map[device_id_revision]); // First, get the physical_device_id of the device. - if (fs::exists(pci_device_dir)){ - for (const auto &entry : fs::directory_iterator(pci_device_dir)){ + if (fs::exists(pci_device_dir)) { + for (const auto &entry : fs::directory_iterator(pci_device_dir)) { auto entry_str = entry.path().string(); - if (std::smatch device_match; std::regex_search(entry_str, device_match, tt_device_re) and (stoi(device_match[1]) >= 0)){ + if (std::smatch device_match; + std::regex_search(entry_str, device_match, tt_device_re) and (stoi(device_match[1]) >= 0)) { physical_device_id = stoi(device_match[1]); m_all_tt_devices.push_back(physical_device_id); - log_debug(LogSiliconDriver, "Found physical_device_id: {} from file: {}", physical_device_id, entry_str); + log_debug( + LogSiliconDriver, + "Found physical_device_id: {} from file: {}", + physical_device_id, + entry_str); break; } } - if (physical_device_id == -1){ - log_warning(LogSiliconDriver, "Did not find file containing physical_device_id in {}", pci_device_dir); + if (physical_device_id == -1) { + log_warning( + LogSiliconDriver, "Did not find file containing physical_device_id in {}", pci_device_dir); return false; } @@ -125,19 +148,23 @@ bool tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes(){ // Next, get the PackageID of the device 
and update maps. auto package_id = get_package_id_from_device(pci_device_obj, physical_device_id); - - // This package was not previously seen. Initialize structures tracking the TT Devices mapped to this + + // This package was not previously seen. Initialize structures tracking the TT Devices mapped to this // package and structures storing the CPU characteristics per package. if (m_package_id_to_devices_map.find(package_id) == m_package_id_to_devices_map.end()) { m_package_id_to_devices_map.insert({package_id, {}}); m_package_id_to_num_l3_per_ccx_map.insert({package_id, 0}); m_package_id_to_num_ccx_per_ccd_map.insert({package_id, 0}); } - if (package_id != -1){ + if (package_id != -1) { m_package_id_to_devices_map.at(package_id).push_back(physical_device_id); m_physical_device_id_to_package_id_map.insert({physical_device_id, package_id}); } else { - log_warning(LogSiliconDriver, "Could not find package_id for TT Device (physical_device_id: {} pci_bus_id: {})", physical_device_id, pci_bus_id_str); + log_warning( + LogSiliconDriver, + "Could not find package_id for TT Device (physical_device_id: {} pci_bus_id: {})", + physical_device_id, + pci_bus_id_str); return false; } @@ -145,377 +172,479 @@ bool tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes(){ auto numa_nodeset = get_numa_nodeset_from_device(pci_device_obj, physical_device_id); m_physical_device_id_to_numa_nodeset_map.insert({physical_device_id, numa_nodeset}); - if (numa_nodeset == 0x0){ - log_warning(LogSiliconDriver, "Could not find NumaNodeSet for TT Device (physical_device_id: {} pci_bus_id: {})", physical_device_id, pci_bus_id_str); + if (numa_nodeset == 0x0) { + log_warning( + LogSiliconDriver, + "Could not find NumaNodeSet for TT Device (physical_device_id: {} pci_bus_id: {})", + physical_device_id, + pci_bus_id_str); return false; } - m_physical_device_id_to_cpusets_map.insert({physical_device_id, {}}); // Empty vector. 
+ m_physical_device_id_to_cpusets_map.insert({physical_device_id, {}}); // Empty vector. m_num_cpu_cores_allocated_per_tt_device.insert({physical_device_id, 0}); } } } - if (m_all_tt_devices.size() == 0){ - log_warning(LogSiliconDriver, "Did not find any PCI devices matching Tenstorrent vendor_id 0x{:x}", TENSTORRENT_VENDOR_ID); + if (m_all_tt_devices.size() == 0) { + log_warning( + LogSiliconDriver, + "Did not find any PCI devices matching Tenstorrent vendor_id 0x{:x}", + TENSTORRENT_VENDOR_ID); return false; } - log_debug(LogSiliconDriver,"Finshed tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes() found {} devices", m_all_tt_devices.size()); - + log_debug( + LogSiliconDriver, + "Finshed tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes() found {} devices", + m_all_tt_devices.size()); // Sort these 2 vectors of device_ids before we are done, since discovery can be in any order. - for (auto &p: m_package_id_to_devices_map){ + for (auto &p : m_package_id_to_devices_map) { std::sort(p.second.begin(), p.second.end()); } std::sort(m_all_tt_devices.begin(), m_all_tt_devices.end()); - return true; // Success + return true; // Success } - // Step 3 : Detect the number of packages. -bool tt_cpuset_allocator::init_get_number_of_packages(){ - - if (!m_enable_cpuset_allocator){ +bool tt_cpuset_allocator::init_get_number_of_packages() { + if (!m_enable_cpuset_allocator) { return false; } m_num_packages = hwloc_get_nbobjs_by_type(m_topology, HWLOC_OBJ_PACKAGE); - log_debug(LogSiliconDriver,"Found {} CPU packages", m_num_packages); - return m_num_packages > 0; // Success + log_debug(LogSiliconDriver, "Found {} CPU packages", m_num_packages); + return m_num_packages > 0; // Success } // Step 4 : Return true if all packages are models we want to support. Env-var can be used to ignore this check. 
-bool tt_cpuset_allocator::init_is_cpu_model_supported(){ - - if (!m_enable_cpuset_allocator){ +bool tt_cpuset_allocator::init_is_cpu_model_supported() { + if (!m_enable_cpuset_allocator) { return false; } - if (m_num_packages == 0){ - log_debug(LogSiliconDriver,"init_is_cpu_model_supported(): Found 0 packages, functions run out of order?"); + if (m_num_packages == 0) { + log_debug(LogSiliconDriver, "init_is_cpu_model_supported(): Found 0 packages, functions run out of order?"); return false; } bool use_any_cpu = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_SUPPORT_ANY_CPU") ? true : false; - log_debug(LogSiliconDriver,"Inside tt_cpuset_allocator::check_if_cpu_model_supported()"); + log_debug(LogSiliconDriver, "Inside tt_cpuset_allocator::check_if_cpu_model_supported()"); // Supported CPU Models for enabling CPUSET Allocator. Keep the list small to production machines to start. - std::vector supported_cpu_models = { "AMD EPYC 7352 24-Core Processor", - "AMD EPYC 7532 32-Core Processor"}; + std::vector supported_cpu_models = { + "AMD EPYC 7352 24-Core Processor", "AMD EPYC 7532 32-Core Processor"}; // CPU Models that have L3 per CCX and 2 CCX per CCD - std::vector opt_2ccx_per_ccd_cpu_models = { "AMD EPYC 7352 24-Core Processor", - "AMD EPYC 7532 32-Core Processor"}; - for(const auto& package: m_package_id_to_devices_map) { + std::vector opt_2ccx_per_ccd_cpu_models = { + "AMD EPYC 7352 24-Core Processor", "AMD EPYC 7532 32-Core Processor"}; + for (const auto &package : m_package_id_to_devices_map) { int package_id = package.first; auto package_obj = hwloc_get_obj_by_type(m_topology, HWLOC_OBJ_PACKAGE, package_id); - if (m_debug) print_hwloc_object(package_obj, 0, true, true); + if (m_debug) { + print_hwloc_object(package_obj, 0, true, true); + } std::string pkg_cpu_model = hwloc_obj_get_info_by_name(package_obj, "CPUModel"); // First find out if this CPU is supported by CPUSET Allocator at all. bool has_supported_cpu = use_any_cpu ? 
true : false; - for (auto &supported_cpu_model : supported_cpu_models){ + for (auto &supported_cpu_model : supported_cpu_models) { has_supported_cpu |= (pkg_cpu_model.find(supported_cpu_model) != std::string::npos); } - log_debug(LogSiliconDriver,"Detected package-id: {} has_supported_cpu: {} for CpuModel: {}", package_id, has_supported_cpu, pkg_cpu_model); + log_debug( + LogSiliconDriver, + "Detected package-id: {} has_supported_cpu: {} for CpuModel: {}", + package_id, + has_supported_cpu, + pkg_cpu_model); - if (!has_supported_cpu){ + if (!has_supported_cpu) { return false; } // Then, determine if the 2CCX-PER-CCD optimization can be enabled for this CPU Model in the package. - for (auto &opt_cpu_model : opt_2ccx_per_ccd_cpu_models){ - if (pkg_cpu_model.find(opt_cpu_model) != std::string::npos){ + for (auto &opt_cpu_model : opt_2ccx_per_ccd_cpu_models) { + if (pkg_cpu_model.find(opt_cpu_model) != std::string::npos) { m_package_id_to_num_l3_per_ccx_map.at(package_id) = 1; m_package_id_to_num_ccx_per_ccd_map.at(package_id) = 2; } } } - return true; // Successhwloc + return true; // Successhwloc } - -// Step 5: Get all target allocation objects (ie. L3Cache if IO thread to be allocated per L3Cache cpuset) for a given socket/package. -bool tt_cpuset_allocator::init_determine_cpuset_allocations(){ - - if (!m_enable_cpuset_allocator){ +// Step 5: Get all target allocation objects (ie. L3Cache if IO thread to be allocated per L3Cache cpuset) for a given +// socket/package. 
+bool tt_cpuset_allocator::init_determine_cpuset_allocations() { + if (!m_enable_cpuset_allocator) { return false; } - log_debug(LogSiliconDriver,"Inside tt_cpuset_allocator::init_determine_cpuset_allocations()"); - for (const auto& package : m_package_id_to_devices_map) { + log_debug(LogSiliconDriver, "Inside tt_cpuset_allocator::init_determine_cpuset_allocations()"); + for (const auto &package : m_package_id_to_devices_map) { int package_id = package.first; auto num_tt_devices_for_cpu_package = package.second.size(); - if (num_tt_devices_for_cpu_package == 0){ - log_debug(LogSiliconDriver, "init_determine_cpuset_allocations() -- no TT devices for package_id: {}, skipping.", package_id); + if (num_tt_devices_for_cpu_package == 0) { + log_debug( + LogSiliconDriver, + "init_determine_cpuset_allocations() -- no TT devices for package_id: {}, skipping.", + package_id); continue; } - log_debug(LogSiliconDriver, "init_determine_cpuset_allocations(). starting to detect allocation slots for package_id: {} ", package_id); + log_debug( + LogSiliconDriver, + "init_determine_cpuset_allocations(). 
starting to detect allocation slots for package_id: {} ", + package_id); auto package_obj = hwloc_get_obj_by_type(m_topology, HWLOC_OBJ_PACKAGE, package_id); - if (m_debug) print_hwloc_object(package_obj, 0, true, true); + if (m_debug) { + print_hwloc_object(package_obj, 0, true, true); + } - auto num_alloc_slots_in_package = hwloc_get_nbobjs_inside_cpuset_by_type(m_topology, package_obj->cpuset, m_object_per_alloc_slot); - if (num_alloc_slots_in_package == 0){ - log_warning(LogSiliconDriver, "Could not find any of the alloc objects in package_id: {} for this cpu arc", package_id); + auto num_alloc_slots_in_package = + hwloc_get_nbobjs_inside_cpuset_by_type(m_topology, package_obj->cpuset, m_object_per_alloc_slot); + if (num_alloc_slots_in_package == 0) { + log_warning( + LogSiliconDriver, + "Could not find any of the alloc objects in package_id: {} for this cpu arc", + package_id); return false; } auto num_alloc_slots_per_tt_device = num_alloc_slots_in_package / num_tt_devices_for_cpu_package; // Above splits evenly by devices, leaves remainder unused in the example case of 3 devices but 8 slots. - log_debug(LogSiliconDriver, "init_determine_cpuset_allocations(). package_id: {} num_alloc_slots_in_package: {} num_tt_devices_for_cpu_package: {} num_alloc_slots_per_tt_device: {}", - package_id, num_alloc_slots_in_package, num_tt_devices_for_cpu_package, num_alloc_slots_per_tt_device); + log_debug( + LogSiliconDriver, + "init_determine_cpuset_allocations(). 
package_id: {} num_alloc_slots_in_package: {} " + "num_tt_devices_for_cpu_package: {} num_alloc_slots_per_tt_device: {}", + package_id, + num_alloc_slots_in_package, + num_tt_devices_for_cpu_package, + num_alloc_slots_per_tt_device); int device_idx = 0; - for (int obj_idx = 0; obj_idx < num_alloc_slots_in_package; obj_idx++){ - - auto obj = hwloc_get_obj_below_by_type(m_topology, HWLOC_OBJ_PACKAGE, package_id, m_object_per_alloc_slot, obj_idx); + for (int obj_idx = 0; obj_idx < num_alloc_slots_in_package; obj_idx++) { + auto obj = hwloc_get_obj_below_by_type( + m_topology, HWLOC_OBJ_PACKAGE, package_id, m_object_per_alloc_slot, obj_idx); - if (obj){ - if (m_debug) print_hwloc_object(obj, 1, true); + if (obj) { + if (m_debug) { + print_hwloc_object(obj, 1, true); + } auto physical_device_id = m_package_id_to_devices_map.at(package_id).at(device_idx); // Hack for maximum number of slots per device. // if (m_physical_device_id_to_cpusets_map.at(physical_device_id).size() < 2){ m_physical_device_id_to_cpusets_map.at(physical_device_id).push_back(obj->cpuset); - int num_cpus = hwloc_get_nbobjs_inside_cpuset_by_type(m_topology,obj->cpuset,HWLOC_OBJ_CORE); + int num_cpus = hwloc_get_nbobjs_inside_cpuset_by_type(m_topology, obj->cpuset, HWLOC_OBJ_CORE); m_num_cpu_cores_allocated_per_tt_device.at(physical_device_id) += num_cpus; // } // We're distributing allocation objects per package across TT devices, so switch to next one. - if (((obj_idx + 1) % num_alloc_slots_per_tt_device) == 0){ - device_idx = (device_idx + 1) % num_tt_devices_for_cpu_package; // Loop around if extra slots remain. Assigned to first device for that package. + if (((obj_idx + 1) % num_alloc_slots_per_tt_device) == 0) { + device_idx = (device_idx + 1) % + num_tt_devices_for_cpu_package; // Loop around if extra slots remain. Assigned to + // first device for that package. } - }else{ - log_warning(LogSiliconDriver, "init_determine_cpuset_allocations(). 
Something went wrong looking for cpuset alloc object under package"); + } else { + log_warning( + LogSiliconDriver, + "init_determine_cpuset_allocations(). Something went wrong looking for cpuset alloc object under " + "package"); return false; } } - log_debug(LogSiliconDriver, "init_determine_cpuset_allocations(). Done detecting allocation slots for package_id: {} ", package_id); + log_debug( + LogSiliconDriver, + "init_determine_cpuset_allocations(). Done detecting allocation slots for package_id: {} ", + package_id); } - // Summary for Debug purposes. - for (auto &physical_device_id : m_all_tt_devices){ - for (size_t device_alloc_idx=0; device_alloc_idx < m_physical_device_id_to_cpusets_map.at(physical_device_id).size(); device_alloc_idx++){ + for (auto &physical_device_id : m_all_tt_devices) { + for (size_t device_alloc_idx = 0; + device_alloc_idx < m_physical_device_id_to_cpusets_map.at(physical_device_id).size(); + device_alloc_idx++) { auto cpuset = m_physical_device_id_to_cpusets_map.at(physical_device_id).at(device_alloc_idx); auto pu_ids_vector = get_hwloc_bitmap_vector(cpuset); auto num_pu_ids = pu_ids_vector.size(); auto package_id = m_physical_device_id_to_package_id_map.at(physical_device_id); - log_debug(LogSiliconDriver, "Done init_determine_cpuset_allocations(). Summary => for mmio physical_device_id: {} package_id: {} device_alloc_idx: {} picked {} PU's {}", physical_device_id, package_id, device_alloc_idx, num_pu_ids, pu_ids_vector); + log_debug( + LogSiliconDriver, + "Done init_determine_cpuset_allocations(). 
Summary => for mmio physical_device_id: {} package_id: {} " + "device_alloc_idx: {} picked {} PU's {}", + physical_device_id, + package_id, + device_alloc_idx, + num_pu_ids, + pu_ids_vector); } } - return true; // Success - + return true; // Success } ///////////////////////////////////////////////////////////////////////// // Runtime Functions //////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// -// Given a physical device_id, determine the right numa nodes associated with it and attempt to membind a previously allocated memory region to it. -bool tt_cpuset_allocator::bind_area_memory_nodeset(chip_id_t physical_device_id, const void * addr, size_t len){ - +// Given a physical device_id, determine the right numa nodes associated with it and attempt to membind a previously +// allocated memory region to it. +bool tt_cpuset_allocator::bind_area_memory_nodeset(chip_id_t physical_device_id, const void *addr, size_t len) { auto tid = std::this_thread::get_id(); - log_debug(LogSiliconDriver,"bind_area_memory_nodeset(): Going to attempt memory binding of addr/len to NumaNode for physical_device_id: {} (pid: {} tid: {})", physical_device_id, m_pid, tid); - - if (m_physical_device_id_to_numa_nodeset_map.count(physical_device_id) == 0){ - log_fatal("bind_area_memory_nodeset(): Did not find physical_device_id: {} in numanode_mask map, this is not expected.", physical_device_id); + log_debug( + LogSiliconDriver, + "bind_area_memory_nodeset(): Going to attempt memory binding of addr/len to NumaNode for physical_device_id: " + "{} (pid: {} tid: {})", + physical_device_id, + m_pid, + tid); + + if (m_physical_device_id_to_numa_nodeset_map.count(physical_device_id) == 0) { + log_fatal( + "bind_area_memory_nodeset(): Did not find physical_device_id: {} in numanode_mask map, this is not " + "expected.", + physical_device_id); return false; } auto target_nodeset = 
m_physical_device_id_to_numa_nodeset_map.at(physical_device_id); - if (target_nodeset != 0){ - if (hwloc_set_area_membind(m_topology, addr, len, target_nodeset, HWLOC_MEMBIND_BIND, HWLOC_MEMBIND_BYNODESET | HWLOC_MEMBIND_STRICT | HWLOC_MEMBIND_MIGRATE) ){ - log_warning(LogSiliconDriver,"hwloc_set_area_membind(): failed for physical_device_id: {} on NodeSet: {} with errno: {} (pid: {} tid: {})", - physical_device_id, get_hwloc_bitmap_vector(target_nodeset), strerror(errno), m_pid, tid); + if (target_nodeset != 0) { + if (hwloc_set_area_membind( + m_topology, + addr, + len, + target_nodeset, + HWLOC_MEMBIND_BIND, + HWLOC_MEMBIND_BYNODESET | HWLOC_MEMBIND_STRICT | HWLOC_MEMBIND_MIGRATE)) { + log_warning( + LogSiliconDriver, + "hwloc_set_area_membind(): failed for physical_device_id: {} on NodeSet: {} with errno: {} (pid: {} " + "tid: {})", + physical_device_id, + get_hwloc_bitmap_vector(target_nodeset), + strerror(errno), + m_pid, + tid); return false; - }else{ - log_debug(LogSiliconDriver,"hwloc_set_area_membind(): success for physical_device_id: {} on NodeSet: {} (pid: {} tid: {})", physical_device_id, get_hwloc_bitmap_vector(target_nodeset), m_pid, tid); + } else { + log_debug( + LogSiliconDriver, + "hwloc_set_area_membind(): success for physical_device_id: {} on NodeSet: {} (pid: {} tid: {})", + physical_device_id, + get_hwloc_bitmap_vector(target_nodeset), + m_pid, + tid); } - }else{ - log_warning(LogSiliconDriver,"bind_area_memory_nodeset(): Unable to determine TT Device to NumaNode mapping for physical_device_id: {}. Skipping membind.", physical_device_id); + } else { + log_warning( + LogSiliconDriver, + "bind_area_memory_nodeset(): Unable to determine TT Device to NumaNode mapping for physical_device_id: {}. 
" + "Skipping membind.", + physical_device_id); return false; } - return true; // Success + return true; // Success } int tt_cpuset_allocator::_get_num_tt_pci_devices() { - for (auto &d : m_physical_device_id_to_package_id_map) { log_trace(LogSiliconDriver, "Found physical_device_id: {} ", d.first); } return m_physical_device_id_to_package_id_map.size(); } - - - ///////////////////////////////////////////////////////////////////////// -//Helper Functions ////////////////////////////////////////////////////// +// Helper Functions ////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// - -std::string tt_cpuset_allocator::get_pci_bus_id(hwloc_obj_t pci_device_obj){ - +std::string tt_cpuset_allocator::get_pci_bus_id(hwloc_obj_t pci_device_obj) { std::string pci_bus_id_str = ""; - if (hwloc_obj_type_is_io(pci_device_obj->type)) { + if (hwloc_obj_type_is_io(pci_device_obj->type)) { auto attrs = pci_device_obj->attr->pcidev; pci_bus_id_str = fmt::format("{:04x}:{:02x}:{:02x}.{:01x}", attrs.domain, attrs.bus, attrs.dev, attrs.func); } return pci_bus_id_str; - } -int tt_cpuset_allocator::get_package_id_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id){ - +int tt_cpuset_allocator::get_package_id_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id) { auto pci_bus_id_str = m_physical_device_id_to_pci_bus_id_map.at(physical_device_id); - log_debug(LogSiliconDriver, "Checking TT device (physical_device_id: {} pci_bus_id: {}) to find it's corresponding CPU package", physical_device_id, pci_bus_id_str); + log_debug( + LogSiliconDriver, + "Checking TT device (physical_device_id: {} pci_bus_id: {}) to find it's corresponding CPU package", + physical_device_id, + pci_bus_id_str); hwloc_obj_t tmp_obj = hwloc_get_non_io_ancestor_obj(m_topology, pci_device_obj); int package_id = -1; // Keep going up until package/machine hierarchy is found, in case we don't find it right away. 
- while (package_id == -1){ - - if ((hwloc_compare_types(tmp_obj->type, HWLOC_OBJ_PACKAGE) == 0) || (hwloc_compare_types(tmp_obj->type, HWLOC_OBJ_MACHINE) == 0)){ - if (tmp_obj->os_index != (unsigned) -1){ + while (package_id == -1) { + if ((hwloc_compare_types(tmp_obj->type, HWLOC_OBJ_PACKAGE) == 0) || + (hwloc_compare_types(tmp_obj->type, HWLOC_OBJ_MACHINE) == 0)) { + if (tmp_obj->os_index != (unsigned)-1) { package_id = tmp_obj->os_index; - }else{ - log_warning(LogSiliconDriver, "Could not find os_index of package or machine object for TT device (physical_device_id: {} pci_bus_id: {})", physical_device_id, pci_bus_id_str); + } else { + log_warning( + LogSiliconDriver, + "Could not find os_index of package or machine object for TT device (physical_device_id: {} " + "pci_bus_id: {})", + physical_device_id, + pci_bus_id_str); break; } - }else{ - if (tmp_obj->parent){ + } else { + if (tmp_obj->parent) { tmp_obj = tmp_obj->parent; - }else{ + } else { break; } } } - if (m_debug) print_hwloc_object(pci_device_obj, 1, true, true); - if (m_debug) print_hwloc_object(tmp_obj, 1, true, true); + if (m_debug) { + print_hwloc_object(pci_device_obj, 1, true, true); + } + if (m_debug) { + print_hwloc_object(tmp_obj, 1, true, true); + } return package_id; } -hwloc_nodeset_t tt_cpuset_allocator::get_numa_nodeset_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id){ - +hwloc_nodeset_t tt_cpuset_allocator::get_numa_nodeset_from_device( + hwloc_obj_t pci_device_obj, chip_id_t physical_device_id) { hwloc_nodeset_t nodeset = 0x0; // Currently an issue in non-EPYC machines where PCI devices are directly under Machine, and not any NumaNodes. // As quick workaround, skip this if there is only single numanode since returning 1 seems fine. 
- if (hwloc_get_nbobjs_by_type(m_topology, HWLOC_OBJ_NUMANODE) == 1){ + if (hwloc_get_nbobjs_by_type(m_topology, HWLOC_OBJ_NUMANODE) == 1) { auto numanode = hwloc_get_obj_by_type(m_topology, HWLOC_OBJ_NUMANODE, 0); return numanode->nodeset; } auto pci_bus_id_str = m_physical_device_id_to_pci_bus_id_map.at(physical_device_id); - log_debug(LogSiliconDriver, "init_detect_tt_device_numanodes(): Checking TT device (physical_device_id: {} pci_bus_id: {}) to find it's corresponding NumaNode.", physical_device_id, pci_bus_id_str); + log_debug( + LogSiliconDriver, + "init_detect_tt_device_numanodes(): Checking TT device (physical_device_id: {} pci_bus_id: {}) to find it's " + "corresponding NumaNode.", + physical_device_id, + pci_bus_id_str); hwloc_obj_t tmp_obj = pci_device_obj->parent; - while (tmp_obj && !tmp_obj->memory_arity){ + while (tmp_obj && !tmp_obj->memory_arity) { tmp_obj = tmp_obj->parent; /* no memory child, walk up */ } - if (tmp_obj && tmp_obj->nodeset){ - log_debug(LogSiliconDriver, "init_detect_tt_device_numanodes(): For TT device (physical_device_id: {} pci_bus_id: {}) found NumaNodeSet: {}", physical_device_id, pci_bus_id_str, get_hwloc_bitmap_vector(tmp_obj->nodeset)); + if (tmp_obj && tmp_obj->nodeset) { + log_debug( + LogSiliconDriver, + "init_detect_tt_device_numanodes(): For TT device (physical_device_id: {} pci_bus_id: {}) found " + "NumaNodeSet: {}", + physical_device_id, + pci_bus_id_str, + get_hwloc_bitmap_vector(tmp_obj->nodeset)); nodeset = tmp_obj->nodeset; - }else{ - log_warning(LogSiliconDriver, "init_detect_tt_device_numanodes(): Could not determine NumaNodeSet for TT device (physical_device_id: {} pci_bus_id: {})", physical_device_id, pci_bus_id_str); + } else { + log_warning( + LogSiliconDriver, + "init_detect_tt_device_numanodes(): Could not determine NumaNodeSet for TT device (physical_device_id: {} " + "pci_bus_id: {})", + physical_device_id, + pci_bus_id_str); } return nodeset; - } int 
tt_cpuset_allocator::_get_num_tt_pci_devices_by_pci_device_id(uint16_t device_id, uint16_t revision) { - std::pair device_id_revision = std::make_pair(device_id, revision); if (m_num_tt_device_by_pci_device_id_map.find(device_id_revision) != m_num_tt_device_by_pci_device_id_map.end()) { return m_num_tt_device_by_pci_device_id_map.at(device_id_revision); } else { - log_warning(LogSiliconDriver, "Cannot find any TT device with PCI device_id: 0x{:x} and revision: {} in topology.", device_id, revision); + log_warning( + LogSiliconDriver, + "Cannot find any TT device with PCI device_id: 0x{:x} and revision: {} in topology.", + device_id, + revision); return 0; } } ///////////////////////////////////////////////////////////////////////// -//Debug Functions /////////////////////////////////////////////////////// +// Debug Functions /////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// // Get all PU ids (or numa nodes) in a vector, for legacy/back-compat/debug purposes. -std::vector tt_cpuset_allocator::get_hwloc_bitmap_vector(hwloc_bitmap_t &bitmap){ - +std::vector tt_cpuset_allocator::get_hwloc_bitmap_vector(hwloc_bitmap_t &bitmap) { std::vector indices; int index; - if (bitmap){ - hwloc_bitmap_foreach_begin(index, bitmap) - indices.push_back(index); + if (bitmap) { + hwloc_bitmap_foreach_begin(index, bitmap) indices.push_back(index); hwloc_bitmap_foreach_end(); } return indices; } -std::vector tt_cpuset_allocator::get_hwloc_cpuset_vector(hwloc_obj_t &obj){ +std::vector tt_cpuset_allocator::get_hwloc_cpuset_vector(hwloc_obj_t &obj) { return get_hwloc_bitmap_vector(obj->cpuset); } -std::vector tt_cpuset_allocator::get_hwloc_nodeset_vector(hwloc_obj_t &obj){ +std::vector tt_cpuset_allocator::get_hwloc_nodeset_vector(hwloc_obj_t &obj) { return get_hwloc_bitmap_vector(obj->nodeset); } - // Nicer way to print pu ids as a vector on single line. 
-void tt_cpuset_allocator::print_hwloc_cpuset(hwloc_obj_t &obj){ +void tt_cpuset_allocator::print_hwloc_cpuset(hwloc_obj_t &obj) { std::cout << " Number: " << hwloc_bitmap_weight(obj->cpuset) << " cpuset_pu_ids: " << get_hwloc_cpuset_vector(obj); } -void tt_cpuset_allocator::print_hwloc_nodeset(hwloc_obj_t &obj){ - std::cout << " Number: " << hwloc_bitmap_weight(obj->nodeset) << " nodeset node_ids: " << get_hwloc_nodeset_vector(obj); +void tt_cpuset_allocator::print_hwloc_nodeset(hwloc_obj_t &obj) { + std::cout << " Number: " << hwloc_bitmap_weight(obj->nodeset) + << " nodeset node_ids: " << get_hwloc_nodeset_vector(obj); } -void tt_cpuset_allocator::print_hwloc_object(hwloc_obj_t &obj, int depth, bool verbose, bool show_cpuids){ - +void tt_cpuset_allocator::print_hwloc_object(hwloc_obj_t &obj, int depth, bool verbose, bool show_cpuids) { char type[32], attr[1024]; hwloc_obj_type_snprintf(type, sizeof(type), obj, verbose); - printf("%*s%s", 2*depth, "", type); - if (obj->os_index != (unsigned) -1) + printf("%*s%s", 2 * depth, "", type); + if (obj->os_index != (unsigned)-1) { printf("#%u", obj->os_index); + } hwloc_obj_attr_snprintf(attr, sizeof(attr), obj, " ", verbose); - if (*attr) + if (*attr) { printf("(%s)", attr); - if (show_cpuids && obj->cpuset) + } + if (show_cpuids && obj->cpuset) { print_hwloc_cpuset(obj); + } printf("\n"); } - } // namespace cpuset } // namespace tt diff --git a/device/cpuset_lib.hpp b/device/cpuset_lib.hpp index a08bc7cc..01210c02 100644 --- a/device/cpuset_lib.hpp +++ b/device/cpuset_lib.hpp @@ -4,19 +4,18 @@ * SPDX-License-Identifier: Apache-2.0 */ - #pragma once +#include + #include -#include -#include #include +#include #include -#include - -#include "umd/device/tt_cluster_descriptor.h" // For chip_id_t +#include #include "hwloc.h" +#include "umd/device/tt_cluster_descriptor.h" // For chip_id_t using tt_cluster_description = tt_ClusterDescriptor; @@ -27,90 +26,87 @@ namespace cpuset { // CPU ID allocator for pinning threads to 
cpu_ids // It's a singleton that should be retrieved via get() struct tt_cpuset_allocator { - public: - - tt_cpuset_allocator(tt_cpuset_allocator const&) = delete; - void operator=(tt_cpuset_allocator const&) = delete; - - // Bind an already allocated memory region to particular numa nodes - static bool bind_area_to_memory_nodeset(chip_id_t physical_device_id, const void * addr, size_t len){ - auto& instance = tt_cpuset_allocator::get(); - return instance.bind_area_memory_nodeset(physical_device_id, addr, len); - } - - static int get_num_tt_pci_devices(){ - auto& instance = tt_cpuset_allocator::get(); - return instance._get_num_tt_pci_devices(); - } - - static int get_num_tt_pci_devices_by_pci_device_id(uint16_t device_id, uint16_t revision_id){ - auto& instance = tt_cpuset_allocator::get(); - return instance._get_num_tt_pci_devices_by_pci_device_id(device_id, revision_id); - } - - private: - - static tt_cpuset_allocator& get() { - static tt_cpuset_allocator instance; - return instance; - } - - tt_cpuset_allocator(); - - int TENSTORRENT_VENDOR_ID = 0x1e52; - - bool bind_area_memory_nodeset(chip_id_t physical_device_id, const void * addr, size_t len); - int _get_num_tt_pci_devices(); - int _get_num_tt_pci_devices_by_pci_device_id(uint16_t device_id, uint16_t revision_id); - - // Series of init functions, must be called in this order. Seperated out to support - // early exit in case of errors. 
- bool init_topology_init_and_load(); - bool init_find_tt_pci_devices_packages_numanodes(); - bool init_get_number_of_packages(); - bool init_is_cpu_model_supported(); - bool init_determine_cpuset_allocations(); - - // Helper Functions - std::string get_pci_bus_id(hwloc_obj_t pci_device_obj); - int get_package_id_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id); - hwloc_nodeset_t get_numa_nodeset_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id); - - // Debug Functions - void print_hwloc_cpuset(hwloc_obj_t &obj); - void print_hwloc_nodeset(hwloc_obj_t &obj); - void print_hwloc_object(hwloc_obj_t &obj, int depth = 0, bool verbose = false, bool show_cpuids = true); - std::vector get_hwloc_bitmap_vector(hwloc_bitmap_t &bitmap); - std::vector get_hwloc_cpuset_vector(hwloc_obj_t &obj); - std::vector get_hwloc_nodeset_vector(hwloc_obj_t &obj); - hwloc_topology_t m_topology; - bool m_debug; - pid_t m_pid; - - // Items calculated by parsing system info, used by allocation algorithm: - std::map> m_package_id_to_devices_map; - std::map m_physical_device_id_to_pci_bus_id_map; // Debug/Info - std::map, int> m_num_tt_device_by_pci_device_id_map; - - std::map> m_physical_device_id_to_cpusets_map; - std::map m_physical_device_id_to_package_id_map; - - bool m_enable_cpuset_allocator = true; // Enable feature, otherwise do nothing. - int m_num_packages = 0; - std::vector m_all_tt_devices = {}; - - hwloc_obj_type_t m_object_per_alloc_slot = HWLOC_OBJ_L3CACHE; // Default +public: + tt_cpuset_allocator(tt_cpuset_allocator const &) = delete; + void operator=(tt_cpuset_allocator const &) = delete; + + // Bind an already allocated memory region to particular numa nodes + static bool bind_area_to_memory_nodeset(chip_id_t physical_device_id, const void *addr, size_t len) { + auto &instance = tt_cpuset_allocator::get(); + return instance.bind_area_memory_nodeset(physical_device_id, addr, len); + } - // For 2CCX-PER-CCD Optimization detection. 
- std::map m_package_id_to_num_l3_per_ccx_map; - std::map m_package_id_to_num_ccx_per_ccd_map; + static int get_num_tt_pci_devices() { + auto &instance = tt_cpuset_allocator::get(); + return instance._get_num_tt_pci_devices(); + } - // Memory Binding - std::map m_physical_device_id_to_numa_nodeset_map; + static int get_num_tt_pci_devices_by_pci_device_id(uint16_t device_id, uint16_t revision_id) { + auto &instance = tt_cpuset_allocator::get(); + return instance._get_num_tt_pci_devices_by_pci_device_id(device_id, revision_id); + } - // Helper for some dynamic multi-threading. - std::map m_num_cpu_cores_allocated_per_tt_device; +private: + static tt_cpuset_allocator &get() { + static tt_cpuset_allocator instance; + return instance; + } + tt_cpuset_allocator(); + + int TENSTORRENT_VENDOR_ID = 0x1e52; + + bool bind_area_memory_nodeset(chip_id_t physical_device_id, const void *addr, size_t len); + int _get_num_tt_pci_devices(); + int _get_num_tt_pci_devices_by_pci_device_id(uint16_t device_id, uint16_t revision_id); + + // Series of init functions, must be called in this order. Seperated out to support + // early exit in case of errors. 
+ bool init_topology_init_and_load(); + bool init_find_tt_pci_devices_packages_numanodes(); + bool init_get_number_of_packages(); + bool init_is_cpu_model_supported(); + bool init_determine_cpuset_allocations(); + + // Helper Functions + std::string get_pci_bus_id(hwloc_obj_t pci_device_obj); + int get_package_id_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id); + hwloc_nodeset_t get_numa_nodeset_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id); + + // Debug Functions + void print_hwloc_cpuset(hwloc_obj_t &obj); + void print_hwloc_nodeset(hwloc_obj_t &obj); + void print_hwloc_object(hwloc_obj_t &obj, int depth = 0, bool verbose = false, bool show_cpuids = true); + std::vector get_hwloc_bitmap_vector(hwloc_bitmap_t &bitmap); + std::vector get_hwloc_cpuset_vector(hwloc_obj_t &obj); + std::vector get_hwloc_nodeset_vector(hwloc_obj_t &obj); + hwloc_topology_t m_topology; + bool m_debug; + pid_t m_pid; + + // Items calculated by parsing system info, used by allocation algorithm: + std::map> m_package_id_to_devices_map; + std::map m_physical_device_id_to_pci_bus_id_map; // Debug/Info + std::map, int> m_num_tt_device_by_pci_device_id_map; + + std::map> m_physical_device_id_to_cpusets_map; + std::map m_physical_device_id_to_package_id_map; + + bool m_enable_cpuset_allocator = true; // Enable feature, otherwise do nothing. + int m_num_packages = 0; + std::vector m_all_tt_devices = {}; + + hwloc_obj_type_t m_object_per_alloc_slot = HWLOC_OBJ_L3CACHE; // Default + + // For 2CCX-PER-CCD Optimization detection. + std::map m_package_id_to_num_l3_per_ccx_map; + std::map m_package_id_to_num_ccx_per_ccd_map; + + // Memory Binding + std::map m_physical_device_id_to_numa_nodeset_map; + + // Helper for some dynamic multi-threading. 
+ std::map m_num_cpu_cores_allocated_per_tt_device; }; template diff --git a/device/grayskull/grayskull_coordinate_manager.h b/device/grayskull/grayskull_coordinate_manager.h index acecbf22..cac8b29a 100644 --- a/device/grayskull/grayskull_coordinate_manager.h +++ b/device/grayskull/grayskull_coordinate_manager.h @@ -9,8 +9,8 @@ #include "umd/device/coordinate_manager.h" class GrayskullCoordinateManager : public CoordinateManager { - public: - GrayskullCoordinateManager(const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) - : CoordinateManager(worker_grid_size, workers, harvesting_mask) {} + GrayskullCoordinateManager( + const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) : + CoordinateManager(worker_grid_size, workers, harvesting_mask) {} }; diff --git a/device/grayskull/grayskull_implementation.cpp b/device/grayskull/grayskull_implementation.cpp index b7199873..209f1c42 100644 --- a/device/grayskull/grayskull_implementation.cpp +++ b/device/grayskull/grayskull_implementation.cpp @@ -4,13 +4,12 @@ #include "umd/device/grayskull_implementation.h" -#include "grayskull/host_mem_address_map.h" #include "grayskull/eth_interface.h" - +#include "grayskull/host_mem_address_map.h" #include "umd/device/cluster.h" -constexpr std::uint32_t NOC_ADDR_LOCAL_BITS = 32; // source: noc_parameters.h, unique for GS -constexpr std::uint32_t NOC_ADDR_NODE_ID_BITS = 6; // source: noc_parameters.h, common for GS && WH && BH +constexpr std::uint32_t NOC_ADDR_LOCAL_BITS = 32; // source: noc_parameters.h, unique for GS +constexpr std::uint32_t NOC_ADDR_NODE_ID_BITS = 6; // source: noc_parameters.h, common for GS && WH && BH namespace tt::umd { @@ -90,7 +89,9 @@ std::pair grayskull_implementation::get_tlb_data( } tt_driver_host_address_params grayskull_implementation::get_host_address_params() const { - return {::grayskull::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE, 
::grayskull::host_mem::address_map::ETH_ROUTING_BUFFERS_START}; + return { + ::grayskull::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE, + ::grayskull::host_mem::address_map::ETH_ROUTING_BUFFERS_START}; } tt_driver_eth_interface_params grayskull_implementation::get_eth_interface_params() const { diff --git a/device/hugepage.cpp b/device/hugepage.cpp index e9c45d63..8883bff2 100644 --- a/device/hugepage.cpp +++ b/device/hugepage.cpp @@ -6,11 +6,11 @@ #include "umd/device/hugepage.h" -#include // for umask -#include // for O_RDWR and other constants +#include // for O_RDWR and other constants +#include // for umask -#include "logger.hpp" #include "cpuset_lib.hpp" +#include "logger.hpp" const uint32_t g_MAX_HOST_MEM_CHANNELS = 4; @@ -20,13 +20,12 @@ std::string hugepage_dir = hugepage_dir_env ? hugepage_dir_env : "/dev/hugepages namespace tt::umd { -uint32_t get_num_hugepages(){ - +uint32_t get_num_hugepages() { std::string nr_hugepages_path = "/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages"; std::ifstream hugepages_file(nr_hugepages_path); uint32_t num_hugepages = 0; - if(hugepages_file.is_open()) { + if (hugepages_file.is_open()) { std::string value; std::getline(hugepages_file, value); num_hugepages = std::stoi(value); @@ -36,100 +35,121 @@ uint32_t get_num_hugepages(){ } return num_hugepages; - } -uint32_t get_available_num_host_mem_channels(const uint32_t num_channels_per_device_target, const uint16_t device_id, const uint16_t revision_id) { - +uint32_t get_available_num_host_mem_channels( + const uint32_t num_channels_per_device_target, const uint16_t device_id, const uint16_t revision_id) { // To minimally support hybrid dev systems with mix of ARCH, get only devices matching current ARCH's device_id. 
- uint32_t total_num_tt_mmio_devices = tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices(); - uint32_t num_tt_mmio_devices_for_arch = tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices_by_pci_device_id(device_id, revision_id); - uint32_t total_hugepages = get_num_hugepages(); + uint32_t total_num_tt_mmio_devices = tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices(); + uint32_t num_tt_mmio_devices_for_arch = + tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices_by_pci_device_id(device_id, revision_id); + uint32_t total_hugepages = get_num_hugepages(); // This shouldn't happen on silicon machines. if (num_tt_mmio_devices_for_arch == 0) { - log_warning(LogSiliconDriver, + log_warning( + LogSiliconDriver, "No TT devices found that match PCI device_id: 0x{:x} revision: {}, returning NumHostMemChannels:0", - device_id, revision_id); + device_id, + revision_id); return 0; } - // GS will use P2P + 1 channel, others may support 4 host channels. Apply min of 1 to not completely break setups that were incomplete - // ie fewer hugepages than devices, which would partially work previously for some devices. - uint32_t num_channels_per_device_available = std::min(num_channels_per_device_target, std::max((uint32_t) 1, total_hugepages / num_tt_mmio_devices_for_arch)); + // GS will use P2P + 1 channel, others may support 4 host channels. Apply min of 1 to not completely break setups + // that were incomplete ie fewer hugepages than devices, which would partially work previously for some devices. + uint32_t num_channels_per_device_available = + std::min(num_channels_per_device_target, std::max((uint32_t)1, total_hugepages / num_tt_mmio_devices_for_arch)); - // Perform some helpful assertion checks to guard against common pitfalls that would show up as runtime issues later on. + // Perform some helpful assertion checks to guard against common pitfalls that would show up as runtime issues later + // on. 
if (total_num_tt_mmio_devices > num_tt_mmio_devices_for_arch) { - log_warning(LogSiliconDriver, - "Hybrid system mixing different TTDevices - this is not well supported. Ensure sufficient Hugepages/HostMemChannels per device."); + log_warning( + LogSiliconDriver, + "Hybrid system mixing different TTDevices - this is not well supported. Ensure sufficient " + "Hugepages/HostMemChannels per device."); } if (total_hugepages < num_tt_mmio_devices_for_arch) { - log_warning(LogSiliconDriver, - "Insufficient NumHugepages: {} should be at least NumMMIODevices: {} for device_id: 0x{:x} revision: {}. NumHostMemChannels would be 0, bumping to 1.", - total_hugepages, num_tt_mmio_devices_for_arch, device_id, revision_id); + log_warning( + LogSiliconDriver, + "Insufficient NumHugepages: {} should be at least NumMMIODevices: {} for device_id: 0x{:x} revision: {}. " + "NumHostMemChannels would be 0, bumping to 1.", + total_hugepages, + num_tt_mmio_devices_for_arch, + device_id, + revision_id); } if (num_channels_per_device_available < num_channels_per_device_target) { - log_warning(LogSiliconDriver, - "NumHostMemChannels: {} used for device_id: 0x{:x} less than target: {}. Workload will fail if it exceeds NumHostMemChannels. Increase Number of Hugepages.", - num_channels_per_device_available, device_id, num_channels_per_device_target); + log_warning( + LogSiliconDriver, + "NumHostMemChannels: {} used for device_id: 0x{:x} less than target: {}. Workload will fail if it exceeds " + "NumHostMemChannels. 
Increase Number of Hugepages.", + num_channels_per_device_available, + device_id, + num_channels_per_device_target); } - log_assert(num_channels_per_device_available <= g_MAX_HOST_MEM_CHANNELS, + log_assert( + num_channels_per_device_available <= g_MAX_HOST_MEM_CHANNELS, "NumHostMemChannels: {} exceeds supported maximum: {}, this is unexpected.", - num_channels_per_device_available, g_MAX_HOST_MEM_CHANNELS); + num_channels_per_device_available, + g_MAX_HOST_MEM_CHANNELS); return num_channels_per_device_available; - } -std::string find_hugepage_dir(std::size_t pagesize) -{ - - static const std::regex hugetlbfs_mount_re(fmt::format("^(nodev|hugetlbfs) ({}) hugetlbfs ([^ ]+) 0 0$", hugepage_dir)); +std::string find_hugepage_dir(std::size_t pagesize) { + static const std::regex hugetlbfs_mount_re( + fmt::format("^(nodev|hugetlbfs) ({}) hugetlbfs ([^ ]+) 0 0$", hugepage_dir)); static const std::regex pagesize_re("(?:^|,)pagesize=([0-9]+)([KMGT])(?:,|$)"); std::ifstream proc_mounts("/proc/mounts"); - for (std::string line; std::getline(proc_mounts, line); ) - { - if (std::smatch mount_match; std::regex_match(line, mount_match, hugetlbfs_mount_re)) - { + for (std::string line; std::getline(proc_mounts, line);) { + if (std::smatch mount_match; std::regex_match(line, mount_match, hugetlbfs_mount_re)) { std::string options = mount_match[3]; - if (std::smatch pagesize_match; std::regex_search(options, pagesize_match, pagesize_re)) - { + if (std::smatch pagesize_match; std::regex_search(options, pagesize_match, pagesize_re)) { std::size_t mount_page_size = std::stoull(pagesize_match[1]); - switch (pagesize_match[2].str()[0]) - { - case 'T': mount_page_size <<= 10; - case 'G': mount_page_size <<= 10; - case 'M': mount_page_size <<= 10; - case 'K': mount_page_size <<= 10; + switch (pagesize_match[2].str()[0]) { + case 'T': + mount_page_size <<= 10; + case 'G': + mount_page_size <<= 10; + case 'M': + mount_page_size <<= 10; + case 'K': + mount_page_size <<= 10; } - if 
(mount_page_size == pagesize) - { + if (mount_page_size == pagesize) { return mount_match[2]; } } } } - log_warning(LogSiliconDriver, "ttSiliconDevice::find_hugepage_dir: no huge page mount found in /proc/mounts for path: {} with hugepage_size: {}.", hugepage_dir, pagesize); + log_warning( + LogSiliconDriver, + "ttSiliconDevice::find_hugepage_dir: no huge page mount found in /proc/mounts for path: {} with hugepage_size: " + "{}.", + hugepage_dir, + pagesize); return std::string(); } -int open_hugepage_file(const std::string &dir, chip_id_t physical_device_id, uint16_t channel) { +int open_hugepage_file(const std::string& dir, chip_id_t physical_device_id, uint16_t channel) { std::vector filename; static const char pipeline_name[] = "tenstorrent"; filename.insert(filename.end(), dir.begin(), dir.end()); - if (filename.back() != '/') filename.push_back('/'); + if (filename.back() != '/') { + filename.push_back('/'); + } // In order to limit number of hugepages while transition from shared hugepage (1 per system) to unique // hugepage per device, will share original/shared hugepage filename with physical device 0. - if (physical_device_id != 0 || channel != 0){ + if (physical_device_id != 0 || channel != 0) { std::string device_id_str = fmt::format("device_{}_", physical_device_id); filename.insert(filename.end(), device_id_str.begin(), device_id_str.end()); } @@ -139,20 +159,32 @@ int open_hugepage_file(const std::string &dir, chip_id_t physical_device_id, uin filename.insert(filename.end(), channel_id_str.begin(), channel_id_str.end()); } - filename.insert(filename.end(), std::begin(pipeline_name), std::end(pipeline_name)); // includes NUL terminator + filename.insert(filename.end(), std::begin(pipeline_name), std::end(pipeline_name)); // includes NUL terminator std::string filename_str(filename.begin(), filename.end()); - filename_str.erase(std::find(filename_str.begin(), filename_str.end(), '\0'), filename_str.end()); // Erase NULL terminator for printing. 
- log_debug(LogSiliconDriver, "ttSiliconDevice::open_hugepage_file: using filename: {} for physical_device_id: {} channel: {}", filename_str.c_str(), physical_device_id, channel); + filename_str.erase( + std::find(filename_str.begin(), filename_str.end(), '\0'), + filename_str.end()); // Erase NULL terminator for printing. + log_debug( + LogSiliconDriver, + "ttSiliconDevice::open_hugepage_file: using filename: {} for physical_device_id: {} channel: {}", + filename_str.c_str(), + physical_device_id, + channel); // Save original and set umask to unrestricted. auto old_umask = umask(0); - int fd = open(filename.data(), O_RDWR | O_CREAT | O_CLOEXEC, S_IWUSR | S_IRUSR | S_IWGRP | S_IRGRP | S_IWOTH | S_IROTH ); + int fd = + open(filename.data(), O_RDWR | O_CREAT | O_CLOEXEC, S_IWUSR | S_IRUSR | S_IWGRP | S_IRGRP | S_IWOTH | S_IROTH); if (fd == -1 && errno == EACCES) { - log_warning(LogSiliconDriver, "ttSiliconDevice::open_hugepage_file could not open filename: {} on first try, unlinking it and retrying.", filename_str); + log_warning( + LogSiliconDriver, + "ttSiliconDevice::open_hugepage_file could not open filename: {} on first try, unlinking it and retrying.", + filename_str); unlink(filename.data()); - fd = open(filename.data(), O_RDWR | O_CREAT | O_CLOEXEC, S_IWUSR | S_IRUSR | S_IWGRP | S_IRGRP | S_IWOTH | S_IROTH ); + fd = open( + filename.data(), O_RDWR | O_CREAT | O_CLOEXEC, S_IWUSR | S_IRUSR | S_IWGRP | S_IRGRP | S_IWOTH | S_IROTH); } // Restore original mask @@ -166,4 +198,4 @@ int open_hugepage_file(const std::string &dir, chip_id_t physical_device_id, uin return fd; } -} // namespace tt::umd +} // namespace tt::umd diff --git a/device/ioctl.h b/device/ioctl.h index 60ec7b2f..1f732cfc 100644 --- a/device/ioctl.h +++ b/device/ioctl.h @@ -4,6 +4,9 @@ * SPDX-License-Identifier: Apache-2.0 */ +// clang-format off +// This file is copied from KMD, so we don't want clang formatting diff. 
+ #ifndef TTDRIVER_IOCTL_H_INCLUDED #define TTDRIVER_IOCTL_H_INCLUDED @@ -155,3 +158,4 @@ struct tenstorrent_pin_pages { }; #endif +// clang-format on diff --git a/device/mockup/tt_mockup_device.hpp b/device/mockup/tt_mockup_device.hpp index 25985407..15107ebc 100644 --- a/device/mockup/tt_mockup_device.hpp +++ b/device/mockup/tt_mockup_device.hpp @@ -9,31 +9,42 @@ #include #include -#include "umd/device/tt_cluster_descriptor.h" #include "umd/device/cluster.h" +#include "umd/device/tt_cluster_descriptor.h" class tt_MockupDevice : public tt_device { - public: - tt_MockupDevice(const std::string& sdesc_path) : tt_device(sdesc_path) { +public: + tt_MockupDevice(const std::string& sdesc_path) : tt_device() { soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); std::set target_devices = {0}; } + virtual ~tt_MockupDevice() {} // Setup/Teardown Functions virtual std::unordered_map& get_virtual_soc_descriptors() override { return soc_descriptor_per_chip; } + void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) override {} + void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_) override {} + void set_driver_host_address_params(const tt_driver_host_address_params& host_address_params_) override {} - void set_driver_eth_interface_params( - const tt_driver_eth_interface_params& eth_interface_params_) override {} + + void set_driver_eth_interface_params(const tt_driver_eth_interface_params& eth_interface_params_) override {} + void start_device(const tt_device_params& device_params) override {} + void assert_risc_reset() override {} + void deassert_risc_reset() override {} - void deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets = TENSIX_DEASSERT_SOFT_RESET) override {} + + void deassert_risc_reset_at_core( + tt_cxy_pair core, const TensixSoftResetOptions& soft_resets = TENSIX_DEASSERT_SOFT_RESET) override {} + void 
assert_risc_reset_at_core(tt_cxy_pair core) override {} + void close_device() override {} // Runtime Functions @@ -43,10 +54,13 @@ class tt_MockupDevice : public tt_device { tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) override {} + void read_from_device( void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) override {} + void write_to_sysmem( const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id) override {} + void read_from_sysmem( void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id) override {} @@ -54,10 +68,12 @@ class tt_MockupDevice : public tt_device { const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}) override {} + void dram_membar( const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels = {}) override {} + void dram_membar( const chip_id_t chip, const std::string& fallback_tlb, @@ -66,27 +82,35 @@ class tt_MockupDevice : public tt_device { void wait_for_non_mmio_flush() override {} // Misc. 
Functions to Query/Set Device State - std::unordered_map get_harvesting_masks_for_soc_descriptors() override { - return {{0, 0}}; - } + std::unordered_map get_harvesting_masks_for_soc_descriptors() override { return {{0, 0}}; } + static std::vector detect_available_device_ids() { return {0}; }; + std::set get_target_remote_device_ids() override { return target_remote_chips; } + std::map get_clocks() override { return {{0, 0}}; } + void* host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const override { return nullptr; } + std::uint64_t get_pcie_base_addr_from_device(const chip_id_t chip_id) const override { return 0; } + std::uint32_t get_num_dram_channels(std::uint32_t device_id) override { return get_soc_descriptor(device_id).get_num_dram_channels(); }; + std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) override { return get_soc_descriptor(device_id).dram_bank_size; } + std::uint32_t get_num_host_channels(std::uint32_t device_id) override { return 1; } + std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) override { return 0; } + std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id) override { return 0; } - private: +private: std::vector archs_in_cluster = {}; std::set target_devices_in_cluster = {}; std::set target_remote_chips = {}; diff --git a/device/pcie/pci_device.cpp b/device/pcie/pci_device.cpp index 433fe4bf..bdf40962 100644 --- a/device/pcie/pci_device.cpp +++ b/device/pcie/pci_device.cpp @@ -4,27 +4,27 @@ * SPDX-License-Identifier: Apache-2.0 */ +#include "umd/device/pci_device.hpp" + +#include // for ::open +#include // for PCI_SLOT, PCI_FUNC +#include // for ioctl +#include // for mmap, munmap +#include // for fstat +#include // for ::close + #include -#include // for memcpy +#include // for memcpy #include -#include // for ::open -#include // for ::close -#include // for ioctl -#include // for mmap, munmap -#include // for fstat 
-#include // for PCI_SLOT, PCI_FUNC - -#include "umd/device/pci_device.hpp" -#include "ioctl.h" +#include "assert.hpp" +#include "cpuset_lib.hpp" #include "ioctl.h" -#include "umd/device/tt_arch_types.h" -#include "umd/device/driver_atomics.h" +#include "logger.hpp" #include "umd/device/architecture_implementation.h" -#include "cpuset_lib.hpp" +#include "umd/device/driver_atomics.h" #include "umd/device/hugepage.h" -#include "assert.hpp" -#include "logger.hpp" +#include "umd/device/tt_arch_types.h" static const uint16_t GS_PCIE_DEVICE_ID = 0xfaca; static const uint16_t WH_PCIE_DEVICE_ID = 0x401e; @@ -32,25 +32,29 @@ static const uint16_t BH_PCIE_DEVICE_ID = 0xb140; // TODO: we'll have to rethink this when KMD takes control of the inbound PCIe // TLB windows and there is no longer a pre-defined WC/UC split. -static const uint32_t GS_BAR0_WC_MAPPING_SIZE = (156<<20) + (10<<21) + (18<<24); +static const uint32_t GS_BAR0_WC_MAPPING_SIZE = (156 << 20) + (10 << 21) + (18 << 24); // Defines the address for WC region. 
addresses 0 to BH_BAR0_WC_MAPPING_SIZE are in WC, above that are UC -static const uint32_t BH_BAR0_WC_MAPPING_SIZE = 188<<21; +static const uint32_t BH_BAR0_WC_MAPPING_SIZE = 188 << 21; static const uint32_t BH_NOC_NODE_ID_OFFSET = 0x1FD04044; static const uint32_t GS_WH_ARC_SCRATCH_6_OFFSET = 0x1FF30078; // Hugepages must be 1GB in size -const uint32_t HUGEPAGE_REGION_SIZE = 1 << 30; // 1GB +const uint32_t HUGEPAGE_REGION_SIZE = 1 << 30; // 1GB using namespace tt; using namespace tt::umd; template static T read_sysfs(const PciDeviceInfo &device_info, const std::string &attribute_name) { - const auto sysfs_path = fmt::format("/sys/bus/pci/devices/{:04x}:{:02x}:{:02x}.{:x}/{}", - device_info.pci_domain, device_info.pci_bus, - device_info.pci_device, device_info.pci_function, attribute_name); + const auto sysfs_path = fmt::format( + "/sys/bus/pci/devices/{:04x}:{:02x}:{:02x}.{:x}/{}", + device_info.pci_domain, + device_info.pci_bus, + device_info.pci_device, + device_info.pci_function, + attribute_name); std::ifstream attribute_file(sysfs_path); std::string value_str; T value; @@ -75,8 +79,7 @@ static T read_sysfs(const PciDeviceInfo &device_info, const std::string &attribu return value; } -static PciDeviceInfo read_device_info(int fd) -{ +static PciDeviceInfo read_device_info(int fd) { tenstorrent_get_device_info info{}; info.in.output_size_bytes = sizeof(info.out); @@ -92,11 +95,11 @@ static PciDeviceInfo read_device_info(int fd) } static tt::ARCH detect_arch(uint32_t pcie_device_id, uint32_t pcie_revision_id) { - if (pcie_device_id == GS_PCIE_DEVICE_ID){ + if (pcie_device_id == GS_PCIE_DEVICE_ID) { return tt::ARCH::GRAYSKULL; - } else if (pcie_device_id == WH_PCIE_DEVICE_ID && pcie_revision_id == 0x01){ + } else if (pcie_device_id == WH_PCIE_DEVICE_ID && pcie_revision_id == 0x01) { return tt::ARCH::WORMHOLE_B0; - } else if (pcie_device_id == BH_PCIE_DEVICE_ID){ + } else if (pcie_device_id == BH_PCIE_DEVICE_ID) { return tt::ARCH::BLACKHOLE; } else { 
TT_THROW("Unknown pcie device id that does not match any known architecture: ", pcie_device_id); @@ -122,28 +125,29 @@ inline void memcpy_to_device(void *dest, const void *src, std::size_t num_bytes) if (dest_misalignment != 0) { // Read-modify-write for the first dest element. - dp = reinterpret_cast(dest_addr - dest_misalignment); + dp = reinterpret_cast(dest_addr - dest_misalignment); copy_t tmp = *dp; auto leading_len = std::min(sizeof(tmp) - dest_misalignment, num_bytes); - std::memcpy(reinterpret_cast(&tmp) + dest_misalignment, src, leading_len); + std::memcpy(reinterpret_cast(&tmp) + dest_misalignment, src, leading_len); num_bytes -= leading_len; src = static_cast(src) + leading_len; *dp++ = tmp; } else { - dp = static_cast(dest); + dp = static_cast(dest); } // Copy the destination-aligned middle. - const copy_t *sp = static_cast(src); + const copy_t *sp = static_cast(src); std::size_t num_words = num_bytes / sizeof(copy_t); - for (std::size_t i = 0; i < num_words; i++) + for (std::size_t i = 0; i < num_words; i++) { *dp++ = *sp++; + } // Finally copy any sub-word trailer, again RMW on the destination. auto trailing_len = num_bytes % sizeof(copy_t); @@ -166,7 +170,7 @@ inline void memcpy_from_device(void *dest, const void *src, std::size_t num_byte unsigned int src_misalignment = src_addr % sizeof(copy_t); if (src_misalignment != 0) { - sp = reinterpret_cast(src_addr - src_misalignment); + sp = reinterpret_cast(src_addr - src_misalignment); copy_t tmp = *sp++; @@ -176,15 +180,16 @@ inline void memcpy_from_device(void *dest, const void *src, std::size_t num_byte dest = static_cast(dest) + leading_len; } else { - sp = static_cast(src); + sp = static_cast(src); } // Copy the source-aligned middle. copy_t *dp = static_cast(dest); std::size_t num_words = num_bytes / sizeof(copy_t); - for (std::size_t i = 0; i < num_words; i++) + for (std::size_t i = 0; i < num_words; i++) { *dp++ = *sp++; + } // Finally copy any sub-word trailer. 
auto trailing_len = num_bytes % sizeof(copy_t); @@ -195,17 +200,16 @@ inline void memcpy_from_device(void *dest, const void *src, std::size_t num_byte } tt::ARCH PciDeviceInfo::get_arch() const { - if (this->device_id == GS_PCIE_DEVICE_ID){ + if (this->device_id == GS_PCIE_DEVICE_ID) { return tt::ARCH::GRAYSKULL; } else if (this->device_id == WH_PCIE_DEVICE_ID) { return tt::ARCH::WORMHOLE_B0; - } else if (this->device_id == BH_PCIE_DEVICE_ID){ + } else if (this->device_id == BH_PCIE_DEVICE_ID) { return tt::ARCH::BLACKHOLE; } return tt::ARCH::Invalid; } - /* static */ std::vector PCIDevice::enumerate_devices() { std::vector device_ids; std::string path = "/dev/tenstorrent/"; @@ -213,7 +217,7 @@ tt::ARCH PciDeviceInfo::get_arch() const { if (!std::filesystem::exists(path)) { return device_ids; } - for (const auto& entry : std::filesystem::directory_iterator(path)) { + for (const auto &entry : std::filesystem::directory_iterator(path)) { std::string filename = entry.path().filename().string(); // TODO: this will skip any device that has a non-numeric name, which @@ -237,28 +241,29 @@ tt::ARCH PciDeviceInfo::get_arch() const { try { infos[n] = read_device_info(fd); - } catch (...) {} + } catch (...) 
{ + } close(fd); } return infos; } -PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) - : device_path(fmt::format("/dev/tenstorrent/{}", pci_device_number)) - , pci_device_num(pci_device_number) - , logical_id(logical_device_id) - , pci_device_file_desc(open(device_path.c_str(), O_RDWR | O_CLOEXEC)) - , info(read_device_info(pci_device_file_desc)) - , numa_node(read_sysfs(info, "numa_node")) - , revision(read_sysfs(info, "revision")) - , arch(detect_arch(info.device_id, revision)) - , architecture_implementation(tt::umd::architecture_implementation::create(arch)) -{ +PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) : + device_path(fmt::format("/dev/tenstorrent/{}", pci_device_number)), + pci_device_num(pci_device_number), + logical_id(logical_device_id), + pci_device_file_desc(open(device_path.c_str(), O_RDWR | O_CLOEXEC)), + info(read_device_info(pci_device_file_desc)), + numa_node(read_sysfs(info, "numa_node")), + revision(read_sysfs(info, "revision")), + arch(detect_arch(info.device_id, revision)), + architecture_implementation(tt::umd::architecture_implementation::create(arch)) { struct { tenstorrent_query_mappings query_mappings; tenstorrent_mapping mapping_array[8]; } mappings; + memset(&mappings, 0, sizeof(mappings)); mappings.query_mappings.in.output_mapping_count = 8; @@ -302,7 +307,9 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) bar4_wc_mapping = mappings.mapping_array[i]; } - log_debug(LogSiliconDriver, "BAR mapping id {} base {} size {}", + log_debug( + LogSiliconDriver, + "BAR mapping id {} base {} size {}", mappings.mapping_array[i].mapping_id, (void *)mappings.mapping_array[i].mapping_base, mappings.mapping_array[i].mapping_size); @@ -317,7 +324,8 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) // Attempt WC mapping first so we can fall back to all-UC if it fails. 
if (bar0_wc_mapping.mapping_id == TENSTORRENT_MAPPING_RESOURCE0_WC) { bar0_wc_size = std::min(bar0_wc_mapping.mapping_size, wc_mapping_size); - bar0_wc = mmap(NULL, bar0_wc_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar0_wc_mapping.mapping_base); + bar0_wc = mmap( + NULL, bar0_wc_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar0_wc_mapping.mapping_base); if (bar0_wc == MAP_FAILED) { bar0_wc_size = 0; bar0_wc = nullptr; @@ -334,7 +342,13 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) bar0_uc_offset = 0; } - bar0_uc = mmap(NULL, bar0_uc_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar0_uc_mapping.mapping_base + bar0_uc_offset); + bar0_uc = mmap( + NULL, + bar0_uc_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, + pci_device_file_desc, + bar0_uc_mapping.mapping_base + bar0_uc_offset); if (bar0_uc == MAP_FAILED) { throw std::runtime_error(fmt::format("BAR0 UC mapping failed for device {}.", pci_device_num)); @@ -351,22 +365,34 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) system_reg_mapping_size = bar4_uc_mapping.mapping_size; - system_reg_mapping = mmap(NULL, bar4_uc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar4_uc_mapping.mapping_base); + system_reg_mapping = mmap( + NULL, + bar4_uc_mapping.mapping_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, + pci_device_file_desc, + bar4_uc_mapping.mapping_base); if (system_reg_mapping == MAP_FAILED) { throw std::runtime_error(fmt::format("BAR4 UC mapping failed for device {}.", pci_device_num)); } - system_reg_start_offset = (512 - 16) * 1024*1024; - system_reg_offset_adjust = (512 - 32) * 1024*1024; - } else if(arch == tt::ARCH::BLACKHOLE) { + system_reg_start_offset = (512 - 16) * 1024 * 1024; + system_reg_offset_adjust = (512 - 32) * 1024 * 1024; + } else if (arch == tt::ARCH::BLACKHOLE) { if (bar2_uc_mapping.mapping_id != TENSTORRENT_MAPPING_RESOURCE1_UC) { throw 
std::runtime_error(fmt::format("Device {} has no BAR2 UC mapping.", pci_device_num)); } // Using UnCachable memory mode. This is used for accessing registers on Blackhole. bar2_uc_size = bar2_uc_mapping.mapping_size; - bar2_uc = mmap(NULL, bar2_uc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar2_uc_mapping.mapping_base); + bar2_uc = mmap( + NULL, + bar2_uc_mapping.mapping_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, + pci_device_file_desc, + bar2_uc_mapping.mapping_base); if (bar2_uc == MAP_FAILED) { throw std::runtime_error(fmt::format("BAR2 UC mapping failed for device {}.", pci_device_num)); @@ -379,7 +405,13 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) // Using Write-Combine memory mode. This is used for accessing DRAM on Blackhole. // WC doesn't guarantee write ordering but has better performance. bar4_wc_size = bar4_wc_mapping.mapping_size; - bar4_wc = mmap(NULL, bar4_wc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar4_wc_mapping.mapping_base); + bar4_wc = mmap( + NULL, + bar4_wc_mapping.mapping_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, + pci_device_file_desc, + bar4_wc_mapping.mapping_base); if (bar4_wc == MAP_FAILED) { throw std::runtime_error(fmt::format("BAR4 WC mapping failed for device {}.", pci_device_num)); @@ -391,7 +423,7 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) } PCIDevice::~PCIDevice() { - for (const auto& hugepage_mapping : hugepage_mapping_per_channel) { + for (const auto &hugepage_mapping : hugepage_mapping_per_channel) { if (hugepage_mapping.mapping) { munmap(hugepage_mapping.mapping, hugepage_mapping.mapping_size); } @@ -405,8 +437,8 @@ PCIDevice::~PCIDevice() { // essential for correctness then it needs to move to the driver. 
uint64_t iatu_index = 0; uint64_t iatu_base = UNROLL_ATU_OFFSET_BAR + iatu_index * 0x200; - uint32_t region_ctrl_2 = 0 << 31; // REGION_EN = 0 - write_regs(reinterpret_cast(static_cast(bar2_uc) + iatu_base + 0x04), ®ion_ctrl_2, 1); + uint32_t region_ctrl_2 = 0 << 31; // REGION_EN = 0 + write_regs(reinterpret_cast(static_cast(bar2_uc) + iatu_base + 0x04), ®ion_ctrl_2, 1); } close(pci_device_file_desc); @@ -432,8 +464,8 @@ PCIDevice::~PCIDevice() { } } -template -T* PCIDevice::get_register_address(uint32_t register_offset) { +template +T *PCIDevice::get_register_address(uint32_t register_offset) { // Right now, address can either be exposed register in BAR, or TLB window in BAR0 (BAR4 for Blackhole). // Should clarify this interface void *reg_mapping; @@ -446,10 +478,10 @@ T* PCIDevice::get_register_address(uint32_t register_offset) { register_offset -= bar0_uc_offset; reg_mapping = bar0_uc; } - return reinterpret_cast(static_cast(reg_mapping) + register_offset); + return reinterpret_cast(static_cast(reg_mapping) + register_offset); } -void PCIDevice::write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t* buffer_addr) { +void PCIDevice::write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t *buffer_addr) { void *dest = nullptr; if (bar4_wc != nullptr && byte_addr >= BAR0_BH_SIZE) { byte_addr -= BAR0_BH_SIZE; @@ -466,7 +498,7 @@ void PCIDevice::write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_ } } -void PCIDevice::read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t* buffer_addr) { +void PCIDevice::read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t *buffer_addr) { void *src = nullptr; if (bar4_wc != nullptr && byte_addr >= BAR0_BH_SIZE) { byte_addr -= BAR0_BH_SIZE; @@ -483,7 +515,7 @@ void PCIDevice::read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t* buff } if (num_bytes >= sizeof(std::uint32_t)) { - detect_hang_read(*reinterpret_cast(dest)); + detect_hang_read(*reinterpret_cast(dest)); } } @@ -496,14 
+528,14 @@ void PCIDevice::write_regs(volatile uint32_t *dest, const uint32_t *src, uint32_ void PCIDevice::write_regs(uint32_t byte_addr, uint32_t word_len, const void *data) { volatile uint32_t *dest = get_register_address(byte_addr); - const uint32_t *src = reinterpret_cast(data); + const uint32_t *src = reinterpret_cast(data); write_regs(dest, src, word_len); } void PCIDevice::read_regs(uint32_t byte_addr, uint32_t word_len, void *data) { const volatile uint32_t *src = get_register_address(byte_addr); - uint32_t *dest = reinterpret_cast(data); + uint32_t *dest = reinterpret_cast(data); while (word_len-- != 0) { uint32_t temp = *src++; @@ -511,29 +543,34 @@ void PCIDevice::read_regs(uint32_t byte_addr, uint32_t word_len, void *data) { } } -void PCIDevice::write_tlb_reg(uint32_t byte_addr, uint64_t value_lower, uint64_t value_upper, uint32_t tlb_cfg_reg_size){ - log_assert((tlb_cfg_reg_size == 8) or (tlb_cfg_reg_size == 12), "Tenstorrent hardware supports only 64bit or 96bit TLB config regs"); +void PCIDevice::write_tlb_reg( + uint32_t byte_addr, uint64_t value_lower, uint64_t value_upper, uint32_t tlb_cfg_reg_size) { + log_assert( + (tlb_cfg_reg_size == 8) or (tlb_cfg_reg_size == 12), + "Tenstorrent hardware supports only 64bit or 96bit TLB config regs"); volatile uint64_t *dest_qw = get_register_address(byte_addr); - volatile uint32_t *dest_extra_dw = get_register_address(byte_addr+8); + volatile uint32_t *dest_extra_dw = get_register_address(byte_addr + 8); #if defined(__ARM_ARCH) || defined(__riscv) // The store below goes through UC memory on x86, which has implicit ordering constraints with WC accesses. - // ARM has no concept of UC memory. This will not allow for implicit ordering of this store wrt other memory accesses. - // Insert an explicit full memory barrier for ARM. - // Do the same for RISC-V. + // ARM has no concept of UC memory. This will not allow for implicit ordering of this store wrt other memory + // accesses. 
Insert an explicit full memory barrier for ARM. Do the same for RISC-V. tt_driver_atomics::mfence(); #endif *dest_qw = value_lower; if (tlb_cfg_reg_size > 8) { - uint32_t* p_value_upper = reinterpret_cast(&value_upper); + uint32_t *p_value_upper = reinterpret_cast(&value_upper); *dest_extra_dw = p_value_upper[0]; } - tt_driver_atomics::mfence(); // Otherwise subsequent WC loads move earlier than the above UC store to the TLB register. + tt_driver_atomics::mfence(); // Otherwise subsequent WC loads move earlier than the above UC store to the TLB + // register. } bool PCIDevice::is_hardware_hung() { - volatile const void *addr = reinterpret_cast(bar0_uc) + (get_architecture_implementation()->get_arc_reset_scratch_offset() + 6 * 4) - bar0_uc_offset; - std::uint32_t scratch_data = *reinterpret_cast(addr); + volatile const void *addr = reinterpret_cast(bar0_uc) + + (get_architecture_implementation()->get_arc_reset_scratch_offset() + 6 * 4) - + bar0_uc_offset; + std::uint32_t scratch_data = *reinterpret_cast(addr); return (scratch_data == c_hang_read_value); } @@ -547,55 +584,94 @@ void PCIDevice::detect_hang_read(std::uint32_t data_read) { } // Get TLB index (from zero), check if it's in 16MB, 2MB or 1MB TLB range, and dynamically program it. 
-dynamic_tlb PCIDevice::set_dynamic_tlb(unsigned int tlb_index, tt_xy_pair start, tt_xy_pair end, - std::uint64_t address, bool multicast, std::unordered_map>& harvested_coord_translation, std::uint64_t ordering) { +dynamic_tlb PCIDevice::set_dynamic_tlb( + unsigned int tlb_index, + tt_xy_pair start, + tt_xy_pair end, + std::uint64_t address, + bool multicast, + std::unordered_map> &harvested_coord_translation, + std::uint64_t ordering) { auto architecture_implementation = get_architecture_implementation(); if (multicast) { std::tie(start, end) = architecture_implementation->multicast_workaround(start, end); } - log_trace(LogSiliconDriver, "set_dynamic_tlb with arguments: tlb_index = {}, start = ({}, {}), end = ({}, {}), address = 0x{:x}, multicast = {}, ordering = {}", - tlb_index, start.x, start.y, end.x, end.y, address, multicast, (int)ordering); + log_trace( + LogSiliconDriver, + "set_dynamic_tlb with arguments: tlb_index = {}, start = ({}, {}), end = ({}, {}), address = 0x{:x}, multicast " + "= {}, ordering = {}", + tlb_index, + start.x, + start.y, + end.x, + end.y, + address, + multicast, + (int)ordering); tt::umd::tlb_configuration tlb_config = architecture_implementation->get_tlb_configuration(tlb_index); std::uint32_t TLB_CFG_REG_SIZE_BYTES = architecture_implementation->get_tlb_cfg_reg_size_bytes(); auto translated_start_coords = harvested_coord_translation.at(logical_id).at(start); auto translated_end_coords = harvested_coord_translation.at(logical_id).at(end); - uint32_t tlb_address = address / tlb_config.size; - uint32_t local_address = address % tlb_config.size; - uint64_t tlb_base = tlb_config.base + (tlb_config.size * tlb_config.index_offset); - uint32_t tlb_cfg_reg = tlb_config.cfg_addr + (TLB_CFG_REG_SIZE_BYTES * tlb_config.index_offset); - - std::pair tlb_data = tt::umd::tlb_data { - .local_offset = tlb_address, - .x_end = static_cast(translated_end_coords.x), - .y_end = static_cast(translated_end_coords.y), - .x_start = 
static_cast(translated_start_coords.x), - .y_start = static_cast(translated_start_coords.y), - .mcast = multicast, - .ordering = ordering, - // TODO #2715: hack for Blackhole A0, will potentially be fixed in B0. - // Using the same static vc for reads and writes through TLBs can hang the card. It doesn't even have to be the same TLB. - // Dynamic vc should not have this issue. There might be a perf impact with using dynamic vc. - .static_vc = (get_arch() == tt::ARCH::BLACKHOLE) ? false : true, - }.apply_offset(tlb_config.offset); - - log_debug(LogSiliconDriver, "set_dynamic_tlb() with tlb_index: {} tlb_index_offset: {} dynamic_tlb_size: {}MB tlb_base: 0x{:x} tlb_cfg_reg: 0x{:x}", tlb_index, tlb_config.index_offset, tlb_config.size/(1024*1024), tlb_base, tlb_cfg_reg); + uint32_t tlb_address = address / tlb_config.size; + uint32_t local_address = address % tlb_config.size; + uint64_t tlb_base = tlb_config.base + (tlb_config.size * tlb_config.index_offset); + uint32_t tlb_cfg_reg = tlb_config.cfg_addr + (TLB_CFG_REG_SIZE_BYTES * tlb_config.index_offset); + + std::pair tlb_data = + tt::umd::tlb_data{ + .local_offset = tlb_address, + .x_end = static_cast(translated_end_coords.x), + .y_end = static_cast(translated_end_coords.y), + .x_start = static_cast(translated_start_coords.x), + .y_start = static_cast(translated_start_coords.y), + .mcast = multicast, + .ordering = ordering, + // TODO #2715: hack for Blackhole A0, will potentially be fixed in B0. + // Using the same static vc for reads and writes through TLBs can hang the card. It doesn't even have to be + // the same TLB. Dynamic vc should not have this issue. There might be a perf impact with using dynamic vc. + .static_vc = (get_arch() == tt::ARCH::BLACKHOLE) ? 
false : true, + } + .apply_offset(tlb_config.offset); + + log_debug( + LogSiliconDriver, + "set_dynamic_tlb() with tlb_index: {} tlb_index_offset: {} dynamic_tlb_size: {}MB tlb_base: 0x{:x} " + "tlb_cfg_reg: 0x{:x}", + tlb_index, + tlb_config.index_offset, + tlb_config.size / (1024 * 1024), + tlb_base, + tlb_cfg_reg); write_tlb_reg(tlb_cfg_reg, tlb_data.first, tlb_data.second, TLB_CFG_REG_SIZE_BYTES); - return { tlb_base + local_address, tlb_config.size - local_address }; + return {tlb_base + local_address, tlb_config.size - local_address}; } -dynamic_tlb PCIDevice::set_dynamic_tlb(unsigned int tlb_index, tt_xy_pair target, std::uint64_t address, std::unordered_map>& harvested_coord_translation, std::uint64_t ordering) { +dynamic_tlb PCIDevice::set_dynamic_tlb( + unsigned int tlb_index, + tt_xy_pair target, + std::uint64_t address, + std::unordered_map> &harvested_coord_translation, + std::uint64_t ordering) { return set_dynamic_tlb(tlb_index, tt_xy_pair(0, 0), target, address, false, harvested_coord_translation, ordering); } -dynamic_tlb PCIDevice::set_dynamic_tlb_broadcast(unsigned int tlb_index, std::uint64_t address, std::unordered_map>& harvested_coord_translation, tt_xy_pair start, tt_xy_pair end, std::uint64_t ordering) { +dynamic_tlb PCIDevice::set_dynamic_tlb_broadcast( + unsigned int tlb_index, + std::uint64_t address, + std::unordered_map> &harvested_coord_translation, + tt_xy_pair start, + tt_xy_pair end, + std::uint64_t ordering) { // Issue a broadcast to cores included in the start (top left) and end (bottom right) grid return set_dynamic_tlb(tlb_index, start, end, address, true, harvested_coord_translation, ordering); } -tt::umd::architecture_implementation* PCIDevice::get_architecture_implementation() const {return architecture_implementation.get();} +tt::umd::architecture_implementation *PCIDevice::get_architecture_implementation() const { + return architecture_implementation.get(); +} bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) { 
const size_t hugepage_size = HUGEPAGE_REGION_SIZE; @@ -605,7 +681,10 @@ bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) { std::string hugepage_dir = find_hugepage_dir(hugepage_size); if (hugepage_dir.empty()) { - log_warning(LogSiliconDriver, "ttSiliconDevice::init_hugepage: no huge page mount found for hugepage_size: {}.", hugepage_size); + log_warning( + LogSiliconDriver, + "ttSiliconDevice::init_hugepage: no huge page mount found for hugepage_size: {}.", + hugepage_size); return false; } @@ -615,11 +694,14 @@ bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) { // Support for more than 1GB host memory accessible per device, via channels. for (int ch = 0; ch < num_host_mem_channels; ch++) { - int hugepage_fd = open_hugepage_file(hugepage_dir, physical_device_id, ch); if (hugepage_fd == -1) { // Probably a permissions problem. - log_warning(LogSiliconDriver, "ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} creating hugepage mapping file failed.", physical_device_id, ch); + log_warning( + LogSiliconDriver, + "ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} creating hugepage mapping file failed.", + physical_device_id, + ch); success = false; continue; } @@ -630,26 +712,43 @@ bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) { log_warning(LogSiliconDriver, "Error reading hugepage file size after opening."); } - std::byte *mapping = static_cast(mmap(nullptr, hugepage_size, PROT_READ|PROT_WRITE, MAP_SHARED | MAP_POPULATE, hugepage_fd, 0)); + std::byte *mapping = static_cast( + mmap(nullptr, hugepage_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, hugepage_fd, 0)); close(hugepage_fd); if (mapping == MAP_FAILED) { - log_warning(LogSiliconDriver, "UMD: Mapping a hugepage failed. (device: {}, {}/{} errno: {}).", physical_device_id, ch, num_host_mem_channels, strerror(errno)); + log_warning( + LogSiliconDriver, + "UMD: Mapping a hugepage failed. 
(device: {}, {}/{} errno: {}).", + physical_device_id, + ch, + num_host_mem_channels, + strerror(errno)); if (hugepage_st.st_size == 0) { - log_warning(LogSiliconDriver, "Opened hugepage file has zero size, mapping might've failed due to that. Verify that enough hugepages are provided."); + log_warning( + LogSiliconDriver, + "Opened hugepage file has zero size, mapping might've failed due to that. Verify that enough " + "hugepages are provided."); } - print_file_contents("/proc/cmdline");\ - print_file_contents("/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages"); // Hardcoded for 1GB hugepage. + print_file_contents("/proc/cmdline"); + print_file_contents( + "/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages"); // Hardcoded for 1GB hugepage. success = false; continue; } - // Beter performance if hugepage just allocated (populate flag to prevent lazy alloc) is migrated to same numanode as TT device. - if (!tt::cpuset::tt_cpuset_allocator::bind_area_to_memory_nodeset(physical_device_id, mapping, hugepage_size)){ - log_warning(LogSiliconDriver, "---- ttSiliconDevice::init_hugepage: bind_area_to_memory_nodeset() failed (physical_device_id: {} ch: {}). " - "Hugepage allocation is not on NumaNode matching TT Device. Side-Effect is decreased Device->Host perf (Issue #893).", - physical_device_id, ch); + // Beter performance if hugepage just allocated (populate flag to prevent lazy alloc) is migrated to same + // numanode as TT device. + if (!tt::cpuset::tt_cpuset_allocator::bind_area_to_memory_nodeset(physical_device_id, mapping, hugepage_size)) { + log_warning( + LogSiliconDriver, + "---- ttSiliconDevice::init_hugepage: bind_area_to_memory_nodeset() failed (physical_device_id: {} ch: " + "{}). " + "Hugepage allocation is not on NumaNode matching TT Device. 
Side-Effect is decreased Device->Host perf " + "(Issue #893).", + physical_device_id, + ch); } tenstorrent_pin_pages pin_pages; @@ -662,7 +761,13 @@ bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) { auto fd = get_fd(); if (ioctl(fd, TENSTORRENT_IOCTL_PIN_PAGES, &pin_pages) == -1) { - log_warning(LogSiliconDriver, "---- ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} TENSTORRENT_IOCTL_PIN_PAGES failed (errno: {}). Common Issue: Requires TTMKD >= 1.11, see following file contents...", physical_device_id, ch, strerror(errno)); + log_warning( + LogSiliconDriver, + "---- ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} TENSTORRENT_IOCTL_PIN_PAGES failed " + "(errno: {}). Common Issue: Requires TTMKD >= 1.11, see following file contents...", + physical_device_id, + ch, + strerror(errno)); munmap(mapping, hugepage_size); print_file_contents("/sys/module/tenstorrent/version", "(TTKMD version)"); print_file_contents("/proc/meminfo"); @@ -673,15 +778,19 @@ bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) { hugepage_mapping_per_channel[ch] = {mapping, hugepage_size, pin_pages.out.physical_address}; - log_debug(LogSiliconDriver, "ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} mapping_size: {} physical address 0x{:x}", physical_device_id, ch, hugepage_size, (unsigned long long)hugepage_mappings.at(device_id).at(ch).physical_address); + log_debug( + LogSiliconDriver, + "ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} mapping_size: {} physical address 0x{:x}", + physical_device_id, + ch, + hugepage_size, + (unsigned long long)hugepage_mappings.at(device_id).at(ch).physical_address); } return success; } -int PCIDevice::get_num_host_mem_channels() const { - return hugepage_mapping_per_channel.size(); -} +int PCIDevice::get_num_host_mem_channels() const { return hugepage_mapping_per_channel.size(); } hugepage_mapping PCIDevice::get_hugepage_mapping(int channel) const { if (channel < 0 || 
hugepage_mapping_per_channel.size() <= channel) { @@ -691,10 +800,10 @@ hugepage_mapping PCIDevice::get_hugepage_mapping(int channel) const { } } -void PCIDevice::print_file_contents(std::string filename, std::string hint){ - if (std::filesystem::exists(filename)){ +void PCIDevice::print_file_contents(std::string filename, std::string hint) { + if (std::filesystem::exists(filename)) { std::ifstream meminfo(filename); - if (meminfo.is_open()){ + if (meminfo.is_open()) { std::cout << std::endl << "File " << filename << " " << hint << " is: " << std::endl; std::cout << meminfo.rdbuf(); } diff --git a/device/simulation/deprecated/tt_emulation_device.cpp b/device/simulation/deprecated/tt_emulation_device.cpp index 25026737..e7d66893 100644 --- a/device/simulation/deprecated/tt_emulation_device.cpp +++ b/device/simulation/deprecated/tt_emulation_device.cpp @@ -3,193 +3,231 @@ * * SPDX-License-Identifier: Apache-2.0 */ -#include +#include "tt_emulation_device.h" + #include +#include #include "common/logger.hpp" #include "device/tt_cluster_descriptor.h" -#include "tt_emulation_device.h" #include "tt_emu_zemi3_wrapper.h" - tt_emulation_device::tt_emulation_device(const std::string& sdesc_path) : tt_device(sdesc_path) { - soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); - std::set target_devices = {0}; - // create just a default one, we do not have cluster anyway - ndesc = tt_ClusterDescriptor::create_for_grayskull_cluster(target_devices, {}); - tt_zebu_wrapper_inst = new tt_emu_zemi3_wrapper(); + soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); + std::set target_devices = {0}; + // create just a default one, we do not have cluster anyway + ndesc = tt_ClusterDescriptor::create_for_grayskull_cluster(target_devices, {}); + tt_zebu_wrapper_inst = new tt_emu_zemi3_wrapper(); - log_info(tt::LogEmulationDriver, "Created Emulation Device "); + log_info(tt::LogEmulationDriver, "Created Emulation Device "); } 
tt_emulation_device::~tt_emulation_device() { - ndesc.reset(); - delete tt_zebu_wrapper_inst; - log_info(tt::LogEmulationDriver, "Destroyed Emulation Device "); + ndesc.reset(); + delete tt_zebu_wrapper_inst; + log_info(tt::LogEmulationDriver, "Destroyed Emulation Device "); } - + void tt_emulation_device::write(tt_cxy_pair core, uint64_t addr, const std::vector& data) { - const uint32_t size = static_cast(data.size()); - tt_zebu_wrapper_inst->axi_write(0, core.x, core.y, addr, size, data); - log_info(tt::LogEmulationDriver, "Wrote {} bytes to address {:#016x}, core {},{}", size, addr, core.x, core.y); + const uint32_t size = static_cast(data.size()); + tt_zebu_wrapper_inst->axi_write(0, core.x, core.y, addr, size, data); + log_info(tt::LogEmulationDriver, "Wrote {} bytes to address {:#016x}, core {},{}", size, addr, core.x, core.y); } std::vector tt_emulation_device::read(tt_cxy_pair core, uint64_t addr, uint32_t size) { - std::vector data(size); - tt_zebu_wrapper_inst->axi_read(0, core.x, core.y, addr, size, data); - log_info(tt::LogEmulationDriver, "Read {} bytes from address {:#016x}", size, addr); + std::vector data(size); + tt_zebu_wrapper_inst->axi_read(0, core.x, core.y, addr, size, data); + log_info(tt::LogEmulationDriver, "Read {} bytes from address {:#016x}", size, addr); - return data; + return data; } - void tt_emulation_device::start_device(const tt_device_params& device_params) { - tt_zebu_wrapper_inst->zebu_start(); - tt_zebu_wrapper_inst->zebu_enable_waveform_dump(tt_zebu_wrapper::WAVEFORM_DUMP_QIWC); - log_info(tt::LogEmulationDriver, "Started Emulation Device "); + tt_zebu_wrapper_inst->zebu_start(); + tt_zebu_wrapper_inst->zebu_enable_waveform_dump(tt_zebu_wrapper::WAVEFORM_DUMP_QIWC); + log_info(tt::LogEmulationDriver, "Started Emulation Device "); } void tt_emulation_device::deassert_risc_reset() { - tt_zebu_wrapper_inst->all_tensix_reset_deassert(); - log_info(tt::LogEmulationDriver, "Deasserted all tensix RISC Reset "); + 
tt_zebu_wrapper_inst->all_tensix_reset_deassert(); + log_info(tt::LogEmulationDriver, "Deasserted all tensix RISC Reset "); } void tt_emulation_device::assert_risc_reset() { - tt_zebu_wrapper_inst->all_tensix_reset_assert(); - log_info(tt::LogEmulationDriver, "Asserted all tensix RISC Reset "); + tt_zebu_wrapper_inst->all_tensix_reset_assert(); + log_info(tt::LogEmulationDriver, "Asserted all tensix RISC Reset "); } -void tt_emulation_device::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets) { - tt_zebu_wrapper_inst->tensix_reset_deassert(core.x, core.y); +void tt_emulation_device::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions& soft_resets) { + tt_zebu_wrapper_inst->tensix_reset_deassert(core.x, core.y); } void tt_emulation_device::assert_risc_reset_at_core(tt_cxy_pair core) { - tt_zebu_wrapper_inst->tensix_reset_assert(core.x, core.y); + tt_zebu_wrapper_inst->tensix_reset_assert(core.x, core.y); } - - void tt_emulation_device::close_device() { log_info(tt::LogEmulationDriver, "Closing Emulation Device "); tt_zebu_wrapper_inst->zebu_finish(); } -void tt_emulation_device::start(std::vector plusargs, std::vector dump_cores, bool no_checkers, bool /*init_device*/, bool /*skip_driver_allocs*/ +void tt_emulation_device::start( + std::vector plusargs, + std::vector dump_cores, + bool no_checkers, + bool /*init_device*/, + bool /*skip_driver_allocs*/ ) { - log_info(tt::LogEmulationDriver, "Starting Emulation Device "); + log_info(tt::LogEmulationDriver, "Starting Emulation Device "); +} + +void tt_emulation_device::broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& cols_to_exclude, + const std::string& fallback_tlb) { + for (const auto& core : get_soc_descriptor(0)->cores) { + // if(cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and rows_to_exclude.find(core.first.y) == + 
// rows_to_exclude.end() and core.second.type != CoreType::HARVESTED) { + // write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); + // } + // MT: Iterate through all the worker cores for bcast: + // if (get_soc_descriptor(0)->is_worker_core(core.first)) { + // write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); + // } + // Emulation only broadcasts to all Tensix cores or all DRAM cores. + // differentiate which bcast pattern to use based on exclude columns + if (cols_to_exclude.find(0) == cols_to_exclude.end()) { + // Detect DRAM bcast + if (get_soc_descriptor(0)->is_dram_core(core.first)) { + write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); + } + } else { + if (get_soc_descriptor(0)->is_worker_core(core.first)) { + write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); + } + } + } } - -void tt_emulation_device::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& cols_to_exclude, const std::string& fallback_tlb) { - for(const auto& core : get_soc_descriptor(0) -> cores) { - // if(cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and rows_to_exclude.find(core.first.y) == rows_to_exclude.end() and core.second.type != CoreType::HARVESTED) { - // write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); - // } - // MT: Iterate through all the worker cores for bcast: - // if (get_soc_descriptor(0)->is_worker_core(core.first)) { - // write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); - // } - // Emulation only broadcasts to all Tensix cores or all DRAM cores. 
- // differentiate which bcast pattern to use based on exclude columns - if (cols_to_exclude.find(0) == cols_to_exclude.end()) { - // Detect DRAM bcast - if (get_soc_descriptor(0)->is_dram_core(core.first)) { - write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); - } - } else { - if (get_soc_descriptor(0)->is_worker_core(core.first)) { - write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); - } +void tt_emulation_device::rolled_write_to_device( + std::vector& base_vec, + uint32_t unroll_count, + tt_cxy_pair core, + uint64_t base_addr, + const std::string& tlb_to_use) { + std::vector vec = base_vec; + uint32_t byte_increment = 4 * vec.size(); + for (uint32_t i = 0; i < unroll_count; ++i) { + vec[0] = i; // slot id for debug + uint64_t offset_addr = base_addr + i * byte_increment; + write_to_device(vec, core, offset_addr, tlb_to_use); } - } -} -void tt_emulation_device::rolled_write_to_device(std::vector& base_vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t base_addr, const std::string& tlb_to_use) { - std::vector vec = base_vec; - uint32_t byte_increment = 4 * vec.size(); - for (uint32_t i = 0; i < unroll_count; ++i) { - vec[0] = i; // slot id for debug - uint64_t offset_addr = base_addr + i * byte_increment; - write_to_device(vec, core, offset_addr, tlb_to_use); - } } -void tt_emulation_device::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - log_assert(!(size % 4), "Writes to Emulation Backend should be 4 byte aligned!"); - std::vector mem_vector((uint32_t*)mem_ptr, (uint32_t*)mem_ptr + size / sizeof(uint32_t)); - write_to_device(mem_vector, core, addr, tlb_to_use, send_epoch_cmd, last_send_epoch_cmd, ordered_with_prev_remote_write); -} +void tt_emulation_device::write_to_device( + const void* mem_ptr, + uint32_t 
size, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write) { + log_assert(!(size % 4), "Writes to Emulation Backend should be 4 byte aligned!"); -void tt_emulation_device::write_to_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { + std::vector mem_vector((uint32_t*)mem_ptr, (uint32_t*)mem_ptr + size / sizeof(uint32_t)); + write_to_device( + mem_vector, core, addr, tlb_to_use, send_epoch_cmd, last_send_epoch_cmd, ordered_with_prev_remote_write); +} - std::vector byte_data(vec.size() * sizeof(uint32_t)); - std::memcpy(byte_data.data(), vec.data(), byte_data.size()); +void tt_emulation_device::write_to_device( + std::vector& vec, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write) { + std::vector byte_data(vec.size() * sizeof(uint32_t)); + std::memcpy(byte_data.data(), vec.data(), byte_data.size()); - write(core, addr, byte_data); + write(core, addr, byte_data); } -void tt_emulation_device::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { +void tt_emulation_device::l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { // Placeholder - implement later - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/open-umd/-/issues/26 } -void tt_emulation_device::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { +void tt_emulation_device::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { // Placeholder - implement later - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/open-umd/-/issues/26 } -void tt_emulation_device::dram_membar(const 
chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) { +void tt_emulation_device::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) { // Placeholder - implement later - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/open-umd/-/issues/26 } +void tt_emulation_device::read_from_device( + std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& /*tlb_to_use*/) { + std::vector byte_data = read(core, addr, size); + // Verify that the received byte data can be converted to uint32_t + // if (byte_data.size() % sizeof(uint32_t) != 0) { + // throw std::runtime_error("Received byte data size is not a multiple of uint32_t size."); + // } -void tt_emulation_device::read_from_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& /*tlb_to_use*/) { - std::vector byte_data = read(core, addr, size); - - // Verify that the received byte data can be converted to uint32_t - // if (byte_data.size() % sizeof(uint32_t) != 0) { - // throw std::runtime_error("Received byte data size is not a multiple of uint32_t size."); - // } - - vec.clear(); - vec.resize(byte_data.size() / sizeof(uint32_t)); - std::memcpy(vec.data(), byte_data.data(), byte_data.size()); + vec.clear(); + vec.resize(byte_data.size() / sizeof(uint32_t)); + std::memcpy(vec.data(), byte_data.data(), byte_data.size()); } void tt_emulation_device::translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) { - // No translation is performed - return; + // No translation is performed + return; } + tt_ClusterDescriptor* tt_emulation_device::get_cluster_description() { return ndesc.get(); } std::set tt_emulation_device::get_target_mmio_device_ids() { - log_error("LogEmulationDriver: get_target_mmio_device_ids not implemented"); - return {}; + log_error("LogEmulationDriver: get_target_mmio_device_ids not implemented"); + return {}; } std::set 
tt_emulation_device::get_target_remote_device_ids() { - log_error("LogEmulationDriver: get_target_remote_device_ids not implemented"); - return {}; + log_error("LogEmulationDriver: get_target_remote_device_ids not implemented"); + return {}; } void tt_emulation_device::set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_) { dram_address_params = dram_address_params_; } + int tt_emulation_device::get_number_of_chips_in_cluster() { return detect_number_of_chips(); } -std::unordered_set tt_emulation_device::get_all_chips_in_cluster() { return { 0 }; } + +std::unordered_set tt_emulation_device::get_all_chips_in_cluster() { return {0}; } + int tt_emulation_device::detect_number_of_chips() { return 1; } bool tt_emulation_device::using_harvested_soc_descriptors() { return false; } -bool tt_emulation_device::noc_translation_en() { return false; } -std::unordered_map tt_emulation_device::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}};} -std::unordered_map& tt_emulation_device::get_virtual_soc_descriptors() {return soc_descriptor_per_chip;} +bool tt_emulation_device::noc_translation_en() { return false; } -std::map tt_emulation_device::get_clocks() { - return std::map(); +std::unordered_map tt_emulation_device::get_harvesting_masks_for_soc_descriptors() { + return {{0, 0}}; } -void tt_emulation_device::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) { - l1_address_params = l1_address_params_; +std::unordered_map& tt_emulation_device::get_virtual_soc_descriptors() { + return soc_descriptor_per_chip; } +std::map tt_emulation_device::get_clocks() { return std::map(); } - +void tt_emulation_device::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) { + l1_address_params = l1_address_params_; +} diff --git a/device/simulation/deprecated/tt_emulation_device.h b/device/simulation/deprecated/tt_emulation_device.h index b15e2aaf..8c411d07 100644 --- 
a/device/simulation/deprecated/tt_emulation_device.h +++ b/device/simulation/deprecated/tt_emulation_device.h @@ -9,63 +9,97 @@ #include #include #include + +#include "cluster.h" #include "tt_soc_descriptor.h" #include "tt_xy_pair.h" -#include "cluster.h" // use forward declaration here so we do not need to include tt_zebu_wrapper.h class tt_zebu_wrapper; class tt_emulation_device : public tt_device { public: - virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); // Dont care - tt_emulation_device(const std::string& sdesc_path); - virtual void start(std::vector plusargs, std::vector dump_cores, bool no_checkers, bool init_device, bool skip_driver_allocs); - virtual void start_device(const tt_device_params& device_params); - virtual void close_device(); - virtual void deassert_risc_reset(); - virtual void deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets = TENSIX_DEASSERT_SOFT_RESET); - virtual void assert_risc_reset(); - virtual void assert_risc_reset_at_core(tt_cxy_pair core); - virtual void write_to_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& columns_to_exclude, const std::string& fallback_tlb); + virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); // Dont care + tt_emulation_device(const std::string& sdesc_path); + virtual void start( + std::vector plusargs, + 
std::vector dump_cores, + bool no_checkers, + bool init_device, + bool skip_driver_allocs); + virtual void start_device(const tt_device_params& device_params); + virtual void close_device(); + virtual void deassert_risc_reset(); + virtual void deassert_risc_reset_at_core( + tt_cxy_pair core, const TensixSoftResetOptions& soft_resets = TENSIX_DEASSERT_SOFT_RESET); + virtual void assert_risc_reset(); + virtual void assert_risc_reset_at_core(tt_cxy_pair core); + virtual void write_to_device( + std::vector& vec, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd = false, + bool last_send_epoch_cmd = true, + bool ordered_with_prev_remote_write = false); + virtual void write_to_device( + const void* mem_ptr, + uint32_t size_in_bytes, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd = false, + bool last_send_epoch_cmd = true, + bool ordered_with_prev_remote_write = false); + virtual void broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& columns_to_exclude, + const std::string& fallback_tlb); - void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + void l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - virtual void rolled_write_to_device(std::vector& base_vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t 
base_addr, const std::string& tlb_to_use); // See Versim Implementation - virtual void read_from_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); + virtual void rolled_write_to_device( + std::vector& base_vec, + uint32_t unroll_count, + tt_cxy_pair core, + uint64_t base_addr, + const std::string& tlb_to_use); // See Versim Implementation + virtual void read_from_device( + std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); - virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c); - virtual bool using_harvested_soc_descriptors(); - virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); - virtual std::unordered_map& get_virtual_soc_descriptors(); - virtual bool noc_translation_en(); - virtual std::set get_target_mmio_device_ids(); - virtual std::set get_target_remote_device_ids(); - virtual ~tt_emulation_device(); - virtual tt_ClusterDescriptor* get_cluster_description(); - virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_); - virtual int get_number_of_chips_in_cluster(); - virtual std::unordered_set get_all_chips_in_cluster(); - static int detect_number_of_chips(); - virtual std::map get_clocks(); -private: - - tt_device_l1_address_params l1_address_params; - std::shared_ptr ndesc; - tt_device_dram_address_params dram_address_params; - - // zebu wrapper, provides interface to zebu emulator device through axi and command transactors - tt_zebu_wrapper *tt_zebu_wrapper_inst = NULL; + virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c); + virtual bool using_harvested_soc_descriptors(); + virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); + virtual std::unordered_map& get_virtual_soc_descriptors(); + virtual bool noc_translation_en(); + virtual std::set get_target_mmio_device_ids(); + virtual std::set 
get_target_remote_device_ids(); + virtual ~tt_emulation_device(); + virtual tt_ClusterDescriptor* get_cluster_description(); + virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_); + virtual int get_number_of_chips_in_cluster(); + virtual std::unordered_set get_all_chips_in_cluster(); + static int detect_number_of_chips(); + virtual std::map get_clocks(); +private: + tt_device_l1_address_params l1_address_params; + std::shared_ptr ndesc; + tt_device_dram_address_params dram_address_params; + // zebu wrapper, provides interface to zebu emulator device through axi and command transactors + tt_zebu_wrapper* tt_zebu_wrapper_inst = NULL; - // These functions implement the "protocol" between the RTL simulation and the UMD - void write(tt_cxy_pair core, uint64_t addr, const std::vector& data); - std::vector read(tt_cxy_pair core, uint64_t addr, uint32_t size); - + // These functions implement the "protocol" between the RTL simulation and the UMD + void write(tt_cxy_pair core, uint64_t addr, const std::vector& data); + std::vector read(tt_cxy_pair core, uint64_t addr, uint32_t size); }; - diff --git a/device/simulation/deprecated/tt_emulation_stub.cpp b/device/simulation/deprecated/tt_emulation_stub.cpp index b841359f..bdd97b27 100644 --- a/device/simulation/deprecated/tt_emulation_stub.cpp +++ b/device/simulation/deprecated/tt_emulation_stub.cpp @@ -3,23 +3,21 @@ * * SPDX-License-Identifier: Apache-2.0 */ -#include #include +#include #include "common/logger.hpp" #include "tt_emulation_device.h" tt_emulation_device::tt_emulation_device(const std::string& sdesc_path) : tt_device(sdesc_path) { - throw std::runtime_error("tt_emulation_device() -- Zebu Emulation is not supported in this build\n"); + throw std::runtime_error("tt_emulation_device() -- Zebu Emulation is not supported in this build\n"); } - tt_emulation_device::~tt_emulation_device() {} - -void tt_emulation_device::write(tt_cxy_pair core, uint64_t addr, const 
std::vector& data) {} -std::vector tt_emulation_device::read(tt_cxy_pair core, uint64_t addr, uint32_t size) {return {};} +void tt_emulation_device::write(tt_cxy_pair core, uint64_t addr, const std::vector& data) {} +std::vector tt_emulation_device::read(tt_cxy_pair core, uint64_t addr, uint32_t size) { return {}; } void tt_emulation_device::start_device(const tt_device_params& device_params) {} @@ -27,52 +25,99 @@ void tt_emulation_device::deassert_risc_reset() {} void tt_emulation_device::assert_risc_reset() {} -void tt_emulation_device::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets) {} +void tt_emulation_device::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions& soft_resets) {} void tt_emulation_device::assert_risc_reset_at_core(tt_cxy_pair core) {} void tt_emulation_device::close_device() {} -void tt_emulation_device::start(std::vector plusargs, std::vector dump_cores, bool no_checkers, bool /*init_device*/, bool /*skip_driver_allocs*/) {} - - -void tt_emulation_device::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& cols_to_exclude, const std::string& fallback_tlb) {} -void tt_emulation_device::rolled_write_to_device(std::vector& base_vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t base_addr, const std::string& tlb_to_use) {} - -void tt_emulation_device::write_to_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {} -void tt_emulation_device::write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {}; -void tt_emulation_device::read_from_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t 
size, const std::string& /*tlb_to_use*/) {} -void tt_emulation_device::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} -void tt_emulation_device::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} -void tt_emulation_device::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) {} - +void tt_emulation_device::start( + std::vector plusargs, + std::vector dump_cores, + bool no_checkers, + bool /*init_device*/, + bool /*skip_driver_allocs*/) {} + +void tt_emulation_device::broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& cols_to_exclude, + const std::string& fallback_tlb) {} + +void tt_emulation_device::rolled_write_to_device( + std::vector& base_vec, + uint32_t unroll_count, + tt_cxy_pair core, + uint64_t base_addr, + const std::string& tlb_to_use) {} + +void tt_emulation_device::write_to_device( + std::vector& vec, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write) {} + +void tt_emulation_device::write_to_device( + const void* mem_ptr, + uint32_t size_in_bytes, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write){}; + +void tt_emulation_device::read_from_device( + std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& /*tlb_to_use*/) {} + +void tt_emulation_device::l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} + +void tt_emulation_device::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} + +void tt_emulation_device::dram_membar( + const 
chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) {} // ------------------------- // Not sure how to implement these functions below, leaving them blank/default for now void tt_emulation_device::translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) { - // No translation is performed - return; + // No translation is performed + return; } + tt_ClusterDescriptor* tt_emulation_device::get_cluster_description() { return ndesc.get(); } -std::set tt_emulation_device::get_target_mmio_device_ids() {return {};} +std::set tt_emulation_device::get_target_mmio_device_ids() { return {}; } -std::set tt_emulation_device::get_target_remote_device_ids() {return {};} +std::set tt_emulation_device::get_target_remote_device_ids() { return {}; } void tt_emulation_device::set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_) {} + int tt_emulation_device::get_number_of_chips_in_cluster() { return detect_number_of_chips(); } -std::unordered_set tt_emulation_device::get_all_chips_in_cluster() { return { 0 }; } + +std::unordered_set tt_emulation_device::get_all_chips_in_cluster() { return {0}; } + int tt_emulation_device::detect_number_of_chips() { return 1; } bool tt_emulation_device::using_harvested_soc_descriptors() { return false; } -bool tt_emulation_device::noc_translation_en() { return false; } -std::unordered_map tt_emulation_device::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}};} - -std::unordered_map& tt_emulation_device::get_virtual_soc_descriptors() {return soc_descriptor_per_chip;} -std::map tt_emulation_device::get_clocks() {return std::map();} +bool tt_emulation_device::noc_translation_en() { return false; } -void tt_emulation_device::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) {} +std::unordered_map tt_emulation_device::get_harvesting_masks_for_soc_descriptors() { + return {{0, 0}}; +} +std::unordered_map& 
tt_emulation_device::get_virtual_soc_descriptors() { + return soc_descriptor_per_chip; +} +std::map tt_emulation_device::get_clocks() { return std::map(); } +void tt_emulation_device::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) {} diff --git a/device/simulation/deprecated/tt_versim_device.cpp b/device/simulation/deprecated/tt_versim_device.cpp index 7e700b2f..9504d9f6 100644 --- a/device/simulation/deprecated/tt_versim_device.cpp +++ b/device/simulation/deprecated/tt_versim_device.cpp @@ -2,16 +2,14 @@ // // SPDX-License-Identifier: Apache-2.0 - - -#include "cluster.h" -#include "device/driver_atomics.h" -#include "common/logger.hpp" -#include #include +#include #include #include +#include "cluster.h" +#include "common/logger.hpp" +#include "device/driver_atomics.h" #include "yaml-cpp/yaml.h" // TODO: Remove dependency on command_assembler + soc @@ -19,112 +17,134 @@ #include "device/tt_cluster_descriptor.h" namespace CA = CommandAssembler; - -void translate_soc_descriptor_to_ca_soc(CA::Soc &soc, const tt_SocDescriptor soc_descriptor) { - for (auto &core : soc_descriptor.cores) { - CA::SocNocNode node; - CA::xy_pair CA_coord(core.first.x, core.first.y); - node.noc_coord = CA_coord; - node.memory_size = core.second.l1_size; - switch (core.second.type) { - case CoreType::ARC: node.arc = true; break; - case CoreType::DRAM: { - node.dram = true; - #ifdef EN_DRAM_ALIAS - node.dram_channel_id = std::get<0>(soc_descriptor.dram_core_channel_map.at(core.first)); - #endif - } break; - case CoreType::ETH: node.eth = true; break; - case CoreType::PCIE: node.pcie = true; break; - case CoreType::WORKER: node.worker = true; break; - case CoreType::HARVESTED: node.harvested = true; break; - case CoreType::ROUTER_ONLY: node.router_only = true; break; - default: std::cout << " Error: Unsupported CoreType type: " << static_cast(core.second.type) << std::endl; break; +void translate_soc_descriptor_to_ca_soc(CA::Soc& soc, const tt_SocDescriptor 
soc_descriptor) { + for (auto& core : soc_descriptor.cores) { + CA::SocNocNode node; + CA::xy_pair CA_coord(core.first.x, core.first.y); + node.noc_coord = CA_coord; + node.memory_size = core.second.l1_size; + switch (core.second.type) { + case CoreType::ARC: + node.arc = true; + break; + case CoreType::DRAM: { + node.dram = true; +#ifdef EN_DRAM_ALIAS + node.dram_channel_id = std::get<0>(soc_descriptor.dram_core_channel_map.at(core.first)); +#endif + } break; + case CoreType::ETH: + node.eth = true; + break; + case CoreType::PCIE: + node.pcie = true; + break; + case CoreType::WORKER: + node.worker = true; + break; + case CoreType::HARVESTED: + node.harvested = true; + break; + case CoreType::ROUTER_ONLY: + node.router_only = true; + break; + default: + std::cout << " Error: Unsupported CoreType type: " << static_cast(core.second.type) << std::endl; + break; + } + soc.SetNodeProperties(node.noc_coord, node); } - soc.SetNodeProperties(node.noc_coord, node); - } } //////// // Device Versim //////// +#include + #include "device.h" #include "sim_interactive.h" -#include -tt_VersimDevice::tt_VersimDevice(const std::string &sdesc_path, const std::string &ndesc_path) : tt_device(sdesc_path) { - soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); - std::set target_devices = {0}; - if (ndesc_path == "") { - ndesc = tt_ClusterDescriptor::create_for_grayskull_cluster(target_devices, {}); - } - else { - ndesc = tt_ClusterDescriptor::create_from_yaml(ndesc_path); - } +tt_VersimDevice::tt_VersimDevice(const std::string& sdesc_path, const std::string& ndesc_path) : tt_device(sdesc_path) { + soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); + std::set target_devices = {0}; + if (ndesc_path == "") { + ndesc = tt_ClusterDescriptor::create_for_grayskull_cluster(target_devices, {}); + } else { + ndesc = tt_ClusterDescriptor::create_from_yaml(ndesc_path); + } } -std::unordered_map& tt_VersimDevice::get_virtual_soc_descriptors() {return 
soc_descriptor_per_chip;} - -tt_ClusterDescriptor* tt_VersimDevice::get_cluster_description() {return ndesc.get();} -void tt_VersimDevice::start_device(const tt_device_params &device_params) { - bool no_checkers = true; - std::vector dump_cores = device_params.unroll_vcd_dump_cores(get_soc_descriptor(0) -> grid_size); - start(device_params.expand_plusargs(), dump_cores, no_checkers, device_params.init_device, false); +std::unordered_map& tt_VersimDevice::get_virtual_soc_descriptors() { + return soc_descriptor_per_chip; } -void tt_VersimDevice::close_device() { - stop(); +tt_ClusterDescriptor* tt_VersimDevice::get_cluster_description() { return ndesc.get(); } + +void tt_VersimDevice::start_device(const tt_device_params& device_params) { + bool no_checkers = true; + std::vector dump_cores = device_params.unroll_vcd_dump_cores(get_soc_descriptor(0)->grid_size); + start(device_params.expand_plusargs(), dump_cores, no_checkers, device_params.init_device, false); } +void tt_VersimDevice::close_device() { stop(); } + void tt_VersimDevice::start( std::vector plusargs, std::vector dump_cores, bool no_checkers, bool /*init_device*/, bool /*skip_driver_allocs*/ - ) { - - std::cout << "Start Versim Device " << std::endl; - std::string device_descriptor_dir = "./"; +) { + std::cout << "Start Versim Device " << std::endl; + std::string device_descriptor_dir = "./"; - std::optional vcd_suffix; - if (dump_cores.size() > 0) { - vcd_suffix = "core_dump.vcd"; - } + std::optional vcd_suffix; + if (dump_cores.size() > 0) { + vcd_suffix = "core_dump.vcd"; + } - std::vector vcd_cores; + std::vector vcd_cores; - // TODO: For now create a temporary stuff from CA and populate from descriptor before passing back to versim-core - // interface. mainly bypasses arch_configs etc from llir. 
We can populate soc directly - // MT: have to preserve ca_soc_descriptor object since versim references it at runtime - CA::xy_pair CA_grid_size((soc_descriptor_per_chip.begin() -> second).grid_size.x, (soc_descriptor_per_chip.begin() -> second).grid_size.y); - // CA::Soc ca_soc_manager(CA_grid_size); - std::unique_ptr p_ca_soc_manager_unique = std::make_unique(CA_grid_size); - translate_soc_descriptor_to_ca_soc(*p_ca_soc_manager_unique, (soc_descriptor_per_chip.begin() -> second)); - // TODO: End + // TODO: For now create a temporary stuff from CA and populate from descriptor before passing back to versim-core + // interface. mainly bypasses arch_configs etc from llir. We can populate soc directly + // MT: have to preserve ca_soc_descriptor object since versim references it at runtime + CA::xy_pair CA_grid_size( + (soc_descriptor_per_chip.begin()->second).grid_size.x, (soc_descriptor_per_chip.begin()->second).grid_size.y); + // CA::Soc ca_soc_manager(CA_grid_size); + std::unique_ptr p_ca_soc_manager_unique = std::make_unique(CA_grid_size); + translate_soc_descriptor_to_ca_soc(*p_ca_soc_manager_unique, (soc_descriptor_per_chip.begin()->second)); + // TODO: End - std::cout << "Versim Device: turn_on_device "; - std::vector trisc_sizes = {static_cast(l1_address_params.trisc0_size), static_cast(l1_address_params.trisc1_size), static_cast(l1_address_params.trisc2_size)}; - std::unique_ptr versim_unique = versim::turn_on_device(CA_grid_size, *p_ca_soc_manager_unique, plusargs, vcd_suffix, dump_cores, no_checkers, - l1_address_params.trisc_base, trisc_sizes); - versim = versim_unique.release(); + std::cout << "Versim Device: turn_on_device "; + std::vector trisc_sizes = { + static_cast(l1_address_params.trisc0_size), + static_cast(l1_address_params.trisc1_size), + static_cast(l1_address_params.trisc2_size)}; + std::unique_ptr versim_unique = versim::turn_on_device( + CA_grid_size, + *p_ca_soc_manager_unique, + plusargs, + vcd_suffix, + dump_cores, + no_checkers, + 
l1_address_params.trisc_base, + trisc_sizes); + versim = versim_unique.release(); - std::cout << "Versim Device: write info to tvm db " << std::endl; - versim::write_info_to_tvm_db(l1_address_params.trisc_base, trisc_sizes); - versim::build_and_connect_tvm_phase(); + std::cout << "Versim Device: write info to tvm db " << std::endl; + versim::write_info_to_tvm_db(l1_address_params.trisc_base, trisc_sizes); + versim::build_and_connect_tvm_phase(); - versim->spin_threads(*p_ca_soc_manager_unique, false); - versim::assert_reset(*versim); + versim->spin_threads(*p_ca_soc_manager_unique, false); + versim::assert_reset(*versim); - p_ca_soc_manager = (void*)(p_ca_soc_manager_unique.release()); + p_ca_soc_manager = (void*)(p_ca_soc_manager_unique.release()); - std::cout << "Versim Device: Done start " << std::endl; + std::cout << "Versim Device: Done start " << std::endl; } -tt_VersimDevice::~tt_VersimDevice () { - ndesc.reset(); -} +tt_VersimDevice::~tt_VersimDevice() { ndesc.reset(); } // bool tt_VersimDevice::run() { // std::cout << "Versim Device: Run " << std::endl; @@ -136,165 +156,218 @@ tt_VersimDevice::~tt_VersimDevice () { // } void tt_VersimDevice::deassert_risc_reset() { - std::cout << "Versim Device: Deassert risc resets start" << std::endl; - versim::handle_resetting_triscs(*versim); - std::cout << "Versim Device: Start main loop " << std::endl; - versim::startup_versim_main_loop(*versim); + std::cout << "Versim Device: Deassert risc resets start" << std::endl; + versim::handle_resetting_triscs(*versim); + std::cout << "Versim Device: Start main loop " << std::endl; + versim::startup_versim_main_loop(*versim); } -void tt_VersimDevice::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets) { - // This function deasserts reset on the full versim device (don't need core level granularity for versim) - deassert_risc_reset(); +void tt_VersimDevice::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions& 
soft_resets) { + // This function deasserts reset on the full versim device (don't need core level granularity for versim) + deassert_risc_reset(); } void tt_VersimDevice::assert_risc_reset() { - std::cout << "Pause all the cores" << std::endl; - versim::pause(*versim); + std::cout << "Pause all the cores" << std::endl; + versim::pause(*versim); - std::cout << "Wait for cores to go to paused state" << std::endl; - versim::sleep_wait_for_paused (*versim); + std::cout << "Wait for cores to go to paused state" << std::endl; + versim::sleep_wait_for_paused(*versim); - std::cout << "Assert riscv reset" << std::endl; - versim::assert_riscv_reset(*versim); + std::cout << "Assert riscv reset" << std::endl; + versim::assert_riscv_reset(*versim); } void tt_VersimDevice::assert_risc_reset_at_core(tt_cxy_pair core) { - // This function asserts reset on the full versim device (don't need core level granularity for versim) - assert_risc_reset(); -} - -void tt_VersimDevice::rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { - uint32_t byte_increment = vec.size() * 4; - for (int i=0; i mem_vector(mem_ptr, mem_ptr + len); - rolled_write_to_device(mem_vector, unroll_count, core, addr, fallback_tlb); + // This function asserts reset on the full versim device (don't need core level granularity for versim) + assert_risc_reset(); } -void tt_VersimDevice::write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - - log_debug(tt::LogSiliconDriver, "Versim Device ({}): Write vector at target core {}, address: {}", get_sim_time(*versim), core.str(), addr); - - bool aligned_32B = (soc_descriptor_per_chip.begin() -> second).cores.at(core).type == CoreType::DRAM; - // MT: Remove these completely - CommandAssembler::xy_pair CA_target(core.x, core.y); - CommandAssembler::memory 
CA_tensor_memory(addr, vec); - - nuapi::device::write_memory_to_core(*versim, CA_target, CA_tensor_memory); +void tt_VersimDevice::rolled_write_to_device( + std::vector& vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { + uint32_t byte_increment = vec.size() * 4; + for (int i = 0; i < unroll_count; i++) { + vec[0] = i; // slot id for debug + write_to_device(vec, core, addr + i * byte_increment, tlb_to_use); + } } -void tt_VersimDevice::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - log_assert(!(size % 4), "Writes to Versim Backend should be 4 byte aligned!"); - - std::vector mem_vector((uint32_t*)mem_ptr, (uint32_t*)mem_ptr + size / sizeof(uint32_t)); - write_to_device(mem_vector, core, addr, tlb_to_use, send_epoch_cmd, last_send_epoch_cmd, ordered_with_prev_remote_write); +void tt_VersimDevice::rolled_write_to_device( + uint32_t* mem_ptr, + uint32_t len, + uint32_t unroll_count, + tt_cxy_pair core, + uint64_t addr, + const std::string& fallback_tlb) { + std::vector mem_vector(mem_ptr, mem_ptr + len); + rolled_write_to_device(mem_vector, unroll_count, core, addr, fallback_tlb); +} + +void tt_VersimDevice::write_to_device( + std::vector& vec, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write) { + log_debug( + tt::LogSiliconDriver, + "Versim Device ({}): Write vector at target core {}, address: {}", + get_sim_time(*versim), + core.str(), + addr); + + bool aligned_32B = (soc_descriptor_per_chip.begin()->second).cores.at(core).type == CoreType::DRAM; + // MT: Remove these completely + CommandAssembler::xy_pair CA_target(core.x, core.y); + CommandAssembler::memory CA_tensor_memory(addr, vec); + + nuapi::device::write_memory_to_core(*versim, CA_target, 
CA_tensor_memory); +} + +void tt_VersimDevice::write_to_device( + const void* mem_ptr, + uint32_t size, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write) { + log_assert(!(size % 4), "Writes to Versim Backend should be 4 byte aligned!"); + + std::vector mem_vector((uint32_t*)mem_ptr, (uint32_t*)mem_ptr + size / sizeof(uint32_t)); + write_to_device( + mem_vector, core, addr, tlb_to_use, send_epoch_cmd, last_send_epoch_cmd, ordered_with_prev_remote_write); +} + +void tt_VersimDevice::broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& cols_to_exclude, + const std::string& fallback_tlb) { + for (const auto& core : get_soc_descriptor(0)->cores) { + if (cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and + rows_to_exclude.find(core.first.y) == rows_to_exclude.end() and core.second.type != CoreType::HARVESTED) { + write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); + } + } } -void tt_VersimDevice::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& cols_to_exclude, const std::string& fallback_tlb) { - for(const auto& core : get_soc_descriptor(0) -> cores) { - if(cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and rows_to_exclude.find(core.first.y) == rows_to_exclude.end() and core.second.type != CoreType::HARVESTED) { - write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); - } - } -} void tt_VersimDevice::wait_for_non_mmio_flush() { - // Do nothing, since Versim does not simulate non-mmio mapped chips + // Do nothing, since Versim does not simulate non-mmio mapped chips } -void tt_VersimDevice::l1_membar(const chip_id_t chip, 
const std::string& fallback_tlb, const std::unordered_set& cores) { - tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this +void tt_VersimDevice::l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { + tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this } -void tt_VersimDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) { - tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this +void tt_VersimDevice::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) { + tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this } -void tt_VersimDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& dram_cores) { - tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this +void tt_VersimDevice::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& dram_cores) { + tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this } -void tt_VersimDevice::read_from_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) { - log_debug(tt::LogSiliconDriver, "Versim Device ({}): Read vector from address: {}, with size: {} Bytes", get_sim_time(*versim), addr, size); +void tt_VersimDevice::read_from_device( + std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) { + log_debug( + tt::LogSiliconDriver, + "Versim Device ({}): Read vector from address: {}, with size: {} Bytes", + get_sim_time(*versim), + addr, + size); - CommandAssembler::xy_pair CA_target(core.x, core.y); + CommandAssembler::xy_pair CA_target(core.x, core.y); - size_t size_in_words = size / 4; - auto result = 
nuapi::device::read_memory_from_core(*versim, CA_target, addr, size_in_words); - vec = result; + size_t size_in_words = size / 4; + auto result = nuapi::device::read_memory_from_core(*versim, CA_target, addr, size_in_words); + vec = result; } -void tt_VersimDevice::read_from_device(void *mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) { - log_debug(tt::LogSiliconDriver, "Versim Device ({}): Read vector from address: {}, with size: {} Bytes", get_sim_time(*versim), addr, size); - log_assert(!(size % 4), "Reads from Versim backend should be 4 byte aligned!"); +void tt_VersimDevice::read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) { + log_debug( + tt::LogSiliconDriver, + "Versim Device ({}): Read vector from address: {}, with size: {} Bytes", + get_sim_time(*versim), + addr, + size); + log_assert(!(size % 4), "Reads from Versim backend should be 4 byte aligned!"); - CommandAssembler::xy_pair CA_target(core.x, core.y); + CommandAssembler::xy_pair CA_target(core.x, core.y); - size_t size_in_words = size / 4; - auto result = nuapi::device::read_memory_from_core(*versim, CA_target, addr, size_in_words); - memcpy(mem_ptr, result.data(), result.size()*sizeof(uint32_t)); + size_t size_in_words = size / 4; + auto result = nuapi::device::read_memory_from_core(*versim, CA_target, addr, size_in_words); + memcpy(mem_ptr, result.data(), result.size() * sizeof(uint32_t)); } -void tt_VersimDevice::translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c) { - // No translation is performed - return; +void tt_VersimDevice::translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) { + // No translation is performed + return; } std::set tt_VersimDevice::get_target_mmio_device_ids() { - // Must only be used for silicon - return {}; + // Must only be used for silicon + return {}; } std::set 
tt_VersimDevice::get_target_remote_device_ids() { - // Must only be used for silicon - return {}; + // Must only be used for silicon + return {}; } - -bool versim_check_dram_core_exists(const std::vector> &dram_core_channels, tt_xy_pair target_core) { +bool versim_check_dram_core_exists( + const std::vector>& dram_core_channels, tt_xy_pair target_core) { bool dram_core_exists = false; - for (const auto &dram_cores_in_channel: dram_core_channels) { - for (const auto &dram_core : dram_cores_in_channel) { - if (dram_core.x == target_core.x && dram_core.y == target_core.y) { - return true; + for (const auto& dram_cores_in_channel : dram_core_channels) { + for (const auto& dram_core : dram_cores_in_channel) { + if (dram_core.x == target_core.x && dram_core.y == target_core.y) { + return true; + } } - } } return false; } int tt_VersimDevice::get_number_of_chips_in_cluster() { return detect_number_of_chips(); } + std::unordered_set tt_VersimDevice::get_all_chips_in_cluster() { return {0}; } + int tt_VersimDevice::detect_number_of_chips() { return 1; } bool tt_VersimDevice::using_harvested_soc_descriptors() { return false; } + bool tt_VersimDevice::noc_translation_en() { return false; } -std::unordered_map tt_VersimDevice::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}};} + +std::unordered_map tt_VersimDevice::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}}; } // Meant to breakout running functions for simulator bool tt_VersimDevice::stop() { - std::cout << "Versim Device: Stop " << std::endl; - - versim::turn_off_device(*versim); - versim->shutdown(); - // Force free of all versim cores - for (auto x = 0; x < versim->grid_size.x; x++) { - for (auto y = 0; y < versim->grid_size.y; y++) { - delete versim->core_grid.at(x).at(y); + std::cout << "Versim Device: Stop " << std::endl; + + versim::turn_off_device(*versim); + versim->shutdown(); + // Force free of all versim cores + for (auto x = 0; x < versim->grid_size.x; x++) { + for (auto y = 0; y < 
versim->grid_size.y; y++) { + delete versim->core_grid.at(x).at(y); + } } - } - std::cout << "Versim Device: Stop completed " << std::endl; - delete versim; - return true; + std::cout << "Versim Device: Stop completed " << std::endl; + delete versim; + return true; } -std::map tt_VersimDevice::get_clocks() { - return std::map(); -} +std::map tt_VersimDevice::get_clocks() { return std::map(); } void tt_VersimDevice::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) { l1_address_params = l1_address_params_; @@ -305,11 +378,11 @@ void tt_VersimDevice::set_device_dram_address_params(const tt_device_dram_addres } std::uint32_t tt_VersimDevice::get_num_dram_channels(std::uint32_t device_id) { - return get_soc_descriptor(device_id) -> get_num_dram_channels(); + return get_soc_descriptor(device_id)->get_num_dram_channels(); } std::uint64_t tt_VersimDevice::get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) { - return get_soc_descriptor(device_id) -> dram_bank_size; // Space per channel is identical for now + return get_soc_descriptor(device_id)->dram_bank_size; // Space per channel is identical for now } std::uint32_t tt_VersimDevice::get_num_host_channels(std::uint32_t device_id) { diff --git a/device/simulation/deprecated/tt_versim_device.h b/device/simulation/deprecated/tt_versim_device.h index 05ac6b06..2c71f1be 100644 --- a/device/simulation/deprecated/tt_versim_device.h +++ b/device/simulation/deprecated/tt_versim_device.h @@ -11,42 +11,92 @@ #include "tt_xy_pair.h" class c_versim_core; -namespace nuapi {namespace device {template class Simulator;}} -namespace versim { - struct VersimSimulatorState; - using VersimSimulator = nuapi::device::Simulator; + +namespace nuapi { +namespace device { +template +class Simulator; } +} // namespace nuapi + +namespace versim { +struct VersimSimulatorState; +using VersimSimulator = nuapi::device::Simulator; +} // namespace versim /** * @brief Versim Backend Class, derived from 
the tt_device class * Implements APIs to communicate with a simulated (using Verilator) Tenstorrent Device. -*/ -class tt_VersimDevice: public tt_device -{ - public: + */ +class tt_VersimDevice : public tt_device { +public: virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_); - tt_VersimDevice(const std::string &sdesc_path, const std::string &ndesc_path); + tt_VersimDevice(const std::string& sdesc_path, const std::string& ndesc_path); virtual std::unordered_map& get_virtual_soc_descriptors(); - virtual void start(std::vector plusargs, std::vector dump_cores, bool no_checkers, bool init_device, bool skip_driver_allocs); - virtual void start_device(const tt_device_params &device_params); + virtual void start( + std::vector plusargs, + std::vector dump_cores, + bool no_checkers, + bool init_device, + bool skip_driver_allocs); + virtual void start_device(const tt_device_params& device_params); virtual void close_device(); virtual void deassert_risc_reset(); - virtual void deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets = TENSIX_DEASSERT_SOFT_RESET); + virtual void deassert_risc_reset_at_core( + tt_cxy_pair core, const TensixSoftResetOptions& soft_resets = TENSIX_DEASSERT_SOFT_RESET); virtual void assert_risc_reset(); virtual void assert_risc_reset_at_core(tt_cxy_pair core); - virtual void write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& columns_to_exclude, const std::string& fallback_tlb); - virtual void rolled_write_to_device(std::vector &vec, uint32_t 
unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); - virtual void read_from_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); - virtual void rolled_write_to_device(uint32_t* mem_ptr, uint32_t size_in_bytes, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb); - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void read_from_device(void *mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); + virtual void write_to_device( + std::vector& vec, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd = false, + bool last_send_epoch_cmd = true, + bool ordered_with_prev_remote_write = false); + virtual void broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& columns_to_exclude, + const std::string& fallback_tlb); + virtual void rolled_write_to_device( + std::vector& vec, + uint32_t unroll_count, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use); + virtual void read_from_device( + std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); + virtual void rolled_write_to_device( + uint32_t* mem_ptr, + uint32_t size_in_bytes, + uint32_t unroll_count, + tt_cxy_pair core, + uint64_t addr, + const std::string& fallback_tlb); + virtual void write_to_device( + const void* mem_ptr, + uint32_t size_in_bytes, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd = false, + bool last_send_epoch_cmd = true, + bool ordered_with_prev_remote_write = false); + virtual void 
read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); virtual void wait_for_non_mmio_flush(); - void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c); + void l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c); virtual bool using_harvested_soc_descriptors(); virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); virtual bool noc_translation_en(); @@ -57,12 +107,13 @@ class tt_VersimDevice: public tt_device virtual int get_number_of_chips_in_cluster(); virtual std::unordered_set get_all_chips_in_cluster(); static int detect_number_of_chips(); - virtual std::map get_clocks(); + virtual std::map get_clocks(); virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id); virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel); virtual std::uint32_t get_num_host_channels(std::uint32_t device_id); virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel); - private: + +private: bool stop(); tt_device_l1_address_params l1_address_params; tt_device_dram_address_params dram_address_params; diff --git a/device/simulation/deprecated/tt_versim_stub.cpp 
b/device/simulation/deprecated/tt_versim_stub.cpp index 8cf0899b..c80e0bdd 100644 --- a/device/simulation/deprecated/tt_versim_stub.cpp +++ b/device/simulation/deprecated/tt_versim_stub.cpp @@ -2,19 +2,18 @@ // // SPDX-License-Identifier: Apache-2.0 - -#include "cluster.h" - -#include #include +#include #include #include -tt_VersimDevice::tt_VersimDevice(const std::string &sdesc_path, const std::string &ndesc_path) : tt_device(sdesc_path) { - throw std::runtime_error("tt_VersimDevice() -- VERSIM is not supported in this build\n"); +#include "cluster.h" + +tt_VersimDevice::tt_VersimDevice(const std::string& sdesc_path, const std::string& ndesc_path) : tt_device(sdesc_path) { + throw std::runtime_error("tt_VersimDevice() -- VERSIM is not supported in this build\n"); } -tt_VersimDevice::~tt_VersimDevice () {} +tt_VersimDevice::~tt_VersimDevice() {} std::unordered_map& tt_VersimDevice::get_virtual_soc_descriptors() { throw std::runtime_error("tt_VersimDevice() -- VERSIM is not supported in this build\n"); @@ -22,23 +21,71 @@ std::unordered_map& tt_VersimDevice::get_virtual_so } int tt_VersimDevice::get_number_of_chips_in_cluster() { return detect_number_of_chips(); } + std::unordered_set tt_VersimDevice::get_all_chips_in_cluster() { return {}; } + int tt_VersimDevice::detect_number_of_chips() { return 0; } -void tt_VersimDevice::start_device(const tt_device_params &device_params) {} +void tt_VersimDevice::start_device(const tt_device_params& device_params) {} + void tt_VersimDevice::close_device() {} -void tt_VersimDevice::write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {} -void tt_VersimDevice::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& cols_to_exclude, const std::string& fallback_tlb) {} -void 
tt_VersimDevice::read_from_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) {} -void tt_VersimDevice::rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) {} -void tt_VersimDevice::write_to_device(const void *mem_ptr, uint32_t len, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {} -void tt_VersimDevice::read_from_device(void *mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) {} -void tt_VersimDevice::rolled_write_to_device(uint32_t* mem_ptr, uint32_t len, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) {} + +void tt_VersimDevice::write_to_device( + std::vector& vec, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write) {} + +void tt_VersimDevice::broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& cols_to_exclude, + const std::string& fallback_tlb) {} + +void tt_VersimDevice::read_from_device( + std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) {} + +void tt_VersimDevice::rolled_write_to_device( + std::vector& vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { +} + +void tt_VersimDevice::write_to_device( + const void* mem_ptr, + uint32_t len, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write) {} + +void tt_VersimDevice::read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) {} + +void 
tt_VersimDevice::rolled_write_to_device( + uint32_t* mem_ptr, + uint32_t len, + uint32_t unroll_count, + tt_cxy_pair core, + uint64_t addr, + const std::string& fallback_tlb) {} + void tt_VersimDevice::wait_for_non_mmio_flush() {} -void tt_VersimDevice::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} -void tt_VersimDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) {} -void tt_VersimDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& dram_cores) {} +void tt_VersimDevice::l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} + +void tt_VersimDevice::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) {} + +void tt_VersimDevice::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& dram_cores) {} void tt_VersimDevice::start( std::vector plusargs, @@ -49,36 +96,48 @@ void tt_VersimDevice::start( ) {} void tt_VersimDevice::deassert_risc_reset() {} -void tt_VersimDevice::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets) {} + +void tt_VersimDevice::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions& soft_resets) {} + void tt_VersimDevice::assert_risc_reset() {} + void tt_VersimDevice::assert_risc_reset_at_core(tt_cxy_pair core) {} -void tt_VersimDevice::translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c) {}; +void tt_VersimDevice::translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c){}; + // void tt_VersimDevice::dump_wall_clock_mailbox(std::string output_path, int device_id) {} -std::set tt_VersimDevice::get_target_mmio_device_ids() {return {};} -std::set tt_VersimDevice::get_target_remote_device_ids() {return {};} +std::set 
tt_VersimDevice::get_target_mmio_device_ids() { return {}; } + +std::set tt_VersimDevice::get_target_remote_device_ids() { return {}; } bool versim_check_dram_core_exists( - const std::vector> &dram_core_channels, tt_xy_pair target_core) { - return false; + const std::vector>& dram_core_channels, tt_xy_pair target_core) { + return false; } bool tt_VersimDevice::using_harvested_soc_descriptors() { return false; } + bool tt_VersimDevice::noc_translation_en() { return false; } -std::unordered_map tt_VersimDevice::get_harvesting_masks_for_soc_descriptors() { return std::unordered_map();} + +std::unordered_map tt_VersimDevice::get_harvesting_masks_for_soc_descriptors() { + return std::unordered_map(); +} bool tt_VersimDevice::stop() { return true; } void tt_VersimDevice::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) {} + void tt_VersimDevice::set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_) {} -std::uint32_t tt_VersimDevice::get_num_dram_channels(std::uint32_t device_id) {return 0;} -std::uint64_t tt_VersimDevice::get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) {return 0;} -std::uint32_t tt_VersimDevice::get_num_host_channels(std::uint32_t device_id) {return 0;} -std::uint32_t tt_VersimDevice::get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) {return 0;} +std::uint32_t tt_VersimDevice::get_num_dram_channels(std::uint32_t device_id) { return 0; } + +std::uint64_t tt_VersimDevice::get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) { return 0; } + +std::uint32_t tt_VersimDevice::get_num_host_channels(std::uint32_t device_id) { return 0; } -std::map tt_VersimDevice::get_clocks() {return std::map();} +std::uint32_t tt_VersimDevice::get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) { return 0; } -tt_ClusterDescriptor* tt_VersimDevice::get_cluster_description() {return ndesc.get();} +std::map 
tt_VersimDevice::get_clocks() { return std::map(); } +tt_ClusterDescriptor* tt_VersimDevice::get_cluster_description() { return ndesc.get(); } diff --git a/device/simulation/tt_simulation_device.cpp b/device/simulation/tt_simulation_device.cpp index 61cd55ac..086eb520 100644 --- a/device/simulation/tt_simulation_device.cpp +++ b/device/simulation/tt_simulation_device.cpp @@ -4,43 +4,44 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include +#include "umd/device/tt_simulation_device.h" + +#include +#include + #include +#include #include #include -#include -#include - -#include "logger.hpp" #include "assert.hpp" +#include "logger.hpp" +#include "tt_simulation_device_generated.h" #include "umd/device/driver_atomics.h" #include "umd/device/tt_cluster_descriptor.h" -#include "umd/device/tt_simulation_device.h" -#include "tt_simulation_device_generated.h" - -flatbuffers::FlatBufferBuilder create_flatbuffer(DEVICE_COMMAND rw, std::vector vec, tt_cxy_pair core_, uint64_t addr, uint64_t size_=0){ +flatbuffers::FlatBufferBuilder create_flatbuffer( + DEVICE_COMMAND rw, std::vector vec, tt_cxy_pair core_, uint64_t addr, uint64_t size_ = 0) { flatbuffers::FlatBufferBuilder builder; auto data = builder.CreateVector(vec); auto core = tt_vcs_core(core_.x, core_.y); - uint64_t size = size_ == 0 ? size = vec.size()*sizeof(uint32_t) : size = size_; + uint64_t size = size_ == 0 ? 
size = vec.size() * sizeof(uint32_t) : size = size_; auto device_cmd = CreateDeviceRequestResponse(builder, rw, data, &core, addr, size); builder.Finish(device_cmd); return builder; } -void print_flatbuffer(const DeviceRequestResponse *buf){ +void print_flatbuffer(const DeviceRequestResponse* buf) { std::vector data_vec(buf->data()->begin(), buf->data()->end()); uint64_t addr = buf->address(); uint32_t size = buf->size(); tt_cxy_pair core = {0, buf->core()->x(), buf->core()->y()}; - + std::stringstream ss; ss << std::hex << reinterpret_cast(addr); std::string addr_hex = ss.str(); log_info(tt::LogEmulationDriver, "{} bytes @ address {} in core ({}, {})", size, addr_hex, core.x, core.y); - for(int i = 0; i < data_vec.size(); i++){ + for (int i = 0; i < data_vec.size(); i++) { std::ios_base::fmtflags save = std::cout.flags(); std::cout << "0x" << std::hex << std::setw(8) << std::setfill('0') << data_vec[i] << " "; std::cout.flags(save); @@ -48,14 +49,14 @@ void print_flatbuffer(const DeviceRequestResponse *buf){ std::cout << std::endl; } -tt_SimulationDevice::tt_SimulationDevice(const std::string &sdesc_path) : tt_device(sdesc_path){ +tt_SimulationDevice::tt_SimulationDevice(const std::string& sdesc_path) : tt_device() { log_info(tt::LogEmulationDriver, "Instantiating simulation device"); soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); std::set target_devices = {0}; - + // Start VCS simulator in a separate process TT_ASSERT(std::getenv("TT_REMOTE_EXE"), "TT_REMOTE_EXE not set, please provide path to the VCS binary"); - uv_loop_t *loop = uv_default_loop(); + uv_loop_t* loop = uv_default_loop(); uv_process_t child_p; uv_process_options_t child_options = {0}; @@ -69,14 +70,12 @@ tt_SimulationDevice::tt_SimulationDevice(const std::string &sdesc_path) : tt_dev log_info(tt::LogEmulationDriver, "Simulator process spawned with PID: {}", child_p.pid); } - uv_unref((uv_handle_t *) &child_p); + uv_unref((uv_handle_t*)&child_p); uv_run(loop, UV_RUN_DEFAULT); 
uv_loop_close(loop); } -tt_SimulationDevice::~tt_SimulationDevice() { - close_device(); -} +tt_SimulationDevice::~tt_SimulationDevice() { close_device(); } // Setup/Teardown Functions std::unordered_map& tt_SimulationDevice::get_virtual_soc_descriptors() { @@ -99,11 +98,11 @@ void tt_SimulationDevice::set_driver_eth_interface_params(const tt_driver_eth_in eth_interface_params = eth_interface_params_; } -void tt_SimulationDevice::start_device(const tt_device_params &device_params) { - void *buf_ptr = nullptr; +void tt_SimulationDevice::start_device(const tt_device_params& device_params) { + void* buf_ptr = nullptr; host.start_host(); - + log_info(tt::LogEmulationDriver, "Waiting for ack msg from remote..."); size_t buf_size = host.recv_from_device(&buf_ptr); auto buf = GetDeviceRequestResponse(buf_ptr); @@ -114,8 +113,9 @@ void tt_SimulationDevice::start_device(const tt_device_params &device_params) { void tt_SimulationDevice::assert_risc_reset() { log_info(tt::LogEmulationDriver, "Sending assert_risc_reset signal.."); - auto wr_buffer = create_flatbuffer(DEVICE_COMMAND_ALL_TENSIX_RESET_ASSERT, std::vector(1, 0), {0, 0, 0}, 0); - uint8_t *wr_buffer_ptr = wr_buffer.GetBufferPointer(); + auto wr_buffer = + create_flatbuffer(DEVICE_COMMAND_ALL_TENSIX_RESET_ASSERT, std::vector(1, 0), {0, 0, 0}, 0); + uint8_t* wr_buffer_ptr = wr_buffer.GetBufferPointer(); size_t wr_buffer_size = wr_buffer.GetSize(); print_flatbuffer(GetDeviceRequestResponse(wr_buffer_ptr)); @@ -124,20 +124,25 @@ void tt_SimulationDevice::assert_risc_reset() { void tt_SimulationDevice::deassert_risc_reset() { log_info(tt::LogEmulationDriver, "Sending 'deassert_risc_reset' signal.."); - auto wr_buffer = create_flatbuffer(DEVICE_COMMAND_ALL_TENSIX_RESET_DEASSERT, std::vector(1, 0), {0, 0, 0}, 0); - uint8_t *wr_buffer_ptr = wr_buffer.GetBufferPointer(); + auto wr_buffer = + create_flatbuffer(DEVICE_COMMAND_ALL_TENSIX_RESET_DEASSERT, std::vector(1, 0), {0, 0, 0}, 0); + uint8_t* wr_buffer_ptr = 
wr_buffer.GetBufferPointer(); size_t wr_buffer_size = wr_buffer.GetSize(); host.send_to_device(wr_buffer_ptr, wr_buffer_size); } -void tt_SimulationDevice::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets) { - log_info(tt::LogEmulationDriver, "Sending 'deassert_risc_reset_at_core'.. (Not implemented, defaulting to 'deassert_risc_reset' instead)"); +void tt_SimulationDevice::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions& soft_resets) { + log_info( + tt::LogEmulationDriver, + "Sending 'deassert_risc_reset_at_core'.. (Not implemented, defaulting to 'deassert_risc_reset' instead)"); deassert_risc_reset(); } void tt_SimulationDevice::assert_risc_reset_at_core(tt_cxy_pair core) { - log_info(tt::LogEmulationDriver, "Sending 'assert_risc_reset_at_core'.. (Not implemented, defaulting to 'assert_risc_reset' instead)"); + log_info( + tt::LogEmulationDriver, + "Sending 'assert_risc_reset_at_core'.. (Not implemented, defaulting to 'assert_risc_reset' instead)"); assert_risc_reset(); } @@ -149,19 +154,21 @@ void tt_SimulationDevice::close_device() { } // Runtime Functions -void tt_SimulationDevice::write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { +void tt_SimulationDevice::write_to_device( + const void* mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { log_info(tt::LogEmulationDriver, "Device writing"); std::vector data((uint32_t*)mem_ptr, (uint32_t*)mem_ptr + size_in_bytes / sizeof(uint32_t)); auto wr_buffer = create_flatbuffer(DEVICE_COMMAND_WRITE, data, core, addr); - uint8_t *wr_buffer_ptr = wr_buffer.GetBufferPointer(); + uint8_t* wr_buffer_ptr = wr_buffer.GetBufferPointer(); size_t wr_buffer_size = wr_buffer.GetSize(); - - print_flatbuffer(GetDeviceRequestResponse(wr_buffer_ptr)); // sanity print + + print_flatbuffer(GetDeviceRequestResponse(wr_buffer_ptr)); // sanity print 
host.send_to_device(wr_buffer_ptr, wr_buffer_size); } -void tt_SimulationDevice::read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { - void *rd_resp; +void tt_SimulationDevice::read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { + void* rd_resp; // Send read request auto rd_req_buf = create_flatbuffer(DEVICE_COMMAND_READ, {0}, core, addr, size); @@ -171,50 +178,49 @@ void tt_SimulationDevice::read_from_device(void* mem_ptr, tt_cxy_pair core, uint size_t rd_rsp_sz = host.recv_from_device(&rd_resp); auto rd_resp_buf = GetDeviceRequestResponse(rd_resp); - if (addr != 0x40){ + if (addr != 0x40) { log_info(tt::LogEmulationDriver, "Device reading vec"); - print_flatbuffer(rd_resp_buf); // 0x40 is host polling device, don't print since it'll spam + print_flatbuffer(rd_resp_buf); // 0x40 is host polling device, don't print since it'll spam } std::memcpy(mem_ptr, rd_resp_buf->data()->data(), rd_resp_buf->data()->size() * sizeof(uint32_t)); nng_free(rd_resp, rd_rsp_sz); } void tt_SimulationDevice::wait_for_non_mmio_flush() {} + void tt_SimulationDevice::wait_for_non_mmio_flush(const chip_id_t chip) {} -void tt_SimulationDevice::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} -void tt_SimulationDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) {} -void tt_SimulationDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} + +void tt_SimulationDevice::l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} + +void tt_SimulationDevice::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) {} + +void tt_SimulationDevice::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, 
const std::unordered_set& cores) {} // Misc. Functions to Query/Set Device State std::unordered_map tt_SimulationDevice::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}}; } -std::vector tt_SimulationDevice::detect_available_device_ids() { - return {0}; -} +std::vector tt_SimulationDevice::detect_available_device_ids() { return {0}; } -std::set tt_SimulationDevice::get_target_remote_device_ids() { - return target_remote_chips; -} +std::set tt_SimulationDevice::get_target_remote_device_ids() { return target_remote_chips; } -std::map tt_SimulationDevice::get_clocks() { - return {{0, 0}}; -} +std::map tt_SimulationDevice::get_clocks() { return {{0, 0}}; } -void *tt_SimulationDevice::host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const { +void* tt_SimulationDevice::host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const { return nullptr; } std::uint64_t tt_SimulationDevice::get_pcie_base_addr_from_device(const chip_id_t chip_id) const { - if(arch_name == tt::ARCH::WORMHOLE_B0) { + if (arch_name == tt::ARCH::WORMHOLE_B0) { return 0x800000000; - } - else if (arch_name == tt::ARCH::BLACKHOLE) { + } else if (arch_name == tt::ARCH::BLACKHOLE) { // Enable 4th ATU window. 
return 1ULL << 60; - } - else { + } else { return 0; } } @@ -224,12 +230,11 @@ std::uint32_t tt_SimulationDevice::get_num_dram_channels(std::uint32_t device_id } std::uint64_t tt_SimulationDevice::get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) { - return get_soc_descriptor(device_id).dram_bank_size; // Space per channel is identical for now + return get_soc_descriptor(device_id).dram_bank_size; // Space per channel is identical for now } -std::uint32_t tt_SimulationDevice::get_num_host_channels(std::uint32_t device_id) { - return 1; -} +std::uint32_t tt_SimulationDevice::get_num_host_channels(std::uint32_t device_id) { return 1; } + +std::uint32_t tt_SimulationDevice::get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) { return 0; } -std::uint32_t tt_SimulationDevice::get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) {return 0;} -std::uint32_t tt_SimulationDevice::get_numa_node_for_pcie_device(std::uint32_t device_id) {return 0;} +std::uint32_t tt_SimulationDevice::get_numa_node_for_pcie_device(std::uint32_t device_id) { return 0; } diff --git a/device/simulation/tt_simulation_host.cpp b/device/simulation/tt_simulation_host.cpp index eeee8110..7e5fe8be 100644 --- a/device/simulation/tt_simulation_host.cpp +++ b/device/simulation/tt_simulation_host.cpp @@ -2,19 +2,20 @@ // // SPDX-License-Identifier: Apache-2.0 -#include -#include -#include -#include -#include -#include +#include "umd/device/tt_simulation_host.hpp" #include #include -#include "logger.hpp" +#include +#include +#include +#include +#include +#include + #include "assert.hpp" -#include "umd/device/tt_simulation_host.hpp" +#include "logger.hpp" tt_SimulationHost::tt_SimulationHost() { // Initialize socket and dialer @@ -64,7 +65,7 @@ void tt_SimulationHost::start_host() { void tt_SimulationHost::send_to_device(uint8_t *buf, size_t buf_size) { int rv; log_debug(tt::LogEmulationDriver, "Sending messsage to remote.."); - + void *msg = 
nng_alloc(buf_size); std::memcpy(msg, buf, buf_size); diff --git a/device/tt_cluster_descriptor.cpp b/device/tt_cluster_descriptor.cpp index 203a7a0c..ff3897f5 100644 --- a/device/tt_cluster_descriptor.cpp +++ b/device/tt_cluster_descriptor.cpp @@ -2,23 +2,25 @@ // // SPDX-License-Identifier: Apache-2.0 - #include "umd/device/tt_cluster_descriptor.h" -#include "libs/create_ethernet_map.h" #include #include -#include +#include +#include "disjoint_set.hpp" +#include "fmt/core.h" +#include "libs/create_ethernet_map.h" #include "logger.hpp" #include "yaml-cpp/yaml.h" -#include "fmt/core.h" - using namespace tt; -bool tt_ClusterDescriptor::ethernet_core_has_active_ethernet_link(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const { + +bool tt_ClusterDescriptor::ethernet_core_has_active_ethernet_link( + chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const { return this->ethernet_connections.find(local_chip) != this->ethernet_connections.end() && - this->ethernet_connections.at(local_chip).find(local_ethernet_channel) != this->ethernet_connections.at(local_chip).end(); + this->ethernet_connections.at(local_chip).find(local_ethernet_channel) != + this->ethernet_connections.at(local_chip).end(); } std::tuple tt_ClusterDescriptor::get_chip_and_channel_of_remote_ethernet_core( @@ -39,10 +41,14 @@ std::tuple tt_ClusterDescriptor::get_chip_and_cha } } -// NOTE: It might be worthwhile to precompute this for every pair of directly connected chips, depending on how extensively router needs to use it -std::vector> tt_ClusterDescriptor::get_directly_connected_ethernet_channels_between_chips(const chip_id_t &first, const chip_id_t &second) const { +// NOTE: It might be worthwhile to precompute this for every pair of directly connected chips, depending on how +// extensively router needs to use it +std::vector> +tt_ClusterDescriptor::get_directly_connected_ethernet_channels_between_chips( + const chip_id_t &first, const chip_id_t &second) const { 
std::vector> directly_connected_channels = {}; - if (this->enabled_active_chips.find(first) == this->enabled_active_chips.end() || this->enabled_active_chips.find(second) == this->enabled_active_chips.end()) { + if (this->enabled_active_chips.find(first) == this->enabled_active_chips.end() || + this->enabled_active_chips.find(second) == this->enabled_active_chips.end()) { return {}; } @@ -59,9 +65,7 @@ bool tt_ClusterDescriptor::is_chip_mmio_capable(const chip_id_t chip_id) const { return this->chips_with_mmio.find(chip_id) != this->chips_with_mmio.end(); } -bool tt_ClusterDescriptor::is_chip_remote(const chip_id_t chip_id) const { - return !is_chip_mmio_capable(chip_id); -} +bool tt_ClusterDescriptor::is_chip_remote(const chip_id_t chip_id) const { return !is_chip_mmio_capable(chip_id); } // given two coordinates, finds the number of hops between the two chips // it assumes that shelves are connected in x-dim and racks are connected in y-dim @@ -70,190 +74,266 @@ bool tt_ClusterDescriptor::is_chip_remote(const chip_id_t chip_id) const { // then once a chip on the same shelf&rack is found, // the distance from this chip to either location_a or location_b is just x&y dim difference. 
// the function returns the total distance of travelled between shelves and racks, plust the x&y dim difference -int tt_ClusterDescriptor::get_ethernet_link_coord_distance(const eth_coord_t &location_a, const eth_coord_t &location_b) const { - - log_trace(LogSiliconDriver, "get_ethernet_link_coord_distance from ({}, {}, {}, {}) to ({}, {}, {}, {})", - std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a), - std::get<0>(location_b), std::get<1>(location_b), std::get<2>(location_b), std::get<3>(location_b)); - - // eth_coord_t: x, y, rack, shelf - - int x_a = std::get<0>(location_a); - int x_b = std::get<0>(location_b); - - int y_a = std::get<1>(location_a); - int y_b = std::get<1>(location_b); - - int shelf_a = std::get<3>(location_a); - int shelf_b = std::get<3>(location_b); - - int rack_a = std::get<2>(location_a); - int rack_b = std::get<2>(location_b); - - int x_distance = std::abs(x_a - x_b); - int y_distance = std::abs(y_a - y_b); +int tt_ClusterDescriptor::get_ethernet_link_coord_distance( + const eth_coord_t &location_a, const eth_coord_t &location_b) const { + log_trace( + LogSiliconDriver, + "get_ethernet_link_coord_distance from ({}, {}, {}, {}, {}) to ({}, {}, {}, {}, {})", + location_a.cluster_id, + location_a.x, + location_a.y, + location_a.rack, + location_a.shelf, + location_b.cluster_id, + location_b.x, + location_b.y, + location_b.rack, + location_b.shelf); + + if (location_a.cluster_id != location_b.cluster_id) { + return std::numeric_limits::max(); + } + + int x_distance = std::abs(location_a.x - location_b.x); + int y_distance = std::abs(location_a.y - location_b.y); // move along y-dim to exit from the shelf to go to a higher shelf - if(shelf_b > shelf_a) { + if (location_b.shelf > location_a.shelf) { // this is already verified where galaxy_shelves_exit_chip_coords_per_y_dim is populated, but just to be safe - log_assert(galaxy_shelves_exit_chip_coords_per_y_dim.find(shelf_a) != 
galaxy_shelves_exit_chip_coords_per_y_dim.end(), + log_assert( + galaxy_shelves_exit_chip_coords_per_y_dim.find(location_a.shelf) != + galaxy_shelves_exit_chip_coords_per_y_dim.end(), "Expected shelf-to-shelf connection"); // this row does not have a shelf-to-shelf connection - if(galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_a).find(y_a) == galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_a).end()) { + if (galaxy_shelves_exit_chip_coords_per_y_dim.at(location_a.shelf).find(location_a.y) == + galaxy_shelves_exit_chip_coords_per_y_dim.at(location_a.shelf).end()) { return std::numeric_limits::max(); } - const Chip2ChipConnection& shelf_to_shelf_connection = galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_a).at(y_a); - log_assert(shelf_to_shelf_connection.destination_chip_coords.size(), "Expecting at least one shelf-to-shelf connection, possibly one-to-many"); + const Chip2ChipConnection &shelf_to_shelf_connection = + galaxy_shelves_exit_chip_coords_per_y_dim.at(location_a.shelf).at(location_a.y); + log_assert( + shelf_to_shelf_connection.destination_chip_coords.size(), + "Expecting at least one shelf-to-shelf connection, possibly one-to-many"); - // for each shelf-to-shelf connection at y_a, find the distance to location_b, take min + // for each shelf-to-shelf connection at location_a.y, find the distance to location_b, take min int distance = std::numeric_limits::max(); eth_coord_t exit_shelf = shelf_to_shelf_connection.source_chip_coord; - for(eth_coord_t next_shelf : shelf_to_shelf_connection.destination_chip_coords) { - - log_assert(std::get<1>(exit_shelf) == y_a && std::get<3>(exit_shelf) == shelf_a && std::get<2>(exit_shelf) == rack_a, + for (eth_coord_t next_shelf : shelf_to_shelf_connection.destination_chip_coords) { + log_assert( + exit_shelf.y == location_a.y && exit_shelf.shelf == location_a.shelf && + exit_shelf.rack == location_a.rack, "Invalid shelf exit coordinates"); // next shelf could be at a different y-dim in nebula->galaxy systems - 
log_assert(std::get<3>(next_shelf) == (shelf_a+1) && std::get<2>(next_shelf) == rack_a, + log_assert( + next_shelf.shelf == (location_a.shelf + 1) && next_shelf.rack == location_a.rack, "Invalid shelf entry coordinates"); // hop onto the next shelf and find distance from there int distance_to_exit = get_ethernet_link_coord_distance(location_a, exit_shelf); int distance_in_next_shelf = get_ethernet_link_coord_distance(next_shelf, location_b); // no path found - if(distance_to_exit == std::numeric_limits::max() || distance_in_next_shelf == std::numeric_limits::max()) { + if (distance_to_exit == std::numeric_limits::max() || + distance_in_next_shelf == std::numeric_limits::max()) { continue; } distance = std::min(distance, distance_to_exit + distance_in_next_shelf + 1); } - log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", - std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a), - std::get<0>(location_b), std::get<1>(location_b), std::get<2>(location_b), std::get<3>(location_b), distance); + log_trace( + LogSiliconDriver, + "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", + location_a.x, + location_a.y, + location_a.rack, + location_a.shelf, + location_b.x, + location_b.y, + location_b.rack, + location_b.shelf, + distance); return distance; - } - else if(shelf_a > shelf_b) { - + } else if (location_a.shelf > location_b.shelf) { // this is already verified where galaxy_shelves_exit_chip_coords_per_y_dim is populated, but just to be safe - log_assert(galaxy_shelves_exit_chip_coords_per_y_dim.find(shelf_b) != galaxy_shelves_exit_chip_coords_per_y_dim.end(), + log_assert( + galaxy_shelves_exit_chip_coords_per_y_dim.find(location_b.shelf) != + galaxy_shelves_exit_chip_coords_per_y_dim.end(), "Expected shelf-to-shelf connection"); // this row does not have a shelf-to-shelf connection - if(galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_b).find(y_b) == 
galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_b).end()) { + if (galaxy_shelves_exit_chip_coords_per_y_dim.at(location_b.shelf).find(location_b.y) == + galaxy_shelves_exit_chip_coords_per_y_dim.at(location_b.shelf).end()) { return std::numeric_limits::max(); } - const Chip2ChipConnection& shelf_to_shelf_connection = galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_b).at(y_b); - log_assert(shelf_to_shelf_connection.destination_chip_coords.size(), "Expecting at least one shelf-to-shelf connection, possibly one-to-many") + const Chip2ChipConnection &shelf_to_shelf_connection = + galaxy_shelves_exit_chip_coords_per_y_dim.at(location_b.shelf).at(location_b.y); + log_assert( + shelf_to_shelf_connection.destination_chip_coords.size(), + "Expecting at least one shelf-to-shelf connection, possibly one-to-many") - // for each shelf-to-shelf connection at y_b, find the distance to location_a, take min - int distance = std::numeric_limits::max(); + // for each shelf-to-shelf connection at location_b.y, find the distance to location_a, take min + int distance = std::numeric_limits::max(); eth_coord_t exit_shelf = shelf_to_shelf_connection.source_chip_coord; - for(eth_coord_t next_shelf : shelf_to_shelf_connection.destination_chip_coords) { - - log_assert(std::get<1>(exit_shelf) == y_b && std::get<3>(exit_shelf) == shelf_b && std::get<2>(exit_shelf) == rack_b, + for (eth_coord_t next_shelf : shelf_to_shelf_connection.destination_chip_coords) { + log_assert( + exit_shelf.y == location_b.y && exit_shelf.shelf == location_b.shelf && + exit_shelf.rack == location_b.rack, "Invalid shelf exit coordinates"); // next shelf could be at a different y-dim in nebula->galaxy systems - log_assert(std::get<3>(next_shelf) == (shelf_b+1) && std::get<2>(next_shelf) == rack_b, + log_assert( + next_shelf.shelf == (location_b.shelf + 1) && next_shelf.rack == location_b.rack, "Invalid shelf entry coordinates"); // hop onto the next shelf and find distance from there int distance_to_exit = 
get_ethernet_link_coord_distance(location_b, exit_shelf); int distance_in_next_shelf = get_ethernet_link_coord_distance(next_shelf, location_a); // no path found - if(distance_to_exit == std::numeric_limits::max() || distance_in_next_shelf == std::numeric_limits::max()) { + if (distance_to_exit == std::numeric_limits::max() || + distance_in_next_shelf == std::numeric_limits::max()) { continue; } distance = std::min(distance, distance_to_exit + distance_in_next_shelf + 1); } - log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", - std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a), - std::get<0>(location_b), std::get<1>(location_b), std::get<2>(location_b), std::get<3>(location_b), distance); + log_trace( + LogSiliconDriver, + "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", + location_a.x, + location_a.y, + location_a.rack, + location_a.shelf, + location_b.x, + location_b.y, + location_b.rack, + location_b.shelf, + distance); return distance; } // move along y-dim to exit from the shelf to go to a higher shelf - if(rack_b > rack_a) { - + if (location_b.rack > location_a.rack) { // this is already verified where galaxy_racks_exit_chip_coords_per_x_dim is populated, but just to be safe - log_assert(galaxy_racks_exit_chip_coords_per_x_dim.find(rack_a) != galaxy_racks_exit_chip_coords_per_x_dim.end(), + log_assert( + galaxy_racks_exit_chip_coords_per_x_dim.find(location_a.rack) != + galaxy_racks_exit_chip_coords_per_x_dim.end(), "Expected rack-to-rack connection"); // this row does not have a rack-to-rack connection - if(galaxy_racks_exit_chip_coords_per_x_dim.at(rack_a).find(x_a) == galaxy_racks_exit_chip_coords_per_x_dim.at(rack_a).end()) { + if (galaxy_racks_exit_chip_coords_per_x_dim.at(location_a.rack).find(location_a.x) == + galaxy_racks_exit_chip_coords_per_x_dim.at(location_a.rack).end()) { return std::numeric_limits::max(); } - const Chip2ChipConnection& 
rack_to_rack_connection = galaxy_racks_exit_chip_coords_per_x_dim.at(rack_a).at(x_a); - log_assert(rack_to_rack_connection.destination_chip_coords.size(), "Expecting at least one rack-to-rack connection, possibly one-to-many"); + const Chip2ChipConnection &rack_to_rack_connection = + galaxy_racks_exit_chip_coords_per_x_dim.at(location_a.rack).at(location_a.x); + log_assert( + rack_to_rack_connection.destination_chip_coords.size(), + "Expecting at least one rack-to-rack connection, possibly one-to-many"); - // for each rack-to-rack connection at x_a, find the distance to location_b, take min + // for each rack-to-rack connection at location_a.x, find the distance to location_b, take min int distance = std::numeric_limits::max(); eth_coord_t exit_rack = rack_to_rack_connection.source_chip_coord; - for(eth_coord_t next_rack : rack_to_rack_connection.destination_chip_coords) { - - log_assert(std::get<0>(exit_rack) == x_a && std::get<3>(exit_rack) == shelf_a && std::get<2>(exit_rack) == rack_a, + for (eth_coord_t next_rack : rack_to_rack_connection.destination_chip_coords) { + log_assert( + exit_rack.x == location_a.x && exit_rack.shelf == location_a.shelf && exit_rack.rack == location_a.rack, "Invalid rack exit coordinates"); - log_assert(std::get<0>(next_rack) == x_a && std::get<3>(next_rack) == shelf_a && std::get<2>(next_rack) == (rack_a+1), + log_assert( + next_rack.x == location_a.x && next_rack.shelf == location_a.shelf && + next_rack.rack == (location_a.rack + 1), "Invalid rack entry coordinates"); // hop onto the next rack and find distance from there int distance_to_exit = get_ethernet_link_coord_distance(location_a, exit_rack); int distance_in_next_rack = get_ethernet_link_coord_distance(next_rack, location_b); // no path found - if (distance_to_exit == std::numeric_limits::max() || distance_in_next_rack == std::numeric_limits::max()) { + if (distance_to_exit == std::numeric_limits::max() || + distance_in_next_rack == std::numeric_limits::max()) { continue; } 
distance = std::min(distance, distance_to_exit + distance_in_next_rack + 1); } - log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", - std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a), - std::get<0>(location_b), std::get<1>(location_b), std::get<2>(location_b), std::get<3>(location_b), distance); + log_trace( + LogSiliconDriver, + "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", + location_a.x, + location_a.y, + location_a.rack, + location_a.shelf, + location_b.x, + location_b.y, + location_b.rack, + location_b.shelf, + distance); return distance; - } - else if(rack_a > rack_b) { - + } else if (location_a.rack > location_b.rack) { // this is already verified where galaxy_racks_exit_chip_coords_per_x_dim is populated, but just to be safe - log_assert(galaxy_racks_exit_chip_coords_per_x_dim.find(rack_b) != galaxy_racks_exit_chip_coords_per_x_dim.end(), + log_assert( + galaxy_racks_exit_chip_coords_per_x_dim.find(location_b.rack) != + galaxy_racks_exit_chip_coords_per_x_dim.end(), "Expected rack-to-rack connection"); // this row does not have a rack-to-rack connection - if(galaxy_racks_exit_chip_coords_per_x_dim.at(rack_b).find(x_b) == galaxy_racks_exit_chip_coords_per_x_dim.at(rack_b).end()) { + if (galaxy_racks_exit_chip_coords_per_x_dim.at(location_b.rack).find(location_b.x) == + galaxy_racks_exit_chip_coords_per_x_dim.at(location_b.rack).end()) { return std::numeric_limits::max(); } - const Chip2ChipConnection& rack_to_rack_connection = galaxy_racks_exit_chip_coords_per_x_dim.at(rack_b).at(x_b); - log_assert(rack_to_rack_connection.destination_chip_coords.size(), "Expecting at least one rack-to-rack connection, possibly one-to-many"); + const Chip2ChipConnection &rack_to_rack_connection = + galaxy_racks_exit_chip_coords_per_x_dim.at(location_b.rack).at(location_b.x); + log_assert( + rack_to_rack_connection.destination_chip_coords.size(), + "Expecting at least one 
rack-to-rack connection, possibly one-to-many"); - // for each rack-to-rack connection at x_a, find the distance to location_b, take min + // for each rack-to-rack connection at location_a.x, find the distance to location_b, take min int distance = std::numeric_limits::max(); eth_coord_t exit_rack = rack_to_rack_connection.source_chip_coord; - for(eth_coord_t next_rack : rack_to_rack_connection.destination_chip_coords) { - - log_assert(std::get<0>(exit_rack) == x_b && std::get<3>(exit_rack) == shelf_b && std::get<2>(exit_rack) == rack_b, + for (eth_coord_t next_rack : rack_to_rack_connection.destination_chip_coords) { + log_assert( + exit_rack.x == location_b.x && exit_rack.shelf == location_b.shelf && exit_rack.rack == location_b.rack, "Invalid rack exit coordinates"); - log_assert(std::get<0>(next_rack) == x_b && std::get<3>(next_rack) == shelf_b && std::get<2>(next_rack) == (rack_b+1), + log_assert( + next_rack.x == location_b.x && next_rack.shelf == location_b.shelf && + next_rack.rack == (location_b.rack + 1), "Invalid rack entry coordinates"); // hop onto the next rack and find distance from there int distance_to_exit = get_ethernet_link_coord_distance(location_b, exit_rack); int distance_in_next_rack = get_ethernet_link_coord_distance(next_rack, location_a); // no path found - if (distance_to_exit == std::numeric_limits::max() || distance_in_next_rack == std::numeric_limits::max()) { + if (distance_to_exit == std::numeric_limits::max() || + distance_in_next_rack == std::numeric_limits::max()) { continue; } distance = std::min(distance, distance_to_exit + distance_in_next_rack + 1); } - log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", - std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a), - std::get<0>(location_b), std::get<1>(location_b), std::get<2>(location_b), std::get<3>(location_b), distance); + log_trace( + LogSiliconDriver, + "\tdistance from ({}, {}, {}, {}) to 
({}, {}, {}, {}) is {}", + location_a.x, + location_a.y, + location_a.rack, + location_a.shelf, + location_b.x, + location_b.y, + location_b.rack, + location_b.shelf, + distance); return distance; } - log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", - std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a), - std::get<0>(location_b), std::get<1>(location_b), std::get<2>(location_b), std::get<3>(location_b), x_distance + y_distance); + log_trace( + LogSiliconDriver, + "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", + location_a.x, + location_a.y, + location_a.rack, + location_a.shelf, + location_b.x, + location_b.y, + location_b.rack, + location_b.shelf, + x_distance + y_distance); // on same shelf/rack, the distance is just x+y difference return x_distance + y_distance; @@ -261,14 +341,13 @@ int tt_ClusterDescriptor::get_ethernet_link_coord_distance(const eth_coord_t &lo // Returns the closest mmio chip to the given chip chip_id_t tt_ClusterDescriptor::get_closest_mmio_capable_chip(const chip_id_t chip) { - log_debug(LogSiliconDriver, "get_closest_mmio_chip to chip{}", chip); if (this->is_chip_mmio_capable(chip)) { return chip; } - if(closest_mmio_chip_cache.find(chip) != closest_mmio_chip_cache.end()) { + if (closest_mmio_chip_cache.find(chip) != closest_mmio_chip_cache.end()) { return closest_mmio_chip_cache[chip]; } @@ -280,15 +359,24 @@ chip_id_t tt_ClusterDescriptor::get_closest_mmio_capable_chip(const chip_id_t ch const chip_id_t &mmio_chip = pair.first; eth_coord_t mmio_eth_coord = this->chip_locations.at(mmio_chip); - log_debug(LogSiliconDriver, "Checking chip{} at ({}, {}, {}, {})", mmio_chip, std::get<0>(mmio_eth_coord), std::get<1>(mmio_eth_coord), std::get<2>(mmio_eth_coord), std::get<3>(mmio_eth_coord)); + log_debug( + LogSiliconDriver, + "Checking chip{} at ({}, {}, {}, {})", + mmio_chip, + mmio_eth_coord.x, + mmio_eth_coord.y, + mmio_eth_coord.rack, + 
mmio_eth_coord.shelf); int distance = get_ethernet_link_coord_distance(mmio_eth_coord, chip_eth_coord); + log_debug(LogSiliconDriver, "Distance from chip{} to chip{} is {}", chip, mmio_chip, distance); if (distance < min_distance) { min_distance = distance; closest_chip = mmio_chip; } } - log_assert(min_distance != std::numeric_limits::max(), "Chip{} is not connected to any MMIO capable chip", chip); + log_assert( + min_distance != std::numeric_limits::max(), "Chip{} is not connected to any MMIO capable chip", chip); log_assert(is_chip_mmio_capable(closest_chip), "Closest MMIO chip must be MMIO capable"); @@ -302,38 +390,45 @@ chip_id_t tt_ClusterDescriptor::get_closest_mmio_capable_chip(const chip_id_t ch std::string tt_ClusterDescriptor::get_cluster_descriptor_file_path() { static std::string yaml_path; static bool is_initialized = false; - if (!is_initialized){ - + if (!is_initialized) { // Cluster descriptor yaml will be created in a unique temporary directory. std::filesystem::path temp_path = std::filesystem::temp_directory_path(); std::string cluster_path_dir_template = temp_path / "umd_XXXXXX"; std::filesystem::path cluster_path_dir = mkdtemp(cluster_path_dir_template.data()); std::filesystem::path cluster_path = cluster_path_dir / "cluster_descriptor.yaml"; - if (!std::filesystem::exists(cluster_path)){ - auto val = system ( ("touch " + cluster_path.string()).c_str()); - if(val != 0) throw std::runtime_error("Cluster Generation Failed!"); + if (!std::filesystem::exists(cluster_path)) { + auto val = system(("touch " + cluster_path.string()).c_str()); + if (val != 0) { + throw std::runtime_error("Cluster Generation Failed!"); + } } - int val = create_ethernet_map((char*)cluster_path.string().c_str()); - if(val != 0) throw std::runtime_error("Cluster Generation Failed!"); + int val = create_ethernet_map((char *)cluster_path.string().c_str()); + if (val != 0) { + throw std::runtime_error("Cluster Generation Failed!"); + } yaml_path = cluster_path.string(); 
is_initialized = true; } return yaml_path; } -std::unique_ptr tt_ClusterDescriptor::create_from_yaml(const std::string &cluster_descriptor_file_path) { +std::unique_ptr tt_ClusterDescriptor::create_from_yaml( + const std::string &cluster_descriptor_file_path) { std::unique_ptr desc = std::unique_ptr(new tt_ClusterDescriptor()); std::ifstream fdesc(cluster_descriptor_file_path); if (fdesc.fail()) { - throw std::runtime_error(fmt::format("Error: cluster connectivity descriptor file {} does not exist!", cluster_descriptor_file_path)); + throw std::runtime_error(fmt::format( + "Error: cluster connectivity descriptor file {} does not exist!", cluster_descriptor_file_path)); } fdesc.close(); YAML::Node yaml = YAML::LoadFile(cluster_descriptor_file_path); tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(yaml, *desc); tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descriptor(yaml, *desc); + tt_ClusterDescriptor::merge_cluster_ids(*desc); + tt_ClusterDescriptor::fill_galaxy_connections(*desc); tt_ClusterDescriptor::load_harvesting_information(yaml, *desc); desc->enable_all_devices(); @@ -343,22 +438,31 @@ std::unique_ptr tt_ClusterDescriptor::create_from_yaml(con } std::unique_ptr tt_ClusterDescriptor::create_for_grayskull_cluster( - const std::set &logical_mmio_device_ids, - const std::vector &physical_mmio_device_ids) { + const std::set &logical_mmio_device_ids, const std::vector &physical_mmio_device_ids) { std::unique_ptr desc = std::unique_ptr(new tt_ClusterDescriptor()); // Some users need not care about physical ids, can provide empty set. - auto use_physical_ids = physical_mmio_device_ids.size() ? true : false; - auto largest_workload_logical_device_id = *logical_mmio_device_ids.rbegin(); // Last element in ordered set. 
- auto num_available_physical_devices = physical_mmio_device_ids.size(); - auto required_physical_devices = largest_workload_logical_device_id + 1; - - log_debug(tt::LogSiliconDriver, "{} - use_physical_ids: {} largest_workload_logical_device_id: {} num_available_physical_devices: {} required_physical_devices: {}", - __FUNCTION__, use_physical_ids, largest_workload_logical_device_id, num_available_physical_devices, required_physical_devices); - - log_assert(!use_physical_ids || num_available_physical_devices >= required_physical_devices, + auto use_physical_ids = physical_mmio_device_ids.size() ? true : false; + auto largest_workload_logical_device_id = *logical_mmio_device_ids.rbegin(); // Last element in ordered set. + auto num_available_physical_devices = physical_mmio_device_ids.size(); + auto required_physical_devices = largest_workload_logical_device_id + 1; + + log_debug( + tt::LogSiliconDriver, + "{} - use_physical_ids: {} largest_workload_logical_device_id: {} num_available_physical_devices: {} " + "required_physical_devices: {}", + __FUNCTION__, + use_physical_ids, + largest_workload_logical_device_id, + num_available_physical_devices, + required_physical_devices); + + log_assert( + !use_physical_ids || num_available_physical_devices >= required_physical_devices, "Insufficient silicon devices. Workload requires device_id: {} (ie. 
{} devices) but only {} present", - largest_workload_logical_device_id, required_physical_devices, num_available_physical_devices); + largest_workload_logical_device_id, + required_physical_devices, + num_available_physical_devices); // All Grayskull devices are MMIO mapped so physical_mmio_device_ids correspond to all available devices for (auto &logical_id : logical_mmio_device_ids) { @@ -367,8 +471,10 @@ std::unique_ptr tt_ClusterDescriptor::create_for_grayskull desc->all_chips.insert(logical_id); eth_coord_t chip_location{logical_id, 0, 0, 0}; desc->chip_locations.insert({logical_id, chip_location}); - desc->coords_to_chip_ids[std::get<2>(chip_location)][std::get<3>(chip_location)][std::get<1>(chip_location)][std::get<0>(chip_location)] = logical_id; - log_debug(tt::LogSiliconDriver, "{} - adding logical: {} => physical: {}", __FUNCTION__, logical_id, physical_id); + desc->coords_to_chip_ids[chip_location.rack][chip_location.shelf][chip_location.y][chip_location.x] = + logical_id; + log_debug( + tt::LogSiliconDriver, "{} - adding logical: {} => physical: {}", __FUNCTION__, logical_id, physical_id); } desc->enable_all_devices(); @@ -376,7 +482,8 @@ std::unique_ptr tt_ClusterDescriptor::create_for_grayskull return desc; } -void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc) { +void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descriptor( + YAML::Node &yaml, tt_ClusterDescriptor &desc) { log_assert(yaml["ethernet_connections"].IsSequence(), "Invalid YAML"); for (YAML::Node &connected_endpoints : yaml["ethernet_connections"].as>()) { log_assert(connected_endpoints.IsSequence(), "Invalid YAML"); @@ -409,7 +516,13 @@ void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descripto log_debug(LogSiliconDriver, "Ethernet Connectivity Descriptor:"); for (const auto &[chip, chan_to_chip_chan_map] : desc.ethernet_connections) { for (const auto &[chan, 
chip_and_chan] : chan_to_chip_chan_map) { - log_debug(LogSiliconDriver, "\tchip: {}, chan: {} <--> chip: {}, chan: {}", chip, chan, std::get<0>(chip_and_chan), std::get<1>(chip_and_chan)); + log_debug( + LogSiliconDriver, + "\tchip: {}, chan: {} <--> chip: {}, chan: {}", + chip, + chan, + chip_and_chan.x, + chip_and_chan.y); } } @@ -426,52 +539,64 @@ void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descripto } } } +} +void tt_ClusterDescriptor::fill_galaxy_connections(tt_ClusterDescriptor &desc) { int highest_shelf_id = 0; int highest_rack_id = 0; // shelves and racks can be connected at different chip coordinates - // determine which chips are connected to the next (i.e. higher id) shelf/rack and what the coordinate of the chip on the other shelf/rack is - // this is used in get_ethernet_link_coord_distance to find the distance between two chips + // determine which chips are connected to the next (i.e. higher id) shelf/rack and what the coordinate of the chip + // on the other shelf/rack is this is used in get_ethernet_link_coord_distance to find the distance between two + // chips for (const auto &[chip_id, chip_eth_coord] : desc.chip_locations) { - highest_shelf_id = std::max(highest_shelf_id, std::get<3>(chip_eth_coord)); - highest_rack_id = std::max(highest_rack_id, std::get<2>(chip_eth_coord)); + highest_shelf_id = std::max(highest_shelf_id, chip_eth_coord.shelf); + highest_rack_id = std::max(highest_rack_id, chip_eth_coord.rack); // iterate over all neighbors - if(desc.ethernet_connections.find(chip_id) == desc.ethernet_connections.end()) { - continue; // chip has no eth connections + if (desc.ethernet_connections.find(chip_id) == desc.ethernet_connections.end()) { + continue; // chip has no eth connections } for (const auto &[chan, chip_and_chan] : desc.ethernet_connections.at(chip_id)) { const chip_id_t &neighbor_chip = std::get<0>(chip_and_chan); eth_coord_t neighbor_eth_coord = desc.chip_locations.at(neighbor_chip); // shelves are 
connected in x-dim - if(std::get<3>(neighbor_eth_coord) != std::get<3>(chip_eth_coord)) { - eth_coord_t higher_shelf_coord = std::get<3>(neighbor_eth_coord) > std::get<3>(chip_eth_coord) ? neighbor_eth_coord : chip_eth_coord; - eth_coord_t lower_shelf_coord = std::get<3>(neighbor_eth_coord) < std::get<3>(chip_eth_coord) ? neighbor_eth_coord : chip_eth_coord; - int lower_shelf_id = std::get<3>(lower_shelf_coord); - int lower_shelf_y = std::get<1>(lower_shelf_coord); + if (neighbor_eth_coord.shelf != chip_eth_coord.shelf) { + eth_coord_t higher_shelf_coord = + neighbor_eth_coord.shelf > chip_eth_coord.shelf ? neighbor_eth_coord : chip_eth_coord; + eth_coord_t lower_shelf_coord = + neighbor_eth_coord.shelf < chip_eth_coord.shelf ? neighbor_eth_coord : chip_eth_coord; + int lower_shelf_id = lower_shelf_coord.shelf; + int lower_shelf_y = lower_shelf_coord.y; - auto& galaxy_shelf_exit_chip_coords_per_y_dim = desc.galaxy_shelves_exit_chip_coords_per_y_dim[lower_shelf_id]; + auto &galaxy_shelf_exit_chip_coords_per_y_dim = + desc.galaxy_shelves_exit_chip_coords_per_y_dim[lower_shelf_id]; log_assert( - galaxy_shelf_exit_chip_coords_per_y_dim.find(lower_shelf_y) == galaxy_shelf_exit_chip_coords_per_y_dim.end() || - galaxy_shelf_exit_chip_coords_per_y_dim[lower_shelf_y].source_chip_coord == lower_shelf_coord, + galaxy_shelf_exit_chip_coords_per_y_dim.find(lower_shelf_y) == + galaxy_shelf_exit_chip_coords_per_y_dim.end() || + galaxy_shelf_exit_chip_coords_per_y_dim[lower_shelf_y].source_chip_coord == lower_shelf_coord, "Expected a single exit chip on each shelf row"); galaxy_shelf_exit_chip_coords_per_y_dim[lower_shelf_y].source_chip_coord = lower_shelf_coord; - galaxy_shelf_exit_chip_coords_per_y_dim[lower_shelf_y].destination_chip_coords.insert(higher_shelf_coord); + galaxy_shelf_exit_chip_coords_per_y_dim[lower_shelf_y].destination_chip_coords.insert( + higher_shelf_coord); } // racks are connected in y-dim - if(std::get<2>(neighbor_eth_coord) != std::get<2>(chip_eth_coord)) 
{ - eth_coord_t higher_rack_coord = std::get<2>(neighbor_eth_coord) > std::get<2>(chip_eth_coord) ? neighbor_eth_coord : chip_eth_coord; - eth_coord_t lower_rack_coord = std::get<2>(neighbor_eth_coord) < std::get<2>(chip_eth_coord) ? neighbor_eth_coord : chip_eth_coord; - int lower_rack_id = std::get<2>(lower_rack_coord); - int lower_rack_x = std::get<0>(lower_rack_coord); + if (neighbor_eth_coord.rack != chip_eth_coord.rack) { + eth_coord_t higher_rack_coord = + neighbor_eth_coord.rack > chip_eth_coord.rack ? neighbor_eth_coord : chip_eth_coord; + eth_coord_t lower_rack_coord = + neighbor_eth_coord.rack < chip_eth_coord.rack ? neighbor_eth_coord : chip_eth_coord; + int lower_rack_id = lower_rack_coord.rack; + int lower_rack_x = lower_rack_coord.x; - auto& galaxy_rack_exit_chip_coords_per_x_dim = desc.galaxy_racks_exit_chip_coords_per_x_dim[lower_rack_id]; + auto &galaxy_rack_exit_chip_coords_per_x_dim = + desc.galaxy_racks_exit_chip_coords_per_x_dim[lower_rack_id]; log_assert( - galaxy_rack_exit_chip_coords_per_x_dim.find(lower_rack_x) == galaxy_rack_exit_chip_coords_per_x_dim.end() || - galaxy_rack_exit_chip_coords_per_x_dim[lower_rack_x].source_chip_coord == lower_rack_coord, + galaxy_rack_exit_chip_coords_per_x_dim.find(lower_rack_x) == + galaxy_rack_exit_chip_coords_per_x_dim.end() || + galaxy_rack_exit_chip_coords_per_x_dim[lower_rack_x].source_chip_coord == lower_rack_coord, "Expected a single exit chip on each rack column"); galaxy_rack_exit_chip_coords_per_x_dim[lower_rack_x].source_chip_coord = lower_rack_coord; galaxy_rack_exit_chip_coords_per_x_dim[lower_rack_x].destination_chip_coords.insert(higher_rack_coord); @@ -482,23 +607,36 @@ void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descripto // verify that every shelf (except the highest in id) is found in galaxy_shelves_exit_chip_coords_per_y_dim // this means that we expect the shelves to be connected linearly in a daisy-chain fashion. 
// shelf0->shelf1->shelf2->...->shelfN - for(int shelf_id = 0; shelf_id < highest_shelf_id; shelf_id++) { - log_assert(desc.galaxy_shelves_exit_chip_coords_per_y_dim.find(shelf_id) != desc.galaxy_shelves_exit_chip_coords_per_y_dim.end(), - "Expected shelf {} to be connected to the next shelf", shelf_id); + for (int shelf_id = 0; shelf_id < highest_shelf_id; shelf_id++) { + log_assert( + desc.galaxy_shelves_exit_chip_coords_per_y_dim.find(shelf_id) != + desc.galaxy_shelves_exit_chip_coords_per_y_dim.end(), + "Expected shelf {} to be connected to the next shelf", + shelf_id); } // this prints the exit chip coordinates for each shelf // this is used in get_ethernet_link_coord_distance to find the distance between two chips for (const auto &[shelf, shelf_exit_chip_coords_per_y_dim] : desc.galaxy_shelves_exit_chip_coords_per_y_dim) { for (const auto &[y_dim, shelf_exit_chip_coords] : shelf_exit_chip_coords_per_y_dim) { - log_debug(LogSiliconDriver, "shelf: {} y_dim: {} exit_coord:({}, {}, {}, {})", - shelf, y_dim, - std::get<0>(shelf_exit_chip_coords.source_chip_coord), std::get<1>(shelf_exit_chip_coords.source_chip_coord), - std::get<2>(shelf_exit_chip_coords.source_chip_coord), std::get<3>(shelf_exit_chip_coords.source_chip_coord)); + log_debug( + LogSiliconDriver, + "shelf: {} y_dim: {} exit_coord:({}, {}, {}, {})", + shelf, + y_dim, + shelf_exit_chip_coords.source_chip_coord.x, + shelf_exit_chip_coords.source_chip_coord.y, + shelf_exit_chip_coords.source_chip_coord.rack, + shelf_exit_chip_coords.source_chip_coord.shelf); for (const auto &destination_chip_coord : shelf_exit_chip_coords.destination_chip_coords) { // print shelf_exit_chip_coord in the format: (x, y, rack, shelf) - log_debug(LogSiliconDriver, "\tdestination_chip_coord: ({}, {}, {}, {})", - std::get<0>(destination_chip_coord), std::get<1>(destination_chip_coord), std::get<2>(destination_chip_coord), std::get<3>(destination_chip_coord)); + log_debug( + LogSiliconDriver, + "\tdestination_chip_coord: ({}, 
{}, {}, {})", + destination_chip_coord.x, + destination_chip_coord.y, + destination_chip_coord.rack, + destination_chip_coord.shelf); } } } @@ -506,28 +644,61 @@ void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descripto // verify that every rack (except the highest in id) is found in galaxy_racks_exit_chip_coords_per_x_dim // this means that we expect the racks to be connected linearly in a daisy-chain fashion. // rack0->rack1->rack2->...->rackN - for(int rack_id = 0; rack_id < highest_rack_id; rack_id++) { - log_assert(desc.galaxy_racks_exit_chip_coords_per_x_dim.find(rack_id) != desc.galaxy_racks_exit_chip_coords_per_x_dim.end(), - "Expected rack {} to be connected to the next rack", rack_id); + for (int rack_id = 0; rack_id < highest_rack_id; rack_id++) { + log_assert( + desc.galaxy_racks_exit_chip_coords_per_x_dim.find(rack_id) != + desc.galaxy_racks_exit_chip_coords_per_x_dim.end(), + "Expected rack {} to be connected to the next rack", + rack_id); } // this prints the exit chip coordinates for each rack // this is used in get_ethernet_link_coord_distance to find the distance between two chips for (const auto &[rack, rack_exit_chip_coords_per_x_dim] : desc.galaxy_racks_exit_chip_coords_per_x_dim) { for (const auto &[x_dim, rack_exit_chip_coords] : rack_exit_chip_coords_per_x_dim) { - log_debug(LogSiliconDriver, "rack: {} x_dim: {} exit_coord:({}, {}, {}, {})", rack, x_dim, - std::get<0>(rack_exit_chip_coords.source_chip_coord), std::get<1>(rack_exit_chip_coords.source_chip_coord), - std::get<2>(rack_exit_chip_coords.source_chip_coord), std::get<3>(rack_exit_chip_coords.source_chip_coord)); + log_debug( + LogSiliconDriver, + "rack: {} x_dim: {} exit_coord:({}, {}, {}, {})", + rack, + x_dim, + rack_exit_chip_coords.source_chip_coord.x, + rack_exit_chip_coords.source_chip_coord.y, + rack_exit_chip_coords.source_chip_coord.rack, + rack_exit_chip_coords.source_chip_coord.shelf); for (const auto &destination_chip_coord : 
rack_exit_chip_coords.destination_chip_coords) { - log_debug(LogSiliconDriver, "\tdestination_chip_coord: ({}, {}, {}, {})", - std::get<0>(destination_chip_coord), std::get<1>(destination_chip_coord), std::get<2>(destination_chip_coord), std::get<3>(destination_chip_coord)); + log_debug( + LogSiliconDriver, + "\tdestination_chip_coord: ({}, {}, {}, {})", + destination_chip_coord.x, + destination_chip_coord.y, + destination_chip_coord.rack, + destination_chip_coord.shelf); } } } } -void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc) { +void tt_ClusterDescriptor::merge_cluster_ids(tt_ClusterDescriptor &desc) { + DisjointSet chip_sets; + for (const auto &[chip, _] : desc.chip_locations) { + chip_sets.add_item(chip); + log_debug(LogSiliconDriver, "Adding chip {} to disjoint set", chip); + } + + for (const auto &[chip, chan_to_chip_chan_map] : desc.ethernet_connections) { + for (const auto &[chan, dest_chip_chan_tuple] : chan_to_chip_chan_map) { + chip_sets.merge(chip, std::get<0>(dest_chip_chan_tuple)); + log_debug(LogSiliconDriver, "Merging chip {} and chip {}", chip, std::get<0>(dest_chip_chan_tuple)); + } + } + + for (const auto &[chip, chip_eth_coords] : desc.chip_locations) { + desc.chip_locations[chip].cluster_id = chip_sets.get_set(chip); + log_debug(LogSiliconDriver, "Chip {} belongs to cluster {}", chip, chip_sets.get_set(chip)); + } +} +void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc) { for (YAML::const_iterator node = yaml["arch"].begin(); node != yaml["arch"].end(); ++node) { chip_id_t chip_id = node->first.as(); desc.all_chips.insert(chip_id); @@ -538,19 +709,18 @@ void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &y std::vector chip_rack_coords = node->second.as>(); log_assert(chip_rack_coords.size() == 4, "Galaxy (x, y, rack, shelf) coords must be size 4"); eth_coord_t chip_location{ - 
chip_rack_coords.at(0), chip_rack_coords.at(1), chip_rack_coords.at(2), chip_rack_coords.at(3)}; + chip_id, chip_rack_coords.at(0), chip_rack_coords.at(1), chip_rack_coords.at(2), chip_rack_coords.at(3)}; desc.chip_locations.insert({chip_id, chip_location}); - desc.coords_to_chip_ids[std::get<2>(chip_location)][std::get<3>(chip_location)][std::get<1>(chip_location)][std::get<0>(chip_location)] = chip_id; + desc.coords_to_chip_ids[chip_location.rack][chip_location.shelf][chip_location.y][chip_location.x] = chip_id; } - - for(const auto& chip : yaml["chips_with_mmio"]) { - if(chip.IsMap()) { + + for (const auto &chip : yaml["chips_with_mmio"]) { + if (chip.IsMap()) { const auto &chip_map = chip.as>(); const auto &chips = chip_map.begin(); desc.chips_with_mmio.insert({chips->first, chips->second}); - } - else { + } else { const auto &chip_val = chip.as(); desc.chips_with_mmio.insert({chip_val, chip_val}); } @@ -561,14 +731,14 @@ void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &y LogSiliconDriver, "\tchip: {}, EthCoord(x={}, y={}, rack={}, shelf={})", chip_id, - std::get<0>(chip_location), - std::get<1>(chip_location), - std::get<2>(chip_location), - std::get<3>(chip_location)); + chip_location.x, + chip_location.y, + chip_location.rack, + chip_location.shelf); } - if (yaml["boardtype"]) { - for (const auto& chip_board_type : yaml["boardtype"].as>()) { + if (yaml["boardtype"]) { + for (const auto &chip_board_type : yaml["boardtype"].as>()) { auto &chip = chip_board_type.first; BoardType board_type; if (chip_board_type.second == "n150") { @@ -579,25 +749,28 @@ void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &y board_type = BoardType::GALAXY; } else if (chip_board_type.second == "e150") { board_type = BoardType::E150; - } - else if (chip_board_type.second == "p150A") { + } else if (chip_board_type.second == "p150A") { board_type = BoardType::P150A; } else { - log_warning(LogSiliconDriver, "Unknown board type for 
chip {}. This might happen because chip is running old firmware. Defaulting to DEFAULT", chip); + log_warning( + LogSiliconDriver, + "Unknown board type for chip {}. This might happen because chip is running old firmware. " + "Defaulting to DEFAULT", + chip); board_type = BoardType::DEFAULT; } desc.chip_board_type.insert({chip, board_type}); } } else { - for (const auto& chip: desc.all_chips) { + for (const auto &chip : desc.all_chips) { desc.chip_board_type.insert({chip, BoardType::DEFAULT}); } } } void tt_ClusterDescriptor::load_harvesting_information(YAML::Node &yaml, tt_ClusterDescriptor &desc) { - if(yaml["harvesting"]) { - for (const auto& chip_node : yaml["harvesting"].as>()) { + if (yaml["harvesting"]) { + for (const auto &chip_node : yaml["harvesting"].as>()) { chip_id_t chip = chip_node.first; auto harvesting_info = chip_node.second; desc.noc_translation_enabled.insert({chip, harvesting_info["noc_translation"].as()}); @@ -606,9 +779,7 @@ void tt_ClusterDescriptor::load_harvesting_information(YAML::Node &yaml, tt_Clus } } -void tt_ClusterDescriptor::enable_all_devices() { - this->enabled_active_chips = this->all_chips; -} +void tt_ClusterDescriptor::enable_all_devices() { this->enabled_active_chips = this->all_chips; } void tt_ClusterDescriptor::fill_chips_grouped_by_closest_mmio() { for (const auto &chip : this->all_chips) { @@ -618,8 +789,10 @@ void tt_ClusterDescriptor::fill_chips_grouped_by_closest_mmio() { } } -const std::unordered_map > > tt_ClusterDescriptor::get_ethernet_connections() const { - auto eth_connections = std::unordered_map > >(); +const std::unordered_map>> +tt_ClusterDescriptor::get_ethernet_connections() const { + auto eth_connections = std:: + unordered_map>>(); for (const auto &[chip, channel_mapping] : this->ethernet_connections) { if (this->enabled_active_chips.find(chip) != this->enabled_active_chips.end()) { @@ -635,7 +808,7 @@ const std::unordered_map& tt_ClusterDescriptor::get_chip_locations() const { +const 
std::unordered_map &tt_ClusterDescriptor::get_chip_locations() const { static auto locations = std::unordered_map(); if (locations.empty() and !this->chip_locations.empty()) { for (auto chip_id : this->enabled_active_chips) { @@ -647,11 +820,14 @@ const std::unordered_map& tt_ClusterDescriptor::get_chip } chip_id_t tt_ClusterDescriptor::get_shelf_local_physical_chip_coords(chip_id_t virtual_coord) { - log_assert(!this->chip_locations.empty(), "Getting physical chip coordinates is only valid for systems where chips have coordinates"); + log_assert( + !this->chip_locations.empty(), + "Getting physical chip coordinates is only valid for systems where chips have coordinates"); // Physical cooridnates of chip inside a single rack. Calculated based on Galaxy topology. - // See: https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/wikis/uploads/23e7a5168f38dfb706f9887fde78cb03/image.png - int x = std::get<0>(get_chip_locations().at(virtual_coord)); - int y = std::get<1>(get_chip_locations().at(virtual_coord)); + // See: + // https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/wikis/uploads/23e7a5168f38dfb706f9887fde78cb03/image.png + int x = get_chip_locations().at(virtual_coord).x; + int y = get_chip_locations().at(virtual_coord).y; return 8 * x + y; } @@ -668,30 +844,31 @@ const std::unordered_map tt_ClusterDescriptor::get_chips_w return chips_map; } -const std::unordered_set& tt_ClusterDescriptor::get_all_chips() const { - return this->enabled_active_chips; -} +const std::unordered_set &tt_ClusterDescriptor::get_all_chips() const { return this->enabled_active_chips; } -const std::unordered_map& tt_ClusterDescriptor::get_harvesting_info() const { +const std::unordered_map &tt_ClusterDescriptor::get_harvesting_info() const { return harvesting_masks; } -const std::unordered_map& tt_ClusterDescriptor::get_noc_translation_table_en() const { +const std::unordered_map &tt_ClusterDescriptor::get_noc_translation_table_en() const { return 
noc_translation_enabled; } std::size_t tt_ClusterDescriptor::get_number_of_chips() const { return this->enabled_active_chips.size(); } int tt_ClusterDescriptor::get_ethernet_link_distance(chip_id_t chip_a, chip_id_t chip_b) const { - log_assert(!this->chip_locations.empty(), "Getting physical chip coordinates is only valid for systems where chips have coordinates"); + log_assert( + !this->chip_locations.empty(), + "Getting physical chip coordinates is only valid for systems where chips have coordinates"); return this->get_ethernet_link_coord_distance(chip_locations.at(chip_a), chip_locations.at(chip_b)); } BoardType tt_ClusterDescriptor::get_board_type(chip_id_t chip_id) const { - BoardType board_type = this->chip_board_type.at(chip_id); - return board_type; + BoardType board_type = this->chip_board_type.at(chip_id); + return board_type; } -const std::unordered_map>& tt_ClusterDescriptor::get_chips_grouped_by_closest_mmio() const { +const std::unordered_map> & +tt_ClusterDescriptor::get_chips_grouped_by_closest_mmio() const { return chips_grouped_by_closest_mmio; } diff --git a/device/tt_device.cpp b/device/tt_device.cpp new file mode 100644 index 00000000..071f6676 --- /dev/null +++ b/device/tt_device.cpp @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#ifdef TT_DEBUG_LOGGING +#define DEBUG_LOG(str) \ + do { \ + std::cout << str << std::endl; \ + } while (false) +#else +#define DEBUG_LOG(str) ((void)0) +#endif + +#include "tt_device.h" + +#include +#include +#include +#include +#include + +#include "device/tt_cluster_descriptor_types.h" +#include "yaml-cpp/yaml.h" + +//////// +// Device base +//////// +tt_device::tt_device() : soc_descriptor_per_chip({}) {} + +tt_device::~tt_device() {} + +const tt_SocDescriptor& tt_device::get_soc_descriptor(chip_id_t chip_id) const { + return soc_descriptor_per_chip.at(chip_id); +} diff --git a/device/tt_silicon_driver_common.cpp b/device/tt_silicon_driver_common.cpp index 0b42f5a3..0d6c8b62 100644 --- a/device/tt_silicon_driver_common.cpp +++ b/device/tt_silicon_driver_common.cpp @@ -3,36 +3,37 @@ // SPDX-License-Identifier: Apache-2.0 #include "umd/device/tt_silicon_driver_common.hpp" -#include "umd/device/tt_xy_pair.h" + #include "umd/device/cluster.h" +#include "umd/device/tt_xy_pair.h" std::string TensixSoftResetOptionsToString(TensixSoftResetOptions value) { std::string output; - if((value & TensixSoftResetOptions::BRISC) != TensixSoftResetOptions::NONE) { + if ((value & TensixSoftResetOptions::BRISC) != TensixSoftResetOptions::NONE) { output += "BRISC | "; } - if((value & TensixSoftResetOptions::TRISC0) != TensixSoftResetOptions::NONE) { + if ((value & TensixSoftResetOptions::TRISC0) != TensixSoftResetOptions::NONE) { output += "TRISC0 | "; } - if((value & TensixSoftResetOptions::TRISC1) != TensixSoftResetOptions::NONE) { + if ((value & TensixSoftResetOptions::TRISC1) != TensixSoftResetOptions::NONE) { output += "TRISC1 | "; } - if((value & TensixSoftResetOptions::TRISC2) != TensixSoftResetOptions::NONE) { + if ((value & TensixSoftResetOptions::TRISC2) != TensixSoftResetOptions::NONE) { output += "TRISC2 | "; } - if((value & TensixSoftResetOptions::NCRISC) != TensixSoftResetOptions::NONE) { + if ((value & 
TensixSoftResetOptions::NCRISC) != TensixSoftResetOptions::NONE) { output += "NCRISC | "; } - if((value & TensixSoftResetOptions::STAGGERED_START) != TensixSoftResetOptions::NONE) { + if ((value & TensixSoftResetOptions::STAGGERED_START) != TensixSoftResetOptions::NONE) { output += "STAGGERED_START | "; } - if(output.empty()) { - output = "UNKNOWN"; - } else { - output.erase(output.end() - 3, output.end()); - } + if (output.empty()) { + output = "UNKNOWN"; + } else { + output.erase(output.end() - 3, output.end()); + } - return output; + return output; } diff --git a/device/tt_soc_descriptor.cpp b/device/tt_soc_descriptor.cpp index 6d84a2b7..0aa80685 100644 --- a/device/tt_soc_descriptor.cpp +++ b/device/tt_soc_descriptor.cpp @@ -2,10 +2,10 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "yaml-cpp/yaml.h" #include "umd/device/tt_soc_descriptor.h" #include + #include #include #include @@ -13,53 +13,54 @@ #include #include "fmt/core.h" +#include "utils.hpp" +#include "yaml-cpp/yaml.h" // #include "l1_address_map.h" std::string format_node(tt_xy_pair xy) { return fmt::format("{}-{}", xy.x, xy.y); } tt_xy_pair format_node(std::string str) { - int x_coord; - int y_coord; - std::regex expr("([0-9]+)[-,xX]([0-9]+)"); - std::smatch x_y_pair; - - if (std::regex_search(str, x_y_pair, expr)) { - x_coord = std::stoi(x_y_pair[1]); - y_coord = std::stoi(x_y_pair[2]); - } else { - throw std::runtime_error(fmt::format("Could not parse the core id: {}", str)); - } + int x_coord; + int y_coord; + std::regex expr("([0-9]+)[-,xX]([0-9]+)"); + std::smatch x_y_pair; + + if (std::regex_search(str, x_y_pair, expr)) { + x_coord = std::stoi(x_y_pair[1]); + y_coord = std::stoi(x_y_pair[2]); + } else { + throw std::runtime_error(fmt::format("Could not parse the core id: {}", str)); + } - tt_xy_pair xy(x_coord, y_coord); + tt_xy_pair xy(x_coord, y_coord); - return xy; + return xy; } -const char* ws = " \t\n\r\f\v"; + +const char *ws = " \t\n\r\f\v"; // trim from end of string (right) 
-inline std::string& rtrim(std::string& s, const char* t = ws) -{ +inline std::string &rtrim(std::string &s, const char *t = ws) { s.erase(s.find_last_not_of(t) + 1); return s; } // trim from beginning of string (left) -inline std::string& ltrim(std::string& s, const char* t = ws) -{ +inline std::string <rim(std::string &s, const char *t = ws) { s.erase(0, s.find_first_not_of(t)); return s; } // trim from both ends of string (right then left) -inline std::string& trim(std::string& s, const char* t = ws) -{ - return ltrim(rtrim(s, t), t); -} +inline std::string &trim(std::string &s, const char *t = ws) { return ltrim(rtrim(s, t), t); } void tt_SocDescriptor::load_soc_features_from_device_descriptor(YAML::Node &device_descriptor_yaml) { overlay_version = device_descriptor_yaml["features"]["overlay"]["version"].as(); - noc_translation_id_enabled = device_descriptor_yaml["features"]["noc"] && device_descriptor_yaml["features"]["noc"]["translation_id_enabled"] ? device_descriptor_yaml["features"]["noc"]["translation_id_enabled"].as() : false; + noc_translation_id_enabled = + device_descriptor_yaml["features"]["noc"] && device_descriptor_yaml["features"]["noc"]["translation_id_enabled"] + ? 
device_descriptor_yaml["features"]["noc"]["translation_id_enabled"].as() + : false; packer_version = device_descriptor_yaml["features"]["packer"]["version"].as(); unpacker_version = device_descriptor_yaml["features"]["unpacker"]["version"].as(); dst_size_alignment = device_descriptor_yaml["features"]["math"]["dst_size_alignment"].as(); @@ -90,7 +91,8 @@ void tt_SocDescriptor::load_core_descriptors_from_device_descriptor(YAML::Node & } int current_dram_channel = 0; - for (auto channel_it = device_descriptor_yaml["dram"].begin(); channel_it != device_descriptor_yaml["dram"].end(); ++channel_it) { + for (auto channel_it = device_descriptor_yaml["dram"].begin(); channel_it != device_descriptor_yaml["dram"].end(); + ++channel_it) { dram_cores.push_back({}); auto &soc_dram_cores = dram_cores.at(dram_cores.size() - 1); const auto &dram_cores = (*channel_it).as>(); @@ -121,8 +123,8 @@ void tt_SocDescriptor::load_core_descriptors_from_device_descriptor(YAML::Node & std::vector worker_cores = device_descriptor_yaml["functional_workers"].as>(); std::set worker_routing_coords_x; std::set worker_routing_coords_y; - std::unordered_map routing_coord_worker_x; - std::unordered_map routing_coord_worker_y; + std::unordered_map routing_coord_worker_x; + std::unordered_map routing_coord_worker_y; for (const auto &core_string : worker_cores) { CoreDescriptor core_descriptor; core_descriptor.coord = format_node(core_string); @@ -137,12 +139,12 @@ void tt_SocDescriptor::load_core_descriptors_from_device_descriptor(YAML::Node & int func_x_start = 0; int func_y_start = 0; std::set::iterator it; - for (it=worker_routing_coords_x.begin(); it!=worker_routing_coords_x.end(); ++it) { + for (it = worker_routing_coords_x.begin(); it != worker_routing_coords_x.end(); ++it) { worker_log_to_routing_x[func_x_start] = *it; routing_x_to_worker_x[*it] = func_x_start; func_x_start++; } - for (it=worker_routing_coords_y.begin(); it!=worker_routing_coords_y.end(); ++it) { + for (it = 
worker_routing_coords_y.begin(); it != worker_routing_coords_y.end(); ++it) { worker_log_to_routing_y[func_y_start] = *it; routing_y_to_worker_y[*it] = func_y_start; func_y_start++; @@ -225,7 +227,8 @@ tt_virtual_coords tt_SocDescriptor::to_virtual_coords(tt_translated_coords trans tt_SocDescriptor::tt_SocDescriptor(std::string device_descriptor_path, std::size_t harvesting_mask) { std::ifstream fdesc(device_descriptor_path); if (fdesc.fail()) { - throw std::runtime_error(fmt::format("Error: device descriptor file {} does not exist!", device_descriptor_path)); + throw std::runtime_error( + fmt::format("Error: device descriptor file {} does not exist!", device_descriptor_path)); } fdesc.close(); @@ -233,10 +236,12 @@ tt_SocDescriptor::tt_SocDescriptor(std::string device_descriptor_path, std::size auto grid_size_x = device_descriptor_yaml["grid"]["x_size"].as(); auto grid_size_y = device_descriptor_yaml["grid"]["y_size"].as(); - int physical_grid_size_x = device_descriptor_yaml["physical"] && device_descriptor_yaml["physical"]["x_size"] ? - device_descriptor_yaml["physical"]["x_size"].as() : grid_size_x; - int physical_grid_size_y = device_descriptor_yaml["physical"] && device_descriptor_yaml["physical"]["y_size"] ? - device_descriptor_yaml["physical"]["y_size"].as() : grid_size_y; + int physical_grid_size_x = device_descriptor_yaml["physical"] && device_descriptor_yaml["physical"]["x_size"] + ? device_descriptor_yaml["physical"]["x_size"].as() + : grid_size_x; + int physical_grid_size_y = device_descriptor_yaml["physical"] && device_descriptor_yaml["physical"]["y_size"] + ? 
device_descriptor_yaml["physical"]["y_size"].as() + : grid_size_y; load_core_descriptors_from_device_descriptor(device_descriptor_yaml); grid_size = tt_xy_pair(grid_size_x, grid_size_y); physical_grid_size = tt_xy_pair(physical_grid_size_x, physical_grid_size_y); @@ -251,7 +256,7 @@ tt_SocDescriptor::tt_SocDescriptor(std::string device_descriptor_path, std::size int tt_SocDescriptor::get_num_dram_channels() const { int num_channels = 0; - for (auto& dram_core : dram_cores) { + for (auto &dram_core : dram_cores) { if (dram_core.size() > 0) { num_channels++; } @@ -273,6 +278,22 @@ bool tt_SocDescriptor::is_ethernet_core(const tt_xy_pair &core) const { return this->ethernet_core_channel_map.find(core) != ethernet_core_channel_map.end(); } +std::string tt_SocDescriptor::get_soc_descriptor_path(tt::ARCH arch) { + switch (arch) { + case tt::ARCH::GRAYSKULL: + // TODO: this path needs to be changed to point to soc descriptors outside of tests directory. + return tt::umd::utils::get_abs_path("tests/soc_descs/grayskull_10x12.yaml"); + case tt::ARCH::WORMHOLE_B0: + // TODO: this path needs to be changed to point to soc descriptors outside of tests directory. + return tt::umd::utils::get_abs_path("tests/soc_descs/wormhole_b0_8x10.yaml"); + case tt::ARCH::BLACKHOLE: + // TODO: this path needs to be changed to point to soc descriptors outside of tests directory. + return tt::umd::utils::get_abs_path("tests/soc_descs/blackhole_140_arch_no_eth.yaml"); + default: + throw std::runtime_error("Invalid architecture"); + } +} + std::ostream &operator<<(std::ostream &out, const tt::ARCH &arch_name) { if (arch_name == tt::ARCH::Invalid) { out << "none"; @@ -281,7 +302,7 @@ std::ostream &operator<<(std::ostream &out, const tt::ARCH &arch_name) { } else if (arch_name == tt::ARCH::WORMHOLE_B0) { out << "wormhole_b0"; } else if (arch_name == tt::ARCH::BLACKHOLE) { - out << "blackhole"; //Just how many ARCH-to-string functions do we plan to have, anyway? 
+ out << "blackhole"; // Just how many ARCH-to-string functions do we plan to have, anyway? } else { out << "ArchNameSerializationNotImplemented"; } diff --git a/device/wormhole/wormhole_coordinate_manager.cpp b/device/wormhole/wormhole_coordinate_manager.cpp index ddb088de..e9766d16 100644 --- a/device/wormhole/wormhole_coordinate_manager.cpp +++ b/device/wormhole/wormhole_coordinate_manager.cpp @@ -19,9 +19,11 @@ std::set WormholeCoordinateManager::get_y_coordinates_to_harvest(st } tt_translated_coords WormholeCoordinateManager::to_translated_coords(tt_logical_coords logical_coords) { - return tt_translated_coords(logical_coords.x + translated_coordinate_start_x, logical_coords.y + translated_coordinate_start_y); + return tt_translated_coords( + logical_coords.x + translated_coordinate_start_x, logical_coords.y + translated_coordinate_start_y); } tt_logical_coords WormholeCoordinateManager::to_logical_coords(tt_translated_coords translated_coords) { - return tt_logical_coords(translated_coords.x - translated_coordinate_start_x, translated_coords.y - translated_coordinate_start_y); + return tt_logical_coords( + translated_coords.x - translated_coordinate_start_x, translated_coords.y - translated_coordinate_start_y); } diff --git a/device/wormhole/wormhole_coordinate_manager.h b/device/wormhole/wormhole_coordinate_manager.h index 0c06d119..eda84809 100644 --- a/device/wormhole/wormhole_coordinate_manager.h +++ b/device/wormhole/wormhole_coordinate_manager.h @@ -9,16 +9,16 @@ #include "umd/device/coordinate_manager.h" class WormholeCoordinateManager : public CoordinateManager { - public: - WormholeCoordinateManager(const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) - : CoordinateManager(worker_grid_size, workers, harvesting_mask) {} + WormholeCoordinateManager( + const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) : + CoordinateManager(worker_grid_size, workers, harvesting_mask) {} 
tt_translated_coords to_translated_coords(tt_logical_coords logical_coords) override; tt_logical_coords to_logical_coords(tt_translated_coords translated_coords) override; -protected: +protected: std::set get_y_coordinates_to_harvest(std::size_t harvesting_mask) override; private: diff --git a/device/wormhole/wormhole_implementation.cpp b/device/wormhole/wormhole_implementation.cpp index c19e59fd..bd6e32e7 100644 --- a/device/wormhole/wormhole_implementation.cpp +++ b/device/wormhole/wormhole_implementation.cpp @@ -4,13 +4,12 @@ #include "umd/device/wormhole_implementation.h" -#include "wormhole/host_mem_address_map.h" -#include "wormhole/eth_interface.h" - #include "umd/device/cluster.h" +#include "wormhole/eth_interface.h" +#include "wormhole/host_mem_address_map.h" -constexpr std::uint32_t NOC_ADDR_LOCAL_BITS = 36; // source: noc_parameters.h, common for WH && BH -constexpr std::uint32_t NOC_ADDR_NODE_ID_BITS = 6; // source: noc_parameters.h, common for WH && BH +constexpr std::uint32_t NOC_ADDR_LOCAL_BITS = 36; // source: noc_parameters.h, common for WH && BH +constexpr std::uint32_t NOC_ADDR_NODE_ID_BITS = 6; // source: noc_parameters.h, common for WH && BH namespace tt::umd { @@ -98,7 +97,9 @@ std::pair wormhole_implementation::get_tlb_data( } tt_driver_host_address_params wormhole_implementation::get_host_address_params() const { - return {::wormhole::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE, ::wormhole::host_mem::address_map::ETH_ROUTING_BUFFERS_START}; + return { + ::wormhole::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE, + ::wormhole::host_mem::address_map::ETH_ROUTING_BUFFERS_START}; } tt_driver_eth_interface_params wormhole_implementation::get_eth_interface_params() const { diff --git a/device/xy_pair.cpp b/device/xy_pair.cpp index 0559f31c..ff9b7f95 100644 --- a/device/xy_pair.cpp +++ b/device/xy_pair.cpp @@ -11,6 +11,7 @@ namespace tt::umd { std::string xy_pair::str() const { return fmt::format("(x={},y={})", x, y); } + std::string 
cxy_pair::str() const { return fmt::format("(chip={},x={},y={})", chip, x, y); } } // namespace tt::umd diff --git a/tests/.clang-format b/tests/.clang-format deleted file mode 100644 index 9d159247..00000000 --- a/tests/.clang-format +++ /dev/null @@ -1,2 +0,0 @@ -DisableFormat: true -SortIncludes: false diff --git a/tests/api/cluster_descriptor_examples/blackhole_P150.yaml b/tests/api/cluster_descriptor_examples/blackhole_P150.yaml new file mode 100644 index 00000000..06232d98 --- /dev/null +++ b/tests/api/cluster_descriptor_examples/blackhole_P150.yaml @@ -0,0 +1,23 @@ +arch: { + 0: Blackhole, +} + +chips: { +} + +ethernet_connections: [ +] + +chips_with_mmio: [ + 0: 0, +] + +# harvest_mask is the bit indicating which tensix row is harvested. So bit 0 = first tensix row; bit 1 = second tensix row etc... +harvesting: { + 0: {noc_translation: false, harvest_mask: 0}, +} + +# This value will be null if the boardtype is unknown, should never happen in practice but to be defensive it would be useful to throw an error on this case. 
+boardtype: { + 0: null, +} \ No newline at end of file diff --git a/tests/api/cluster_descriptor_examples/galaxy.yaml b/tests/api/cluster_descriptor_examples/galaxy.yaml new file mode 100644 index 00000000..d2ca245c --- /dev/null +++ b/tests/api/cluster_descriptor_examples/galaxy.yaml @@ -0,0 +1,383 @@ +arch: { + 0: Wormhole, + 1: Wormhole, + 2: Wormhole, + 3: Wormhole, + 4: Wormhole, + 5: Wormhole, + 6: Wormhole, + 7: Wormhole, + 8: Wormhole, + 9: Wormhole, + 10: Wormhole, + 11: Wormhole, + 12: Wormhole, + 13: Wormhole, + 14: Wormhole, + 15: Wormhole, + 16: Wormhole, + 17: Wormhole, + 18: Wormhole, + 19: Wormhole, + 20: Wormhole, + 21: Wormhole, + 22: Wormhole, + 23: Wormhole, + 24: Wormhole, + 25: Wormhole, + 26: Wormhole, + 27: Wormhole, + 28: Wormhole, + 29: Wormhole, + 30: Wormhole, + 31: Wormhole, + 32: Wormhole, + 33: Wormhole, + 34: Wormhole, + 35: Wormhole, +} + +chips: { + 0: [0,3,0,0], + 1: [0,2,0,0], + 2: [0,1,0,0], + 3: [0,0,0,0], + 4: [3,6,0,1], + 5: [3,5,0,1], + 6: [2,5,0,1], + 7: [2,6,0,1], + 8: [1,6,0,1], + 9: [1,7,0,1], + 10: [2,7,0,1], + 11: [3,7,0,1], + 12: [0,7,0,1], + 13: [0,6,0,1], + 14: [0,5,0,1], + 15: [1,5,0,1], + 16: [1,4,0,1], + 17: [2,4,0,1], + 18: [3,4,0,1], + 19: [3,3,0,1], + 20: [2,3,0,1], + 21: [1,3,0,1], + 22: [1,2,0,1], + 23: [2,2,0,1], + 24: [3,2,0,1], + 25: [3,1,0,1], + 26: [2,1,0,1], + 27: [1,1,0,1], + 28: [1,0,0,1], + 29: [2,0,0,1], + 30: [3,0,0,1], + 31: [0,0,0,1], + 32: [0,1,0,1], + 33: [0,2,0,1], + 34: [0,3,0,1], + 35: [0,4,0,1], +} + +ethernet_connections: [ + [{chip: 0, chan: 6}, {chip: 11, chan: 12}], + [{chip: 0, chan: 7}, {chip: 4, chan: 12}], + [{chip: 1, chan: 6}, {chip: 5, chan: 12}], + [{chip: 1, chan: 7}, {chip: 18, chan: 12}], + [{chip: 2, chan: 6}, {chip: 19, chan: 12}], + [{chip: 2, chan: 7}, {chip: 24, chan: 12}], + [{chip: 3, chan: 6}, {chip: 25, chan: 12}], + [{chip: 3, chan: 7}, {chip: 30, chan: 12}], + [{chip: 4, chan: 0}, {chip: 11, chan: 0}], + [{chip: 4, chan: 1}, {chip: 11, chan: 1}], + [{chip: 4, 
chan: 2}, {chip: 11, chan: 2}], + [{chip: 4, chan: 3}, {chip: 11, chan: 3}], + [{chip: 4, chan: 4}, {chip: 7, chan: 12}], + [{chip: 4, chan: 5}, {chip: 7, chan: 13}], + [{chip: 4, chan: 6}, {chip: 7, chan: 14}], + [{chip: 4, chan: 7}, {chip: 7, chan: 15}], + [{chip: 4, chan: 8}, {chip: 5, chan: 8}], + [{chip: 4, chan: 9}, {chip: 5, chan: 9}], + [{chip: 4, chan: 10}, {chip: 5, chan: 10}], + [{chip: 4, chan: 11}, {chip: 5, chan: 11}], + [{chip: 5, chan: 0}, {chip: 18, chan: 0}], + [{chip: 5, chan: 1}, {chip: 18, chan: 1}], + [{chip: 5, chan: 2}, {chip: 18, chan: 2}], + [{chip: 5, chan: 3}, {chip: 18, chan: 3}], + [{chip: 5, chan: 4}, {chip: 6, chan: 12}], + [{chip: 5, chan: 5}, {chip: 6, chan: 13}], + [{chip: 5, chan: 6}, {chip: 6, chan: 14}], + [{chip: 5, chan: 7}, {chip: 6, chan: 15}], + [{chip: 6, chan: 0}, {chip: 17, chan: 0}], + [{chip: 6, chan: 1}, {chip: 17, chan: 1}], + [{chip: 6, chan: 2}, {chip: 17, chan: 2}], + [{chip: 6, chan: 3}, {chip: 17, chan: 3}], + [{chip: 6, chan: 4}, {chip: 15, chan: 12}], + [{chip: 6, chan: 5}, {chip: 15, chan: 13}], + [{chip: 6, chan: 6}, {chip: 15, chan: 14}], + [{chip: 6, chan: 7}, {chip: 15, chan: 15}], + [{chip: 6, chan: 8}, {chip: 7, chan: 8}], + [{chip: 6, chan: 9}, {chip: 7, chan: 9}], + [{chip: 6, chan: 10}, {chip: 7, chan: 10}], + [{chip: 6, chan: 11}, {chip: 7, chan: 11}], + [{chip: 7, chan: 0}, {chip: 10, chan: 0}], + [{chip: 7, chan: 1}, {chip: 10, chan: 1}], + [{chip: 7, chan: 2}, {chip: 10, chan: 2}], + [{chip: 7, chan: 3}, {chip: 10, chan: 3}], + [{chip: 7, chan: 4}, {chip: 8, chan: 12}], + [{chip: 7, chan: 5}, {chip: 8, chan: 13}], + [{chip: 7, chan: 6}, {chip: 8, chan: 14}], + [{chip: 7, chan: 7}, {chip: 8, chan: 15}], + [{chip: 8, chan: 0}, {chip: 15, chan: 0}], + [{chip: 8, chan: 1}, {chip: 15, chan: 1}], + [{chip: 8, chan: 2}, {chip: 15, chan: 2}], + [{chip: 8, chan: 3}, {chip: 15, chan: 3}], + [{chip: 8, chan: 4}, {chip: 13, chan: 12}], + [{chip: 8, chan: 5}, {chip: 13, chan: 13}], + [{chip: 8, chan: 6}, 
{chip: 13, chan: 14}], + [{chip: 8, chan: 7}, {chip: 13, chan: 15}], + [{chip: 8, chan: 8}, {chip: 9, chan: 8}], + [{chip: 8, chan: 9}, {chip: 9, chan: 9}], + [{chip: 8, chan: 10}, {chip: 9, chan: 10}], + [{chip: 8, chan: 11}, {chip: 9, chan: 11}], + [{chip: 9, chan: 4}, {chip: 12, chan: 12}], + [{chip: 9, chan: 5}, {chip: 12, chan: 13}], + [{chip: 9, chan: 6}, {chip: 12, chan: 14}], + [{chip: 9, chan: 7}, {chip: 12, chan: 15}], + [{chip: 9, chan: 12}, {chip: 10, chan: 4}], + [{chip: 9, chan: 13}, {chip: 10, chan: 5}], + [{chip: 9, chan: 14}, {chip: 10, chan: 6}], + [{chip: 9, chan: 15}, {chip: 10, chan: 7}], + [{chip: 10, chan: 12}, {chip: 11, chan: 4}], + [{chip: 10, chan: 13}, {chip: 11, chan: 5}], + [{chip: 10, chan: 14}, {chip: 11, chan: 6}], + [{chip: 10, chan: 15}, {chip: 11, chan: 7}], + [{chip: 12, chan: 8}, {chip: 13, chan: 8}], + [{chip: 12, chan: 9}, {chip: 13, chan: 9}], + [{chip: 12, chan: 10}, {chip: 13, chan: 10}], + [{chip: 12, chan: 11}, {chip: 13, chan: 11}], + [{chip: 13, chan: 0}, {chip: 14, chan: 0}], + [{chip: 13, chan: 1}, {chip: 14, chan: 1}], + [{chip: 13, chan: 2}, {chip: 14, chan: 2}], + [{chip: 13, chan: 3}, {chip: 14, chan: 3}], + [{chip: 14, chan: 8}, {chip: 35, chan: 8}], + [{chip: 14, chan: 9}, {chip: 35, chan: 9}], + [{chip: 14, chan: 10}, {chip: 35, chan: 10}], + [{chip: 14, chan: 11}, {chip: 35, chan: 11}], + [{chip: 14, chan: 12}, {chip: 15, chan: 4}], + [{chip: 14, chan: 13}, {chip: 15, chan: 5}], + [{chip: 14, chan: 14}, {chip: 15, chan: 6}], + [{chip: 14, chan: 15}, {chip: 15, chan: 7}], + [{chip: 15, chan: 8}, {chip: 16, chan: 8}], + [{chip: 15, chan: 9}, {chip: 16, chan: 9}], + [{chip: 15, chan: 10}, {chip: 16, chan: 10}], + [{chip: 15, chan: 11}, {chip: 16, chan: 11}], + [{chip: 16, chan: 0}, {chip: 21, chan: 0}], + [{chip: 16, chan: 1}, {chip: 21, chan: 1}], + [{chip: 16, chan: 2}, {chip: 21, chan: 2}], + [{chip: 16, chan: 3}, {chip: 21, chan: 3}], + [{chip: 16, chan: 4}, {chip: 35, chan: 12}], + [{chip: 16, chan: 5}, 
{chip: 35, chan: 13}], + [{chip: 16, chan: 6}, {chip: 35, chan: 14}], + [{chip: 16, chan: 7}, {chip: 35, chan: 15}], + [{chip: 16, chan: 12}, {chip: 17, chan: 4}], + [{chip: 16, chan: 13}, {chip: 17, chan: 5}], + [{chip: 16, chan: 14}, {chip: 17, chan: 6}], + [{chip: 16, chan: 15}, {chip: 17, chan: 7}], + [{chip: 17, chan: 8}, {chip: 20, chan: 8}], + [{chip: 17, chan: 9}, {chip: 20, chan: 9}], + [{chip: 17, chan: 10}, {chip: 20, chan: 10}], + [{chip: 17, chan: 11}, {chip: 20, chan: 11}], + [{chip: 17, chan: 12}, {chip: 18, chan: 4}], + [{chip: 17, chan: 13}, {chip: 18, chan: 5}], + [{chip: 17, chan: 14}, {chip: 18, chan: 6}], + [{chip: 17, chan: 15}, {chip: 18, chan: 7}], + [{chip: 18, chan: 8}, {chip: 19, chan: 8}], + [{chip: 18, chan: 9}, {chip: 19, chan: 9}], + [{chip: 18, chan: 10}, {chip: 19, chan: 10}], + [{chip: 18, chan: 11}, {chip: 19, chan: 11}], + [{chip: 19, chan: 0}, {chip: 24, chan: 0}], + [{chip: 19, chan: 1}, {chip: 24, chan: 1}], + [{chip: 19, chan: 2}, {chip: 24, chan: 2}], + [{chip: 19, chan: 3}, {chip: 24, chan: 3}], + [{chip: 19, chan: 4}, {chip: 20, chan: 12}], + [{chip: 19, chan: 5}, {chip: 20, chan: 13}], + [{chip: 19, chan: 6}, {chip: 20, chan: 14}], + [{chip: 19, chan: 7}, {chip: 20, chan: 15}], + [{chip: 20, chan: 0}, {chip: 23, chan: 0}], + [{chip: 20, chan: 1}, {chip: 23, chan: 1}], + [{chip: 20, chan: 2}, {chip: 23, chan: 2}], + [{chip: 20, chan: 3}, {chip: 23, chan: 3}], + [{chip: 20, chan: 4}, {chip: 21, chan: 12}], + [{chip: 20, chan: 5}, {chip: 21, chan: 13}], + [{chip: 20, chan: 6}, {chip: 21, chan: 14}], + [{chip: 20, chan: 7}, {chip: 21, chan: 15}], + [{chip: 21, chan: 4}, {chip: 34, chan: 12}], + [{chip: 21, chan: 5}, {chip: 34, chan: 13}], + [{chip: 21, chan: 6}, {chip: 34, chan: 14}], + [{chip: 21, chan: 7}, {chip: 34, chan: 15}], + [{chip: 21, chan: 8}, {chip: 22, chan: 8}], + [{chip: 21, chan: 9}, {chip: 22, chan: 9}], + [{chip: 21, chan: 10}, {chip: 22, chan: 10}], + [{chip: 21, chan: 11}, {chip: 22, chan: 11}], + [{chip: 
22, chan: 0}, {chip: 27, chan: 0}], + [{chip: 22, chan: 1}, {chip: 27, chan: 1}], + [{chip: 22, chan: 2}, {chip: 27, chan: 2}], + [{chip: 22, chan: 3}, {chip: 27, chan: 3}], + [{chip: 22, chan: 4}, {chip: 33, chan: 12}], + [{chip: 22, chan: 5}, {chip: 33, chan: 13}], + [{chip: 22, chan: 6}, {chip: 33, chan: 14}], + [{chip: 22, chan: 7}, {chip: 33, chan: 15}], + [{chip: 22, chan: 12}, {chip: 23, chan: 4}], + [{chip: 22, chan: 13}, {chip: 23, chan: 5}], + [{chip: 22, chan: 14}, {chip: 23, chan: 6}], + [{chip: 22, chan: 15}, {chip: 23, chan: 7}], + [{chip: 23, chan: 8}, {chip: 26, chan: 8}], + [{chip: 23, chan: 9}, {chip: 26, chan: 9}], + [{chip: 23, chan: 10}, {chip: 26, chan: 10}], + [{chip: 23, chan: 11}, {chip: 26, chan: 11}], + [{chip: 23, chan: 12}, {chip: 24, chan: 4}], + [{chip: 23, chan: 13}, {chip: 24, chan: 5}], + [{chip: 23, chan: 14}, {chip: 24, chan: 6}], + [{chip: 23, chan: 15}, {chip: 24, chan: 7}], + [{chip: 24, chan: 8}, {chip: 25, chan: 8}], + [{chip: 24, chan: 9}, {chip: 25, chan: 9}], + [{chip: 24, chan: 10}, {chip: 25, chan: 10}], + [{chip: 24, chan: 11}, {chip: 25, chan: 11}], + [{chip: 25, chan: 0}, {chip: 30, chan: 0}], + [{chip: 25, chan: 1}, {chip: 30, chan: 1}], + [{chip: 25, chan: 2}, {chip: 30, chan: 2}], + [{chip: 25, chan: 3}, {chip: 30, chan: 3}], + [{chip: 25, chan: 4}, {chip: 26, chan: 12}], + [{chip: 25, chan: 5}, {chip: 26, chan: 13}], + [{chip: 25, chan: 6}, {chip: 26, chan: 14}], + [{chip: 25, chan: 7}, {chip: 26, chan: 15}], + [{chip: 26, chan: 0}, {chip: 29, chan: 0}], + [{chip: 26, chan: 1}, {chip: 29, chan: 1}], + [{chip: 26, chan: 2}, {chip: 29, chan: 2}], + [{chip: 26, chan: 3}, {chip: 29, chan: 3}], + [{chip: 26, chan: 4}, {chip: 27, chan: 12}], + [{chip: 26, chan: 5}, {chip: 27, chan: 13}], + [{chip: 26, chan: 6}, {chip: 27, chan: 14}], + [{chip: 26, chan: 7}, {chip: 27, chan: 15}], + [{chip: 27, chan: 4}, {chip: 32, chan: 12}], + [{chip: 27, chan: 5}, {chip: 32, chan: 13}], + [{chip: 27, chan: 6}, {chip: 32, chan: 14}], 
+ [{chip: 27, chan: 7}, {chip: 32, chan: 15}], + [{chip: 27, chan: 8}, {chip: 28, chan: 8}], + [{chip: 27, chan: 9}, {chip: 28, chan: 9}], + [{chip: 27, chan: 10}, {chip: 28, chan: 10}], + [{chip: 27, chan: 11}, {chip: 28, chan: 11}], + [{chip: 28, chan: 4}, {chip: 31, chan: 12}], + [{chip: 28, chan: 5}, {chip: 31, chan: 13}], + [{chip: 28, chan: 6}, {chip: 31, chan: 14}], + [{chip: 28, chan: 7}, {chip: 31, chan: 15}], + [{chip: 28, chan: 12}, {chip: 29, chan: 4}], + [{chip: 28, chan: 13}, {chip: 29, chan: 5}], + [{chip: 28, chan: 14}, {chip: 29, chan: 6}], + [{chip: 28, chan: 15}, {chip: 29, chan: 7}], + [{chip: 29, chan: 12}, {chip: 30, chan: 4}], + [{chip: 29, chan: 13}, {chip: 30, chan: 5}], + [{chip: 29, chan: 14}, {chip: 30, chan: 6}], + [{chip: 29, chan: 15}, {chip: 30, chan: 7}], + [{chip: 31, chan: 8}, {chip: 32, chan: 8}], + [{chip: 31, chan: 9}, {chip: 32, chan: 9}], + [{chip: 31, chan: 10}, {chip: 32, chan: 10}], + [{chip: 31, chan: 11}, {chip: 32, chan: 11}], + [{chip: 32, chan: 0}, {chip: 33, chan: 0}], + [{chip: 32, chan: 1}, {chip: 33, chan: 1}], + [{chip: 32, chan: 2}, {chip: 33, chan: 2}], + [{chip: 32, chan: 3}, {chip: 33, chan: 3}], + [{chip: 33, chan: 8}, {chip: 34, chan: 8}], + [{chip: 33, chan: 9}, {chip: 34, chan: 9}], + [{chip: 33, chan: 10}, {chip: 34, chan: 10}], + [{chip: 33, chan: 11}, {chip: 34, chan: 11}], + [{chip: 34, chan: 0}, {chip: 35, chan: 0}], + [{chip: 34, chan: 1}, {chip: 35, chan: 1}], + [{chip: 34, chan: 2}, {chip: 35, chan: 2}], + [{chip: 34, chan: 3}, {chip: 35, chan: 3}], +] + +chips_with_mmio: [ + 0: 0, + 1: 1, + 2: 2, + 3: 3, +] + +# harvest_mask is the bit indicating which tensix row is harvested. So bit 0 = first tensix row; bit 1 = second tensix row etc... 
+harvesting: { + 0: {noc_translation: true, harvest_mask: 1}, + 1: {noc_translation: true, harvest_mask: 1}, + 2: {noc_translation: true, harvest_mask: 4}, + 3: {noc_translation: true, harvest_mask: 8}, + 4: {noc_translation: true, harvest_mask: 0}, + 5: {noc_translation: true, harvest_mask: 0}, + 6: {noc_translation: true, harvest_mask: 0}, + 7: {noc_translation: true, harvest_mask: 0}, + 8: {noc_translation: true, harvest_mask: 0}, + 9: {noc_translation: true, harvest_mask: 0}, + 10: {noc_translation: true, harvest_mask: 0}, + 11: {noc_translation: true, harvest_mask: 0}, + 12: {noc_translation: true, harvest_mask: 0}, + 13: {noc_translation: true, harvest_mask: 0}, + 14: {noc_translation: true, harvest_mask: 0}, + 15: {noc_translation: true, harvest_mask: 0}, + 16: {noc_translation: true, harvest_mask: 0}, + 17: {noc_translation: true, harvest_mask: 0}, + 18: {noc_translation: true, harvest_mask: 0}, + 19: {noc_translation: true, harvest_mask: 0}, + 20: {noc_translation: true, harvest_mask: 0}, + 21: {noc_translation: true, harvest_mask: 0}, + 22: {noc_translation: true, harvest_mask: 0}, + 23: {noc_translation: true, harvest_mask: 0}, + 24: {noc_translation: true, harvest_mask: 0}, + 25: {noc_translation: true, harvest_mask: 0}, + 26: {noc_translation: true, harvest_mask: 0}, + 27: {noc_translation: true, harvest_mask: 0}, + 28: {noc_translation: true, harvest_mask: 0}, + 29: {noc_translation: true, harvest_mask: 0}, + 30: {noc_translation: true, harvest_mask: 0}, + 31: {noc_translation: true, harvest_mask: 0}, + 32: {noc_translation: true, harvest_mask: 0}, + 33: {noc_translation: true, harvest_mask: 0}, + 34: {noc_translation: true, harvest_mask: 0}, + 35: {noc_translation: true, harvest_mask: 0}, +} + +# This value will be null if the boardtype is unknown, should never happen in practice but to be defensive it would be useful to throw an error on this case. 
+boardtype: { + 0: n150, + 1: n150, + 2: n150, + 3: n150, + 4: GALAXY, + 5: GALAXY, + 6: GALAXY, + 7: GALAXY, + 8: GALAXY, + 9: GALAXY, + 10: GALAXY, + 11: GALAXY, + 12: GALAXY, + 13: GALAXY, + 14: GALAXY, + 15: GALAXY, + 16: GALAXY, + 17: GALAXY, + 18: GALAXY, + 19: GALAXY, + 20: GALAXY, + 21: GALAXY, + 22: GALAXY, + 23: GALAXY, + 24: GALAXY, + 25: GALAXY, + 26: GALAXY, + 27: GALAXY, + 28: GALAXY, + 29: GALAXY, + 30: GALAXY, + 31: GALAXY, + 32: GALAXY, + 33: GALAXY, + 34: GALAXY, + 35: GALAXY, +} \ No newline at end of file diff --git a/tests/api/cluster_descriptor_examples/grayskull_E150.yaml b/tests/api/cluster_descriptor_examples/grayskull_E150.yaml new file mode 100644 index 00000000..6545cdad --- /dev/null +++ b/tests/api/cluster_descriptor_examples/grayskull_E150.yaml @@ -0,0 +1,23 @@ +arch: { + 0: Grayskull, +} + +chips: { +} + +ethernet_connections: [ +] + +chips_with_mmio: [ + 0: 0, +] + +# harvest_mask is the bit indicating which tensix row is harvested. So bit 0 = first tensix row; bit 1 = second tensix row etc... +harvesting: { + 0: {noc_translation: false, harvest_mask: 0}, +} + +# This value will be null if the boardtype is unknown, should never happen in practice but to be defensive it would be useful to throw an error on this case. +boardtype: { + 0: e150, +} \ No newline at end of file diff --git a/tests/api/cluster_descriptor_examples/grayskull_E300.yaml b/tests/api/cluster_descriptor_examples/grayskull_E300.yaml new file mode 100644 index 00000000..16a57168 --- /dev/null +++ b/tests/api/cluster_descriptor_examples/grayskull_E300.yaml @@ -0,0 +1,23 @@ +arch: { + 0: Grayskull, +} + +chips: { +} + +ethernet_connections: [ +] + +chips_with_mmio: [ + 0: 0, +] + +# harvest_mask is the bit indicating which tensix row is harvested. So bit 0 = first tensix row; bit 1 = second tensix row etc... 
+harvesting: { + 0: {noc_translation: false, harvest_mask: 514}, +} + +# This value will be null if the boardtype is unknown, should never happen in practice but to be defensive it would be useful to throw an error on this case. +boardtype: { + 0: e300, +} \ No newline at end of file diff --git a/tests/api/cluster_descriptor_examples/wormhole_2xN300_unconnected.yaml b/tests/api/cluster_descriptor_examples/wormhole_2xN300_unconnected.yaml new file mode 100644 index 00000000..896888d0 --- /dev/null +++ b/tests/api/cluster_descriptor_examples/wormhole_2xN300_unconnected.yaml @@ -0,0 +1,41 @@ +arch: { + 0: Wormhole, + 1: Wormhole, + 2: Wormhole, + 3: Wormhole, +} + +chips: { + 0: [0,0,0,0], + 1: [0,0,0,0], + 2: [1,0,0,0], + 3: [1,0,0,0], +} + +ethernet_connections: [ + [{chip: 0, chan: 8}, {chip: 2, chan: 0}], + [{chip: 0, chan: 9}, {chip: 2, chan: 1}], + [{chip: 1, chan: 8}, {chip: 3, chan: 0}], + [{chip: 1, chan: 9}, {chip: 3, chan: 1}], +] + +chips_with_mmio: [ + 0: 0, + 1: 1, +] + +# harvest_mask is the bit indicating which tensix row is harvested. So bit 0 = first tensix row; bit 1 = second tensix row etc... +harvesting: { + 0: {noc_translation: true, harvest_mask: 65}, + 1: {noc_translation: true, harvest_mask: 3}, + 2: {noc_translation: true, harvest_mask: 5}, + 3: {noc_translation: true, harvest_mask: 33}, +} + +# This value will be null if the boardtype is unknown, should never happen in practice but to be defensive it would be useful to throw an error on this case. 
+boardtype: { + 0: n300, + 1: n300, + 2: n300, + 3: n300, +} \ No newline at end of file diff --git a/tests/api/cluster_descriptor_examples/wormhole_N150.yaml b/tests/api/cluster_descriptor_examples/wormhole_N150.yaml new file mode 100644 index 00000000..c2dd123a --- /dev/null +++ b/tests/api/cluster_descriptor_examples/wormhole_N150.yaml @@ -0,0 +1,24 @@ +arch: { + 0: Wormhole, +} + +chips: { + 0: [0,0,0,0], +} + +ethernet_connections: [ +] + +chips_with_mmio: [ + 0: 0, +] + +# harvest_mask is the bit indicating which tensix row is harvested. So bit 0 = first tensix row; bit 1 = second tensix row etc... +harvesting: { + 0: {noc_translation: true, harvest_mask: 32}, +} + +# This value will be null if the boardtype is unknown, should never happen in practice but to be defensive it would be useful to throw an error on this case. +boardtype: { + 0: n150, +} \ No newline at end of file diff --git a/tests/api/cluster_descriptor_examples/wormhole_N300.yaml b/tests/api/cluster_descriptor_examples/wormhole_N300.yaml new file mode 100644 index 00000000..78f7822a --- /dev/null +++ b/tests/api/cluster_descriptor_examples/wormhole_N300.yaml @@ -0,0 +1,30 @@ +arch: { + 0: Wormhole, + 1: Wormhole, +} + +chips: { + 0: [0,0,0,0], + 1: [1,0,0,0], +} + +ethernet_connections: [ + [{chip: 0, chan: 8}, {chip: 1, chan: 0}], + [{chip: 0, chan: 9}, {chip: 1, chan: 1}], +] + +chips_with_mmio: [ + 0: 0, +] + +# harvest_mask is the bit indicating which tensix row is harvested. So bit 0 = first tensix row; bit 1 = second tensix row etc... +harvesting: { + 0: {noc_translation: true, harvest_mask: 65}, + 1: {noc_translation: true, harvest_mask: 5}, +} + +# This value will be null if the boardtype is unknown, should never happen in practice but to be defensive it would be useful to throw an error on this case. 
+boardtype: { + 0: n300, + 1: n300, +} \ No newline at end of file diff --git a/tests/api/test_chip.cpp b/tests/api/test_chip.cpp index d15ff66d..2e9a268f 100644 --- a/tests/api/test_chip.cpp +++ b/tests/api/test_chip.cpp @@ -5,55 +5,23 @@ // This file holds Chip specific API examples. #include -#include "fmt/xchar.h" #include #include #include #include +#include "fmt/xchar.h" #include "tests/test_utils/generate_cluster_desc.hpp" // TODO: change to tt_cluster +#include "umd/device/architecture_implementation.h" #include "umd/device/cluster.h" #include "umd/device/tt_cluster_descriptor.h" -#include "umd/device/architecture_implementation.h" using namespace tt::umd; -inline std::unique_ptr get_cluster_desc() { - // TODO: This should not be needed. And could be part of the cluster descriptor probably. - // Note that cluster descriptor holds logical ids of chips. - // Which are different than physical PCI ids, which are /dev/tenstorrent/N ones. - // You have to see if physical PCIe is GS before constructing a cluster descriptor. - std::vector pci_device_ids = PCIDevice::enumerate_devices(); - std::set pci_device_ids_set (pci_device_ids.begin(), pci_device_ids.end()); - - tt::ARCH device_arch = tt::ARCH::GRAYSKULL; - if (!pci_device_ids.empty()) { - // TODO: This should be removed from the API, the driver itself should do it. - int physical_device_id = pci_device_ids[0]; - // TODO: remove logical_device_id - PCIDevice pci_device (physical_device_id, 0); - device_arch = pci_device.get_arch(); - } - - // TODO: Make this test work on a host system without any tt devices. - if (pci_device_ids.empty()) { - std::cout << "No Tenstorrent devices found. Skipping test." << std::endl; - return nullptr; - } - - // TODO: Remove different branch for different archs - std::unique_ptr cluster_desc; - // TODO: remove getting manually cluster descriptor from yaml. 
- std::string yaml_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); - cluster_desc = tt_ClusterDescriptor::create_from_yaml(yaml_path); - - return cluster_desc; -} - -inline tt_cxy_pair get_tensix_chip_core_coord(const std::unique_ptr &umd_cluster) { +inline tt_cxy_pair get_tensix_chip_core_coord(const std::unique_ptr& umd_cluster) { chip_id_t any_mmio_chip = *umd_cluster->get_target_mmio_device_ids().begin(); const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(any_mmio_chip); tt_xy_pair core = soc_desc.workers[0]; @@ -61,61 +29,12 @@ inline tt_cxy_pair get_tensix_chip_core_coord(const std::unique_ptr &um } inline std::unique_ptr get_cluster() { - - // TODO: This should not be needed. And could be part of the cluster descriptor probably. - // Note that cluster descriptor holds logical ids of chips. - // Which are different than physical PCI ids, which are /dev/tenstorrent/N ones. - // You have to see if physical PCIe is GS before constructing a cluster descriptor. std::vector pci_device_ids = PCIDevice::enumerate_devices(); - std::set pci_device_ids_set (pci_device_ids.begin(), pci_device_ids.end()); - - tt::ARCH device_arch = tt::ARCH::GRAYSKULL; - if (!pci_device_ids.empty()) { - // TODO: This should be removed from the API, the driver itself should do it. - int physical_device_id = pci_device_ids[0]; - // TODO: remove logical_device_id - PCIDevice pci_device (physical_device_id, 0); - device_arch = pci_device.get_arch(); - } - // TODO: Make this test work on a host system without any tt devices. if (pci_device_ids.empty()) { - std::cout << "No Tenstorrent devices found. Skipping test." << std::endl; return nullptr; } - - std::string yaml_path; - if (device_arch == tt::ARCH::GRAYSKULL) { - yaml_path = ""; - } else if (device_arch == tt::ARCH::BLACKHOLE) { - yaml_path = test_utils::GetAbsPath("blackhole_1chip_cluster.yaml"); - } else { - // TODO: remove getting manually cluster descriptor from yaml. 
- yaml_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); - } - // TODO: Remove the need to do this, allow default constructor to construct with all chips. - std::unique_ptr cluster_desc = get_cluster_desc(); - std::unordered_set detected_num_chips = cluster_desc->get_all_chips(); - - // TODO: make this unordered vs set conversion not needed. - std::set detected_num_chips_set (detected_num_chips.begin(), detected_num_chips.end()); - - - // TODO: This would be incorporated inside SocDescriptor. - std::string soc_path; - if (device_arch == tt::ARCH::GRAYSKULL) { - soc_path = test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"); - } else if (device_arch == tt::ARCH::WORMHOLE_B0) { - soc_path = test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"); - } else if (device_arch == tt::ARCH::BLACKHOLE) { - soc_path = test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"); - } else { - throw std::runtime_error("Unsupported architecture"); - } - - - // TODO: Don't pass each of these arguments. - return std::unique_ptr(new Cluster(soc_path, tt_ClusterDescriptor::get_cluster_descriptor_file_path(), detected_num_chips_set)); + return std::unique_ptr(new Cluster()); } // TODO: Once default auto TLB setup is in, check it is setup properly. @@ -123,8 +42,7 @@ TEST(ApiChipTest, ManualTLBConfiguration) { std::unique_ptr umd_cluster = get_cluster(); if (umd_cluster == nullptr || umd_cluster->get_all_chips_in_cluster().empty()) { - std::cout << "No chips found. Skipping test." << std::endl; - return; + GTEST_SKIP() << "No chips present on the system. 
Skipping test."; } // Expect to throw for remote chip for any worker core @@ -150,16 +68,17 @@ TEST(ApiChipTest, ManualTLBConfiguration) { if (!is_worker_core) { return -1; } - return core.x + core.y * umd_cluster->get_pci_device(any_mmio_chip)->get_architecture_implementation()->get_grid_size_x(); + return core.x + + core.y * + umd_cluster->get_pci_device(any_mmio_chip)->get_architecture_implementation()->get_grid_size_x(); }; std::int32_t c_zero_address = 0; // Each MMIO chip has it's own set of TLBs, so needs its own configuration. - for (chip_id_t mmio_chip: umd_cluster->get_target_mmio_device_ids()) { - + for (chip_id_t mmio_chip : umd_cluster->get_target_mmio_device_ids()) { const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(mmio_chip); - for (tt_xy_pair core: soc_desc.workers) { + for (tt_xy_pair core : soc_desc.workers) { umd_cluster->configure_tlb(mmio_chip, core, get_static_tlb_index(core), c_zero_address); } @@ -183,8 +102,7 @@ TEST(ApiChipTest, SimpleAPIShowcase) { std::unique_ptr umd_cluster = get_cluster(); if (umd_cluster == nullptr || umd_cluster->get_all_chips_in_cluster().empty()) { - std::cout << "No chips found. Skipping test." << std::endl; - return; + GTEST_SKIP() << "No chips present on the system. Skipping test."; } chip_id_t chip_id = umd_cluster->get_cluster_description()->get_chips_with_mmio().begin()->first; @@ -198,7 +116,11 @@ TEST(ApiChipTest, SimpleAPIShowcase) { // It reads back the risc reset reg to validate TEST(ApiChipTest, DeassertRiscResetOnCore) { std::unique_ptr umd_cluster = get_cluster(); - + + if (umd_cluster == nullptr || umd_cluster->get_all_chips_in_cluster().empty()) { + GTEST_SKIP() << "No chips present on the system. 
Skipping test."; + } + tt_cxy_pair chip_core_coord = get_tensix_chip_core_coord(umd_cluster); umd_cluster->assert_risc_reset_at_core(chip_core_coord); @@ -218,6 +140,10 @@ TEST(ApiChipTest, DeassertRiscResetOnCore) { TEST(ApiChipTest, SpecifyLegalDeassertRiscResetOnCore) { std::unique_ptr umd_cluster = get_cluster(); + if (umd_cluster == nullptr || umd_cluster->get_all_chips_in_cluster().empty()) { + GTEST_SKIP() << "No chips present on the system. Skipping test."; + } + tt_cxy_pair chip_core_coord = get_tensix_chip_core_coord(umd_cluster); umd_cluster->assert_risc_reset_at_core(chip_core_coord); @@ -236,6 +162,10 @@ TEST(ApiChipTest, SpecifyLegalDeassertRiscResetOnCore) { TEST(ApiChipTest, SpecifyIllegalDeassertRiscResetOnCore) { std::unique_ptr umd_cluster = get_cluster(); + if (umd_cluster == nullptr || umd_cluster->get_all_chips_in_cluster().empty()) { + GTEST_SKIP() << "No chips present on the system. Skipping test."; + } + tt_cxy_pair chip_core_coord = get_tensix_chip_core_coord(umd_cluster); umd_cluster->assert_risc_reset_at_core(chip_core_coord); diff --git a/tests/api/test_cluster.cpp b/tests/api/test_cluster.cpp index c6f1285a..15aa28d2 100644 --- a/tests/api/test_cluster.cpp +++ b/tests/api/test_cluster.cpp @@ -13,15 +13,14 @@ #include "fmt/xchar.h" #include "tests/test_utils/generate_cluster_desc.hpp" - -#include "umd/device/tt_cluster_descriptor.h" #include "umd/device/cluster.h" +#include "umd/device/tt_cluster_descriptor.h" // TODO: obviously we need some other way to set this up +#include "noc/noc_parameters.h" #include "src/firmware/riscv/wormhole/eth_l1_address_map.h" #include "src/firmware/riscv/wormhole/host_mem_address_map.h" #include "src/firmware/riscv/wormhole/l1_address_map.h" -#include "noc/noc_parameters.h" using namespace tt::umd; @@ -30,93 +29,13 @@ using namespace tt::umd; // N150. N300 // Galaxy -// TODO: This function should not exist, the API itself should be simple enough. 
-inline std::unique_ptr get_cluster_desc() { - // TODO: This should not be needed. And could be part of the cluster descriptor probably. - // Note that cluster descriptor holds logical ids of chips. - // Which are different than physical PCI ids, which are /dev/tenstorrent/N ones. - // You have to see if physical PCIe is GS before constructing a cluster descriptor. - std::vector pci_device_ids = PCIDevice::enumerate_devices(); - std::set pci_device_ids_set(pci_device_ids.begin(), pci_device_ids.end()); - - tt::ARCH device_arch = tt::ARCH::GRAYSKULL; - if (!pci_device_ids.empty()) { - // TODO: This should be removed from the API, the driver itself should do it. - int physical_device_id = pci_device_ids[0]; - // TODO: remove logical_device_id - PCIDevice pci_device(physical_device_id, 0); - device_arch = pci_device.get_arch(); - } - - // TODO: Make this test work on a host system without any tt devices. - if (pci_device_ids.empty()) { - std::cout << "No Tenstorrent devices found. Skipping test." << std::endl; - return nullptr; - } - - std::unique_ptr cluster_desc; - // TODO: remove getting manually cluster descriptor from yaml. - std::string yaml_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); - cluster_desc = tt_ClusterDescriptor::create_from_yaml(yaml_path); - - return cluster_desc; -} - -// TODO: This function should not exist, the API itself should be simple enough. inline std::unique_ptr get_cluster() { - // TODO: This should not be needed. And could be part of the cluster descriptor probably. - // Note that cluster descriptor holds logical ids of chips. - // Which are different than physical PCI ids, which are /dev/tenstorrent/N ones. - // You have to see if physical PCIe is GS before constructing a cluster descriptor. 
std::vector pci_device_ids = PCIDevice::enumerate_devices(); - std::set pci_device_ids_set(pci_device_ids.begin(), pci_device_ids.end()); - - tt::ARCH device_arch = tt::ARCH::GRAYSKULL; - if (!pci_device_ids.empty()) { - // TODO: This should be removed from the API, the driver itself should do it. - int physical_device_id = pci_device_ids[0]; - // TODO: remove logical_device_id - PCIDevice pci_device(physical_device_id, 0); - device_arch = pci_device.get_arch(); - } - // TODO: Make this test work on a host system without any tt devices. if (pci_device_ids.empty()) { - std::cout << "No Tenstorrent devices found. Skipping test." << std::endl; return nullptr; } - - std::string yaml_path; - if (device_arch == tt::ARCH::GRAYSKULL) { - yaml_path = ""; - } else if (device_arch == tt::ARCH::BLACKHOLE) { - yaml_path = test_utils::GetAbsPath("blackhole_1chip_cluster.yaml"); - } else { - // TODO: remove getting manually cluster descriptor from yaml. - yaml_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); - } - // TODO: Remove the need to do this, allow default constructor to construct with all chips. - std::unique_ptr cluster_desc = get_cluster_desc(); - std::unordered_set detected_num_chips = cluster_desc->get_all_chips(); - - // TODO: make this unordered vs set conversion not needed. - std::set detected_num_chips_set(detected_num_chips.begin(), detected_num_chips.end()); - - // TODO: This would be incorporated inside SocDescriptor. - std::string soc_path; - if (device_arch == tt::ARCH::GRAYSKULL) { - soc_path = test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"); - } else if (device_arch == tt::ARCH::WORMHOLE_B0) { - soc_path = test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"); - } else if (device_arch == tt::ARCH::BLACKHOLE) { - soc_path = test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"); - } else { - throw std::runtime_error("Unsupported architecture"); - } - - // TODO: Don't pass each of these arguments. 
- return std::unique_ptr( - new Cluster(soc_path, tt_ClusterDescriptor::get_cluster_descriptor_file_path(), detected_num_chips_set)); + return std::unique_ptr(new Cluster()); } // TODO: Should not be wormhole specific. @@ -128,11 +47,9 @@ void setup_wormhole_remote(Cluster* umd_cluster) { // Populate address map and NOC parameters that the driver needs for remote transactions umd_cluster->set_device_l1_address_params( - { - l1_mem::address_map::L1_BARRIER_BASE, + {l1_mem::address_map::L1_BARRIER_BASE, eth_l1_mem::address_map::ERISC_BARRIER_BASE, - eth_l1_mem::address_map::FW_VERSION_ADDR - }); + eth_l1_mem::address_map::FW_VERSION_ADDR}); } } @@ -140,12 +57,12 @@ void setup_wormhole_remote(Cluster* umd_cluster) { TEST(ApiClusterTest, OpenAllChips) { std::unique_ptr umd_cluster = get_cluster(); } TEST(ApiClusterTest, SimpleIOAllChips) { - std::unique_ptr cluster_desc = get_cluster_desc(); std::unique_ptr umd_cluster = get_cluster(); + const tt_ClusterDescriptor* cluster_desc = umd_cluster->get_cluster_description(); + if (umd_cluster == nullptr || umd_cluster->get_all_chips_in_cluster().empty()) { - std::cout << "No chips found. Skipping test." << std::endl; - return; + GTEST_SKIP() << "No chips present on the system. Skipping test."; } // Initialize random data. @@ -198,12 +115,12 @@ TEST(ApiClusterTest, SimpleIOAllChips) { } TEST(ApiClusterTest, RemoteFlush) { - std::unique_ptr cluster_desc = get_cluster_desc(); std::unique_ptr umd_cluster = get_cluster(); + const tt_ClusterDescriptor* cluster_desc = umd_cluster->get_cluster_description(); + if (umd_cluster == nullptr || umd_cluster->get_all_chips_in_cluster().empty()) { - std::cout << "No chips found. Skipping test." << std::endl; - return; + GTEST_SKIP() << "No chips present on the system. Skipping test."; } size_t data_size = 1024; @@ -256,3 +173,61 @@ TEST(ApiClusterTest, RemoteFlush) { std::cout << "Testing whole cluster wait for remote chip flush again, should be no-op." 
<< std::endl; umd_cluster->wait_for_non_mmio_flush(); } + +TEST(ApiClusterTest, SimpleIOSpecificChips) { + std::unique_ptr umd_cluster = std::make_unique(0); + + const tt_ClusterDescriptor* cluster_desc = umd_cluster->get_cluster_description(); + + if (umd_cluster == nullptr || umd_cluster->get_all_chips_in_cluster().empty()) { + GTEST_SKIP() << "No chips present on the system. Skipping test."; + } + + // Initialize random data. + size_t data_size = 1024; + std::vector data(data_size, 0); + for (int i = 0; i < data_size; i++) { + data[i] = i % 256; + } + + // TODO: this should be part of constructor if it is mandatory. + setup_wormhole_remote(umd_cluster.get()); + + for (auto chip_id : umd_cluster->get_all_chips_in_cluster()) { + const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(chip_id); + + // TODO: figure out if core locations should contain chip_id + tt_xy_pair any_core = soc_desc.workers[0]; + tt_cxy_pair any_core_global(chip_id, any_core); + + if (cluster_desc->is_chip_remote(chip_id) && soc_desc.arch != tt::ARCH::WORMHOLE_B0) { + std::cout << "Skipping remote chip " << chip_id << " because it is not a wormhole_b0 chip." << std::endl; + continue; + } + + std::cout << "Writing to chip " << chip_id << " core " << any_core.str() << std::endl; + + umd_cluster->write_to_device(data.data(), data_size, any_core_global, 0, "LARGE_WRITE_TLB"); + } + + // Now read back the data. + for (auto chip_id : umd_cluster->get_all_chips_in_cluster()) { + const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(chip_id); + + // TODO: figure out if core locations should contain chip_id + tt_xy_pair any_core = soc_desc.workers[0]; + tt_cxy_pair any_core_global(chip_id, any_core); + + if (cluster_desc->is_chip_remote(chip_id) && soc_desc.arch != tt::ARCH::WORMHOLE_B0) { + std::cout << "Skipping remote chip " << chip_id << " because it is not a wormhole_b0 chip." 
<< std::endl; + continue; + } + + std::cout << "Reading from chip " << chip_id << " core " << any_core.str() << std::endl; + + std::vector readback_data(data_size, 0); + umd_cluster->read_from_device(readback_data.data(), any_core_global, 0, data_size, "LARGE_READ_TLB"); + + ASSERT_EQ(data, readback_data); + } +} diff --git a/tests/api/test_cluster_descriptor.cpp b/tests/api/test_cluster_descriptor.cpp index 867388ae..a6328b95 100644 --- a/tests/api/test_cluster_descriptor.cpp +++ b/tests/api/test_cluster_descriptor.cpp @@ -5,44 +5,22 @@ #include #include -#include #include +#include +#include "disjoint_set.hpp" #include "tests/test_utils/generate_cluster_desc.hpp" - #include "umd/device/pci_device.hpp" #include "umd/device/tt_cluster_descriptor.h" // TODO: Needed for detect_arch, remove when it is part of cluster descriptor. #include "umd/device/cluster.h" - inline std::unique_ptr get_cluster_desc() { - - std::vector pci_device_ids = PCIDevice::enumerate_devices(); - std::set pci_device_ids_set (pci_device_ids.begin(), pci_device_ids.end()); - - // TODO: This test requires knowledge of the device architecture, which should not be true. - tt::ARCH device_arch = tt::ARCH::GRAYSKULL; - if (!pci_device_ids.empty()) { - int physical_device_id = pci_device_ids[0]; - PCIDevice pci_device (physical_device_id, 0); - device_arch = pci_device.get_arch(); - } - - // TODO: Make this test work on a host system without any tt devices. - if (pci_device_ids.empty()) { - std::cout << "No Tenstorrent devices found. Skipping test." << std::endl; - return nullptr; - } - - // TODO: Remove different branch for different archs - std::unique_ptr cluster_desc; // TODO: remove getting manually cluster descriptor from yaml. 
std::string yaml_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); - cluster_desc = tt_ClusterDescriptor::create_from_yaml(yaml_path); - return cluster_desc; + return tt_ClusterDescriptor::create_from_yaml(yaml_path); } TEST(ApiClusterDescriptorTest, DetectArch) { @@ -65,11 +43,10 @@ TEST(ApiClusterDescriptorTest, DetectArch) { } TEST(ApiClusterDescriptorTest, BasicFunctionality) { - std::unique_ptr cluster_desc = get_cluster_desc(); if (cluster_desc == nullptr) { - return; + GTEST_SKIP() << "No chips present on the system. Skipping test."; } std::unordered_set all_chips = cluster_desc->get_all_chips(); @@ -77,7 +54,7 @@ TEST(ApiClusterDescriptorTest, BasicFunctionality) { std::unordered_map eth_chip_coords = cluster_desc->get_chip_locations(); std::unordered_map local_chips_to_pci_device_id = cluster_desc->get_chips_with_mmio(); std::unordered_set local_chips; - for (auto [chip, _]: local_chips_to_pci_device_id) { + for (auto [chip, _] : local_chips_to_pci_device_id) { local_chips.insert(chip); } std::unordered_set remote_chips; @@ -87,65 +64,58 @@ TEST(ApiClusterDescriptorTest, BasicFunctionality) { } } - std::unordered_map> chips_grouped_by_closest_mmio = cluster_desc->get_chips_grouped_by_closest_mmio(); + std::unordered_map> chips_grouped_by_closest_mmio = + cluster_desc->get_chips_grouped_by_closest_mmio(); } -// A standard disjoint set data structure to track connected components. 
-class DisjointSet { - public: - void add_item(int item) { - parent[item] = item; +TEST(ApiClusterDescriptorTest, TestAllOfflineClusterDescriptors) { + for (std::string cluster_desc_yaml : { + "blackhole_P150.yaml", + "galaxy.yaml", + "grayskull_E150.yaml", + "grayskull_E300.yaml", + "wormhole_2xN300_unconnected.yaml", + "wormhole_N150.yaml", + "wormhole_N300.yaml", + }) { + std::cout << "Testing " << cluster_desc_yaml << std::endl; + std::unique_ptr cluster_desc = tt_ClusterDescriptor::create_from_yaml( + test_utils::GetAbsPath("tests/api/cluster_descriptor_examples/" + cluster_desc_yaml)); + + std::unordered_set all_chips = cluster_desc->get_all_chips(); + std::unordered_map harvesting_for_chips = cluster_desc->get_harvesting_info(); + std::unordered_map eth_chip_coords = cluster_desc->get_chip_locations(); + std::unordered_map local_chips_to_pci_device_id = cluster_desc->get_chips_with_mmio(); + std::unordered_set local_chips; + for (auto [chip, _] : local_chips_to_pci_device_id) { + local_chips.insert(chip); } - - int get_parent(int item) { - while (parent[item] != item) { - item = parent[item]; + std::unordered_set remote_chips; + for (auto chip : all_chips) { + if (local_chips.find(chip) == local_chips.end()) { + remote_chips.insert(chip); } - return item; } - void merge(int item1, int item2) { - int parent1 = get_parent(item1); - int parent2 = get_parent(item2); - parent[parent1] = parent2; - } - - bool are_same_set(int item1, int item2) { - return get_parent(item1) == get_parent(item2); - } - - int get_num_sets() { - std::unordered_set sets; - for (auto [item, _]: parent) { - sets.insert(get_parent(item)); - } - return sets.size(); - } - - private: - std::unordered_map parent; -}; + std::unordered_map> chips_grouped_by_closest_mmio = + cluster_desc->get_chips_grouped_by_closest_mmio(); + } +} -// This tests fails on a machine with multiple cards. -// It works as long as all the devices that are discoverable are connected through ethernet. 
-// Our ClusterDescriptor doesn't have a notion of multiple unconnected clusters of cards. TEST(ApiClusterDescriptorTest, SeparateClusters) { - std::unique_ptr cluster_desc = get_cluster_desc(); - - if (cluster_desc == nullptr) { - return; - } + std::unique_ptr cluster_desc = tt_ClusterDescriptor::create_from_yaml( + test_utils::GetAbsPath("tests/api/cluster_descriptor_examples/wormhole_2xN300_unconnected.yaml")); auto all_chips = cluster_desc->get_all_chips(); - DisjointSet chip_clusters; + DisjointSet chip_clusters; for (auto chip : all_chips) { chip_clusters.add_item(chip); } // Merge into clusters of chips. - for (auto connection: cluster_desc->get_ethernet_connections()) { + for (auto connection : cluster_desc->get_ethernet_connections()) { chip_id_t chip = connection.first; - for (auto [channel, remote_chip_and_channel]: connection.second) { + for (auto [channel, remote_chip_and_channel] : connection.second) { chip_id_t remote_chip = std::get<0>(remote_chip_and_channel); chip_clusters.merge(chip, remote_chip); } diff --git a/tests/api/test_mockup_device.cpp b/tests/api/test_mockup_device.cpp index d687075e..bb8001ea 100644 --- a/tests/api/test_mockup_device.cpp +++ b/tests/api/test_mockup_device.cpp @@ -11,8 +11,8 @@ #include #include "device/mockup/tt_mockup_device.hpp" -#include "umd/device/tt_arch_types.h" #include "tests/test_utils/generate_cluster_desc.hpp" +#include "umd/device/tt_arch_types.h" namespace test::mockup_device { @@ -25,14 +25,18 @@ std::string get_env_arch_name() { } tt::ARCH get_arch_from_string(const std::string &arch_str) { - if (arch_str == "grayskull" || arch_str == "GRAYSKULL") + if (arch_str == "grayskull" || arch_str == "GRAYSKULL") { return tt::ARCH::GRAYSKULL; - if (arch_str == "wormhole_b0" || arch_str == "WORMHOLE_B0") + } + if (arch_str == "wormhole_b0" || arch_str == "WORMHOLE_B0") { return tt::ARCH::WORMHOLE_B0; - if (arch_str == "blackhole" || arch_str == "BLACKHOLE") + } + if (arch_str == "blackhole" || arch_str == 
"BLACKHOLE") { return tt::ARCH::BLACKHOLE; - if (arch_str == "Invalid" || arch_str == "INVALID") + } + if (arch_str == "Invalid" || arch_str == "INVALID") { return tt::ARCH::Invalid; + } throw std::runtime_error(arch_str + " is not recognized as tt::ARCH."); } @@ -41,11 +45,16 @@ std::string get_soc_descriptor_file(tt::ARCH arch) { // const std::string umd_root = get_umd_root(); switch (arch) { - case tt::ARCH::GRAYSKULL: return test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"); - case tt::ARCH::WORMHOLE_B0: return test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"); - case tt::ARCH::BLACKHOLE: return test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch.yaml"); - case tt::ARCH::Invalid: throw std::runtime_error("Invalid arch not supported"); - default: throw std::runtime_error("Unsupported device architecture"); + case tt::ARCH::GRAYSKULL: + return test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"); + case tt::ARCH::WORMHOLE_B0: + return test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"); + case tt::ARCH::BLACKHOLE: + return test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch.yaml"); + case tt::ARCH::Invalid: + throw std::runtime_error("Invalid arch not supported"); + default: + throw std::runtime_error("Unsupported device architecture"); } } diff --git a/tests/api/test_soc_descriptor_bh.cpp b/tests/api/test_soc_descriptor_bh.cpp index 5234a7c0..7007a98f 100644 --- a/tests/api/test_soc_descriptor_bh.cpp +++ b/tests/api/test_soc_descriptor_bh.cpp @@ -4,11 +4,9 @@ * SPDX-License-Identifier: Apache-2.0 */ #include "gtest/gtest.h" - -#include "umd/device/tt_soc_descriptor.h" #include "tests/test_utils/generate_cluster_desc.hpp" #include "tests/test_utils/soc_desc_test_utils.hpp" - +#include "umd/device/tt_soc_descriptor.h" // Blackhole workers - x-y annotation // functional_workers: @@ -28,8 +26,8 @@ // Tests that all physical coordinates are same as all virtual coordinates // when there is no harvesting. 
TEST(SocDescriptor, SocDescriptorBHNoHarvesting) { - - tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), 0); + tt_SocDescriptor soc_desc = + tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), 0); // We expect full grid size since there is no harvesting. tt_xy_pair worker_grid_size = soc_desc.worker_grid_size; @@ -38,7 +36,7 @@ TEST(SocDescriptor, SocDescriptorBHNoHarvesting) { tt_logical_coords logical_coords = tt_logical_coords(x, y); tt_virtual_coords virtual_coords = soc_desc.to_virtual_coords(logical_coords); tt_physical_coords physical_coords = soc_desc.to_physical_coords(logical_coords); - + // Virtual and physical coordinates should be the same. EXPECT_EQ(physical_coords, virtual_coords); } @@ -49,7 +47,8 @@ TEST(SocDescriptor, SocDescriptorBHNoHarvesting) { // We expect that the top left core will have virtual and physical coordinates (1, 2) and (2, 2) for // the logical coordinates if the first row is harvested. TEST(SocDescriptor, SocDescriptorBHTopLeftCore) { - tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), 1); + tt_SocDescriptor soc_desc = + tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), 1); tt_xy_pair worker_grid_size = soc_desc.worker_grid_size; tt_logical_coords logical_coords = tt_logical_coords(0, 0); @@ -65,13 +64,12 @@ TEST(SocDescriptor, SocDescriptorBHTopLeftCore) { // Test logical to physical coordinate translation. // For the full grid of logical coordinates we expect that there are no duplicates of physical coordinates. -// For the reverse mapping back of physical to logical coordinates we expect that same logical coordinates are returned as from original mapping. +// For the reverse mapping back of physical to logical coordinates we expect that same logical coordinates are returned +// as from original mapping. 
TEST(SocDescriptor, SocDescriptorBHLogicalPhysicalMapping) { - const std::size_t max_num_harvested_x = 14; tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch.yaml")); for (std::size_t harvesting_mask = 0; harvesting_mask < (1 << max_num_harvested_x); harvesting_mask++) { - soc_desc.perform_harvesting(harvesting_mask); std::map logical_to_physical; @@ -97,7 +95,7 @@ TEST(SocDescriptor, SocDescriptorBHLogicalPhysicalMapping) { for (auto it : logical_to_physical) { tt_physical_coords physical_coords = it.second; tt_logical_coords logical_coords = soc_desc.to_logical_coords(physical_coords); - + // Expect that reverse mapping of physical coordinates gives the same logical coordinates // using which we got the physical coordinates. EXPECT_EQ(it.first, logical_coords); @@ -107,13 +105,12 @@ TEST(SocDescriptor, SocDescriptorBHLogicalPhysicalMapping) { // Test logical to virtual coordinate translation. // For the full grid of logical coordinates we expect that there are no duplicates of virtual coordinates. -// For the reverse mapping back of virtual to logical coordinates we expect that same logical coordinates are returned as from original mapping. +// For the reverse mapping back of virtual to logical coordinates we expect that same logical coordinates are returned +// as from original mapping. TEST(SocDescriptor, SocDescriptorBHLogicalVirtualMapping) { - const std::size_t max_num_harvested_x = 14; tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch.yaml")); for (std::size_t harvesting_mask = 0; harvesting_mask < (1 << max_num_harvested_x); harvesting_mask++) { - soc_desc.perform_harvesting(harvesting_mask); std::map logical_to_virtual; @@ -149,13 +146,12 @@ TEST(SocDescriptor, SocDescriptorBHLogicalVirtualMapping) { // Test logical to translated coordinate translation. 
// For the full grid of logical coordinates we expect that there are no duplicates of translated coordinates. -// For the reverse mapping back of translated to logical coordinates we expect that same logical coordinates are returned as from original mapping. +// For the reverse mapping back of translated to logical coordinates we expect that same logical coordinates are +// returned as from original mapping. TEST(SocDescriptor, SocDescriptorBHLogicalTranslatedMapping) { - const std::size_t max_num_harvested_x = 14; tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch.yaml")); for (std::size_t harvesting_mask = 0; harvesting_mask < (1 << max_num_harvested_x); harvesting_mask++) { - soc_desc.perform_harvesting(harvesting_mask); std::map logical_to_translated; @@ -170,7 +166,8 @@ TEST(SocDescriptor, SocDescriptorBHLogicalTranslatedMapping) { tt_translated_coords translated_coords = soc_desc.to_translated_coords(logical_coords); logical_to_translated[logical_coords] = translated_coords; - // Expect that logical to translated translation is 1-1 mapping. No duplicates for translated coordinates. + // Expect that logical to translated translation is 1-1 mapping. No duplicates for translated + // coordinates. 
EXPECT_EQ(translated_coords_set.count(translated_coords), 0); translated_coords_set.insert(translated_coords); } @@ -196,7 +193,7 @@ TEST(SocDescriptor, SocDescriptorBHVirtualEqualTranslated) { tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch.yaml")); for (std::size_t harvesting_mask = 0; harvesting_mask < (1 << max_num_harvested_x); harvesting_mask++) { soc_desc.perform_harvesting(harvesting_mask); - + std::size_t num_harvested_x = test_utils::get_num_harvested(harvesting_mask); for (std::size_t x = 0; x < soc_desc.worker_grid_size.x - num_harvested_x; x++) { @@ -209,5 +206,5 @@ TEST(SocDescriptor, SocDescriptorBHVirtualEqualTranslated) { EXPECT_EQ(translated_coords, virtual_coords); } } - } + } } diff --git a/tests/api/test_soc_descriptor_gs.cpp b/tests/api/test_soc_descriptor_gs.cpp index c697a59d..b5cabc7c 100644 --- a/tests/api/test_soc_descriptor_gs.cpp +++ b/tests/api/test_soc_descriptor_gs.cpp @@ -4,10 +4,9 @@ * SPDX-License-Identifier: Apache-2.0 */ #include "gtest/gtest.h" - -#include "umd/device/tt_soc_descriptor.h" #include "tests/test_utils/generate_cluster_desc.hpp" #include "tests/test_utils/soc_desc_test_utils.hpp" +#include "umd/device/tt_soc_descriptor.h" // Grayskull workers - x-y annotation // functional_workers: @@ -27,7 +26,6 @@ // Tests that all physical coordinates are same as all virtual coordinates // when there is no harvesting. TEST(SocDescriptor, SocDescriptorGSNoHarvesting) { - tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml")); // We expect full grid size since there is no harvesting. 
@@ -37,7 +35,7 @@ TEST(SocDescriptor, SocDescriptorGSNoHarvesting) { tt_logical_coords logical_coords = tt_logical_coords(x, y); tt_virtual_coords virtual_coords = soc_desc.to_virtual_coords(logical_coords); tt_physical_coords physical_coords = soc_desc.to_physical_coords(logical_coords); - + // Virtual and physical coordinates should be the same. EXPECT_EQ(physical_coords, virtual_coords); } @@ -48,7 +46,6 @@ TEST(SocDescriptor, SocDescriptorGSNoHarvesting) { // We expect that the top left core will have virtual and physical coordinates (1, 1) and (1, 2) for // the logical coordinates if the first row is harvested. TEST(SocDescriptor, SocDescriptorGSTopLeftCore) { - tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml")); tt_xy_pair worker_grid_size = soc_desc.worker_grid_size; @@ -75,7 +72,7 @@ TEST(SocDescriptor, SocDescriptorGSTranslatingCoords) { tt_virtual_coords virtual_coords = soc_desc.to_virtual_coords(logical_coords); tt_physical_coords physical_coords = soc_desc.to_physical_coords(logical_coords); tt_translated_coords translated_coords = soc_desc.to_translated_coords(logical_coords); - + // Virtual, physical and translated coordinates should be the same. EXPECT_EQ(physical_coords, virtual_coords); EXPECT_EQ(physical_coords, translated_coords); @@ -85,9 +82,9 @@ TEST(SocDescriptor, SocDescriptorGSTranslatingCoords) { // Test logical to physical coordinate translation. // For the full grid of logical coordinates we expect that there are no duplicates of physical coordinates. -// For the reverse mapping back of physical to logical coordinates we expect that same logical coordinates are returned as from original mapping. +// For the reverse mapping back of physical to logical coordinates we expect that same logical coordinates are returned +// as from original mapping. 
TEST(SocDescriptor, SocDescriptorGSLogicalPhysicalMapping) { - tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml")); std::map logical_to_physical; @@ -111,7 +108,7 @@ TEST(SocDescriptor, SocDescriptorGSLogicalPhysicalMapping) { for (auto it : logical_to_physical) { tt_physical_coords physical_coords = it.second; tt_logical_coords logical_coords = soc_desc.to_logical_coords(physical_coords); - + // Expect that reverse mapping of physical coordinates gives the same logical coordinates // using which we got the physical coordinates. EXPECT_EQ(it.first, logical_coords); @@ -120,9 +117,9 @@ TEST(SocDescriptor, SocDescriptorGSLogicalPhysicalMapping) { // Test logical to virtual coordinate translation. // For the full grid of logical coordinates we expect that there are no duplicates of virtual coordinates. -// For the reverse mapping back of virtual to logical coordinates we expect that same logical coordinates are returned as from original mapping. +// For the reverse mapping back of virtual to logical coordinates we expect that same logical coordinates are returned +// as from original mapping. 
TEST(SocDescriptor, SocDescriptorGSLogicalVirtualMapping) { - tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml")); std::map logical_to_virtual; diff --git a/tests/api/test_soc_descriptor_wh.cpp b/tests/api/test_soc_descriptor_wh.cpp index 2e8f5367..a10afbdc 100644 --- a/tests/api/test_soc_descriptor_wh.cpp +++ b/tests/api/test_soc_descriptor_wh.cpp @@ -4,34 +4,32 @@ * SPDX-License-Identifier: Apache-2.0 */ #include "gtest/gtest.h" - -#include "umd/device/tt_soc_descriptor.h" #include "tests/test_utils/generate_cluster_desc.hpp" #include "tests/test_utils/soc_desc_test_utils.hpp" - +#include "umd/device/tt_soc_descriptor.h" // Wormhole workers - x-y annotation // functional_workers: // [ -// 1-1, 2-1, 3-1, 4-1, 6-1, 7-1, 8-1, 9-1, -// 1-2, 2-2, 3-2, 4-2, 6-2, 7-2, 8-2, 9-2, -// 1-3, 2-3, 3-3, 4-3, 6-3, 7-3, 8-3, 9-3, -// 1-4, 2-4, 3-4, 4-4, 6-4, 7-4, 8-4, 9-4, -// 1-5, 2-5, 3-5, 4-5, 6-5, 7-5, 8-5, 9-5, -// 1-7, 2-7, 3-7, 4-7, 6-7, 7-7, 8-7, 9-7, -// 1-8, 2-8, 3-8, 4-8, 6-8, 7-8, 8-8, 9-8, -// 1-9, 2-9, 3-9, 4-9, 6-9, 7-9, 8-9, 9-9, -// 1-10, 2-10, 3-10, 4-10, 6-10, 7-10, 8-10, 9-10, -// 1-11, 2-11, 3-11, 4-11, 6-11, 7-11, 8-11, 9-11, +// 1-1, 2-1, 3-1, 4-1, 6-1, 7-1, 8-1, 9-1, +// 1-2, 2-2, 3-2, 4-2, 6-2, 7-2, 8-2, 9-2, +// 1-3, 2-3, 3-3, 4-3, 6-3, 7-3, 8-3, 9-3, +// 1-4, 2-4, 3-4, 4-4, 6-4, 7-4, 8-4, 9-4, +// 1-5, 2-5, 3-5, 4-5, 6-5, 7-5, 8-5, 9-5, +// 1-7, 2-7, 3-7, 4-7, 6-7, 7-7, 8-7, 9-7, +// 1-8, 2-8, 3-8, 4-8, 6-8, 7-8, 8-8, 9-8, +// 1-9, 2-9, 3-9, 4-9, 6-9, 7-9, 8-9, 9-9, +// 1-10, 2-10, 3-10, 4-10, 6-10, 7-10, 8-10, 9-10, +// 1-11, 2-11, 3-11, 4-11, 6-11, 7-11, 8-11, 9-11, // ] // Tests that all physical coordinates are same as all virtual coordinates // when there is no harvesting. 
TEST(SocDescriptor, SocDescriptorWHNoHarvesting) { - const std::size_t harvesting_mask = 0; - - tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), harvesting_mask); + + tt_SocDescriptor soc_desc = + tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), harvesting_mask); // We expect full grid size since there is no harvesting. tt_xy_pair worker_grid_size = soc_desc.worker_grid_size; @@ -51,10 +49,10 @@ TEST(SocDescriptor, SocDescriptorWHNoHarvesting) { // We expect that the top left core will have virtual and physical coordinates (1, 1) and (1, 2) for // the logical coordinates if the first row is harvested. TEST(SocDescriptor, SocDescriptorWHTopLeftCore) { - const std::size_t harvesting_mask = 1; - tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), harvesting_mask); + tt_SocDescriptor soc_desc = + tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), harvesting_mask); tt_xy_pair worker_grid_size = soc_desc.worker_grid_size; tt_logical_coords logical_coords = tt_logical_coords(0, 0); @@ -70,13 +68,12 @@ TEST(SocDescriptor, SocDescriptorWHTopLeftCore) { // Test logical to physical coordinate translation. // For the full grid of logical coordinates we expect that there are no duplicates of physical coordinates. -// For the reverse mapping back of physical to logical coordinates we expect that same logical coordinates are returned as from original mapping. +// For the reverse mapping back of physical to logical coordinates we expect that same logical coordinates are returned +// as from original mapping. 
TEST(SocDescriptor, SocDescriptorWHLogicalPhysicalMapping) { - const std::size_t max_num_harvested_y = 10; tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml")); for (std::size_t harvesting_mask = 0; harvesting_mask < (1 << max_num_harvested_y); harvesting_mask++) { - soc_desc.perform_harvesting(harvesting_mask); std::map logical_to_physical; @@ -96,8 +93,9 @@ TEST(SocDescriptor, SocDescriptorWHLogicalPhysicalMapping) { physical_coords_set.insert(physical_coords); } } - - // Expect that the number of physical coordinates is equal to the number of workers minus the number of harvested rows. + + // Expect that the number of physical coordinates is equal to the number of workers minus the number of + // harvested rows. EXPECT_EQ(physical_coords_set.size(), worker_grid_size.x * (worker_grid_size.y - num_harvested_y)); for (auto it : logical_to_physical) { @@ -113,13 +111,12 @@ TEST(SocDescriptor, SocDescriptorWHLogicalPhysicalMapping) { // Test logical to virtual coordinate translation. // For the full grid of logical coordinates we expect that there are no duplicates of virtual coordinates. -// For the reverse mapping back of virtual to logical coordinates we expect that same logical coordinates are returned as from original mapping. +// For the reverse mapping back of virtual to logical coordinates we expect that same logical coordinates are returned +// as from original mapping. 
TEST(SocDescriptor, SocDescriptorWHLogicalVirtualMapping) { - const std::size_t max_num_harvested_y = 10; tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml")); for (std::size_t harvesting_mask = 0; harvesting_mask < (1 << max_num_harvested_y); harvesting_mask++) { - soc_desc.perform_harvesting(harvesting_mask); std::map logical_to_virtual; @@ -153,17 +150,18 @@ TEST(SocDescriptor, SocDescriptorWHLogicalVirtualMapping) { // Test top left corner translation from logical to translated coordinates. TEST(SocDescriptor, SocDescriptorWHLogicalTranslatedTopLeft) { - const std::size_t translated_x_start = 18; const std::size_t translated_y_start = 18; - const tt_translated_coords expected_translated_coords = tt_translated_coords(translated_x_start, translated_y_start); + const tt_translated_coords expected_translated_coords = + tt_translated_coords(translated_x_start, translated_y_start); const std::size_t max_num_harvested_y = 10; tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml")); - // We go up to numbers less than 2^10 - 1 to test all possible harvesting masks, we don't want to try to convert if everything is harvested. + // We go up to numbers less than 2^10 - 1 to test all possible harvesting masks, we don't want to try to convert if + // everything is harvested. 
for (std::size_t harvesting_mask = 0; harvesting_mask < (1 << max_num_harvested_y) - 1; harvesting_mask++) { soc_desc.perform_harvesting(harvesting_mask); - + tt_xy_pair worker_grid_size = soc_desc.worker_grid_size; std::size_t num_harvested_y = test_utils::get_num_harvested(harvesting_mask); diff --git a/tests/blackhole/test_bh_common.h b/tests/blackhole/test_bh_common.h index a84b2cdd..0297b191 100644 --- a/tests/blackhole/test_bh_common.h +++ b/tests/blackhole/test_bh_common.h @@ -3,12 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once -#include "umd/device/tt_xy_pair.h" -#include "umd/device/tt_cluster_descriptor.h" -#include "umd/device/cluster.h" - -#include "tests/test_utils/stimulus_generators.hpp" #include "eth_l1_address_map.h" +#include "tests/test_utils/stimulus_generators.hpp" +#include "tt_cluster_descriptor.h" +#include "tt_xy_pair.h" +#include "umd/device/cluster.h" using namespace tt::umd; @@ -16,68 +15,68 @@ namespace tt::umd::test::utils { static void set_params_for_remote_txn(Cluster& device) { // Populate address map and NOC parameters that the driver needs for remote transactions - device.set_device_l1_address_params({l1_mem::address_map::L1_BARRIER_BASE, eth_l1_mem::address_map::ERISC_BARRIER_BASE, eth_l1_mem::address_map::FW_VERSION_ADDR}); + device.set_device_l1_address_params( + {l1_mem::address_map::L1_BARRIER_BASE, + eth_l1_mem::address_map::ERISC_BARRIER_BASE, + eth_l1_mem::address_map::FW_VERSION_ADDR}); } class BlackholeTestFixture : public ::testing::Test { - protected: - // You can remove any or all of the following functions if their bodies would - // be empty. +protected: + // You can remove any or all of the following functions if their bodies would + // be empty. - std::unique_ptr device; + std::unique_ptr device; - BlackholeTestFixture() { + BlackholeTestFixture() {} - } - - ~BlackholeTestFixture() override { - // You can do clean-up work that doesn't throw exceptions here. 
- } + ~BlackholeTestFixture() override { + // You can do clean-up work that doesn't throw exceptions here. + } - virtual int get_detected_num_chips() = 0; - virtual bool is_test_skipped() = 0; + virtual int get_detected_num_chips() = 0; + virtual bool is_test_skipped() = 0; - // If the constructor and destructor are not enough for setting up - // and cleaning up each test, you can define the following methods: + // If the constructor and destructor are not enough for setting up + // and cleaning up each test, you can define the following methods: - void SetUp() override { - // Code here will be called immediately after the constructor (right - // before each test). + void SetUp() override { + // Code here will be called immediately after the constructor (right + // before each test). - if (is_test_skipped()) { - GTEST_SKIP() << "Test is skipped due to incorrect number of chips"; - } + if (is_test_skipped()) { + GTEST_SKIP() << "Test is skipped due to incorrect number of chips"; + } - // std::cout << "Setting Up Test." << std::endl; - assert(get_detected_num_chips() > 0); - auto devices = std::vector(get_detected_num_chips()); - std::iota(devices.begin(), devices.end(), 0); - std::set target_devices = {devices.begin(), devices.end()}; - uint32_t num_host_mem_ch_per_mmio_device = 1; - device = std::make_unique(test_utils::GetAbsPath(SOC_DESC_PATH), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); - assert(device != nullptr); - assert(device->get_cluster_description()->get_number_of_chips() == get_detected_num_chips()); + // std::cout << "Setting Up Test." 
<< std::endl; + assert(get_detected_num_chips() > 0); + auto devices = std::vector(get_detected_num_chips()); + std::iota(devices.begin(), devices.end(), 0); + std::set target_devices = {devices.begin(), devices.end()}; + uint32_t num_host_mem_ch_per_mmio_device = 1; + device = std::make_unique(num_host_mem_ch_per_mmio_device, false, true, true); + assert(device != nullptr); + assert(device->get_cluster_description()->get_number_of_chips() == get_detected_num_chips()); - set_params_for_remote_txn(*device); + set_params_for_remote_txn(*device); - tt_device_params default_params; - device->start_device(default_params); + tt_device_params default_params; + device->start_device(default_params); - device->deassert_risc_reset(); + device->deassert_risc_reset(); - device->wait_for_non_mmio_flush(); - } + device->wait_for_non_mmio_flush(); + } - void TearDown() override { - // Code here will be called immediately after each test (right - // before the destructor). + void TearDown() override { + // Code here will be called immediately after each test (right + // before the destructor). - if (!is_test_skipped()) { - // std::cout << "Tearing Down Test." << std::endl; - device->close_device(); + if (!is_test_skipped()) { + // std::cout << "Tearing Down Test." 
<< std::endl; + device->close_device(); + } } - } - }; -} // namespace tt::umd::test::utils +} // namespace tt::umd::test::utils diff --git a/tests/blackhole/test_silicon_driver_bh.cpp b/tests/blackhole/test_silicon_driver_bh.cpp index 1ac75e65..735bad0d 100644 --- a/tests/blackhole/test_silicon_driver_bh.cpp +++ b/tests/blackhole/test_silicon_driver_bh.cpp @@ -2,30 +2,41 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "gtest/gtest.h" #include -#include "eth_l1_address_map.h" -#include "l1_address_map.h" -#include "host_mem_address_map.h" -#include + #include +#include +#include "eth_l1_address_map.h" +#include "gtest/gtest.h" +#include "host_mem_address_map.h" +#include "l1_address_map.h" +#include "tests/test_utils/device_test_utils.hpp" +#include "tests/test_utils/generate_cluster_desc.hpp" #include "umd/device/blackhole_implementation.h" #include "umd/device/tt_cluster_descriptor.h" -#include "tests/test_utils/generate_cluster_desc.hpp" -#include "tests/test_utils/device_test_utils.hpp" using namespace tt::umd; void set_params_for_remote_txn(Cluster& device) { // Populate address map and NOC parameters that the driver needs for remote transactions - device.set_device_l1_address_params({l1_mem::address_map::L1_BARRIER_BASE, eth_l1_mem::address_map::ERISC_BARRIER_BASE, eth_l1_mem::address_map::FW_VERSION_ADDR}); + device.set_device_l1_address_params( + {l1_mem::address_map::L1_BARRIER_BASE, + eth_l1_mem::address_map::ERISC_BARRIER_BASE, + eth_l1_mem::address_map::FW_VERSION_ADDR}); } std::int32_t get_static_tlb_index(tt_xy_pair target) { - bool is_eth_location = std::find(std::begin(tt::umd::blackhole::ETH_LOCATIONS), std::end(tt::umd::blackhole::ETH_LOCATIONS), target) != std::end(tt::umd::blackhole::ETH_LOCATIONS); - bool is_tensix_location = std::find(std::begin(tt::umd::blackhole::T6_X_LOCATIONS), std::end(tt::umd::blackhole::T6_X_LOCATIONS), target.x) != std::end(tt::umd::blackhole::T6_X_LOCATIONS) && - 
std::find(std::begin(tt::umd::blackhole::T6_Y_LOCATIONS), std::end(tt::umd::blackhole::T6_Y_LOCATIONS), target.y) != std::end(tt::umd::blackhole::T6_Y_LOCATIONS); + bool is_eth_location = + std::find(std::begin(tt::umd::blackhole::ETH_LOCATIONS), std::end(tt::umd::blackhole::ETH_LOCATIONS), target) != + std::end(tt::umd::blackhole::ETH_LOCATIONS); + bool is_tensix_location = + std::find( + std::begin(tt::umd::blackhole::T6_X_LOCATIONS), std::end(tt::umd::blackhole::T6_X_LOCATIONS), target.x) != + std::end(tt::umd::blackhole::T6_X_LOCATIONS) && + std::find( + std::begin(tt::umd::blackhole::T6_Y_LOCATIONS), std::end(tt::umd::blackhole::T6_Y_LOCATIONS), target.y) != + std::end(tt::umd::blackhole::T6_Y_LOCATIONS); if (is_eth_location) { if (target.y == 6) { target.y = 1; @@ -61,7 +72,8 @@ std::int32_t get_static_tlb_index(tt_xy_pair target) { std::set get_target_devices() { std::set target_devices; - std::unique_ptr cluster_desc_uniq = tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); + std::unique_ptr cluster_desc_uniq = + tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); for (int i = 0; i < cluster_desc_uniq->get_number_of_chips(); i++) { target_devices.insert(i); } @@ -73,8 +85,15 @@ TEST(SiliconDriverBH, CreateDestroy) { uint32_t num_host_mem_ch_per_mmio_device = 1; tt_device_params default_params; // Initialize the driver with a 1x1 descriptor and explictly do not perform harvesting - for(int i = 0; i < 50; i++) { - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, false); + for (int i = 0; i < 50; i++) { + Cluster device = Cluster( + test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), + tt_ClusterDescriptor::get_cluster_descriptor_file_path(), + target_devices, + 
num_host_mem_ch_per_mmio_device, + false, + true, + false); set_params_for_remote_txn(device); device.start_device(default_params); device.deassert_risc_reset(); @@ -85,81 +104,113 @@ TEST(SiliconDriverBH, CreateDestroy) { // TEST(SiliconDriverWH, Harvesting) { // std::set target_devices = {0, 1}; // std::unordered_map simulated_harvesting_masks = {{0, 30}, {1, 60}}; - + // { -// std::unique_ptr cluster_desc_uniq = tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); +// std::unique_ptr cluster_desc_uniq = +// tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); // if (cluster_desc_uniq->get_number_of_chips() != target_devices.size()) { -// GTEST_SKIP() << "SiliconDriverWH.Harvesting skipped because it can only be run on a two chip nebula system"; +// GTEST_SKIP() << "SiliconDriverWH.Harvesting skipped because it can only be run on a two chip nebula +// system"; // } // } // uint32_t num_host_mem_ch_per_mmio_device = 1; -// Cluster device = Cluster("./tests/soc_descs/wormhole_b0_8x10.yaml", tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); +// Cluster device = Cluster( +// "./tests/soc_descs/wormhole_b0_8x10.yaml", +// tt_ClusterDescriptor::get_cluster_descriptor_file_path(), +// target_devices, +// num_host_mem_ch_per_mmio_device, +// false, +// true, +// true, +// simulated_harvesting_masks); // auto sdesc_per_chip = device.get_virtual_soc_descriptors(); // ASSERT_EQ(device.using_harvested_soc_descriptors(), true) << "Expected Driver to have performed harvesting"; -// for(const auto& chip : sdesc_per_chip) { -// ASSERT_EQ(chip.second.workers.size(), 48) << "Expected SOC descriptor with harvesting to have 48 workers for chip" << chip.first; +// for (const auto& chip : sdesc_per_chip) { +// ASSERT_EQ(chip.second.workers.size(), 48) +// << "Expected SOC descriptor with 
harvesting to have 48 workers for chip" << chip.first; // } -// ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(0), 30) << "Expected first chip to have harvesting mask of 30"; -// ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(1), 60) << "Expected second chip to have harvesting mask of 60"; +// ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(0), 30) +// << "Expected first chip to have harvesting mask of 30"; +// ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(1), 60) +// << "Expected second chip to have harvesting mask of 60"; // } // TEST(SiliconDriverWH, CustomSocDesc) { // std::set target_devices = {0, 1}; // std::unordered_map simulated_harvesting_masks = {{0, 30}, {1, 60}}; // { -// std::unique_ptr cluster_desc_uniq = tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); +// std::unique_ptr cluster_desc_uniq = +// tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); // if (cluster_desc_uniq->get_number_of_chips() != target_devices.size()) { -// GTEST_SKIP() << "SiliconDriverWH.Harvesting skipped because it can only be run on a two chip nebula system"; +// GTEST_SKIP() << "SiliconDriverWH.Harvesting skipped because it can only be run on a two chip nebula +// system"; // } // } // uint32_t num_host_mem_ch_per_mmio_device = 1; // // Initialize the driver with a 1x1 descriptor and explictly do not perform harvesting -// Cluster device = Cluster("./tests/soc_descs/wormhole_b0_1x1.yaml", tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, false, simulated_harvesting_masks); +// Cluster device = Cluster( +// "./tests/soc_descs/wormhole_b0_1x1.yaml", +// tt_ClusterDescriptor::get_cluster_descriptor_file_path(), +// target_devices, +// num_host_mem_ch_per_mmio_device, +// false, +// true, +// false, +// simulated_harvesting_masks); // auto sdesc_per_chip 
= device.get_virtual_soc_descriptors(); - -// ASSERT_EQ(device.using_harvested_soc_descriptors(), false) << "SOC descriptors should not be modified when harvesting is disabled"; -// for(const auto& chip : sdesc_per_chip) { + +// ASSERT_EQ(device.using_harvested_soc_descriptors(), false) +// << "SOC descriptors should not be modified when harvesting is disabled"; +// for (const auto& chip : sdesc_per_chip) { // ASSERT_EQ(chip.second.workers.size(), 1) << "Expected 1x1 SOC descriptor to be unmodified by driver"; // } // } // TEST(SiliconDriverWH, HarvestingRuntime) { - -// auto get_static_tlb_index_callback = [] (tt_xy_pair target) { -// return get_static_tlb_index(target); -// }; +// auto get_static_tlb_index_callback = [](tt_xy_pair target) { return get_static_tlb_index(target); }; // std::set target_devices = {0, 1}; // std::unordered_map simulated_harvesting_masks = {{0, 30}, {1, 60}}; // { -// std::unique_ptr cluster_desc_uniq = tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); +// std::unique_ptr cluster_desc_uniq = +// tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); // if (cluster_desc_uniq->get_number_of_chips() != target_devices.size()) { -// GTEST_SKIP() << "SiliconDriverWH.Harvesting skipped because it can only be run on a two chip nebula system"; +// GTEST_SKIP() << "SiliconDriverWH.Harvesting skipped because it can only be run on a two chip nebula +// system"; // } // } // uint32_t num_host_mem_ch_per_mmio_device = 1; - -// Cluster device = Cluster("./tests/soc_descs/wormhole_b0_8x10.yaml", tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); + +// Cluster device = Cluster( +// "./tests/soc_descs/wormhole_b0_8x10.yaml", +// tt_ClusterDescriptor::get_cluster_descriptor_file_path(), +// target_devices, +// num_host_mem_ch_per_mmio_device, +// false, +// true, 
+// true, +// simulated_harvesting_masks); // set_params_for_remote_txn(device); // auto mmio_devices = device.get_target_mmio_device_ids(); - -// for(int i = 0; i < target_devices.size(); i++) { + +// for (int i = 0; i < target_devices.size(); i++) { // // Iterate over MMIO devices and only setup static TLBs for worker cores -// if(std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { +// if (std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { // auto& sdesc = device.get_virtual_soc_descriptors().at(i); -// for(auto& core : sdesc.workers) { -// // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. -// device.configure_tlb(i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); +// for (auto& core : sdesc.workers) { +// // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. +// device.configure_tlb( +// i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); // } -// } +// } // } // device.setup_core_to_tlb_map(get_static_tlb_index_callback); - + // tt_device_params default_params; // device.start_device(default_params); // device.deassert_risc_reset(); @@ -169,29 +220,57 @@ TEST(SiliconDriverBH, CreateDestroy) { // std::vector readback_vec = {}; // std::vector zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - -// for(int i = 0; i < target_devices.size(); i++) { +// for (int i = 0; i < target_devices.size(); i++) { // std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; // std::uint32_t dynamic_write_address = 0x40000000; -// for(int loop = 0; loop < 100; loop++){ // Write to each core a 100 times at different statically mapped addresses -// for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { -// device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, ""); -// 
device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), dynamic_write_address, "SMALL_READ_WRITE_TLB"); -// device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited - +// for (int loop = 0; loop < 100; +// loop++) { // Write to each core a 100 times at different statically mapped addresses +// for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { +// device.write_to_device( +// vector_to_write.data(), +// vector_to_write.size() * sizeof(std::uint32_t), +// tt_cxy_pair(i, core), +// address, +// ""); +// device.write_to_device( +// vector_to_write.data(), +// vector_to_write.size() * sizeof(std::uint32_t), +// tt_cxy_pair(i, core), +// dynamic_write_address, +// "SMALL_READ_WRITE_TLB"); +// device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited + // test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, 40, ""); -// test_utils::read_data_from_device(device, dynamic_readback_vec, tt_cxy_pair(i, core), dynamic_write_address, 40, "SMALL_READ_WRITE_TLB"); -// ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; -// ASSERT_EQ(vector_to_write, dynamic_readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; +// test_utils::read_data_from_device( +// device, +// dynamic_readback_vec, +// tt_cxy_pair(i, core), +// dynamic_write_address, +// 40, +// "SMALL_READ_WRITE_TLB"); +// ASSERT_EQ(vector_to_write, readback_vec) +// << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; +// ASSERT_EQ(vector_to_write, dynamic_readback_vec) +// << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; // device.wait_for_non_mmio_flush(); - -// 
device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), dynamic_write_address, "SMALL_READ_WRITE_TLB"); // Clear any written data -// device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, ""); // Clear any written data + +// device.write_to_device( +// zeros.data(), +// zeros.size() * sizeof(std::uint32_t), +// tt_cxy_pair(i, core), +// dynamic_write_address, +// "SMALL_READ_WRITE_TLB"); // Clear any written data +// device.write_to_device( +// zeros.data(), +// zeros.size() * sizeof(std::uint32_t), +// tt_cxy_pair(i, core), +// address, +// ""); // Clear any written data // device.wait_for_non_mmio_flush(); // readback_vec = {}; // dynamic_readback_vec = {}; // } -// address += 0x20; // Increment by uint32_t size for each write +// address += 0x20; // Increment by uint32_t size for each write // dynamic_write_address += 0x20; // } // } @@ -199,45 +278,44 @@ TEST(SiliconDriverBH, CreateDestroy) { // } TEST(SiliconDriverBH, UnalignedStaticTLB_RW) { - auto get_static_tlb_index_callback = [] (tt_xy_pair target) { - return get_static_tlb_index(target); - }; + auto get_static_tlb_index_callback = [](tt_xy_pair target) { return get_static_tlb_index(target); }; std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { // Iterate over MMIO devices and only setup static TLBs for worker cores - if(std::find(mmio_devices.begin(), mmio_devices.end(), i) 
!= mmio_devices.end()) { + if (std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { auto& sdesc = device.get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. - device.configure_tlb(i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); + for (auto& core : sdesc.workers) { + // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. + device.configure_tlb( + i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); } device.setup_core_to_tlb_map(i, get_static_tlb_index_callback); - } + } } - + tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); std::vector unaligned_sizes = {3, 14, 21, 255, 362, 430, 1022, 1023, 1025}; - for(int i = 0; i < target_devices.size(); i++) { - for(const auto& size : unaligned_sizes) { + for (int i = 0; i < target_devices.size(); i++) { + for (const auto& size : unaligned_sizes) { std::vector write_vec(size, 0); - for(int i = 0; i < size; i++){ + for (int i = 0; i < size; i++) { write_vec[i] = size + i; } std::vector readback_vec(size, 0); std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; - for(int loop = 0; loop < 50; loop++){ - for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + for (int loop = 0; loop < 50; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { device.write_to_device(write_vec.data(), size, tt_cxy_pair(i, core), address, ""); device.wait_for_non_mmio_flush(); device.read_from_device(readback_vec.data(), tt_cxy_pair(i, core), address, size, ""); @@ -251,37 +329,35 @@ TEST(SiliconDriverBH, UnalignedStaticTLB_RW) { } address += 0x20; } - } } device.close_device(); } TEST(SiliconDriverBH, StaticTLB_RW) { - auto get_static_tlb_index_callback = [] (tt_xy_pair target) { - return 
get_static_tlb_index(target); - }; + auto get_static_tlb_index_callback = [](tt_xy_pair target) { return get_static_tlb_index(target); }; std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { // Iterate over MMIO devices and only setup static TLBs for worker cores - if(std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { + if (std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { auto& sdesc = device.get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { - // Statically mapping a 2MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. - device.configure_tlb(i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); + for (auto& core : sdesc.workers) { + // Statically mapping a 2MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. 
+ device.configure_tlb( + i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); } device.setup_core_to_tlb_map(i, get_static_tlb_index_callback); - } + } } - + printf("MT: Static TLBs set\n"); tt_device_params default_params; @@ -292,31 +368,44 @@ TEST(SiliconDriverBH, StaticTLB_RW) { std::vector readback_vec = {}; std::vector zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // Check functionality of Static TLBs by reading adn writing from statically mapped address space - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; - for(int loop = 0; loop < 1; loop++){ // Write to each core a 100 times at different statically mapped addresses - for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, ""); - device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited + for (int loop = 0; loop < 1; + loop++) { // Write to each core a 100 times at different statically mapped addresses + for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + ""); + device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, 40, ""); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; device.wait_for_non_mmio_flush(); - device.write_to_device(zeros.data(), zeros.size() * 
sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); // Clear any written data + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); // Clear any written data device.wait_for_non_mmio_flush(); readback_vec = {}; } - address += 0x20; // Increment by uint32_t size for each write + address += 0x20; // Increment by uint32_t size for each write } } - device.close_device(); + device.close_device(); } TEST(SiliconDriverBH, DynamicTLB_RW) { - // Don't use any static TLBs in this test. All writes go through a dynamic TLB that needs to be reconfigured for each transaction + // Don't use any static TLBs in this test. All writes go through a dynamic TLB that needs to be reconfigured for + // each transaction std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); @@ -329,42 +418,68 @@ TEST(SiliconDriverBH, DynamicTLB_RW) { std::vector zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; std::vector readback_vec = {}; - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; - for(int loop = 0; loop < 100; loop++){ // Write to each core a 100 times at different statically mapped addresses - for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); - device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over 
ethernet were commited - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, 40, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + for (int loop = 0; loop < 100; + loop++) { // Write to each core a 100 times at different statically mapped addresses + for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); + device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, 40, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; device.wait_for_non_mmio_flush(); - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); device.wait_for_non_mmio_flush(); readback_vec = {}; } - address += 0x20; // Increment by uint32_t size for each write + address += 0x20; // Increment by uint32_t size for each write } } printf("Target Tensix cores completed\n"); - + // Target DRAM channel 0 constexpr int NUM_CHANNELS = 8; std::vector dram_vector_to_write = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19}; std::uint32_t address = 0x400; - for(int i = 0; i < target_devices.size(); i++) { - for(int loop = 0; loop < 100; loop++){ // Write to each core a 100 times at different statically mapped addresses - for (int ch=0; ch chan = device.get_virtual_soc_descriptors().at(i).dram_cores.at(ch); tt_xy_pair subchan = 
chan.at(0); - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, subchan), address, "SMALL_READ_WRITE_TLB"); - device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, subchan), address, 40, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << subchan.x << "-" << subchan.y << "does not match what was written"; + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, subchan), + address, + "SMALL_READ_WRITE_TLB"); + device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, subchan), address, 40, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << subchan.x << "-" + << subchan.y << "does not match what was written"; device.wait_for_non_mmio_flush(); - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, subchan), address, "SMALL_READ_WRITE_TLB"); + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, subchan), + address, + "SMALL_READ_WRITE_TLB"); device.wait_for_non_mmio_flush(); readback_vec = {}; - address += 0x20; // Increment by uint32_t size for each write + address += 0x20; // Increment by uint32_t size for each write } } } @@ -380,8 +495,8 @@ TEST(SiliconDriverBH, MultiThreadedDevice) { std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); - + Cluster device = 
Cluster(num_host_mem_ch_per_mmio_device, false, true, true); + set_params_for_remote_txn(device); tt_device_params default_params; @@ -392,11 +507,18 @@ TEST(SiliconDriverBH, MultiThreadedDevice) { std::vector vector_to_write = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; std::vector readback_vec = {}; std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; - for(int loop = 0; loop < 100; loop++) { - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "SMALL_READ_WRITE_TLB"); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + for (int loop = 0; loop < 100; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, core), + address, + "SMALL_READ_WRITE_TLB"); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; readback_vec = {}; } address += 0x20; @@ -407,12 +529,19 @@ TEST(SiliconDriverBH, MultiThreadedDevice) { std::vector vector_to_write = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; std::vector readback_vec = {}; std::uint32_t address = 0x30000000; - for(auto& core_ls : device.get_virtual_soc_descriptors().at(0).dram_cores) { - for(int loop = 0; loop < 100; loop++) { - for(auto& core : core_ls) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "SMALL_READ_WRITE_TLB"); - 
test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + for (auto& core_ls : device.get_virtual_soc_descriptors().at(0).dram_cores) { + for (int loop = 0; loop < 100; loop++) { + for (auto& core : core_ls) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, core), + address, + "SMALL_READ_WRITE_TLB"); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y + << "does not match what was written"; readback_vec = {}; } address += 0x20; @@ -427,25 +556,23 @@ TEST(SiliconDriverBH, MultiThreadedDevice) { TEST(SiliconDriverBH, MultiThreadedMemBar) { // Have 2 threads read and write from a single device concurrently - // All (fairly large) transactions go through a static TLB. + // All (fairly large) transactions go through a static TLB. // We want to make sure the memory barrier is thread/process safe. 
// Memory barrier flags get sent to address 0 for all channels in this test - auto get_static_tlb_index_callback = [] (tt_xy_pair target) { - return get_static_tlb_index(target); - }; + auto get_static_tlb_index_callback = [](tt_xy_pair target) { return get_static_tlb_index(target); }; std::set target_devices = get_target_devices(); uint32_t base_addr = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { // Iterate over devices and only setup static TLBs for functional worker cores auto& sdesc = device.get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. + for (auto& core : sdesc.workers) { + // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. 
device.configure_tlb(i, core, get_static_tlb_index_callback(core), base_addr); } device.setup_core_to_tlb_map(i, get_static_tlb_index_callback); @@ -454,24 +581,41 @@ TEST(SiliconDriverBH, MultiThreadedMemBar) { tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); - + std::vector readback_membar_vec = {}; - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), l1_mem::address_map::L1_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all workers + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + test_utils::read_data_from_device( + device, + readback_membar_vec, + tt_cxy_pair(0, core), + l1_mem::address_map::L1_BARRIER_BASE, + 4, + "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all workers readback_membar_vec = {}; } - for(int chan = 0; chan < device.get_virtual_soc_descriptors().at(0).get_num_dram_channels(); chan++) { + for (int chan = 0; chan < device.get_virtual_soc_descriptors().at(0).get_num_dram_channels(); chan++) { auto core = device.get_virtual_soc_descriptors().at(0).get_core_for_dram_channel(chan, 0); - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all DRAM + test_utils::read_data_from_device( + device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all DRAM readback_membar_vec = {}; } - - for(auto& core : device.get_virtual_soc_descriptors().at(0).ethernet_cores) { - 
test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), eth_l1_mem::address_map::ERISC_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all ethernet cores + + for (auto& core : device.get_virtual_soc_descriptors().at(0).ethernet_cores) { + test_utils::read_data_from_device( + device, + readback_membar_vec, + tt_cxy_pair(0, core), + eth_l1_mem::address_map::ERISC_BARRIER_BASE, + 4, + "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), + 187); // Ensure that memory barriers were correctly initialized on all ethernet cores readback_membar_vec = {}; } @@ -481,38 +625,43 @@ TEST(SiliconDriverBH, MultiThreadedMemBar) { std::vector vec2(2560); std::vector zeros(2560, 0); - for(int i = 0; i < vec1.size(); i++) { + for (int i = 0; i < vec1.size(); i++) { vec1.at(i) = i; } - for(int i = 0; i < vec2.size(); i++) { + for (int i = 0; i < vec2.size(); i++) { vec2.at(i) = vec1.size() + i; } std::thread th1 = std::thread([&] { std::uint32_t address = base_addr; - for(int loop = 0; loop < 50; loop++) { - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + for (int loop = 0; loop < 50; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { std::vector readback_vec = {}; - device.write_to_device(vec1.data(), vec1.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); + device.write_to_device( + vec1.data(), vec1.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); device.l1_membar(0, "SMALL_READ_WRITE_TLB", {core}); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 4*vec1.size(), ""); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 4 * vec1.size(), ""); ASSERT_EQ(readback_vec, vec1); - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), 
address, ""); + device.write_to_device( + zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); readback_vec = {}; } - } }); std::thread th2 = std::thread([&] { std::uint32_t address = base_addr + vec1.size() * 4; - for(int loop = 0; loop < 50; loop++) { - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + for (int loop = 0; loop < 50; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { std::vector readback_vec = {}; - device.write_to_device(vec2.data(), vec2.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); + device.write_to_device( + vec2.data(), vec2.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); device.l1_membar(0, "SMALL_READ_WRITE_TLB", {core}); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 4*vec2.size(), ""); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 4 * vec2.size(), ""); ASSERT_EQ(readback_vec, vec2); - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "") ; + device.write_to_device( + zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); readback_vec = {}; } } @@ -521,27 +670,43 @@ TEST(SiliconDriverBH, MultiThreadedMemBar) { th1.join(); th2.join(); - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), l1_mem::address_map::L1_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers end up in the correct sate for workers + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + test_utils::read_data_from_device( + device, + readback_membar_vec, + tt_cxy_pair(0, core), + l1_mem::address_map::L1_BARRIER_BASE, + 4, + "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + 
readback_membar_vec.at(0), 187); // Ensure that memory barriers end up in the correct sate for workers readback_membar_vec = {}; } - for(auto& core : device.get_virtual_soc_descriptors().at(0).ethernet_cores) { - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), eth_l1_mem::address_map::ERISC_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers end up in the correct sate for ethernet cores + for (auto& core : device.get_virtual_soc_descriptors().at(0).ethernet_cores) { + test_utils::read_data_from_device( + device, + readback_membar_vec, + tt_cxy_pair(0, core), + eth_l1_mem::address_map::ERISC_BARRIER_BASE, + 4, + "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), + 187); // Ensure that memory barriers end up in the correct sate for ethernet cores readback_membar_vec = {}; } device.close_device(); } -TEST(SiliconDriverBH, DISABLED_BroadcastWrite) { // Cannot broadcast to tensix/ethernet and DRAM simultaneously on Blackhole .. wait_for_non_mmio_flush() is not working as expected? +TEST(SiliconDriverBH, DISABLED_BroadcastWrite) { // Cannot broadcast to tensix/ethernet and DRAM simultaneously on + // Blackhole .. wait_for_non_mmio_flush() is not working as expected? // Broadcast multiple vectors to tensix and dram grid. 
Verify broadcasted data is read back correctly std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); @@ -555,62 +720,95 @@ TEST(SiliconDriverBH, DISABLED_BroadcastWrite) { // Cannot broadcast to tensix/e std::set rows_to_exclude_for_dram_broadcast = {}; std::set cols_to_exclude_for_dram_broadcast = {1, 2, 3, 4, 6, 7, 8, 9}; - for(const auto& size : broadcast_sizes) { + for (const auto& size : broadcast_sizes) { std::vector vector_to_write(size); std::vector zeros(size); std::vector readback_vec = {}; - for(int i = 0; i < size; i++) { + for (int i = 0; i < size; i++) { vector_to_write[i] = i; zeros[i] = 0; } // Broadcast to Tensix - device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude, cols_to_exclude, "LARGE_WRITE_TLB"); - device.wait_for_non_mmio_flush(); // flush here so we don't simultaneously broadcast to DRAM? + device.broadcast_write_to_cluster( + vector_to_write.data(), + vector_to_write.size() * 4, + address, + {}, + rows_to_exclude, + cols_to_exclude, + "LARGE_WRITE_TLB"); + device.wait_for_non_mmio_flush(); // flush here so we don't simultaneously broadcast to DRAM? 
// Broadcast to DRAM - device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude_for_dram_broadcast, cols_to_exclude_for_dram_broadcast, "LARGE_WRITE_TLB"); + device.broadcast_write_to_cluster( + vector_to_write.data(), + vector_to_write.size() * 4, + address, + {}, + rows_to_exclude_for_dram_broadcast, + cols_to_exclude_for_dram_broadcast, + "LARGE_WRITE_TLB"); device.wait_for_non_mmio_flush(); - for(const auto i : target_devices) { - for(const auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - if(rows_to_exclude.find(core.y) != rows_to_exclude.end()) continue; - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was broadcasted"; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data + for (const auto i : target_devices) { + for (const auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + if (rows_to_exclude.find(core.y) != rows_to_exclude.end()) { + continue; + } + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y + << "does not match what was broadcasted"; + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "LARGE_WRITE_TLB"); // Clear any written data readback_vec = {}; } - for(int chan = 0; chan < device.get_virtual_soc_descriptors().at(i).get_num_dram_channels(); chan++) { + for (int chan = 0; chan < device.get_virtual_soc_descriptors().at(i).get_num_dram_channels(); chan++) { const auto& core = 
device.get_virtual_soc_descriptors().at(i).get_core_for_dram_channel(chan, 0); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from DRAM core " << i << " " << core.x << "-" << core.y << " does not match what was broadcasted " << size; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from DRAM core " << i << " " << core.x << "-" << core.y + << " does not match what was broadcasted " << size; + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "LARGE_WRITE_TLB"); // Clear any written data readback_vec = {}; } } // Wait for data to be cleared before writing next block device.wait_for_non_mmio_flush(); } - device.close_device(); + device.close_device(); } -TEST(SiliconDriverBH, DISABLED_VirtualCoordinateBroadcast) { // same problem as above.. +TEST(SiliconDriverBH, DISABLED_VirtualCoordinateBroadcast) { // same problem as above.. // Broadcast multiple vectors to tensix and dram grid. 
Verify broadcasted data is read back correctly std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); tt_device_params default_params; device.start_device(default_params); auto eth_version = device.get_ethernet_fw_version(); - bool virtual_bcast_supported = (eth_version >= tt_version(6, 8, 0) || eth_version == tt_version(6, 7, 241)) && device.translation_tables_en; + bool virtual_bcast_supported = + (eth_version >= tt_version(6, 8, 0) || eth_version == tt_version(6, 7, 241)) && device.translation_tables_en; if (!virtual_bcast_supported) { device.close_device(); - GTEST_SKIP() << "SiliconDriverWH.VirtualCoordinateBroadcast skipped since ethernet version does not support Virtual Coordinate Broadcast or NOC translation is not enabled"; + GTEST_SKIP() << "SiliconDriverWH.VirtualCoordinateBroadcast skipped since ethernet version does not support " + "Virtual Coordinate Broadcast or NOC translation is not enabled"; } - + device.deassert_risc_reset(); std::vector broadcast_sizes = {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384}; uint32_t address = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; @@ -619,38 +817,69 @@ TEST(SiliconDriverBH, DISABLED_VirtualCoordinateBroadcast) { // same problem as std::set rows_to_exclude_for_dram_broadcast = {}; std::set cols_to_exclude_for_dram_broadcast = {1, 2, 3, 4, 6, 7, 8, 9}; - for(const auto& size : broadcast_sizes) { + for (const auto& size : broadcast_sizes) { std::vector vector_to_write(size); std::vector zeros(size); std::vector readback_vec = {}; - for(int i = 0; i < size; 
i++) { + for (int i = 0; i < size; i++) { vector_to_write[i] = i; zeros[i] = 0; } // Broadcast to Tensix - device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude, cols_to_exclude, "LARGE_WRITE_TLB"); + device.broadcast_write_to_cluster( + vector_to_write.data(), + vector_to_write.size() * 4, + address, + {}, + rows_to_exclude, + cols_to_exclude, + "LARGE_WRITE_TLB"); // Broadcast to DRAM - device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude_for_dram_broadcast, cols_to_exclude_for_dram_broadcast, "LARGE_WRITE_TLB"); + device.broadcast_write_to_cluster( + vector_to_write.data(), + vector_to_write.size() * 4, + address, + {}, + rows_to_exclude_for_dram_broadcast, + cols_to_exclude_for_dram_broadcast, + "LARGE_WRITE_TLB"); device.wait_for_non_mmio_flush(); - for(const auto i : target_devices) { - for(const auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - if(rows_to_exclude.find(core.y) != rows_to_exclude.end()) continue; - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was broadcasted"; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data + for (const auto i : target_devices) { + for (const auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + if (rows_to_exclude.find(core.y) != rows_to_exclude.end()) { + continue; + } + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y + << "does not match what was broadcasted"; + 
device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "LARGE_WRITE_TLB"); // Clear any written data readback_vec = {}; } - for(int chan = 0; chan < device.get_virtual_soc_descriptors().at(i).get_num_dram_channels(); chan++) { + for (int chan = 0; chan < device.get_virtual_soc_descriptors().at(i).get_num_dram_channels(); chan++) { const auto& core = device.get_virtual_soc_descriptors().at(i).get_core_for_dram_channel(chan, 0); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from DRAM core " << i << " " << core.x << "-" << core.y << " does not match what was broadcasted " << size; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from DRAM core " << i << " " << core.x << "-" << core.y + << " does not match what was broadcasted " << size; + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "LARGE_WRITE_TLB"); // Clear any written data readback_vec = {}; } } // Wait for data to be cleared before writing next block device.wait_for_non_mmio_flush(); } - device.close_device(); + device.close_device(); } diff --git a/tests/emulation/test_emulation_device.cpp b/tests/emulation/test_emulation_device.cpp index 8ff436ba..b4136807 100644 --- a/tests/emulation/test_emulation_device.cpp +++ b/tests/emulation/test_emulation_device.cpp @@ -3,10 +3,10 @@ * * SPDX-License-Identifier: Apache-2.0 */ -#include "gtest/gtest.h" -#include "device/tt_soc_descriptor.h" #include "device/cluster.h" #include 
"device/tt_emulation_device.h" +#include "device/tt_soc_descriptor.h" +#include "gtest/gtest.h" // DEPRECATED TEST SUITE !!! @@ -22,7 +22,7 @@ TEST(EmulationDeviceGS, BasicEmuTest) { uint64_t l1_addr = 0x1000; std::vector wdata(size); std::vector rdata(size); - + try { device.start_device(default_params); @@ -31,13 +31,23 @@ TEST(EmulationDeviceGS, BasicEmuTest) { } device.write_to_device(wdata.data(), wdata.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), l1_addr, "l1"); test_utils::read_data_from_device(device, rdata, tt_cxy_pair(0, core), l1_addr, size, "l1"); - ASSERT_EQ(wdata, rdata) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + ASSERT_EQ(wdata, rdata) << "Vector read back from core " << core.x << "-" << core.y + << "does not match what was written"; device.deassert_risc_reset(); - device.write_to_device(wdata.data(), wdata.size() * sizeof(std::uint32_t), tt_cxy_pair(0, tt_xy_pair(phys_x, phys_y)), l1_addr, "l1"); + device.write_to_device( + wdata.data(), + wdata.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, tt_xy_pair(phys_x, phys_y)), + l1_addr, + "l1"); device.assert_risc_reset(); - device.write_to_device(wdata.data(), wdata.size() * sizeof(std::uint32_t), tt_cxy_pair(0, tt_xy_pair(phys_x, phys_y)), l1_addr, "l1"); - + device.write_to_device( + wdata.data(), + wdata.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, tt_xy_pair(phys_x, phys_y)), + l1_addr, + "l1"); } catch (const std::exception &e) { std::cout << "Error: " << e.what() << std::endl; diff --git a/tests/galaxy/test_galaxy_common.cpp b/tests/galaxy/test_galaxy_common.cpp index 546c4c7f..4cff57f1 100644 --- a/tests/galaxy/test_galaxy_common.cpp +++ b/tests/galaxy/test_galaxy_common.cpp @@ -10,9 +10,18 @@ void move_data( Cluster& device, tt_multichip_core_addr sender_core, tt_multichip_core_addr receiver_core, uint32_t size) { std::vector readback_vec = {}; test_utils::read_data_from_device( - device, readback_vec, 
tt_cxy_pair(sender_core.chip, sender_core.core), sender_core.addr, size, "SMALL_READ_WRITE_TLB"); + device, + readback_vec, + tt_cxy_pair(sender_core.chip, sender_core.core), + sender_core.addr, + size, + "SMALL_READ_WRITE_TLB"); device.write_to_device( - readback_vec.data(), readback_vec.size() * sizeof(std::uint32_t), tt_cxy_pair(receiver_core.chip, receiver_core.core), receiver_core.addr, "SMALL_READ_WRITE_TLB"); + readback_vec.data(), + readback_vec.size() * sizeof(std::uint32_t), + tt_cxy_pair(receiver_core.chip, receiver_core.core), + receiver_core.addr, + "SMALL_READ_WRITE_TLB"); device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited return; @@ -25,7 +34,12 @@ void broadcast_data( uint32_t size) { std::vector readback_vec = {}; test_utils::read_data_from_device( - device, readback_vec, tt_cxy_pair(sender_core.chip, sender_core.core), sender_core.addr, size, "SMALL_READ_WRITE_TLB"); + device, + readback_vec, + tt_cxy_pair(sender_core.chip, sender_core.core), + sender_core.addr, + size, + "SMALL_READ_WRITE_TLB"); for (const auto& receiver_core : receiver_cores) { device.write_to_device( readback_vec.data(), diff --git a/tests/galaxy/test_galaxy_common.h b/tests/galaxy/test_galaxy_common.h index 1198d0a4..01ecc704 100644 --- a/tests/galaxy/test_galaxy_common.h +++ b/tests/galaxy/test_galaxy_common.h @@ -4,37 +4,32 @@ * SPDX-License-Identifier: Apache-2.0 */ - #pragma once #include +#include #include #include #include -#include +#include "fmt/core.h" #include "umd/device/cluster.h" #include "umd/device/tt_xy_pair.h" -#include "fmt/core.h" - // static const std::string SOC_DESC_PATH = "./tests/soc_descs/wormhole_b0_8x10.yaml"; using namespace tt::umd; -using chip_id_t = int; -using ethernet_channel_t = int; -using eth_coord_t = std::tuple; // x, y, rack, shelf struct tt_multichip_core_addr { tt_multichip_core_addr() : core{}, chip{}, addr{} {} + tt_multichip_core_addr(chip_id_t chip, tt_xy_pair core, std::uint64_t addr) 
: core(core), chip(chip), addr(addr) {} tt_xy_pair core; chip_id_t chip; std::uint64_t addr; - std::string str() const { - return fmt::format("(chip={},x={},y={},addr=0x{:x})", chip, core.x, core.y, addr); - } + + std::string str() const { return fmt::format("(chip={},x={},y={},addr=0x{:x})", chip, core.x, core.y, addr); } }; // SIMPLE DATAMOVEMENT API BASED ON UMD diff --git a/tests/galaxy/test_umd_concurrent_threads.cpp b/tests/galaxy/test_umd_concurrent_threads.cpp index 2c4f6d42..e668160f 100644 --- a/tests/galaxy/test_umd_concurrent_threads.cpp +++ b/tests/galaxy/test_umd_concurrent_threads.cpp @@ -2,22 +2,21 @@ // // SPDX-License-Identifier: Apache-2.0 +#include #include #include -#include -#include "gtest/gtest.h" #include "common/logger.hpp" -#include "umd/device/tt_cluster_descriptor.h" -#include "umd/device/cluster.h" #include "eth_interface.h" +#include "gtest/gtest.h" #include "host_mem_address_map.h" #include "l1_address_map.h" - #include "test_galaxy_common.h" -#include "tests/wormhole/test_wh_common.h" -#include "tests/test_utils/generate_cluster_desc.hpp" #include "tests/test_utils/device_test_utils.hpp" +#include "tests/test_utils/generate_cluster_desc.hpp" +#include "tests/wormhole/test_wh_common.h" +#include "umd/device/cluster.h" +#include "umd/device/tt_cluster_descriptor.h" static const std::string SOC_DESC_PATH = "tests/soc_descs/wormhole_b0_8x10.yaml"; @@ -52,7 +51,12 @@ TEST(GalaxyConcurrentThreads, WriteToAllChipsL1) { uint32_t num_host_mem_ch_per_mmio_device = 1; Cluster device = Cluster( - test_utils::GetAbsPath(SOC_DESC_PATH), cluster_desc_path, all_devices, num_host_mem_ch_per_mmio_device, false, true); + test_utils::GetAbsPath(SOC_DESC_PATH), + cluster_desc_path, + all_devices, + num_host_mem_ch_per_mmio_device, + false, + true); const auto sdesc_per_chip = device.get_virtual_soc_descriptors(); tt::umd::test::utils::set_params_for_remote_txn(device); @@ -70,7 +74,12 @@ TEST(GalaxyConcurrentThreads, WriteToAllChipsL1) { std::uint32_t 
address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; for (const auto& chip : target_devices_th1) { for (auto& core : sdesc_per_chip.at(chip).workers) { - device.write_to_device(vector_to_write_th1.data(), vector_to_write_th1.size() * sizeof(std::uint32_t), tt_cxy_pair(chip, core), address, "SMALL_READ_WRITE_TLB"); + device.write_to_device( + vector_to_write_th1.data(), + vector_to_write_th1.size() * sizeof(std::uint32_t), + tt_cxy_pair(chip, core), + address, + "SMALL_READ_WRITE_TLB"); } } device.wait_for_non_mmio_flush(); @@ -91,7 +100,12 @@ TEST(GalaxyConcurrentThreads, WriteToAllChipsL1) { std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; for (const auto& chip : target_devices_th2) { for (auto& core : sdesc_per_chip.at(chip).workers) { - device.write_to_device(vector_to_write_th2.data(), vector_to_write_th2.size() * sizeof(std::uint32_t), tt_cxy_pair(chip, core), address, "SMALL_READ_WRITE_TLB"); + device.write_to_device( + vector_to_write_th2.data(), + vector_to_write_th2.size() * sizeof(std::uint32_t), + tt_cxy_pair(chip, core), + address, + "SMALL_READ_WRITE_TLB"); } } device.wait_for_non_mmio_flush(); @@ -140,7 +154,12 @@ TEST(GalaxyConcurrentThreads, WriteToAllChipsDram) { uint32_t num_host_mem_ch_per_mmio_device = 1; Cluster device = Cluster( - test_utils::GetAbsPath(SOC_DESC_PATH), cluster_desc_path, all_devices, num_host_mem_ch_per_mmio_device, false, true); + test_utils::GetAbsPath(SOC_DESC_PATH), + cluster_desc_path, + all_devices, + num_host_mem_ch_per_mmio_device, + false, + true); const auto sdesc_per_chip = device.get_virtual_soc_descriptors(); tt::umd::test::utils::set_params_for_remote_txn(device); @@ -162,7 +181,12 @@ TEST(GalaxyConcurrentThreads, WriteToAllChipsDram) { std::uint32_t address = 0x4000000; for (const auto& chip : target_devices_th1) { for (auto& core : dram_cores) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(chip, core), address, 
"SMALL_READ_WRITE_TLB"); + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(chip, core), + address, + "SMALL_READ_WRITE_TLB"); } } device.wait_for_non_mmio_flush(); @@ -182,7 +206,12 @@ TEST(GalaxyConcurrentThreads, WriteToAllChipsDram) { std::uint32_t address = 0x5000000; for (const auto& chip : target_devices_th2) { for (auto& core : sdesc_per_chip.at(chip).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(chip, core), address, "SMALL_READ_WRITE_TLB"); + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(chip, core), + address, + "SMALL_READ_WRITE_TLB"); } } device.wait_for_non_mmio_flush(); @@ -217,7 +246,12 @@ TEST(GalaxyConcurrentThreads, PushInputsWhileSignalingCluster) { uint32_t num_host_mem_ch_per_mmio_device = 1; Cluster device = Cluster( - test_utils::GetAbsPath(SOC_DESC_PATH), cluster_desc_path, target_devices, num_host_mem_ch_per_mmio_device, false, true); + test_utils::GetAbsPath(SOC_DESC_PATH), + cluster_desc_path, + target_devices, + num_host_mem_ch_per_mmio_device, + false, + true); const auto sdesc_per_chip = device.get_virtual_soc_descriptors(); tt::umd::test::utils::set_params_for_remote_txn(device); @@ -239,7 +273,12 @@ TEST(GalaxyConcurrentThreads, PushInputsWhileSignalingCluster) { chip_id_t mmio_chip = cluster_desc->get_chips_with_mmio().begin()->first; std::vector readback_vec = {}; std::uint32_t address = 0x0; - device.write_to_device(large_vector.data(), large_vector.size() * sizeof(std::uint32_t), tt_cxy_pair(mmio_chip, tt_xy_pair(0, 0)), address, "SMALL_READ_WRITE_TLB"); + device.write_to_device( + large_vector.data(), + large_vector.size() * sizeof(std::uint32_t), + tt_cxy_pair(mmio_chip, tt_xy_pair(0, 0)), + address, + "SMALL_READ_WRITE_TLB"); test_utils::read_data_from_device( device, readback_vec, @@ -257,14 +296,24 @@ 
TEST(GalaxyConcurrentThreads, PushInputsWhileSignalingCluster) { std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; for (const auto& chip : target_devices) { for (auto& core : sdesc_per_chip.at(chip).workers) { - device.write_to_device(small_vector.data(), small_vector.size() * sizeof(std::uint32_t), tt_cxy_pair(chip, core), address, "SMALL_READ_WRITE_TLB"); + device.write_to_device( + small_vector.data(), + small_vector.size() * sizeof(std::uint32_t), + tt_cxy_pair(chip, core), + address, + "SMALL_READ_WRITE_TLB"); } } device.wait_for_non_mmio_flush(); for (const auto& chip : target_devices) { for (auto& core : sdesc_per_chip.at(chip).workers) { test_utils::read_data_from_device( - device, readback_vec, tt_cxy_pair(chip, core), address, small_vector.size() * 4, "SMALL_READ_WRITE_TLB"); + device, + readback_vec, + tt_cxy_pair(chip, core), + address, + small_vector.size() * 4, + "SMALL_READ_WRITE_TLB"); EXPECT_EQ(small_vector, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; readback_vec = {}; diff --git a/tests/galaxy/test_umd_remote_api.cpp b/tests/galaxy/test_umd_remote_api.cpp index 535607b5..366ea05d 100644 --- a/tests/galaxy/test_umd_remote_api.cpp +++ b/tests/galaxy/test_umd_remote_api.cpp @@ -2,21 +2,20 @@ // // SPDX-License-Identifier: Apache-2.0 -#include #include +#include -#include "gtest/gtest.h" #include "common/logger.hpp" -#include "umd/device/tt_cluster_descriptor.h" -#include "umd/device/cluster.h" #include "eth_interface.h" +#include "gtest/gtest.h" #include "host_mem_address_map.h" #include "l1_address_map.h" - #include "test_galaxy_common.h" -#include "tests/wormhole/test_wh_common.h" -#include "tests/test_utils/generate_cluster_desc.hpp" #include "tests/test_utils/device_test_utils.hpp" +#include "tests/test_utils/generate_cluster_desc.hpp" +#include "tests/wormhole/test_wh_common.h" +#include "umd/device/cluster.h" +#include "umd/device/tt_cluster_descriptor.h" 
static const std::string SOC_DESC_PATH = "tests/soc_descs/wormhole_b0_8x10.yaml"; @@ -32,7 +31,12 @@ void run_remote_read_write_test(uint32_t vector_size, bool dram_write) { uint32_t num_host_mem_ch_per_mmio_device = 1; Cluster device = Cluster( - test_utils::GetAbsPath(SOC_DESC_PATH), cluster_desc_path, target_devices, num_host_mem_ch_per_mmio_device, false, true); + test_utils::GetAbsPath(SOC_DESC_PATH), + cluster_desc_path, + target_devices, + num_host_mem_ch_per_mmio_device, + false, + true); const auto sdesc_per_chip = device.get_virtual_soc_descriptors(); tt::umd::test::utils::set_params_for_remote_txn(device); @@ -64,7 +68,12 @@ void run_remote_read_write_test(uint32_t vector_size, bool dram_write) { for (const auto& core : target_cores) { tt_cxy_pair target_core = tt_cxy_pair(chip, core); auto start = std::chrono::high_resolution_clock::now(); - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), target_core, address, "SMALL_READ_WRITE_TLB"); + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + target_core, + address, + "SMALL_READ_WRITE_TLB"); device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited auto end = std::chrono::high_resolution_clock::now(); auto duration = double(std::chrono::duration_cast(end - start).count()); @@ -72,7 +81,8 @@ void run_remote_read_write_test(uint32_t vector_size, bool dram_write) { // std::cout << " chip " << chip << " core " << target_core.str() << " " << duration << std::endl; start = std::chrono::high_resolution_clock::now(); - test_utils::read_data_from_device(device, readback_vec, target_core, address, write_size, "SMALL_READ_WRITE_TLB"); + test_utils::read_data_from_device( + device, readback_vec, target_core, address, write_size, "SMALL_READ_WRITE_TLB"); end = std::chrono::high_resolution_clock::now(); duration = double(std::chrono::duration_cast(end - start).count()); // std::cout << 
" read chip " << chip << " core " << target_core.str()<< " " << duration << std::endl; @@ -145,7 +155,12 @@ void run_data_mover_test( uint32_t num_host_mem_ch_per_mmio_device = 1; Cluster device = Cluster( - test_utils::GetAbsPath(SOC_DESC_PATH), cluster_desc_path, target_devices, num_host_mem_ch_per_mmio_device, false, true); + test_utils::GetAbsPath(SOC_DESC_PATH), + cluster_desc_path, + target_devices, + num_host_mem_ch_per_mmio_device, + false, + true); tt::umd::test::utils::set_params_for_remote_txn(device); @@ -162,7 +177,11 @@ void run_data_mover_test( std::vector send_bw; // Set up data in sender core device.write_to_device( - vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(sender_core.chip, sender_core.core), sender_core.addr, "SMALL_READ_WRITE_TLB"); + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(sender_core.chip, sender_core.core), + sender_core.addr, + "SMALL_READ_WRITE_TLB"); device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited // Send data from sender core to receiver core @@ -261,7 +280,12 @@ void run_data_broadcast_test( uint32_t num_host_mem_ch_per_mmio_device = 1; Cluster device = Cluster( - test_utils::GetAbsPath(SOC_DESC_PATH), cluster_desc_path, target_devices, num_host_mem_ch_per_mmio_device, false, true); + test_utils::GetAbsPath(SOC_DESC_PATH), + cluster_desc_path, + target_devices, + num_host_mem_ch_per_mmio_device, + false, + true); tt::umd::test::utils::set_params_for_remote_txn(device); @@ -278,7 +302,11 @@ void run_data_broadcast_test( std::vector send_bw; // Set up data in sender core device.write_to_device( - vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(sender_core.chip, sender_core.core), sender_core.addr, "SMALL_READ_WRITE_TLB"); + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(sender_core.chip, sender_core.core), + 
sender_core.addr, + "SMALL_READ_WRITE_TLB"); device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited // Send data from sender core to receiver core diff --git a/tests/galaxy/test_umd_remote_api_stability.cpp b/tests/galaxy/test_umd_remote_api_stability.cpp index ae2f8094..86416e4d 100644 --- a/tests/galaxy/test_umd_remote_api_stability.cpp +++ b/tests/galaxy/test_umd_remote_api_stability.cpp @@ -7,173 +7,167 @@ #include #include -#include "umd/device/tt_cluster_descriptor.h" -#include "umd/device/cluster.h" - #include "common/logger.hpp" #include "eth_interface.h" #include "filesystem" #include "gtest/gtest.h" #include "host_mem_address_map.h" #include "l1_address_map.h" -#include "umd/device/tt_soc_descriptor.h" - -#include "tests/test_utils/stimulus_generators.hpp" -#include "tests/test_utils/generate_cluster_desc.hpp" #include "tests/galaxy/test_galaxy_common.h" +#include "tests/test_utils/generate_cluster_desc.hpp" +#include "tests/test_utils/stimulus_generators.hpp" #include "tests/wormhole/test_wh_common.h" +#include "umd/device/cluster.h" +#include "umd/device/tt_cluster_descriptor.h" +#include "umd/device/tt_soc_descriptor.h" namespace tt::umd::test::utils { - class WormholeGalaxyStabilityTestFixture : public WormholeTestFixture { - private: - static int detected_num_chips; - static bool skip_tests; - - protected: - - static constexpr int EXPECTED_MIN_CHIPS = 32; - static uint32_t scale_number_of_tests; - - static void SetUpTestSuite() { - std::unique_ptr cluster_desc = tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); - detected_num_chips = cluster_desc->get_number_of_chips(); - if (detected_num_chips < EXPECTED_MIN_CHIPS) { - skip_tests = true; - } - if(char const* scale_number_of_tests_env = std::getenv("SCALE_NUMBER_OF_TESTS")) { - scale_number_of_tests = std::atoi(scale_number_of_tests_env); +private: + static int detected_num_chips; + static bool skip_tests; + 
+protected: + static constexpr int EXPECTED_MIN_CHIPS = 32; + static uint32_t scale_number_of_tests; + + static void SetUpTestSuite() { + std::unique_ptr cluster_desc = + tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); + detected_num_chips = cluster_desc->get_number_of_chips(); + if (detected_num_chips < EXPECTED_MIN_CHIPS) { + skip_tests = true; + } + if (char const* scale_number_of_tests_env = std::getenv("SCALE_NUMBER_OF_TESTS")) { + scale_number_of_tests = std::atoi(scale_number_of_tests_env); + } } - } - virtual int get_detected_num_chips() { - return detected_num_chips; - } - - virtual bool is_test_skipped() { - return skip_tests; - } + virtual int get_detected_num_chips() { return detected_num_chips; } + virtual bool is_test_skipped() { return skip_tests; } }; - int WormholeGalaxyStabilityTestFixture::detected_num_chips = -1; bool WormholeGalaxyStabilityTestFixture::skip_tests = false; uint32_t WormholeGalaxyStabilityTestFixture::scale_number_of_tests = 1; - TEST_F(WormholeGalaxyStabilityTestFixture, MixedRemoteTransfers) { int seed = 0; - + assert(device != nullptr); - log_info(LogSiliconDriver,"Started MixedRemoteTransfers"); + log_info(LogSiliconDriver, "Started MixedRemoteTransfers"); std::vector command_history; try { RunMixedTransfersUniformDistributions( - *this->device, + *this->device, 100000 * scale_number_of_tests, seed, - transfer_type_weights_t{.write = 0.40, .read = 0.4}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 30000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 30000), + // UNROLL_COUNT_GENERATOR_T const& 
unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 30000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 30000), + // Set to true if you want to emit the command history code to command line + false, + &command_history); } catch (...) { print_command_history_executable_code(command_history); } - } TEST_F(WormholeGalaxyStabilityTestFixture, DISABLED_MultithreadedMixedRemoteTransfersMediumSmall) { int seed = 0; - log_info(LogSiliconDriver,"Started MultithreadedMixedRemoteTransfersMediumSmall"); + log_info(LogSiliconDriver, "Started MultithreadedMixedRemoteTransfersMediumSmall"); assert(device != nullptr); - std::thread t1([&](){ + std::thread t1([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 50000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0.50, .read = 0.50}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 30000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 30000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 30000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - nullptr - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 30000), + // 
Set to true if you want to emit the command history code to command line + false, + nullptr); }); - std::thread t2([&](){ + std::thread t2([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 50000 * scale_number_of_tests, 100, - transfer_type_weights_t{.write = 0.25, .read = 0.50}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 30000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 30000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 30000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - nullptr - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + // Set to true if you want to emit the command history code to command line + std::uniform_int_distribution(0x4, 30000), + false, + nullptr); }); - std::thread t3([&](){ + std::thread t3([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 50000 * scale_number_of_tests, 23, - transfer_type_weights_t{.write = 0.5, .read = 0.25}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 30000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 30000), + // 
UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 30000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - nullptr - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + // Set to true if you want to emit the command history code to command line + std::uniform_int_distribution(0x4, 30000), + false, + nullptr); }); - std::thread t4([&](){ + std::thread t4([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 99, - transfer_type_weights_t{.write = 0.1, .read = 0.1}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - nullptr - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history code to command line + false, + nullptr); }); t1.join(); @@ -182,4 +176,4 @@ TEST_F(WormholeGalaxyStabilityTestFixture, DISABLED_MultithreadedMixedRemoteTran t4.join(); } -} // namespace tt::umd::test::utils +} // namespace tt::umd::test::utils diff --git a/tests/grayskull/test_silicon_driver.cpp 
b/tests/grayskull/test_silicon_driver.cpp index b2e34c70..04af85bb 100644 --- a/tests/grayskull/test_silicon_driver.cpp +++ b/tests/grayskull/test_silicon_driver.cpp @@ -5,13 +5,13 @@ #include #include "gtest/gtest.h" +#include "l1_address_map.h" +#include "tests/test_utils/device_test_utils.hpp" +#include "tests/test_utils/generate_cluster_desc.hpp" #include "umd/device/cluster.h" -#include "umd/device/tt_soc_descriptor.h" #include "umd/device/tt_cluster_descriptor.h" +#include "umd/device/tt_soc_descriptor.h" #include "umd/device/wormhole_implementation.h" -#include "l1_address_map.h" -#include "tests/test_utils/generate_cluster_desc.hpp" -#include "tests/test_utils/device_test_utils.hpp" using namespace tt::umd; @@ -19,8 +19,8 @@ TEST(SiliconDriverGS, CreateDestroySequential) { std::set target_devices = {0}; uint32_t num_host_mem_ch_per_mmio_device = 1; tt_device_params default_params; - for(int i = 0; i < 100; i++) { - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true); + for (int i = 0; i < 100; i++) { + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true); device.start_device(default_params); device.deassert_risc_reset(); device.close_device(); @@ -33,13 +33,13 @@ TEST(SiliconDriverGS, CreateMultipleInstance) { tt_device_params default_params; default_params.init_device = false; std::unordered_map concurrent_devices = {}; - for(int i = 0; i < 100; i++) { - concurrent_devices.insert({i, new Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true)}); - concurrent_devices.at(i) -> start_device(default_params); + for (int i = 0; i < 100; i++) { + concurrent_devices.insert({i, new Cluster(num_host_mem_ch_per_mmio_device, false, true)}); + 
concurrent_devices.at(i)->start_device(default_params); } - for(auto& device : concurrent_devices) { - device.second -> close_device(); + for (auto& device : concurrent_devices) { + device.second->close_device(); delete device.second; } } @@ -48,15 +48,19 @@ TEST(SiliconDriverGS, Harvesting) { std::set target_devices = {0}; std::unordered_map simulated_harvesting_masks = {{0, 6}, {1, 12}}; uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); auto sdesc_per_chip = device.get_virtual_soc_descriptors(); ASSERT_EQ(device.using_harvested_soc_descriptors(), true) << "Expected Driver to have performed harvesting"; - for(const auto& chip : sdesc_per_chip) { - ASSERT_LE(chip.second.workers.size(), 96) << "Expected SOC descriptor with harvesting to have less than or equal to 96 workers for chip " << chip.first; + for (const auto& chip : sdesc_per_chip) { + ASSERT_LE(chip.second.workers.size(), 96) + << "Expected SOC descriptor with harvesting to have less than or equal to 96 workers for chip " + << chip.first; } - ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(0) & simulated_harvesting_masks[0], 6) << "Expected first chip to include simulated harvesting mask of 6"; - // ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(1), 12) << "Expected second chip to have harvesting mask of 12"; + ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(0) & simulated_harvesting_masks[0], 6) + << "Expected first chip to include simulated harvesting mask of 6"; + // ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(1), 12) << "Expected second chip to have + // harvesting mask of 12"; 
device.close_device(); } @@ -65,16 +69,25 @@ TEST(SiliconDriverGS, CustomSocDesc) { std::unordered_map simulated_harvesting_masks = {{0, 6}, {1, 12}}; uint32_t num_host_mem_ch_per_mmio_device = 1; // Initialize the driver with a 1x1 descriptor and explictly do not perform harvesting - Cluster device = Cluster(test_utils::GetAbsPath("./tests/soc_descs/grayskull_1x1_arch.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, false, simulated_harvesting_masks); + Cluster device = Cluster( + test_utils::GetAbsPath("./tests/soc_descs/grayskull_1x1_arch.yaml"), + tt_ClusterDescriptor::get_cluster_descriptor_file_path(), + target_devices, + num_host_mem_ch_per_mmio_device, + false, + true, + false, + simulated_harvesting_masks); auto sdesc_per_chip = device.get_virtual_soc_descriptors(); - ASSERT_EQ(device.using_harvested_soc_descriptors(), false) << "SOC descriptors should not be modified when harvesting is disabled"; - for(const auto& chip : sdesc_per_chip) { + ASSERT_EQ(device.using_harvested_soc_descriptors(), false) + << "SOC descriptors should not be modified when harvesting is disabled"; + for (const auto& chip : sdesc_per_chip) { ASSERT_EQ(chip.second.workers.size(), 1) << "Expected 1x1 SOC descriptor to be unmodified by driver"; } } TEST(SiliconDriverGS, HarvestingRuntime) { - auto get_static_tlb_index = [] (tt_xy_pair target) { + auto get_static_tlb_index = [](tt_xy_pair target) { int flat_index = target.y * tt::umd::wormhole::GRID_SIZE_X + target.x; if (flat_index == 0) { return -1; @@ -85,13 +98,13 @@ TEST(SiliconDriverGS, HarvestingRuntime) { std::set target_devices = {0}; std::unordered_map simulated_harvesting_masks = {{0, 6}, {1, 12}}; uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, 
true, true, simulated_harvesting_masks); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { // Iterate over devices and only setup static TLBs for functional worker cores auto& sdesc = device.get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. + for (auto& core : sdesc.workers) { + // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. device.configure_tlb(i, core, get_static_tlb_index(core), l1_mem::address_map::DATA_BUFFER_SPACE_BASE); } device.setup_core_to_tlb_map(i, get_static_tlb_index); @@ -108,29 +121,59 @@ TEST(SiliconDriverGS, HarvestingRuntime) { std::vector zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; float timeout_in_seconds = 10; // Check functionality of Static TLBs by reading adn writing from statically mapped address space - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { std::uint32_t address = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; std::uint32_t dynamic_write_address = 0x30000000; - for(int loop = 0; loop < 100; loop++){ // Write to each core a 100 times at different statically mapped addresses - for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, ""); - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), dynamic_write_address, "SMALL_READ_WRITE_TLB"); + for (int loop = 0; loop < 100; + loop++) { // Write to each core a 100 times at different statically mapped addresses + for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + device.write_to_device( + 
vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + ""); + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + dynamic_write_address, + "SMALL_READ_WRITE_TLB"); auto start_time = std::chrono::high_resolution_clock::now(); - while(!(vector_to_write == readback_vec)) { - float wait_duration = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start_time).count(); - if(wait_duration > timeout_in_seconds) { + while (!(vector_to_write == readback_vec)) { + float wait_duration = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now() - start_time) + .count(); + if (wait_duration > timeout_in_seconds) { break; } test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, 40, ""); - test_utils::read_data_from_device(device, dynamic_readback_vec, tt_cxy_pair(i, core), dynamic_write_address, 40, "SMALL_READ_WRITE_TLB"); + test_utils::read_data_from_device( + device, + dynamic_readback_vec, + tt_cxy_pair(i, core), + dynamic_write_address, + 40, + "SMALL_READ_WRITE_TLB"); } - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); // Clear any written data - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), dynamic_write_address, "SMALL_READ_WRITE_TLB"); // Clear any written data + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); // Clear any written data + device.write_to_device( + 
zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + dynamic_write_address, + "SMALL_READ_WRITE_TLB"); // Clear any written data readback_vec = {}; dynamic_readback_vec = {}; } - address += 0x20; // Increment by uint32_t size for each write + address += 0x20; // Increment by uint32_t size for each write dynamic_write_address += 0x20; } } @@ -138,7 +181,7 @@ TEST(SiliconDriverGS, HarvestingRuntime) { } TEST(SiliconDriverGS, StaticTLB_RW) { - auto get_static_tlb_index = [] (tt_xy_pair target) { + auto get_static_tlb_index = [](tt_xy_pair target) { int flat_index = target.y * tt::umd::wormhole::GRID_SIZE_X + target.x; if (flat_index == 0) { return -1; @@ -146,19 +189,20 @@ TEST(SiliconDriverGS, StaticTLB_RW) { return flat_index; }; std::set target_devices = {0}; - + uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true); - for(int i = 0; i < target_devices.size(); i++) { + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true); + for (int i = 0; i < target_devices.size(); i++) { // Iterate over devices and only setup static TLBs for worker cores auto& sdesc = device.get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. - device.configure_tlb(i, core, get_static_tlb_index(core), l1_mem::address_map::DATA_BUFFER_SPACE_BASE, TLB_DATA::Posted); + for (auto& core : sdesc.workers) { + // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. 
+ device.configure_tlb( + i, core, get_static_tlb_index(core), l1_mem::address_map::DATA_BUFFER_SPACE_BASE, TLB_DATA::Posted); } device.setup_core_to_tlb_map(i, get_static_tlb_index); } - + tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); @@ -168,36 +212,52 @@ TEST(SiliconDriverGS, StaticTLB_RW) { std::vector zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; float timeout_in_seconds = 10; // Check functionality of Static TLBs by reading adn writing from statically mapped address space - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { std::uint32_t address = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; - for(int loop = 0; loop < 100; loop++){ // Write to each core a 100 times at different statically mapped addresses - for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, ""); + for (int loop = 0; loop < 100; + loop++) { // Write to each core a 100 times at different statically mapped addresses + for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + ""); auto start_time = std::chrono::high_resolution_clock::now(); - while(!(vector_to_write == readback_vec)) { - float wait_duration = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start_time).count(); - if(wait_duration > timeout_in_seconds) { + while (!(vector_to_write == readback_vec)) { + float wait_duration = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now() - start_time) + .count(); + if (wait_duration > timeout_in_seconds) { break; } test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, 40, ""); } - ASSERT_EQ(vector_to_write, readback_vec) << 
"Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); // Clear any written data + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); // Clear any written data readback_vec = {}; } - address += 0x20; // Increment by uint32_t size for each write + address += 0x20; // Increment by uint32_t size for each write } } - device.close_device(); + device.close_device(); } TEST(SiliconDriverGS, DynamicTLB_RW) { - // Don't use any static TLBs in this test. All writes go through a dynamic TLB that needs to be reconfigured for each transaction + // Don't use any static TLBs in this test. All writes go through a dynamic TLB that needs to be reconfigured for + // each transaction std::set target_devices = {0}; uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true); - device.set_fallback_tlb_ordering_mode("SMALL_READ_WRITE_TLB", TLB_DATA::Posted); // Explicitly test API to set fallback tlb ordering mode + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true); + device.set_fallback_tlb_ordering_mode( + "SMALL_READ_WRITE_TLB", TLB_DATA::Posted); // Explicitly test API to set fallback tlb ordering mode tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); @@ -207,25 +267,40 @@ TEST(SiliconDriverGS, DynamicTLB_RW) { std::vector readback_vec = {}; float timeout_in_seconds = 10; - for(int i = 0; i < target_devices.size(); i++) { 
+ for (int i = 0; i < target_devices.size(); i++) { std::uint32_t address = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; - for(int loop = 0; loop < 100; loop++){ // Write to each core a 100 times at different statically mapped addresses - for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); + for (int loop = 0; loop < 100; + loop++) { // Write to each core a 100 times at different statically mapped addresses + for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); auto start_time = std::chrono::high_resolution_clock::now(); - while(!(vector_to_write == readback_vec)) { - float wait_duration = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start_time).count(); - if(wait_duration > timeout_in_seconds) { + while (!(vector_to_write == readback_vec)) { + float wait_duration = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now() - start_time) + .count(); + if (wait_duration > timeout_in_seconds) { break; } - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, 40, "SMALL_READ_WRITE_TLB"); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, 40, "SMALL_READ_WRITE_TLB"); } - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); // Clear any written data + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + 
device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); // Clear any written data readback_vec = {}; } - address += 0x20; // Increment by uint32_t size for each write + address += 0x20; // Increment by uint32_t size for each write } } device.close_device(); @@ -238,8 +313,8 @@ TEST(SiliconDriverGS, MultiThreadedDevice) { std::set target_devices = {0}; uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true); - + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true); + tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); @@ -249,18 +324,27 @@ TEST(SiliconDriverGS, MultiThreadedDevice) { std::vector readback_vec = {}; float timeout_in_seconds = 10; std::uint32_t address = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; - for(int loop = 0; loop < 100; loop++) { - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "SMALL_READ_WRITE_TLB"); + for (int loop = 0; loop < 100; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, core), + address, + "SMALL_READ_WRITE_TLB"); auto start_time = std::chrono::high_resolution_clock::now(); - while(!(vector_to_write == readback_vec)) { - float wait_duration = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start_time).count(); - if(wait_duration > timeout_in_seconds) { + while (!(vector_to_write == readback_vec)) { + float wait_duration = std::chrono::duration_cast( + 
std::chrono::high_resolution_clock::now() - start_time) + .count(); + if (wait_duration > timeout_in_seconds) { break; } - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); } - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; readback_vec = {}; } address += 0x20; @@ -272,19 +356,28 @@ TEST(SiliconDriverGS, MultiThreadedDevice) { std::vector readback_vec = {}; float timeout_in_seconds = 10; std::uint32_t address = 0x30000000; - for(auto& core_ls : device.get_virtual_soc_descriptors().at(0).dram_cores) { - for(int loop = 0; loop < 100; loop++) { - for(auto& core : core_ls) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "SMALL_READ_WRITE_TLB"); + for (auto& core_ls : device.get_virtual_soc_descriptors().at(0).dram_cores) { + for (int loop = 0; loop < 100; loop++) { + for (auto& core : core_ls) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, core), + address, + "SMALL_READ_WRITE_TLB"); auto start_time = std::chrono::high_resolution_clock::now(); - while(!(vector_to_write == readback_vec)) { - float wait_duration = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start_time).count(); - if(wait_duration > timeout_in_seconds) { + while (!(vector_to_write == readback_vec)) { + float wait_duration = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now() - start_time) + .count(); + if (wait_duration > timeout_in_seconds) { break; + } + test_utils::read_data_from_device( + 
device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); } - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); - } - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y + << "does not match what was written"; readback_vec = {}; } address += 0x20; @@ -297,14 +390,14 @@ TEST(SiliconDriverGS, MultiThreadedDevice) { device.close_device(); } -TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run - // Have 2 threads read and write from a single device concurrently - // All (fairly large) transactions go through a static TLB. - // We want to make sure the memory barrier is thread/process safe. +TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run + // Have 2 threads read and write from a single device concurrently + // All (fairly large) transactions go through a static TLB. + // We want to make sure the memory barrier is thread/process safe. 
// Memory barrier flags get sent to address 0 for all channels in this test - auto get_static_tlb_index = [] (tt_xy_pair target) { + auto get_static_tlb_index = [](tt_xy_pair target) { int flat_index = target.y * tt::umd::wormhole::GRID_SIZE_X + target.x; if (flat_index == 0) { return -1; @@ -316,13 +409,13 @@ TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run uint32_t base_addr = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true); - - for(int i = 0; i < target_devices.size(); i++) { + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true); + + for (int i = 0; i < target_devices.size(); i++) { // Iterate over devices and only setup static TLBs for functional worker cores auto& sdesc = device.get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. + for (auto& core : sdesc.workers) { + // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. 
device.configure_tlb(i, core, get_static_tlb_index(core), base_addr); } device.setup_core_to_tlb_map(i, get_static_tlb_index); @@ -332,22 +425,28 @@ TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run device.start_device(default_params); device.deassert_risc_reset(); std::vector readback_membar_vec = {}; - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all workers + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + test_utils::read_data_from_device( + device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all workers readback_membar_vec = {}; } - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all workers + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + test_utils::read_data_from_device( + device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all workers readback_membar_vec = {}; } - for(int chan = 0; chan < device.get_virtual_soc_descriptors().at(0).get_num_dram_channels(); chan++) { + for (int chan = 0; chan < device.get_virtual_soc_descriptors().at(0).get_num_dram_channels(); chan++) { auto core = device.get_virtual_soc_descriptors().at(0).get_core_for_dram_channel(chan, 0); - test_utils::read_data_from_device(device, 
readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all DRAM + test_utils::read_data_from_device( + device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all DRAM readback_membar_vec = {}; } // Launch 2 thread accessing different locations of L1 and using memory barrier between write and read @@ -356,23 +455,26 @@ TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run std::vector vec2(25600); std::vector zeros(25600, 0); - for(int i = 0; i < vec1.size(); i++) { + for (int i = 0; i < vec1.size(); i++) { vec1.at(i) = i; } - for(int i = 0; i < vec2.size(); i++) { + for (int i = 0; i < vec2.size(); i++) { vec2.at(i) = vec1.size() + i; } std::thread th1 = std::thread([&] { std::uint32_t address = base_addr; - for(int loop = 0; loop < 100; loop++) { - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + for (int loop = 0; loop < 100; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { std::vector readback_vec = {}; - device.write_to_device(vec1.data(), vec1.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); + device.write_to_device( + vec1.data(), vec1.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); device.l1_membar(0, "", {core}); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 4*vec1.size(), ""); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 4 * vec1.size(), ""); ASSERT_EQ(readback_vec, vec1); - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); + device.write_to_device( + zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), 
address, ""); readback_vec = {}; } } @@ -380,14 +482,17 @@ TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run std::thread th2 = std::thread([&] { std::uint32_t address = base_addr + vec1.size() * 4; - for(int loop = 0; loop < 100; loop++) { - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + for (int loop = 0; loop < 100; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { std::vector readback_vec = {}; - device.write_to_device(vec2.data(), vec2.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); + device.write_to_device( + vec2.data(), vec2.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); device.l1_membar(0, "", {core}); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 4*vec2.size(), ""); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 4 * vec2.size(), ""); ASSERT_EQ(readback_vec, vec2); - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "") ; + device.write_to_device( + zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); readback_vec = {}; } } @@ -396,11 +501,71 @@ TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run th1.join(); th2.join(); - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers end up in correct sate workers + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + test_utils::read_data_from_device( + device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers end up in correct sate workers 
readback_membar_vec = {}; } device.close_device(); } + +/** + * Copied from Wormhole unit tests. + */ +TEST(SiliconDriverGS, SysmemTestWithPcie) { + Cluster cluster( + test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), + "", // test_utils::GetClusterDescYAML(), + {0}, + 1, // one "host memory channel", currently a 1G huge page + false, // skip driver allocs - no (don't skip) + true, // clean system resources - yes + true); // perform harvesting - yes + + cluster.start_device(tt_device_params{}); // no special parameters + + const chip_id_t mmio_chip_id = 0; + const auto PCIE = cluster.get_soc_descriptor(mmio_chip_id).pcie_cores.at(0); + const tt_cxy_pair PCIE_CORE(mmio_chip_id, PCIE.x, PCIE.y); + const size_t test_size_bytes = 0x4000; // Arbitrarilly chosen, but small size so the test runs quickly. + + // PCIe core is at (x=0, y=4) on Grayskull NOC0. + ASSERT_EQ(PCIE.x, 0); + ASSERT_EQ(PCIE.y, 4); + + // Bad API: how big is the buffer? How do we know it's big enough? + // Situation today is that there's a 1G hugepage behind it, although this is + // unclear from the API and may change in the future. + uint8_t* sysmem = (uint8_t*)cluster.host_dma_address(0, 0, 0); + ASSERT_NE(sysmem, nullptr); + + uint64_t base_address = cluster.get_pcie_base_addr_from_device(mmio_chip_id); + + // Buffer that we will use to read sysmem into, then write sysmem from. + std::vector buffer(test_size_bytes, 0x0); + + // Step 1: Fill sysmem with random bytes. + test_utils::fill_with_random_bytes(sysmem, test_size_bytes); + + // Step 2: Read sysmem into buffer. + cluster.read_from_device(&buffer[0], PCIE_CORE, base_address, buffer.size(), "REG_TLB"); + + // Step 3: Verify that buffer matches sysmem. + ASSERT_EQ(buffer, std::vector(sysmem, sysmem + test_size_bytes)); + + // Step 4: Fill buffer with random bytes. + test_utils::fill_with_random_bytes(&buffer[0], test_size_bytes); + + // Step 5: Write buffer into sysmem, overwriting what was there. 
+ cluster.write_to_device(&buffer[0], buffer.size(), PCIE_CORE, base_address, "REG_TLB"); + + // Step 5b: Read back sysmem into a throwaway buffer. The intent is to + // ensure the write has completed before we check sysmem against buffer. + std::vector throwaway(test_size_bytes, 0x0); + cluster.read_from_device(&throwaway[0], PCIE_CORE, base_address, throwaway.size(), "REG_TLB"); + + // Step 6: Verify that sysmem matches buffer. + ASSERT_EQ(buffer, std::vector(sysmem, sysmem + test_size_bytes)); +} diff --git a/tests/microbenchmark/device_fixture.hpp b/tests/microbenchmark/device_fixture.hpp index 3e20679a..b4b744b8 100644 --- a/tests/microbenchmark/device_fixture.hpp +++ b/tests/microbenchmark/device_fixture.hpp @@ -2,24 +2,27 @@ // // SPDX-License-Identifier: Apache-2.0 -#include -#include +#include + #include +#include +#include #include -#include #include "cluster.h" -#include "l1_address_map.h" #include "device/tt_soc_descriptor.h" +#include "l1_address_map.h" #include "tests/test_utils/generate_cluster_desc.hpp" +using tt::umd::Cluster; + class uBenchmarkFixture : public ::testing::Test { - protected: +protected: void SetUp() override { // get arch name? results_csv.open("ubench_results.csv", std::ios_base::app); - auto get_static_tlb_index = [] (tt_xy_pair target) { + auto get_static_tlb_index = [](tt_xy_pair target) { int flat_index = target.y * 10 + target.x; // grid_size_x = 10 for GS/WH ????? 
something is wrong here if (flat_index == 0) { return -1; @@ -28,13 +31,19 @@ class uBenchmarkFixture : public ::testing::Test { }; std::set target_devices = {0}; uint32_t num_host_mem_ch_per_mmio_device = 1; - device = std::make_shared(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), "", target_devices, num_host_mem_ch_per_mmio_device, false, true); + device = std::make_shared( + test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), + "", + target_devices, + num_host_mem_ch_per_mmio_device, + false, + true); - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { // Iterate over devices and only setup static TLBs for functional worker cores auto& sdesc = device->get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. + for (auto& core : sdesc.workers) { + // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. 
device->configure_tlb(i, core, get_static_tlb_index(core), l1_mem::address_map::DATA_BUFFER_SPACE_BASE); } } diff --git a/tests/microbenchmark/test_rw_tensix.cpp b/tests/microbenchmark/test_rw_tensix.cpp index 274e17a7..9d1973b2 100644 --- a/tests/microbenchmark/test_rw_tensix.cpp +++ b/tests/microbenchmark/test_rw_tensix.cpp @@ -6,11 +6,11 @@ #include -#include "nanobench.h" #include "device_fixture.hpp" +#include "nanobench.h" #include "tests/test_utils/device_test_utils.hpp" -std::uint32_t generate_random_address(std::uint32_t max, std::uint32_t min=0) { +std::uint32_t generate_random_address(std::uint32_t max, std::uint32_t min = 0) { ankerl::nanobench::Rng gen(80085); std::uniform_int_distribution<> dis(min, max); // between 0 and 1MB return dis(gen); @@ -19,81 +19,119 @@ std::uint32_t generate_random_address(std::uint32_t max, std::uint32_t min=0) { TEST_F(uBenchmarkFixture, WriteAllCores32Bytes) { std::vector vector_to_write = {0, 1, 2, 3, 4, 5, 6, 7}; std::uint64_t address = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; - std::uint64_t bad_address = 0x30000000; // this address is not mapped, should trigger fallback write/read path + std::uint64_t bad_address = 0x30000000; // this address is not mapped, should trigger fallback write/read path ankerl::nanobench::Bench bench_static; ankerl::nanobench::Bench bench_dynamic; - for(auto& core : device->get_virtual_soc_descriptors().at(0).workers) { + for (auto& core : device->get_virtual_soc_descriptors().at(0).workers) { std::stringstream wname; wname << "Write to device core (" << core.x << ", " << core.y << ")"; // Write 32 bytes through static tlbs - bench_static.title("Write 32 bytes").unit("writes").minEpochIterations(50).output(nullptr).run(wname.str(), [&] { - device->write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "SMALL_READ_WRITE_TLB"); - }); + bench_static.title("Write 32 bytes") + .unit("writes") + .minEpochIterations(50) + 
.output(nullptr) + .run(wname.str(), [&] { + device->write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, core), + address, + "SMALL_READ_WRITE_TLB"); + }); // Write through "fallback/dynamic" tlb - bench_dynamic.title("Write 32 bytes fallback").unit("writes").minEpochIterations(50).output(nullptr).run(wname.str(), [&] { - device->write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), bad_address, "SMALL_READ_WRITE_TLB"); - }); + bench_dynamic.title("Write 32 bytes fallback") + .unit("writes") + .minEpochIterations(50) + .output(nullptr) + .run(wname.str(), [&] { + device->write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, core), + bad_address, + "SMALL_READ_WRITE_TLB"); + }); wname.clear(); } bench_static.render(ankerl::nanobench::templates::csv(), results_csv); bench_dynamic.render(ankerl::nanobench::templates::csv(), results_csv); } -TEST_F(uBenchmarkFixture, ReadAllCores32Bytes){ +TEST_F(uBenchmarkFixture, ReadAllCores32Bytes) { std::vector readback_vec = {}; std::uint64_t address = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; - std::uint64_t bad_address = 0x30000000; // this address is not mapped, should trigger fallback write/read path + std::uint64_t bad_address = 0x30000000; // this address is not mapped, should trigger fallback write/read path ankerl::nanobench::Bench bench_static; ankerl::nanobench::Bench bench_dynamic; - - for(auto& core : device->get_virtual_soc_descriptors().at(0).workers) { + + for (auto& core : device->get_virtual_soc_descriptors().at(0).workers) { std::stringstream rname; // Read through static tlbs rname << "Read from device core (" << core.x << ", " << core.y << ")"; bench_static.title("Read 32 bytes").unit("reads").minEpochIterations(50).output(nullptr).run(rname.str(), [&] { - test_utils::read_data_from_device(*device, readback_vec, tt_cxy_pair(0, core), 
address, 0x20, "SMALL_READ_WRITE_TLB"); + test_utils::read_data_from_device( + *device, readback_vec, tt_cxy_pair(0, core), address, 0x20, "SMALL_READ_WRITE_TLB"); }); // Read through "fallback/dynamic" tlb - bench_dynamic.title("Read 32 bytes fallback").unit("reads").minEpochIterations(50).output(nullptr).run(rname.str(), [&] { - test_utils::read_data_from_device(*device, readback_vec, tt_cxy_pair(0, core), bad_address, 0x20, "SMALL_READ_WRITE_TLB"); - }); + bench_dynamic.title("Read 32 bytes fallback") + .unit("reads") + .minEpochIterations(50) + .output(nullptr) + .run(rname.str(), [&] { + test_utils::read_data_from_device( + *device, readback_vec, tt_cxy_pair(0, core), bad_address, 0x20, "SMALL_READ_WRITE_TLB"); + }); rname.clear(); } bench_static.render(ankerl::nanobench::templates::csv(), results_csv); bench_dynamic.render(ankerl::nanobench::templates::csv(), results_csv); } -TEST_F(uBenchmarkFixture, Write32BytesRandomAddr){ +TEST_F(uBenchmarkFixture, Write32BytesRandomAddr) { std::vector vector_to_write = {0, 1, 2, 3, 4, 5, 6, 7}; std::uint32_t address; ankerl::nanobench::Bench bench; - for(auto& core : device->get_virtual_soc_descriptors().at(0).workers) { - address = generate_random_address(1<<20); // between 0 and 1MB + for (auto& core : device->get_virtual_soc_descriptors().at(0).workers) { + address = generate_random_address(1 << 20); // between 0 and 1MB std::stringstream wname; wname << "Write to device core (" << core.x << ", " << core.y << ") @ address " << std::hex << address; - bench.title("Write 32 bytes random address").unit("writes").minEpochIterations(50).output(nullptr).run(wname.str(), [&] { - device->write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "SMALL_READ_WRITE_TLB"); - }); + bench.title("Write 32 bytes random address") + .unit("writes") + .minEpochIterations(50) + .output(nullptr) + .run(wname.str(), [&] { + device->write_to_device( + vector_to_write.data(), + 
vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, core), + address, + "SMALL_READ_WRITE_TLB"); + }); wname.clear(); } bench.render(ankerl::nanobench::templates::csv(), results_csv); } -TEST_F(uBenchmarkFixture, Read32BytesRandomAddr){ +TEST_F(uBenchmarkFixture, Read32BytesRandomAddr) { std::vector readback_vec = {}; std::uint32_t address; ankerl::nanobench::Bench bench; - for(auto& core : device->get_virtual_soc_descriptors().at(0).workers) { - address = generate_random_address(1<<20); // between 0 and 1MB + for (auto& core : device->get_virtual_soc_descriptors().at(0).workers) { + address = generate_random_address(1 << 20); // between 0 and 1MB std::stringstream rname; rname << "Read from device core (" << core.x << ", " << core.y << ") @ address " << std::hex << address; - bench.title("Read 32 bytes random address").unit("reads").minEpochIterations(50).output(nullptr).run(rname.str(), [&] { - test_utils::read_data_from_device(*device, readback_vec, tt_cxy_pair(0, core), address, 0x20, "SMALL_READ_WRITE_TLB"); - }); + bench.title("Read 32 bytes random address") + .unit("reads") + .minEpochIterations(50) + .output(nullptr) + .run(rname.str(), [&] { + test_utils::read_data_from_device( + *device, readback_vec, tt_cxy_pair(0, core), address, 0x20, "SMALL_READ_WRITE_TLB"); + }); rname.clear(); } bench.render(ankerl::nanobench::templates::csv(), results_csv); diff --git a/tests/pcie/test_pcie_device.cpp b/tests/pcie/test_pcie_device.cpp index b12d835e..02de5fe1 100644 --- a/tests/pcie/test_pcie_device.cpp +++ b/tests/pcie/test_pcie_device.cpp @@ -5,16 +5,15 @@ */ #include -#include "fmt/xchar.h" #include #include #include #include +#include "fmt/xchar.h" #include "umd/device/pci_device.hpp" - TEST(PcieDeviceTest, Numa) { std::vector nodes; diff --git a/tests/simulation/device_fixture.hpp b/tests/simulation/device_fixture.hpp index 115d3ac1..4d76c308 100644 --- a/tests/simulation/device_fixture.hpp +++ b/tests/simulation/device_fixture.hpp @@ -5,15 +5,14 
@@ #pragma once #include - -#include "umd/device/tt_simulation_device.h" -#include "common/logger.hpp" -#include "tests/test_utils/generate_cluster_desc.hpp" - #include +#include #include #include -#include + +#include "common/logger.hpp" +#include "tests/test_utils/generate_cluster_desc.hpp" +#include "umd/device/tt_simulation_device.h" class SimulationDeviceFixture : public ::testing::Test { protected: @@ -24,9 +23,7 @@ class SimulationDeviceFixture : public ::testing::Test { device->start_device(default_params); } - static void TearDownTestSuite() { - device->close_device(); - } + static void TearDownTestSuite() { device->close_device(); } static std::unique_ptr device; }; diff --git a/tests/simulation/test_simulation_device.cpp b/tests/simulation/test_simulation_device.cpp index 1ac6146a..3b3015e0 100644 --- a/tests/simulation/test_simulation_device.cpp +++ b/tests/simulation/test_simulation_device.cpp @@ -3,86 +3,79 @@ // SPDX-License-Identifier: Apache-2.0 #include + #include "device_fixture.hpp" #include "tests/test_utils/device_test_utils.hpp" -std::vector generate_data(uint32_t size_in_bytes){ - size_t size = size_in_bytes/sizeof(uint32_t); +std::vector generate_data(uint32_t size_in_bytes) { + size_t size = size_in_bytes / sizeof(uint32_t); std::vector data(size); std::random_device rd; std::mt19937 gen(rd()); std::uniform_int_distribution dis(0, 100); - for(uint32_t i = 0; i < size; i++){ + for (uint32_t i = 0; i < size; i++) { data[i] = dis(gen); } return data; } -class LoopbackAllCoresParam : public SimulationDeviceFixture , - public ::testing::WithParamInterface {}; +class LoopbackAllCoresParam : public SimulationDeviceFixture, public ::testing::WithParamInterface {}; INSTANTIATE_TEST_SUITE_P( - LoopbackAllCores, - LoopbackAllCoresParam, - ::testing::Values( - tt_xy_pair{0, 1}, - tt_xy_pair{1, 1}, - tt_xy_pair{1, 0} - ) -); - -TEST_P(LoopbackAllCoresParam, LoopbackSingleTensix){ - std::vector wdata = {1,2,3,4,5}; + LoopbackAllCores, 
LoopbackAllCoresParam, ::testing::Values(tt_xy_pair{0, 1}, tt_xy_pair{1, 1}, tt_xy_pair{1, 0})); + +TEST_P(LoopbackAllCoresParam, LoopbackSingleTensix) { + std::vector wdata = {1, 2, 3, 4, 5}; std::vector rdata(wdata.size(), 0); tt_cxy_pair core = {0, GetParam()}; - device->write_to_device(wdata.data(), wdata.size()*sizeof(uint32_t), core, 0x100, ""); - device->read_from_device(rdata.data(), core, 0x100, rdata.size()*sizeof(uint32_t), ""); - + device->write_to_device(wdata.data(), wdata.size() * sizeof(uint32_t), core, 0x100, ""); + device->read_from_device(rdata.data(), core, 0x100, rdata.size() * sizeof(uint32_t), ""); + ASSERT_EQ(wdata, rdata); } -bool loopback_stress_size(std::unique_ptr &device, tt_xy_pair core, uint32_t byte_shift){ +bool loopback_stress_size(std::unique_ptr &device, tt_xy_pair core, uint32_t byte_shift) { uint64_t addr = 0x0; std::vector wdata = generate_data(1 << byte_shift); std::vector rdata(wdata.size(), 0); - device->write_to_device(wdata.data(), wdata.size()*sizeof(uint32_t), tt_cxy_pair{0, core}, addr, ""); - device->read_from_device(rdata.data(), tt_cxy_pair{0, core}, addr, rdata.size()*sizeof(uint32_t), ""); - + device->write_to_device(wdata.data(), wdata.size() * sizeof(uint32_t), tt_cxy_pair{0, core}, addr, ""); + device->read_from_device(rdata.data(), tt_cxy_pair{0, core}, addr, rdata.size() * sizeof(uint32_t), ""); + return wdata == rdata; } -TEST_P(LoopbackAllCoresParam, LoopbackStressSize){ +TEST_P(LoopbackAllCoresParam, LoopbackStressSize) { tt_xy_pair core = GetParam(); tt_xy_pair dram = {1, 0}; if (core == dram) { - for (uint32_t i = 2; i <= 30; ++i) { // 2^30 = 1 GB + for (uint32_t i = 2; i <= 30; ++i) { // 2^30 = 1 GB ASSERT_TRUE(loopback_stress_size(device, core, i)); } } else { - for (uint32_t i = 2; i <= 20; ++i) { // 2^20 = 1 MB + for (uint32_t i = 2; i <= 20; ++i) { // 2^20 = 1 MB ASSERT_TRUE(loopback_stress_size(device, core, i)); } } } -TEST_F(SimulationDeviceFixture, LoopbackTwoTensix){ - std::vector wdata1 = 
{1,2,3,4,5}; - std::vector wdata2 = {6,7,8,9,10}; +TEST_F(SimulationDeviceFixture, LoopbackTwoTensix) { + std::vector wdata1 = {1, 2, 3, 4, 5}; + std::vector wdata2 = {6, 7, 8, 9, 10}; std::vector rdata1(wdata1.size()); std::vector rdata2(wdata2.size()); tt_cxy_pair core1 = {0, 0, 1}; tt_cxy_pair core2 = {0, 1, 1}; - device->write_to_device(wdata1.data(), wdata1.size()*sizeof(uint32_t), core1, 0x100, ""); - device->write_to_device(wdata2.data(), wdata2.size()*sizeof(uint32_t), core2, 0x100, ""); + device->write_to_device(wdata1.data(), wdata1.size() * sizeof(uint32_t), core1, 0x100, ""); + device->write_to_device(wdata2.data(), wdata2.size() * sizeof(uint32_t), core2, 0x100, ""); + + device->read_from_device(rdata1.data(), core1, 0x100, rdata1.size() * sizeof(uint32_t), ""); + device->read_from_device(rdata2.data(), core2, 0x100, rdata2.size() * sizeof(uint32_t), ""); - device->read_from_device(rdata1.data(), core1, 0x100, rdata1.size()*sizeof(uint32_t), ""); - device->read_from_device(rdata2.data(), core2, 0x100, rdata2.size()*sizeof(uint32_t), ""); - ASSERT_EQ(wdata1, rdata1); ASSERT_EQ(wdata2, rdata2); } diff --git a/tests/test_utils/device_test_utils.hpp b/tests/test_utils/device_test_utils.hpp index 136c6c5e..842e4ce5 100644 --- a/tests/test_utils/device_test_utils.hpp +++ b/tests/test_utils/device_test_utils.hpp @@ -6,15 +6,16 @@ #pragma once #include -#include +#include #include +#include #include "umd/device/cluster.h" namespace test_utils { template -static void size_buffer_to_capacity(std::vector &data_buf, std::size_t size_in_bytes) { +static void size_buffer_to_capacity(std::vector& data_buf, std::size_t size_in_bytes) { std::size_t target_size = 0; if (size_in_bytes > 0) { target_size = ((size_in_bytes - 1) / sizeof(T)) + 1; @@ -22,9 +23,27 @@ static void size_buffer_to_capacity(std::vector &data_buf, std::size_t size_i data_buf.resize(target_size); } -static void read_data_from_device(tt_device& device, std::vector &vec, tt_cxy_pair core, uint64_t 
addr, uint32_t size, const std::string& tlb_to_use) { +static void read_data_from_device( + tt_device& device, + std::vector& vec, + tt_cxy_pair core, + uint64_t addr, + uint32_t size, + const std::string& tlb_to_use) { size_buffer_to_capacity(vec, size); device.read_from_device(vec.data(), core, addr, size, tlb_to_use); } +inline void fill_with_random_bytes(uint8_t* data, size_t n) { + static std::random_device rd; + static std::mt19937_64 gen(rd()); + uint64_t* data64 = reinterpret_cast(data); + std::generate_n(data64, n / 8, [&]() { return gen(); }); + + // Handle remaining bytes + for (size_t i = (n / 8) * 8; i < n; ++i) { + data[i] = static_cast(gen()); + } } + +} // namespace test_utils diff --git a/tests/test_utils/generate_cluster_desc.hpp b/tests/test_utils/generate_cluster_desc.hpp index 145f011a..539dd39f 100644 --- a/tests/test_utils/generate_cluster_desc.hpp +++ b/tests/test_utils/generate_cluster_desc.hpp @@ -7,24 +7,26 @@ #pragma once #include -#include #include +#include #include "fmt/core.h" namespace test_utils { -inline std::string GetAbsPath(std::string path_){ - // Note that __FILE__ might be resolved at compile time to an absolute or relative address, depending on the compiler. +inline std::string GetAbsPath(std::string path_) { + // Note that __FILE__ might be resolved at compile time to an absolute or relative address, depending on the + // compiler. 
std::filesystem::path current_file_path = std::filesystem::path(__FILE__); std::filesystem::path umd_root; if (current_file_path.is_absolute()) { umd_root = current_file_path.parent_path().parent_path().parent_path(); } else { - std::filesystem::path umd_root_relative = std::filesystem::relative(std::filesystem::path(__FILE__).parent_path().parent_path().parent_path(), "../"); + std::filesystem::path umd_root_relative = + std::filesystem::relative(std::filesystem::path(__FILE__).parent_path().parent_path().parent_path(), "../"); umd_root = std::filesystem::canonical(umd_root_relative); } std::filesystem::path abs_path = umd_root / path_; return abs_path.string(); } -} // namespace test_utils +} // namespace test_utils diff --git a/tests/test_utils/soc_desc_test_utils.hpp b/tests/test_utils/soc_desc_test_utils.hpp index 30fb90d2..884a3504 100644 --- a/tests/test_utils/soc_desc_test_utils.hpp +++ b/tests/test_utils/soc_desc_test_utils.hpp @@ -15,4 +15,4 @@ static std::size_t get_num_harvested(std::size_t harvesting_mask) { return __builtin_popcount(harvesting_mask); } -} +} // namespace test_utils diff --git a/tests/test_utils/stimulus_generators.hpp b/tests/test_utils/stimulus_generators.hpp index 3773d7de..025284bc 100644 --- a/tests/test_utils/stimulus_generators.hpp +++ b/tests/test_utils/stimulus_generators.hpp @@ -4,18 +4,17 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once -#include "umd/device/tt_xy_pair.h" -#include "umd/device/tt_cluster_descriptor.h" -#include "umd/device/cluster.h" - - +#include #include #include #include #include -#include #include -#include +#include + +#include "umd/device/cluster.h" +#include "umd/device/tt_cluster_descriptor.h" +#include "umd/device/tt_xy_pair.h" /* Sizes: * Distribution (including min/max) @@ -40,7 +39,6 @@ namespace tt::umd::test::utils { static const std::string SOC_DESC_PATH = "tests/soc_descs/wormhole_b0_8x10.yaml"; - enum RemoteTransferType : uint8_t { WRITE = 0, READ }; template < @@ -50,7 +48,7 @@ 
template < class DISTRIBUTION_T, typename GENERATOR_T = std::mt19937> class ConstrainedTemplateTemplateGenerator { - public: +public: ConstrainedTemplateTemplateGenerator( int seed, DISTRIBUTION_T const& distribution, @@ -62,24 +60,17 @@ class ConstrainedTemplateTemplateGenerator { return constrain(sample); } - private: +private: GENERATOR_T generator; DISTRIBUTION_T distribution; std::function constrain; }; - -template < - typename SAMPLE_T, - typename UNCONSTRAINED_SAMPLE_T, - class DISTRIBUTION_T, - typename GENERATOR_T = std::mt19937> +template class ConstrainedTemplateGenerator { - public: +public: ConstrainedTemplateGenerator( - int seed, - DISTRIBUTION_T const& distribution, - std::function constrain) : + int seed, DISTRIBUTION_T const& distribution, std::function constrain) : generator(seed), distribution(distribution), constrain(constrain) {} SAMPLE_T generate() { @@ -87,14 +78,14 @@ class ConstrainedTemplateGenerator { return constrain(sample); } - private: +private: GENERATOR_T generator; DISTRIBUTION_T distribution; std::function constrain; }; - -using DefaultTransferTypeGenerator = ConstrainedTemplateTemplateGenerator; +using DefaultTransferTypeGenerator = + ConstrainedTemplateTemplateGenerator; using address_t = uint32_t; using destination_t = tt_cxy_pair; @@ -107,6 +98,7 @@ struct write_transfer_sample_t { std::string tlb_to_use; // (payload.data(), size, destination, address, tlb_to_use, false, false); }; + struct read_transfer_sample_t { destination_t destination; address_t address; @@ -115,7 +107,8 @@ struct read_transfer_sample_t { // (payload.data(), destination, address, size, tlb_to_use); }; -using remote_transfer_sample_t = std::tuple>; +using remote_transfer_sample_t = + std::tuple>; template < template @@ -130,7 +123,8 @@ template < struct WriteCommandGenerator { using destination_generator_t = ConstrainedTemplateTemplateGenerator; using address_generator_t = ConstrainedTemplateTemplateGenerator; - using size_generator_t = 
ConstrainedTemplateTemplateGenerator; + using size_generator_t = + ConstrainedTemplateTemplateGenerator; WriteCommandGenerator( destination_generator_t const& destination_generator, @@ -159,7 +153,8 @@ template < struct WriteEpochCmdCommandGenerator { using destination_generator_t = ConstrainedTemplateTemplateGenerator; using address_generator_t = ConstrainedTemplateTemplateGenerator; - using size_generator_t = ConstrainedTemplateTemplateGenerator; + using size_generator_t = + ConstrainedTemplateTemplateGenerator; using last_cmd_generator_t = ConstrainedTemplateGenerator; using ordered_generator_t = ConstrainedTemplateGenerator; @@ -196,8 +191,10 @@ template < typename GENERATOR_T = std::mt19937> struct RolledWriteCommandGenerator { using destination_generator_t = ConstrainedTemplateTemplateGenerator; - using address_generator_t = ConstrainedTemplateTemplateGenerator; - using size_generator_t = ConstrainedTemplateTemplateGenerator; + using address_generator_t = + ConstrainedTemplateTemplateGenerator; + using size_generator_t = + ConstrainedTemplateTemplateGenerator; using unroll_count_generator_t = ConstrainedTemplateTemplateGenerator; RolledWriteCommandGenerator( @@ -229,7 +226,8 @@ template < struct ReadCommandGenerator { using destination_generator_t = ConstrainedTemplateTemplateGenerator; using address_generator_t = ConstrainedTemplateTemplateGenerator; - using size_generator_t = ConstrainedTemplateTemplateGenerator; + using size_generator_t = + ConstrainedTemplateTemplateGenerator; ReadCommandGenerator( destination_generator_t const& destination_generator, @@ -239,8 +237,6 @@ struct ReadCommandGenerator { address_generator(address_generator), size_generator(size_generator) {} - - destination_generator_t destination_generator; address_generator_t address_generator; size_generator_t size_generator; @@ -265,12 +261,14 @@ template < typename GENERATOR_T = std::mt19937> class TestGenerator { - using transfer_type_generator_t = DefaultTransferTypeGenerator; // 
ConstrainedTemplateTemplateGenerator; - using write_command_generator_t = WriteCommandGenerator; - using read_command_generator_t = ReadCommandGenerator; - - public: + // ConstrainedTemplateTemplateGenerator; + using transfer_type_generator_t = DefaultTransferTypeGenerator; + using write_command_generator_t = + WriteCommandGenerator; + using read_command_generator_t = + ReadCommandGenerator; + +public: TestGenerator( int seed, transfer_type_generator_t const& transfer_type_distribution, @@ -279,13 +277,10 @@ class TestGenerator { generator(seed), transfer_type_distribution(transfer_type_distribution), write_command_generator(write_command_generator), - read_command_generator(read_command_generator) - { - } + read_command_generator(read_command_generator) {} // Generate a sample (transfer type, size, destination, address) based on custom distributions remote_transfer_sample_t generate_sample() { - // Randomly select a transfer type RemoteTransferType transfer_type = transfer_type_distribution.generate(); assert(transfer_type < 4 && transfer_type >= 0); @@ -294,22 +289,26 @@ class TestGenerator { destination_t const& destination = write_command_generator.destination_generator.generate(); address_t const& address = write_command_generator.address_generator.generate(); transfer_size_t const& size_in_bytes = write_command_generator.size_generator.generate(); - return {transfer_type, write_transfer_sample_t{ - .destination = destination, - .address = address, - .size_in_bytes = size_in_bytes, - .tlb_to_use = "LARGE_WRITE_TLB"}}; + return { + transfer_type, + write_transfer_sample_t{ + .destination = destination, + .address = address, + .size_in_bytes = size_in_bytes, + .tlb_to_use = "LARGE_WRITE_TLB"}}; } break; case RemoteTransferType::READ: { destination_t const& destination = read_command_generator.destination_generator.generate(); address_t const& address = read_command_generator.address_generator.generate(); transfer_size_t const& size_in_bytes = 
read_command_generator.size_generator.generate(); - return {transfer_type, read_transfer_sample_t{ - .destination = destination, - .address = address, - .size_in_bytes = size_in_bytes, - .tlb_to_use = "LARGE_READ_TLB"}}; + return { + transfer_type, + read_transfer_sample_t{ + .destination = destination, + .address = address, + .size_in_bytes = size_in_bytes, + .tlb_to_use = "LARGE_READ_TLB"}}; } break; default: @@ -317,7 +316,7 @@ class TestGenerator { }; } - private: +private: std::mt19937 generator; transfer_type_generator_t transfer_type_distribution; @@ -331,15 +330,32 @@ struct transfer_type_weights_t { double read; }; - -static auto address_aligner = [](address_t addr) -> address_t { addr = (((addr - 1) / 32) + 1) * 32; assert(addr % 32 == 0); return addr;}; -static auto transfer_size_aligner = [](transfer_size_t size) -> transfer_size_t { size = (((size - 1) / 4) + 1) * 4; assert(size > 0); assert(size % 4 == 0); return size; }; -static auto address_aligner_32B = [](transfer_size_t size) -> transfer_size_t { size = (((size - 1) / 32) + 1) * 32; assert(size > 0); return size;}; -static auto size_aligner_32B = [](transfer_size_t size) -> transfer_size_t { size = (((size - 1) / 32) + 1) * 32; assert(size > 0); return size;}; -template +static auto address_aligner = [](address_t addr) -> address_t { + addr = (((addr - 1) / 32) + 1) * 32; + assert(addr % 32 == 0); + return addr; +}; +static auto transfer_size_aligner = [](transfer_size_t size) -> transfer_size_t { + size = (((size - 1) / 4) + 1) * 4; + assert(size > 0); + assert(size % 4 == 0); + return size; +}; +static auto address_aligner_32B = [](transfer_size_t size) -> transfer_size_t { + size = (((size - 1) / 32) + 1) * 32; + assert(size > 0); + return size; +}; +static auto size_aligner_32B = [](transfer_size_t size) -> transfer_size_t { + size = (((size - 1) / 32) + 1) * 32; + assert(size > 0); + return size; +}; +template static auto passthrough_constrainer = [](T const& t) -> T { return t; }; -static 
inline std::vector generate_core_index_locations(tt_ClusterDescriptor const& cluster_desc, tt_SocDescriptor const& soc_desc) { +static inline std::vector generate_core_index_locations( + tt_ClusterDescriptor const& cluster_desc, tt_SocDescriptor const& soc_desc) { std::vector core_index_to_location = {}; for (chip_id_t chip : cluster_desc.get_all_chips()) { @@ -360,16 +376,19 @@ static void print_command(remote_transfer_sample_t const& command) { case RemoteTransferType::WRITE: { write_transfer_sample_t const& command_args = std::get(std::get<1>(command)); std::cout << "Transfer type: WRITE, destination: (c=" << command_args.destination.chip - << ", y=" << command_args.destination.y << ", x=" << command_args.destination.x - << "), address: " << command_args.address << ", size_in_bytes: " << command_args.size_in_bytes << std::endl; + << ", y=" << command_args.destination.y << ", x=" << command_args.destination.x + << "), address: " << command_args.address << ", size_in_bytes: " << command_args.size_in_bytes + << std::endl; } break; case RemoteTransferType::READ: { read_transfer_sample_t const& command_args = std::get(std::get<1>(command)); std::cout << "Transfer type: READ, destination: (c=" << command_args.destination.chip - << ", y=" << command_args.destination.y << ", x=" << command_args.destination.x - << "), address: " << command_args.address << ", size_in_bytes: " << command_args.size_in_bytes << std::endl; + << ", y=" << command_args.destination.y << ", x=" << command_args.destination.x + << "), address: " << command_args.address << ", size_in_bytes: " << command_args.size_in_bytes + << std::endl; } break; - default: throw std::runtime_error("Invalid transfer type"); + default: + throw std::runtime_error("Invalid transfer type"); }; } @@ -379,12 +398,9 @@ int bytes_to_words(int num_bytes) { } static inline void dispatch_remote_transfer_command( - Cluster &driver, - remote_transfer_sample_t const& command, - std::vector &payload) { - + Cluster& driver, 
remote_transfer_sample_t const& command, std::vector& payload) { RemoteTransferType transfer_type = std::get<0>(command); - auto resize_payload = [](std::vector &payload, int size_in_bytes) { + auto resize_payload = [](std::vector& payload, int size_in_bytes) { payload.resize(bytes_to_words(size_in_bytes)); }; @@ -392,28 +408,37 @@ static inline void dispatch_remote_transfer_command( case RemoteTransferType::WRITE: { write_transfer_sample_t const& command_args = std::get(std::get<1>(command)); assert(command_args.size_in_bytes >= sizeof(uint32_t)); - resize_payload(payload,command_args.size_in_bytes); - driver.write_to_device(payload.data(), bytes_to_words(command_args.size_in_bytes), command_args.destination, command_args.address, command_args.tlb_to_use); + resize_payload(payload, command_args.size_in_bytes); + driver.write_to_device( + payload.data(), + bytes_to_words(command_args.size_in_bytes), + command_args.destination, + command_args.address, + command_args.tlb_to_use); } break; case RemoteTransferType::READ: { read_transfer_sample_t const& command_args = std::get(std::get<1>(command)); assert(command_args.size_in_bytes >= sizeof(uint32_t)); - resize_payload(payload,command_args.size_in_bytes); - driver.read_from_device(payload.data(), command_args.destination, command_args.address, command_args.size_in_bytes, command_args.tlb_to_use); + resize_payload(payload, command_args.size_in_bytes); + driver.read_from_device( + payload.data(), + command_args.destination, + command_args.address, + command_args.size_in_bytes, + command_args.tlb_to_use); } break; default: throw std::runtime_error("Invalid transfer type"); }; } - static void print_command_executable_code(remote_transfer_sample_t const& command) { - auto emit_payload_resize_string = [](int size_bytes, int size_word) { std::cout << "payload.resize(((" << size_bytes << " - 1) / " << size_word << ") + 1);" << std::endl; }; auto emit_bytes_to_words_len_string = [](std::string const& var_name, int 
size_in_bytes, int size_word) { - std::cout << "int " << var_name << " = (((" << size_in_bytes << " - 1) / " << size_word << ") + 1);" << std::endl; + std::cout << "int " << var_name << " = (((" << size_in_bytes << " - 1) / " << size_word << ") + 1);" + << std::endl; }; std::cout << "{" << std::endl; @@ -421,19 +446,25 @@ static void print_command_executable_code(remote_transfer_sample_t const& comman case RemoteTransferType::WRITE: { write_transfer_sample_t const& command_args = std::get(std::get<1>(command)); assert(command_args.size_in_bytes >= sizeof(uint32_t)); - std::cout << "tt_cxy_pair const& destination = tt_cxy_pair(" << command_args.destination.chip << ", " << command_args.destination.x << ", " << command_args.destination.y << ");" << std::endl; + std::cout << "tt_cxy_pair const& destination = tt_cxy_pair(" << command_args.destination.chip << ", " + << command_args.destination.x << ", " << command_args.destination.y << ");" << std::endl; std::cout << "assert(" << command_args.size_in_bytes << " >= sizeof(uint32_t));" << std::endl; emit_bytes_to_words_len_string("len", command_args.size_in_bytes, sizeof(uint32_t)); emit_payload_resize_string(command_args.size_in_bytes, sizeof(uint32_t)); - std::cout << "device->write_to_device(payload.data(), len, destination, " << command_args.address << ", \"" << command_args.tlb_to_use << "\");" << std::endl; - // driver.write_to_device(payload.data(), command_args.size, command_args.destination, command_args.address, command_args.tlb_to_use, false, false); + std::cout << "device->write_to_device(payload.data(), len, destination, " << command_args.address << ", \"" + << command_args.tlb_to_use << "\");" << std::endl; + // driver.write_to_device(payload.data(), command_args.size, command_args.destination, command_args.address, + // command_args.tlb_to_use, false, false); } break; case RemoteTransferType::READ: { read_transfer_sample_t const& command_args = std::get(std::get<1>(command)); - std::cout << "tt_cxy_pair 
const& destination = tt_cxy_pair(" << command_args.destination.chip << ", " << command_args.destination.x << ", " << command_args.destination.y << ");" << std::endl; + std::cout << "tt_cxy_pair const& destination = tt_cxy_pair(" << command_args.destination.chip << ", " + << command_args.destination.x << ", " << command_args.destination.y << ");" << std::endl; emit_payload_resize_string(command_args.size_in_bytes, sizeof(uint32_t)); - std::cout << "device->read_from_device(payload.data(), destination, " << command_args.address << ", " << command_args.size_in_bytes << ", \"" << command_args.tlb_to_use << "\");" << std::endl; - // driver.read_from_device(payload.data(), command_args.destination, command_args.address, command_args.size, command_args.tlb_to_use); + std::cout << "device->read_from_device(payload.data(), destination, " << command_args.address << ", " + << command_args.size_in_bytes << ", \"" << command_args.tlb_to_use << "\");" << std::endl; + // driver.read_from_device(payload.data(), command_args.destination, command_args.address, + // command_args.size, command_args.tlb_to_use); } break; default: throw std::runtime_error("Invalid transfer type"); @@ -450,32 +481,36 @@ static void print_command_history_executable_code(std::vector class WRITE_DEST_DISTR_T, - template class WRITE_ADDR_DISTR_T, +template < + template + class WRITE_DEST_DISTR_T, + template + class WRITE_ADDR_DISTR_T, class WRITE_SIZE_DISTR_OUT_T, - template class WRITE_SIZE_DISTR_T, + template + class WRITE_SIZE_DISTR_T, - template class READ_DEST_DISTR_T, - template class READ_ADDR_DISTR_T, - class READ_SIZE_DISTR_OUT_T, - template class READ_SIZE_DISTR_T -> + template + class READ_DEST_DISTR_T, + template + class READ_ADDR_DISTR_T, + class READ_SIZE_DISTR_OUT_T, + template + class READ_SIZE_DISTR_T> void RunMixedTransfers( - Cluster& device, + Cluster& device, int num_samples, int seed, transfer_type_weights_t const& transfer_type_weights, - WriteCommandGenerator const& 
write_command_generator, - ReadCommandGenerator const& read_command_generator, - + WriteCommandGenerator const& + write_command_generator, + ReadCommandGenerator const& + read_command_generator, + bool record_command_history = false, - std::vector *command_history = nullptr -) { + std::vector* command_history = nullptr) { SCOPED_TRACE("RunMixedTransfers"); auto test_generator = TestGenerator( seed, @@ -490,7 +525,7 @@ void RunMixedTransfers( if (record_command_history) { assert(command_history != nullptr); - assert(command_history->size() == 0); // only support passing in empty command histories + assert(command_history->size() == 0); // only support passing in empty command histories command_history->reserve(num_samples); } std::vector payload = {}; @@ -513,16 +548,17 @@ void RunMixedTransfers( } } - -static ConstrainedTemplateTemplateGenerator get_default_address_generator(int seed, address_t start, address_t end) { +static ConstrainedTemplateTemplateGenerator +get_default_address_generator(int seed, address_t start, address_t end) { auto const& address_distribution = std::uniform_int_distribution(start, end); - return ConstrainedTemplateTemplateGenerator(seed + 1, address_distribution, address_aligner); + return ConstrainedTemplateTemplateGenerator( + seed + 1, address_distribution, address_aligner); } - -static ConstrainedTemplateTemplateGenerator get_default_full_dram_dest_generator(int seed, Cluster *device) { +static ConstrainedTemplateTemplateGenerator +get_default_full_dram_dest_generator(int seed, Cluster* device) { assert(device != nullptr); - tt_ClusterDescriptor *cluster_desc = device->get_cluster_description(); + tt_ClusterDescriptor* cluster_desc = device->get_cluster_description(); tt_SocDescriptor const& soc_desc = device->get_virtual_soc_descriptors().at(0); std::vector core_index_to_location = generate_core_index_locations(*cluster_desc, soc_desc); @@ -536,19 +572,23 @@ static WriteCommandGenerator< std::uniform_int_distribution, 
std::uniform_int_distribution, transfer_size_t, - std::uniform_int_distribution -> build_dummy_write_command_generator(Cluster &device) { - tt_ClusterDescriptor *cluster_desc = device.get_cluster_description(); + std::uniform_int_distribution> +build_dummy_write_command_generator(Cluster& device) { + tt_ClusterDescriptor* cluster_desc = device.get_cluster_description(); tt_SocDescriptor const& soc_desc = device.get_virtual_soc_descriptors().at(0); std::vector core_index_to_location = generate_core_index_locations(*cluster_desc, soc_desc); auto dest_generator = ConstrainedTemplateTemplateGenerator( 0, std::uniform_int_distribution(0, core_index_to_location.size() - 1), [core_index_to_location](int dest) -> destination_t { return core_index_to_location.at(dest); }); - auto addr_generator = ConstrainedTemplateTemplateGenerator(0 , std::uniform_int_distribution(0,0), address_aligner); - auto addr_generator_32B_aligned = ConstrainedTemplateTemplateGenerator(0, std::uniform_int_distribution(0,0), address_aligner_32B); - auto write_size_generator = ConstrainedTemplateTemplateGenerator( - 0, std::uniform_int_distribution(0,0), transfer_size_aligner); + auto addr_generator = ConstrainedTemplateTemplateGenerator( + 0, std::uniform_int_distribution(0, 0), address_aligner); + auto addr_generator_32B_aligned = + ConstrainedTemplateTemplateGenerator( + 0, std::uniform_int_distribution(0, 0), address_aligner_32B); + auto write_size_generator = + ConstrainedTemplateTemplateGenerator( + 0, std::uniform_int_distribution(0, 0), transfer_size_aligner); return WriteCommandGenerator(dest_generator, addr_generator, write_size_generator); } @@ -557,24 +597,25 @@ static ReadCommandGenerator< std::uniform_int_distribution, std::uniform_int_distribution, transfer_size_t, - std::uniform_int_distribution -> build_dummy_read_command_generator(Cluster &device) { - tt_ClusterDescriptor *cluster_desc = device.get_cluster_description(); + std::uniform_int_distribution> 
+build_dummy_read_command_generator(Cluster& device) { + tt_ClusterDescriptor* cluster_desc = device.get_cluster_description(); tt_SocDescriptor const& soc_desc = device.get_virtual_soc_descriptors().at(0); std::vector core_index_to_location = generate_core_index_locations(*cluster_desc, soc_desc); auto dest_generator = ConstrainedTemplateTemplateGenerator( 0, std::uniform_int_distribution(0, core_index_to_location.size() - 1), [core_index_to_location](int dest) -> destination_t { return core_index_to_location.at(dest); }); - auto addr_generator = ConstrainedTemplateTemplateGenerator(0, std::uniform_int_distribution(0,0), address_aligner); - auto read_size_generator = ConstrainedTemplateTemplateGenerator( - 0, std::uniform_int_distribution(0,0), transfer_size_aligner); + auto addr_generator = ConstrainedTemplateTemplateGenerator( + 0, std::uniform_int_distribution(0, 0), address_aligner); + auto read_size_generator = + ConstrainedTemplateTemplateGenerator( + 0, std::uniform_int_distribution(0, 0), transfer_size_aligner); return ReadCommandGenerator(dest_generator, addr_generator, read_size_generator); - } -template< +template < template class ADDR_GENERATOR_T, typename ADDR_DISTR_T, @@ -583,10 +624,9 @@ template< template class READ_SIZE_GENERATOR_T, template - class UNROLL_COUNT_GENERATOR_T -> + class UNROLL_COUNT_GENERATOR_T> void RunMixedTransfersUniformDistributions( - Cluster& device, + Cluster& device, int num_samples, int seed, @@ -597,11 +637,10 @@ void RunMixedTransfersUniformDistributions( float percent_not_last_epoch_cmd, float percent_not_remote_ordered, READ_SIZE_GENERATOR_T const& read_size_distribution, - + bool record_command_history = false, - std::vector *command_history = nullptr -) { - tt_ClusterDescriptor *cluster_desc = device.get_cluster_description(); + std::vector* command_history = nullptr) { + tt_ClusterDescriptor* cluster_desc = device.get_cluster_description(); tt_SocDescriptor const& soc_desc = 
device.get_virtual_soc_descriptors().at(0); std::vector core_index_to_location = generate_core_index_locations(*cluster_desc, soc_desc); @@ -609,21 +648,30 @@ void RunMixedTransfersUniformDistributions( seed, std::uniform_int_distribution(0, core_index_to_location.size() - 1), [&core_index_to_location](int dest) -> destination_t { return core_index_to_location.at(dest); }); - auto addr_generator = ConstrainedTemplateTemplateGenerator(seed + 1, address_distribution, address_aligner); - auto addr_generator_32B_aligned = ConstrainedTemplateTemplateGenerator(seed + 1, address_distribution, address_aligner_32B); - auto write_size_generator = ConstrainedTemplateTemplateGenerator( - seed + 2, write_size_distribution, transfer_size_aligner); - auto read_size_generator = ConstrainedTemplateTemplateGenerator( - seed + 2, read_size_distribution, transfer_size_aligner); + auto addr_generator = ConstrainedTemplateTemplateGenerator( + seed + 1, address_distribution, address_aligner); + auto addr_generator_32B_aligned = + ConstrainedTemplateTemplateGenerator( + seed + 1, address_distribution, address_aligner_32B); + auto write_size_generator = + ConstrainedTemplateTemplateGenerator( + seed + 2, write_size_distribution, transfer_size_aligner); + auto read_size_generator = + ConstrainedTemplateTemplateGenerator( + seed + 2, read_size_distribution, transfer_size_aligner); auto last_epoch_cmd_generator = ConstrainedTemplateGenerator( - seed + 3, std::bernoulli_distribution(percent_not_last_epoch_cmd), [](bool last_epoch_cmd) -> bool { return last_epoch_cmd; }); + seed + 3, std::bernoulli_distribution(percent_not_last_epoch_cmd), [](bool last_epoch_cmd) -> bool { + return last_epoch_cmd; + }); auto ordered_generator = ConstrainedTemplateGenerator( - seed + 3, std::bernoulli_distribution(percent_not_remote_ordered), [](bool ordered_with_prev_remote_write) -> bool { return ordered_with_prev_remote_write; }); + seed + 3, + std::bernoulli_distribution(percent_not_remote_ordered), + 
[](bool ordered_with_prev_remote_write) -> bool { return ordered_with_prev_remote_write; }); auto unroll_count_generator = ConstrainedTemplateTemplateGenerator( seed + 4, unroll_count_distribution, [](int unroll_count) -> int { return unroll_count; }); RunMixedTransfers( - device, + device, num_samples, seed, @@ -631,12 +679,9 @@ void RunMixedTransfersUniformDistributions( WriteCommandGenerator(dest_generator, addr_generator, write_size_generator), ReadCommandGenerator(dest_generator, addr_generator, read_size_generator), - - record_command_history, - command_history - ); + record_command_history, + command_history); } - } // namespace tt::umd::test::utils diff --git a/tests/unit_test_main.cpp b/tests/unit_test_main.cpp index ff89a889..c48ceb23 100644 --- a/tests/unit_test_main.cpp +++ b/tests/unit_test_main.cpp @@ -3,10 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 #include "gtest/gtest.h" - #include "gtest_initializer.hpp" int main(int argc, char **argv) { - initialize_gtest(argc, argv); - return RUN_ALL_TESTS(); + initialize_gtest(argc, argv); + return RUN_ALL_TESTS(); } diff --git a/tests/wormhole/test_silicon_driver_wh.cpp b/tests/wormhole/test_silicon_driver_wh.cpp index 791586c9..c85f84c5 100644 --- a/tests/wormhole/test_silicon_driver_wh.cpp +++ b/tests/wormhole/test_silicon_driver_wh.cpp @@ -1,41 +1,40 @@ // SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. 
// // SPDX-License-Identifier: Apache-2.0 -#include #include -#include +#include -#include "gtest/gtest.h" -#include "umd/device/cluster.h" #include "eth_l1_address_map.h" -#include "l1_address_map.h" +#include "gtest/gtest.h" #include "host_mem_address_map.h" - +#include "l1_address_map.h" +#include "tests/test_utils/device_test_utils.hpp" +#include "tests/test_utils/generate_cluster_desc.hpp" +#include "umd/device/cluster.h" #include "umd/device/tt_cluster_descriptor.h" #include "umd/device/wormhole_implementation.h" -#include "tests/test_utils/generate_cluster_desc.hpp" -#include "tests/test_utils/device_test_utils.hpp" using namespace tt::umd; -inline void fill_with_random_bytes(uint8_t* data, size_t n) -{ - static std::random_device rd; - static std::mt19937 gen(rd()); - static std::uniform_int_distribution dis(0, 255); - - std::generate(data, data + n, [&]() { return dis(gen); }); -} - void set_params_for_remote_txn(Cluster& device) { // Populate address map and NOC parameters that the driver needs for remote transactions - device.set_device_l1_address_params({l1_mem::address_map::L1_BARRIER_BASE, eth_l1_mem::address_map::ERISC_BARRIER_BASE, eth_l1_mem::address_map::FW_VERSION_ADDR}); + device.set_device_l1_address_params( + {l1_mem::address_map::L1_BARRIER_BASE, + eth_l1_mem::address_map::ERISC_BARRIER_BASE, + eth_l1_mem::address_map::FW_VERSION_ADDR}); } std::int32_t get_static_tlb_index(tt_xy_pair target) { - bool is_eth_location = std::find(std::cbegin(tt::umd::wormhole::ETH_LOCATIONS), std::cend(tt::umd::wormhole::ETH_LOCATIONS), target) != std::cend(tt::umd::wormhole::ETH_LOCATIONS); - bool is_tensix_location = std::find(std::cbegin(tt::umd::wormhole::T6_X_LOCATIONS), std::cend(tt::umd::wormhole::T6_X_LOCATIONS), target.x) != std::cend(tt::umd::wormhole::T6_X_LOCATIONS) && - std::find(std::cbegin(tt::umd::wormhole::T6_Y_LOCATIONS), std::cend(tt::umd::wormhole::T6_Y_LOCATIONS), target.y) != std::cend(tt::umd::wormhole::T6_Y_LOCATIONS); + bool 
is_eth_location = + std::find(std::cbegin(tt::umd::wormhole::ETH_LOCATIONS), std::cend(tt::umd::wormhole::ETH_LOCATIONS), target) != + std::cend(tt::umd::wormhole::ETH_LOCATIONS); + bool is_tensix_location = + std::find( + std::cbegin(tt::umd::wormhole::T6_X_LOCATIONS), std::cend(tt::umd::wormhole::T6_X_LOCATIONS), target.x) != + std::cend(tt::umd::wormhole::T6_X_LOCATIONS) && + std::find( + std::cbegin(tt::umd::wormhole::T6_Y_LOCATIONS), std::cend(tt::umd::wormhole::T6_Y_LOCATIONS), target.y) != + std::cend(tt::umd::wormhole::T6_Y_LOCATIONS); if (is_eth_location) { if (target.y == 6) { target.y = 1; @@ -74,7 +73,8 @@ std::int32_t get_static_tlb_index(tt_xy_pair target) { std::set get_target_devices() { std::set target_devices; - std::unique_ptr cluster_desc_uniq = tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); + std::unique_ptr cluster_desc_uniq = + tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); for (int i = 0; i < cluster_desc_uniq->get_number_of_chips(); i++) { target_devices.insert(i); } @@ -86,8 +86,15 @@ TEST(SiliconDriverWH, CreateDestroy) { uint32_t num_host_mem_ch_per_mmio_device = 1; tt_device_params default_params; // Initialize the driver with a 1x1 descriptor and explictly do not perform harvesting - for(int i = 0; i < 50; i++) { - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_1x1.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, false); + for (int i = 0; i < 50; i++) { + Cluster device = Cluster( + test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_1x1.yaml"), + tt_ClusterDescriptor::get_cluster_descriptor_file_path(), + target_devices, + num_host_mem_ch_per_mmio_device, + false, + true, + false); set_params_for_remote_txn(device); device.start_device(default_params); device.deassert_risc_reset(); @@ -101,16 +108,18 @@ TEST(SiliconDriverWH, 
Harvesting) { std::unordered_map simulated_harvesting_masks = {{0, 30}, {1, 60}}; uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); auto sdesc_per_chip = device.get_virtual_soc_descriptors(); ASSERT_EQ(device.using_harvested_soc_descriptors(), true) << "Expected Driver to have performed harvesting"; - for(const auto& chip : sdesc_per_chip) { - ASSERT_EQ(chip.second.workers.size(), 48) << "Expected SOC descriptor with harvesting to have 48 workers for chip" << chip.first; + for (const auto& chip : sdesc_per_chip) { + ASSERT_EQ(chip.second.workers.size(), 48) + << "Expected SOC descriptor with harvesting to have 48 workers for chip" << chip.first; } - for(int i = 0; i < num_devices; i++){ - ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(i), simulated_harvesting_masks.at(i)) << "Expecting chip " << i << " to have harvesting mask of " << simulated_harvesting_masks.at(i); + for (int i = 0; i < num_devices; i++) { + ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(i), simulated_harvesting_masks.at(i)) + << "Expecting chip " << i << " to have harvesting mask of " << simulated_harvesting_masks.at(i); } } @@ -120,11 +129,20 @@ TEST(SiliconDriverWH, CustomSocDesc) { uint32_t num_host_mem_ch_per_mmio_device = 1; // Initialize the driver with a 1x1 descriptor and explictly do not perform harvesting - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_1x1.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, false, simulated_harvesting_masks); + Cluster device = Cluster( + 
test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_1x1.yaml"), + tt_ClusterDescriptor::get_cluster_descriptor_file_path(), + target_devices, + num_host_mem_ch_per_mmio_device, + false, + true, + false, + simulated_harvesting_masks); auto sdesc_per_chip = device.get_virtual_soc_descriptors(); - - ASSERT_EQ(device.using_harvested_soc_descriptors(), false) << "SOC descriptors should not be modified when harvesting is disabled"; - for(const auto& chip : sdesc_per_chip) { + + ASSERT_EQ(device.using_harvested_soc_descriptors(), false) + << "SOC descriptors should not be modified when harvesting is disabled"; + for (const auto& chip : sdesc_per_chip) { ASSERT_EQ(chip.second.workers.size(), 1) << "Expected 1x1 SOC descriptor to be unmodified by driver"; } } @@ -143,22 +161,22 @@ TEST(SiliconDriverWH, HarvestingRuntime) { uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); - + for(int i = 0; i < target_devices.size(); i++) { // Iterate over MMIO devices and only setup static TLBs for worker cores if(std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { auto& sdesc = device.get_virtual_soc_descriptors().at(i); for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. + // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. 
device.configure_tlb(i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); } - } + } } device.setup_core_to_tlb_map(get_static_tlb_index_callback); - + tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); @@ -177,13 +195,13 @@ TEST(SiliconDriverWH, HarvestingRuntime) { device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, ""); device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), dynamic_write_address, "SMALL_READ_WRITE_TLB"); device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited - + test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, 40, ""); test_utils::read_data_from_device(device, dynamic_readback_vec, tt_cxy_pair(i, core), dynamic_write_address, 40, "SMALL_READ_WRITE_TLB"); ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; ASSERT_EQ(vector_to_write, dynamic_readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; device.wait_for_non_mmio_flush(); - + device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), dynamic_write_address, "SMALL_READ_WRITE_TLB"); // Clear any written data device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, ""); // Clear any written data device.wait_for_non_mmio_flush(); @@ -199,46 +217,44 @@ TEST(SiliconDriverWH, HarvestingRuntime) { #endif TEST(SiliconDriverWH, UnalignedStaticTLB_RW) { - auto get_static_tlb_index_callback = [] (tt_xy_pair target) { - return get_static_tlb_index(target); - }; + auto get_static_tlb_index_callback = [](tt_xy_pair target) { return get_static_tlb_index(target); }; std::set 
target_devices = get_target_devices(); int num_devices = target_devices.size(); uint32_t num_host_mem_ch_per_mmio_device = 1; - - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { // Iterate over MMIO devices and only setup static TLBs for worker cores - if(std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { + if (std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { auto& sdesc = device.get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. - device.configure_tlb(i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); + for (auto& core : sdesc.workers) { + // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. 
+ device.configure_tlb( + i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); } device.setup_core_to_tlb_map(i, get_static_tlb_index_callback); } } - + tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); std::vector unaligned_sizes = {3, 14, 21, 255, 362, 430, 1022, 1023, 1025}; - for(int i = 0; i < num_devices; i++) { - for(const auto& size : unaligned_sizes) { + for (int i = 0; i < num_devices; i++) { + for (const auto& size : unaligned_sizes) { std::vector write_vec(size, 0); - for(int i = 0; i < size; i++){ + for (int i = 0; i < size; i++) { write_vec[i] = size + i; } std::vector readback_vec(size, 0); std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; - for(int loop = 0; loop < 50; loop++){ - for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + for (int loop = 0; loop < 50; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { device.write_to_device(write_vec.data(), size, tt_cxy_pair(i, core), address, ""); device.wait_for_non_mmio_flush(); device.read_from_device(readback_vec.data(), tt_cxy_pair(i, core), address, size, ""); @@ -252,38 +268,34 @@ TEST(SiliconDriverWH, UnalignedStaticTLB_RW) { } address += 0x20; } - } } device.close_device(); } TEST(SiliconDriverWH, StaticTLB_RW) { - auto get_static_tlb_index_callback = [] (tt_xy_pair target) { - return get_static_tlb_index(target); - }; + auto get_static_tlb_index_callback = [](tt_xy_pair target) { return get_static_tlb_index(target); }; std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); 
auto mmio_devices = device.get_target_mmio_device_ids(); - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { // Iterate over MMIO devices and only setup static TLBs for worker cores - if(std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { + if (std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { auto& sdesc = device.get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. - device.configure_tlb(i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); + for (auto& core : sdesc.workers) { + // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. + device.configure_tlb( + i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); } device.setup_core_to_tlb_map(i, get_static_tlb_index_callback); - } + } } - tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); @@ -292,31 +304,45 @@ TEST(SiliconDriverWH, StaticTLB_RW) { std::vector readback_vec = {}; std::vector zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // Check functionality of Static TLBs by reading adn writing from statically mapped address space - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; - for(int loop = 0; loop < 100; loop++){ // Write to each core a 100 times at different statically mapped addresses - for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, ""); - device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited + // Write to each core a 100 times at 
different statically mapped addresses + for (int loop = 0; loop < 100; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + ""); + // Barrier to ensure that all writes over ethernet were commited + device.wait_for_non_mmio_flush(); test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, 40, ""); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; device.wait_for_non_mmio_flush(); - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); // Clear any written data + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); // Clear any written data device.wait_for_non_mmio_flush(); readback_vec = {}; } - address += 0x20; // Increment by uint32_t size for each write + address += 0x20; // Increment by uint32_t size for each write } } - device.close_device(); + device.close_device(); } TEST(SiliconDriverWH, DynamicTLB_RW) { - // Don't use any static TLBs in this test. All writes go through a dynamic TLB that needs to be reconfigured for each transaction + // Don't use any static TLBs in this test. 
All writes go through a dynamic TLB that needs to be reconfigured for + // each transaction std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); @@ -328,20 +354,34 @@ TEST(SiliconDriverWH, DynamicTLB_RW) { std::vector zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; std::vector readback_vec = {}; - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; - for(int loop = 0; loop < 100; loop++){ // Write to each core a 100 times at different statically mapped addresses - for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); - device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, 40, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + // Write to each core a 100 times at different statically mapped addresses + for (int loop = 0; loop < 100; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); + // Barrier to ensure that all writes over ethernet were commited device.wait_for_non_mmio_flush(); - 
device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, 40, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + device.wait_for_non_mmio_flush(); + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); device.wait_for_non_mmio_flush(); readback_vec = {}; } - address += 0x20; // Increment by uint32_t size for each write + address += 0x20; // Increment by uint32_t size for each write } } device.close_device(); @@ -354,8 +394,8 @@ TEST(SiliconDriverWH, MultiThreadedDevice) { std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); - + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); + set_params_for_remote_txn(device); tt_device_params default_params; @@ -366,11 +406,18 @@ TEST(SiliconDriverWH, MultiThreadedDevice) { std::vector vector_to_write = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; std::vector readback_vec = {}; std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; - for(int loop = 0; loop < 100; loop++) { - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "SMALL_READ_WRITE_TLB"); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from 
core " << core.x << "-" << core.y << "does not match what was written"; + for (int loop = 0; loop < 100; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, core), + address, + "SMALL_READ_WRITE_TLB"); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; readback_vec = {}; } address += 0x20; @@ -381,12 +428,19 @@ TEST(SiliconDriverWH, MultiThreadedDevice) { std::vector vector_to_write = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; std::vector readback_vec = {}; std::uint32_t address = 0x30000000; - for(auto& core_ls : device.get_virtual_soc_descriptors().at(0).dram_cores) { - for(int loop = 0; loop < 100; loop++) { - for(auto& core : core_ls) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "SMALL_READ_WRITE_TLB"); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + for (auto& core_ls : device.get_virtual_soc_descriptors().at(0).dram_cores) { + for (int loop = 0; loop < 100; loop++) { + for (auto& core : core_ls) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, core), + address, + "SMALL_READ_WRITE_TLB"); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y + << "does not match what was written"; 
readback_vec = {}; } address += 0x20; @@ -401,28 +455,26 @@ TEST(SiliconDriverWH, MultiThreadedDevice) { TEST(SiliconDriverWH, MultiThreadedMemBar) { // Have 2 threads read and write from a single device concurrently - // All (fairly large) transactions go through a static TLB. + // All (fairly large) transactions go through a static TLB. // We want to make sure the memory barrier is thread/process safe. // Memory barrier flags get sent to address 0 for all channels in this test - auto get_static_tlb_index_callback = [] (tt_xy_pair target) { - return get_static_tlb_index(target); - }; + auto get_static_tlb_index_callback = [](tt_xy_pair target) { return get_static_tlb_index(target); }; std::set target_devices = get_target_devices(); uint32_t base_addr = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); - - for(int i = 0; i < target_devices.size(); i++) { + + for (int i = 0; i < target_devices.size(); i++) { // Iterate over devices and only setup static TLBs for functional worker cores - if(std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { + if (std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { auto& sdesc = device.get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. + for (auto& core : sdesc.workers) { + // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. 
device.configure_tlb(i, core, get_static_tlb_index_callback(core), base_addr); } device.setup_core_to_tlb_map(i, get_static_tlb_index_callback); @@ -432,24 +484,41 @@ TEST(SiliconDriverWH, MultiThreadedMemBar) { tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); - + std::vector readback_membar_vec = {}; - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), l1_mem::address_map::L1_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all workers + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + test_utils::read_data_from_device( + device, + readback_membar_vec, + tt_cxy_pair(0, core), + l1_mem::address_map::L1_BARRIER_BASE, + 4, + "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all workers readback_membar_vec = {}; } - for(int chan = 0; chan < device.get_virtual_soc_descriptors().at(0).get_num_dram_channels(); chan++) { + for (int chan = 0; chan < device.get_virtual_soc_descriptors().at(0).get_num_dram_channels(); chan++) { auto core = device.get_virtual_soc_descriptors().at(0).get_core_for_dram_channel(chan, 0); - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all DRAM + test_utils::read_data_from_device( + device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all DRAM readback_membar_vec = {}; } - - for(auto& core : device.get_virtual_soc_descriptors().at(0).ethernet_cores) { - 
test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), eth_l1_mem::address_map::ERISC_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all ethernet cores + + for (auto& core : device.get_virtual_soc_descriptors().at(0).ethernet_cores) { + test_utils::read_data_from_device( + device, + readback_membar_vec, + tt_cxy_pair(0, core), + eth_l1_mem::address_map::ERISC_BARRIER_BASE, + 4, + "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), + 187); // Ensure that memory barriers were correctly initialized on all ethernet cores readback_membar_vec = {}; } @@ -459,38 +528,43 @@ TEST(SiliconDriverWH, MultiThreadedMemBar) { std::vector vec2(2560); std::vector zeros(2560, 0); - for(int i = 0; i < vec1.size(); i++) { + for (int i = 0; i < vec1.size(); i++) { vec1.at(i) = i; } - for(int i = 0; i < vec2.size(); i++) { + for (int i = 0; i < vec2.size(); i++) { vec2.at(i) = vec1.size() + i; } std::thread th1 = std::thread([&] { std::uint32_t address = base_addr; - for(int loop = 0; loop < 50; loop++) { - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + for (int loop = 0; loop < 50; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { std::vector readback_vec = {}; - device.write_to_device(vec1.data(), vec1.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); + device.write_to_device( + vec1.data(), vec1.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); device.l1_membar(0, "SMALL_READ_WRITE_TLB", {core}); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 4*vec1.size(), ""); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 4 * vec1.size(), ""); ASSERT_EQ(readback_vec, vec1); - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), 
address, ""); + device.write_to_device( + zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); readback_vec = {}; } - } }); std::thread th2 = std::thread([&] { std::uint32_t address = base_addr + vec1.size() * 4; - for(int loop = 0; loop < 50; loop++) { - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + for (int loop = 0; loop < 50; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { std::vector readback_vec = {}; - device.write_to_device(vec2.data(), vec2.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); + device.write_to_device( + vec2.data(), vec2.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); device.l1_membar(0, "SMALL_READ_WRITE_TLB", {core}); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 4*vec2.size(), ""); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 4 * vec2.size(), ""); ASSERT_EQ(readback_vec, vec2); - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "") ; + device.write_to_device( + zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); readback_vec = {}; } } @@ -499,28 +573,42 @@ TEST(SiliconDriverWH, MultiThreadedMemBar) { th1.join(); th2.join(); - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), l1_mem::address_map::L1_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers end up in the correct sate for workers + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + test_utils::read_data_from_device( + device, + readback_membar_vec, + tt_cxy_pair(0, core), + l1_mem::address_map::L1_BARRIER_BASE, + 4, + "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + 
readback_membar_vec.at(0), 187); // Ensure that memory barriers end up in the correct sate for workers readback_membar_vec = {}; } - for(auto& core : device.get_virtual_soc_descriptors().at(0).ethernet_cores) { - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), eth_l1_mem::address_map::ERISC_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers end up in the correct sate for ethernet cores + for (auto& core : device.get_virtual_soc_descriptors().at(0).ethernet_cores) { + test_utils::read_data_from_device( + device, + readback_membar_vec, + tt_cxy_pair(0, core), + eth_l1_mem::address_map::ERISC_BARRIER_BASE, + 4, + "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), + 187); // Ensure that memory barriers end up in the correct sate for ethernet cores readback_membar_vec = {}; } device.close_device(); } - TEST(SiliconDriverWH, BroadcastWrite) { // Broadcast multiple vectors to tensix and dram grid. 
Verify broadcasted data is read back correctly std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); @@ -534,40 +622,71 @@ TEST(SiliconDriverWH, BroadcastWrite) { std::set rows_to_exclude_for_dram_broadcast = {}; std::set cols_to_exclude_for_dram_broadcast = {1, 2, 3, 4, 6, 7, 8, 9}; - for(const auto& size : broadcast_sizes) { + for (const auto& size : broadcast_sizes) { std::vector vector_to_write(size); std::vector zeros(size); std::vector readback_vec = {}; - for(int i = 0; i < size; i++) { + for (int i = 0; i < size; i++) { vector_to_write[i] = i; zeros[i] = 0; } // Broadcast to Tensix - device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude, cols_to_exclude, "LARGE_WRITE_TLB"); + device.broadcast_write_to_cluster( + vector_to_write.data(), + vector_to_write.size() * 4, + address, + {}, + rows_to_exclude, + cols_to_exclude, + "LARGE_WRITE_TLB"); // Broadcast to DRAM - device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude_for_dram_broadcast, cols_to_exclude_for_dram_broadcast, "LARGE_WRITE_TLB"); + device.broadcast_write_to_cluster( + vector_to_write.data(), + vector_to_write.size() * 4, + address, + {}, + rows_to_exclude_for_dram_broadcast, + cols_to_exclude_for_dram_broadcast, + "LARGE_WRITE_TLB"); device.wait_for_non_mmio_flush(); - for(const auto i : target_devices) { - for(const auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - if(rows_to_exclude.find(core.y) != rows_to_exclude.end()) 
continue; - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was broadcasted"; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data + for (const auto i : target_devices) { + for (const auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + if (rows_to_exclude.find(core.y) != rows_to_exclude.end()) { + continue; + } + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y + << "does not match what was broadcasted"; + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "LARGE_WRITE_TLB"); // Clear any written data readback_vec = {}; } - for(int chan = 0; chan < device.get_virtual_soc_descriptors().at(i).get_num_dram_channels(); chan++) { + for (int chan = 0; chan < device.get_virtual_soc_descriptors().at(i).get_num_dram_channels(); chan++) { const auto& core = device.get_virtual_soc_descriptors().at(i).get_core_for_dram_channel(chan, 0); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from DRAM core " << i << " " << core.x << "-" << core.y << " does not match what was broadcasted " << size; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, 
vector_to_write.size() * 4, "LARGE_READ_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from DRAM core " << i << " " << core.x << "-" << core.y + << " does not match what was broadcasted " << size; + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "LARGE_WRITE_TLB"); // Clear any written data readback_vec = {}; } } // Wait for data to be cleared before writing next block device.wait_for_non_mmio_flush(); } - device.close_device(); + device.close_device(); } TEST(SiliconDriverWH, VirtualCoordinateBroadcast) { @@ -575,20 +694,22 @@ TEST(SiliconDriverWH, VirtualCoordinateBroadcast) { std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); tt_device_params default_params; device.start_device(default_params); auto eth_version = device.get_ethernet_fw_version(); - bool virtual_bcast_supported = (eth_version >= tt_version(6, 8, 0) || eth_version == tt_version(6, 7, 241)) && device.translation_tables_en; + bool virtual_bcast_supported = + (eth_version >= tt_version(6, 8, 0) || eth_version == tt_version(6, 7, 241)) && device.translation_tables_en; if (!virtual_bcast_supported) { device.close_device(); - GTEST_SKIP() << "SiliconDriverWH.VirtualCoordinateBroadcast skipped since ethernet version does not support Virtual Coordinate Broadcast or NOC translation is not enabled"; + GTEST_SKIP() << "SiliconDriverWH.VirtualCoordinateBroadcast skipped since ethernet version does not support " + "Virtual Coordinate Broadcast or NOC translation is not enabled"; } - + 
device.deassert_risc_reset(); std::vector broadcast_sizes = {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384}; uint32_t address = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; @@ -597,43 +718,73 @@ TEST(SiliconDriverWH, VirtualCoordinateBroadcast) { std::set rows_to_exclude_for_dram_broadcast = {}; std::set cols_to_exclude_for_dram_broadcast = {1, 2, 3, 4, 6, 7, 8, 9}; - for(const auto& size : broadcast_sizes) { + for (const auto& size : broadcast_sizes) { std::vector vector_to_write(size); std::vector zeros(size); std::vector readback_vec = {}; - for(int i = 0; i < size; i++) { + for (int i = 0; i < size; i++) { vector_to_write[i] = i; zeros[i] = 0; } // Broadcast to Tensix - device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude, cols_to_exclude, "LARGE_WRITE_TLB"); + device.broadcast_write_to_cluster( + vector_to_write.data(), + vector_to_write.size() * 4, + address, + {}, + rows_to_exclude, + cols_to_exclude, + "LARGE_WRITE_TLB"); // Broadcast to DRAM - device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude_for_dram_broadcast, cols_to_exclude_for_dram_broadcast, "LARGE_WRITE_TLB"); + device.broadcast_write_to_cluster( + vector_to_write.data(), + vector_to_write.size() * 4, + address, + {}, + rows_to_exclude_for_dram_broadcast, + cols_to_exclude_for_dram_broadcast, + "LARGE_WRITE_TLB"); device.wait_for_non_mmio_flush(); - for(const auto i : target_devices) { - for(const auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - if(rows_to_exclude.find(core.y) != rows_to_exclude.end()) continue; - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was broadcasted"; - device.write_to_device(zeros.data(), 
zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data + for (const auto i : target_devices) { + for (const auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + if (rows_to_exclude.find(core.y) != rows_to_exclude.end()) { + continue; + } + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y + << "does not match what was broadcasted"; + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "LARGE_WRITE_TLB"); // Clear any written data readback_vec = {}; } - for(int chan = 0; chan < device.get_virtual_soc_descriptors().at(i).get_num_dram_channels(); chan++) { + for (int chan = 0; chan < device.get_virtual_soc_descriptors().at(i).get_num_dram_channels(); chan++) { const auto& core = device.get_virtual_soc_descriptors().at(i).get_core_for_dram_channel(chan, 0); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from DRAM core " << i << " " << core.x << "-" << core.y << " does not match what was broadcasted " << size; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from DRAM core " << i << " " << core.x << "-" << core.y + << " does not match what was broadcasted " << size; + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + 
"LARGE_WRITE_TLB"); // Clear any written data readback_vec = {}; } } // Wait for data to be cleared before writing next block device.wait_for_non_mmio_flush(); } - device.close_device(); + device.close_device(); } - /** * This is a basic DMA test -- not using the PCIe controller's DMA engine, but * rather using the ability of the NOC to access the host system bus via traffic @@ -658,58 +809,132 @@ TEST(SiliconDriverWH, VirtualCoordinateBroadcast) { TEST(SiliconDriverWH, SysmemTestWithPcie) { auto target_devices = get_target_devices(); - Cluster device(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), - tt_ClusterDescriptor::get_cluster_descriptor_file_path(), - target_devices, - 1, // one "host memory channel", currently a 1G huge page - false, // skip driver allocs - no (don't skip) - true, // clean system resources - yes - true); // perform harvesting - yes + Cluster cluster( + 1, // one "host memory channel", currently a 1G huge page + false, // skip driver allocs - no (don't skip) + true, // clean system resources - yes + true); // perform harvesting - yes - set_params_for_remote_txn(device); - device.start_device(tt_device_params{}); // no special parameters + set_params_for_remote_txn(cluster); + cluster.start_device(tt_device_params{}); // no special parameters - // PCIe core is at (x=0, y=3) on Wormhole NOC0. const chip_id_t mmio_chip_id = 0; - const size_t PCIE_X = 0; // NOC0 - const size_t PCIE_Y = 3; // NOC0 - const tt_cxy_pair PCIE_CORE(mmio_chip_id, PCIE_X, PCIE_Y); + const auto PCIE = cluster.get_soc_descriptor(mmio_chip_id).pcie_cores.at(0); + const tt_cxy_pair PCIE_CORE(mmio_chip_id, PCIE.x, PCIE.y); const size_t test_size_bytes = 0x4000; // Arbitrarilly chosen, but small size so the test runs quickly. + // PCIe core is at (x=0, y=3) on Wormhole NOC0. + ASSERT_EQ(PCIE.x, 0); + ASSERT_EQ(PCIE.y, 3); + // Bad API: how big is the buffer? How do we know it's big enough? 
// Situation today is that there's a 1G hugepage behind it, although this is // unclear from the API and may change in the future. - uint8_t *sysmem = (uint8_t*)device.host_dma_address(0, 0, 0); + uint8_t* sysmem = (uint8_t*)cluster.host_dma_address(0, 0, 0); ASSERT_NE(sysmem, nullptr); // This is the address inside the Wormhole PCIe block that is mapped to the // system bus. In Wormhole, this is a fixed address, 0x8'0000'0000. // The driver should have mapped this address to the bottom of sysmem. - uint64_t base_address = device.get_pcie_base_addr_from_device(mmio_chip_id); + uint64_t base_address = cluster.get_pcie_base_addr_from_device(mmio_chip_id); // Buffer that we will use to read sysmem into, then write sysmem from. std::vector buffer(test_size_bytes, 0x0); // Step 1: Fill sysmem with random bytes. - fill_with_random_bytes(sysmem, test_size_bytes); + test_utils::fill_with_random_bytes(sysmem, test_size_bytes); // Step 2: Read sysmem into buffer. - device.read_from_device(&buffer[0], PCIE_CORE, base_address, buffer.size(), "REG_TLB"); + cluster.read_from_device(&buffer[0], PCIE_CORE, base_address, buffer.size(), "REG_TLB"); // Step 3: Verify that buffer matches sysmem. ASSERT_EQ(buffer, std::vector(sysmem, sysmem + test_size_bytes)); // Step 4: Fill buffer with random bytes. - fill_with_random_bytes(&buffer[0], test_size_bytes); + test_utils::fill_with_random_bytes(&buffer[0], test_size_bytes); // Step 5: Write buffer into sysmem, overwriting what was there. - device.write_to_device(&buffer[0], buffer.size(), PCIE_CORE, base_address, "REG_TLB"); + cluster.write_to_device(&buffer[0], buffer.size(), PCIE_CORE, base_address, "REG_TLB"); // Step 5b: Read back sysmem into a throwaway buffer. The intent is to // ensure the write has completed before we check sysmem against buffer. 
std::vector throwaway(test_size_bytes, 0x0); - device.read_from_device(&throwaway[0], PCIE_CORE, base_address, throwaway.size(), "REG_TLB"); + cluster.read_from_device(&throwaway[0], PCIE_CORE, base_address, throwaway.size(), "REG_TLB"); // Step 6: Verify that sysmem matches buffer. ASSERT_EQ(buffer, std::vector(sysmem, sysmem + test_size_bytes)); } + +/** + * Same idea as above, but with four channels of sysmem and random addresses. + * The hardware mechanism is too slow to sweep the entire range. + */ +TEST(SiliconDriverWH, RandomSysmemTestWithPcie) { + const size_t num_channels = 2; // ideally 4, but CI seems to have 2... + auto target_devices = get_target_devices(); + + Cluster cluster( + test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), + tt_ClusterDescriptor::get_cluster_descriptor_file_path(), + target_devices, + num_channels, + false, // skip driver allocs - no (don't skip) + true, // clean system resources - yes + true); // perform harvesting - yes + + set_params_for_remote_txn(cluster); + cluster.start_device(tt_device_params{}); // no special parameters + + const chip_id_t mmio_chip_id = 0; + const auto PCIE = cluster.get_soc_descriptor(mmio_chip_id).pcie_cores.at(0); + const tt_cxy_pair PCIE_CORE(mmio_chip_id, PCIE.x, PCIE.y); + const size_t ONE_GIG = 1 << 30; + const size_t num_tests = 0x20000; // runs in a reasonable amount of time + + // PCIe core is at (x=0, y=3) on Wormhole NOC0. 
+ ASSERT_EQ(PCIE.x, 0); + ASSERT_EQ(PCIE.y, 3); + + const uint64_t ALIGNMENT = sizeof(uint32_t); + auto generate_aligned_address = [&](uint64_t lo, uint64_t hi) -> uint64_t { + static std::random_device rd; + static std::mt19937_64 gen(rd()); + std::uniform_int_distribution dis(lo / ALIGNMENT, hi / ALIGNMENT); + return dis(gen) * ALIGNMENT; + }; + + uint64_t base_address = cluster.get_pcie_base_addr_from_device(mmio_chip_id); + for (size_t channel = 0; channel < num_channels; ++channel) { + uint8_t* sysmem = (uint8_t*)cluster.host_dma_address(0, 0, channel); + ASSERT_NE(sysmem, nullptr); + + test_utils::fill_with_random_bytes(sysmem, ONE_GIG); + + uint64_t lo = (ONE_GIG * channel); + uint64_t hi = (lo + ONE_GIG) - 1; + + if (channel == 3) { + // TODO: I thought everything past 0xffff'dddd was registers or + // something, but a) I don't know what's actually there, and b) + // the unusable range seems to be bigger than that... so + // restricting to 0x8'f000'0000. + hi &= ~0x0fff'ffffULL; + } + + for (size_t i = 0; i < num_tests; ++i) { + uint64_t address = generate_aligned_address(lo, hi); + uint64_t noc_addr = base_address + address; + uint64_t sysmem_address = address - lo; + + ASSERT_GE(address, lo) << "Address too low"; + ASSERT_LE(address, hi) << "Address too high"; + ASSERT_EQ(address % ALIGNMENT, 0) << "Address not properly aligned"; + + uint32_t value = 0; + cluster.read_from_device(&value, PCIE_CORE, noc_addr, sizeof(uint32_t), "REG_TLB"); + + uint32_t expected = *reinterpret_cast(&sysmem[sysmem_address]); + ASSERT_EQ(value, expected) << fmt::format("Mismatch at address {:#x}", address); + } + } +} diff --git a/tests/wormhole/test_umd_remote_api_stability.cpp b/tests/wormhole/test_umd_remote_api_stability.cpp index 16f1d101..26978a2b 100644 --- a/tests/wormhole/test_umd_remote_api_stability.cpp +++ b/tests/wormhole/test_umd_remote_api_stability.cpp @@ -2,58 +2,51 @@ // // SPDX-License-Identifier: Apache-2.0 +#include #include +#include #include #include 
#include -#include "umd/device/tt_cluster_descriptor.h" -#include "umd/device/cluster.h" - #include "common/logger.hpp" #include "eth_interface.h" #include "filesystem" #include "gtest/gtest.h" #include "host_mem_address_map.h" #include "l1_address_map.h" -#include "umd/device/tt_soc_descriptor.h" - -#include "tests/test_utils/stimulus_generators.hpp" -#include "tests/test_utils/generate_cluster_desc.hpp" #include "test_wh_common.h" - -#include -#include +#include "tests/test_utils/generate_cluster_desc.hpp" +#include "tests/test_utils/stimulus_generators.hpp" +#include "umd/device/cluster.h" +#include "umd/device/tt_cluster_descriptor.h" +#include "umd/device/tt_soc_descriptor.h" namespace tt::umd::test::utils { class WormholeNebulaX2TestFixture : public WormholeTestFixture { - private: - static int detected_num_chips; - static bool skip_tests; - - protected: - - static constexpr int EXPECTED_NUM_CHIPS = 2; - static uint32_t scale_number_of_tests; - - static void SetUpTestSuite() { - std::unique_ptr cluster_desc = tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); - detected_num_chips = cluster_desc->get_number_of_chips(); - if (detected_num_chips != EXPECTED_NUM_CHIPS) { - skip_tests = true; +private: + static int detected_num_chips; + static bool skip_tests; + +protected: + static constexpr int EXPECTED_NUM_CHIPS = 2; + static uint32_t scale_number_of_tests; + + static void SetUpTestSuite() { + std::unique_ptr cluster_desc = + tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); + detected_num_chips = cluster_desc->get_number_of_chips(); + if (detected_num_chips != EXPECTED_NUM_CHIPS) { + skip_tests = true; + } + if (char const* scale_number_of_tests_env = std::getenv("SCALE_NUMBER_OF_TESTS")) { + scale_number_of_tests = std::atoi(scale_number_of_tests_env); + } } - if(char const* scale_number_of_tests_env = std::getenv("SCALE_NUMBER_OF_TESTS")) { - scale_number_of_tests 
= std::atoi(scale_number_of_tests_env); - } - } - virtual int get_detected_num_chips() { - return detected_num_chips; - } + virtual int get_detected_num_chips() { return detected_num_chips; } - virtual bool is_test_skipped() { - return skip_tests; - } + virtual bool is_test_skipped() { return skip_tests; } }; int WormholeNebulaX2TestFixture::detected_num_chips = -1; @@ -63,28 +56,29 @@ uint32_t WormholeNebulaX2TestFixture::scale_number_of_tests = 1; TEST_F(WormholeNebulaX2TestFixture, MixedRemoteTransfersMediumSmall) { int seed = 0; - log_info(LogSiliconDriver,"Started MixedRemoteTransfersMediumSmall"); + log_info(LogSiliconDriver, "Started MixedRemoteTransfersMediumSmall"); std::vector command_history; try { assert(device != nullptr); RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0.25, .read = 0.25}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history code to command line + false, + &command_history); } catch (...) 
{ print_command_history_executable_code(command_history); } @@ -93,88 +87,92 @@ TEST_F(WormholeNebulaX2TestFixture, MixedRemoteTransfersMediumSmall) { TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersMediumSmall) { int seed = 0; - log_info(LogSiliconDriver,"Started MultithreadedMixedRemoteTransfersMediumSmall"); + log_info(LogSiliconDriver, "Started MultithreadedMixedRemoteTransfersMediumSmall"); assert(device != nullptr); std::vector command_history0; std::vector command_history1; std::vector command_history2; std::vector command_history3; - std::thread t1([&](){ + std::thread t1([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0.50, .read = 0.50}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history0 - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history code to command line + false, + &command_history0); }); - std::thread t2([&](){ + std::thread t2([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 100, - transfer_type_weights_t{.write = 0.25, .read = 0.50}, - - 
std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history1 - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history code to command line + false, + &command_history1); }); - std::thread t3([&](){ + std::thread t3([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 23, - transfer_type_weights_t{.write = 0.5, .read = 0.25}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command 
line - &command_history2 - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history code to command line + false, + &command_history2); }); - std::thread t4([&](){ + std::thread t4([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 99, - transfer_type_weights_t{.write = 1.0, .read = 0.0}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history3 - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history code to command line + false, + &command_history3); }); t1.join(); @@ -186,154 +184,155 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersMediumSmall TEST_F(WormholeNebulaX2TestFixture, MixedRemoteTransfersLarge) { int seed = 0; - log_info(LogSiliconDriver,"Started MixedRemoteTransfersLarge"); + log_info(LogSiliconDriver, "Started MixedRemoteTransfersLarge"); assert(device != nullptr); std::vector command_history; try { RunMixedTransfersUniformDistributions( - *device, + *device, 10000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0.15, .read 
= 0.15}, - - std::uniform_int_distribution(0x10000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 300000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x10000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 300000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 300000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + // Set to true if you want to emit the command history code to command line + std::uniform_int_distribution(0x4, 300000), + false, + &command_history); } catch (...) { print_command_history_executable_code(command_history); } - } TEST_F(WormholeNebulaX2TestFixture, WritesOnlyNormalDistributionMean10kStd3kMinSizeTruncate4) { int seed = 0; - log_info(LogSiliconDriver,"Started WritesOnlyNormalDistributionMean10kStd3kMinSizeTruncate4"); + log_info(LogSiliconDriver, "Started WritesOnlyNormalDistributionMean10kStd3kMinSizeTruncate4"); assert(device != nullptr); std::vector command_history; auto write_size_generator = ConstrainedTemplateTemplateGenerator( - seed, std::normal_distribution<>(10000, 3000), [](double x) -> transfer_size_t { return size_aligner_32B(static_cast((x >= 4) ? x : 4)); }); - + seed, std::normal_distribution<>(10000, 3000), [](double x) -> transfer_size_t { + return size_aligner_32B(static_cast((x >= 4) ? 
x : 4)); + }); auto dest_generator = get_default_full_dram_dest_generator(seed, device.get()); auto address_generator = get_default_address_generator(seed, 0x100000, 0x5000000); try { RunMixedTransfers( - *device, + *device, 10000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 1., .read = 0.}, - WriteCommandGenerator(dest_generator, address_generator, write_size_generator), build_dummy_read_command_generator(*device), - - false, // Set to true if you want to emit the command history code to command line - &command_history - ); + // Set to true if you want to emit the command history code to command line + false, + &command_history); } catch (...) { print_command_history_executable_code(command_history); } - } TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLMS) { int seed = 0; - log_info(LogSiliconDriver,"Started MultithreadedMixedRemoteTransfersLMS"); + log_info(LogSiliconDriver, "Started MultithreadedMixedRemoteTransfersLMS"); assert(device != nullptr); std::vector command_history0; std::vector command_history1; std::vector command_history2; std::vector command_history3; - std::thread t1([&](){ + std::thread t1([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0.50, .read = 0.50}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(4, 300000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(4, 300000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& 
read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history0 - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history code to command line + false, + &command_history0); }); - std::thread t2([&](){ + std::thread t2([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 100, - transfer_type_weights_t{.write = 0.25, .read = 0.50}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history1 - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history code to command line + false, + &command_history1); }); - std::thread t3([&](){ + std::thread t3([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 23, - transfer_type_weights_t{.write = 0.5, .read = 0.25}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - 
std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history2 - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history code to command line + false, + &command_history2); }); - std::thread t4([&](){ + std::thread t4([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 99, - transfer_type_weights_t{.write = 1.0, .read = 0.0}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history3 - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history 
code to command line + false, + &command_history3); }); t1.join(); @@ -345,85 +344,80 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLMS) { TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLargeWritesSmallReads) { int seed = 0; - log_info(LogSiliconDriver,"Started MultithreadedMixedRemoteTransfersLargeWritesSmallReads"); + log_info(LogSiliconDriver, "Started MultithreadedMixedRemoteTransfersLargeWritesSmallReads"); assert(device != nullptr); std::vector command_history0; std::vector command_history1; - auto write_size_generator = ConstrainedTemplateTemplateGenerator( - seed, std::uniform_int_distribution(1000000, 30000000), [](transfer_size_t x) -> transfer_size_t { return size_aligner_32B(static_cast((x >= 4) ? x : 4)); }); - auto read_size_generator = ConstrainedTemplateTemplateGenerator( - seed, std::uniform_int_distribution(16, 4096), [](transfer_size_t x) -> transfer_size_t { return size_aligner_32B(static_cast((x >= 4) ? x : 4)); }); + auto write_size_generator = + ConstrainedTemplateTemplateGenerator( + seed, + std::uniform_int_distribution(1000000, 30000000), + [](transfer_size_t x) -> transfer_size_t { + return size_aligner_32B(static_cast((x >= 4) ? x : 4)); + }); + auto read_size_generator = + ConstrainedTemplateTemplateGenerator( + seed, std::uniform_int_distribution(16, 4096), [](transfer_size_t x) -> transfer_size_t { + return size_aligner_32B(static_cast((x >= 4) ? 
x : 4)); + }); auto dest_generator = get_default_full_dram_dest_generator(seed, device.get()); auto address_generator = get_default_address_generator(seed, 0x100000, 0x5000000); - std::thread write_cmds_thread1([&](){ + std::thread write_cmds_thread1([&]() { RunMixedTransfers( *device, 10000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 1., .read = 0.}, - WriteCommandGenerator(dest_generator, address_generator, write_size_generator), build_dummy_read_command_generator(*device), - - false, // Set to true if you want to emit the command history code to command line - &command_history0 - ); + // Set to true if you want to emit the command history code to command line + false, + &command_history0); }); - std::thread write_cmds_thread2([&](){ + std::thread write_cmds_thread2([&]() { RunMixedTransfers( *device, 10000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 1., .read = 0.}, - WriteCommandGenerator(dest_generator, address_generator, write_size_generator), build_dummy_read_command_generator(*device), - - false, // Set to true if you want to emit the command history code to command line - &command_history0 - ); + // Set to true if you want to emit the command history code to command line + false, + &command_history0); }); - std::thread read_cmd_threads1([&](){ + std::thread read_cmd_threads1([&]() { RunMixedTransfers( *device, 10000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0, .read = 1.}, - build_dummy_write_command_generator(*device), ReadCommandGenerator(dest_generator, address_generator, read_size_generator), - - false, // Set to true if you want to emit the command history code to command line - &command_history0 - ); + // Set to true if you want to emit the command history code to command line + false, + &command_history0); }); - std::thread read_cmd_threads2([&](){ + std::thread read_cmd_threads2([&]() { RunMixedTransfers( *device, 10000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0, .read 
= 1.}, - build_dummy_write_command_generator(*device), ReadCommandGenerator(dest_generator, address_generator, read_size_generator), - - false, // Set to true if you want to emit the command history code to command line - &command_history0 - ); + // Set to true if you want to emit the command history code to command line + false, + &command_history0); }); write_cmds_thread1.join(); write_cmds_thread2.join(); read_cmd_threads1.join(); read_cmd_threads2.join(); - } -} // namespace tt::umd::test::utils +} // namespace tt::umd::test::utils diff --git a/tests/wormhole/test_wh_common.h b/tests/wormhole/test_wh_common.h index e96ad803..fe76e3c2 100644 --- a/tests/wormhole/test_wh_common.h +++ b/tests/wormhole/test_wh_common.h @@ -5,80 +5,77 @@ */ #pragma once -#include "umd/device/tt_cluster_descriptor.h" -#include "umd/device/cluster.h" -#include "umd/device/tt_xy_pair.h" #include "eth_l1_address_map.h" - -#include "tests/test_utils/stimulus_generators.hpp" #include "tests/test_utils/generate_cluster_desc.hpp" +#include "tests/test_utils/stimulus_generators.hpp" +#include "umd/device/cluster.h" +#include "umd/device/tt_cluster_descriptor.h" +#include "umd/device/tt_xy_pair.h" namespace tt::umd::test::utils { static void set_params_for_remote_txn(Cluster& device) { // Populate address map and NOC parameters that the driver needs for remote transactions - device.set_device_l1_address_params({l1_mem::address_map::L1_BARRIER_BASE, eth_l1_mem::address_map::ERISC_BARRIER_BASE, eth_l1_mem::address_map::FW_VERSION_ADDR}); + device.set_device_l1_address_params( + {l1_mem::address_map::L1_BARRIER_BASE, + eth_l1_mem::address_map::ERISC_BARRIER_BASE, + eth_l1_mem::address_map::FW_VERSION_ADDR}); } class WormholeTestFixture : public ::testing::Test { - protected: - // You can remove any or all of the following functions if their bodies would - // be empty. +protected: + // You can remove any or all of the following functions if their bodies would + // be empty. 
- std::unique_ptr device; + std::unique_ptr device; - WormholeTestFixture() { + WormholeTestFixture() {} - } + ~WormholeTestFixture() override { + // You can do clean-up work that doesn't throw exceptions here. + } - ~WormholeTestFixture() override { - // You can do clean-up work that doesn't throw exceptions here. - } + virtual int get_detected_num_chips() = 0; + virtual bool is_test_skipped() = 0; - virtual int get_detected_num_chips() = 0; - virtual bool is_test_skipped() = 0; + // If the constructor and destructor are not enough for setting up + // and cleaning up each test, you can define the following methods: - // If the constructor and destructor are not enough for setting up - // and cleaning up each test, you can define the following methods: + void SetUp() override { + // Code here will be called immediately after the constructor (right + // before each test). - void SetUp() override { - // Code here will be called immediately after the constructor (right - // before each test). + if (is_test_skipped()) { + GTEST_SKIP() << "Test is skipped due to incorrect number of chips"; + } - if (is_test_skipped()) { - GTEST_SKIP() << "Test is skipped due to incorrect number of chips"; - } + assert(get_detected_num_chips() > 0); + auto devices = std::vector(get_detected_num_chips()); + std::iota(devices.begin(), devices.end(), 0); + std::set target_devices = {devices.begin(), devices.end()}; + uint32_t num_host_mem_ch_per_mmio_device = 1; + device = std::make_unique(num_host_mem_ch_per_mmio_device, false, true, true); + assert(device != nullptr); + assert(device->get_cluster_description()->get_number_of_chips() == get_detected_num_chips()); - // std::cout << "Setting Up Test." 
<< std::endl; - assert(get_detected_num_chips() > 0); - auto devices = std::vector(get_detected_num_chips()); - std::iota(devices.begin(), devices.end(), 0); - std::set target_devices = {devices.begin(), devices.end()}; - uint32_t num_host_mem_ch_per_mmio_device = 1; - device = std::make_unique(test_utils::GetAbsPath(SOC_DESC_PATH), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); - assert(device != nullptr); - assert(device->get_cluster_description()->get_number_of_chips() == get_detected_num_chips()); + set_params_for_remote_txn(*device); - set_params_for_remote_txn(*device); + tt_device_params default_params; + device->start_device(default_params); - tt_device_params default_params; - device->start_device(default_params); + device->deassert_risc_reset(); - device->deassert_risc_reset(); - - device->wait_for_non_mmio_flush(); - } + device->wait_for_non_mmio_flush(); + } - void TearDown() override { - // Code here will be called immediately after each test (right - // before the destructor). + void TearDown() override { + // Code here will be called immediately after each test (right + // before the destructor). - if (!is_test_skipped()) { - // std::cout << "Tearing Down Test." << std::endl; - device->close_device(); + if (!is_test_skipped()) { + device->close_device(); + } } - } - }; -} // namespace tt::umd::test::utils +} // namespace tt::umd::test::utils