diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp
index 50616bf25c9..39380695040 100644
--- a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp
+++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp
@@ -64,8 +64,6 @@ void kernel_main() {
     auto worker_buffer_index_semaphore_addr = get_semaphore(get_arg_val<uint32_t>(arg_idx++));
     bool connected_to_persistent_fabric = get_arg_val<uint32_t>(arg_idx++) != 0;
 
-    DPRINT << "worker_buffer_index_semaphore_addr: " << (uint32_t)worker_buffer_index_semaphore_addr << "\n";
-    DPRINT << "connected_to_persistent_fabric: " << (uint32_t)connected_to_persistent_fabric << "\n";
     // TODO: move to semaphore
     auto edm_buffer_index_sem_id = get_arg_val<uint32_t>(arg_idx++);
     ASSERT(edm_buffer_index_sem_id < 8);
@@ -80,7 +78,6 @@ void kernel_main() {
     } else {
         config.unicast.distance = static_cast<uint8_t>(get_arg_val<uint32_t>(arg_idx++));
     }
-    DPRINT << "config.unicast.distance: " << (uint32_t)config.unicast.distance << "\n";
 
     const InterleavedAddrGen<dest_is_dram> dest_addr_gen = {
         .bank_base_address = dest_addr, .page_size = page_size};
@@ -101,9 +98,7 @@ void kernel_main() {
         writer_send_sem_addr,
         worker_buffer_index_semaphore_addr);
 
-    DPRINT << "sender open\n";
     sender.open();
-    DPRINT << "opened\n";
 
     constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0;
 
@@ -116,10 +111,8 @@ void kernel_main() {
     uint32_t buffer_index = 0;
     cb_wait_front(cb_id_in0, 1);
     auto a_packet_header_addr = get_read_ptr(cb_id_in0);
-    DPRINT << "total_pages_to_send: " << (uint32_t)total_pages_to_send << "\n";
     for (uint32_t p = 0; p < total_pages_to_send; p += num_pages_per_send) {
         uint32_t pages_to_send = std::min<uint32_t>(num_pages_per_send, total_pages_to_send - p);
-        DPRINT << "wait empty write slot\n";
         sender.wait_for_empty_write_slot();
         cb_wait_front(cb_id_in0, pages_to_send);
 
@@ -156,22 +149,15 @@ void kernel_main() {
         uint64_t buffer_address = sender.edm_buffer_addr +
                                   (*sender.buffer_index_ptr * (sender.buffer_size_bytes + sizeof(eth_channel_sync_t)));
         sender.send_payload_blocking_from_address(packet_addr, packet_size);
-        DPRINT << "noc write barrier\n";
         noc_async_writes_flushed();
-        DPRINT << "cb pop front\n";
         cb_pop_front(cb_id_in0, pages_to_send);
-        DPRINT << "cb pop front done\n";
     }
 
-    DPRINT << "DONE MAIN LOOP\n";
     if constexpr (!mcast_mode) {
-        DPRINT << "TEARDOWN\n";
         sender.wait_for_empty_write_slot();
 
         auto& packet_header = *reinterpret_cast<tt::fabric::PacketHeader*>(a_packet_header_addr);
         ASSERT(*last_message_semaphore_address == 0);
-        packet_header.reserved = 0xE;
-        packet_header.reserved2 = 0xFFFF;
         packet_header.to_atomic_inc();
         packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{2});
         packet_header.to_noc_unicast_atomic_inc(tt::fabric::NocUnicastAtomicIncCommandHeader(
@@ -180,16 +166,12 @@ void kernel_main() {
         sender.send_payload_blocking_from_address(
             a_packet_header_addr, packet_header.get_payload_size_including_header());
 
-        DPRINT << "WAITING FOR COMPLETION LOOPBACK @: " << (uint32_t)last_message_semaphore_address << "\n";
         noc_semaphore_wait(last_message_semaphore_address, 1);
-        DPRINT << "\t got it!\n";
     }
 
     bool closed_fabric_connection = terminate_fabric_endpoints_farthest_to_nearest(sender, a_packet_header_addr, arg_idx);
 
     if (!closed_fabric_connection) {
-        DPRINT << "CLOSE\n";
         sender.close();
     }
-    DPRINT << "DONE\n";
 }
diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp
index 5697ea5b307..f699132dbca 100644
--- a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp
+++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp
@@ -8,6 +8,7 @@
 #include "ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp"
 #include "ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp"
 #include "tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp"
+#include "ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp"
 
 struct unicast_mode {
     uint8_t distance;
@@ -51,9 +52,7 @@ auto forward_to_fabric_from_cb(
 
     // bit of a hack to extract X/Y
     const auto dest_noc_address = get_noc_addr(current_page, dest_addr_gen, 0, NORMALIZED_NOC_INDEX);
-    const size_t dest_addr = dest_noc_address & 0xFFFFFFFF;
-    const size_t dest_noc_x = (dest_noc_address >> NOC_ADDR_LOCAL_BITS) & ((1 << NOC_ADDR_NODE_ID_BITS) - 1);
-    const size_t dest_noc_y = (dest_noc_address >> (NOC_ADDR_LOCAL_BITS + NOC_ADDR_NODE_ID_BITS)) & ((1 << NOC_ADDR_NODE_ID_BITS) - 1);
+    const auto [dest_worker_noc, dest_addr] = get_noc_address_components(dest_noc_address);
     const size_t packet_size = page_size + sizeof(tt::fabric::PacketHeader);
 
     auto packet_addr = get_read_ptr(cb_id);
@@ -64,20 +63,16 @@ auto forward_to_fabric_from_cb(
             .to_noc_unicast(tt::fabric::NocUnicastCommandHeader{
                 dest_addr,
                 (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader),
-                static_cast<uint8_t>(dest_noc_x),
-                static_cast<uint8_t>(dest_noc_y)
-            });
-        packet_header.reserved2 = 0x1111; // debug only
+                static_cast<uint8_t>(dest_worker_noc.x),
+                static_cast<uint8_t>(dest_worker_noc.y)});
     } else {
         packet_header.to_write()
             .to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{config.unicast.distance})
             .to_noc_unicast(tt::fabric::NocUnicastCommandHeader{
                 dest_addr,
                 (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader),
-                static_cast<uint8_t>(dest_noc_x),
-                static_cast<uint8_t>(dest_noc_y)
-            });
-        packet_header.reserved2 = 0x1111; // debug only
+                static_cast<uint8_t>(dest_worker_noc.x),
+                static_cast<uint8_t>(dest_worker_noc.y)});
     }
 
     uint64_t buffer_address = sender.edm_buffer_addr + (*sender.buffer_index_ptr * (sender.buffer_size_bytes + sizeof(eth_channel_sync_t)));
@@ -196,12 +191,13 @@ void kernel_main() {
 
     sender.wait_for_empty_write_slot();
 
+    constexpr size_t kLoopbackNumHopsToMyChip = 2;
     auto &packet_header = *reinterpret_cast<tt::fabric::PacketHeader*>(a_packet_header_addr);
     ASSERT(*last_message_semaphore_address == 0);
     packet_header.reserved = 0xE;
     packet_header.reserved2 = 0xFFFF;
     packet_header.to_atomic_inc();
-    packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{2});
+    packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{kLoopbackNumHopsToMyChip});
     packet_header.to_noc_unicast_atomic_inc(tt::fabric::NocUnicastAtomicIncCommandHeader(
             reinterpret_cast<size_t>(last_message_semaphore_address),
             1,
diff --git a/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp
index 834f71d6f06..13e2166bae6 100644
--- a/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp
+++ b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp
@@ -68,8 +68,7 @@ class T3000TestDevice {
         arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name());
 
         num_devices_ = tt::tt_metal::GetNumAvailableDevices();
-        if (arch_ == tt::ARCH::WORMHOLE_B0 and tt::tt_metal::GetNumAvailableDevices() == 8 and
-            tt::tt_metal::GetNumPCIeDevices() == 4) {
+        if (arch_ == tt::ARCH::WORMHOLE_B0 and num_devices_ == 8 and tt::tt_metal::GetNumPCIeDevices() == 4) {
             mesh_device_ = MeshDevice::create(MeshDeviceConfig(MeshShape{2, 4}));
 
             std::vector<chip_id_t> ids(num_devices_, 0);
@@ -1128,7 +1127,7 @@ int TestLineFabricEntrypoint(
     auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name());
     auto num_devices = tt::tt_metal::GetNumAvailableDevices();
     if (num_devices < 4) {
-        log_info("This test can only be run on N300 devices");
+        log_info("This test can only be run on T3000 devices");
         return 0;
     }
     if (arch == tt::ARCH::GRAYSKULL) {
@@ -1141,7 +1140,7 @@ int TestLineFabricEntrypoint(
 
     // build a line of devices
     std::vector<Device*> devices = {
-        view->get_device(0, 0), view->get_device(0, 1), view->get_device(0, 2), view->get_device(0, 3)};
+        view.get_device(0, 0), view.get_device(0, 1), view.get_device(0, 2), view.get_device(0, 3)};
     std::vector<Program> programs(enable_persistent_fabric ? 1 : devices.size());
     std::optional<SubdeviceInfo> subdevice_managers = std::nullopt;
     std::optional<std::vector<Program>> fabric_programs;
@@ -1211,7 +1210,7 @@ int TestLoopbackEntrypoint(
     auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name());
     auto num_devices = tt::tt_metal::GetNumAvailableDevices();
     if (num_devices < 4) {
-        log_info("This test can only be run on N300 devices");
+        log_info("This test can only be run on T3000 devices");
         return 0;
     }
     if (arch == tt::ARCH::GRAYSKULL) {
@@ -1222,8 +1221,8 @@ int TestLoopbackEntrypoint(
     T3000TestDevice test_fixture;
     auto view = test_fixture.mesh_device_->get_view();
 
-    const auto& device_0 = view->get_device(0, 0);
-    const auto& device_1 = view->get_device(0, 1);
+    const auto& device_0 = view.get_device(0, 0);
+    const auto& device_1 = view.get_device(0, 1);
 
     auto const& active_eth_cores = device_0->get_active_ethernet_cores(true);
     auto eth_sender_core_iter = active_eth_cores.begin();
@@ -1385,7 +1384,7 @@ bool TestMultiInputReaderKernel(
     auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name());
     auto num_devices = tt::tt_metal::GetNumAvailableDevices();
     if (num_devices < 4) {
-        log_info("This test can only be run on N300 devices");
+        log_info("This test can only be run on T3000 devices");
         return true;
     }
     if (arch == tt::ARCH::GRAYSKULL) {
@@ -1403,7 +1402,7 @@ bool TestMultiInputReaderKernel(
     std::vector<Device*> devices;
     devices.reserve(fabric_num_devices);
     for (size_t i = 0; i < fabric_num_devices; i++) {
-        devices.push_back(view->get_device(0, i));
+        devices.push_back(view.get_device(0, i));
     }
 
     std::vector<Program> programs(enable_persistent_fabric ? 1 : devices.size());
@@ -1702,23 +1701,15 @@ bool RunMultiInputReaderTestPropagateFullTensorIn(
     Tensor output_tensor0 = ttnn::ones(tensor_shape, DataType::UINT32, layout).reshape(tensor_shape);
     Tensor output_tensor1 = ttnn::ones(tensor_shape, DataType::UINT32, layout).reshape(tensor_shape);
     input_tensor0.set_tensor_spec(TensorSpec(
-        logical_shape,
-        TensorLayout(
-            /*DataType::BFLOAT16*/ DataType::UINT32, PageConfig(layout, tt_metal::Tile()), in0_memory_config)));
+        logical_shape, TensorLayout(DataType::UINT32, PageConfig(layout, tt_metal::Tile()), in0_memory_config)));
     input_tensor1.set_tensor_spec(TensorSpec(
-        logical_shape,
-        TensorLayout(
-            /*DataType::BFLOAT16*/ DataType::UINT32, PageConfig(layout, tt_metal::Tile()), in1_memory_config)));
+        logical_shape, TensorLayout(DataType::UINT32, PageConfig(layout, tt_metal::Tile()), in1_memory_config)));
     output_tensor0.set_tensor_spec(TensorSpec(
-        logical_shape,
-        TensorLayout(
-            /*DataType::BFLOAT16*/ DataType::UINT32, PageConfig(layout, tt_metal::Tile()), out0_memory_config)));
+        logical_shape, TensorLayout(DataType::UINT32, PageConfig(layout, tt_metal::Tile()), out0_memory_config)));
     output_tensor1.set_tensor_spec(TensorSpec(
-        logical_shape,
-        TensorLayout(
-            /*DataType::BFLOAT16*/ DataType::UINT32, PageConfig(layout, tt_metal::Tile()), out1_memory_config)));
+        logical_shape, TensorLayout(DataType::UINT32, PageConfig(layout, tt_metal::Tile()), out1_memory_config)));
 
-    size_t page_size = tile_size(DataFormat::RawUInt32);  // DataFormat::Float16);// DataFormat::RawUInt32);
+    size_t page_size = tile_size(DataFormat::RawUInt32);
 
     ttnn::ccl::Shape4D<uint32_t> tensor_shape_in_pages = shape_to_shape_in_tiles(tensor_shape);
     ttnn::ccl::Shape4D<uint32_t> tensor_slice_shape_in_pages = tensor_shape_in_pages;
@@ -1726,12 +1717,12 @@ bool RunMultiInputReaderTestPropagateFullTensorIn(
     ttnn::ccl::Shape4D<uint32_t> worker_slice_shape = tensor_shape_in_pages;
     ttnn::ccl::Shape4D<uint32_t> worker_slice_offset = {0, 0, 0, 0};
 
-    ttnn::ccl::v2::TensorSlice tensor_slice{// using ords_t = Shape4D<uint32_t>;
-                                            tensor_shape_in_pages,
-                                            tensor_slice_shape_in_pages,
-                                            tensor_slice_offset,
-                                            worker_slice_shape,
-                                            worker_slice_offset};
+    ttnn::ccl::v2::TensorSlice tensor_slice{
+        tensor_shape_in_pages,
+        tensor_slice_shape_in_pages,
+        tensor_slice_offset,
+        worker_slice_shape,
+        worker_slice_offset};
 
     auto const in0_tensor_slice = tensor_slice;
     auto const in1_tensor_slice = tensor_slice;
@@ -1755,7 +1746,7 @@ bool RunMultiInputReaderTestPropagateFullTensorIn(
         out1_tensor_slice,
 
         page_size,
-        test_writeback_mode,  // TwoInputReaderKernelWriteMode::LOCAL_WRITEBACK,
+        test_writeback_mode,
         ttnn::ccl::cmd::LocalOnlyCommandDestArgs{},
         false);
 
@@ -1978,7 +1969,7 @@ TEST(WorkerCclCommandProcessingKernelLocalMode, MultiInputReader_MultiPage0_Shar
 // Copying even slightly large tensors exposes issues in underlying tensor code
 // that isn't under test here
 TEST(WorkerCclCommandProcessingKernelLocalMode, MultiInputReader_MultiPage1) {
-    ttnn::Shape tensor_shape = {1, 1, 256, 256};  // 16384 crashes... TBD
+    ttnn::Shape tensor_shape = {1, 1, 256, 256};
     auto pass = RunMultiInputReaderTestPropagateFullTensorIn(
         tensor_shape,
         Layout::TILE,
@@ -1993,8 +1984,6 @@ TEST(WorkerCclCommandProcessingKernelLocalMode, MultiInputReader_MultiPage1) {
 // TODO: update the test infra to be able to properly compare tensors if we are only
 // doing a slice of the larger tensor
 
-// EnablePersistentKernelCache
-
 // ////////////////////////////////////////////////////////////////////
 // ////////////////////////////////////////////////////////////////////
 // ////               FABRIC UNICAST TENSOR WRITE (2 INPUT)
@@ -2002,7 +1991,7 @@ TEST(WorkerCclCommandProcessingKernelLocalMode, MultiInputReader_MultiPage1) {
 // ////////////////////////////////////////////////////////////////////
 
 TEST(WorkerCclCommandProcessingKernelFabricUnicastMode, MultiInputReader_SinglePageTile_OneHop) {
-    ttnn::Shape tensor_shape = {1, 1, 32, 32};  // 16384 crashes... TBD
+    ttnn::Shape tensor_shape = {1, 1, 32, 32};
     constexpr size_t distance_dest_device = 1;
     constexpr size_t num_devices = 4;
     auto logical_shape = tensor_shape.logical_shape();
@@ -2035,12 +2024,12 @@ TEST(WorkerCclCommandProcessingKernelFabricUnicastMode, MultiInputReader_SingleP
     ttnn::ccl::Shape4D<uint32_t> worker_slice_shape = tensor_shape_in_pages;
     ttnn::ccl::Shape4D<uint32_t> worker_slice_offset = {0, 0, 0, 0};
 
-    ttnn::ccl::v2::TensorSlice tensor_slice{// using ords_t = Shape4D<uint32_t>;
-                                            tensor_shape_in_pages,
-                                            tensor_slice_shape_in_pages,
-                                            tensor_slice_offset,
-                                            worker_slice_shape,
-                                            worker_slice_offset};
+    ttnn::ccl::v2::TensorSlice tensor_slice{
+        tensor_shape_in_pages,
+        tensor_slice_shape_in_pages,
+        tensor_slice_offset,
+        worker_slice_shape,
+        worker_slice_offset};
 
     auto const in0_tensor_slice = tensor_slice;
     auto const in1_tensor_slice = tensor_slice;
@@ -2073,7 +2062,7 @@ TEST(WorkerCclCommandProcessingKernelFabricUnicastMode, MultiInputReader_SingleP
 }
 
 TEST(WorkerCclCommandProcessingKernelFabricUnicastMode, MultiInputReader_SinglePageTile_OneHop_PersistentFabric) {
-    ttnn::Shape tensor_shape = {1, 1, 32, 32};  // 16384 crashes... TBD
+    ttnn::Shape tensor_shape = {1, 1, 32, 32};
     constexpr size_t distance_dest_device = 1;
     constexpr size_t num_devices = 4;
     auto logical_shape = tensor_shape.logical_shape();
@@ -2106,12 +2095,12 @@ TEST(WorkerCclCommandProcessingKernelFabricUnicastMode, MultiInputReader_SingleP
     ttnn::ccl::Shape4D<uint32_t> worker_slice_shape = tensor_shape_in_pages;
     ttnn::ccl::Shape4D<uint32_t> worker_slice_offset = {0, 0, 0, 0};
 
-    ttnn::ccl::v2::TensorSlice tensor_slice{// using ords_t = Shape4D<uint32_t>;
-                                            tensor_shape_in_pages,
-                                            tensor_slice_shape_in_pages,
-                                            tensor_slice_offset,
-                                            worker_slice_shape,
-                                            worker_slice_offset};
+    ttnn::ccl::v2::TensorSlice tensor_slice{
+        tensor_shape_in_pages,
+        tensor_slice_shape_in_pages,
+        tensor_slice_offset,
+        worker_slice_shape,
+        worker_slice_offset};
 
     auto const in0_tensor_slice = tensor_slice;
     auto const in1_tensor_slice = tensor_slice;
@@ -2184,12 +2173,12 @@ void RunFabricMcastFullTensorPropagateTest(
     ttnn::ccl::Shape4D<uint32_t> worker_slice_shape = tensor_shape_in_pages;
     ttnn::ccl::Shape4D<uint32_t> worker_slice_offset = {0, 0, 0, 0};
 
-    ttnn::ccl::v2::TensorSlice tensor_slice{// using ords_t = Shape4D<uint32_t>;
-                                            tensor_shape_in_pages,
-                                            tensor_slice_shape_in_pages,
-                                            tensor_slice_offset,
-                                            worker_slice_shape,
-                                            worker_slice_offset};
+    ttnn::ccl::v2::TensorSlice tensor_slice{
+        tensor_shape_in_pages,
+        tensor_slice_shape_in_pages,
+        tensor_slice_offset,
+        worker_slice_shape,
+        worker_slice_offset};
 
     auto const in0_tensor_slice = tensor_slice;
     auto const in1_tensor_slice = tensor_slice;
@@ -2222,38 +2211,38 @@ void RunFabricMcastFullTensorPropagateTest(
 }
 
 TEST(WorkerCclCommandProcessingKernelFabricMulticastMode, MultiInputReader_SinglePageTile_SingleHop) {
-    ttnn::Shape tensor_shape = {1, 1, 32, 32};  // 16384 crashes... TBD
+    ttnn::Shape tensor_shape = {1, 1, 32, 32};
     constexpr size_t distance_dest_device = 1;
     constexpr size_t num_devices = 4;
     RunFabricMcastFullTensorPropagateTest(tensor_shape, distance_dest_device, num_devices, false);
 }
 TEST(WorkerCclCommandProcessingKernelFabricMulticastMode, MultiInputReader_SinglePageTile_TwoHop) {
-    ttnn::Shape tensor_shape = {1, 1, 32, 32};  // 16384 crashes... TBD
+    ttnn::Shape tensor_shape = {1, 1, 32, 32};
     constexpr size_t distance_dest_device = 2;
     constexpr size_t num_devices = 4;
     RunFabricMcastFullTensorPropagateTest(tensor_shape, distance_dest_device, num_devices, false);
 }
 TEST(WorkerCclCommandProcessingKernelFabricMulticastMode, MultiInputReader_SinglePageTile_ThreeHop) {
-    ttnn::Shape tensor_shape = {1, 1, 32, 32};  // 16384 crashes... TBD
+    ttnn::Shape tensor_shape = {1, 1, 32, 32};
     constexpr size_t distance_dest_device = 3;
     constexpr size_t num_devices = 4;
     RunFabricMcastFullTensorPropagateTest(tensor_shape, distance_dest_device, num_devices, false);
 }
 
 TEST(WorkerCclCommandProcessingKernelFabricMulticastMode, MultiInputReader_4PageTile_SingleHop) {
-    ttnn::Shape tensor_shape = {1, 1, 32, 128};  // 16384 crashes... TBD
+    ttnn::Shape tensor_shape = {1, 1, 32, 128};
     constexpr size_t distance_dest_device = 1;
     constexpr size_t num_devices = 4;
     RunFabricMcastFullTensorPropagateTest(tensor_shape, distance_dest_device, num_devices, false);
 }
 TEST(WorkerCclCommandProcessingKernelFabricMulticastMode, DMultiInputReader_4PageTile_TwoHop) {
-    ttnn::Shape tensor_shape = {1, 1, 128, 32};  // 16384 crashes... TBD
+    ttnn::Shape tensor_shape = {1, 1, 128, 32};
     constexpr size_t distance_dest_device = 2;
     constexpr size_t num_devices = 4;
     RunFabricMcastFullTensorPropagateTest(tensor_shape, distance_dest_device, num_devices, false);
 }
 TEST(WorkerCclCommandProcessingKernelFabricMulticastMode, MultiInputReader_4PageTile_ThreeHop) {
-    ttnn::Shape tensor_shape = {1, 1, 64, 64};  // 16384 crashes... TBD
+    ttnn::Shape tensor_shape = {1, 1, 64, 64};
     constexpr size_t distance_dest_device = 3;
     constexpr size_t num_devices = 4;
     RunFabricMcastFullTensorPropagateTest(tensor_shape, distance_dest_device, num_devices, false);
@@ -2266,39 +2255,39 @@ TEST(WorkerCclCommandProcessingKernelFabricMulticastMode, MultiInputReader_lotsP
 }
 
 TEST(WorkerCclCommandProcessingKernelFabricMulticastMode, MultiInputReader_SinglePageTile_SingleHop_PersistentFabric) {
-    ttnn::Shape tensor_shape = {1, 1, 32, 32};  // 16384 crashes... TBD
+    ttnn::Shape tensor_shape = {1, 1, 32, 32};
     constexpr size_t distance_dest_device = 1;
     constexpr size_t num_devices = 4;
     RunFabricMcastFullTensorPropagateTest(tensor_shape, distance_dest_device, num_devices, true);
 }
 
 TEST(WorkerCclCommandProcessingKernelFabricMulticastMode, MultiInputReader_SinglePageTile_TwoHop_PersistentFabric) {
-    ttnn::Shape tensor_shape = {1, 1, 32, 32};  // 16384 crashes... TBD
+    ttnn::Shape tensor_shape = {1, 1, 32, 32};
     constexpr size_t distance_dest_device = 2;
     constexpr size_t num_devices = 4;
     RunFabricMcastFullTensorPropagateTest(tensor_shape, distance_dest_device, num_devices, true);
 }
 TEST(WorkerCclCommandProcessingKernelFabricMulticastMode, MultiInputReader_SinglePageTile_ThreeHop_PersistentFabric) {
-    ttnn::Shape tensor_shape = {1, 1, 32, 32};  // 16384 crashes... TBD
+    ttnn::Shape tensor_shape = {1, 1, 32, 32};
     constexpr size_t distance_dest_device = 3;
     constexpr size_t num_devices = 4;
     RunFabricMcastFullTensorPropagateTest(tensor_shape, distance_dest_device, num_devices, true);
 }
 
 TEST(WorkerCclCommandProcessingKernelFabricMulticastMode, MultiInputReader_4PageTile_SingleHop_PersistentFabric) {
-    ttnn::Shape tensor_shape = {1, 1, 32, 128};  // 16384 crashes... TBD
+    ttnn::Shape tensor_shape = {1, 1, 32, 128};
     constexpr size_t distance_dest_device = 1;
     constexpr size_t num_devices = 4;
     RunFabricMcastFullTensorPropagateTest(tensor_shape, distance_dest_device, num_devices, true);
 }
 TEST(WorkerCclCommandProcessingKernelFabricMulticastMode, DMultiInputReader_4PageTile_TwoHop_PersistentFabric) {
-    ttnn::Shape tensor_shape = {1, 1, 128, 32};  // 16384 crashes... TBD
+    ttnn::Shape tensor_shape = {1, 1, 128, 32};
     constexpr size_t distance_dest_device = 2;
     constexpr size_t num_devices = 4;
     RunFabricMcastFullTensorPropagateTest(tensor_shape, distance_dest_device, num_devices, true);
 }
 TEST(WorkerCclCommandProcessingKernelFabricMulticastMode, MultiInputReader_4PageTile_ThreeHop_PersistentFabric) {
-    ttnn::Shape tensor_shape = {1, 1, 64, 64};  // 16384 crashes... TBD
+    ttnn::Shape tensor_shape = {1, 1, 64, 64};
     constexpr size_t distance_dest_device = 3;
     constexpr size_t num_devices = 4;
     RunFabricMcastFullTensorPropagateTest(tensor_shape, distance_dest_device, num_devices, true);
@@ -2330,7 +2319,7 @@ bool RunPipelinedWorkersTest(
     auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name());
     auto num_devices = tt::tt_metal::GetNumAvailableDevices();
     if (num_devices < 4) {
-        log_info("This test can only be run on N300 devices");
+        log_info("This test can only be run on T3000 devices");
         return true;
     }
     if (arch == tt::ARCH::GRAYSKULL) {
@@ -2347,7 +2336,7 @@ bool RunPipelinedWorkersTest(
     T3000TestDevice test_fixture;
     auto view = test_fixture.mesh_device_->get_view();
 
-    Device* device = view->get_device(0, 0);
+    Device* device = view.get_device(0, 0);
     ;
 
     // General setup is as follows:
@@ -2357,7 +2346,6 @@ bool RunPipelinedWorkersTest(
     // HOWEVER. the reader will be programmed to read the chunks in a different order than they were written, this way
     // we can identify synchronization related bugs (e.g. if sender semaphore increments before writes flush)
 
-    ///// PROMOTE EVERYTHING ABOVE HERE TO ARGUMENTS TO THIS CODE WHICH WILL EVENTUALLY BECOME A FUNCTION
     TT_FATAL(num_workers_per_stage.size() == num_stages, "Must have a read order for each stage");
     TT_FATAL(worker_chunk_read_order.size() == num_stages, "Must have a read order for each stage");
     for (size_t i = 0; i < num_stages; ++i) {
@@ -2559,7 +2547,7 @@ bool RunPipelinedWorkersTest(
                 {&device_tensors[stage]},
                 {page_size_bytes},
                 device,
-                cb_packet_size_in_pages,  // TODO: get from fabric
+                cb_packet_size_in_pages,
                 {worker_cores.at(worker)},
                 reader_cmd_stream,
                 std::nullopt,
@@ -2571,7 +2559,7 @@ bool RunPipelinedWorkersTest(
                 {&device_tensors[stage + 1]},
                 {page_size_bytes},
                 device,
-                cb_packet_size_in_pages,  // TODO: get from fabric
+                cb_packet_size_in_pages,
                 {worker_cores.at(worker)},
                 writer_cmd_stream,
                 std::nullopt,
@@ -2865,8 +2853,6 @@ TEST(
 #include "ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.hpp"
 #include "tt_metal/common/bfloat16.hpp"
 TEST(CclAsyncOp, ReduceScatterSmall_PersistentFabric) {
-    // JUST MAKE SURE IT DOESN'T HANG - CORRECTNESS CHECKS ARE DONE IN PYTEST
-    // HERE WE JUST CHECK THE BASIC PLUMBING IS DONE
     const size_t dim = 3;
     const size_t num_links = 1;
     constexpr auto layout = Layout::TILE;
@@ -2874,7 +2860,7 @@ TEST(CclAsyncOp, ReduceScatterSmall_PersistentFabric) {
     auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name());
     constexpr size_t test_expected_num_devices = 4;
     if (tt::tt_metal::GetNumAvailableDevices() < test_expected_num_devices) {
-        log_info("This test can only be run on N300 devices");
+        log_info("This test can only be run on T3000 devices");
         return;
     }
     if (arch == tt::ARCH::GRAYSKULL) {
@@ -2886,7 +2872,7 @@ TEST(CclAsyncOp, ReduceScatterSmall_PersistentFabric) {
 
     // build a line of devices
     std::vector<Device*> devices = {
-        view->get_device(0, 1), view->get_device(1, 1), view->get_device(1, 2), view->get_device(0, 2)};
+        view.get_device(0, 1), view.get_device(1, 1), view.get_device(1, 2), view.get_device(0, 2)};
     const size_t num_devices = devices.size();
     TT_FATAL(
         test_expected_num_devices == num_devices,
@@ -2911,8 +2897,7 @@ TEST(CclAsyncOp, ReduceScatterSmall_PersistentFabric) {
         device_input_tensors.push_back(t.to(devices[i]));
     }
     // Need to make it a mesh tensor for use with the op
-    const Tensor input_mesh_tensor =
-        ttnn::distributed::api::aggregate_as_tensor(device_input_tensors, AllGatherTensor{});
+    const Tensor input_mesh_tensor = ttnn::distributed::aggregate_as_tensor(device_input_tensors, AllGatherTensor{});
 
     // FABRIC setup
     const bool enable_persistent_fabric = true;
@@ -2969,7 +2954,7 @@ void run_all_gather_with_persistent_fabric(const size_t dim, const size_t num_li
     auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name());
     constexpr size_t test_expected_num_devices = 4;
     if (tt::tt_metal::GetNumAvailableDevices() < test_expected_num_devices) {
-        log_info("This test can only be run on N300 devices");
+        log_info("This test can only be run on T3000 devices");
         return;
     }
     if (arch == tt::ARCH::GRAYSKULL) {
@@ -2981,7 +2966,7 @@ void run_all_gather_with_persistent_fabric(const size_t dim, const size_t num_li
 
     // build a line of devices
     std::vector<Device*> devices = {
-        view->get_device(0, 0), view->get_device(0, 1), view->get_device(0, 2), view->get_device(0, 3)};
+        view.get_device(0, 0), view.get_device(0, 1), view.get_device(0, 2), view.get_device(0, 3)};
     const size_t num_devices = devices.size();
     TT_FATAL(
         test_expected_num_devices == num_devices,
@@ -3004,8 +2989,7 @@ void run_all_gather_with_persistent_fabric(const size_t dim, const size_t num_li
         device_input_tensors.push_back(t.to(devices[i]));
     }
     // Need to make it a mesh tensor for use with the op
-    const Tensor input_mesh_tensor =
-        ttnn::distributed::api::aggregate_as_tensor(device_input_tensors, AllGatherTensor{});
+    const Tensor input_mesh_tensor = ttnn::distributed::aggregate_as_tensor(device_input_tensors, AllGatherTensor{});
 
     // FABRIC setup
     const bool enable_persistent_fabric = true;
@@ -3044,10 +3028,7 @@ void run_all_gather_with_persistent_fabric(const size_t dim, const size_t num_li
 
     log_info(tt::LogTest, "Fabric teardown");
     persistent_fabric_teardown_sequence(
-        devices,
-        subdevice_managers,
-        fabric_handle.value(),
-        tt::fabric::TerminationSignal::IMMEDIATELY_TERMINATE);  // tt::fabric::TerminationSignal::GRACEFULLY_TERMINATE);
+        devices, subdevice_managers, fabric_handle.value(), tt::fabric::TerminationSignal::IMMEDIATELY_TERMINATE);
 
     log_info(tt::LogTest, "Waiting for teardown completion");
     for (auto d : devices) {
diff --git a/ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.cpp b/ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.cpp
index 330f0e84902..e09ddf81d93 100644
--- a/ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.cpp
+++ b/ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.cpp
@@ -21,9 +21,7 @@
 #include <optional>
 #include <variant>
 
-namespace ttnn {
-namespace ccl {
-namespace worker_detail {
+namespace ttnn::ccl::worker_detail {
 
 CCLWorkerArgBuilder::CCLWorkerArgBuilder(
     Device const* device,
@@ -1068,9 +1066,8 @@ void generate_multi_input_command_stream_kernel_rt_args(
                     ttnn::ccl::emit_address_generator_runtime_args(t->buffer()->device(), *t),
                     std::back_inserter(rt_args));
             }
-        } else {
-            // Interleaved addrgen passes no additional args - we specify interleaved addrgen as the default
         }
+        // else: Interleaved addrgen passes no additional args - we specify interleaved addrgen as the default
     }
 
     rt_args.push_back(forward_fabric_connections.has_value());
@@ -1570,6 +1567,4 @@ std::vector<uint32_t> CCLWorkerArgBuilder::generate_sender_writer_kernel_ct_args
     return args;
 }
 
-}  // namespace worker_detail
-}  // namespace ccl
-}  // namespace ttnn
+}  // namespace ttnn::ccl::worker_detail
diff --git a/ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.hpp b/ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.hpp
index e3c31986c70..79699816337 100644
--- a/ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.hpp
+++ b/ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.hpp
@@ -13,19 +13,16 @@
 #include <optional>
 #include <unordered_map>
 
-namespace tt {
-namespace tt_metal {
+namespace tt::tt_metal {
 inline namespace v0 {
 
 // Forward declarations
 class Device;
 
 }  // namespace v0
-}  // namespace tt_metal
-}  // namespace tt
+}  // namespace tt::tt_metal
 
-namespace ttnn {
-namespace ccl {
+namespace ttnn::ccl {
 class WorkerEdmInterfaceArgs;
 class SenderWorkerAdapterSpec;
 
@@ -172,5 +169,4 @@ struct CCLWorkerArgBuilder {
 };
 
 }  // namespace worker_detail
-}  // namespace ccl
-}  // namespace ttnn
+}  // namespace ttnn::ccl
diff --git a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp
index 2d1d8b8f9b5..ca6e26a33e0 100644
--- a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp
+++ b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp
@@ -652,7 +652,6 @@ FORCE_INLINE void try_advance_read_tensor_to_cb(command_context_t<Addrgen>& cmd_
             contig_pages_advanced = std::min<uint16_t>(cmd_ctx.packet_size_in_pages - i, contig_pages_);
             ASSERT(contig_pages_advanced > 0);
             ASSERT(contig_pages_advanced <= cmd_ctx.packet_size_in_pages);
-            ASSERT(cmd_ctx.page_size != 0xD3AD);
             noc_async_read(noc_addr, l1_write_addr, cmd_ctx.page_size * contig_pages_advanced);
         }
         l1_write_addr += cmd_ctx.page_size * contig_pages_advanced;
diff --git a/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.cpp b/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.cpp
index 9f2ed3a94e3..d954dacb906 100644
--- a/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.cpp
+++ b/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.cpp
@@ -721,65 +721,14 @@ void EdmLineFabricOpInterface::teardown_from_host(tt::fabric::TerminationSignal
 }
 
 void initialize_edm_fabric(distributed::MeshDevice* mesh_device) {
-    // auto build = [](std::vector<Device*> const& line_view) {
-    //     std::vector<Program> programs(line_view.size());
-    //     std::vector<Program*> program_ptrs;
-    //     program_ptrs.reserve(programs.size());
-    //     std::transform(programs.begin(), programs.end(), std::back_inserter(program_ptrs), [](Program& p) { return &p; });
-    //     EdmLineFabricOpInterface edm_fabric(line_view, program_ptrs, true);
-    //     edm_fabric.build_kernels();
-    //     for (size_t i = 0; i < line_view.size(); i++) {
-    //         log_info(tt::LogAlways, "Compile EDM program");
-    //         Device *device = line_view[i];
-    //         device->push_work([&](){tt::tt_metal::detail::CompileProgram(line_view[i], programs[i]);}, false);
-    //     }
-    //     for (size_t i = 0; i < line_view.size(); i++) {
-    //         Device *device = line_view[i];
-    //         Program &program = programs[i];
-    //         log_info(tt::LogAlways, "Enqueue EDM program");
-    //         device->push_work([&](){tt::tt_metal::EnqueueProgram(device->command_queue(), program, false);}, true);
-
-    //         device->push_work([&](){
-    //             auto wait_initialized = [&](uint32_t addr) {
-    //                 auto wait_core = [&](CoreCoord const& core) {
-    //                     bool initialized = false;
-    //                     constexpr size_t max_attempts = 10000;
-    //                     size_t attempts = 0;
-    //                     while (!initialized) {
-    //                         auto host_buffer = tt::llrt::read_hex_vec_from_core(device->id(), core, addr, 4);
-    //                         initialized = host_buffer[0] == 0;
-    //                         attempts++;
-    //                         if (attempts > max_attempts) {
-    //                             log_error(tt::LogAlways, "Failed to initialize EDM fabric");
-    //                             break;
-    //                         }
-    //                     }
-    //                     log_info(tt::LogAlways,"Initialized EDM fabric");
-    //                 };
-    //                 for (auto const& builder : edm_fabric.edm_builders_backward_direction[device->id()]) {
-    //                     wait_core(CoreCoord(builder.my_noc_x, builder.my_noc_y));
-    //                 }
-    //                 for (auto const& builder : edm_fabric.edm_builders_forward_direction[device->id()]) {
-    //                     wait_core(CoreCoord(builder.my_noc_x, builder.my_noc_y));
-    //                 }
-    //             };
-
-    //             wait_initialized(FabricEriscDatamoverConfig::sender_channel_0_buffer_index_semaphore_address);
-    //             wait_initialized(FabricEriscDatamoverConfig::sender_channel_0_local_flow_control_semaphore_address);
-    //             wait_initialized(FabricEriscDatamoverConfig::sender_channel_0_connection_semaphore_address);
-
-    //         }, true);
-    //     }
-    //     log_info(tt::LogAlways, "DONE");
-    // };
 
     std::vector<EdmLineFabricOpInterface> row_fabric_lines;
-    row_fabric_lines.reserve(mesh_device->get_view()->get_row_views().size());
+    row_fabric_lines.reserve(mesh_device->get_view().get_row_views().size());
     std::vector<EdmLineFabricOpInterface> col_fabric_lines;
-    col_fabric_lines.reserve(mesh_device->get_view()->get_column_views().size());
+    col_fabric_lines.reserve(mesh_device->get_view().get_column_views().size());
 
-    size_t num_rows = mesh_device->get_view()->get_row_views().size();
-    size_t num_cols = mesh_device->get_view()->get_column_views().size();
+    size_t num_rows = mesh_device->get_view().get_row_views().size();
+    size_t num_cols = mesh_device->get_view().get_column_views().size();
     std::vector<std::vector<Program>> programs(num_rows);
     for (size_t r = 0; r < num_rows; r++) {
         programs[r].resize(num_cols);
@@ -789,7 +738,7 @@ void initialize_edm_fabric(distributed::MeshDevice* mesh_device) {
         std::vector<Program*> program_ptrs;
         program_ptrs.reserve(num_cols);
         std::transform(programs[i].begin(), programs[i].end(), std::back_inserter(program_ptrs), [](Program& p) { return &p; });
-        row_fabric_lines.push_back(EdmLineFabricOpInterface(mesh_device->get_view()->get_row_views()[i], program_ptrs, true));
+        row_fabric_lines.push_back(EdmLineFabricOpInterface(mesh_device->get_view().get_row_views()[i], program_ptrs, true));
     }
 
     for (size_t i = 0; i < num_cols; i++) {
@@ -798,7 +747,7 @@ void initialize_edm_fabric(distributed::MeshDevice* mesh_device) {
         for (size_t r = 0; r < num_rows; r++) {
             program_ptrs.push_back(&programs[r][i]);
         }
-        col_fabric_lines.push_back(EdmLineFabricOpInterface(mesh_device->get_view()->get_column_views()[i], program_ptrs, true));
+        col_fabric_lines.push_back(EdmLineFabricOpInterface(mesh_device->get_view().get_column_views()[i], program_ptrs, true));
     }
 
     std::for_each(row_fabric_lines.begin(), row_fabric_lines.end(), [](auto& line) { line.build_kernels(); });
@@ -825,10 +774,10 @@ void teardown_edm_fabric(distributed::MeshDevice* mesh_device) {
         edm_fabric.teardown_from_host(tt::fabric::TerminationSignal::IMMEDIATELY_TERMINATE);
     };
 
-    for (auto const &row_view : mesh_device->get_view()->get_row_views()) {
+    for (auto const &row_view : mesh_device->get_view().get_row_views()) {
         teardown(row_view);
     }
-    for (auto const &col_view : mesh_device->get_view()->get_column_views()) {
+    for (auto const &col_view : mesh_device->get_view().get_column_views()) {
         teardown(col_view);
     }
 }
diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async.cpp
index 7afc6f4c4ed..7ce729ed1b7 100644
--- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async.cpp
+++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async.cpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
 //
 // SPDX-License-Identifier: Apache-2.0
 
@@ -28,7 +28,6 @@ ttnn::Tensor ExecuteAllGatherAsync::invoke(
         create_semaphore_handles);
 }
 
-
 ttnn::Tensor ExecuteAllGatherAsync::invoke(
     const ttnn::Tensor& input_tensor,
     const int32_t dim,
@@ -39,8 +38,7 @@ ttnn::Tensor ExecuteAllGatherAsync::invoke(
     const std::optional<size_t> num_preferred_links,
     std::optional<SubDeviceId> subdevice_id,
     bool enable_persistent_fabric_mode,
-    bool create_semaphore_handles) { 
-
+    bool create_semaphore_handles) {
     return ttnn::operations::experimental::ccl::all_gather_async(
         input_tensor,
         dim,
diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async.hpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async.hpp
index 4c8ce03b34d..26f39484078 100644
--- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async.hpp
+++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async.hpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
 //
 // SPDX-License-Identifier: Apache-2.0
 
@@ -20,7 +20,7 @@ struct ExecuteAllGatherAsync {
         std::optional<SubDeviceId> subdevice_id = std::nullopt,
         bool enable_persistent_fabric_mode = false,
         bool create_semaphore_handles = true);
-        
+
     static ttnn::Tensor invoke(
         const ttnn::Tensor& input_tensor,
         const int32_t dim,
diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async_pybind.cpp
index b3c5127d87e..fe39f848d98 100644
--- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async_pybind.cpp
+++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async_pybind.cpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
 //
 // SPDX-License-Identifier: Apache-2.0
 
diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async_pybind.hpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async_pybind.hpp
index 40dde33bc7a..29bd4ff9f18 100644
--- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async_pybind.hpp
+++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async_pybind.hpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
 //
 // SPDX-License-Identifier: Apache-2.0
 
diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.cpp
index e739be68ff5..15dcfee3671 100644
--- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.cpp
+++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.cpp
@@ -1,4 +1,4 @@
-/// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+/// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
 //
 // SPDX-License-Identifier: Apache-2.0
 
@@ -60,8 +60,8 @@ AllGatherAsync create_all_gather_async_struct(
 }
 
 std::optional<std::vector<std::shared_ptr<GlobalSemaphore>>> get_global_semaphores(
-    std::vector<Device*> const& devices, 
-    CoreRange const& core_range,
+    const std::vector<Device*>& devices,
+    const CoreRange& core_range,
     std::optional<SubDeviceId> subdevice_id,
     bool create_semaphore_handles) {
     std::optional<std::vector<std::shared_ptr<GlobalSemaphore>>> semaphore_handles_opt;
@@ -91,7 +91,6 @@ std::optional<std::vector<std::shared_ptr<GlobalSemaphore>>> get_global_semaphor
     return semaphore_handles_opt;
 }
 
-
 }  // namespace all_gather_detail
 }  // namespace ccl
 
@@ -119,6 +118,22 @@ void AllGatherAsync::validate(const std::vector<Tensor>& input_tensors) const {
         input_tensor.memory_config().memory_layout);
 }
 
+static void validate_output_tensor_allocation(const std::vector<Tensor>& output_tensors) {
+    for (const auto& output_tensor : output_tensors) {
+        const auto& buffers = output_tensor.buffers();
+        const auto first_address = buffers.front()->address();
+        TT_FATAL(
+            std::all_of(
+                buffers.begin(),
+                buffers.end(),
+                [&first_address](const auto& buffer) {
+                    return buffer != nullptr && buffer->address() == first_address;
+                }),
+            "Output buffers for all_gather async must be lock-step allocated but some of the tensors were allocated at "
+            "different addresses across devices.");
+    }
+}
+
 std::vector<ttnn::SimpleShape> AllGatherAsync::compute_output_shapes(const std::vector<Tensor>& input_tensors) const {
     auto shape = input_tensors[0].get_padded_shape();  // TODO: Replace with get_logical_shape()
     shape[this->dim] *= this->ring_size;
@@ -204,11 +219,8 @@ Tensor all_gather_async(
     CoreCoord grid_size = devices[0]->compute_with_storage_grid_size();
     auto core_grid = CoreRange({0, 0}, {grid_size.x - 1, grid_size.y - 1});
 
-    std::optional<std::vector<std::shared_ptr<GlobalSemaphore>>> semaphore_handles_opt = ttnn::ccl::all_gather_detail::get_global_semaphores(
-        devices, 
-        core_grid,
-        subdevice_id,
-        create_semaphore_handles);
+    std::optional<std::vector<std::shared_ptr<GlobalSemaphore>>> semaphore_handles_opt =
+        ttnn::ccl::all_gather_detail::get_global_semaphores(devices, core_grid, subdevice_id, create_semaphore_handles);
 
     operation::launch_op(
         [dim,
@@ -257,7 +269,7 @@ Tensor all_gather_async(
         "This all_gather API with cluster_axis is currently supported only for the Linear topology");
     const auto mesh_view = mesh_device.get_view();
     auto devices = input_tensor.get_workers();
-    std::size_t num_devices = (cluster_axis == 0) ? mesh_view->num_rows() : mesh_view->num_cols();
+    std::size_t num_devices = (cluster_axis == 0) ? mesh_view.num_rows() : mesh_view.num_cols();
 
     int32_t rank = input_tensor.get_logical_shape().rank();
 
@@ -273,11 +285,8 @@ Tensor all_gather_async(
     std::vector<Tensor> output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))};
     CoreCoord grid_size = devices[0]->compute_with_storage_grid_size();
     auto core_grid = CoreRange({0, 0}, {grid_size.x - 1, grid_size.y - 1});
-    std::optional<std::vector<std::shared_ptr<GlobalSemaphore>>> semaphore_handles_opt = ttnn::ccl::all_gather_detail::get_global_semaphores(
-        devices, 
-        core_grid,
-        subdevice_id,
-        create_semaphore_handles);
+    std::optional<std::vector<std::shared_ptr<GlobalSemaphore>>> semaphore_handles_opt =
+        ttnn::ccl::all_gather_detail::get_global_semaphores(devices, core_grid, subdevice_id, create_semaphore_handles);
 
     operation::launch_op(
         [gather_dim,
@@ -293,12 +302,11 @@ Tensor all_gather_async(
             const std::vector<std::optional<const Tensor>>& optional_input_tensors,
             const std::vector<std::optional<Tensor>>& optional_output_tensors) mutable -> std::vector<Tensor> {
             const auto& input_device_tensor = input_tensors.at(0);
-            
-            const auto coordinate = mesh_view->find_device(input_device_tensor.device()->id());
-            std::vector<Device*> devices = (cluster_axis == 0)
-                                           ? mesh_view->get_devices_on_column(coordinate.col)
-                                           : mesh_view->get_devices_on_row(coordinate.row);
-      
+
+            const auto coordinate = mesh_view.find_device(input_device_tensor.device()->id());
+            std::vector<Device*> devices = (cluster_axis == 0) ? mesh_view.get_devices_on_column(coordinate.col)
+                                                               : mesh_view.get_devices_on_row(coordinate.row);
+
             const auto& input_tensor = input_tensors.at(0);
 
             return operation::run(
diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.hpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.hpp
index 9a96d1214b4..b5bc4095f2f 100644
--- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.hpp
+++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.hpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
 //
 // SPDX-License-Identifier: Apache-2.0
 
diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_program.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_program.cpp
index 673118aee20..dc83794cdb8 100644
--- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_program.cpp
+++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_program.cpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
 //
 // SPDX-License-Identifier: Apache-2.0
 ///
@@ -172,9 +172,6 @@ operation::ProgramWithCallbacks all_gather_async_multi_core_with_workers(
     std::vector<Tensor> input_tensors = {input_tensor};
     std::vector<Tensor> output_tensors = {output_tensor};
     const auto& op_config = ttnn::ccl::CCLOpConfig(input_tensors, output_tensors, topology);
-    const auto& input_tensor_partition = ttnn::ccl::TensorPartition(1, 0);  // one partition, 0 index
-    const auto& output_tensor_partition =
-        ttnn::ccl::TensorPartition(ring_size, ring_index);  // ring_size partitions, ring_index index
 
     // Get worker cores, assuming 1 worker per link
     uint32_t num_workers_per_link = 1;
diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.cpp
index 8c00aca54c8..27053e75455 100644
--- a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.cpp
+++ b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.cpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
 //
 // SPDX-License-Identifier: Apache-2.0
 
diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.hpp b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.hpp
index 5ac1b0cd1a5..5d60c264b2a 100644
--- a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.hpp
+++ b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.hpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
 //
 // SPDX-License-Identifier: Apache-2.0
 
diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter.cpp
index a9e623d7634..39953e0d8a1 100644
--- a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter.cpp
+++ b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter.cpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
 //
 // SPDX-License-Identifier: Apache-2.0
 
diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter.hpp b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter.hpp
index d3d8dcd512a..8c42952c8b9 100644
--- a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter.hpp
+++ b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter.hpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
 //
 // SPDX-License-Identifier: Apache-2.0
 
diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter_pybind.cpp
index aef9877e4f9..e87c6738f49 100644
--- a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter_pybind.cpp
+++ b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter_pybind.cpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
 //
 // SPDX-License-Identifier: Apache-2.0
 
diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter_pybind.hpp b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter_pybind.hpp
index c0306ba0e18..8a29e0c4e39 100644
--- a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter_pybind.hpp
+++ b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter_pybind.hpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
 //
 // SPDX-License-Identifier: Apache-2.0