From 20416bdcbb306705b5619ee9f98f294f297910ce Mon Sep 17 00:00:00 2001 From: Sean Nijjar Date: Sat, 21 Dec 2024 04:04:53 +0000 Subject: [PATCH] updates/fixes after rebase - also apply PR feedback --- ...c_erisc_datamover_sender_worker_sender.cpp | 18 --- .../fabric_worker_sender_multi_input.cpp | 20 +-- ...erisc_data_mover_loopback_with_workers.cpp | 147 ++++++++---------- .../ccl/common/host/ccl_worker_builder.cpp | 11 +- .../ccl/common/host/ccl_worker_builder.hpp | 12 +- .../kernels/ccl_send_reader_two_input.cpp | 1 - .../ccl/erisc_datamover_builder.cpp | 67 +------- .../ccl/all_gather_async/all_gather_async.cpp | 6 +- .../ccl/all_gather_async/all_gather_async.hpp | 4 +- .../all_gather_async_pybind.cpp | 2 +- .../all_gather_async_pybind.hpp | 2 +- .../device/all_gather_async_op.cpp | 50 +++--- .../device/all_gather_async_op.hpp | 2 +- .../device/all_gather_async_program.cpp | 5 +- .../device/reduce_scatter_async_op.cpp | 2 +- .../device/reduce_scatter_async_op.hpp | 2 +- .../reduce_scatter_async/reduce_scatter.cpp | 2 +- .../reduce_scatter_async/reduce_scatter.hpp | 2 +- .../reduce_scatter_pybind.cpp | 2 +- .../reduce_scatter_pybind.hpp | 2 +- 20 files changed, 130 insertions(+), 229 deletions(-) diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp index 50616bf25c9..39380695040 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp @@ -64,8 +64,6 @@ void kernel_main() { auto worker_buffer_index_semaphore_addr = get_semaphore(get_arg_val(arg_idx++)); bool connected_to_persistent_fabric = get_arg_val(arg_idx++) != 0; - DPRINT << "worker_buffer_index_semaphore_addr: " << (uint32_t)worker_buffer_index_semaphore_addr << "\n"; - DPRINT << "connected_to_persistent_fabric: " << (uint32_t)connected_to_persistent_fabric << "\n"; // TODO: move to semaphore auto edm_buffer_index_sem_id = get_arg_val(arg_idx++); ASSERT(edm_buffer_index_sem_id < 8); @@ -80,7 +78,6 @@ void kernel_main() { } else { config.unicast.distance = static_cast(get_arg_val(arg_idx++)); } - DPRINT << "config.unicast.distance: " << (uint32_t)config.unicast.distance << "\n"; const InterleavedAddrGen dest_addr_gen = { .bank_base_address = dest_addr, .page_size = page_size}; @@ -101,9 +98,7 @@ void kernel_main() { writer_send_sem_addr, worker_buffer_index_semaphore_addr); - DPRINT << "sender open\n"; sender.open(); - DPRINT << "opened\n"; constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; @@ -116,10 +111,8 @@ void kernel_main() { uint32_t buffer_index = 0; cb_wait_front(cb_id_in0, 1); auto a_packet_header_addr = get_read_ptr(cb_id_in0); - DPRINT << "total_pages_to_send: " << (uint32_t)total_pages_to_send << "\n"; for (uint32_t p = 0; p < total_pages_to_send; p += num_pages_per_send) { uint32_t pages_to_send = std::min(num_pages_per_send, total_pages_to_send - p); - DPRINT << "wait empty write slot\n"; sender.wait_for_empty_write_slot(); cb_wait_front(cb_id_in0, pages_to_send); @@ -156,22 +149,15 @@ void kernel_main() { uint64_t buffer_address = sender.edm_buffer_addr + (*sender.buffer_index_ptr * (sender.buffer_size_bytes + sizeof(eth_channel_sync_t))); sender.send_payload_blocking_from_address(packet_addr, packet_size); - DPRINT << "noc write barrier\n"; noc_async_writes_flushed(); - DPRINT << "cb pop front\n"; cb_pop_front(cb_id_in0, pages_to_send); - DPRINT << "cb pop front done\n"; } - DPRINT << "DONE MAIN LOOP\n"; if constexpr (!mcast_mode) { - DPRINT << "TEARDOWN\n"; sender.wait_for_empty_write_slot(); auto& packet_header = *reinterpret_cast(a_packet_header_addr); ASSERT(*last_message_semaphore_address == 0); - packet_header.reserved = 0xE; - packet_header.reserved2 = 0xFFFF; packet_header.to_atomic_inc(); packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{2}); packet_header.to_noc_unicast_atomic_inc(tt::fabric::NocUnicastAtomicIncCommandHeader( @@ -180,16 +166,12 @@ void kernel_main() { sender.send_payload_blocking_from_address( a_packet_header_addr, packet_header.get_payload_size_including_header()); - DPRINT << "WAITING FOR COMPLETION LOOPBACK @: " << (uint32_t)last_message_semaphore_address << "\n"; noc_semaphore_wait(last_message_semaphore_address, 1); - DPRINT << "\t got it!\n"; } bool closed_fabric_connection = terminate_fabric_endpoints_farthest_to_nearest(sender, a_packet_header_addr, arg_idx); if (!closed_fabric_connection) { - DPRINT << "CLOSE\n"; sender.close(); } - DPRINT << "DONE\n"; } diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp index 5697ea5b307..f699132dbca 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp @@ -8,6 +8,7 @@ #include "ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp" #include "ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp" #include "tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp" +#include "ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp" struct unicast_mode { uint8_t distance; @@ -51,9 +52,7 @@ auto forward_to_fabric_from_cb( // bit of a hack to extract X/Y const auto dest_noc_address = get_noc_addr(current_page, dest_addr_gen, 0, NORMALIZED_NOC_INDEX); - const size_t dest_addr = dest_noc_address & 0xFFFFFFFF; - const size_t dest_noc_x = (dest_noc_address >> NOC_ADDR_LOCAL_BITS) & ((1 << NOC_ADDR_NODE_ID_BITS) - 1); - const size_t dest_noc_y = (dest_noc_address >> (NOC_ADDR_LOCAL_BITS + NOC_ADDR_NODE_ID_BITS)) & ((1 << NOC_ADDR_NODE_ID_BITS) - 1); + const auto [dest_worker_noc, dest_addr] = get_noc_address_components(dest_noc_address); const size_t packet_size = page_size + sizeof(tt::fabric::PacketHeader); auto packet_addr = get_read_ptr(cb_id); @@ -64,20 +63,16 @@ auto forward_to_fabric_from_cb( .to_noc_unicast(tt::fabric::NocUnicastCommandHeader{ dest_addr, (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader), - static_cast(dest_noc_x), - static_cast(dest_noc_y) - }); - packet_header.reserved2 = 0x1111; // debug only + static_cast(dest_worker_noc.x), + static_cast(dest_worker_noc.y)}); } else { packet_header.to_write() .to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{config.unicast.distance}) .to_noc_unicast(tt::fabric::NocUnicastCommandHeader{ dest_addr, (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader), - static_cast(dest_noc_x), - static_cast(dest_noc_y) - }); - packet_header.reserved2 = 0x1111; // debug only + static_cast(dest_worker_noc.x), + static_cast(dest_worker_noc.y)}); } uint64_t buffer_address = sender.edm_buffer_addr + (*sender.buffer_index_ptr * (sender.buffer_size_bytes + sizeof(eth_channel_sync_t))); @@ -196,12 +191,13 @@ void kernel_main() { sender.wait_for_empty_write_slot(); + constexpr size_t kLoopbackNumHopsToMyChip = 2; auto &packet_header = *reinterpret_cast(a_packet_header_addr); ASSERT(*last_message_semaphore_address == 0); packet_header.reserved = 0xE; packet_header.reserved2 = 0xFFFF; packet_header.to_atomic_inc(); - packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{2}); + packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{kLoopbackNumHopsToMyChip}); packet_header.to_noc_unicast_atomic_inc(tt::fabric::NocUnicastAtomicIncCommandHeader( reinterpret_cast(last_message_semaphore_address), 1, diff --git a/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp index 834f71d6f06..13e2166bae6 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp @@ -68,8 +68,7 @@ class T3000TestDevice { arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); num_devices_ = tt::tt_metal::GetNumAvailableDevices(); - if (arch_ == tt::ARCH::WORMHOLE_B0 and tt::tt_metal::GetNumAvailableDevices() == 8 and - tt::tt_metal::GetNumPCIeDevices() == 4) { + if (arch_ == tt::ARCH::WORMHOLE_B0 and num_devices_ == 8 and tt::tt_metal::GetNumPCIeDevices() == 4) { mesh_device_ = MeshDevice::create(MeshDeviceConfig(MeshShape{2, 4})); std::vector ids(num_devices_, 0); @@ -1128,7 +1127,7 @@ int TestLineFabricEntrypoint( auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); auto num_devices = tt::tt_metal::GetNumAvailableDevices(); if (num_devices < 4) { - log_info("This test can only be run on N300 devices"); + log_info("This test can only be run on T3000 devices"); return 0; } if (arch == tt::ARCH::GRAYSKULL) { @@ -1141,7 +1140,7 @@ int TestLineFabricEntrypoint( // build a line of devices std::vector devices = { - view->get_device(0, 0), view->get_device(0, 1), view->get_device(0, 2), view->get_device(0, 3)}; + view.get_device(0, 0), view.get_device(0, 1), view.get_device(0, 2), view.get_device(0, 3)}; std::vector programs(enable_persistent_fabric ? 1 : devices.size()); std::optional subdevice_managers = std::nullopt; std::optional> fabric_programs; @@ -1211,7 +1210,7 @@ int TestLoopbackEntrypoint( auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); auto num_devices = tt::tt_metal::GetNumAvailableDevices(); if (num_devices < 4) { - log_info("This test can only be run on N300 devices"); + log_info("This test can only be run on T3000 devices"); return 0; } if (arch == tt::ARCH::GRAYSKULL) { @@ -1222,8 +1221,8 @@ int TestLoopbackEntrypoint( T3000TestDevice test_fixture; auto view = test_fixture.mesh_device_->get_view(); - const auto& device_0 = view->get_device(0, 0); - const auto& device_1 = view->get_device(0, 1); + const auto& device_0 = view.get_device(0, 0); + const auto& device_1 = view.get_device(0, 1); auto const& active_eth_cores = device_0->get_active_ethernet_cores(true); auto eth_sender_core_iter = active_eth_cores.begin(); @@ -1385,7 +1384,7 @@ bool TestMultiInputReaderKernel( auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); auto num_devices = tt::tt_metal::GetNumAvailableDevices(); if (num_devices < 4) { - log_info("This test can only be run on N300 devices"); + log_info("This test can only be run on T3000 devices"); return true; } if (arch == tt::ARCH::GRAYSKULL) { @@ -1403,7 +1402,7 @@ bool TestMultiInputReaderKernel( std::vector devices; devices.reserve(fabric_num_devices); for (size_t i = 0; i < fabric_num_devices; i++) { - devices.push_back(view->get_device(0, i)); + devices.push_back(view.get_device(0, i)); } std::vector programs(enable_persistent_fabric ? 1 : devices.size()); @@ -1702,23 +1701,15 @@ bool RunMultiInputReaderTestPropagateFullTensorIn( Tensor output_tensor0 = ttnn::ones(tensor_shape, DataType::UINT32, layout).reshape(tensor_shape); Tensor output_tensor1 = ttnn::ones(tensor_shape, DataType::UINT32, layout).reshape(tensor_shape); input_tensor0.set_tensor_spec(TensorSpec( - logical_shape, - TensorLayout( - /*DataType::BFLOAT16*/ DataType::UINT32, PageConfig(layout, tt_metal::Tile()), in0_memory_config))); + logical_shape, TensorLayout(DataType::UINT32, PageConfig(layout, tt_metal::Tile()), in0_memory_config))); input_tensor1.set_tensor_spec(TensorSpec( - logical_shape, - TensorLayout( - /*DataType::BFLOAT16*/ DataType::UINT32, PageConfig(layout, tt_metal::Tile()), in1_memory_config))); + logical_shape, TensorLayout(DataType::UINT32, PageConfig(layout, tt_metal::Tile()), in1_memory_config))); output_tensor0.set_tensor_spec(TensorSpec( - logical_shape, - TensorLayout( - /*DataType::BFLOAT16*/ DataType::UINT32, PageConfig(layout, tt_metal::Tile()), out0_memory_config))); + logical_shape, TensorLayout(DataType::UINT32, PageConfig(layout, tt_metal::Tile()), out0_memory_config))); output_tensor1.set_tensor_spec(TensorSpec( - logical_shape, - TensorLayout( - /*DataType::BFLOAT16*/ DataType::UINT32, PageConfig(layout, tt_metal::Tile()), out1_memory_config))); + logical_shape, TensorLayout(DataType::UINT32, PageConfig(layout, tt_metal::Tile()), out1_memory_config))); - size_t page_size = tile_size(DataFormat::RawUInt32); // DataFormat::Float16);// DataFormat::RawUInt32); + size_t page_size = tile_size(DataFormat::RawUInt32); ttnn::ccl::Shape4D tensor_shape_in_pages = shape_to_shape_in_tiles(tensor_shape); ttnn::ccl::Shape4D tensor_slice_shape_in_pages = tensor_shape_in_pages; @@ -1726,12 +1717,12 @@ bool RunMultiInputReaderTestPropagateFullTensorIn( ttnn::ccl::Shape4D worker_slice_shape = tensor_shape_in_pages; ttnn::ccl::Shape4D worker_slice_offset = {0, 0, 0, 0}; - ttnn::ccl::v2::TensorSlice tensor_slice{// using ords_t = Shape4D; - tensor_shape_in_pages, - tensor_slice_shape_in_pages, - tensor_slice_offset, - worker_slice_shape, - worker_slice_offset}; + ttnn::ccl::v2::TensorSlice tensor_slice{ + tensor_shape_in_pages, + tensor_slice_shape_in_pages, + tensor_slice_offset, + worker_slice_shape, + worker_slice_offset}; auto const in0_tensor_slice = tensor_slice; auto const in1_tensor_slice = tensor_slice; @@ -1755,7 +1746,7 @@ bool RunMultiInputReaderTestPropagateFullTensorIn( out1_tensor_slice, page_size, - test_writeback_mode, // TwoInputReaderKernelWriteMode::LOCAL_WRITEBACK, + test_writeback_mode, ttnn::ccl::cmd::LocalOnlyCommandDestArgs{}, false); @@ -1978,7 +1969,7 @@ TEST(WorkerCclCommandProcessingKernelLocalMode, MultiInputReader_MultiPage0_Shar // Copying even slightly large tensors exposes issues in underlying tensor code // that isn't under test here TEST(WorkerCclCommandProcessingKernelLocalMode, MultiInputReader_MultiPage1) { - ttnn::Shape tensor_shape = {1, 1, 256, 256}; // 16384 crashes... TBD + ttnn::Shape tensor_shape = {1, 1, 256, 256}; auto pass = RunMultiInputReaderTestPropagateFullTensorIn( tensor_shape, Layout::TILE, @@ -1993,8 +1984,6 @@ TEST(WorkerCclCommandProcessingKernelLocalMode, MultiInputReader_MultiPage1) { // TODO: update the test infra to be able to properly compare tensors if we are only // doing a slice of the larger tensor -// EnablePersistentKernelCache - // //////////////////////////////////////////////////////////////////// // //////////////////////////////////////////////////////////////////// // //// FABRIC UNICAST TENSOR WRITE (2 INPUT) @@ -2002,7 +1991,7 @@ TEST(WorkerCclCommandProcessingKernelLocalMode, MultiInputReader_MultiPage1) { // //////////////////////////////////////////////////////////////////// TEST(WorkerCclCommandProcessingKernelFabricUnicastMode, MultiInputReader_SinglePageTile_OneHop) { - ttnn::Shape tensor_shape = {1, 1, 32, 32}; // 16384 crashes... TBD + ttnn::Shape tensor_shape = {1, 1, 32, 32}; constexpr size_t distance_dest_device = 1; constexpr size_t num_devices = 4; auto logical_shape = tensor_shape.logical_shape(); @@ -2035,12 +2024,12 @@ TEST(WorkerCclCommandProcessingKernelFabricUnicastMode, MultiInputReader_SingleP ttnn::ccl::Shape4D worker_slice_shape = tensor_shape_in_pages; ttnn::ccl::Shape4D worker_slice_offset = {0, 0, 0, 0}; - ttnn::ccl::v2::TensorSlice tensor_slice{// using ords_t = Shape4D; - tensor_shape_in_pages, - tensor_slice_shape_in_pages, - tensor_slice_offset, - worker_slice_shape, - worker_slice_offset}; + ttnn::ccl::v2::TensorSlice tensor_slice{ + tensor_shape_in_pages, + tensor_slice_shape_in_pages, + tensor_slice_offset, + worker_slice_shape, + worker_slice_offset}; auto const in0_tensor_slice = tensor_slice; auto const in1_tensor_slice = tensor_slice; @@ -2073,7 +2062,7 @@ TEST(WorkerCclCommandProcessingKernelFabricUnicastMode, MultiInputReader_SingleP } TEST(WorkerCclCommandProcessingKernelFabricUnicastMode, MultiInputReader_SinglePageTile_OneHop_PersistentFabric) { - ttnn::Shape tensor_shape = {1, 1, 32, 32}; // 16384 crashes... TBD + ttnn::Shape tensor_shape = {1, 1, 32, 32}; constexpr size_t distance_dest_device = 1; constexpr size_t num_devices = 4; auto logical_shape = tensor_shape.logical_shape(); @@ -2106,12 +2095,12 @@ TEST(WorkerCclCommandProcessingKernelFabricUnicastMode, MultiInputReader_SingleP ttnn::ccl::Shape4D worker_slice_shape = tensor_shape_in_pages; ttnn::ccl::Shape4D worker_slice_offset = {0, 0, 0, 0}; - ttnn::ccl::v2::TensorSlice tensor_slice{// using ords_t = Shape4D; - tensor_shape_in_pages, - tensor_slice_shape_in_pages, - tensor_slice_offset, - worker_slice_shape, - worker_slice_offset}; + ttnn::ccl::v2::TensorSlice tensor_slice{ + tensor_shape_in_pages, + tensor_slice_shape_in_pages, + tensor_slice_offset, + worker_slice_shape, + worker_slice_offset}; auto const in0_tensor_slice = tensor_slice; auto const in1_tensor_slice = tensor_slice; @@ -2184,12 +2173,12 @@ void RunFabricMcastFullTensorPropagateTest( ttnn::ccl::Shape4D worker_slice_shape = tensor_shape_in_pages; ttnn::ccl::Shape4D worker_slice_offset = {0, 0, 0, 0}; - ttnn::ccl::v2::TensorSlice tensor_slice{// using ords_t = Shape4D; - tensor_shape_in_pages, - tensor_slice_shape_in_pages, - tensor_slice_offset, - worker_slice_shape, - worker_slice_offset}; + ttnn::ccl::v2::TensorSlice tensor_slice{ + tensor_shape_in_pages, + tensor_slice_shape_in_pages, + tensor_slice_offset, + worker_slice_shape, + worker_slice_offset}; auto const in0_tensor_slice = tensor_slice; auto const in1_tensor_slice = tensor_slice; @@ -2222,38 +2211,38 @@ void RunFabricMcastFullTensorPropagateTest( } TEST(WorkerCclCommandProcessingKernelFabricMulticastMode, MultiInputReader_SinglePageTile_SingleHop) { - ttnn::Shape tensor_shape = {1, 1, 32, 32}; // 16384 crashes... TBD + ttnn::Shape tensor_shape = {1, 1, 32, 32}; constexpr size_t distance_dest_device = 1; constexpr size_t num_devices = 4; RunFabricMcastFullTensorPropagateTest(tensor_shape, distance_dest_device, num_devices, false); } TEST(WorkerCclCommandProcessingKernelFabricMulticastMode, MultiInputReader_SinglePageTile_TwoHop) { - ttnn::Shape tensor_shape = {1, 1, 32, 32}; // 16384 crashes... TBD + ttnn::Shape tensor_shape = {1, 1, 32, 32}; constexpr size_t distance_dest_device = 2; constexpr size_t num_devices = 4; RunFabricMcastFullTensorPropagateTest(tensor_shape, distance_dest_device, num_devices, false); } TEST(WorkerCclCommandProcessingKernelFabricMulticastMode, MultiInputReader_SinglePageTile_ThreeHop) { - ttnn::Shape tensor_shape = {1, 1, 32, 32}; // 16384 crashes... TBD + ttnn::Shape tensor_shape = {1, 1, 32, 32}; constexpr size_t distance_dest_device = 3; constexpr size_t num_devices = 4; RunFabricMcastFullTensorPropagateTest(tensor_shape, distance_dest_device, num_devices, false); } TEST(WorkerCclCommandProcessingKernelFabricMulticastMode, MultiInputReader_4PageTile_SingleHop) { - ttnn::Shape tensor_shape = {1, 1, 32, 128}; // 16384 crashes... TBD + ttnn::Shape tensor_shape = {1, 1, 32, 128}; constexpr size_t distance_dest_device = 1; constexpr size_t num_devices = 4; RunFabricMcastFullTensorPropagateTest(tensor_shape, distance_dest_device, num_devices, false); } TEST(WorkerCclCommandProcessingKernelFabricMulticastMode, DMultiInputReader_4PageTile_TwoHop) { - ttnn::Shape tensor_shape = {1, 1, 128, 32}; // 16384 crashes... TBD + ttnn::Shape tensor_shape = {1, 1, 128, 32}; constexpr size_t distance_dest_device = 2; constexpr size_t num_devices = 4; RunFabricMcastFullTensorPropagateTest(tensor_shape, distance_dest_device, num_devices, false); } TEST(WorkerCclCommandProcessingKernelFabricMulticastMode, MultiInputReader_4PageTile_ThreeHop) { - ttnn::Shape tensor_shape = {1, 1, 64, 64}; // 16384 crashes... TBD + ttnn::Shape tensor_shape = {1, 1, 64, 64}; constexpr size_t distance_dest_device = 3; constexpr size_t num_devices = 4; RunFabricMcastFullTensorPropagateTest(tensor_shape, distance_dest_device, num_devices, false); @@ -2266,39 +2255,39 @@ TEST(WorkerCclCommandProcessingKernelFabricMulticastMode, MultiInputReader_lotsP } TEST(WorkerCclCommandProcessingKernelFabricMulticastMode, MultiInputReader_SinglePageTile_SingleHop_PersistentFabric) { - ttnn::Shape tensor_shape = {1, 1, 32, 32}; // 16384 crashes... TBD + ttnn::Shape tensor_shape = {1, 1, 32, 32}; constexpr size_t distance_dest_device = 1; constexpr size_t num_devices = 4; RunFabricMcastFullTensorPropagateTest(tensor_shape, distance_dest_device, num_devices, true); } TEST(WorkerCclCommandProcessingKernelFabricMulticastMode, MultiInputReader_SinglePageTile_TwoHop_PersistentFabric) { - ttnn::Shape tensor_shape = {1, 1, 32, 32}; // 16384 crashes... TBD + ttnn::Shape tensor_shape = {1, 1, 32, 32}; constexpr size_t distance_dest_device = 2; constexpr size_t num_devices = 4; RunFabricMcastFullTensorPropagateTest(tensor_shape, distance_dest_device, num_devices, true); } TEST(WorkerCclCommandProcessingKernelFabricMulticastMode, MultiInputReader_SinglePageTile_ThreeHop_PersistentFabric) { - ttnn::Shape tensor_shape = {1, 1, 32, 32}; // 16384 crashes... TBD + ttnn::Shape tensor_shape = {1, 1, 32, 32}; constexpr size_t distance_dest_device = 3; constexpr size_t num_devices = 4; RunFabricMcastFullTensorPropagateTest(tensor_shape, distance_dest_device, num_devices, true); } TEST(WorkerCclCommandProcessingKernelFabricMulticastMode, MultiInputReader_4PageTile_SingleHop_PersistentFabric) { - ttnn::Shape tensor_shape = {1, 1, 32, 128}; // 16384 crashes... TBD + ttnn::Shape tensor_shape = {1, 1, 32, 128}; constexpr size_t distance_dest_device = 1; constexpr size_t num_devices = 4; RunFabricMcastFullTensorPropagateTest(tensor_shape, distance_dest_device, num_devices, true); } TEST(WorkerCclCommandProcessingKernelFabricMulticastMode, DMultiInputReader_4PageTile_TwoHop_PersistentFabric) { - ttnn::Shape tensor_shape = {1, 1, 128, 32}; // 16384 crashes... TBD + ttnn::Shape tensor_shape = {1, 1, 128, 32}; constexpr size_t distance_dest_device = 2; constexpr size_t num_devices = 4; RunFabricMcastFullTensorPropagateTest(tensor_shape, distance_dest_device, num_devices, true); } TEST(WorkerCclCommandProcessingKernelFabricMulticastMode, MultiInputReader_4PageTile_ThreeHop_PersistentFabric) { - ttnn::Shape tensor_shape = {1, 1, 64, 64}; // 16384 crashes... TBD + ttnn::Shape tensor_shape = {1, 1, 64, 64}; constexpr size_t distance_dest_device = 3; constexpr size_t num_devices = 4; RunFabricMcastFullTensorPropagateTest(tensor_shape, distance_dest_device, num_devices, true); @@ -2330,7 +2319,7 @@ bool RunPipelinedWorkersTest( auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); auto num_devices = tt::tt_metal::GetNumAvailableDevices(); if (num_devices < 4) { - log_info("This test can only be run on N300 devices"); + log_info("This test can only be run on T3000 devices"); return true; } if (arch == tt::ARCH::GRAYSKULL) { @@ -2347,7 +2336,7 @@ bool RunPipelinedWorkersTest( T3000TestDevice test_fixture; auto view = test_fixture.mesh_device_->get_view(); - Device* device = view->get_device(0, 0); + Device* device = view.get_device(0, 0); ; // General setup is as follows: @@ -2357,7 +2346,6 @@ bool RunPipelinedWorkersTest( // HOWEVER. the reader will be programmed to read the chunks in a different order than they were written, this way // we can identify synchronization related bugs (e.g. if sender semaphore increments before writes flush) - ///// PROMOTE EVERYTHING ABOVE HERE TO ARGUMENTS TO THIS CODE WHICH WILL EVENTUALLY BECOME A FUNCTION TT_FATAL(num_workers_per_stage.size() == num_stages, "Must have a read order for each stage"); TT_FATAL(worker_chunk_read_order.size() == num_stages, "Must have a read order for each stage"); for (size_t i = 0; i < num_stages; ++i) { @@ -2559,7 +2547,7 @@ bool RunPipelinedWorkersTest( {&device_tensors[stage]}, {page_size_bytes}, device, - cb_packet_size_in_pages, // TODO: get from fabric + cb_packet_size_in_pages, {worker_cores.at(worker)}, reader_cmd_stream, std::nullopt, @@ -2571,7 +2559,7 @@ bool RunPipelinedWorkersTest( {&device_tensors[stage + 1]}, {page_size_bytes}, device, - cb_packet_size_in_pages, // TODO: get from fabric + cb_packet_size_in_pages, {worker_cores.at(worker)}, writer_cmd_stream, std::nullopt, @@ -2865,8 +2853,6 @@ TEST( #include "ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.hpp" #include "tt_metal/common/bfloat16.hpp" TEST(CclAsyncOp, ReduceScatterSmall_PersistentFabric) { - // JUST MAKE SURE IT DOESN'T HANG - CORRECTNESS CHECKS ARE DONE IN PYTEST - // HERE WE JUST CHECK THE BASIC PLUMBING IS DONE const size_t dim = 3; const size_t num_links = 1; constexpr auto layout = Layout::TILE; @@ -2874,7 +2860,7 @@ TEST(CclAsyncOp, ReduceScatterSmall_PersistentFabric) { auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); constexpr size_t test_expected_num_devices = 4; if (tt::tt_metal::GetNumAvailableDevices() < test_expected_num_devices) { - log_info("This test can only be run on N300 devices"); + log_info("This test can only be run on T3000 devices"); return; } if (arch == tt::ARCH::GRAYSKULL) { @@ -2886,7 +2872,7 @@ TEST(CclAsyncOp, ReduceScatterSmall_PersistentFabric) { // build a line of devices std::vector devices = { - view->get_device(0, 1), view->get_device(1, 1), view->get_device(1, 2), view->get_device(0, 2)}; + view.get_device(0, 1), view.get_device(1, 1), view.get_device(1, 2), view.get_device(0, 2)}; const size_t num_devices = devices.size(); TT_FATAL( test_expected_num_devices == num_devices, @@ -2911,8 +2897,7 @@ TEST(CclAsyncOp, ReduceScatterSmall_PersistentFabric) { device_input_tensors.push_back(t.to(devices[i])); } // Need to make it a mesh tensor for use with the op - const Tensor input_mesh_tensor = - ttnn::distributed::api::aggregate_as_tensor(device_input_tensors, AllGatherTensor{}); + const Tensor input_mesh_tensor = ttnn::distributed::aggregate_as_tensor(device_input_tensors, AllGatherTensor{}); // FABRIC setup const bool enable_persistent_fabric = true; @@ -2969,7 +2954,7 @@ void run_all_gather_with_persistent_fabric(const size_t dim, const size_t num_li auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); constexpr size_t test_expected_num_devices = 4; if (tt::tt_metal::GetNumAvailableDevices() < test_expected_num_devices) { - log_info("This test can only be run on N300 devices"); + log_info("This test can only be run on T3000 devices"); return; } if (arch == tt::ARCH::GRAYSKULL) { @@ -2981,7 +2966,7 @@ void run_all_gather_with_persistent_fabric(const size_t dim, const size_t num_li // build a line of devices std::vector devices = { - view->get_device(0, 0), view->get_device(0, 1), view->get_device(0, 2), view->get_device(0, 3)}; + view.get_device(0, 0), view.get_device(0, 1), view.get_device(0, 2), view.get_device(0, 3)}; const size_t num_devices = devices.size(); TT_FATAL( test_expected_num_devices == num_devices, @@ -3004,8 +2989,7 @@ void run_all_gather_with_persistent_fabric(const size_t dim, const size_t num_li device_input_tensors.push_back(t.to(devices[i])); } // Need to make it a mesh tensor for use with the op - const Tensor input_mesh_tensor = - ttnn::distributed::api::aggregate_as_tensor(device_input_tensors, AllGatherTensor{}); + const Tensor input_mesh_tensor = ttnn::distributed::aggregate_as_tensor(device_input_tensors, AllGatherTensor{}); // FABRIC setup const bool enable_persistent_fabric = true; @@ -3044,10 +3028,7 @@ void run_all_gather_with_persistent_fabric(const size_t dim, const size_t num_li log_info(tt::LogTest, "Fabric teardown"); persistent_fabric_teardown_sequence( - devices, - subdevice_managers, - fabric_handle.value(), - tt::fabric::TerminationSignal::IMMEDIATELY_TERMINATE); // tt::fabric::TerminationSignal::GRACEFULLY_TERMINATE); + devices, subdevice_managers, fabric_handle.value(), tt::fabric::TerminationSignal::IMMEDIATELY_TERMINATE); log_info(tt::LogTest, "Waiting for teardown completion"); for (auto d : devices) { diff --git a/ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.cpp b/ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.cpp index 330f0e84902..e09ddf81d93 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.cpp @@ -21,9 +21,7 @@ #include #include -namespace ttnn { -namespace ccl { -namespace worker_detail { +namespace ttnn::ccl::worker_detail { CCLWorkerArgBuilder::CCLWorkerArgBuilder( Device const* device, @@ -1068,9 +1066,8 @@ void generate_multi_input_command_stream_kernel_rt_args( ttnn::ccl::emit_address_generator_runtime_args(t->buffer()->device(), *t), std::back_inserter(rt_args)); } - } else { - // Interleaved addrgen passes no additional args - we specify interleaved addrgen as the default } + // else: Interleaved addrgen passes no additional args - we specify interleaved addrgen as the default } rt_args.push_back(forward_fabric_connections.has_value()); @@ -1570,6 +1567,4 @@ std::vector CCLWorkerArgBuilder::generate_sender_writer_kernel_ct_args return args; } -} // namespace worker_detail -} // namespace ccl -} // namespace ttnn +} // namespace ttnn::ccl::worker_detail diff --git a/ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.hpp b/ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.hpp index e3c31986c70..79699816337 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.hpp @@ -13,19 +13,16 @@ #include #include -namespace tt { -namespace tt_metal { +namespace tt::tt_metal { inline namespace v0 { // Forward declarations class Device; } // namespace v0 -} // namespace tt_metal -} // namespace tt +} // namespace tt::tt_metal -namespace ttnn { -namespace ccl { +namespace ttnn::ccl { class WorkerEdmInterfaceArgs; class SenderWorkerAdapterSpec; @@ -172,5 +169,4 @@ struct CCLWorkerArgBuilder { }; } // namespace worker_detail -} // namespace ccl -} // namespace ttnn +} // namespace ttnn::ccl diff --git a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp index 2d1d8b8f9b5..ca6e26a33e0 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp @@ -652,7 +652,6 @@ FORCE_INLINE void try_advance_read_tensor_to_cb(command_context_t& cmd_ contig_pages_advanced = std::min(cmd_ctx.packet_size_in_pages - i, contig_pages_); ASSERT(contig_pages_advanced > 0); ASSERT(contig_pages_advanced <= cmd_ctx.packet_size_in_pages); - ASSERT(cmd_ctx.page_size != 0xD3AD); noc_async_read(noc_addr, l1_write_addr, cmd_ctx.page_size * contig_pages_advanced); } l1_write_addr += cmd_ctx.page_size * contig_pages_advanced; diff --git a/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.cpp b/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.cpp index 9f2ed3a94e3..d954dacb906 100644 --- a/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.cpp @@ -721,65 +721,14 @@ void EdmLineFabricOpInterface::teardown_from_host(tt::fabric::TerminationSignal } void initialize_edm_fabric(distributed::MeshDevice* mesh_device) { - // auto build = [](std::vector const& line_view) { - // std::vector programs(line_view.size()); - // std::vector program_ptrs; - // program_ptrs.reserve(programs.size()); - // std::transform(programs.begin(), programs.end(), std::back_inserter(program_ptrs), [](Program& p) { return &p; }); - // EdmLineFabricOpInterface edm_fabric(line_view, program_ptrs, true); - // edm_fabric.build_kernels(); - // for (size_t i = 0; i < line_view.size(); i++) { - // log_info(tt::LogAlways, "Compile EDM program"); - // Device *device = line_view[i]; - // device->push_work([&](){tt::tt_metal::detail::CompileProgram(line_view[i], programs[i]);}, false); - // } - // for (size_t i = 0; i < line_view.size(); i++) { - // Device *device = line_view[i]; - // Program &program = programs[i]; - // log_info(tt::LogAlways, "Enqueue EDM program"); - // device->push_work([&](){tt::tt_metal::EnqueueProgram(device->command_queue(), program, false);}, true); - - // device->push_work([&](){ - // auto wait_initialized = [&](uint32_t addr) { - // auto wait_core = [&](CoreCoord const& core) { - // bool initialized = false; - // constexpr size_t max_attempts = 10000; - // size_t attempts = 0; - // while (!initialized) { - // auto host_buffer = tt::llrt::read_hex_vec_from_core(device->id(), core, addr, 4); - // initialized = host_buffer[0] == 0; - // attempts++; - // if (attempts > max_attempts) { - // log_error(tt::LogAlways, "Failed to initialize EDM fabric"); - // break; - // } - // } - // log_info(tt::LogAlways,"Initialized EDM fabric"); - // }; - // for (auto const& builder : edm_fabric.edm_builders_backward_direction[device->id()]) { - // wait_core(CoreCoord(builder.my_noc_x, builder.my_noc_y)); - // } - // for (auto const& builder : edm_fabric.edm_builders_forward_direction[device->id()]) { - // wait_core(CoreCoord(builder.my_noc_x, builder.my_noc_y)); - // } - // }; - - // wait_initialized(FabricEriscDatamoverConfig::sender_channel_0_buffer_index_semaphore_address); - // wait_initialized(FabricEriscDatamoverConfig::sender_channel_0_local_flow_control_semaphore_address); - // wait_initialized(FabricEriscDatamoverConfig::sender_channel_0_connection_semaphore_address); - - // }, true); - // } - // log_info(tt::LogAlways, "DONE"); - // }; std::vector row_fabric_lines; - row_fabric_lines.reserve(mesh_device->get_view()->get_row_views().size()); + row_fabric_lines.reserve(mesh_device->get_view().get_row_views().size()); std::vector col_fabric_lines; - col_fabric_lines.reserve(mesh_device->get_view()->get_column_views().size()); + col_fabric_lines.reserve(mesh_device->get_view().get_column_views().size()); - size_t num_rows = mesh_device->get_view()->get_row_views().size(); - size_t num_cols = mesh_device->get_view()->get_column_views().size(); + size_t num_rows = mesh_device->get_view().get_row_views().size(); + size_t num_cols = mesh_device->get_view().get_column_views().size(); std::vector> programs(num_rows); for (size_t r = 0; r < num_rows; r++) { programs[r].resize(num_cols); @@ -789,7 +738,7 @@ void initialize_edm_fabric(distributed::MeshDevice* mesh_device) { std::vector program_ptrs; program_ptrs.reserve(num_cols); std::transform(programs[i].begin(), programs[i].end(), std::back_inserter(program_ptrs), [](Program& p) { return &p; }); - row_fabric_lines.push_back(EdmLineFabricOpInterface(mesh_device->get_view()->get_row_views()[i], program_ptrs, true)); + row_fabric_lines.push_back(EdmLineFabricOpInterface(mesh_device->get_view().get_row_views()[i], program_ptrs, true)); } for (size_t i = 0; i < num_cols; i++) { @@ -798,7 +747,7 @@ void initialize_edm_fabric(distributed::MeshDevice* mesh_device) { for (size_t r = 0; r < num_rows; r++) { program_ptrs.push_back(&programs[r][i]); } - col_fabric_lines.push_back(EdmLineFabricOpInterface(mesh_device->get_view()->get_column_views()[i], program_ptrs, true)); + col_fabric_lines.push_back(EdmLineFabricOpInterface(mesh_device->get_view().get_column_views()[i], program_ptrs, true)); } std::for_each(row_fabric_lines.begin(), row_fabric_lines.end(), [](auto& line) { line.build_kernels(); }); @@ -825,10 +774,10 @@ void teardown_edm_fabric(distributed::MeshDevice* mesh_device) { edm_fabric.teardown_from_host(tt::fabric::TerminationSignal::IMMEDIATELY_TERMINATE); }; - for (auto const &row_view : mesh_device->get_view()->get_row_views()) { + for (auto const &row_view : mesh_device->get_view().get_row_views()) { teardown(row_view); } - for (auto const &col_view : mesh_device->get_view()->get_column_views()) { + for (auto const &col_view : mesh_device->get_view().get_column_views()) { teardown(col_view); } } diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async.cpp index 7afc6f4c4ed..7ce729ed1b7 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC // // SPDX-License-Identifier: Apache-2.0 @@ -28,7 +28,6 @@ ttnn::Tensor ExecuteAllGatherAsync::invoke( create_semaphore_handles); } - ttnn::Tensor ExecuteAllGatherAsync::invoke( const ttnn::Tensor& input_tensor, const int32_t dim, @@ -39,8 +38,7 @@ ttnn::Tensor ExecuteAllGatherAsync::invoke( const std::optional num_preferred_links, std::optional subdevice_id, bool enable_persistent_fabric_mode, - bool create_semaphore_handles) { - + bool create_semaphore_handles) { return ttnn::operations::experimental::ccl::all_gather_async( input_tensor, dim, diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async.hpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async.hpp index 4c8ce03b34d..26f39484078 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC // // SPDX-License-Identifier: Apache-2.0 @@ -20,7 +20,7 @@ struct ExecuteAllGatherAsync { std::optional subdevice_id = std::nullopt, bool enable_persistent_fabric_mode = false, bool create_semaphore_handles = true); - + static ttnn::Tensor invoke( const ttnn::Tensor& input_tensor, const int32_t dim, diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async_pybind.cpp index b3c5127d87e..fe39f848d98 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async_pybind.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC // // SPDX-License-Identifier: Apache-2.0 diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async_pybind.hpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async_pybind.hpp index 40dde33bc7a..29bd4ff9f18 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async_pybind.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC // // SPDX-License-Identifier: Apache-2.0 diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.cpp index e739be68ff5..15dcfee3671 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.cpp @@ -1,4 +1,4 @@ -/// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +/// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC // // SPDX-License-Identifier: Apache-2.0 @@ -60,8 +60,8 @@ AllGatherAsync create_all_gather_async_struct( } std::optional>> get_global_semaphores( - std::vector const& devices, - CoreRange const& core_range, + const std::vector& devices, + const CoreRange& core_range, std::optional subdevice_id, bool create_semaphore_handles) { std::optional>> semaphore_handles_opt; @@ -91,7 +91,6 @@ std::optional>> get_global_semaphor return semaphore_handles_opt; } - } // namespace all_gather_detail } // namespace ccl @@ -119,6 +118,22 @@ void AllGatherAsync::validate(const std::vector& input_tensors) const { input_tensor.memory_config().memory_layout); } +static void validate_output_tensor_allocation(const std::vector& output_tensors) { + for (const auto& output_tensor : output_tensors) { + const auto& buffers = output_tensor.buffers(); + const auto first_address = buffers.front()->address(); + TT_FATAL( + std::all_of( + buffers.begin(), + buffers.end(), + [&first_address](const auto& buffer) { + return buffer != nullptr && buffer->address() == first_address; + }), + "Output buffers for all_gather async must be lock-step allocated but some of the tensors were allocated at " + "different addresses across devices."); + } +} + std::vector AllGatherAsync::compute_output_shapes(const std::vector& input_tensors) const { auto shape = input_tensors[0].get_padded_shape(); // TODO: Replace with get_logical_shape() shape[this->dim] *= this->ring_size; @@ -204,11 +219,8 @@ Tensor all_gather_async( CoreCoord grid_size = devices[0]->compute_with_storage_grid_size(); auto core_grid = CoreRange({0, 0}, {grid_size.x - 1, grid_size.y - 1}); - std::optional>> semaphore_handles_opt = ttnn::ccl::all_gather_detail::get_global_semaphores( - devices, - core_grid, - subdevice_id, - create_semaphore_handles); + std::optional>> semaphore_handles_opt = + ttnn::ccl::all_gather_detail::get_global_semaphores(devices, core_grid, subdevice_id, create_semaphore_handles); operation::launch_op( [dim, @@ -257,7 +269,7 @@ Tensor all_gather_async( "This all_gather API with cluster_axis is currently supported only for the Linear topology"); const auto mesh_view = mesh_device.get_view(); auto devices = input_tensor.get_workers(); - std::size_t num_devices = (cluster_axis == 0) ? mesh_view->num_rows() : mesh_view->num_cols(); + std::size_t num_devices = (cluster_axis == 0) ? mesh_view.num_rows() : mesh_view.num_cols(); int32_t rank = input_tensor.get_logical_shape().rank(); @@ -273,11 +285,8 @@ Tensor all_gather_async( std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; CoreCoord grid_size = devices[0]->compute_with_storage_grid_size(); auto core_grid = CoreRange({0, 0}, {grid_size.x - 1, grid_size.y - 1}); - std::optional>> semaphore_handles_opt = ttnn::ccl::all_gather_detail::get_global_semaphores( - devices, - core_grid, - subdevice_id, - create_semaphore_handles); + std::optional>> semaphore_handles_opt = + ttnn::ccl::all_gather_detail::get_global_semaphores(devices, core_grid, subdevice_id, create_semaphore_handles); operation::launch_op( [gather_dim, @@ -293,12 +302,11 @@ Tensor all_gather_async( const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { const auto& input_device_tensor = input_tensors.at(0); - - const auto coordinate = mesh_view->find_device(input_device_tensor.device()->id()); - std::vector devices = (cluster_axis == 0) - ? mesh_view->get_devices_on_column(coordinate.col) - : mesh_view->get_devices_on_row(coordinate.row); - + + const auto coordinate = mesh_view.find_device(input_device_tensor.device()->id()); + std::vector devices = (cluster_axis == 0) ? mesh_view.get_devices_on_column(coordinate.col) + : mesh_view.get_devices_on_row(coordinate.row); + const auto& input_tensor = input_tensors.at(0); return operation::run( diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.hpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.hpp index 9a96d1214b4..b5bc4095f2f 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC // // SPDX-License-Identifier: Apache-2.0 diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_program.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_program.cpp index 673118aee20..dc83794cdb8 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_program.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_program.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC // // SPDX-License-Identifier: Apache-2.0 /// @@ -172,9 +172,6 @@ operation::ProgramWithCallbacks all_gather_async_multi_core_with_workers( std::vector input_tensors = {input_tensor}; std::vector output_tensors = {output_tensor}; const auto& op_config = ttnn::ccl::CCLOpConfig(input_tensors, output_tensors, topology); - const auto& input_tensor_partition = ttnn::ccl::TensorPartition(1, 0); // one partition, 0 index - const auto& output_tensor_partition = - ttnn::ccl::TensorPartition(ring_size, ring_index); // ring_size partitions, ring_index index // Get worker cores, assuming 1 worker per link uint32_t num_workers_per_link = 1; diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.cpp index 8c00aca54c8..27053e75455 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC // // SPDX-License-Identifier: Apache-2.0 diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.hpp b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.hpp index 5ac1b0cd1a5..5d60c264b2a 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC // // SPDX-License-Identifier: Apache-2.0 diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter.cpp index a9e623d7634..39953e0d8a1 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC // // SPDX-License-Identifier: Apache-2.0 diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter.hpp b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter.hpp index d3d8dcd512a..8c42952c8b9 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC // // SPDX-License-Identifier: Apache-2.0 diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter_pybind.cpp index aef9877e4f9..e87c6738f49 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter_pybind.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC // // SPDX-License-Identifier: Apache-2.0 diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter_pybind.hpp b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter_pybind.hpp index c0306ba0e18..8a29e0c4e39 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter_pybind.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC // // SPDX-License-Identifier: Apache-2.0