diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp index 65fad37937b..43466f36f65 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp @@ -94,11 +94,11 @@ void test_EnqueueWriteBuffer_and_EnqueueReadBuffer(Device *device, CommandQueue // Clear out command queue uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id()); chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id()); - uint32_t cq_size = tt::Cluster::instance().get_host_channel_size(mmio_device_id, channel) / device->num_hw_cqs(); + uint32_t cq_size = device->sysmem_manager().get_cq_size(); std::vector cq_zeros((cq_size - CQ_START) / sizeof(uint32_t), 0); - tt::Cluster::instance().write_sysmem(cq_zeros.data(), (cq_size - CQ_START), CQ_START, mmio_device_id, channel); + tt::Cluster::instance().write_sysmem(cq_zeros.data(), (cq_size - CQ_START), get_absolute_cq_offset(channel, 0, cq_size) + CQ_START, mmio_device_id, channel); for (const bool cq_write : {true, false}) { for (const bool cq_read : {true, false}) { @@ -327,6 +327,7 @@ namespace dram_tests { TEST_F(CommandQueueSingleCardFixture, WriteOneTileToDramBank0) { TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::DRAM}; for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(device, device->command_queue(), config); } } @@ -428,11 +429,9 @@ TEST_F(CommandQueueFixture, TestPageSizeTooLarge) { // Requires enqueue write buffer TEST_F(CommandQueueSingleCardFixture, TestWrapHostHugepageOnEnqueueReadBuffer) { for (Device *device : this->devices_) { + tt::log_info("Running On Device {}", device->id()); uint32_t page_size = 2048; - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id()); - chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id()); - uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(mmio_device_id, channel); - uint32_t command_issue_region_size = 805310400; + uint32_t command_issue_region_size = device->sysmem_manager().get_issue_queue_size(0); uint32_t max_command_size = command_issue_region_size - CQ_START; uint32_t buffer = 14240; @@ -446,10 +445,9 @@ TEST_F(CommandQueueSingleCardFixture, TestWrapHostHugepageOnEnqueueReadBuffer) { TEST_F(CommandQueueSingleCardFixture, TestIssueMultipleReadWriteCommandsForOneBuffer) { for (Device *device : this->devices_) { + tt::log_info("Running On Device {}", device->id()); uint32_t page_size = 2048; - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id()); - chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id()); - uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(mmio_device_id, channel); + uint32_t command_queue_size = device->sysmem_manager().get_cq_size(); uint32_t num_pages = command_queue_size / page_size; TestBufferConfig config = {.num_pages = num_pages, .page_size = page_size, .buftype = BufferType::DRAM}; @@ -465,10 +463,8 @@ TEST_F(CommandQueueSingleCardFixture, 
TestWrapCompletionQOnInsufficientSpace) { uint32_t small_page_size = 2048; // page size for second read for (Device *device : devices_) { - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id()); - chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id()); - uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(mmio_device_id, channel); - uint32_t command_completion_region_size = 268431360; + tt::log_info("Running On Device {}", device->id()); + uint32_t command_completion_region_size = device->sysmem_manager().get_completion_queue_size(0); uint32_t first_buffer_size = tt::round_up(command_completion_region_size * 0.95, large_page_size); @@ -504,10 +500,8 @@ TEST_F(CommandQueueSingleCardFixture, TestWrapCompletionQOnInsufficientSpace) { TEST_F(CommandQueueSingleCardFixture, TestWrapCompletionQOnInsufficientSpace2) { // Using default 75-25 issue and completion queue split for (Device *device : devices_) { - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id()); - chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id()); - uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(mmio_device_id, channel); - uint32_t command_completion_region_size = 268431360; + tt::log_info("Running On Device {}", device->id()); + uint32_t command_completion_region_size = device->sysmem_manager().get_completion_queue_size(0); uint32_t num_pages_buff_1 = 9; uint32_t page_size_buff_1 = 2048; @@ -653,6 +647,7 @@ TEST_F(CommandQueueSingleCardFixture, WritesToRandomBufferTypeAndThenReadsBlocki .seed = 0, .num_pages_total = 50000, .page_size = 2048, .max_num_pages_per_buffer = 16}; for (Device *device : devices_) { + tt::log_info("Running on Device {}", device->id()); EXPECT_TRUE(local_test_functions::stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer( device, device->command_queue(), config)); } @@ -672,34 +667,34 @@ TEST_F(CommandQueueSingleCardFixture, WritesToRandomBufferTypeAndThenReadsNonblo // TODO: Split this into separate tests TEST_F(CommandQueueSingleCardFixture, ShardedBufferL1ReadWrites) { + std::map>> test_params; + for (Device *device : devices_) { - for (const std::array cores : - {std::array{1, 1}, - std::array{5, 1}, - std::array{1, 5}, - std::array{5, 3}, - std::array{3, 5}, - std::array{5, 5}, - std::array{ - static_cast(device->compute_with_storage_grid_size().x), - static_cast(device->compute_with_storage_grid_size().y)}}) { - for (const std::array num_pages : { - std::array{1, 1}, - std::array{2, 1}, - std::array{1, 2}, - std::array{2, 2}, - std::array{7, 11}, - std::array{3, 65}, - std::array{67, 4}, - std::array{3, 137}, - }) { - for (const std::array page_shape : { - std::array{32, 32}, - std::array{1, 4}, - std::array{1, 120}, - std::array{1, 1024}, - std::array{1, 2048}, - }) { + if (tt::Cluster::instance().is_galaxy_cluster()) { + test_params = { + {"cores", + {{1, 1}, + {static_cast(device->compute_with_storage_grid_size().x), + static_cast(device->compute_with_storage_grid_size().y)}}}, + {"num_pages", {{3, 65}}}, + {"page_shape", {{32, 32}}}}; + } else { + test_params = { + {"cores", + {{1, 1}, + {5, 1}, + {1, 5}, + {5, 3}, + {3, 5}, + {5, 5}, + {static_cast(device->compute_with_storage_grid_size().x), + static_cast(device->compute_with_storage_grid_size().y)}}}, + {"num_pages", {{1, 1}, {2, 1}, {1, 2}, {2, 2}, {7, 11}, {3, 65}, {67, 4}, {3, 137}}}, + {"page_shape", {{32, 32}, {1, 4}, {1, 120}, {1, 1024}, 
{1, 2048}}}}; + } + for (const std::array cores : test_params.at("cores")) { + for (const std::array num_pages : test_params.at("num_pages")) { + for (const std::array page_shape : test_params.at("page_shape")) { for (const TensorMemoryLayout shard_strategy : {TensorMemoryLayout::HEIGHT_SHARDED, TensorMemoryLayout::WIDTH_SHARDED, @@ -712,7 +707,7 @@ TEST_F(CommandQueueSingleCardFixture, ShardedBufferL1ReadWrites) { config.num_iterations = num_iterations; config.mem_config = shard_strategy; config.page_shape = page_shape; - tt::log_info(tt::LogTest, fmt::format("cores: [{},{}] num_pages: [{},{}] page_shape: [{},{}], shard_strategy: {}, num_iterations: {}", cores[0],cores[1], num_pages[0],num_pages[1], page_shape[0],page_shape[1], magic_enum::enum_name(shard_strategy).data(), num_iterations).c_str()); + tt::log_info(tt::LogTest, fmt::format("Device: {} cores: [{},{}] num_pages: [{},{}] page_shape: [{},{}], shard_strategy: {}, num_iterations: {}", device->id(), cores[0],cores[1], num_pages[0],num_pages[1], page_shape[0],page_shape[1], magic_enum::enum_name(shard_strategy).data(), num_iterations).c_str()); local_test_functions::stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer_sharded( device, device->command_queue(), config, BufferType::L1, false); } @@ -800,7 +795,7 @@ TEST_F(CommandQueueSingleCardFixture, ShardedBufferLargeL1ReadWrites) { config.num_iterations = num_iterations; config.mem_config = shard_strategy; config.page_shape = page_shape; - tt::log_info(tt::LogTest, fmt::format("cores: [{},{}] num_pages: [{},{}] page_shape: [{},{}], shard_strategy: {}, num_iterations: {}", cores[0],cores[1], num_pages[0],num_pages[1], page_shape[0],page_shape[1], magic_enum::enum_name(shard_strategy).data(), num_iterations).c_str()); + tt::log_info(tt::LogTest, fmt::format("Device: {} cores: [{},{}] num_pages: [{},{}] page_shape: [{},{}], shard_strategy: {}, num_iterations: {}", device->id(), cores[0],cores[1], num_pages[0],num_pages[1], page_shape[0],page_shape[1], magic_enum::enum_name(shard_strategy).data(), num_iterations).c_str()); local_test_functions::stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer_sharded( device, device->command_queue(), config, BufferType::L1, true); } diff --git a/tt_metal/common/core_descriptor.hpp b/tt_metal/common/core_descriptor.hpp index 66c0c92e0b6..3c5db2a7312 100644 --- a/tt_metal/common/core_descriptor.hpp +++ b/tt_metal/common/core_descriptor.hpp @@ -119,6 +119,10 @@ inline const core_descriptor_t &get_core_descriptor_config(chip_id_t device_id, auto compute_with_storage_start = desc_yaml["compute_with_storage_grid_range"]["start"]; auto compute_with_storage_end = desc_yaml["compute_with_storage_grid_range"]["end"]; + if (tt::Cluster::instance().is_galaxy_cluster() and product_name == "nebula_x1") { + compute_with_storage_start = desc_yaml["tg_compute_with_storage_grid_range"]["start"]; + compute_with_storage_end = desc_yaml["tg_compute_with_storage_grid_range"]["end"]; + } TT_ASSERT(compute_with_storage_start.IsSequence() and compute_with_storage_end.IsSequence()); TT_ASSERT(compute_with_storage_end[0].as() >= compute_with_storage_start[0].as()); TT_ASSERT(compute_with_storage_end[1].as() >= compute_with_storage_start[1].as()); @@ -136,7 +140,11 @@ inline const core_descriptor_t &get_core_descriptor_config(chip_id_t device_id, } std::vector dispatch_cores; - for (const auto& core_node : desc_yaml["dispatch_cores"]) { + auto dispatch_cores_string = "dispatch_cores"; + if (tt::Cluster::instance().is_galaxy_cluster() and product_name == "nebula_x1") { + 
dispatch_cores_string = "tg_dispatch_cores"; + } + for (const auto& core_node : desc_yaml[dispatch_cores_string]) { RelativeCoreCoord coord = {}; if (core_node.IsSequence()) { // Logical coord diff --git a/tt_metal/core_descriptors/wormhole_b0_80_arch.yaml b/tt_metal/core_descriptors/wormhole_b0_80_arch.yaml index 9afdd48ccd2..68eb9d8df87 100644 --- a/tt_metal/core_descriptors/wormhole_b0_80_arch.yaml +++ b/tt_metal/core_descriptors/wormhole_b0_80_arch.yaml @@ -49,12 +49,21 @@ nebula_x1: start: [0, 0] end: [7, 7] + tg_compute_with_storage_grid_range: # Logical only start and end [x, y] + start: [0, 0] + end: [7, 3] + storage_cores: [] dispatch_cores: [[0, -1], [1, -1], [2, -1], [3, -1], [4, -1], [5, -1], [6, -1], [7, -1]] + tg_dispatch_cores: + [[0, -1], [1, -1], [2, -1], [3, -1], [4, -1], [5, -1], [6, -1], [7, -1], + [0, -2], [1, -2], [2, -2], [3, -2], [4, -2], [5, -2], [6, -2], [7, -2], + [0, -3], [1, -3], [2, -3], [3, -3], [4, -3], [5, -3], [6, -3], [7, -3]] + dispatch_core_type: "tensix" diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index f29618076d7..4b95502000e 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -74,6 +74,7 @@ Device::Device( ZoneScoped; TT_ASSERT(num_hw_cqs > 0 and num_hw_cqs < 3, "num_hw_cqs can be between 1 and 2"); this->build_key_ = tt::Cluster::instance().get_harvesting_mask(device_id); + tunnel_device_dispatch_workers_ = {}; this->initialize(l1_small_size, l1_bank_remap, minimal); } @@ -388,6 +389,737 @@ void Device::configure_kernel_variant( } } +void Device::update_workers_build_settings(std::vector>> &device_worker_variants) { + for (uint32_t dwv = 0; dwv < device_worker_variants.size(); dwv++) + { + if (device_worker_variants[dwv].size() == 0) { + continue; + } + log_debug(tt::LogMetal, "Setting up {} Arguments", magic_enum::enum_name((tt::tt_metal::DispatchWorkerType)dwv)); + switch(dwv) { + case PREFETCH: + { + uint32_t num_prefetchers = device_worker_variants[PREFETCH].size(); + TT_ASSERT(device_worker_variants[MUX].size() == 1, "Cannot have more than one Mux."); + auto mux_settings = std::get<1>(device_worker_variants[MUX][0]); + TT_ASSERT(num_prefetchers == mux_settings.semaphores.size(), "Mux does not have required number of semaphores for Prefetchers. Expected = {}. 
Found = {}", num_prefetchers, mux_settings.semaphores.size()); + uint32_t mux_sem = mux_settings.consumer_semaphore_id; + for (auto&[core, settings] : device_worker_variants[PREFETCH]) { + auto dispatch_core_type = settings.dispatch_core_type; + uint32_t downstream_cb_base = mux_settings.cb_start_address + mux_settings.cb_size_bytes * mux_sem; + settings.upstream_cores.push_back(tt_cxy_pair(0, 0, 0)); + settings.downstream_cores.push_back(mux_settings.worker_physical_core); + settings.compile_args.resize(23); + auto& compile_args = settings.compile_args; + compile_args[0] = downstream_cb_base; + compile_args[1] = dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE; + compile_args[2] = dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_pages(); + compile_args[3] = settings.producer_semaphore_id; + compile_args[4] = mux_sem++; + compile_args[5] = settings.issue_queue_start_addr; + compile_args[6] = settings.issue_queue_size; + compile_args[7] = dispatch_constants::PREFETCH_Q_BASE; + compile_args[8] = dispatch_constants::get(dispatch_core_type).prefetch_q_size(); + compile_args[9] = CQ_PREFETCH_Q_RD_PTR; + compile_args[10] = dispatch_constants::get(dispatch_core_type).cmddat_q_base(); + compile_args[11] = dispatch_constants::get(dispatch_core_type).cmddat_q_size(); + compile_args[12] = dispatch_constants::get(dispatch_core_type).scratch_db_base(); // unused for prefetch_h + compile_args[13] = dispatch_constants::get(dispatch_core_type).scratch_db_size(); // unused for prefetch_h + compile_args[14] = 0; //prefetch_sync_sem unused for prefetch_h + compile_args[15] = dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_pages(); // prefetch_d only + compile_args[16] = 0; // prefetch_d only + compile_args[17] = 0; //prefetch_downstream_cb_sem, // prefetch_d only + compile_args[18] = dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE; + compile_args[19] = dispatch_constants::PREFETCH_D_BUFFER_BLOCKS; // prefetch_d only + compile_args[20] = 2; //prefetch_h_exec_buf_sem, + compile_args[21] = false; // is_dram_variant + compile_args[22] = true; // is_host_variant + } + break; + } + case MUX: + { + uint32_t num_prefetchers = device_worker_variants[PREFETCH].size(); + TT_ASSERT(device_worker_variants[MUX].size() == 1, "Cannot have more than one Mux."); + auto &mux_settings = std::get<1>(device_worker_variants[MUX][0]); + TT_ASSERT(num_prefetchers == mux_settings.semaphores.size(), "Mux does not have required number of semaphores for Prefetchers. Expected = {}. 
Found = {}", num_prefetchers, mux_settings.semaphores.size()); + uint32_t mux_sem = mux_settings.consumer_semaphore_id; + + auto& compile_args = mux_settings.compile_args; + compile_args.resize(25); + compile_args[0] = 0; // 0: reserved + compile_args[1] = mux_settings.cb_start_address >> 4; // 1: rx_queue_start_addr_words + compile_args[2] = mux_settings.cb_size_bytes >> 4; // 2: rx_queue_size_words + compile_args[3] = num_prefetchers; // 3: mux_fan_in + uint32_t arg_index = 4; + for (auto&[core, settings] : device_worker_variants[PREFETCH]) { + compile_args[arg_index++] = packet_switch_4B_pack((uint32_t)settings.worker_physical_core.x, + (uint32_t)settings.worker_physical_core.y, + 1, + (uint32_t)DispatchRemoteNetworkType::NOC0); // 4,5,6,7: src x info + } + TT_ASSERT(device_worker_variants[US_TUNNELER_REMOTE].size() == 1, "Unexpected number of ethernet tunnelers."); + auto &tunneler_settings = std::get<1>(device_worker_variants[US_TUNNELER_REMOTE][0]); + + compile_args[8] = tunneler_settings.cb_start_address >> 4; // 8: remote_tx_queue_start_addr_words + compile_args[9] = tunneler_settings.cb_size_bytes >> 4; // 9: remote_tx_queue_size_words + compile_args[10] = tunneler_settings.worker_physical_core.x; // 10: remote_tx_x + compile_args[11] = tunneler_settings.worker_physical_core.y; // 11: remote_tx_y + compile_args[12] = 0; // 12: remote_tx_queue_id + compile_args[13] = (uint32_t)DispatchRemoteNetworkType::NOC0; // 13: tx_network_type + compile_args[14] = BRISC_L1_RESULT_BASE; // 14: test_results_addr + compile_args[15] = 1024; // 15: test_results_size + compile_args[16] = 0; // 16: timeout_cycles + compile_args[17] = 0x0; // 17: output_depacketize + compile_args[18] = 0x0; // 18: output_depacketize info + arg_index = 19; // 19, 20, 21, 22: input x packetize info: + for (auto&[core, settings] : device_worker_variants[PREFETCH]) { + compile_args[arg_index++] = packet_switch_4B_pack(0x1, + dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE, + settings.producer_semaphore_id, // upstream sem + mux_sem++); // local sem + } + compile_args[23] = packet_switch_4B_pack(0xA1, 0xA2, 0xA3, 0xA4); // 23: packetized input src id + compile_args[24] = packet_switch_4B_pack(0xB1, 0xB2, 0xB3, 0xB4); // 24: packetized input dest id + break; + } + case US_TUNNELER_REMOTE: + { + TT_ASSERT(device_worker_variants[US_TUNNELER_REMOTE].size() == 1, "Unexpected number of ethernet tunnelers."); + auto &tunneler_settings = std::get<1>(device_worker_variants[US_TUNNELER_REMOTE][0]); + bool is_tunnel_start = tunneler_settings.tunnel_stop == 0; + auto &compile_args = tunneler_settings.compile_args; + compile_args.resize(16); + compile_args[0] = 0xDACADACA; // 0: endpoint_id_start_index + compile_args[1] = 2; // tunnel_lanes. 1 => Unidirectional. 2 => Bidirectional. 
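+                // Note: queue base addresses and sizes handed to the packet-switch kernels are expressed in 16-byte words, hence the ">> 4" applied to byte values below.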
+ compile_args[2] = tunneler_settings.cb_start_address >> 4; // 2: rx_queue_start_addr_words + compile_args[3] = tunneler_settings.cb_size_bytes >> 4; // 3: rx_queue_size_words + + compile_args[4] = packet_switch_4B_pack(tunneler_settings.eth_partner_physical_core.x, + tunneler_settings.eth_partner_physical_core.y, + 0, + (uint32_t)DispatchRemoteNetworkType::ETH); // 4: remote_receiver_0_info + compile_args[6] = tunneler_settings.cb_start_address >> 4; // 6: remote_receiver_queue_start_addr_words 0 + compile_args[7] = tunneler_settings.cb_size_bytes >> 4; // 7: remote_receiver_queue_size_words 0 + + if (is_tunnel_start) { + auto &demux_settings = std::get<1>(device_worker_variants[DEMUX][0]); + auto &mux_settings = std::get<1>(device_worker_variants[MUX][0]); + + compile_args[5] = packet_switch_4B_pack(demux_settings.worker_physical_core.x, + demux_settings.worker_physical_core.y, + device_worker_variants[DISPATCH].size(),//num_dest_endpoints, + (uint32_t)DispatchRemoteNetworkType::NOC0); // 5: remote_receiver_1_info + compile_args[8] = demux_settings.cb_start_address >> 4; // 8: remote_receiver_queue_start_addr_words 1 + compile_args[9] = demux_settings.cb_size_bytes >> 4; // 9: remote_receiver_queue_size_words 1 + compile_args[10] = packet_switch_4B_pack(mux_settings.worker_physical_core.x, + mux_settings.worker_physical_core.y, + device_worker_variants[PREFETCH].size(), // mux output queue id + (uint32_t)DispatchRemoteNetworkType::NOC0); // 10: remote_sender_0_info + } else { + auto &mux_d_settings = std::get<1>(device_worker_variants[MUX_D][0]); + auto &demux_d_settings = std::get<1>(device_worker_variants[DEMUX_D][0]); + + compile_args[5] = packet_switch_4B_pack(mux_d_settings.worker_physical_core.x, + mux_d_settings.worker_physical_core.y, + 1,//num_dest_endpoints, + (uint32_t)DispatchRemoteNetworkType::NOC0); // 5: remote_receiver_1_info + compile_args[8] = (mux_d_settings.cb_start_address + mux_d_settings.cb_size_bytes) >> 4; // 8: remote_receiver_queue_start_addr_words 1 + compile_args[9] = mux_d_settings.cb_size_bytes >> 4; // 9: remote_receiver_queue_size_words 1 + compile_args[10] = packet_switch_4B_pack(demux_d_settings.worker_physical_core.x, + demux_d_settings.worker_physical_core.y, + 1, // demux output queue id + (uint32_t)DispatchRemoteNetworkType::NOC0); // 10: remote_sender_0_info + } + + compile_args[11] = packet_switch_4B_pack(tunneler_settings.eth_partner_physical_core.x, + tunneler_settings.eth_partner_physical_core.y, + 3, // r tunneler output queue id + (uint32_t)DispatchRemoteNetworkType::ETH); // 11: remote_sender_1_info + + compile_args[12] = 0x39000; // 12: test_results_addr + compile_args[13] = 0x7000; // 13: test_results_size + compile_args[14] = 0; // 14: timeout_cycles + + break; + } + case DEMUX: + { + TT_ASSERT(device_worker_variants[DEMUX].size() == 1, "Unexpected number of demux workers."); + auto &tunneler_settings = std::get<1>(device_worker_variants[US_TUNNELER_REMOTE][0]); + auto &demux_settings = std::get<1>(device_worker_variants[DEMUX][0]); + auto &dispatch_settings = std::get<1>(device_worker_variants[DISPATCH][0]); + + auto &compile_args = demux_settings.compile_args; + compile_args.resize(30); + + compile_args[0] = 0xD1; // 0: endpoint_id_start_index + compile_args[1] = demux_settings.cb_start_address >> 4; // 1: rx_queue_start_addr_words + compile_args[2] = demux_settings.cb_size_bytes >> 4; // 2: rx_queue_size_words + compile_args[3] = device_worker_variants[DISPATCH].size(); // 3: demux_fan_out + + uint32_t arg_index = 4; + for 
(auto&[core, settings] : device_worker_variants[DISPATCH]) { + compile_args[arg_index++] = packet_switch_4B_pack((uint32_t)settings.worker_physical_core.x, + (uint32_t)settings.worker_physical_core.y, + 0, + (uint32_t)DispatchRemoteNetworkType::NOC0); // 4,5,6,7: remote_tx_x_info + } + arg_index = 8; + for (auto&[core, settings] : device_worker_variants[DISPATCH]) { + compile_args[arg_index++] = settings.cb_start_address >> 4; // 8, 10, 12, 14: remote_tx_queue_start_addr_words x + compile_args[arg_index++] = settings.cb_size_bytes >> 4; // 9, 11, 13, 15: remote_tx_queue_size_words x + } + compile_args[16] = tunneler_settings.worker_physical_core.x; // 16: remote_rx_x + compile_args[17] = tunneler_settings.worker_physical_core.y; // 17: remote_rx_y + compile_args[18] = 3; // 18: remote_rx_queue_id + compile_args[19] = (uint32_t)DispatchRemoteNetworkType::NOC0; // 19: tx_network_type + uint32_t dest_map_array[4] = {0, 1, 2, 3}; + uint64_t dest_endpoint_output_map = packet_switch_dest_pack(dest_map_array, 4); + compile_args[20] = (uint32_t)(dest_endpoint_output_map >> 32); // 20: dest_endpoint_output_map_hi + compile_args[21] = (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF); // 21: dest_endpoint_output_map_lo + compile_args[22] = BRISC_L1_RESULT_BASE; // 22: test_results_addr + compile_args[23] = 1024; // 23: test_results_size + compile_args[24] = 0; // 24: timeout_cycles + compile_args[25] = 0xF; // 25: output_depacketize_mask + arg_index = 26; + uint32_t demux_sem = demux_settings.producer_semaphore_id; + for (auto&[core, settings] : device_worker_variants[DISPATCH]) { + // 26, 27, 28, 29: output x depacketize info: + compile_args[arg_index++] = packet_switch_4B_pack(settings.cb_log_page_size, + settings.consumer_semaphore_id, // downstream sem + demux_sem++, // local sem + 1); // remove header + } + break; + } + case DISPATCH: + { + uint32_t num_dispatchers = device_worker_variants[DISPATCH].size(); + TT_ASSERT(device_worker_variants[DEMUX].size() == 1, "Cannot have more than one Demux."); + auto demux_settings = std::get<1>(device_worker_variants[DEMUX][0]); + TT_ASSERT(num_dispatchers == demux_settings.semaphores.size(), "Demux does not have required number of semaphores for Dispatchers. Expected = {}. 
Found = {}", num_dispatchers, demux_settings.semaphores.size()); + uint32_t demux_sem = demux_settings.producer_semaphore_id; + for (auto&[core, settings] : device_worker_variants[DISPATCH]) { + auto dispatch_core_type = settings.dispatch_core_type; + settings.upstream_cores.push_back(demux_settings.worker_physical_core); + settings.downstream_cores.push_back(tt_cxy_pair(0, 0, 0)); + settings.compile_args.resize(17); + auto& compile_args = settings.compile_args; + compile_args[0] = settings.cb_start_address; + compile_args[1] = settings.cb_log_page_size; + compile_args[2] = settings.cb_pages; + compile_args[3] = settings.consumer_semaphore_id; + compile_args[4] = demux_sem++; + compile_args[5] = dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS; + compile_args[6] = 0; //unused prefetch_sync_sem + compile_args[7] = settings.command_queue_start_addr; + compile_args[8] = settings.completion_queue_start_addr; + compile_args[9] = settings.completion_queue_size; + compile_args[10] = dispatch_constants::DISPATCH_BUFFER_BASE; // unused + compile_args[11] = (1 << dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE) * dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(); // unused + compile_args[12] = 0; // unused: local ds semaphore + compile_args[13] = 0; // unused: remote ds semaphore + compile_args[14] = 0; // preamble size + compile_args[15] = false; // is_dram_variant + compile_args[16] = true; // is_host_variant + } + break; + } + case US_TUNNELER_LOCAL: + { + bool is_tunnel_end = device_worker_variants[US_TUNNELER_REMOTE].size() == 0; + TT_ASSERT(device_worker_variants[US_TUNNELER_LOCAL].size() == 1, "Unexpected number of ethernet tunnelers."); + auto &tunneler_settings = std::get<1>(device_worker_variants[US_TUNNELER_LOCAL][0]); + auto &demux_d_settings = std::get<1>(device_worker_variants[DEMUX_D][0]); + auto &mux_d_settings = std::get<1>(device_worker_variants[MUX_D][0]); + + auto &compile_args = tunneler_settings.compile_args; + compile_args.resize(16); + compile_args[0] = 0xDACADACA; // 0: endpoint_id_start_index + compile_args[1] = 2; // tunnel_lanes. 1 => Unidirectional. 2 => Bidirectional. + compile_args[2] = tunneler_settings.cb_start_address >> 4; // 2: rx_queue_start_addr_words + compile_args[3] = tunneler_settings.cb_size_bytes >> 4; // 3: rx_queue_size_words + + compile_args[4] = packet_switch_4B_pack(demux_d_settings.worker_physical_core.x, + demux_d_settings.worker_physical_core.y, + is_tunnel_end ? 
1 : 2, + (uint32_t)DispatchRemoteNetworkType::NOC0); // 4: remote_receiver_0_info + + compile_args[5] = packet_switch_4B_pack(tunneler_settings.eth_partner_physical_core.x, + tunneler_settings.eth_partner_physical_core.y, + 1, // input q id of remote ethernet tunneler + (uint32_t)DispatchRemoteNetworkType::ETH); // 5: remote_receiver_1_info + + compile_args[6] = demux_d_settings.cb_start_address >> 4; // 6: remote_receiver_queue_start_addr_words 0 + compile_args[7] = demux_d_settings.cb_size_bytes >> 4; // 7: remote_receiver_queue_size_words 0 + compile_args[8] = (tunneler_settings.cb_start_address + tunneler_settings.cb_size_bytes) >> 4; // 8: remote_receiver_queue_start_addr_words 1 + compile_args[9] = tunneler_settings.cb_size_bytes >> 4; // 9: remote_receiver_queue_size_words 1 + + compile_args[10] = packet_switch_4B_pack(tunneler_settings.eth_partner_physical_core.x, + tunneler_settings.eth_partner_physical_core.y, + 2, // queue id of remote eth tunneler sender + (uint32_t)DispatchRemoteNetworkType::ETH); // 10: remote_sender_0_info + compile_args[11] = packet_switch_4B_pack(mux_d_settings.worker_physical_core.x, + mux_d_settings.worker_physical_core.y, + is_tunnel_end ? 1 : 2, // mux_d output queue id + (uint32_t)DispatchRemoteNetworkType::NOC0); // 11: remote_sender_1_info + compile_args[12] = 0x39000; // 12: test_results_addr + compile_args[13] = 0x7000; // 13: test_results_size + compile_args[14] = 0; // 14: timeout_cycles + if (!is_tunnel_end && tunneler_settings.tunnel_stop > 1) { + auto &us_tunneler_remote_settings = std::get<1>(device_worker_variants[US_TUNNELER_REMOTE][0]); + auto mux_d_sender = us_tunneler_remote_settings.worker_physical_core; + compile_args[15] = (0x3 << 16) | (mux_d_sender.y << 8) | (mux_d_sender.x); + log_debug(tt::LogMetal, "Tunneler on inner device {} will send done to {}", tunneler_settings.worker_physical_core.str(), mux_d_sender.str()); + } + + break; + } + case DEMUX_D: + { + bool is_tunnel_end = device_worker_variants[US_TUNNELER_REMOTE].size() == 0; + TT_ASSERT(device_worker_variants[DEMUX_D].size() == 1, "Unexpected number of device demux."); + + auto &tunneler_settings = std::get<1>(device_worker_variants[US_TUNNELER_LOCAL][0]); + auto &demux_d_settings = std::get<1>(device_worker_variants[DEMUX_D][0]); + auto &prefetch_d_settings = std::get<1>(device_worker_variants[PREFETCH_D][0]); + + TT_ASSERT(demux_d_settings.tunnel_stop > 0 && demux_d_settings.tunnel_stop <= 4, "Invalid Demux D tunnel stop."); + + auto &compile_args = demux_d_settings.compile_args; + compile_args.resize(30); + + compile_args[0] = 0xB1; // 0: endpoint_id_start_index + compile_args[1] = demux_d_settings.cb_start_address >> 4; // 1: rx_queue_start_addr_words + compile_args[2] = demux_d_settings.cb_size_bytes >> 4; // 2: rx_queue_size_words + compile_args[3] = is_tunnel_end ? 
1 : 2; // 3: demux_fan_out + + compile_args[4] = packet_switch_4B_pack(prefetch_d_settings.worker_physical_core.x, + prefetch_d_settings.worker_physical_core.y, + 0, + (uint32_t)DispatchRemoteNetworkType::NOC0); // 4: remote_tx_0_info + + compile_args[8] = prefetch_d_settings.cb_start_address >> 4; // 8: remote_tx_queue_start_addr_words 0 + compile_args[9] = prefetch_d_settings.cb_size_bytes >> 4; // 9: remote_tx_queue_size_words 0 + + if(!is_tunnel_end) { + auto &us_tunneler_remote_settings = std::get<1>(device_worker_variants[US_TUNNELER_REMOTE][0]); + compile_args[5] = packet_switch_4B_pack((uint32_t)us_tunneler_remote_settings.worker_physical_core.x, + (uint32_t)us_tunneler_remote_settings.worker_physical_core.y, + 0, + (uint32_t)DispatchRemoteNetworkType::NOC0); // 5: remote_tx_1_info + + compile_args[10] = us_tunneler_remote_settings.cb_start_address >> 4; // 10: remote_tx_queue_start_addr_words 1 + compile_args[11] = us_tunneler_remote_settings.cb_size_bytes >> 4; // 11: remote_tx_queue_size_words 1 + } + + compile_args[16] = tunneler_settings.worker_physical_core.x; // 16: remote_rx_x + compile_args[17] = tunneler_settings.worker_physical_core.y; // 17: remote_rx_y + compile_args[18] = 2; // 18: remote_rx_queue_id + compile_args[19] = (uint32_t)DispatchRemoteNetworkType::NOC0; // 19: tx_network_type + uint32_t dest_map_array[4] = {1, 1, 1, 1}; // needs to be based on tunnel stop. + dest_map_array[demux_d_settings.tunnel_stop-1] = 0; + uint64_t dest_endpoint_output_map = packet_switch_dest_pack(dest_map_array, 4); + compile_args[20] = (uint32_t)(dest_endpoint_output_map >> 32); // 20: dest_endpoint_output_map_hi + compile_args[21] = (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF); // 21: dest_endpoint_output_map_lo + compile_args[22] = BRISC_L1_RESULT_BASE; // 22: test_results_addr + compile_args[23] = 1024; // 23: test_results_size + compile_args[24] = 0; // 24: timeout_cycles + compile_args[25] = 0x1; // 25: output_depacketize_mask + compile_args[26] = packet_switch_4B_pack(prefetch_d_settings.cb_log_page_size, + prefetch_d_settings.consumer_semaphore_id, // downstream sem + demux_d_settings.producer_semaphore_id, // local sem + 0); // remove header + break; + } + case PREFETCH_D: + { + + uint32_t num_prefetchers = device_worker_variants[PREFETCH_D].size(); + TT_ASSERT(device_worker_variants[DEMUX_D].size() == 1, "Cannot have more than one Demux D."); + auto &prefetch_d_settings = std::get<1>(device_worker_variants[PREFETCH_D][0]); + auto demux_d_settings = std::get<1>(device_worker_variants[DEMUX_D][0]); + auto dispatch_d_settings = std::get<1>(device_worker_variants[DISPATCH_D][0]); + + TT_ASSERT(num_prefetchers == demux_d_settings.semaphores.size(), "Demux D does not have required number of semaphores for Prefetcher D. Expected = {}. Found = {}", num_prefetchers, demux_d_settings.semaphores.size()); + + auto dispatch_core_type = prefetch_d_settings.dispatch_core_type; + prefetch_d_settings.upstream_cores.push_back(demux_d_settings.worker_physical_core); + prefetch_d_settings.downstream_cores.push_back(dispatch_d_settings.worker_physical_core); + + uint32_t scratch_db_base = (prefetch_d_settings.cb_start_address + prefetch_d_settings.cb_size_bytes + PCIE_ALIGNMENT - 1) & (~(PCIE_ALIGNMENT - 1)); + uint32_t scratch_db_size = dispatch_constants::get(dispatch_core_type).scratch_db_size(); + const uint32_t l1_size = dispatch_core_type == CoreType::WORKER ? 
MEM_L1_SIZE : MEM_ETH_SIZE; + TT_ASSERT(scratch_db_base + scratch_db_size <= l1_size); + + auto& compile_args = prefetch_d_settings.compile_args; + compile_args.resize(23); + compile_args[0] = dispatch_d_settings.cb_start_address; + compile_args[1] = dispatch_d_settings.cb_log_page_size; + compile_args[2] = dispatch_d_settings.cb_pages; + compile_args[3] = prefetch_d_settings.producer_semaphore_id; + compile_args[4] = dispatch_d_settings.consumer_semaphore_id; + compile_args[5] = 0; + compile_args[6] = 0; + compile_args[7] = 0; + compile_args[8] = dispatch_constants::get(dispatch_core_type).prefetch_q_size(); + compile_args[9] = CQ_PREFETCH_Q_RD_PTR; + compile_args[10] = prefetch_d_settings.cb_start_address; + compile_args[11] = prefetch_d_settings.cb_size_bytes; + compile_args[12] = scratch_db_base; + compile_args[13] = scratch_db_size; + compile_args[14] = 0; //prefetch_sync_sem + compile_args[15] = prefetch_d_settings.cb_pages; // prefetch_d only + compile_args[16] = prefetch_d_settings.consumer_semaphore_id; // prefetch_d only + compile_args[17] = demux_d_settings.producer_semaphore_id; //prefetch_downstream_cb_sem, // prefetch_d only + compile_args[18] = prefetch_d_settings.cb_log_page_size; + compile_args[19] = dispatch_constants::PREFETCH_D_BUFFER_BLOCKS; // prefetch_d only + compile_args[20] = 2; //prefetch_h_exec_buf_sem, + compile_args[21] = true; // is_dram_variant + compile_args[22] = false; // is_host_variant + break; + } + case DISPATCH_D: + { + uint32_t num_dispatchers = device_worker_variants[DISPATCH_D].size(); + TT_ASSERT(device_worker_variants[MUX_D].size() == 1, "Cannot have more than one Mux D."); + auto mux_d_settings = std::get<1>(device_worker_variants[MUX_D][0]); + TT_ASSERT(num_dispatchers == mux_d_settings.semaphores.size(), "Mux D does not have required number of semaphores for Dispatchers. Expected = {}. 
Found = {}", num_dispatchers, mux_d_settings.semaphores.size()); + uint32_t sem = 0; + auto &dispatch_d_settings = std::get<1>(device_worker_variants[DISPATCH_D][0]); + auto prefetch_d_settings = std::get<1>(device_worker_variants[PREFETCH_D][0]); + + auto dispatch_core_type = dispatch_d_settings.dispatch_core_type; + dispatch_d_settings.upstream_cores.push_back(prefetch_d_settings.worker_physical_core); + dispatch_d_settings.downstream_cores.push_back(mux_d_settings.worker_physical_core); + dispatch_d_settings.compile_args.resize(17); + auto& compile_args = dispatch_d_settings.compile_args; + compile_args[0] = dispatch_d_settings.cb_start_address; + compile_args[1] = dispatch_d_settings.cb_log_page_size; + compile_args[2] = dispatch_d_settings.cb_pages; + compile_args[3] = dispatch_d_settings.consumer_semaphore_id; + compile_args[4] = prefetch_d_settings.producer_semaphore_id; + compile_args[5] = dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS; + compile_args[6] = 0; + compile_args[7] = dispatch_d_settings.command_queue_start_addr; + compile_args[8] = dispatch_d_settings.completion_queue_start_addr; + compile_args[9] = dispatch_d_settings.completion_queue_size; + compile_args[10] = mux_d_settings.cb_start_address; + compile_args[11] = mux_d_settings.cb_size_bytes; + compile_args[12] = dispatch_d_settings.producer_semaphore_id; // unused: local ds semaphore + compile_args[13] = mux_d_settings.consumer_semaphore_id; // unused: remote ds semaphore + compile_args[14] = sizeof(dispatch_packet_header_t); // preamble size + compile_args[15] = true; // is_dram_variant + compile_args[16] = false; // is_host_variant + break; + } + case MUX_D: + { + uint32_t num_dispatchers = device_worker_variants[DISPATCH_D].size(); + TT_ASSERT(device_worker_variants[MUX_D].size() == 1, "Cannot have more than one Mux D."); + auto &mux_d_settings = std::get<1>(device_worker_variants[MUX_D][0]); + auto dispatch_d_settings = std::get<1>(device_worker_variants[DISPATCH_D][0]); + + TT_ASSERT(num_dispatchers == mux_d_settings.semaphores.size(), "Mux D does not have required number of semaphores for Dispatchers. Expected = {}. Found = {}", num_dispatchers, mux_d_settings.semaphores.size()); + uint32_t sem = 0; + bool is_tunnel_end = device_worker_variants[US_TUNNELER_REMOTE].size() == 0; + + auto& compile_args = mux_d_settings.compile_args; + compile_args.resize(25); + compile_args[0] = 0; // 0: reserved + compile_args[1] = mux_d_settings.cb_start_address >> 4; // 1: rx_queue_start_addr_words + compile_args[2] = mux_d_settings.cb_size_bytes >> 4; // 2: rx_queue_size_words + compile_args[3] = is_tunnel_end ? 
1 : 2; // 3: mux_fan_in + uint32_t arg_index = 4; + compile_args[4] = packet_switch_4B_pack(dispatch_d_settings.worker_physical_core.x, + dispatch_d_settings.worker_physical_core.y, + 1, + DispatchRemoteNetworkType::NOC0); // 4,5,6,7: src x info + + if (!is_tunnel_end) { + TT_ASSERT(device_worker_variants[US_TUNNELER_REMOTE].size() == 1, "Unexpected number of ethernet tunnelers."); + auto &us_tunneler_remote_settings = std::get<1>(device_worker_variants[US_TUNNELER_REMOTE][0]); + compile_args[5] = packet_switch_4B_pack(us_tunneler_remote_settings.worker_physical_core.x, + us_tunneler_remote_settings.worker_physical_core.y, + 3, + DispatchRemoteNetworkType::NOC0); // 4,5,6,7: src x info + + } + + TT_ASSERT(device_worker_variants[US_TUNNELER_LOCAL].size() == 1, "Unexpected number of ethernet tunnelers."); + auto &tunneler_settings = std::get<1>(device_worker_variants[US_TUNNELER_LOCAL][0]); + + compile_args[8] = (tunneler_settings.cb_start_address + tunneler_settings.cb_size_bytes) >> 4; // 8: remote_tx_queue_start_addr_words + compile_args[9] = tunneler_settings.cb_size_bytes >> 4; // 9: remote_tx_queue_size_words + compile_args[10] = tunneler_settings.worker_physical_core.x; // 10: remote_tx_x + compile_args[11] = tunneler_settings.worker_physical_core.y; // 11: remote_tx_y + compile_args[12] = 1; // 12: remote_tx_queue_id + compile_args[13] = (uint32_t)DispatchRemoteNetworkType::NOC0; // 13: tx_network_type + compile_args[14] = BRISC_L1_RESULT_BASE; // 14: test_results_addr + compile_args[15] = 1024; // 15: test_results_size + compile_args[16] = 0; // 16: timeout_cycles + compile_args[17] = 0x0; // 17: output_depacketize + compile_args[18] = 0x0; // 18: output_depacketize info + + compile_args[19] = packet_switch_4B_pack(0x1, + dispatch_d_settings.cb_log_page_size, + dispatch_d_settings.producer_semaphore_id, // upstream sem + mux_d_settings.consumer_semaphore_id); // local sem + uint32_t src_id = 0xC1 + mux_d_settings.tunnel_stop - 1; + uint32_t dest_id = 0xD1 + mux_d_settings.tunnel_stop - 1; + compile_args[23] = packet_switch_4B_pack(src_id, src_id, src_id, src_id); // 23: packetized input src id + compile_args[24] = packet_switch_4B_pack(dest_id, dest_id, dest_id, dest_id); // 24: packetized input dest id + break; + } + } + } +} + +void Device::setup_tunnel_for_remote_devices() { + chip_id_t mmio_device_id = this->id_; + uint32_t num_tunnels = tt::Cluster::instance().get_mmio_device_tunnel_count(mmio_device_id); + if (num_tunnels == 0) { + //no remote device connected to this mmio device. + return; + } + + + tunnels_from_mmio_ = tt::Cluster::instance().get_tunnels_from_mmio_device(mmio_device_id); + uint32_t index = 0; + for (auto tunnel : tunnels_from_mmio_) { + for (auto remote_dev : tunnel) { + log_info(tt::LogMetal, "MMIO Device {} : Tunnel {} : Device {}", mmio_device_id, index, remote_dev); + } + index++; + } + + std::map>>> tunnel_dispatch_core_allocations = {}; + + uint32_t tunnel_id = 0; + for (auto &tunnel: tunnels_from_mmio_) { + std::vector>> tunnel_core_allocations = {}; + tunnel_core_allocations.resize(tt::tt_metal::DispatchWorkerType::COUNT); + + for (uint32_t tunnel_stop = 1; tunnel_stop < tunnel.size(); tunnel_stop++) { + //uint32_t tunnel_stop = tt::Cluster::instance().get_device_tunnel_depth(device_id); + chip_id_t device_id = tunnel[tunnel_stop]; + // a remote device. + // tunnel_stop hops away. 
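+            // Host-side command queue layout carved out of the hugepage for this remote device (computed below):
+            //   command_queue_start_addr      .. +CQ_START              : read/write pointer region
+            //   issue_queue_start_addr        .. +issue_queue_size      : issue queue
+            //   completion_queue_start_addr   .. +completion_queue_size : completion queue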
+ uint8_t num_hw_cqs = 1; + uint32_t cq_id = 0; + uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); + CoreType dispatch_core_type = dispatch_core_manager::get(num_hw_cqs).get_dispatch_core_type(mmio_device_id); + + worker_build_settings_t settings = {}; + //allocations below are on mmio chip. + settings.tunnel_stop = 0; + uint32_t cq_size = this->sysmem_manager().get_cq_size(); + settings.command_queue_start_addr = get_absolute_cq_offset(channel, cq_id, cq_size); + settings.issue_queue_start_addr = settings.command_queue_start_addr + CQ_START; + settings.issue_queue_size = this->sysmem_manager_->get_issue_queue_size(cq_id); + settings.completion_queue_start_addr = settings.issue_queue_start_addr + settings.issue_queue_size; + settings.completion_queue_size = this->sysmem_manager_->get_completion_queue_size(cq_id); + settings.dispatch_core_type = dispatch_core_type; + + tt_cxy_pair prefetch_location = dispatch_core_manager::get(num_hw_cqs).prefetcher_core(device_id, channel, cq_id); + settings.worker_physical_core = tt_cxy_pair(prefetch_location.chip, get_physical_core_coordinate(prefetch_location, dispatch_core_type)); + settings.kernel_file = "tt_metal/impl/dispatch/kernels/cq_prefetch.cpp"; + //prefetch needs three semaphores. + settings.semaphores.push_back(0); + settings.semaphores.push_back(dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_pages()); + settings.semaphores.push_back(0); + settings.producer_semaphore_id = 1; + tunnel_core_allocations[PREFETCH].push_back(std::make_tuple(prefetch_location, settings)); + + settings.semaphores.clear(); + tt_cxy_pair dispatch_location = dispatch_core_manager::get(num_hw_cqs).dispatcher_core(device_id, channel, cq_id); + settings.worker_physical_core = tt_cxy_pair(dispatch_location.chip, get_physical_core_coordinate(dispatch_location, dispatch_core_type)); + settings.kernel_file = "tt_metal/impl/dispatch/kernels/cq_dispatch.cpp"; + //dispatch needs one semaphore. + settings.semaphores.push_back(0); + settings.producer_semaphore_id = 0; + settings.consumer_semaphore_id = 0; + settings.cb_start_address = dispatch_constants::DISPATCH_BUFFER_BASE; + settings.cb_log_page_size = dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE; + settings.cb_pages = dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(); + settings.cb_size_bytes = (1 << settings.cb_log_page_size) * settings.cb_pages; + tunnel_core_allocations[DISPATCH].push_back(std::make_tuple(dispatch_location, settings)); + log_debug(LogMetal, "Device {} Channel {} : Dispatch: Issue Q Start Addr: {} - Completion Q Start Addr: {}", device_id, channel, settings.issue_queue_start_addr, settings.completion_queue_start_addr); + + if (tunnel_stop == 1) { + //need to allocate mux/demux on mmio chip only once. + //all tunnel stops, share the same mux/demux on mmio chip. + settings.semaphores.clear(); + //mux/demux need a semaphore per remote device in the tunnel. + //Tunnel includes the mmio device as well, so tunnel.size() - 1 is the number of remote devices. 
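+                    // e.g. a tunnel {mmio_dev, dev1, dev2} has tunnel.size() == 3, so mux and demux each get two semaphores, one per remote device.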
+ settings.semaphores.resize(tunnel.size()-1); + settings.producer_semaphore_id = 0; + settings.consumer_semaphore_id = 0; + tt_cxy_pair mux_location = dispatch_core_manager::get(num_hw_cqs).mux_core(device_id, channel, cq_id); + settings.worker_physical_core = tt_cxy_pair(mux_location.chip, get_physical_core_coordinate(mux_location, dispatch_core_type)); + settings.kernel_file = "tt_metal/impl/dispatch/kernels/packet_mux.cpp"; + settings.cb_start_address = dispatch_constants::DISPATCH_BUFFER_BASE; + settings.cb_size_bytes = dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_size(); + + tunnel_core_allocations[MUX].push_back(std::make_tuple(mux_location, settings)); + + tt_cxy_pair demux_location = dispatch_core_manager::get(num_hw_cqs).demux_core(device_id, channel, cq_id); + settings.worker_physical_core = tt_cxy_pair(demux_location.chip, get_physical_core_coordinate(demux_location, dispatch_core_type)); + settings.kernel_file = "tt_metal/impl/dispatch/kernels/packet_demux.cpp"; + settings.cb_start_address = L1_UNRESERVED_BASE; + settings.cb_size_bytes = 0x10000; + tunnel_core_allocations[DEMUX].push_back(std::make_tuple(demux_location, settings)); + } + + settings.tunnel_stop = tunnel_stop - 1; + settings.semaphores.clear(); + chip_id_t us_device = tunnel[tunnel_stop - 1]; + tt_cxy_pair us_location = dispatch_core_manager::get(num_hw_cqs).tunneler_core(us_device, device_id, channel, cq_id); + tt_cxy_pair local_location = dispatch_core_manager::get(num_hw_cqs).us_tunneler_core_local(device_id, channel, cq_id); + + settings.worker_physical_core = tt_cxy_pair(us_location.chip, get_physical_core_coordinate(us_location, CoreType::ETH)); + settings.eth_partner_physical_core = tt_cxy_pair(local_location.chip, get_physical_core_coordinate(local_location, CoreType::ETH)); + settings.kernel_file = "tt_metal/impl/dispatch/kernels/eth_tunneler.cpp"; + settings.cb_start_address = 0x19000; + settings.cb_size_bytes = 0x10000; + tunnel_core_allocations[US_TUNNELER_REMOTE].push_back(std::make_tuple(us_location, settings)); + + //all allocations below this are on a remote chip. + settings.tunnel_stop = tunnel_stop; + + //swap the two ethernet link pair cores for downstream chip on the link pair. 
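+            // US_TUNNELER_LOCAL runs on the remote end of the same ethernet link, so its worker core and ethernet
+            // partner core are the mirror image of the US_TUNNELER_REMOTE settings assigned above.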
+ tt_cxy_pair temp = settings.worker_physical_core; + settings.worker_physical_core = settings.eth_partner_physical_core; + settings.eth_partner_physical_core = temp; + settings.kernel_file = "tt_metal/impl/dispatch/kernels/eth_tunneler.cpp"; + tunnel_core_allocations[US_TUNNELER_LOCAL].push_back(std::make_tuple(local_location, settings)); + + TT_ASSERT(us_location.chip == us_device, + "Upstream Tunneler is on device {} but it is expected to be on device {}", us_location.chip, us_device); + TT_ASSERT(local_location.chip == device_id, + "Upstream Local Tunneler is on device {} but it is expected to be on device {}", local_location.chip, device_id); + + dispatch_core_type = dispatch_core_manager::get(num_hw_cqs).get_dispatch_core_type(device_id); + settings.dispatch_core_type = dispatch_core_type; + + tt_cxy_pair mux_d_location = dispatch_core_manager::get(num_hw_cqs).mux_d_core(device_id, channel, cq_id); + settings.worker_physical_core = tt_cxy_pair(mux_d_location.chip, get_physical_core_coordinate(mux_d_location, dispatch_core_type)); + settings.kernel_file = "tt_metal/impl/dispatch/kernels/packet_mux.cpp"; + settings.semaphores.push_back(0); + settings.consumer_semaphore_id = 0; + settings.cb_start_address = dispatch_constants::DISPATCH_BUFFER_BASE; + settings.cb_log_page_size = dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE; + settings.cb_pages = dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(); + settings.cb_size_bytes = (1 << settings.cb_log_page_size) * settings.cb_pages; + tunnel_core_allocations[MUX_D].push_back(std::make_tuple(mux_d_location, settings)); + + tt_cxy_pair demux_d_location = dispatch_core_manager::get(num_hw_cqs).demux_d_core(device_id, channel, cq_id); + settings.worker_physical_core = tt_cxy_pair(demux_d_location.chip, get_physical_core_coordinate(demux_d_location, dispatch_core_type)); + settings.kernel_file = "tt_metal/impl/dispatch/kernels/packet_demux.cpp"; + settings.producer_semaphore_id = 0; + settings.cb_start_address = L1_UNRESERVED_BASE; + settings.cb_size_bytes = 0x10000; + tunnel_core_allocations[DEMUX_D].push_back(std::make_tuple(demux_d_location, settings)); + + settings.semaphores.clear(); + uint32_t dispatch_buffer_pages = dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(); + settings.semaphores.push_back(0);// prefetch_d_sync_sem + settings.semaphores.push_back(0);// prefetch_d_upstream_cb_sem + settings.semaphores.push_back(dispatch_buffer_pages);// prefetch_d_downstream_cb_sem + settings.consumer_semaphore_id = 1; + settings.producer_semaphore_id = 2; + + tt_cxy_pair prefetch_d_location = dispatch_core_manager::get(num_hw_cqs).prefetcher_d_core(device_id, channel, cq_id); + settings.worker_physical_core = tt_cxy_pair(prefetch_d_location.chip, get_physical_core_coordinate(prefetch_d_location, dispatch_core_type)); + settings.kernel_file = "tt_metal/impl/dispatch/kernels/cq_prefetch.cpp"; + settings.cb_start_address = dispatch_constants::DISPATCH_BUFFER_BASE; + settings.cb_size_bytes = dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_size(); + settings.cb_pages = dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_pages(); + settings.cb_log_page_size = dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE; + tunnel_core_allocations[PREFETCH_D].push_back(std::make_tuple(prefetch_d_location, settings)); + + settings.semaphores.clear(); + settings.semaphores.push_back(0);// dispatch_sem + settings.semaphores.push_back(dispatch_buffer_pages);// dispatch_downstream_cb_sem + 
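+            // Semaphore ids below index the two semaphores just pushed: 0 = dispatch_sem (consumer side, paired
+            // with prefetch_d), 1 = dispatch_downstream_cb_sem (producer side, toward mux_d).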
settings.consumer_semaphore_id = 0; + settings.producer_semaphore_id = 1; + settings.cb_start_address = dispatch_constants::DISPATCH_BUFFER_BASE; + settings.cb_log_page_size = dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE; + settings.cb_pages = dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(); + settings.cb_size_bytes = (1 << settings.cb_log_page_size) * settings.cb_pages; + tt_cxy_pair dispatch_d_location = dispatch_core_manager::get(num_hw_cqs).dispatcher_d_core(device_id, channel, cq_id); + settings.worker_physical_core = tt_cxy_pair(dispatch_d_location.chip, get_physical_core_coordinate(dispatch_d_location, dispatch_core_type)); + settings.kernel_file = "tt_metal/impl/dispatch/kernels/cq_dispatch.cpp"; + tunnel_core_allocations[DISPATCH_D].push_back(std::make_tuple(dispatch_d_location, settings)); + } + tunnel_dispatch_core_allocations.insert(std::make_pair(tunnel_id, tunnel_core_allocations)); + tunnel_id++; + } + + //separate out all the dispatch workers on the tunnel into individual devices. + for (const auto& pair : tunnel_dispatch_core_allocations) { + std::map>>> device_dispatch_workers = {}; + for (uint32_t i = 0; i < pair.second.size(); i++) { + if (pair.second[i].size()) { + //some workers are allocated. + auto tunnel_workers = pair.second[i]; + for (auto &[worker, settings] : tunnel_workers) { + if (device_dispatch_workers.find(worker.chip) == device_dispatch_workers.end()) { + std::vector>> temp = {}; + temp.resize(tt::tt_metal::DispatchWorkerType::COUNT); + temp[i].push_back(std::make_tuple(worker, settings)); + device_dispatch_workers.insert(std::make_pair(worker.chip, temp)); + } else { + device_dispatch_workers[worker.chip][i].push_back(std::make_tuple(worker, settings)); + } + } + } + } + tunnel_device_dispatch_workers_.insert(std::make_pair(pair.first, device_dispatch_workers)); + } + + log_debug(LogMetal, "{} tunnels found.", tunnel_device_dispatch_workers_.size()); + + for (const auto& tunnel : tunnel_device_dispatch_workers_) { + for (const auto& pair : tunnel.second) { + for (uint32_t i = 0; i < pair.second.size(); i++) { + for (auto [core, settings] : pair.second[i]) { + log_debug(LogMetal, "Tunnel {} Device {} has {} on core {}.", tunnel.first, pair.first, magic_enum::enum_name((tt::tt_metal::DispatchWorkerType)i), core.str()); + } + } + } + } + + for (uint32_t t = 0; t < tunnels_from_mmio_.size(); t++) { + auto tunnel = tunnels_from_mmio_[t]; + TT_ASSERT(tunnel_device_dispatch_workers_.find(t) != tunnel_device_dispatch_workers_.end(), + "Tunnel {} not found on MMIO Device {}", t, mmio_device_id); + auto &tunnel_devices = tunnel_device_dispatch_workers_[t]; + for (uint32_t tunnel_stop = 0; tunnel_stop < tunnel.size(); tunnel_stop++) { + //last iteration is used to loop in tunnel workers that run on mmio device. + auto tunnel_device = tunnel[tunnel_stop]; + TT_ASSERT(tunnel_devices.find(tunnel_device) != tunnel_devices.end(), + "Device {} not found in Tunnel {} on MMIO Device {}", tunnel_device, t, mmio_device_id); + auto &device_worker_variants = tunnel_devices[tunnel_device]; + update_workers_build_settings(device_worker_variants); + + for (uint32_t dwv = 0; dwv < device_worker_variants.size(); dwv++) + { + if (device_worker_variants[dwv].size()) { + for (auto &[core, settings] : device_worker_variants[dwv]) { + log_debug(LogMetal, "Tunnel {} Stop {} is Device {}. 
Core {} - Physical {} will run {}.", t, tunnel_stop, tunnel_device, core.str(), settings.worker_physical_core.str(), magic_enum::enum_name((tt::tt_metal::DispatchWorkerType)dwv)); + for (uint32_t arg = 0; arg < settings.compile_args.size(); arg++) { + log_debug(LogMetal, "CompileArgs[{}] = {}", arg, settings.compile_args[arg]); + } + + } + } + } + } + } +} + void Device::compile_command_queue_programs() { ZoneScoped; unique_ptr command_queue_program_ptr(new Program); @@ -516,393 +1248,150 @@ void Device::compile_command_queue_programs() { } detail::CompileProgram(this, *command_queue_program_ptr); this->command_queue_programs.push_back(std::move(command_queue_program_ptr)); + this->setup_tunnel_for_remote_devices(); } else { - /////////////////Following section is for mmio device serving Remote Device - uint8_t num_hw_cqs = 1; uint32_t cq_id = 0; chip_id_t device_id = this->id(); chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id); Device *mmio_device = tt::tt_metal::detail::GetDeviceHandle(mmio_device_id); - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); - uint32_t cq_size = mmio_device->sysmem_manager().get_cq_size(); NOC noc_index = this->hw_command_queues_[cq_id]->noc_index; - CoreType dispatch_core_type = dispatch_core_manager::get(num_hw_cqs).get_dispatch_core_type(mmio_device_id); - tt_cxy_pair prefetch_core = dispatch_core_manager::get(num_hw_cqs).prefetcher_core(device_id, channel, cq_id); - tt_cxy_pair dispatch_core = dispatch_core_manager::get(num_hw_cqs).dispatcher_core(device_id, channel, cq_id); - bool dispatch_on_eth = dispatch_core_type == CoreType::ETH; - - TT_ASSERT(prefetch_core.chip == mmio_device_id and dispatch_core.chip == mmio_device_id, - "Prefetcher is on device {} and Dispatcher is on device {} but they are expected to be on device {}", prefetch_core.chip, dispatch_core.chip, mmio_device_id); - - CoreCoord prefetch_physical_core = get_physical_core_coordinate(prefetch_core, dispatch_core_type); - CoreCoord dispatch_physical_core = get_physical_core_coordinate(dispatch_core, dispatch_core_type); - - log_debug(LogDevice, "Dispatching out of {} cores", magic_enum::enum_name(dispatch_core_type)); - log_debug(LogDevice, "Prefetch H logical location: {} physical core: {}", prefetch_core.str(), prefetch_physical_core.str()); - log_debug(LogDevice, "Dispatch H logical location: {} physical core {}", dispatch_core.str(), dispatch_physical_core.str()); - - uint32_t command_queue_start_addr = get_absolute_cq_offset(channel, cq_id, cq_size); - uint32_t issue_queue_start_addr = command_queue_start_addr + CQ_START; - uint32_t issue_queue_size = mmio_device->sysmem_manager_->get_issue_queue_size(cq_id); - uint32_t completion_queue_start_addr = issue_queue_start_addr + issue_queue_size; - uint32_t completion_queue_size = mmio_device->sysmem_manager_->get_completion_queue_size(cq_id); - - tt_cxy_pair mux_core = dispatch_core_manager::get(num_hw_cqs).mux_core(device_id, channel, cq_id); - tt_cxy_pair demux_core = dispatch_core_manager::get(num_hw_cqs).demux_core(device_id, channel, cq_id); - tt_cxy_pair tunneler_location = dispatch_core_manager::get(num_hw_cqs).tunneler_core(device_id, channel, cq_id); - CoreCoord tunneler_core = CoreCoord(tunneler_location.x, tunneler_location.y); - TT_ASSERT(tunneler_location.chip == mmio_device_id, - "Tunneler is on device {} but it is expected to be on device {}", tunneler_location.chip, mmio_device_id); - CoreCoord r_tunneler_core = 
std::get<1>(tt::Cluster::instance().get_connected_ethernet_core(std::make_tuple(tunneler_location.chip, tunneler_core))); - CoreCoord r_tunneler_physical_core = this->ethernet_core_from_logical_core(r_tunneler_core); - - CoreCoord tunneler_physical_core = mmio_device->ethernet_core_from_logical_core(tunneler_location); - CoreCoord mux_physical_core = get_physical_core_coordinate(mux_core, dispatch_core_type); - CoreCoord demux_physical_core = get_physical_core_coordinate(demux_core, dispatch_core_type); - - uint32_t tunneler_queue_start_addr = 0x19000; - uint32_t tunneler_queue_size_bytes = 0x10000; - uint32_t tunneler_test_results_addr = 0x39000; - uint32_t tunneler_test_results_size = 0x7000; - constexpr uint32_t packetized_path_test_results_addr = BRISC_L1_RESULT_BASE; - constexpr uint32_t packetized_path_test_results_size = 1024; - - constexpr uint32_t src_endpoint_start_id = 0xaa; - constexpr uint32_t dest_endpoint_start_id = 0xbb; - - tt::tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, prefetch_core, 0, dispatch_core_type); // prefetch_sync_sem - tt::tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, prefetch_core, dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_pages(), dispatch_core_type); // prefetch_sem - tt::tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, prefetch_core, 0, dispatch_core_type); // prefetch_h_exec_buf_sem - - tt::tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, mux_core, 0, dispatch_core_type); // mux_sem - - tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, demux_core, 0, dispatch_core_type); //demux_sem - - constexpr uint32_t dispatch_h_cb_sem = 0; // remove it. - constexpr uint32_t dispatch_sem = 0; - tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, dispatch_core, 0, dispatch_core_type); // dispatch_sem - - std::vector prefetch_compile_args = { - dispatch_constants::DISPATCH_BUFFER_BASE, - dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE, - dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_pages(), - prefetch_sem, - mux_sem, - issue_queue_start_addr, - issue_queue_size, - dispatch_constants::PREFETCH_Q_BASE, - dispatch_constants::get(dispatch_core_type).prefetch_q_size(), - CQ_PREFETCH_Q_RD_PTR, - dispatch_constants::get(dispatch_core_type).cmddat_q_base(), - dispatch_constants::get(dispatch_core_type).cmddat_q_size(), - dispatch_constants::get(dispatch_core_type).scratch_db_base(), // unused for prefetch_h - dispatch_constants::get(dispatch_core_type).scratch_db_size(), // unused for prefetch_h - prefetch_sync_sem, // unused for prefetch_h - dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_pages(), // prefetch_d only - 0, // prefetch_d only - 0, //prefetch_downstream_cb_sem, // prefetch_d only - dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE, - dispatch_constants::PREFETCH_D_BUFFER_BLOCKS, // prefetch_d only - prefetch_h_exec_buf_sem, - false, // is_dram_variant - true // is_host_variant - }; - - configure_kernel_variant( - *mmio_command_queue_program_ptr, - "tt_metal/impl/dispatch/kernels/cq_prefetch.cpp", - prefetch_compile_args, - prefetch_core, - prefetch_physical_core, - dispatch_core_type, - CoreCoord{0, 0}, - mux_physical_core, - std::map {}, - noc_index - ); - - log_debug(LogDevice, "run prefetch_h {}", prefetch_core.str()); - - std::vector mux_compile_args = - { - 0, // 0: reserved - (dispatch_constants::DISPATCH_BUFFER_BASE >> 4), // 1: rx_queue_start_addr_words - (dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_size() >> 4), // 2: 
rx_queue_size_words - 1, // 3: mux_fan_in - packet_switch_4B_pack((uint32_t)prefetch_physical_core.x, - (uint32_t)prefetch_physical_core.y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: src 0 info - packet_switch_4B_pack(0, - 0, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: src 1 info - packet_switch_4B_pack(0, - 0, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 6: src 2 info - packet_switch_4B_pack(0, - 0, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 7: src 3 info - (tunneler_queue_start_addr >> 4), // 8: remote_tx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words - (uint32_t)tunneler_physical_core.x, // 10: remote_tx_x - (uint32_t)tunneler_physical_core.y, // 11: remote_tx_y - 0, // 12: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 13: tx_network_type - packetized_path_test_results_addr, // 14: test_results_addr - packetized_path_test_results_size, // 15: test_results_size - 0, // 16: timeout_cycles - 0x0,// 17: output_depacketize - 0x0,// 18: output_depacketize info - // 19: input 0 packetize info: - packet_switch_4B_pack(0x1, - dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE, - prefetch_sem, // upstream sem - mux_sem), // local sem - packet_switch_4B_pack(0, 0, 0, 0), // 20: input 1 packetize info - packet_switch_4B_pack(0, 0, 0, 0), // 21: input 2 packetize info - packet_switch_4B_pack(0, 0, 0, 0), // 22: input 3 packetize info - packet_switch_4B_pack(src_endpoint_start_id, 0, 0, 0), // 23: packetized input src id - packet_switch_4B_pack(dest_endpoint_start_id, 0, 0, 0), // 24: packetized input dest id - }; - - log_debug(LogDevice, "run mux at {}", mux_core.str()); - - configure_kernel_variant( - *mmio_command_queue_program_ptr, - "tt_metal/impl/dispatch/kernels/packet_mux.cpp", - mux_compile_args, - mux_core, - CoreCoord{0, 0}, - dispatch_core_type, - CoreCoord{0, 0}, - CoreCoord{0, 0}, - std::map {{"SKIP_NOC_LOGGING", "1"}}, - noc_index - ); - - std::vector tunneler_l_compile_args = - { - dest_endpoint_start_id, // 0: endpoint_id_start_index - 2, // tunnel_lanes. 1 => Unidirectional. 2 => Bidirectional. 
- (tunneler_queue_start_addr >> 4), // 2: rx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 3: rx_queue_size_words - packet_switch_4B_pack(r_tunneler_physical_core.x, - r_tunneler_physical_core.y, - 0, - (uint32_t)DispatchRemoteNetworkType::ETH), // 4: remote_receiver_0_info - packet_switch_4B_pack(demux_physical_core.x, - demux_physical_core.y, - 1,//num_dest_endpoints, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: remote_receiver_1_info - tunneler_queue_start_addr >> 4, // 6: remote_receiver_queue_start_addr_words 0 - tunneler_queue_size_bytes >> 4, // 7: remote_receiver_queue_size_words 0 - (L1_UNRESERVED_BASE >> 4), // 8: remote_receiver_queue_start_addr_words 1 - (0x10000 >> 4), // 9: remote_receiver_queue_size_words 1 - packet_switch_4B_pack(mux_physical_core.x, - mux_physical_core.y, - 1, // mux output queue id - (uint32_t)DispatchRemoteNetworkType::NOC0), // 10: remote_sender_0_info - packet_switch_4B_pack(r_tunneler_physical_core.x, - r_tunneler_physical_core.y, - 3, // r tunneler output queue id - (uint32_t)DispatchRemoteNetworkType::ETH), // 11: remote_sender_1_info - tunneler_test_results_addr, // 12: test_results_addr - tunneler_test_results_size, // 13: test_results_size - 0, // 14: timeout_cycles - }; + auto &tunnel_device_dispatch_workers = mmio_device->tunnel_device_dispatch_workers_; + auto &tunnels_from_mmio = mmio_device->tunnels_from_mmio_; - configure_kernel_variant( - *mmio_command_queue_program_ptr, - "tt_metal/impl/dispatch/kernels/eth_tunneler.cpp", - tunneler_l_compile_args, - tunneler_core, - CoreCoord{0, 0}, - CoreType::ETH, - CoreCoord{0, 0}, - CoreCoord{0, 0}, - std::map {{"SKIP_NOC_LOGGING", "1"}}, - noc_index, - true - ); - - log_debug(LogDevice, "run tunneler at {}", tunneler_location.str()); - - uint32_t dest_map_array[4] = {0, 1, 2, 3}; - uint64_t dest_endpoint_output_map = packet_switch_dest_pack(dest_map_array, 4); - std::vector demux_compile_args = - { - dest_endpoint_start_id, // 0: endpoint_id_start_index - (L1_UNRESERVED_BASE >> 4), // 1: rx_queue_start_addr_words - (0x10000 >> 4), // 2: rx_queue_size_words - 1, // 3: demux_fan_out - packet_switch_4B_pack(dispatch_physical_core.x, - dispatch_physical_core.y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: remote_tx_0_info - packet_switch_4B_pack(0, - 0, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: remote_tx_1_info - packet_switch_4B_pack(0, - 0, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 6: remote_tx_2_info - packet_switch_4B_pack(0, - 0, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 7: remote_tx_3_info - (dispatch_constants::DISPATCH_BUFFER_BASE >> 4), // 8: remote_tx_queue_start_addr_words 0 - ((1 << dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE)*dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages()) >> 4, // 9: remote_tx_queue_size_words 0 - 0, // 10: remote_tx_queue_start_addr_words 1 - 0, // 11: remote_tx_queue_size_words 1 - 0, // 12: remote_tx_queue_start_addr_words 2 - 0, // 13: remote_tx_queue_size_words 2 - 0, // 14: remote_tx_queue_start_addr_words 3 - 0, // 15: remote_tx_queue_size_words 3 - (uint32_t)tunneler_physical_core.x, // 16: remote_rx_x - (uint32_t)tunneler_physical_core.y, // 17: remote_rx_y - 3, // 18: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 19: tx_network_type - (uint32_t)(dest_endpoint_output_map >> 32), // 20: dest_endpoint_output_map_hi - (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF), // 21: dest_endpoint_output_map_lo - packetized_path_test_results_addr, // 22: 
test_results_addr - packetized_path_test_results_size, // 23: test_results_size - 0, // 24: timeout_cycles - 0x1, // 25: output_depacketize_mask - // 26: output 0 packetize info: - packet_switch_4B_pack(dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE, - dispatch_sem, // downstream sem - demux_sem, // local sem - 1), // remove header - packet_switch_4B_pack(0, 0, 0, 0), // 27: output 1 packetize info - packet_switch_4B_pack(0, 0, 0, 0), // 28: output 2 packetize info - packet_switch_4B_pack(0, 0, 0, 0), // 29: output 3 packetize info - }; + std::vector>> device_worker_variants; + std::vector>> mmio_device_worker_variants; + uint32_t tunnel_id = 0; + for (auto tunnel : tunnel_device_dispatch_workers) { + TT_ASSERT(tunnel.second.find(mmio_device_id) != tunnel.second.end(), "MMIO Device {} not found in tunnel map.", mmio_device_id); + if (tunnel.second.find(device_id) != tunnel.second.end()) { + tunnel_id = tunnel.first; + device_worker_variants = tunnel.second[device_id]; + mmio_device_worker_variants = tunnel.second[mmio_device_id]; + break; + } + } + TT_ASSERT(device_worker_variants.size() != 0, "No worker variants found for Device {}.", device_id); + + //determine if it's the first tunnel stop. + //FD2 kernels running on the mmio device are launched with the first tunnel stop. + bool first_tunnel_stop = true; + auto tunnel = tunnels_from_mmio[tunnel_id]; + for (uint32_t ts = 1; ts < tunnel.size(); ts++) { + if (tunnel[ts] == device_id) { + first_tunnel_stop = ts == 1; + break; + } + TT_ASSERT(ts < (tunnel.size() - 1), "Device {} tunnel stop cannot be determined on tunnel {}.", device_id, tunnel_id); + } - configure_kernel_variant( - *mmio_command_queue_program_ptr, - "tt_metal/impl/dispatch/kernels/packet_demux.cpp", - demux_compile_args, - demux_core, - CoreCoord{0, 0}, - dispatch_core_type, - CoreCoord{0, 0}, - CoreCoord{0, 0}, - std::map {{"SKIP_NOC_LOGGING", "1"}}, - noc_index - ); + if (first_tunnel_stop) { + /////////////////Following section is for mmio device serving Remote Device + for (auto [prefetch_core, prefetch_settings] : mmio_device_worker_variants[PREFETCH]) { + //auto [prefetch_core, prefetch_settings] = mmio_device_worker_variants[PREFETCH][0]; + for (auto sem : prefetch_settings.semaphores) { + //size of semaphores vector is number of needed semaphores on the core. + //Value of each vector entry is the initialization value for the semaphore.
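+ //For example (assumed values): prefetch_settings.semaphores = {0, prefetch_d_buffer_pages, 0} would recreate the prefetch_sync_sem, prefetch_sem and prefetch_h_exec_buf_sem triple that the removed code above created explicitly.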
+ tt::tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, prefetch_core, sem, prefetch_settings.dispatch_core_type); + } + configure_kernel_variant( + *mmio_command_queue_program_ptr, + prefetch_settings.kernel_file, + prefetch_settings.compile_args, + prefetch_core, + prefetch_settings.worker_physical_core, + prefetch_settings.dispatch_core_type, + prefetch_settings.upstream_cores[0], + prefetch_settings.downstream_cores[0], + std::map {}, + noc_index + ); + } - log_debug(LogDevice, "run dispatch demux at {}", demux_core.str()); - - std::vector dispatch_compile_args = { - dispatch_constants::DISPATCH_BUFFER_BASE, - dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE, - dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(), - dispatch_sem, - demux_sem, - dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS, - prefetch_sync_sem, - command_queue_start_addr, - completion_queue_start_addr, - completion_queue_size, - dispatch_constants::DISPATCH_BUFFER_BASE, - (1 << dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE) * dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(), - 0, // unused: local ds semaphore - 0, // unused: remote ds semaphore - 0, // preamble size. unused unless tunneler is between h and d - false, // is_dram_variant - true // is_host_variant - }; + auto [mux_core, mux_settings] = mmio_device_worker_variants[MUX][0]; + for (auto sem : mux_settings.semaphores) { + //size of semaphores vector is number of needed semaphores on the core. + //Value of each vector entry is the initialization value for the semaphore. + tt::tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, mux_core, sem, mux_settings.dispatch_core_type); + } + configure_kernel_variant( + *mmio_command_queue_program_ptr, + mux_settings.kernel_file, + mux_settings.compile_args, + mux_core, + CoreCoord{0, 0}, + mux_settings.dispatch_core_type, + CoreCoord{0, 0}, + CoreCoord{0, 0}, + std::map {{"SKIP_NOC_LOGGING", "1"}}, + noc_index + ); - configure_kernel_variant( - *mmio_command_queue_program_ptr, - "tt_metal/impl/dispatch/kernels/cq_dispatch.cpp", - dispatch_compile_args, - dispatch_core, - dispatch_physical_core, - dispatch_core_type, - demux_physical_core, - CoreCoord{0xffffffff, 0xffffffff}, - std::map {}, - noc_index - ); + auto [tunneler_core, tunneler_settings] = mmio_device_worker_variants[US_TUNNELER_REMOTE][0]; + configure_kernel_variant( + *mmio_command_queue_program_ptr, + tunneler_settings.kernel_file, + tunneler_settings.compile_args, + tunneler_core, + CoreCoord{0, 0}, + CoreType::ETH, + CoreCoord{0, 0}, + CoreCoord{0, 0}, + std::map {{"SKIP_NOC_LOGGING", "1"}}, + noc_index, + true + ); - log_debug(LogDevice, "run dispatch_h at {}", dispatch_core.str()); + auto [demux_core, demux_settings] = mmio_device_worker_variants[DEMUX][0]; + for (auto sem : demux_settings.semaphores) { + //size of semaphores vector is number of needed semaphores on the core. + //Value of each vector entry is the initialization value for the semaphore. 
+ tt::tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, demux_core, sem, demux_settings.dispatch_core_type); + } + configure_kernel_variant( + *mmio_command_queue_program_ptr, + demux_settings.kernel_file, + demux_settings.compile_args, + demux_core, + CoreCoord{0, 0}, + demux_settings.dispatch_core_type, + CoreCoord{0, 0}, + CoreCoord{0, 0}, + std::map {{"SKIP_NOC_LOGGING", "1"}}, + noc_index + ); + for (auto [dispatch_core, dispatch_settings] : mmio_device_worker_variants[DISPATCH]) { + //auto [dispatch_core, dispatch_settings] = mmio_device_worker_variants[DISPATCH][0]; + for (auto sem : dispatch_settings.semaphores) { + //size of semaphores vector is number of needed semaphores on the core. + //Value of each vector entry is the initialization value for the semaphore. + tt::tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, dispatch_core, sem, dispatch_settings.dispatch_core_type); + } + configure_kernel_variant( + *mmio_command_queue_program_ptr, + dispatch_settings.kernel_file, + dispatch_settings.compile_args, + dispatch_core, + dispatch_settings.worker_physical_core, + dispatch_settings.dispatch_core_type, + dispatch_settings.upstream_cores[0], + CoreCoord{0xffffffff, 0xffffffff}, + std::map {}, + noc_index + ); + } + } /////////////////Following section is for Remote Device - dispatch_core_type = dispatch_core_manager::get(num_hw_cqs).get_dispatch_core_type(device_id); - dispatch_on_eth = dispatch_core_type == CoreType::ETH; - - uint32_t dispatch_buffer_pages = dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(); - uint32_t mux_queue_start_addr = dispatch_constants::DISPATCH_BUFFER_BASE; - uint32_t mux_queue_size_bytes = (1 << dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE)*dispatch_buffer_pages; - // Packetized path buffer, can be at any available address. 
- constexpr uint32_t demux_queue_start_addr = L1_UNRESERVED_BASE; - constexpr uint32_t demux_queue_size_bytes = 0x10000; - - tt_cxy_pair mux_d_core = dispatch_core_manager::get(num_hw_cqs).mux_d_core(device_id, channel, cq_id); - CoreCoord mux_d_physical_core = get_physical_core_coordinate(mux_d_core, dispatch_core_type); - tt_cxy_pair demux_d_core = dispatch_core_manager::get(num_hw_cqs).demux_d_core(device_id, channel, cq_id); - CoreCoord demux_d_physical_core = get_physical_core_coordinate(demux_d_core, dispatch_core_type); - - tt_cxy_pair prefetch_d_core = dispatch_core_manager::get(num_hw_cqs).prefetcher_d_core(device_id, channel, cq_id); - CoreCoord prefetch_d_physical_core = get_physical_core_coordinate(prefetch_d_core, dispatch_core_type); - - dispatch_core = dispatch_core_manager::get(num_hw_cqs).dispatcher_d_core(device_id, channel, cq_id); - dispatch_physical_core = get_physical_core_coordinate(dispatch_core, dispatch_core_type); - - tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, prefetch_d_core, 0, dispatch_core_type); // prefetch_d_sync_sem - tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, prefetch_d_core, 0, dispatch_core_type); // prefetch_d_upstream_cb_sem - tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, prefetch_d_core, dispatch_buffer_pages, dispatch_core_type); // prefetch_d_downstream_cb_sem - - tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, {demux_d_core}, 0, dispatch_core_type); // demux_sem - - tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, dispatch_core, 0, dispatch_core_type); // dispatch_sem - tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, dispatch_core, dispatch_buffer_pages, dispatch_core_type); // dispatch_downstream_cb_sem - - tt_metal::CreateSemaphore(*command_queue_program_ptr, mux_d_core, 0, dispatch_core_type); // mux_sem - - std::vector tunneler_r_compile_args = - { - dest_endpoint_start_id, // 0: endpoint_id_start_index - 2, // tunnel_lanes. 1 => Unidirectional. 2 => Bidirectional. - (tunneler_queue_start_addr >> 4), // 2: rx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 3: rx_queue_size_words - packet_switch_4B_pack(demux_d_physical_core.x, - demux_d_physical_core.y, - 1, //num_dest_endpoints, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: remote_receiver_0_info - packet_switch_4B_pack(tunneler_physical_core.x, - tunneler_physical_core.y, - 1, - (uint32_t)DispatchRemoteNetworkType::ETH), // 5: remote_receiver_1_info - (L1_UNRESERVED_BASE >> 4), // 6: remote_receiver_queue_start_addr_words 0 - (0x10000 >> 4), // 7: remote_receiver_queue_size_words 0 - (tunneler_queue_start_addr + tunneler_queue_size_bytes) >> 4, // 8: remote_receiver_queue_start_addr_words 1 - tunneler_queue_size_bytes >> 4, // 9: remote_receiver_queue_size_words 1 - packet_switch_4B_pack(tunneler_physical_core.x, - tunneler_physical_core.y, - 2, - (uint32_t)DispatchRemoteNetworkType::ETH), // 10: remote_sender_0_info - packet_switch_4B_pack(mux_d_physical_core.x, - mux_d_physical_core.y, - 1, //num_dest_endpoints, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 11: remote_sender_1_info - tunneler_test_results_addr, // 12: test_results_addr - tunneler_test_results_size, // 13: test_results_size - 0, // 14: timeout_cycles - }; + //Upstream device tunneler. Goes towards MMIO Device. 
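+ //Note: per DispatchWorkerType, US_TUNNELER_LOCAL holds this device's upstream tunneler endpoint, while US_TUNNELER_REMOTE (checked below) holds the downstream one towards the tunnel end and appears to be populated only when this device is not the last tunnel stop.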
+ auto [us_tunneler_core, us_tunneler_settings] = device_worker_variants[US_TUNNELER_LOCAL][0]; configure_kernel_variant( *command_queue_program_ptr, - "tt_metal/impl/dispatch/kernels/eth_tunneler.cpp", - tunneler_r_compile_args, - r_tunneler_core, + us_tunneler_settings.kernel_file, + us_tunneler_settings.compile_args, + us_tunneler_core, CoreCoord{0, 0}, CoreType::ETH, CoreCoord{0, 0}, @@ -912,221 +1401,106 @@ void Device::compile_command_queue_programs() { true ); - log_debug(LogDevice, "run tunneler at device {} Core {}", this->id(), r_tunneler_core.str()); - - std::vector demux_d_compile_args = - { - dest_endpoint_start_id, // 0: endpoint_id_start_index - (L1_UNRESERVED_BASE >> 4), // 1: rx_queue_start_addr_words - (0x10000 >> 4), // 2: rx_queue_size_words - 1, // 3: demux_fan_out - packet_switch_4B_pack(prefetch_d_physical_core.x, - prefetch_d_physical_core.y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: remote_tx_0_info - packet_switch_4B_pack(0, - 0, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: remote_tx_1_info - packet_switch_4B_pack(0, - 0, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 6: remote_tx_2_info - packet_switch_4B_pack(0, - 0, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 7: remote_tx_3_info - (dispatch_constants::DISPATCH_BUFFER_BASE >> 4), // 8: remote_tx_queue_start_addr_words 0 - dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_size() >> 4, // 9: remote_tx_queue_size_words 0 - 0, // 10: remote_tx_queue_start_addr_words 1 - 0, // 11: remote_tx_queue_size_words 1 - 0, // 12: remote_tx_queue_start_addr_words 2 - 0, // 13: remote_tx_queue_size_words 2 - 0, // 14: remote_tx_queue_start_addr_words 3 - 0, // 15: remote_tx_queue_size_words 3 - (uint32_t)r_tunneler_physical_core.x, // 16: remote_rx_x - (uint32_t)r_tunneler_physical_core.y, // 17: remote_rx_y - 2, // 18: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 19: tx_network_type - (uint32_t)(dest_endpoint_output_map >> 32), // 20: dest_endpoint_output_map_hi - (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF), // 21: dest_endpoint_output_map_lo - packetized_path_test_results_addr, // 22: test_results_addr - packetized_path_test_results_size, // 23: test_results_size - 0, // 24: timeout_cycles - 0x1, // 25: output_depacketize_mask - // 26: output 0 packetize info: - packet_switch_4B_pack(dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE, - prefetch_d_upstream_cb_sem, // downstream sem - demux_sem, // local sem - 0), - packet_switch_4B_pack(0, 0, 0, 0), // 27: output 1 packetize info - packet_switch_4B_pack(0, 0, 0, 0), // 28: output 2 packetize info - packet_switch_4B_pack(0, 0, 0, 0), // 29: output 3 packetize info - }; + //Downstream device tunneler. Goes towards tunnel end. + if (device_worker_variants[US_TUNNELER_REMOTE].size()) { + auto [ds_tunneler_core, ds_tunneler_settings] = device_worker_variants[US_TUNNELER_REMOTE][0]; + configure_kernel_variant( + *command_queue_program_ptr, + ds_tunneler_settings.kernel_file, + ds_tunneler_settings.compile_args, + ds_tunneler_core, + CoreCoord{0, 0}, + CoreType::ETH, + CoreCoord{0, 0}, + CoreCoord{0, 0}, + std::map {{"SKIP_NOC_LOGGING", "1"}}, + noc_index, + true + ); + } + auto [demux_d_core, demux_d_settings] = device_worker_variants[DEMUX_D][0]; + for (auto sem : demux_d_settings.semaphores) { + //size of semaphores vector is number of needed semaphores on the core. + //Value of each vector entry is the initialization value for the semaphore. 
+ tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, demux_d_core, sem, demux_d_settings.dispatch_core_type); + } configure_kernel_variant( *command_queue_program_ptr, - "tt_metal/impl/dispatch/kernels/packet_demux.cpp", - demux_d_compile_args, + demux_d_settings.kernel_file, + demux_d_settings.compile_args, demux_d_core, CoreCoord{0, 0}, - dispatch_core_type, + demux_d_settings.dispatch_core_type, CoreCoord{0, 0}, CoreCoord{0, 0}, std::map {{"SKIP_NOC_LOGGING", "1"}}, noc_index ); - log_debug(LogDevice, "run demux at {}", demux_d_core.str()); - - // prefetch_d - uint32_t scratch_db_base = (dispatch_constants::DISPATCH_BUFFER_BASE + dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_size() - + PCIE_ALIGNMENT - 1) & (~(PCIE_ALIGNMENT - 1)); - uint32_t scratch_db_size = dispatch_constants::get(dispatch_core_type).scratch_db_size(); - const uint32_t l1_size = dispatch_core_type == CoreType::WORKER ? MEM_L1_SIZE : MEM_ETH_SIZE; - - TT_ASSERT(scratch_db_base + scratch_db_size <= l1_size); - - std::vector prefetch_d_compile_args = { - dispatch_constants::DISPATCH_BUFFER_BASE, // overridden below for prefetch_h - dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE, // overridden below for prefetch_h - dispatch_buffer_pages, // overridden below for prefetch_h - prefetch_d_downstream_cb_sem, // overridden below for prefetch_d - dispatch_sem, // overridden below for prefetch_h - 0, //issue_queue_start_addr, - 0, //issue_queue_size, - 0, //prefetch_q_base, - dispatch_constants::get(dispatch_core_type).prefetch_q_size(), - CQ_PREFETCH_Q_RD_PTR, - dispatch_constants::DISPATCH_BUFFER_BASE, // overridden for split below - dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_size(), // overridden for split below - scratch_db_base, // scratch_db_base filled in below if used - scratch_db_size, - prefetch_d_sync_sem, - dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_pages(), // prefetch_d only - prefetch_d_upstream_cb_sem, // prefetch_d only my upstream - demux_sem, // prefetch_d only upstream - dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE, - dispatch_constants::PREFETCH_D_BUFFER_BLOCKS, // prefetch_d only - prefetch_h_exec_buf_sem, - true, - false - }; - + auto [prefetch_d_core, prefetch_d_settings] = device_worker_variants[PREFETCH_D][0]; + for (auto sem : prefetch_d_settings.semaphores) { + //size of semaphores vector is number of needed semaphores on the core. + //Value of each vector entry is the initialization value for the semaphore. 
+ tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, prefetch_d_core, sem, prefetch_d_settings.dispatch_core_type); + } configure_kernel_variant( *command_queue_program_ptr, - "tt_metal/impl/dispatch/kernels/cq_prefetch.cpp", - prefetch_d_compile_args, + prefetch_d_settings.kernel_file, + prefetch_d_settings.compile_args, prefetch_d_core, - prefetch_d_physical_core, - dispatch_core_type, - demux_d_physical_core, - dispatch_physical_core, + prefetch_d_settings.worker_physical_core, + prefetch_d_settings.dispatch_core_type, + prefetch_d_settings.upstream_cores[0], + prefetch_d_settings.downstream_cores[0], std::map {}, noc_index ); - log_debug(LogDevice, "run prefertch_d at {}", prefetch_d_core.str()); - - std::vector dispatch_d_compile_args = { - dispatch_constants::DISPATCH_BUFFER_BASE, - dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE, - dispatch_buffer_pages, - dispatch_sem, - prefetch_d_downstream_cb_sem, - dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS, - prefetch_d_sync_sem, - 128, - 128 + 256 * 1024 * 1024, - 256 * 1024 * 1024, - dispatch_constants::DISPATCH_BUFFER_BASE, - (1 << dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE) * dispatch_buffer_pages, - dispatch_downstream_cb_sem, // unused on hd, filled in below for h and d - mux_sem, // unused on hd, filled in below for h and d - sizeof(dispatch_packet_header_t), // unused unless tunneler is between h and d - true, // is_dram_variant - false // is_host_variant - }; - + auto [dispatch_d_core, dispatch_d_settings] = device_worker_variants[DISPATCH_D][0]; + for (auto sem : dispatch_d_settings.semaphores) { + //size of semaphores vector is number of needed semaphores on the core. + //Value of each vector entry is the initialization value for the semaphore. + tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, dispatch_d_core, sem, dispatch_d_settings.dispatch_core_type); + } configure_kernel_variant( *command_queue_program_ptr, - "tt_metal/impl/dispatch/kernels/cq_dispatch.cpp", - dispatch_d_compile_args, - dispatch_core, - dispatch_physical_core, - dispatch_core_type, - prefetch_d_physical_core, - mux_d_physical_core, + dispatch_d_settings.kernel_file, + dispatch_d_settings.compile_args, + dispatch_d_core, + dispatch_d_settings.worker_physical_core, + dispatch_d_settings.dispatch_core_type, + dispatch_d_settings.upstream_cores[0], + dispatch_d_settings.downstream_cores[0], std::map {}, noc_index ); - log_debug(LogDevice, "run dispatch at {}", dispatch_core.str()); - - std::vector mux_d_compile_args = - { - 0, // 0: reserved - (mux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 2: rx_queue_size_words - 1, // 3: mux_fan_in - packet_switch_4B_pack((uint32_t)dispatch_physical_core.x, - (uint32_t)dispatch_physical_core.y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: src 0 info - packet_switch_4B_pack(0, - 0, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: src 1 info - packet_switch_4B_pack(0, - 0, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 6: src 2 info - packet_switch_4B_pack(0, - 0, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 7: src 3 info - ((tunneler_queue_start_addr + tunneler_queue_size_bytes) >> 4), // 8: remote_tx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words - (uint32_t)r_tunneler_physical_core.x, // 10: remote_tx_x - (uint32_t)r_tunneler_physical_core.y, // 11: remote_tx_y - 1, // 12: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 13: tx_network_type - 
packetized_path_test_results_addr, // 14: test_results_addr - packetized_path_test_results_size, // 15: test_results_size - 0, // 16: timeout_cycles - 0x0,// 17: output_depacketize - 0x0,// 18: output_depacketize info - // 19: input 0 packetize info: - packet_switch_4B_pack(0x1, - dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE, - dispatch_downstream_cb_sem, // upstream sem - mux_sem), // local sem - packet_switch_4B_pack(0, 0, 0, 0), // 20: input 1 packetize info - packet_switch_4B_pack(0, 0, 0, 0), // 21: input 2 packetize info - packet_switch_4B_pack(0, 0, 0, 0), // 22: input 3 packetize info - packet_switch_4B_pack(src_endpoint_start_id, 0, 0, 0), // 23: packetized input src id - packet_switch_4B_pack(dest_endpoint_start_id, 0, 0, 0), // 24: packetized input dest id - }; - + auto [mux_d_core, mux_d_settings] = device_worker_variants[MUX_D][0]; + for (auto sem : mux_d_settings.semaphores) { + //size of semaphores vector is number of needed semaphores on the core. + //Value of each vector entry is the initialization value for the semaphore. + tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, mux_d_core, sem, mux_d_settings.dispatch_core_type); + } configure_kernel_variant( *command_queue_program_ptr, - "tt_metal/impl/dispatch/kernels/packet_mux.cpp", - mux_d_compile_args, + mux_d_settings.kernel_file, + mux_d_settings.compile_args, mux_d_core, CoreCoord{0, 0}, - dispatch_core_type, + mux_d_settings.dispatch_core_type, CoreCoord{0, 0}, CoreCoord{0, 0}, std::map {{"SKIP_NOC_LOGGING", "1"}}, noc_index ); - log_debug(LogDevice, "run mux at {}", mux_d_core.str()); - detail::CompileProgram(this, *command_queue_program_ptr); this->command_queue_programs.push_back(std::move(command_queue_program_ptr)); - detail::CompileProgram(mmio_device, *mmio_command_queue_program_ptr); - this->command_queue_programs.push_back(std::move(mmio_command_queue_program_ptr)); + if (first_tunnel_stop) { + detail::CompileProgram(mmio_device, *mmio_command_queue_program_ptr); + this->command_queue_programs.push_back(std::move(mmio_command_queue_program_ptr)); + } } } @@ -1136,6 +1510,7 @@ void Device::configure_command_queue_programs() { chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id); Device *mmio_device = tt::tt_metal::detail::GetDeviceHandle(mmio_device_id); uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); + log_debug(tt::LogMetal, "Device {} - Channel {}", this->id_, channel); std::vector zero = {0x0}; // Reset state in case L1 Clear is disabled. std::vector pointers(CQ_START / sizeof(uint32_t), 0); @@ -1144,7 +1519,8 @@ void Device::configure_command_queue_programs() { if (this->is_mmio_capable()) { TT_ASSERT(this->command_queue_programs.size() == 1); } else { - TT_ASSERT(this->command_queue_programs.size() == 2); + uint32_t program_size = tt::Cluster::instance().get_device_tunnel_depth(device_id) == 1 ? 
2 : 1; + TT_ASSERT(this->command_queue_programs.size() == program_size); } Program& command_queue_program = *this->command_queue_programs[0]; @@ -1156,7 +1532,7 @@ void Device::configure_command_queue_programs() { pointers[HOST_CQ_ISSUE_READ_PTR / sizeof(uint32_t)] = (CQ_START + get_absolute_cq_offset(channel, cq_id, cq_size)) >> 4; pointers[HOST_CQ_COMPLETION_WRITE_PTR / sizeof(uint32_t)] = (CQ_START + this->sysmem_manager_->get_issue_queue_size(cq_id) + get_absolute_cq_offset(channel, cq_id, cq_size)) >> 4; - tt::Cluster::instance().write_sysmem(pointers.data(), pointers.size() * sizeof(uint32_t), cq_id * cq_size, mmio_device_id, channel); + tt::Cluster::instance().write_sysmem(pointers.data(), pointers.size() * sizeof(uint32_t), get_absolute_cq_offset(channel, cq_id, cq_size), mmio_device_id, get_umd_channel(channel)); } uint8_t num_hw_cqs = device_id == mmio_device_id ? this->num_hw_cqs() : 1; @@ -1178,7 +1554,7 @@ void Device::configure_command_queue_programs() { detail::WriteToDeviceL1(mmio_device, prefetch_location, dispatch_constants::PREFETCH_Q_BASE, prefetch_q, dispatch_core_type); // Initialize completion queue write pointer and read pointer copy - uint32_t issue_queue_size = mmio_device->sysmem_manager_->get_issue_queue_size(cq_id); + uint32_t issue_queue_size = this->sysmem_manager_->get_issue_queue_size(cq_id); uint32_t completion_queue_start_addr = CQ_START + issue_queue_size + get_absolute_cq_offset(channel, cq_id, cq_size); uint32_t completion_queue_start_addr_16B = completion_queue_start_addr >> 4; vector completion_queue_wr_ptr = {completion_queue_start_addr_16B}; @@ -1200,9 +1576,12 @@ void Device::configure_command_queue_programs() { detail::ConfigureDeviceWithProgram(this, command_queue_program, true); tt::Cluster::instance().l1_barrier(this->id()); if (device_id != mmio_device_id) { - Program& mmio_command_queue_program = *this->command_queue_programs[1]; - detail::ConfigureDeviceWithProgram(mmio_device, mmio_command_queue_program, true); - tt::Cluster::instance().l1_barrier(mmio_device_id); + if (tt::Cluster::instance().get_device_tunnel_depth(device_id) == 1) { + //first or only remote device on the tunnel, launch fd2 kernels on mmio device for all remote devices. + Program& mmio_command_queue_program = *this->command_queue_programs[1]; + detail::ConfigureDeviceWithProgram(mmio_device, mmio_command_queue_program, true); + tt::Cluster::instance().l1_barrier(mmio_device_id); + } } } @@ -1221,7 +1600,8 @@ void Device::initialize_command_queue() { if (this->is_mmio_capable()) { TT_ASSERT(this->command_queue_programs.size() == 1); } else { - TT_ASSERT(this->command_queue_programs.size() == 2); + uint32_t program_size = tt::Cluster::instance().get_device_tunnel_depth(this->id()) == 1 ? 
2 : 1; + TT_ASSERT(this->command_queue_programs.size() == program_size); } this->configure_command_queue_programs(); Program& command_queue_program = *this->command_queue_programs[0]; @@ -1236,13 +1616,15 @@ void Device::initialize_command_queue() { } if (!this->is_mmio_capable()) { - chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->id()); - Device *mmio_device = tt::tt_metal::detail::GetDeviceHandle(mmio_device_id); - Program& mmio_command_queue_program = *this->command_queue_programs[1]; - for (const auto &[core_type, logical_dispatch_cores] : mmio_command_queue_program.logical_cores()) { - for (const CoreCoord &logical_dispatch_core : logical_dispatch_cores) { - launch_msg_t msg = mmio_command_queue_program.kernels_on_core(logical_dispatch_core, core_type)->launch_msg; - tt::llrt::write_launch_msg_to_core(mmio_device_id, mmio_device->physical_core_from_logical_core(logical_dispatch_core, core_type), &msg); + if (tt::Cluster::instance().get_device_tunnel_depth(this->id()) == 1) { + chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->id()); + Device *mmio_device = tt::tt_metal::detail::GetDeviceHandle(mmio_device_id); + Program& mmio_command_queue_program = *this->command_queue_programs[1]; + for (const auto &[core_type, logical_dispatch_cores] : mmio_command_queue_program.logical_cores()) { + for (const CoreCoord &logical_dispatch_core : logical_dispatch_cores) { + launch_msg_t msg = mmio_command_queue_program.kernels_on_core(logical_dispatch_core, core_type)->launch_msg; + tt::llrt::write_launch_msg_to_core(mmio_device_id, mmio_device->physical_core_from_logical_core(logical_dispatch_core, core_type), &msg); + } } } } diff --git a/tt_metal/impl/device/device.hpp b/tt_metal/impl/device/device.hpp index 12df80a6bee..f69fab339e8 100644 --- a/tt_metal/impl/device/device.hpp +++ b/tt_metal/impl/device/device.hpp @@ -159,6 +159,10 @@ class Device { return tt::Cluster::instance().get_associated_mmio_device(this->id_) == this->id_; } + void setup_tunnel_for_remote_devices(); + + void update_workers_build_settings(std::vector>> &device_worker_variants); + uint32_t num_banks(const BufferType &buffer_type) const; uint32_t bank_size(const BufferType &buffer_type) const; @@ -263,6 +267,8 @@ class Device { uint32_t build_key_; std::unique_ptr allocator_ = nullptr; bool initialized_ = false; + std::map>>>> tunnel_device_dispatch_workers_; + std::vector> tunnels_from_mmio_; JitBuildEnv build_env_; JitBuildStateSet firmware_build_states_; diff --git a/tt_metal/impl/device/multi_device.cpp b/tt_metal/impl/device/multi_device.cpp index d1662ebae5c..efede66d072 100644 --- a/tt_metal/impl/device/multi_device.cpp +++ b/tt_metal/impl/device/multi_device.cpp @@ -26,9 +26,38 @@ DeviceMesh::DeviceMesh(const DeviceGrid& device_grid, const DeviceIds &device_id //TODO: for DevicePool feature delete CreateDevices and merge with this function //TODO: should there be an explicit CloseDevices call somewhere? 
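+ //On a Galaxy cluster only the gateway (MMIO) devices are opened below: one gateway per group of 8 requested devices (device ids 0-3 assumed here to be the gateways).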
- managed_devices = tt::tt_metal::detail::CreateDevices(device_ids, 1, l1_small_size); - for (int i = 0; i < num_requested_devices; i++) { - mesh_devices.emplace_back(device_ids[i], std::unique_ptr(managed_devices.at(device_ids[i]))); + bool is_galaxy = tt::Cluster::instance().is_galaxy_cluster(); + std::vector mmio_device_ids = {}; + if (is_galaxy) { + mmio_device_ids.push_back(0); + if (num_requested_devices > 8) { + mmio_device_ids.push_back(1); + } + if (num_requested_devices > 16) { + mmio_device_ids.push_back(2); + } + if (num_requested_devices > 24) { + mmio_device_ids.push_back(3); + } + } else { + mmio_device_ids = device_ids; + } + managed_devices = tt::tt_metal::detail::CreateDevices(mmio_device_ids, 1, l1_small_size); + if (is_galaxy) { + DeviceIds galaxy_device_ids; + for (const auto &[dev_id, dev]: managed_devices) { + galaxy_device_ids.emplace_back(dev_id); + } + for (int i = 0; i < num_requested_devices; i++) { + mesh_devices.emplace_back(device_ids[i], std::unique_ptr(managed_devices.at(galaxy_device_ids[i]))); + } + } else { + for (int i = 0; i < num_requested_devices; i++) { + mesh_devices.emplace_back(device_ids[i], std::unique_ptr(managed_devices.at(device_ids[i]))); + } + } + for (const auto& [dev_id, dev]: mesh_devices) { + std::cout << "dev_id " << dev_id << " dev " << dev->id() << std::endl; } } @@ -76,14 +105,7 @@ int DeviceMesh::num_devices() const } void DeviceMesh::close_devices() { - // TODO: change api to a yield, shouldn't allow closing sub grids in a pool of devices - tt::Cluster::instance().set_internal_routing_info_for_ethernet_cores(false); - for (const auto &[device_id, device] : managed_devices) { - if (device->is_initialized()) { - tt::tt_metal::detail::DeallocateBuffers(device); - device->close(); - } - } + tt::tt_metal::detail::CloseDevices(managed_devices); mesh_devices.clear(); managed_devices.clear(); } diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index 674150dffd8..ca664bc46dd 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -1243,7 +1243,10 @@ HWCommandQueue::HWCommandQueue(Device* device, uint32_t id, NOC noc_index) : chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id()); uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id()); this->size_B = tt::Cluster::instance().get_host_channel_size(mmio_device_id, channel) / device->num_hw_cqs(); - + if (tt::Cluster::instance().is_galaxy_cluster()) { + //Galaxy puts 4 devices per host channel until umd can provide one channel per device. 
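+ //e.g. with a 1 GB hugepage (assumed typical size) and one hardware CQ, size_B becomes 1 GB / 4 = 256 MB per device, matching MAX_DEV_CHANNEL_SIZE.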
+ this->size_B = this->size_B / 4; + } tt_cxy_pair completion_q_writer_location = dispatch_core_manager::get(device->num_hw_cqs()).completion_queue_writer_core(device->id(), channel, this->id); diff --git a/tt_metal/impl/dispatch/command_queue_interface.hpp b/tt_metal/impl/dispatch/command_queue_interface.hpp index 574c2db6a40..872036ba7ab 100644 --- a/tt_metal/impl/dispatch/command_queue_interface.hpp +++ b/tt_metal/impl/dispatch/command_queue_interface.hpp @@ -17,7 +17,10 @@ using namespace tt::tt_metal; // todo consider moving these to dispatch_addr_map static constexpr uint32_t PCIE_ALIGNMENT = 32; -static constexpr uint32_t MAX_HUGEPAGE_SIZE = 1 << 30; // 1GB; +static constexpr uint32_t MAX_HUGEPAGE_SIZE = 1 << 30; // 1GB; +static constexpr uint32_t MAX_DEV_CHANNEL_SIZE = 1 << 28; // 256 MB; +static constexpr uint32_t DEVICES_PER_UMD_CHANNEL = MAX_HUGEPAGE_SIZE / MAX_DEV_CHANNEL_SIZE; // 4 devices per hugepage + static constexpr uint32_t MEMCPY_ALIGNMENT = sizeof(__m128i); @@ -85,7 +88,7 @@ struct dispatch_constants { // make this 2^N as required by the packetized stages uint32_t dispatch_buffer_block_size; if (core_type == CoreType::WORKER) { - prefetch_q_entries_ = 2048; + prefetch_q_entries_ = 1534; max_prefetch_command_size_ = 128 * 1024; cmddat_q_size_ = 256 * 1024; scratch_db_size_ = 128 * 1024; @@ -135,13 +138,17 @@ struct dispatch_constants { /// @return uint32_t relative offset inline uint32_t get_relative_cq_offset(uint8_t cq_id, uint32_t cq_size) { return cq_id * cq_size; } +inline uint16_t get_umd_channel(uint16_t channel) { + return channel & 0x3; +} + /// @brief Get absolute offset of the command queue /// @param channel uint16_t channel ID (hugepage) /// @param cq_id uint8_t ID the command queue /// @param cq_size uint32_t size of the command queue /// @return uint32_t absolute offset inline uint32_t get_absolute_cq_offset(uint16_t channel, uint8_t cq_id, uint32_t cq_size) { - return (MAX_HUGEPAGE_SIZE * channel) + get_relative_cq_offset(cq_id, cq_size); + return (MAX_HUGEPAGE_SIZE * get_umd_channel(channel)) + ((channel >> 2) * MAX_DEV_CHANNEL_SIZE) + get_relative_cq_offset(cq_id, cq_size); } template @@ -149,10 +156,11 @@ inline uint32_t get_cq_issue_rd_ptr(chip_id_t chip_id, uint8_t cq_id, uint32_t c uint32_t recv; chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(chip_id); uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(chip_id); + uint32_t channel_offset = (channel >> 2) * MAX_DEV_CHANNEL_SIZE; tt::Cluster::instance().read_sysmem( &recv, sizeof(uint32_t), - HOST_CQ_ISSUE_READ_PTR + get_relative_cq_offset(cq_id, cq_size), + HOST_CQ_ISSUE_READ_PTR + channel_offset + get_relative_cq_offset(cq_id, cq_size), mmio_device_id, channel); if (not addr_16B) { @@ -166,10 +174,11 @@ inline uint32_t get_cq_completion_wr_ptr(chip_id_t chip_id, uint8_t cq_id, uint3 uint32_t recv; chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(chip_id); uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(chip_id); + uint32_t channel_offset = (channel >> 2) * MAX_DEV_CHANNEL_SIZE; tt::Cluster::instance().read_sysmem( &recv, sizeof(uint32_t), - HOST_CQ_COMPLETION_WRITE_PTR + get_relative_cq_offset(cq_id, cq_size), + HOST_CQ_COMPLETION_WRITE_PTR + channel_offset + get_relative_cq_offset(cq_id, cq_size), mmio_device_id, channel); if (not addr_16B) { @@ -330,6 +339,7 @@ class SystemMemoryManager { chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id); uint16_t channel =
tt::Cluster::instance().get_assigned_channel_for_device(device_id); char *hugepage_start = (char *)tt::Cluster::instance().host_dma_address(0, mmio_device_id, channel); + hugepage_start += (channel >> 2) * MAX_DEV_CHANNEL_SIZE; this->cq_sysmem_start = hugepage_start; // TODO(abhullar): Remove env var and expose sizing at the API level @@ -339,8 +349,12 @@ class SystemMemoryManager { this->cq_size = cq_size_override; } else { this->cq_size = tt::Cluster::instance().get_host_channel_size(mmio_device_id, channel) / num_hw_cqs; + if (tt::Cluster::instance().is_galaxy_cluster()) { + //We put 4 galaxy devices per huge page since number of hugepages available is less than number of devices. + this->cq_size = this->cq_size / DEVICES_PER_UMD_CHANNEL; + } } - this->channel_offset = MAX_HUGEPAGE_SIZE * channel; + this->channel_offset = MAX_HUGEPAGE_SIZE * get_umd_channel(channel) + (channel >> 2) * MAX_DEV_CHANNEL_SIZE; CoreType core_type = dispatch_core_manager::get(num_hw_cqs).get_dispatch_core_type(device_id); for (uint8_t cq_id = 0; cq_id < num_hw_cqs; cq_id++) { diff --git a/tt_metal/impl/dispatch/dispatch_core_manager.hpp b/tt_metal/impl/dispatch/dispatch_core_manager.hpp index 0a7a8297e1d..a184c1083c4 100644 --- a/tt_metal/impl/dispatch/dispatch_core_manager.hpp +++ b/tt_metal/impl/dispatch/dispatch_core_manager.hpp @@ -19,6 +19,45 @@ namespace tt::tt_metal { // One core dispatches commands to worker cores on the device `dispatcher` // The `remote_x` cores are used for remote fast dispatch and receive / transmit fast dispatch packets from ethernet cores +enum DispatchWorkerType : uint32_t { + PREFETCH = 0, + PREFETCH_D = 1, + DISPATCH = 2, + DISPATCH_D = 3, + MUX = 4, + MUX_D = 5, + DEMUX = 6, + DEMUX_D = 7, + US_TUNNELER_LOCAL = 8, + US_TUNNELER_REMOTE = 9, + DS_TUNNELER_LOCAL = 10, + DS_TUNNELER_REMOTE = 11, + COUNT = 12 +}; + +struct worker_build_settings_t{ + std::string kernel_file; + std::vector compile_args; + std::vector upstream_cores; + std::vector downstream_cores; + tt_cxy_pair worker_physical_core; + tt_cxy_pair eth_partner_physical_core; + CoreType dispatch_core_type; + uint32_t command_queue_start_addr; + uint32_t issue_queue_start_addr; + uint32_t issue_queue_size; + uint32_t completion_queue_start_addr; + uint32_t completion_queue_size; + std::vector semaphores; + uint32_t producer_semaphore_id; + uint32_t consumer_semaphore_id; + uint32_t cb_start_address; + uint32_t cb_size_bytes; + uint32_t cb_log_page_size; + uint32_t cb_pages; + uint32_t tunnel_stop; +}; + // std::optional is used to determine whether core has been assigned // tt_cxy_pair is used over CoreCoord to denote location because remote device command queue interface cores are on the associated MMIO device struct dispatch_core_types_t { @@ -35,6 +74,7 @@ struct dispatch_core_types_t { std::optional tunneler_d = std::nullopt; // ethernet tunneler }; + class dispatch_core_manager { public: dispatch_core_manager &operator=(const dispatch_core_manager &) = delete; @@ -193,20 +233,30 @@ class dispatch_core_manager { /// @param channel assigned to the command queue where commands are enqueued /// @param cq_id ID of the command queue within the channel /// @return tt_cxy_pair logical location (chip + core coordinate) of the ethernet tunnel core - const tt_cxy_pair &tunneler_core(chip_id_t device_id, uint16_t channel, uint8_t cq_id) { + const tt_cxy_pair &tunneler_core(chip_id_t upstream_device_id, chip_id_t device_id, uint16_t channel, uint8_t cq_id) { dispatch_core_types_t &assignment = 
this->dispatch_core_assignments[device_id][channel][cq_id]; if (assignment.tunneler.has_value()) { return assignment.tunneler.value(); } - TT_ASSERT(assignment.mux.has_value(), " Mux core not assigned for device {}. Must assign a Mux core before getting a tunneler core.", device_id); - tt_cxy_pair tunneler_location = tt::Cluster::instance().get_eth_core_for_dispatch_core( - assignment.mux.value(), EthRouterMode::BI_DIR_TUNNELING, device_id); - assignment.tunneler = tunneler_location; - log_debug(tt::LogMetal, "Allocated Tunneler Core: {} for Device {}", tunneler_location.str(), device_id); + auto[us_core, ds_core] = tt::Cluster::instance().get_eth_tunnel_core(upstream_device_id, device_id, EthRouterMode::BI_DIR_TUNNELING); + + assignment.tunneler = us_core; + assignment.tunneler_d = ds_core; + + log_debug(tt::LogMetal, "Allocated Tunneler Core: {} for Device {}", us_core.str(), device_id); return assignment.tunneler.value(); } + const tt_cxy_pair &us_tunneler_core_local(chip_id_t device_id, uint16_t channel, uint8_t cq_id) { + dispatch_core_types_t &assignment = this->dispatch_core_assignments[device_id][channel][cq_id]; + if (assignment.tunneler_d.has_value()) { + return assignment.tunneler_d.value(); + } + TT_ASSERT(false, "Device {} has no allocation for Local Upstream Tunneler Core.", device_id); + assignment.tunneler_d = tt_cxy_pair(0, 0, 0); + return assignment.tunneler_d.value(); + } /// @brief Gets the location of the kernel desginated to write to the completion queue region for a particular command queue diff --git a/tt_metal/impl/dispatch/kernels/eth_tunneler.cpp b/tt_metal/impl/dispatch/kernels/eth_tunneler.cpp index 8453cca33c4..b1439eb654b 100644 --- a/tt_metal/impl/dispatch/kernels/eth_tunneler.cpp +++ b/tt_metal/impl/dispatch/kernels/eth_tunneler.cpp @@ -67,6 +67,7 @@ constexpr uint32_t test_results_buf_size_bytes = get_compile_time_arg_val(13); tt_l1_ptr uint32_t* const test_results = reinterpret_cast(test_results_buf_addr_arg); constexpr uint32_t timeout_cycles = get_compile_time_arg_val(14); +constexpr uint32_t inner_stop_mux_d_bypass = get_compile_time_arg_val(15); void kernel_main() { rtos_context_switch_ptr = (void (*)())RtosTable[0]; @@ -139,6 +140,11 @@ void kernel_main() { output_queues[i].prev_words_in_flight_check_flush(); bool output_finished = output_queues[i].is_remote_finished(); if (output_finished) { + if ((i == 1) && (inner_stop_mux_d_bypass != 0)) { + input_queues[1].remote_x = inner_stop_mux_d_bypass & 0xFF; + input_queues[1].remote_y = (inner_stop_mux_d_bypass >> 8) & 0xFF; + input_queues[1].set_remote_ready_status_addr((inner_stop_mux_d_bypass >> 16) & 0xFF); + } input_queues[i].send_remote_finished_notification(); } all_outputs_finished &= output_finished; diff --git a/tt_metal/impl/dispatch/kernels/packet_queue.hpp b/tt_metal/impl/dispatch/kernels/packet_queue.hpp index bf4e9a294fb..64f66447557 100644 --- a/tt_metal/impl/dispatch/kernels/packet_queue.hpp +++ b/tt_metal/impl/dispatch/kernels/packet_queue.hpp @@ -293,6 +293,10 @@ class packet_queue_state_t { PACKET_QUEUE_REMOTE_READY_FLAG); } + inline void set_remote_ready_status_addr(uint8_t remote_queue_id) { + this->remote_ready_status_addr = STREAM_REG_ADDR(remote_queue_id, STREAM_REMOTE_SRC_REG_INDEX); + } + inline void send_remote_finished_notification() { this->remote_reg_update(this->remote_ready_status_addr, PACKET_QUEUE_REMOTE_FINISHED_FLAG); diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp index 8d4b94cb366..c8cf461e24b 100644 --- a/tt_metal/llrt/tt_cluster.cpp +++ 
b/tt_metal/llrt/tt_cluster.cpp @@ -20,6 +20,9 @@ #include "tt_metal/llrt/rtoptions.hpp" #include "tt_metal/llrt/tlb_config.hpp" +static constexpr uint32_t HOST_MEM_CHANNELS = 4; +static constexpr uint32_t HOST_MEM_CHANNELS_MASK = HOST_MEM_CHANNELS - 1; + namespace tt { const Cluster &Cluster::instance() { @@ -120,6 +123,10 @@ std::filesystem::path get_cluster_desc_yaml() { return fs::absolute(cluster_desc_path); } +bool Cluster::is_galaxy_cluster() const { + return this->is_tg_cluster_; +} + void Cluster::generate_cluster_descriptor() { this->cluster_desc_path_ = (this->target_type_ == TargetDevice::Silicon and this->arch_ == tt::ARCH::WORMHOLE_B0) ? get_cluster_desc_yaml().string() @@ -138,6 +145,12 @@ void Cluster::generate_cluster_descriptor() { tt_ClusterDescriptor::create_for_grayskull_cluster(logical_mmio_device_ids, physical_mmio_device_ids); } else { this->cluster_desc_ = tt_ClusterDescriptor::create_from_yaml(this->cluster_desc_path_); + for (const auto &chip_id : this->cluster_desc_->get_all_chips()) { + if (this->cluster_desc_->get_board_type(chip_id) == BoardType::GALAXY) { + this->is_tg_cluster_ = true; + break; + } + } } // Use cluster descriptor to map MMIO device id to all devices on the same card (including the MMIO device) @@ -155,13 +168,24 @@ void Cluster::generate_cluster_descriptor() { } uint32_t total_num_hugepages = get_num_hugepages(); + if (this->is_tg_cluster_) { + // TODO: don't think this check is correct, we want to have total num hugepages == num chips even for Galaxy + TT_FATAL( + this->arch_ == tt::ARCH::BLACKHOLE or total_num_hugepages >= this->cluster_desc_->get_all_chips().size()/4, + "Machine setup error: Insufficient number of hugepages available, expected >= {} for {} devices but have {}. " + "Increase number of hugepages!", + this->cluster_desc_->get_all_chips().size()/4, + this->cluster_desc_->get_all_chips().size(), + total_num_hugepages); + } else { // TODO (abhullar): ignore hugepage set up for BH bringup - TT_FATAL( - this->arch_ == tt::ARCH::BLACKHOLE or total_num_hugepages >= this->cluster_desc_->get_all_chips().size(), - "Machine setup error: Insufficient number of hugepages available, expected one per device ({}) but have {}. " - "Increase number of hugepages!", - this->cluster_desc_->get_all_chips().size(), - total_num_hugepages); + TT_FATAL( + this->arch_ == tt::ARCH::BLACKHOLE or total_num_hugepages >= this->cluster_desc_->get_all_chips().size(), + "Machine setup error: Insufficient number of hugepages available, expected one per device ({}) but have {}. " + "Increase number of hugepages!", + this->cluster_desc_->get_all_chips().size(), + total_num_hugepages); + } } void Cluster::initialize_device_drivers() { @@ -189,7 +213,8 @@ void Cluster::assign_mem_channels_to_devices( chip_id_t mmio_device_id, const std::set &controlled_device_ids) { // g_MAX_HOST_MEM_CHANNELS (4) is defined in tt_SiliconDevice and denotes the max number of host memory channels per // MMIO device Metal currently assigns 1 channel per device. See https://github.com/tenstorrent/tt-metal/issues/4087 - TT_ASSERT(controlled_device_ids.size() <= 4, "Unable to assign each device to its own host memory channel!"); + // One WH gateway should have 8 remote devices in its control group, i.e. up to 9 devices including the gateway itself.
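+ //Worked example (per get_umd_channel()/get_absolute_cq_offset() in command_queue_interface.hpp): assigned channel 6 maps to UMD channel 6 & 3 = 2, with sub-offset (6 >> 2) * MAX_DEV_CHANNEL_SIZE = 256 MB within that hugepage.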
+ TT_ASSERT(controlled_device_ids.size() <= 9, "Unable to assign each device to its own host memory channel!"); uint16_t channel = 0; this->device_to_host_mem_channel_[mmio_device_id] = channel++; for (const chip_id_t &device_id : controlled_device_ids) { @@ -197,6 +222,7 @@ void Cluster::assign_mem_channels_to_devices( continue; } this->device_to_host_mem_channel_[device_id] = channel++; + if ((channel + 1) % 4 == 0) channel++; } } @@ -220,6 +246,9 @@ void Cluster::open_driver( // available. Metal currently uses assigns 1 channel per device uint32_t num_host_mem_ch_per_mmio_device = this->arch_ == tt::ARCH::BLACKHOLE ? 0 : controlled_device_ids.size(); + if (is_tg_cluster_) { + num_host_mem_ch_per_mmio_device = HOST_MEM_CHANNELS; + } std::unordered_map dynamic_tlb_config = ll_api::get_dynamic_tlb_config(); // This will remove harvested rows from the soc descriptor const bool perform_harvesting = true; @@ -275,6 +304,10 @@ Cluster::~Cluster() { this->mmio_device_id_to_driver_.clear(); this->sdesc_per_chip_.clear(); + this->devices_grouped_by_assoc_mmio_device_.clear(); + this->device_to_mmio_device_.clear(); + this->device_to_host_mem_channel_.clear(); + this->device_eth_routing_info_.clear(); } tt_device &Cluster::get_driver(chip_id_t device_id) const { @@ -455,13 +488,13 @@ void Cluster::read_reg(std::uint32_t *mem_ptr, tt_cxy_pair target, uint64_t addr void Cluster::write_sysmem( const void *vec, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const { TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(src_device_id)); - this->get_driver(src_device_id).write_to_sysmem(vec, size_in_bytes, addr, channel, src_device_id); + this->get_driver(src_device_id).write_to_sysmem(vec, size_in_bytes, addr, channel & HOST_MEM_CHANNELS_MASK, src_device_id); } void Cluster::read_sysmem( void *vec, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const { TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(src_device_id)); - this->get_driver(src_device_id).read_from_sysmem(vec, addr, channel, size_in_bytes, src_device_id); + this->get_driver(src_device_id).read_from_sysmem(vec, addr, channel & HOST_MEM_CHANNELS_MASK, size_in_bytes, src_device_id); } void Cluster::verify_sw_fw_versions( @@ -510,12 +543,12 @@ uint32_t Cluster::get_num_host_channels(chip_id_t device_id) const { uint32_t Cluster::get_host_channel_size(chip_id_t device_id, uint32_t channel) const { TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(device_id)); - return this->get_driver(device_id).get_host_channel_size(device_id, channel); + return this->get_driver(device_id).get_host_channel_size(device_id, channel & HOST_MEM_CHANNELS_MASK); } void *Cluster::host_dma_address(uint64_t offset, chip_id_t src_device_id, uint16_t channel) const { TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(src_device_id)); - return this->get_driver(src_device_id).host_dma_address(offset, src_device_id, channel); + return this->get_driver(src_device_id).host_dma_address(offset, src_device_id, channel & HOST_MEM_CHANNELS_MASK); } uint64_t Cluster::get_pcie_base_addr_from_device(chip_id_t chip_id) const { @@ -548,6 +581,81 @@ std::unordered_map> Cluster::get_ethernet_core } return connected_chips; } +#define MAX_TUNNEL_DEPTH 4 +std::vector> Cluster::get_tunnels_from_mmio_device(chip_id_t mmio_chip_id) const { + std::vector> tunnels_from_mmio = {}; + const auto &all_eth_connections = this->cluster_desc_->get_ethernet_connections(); + 
TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(mmio_chip_id)); + + if (all_eth_connections.find(mmio_chip_id) == all_eth_connections.end()) { + return {}; + } + + std::set device_ids = get_devices_controlled_by_mmio_device(mmio_chip_id); + device_ids.erase(mmio_chip_id); + + if (device_ids.size() == 0) { + return {}; + } + + for (const auto &[eth_chan, connected_chip_chan] : all_eth_connections.at(mmio_chip_id)) { + const auto &other_chip_id = std::get<0>(connected_chip_chan); + if (device_ids.find(other_chip_id) != device_ids.end()) { + //mmio chip is connected to a remote chip in its mmio group. + //erase from the pool so multiple ethernet connections to the same remote device do not + //pollute the counts. + device_ids.erase(other_chip_id); + std::vector first_stop = {other_chip_id}; + auto it = std::find(tunnels_from_mmio.begin(), tunnels_from_mmio.end(), first_stop); + TT_ASSERT(it == tunnels_from_mmio.end(),"Duplicate first tunnel stop found when finding FD2 Tunnel devices."); + tunnels_from_mmio.push_back(first_stop); + } + } + + log_info(tt::LogMetal, "Found {} FD Tunnels originating from MMIO Device {}", tunnels_from_mmio.size(), mmio_chip_id); + + device_ids = get_devices_controlled_by_mmio_device(mmio_chip_id); + device_ids.erase(mmio_chip_id); + + for (auto &tunnel : tunnels_from_mmio) { + TT_ASSERT(tunnel.size() == 1,"Tunnel depth must be 1 when it has only 1 stop in it."); + device_ids.erase(tunnel[0]); + } + + bool tunneled_device_hit; + for (auto it = device_ids.begin(); it != device_ids.end();) { + tunneled_device_hit = false; + for (auto &dev_vec : tunnels_from_mmio) { + for (const auto &[eth_chan, connected_chip_chan] : all_eth_connections.at(dev_vec.back())) { + const auto &other_chip_id = std::get<0>(connected_chip_chan); + auto id_iter = device_ids.find(other_chip_id); + if (id_iter != device_ids.end()) { + it = device_ids.erase(id_iter); + dev_vec.push_back(other_chip_id); + tunneled_device_hit = true; + break; + } + } + } + TT_ASSERT(tunneled_device_hit || (it == device_ids.end()),"Loop Exit Error."); + } + + TT_ASSERT(tunnels_from_mmio.size() != 0,"Must have at least 1 tunnel from MMIO Device."); + uint32_t tunnel_depth = tunnels_from_mmio[0].size(); + log_info(tt::LogMetal, "Each FD Tunnel is {} deep.", tunnel_depth); + + for (auto &dev_vec : tunnels_from_mmio) { + TT_ASSERT(dev_vec.size() == tunnel_depth,"All tunnels from mmio device must have same depth. Found {}. Expected {}.", dev_vec.size(), tunnel_depth); + //Now that all remote chips have been added to respective tunnels, + //add mmio device at start of each of the tunnels.
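+ //Tunnels longer than MAX_TUNNEL_DEPTH are trimmed first; the resize below is equivalent to dev_vec.resize(MAX_TUNNEL_DEPTH).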
+ if (dev_vec.size() > MAX_TUNNEL_DEPTH) { + dev_vec.resize(MAX_TUNNEL_DEPTH); + } + dev_vec.insert(dev_vec.begin(), mmio_chip_id); + } + return tunnels_from_mmio; +} + // Ethernet cluster api void Cluster::initialize_ethernet_sockets() { @@ -583,6 +691,71 @@ } } +void Cluster::reserve_ethernet_cores_for_tunneling() { + const char *TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE"); + const uint32_t routing_info_addr = eth_l1_mem::address_map::ERISC_APP_ROUTING_INFO_BASE; + for (const auto &[assoc_mmio_device, devices] : this->devices_grouped_by_assoc_mmio_device_) { + for (const auto &chip_id : devices) { + if (this->device_eth_routing_info_.find(chip_id) == this->device_eth_routing_info_.end()) { + this->device_eth_routing_info_.insert({chip_id, {}}); + } + } + std::map<std::tuple<chip_id_t, chip_id_t>, bool> reserved_chip_connections = {}; + for (const auto &chip_id : devices) { + if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) { + for (const auto &[connected_chip_id, active_eth_cores] : + this->get_ethernet_cores_grouped_by_connected_chips(chip_id)) { + for (const auto &eth_core : active_eth_cores) { + const auto connected_eth_core = + std::get<1>(this->get_connected_ethernet_core(std::make_tuple(chip_id, eth_core))); + if (this->device_eth_routing_info_.at(chip_id).find(eth_core) == + this->device_eth_routing_info_.at(chip_id).end()) { + if (devices.find(connected_chip_id) != devices.end() && + reserved_chip_connections.find(std::make_tuple(chip_id, connected_chip_id)) == + reserved_chip_connections.end() && + this->cluster_desc_->get_ethernet_link_distance(chip_id, assoc_mmio_device) != + this->cluster_desc_->get_ethernet_link_distance( + connected_chip_id, assoc_mmio_device)) { + // only set up fd tunneling for devices grouped with the same mmio device, if no bi-dir + // tunnel has been found between the two chips, and if the two chips are at different + // link distances from the mmio chip + log_debug( + LogDevice, + "Reserving {} for tunneling", + tt_cxy_pair(chip_id, ethernet_core_from_logical_core(chip_id, eth_core)).str()); + log_debug( + LogDevice, + "Reserving {} for tunneling", + tt_cxy_pair( + connected_chip_id, + ethernet_core_from_logical_core(connected_chip_id, connected_eth_core)) + .str()); + this->device_eth_routing_info_.at(chip_id).insert( + {eth_core, EthRouterMode::BI_DIR_TUNNELING}); + this->device_eth_routing_info_.at(connected_chip_id) + .insert({connected_eth_core, EthRouterMode::BI_DIR_TUNNELING}); + reserved_chip_connections.insert({std::make_tuple(chip_id, connected_chip_id), true}); + reserved_chip_connections.insert({std::make_tuple(connected_chip_id, chip_id), true}); + } else { + this->device_eth_routing_info_.at(chip_id).insert({eth_core, EthRouterMode::IDLE}); + } + } + } + } + } else { + // Slow dispatch mode + for (const auto &[connected_chip_id, active_eth_cores] : + this->get_ethernet_cores_grouped_by_connected_chips(chip_id)) { + for (const auto &eth_core : active_eth_cores) { + this->device_eth_routing_info_.at(chip_id).insert({eth_core, EthRouterMode::IDLE}); + } + } + } + } + } +} + std::unordered_set<chip_id_t> Cluster::get_ethernet_connected_device_ids(chip_id_t chip_id) const { std::unordered_set<chip_id_t> device_ids; const auto &connected_chips = this->get_ethernet_cores_grouped_by_connected_chips(chip_id); @@ -658,56 +831,6 @@ CoreCoord Cluster::ethernet_core_from_logical_core(chip_id_t chip_id, const Core return
soc_desc.get_physical_ethernet_core_from_logical(logical_core); } -void Cluster::reserve_ethernet_cores_for_tunneling() { - const char *TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE"); - const uint32_t routing_info_addr = eth_l1_mem::address_map::ERISC_APP_ROUTING_INFO_BASE; - for (const auto &[assoc_mmio_device, devices] : this->devices_grouped_by_assoc_mmio_device_) { - for (const auto &chip_id : devices) { - if (this->device_eth_routing_info_.find(chip_id) == this->device_eth_routing_info_.end()) { - this->device_eth_routing_info_.insert({chip_id, {}}); - } - } - std::map<std::tuple<chip_id_t, chip_id_t>, bool> reserved_chip_connections = {}; - for (const auto &chip_id : devices) { - if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) { - for (const auto &[connected_chip_id, active_eth_cores] : - this->get_ethernet_cores_grouped_by_connected_chips(chip_id)) { - for (const auto &eth_core : active_eth_cores) { - const auto connected_eth_core = - std::get<1>(this->get_connected_ethernet_core(std::make_tuple(chip_id, eth_core))); - if (this->device_eth_routing_info_.at(chip_id).find(eth_core) == - this->device_eth_routing_info_.at(chip_id).end()) { - tt_cxy_pair this_phys_core(chip_id, ethernet_core_from_logical_core(chip_id, eth_core)); - if (devices.find(connected_chip_id) != devices.end() && - reserved_chip_connections.find(std::make_tuple(chip_id, connected_chip_id)) == - reserved_chip_connections.end()) { - // only setup fd tunneling for devices grouped with same mmio device and if no bi dir - // tunnel found between the two chips - this->device_eth_routing_info_.at(chip_id).insert( - {eth_core, EthRouterMode::BI_DIR_TUNNELING}); - this->device_eth_routing_info_.at(connected_chip_id) - .insert({connected_eth_core, EthRouterMode::BI_DIR_TUNNELING}); - reserved_chip_connections.insert({std::make_tuple(chip_id, connected_chip_id), true}); - reserved_chip_connections.insert({std::make_tuple(connected_chip_id, chip_id), true}); - } else { - this->device_eth_routing_info_.at(chip_id).insert({eth_core, EthRouterMode::IDLE}); - } - } - } - } - } else { - // Slow dispatch mode - for (const auto &[connected_chip_id, active_eth_cores] : - this->get_ethernet_cores_grouped_by_connected_chips(chip_id)) { - for (const auto &eth_core : active_eth_cores) { - this->device_eth_routing_info_.at(chip_id).insert({eth_core, EthRouterMode::IDLE}); - } - } - } - } - } -} - tt_cxy_pair Cluster::get_eth_core_for_dispatch_core( tt_cxy_pair logical_dispatch_core, EthRouterMode mode, chip_id_t connected_chip_id) const { const auto &local_chip_id = logical_dispatch_core.chip; @@ -723,6 +846,20 @@ tt_cxy_pair Cluster::get_eth_core_for_dispatch_core( return {}; } +std::tuple<tt_cxy_pair, tt_cxy_pair> Cluster::get_eth_tunnel_core( + chip_id_t upstream_chip_id, chip_id_t downstream_chip_id, EthRouterMode mode) const { + for (const auto &[eth_core, router_mode] : this->device_eth_routing_info_.at(downstream_chip_id)) { + + // Check for the connected chip id since one chip can have bi-directional tunnels to multiple chips + const auto [tunnel_chip_id, tunnel_eth_core] = this->get_connected_ethernet_core(std::make_tuple(downstream_chip_id, eth_core)); + if (router_mode == mode and tunnel_chip_id == upstream_chip_id) { + return std::make_tuple(tt_cxy_pair(tunnel_chip_id, tunnel_eth_core), tt_cxy_pair(downstream_chip_id, eth_core)); + } + } + TT_ASSERT(false, "Cluster does not contain requested eth routing core"); + return {}; +} + // TODO: ALLAN Can change to write one bit void Cluster::set_internal_routing_info_for_ethernet_cores(bool enable_internal_routing) const {
log_debug(tt::LogDevice, "Set internal routing bit {}", enable_internal_routing); @@ -785,6 +922,34 @@ void Cluster::set_internal_routing_info_for_ethernet_cores(bool enable_internal_ } } +uint32_t Cluster::get_mmio_device_max_tunnel_depth(chip_id_t mmio_device) const { + // Assume that tunnel depth is the same for all tunnels + TT_ASSERT( + (this->get_associated_mmio_device(mmio_device) == mmio_device), "Called mmio device api on non-mmio device"); + uint32_t depth = 0; + for (const auto &[assoc_mmio_device, devices] : this->devices_grouped_by_assoc_mmio_device_) { + for (const auto &chip_id : devices) { + depth = + std::max(depth, uint32_t(this->cluster_desc_->get_ethernet_link_distance(chip_id, assoc_mmio_device))); + } + } + return depth; +} + +uint32_t Cluster::get_mmio_device_tunnel_count(chip_id_t mmio_device) const { + TT_ASSERT( + (this->get_associated_mmio_device(mmio_device) == mmio_device), "Called mmio device api on non-mmio device"); + const auto &chip_eth_core_modes = this->device_eth_routing_info_.at(mmio_device); + uint32_t tunnel_count = std::count_if(chip_eth_core_modes.begin(), chip_eth_core_modes.end(), [](const auto &e) { + return e.second == EthRouterMode::BI_DIR_TUNNELING; + }); + return tunnel_count; +} + +uint32_t Cluster::get_device_tunnel_depth(chip_id_t chip_id) const { + return this->cluster_desc_->get_ethernet_link_distance(chip_id, this->get_associated_mmio_device(chip_id)); +} + uint32_t Cluster::get_tensix_soft_reset_addr() const { return DEVICE_DATA.TENSIX_SOFT_RESET_ADDR; } } // namespace tt diff --git a/tt_metal/llrt/tt_cluster.hpp b/tt_metal/llrt/tt_cluster.hpp index c1a7901dbf0..f1fa68605db 100644 --- a/tt_metal/llrt/tt_cluster.hpp +++ b/tt_metal/llrt/tt_cluster.hpp @@ -149,16 +149,18 @@ class Cluster { // Converts logical ethernet core coord to physical ethernet core coord CoreCoord ethernet_core_from_logical_core(chip_id_t chip_id, const CoreCoord &logical_core) const; - // Configures routing mapping of ethernet cores - void initialize_routing_info_for_ethernet_cores(); - - void reserve_ethernet_cores_for_tunneling(); + // Bookkeeping for mmio device tunnels + uint32_t get_mmio_device_max_tunnel_depth(chip_id_t mmio_device) const; + uint32_t get_mmio_device_tunnel_count(chip_id_t mmio_device) const; + uint32_t get_device_tunnel_depth(chip_id_t chip_id) const; // Dispatch core is managed by device, so this is an api for device to get each eth core used in FD tunneling. // Returns logical eth core that communicates with specified dispatch core tt_cxy_pair get_eth_core_for_dispatch_core( tt_cxy_pair logical_dispatch_core, EthRouterMode mode, chip_id_t connected_chip_id) const; + std::tuple<tt_cxy_pair, tt_cxy_pair> get_eth_tunnel_core(chip_id_t upstream_chip_id, chip_id_t downstream_chip_id, EthRouterMode mode) const; + // Internal routing for SD and FD enables launching user ethernet kernels and FD tunneling for all devices in the // cluster. When using multiple devices in a cluster, this should be the flow: // CreateDevice(0) @@ -186,6 +188,12 @@ class Cluster { return this->devices_grouped_by_assoc_mmio_device_.at(mmio_device_id); } + // Returns vector of unique tunnels originating from mmio device. + // Each vector entry is the chain of devices on that tunnel, starting with the mmio device.
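+ // e.g. with two 4-deep tunnels from mmio chip 0 this might return {{0, 1, 2, 3, 4}, {0, 5, 6, 7, 8}} (hypothetical chip ids).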
+ std::vector<std::vector<chip_id_t>> get_tunnels_from_mmio_device(chip_id_t mmio_chip_id) const; + + bool is_galaxy_cluster() const; + private: Cluster(); ~Cluster(); @@ -202,6 +210,8 @@ class Cluster { void get_metal_desc_from_tt_desc(const std::unordered_map<chip_id_t, tt_SocDescriptor> &input, const std::unordered_map<chip_id_t, uint32_t> &per_chip_id_harvesting_masks); tt_cxy_pair convert_physical_cxy_to_virtual(const tt_cxy_pair &physical_cxy) const; + // Reserves ethernet cores in cluster for tunneling + void reserve_ethernet_cores_for_tunneling(); // Returns map of connected chip ids to active ethernet cores std::unordered_map<chip_id_t, std::vector<CoreCoord>> get_ethernet_cores_grouped_by_connected_chips( chip_id_t chip_id) const; @@ -226,6 +236,10 @@ class Cluster { // Save mapping of device id to associated MMIO device id for fast lookup std::unordered_map<chip_id_t, chip_id_t> device_to_mmio_device_; + // Flag to tell whether we are on a TG type of system. + // If any device has a board type of GALAXY, we are on a TG cluster. + bool is_tg_cluster_; + // Currently, each device is mapped to its own channel in host memory to enable fast dispatch // Channels are unique within a group of devices all controlled by a particular MMIO device // For example: diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index 3fabec43b4b..5fba1ed71b2 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -273,6 +273,7 @@ std::map<chip_id_t, Device *> CreateDevices( std::unordered_set<uint32_t> free_cores = {}; std::vector<chip_id_t> all_device_ids = {}; + bool is_galaxy = tt::Cluster::instance().is_galaxy_cluster(); for (const auto &device_id : device_ids) { // Get list of all devices in the cluster connected to the passed in device_ids const auto &mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id); @@ -282,20 +283,55 @@ std::map<chip_id_t, Device *> CreateDevices( } } } + // Determine which CPU cores the worker threads need to be placed on for each device std::unordered_map<chip_id_t, int> device_to_core_map = device_cpu_allocator::get_device_id_to_core_map(all_device_ids, free_cores, use_numa_node_based_thread_binding); - for (const auto& device_id : all_device_ids) { - int core_assigned_to_device = device_to_core_map.at(device_id); - Device *dev = new Device( - device_id, - num_hw_cqs, - l1_small_size, - l1_bank_remap, - false, - core_assigned_to_device); - active_devices.insert({device_id, dev}); - detail::InitDeviceProfiler(dev); + for (const auto &device_id : all_device_ids) { + // For Galaxy init, we only need to loop over mmio devices + const auto &mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id); + if (is_galaxy and mmio_device_id != device_id) { + continue; + } + if (active_devices.find(mmio_device_id) == active_devices.end()) { + log_debug(tt::LogMetal, "MMIO Device {} Tunnel Count: {}", mmio_device_id, tt::Cluster::instance().get_mmio_device_tunnel_count(mmio_device_id)); + log_debug(tt::LogMetal, "MMIO Device {} Tunnel Depth: {}", mmio_device_id, tt::Cluster::instance().get_mmio_device_max_tunnel_depth(mmio_device_id)); + log_debug(tt::LogMetal, "MMIO Device {} Tunnel Stop: {}", mmio_device_id, tt::Cluster::instance().get_device_tunnel_depth(mmio_device_id)); + int core_assigned_to_device = device_to_core_map.at(mmio_device_id); + Device *mmio_device = new Device( + mmio_device_id, + num_hw_cqs, + l1_small_size, + l1_bank_remap, + false, + core_assigned_to_device); + //Only include the mmio device in the active devices set returned to the caller if we are not running + //on a Galaxy cluster. + //On Galaxy, gateway (mmio devices) cannot run compute workloads.
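+ //The gateway is still opened here so dispatch onto its tunnels can be set up; it is only returned to the caller on non-Galaxy systems.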
+ if (!is_galaxy) { + active_devices.insert({mmio_device_id, mmio_device}); + detail::InitDeviceProfiler(mmio_device); + } + + auto tunnels_from_mmio = mmio_device->tunnels_from_mmio_; + for (uint32_t t = 0; t < tunnels_from_mmio.size(); t++) { + //Need to create devices from the farthest to the closest. + for (uint32_t ts = tunnels_from_mmio[t].size() - 1; ts > 0; ts--) { + uint32_t mmio_controlled_device_id = tunnels_from_mmio[t][ts]; + log_debug(tt::LogMetal, "Tunnel {} Device {} Tunnel Stop: {}", t, mmio_controlled_device_id, ts); + int core_assigned_to_device = device_to_core_map.at(mmio_controlled_device_id); + Device *dev = new Device( + mmio_controlled_device_id, + num_hw_cqs, + l1_small_size, + l1_bank_remap, + false, + core_assigned_to_device); + active_devices.insert({mmio_controlled_device_id, dev}); + detail::InitDeviceProfiler(dev); + } + } + } } if (use_numa_node_based_thread_binding) { @@ -312,7 +348,45 @@ void CloseDevices(std::map<chip_id_t, Device *> devices) { tt::Cluster::instance().set_internal_routing_info_for_ethernet_cores(false); - for (const auto &[device_id, dev] : devices) { + std::map<chip_id_t, Device *> mmio_devices = {}; + bool is_galaxy = tt::Cluster::instance().is_galaxy_cluster(); + + if (is_galaxy) { + //On Galaxy, gateway wormhole devices (mmio devices) are not included in the set of devices + //created by CreateDevices(). So when closing devices, we need to find the corresponding + //gateway chips for all the tunneled devices. + for (const auto &[device_id, dev] : devices) { + const auto &mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id); + if (mmio_devices.find(mmio_device_id) == mmio_devices.end()) { + auto dev_handle = tt::tt_metal::detail::GetDeviceHandle(mmio_device_id); + mmio_devices.insert({mmio_device_id, dev_handle}); + } + } + } else { + for (const auto &[device_id, dev] : devices) { + if (dev->is_mmio_capable()) { + mmio_devices.insert({device_id, dev}); + } + } + for (const auto &[device_id, dev] : mmio_devices) { + devices.erase(device_id); + } + } + + for (const auto &[device_id, dev] : mmio_devices) { + //For each mmio device, first close all the remote tunneled devices. + //Close the farthest tunneled device first. + auto tunnels_from_mmio = dev->tunnels_from_mmio_; + //iterate over all tunnels originating from this mmio device + for (auto t : tunnels_from_mmio) { + //iterate over all tunneled devices (tunnel stops) in this tunnel and close them. + for (uint32_t ts = t.size() - 1; ts > 0; ts--) { + if (devices.find(t[ts]) != devices.end()) { + devices[t[ts]]->close(); + } + } + } + //finally close the mmio device dev->close(); } }
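// Usage sketch for the flow above (assumptions: a TG machine where chips 4..7 are the remote
// tunneled devices behind one gateway; chip ids are hypothetical):
//
//     std::map<chip_id_t, Device *> devices = tt::tt_metal::CreateDevices({4, 5, 6, 7});
//     // ... build programs and enqueue work on each device ...
//     tt::tt_metal::CloseDevices(devices);  // closes each tunnel farthest-stop-first, then the gateways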