From feb8d31a3800c28a1930ea150ee999af6ef2bc3e Mon Sep 17 00:00:00 2001 From: Umair Date: Tue, 17 Sep 2024 19:54:31 +0000 Subject: [PATCH] #13394: Galaxy 2cq support Increase demux_d to 2 for galaxy tunnels. Add multicq galaxy tests. Fix queue flow control register indexing. --- .../dispatch/test_prefetcher.cpp | 12 +- ...queueWriteBuffer_and_EnqueueReadBuffer.cpp | 2 +- ...queueWriteBuffer_and_EnqueueReadBuffer.cpp | 100 ++ .../common/command_queue_fixture.hpp | 49 +- .../core_descriptors/wormhole_b0_80_arch.yaml | 14 +- tt_metal/host_api.hpp | 7 + tt_metal/impl/device/device.cpp | 995 ++++++++++++------ .../impl/dispatch/command_queue_interface.hpp | 20 +- .../impl/dispatch/kernels/packet_queue.hpp | 18 +- tt_metal/tt_metal.cpp | 7 +- 10 files changed, 876 insertions(+), 348 deletions(-) diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp index 3262b251b8d5..330e2418e705 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp @@ -125,7 +125,7 @@ void init(int argc, char **argv) { log_info(LogTest, " -hp: host huge page issue buffer size (default {})", DEFAULT_HUGEPAGE_ISSUE_BUFFER_SIZE); log_info(LogTest, " -pq: prefetch queue entries (default {})", DEFAULT_PREFETCH_Q_ENTRIES); log_info(LogTest, " -cs: cmddat q size (default {})", DEFAULT_CMDDAT_Q_SIZE); - log_info(LogTest, "-pdcs: prefetch_d cmddat cb size (default {})", dispatch_constants::get(CoreType::WORKER).prefetch_d_buffer_size()); + log_info(LogTest, "-pdcs: prefetch_d cmddat cb size (default {})", dispatch_constants::get(CoreType::WORKER, 1).prefetch_d_buffer_size()); log_info(LogTest, " -ss: scratch cb size (default {})", DEFAULT_SCRATCH_DB_SIZE); log_info(LogTest, " -pcies: size of data to transfer in pcie bw test type (default: {})", PCIE_TRANSFER_SIZE_DEFAULT); log_info(LogTest, " -dpgs: dram page size in dram bw test type (default: {})", DRAM_PAGE_SIZE_DEFAULT); @@ -154,7 +154,7 @@ void init(int argc, char **argv) { pcie_transfer_size_g = test_args::get_command_option_uint32(input_args, "-pcies", PCIE_TRANSFER_SIZE_DEFAULT); dram_page_size_g = test_args::get_command_option_uint32(input_args, "-dpgs", DRAM_PAGE_SIZE_DEFAULT); dram_pages_to_read_g = test_args::get_command_option_uint32(input_args, "-dpgr", DRAM_PAGES_TO_READ_DEFAULT); - prefetch_d_buffer_size_g = test_args::get_command_option_uint32(input_args, "-pdcs", dispatch_constants::get(CoreType::WORKER).prefetch_d_buffer_size()); + prefetch_d_buffer_size_g = test_args::get_command_option_uint32(input_args, "-pdcs", dispatch_constants::get(CoreType::WORKER, 1).prefetch_d_buffer_size()); test_type_g = test_args::get_command_option_uint32(input_args, "-t", DEFAULT_TEST_TYPE); all_workers_g.end_coord.x = test_args::get_command_option_uint32(input_args, "-wx", all_workers_g.end_coord.x); @@ -1516,7 +1516,7 @@ void configure_for_single_chip(Device *device, uint32_t packetized_path_test_results_size) { const CoreType dispatch_core_type = CoreType::WORKER; - uint32_t dispatch_buffer_pages = dispatch_constants::get(dispatch_core_type).dispatch_buffer_block_size_pages() * dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS; + uint32_t dispatch_buffer_pages = dispatch_constants::get(dispatch_core_type, 1).dispatch_buffer_block_size_pages() * dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS; uint32_t num_compute_cores = device->compute_with_storage_grid_size().x * 
device->compute_with_storage_grid_size().y; CoreCoord prefetch_core = {0, 0}; @@ -1887,7 +1887,7 @@ void configure_for_single_chip(Device *device, std::vector<uint32_t> dispatch_compile_args = { dispatch_buffer_base, dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE, - dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS * dispatch_constants::get(dispatch_core_type).dispatch_buffer_block_size_pages(), + dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS * dispatch_constants::get(dispatch_core_type, 1).dispatch_buffer_block_size_pages(), dispatch_cb_sem, // overridden below for h split_prefetcher_g ? prefetch_d_downstream_cb_sem : prefetch_downstream_cb_sem, // overridden below for dispatch_h dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS, @@ -2144,7 +2144,7 @@ void configure_for_multi_chip(Device *device, uint32_t packetized_path_test_results_size) { const CoreType dispatch_core_type = CoreType::WORKER; - uint32_t dispatch_buffer_pages = dispatch_constants::get(dispatch_core_type).dispatch_buffer_block_size_pages() * dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS; + uint32_t dispatch_buffer_pages = dispatch_constants::get(dispatch_core_type, 1).dispatch_buffer_block_size_pages() * dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS; uint32_t num_compute_cores = device->compute_with_storage_grid_size().x * device->compute_with_storage_grid_size().y; TT_ASSERT(num_compute_cores == (device->compute_with_storage_grid_size().x * device->compute_with_storage_grid_size().y)); @@ -2620,7 +2620,7 @@ void configure_for_multi_chip(Device *device, std::vector<uint32_t> dispatch_compile_args = { dispatch_buffer_base, dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE, - dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS * dispatch_constants::get(dispatch_core_type).dispatch_buffer_block_size_pages(), + dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS * dispatch_constants::get(dispatch_core_type, 1).dispatch_buffer_block_size_pages(), dispatch_cb_sem, // overridden below for h split_prefetcher_g ?
prefetch_d_downstream_cb_sem : prefetch_downstream_cb_sem, dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS, diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp index 88247be9d11e..214899c07af9 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp @@ -356,7 +356,7 @@ TEST_F(CommandQueueSingleCardFixture, WriteOneTileAcrossAllDramBanksTwiceRoundRo TEST_F(CommandQueueSingleCardFixture, Sending131072Pages) { for (Device *device : devices_) { TestBufferConfig config = {.num_pages = 131072, .page_size = 128, .buftype = BufferType::DRAM}; - + tt::log_info("Running On Device {}", device->id()); local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(device, device->command_queue(), config); } } diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp index 10d38edcf30c..a9596c5e3b08 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp @@ -54,6 +54,106 @@ bool test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(Device* device, v namespace basic_tests { namespace dram_tests { +TEST_F(MultiCommandQueueMultiDeviceFixture, WriteOneTileToDramBank0) { + TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::DRAM}; + for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); + CommandQueue& a = device->command_queue(0); + CommandQueue& b = device->command_queue(1); + vector<std::reference_wrapper<CommandQueue>> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } + +} + +TEST_F(MultiCommandQueueMultiDeviceFixture, WriteOneTileToAllDramBanks) { + for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); + TestBufferConfig config = { + .num_pages = uint32_t(device->num_banks(BufferType::DRAM)), + .page_size = 2048, + .buftype = BufferType::DRAM}; + + CommandQueue& a = device->command_queue(0); + CommandQueue& b = device->command_queue(1); + vector<std::reference_wrapper<CommandQueue>> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } +} + +TEST_F(MultiCommandQueueMultiDeviceFixture, WriteOneTileAcrossAllDramBanksTwiceRoundRobin) { + constexpr uint32_t num_round_robins = 2; + for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); + TestBufferConfig config = { + .num_pages = num_round_robins * (device->num_banks(BufferType::DRAM)), + .page_size = 2048, + .buftype = BufferType::DRAM}; + + CommandQueue& a = device->command_queue(0); + CommandQueue& b = device->command_queue(1); + vector<std::reference_wrapper<CommandQueue>> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } +} +
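All of the MultiCommandQueueMultiDeviceFixture DRAM tests in this hunk share one body; the sketch below restates that common pattern for reference (illustrative only, not an added hunk line; the config values are the ones from WriteOneTileToDramBank0). std::reference_wrapper is used because a vector cannot hold raw CommandQueue references.

    // Common shape of the multi-CQ tests added above (illustrative sketch).
    for (Device *device : devices_) {
        tt::log_info("Running On Device {}", device->id());
        CommandQueue& cq0 = device->command_queue(0);   // hardware CQ 0
        CommandQueue& cq1 = device->command_queue(1);   // hardware CQ 1
        vector<std::reference_wrapper<CommandQueue>> cqs = {cq0, cq1};
        TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::DRAM};
        // Exercises buffer writes/reads through the two CQs (helper defined earlier in this file).
        EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config));
    }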
+TEST_F(MultiCommandQueueMultiDeviceFixture, Sending131072Pages) { + // Was a failing case where we used to accidentally program cb num pages to be total + // pages instead of cb num pages. + TestBufferConfig config = { + .num_pages = 131072, + .page_size = 128, + .buftype = BufferType::DRAM}; + for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); + CommandQueue& a = device->command_queue(0); + CommandQueue& b = device->command_queue(1); + vector<std::reference_wrapper<CommandQueue>> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } +} + +TEST_F(MultiCommandQueueMultiDeviceFixture, TestNon32BAlignedPageSizeForDram) { + for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); + TestBufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::DRAM}; + + CommandQueue& a = device->command_queue(0); + CommandQueue& b = device->command_queue(1); + vector<std::reference_wrapper<CommandQueue>> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } +} + +TEST_F(MultiCommandQueueMultiDeviceFixture, TestNon32BAlignedPageSizeForDram2) { + for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); + // From stable diffusion read buffer + TestBufferConfig config = {.num_pages = 8 * 1024, .page_size = 80, .buftype = BufferType::DRAM}; + + CommandQueue& a = device->command_queue(0); + CommandQueue& b = device->command_queue(1); + vector<std::reference_wrapper<CommandQueue>> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } +} + +TEST_F(MultiCommandQueueMultiDeviceFixture, TestIssueMultipleReadWriteCommandsForOneBuffer) { + for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); + uint32_t page_size = 2048; + uint32_t command_queue_size = device->sysmem_manager().get_cq_size(); + uint32_t num_pages = command_queue_size / page_size; + + TestBufferConfig config = {.num_pages = num_pages, .page_size = page_size, .buftype = BufferType::DRAM}; + + CommandQueue& a = device->command_queue(0); + CommandQueue& b = device->command_queue(1); + vector<std::reference_wrapper<CommandQueue>> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } +} + TEST_F(MultiCommandQueueSingleDeviceFixture, WriteOneTileToDramBank0) { TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::DRAM}; CommandQueue& a = this->device_->command_queue(0); diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_fixture.hpp index 1bace88e6515..20c25c96c2e6 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_fixture.hpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_fixture.hpp @@ -4,6 +4,7 @@ #include "gtest/gtest.h" #include "tt_metal/host_api.hpp" +#include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/test_utils/env_vars.hpp" #include "tt_metal/impl/dispatch/command_queue.hpp" #include "tt_metal/llrt/rtoptions.hpp" @@ -26,8 +27,10 @@ class MultiCommandQueueSingleDeviceFixture : public ::testing::Test { arch_ = tt::get_arch_from_string(tt::test_utils::get_env_arch_name()); DispatchCoreType dispatch_core_type =
DispatchCoreType::WORKER; if (arch_ == tt::ARCH::WORMHOLE_B0 and tt::tt_metal::GetNumAvailableDevices() != 1) { - tt::log_warning(tt::LogTest, "Ethernet Dispatch not being explicitly used. Set this configuration in Setup()"); - dispatch_core_type = DispatchCoreType::ETH; + if (!tt::tt_metal::IsGalaxyCluster()) { + tt::log_warning(tt::LogTest, "Ethernet Dispatch not being explicitly used. Set this configuration in Setup()"); + dispatch_core_type = DispatchCoreType::ETH; + } } device_ = tt::tt_metal::CreateDevice(0, num_cqs, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); } @@ -40,6 +43,48 @@ class MultiCommandQueueSingleDeviceFixture : public ::testing::Test { tt::ARCH arch_; }; +class MultiCommandQueueMultiDeviceFixture : public ::testing::Test { + protected: + void SetUp() override { + auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (slow_dispatch) { + TT_THROW("This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); + GTEST_SKIP(); + } + auto num_cqs = tt::llrt::OptionsG.get_num_hw_cqs(); + if (num_cqs != 2) { + TT_THROW("This suite must be run with TT_METAL_GTEST_NUM_HW_CQS=2"); + GTEST_SKIP(); + } + arch_ = tt::get_arch_from_string(tt::test_utils::get_env_arch_name()); + + + DispatchCoreType dispatch_core_type = DispatchCoreType::WORKER; + if (arch_ == tt::ARCH::WORMHOLE_B0 and tt::tt_metal::GetNumAvailableDevices() != 1) { + if (!tt::tt_metal::IsGalaxyCluster()) { + tt::log_warning(tt::LogTest, "Ethernet Dispatch not being explicitly used. Set this configuration in Setup()"); + dispatch_core_type = DispatchCoreType::ETH; + } + } + + const chip_id_t mmio_device_id = 0; + reserved_devices_ = tt::tt_metal::detail::CreateDevices({mmio_device_id}, num_cqs, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); + for (const auto &[id, device] : reserved_devices_) { + devices_.push_back(device); + } + + num_devices_ = reserved_devices_.size(); + } + + void TearDown() override { tt::tt_metal::detail::CloseDevices(reserved_devices_); } + + std::vector<Device*> devices_; + std::map<chip_id_t, Device*> reserved_devices_; + size_t num_devices_; + tt::ARCH arch_; +}; + + class SingleDeviceTraceFixture: public ::testing::Test { protected: Device* device_; diff --git a/tt_metal/core_descriptors/wormhole_b0_80_arch.yaml b/tt_metal/core_descriptors/wormhole_b0_80_arch.yaml index e9a6f1e49b39..2178e933f2a4 100644 --- a/tt_metal/core_descriptors/wormhole_b0_80_arch.yaml +++ b/tt_metal/core_descriptors/wormhole_b0_80_arch.yaml @@ -42,7 +42,7 @@ nebula_x1: tg_compute_with_storage_grid_range: # Logical only start and end [x, y] start: [0, 0] - end: [7, 3] + end: [7, 1] storage_cores: [] @@ -53,7 +53,10 @@ nebula_x1: tg_dispatch_cores: [[0, -1], [1, -1], [2, -1], [3, -1], [4, -1], [5, -1], [6, -1], [7, -1], [0, -2], [1, -2], [2, -2], [3, -2], [4, -2], [5, -2], [6, -2], [7, -2], - [0, -3], [1, -3], [2, -3], [3, -3], [4, -3], [5, -3], [6, -3], [7, -3]] + [0, -3], [1, -3], [2, -3], [3, -3], [4, -3], [5, -3], [6, -3], [7, -3], + [0, -4], [1, -4], [2, -4], [3, -4], [4, -4], [5, -4], [6, -4], [7, -4], + [0, -5], [1, -5], [2, -5], [3, -5], [4, -5], [5, -5], [6, -5], [7, -5], + [0, -6], [1, -6], [2, -6], [3, -6], [4, -6], [5, -6], [6, -6], [7, -6]] dispatch_core_type: "tensix" @@ -65,7 +68,7 @@ nebula_x1: tg_compute_with_storage_grid_range: # Logical only start and end [x, y] start: [0, 0] - end: [7, 3] + end: [7, 1] storage_cores: [] @@ -76,7 +79,10 @@ nebula_x1: tg_dispatch_cores: [[0, -1], [1, -1], [2, -1], [3, -1], [4, -1], [5, -1], [6, -1], [7, -1], [0,
-2], [1, -2], [2, -2], [3, -2], [4, -2], [5, -2], [6, -2], [7, -2], - [0, -3], [1, -3], [2, -3], [3, -3], [4, -3], [5, -3], [6, -3], [7, -3]] + [0, -3], [1, -3], [2, -3], [3, -3], [4, -3], [5, -3], [6, -3], [7, -3], + [0, -4], [1, -4], [2, -4], [3, -4], [4, -4], [5, -4], [6, -4], [7, -4], + [0, -5], [1, -5], [2, -5], [3, -5], [4, -5], [5, -5], [6, -5], [7, -5], + [0, -6], [1, -6], [2, -6], [3, -6], [4, -6], [5, -6], [6, -6], [7, -6]] dispatch_core_type: "tensix" diff --git a/tt_metal/host_api.hpp b/tt_metal/host_api.hpp index d0dde15f4c31..caa7588f931c 100644 --- a/tt_metal/host_api.hpp +++ b/tt_metal/host_api.hpp @@ -49,6 +49,13 @@ class Buffer; */ size_t GetNumAvailableDevices(); +/** + * Returns whether Tenstorrent devices are in a Galaxy cluster + * + * Return value: bool + */ +bool IsGalaxyCluster(); + /** * Returns number of Tenstorrent devices that are connected to host via PCIe and can be targeted * diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index 01eb122d56bc..6cec33f2057e 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -739,22 +739,28 @@ void Device::update_workers_build_settings(std::vector(device_worker_variants[DispatchWorkerType::MUX][0]); - TT_ASSERT(num_prefetchers == mux_settings.semaphores.size(), "Mux does not have required number of semaphores for Prefetchers. Exptected = {}. Found = {}", num_prefetchers, mux_settings.semaphores.size()); - uint32_t mux_sem = mux_settings.consumer_semaphore_id; + uint32_t mux_count = device_worker_variants[DispatchWorkerType::MUX].size(); + TT_ASSERT((num_prefetchers / mux_count) <= MAX_SWITCH_FAN_IN, "Insufficient Mux cores. Expected = {}. Found = {}", num_prefetchers, mux_count); + //auto mux_settings = std::get<1>(device_worker_variants[DispatchWorkerType::MUX][0]); + //TT_ASSERT(num_prefetchers == (mux_count * mux_settings.semaphores.size()), "Mux does not have required number of semaphores for Prefetchers. Exptected = {}. 
Found = {}", num_prefetchers, mux_settings.semaphores.size()); + //uint32_t mux_sem = mux_settings.consumer_semaphore_id; + uint32_t mux_index = 0; + std::vector<uint32_t> mux_sem(mux_count, 0); for (auto&[core, settings] : device_worker_variants[DispatchWorkerType::PREFETCH]) { auto dispatch_core_type = settings.dispatch_core_type; - uint32_t downstream_cb_base = mux_settings.cb_start_address + mux_settings.cb_size_bytes * mux_sem; + auto mux_settings = std::get<1>(device_worker_variants[DispatchWorkerType::MUX][mux_index]); + + uint32_t downstream_cb_base = mux_settings.cb_start_address + mux_settings.cb_size_bytes * mux_sem[mux_index]; + uint32_t downstream_cb_pages = mux_settings.cb_pages; settings.upstream_cores.push_back(tt_cxy_pair(0, 0, 0)); settings.downstream_cores.push_back(mux_settings.worker_physical_core); settings.compile_args.resize(28); auto& compile_args = settings.compile_args; compile_args[0] = downstream_cb_base; compile_args[1] = dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE; - compile_args[2] = dispatch_constants::get(dispatch_core_type).mux_buffer_pages(num_hw_cqs); + compile_args[2] = downstream_cb_pages; //dispatch_constants::get(dispatch_core_type).mux_buffer_pages(num_hw_cqs); compile_args[3] = settings.producer_semaphore_id; - compile_args[4] = mux_sem++; + compile_args[4] = mux_sem[mux_index]; compile_args[5] = settings.issue_queue_start_addr; compile_args[6] = settings.issue_queue_size; compile_args[7] = dispatch_constants::PREFETCH_Q_BASE; @@ -778,64 +784,87 @@ void Device::update_workers_build_settings(std::vector(device_worker_variants[DispatchWorkerType::MUX][0]); - TT_ASSERT(num_prefetchers == mux_settings.semaphores.size(), "Mux does not have required number of semaphores for Prefetchers. Exptected = {}. Found = {}", num_prefetchers, mux_settings.semaphores.size()); - TT_ASSERT(num_prefetchers <= MAX_SWITCH_FAN_IN, "Mux does not support required fan-in of {}.", num_prefetchers); - - uint32_t mux_sem = mux_settings.consumer_semaphore_id; + uint32_t num_muxes = device_worker_variants[DispatchWorkerType::MUX].size(); - auto& compile_args = mux_settings.compile_args; - compile_args.resize(36); - compile_args[0] = 0; // 0: reserved - compile_args[1] = mux_settings.cb_start_address >> 4; // 1: rx_queue_start_addr_words - compile_args[2] = mux_settings.cb_size_bytes >> 4; // 2: rx_queue_size_words - compile_args[3] = num_prefetchers; // 3: router_lanes + TT_ASSERT(num_muxes * MAX_SWITCH_FAN_IN >= num_prefetchers, "Insufficient Mux Cores"); TT_ASSERT(device_worker_variants[DispatchWorkerType::US_TUNNELER_REMOTE].size() == 1, "Unexpected number of ethernet tunnelers."); auto &tunneler_settings = std::get<1>(device_worker_variants[DispatchWorkerType::US_TUNNELER_REMOTE][0]); TT_ASSERT(num_prefetchers == tunneler_settings.vc_count - 1, "Mux did not reserve a VC for each Prefetch H. Needed {}.", num_prefetchers); + uint32_t mux_id = 0; + for (auto&[mux_core, mux_settings] : device_worker_variants[DispatchWorkerType::MUX]) { + uint32_t mux_fanin = 1 + ((num_prefetchers - 1) % MAX_SWITCH_FAN_IN); + + //auto &mux_settings = std::get<1>(device_worker_variants[DispatchWorkerType::MUX][0]); + TT_ASSERT(mux_fanin == mux_settings.semaphores.size(), "Mux does not have required number of semaphores for Prefetchers. Exptected = {}.
Found = {}", num_prefetchers, mux_settings.semaphores.size()); + //TT_ASSERT(num_prefetchers <= MAX_SWITCH_FAN_IN, "Mux does not support required fan-in of {}.", num_prefetchers); + + uint32_t mux_sem = mux_settings.consumer_semaphore_id; + + auto& compile_args = mux_settings.compile_args; + compile_args.resize(36); + compile_args[0] = 0; // 0: reserved + compile_args[1] = mux_settings.cb_start_address >> 4; // 1: rx_queue_start_addr_words + compile_args[2] = mux_settings.cb_size_bytes >> 4; // 2: rx_queue_size_words + compile_args[3] = mux_fanin; // 3: router_lanes + + uint32_t connections_remaining = mux_fanin; + for (int i = 0; (i < MAX_SWITCH_FAN_IN) && (connections_remaining); i++) { + compile_args[4 + i] = packet_switch_4B_pack((uint32_t)tunneler_settings.worker_physical_core.x, + (uint32_t)tunneler_settings.worker_physical_core.y, + i + (mux_id * MAX_SWITCH_FAN_IN), + (uint32_t)DispatchRemoteNetworkType::NOC0); // 4, 5, 6, 7: dest x info + compile_args[8 + i * 2] = (tunneler_settings.cb_start_address + (i + mux_id * MAX_SWITCH_FAN_IN) * tunneler_settings.cb_size_bytes) >> 4; + compile_args[9 + i * 2] = tunneler_settings.cb_size_bytes >> 4; + connections_remaining--; + } - for (int i = 0; i < num_prefetchers; i++) { - compile_args[4 + i] = packet_switch_4B_pack((uint32_t)tunneler_settings.worker_physical_core.x, - (uint32_t)tunneler_settings.worker_physical_core.y, - i, - (uint32_t)DispatchRemoteNetworkType::NOC0); // 4, 5, 6, 7: dest x info - compile_args[8 + i * 2] = (tunneler_settings.cb_start_address + i * tunneler_settings.cb_size_bytes) >> 4; - compile_args[9 + i * 2] = tunneler_settings.cb_size_bytes >> 4; - } - - uint32_t arg_index = 16; - for (auto&[core, settings] : device_worker_variants[DispatchWorkerType::PREFETCH]) { - compile_args[arg_index++] = packet_switch_4B_pack((uint32_t)settings.worker_physical_core.x, - (uint32_t)settings.worker_physical_core.y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0); // 16,17,18,19: src x info - } + uint32_t arg_index = 16; + connections_remaining = mux_fanin; + for (int i = 0; (i < MAX_SWITCH_FAN_IN) && (connections_remaining); i++) { + auto&[core, settings] = device_worker_variants[DispatchWorkerType::PREFETCH][i * num_muxes + mux_id]; + compile_args[arg_index++] = packet_switch_4B_pack((uint32_t)settings.worker_physical_core.x, + (uint32_t)settings.worker_physical_core.y, + 1, + (uint32_t)DispatchRemoteNetworkType::NOC0); // 16,17,18,19: src x info + connections_remaining--; + } - compile_args[22] = 0; // 14: test_results_addr (disabled) - compile_args[23] = 0; // 15: test_results_size (disabled) - compile_args[24] = 0; // 16: timeout_cycles - compile_args[25] = 0x0; // 17: output_depacketize - compile_args[26] = 0x0; // 18: output_depacketize info dest 0 - compile_args[27] = 0x0; // 19: output_depacketize info dest 1 - compile_args[28] = 0x0; // 20: output_depacketize info dest 2 - compile_args[29] = 0x0; // 21: output_depacketize info dest 3 - arg_index = 30; // 22, 23, 24, 25: input x packetize info: - for (auto&[core, settings] : device_worker_variants[DispatchWorkerType::PREFETCH]) { - compile_args[arg_index++] = packet_switch_4B_pack(0x1, - dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE, - settings.producer_semaphore_id, // upstream sem - mux_sem++); // local sem + compile_args[22] = 0; // 14: test_results_addr (disabled) + compile_args[23] = 0; // 15: test_results_size (disabled) + compile_args[24] = 0; // 16: timeout_cycles + compile_args[25] = 0x0; // 17: output_depacketize + compile_args[26] = 0x0; // 18: 
output_depacketize info dest 0 + compile_args[27] = 0x0; // 19: output_depacketize info dest 1 + compile_args[28] = 0x0; // 20: output_depacketize info dest 2 + compile_args[29] = 0x0; // 21: output_depacketize info dest 3 + arg_index = 30; // 22, 23, 24, 25: input x packetize info: + + connections_remaining = mux_fanin; + for (int i = 0; (i < MAX_SWITCH_FAN_IN) && (connections_remaining); i++) { + auto&[core, settings] = device_worker_variants[DispatchWorkerType::PREFETCH][i * num_muxes + mux_id]; + compile_args[arg_index++] = packet_switch_4B_pack(0x1, + dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE, + settings.producer_semaphore_id, // upstream sem + mux_sem++); // local sem + + //FIXME: connections_remaining-- ?? + } + uint32_t src_id_start = 0xA1 + mux_id * MAX_SWITCH_FAN_IN; + uint32_t dst_id_start = 0xB1 + mux_id * MAX_SWITCH_FAN_IN; + compile_args[34] = packet_switch_4B_pack(src_id_start, src_id_start + 1, src_id_start + 2, src_id_start + 3); // 26: packetized input src id + compile_args[35] = packet_switch_4B_pack(dst_id_start, dst_id_start + 1, dst_id_start + 2, dst_id_start + 3); // 26: packetized input dest id + mux_id++; } - compile_args[34] = packet_switch_4B_pack(0xA1, 0xA2, 0xA3, 0xA4); // 26: packetized input src id - compile_args[35] = packet_switch_4B_pack(0xB1, 0xB2, 0xB3, 0xB4); // 26: packetized input dest id break; } case DispatchWorkerType::US_TUNNELER_REMOTE: @@ -863,7 +892,6 @@ void Device::update_workers_build_settings(std::vector(device_worker_variants[DispatchWorkerType::DEMUX][0]); - auto &mux_settings = std::get<1>(device_worker_variants[DispatchWorkerType::MUX][0]); compile_args[4 + return_vc] = packet_switch_4B_pack(demux_settings.worker_physical_core.x, demux_settings.worker_physical_core.y, @@ -871,30 +899,42 @@ void Device::update_workers_build_settings(std::vector> 4; // 8: remote_receiver_queue_start_addr_words return vc compile_args[15 + return_vc * 2] = demux_settings.cb_size_bytes >> 4; // 9: remote_receiver_queue_size_words return vc - - for (uint32_t i = 0; i < fwd_vc_count; i++) { - compile_args[34 + i] = packet_switch_4B_pack(mux_settings.worker_physical_core.x, - mux_settings.worker_physical_core.y, - fwd_vc_count + i, // mux output queue id - (uint32_t)DispatchRemoteNetworkType::NOC0); // 10: remote_sender fwd vcs + uint32_t arg_index = 34; + for (auto&[mux_core, mux_settings] : device_worker_variants[DispatchWorkerType::MUX]) { + uint32_t mux_output_q_id_start = mux_settings.semaphores.size(); + uint32_t connections_remaining = mux_settings.semaphores.size(); + for (uint32_t i = 0; i < connections_remaining; i++) { + compile_args[arg_index++] = packet_switch_4B_pack(mux_settings.worker_physical_core.x, + mux_settings.worker_physical_core.y, + mux_output_q_id_start + i, // mux output queue id + (uint32_t)DispatchRemoteNetworkType::NOC0); // 10: remote_sender fwd vcs + } } } else { auto &mux_d_settings = std::get<1>(device_worker_variants[DispatchWorkerType::MUX_D][0]); - auto &demux_d_settings = std::get<1>(device_worker_variants[DispatchWorkerType::DEMUX_D][0]); uint32_t prefetch_d_count = device_worker_variants[DispatchWorkerType::PREFETCH_D].size(); compile_args[4 + return_vc] = packet_switch_4B_pack(mux_d_settings.worker_physical_core.x, mux_d_settings.worker_physical_core.y, - 1,//mux_d input. This is return path from next tunnel stop towards mmio device. + mux_d_settings.semaphores.size(),//mux_d input. This is return path from next tunnel stop towards mmio device. 
//mux_d iput 0 is driven by local Dispatch D (uint32_t)DispatchRemoteNetworkType::NOC0); // 5: remote_receiver return vc - compile_args[14 + return_vc * 2] = (mux_d_settings.cb_start_address + mux_d_settings.cb_size_bytes) >> 4; // 8: remote_receiver_queue_start_addr_words return vc + compile_args[14 + return_vc * 2] = (mux_d_settings.cb_start_address + mux_d_settings.semaphores.size() * mux_d_settings.cb_size_bytes) >> 4; // 8: remote_receiver_queue_start_addr_words return vc compile_args[15 + return_vc * 2] = mux_d_settings.cb_size_bytes >> 4; // 9: remote_receiver_queue_size_words return vc - for (uint32_t i = 0; i < fwd_vc_count; i++) { - compile_args[34 + i] = packet_switch_4B_pack(demux_d_settings.worker_physical_core.x, - demux_d_settings.worker_physical_core.y, - fwd_vc_count + 2 * prefetch_d_count + i, // demux output queue id. 0=> demux input, 1=> demux_d output to local Prefetch D, 2=> demux_d output to tunneler (to next tunnel stop) - (uint32_t)DispatchRemoteNetworkType::NOC0); // 10: remote_sender fwd vcs + uint32_t arg_index = 34; + uint32_t local_fanout = 1; + uint32_t vcs_per_demux_d = fwd_vc_count + prefetch_d_count - ((fwd_vc_count + prefetch_d_count) / 2); + + for (auto&[demux_d_core, demux_d_settings] : device_worker_variants[DispatchWorkerType::DEMUX_D]) { + uint32_t demux_d_output_q_id_start = vcs_per_demux_d; + for (uint32_t i = local_fanout; i < vcs_per_demux_d; i++) { + compile_args[arg_index++] = packet_switch_4B_pack(demux_d_settings.worker_physical_core.x, + demux_d_settings.worker_physical_core.y, + demux_d_output_q_id_start + i, // demux output queue id. 0=> demux input, 1=> demux_d output to local Prefetch D, 2=> demux_d output to tunneler (to next tunnel stop) + (uint32_t)DispatchRemoteNetworkType::NOC0); // 10: remote_sender fwd vcs + } + vcs_per_demux_d = (fwd_vc_count + prefetch_d_count) / 2; + local_fanout = prefetch_d_count - 1; } } @@ -915,99 +955,278 @@ void Device::update_workers_build_settings(std::vector(device_worker_variants[DispatchWorkerType::US_TUNNELER_REMOTE][0]); - auto &demux_settings = std::get<1>(device_worker_variants[DispatchWorkerType::DEMUX][0]); - auto &dispatch_settings = std::get<1>(device_worker_variants[DispatchWorkerType::DISPATCH][0]); + if (device_worker_variants[DispatchWorkerType::DEMUX].size() == 1) { + auto &tunneler_settings = std::get<1>(device_worker_variants[DispatchWorkerType::US_TUNNELER_REMOTE][0]); + auto &demux_settings = std::get<1>(device_worker_variants[DispatchWorkerType::DEMUX][0]); + auto &dispatch_settings = std::get<1>(device_worker_variants[DispatchWorkerType::DISPATCH][0]); + + auto &compile_args = demux_settings.compile_args; + compile_args.resize(30); + + compile_args[0] = 0xD1; // 0: endpoint_id_start_index + compile_args[1] = demux_settings.cb_start_address >> 4; // 1: rx_queue_start_addr_words + compile_args[2] = demux_settings.cb_size_bytes >> 4; // 2: rx_queue_size_words + compile_args[3] = device_worker_variants[DispatchWorkerType::DISPATCH].size(); // 3: demux_fan_out + + uint32_t arg_index = 4; + for (auto&[core, settings] : device_worker_variants[DispatchWorkerType::DISPATCH]) { + compile_args[arg_index++] = packet_switch_4B_pack((uint32_t)settings.worker_physical_core.x, + (uint32_t)settings.worker_physical_core.y, + 0, + (uint32_t)DispatchRemoteNetworkType::NOC0); // 4,5,6,7: remote_tx_x_info + } + arg_index = 8; + for (auto&[core, settings] : device_worker_variants[DispatchWorkerType::DISPATCH]) { + compile_args[arg_index++] = settings.cb_start_address >> 4; // 8, 10, 12, 14: 
remote_tx_queue_start_addr_words x + compile_args[arg_index++] = settings.cb_size_bytes >> 4; // 9, 11, 13, 15: remote_tx_queue_size_words x + } + compile_args[16] = tunneler_settings.worker_physical_core.x; // 16: remote_rx_x + compile_args[17] = tunneler_settings.worker_physical_core.y; // 17: remote_rx_y + compile_args[18] = tunneler_settings.vc_count * 2 - 1; // 18: remote_rx_queue_id + compile_args[19] = (uint32_t)DispatchRemoteNetworkType::NOC0; // 19: tx_network_type + uint32_t dest_map_array[4] = {0, 1, 2, 3}; + uint64_t dest_endpoint_output_map = packet_switch_dest_pack(dest_map_array, 4); + compile_args[20] = (uint32_t)(dest_endpoint_output_map >> 32); // 20: dest_endpoint_output_map_hi + compile_args[21] = (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF); // 21: dest_endpoint_output_map_lo + compile_args[22] = 0; // 22: test_results_addr (disabled) + compile_args[23] = 0; // 23: test_results_size (disabled) + compile_args[24] = 0; // 24: timeout_cycles + compile_args[25] = 0xF; // 25: output_depacketize_mask + arg_index = 26; + uint32_t demux_sem = demux_settings.producer_semaphore_id; + for (auto&[core, settings] : device_worker_variants[DispatchWorkerType::DISPATCH]) { + // 26, 27, 28, 29: output x depacketize info: + compile_args[arg_index++] = packet_switch_4B_pack(settings.cb_log_page_size, + settings.consumer_semaphore_id, // downstream sem + demux_sem++, // local sem + 1); // remove header + } + } else if (device_worker_variants[DispatchWorkerType::DEMUX].size() == 3) { + //Galaxy 2CQ requires three demux cores. tunneler->1x2->1x4(2x)->Dispatch(8x) + auto &tunneler_settings = std::get<1>(device_worker_variants[DispatchWorkerType::US_TUNNELER_REMOTE][0]); + auto &demux_settings = std::get<1>(device_worker_variants[DispatchWorkerType::DEMUX][0]); + auto &demux_1_settings = std::get<1>(device_worker_variants[DispatchWorkerType::DEMUX][1]); + auto &demux_2_settings = std::get<1>(device_worker_variants[DispatchWorkerType::DEMUX][2]); + + auto &compile_args = demux_settings.compile_args; + compile_args.resize(30); + compile_args[0] = 0xD1; // 0: endpoint_id_start_index + compile_args[1] = demux_settings.cb_start_address >> 4; // 1: rx_queue_start_addr_words + compile_args[2] = demux_settings.cb_size_bytes >> 4; // 2: rx_queue_size_words + compile_args[3] = 2; // 3: demux_fan_out + + compile_args[4] = packet_switch_4B_pack((uint32_t)demux_1_settings.worker_physical_core.x, + (uint32_t)demux_1_settings.worker_physical_core.y, + 0, + (uint32_t)DispatchRemoteNetworkType::NOC0); // 4,5,6,7: remote_tx_x_info + compile_args[5] = packet_switch_4B_pack((uint32_t)demux_2_settings.worker_physical_core.x, + (uint32_t)demux_2_settings.worker_physical_core.y, + 0, + (uint32_t)DispatchRemoteNetworkType::NOC0); // 4,5,6,7: remote_tx_x_info - auto &compile_args = demux_settings.compile_args; - compile_args.resize(30); + compile_args[8] = demux_1_settings.cb_start_address >> 4; // 8: remote_tx_queue_start_addr_words x + compile_args[9] = demux_1_settings.cb_size_bytes >> 4; // 9: remote_tx_queue_size_words x + compile_args[10] = demux_2_settings.cb_start_address >> 4; // 10: remote_tx_queue_start_addr_words x + compile_args[11] = demux_2_settings.cb_size_bytes >> 4; // 11: remote_tx_queue_size_words x - compile_args[0] = 0xD1; // 0: endpoint_id_start_index - compile_args[1] = demux_settings.cb_start_address >> 4; // 1: rx_queue_start_addr_words - compile_args[2] = demux_settings.cb_size_bytes >> 4; // 2: rx_queue_size_words - compile_args[3] = 
device_worker_variants[DispatchWorkerType::DISPATCH].size(); // 3: demux_fan_out + compile_args[16] = tunneler_settings.worker_physical_core.x; // 16: remote_rx_x + compile_args[17] = tunneler_settings.worker_physical_core.y; // 17: remote_rx_y + compile_args[18] = tunneler_settings.vc_count * 2 - 1; // 18: remote_rx_queue_id + compile_args[19] = (uint32_t)DispatchRemoteNetworkType::NOC0; // 19: tx_network_type - uint32_t arg_index = 4; - for (auto&[core, settings] : device_worker_variants[DispatchWorkerType::DISPATCH]) { - compile_args[arg_index++] = packet_switch_4B_pack((uint32_t)settings.worker_physical_core.x, - (uint32_t)settings.worker_physical_core.y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0); // 4,5,6,7: remote_tx_x_info - } - arg_index = 8; - for (auto&[core, settings] : device_worker_variants[DispatchWorkerType::DISPATCH]) { - compile_args[arg_index++] = settings.cb_start_address >> 4; // 8, 10, 12, 14: remote_tx_queue_start_addr_words x - compile_args[arg_index++] = settings.cb_size_bytes >> 4; // 9, 11, 13, 15: remote_tx_queue_size_words x - } - compile_args[16] = tunneler_settings.worker_physical_core.x; // 16: remote_rx_x - compile_args[17] = tunneler_settings.worker_physical_core.y; // 17: remote_rx_y - compile_args[18] = tunneler_settings.vc_count * 2 - 1; // 18: remote_rx_queue_id - compile_args[19] = (uint32_t)DispatchRemoteNetworkType::NOC0; // 19: tx_network_type - uint32_t dest_map_array[4] = {0, 1, 2, 3}; - uint64_t dest_endpoint_output_map = packet_switch_dest_pack(dest_map_array, 4); - compile_args[20] = (uint32_t)(dest_endpoint_output_map >> 32); // 20: dest_endpoint_output_map_hi - compile_args[21] = (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF); // 21: dest_endpoint_output_map_lo - compile_args[22] = 0; // 22: test_results_addr (disabled) - compile_args[23] = 0; // 23: test_results_size (disabled) - compile_args[24] = 0; // 24: timeout_cycles - compile_args[25] = 0xF; // 25: output_depacketize_mask - arg_index = 26; - uint32_t demux_sem = demux_settings.producer_semaphore_id; - for (auto&[core, settings] : device_worker_variants[DispatchWorkerType::DISPATCH]) { - // 26, 27, 28, 29: output x depacketize info: - compile_args[arg_index++] = packet_switch_4B_pack(settings.cb_log_page_size, - settings.consumer_semaphore_id, // downstream sem - demux_sem++, // local sem - 1); // remove header + uint64_t dest_endpoint_output_map; + if (device_worker_variants[DispatchWorkerType::DISPATCH].size() == 4) { + uint32_t dest_map_array[4] = {0, 0, 1, 1}; + dest_endpoint_output_map = packet_switch_dest_pack(dest_map_array, 4); + } else { + uint32_t dest_map_array[8] = {0, 0, 0, 0, 1, 1, 1, 1}; + dest_endpoint_output_map = packet_switch_dest_pack(dest_map_array, 8); + } + compile_args[20] = (uint32_t)(dest_endpoint_output_map >> 32); // 20: dest_endpoint_output_map_hi + compile_args[21] = (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF); // 21: dest_endpoint_output_map_lo + + uint32_t demux_1_fanout = device_worker_variants[DispatchWorkerType::DISPATCH].size() / 2; + auto &demux_1_compile_args = demux_1_settings.compile_args; + demux_1_compile_args.resize(30); + + demux_1_compile_args[0] = 0xD1; // 0: endpoint_id_start_index + demux_1_compile_args[1] = demux_1_settings.cb_start_address >> 4; // 1: rx_queue_start_addr_words + demux_1_compile_args[2] = demux_1_settings.cb_size_bytes >> 4; // 2: rx_queue_size_words + demux_1_compile_args[3] = demux_1_fanout; // 3: demux_fan_out + + for (int i = 0; i < demux_1_fanout; i++) { + auto &settings = 
std::get<1>(device_worker_variants[DispatchWorkerType::DISPATCH][i]); + demux_1_compile_args[4 + i] = packet_switch_4B_pack((uint32_t)settings.worker_physical_core.x, + (uint32_t)settings.worker_physical_core.y, + 0, + (uint32_t)DispatchRemoteNetworkType::NOC0); // 4,5,6,7: remote_tx_x_info + + demux_1_compile_args[8 + i * 2] = settings.cb_start_address >> 4; // 8, 10, 12, 14: remote_tx_queue_start_addr_words x + demux_1_compile_args[9 + i * 2] = settings.cb_size_bytes >> 4; // 9, 11, 13, 15: remote_tx_queue_size_words x + } + demux_1_compile_args[16] = demux_settings.worker_physical_core.x; // 16: remote_rx_x + demux_1_compile_args[17] = demux_settings.worker_physical_core.y; // 17: remote_rx_y + demux_1_compile_args[18] = 1; // 18: remote_rx_queue_id + demux_1_compile_args[19] = (uint32_t)DispatchRemoteNetworkType::NOC0; // 19: tx_network_type + uint32_t dest_map_array[4] = {0, 1, 2, 3}; + dest_endpoint_output_map = packet_switch_dest_pack(dest_map_array, 4); + demux_1_compile_args[20] = (uint32_t)(dest_endpoint_output_map >> 32); // 20: dest_endpoint_output_map_hi + demux_1_compile_args[21] = (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF); // 21: dest_endpoint_output_map_lo + demux_1_compile_args[22] = 0; // 22: test_results_addr (disabled) + demux_1_compile_args[23] = 0; // 23: test_results_size (disabled) + demux_1_compile_args[24] = 0; // 24: timeout_cycles + demux_1_compile_args[25] = 0xF >> (4 - demux_1_fanout); // 25: output_depacketize_mask + + uint32_t demux_sem = demux_1_settings.producer_semaphore_id; + for (int i = 0; i < demux_1_fanout; i++) { + // 26, 27, 28, 29: output x depacketize info: + auto &settings = std::get<1>(device_worker_variants[DispatchWorkerType::DISPATCH][i]); + demux_1_compile_args[26 + i] = packet_switch_4B_pack(settings.cb_log_page_size, + settings.consumer_semaphore_id, // downstream sem + demux_sem++, // local sem + 1); // remove header + } + + uint32_t demux_2_fanout = device_worker_variants[DispatchWorkerType::DISPATCH].size() / 2; + auto &demux_2_compile_args = demux_2_settings.compile_args; + demux_2_compile_args.resize(30); + + demux_2_compile_args[0] = 0xD1 + demux_1_fanout; // 0: endpoint_id_start_index + demux_2_compile_args[1] = demux_2_settings.cb_start_address >> 4; // 1: rx_queue_start_addr_words + demux_2_compile_args[2] = demux_2_settings.cb_size_bytes >> 4; // 2: rx_queue_size_words + demux_2_compile_args[3] = demux_2_fanout; // 3: demux_fan_out + + for (int i = 0; i < demux_2_fanout; i++) { + auto &settings = std::get<1>(device_worker_variants[DispatchWorkerType::DISPATCH][i + demux_1_fanout]); + demux_2_compile_args[4 + i] = packet_switch_4B_pack((uint32_t)settings.worker_physical_core.x, + (uint32_t)settings.worker_physical_core.y, + 0, + (uint32_t)DispatchRemoteNetworkType::NOC0); // 4,5,6,7: remote_tx_x_info + + demux_2_compile_args[8 + i * 2] = settings.cb_start_address >> 4; // 8, 10, 12, 14: remote_tx_queue_start_addr_words x + demux_2_compile_args[9 + i * 2] = settings.cb_size_bytes >> 4; // 9, 11, 13, 15: remote_tx_queue_size_words x + } + demux_2_compile_args[16] = demux_settings.worker_physical_core.x; // 16: remote_rx_x + demux_2_compile_args[17] = demux_settings.worker_physical_core.y; // 17: remote_rx_y + demux_2_compile_args[18] = 2; // 18: remote_rx_queue_id + demux_2_compile_args[19] = (uint32_t)DispatchRemoteNetworkType::NOC0; // 19: tx_network_type + dest_endpoint_output_map = packet_switch_dest_pack(dest_map_array, 4); + demux_2_compile_args[20] = (uint32_t)(dest_endpoint_output_map >> 32); // 20: 
dest_endpoint_output_map_hi + demux_2_compile_args[21] = (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF); // 21: dest_endpoint_output_map_lo + demux_2_compile_args[22] = 0; // 22: test_results_addr (disabled) + demux_2_compile_args[23] = 0; // 23: test_results_size (disabled) + demux_2_compile_args[24] = 0; // 24: timeout_cycles + demux_2_compile_args[25] = 0xF >> (4 - demux_2_fanout); // 25: output_depacketize_mask + + demux_sem = demux_2_settings.producer_semaphore_id; + for (int i = 0; i < demux_2_fanout; i++) { + // 26, 27, 28, 29: output x depacketize info: + auto &settings = std::get<1>(device_worker_variants[DispatchWorkerType::DISPATCH][i + demux_1_fanout]); + demux_2_compile_args[26 + i] = packet_switch_4B_pack(settings.cb_log_page_size, + settings.consumer_semaphore_id, // downstream sem + demux_sem++, // local sem + 1); // remove header + } + + } else { + TT_ASSERT(false, "Unsupported DEMUX core count {}", device_worker_variants[DispatchWorkerType::DEMUX].size()); } break; } case DispatchWorkerType::DISPATCH: { uint32_t num_dispatchers = device_worker_variants[DispatchWorkerType::DISPATCH].size(); - TT_ASSERT(device_worker_variants[DispatchWorkerType::DEMUX].size() == 1, "Cannot have more than one Demux."); - auto demux_settings = std::get<1>(device_worker_variants[DispatchWorkerType::DEMUX][0]); - TT_ASSERT(num_dispatchers == demux_settings.semaphores.size(), "Demux does not have required number of semaphores for Dispatchers. Exptected = {}. Found = {}", num_dispatchers, demux_settings.semaphores.size()); - uint32_t demux_sem = demux_settings.producer_semaphore_id; - uint32_t dispatch_idx = 0; - for (auto&[core, settings] : device_worker_variants[DispatchWorkerType::DISPATCH]) { - auto prefetch_h_settings = std::get<1>(device_worker_variants[DispatchWorkerType::PREFETCH][dispatch_idx]); - auto prefetch_physical_core = prefetch_h_settings.worker_physical_core; - auto dispatch_core_type = settings.dispatch_core_type; - settings.upstream_cores.push_back(demux_settings.worker_physical_core); - settings.downstream_cores.push_back(tt_cxy_pair(0, 0, 0)); - settings.compile_args.resize(27); - auto& compile_args = settings.compile_args; - compile_args[0] = settings.cb_start_address; - compile_args[1] = settings.cb_log_page_size; - compile_args[2] = settings.cb_pages; - compile_args[3] = settings.consumer_semaphore_id; - compile_args[4] = demux_sem++; - compile_args[5] = dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS; - compile_args[6] = 0; //unused prefetch_sync_sem - compile_args[7] = settings.command_queue_start_addr; - compile_args[8] = settings.completion_queue_start_addr; - compile_args[9] = settings.completion_queue_size; - compile_args[10] = dispatch_constants::DISPATCH_BUFFER_BASE; // unused - compile_args[11] = (1 << dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE) * dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(); // unused - compile_args[12] = 0; // unused: local ds semaphore - compile_args[13] = 0; // unused: remote ds semaphore - compile_args[14] = 0; // preamble size - compile_args[15] = true, // split_prefetcher - compile_args[16] = NOC_XY_ENCODING(prefetch_physical_core.x, prefetch_physical_core.y), - compile_args[17] = prefetch_h_settings.producer_semaphore_id, // sem_id on prefetch_h that dispatch_d is meant to increment, to resume sending of cmds post exec_buf stall - compile_args[18] = dispatch_constants::get(dispatch_core_type).mux_buffer_pages(num_hw_cqs), // XXXX should this be mux pages? 
- compile_args[19] = settings.num_compute_cores; - compile_args[20] = 0; // unused: dispatch_d only - compile_args[21] = 0; // unused: dispatch_d only - compile_args[22] = 0; // unused: dispatch_d only - compile_args[23] = 0; // unused: dispatch_d only - compile_args[24] = 0; - compile_args[25] = false; // is_dram_variant - compile_args[26] = true; // is_host_variant - - dispatch_idx++; + if (num_dispatchers == 1 || num_dispatchers == 2) { + TT_ASSERT(device_worker_variants[DispatchWorkerType::DEMUX].size() == 1, "Cannot have more than one Demux."); + auto demux_settings = std::get<1>(device_worker_variants[DispatchWorkerType::DEMUX][0]); + TT_ASSERT(num_dispatchers == demux_settings.semaphores.size(), "Demux does not have required number of semaphores for Dispatchers. Exptected = {}. Found = {}", num_dispatchers, demux_settings.semaphores.size()); + uint32_t demux_sem = demux_settings.producer_semaphore_id; + uint32_t dispatch_idx = 0; + for (auto&[core, settings] : device_worker_variants[DispatchWorkerType::DISPATCH]) { + auto prefetch_h_settings = std::get<1>(device_worker_variants[DispatchWorkerType::PREFETCH][dispatch_idx]); + auto prefetch_physical_core = prefetch_h_settings.worker_physical_core; + auto dispatch_core_type = settings.dispatch_core_type; + settings.upstream_cores.push_back(demux_settings.worker_physical_core); + settings.downstream_cores.push_back(tt_cxy_pair(0, 0, 0)); + settings.compile_args.resize(27); + auto& compile_args = settings.compile_args; + compile_args[0] = settings.cb_start_address; + compile_args[1] = settings.cb_log_page_size; + compile_args[2] = settings.cb_pages; + compile_args[3] = settings.consumer_semaphore_id; + compile_args[4] = demux_sem++; + compile_args[5] = dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS; + compile_args[6] = 0; //unused prefetch_sync_sem + compile_args[7] = settings.command_queue_start_addr; + compile_args[8] = settings.completion_queue_start_addr; + compile_args[9] = settings.completion_queue_size; + compile_args[10] = dispatch_constants::DISPATCH_BUFFER_BASE; // unused + compile_args[11] = (1 << dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE) * dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(); // unused + compile_args[12] = 0; // unused: local ds semaphore + compile_args[13] = 0; // unused: remote ds semaphore + compile_args[14] = 0; // preamble size + compile_args[15] = true, // split_prefetcher + compile_args[16] = NOC_XY_ENCODING(prefetch_physical_core.x, prefetch_physical_core.y), + compile_args[17] = prefetch_h_settings.producer_semaphore_id, // sem_id on prefetch_h that dispatch_d is meant to increment, to resume sending of cmds post exec_buf stall + compile_args[18] = dispatch_constants::get(dispatch_core_type).mux_buffer_pages(num_hw_cqs), // XXXX should this be mux pages? + compile_args[19] = settings.num_compute_cores; + compile_args[20] = 0; // unused: dispatch_d only + compile_args[21] = 0; // unused: dispatch_d only + compile_args[22] = 0; // unused: dispatch_d only + compile_args[23] = 0; // unused: dispatch_d only + compile_args[24] = 0; + compile_args[25] = false; // is_dram_variant + compile_args[26] = true; // is_host_variant + + dispatch_idx++; + } + } else if (num_dispatchers == 4 || num_dispatchers == 8) { + TT_ASSERT(device_worker_variants[DispatchWorkerType::DEMUX].size() == 3, "Insufficient Demux cores. Expected = 3. 
Found = {}", device_worker_variants[DispatchWorkerType::DEMUX].size()); + uint32_t dispatch_idx = 0; + uint32_t demux_fanout = num_dispatchers / 2; + for (int i = 1; i < 3; i++) { + auto demux_settings = std::get<1>(device_worker_variants[DispatchWorkerType::DEMUX][i]); + TT_ASSERT(demux_fanout == demux_settings.semaphores.size(), "Demux does not have required number of semaphores for Dispatchers. Exptected = {}. Found = {}", num_dispatchers / 2, demux_settings.semaphores.size()); + uint32_t demux_sem = demux_settings.producer_semaphore_id; + for (int d = 0; d < demux_fanout; d++) { + auto &settings = std::get<1>(device_worker_variants[DispatchWorkerType::DISPATCH][dispatch_idx]); + auto prefetch_h_settings = std::get<1>(device_worker_variants[DispatchWorkerType::PREFETCH][dispatch_idx]); + auto prefetch_physical_core = prefetch_h_settings.worker_physical_core; + auto dispatch_core_type = settings.dispatch_core_type; + settings.upstream_cores.push_back(demux_settings.worker_physical_core); + settings.downstream_cores.push_back(tt_cxy_pair(0, 0, 0)); + settings.compile_args.resize(27); + auto& compile_args = settings.compile_args; + compile_args[0] = settings.cb_start_address; + compile_args[1] = settings.cb_log_page_size; + compile_args[2] = settings.cb_pages; + compile_args[3] = settings.consumer_semaphore_id; + compile_args[4] = demux_sem++; + compile_args[5] = dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS; + compile_args[6] = 0; //unused prefetch_sync_sem + compile_args[7] = settings.command_queue_start_addr; + compile_args[8] = settings.completion_queue_start_addr; + compile_args[9] = settings.completion_queue_size; + compile_args[10] = dispatch_constants::DISPATCH_BUFFER_BASE; // unused + compile_args[11] = (1 << dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE) * dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(); // unused + compile_args[12] = 0; // unused: local ds semaphore + compile_args[13] = 0; // unused: remote ds semaphore + compile_args[14] = 0; // preamble size + compile_args[15] = true, // split_prefetcher + compile_args[16] = NOC_XY_ENCODING(prefetch_physical_core.x, prefetch_physical_core.y), + compile_args[17] = prefetch_h_settings.producer_semaphore_id, // sem_id on prefetch_h that dispatch_d is meant to increment, to resume sending of cmds post exec_buf stall + compile_args[18] = dispatch_constants::get(dispatch_core_type).mux_buffer_pages(num_hw_cqs), // XXXX should this be mux pages? + compile_args[19] = settings.num_compute_cores; + compile_args[20] = 0; // unused: dispatch_d only + compile_args[21] = 0; // unused: dispatch_d only + compile_args[22] = 0; // unused: dispatch_d only + compile_args[23] = 0; // unused: dispatch_d only + compile_args[24] = 0; + compile_args[25] = false; // is_dram_variant + compile_args[26] = true; // is_host_variant + dispatch_idx++; + } + } } break; } @@ -1016,25 +1235,33 @@ void Device::update_workers_build_settings(std::vector(device_worker_variants[DispatchWorkerType::US_TUNNELER_LOCAL][0]); - auto &demux_d_settings = std::get<1>(device_worker_variants[DispatchWorkerType::DEMUX_D][0]); auto &mux_d_settings = std::get<1>(device_worker_variants[DispatchWorkerType::MUX_D][0]); uint32_t fwd_vc_count = tunneler_settings.vc_count - 1; uint32_t return_vc = fwd_vc_count; + uint32_t local_tunneler_vcs_connected = 0; auto &compile_args = tunneler_settings.compile_args; + + uint32_t num_demux_d = device_worker_variants[DispatchWorkerType::DEMUX_D].size(); + uint32_t vcs_per_demux_d = num_demux_d == 1 ? 
fwd_vc_count : fwd_vc_count - (fwd_vc_count / 2); + compile_args.resize(48); compile_args[0] = 0xDACADACA; // 0: endpoint_id_start_index compile_args[1] = tunneler_settings.vc_count; // tunnel_lanes. 1 => Unidirectional. 2 => Bidirectional. compile_args[2] = tunneler_settings.cb_start_address >> 4; // 2: rx_queue_start_addr_words compile_args[3] = tunneler_settings.cb_size_bytes >> 4; // 3: rx_queue_size_words - for (int i = 0; i < fwd_vc_count; i++) { - compile_args[4 + i] = packet_switch_4B_pack(demux_d_settings.worker_physical_core.x, - demux_d_settings.worker_physical_core.y, - i, // input queue id of DEMUX_D - (uint32_t)DispatchRemoteNetworkType::NOC0); // 4: remote_receiver_0_info + for (auto&[core, demux_d_settings] : device_worker_variants[DispatchWorkerType::DEMUX_D]) { + for (int i = 0; i < vcs_per_demux_d; i++) { + compile_args[4 + i + local_tunneler_vcs_connected] = packet_switch_4B_pack(demux_d_settings.worker_physical_core.x, + demux_d_settings.worker_physical_core.y, + i, // input queue id of DEMUX_D + (uint32_t)DispatchRemoteNetworkType::NOC0); // 4: remote_receiver_0_info - compile_args[14 + i * 2] = (demux_d_settings.cb_start_address + i * demux_d_settings.cb_size_bytes) >> 4; // 14 - 32: remote_receiver_queue_start_addr_words fwd vcs - compile_args[15 + i * 2] = demux_d_settings.cb_size_bytes >> 4; // 15 - 33: remote_receiver_queue_size_words fwd vcs + compile_args[14 + (i + local_tunneler_vcs_connected) * 2] = (demux_d_settings.cb_start_address + i * demux_d_settings.cb_size_bytes) >> 4; // 14 - 32: remote_receiver_queue_start_addr_words fwd vcs + compile_args[15 + (i + local_tunneler_vcs_connected) * 2] = demux_d_settings.cb_size_bytes >> 4; // 15 - 33: remote_receiver_queue_size_words fwd vcs + } + local_tunneler_vcs_connected += vcs_per_demux_d; + vcs_per_demux_d = fwd_vc_count - vcs_per_demux_d; } compile_args[4 + return_vc] = packet_switch_4B_pack(tunneler_settings.eth_partner_physical_core.x, @@ -1049,10 +1276,10 @@ void Device::update_workers_build_settings(std::vector(device_worker_variants[DispatchWorkerType::US_TUNNELER_REMOTE][0]); auto mux_d_sender = us_tunneler_remote_settings.worker_physical_core; compile_args[47] = (return_vc << 24) | ((us_tunneler_remote_settings.vc_count * 2 - 1) << 16) | (mux_d_sender.y << 8) | (mux_d_sender.x); - log_debug(tt::LogMetal, "Tunner Inner Device {} will send done to {}", tunneler_settings.worker_physical_core.str(), mux_d_sender.str()); + log_debug(tt::LogMetal, "Tunnelr Inner Device {} will send done to {}", tunneler_settings.worker_physical_core.str(), mux_d_sender.str()); } break; @@ -1076,82 +1303,120 @@ void Device::update_workers_build_settings(std::vector(device_worker_variants[DispatchWorkerType::US_TUNNELER_LOCAL][0]); - auto &demux_d_settings = std::get<1>(device_worker_variants[DispatchWorkerType::DEMUX_D][0]); - - TT_ASSERT(demux_d_settings.tunnel_stop > 0 && demux_d_settings.tunnel_stop <= 4, "Invalid Demux D tunnel stop."); uint32_t fwd_vc_count = tunneler_settings.vc_count - 1; uint32_t return_vc = fwd_vc_count; - auto &compile_args = demux_d_settings.compile_args; - compile_args.resize(36); - - compile_args[0] = 0xB1; // 0: endpoint_id_start_index - compile_args[1] = demux_d_settings.cb_start_address >> 4; // 1: rx_queue_start_addr_words - compile_args[2] = demux_d_settings.cb_size_bytes >> 4; // 2: rx_queue_size_words - compile_args[3] = fwd_vc_count; // 3: demux_fan_out - - uint32_t demux_output_idx = 0; - uint32_t demux_output_cb_info_idx = 0; - // Tie DEMUX_D outputs to DEMUX_D output queues (prefetch_d 
and remote tunnel inputs) and set output CB parameters - for (const auto& prefetch_d_settings : device_worker_variants[DispatchWorkerType::PREFETCH_D]) { - auto prefetch_d_setting = std::get<1>(prefetch_d_settings); - compile_args[4 + demux_output_idx] = packet_switch_4B_pack(prefetch_d_setting.worker_physical_core.x, - prefetch_d_setting.worker_physical_core.y, - 0, // prefetch_d input queue id - (uint32_t)DispatchRemoteNetworkType::NOC0); // 4: remote_tx_0_info - compile_args[8 + demux_output_cb_info_idx] = prefetch_d_setting.cb_start_address >> 4; - compile_args[8 + demux_output_cb_info_idx + 1] = prefetch_d_setting.cb_size_bytes >> 4; - demux_output_idx++; - demux_output_cb_info_idx += 2; - } + uint32_t num_demux_d = device_worker_variants[DispatchWorkerType::DEMUX_D].size(); + uint32_t num_prefetch_d = device_worker_variants[DispatchWorkerType::PREFETCH_D].size(); + uint32_t num_prefetch_d_per_demux_d = num_demux_d == 1 ? num_prefetch_d : 1; + uint32_t vcs_per_demux_d = num_demux_d == 1 ? fwd_vc_count : fwd_vc_count - (fwd_vc_count / 2); + uint32_t prefetch_d_connected = 0; + uint32_t local_tunneler_vcs_connected = 0; + uint32_t remote_tunneler_vcs_connected = 0; + + for (auto&[core, demux_d_settings] : device_worker_variants[DispatchWorkerType::DEMUX_D]) { + + if (demux_d_settings.tunnel_stop == 1 && demux_d_settings.vc_count <= 3) { + // N300/T3K 1 - 2 CQs + TT_ASSERT(device_worker_variants[DispatchWorkerType::DEMUX_D].size() == 1, "Unexpected number of device demux."); + } else if ( is_tunnel_end && demux_d_settings.vc_count == 2) { + // TG/TGG 1 CQ, last tunnel chip + TT_ASSERT(device_worker_variants[DispatchWorkerType::DEMUX_D].size() == 1, "Unexpected number of device demux."); + } else { + // TG/TGG 1 - 2 CQ all tunnel chips + TT_ASSERT(device_worker_variants[DispatchWorkerType::DEMUX_D].size() == 2, "Unexpected number of device demux."); + } - fwd_vc_count -= demux_output_idx; - if (!is_tunnel_end) { - auto &us_tunneler_remote_settings = std::get<1>(device_worker_variants[DispatchWorkerType::US_TUNNELER_REMOTE][0]); - TT_ASSERT(fwd_vc_count == us_tunneler_remote_settings.vc_count - 1, "Forward VC count mismatch between DEMUX_D and US_TUNNELER_REMOTE"); - for (int i = 0; i < fwd_vc_count; i++) { - compile_args[4 + demux_output_idx + i] = packet_switch_4B_pack((uint32_t)us_tunneler_remote_settings.worker_physical_core.x, - (uint32_t)us_tunneler_remote_settings.worker_physical_core.y, - i, - (uint32_t)DispatchRemoteNetworkType::NOC0); // 5: remote_tx_1_info - compile_args[8 + (demux_output_idx + i) * 2] = (us_tunneler_remote_settings.cb_start_address + i * us_tunneler_remote_settings.cb_size_bytes) >> 4; // 10: remote_tx_queue_start_addr_words 1 - compile_args[9 + (demux_output_idx + i) * 2] = us_tunneler_remote_settings.cb_size_bytes >> 4; // 11: remote_tx_queue_size_words 1 + TT_ASSERT(demux_d_settings.tunnel_stop > 0 && demux_d_settings.tunnel_stop <= 4, "Invalid Demux D tunnel stop."); + + auto &compile_args = demux_d_settings.compile_args; + compile_args.resize(36); + + compile_args[0] = 0xB1; // 0: endpoint_id_start_index + compile_args[1] = demux_d_settings.cb_start_address >> 4; // 1: rx_queue_start_addr_words + compile_args[2] = demux_d_settings.cb_size_bytes >> 4; // 2: rx_queue_size_words + compile_args[3] = vcs_per_demux_d; // 3: demux_fan_out + + uint32_t demux_output_idx = 0; + uint32_t demux_output_cb_info_idx = 0; + // Tie DEMUX_D outputs to DEMUX_D output queues (prefetch_d and remote tunnel inputs) and set output CB parameters + for (int p = 0; p < 
num_prefetch_d_per_demux_d; p++) { + //for (const auto& prefetch_d_settings : device_worker_variants[DispatchWorkerType::PREFETCH_D]) { + // auto prefetch_d_setting = std::get<1>(prefetch_d_settings); + auto prefetch_d_setting = std::get<1>(device_worker_variants[DispatchWorkerType::PREFETCH_D][p + prefetch_d_connected]); + compile_args[4 + demux_output_idx] = packet_switch_4B_pack(prefetch_d_setting.worker_physical_core.x, + prefetch_d_setting.worker_physical_core.y, + 0, // prefetch_d input queue id + (uint32_t)DispatchRemoteNetworkType::NOC0); // 4: remote_tx_0_info + compile_args[8 + demux_output_cb_info_idx] = prefetch_d_setting.cb_start_address >> 4; + compile_args[8 + demux_output_cb_info_idx + 1] = prefetch_d_setting.cb_size_bytes >> 4; + demux_output_idx++; + demux_output_cb_info_idx += 2; + } + + vcs_per_demux_d -= demux_output_idx; + if (!is_tunnel_end) { + auto &us_tunneler_remote_settings = std::get<1>(device_worker_variants[DispatchWorkerType::US_TUNNELER_REMOTE][0]); + //TT_ASSERT(fwd_vc_count == us_tunneler_remote_settings.vc_count - 1, "Forward VC count mismatch between DEMUX_D and US_TUNNELER_REMOTE"); + for (int i = 0; i < vcs_per_demux_d; i++) { + compile_args[4 + demux_output_idx + i] = packet_switch_4B_pack((uint32_t)us_tunneler_remote_settings.worker_physical_core.x, + (uint32_t)us_tunneler_remote_settings.worker_physical_core.y, + remote_tunneler_vcs_connected, + (uint32_t)DispatchRemoteNetworkType::NOC0); // 5: remote_tx_1_info + compile_args[8 + (demux_output_idx + i) * 2] = (us_tunneler_remote_settings.cb_start_address + remote_tunneler_vcs_connected * us_tunneler_remote_settings.cb_size_bytes) >> 4; // 10: remote_tx_queue_start_addr_words 1 + compile_args[9 + (demux_output_idx + i) * 2] = us_tunneler_remote_settings.cb_size_bytes >> 4; // 11: remote_tx_queue_size_words 1 + remote_tunneler_vcs_connected++; + } + } else { + TT_ASSERT(vcs_per_demux_d == 0, "Unhandled Forward VCs encountered."); + } + + //reset vcs per demux d to demux fanout. + //need to connect local tunneler ports to demux ports. + vcs_per_demux_d = compile_args[3]; + for (int i = 0; i < vcs_per_demux_d; i++) { + compile_args[16 + i] = packet_switch_4B_pack(tunneler_settings.worker_physical_core.x, + tunneler_settings.worker_physical_core.y, + tunneler_settings.vc_count + local_tunneler_vcs_connected++, + (uint32_t)DispatchRemoteNetworkType::NOC0); // 16: remote_rx_0_info } - } else { - TT_ASSERT(fwd_vc_count == 0, "Unhandled Forward VCs encountered."); - } - for (int i = 0; i < tunneler_settings.vc_count - 1; i++) { - compile_args[16 + i] = packet_switch_4B_pack(tunneler_settings.worker_physical_core.x, - tunneler_settings.worker_physical_core.y, - tunneler_settings.vc_count + i, - (uint32_t)DispatchRemoteNetworkType::NOC0); // 16: remote_rx_0_info + uint32_t dest_map_array[4] = {1, 1, 1, 1}; // needs to be based on tunnel stop. 
+ dest_map_array[demux_d_settings.tunnel_stop-1] = 0; + uint64_t dest_endpoint_output_map = packet_switch_dest_pack(dest_map_array, 4); + compile_args[20] = (uint32_t)(dest_endpoint_output_map >> 32); // 20: dest_endpoint_output_map_hi + compile_args[21] = (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF); // 21: dest_endpoint_output_map_lo + compile_args[22] = 0; // 22: test_results_addr (disabled) + compile_args[23] = 0; // 23: test_results_size (disabled) + compile_args[24] = 0; // 24: timeout_cycles + compile_args[25] = 0; // 25: output_depacketize_mask + // Update output_depacketize_mask based on num prefetch_d cores (local demux_d outputs) + for (int prefetch_d_idx = 0; prefetch_d_idx < num_prefetch_d_per_demux_d; prefetch_d_idx++) compile_args[25] |= (1 << (prefetch_d_idx)); + // Set downstream and local sem ids, based on number of demux outputs + uint32_t demux_output_sem_idx = 0; + uint32_t demux_sem = demux_d_settings.producer_semaphore_id; + //for (const auto& prefetch_d_settings : device_worker_variants[DispatchWorkerType::PREFETCH_D]) { + for (int p = 0; p < num_prefetch_d_per_demux_d; p++) { + auto prefetch_d_setting = std::get<1>(device_worker_variants[DispatchWorkerType::PREFETCH_D][p + prefetch_d_connected]); + //auto prefetch_d_setting = std::get<1>(prefetch_d_settings); + compile_args[26 + demux_output_sem_idx] = packet_switch_4B_pack(prefetch_d_setting.cb_log_page_size, + prefetch_d_setting.consumer_semaphore_id, // downstream sem + demux_sem++, // local sem + 0); // remove header + demux_output_sem_idx++; + } + prefetch_d_connected += num_prefetch_d_per_demux_d; + vcs_per_demux_d = fwd_vc_count / 2; + num_prefetch_d_per_demux_d = device_worker_variants[DispatchWorkerType::PREFETCH_D].size() - num_prefetch_d_per_demux_d; } - uint32_t dest_map_array[4] = {1, 1, 1, 1}; // needs to be based on tunnel stop. 
- dest_map_array[demux_d_settings.tunnel_stop-1] = 0; - uint64_t dest_endpoint_output_map = packet_switch_dest_pack(dest_map_array, 4); - compile_args[20] = (uint32_t)(dest_endpoint_output_map >> 32); // 20: dest_endpoint_output_map_hi - compile_args[21] = (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF); // 21: dest_endpoint_output_map_lo - compile_args[22] = 0; // 22: test_results_addr (disabled) - compile_args[23] = 0; // 23: test_results_size (disabled) - compile_args[24] = 0; // 24: timeout_cycles - compile_args[25] = 0; // 25: output_depacketize_mask - // Update output_depacketize_mask based on num prefetch_d cores (local demux_d outputs) - for (int prefetch_d_idx = 0; prefetch_d_idx < device_worker_variants[DispatchWorkerType::PREFETCH_D].size(); prefetch_d_idx++) compile_args[25] |= (1 << (prefetch_d_idx)); - // Set downstream and local sem ids, based on number of demux outputs - uint32_t demux_output_sem_idx = 0; - uint32_t demux_sem = demux_d_settings.producer_semaphore_id; - for (const auto& prefetch_d_settings : device_worker_variants[DispatchWorkerType::PREFETCH_D]) { - auto prefetch_d_setting = std::get<1>(prefetch_d_settings); - compile_args[26 + demux_output_sem_idx] = packet_switch_4B_pack(prefetch_d_setting.cb_log_page_size, - prefetch_d_setting.consumer_semaphore_id, // downstream sem - demux_sem++, // local sem - 0); // remove header - demux_output_sem_idx++; + TT_ASSERT(device_worker_variants[DispatchWorkerType::PREFETCH_D].size() == prefetch_d_connected, "Found unconnected DEMUX_D to PREFETCH_D ports."); + TT_ASSERT(fwd_vc_count == local_tunneler_vcs_connected, "Found unconnected forward VCs between US_TUNNELER_LOCAL and DEMUX_D"); + if (!is_tunnel_end) { + auto &us_tunneler_remote_settings = std::get<1>(device_worker_variants[DispatchWorkerType::US_TUNNELER_REMOTE][0]); + TT_ASSERT((us_tunneler_remote_settings.vc_count - 1) == remote_tunneler_vcs_connected, "Found unconnected forward VCs between DEMUX_D and US_TUNNELER_REMOTE"); } break; } @@ -1159,13 +1424,22 @@ void Device::update_workers_build_settings(std::vector(device_worker_variants[DispatchWorkerType::DEMUX_D][0]); + uint32_t num_demux_d = device_worker_variants[DispatchWorkerType::DEMUX_D].size(); - TT_ASSERT(num_prefetchers == demux_d_settings.semaphores.size(), "Demux D does not have required number of semaphores for Prefetcher D. Exptected = {}. Found = {}", num_prefetchers, demux_d_settings.semaphores.size()); int prefetch_d_idx = 0; - uint32_t demux_sem = demux_d_settings.producer_semaphore_id; + int demux_d_idx = 0; + std::vectordemux_sem(num_demux_d, 0); + for (int i = 0; i < num_demux_d; i++) { + auto demux_d_settings = std::get<1>(device_worker_variants[DispatchWorkerType::DEMUX_D][i]); + demux_sem[i] = demux_d_settings.producer_semaphore_id; + } + for (auto&[core, prefetch_d_settings] : device_worker_variants[DispatchWorkerType::PREFETCH_D]) { + TT_ASSERT(demux_d_idx < num_demux_d , "Demux D index out of bounds. Max = {}. Found = {}", num_demux_d - 1, demux_d_idx); + auto demux_d_settings = std::get<1>(device_worker_variants[DispatchWorkerType::DEMUX_D][demux_d_idx]); + if (num_demux_d == 1) { + TT_ASSERT(num_prefetchers == demux_d_settings.semaphores.size(), "Demux D does not have required number of semaphores for Prefetcher D. Expected = {}. 
Found = {}", num_prefetchers, demux_d_settings.semaphores.size()); + } auto dispatch_d_settings = std::get<1>(device_worker_variants[DispatchWorkerType::DISPATCH_D][prefetch_d_idx]); // 1 to 1 mapping bw prefetch_d and dispatch_d auto dispatch_s_settings = std::get<1>(device_worker_variants[DispatchWorkerType::DISPATCH_S][prefetch_d_idx]); // 1 to 1 mapping bw prefetch_d and dispatch_s auto dispatch_core_type = prefetch_d_settings.dispatch_core_type; @@ -1205,7 +1479,7 @@ void Device::update_workers_build_settings(std::vector> 4; // 1: rx_queue_start_addr_words compile_args[2] = mux_d_settings.cb_size_bytes >> 4; // 2: rx_queue_size_words - compile_args[3] = device_worker_variants[DispatchWorkerType::DISPATCH_D].size() + device_worker_variants[DispatchWorkerType::US_TUNNELER_REMOTE].size(); // 3: mux_fan_in + compile_args[3] = num_dispatchers + device_worker_variants[DispatchWorkerType::US_TUNNELER_REMOTE].size(); // 3: mux_fan_in uint32_t mux_d_input_idx = 0; for (const auto& dispatch_d_settings : device_worker_variants[DispatchWorkerType::DISPATCH_D]) { @@ -1362,11 +1641,11 @@ void Device::update_workers_build_settings(std::vectornum_hw_cqs_; - uint32_t vc_count = 1 + (tunnel.size() - 1) * num_hw_cqs; // 1 return vc. outgoing vc count depends on tunnel size and cq size. uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(mmio_device_id); auto [tensix_num_worker_cores, tensix_worker_physical_grid] = get_physical_worker_grid_config(device_id, num_hw_cqs, dispatch_core_type); @@ -1429,7 +1707,12 @@ void Device::setup_tunnel_for_remote_devices() { settings.kernel_file = "tt_metal/impl/dispatch/kernels/cq_prefetch.cpp"; //prefetch needs three semaphores. settings.semaphores.push_back(0); - settings.semaphores.push_back(dispatch_constants::get(dispatch_core_type).mux_buffer_pages(num_hw_cqs)); + if (tunnel.size() > 2) { + //Galaxy + settings.semaphores.push_back(dispatch_constants::get(dispatch_core_type).mux_buffer_pages(1)); + } else { + settings.semaphores.push_back(dispatch_constants::get(dispatch_core_type).mux_buffer_pages(num_hw_cqs)); + } settings.semaphores.push_back(0); settings.producer_semaphore_id = 1; tunnel_core_allocations[PREFETCH].push_back(std::make_tuple(prefetch_location, settings)); @@ -1458,28 +1741,90 @@ void Device::setup_tunnel_for_remote_devices() { settings.semaphores.clear(); log_debug(LogMetal, "Device {} Channel {} : Dispatch: Issue Q Start Addr: {} - Completion Q Start Addr: {}", device_id, channel, settings.issue_queue_start_addr, settings.completion_queue_start_addr); } + } + + for (uint32_t tunnel_stop = 1; tunnel_stop < tunnel.size(); tunnel_stop++) { + //tunnel.size() is mmio device + num of remote devices on the tunnel. + chip_id_t device_id = tunnel[tunnel_stop]; + // a remote device. + // tunnel_stop hops away. + uint8_t num_hw_cqs = this->num_hw_cqs_; + uint32_t vc_count = 1 + (tunnel.size() - 1) * num_hw_cqs; // 1 return vc. outgoing vc count depends on tunnel size and cq size. + uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); + CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(mmio_device_id); + auto [tensix_num_worker_cores, tensix_worker_physical_grid] = get_physical_worker_grid_config(device_id, num_hw_cqs, dispatch_core_type); + + dispatch_worker_build_settings_t settings = {}; + //allocations below are on mmio chip. 
+ settings.dispatch_core_type = dispatch_core_type; + settings.tunnel_stop = 0; + uint32_t cq_size = this->sysmem_manager().get_cq_size(); uint32_t cq_id = 0; // 1 mux, demux, local tunneler and remote tunneler per chip. Set cq_id to 0. if (tunnel_stop == 1) { //need to allocate mux/demux on mmio chip only once. //all tunnel stops, share the same mux/demux on mmio chip. //mux/demux need a semaphore per remote device in the tunnel. //Tunnel includes the mmio device as well, so tunnel.size() - 1 is the number of remote devices. - settings.semaphores = std::vector((tunnel.size() - 1) * num_hw_cqs); + uint32_t num_prefetchers = tunnel_core_allocations[PREFETCH].size(); settings.producer_semaphore_id = 0; settings.consumer_semaphore_id = 0; - tt_cxy_pair mux_location = dispatch_core_manager::instance().mux_core(device_id, channel, cq_id); - settings.worker_physical_core = tt_cxy_pair(mux_location.chip, get_physical_core_coordinate(mux_location, dispatch_core_type)); - settings.kernel_file = "tt_metal/impl/dispatch/kernels/vc_packet_router.cpp"; - settings.cb_start_address = dispatch_constants::DISPATCH_BUFFER_BASE; - settings.cb_size_bytes = dispatch_constants::get(dispatch_core_type).mux_buffer_size(num_hw_cqs); + if (num_prefetchers == 1 || num_prefetchers == 2) { + //N300, T3K 1, 2 CQ case + settings.semaphores = std::vector(num_prefetchers); + tt_cxy_pair mux_location = dispatch_core_manager::instance().mux_core(device_id, channel, 0); + settings.worker_physical_core = tt_cxy_pair(mux_location.chip, get_physical_core_coordinate(mux_location, dispatch_core_type)); + settings.kernel_file = "tt_metal/impl/dispatch/kernels/vc_packet_router.cpp"; + settings.cb_size_bytes = dispatch_constants::get(dispatch_core_type).mux_buffer_size(num_hw_cqs); + settings.cb_start_address = dispatch_constants::DISPATCH_BUFFER_BASE; + settings.cb_pages = dispatch_constants::get(dispatch_core_type).mux_buffer_pages(num_hw_cqs); + tunnel_core_allocations[MUX].push_back(std::make_tuple(mux_location, settings)); + + tt_cxy_pair demux_location = dispatch_core_manager::instance().demux_core(device_id, channel, 0); + settings.worker_physical_core = tt_cxy_pair(demux_location.chip, get_physical_core_coordinate(demux_location, dispatch_core_type)); + settings.kernel_file = "tt_metal/impl/dispatch/kernels/packet_demux.cpp"; + settings.cb_start_address = L1_UNRESERVED_BASE; + settings.cb_size_bytes = 0x10000; + tunnel_core_allocations[DEMUX].push_back(std::make_tuple(demux_location, settings)); + } else if (num_prefetchers == 4 || num_prefetchers == 8) { + //TG, TGG 1, 2 CQ case + settings.semaphores = std::vector(MAX_SWITCH_FAN_IN); + tt_cxy_pair mux_location = dispatch_core_manager::instance().mux_core(device_id, channel, 0); + settings.worker_physical_core = tt_cxy_pair(mux_location.chip, get_physical_core_coordinate(mux_location, dispatch_core_type)); + settings.kernel_file = "tt_metal/impl/dispatch/kernels/vc_packet_router.cpp"; + settings.cb_start_address = dispatch_constants::DISPATCH_BUFFER_BASE; + settings.cb_size_bytes = dispatch_constants::get(dispatch_core_type).mux_buffer_size(1); + settings.cb_pages = dispatch_constants::get(dispatch_core_type).mux_buffer_pages(1); + tunnel_core_allocations[MUX].push_back(std::make_tuple(mux_location, settings)); + if (num_prefetchers == 8) { + tt_cxy_pair mux_location = dispatch_core_manager::instance().mux_core(device_id, channel, 1); + settings.worker_physical_core = tt_cxy_pair(mux_location.chip, get_physical_core_coordinate(mux_location, dispatch_core_type)); + 
tunnel_core_allocations[MUX].push_back(std::make_tuple(mux_location, settings)); + } - tunnel_core_allocations[MUX].push_back(std::make_tuple(mux_location, settings)); - tt_cxy_pair demux_location = dispatch_core_manager::instance().demux_core(device_id, channel, cq_id); - settings.worker_physical_core = tt_cxy_pair(demux_location.chip, get_physical_core_coordinate(demux_location, dispatch_core_type)); - settings.kernel_file = "tt_metal/impl/dispatch/kernels/packet_demux.cpp"; - settings.cb_start_address = L1_UNRESERVED_BASE; - settings.cb_size_bytes = 0x10000; - tunnel_core_allocations[DEMUX].push_back(std::make_tuple(demux_location, settings)); + tt_cxy_pair demux_location = dispatch_core_manager::instance().demux_core(device_id, channel, 0); + settings.worker_physical_core = tt_cxy_pair(demux_location.chip, get_physical_core_coordinate(demux_location, dispatch_core_type)); + settings.semaphores.clear(); + settings.kernel_file = "tt_metal/impl/dispatch/kernels/packet_demux.cpp"; + settings.cb_start_address = L1_UNRESERVED_BASE; + settings.cb_size_bytes = 0x10000; + tunnel_core_allocations[DEMUX].push_back(std::make_tuple(demux_location, settings)); + + settings.semaphores = std::vector(num_prefetchers / 2); + demux_location = dispatch_core_manager::instance().demux_core(device_id, channel, 1); + settings.worker_physical_core = tt_cxy_pair(demux_location.chip, get_physical_core_coordinate(demux_location, dispatch_core_type)); + settings.kernel_file = "tt_metal/impl/dispatch/kernels/packet_demux.cpp"; + settings.cb_start_address = L1_UNRESERVED_BASE; + settings.cb_size_bytes = 0x10000; + tunnel_core_allocations[DEMUX].push_back(std::make_tuple(demux_location, settings)); + + demux_location = dispatch_core_manager::instance().demux_core(device_id, channel, 2); + settings.worker_physical_core = tt_cxy_pair(demux_location.chip, get_physical_core_coordinate(demux_location, dispatch_core_type)); + settings.kernel_file = "tt_metal/impl/dispatch/kernels/packet_demux.cpp"; + settings.cb_start_address = L1_UNRESERVED_BASE; + settings.cb_size_bytes = 0x10000; + tunnel_core_allocations[DEMUX].push_back(std::make_tuple(demux_location, settings)); + + } } settings.tunnel_stop = tunnel_stop - 1; @@ -1522,13 +1867,28 @@ void Device::setup_tunnel_for_remote_devices() { settings.cb_pages = dispatch_constants::get(dispatch_core_type).mux_buffer_pages(num_hw_cqs); settings.cb_size_bytes = (1 << settings.cb_log_page_size) * settings.cb_pages; tunnel_core_allocations[MUX_D].push_back(std::make_tuple(mux_d_location, settings)); - tt_cxy_pair demux_d_location = dispatch_core_manager::instance().demux_d_core(device_id, channel, cq_id); + + uint32_t demux_vcs = settings.vc_count - 1; + tt_cxy_pair demux_d_location = dispatch_core_manager::instance().demux_d_core(device_id, channel, 0); settings.worker_physical_core = tt_cxy_pair(demux_d_location.chip, get_physical_core_coordinate(demux_d_location, dispatch_core_type)); settings.kernel_file = "tt_metal/impl/dispatch/kernels/vc_packet_router.cpp"; settings.producer_semaphore_id = 0; settings.cb_start_address = L1_UNRESERVED_BASE; settings.cb_size_bytes = 0x8000; + if (tunnel.size() > 2) { + settings.semaphores.resize(1); + } tunnel_core_allocations[DEMUX_D].push_back(std::make_tuple(demux_d_location, settings)); + if (tunnel.size() > 2 && demux_vcs > 1) { + //TG/TGG 1-2 CQs + demux_d_location = dispatch_core_manager::instance().demux_d_core(device_id, channel, 1); + settings.worker_physical_core = tt_cxy_pair(demux_d_location.chip, 
get_physical_core_coordinate(demux_d_location, dispatch_core_type)); + settings.kernel_file = "tt_metal/impl/dispatch/kernels/vc_packet_router.cpp"; + settings.producer_semaphore_id = 0; + settings.cb_start_address = L1_UNRESERVED_BASE; + settings.cb_size_bytes = 0x8000; + tunnel_core_allocations[DEMUX_D].push_back(std::make_tuple(demux_d_location, settings)); + } settings.semaphores.clear(); uint32_t dispatch_buffer_pages = dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(); for (uint32_t cq_id = 0; cq_id < num_hw_cqs; cq_id++) { @@ -1559,6 +1919,8 @@ void Device::setup_tunnel_for_remote_devices() { settings.cb_pages = dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(); settings.cb_size_bytes = (1 << settings.cb_log_page_size) * settings.cb_pages; settings.compute_core_mcast_noc_coords = this->get_noc_multicast_encoding(dispatch_d_noc_index, tensix_worker_physical_grid); + CoreCoord compute_grid_size = this->compute_with_storage_grid_size(); + settings.num_compute_cores = uint32_t(compute_grid_size.x * compute_grid_size.y); tt_cxy_pair dispatch_d_location = dispatch_core_manager::instance().dispatcher_d_core(device_id, channel, cq_id); settings.worker_physical_core = tt_cxy_pair(dispatch_d_location.chip, get_physical_core_coordinate(dispatch_d_location, dispatch_core_type)); settings.kernel_file = "tt_metal/impl/dispatch/kernels/cq_dispatch.cpp"; @@ -1942,27 +2304,28 @@ void Device::compile_command_queue_programs() { cq_id = (cq_id + 1) % num_hw_cqs; } - auto [mux_core, mux_settings] = mmio_device_worker_variants[DispatchWorkerType::MUX][0]; - for (auto sem : mux_settings.semaphores) { - //size of semaphores vector is number of needed semaphores on the core. - //Value of each vector entry is the initialization value for the semaphore. - tt::tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, mux_core, sem, mux_settings.dispatch_core_type); + for (auto [mux_core, mux_settings] : mmio_device_worker_variants[DispatchWorkerType::MUX]) { + for (auto sem : mux_settings.semaphores) { + //size of semaphores vector is number of needed semaphores on the core. + //Value of each vector entry is the initialization value for the semaphore. + tt::tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, mux_core, sem, mux_settings.dispatch_core_type); + } + configure_kernel_variant( + *mmio_command_queue_program_ptr, + mux_settings.kernel_file, + mux_settings.compile_args, + mux_core, + CoreCoord{0, 0}, + mux_settings.dispatch_core_type, + CoreCoord{0, 0}, + CoreCoord{0, 0}, + CoreCoord{0, 0}, + std::map {{"SKIP_NOC_LOGGING", "1"}}, + my_noc_index, // Only one Mux - use NOC for CQ 0 + my_noc_index, + my_noc_index + ); } - configure_kernel_variant( - *mmio_command_queue_program_ptr, - mux_settings.kernel_file, - mux_settings.compile_args, - mux_core, - CoreCoord{0, 0}, - mux_settings.dispatch_core_type, - CoreCoord{0, 0}, - CoreCoord{0, 0}, - CoreCoord{0, 0}, - std::map {{"SKIP_NOC_LOGGING", "1"}}, - my_noc_index, // Only one Mux - use NOC for CQ 0 - my_noc_index, - my_noc_index - ); auto [tunneler_core, tunneler_settings] = mmio_device_worker_variants[DispatchWorkerType::US_TUNNELER_REMOTE][0]; configure_kernel_variant( @@ -1982,27 +2345,28 @@ void Device::compile_command_queue_programs() { true ); - auto [demux_core, demux_settings] = mmio_device_worker_variants[DispatchWorkerType::DEMUX][0]; - for (auto sem : demux_settings.semaphores) { - //size of semaphores vector is number of needed semaphores on the core. 
- //Value of each vector entry is the initialization value for the semaphore. - tt::tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, demux_core, sem, demux_settings.dispatch_core_type); + for (auto [demux_core, demux_settings] : mmio_device_worker_variants[DispatchWorkerType::DEMUX]) { + for (auto sem : demux_settings.semaphores) { + //size of semaphores vector is number of needed semaphores on the core. + //Value of each vector entry is the initialization value for the semaphore. + tt::tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, demux_core, sem, demux_settings.dispatch_core_type); + } + configure_kernel_variant( + *mmio_command_queue_program_ptr, + demux_settings.kernel_file, + demux_settings.compile_args, + demux_core, + CoreCoord{0, 0}, + demux_settings.dispatch_core_type, + CoreCoord{0, 0}, + CoreCoord{0, 0}, + CoreCoord{0, 0}, + std::map {{"SKIP_NOC_LOGGING", "1"}}, + my_noc_index, // Only one Demux - use NOC for CQ 0 + my_noc_index, + my_noc_index + ); } - configure_kernel_variant( - *mmio_command_queue_program_ptr, - demux_settings.kernel_file, - demux_settings.compile_args, - demux_core, - CoreCoord{0, 0}, - demux_settings.dispatch_core_type, - CoreCoord{0, 0}, - CoreCoord{0, 0}, - CoreCoord{0, 0}, - std::map {{"SKIP_NOC_LOGGING", "1"}}, - my_noc_index, // Only one Demux - use NOC for CQ 0 - my_noc_index, - my_noc_index - ); cq_id = 0; for (auto [dispatch_core, dispatch_settings] : mmio_device_worker_variants[DispatchWorkerType::DISPATCH]) { for (auto sem : dispatch_settings.semaphores) { @@ -2070,27 +2434,28 @@ void Device::compile_command_queue_programs() { ); } - auto [demux_d_core, demux_d_settings] = device_worker_variants[DispatchWorkerType::DEMUX_D][0]; - for (auto sem : demux_d_settings.semaphores) { - //size of semaphores vector is number of needed semaphores on the core. - //Value of each vector entry is the initialization value for the semaphore. - tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, demux_d_core, sem, demux_d_settings.dispatch_core_type); + for (auto [demux_d_core, demux_d_settings] : device_worker_variants[DispatchWorkerType::DEMUX_D]){ + for (auto sem : demux_d_settings.semaphores) { + //size of semaphores vector is number of needed semaphores on the core. + //Value of each vector entry is the initialization value for the semaphore. 
+ tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, demux_d_core, sem, demux_d_settings.dispatch_core_type); + } + configure_kernel_variant( + *command_queue_program_ptr, + demux_d_settings.kernel_file, + demux_d_settings.compile_args, + demux_d_core, + CoreCoord{0, 0}, + demux_d_settings.dispatch_core_type, + CoreCoord{0, 0}, + CoreCoord{0, 0}, + CoreCoord{0, 0}, + std::map {{"SKIP_NOC_LOGGING", "1"}}, + my_noc_index, // Only one Demux - use NOC for CQ 0 + my_noc_index, + my_noc_index + ); } - configure_kernel_variant( - *command_queue_program_ptr, - demux_d_settings.kernel_file, - demux_d_settings.compile_args, - demux_d_core, - CoreCoord{0, 0}, - demux_d_settings.dispatch_core_type, - CoreCoord{0, 0}, - CoreCoord{0, 0}, - CoreCoord{0, 0}, - std::map {{"SKIP_NOC_LOGGING", "1"}}, - my_noc_index, // Only one Demux - use NOC for CQ 0 - my_noc_index, - my_noc_index - ); uint32_t cq_id = 0; for (auto [prefetch_d_core, prefetch_d_settings] : device_worker_variants[DispatchWorkerType::PREFETCH_D]) { for (auto sem : prefetch_d_settings.semaphores) { diff --git a/tt_metal/impl/dispatch/command_queue_interface.hpp b/tt_metal/impl/dispatch/command_queue_interface.hpp index 663c7c776ab6..c4a2b5cdebd3 100644 --- a/tt_metal/impl/dispatch/command_queue_interface.hpp +++ b/tt_metal/impl/dispatch/command_queue_interface.hpp @@ -31,8 +31,10 @@ struct dispatch_constants { dispatch_constants(const dispatch_constants &) = delete; dispatch_constants(dispatch_constants &&other) noexcept = delete; - static const dispatch_constants &get(const CoreType &core_type) { - static dispatch_constants inst = dispatch_constants(core_type); + static const dispatch_constants &get(const CoreType &core_type, const uint32_t num_hw_cqs = 0) { + static uint32_t hw_cqs = num_hw_cqs; + TT_FATAL(hw_cqs > 0, "Command Queue is not initialized."); + static dispatch_constants inst = dispatch_constants(core_type, hw_cqs); return inst; } @@ -92,12 +94,12 @@ struct dispatch_constants { uint32_t dispatch_s_buffer_pages() const { return dispatch_s_buffer_size_ / (1 << DISPATCH_S_BUFFER_LOG_PAGE_SIZE); } private: - dispatch_constants(const CoreType &core_type) { + dispatch_constants(const CoreType &core_type, const uint32_t num_hw_cqs) { TT_ASSERT(core_type == CoreType::WORKER or core_type == CoreType::ETH); // make this 2^N as required by the packetized stages uint32_t dispatch_buffer_block_size; if (core_type == CoreType::WORKER) { - prefetch_q_entries_ = 1534; + prefetch_q_entries_ = 1532 / num_hw_cqs; max_prefetch_command_size_ = 128 * 1024; cmddat_q_size_ = 256 * 1024; scratch_db_size_ = 128 * 1024; @@ -422,8 +424,8 @@ class SystemMemoryManager { // must be as large as the max amount of space the prefetch queue can specify Plus 1 to handle wrapping Plus // 1 to allow us to start writing to issue queue before we reserve space in the prefetch queue TT_FATAL( - dispatch_constants::get(core_type).max_prefetch_command_size() * - (dispatch_constants::get(core_type).prefetch_q_entries() + 2) <= + dispatch_constants::get(core_type, num_hw_cqs).max_prefetch_command_size() * + (dispatch_constants::get(core_type, num_hw_cqs).prefetch_q_entries() + 2) <= this->get_issue_queue_size(cq_id), "Issue queue for cq_id {} has size of {} which is too small", cq_id, @@ -432,7 +434,7 @@ class SystemMemoryManager { this->cq_to_last_completed_event.push_back(0); this->prefetch_q_dev_ptrs[cq_id] = dispatch_constants::PREFETCH_Q_BASE; this->prefetch_q_dev_fences[cq_id] = - dispatch_constants::PREFETCH_Q_BASE + 
dispatch_constants::get(core_type).prefetch_q_entries() * + dispatch_constants::PREFETCH_Q_BASE + dispatch_constants::get(core_type, num_hw_cqs).prefetch_q_entries() * sizeof(dispatch_constants::prefetch_q_entry_type); } vector temp_mutexes(num_hw_cqs); @@ -710,7 +712,7 @@ class SystemMemoryManager { // Wrap FetchQ if possible CoreType core_type = dispatch_core_manager::instance().get_dispatch_core_type(device_id); uint32_t prefetch_q_base = DISPATCH_L1_UNRESERVED_BASE; - uint32_t prefetch_q_limit = prefetch_q_base + dispatch_constants::get(core_type).prefetch_q_entries() * + uint32_t prefetch_q_limit = prefetch_q_base + dispatch_constants::get(core_type, num_hw_cqs).prefetch_q_entries() * sizeof(dispatch_constants::prefetch_q_entry_type); if (this->prefetch_q_dev_ptrs[cq_id] == prefetch_q_limit) { this->prefetch_q_dev_ptrs[cq_id] = prefetch_q_base; @@ -721,7 +723,7 @@ class SystemMemoryManager { void fetch_queue_write(uint32_t command_size_B, const uint8_t cq_id, bool stall_prefetcher = false) { CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(this->device_id); - uint32_t max_command_size_B = dispatch_constants::get(dispatch_core_type).max_prefetch_command_size(); + uint32_t max_command_size_B = dispatch_constants::get(dispatch_core_type, num_hw_cqs).max_prefetch_command_size(); TT_ASSERT( command_size_B <= max_command_size_B, "Generated prefetcher command of size {} B exceeds max command size {} B", diff --git a/tt_metal/impl/dispatch/kernels/packet_queue.hpp b/tt_metal/impl/dispatch/kernels/packet_queue.hpp index a2e66c9d794f..6d552218b03a 100644 --- a/tt_metal/impl/dispatch/kernels/packet_queue.hpp +++ b/tt_metal/impl/dispatch/kernels/packet_queue.hpp @@ -161,31 +161,33 @@ class packet_queue_state_t { STREAM_REG_ADDR(NUM_PTR_REGS_PER_INPUT_QUEUE*queue_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_REG_INDEX)); } else { this->local_wptr_val = &this->local_wptr; + uint32_t adjusted_queue_id = queue_id > 15 ? 
queue_id - 11 : queue_id; this->local_rptr_sent_val = reinterpret_cast( - STREAM_REG_ADDR(NUM_PTR_REGS_PER_OUTPUT_QUEUE*queue_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_REG_INDEX)); + STREAM_REG_ADDR(NUM_PTR_REGS_PER_OUTPUT_QUEUE*adjusted_queue_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_REG_INDEX)); this->local_rptr_cleared_val = reinterpret_cast( - STREAM_REG_ADDR(NUM_PTR_REGS_PER_OUTPUT_QUEUE*queue_id+1, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_REG_INDEX)); + STREAM_REG_ADDR(NUM_PTR_REGS_PER_OUTPUT_QUEUE*adjusted_queue_id+1, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_REG_INDEX)); this->local_rptr_sent_update = reinterpret_cast( - STREAM_REG_ADDR(NUM_PTR_REGS_PER_OUTPUT_QUEUE*queue_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_REG_INDEX)); + STREAM_REG_ADDR(NUM_PTR_REGS_PER_OUTPUT_QUEUE*adjusted_queue_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_REG_INDEX)); this->local_rptr_cleared_update = reinterpret_cast( - STREAM_REG_ADDR(NUM_PTR_REGS_PER_OUTPUT_QUEUE*queue_id+1, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_REG_INDEX)); + STREAM_REG_ADDR(NUM_PTR_REGS_PER_OUTPUT_QUEUE*adjusted_queue_id+1, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_REG_INDEX)); // Setting STREAM_REMOTE_DEST_BUF_SIZE_REG_INDEX resets the credit register this->local_rptr_sent_reset = reinterpret_cast( - STREAM_REG_ADDR(NUM_PTR_REGS_PER_OUTPUT_QUEUE*queue_id, STREAM_REMOTE_DEST_BUF_SIZE_REG_INDEX)); + STREAM_REG_ADDR(NUM_PTR_REGS_PER_OUTPUT_QUEUE*adjusted_queue_id, STREAM_REMOTE_DEST_BUF_SIZE_REG_INDEX)); this->local_rptr_cleared_reset = reinterpret_cast( - STREAM_REG_ADDR(NUM_PTR_REGS_PER_OUTPUT_QUEUE*queue_id+1, STREAM_REMOTE_DEST_BUF_SIZE_REG_INDEX)); + STREAM_REG_ADDR(NUM_PTR_REGS_PER_OUTPUT_QUEUE*adjusted_queue_id+1, STREAM_REMOTE_DEST_BUF_SIZE_REG_INDEX)); } this->remote_wptr_update_addr = STREAM_REG_ADDR(NUM_PTR_REGS_PER_INPUT_QUEUE*remote_queue_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_REG_INDEX); + uint32_t adjusted_remote_queue_id = remote_queue_id > 15 ? remote_queue_id - 11 : remote_queue_id; this->remote_rptr_sent_update_addr = - STREAM_REG_ADDR(NUM_PTR_REGS_PER_OUTPUT_QUEUE*remote_queue_id, + STREAM_REG_ADDR(NUM_PTR_REGS_PER_OUTPUT_QUEUE*adjusted_remote_queue_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_REG_INDEX); this->remote_rptr_cleared_update_addr = - STREAM_REG_ADDR(NUM_PTR_REGS_PER_OUTPUT_QUEUE*remote_queue_id+1, + STREAM_REG_ADDR(NUM_PTR_REGS_PER_OUTPUT_QUEUE*adjusted_remote_queue_id+1, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_REG_INDEX); this->remote_ready_status_addr = STREAM_REG_ADDR(remote_queue_id, STREAM_REMOTE_SRC_REG_INDEX); diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index b5f614642b2f..3e5727f06d8e 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -301,9 +301,6 @@ std::map CreateDevices( const std::vector &l1_bank_remap) { ZoneScoped; bool is_galaxy = tt::Cluster::instance().is_galaxy_cluster(); - if (is_galaxy) { - TT_FATAL(num_hw_cqs < 2, "Multiple Command Queues are not Currently Supported on Galaxy Systems"); - } tt::DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, trace_region_size, dispatch_core_type); std::vector devices = tt::DevicePool::instance().get_all_active_devices(); std::map ret_devices; @@ -865,6 +862,10 @@ size_t GetNumAvailableDevices() { return tt::Cluster::instance().number_of_user_devices(); } +bool IsGalaxyCluster() { + return tt::Cluster::instance().is_galaxy_cluster(); +} + size_t GetNumPCIeDevices() { return tt::Cluster::instance().number_of_pci_devices(); }