From 956e2bb7a77c14f8627a1b9aeec7a58373c6bf60 Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Wed, 26 Jun 2024 21:29:06 +0000 Subject: [PATCH] #9044: Move dispatch core x y to be part of launch msg, which is set in assemble_device_commands. This enables running programs on any cq --- .../test_kernels/dataflow/dram_copy.cpp | 3 +- .../test_kernels/misc/watcher_asserts.cpp | 3 +- .../CMakeLists.txt | 1 + .../command_queue/test_EnqueueProgram.cpp | 267 ++++++++++++++++++ tt_metal/detail/tt_metal.hpp | 15 +- tt_metal/hw/firmware/src/brisc.cc | 2 +- tt_metal/hw/firmware/src/erisck.cc | 2 +- tt_metal/hw/firmware/src/idle_erisc.cc | 2 +- tt_metal/hw/inc/dev_msgs.h | 2 + tt_metal/impl/device/device.cpp | 31 +- tt_metal/impl/device/device.hpp | 1 + tt_metal/impl/dispatch/command_queue.cpp | 226 ++++++++++----- tt_metal/impl/dispatch/command_queue.hpp | 16 +- tt_metal/jit_build/genfiles.cpp | 11 +- tt_metal/jit_build/genfiles.hpp | 3 +- 15 files changed, 463 insertions(+), 122 deletions(-) create mode 100644 tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueProgram.cpp diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp index 1deac01e9236..8d63d1f80a3c 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp @@ -29,7 +29,8 @@ void kernel_main() { //Need to signal completion to dispatcher before hanging so that //Dispatcher Kernel is able to finish. //Device Close () requires fast dispatch kernels to finish. - uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(DISPATCH_CORE_X), NOC_Y(DISPATCH_CORE_Y), DISPATCH_MESSAGE_ADDR); + tt_l1_ptr mailboxes_t* const mailboxes = (tt_l1_ptr mailboxes_t*)(MEM_MAILBOX_BASE); + uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(mailboxes->launch.dispatch_core_x), NOC_Y(mailboxes->launch.dispatch_core_y), DISPATCH_MESSAGE_ADDR); noc_fast_atomic_increment(noc_index, NCRISC_AT_CMD_BUF, dispatch_addr, NOC_UNICAST_WRITE_VC, 1, 31, false); #endif diff --git a/tests/tt_metal/tt_metal/test_kernels/misc/watcher_asserts.cpp b/tests/tt_metal/tt_metal/test_kernels/misc/watcher_asserts.cpp index c995df2b5ab7..745fb376f41f 100644 --- a/tests/tt_metal/tt_metal/test_kernels/misc/watcher_asserts.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/misc/watcher_asserts.cpp @@ -35,7 +35,8 @@ void MAIN { //Need to signal completion to dispatcher before hanging so that //Dispatcher Kernel is able to finish. //Device Close () requires fast dispatch kernels to finish. - uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(DISPATCH_CORE_X), NOC_Y(DISPATCH_CORE_Y), DISPATCH_MESSAGE_ADDR); + tt_l1_ptr mailboxes_t* const mailboxes = (tt_l1_ptr mailboxes_t*)(MEM_MAILBOX_BASE); + uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(mailboxes->launch.dispatch_core_x), NOC_Y(mailboxes->launch.dispatch_core_y), DISPATCH_MESSAGE_ADDR); noc_fast_atomic_increment(noc_index, NCRISC_AT_CMD_BUF, dispatch_addr, NOC_UNICAST_WRITE_VC, 1, 31 /*wrap*/, false /*linked*/); } #else diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/CMakeLists.txt b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/CMakeLists.txt index 3b3578ba3c67..154abb70d8d5 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/CMakeLists.txt @@ -1,5 +1,6 @@ set(UNIT_TESTS_FD_SINGLEC_MULTIQ_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_EnqueueProgram.cpp ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_EnqueueTrace.cpp ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_EnqueueWaitForEvent.cpp ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueProgram.cpp new file mode 100644 index 000000000000..9b5e28d0ee0c --- /dev/null +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueProgram.cpp @@ -0,0 +1,267 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "command_queue_fixture.hpp" +#include "command_queue_test_utils.hpp" +#include "gtest/gtest.h" +#include "impl/buffers/buffer.hpp" +#include "tt_metal/common/bfloat16.hpp" +#include "tt_metal/common/scoped_timer.hpp" +#include "tt_metal/host_api.hpp" +#include "tt_metal/detail/tt_metal.hpp" + +using namespace tt::tt_metal; + +struct CBConfig { + uint32_t cb_id; + uint32_t num_pages; + uint32_t page_size; + tt::DataFormat data_format; +}; + +struct DummyProgramConfig { + CoreRangeSet cr_set; + CBConfig cb_config; + uint32_t num_cbs; + uint32_t num_sems; +}; + +struct DummyProgramMultiCBConfig { + CoreRangeSet cr_set; + std::vector cb_config_vector; + uint32_t num_sems; +}; + + +namespace local_test_functions { + +// Create randomly sized pair of unique and common runtime args vectors, with careful not to exceed max between the two. +// Optionally force the max size for one of the vectors. +std::pair, std::vector> create_runtime_args(bool force_max_size = false, uint32_t unique_base = 0, uint32_t common_base = 100){ + + constexpr uint32_t MAX_RUNTIME_ARGS = 255; + + // Generate Unique Runtime Args. Common RT args starting address must be L1 Aligned, so account for that here via padding + uint32_t num_rt_args_unique = num_rt_args_unique = rand() % (MAX_RUNTIME_ARGS + 1); + uint32_t num_rt_args_unique_padded = align(num_rt_args_unique, L1_ALIGNMENT / sizeof(uint32_t)); + uint32_t num_rt_args_common = num_rt_args_unique_padded < MAX_RUNTIME_ARGS ? rand() % (MAX_RUNTIME_ARGS - num_rt_args_unique_padded + 1) : 0; + + if (force_max_size) { + if (rand() % 2) { + num_rt_args_unique = MAX_RUNTIME_ARGS; + num_rt_args_common = 0; + } else { + num_rt_args_common = MAX_RUNTIME_ARGS; + num_rt_args_unique = 0; + } + } + + vector rt_args_common; + for (uint32_t i = 0; i < num_rt_args_common; i++) { + rt_args_common.push_back(common_base + i); + } + + vector rt_args_unique; + for (uint32_t i = 0; i < num_rt_args_unique; i++) { + rt_args_unique.push_back(unique_base + i); + } + + log_trace(tt::LogTest, "{} - num_rt_args_unique: {} num_rt_args_common: {} force_max_size: {}", __FUNCTION__, num_rt_args_unique, num_rt_args_common, force_max_size); + return std::make_pair(rt_args_unique, rt_args_common); +} + + +} // namespace local_test_functions + +namespace stress_tests { + +TEST_F(MultiCommandQueueSingleDeviceFixture, TestRandomizedProgram) { + uint32_t NUM_PROGRAMS = 100; + uint32_t MAX_LOOP = 100; + uint32_t page_size = 1024; + + // Make random + auto random_seed = 0; // (unsigned int)time(NULL); + uint32_t seed = tt::parse_env("SEED", random_seed); + log_info(tt::LogTest, "Using Test Seed: {}", seed); + srand(seed); + + CoreCoord worker_grid_size = this->device_->compute_with_storage_grid_size(); + CoreRange cr({0, 0}, {worker_grid_size.x - 1, worker_grid_size.y - 1}); + CoreRangeSet cr_set({cr}); + + log_info(tt::LogTest, "Starting compile of {} programs now.", NUM_PROGRAMS); + + vector programs; + for (uint32_t i = 0; i < NUM_PROGRAMS; i++) { + programs.push_back(Program()); + Program& program = programs.back(); + + std::map data_movement_defines = {{"DATA_MOVEMENT", "1"}}; + std::map compute_defines = {{"COMPUTE", "1"}}; + + // brisc + uint32_t BRISC_OUTER_LOOP, BRISC_MIDDLE_LOOP, BRISC_INNER_LOOP, NUM_CBS, NUM_SEMS; + bool USE_MAX_RT_ARGS; + + if (i == 0) { + // Ensures that we get at least one compilation with the max amount to + // ensure it compiles and runs + BRISC_OUTER_LOOP = MAX_LOOP; + BRISC_MIDDLE_LOOP = MAX_LOOP; + BRISC_INNER_LOOP = MAX_LOOP; + NUM_CBS = NUM_CIRCULAR_BUFFERS; + NUM_SEMS = NUM_SEMAPHORES; + USE_MAX_RT_ARGS = true; + } else { + BRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1; + BRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1; + BRISC_INNER_LOOP = rand() % (MAX_LOOP) + 1; + NUM_CBS = rand() % (NUM_CIRCULAR_BUFFERS) + 1; + NUM_SEMS = rand() % (NUM_SEMAPHORES) + 1; + USE_MAX_RT_ARGS = false; + } + + log_debug(tt::LogTest, "Compiling program {}/{} w/ BRISC_OUTER_LOOP: {} BRISC_MIDDLE_LOOP: {} BRISC_INNER_LOOP: {} NUM_CBS: {} NUM_SEMS: {} USE_MAX_RT_ARGS: {}", + i+1, NUM_PROGRAMS, BRISC_OUTER_LOOP, BRISC_MIDDLE_LOOP, BRISC_INNER_LOOP, NUM_CBS, NUM_SEMS, USE_MAX_RT_ARGS); + + for (uint32_t j = 0; j < NUM_CBS; j++) { + CircularBufferConfig cb_config = CircularBufferConfig(page_size * (j + 1), {{j, tt::DataFormat::Float16_b}}).set_page_size(j, page_size * (j + 1)); + auto cb = CreateCircularBuffer(program, cr_set, cb_config); + } + + for (uint32_t j = 0; j < NUM_SEMS; j++) { + CreateSemaphore(program, cr_set, j + 1); + } + + auto [brisc_unique_rtargs, brisc_common_rtargs] = local_test_functions::create_runtime_args(USE_MAX_RT_ARGS); + uint32_t num_brisc_unique_rtargs = brisc_unique_rtargs.size(); + uint32_t num_brisc_common_rtargs = brisc_common_rtargs.size(); + vector brisc_compile_args = {BRISC_OUTER_LOOP, BRISC_MIDDLE_LOOP, BRISC_INNER_LOOP, NUM_CBS, NUM_SEMS, num_brisc_unique_rtargs, num_brisc_common_rtargs, page_size}; + + // ncrisc + uint32_t NCRISC_OUTER_LOOP, NCRISC_MIDDLE_LOOP, NCRISC_INNER_LOOP; + if (i == 0) { + NCRISC_OUTER_LOOP = MAX_LOOP; + NCRISC_MIDDLE_LOOP = MAX_LOOP; + NCRISC_INNER_LOOP = MAX_LOOP; + } else { + NCRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1; + NCRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1; + NCRISC_INNER_LOOP = rand() % (MAX_LOOP) + 1; + } + + auto [ncrisc_unique_rtargs, ncrisc_common_rtargs] = local_test_functions::create_runtime_args(USE_MAX_RT_ARGS); + uint32_t num_ncrisc_unique_rtargs = ncrisc_unique_rtargs.size(); + uint32_t num_ncrisc_common_rtargs = ncrisc_common_rtargs.size(); + vector ncrisc_compile_args = {NCRISC_OUTER_LOOP, NCRISC_MIDDLE_LOOP, NCRISC_INNER_LOOP, NUM_CBS, NUM_SEMS, num_ncrisc_unique_rtargs, num_ncrisc_common_rtargs, page_size}; + + // trisc + uint32_t TRISC_OUTER_LOOP, TRISC_MIDDLE_LOOP, TRISC_INNER_LOOP; + if (i == 0) { + TRISC_OUTER_LOOP = MAX_LOOP; + TRISC_MIDDLE_LOOP = MAX_LOOP; + TRISC_INNER_LOOP = MAX_LOOP; + } else { + TRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1; + TRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1; + TRISC_INNER_LOOP = rand() % (MAX_LOOP) + 1; + } + + auto [trisc_unique_rtargs, trisc_common_rtargs] = local_test_functions::create_runtime_args(USE_MAX_RT_ARGS); + uint32_t num_trisc_unique_rtargs = trisc_unique_rtargs.size(); + uint32_t num_trisc_common_rtargs = trisc_common_rtargs.size(); + vector trisc_compile_args = {TRISC_OUTER_LOOP, TRISC_MIDDLE_LOOP, TRISC_INNER_LOOP, NUM_CBS, NUM_SEMS, num_trisc_unique_rtargs, num_trisc_common_rtargs, page_size}; + + bool at_least_one_kernel = false; + if (i == 0 or ((rand() % 2) == 0)) { + auto dummy_brisc_kernel = CreateKernel( + program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, DataMovementConfig{ + .processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default, .compile_args = brisc_compile_args, .defines = data_movement_defines}); + SetRuntimeArgs(program, dummy_brisc_kernel, cr_set, brisc_unique_rtargs); + SetCommonRuntimeArgs(program, dummy_brisc_kernel, brisc_common_rtargs); + at_least_one_kernel = true; + } + + if (i == 0 or ((rand() % 2) == 0)) { + auto dummy_ncrisc_kernel = CreateKernel( + program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, DataMovementConfig{ + .processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default, .compile_args = ncrisc_compile_args, .defines = data_movement_defines}); + SetRuntimeArgs(program, dummy_ncrisc_kernel, cr_set, ncrisc_unique_rtargs); + SetCommonRuntimeArgs(program, dummy_ncrisc_kernel, ncrisc_common_rtargs); + at_least_one_kernel = true; + } + + if (i == 0 or ((rand() % 2) == 0)) { + auto dummy_trisc_kernel = CreateKernel( + program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, ComputeConfig{ + .math_approx_mode = false, + .compile_args = trisc_compile_args, + .defines = compute_defines + }); + SetRuntimeArgs(program, dummy_trisc_kernel, cr_set, trisc_unique_rtargs); + SetCommonRuntimeArgs(program, dummy_trisc_kernel, trisc_common_rtargs); + at_least_one_kernel = true; + } + + if (not at_least_one_kernel) { + uint32_t random_risc = rand() % 3 + 1; + if (random_risc == 1) { + auto dummy_brisc_kernel = CreateKernel( + program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, DataMovementConfig{ + .processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default, .compile_args = brisc_compile_args, .defines = data_movement_defines}); + SetRuntimeArgs(program, dummy_brisc_kernel, cr_set, brisc_unique_rtargs); + SetCommonRuntimeArgs(program, dummy_brisc_kernel, brisc_common_rtargs); + } else if (random_risc == 2) { + auto dummy_ncrisc_kernel = CreateKernel( + program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, DataMovementConfig{ + .processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default, .compile_args = ncrisc_compile_args, .defines = data_movement_defines}); + SetRuntimeArgs(program, dummy_ncrisc_kernel, cr_set, ncrisc_unique_rtargs); + SetCommonRuntimeArgs(program, dummy_ncrisc_kernel, ncrisc_common_rtargs); + } else if (random_risc == 3) { + auto dummy_trisc_kernel = CreateKernel( + program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, ComputeConfig{ + .math_approx_mode = false, + .compile_args = trisc_compile_args, + .defines = compute_defines + }); + SetRuntimeArgs(program, dummy_trisc_kernel, cr_set, trisc_unique_rtargs); + SetCommonRuntimeArgs(program, dummy_trisc_kernel, trisc_common_rtargs); + } else { + TT_ASSERT("Invalid"); + } + } + + tt::tt_metal::detail::CompileProgram(this->device_, program); + } + + for (uint8_t cq_id = 0; cq_id < this->device_->num_hw_cqs(); ++cq_id) { + log_info(tt::LogTest, "Running {} programs on cq {} for cache warmup.", programs.size(), (uint32_t)cq_id); + // This loop caches program and runs + for (Program& program: programs) { + EnqueueProgram(this->device_->command_queue(cq_id), program, false); + } + + // This loops assumes already cached + uint32_t NUM_ITERATIONS = 500; // TODO(agrebenisan): Bump this to 5000, saw hangs for very large number of iterations, need to come back to that + + log_info(tt::LogTest, "Running {} programs on cq {} for {} iterations now.", programs.size(), (uint32_t)cq_id, NUM_ITERATIONS); + for (uint32_t i = 0; i < NUM_ITERATIONS; i++) { + auto rng = std::default_random_engine {}; + std::shuffle(std::begin(programs), std::end(programs), rng); + if (i % 10 == 0) { + log_debug(tt::LogTest, "Enqueueing {} programs on cq {} for iter: {}/{} now.", programs.size(), (uint32_t)cq_id, i+1, NUM_ITERATIONS); + } + for (Program& program: programs) { + EnqueueProgram(this->device_->command_queue(cq_id), program, false); + } + } + + log_info(tt::LogTest, "Calling Finish."); + Finish(this->device_->command_queue(cq_id)); + } +} + +} // namespace stress_tests diff --git a/tt_metal/detail/tt_metal.hpp b/tt_metal/detail/tt_metal.hpp index 426d8a83647e..4cf6a69ce37c 100644 --- a/tt_metal/detail/tt_metal.hpp +++ b/tt_metal/detail/tt_metal.hpp @@ -399,18 +399,6 @@ namespace tt::tt_metal{ pcie_chan_end_addr += tt::Cluster::instance().get_host_channel_size(device->id(), pcie_chan); } - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id()); - const uint8_t cq_id = 0; // Currently, only the first command queue is responsible for enqueuing programs - tt_cxy_pair enqueue_program_dispatch_core; - if (device->is_mmio_capable()) { - enqueue_program_dispatch_core = dispatch_core_manager::get(device->num_hw_cqs()).dispatcher_core(device->id(), channel, cq_id); - } else { - enqueue_program_dispatch_core = dispatch_core_manager::get(device->num_hw_cqs()).dispatcher_d_core(device->id(), channel, cq_id); - } - - CoreType core_type = dispatch_core_manager::get(device->num_hw_cqs()).get_dispatch_core_type(device->id()); - CoreCoord physical_enqueue_program_dispatch_core = get_physical_core_coordinate(enqueue_program_dispatch_core, core_type); - jit_build_genfiles_noc_addr_ranges_header( path, pcie_chan_base_addr, @@ -421,8 +409,7 @@ namespace tt::tt_metal{ soc_d.get_dram_cores(), soc_d.get_physical_ethernet_cores(), soc_d.grid_size, - harvested_rows, - physical_enqueue_program_dispatch_core); + harvested_rows); } inline void CheckDataMovementConfig(Program &program, const std::string &file_name, const CoreRangeSet &core_ranges) { diff --git a/tt_metal/hw/firmware/src/brisc.cc b/tt_metal/hw/firmware/src/brisc.cc index baab7633ae33..d88ea516ceae 100644 --- a/tt_metal/hw/firmware/src/brisc.cc +++ b/tt_metal/hw/firmware/src/brisc.cc @@ -400,7 +400,7 @@ int main() { // Notify dispatcher core that it has completed if (mailboxes->launch.mode == DISPATCH_MODE_DEV) { uint64_t dispatch_addr = - NOC_XY_ADDR(NOC_X(DISPATCH_CORE_X), NOC_Y(DISPATCH_CORE_Y), DISPATCH_MESSAGE_ADDR); + NOC_XY_ADDR(NOC_X(mailboxes->launch.dispatch_core_x), NOC_Y(mailboxes->launch.dispatch_core_y), DISPATCH_MESSAGE_ADDR); DEBUG_SANITIZE_NOC_ADDR(dispatch_addr, 4); noc_fast_atomic_increment( noc_index, diff --git a/tt_metal/hw/firmware/src/erisck.cc b/tt_metal/hw/firmware/src/erisck.cc index 6a173069a91e..76fdb1645054 100644 --- a/tt_metal/hw/firmware/src/erisck.cc +++ b/tt_metal/hw/firmware/src/erisck.cc @@ -33,7 +33,7 @@ void __attribute__((section("erisc_l1_code"))) kernel_launch() { kernel_main(); mailboxes->launch.run = RUN_MSG_DONE; - uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(DISPATCH_CORE_X), NOC_Y(DISPATCH_CORE_Y), DISPATCH_MESSAGE_ADDR); + uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(mailboxes->launch.dispatch_core_x), NOC_Y(mailboxes->launch.dispatch_core_y), DISPATCH_MESSAGE_ADDR); if (routing_info->routing_enabled and mailboxes->launch.mode == DISPATCH_MODE_DEV) { internal_::notify_dispatch_core_done(dispatch_addr); } diff --git a/tt_metal/hw/firmware/src/idle_erisc.cc b/tt_metal/hw/firmware/src/idle_erisc.cc index c09fb0c62e47..a5bacb80d1f9 100644 --- a/tt_metal/hw/firmware/src/idle_erisc.cc +++ b/tt_metal/hw/firmware/src/idle_erisc.cc @@ -132,7 +132,7 @@ int main() { // Notify dispatcher core that it has completed if (mailboxes->launch.mode == DISPATCH_MODE_DEV) { - uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(DISPATCH_CORE_X), NOC_Y(DISPATCH_CORE_Y), DISPATCH_MESSAGE_ADDR); + uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(mailboxes->launch.dispatch_core_x), NOC_Y(mailboxes->launch.dispatch_core_y), DISPATCH_MESSAGE_ADDR); DEBUG_SANITIZE_NOC_ADDR(dispatch_addr, 4); noc_fast_atomic_increment(noc_index, NCRISC_AT_CMD_BUF, dispatch_addr, NOC_UNICAST_WRITE_VC, 1, 31 /*wrap*/, false /*linked*/); } diff --git a/tt_metal/hw/inc/dev_msgs.h b/tt_metal/hw/inc/dev_msgs.h index f8e53124b785..c63ac7070507 100644 --- a/tt_metal/hw/inc/dev_msgs.h +++ b/tt_metal/hw/inc/dev_msgs.h @@ -73,6 +73,8 @@ struct launch_msg_t { // must be cacheline aligned volatile uint8_t brisc_noc_id; volatile uint8_t enables[DISPATCH_CLASS_MAX_PROC]; volatile uint8_t max_cb_index; + volatile uint8_t dispatch_core_x; + volatile uint8_t dispatch_core_y; volatile uint8_t run; // must be in last cacheline of this msg }; diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index c129651eaba7..46a77c88e37d 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -31,14 +31,8 @@ Device::Device( chip_id_t device_id, const uint8_t num_hw_cqs, size_t l1_small_size, size_t trace_region_size, const std::vector &l1_bank_remap, bool minimal, uint32_t worker_core) : id_(device_id), worker_thread_core(worker_core), work_executor(worker_core, device_id) { ZoneScoped; - TT_ASSERT(num_hw_cqs > 0 and num_hw_cqs < 3, "num_hw_cqs can be between 1 and 2"); + TT_ASSERT(num_hw_cqs > 0 and num_hw_cqs <= Device::max_num_hw_cqs, "num_hw_cqs can be between 1 and {}", Device::max_num_hw_cqs); this->build_key_ = tt::Cluster::instance().get_harvesting_mask(device_id); - if (!this->is_mmio_capable()) { - //Remote device Vs MMIO Device need to be unique even if they have same harvesting mask because - //dispatch cores are allocated differently on two types of devices. - //harvest mask is 12-bit mask so adding 100000 should not alias with any possible harvest mask value. - this->build_key_ += 100000; - } tunnel_device_dispatch_workers_ = {}; this->initialize(num_hw_cqs, l1_small_size, trace_region_size, l1_bank_remap, minimal); } @@ -1900,12 +1894,23 @@ uint32_t Device::get_noc_unicast_encoding(uint8_t noc_index, const CoreCoord& ph uint32_t Device::get_noc_multicast_encoding(uint8_t noc_index, const CoreRange& physical_cores) const { const auto& grid_size = tt::Cluster::instance().get_soc_desc(this->id()).grid_size; - return NOC_MULTICAST_ENCODING( - NOC_0_X(noc_index, grid_size.x, physical_cores.start.x), - NOC_0_Y(noc_index, grid_size.y, physical_cores.start.y), - NOC_0_X(noc_index, grid_size.x, physical_cores.end.x), - NOC_0_Y(noc_index, grid_size.y, physical_cores.end.y) - ); + + // NOC 1 mcasts from bottom left to top right, so we need to reverse the coords + if (noc_index == 0) { + return NOC_MULTICAST_ENCODING( + NOC_0_X(noc_index, grid_size.x, physical_cores.start.x), + NOC_0_Y(noc_index, grid_size.y, physical_cores.start.y), + NOC_0_X(noc_index, grid_size.x, physical_cores.end.x), + NOC_0_Y(noc_index, grid_size.y, physical_cores.end.y) + ); + } else { + return NOC_MULTICAST_ENCODING( + NOC_0_X(noc_index, grid_size.x, physical_cores.end.x), + NOC_0_Y(noc_index, grid_size.y, physical_cores.end.y), + NOC_0_X(noc_index, grid_size.x, physical_cores.start.x), + NOC_0_Y(noc_index, grid_size.y, physical_cores.start.y) + ); + } } void Device::check_allocator_is_initialized() const { diff --git a/tt_metal/impl/device/device.hpp b/tt_metal/impl/device/device.hpp index 7d29a4ba3eee..35a962273e9b 100644 --- a/tt_metal/impl/device/device.hpp +++ b/tt_metal/impl/device/device.hpp @@ -244,6 +244,7 @@ class Device { friend class SystemMemoryManager; static constexpr MemoryAllocator allocator_scheme_ = MemoryAllocator::L1_BANKING; + static constexpr uint32_t max_num_hw_cqs = 2; chip_id_t id_; uint32_t build_key_; std::unique_ptr allocator_ = nullptr; diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index 8d61a55281d9..9cca6e99760f 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -40,7 +40,9 @@ std::condition_variable finish_cv; namespace tt::tt_metal { // TODO: Delete entries when programs are deleted to save memory -thread_local std::unordered_map +thread_local std::unordered_map< + uint64_t, + std::array, Device::max_num_hw_cqs>> EnqueueProgramCommand::cached_program_command_sequences = {}; // EnqueueReadBufferCommandSection @@ -160,8 +162,8 @@ void EnqueueWriteInterleavedBufferCommand::add_dispatch_write(HugepageDeviceComm void EnqueueWriteInterleavedBufferCommand::add_buffer_data(HugepageDeviceCommand& command_sequence) { uint32_t data_size_bytes = this->pages_to_write * this->padded_page_size; - uint32_t full_page_size = this->buffer.aligned_page_size(); // this->padded_page_size could be a partial page if buffer - // page size > MAX_PREFETCH_CMD_SIZE + uint32_t full_page_size = this->buffer.aligned_page_size(); // this->padded_page_size could be a partial page if + // buffer page size > MAX_PREFETCH_CMD_SIZE bool write_partial_pages = this->padded_page_size < full_page_size; uint32_t buffer_addr_offset = this->bank_base_address - this->buffer.address(); @@ -186,7 +188,8 @@ void EnqueueWriteInterleavedBufferCommand::add_buffer_data(HugepageDeviceCommand uint32_t unpadded_src_offset = (((buffer_addr_offset / this->padded_page_size) * num_banks) + this->dst_page_index) * this->buffer.page_size(); - if (this->buffer.page_size() % this->buffer.alignment() != 0 and this->buffer.page_size() != this->buffer.size()) { + if (this->buffer.page_size() % this->buffer.alignment() != 0 and + this->buffer.page_size() != this->buffer.size()) { // If page size is not 32B-aligned, we cannot do a contiguous write uint32_t src_address_offset = unpadded_src_offset; for (uint32_t sysmem_address_offset = 0; sysmem_address_offset < data_size_bytes; @@ -268,8 +271,8 @@ void EnqueueWriteBufferCommand::process() { this->add_dispatch_write(command_sequence); - uint32_t full_page_size = this->buffer.aligned_page_size(); // this->padded_page_size could be a partial page if buffer - // page size > MAX_PREFETCH_CMD_SIZE + uint32_t full_page_size = this->buffer.aligned_page_size(); // this->padded_page_size could be a partial page if + // buffer page size > MAX_PREFETCH_CMD_SIZE bool write_partial_pages = this->padded_page_size < full_page_size; this->add_buffer_data(command_sequence); @@ -287,18 +290,21 @@ EnqueueProgramCommand::EnqueueProgramCommand( Device* device, NOC noc_index, Program& program, + CoreCoord& dispatch_core, SystemMemoryManager& manager, uint32_t expected_num_workers_completed) : command_queue_id(command_queue_id), noc_index(noc_index), manager(manager), expected_num_workers_completed(expected_num_workers_completed), - program(program) { + program(program), + dispatch_core(dispatch_core) { this->device = device; this->dispatch_core_type = dispatch_core_manager::get(device->num_hw_cqs()).get_dispatch_core_type(device->id()); } -void EnqueueProgramCommand::assemble_preamble_commands(bool prefetch_stall) { +void EnqueueProgramCommand::assemble_preamble_commands( + CachedProgramCommandSequence& cached_program_command_sequence, bool prefetch_stall) { if (prefetch_stall) { // Wait command so previous program finishes // Wait command with barrier for binaries to commit to DRAM @@ -307,24 +313,21 @@ void EnqueueProgramCommand::assemble_preamble_commands(bool prefetch_stall) { CQ_PREFETCH_CMD_BARE_MIN_SIZE + // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT CQ_PREFETCH_CMD_BARE_MIN_SIZE; // CQ_PREFETCH_CMD_STALL - this->cached_program_command_sequences[program.id].preamble_command_sequence = - HostMemDeviceCommand(uncached_cmd_sequence_sizeB); + cached_program_command_sequence.preamble_command_sequence = HostMemDeviceCommand(uncached_cmd_sequence_sizeB); // Wait for Noc Write Barrier // wait for binaries to commit to dram, also wait for previous program to be done // Wait Noc Write Barrier, wait for binaries to be written to worker cores // Stall to allow binaries to commit to DRAM first // TODO: this can be removed for all but the first program run - this->cached_program_command_sequences[program.id] - .preamble_command_sequence.add_dispatch_wait_with_prefetch_stall( - true, DISPATCH_MESSAGE_ADDR, this->expected_num_workers_completed); + cached_program_command_sequence.preamble_command_sequence.add_dispatch_wait_with_prefetch_stall( + true, DISPATCH_MESSAGE_ADDR, this->expected_num_workers_completed); } else { // Wait command so previous program finishes constexpr uint32_t cached_cmd_sequence_sizeB = CQ_PREFETCH_CMD_BARE_MIN_SIZE; // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT - this->cached_program_command_sequences[program.id].preamble_command_sequence = - HostMemDeviceCommand(cached_cmd_sequence_sizeB); - this->cached_program_command_sequences[program.id].preamble_command_sequence.add_dispatch_wait( + cached_program_command_sequence.preamble_command_sequence = HostMemDeviceCommand(cached_cmd_sequence_sizeB); + cached_program_command_sequence.preamble_command_sequence.add_dispatch_wait( false, DISPATCH_MESSAGE_ADDR, this->expected_num_workers_completed); } } @@ -338,25 +341,33 @@ uint32_t get_max_write_packed_sub_cmds(uint32_t data_size, uint32_t max_prefetch uint32_t sub_cmd_sizeB = is_unicast ? sizeof(CQDispatchWritePackedUnicastSubCmd) : sizeof(CQDispatchWritePackedMulticastSubCmd); // Approximate calculation due to alignment - uint32_t max_prefetch_size = max_prefetch_cmd_size - sizeof(CQPrefetchCmd) - PCIE_ALIGNMENT - sizeof(CQDispatchCmd) - L1_ALIGNMENT; + uint32_t max_prefetch_size = + max_prefetch_cmd_size - sizeof(CQPrefetchCmd) - PCIE_ALIGNMENT - sizeof(CQDispatchCmd) - L1_ALIGNMENT; uint32_t max_prefetch_num_packed_cmds = no_stride ? (max_prefetch_size - align(data_size * sizeof(uint32_t), L1_ALIGNMENT)) / sub_cmd_sizeB - : max_prefetch_size / (align(data_size * sizeof(uint32_t), L1_ALIGNMENT) + sub_cmd_sizeB); - return min(max_prefetch_num_packed_cmds, is_unicast ? CQ_DISPATCH_CMD_PACKED_WRITE_MAX_UNICAST_SUB_CMDS : CQ_DISPATCH_CMD_PACKED_WRITE_MAX_MULTICAST_SUB_CMDS); + : max_prefetch_size / (align(data_size * sizeof(uint32_t), L1_ALIGNMENT) + sub_cmd_sizeB); + return min( + max_prefetch_num_packed_cmds, + is_unicast ? CQ_DISPATCH_CMD_PACKED_WRITE_MAX_UNICAST_SUB_CMDS + : CQ_DISPATCH_CMD_PACKED_WRITE_MAX_MULTICAST_SUB_CMDS); }; template -uint32_t insert_write_packed_payloads(const uint32_t num_sub_cmds, const uint32_t sub_cmd_sizeB, const uint32_t max_prefetch_command_size, std::vector>& packed_cmd_payloads) { +uint32_t insert_write_packed_payloads( + const uint32_t num_sub_cmds, + const uint32_t sub_cmd_sizeB, + const uint32_t max_prefetch_command_size, + std::vector>& packed_cmd_payloads) { const uint32_t aligned_sub_cmd_sizeB = align(sub_cmd_sizeB, L1_ALIGNMENT); - const uint32_t max_packed_sub_cmds_per_cmd = get_max_write_packed_sub_cmds(aligned_sub_cmd_sizeB, max_prefetch_command_size, false); + const uint32_t max_packed_sub_cmds_per_cmd = + get_max_write_packed_sub_cmds(aligned_sub_cmd_sizeB, max_prefetch_command_size, false); uint32_t rem_num_sub_cmds = num_sub_cmds; uint32_t cmd_payload_sizeB = 0; while (rem_num_sub_cmds != 0) { const uint32_t num_sub_cmds_in_cmd = std::min(max_packed_sub_cmds_per_cmd, rem_num_sub_cmds); const uint32_t aligned_data_sizeB = aligned_sub_cmd_sizeB * num_sub_cmds_in_cmd; - const uint32_t dispatch_cmd_sizeB = align( - sizeof(CQDispatchCmd) + num_sub_cmds_in_cmd * sizeof(PackedSubCmd), - L1_ALIGNMENT); + const uint32_t dispatch_cmd_sizeB = + align(sizeof(CQDispatchCmd) + num_sub_cmds_in_cmd * sizeof(PackedSubCmd), L1_ALIGNMENT); packed_cmd_payloads.emplace_back(num_sub_cmds_in_cmd, dispatch_cmd_sizeB + aligned_data_sizeB); cmd_payload_sizeB += align(sizeof(CQPrefetchCmd) + packed_cmd_payloads.back().second, PCIE_ALIGNMENT); rem_num_sub_cmds -= num_sub_cmds_in_cmd; @@ -434,7 +445,8 @@ void generate_runtime_args_cmds( } // Generate command sequence for unique (unicast) and common (multicast) runtime args -void EnqueueProgramCommand::assemble_runtime_args_commands() { +void EnqueueProgramCommand::assemble_runtime_args_commands( + CachedProgramCommandSequence& cached_program_command_sequence) { // Maps to enum class RISCV, tt_backend_api_types.h thread_local static const std::vector unique_processor_to_l1_arg_base_addr = { L1_KERNEL_CONFIG_BASE, @@ -542,8 +554,8 @@ void EnqueueProgramCommand::assemble_runtime_args_commands() { } // Reserve 2x for unique rtas as we pontentially split the cmds due to not fitting in one prefetch cmd // Common rtas are always expected to fit in one prefetch cmd - this->cached_program_command_sequences[program.id].runtime_args_command_sequences = {}; - this->cached_program_command_sequences[program.id].runtime_args_command_sequences.reserve( + cached_program_command_sequence.runtime_args_command_sequences = {}; + cached_program_command_sequence.runtime_args_command_sequences.reserve( 2 * unique_processors.size() + common_kernels.size()); std::vector> runtime_args_data_index; runtime_args_data_index.reserve(2 * (unique_processors.size() + common_kernels.size())); @@ -552,7 +564,7 @@ void EnqueueProgramCommand::assemble_runtime_args_commands() { // same for all splits for (const uint32_t& processor_idx : unique_processors) { generate_runtime_args_cmds( - this->cached_program_command_sequences[program.id].runtime_args_command_sequences, + cached_program_command_sequence.runtime_args_command_sequences, unique_processor_to_l1_arg_base_addr[processor_idx], unique_sub_cmds[processor_idx], unique_rt_data_and_sizes[processor_idx], @@ -566,7 +578,7 @@ void EnqueueProgramCommand::assemble_runtime_args_commands() { std::visit( [&](auto&& sub_cmds) { generate_runtime_args_cmds( - this->cached_program_command_sequences[program.id].runtime_args_command_sequences, + cached_program_command_sequence.runtime_args_command_sequences, common_processor_to_l1_arg_base_addr[kernel_id], sub_cmds, common_rt_data_and_sizes[kernel_id], @@ -579,20 +591,21 @@ void EnqueueProgramCommand::assemble_runtime_args_commands() { common_sub_cmds[kernel_id]); } uint32_t runtime_args_fetch_size_bytes = 0; - for (const auto& cmds : this->cached_program_command_sequences[program.id].runtime_args_command_sequences) { + for (const auto& cmds : cached_program_command_sequence.runtime_args_command_sequences) { // BRISC, NCRISC, TRISC... runtime_args_fetch_size_bytes += cmds.size_bytes(); } - this->cached_program_command_sequences[program.id].runtime_args_fetch_size_bytes = runtime_args_fetch_size_bytes; + cached_program_command_sequence.runtime_args_fetch_size_bytes = runtime_args_fetch_size_bytes; } -void EnqueueProgramCommand::assemble_device_commands() { - auto& cached_program_command_sequence = this->cached_program_command_sequences[this->program.id]; - if (!program.loaded_onto_device) { +void EnqueueProgramCommand::assemble_device_commands( + CachedProgramCommandSequence& cached_program_command_sequence, bool is_cached) { + if (not is_cached) { // Calculate size of command and fill program indices of data to update // TODO: Would be nice if we could pull this out of program uint32_t cmd_sequence_sizeB = 0; - const uint32_t max_prefetch_command_size = dispatch_constants::get(dispatch_core_type).max_prefetch_command_size(); + const uint32_t max_prefetch_command_size = + dispatch_constants::get(dispatch_core_type).max_prefetch_command_size(); for (const auto& [dst, transfer_info_vec] : program.program_transfer_info.multicast_semaphores) { uint32_t num_packed_cmds = 0; @@ -688,9 +701,11 @@ void EnqueueProgramCommand::assemble_device_commands() { max_overall_base_index = max(max_overall_base_index, max_base_index); i++; } - cb_config_size_bytes = (max_overall_base_index + UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG) * sizeof(uint32_t); + cb_config_size_bytes = + (max_overall_base_index + UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG) * sizeof(uint32_t); aligned_cb_config_size_bytes = align(cb_config_size_bytes, L1_ALIGNMENT); - cmd_sequence_sizeB += insert_write_packed_payloads(num_multicast_cb_sub_cmds, cb_config_size_bytes, max_prefetch_command_size, mcast_cb_payload); + cmd_sequence_sizeB += insert_write_packed_payloads( + num_multicast_cb_sub_cmds, cb_config_size_bytes, max_prefetch_command_size, mcast_cb_payload); } // Program Binaries and Go Signals @@ -821,9 +836,13 @@ void EnqueueProgramCommand::assemble_device_commands() { std::vector unicast_go_signal_sub_cmds; std::vector> multicast_go_signals_payload; std::vector> unicast_go_signals_payload; - const uint32_t go_signal_sizeB = sizeof(launch_msg_t); + constexpr uint32_t go_signal_sizeB = sizeof(launch_msg_t); + constexpr uint32_t aligned_go_signal_sizeB = align(go_signal_sizeB, L1_ALIGNMENT); + constexpr uint32_t go_signal_size_words = aligned_go_signal_sizeB / sizeof(uint32_t); for (KernelGroup& kernel_group : program.get_kernel_groups(CoreType::WORKER)) { kernel_group.launch_msg.mode = DISPATCH_MODE_DEV; + kernel_group.launch_msg.dispatch_core_x = this->dispatch_core.x; + kernel_group.launch_msg.dispatch_core_y = this->dispatch_core.y; const void* launch_message_data = (const void*)(&kernel_group.launch_msg); for (const CoreRange& core_range : kernel_group.core_ranges.ranges()) { CoreCoord physical_start = @@ -839,12 +858,18 @@ void EnqueueProgramCommand::assemble_device_commands() { } } if (multicast_go_signal_sub_cmds.size() > 0) { - cmd_sequence_sizeB += insert_write_packed_payloads(multicast_go_signal_sub_cmds.size(), sizeof(launch_msg_t), max_prefetch_command_size, multicast_go_signals_payload); + cmd_sequence_sizeB += insert_write_packed_payloads( + multicast_go_signal_sub_cmds.size(), + go_signal_sizeB, + max_prefetch_command_size, + multicast_go_signals_payload); } for (KernelGroup& kernel_group : program.get_kernel_groups(CoreType::ETH)) { kernel_group.launch_msg.mode = DISPATCH_MODE_DEV; - const void* launch_message_data = (const void*)(&kernel_group.launch_msg); + kernel_group.launch_msg.dispatch_core_x = this->dispatch_core.x; + kernel_group.launch_msg.dispatch_core_y = this->dispatch_core.y; + const void* launch_message_data = (const launch_msg_t*)(&kernel_group.launch_msg); for (const CoreRange& core_range : kernel_group.core_ranges.ranges()) { for (auto x = core_range.start.x; x <= core_range.end.x; x++) { for (auto y = core_range.start.y; y <= core_range.end.y; y++) { @@ -858,7 +883,11 @@ void EnqueueProgramCommand::assemble_device_commands() { } } if (unicast_go_signal_sub_cmds.size() > 0) { - cmd_sequence_sizeB += insert_write_packed_payloads(unicast_go_signal_sub_cmds.size(), sizeof(launch_msg_t), max_prefetch_command_size, unicast_go_signals_payload); + cmd_sequence_sizeB += insert_write_packed_payloads( + unicast_go_signal_sub_cmds.size(), + go_signal_sizeB, + max_prefetch_command_size, + unicast_go_signals_payload); } cached_program_command_sequence.program_command_sequence = HostMemDeviceCommand(cmd_sequence_sizeB); @@ -928,7 +957,7 @@ void EnqueueProgramCommand::assemble_device_commands() { if (num_multicast_cb_sub_cmds > 0) { uint32_t curr_sub_cmd_idx = 0; cached_program_command_sequence.cb_configs_payloads.reserve(num_multicast_cb_sub_cmds); - uint32_t cb_config_size_words = aligned_cb_config_size_bytes / sizeof(uint32_t); + const uint32_t cb_config_size_words = aligned_cb_config_size_bytes / sizeof(uint32_t); for (const auto& [num_sub_cmds_in_cmd, mcast_cb_payload_sizeB] : mcast_cb_payload) { uint32_t write_offset_bytes = program_command_sequence.write_offset_bytes(); program_command_sequence.add_dispatch_write_packed( @@ -940,9 +969,13 @@ void EnqueueProgramCommand::assemble_device_commands() { multicast_cb_config_data, curr_sub_cmd_idx); curr_sub_cmd_idx += num_sub_cmds_in_cmd; - uint32_t curr_sub_cmd_data_offset_words = (write_offset_bytes + CQ_PREFETCH_CMD_BARE_MIN_SIZE + align(num_sub_cmds_in_cmd * sizeof(CQDispatchWritePackedMulticastSubCmd), L1_ALIGNMENT)) / sizeof(uint32_t); + uint32_t curr_sub_cmd_data_offset_words = + (write_offset_bytes + CQ_PREFETCH_CMD_BARE_MIN_SIZE + + align(num_sub_cmds_in_cmd * sizeof(CQDispatchWritePackedMulticastSubCmd), L1_ALIGNMENT)) / + sizeof(uint32_t); for (uint32_t i = 0; i < num_sub_cmds_in_cmd; ++i) { - cached_program_command_sequence.cb_configs_payloads.push_back((uint32_t *)program_command_sequence.data() + curr_sub_cmd_data_offset_words); + cached_program_command_sequence.cb_configs_payloads.push_back( + (uint32_t*)program_command_sequence.data() + curr_sub_cmd_data_offset_words); curr_sub_cmd_data_offset_words += cb_config_size_words; } } @@ -970,9 +1003,12 @@ void EnqueueProgramCommand::assemble_device_commands() { } // Go Signals + cached_program_command_sequence.go_signals.reserve( + multicast_go_signal_sub_cmds.size() + unicast_go_signal_sub_cmds.size()); if (multicast_go_signal_sub_cmds.size() > 0) { uint32_t curr_sub_cmd_idx = 0; for (const auto& [num_sub_cmds_in_cmd, multicast_go_signal_payload_sizeB] : multicast_go_signals_payload) { + uint32_t write_offset_bytes = program_command_sequence.write_offset_bytes(); program_command_sequence.add_dispatch_write_packed( num_sub_cmds_in_cmd, GET_MAILBOX_ADDRESS_HOST(launch), @@ -982,12 +1018,22 @@ void EnqueueProgramCommand::assemble_device_commands() { multicast_go_signal_data, curr_sub_cmd_idx); curr_sub_cmd_idx += num_sub_cmds_in_cmd; + uint32_t curr_sub_cmd_data_offset_words = + (write_offset_bytes + CQ_PREFETCH_CMD_BARE_MIN_SIZE + + align(num_sub_cmds_in_cmd * sizeof(CQDispatchWritePackedMulticastSubCmd), L1_ALIGNMENT)) / + sizeof(uint32_t); + for (uint32_t i = 0; i < num_sub_cmds_in_cmd; ++i) { + cached_program_command_sequence.go_signals.push_back( + (launch_msg_t*)((uint32_t*)program_command_sequence.data() + curr_sub_cmd_data_offset_words)); + curr_sub_cmd_data_offset_words += go_signal_size_words; + } } } if (unicast_go_signal_sub_cmds.size() > 0) { uint32_t curr_sub_cmd_idx = 0; for (const auto& [num_sub_cmds_in_cmd, unicast_go_signal_payload_sizeB] : unicast_go_signals_payload) { + uint32_t write_offset_bytes = program_command_sequence.write_offset_bytes(); program_command_sequence.add_dispatch_write_packed( num_sub_cmds_in_cmd, GET_ETH_MAILBOX_ADDRESS_HOST(launch), @@ -997,9 +1043,19 @@ void EnqueueProgramCommand::assemble_device_commands() { unicast_go_signal_data, curr_sub_cmd_idx); curr_sub_cmd_idx += num_sub_cmds_in_cmd; + uint32_t curr_sub_cmd_data_offset_words = + (write_offset_bytes + CQ_PREFETCH_CMD_BARE_MIN_SIZE + + align(num_sub_cmds_in_cmd * sizeof(CQDispatchWritePackedUnicastSubCmd), L1_ALIGNMENT)) / + sizeof(uint32_t); + for (uint32_t i = 0; i < num_sub_cmds_in_cmd; ++i) { + cached_program_command_sequence.go_signals.push_back( + (launch_msg_t*)((uint32_t*)program_command_sequence.data() + curr_sub_cmd_data_offset_words)); + curr_sub_cmd_data_offset_words += go_signal_size_words; + } } } } else { + // CB Configs uint32_t i = 0; for (const auto& cbs_on_core_range : cached_program_command_sequence.circular_buffers_on_core_ranges) { uint32_t* cb_config_payload = cached_program_command_sequence.cb_configs_payloads[i]; @@ -1019,38 +1075,43 @@ void EnqueueProgramCommand::assemble_device_commands() { } i++; } + // Go Signals + // for (auto & go_signal : cached_program_command_sequence.go_signals) { + // go_signal->dispatch_core_x = this->dispatch_core.x; + // go_signal->dispatch_core_y = this->dispatch_core.y; + // } } } void EnqueueProgramCommand::process() { - // Calculate all commands size and determine how many fetch q entries to use + // Invalidate cache of all cqs for the program if it has been recompiled + if (not program.loaded_onto_device) { + this->cached_program_command_sequences[program.id].fill(std::nullopt); + program.loaded_onto_device = true; + } // Preamble, some waits and stalls // can be written directly to the issue queue - if (not program.loaded_onto_device) { - this->assemble_preamble_commands(true); + auto& cached_program_command_sequence = this->cached_program_command_sequences[program.id][this->command_queue_id]; + bool cmd_is_cached = cached_program_command_sequence.has_value(); + if (!cmd_is_cached) { + cached_program_command_sequence = {}; + this->assemble_preamble_commands(*cached_program_command_sequence, true); // Runtime Args Command Sequence - this->assemble_runtime_args_commands(); + this->assemble_runtime_args_commands(*cached_program_command_sequence); // Main Command Sequence - this->assemble_device_commands(); + this->assemble_device_commands(*cached_program_command_sequence, false); } else { static constexpr uint32_t count_offset = (sizeof(CQPrefetchCmd) + offsetof(CQDispatchCmd, wait.count)); - TT_ASSERT( - this->cached_program_command_sequences.find(program.id) != this->cached_program_command_sequences.end(), - "Program cache hit, but no stored command sequence"); - this->cached_program_command_sequences[program.id].preamble_command_sequence.update_cmd_sequence( + cached_program_command_sequence->preamble_command_sequence.update_cmd_sequence( count_offset, &this->expected_num_workers_completed, sizeof(uint32_t)); - this->assemble_device_commands(); + this->assemble_device_commands(*cached_program_command_sequence, true); } - const auto& cached_program_command_sequence = this->cached_program_command_sequences[program.id]; - - uint32_t preamble_fetch_size_bytes = cached_program_command_sequence.preamble_command_sequence.size_bytes(); - - uint32_t runtime_args_fetch_size_bytes = cached_program_command_sequence.runtime_args_fetch_size_bytes; - - uint32_t program_fetch_size_bytes = cached_program_command_sequence.program_command_sequence.size_bytes(); - + // Calculate all commands size and determine how many fetch q entries to use + uint32_t preamble_fetch_size_bytes = cached_program_command_sequence->preamble_command_sequence.size_bytes(); + uint32_t runtime_args_fetch_size_bytes = cached_program_command_sequence->runtime_args_fetch_size_bytes; + uint32_t program_fetch_size_bytes = cached_program_command_sequence->program_command_sequence.size_bytes(); uint32_t total_fetch_size_bytes = preamble_fetch_size_bytes + runtime_args_fetch_size_bytes + program_fetch_size_bytes; @@ -1061,16 +1122,16 @@ void EnqueueProgramCommand::process() { uint32_t write_ptr = this->manager.get_issue_queue_write_ptr(this->command_queue_id); this->manager.cq_write( - cached_program_command_sequence.preamble_command_sequence.data(), preamble_fetch_size_bytes, write_ptr); + cached_program_command_sequence->preamble_command_sequence.data(), preamble_fetch_size_bytes, write_ptr); write_ptr += preamble_fetch_size_bytes; - for (const auto& cmds : cached_program_command_sequence.runtime_args_command_sequences) { + for (const auto& cmds : cached_program_command_sequence->runtime_args_command_sequences) { this->manager.cq_write(cmds.data(), cmds.size_bytes(), write_ptr); write_ptr += cmds.size_bytes(); } this->manager.cq_write( - cached_program_command_sequence.program_command_sequence.data(), program_fetch_size_bytes, write_ptr); + cached_program_command_sequence->program_command_sequence.data(), program_fetch_size_bytes, write_ptr); this->manager.issue_queue_push_back(total_fetch_size_bytes, this->command_queue_id); @@ -1081,14 +1142,14 @@ void EnqueueProgramCommand::process() { this->manager.issue_queue_reserve(preamble_fetch_size_bytes, this->command_queue_id); uint32_t write_ptr = this->manager.get_issue_queue_write_ptr(this->command_queue_id); this->manager.cq_write( - cached_program_command_sequence.preamble_command_sequence.data(), preamble_fetch_size_bytes, write_ptr); + cached_program_command_sequence->preamble_command_sequence.data(), preamble_fetch_size_bytes, write_ptr); this->manager.issue_queue_push_back(preamble_fetch_size_bytes, this->command_queue_id); // One fetch queue entry for just the wait and stall, very inefficient this->manager.fetch_queue_reserve_back(this->command_queue_id); this->manager.fetch_queue_write(preamble_fetch_size_bytes, this->command_queue_id); // TODO: We can pack multiple RT args into one fetch q entry - for (const auto& cmds : cached_program_command_sequence.runtime_args_command_sequences) { + for (const auto& cmds : cached_program_command_sequence->runtime_args_command_sequences) { uint32_t fetch_size_bytes = cmds.size_bytes(); this->manager.issue_queue_reserve(fetch_size_bytes, this->command_queue_id); write_ptr = this->manager.get_issue_queue_write_ptr(this->command_queue_id); @@ -1102,7 +1163,7 @@ void EnqueueProgramCommand::process() { this->manager.issue_queue_reserve(program_fetch_size_bytes, this->command_queue_id); write_ptr = this->manager.get_issue_queue_write_ptr(this->command_queue_id); this->manager.cq_write( - cached_program_command_sequence.program_command_sequence.data(), program_fetch_size_bytes, write_ptr); + cached_program_command_sequence->program_command_sequence.data(), program_fetch_size_bytes, write_ptr); this->manager.issue_queue_push_back(program_fetch_size_bytes, this->command_queue_id); // One fetch queue entry for rest of program commands this->manager.fetch_queue_reserve_back(this->command_queue_id); @@ -1110,9 +1171,8 @@ void EnqueueProgramCommand::process() { } // Front load generating and caching preamble without stall during program loading stage - if (not program.loaded_onto_device) { - this->assemble_preamble_commands(false); - program.loaded_onto_device = true; + if (not cmd_is_cached) { + this->assemble_preamble_commands(*cached_program_command_sequence, false); } } @@ -1319,6 +1379,19 @@ HWCommandQueue::HWCommandQueue(Device* device, uint32_t id, NOC noc_index) : // Galaxy puts 4 devices per host channel until umd can provide one channel per device. this->size_B = this->size_B / 4; } + + CoreCoord enqueue_program_dispatch_core; + if (device->is_mmio_capable()) { + enqueue_program_dispatch_core = + dispatch_core_manager::get(device->num_hw_cqs()).dispatcher_core(device->id(), channel, id); + } else { + enqueue_program_dispatch_core = + dispatch_core_manager::get(device->num_hw_cqs()).dispatcher_d_core(device->id(), channel, id); + } + CoreType core_type = dispatch_core_manager::get(device->num_hw_cqs()).get_dispatch_core_type(device->id()); + this->physical_enqueue_program_dispatch_core = + device->physical_core_from_logical_core(enqueue_program_dispatch_core, core_type); + tt_cxy_pair completion_q_writer_location = dispatch_core_manager::get(device->num_hw_cqs()).completion_queue_writer_core(device->id(), channel, this->id); @@ -1768,7 +1841,13 @@ void HWCommandQueue::enqueue_program(Program& program, bool blocking) { } auto command = EnqueueProgramCommand( - this->id, this->device, this->noc_index, program, this->manager, expected_workers_completed); + this->id, + this->device, + this->noc_index, + program, + this->physical_enqueue_program_dispatch_core, + this->manager, + expected_workers_completed); this->enqueue_command(command, blocking); #ifdef DEBUG @@ -2394,9 +2473,6 @@ void EnqueueWriteBufferImpl( void EnqueueProgram( CommandQueue& cq, std::variant, std::shared_ptr> program, bool blocking) { detail::DispatchStateCheck(true); - if (cq.get_mode() != CommandQueue::CommandQueueMode::TRACE) { - TT_FATAL(cq.id() == 0, "EnqueueProgram only supported on first command queue on device for time being."); - } cq.run_command( CommandInterface{.type = EnqueueCommandType::ENQUEUE_PROGRAM, .blocking = blocking, .program = program}); } diff --git a/tt_metal/impl/dispatch/command_queue.hpp b/tt_metal/impl/dispatch/command_queue.hpp index f874edb25247..960a7c7b8b0e 100644 --- a/tt_metal/impl/dispatch/command_queue.hpp +++ b/tt_metal/impl/dispatch/command_queue.hpp @@ -294,6 +294,7 @@ class EnqueueProgramCommand : public Command { NOC noc_index; Program& program; SystemMemoryManager& manager; + CoreCoord dispatch_core; CoreType dispatch_core_type; uint32_t expected_num_workers_completed; @@ -303,22 +304,26 @@ class EnqueueProgramCommand : public Command { std::vector runtime_args_command_sequences; uint32_t runtime_args_fetch_size_bytes; HostMemDeviceCommand program_command_sequence; - std::vector cb_configs_payloads; + std::vector cb_configs_payloads; std::vector>> circular_buffers_on_core_ranges; + std::vector go_signals; }; - thread_local static std::unordered_map cached_program_command_sequences; + thread_local static std:: + unordered_map, Device::max_num_hw_cqs>> + cached_program_command_sequences; EnqueueProgramCommand( uint32_t command_queue_id, Device* device, NOC noc_index, Program& program, + CoreCoord& dispatch_core, SystemMemoryManager& manager, uint32_t expected_num_workers_completed); - void assemble_preamble_commands(bool prefetch_stall); - void assemble_device_commands(); - void assemble_runtime_args_commands(); + void assemble_preamble_commands(CachedProgramCommandSequence& cached_program_command_sequence, bool prefetch_stall); + void assemble_device_commands(CachedProgramCommandSequence& cached_program_command_sequence, bool is_cached); + void assemble_runtime_args_commands(CachedProgramCommandSequence& cached_program_command_sequence); void process(); @@ -490,6 +495,7 @@ class HWCommandQueue { ~HWCommandQueue(); + CoreCoord physical_enqueue_program_dispatch_core; CoreCoord completion_queue_writer_core; NOC noc_index; volatile bool is_dprint_server_hung(); diff --git a/tt_metal/jit_build/genfiles.cpp b/tt_metal/jit_build/genfiles.cpp index a67d3a7da520..05e7fa738b4a 100644 --- a/tt_metal/jit_build/genfiles.cpp +++ b/tt_metal/jit_build/genfiles.cpp @@ -595,8 +595,7 @@ static string generate_noc_addr_ranges_string( const std::vector& dram_cores, const std::vector& ethernet_cores, CoreCoord grid_size, - const std::vector& harvested_rows, - const CoreCoord& enqueue_program_physical_dispatch_core) { + const std::vector& harvested_rows) { stringstream ss; @@ -675,9 +674,6 @@ static string generate_noc_addr_ranges_string( ss << endl; ss << endl; - ss << "#define DISPATCH_CORE_X " << enqueue_program_physical_dispatch_core.x << endl; - ss << "#define DISPATCH_CORE_Y " << enqueue_program_physical_dispatch_core.y << endl; - return ss.str(); } @@ -691,11 +687,10 @@ void jit_build_genfiles_noc_addr_ranges_header( const std::vector& dram_cores, const std::vector& ethernet_cores, CoreCoord grid_size, - const std::vector& harvested_rows, - const CoreCoord& enqueue_program_physical_dispatch_core) { + const std::vector& harvested_rows) { string output_string = generate_noc_addr_ranges_string(pcie_addr_base, pcie_addr_size, dram_addr_base, dram_addr_size, - pcie_cores, dram_cores, ethernet_cores, grid_size, harvested_rows, enqueue_program_physical_dispatch_core); + pcie_cores, dram_cores, ethernet_cores, grid_size, harvested_rows); ofstream file_stream_br(path + "/brisc/noc_addr_ranges_gen.h"); file_stream_br << output_string; diff --git a/tt_metal/jit_build/genfiles.hpp b/tt_metal/jit_build/genfiles.hpp index 49d5876de2fe..8ede269f2ea1 100644 --- a/tt_metal/jit_build/genfiles.hpp +++ b/tt_metal/jit_build/genfiles.hpp @@ -42,8 +42,7 @@ void jit_build_genfiles_noc_addr_ranges_header( const std::vector& dram_cores, const std::vector& ethernet_cores, CoreCoord grid_size, - const std::vector& harvested_rows, - const CoreCoord& enqueue_program_physical_dispatch_core); + const std::vector& harvested_rows); void jit_build_genfiles_descriptors(const JitBuildEnv& env, JitBuildOptions& options);