From d9304dd4acf235b03a3e8953b70a069802e0704c Mon Sep 17 00:00:00 2001 From: Raymond Kim Date: Wed, 1 Jan 2025 14:48:00 -0500 Subject: [PATCH] Revert "#16356: Program Dispatch Modifications for MeshWorkload" This reverts commit 3755428c7e5db16f11a0e7751c6277d0b5ffcb01. --- tt_metal/impl/CMakeLists.txt | 1 - tt_metal/impl/device/device.cpp | 3 +- tt_metal/impl/device/device_pool.cpp | 2 +- tt_metal/impl/dispatch/command_queue.cpp | 1255 +++++++++++++- tt_metal/impl/dispatch/command_queue.hpp | 11 + .../impl/dispatch/dispatch_core_common.hpp | 5 - .../impl/dispatch/kernel_config/dispatch.cpp | 3 + .../impl/dispatch/kernel_config/dispatch.hpp | 16 - .../dispatch/kernel_config/dispatch_s.cpp | 2 +- .../dispatch/kernel_config/dispatch_s.hpp | 5 +- .../dispatch/kernel_config/eth_tunneler.cpp | 4 +- .../impl/dispatch/kernel_config/prefetch.cpp | 6 + .../impl/dispatch/kernel_config/prefetch.hpp | 13 - tt_metal/impl/dispatch/topology.cpp | 37 +- tt_metal/impl/kernels/kernel_types.hpp | 2 +- tt_metal/impl/program/program.cpp | 337 +++- tt_metal/impl/program/program.hpp | 30 +- .../impl/program/program_dispatch_utils.cpp | 1472 ----------------- .../impl/program/program_dispatch_utils.hpp | 112 -- tt_metal/tt_metal.cpp | 12 +- 20 files changed, 1503 insertions(+), 1825 deletions(-) delete mode 100644 tt_metal/impl/program/program_dispatch_utils.cpp delete mode 100644 tt_metal/impl/program/program_dispatch_utils.hpp diff --git a/tt_metal/impl/CMakeLists.txt b/tt_metal/impl/CMakeLists.txt index 25384d6f267..97946d994d0 100644 --- a/tt_metal/impl/CMakeLists.txt +++ b/tt_metal/impl/CMakeLists.txt @@ -17,7 +17,6 @@ set(IMPL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/allocator/basic_allocator.cpp ${CMAKE_CURRENT_SOURCE_DIR}/allocator/l1_banking_allocator.cpp ${CMAKE_CURRENT_SOURCE_DIR}/program/program.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/program/program_dispatch_utils.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dispatch/debug_tools.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dispatch/command_queue.cpp 
${CMAKE_CURRENT_SOURCE_DIR}/dispatch/worker_config_buffer.cpp diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index 2c1f685bd9d..8f5afb8603b 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -938,7 +938,7 @@ void Device::init_command_queue_host() { this->sysmem_manager_ = std::make_unique(this->id_, this->num_hw_cqs()); hw_command_queues_.resize(num_hw_cqs()); for (size_t cq_id = 0; cq_id < num_hw_cqs(); cq_id++) { - hw_command_queues_[cq_id] = std::make_unique(this, cq_id, dispatch_downstream_noc); + hw_command_queues_[cq_id] = std::make_unique(this, cq_id, NOC::NOC_0); // Need to do this since CommandQueue constructor is private sw_command_queues_.push_back(std::unique_ptr(new CommandQueue(this, cq_id))); } @@ -1750,7 +1750,6 @@ LaunchMessageRingBufferState& Device::get_worker_launch_message_buffer_state(Sub return this->active_sub_device_manager_->get_worker_launch_message_buffer_state(sub_device_id); } -// Main source to get NOC idx for dispatch core NOC Device::dispatch_go_signal_noc() const { return this->dispatch_s_enabled() ? 
NOC::NOC_1 : NOC::NOC_0; } diff --git a/tt_metal/impl/device/device_pool.cpp b/tt_metal/impl/device/device_pool.cpp index 0ccadf286d5..06f3d807460 100644 --- a/tt_metal/impl/device/device_pool.cpp +++ b/tt_metal/impl/device/device_pool.cpp @@ -334,12 +334,12 @@ void DevicePool::add_devices_to_pool(const std::vector& device_ids) { } } + populate_fd_kernels(devices_to_activate, this->num_hw_cqs); for (const auto& device_id : devices_to_activate) { if (not this->is_device_active(device_id)) { this->activate_device(device_id); } } - populate_fd_kernels(devices_to_activate, this->num_hw_cqs); } void DevicePool::register_worker_thread_for_device(v1::DeviceHandle device, std::thread::id worker_thread_id) { diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index 7d26c02eff2..f6c4d500289 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -35,11 +35,12 @@ #include "tt_metal/impl/dispatch/dispatch_core_manager.hpp" #include "tt_metal/impl/event/event.hpp" #include "tt_metal/impl/kernels/kernel.hpp" -#include "tt_metal/impl/program/program_dispatch_utils.hpp" #include "umd/device/tt_xy_pair.h" #include "llrt/hal.hpp" +#define CQ_PREFETCH_CMD_BARE_MIN_SIZE tt::tt_metal::hal.get_alignment(tt::tt_metal::HalMemType::HOST) + using namespace tt::tt_metal; namespace tt::tt_metal { @@ -112,10 +113,10 @@ void EnqueueReadBufferCommand::process() { uint32_t num_worker_counters = this->sub_device_ids.size(); // accounts for padding uint32_t cmd_sequence_sizeB = - hal.get_alignment(HalMemType::HOST) * num_worker_counters + // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT - hal.get_alignment(HalMemType::HOST) + // CQ_PREFETCH_CMD_STALL - hal.get_alignment(HalMemType::HOST) + // CQ_PREFETCH_CMD_RELAY_INLINE_NOFLUSH + CQ_DISPATCH_CMD_WRITE_LINEAR_HOST - hal.get_alignment(HalMemType::HOST); // CQ_PREFETCH_CMD_RELAY_LINEAR or CQ_PREFETCH_CMD_RELAY_PAGED + CQ_PREFETCH_CMD_BARE_MIN_SIZE * 
num_worker_counters + // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT + CQ_PREFETCH_CMD_BARE_MIN_SIZE + // CQ_PREFETCH_CMD_STALL + CQ_PREFETCH_CMD_BARE_MIN_SIZE + // CQ_PREFETCH_CMD_RELAY_INLINE_NOFLUSH + CQ_DISPATCH_CMD_WRITE_LINEAR_HOST + CQ_PREFETCH_CMD_BARE_MIN_SIZE; // CQ_PREFETCH_CMD_RELAY_LINEAR or CQ_PREFETCH_CMD_RELAY_PAGED void* cmd_region = this->manager.issue_queue_reserve(cmd_sequence_sizeB, this->command_queue_id); @@ -291,7 +292,7 @@ void EnqueueWriteBufferCommand::process() { sizeof(CQDispatchCmd) + // CQ_DISPATCH_CMD_WRITE_PAGED or CQ_DISPATCH_CMD_WRITE_LINEAR data_size_bytes, pcie_alignment); if (this->issue_wait) { - cmd_sequence_sizeB += hal.get_alignment(HalMemType::HOST) * num_worker_counters; // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT + cmd_sequence_sizeB += CQ_PREFETCH_CMD_BARE_MIN_SIZE * num_worker_counters; // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT } void* cmd_region = this->manager.issue_queue_reserve(cmd_sequence_sizeB, this->command_queue_id); @@ -358,6 +359,1067 @@ EnqueueProgramCommand::EnqueueProgramCommand( dispatch_constants::get(this->dispatch_core_type).get_dispatch_message_offset(this->sub_device_id.to_index()); } +void EnqueueProgramCommand::assemble_preamble_commands( + ProgramCommandSequence& program_command_sequence, const tt::stl::Span kernel_config_addrs) { + uint32_t uncached_cmd_sequence_sizeB = + CQ_PREFETCH_CMD_BARE_MIN_SIZE; // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_SET_WRITE_OFFSET + + program_command_sequence.preamble_command_sequence = + HostMemDeviceCommand(uncached_cmd_sequence_sizeB); + + // Send write offsets + if (hal.get_programmable_core_type_count() >= 2) { + program_command_sequence.preamble_command_sequence.add_dispatch_set_write_offsets( + 0, + kernel_config_addrs[hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX)].addr, + kernel_config_addrs[hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH)].addr); + } else { + 
program_command_sequence.preamble_command_sequence.add_dispatch_set_write_offsets( + 0, kernel_config_addrs[hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX)].addr, 0); + } +} + +void EnqueueProgramCommand::assemble_stall_commands(ProgramCommandSequence& program_command_sequence, bool prefetch_stall) { + if (prefetch_stall) { + // Wait command so previous program finishes + // Wait command with barrier for binaries to commit to DRAM + // Prefetch stall to prevent prefetcher picking up incomplete binaries from DRAM + uint32_t uncached_cmd_sequence_sizeB = + CQ_PREFETCH_CMD_BARE_MIN_SIZE + // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT + CQ_PREFETCH_CMD_BARE_MIN_SIZE; // CQ_PREFETCH_CMD_STALL + + program_command_sequence.stall_command_sequences[UncachedStallSequenceIdx] = + HostMemDeviceCommand(uncached_cmd_sequence_sizeB); + + // Wait for Noc Write Barrier + // wait for binaries to commit to dram, also wait for previous program to be done + // Wait Noc Write Barrier, wait for binaries to be written to worker cores + // Stall to allow binaries to commit to DRAM first + // TODO: this can be removed for all but the first program run + program_command_sequence.stall_command_sequences[UncachedStallSequenceIdx].add_dispatch_wait_with_prefetch_stall( + true, this->dispatch_message_addr, this->expected_num_workers_completed); + } else { + // Wait command so previous program finishes + uint32_t cached_cmd_sequence_sizeB = + CQ_PREFETCH_CMD_BARE_MIN_SIZE; // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT + + program_command_sequence.stall_command_sequences[CachedStallSequenceIdx] = + HostMemDeviceCommand(cached_cmd_sequence_sizeB); + program_command_sequence.stall_command_sequences[CachedStallSequenceIdx].add_dispatch_wait( + false, this->dispatch_message_addr, this->expected_num_workers_completed); + } +} + +template +uint32_t get_max_write_packed_sub_cmds( + uint32_t data_size, uint32_t max_prefetch_cmd_size, uint32_t 
packed_write_max_unicast_sub_cmds, bool no_stride) { + static_assert( + std::is_same::value or + std::is_same::value); + constexpr bool is_unicast = std::is_same::value; + uint32_t sub_cmd_sizeB = + is_unicast ? sizeof(CQDispatchWritePackedUnicastSubCmd) : sizeof(CQDispatchWritePackedMulticastSubCmd); + // Approximate calculation due to alignment + uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + uint32_t max_prefetch_size = + max_prefetch_cmd_size - sizeof(CQPrefetchCmd) - hal.get_alignment(HalMemType::HOST) - sizeof(CQDispatchCmd) - l1_alignment; + uint32_t max_prefetch_num_packed_cmds = + no_stride ? (max_prefetch_size - align(data_size * sizeof(uint32_t), l1_alignment)) / sub_cmd_sizeB + : max_prefetch_size / (align(data_size * sizeof(uint32_t), l1_alignment) + sub_cmd_sizeB); + + uint32_t packed_write_max_multicast_sub_cmds = + get_packed_write_max_multicast_sub_cmds(packed_write_max_unicast_sub_cmds); + return std::min( + max_prefetch_num_packed_cmds, + is_unicast ? packed_write_max_unicast_sub_cmds : packed_write_max_multicast_sub_cmds); +}; + +template +uint32_t insert_write_packed_payloads( + const uint32_t num_sub_cmds, + const uint32_t sub_cmd_sizeB, + const uint32_t max_prefetch_command_size, + const uint32_t packed_write_max_unicast_sub_cmds, + std::vector>& packed_cmd_payloads) { + uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + const uint32_t aligned_sub_cmd_sizeB = align(sub_cmd_sizeB, l1_alignment); + const uint32_t max_packed_sub_cmds_per_cmd = get_max_write_packed_sub_cmds( + aligned_sub_cmd_sizeB, max_prefetch_command_size, packed_write_max_unicast_sub_cmds, false); + uint32_t rem_num_sub_cmds = num_sub_cmds; + uint32_t cmd_payload_sizeB = 0; + uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST); + while (rem_num_sub_cmds != 0) { + const uint32_t num_sub_cmds_in_cmd = std::min(max_packed_sub_cmds_per_cmd, rem_num_sub_cmds); + const uint32_t aligned_data_sizeB = aligned_sub_cmd_sizeB * num_sub_cmds_in_cmd; + 
const uint32_t dispatch_cmd_sizeB = + align(sizeof(CQDispatchCmd) + num_sub_cmds_in_cmd * sizeof(PackedSubCmd), l1_alignment); + packed_cmd_payloads.emplace_back(num_sub_cmds_in_cmd, dispatch_cmd_sizeB + aligned_data_sizeB); + cmd_payload_sizeB += align(sizeof(CQPrefetchCmd) + packed_cmd_payloads.back().second, pcie_alignment); + rem_num_sub_cmds -= num_sub_cmds_in_cmd; + } + return cmd_payload_sizeB; +} + +template +void generate_runtime_args_cmds( + std::vector& runtime_args_command_sequences, + const uint32_t& l1_arg_base_addr, + const std::vector& sub_cmds, + const std::vector>>& rt_data_and_sizes, + const uint32_t& max_runtime_args_len, + std::vector>>& rt_args_data, + const uint32_t max_prefetch_command_size, + const uint32_t packed_write_max_unicast_sub_cmds, + bool no_stride, + enum DispatchWriteOffsets write_offset_index) { + static_assert( + std::is_same::value or + std::is_same::value); + + thread_local static auto get_runtime_payload_sizeB = + [](uint32_t num_packed_cmds, uint32_t runtime_args_len, bool is_unicast, bool no_stride) { + uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + uint32_t sub_cmd_sizeB = + is_unicast ? sizeof(CQDispatchWritePackedUnicastSubCmd) : sizeof(CQDispatchWritePackedMulticastSubCmd); + uint32_t dispatch_cmd_sizeB = sizeof(CQDispatchCmd) + align(num_packed_cmds * sub_cmd_sizeB, l1_alignment); + uint32_t aligned_runtime_data_sizeB = + (no_stride ? 1 : num_packed_cmds) * align(runtime_args_len * sizeof(uint32_t), l1_alignment); + return dispatch_cmd_sizeB + aligned_runtime_data_sizeB; + }; + thread_local static auto get_runtime_args_data_offset = + [](uint32_t num_packed_cmds, uint32_t runtime_args_len, bool is_unicast) { + uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + uint32_t sub_cmd_sizeB = + is_unicast ? 
sizeof(CQDispatchWritePackedUnicastSubCmd) : sizeof(CQDispatchWritePackedMulticastSubCmd); + uint32_t dispatch_cmd_sizeB = sizeof(CQDispatchCmd) + align(num_packed_cmds * sub_cmd_sizeB, l1_alignment); + return sizeof(CQPrefetchCmd) + dispatch_cmd_sizeB; + }; + + constexpr bool unicast = std::is_same::value; + + uint32_t num_packed_cmds_in_seq = sub_cmds.size(); + uint32_t max_packed_cmds = get_max_write_packed_sub_cmds( + max_runtime_args_len, max_prefetch_command_size, packed_write_max_unicast_sub_cmds, no_stride); + uint32_t offset_idx = 0; + if (no_stride) { + TT_FATAL( + max_packed_cmds >= num_packed_cmds_in_seq, + "num_packed_cmds_in_seq {} cannot exceed max_packed_cmds {} when no_stride is true", + num_packed_cmds_in_seq, + max_packed_cmds); + } + uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST); + uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + while (num_packed_cmds_in_seq != 0) { + // Generate the device command + uint32_t num_packed_cmds = std::min(num_packed_cmds_in_seq, max_packed_cmds); + uint32_t rt_payload_sizeB = + get_runtime_payload_sizeB(num_packed_cmds, max_runtime_args_len, unicast, no_stride); + uint32_t cmd_sequence_sizeB = align(sizeof(CQPrefetchCmd) + rt_payload_sizeB, pcie_alignment); + runtime_args_command_sequences.emplace_back(cmd_sequence_sizeB); + runtime_args_command_sequences.back().add_dispatch_write_packed( + num_packed_cmds, + l1_arg_base_addr, + max_runtime_args_len * sizeof(uint32_t), + rt_payload_sizeB, + sub_cmds, + rt_data_and_sizes, + packed_write_max_unicast_sub_cmds, + offset_idx, + no_stride, + write_offset_index); + + // Update kernel RTA pointers to point into the generated command + // Future RTA updates through the API will update the command sequence directly + uint32_t data_offset = (uint32_t)get_runtime_args_data_offset(num_packed_cmds, max_runtime_args_len, unicast); + const uint32_t data_inc = align(max_runtime_args_len * sizeof(uint32_t), l1_alignment); + uint32_t num_data_copies = 
no_stride ? 1 : num_packed_cmds; + for (uint32_t i = offset_idx; i < offset_idx + num_data_copies; ++i) { + uint32_t offset = 0; + for (auto& data : rt_args_data[i]) { + data.get().rt_args_data = + (uint32_t*)((char*)runtime_args_command_sequences.back().data() + data_offset + offset); + offset += data.get().rt_args_count * sizeof(uint32_t); + } + data_offset += data_inc; + } + num_packed_cmds_in_seq -= num_packed_cmds; + offset_idx += num_packed_cmds; + } +} + +// Generate command sequence for unique (unicast) and common (multicast) runtime args +void EnqueueProgramCommand::assemble_runtime_args_commands(ProgramCommandSequence& program_command_sequence) { + CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(this->device->id()); + const uint32_t max_prefetch_command_size = dispatch_constants::get(dispatch_core_type).max_prefetch_command_size(); + + // Note: each sub_cmd contain data for multiple kernels (DM*, COMPUTE) + // the outer vector counts through the kernels, the inner vectors contains the data for each kernel + std::vector unique_sub_cmds; + std::vector>> unique_rt_data_and_sizes; + std::vector>> unique_rt_args_data; + + std::variant, std::vector> + common_sub_cmds; + std::vector>> common_rt_data_and_sizes; + std::vector>> common_rt_args_data; + + program_command_sequence.runtime_args_command_sequences = {}; + + uint32_t command_count = 0; + // Unique RTAs + for (uint32_t programmable_core_type_index = 0; + programmable_core_type_index < hal.get_programmable_core_type_count(); + programmable_core_type_index++) { + if (hal.get_programmable_core_type(programmable_core_type_index) == HalProgrammableCoreType::IDLE_ETH) { + // Fast dispatch not supported on IDLE_ETH yet + // TODO: can't just loop here as code below confuses ACTIVE/IDLE + continue; + } + for (auto& kg : program.get_kernel_groups(programmable_core_type_index)) { + if (kg.total_rta_size != 0) { + uint32_t num_sub_cmds = kg.core_ranges.num_cores(); + uint32_t 
max_runtime_args_len = kg.total_rta_size / sizeof(uint32_t); + uint32_t max_packed_cmds = get_max_write_packed_sub_cmds( + max_runtime_args_len, max_prefetch_command_size, packed_write_max_unicast_sub_cmds, false); + command_count += div_up(num_sub_cmds, max_packed_cmds); + } + } + } + // Common RTAs + for (size_t kernel_id = 0; kernel_id < program.num_kernels(); kernel_id++) { + auto kernel = detail::GetKernel(program, kernel_id); + auto programmable_core_type = kernel->get_kernel_programmable_core_type(); + if (programmable_core_type == HalProgrammableCoreType::IDLE_ETH) { + // Fast dispatch not supported on IDLE_ETH yet + // TODO: can't just loop here as code below confuses ACTIVE/IDLE + continue; + } + uint32_t programmable_core_type_index = hal.get_programmable_core_type_index(programmable_core_type); + uint32_t common_size = program.get_program_config(programmable_core_type_index) + .crta_sizes[kernel->dispatch_class()]; + if (common_size != 0) { + uint32_t max_runtime_args_len = common_size / sizeof(uint32_t); + const auto& common_rt_args = kernel->common_runtime_args(); + if (common_rt_args.size() > 0) { + CoreType core_type = hal.get_core_type(programmable_core_type_index); + if (core_type == CoreType::ETH) { + uint32_t num_sub_cmds = kernel->logical_cores().size(); + uint32_t max_packed_cmds = get_max_write_packed_sub_cmds( + max_runtime_args_len, max_prefetch_command_size, packed_write_max_unicast_sub_cmds, true); + command_count += div_up(num_sub_cmds, max_packed_cmds); + } else { + uint32_t num_sub_cmds = kernel->logical_coreranges().size(); + uint32_t max_packed_cmds = get_max_write_packed_sub_cmds( + max_runtime_args_len, max_prefetch_command_size, packed_write_max_unicast_sub_cmds, true); + command_count += div_up(num_sub_cmds, max_packed_cmds); + } + } + } + } + + program_command_sequence.runtime_args_command_sequences.reserve(command_count); + // Unique Runtime Args (Unicast) + for (uint32_t index = 0; index < 
hal.get_programmable_core_type_count(); index++) { + if (hal.get_programmable_core_type(index) == HalProgrammableCoreType::IDLE_ETH) { + // Fast dispatch not supported on IDLE_ETH yet + // TODO: can't just loop here as code below confuses ACTIVE/IDLE + continue; + } + CoreType core_type = hal.get_core_type(index); + uint32_t processor_classes = hal.get_processor_classes_count(index); + + for (auto& kg : program.get_kernel_groups(index)) { + if (kg.total_rta_size != 0) { + for (const CoreRange& core_range : kg.core_ranges.ranges()) { + for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) { + for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { + CoreCoord core_coord(x, y); + + unique_rt_args_data.resize(unique_rt_args_data.size() + 1); + unique_rt_data_and_sizes.resize(unique_rt_data_and_sizes.size() + 1); + for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { + auto& optional_id = kg.kernel_ids[dispatch_class]; + if (optional_id) { + auto kernel = detail::GetKernel(program, optional_id.value()); + if (!kernel->cores_with_runtime_args().empty()) { + const auto& runtime_args_data = kernel->runtime_args(core_coord); + unique_rt_args_data.back().emplace_back(kernel->runtime_args_data(core_coord)); + TT_ASSERT( + runtime_args_data.size() * sizeof(uint32_t) <= + kg.rta_sizes[dispatch_class]); + unique_rt_data_and_sizes.back().emplace_back( + runtime_args_data.data(), + runtime_args_data.size() * sizeof(uint32_t), + kg.rta_sizes[dispatch_class]); + } + } + } + CoreCoord virtual_core = device->virtual_core_from_logical_core(core_coord, core_type); + unique_sub_cmds.emplace_back(CQDispatchWritePackedUnicastSubCmd{ + .noc_xy_addr = this->device->get_noc_unicast_encoding(this->noc_index, virtual_core)}); + } + } + } + uint32_t rta_offset = program.get_program_config(index).rta_offset; + generate_runtime_args_cmds( + program_command_sequence.runtime_args_command_sequences, + rta_offset, + 
unique_sub_cmds, + unique_rt_data_and_sizes, + kg.total_rta_size / sizeof(uint32_t), + unique_rt_args_data, + max_prefetch_command_size, + packed_write_max_unicast_sub_cmds, + false, + core_type == CoreType::WORKER ? DISPATCH_WRITE_OFFSET_TENSIX_L1_CONFIG_BASE + : DISPATCH_WRITE_OFFSET_ETH_L1_CONFIG_BASE); + for (auto& data_per_kernel : unique_rt_data_and_sizes) { + for (auto& data_and_sizes : data_per_kernel) { + RecordDispatchData(program, DISPATCH_DATA_RTARGS, std::get<1>(data_and_sizes)); + } + } + unique_sub_cmds.clear(); + unique_rt_data_and_sizes.clear(); + unique_rt_args_data.clear(); + } + } + + for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { + uint32_t common_size = program.get_program_config(index).crta_sizes[dispatch_class]; + if (common_size == 0) { + continue; + } + for (size_t kernel_id = 0; kernel_id < program.num_kernels(); kernel_id++) { + auto kernel = detail::GetKernel(program, kernel_id); + if (kernel->get_kernel_core_type() != core_type) { + continue; // TODO: fixme, need list of kernels by core_typexdispatch_class + } + if (kernel->dispatch_class() != dispatch_class) { + continue; // TODO: fixme, need list of kernels by core_typexdispatch_class + } + + const auto& common_rt_args = kernel->common_runtime_args(); + if (common_rt_args.size() > 0) { + common_rt_args_data.resize(common_rt_args_data.size() + 1); + common_rt_data_and_sizes.resize(common_rt_data_and_sizes.size() + 1); + + TT_ASSERT(kernel->common_runtime_args_data().size() * sizeof(uint32_t) == common_size); + TT_ASSERT(common_rt_args.size() * sizeof(uint32_t) <= common_size); + common_rt_data_and_sizes.back().emplace_back( + common_rt_args.data(), common_rt_args.size() * sizeof(uint32_t), common_size); + common_rt_args_data.back().emplace_back(kernel->common_runtime_args_data()); + + if (core_type == CoreType::ETH) { + common_sub_cmds.emplace>( + std::vector()); + auto& unicast_sub_cmd = + std::get>(common_sub_cmds); + 
unicast_sub_cmd.reserve(kernel->logical_cores().size()); + for (auto& core_coord : kernel->logical_cores()) { + // can make a vector of unicast encodings here + CoreCoord virtual_core_coords = device->virtual_core_from_logical_core(core_coord, CoreType::ETH); + unicast_sub_cmd.emplace_back(CQDispatchWritePackedUnicastSubCmd{ + .noc_xy_addr = this->device->get_noc_unicast_encoding(this->noc_index, virtual_core_coords)}); + } + } else { + std::vector> dst_noc_multicast_info = + device->extract_dst_noc_multicast_info( + kernel->logical_coreranges(), core_type); + common_sub_cmds.emplace>( + std::vector()); + auto& multicast_sub_cmd = + std::get>(common_sub_cmds); + multicast_sub_cmd.reserve(dst_noc_multicast_info.size()); + for (const auto& mcast_dests : dst_noc_multicast_info) { + multicast_sub_cmd.emplace_back(CQDispatchWritePackedMulticastSubCmd{ + .noc_xy_addr = this->device->get_noc_multicast_encoding( + this->noc_index, std::get(mcast_dests.first)), + .num_mcast_dests = mcast_dests.second}); + } + } + } + } + + uint32_t crta_offset = program.get_program_config(index).crta_offsets[dispatch_class]; + + // Common rtas are always expected to fit in one prefetch cmd + // TODO: use a linear write instead of a packed-write + std::visit( + [&](auto&& sub_cmds) { + generate_runtime_args_cmds( + program_command_sequence.runtime_args_command_sequences, + crta_offset, + sub_cmds, + common_rt_data_and_sizes, + common_size / sizeof(uint32_t), + common_rt_args_data, + max_prefetch_command_size, + packed_write_max_unicast_sub_cmds, + true, + core_type == CoreType::WORKER ? 
DISPATCH_WRITE_OFFSET_TENSIX_L1_CONFIG_BASE + : DISPATCH_WRITE_OFFSET_ETH_L1_CONFIG_BASE); + sub_cmds.clear(); + }, + common_sub_cmds); + + for (auto& data_per_kernel : common_rt_data_and_sizes) { + for (auto& data_and_sizes : data_per_kernel) { + RecordDispatchData(program, DISPATCH_DATA_RTARGS, std::get<1>(data_and_sizes)); + } + } + common_rt_data_and_sizes.clear(); + common_rt_args_data.clear(); + } + } + TT_ASSERT( + command_count >= program_command_sequence.runtime_args_command_sequences.size(), + "Incorrect number of commands reserved {}, final size {}. Vector reallocation causes cached addresses to be incorrect.", + command_count, + program_command_sequence.runtime_args_command_sequences.size()); + + uint32_t runtime_args_fetch_size_bytes = 0; + for (const auto& cmds : program_command_sequence.runtime_args_command_sequences) { + // BRISC, NCRISC, TRISC... + runtime_args_fetch_size_bytes += cmds.size_bytes(); + } + program_command_sequence.runtime_args_fetch_size_bytes = runtime_args_fetch_size_bytes; +} + +void EnqueueProgramCommand::assemble_device_commands( + ProgramCommandSequence& program_command_sequence, const tt::stl::Span kernel_config_addrs) { + // Calculate size of command and fill program indices of data to update + // TODO: Would be nice if we could pull this out of program + uint32_t cmd_sequence_sizeB = 0; + const uint32_t max_prefetch_command_size = + dispatch_constants::get(dispatch_core_type).max_prefetch_command_size(); + + const auto &program_transfer_info = program.get_program_transfer_info(); + // Multicast Semaphore Cmd + uint32_t num_multicast_semaphores = program_transfer_info.multicast_semaphores.size(); + std::vector> multicast_sem_sub_cmds(num_multicast_semaphores); + std::vector>> multicast_sem_data(num_multicast_semaphores); + std::vector>> multicast_sem_payload(num_multicast_semaphores); + std::vector> multicast_sem_dst_size; + multicast_sem_dst_size.reserve(num_multicast_semaphores); + if (num_multicast_semaphores > 0) { + 
uint32_t i = 0; + for (const auto& [dst, transfer_info_vec] : program_transfer_info.multicast_semaphores) { + // TODO: loop over things inside transfer_info[i] + uint32_t write_packed_len = transfer_info_vec[0].data.size(); + multicast_sem_dst_size.emplace_back(std::make_pair(dst, write_packed_len * sizeof(uint32_t))); + + for (const auto& transfer_info : transfer_info_vec) { + for (const auto& dst_noc_info : transfer_info.dst_noc_info) { + TT_ASSERT( + transfer_info.data.size() == write_packed_len, + "Not all data std::vectors in write packed semaphore cmd equal in len"); + multicast_sem_sub_cmds[i].emplace_back(CQDispatchWritePackedMulticastSubCmd{ + .noc_xy_addr = this->device->get_noc_multicast_encoding( + this->noc_index, std::get(dst_noc_info.first)), + .num_mcast_dests = dst_noc_info.second}); + multicast_sem_data[i].emplace_back( + transfer_info.data.data(), transfer_info.data.size() * sizeof(uint32_t)); + } + } + cmd_sequence_sizeB += insert_write_packed_payloads( + multicast_sem_sub_cmds[i].size(), + multicast_sem_dst_size.back().second, + max_prefetch_command_size, + this->packed_write_max_unicast_sub_cmds, + multicast_sem_payload[i]); + i++; + } + } + + // Unicast Semaphore Cmd + uint32_t num_unicast_semaphores = program_transfer_info.unicast_semaphores.size(); + std::vector> unicast_sem_sub_cmds(num_unicast_semaphores); + std::vector>> unicast_sem_data(num_unicast_semaphores); + std::vector>> unicast_sem_payload(num_unicast_semaphores); + std::vector> unicast_sem_dst_size; + unicast_sem_dst_size.reserve(num_unicast_semaphores); + if (num_unicast_semaphores > 0) { + uint32_t i = 0; + for (const auto& [dst, transfer_info_vec] : program_transfer_info.unicast_semaphores) { + // TODO: loop over things inside transfer_info[i] + uint32_t write_packed_len = transfer_info_vec[0].data.size(); + unicast_sem_dst_size.emplace_back(std::make_pair(dst, write_packed_len * sizeof(uint32_t))); + + for (const auto& transfer_info : transfer_info_vec) { + for (const auto& 
dst_noc_info : transfer_info.dst_noc_info) { + TT_ASSERT( + transfer_info.data.size() == write_packed_len, + "Not all data std::vectors in write packed semaphore cmd equal in len"); + unicast_sem_sub_cmds[i].emplace_back(CQDispatchWritePackedUnicastSubCmd{ + .noc_xy_addr = this->device->get_noc_unicast_encoding( + this->noc_index, std::get(dst_noc_info.first))}); + unicast_sem_data[i].emplace_back( + transfer_info.data.data(), transfer_info.data.size() * sizeof(uint32_t)); + } + } + cmd_sequence_sizeB += insert_write_packed_payloads( + unicast_sem_sub_cmds[i].size(), + unicast_sem_dst_size.back().second, + max_prefetch_command_size, + this->packed_write_max_unicast_sub_cmds, + unicast_sem_payload[i]); + i++; + } + } + + uint32_t index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); + + const auto& circular_buffers_unique_coreranges = program.circular_buffers_unique_coreranges(); + const uint16_t num_multicast_cb_sub_cmds = circular_buffers_unique_coreranges.size(); + std::vector> mcast_cb_payload; + uint16_t cb_config_size_bytes = 0; + uint32_t aligned_cb_config_size_bytes = 0; + std::vector> cb_config_payloads( + num_multicast_cb_sub_cmds, + std::vector(program.get_program_config(index).cb_size / sizeof(uint32_t), 0)); + std::vector multicast_cb_config_sub_cmds; + std::vector> multicast_cb_config_data; + if (num_multicast_cb_sub_cmds > 0) { + multicast_cb_config_sub_cmds.reserve(num_multicast_cb_sub_cmds); + multicast_cb_config_data.reserve(num_multicast_cb_sub_cmds); + program_command_sequence.circular_buffers_on_core_ranges.resize(num_multicast_cb_sub_cmds); + uint32_t i = 0; + uint32_t max_overall_index = 0; + uint32_t remote_offset_index = program.get_program_config(index).local_cb_size / sizeof(uint32_t); + for (const CoreRange& core_range : circular_buffers_unique_coreranges) { + const CoreCoord virtual_start = device->virtual_core_from_logical_core(core_range.start_coord, CoreType::WORKER); + const CoreCoord virtual_end = 
device->virtual_core_from_logical_core(core_range.end_coord, CoreType::WORKER); + + const uint32_t num_receivers = core_range.size(); + auto& cb_config_payload = cb_config_payloads[i]; + uint32_t max_index = 0; + const auto& circular_buffers_on_corerange = program.circular_buffers_on_corerange(core_range); + program_command_sequence.circular_buffers_on_core_ranges[i].reserve( + circular_buffers_on_corerange.size()); + for (const std::shared_ptr& cb : circular_buffers_on_corerange) { + program_command_sequence.circular_buffers_on_core_ranges[i].emplace_back(cb); + const uint32_t cb_address = cb->address(); + const uint32_t cb_size = cb->size(); + for (const auto& buffer_index : cb->local_buffer_indices()) { + // 1 cmd for all 32 buffer indices, populate with real data for specified indices + // cb config payload + const uint32_t base_index = UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * buffer_index; + cb_config_payload[base_index] = cb_address; + cb_config_payload[base_index + 1] = cb_size; + cb_config_payload[base_index + 2] = cb->num_pages(buffer_index); + cb_config_payload[base_index + 3] = cb->page_size(buffer_index); + max_index = std::max(max_index, base_index + UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG); + } + for (const auto& buffer_index : cb->remote_buffer_indices()) { + const uint32_t base_index = + remote_offset_index + + (NUM_CIRCULAR_BUFFERS - 1 - buffer_index) * UINT32_WORDS_PER_REMOTE_CIRCULAR_BUFFER_CONFIG; + cb_config_payload[base_index] = cb->config_address(); + cb_config_payload[base_index + 1] = cb->page_size(buffer_index); + max_index = std::max(max_index, base_index + UINT32_WORDS_PER_REMOTE_CIRCULAR_BUFFER_CONFIG); + } + } + multicast_cb_config_sub_cmds.emplace_back(CQDispatchWritePackedMulticastSubCmd{ + .noc_xy_addr = this->device->get_noc_multicast_encoding( + this->noc_index, CoreRange(virtual_start, virtual_end)), + .num_mcast_dests = (uint32_t)core_range.size()}); + multicast_cb_config_data.emplace_back(cb_config_payload.data(), 
max_index * sizeof(uint32_t)); + max_overall_index = std::max(max_overall_index, max_index); + i++; + } + uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + cb_config_size_bytes = max_overall_index * sizeof(uint32_t); + aligned_cb_config_size_bytes = align(cb_config_size_bytes, l1_alignment); + cmd_sequence_sizeB += insert_write_packed_payloads( + num_multicast_cb_sub_cmds, + cb_config_size_bytes, + max_prefetch_command_size, + this->packed_write_max_unicast_sub_cmds, + mcast_cb_payload); + } + + // Program Binaries and Go Signals + // Get launch msg data while getting size of cmds + std::vector> kernel_bins_prefetch_subcmds; + std::vector> kernel_bins_dispatch_subcmds; + std::vector kernel_bins_write_packed_large_data_aligned_sizeB; + std::vector kernel_bins_unicast_cmds; + const uint32_t max_length_per_sub_cmd = dispatch_constants::get(this->dispatch_core_type).scratch_db_size() / 2; + const uint32_t max_paged_length_per_sub_cmd = + max_length_per_sub_cmd / HostMemDeviceCommand::PROGRAM_PAGE_SIZE * HostMemDeviceCommand::PROGRAM_PAGE_SIZE; + if (program_transfer_info.kernel_bins.size()) { + TT_FATAL(program.get_kernels_buffer(this->device).get(), "Expected Kernel Binary Buffer to be allocated for program."); + } + const auto kernels_buffer = program.get_kernels_buffer(this->device); + for (const auto& [cores, num_mcast_dests, kg_transfer_info] : program_transfer_info.kernel_bins) { + bool write_linear; + uint32_t noc_encoding; + std::visit( + [&](auto&& cores) { + using T = std::decay_t; + if constexpr (std::is_same_v) { + noc_encoding = this->device->get_noc_multicast_encoding(this->noc_index, cores); + write_linear = false; + } else { + noc_encoding = this->device->get_noc_unicast_encoding(this->noc_index, cores); + write_linear = true; + } + }, + cores); + for (uint32_t kernel_idx = 0; kernel_idx < kg_transfer_info.dst_base_addrs.size(); kernel_idx++) { + if (write_linear) { + kernel_bins_unicast_cmds.emplace_back(2 * CQ_PREFETCH_CMD_BARE_MIN_SIZE); + 
cmd_sequence_sizeB += 2 * CQ_PREFETCH_CMD_BARE_MIN_SIZE; + constexpr bool flush_prefetch = false; + kernel_bins_unicast_cmds.back().add_dispatch_write_linear( + num_mcast_dests, // num_mcast_dests + noc_encoding, // noc_xy_addr + kg_transfer_info.dst_base_addrs[kernel_idx], + kg_transfer_info.lengths[kernel_idx]); + RecordDispatchData( + program, + DISPATCH_DATA_BINARY, + kg_transfer_info.lengths[kernel_idx], + kg_transfer_info.riscvs[kernel_idx]); + // Difference between prefetch total relayed pages and dispatch write linear + uint32_t relayed_bytes = + align(kg_transfer_info.lengths[kernel_idx], HostMemDeviceCommand::PROGRAM_PAGE_SIZE); + uint16_t length_adjust = uint16_t(relayed_bytes - kg_transfer_info.lengths[kernel_idx]); + + uint32_t base_address, page_offset; + if (kg_transfer_info.page_offsets[kernel_idx] > CQ_PREFETCH_RELAY_PAGED_START_PAGE_MASK) { + const uint32_t num_banks = this->device->num_banks(kernels_buffer->buffer_type()); + page_offset = kg_transfer_info.page_offsets[kernel_idx] % num_banks; + uint32_t num_full_pages_written_per_bank = + kg_transfer_info.page_offsets[kernel_idx] / num_banks; + base_address = kernels_buffer->address() + + num_full_pages_written_per_bank * kernels_buffer->page_size(); + } else { + base_address = kernels_buffer->address(); + page_offset = kg_transfer_info.page_offsets[kernel_idx]; + } + + kernel_bins_unicast_cmds.back().add_prefetch_relay_paged( + true, // is_dram + page_offset, + base_address, + kernels_buffer->page_size(), + relayed_bytes / kernels_buffer->page_size(), + length_adjust); + } else { + uint32_t base_address = kernels_buffer->address(); + uint32_t page_offset = kg_transfer_info.page_offsets[kernel_idx]; + + // TODO: pack all these writes into 1 linear write + uint32_t kernel_config_buffer_offset = kg_transfer_info.dst_base_addrs[kernel_idx]; + uint32_t aligned_length = align(kg_transfer_info.lengths[kernel_idx], hal.get_alignment(HalMemType::DRAM)); + uint32_t padding = aligned_length - 
kg_transfer_info.lengths[kernel_idx]; + while (aligned_length != 0) { + if (kernel_bins_dispatch_subcmds.empty() || + kernel_bins_dispatch_subcmds.back().size() == + CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_MAX_SUB_CMDS) { + kernel_bins_dispatch_subcmds.push_back({}); + kernel_bins_prefetch_subcmds.push_back({}); + kernel_bins_write_packed_large_data_aligned_sizeB.push_back(0); + } + uint32_t write_length, read_length; + if (aligned_length <= max_length_per_sub_cmd) { + read_length = aligned_length; + write_length = read_length - padding; + } else { + read_length = max_paged_length_per_sub_cmd; + write_length = read_length; + } + if (!kernel_bins_dispatch_subcmds.back().empty()) { + auto& back = kernel_bins_dispatch_subcmds.back().back(); + if (back.noc_xy_addr != noc_encoding) { + back.flags = CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_UNLINK; + } + } + kernel_bins_dispatch_subcmds.back().emplace_back(CQDispatchWritePackedLargeSubCmd{ + .noc_xy_addr = noc_encoding, + .addr = kernel_config_buffer_offset, + .length = (uint16_t)write_length, + .num_mcast_dests = (uint8_t)num_mcast_dests, + .flags = CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_NONE}); + RecordDispatchData( + program, DISPATCH_DATA_BINARY, write_length, kg_transfer_info.riscvs[kernel_idx]); + kernel_config_buffer_offset += write_length; + + kernel_bins_prefetch_subcmds.back().emplace_back(CQPrefetchRelayPagedPackedSubCmd{ + .start_page = (uint16_t)page_offset, + .log_page_size = (uint16_t)HostMemDeviceCommand::LOG2_PROGRAM_PAGE_SIZE, + .base_addr = base_address, + .length = read_length}); + page_offset += read_length / HostMemDeviceCommand::PROGRAM_PAGE_SIZE; + aligned_length -= read_length; + kernel_bins_write_packed_large_data_aligned_sizeB.back() += read_length; + } + } + } + } + // Unlink the last subcmd of every dispatch, to ensure we don't hold the + // path reservation for an incredible long time. This also prevents a hang + // if the next mcast is to a different destination. 
+ for (auto& subcmd_list : kernel_bins_dispatch_subcmds) { + if (!subcmd_list.empty()) { + subcmd_list.back().flags |= CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_UNLINK; + } + } + uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST); + for (uint32_t i = 0; i < kernel_bins_dispatch_subcmds.size(); ++i) { + cmd_sequence_sizeB += align( + ((sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd))) + + kernel_bins_dispatch_subcmds[i].size() * sizeof(CQDispatchWritePackedLargeSubCmd), + pcie_alignment); + cmd_sequence_sizeB += align( + kernel_bins_prefetch_subcmds[i].size() * sizeof(CQPrefetchRelayPagedPackedSubCmd) + + sizeof(CQPrefetchCmd), + pcie_alignment); + } + + std::vector> multicast_go_signal_data; + std::vector> unicast_go_signal_data; + std::vector multicast_go_signal_sub_cmds; + std::vector unicast_go_signal_sub_cmds; + std::vector> multicast_go_signals_payload; + std::vector> unicast_go_signals_payload; + constexpr uint32_t go_signal_sizeB = sizeof(launch_msg_t); + uint32_t aligned_go_signal_sizeB = align(go_signal_sizeB, hal.get_alignment(HalMemType::L1)); + uint32_t go_signal_size_words = aligned_go_signal_sizeB / sizeof(uint32_t); + + // TODO: eventually the code below could be structured to loop over programmable_indices + // and check for mcast/unicast + uint32_t programmable_core_index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); + for (KernelGroup& kernel_group : program.get_kernel_groups(programmable_core_index)) { + kernel_group.launch_msg.kernel_config.mode = DISPATCH_MODE_DEV; + for (uint32_t i = 0; i < kernel_config_addrs.size(); i++) { + kernel_group.launch_msg.kernel_config.kernel_config_base[i] = kernel_config_addrs[i].addr; + } + kernel_group.launch_msg.kernel_config.host_assigned_id = program.get_runtime_id(); + const void* launch_message_data = (const void*)(&kernel_group.launch_msg); + for (const CoreRange& core_range : kernel_group.core_ranges.ranges()) { + CoreCoord virtual_start = 
device->virtual_core_from_logical_core(core_range.start_coord, kernel_group.get_core_type()); + CoreCoord virtual_end = device->virtual_core_from_logical_core(core_range.end_coord, kernel_group.get_core_type()); + + multicast_go_signal_sub_cmds.emplace_back(CQDispatchWritePackedMulticastSubCmd{ + .noc_xy_addr = this->device->get_noc_multicast_encoding( + this->noc_index, CoreRange(virtual_start, virtual_end)), + .num_mcast_dests = (uint32_t)core_range.size()}); + multicast_go_signal_data.emplace_back(launch_message_data, go_signal_sizeB); + } + } + if (multicast_go_signal_sub_cmds.size() > 0) { + cmd_sequence_sizeB += insert_write_packed_payloads( + multicast_go_signal_sub_cmds.size(), + go_signal_sizeB, + max_prefetch_command_size, + this->packed_write_max_unicast_sub_cmds, + multicast_go_signals_payload); + } + + programmable_core_index = hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH); + // TODO: ugly, can be fixed by looping over indices w/ some work + if (programmable_core_index != -1) { + for (KernelGroup& kernel_group : program.get_kernel_groups(programmable_core_index)) { + kernel_group.launch_msg.kernel_config.mode = DISPATCH_MODE_DEV; + for (uint32_t i = 0; i < kernel_config_addrs.size(); i++) { + kernel_group.launch_msg.kernel_config.kernel_config_base[i] = kernel_config_addrs[i].addr; + } + kernel_group.launch_msg.kernel_config.host_assigned_id = program.get_runtime_id(); + const void* launch_message_data = (const launch_msg_t*)(&kernel_group.launch_msg); + for (const CoreRange& core_range : kernel_group.core_ranges.ranges()) { + for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) { + for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { + CoreCoord virtual_coord = device->virtual_core_from_logical_core( + CoreCoord({x, y}), kernel_group.get_core_type()); + unicast_go_signal_sub_cmds.emplace_back(CQDispatchWritePackedUnicastSubCmd{ + .noc_xy_addr = + 
this->device->get_noc_unicast_encoding(this->noc_index, virtual_coord)}); + unicast_go_signal_data.emplace_back(launch_message_data, go_signal_sizeB); + } + } + } + } + } + + if (unicast_go_signal_sub_cmds.size() > 0) { + cmd_sequence_sizeB += insert_write_packed_payloads( + unicast_go_signal_sub_cmds.size(), + go_signal_sizeB, + max_prefetch_command_size, + this->packed_write_max_unicast_sub_cmds, + unicast_go_signals_payload); + } + // if dispatch_s is enabled have dispatch_d send a semaphore update to dispatch_s (this will include a write barrier on dispatch_d if program is active) + // if not, check if the program is active on workers. If active, have dispatch_d issue a write barrier + cmd_sequence_sizeB += (this->device->dispatch_s_enabled() || program_transfer_info.num_active_cores > 0) * CQ_PREFETCH_CMD_BARE_MIN_SIZE; + + // either dispatch_s or dispatch_d will send the go signal (go_signal_mcast command) + const auto& noc_data_start_idx = device->noc_data_start_index(this->sub_device_id, multicast_go_signal_sub_cmds.size() > 0, unicast_go_signal_sub_cmds.size() > 0); + const auto& num_noc_mcast_txns = multicast_go_signal_sub_cmds.size() > 0 ? device->num_noc_mcast_txns(this->sub_device_id) : 0; + const auto& num_noc_unicast_txns = unicast_go_signal_sub_cmds.size() > 0 ? 
device->num_noc_unicast_txns(this->sub_device_id) : 0; + cmd_sequence_sizeB += align(sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd), pcie_alignment); + + program_command_sequence.device_command_sequence = HostMemDeviceCommand(cmd_sequence_sizeB); + + auto& device_command_sequence = program_command_sequence.device_command_sequence; + + uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + + // Semaphores + // Multicast Semaphore Cmd + index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); + for (uint32_t i = 0; i < num_multicast_semaphores; ++i) { + uint32_t curr_sub_cmd_idx = 0; + for (const auto& [num_sub_cmds_in_cmd, multicast_sem_payload_sizeB] : multicast_sem_payload[i]) { + device_command_sequence.add_dispatch_write_packed( + num_sub_cmds_in_cmd, + multicast_sem_dst_size[i].first + program.get_program_config(index).sem_offset, + multicast_sem_dst_size[i].second, + multicast_sem_payload_sizeB, + multicast_sem_sub_cmds[i], + multicast_sem_data[i], + this->packed_write_max_unicast_sub_cmds, + curr_sub_cmd_idx, + false, + DISPATCH_WRITE_OFFSET_TENSIX_L1_CONFIG_BASE); + curr_sub_cmd_idx += num_sub_cmds_in_cmd; + for (auto& data_and_size : multicast_sem_data[i]) { + RecordDispatchData(program, DISPATCH_DATA_SEMAPHORE, data_and_size.second); + } + } + } + + // Unicast Semaphore Cmd + index = hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH); + for (uint32_t i = 0; i < num_unicast_semaphores; ++i) { + uint32_t curr_sub_cmd_idx = 0; + for (const auto& [num_sub_cmds_in_cmd, unicast_sem_payload_sizeB] : unicast_sem_payload[i]) { + device_command_sequence.add_dispatch_write_packed( + num_sub_cmds_in_cmd, + unicast_sem_dst_size[i].first + program.get_program_config(index).sem_offset, + unicast_sem_dst_size[i].second, + unicast_sem_payload_sizeB, + unicast_sem_sub_cmds[i], + unicast_sem_data[i], + this->packed_write_max_unicast_sub_cmds, + curr_sub_cmd_idx, + false, + DISPATCH_WRITE_OFFSET_ETH_L1_CONFIG_BASE); + 
curr_sub_cmd_idx += num_sub_cmds_in_cmd; + for (auto& data_and_size : unicast_sem_data[i]) { + RecordDispatchData(program, DISPATCH_DATA_SEMAPHORE, data_and_size.second); + } + } + } + + // CB Configs commands + index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); + if (num_multicast_cb_sub_cmds > 0) { + uint32_t curr_sub_cmd_idx = 0; + program_command_sequence.cb_configs_payloads.reserve(num_multicast_cb_sub_cmds); + const uint32_t cb_config_size_words = aligned_cb_config_size_bytes / sizeof(uint32_t); + for (const auto& [num_sub_cmds_in_cmd, mcast_cb_payload_sizeB] : mcast_cb_payload) { + uint32_t write_offset_bytes = device_command_sequence.write_offset_bytes(); + device_command_sequence.add_dispatch_write_packed( + num_sub_cmds_in_cmd, + program.get_program_config(index).cb_offset, + cb_config_size_bytes, + mcast_cb_payload_sizeB, + multicast_cb_config_sub_cmds, + multicast_cb_config_data, + this->packed_write_max_unicast_sub_cmds, + curr_sub_cmd_idx, + false, + DISPATCH_WRITE_OFFSET_TENSIX_L1_CONFIG_BASE); + for (auto& data_and_size : multicast_cb_config_data) { + RecordDispatchData(program, DISPATCH_DATA_CB_CONFIG, data_and_size.second); + } + curr_sub_cmd_idx += num_sub_cmds_in_cmd; + RecordDispatchData(program, DISPATCH_DATA_CB_CONFIG, mcast_cb_payload_sizeB); + uint32_t curr_sub_cmd_data_offset_words = + (write_offset_bytes + (sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd)) + + align(num_sub_cmds_in_cmd * sizeof(CQDispatchWritePackedMulticastSubCmd), l1_alignment)) / + sizeof(uint32_t); + for (uint32_t i = 0; i < num_sub_cmds_in_cmd; ++i) { + program_command_sequence.cb_configs_payloads.push_back( + (uint32_t*)device_command_sequence.data() + curr_sub_cmd_data_offset_words); + curr_sub_cmd_data_offset_words += cb_config_size_words; + } + } + } + + // All Previous Cmds Up to This Point Go Into the Kernel Config Buffer + program_command_sequence.program_config_buffer_data_size_bytes = + device_command_sequence.write_offset_bytes(); + 
+ // Program Binaries + for (const auto& kernel_bins_unicast_cmd : kernel_bins_unicast_cmds) { + device_command_sequence.add_data( + kernel_bins_unicast_cmd.data(), + kernel_bins_unicast_cmd.size_bytes(), + kernel_bins_unicast_cmd.size_bytes()); + } + uint32_t dram_alignment = hal.get_alignment(HalMemType::DRAM); + for (uint32_t i = 0; i < kernel_bins_dispatch_subcmds.size(); ++i) { + device_command_sequence.add_dispatch_write_packed_large( + dram_alignment, + kernel_bins_dispatch_subcmds[i].size(), + kernel_bins_dispatch_subcmds[i], + 0, + DISPATCH_WRITE_OFFSET_TENSIX_L1_CONFIG_BASE); + device_command_sequence.add_prefetch_relay_paged_packed( + kernel_bins_write_packed_large_data_aligned_sizeB[i], + kernel_bins_prefetch_subcmds[i], + kernel_bins_prefetch_subcmds[i].size()); + } + + // Go Signals + program_command_sequence.go_signals.reserve( + multicast_go_signal_sub_cmds.size() + unicast_go_signal_sub_cmds.size()); + + // Get the address for the slot this launch_message will be written to + uint32_t multicast_launch_msg_addr = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::LAUNCH) + this->multicast_cores_launch_message_wptr * sizeof(launch_msg_t); + + if (multicast_go_signal_sub_cmds.size() > 0) { + uint32_t curr_sub_cmd_idx = 0; + for (const auto& [num_sub_cmds_in_cmd, multicast_go_signal_payload_sizeB] : multicast_go_signals_payload) { + uint32_t write_offset_bytes = device_command_sequence.write_offset_bytes(); + device_command_sequence.add_dispatch_write_packed( + num_sub_cmds_in_cmd, + multicast_launch_msg_addr, + go_signal_sizeB, + multicast_go_signal_payload_sizeB, + multicast_go_signal_sub_cmds, + multicast_go_signal_data, + this->packed_write_max_unicast_sub_cmds, + curr_sub_cmd_idx); + curr_sub_cmd_idx += num_sub_cmds_in_cmd; + program_command_sequence.launch_msg_write_packed_cmd_ptrs.push_back(&((CQDispatchCmd*) ((uint32_t*)device_command_sequence.data() + (write_offset_bytes + sizeof(CQPrefetchCmd)) / 
sizeof(uint32_t)))->write_packed); + uint32_t curr_sub_cmd_data_offset_words = + (write_offset_bytes + (sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd)) + + align(num_sub_cmds_in_cmd * sizeof(CQDispatchWritePackedMulticastSubCmd), l1_alignment)) / + sizeof(uint32_t); + for (uint32_t i = 0; i < num_sub_cmds_in_cmd; ++i) { + program_command_sequence.go_signals.push_back( + (launch_msg_t*)((uint32_t*)device_command_sequence.data() + curr_sub_cmd_data_offset_words)); + curr_sub_cmd_data_offset_words += go_signal_size_words; + } + } + } + + if (unicast_go_signal_sub_cmds.size() > 0) { + uint32_t unicast_launch_msg_addr = hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalL1MemAddrType::LAUNCH) + this->unicast_cores_launch_message_wptr * sizeof(launch_msg_t); + uint32_t curr_sub_cmd_idx = 0; + for (const auto& [num_sub_cmds_in_cmd, unicast_go_signal_payload_sizeB] : unicast_go_signals_payload) { + uint32_t write_offset_bytes = device_command_sequence.write_offset_bytes(); + device_command_sequence.add_dispatch_write_packed( + num_sub_cmds_in_cmd, + unicast_launch_msg_addr, + go_signal_sizeB, + unicast_go_signal_payload_sizeB, + unicast_go_signal_sub_cmds, + unicast_go_signal_data, + this->packed_write_max_unicast_sub_cmds, + curr_sub_cmd_idx); + curr_sub_cmd_idx += num_sub_cmds_in_cmd; + program_command_sequence.unicast_launch_msg_write_packed_cmd_ptrs.push_back(&((CQDispatchCmd*) ((uint32_t*)device_command_sequence.data() + (write_offset_bytes + sizeof(CQPrefetchCmd)) / sizeof(uint32_t)))->write_packed); + uint32_t curr_sub_cmd_data_offset_words = + (write_offset_bytes + (sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd)) + + align(num_sub_cmds_in_cmd * sizeof(CQDispatchWritePackedUnicastSubCmd), l1_alignment)) / + sizeof(uint32_t); + for (uint32_t i = 0; i < num_sub_cmds_in_cmd; ++i) { + program_command_sequence.go_signals.push_back( + (launch_msg_t*)((uint32_t*)device_command_sequence.data() + curr_sub_cmd_data_offset_words)); + curr_sub_cmd_data_offset_words += 
go_signal_size_words; + } + } + } + + DispatcherSelect dispatcher_for_go_signal = DispatcherSelect::DISPATCH_MASTER; + auto sub_device_index = this->sub_device_id.to_index(); + if (this->device->dispatch_s_enabled()) { + // dispatch_d signals dispatch_s to send the go signal, use a barrier if there are cores active + uint16_t index_bitmask = 0; + index_bitmask |= 1 << sub_device_index; + device_command_sequence.add_notify_dispatch_s_go_signal_cmd(program_transfer_info.num_active_cores > 0, index_bitmask); + dispatcher_for_go_signal = DispatcherSelect::DISPATCH_SLAVE; + } else { + // Wait Noc Write Barrier, wait for binaries/configs and launch_msg to be written to worker cores + if (program_transfer_info.num_active_cores > 0) { + device_command_sequence.add_dispatch_wait(true, this->dispatch_message_addr, 0, 0, false, false); + } + } + go_msg_t run_program_go_signal; + run_program_go_signal.signal = RUN_MSG_GO; + run_program_go_signal.master_x = (uint8_t)this->dispatch_core.x; + run_program_go_signal.master_y = (uint8_t)this->dispatch_core.y; + run_program_go_signal.dispatch_message_offset = (uint8_t)dispatch_constants::get(this->dispatch_core_type).get_dispatch_message_offset(sub_device_index); + uint32_t write_offset_bytes = device_command_sequence.write_offset_bytes(); + device_command_sequence.add_dispatch_go_signal_mcast(this->expected_num_workers_completed, *reinterpret_cast(&run_program_go_signal), this->dispatch_message_addr, num_noc_mcast_txns, num_noc_unicast_txns, noc_data_start_idx, dispatcher_for_go_signal); + program_command_sequence.mcast_go_signal_cmd_ptr = &((CQDispatchCmd*) ((uint32_t*)device_command_sequence.data() + (write_offset_bytes + sizeof(CQPrefetchCmd)) / sizeof(uint32_t)))->mcast; +} + +void EnqueueProgramCommand::update_device_commands( + ProgramCommandSequence& cached_program_command_sequence, + const tt::stl::Span kernel_config_addrs) { + uint32_t i = 0; + ZoneScopedN("program_loaded_on_device"); + uint32_t index = 
hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); + uint32_t remote_offset_index = program.get_program_config(index).local_cb_size / sizeof(uint32_t); + for (const auto& cbs_on_core_range : cached_program_command_sequence.circular_buffers_on_core_ranges) { + uint32_t* cb_config_payload = cached_program_command_sequence.cb_configs_payloads[i]; + for (const std::shared_ptr& cb : cbs_on_core_range) { + const uint32_t cb_address = cb->address(); + const uint32_t cb_size = cb->size(); + for (const auto& buffer_index : cb->local_buffer_indices()) { + // 1 cmd for all 32 buffer indices, populate with real data for specified indices + + // cb config payload + uint32_t base_index = UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * buffer_index; + cb_config_payload[base_index] = cb_address; + cb_config_payload[base_index + 1] = cb_size; + cb_config_payload[base_index + 2] = cb->num_pages(buffer_index); + cb_config_payload[base_index + 3] = cb->page_size(buffer_index); + } + for (const auto& buffer_index : cb->remote_buffer_indices()) { + const uint32_t base_index = remote_offset_index + (NUM_CIRCULAR_BUFFERS - 1 - buffer_index) * + UINT32_WORDS_PER_REMOTE_CIRCULAR_BUFFER_CONFIG; + cb_config_payload[base_index] = cb->config_address(); + cb_config_payload[base_index + 1] = cb->page_size(buffer_index); + } + } + i++; + } + for (auto& go_signal : cached_program_command_sequence.go_signals) { + for (uint32_t i = 0; i < kernel_config_addrs.size(); i++) { + go_signal->kernel_config.kernel_config_base[i] = kernel_config_addrs[i].addr; + } + go_signal->kernel_config.host_assigned_id = program.get_runtime_id(); + } + // Update launch message addresses to reflect new launch_msg slot in ring buffer + uint32_t multicast_cores_launch_msg_addr = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::LAUNCH) + this->multicast_cores_launch_message_wptr * sizeof(launch_msg_t); + for (auto launch_msg_cmd_ptr : 
cached_program_command_sequence.launch_msg_write_packed_cmd_ptrs) { + launch_msg_cmd_ptr->addr = multicast_cores_launch_msg_addr; + } + if (cached_program_command_sequence.unicast_launch_msg_write_packed_cmd_ptrs.size()) { + uint32_t unicast_cores_launch_message_addr = hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalL1MemAddrType::LAUNCH) + this->unicast_cores_launch_message_wptr * sizeof(launch_msg_t); + for (auto launch_msg_cmd_ptr : cached_program_command_sequence.unicast_launch_msg_write_packed_cmd_ptrs) { + launch_msg_cmd_ptr->addr = unicast_cores_launch_message_addr; + } + } + // Update go signal to reflect potentially modified dispatch core and new wait count + go_msg_t run_program_go_signal; + run_program_go_signal.signal = RUN_MSG_GO; + run_program_go_signal.master_x = (uint8_t)this->dispatch_core.x; + run_program_go_signal.master_y = (uint8_t)this->dispatch_core.y; + run_program_go_signal.dispatch_message_offset = (uint8_t)dispatch_constants::get(this->dispatch_core_type).get_dispatch_message_offset(this->sub_device_id.to_index()); + cached_program_command_sequence.mcast_go_signal_cmd_ptr->go_signal = *reinterpret_cast(&run_program_go_signal); + cached_program_command_sequence.mcast_go_signal_cmd_ptr->wait_count = this->expected_num_workers_completed; +} + void EnqueueProgramCommand::write_program_command_sequence( const ProgramCommandSequence& program_command_sequence, bool stall_first, bool stall_before_program) { TT_ASSERT(!(stall_first && stall_before_program)); @@ -501,11 +1563,55 @@ void EnqueueProgramCommand::write_program_command_sequence( } void EnqueueProgramCommand::process() { - // Dispatch metadata contains runtime information based on - // the kernel config ring buffer state - program_utils::ProgramDispatchMetadata dispatch_metadata; - - // Compute the total number of workers this program uses + std::pair&> reservation = + this->config_buffer_mgr.reserve(program.get_program_config_sizes()); + uint32_t sync_count = 0; + bool 
stall_first = reservation.first.need_sync; + bool stall_before_program = false; + if (!program.kernel_binary_always_stored_in_ringbuffer()) { + // Wait for all existing commands to run before writing out the kernel binary. + sync_count = this->expected_num_workers_completed; + stall_before_program = !stall_first; + } else if (reservation.first.need_sync) { + // TODO: attempt to send RTA only without stalling. + sync_count = reservation.first.sync_count; + // Check if the launch message is the only thing preventing us from + // sending the program. If so, we can at least send the RTAs. Ideally we + // would also send the kernel binaries in this case, but the rest of the + // code isn't set up for that. + auto config_sizes = program.get_program_config_sizes(); + config_sizes[config_sizes.size() - 1] = 0; + const std::pair&> memory_reservation = + this->config_buffer_mgr.reserve(config_sizes); + if (!memory_reservation.first.need_sync) { + stall_first = false; + stall_before_program = true; + } + reservation = this->config_buffer_mgr.reserve(program.get_program_config_sizes()); + } + + // Access the program command cache + auto& cached_program_command_sequences = program.get_cached_program_command_sequences(); + // Start constructing the cache entry, using the build_key + uint64_t command_hash = device->build_key(); + if (not hal.is_coordinate_virtualization_enabled()) { + // When coordinate virtualization is not enabled, explicitly encode the device + // id into the command hash, to always assert on programs being reused across devices. 
+ command_hash = (command_hash << 32) | (device->id()); + } + bool is_cached = program.is_cached(); // Program is cached, its is assocated command stream was previously generated + auto cached_cmd_iter = cached_program_command_sequences.find(command_hash); + // Is the program was cached, the current command_hash, must match the one generated when the program was cached + // Finalizing the program multiple times/Regenerating program cache state is not currently supported + TT_FATAL((not is_cached) or cached_cmd_iter != cached_program_command_sequences.end(), "Enqueueing a Program across devices with different cores harvested is not supported, unless coordinate virtualization is enabled (only enabled on Wormhole and above)."); + if (program.get_program_binary_status(device->id()) == ProgramBinaryStatus::InFlight) { + // assemble_stall_commands is hardcoded to always wait for everything for now. + this->config_buffer_mgr.free(this->expected_num_workers_completed); + } else { + if (stall_first || stall_before_program) { + this->config_buffer_mgr.free(sync_count); + } + } uint32_t num_workers = 0; if (program.runs_on_noc_multicast_only_cores()) { num_workers += device->num_worker_cores(HalProgrammableCoreType::TENSIX, this->sub_device_id); @@ -513,41 +1619,70 @@ void EnqueueProgramCommand::process() { if (program.runs_on_noc_unicast_only_cores()) { num_workers += device->num_worker_cores(HalProgrammableCoreType::ACTIVE_ETH, this->sub_device_id); } - // Reserve space for this program in the kernel config ring buffer - program_utils::reserve_space_in_kernel_config_buffer( - this->config_buffer_mgr, - program.get_program_config_sizes(), - program.kernel_binary_always_stored_in_ringbuffer(), - program.get_program_binary_status(device->id()), - num_workers, - this->expected_num_workers_completed, - dispatch_metadata); + this->config_buffer_mgr.alloc(this->expected_num_workers_completed + num_workers); + std::vector& kernel_config_addrs_raw = reservation.second; // Remove 
launch buffer from config addrs, since it's not a real core. const tt::stl::Span kernel_config_addrs{ - dispatch_metadata.kernel_config_addrs.data(), dispatch_metadata.kernel_config_addrs.size() - 1}; + kernel_config_addrs_raw.data(), kernel_config_addrs_raw.size() - 1}; RecordProgramRun(program); - // Access the program dispatch-command cache - auto& cached_program_command_sequence = program.get_cached_program_command_sequences().begin()->second; - // Update the generated dispatch commands based on the state of the CQ and the ring buffer - program_utils::update_program_dispatch_commands( - program, - cached_program_command_sequence, - kernel_config_addrs, - this->multicast_cores_launch_message_wptr, - this->unicast_cores_launch_message_wptr, - this->expected_num_workers_completed, - this->dispatch_core, - this->dispatch_core_type, - this->sub_device_id, - dispatch_metadata, - program.get_program_binary_status(device->id())); - // Issue dispatch commands for this program - this->write_program_command_sequence(cached_program_command_sequence, dispatch_metadata.stall_first, dispatch_metadata.stall_before_program); - // Kernel Binaries are committed to DRAM, the first time the program runs on device. Reflect this on host. - program.set_program_binary_status(device->id(), ProgramBinaryStatus::Committed); + // Calculate all commands size and determine how many fetch q entries to use + // Preamble, some waits and stalls + // can be written directly to the issue queue + if (!is_cached) { + ProgramCommandSequence program_command_sequence; + this->assemble_preamble_commands(program_command_sequence, kernel_config_addrs); + this->assemble_stall_commands(program_command_sequence, true); + program_command_sequence.current_stall_seq_idx = UncachedStallSequenceIdx; + // Runtime Args Command Sequence + this->assemble_runtime_args_commands(program_command_sequence); + + // Record kernel groups in this program, only need to do it once. 
+ for (uint32_t index = 0; index < hal.get_programmable_core_type_count(); index++) { + CoreType core_type = hal.get_core_type(index); + RecordKernelGroups(program, core_type, program.get_kernel_groups(index)); + } + this->assemble_device_commands(program_command_sequence, kernel_config_addrs); + // Stall first for simplicity, because we don't use `sync_count` in assemble_stall_commands. + this->write_program_command_sequence( + program_command_sequence, /*stall_first=*/true, /*stall_before_program=*/false); + this->assemble_stall_commands(program_command_sequence, false); + cached_program_command_sequences.insert({command_hash, std::move(program_command_sequence)}); + program.set_cached(); + program.set_program_binary_status(device->id(), ProgramBinaryStatus::Committed); + } else { + static constexpr uint32_t wait_count_offset = (sizeof(CQPrefetchCmd) + offsetof(CQDispatchCmd, wait.count)); + static constexpr uint32_t tensix_l1_write_offset_offset = + (sizeof(CQPrefetchCmd) + offsetof(CQDispatchCmd, set_write_offset.offset1)); + static constexpr uint32_t eth_l1_write_offset_offset = + (sizeof(CQPrefetchCmd) + offsetof(CQDispatchCmd, set_write_offset.offset2)); + + auto& cached_program_command_sequence = cached_cmd_iter->second; + if (program.get_program_binary_status(device->id()) != ProgramBinaryStatus::Committed) { + cached_program_command_sequence.current_stall_seq_idx = UncachedStallSequenceIdx; + program.set_program_binary_status(device->id(), ProgramBinaryStatus::Committed); + } else { + cached_program_command_sequence.current_stall_seq_idx = CachedStallSequenceIdx; + } + auto& curr_stall_seq_idx = cached_program_command_sequence.current_stall_seq_idx; + cached_program_command_sequence.stall_command_sequences[curr_stall_seq_idx].update_cmd_sequence( + wait_count_offset, &sync_count, sizeof(uint32_t)); + + cached_program_command_sequence.preamble_command_sequence.update_cmd_sequence( + tensix_l1_write_offset_offset, + 
&kernel_config_addrs[hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX)], + sizeof(uint32_t)); + if (hal.get_programmable_core_type_count() >= 2) { + cached_program_command_sequence.preamble_command_sequence.update_cmd_sequence( + eth_l1_write_offset_offset, + &kernel_config_addrs[hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH)], + sizeof(uint32_t)); + } + this->update_device_commands(cached_program_command_sequence, kernel_config_addrs); + this->write_program_command_sequence(cached_program_command_sequence, stall_first, stall_before_program); + } } EnqueueRecordEventCommand::EnqueueRecordEventCommand( @@ -585,7 +1720,7 @@ void EnqueueRecordEventCommand::process() { uint32_t num_worker_counters = this->sub_device_ids.size(); uint32_t cmd_sequence_sizeB = - hal.get_alignment(HalMemType::HOST) * num_worker_counters + // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT + CQ_PREFETCH_CMD_BARE_MIN_SIZE * num_worker_counters + // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT packed_write_sizeB + // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WRITE_PACKED + unicast subcmds + event // payload align( @@ -675,7 +1810,7 @@ EnqueueWaitForEventCommand::EnqueueWaitForEventCommand( } void EnqueueWaitForEventCommand::process() { - uint32_t cmd_sequence_sizeB = hal.get_alignment(HalMemType::HOST); // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT + uint32_t cmd_sequence_sizeB = CQ_PREFETCH_CMD_BARE_MIN_SIZE; // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT void* cmd_region = this->manager.issue_queue_reserve(cmd_sequence_sizeB, this->command_queue_id); @@ -720,14 +1855,14 @@ void EnqueueTraceCommand::process() { uint32_t go_signals_cmd_size = align(sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd), pcie_alignment) * descriptor->descriptors.size(); uint32_t cmd_sequence_sizeB = - this->device->dispatch_s_enabled() * hal.get_alignment(HalMemType::HOST) + // dispatch_d -> dispatch_s sem update (send only if dispatch_s is 
running) + this->device->dispatch_s_enabled() * CQ_PREFETCH_CMD_BARE_MIN_SIZE + // dispatch_d -> dispatch_s sem update (send only if dispatch_s is running) go_signals_cmd_size + // go signal cmd - (hal.get_alignment(HalMemType::HOST) + // wait to ensure that reset go signal was processed (dispatch_d) + (CQ_PREFETCH_CMD_BARE_MIN_SIZE + // wait to ensure that reset go signal was processed (dispatch_d) // when dispatch_s and dispatch_d are running on 2 cores, workers update dispatch_s. dispatch_s is responsible for resetting worker count // and giving dispatch_d the latest worker state. This is encapsulated in the dispatch_s wait command (only to be sent when dispatch is distributed // on 2 cores) - (this->device->distributed_dispatcher()) * hal.get_alignment(HalMemType::HOST)) * num_sub_devices + - hal.get_alignment(HalMemType::HOST); // CQ_PREFETCH_CMD_EXEC_BUF + (this->device->distributed_dispatcher()) * CQ_PREFETCH_CMD_BARE_MIN_SIZE) * num_sub_devices + + CQ_PREFETCH_CMD_BARE_MIN_SIZE; // CQ_PREFETCH_CMD_EXEC_BUF void* cmd_region = this->manager.issue_queue_reserve(cmd_sequence_sizeB, this->command_queue_id); @@ -810,7 +1945,7 @@ EnqueueTerminateCommand::EnqueueTerminateCommand( void EnqueueTerminateCommand::process() { // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_TERMINATE // CQ_PREFETCH_CMD_TERMINATE - uint32_t cmd_sequence_sizeB = hal.get_alignment(HalMemType::HOST); + uint32_t cmd_sequence_sizeB = CQ_PREFETCH_CMD_BARE_MIN_SIZE; // dispatch and prefetch terminate commands each needs to be a separate fetch queue entry void* cmd_region = this->manager.issue_queue_reserve(cmd_sequence_sizeB, this->command_queue_id); @@ -892,7 +2027,7 @@ void HWCommandQueue::set_num_worker_sems_on_dispatch(uint32_t num_worker_sems) { if (!this->device->dispatch_s_enabled()) { return; } - uint32_t cmd_sequence_sizeB = hal.get_alignment(HalMemType::HOST); + uint32_t cmd_sequence_sizeB = CQ_PREFETCH_CMD_BARE_MIN_SIZE; void* cmd_region = 
this->manager.issue_queue_reserve(cmd_sequence_sizeB, this->id); HugepageDeviceCommand command_sequence(cmd_region, cmd_sequence_sizeB); command_sequence.add_dispatch_set_num_worker_sems(num_worker_sems, DispatcherSelect::DISPATCH_SLAVE); @@ -921,13 +2056,13 @@ void HWCommandQueue::reset_worker_state(bool reset_launch_msg_state) { go_signals_cmd_size = align(sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd), pcie_alignment) * num_sub_devices; } uint32_t cmd_sequence_sizeB = - reset_launch_msg_state * this->device->dispatch_s_enabled() * hal.get_alignment(HalMemType::HOST) + // dispatch_d -> dispatch_s sem update (send only if dispatch_s is running) + reset_launch_msg_state * this->device->dispatch_s_enabled() * CQ_PREFETCH_CMD_BARE_MIN_SIZE + // dispatch_d -> dispatch_s sem update (send only if dispatch_s is running) go_signals_cmd_size + // go signal cmd - (hal.get_alignment(HalMemType::HOST) + // wait to ensure that reset go signal was processed (dispatch_d) + (CQ_PREFETCH_CMD_BARE_MIN_SIZE + // wait to ensure that reset go signal was processed (dispatch_d) // when dispatch_s and dispatch_d are running on 2 cores, workers update dispatch_s. dispatch_s is responsible for resetting worker count // and giving dispatch_d the latest worker state. 
This is encapsulated in the dispatch_s wait command (only to be sent when dispatch is distributed // on 2 cores) - this->device->distributed_dispatcher() * hal.get_alignment(HalMemType::HOST)) * num_sub_devices; + this->device->distributed_dispatcher() * CQ_PREFETCH_CMD_BARE_MIN_SIZE) * num_sub_devices; void* cmd_region = this->manager.issue_queue_reserve(cmd_sequence_sizeB, this->id); HugepageDeviceCommand command_sequence(cmd_region, cmd_sequence_sizeB); bool clear_count = true; @@ -1179,7 +2314,7 @@ void HWCommandQueue::enqueue_write_buffer(Buffer& buffer, const void* src, bool CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(this->device->id()); const uint32_t max_prefetch_command_size = dispatch_constants::get(dispatch_core_type).max_prefetch_command_size(); uint32_t max_data_sizeB = - max_prefetch_command_size - (hal.get_alignment(HalMemType::HOST) * 2); // * 2 to account for issue + max_prefetch_command_size - (CQ_PREFETCH_CMD_BARE_MIN_SIZE * 2); // * 2 to account for issue uint32_t dst_page_index = 0; @@ -1303,7 +2438,7 @@ void HWCommandQueue::enqueue_write_buffer(Buffer& buffer, const void* src, bool uint32_t num_full_pages_written = 0; while (total_pages_to_write > 0) { - uint32_t data_offsetB = hal.get_alignment(HalMemType::HOST); // data appended after CQ_PREFETCH_CMD_RELAY_INLINE + uint32_t data_offsetB = CQ_PREFETCH_CMD_BARE_MIN_SIZE; // data appended after CQ_PREFETCH_CMD_RELAY_INLINE // + CQ_DISPATCH_CMD_WRITE_PAGED bool issue_wait = (dst_page_index == 0 and @@ -1369,12 +2504,11 @@ void HWCommandQueue::enqueue_program(Program& program, bool blocking) { ZoneScopedN("HWCommandQueue_enqueue_program"); std::vector sub_device_ids = {program.determine_sub_device_ids(device)}; TT_FATAL(sub_device_ids.size() == 1, "Programs must be executed on a single sub-device"); - // Finalize Program: Compute relative offsets for data structures (semaphores, kernel binaries, etc) in L1 if (not program.is_finalized()) { 
program.finalize(device); } if (program.get_program_binary_status(device->id()) == ProgramBinaryStatus::NotSent) { - // Write program binaries to device if it hasn't previously been cached + // Write program binaries to device program.allocate_kernel_bin_buf_on_device(device); if (program.get_program_transfer_info().binary_data.size()) { this->enqueue_write_buffer( @@ -1382,10 +2516,7 @@ void HWCommandQueue::enqueue_program(Program& program, bool blocking) { } program.set_program_binary_status(device->id(), ProgramBinaryStatus::InFlight); } - // Lower the program to device: Generate dispatch commands. - // Values in these commands will get updated based on kernel config ring - // buffer state at runtime. - program.generate_dispatch_commands(device); + program.set_last_used_command_queue_for_testing(this); #ifdef DEBUG @@ -1842,10 +2973,10 @@ volatile bool HWCommandQueue::is_noc_hung() { return illegal_noc_txn_hang; } void HWCommandQueue::record_begin(const uint32_t tid, std::shared_ptr ctx) { uint32_t num_sub_devices = this->device->num_sub_devices(); // Issue event as a barrier and a counter reset - uint32_t cmd_sequence_sizeB = hal.get_alignment(HalMemType::HOST); + uint32_t cmd_sequence_sizeB = CQ_PREFETCH_CMD_BARE_MIN_SIZE; if (this->device->distributed_dispatcher()) { // wait on dispatch_s before issuing counter reset - cmd_sequence_sizeB += hal.get_alignment(HalMemType::HOST); + cmd_sequence_sizeB += CQ_PREFETCH_CMD_BARE_MIN_SIZE; } cmd_sequence_sizeB *= num_sub_devices; void* cmd_region = this->manager.issue_queue_reserve(cmd_sequence_sizeB, this->id); @@ -1894,7 +3025,7 @@ void HWCommandQueue::record_end() { auto &trace_data = this->trace_ctx->data; trace_data = std::move(this->manager.get_bypass_data()); // Add command to terminate the trace buffer - DeviceCommand command_sequence(hal.get_alignment(HalMemType::HOST)); + DeviceCommand command_sequence(CQ_PREFETCH_CMD_BARE_MIN_SIZE); command_sequence.add_prefetch_exec_buf_end(); for (int i = 0; i < 
command_sequence.size_bytes() / sizeof(uint32_t); i++) { trace_data.push_back(((uint32_t*)command_sequence.data())[i]); diff --git a/tt_metal/impl/dispatch/command_queue.hpp b/tt_metal/impl/dispatch/command_queue.hpp index 5a3a77b786b..661e3d6d499 100644 --- a/tt_metal/impl/dispatch/command_queue.hpp +++ b/tt_metal/impl/dispatch/command_queue.hpp @@ -331,6 +331,16 @@ class EnqueueProgramCommand : public Command { uint32_t unicast_cores_launch_message_wptr, SubDeviceId sub_device_id); + void assemble_preamble_commands( + ProgramCommandSequence& program_command_sequence, const tt::stl::Span kernel_config_addrs); + void assemble_stall_commands(ProgramCommandSequence& program_command_sequence, bool prefetch_stall); + void assemble_runtime_args_commands(ProgramCommandSequence& program_command_sequence); + void assemble_device_commands( + ProgramCommandSequence& program_command_sequence, const tt::stl::Span kernel_config_addrs); + void update_device_commands( + ProgramCommandSequence& cached_program_command_sequence, + const tt::stl::Span kernel_config_addrs); + void write_program_command_sequence( const ProgramCommandSequence& program_command_sequence, bool stall_first, bool stall_before_program); @@ -517,6 +527,7 @@ class HWCommandQueue { void set_num_worker_sems_on_dispatch(uint32_t num_worker_sems); void set_go_signal_noc_data_on_dispatch(const vector_memcpy_aligned& go_signal_noc_data); void reset_worker_state(bool reset_launch_msg_state); + private: uint32_t id; uint32_t size_B; diff --git a/tt_metal/impl/dispatch/dispatch_core_common.hpp b/tt_metal/impl/dispatch/dispatch_core_common.hpp index 35c94d6e775..3336175e72b 100644 --- a/tt_metal/impl/dispatch/dispatch_core_common.hpp +++ b/tt_metal/impl/dispatch/dispatch_core_common.hpp @@ -6,7 +6,6 @@ #include "common/core_descriptor.hpp" #include "tt_metal/common/core_coord.hpp" -#include "tt_metal/impl/kernels/data_types.hpp" #include "tt_metal/llrt/get_platform_architecture.hpp" #include 
"tt_metal/tt_stl/reflection.hpp" @@ -33,10 +32,6 @@ enum DispatchWorkerType : uint32_t { COUNT = 17 }; -// NOC ID used by dispatch kernels to communicate with downstream cores. This parameter -// is required when setting up Command Queue objects on host. -static constexpr NOC dispatch_downstream_noc = NOC::NOC_0; - enum class DispatchCoreType : uint32_t { WORKER, ETH, COUNT }; enum class DispatchCoreAxis { ROW, COL, COUNT }; diff --git a/tt_metal/impl/dispatch/kernel_config/dispatch.cpp b/tt_metal/impl/dispatch/kernel_config/dispatch.cpp index 1bfbbd579a3..c97c5457c2d 100644 --- a/tt_metal/impl/dispatch/kernel_config/dispatch.cpp +++ b/tt_metal/impl/dispatch/kernel_config/dispatch.cpp @@ -26,6 +26,7 @@ void DispatchKernel::GenerateStaticConfigs() { uint32_t completion_queue_start_addr = issue_queue_start_addr + issue_queue_size; uint32_t completion_queue_size = device_->sysmem_manager().get_completion_queue_size(cq_id_); + logical_core_ = dispatch_core_manager::instance().dispatcher_core(device_->id(), channel, cq_id_); static_config_.dispatch_cb_base = my_dispatch_constants.dispatch_buffer_base(); static_config_.dispatch_cb_log_page_size = dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE; static_config_.dispatch_cb_pages = my_dispatch_constants.dispatch_buffer_pages(); @@ -76,6 +77,7 @@ void DispatchKernel::GenerateStaticConfigs() { uint32_t completion_queue_start_addr = issue_queue_start_addr + issue_queue_size; uint32_t completion_queue_size = device_->sysmem_manager().get_completion_queue_size(cq_id_); + logical_core_ = dispatch_core_manager::instance().dispatcher_core(servicing_device_id_, channel, cq_id_); static_config_.dispatch_cb_base = my_dispatch_constants.dispatch_buffer_base(); static_config_.dispatch_cb_log_page_size = dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE; static_config_.dispatch_cb_pages = my_dispatch_constants.dispatch_buffer_pages(); @@ -122,6 +124,7 @@ void DispatchKernel::GenerateStaticConfigs() { uint32_t 
completion_queue_start_addr = issue_queue_start_addr + issue_queue_size; uint32_t completion_queue_size = device_->sysmem_manager().get_completion_queue_size(cq_id_); + logical_core_ = dispatch_core_manager::instance().dispatcher_d_core(device_->id(), channel, cq_id_); static_config_.dispatch_cb_base = my_dispatch_constants.dispatch_buffer_base(); static_config_.dispatch_cb_log_page_size = dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE; static_config_.dispatch_cb_pages = my_dispatch_constants.dispatch_buffer_pages(); diff --git a/tt_metal/impl/dispatch/kernel_config/dispatch.hpp b/tt_metal/impl/dispatch/kernel_config/dispatch.hpp index c2a18bdfa3e..431d54c8605 100644 --- a/tt_metal/impl/dispatch/kernel_config/dispatch.hpp +++ b/tt_metal/impl/dispatch/kernel_config/dispatch.hpp @@ -65,24 +65,8 @@ class DispatchKernel : public FDKernel { bool h_variant, bool d_variant) : FDKernel(node_id, device_id, servicing_device_id, cq_id, noc_selection) { - TT_FATAL( - noc_selection.downstream_noc == dispatch_downstream_noc, - "Invalid downstream NOC specified for Dispatcher kernel"); - TT_FATAL( - noc_selection.upstream_noc != noc_selection.downstream_noc, - "Dispatcher kernel cannot have identical upstream and downstream NOCs."); static_config_.is_h_variant = h_variant; static_config_.is_d_variant = d_variant; - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); - if (h_variant && d_variant) { - this->logical_core_ = dispatch_core_manager::instance().dispatcher_core(device_id, channel, cq_id); - } else if (h_variant) { - channel = tt::Cluster::instance().get_assigned_channel_for_device(servicing_device_id); - this->logical_core_ = - dispatch_core_manager::instance().dispatcher_core(servicing_device_id, channel, cq_id); - } else if (d_variant) { - this->logical_core_ = dispatch_core_manager::instance().dispatcher_d_core(device_id, channel, cq_id); - } } void CreateKernel() override; void GenerateStaticConfigs() override; diff --git 
a/tt_metal/impl/dispatch/kernel_config/dispatch_s.cpp b/tt_metal/impl/dispatch/kernel_config/dispatch_s.cpp index 236cf78939e..9552dd2d5f4 100644 --- a/tt_metal/impl/dispatch/kernel_config/dispatch_s.cpp +++ b/tt_metal/impl/dispatch/kernel_config/dispatch_s.cpp @@ -27,7 +27,7 @@ void DispatchSKernel::GenerateStaticConfigs() { dispatch_s_buffer_base = dispatch_buffer_base; } } - + logical_core_ = dispatch_core_manager::instance().dispatcher_s_core(device_->id(), channel, cq_id_); static_config_.cb_base = dispatch_s_buffer_base; static_config_.cb_log_page_size = dispatch_constants::DISPATCH_S_BUFFER_LOG_PAGE_SIZE; static_config_.cb_size = my_dispatch_constants.dispatch_s_buffer_size(); diff --git a/tt_metal/impl/dispatch/kernel_config/dispatch_s.hpp b/tt_metal/impl/dispatch/kernel_config/dispatch_s.hpp index ba1760f53f8..a9293e7bd1f 100644 --- a/tt_metal/impl/dispatch/kernel_config/dispatch_s.hpp +++ b/tt_metal/impl/dispatch/kernel_config/dispatch_s.hpp @@ -29,10 +29,7 @@ class DispatchSKernel : public FDKernel { public: DispatchSKernel( int node_id, chip_id_t device_id, chip_id_t servicing_device_id, uint8_t cq_id, noc_selection_t noc_selection) : - FDKernel(node_id, device_id, servicing_device_id, cq_id, noc_selection) { - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); - this->logical_core_ = dispatch_core_manager::instance().dispatcher_s_core(device_id, channel, cq_id_); - } + FDKernel(node_id, device_id, servicing_device_id, cq_id, noc_selection) {} void CreateKernel() override; void GenerateStaticConfigs() override; void GenerateDependentConfigs() override; diff --git a/tt_metal/impl/dispatch/kernel_config/eth_tunneler.cpp b/tt_metal/impl/dispatch/kernel_config/eth_tunneler.cpp index bd991bdb349..5ffcb7939d0 100644 --- a/tt_metal/impl/dispatch/kernel_config/eth_tunneler.cpp +++ b/tt_metal/impl/dispatch/kernel_config/eth_tunneler.cpp @@ -54,7 +54,7 @@ void EthTunnelerKernel::GenerateDependentConfigs() { } else if (auto tk = 
dynamic_cast(k)) { tunneler_kernel = tk; } else { - TT_FATAL(false, "Unexpected kernel type upstream of TUNNELER"); + TT_FATAL(false, "Unexpected kernel type upstream of TUNNELER"); } } TT_ASSERT(tunneler_kernel && !tunneler_kernel->IsRemote()); @@ -175,7 +175,7 @@ void EthTunnelerKernel::GenerateDependentConfigs() { } else if (auto tk = dynamic_cast(k)) { ds_tunneler_kernel = tk; } else { - TT_FATAL(false, "Unexpected kernel type downstream of TUNNELER"); + TT_FATAL(false, "Unexpected kernel type downstream of TUNNELER"); } } TT_ASSERT(ds_tunneler_kernel && ds_tunneler_kernel == tunneler_kernel); diff --git a/tt_metal/impl/dispatch/kernel_config/prefetch.cpp b/tt_metal/impl/dispatch/kernel_config/prefetch.cpp index e7f86fe0581..28646120468 100644 --- a/tt_metal/impl/dispatch/kernel_config/prefetch.cpp +++ b/tt_metal/impl/dispatch/kernel_config/prefetch.cpp @@ -23,6 +23,8 @@ void PrefetchKernel::GenerateStaticConfigs() { uint32_t issue_queue_start_addr = command_queue_start_addr + cq_start; uint32_t issue_queue_size = device_->sysmem_manager().get_issue_queue_size(cq_id_); + logical_core_ = dispatch_core_manager::instance().prefetcher_core(device_->id(), channel, cq_id_); + dependent_config_.downstream_cb_base = my_dispatch_constants.dispatch_buffer_base(); static_config_.downstream_cb_log_page_size = dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE; static_config_.downstream_cb_pages = my_dispatch_constants.dispatch_buffer_pages(); @@ -81,6 +83,8 @@ void PrefetchKernel::GenerateStaticConfigs() { uint32_t issue_queue_start_addr = command_queue_start_addr + cq_start; uint32_t issue_queue_size = device_->sysmem_manager().get_issue_queue_size(cq_id_); + logical_core_ = dispatch_core_manager::instance().prefetcher_core(servicing_device_id_, channel, cq_id_); + static_config_.downstream_cb_log_page_size = dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE; if (tt::Cluster::instance().is_galaxy_cluster()) { // TODO: whys is this hard-coded for galaxy?
static_config_.downstream_cb_pages = my_dispatch_constants.mux_buffer_pages(1); @@ -119,6 +123,8 @@ void PrefetchKernel::GenerateStaticConfigs() { static_config_.dispatch_s_buffer_size = 0; static_config_.dispatch_s_cb_log_page_size = 0; } else if (static_config_.is_d_variant.value()) { + logical_core_ = dispatch_core_manager::instance().prefetcher_d_core(device_->id(), channel, cq_id_); + dependent_config_.downstream_cb_base = my_dispatch_constants.dispatch_buffer_base(); static_config_.downstream_cb_log_page_size = dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE; static_config_.downstream_cb_pages = my_dispatch_constants.dispatch_buffer_pages(); diff --git a/tt_metal/impl/dispatch/kernel_config/prefetch.hpp b/tt_metal/impl/dispatch/kernel_config/prefetch.hpp index a029049928e..2410a1ca275 100644 --- a/tt_metal/impl/dispatch/kernel_config/prefetch.hpp +++ b/tt_metal/impl/dispatch/kernel_config/prefetch.hpp @@ -66,19 +66,6 @@ class PrefetchKernel : public FDKernel { FDKernel(node_id, device_id, servicing_device_id, cq_id, noc_selection) { static_config_.is_h_variant = h_variant; static_config_.is_d_variant = d_variant; - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); - TT_FATAL( - noc_selection.downstream_noc == dispatch_downstream_noc, - "Invalid downstream NOC specified for Prefetcher kernel"); - if (h_variant && d_variant) { - this->logical_core_ = dispatch_core_manager::instance().prefetcher_core(device_id, channel, cq_id); - } else if (h_variant) { - channel = tt::Cluster::instance().get_assigned_channel_for_device(servicing_device_id); - this->logical_core_ = - dispatch_core_manager::instance().prefetcher_core(servicing_device_id, channel, cq_id); - } else if (d_variant) { - this->logical_core_ = dispatch_core_manager::instance().prefetcher_d_core(device_id, channel, cq_id); - } } void CreateKernel() override; void GenerateStaticConfigs() override; diff --git a/tt_metal/impl/dispatch/topology.cpp 
b/tt_metal/impl/dispatch/topology.cpp index 93bc8e3e7df..ce040d65b96 100644 --- a/tt_metal/impl/dispatch/topology.cpp +++ b/tt_metal/impl/dispatch/topology.cpp @@ -79,18 +79,15 @@ static const std::vector two_chip_arch_1cq = { {3, 0, 1, 0, PREFETCH_H, {x, x, x, x}, {5, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, {4, 0, 1, 0, DISPATCH_H, {6, x, x, x}, {3, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0}, - {5, 0, 1, 0, PACKET_ROUTER_MUX, {3, x, x, x}, {7, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, {6, 0, 1, 0, DEMUX, {7, x, x, x}, {4, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, - {7, 0, 1, 0, US_TUNNELER_REMOTE, {11, 5, x, x}, {11, 6, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, - - {8, 1, x, 0, PREFETCH_D, {13, x, x, x}, {9, 10, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, - {9, 1, x, 0, DISPATCH_D, {8, x, x, x}, {10, 12, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0}, - {10, 1, x, 0, DISPATCH_S, {8, x, x, x}, {9, x, x, x}, NOC::NOC_1, NOC::NOC_1, NOC::NOC_1}, - - {11, 1, x, 0, US_TUNNELER_LOCAL, {7, 12, x, x}, {7, 13, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, - {12, 1, x, 0, MUX_D, {9, x, x, x}, {11, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, - {13, 1, x, 0, PACKET_ROUTER_DEMUX, {11, x, x, x}, {8, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, + {7, 0, 1, 0, US_TUNNELER_REMOTE, {8, 5, x, x}, {8, 6, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, + {8, 1, x, 0, US_TUNNELER_LOCAL, {7, 9, x, x}, {7, 10, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, + {9, 1, x, 0, MUX_D, {12, x, x, x}, {8, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, + {10, 1, x, 0, PACKET_ROUTER_DEMUX, {8, x, x, x}, {11, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, + {11, 1, x, 0, PREFETCH_D, {10, x, x, x}, {12, 13, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, + {12, 1, x, 0, DISPATCH_D, {11, x, x, x}, {13, 9, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0}, + {13, 1, x, 0, DISPATCH_S, {11, x, x, x}, {12, x, x, x}, NOC::NOC_1, NOC::NOC_1, NOC::NOC_1}, }; static const std::vector two_chip_arch_2cq = { 
@@ -106,17 +103,15 @@ static const std::vector two_chip_arch_2cq = { {8, 0, 1, 0, PACKET_ROUTER_MUX, {4, 5, x, x}, {10, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, {9, 0, 1, 0, DEMUX, {10, x, x, x}, {6, 7, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, - {10, 0, 1, 0, US_TUNNELER_REMOTE, {15, 8, x, x}, {15, 9, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, - - {11, 1, x, 0, PREFETCH_D, {17, x, x, x}, {13, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, - {12, 1, x, 1, PREFETCH_D, {17, x, x, x}, {14, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, - {13, 1, x, 0, DISPATCH_D, {11, x, x, x}, {16, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0}, - {14, 1, x, 1, DISPATCH_D, {12, x, x, x}, {16, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0}, - - {15, 1, x, 0, US_TUNNELER_LOCAL, {10, 16, x, x}, {10, 17, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, - {16, 1, x, 0, MUX_D, {13, 14, x, x}, {15, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, - {17, 1, x, 0, PACKET_ROUTER_DEMUX, {15, x, x, x}, {11, 12, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, - + {10, 0, 1, 0, US_TUNNELER_REMOTE, {11, 8, x, x}, {11, 9, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, + {11, 1, x, 0, US_TUNNELER_LOCAL, {10, 12, x, x}, {10, 13, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, + {12, 1, x, 0, MUX_D, {16, 17, x, x}, {11, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, + {13, 1, x, 0, PACKET_ROUTER_DEMUX, {11, x, x, x}, {14, 15, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, + + {14, 1, x, 0, PREFETCH_D, {13, x, x, x}, {16, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, + {15, 1, x, 1, PREFETCH_D, {13, x, x, x}, {17, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, + {16, 1, x, 0, DISPATCH_D, {14, x, x, x}, {12, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0}, + {17, 1, x, 1, DISPATCH_D, {15, x, x, x}, {12, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0}, }; static const std::vector galaxy_nine_chip_arch_1cq = { diff --git a/tt_metal/impl/kernels/kernel_types.hpp b/tt_metal/impl/kernels/kernel_types.hpp index 
444f4619cee..195b592eb77 100644 --- a/tt_metal/impl/kernels/kernel_types.hpp +++ b/tt_metal/impl/kernels/kernel_types.hpp @@ -15,7 +15,7 @@ namespace tt::tt_metal { -using KernelHandle = std::uint32_t; +using KernelHandle = std::uint16_t; struct DataMovementConfig { DataMovementProcessor processor = DataMovementProcessor::RISCV_0; // For data transfer kernels: NCRISC & BRISC diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index b37424c7903..c2cb844b499 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -2,7 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "tt_metal/impl/program/program_dispatch_utils.hpp" +#include "tt_metal/impl/program/program.hpp" + #include #include @@ -106,8 +107,7 @@ class Program_ { const std::vector< Semaphore > & semaphores() const; KernelGroup * kernels_on_core(const CoreCoord &core, uint32_t programmable_core_type_index); - std::vector>& get_kernel_groups(uint32_t programmable_core_type_index); - std::unordered_map>& get_kernels(uint32_t programmable_core_type_index); + std::vector& get_kernel_groups(uint32_t programmable_core_type_index); void add_buffer(std::shared_ptr buf); void release_buffers(); std::vector> circular_buffers_on_core(const CoreCoord &core) const; @@ -155,9 +155,10 @@ class Program_ { uint32_t get_sem_size(Device *device, CoreCoord logical_core, CoreType core_type) const; uint32_t get_cb_size(Device *device, CoreCoord logical_core, CoreType core_type) const; void set_last_used_command_queue_for_testing(HWCommandQueue *queue); - void populate_dispatch_data(Device *device); private: + void populate_dispatch_data(Device *device); + HWCommandQueue *last_used_command_queue_for_testing = nullptr; // Buffers temporarily owned by the program @@ -220,8 +221,9 @@ class Program_ { bool local_circular_buffer_allocation_needed_; static constexpr uint8_t core_to_kernel_group_invalid_index = 0xff; - std::vector>> kernel_groups_; + std::vector> 
kernel_groups_; std::vector> core_to_kernel_group_index_table_; + uint32_t tensix_go_signal_count_; std::vector> config_buffers_; @@ -442,29 +444,21 @@ CoreType KernelGroup::get_core_type() const { return hal.get_core_type(this->programmable_core_type_index); }; -std::vector> &detail::Program_::get_kernel_groups(uint32_t programmable_core_type_index) { +std::vector &detail::Program_::get_kernel_groups(uint32_t programmable_core_type_index) { update_kernel_groups(programmable_core_type_index); return kernel_groups_[programmable_core_type_index]; } -std::vector> &Program::get_kernel_groups(uint32_t programmable_core_type_index) { +std::vector &Program::get_kernel_groups(uint32_t programmable_core_type_index) { return pimpl_->get_kernel_groups(programmable_core_type_index); } -std::unordered_map>& detail::Program_::get_kernels(uint32_t programmable_core_type_index) { - return this->kernels_.at(programmable_core_type_index); -} - -std::unordered_map>& Program::get_kernels(uint32_t programmable_core_type_index) { - return pimpl_->get_kernels(programmable_core_type_index); -} - KernelGroup *detail::Program_::kernels_on_core(const CoreCoord &core, uint32_t programmable_core_type_index) { update_kernel_groups(programmable_core_type_index); if (core.x >= grid_extent_[programmable_core_type_index].x || core.y >= grid_extent_[programmable_core_type_index].y) return nullptr; uint8_t index = core_to_kernel_group_index_table_[programmable_core_type_index].at(core.y * grid_extent_[programmable_core_type_index].x + core.x); - return (index == core_to_kernel_group_invalid_index) ? nullptr : kernel_groups_[programmable_core_type_index].at(index).get(); + return (index == core_to_kernel_group_invalid_index) ? 
nullptr : &kernel_groups_[programmable_core_type_index].at(index); } KernelGroup *Program::kernels_on_core(const CoreCoord &core, uint32_t programmable_core_type_index) { @@ -589,16 +583,14 @@ void detail::Program_::update_kernel_groups(uint32_t programmable_core_type_inde programmable_core_type_index, max_local_cb_end_index, min_remote_cb_start_index); - kernel_groups_[programmable_core_type_index].push_back( - std::make_shared( - *this, - programmable_core_type_index, - kg_to_cores.first.kernel_ids, - erisc_is_idle, - max_local_cb_end_index, - min_remote_cb_start_index, - kg_to_cores.second) - ); + kernel_groups_[programmable_core_type_index].push_back(KernelGroup( + *this, + programmable_core_type_index, + kg_to_cores.first.kernel_ids, + erisc_is_idle, + max_local_cb_end_index, + min_remote_cb_start_index, + kg_to_cores.second)); index++; } } @@ -1107,22 +1099,21 @@ void detail::Program_::populate_dispatch_data(Device *device) { std::uint32_t num_active_cores = 0; for (uint32_t index = 0; index < hal.get_programmable_core_type_count(); index++) { CoreType core_type = hal.get_core_type(index); - for (const auto& kernel_group : this->get_kernel_groups(index)) { + for (KernelGroup &kernel_group : this->get_kernel_groups(index)) { // TODO: add a bit in the hal that says if this core type is unicast/multicast if (core_type == CoreType::WORKER) { std::vector> dst_noc_multicast_info = device->extract_dst_noc_multicast_info( - kernel_group->core_ranges.ranges(), core_type); + kernel_group.core_ranges.ranges(), core_type); std::vector kernel_ids; - for (int dispatch_class = 0; dispatch_class < kernel_group->kernel_ids.size(); dispatch_class++) { - auto &optional_id = kernel_group->kernel_ids[dispatch_class]; + for (int dispatch_class = 0; dispatch_class < kernel_group.kernel_ids.size(); dispatch_class++) { + auto &optional_id = kernel_group.kernel_ids[dispatch_class]; if (optional_id) { - KernelHandle device_local_kernel_id = 
program_utils::get_device_local_kernel_handle(optional_id.value()); - kernel_ids.push_back(device_local_kernel_id); + kernel_ids.push_back(optional_id.value()); int proc_sub_class = 0; - for (uint32_t& dst_addr : kernel_transfer_info.at(device_local_kernel_id).dst_base_addrs) { + for (uint32_t& dst_addr : kernel_transfer_info.at(optional_id.value()).dst_base_addrs) { // TODO: ditch this w/ linear writes based on program config kernel_text_offset and size - dst_addr = kernel_group->kernel_text_offsets[dispatch_class + proc_sub_class]; + dst_addr = kernel_group.kernel_text_offsets[dispatch_class + proc_sub_class]; proc_sub_class++; } } @@ -1137,12 +1128,11 @@ void detail::Program_::populate_dispatch_data(Device *device) { } else { TT_ASSERT(core_type == CoreType::ETH); std::vector> dst_noc_unicast_info = - extract_dst_noc_unicast_info(kernel_group->core_ranges.ranges(), core_type); + extract_dst_noc_unicast_info(kernel_group.core_ranges.ranges(), core_type); std::vector kernel_ids; - if (kernel_group->kernel_ids[DISPATCH_CLASS_ETH_DM0]) { - KernelHandle device_local_kernel_id = program_utils::get_device_local_kernel_handle(kernel_group->kernel_ids[DISPATCH_CLASS_ETH_DM0].value()); - kernel_ids.push_back(device_local_kernel_id); + if (kernel_group.kernel_ids[DISPATCH_CLASS_ETH_DM0]) { + kernel_ids.push_back(kernel_group.kernel_ids[DISPATCH_CLASS_ETH_DM0].value()); } for (const auto &[cores, num_mcast_dsts] : dst_noc_unicast_info) { @@ -1162,18 +1152,110 @@ void detail::Program_::populate_dispatch_data(Device *device) { } uint32_t detail::Program_::finalize_rt_args(uint32_t programmable_core_type_index, uint32_t base_offset) { + // Iterate over kernels in the program and "level" the number of RTAs based on the max // Unique RTAs are packed across dispatch classes // Common RTAs come after unique RTAs - return program_utils::finalize_rt_args( - this->kernels_[programmable_core_type_index], - this->get_kernel_groups(programmable_core_type_index), - base_offset, - 
programmable_core_type_index, - this->get_program_config(programmable_core_type_index).rta_offset, - this->get_program_config(programmable_core_type_index).crta_offsets, - this->get_program_config(programmable_core_type_index).crta_sizes - ); + uint32_t processor_classes = hal.get_processor_classes_count(programmable_core_type_index); + + std::vector max_rtas(processor_classes); + std::vector max_crtas(processor_classes); + uint32_t max_unique_rta_size = 0; + uint32_t total_crta_size = 0; + + CoreType core_type = hal.get_core_type(programmable_core_type_index); + HalProgrammableCoreType programmable_core_type = hal.get_programmable_core_type(programmable_core_type_index); + + this->get_program_config(programmable_core_type_index).rta_offset = base_offset; + + uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + for (auto& kg : this->get_kernel_groups(programmable_core_type_index)) { + for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { + max_rtas[dispatch_class] = 0; + auto& optional_id = kg.kernel_ids[dispatch_class]; + if (optional_id) { + auto kernel = get_kernel(optional_id.value()); + for (const CoreRange &core_range : kg.core_ranges.ranges()) { + for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) { + for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { + CoreCoord core_coord(x, y); + max_rtas[dispatch_class] = + std::max(max_rtas[dispatch_class], (uint32_t)kernel->runtime_args(core_coord).size()); + } + } + } + } + } + + uint32_t offset = 0; + for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { + auto& optional_id = kg.kernel_ids[dispatch_class]; + kg.rta_sizes[dispatch_class] = max_rtas[dispatch_class] * sizeof(uint32_t); + if (optional_id) { + auto kernel = get_kernel(optional_id.value()); + kernel->set_runtime_args_count(kg.core_ranges, max_rtas[dispatch_class]); + kg.launch_msg.kernel_config.rta_offset[dispatch_class].rta_offset = 
base_offset + offset; + offset += max_rtas[dispatch_class] * sizeof(uint32_t); + } else { + kg.launch_msg.kernel_config.rta_offset[dispatch_class].rta_offset = 0; + } + } + + kg.total_rta_size = offset; + offset = align(offset, l1_alignment); + max_unique_rta_size = std::max(offset, max_unique_rta_size); + } + + for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { + max_crtas[dispatch_class] = 0; + } + // Find the max # common RTAs across all kernels for each dispatch class + for (size_t kernel_id = 0; kernel_id < this->num_kernels(); kernel_id++) { + auto kernel = get_kernel(kernel_id); + // TODO: kernels should be stored by programmable core type + if (core_type == kernel->get_kernel_core_type() && + (programmable_core_type == HalProgrammableCoreType::IDLE_ETH) == kernel->is_idle_eth()) { + uint32_t dispatch_class = kernel->dispatch_class(); + max_crtas[dispatch_class] = + std::max(max_crtas[dispatch_class], (uint32_t)kernel->common_runtime_args().size()); + } + } + + // Calculate the address offset and size for common RTAs for each dispatch class + uint32_t offset = 0; + for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { + uint32_t size = max_crtas[dispatch_class] * sizeof(uint32_t); + this->get_program_config(programmable_core_type_index).crta_offsets[dispatch_class] = base_offset + max_unique_rta_size + offset; + this->get_program_config(programmable_core_type_index).crta_sizes[dispatch_class] = size; + offset += size; + offset = align(offset, l1_alignment); + } + total_crta_size = offset; + + // Set the runtime_args_data sizing info based on the shared max + for (size_t kernel_id = 0; kernel_id < this->num_kernels(); kernel_id++) { + auto kernel = get_kernel(kernel_id); + // TODO: as above, fix when kernels are stored by programmable core type + if (core_type == kernel->get_kernel_core_type() && + (programmable_core_type == HalProgrammableCoreType::IDLE_ETH) == kernel->is_idle_eth()) { + 
uint32_t dispatch_class = kernel->dispatch_class(); + kernel->set_common_runtime_args_count(max_crtas[dispatch_class]); + } + } + + // Set the kernel group common runtime arg offsets use in the launch message + for (auto& kg : this->get_kernel_groups(programmable_core_type_index)) { + for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { + kg.launch_msg.kernel_config.rta_offset[dispatch_class].crta_offset = this->get_program_config(programmable_core_type_index).crta_offsets[dispatch_class]; + } + } + + // TODO: this is asserted here as the leveling above can break the limits enforced by the API + // Once we use a ring buffer, memory space will be dynamic and this assert won't matter + std::uint32_t l1_kernel_config_size = tt::tt_metal::hal.get_dev_size(tt::tt_metal::HalProgrammableCoreType::TENSIX, tt::tt_metal::HalL1MemAddrType::KERNEL_CONFIG); + TT_FATAL(offset <= l1_kernel_config_size, "offset {} cannot exceed config size {}", offset, l1_kernel_config_size); + + return max_unique_rta_size + total_crta_size; } ProgramConfig& detail::Program_::get_program_config(uint32_t programmable_core_type_index) { @@ -1185,7 +1267,21 @@ ProgramConfig& Program::get_program_config(uint32_t programmable_core_type_index } uint32_t detail::Program_::finalize_sems(uint32_t programmable_core_type_index, uint32_t base_offset) { - return program_utils::finalize_sems(programmable_core_type_index, base_offset, this->semaphores_, this->program_configs_[programmable_core_type_index].sem_offset, this->program_configs_[programmable_core_type_index].sem_size); + + int max_id = -1; + CoreType core_type = hal.get_core_type(programmable_core_type_index); + for (const auto & sem : this->semaphores_) { + if (sem.core_type() == core_type && (int)sem.id() > max_id) { + max_id = sem.id(); + } + } + + uint32_t sem_size = (max_id + 1) * hal.get_alignment(HalMemType::L1); + + this->program_configs_[programmable_core_type_index].sem_offset = base_offset; + 
this->program_configs_[programmable_core_type_index].sem_size = sem_size; + + return base_offset + sem_size; } void detail::Program_::set_launch_msg_sem_offsets() { @@ -1193,7 +1289,7 @@ void detail::Program_::set_launch_msg_sem_offsets() { for (uint32_t kg_type_index = 0; kg_type_index < hal.get_programmable_core_type_count(); kg_type_index++) { for (auto& kg : this->get_kernel_groups(kg_type_index)) { for (uint32_t sem_type_index = 0; sem_type_index < hal.get_programmable_core_type_count(); sem_type_index++) { - kg->launch_msg.kernel_config.sem_offset[sem_type_index] = + kg.launch_msg.kernel_config.sem_offset[sem_type_index] = this->program_configs_[sem_type_index].sem_offset; } } @@ -1201,11 +1297,104 @@ void detail::Program_::set_launch_msg_sem_offsets() { } uint32_t detail::Program_::finalize_cbs(uint32_t programmable_core_type_index, uint32_t base_offset) { - return program_utils::finalize_cbs(programmable_core_type_index, this->get_kernel_groups(programmable_core_type_index), base_offset, this->program_configs_[programmable_core_type_index].cb_offset, this->program_configs_[programmable_core_type_index].cb_size, this->program_configs_[programmable_core_type_index].local_cb_size); + uint32_t max_local_end_index = 0; + uint32_t min_remote_start_index = NUM_CIRCULAR_BUFFERS; + // TODO: has to be better way to do this and don't read from volatile + auto& kernel_groups = this->get_kernel_groups(programmable_core_type_index); + for (auto& kg : kernel_groups) { + max_local_end_index = + std::max(max_local_end_index, (uint32_t)kg.launch_msg.kernel_config.max_local_cb_end_index); + min_remote_start_index = + std::min(min_remote_start_index, (uint32_t)kg.launch_msg.kernel_config.min_remote_cb_start_index); + } + + uint32_t local_cb_size = max_local_end_index * UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * sizeof(uint32_t); + uint32_t remote_cb_offset = base_offset + local_cb_size; + for (auto& kg : kernel_groups) { + kg.launch_msg.kernel_config.local_cb_offset = 
base_offset; + kg.launch_msg.kernel_config.remote_cb_offset = remote_cb_offset; + } + + uint32_t remote_cb_size = (NUM_CIRCULAR_BUFFERS - min_remote_start_index) * + UINT32_WORDS_PER_REMOTE_CIRCULAR_BUFFER_CONFIG * sizeof(uint32_t); + uint32_t total_cb_size = local_cb_size + remote_cb_size; + this->program_configs_[programmable_core_type_index].cb_offset = base_offset; + this->program_configs_[programmable_core_type_index].cb_size = total_cb_size; + this->program_configs_[programmable_core_type_index].local_cb_size = local_cb_size; + + return align(base_offset + total_cb_size, hal.get_alignment(HalMemType::L1)); } uint32_t detail::Program_::finalize_kernel_bins(Device *device, uint32_t programmable_core_type_index, uint32_t base_offset) { - return program_utils::finalize_kernel_bins(device, programmable_core_type_index, this->kernels_[programmable_core_type_index], this->get_kernel_groups(programmable_core_type_index), base_offset, this->program_configs_[programmable_core_type_index].kernel_text_offset, this->program_configs_[programmable_core_type_index].kernel_text_size); + + uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + + uint32_t max_offset = 0; + for (auto& kg : this->get_kernel_groups(programmable_core_type_index)) { + uint32_t offset = base_offset; + + for (int class_id = 0; class_id < DISPATCH_CLASS_MAX; class_id++) { + auto& optional_id = kg.kernel_ids[class_id]; + if (optional_id) { + const auto kernel = this->get_kernel(optional_id.value()); + std::vector const& binaries = kernel->binaries(device->build_key()); + // TODO: this is really ugly, save me future-HAL! 
+ if (programmable_core_type_index == hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX)) { + uint32_t binary_packed_size = kernel->get_binary_packed_size(device, 0); + + if (class_id == DISPATCH_CLASS_TENSIX_DM0) { + kg.kernel_bin_sizes[0] = binary_packed_size; + kg.kernel_text_offsets[0] = offset; + kg.launch_msg.kernel_config.kernel_text_offset[0] = offset; + offset += binary_packed_size; + offset = align(offset, l1_alignment); + } else if (class_id == DISPATCH_CLASS_TENSIX_DM1) { + kg.kernel_bin_sizes[1] = binary_packed_size; + kg.kernel_text_offsets[1] = offset; + kg.launch_msg.kernel_config.kernel_text_offset[1] = offset; + offset += binary_packed_size; + offset = align(offset, l1_alignment); + + uint32_t binary_text_size = kernel->get_binary_text_size(device, 0); + TT_ASSERT(binary_text_size >> 4 <= std::numeric_limits::max()); + kg.launch_msg.kernel_config.ncrisc_kernel_size16 = (binary_text_size + 15) >> 4; + } else { + constexpr uint32_t max_math_processors_count = 3; + for (uint32_t proc_type_index = 0; proc_type_index < max_math_processors_count; proc_type_index++) { + uint32_t binary_packed_size = kernel->get_binary_packed_size(device, proc_type_index); + kg.kernel_bin_sizes[2 + proc_type_index] = binary_packed_size; + kg.kernel_text_offsets[2 + proc_type_index] = offset; + kg.launch_msg.kernel_config.kernel_text_offset[2 + proc_type_index] = offset; + offset += binary_packed_size; + offset = align(offset, l1_alignment); + } + } + } else { + uint32_t binary_packed_size = kernel->get_binary_packed_size(device, 0); + kg.kernel_bin_sizes[class_id] = binary_packed_size; + + // No kernel config buffer on active eth yet + if (hal.get_programmable_core_type(kg.programmable_core_type_index) == + HalProgrammableCoreType::IDLE_ETH) { + kg.kernel_text_offsets[class_id] = offset; + kg.launch_msg.kernel_config.kernel_text_offset[class_id] = offset; + offset += binary_packed_size; + offset = align(offset, l1_alignment); + } else { + 
kg.kernel_text_offsets[class_id] = binaries[0]->get_text_addr(); + kg.launch_msg.kernel_config.kernel_text_offset[class_id] = binaries[0]->get_text_addr(); + } + } + } + } + + max_offset = std::max(offset, max_offset); + } + + this->program_configs_[programmable_core_type_index].kernel_text_offset = base_offset; + this->program_configs_[programmable_core_type_index].kernel_text_size = max_offset - base_offset; + + return max_offset; } uint32_t& detail::Program_::get_program_config_size(uint32_t programmable_core_type_index) { @@ -1238,13 +1427,13 @@ const std::vector &detail::Program_::determine_sub_device_ids(const for (const auto& kg : program_kgs) { for (size_t i = 0; i < device->num_sub_devices(); ++i) { const auto& sub_device_cores = device->worker_cores(core_type, SubDeviceId{i}); - auto intersection = sub_device_cores.intersection(kg->core_ranges); + auto intersection = sub_device_cores.intersection(kg.core_ranges); if (intersection.size() > 0) { used_sub_device_ids.insert(SubDeviceId{i}); num_intersections += intersection.num_cores(); } } - num_cores += kg->core_ranges.num_cores(); + num_cores += kg.core_ranges.num_cores(); } TT_FATAL(num_intersections == num_cores, "Kernel group cores do not match sub device cores for programmable core type {}", @@ -1273,6 +1462,15 @@ void detail::Program_::finalize(Device *device) { // Store the number of tensix "go signals" for use by CQ // CQ iterates over these to update runtime addresses, needs to know when eth begins (after tensix) // TODO: should store all the counts + this->tensix_go_signal_count_ = 0; + for (uint32_t index = 0; index < hal.get_programmable_core_type_count(); index++) { + CoreType core_type = hal.get_core_type(index); + if (core_type == CoreType::WORKER) { + for (auto& kg : this->get_kernel_groups(index)) { + this->tensix_go_signal_count_ += kg.core_ranges.size(); + } + } + } for (uint32_t index = 0; index < hal.get_programmable_core_type_count(); index++) { HalProgrammableCoreType 
programmable_core_type = static_cast(index); @@ -1309,33 +1507,6 @@ void detail::Program_::finalize(Device *device) { finalized_ = true; } -void Program::set_launch_msg_sem_offsets() { pimpl_->set_launch_msg_sem_offsets(); } -void Program::populate_dispatch_data(Device* device) { pimpl_->populate_dispatch_data(device); } - -void Program::generate_dispatch_commands(Device* device) { - bool is_cached = this->is_cached(); - uint64_t command_hash = device->build_key(); - if (not hal.is_coordinate_virtualization_enabled()) { - // When coordinate virtualization is not enabled, explicitly encode the device - // id into the command hash, to always assert on programs being reused across devices. - command_hash = (command_hash << 32) | (device->id()); - } - auto& cached_program_command_sequences = this->get_cached_program_command_sequences(); - if (!is_cached) { - auto sub_device_id = this->determine_sub_device_ids(device)[0]; - ProgramCommandSequence program_command_sequence; - program_utils::insert_empty_program_dispatch_preamble_cmd(program_command_sequence); - program_utils::insert_stall_cmds(program_command_sequence, sub_device_id, device); - program_utils::assemble_runtime_args_commands(program_command_sequence, *this, device); - program_utils::assemble_device_commands(program_command_sequence, *this, device, sub_device_id); - cached_program_command_sequences.insert({command_hash, std::move(program_command_sequence)}); - this->set_cached(); - } else { - auto cached_cmd_iter = cached_program_command_sequences.find(command_hash); - TT_FATAL(cached_cmd_iter != cached_program_command_sequences.end(), "Enqueueing a Program across devices with different cores harvested is not supported, unless coordinate virtualization is enabled (only enabled on Wormhole and above)."); - } -} - void Program::allocate_kernel_bin_buf_on_device(Device* device) { pimpl_->allocate_kernel_bin_buf_on_device(device); } void Program::finalize(Device *device) { pimpl_->finalize(device); } @@ -1642,11 
+1813,7 @@ std::shared_ptr Program::get_kernels_buffer(Device* device) const noexce return nullptr; } -void Program::set_kernels_bin_buffer(const std::shared_ptr& buffer) { - pimpl_->kernels_buffer_.insert({buffer->device()->id(), buffer}); -} - -std::vector &Program::get_program_config_sizes() const noexcept { return pimpl_->program_config_sizes_; } +const std::vector &Program::get_program_config_sizes() const noexcept { return pimpl_->program_config_sizes_; } std::unordered_map &Program::get_cached_program_command_sequences() noexcept { return pimpl_->cached_program_command_sequences_; diff --git a/tt_metal/impl/program/program.hpp b/tt_metal/impl/program/program.hpp index b237860be15..aee260a805a 100644 --- a/tt_metal/impl/program/program.hpp +++ b/tt_metal/impl/program/program.hpp @@ -12,7 +12,6 @@ #include "tt_metal/impl/buffers/semaphore.hpp" #include "tt_metal/impl/dispatch/program_command_sequence.hpp" #include "tt_metal/impl/program/program_device_map.hpp" -#include "tt_metal/impl/dispatch/worker_config_buffer.hpp" #include "dev_msgs.h" namespace tt { @@ -43,11 +42,6 @@ CBHandle CreateCircularBuffer( } // namespace experimental } // namespace v1 -namespace program_utils { - void assemble_device_commands( - ProgramCommandSequence& program_command_sequence, Program& program, Device* device, SubDeviceId sub_device_id); -} // namespace program_utils - class EnqueueProgramCommand; class HWCommandQueue; class JitBuildOptions; @@ -138,8 +132,7 @@ class Program { const std::vector< Semaphore > & semaphores() const; KernelGroup * kernels_on_core(const CoreCoord &core, uint32_t programmable_core_type_index); - std::vector>& get_kernel_groups(uint32_t programmable_core_type_index); - std::unordered_map>& get_kernels(uint32_t programmable_core_type_index); + std::vector& get_kernel_groups(uint32_t programmable_core_type_index); void add_buffer(std::shared_ptr buf); void release_buffers(); std::vector> circular_buffers_on_core(const CoreCoord &core) const; @@ -158,8 
+151,6 @@ class Program { void compile(Device * device, bool fd_bootloader_mode = false); - void generate_dispatch_commands(Device* device); - void invalidate_circular_buffer_allocation(); void allocate_circular_buffers(const Device *device); @@ -183,7 +174,7 @@ class Program { void set_last_used_command_queue_for_testing(HWCommandQueue *queue); const std::vector &determine_sub_device_ids(const Device *device); - void set_kernels_bin_buffer(const std::shared_ptr& buffer); + private: std::unique_ptr pimpl_; @@ -209,23 +200,20 @@ class Program { void add_semaphore(const CoreRangeSet & crs, uint32_t semaphore_id, uint32_t init_value, CoreType core_type); - void set_launch_msg_sem_offsets(); - void populate_dispatch_data(Device* device); - const ProgramTransferInfo &get_program_transfer_info() const noexcept; - std::shared_ptr get_kernels_buffer(Device* device) const noexcept; - std::vector &get_program_config_sizes() const noexcept; + friend void detail::AddConfigBuffer(Program &program, const std::shared_ptr& config_buffer); + bool runs_on_noc_unicast_only_cores(); bool runs_on_noc_multicast_only_cores(); - std::unordered_map &get_cached_program_command_sequences() noexcept; bool kernel_binary_always_stored_in_ringbuffer(); - friend void detail::AddConfigBuffer(Program &program, const std::shared_ptr& config_buffer); - friend void program_utils::assemble_device_commands( - ProgramCommandSequence& program_command_sequence, Program& program, Device* device, SubDeviceId sub_device_id); - friend HWCommandQueue; friend EnqueueProgramCommand; friend detail::Internal_; + + const ProgramTransferInfo &get_program_transfer_info() const noexcept; + std::shared_ptr get_kernels_buffer(Device* device) const noexcept; + const std::vector &get_program_config_sizes() const noexcept; + std::unordered_map &get_cached_program_command_sequences() noexcept; }; } // namespace v0 diff --git a/tt_metal/impl/program/program_dispatch_utils.cpp b/tt_metal/impl/program/program_dispatch_utils.cpp 
deleted file mode 100644 index c59357cdf26..00000000000 --- a/tt_metal/impl/program/program_dispatch_utils.cpp +++ /dev/null @@ -1,1472 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include "program_dispatch_utils.hpp" -#include "tt_metal/impl/dispatch/command_queue.hpp" -#include "tt_metal/impl/dispatch/data_collection.hpp" - -namespace tt::tt_metal { -namespace program_utils { - -enum DispatchWriteOffsets { - DISPATCH_WRITE_OFFSET_ZERO = 0, - DISPATCH_WRITE_OFFSET_TENSIX_L1_CONFIG_BASE = 1, - DISPATCH_WRITE_OFFSET_ETH_L1_CONFIG_BASE = 2, -}; - -uint32_t configure_rta_offsets_for_kernel_groups( - uint32_t programmable_core_type_index, - std::unordered_map>& kernels, - std::vector>& kernel_groups, - uint32_t base_offset) { - uint32_t processor_classes = hal.get_processor_classes_count(programmable_core_type_index); - std::vector max_rtas(processor_classes); - uint32_t max_unique_rta_size = 0; - uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); - - for (auto& kg : kernel_groups) { - for (std::size_t dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { - max_rtas[dispatch_class] = 0; - auto& optional_id = kg->kernel_ids[dispatch_class]; - if (optional_id) { - auto kernel = kernels.at(optional_id.value()); - for (const CoreRange& core_range : kg->core_ranges.ranges()) { - for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) { - for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { - CoreCoord core_coord(x, y); - max_rtas[dispatch_class] = - std::max(max_rtas[dispatch_class], (uint32_t)kernel->runtime_args(core_coord).size()); - } - } - } - } - } - uint32_t offset = 0; - for (std::size_t dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { - auto& optional_id = kg->kernel_ids[dispatch_class]; - kg->rta_sizes[dispatch_class] = max_rtas[dispatch_class] * sizeof(uint32_t); - if (optional_id) { - auto kernel = 
kernels.at(optional_id.value()); - kernel->set_runtime_args_count(kg->core_ranges, max_rtas[dispatch_class]); - kg->launch_msg.kernel_config.rta_offset[dispatch_class].rta_offset = base_offset + offset; - offset += max_rtas[dispatch_class] * sizeof(uint32_t); - } else { - kg->launch_msg.kernel_config.rta_offset[dispatch_class].rta_offset = 0; - } - } - kg->total_rta_size = offset; - offset = align(offset, l1_alignment); - max_unique_rta_size = std::max(offset, max_unique_rta_size); - } - return max_unique_rta_size; -} - -uint32_t configure_crta_offsets_for_kernel_groups( - uint32_t programmable_core_type_index, - std::unordered_map>& kernels, - std::vector>& kernel_groups, - uint32_t crta_base_offset, - std::array& crta_offsets, - std::array& crta_sizes) { - uint32_t processor_classes = hal.get_processor_classes_count(programmable_core_type_index); - std::vector max_crtas(processor_classes); - - for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { - max_crtas[dispatch_class] = 0; - } - // Find the max # common RTAs across all kernels for each dispatch class - for (auto& kernel_info : kernels) { - auto kernel = kernel_info.second; - uint32_t dispatch_class = kernel->dispatch_class(); - max_crtas[dispatch_class] = std::max(max_crtas[dispatch_class], (uint32_t)kernel->common_runtime_args().size()); - } - - // Derive crta offsets and sizes per dispatch class - uint32_t offset = 0; - uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); - for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { - uint32_t size = max_crtas[dispatch_class] * sizeof(uint32_t); - crta_offsets[dispatch_class] = crta_base_offset + offset; - crta_sizes[dispatch_class] = size; - offset += size; - offset = align(offset, l1_alignment); - } - uint32_t total_crta_size = offset; - - // Set the runtime_args_data sizing info based on the shared max - for (auto& kernel_info : kernels) { - auto kernel = kernel_info.second; - uint32_t 
dispatch_class = kernel->dispatch_class(); - kernel->set_common_runtime_args_count(max_crtas[dispatch_class]); - } - // Set the kernel group common runtime arg offsets use in the launch message - for (auto& kg : kernel_groups) { - for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { - kg->launch_msg.kernel_config.rta_offset[dispatch_class].crta_offset = crta_offsets[dispatch_class]; - } - } - return total_crta_size; -} - -uint32_t finalize_rt_args( - std::unordered_map>& kernels, - std::vector>& kernel_groups, - uint32_t base_offset, - uint32_t programmable_core_type_index, - uint32_t& rta_offset, - std::array& crta_offsets, - std::array& crta_sizes) { - CoreType core_type = hal.get_core_type(programmable_core_type_index); - HalProgrammableCoreType programmable_core_type = hal.get_programmable_core_type(programmable_core_type_index); - - uint32_t max_unique_rta_size = program_utils::configure_rta_offsets_for_kernel_groups( - programmable_core_type_index, kernels, kernel_groups, base_offset); - uint32_t crta_base_offset = base_offset + max_unique_rta_size; - uint32_t total_crta_size = program_utils::configure_crta_offsets_for_kernel_groups( - programmable_core_type_index, kernels, kernel_groups, crta_base_offset, crta_offsets, crta_sizes); - - uint32_t offset = max_unique_rta_size + total_crta_size; - // TODO: this is asserted here as the leveling above can break the limits enforced by the API - // Once we use a ring buffer, memory space will be dynamic and this assert won't matter - std::uint32_t l1_kernel_config_size = tt::tt_metal::hal.get_dev_size( - tt::tt_metal::HalProgrammableCoreType::TENSIX, tt::tt_metal::HalL1MemAddrType::KERNEL_CONFIG); - TT_FATAL(offset <= l1_kernel_config_size, "offset {} cannot exceed config size {}", offset, l1_kernel_config_size); - - rta_offset = base_offset; - return offset; -} - -uint32_t finalize_sems( - uint32_t programmable_core_type_index, - uint32_t sem_base_offset, - const std::vector& 
semaphores, - uint32_t& semaphore_offset, - uint32_t& semaphore_size) { - int max_id = -1; - CoreType core_type = hal.get_core_type(programmable_core_type_index); - for (const auto& sem : semaphores) { - if (sem.core_type() == core_type && (int)sem.id() > max_id) { - max_id = sem.id(); - } - } - uint32_t sem_size = (max_id + 1) * hal.get_alignment(HalMemType::L1); - semaphore_offset = sem_base_offset; - semaphore_size = sem_size; - return sem_base_offset + sem_size; -} - -uint32_t finalize_cbs( - uint32_t programmable_core_type_index, - std::vector>& kernel_groups, - uint32_t base_offset, - uint32_t& cb_offset, - uint32_t& cb_size, - uint32_t& local_cb_size) { - uint32_t max_local_end_index = 0; - uint32_t min_remote_start_index = NUM_CIRCULAR_BUFFERS; - - for (auto& kg : kernel_groups) { - max_local_end_index = - std::max(max_local_end_index, (uint32_t)kg->launch_msg.kernel_config.max_local_cb_end_index); - min_remote_start_index = - std::min(min_remote_start_index, (uint32_t)kg->launch_msg.kernel_config.min_remote_cb_start_index); - } - - local_cb_size = max_local_end_index * UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * sizeof(uint32_t); - uint32_t remote_cb_offset = base_offset + local_cb_size; - for (auto& kg : kernel_groups) { - kg->launch_msg.kernel_config.local_cb_offset = base_offset; - kg->launch_msg.kernel_config.remote_cb_offset = remote_cb_offset; - } - - uint32_t remote_cb_size = (NUM_CIRCULAR_BUFFERS - min_remote_start_index) * - UINT32_WORDS_PER_REMOTE_CIRCULAR_BUFFER_CONFIG * sizeof(uint32_t); - uint32_t total_cb_size = local_cb_size + remote_cb_size; - cb_offset = base_offset; - cb_size = total_cb_size; - - return align(base_offset + total_cb_size, hal.get_alignment(HalMemType::L1)); -} - -uint32_t finalize_kernel_bins( - Device* device, - uint32_t programmable_core_type_index, - const std::unordered_map>& kernels, - std::vector>& kernel_groups, - uint32_t base_offset, - uint32_t& kernel_text_offset, - uint32_t& kernel_text_size) { - uint32_t 
l1_alignment = hal.get_alignment(HalMemType::L1); - - uint32_t max_offset = 0; - for (auto& kg : kernel_groups) { - uint32_t offset = base_offset; - - for (int class_id = 0; class_id < DISPATCH_CLASS_MAX; class_id++) { - auto& optional_id = kg->kernel_ids[class_id]; - if (optional_id) { - const auto kernel = kernels.at(optional_id.value()); - const std::vector& binaries = kernel->binaries(device->build_key()); - // TODO: this is really ugly, save me future-HAL! - if (programmable_core_type_index == - hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX)) { - uint32_t binary_packed_size = kernel->get_binary_packed_size(device, 0); - - if (class_id == DISPATCH_CLASS_TENSIX_DM0) { - kg->kernel_bin_sizes[0] = binary_packed_size; - kg->kernel_text_offsets[0] = offset; - kg->launch_msg.kernel_config.kernel_text_offset[0] = offset; - offset += binary_packed_size; - offset = align(offset, l1_alignment); - } else if (class_id == DISPATCH_CLASS_TENSIX_DM1) { - kg->kernel_bin_sizes[1] = binary_packed_size; - kg->kernel_text_offsets[1] = offset; - kg->launch_msg.kernel_config.kernel_text_offset[1] = offset; - offset += binary_packed_size; - offset = align(offset, l1_alignment); - - uint32_t binary_text_size = kernel->get_binary_text_size(device, 0); - TT_ASSERT(binary_text_size >> 4 <= std::numeric_limits::max()); - kg->launch_msg.kernel_config.ncrisc_kernel_size16 = (binary_text_size + 15) >> 4; - } else { - constexpr uint32_t max_math_processors_count = 3; - for (uint32_t proc_type_index = 0; proc_type_index < max_math_processors_count; - proc_type_index++) { - uint32_t binary_packed_size = kernel->get_binary_packed_size(device, proc_type_index); - kg->kernel_bin_sizes[2 + proc_type_index] = binary_packed_size; - kg->kernel_text_offsets[2 + proc_type_index] = offset; - kg->launch_msg.kernel_config.kernel_text_offset[2 + proc_type_index] = offset; - offset += binary_packed_size; - offset = align(offset, l1_alignment); - } - } - } else { - uint32_t 
binary_packed_size = kernel->get_binary_packed_size(device, 0); - kg->kernel_bin_sizes[class_id] = binary_packed_size; - - // No kernel config buffer on active eth yet - if (hal.get_programmable_core_type(kg->programmable_core_type_index) == - HalProgrammableCoreType::IDLE_ETH) { - kg->kernel_text_offsets[class_id] = offset; - kg->launch_msg.kernel_config.kernel_text_offset[class_id] = offset; - offset += binary_packed_size; - offset = align(offset, l1_alignment); - } else { - kg->kernel_text_offsets[class_id] = binaries[0]->get_text_addr(); - kg->launch_msg.kernel_config.kernel_text_offset[class_id] = binaries[0]->get_text_addr(); - } - } - } - } - - max_offset = std::max(offset, max_offset); - } - kernel_text_offset = base_offset; - kernel_text_size = max_offset - base_offset; - return max_offset; -} - -uint32_t get_packed_write_max_unicast_sub_cmds(Device* device) { - return device->compute_with_storage_grid_size().x * device->compute_with_storage_grid_size().y; -} - -void insert_empty_program_dispatch_preamble_cmd(ProgramCommandSequence& program_command_sequence) { - // Initialize an empty preamble command in the Program Dispatch Cmd Sequence, which will be - // updated with the correct write offsets when the program is enqueued - uint32_t preamble_cmd_sizeB = hal.get_alignment(HalMemType::HOST); - program_command_sequence.preamble_command_sequence = HostMemDeviceCommand(preamble_cmd_sizeB); - program_command_sequence.preamble_command_sequence.add_dispatch_set_write_offsets(0, 0, 0); -} - -void insert_stall_cmds(ProgramCommandSequence& program_command_sequence, SubDeviceId sub_device_id, Device* device) { - // Initialize stall command sequences for this program. 
- auto dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); - uint32_t dispatch_message_addr = - dispatch_constants::get(dispatch_core_type) - .get_device_command_queue_addr(CommandQueueDeviceAddrType::DISPATCH_MESSAGE) + - dispatch_constants::get(dispatch_core_type).get_dispatch_message_offset(sub_device_id.to_index()); - uint32_t uncached_stall_cmd_sizeB = hal.get_alignment(HalMemType::HOST) + hal.get_alignment(HalMemType::HOST); - uint32_t cached_stall_cmd_seqB = hal.get_alignment(HalMemType::HOST); - - program_command_sequence.stall_command_sequences[UncachedStallSequenceIdx] = - HostMemDeviceCommand(uncached_stall_cmd_sizeB); - // Empty wait command initialized here. Will get updated when program is enqueued. - program_command_sequence.stall_command_sequences[UncachedStallSequenceIdx].add_dispatch_wait_with_prefetch_stall( - true, dispatch_message_addr, 0); - // Empty wait command initialized here. Will get updated when program is enqueued. - program_command_sequence.stall_command_sequences[CachedStallSequenceIdx] = - HostMemDeviceCommand(cached_stall_cmd_seqB); - program_command_sequence.stall_command_sequences[CachedStallSequenceIdx].add_dispatch_wait( - false, dispatch_message_addr, 0); -} - -template -uint32_t get_max_write_packed_sub_cmds( - uint32_t data_size, uint32_t max_prefetch_cmd_size, uint32_t packed_write_max_unicast_sub_cmds, bool no_stride) { - static_assert( - std::is_same::value or - std::is_same::value); - constexpr bool is_unicast = std::is_same::value; - uint32_t sub_cmd_sizeB = - is_unicast ? sizeof(CQDispatchWritePackedUnicastSubCmd) : sizeof(CQDispatchWritePackedMulticastSubCmd); - // Approximate calculation due to alignment - uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); - uint32_t max_prefetch_size = max_prefetch_cmd_size - sizeof(CQPrefetchCmd) - hal.get_alignment(HalMemType::HOST) - - sizeof(CQDispatchCmd) - l1_alignment; - uint32_t max_prefetch_num_packed_cmds = - no_stride ? 
(max_prefetch_size - align(data_size * sizeof(uint32_t), l1_alignment)) / sub_cmd_sizeB - : max_prefetch_size / (align(data_size * sizeof(uint32_t), l1_alignment) + sub_cmd_sizeB); - - uint32_t packed_write_max_multicast_sub_cmds = - get_packed_write_max_multicast_sub_cmds(packed_write_max_unicast_sub_cmds); - return std::min( - max_prefetch_num_packed_cmds, - is_unicast ? packed_write_max_unicast_sub_cmds : packed_write_max_multicast_sub_cmds); -}; - -template -void generate_runtime_args_cmds( - std::vector& runtime_args_command_sequences, - const uint32_t& l1_arg_base_addr, - const std::vector& sub_cmds, - const std::vector>>& rt_data_and_sizes, - const uint32_t& max_runtime_args_len, - std::vector>>& rt_args_data, - const uint32_t max_prefetch_command_size, - const uint32_t packed_write_max_unicast_sub_cmds, - bool no_stride, - enum DispatchWriteOffsets write_offset_index) { - static_assert( - std::is_same::value or - std::is_same::value); - - thread_local static auto get_runtime_payload_sizeB = - [](uint32_t num_packed_cmds, uint32_t runtime_args_len, bool is_unicast, bool no_stride) { - uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); - uint32_t sub_cmd_sizeB = - is_unicast ? sizeof(CQDispatchWritePackedUnicastSubCmd) : sizeof(CQDispatchWritePackedMulticastSubCmd); - uint32_t dispatch_cmd_sizeB = sizeof(CQDispatchCmd) + align(num_packed_cmds * sub_cmd_sizeB, l1_alignment); - uint32_t aligned_runtime_data_sizeB = - (no_stride ? 1 : num_packed_cmds) * align(runtime_args_len * sizeof(uint32_t), l1_alignment); - return dispatch_cmd_sizeB + aligned_runtime_data_sizeB; - }; - thread_local static auto get_runtime_args_data_offset = - [](uint32_t num_packed_cmds, uint32_t runtime_args_len, bool is_unicast) { - uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); - uint32_t sub_cmd_sizeB = - is_unicast ? 
sizeof(CQDispatchWritePackedUnicastSubCmd) : sizeof(CQDispatchWritePackedMulticastSubCmd); - uint32_t dispatch_cmd_sizeB = sizeof(CQDispatchCmd) + align(num_packed_cmds * sub_cmd_sizeB, l1_alignment); - return sizeof(CQPrefetchCmd) + dispatch_cmd_sizeB; - }; - - constexpr bool unicast = std::is_same::value; - - uint32_t num_packed_cmds_in_seq = sub_cmds.size(); - uint32_t max_packed_cmds = get_max_write_packed_sub_cmds( - max_runtime_args_len, max_prefetch_command_size, packed_write_max_unicast_sub_cmds, no_stride); - uint32_t offset_idx = 0; - if (no_stride) { - TT_FATAL( - max_packed_cmds >= num_packed_cmds_in_seq, - "num_packed_cmds_in_seq {} cannot exceed max_packed_cmds {} when no_stride is true", - num_packed_cmds_in_seq, - max_packed_cmds); - } - uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST); - uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); - while (num_packed_cmds_in_seq != 0) { - // Generate the device command - uint32_t num_packed_cmds = std::min(num_packed_cmds_in_seq, max_packed_cmds); - uint32_t rt_payload_sizeB = - get_runtime_payload_sizeB(num_packed_cmds, max_runtime_args_len, unicast, no_stride); - uint32_t cmd_sequence_sizeB = align(sizeof(CQPrefetchCmd) + rt_payload_sizeB, pcie_alignment); - runtime_args_command_sequences.emplace_back(cmd_sequence_sizeB); - runtime_args_command_sequences.back().add_dispatch_write_packed( - num_packed_cmds, - l1_arg_base_addr, - max_runtime_args_len * sizeof(uint32_t), - rt_payload_sizeB, - sub_cmds, - rt_data_and_sizes, - packed_write_max_unicast_sub_cmds, - offset_idx, - no_stride, - write_offset_index); - - // Update kernel RTA pointers to point into the generated command - // Future RTA updates through the API will update the command sequence directly - uint32_t data_offset = (uint32_t)get_runtime_args_data_offset(num_packed_cmds, max_runtime_args_len, unicast); - const uint32_t data_inc = align(max_runtime_args_len * sizeof(uint32_t), l1_alignment); - uint32_t num_data_copies = 
no_stride ? 1 : num_packed_cmds; - for (uint32_t i = offset_idx; i < offset_idx + num_data_copies; ++i) { - uint32_t offset = 0; - for (auto& data : rt_args_data[i]) { - data.get().rt_args_data = - (uint32_t*)((char*)runtime_args_command_sequences.back().data() + data_offset + offset); - offset += data.get().rt_args_count * sizeof(uint32_t); - } - data_offset += data_inc; - } - num_packed_cmds_in_seq -= num_packed_cmds; - offset_idx += num_packed_cmds; - } -} - -void assemble_runtime_args_commands( - ProgramCommandSequence& program_command_sequence, Program& program, Device* device) { - static const uint32_t packed_write_max_unicast_sub_cmds = get_packed_write_max_unicast_sub_cmds(device); - NOC noc_index = dispatch_downstream_noc; - CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); - const uint32_t max_prefetch_command_size = dispatch_constants::get(dispatch_core_type).max_prefetch_command_size(); - - // Dispatch Commands to Unicast Unique Runtime Args to Workers - std::vector unique_sub_cmds; - std::vector>> unique_rt_data_and_sizes; - std::vector>> unique_rt_args_data; - // Dispatch Commands to Multicast Common Runtime Args to Workers - std::variant, std::vector> - common_sub_cmds; - std::vector>> common_rt_data_and_sizes; - std::vector>> common_rt_args_data; - - program_command_sequence.runtime_args_command_sequences = {}; - uint32_t command_count = 0; - - // Unique RTAs - for (uint32_t programmable_core_type_index = 0; - programmable_core_type_index < hal.get_programmable_core_type_count(); - programmable_core_type_index++) { - if (hal.get_programmable_core_type(programmable_core_type_index) == HalProgrammableCoreType::IDLE_ETH) { - // Fast dispatch not supported on IDLE_ETH yet - continue; - } - for (auto& kg : program.get_kernel_groups(programmable_core_type_index)) { - if (kg->total_rta_size != 0) { - uint32_t num_sub_cmds = kg->core_ranges.num_cores(); - uint32_t max_runtime_args_len = kg->total_rta_size 
/ sizeof(uint32_t); - uint32_t max_packed_cmds = get_max_write_packed_sub_cmds( - max_runtime_args_len, max_prefetch_command_size, packed_write_max_unicast_sub_cmds, false); - command_count += div_up(num_sub_cmds, max_packed_cmds); - } - } - } - // Common RTAs - for (size_t kernel_id = 0; kernel_id < program.num_kernels(); kernel_id++) { - auto kernel = detail::GetKernel(program, kernel_id); - auto programmable_core_type = kernel->get_kernel_programmable_core_type(); - if (programmable_core_type == HalProgrammableCoreType::IDLE_ETH) { - // Fast dispatch not supported on IDLE_ETH yet - continue; - } - uint32_t programmable_core_type_index = hal.get_programmable_core_type_index(programmable_core_type); - uint32_t common_size = - program.get_program_config(programmable_core_type_index).crta_sizes[kernel->dispatch_class()]; - if (common_size != 0) { - uint32_t max_runtime_args_len = common_size / sizeof(uint32_t); - const auto& common_rt_args = kernel->common_runtime_args(); - if (common_rt_args.size() > 0) { - CoreType core_type = hal.get_core_type(programmable_core_type_index); - if (core_type == CoreType::ETH) { - uint32_t num_sub_cmds = kernel->logical_cores().size(); - uint32_t max_packed_cmds = get_max_write_packed_sub_cmds( - max_runtime_args_len, max_prefetch_command_size, packed_write_max_unicast_sub_cmds, true); - command_count += div_up(num_sub_cmds, max_packed_cmds); - } else { - uint32_t num_sub_cmds = kernel->logical_coreranges().size(); - uint32_t max_packed_cmds = get_max_write_packed_sub_cmds( - max_runtime_args_len, max_prefetch_command_size, packed_write_max_unicast_sub_cmds, true); - command_count += div_up(num_sub_cmds, max_packed_cmds); - } - } - } - } - - program_command_sequence.runtime_args_command_sequences.reserve(command_count); - for (uint32_t index = 0; index < hal.get_programmable_core_type_count(); index++) { - if (hal.get_programmable_core_type(index) == HalProgrammableCoreType::IDLE_ETH) { - // Fast dispatch not supported on IDLE_ETH 
yet - // TODO: can't just loop here as code below confuses ACTIVE/IDLE - continue; - } - CoreType core_type = hal.get_core_type(index); - uint32_t processor_classes = hal.get_processor_classes_count(index); - - for (auto& kg : program.get_kernel_groups(index)) { - if (kg->total_rta_size != 0) { - for (const CoreRange& core_range : kg->core_ranges.ranges()) { - for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) { - for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { - CoreCoord core_coord(x, y); - - unique_rt_args_data.resize(unique_rt_args_data.size() + 1); - unique_rt_data_and_sizes.resize(unique_rt_data_and_sizes.size() + 1); - for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { - auto& optional_id = kg->kernel_ids[dispatch_class]; - if (optional_id) { - auto device_local_kernel_handle = - get_device_local_kernel_handle(optional_id.value()); - auto kernel = detail::GetKernel(program, device_local_kernel_handle); - if (!kernel->cores_with_runtime_args().empty()) { - const auto& runtime_args_data = kernel->runtime_args(core_coord); - unique_rt_args_data.back().emplace_back(kernel->runtime_args_data(core_coord)); - TT_ASSERT( - runtime_args_data.size() * sizeof(uint32_t) <= - kg->rta_sizes[dispatch_class]); - unique_rt_data_and_sizes.back().emplace_back( - runtime_args_data.data(), - runtime_args_data.size() * sizeof(uint32_t), - kg->rta_sizes[dispatch_class]); - } - } - } - CoreCoord virtual_core = device->virtual_core_from_logical_core(core_coord, core_type); - unique_sub_cmds.emplace_back(CQDispatchWritePackedUnicastSubCmd{ - .noc_xy_addr = device->get_noc_unicast_encoding(noc_index, virtual_core)}); - } - } - } - uint32_t rta_offset = program.get_program_config(index).rta_offset; - generate_runtime_args_cmds( - program_command_sequence.runtime_args_command_sequences, - rta_offset, - unique_sub_cmds, - unique_rt_data_and_sizes, - kg->total_rta_size / sizeof(uint32_t), - 
unique_rt_args_data, - max_prefetch_command_size, - packed_write_max_unicast_sub_cmds, - false, - core_type == CoreType::WORKER ? DISPATCH_WRITE_OFFSET_TENSIX_L1_CONFIG_BASE - : DISPATCH_WRITE_OFFSET_ETH_L1_CONFIG_BASE); - for (auto& data_per_kernel : unique_rt_data_and_sizes) { - for (auto& data_and_sizes : data_per_kernel) { - RecordDispatchData(program, DISPATCH_DATA_RTARGS, std::get<1>(data_and_sizes)); - } - } - unique_sub_cmds.clear(); - unique_rt_data_and_sizes.clear(); - unique_rt_args_data.clear(); - } - } - - for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { - uint32_t common_size = program.get_program_config(index).crta_sizes[dispatch_class]; - if (common_size == 0) { - continue; - } - for (size_t kernel_id = 0; kernel_id < program.num_kernels(); kernel_id++) { - auto kernel = detail::GetKernel(program, kernel_id); - if (kernel->get_kernel_core_type() != core_type) { - continue; // TODO: fixme, need list of kernels by core_typexdispatch_class - } - if (kernel->dispatch_class() != dispatch_class) { - continue; // TODO: fixme, need list of kernels by core_typexdispatch_class - } - - const auto& common_rt_args = kernel->common_runtime_args(); - if (common_rt_args.size() > 0) { - common_rt_args_data.resize(common_rt_args_data.size() + 1); - common_rt_data_and_sizes.resize(common_rt_data_and_sizes.size() + 1); - - TT_ASSERT(kernel->common_runtime_args_data().size() * sizeof(uint32_t) == common_size); - TT_ASSERT(common_rt_args.size() * sizeof(uint32_t) <= common_size); - common_rt_data_and_sizes.back().emplace_back( - common_rt_args.data(), common_rt_args.size() * sizeof(uint32_t), common_size); - common_rt_args_data.back().emplace_back(kernel->common_runtime_args_data()); - - if (core_type == CoreType::ETH) { - common_sub_cmds.emplace>( - std::vector()); - auto& unicast_sub_cmd = - std::get>(common_sub_cmds); - unicast_sub_cmd.reserve(kernel->logical_cores().size()); - for (auto& core_coord : kernel->logical_cores()) { - // 
can make a vector of unicast encodings here - CoreCoord virtual_core_coords = - device->virtual_core_from_logical_core(core_coord, CoreType::ETH); - unicast_sub_cmd.emplace_back(CQDispatchWritePackedUnicastSubCmd{ - .noc_xy_addr = device->get_noc_unicast_encoding(noc_index, virtual_core_coords)}); - } - } else { - std::vector> dst_noc_multicast_info = - device->extract_dst_noc_multicast_info( - kernel->logical_coreranges(), core_type); - common_sub_cmds.emplace>( - std::vector()); - auto& multicast_sub_cmd = - std::get>(common_sub_cmds); - multicast_sub_cmd.reserve(dst_noc_multicast_info.size()); - for (const auto& mcast_dests : dst_noc_multicast_info) { - multicast_sub_cmd.emplace_back(CQDispatchWritePackedMulticastSubCmd{ - .noc_xy_addr = device->get_noc_multicast_encoding( - noc_index, std::get(mcast_dests.first)), - .num_mcast_dests = mcast_dests.second}); - } - } - } - } - - uint32_t crta_offset = program.get_program_config(index).crta_offsets[dispatch_class]; - - // Common rtas are always expected to fit in one prefetch cmd - // TODO: use a linear write instead of a packed-write - std::visit( - [&](auto&& sub_cmds) { - generate_runtime_args_cmds( - program_command_sequence.runtime_args_command_sequences, - crta_offset, - sub_cmds, - common_rt_data_and_sizes, - common_size / sizeof(uint32_t), - common_rt_args_data, - max_prefetch_command_size, - packed_write_max_unicast_sub_cmds, - true, - core_type == CoreType::WORKER ? 
DISPATCH_WRITE_OFFSET_TENSIX_L1_CONFIG_BASE - : DISPATCH_WRITE_OFFSET_ETH_L1_CONFIG_BASE); - sub_cmds.clear(); - }, - common_sub_cmds); - - for (auto& data_per_kernel : common_rt_data_and_sizes) { - for (auto& data_and_sizes : data_per_kernel) { - RecordDispatchData(program, DISPATCH_DATA_RTARGS, std::get<1>(data_and_sizes)); - } - } - common_rt_data_and_sizes.clear(); - common_rt_args_data.clear(); - } - } - TT_ASSERT( - command_count >= program_command_sequence.runtime_args_command_sequences.size(), - "Incorrect number of commands reserved {}, final size {}. Vector reallocation causes cached addresses to be " - "incorrect.", - command_count, - program_command_sequence.runtime_args_command_sequences.size()); - - uint32_t runtime_args_fetch_size_bytes = 0; - for (const auto& cmds : program_command_sequence.runtime_args_command_sequences) { - // BRISC, NCRISC, TRISC... - runtime_args_fetch_size_bytes += cmds.size_bytes(); - } - program_command_sequence.runtime_args_fetch_size_bytes = runtime_args_fetch_size_bytes; -} - -template -uint32_t insert_write_packed_payloads( - const uint32_t num_sub_cmds, - const uint32_t sub_cmd_sizeB, - const uint32_t max_prefetch_command_size, - const uint32_t packed_write_max_unicast_sub_cmds, - std::vector>& packed_cmd_payloads) { - uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); - const uint32_t aligned_sub_cmd_sizeB = align(sub_cmd_sizeB, l1_alignment); - const uint32_t max_packed_sub_cmds_per_cmd = get_max_write_packed_sub_cmds( - aligned_sub_cmd_sizeB, max_prefetch_command_size, packed_write_max_unicast_sub_cmds, false); - uint32_t rem_num_sub_cmds = num_sub_cmds; - uint32_t cmd_payload_sizeB = 0; - uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST); - while (rem_num_sub_cmds != 0) { - const uint32_t num_sub_cmds_in_cmd = std::min(max_packed_sub_cmds_per_cmd, rem_num_sub_cmds); - const uint32_t aligned_data_sizeB = aligned_sub_cmd_sizeB * num_sub_cmds_in_cmd; - const uint32_t dispatch_cmd_sizeB = - 
align(sizeof(CQDispatchCmd) + num_sub_cmds_in_cmd * sizeof(PackedSubCmd), l1_alignment); - packed_cmd_payloads.emplace_back(num_sub_cmds_in_cmd, dispatch_cmd_sizeB + aligned_data_sizeB); - cmd_payload_sizeB += align(sizeof(CQPrefetchCmd) + packed_cmd_payloads.back().second, pcie_alignment); - rem_num_sub_cmds -= num_sub_cmds_in_cmd; - } - return cmd_payload_sizeB; -} - -void assemble_device_commands( - ProgramCommandSequence& program_command_sequence, Program& program, Device* device, SubDeviceId sub_device_id) { - uint32_t cmd_sequence_sizeB = 0; - CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); - NOC noc_index = dispatch_downstream_noc; - const uint32_t max_prefetch_command_size = dispatch_constants::get(dispatch_core_type).max_prefetch_command_size(); - static const uint32_t packed_write_max_unicast_sub_cmds = get_packed_write_max_unicast_sub_cmds(device); - const auto& program_transfer_info = program.get_program_transfer_info(); - // Multicast Semaphore Cmd - uint32_t num_multicast_semaphores = program_transfer_info.multicast_semaphores.size(); - std::vector> multicast_sem_sub_cmds(num_multicast_semaphores); - std::vector>> multicast_sem_data(num_multicast_semaphores); - std::vector>> multicast_sem_payload(num_multicast_semaphores); - std::vector> multicast_sem_dst_size; - multicast_sem_dst_size.reserve(num_multicast_semaphores); - - if (num_multicast_semaphores > 0) { - uint32_t i = 0; - for (const auto& [dst, transfer_info_vec] : program_transfer_info.multicast_semaphores) { - // TODO: loop over things inside transfer_info[i] - uint32_t write_packed_len = transfer_info_vec[0].data.size(); - multicast_sem_dst_size.emplace_back(std::make_pair(dst, write_packed_len * sizeof(uint32_t))); - - for (const auto& transfer_info : transfer_info_vec) { - for (const auto& dst_noc_info : transfer_info.dst_noc_info) { - TT_ASSERT( - transfer_info.data.size() == write_packed_len, - "Not all data std::vectors in write 
packed semaphore cmd equal in len"); - multicast_sem_sub_cmds[i].emplace_back(CQDispatchWritePackedMulticastSubCmd{ - .noc_xy_addr = - device->get_noc_multicast_encoding(noc_index, std::get(dst_noc_info.first)), - .num_mcast_dests = dst_noc_info.second}); - multicast_sem_data[i].emplace_back( - transfer_info.data.data(), transfer_info.data.size() * sizeof(uint32_t)); - } - } - cmd_sequence_sizeB += insert_write_packed_payloads( - multicast_sem_sub_cmds[i].size(), - multicast_sem_dst_size.back().second, - max_prefetch_command_size, - packed_write_max_unicast_sub_cmds, - multicast_sem_payload[i]); - i++; - } - } - - // Unicast Semaphore Cmd - uint32_t num_unicast_semaphores = program_transfer_info.unicast_semaphores.size(); - std::vector> unicast_sem_sub_cmds(num_unicast_semaphores); - std::vector>> unicast_sem_data(num_unicast_semaphores); - std::vector>> unicast_sem_payload(num_unicast_semaphores); - std::vector> unicast_sem_dst_size; - unicast_sem_dst_size.reserve(num_unicast_semaphores); - if (num_unicast_semaphores > 0) { - uint32_t i = 0; - for (const auto& [dst, transfer_info_vec] : program_transfer_info.unicast_semaphores) { - // TODO: loop over things inside transfer_info[i] - uint32_t write_packed_len = transfer_info_vec[0].data.size(); - unicast_sem_dst_size.emplace_back(std::make_pair(dst, write_packed_len * sizeof(uint32_t))); - - for (const auto& transfer_info : transfer_info_vec) { - for (const auto& dst_noc_info : transfer_info.dst_noc_info) { - TT_ASSERT( - transfer_info.data.size() == write_packed_len, - "Not all data std::vectors in write packed semaphore cmd equal in len"); - unicast_sem_sub_cmds[i].emplace_back(CQDispatchWritePackedUnicastSubCmd{ - .noc_xy_addr = - device->get_noc_unicast_encoding(noc_index, std::get(dst_noc_info.first))}); - unicast_sem_data[i].emplace_back( - transfer_info.data.data(), transfer_info.data.size() * sizeof(uint32_t)); - } - } - cmd_sequence_sizeB += insert_write_packed_payloads( - unicast_sem_sub_cmds[i].size(), - 
unicast_sem_dst_size.back().second, - max_prefetch_command_size, - packed_write_max_unicast_sub_cmds, - unicast_sem_payload[i]); - i++; - } - } - - uint32_t index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); - - const auto& circular_buffers_unique_coreranges = program.circular_buffers_unique_coreranges(); - const uint16_t num_multicast_cb_sub_cmds = circular_buffers_unique_coreranges.size(); - std::vector> mcast_cb_payload; - uint16_t cb_config_size_bytes = 0; - uint32_t aligned_cb_config_size_bytes = 0; - std::vector> cb_config_payloads( - num_multicast_cb_sub_cmds, - std::vector(program.get_program_config(index).cb_size / sizeof(uint32_t), 0)); - std::vector multicast_cb_config_sub_cmds; - std::vector> multicast_cb_config_data; - if (num_multicast_cb_sub_cmds > 0) { - multicast_cb_config_sub_cmds.reserve(num_multicast_cb_sub_cmds); - multicast_cb_config_data.reserve(num_multicast_cb_sub_cmds); - program_command_sequence.circular_buffers_on_core_ranges.resize(num_multicast_cb_sub_cmds); - uint32_t i = 0; - uint32_t max_overall_index = 0; - uint32_t remote_offset_index = program.get_program_config(index).local_cb_size / sizeof(uint32_t); - for (const CoreRange& core_range : circular_buffers_unique_coreranges) { - const CoreCoord virtual_start = - device->virtual_core_from_logical_core(core_range.start_coord, CoreType::WORKER); - const CoreCoord virtual_end = - device->virtual_core_from_logical_core(core_range.end_coord, CoreType::WORKER); - - const uint32_t num_receivers = core_range.size(); - auto& cb_config_payload = cb_config_payloads[i]; - uint32_t max_index = 0; - const auto& circular_buffers_on_corerange = program.circular_buffers_on_corerange(core_range); - program_command_sequence.circular_buffers_on_core_ranges[i].reserve(circular_buffers_on_corerange.size()); - for (const std::shared_ptr& cb : circular_buffers_on_corerange) { - program_command_sequence.circular_buffers_on_core_ranges[i].emplace_back(cb); - const uint32_t 
cb_address = cb->address(); - const uint32_t cb_size = cb->size(); - for (const auto& buffer_index : cb->local_buffer_indices()) { - // 1 cmd for all 32 buffer indices, populate with real data for specified indices - // cb config payload - const uint32_t base_index = UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * buffer_index; - cb_config_payload[base_index] = cb_address; - cb_config_payload[base_index + 1] = cb_size; - cb_config_payload[base_index + 2] = cb->num_pages(buffer_index); - cb_config_payload[base_index + 3] = cb->page_size(buffer_index); - max_index = std::max(max_index, base_index + UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG); - } - for (const auto& buffer_index : cb->remote_buffer_indices()) { - const uint32_t base_index = - remote_offset_index + - (NUM_CIRCULAR_BUFFERS - 1 - buffer_index) * UINT32_WORDS_PER_REMOTE_CIRCULAR_BUFFER_CONFIG; - cb_config_payload[base_index] = cb->config_address(); - cb_config_payload[base_index + 1] = cb->page_size(buffer_index); - max_index = std::max(max_index, base_index + UINT32_WORDS_PER_REMOTE_CIRCULAR_BUFFER_CONFIG); - } - } - multicast_cb_config_sub_cmds.emplace_back(CQDispatchWritePackedMulticastSubCmd{ - .noc_xy_addr = device->get_noc_multicast_encoding(noc_index, CoreRange(virtual_start, virtual_end)), - .num_mcast_dests = (uint32_t)core_range.size()}); - multicast_cb_config_data.emplace_back(cb_config_payload.data(), max_index * sizeof(uint32_t)); - max_overall_index = std::max(max_overall_index, max_index); - i++; - } - uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); - cb_config_size_bytes = max_overall_index * sizeof(uint32_t); - aligned_cb_config_size_bytes = align(cb_config_size_bytes, l1_alignment); - cmd_sequence_sizeB += insert_write_packed_payloads( - num_multicast_cb_sub_cmds, - cb_config_size_bytes, - max_prefetch_command_size, - packed_write_max_unicast_sub_cmds, - mcast_cb_payload); - } - - // Program Binaries and Go Signals - // Get launch msg data while getting size of cmds - 
std::vector> kernel_bins_prefetch_subcmds; - std::vector> kernel_bins_dispatch_subcmds; - std::vector kernel_bins_write_packed_large_data_aligned_sizeB; - std::vector kernel_bins_unicast_cmds; - const uint32_t max_length_per_sub_cmd = dispatch_constants::get(dispatch_core_type).scratch_db_size() / 2; - const uint32_t max_paged_length_per_sub_cmd = - max_length_per_sub_cmd / HostMemDeviceCommand::PROGRAM_PAGE_SIZE * HostMemDeviceCommand::PROGRAM_PAGE_SIZE; - if (program_transfer_info.kernel_bins.size()) { - TT_FATAL( - program.get_kernels_buffer(device).get(), "Expected Kernel Binary Buffer to be allocated for program."); - } - const auto kernels_buffer = program.get_kernels_buffer(device); - for (const auto& [cores, num_mcast_dests, kg_transfer_info] : program_transfer_info.kernel_bins) { - bool write_linear; - uint32_t noc_encoding; - std::visit( - [&](auto&& cores) { - using T = std::decay_t; - if constexpr (std::is_same_v) { - noc_encoding = device->get_noc_multicast_encoding(noc_index, cores); - write_linear = false; - } else { - noc_encoding = device->get_noc_unicast_encoding(noc_index, cores); - write_linear = true; - } - }, - cores); - for (uint32_t kernel_idx = 0; kernel_idx < kg_transfer_info.dst_base_addrs.size(); kernel_idx++) { - if (write_linear) { - kernel_bins_unicast_cmds.emplace_back(2 * hal.get_alignment(HalMemType::HOST)); - cmd_sequence_sizeB += 2 * hal.get_alignment(HalMemType::HOST); - constexpr bool flush_prefetch = false; - kernel_bins_unicast_cmds.back().add_dispatch_write_linear( - num_mcast_dests, // num_mcast_dests - noc_encoding, // noc_xy_addr - kg_transfer_info.dst_base_addrs[kernel_idx], - kg_transfer_info.lengths[kernel_idx]); - RecordDispatchData( - program, - DISPATCH_DATA_BINARY, - kg_transfer_info.lengths[kernel_idx], - kg_transfer_info.riscvs[kernel_idx]); - // Difference between prefetch total relayed pages and dispatch write linear - uint32_t relayed_bytes = - align(kg_transfer_info.lengths[kernel_idx], 
HostMemDeviceCommand::PROGRAM_PAGE_SIZE); - uint16_t length_adjust = uint16_t(relayed_bytes - kg_transfer_info.lengths[kernel_idx]); - - uint32_t base_address, page_offset; - if (kg_transfer_info.page_offsets[kernel_idx] > CQ_PREFETCH_RELAY_PAGED_START_PAGE_MASK) { - const uint32_t num_banks = device->num_banks(kernels_buffer->buffer_type()); - page_offset = kg_transfer_info.page_offsets[kernel_idx] % num_banks; - uint32_t num_full_pages_written_per_bank = kg_transfer_info.page_offsets[kernel_idx] / num_banks; - base_address = - kernels_buffer->address() + num_full_pages_written_per_bank * kernels_buffer->page_size(); - } else { - base_address = kernels_buffer->address(); - page_offset = kg_transfer_info.page_offsets[kernel_idx]; - } - - kernel_bins_unicast_cmds.back().add_prefetch_relay_paged( - true, // is_dram - page_offset, - base_address, - kernels_buffer->page_size(), - relayed_bytes / kernels_buffer->page_size(), - length_adjust); - } else { - uint32_t base_address = kernels_buffer->address(); - uint32_t page_offset = kg_transfer_info.page_offsets[kernel_idx]; - - // TODO: pack all these writes into 1 linear write - uint32_t kernel_config_buffer_offset = kg_transfer_info.dst_base_addrs[kernel_idx]; - uint32_t aligned_length = - align(kg_transfer_info.lengths[kernel_idx], hal.get_alignment(HalMemType::DRAM)); - uint32_t padding = aligned_length - kg_transfer_info.lengths[kernel_idx]; - while (aligned_length != 0) { - if (kernel_bins_dispatch_subcmds.empty() || - kernel_bins_dispatch_subcmds.back().size() == CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_MAX_SUB_CMDS) { - kernel_bins_dispatch_subcmds.push_back({}); - kernel_bins_prefetch_subcmds.push_back({}); - kernel_bins_write_packed_large_data_aligned_sizeB.push_back(0); - } - uint32_t write_length, read_length; - if (aligned_length <= max_length_per_sub_cmd) { - read_length = aligned_length; - write_length = read_length - padding; - } else { - read_length = max_paged_length_per_sub_cmd; - write_length = read_length; 
- } - if (!kernel_bins_dispatch_subcmds.back().empty()) { - auto& back = kernel_bins_dispatch_subcmds.back().back(); - if (back.noc_xy_addr != noc_encoding) { - back.flags = CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_UNLINK; - } - } - kernel_bins_dispatch_subcmds.back().emplace_back(CQDispatchWritePackedLargeSubCmd{ - .noc_xy_addr = noc_encoding, - .addr = kernel_config_buffer_offset, - .length = (uint16_t)write_length, - .num_mcast_dests = (uint8_t)num_mcast_dests, - .flags = CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_NONE}); - RecordDispatchData( - program, DISPATCH_DATA_BINARY, write_length, kg_transfer_info.riscvs[kernel_idx]); - kernel_config_buffer_offset += write_length; - - kernel_bins_prefetch_subcmds.back().emplace_back(CQPrefetchRelayPagedPackedSubCmd{ - .start_page = (uint16_t)page_offset, - .log_page_size = (uint16_t)HostMemDeviceCommand::LOG2_PROGRAM_PAGE_SIZE, - .base_addr = base_address, - .length = read_length}); - page_offset += read_length / HostMemDeviceCommand::PROGRAM_PAGE_SIZE; - aligned_length -= read_length; - kernel_bins_write_packed_large_data_aligned_sizeB.back() += read_length; - } - } - } - } - // Unlink the last subcmd of every dispatch, to ensure we don't hold the - // path reservation for an incredible long time. This also prevents a hang - // if the next mcast is to a different destination. 
- for (auto& subcmd_list : kernel_bins_dispatch_subcmds) { - if (!subcmd_list.empty()) { - subcmd_list.back().flags |= CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_UNLINK; - } - } - uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST); - for (uint32_t i = 0; i < kernel_bins_dispatch_subcmds.size(); ++i) { - cmd_sequence_sizeB += align( - ((sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd))) + - kernel_bins_dispatch_subcmds[i].size() * sizeof(CQDispatchWritePackedLargeSubCmd), - pcie_alignment); - cmd_sequence_sizeB += align( - kernel_bins_prefetch_subcmds[i].size() * sizeof(CQPrefetchRelayPagedPackedSubCmd) + sizeof(CQPrefetchCmd), - pcie_alignment); - } - std::vector> multicast_go_signal_data; - std::vector> unicast_go_signal_data; - std::vector multicast_go_signal_sub_cmds; - std::vector unicast_go_signal_sub_cmds; - std::vector> multicast_go_signals_payload; - std::vector> unicast_go_signals_payload; - constexpr uint32_t go_signal_sizeB = sizeof(launch_msg_t); - uint32_t aligned_go_signal_sizeB = align(go_signal_sizeB, hal.get_alignment(HalMemType::L1)); - uint32_t go_signal_size_words = aligned_go_signal_sizeB / sizeof(uint32_t); - - // TODO: eventually the code below could be structured to loop over programmable_indices - // and check for mcast/unicast - uint32_t programmable_core_index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); - for (auto& kernel_group : program.get_kernel_groups(programmable_core_index)) { - kernel_group->launch_msg.kernel_config.mode = DISPATCH_MODE_DEV; - for (uint32_t i = 0; i < NUM_PROGRAMMABLE_CORE_TYPES; i++) { - kernel_group->launch_msg.kernel_config.kernel_config_base[i] = 0; - } - kernel_group->launch_msg.kernel_config.host_assigned_id = program.get_runtime_id(); - const void* launch_message_data = (const void*)(&(kernel_group->launch_msg)); - for (const CoreRange& core_range : kernel_group->core_ranges.ranges()) { - CoreCoord virtual_start = - 
device->virtual_core_from_logical_core(core_range.start_coord, kernel_group->get_core_type()); - CoreCoord virtual_end = - device->virtual_core_from_logical_core(core_range.end_coord, kernel_group->get_core_type()); - - multicast_go_signal_sub_cmds.emplace_back(CQDispatchWritePackedMulticastSubCmd{ - .noc_xy_addr = device->get_noc_multicast_encoding(noc_index, CoreRange(virtual_start, virtual_end)), - .num_mcast_dests = (uint32_t)core_range.size()}); - multicast_go_signal_data.emplace_back(launch_message_data, go_signal_sizeB); - } - } - if (multicast_go_signal_sub_cmds.size() > 0) { - cmd_sequence_sizeB += insert_write_packed_payloads( - multicast_go_signal_sub_cmds.size(), - go_signal_sizeB, - max_prefetch_command_size, - packed_write_max_unicast_sub_cmds, - multicast_go_signals_payload); - } - - programmable_core_index = hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH); - // TODO: ugly, can be fixed by looping over indices w/ some work - if (programmable_core_index != -1) { - for (auto& kernel_group : program.get_kernel_groups(programmable_core_index)) { - kernel_group->launch_msg.kernel_config.mode = DISPATCH_MODE_DEV; - // Set the kernel_config_base addrs to 0 when generating the dispatch commands for the program - // Will be resolved at runtime - for (uint32_t i = 0; i < NUM_PROGRAMMABLE_CORE_TYPES; i++) { - kernel_group->launch_msg.kernel_config.kernel_config_base[i] = 0; - } - kernel_group->launch_msg.kernel_config.host_assigned_id = program.get_runtime_id(); - const void* launch_message_data = (const launch_msg_t*)(&(kernel_group->launch_msg)); - for (const CoreRange& core_range : kernel_group->core_ranges.ranges()) { - for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) { - for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { - CoreCoord virtual_coord = - device->virtual_core_from_logical_core(CoreCoord({x, y}), kernel_group->get_core_type()); - 
unicast_go_signal_sub_cmds.emplace_back(CQDispatchWritePackedUnicastSubCmd{ - .noc_xy_addr = device->get_noc_unicast_encoding(noc_index, virtual_coord)}); - unicast_go_signal_data.emplace_back(launch_message_data, go_signal_sizeB); - } - } - } - } - } - - if (unicast_go_signal_sub_cmds.size() > 0) { - cmd_sequence_sizeB += insert_write_packed_payloads( - unicast_go_signal_sub_cmds.size(), - go_signal_sizeB, - max_prefetch_command_size, - packed_write_max_unicast_sub_cmds, - unicast_go_signals_payload); - } - - // if dispatch_s is enabled have dispatch_d send a semaphore update to dispatch_s (this will include a write barrier - // on dispatch_d if program is active) if not, check if the program is active on workers. If active, have - // dispatch_d issue a write barrier - cmd_sequence_sizeB += (device->dispatch_s_enabled() || program_transfer_info.num_active_cores > 0) * - hal.get_alignment(HalMemType::HOST); - - // either dispatch_s or dispatch_d will send the go signal (go_signal_mcast command) - const auto& noc_data_start_idx = device->noc_data_start_index( - sub_device_id, multicast_go_signal_sub_cmds.size() > 0, unicast_go_signal_sub_cmds.size() > 0); - const auto& num_noc_mcast_txns = - multicast_go_signal_sub_cmds.size() > 0 ? device->num_noc_mcast_txns(sub_device_id) : 0; - const auto& num_noc_unicast_txns = - unicast_go_signal_sub_cmds.size() > 0 ? 
device->num_noc_unicast_txns(sub_device_id) : 0; - cmd_sequence_sizeB += align(sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd), pcie_alignment); - - program_command_sequence.device_command_sequence = HostMemDeviceCommand(cmd_sequence_sizeB); - - auto& device_command_sequence = program_command_sequence.device_command_sequence; - - uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); - - // Semaphores - // Multicast Semaphore Cmd - index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); - for (uint32_t i = 0; i < num_multicast_semaphores; ++i) { - uint32_t curr_sub_cmd_idx = 0; - for (const auto& [num_sub_cmds_in_cmd, multicast_sem_payload_sizeB] : multicast_sem_payload[i]) { - device_command_sequence.add_dispatch_write_packed( - num_sub_cmds_in_cmd, - multicast_sem_dst_size[i].first + program.get_program_config(index).sem_offset, - multicast_sem_dst_size[i].second, - multicast_sem_payload_sizeB, - multicast_sem_sub_cmds[i], - multicast_sem_data[i], - packed_write_max_unicast_sub_cmds, - curr_sub_cmd_idx, - false, - DISPATCH_WRITE_OFFSET_TENSIX_L1_CONFIG_BASE); - curr_sub_cmd_idx += num_sub_cmds_in_cmd; - for (auto& data_and_size : multicast_sem_data[i]) { - RecordDispatchData(program, DISPATCH_DATA_SEMAPHORE, data_and_size.second); - } - } - } - - // Unicast Semaphore Cmd - index = hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH); - for (uint32_t i = 0; i < num_unicast_semaphores; ++i) { - uint32_t curr_sub_cmd_idx = 0; - for (const auto& [num_sub_cmds_in_cmd, unicast_sem_payload_sizeB] : unicast_sem_payload[i]) { - device_command_sequence.add_dispatch_write_packed( - num_sub_cmds_in_cmd, - unicast_sem_dst_size[i].first + program.get_program_config(index).sem_offset, - unicast_sem_dst_size[i].second, - unicast_sem_payload_sizeB, - unicast_sem_sub_cmds[i], - unicast_sem_data[i], - packed_write_max_unicast_sub_cmds, - curr_sub_cmd_idx, - false, - DISPATCH_WRITE_OFFSET_ETH_L1_CONFIG_BASE); - curr_sub_cmd_idx += 
num_sub_cmds_in_cmd; - for (auto& data_and_size : unicast_sem_data[i]) { - RecordDispatchData(program, DISPATCH_DATA_SEMAPHORE, data_and_size.second); - } - } - } - - // CB Configs commands - index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); - if (num_multicast_cb_sub_cmds > 0) { - uint32_t curr_sub_cmd_idx = 0; - program_command_sequence.cb_configs_payloads.reserve(num_multicast_cb_sub_cmds); - const uint32_t cb_config_size_words = aligned_cb_config_size_bytes / sizeof(uint32_t); - for (const auto& [num_sub_cmds_in_cmd, mcast_cb_payload_sizeB] : mcast_cb_payload) { - uint32_t write_offset_bytes = device_command_sequence.write_offset_bytes(); - device_command_sequence.add_dispatch_write_packed( - num_sub_cmds_in_cmd, - program.get_program_config(index).cb_offset, - cb_config_size_bytes, - mcast_cb_payload_sizeB, - multicast_cb_config_sub_cmds, - multicast_cb_config_data, - packed_write_max_unicast_sub_cmds, - curr_sub_cmd_idx, - false, - DISPATCH_WRITE_OFFSET_TENSIX_L1_CONFIG_BASE); - for (auto& data_and_size : multicast_cb_config_data) { - RecordDispatchData(program, DISPATCH_DATA_CB_CONFIG, data_and_size.second); - } - curr_sub_cmd_idx += num_sub_cmds_in_cmd; - RecordDispatchData(program, DISPATCH_DATA_CB_CONFIG, mcast_cb_payload_sizeB); - uint32_t curr_sub_cmd_data_offset_words = - (write_offset_bytes + (sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd)) + - align(num_sub_cmds_in_cmd * sizeof(CQDispatchWritePackedMulticastSubCmd), l1_alignment)) / - sizeof(uint32_t); - for (uint32_t i = 0; i < num_sub_cmds_in_cmd; ++i) { - program_command_sequence.cb_configs_payloads.push_back( - (uint32_t*)device_command_sequence.data() + curr_sub_cmd_data_offset_words); - curr_sub_cmd_data_offset_words += cb_config_size_words; - } - } - } - // All Previous Cmds Up to This Point Go Into the Kernel Config Buffer - program_command_sequence.program_config_buffer_data_size_bytes = device_command_sequence.write_offset_bytes(); - - // Program Binaries - for 
(const auto& kernel_bins_unicast_cmd : kernel_bins_unicast_cmds) { - device_command_sequence.add_data( - kernel_bins_unicast_cmd.data(), kernel_bins_unicast_cmd.size_bytes(), kernel_bins_unicast_cmd.size_bytes()); - } - uint32_t dram_alignment = hal.get_alignment(HalMemType::DRAM); - for (uint32_t i = 0; i < kernel_bins_dispatch_subcmds.size(); ++i) { - device_command_sequence.add_dispatch_write_packed_large( - dram_alignment, - kernel_bins_dispatch_subcmds[i].size(), - kernel_bins_dispatch_subcmds[i], - 0, - DISPATCH_WRITE_OFFSET_TENSIX_L1_CONFIG_BASE); - device_command_sequence.add_prefetch_relay_paged_packed( - kernel_bins_write_packed_large_data_aligned_sizeB[i], - kernel_bins_prefetch_subcmds[i], - kernel_bins_prefetch_subcmds[i].size()); - } - - // Go Signals - program_command_sequence.go_signals.reserve( - multicast_go_signal_sub_cmds.size() + unicast_go_signal_sub_cmds.size()); - - // Launch Message address is resolved when the program is enqueued - uint32_t multicast_launch_msg_addr = 0; - - if (multicast_go_signal_sub_cmds.size() > 0) { - uint32_t curr_sub_cmd_idx = 0; - for (const auto& [num_sub_cmds_in_cmd, multicast_go_signal_payload_sizeB] : multicast_go_signals_payload) { - uint32_t write_offset_bytes = device_command_sequence.write_offset_bytes(); - device_command_sequence.add_dispatch_write_packed( - num_sub_cmds_in_cmd, - multicast_launch_msg_addr, - go_signal_sizeB, - multicast_go_signal_payload_sizeB, - multicast_go_signal_sub_cmds, - multicast_go_signal_data, - packed_write_max_unicast_sub_cmds, - curr_sub_cmd_idx); - curr_sub_cmd_idx += num_sub_cmds_in_cmd; - program_command_sequence.launch_msg_write_packed_cmd_ptrs.push_back( - &((CQDispatchCmd*)((uint32_t*)device_command_sequence.data() + - (write_offset_bytes + sizeof(CQPrefetchCmd)) / sizeof(uint32_t))) - ->write_packed); - uint32_t curr_sub_cmd_data_offset_words = - (write_offset_bytes + (sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd)) + - align(num_sub_cmds_in_cmd * 
sizeof(CQDispatchWritePackedMulticastSubCmd), l1_alignment)) / - sizeof(uint32_t); - for (uint32_t i = 0; i < num_sub_cmds_in_cmd; ++i) { - program_command_sequence.go_signals.push_back( - (launch_msg_t*)((uint32_t*)device_command_sequence.data() + curr_sub_cmd_data_offset_words)); - curr_sub_cmd_data_offset_words += go_signal_size_words; - } - } - } - - if (unicast_go_signal_sub_cmds.size() > 0) { - // Launch Message address is resolved when the program is enqueued - uint32_t unicast_launch_msg_addr = 0; - uint32_t curr_sub_cmd_idx = 0; - for (const auto& [num_sub_cmds_in_cmd, unicast_go_signal_payload_sizeB] : unicast_go_signals_payload) { - uint32_t write_offset_bytes = device_command_sequence.write_offset_bytes(); - device_command_sequence.add_dispatch_write_packed( - num_sub_cmds_in_cmd, - unicast_launch_msg_addr, - go_signal_sizeB, - unicast_go_signal_payload_sizeB, - unicast_go_signal_sub_cmds, - unicast_go_signal_data, - packed_write_max_unicast_sub_cmds, - curr_sub_cmd_idx); - curr_sub_cmd_idx += num_sub_cmds_in_cmd; - program_command_sequence.unicast_launch_msg_write_packed_cmd_ptrs.push_back( - &((CQDispatchCmd*)((uint32_t*)device_command_sequence.data() + - (write_offset_bytes + sizeof(CQPrefetchCmd)) / sizeof(uint32_t))) - ->write_packed); - uint32_t curr_sub_cmd_data_offset_words = - (write_offset_bytes + (sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd)) + - align(num_sub_cmds_in_cmd * sizeof(CQDispatchWritePackedUnicastSubCmd), l1_alignment)) / - sizeof(uint32_t); - for (uint32_t i = 0; i < num_sub_cmds_in_cmd; ++i) { - program_command_sequence.go_signals.push_back( - (launch_msg_t*)((uint32_t*)device_command_sequence.data() + curr_sub_cmd_data_offset_words)); - curr_sub_cmd_data_offset_words += go_signal_size_words; - } - } - } - - DispatcherSelect dispatcher_for_go_signal = DispatcherSelect::DISPATCH_MASTER; - auto sub_device_index = sub_device_id.to_index(); - uint32_t dispatch_message_addr = - dispatch_constants::get(dispatch_core_type) - 
.get_device_command_queue_addr(CommandQueueDeviceAddrType::DISPATCH_MESSAGE) + - dispatch_constants::get(dispatch_core_type).get_dispatch_message_offset(sub_device_index); - if (device->dispatch_s_enabled()) { - // dispatch_d signals dispatch_s to send the go signal, use a barrier if there are cores active - uint16_t index_bitmask = 0; - index_bitmask |= 1 << sub_device_index; - device_command_sequence.add_notify_dispatch_s_go_signal_cmd( - program_transfer_info.num_active_cores > 0, index_bitmask); - dispatcher_for_go_signal = DispatcherSelect::DISPATCH_SLAVE; - } else { - // Wait Noc Write Barrier, wait for binaries/configs and launch_msg to be written to worker cores - if (program_transfer_info.num_active_cores > 0) { - device_command_sequence.add_dispatch_wait(true, dispatch_message_addr, 0, 0, false, false); - } - } - go_msg_t run_program_go_signal; - run_program_go_signal.signal = RUN_MSG_GO; - // Dispatch X/Y resolved when the program is enqueued - run_program_go_signal.master_x = 0; - run_program_go_signal.master_y = 0; - run_program_go_signal.dispatch_message_offset = - (uint8_t)dispatch_constants::get(dispatch_core_type).get_dispatch_message_offset(sub_device_index); - uint32_t write_offset_bytes = device_command_sequence.write_offset_bytes(); - // Num Workers Resolved when the program is enqueued - device_command_sequence.add_dispatch_go_signal_mcast( - 0, - *reinterpret_cast(&run_program_go_signal), - dispatch_message_addr, - num_noc_mcast_txns, - num_noc_unicast_txns, - noc_data_start_idx, - dispatcher_for_go_signal); - program_command_sequence.mcast_go_signal_cmd_ptr = - &((CQDispatchCmd*)((uint32_t*)device_command_sequence.data() + - (write_offset_bytes + sizeof(CQPrefetchCmd)) / sizeof(uint32_t))) - ->mcast; -} - -void reserve_space_in_kernel_config_buffer( - WorkerConfigBufferMgr& config_buffer_mgr, - const std::vector& program_config_sizes, - bool kernel_binary_always_stored_in_ringbuffer, - ProgramBinaryStatus program_binary_status, - uint32_t 
num_program_workers, - uint32_t expected_num_workers_completed, - ProgramDispatchMetadata& dispatch_md) { - // Reserve space in kernel config ring buffer for the current program - std::pair&> reservation = - config_buffer_mgr.reserve(program_config_sizes); - // Determine where a sync (dispatch wait on workers) must be inserted in the program sequence and the number - // of workers to wait on - dispatch_md.sync_count = 0; - dispatch_md.stall_first = reservation.first.need_sync; - dispatch_md.stall_before_program = false; - - if (!kernel_binary_always_stored_in_ringbuffer) { - // Program runs on cores without ring buffer. Sync on all - // workers before dispatching program data - dispatch_md.sync_count = expected_num_workers_completed; - dispatch_md.stall_before_program = !dispatch_md.stall_first; - } else if (reservation.first.need_sync) { - // TODO: attempt to send RTA only without stalling. - dispatch_md.sync_count = reservation.first.sync_count; - // Check if the launch message is the only thing preventing us from - // sending the program. If so, we can at least send the RTAs. Ideally we - // would also send the kernel binaries in this case, but the rest of the - // code isn't set up for that. - auto config_sizes = program_config_sizes; - config_sizes[config_sizes.size() - 1] = 0; - const std::pair&> memory_reservation = - config_buffer_mgr.reserve(config_sizes); - if (!memory_reservation.first.need_sync) { - dispatch_md.stall_first = false; - dispatch_md.stall_before_program = true; - } - reservation = config_buffer_mgr.reserve(program_config_sizes); - } - - if (program_binary_status == ProgramBinaryStatus::InFlight) { - // Program binary not commited to DRAM. Sync on all workers before dispatching kernel - // binaries for this program. This requires freeing the entire kernel config buffer. 
- config_buffer_mgr.free(expected_num_workers_completed); - } else { - if (dispatch_md.stall_first || dispatch_md.stall_before_program) { - config_buffer_mgr.free(dispatch_md.sync_count); - } - } - config_buffer_mgr.alloc(expected_num_workers_completed + num_program_workers); - - if (program_binary_status != ProgramBinaryStatus::Committed) { - // Insert a stall before writing any program configs when binaries are in flight - dispatch_md.stall_first = true; - dispatch_md.stall_before_program = false; - // Wait on all previous workers before writing kernel binaries to workers - dispatch_md.sync_count = expected_num_workers_completed; - } - - dispatch_md.kernel_config_addrs = reservation.second; -} - -void update_program_dispatch_commands( - Program& program, - ProgramCommandSequence& cached_program_command_sequence, - const tt::stl::Span kernel_config_addrs, - uint32_t multicast_cores_launch_message_wptr, - uint32_t unicast_cores_launch_message_wptr, - uint32_t expected_num_workers_completed, - CoreCoord dispatch_core, - CoreType dispatch_core_type, - SubDeviceId sub_device_id, - const ProgramDispatchMetadata& dispatch_md, - ProgramBinaryStatus program_binary_status, - int num_unicast_txns) { - uint32_t i = 0; - ZoneScopedN("program_loaded_on_device"); - - static constexpr uint32_t wait_count_offset = (sizeof(CQPrefetchCmd) + offsetof(CQDispatchCmd, wait.count)); - static constexpr uint32_t tensix_l1_write_offset_offset = - (sizeof(CQPrefetchCmd) + offsetof(CQDispatchCmd, set_write_offset.offset1)); - static constexpr uint32_t eth_l1_write_offset_offset = - (sizeof(CQPrefetchCmd) + offsetof(CQDispatchCmd, set_write_offset.offset2)); - // Update Stall Command Sequence - if (program_binary_status != ProgramBinaryStatus::Committed) { - // Program binary is in flight. Issue a Prefetch Stall - cached_program_command_sequence.current_stall_seq_idx = UncachedStallSequenceIdx; - } else { - // Program Binary is in DRAM. 
Prefetcher does not need to stall before reading - // binary - cached_program_command_sequence.current_stall_seq_idx = CachedStallSequenceIdx; - } - - auto& curr_stall_seq_idx = cached_program_command_sequence.current_stall_seq_idx; - cached_program_command_sequence.stall_command_sequences[curr_stall_seq_idx].update_cmd_sequence( - wait_count_offset, &(dispatch_md.sync_count), sizeof(uint32_t)); - - // Update preamble based on kernel config ring buffer slot - cached_program_command_sequence.preamble_command_sequence.update_cmd_sequence( - tensix_l1_write_offset_offset, - &kernel_config_addrs[hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX)], - sizeof(uint32_t)); - if (hal.get_programmable_core_type_count() >= 2) { - cached_program_command_sequence.preamble_command_sequence.update_cmd_sequence( - eth_l1_write_offset_offset, - &kernel_config_addrs[hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH)], - sizeof(uint32_t)); - } - - // Update CB Configs - uint32_t index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); - uint32_t remote_offset_index = program.get_program_config(index).local_cb_size / sizeof(uint32_t); - for (const auto& cbs_on_core_range : cached_program_command_sequence.circular_buffers_on_core_ranges) { - uint32_t* cb_config_payload = cached_program_command_sequence.cb_configs_payloads[i]; - for (const std::shared_ptr& cb : cbs_on_core_range) { - const uint32_t cb_address = cb->address(); - const uint32_t cb_size = cb->size(); - for (const auto& buffer_index : cb->local_buffer_indices()) { - // 1 cmd for all 32 buffer indices, populate with real data for specified indices - - // cb config payload - uint32_t base_index = UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * buffer_index; - cb_config_payload[base_index] = cb_address; - cb_config_payload[base_index + 1] = cb_size; - cb_config_payload[base_index + 2] = cb->num_pages(buffer_index); - cb_config_payload[base_index + 3] = 
cb->page_size(buffer_index); - } - for (const auto& buffer_index : cb->remote_buffer_indices()) { - const uint32_t base_index = remote_offset_index + (NUM_CIRCULAR_BUFFERS - 1 - buffer_index) * - UINT32_WORDS_PER_REMOTE_CIRCULAR_BUFFER_CONFIG; - cb_config_payload[base_index] = cb->config_address(); - cb_config_payload[base_index + 1] = cb->page_size(buffer_index); - } - } - i++; - } - // Update launch messages - for (auto& go_signal : cached_program_command_sequence.go_signals) { - for (uint32_t i = 0; i < kernel_config_addrs.size(); i++) { - go_signal->kernel_config.kernel_config_base[i] = kernel_config_addrs[i].addr; - } - go_signal->kernel_config.host_assigned_id = program.get_runtime_id(); - } - // Update launch message addresses to reflect new launch_msg slot in ring buffer - uint32_t multicast_cores_launch_msg_addr = - hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::LAUNCH) + - multicast_cores_launch_message_wptr * sizeof(launch_msg_t); - for (auto launch_msg_cmd_ptr : cached_program_command_sequence.launch_msg_write_packed_cmd_ptrs) { - launch_msg_cmd_ptr->addr = multicast_cores_launch_msg_addr; - } - if (cached_program_command_sequence.unicast_launch_msg_write_packed_cmd_ptrs.size()) { - uint32_t unicast_cores_launch_message_addr = - hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalL1MemAddrType::LAUNCH) + - unicast_cores_launch_message_wptr * sizeof(launch_msg_t); - for (auto launch_msg_cmd_ptr : cached_program_command_sequence.unicast_launch_msg_write_packed_cmd_ptrs) { - launch_msg_cmd_ptr->addr = unicast_cores_launch_message_addr; - } - } - // Update go signal to reflect potentially modified dispatch core and new wait count - go_msg_t run_program_go_signal; - run_program_go_signal.signal = RUN_MSG_GO; - run_program_go_signal.master_x = (uint8_t)dispatch_core.x; - run_program_go_signal.master_y = (uint8_t)dispatch_core.y; - run_program_go_signal.dispatch_message_offset = - 
(uint8_t)dispatch_constants::get(dispatch_core_type).get_dispatch_message_offset(sub_device_id.to_index()); - cached_program_command_sequence.mcast_go_signal_cmd_ptr->go_signal = - *reinterpret_cast(&run_program_go_signal); - cached_program_command_sequence.mcast_go_signal_cmd_ptr->wait_count = expected_num_workers_completed; - // Update the number of unicast txns based on user provided parameter - // This is required when a MeshWorkload users ethernet cores on a set of devices - // where the number of active eth cores is heterogenous across devices. - // Update the number of unicast txns to eth cores to match the minimum number of cores - // across devices (specified by user) - if (num_unicast_txns >= 0 && cached_program_command_sequence.mcast_go_signal_cmd_ptr->num_unicast_txns) { - cached_program_command_sequence.mcast_go_signal_cmd_ptr->num_unicast_txns = num_unicast_txns; - } -} - -KernelHandle get_device_local_kernel_handle(KernelHandle kernel_handle) { - // Device local Kernel Handle/Kernel Ids are 16 bit. The top 16 bits of - // the Kernel Handle may encode device coordinates when MeshWorkloads are - // being dispatched. - return kernel_handle & 0xffff; -} - -} // namespace program_utils - -} // namespace tt::tt_metal diff --git a/tt_metal/impl/program/program_dispatch_utils.hpp b/tt_metal/impl/program/program_dispatch_utils.hpp deleted file mode 100644 index 68cd099f3d2..00000000000 --- a/tt_metal/impl/program/program_dispatch_utils.hpp +++ /dev/null @@ -1,112 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "tt_metal/impl/buffers/circular_buffer.hpp" -#include "tt_metal/impl/device/device.hpp" -#include "tt_metal/impl/kernels/kernel.hpp" -#include "tt_metal/impl/program/program.hpp" -#include "tt_metal/impl/dispatch/worker_config_buffer.hpp" - -namespace tt { - -namespace tt_metal { - -namespace program_utils { -#define CQ_PREFETCH_CMD_BARE_MIN_SIZE tt::tt_metal::hal.get_alignment(tt::tt_metal::HalMemType::HOST) - -struct ProgramDispatchMetadata { - std::vector kernel_config_addrs; - uint32_t sync_count; - uint32_t stall_first; - uint32_t stall_before_program; -}; - -uint32_t configure_rta_offsets_for_kernel_groups( - uint32_t programmable_core_type_index, - std::unordered_map>& kernels, - std::vector>& kernel_groups, - uint32_t base_offset); - -uint32_t configure_crta_offsets_for_kernel_groups( - uint32_t programmable_core_type_index, - std::unordered_map>& kernels, - std::vector>& kernel_groups, - uint32_t crta_base_offset, - std::array& crta_offsets, - std::array& crta_sizes); - -uint32_t finalize_rt_args( - std::unordered_map>& kernels, - std::vector>& kernel_groups, - uint32_t base_offset, - uint32_t programmable_core_type_index, - uint32_t& rta_offset, - std::array& crta_offsets, - std::array& crta_sizes); - -uint32_t finalize_sems( - uint32_t programmable_core_type_index, - uint32_t sem_base_offset, - const std::vector& semaphores, - uint32_t& semaphore_offset, - uint32_t& semaphore_size); - -uint32_t finalize_cbs( - uint32_t programmable_core_type_index, - std::vector>& kernel_groups, - uint32_t base_offset, - uint32_t& cb_offset, - uint32_t& cb_size, - uint32_t& local_cb_size); - -uint32_t finalize_kernel_bins( - Device* device, - uint32_t programmable_core_type_index, - const std::unordered_map>& kernels, - std::vector>& kernel_groups, - uint32_t base_offset, - uint32_t& kernel_text_offset, - uint32_t& kernel_text_size); - -void 
insert_empty_program_dispatch_preamble_cmd(ProgramCommandSequence& program_command_sequence); - -void insert_stall_cmds(ProgramCommandSequence& program_command_sequence, SubDeviceId sub_device_id, Device* device); - -void assemble_runtime_args_commands(ProgramCommandSequence& program_command_sequence, Program& program, Device* device); - -void assemble_device_commands( - ProgramCommandSequence& program_command_sequence, Program& program, Device* device, SubDeviceId sub_device_id); - -void reserve_space_in_kernel_config_buffer( - WorkerConfigBufferMgr& config_buffer_mgr, - const std::vector& program_config_sizes, - bool kernel_binary_always_stored_in_ringbuffer, - ProgramBinaryStatus program_binary_status, - uint32_t num_program_workers, - uint32_t expected_num_workers_completed, - ProgramDispatchMetadata& dispatch_md); - -void update_program_dispatch_commands( - Program& program, - ProgramCommandSequence& cached_program_command_sequence, - const tt::stl::Span kernel_config_addrs, - uint32_t multicast_cores_launch_message_wptr, - uint32_t unicast_cores_launch_message_wptr, - uint32_t expected_num_workers_completed, - CoreCoord dispatch_core, - CoreType dispatch_core_type, - SubDeviceId sub_device_id, - const ProgramDispatchMetadata& dispatch_md, - ProgramBinaryStatus program_binary_status, - int num_unicast_txns = -1); - -KernelHandle get_device_local_kernel_handle(KernelHandle kernel_handle); - -} // namespace program_utils - -} // namespace tt_metal - -} // namespace tt diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index d6ed7beab3d..38d17d6631f 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -801,15 +801,15 @@ void WriteRuntimeArgsToDevice(Device* device, Program& program) { for (uint32_t index = 0; index < hal.get_programmable_core_type_count(); index++) { CoreType core_type = hal.get_core_type(index); uint32_t processor_classes = hal.get_processor_classes_count(index); - for (const auto& kg : program.get_kernel_groups(index)) { - 
uint32_t kernel_config_base = kg->launch_msg.kernel_config.kernel_config_base[index]; - for (const CoreRange& core_range : kg->core_ranges.ranges()) { + for (auto& kg : program.get_kernel_groups(index)) { + uint32_t kernel_config_base = kg.launch_msg.kernel_config.kernel_config_base[index]; + for (const CoreRange& core_range : kg.core_ranges.ranges()) { for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) { for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { CoreCoord logical_core(x, y); auto physical_core = device->virtual_core_from_logical_core(logical_core, core_type); for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { - auto& optional_id = kg->kernel_ids[dispatch_class]; + auto& optional_id = kg.kernel_ids[dispatch_class]; if (optional_id) { const auto& kernel = detail::GetKernel(program, optional_id.value()); const auto& rt_args = kernel->runtime_args(logical_core); @@ -817,7 +817,7 @@ void WriteRuntimeArgsToDevice(Device* device, Program& program) { if (rt_args.size() > 0) { auto rt_args_addr = kernel_config_base + - kg->launch_msg.kernel_config.rta_offset[dispatch_class].rta_offset; + kg.launch_msg.kernel_config.rta_offset[dispatch_class].rta_offset; log_trace( tt::LogMetal, "{} - Writing {} unique rtargs to core {} (physical: {}) addr 0x{:x} => args: " @@ -835,7 +835,7 @@ void WriteRuntimeArgsToDevice(Device* device, Program& program) { if (common_rt_args.size() > 0) { auto common_rt_args_addr = kernel_config_base + - kg->launch_msg.kernel_config.rta_offset[dispatch_class].crta_offset; + kg.launch_msg.kernel_config.rta_offset[dispatch_class].crta_offset; log_trace( tt::LogMetal, "{} - Writing {} common rtargs to core {} (physical: {}) addr 0x{:x} => args: "