From 5058b8f40d005dfd9f037f49b648b91846adb417 Mon Sep 17 00:00:00 2001 From: John Bauman Date: Thu, 19 Dec 2024 17:04:31 +0000 Subject: [PATCH] #15605: Only force-stall ethernet programs on earlier ethernet programs Keep track of when the last program using active ethernet cores was dispatched, so we can wait on that program before sending out binaries. This is better than always waiting on the immediately preceding program, since in most cases we don't run programs on the ethernet cores back-to-back. --- tt_metal/impl/dispatch/command_queue.cpp | 14 +++++++------- tt_metal/impl/program/program.cpp | 7 ++++--- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index 9d6db6cdbf4..4d5dec0ed96 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -1568,11 +1568,7 @@ void EnqueueProgramCommand::process() { uint32_t sync_count = 0; bool stall_first = reservation.first.need_sync; bool stall_before_program = false; - if (!program.kernel_binary_always_stored_in_ringbuffer()) { - // Wait for all existing commands to run before writing out the kernel binary. - sync_count = this->expected_num_workers_completed; - stall_before_program = !stall_first; - } else if (reservation.first.need_sync) { + if (reservation.first.need_sync) { // TODO: attempt to send RTA only without stalling. sync_count = reservation.first.sync_count; // Check if the launch message is the only thing preventing us from @@ -1580,6 +1576,7 @@ void EnqueueProgramCommand::process() { // would also send the kernel binaries in this case, but the rest of the // code isn't set up for that. 
auto config_sizes = program.get_program_config_sizes(); + config_sizes[config_sizes.size() - 2] = 0; config_sizes[config_sizes.size() - 1] = 0; const std::pair&> memory_reservation = this->config_buffer_mgr.reserve(config_sizes); @@ -1622,9 +1619,9 @@ void EnqueueProgramCommand::process() { this->config_buffer_mgr.alloc(this->expected_num_workers_completed + num_workers); std::vector& kernel_config_addrs_raw = reservation.second; - // Remove launch buffer from config addrs, since it's not a real core. + // Remove launch buffers from config addrs, since they're not real cores. const tt::stl::Span kernel_config_addrs{ - kernel_config_addrs_raw.data(), kernel_config_addrs_raw.size() - 1}; + kernel_config_addrs_raw.data(), kernel_config_addrs_raw.size() - 2}; RecordProgramRun(program); @@ -3077,6 +3074,9 @@ void HWCommandQueue::reset_config_buffer_mgr(const uint32_t num_entries) { // Subtract 1 from the number of entries, so the watcher can read information (e.g. fired asserts) from the // previous launch message. this->config_buffer_mgr[i].init_add_buffer(0, launch_msg_buffer_num_entries - 1); + + // There's no ring buffer for active ethernet binaries, so keep track of them separately. + this->config_buffer_mgr[i].init_add_buffer(0, 1); } } diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index d3cf81833f1..6ff0a5c2a64 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -337,9 +337,7 @@ detail::Program_::Program_() : } program_configs_.resize(programmable_core_count); - program_config_sizes_.resize(programmable_core_count + 1); - // Always need one launch buffer msg for a program. 
- program_config_sizes_[programmable_core_count] = 1; + program_config_sizes_.resize(programmable_core_count + 2); } Program::Program() : pimpl_(std::make_unique()) {} @@ -1504,6 +1502,9 @@ void detail::Program_::finalize(Device *device) { offset, max_size, magic_enum::enum_name(programmable_core_type)); } + this->get_program_config_size(hal.get_programmable_core_type_count()) = runs_on_noc_multicast_only_cores(); + this->get_program_config_size(hal.get_programmable_core_type_count() + 1) = runs_on_noc_unicast_only_cores(); + // The sem offsets cross programmable_core_types so must be set after the loop above this->set_launch_msg_sem_offsets();