#15605: Only force-stall ethernet programs on earlier ethernet programs

Keep track of when the last program using active ethernet cores was dispatched, so we can wait on that program before sending out binaries. This is better than always waiting on the immediately preceding program, since in most cases we don't run programs on the ethernet cores back-to-back.
tenstorrent · Dec 20, 2024 · 5058b8f · 5058b8f
1 parent b4d8b4c
commit 5058b8f
Show file tree

Hide file tree

Showing 2 changed files with 11 additions and 10 deletions.
diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp
@@ -1568,18 +1568,15 @@ void EnqueueProgramCommand::process() {
     uint32_t sync_count = 0;
     bool stall_first = reservation.first.need_sync;
     bool stall_before_program = false;
-    if (!program.kernel_binary_always_stored_in_ringbuffer()) {
-        // Wait for all existing commands to run before writing out the kernel binary.
-        sync_count = this->expected_num_workers_completed;
-        stall_before_program = !stall_first;
-    } else if (reservation.first.need_sync) {
+    if (reservation.first.need_sync) {
         // TODO: attempt to send RTA only without stalling.
         sync_count = reservation.first.sync_count;
         // Check if the launch message is the only thing preventing us from
         // sending the program. If so, we can at least send the RTAs. Ideally we
         // would also send the kernel binaries in this case, but the rest of the
         // code isn't set up for that.
         auto config_sizes = program.get_program_config_sizes();
+        config_sizes[config_sizes.size() - 2] = 0;
         config_sizes[config_sizes.size() - 1] = 0;
         const std::pair<ConfigBufferSync, std::vector<ConfigBufferEntry>&> memory_reservation =
             this->config_buffer_mgr.reserve(config_sizes);
@@ -1622,9 +1619,9 @@ void EnqueueProgramCommand::process() {
     this->config_buffer_mgr.alloc(this->expected_num_workers_completed + num_workers);
     std::vector<ConfigBufferEntry>& kernel_config_addrs_raw = reservation.second;
 
-    // Remove launch buffer from config addrs, since it's not a real core.
+    // Remove launch buffers from config addrs, since they're not real cores.
     const tt::stl::Span<ConfigBufferEntry> kernel_config_addrs{
-        kernel_config_addrs_raw.data(), kernel_config_addrs_raw.size() - 1};
+        kernel_config_addrs_raw.data(), kernel_config_addrs_raw.size() - 2};
 
     RecordProgramRun(program);
 
@@ -3077,6 +3074,9 @@ void HWCommandQueue::reset_config_buffer_mgr(const uint32_t num_entries) {
         // Subtract 1 from the number of entries, so the watcher can read information (e.g. fired asserts) from the
         // previous launch message.
         this->config_buffer_mgr[i].init_add_buffer(0, launch_msg_buffer_num_entries - 1);
+
+        // There's no ring buffer for active ethernet binaries, so keep track of them separately.
+        this->config_buffer_mgr[i].init_add_buffer(0, 1);
     }
 }
 

diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp
@@ -337,9 +337,7 @@ detail::Program_::Program_() :
     }
 
     program_configs_.resize(programmable_core_count);
-    program_config_sizes_.resize(programmable_core_count + 1);
-    // Always need one launch buffer msg for a program.
-    program_config_sizes_[programmable_core_count] = 1;
+    program_config_sizes_.resize(programmable_core_count + 2);
 }
 
 Program::Program() : pimpl_(std::make_unique<detail::Program_>()) {}
@@ -1504,6 +1502,9 @@ void detail::Program_::finalize(Device *device) {
                  offset, max_size, magic_enum::enum_name(programmable_core_type));
     }
 
+    this->get_program_config_size(hal.get_programmable_core_type_count()) = runs_on_noc_multicast_only_cores();
+    this->get_program_config_size(hal.get_programmable_core_type_count() + 1) = runs_on_noc_unicast_only_cores();
+
     // The sem offsets cross programmable_core_types so must be set after the loop above
     this->set_launch_msg_sem_offsets();