From 25a35e333ace1772d2c0ab1854162df78e9d66d3 Mon Sep 17 00:00:00 2001
From: Aditya Saigal <129097327+tt-asaigal@users.noreply.github.com>
Date: Mon, 28 Oct 2024 19:12:01 -0400
Subject: [PATCH] #0: Move SynchronizeWorkerThreads to tt_metal::detail
 namespace (#14322)

- This is a core tt_metal API and shouldn't be in tensor_ops
- Also account for worker thread deadlock when calling this API
---
 tt_metal/detail/tt_metal.hpp        |  2 ++
 tt_metal/tt_metal.cpp               | 17 +++++++++++++++++
 ttnn/cpp/ttnn/tensor/tensor_ops.cpp | 16 +---------------
 3 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/tt_metal/detail/tt_metal.hpp b/tt_metal/detail/tt_metal.hpp
index e5464e721a6..a4e84ef12b4 100644
--- a/tt_metal/detail/tt_metal.hpp
+++ b/tt_metal/detail/tt_metal.hpp
@@ -279,5 +279,7 @@ inline namespace v0 {
     DeviceAddr AllocateBuffer(Buffer* buffer);
 
     void DeallocateBuffer(Buffer *buffer);
+
+    void SynchronizeWorkerThreads(const std::vector<Device*>& workers);
 } // namespace detail
 } // namespace tt::tt_metal
diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp
index 001cec165e1..5b56a11e844 100644
--- a/tt_metal/tt_metal.cpp
+++ b/tt_metal/tt_metal.cpp
@@ -830,6 +830,23 @@ void DeallocateBuffer(Buffer *buffer) {
     allocator::deallocate_buffer(*buffer->device()->allocator_, buffer);
 }
 
+void SynchronizeWorkerThreads(const std::vector<Device*>& workers) {
+    if (tt::tt_metal::detail::InWorkerThread()) {
+        // Early exit if in a worker thread, since waiting for the worker
+        // queue to become empty inside a worker thread leads to a deadlock.
+        // Synchronizing in a worker thread should be a nop by definition.
+        return;
+    }
+    // Push empty work to threads and ensure it's been picked up
+    for (auto target_device : workers) {
+        target_device->work_executor.push_work([](){});
+    }
+    // Block until work has been picked up, to flush the queue
+    for (auto target_device : workers) {
+        while(not target_device->work_executor.worker_queue.empty());
+    }
+}
+
 } // namespace detail
 
 inline namespace v0 {
diff --git a/ttnn/cpp/ttnn/tensor/tensor_ops.cpp b/ttnn/cpp/ttnn/tensor/tensor_ops.cpp
index c133a1aff71..c2460932205 100644
--- a/ttnn/cpp/ttnn/tensor/tensor_ops.cpp
+++ b/ttnn/cpp/ttnn/tensor/tensor_ops.cpp
@@ -23,20 +23,6 @@
 
 #include "ttnn/core.hpp"
 
-namespace{
-    inline void SynchronizeWorkerThreads(const std::vector<Device*>& workers) {
-        // Push empty work to threads and ensure it's been picked up
-        for (auto target_device : workers) {
-            target_device->work_executor.push_work([](){});
-        }
-        // Block until work has been picked up, to flush the queue
-        for (auto target_device : workers) {
-            while(not target_device->work_executor.worker_queue.empty());
-        }
-    }
-}
-
-
 namespace tt::tt_metal::tensor_ops {
 
 Tensor tensor_to(const Tensor& input_tensor, Device* target_device, const MemoryConfig& mem_config) {
@@ -147,7 +133,7 @@ Tensor tensor_cpu(const Tensor& input_tensor, bool blocking, uint8_t cq_id) {
     }
 
     if (blocking) {
-        SynchronizeWorkerThreads(workers);
+        tt::tt_metal::detail::SynchronizeWorkerThreads(workers);
     }
     // Update main_thread_ref_count for tensor after pushing to queue.
     input_tensor.tensor_attributes->update_main_thread_ref_count(workers.at(0), original_tensor_ref_count);
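
Usage note (not part of the patch): a minimal caller sketch for the relocated API is shown below. It assumes the tt_metal/host_api.hpp header and the CreateDevice/CloseDevice entry points of the surrounding tt_metal host API; only SynchronizeWorkerThreads itself comes from the diff above, and the single-device setup is purely illustrative.

    // Hypothetical caller sketch: drain outstanding worker-thread work from
    // the main thread before reading results on the host.
    #include <vector>

    #include "tt_metal/detail/tt_metal.hpp"
    #include "tt_metal/host_api.hpp"

    int main() {
        // Open one device; a multi-device caller would collect every Device*
        // whose worker queue should be flushed.
        tt::tt_metal::Device* device = tt::tt_metal::CreateDevice(/*device_id=*/0);
        std::vector<tt::tt_metal::Device*> workers = {device};

        // ... enqueue work that is dispatched through the worker threads ...

        // Blocks until each worker queue is empty. With the InWorkerThread()
        // early exit added by this patch, calling it from inside a worker
        // thread is a no-op instead of a deadlock.
        tt::tt_metal::detail::SynchronizeWorkerThreads(workers);

        tt::tt_metal::CloseDevice(device);
        return 0;
    }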