Skip to content

Commit

Permalink
#0: Update global semaphores and global circular buffers metal APIs to be thread-safe instead of depending on ttnn APIs
Browse files Browse the repository at this point in the history
  • Loading branch information
tt-aho committed Dec 12, 2024
1 parent 9f83120 commit 8926ed0
Show file tree
Hide file tree
Showing 9 changed files with 173 additions and 121 deletions.
5 changes: 3 additions & 2 deletions tests/tt_metal/tt_metal/api/test_global_semaphores.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ TEST_F(DispatchFixture, ResetGlobalSemaphores) {
for (auto device : devices_) {
{
uint32_t initial_value = 1;
uint32_t reset_value = 2;
std::vector<uint32_t> overwrite_value = {2};
auto global_semaphore = tt::tt_metal::CreateGlobalSemaphore(device, cores, initial_value);
auto address = global_semaphore->address();
Expand All @@ -104,14 +105,14 @@ TEST_F(DispatchFixture, ResetGlobalSemaphores) {

EXPECT_EQ(sem_vals[0], overwrite_value[0]);
}
global_semaphore->reset_semaphore_value();
global_semaphore->reset_semaphore_value(reset_value);
Synchronize(device);
for (const auto& core : cores_vec) {
auto sem_vals = tt::llrt::read_hex_vec_from_core(
device->id(), device->worker_core_from_logical_core(core), address, sizeof(uint32_t));
tt::llrt::write_hex_vec_to_core(
device->id(), device->worker_core_from_logical_core(core), overwrite_value, address);
EXPECT_EQ(sem_vals[0], initial_value);
EXPECT_EQ(sem_vals[0], reset_value);
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion tests/ttnn/unit_tests/test_global_semaphore.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def run_global_semaphore(device):

assert ttnn.get_global_semaphore_address(global_sem0) != ttnn.get_global_semaphore_address(global_sem1)

ttnn.reset_global_semaphore_value(global_sem0)
ttnn.reset_global_semaphore_value(global_sem0, 3)


@pytest.mark.parametrize("enable_async_mode", (False, True), indirect=True)
Expand Down
112 changes: 60 additions & 52 deletions tt_metal/impl/buffers/global_circular_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ GlobalCircularBuffer::GlobalCircularBuffer(
const std::unordered_map<CoreCoord, CoreRangeSet>& sender_receiver_core_mapping,
uint32_t size,
BufferType buffer_type,
tt::stl::Span<const SubDeviceId> sub_device_ids) :
tt::stl::Span<const SubDeviceId> sub_device_ids,
Private) :
device_(device), sender_receiver_core_mapping_(sender_receiver_core_mapping), size_(size) {
TT_FATAL(this->device_ != nullptr, "Device cannot be null");
uint32_t num_sender_cores = sender_receiver_core_mapping.size();
Expand Down Expand Up @@ -86,58 +87,65 @@ void GlobalCircularBuffer::setup_cb_buffers(
shard_parameters,
std::nullopt);

const auto& core_to_core_id = this->cb_config_buffer_->get_buffer_page_mapping()->core_to_core_id_;

std::vector<uint32_t> cb_config_host_buffer(cb_config_size / sizeof(uint32_t), 0);
uint32_t buffer_address = this->cb_buffer_->address();
uint32_t noc_xy_address = this->cb_config_buffer_->address() + num_config_elements * sizeof(uint32_t);
uint32_t pages_sent_address = align(noc_xy_address + num_noc_xy_words * sizeof(uint32_t), l1_alignment);

for (const auto& [sender_core, receiver_cores] : this->sender_receiver_core_mapping_) {
const auto& receiver_cores_vec = corerange_to_cores(receiver_cores);
uint32_t sender_idx = core_to_core_id.at(sender_core) * cb_config_page_size / sizeof(uint32_t);
uint32_t num_receivers = receiver_cores.num_cores();
uint32_t pages_acked_address = pages_sent_address + num_receivers * l1_alignment;
cb_config_host_buffer[sender_idx++] = 1;
cb_config_host_buffer[sender_idx++] = receiver_cores.num_cores();
cb_config_host_buffer[sender_idx++] = buffer_address;
cb_config_host_buffer[sender_idx++] = this->size_;
cb_config_host_buffer[sender_idx++] = buffer_address;
cb_config_host_buffer[sender_idx++] = noc_xy_address;
cb_config_host_buffer[sender_idx++] = pages_sent_address;

auto sender_physical_coord = this->device_->worker_core_from_logical_core(sender_core);
for (uint32_t i = 0; i < receiver_cores_vec.size(); i++) {
auto receiver_physical_coord = this->device_->worker_core_from_logical_core(receiver_cores_vec[i]);
cb_config_host_buffer[sender_idx++] = receiver_physical_coord.x;
cb_config_host_buffer[sender_idx++] = receiver_physical_coord.y;

uint32_t receiver_idx = core_to_core_id.at(receiver_cores_vec[i]) * cb_config_page_size / sizeof(uint32_t);
cb_config_host_buffer[receiver_idx++] = 0;
cb_config_host_buffer[receiver_idx++] = num_receivers;
cb_config_host_buffer[receiver_idx++] = buffer_address;
cb_config_host_buffer[receiver_idx++] = this->size_;
cb_config_host_buffer[receiver_idx++] = buffer_address;
cb_config_host_buffer[receiver_idx++] = noc_xy_address;
cb_config_host_buffer[receiver_idx++] = pages_sent_address + 2 * i * l1_alignment;
cb_config_host_buffer[receiver_idx++] = sender_physical_coord.x;
cb_config_host_buffer[receiver_idx++] = sender_physical_coord.y;
}
}

// Write the config buffer to the device
// Only block for the slow dispatch case
if (this->device_->using_slow_dispatch()) {
detail::WriteToBuffer(*this->cb_config_buffer_, cb_config_host_buffer);
tt::Cluster::instance().l1_barrier(this->device_->id());
} else {
EnqueueWriteBuffer(
this->device_->command_queue(),
this->cb_config_buffer_,
cb_config_host_buffer.data(),
false,
sub_device_ids);
}
auto* device = this->device_;
device->push_work([device,
cb_config_size,
cb_config_page_size,
num_noc_xy_words,
l1_alignment,
buffer_address = this->cb_buffer_->address(),
cb_config_buffer = this->cb_config_buffer_,
size = this->size_,
sender_receiver_core_mapping = this->sender_receiver_core_mapping_,
sub_device_ids = std::vector<SubDeviceId>(sub_device_ids.begin(), sub_device_ids.end())] {
auto config_buffer_address = cb_config_buffer->address();
const auto& core_to_core_id = cb_config_buffer->get_buffer_page_mapping()->core_to_core_id_;
std::vector<uint32_t> cb_config_host_buffer(cb_config_size / sizeof(uint32_t), 0);
uint32_t noc_xy_address = config_buffer_address + num_config_elements * sizeof(uint32_t);
uint32_t pages_sent_address = align(noc_xy_address + num_noc_xy_words * sizeof(uint32_t), l1_alignment);

for (const auto& [sender_core, receiver_cores] : sender_receiver_core_mapping) {
const auto& receiver_cores_vec = corerange_to_cores(receiver_cores);
uint32_t sender_idx = core_to_core_id.at(sender_core) * cb_config_page_size / sizeof(uint32_t);
uint32_t num_receivers = receiver_cores.num_cores();
uint32_t pages_acked_address = pages_sent_address + num_receivers * l1_alignment;
cb_config_host_buffer[sender_idx++] = 1;
cb_config_host_buffer[sender_idx++] = receiver_cores.num_cores();
cb_config_host_buffer[sender_idx++] = buffer_address;
cb_config_host_buffer[sender_idx++] = size;
cb_config_host_buffer[sender_idx++] = buffer_address;
cb_config_host_buffer[sender_idx++] = noc_xy_address;
cb_config_host_buffer[sender_idx++] = pages_sent_address;

auto sender_physical_coord = device->worker_core_from_logical_core(sender_core);
for (uint32_t i = 0; i < receiver_cores_vec.size(); i++) {
auto receiver_physical_coord = device->worker_core_from_logical_core(receiver_cores_vec[i]);
cb_config_host_buffer[sender_idx++] = receiver_physical_coord.x;
cb_config_host_buffer[sender_idx++] = receiver_physical_coord.y;

uint32_t receiver_idx =
core_to_core_id.at(receiver_cores_vec[i]) * cb_config_page_size / sizeof(uint32_t);
cb_config_host_buffer[receiver_idx++] = 0;
cb_config_host_buffer[receiver_idx++] = num_receivers;
cb_config_host_buffer[receiver_idx++] = buffer_address;
cb_config_host_buffer[receiver_idx++] = size;
cb_config_host_buffer[receiver_idx++] = buffer_address;
cb_config_host_buffer[receiver_idx++] = noc_xy_address;
cb_config_host_buffer[receiver_idx++] = pages_sent_address + 2 * i * l1_alignment;
cb_config_host_buffer[receiver_idx++] = sender_physical_coord.x;
cb_config_host_buffer[receiver_idx++] = sender_physical_coord.y;
}
}
if (device->using_slow_dispatch()) {
detail::WriteToBuffer(*cb_config_buffer, cb_config_host_buffer);
tt::Cluster::instance().l1_barrier(device->id());
} else {
EnqueueWriteBuffer(
device->command_queue(), cb_config_buffer, cb_config_host_buffer.data(), false, sub_device_ids);
}
});
}

std::shared_ptr<GlobalCircularBuffer> GlobalCircularBuffer::create(
Expand All @@ -147,7 +155,7 @@ std::shared_ptr<GlobalCircularBuffer> GlobalCircularBuffer::create(
BufferType buffer_type,
tt::stl::Span<const SubDeviceId> sub_device_ids) {
return std::make_shared<GlobalCircularBuffer>(
device, sender_receiver_core_mapping, size, buffer_type, sub_device_ids);
device, sender_receiver_core_mapping, size, buffer_type, sub_device_ids, Private());
}

// Read-only accessor for the device buffer backing this global circular buffer.
const Buffer& GlobalCircularBuffer::cb_buffer() const { return *cb_buffer_; }
Expand Down
33 changes: 20 additions & 13 deletions tt_metal/impl/buffers/global_circular_buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,27 +26,24 @@ namespace v1 {
namespace experimental {

class GlobalCircularBuffer {
public:
GlobalCircularBuffer(
Device* device,
const std::unordered_map<CoreCoord, CoreRangeSet>& sender_receiver_core_mapping,
uint32_t size,
BufferType buffer_type,
tt::stl::Span<const SubDeviceId> sub_device_ids);

GlobalCircularBuffer(const GlobalCircularBuffer&) = default;
GlobalCircularBuffer& operator=(const GlobalCircularBuffer&) = default;

GlobalCircularBuffer(GlobalCircularBuffer&&) noexcept = default;
GlobalCircularBuffer& operator=(GlobalCircularBuffer&&) noexcept = default;
struct Private {
explicit Private() = default;
};

public:
static std::shared_ptr<GlobalCircularBuffer> create(
Device* device,
const std::unordered_map<CoreCoord, CoreRangeSet>& sender_receiver_core_mapping,
uint32_t size,
BufferType buffer_type = BufferType::L1,
tt::stl::Span<const SubDeviceId> sub_device_ids = {});

GlobalCircularBuffer(const GlobalCircularBuffer&) = delete;
GlobalCircularBuffer& operator=(const GlobalCircularBuffer&) = delete;

GlobalCircularBuffer(GlobalCircularBuffer&&) noexcept = delete;
GlobalCircularBuffer& operator=(GlobalCircularBuffer&&) noexcept = delete;

const Buffer& cb_buffer() const;

const CoreRangeSet& sender_cores() const;
Expand All @@ -59,6 +56,16 @@ class GlobalCircularBuffer {
static constexpr auto attribute_names = std::forward_as_tuple("sender_receiver_core_mapping", "size");
const auto attribute_values() const { return std::make_tuple(this->sender_receiver_core_mapping_, this->size_); }

// "Private" constructor to prevent direct instantiation
// Use GlobalCircularBuffer::create instead
GlobalCircularBuffer(
Device* device,
const std::unordered_map<CoreCoord, CoreRangeSet>& sender_receiver_core_mapping,
uint32_t size,
BufferType buffer_type,
tt::stl::Span<const SubDeviceId> sub_device_ids,
Private);

private:
void setup_cb_buffers(
BufferType buffer_type, uint32_t max_num_receivers_per_sender, tt::stl::Span<const SubDeviceId> sub_device_ids);
Expand Down
48 changes: 29 additions & 19 deletions tt_metal/impl/buffers/global_semaphore.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,22 +24,25 @@ GlobalSemaphore::GlobalSemaphore(
const CoreRangeSet& cores,
uint32_t initial_value,
BufferType buffer_type,
tt::stl::Span<const SubDeviceId> sub_device_ids) :
device_(device), cores_(cores), initial_value_(initial_value) {
this->setup_buffer(buffer_type, sub_device_ids);
tt::stl::Span<const SubDeviceId> sub_device_ids,
Private) :
device_(device), cores_(cores) {
this->setup_buffer(initial_value, buffer_type, sub_device_ids);
}

GlobalSemaphore::GlobalSemaphore(
Device* device,
CoreRangeSet&& cores,
uint32_t initial_value,
BufferType buffer_type,
tt::stl::Span<const SubDeviceId> sub_device_ids) :
device_(device), cores_(std::move(cores)), initial_value_(initial_value) {
this->setup_buffer(buffer_type, sub_device_ids);
tt::stl::Span<const SubDeviceId> sub_device_ids,
Private) :
device_(device), cores_(std::move(cores)) {
this->setup_buffer(initial_value, buffer_type, sub_device_ids);
}

void GlobalSemaphore::setup_buffer(BufferType buffer_type, tt::stl::Span<const SubDeviceId> sub_device_ids) {
void GlobalSemaphore::setup_buffer(
uint32_t initial_value, BufferType buffer_type, tt::stl::Span<const SubDeviceId> sub_device_ids) {
TT_FATAL(
buffer_type == BufferType::L1 or buffer_type == BufferType::L1_SMALL,
"Global semaphore can only be created for L1 buffer types");
Expand All @@ -58,8 +61,7 @@ void GlobalSemaphore::setup_buffer(BufferType buffer_type, tt::stl::Span<const S
shard_parameters,
std::nullopt);

this->host_buffer_ = std::vector<uint32_t>(num_cores, this->initial_value_);
this->reset_semaphore_value(sub_device_ids);
this->reset_semaphore_value(initial_value, sub_device_ids);
}

std::shared_ptr<GlobalSemaphore> GlobalSemaphore::create(
Expand All @@ -68,31 +70,39 @@ std::shared_ptr<GlobalSemaphore> GlobalSemaphore::create(
uint32_t initial_value,
BufferType buffer_type,
tt::stl::Span<const SubDeviceId> sub_device_ids) {
return std::make_shared<GlobalSemaphore>(device, cores, initial_value, buffer_type, sub_device_ids);
return std::make_shared<GlobalSemaphore>(device, cores, initial_value, buffer_type, sub_device_ids, Private());
}
std::shared_ptr<GlobalSemaphore> GlobalSemaphore::create(
Device* device,
CoreRangeSet&& cores,
uint32_t initial_value,
BufferType buffer_type,
tt::stl::Span<const SubDeviceId> sub_device_ids) {
return std::make_shared<GlobalSemaphore>(device, std::move(cores), initial_value, buffer_type, sub_device_ids);
return std::make_shared<GlobalSemaphore>(
device, std::move(cores), initial_value, buffer_type, sub_device_ids, Private());
}

// Returns the device this global semaphore was created on.
Device* GlobalSemaphore::device() const { return this->device_; }

// Device address of the semaphore's backing buffer.
DeviceAddr GlobalSemaphore::address() const { return this->buffer_->address(); }

void GlobalSemaphore::reset_semaphore_value(tt::stl::Span<const SubDeviceId> sub_device_ids) {
void GlobalSemaphore::reset_semaphore_value(uint32_t reset_value, tt::stl::Span<const SubDeviceId> sub_device_ids) {
// Write the reset value to the semaphore on the device
// Only block for the slow dispatch case
if (this->device_->using_slow_dispatch()) {
detail::WriteToBuffer(*this->buffer_, this->host_buffer_);
tt::Cluster::instance().l1_barrier(this->device_->id());
} else {
EnqueueWriteBuffer(
this->device_->command_queue(), this->buffer_, this->host_buffer_.data(), false, sub_device_ids);
}
auto* device = this->device_;
device->push_work([device,
reset_value,
sub_device_ids = std::vector<SubDeviceId>(sub_device_ids.begin(), sub_device_ids.end()),
num_cores = this->cores_.num_cores(),
buffer = this->buffer_] {
std::vector<uint32_t> host_buffer(num_cores, reset_value);
if (device->using_slow_dispatch()) {
detail::WriteToBuffer(*buffer, host_buffer);
tt::Cluster::instance().l1_barrier(device->id());
} else {
EnqueueWriteBuffer(device->command_queue(), buffer, host_buffer, false, sub_device_ids);
}
});
}

} // namespace tt::tt_metal
Expand Down
Loading

0 comments on commit 8926ed0

Please sign in to comment.