diff --git a/tt_metal/impl/buffers/buffer.cpp b/tt_metal/impl/buffers/buffer.cpp index 172bc369b81d..879079b247d2 100644 --- a/tt_metal/impl/buffers/buffer.cpp +++ b/tt_metal/impl/buffers/buffer.cpp @@ -154,7 +154,7 @@ Buffer::Buffer( TT_FATAL(this->device_ != nullptr and this->device_->allocator_ != nullptr, "Device and allocator need to not be null."); if (size == 0) { - is_allocated_ = true; + allocation_status_.store(AllocationStatus::ALLOCATED, std::memory_order::relaxed); return; } @@ -228,8 +228,16 @@ BufferPageMapping generate_buffer_page_mapping(const Buffer& buffer) { } void Buffer::allocate() { + { + std::unique_lock lock(allocation_mutex_); + TT_FATAL(allocation_status_.load(std::memory_order::relaxed) == AllocationStatus::NOT_ALLOCATED, "Can't allocate buffer after it was already allocated"); + allocation_status_.store(AllocationStatus::ALLOCATION_REQUESTED, std::memory_order::relaxed); + } + device_->push_work([self = weak_self.lock()] { - if (self->is_allocated_) { + std::unique_lock lock(self->allocation_mutex_); + if (self->allocation_status_.load(std::memory_order::relaxed) != AllocationStatus::ALLOCATION_REQUESTED) { + // The allocation was interrupted by a deallocation return; } @@ -237,40 +245,86 @@ void Buffer::allocate() { detail::AllocateBuffer(self.get(), bottom_up); detail::BUFFER_MAP.insert({self->device_->id(), self->address_}, self.get()); - self->is_allocated_ = true; + self->allocation_status_.store(AllocationStatus::ALLOCATED, std::memory_order::relaxed); + lock.unlock(); + self->allocation_cv_.notify_all(); }); } void Buffer::deallocate() { + if (size_ == 0) { + // 0-size buffer, no need to deallocate + return; + } + + { + std::unique_lock lock(allocation_mutex_); + auto status = allocation_status_.load(std::memory_order::relaxed); + if (status != AllocationStatus::ALLOCATED && status != AllocationStatus::ALLOCATION_REQUESTED) { + // Buffer isn't allocated, nothing to be done + return; + } + // Overwriting either ALLOCATED or 
ALLOCATION_REQUESTED with DEALLOCATION_REQUESTED + allocation_status_.store(AllocationStatus::DEALLOCATION_REQUESTED, std::memory_order::relaxed); + } + device_->push_work([self = weak_self.lock()] { - if (!self->is_allocated_ || !self->device_->initialized_ || self->size_ == 0) { + // Because the status is DEALLOCATION_REQUESTED, it won't be changed by anyone else, no need to lock a mutex + if (!self->device_->initialized_) { return; } detail::BUFFER_MAP.erase({self->device()->id(), self->address()}); detail::DeallocateBuffer(self.get()); - self->is_allocated_ = false; + self->allocation_status_.store(AllocationStatus::DEALLOCATED, std::memory_order::relaxed); }); } void Buffer::deallocateAndDelete(Buffer* buffer) { + // This is the last reference to the buffer, no need to lock or update AllocationStatus buffer->device_->push_work([buffer] { - if (buffer->is_allocated_ && buffer->device_->initialized_ && buffer->size_ != 0) { - detail::BUFFER_MAP.erase({buffer->device_->id(), buffer->address_}); - detail::DeallocateBuffer(buffer); + // Buffer will be deleted at the end of this block + std::unique_ptr<Buffer> unique_buffer = std::unique_ptr<Buffer>(buffer); + + auto status = buffer->allocation_status_.load(std::memory_order::relaxed); + if (status == AllocationStatus::NOT_ALLOCATED || status == AllocationStatus::ALLOCATION_REQUESTED || status == AllocationStatus::DEALLOCATED) { + // Buffer isn't allocated, nothing to be done + return; } - delete buffer; + if (!buffer->device_->initialized_ || buffer->size_ == 0) { + return; + } + + detail::BUFFER_MAP.erase({buffer->device_->id(), buffer->address_}); + detail::DeallocateBuffer(buffer); }); } +bool Buffer::is_allocated() const { + auto allocation_status = allocation_status_.load(std::memory_order::relaxed); + if (device_->can_use_passthrough_scheduling()) { + return allocation_status == AllocationStatus::ALLOCATED; + } + // For calls from different threads we consider buffer to be allocated even if it's just ALLOCATION_REQUESTED, + 
// because once the caller will try to access it, the buffer will already be fully allocated + return allocation_status == AllocationStatus::ALLOCATED || allocation_status == AllocationStatus::ALLOCATION_REQUESTED; +} + uint32_t Buffer::address() const { - TT_FATAL(device_->can_use_passthrough_scheduling() , "Buffer::address must be called in device worker thread"); + if (device_->can_use_passthrough_scheduling()) { + // No locking required, because address can only be modified from the same thread + return address_; + } + + std::unique_lock lock(allocation_mutex_); + allocation_cv_.wait(lock, [this] { return this->allocation_status_.load(std::memory_order::relaxed) != AllocationStatus::ALLOCATION_REQUESTED; }); return address_; } void Buffer::set_address(uint64_t addr) { TT_FATAL(device_->can_use_passthrough_scheduling() , "Buffer::set_address must be called in device worker thread"); + TT_FATAL(allocation_status_.load(std::memory_order::relaxed) == AllocationStatus::ALLOCATION_REQUESTED, "Buffer address can only be set during allocation"); address_ = addr; } diff --git a/tt_metal/impl/buffers/buffer.hpp b/tt_metal/impl/buffers/buffer.hpp index 6b16becf456c..812125f8fe94 100644 --- a/tt_metal/impl/buffers/buffer.hpp +++ b/tt_metal/impl/buffers/buffer.hpp @@ -158,7 +158,7 @@ class Buffer final { Device *device() const { return device_; } DeviceAddr size() const { return size_; } - bool is_allocated() const { return is_allocated_; } + bool is_allocated() const; // Returns address of buffer in the first bank uint32_t address() const; @@ -224,12 +224,35 @@ class Buffer final { DeviceAddr translate_page_address(uint64_t offset, uint32_t bank_id) const; + enum class AllocationStatus : uint8_t { + // The buffer is created in NOT_ALLOCATED state (except for 0-size buffers, which are initially ALLOCATED). + // The buffer can transition from NOT_ALLOCATED to ALLOCATION_REQUESTED only once in its lifetime. 
+ NOT_ALLOCATED, + // The task is scheduled on the queue to allocate the buffer. + // When the task succeeds, the buffer transitions into ALLOCATED state. + // The scheduled allocation can be interrupted by a deallocation, which would transition the buffer to DEALLOCATION_REQUESTED, + // and then to DEALLOCATED states. + ALLOCATION_REQUESTED, + // The buffer is completely allocated and the address is available. + // The buffer can transition from ALLOCATED only to DEALLOCATION_REQUESTED. + ALLOCATED, + // The task is scheduled to deallocate the buffer. + // When the task succeeds, the buffer transitions into DEALLOCATED state. + DEALLOCATION_REQUESTED, + // The buffer is completely deallocated. + // This is the final state, no transitions from this state are possible. + DEALLOCATED, + }; + Device * const device_; const DeviceAddr size_; // Size in bytes const BufferType buffer_type_; const TensorMemoryLayout buffer_layout_; const std::optional<bool> bottom_up_; - std::atomic<bool> is_allocated_ = false; + + std::atomic<AllocationStatus> allocation_status_ = AllocationStatus::NOT_ALLOCATED; + mutable std::condition_variable allocation_cv_; + mutable std::mutex allocation_mutex_; // These members must be only accessed on the device worker thread DeviceAddr address_ = 0; // Address of buffer