Skip to content

Commit

Permalink
#14038: Remove global BUFFER_MAP and make the tracking of buffers loc…
Browse files Browse the repository at this point in the history
…al to an allocator
  • Loading branch information
tt-aho committed Oct 25, 2024
1 parent e0c4924 commit 561e2fd
Show file tree
Hide file tree
Showing 14 changed files with 156 additions and 172 deletions.
2 changes: 1 addition & 1 deletion tt_metal/detail/tt_metal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ inline namespace v0 {

void SetLazyCommandQueueMode(bool lazy);

DeviceAddr AllocateBuffer(const Buffer* buffer, bool bottom_up);
DeviceAddr AllocateBuffer(Buffer* buffer);

void DeallocateBuffer(Buffer *buffer);
} // namespace detail
Expand Down
8 changes: 4 additions & 4 deletions tt_metal/graph/graph_tracking.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,12 @@ bool GraphTracker::add_hook(const std::shared_ptr<IGraphHooks>& new_hook) {
return true;
}

void GraphTracker::track_allocate(const Buffer* buffer, bool bottom_up) {
void GraphTracker::track_allocate(const Buffer* buffer) {
if (processors.empty()) {
return;
}
for (auto& it : processors) {
it->track_allocate(buffer, bottom_up);
it->track_allocate(buffer);
}
}

Expand Down Expand Up @@ -73,11 +73,11 @@ void GraphTracker::track_program(Program* program) {
}
}

bool GraphTracker::hook_allocate(const Buffer* buffer, bool bottom_up) {
bool GraphTracker::hook_allocate(const Buffer* buffer) {
if (hook == nullptr)
return false;

return hook->hook_allocate(buffer, bottom_up);
return hook->hook_allocate(buffer);
}

bool GraphTracker::hook_deallocate(Buffer* buffer) {
Expand Down
8 changes: 4 additions & 4 deletions tt_metal/graph/graph_tracking.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ inline namespace v0 {

IGraphProcessor() = default;

virtual void track_allocate(const tt::tt_metal::Buffer* buffer, bool bottom_up) {};
virtual void track_allocate(const tt::tt_metal::Buffer* buffer) {};

virtual void track_deallocate(tt::tt_metal::Buffer* buffer) {};

Expand All @@ -54,7 +54,7 @@ inline namespace v0 {
class IGraphHooks {
public:
IGraphHooks() = default;
virtual bool hook_allocate(const tt::tt_metal::Buffer* buffer, bool bottom_up) = 0;
virtual bool hook_allocate(const tt::tt_metal::Buffer* buffer) = 0;

virtual bool hook_deallocate(tt::tt_metal::Buffer* buffer) = 0;

Expand All @@ -77,7 +77,7 @@ inline namespace v0 {

bool add_hook(const std::shared_ptr<IGraphHooks>& hook);

void track_allocate(const Buffer* buffer, bool bottom_up);
void track_allocate(const Buffer* buffer);

void track_deallocate(Buffer* buffer);

Expand Down Expand Up @@ -118,7 +118,7 @@ inline namespace v0 {
}
}

bool hook_allocate(const Buffer* buffer, bool bottom_up);
bool hook_allocate(const Buffer* buffer);

bool hook_deallocate(Buffer* buffer);

Expand Down
41 changes: 26 additions & 15 deletions tt_metal/impl/allocator/allocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -377,38 +377,45 @@ void verify_safe_allocation(Allocator& allocator) {
}
}

uint64_t allocate_buffer(
Allocator &allocator,
DeviceAddr size,
DeviceAddr page_size,
const BufferType &buffer_type,
bool bottom_up,
std::optional<uint32_t> num_shards) {
uint64_t address = 0;
const std::unordered_set<Buffer *> &get_allocated_buffers(const Allocator &allocator) { return allocator.allocated_buffers; }

DeviceAddr allocate_buffer(Allocator &allocator, DeviceAddr size, Buffer *buffer) {
DeviceAddr address = 0;
auto page_size = buffer->page_size();
auto buffer_type = buffer->buffer_type();
auto bottom_up = buffer->bottom_up();
auto num_shards = buffer->num_cores();
verify_safe_allocation(allocator);
switch (buffer_type) {
case BufferType::DRAM:
return allocator.descriptor.dram.alloc(
address = allocator.descriptor.dram.alloc(
allocator.config, allocator.dram_manager, size, page_size, bottom_up, num_shards);
break;
case BufferType::L1:
return allocator.descriptor.l1.alloc(
address = allocator.descriptor.l1.alloc(
allocator.config, allocator.l1_manager, size, page_size, bottom_up, num_shards);
break;
case BufferType::L1_SMALL: {
TT_FATAL(num_shards.has_value(), "L1_SMALL only supports sharded allocations, see validate_num_banks");
return allocator.descriptor.l1.alloc(
address = allocator.descriptor.l1.alloc(
allocator.config, allocator.l1_small_manager, size, page_size, bottom_up, num_shards);
case BufferType::TRACE:
return allocator.descriptor.dram.alloc(
allocator.config, allocator.trace_buffer_manager, size, page_size, bottom_up, num_shards);
break;
}
case BufferType::TRACE:
address = allocator.descriptor.dram.alloc(
allocator.config, allocator.trace_buffer_manager, size, page_size, bottom_up, num_shards);
break;
default: {
TT_THROW("Unsupported buffer type!");
}
}
allocator.allocated_buffers.insert(buffer);
return address;
}

void deallocate_buffer(Allocator &allocator, DeviceAddr address, const BufferType &buffer_type) {
void deallocate_buffer(Allocator &allocator, Buffer *buffer) {
auto address = buffer->address();
auto buffer_type = buffer->buffer_type();
switch (buffer_type) {
case BufferType::DRAM: allocator.dram_manager.deallocate_buffer(address); break;
case BufferType::L1: allocator.l1_manager.deallocate_buffer(address); break;
Expand All @@ -418,20 +425,23 @@ void deallocate_buffer(Allocator &allocator, DeviceAddr address, const BufferTyp
TT_THROW("Unsupported buffer type!");
}
}
allocator.allocated_buffers.erase(buffer);
}

// Releases every outstanding allocation in all four bank managers and drops
// the allocator's record of live buffers so tracking stays consistent.
void deallocate_buffers(Allocator &allocator) {
    allocator.dram_manager.deallocate_all();
    allocator.l1_manager.deallocate_all();
    allocator.l1_small_manager.deallocate_all();
    allocator.trace_buffer_manager.deallocate_all();
    allocator.allocated_buffers.clear();
}

// Resets every bank manager to its pristine state and empties the tracked
// buffer set; unlike deallocate_buffers this clears manager bookkeeping too.
void clear(Allocator &allocator) {
    allocator.dram_manager.clear();
    allocator.l1_manager.clear();
    allocator.l1_small_manager.clear();
    allocator.trace_buffer_manager.clear();
    allocator.allocated_buffers.clear();
}

} // namespace allocator
Expand Down Expand Up @@ -460,6 +470,7 @@ void Allocator::reset() {
l1_manager.clear();
l1_small_manager.clear();
trace_buffer_manager.clear();
allocated_buffers.clear();
config.reset();
}

Expand Down
13 changes: 11 additions & 2 deletions tt_metal/impl/allocator/allocator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@ namespace tt {

namespace tt_metal {

inline namespace v0 {

class Buffer;

} // namespace v0

// Fwd declares
enum class BufferType;
struct Allocator;
Expand Down Expand Up @@ -99,15 +105,17 @@ std::optional<DeviceAddr> lowest_occupied_l1_address(const Allocator &allocator,

DeviceAddr base_alloc(const AllocatorConfig & config, BankManager &bank_manager, DeviceAddr size, DeviceAddr page_size, bool bottom_up, std::optional<uint32_t> num_shards);

DeviceAddr allocate_buffer(Allocator &allocator, DeviceAddr size, DeviceAddr page_size, const BufferType &buffer_type, bool bottom_up, std::optional<uint32_t> num_shards = std::nullopt);
DeviceAddr allocate_buffer(Allocator &allocator, DeviceAddr size, Buffer *buffer);

void mark_allocations_unsafe(Allocator &allocator);

void mark_allocations_safe(Allocator &allocator);

void deallocate_buffer(Allocator &allocator, DeviceAddr address, const BufferType &buffer_type);
void deallocate_buffer(Allocator &allocator, Buffer *buffer);
void deallocate_buffers(Allocator &allocator);

const std::unordered_set<Buffer *> &get_allocated_buffers(const Allocator &allocator);

void clear(Allocator &allocatator);

} // namespace allocator
Expand All @@ -127,6 +135,7 @@ struct Allocator {
std::unordered_map<uint32_t, std::vector<uint32_t>> dram_channel_to_bank_ids;
std::unordered_map<uint32_t, CoreCoord> bank_id_to_logical_core;
std::unordered_map<BufferType, std::unordered_map<CoreCoord, std::vector<uint32_t>>> logical_core_to_bank_ids;
std::unordered_set<Buffer *> allocated_buffers;

AllocatorConfig config;
// Callbacks to invoke during initialization and allocation
Expand Down
31 changes: 15 additions & 16 deletions tt_metal/impl/buffers/buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,15 @@ void validate_buffer_size_and_page_size(
"Page size must be divisible by sizeof(uint32_t) because buffers hold uint32_t values");

if (is_sharded(buffer_layout)) {
TT_FATAL(shard_parameters != std::nullopt, "Sharded buffers must have a core grid assigned");
} else if (buffer_layout == TensorMemoryLayout::SINGLE_BANK) {
TT_FATAL(page_size == size, "Contiguous buffer must be one contiguous page");
TT_FATAL(
shard_parameters != std::nullopt,
"Buffer was specified as sharded but does not have shard_parameters specified");
} else {
TT_FATAL(
shard_parameters == std::nullopt, "Buffer was specified as not sharded but has shard_parameters specified");
if (buffer_layout == TensorMemoryLayout::SINGLE_BANK) {
TT_FATAL(page_size == size, "Contiguous buffer must be one contiguous page");
}
}
}

Expand Down Expand Up @@ -125,7 +131,7 @@ BufferPageMapping generate_buffer_page_mapping(const Buffer& buffer) {
auto shard_spec = buffer.shard_spec();

bool row_major = shard_spec.orientation() == ShardOrientation::ROW_MAJOR;
uint32_t num_cores = buffer.num_cores();
uint32_t num_cores = buffer.num_cores().value();

buffer_page_mapping.all_cores_ = corerange_to_cores(shard_spec.grid(), num_cores, row_major);
TT_FATAL(num_cores == buffer_page_mapping.all_cores_.size(), "Buffer has {} cores, but page mapping expects {} cores", num_cores, buffer_page_mapping.all_cores_.size());
Expand Down Expand Up @@ -196,7 +202,7 @@ Buffer::Buffer(
buffer_type_(buffer_type),
buffer_layout_(buffer_layout),
shard_parameters_(shard_parameters),
bottom_up_(bottom_up),
bottom_up_(bottom_up.value_or(this->is_dram())),
buffer_page_mapping_(nullptr) {
TT_FATAL(this->device_ != nullptr && this->device_->allocator_ != nullptr, "Device and allocator need to not be null.");

Expand All @@ -223,9 +229,7 @@ std::shared_ptr<Buffer> Buffer::create(
}

buffer->device_->push_work([buffer] {
bool bottom_up = buffer->bottom_up_.value_or(buffer->is_dram());
buffer->address_ = detail::AllocateBuffer(buffer.get(), bottom_up);
detail::BUFFER_MAP.insert({buffer->device_->id(), buffer->address_}, buffer.get());
buffer->address_ = detail::AllocateBuffer(buffer.get());

std::unique_lock lock(buffer->allocation_mutex_);
buffer->allocation_status_.store(AllocationStatus::ALLOCATED, std::memory_order::relaxed);
Expand Down Expand Up @@ -257,7 +261,6 @@ void Buffer::deallocate_impl() {

if (device_->initialized_ && size_ != 0) {
// address_ is only modified from this thread, no sync required
detail::BUFFER_MAP.erase({device_->id(), address_});
detail::DeallocateBuffer(this);
}

Expand Down Expand Up @@ -306,7 +309,7 @@ uint32_t Buffer::num_dev_pages() const {
return this->num_pages();
}

return this->shard_spec().size() * this->num_cores();
return this->shard_spec().size() * this->num_cores().value();
}

CoreType Buffer::core_type() const {
Expand Down Expand Up @@ -399,9 +402,9 @@ void Buffer::set_shard_spec(const ShardSpecBuffer& shard_spec) {
this->buffer_page_mapping_ = nullptr;
}

uint32_t Buffer::num_cores() const {
std::optional<uint32_t> Buffer::num_cores() const {
if (!is_sharded(this->buffer_layout_))
return 1;
return std::nullopt;

return this->shard_spec().tensor_shard_spec.grid.num_cores();
}
Expand Down Expand Up @@ -433,10 +436,6 @@ DeviceAddr ShardSpecBuffer::size() const {
return shape_in_pages_[0] * shape_in_pages_[1];
}

namespace detail {
buffer_map_t BUFFER_MAP = {};
}

} // namespace tt_metal
} // namespace tt

Expand Down
36 changes: 4 additions & 32 deletions tt_metal/impl/buffers/buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,8 @@ class Buffer final {

TensorMemoryLayout buffer_layout() const { return buffer_layout_; }

bool bottom_up() const { return bottom_up_; }

uint32_t dram_channel_from_bank_id(uint32_t bank_id) const;

CoreCoord logical_core_from_bank_id(uint32_t bank_id) const;
Expand All @@ -199,7 +201,7 @@ class Buffer final {
ShardSpecBuffer shard_spec() const;
void set_shard_spec(const ShardSpecBuffer& shard_spec);

uint32_t num_cores() const;
std::optional<uint32_t> num_cores() const;

const std::shared_ptr<const BufferPageMapping>& get_buffer_page_mapping();

Expand Down Expand Up @@ -231,7 +233,7 @@ class Buffer final {
const DeviceAddr size_; // Size in bytes
const BufferType buffer_type_;
const TensorMemoryLayout buffer_layout_;
const std::optional<bool> bottom_up_;
const bool bottom_up_;

std::atomic<AllocationStatus> allocation_status_ = AllocationStatus::ALLOCATION_REQUESTED;
DeviceAddr address_ = 0;
Expand All @@ -252,36 +254,6 @@ class Buffer final {

BufferPageMapping generate_buffer_page_mapping(const Buffer &buffer);

namespace detail {
using Deviceid = uint32_t;

class buffer_map_t {
public:
void insert(std::tuple<Deviceid, DeviceAddr> buf_attr, Buffer *buffer) {
std::scoped_lock<std::mutex> lock(this->map_mutex);
this->map.insert({buf_attr, buffer});
}

void erase(std::tuple<Deviceid, DeviceAddr> buf_attr) {
std::scoped_lock<std::mutex> lock(this->map_mutex);
this->map.erase(buf_attr);
}

std::map<std::tuple<Deviceid, DeviceAddr>, Buffer *> value() {
std::scoped_lock<std::mutex> lock(this->map_mutex);
return this->map;
}

~buffer_map_t() { TT_ASSERT(this->map.empty(), "Not all buffers deallocated by runtime!"); }

private:
std::mutex map_mutex;
std::map<std::tuple<Deviceid, DeviceAddr>, Buffer *> map = {};
};

extern buffer_map_t BUFFER_MAP;
} // namespace detail

inline namespace v0 {

using HostDataType = std::variant<
Expand Down
11 changes: 7 additions & 4 deletions tt_metal/impl/device/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2949,10 +2949,8 @@ bool Device::close() {
tt::Cluster::instance().l1_barrier(id_);
allocator::clear(*this->allocator_);
// After device close, no buffers on this device should be used
for (const auto &[buf_attr, buf] : detail::BUFFER_MAP.value()) {
if (std::get<0>(buf_attr) == this->id()) {
DeallocateBuffer(*buf);
}
for (const auto &buf : this->get_allocated_buffers()) {
DeallocateBuffer(*buf);
}

this->compute_cores_.clear();
Expand Down Expand Up @@ -3174,6 +3172,11 @@ void Device::dump_memory_blocks(const BufferType &buffer_type, std::ofstream &ou
return allocator::dump_memory_blocks(*this->allocator_, buffer_type, out);
}

const std::unordered_set<Buffer *> &Device::get_allocated_buffers() const {
this->check_allocator_is_initialized();
return allocator::get_allocated_buffers(*this->allocator_);
}

void Device::deallocate_buffers(){
allocator::deallocate_buffers(*allocator_);
}
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/impl/device/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,8 @@ class Device {
uint32_t get_noc_unicast_encoding(uint8_t noc_index, const CoreCoord& physical_core) const;
uint32_t get_noc_multicast_encoding(uint8_t noc_index, const CoreRange& physical_cores) const;

const std::unordered_set<Buffer *> &get_allocated_buffers() const;

void deallocate_buffers();

// machine epsilon
Expand Down
Loading

0 comments on commit 561e2fd

Please sign in to comment.