Skip to content

Commit

Permalink
#0: Update global cb, sem apis to take in sub_device_ids to know what…
Browse files Browse the repository at this point in the history
… to stall on when writing to device
  • Loading branch information
tt-aho committed Dec 10, 2024
1 parent 4bcc79b commit c230ae5
Show file tree
Hide file tree
Showing 14 changed files with 225 additions and 84 deletions.
8 changes: 5 additions & 3 deletions tests/tt_metal/tt_metal/api/test_global_semaphores.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ TEST_F(DispatchFixture, InitializeGlobalSemaphores) {
uint32_t initial_value = 1;
auto global_semaphore = tt::tt_metal::CreateGlobalSemaphore(device, cores, initial_value);
auto address = global_semaphore->address();

Synchronize(device);
for (const auto& core : cores_vec) {
auto sem_vals = tt::llrt::read_hex_vec_from_core(
device->id(), device->worker_core_from_logical_core(core), address, sizeof(uint32_t));
Expand All @@ -33,7 +33,7 @@ TEST_F(DispatchFixture, InitializeGlobalSemaphores) {
uint32_t initial_value = 2;
auto global_semaphore = tt::tt_metal::CreateGlobalSemaphore(device, cores, initial_value);
auto address = global_semaphore->address();

Synchronize(device);
for (const auto& core : cores_vec) {
auto sem_vals = tt::llrt::read_hex_vec_from_core(
device->id(), device->worker_core_from_logical_core(core), address, sizeof(uint32_t));
Expand Down Expand Up @@ -61,6 +61,7 @@ TEST_F(DispatchFixture, CreateMultipleGlobalSemaphoresOnSameCore) {
global_semaphores.push_back(tt::tt_metal::CreateGlobalSemaphore(device, cores[i], initial_values[i]));
addresses.push_back(global_semaphores[i]->address());
}
Synchronize(device);
for (size_t i = 0; i < cores.size(); i++) {
const auto& address = addresses[i];
const auto& initial_value = initial_values[i];
Expand All @@ -85,7 +86,7 @@ TEST_F(DispatchFixture, ResetGlobalSemaphores) {
std::vector<uint32_t> overwrite_value = {2};
auto global_semaphore = tt::tt_metal::CreateGlobalSemaphore(device, cores, initial_value);
auto address = global_semaphore->address();

Synchronize(device);
for (const auto& core : cores_vec) {
auto sem_vals = tt::llrt::read_hex_vec_from_core(
device->id(), device->worker_core_from_logical_core(core), address, sizeof(uint32_t));
Expand All @@ -101,6 +102,7 @@ TEST_F(DispatchFixture, ResetGlobalSemaphores) {
EXPECT_EQ(sem_vals[0], overwrite_value[0]);
}
global_semaphore->reset_semaphore_value();
Synchronize(device);
for (const auto& core : cores_vec) {
auto sem_vals = tt::llrt::read_hex_vec_from_core(
device->id(), device->worker_core_from_logical_core(core), address, sizeof(uint32_t));
Expand Down
38 changes: 24 additions & 14 deletions tt_metal/host_api.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -299,16 +299,21 @@ uint32_t CreateSemaphore(
*
* Return value: std::shared_ptr<GlobalSemaphore>
*
* | Argument | Description | Type | Valid Range | Required |
* |---------------|------------------------------------------------------|-----------------------------------------------------------|--------------|----------|
* | device | The device to create the semaphore on | Device * | | Yes |
* | cores | Range of the Tensix co-ordinates using the semaphore | const CoreRangeSet & | | Yes |
* | initial_value | Initial value of the semaphore | uint32_t | | Yes |
* | buffer_type | Buffer type to store the semaphore | BufferType | L1 types | No |
* | Argument | Description | Type | Valid Range | Required |
* |----------------|--------------------------------------------------------|-----------------------------------------------------------|--------------|----------|
* | device | The device to create the semaphore on | Device * | | Yes |
* | cores | Range of the Tensix co-ordinates using the semaphore | const CoreRangeSet & | | Yes |
* | initial_value | Initial value of the semaphore | uint32_t | | Yes |
* | buffer_type | Buffer type to store the semaphore | BufferType | L1 types | No |
* | sub_device_ids | Sub-device ids to wait on before writing the semaphore | tt::stl::Span<const SubDeviceId> | | No |
*/
// clang-format on
std::shared_ptr<GlobalSemaphore> CreateGlobalSemaphore(
Device* device, const CoreRangeSet& cores, uint32_t initial_value, BufferType buffer_type = BufferType::L1);
Device* device,
const CoreRangeSet& cores,
uint32_t initial_value,
BufferType buffer_type = BufferType::L1,
tt::stl::Span<const SubDeviceId> sub_device_ids = {});

// clang-format off
/**
Expand All @@ -317,16 +322,21 @@ std::shared_ptr<GlobalSemaphore> CreateGlobalSemaphore(
*
* Return value: std::shared_ptr<GlobalSemaphore>
*
* | Argument | Description | Type | Valid Range | Required |
* |---------------|------------------------------------------------------|-----------------------------------------------------------|--------------|----------|
* | device | The device to create the semaphore on | Device * | | Yes |
* | cores | Range of the Tensix co-ordinates using the semaphore | CoreRangeSet && | | Yes |
* | initial_value | Initial value of the semaphore | uint32_t | | Yes |
* | buffer_type | Buffer type to store the semaphore | BufferType | L1 types | No |
* | Argument | Description | Type | Valid Range | Required |
* |----------------|--------------------------------------------------------|-----------------------------------------------------------|--------------|----------|
* | device | The device to create the semaphore on | Device * | | Yes |
* | cores | Range of the Tensix co-ordinates using the semaphore | CoreRangeSet && | | Yes |
* | initial_value | Initial value of the semaphore | uint32_t | | Yes |
* | buffer_type | Buffer type to store the semaphore | BufferType | L1 types | No |
* | sub_device_ids | Sub-device ids to wait on before writing the semaphore | tt::stl::Span<const SubDeviceId> | | No |
*/
// clang-format on
std::shared_ptr<GlobalSemaphore> CreateGlobalSemaphore(
Device* device, CoreRangeSet&& cores, uint32_t initial_value, BufferType buffer_type = BufferType::L1);
Device* device,
CoreRangeSet&& cores,
uint32_t initial_value,
BufferType buffer_type = BufferType::L1,
tt::stl::Span<const SubDeviceId> sub_device_ids = {});

// clang-format off
/**
Expand Down
24 changes: 17 additions & 7 deletions tt_metal/impl/buffers/global_circular_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ GlobalCircularBuffer::GlobalCircularBuffer(
Device* device,
const std::unordered_map<CoreCoord, CoreRangeSet>& sender_receiver_core_mapping,
uint32_t size,
BufferType buffer_type) :
BufferType buffer_type,
tt::stl::Span<const SubDeviceId> sub_device_ids) :
device_(device), sender_receiver_core_mapping_(sender_receiver_core_mapping), size_(size) {
TT_FATAL(this->device_ != nullptr, "Device cannot be null");
uint32_t num_sender_cores = sender_receiver_core_mapping.size();
Expand All @@ -46,10 +47,11 @@ GlobalCircularBuffer::GlobalCircularBuffer(
TT_FATAL(num_receiver_cores == this->receiver_cores_.num_cores(), "Duplicate receiver cores found");
this->all_cores_ = this->sender_cores_.merge(this->receiver_cores_);
TT_FATAL(this->all_cores_.num_cores() == num_sender_cores + num_receiver_cores, "Duplicate cores found");
this->setup_cb_buffers(buffer_type, max_num_receivers_per_sender);
this->setup_cb_buffers(buffer_type, max_num_receivers_per_sender, sub_device_ids);
}

void GlobalCircularBuffer::setup_cb_buffers(BufferType buffer_type, uint32_t max_num_receivers_per_sender) {
void GlobalCircularBuffer::setup_cb_buffers(
BufferType buffer_type, uint32_t max_num_receivers_per_sender, tt::stl::Span<const SubDeviceId> sub_device_ids) {
TT_FATAL(
buffer_type == BufferType::L1 or buffer_type == BufferType::L1_SMALL,
"Global circular buffer can only be created for L1 buffer types");
Expand Down Expand Up @@ -123,21 +125,29 @@ void GlobalCircularBuffer::setup_cb_buffers(BufferType buffer_type, uint32_t max
}
}

// Blocking write of cb config to buffer
// Write the config buffer to the device
// Only block for the slow dispatch case
if (this->device_->using_slow_dispatch()) {
detail::WriteToBuffer(*this->cb_config_buffer_, cb_config_host_buffer);
tt::Cluster::instance().l1_barrier(this->device_->id());
} else {
EnqueueWriteBuffer(this->device_->command_queue(), this->cb_config_buffer_, cb_config_host_buffer.data(), true);
EnqueueWriteBuffer(
this->device_->command_queue(),
this->cb_config_buffer_,
cb_config_host_buffer.data(),
false,
sub_device_ids);
}
}

std::shared_ptr<GlobalCircularBuffer> GlobalCircularBuffer::create(
Device* device,
const std::unordered_map<CoreCoord, CoreRangeSet>& sender_receiver_core_mapping,
uint32_t size,
BufferType buffer_type) {
return std::make_unique<GlobalCircularBuffer>(device, sender_receiver_core_mapping, size, buffer_type);
BufferType buffer_type,
tt::stl::Span<const SubDeviceId> sub_device_ids) {
return std::make_shared<GlobalCircularBuffer>(
device, sender_receiver_core_mapping, size, buffer_type, sub_device_ids);
}

const Buffer& GlobalCircularBuffer::cb_buffer() const { return *this->cb_buffer_; }
Expand Down
10 changes: 7 additions & 3 deletions tt_metal/impl/buffers/global_circular_buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

#include "tt_metal/common/core_coord.hpp"
#include "tt_metal/impl/buffers/buffer_constants.hpp"
#include "tt_metal/impl/sub_device/sub_device_types.hpp"
#include "tt_metal/llrt/hal.hpp"

namespace tt::tt_metal {
Expand All @@ -30,7 +31,8 @@ class GlobalCircularBuffer {
Device* device,
const std::unordered_map<CoreCoord, CoreRangeSet>& sender_receiver_core_mapping,
uint32_t size,
BufferType buffer_type);
BufferType buffer_type,
tt::stl::Span<const SubDeviceId> sub_device_ids);

GlobalCircularBuffer(const GlobalCircularBuffer&) = default;
GlobalCircularBuffer& operator=(const GlobalCircularBuffer&) = default;
Expand All @@ -42,7 +44,8 @@ class GlobalCircularBuffer {
Device* device,
const std::unordered_map<CoreCoord, CoreRangeSet>& sender_receiver_core_mapping,
uint32_t size,
BufferType buffer_type = BufferType::L1);
BufferType buffer_type = BufferType::L1,
tt::stl::Span<const SubDeviceId> sub_device_ids = {});

const Buffer& cb_buffer() const;

Expand All @@ -57,7 +60,8 @@ class GlobalCircularBuffer {
const auto attribute_values() const { return std::make_tuple(this->sender_receiver_core_mapping_, this->size_); }

private:
void setup_cb_buffers(BufferType buffer_type, uint32_t max_num_receivers_per_sender);
void setup_cb_buffers(
BufferType buffer_type, uint32_t max_num_receivers_per_sender, tt::stl::Span<const SubDeviceId> sub_device_ids);

// GlobalCircularBuffer is implemented as a wrapper around a sharded buffer
// This can be updated in the future to be its own container with optimized dispatch functions
Expand Down
45 changes: 32 additions & 13 deletions tt_metal/impl/buffers/global_semaphore.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,17 +20,26 @@
namespace tt::tt_metal {

GlobalSemaphore::GlobalSemaphore(
Device* device, const CoreRangeSet& cores, uint32_t initial_value, BufferType buffer_type) :
Device* device,
const CoreRangeSet& cores,
uint32_t initial_value,
BufferType buffer_type,
tt::stl::Span<const SubDeviceId> sub_device_ids) :
device_(device), cores_(cores), initial_value_(initial_value) {
this->setup_buffer(buffer_type);
this->setup_buffer(buffer_type, sub_device_ids);
}

GlobalSemaphore::GlobalSemaphore(Device* device, CoreRangeSet&& cores, uint32_t initial_value, BufferType buffer_type) :
GlobalSemaphore::GlobalSemaphore(
Device* device,
CoreRangeSet&& cores,
uint32_t initial_value,
BufferType buffer_type,
tt::stl::Span<const SubDeviceId> sub_device_ids) :
device_(device), cores_(std::move(cores)), initial_value_(initial_value) {
this->setup_buffer(buffer_type);
this->setup_buffer(buffer_type, sub_device_ids);
}

void GlobalSemaphore::setup_buffer(BufferType buffer_type) {
void GlobalSemaphore::setup_buffer(BufferType buffer_type, tt::stl::Span<const SubDeviceId> sub_device_ids) {
TT_FATAL(
buffer_type == BufferType::L1 or buffer_type == BufferType::L1_SMALL,
"Global semaphore can only be created for L1 buffer types");
Expand All @@ -50,29 +59,39 @@ void GlobalSemaphore::setup_buffer(BufferType buffer_type) {
std::nullopt);

this->host_buffer_ = std::vector<uint32_t>(num_cores, this->initial_value_);
this->reset_semaphore_value();
this->reset_semaphore_value(sub_device_ids);
}

std::shared_ptr<GlobalSemaphore> GlobalSemaphore::create(
Device* device, const CoreRangeSet& cores, uint32_t initial_value, BufferType buffer_type) {
return std::make_unique<GlobalSemaphore>(device, cores, initial_value, buffer_type);
Device* device,
const CoreRangeSet& cores,
uint32_t initial_value,
BufferType buffer_type,
tt::stl::Span<const SubDeviceId> sub_device_ids) {
return std::make_shared<GlobalSemaphore>(device, cores, initial_value, buffer_type, sub_device_ids);
}
std::shared_ptr<GlobalSemaphore> GlobalSemaphore::create(
Device* device, CoreRangeSet&& cores, uint32_t initial_value, BufferType buffer_type) {
return std::make_unique<GlobalSemaphore>(device, std::move(cores), initial_value, buffer_type);
Device* device,
CoreRangeSet&& cores,
uint32_t initial_value,
BufferType buffer_type,
tt::stl::Span<const SubDeviceId> sub_device_ids) {
return std::make_shared<GlobalSemaphore>(device, std::move(cores), initial_value, buffer_type, sub_device_ids);
}

Device* GlobalSemaphore::device() const { return device_; }

DeviceAddr GlobalSemaphore::address() const { return buffer_->address(); }

void GlobalSemaphore::reset_semaphore_value() {
// Blocking write of semaphore value to buffer
void GlobalSemaphore::reset_semaphore_value(tt::stl::Span<const SubDeviceId> sub_device_ids) {
// Write the initial value to the semaphore to the device
// Only block for the slow dispatch case
if (this->device_->using_slow_dispatch()) {
detail::WriteToBuffer(*this->buffer_, this->host_buffer_);
tt::Cluster::instance().l1_barrier(this->device_->id());
} else {
EnqueueWriteBuffer(this->device_->command_queue(), this->buffer_, this->host_buffer_.data(), true);
EnqueueWriteBuffer(
this->device_->command_queue(), this->buffer_, this->host_buffer_.data(), false, sub_device_ids);
}
}

Expand Down
29 changes: 23 additions & 6 deletions tt_metal/impl/buffers/global_semaphore.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

#include "tt_metal/common/core_coord.hpp"
#include "tt_metal/impl/buffers/buffer_constants.hpp"
#include "tt_metal/impl/sub_device/sub_device_types.hpp"
#include "tt_metal/llrt/hal.hpp"

namespace tt::tt_metal {
Expand All @@ -21,10 +22,18 @@ class Device;
class GlobalSemaphore {
public:
GlobalSemaphore(
Device* device, const CoreRangeSet& cores, uint32_t initial_value, BufferType buffer_type = BufferType::L1);
Device* device,
const CoreRangeSet& cores,
uint32_t initial_value,
BufferType buffer_type = BufferType::L1,
tt::stl::Span<const SubDeviceId> sub_device_ids = {});

GlobalSemaphore(
Device* device, CoreRangeSet&& cores, uint32_t initial_value, BufferType buffer_type = BufferType::L1);
Device* device,
CoreRangeSet&& cores,
uint32_t initial_value,
BufferType buffer_type = BufferType::L1,
tt::stl::Span<const SubDeviceId> sub_device_ids = {});

GlobalSemaphore(const GlobalSemaphore&) = default;
GlobalSemaphore& operator=(const GlobalSemaphore&) = default;
Expand All @@ -33,22 +42,30 @@ class GlobalSemaphore {
GlobalSemaphore& operator=(GlobalSemaphore&&) noexcept = default;

static std::shared_ptr<GlobalSemaphore> create(
Device* device, const CoreRangeSet& cores, uint32_t initial_value, BufferType buffer_type = BufferType::L1);
Device* device,
const CoreRangeSet& cores,
uint32_t initial_value,
BufferType buffer_type = BufferType::L1,
tt::stl::Span<const SubDeviceId> sub_device_ids = {});

static std::shared_ptr<GlobalSemaphore> create(
Device* device, CoreRangeSet&& cores, uint32_t initial_value, BufferType buffer_type = BufferType::L1);
Device* device,
CoreRangeSet&& cores,
uint32_t initial_value,
BufferType buffer_type = BufferType::L1,
tt::stl::Span<const SubDeviceId> sub_device_ids = {});

Device* device() const;

DeviceAddr address() const;

void reset_semaphore_value();
void reset_semaphore_value(tt::stl::Span<const SubDeviceId> sub_device_ids = {});

static constexpr auto attribute_names = std::forward_as_tuple("cores", "initial_value");
const auto attribute_values() const { return std::make_tuple(this->cores_, this->initial_value_); }

private:
void setup_buffer(BufferType buffer_type);
void setup_buffer(BufferType buffer_type, tt::stl::Span<const SubDeviceId> sub_device_ids);

// GlobalSemaphore is implemented as a wrapper around a sharded buffer
// This can be updated in the future to be its own container with optimized dispatch functions
Expand Down
5 changes: 4 additions & 1 deletion tt_metal/include/tt_metal/global_circular_buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,16 @@ namespace experimental {
* @param sender_receiver_core_mapping The mapping of remote sender to remote receiver cores for the circular buffer.
* @param size Size of the global circular buffer per core in bytes.
* @param buffer_type Buffer type to store the global circular buffer. Can only be an L1 buffer type.
* @param sub_device_ids Sub-device IDs to wait on before writing the global circular buffer config to device. Defaults
* to waiting on all sub-devices.
* @return Handle to the allocated global circular buffer.
*/
std::shared_ptr<GlobalCircularBuffer> CreateGlobalCircularBuffer(
Device* device,
const std::unordered_map<CoreCoord, CoreRangeSet>& sender_receiver_core_mapping,
uint32_t size,
BufferType buffer_type = BufferType::L1);
BufferType buffer_type = BufferType::L1,
tt::stl::Span<const SubDeviceId> sub_device_ids = {});

} // namespace experimental

Expand Down
Loading

0 comments on commit c230ae5

Please sign in to comment.