Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Global CB Support #15180

Merged
merged 5 commits into from
Dec 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .clang-format-ignore
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ tt_metal/llrt/rtoptions.hpp
tt_metal/llrt/tt_cluster.cpp
tt_metal/llrt/tt_cluster.hpp
tt_metal/llrt/tt_memory.h
tt_metal/tt_metal.cpp
ttnn/cpp/pybind11/tensor.cpp
ttnn/cpp/ttnn/device_operation.hpp
ttnn/cpp/ttnn/graph/graph_processor.cpp
Expand Down
1 change: 1 addition & 0 deletions Doxyfile
Original file line number Diff line number Diff line change
Expand Up @@ -958,6 +958,7 @@ INPUT = tt_metal/hw/inc/dataflow_api.h \
tt_metal/include/tt_metal/command_queue.hpp \
tt_metal/include/tt_metal/device.hpp \
tt_metal/include/tt_metal/event.hpp \
tt_metal/include/tt_metal/global_circular_buffer.hpp \
tt_metal/include/tt_metal/kernel.hpp \
tt_metal/include/tt_metal/program.hpp \
tt_metal/include/tt_metal/trace.hpp
Expand Down
1 change: 1 addition & 0 deletions tests/tt_metal/tt_metal/api/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ set(UNIT_TESTS_API_SRC
${CMAKE_CURRENT_SOURCE_DIR}/test_direct.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_dram_to_l1_multicast.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_dram.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_global_circular_buffers.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_global_semaphores.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_kernel_creation.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_noc.cpp
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "tt_metal/host_api.hpp"
#include "tt_metal/detail/tt_metal.hpp"
#include "common/bfloat16.hpp"
#include "tt_metal/hw/inc/circular_buffer_constants.h"

using std::vector;
using namespace tt::tt_metal;
Expand All @@ -22,7 +23,8 @@ void validate_cb_address(
detail::LaunchProgram(device, program);

vector<uint32_t> cb_config_vector;
uint32_t cb_config_buffer_size = NUM_CIRCULAR_BUFFERS * UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG * sizeof(uint32_t);
uint32_t cb_config_buffer_size =
NUM_CIRCULAR_BUFFERS * UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * sizeof(uint32_t);

for (const CoreRange& core_range : cr_set.ranges()) {
for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) {
Expand All @@ -38,8 +40,9 @@ void validate_cb_address(
std::map<uint8_t, uint32_t> address_per_buffer_index = core_to_address_per_buffer_index.at(core_coord);

for (const auto& [buffer_index, expected_address] : address_per_buffer_index) {
auto base_index = UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG * buffer_index;
EXPECT_EQ(expected_address >> 4, cb_config_vector.at(base_index));
auto base_index = UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * buffer_index;
EXPECT_EQ(
expected_address >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES, cb_config_vector.at(base_index));
}
}
}
Expand Down Expand Up @@ -337,7 +340,7 @@ TEST_F(DeviceFixture, TensixTestUpdateCircularBufferPageSize) {

vector<uint32_t> cb_config_vector;
uint32_t cb_config_buffer_size =
NUM_CIRCULAR_BUFFERS * UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG * sizeof(uint32_t);
NUM_CIRCULAR_BUFFERS * UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * sizeof(uint32_t);

for (const CoreRange& core_range : cr_set.ranges()) {
for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) {
Expand All @@ -354,8 +357,10 @@ TEST_F(DeviceFixture, TensixTestUpdateCircularBufferPageSize) {
std::map<uint8_t, uint32_t> num_pages_per_buffer_index = golden_num_pages_per_core.at(core_coord);

for (const auto& [buffer_index, expected_address] : address_per_buffer_index) {
auto base_index = UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG * buffer_index;
EXPECT_EQ(expected_address >> 4, cb_config_vector.at(base_index)); // address validation
auto base_index = UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * buffer_index;
EXPECT_EQ(
expected_address >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES,
cb_config_vector.at(base_index)); // address validation
EXPECT_EQ(
num_pages_per_buffer_index.at(buffer_index),
cb_config_vector.at(base_index + 2)); // num pages validation
Expand Down Expand Up @@ -385,8 +390,10 @@ TEST_F(DeviceFixture, TensixTestUpdateCircularBufferPageSize) {
std::map<uint8_t, uint32_t> num_pages_per_buffer_index = golden_num_pages_per_core.at(core_coord);

for (const auto& [buffer_index, expected_address] : address_per_buffer_index) {
auto base_index = UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG * buffer_index;
EXPECT_EQ(expected_address >> 4, cb_config_vector.at(base_index)); // address validation
auto base_index = UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * buffer_index;
EXPECT_EQ(
expected_address >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES,
cb_config_vector.at(base_index)); // address validation
EXPECT_EQ(
num_pages_per_buffer_index.at(buffer_index),
cb_config_vector.at(base_index + 2)); // num pages validation
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ bool test_cb_config_written_to_core(
cb_config_vector);

for (const auto& [buffer_index, golden_cb_config] : cb_config_per_buffer_index) {
auto base_index = UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG * buffer_index;
auto base_index = UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * buffer_index;
pass &= (golden_cb_config.at(0) == cb_config_vector.at(base_index)); // address
pass &= (golden_cb_config.at(1) == cb_config_vector.at(base_index + 1)); // size
pass &= (golden_cb_config.at(2) == cb_config_vector.at(base_index + 2)); // num pages
Expand All @@ -65,10 +65,22 @@ TEST_F(DeviceFixture, TensixTestCreateCircularBufferAtValidIndices) {

uint32_t l1_unreserved_base = devices_.at(0)->get_base_allocator_addr(HalMemType::L1);
std::map<uint8_t, std::vector<uint32_t>> golden_cb_config = {
{0, {l1_unreserved_base >> 4, cb_config.page_size >> 4, cb_config.num_pages}},
{2, {l1_unreserved_base >> 4, cb_config.page_size >> 4, cb_config.num_pages}},
{16, {l1_unreserved_base >> 4, cb_config.page_size >> 4, cb_config.num_pages}},
{24, {l1_unreserved_base >> 4, cb_config.page_size >> 4, cb_config.num_pages}}};
{0,
{l1_unreserved_base >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES,
cb_config.page_size >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES,
cb_config.num_pages}},
{2,
{l1_unreserved_base >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES,
cb_config.page_size >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES,
cb_config.num_pages}},
{16,
{l1_unreserved_base >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES,
cb_config.page_size >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES,
cb_config.num_pages}},
{24,
{l1_unreserved_base >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES,
cb_config.page_size >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES,
cb_config.num_pages}}};
std::map<uint8_t, tt::DataFormat> data_format_spec = {
{0, cb_config.data_format},
{2, cb_config.data_format},
Expand Down
114 changes: 114 additions & 0 deletions tests/tt_metal/tt_metal/api/test_global_circular_buffers.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <gtest/gtest.h>

#include <vector>

#include "device_fixture.hpp"
#include "tt_metal/common/core_coord.hpp"
#include "tt_metal/detail/tt_metal.hpp"
#include "tt_metal/host_api.hpp"
#include "tt_metal/impl/buffers/global_circular_buffer.hpp"
#include "tt_metal/include/tt_metal/global_circular_buffer.hpp"
#include "tt_metal/include/tt_metal/program.hpp"

TEST_F(DispatchFixture, TensixCreateGlobalCircularBuffers) {
CoreRangeSet cores(CoreRange({1, 1}, {1, 1}));
CoreRangeSet cores2(CoreRange({1, 1}, {2, 2}));
CoreRangeSet cores3(CoreRange({3, 3}, {3, 3}));

auto device = devices_[0];
{
std::unordered_map<CoreCoord, CoreRangeSet> sender_receiver_core_mapping;
sender_receiver_core_mapping[CoreCoord(0, 0)] = cores;
auto global_cb = tt::tt_metal::v1::experimental::CreateGlobalCircularBuffer(
device, sender_receiver_core_mapping, 3200, tt::tt_metal::BufferType::L1);
auto buffer_address = global_cb->buffer_address();
auto config_address = global_cb->config_address();
}
{
std::unordered_map<CoreCoord, CoreRangeSet> sender_receiver_core_mapping;
sender_receiver_core_mapping[CoreCoord(0, 0)] = cores;
sender_receiver_core_mapping[CoreCoord(1, 1)] = cores3;
// sender receiver cores overlap
EXPECT_THROW(
tt::tt_metal::v1::experimental::CreateGlobalCircularBuffer(
device, sender_receiver_core_mapping, 3200, tt::tt_metal::BufferType::L1),
std::exception);
}
{
std::unordered_map<CoreCoord, CoreRangeSet> sender_receiver_core_mapping;
sender_receiver_core_mapping[CoreCoord(0, 0)] = cores;
sender_receiver_core_mapping[CoreCoord(0, 1)] = cores2;
// receiver cores overlap
EXPECT_THROW(
tt::tt_metal::v1::experimental::CreateGlobalCircularBuffer(
device, sender_receiver_core_mapping, 3200, tt::tt_metal::BufferType::L1),
std::exception);
}
}

TEST_F(DispatchFixture, TensixProgramGlobalCircularBuffers) {
CoreCoord sender_core = CoreCoord(0, 0);
CoreRangeSet sender_cores = CoreRangeSet(CoreRange(sender_core));
CoreRangeSet receiver_cores(CoreRange({1, 1}, {2, 2}));
CoreRangeSet dummy_receiver_cores(CoreRange({3, 3}, {3, 3}));
uint32_t global_cb_size = 3200;
uint32_t cb_page_size = 32;
tt::DataFormat tile_format = tt::DataFormat::Float16_b;
auto all_cores = sender_cores.merge(receiver_cores).merge(dummy_receiver_cores);
auto device = devices_[0];
std::unordered_map<CoreCoord, CoreRangeSet> sender_receiver_core_mapping;
sender_receiver_core_mapping[CoreCoord(0, 0)] = receiver_cores;
auto global_cb = tt::tt_metal::v1::experimental::CreateGlobalCircularBuffer(
device, sender_receiver_core_mapping, 3200, tt::tt_metal::BufferType::L1);
std::unordered_map<CoreCoord, CoreRangeSet> dummy_sender_receiver_core_mapping;
dummy_sender_receiver_core_mapping[CoreCoord(0, 0)] = dummy_receiver_cores;
auto dummy_global_cb = tt::tt_metal::v1::experimental::CreateGlobalCircularBuffer(
device, dummy_sender_receiver_core_mapping, 3200, tt::tt_metal::BufferType::L1);
{
tt::tt_metal::Program program = CreateProgram();
tt::tt_metal::KernelHandle blank_kernel = tt::tt_metal::CreateKernel(
program,
"tt_metal/kernels/dataflow/blank.cpp",
all_cores,
tt::tt_metal::DataMovementConfig{
.processor = tt::tt_metal::DataMovementProcessor::RISCV_0, .noc = tt::tt_metal::NOC::RISCV_0_default});
uint32_t remote_cb_index = 31;
uint32_t local_cb_index = 0;
tt::tt_metal::CircularBufferConfig global_cb_config = tt::tt_metal::CircularBufferConfig(cb_page_size);
global_cb_config.remote_index(remote_cb_index).set_page_size(cb_page_size).set_data_format(tile_format);
global_cb_config.index(local_cb_index).set_page_size(cb_page_size).set_data_format(tile_format);
EXPECT_THROW(global_cb_config.remote_index(2), std::exception);
EXPECT_THROW(
tt::tt_metal::v1::experimental::CreateCircularBuffer(
program, CoreRangeSet(CoreRange({3, 3})), global_cb_config, *global_cb),
std::exception);
auto remote_cb =
tt::tt_metal::v1::experimental::CreateCircularBuffer(program, receiver_cores, global_cb_config, *global_cb);
tt::tt_metal::detail::CompileProgram(device, program);
program.finalize(device);
tt::tt_metal::v1::experimental::UpdateDynamicCircularBufferAddress(program, remote_cb, *global_cb);
EXPECT_THROW(UpdateDynamicCircularBufferAddress(program, remote_cb, *dummy_global_cb), std::exception);
}
{
tt::tt_metal::Program program = CreateProgram();
tt::tt_metal::KernelHandle blank_kernel = tt::tt_metal::CreateKernel(
program,
"tt_metal/kernels/dataflow/blank.cpp",
all_cores,
tt::tt_metal::DataMovementConfig{
.processor = tt::tt_metal::DataMovementProcessor::RISCV_0, .noc = tt::tt_metal::NOC::RISCV_0_default});
uint32_t remote_cb_index = 16;
uint32_t local_cb_index = 17;
tt::tt_metal::CircularBufferConfig global_cb_config = tt::tt_metal::CircularBufferConfig(cb_page_size);
global_cb_config.remote_index(remote_cb_index).set_page_size(cb_page_size).set_data_format(tile_format);
global_cb_config.index(local_cb_index).set_page_size(cb_page_size).set_data_format(tile_format);
auto remote_cb =
tt::tt_metal::v1::experimental::CreateCircularBuffer(program, receiver_cores, global_cb_config, *global_cb);
tt::tt_metal::detail::CompileProgram(device, program);
EXPECT_THROW(program.finalize(device), std::exception);
}
}
15 changes: 6 additions & 9 deletions tests/tt_metal/tt_metal/api/test_global_semaphores.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,11 @@
#include "tt_metal/host_api.hpp"
#include "tt_metal/impl/buffers/global_semaphore.hpp"

TEST_F(DeviceFixture, InitializeGlobalSemaphores) {
TEST_F(DispatchFixture, InitializeGlobalSemaphores) {
CoreRangeSet cores(CoreRange({0, 0}, {1, 1}));

auto cores_vec = corerange_to_cores(cores);
for (unsigned int id = 0; id < num_devices_; id++) {
auto device = devices_.at(id);
for (auto device : devices_) {
{
uint32_t initial_value = 1;
auto global_semaphore = tt::tt_metal::CreateGlobalSemaphore(device, cores, initial_value);
Expand All @@ -44,16 +43,15 @@ TEST_F(DeviceFixture, InitializeGlobalSemaphores) {
}
}

TEST_F(DeviceFixture, CreateMultipleGlobalSemaphoresOnSameCore) {
TEST_F(DispatchFixture, CreateMultipleGlobalSemaphoresOnSameCore) {
std::vector<CoreRangeSet> cores{CoreRange({0, 0}, {1, 1}), CoreRange({0, 0}, {2, 2}), CoreRange({3, 3}, {5, 6})};
std::vector<std::vector<CoreCoord>> cores_vecs;
cores_vecs.reserve(cores.size());
std::vector<uint32_t> initial_values{1, 2, 3};
for (const auto& crs : cores) {
cores_vecs.push_back(corerange_to_cores(crs));
}
for (unsigned int id = 0; id < num_devices_; id++) {
auto device = devices_.at(id);
for (auto device : devices_) {
{
std::vector<std::unique_ptr<tt::tt_metal::GlobalSemaphore>> global_semaphores;
global_semaphores.reserve(cores.size());
Expand All @@ -77,12 +75,11 @@ TEST_F(DeviceFixture, CreateMultipleGlobalSemaphoresOnSameCore) {
}
}

TEST_F(DeviceFixture, ResetGlobalSemaphores) {
TEST_F(DispatchFixture, ResetGlobalSemaphores) {
CoreRangeSet cores(CoreRange({0, 0}, {1, 1}));

auto cores_vec = corerange_to_cores(cores);
for (unsigned int id = 0; id < num_devices_; id++) {
auto device = devices_.at(id);
for (auto device : devices_) {
{
uint32_t initial_value = 1;
std::vector<uint32_t> overwrite_value = {2};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,8 @@ bool cb_config_successful(Device* device, Program& program, const DummyProgramMu
// Need to use old APIs to read since we cannot allocate a buffer in the reserved space we're trying
// to read from
vector<uint32_t> cb_config_vector;
uint32_t cb_config_buffer_size = NUM_CIRCULAR_BUFFERS * UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG * sizeof(uint32_t);
uint32_t cb_config_buffer_size =
NUM_CIRCULAR_BUFFERS * UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * sizeof(uint32_t);

uint32_t l1_unreserved_base = device->get_base_allocator_addr(HalMemType::L1);
for (const CoreRange& core_range : program_config.cr_set.ranges()) {
Expand Down Expand Up @@ -831,7 +832,8 @@ TEST_F(CommandQueueSingleCardProgramFixture, TensixTestMultiCBSharedAddressSpace
uint32_t num_tiles = 2;
uint32_t cb_size = num_tiles * single_tile_size;

uint32_t cb_config_buffer_size = NUM_CIRCULAR_BUFFERS * UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG * sizeof(uint32_t);
uint32_t cb_config_buffer_size =
NUM_CIRCULAR_BUFFERS * UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * sizeof(uint32_t);
CoreCoord core_coord(0, 0);

for (Device* device : devices_) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,8 @@ TEST_F(DispatchFixture, TensixActiveEthTestCBsAcrossDifferentCoreTypes) {
uint32_t num_tiles = 2;
uint32_t cb_size = num_tiles * single_tile_size;

uint32_t cb_config_buffer_size = NUM_CIRCULAR_BUFFERS * UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG * sizeof(uint32_t);
uint32_t cb_config_buffer_size =
NUM_CIRCULAR_BUFFERS * UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * sizeof(uint32_t);

for (Device* device : devices_) {
CoreCoord worker_grid_size = device->compute_with_storage_grid_size();
Expand Down
Loading
Loading