Skip to content

Commit

Permalink
#0: Remove incorrect assertion on cb page sizes being a multiple of 4…
Browse files Browse the repository at this point in the history
… (should have been 16) and update circular buffer init/usage to support any page size for cbs that aren't used by compute. Compute CBs still require 16B multiple for address/sizes
  • Loading branch information
tt-aho committed Dec 5, 2024
1 parent d6c5a99 commit 604612d
Show file tree
Hide file tree
Showing 13 changed files with 41 additions and 57 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,7 @@ void validate_cb_address(

for (const auto& [buffer_index, expected_address] : address_per_buffer_index) {
auto base_index = UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * buffer_index;
EXPECT_EQ(
expected_address >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES, cb_config_vector.at(base_index));
EXPECT_EQ(expected_address, cb_config_vector.at(base_index));
}
}
}
Expand Down Expand Up @@ -358,9 +357,8 @@ TEST_F(DeviceFixture, TensixTestUpdateCircularBufferPageSize) {

for (const auto& [buffer_index, expected_address] : address_per_buffer_index) {
auto base_index = UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * buffer_index;
EXPECT_EQ(
expected_address >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES,
cb_config_vector.at(base_index)); // address validation
EXPECT_EQ(expected_address,
cb_config_vector.at(base_index)); // address validation
EXPECT_EQ(
num_pages_per_buffer_index.at(buffer_index),
cb_config_vector.at(base_index + 2)); // num pages validation
Expand Down Expand Up @@ -391,9 +389,8 @@ TEST_F(DeviceFixture, TensixTestUpdateCircularBufferPageSize) {

for (const auto& [buffer_index, expected_address] : address_per_buffer_index) {
auto base_index = UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * buffer_index;
EXPECT_EQ(
expected_address >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES,
cb_config_vector.at(base_index)); // address validation
EXPECT_EQ(expected_address,
cb_config_vector.at(base_index)); // address validation
EXPECT_EQ(
num_pages_per_buffer_index.at(buffer_index),
cb_config_vector.at(base_index + 2)); // num pages validation
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,22 +65,10 @@ TEST_F(DeviceFixture, TensixTestCreateCircularBufferAtValidIndices) {

uint32_t l1_unreserved_base = devices_.at(0)->get_base_allocator_addr(HalMemType::L1);
std::map<uint8_t, std::vector<uint32_t>> golden_cb_config = {
{0,
{l1_unreserved_base >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES,
cb_config.page_size >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES,
cb_config.num_pages}},
{2,
{l1_unreserved_base >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES,
cb_config.page_size >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES,
cb_config.num_pages}},
{16,
{l1_unreserved_base >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES,
cb_config.page_size >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES,
cb_config.num_pages}},
{24,
{l1_unreserved_base >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES,
cb_config.page_size >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES,
cb_config.num_pages}}};
{0, {l1_unreserved_base, cb_config.page_size, cb_config.num_pages}},
{2, {l1_unreserved_base, cb_config.page_size, cb_config.num_pages}},
{16, {l1_unreserved_base, cb_config.page_size, cb_config.num_pages}},
{24, {l1_unreserved_base, cb_config.page_size, cb_config.num_pages}}};
std::map<uint8_t, tt::DataFormat> data_format_spec = {
{0, cb_config.data_format},
{2, cb_config.data_format},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ void kernel_main() {
(uint32_t tt_l1_ptr*)(kernel_config_base +
mailboxes->launch[mailboxes->launch_msg_rd_ptr].kernel_config.local_cb_offset);
uint32_t cb_val = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(cb_l1_base + i * 4)[3];
uint32_t expected = ((i + 1) * page_size) >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES;
uint32_t expected = ((i + 1) * page_size);
if (cb_val != expected) {
DPRINT << "Problem with CB idx: " << i << " Expected: " << expected << " Got: " << cb_val << ENDL();
while (true); // Purposefully hang the kernel if CBs did not arrive correctly
Expand Down
6 changes: 6 additions & 0 deletions tt_metal/hw/inc/circular_buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,9 @@ FORCE_INLINE RemoteSenderCBInterface& get_remote_sender_cb_interface(uint32_t cb
FORCE_INLINE RemoteReceiverCBInterface& get_remote_receiver_cb_interface(uint32_t cb_id) {
return cb_interface[cb_id].remote_receiver_cb_interface;
}

#if defined(COMPILE_FOR_TRISC)
constexpr uint32_t cb_addr_shift = CIRCULAR_BUFFER_COMPUTE_ADDR_SHIFT;
#else
constexpr uint32_t cb_addr_shift = 0;
#endif
3 changes: 1 addition & 2 deletions tt_metal/hw/inc/circular_buffer_constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,4 @@
constexpr static std::uint32_t NUM_CIRCULAR_BUFFERS = 32;
constexpr static std::uint32_t UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG = 4;
constexpr static std::uint32_t UINT32_WORDS_PER_REMOTE_CIRCULAR_BUFFER_CONFIG = 2;
constexpr static std::uint32_t CIRCULAR_BUFFER_WORD_SIZE_BYTES = 16;
constexpr static std::uint32_t CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES = 4;
constexpr static std::uint32_t CIRCULAR_BUFFER_COMPUTE_ADDR_SHIFT = 4;
6 changes: 3 additions & 3 deletions tt_metal/hw/inc/circular_buffer_init.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@ inline void setup_local_cb_read_write_interfaces(

for (uint32_t cb_id = start_cb_index; cb_id < max_cb_index; cb_id++) {
// NOTE: fifo_addr, fifo_size and fifo_limit in 16B words!
uint32_t fifo_addr = circular_buffer_config_addr[0];
uint32_t fifo_size = circular_buffer_config_addr[1];
uint32_t fifo_addr = circular_buffer_config_addr[0] >> cb_addr_shift;
uint32_t fifo_size = circular_buffer_config_addr[1] >> cb_addr_shift;
uint32_t fifo_num_pages = circular_buffer_config_addr[2];
uint32_t fifo_page_size = circular_buffer_config_addr[3];
uint32_t fifo_page_size = circular_buffer_config_addr[3] >> cb_addr_shift;
uint32_t fifo_limit = fifo_addr + fifo_size;

LocalCBInterface& local_interface = get_local_cb_interface(cb_id);
Expand Down
4 changes: 2 additions & 2 deletions tt_metal/hw/inc/dataflow_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -410,7 +410,7 @@ constexpr inline DataFormat get_dataformat(const std::int32_t operand) {
FORCE_INLINE
uint32_t get_write_ptr(uint32_t operand) {
// return byte address (fifo_wr_ptr is 16B address)
uint32_t wr_ptr_bytes = get_local_cb_interface(operand).fifo_wr_ptr << 4;
uint32_t wr_ptr_bytes = get_local_cb_interface(operand).fifo_wr_ptr;
return wr_ptr_bytes;
}

Expand All @@ -429,7 +429,7 @@ uint32_t get_write_ptr(uint32_t operand) {
FORCE_INLINE
uint32_t get_read_ptr(uint32_t operand) {
// return byte address (fifo_rd_ptr is 16B address)
uint32_t rd_ptr_bytes = get_local_cb_interface(operand).fifo_rd_ptr << 4;
uint32_t rd_ptr_bytes = get_local_cb_interface(operand).fifo_rd_ptr;
return rd_ptr_bytes;
}

Expand Down
9 changes: 4 additions & 5 deletions tt_metal/hw/inc/debug/dprint_tile.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,12 @@
#endif

// Macros for printing circular buffer internals
#define CB_RD_PTR(id) (get_local_cb_interface(id).fifo_rd_ptr << 4) // only valid in unpacker thread
#define CB_RD_LIM(id) ((get_local_cb_interface(id).fifo_limit_plus_1 - 1) << 4)
#define CB_RD_SZ(id) (get_local_cb_interface(id).fifo_size << 4)
#define CB_RD_PTR(id) (get_local_cb_interface(id).fifo_rd_ptr << cb_addr_shift) // only valid in unpacker thread
#define CB_RD_SZ(id) (get_local_cb_interface(id).fifo_size << cb_addr_shift)

#define CB_WR_PTR(id) (get_local_cb_interface(id).fifo_wr_ptr << 4) // only valid in packer thread
#define CB_WR_PTR(id) (get_local_cb_interface(id).fifo_wr_ptr << cb_addr_shift) // only valid in packer thread
#define CB_PAGE_COUNT(id) (get_local_cb_interface(id).fifo_num_pages)
#define CB_PAGE_SIZE(id) (get_local_cb_interface(id).fifo_page_size << 4)
#define CB_PAGE_SIZE(id) (get_local_cb_interface(id).fifo_page_size << cb_addr_shift)

//
// Slices/samples elements of a tile 'itile' from cb using a given numpy style slice object SliceRange.
Expand Down
8 changes: 5 additions & 3 deletions tt_metal/hw/inc/remote_circular_buffer_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#pragma once

#include "tt_metal/hw/inc/circular_buffer.h"
#include "tt_metal/hw/inc/debug/assert.h"
#include "utils/utils.h"
#ifndef COMPILE_FOR_TRISC
Expand Down Expand Up @@ -242,11 +243,12 @@ FORCE_INLINE void align_local_cbs_to_remote_cb(
// We assert that the offset of sender and receiver common attributes are the same
// so we can use either interface here
const RemoteReceiverCBInterface& remote_cb = get_remote_receiver_cb_interface(remote_cb_index);
uint32_t fifo_limit = remote_cb.fifo_limit_page_aligned >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES;
uint32_t fifo_size = fifo_limit - (remote_cb.fifo_start_addr >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES);
uint32_t fifo_ptr = remote_cb.fifo_rd_ptr >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES;
uint32_t fifo_limit = remote_cb.fifo_limit_page_aligned >> cb_addr_shift;
uint32_t fifo_size = fifo_limit - (remote_cb.fifo_start_addr >> cb_addr_shift);
uint32_t fifo_ptr = remote_cb.fifo_rd_ptr >> cb_addr_shift;
for (uint32_t i = 0; i < num_local_cbs; i++) {
LocalCBInterface& local_cb = get_local_cb_interface(local_cb_indices[i]);
ASSERT(fifo_size % local_cb.fifo_page_size == 0);
uint32_t fifo_num_pages = fifo_size / local_cb.fifo_page_size;
local_cb.fifo_limit = fifo_limit;
local_cb.fifo_size = fifo_size;
Expand Down
4 changes: 0 additions & 4 deletions tt_metal/impl/buffers/circular_buffer_types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,6 @@ CircularBufferConfig& CircularBufferConfig::set_page_size(uint8_t buffer_index,
if (this->total_size_ % page_size != 0) {
TT_THROW("Total circular buffer size {} B must be divisible by page size {} B", this->total_size_, page_size);
}
// TODO: Should use CIRCULAR_BUFFER_WORD_SIZE_BYTES here
if (page_size % sizeof(uint32_t) != 0) {
TT_THROW("Page size must be divisible by sizeof(uint32_t) because buffers holds uint32_t values");
}

this->page_sizes_[buffer_index] = page_size;
return *this;
Expand Down
12 changes: 6 additions & 6 deletions tt_metal/impl/dispatch/command_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -901,16 +901,16 @@ void EnqueueProgramCommand::assemble_device_commands(
circular_buffers_on_corerange.size());
for (const std::shared_ptr<CircularBuffer>& cb : circular_buffers_on_corerange) {
program_command_sequence.circular_buffers_on_core_ranges[i].emplace_back(cb);
const uint32_t cb_address = cb->address() >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES;
const uint32_t cb_size = cb->size() >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES;
const uint32_t cb_address = cb->address();
const uint32_t cb_size = cb->size();
for (const auto& buffer_index : cb->local_buffer_indices()) {
// 1 cmd for all 32 buffer indices, populate with real data for specified indices
// cb config payload
const uint32_t base_index = UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * buffer_index;
cb_config_payload[base_index] = cb_address;
cb_config_payload[base_index + 1] = cb_size;
cb_config_payload[base_index + 2] = cb->num_pages(buffer_index);
cb_config_payload[base_index + 3] = cb->page_size(buffer_index) >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES;
cb_config_payload[base_index + 3] = cb->page_size(buffer_index);
max_index = std::max(max_index, base_index + UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG);
}
for (const auto& buffer_index : cb->remote_buffer_indices()) {
Expand Down Expand Up @@ -1363,8 +1363,8 @@ void EnqueueProgramCommand::update_device_commands(
for (const auto& cbs_on_core_range : cached_program_command_sequence.circular_buffers_on_core_ranges) {
uint32_t* cb_config_payload = cached_program_command_sequence.cb_configs_payloads[i];
for (const std::shared_ptr<CircularBuffer>& cb : cbs_on_core_range) {
const uint32_t cb_address = cb->address() >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES;
const uint32_t cb_size = cb->size() >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES;
const uint32_t cb_address = cb->address();
const uint32_t cb_size = cb->size();
for (const auto& buffer_index : cb->local_buffer_indices()) {
// 1 cmd for all 32 buffer indices, populate with real data for specified indices

Expand All @@ -1373,7 +1373,7 @@ void EnqueueProgramCommand::update_device_commands(
cb_config_payload[base_index] = cb_address;
cb_config_payload[base_index + 1] = cb_size;
cb_config_payload[base_index + 2] = cb->num_pages(buffer_index);
cb_config_payload[base_index + 3] = cb->page_size(buffer_index) >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES;
cb_config_payload[base_index + 3] = cb->page_size(buffer_index);
}
for (const auto& buffer_index : cb->remote_buffer_indices()) {
const uint32_t base_index = remote_offset_index + (NUM_CIRCULAR_BUFFERS - 1 - buffer_index) *
Expand Down
2 changes: 1 addition & 1 deletion tt_metal/impl/program/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -776,7 +776,7 @@ void detail::Program_::allocate_circular_buffers(const Device *device) {
}
}
}

computed_addr = align(computed_addr, device->get_allocator_alignment());
for (const CoreRange &core_range : circular_buffer->core_ranges().ranges()) {
for (CircularBufferAllocator &cb_allocator : this->cb_allocators_) {
if (cb_allocator.core_range.intersects(core_range)) {
Expand Down
9 changes: 3 additions & 6 deletions tt_metal/tt_metal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -745,13 +745,10 @@ bool ConfigureDeviceWithProgram(Device* device, Program& program, bool fd_bootlo
uint32_t size_in_bytes = circular_buffer->size();
uint32_t num_pages = circular_buffer->num_pages(buffer_index);
uint32_t page_size = size_in_bytes / num_pages;
circular_buffer_config_vec[base_index] =
addr_in_bytes >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES; // convert to addr in 16B words
circular_buffer_config_vec[base_index + 1] =
size_in_bytes >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES; // convert to addr in 16B words
circular_buffer_config_vec[base_index] = addr_in_bytes; // convert to addr in 16B words
circular_buffer_config_vec[base_index + 1] = size_in_bytes; // convert to addr in 16B words
circular_buffer_config_vec[base_index + 2] = num_pages;
circular_buffer_config_vec[base_index + 3] =
page_size >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES;
circular_buffer_config_vec[base_index + 3] = page_size;
}
for (uint32_t buffer_index : circular_buffer->remote_buffer_indices()) {
uint32_t base_index =
Expand Down

0 comments on commit 604612d

Please sign in to comment.