diff --git a/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_allocation.cpp b/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_allocation.cpp index 624226798ecc..8c3321858408 100644 --- a/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_allocation.cpp +++ b/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_allocation.cpp @@ -41,8 +41,7 @@ void validate_cb_address( for (const auto& [buffer_index, expected_address] : address_per_buffer_index) { auto base_index = UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * buffer_index; - EXPECT_EQ( - expected_address >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES, cb_config_vector.at(base_index)); + EXPECT_EQ(expected_address, cb_config_vector.at(base_index)); } } } @@ -358,9 +357,8 @@ TEST_F(DeviceFixture, TensixTestUpdateCircularBufferPageSize) { for (const auto& [buffer_index, expected_address] : address_per_buffer_index) { auto base_index = UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * buffer_index; - EXPECT_EQ( - expected_address >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES, - cb_config_vector.at(base_index)); // address validation + EXPECT_EQ(expected_address, + cb_config_vector.at(base_index)); // address validation EXPECT_EQ( num_pages_per_buffer_index.at(buffer_index), cb_config_vector.at(base_index + 2)); // num pages validation @@ -391,9 +389,8 @@ TEST_F(DeviceFixture, TensixTestUpdateCircularBufferPageSize) { for (const auto& [buffer_index, expected_address] : address_per_buffer_index) { auto base_index = UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * buffer_index; - EXPECT_EQ( - expected_address >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES, - cb_config_vector.at(base_index)); // address validation + EXPECT_EQ(expected_address, + cb_config_vector.at(base_index)); // address validation EXPECT_EQ( num_pages_per_buffer_index.at(buffer_index), cb_config_vector.at(base_index + 2)); // num pages validation diff --git a/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_creation.cpp b/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_creation.cpp index b9d1c369973f..1278c8abb7d1 100644 --- a/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_creation.cpp +++ b/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_creation.cpp @@ -65,22 +65,10 @@ TEST_F(DeviceFixture, TensixTestCreateCircularBufferAtValidIndices) { uint32_t l1_unreserved_base = devices_.at(0)->get_base_allocator_addr(HalMemType::L1); std::map> golden_cb_config = { - {0, - {l1_unreserved_base >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES, - cb_config.page_size >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES, - cb_config.num_pages}}, - {2, - {l1_unreserved_base >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES, - cb_config.page_size >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES, - cb_config.num_pages}}, - {16, - {l1_unreserved_base >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES, - cb_config.page_size >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES, - cb_config.num_pages}}, - {24, - {l1_unreserved_base >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES, - cb_config.page_size >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES, - cb_config.num_pages}}}; + {0, {l1_unreserved_base, cb_config.page_size, cb_config.num_pages}}, + {2, {l1_unreserved_base, cb_config.page_size, cb_config.num_pages}}, + {16, {l1_unreserved_base, cb_config.page_size, cb_config.num_pages}}, + {24, {l1_unreserved_base, cb_config.page_size, cb_config.num_pages}}}; std::map data_format_spec = { {0, cb_config.data_format}, {2, cb_config.data_format}, diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_EnqueueProgram.cpp index 356f7766820d..a7ecd4a8e587 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_EnqueueProgram.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_EnqueueProgram.cpp @@ -110,8 +110,8 @@ bool cb_config_successful(Device* device, Program& program, const DummyProgramMu const uint32_t index = program_config.cb_config_vector[i].cb_id * sizeof(uint32_t); const uint32_t cb_num_pages = program_config.cb_config_vector[i].num_pages; const uint32_t cb_size = cb_num_pages * program_config.cb_config_vector[i].page_size; - const bool addr_match = cb_config_vector.at(index) == ((cb_addr) >> 4); - const bool size_match = cb_config_vector.at(index + 1) == (cb_size >> 4); + const bool addr_match = cb_config_vector.at(index) == cb_addr; + const bool size_match = cb_config_vector.at(index + 1) == cb_size; const bool num_pages_match = cb_config_vector.at(index + 2) == cb_num_pages; pass &= (addr_match and size_match and num_pages_match); diff --git a/tests/tt_metal/tt_metal/dispatch/random_program_fixture.hpp b/tests/tt_metal/tt_metal/dispatch/random_program_fixture.hpp index 55c9d3d40a46..fc87c2b58df1 100644 --- a/tests/tt_metal/tt_metal/dispatch/random_program_fixture.hpp +++ b/tests/tt_metal/tt_metal/dispatch/random_program_fixture.hpp @@ -9,6 +9,7 @@ #include "llrt/hal.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" +#include "tt_metal/hw/inc/circular_buffer_constants.h" #include "tt_metal/impl/kernels/kernel.hpp" #include "tt_metal/common/tt_backend_api_types.hpp" #include "dispatch_test_utils.hpp" @@ -141,13 +142,14 @@ class RandomProgramFixture : virtual public CommandQueueSingleCardProgramFixture const uint32_t num_cbs = this->generate_random_num(min, max); std::vector cb_page_sizes; for (uint32_t cb_idx = 0; cb_idx < num_cbs; cb_idx++) { - const uint32_t cb_page_size = this->generate_random_num(MIN_CB_PAGE_SIZE, MAX_CB_PAGE_SIZE, 16); + const uint32_t cb_page_size = + this->generate_random_num(MIN_CB_PAGE_SIZE, MAX_CB_PAGE_SIZE, CIRCULAR_BUFFER_COMPUTE_WORD_SIZE); const uint32_t cb_total_size = this->generate_random_num(MIN_CB_TOTAL_SIZE, MAX_CB_TOTAL_SIZE, cb_page_size); CircularBufferConfig config = CircularBufferConfig(cb_total_size, {{cb_idx, tt::DataFormat::Float16_b}}) .set_page_size(cb_idx, cb_page_size); CreateCircularBuffer(program, cores, config); - cb_page_sizes.push_back(cb_page_size / 16); + cb_page_sizes.push_back(cb_page_size); } return cb_page_sizes; } diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp index f3e64ae1b3ec..3d5ee14d71be 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp @@ -45,7 +45,7 @@ void kernel_main() { (uint32_t tt_l1_ptr*)(kernel_config_base + mailboxes->launch[mailboxes->launch_msg_rd_ptr].kernel_config.local_cb_offset); uint32_t cb_val = reinterpret_cast(cb_l1_base + i * 4)[3]; - uint32_t expected = ((i + 1) * page_size) >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES; + uint32_t expected = ((i + 1) * page_size); if (cb_val != expected) { DPRINT << "Problem with CB idx: " << i << " Expected: " << expected << " Got: " << cb_val << ENDL(); while (true); // Purposefully hang the kernel if CBs did not arrive correctly diff --git a/tt_metal/hw/inc/circular_buffer.h b/tt_metal/hw/inc/circular_buffer.h index 35942bb49d72..68a2b3436cc9 100644 --- a/tt_metal/hw/inc/circular_buffer.h +++ b/tt_metal/hw/inc/circular_buffer.h @@ -103,3 +103,9 @@ FORCE_INLINE RemoteSenderCBInterface& get_remote_sender_cb_interface(uint32_t cb FORCE_INLINE RemoteReceiverCBInterface& get_remote_receiver_cb_interface(uint32_t cb_id) { return cb_interface[cb_id].remote_receiver_cb_interface; } + +#if defined(COMPILE_FOR_TRISC) +constexpr uint32_t cb_addr_shift = CIRCULAR_BUFFER_COMPUTE_ADDR_SHIFT; +#else +constexpr uint32_t cb_addr_shift = 0; +#endif diff --git a/tt_metal/hw/inc/circular_buffer_constants.h b/tt_metal/hw/inc/circular_buffer_constants.h index 8ff3fda763b7..3b80937753ce 100644 --- a/tt_metal/hw/inc/circular_buffer_constants.h +++ b/tt_metal/hw/inc/circular_buffer_constants.h @@ -9,5 +9,5 @@ constexpr static std::uint32_t NUM_CIRCULAR_BUFFERS = 32; constexpr static std::uint32_t UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG = 4; constexpr static std::uint32_t UINT32_WORDS_PER_REMOTE_CIRCULAR_BUFFER_CONFIG = 2; -constexpr static std::uint32_t CIRCULAR_BUFFER_WORD_SIZE_BYTES = 16; -constexpr static std::uint32_t CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES = 4; +constexpr static std::uint32_t CIRCULAR_BUFFER_COMPUTE_WORD_SIZE = 16; +constexpr static std::uint32_t CIRCULAR_BUFFER_COMPUTE_ADDR_SHIFT = 4; diff --git a/tt_metal/hw/inc/circular_buffer_init.h b/tt_metal/hw/inc/circular_buffer_init.h index 29f2af20cb4d..b44025080222 100644 --- a/tt_metal/hw/inc/circular_buffer_init.h +++ b/tt_metal/hw/inc/circular_buffer_init.h @@ -25,10 +25,10 @@ inline void setup_local_cb_read_write_interfaces( for (uint32_t cb_id = start_cb_index; cb_id < max_cb_index; cb_id++) { // NOTE: fifo_addr, fifo_size and fifo_limit in 16B words! - uint32_t fifo_addr = circular_buffer_config_addr[0]; - uint32_t fifo_size = circular_buffer_config_addr[1]; + uint32_t fifo_addr = circular_buffer_config_addr[0] >> cb_addr_shift; + uint32_t fifo_size = circular_buffer_config_addr[1] >> cb_addr_shift; uint32_t fifo_num_pages = circular_buffer_config_addr[2]; - uint32_t fifo_page_size = circular_buffer_config_addr[3]; + uint32_t fifo_page_size = circular_buffer_config_addr[3] >> cb_addr_shift; uint32_t fifo_limit = fifo_addr + fifo_size; LocalCBInterface& local_interface = get_local_cb_interface(cb_id); diff --git a/tt_metal/hw/inc/dataflow_api.h b/tt_metal/hw/inc/dataflow_api.h index f5ee832f60b5..3c6e76d3e0ce 100644 --- a/tt_metal/hw/inc/dataflow_api.h +++ b/tt_metal/hw/inc/dataflow_api.h @@ -410,7 +410,7 @@ constexpr inline DataFormat get_dataformat(const std::int32_t operand) { FORCE_INLINE uint32_t get_write_ptr(uint32_t operand) { // return byte address (fifo_wr_ptr is 16B address) - uint32_t wr_ptr_bytes = get_local_cb_interface(operand).fifo_wr_ptr << 4; + uint32_t wr_ptr_bytes = get_local_cb_interface(operand).fifo_wr_ptr; return wr_ptr_bytes; } @@ -429,7 +429,7 @@ uint32_t get_write_ptr(uint32_t operand) { FORCE_INLINE uint32_t get_read_ptr(uint32_t operand) { // return byte address (fifo_rd_ptr is 16B address) - uint32_t rd_ptr_bytes = get_local_cb_interface(operand).fifo_rd_ptr << 4; + uint32_t rd_ptr_bytes = get_local_cb_interface(operand).fifo_rd_ptr; return rd_ptr_bytes; } diff --git a/tt_metal/hw/inc/debug/dprint_tile.h b/tt_metal/hw/inc/debug/dprint_tile.h index 85aa838d8e55..1e737f66cf12 100644 --- a/tt_metal/hw/inc/debug/dprint_tile.h +++ b/tt_metal/hw/inc/debug/dprint_tile.h @@ -17,13 +17,12 @@ #endif // Macros for printing circular buffer internals -#define CB_RD_PTR(id) (get_local_cb_interface(id).fifo_rd_ptr << 4) // only valid in unpacker thread -#define CB_RD_LIM(id) ((get_local_cb_interface(id).fifo_limit_plus_1 - 1) << 4) -#define CB_RD_SZ(id) (get_local_cb_interface(id).fifo_size << 4) +#define CB_RD_PTR(id) (get_local_cb_interface(id).fifo_rd_ptr << cb_addr_shift) // only valid in unpacker thread +#define CB_RD_SZ(id) (get_local_cb_interface(id).fifo_size << cb_addr_shift) -#define CB_WR_PTR(id) (get_local_cb_interface(id).fifo_wr_ptr << 4) // only valid in packer thread +#define CB_WR_PTR(id) (get_local_cb_interface(id).fifo_wr_ptr << cb_addr_shift) // only valid in packer thread #define CB_PAGE_COUNT(id) (get_local_cb_interface(id).fifo_num_pages) -#define CB_PAGE_SIZE(id) (get_local_cb_interface(id).fifo_page_size << 4) +#define CB_PAGE_SIZE(id) (get_local_cb_interface(id).fifo_page_size << cb_addr_shift) // // Slices/samples elements of a tile 'itile' from cb using a given numpy style slice object SliceRange. diff --git a/tt_metal/hw/inc/remote_circular_buffer_api.h b/tt_metal/hw/inc/remote_circular_buffer_api.h index 712458d62b14..044e3705f939 100644 --- a/tt_metal/hw/inc/remote_circular_buffer_api.h +++ b/tt_metal/hw/inc/remote_circular_buffer_api.h @@ -4,6 +4,7 @@ #pragma once +#include "tt_metal/hw/inc/circular_buffer.h" #include "tt_metal/hw/inc/debug/assert.h" #include "utils/utils.h" #ifndef COMPILE_FOR_TRISC @@ -242,11 +243,12 @@ FORCE_INLINE void align_local_cbs_to_remote_cb( // We assert that the offset of sender and receiver common attributes are the same // so we can use either interface here const RemoteReceiverCBInterface& remote_cb = get_remote_receiver_cb_interface(remote_cb_index); - uint32_t fifo_limit = remote_cb.fifo_limit_page_aligned >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES; - uint32_t fifo_size = fifo_limit - (remote_cb.fifo_start_addr >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES); - uint32_t fifo_ptr = remote_cb.fifo_rd_ptr >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES; + uint32_t fifo_limit = remote_cb.fifo_limit_page_aligned >> cb_addr_shift; + uint32_t fifo_size = fifo_limit - (remote_cb.fifo_start_addr >> cb_addr_shift); + uint32_t fifo_ptr = remote_cb.fifo_rd_ptr >> cb_addr_shift; for (uint32_t i = 0; i < num_local_cbs; i++) { LocalCBInterface& local_cb = get_local_cb_interface(local_cb_indices[i]); + ASSERT(fifo_size % local_cb.fifo_page_size == 0); uint32_t fifo_num_pages = fifo_size / local_cb.fifo_page_size; local_cb.fifo_limit = fifo_limit; local_cb.fifo_size = fifo_size; diff --git a/tt_metal/impl/buffers/circular_buffer_types.cpp b/tt_metal/impl/buffers/circular_buffer_types.cpp index a14738e0edb7..7877c2648206 100644 --- a/tt_metal/impl/buffers/circular_buffer_types.cpp +++ b/tt_metal/impl/buffers/circular_buffer_types.cpp @@ -43,10 +43,6 @@ CircularBufferConfig& CircularBufferConfig::set_page_size(uint8_t buffer_index, if (this->total_size_ % page_size != 0) { TT_THROW("Total circular buffer size {} B must be divisible by page size {} B", this->total_size_, page_size); } - // TODO: Should use CIRCULAR_BUFFER_WORD_SIZE_BYTES here - if (page_size % sizeof(uint32_t) != 0) { - TT_THROW("Page size must be divisible by sizeof(uint32_t) because buffers holds uint32_t values"); - } this->page_sizes_[buffer_index] = page_size; return *this; diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index 7b90f313dbe1..e0ef8b96cfc4 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -901,8 +901,8 @@ void EnqueueProgramCommand::assemble_device_commands( circular_buffers_on_corerange.size()); for (const std::shared_ptr& cb : circular_buffers_on_corerange) { program_command_sequence.circular_buffers_on_core_ranges[i].emplace_back(cb); - const uint32_t cb_address = cb->address() >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES; - const uint32_t cb_size = cb->size() >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES; + const uint32_t cb_address = cb->address(); + const uint32_t cb_size = cb->size(); for (const auto& buffer_index : cb->local_buffer_indices()) { // 1 cmd for all 32 buffer indices, populate with real data for specified indices // cb config payload @@ -910,7 +910,7 @@ void EnqueueProgramCommand::assemble_device_commands( cb_config_payload[base_index] = cb_address; cb_config_payload[base_index + 1] = cb_size; cb_config_payload[base_index + 2] = cb->num_pages(buffer_index); - cb_config_payload[base_index + 3] = cb->page_size(buffer_index) >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES; + cb_config_payload[base_index + 3] = cb->page_size(buffer_index); max_index = std::max(max_index, base_index + UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG); } for (const auto& buffer_index : cb->remote_buffer_indices()) { @@ -1363,8 +1363,8 @@ void EnqueueProgramCommand::update_device_commands( for (const auto& cbs_on_core_range : cached_program_command_sequence.circular_buffers_on_core_ranges) { uint32_t* cb_config_payload = cached_program_command_sequence.cb_configs_payloads[i]; for (const std::shared_ptr& cb : cbs_on_core_range) { - const uint32_t cb_address = cb->address() >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES; - const uint32_t cb_size = cb->size() >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES; + const uint32_t cb_address = cb->address(); + const uint32_t cb_size = cb->size(); for (const auto& buffer_index : cb->local_buffer_indices()) { // 1 cmd for all 32 buffer indices, populate with real data for specified indices @@ -1373,7 +1373,7 @@ void EnqueueProgramCommand::update_device_commands( cb_config_payload[base_index] = cb_address; cb_config_payload[base_index + 1] = cb_size; cb_config_payload[base_index + 2] = cb->num_pages(buffer_index); - cb_config_payload[base_index + 3] = cb->page_size(buffer_index) >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES; + cb_config_payload[base_index + 3] = cb->page_size(buffer_index); } for (const auto& buffer_index : cb->remote_buffer_indices()) { const uint32_t base_index = remote_offset_index + (NUM_CIRCULAR_BUFFERS - 1 - buffer_index) * diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index 09f90b8017b9..216ffcae5b32 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -776,7 +776,7 @@ void detail::Program_::allocate_circular_buffers(const Device *device) { } } } - + computed_addr = align(computed_addr, device->get_allocator_alignment()); for (const CoreRange &core_range : circular_buffer->core_ranges().ranges()) { for (CircularBufferAllocator &cb_allocator : this->cb_allocators_) { if (cb_allocator.core_range.intersects(core_range)) { diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index 38f061df9876..e59f14430cd6 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -745,13 +745,10 @@ bool ConfigureDeviceWithProgram(Device* device, Program& program, bool fd_bootlo uint32_t size_in_bytes = circular_buffer->size(); uint32_t num_pages = circular_buffer->num_pages(buffer_index); uint32_t page_size = size_in_bytes / num_pages; - circular_buffer_config_vec[base_index] = - addr_in_bytes >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES; // convert to addr in 16B words - circular_buffer_config_vec[base_index + 1] = - size_in_bytes >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES; // convert to addr in 16B words + circular_buffer_config_vec[base_index] = addr_in_bytes; // convert to addr in 16B words + circular_buffer_config_vec[base_index + 1] = size_in_bytes; // convert to addr in 16B words circular_buffer_config_vec[base_index + 2] = num_pages; - circular_buffer_config_vec[base_index + 3] = - page_size >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES; + circular_buffer_config_vec[base_index + 3] = page_size; } for (uint32_t buffer_index : circular_buffer->remote_buffer_indices()) { uint32_t base_index =