#8706: Add accessors to get aligned page size, aligned buffer size and alignment on buffer
abhullar-tt committed Jun 24, 2024
1 parent 5212c7d commit 9d208cc
Showing 21 changed files with 172 additions and 132 deletions.
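The changes below replace uses of the global ADDRESS_ALIGNMENT constant with per-buffer accessors, so DRAM-backed and L1-backed buffers can report different alignment requirements. The accessor definitions themselves are not in the hunks shown here; the following is only a rough sketch of the shape implied by the commit title and the call sites below (buffer()->alignment(), page_size(), size()), with hypothetical member names and padding arithmetic:

// Sketch only: member names and the padding arithmetic are assumptions
// inferred from the commit title, not copied from the tt_metal sources.
#include <cstdint>

class BufferSketch {
public:
    // Alignment in bytes required by the backing memory space
    // (DRAM_ALIGNMENT for DRAM buffers, L1_ALIGNMENT for L1 buffers).
    uint32_t alignment() const { return alignment_; }

    // Page size rounded up to that alignment.
    uint32_t aligned_page_size() const {
        return ((page_size_ + alignment_ - 1) / alignment_) * alignment_;
    }

    // Buffer size with every page occupying an aligned slot.
    uint32_t aligned_size() const {
        return (size_ / page_size_) * aligned_page_size();
    }

private:
    uint32_t size_ = 4096;
    uint32_t page_size_ = 2048;
    uint32_t alignment_ = 32;
};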
2 changes: 1 addition & 1 deletion tt_eager/tt_dnn/op_library/all_gather/all_gather_op.cpp
@@ -35,7 +35,7 @@ void AllGather::validate(const std::vector<Tensor> &input_tensors) const {
const auto& layout = input_tensors[0].get_layout();
const auto& dtype = input_tensors[0].get_dtype();
const auto& page_size = input_tensors[0].buffer()->page_size();
TT_FATAL(page_size % ADDRESS_ALIGNMENT == 0, "All Gather currently requires aligned pages");
TT_FATAL(page_size % input_tensors[0].buffer()->alignment() == 0, "All Gather currently requires aligned pages");

// TODO: This can be removed by passing two page sizes, actual and aligned to be used for address offsets
// Buffer sizes also need to take this aligned page size into consideration
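The validation changes in all_gather, concat, and repeat below all follow the same pattern: a row-major page (or last-dimension byte width) must be a multiple of the buffer's own alignment rather than of the old global constant. A self-contained sketch of the check, using an assumed 32 B DRAM alignment:

#include <cassert>
#include <cstdint>

// Returns true when a row-major last dimension lands on an aligned boundary,
// which is what the TT_FATAL checks in this commit enforce.
bool last_dim_is_aligned(uint32_t last_dim_elems, uint32_t element_size_bytes,
                         uint32_t buffer_alignment_bytes) {
    return (last_dim_elems * element_size_bytes) % buffer_alignment_bytes == 0;
}

int main() {
    assert(last_dim_is_aligned(1024, 2, 32));  // 2048 B row, multiple of 32 B: ok
    assert(!last_dim_is_aligned(10, 2, 32));   // 20 B row: the op would TT_FATAL
}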
4 changes: 2 additions & 2 deletions tt_eager/tt_dnn/op_library/concat/concat_op.cpp
@@ -43,7 +43,7 @@ void Concat::validate(const std::vector<Tensor> &input_tensors) const {
TT_FATAL(curr_shape == shape_first, "concat tensors differ in shape across non-concat dimensions.");
if (in_ref.get_layout() == Layout::ROW_MAJOR && this->dim == shape_first.rank() - 1) {
TT_FATAL(
(in_ref.get_legacy_shape()[this->dim] * in_ref.element_size()) % ADDRESS_ALIGNMENT == 0,
(in_ref.get_legacy_shape()[this->dim] * in_ref.element_size()) % in_ref.buffer()->alignment() == 0,
"Current concat implementation requires aligned last dim when concatting on last dim");
}
TT_FATAL(in_ref.is_sharded() == shard_first, "All tensors must be sharded or all must be interleaved");
@@ -114,7 +114,7 @@ Tensor concat(std::vector<Tensor> &input_tensors, const std::int64_t dim, const
if (input_tensors[0].get_layout() == Layout::ROW_MAJOR && normalized_dim == ref_rank - 1) {
for (const auto &input_tensor : input_tensors) {
TT_FATAL(
(input_tensor.get_legacy_shape()[dim] * input_tensor.element_size()) % ADDRESS_ALIGNMENT ==
(input_tensor.get_legacy_shape()[dim] * input_tensor.element_size()) % input_tensor.buffer()->alignment() ==
0,
"Current concat implementation requires aligned last dim when concatting on last dim");
}
@@ -532,7 +532,7 @@ operation::ProgramWithCallbacks concat_multi_core(
uint32_t single_page_size;
if (rm_layout) {
num_output_pages = output.volume() / output.get_legacy_shape()[-1];
single_page_size = align(output.element_size() * output.get_legacy_shape()[-1], ADDRESS_ALIGNMENT);
single_page_size = align(output.element_size() * output.get_legacy_shape()[-1], output.buffer()->alignment());
} else {
num_output_pages = output.volume() / TILE_HW;
single_page_size = tt_metal::detail::TileSize(cb_data_format);
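Here and in the repeat and sharded program factories below, row-major page sizes are rounded up with align(..., buffer->alignment()) instead of align(..., ADDRESS_ALIGNMENT). A minimal stand-in for that helper, assuming it rounds up to the nearest multiple (as round_up does elsewhere in this diff):

#include <cstdint>

// Assumed behaviour of the align()/round_up() helpers used by these factories.
constexpr uint32_t align_up(uint32_t x, uint32_t alignment) {
    return ((x + alignment - 1) / alignment) * alignment;
}

static_assert(align_up(2048, 32) == 2048, "already-aligned pages are unchanged");
static_assert(align_up(2050, 32) == 2080, "otherwise pad up to the next multiple");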
4 changes: 2 additions & 2 deletions tt_eager/tt_dnn/op_library/move/move_op.hpp
@@ -74,13 +74,13 @@ inline Tensor move(const Tensor& input_tensor, const std::optional<MemoryConfig>
// Input and output addresses won't overlap if they are in different memory substrates
bool non_overlap = not move_within_same_mem_space;
const auto num_banks = input_tensor.device()->num_banks(output_tensor.buffer()->buffer_type());
uint32_t size_per_bank = tt_metal::detail::SizeBytesPerBank(output_tensor.buffer()->size(), output_tensor.buffer()->page_size(), num_banks);
uint32_t size_per_bank = tt_metal::detail::SizeBytesPerBank(output_tensor.buffer()->size(), output_tensor.buffer()->page_size(), num_banks, output_tensor.buffer()->alignment());

// If input and output buffers overlap, input has to be copied into circular buffer before writing to output
// Only compute with storage cores allow CBs to be created
auto compute_with_storage_grid_size = input_tensor.device()->compute_with_storage_grid_size();
const auto num_l1_banks = compute_with_storage_grid_size.x * compute_with_storage_grid_size.y;
uint32_t size_per_l1_bank = tt_metal::detail::SizeBytesPerBank(output_tensor.buffer()->size(), output_tensor.buffer()->page_size(), num_l1_banks);
uint32_t size_per_l1_bank = tt_metal::detail::SizeBytesPerBank(output_tensor.buffer()->size(), output_tensor.buffer()->page_size(), num_l1_banks, L1_ALIGNMENT);

if (move_within_same_mem_space) {
switch (input_mem_config.buffer_type) {
@@ -76,7 +76,7 @@ operation::ProgramWithCallbacks move_multi_core_with_overlap(const Tensor &input
const auto num_dram_banks = device->num_banks(BufferType::DRAM);
const auto num_l1_banks = compute_with_storage_grid_size.x * compute_with_storage_grid_size.y;

uint32_t size_per_l1_bank = tt_metal::detail::SizeBytesPerBank(output.buffer()->size(), output.buffer()->page_size(), num_l1_banks);
uint32_t size_per_l1_bank = tt_metal::detail::SizeBytesPerBank(output.buffer()->size(), output.buffer()->page_size(), num_l1_banks, L1_ALIGNMENT);

// CB is being used as temp L1 buffer to copy src data into before writing to dst
uint32_t cb_index = 0;
@@ -64,10 +64,13 @@ operation::ProgramWithCallbacks move_multi_core_sharded(const Tensor& input, Ten
output_buffer_address > input_buffer_address,
"Expected output buffer to be allocated at a higher address than input buffer");
uint32_t move_chunk_size_bytes = output_buffer_address - input_buffer_address;
TT_FATAL(input.buffer()->alignment() == output.buffer()->alignment(),
"Expected input buffer alignment ({} B) and output buffer alignment ({} B) to be equal",
input.buffer()->alignment(), output.buffer()->alignment());
TT_FATAL(
move_chunk_size_bytes % ADDRESS_ALIGNMENT == 0,
move_chunk_size_bytes % input.buffer()->alignment() == 0,
"Expected chunk size bytes to move to be {} byte aligned.",
ADDRESS_ALIGNMENT);
input.buffer()->alignment());
uint32_t num_chunks = total_size_bytes / move_chunk_size_bytes;
uint32_t remainder_chunk_size_bytes = total_size_bytes % move_chunk_size_bytes;

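The sharded move shifts data in chunks equal to the gap between the output and input base addresses, so the new checks require the two buffers to agree on alignment and the gap to be a multiple of it. A small worked example with assumed addresses and a 32 B alignment:

#include <cassert>
#include <cstdint>

int main() {
    // Assumed example values, not taken from the diff.
    const uint32_t alignment = 32;                    // input and output must agree
    const uint32_t input_base = 0x10000;
    const uint32_t output_base = 0x10400;             // 1 KiB above the input
    const uint32_t total_size_bytes = 9 * 1024;

    const uint32_t chunk = output_base - input_base;  // 1024 B per move
    assert(chunk % alignment == 0);                   // mirrors the TT_FATAL above

    const uint32_t num_chunks = total_size_bytes / chunk;       // 9 full chunks
    const uint32_t remainder_bytes = total_size_bytes % chunk;  // 0 B left over
    (void)num_chunks;
    (void)remainder_bytes;
}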
@@ -29,7 +29,7 @@ operation::ProgramWithCallbacks repeat_multi_core(
uint32_t single_page_size;
if (rm_layout) {
num_output_pages = output.volume() / output.get_legacy_shape()[-1];
single_page_size = align(output.element_size() * output.get_legacy_shape()[-1], ADDRESS_ALIGNMENT);
single_page_size = align(output.element_size() * output.get_legacy_shape()[-1], output.buffer()->alignment());
} else {
num_output_pages = output.volume() / TILE_HW;
single_page_size = tt_metal::detail::TileSize(cb_data_format);
4 changes: 2 additions & 2 deletions tt_eager/tt_dnn/op_library/repeat/repeat_op.cpp
@@ -24,7 +24,7 @@ void Repeat::validate(const std::vector<Tensor> &input_tensors) const {
TT_FATAL(this->repeat_dim < input_shape.rank(), "Repeat dim specified is larger than input tensor rank.");
if (input_tensor.get_layout() == Layout::ROW_MAJOR && this->repeat_dim == input_shape.rank() - 1) {
TT_FATAL(
(input_shape[this->repeat_dim] * input_tensor.element_size()) % ADDRESS_ALIGNMENT == 0,
(input_shape[this->repeat_dim] * input_tensor.element_size()) % input_tensor.buffer()->alignment() == 0,
"Current repeat implementation requires aligned last dim when repeating on last dim");
}
TT_FATAL(this->num_repeats > 0, "Number of repeats should be greater than 0");
@@ -75,7 +75,7 @@ Tensor repeat(const Tensor &input_tensor, const Shape &shape, const MemoryConfig
TT_FATAL(shape[dim] > 0, "Number of repetitions along a dim must be greater than 0");
if (input_tensor.get_layout() == Layout::ROW_MAJOR && dim == input_rank - 1) {
TT_FATAL(
(input_tensor.get_legacy_shape()[dim] * input_tensor.element_size()) % ADDRESS_ALIGNMENT == 0,
(input_tensor.get_legacy_shape()[dim] * input_tensor.element_size()) % input_tensor.buffer()->alignment() == 0,
"Current repeat implementation requires aligned last dim when repeating on last dim");
}
output = operation::run_without_autoformat(Repeat{dim, shape[dim], output_mem_config}, {output}).at(0);
@@ -82,7 +82,7 @@ operation::ProgramWithCallbacks interleaved_to_sharded_multi_core(
// TODO: Use a different variable name. Units refers to pages, but this is being used as size
num_units_per_shard_width_last =
input_unit_size - (round_up(num_units_per_row, input_unit_size) - num_units_per_row);
padded_offset_bytes = align(input_unit_size, ADDRESS_ALIGNMENT);
padded_offset_bytes = align(input_unit_size, input.buffer()->alignment());
}

bool convert_df = input_cb_data_format != output_cb_data_format;
@@ -98,10 +98,10 @@
uint32_t scratch_cb_index = CB::c_in1;
uint32_t out_cb_index = input_cb_index;
uint32_t num_input_units = num_units_per_shard;
uint32_t output_page_size = align(output_unit_size, ADDRESS_ALIGNMENT);
uint32_t output_page_size = align(output_unit_size, dst_buffer->alignment());
if (convert_df) {
out_cb_index = CB::c_out0;
uint32_t input_page_size = align(input_unit_size, ADDRESS_ALIGNMENT);
uint32_t input_page_size = align(input_unit_size, src_buffer->alignment());
tt_metal::CircularBufferConfig input_cb_out_config =
tt_metal::CircularBufferConfig(num_input_units * input_page_size, {{input_cb_index, input_cb_data_format}})
.set_page_size(input_cb_index, input_page_size);
@@ -379,15 +379,15 @@ operation::ProgramWithCallbacks sharded_to_interleaved_multi_core(
uint32_t src0_cb_index = CB::c_in0;
uint32_t out_cb_index = src0_cb_index;
uint32_t num_input_units = num_units_per_shard;
uint32_t input_page_size = align(input_unit_size, ADDRESS_ALIGNMENT);
uint32_t input_page_size = align(input_unit_size, input.buffer()->alignment());
tt_metal::CircularBufferConfig cb_src0_config =
tt_metal::CircularBufferConfig(num_input_units * input_page_size, {{src0_cb_index, input_cb_data_format}})
.set_page_size(src0_cb_index, input_page_size)
.set_globally_allocated_address(*input.buffer());
auto cb_src0 = tt_metal::CreateCircularBuffer(program, all_cores, cb_src0_config);
if (convert_df) {
out_cb_index = CB::c_out0;
uint32_t output_page_size = align(output_unit_size, ADDRESS_ALIGNMENT);
uint32_t output_page_size = align(output_unit_size, output.buffer()->alignment());
tt_metal::CircularBufferConfig output_cb_out_config =
tt_metal::CircularBufferConfig(num_input_units * output_page_size, {{out_cb_index, output_cb_data_format}})
.set_page_size(out_cb_index, output_page_size);
@@ -450,7 +450,7 @@ operation::ProgramWithCallbacks sharded_to_interleaved_multi_core(
uint32_t curr_idx_w = 0;

const auto cores = corerange_to_cores(all_cores, std::nullopt, rm_orientation);
uint32_t padded_shard_width = align(output_unit_size, ADDRESS_ALIGNMENT);
uint32_t padded_shard_width = align(output_unit_size, dst_buffer->alignment());
for (const auto& core : cores) {
if (input.get_layout() == Layout::TILE) {
uint32_t shard_height = num_units_per_shard_height;
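In these interleaved/sharded conversions the circular-buffer page size and the padded shard width come from the unit size rounded up to the relevant buffer's alignment, so the padding can differ between the L1 and DRAM sides of the transfer. A small numeric illustration with assumed alignment values:

#include <cstdint>

// Same assumed round-up helper as in the earlier sketch.
constexpr uint32_t align_up(uint32_t x, uint32_t a) { return ((x + a - 1) / a) * a; }

int main() {
    const uint32_t unit_size = 100 * 2;   // e.g. 100 bfloat16 elements per row = 200 B
    const uint32_t dram_alignment = 32;   // assumed DRAM buffer alignment
    const uint32_t l1_alignment = 16;     // assumed L1 buffer alignment

    // The same unit pads differently depending on which buffer it targets,
    // which is why the kernels now query the buffer instead of a global constant.
    const uint32_t dram_page = align_up(unit_size, dram_alignment);  // 224 B
    const uint32_t l1_page = align_up(unit_size, l1_alignment);      // 208 B
    (void)dram_page;
    (void)l1_page;
}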
4 changes: 0 additions & 4 deletions tt_metal/detail/tt_metal.hpp
@@ -230,7 +230,6 @@ namespace tt::tt_metal{
{
bool pass = true;
TT_FATAL(address >= DRAM_UNRESERVED_BASE, "Cannot write to reserved DRAM region, addresses [0, {}) are reserved!", DRAM_UNRESERVED_BASE);
// TODO: add a check for address validating dram alignment
tt::Cluster::instance().write_dram_vec(host_buffer, tt_target_dram{device->id(), dram_channel, 0}, address);
return pass;
}
@@ -251,7 +250,6 @@
inline bool ReadFromDeviceDRAMChannel(Device *device, int dram_channel, uint32_t address, uint32_t size, std::vector<uint32_t> &host_buffer)
{
bool pass = true;
// todo: add a check for address validating dram alignment
tt::Cluster::instance().dram_barrier(device->id());
tt::Cluster::instance().read_dram_vec(host_buffer, size, tt_target_dram{device->id(), dram_channel, 0}, address);
return pass;
@@ -272,7 +270,6 @@
inline bool WriteToDeviceL1(Device *device, const CoreCoord &logical_core, uint32_t address, std::vector<uint32_t> &host_buffer, CoreType core_type = CoreType::WORKER)
{
ZoneScoped;
// todo: add a check for address validating l1 alignment
auto worker_core = device->physical_core_from_logical_core(logical_core, core_type);
llrt::write_hex_vec_to_core(device->id(), worker_core, host_buffer, address);
return true;
Expand Down Expand Up @@ -301,7 +298,6 @@ namespace tt::tt_metal{
*/
inline bool ReadFromDeviceL1(Device *device, const CoreCoord &logical_core, uint32_t address, uint32_t size, std::vector<uint32_t> &host_buffer)
{
// todo: add a check for address validating l1 alignment
tt::Cluster::instance().l1_barrier(device->id());
auto worker_core = device->worker_core_from_logical_core(logical_core);
host_buffer = llrt::read_hex_vec_from_core(device->id(), worker_core, address, size);
4 changes: 2 additions & 2 deletions tt_metal/detail/util.hpp
@@ -24,11 +24,11 @@ namespace tt::tt_metal::detail{
return tt::tile_size(data_format);
}

inline uint32_t SizeBytesPerBank(uint32_t size_bytes, uint32_t page_size_bytes, uint32_t num_banks) {
inline uint32_t SizeBytesPerBank(uint32_t size_bytes, uint32_t page_size_bytes, uint32_t num_banks, uint32_t alignment_bytes) {
TT_ASSERT(page_size_bytes > 0 and size_bytes % page_size_bytes == 0, "Page size {} should be divisible by buffer size {}", page_size_bytes, size_bytes);
uint32_t num_pages = size_bytes / page_size_bytes;
int num_equally_distributed_pages = num_pages == 1 ? 1 : 1 + ((num_pages - 1) / num_banks);
return num_equally_distributed_pages * round_up(page_size_bytes, ADDRESS_ALIGNMENT);
return num_equally_distributed_pages * round_up(page_size_bytes, alignment_bytes);
}

inline NOC GetPreferredNOCForDRAMRead(ARCH arch) {
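SizeBytesPerBank now takes the alignment as an argument, so the per-bank footprint of a buffer can be computed against either the destination buffer's own alignment (the DRAM path in move_op.hpp above) or L1_ALIGNMENT (the circular-buffer path). A worked example reproducing the arithmetic with assumed values:

#include <cassert>
#include <cstdint>

// Re-statement of the arithmetic above for illustration; round_up is assumed
// to round up to the nearest multiple of the alignment.
uint32_t size_bytes_per_bank(uint32_t size_bytes, uint32_t page_size_bytes,
                             uint32_t num_banks, uint32_t alignment_bytes) {
    const uint32_t num_pages = size_bytes / page_size_bytes;
    const uint32_t pages_per_bank = num_pages == 1 ? 1 : 1 + ((num_pages - 1) / num_banks);
    const uint32_t padded_page =
        ((page_size_bytes + alignment_bytes - 1) / alignment_bytes) * alignment_bytes;
    return pages_per_bank * padded_page;
}

int main() {
    // 64 pages of 2000 B over 12 banks -> 6 pages per bank; the padding per
    // page then depends on the alignment the caller passes in (values assumed).
    assert(size_bytes_per_bank(64 * 2000, 2000, 12, 32) == 6 * 2016);
    assert(size_bytes_per_bank(64 * 2000, 2000, 12, 16) == 6 * 2000);
}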
4 changes: 2 additions & 2 deletions tt_metal/hostdevcommon/common_runtime_address_map.h
@@ -44,7 +44,7 @@ static_assert (PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC > PROFILER_L1_BUFFER_SIZE
constexpr static std::uint32_t L1_KERNEL_CONFIG_BASE = 98 * 1024;
constexpr static std::uint32_t IDLE_ERISC_L1_KERNEL_CONFIG_BASE = 32 * 1024;

constexpr static std::uint32_t L1_ALIGNMENT = 16;
constexpr static std::uint32_t L1_ALIGNMENT = NOC_L1_READ_ALIGNMENT_BYTES >= NOC_L1_WRITE_ALIGNMENT_BYTES ? NOC_L1_READ_ALIGNMENT_BYTES : NOC_L1_WRITE_ALIGNMENT_BYTES;

// config for 32 L1 buffers is at addr BUFFER_CONFIG_BASE
// 12 bytes for each buffer: (addr, size, size_in_tiles)
@@ -87,7 +87,7 @@ constexpr static std::uint32_t ERISC_L1_UNRESERVED_BASE = L1_UNRESERVED_BASE; //
// Reserved DRAM addresses
// Host writes (4B value) to and reads from DRAM_BARRIER_BASE across all channels to ensure previous writes have been committed
constexpr static std::uint32_t DRAM_BARRIER_BASE = 0;
constexpr static std::uint32_t DRAM_ALIGNMENT = std::max(NOC_DRAM_READ_ALIGNMENT_BYTES, NOC_DRAM_WRITE_ALIGNMENT_BYTES);
constexpr static std::uint32_t DRAM_ALIGNMENT = NOC_DRAM_READ_ALIGNMENT_BYTES >= NOC_DRAM_WRITE_ALIGNMENT_BYTES ? NOC_DRAM_READ_ALIGNMENT_BYTES : NOC_DRAM_WRITE_ALIGNMENT_BYTES;
constexpr static std::uint32_t DRAM_BARRIER_SIZE = ((sizeof(uint32_t) + DRAM_ALIGNMENT - 1) / DRAM_ALIGNMENT) * DRAM_ALIGNMENT;
constexpr static std::uint32_t DRAM_UNRESERVED_BASE = DRAM_BARRIER_BASE + DRAM_BARRIER_SIZE; // Start of unreserved space

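L1_ALIGNMENT and DRAM_ALIGNMENT are now the larger of the corresponding NOC read and write alignments, written as a ternary rather than std::max — presumably so the header stays usable in device firmware builds without <algorithm>, though the diff itself does not say so. A standalone sketch of the pattern with made-up values:

#include <cstdint>

// Assumed example values; the real ones come from noc_parameters.h.
constexpr std::uint32_t READ_ALIGNMENT_BYTES = 32;
constexpr std::uint32_t WRITE_ALIGNMENT_BYTES = 16;

// Ternary max, as used for L1_ALIGNMENT and DRAM_ALIGNMENT above.
constexpr std::uint32_t ALIGNMENT =
    READ_ALIGNMENT_BYTES >= WRITE_ALIGNMENT_BYTES ? READ_ALIGNMENT_BYTES
                                                  : WRITE_ALIGNMENT_BYTES;

// The barrier slot is then the 4 B barrier value rounded up to that alignment.
constexpr std::uint32_t BARRIER_SIZE =
    ((sizeof(std::uint32_t) + ALIGNMENT - 1) / ALIGNMENT) * ALIGNMENT;

static_assert(ALIGNMENT == 32 && BARRIER_SIZE == 32, "stricter of read/write wins");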
7 changes: 3 additions & 4 deletions tt_metal/hw/inc/debug/sanitize_noc.h
@@ -97,8 +97,8 @@ inline void debug_sanitize_post_noc_addr_and_hang(

// Return value is the alignment mask for the type of core the noc address points
// to. Need to do this because L1 alignment needs to match the noc address alignment requirements,
// even if it's different than the inherent L1 alignment requirements. Note that additional
// alignment restrictions only apply for writes from L1, so need to specify direction as well.
// even if it's different than the inherent L1 alignment requirements.
// Direction is specified because reads and writes may have different L1 requirements (see noc_parameters.h).
uint32_t debug_sanitize_noc_addr(
uint64_t noc_addr,
uint32_t l1_addr,
@@ -129,16 +129,15 @@

// Check noc addr, we save the alignment requirement from the noc src/dst because the L1 address
// needs to match alignment.
// Reads and writes may have different alignment requirements, see noc_parameters.h for details.
uint32_t alignment_mask = (dir == DEBUG_SANITIZE_NOC_READ ? NOC_L1_READ_ALIGNMENT_BYTES : NOC_L1_WRITE_ALIGNMENT_BYTES) - 1; // Default alignment, only override in certain cases.
uint32_t invalid = multicast ? DebugSanitizeNocInvalidMulticast : DebugSanitizeNocInvalidUnicast;
if (NOC_PCIE_XY_P(x, y)) {
// Additional alignment restriction only applies to reads
alignment_mask = (dir == DEBUG_SANITIZE_NOC_READ ? NOC_PCIE_READ_ALIGNMENT_BYTES : NOC_PCIE_WRITE_ALIGNMENT_BYTES) - 1;
if (!DEBUG_VALID_PCIE_ADDR(noc_local_addr, noc_len)) {
debug_sanitize_post_noc_addr_and_hang(noc_addr, l1_addr, noc_len, multicast, invalid);
}
} else if (NOC_DRAM_XY_P(x, y)) {
// Additional alignment restriction only applies to reads
alignment_mask = (dir == DEBUG_SANITIZE_NOC_READ ? NOC_DRAM_READ_ALIGNMENT_BYTES : NOC_DRAM_WRITE_ALIGNMENT_BYTES) - 1;
if (!DEBUG_VALID_DRAM_ADDR(noc_local_addr, noc_len)) {
debug_sanitize_post_noc_addr_and_hang(noc_addr, l1_addr, noc_len, multicast, invalid);
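The sanitizer builds an alignment mask from the direction-specific alignment (read vs write, and PCIe/DRAM vs plain L1). With power-of-two alignments, masking with alignment - 1 exposes any misaligned low bits; a minimal sketch with assumed values:

#include <cassert>
#include <cstdint>

int main() {
    // Assumed alignment; the real values come from noc_parameters.h.
    const uint32_t read_alignment_bytes = 32;
    const uint32_t alignment_mask = read_alignment_bytes - 1;  // 0x1F

    const uint64_t ok_addr = 0x20000040;   // multiple of 32: passes the check
    const uint64_t bad_addr = 0x20000044;  // off by 4 B: sanitizer would report and hang

    assert((ok_addr & alignment_mask) == 0);
    assert((bad_addr & alignment_mask) != 0);
}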