#13609: Uplift dram and l1 allocators to use dram/l1 specific alignment #13762

Open · wants to merge 43 commits into main from abhullar/diff-aligns
Changes from 29 commits (43 commits total)
27ff243
#13609: Uplift dram and l1 allocators to use dram/l1 specific alignment
abhullar-tt Oct 11, 2024
21e4632
#13609: Update memcpy to device to handle 16B aligned writes
abhullar-tt Oct 16, 2024
f48416c
#12549: Fix BH unaligned read issue for tiled interleaved transpose HC
sjameelTT Oct 30, 2024
92d0d5f
Merge branch 'main' into abhullar/diff-aligns
llongTT Dec 16, 2024
479f4da
#13609: remove the change to genfiles.cpp/hpp
llongTT Dec 16, 2024
1c4ea2d
#13609: also remove the deprecated call of jit_build_genfiles_bank_to…
llongTT Dec 16, 2024
7d8ae9a
#13609: fix the build failure due to allocator_alignment change
llongTT Dec 16, 2024
86af491
Merge branch 'main' into abhullar/diff-aligns
llongTT Dec 16, 2024
fff8bbb
Merge branch 'main' into abhullar/diff-aligns
llongTT Dec 16, 2024
9590f84
Merge branch 'main' into abhullar/diff-aligns
llongTT Dec 17, 2024
45be757
#13609: enforce the alignment to the max of input/output to allow noc…
llongTT Dec 17, 2024
c9cbdff
#13609: fix the failed test_sharded tests, using keep_l1_aligned flag
llongTT Dec 18, 2024
353945b
#13609: take care of sharded padding failure due to DRAM/L1 alignmen…
llongTT Dec 18, 2024
5e0bdda
#13609: stick to the usage of keep_l1_aligned = True for now
llongTT Dec 18, 2024
628e010
#13609: switch to i2s/s2i call explicitly to keep l1 aligned
llongTT Dec 19, 2024
740938e
Merge branch 'main' into abhullar/diff-aligns
llongTT Dec 19, 2024
53c9f09
Merge branch 'main' into abhullar/diff-aligns
llongTT Dec 19, 2024
49aef72
Merge branch 'main' into abhullar/diff-aligns
llongTT Dec 20, 2024
d8a7c8d
Merge branch 'main' into abhullar/diff-aligns
llongTT Dec 20, 2024
0174c05
Add allocator api to get alignment based on all buffer types
abhullar-tt Dec 20, 2024
6f16c44
Merge branch 'main' into abhullar/diff-aligns
llongTT Dec 26, 2024
7c46541
#13609: Temporarily skip the failed tests to see if more tests fail
llongTT Dec 26, 2024
de04fc0
#13609: skip more tests
llongTT Dec 30, 2024
939caa5
#13609: Update to address the group norm unit test issue
llongTT Jan 2, 2025
af1451f
#13609: enable group norm tests
llongTT Jan 2, 2025
4b1ab24
#13609: fix of test fold issue, working on GS now
llongTT Jan 3, 2025
774dc43
fix the segmentation fault due to the hugepage address alignment chec…
llongTT Jan 3, 2025
fead9bc
#13609: enable test_permute_5d_blocked as the memory issue has been f…
llongTT Jan 3, 2025
ffbf4a4
#13609: explicitely pack l1 for group norm unit test of test_group_no…
llongTT Jan 3, 2025
45ed294
#13609: address some review comment, regarding comments and variable …
llongTT Jan 6, 2025
436d7e8
#13609: simplify alignment function call from device->get_allocator_a…
llongTT Jan 6, 2025
6cca1b9
#13609: switch from round_up_to_mul16 to tt::round_up
llongTT Jan 6, 2025
9964d75
Merge branch 'main' into abhullar/diff-aligns
llongTT Jan 6, 2025
a977eb4
#13609: set L1 bank address to be DRAM aligned, to facilitate memory …
llongTT Jan 9, 2025
69cd54d
Merge branch 'main' into abhullar/diff-aligns
llongTT Jan 9, 2025
7892dbf
#13609: fix the compile error for non-virtual function
llongTT Jan 10, 2025
3121bcd
#13609: set the device->get_allocator_alignment to purely virtual
llongTT Jan 10, 2025
ff6b6e0
Merge branch 'main' into abhullar/diff-aligns
llongTT Jan 10, 2025
1170b01
Merge branch 'main' into abhullar/diff-aligns
llongTT Jan 10, 2025
3544064
Update ttnn/cpp/ttnn/operations/data_movement/transpose/device/transp…
llongTT Jan 10, 2025
0a4aae2
#13609: Fix a missing semicolon to pass the build
llongTT Jan 10, 2025
8327eca
#13609: mis-copied line from github merge
llongTT Jan 10, 2025
ec1c03e
Merge branch 'main' into abhullar/diff-aligns
llongTT Jan 10, 2025
2 changes: 1 addition & 1 deletion tech_reports/prog_examples/shard_data_rm/shard_data_rm.md
@@ -63,7 +63,7 @@ uint32_t shard_size = shard_height * shard_width;
uint32_t input_unit_size = sizeof(uint32_t);
uint32_t shard_width_bytes = shard_width * data_size;
uint32_t num_units_per_row = shard_width * input_unit_size;
uint32_t padded_offset_bytes = align(input_unit_size, device->get_allocator_alignment());
uint32_t padded_offset_bytes = align(input_unit_size, device->get_allocator_alignment(BufferType::L1));
```

In order to shard the correct data segments to the respective core, we indicate the shard height, width, size, and other data for the kernel function.
@@ -164,14 +164,15 @@ def test_sharded_rm(
),
)

yt = ttnn.interleaved_to_sharded(xt, grid_size, shard_size, shard_scheme, shard_orientation)
yt = ttnn.interleaved_to_sharded(xt, grid_size, shard_size, shard_scheme, shard_orientation, keep_l1_aligned=True)

zt = ttnn.sharded_to_interleaved(
yt,
ttnn.MemoryConfig(
memory_layout=ttnn.TensorMemoryLayout.INTERLEAVED,
buffer_type=ttnn.BufferType.L1,
),
is_l1_aligned=True,
)

tt_og = xt.cpu().to_torch()
@@ -19,7 +19,7 @@ uint64_t get_alloc_limit(const tt::tt_metal::Device* device) {
auto dispatch_core_config = dispatch_core_manager::instance().get_dispatch_core_config(device->id());
auto storage_core_bank_size =
tt::get_storage_core_bank_size(device->id(), device->num_hw_cqs(), dispatch_core_config);
const uint32_t allocator_alignment = device->get_allocator_alignment();
const uint32_t allocator_alignment = device->get_allocator_alignment(BufferType::L1);
⚠️ clang-diagnostic-error ⚠️
no matching member function for call to get_allocator_alignment

const uint32_t interleaved_l1_bank_size = storage_core_bank_size.has_value()
? storage_core_bank_size.value()
: (soc_desc.worker_l1_size - l1_unreserved_base);
@@ -649,7 +649,7 @@ inline void generate_random_paged_payload(
words_per_page);

// Note: the dst address marches in unison regardless of whether or not a core is written to
uint32_t page_size_alignment_bytes = device->get_allocator_alignment();
uint32_t page_size_alignment_bytes = device->get_allocator_alignment(buf_type);
for (uint32_t page_id = start_page; page_id < start_page + cmd.write_paged.pages; page_id++) {
CoreCoord bank_core;
uint32_t bank_id = page_id % num_banks;
@@ -929,8 +929,9 @@ inline void gen_dispatcher_paged_write_cmd(
uint32_t start_page,
uint32_t page_size,
uint32_t pages) {
uint32_t page_size_alignment_bytes = device->get_allocator_alignment();
uint32_t num_banks = device->num_banks(is_dram ? BufferType::DRAM : BufferType::L1);
BufferType buffer_type = is_dram ? BufferType::DRAM : BufferType::L1;
uint32_t page_size_alignment_bytes = device->get_allocator_alignment(buffer_type);
uint32_t num_banks = device->num_banks(buffer_type);
CoreType core_type = is_dram ? CoreType::DRAM : CoreType::WORKER;

// Not safe to mix paged L1 and paged DRAM writes currently in this test since same book-keeping.
5 changes: 3 additions & 2 deletions tests/ttnn/unit_tests/operations/test_core.py
@@ -527,8 +527,9 @@ def test_bh_alignment_i2s(
memory_config=input_buffer_type,
dtype=ttnn.bfloat16,
)
x_t_sharded = ttnn.to_memory_config(x_t, shard_config)
x_t = ttnn.to_memory_config(x_t_sharded, output_buffer_type)
# So far the sharded tensor alignment is controlled by the keep_l1_aligned flag; will remove it later after launch
x_t_sharded = ttnn.interleaved_to_sharded(x_t, shard_config, keep_l1_aligned=True)
x_t = ttnn.sharded_to_interleaved(x_t_sharded, output_buffer_type, is_l1_aligned=True)
output_data = ttnn.from_device(x_t)
output_data = ttnn.to_torch(output_data)
passing = torch.equal(input_data, output_data)
4 changes: 2 additions & 2 deletions tests/ttnn/unit_tests/operations/test_group_norm.py
@@ -292,7 +292,7 @@ def test_group_norm_with_block_sharded_v2_8x8_grid(device, N, C, H, W, num_group
sharded_mem_config = ttnn.MemoryConfig(
ttnn.types.TensorMemoryLayout.BLOCK_SHARDED, ttnn.types.BufferType.L1, shard_spec
)
input_tensor = ttnn.to_memory_config(input_tensor, sharded_mem_config)
input_tensor = ttnn.interleaved_to_sharded(input_tensor, sharded_mem_config, keep_l1_aligned=True)

# groupnorm
output_tensor = ttnn.group_norm(
@@ -306,7 +306,7 @@ def test_group_norm_with_block_sharded_v2_8x8_grid(device, N, C, H, W, num_group
)

# output tensor
output_tensor = ttnn.to_memory_config(output_tensor, ttnn.L1_MEMORY_CONFIG)
output_tensor = ttnn.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG, is_l1_aligned=True)
output_tensor = ttnn.from_device(output_tensor)
output_tensor = ttnn.to_torch(output_tensor)

6 changes: 4 additions & 2 deletions tests/ttnn/unit_tests/operations/test_pad.py
@@ -226,8 +226,10 @@ def test_pad_rm_sharded_stickwise(
ttnn_input_tensor = ttnn.from_torch(
torch_input_tensor, dtype=ttnn.float32, layout=ttnn.ROW_MAJOR_LAYOUT, device=device
)
ttnn_sharded_input_tensor = ttnn.to_memory_config(ttnn_input_tensor, input_shard_memory_config)

# Still rely on keep_l1_aligned = True to make it work with the current implementation
ttnn_sharded_input_tensor = ttnn.interleaved_to_sharded(
ttnn_input_tensor, input_shard_memory_config, keep_l1_aligned=True
)
padded_tensor = ttnn.pad(ttnn_sharded_input_tensor, pad_to_shape, input_tensor_start, pad_value)

tt_output_tensor = ttnn.to_memory_config(padded_tensor, ttnn.L1_MEMORY_CONFIG)
4 changes: 2 additions & 2 deletions tt_metal/hw/inc/blackhole/core_config.h
@@ -25,5 +25,5 @@ constexpr uint8_t NumEthDispatchClasses = 2;
constexpr uint8_t NumDramDispatchClasses = 1;
constexpr uint8_t noc_size_x = 17;
constexpr uint8_t noc_size_y = 12;
#define ALLOCATOR_ALIGNMENT 64
#define LOG_BASE_2_OF_ALLOCATOR_ALIGNMENT 6
#define LOG_BASE_2_OF_DRAM_ALIGNMENT 6
#define LOG_BASE_2_OF_L1_ALIGNMENT 4
37 changes: 30 additions & 7 deletions tt_metal/hw/inc/dataflow_api.h
@@ -143,6 +143,25 @@ FORCE_INLINE uint32_t get_bank_offset(uint32_t bank_index) {
}
}

template <bool DRAM>
FORCE_INLINE
constexpr uint32_t get_allocator_alignment() {
if constexpr (DRAM) {
return DRAM_ALIGNMENT;
} else {
return L1_ALIGNMENT;
}
}

template <bool DRAM>
FORCE_INLINE
constexpr uint32_t get_log_base2_of_allocator_alignment() {
if constexpr (DRAM) {
return LOG_BASE_2_OF_DRAM_ALIGNMENT;
} else {
return LOG_BASE_2_OF_L1_ALIGNMENT;
}
}
} // namespace interleaved_addr_gen

/**
@@ -669,7 +688,7 @@ uint64_t get_dram_noc_addr(
uint8_t noc = noc_index) {
uint32_t bank_offset_index = interleaved_addr_gen::get_bank_offset_index<true>(id);
uint32_t bank_index = interleaved_addr_gen::get_bank_index<true>(id, bank_offset_index);
uint32_t addr = (bank_offset_index * align(page_size, ALLOCATOR_ALIGNMENT)) + bank_base_address + offset +
uint32_t addr = (bank_offset_index * align(page_size, interleaved_addr_gen::get_allocator_alignment<true>())) + bank_base_address + offset +
bank_to_dram_offset[bank_index];
uint32_t noc_xy = interleaved_addr_gen::get_noc_xy<true>(bank_index, noc);
uint64_t noc_addr = get_noc_addr_helper(noc_xy, addr);
@@ -684,7 +703,7 @@ uint64_t get_l1_noc_addr(
uint8_t noc = noc_index) {
uint32_t bank_offset_index = interleaved_addr_gen::get_bank_offset_index<false>(id);
uint32_t bank_index = interleaved_addr_gen::get_bank_index<false>(id, bank_offset_index);
uint32_t addr = (bank_offset_index * align(page_size, ALLOCATOR_ALIGNMENT)) + bank_base_address + offset +
uint32_t addr = (bank_offset_index * align(page_size, interleaved_addr_gen::get_allocator_alignment<false>())) + bank_base_address + offset +
bank_to_dram_offset[bank_index];
uint32_t noc_xy = interleaved_addr_gen::get_noc_xy<false>(bank_index, noc);
uint64_t noc_addr = get_noc_addr_helper(noc_xy, addr);
@@ -1055,7 +1074,7 @@ template <bool DRAM>
struct InterleavedAddrGen {
uint32_t bank_base_address; // Base address for the whole tensor.
const uint32_t page_size; // Num bytes in page.
const uint32_t aligned_page_size = align(page_size, ALLOCATOR_ALIGNMENT);
const uint32_t aligned_page_size = align(page_size, interleaved_addr_gen::get_allocator_alignment<DRAM>());

FORCE_INLINE
uint32_t get_addr(
@@ -1090,9 +1109,11 @@ struct InterleavedPow2AddrGen {
const uint32_t bank_base_address;
const uint32_t log_base_2_of_page_size; // WARNING: This struct is used for optimized get_noc_addr in which case
// you know that bank_unit_size is a power of 2
const uint32_t aligned_log_base_2_of_page_size = this->log_base_2_of_page_size > LOG_BASE_2_OF_ALLOCATOR_ALIGNMENT
static constexpr uint32_t log_base_2_of_allocator_alignment =
interleaved_addr_gen::get_log_base2_of_allocator_alignment<DRAM>();
const uint32_t aligned_log_base_2_of_page_size = this->log_base_2_of_page_size > log_base_2_of_allocator_alignment
? this->log_base_2_of_page_size
: LOG_BASE_2_OF_ALLOCATOR_ALIGNMENT;
: log_base_2_of_allocator_alignment;

FORCE_INLINE
uint32_t get_addr(
@@ -1205,9 +1226,11 @@ template <bool DRAM>
struct InterleavedPow2AddrGenFast {
uint32_t bank_base_address; // Base address for the whole tensor.
const uint32_t log_base_2_of_page_size; // Num bytes in bank unit.
const uint32_t aligned_log_base_2_of_page_size = this->log_base_2_of_page_size > LOG_BASE_2_OF_ALLOCATOR_ALIGNMENT
static constexpr uint32_t log_base_2_of_allocator_alignment =
interleaved_addr_gen::get_log_base2_of_allocator_alignment<DRAM>();
const uint32_t aligned_log_base_2_of_page_size = this->log_base_2_of_page_size > log_base_2_of_allocator_alignment
? this->log_base_2_of_page_size
: LOG_BASE_2_OF_ALLOCATOR_ALIGNMENT;
: log_base_2_of_allocator_alignment;

FORCE_INLINE
uint32_t get_addr(
4 changes: 2 additions & 2 deletions tt_metal/hw/inc/grayskull/core_config.h
@@ -17,5 +17,5 @@ constexpr uint8_t MaxProcessorsPerCoreType = 5;
constexpr uint8_t NumTensixDispatchClasses = 3;
constexpr uint8_t noc_size_x = 13;
constexpr uint8_t noc_size_y = 12;
#define ALLOCATOR_ALIGNMENT 32
#define LOG_BASE_2_OF_ALLOCATOR_ALIGNMENT 5
#define LOG_BASE_2_OF_DRAM_ALIGNMENT 5
#define LOG_BASE_2_OF_L1_ALIGNMENT 4
4 changes: 2 additions & 2 deletions tt_metal/hw/inc/wormhole/core_config.h
@@ -22,5 +22,5 @@ constexpr uint8_t NumTensixDispatchClasses = 3;
constexpr uint8_t NumEthDispatchClasses = 1;
constexpr uint8_t noc_size_x = 10;
constexpr uint8_t noc_size_y = 12;
#define ALLOCATOR_ALIGNMENT 32
#define LOG_BASE_2_OF_ALLOCATOR_ALIGNMENT 5
#define LOG_BASE_2_OF_DRAM_ALIGNMENT 5
#define LOG_BASE_2_OF_L1_ALIGNMENT 4
18 changes: 15 additions & 3 deletions tt_metal/impl/allocator/allocator.cpp
@@ -236,7 +236,7 @@ void init_one_bank_per_channel(Allocator& allocator, const AllocatorConfig& allo
BufferType::DRAM,
bank_offsets,
dram_bank_size,
alloc_config.alignment,
alloc_config.dram_alignment,
alloc_config.dram_unreserved_base,
alloc_config.disable_interleaved);
for (uint32_t bank_id = 0; bank_id < alloc_config.num_dram_channels; bank_id++) {
@@ -251,7 +251,7 @@
BufferType::TRACE,
bank_offsets,
alloc_config.trace_region_size,
alloc_config.alignment,
alloc_config.dram_alignment,
dram_bank_size + alloc_config.dram_unreserved_base,
alloc_config.disable_interleaved);
for (uint32_t bank_id = 0; bank_id < alloc_config.num_dram_channels; bank_id++) {
@@ -272,7 +272,7 @@ void init_one_bank_per_l1(Allocator& allocator, const AllocatorConfig& alloc_con
BufferType::L1,
bank_offsets,
l1_bank_size,
alloc_config.alignment,
alloc_config.l1_alignment,
alloc_config.l1_unreserved_base,
alloc_config.disable_interleaved);

@@ -349,6 +349,18 @@ const std::vector<uint32_t>& bank_ids_from_logical_core(
return allocator.logical_core_to_bank_ids.at(buffer_type).at(logical_core);
}

uint32_t get_alignment(const Allocator& allocator, const BufferType& buffer_type) {
switch (buffer_type) {
case BufferType::DRAM:
case BufferType::TRACE: return allocator.config.dram_alignment;
case BufferType::L1:
case BufferType::L1_SMALL: return allocator.config.l1_alignment;
default: {
TT_THROW("Allocator does not support buffer type");
}
}
}

Statistics get_statistics(const Allocator& allocator, const BufferType& buffer_type) {
Statistics stats;
switch (buffer_type) {
2 changes: 2 additions & 0 deletions tt_metal/impl/allocator/allocator.hpp
@@ -118,6 +118,8 @@ const std::vector<uint32_t>& bank_ids_from_dram_channel(const Allocator& allocat
const std::vector<uint32_t>& bank_ids_from_logical_core(
const Allocator& allocator, BufferType buffer_type, const CoreCoord& logical_core);

uint32_t get_alignment(const Allocator& allocator, const BufferType& buffer_type);

Statistics get_statistics(const Allocator& allocator, const BufferType& buffer_type);

void dump_memory_blocks(const Allocator& allocator, const BufferType& buffer_type, std::ofstream& out);
3 changes: 2 additions & 1 deletion tt_metal/impl/allocator/allocator_types.hpp
@@ -36,6 +36,7 @@ struct AllocatorConfig {
size_t dram_bank_size = 0;
std::vector<size_t> dram_bank_offsets = {};
uint32_t dram_unreserved_base = 0;
uint32_t dram_alignment = 0;
//! worker specific configuration
uint32_t l1_unreserved_base = 0;
CoreRangeSet worker_grid = {};
@@ -49,7 +50,7 @@
BankMapping l1_bank_remap =
{}; // for remapping which l1 bank points to which bank if we assume normal row-major assignment
CoreRangeSet compute_grid = {};
uint32_t alignment = 0;
uint32_t l1_alignment = 0;
bool disable_interleaved = false;
void reset();
~AllocatorConfig() { reset(); }
6 changes: 3 additions & 3 deletions tt_metal/impl/allocator/l1_banking_allocator.cpp
@@ -189,7 +189,7 @@ void init_compute_and_storage_l1_bank_manager(Allocator& allocator, const Alloca
// Storage only cores only need to reserve mailbox space to hold barriers
uint32_t mem_mailbox_base = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::MAILBOX);
uint32_t storage_core_unreserved_base =
((mem_mailbox_base + alloc_config.alignment - 1) / alloc_config.alignment) * alloc_config.alignment;
((mem_mailbox_base + alloc_config.l1_alignment - 1) / alloc_config.l1_alignment) * alloc_config.l1_alignment;

// There is only l1_bank_size bytes available for L1 buffers to be allocated in
uint64_t l1_bank_size = alloc_config.storage_core_bank_size.has_value()
@@ -205,7 +205,7 @@
bank_id_to_bank_offset,
allocatable_l1_size,
interleaved_address_limit,
alloc_config.alignment,
alloc_config.l1_alignment,
alloc_config.l1_unreserved_base,
alloc_config.disable_interleaved);

@@ -219,7 +219,7 @@
small_bank_id_to_bank_offset,
alloc_config.l1_small_size,
small_interleaved_address_limit,
alloc_config.alignment,
alloc_config.l1_alignment,
small_alloc_offset,
alloc_config.disable_interleaved);
}
2 changes: 1 addition & 1 deletion tt_metal/impl/buffers/buffer.cpp
@@ -449,7 +449,7 @@ DeviceAddr Buffer::bank_local_page_address(uint32_t bank_id, uint32_t page_index
}

uint32_t Buffer::alignment() const {
return this->allocator_->config.alignment;
return this->device_->get_allocator_alignment(this->buffer_type_);
⚠️ clang-diagnostic-error ⚠️
no matching member function for call to get_allocator_alignment

}

DeviceAddr Buffer::aligned_page_size() const {
17 changes: 9 additions & 8 deletions tt_metal/impl/device/device.cpp
@@ -257,6 +257,7 @@ std::unique_ptr<Allocator> Device::initialize_allocator(size_t l1_small_size, si
.dram_bank_offsets = {},
.dram_unreserved_base = hal.get_dev_addr(HalDramMemAddrType::DRAM_BARRIER) + \
hal.get_dev_size(HalDramMemAddrType::DRAM_BARRIER),
.dram_alignment = hal.get_alignment(HalMemType::DRAM),
.l1_unreserved_base = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED),
.worker_grid = CoreRangeSet(CoreRange(CoreCoord(0, 0), CoreCoord(logical_size.x - 1, logical_size.y - 1))),
.worker_l1_size = static_cast<size_t>(soc_desc.worker_l1_size),
@@ -268,14 +269,14 @@
.worker_log_to_virtual_routing_y = tt::Cluster::instance().get_worker_logical_to_virtual_y(this->id()),
.l1_bank_remap = {l1_bank_remap.begin(), l1_bank_remap.end()},
.compute_grid = CoreRangeSet(CoreRange(CoreCoord(0, 0), CoreCoord(compute_size.x - 1, compute_size.y - 1))),
.alignment = std::max(hal.get_alignment(HalMemType::DRAM), hal.get_alignment(HalMemType::L1)),
.l1_alignment = hal.get_alignment(HalMemType::L1),
.disable_interleaved = false});
TT_FATAL(config.l1_small_size < (config.storage_core_bank_size.has_value() ? config.storage_core_bank_size.value() : config.worker_l1_size - config.l1_unreserved_base),
"Reserved size must be less than bank size");
TT_FATAL(
config.l1_small_size % config.alignment == 0,
"Reserved size must be aligned to allocator alignment {}",
config.alignment);
config.l1_small_size % config.l1_alignment == 0,
"Reserved size must be aligned to L1 allocator alignment {}",
config.l1_alignment);
// Initialize dram_offsets from soc_descriptor
for (auto channel = 0; channel < soc_desc.get_num_dram_channels(); channel++) {
config.dram_bank_offsets.push_back(soc_desc.get_address_offset(channel));
@@ -3491,14 +3492,14 @@ allocator::Statistics Device::get_memory_allocation_statistics(const BufferType
return allocator::get_statistics(*allocator, buffer_type);
}

uint32_t Device::get_allocator_alignment() const {
uint32_t Device::get_allocator_alignment(const BufferType &buffer_type) const {
⚠️ clang-diagnostic-error ⚠️
out-of-line definition of get_allocator_alignment does not match any declaration in tt::tt_metal::Device

const auto& allocator = this->get_initialized_allocator();
return allocator->config.alignment;
return allocator::get_alignment(*allocator, buffer_type);
}

uint32_t Device::get_allocator_alignment(SubDeviceId sub_device_id) const {
uint32_t Device::get_allocator_alignment(const BufferType &buffer_type, SubDeviceId sub_device_id) const {
const auto& allocator = this->get_initialized_allocator(sub_device_id);
return allocator->config.alignment;
return allocator::get_alignment(*allocator, buffer_type);
}

size_t Device::get_l1_small_size() const {
4 changes: 2 additions & 2 deletions tt_metal/impl/device/device.hpp
@@ -160,8 +160,8 @@ class Device {
allocator::Statistics get_memory_allocation_statistics(const BufferType &buffer_type) const;
allocator::Statistics get_memory_allocation_statistics(const BufferType &buffer_type, SubDeviceId sub_device_id) const;

uint32_t get_allocator_alignment() const;
uint32_t get_allocator_alignment(SubDeviceId sub_device_id) const;
uint32_t get_allocator_alignment(const BufferType &buffer_type) const;
uint32_t get_allocator_alignment(const BufferType &buffer_type, SubDeviceId sub_device_id) const;

std::optional<DeviceAddr> lowest_occupied_compute_l1_address() const;
std::optional<DeviceAddr> lowest_occupied_compute_l1_address(tt::stl::Span<const SubDeviceId> sub_device_ids) const;