diff --git a/tech_reports/prog_examples/shard_data_rm/shard_data_rm.md b/tech_reports/prog_examples/shard_data_rm/shard_data_rm.md
index c64a9be60b1..b70d1badcda 100644
--- a/tech_reports/prog_examples/shard_data_rm/shard_data_rm.md
+++ b/tech_reports/prog_examples/shard_data_rm/shard_data_rm.md
@@ -63,12 +63,12 @@ uint32_t shard_size = shard_height * shard_width;
 uint32_t input_unit_size = sizeof(uint32_t);
 uint32_t shard_width_bytes = shard_width * data_size;
 uint32_t num_units_per_row = shard_width * input_unit_size;
-uint32_t padded_offset_bytes = align(input_unit_size, device->get_allocator_alignment());
+uint32_t padded_offset_bytes = align(input_unit_size, device->get_allocator_alignment(BufferType::L1));
 ```
 
 In order to shard the correct data segments to the respective core, we indicate the shard height, width, size, and other data for the kernel function. For this situation, 16 units of data will be sharded across 4 cores; each core will have 4 units of data in their corresponding circular buffer.
-The `padded_offset_bytes` is set to ensure that the correct address is read from the kernel function when moving data to the circular buffer; in this case, the addresses are aligned to L1 memory.
+The `padded_offset_bytes` is set to ensure that the correct address is read from the kernel function when moving data to the circular buffer; in this case, the addresses are aligned to L1 memory by explicitly passing `BufferType::L1`.
 This example demonstrates height sharding; the shard height is therefore set to evenly distribute the number of vector values across the cores. If the sharding strategy was different (i.e. width sharding or block sharding), the appropriate values for both the shard height and width would need to be set.
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded.py
index df55fb657bf..191f1d735f2 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded.py
@@ -164,7 +164,7 @@ def test_sharded_rm(
         ),
     )
 
-    yt = ttnn.interleaved_to_sharded(xt, grid_size, shard_size, shard_scheme, shard_orientation)
+    yt = ttnn.interleaved_to_sharded(xt, grid_size, shard_size, shard_scheme, shard_orientation, keep_l1_aligned=True)
 
     zt = ttnn.sharded_to_interleaved(
         yt,
@@ -172,6 +172,7 @@ def test_sharded_rm(
             memory_layout=ttnn.TensorMemoryLayout.INTERLEAVED,
             buffer_type=ttnn.BufferType.L1,
         ),
+        is_l1_aligned=True,
     )
 
     tt_og = xt.cpu().to_torch()
diff --git a/tests/tt_metal/tt_metal/api/allocator/test_l1_banking_allocator.cpp b/tests/tt_metal/tt_metal/api/allocator/test_l1_banking_allocator.cpp
index e37436fd3af..b15dadb62a3 100644
--- a/tests/tt_metal/tt_metal/api/allocator/test_l1_banking_allocator.cpp
+++ b/tests/tt_metal/tt_metal/api/allocator/test_l1_banking_allocator.cpp
@@ -19,7 +19,7 @@ uint64_t get_alloc_limit(const tt::tt_metal::IDevice* device) {
     auto dispatch_core_config = dispatch_core_manager::instance().get_dispatch_core_config(device->id());
     auto storage_core_bank_size =
         tt::get_storage_core_bank_size(device->id(), device->num_hw_cqs(), dispatch_core_config);
-    const uint32_t allocator_alignment = device->get_allocator_alignment();
+    const uint32_t allocator_alignment = device->get_allocator_alignment(BufferType::L1);
     const uint32_t interleaved_l1_bank_size = storage_core_bank_size.has_value()
                                                   ? storage_core_bank_size.value()
                                                   : (soc_desc.worker_l1_size - l1_unreserved_base);
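The report change above makes the alignment query explicit per buffer type. A minimal standalone sketch of the padded-offset arithmetic, assuming the Blackhole alignments introduced later in this diff (L1 = 2^4 = 16 B, DRAM = 2^6 = 64 B); `align_up` and the `k...` constants are local stand-ins, not tt-metal APIs:

```cpp
#include <cstdint>

// Round x up to a multiple of alignment; mirrors the shape of the align() helper above.
constexpr uint32_t align_up(uint32_t x, uint32_t alignment) {
    return ((x + alignment - 1) / alignment) * alignment;
}

// Assumed Blackhole values from the core_config.h changes in this diff.
constexpr uint32_t kL1Alignment = 16;
constexpr uint32_t kDramAlignment = 64;

constexpr uint32_t input_unit_size = sizeof(uint32_t);  // 4 B per row-major element

// Padding a 4 B unit to the true L1 alignment wastes 12 B per unit...
static_assert(align_up(input_unit_size, kL1Alignment) == 16, "L1-aligned padded offset");
// ...instead of the 60 B the old device-wide (DRAM-sized) alignment cost.
static_assert(align_up(input_unit_size, kDramAlignment) == 64, "DRAM-aligned padded offset");
```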
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h
index 808eb020896..c7f71166b53 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h
@@ -650,7 +650,7 @@ inline void generate_random_paged_payload(
             words_per_page);
 
     // Note: the dst address marches in unison regardless of whether or not a core is written to
-    uint32_t page_size_alignment_bytes = device->get_allocator_alignment();
+    uint32_t page_size_alignment_bytes = device->get_allocator_alignment(buf_type);
     for (uint32_t page_id = start_page; page_id < start_page + cmd.write_paged.pages; page_id++) {
         CoreCoord bank_core;
         uint32_t bank_id = page_id % num_banks;
@@ -930,8 +930,9 @@ inline void gen_dispatcher_paged_write_cmd(
     uint32_t start_page,
     uint32_t page_size,
     uint32_t pages) {
-    uint32_t page_size_alignment_bytes = device->get_allocator_alignment();
-    uint32_t num_banks = device->num_banks(is_dram ? BufferType::DRAM : BufferType::L1);
+    BufferType buffer_type = is_dram ? BufferType::DRAM : BufferType::L1;
+    uint32_t page_size_alignment_bytes = device->get_allocator_alignment(buffer_type);
+    uint32_t num_banks = device->num_banks(buffer_type);
     CoreType core_type = is_dram ? CoreType::DRAM : CoreType::WORKER;
 
     // Not safe to mix paged L1 and paged DRAM writes currently in this test since same book-keeping
diff --git a/tests/ttnn/unit_tests/operations/test_core.py b/tests/ttnn/unit_tests/operations/test_core.py
index c39154379df..57709827f07 100644
--- a/tests/ttnn/unit_tests/operations/test_core.py
+++ b/tests/ttnn/unit_tests/operations/test_core.py
@@ -527,8 +527,9 @@ def test_bh_alignment_i2s(
         memory_config=input_buffer_type,
         dtype=ttnn.bfloat16,
     )
-    x_t_sharded = ttnn.to_memory_config(x_t, shard_config)
-    x_t = ttnn.to_memory_config(x_t_sharded, output_buffer_type)
+    # For now, sharded tensor alignment is controlled by the keep_l1_aligned flag; it will be removed after launch
+    x_t_sharded = ttnn.interleaved_to_sharded(x_t, shard_config, keep_l1_aligned=True)
+    x_t = ttnn.sharded_to_interleaved(x_t_sharded, output_buffer_type, is_l1_aligned=True)
     output_data = ttnn.from_device(x_t)
     output_data = ttnn.to_torch(output_data)
     passing = torch.equal(input_data, output_data)
diff --git a/tests/ttnn/unit_tests/operations/test_group_norm.py b/tests/ttnn/unit_tests/operations/test_group_norm.py
index 6c286ef1a6a..ae0828c6d85 100644
--- a/tests/ttnn/unit_tests/operations/test_group_norm.py
+++ b/tests/ttnn/unit_tests/operations/test_group_norm.py
@@ -292,7 +292,7 @@ def test_group_norm_with_block_sharded_v2_8x8_grid(device, N, C, H, W, num_group
     sharded_mem_config = ttnn.MemoryConfig(
         ttnn.types.TensorMemoryLayout.BLOCK_SHARDED, ttnn.types.BufferType.L1, shard_spec
     )
-    input_tensor = ttnn.to_memory_config(input_tensor, sharded_mem_config)
+    input_tensor = ttnn.interleaved_to_sharded(input_tensor, sharded_mem_config, keep_l1_aligned=True)
 
     # groupnorm
     output_tensor = ttnn.group_norm(
@@ -306,7 +306,7 @@
     )
 
     # output tensor
-    output_tensor = ttnn.to_memory_config(output_tensor, ttnn.L1_MEMORY_CONFIG)
+    output_tensor = ttnn.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG, is_l1_aligned=True)
     output_tensor = ttnn.from_device(output_tensor)
     output_tensor = ttnn.to_torch(output_tensor)
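The `common.h` change above lets the paged-write bookkeeping advance destination addresses by a buffer-type-specific alignment. A self-contained sketch of that stepping logic; the function name and values are hypothetical, only the round-up-per-bank-slot shape mirrors `generate_random_paged_payload`:

```cpp
#include <cstdint>
#include <cstdio>

constexpr uint32_t align_up(uint32_t x, uint32_t a) { return ((x + a - 1) / a) * a; }

// Bank-local offset of a page when pages are striped round-robin across banks
// and each slot is padded to the allocator alignment.
constexpr uint32_t page_offset_in_bank(uint32_t page_id, uint32_t num_banks,
                                       uint32_t page_size, uint32_t alignment) {
    uint32_t slot = page_id / num_banks;           // pages already placed in this bank
    return slot * align_up(page_size, alignment);  // one aligned page per slot
}

int main() {
    // Hypothetical 40 B pages over 12 banks: page 24 lands in slot 2 of bank 0.
    std::printf("L1   (16 B align): offset %u\n", page_offset_in_bank(24, 12, 40, 16));  // 2 * 48 = 96
    std::printf("DRAM (64 B align): offset %u\n", page_offset_in_bank(24, 12, 40, 64));  // 2 * 64 = 128
    return 0;
}
```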
diff --git a/tests/ttnn/unit_tests/operations/test_pad.py b/tests/ttnn/unit_tests/operations/test_pad.py
index 0dc9aa18ef1..8ea3be9302a 100644
--- a/tests/ttnn/unit_tests/operations/test_pad.py
+++ b/tests/ttnn/unit_tests/operations/test_pad.py
@@ -226,8 +226,10 @@ def test_pad_rm_sharded_stickwise(
     ttnn_input_tensor = ttnn.from_torch(
         torch_input_tensor, dtype=ttnn.float32, layout=ttnn.ROW_MAJOR_LAYOUT, device=device
     )
-    ttnn_sharded_input_tensor = ttnn.to_memory_config(ttnn_input_tensor, input_shard_memory_config)
-
+    # Still rely on keep_l1_aligned=True to make this work with the current implementation
+    ttnn_sharded_input_tensor = ttnn.interleaved_to_sharded(
+        ttnn_input_tensor, input_shard_memory_config, keep_l1_aligned=True
+    )
     padded_tensor = ttnn.pad(ttnn_sharded_input_tensor, pad_to_shape, input_tensor_start, pad_value)
 
     tt_output_tensor = ttnn.to_memory_config(padded_tensor, ttnn.L1_MEMORY_CONFIG)
diff --git a/tt_metal/hw/inc/blackhole/core_config.h b/tt_metal/hw/inc/blackhole/core_config.h
index 9e4ba749e7b..beab0ab565c 100644
--- a/tt_metal/hw/inc/blackhole/core_config.h
+++ b/tt_metal/hw/inc/blackhole/core_config.h
@@ -25,5 +25,5 @@ constexpr uint8_t NumEthDispatchClasses = 2;
 constexpr uint8_t NumDramDispatchClasses = 1;
 constexpr uint8_t noc_size_x = 17;
 constexpr uint8_t noc_size_y = 12;
-#define ALLOCATOR_ALIGNMENT 64
-#define LOG_BASE_2_OF_ALLOCATOR_ALIGNMENT 6
+#define LOG_BASE_2_OF_DRAM_ALIGNMENT 6
+#define LOG_BASE_2_OF_L1_ALIGNMENT 4
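The split of the single `ALLOCATOR_ALIGNMENT` into per-memory log2 constants above can be sanity-checked with a couple of compile-time examples; `align_power_of_2` here is a local helper with the same shape as the one `dataflow_api.h` uses below, and the constants are the Blackhole values:

```cpp
#include <cstdint>

// Log2 alignments from the new Blackhole core_config.h above:
// DRAM 2^6 = 64 B, L1 2^4 = 16 B (Grayskull/Wormhole use 2^5 for DRAM, 2^4 for L1).
constexpr uint32_t kLogDramAlign = 6;
constexpr uint32_t kLogL1Align = 4;

// Power-of-two round-up, local stand-in for tt-metal's align_power_of_2.
constexpr uint32_t align_power_of_2(uint32_t x, uint32_t alignment) {
    return (x + alignment - 1) & ~(alignment - 1);
}

static_assert(align_power_of_2(100, 1u << kLogDramAlign) == 128);  // a DRAM page pads to 64 B
static_assert(align_power_of_2(100, 1u << kLogL1Align) == 112);    // an L1 page pads to 16 B
```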
diff --git a/tt_metal/hw/inc/dataflow_api.h b/tt_metal/hw/inc/dataflow_api.h
index 0c7d1cdbd9d..32e7888f668 100644
--- a/tt_metal/hw/inc/dataflow_api.h
+++ b/tt_metal/hw/inc/dataflow_api.h
@@ -118,6 +118,25 @@ FORCE_INLINE uint32_t get_bank_offset(uint32_t bank_index) {
     }
 }
 
+template <bool DRAM>
+FORCE_INLINE
+constexpr uint32_t get_allocator_alignment() {
+    if constexpr (DRAM) {
+        return DRAM_ALIGNMENT;
+    } else {
+        return L1_ALIGNMENT;
+    }
+}
+
+template <bool DRAM>
+FORCE_INLINE
+constexpr uint32_t get_log_base2_of_allocator_alignment() {
+    if constexpr (DRAM) {
+        return LOG_BASE_2_OF_DRAM_ALIGNMENT;
+    } else {
+        return LOG_BASE_2_OF_L1_ALIGNMENT;
+    }
+}
 }  // namespace interleaved_addr_gen
 
 /**
@@ -619,8 +638,8 @@ uint64_t get_dram_noc_addr(
     uint8_t noc = noc_index) {
     uint32_t bank_offset_index = interleaved_addr_gen::get_bank_offset_index<true>(id);
     uint32_t bank_index = interleaved_addr_gen::get_bank_index<true>(id, bank_offset_index);
-    uint32_t addr = (bank_offset_index * align_power_of_2(page_size, ALLOCATOR_ALIGNMENT)) + bank_base_address +
-                    offset + bank_to_dram_offset[bank_index];
+    uint32_t addr = (bank_offset_index * align_power_of_2(page_size, interleaved_addr_gen::get_allocator_alignment<true>())) + bank_base_address + offset +
+                    bank_to_dram_offset[bank_index];
     uint32_t noc_xy = interleaved_addr_gen::get_noc_xy<true>(bank_index, noc);
     uint64_t noc_addr = get_noc_addr_helper(noc_xy, addr);
     return noc_addr;
@@ -634,8 +653,8 @@ uint64_t get_l1_noc_addr(
     uint8_t noc = noc_index) {
     uint32_t bank_offset_index = interleaved_addr_gen::get_bank_offset_index<false>(id);
     uint32_t bank_index = interleaved_addr_gen::get_bank_index<false>(id, bank_offset_index);
-    uint32_t addr = (bank_offset_index * align_power_of_2(page_size, ALLOCATOR_ALIGNMENT)) + bank_base_address +
-                    offset + bank_to_dram_offset[bank_index];
+    uint32_t addr = (bank_offset_index * align_power_of_2(page_size, interleaved_addr_gen::get_allocator_alignment<false>())) + bank_base_address + offset +
+                    bank_to_dram_offset[bank_index];
     uint32_t noc_xy = interleaved_addr_gen::get_noc_xy<false>(bank_index, noc);
     uint64_t noc_addr = get_noc_addr_helper(noc_xy, addr);
     return noc_addr;
@@ -1005,7 +1024,7 @@ template <bool DRAM>
 struct InterleavedAddrGen {
     uint32_t bank_base_address;  // Base address for the whole tensor.
     const uint32_t page_size;    // Num bytes in page.
-    const uint32_t aligned_page_size = align_power_of_2(page_size, ALLOCATOR_ALIGNMENT);
+    const uint32_t aligned_page_size = align_power_of_2(page_size, interleaved_addr_gen::get_allocator_alignment<DRAM>());
 
     FORCE_INLINE
     uint32_t get_addr(
@@ -1040,9 +1059,11 @@ struct InterleavedPow2AddrGen {
     const uint32_t bank_base_address;
     const uint32_t log_base_2_of_page_size;  // WARNING: This struct is used for optimized get_noc_addr in which case
                                              // you know that bank_unit_size is a power of 2
-    const uint32_t aligned_log_base_2_of_page_size = this->log_base_2_of_page_size > LOG_BASE_2_OF_ALLOCATOR_ALIGNMENT
+    static constexpr uint32_t log_base_2_of_allocator_alignment =
+        interleaved_addr_gen::get_log_base2_of_allocator_alignment<DRAM>();
+    const uint32_t aligned_log_base_2_of_page_size = this->log_base_2_of_page_size > log_base_2_of_allocator_alignment
                                                          ? this->log_base_2_of_page_size
-                                                         : LOG_BASE_2_OF_ALLOCATOR_ALIGNMENT;
+                                                         : log_base_2_of_allocator_alignment;
 
     FORCE_INLINE
     uint32_t get_addr(
@@ -1155,9 +1176,11 @@ template <bool DRAM>
 struct InterleavedPow2AddrGenFast {
     uint32_t bank_base_address;              // Base address for the whole tensor.
     const uint32_t log_base_2_of_page_size;  // Num bytes in bank unit.
-    const uint32_t aligned_log_base_2_of_page_size = this->log_base_2_of_page_size > LOG_BASE_2_OF_ALLOCATOR_ALIGNMENT
+    static constexpr uint32_t log_base_2_of_allocator_alignment =
+        interleaved_addr_gen::get_log_base2_of_allocator_alignment<DRAM>();
+    const uint32_t aligned_log_base_2_of_page_size = this->log_base_2_of_page_size > log_base_2_of_allocator_alignment
                                                          ? this->log_base_2_of_page_size
-                                                         : LOG_BASE_2_OF_ALLOCATOR_ALIGNMENT;
+                                                         : log_base_2_of_allocator_alignment;
 
     FORCE_INLINE
     uint32_t get_addr(
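A compile-time sketch of the clamp the `InterleavedPow2AddrGen` structs now perform, assuming the Blackhole log2 values; the template mirrors the new `get_log_base2_of_allocator_alignment` dispatch but is otherwise standalone:

```cpp
#include <algorithm>
#include <cstdint>

// Assumed Blackhole log2 alignments from this PR: DRAM 2^6, L1 2^4.
template <bool DRAM>
constexpr uint32_t get_log_base2_of_allocator_alignment() {
    return DRAM ? 6u : 4u;
}

// A page slot can never be smaller than one allocator-alignment unit,
// so the log page size is raised to the alignment's log when needed.
template <bool DRAM>
constexpr uint32_t aligned_log2_page_size(uint32_t log_base_2_of_page_size) {
    return std::max(log_base_2_of_page_size, get_log_base2_of_allocator_alignment<DRAM>());
}

static_assert(aligned_log2_page_size<true>(5) == 6);   // a 32 B page occupies a 64 B DRAM slot
static_assert(aligned_log2_page_size<false>(5) == 5);  // but fits exactly in 16 B-aligned L1
static_assert(aligned_log2_page_size<false>(3) == 4);  // an 8 B page still pads to 16 B in L1
```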
diff --git a/tt_metal/hw/inc/grayskull/core_config.h b/tt_metal/hw/inc/grayskull/core_config.h
index 5f73abc2364..066d86376c0 100644
--- a/tt_metal/hw/inc/grayskull/core_config.h
+++ b/tt_metal/hw/inc/grayskull/core_config.h
@@ -17,5 +17,5 @@ constexpr uint8_t MaxProcessorsPerCoreType = 5;
 constexpr uint8_t NumTensixDispatchClasses = 3;
 constexpr uint8_t noc_size_x = 13;
 constexpr uint8_t noc_size_y = 12;
-#define ALLOCATOR_ALIGNMENT 32
-#define LOG_BASE_2_OF_ALLOCATOR_ALIGNMENT 5
+#define LOG_BASE_2_OF_DRAM_ALIGNMENT 5
+#define LOG_BASE_2_OF_L1_ALIGNMENT 4
diff --git a/tt_metal/hw/inc/wormhole/core_config.h b/tt_metal/hw/inc/wormhole/core_config.h
index 491ab6bb54a..e1d0c168036 100644
--- a/tt_metal/hw/inc/wormhole/core_config.h
+++ b/tt_metal/hw/inc/wormhole/core_config.h
@@ -22,5 +22,5 @@ constexpr uint8_t NumTensixDispatchClasses = 3;
 constexpr uint8_t NumEthDispatchClasses = 1;
 constexpr uint8_t noc_size_x = 10;
 constexpr uint8_t noc_size_y = 12;
-#define ALLOCATOR_ALIGNMENT 32
-#define LOG_BASE_2_OF_ALLOCATOR_ALIGNMENT 5
+#define LOG_BASE_2_OF_DRAM_ALIGNMENT 5
+#define LOG_BASE_2_OF_L1_ALIGNMENT 4
diff --git a/tt_metal/impl/allocator/allocator.cpp b/tt_metal/impl/allocator/allocator.cpp
index 059c209d7a0..36772afe944 100644
--- a/tt_metal/impl/allocator/allocator.cpp
+++ b/tt_metal/impl/allocator/allocator.cpp
@@ -236,7 +236,7 @@ void init_one_bank_per_channel(Allocator& allocator, const AllocatorConfig& allo
         BufferType::DRAM,
         bank_offsets,
         dram_bank_size,
-        alloc_config.alignment,
+        alloc_config.dram_alignment,
         alloc_config.dram_unreserved_base,
         alloc_config.disable_interleaved);
     for (uint32_t bank_id = 0; bank_id < alloc_config.num_dram_channels; bank_id++) {
@@ -251,7 +251,7 @@ void init_one_bank_per_channel(Allocator& allocator, const AllocatorConfig& allo
         BufferType::TRACE,
         bank_offsets,
         alloc_config.trace_region_size,
-        alloc_config.alignment,
+        alloc_config.dram_alignment,
         dram_bank_size + alloc_config.dram_unreserved_base,
         alloc_config.disable_interleaved);
     for (uint32_t bank_id = 0; bank_id < alloc_config.num_dram_channels; bank_id++) {
@@ -272,7 +272,7 @@ void init_one_bank_per_l1(Allocator& allocator, const AllocatorConfig& alloc_con
         BufferType::L1,
         bank_offsets,
         l1_bank_size,
-        alloc_config.alignment,
+        alloc_config.l1_alignment,
         alloc_config.l1_unreserved_base,
         alloc_config.disable_interleaved);
 
@@ -349,6 +349,18 @@ const std::vector<uint32_t>& bank_ids_from_logical_core(
     return allocator.logical_core_to_bank_ids.at(buffer_type).at(logical_core);
 }
 
+uint32_t get_alignment(const Allocator& allocator, const BufferType& buffer_type) {
+    switch (buffer_type) {
+        case BufferType::DRAM:
+        case BufferType::TRACE: return allocator.config.dram_alignment;
+        case BufferType::L1:
+        case BufferType::L1_SMALL: return allocator.config.l1_alignment;
+        default: {
+            TT_THROW("Allocator does not support buffer type");
+        }
+    }
+}
+
 Statistics get_statistics(const Allocator& allocator, const BufferType& buffer_type) {
     Statistics stats;
     switch (buffer_type) {
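For illustration, the mapping encoded by the new `allocator::get_alignment` switch, reduced to a standalone sketch; `ConfigView` is a hypothetical stand-in for `AllocatorConfig`:

```cpp
#include <cstdint>
#include <stdexcept>

// Local stand-ins; the real types live in tt_metal.
enum class BufferType { DRAM, L1, L1_SMALL, TRACE, SYSTEM_MEMORY };

struct ConfigView {
    uint32_t dram_alignment;
    uint32_t l1_alignment;
};

// TRACE buffers live in DRAM and L1_SMALL banks live in L1, so each
// buffer type inherits the alignment of the memory that backs it.
uint32_t get_alignment(const ConfigView& config, BufferType buffer_type) {
    switch (buffer_type) {
        case BufferType::DRAM:
        case BufferType::TRACE: return config.dram_alignment;
        case BufferType::L1:
        case BufferType::L1_SMALL: return config.l1_alignment;
        default: throw std::invalid_argument("Allocator does not support buffer type");
    }
}
```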
diff --git a/tt_metal/impl/allocator/allocator.hpp b/tt_metal/impl/allocator/allocator.hpp
index a3eb7f05ed0..ab48af937df 100644
--- a/tt_metal/impl/allocator/allocator.hpp
+++ b/tt_metal/impl/allocator/allocator.hpp
@@ -118,6 +118,8 @@ const std::vector<uint32_t>& bank_ids_from_dram_channel(const Allocator& allocat
 const std::vector<uint32_t>& bank_ids_from_logical_core(
     const Allocator& allocator, BufferType buffer_type, const CoreCoord& logical_core);
 
+uint32_t get_alignment(const Allocator& allocator, const BufferType& buffer_type);
+
 Statistics get_statistics(const Allocator& allocator, const BufferType& buffer_type);
 
 void dump_memory_blocks(const Allocator& allocator, const BufferType& buffer_type, std::ofstream& out);
diff --git a/tt_metal/impl/allocator/allocator_types.hpp b/tt_metal/impl/allocator/allocator_types.hpp
index b4ad6bf960c..5de487b95c9 100644
--- a/tt_metal/impl/allocator/allocator_types.hpp
+++ b/tt_metal/impl/allocator/allocator_types.hpp
@@ -36,6 +36,7 @@ struct AllocatorConfig {
     size_t dram_bank_size = 0;
     std::vector<size_t> dram_bank_offsets = {};
     uint32_t dram_unreserved_base = 0;
+    uint32_t dram_alignment = 0;
     //! worker specific configuration
     uint32_t l1_unreserved_base = 0;
     CoreRangeSet worker_grid = {};
@@ -49,7 +50,7 @@ struct AllocatorConfig {
     BankMapping l1_bank_remap = {};  // for remapping which l1 bank points to which bank if we assume normal
                                      // row-major assignment
     CoreRangeSet compute_grid = {};
-    uint32_t alignment = 0;
+    uint32_t l1_alignment = 0;
     bool disable_interleaved = false;
     void reset();
     ~AllocatorConfig() { reset(); }
diff --git a/tt_metal/impl/allocator/l1_banking_allocator.cpp b/tt_metal/impl/allocator/l1_banking_allocator.cpp
index c79cd949a44..652ed7794f1 100644
--- a/tt_metal/impl/allocator/l1_banking_allocator.cpp
+++ b/tt_metal/impl/allocator/l1_banking_allocator.cpp
@@ -189,7 +189,7 @@ void init_compute_and_storage_l1_bank_manager(Allocator& allocator, const Alloca
     // Storage only cores only need to reserve mailbox space to hold barriers
     uint32_t mem_mailbox_base = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::MAILBOX);
     uint32_t storage_core_unreserved_base =
-        ((mem_mailbox_base + alloc_config.alignment - 1) / alloc_config.alignment) * alloc_config.alignment;
+        ((mem_mailbox_base + alloc_config.l1_alignment - 1) / alloc_config.l1_alignment) * alloc_config.l1_alignment;
 
     // There is only l1_bank_size bytes available for L1 buffers to be allocated in
     uint64_t l1_bank_size = alloc_config.storage_core_bank_size.has_value()
@@ -205,7 +205,8 @@ void init_compute_and_storage_l1_bank_manager(Allocator& allocator, const Alloca
         bank_id_to_bank_offset,
         allocatable_l1_size,
         interleaved_address_limit,
-        alloc_config.alignment,
+        alloc_config.dram_alignment,  // use DRAM alignment for L1 banks, to stay consistent with DRAM and avoid
+                                      // memory address shifting
         alloc_config.l1_unreserved_base,
         alloc_config.disable_interleaved);
 
@@ -219,7 +220,7 @@ void init_compute_and_storage_l1_bank_manager(Allocator& allocator, const Alloca
         small_bank_id_to_bank_offset,
         alloc_config.l1_small_size,
         small_interleaved_address_limit,
-        alloc_config.alignment,
+        alloc_config.l1_alignment,
         small_alloc_offset,
         alloc_config.disable_interleaved);
 }
diff --git a/tt_metal/impl/buffers/buffer.cpp b/tt_metal/impl/buffers/buffer.cpp
index 47fef708876..71ef23207fd 100644
--- a/tt_metal/impl/buffers/buffer.cpp
+++ b/tt_metal/impl/buffers/buffer.cpp
@@ -449,7 +449,7 @@ DeviceAddr Buffer::bank_local_page_address(uint32_t bank_id, uint32_t page_index
 }
 
 uint32_t Buffer::alignment() const {
-    return this->allocator_->config.alignment;
+    return this->device_->get_allocator_alignment(this->buffer_type_);
 }
 
 DeviceAddr Buffer::aligned_page_size() const {
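The storage-core base in `l1_banking_allocator.cpp` uses the classic divide-multiply round-up, now with the L1 alignment. A tiny compile-time check of what that buys (the mailbox address is hypothetical; alignments are the Wormhole values from this PR):

```cpp
#include <cstdint>

constexpr uint32_t round_up(uint32_t x, uint32_t a) { return ((x + a - 1) / a) * a; }

// Hypothetical mailbox end address; L1 alignment 16 B vs. the old shared 32 B.
constexpr uint32_t mem_mailbox_base = 100;
static_assert(round_up(mem_mailbox_base, 16) == 112);  // new: L1 alignment
static_assert(round_up(mem_mailbox_base, 32) == 128);  // old: 16 B wasted per storage core
```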
diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp
index aed0658c120..ab78aa9be3f 100644
--- a/tt_metal/impl/device/device.cpp
+++ b/tt_metal/impl/device/device.cpp
@@ -253,6 +253,7 @@ std::unique_ptr<Allocator> Device::initialize_allocator(size_t l1_small_size, si
         .dram_bank_offsets = {},
         .dram_unreserved_base = hal.get_dev_addr(HalDramMemAddrType::DRAM_BARRIER) + \
                                 hal.get_dev_size(HalDramMemAddrType::DRAM_BARRIER),
+        .dram_alignment = hal.get_alignment(HalMemType::DRAM),
         .l1_unreserved_base = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED),
         .worker_grid = CoreRangeSet(CoreRange(CoreCoord(0, 0), CoreCoord(logical_size.x - 1, logical_size.y - 1))),
         .worker_l1_size = static_cast<size_t>(soc_desc.worker_l1_size),
@@ -264,14 +265,14 @@ std::unique_ptr<Allocator> Device::initialize_allocator(size_t l1_small_size, si
         .worker_log_to_virtual_routing_y = tt::Cluster::instance().get_worker_logical_to_virtual_y(this->id()),
         .l1_bank_remap = {l1_bank_remap.begin(), l1_bank_remap.end()},
         .compute_grid = CoreRangeSet(CoreRange(CoreCoord(0, 0), CoreCoord(compute_size.x - 1, compute_size.y - 1))),
-        .alignment = std::max(hal.get_alignment(HalMemType::DRAM), hal.get_alignment(HalMemType::L1)),
+        .l1_alignment = hal.get_alignment(HalMemType::L1),
         .disable_interleaved = false});
     TT_FATAL(config.l1_small_size < (config.storage_core_bank_size.has_value()
                                          ? config.storage_core_bank_size.value() : config.worker_l1_size - config.l1_unreserved_base),
         "Reserved size must be less than bank size");
     TT_FATAL(
-        config.l1_small_size % config.alignment == 0,
-        "Reserved size must be aligned to allocator alignment {}",
-        config.alignment);
+        config.l1_small_size % config.l1_alignment == 0,
+        "Reserved size must be aligned to L1 allocator alignment {}",
+        config.l1_alignment);
     // Initialize dram_offsets from soc_descriptor
     for (auto channel = 0; channel < soc_desc.get_num_dram_channels(); channel++) {
         config.dram_bank_offsets.push_back(soc_desc.get_address_offset(channel));
@@ -1401,14 +1402,14 @@ allocator::Statistics Device::get_memory_allocation_statistics(const BufferType
     return allocator::get_statistics(*allocator, buffer_type);
 }
 
-uint32_t Device::get_allocator_alignment() const {
+uint32_t Device::get_allocator_alignment(const BufferType &buffer_type) const {
     const auto& allocator = this->get_initialized_allocator();
-    return allocator->config.alignment;
+    return allocator::get_alignment(*allocator, buffer_type);
 }
 
-uint32_t Device::get_allocator_alignment(SubDeviceId sub_device_id) const {
+uint32_t Device::get_allocator_alignment(const BufferType &buffer_type, SubDeviceId sub_device_id) const {
     const auto& allocator = this->get_initialized_allocator(sub_device_id);
-    return allocator->config.alignment;
+    return allocator::get_alignment(*allocator, buffer_type);
 }
 
 size_t Device::get_l1_small_size() const {
diff --git a/tt_metal/impl/device/device.hpp b/tt_metal/impl/device/device.hpp
index 88bfdeb4b91..74ccf39400d 100644
--- a/tt_metal/impl/device/device.hpp
+++ b/tt_metal/impl/device/device.hpp
@@ -129,8 +129,8 @@ class Device : public IDevice {
     allocator::Statistics get_memory_allocation_statistics(const BufferType &buffer_type) const override;
     allocator::Statistics get_memory_allocation_statistics(const BufferType &buffer_type, SubDeviceId sub_device_id) const override;
 
-    uint32_t get_allocator_alignment() const override;
-    uint32_t get_allocator_alignment(SubDeviceId sub_device_id) const override;
+    uint32_t get_allocator_alignment(const BufferType &buffer_type) const override;
+    uint32_t get_allocator_alignment(const BufferType &buffer_type, SubDeviceId sub_device_id) const override;
 
     std::optional<DeviceAddr> lowest_occupied_compute_l1_address() const override;
     std::optional<DeviceAddr> lowest_occupied_compute_l1_address(tt::stl::Span<const SubDeviceId> sub_device_ids) const override;
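The `device.cpp` change replaces the single `max(DRAM, L1)` alignment with the true L1 value, which loosens the `l1_small_size` check. A sketch with assumed Wormhole values:

```cpp
#include <algorithm>
#include <cstdint>

int main() {
    // Wormhole values from this PR: DRAM 2^5 = 32 B, L1 2^4 = 16 B.
    constexpr uint32_t dram_alignment = 32, l1_alignment = 16;
    // Before: one shared alignment, the max of the two, over-aligned L1.
    constexpr uint32_t old_shared_alignment = std::max(dram_alignment, l1_alignment);
    static_assert(old_shared_alignment == 32);
    // After: an l1_small_size of 48 B passes the new L1-only check...
    static_assert(48 % l1_alignment == 0);
    // ...but would have been rejected by the old shared check.
    static_assert(48 % old_shared_alignment != 0);
    return 0;
}
```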
diff --git a/tt_metal/impl/dispatch/memcpy.hpp b/tt_metal/impl/dispatch/memcpy.hpp
index d10a606b349..d1134ccbba4 100644
--- a/tt_metal/impl/dispatch/memcpy.hpp
+++ b/tt_metal/impl/dispatch/memcpy.hpp
@@ -32,6 +32,14 @@ static inline void memcpy_to_device(void* __restrict dst, const void* __restrict
     uint8_t* dst8 = (uint8_t*)dst;
 
     if (size_t num_lines = n / inner_blk_size) {
+        if ((uintptr_t)dst8 % sizeof(__m256i) != 0) {
+            __m128i blk = _mm_loadu_si128((const __m128i *)src8);
+            _mm_stream_si128((__m128i *)dst8, blk);
+            src8 += sizeof(__m128i);
+            dst8 += sizeof(__m128i);
+            n -= sizeof(__m128i);
+            num_lines = n / inner_blk_size;
+        }
         for (size_t i = 0; i < num_lines; ++i) {
             for (size_t j = 0; j < inner_loop; ++j) {
                 __m256i blk = _mm256_loadu_si256((const __m256i*)src8);
@@ -45,6 +53,14 @@ static inline void memcpy_to_device(void* __restrict dst, const void* __restrict
 
     if (n > 0) {
         if (size_t num_lines = n / sizeof(__m256i)) {
+            if ((uintptr_t)dst8 % sizeof(__m256i) != 0) {
+                __m128i blk = _mm_loadu_si128((const __m128i *)src8);
+                _mm_stream_si128((__m128i *)dst8, blk);
+                src8 += sizeof(__m128i);
+                dst8 += sizeof(__m128i);
+                n -= sizeof(__m128i);
+                num_lines = n / sizeof(__m256i);
+            }
             for (size_t i = 0; i < num_lines; ++i) {
                 __m256i blk = _mm256_loadu_si256((const __m256i*)src8);
                 _mm256_stream_si256((__m256i*)dst8, blk);
diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp
index 6ad9bbc0ac6..fddcda11d63 100644
--- a/tt_metal/impl/program/program.cpp
+++ b/tt_metal/impl/program/program.cpp
@@ -790,7 +790,7 @@ void detail::Program_::allocate_circular_buffers(const IDevice* device) {
             }
         }
     }
-    computed_addr = align(computed_addr, device->get_allocator_alignment());
+    computed_addr = align(computed_addr, device->get_allocator_alignment(BufferType::DRAM));
     for (const CoreRange &core_range : circular_buffer->core_ranges().ranges()) {
         for (CircularBufferAllocator &cb_allocator : this->cb_allocators_) {
             if (cb_allocator.core_range.intersects(core_range)) {
diff --git a/tt_metal/impl/sub_device/sub_device_manager.cpp b/tt_metal/impl/sub_device/sub_device_manager.cpp
index 80ad3a259af..fe7547b31f4 100644
--- a/tt_metal/impl/sub_device/sub_device_manager.cpp
+++ b/tt_metal/impl/sub_device/sub_device_manager.cpp
@@ -277,6 +277,7 @@ void SubDeviceManager::populate_sub_allocators() {
             .dram_bank_size = 0,
             .dram_bank_offsets = global_allocator_config.dram_bank_offsets,
             .dram_unreserved_base = global_allocator_config.dram_unreserved_base,
+            .dram_alignment = global_allocator_config.dram_alignment,
             .l1_unreserved_base = global_allocator_config.l1_unreserved_base,
             .worker_grid = compute_cores,
             .worker_l1_size = global_allocator_config.l1_unreserved_base + local_l1_size_,
@@ -288,7 +289,7 @@ void SubDeviceManager::populate_sub_allocators() {
             .worker_log_to_virtual_routing_y = global_allocator_config.worker_log_to_virtual_routing_y,
             .l1_bank_remap = std::move(l1_bank_remap),
             .compute_grid = compute_cores,
-            .alignment = global_allocator_config.alignment,
+            .l1_alignment = global_allocator_config.l1_alignment,
             .disable_interleaved = true});
         TT_FATAL(
             config.l1_small_size < (config.storage_core_bank_size.has_value()
@@ -296,9 +297,9 @@
                                         : config.worker_l1_size - config.l1_unreserved_base),
             "Reserved size must be less than bank size");
         TT_FATAL(
-            config.l1_small_size % config.alignment == 0,
-            "Reserved size must be aligned to allocator alignment {}",
-            config.alignment);
+            config.l1_small_size % config.l1_alignment == 0,
+            "Reserved size must be aligned to L1 allocator alignment {}",
+            config.l1_alignment);
 
         // sub_devices only have compute cores for allocation
         for (const CoreCoord& core : corerange_to_cores(compute_cores)) {
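The `memcpy.hpp` fix peels one 128-bit streamed store when the destination is not 32-byte aligned before entering the 256-bit streaming loops. A standalone sketch of the same peeling idea (it assumes, as that code path does for device BARs, that `dst` is at least 16-byte aligned; the tail handling here is simplified to a plain `memcpy`):

```cpp
#include <cstdint>
#include <cstring>
#include <immintrin.h>

// Peel one 16 B block when dst is 16 B- but not 32 B-aligned, then stream 32 B blocks.
static inline void memcpy_to_device_sketch(void* __restrict dst, const void* __restrict src, size_t n) {
    const uint8_t* src8 = static_cast<const uint8_t*>(src);
    uint8_t* dst8 = static_cast<uint8_t*>(dst);

    if (n >= sizeof(__m128i) && reinterpret_cast<uintptr_t>(dst8) % sizeof(__m256i) != 0) {
        __m128i blk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src8));
        _mm_stream_si128(reinterpret_cast<__m128i*>(dst8), blk);  // dst is now 32 B aligned
        src8 += sizeof(__m128i);
        dst8 += sizeof(__m128i);
        n -= sizeof(__m128i);
    }
    for (size_t i = 0; i < n / sizeof(__m256i); ++i) {
        __m256i blk = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src8));
        _mm256_stream_si256(reinterpret_cast<__m256i*>(dst8), blk);
        src8 += sizeof(__m256i);
        dst8 += sizeof(__m256i);
    }
    if (size_t rem = n % sizeof(__m256i)) {
        std::memcpy(dst8, src8, rem);  // simplified non-streamed tail
    }
    _mm_sfence();  // order the streaming stores before any subsequent flag write
}
```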
virtual uint32_t get_allocator_alignment() const = 0;
-    virtual uint32_t get_allocator_alignment(SubDeviceId sub_device_id) const = 0;
+    virtual uint32_t get_allocator_alignment(const BufferType& buffer_type) const = 0;
+    virtual uint32_t get_allocator_alignment(const BufferType& buffer_type, SubDeviceId sub_device_id) const = 0;
 
     virtual std::optional<DeviceAddr> lowest_occupied_compute_l1_address() const = 0;
     virtual std::optional<DeviceAddr> lowest_occupied_compute_l1_address(tt::stl::Span<const SubDeviceId> sub_device_ids) const = 0;
diff --git a/tt_metal/programming_examples/sharding/shard_data_rm.cpp b/tt_metal/programming_examples/sharding/shard_data_rm.cpp
index 81d2841f84c..bdbc56de13a 100644
--- a/tt_metal/programming_examples/sharding/shard_data_rm.cpp
+++ b/tt_metal/programming_examples/sharding/shard_data_rm.cpp
@@ -45,7 +45,7 @@ int main(int argc, char** argv) {
     uint32_t input_unit_size = sizeof(uint32_t);
     uint32_t shard_width_bytes = shard_width * data_size;
     uint32_t num_units_per_row = shard_width * input_unit_size;
-    uint32_t padded_offset_bytes = align(input_unit_size, device->get_allocator_alignment());
+    uint32_t padded_offset_bytes = align(input_unit_size, device->get_allocator_alignment(BufferType::DRAM));
 
     // configure and create interleaved DRAM buffer to insert source data into
     uint32_t src_buffer_size = input_unit_size * num_values / data_size;
diff --git a/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_program_factory.cpp
index 2e99d29eb9d..f497f5aae16 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_program_factory.cpp
@@ -454,9 +454,10 @@ tt_metal::operation::ProgramWithCallbacks concat_multi_core(
 
     uint32_t num_output_pages;
     uint32_t single_page_size;
+    uint32_t common_align_len = std::max(input_tensors[0].buffer()->alignment(), output.buffer()->alignment());
     if (rm_layout) {
         num_output_pages = output.volume() / output.get_legacy_shape()[-1];
-        single_page_size = align(output.element_size() * output.get_legacy_shape()[-1], output.buffer()->alignment());
+        single_page_size = align(output.element_size() * output.get_legacy_shape()[-1], common_align_len);
     } else {
         num_output_pages = output.volume() / TILE_HW;
         single_page_size = tt_metal::detail::TileSize(cb_data_format);
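The `concat` change sizes staging pages by the larger of the input and output buffer alignments, since a page must satisfy both sides of the copy. A compile-time illustration with hypothetical values:

```cpp
#include <algorithm>
#include <cstdint>

constexpr uint32_t align_up(uint32_t x, uint32_t a) { return ((x + a - 1) / a) * a; }

// Hypothetical: a DRAM input (64 B alignment) concatenated into an L1 output (16 B).
constexpr uint32_t input_alignment = 64;
constexpr uint32_t output_alignment = 16;
constexpr uint32_t row_bytes = 2 * 1000;  // element_size * last-dim elements, hypothetical

static_assert(align_up(row_bytes, std::max(input_alignment, output_alignment)) == 2048);
```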
diff --git a/ttnn/cpp/ttnn/operations/data_movement/fold/device/fold_multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/fold/device/fold_multi_core_program_factory.cpp
index 41851065fa2..2019e696d0c 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/fold/device/fold_multi_core_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/fold/device/fold_multi_core_program_factory.cpp
@@ -39,7 +39,7 @@ Fold::MultiCore::cached_program_t fold_multi_core(
 
     // input CB
     uint32_t cb_src0_index = tt::CBIndex::c_0;
-    uint32_t aligned_pixel_size = round_up_to_mul32(pixel_size);
+    uint32_t aligned_pixel_size = round_up_to_mul16(pixel_size);
     auto src_cb_config = CircularBufferConfig(num_pixels * aligned_pixel_size, {{cb_src0_index, cb_data_format}})
                              .set_page_size(cb_src0_index, aligned_pixel_size)
                              .set_globally_allocated_address(*input.buffer());
@@ -47,7 +47,7 @@ Fold::MultiCore::cached_program_t fold_multi_core(
 
     // output CB
     uint32_t cb_dst0_index = tt::CBIndex::c_16;
-    uint32_t aligned_dst_pixel_size = round_up_to_mul32(dst_pixel_size);
+    uint32_t aligned_dst_pixel_size = round_up_to_mul16(dst_pixel_size);
     auto dst_cb_config = CircularBufferConfig(num_dst_pixels * aligned_dst_pixel_size, {{cb_dst0_index, cb_data_format}})
                              .set_page_size(cb_dst0_index, aligned_dst_pixel_size)
diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp
index 3c6817ba62f..fb71f62c8eb 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp
@@ -1429,14 +1429,12 @@ operation::ProgramWithCallbacks pad_rm_sharded_width_only(
         TT_THROW("ttnn.pad: unsupported data type for pad_rm_sharded_stickwise");
     }
 
-    // FIXME: assumes that this was sharded using DRAM alignment so that gaps are left in the tensor.
-    // if this changes, we should change the stick step to be 16B (L1 alignment).
-    auto dram_alignment_bytes = tt::tt_metal::hal.get_alignment(tt::tt_metal::HalMemType::DRAM);
+    auto l1_alignment_bytes = tt::tt_metal::hal.get_alignment(tt::tt_metal::HalMemType::L1);
     uint32_t padded_stick_step = tt::round_up(
-        padded_stick_bytes, dram_alignment_bytes);  // round padded_stick bytes to a multiple of dram_alignment_bytes
+        padded_stick_bytes, l1_alignment_bytes);  // round padded_stick bytes to a multiple of l1_alignment_bytes
     uint32_t unpadded_stick_step = tt::round_up(
         unpadded_stick_bytes,
-        dram_alignment_bytes);  // round unpadded_stick bytes to a multiple of dram_alignment_bytes
+        l1_alignment_bytes);  // round unpadded_stick bytes to a multiple of l1_alignment_bytes
 
     std::vector<uint32_t> reader_ct_args = {
         unpadded_stick_bytes,
diff --git a/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/multi_core/groupnorm_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/multi_core/groupnorm_op_multi_core.cpp
index cf8f62d1825..2a07682522e 100644
--- a/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/multi_core/groupnorm_op_multi_core.cpp
+++ b/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/multi_core/groupnorm_op_multi_core.cpp
@@ -171,7 +171,8 @@ operation::ProgramWithCallbacks groupnorm_multi_core_sharded(
     uint32_t per_core_N = a.shard_spec().value().shape[1];
     uint32_t per_core_Mt = per_core_M / TILE_HEIGHT;
     uint32_t per_core_Nt = (per_core_N + TILE_WIDTH - 1) / TILE_WIDTH;
-    uint32_t per_core_N_bytes_padded = round_up_to_mul32(per_core_N * datum_size_bytes);
+    uint32_t l1_alignment = tt::tt_metal::hal.get_alignment(tt::tt_metal::HalMemType::L1);
+    uint32_t per_core_N_bytes_padded = tt::round_up(per_core_N * datum_size_bytes, l1_alignment);
     bool reader_repack_output = (per_core_N % TILE_WIDTH) != 0;
     bool tilize_in = a.get_layout() == Layout::ROW_MAJOR;
     bool untilize_out = output.get_layout() == Layout::ROW_MAJOR;