#8706: Add accessors to get aligned page size, aligned buffer size and alignment on buffer
abhullar-tt committed Jun 24, 2024
1 parent 5212c7d commit 9d208cc
Showing 21 changed files with 172 additions and 132 deletions.
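The changes below replace uses of the global ADDRESS_ALIGNMENT constant with per-buffer accessors, so DRAM-backed and L1-backed buffers can report different alignment requirements. The accessor definitions themselves are not in the hunks shown here; the following is only a rough sketch of the shape implied by the commit title and the call sites below (buffer()->alignment(), page_size(), size()), with hypothetical member names and padding arithmetic:

// Sketch only: member names and the padding arithmetic are assumptions
// inferred from the commit title, not copied from the tt_metal sources.
#include <cstdint>

class BufferSketch {
public:
    // Alignment in bytes required by the backing memory space
    // (DRAM_ALIGNMENT for DRAM buffers, L1_ALIGNMENT for L1 buffers).
    uint32_t alignment() const { return alignment_; }

    // Page size rounded up to that alignment.
    uint32_t aligned_page_size() const {
        return ((page_size_ + alignment_ - 1) / alignment_) * alignment_;
    }

    // Buffer size with every page occupying an aligned slot.
    uint32_t aligned_size() const {
        return (size_ / page_size_) * aligned_page_size();
    }

private:
    uint32_t size_ = 4096;
    uint32_t page_size_ = 2048;
    uint32_t alignment_ = 32;
};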
2 changes: 1 addition & 1 deletion tt_eager/tt_dnn/op_library/all_gather/all_gather_op.cpp
@@ -35,7 +35,7 @@ void AllGather::validate(const std::vector<Tensor> &input_tensors) const {
const auto& layout = input_tensors[0].get_layout();
const auto& dtype = input_tensors[0].get_dtype();
const auto& page_size = input_tensors[0].buffer()->page_size();
TT_FATAL(page_size % ADDRESS_ALIGNMENT == 0, "All Gather currently requires aligned pages");
TT_FATAL(page_size % input_tensors[0].buffer()->alignment() == 0, "All Gather currently requires aligned pages");

// TODO: This can be removed by passing two page sizes, actual and aligned to be used for address offsets
// Buffer sizes also need to take this aligned page size into consideration
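The validation changes in all_gather, concat, and repeat below all follow the same pattern: a row-major page (or last-dimension byte width) must be a multiple of the buffer's own alignment rather than of the old global constant. A self-contained sketch of the check, using an assumed 32 B DRAM alignment:

#include <cassert>
#include <cstdint>

// Returns true when a row-major last dimension lands on an aligned boundary,
// which is what the TT_FATAL checks in this commit enforce.
bool last_dim_is_aligned(uint32_t last_dim_elems, uint32_t element_size_bytes,
                         uint32_t buffer_alignment_bytes) {
    return (last_dim_elems * element_size_bytes) % buffer_alignment_bytes == 0;
}

int main() {
    assert(last_dim_is_aligned(1024, 2, 32));  // 2048 B row, multiple of 32 B: ok
    assert(!last_dim_is_aligned(10, 2, 32));   // 20 B row: the op would TT_FATAL
}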
4 changes: 2 additions & 2 deletions tt_eager/tt_dnn/op_library/concat/concat_op.cpp
@@ -43,7 +43,7 @@ void Concat::validate(const std::vector<Tensor> &input_tensors) const {
TT_FATAL(curr_shape == shape_first, "concat tensors differ in shape across non-concat dimensions.");
if (in_ref.get_layout() == Layout::ROW_MAJOR && this->dim == shape_first.rank() - 1) {
TT_FATAL(
(in_ref.get_legacy_shape()[this->dim] * in_ref.element_size()) % ADDRESS_ALIGNMENT == 0,
(in_ref.get_legacy_shape()[this->dim] * in_ref.element_size()) % in_ref.buffer()->alignment() == 0,
"Current concat implementation requires aligned last dim when concatting on last dim");
}
TT_FATAL(in_ref.is_sharded() == shard_first, "All tensors must be sharded or all must be interleaved");
@@ -114,7 +114,7 @@ Tensor concat(std::vector<Tensor> &input_tensors, const std::int64_t dim, const
if (input_tensors[0].get_layout() == Layout::ROW_MAJOR && normalized_dim == ref_rank - 1) {
for (const auto &input_tensor : input_tensors) {
TT_FATAL(
(input_tensor.get_legacy_shape()[dim] * input_tensor.element_size()) % ADDRESS_ALIGNMENT ==
(input_tensor.get_legacy_shape()[dim] * input_tensor.element_size()) % input_tensor.buffer()->alignment() ==
0,
"Current concat implementation requires aligned last dim when concatting on last dim");
}
@@ -532,7 +532,7 @@ operation::ProgramWithCallbacks concat_multi_core(
uint32_t single_page_size;
if (rm_layout) {
num_output_pages = output.volume() / output.get_legacy_shape()[-1];
single_page_size = align(output.element_size() * output.get_legacy_shape()[-1], ADDRESS_ALIGNMENT);
single_page_size = align(output.element_size() * output.get_legacy_shape()[-1], output.buffer()->alignment());
} else {
num_output_pages = output.volume() / TILE_HW;
single_page_size = tt_metal::detail::TileSize(cb_data_format);
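Here and in the repeat and sharded program factories below, row-major page sizes are rounded up with align(..., buffer->alignment()) instead of align(..., ADDRESS_ALIGNMENT). A minimal stand-in for that helper, assuming it rounds up to the nearest multiple (as round_up does elsewhere in this diff):

#include <cstdint>

// Assumed behaviour of the align()/round_up() helpers used by these factories.
constexpr uint32_t align_up(uint32_t x, uint32_t alignment) {
    return ((x + alignment - 1) / alignment) * alignment;
}

static_assert(align_up(2048, 32) == 2048, "already-aligned pages are unchanged");
static_assert(align_up(2050, 32) == 2080, "otherwise pad up to the next multiple");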
4 changes: 2 additions & 2 deletions tt_eager/tt_dnn/op_library/move/move_op.hpp
@@ -74,13 +74,13 @@ inline Tensor move(const Tensor& input_tensor, const std::optional<MemoryConfig>
// Input and output addresses won't overlap if they are in different memory substrates
bool non_overlap = not move_within_same_mem_space;
const auto num_banks = input_tensor.device()->num_banks(output_tensor.buffer()->buffer_type());
uint32_t size_per_bank = tt_metal::detail::SizeBytesPerBank(output_tensor.buffer()->size(), output_tensor.buffer()->page_size(), num_banks);
uint32_t size_per_bank = tt_metal::detail::SizeBytesPerBank(output_tensor.buffer()->size(), output_tensor.buffer()->page_size(), num_banks, output_tensor.buffer()->alignment());

// If input and output buffers overlap, input has to be copied into circular buffer before writing to output
// Only compute with storage cores allow CBs to be created
auto compute_with_storage_grid_size = input_tensor.device()->compute_with_storage_grid_size();
const auto num_l1_banks = compute_with_storage_grid_size.x * compute_with_storage_grid_size.y;
uint32_t size_per_l1_bank = tt_metal::detail::SizeBytesPerBank(output_tensor.buffer()->size(), output_tensor.buffer()->page_size(), num_l1_banks);
uint32_t size_per_l1_bank = tt_metal::detail::SizeBytesPerBank(output_tensor.buffer()->size(), output_tensor.buffer()->page_size(), num_l1_banks, L1_ALIGNMENT);

if (move_within_same_mem_space) {
switch (input_mem_config.buffer_type) {
@@ -76,7 +76,7 @@ operation::ProgramWithCallbacks move_multi_core_with_overlap(const Tensor &input
const auto num_dram_banks = device->num_banks(BufferType::DRAM);
const auto num_l1_banks = compute_with_storage_grid_size.x * compute_with_storage_grid_size.y;

uint32_t size_per_l1_bank = tt_metal::detail::SizeBytesPerBank(output.buffer()->size(), output.buffer()->page_size(), num_l1_banks);
uint32_t size_per_l1_bank = tt_metal::detail::SizeBytesPerBank(output.buffer()->size(), output.buffer()->page_size(), num_l1_banks, L1_ALIGNMENT);

// CB is being used as temp L1 buffer to copy src data into before writing to dst
uint32_t cb_index = 0;
@@ -64,10 +64,13 @@ operation::ProgramWithCallbacks move_multi_core_sharded(const Tensor& input, Ten
output_buffer_address > input_buffer_address,
"Expected output buffer to be allocated at a higher address than input buffer");
uint32_t move_chunk_size_bytes = output_buffer_address - input_buffer_address;
TT_FATAL(input.buffer()->alignment() == output.buffer()->alignment(),
"Expected input buffer alignment ({} B) and output buffer alignment ({} B) to be equal",
input.buffer()->alignment(), output.buffer()->alignment());
TT_FATAL(
move_chunk_size_bytes % ADDRESS_ALIGNMENT == 0,
move_chunk_size_bytes % input.buffer()->alignment() == 0,
"Expected chunk size bytes to move to be {} byte aligned.",
ADDRESS_ALIGNMENT);
input.buffer()->alignment());
uint32_t num_chunks = total_size_bytes / move_chunk_size_bytes;
uint32_t remainder_chunk_size_bytes = total_size_bytes % move_chunk_size_bytes;

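The sharded move shifts data in chunks equal to the gap between the output and input base addresses, so the new checks require the two buffers to agree on alignment and the gap to be a multiple of it. A small worked example with assumed addresses and a 32 B alignment:

#include <cassert>
#include <cstdint>

int main() {
    // Assumed example values, not taken from the diff.
    const uint32_t alignment = 32;                    // input and output must agree
    const uint32_t input_base = 0x10000;
    const uint32_t output_base = 0x10400;             // 1 KiB above the input
    const uint32_t total_size_bytes = 9 * 1024;

    const uint32_t chunk = output_base - input_base;  // 1024 B per move
    assert(chunk % alignment == 0);                   // mirrors the TT_FATAL above

    const uint32_t num_chunks = total_size_bytes / chunk;       // 9 full chunks
    const uint32_t remainder_bytes = total_size_bytes % chunk;  // 0 B left over
    (void)num_chunks;
    (void)remainder_bytes;
}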
@@ -29,7 +29,7 @@ operation::ProgramWithCallbacks repeat_multi_core(
uint32_t single_page_size;
if (rm_layout) {
num_output_pages = output.volume() / output.get_legacy_shape()[-1];
single_page_size = align(output.element_size() * output.get_legacy_shape()[-1], ADDRESS_ALIGNMENT);
single_page_size = align(output.element_size() * output.get_legacy_shape()[-1], output.buffer()->alignment());
} else {
num_output_pages = output.volume() / TILE_HW;
single_page_size = tt_metal::detail::TileSize(cb_data_format);
4 changes: 2 additions & 2 deletions tt_eager/tt_dnn/op_library/repeat/repeat_op.cpp
@@ -24,7 +24,7 @@ void Repeat::validate(const std::vector<Tensor> &input_tensors) const {
TT_FATAL(this->repeat_dim < input_shape.rank(), "Repeat dim specified is larger than input tensor rank.");
if (input_tensor.get_layout() == Layout::ROW_MAJOR && this->repeat_dim == input_shape.rank() - 1) {
TT_FATAL(
(input_shape[this->repeat_dim] * input_tensor.element_size()) % ADDRESS_ALIGNMENT == 0,
(input_shape[this->repeat_dim] * input_tensor.element_size()) % input_tensor.buffer()->alignment() == 0,
"Current repeat implementation requires aligned last dim when repeating on last dim");
}
TT_FATAL(this->num_repeats > 0, "Number of repeats should be greater than 0");
@@ -75,7 +75,7 @@ Tensor repeat(const Tensor &input_tensor, const Shape &shape, const MemoryConfig
TT_FATAL(shape[dim] > 0, "Number of repetitions along a dim must be greater than 0");
if (input_tensor.get_layout() == Layout::ROW_MAJOR && dim == input_rank - 1) {
TT_FATAL(
(input_tensor.get_legacy_shape()[dim] * input_tensor.element_size()) % ADDRESS_ALIGNMENT == 0,
(input_tensor.get_legacy_shape()[dim] * input_tensor.element_size()) % input_tensor.buffer()->alignment() == 0,
"Current repeat implementation requires aligned last dim when repeating on last dim");
}
output = operation::run_without_autoformat(Repeat{dim, shape[dim], output_mem_config}, {output}).at(0);
@@ -82,7 +82,7 @@ operation::ProgramWithCallbacks interleaved_to_sharded_multi_core(
// TODO: Use a different variable name. Units refers to pages, but this is being used as size
num_units_per_shard_width_last =
input_unit_size - (round_up(num_units_per_row, input_unit_size) - num_units_per_row);
padded_offset_bytes = align(input_unit_size, ADDRESS_ALIGNMENT);
padded_offset_bytes = align(input_unit_size, input.buffer()->alignment());
}

bool convert_df = input_cb_data_format != output_cb_data_format;
@@ -98,10 +98,10 @@
uint32_t scratch_cb_index = CB::c_in1;
uint32_t out_cb_index = input_cb_index;
uint32_t num_input_units = num_units_per_shard;
uint32_t output_page_size = align(output_unit_size, ADDRESS_ALIGNMENT);
uint32_t output_page_size = align(output_unit_size, dst_buffer->alignment());
if (convert_df) {
out_cb_index = CB::c_out0;
uint32_t input_page_size = align(input_unit_size, ADDRESS_ALIGNMENT);
uint32_t input_page_size = align(input_unit_size, src_buffer->alignment());
tt_metal::CircularBufferConfig input_cb_out_config =
tt_metal::CircularBufferConfig(num_input_units * input_page_size, {{input_cb_index, input_cb_data_format}})
.set_page_size(input_cb_index, input_page_size);
@@ -379,15 +379,15 @@ operation::ProgramWithCallbacks sharded_to_interleaved_multi_core(
uint32_t src0_cb_index = CB::c_in0;
uint32_t out_cb_index = src0_cb_index;
uint32_t num_input_units = num_units_per_shard;
uint32_t input_page_size = align(input_unit_size, ADDRESS_ALIGNMENT);
uint32_t input_page_size = align(input_unit_size, input.buffer()->alignment());
tt_metal::CircularBufferConfig cb_src0_config =
tt_metal::CircularBufferConfig(num_input_units * input_page_size, {{src0_cb_index, input_cb_data_format}})
.set_page_size(src0_cb_index, input_page_size)
.set_globally_allocated_address(*input.buffer());
auto cb_src0 = tt_metal::CreateCircularBuffer(program, all_cores, cb_src0_config);
if (convert_df) {
out_cb_index = CB::c_out0;
uint32_t output_page_size = align(output_unit_size, ADDRESS_ALIGNMENT);
uint32_t output_page_size = align(output_unit_size, output.buffer()->alignment());
tt_metal::CircularBufferConfig output_cb_out_config =
tt_metal::CircularBufferConfig(num_input_units * output_page_size, {{out_cb_index, output_cb_data_format}})
.set_page_size(out_cb_index, output_page_size);
@@ -450,7 +450,7 @@ operation::ProgramWithCallbacks sharded_to_interleaved_multi_core(
uint32_t curr_idx_w = 0;

const auto cores = corerange_to_cores(all_cores, std::nullopt, rm_orientation);
uint32_t padded_shard_width = align(output_unit_size, ADDRESS_ALIGNMENT);
uint32_t padded_shard_width = align(output_unit_size, dst_buffer->alignment());
for (const auto& core : cores) {
if (input.get_layout() == Layout::TILE) {
uint32_t shard_height = num_units_per_shard_height;
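In these interleaved/sharded conversions the circular-buffer page size and the padded shard width come from the unit size rounded up to the relevant buffer's alignment, so the padding can differ between the L1 and DRAM sides of the transfer. A small numeric illustration with assumed alignment values:

#include <cstdint>

// Same assumed round-up helper as in the earlier sketch.
constexpr uint32_t align_up(uint32_t x, uint32_t a) { return ((x + a - 1) / a) * a; }

int main() {
    const uint32_t unit_size = 100 * 2;   // e.g. 100 bfloat16 elements per row = 200 B
    const uint32_t dram_alignment = 32;   // assumed DRAM buffer alignment
    const uint32_t l1_alignment = 16;     // assumed L1 buffer alignment

    // The same unit pads differently depending on which buffer it targets,
    // which is why the kernels now query the buffer instead of a global constant.
    const uint32_t dram_page = align_up(unit_size, dram_alignment);  // 224 B
    const uint32_t l1_page = align_up(unit_size, l1_alignment);      // 208 B
    (void)dram_page;
    (void)l1_page;
}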
4 changes: 0 additions & 4 deletions tt_metal/detail/tt_metal.hpp
@@ -230,7 +230,6 @@ namespace tt::tt_metal{
{
bool pass = true;
TT_FATAL(address >= DRAM_UNRESERVED_BASE, "Cannot write to reserved DRAM region, addresses [0, {}) are reserved!", DRAM_UNRESERVED_BASE);
// TODO: add a check for address validating dram alignment
tt::Cluster::instance().write_dram_vec(host_buffer, tt_target_dram{device->id(), dram_channel, 0}, address);
return pass;
}
@@ -251,7 +250,6 @@
inline bool ReadFromDeviceDRAMChannel(Device *device, int dram_channel, uint32_t address, uint32_t size, std::vector<uint32_t> &host_buffer)
{
bool pass = true;
// todo: add a check for address validating dram alignment
tt::Cluster::instance().dram_barrier(device->id());
tt::Cluster::instance().read_dram_vec(host_buffer, size, tt_target_dram{device->id(), dram_channel, 0}, address);
return pass;
@@ -272,7 +270,6 @@
inline bool WriteToDeviceL1(Device *device, const CoreCoord &logical_core, uint32_t address, std::vector<uint32_t> &host_buffer, CoreType core_type = CoreType::WORKER)
{
ZoneScoped;
// todo: add a check for address validating l1 alignment
auto worker_core = device->physical_core_from_logical_core(logical_core, core_type);
llrt::write_hex_vec_to_core(device->id(), worker_core, host_buffer, address);
return true;
Expand Down Expand Up @@ -301,7 +298,6 @@ namespace tt::tt_metal{
*/
inline bool ReadFromDeviceL1(Device *device, const CoreCoord &logical_core, uint32_t address, uint32_t size, std::vector<uint32_t> &host_buffer)
{
// todo: add a check for address validating l1 alignment
tt::Cluster::instance().l1_barrier(device->id());
auto worker_core = device->worker_core_from_logical_core(logical_core);
host_buffer = llrt::read_hex_vec_from_core(device->id(), worker_core, address, size);
4 changes: 2 additions & 2 deletions tt_metal/detail/util.hpp
@@ -24,11 +24,11 @@ namespace tt::tt_metal::detail{
return tt::tile_size(data_format);
}

inline uint32_t SizeBytesPerBank(uint32_t size_bytes, uint32_t page_size_bytes, uint32_t num_banks) {
inline uint32_t SizeBytesPerBank(uint32_t size_bytes, uint32_t page_size_bytes, uint32_t num_banks, uint32_t alignment_bytes) {
TT_ASSERT(page_size_bytes > 0 and size_bytes % page_size_bytes == 0, "Page size {} should be divisible by buffer size {}", page_size_bytes, size_bytes);
uint32_t num_pages = size_bytes / page_size_bytes;
int num_equally_distributed_pages = num_pages == 1 ? 1 : 1 + ((num_pages - 1) / num_banks);
return num_equally_distributed_pages * round_up(page_size_bytes, ADDRESS_ALIGNMENT);
return num_equally_distributed_pages * round_up(page_size_bytes, alignment_bytes);
}

inline NOC GetPreferredNOCForDRAMRead(ARCH arch) {
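SizeBytesPerBank now takes the alignment as an argument, so the per-bank footprint of a buffer can be computed against either the destination buffer's own alignment (the DRAM path in move_op.hpp above) or L1_ALIGNMENT (the circular-buffer path). A worked example reproducing the arithmetic with assumed values:

#include <cassert>
#include <cstdint>

// Re-statement of the arithmetic above for illustration; round_up is assumed
// to round up to the nearest multiple of the alignment.
uint32_t size_bytes_per_bank(uint32_t size_bytes, uint32_t page_size_bytes,
                             uint32_t num_banks, uint32_t alignment_bytes) {
    const uint32_t num_pages = size_bytes / page_size_bytes;
    const uint32_t pages_per_bank = num_pages == 1 ? 1 : 1 + ((num_pages - 1) / num_banks);
    const uint32_t padded_page =
        ((page_size_bytes + alignment_bytes - 1) / alignment_bytes) * alignment_bytes;
    return pages_per_bank * padded_page;
}

int main() {
    // 64 pages of 2000 B over 12 banks -> 6 pages per bank; the padding per
    // page then depends on the alignment the caller passes in (values assumed).
    assert(size_bytes_per_bank(64 * 2000, 2000, 12, 32) == 6 * 2016);
    assert(size_bytes_per_bank(64 * 2000, 2000, 12, 16) == 6 * 2000);
}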
4 changes: 2 additions & 2 deletions tt_metal/hostdevcommon/common_runtime_address_map.h
@@ -44,7 +44,7 @@ static_assert (PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC > PROFILER_L1_BUFFER_SIZE
constexpr static std::uint32_t L1_KERNEL_CONFIG_BASE = 98 * 1024;
constexpr static std::uint32_t IDLE_ERISC_L1_KERNEL_CONFIG_BASE = 32 * 1024;

constexpr static std::uint32_t L1_ALIGNMENT = 16;
constexpr static std::uint32_t L1_ALIGNMENT = NOC_L1_READ_ALIGNMENT_BYTES >= NOC_L1_WRITE_ALIGNMENT_BYTES ? NOC_L1_READ_ALIGNMENT_BYTES : NOC_L1_WRITE_ALIGNMENT_BYTES;

// config for 32 L1 buffers is at addr BUFFER_CONFIG_BASE
// 12 bytes for each buffer: (addr, size, size_in_tiles)
@@ -87,7 +87,7 @@ constexpr static std::uint32_t ERISC_L1_UNRESERVED_BASE = L1_UNRESERVED_BASE; //
// Reserved DRAM addresses
// Host writes (4B value) to and reads from DRAM_BARRIER_BASE across all channels to ensure previous writes have been committed
constexpr static std::uint32_t DRAM_BARRIER_BASE = 0;
constexpr static std::uint32_t DRAM_ALIGNMENT = std::max(NOC_DRAM_READ_ALIGNMENT_BYTES, NOC_DRAM_WRITE_ALIGNMENT_BYTES);
constexpr static std::uint32_t DRAM_ALIGNMENT = NOC_DRAM_READ_ALIGNMENT_BYTES >= NOC_DRAM_WRITE_ALIGNMENT_BYTES ? NOC_DRAM_READ_ALIGNMENT_BYTES : NOC_DRAM_WRITE_ALIGNMENT_BYTES;
constexpr static std::uint32_t DRAM_BARRIER_SIZE = ((sizeof(uint32_t) + DRAM_ALIGNMENT - 1) / DRAM_ALIGNMENT) * DRAM_ALIGNMENT;
constexpr static std::uint32_t DRAM_UNRESERVED_BASE = DRAM_BARRIER_BASE + DRAM_BARRIER_SIZE; // Start of unreserved space

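L1_ALIGNMENT and DRAM_ALIGNMENT are now the larger of the corresponding NOC read and write alignments, written as a ternary rather than std::max — presumably so the header stays usable in device firmware builds without <algorithm>, though the diff itself does not say so. A standalone sketch of the pattern with made-up values:

#include <cstdint>

// Assumed example values; the real ones come from noc_parameters.h.
constexpr std::uint32_t READ_ALIGNMENT_BYTES = 32;
constexpr std::uint32_t WRITE_ALIGNMENT_BYTES = 16;

// Ternary max, as used for L1_ALIGNMENT and DRAM_ALIGNMENT above.
constexpr std::uint32_t ALIGNMENT =
    READ_ALIGNMENT_BYTES >= WRITE_ALIGNMENT_BYTES ? READ_ALIGNMENT_BYTES
                                                  : WRITE_ALIGNMENT_BYTES;

// The barrier slot is then the 4 B barrier value rounded up to that alignment.
constexpr std::uint32_t BARRIER_SIZE =
    ((sizeof(std::uint32_t) + ALIGNMENT - 1) / ALIGNMENT) * ALIGNMENT;

static_assert(ALIGNMENT == 32 && BARRIER_SIZE == 32, "stricter of read/write wins");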
7 changes: 3 additions & 4 deletions tt_metal/hw/inc/debug/sanitize_noc.h
@@ -97,8 +97,8 @@ inline void debug_sanitize_post_noc_addr_and_hang(

// Return value is the alignment mask for the type of core the noc address points
// to. Need to do this because L1 alignment needs to match the noc address alignment requirements,
// even if it's different than the inherent L1 alignment requirements. Note that additional
// alignment restrictions only apply for writes from L1, so need to specify direction as well.
// even if it's different than the inherent L1 alignment requirements.
// Direction is specified because reads and writes may have different L1 requirements (see noc_parameters.h).
uint32_t debug_sanitize_noc_addr(
uint64_t noc_addr,
uint32_t l1_addr,
@@ -129,16 +129,15 @@

// Check noc addr, we save the alignment requirement from the noc src/dst because the L1 address
// needs to match alignment.
// Reads and writes may have different alignment requirements, see noc_parameters.h for details.
uint32_t alignment_mask = (dir == DEBUG_SANITIZE_NOC_READ ? NOC_L1_READ_ALIGNMENT_BYTES : NOC_L1_WRITE_ALIGNMENT_BYTES) - 1; // Default alignment, only override in certain cases.
uint32_t invalid = multicast ? DebugSanitizeNocInvalidMulticast : DebugSanitizeNocInvalidUnicast;
if (NOC_PCIE_XY_P(x, y)) {
// Additional alignment restriction only applies to reads
alignment_mask = (dir == DEBUG_SANITIZE_NOC_READ ? NOC_PCIE_READ_ALIGNMENT_BYTES : NOC_PCIE_WRITE_ALIGNMENT_BYTES) - 1;
if (!DEBUG_VALID_PCIE_ADDR(noc_local_addr, noc_len)) {
debug_sanitize_post_noc_addr_and_hang(noc_addr, l1_addr, noc_len, multicast, invalid);
}
} else if (NOC_DRAM_XY_P(x, y)) {
// Additional alignment restriction only applies to reads
alignment_mask = (dir == DEBUG_SANITIZE_NOC_READ ? NOC_DRAM_READ_ALIGNMENT_BYTES : NOC_DRAM_WRITE_ALIGNMENT_BYTES) - 1;
if (!DEBUG_VALID_DRAM_ADDR(noc_local_addr, noc_len)) {
debug_sanitize_post_noc_addr_and_hang(noc_addr, l1_addr, noc_len, multicast, invalid);
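The sanitizer builds an alignment mask from the direction-specific alignment (read vs write, and PCIe/DRAM vs plain L1). With power-of-two alignments, masking with alignment - 1 exposes any misaligned low bits; a minimal sketch with assumed values:

#include <cassert>
#include <cstdint>

int main() {
    // Assumed alignment; the real values come from noc_parameters.h.
    const uint32_t read_alignment_bytes = 32;
    const uint32_t alignment_mask = read_alignment_bytes - 1;  // 0x1F

    const uint64_t ok_addr = 0x20000040;   // multiple of 32: passes the check
    const uint64_t bad_addr = 0x20000044;  // off by 4 B: sanitizer would report and hang

    assert((ok_addr & alignment_mask) == 0);
    assert((bad_addr & alignment_mask) != 0);
}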