From e035930dc10083fa276a9d684f58bdb42612cc47 Mon Sep 17 00:00:00 2001 From: Nathan Sidwell Date: Mon, 2 Dec 2024 20:45:58 +0000 Subject: [PATCH] #13944: Redesign memory packing API --- .../eth/test_erisc_app_direct_send.cpp | 8 +- .../test_compile_sets_kernel_binaries.cpp | 23 ++-- tt_metal/hw/toolchain/sections.ld | 22 ++- tt_metal/impl/device/device.cpp | 10 +- tt_metal/impl/kernels/kernel.cpp | 27 +--- tt_metal/llrt/llrt.cpp | 33 ++--- tt_metal/llrt/llrt.hpp | 11 +- tt_metal/llrt/tt_elffile.cpp | 64 ++++----- tt_metal/llrt/tt_elffile.hpp | 10 +- tt_metal/llrt/tt_memory.cpp | 129 ++++++------------ tt_metal/llrt/tt_memory.h | 106 +++++++------- 11 files changed, 179 insertions(+), 264 deletions(-) diff --git a/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp b/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp index 3b65d3684db..abe18295c73 100644 --- a/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp +++ b/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp @@ -227,10 +227,10 @@ bool send_over_eth( // TODO: this should be updated to use kernel api uint32_t active_eth_index = hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH); - ll_api::memory const& binary_mem_send = llrt::get_risc_binary( - sender_device->build_firmware_target_path(active_eth_index, 0, 0), active_eth_index, 0, 0); - ll_api::memory const& binary_mem_receive = llrt::get_risc_binary( - receiver_device->build_firmware_target_path(active_eth_index, 0, 0), active_eth_index, 0, 0); + ll_api::memory const& binary_mem_send = + llrt::get_risc_binary(sender_device->build_firmware_target_path(active_eth_index, 0, 0)); + ll_api::memory const& binary_mem_receive = + llrt::get_risc_binary(receiver_device->build_firmware_target_path(active_eth_index, 0, 0)); for (const auto& eth_core : eth_cores) { llrt::write_hex_vec_to_core( diff --git a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp index 6a338ac7358..5f62aa46e74 100644 --- a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp +++ b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp @@ -209,8 +209,8 @@ int main(int argc, char** argv) { dm_class_idx, 0, get_latest_kernel_binary_path(mask, riscv0_kernel)); - ll_api::memory const& brisc_binary = llrt::get_risc_binary( - brisc_hex_path, 0, 0, 0, ll_api::memory::PackSpans::PACK, ll_api::memory::Relocate::XIP); + ll_api::memory const& brisc_binary = + llrt::get_risc_binary(brisc_hex_path, ll_api::memory::Loading::CONTIGUOUS_XIP); TT_FATAL( brisc_binary == *brisc_binaries.at(mask).at(0), "Expected saved BRISC binary to be the same as binary in persistent cache"); @@ -219,13 +219,11 @@ int main(int argc, char** argv) { dm_class_idx, 1, get_latest_kernel_binary_path(mask, riscv1_kernel)); - ll_api::memory::Relocate relo_type = + auto load_type = (device->arch() == tt::ARCH::GRAYSKULL || device->arch() == tt::ARCH::WORMHOLE_B0) - ? ll_api::memory::Relocate::NONE - : ll_api::memory::Relocate::XIP; - - ll_api::memory const& ncrisc_binary = - llrt::get_risc_binary(ncrisc_hex_path, 0, 1, 0, ll_api::memory::PackSpans::PACK, relo_type); + ? 
ll_api::memory::Loading::CONTIGUOUS + : ll_api::memory::Loading::CONTIGUOUS_XIP; + ll_api::memory const& ncrisc_binary = llrt::get_risc_binary(ncrisc_hex_path, load_type); TT_FATAL( ncrisc_binary == *ncrisc_binaries.at(mask).at(0), "Expected saved NCRISC binary to be the same as binary in persistent cache"); @@ -236,13 +234,8 @@ int main(int argc, char** argv) { compute_class_idx, trisc_id, get_latest_kernel_binary_path(mask, compute_kernel)); - ll_api::memory const& trisc_binary = llrt::get_risc_binary( - trisc_hex_path, - 0, - 2, - trisc_id, - ll_api::memory::PackSpans::PACK, - ll_api::memory::Relocate::XIP); + ll_api::memory const& trisc_binary = + llrt::get_risc_binary(trisc_hex_path, ll_api::memory::Loading::CONTIGUOUS_XIP); TT_FATAL( trisc_binary == *compute_binaries.at(mask).at(trisc_id), "Expected saved TRISC binary for {} to be the same as binary in persistent cache", diff --git a/tt_metal/hw/toolchain/sections.ld b/tt_metal/hw/toolchain/sections.ld index 1abe5276410..2f0e0544e5f 100644 --- a/tt_metal/hw/toolchain/sections.ld +++ b/tt_metal/hw/toolchain/sections.ld @@ -65,6 +65,7 @@ SECTIONS *(.text .stub .text.* .gnu.linkonce.t.*) /* .gnu.warning sections are handled specially by elf32.em. */ *(.gnu.warning) + . = ALIGN(4); } > REGION_CODE :text .init.fini : { @@ -73,23 +74,30 @@ SECTIONS ASSERT(SIZEOF(.init.fini) == 0, ".init/.fini sections have contents"); } > REGION_CODE :text - . = ALIGN(. + MEM_PAD, MEM_ALIGN); +#if defined(TYPE_KERNEL) + __kernel_data_lma = .; +#endif + . = ALIGN(ABSOLUTE(.) + MEM_PAD, MEM_ALIGN); #if defined(TYPE_FIRMWARE) - __fw_export_end_text = .; + __fw_export_end_text = ABSOLUTE(.); #if defined(TARGET_NCRISC) - PROVIDE (KERNEL_ENTRY_SYMBOL = __fw_export_end_text); + PROVIDE (KERNEL_ENTRY_SYMBOL = ABSOLUTE(__fw_export_end_text)); #endif #endif #if defined(TYPE_KERNEL) - __kernel_init_local_l1_base = .; + __kernel_init_local_l1_base = ABSOLUTE(.); #endif #if defined(TYPE_FIRMWARE) PROVIDE(__global_pointer$ = ORIGIN(REGION_DATA) + 0x7f0); #endif - .data DATA_START : ALIGN(4) + .data DATA_START : +#if defined (TYPE_KERNEL) + AT(__kernel_data_lma) +#endif + ALIGN(4) { . = .; /* Force section emission. */ __ldm_data_start = .; @@ -141,8 +149,8 @@ SECTIONS } > REGION_DATA :data #ifdef TYPE_FIRMWARE - . = ALIGN(MEM_ALIGN); - __fw_export_ldm_end = .; + . 
= ALIGN(ABSOLUTE(.), MEM_ALIGN); + __fw_export_ldm_end = ABSOLUTE(.); #endif #ifdef TYPE_FIRMWARE diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index 7a708b760d4..81597324722 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -438,10 +438,7 @@ void Device::initialize_firmware(const HalProgrammableCoreType &core_type, CoreC auto [build_idx, num_build_states] = this->build_processor_type_to_index(core_type_idx, processor_class); for (uint32_t riscv_id = build_idx; riscv_id < (build_idx + num_build_states); riscv_id++) { ll_api::memory const& binary_mem = llrt::get_risc_binary( - firmware_build_states_[riscv_id]->get_target_out_path(""), - core_type_idx, - processor_class, - (riscv_id - build_idx)); + firmware_build_states_[riscv_id]->get_target_out_path("")); uint32_t fw_size = binary_mem.get_text_size(); if (riscv_id == 1) { // TODO: clean up how brisc/ncrisc are handled // In this context, ncrisc_kernel_size16 is the size of the fw @@ -485,10 +482,7 @@ void Device::initialize_firmware(const HalProgrammableCoreType &core_type, CoreC auto [build_idx, num_build_states] = this->build_processor_type_to_index(core_type_idx, processor_class); for (uint32_t eriscv_id = build_idx; eriscv_id < (build_idx + num_build_states); eriscv_id++) { ll_api::memory const& binary_mem = llrt::get_risc_binary( - firmware_build_states_[eriscv_id]->get_target_out_path(""), - core_type_idx, - processor_class, - (eriscv_id - build_idx)); + firmware_build_states_[eriscv_id]->get_target_out_path("")); uint32_t fw_size = binary_mem.get_text_size(); log_debug(LogDevice, "ERISC fw binary size: {} in bytes", fw_size); llrt::test_load_write_read_risc_binary(binary_mem, this->id(), virtual_core, core_type_idx, processor_class, (eriscv_id - build_idx)); diff --git a/tt_metal/impl/kernels/kernel.cpp b/tt_metal/impl/kernels/kernel.cpp index a3f67470d21..b6cecf023d0 100644 --- a/tt_metal/impl/kernels/kernel.cpp +++ b/tt_metal/impl/kernels/kernel.cpp @@ -378,17 +378,12 @@ void DataMovementKernel::read_binaries(Device *device) { int riscv_id = static_cast::type>(this->config_.processor); const JitBuildState &build_state = device->build_kernel_state(tensix_core_type, dm_class_idx, riscv_id); // TODO: from HAL - ll_api::memory::Relocate relo_type = + auto load_type = (riscv_id == 1 && (device->arch() == tt::ARCH::GRAYSKULL || device->arch() == tt::ARCH::WORMHOLE_B0)) ? - ll_api::memory::Relocate::NONE : ll_api::memory::Relocate::XIP; + ll_api::memory::Loading::CONTIGUOUS : ll_api::memory::Loading::CONTIGUOUS_XIP; ll_api::memory const& binary_mem = llrt::get_risc_binary( build_state.get_target_out_path(this->kernel_full_name_), - // processor class is BRISC/NCRISC and each have one data movement processor type - tensix_core_type, - riscv_id, - dm_class_idx, - ll_api::memory::PackSpans::PACK, - relo_type); + load_type); binaries.push_back(&binary_mem); uint32_t binary_size = binary_mem.get_packed_size(); log_debug(LogLoader, "RISC {} kernel binary size: {} in bytes", riscv_id, binary_size); @@ -405,15 +400,11 @@ void EthernetKernel::read_binaries(Device *device) { const JitBuildState &build_state = device->build_kernel_state(erisc_core_type, dm_class_idx, erisc_id); int risc_id = erisc_id + (this->config_.eth_mode == Eth::IDLE ? 6 : 5); // TODO (abhullar): clean this up when llrt helpers use HAL // TODO: fix when active eth supports relo - ll_api::memory::Relocate relo_type = (this->config_.eth_mode == Eth::IDLE) ? 
- ll_api::memory::Relocate::XIP : ll_api::memory::Relocate::NONE; + auto load_type = (this->config_.eth_mode == Eth::IDLE) ? + ll_api::memory::Loading::CONTIGUOUS_XIP : ll_api::memory::Loading::DISCRETE; ll_api::memory const& binary_mem = llrt::get_risc_binary( build_state.get_target_out_path(this->kernel_full_name_), - erisc_core_type, - erisc_id, - dm_class_idx, - ll_api::memory::PackSpans::PACK, - relo_type); + load_type); binaries.push_back(&binary_mem); uint32_t binary_size = binary_mem.get_packed_size(); log_debug(LogLoader, "ERISC {} kernel binary size: {} in bytes", erisc_id, binary_size); @@ -429,11 +420,7 @@ void ComputeKernel::read_binaries(Device *device) { const JitBuildState &build_state = device->build_kernel_state(tensix_core_type, compute_class_idx, trisc_id); ll_api::memory const& binary_mem = llrt::get_risc_binary( build_state.get_target_out_path(this->kernel_full_name_), - tensix_core_type, - compute_class_idx, - trisc_id, - ll_api::memory::PackSpans::PACK, - ll_api::memory::Relocate::XIP); + ll_api::memory::Loading::CONTIGUOUS_XIP); binaries.push_back(&binary_mem); uint32_t binary_size = binary_mem.get_packed_size(); log_debug(LogLoader, "RISC {} kernel binary size: {} in bytes", trisc_id + 2, binary_size); diff --git a/tt_metal/llrt/llrt.cpp b/tt_metal/llrt/llrt.cpp index 71c22599113..f180b4a43ab 100644 --- a/tt_metal/llrt/llrt.cpp +++ b/tt_metal/llrt/llrt.cpp @@ -47,33 +47,20 @@ using std::uint64_t; ll_api::memory const& get_risc_binary( string const& path, - uint32_t core_type_idx, - uint32_t processor_class_idx, - uint32_t processor_type_idx, - ll_api::memory::PackSpans span_type, - ll_api::memory::Relocate relo_type) { + ll_api::memory::Loading loading) { static struct { - std::unordered_map> map; + std::unordered_map> map; std::mutex mutex; std::condition_variable cvar; } cache; std::unique_lock lock(cache.mutex); auto [slot, inserted] = cache.map.try_emplace(path); + ll_api::memory const* ptr = nullptr; if (inserted) { // We're the first with PATH. Create and insert. lock.unlock(); - auto *ptr = new ll_api::memory(path, relo_type); - - // TODO: pass pack_spans into reader, generate text/data sizes - // from segment sizes and pack there - if (span_type == ll_api::memory::PackSpans::PACK) { - uint64_t data_start = tt::tt_metal::hal.get_dev_addr(tt::tt_metal::HalProgrammableCoreType::TENSIX, tt::tt_metal::HalL1MemAddrType::LOCAL); - uint64_t text_start = (relo_type == ll_api::memory::Relocate::XIP) ? - 0 : - tt::tt_metal::hal.get_base_firmware_addr(core_type_idx, processor_class_idx, processor_type_idx); - ptr->pack_data_into_text(text_start, data_start); - } + ptr = new ll_api::memory(path, loading); lock.lock(); // maps have iterator stability, so SLOT is still valid. @@ -81,12 +68,16 @@ ll_api::memory const& get_risc_binary( // We can't wake just those waiting on this slot, so wake them // all. Should be a rare event anyway. cache.cvar.notify_all(); - } else if (!slot->second) { - // Someone else is creating the initial entry, wait for them. - cache.cvar.wait(lock, [=] { return bool(slot->second); }); + } else { + if (!slot->second) { + // Someone else is creating the initial entry, wait for them. 
+ cache.cvar.wait(lock, [=] { return bool(slot->second); }); + } + ptr = slot->second.get(); + TT_ASSERT(ptr->get_loading() == loading); } - return *slot->second.get(); + return *ptr; } // CoreCoord core --> NOC coordinates ("functional workers" from the SOC descriptor) diff --git a/tt_metal/llrt/llrt.hpp b/tt_metal/llrt/llrt.hpp index 0bf814e5869..60ceeadd004 100644 --- a/tt_metal/llrt/llrt.hpp +++ b/tt_metal/llrt/llrt.hpp @@ -49,17 +49,10 @@ using WorkerCore = tt_cxy_pair; using WorkerCores = std::vector; // Return a reference to a potentially shared binary image. -// The images are cached by path name, which is never erased. -// TODO: Remove core_type_idx, processor_class_idx, -// processor_type_idx -- the information they provide can be -// obtained directly from the binary image. +// The images are cached by path name. ll_api::memory const& get_risc_binary( string const& path, - uint32_t core_type_idx, - uint32_t processor_class_idx, - uint32_t processor_type_idx, - ll_api::memory::PackSpans span_type = ll_api::memory::PackSpans::NO_PACK, - ll_api::memory::Relocate relo_type = ll_api::memory::Relocate::NONE); + ll_api::memory::Loading loading = ll_api::memory::Loading::DISCRETE); // TODO: try using "stop" method from device instead, it's the proper way of asserting reset diff --git a/tt_metal/llrt/tt_elffile.cpp b/tt_metal/llrt/tt_elffile.cpp index b38a7903f4f..f46ce1d2990 100644 --- a/tt_metal/llrt/tt_elffile.cpp +++ b/tt_metal/llrt/tt_elffile.cpp @@ -112,8 +112,7 @@ class ElfFile::Impl { // advantage of the (a) fact that sections cannot straddle // segment boundaries -- they're either wholey inside or // wholey outside, and (b) unsigned arithmetic. - return shdr.sh_flags & SHF_ALLOC && shdr.sh_addr + shdr.sh_size - segment.address <= - (segment.contents.size() + segment.bss) * sizeof(word_t); + return shdr.sh_flags & SHF_ALLOC && shdr.sh_addr + shdr.sh_size - segment.address <= segment.membytes; } [[nodiscard]] bool IsInSegment(unsigned _ix, Elf32_Shdr const& shdr) const { return IsInSegment(GetSegments()[_ix], shdr); @@ -243,12 +242,13 @@ void ElfFile::Impl::LoadImage() { } GetSegments().reserve(hdr.e_phnum); - bool haveText = false, haveStack = false; + bool haveStack = false; for (auto const& phdr : GetPhdrs()) { if (phdr.p_type == PT_RISCV_ATTRIBUTES) { // TODO: verify Arch is ok? continue; } + if (phdr.p_type == PT_GNU_STACK) { haveStack = true; } else if (phdr.p_type != PT_LOAD) { @@ -268,38 +268,35 @@ void ElfFile::Impl::LoadImage() { phdr.p_offset); // Require loadable segments to be nicely aligned - if ((phdr.p_offset | phdr.p_vaddr) & (sizeof(word_t) - 1)) { + if ((phdr.p_offset | phdr.p_vaddr | phdr.p_paddr) & (sizeof(word_t) - 1)) { TT_THROW( - "{}: loadable segment {} is misaligned, [{},+{}/{})@{}", + "{}: loadable segment {} is misaligned, [{}({}),+{}/{})@{}", path_, unsigned(GetSegments().size()), phdr.p_vaddr, + phdr.p_paddr, phdr.p_filesz, phdr.p_memsz, phdr.p_offset); } auto contents = GetContents(phdr); - // We require the entry point to be the start of the text segment, - // so use a simple comparison -- if the entry point is elsewhere - // we'll complain about lack of text segment. - if (hdr.e_entry == phdr.p_vaddr) { - haveText = true; - if (!GetSegments().empty()) { - TT_THROW("{}: first loadable segment is not text", path_); - } + // We require the first segment to be text, and that the entry + // point is the start of that segment. 
+ if (GetSegments().empty() && hdr.e_entry != phdr.p_vaddr) { + TT_THROW("{}: first loadable segment is not text", path_); } // This word-size rounding up means the span can occupy some bytes // outside the range of the original span, but those bytes will // still be inside the span covering the whole file, so that's ok. - offset_t file_size = (phdr.p_filesz + sizeof(word_t) - 1) / sizeof(word_t); - offset_t mem_size = (phdr.p_memsz + sizeof(word_t) - 1) / sizeof(word_t); + offset_t file_words = (phdr.p_filesz + sizeof(word_t) - 1) / sizeof(word_t); + offset_t mem_bytes = (phdr.p_memsz + sizeof(word_t) - 1) & ~(sizeof(word_t) - 1); GetSegments().emplace_back( - std::span(reinterpret_cast(contents.data()), file_size), phdr.p_vaddr, mem_size - file_size); - } - if (!haveText) { - TT_THROW("{}: cannot find text segment", path_); + std::span(reinterpret_cast(contents.data()), file_words), + phdr.p_vaddr, + phdr.p_paddr, + mem_bytes); } // Check sections @@ -550,20 +547,22 @@ void ElfFile::Impl::XIPify() { unsigned kind = PCREL; switch (type) { - // Abs relocs to text will need fixing up case R_RISCV_LO12_I: - case R_RISCV_LO12_S: - if (!is_to_text) { + case R_RISCV_LO12_S: kind = ABS; [[fallthrough]]; + + case R_RISCV_PCREL_LO12_I: + case R_RISCV_PCREL_LO12_S: + if (kind == ABS && !is_to_text) { + // Abs relocs not to text do not need to be translated. break; } - kind = ABS; - [[fallthrough]]; - // PCrel relocs not to text will need fixing up. At - // this point we don't know the symbol from the LO12 - // relocs, as that points at the hi20 reloc. - case R_RISCV_PCREL_LO12_I: - case R_RISCV_PCREL_LO12_S: lo[kind].push_back(&reloc); break; + // PCrel relocs to text will not need translation, + // but at this point we don't know the symbol as + // these relocs point to the hi20 reloc. Record + // them all and filter later. + lo[kind].push_back(&reloc); + break; case R_RISCV_HI20: kind = ABS; [[fallthrough]]; @@ -573,9 +572,11 @@ void ElfFile::Impl::XIPify() { "{}: segment-crossing {} relocation found at {}", path_, r_names[kind][0], reloc.r_offset); } - if (!is_to_text && kind == ABS) { + if (kind == ABS && !is_to_text) { + // Abs relocs not to text do not need to be translated. break; } + composed[kind].emplace(reloc.r_offset, ComposedReloc(&reloc)); break; @@ -668,7 +669,8 @@ void ElfFile::Impl::XIPify() { unsigned sym_ix = ELF32_R_SYM(hi_reloc->r_info); auto const& symbol = symbols[sym_ix]; bool is_to_text = IsTextSymbol(symbol); - if (is_to_text == is_from_text) { + if (kind == PCREL && is_to_text == is_from_text) { + // intra-text PCREL is ok. continue; } diff --git a/tt_metal/llrt/tt_elffile.hpp b/tt_metal/llrt/tt_elffile.hpp index 7ea8c2b591e..d38d3c85245 100644 --- a/tt_metal/llrt/tt_elffile.hpp +++ b/tt_metal/llrt/tt_elffile.hpp @@ -27,12 +27,14 @@ class ElfFile { struct Segment { std::vector relocs; // 32-bit relocs to apply std::span contents; // Non-owning span - address_t address = 0; // byte address or 0 for XIP - offset_t bss = 0; // words of BSS + address_t address = 0; // Byte execution address (0 for + // XIP) + address_t lma = 0; // Byte load address + offset_t membytes = 0; // Byte size of memory image. 
public: - inline Segment(std::span contents, address_t addr, offset_t bss) : - contents(contents), address(addr), bss(bss) {} + inline Segment(std::span contents, address_t addr, address_t lma, offset_t membytes) : + contents(contents), address(addr), lma(lma), membytes(membytes) {} }; public: diff --git a/tt_metal/llrt/tt_memory.cpp b/tt_metal/llrt/tt_memory.cpp index e1455040846..2d2f6bb09b8 100644 --- a/tt_metal/llrt/tt_memory.cpp +++ b/tt_metal/llrt/tt_memory.cpp @@ -14,44 +14,60 @@ namespace ll_api { memory::memory() { + constexpr uint32_t initial_data_space_ = 0x400; + constexpr uint32_t initial_span_space_ = 4; + data_.reserve(initial_data_space_); link_spans_.reserve(initial_span_space_); - text_size_ = 0; - packed_size_ = 0; } -memory::memory(std::string const& path, Relocate relo_type) : memory() { +memory::memory(std::string const& path, Loading loading) : loading_(loading) { ElfFile elf; elf.ReadImage(path); - if (relo_type == Relocate::XIP) { + if (loading == Loading::CONTIGUOUS_XIP) { elf.MakeExecuteInPlace(); } - // The ELF file puts the text segment first, but memory wants - // ordered spans. - // FIXME: Perhaps we can relax that? - uint32_t total_size = 0; - auto emit_segment = [&](ElfFile::Segment const& segment) { - TT_ASSERT(segment.relocs.empty(), "Unexpected dynamic relocations"); - link_spans_.emplace_back(segment.address, segment.contents.size()); - data_.insert(data_.end(), segment.contents.begin(), segment.contents.end()); - total_size += segment.contents.size(); - }; - auto* text = &elf.GetSegments()[0]; - for (auto& segment : std::span(elf.GetSegments()).subspan(1)) { - if (text && segment.address > text->address) { - emit_segment(*text); - text = nullptr; - } - emit_segment(segment); + auto const& segments = elf.GetSegments(); + + // The ELF file puts the text segment first, but one set of + // binaries (ncrisc) places data a lower address, and at least one + // consumer (unknown) requires spans in address order, so generate + // a mapping table. + // TODO: Perhaps we can relax this? + std::vector map; + map.reserve(segments.size()); + for (unsigned ix = 0; ix != segments.size(); ix++) { + map.push_back(ix); } - if (text) { - emit_segment(*text); + if (loading == Loading::DISCRETE) { + std::sort( + map.begin(), map.end(), [&](unsigned a, unsigned b) { return segments[a].address < segments[b].address; }); } - set_text_size(elf.GetSegments()[0].contents.size() * sizeof(word_t)); - set_packed_size(total_size * sizeof(uint32_t)); + link_spans_.reserve(segments.size()); + text_addr_ = segments[0].address; + text_size_ = segments[0].contents.size() * sizeof(word_t); + auto lma = segments[0].lma; + + for (unsigned ix : map) { + auto const& segment = segments[map[ix]]; + if (not segment.relocs.empty()) { + TT_THROW("{}: unexpected dynamic relocations", path); + } + if (loading != Loading::DISCRETE) { + if (segment.lma != lma) { + TT_THROW("{}: inconsistent load addresses for packing", path); + } + lma += segment.contents.size() * sizeof(word_t); + } + if (loading == Loading::DISCRETE ? 
segment.contents.size() != 0 : link_spans_.empty()) { + link_spans_.emplace_back(segment.address, 0); + } + link_spans_.back().len += segment.contents.size(); + data_.insert(data_.end(), segment.contents.begin(), segment.contents.end()); + } } bool memory::operator==(const memory& other) const { return data_ == other.data_ && link_spans_ == other.link_spans_; } @@ -84,67 +100,4 @@ void memory::process_spans( } } -// Takes spans and merges the data to the text span -// Used for kernels (not firmware) -// Spans get packed for kernels so they can be loaded in one NOC transaction -// A symbol at the end of the text segment allows the FW to find the data segment to copy into place -void memory::pack_data_into_text(std::uint64_t text_start, std::uint64_t data_start) { - uint64_t text_end, data_end; - if (text_start > data_start) { - text_end = std::numeric_limits::max(); - data_end = text_start; - } else { - text_end = data_start; - data_end = std::numeric_limits::max(); - } - - TT_ASSERT(this->link_spans_.size() != 0); - - std::vector new_data; - new_data.resize(this->data_.size()); - struct span new_span; - size_t new_len = 0; - - bool first_text = true; - size_t offset = 0; - // Copy text spans. May start after data span (ncrisc) - // TODO: Ideally would be just 1, sometimes init doesn't merge w/ text and we get 2 - // TODO: (and init is just a jump to text and should be removed) - for (const auto& span : this->link_spans_) { - if (span.addr >= text_start && span.addr < text_end) { - if (first_text) { - new_span.addr = span.addr; - first_text = false; - } else if (span.addr > new_span.addr + new_len * sizeof(uint32_t)) { - uint64_t delta = span.addr - (new_span.addr + new_len * sizeof(uint32_t)); - delta /= sizeof(uint32_t); - // Pad the prior span - new_data.resize(new_data.size() + delta); - new_len += delta; - } - memcpy(&new_data[new_len], &this->data_[offset], span.len * sizeof(uint32_t)); - new_len += span.len; - } - - offset += span.len; - } - TT_ASSERT(!first_text); - - // Copy data spans. Should be just 1. 
May start before text span (ncrisc) - offset = 0; - for (const auto& span : this->link_spans_) { - if (span.addr >= data_start && span.addr < data_end) { - memcpy(&new_data[new_len], &this->data_[offset], span.len * sizeof(uint32_t)); - new_len += span.len; - } - offset += span.len; - } - - new_span.len = new_len; - this->link_spans_.resize(1); - this->link_spans_[0] = new_span; - this->data_ = new_data; - this->text_addr_ = new_span.addr; -} - } // namespace ll_api diff --git a/tt_metal/llrt/tt_memory.h b/tt_metal/llrt/tt_memory.h index ac25131db0d..fcbcd073515 100644 --- a/tt_metal/llrt/tt_memory.h +++ b/tt_metal/llrt/tt_memory.h @@ -13,67 +13,59 @@ namespace ll_api { class memory { - public: - typedef std::uint64_t address_t; - typedef std::uint32_t word_t; - enum class PackSpans { PACK, NO_PACK }; - enum class Relocate { XIP, NONE }; - - private: - static constexpr uint32_t initial_data_space_ = 0x400; - static constexpr uint32_t initial_span_space_ = 4; - - struct span { - // Note: the offset of the data for a span in data_ is generated on the - // fly by processing spans in order - address_t addr; // byte address in device memory - size_t len; - bool operator==(const span& other) const { return addr == other.addr && len == other.len; } - }; - - std::vector data_; - std::vector link_spans_; - uint32_t text_size_; - uint32_t packed_size_; - uint32_t text_addr_; - - public: - memory(); - memory(std::string const &path, Relocate relo_type); - - public: - // These can be large objects, so ban copying ... - memory(memory const&) = delete; - memory& operator=(memory const&) = delete; - // ... but permit moving. - memory(memory&&) = default; - memory& operator=(memory&&) = default; - - public: - const std::vector& data() const { return this->data_; } - - // memory& operator=(memory &&src); - bool operator==(const memory& other) const; - - void set_text_size(uint32_t size) { this->text_size_ = size; } - void set_packed_size(uint32_t size) { this->packed_size_ = size; } - uint32_t get_text_size() const { return this->text_size_; } - uint32_t get_packed_size() const { return this->packed_size_; } - uint32_t get_text_addr() const { return this->text_addr_; } - - size_t size() const { return data_.size(); } - - size_t num_spans() const { return link_spans_.size(); } +public: + typedef std::uint64_t address_t; + typedef std::uint32_t word_t; + enum class Loading : std::uint8_t { DISCRETE, CONTIGUOUS, CONTIGUOUS_XIP }; + +private: + struct span { + // Note: the offset of the data for a span in data_ is generated on the + // fly by processing spans in order + address_t addr; // byte address in device memory + size_t len; + bool operator==(const span& other) const { return addr == other.addr && len == other.len; } + }; + + std::vector data_; + std::vector link_spans_; + uint32_t text_size_ = 0; + uint32_t text_addr_ = 0; + Loading loading_; + +public: + memory(); + memory(std::string const &path, Loading loading); + +public: + // These can be large objects, so ban copying ... + memory(memory const&) = delete; + memory& operator=(memory const&) = delete; + // ... but permit moving. 
+ memory(memory&&) = default; + memory& operator=(memory&&) = default; public: - // Process spans in arg mem to fill data in *this (eg, from device) - void fill_from_mem_template(const memory& mem_template, const std::function::iterator, uint64_t addr, uint32_t len)>& callback); + const std::vector& data() const { return this->data_; } - // Iterate over spans_ to act on data_ (eg., to device) - void process_spans(const std::function::const_iterator, uint64_t addr, uint32_t len)>& callback) const; - void process_spans(const std::function::iterator, uint64_t addr, uint32_t len)>& callback); + bool operator==(const memory& other) const; + Loading get_loading() const {return loading_;} + + uint32_t get_text_size() const { return this->text_size_; } + uint32_t get_packed_size() const { return data_.size() * sizeof(word_t); } + uint32_t get_text_addr() const { return this->text_addr_; } + + size_t size() const { return data_.size(); } + + size_t num_spans() const { return link_spans_.size(); } + +public: + // Process spans in arg mem to fill data in *this (eg, from device) + void fill_from_mem_template(const memory& mem_template, const std::function::iterator, uint64_t addr, uint32_t len)>& callback); - void pack_data_into_text(std::uint64_t text_start, std::uint64_t data_start); + // Iterate over spans_ to act on data_ (eg., to device) + void process_spans(const std::function::const_iterator, uint64_t addr, uint32_t len)>& callback) const; + void process_spans(const std::function::iterator, uint64_t addr, uint32_t len)>& callback); }; } // namespace ll_api
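
Reviewer note, not part of the patch: the sketch below shows how a caller is
expected to drive the redesigned API. It is a hypothetical, standalone example;
the kernel path is invented, the include line is approximate, the function name
is made up, and the namespace spelling follows the call sites above
(llrt::get_risc_binary, ll_api::memory). The single Loading argument replaces
the old PackSpans/Relocate pair plus the core/processor indices: DISCRETE keeps
one span per segment at its link address, CONTIGUOUS packs text and data into a
single span, and CONTIGUOUS_XIP additionally rewrites the text for
execute-in-place. The load address that pack_data_into_text() used to derive
from the HAL now comes from the ELF program headers (Segment::lma).

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    #include "llrt.hpp"  // approximate include for llrt::get_risc_binary / ll_api::memory

    // Fetch a cached kernel image and walk its spans, as a loader would.
    void show_kernel_image() {
        // Old: get_risc_binary(path, core_type, class, type, PackSpans::PACK, Relocate::XIP)
        // New: the loading mode alone describes the image layout.
        ll_api::memory const& bin = llrt::get_risc_binary(
            "/tmp/example/brisc/brisc.elf",  // hypothetical path
            ll_api::memory::Loading::CONTIGUOUS_XIP);

        std::printf("text %u bytes @ 0x%x, packed %u bytes, %zu span(s)\n",
                    bin.get_text_size(), bin.get_text_addr(),
                    bin.get_packed_size(), bin.num_spans());

        // Each span is a run of 32-bit words destined for one device address;
        // with CONTIGUOUS/CONTIGUOUS_XIP there is exactly one span, so the
        // whole binary can go out in a single transaction.
        bin.process_spans([](std::vector<uint32_t>::const_iterator data,
                             uint64_t addr, uint32_t len_words) {
            std::printf("  span: %u words -> device address 0x%llx\n",
                        len_words, static_cast<unsigned long long>(addr));
            (void)data;  // a real loader copies len_words words from data to addr
        });
    }

A second call with the same path returns the same cached ll_api::memory object;
the cache now keys purely on the path, and get_loading() lets the cache assert
that all callers agree on the loading mode for a given binary.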