diff --git a/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp b/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp index abe18295c73..3b5b1bab7ac 100644 --- a/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp +++ b/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp @@ -227,10 +227,10 @@ bool send_over_eth( // TODO: this should be updated to use kernel api uint32_t active_eth_index = hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH); - ll_api::memory const& binary_mem_send = - llrt::get_risc_binary(sender_device->build_firmware_target_path(active_eth_index, 0, 0)); - ll_api::memory const& binary_mem_receive = - llrt::get_risc_binary(receiver_device->build_firmware_target_path(active_eth_index, 0, 0)); + const ll_api::memory& binary_mem_send = llrt::get_risc_binary( + sender_device->build_firmware_target_path(active_eth_index, 0, 0), active_eth_index, 0, 0); + const ll_api::memory& binary_mem_receive = llrt::get_risc_binary( + receiver_device->build_firmware_target_path(active_eth_index, 0, 0), active_eth_index, 0, 0); for (const auto& eth_core : eth_cores) { llrt::write_hex_vec_to_core( diff --git a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp index a5d0bae9121..29507c4b9f3 100644 --- a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp +++ b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp @@ -101,169 +101,5 @@ void construct_program(Program& program, Device* device, CoreCoord& core) { int main(int argc, char** argv) { bool pass = true; - - try { - //////////////////////////////////////////////////////////////////////////// - // Device Setup - //////////////////////////////////////////////////////////////////////////// - CoreCoord core = {0, 0}; - int num_devices = tt::tt_metal::GetNumAvailableDevices(); - std::vector ids; - for (unsigned int id = 0; id < num_devices; id++) { - ids.push_back(id); - } - tt::DevicePool::initialize(ids, 1, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, DispatchCoreConfig{}); - auto devices = tt::DevicePool::instance().get_all_active_devices(); - std::vector programs; - // kernel->binaries() returns 32B aligned binaries - std::map> compute_binaries; - std::map> brisc_binaries; - std::map> ncrisc_binaries; - - for (int i = 0; i < num_devices; i++) { - auto device = devices[i]; - - //////////////////////////////////////////////////////////////////////////// - // Application Setup - //////////////////////////////////////////////////////////////////////////// - programs.push_back(Program()); - Program& program = programs.back(); - - construct_program(program, device, core); - - //////////////////////////////////////////////////////////////////////////// - // Compile Application - //////////////////////////////////////////////////////////////////////////// - // Check that binary memory objects in the kernel match the ones obtained from the persistent cache - uint32_t programmable_core_index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); - const KernelGroup* kernel_group = program.kernels_on_core(core, programmable_core_index); - TT_FATAL( - kernel_group != nullptr && kernel_group->kernel_ids[DISPATCH_CLASS_TENSIX_COMPUTE].has_value() and - kernel_group->kernel_ids[DISPATCH_CLASS_TENSIX_DM0].has_value() and - kernel_group->kernel_ids[DISPATCH_CLASS_TENSIX_DM1].has_value(), - "Error"); - auto compute_kernel = - tt_metal::detail::GetKernel(program, kernel_group->kernel_ids[DISPATCH_CLASS_TENSIX_COMPUTE].value()); - auto riscv0_kernel = - tt_metal::detail::GetKernel(program, kernel_group->kernel_ids[DISPATCH_CLASS_TENSIX_DM0].value()); - auto riscv1_kernel = - tt_metal::detail::GetKernel(program, kernel_group->kernel_ids[DISPATCH_CLASS_TENSIX_DM1].value()); - - // Run iteration to get golden - uint32_t mask = device->build_key(); - tt_metal::detail::CompileProgram(device, program); - compute_binaries.insert({mask, compute_kernel->binaries(mask)}); - TT_FATAL(compute_binaries.at(mask).size() == 3, "Expected 3 Compute binaries!"); - brisc_binaries.insert({mask, riscv0_kernel->binaries(mask)}); - TT_FATAL(brisc_binaries.at(mask).size() == 1, "Expected 1 BRISC binary!"); - ncrisc_binaries.insert({mask, riscv1_kernel->binaries(mask)}); - TT_FATAL(ncrisc_binaries.at(mask).size() == 1, "Expected 1 NCRISC binary!"); - } - - int num_compiles = 3; - for (int i = 0; i < 3; i++) { - std::vector kernel_names = {"reader_unary_push_4", "writer_unary", "eltwise_copy_3m"}; - for (int i = 0; i < num_devices; i++) { - for (const auto& kernel_name : kernel_names) { - std::filesystem::remove_all(devices[i]->build_env().get_out_kernel_root_path() + kernel_name); - } - } - tt_metal::detail::ClearKernelCache(); - std::vector new_programs; - for (int i = 0; i < num_devices; i++) { - auto& device = devices[i]; - new_programs.push_back(Program()); - Program& program = new_programs.back(); - construct_program(program, device, core); - } - - std::vector ths; - ths.reserve(num_devices); - uint32_t dm_class_idx = magic_enum::enum_integer(HalProcessorClassType::DM); - uint32_t compute_class_idx = magic_enum::enum_integer(HalProcessorClassType::COMPUTE); - for (int i = 0; i < num_devices; i++) { - auto& device = devices[i]; - auto& program = new_programs[i]; - ths.emplace_back([&] { - for (int j = 0; j < num_compiles; j++) { - uint32_t mask = device->build_key(); - tt_metal::detail::CompileProgram(device, program); - uint32_t programmable_core_index = - hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); - const KernelGroup* kernel_group = program.kernels_on_core(core, programmable_core_index); - auto compute_kernel = tt_metal::detail::GetKernel( - program, kernel_group->kernel_ids[DISPATCH_CLASS_TENSIX_COMPUTE].value()); - auto riscv0_kernel = tt_metal::detail::GetKernel( - program, kernel_group->kernel_ids[DISPATCH_CLASS_TENSIX_DM0].value()); - auto riscv1_kernel = tt_metal::detail::GetKernel( - program, kernel_group->kernel_ids[DISPATCH_CLASS_TENSIX_DM1].value()); - TT_FATAL(compute_kernel->binaries(mask) == compute_binaries.at(mask), "Error"); - TT_FATAL(riscv0_kernel->binaries(mask) == brisc_binaries.at(mask), "Error"); - TT_FATAL(riscv1_kernel->binaries(mask) == ncrisc_binaries.at(mask), "Error"); - - std::string brisc_hex_path = device->build_kernel_target_path( - programmable_core_index, - dm_class_idx, - 0, - get_latest_kernel_binary_path(device->build_env().get_out_kernel_root_path(), riscv0_kernel)); - ll_api::memory const& brisc_binary = - llrt::get_risc_binary(brisc_hex_path, ll_api::memory::Loading::CONTIGUOUS_XIP); - TT_FATAL( - brisc_binary == *brisc_binaries.at(mask).at(0), - "Expected saved BRISC binary to be the same as binary in persistent cache"); - std::string ncrisc_hex_path = device->build_kernel_target_path( - programmable_core_index, - dm_class_idx, - 1, - get_latest_kernel_binary_path(device->build_env().get_out_kernel_root_path(), riscv1_kernel)); - auto load_type = - (device->arch() == tt::ARCH::GRAYSKULL || device->arch() == tt::ARCH::WORMHOLE_B0) - ? ll_api::memory::Loading::CONTIGUOUS - : ll_api::memory::Loading::CONTIGUOUS_XIP; - ll_api::memory const& ncrisc_binary = llrt::get_risc_binary(ncrisc_hex_path, load_type); - TT_FATAL( - ncrisc_binary == *ncrisc_binaries.at(mask).at(0), - "Expected saved NCRISC binary to be the same as binary in persistent cache"); - for (int trisc_id = 0; trisc_id <= 2; trisc_id++) { - std::string trisc_id_str = std::to_string(trisc_id); - std::string trisc_hex_path = device->build_kernel_target_path( - programmable_core_index, - compute_class_idx, - trisc_id, - get_latest_kernel_binary_path(device->build_env().get_out_kernel_root_path(), compute_kernel)); - ll_api::memory const& trisc_binary = - llrt::get_risc_binary(trisc_hex_path, ll_api::memory::Loading::CONTIGUOUS_XIP); - TT_FATAL( - trisc_binary == *compute_binaries.at(mask).at(trisc_id), - "Expected saved TRISC binary for {} to be the same as binary in persistent cache", - trisc_id_str); - } - } - }); - } - for (auto& th : ths) { - th.join(); - } - } - for (auto dev : devices) { - pass &= tt_metal::CloseDevice(dev); - } - - } catch (const std::exception& e) { - pass = false; - // Capture the exception error message - log_error(LogTest, "{}", e.what()); - // Capture system call errors that may have returned from driver/kernel - log_error(LogTest, "System error message: {}", std::strerror(errno)); - } - - if (pass) { - log_info(LogTest, "Test Passed"); - } else { - TT_THROW("Test Failed"); - } - - TT_FATAL(pass, "Error"); - return 0; } diff --git a/tt_metal/hw/toolchain/sections.ld b/tt_metal/hw/toolchain/sections.ld index 2f0e0544e5f..1abe5276410 100644 --- a/tt_metal/hw/toolchain/sections.ld +++ b/tt_metal/hw/toolchain/sections.ld @@ -65,7 +65,6 @@ SECTIONS *(.text .stub .text.* .gnu.linkonce.t.*) /* .gnu.warning sections are handled specially by elf32.em. */ *(.gnu.warning) - . = ALIGN(4); } > REGION_CODE :text .init.fini : { @@ -74,30 +73,23 @@ SECTIONS ASSERT(SIZEOF(.init.fini) == 0, ".init/.fini sections have contents"); } > REGION_CODE :text -#if defined(TYPE_KERNEL) - __kernel_data_lma = .; -#endif - . = ALIGN(ABSOLUTE(.) + MEM_PAD, MEM_ALIGN); + . = ALIGN(. + MEM_PAD, MEM_ALIGN); #if defined(TYPE_FIRMWARE) - __fw_export_end_text = ABSOLUTE(.); + __fw_export_end_text = .; #if defined(TARGET_NCRISC) - PROVIDE (KERNEL_ENTRY_SYMBOL = ABSOLUTE(__fw_export_end_text)); + PROVIDE (KERNEL_ENTRY_SYMBOL = __fw_export_end_text); #endif #endif #if defined(TYPE_KERNEL) - __kernel_init_local_l1_base = ABSOLUTE(.); + __kernel_init_local_l1_base = .; #endif #if defined(TYPE_FIRMWARE) PROVIDE(__global_pointer$ = ORIGIN(REGION_DATA) + 0x7f0); #endif - .data DATA_START : -#if defined (TYPE_KERNEL) - AT(__kernel_data_lma) -#endif - ALIGN(4) + .data DATA_START : ALIGN(4) { . = .; /* Force section emission. */ __ldm_data_start = .; @@ -149,8 +141,8 @@ SECTIONS } > REGION_DATA :data #ifdef TYPE_FIRMWARE - . = ALIGN(ABSOLUTE(.), MEM_ALIGN); - __fw_export_ldm_end = ABSOLUTE(.); + . = ALIGN(MEM_ALIGN); + __fw_export_ldm_end = .; #endif #ifdef TYPE_FIRMWARE diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index 81597324722..7a708b760d4 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -438,7 +438,10 @@ void Device::initialize_firmware(const HalProgrammableCoreType &core_type, CoreC auto [build_idx, num_build_states] = this->build_processor_type_to_index(core_type_idx, processor_class); for (uint32_t riscv_id = build_idx; riscv_id < (build_idx + num_build_states); riscv_id++) { ll_api::memory const& binary_mem = llrt::get_risc_binary( - firmware_build_states_[riscv_id]->get_target_out_path("")); + firmware_build_states_[riscv_id]->get_target_out_path(""), + core_type_idx, + processor_class, + (riscv_id - build_idx)); uint32_t fw_size = binary_mem.get_text_size(); if (riscv_id == 1) { // TODO: clean up how brisc/ncrisc are handled // In this context, ncrisc_kernel_size16 is the size of the fw @@ -482,7 +485,10 @@ void Device::initialize_firmware(const HalProgrammableCoreType &core_type, CoreC auto [build_idx, num_build_states] = this->build_processor_type_to_index(core_type_idx, processor_class); for (uint32_t eriscv_id = build_idx; eriscv_id < (build_idx + num_build_states); eriscv_id++) { ll_api::memory const& binary_mem = llrt::get_risc_binary( - firmware_build_states_[eriscv_id]->get_target_out_path("")); + firmware_build_states_[eriscv_id]->get_target_out_path(""), + core_type_idx, + processor_class, + (eriscv_id - build_idx)); uint32_t fw_size = binary_mem.get_text_size(); log_debug(LogDevice, "ERISC fw binary size: {} in bytes", fw_size); llrt::test_load_write_read_risc_binary(binary_mem, this->id(), virtual_core, core_type_idx, processor_class, (eriscv_id - build_idx)); diff --git a/tt_metal/impl/kernels/kernel.cpp b/tt_metal/impl/kernels/kernel.cpp index b6cecf023d0..a3f67470d21 100644 --- a/tt_metal/impl/kernels/kernel.cpp +++ b/tt_metal/impl/kernels/kernel.cpp @@ -378,12 +378,17 @@ void DataMovementKernel::read_binaries(Device *device) { int riscv_id = static_cast::type>(this->config_.processor); const JitBuildState &build_state = device->build_kernel_state(tensix_core_type, dm_class_idx, riscv_id); // TODO: from HAL - auto load_type = + ll_api::memory::Relocate relo_type = (riscv_id == 1 && (device->arch() == tt::ARCH::GRAYSKULL || device->arch() == tt::ARCH::WORMHOLE_B0)) ? - ll_api::memory::Loading::CONTIGUOUS : ll_api::memory::Loading::CONTIGUOUS_XIP; + ll_api::memory::Relocate::NONE : ll_api::memory::Relocate::XIP; ll_api::memory const& binary_mem = llrt::get_risc_binary( build_state.get_target_out_path(this->kernel_full_name_), - load_type); + // processor class is BRISC/NCRISC and each have one data movement processor type + tensix_core_type, + riscv_id, + dm_class_idx, + ll_api::memory::PackSpans::PACK, + relo_type); binaries.push_back(&binary_mem); uint32_t binary_size = binary_mem.get_packed_size(); log_debug(LogLoader, "RISC {} kernel binary size: {} in bytes", riscv_id, binary_size); @@ -400,11 +405,15 @@ void EthernetKernel::read_binaries(Device *device) { const JitBuildState &build_state = device->build_kernel_state(erisc_core_type, dm_class_idx, erisc_id); int risc_id = erisc_id + (this->config_.eth_mode == Eth::IDLE ? 6 : 5); // TODO (abhullar): clean this up when llrt helpers use HAL // TODO: fix when active eth supports relo - auto load_type = (this->config_.eth_mode == Eth::IDLE) ? - ll_api::memory::Loading::CONTIGUOUS_XIP : ll_api::memory::Loading::DISCRETE; + ll_api::memory::Relocate relo_type = (this->config_.eth_mode == Eth::IDLE) ? + ll_api::memory::Relocate::XIP : ll_api::memory::Relocate::NONE; ll_api::memory const& binary_mem = llrt::get_risc_binary( build_state.get_target_out_path(this->kernel_full_name_), - load_type); + erisc_core_type, + erisc_id, + dm_class_idx, + ll_api::memory::PackSpans::PACK, + relo_type); binaries.push_back(&binary_mem); uint32_t binary_size = binary_mem.get_packed_size(); log_debug(LogLoader, "ERISC {} kernel binary size: {} in bytes", erisc_id, binary_size); @@ -420,7 +429,11 @@ void ComputeKernel::read_binaries(Device *device) { const JitBuildState &build_state = device->build_kernel_state(tensix_core_type, compute_class_idx, trisc_id); ll_api::memory const& binary_mem = llrt::get_risc_binary( build_state.get_target_out_path(this->kernel_full_name_), - ll_api::memory::Loading::CONTIGUOUS_XIP); + tensix_core_type, + compute_class_idx, + trisc_id, + ll_api::memory::PackSpans::PACK, + ll_api::memory::Relocate::XIP); binaries.push_back(&binary_mem); uint32_t binary_size = binary_mem.get_packed_size(); log_debug(LogLoader, "RISC {} kernel binary size: {} in bytes", trisc_id + 2, binary_size); diff --git a/tt_metal/llrt/llrt.cpp b/tt_metal/llrt/llrt.cpp index f180b4a43ab..71c22599113 100644 --- a/tt_metal/llrt/llrt.cpp +++ b/tt_metal/llrt/llrt.cpp @@ -47,20 +47,33 @@ using std::uint64_t; ll_api::memory const& get_risc_binary( string const& path, - ll_api::memory::Loading loading) { + uint32_t core_type_idx, + uint32_t processor_class_idx, + uint32_t processor_type_idx, + ll_api::memory::PackSpans span_type, + ll_api::memory::Relocate relo_type) { static struct { - std::unordered_map> map; + std::unordered_map> map; std::mutex mutex; std::condition_variable cvar; } cache; std::unique_lock lock(cache.mutex); auto [slot, inserted] = cache.map.try_emplace(path); - ll_api::memory const* ptr = nullptr; if (inserted) { // We're the first with PATH. Create and insert. lock.unlock(); - ptr = new ll_api::memory(path, loading); + auto *ptr = new ll_api::memory(path, relo_type); + + // TODO: pass pack_spans into reader, generate text/data sizes + // from segment sizes and pack there + if (span_type == ll_api::memory::PackSpans::PACK) { + uint64_t data_start = tt::tt_metal::hal.get_dev_addr(tt::tt_metal::HalProgrammableCoreType::TENSIX, tt::tt_metal::HalL1MemAddrType::LOCAL); + uint64_t text_start = (relo_type == ll_api::memory::Relocate::XIP) ? + 0 : + tt::tt_metal::hal.get_base_firmware_addr(core_type_idx, processor_class_idx, processor_type_idx); + ptr->pack_data_into_text(text_start, data_start); + } lock.lock(); // maps have iterator stability, so SLOT is still valid. @@ -68,16 +81,12 @@ ll_api::memory const& get_risc_binary( // We can't wake just those waiting on this slot, so wake them // all. Should be a rare event anyway. cache.cvar.notify_all(); - } else { - if (!slot->second) { - // Someone else is creating the initial entry, wait for them. - cache.cvar.wait(lock, [=] { return bool(slot->second); }); - } - ptr = slot->second.get(); - TT_ASSERT(ptr->get_loading() == loading); + } else if (!slot->second) { + // Someone else is creating the initial entry, wait for them. + cache.cvar.wait(lock, [=] { return bool(slot->second); }); } - return *ptr; + return *slot->second.get(); } // CoreCoord core --> NOC coordinates ("functional workers" from the SOC descriptor) diff --git a/tt_metal/llrt/llrt.hpp b/tt_metal/llrt/llrt.hpp index 60ceeadd004..0bf814e5869 100644 --- a/tt_metal/llrt/llrt.hpp +++ b/tt_metal/llrt/llrt.hpp @@ -49,10 +49,17 @@ using WorkerCore = tt_cxy_pair; using WorkerCores = std::vector; // Return a reference to a potentially shared binary image. -// The images are cached by path name. +// The images are cached by path name, which is never erased. +// TODO: Remove core_type_idx, processor_class_idx, +// processor_type_idx -- the information they provide can be +// obtained directly from the binary image. ll_api::memory const& get_risc_binary( string const& path, - ll_api::memory::Loading loading = ll_api::memory::Loading::DISCRETE); + uint32_t core_type_idx, + uint32_t processor_class_idx, + uint32_t processor_type_idx, + ll_api::memory::PackSpans span_type = ll_api::memory::PackSpans::NO_PACK, + ll_api::memory::Relocate relo_type = ll_api::memory::Relocate::NONE); // TODO: try using "stop" method from device instead, it's the proper way of asserting reset diff --git a/tt_metal/llrt/tt_elffile.cpp b/tt_metal/llrt/tt_elffile.cpp index f46ce1d2990..c4b2cacaa3e 100644 --- a/tt_metal/llrt/tt_elffile.cpp +++ b/tt_metal/llrt/tt_elffile.cpp @@ -112,7 +112,8 @@ class ElfFile::Impl { // advantage of the (a) fact that sections cannot straddle // segment boundaries -- they're either wholey inside or // wholey outside, and (b) unsigned arithmetic. - return shdr.sh_flags & SHF_ALLOC && shdr.sh_addr + shdr.sh_size - segment.address <= segment.membytes; + return shdr.sh_flags & SHF_ALLOC && shdr.sh_addr + shdr.sh_size - segment.address <= + (segment.contents.size() + segment.bss) * sizeof(word_t); } [[nodiscard]] bool IsInSegment(unsigned _ix, Elf32_Shdr const& shdr) const { return IsInSegment(GetSegments()[_ix], shdr); @@ -242,13 +243,12 @@ void ElfFile::Impl::LoadImage() { } GetSegments().reserve(hdr.e_phnum); - bool haveStack = false; + bool haveText = false, haveStack = false; for (auto const& phdr : GetPhdrs()) { if (phdr.p_type == PT_RISCV_ATTRIBUTES) { // TODO: verify Arch is ok? continue; } - if (phdr.p_type == PT_GNU_STACK) { haveStack = true; } else if (phdr.p_type != PT_LOAD) { @@ -268,35 +268,38 @@ void ElfFile::Impl::LoadImage() { phdr.p_offset); // Require loadable segments to be nicely aligned - if ((phdr.p_offset | phdr.p_vaddr | phdr.p_paddr) & (sizeof(word_t) - 1)) { + if ((phdr.p_offset | phdr.p_vaddr) & (sizeof(word_t) - 1)) { TT_THROW( - "{}: loadable segment {} is misaligned, [{}({}),+{}/{})@{}", + "{}: loadable segment {} is misaligned, [{},+{}/{})@{}", path_, unsigned(GetSegments().size()), phdr.p_vaddr, - phdr.p_paddr, phdr.p_filesz, phdr.p_memsz, phdr.p_offset); } auto contents = GetContents(phdr); - // We require the first segment to be text, and that the entry - // point is the start of that segment. - if (GetSegments().empty() && hdr.e_entry != phdr.p_vaddr) { - TT_THROW("{}: first loadable segment is not text", path_); + // We require the entry point to be the start of the text segment, + // so use a simple comparison -- if the entry point is elsewhere + // we'll complain about lack of text segment. + if (hdr.e_entry == phdr.p_vaddr) { + haveText = true; + if (!GetSegments().empty()) { + TT_THROW("{}: first loadable segment is not text", path_); + } } // This word-size rounding up means the span can occupy some bytes // outside the range of the original span, but those bytes will // still be inside the span covering the whole file, so that's ok. - offset_t file_words = (phdr.p_filesz + sizeof(word_t) - 1) / sizeof(word_t); - offset_t mem_bytes = (phdr.p_memsz + sizeof(word_t) - 1) & ~(sizeof(word_t) - 1); + offset_t file_size = (phdr.p_filesz + sizeof(word_t) - 1) / sizeof(word_t); + offset_t mem_size = (phdr.p_memsz + sizeof(word_t) - 1) / sizeof(word_t); GetSegments().emplace_back( - std::span(reinterpret_cast(contents.data()), file_words), - phdr.p_vaddr, - phdr.p_paddr, - mem_bytes); + std::span(reinterpret_cast(contents.data()), file_size), phdr.p_vaddr, mem_size - file_size); + } + if (!haveText) { + TT_THROW("{}: cannot find text segment", path_); } // Check sections @@ -547,22 +550,20 @@ void ElfFile::Impl::XIPify() { unsigned kind = PCREL; switch (type) { + // Abs relocs to text will need fixing up case R_RISCV_LO12_I: - case R_RISCV_LO12_S: kind = ABS; [[fallthrough]]; - - case R_RISCV_PCREL_LO12_I: - case R_RISCV_PCREL_LO12_S: - if (kind == ABS && !is_to_text) { - // Abs relocs not to text do not need to be translated. + case R_RISCV_LO12_S: + if (!is_to_text) { break; } + kind = ABS; + [[fallthrough]]; - // PCrel relocs to text will not need translation, - // but at this point we don't know the symbol as - // these relocs point to the hi20 reloc. Record - // them all and filter later. - lo[kind].push_back(&reloc); - break; + // PCrel relocs not to text will need fixing up. At + // this point we don't know the symbol from the LO12 + // relocs, as that points at the hi20 reloc. + case R_RISCV_PCREL_LO12_I: + case R_RISCV_PCREL_LO12_S: lo[kind].push_back(&reloc); break; case R_RISCV_HI20: kind = ABS; [[fallthrough]]; @@ -572,11 +573,9 @@ void ElfFile::Impl::XIPify() { "{}: segment-crossing {} relocation found at {}", path_, r_names[kind][0], reloc.r_offset); } - if (kind == ABS && !is_to_text) { - // Abs relocs not to text do not need to be translated. + if (!is_to_text && kind == ABS) { break; } - composed[kind].emplace(reloc.r_offset, ComposedReloc(&reloc)); break; @@ -669,8 +668,7 @@ void ElfFile::Impl::XIPify() { unsigned sym_ix = ELF32_R_SYM(hi_reloc->r_info); auto const& symbol = symbols[sym_ix]; bool is_to_text = IsTextSymbol(symbol); - if (kind == PCREL && is_to_text == is_from_text) { - // intra-text PCREL is ok. + if (is_to_text == is_from_text) { continue; } diff --git a/tt_metal/llrt/tt_elffile.hpp b/tt_metal/llrt/tt_elffile.hpp index d38d3c85245..41f9b5248ce 100644 --- a/tt_metal/llrt/tt_elffile.hpp +++ b/tt_metal/llrt/tt_elffile.hpp @@ -27,14 +27,12 @@ class ElfFile { struct Segment { std::vector relocs; // 32-bit relocs to apply std::span contents; // Non-owning span - address_t address = 0; // Byte execution address (0 for - // XIP) - address_t lma = 0; // Byte load address - offset_t membytes = 0; // Byte size of memory image. + address_t address = 0; // byte address or 0 for XIP + offset_t bss = 0; // words of BSS public: - inline Segment(std::span contents, address_t addr, address_t lma, offset_t membytes) : - contents(contents), address(addr), lma(lma), membytes(membytes) {} + inline Segment(std::span contents, address_t addr, offset_t bss) : + contents(contents), address(addr), bss(bss) {} }; public: diff --git a/tt_metal/llrt/tt_memory.cpp b/tt_metal/llrt/tt_memory.cpp index 2d2f6bb09b8..aa4ff4e3261 100644 --- a/tt_metal/llrt/tt_memory.cpp +++ b/tt_metal/llrt/tt_memory.cpp @@ -14,60 +14,44 @@ namespace ll_api { memory::memory() { - constexpr uint32_t initial_data_space_ = 0x400; - constexpr uint32_t initial_span_space_ = 4; - data_.reserve(initial_data_space_); link_spans_.reserve(initial_span_space_); + text_size_ = 0; + packed_size_ = 0; } -memory::memory(std::string const& path, Loading loading) : loading_(loading) { +memory::memory(const std::string& path, Relocate relo_type) : memory() { ElfFile elf; elf.ReadImage(path); - if (loading == Loading::CONTIGUOUS_XIP) { + if (relo_type == Relocate::XIP) { elf.MakeExecuteInPlace(); } - auto const& segments = elf.GetSegments(); - - // The ELF file puts the text segment first, but one set of - // binaries (ncrisc) places data a lower address, and at least one - // consumer (unknown) requires spans in address order, so generate - // a mapping table. - // TODO: Perhaps we can relax this? - std::vector map; - map.reserve(segments.size()); - for (unsigned ix = 0; ix != segments.size(); ix++) { - map.push_back(ix); + // The ELF file puts the text segment first, but memory wants + // ordered spans. + // FIXME: Perhaps we can relax that? + uint32_t total_size = 0; + auto emit_segment = [&](const ElfFile::Segment& segment) { + TT_ASSERT(segment.relocs.empty(), "Unexpected dynamic relocations"); + link_spans_.emplace_back(segment.address, segment.contents.size()); + data_.insert(data_.end(), segment.contents.begin(), segment.contents.end()); + total_size += segment.contents.size(); + }; + auto* text = &elf.GetSegments()[0]; + for (auto& segment : std::span(elf.GetSegments()).subspan(1)) { + if (text && segment.address > text->address) { + emit_segment(*text); + text = nullptr; + } + emit_segment(segment); } - if (loading == Loading::DISCRETE) { - std::sort( - map.begin(), map.end(), [&](unsigned a, unsigned b) { return segments[a].address < segments[b].address; }); + if (text) { + emit_segment(*text); } - link_spans_.reserve(segments.size()); - text_addr_ = segments[0].address; - text_size_ = segments[0].contents.size() * sizeof(word_t); - auto lma = segments[0].lma; - - for (unsigned ix : map) { - auto const& segment = segments[map[ix]]; - if (not segment.relocs.empty()) { - TT_THROW("{}: unexpected dynamic relocations", path); - } - if (loading != Loading::DISCRETE) { - if (segment.lma != lma) { - TT_THROW("{}: inconsistent load addresses for packing", path); - } - lma += segment.contents.size() * sizeof(word_t); - } - if (loading == Loading::DISCRETE ? segment.contents.size() != 0 : link_spans_.empty()) { - link_spans_.emplace_back(segment.address, 0); - } - link_spans_.back().len += segment.contents.size(); - data_.insert(data_.end(), segment.contents.begin(), segment.contents.end()); - } + set_text_size(elf.GetSegments()[0].contents.size() * sizeof(word_t)); + set_packed_size(total_size * sizeof(uint32_t)); } bool memory::operator==(const memory& other) const { return data_ == other.data_ && link_spans_ == other.link_spans_; } @@ -100,4 +84,67 @@ void memory::process_spans( } } +// Takes spans and merges the data to the text span +// Used for kernels (not firmware) +// Spans get packed for kernels so they can be loaded in one NOC transaction +// A symbol at the end of the text segment allows the FW to find the data segment to copy into place +void memory::pack_data_into_text(std::uint64_t text_start, std::uint64_t data_start) { + uint64_t text_end, data_end; + if (text_start > data_start) { + text_end = std::numeric_limits::max(); + data_end = text_start; + } else { + text_end = data_start; + data_end = std::numeric_limits::max(); + } + + TT_ASSERT(this->link_spans_.size() != 0); + + std::vector new_data; + new_data.resize(this->data_.size()); + struct span new_span; + size_t new_len = 0; + + bool first_text = true; + size_t offset = 0; + // Copy text spans. May start after data span (ncrisc) + // TODO: Ideally would be just 1, sometimes init doesn't merge w/ text and we get 2 + // TODO: (and init is just a jump to text and should be removed) + for (const auto& span : this->link_spans_) { + if (span.addr >= text_start && span.addr < text_end) { + if (first_text) { + new_span.addr = span.addr; + first_text = false; + } else if (span.addr > new_span.addr + new_len * sizeof(uint32_t)) { + uint64_t delta = span.addr - (new_span.addr + new_len * sizeof(uint32_t)); + delta /= sizeof(uint32_t); + // Pad the prior span + new_data.resize(new_data.size() + delta); + new_len += delta; + } + memcpy(&new_data[new_len], &this->data_[offset], span.len * sizeof(uint32_t)); + new_len += span.len; + } + + offset += span.len; + } + TT_ASSERT(!first_text); + + // Copy data spans. Should be just 1. May start before text span (ncrisc) + offset = 0; + for (const auto& span : this->link_spans_) { + if (span.addr >= data_start && span.addr < data_end) { + memcpy(&new_data[new_len], &this->data_[offset], span.len * sizeof(uint32_t)); + new_len += span.len; + } + offset += span.len; + } + + new_span.len = new_len; + this->link_spans_.resize(1); + this->link_spans_[0] = new_span; + this->data_ = new_data; + this->text_addr_ = new_span.addr; +} + } // namespace ll_api diff --git a/tt_metal/llrt/tt_memory.h b/tt_metal/llrt/tt_memory.h index fcbcd073515..ac25131db0d 100644 --- a/tt_metal/llrt/tt_memory.h +++ b/tt_metal/llrt/tt_memory.h @@ -13,59 +13,67 @@ namespace ll_api { class memory { -public: - typedef std::uint64_t address_t; - typedef std::uint32_t word_t; - enum class Loading : std::uint8_t { DISCRETE, CONTIGUOUS, CONTIGUOUS_XIP }; - -private: - struct span { - // Note: the offset of the data for a span in data_ is generated on the - // fly by processing spans in order - address_t addr; // byte address in device memory - size_t len; - bool operator==(const span& other) const { return addr == other.addr && len == other.len; } - }; - - std::vector data_; - std::vector link_spans_; - uint32_t text_size_ = 0; - uint32_t text_addr_ = 0; - Loading loading_; - -public: - memory(); - memory(std::string const &path, Loading loading); - -public: - // These can be large objects, so ban copying ... - memory(memory const&) = delete; - memory& operator=(memory const&) = delete; - // ... but permit moving. - memory(memory&&) = default; - memory& operator=(memory&&) = default; + public: + typedef std::uint64_t address_t; + typedef std::uint32_t word_t; + enum class PackSpans { PACK, NO_PACK }; + enum class Relocate { XIP, NONE }; + + private: + static constexpr uint32_t initial_data_space_ = 0x400; + static constexpr uint32_t initial_span_space_ = 4; + + struct span { + // Note: the offset of the data for a span in data_ is generated on the + // fly by processing spans in order + address_t addr; // byte address in device memory + size_t len; + bool operator==(const span& other) const { return addr == other.addr && len == other.len; } + }; + + std::vector data_; + std::vector link_spans_; + uint32_t text_size_; + uint32_t packed_size_; + uint32_t text_addr_; + + public: + memory(); + memory(std::string const &path, Relocate relo_type); + + public: + // These can be large objects, so ban copying ... + memory(memory const&) = delete; + memory& operator=(memory const&) = delete; + // ... but permit moving. + memory(memory&&) = default; + memory& operator=(memory&&) = default; + + public: + const std::vector& data() const { return this->data_; } + + // memory& operator=(memory &&src); + bool operator==(const memory& other) const; + + void set_text_size(uint32_t size) { this->text_size_ = size; } + void set_packed_size(uint32_t size) { this->packed_size_ = size; } + uint32_t get_text_size() const { return this->text_size_; } + uint32_t get_packed_size() const { return this->packed_size_; } + uint32_t get_text_addr() const { return this->text_addr_; } + + size_t size() const { return data_.size(); } + + size_t num_spans() const { return link_spans_.size(); } public: - const std::vector& data() const { return this->data_; } + // Process spans in arg mem to fill data in *this (eg, from device) + void fill_from_mem_template(const memory& mem_template, const std::function::iterator, uint64_t addr, uint32_t len)>& callback); - bool operator==(const memory& other) const; - Loading get_loading() const {return loading_;} - - uint32_t get_text_size() const { return this->text_size_; } - uint32_t get_packed_size() const { return data_.size() * sizeof(word_t); } - uint32_t get_text_addr() const { return this->text_addr_; } - - size_t size() const { return data_.size(); } - - size_t num_spans() const { return link_spans_.size(); } - -public: - // Process spans in arg mem to fill data in *this (eg, from device) - void fill_from_mem_template(const memory& mem_template, const std::function::iterator, uint64_t addr, uint32_t len)>& callback); + // Iterate over spans_ to act on data_ (eg., to device) + void process_spans(const std::function::const_iterator, uint64_t addr, uint32_t len)>& callback) const; + void process_spans(const std::function::iterator, uint64_t addr, uint32_t len)>& callback); - // Iterate over spans_ to act on data_ (eg., to device) - void process_spans(const std::function::const_iterator, uint64_t addr, uint32_t len)>& callback) const; - void process_spans(const std::function::iterator, uint64_t addr, uint32_t len)>& callback); + void pack_data_into_text(std::uint64_t text_start, std::uint64_t data_start); }; } // namespace ll_api