diff --git a/device/CMakeLists.txt b/device/CMakeLists.txt index cecc88db..36b34679 100644 --- a/device/CMakeLists.txt +++ b/device/CMakeLists.txt @@ -1,20 +1,18 @@ set(UMD_DEVICE_SRCS architecture_implementation.cpp - blackhole_implementation.cpp cpuset_lib.cpp - grayskull_implementation.cpp tlb.cpp tt_cluster_descriptor.cpp tt_device.cpp - tt_emulation_stub.cpp tt_silicon_driver.cpp tt_silicon_driver_common.cpp tt_soc_descriptor.cpp - tt_versim_stub.cpp - wormhole_implementation.cpp simulation/tt_simulation_device.cpp simulation/tt_simulation_host.cpp + blackhole/blackhole_implementation.cpp + grayskull/grayskull_implementation.cpp + wormhole/wormhole_implementation.cpp ) add_library(umd_device SHARED ${UMD_DEVICE_SRCS}) target_link_libraries(umd_device diff --git a/device/architecture_implementation.cpp b/device/architecture_implementation.cpp index 96117d96..d55d3e29 100644 --- a/device/architecture_implementation.cpp +++ b/device/architecture_implementation.cpp @@ -4,9 +4,9 @@ #include "device/architecture_implementation.h" -#include "device/blackhole_implementation.h" -#include "device/grayskull_implementation.h" -#include "device/wormhole_implementation.h" +#include "device/blackhole/blackhole_implementation.h" +#include "device/grayskull/grayskull_implementation.h" +#include "device/wormhole/wormhole_implementation.h" namespace tt::umd { diff --git a/device/blackhole_implementation.cpp b/device/blackhole/blackhole_implementation.cpp similarity index 98% rename from device/blackhole_implementation.cpp rename to device/blackhole/blackhole_implementation.cpp index 4c36838c..eda2f140 100644 --- a/device/blackhole_implementation.cpp +++ b/device/blackhole/blackhole_implementation.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "device/blackhole_implementation.h" +#include "blackhole_implementation.h" namespace tt::umd { diff --git a/device/blackhole_implementation.h b/device/blackhole/blackhole_implementation.h similarity index 100% rename from device/blackhole_implementation.h rename to device/blackhole/blackhole_implementation.h diff --git a/device/blackhole/impl_device.hpp b/device/blackhole/impl_device.hpp deleted file mode 100644 index afb4091c..00000000 --- a/device/blackhole/impl_device.hpp +++ /dev/null @@ -1,76 +0,0 @@ -#pragma once - -#include "device/tt_silicon_driver_common.hpp" - -// See src/t6ifc/t6py/packages/tenstorrent/data/wormhole/pci/tlb.yaml -// local_offset: [ 0, 15, 0, "36-bit address prefix, prepended to the 20 LSBs of issued address to form a 56-bit NOC address. The 1MB TLB #n corresponds to the 1MB MMIO range starting at (0x0 + N*0x100000)."] -// x_end : [ 0, 21, 16, "" ] -// y_end : [ 0, 27, 22, "" ] -// x_start : [ 0, 33, 28, "" ] -// y_start : [ 0, 39, 34, "" ] -// noc_sel: [ 0, 40, 40, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 41, 41, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 43, 42, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 44, 44, "linked"] - -// local_offset: [ 0, 14, 0, "35-bit address prefix, prepended to the 21 LSBs of issued address to form a 56-bit NOC address. 
The 2MB TLB #n corresponds to the 2MB MMIO range starting at (0x9C00000 + N*0x200000)."] -// x_end : [ 0, 20, 15, "" ] -// y_end : [ 0, 26, 21, "" ] -// x_start : [ 0, 32, 27, "" ] -// y_start : [ 0, 38, 33, "" ] -// noc_sel: [ 0, 39, 39, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 40, 40, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 42, 41, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 43, 43, "linked"] - -// local_offset: [ 0, 11, 0, "32-bit address prefix, prepended to the 24 LSBs of issued address to form a 56-bit NOC address. The 16MB TLB #n corresponds to the 16MB MMIO range starting at (0xB000000 + N*0x1000000)."] -// x_end : [ 0, 17, 12, "" ] -// y_end : [ 0, 23, 18, "" ] -// x_start : [ 0, 29, 24, "" ] -// y_start : [ 0, 35, 30, "" ] -// noc_sel: [ 0, 36, 36, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 37, 37, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 39, 38, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 40, 40, "linked"] - -const auto TLB_1M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 16, - .y_end = 22, - .x_start = 28, - .y_start = 34, - .noc_sel = 40, - .mcast = 41, - .ordering = 42, - .linked = 44, - .static_vc = 45, - .static_vc_end = 46 -}; - -const auto TLB_2M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 15, - .y_end = 21, - .x_start = 27, - .y_start = 33, - .noc_sel = 39, - .mcast = 40, - .ordering = 41, - .linked = 43, - .static_vc = 44, - .static_vc_end = 45 -}; - -const auto TLB_16M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 12, - .y_end = 18, - .x_start = 24, - .y_start = 30, - .noc_sel = 36, - .mcast = 37, - .ordering = 38, - .linked = 40, - .static_vc = 41, - .static_vc_end = 42 -}; diff --git a/device/cpuset_lib.cpp b/device/cpuset_lib.cpp index 803ee8eb..123b5fd0 100644 --- a/device/cpuset_lib.cpp +++ b/device/cpuset_lib.cpp @@ -7,39 +7,13 @@ #include "cpuset_lib.hpp" #include "common/logger.hpp" #include -#include "device/device_api.h" +#include "device/tt_device.h" #include namespace tt { namespace fs = std::filesystem; namespace cpuset { -// Unrelated to hwloc binding of threads, instead to query cpu affinity to find reasonable number of threads to parallelize over. 
-int get_allowed_num_threads(){ - unsigned int num_pus_in_system = sysconf(_SC_NPROCESSORS_ONLN); - unsigned int num_threads = num_pus_in_system; - - cpu_set_t mask; - if (sched_getaffinity(0, sizeof(cpu_set_t), &mask) == -1) { - log_warning(LogSiliconDriver, "Could not detect current process cpu id affinity for calculating num_threads, will use default num_threads: {}.", num_threads); - } else{ - unsigned int visible_pu_count = CPU_COUNT(&mask); - if (visible_pu_count < num_pus_in_system){ - num_threads = visible_pu_count; - } - log_trace(LogSiliconDriver, "Detected (allowed) visible_pu_count: {}, setting num_threads: {}", visible_pu_count, num_threads); - } - - char const* override_thread_count = std::getenv("TT_BACKEND_COMPILE_THREADS"); - if (override_thread_count != nullptr && std::atoi(override_thread_count) > 0){ - num_threads = std::atoi(override_thread_count); - log_trace(LogSiliconDriver, "Overriding via env-var to num_threads: {}", num_threads); - } - - return num_threads; -} - - ///////////////////////////////////////////////////////////////////////// // Initialization Functions ///////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// @@ -49,7 +23,6 @@ tt_cpuset_allocator::tt_cpuset_allocator() { m_pid = getpid(); m_debug = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_DEBUG") ? true : false; - m_skip_singlify = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_SKIP_SINGLIFY") ? true : false; // Chicken bit to disable this entire feature for debug/comparison. bool cpuset_allocator_enable_env = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_ENABLE") ? true : false; @@ -72,7 +45,6 @@ tt_cpuset_allocator::tt_cpuset_allocator() { if (is_cpu_supported){ m_enable_cpuset_allocator &= init_determine_cpuset_allocations(); - m_enable_cpuset_allocator &= init_populate_physical_mmio_device_id_map(); }else{ m_enable_cpuset_allocator = false; } @@ -351,206 +323,10 @@ bool tt_cpuset_allocator::init_determine_cpuset_allocations(){ } -// Step 6 - Populate map of logical to physical mmio device map. -bool tt_cpuset_allocator::init_populate_physical_mmio_device_id_map(){ - - if (!m_enable_cpuset_allocator){ - return false; - } - - log_debug(LogSiliconDriver,"Starting tt_cpuset_allocator::populate_physical_mmio_device_id_map()"); - - // Get map of logical to physical device ids - FIXME: This is not accurate for some WHB0 clusters. - std::vector available_device_ids = tt_SiliconDevice::detect_available_device_ids(); - m_logical_to_physical_mmio_device_id_map = tt_SiliconDevice::get_logical_to_physical_mmio_device_id_map(available_device_ids); - - for (auto &d: m_logical_to_physical_mmio_device_id_map){ - auto logical_device_id = d.first; - auto physical_device_id = d.second; - log_debug(LogSiliconDriver, "populate_physical_mmio_device_id_map() -- available_devices: {} logical_device_id: {} => physical_device_id: {}", available_device_ids.size(), (int) logical_device_id, (int) physical_device_id); - m_num_threads_pinned_per_tt_device.insert({physical_device_id, 0}); - } - - return true; // Success -} - - ///////////////////////////////////////////////////////////////////////// // Runtime Functions //////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// -// Idea - Something to compare cpuset from Slurm to cpuset picked by this function. 
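For reference, the get_allowed_num_threads() heuristic deleted above reduces to the following standalone sketch (assuming Linux/glibc, where cpu_set_t and CPU_COUNT come from sched.h; the free-function name here is illustrative, not part of UMD):

#include <sched.h>   // sched_getaffinity, cpu_set_t, CPU_COUNT (glibc)
#include <unistd.h>  // sysconf
#include <cstdlib>   // std::getenv, std::atoi

static int allowed_num_threads() {
    // Start from the number of online processors.
    int num_threads = static_cast<int>(sysconf(_SC_NPROCESSORS_ONLN));

    // Shrink to the process' affinity mask when restricted (taskset, cgroups, Slurm).
    cpu_set_t mask;
    if (sched_getaffinity(0, sizeof(mask), &mask) == 0) {
        int visible = CPU_COUNT(&mask);
        if (visible < num_threads) num_threads = visible;
    }

    // Same env-var override the removed code honored.
    if (const char* env = std::getenv("TT_BACKEND_COMPILE_THREADS")) {
        if (std::atoi(env) > 0) num_threads = std::atoi(env);
    }
    return num_threads;
}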
-hwloc_cpuset_t tt_cpuset_allocator::allocate_cpu_set_for_thread(chip_id_t physical_device_id, bool skip_singlify){ - - // To prevent races on read/modify/write to m_num_threads_pinned_per_tt_device across threads to same device. - const std::lock_guard lock(allocate_cpu_id_mutex); - - int num_alloc_slots_for_tt_device = m_physical_device_id_to_cpusets_map.at(physical_device_id).size(); - int tt_device_alloc_idx = m_num_threads_pinned_per_tt_device.at(physical_device_id) % num_alloc_slots_for_tt_device; - - // Check if 2CCX-PER-CCD Optimization can be enabled. For AMD EPYC models : There is 1 L3Cache per CCX and 2 CCX per CCD. - // Better perf to first allocate to unique CCD's if we have enough per device. Expand to other CPU types? - bool enable_special_case = true; - auto package_id = m_physical_device_id_to_package_id_map.at(physical_device_id); - auto num_l3_per_ccx = m_package_id_to_num_l3_per_ccx_map.at(package_id); - auto num_ccx_per_ccd = m_package_id_to_num_ccx_per_ccd_map.at(package_id); - - if (enable_special_case && num_l3_per_ccx == 1 && num_ccx_per_ccd == 2 && num_alloc_slots_for_tt_device > num_ccx_per_ccd && m_object_per_alloc_slot == HWLOC_OBJ_L3CACHE){ - int alloc_idx_for_device = m_num_threads_pinned_per_tt_device.at(physical_device_id); - int ccx_in_ccd = (alloc_idx_for_device % num_alloc_slots_for_tt_device) < num_alloc_slots_for_tt_device/num_ccx_per_ccd ? 0 : 1; - tt_device_alloc_idx = (ccx_in_ccd + (alloc_idx_for_device * num_ccx_per_ccd)) % num_alloc_slots_for_tt_device; - log_debug(LogSiliconDriver,"Special L3Cache case physical_device_id: {} alloc_idx_for_device: {} ccx_in_ccd: {} tt_device_alloc_idx: {}", physical_device_id, alloc_idx_for_device, ccx_in_ccd, tt_device_alloc_idx); - } - - - // Get the desired cpuset and prevent migration between different PU's in set by singlifying to single PU. - hwloc_cpuset_t cpuset = hwloc_bitmap_dup(m_physical_device_id_to_cpusets_map.at(physical_device_id).at(tt_device_alloc_idx)); - if (!m_skip_singlify && !skip_singlify){ - hwloc_bitmap_singlify(cpuset); - } - - // Debug - auto tid = std::this_thread::get_id(); - log_debug(LogSiliconDriver,"Allocating for physical_device_id: {} num_alloc_slots: {} num_threads_pinned: {} alloc_idx: {} skip_singlify: {} (pid: {} tid: {}) => {} PU's {}", - physical_device_id, num_alloc_slots_for_tt_device, m_num_threads_pinned_per_tt_device.at(physical_device_id), tt_device_alloc_idx, skip_singlify, - m_pid, tid, hwloc_bitmap_weight(cpuset), get_hwloc_bitmap_vector(cpuset)); - - // Increment counter to keep track of number of pinned thread per device, to get unique cpuset per thread. 
- m_num_threads_pinned_per_tt_device.at(physical_device_id)++; - - return cpuset; -} - -void tt_cpuset_allocator::store_thread_original_cpuset(){ - - auto tid = std::this_thread::get_id(); - hwloc_cpuset_t orig_cpuset = hwloc_bitmap_alloc(); - - if (hwloc_get_cpubind(m_topology, orig_cpuset, HWLOC_CPUBIND_THREAD)){ - log_warning(LogSiliconDriver,"store_thread_original_cpuset() calling hwloc_get_cpubind() failed with errno: {} (pid: {} tid:{})", strerror(errno), m_pid, tid); - }else{ - auto orig_cpuset_vector = get_hwloc_bitmap_vector(orig_cpuset); - log_debug(LogSiliconDriver, "store_thread_original_cpuset() success - got orig cpuset: {} PU's: {} (pid: {} tid: {})", orig_cpuset_vector.size(), orig_cpuset_vector, m_pid, tid); - m_global_thread_id_to_original_cpuset_map.insert({tid, hwloc_bitmap_dup(orig_cpuset)}); - } - hwloc_bitmap_free(orig_cpuset); -} - - - -// Given a logical device_id, determine the right cpu_ids associated with it and pin this thread to them. -void tt_cpuset_allocator::bind_thread_cpuset(tt_cluster_description *ndesc, chip_id_t logical_device_id, bool skip_singlify){ - - auto tid = std::this_thread::get_id(); - - // This needed to be protected by not-empty otherwise arithmetic error. - if ((!m_global_thread_ids_pinned.empty() && m_global_thread_ids_pinned.count(tid)) || (!m_enable_cpuset_allocator)){ - return; - }else{ - - if (!ndesc->is_chip_mmio_capable(logical_device_id)){ - logical_device_id = ndesc->get_closest_mmio_capable_chip(logical_device_id); - } - - log_debug(LogSiliconDriver,"bind_thread_cpuset_cpuset() for logical_device_id: {} m_logical_to_physical_mmio_device_id_map.size(): {}", logical_device_id, m_logical_to_physical_mmio_device_id_map.size()); - - // If a main thread ID was captured, make sure it is not attempted to be pinned. Only IO API sub threads are expected to be pinned today. - if (m_stored_main_thread_id && tid == m_main_thread_id){ - log_warning(LogSiliconDriver, "bind_thread_cpuset() - Skipping cpubind for runtime main thread_id: {} to prevent undesired inheritence. Consider moving device IO (ie. push/pop/get) to sub-threads for binding to be supported.", m_main_thread_id); - return; - } - - if (m_logical_to_physical_mmio_device_id_map.count(logical_device_id) > 0){ - - auto physical_device_id = m_logical_to_physical_mmio_device_id_map.at(logical_device_id); - auto package_id = m_physical_device_id_to_package_id_map.at(physical_device_id); - - store_thread_original_cpuset(); // Store original cpuset for later unbinding if necessary. - - // Get the cpuset, and attempt to bind thread to it. - hwloc_cpuset_t cpuset = allocate_cpu_set_for_thread(physical_device_id, skip_singlify); - auto cpuset_vector = get_hwloc_bitmap_vector(cpuset); - - if (hwloc_set_cpubind(m_topology, cpuset, HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT )){; // HWLOC_CPUBIND_NOMEMBIND - log_warning(LogSiliconDriver,"bind_thread_cpuset() binding failed (errno: {}) for physical_device_id: {} on package_id: {} to {} PU's: {} (pid: {} tid: {})", - strerror(errno), physical_device_id, package_id, cpuset_vector.size(), cpuset_vector, m_pid, tid); - }else{ - log_debug(LogSiliconDriver,"bind_thread_cpuset() binding success skip: {} for physical_device_id: {} on package_id: {} to {} PU's: {} (pid: {} tid: {})", - skip_singlify, physical_device_id, package_id, cpuset_vector.size(), cpuset_vector, m_pid, tid); - // Record that this thread is pinned, no need to repeat on subsequent IO API calls. 
- m_global_thread_ids_pinned.insert(tid); - m_global_thread_id_to_physical_device_id_map.insert({tid, physical_device_id}); - } - - }else{ - log_warning(LogSiliconDriver,"Could not find logical_device_id: {} in m_logical_to_physical_mmio_device_id_map. This shouldn't happen.", logical_device_id); - } - } -} - - -// Restore thread's original cpubind. Perhaps could be simplified to not require physical_device_id or previous binding, and just always bind to MACHINE cpuset. -void tt_cpuset_allocator::unbind_thread_cpuset(){ - - if (m_enable_cpuset_allocator){ - auto tid = std::this_thread::get_id(); - - // Make sure this thread was successfully and previously binded to a cpuset. - if (!m_global_thread_id_to_original_cpuset_map.count(tid)){ - log_warning(LogSiliconDriver,"unbind_thread_cpuset() called for tid: {} but no original cpuset for this thread found. Previous cpu binding skipped or failed?", tid); - return; - } - - if (!m_global_thread_id_to_physical_device_id_map.count(tid)){ - log_warning(LogSiliconDriver,"unbind_thread_cpuset() called for tid: {} but no physical_device_id this thread found. Previous cpu binding skipped or failed?", tid); - return; - } - - // Handle the case where something goes wrong during original binding above, don't want to error out. - auto cpuset = m_global_thread_id_to_original_cpuset_map.at(tid); - auto physical_device_id = m_global_thread_id_to_physical_device_id_map.at(tid); - auto cpuset_vector = get_hwloc_bitmap_vector(cpuset); // Can tighten this up and remove, it's purely for debug anyways. - - if (hwloc_set_cpubind(m_topology, cpuset, HWLOC_CPUBIND_THREAD)){ - log_warning(LogSiliconDriver,"unbind_thread_cpuset() binding failed (errno: {}) for physical_device_id: {} to original {} PU's: {} (pid: {} tid: {})", - strerror(errno), physical_device_id, cpuset_vector.size(), cpuset_vector, m_pid, tid); - }else{ - log_debug(LogSiliconDriver,"unbind_thread_cpuset() binding success for physical_device_id: {} to original {} PU's: {} (pid: {} tid: {})", - physical_device_id, cpuset_vector.size(), cpuset_vector, m_pid, tid); - - // To prevent races on read/modify/write to m_num_threads_pinned_per_tt_device across threads to same device. - const std::lock_guard lock(allocate_cpu_id_mutex); - - // Update book-keeping by removing entry, so this thread can be re-pinned in the future. - m_num_threads_pinned_per_tt_device.at(physical_device_id)--; - m_global_thread_ids_pinned.erase(tid); - m_global_thread_id_to_physical_device_id_map.erase(tid); - } - } -} - -// Teardown/Cleanup for end of process. Don't do anything if feature disabled. Probably don't even need this if process is going to be ended. -void tt_cpuset_allocator::clear_state(){ - if (m_enable_cpuset_allocator){ - - auto tid = std::this_thread::get_id(); - log_debug(LogSiliconDriver,"Clearing state and unbinding entire process' cpuset (pid: {} tid: {}).", m_pid, tid); - - // Reset state variables so that next time the thread can be freshly pinned - m_global_thread_ids_pinned.clear(); - for (auto &device: m_num_threads_pinned_per_tt_device){ - device.second = 0; - } - - // Undo previous pinning, by binding to full machine cpuset. Alternatively could have saved and restored orig cpuset per thread. 
- auto machine_obj = hwloc_get_obj_by_type(m_topology, HWLOC_OBJ_MACHINE, 0); - if (hwloc_set_cpubind(m_topology, machine_obj->cpuset, HWLOC_CPUBIND_PROCESS)){ - log_warning(LogSiliconDriver,"clear_state() binding failed (errno: {}) to Machine cpuset (pid: {} tid: {})", strerror(errno), m_pid, tid); - } - } -} - - // Given a physical device_id, determine the right numa nodes associated with it and attempt to membind a previously allocated memory region to it. bool tt_cpuset_allocator::bind_area_memory_nodeset(chip_id_t physical_device_id, const void * addr, size_t len){ @@ -580,14 +356,6 @@ bool tt_cpuset_allocator::bind_area_memory_nodeset(chip_id_t physical_device_id, return true; // Success } - -// For checking purposes, to make sure main thread is not cpubinded accidentally. -void tt_cpuset_allocator::_set_main_thread_id(){ - m_main_thread_id = std::this_thread::get_id(); - m_stored_main_thread_id = true; - log_debug(LogSiliconDriver,"Captured main_thread_id: {}", m_main_thread_id); -} - int tt_cpuset_allocator::_get_num_tt_pci_devices() { for (auto &d : m_physical_device_id_to_package_id_map) { diff --git a/device/cpuset_lib.hpp b/device/cpuset_lib.hpp index 65e31eaa..a14a4f33 100644 --- a/device/cpuset_lib.hpp +++ b/device/cpuset_lib.hpp @@ -24,8 +24,6 @@ namespace tt { //! Utility functions for various backend paramsf namespace cpuset { -int get_allowed_num_threads(); - // CPU ID allocator for pinning threads to cpu_ids // It's a singleton that should be retrieved via get() struct tt_cpuset_allocator { @@ -34,39 +32,12 @@ struct tt_cpuset_allocator { tt_cpuset_allocator(tt_cpuset_allocator const&) = delete; void operator=(tt_cpuset_allocator const&) = delete; - static void bind_thread_to_cpuset(tt_cluster_description *ndesc, chip_id_t device_id, bool skip_singlify=false){ - auto& instance = tt_cpuset_allocator::get(); - instance.bind_thread_cpuset(ndesc, device_id, skip_singlify); - } - - static void unbind_thread_from_cpuset(){ - auto& instance = tt_cpuset_allocator::get(); - instance.unbind_thread_cpuset(); - } - - static void clear_state_and_cpuset_pins(){ - auto& instance = tt_cpuset_allocator::get(); - instance.clear_state(); - } - // Bind an already allocated memory region to particular numa nodes static bool bind_area_to_memory_nodeset(chip_id_t physical_device_id, const void * addr, size_t len){ auto& instance = tt_cpuset_allocator::get(); return instance.bind_area_memory_nodeset(physical_device_id, addr, len); } - // Store process' main thread_id (not required, mainly for checking purposes to ensure no cpubinds on it occur). - static void set_main_thread_id(){ - auto& instance = tt_cpuset_allocator::get(); - instance._set_main_thread_id(); - } - - static int get_num_cpu_cores_allocated_to_device(chip_id_t physical_device_id){ - auto& instance = tt_cpuset_allocator::get(); - auto num_cores = instance.m_enable_cpuset_allocator ? 
instance.m_num_cpu_cores_allocated_per_tt_device.at(physical_device_id) : get_allowed_num_threads(); - return num_cores; - } - static int get_num_tt_pci_devices(){ auto& instance = tt_cpuset_allocator::get(); return instance._get_num_tt_pci_devices(); @@ -88,17 +59,10 @@ struct tt_cpuset_allocator { int TENSTORRENT_VENDOR_ID = 0x1e52; - void bind_thread_cpuset(tt_cluster_description *ndesc, chip_id_t device_id, bool skip_singlify); - void unbind_thread_cpuset(); - void store_thread_original_cpuset(); bool bind_area_memory_nodeset(chip_id_t physical_device_id, const void * addr, size_t len); - void _set_main_thread_id(); int _get_num_tt_pci_devices(); int _get_num_tt_pci_devices_by_pci_device_id(uint16_t device_id, uint16_t revision_id); - void clear_state(); - hwloc_cpuset_t allocate_cpu_set_for_thread(chip_id_t physical_device_id, bool skip_singlify); - // Series of init functions, must be called in this order. Seperated out to support // early exit in case of errors. bool init_topology_init_and_load(); @@ -106,7 +70,6 @@ struct tt_cpuset_allocator { bool init_get_number_of_packages(); bool init_is_cpu_model_supported(); bool init_determine_cpuset_allocations(); - bool init_populate_physical_mmio_device_id_map(); // Helper Functions std::string get_pci_bus_id(hwloc_obj_t pci_device_obj); @@ -122,11 +85,8 @@ struct tt_cpuset_allocator { std::vector get_hwloc_nodeset_vector(hwloc_obj_t &obj); hwloc_topology_t m_topology; bool m_debug; - bool m_skip_singlify; pid_t m_pid; - std::unordered_map m_logical_to_physical_mmio_device_id_map; - // Items calculated by parsing system info, used by allocation algorithm: std::map> m_package_id_to_devices_map; std::map m_physical_device_id_to_pci_bus_id_map; // Debug/Info @@ -135,30 +95,16 @@ struct tt_cpuset_allocator { std::map> m_physical_device_id_to_cpusets_map; std::map m_physical_device_id_to_package_id_map; - std::mutex allocate_cpu_id_mutex; - bool m_enable_cpuset_allocator = true; // Enable feature, otherwise do nothing. int m_num_packages = 0; std::vector m_all_tt_devices = {}; hwloc_obj_type_t m_object_per_alloc_slot = HWLOC_OBJ_L3CACHE; // Default - // For 2CCX-PER-CCD Optimization detection. std::map m_package_id_to_num_l3_per_ccx_map; std::map m_package_id_to_num_ccx_per_ccd_map; - std::map m_num_threads_pinned_per_tt_device; - std::unordered_set m_global_thread_ids_pinned = {}; - std::thread::id m_main_thread_id; - bool m_stored_main_thread_id = false; - - // For quicker unbinding of threads, record the physical_device_id during binding. - std::map m_global_thread_id_to_physical_device_id_map = {}; - - // For storing original cpuset during binding, to restore during unbinding. - std::map m_global_thread_id_to_original_cpuset_map = {}; - // Memory Binding std::map m_physical_device_id_to_numa_nodeset_map; diff --git a/device/device_api.h b/device/device_api.h deleted file mode 100644 index a2728e7a..00000000 --- a/device/device_api.h +++ /dev/null @@ -1,10 +0,0 @@ -/* - * SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. 
- * - * SPDX-License-Identifier: Apache-2.0 - */ - -#pragma once -#include "device/tt_device.h" -#include "device/driver_atomics.h" -#include "device/tt_emulation_device.h" diff --git a/device/device_api_metal.h b/device/device_api_metal.h index a2728e7a..0fc7820c 100644 --- a/device/device_api_metal.h +++ b/device/device_api_metal.h @@ -7,4 +7,3 @@ #pragma once #include "device/tt_device.h" #include "device/driver_atomics.h" -#include "device/tt_emulation_device.h" diff --git a/device/grayskull_implementation.cpp b/device/grayskull/grayskull_implementation.cpp similarity index 98% rename from device/grayskull_implementation.cpp rename to device/grayskull/grayskull_implementation.cpp index 9d773166..6ed7aaaf 100644 --- a/device/grayskull_implementation.cpp +++ b/device/grayskull/grayskull_implementation.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "device/grayskull_implementation.h" +#include "grayskull_implementation.h" namespace tt::umd { diff --git a/device/grayskull_implementation.h b/device/grayskull/grayskull_implementation.h similarity index 99% rename from device/grayskull_implementation.h rename to device/grayskull/grayskull_implementation.h index 79bdfdee..c014350a 100644 --- a/device/grayskull_implementation.h +++ b/device/grayskull/grayskull_implementation.h @@ -99,7 +99,6 @@ enum class arc_message_type { ARC_GO_LONG_IDLE = 0x54, ARC_GET_HARVESTING = 0x57, TEST = 0x90, - NOC_DMA_TRANSFER = 0x9A, SETUP_IATU_FOR_PEER_TO_PEER = 0x97, DEASSERT_RISCV_RESET = 0xba }; diff --git a/device/grayskull/impl_device.hpp b/device/grayskull/impl_device.hpp deleted file mode 100644 index 21a18125..00000000 --- a/device/grayskull/impl_device.hpp +++ /dev/null @@ -1,85 +0,0 @@ -/* - * SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. - * - * SPDX-License-Identifier: Apache-2.0 - */ - -#pragma once - -#include "device/tt_silicon_driver_common.hpp" - -// See src/t6ifc/t6py/packages/tenstorrent/data/grayskull/pci/tlb.yaml -// 1M -// local_offset: [ 0, 11, 0, "36-bit address prefix, prepended to the 20 LSBs of issued address to form a 56-bit NOC address. The 1MB TLB #n corresponds to the 1MB MMIO range starting at (0x0 + N*0x100000)."] -// x_end : [ 0, 17, 12, "" ] -// y_end : [ 0, 23, 18, "" ] -// x_start : [ 0, 29, 24, "" ] -// y_start : [ 0, 35, 30, "" ] -// noc_sel: [ 0, 36, 36, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 37, 37, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 39, 38, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 40, 40, "linked"] - -// 2M -// local_offset: [ 0, 10, 0, "35-bit address prefix, prepended to the 21 LSBs of issued address to form a 56-bit NOC address. The 2MB TLB #n corresponds to the 2MB MMIO range starting at (0x9C00000 + N*0x200000)."] -// x_end : [ 0, 16, 11, "" ] -// y_end : [ 0, 22, 17, "" ] -// x_start : [ 0, 28, 23, "" ] -// y_start : [ 0, 34, 29, "" ] -// noc_sel: [ 0, 35, 35, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 36, 36, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 38, 37, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 39, 39, "linked"] - -// 16M -// local_offset: [ 0, 7 , 0, "32-bit address prefix, prepended to the 24 LSBs of issued address to form a 56-bit NOC address. 
The 16MB TLB #n corresponds to the 16MB MMIO range starting at (0xB000000 + N*0x1000000)."] -// x_end : [ 0, 13, 8, "" ] -// y_end : [ 0, 19, 14, "" ] -// x_start : [ 0, 25, 20, "" ] -// y_start : [ 0, 31, 26, "" ] -// noc_sel: [ 0, 32, 32, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 33, 33, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 35, 34, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 36, 36, "linked"] - -const auto TLB_1M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 12, - .y_end = 18, - .x_start = 24, - .y_start = 30, - .noc_sel = 36, - .mcast = 37, - .ordering = 38, - .linked = 40, - .static_vc = 41, - .static_vc_end = 42 -}; - -const auto TLB_2M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 11, - .y_end = 17, - .x_start = 23, - .y_start = 29, - .noc_sel = 35, - .mcast = 36, - .ordering = 37, - .linked = 39, - .static_vc = 40, - .static_vc_end = 41 -}; - -const auto TLB_16M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 8, - .y_end = 14, - .x_start = 20, - .y_start = 26, - .noc_sel = 32, - .mcast = 33, - .ordering = 34, - .linked = 36, - .static_vc = 37, - .static_vc_end = 38 -}; diff --git a/device/kmdif.h b/device/kmdif.h index 32596d55..c013202b 100644 --- a/device/kmdif.h +++ b/device/kmdif.h @@ -9,15 +9,6 @@ typedef std::uint32_t DWORD; -const uint32_t MAX_DMA_BYTES = 4*1024*1024; - -// DMA -struct DMAbuffer { - void *pBuf = NULL; - std::uint64_t pDma = 0; - std::uint64_t size; -}; - struct TTDevice; struct PCIdevice { diff --git a/device/tt_emulation_device.cpp b/device/simulation/deprecated/tt_emulation_device.cpp similarity index 100% rename from device/tt_emulation_device.cpp rename to device/simulation/deprecated/tt_emulation_device.cpp diff --git a/device/tt_emulation_device.h b/device/simulation/deprecated/tt_emulation_device.h similarity index 97% rename from device/tt_emulation_device.h rename to device/simulation/deprecated/tt_emulation_device.h index 259841c4..fb2b5e0d 100644 --- a/device/tt_emulation_device.h +++ b/device/simulation/deprecated/tt_emulation_device.h @@ -1,3 +1,9 @@ +/* + * SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. + * + * SPDX-License-Identifier: Apache-2.0 + */ + #pragma once #include diff --git a/device/tt_emulation_stub.cpp b/device/simulation/deprecated/tt_emulation_stub.cpp similarity index 100% rename from device/tt_emulation_stub.cpp rename to device/simulation/deprecated/tt_emulation_stub.cpp diff --git a/device/tt_versim_device.cpp b/device/simulation/deprecated/tt_versim_device.cpp similarity index 100% rename from device/tt_versim_device.cpp rename to device/simulation/deprecated/tt_versim_device.cpp diff --git a/device/simulation/deprecated/tt_versim_device.h b/device/simulation/deprecated/tt_versim_device.h new file mode 100644 index 00000000..087b7336 --- /dev/null +++ b/device/simulation/deprecated/tt_versim_device.h @@ -0,0 +1,72 @@ +/* + * SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include "tt_device.h" +#include "tt_soc_descriptor.h" +#include "tt_xy_pair.h" + +class c_versim_core; +namespace nuapi {namespace device {template class Simulator;}} +namespace versim { + struct VersimSimulatorState; + using VersimSimulator = nuapi::device::Simulator; +} + +/** + * @brief Versim Backend Class, derived from the tt_device class + * Implements APIs to communicate with a simulated (using Verilator) Tenstorrent Device. 
+*/ +class tt_VersimDevice: public tt_device +{ + public: + virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); + virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_); + tt_VersimDevice(const std::string &sdesc_path, const std::string &ndesc_path); + virtual std::unordered_map<chip_id_t, tt_SocDescriptor>& get_virtual_soc_descriptors(); + virtual void start(std::vector<std::string> plusargs, std::vector<std::string> dump_cores, bool no_checkers, bool init_device, bool skip_driver_allocs); + virtual void start_device(const tt_device_params &device_params); + virtual void close_device(); + virtual void deassert_risc_reset(); + virtual void deassert_risc_reset_at_core(tt_cxy_pair core); + virtual void assert_risc_reset(); + virtual void assert_risc_reset_at_core(tt_cxy_pair core); + virtual void write_to_device(std::vector<uint32_t> &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); + virtual void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set<chip_id_t>& chips_to_exclude, std::set<uint32_t>& rows_to_exclude, std::set<uint32_t>& columns_to_exclude, const std::string& fallback_tlb); + virtual void rolled_write_to_device(std::vector<uint32_t> &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); + virtual void read_from_device(std::vector<uint32_t> &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); + virtual void rolled_write_to_device(uint32_t* mem_ptr, uint32_t size_in_bytes, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb); + virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); + virtual void read_from_device(void *mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); + virtual void wait_for_non_mmio_flush(); + void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores = {}); + void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<uint32_t>& channels); + void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores = {}); + virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c); + virtual bool using_harvested_soc_descriptors(); + virtual std::unordered_map<chip_id_t, uint32_t> get_harvesting_masks_for_soc_descriptors(); + virtual bool noc_translation_en(); + virtual std::set<chip_id_t> get_target_mmio_device_ids(); + virtual std::set<chip_id_t> get_target_remote_device_ids(); + virtual ~tt_VersimDevice(); + virtual tt_ClusterDescriptor* get_cluster_description(); + virtual int get_number_of_chips_in_cluster(); + virtual std::unordered_set<chip_id_t> get_all_chips_in_cluster(); + static int detect_number_of_chips(); + virtual std::map<int, int> get_clocks(); + virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id); + virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel); + virtual std::uint32_t get_num_host_channels(std::uint32_t device_id); + virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel); + private: + bool stop(); + tt_device_l1_address_params l1_address_params; + tt_device_dram_address_params
dram_address_params; + versim::VersimSimulator* versim; + std::shared_ptr<tt_ClusterDescriptor> ndesc; + void* p_ca_soc_manager; +}; diff --git a/device/tt_versim_stub.cpp b/device/simulation/deprecated/tt_versim_stub.cpp similarity index 100% rename from device/tt_versim_stub.cpp rename to device/simulation/deprecated/tt_versim_stub.cpp diff --git a/device/simulation/tt_simulation_device.h b/device/simulation/tt_simulation_device.h index c57bc1da..27a5fdc2 100644 --- a/device/simulation/tt_simulation_device.h +++ b/device/simulation/tt_simulation_device.h @@ -36,13 +36,7 @@ class tt_SimulationDevice: public tt_device { // Runtime Functions virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - - // void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set<chip_id_t>& chips_to_exclude, std::set<uint32_t>& rows_to_exclude, std::set<uint32_t>& columns_to_exclude, const std::string& fallback_tlb); - - // virtual void rolled_write_to_device(uint32_t* mem_ptr, uint32_t size_in_bytes, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb); - // virtual void rolled_write_to_device(std::vector<uint32_t> &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); virtual void read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); - virtual void write_to_sysmem(std::vector<uint32_t>& vec, uint64_t addr, uint16_t channel, chip_id_t src_device_id); virtual void write_to_sysmem(const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id); virtual void read_from_sysmem(std::vector<uint32_t> &vec, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id); @@ -57,26 +51,16 @@ class tt_SimulationDevice: public tt_device { // Misc.
Functions to Query/Set Device State // virtual bool using_harvested_soc_descriptors(); virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); - // virtual bool noc_translation_en(); - // virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c); - // virtual int get_number_of_chips_in_cluster(); - // virtual std::unordered_set get_all_chips_in_cluster(); - // virtual tt_ClusterDescriptor* get_cluster_description(); static std::vector detect_available_device_ids(); - // static std::unordered_map get_logical_to_physical_mmio_device_id_map(std::vector physical_device_ids); virtual std::set get_target_remote_device_ids(); virtual std::map get_clocks(); - // virtual uint32_t dma_allocation_size(chip_id_t src_device_id = -1); - // virtual void *channel_0_address(std::uint32_t offset, std::uint32_t device_id) const; virtual void *host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; virtual std::uint64_t get_pcie_base_addr_from_device() const; virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id); virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel); virtual std::uint32_t get_num_host_channels(std::uint32_t device_id); virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel); - // virtual std::uint32_t get_pcie_speed(std::uint32_t device_id); virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id); - // virtual tt_version get_ethernet_fw_version() const; private: // State variables diff --git a/device/tt_cluster_descriptor.cpp b/device/tt_cluster_descriptor.cpp index 2ef5ec64..08a15f18 100644 --- a/device/tt_cluster_descriptor.cpp +++ b/device/tt_cluster_descriptor.cpp @@ -52,29 +52,6 @@ std::vector> tt_ClusterDescri return directly_connected_channels; } -bool tt_ClusterDescriptor::channels_are_directly_connected(const chip_id_t &first, const ethernet_channel_t &first_channel, const chip_id_t &second, const ethernet_channel_t &second_channel) const { - if (this->enabled_active_chips.find(first) == this->enabled_active_chips.end() || this->enabled_active_chips.find(second) == this->enabled_active_chips.end()) { - return false; - } - - if (this->ethernet_connections.at(first).find(first_channel) == this->ethernet_connections.at(first).end()) { - return false; - } - - const auto &[connected_chip, connected_channel] = this->ethernet_connections.at(first).at(first_channel); - return connected_chip == second && connected_channel == second_channel; -} - -// const eth_coord_t tt_ClusterDescriptor::get_chip_xy(const chip_id_t &chip_id) const { -// // For now we only support a 1D cluster, so the mapping is trivial (where the chip ID is the x value of the xy -// location) return eth_coord_t(chip_id, 0, 0, 0); -// } - -// const chip_id_t tt_ClusterDescriptor::get_chip_id_at_location(const eth_coord_t &chip_location) const { -// // For now we only support a 1D cluster, so the mapping is trivial (where the chip ID is the x value of the xy -// location) return chip_location.x; -// } - bool tt_ClusterDescriptor::is_chip_mmio_capable(const chip_id_t &chip_id) const { return this->chips_with_mmio.find(chip_id) != this->chips_with_mmio.end(); } @@ -367,14 +344,6 @@ std::unique_ptr tt_ClusterDescriptor::create_for_grayskull return desc; } -std::set get_sequential_chip_id_set(int num_chips) { - std::set chip_ids; - for (int i = 0; i < num_chips; ++i) { - chip_ids.insert(static_cast(i)); - } - return chip_ids; -} - void 
tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc) { log_assert(yaml["ethernet_connections"].IsSequence(), "Invalid YAML"); for (YAML::Node &connected_endpoints : yaml["ethernet_connections"].as>()) { @@ -594,22 +563,10 @@ void tt_ClusterDescriptor::load_harvesting_information(YAML::Node &yaml, tt_Clus } } -void tt_ClusterDescriptor::specify_enabled_devices(const std::vector &chip_ids) { - this->enabled_active_chips.clear(); - for (auto chip_id : chip_ids) { - this->enabled_active_chips.insert(chip_id); - } -} - void tt_ClusterDescriptor::enable_all_devices() { this->enabled_active_chips = this->all_chips; } -bool tt_ClusterDescriptor::chips_have_ethernet_connectivity() const { - return ethernet_connections.size() > 0; -} - - std::unordered_map > > tt_ClusterDescriptor::get_ethernet_connections() const { auto eth_connections = std::unordered_map > >(); diff --git a/device/tt_cluster_descriptor.h b/device/tt_cluster_descriptor.h index 1a923a8b..bbb8a796 100644 --- a/device/tt_cluster_descriptor.h +++ b/device/tt_cluster_descriptor.h @@ -76,7 +76,6 @@ class tt_ClusterDescriptor { */ std::vector> get_directly_connected_ethernet_channels_between_chips(const chip_id_t &first, const chip_id_t &second) const; - bool channels_are_directly_connected(const chip_id_t &first, const ethernet_channel_t &first_channel, const chip_id_t &second, const ethernet_channel_t &second_channel) const; bool is_chip_mmio_capable(const chip_id_t &chip_id) const; chip_id_t get_closest_mmio_capable_chip(const chip_id_t &chip); chip_id_t get_shelf_local_physical_chip_coords(chip_id_t virtual_coord); @@ -84,10 +83,7 @@ class tt_ClusterDescriptor { static std::unique_ptr create_for_grayskull_cluster( const std::set &logical_mmio_device_ids, const std::vector &physical_mmio_device_ids); - // const eth_coord_t get_chip_xy(const chip_id_t &chip_id) const; - // const chip_id_t get_chip_id_at_location(const eth_coord_t &chip_location) const; - bool chips_have_ethernet_connectivity() const; std::unordered_map get_harvesting_info() const; std::unordered_map get_noc_translation_table_en() const; std::unordered_map get_chip_locations() const; @@ -103,9 +99,6 @@ class tt_ClusterDescriptor { bool ethernet_core_has_active_ethernet_link(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const; std::tuple get_chip_and_channel_of_remote_ethernet_core(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const; - void specify_enabled_devices(const std::vector &chip_ids); void enable_all_devices(); }; - -std::set get_sequential_chip_id_set(int num_chips); diff --git a/device/tt_device.h b/device/tt_device.h index f3064cd5..22117ad2 100644 --- a/device/tt_device.h +++ b/device/tt_device.h @@ -20,7 +20,6 @@ #include "device/tlb.h" #include "device/tt_io.hpp" -using TLB_OFFSETS = tt::umd::tlb_offsets; using TLB_DATA = tt::umd::tlb_data; @@ -37,13 +36,6 @@ enum tt_DevicePowerState { LONG_IDLE }; -enum tt_MutexType { - LARGE_READ_TLB, - LARGE_WRITE_TLB, - SMALL_READ_WRITE_TLB, - ARC_MSG -}; - enum tt_MemBarFlag { SET = 0xaa, RESET = 0xbb, @@ -353,10 +345,8 @@ class tt_device * \param core chip-x-y struct specifying device and core * \param addr Address to write to * \param tlb_to_use Specifies fallback/dynamic TLB to use for transaction, if this core does not have static TLBs mapped to this address (dynamic TLBs were initialized in driver constructor) - * \param send_epoch_cmd Specifies that this is an epoch_cmd write, forcing runtime to take a 
faster write path (Buda only) - * \param last_send_epoch_cmd Specifies that this is the last epoch command being written, which requires metadata to be updated (Buda only) */ - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false) { + virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { // Only implement this for Silicon Backend throw std::runtime_error("---- tt_device::write_to_device is not implemented\n"); } @@ -369,38 +359,11 @@ class tt_device * \param core chip-x-y struct specifying device and core * \param addr Address to write to * \param tlb_to_use Specifies fallback/dynamic TLB to use for transaction, if this core does not have static TLBs mapped to this address (dynamic TLBs were initialized in driver constructor) - * \param send_epoch_cmd Specifies that this is an epoch_cmd write, forcing runtime to take a faster write path (Buda only) - * \param last_send_epoch_cmd Specifies that this is the last epoch command being written, which requires metadata to be updated (Buda only) */ - virtual void write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false) { + virtual void write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { throw std::runtime_error("---- tt_device::write_to_device is not implemented\n"); } - /** - * @brief Unroll/replicate uint32_t data (as specified by ptr + len pair) and write it to specified device, core and address (defined for Silicon). - * \param mem_ptr src data address - * \param len src data size (specified for uint32_t) - * \param unroll_count Number of times vector should be unrolled - * \param core chip-x-y struct specifying device and core - * \param addr Address to write to - * \param fallback_tlb Specifies fallback/dynamic TLB to use for transaction, if this core does not have static TLBs mapped to this address (dynamic TLBs were initialized in driver constructor) - */ - virtual void rolled_write_to_device(uint32_t* mem_ptr, uint32_t size_in_bytes, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) { - // Only implement this for Silicon Backend - throw std::runtime_error("---- tt_device::rolled_write_to_device is not implemented\n"); - } - /** - * @brief Unroll/replicate a uint32_t vector and write it to specified device, core and address (defined for Silicon and Versim). - * \param vec Vector to write - * \param unroll_count Number of times vector should be unrolled - * \param core chip-x-y struct specifying device and core - * \param addr Address to write to - * \param tlb_to_use Specifies fallback/dynamic TLB to use for transaction, if this core does not have static TLBs mapped to this address (dynamic TLBs were initialized in driver constructor) - */ - virtual void rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { - throw std::runtime_error("---- tt_device::rolled_write_to_device is not implemented\n"); - } - /** * @brief Read uint32_t data from a specified device, core and address to host memory (defined for Silicon). 
* \param mem_ptr dest data address on host (expected to be preallocated, depending on transfer size) @@ -491,13 +454,7 @@ class tt_device virtual std::unordered_map get_harvesting_masks_for_soc_descriptors() { throw std::runtime_error("---- tt_device:get_harvesting_masks_for_soc_descriptors is not implemented\n"); } - /** - * @brief Get Hardware Translation Table state - * \returns true if translation tables are enabled (WH only) - */ - virtual bool noc_translation_en() { - throw std::runtime_error("---- tt_device:noc_translation_en is not implemented\n"); - } + /** * @brief Issue message to device, meant to be picked up by ARC Firmare * \param logical_device_id Chip to target @@ -566,14 +523,6 @@ class tt_device return std::map(); } - /** - * @brief Get the PCIe speed for a specific device based on link width and link speed - * \returns Bandwidth in Gbps - */ - virtual std::uint32_t get_pcie_speed(std::uint32_t device_id) { - return 8 * 16; // default to x8 at 16 GT/s - } - virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id) { throw std::runtime_error("---- tt_device::get_numa_node_for_pcie_device is not implemented\n"); } @@ -585,30 +534,6 @@ class tt_device virtual tt_version get_ethernet_fw_version() const { throw std::runtime_error("---- tt_device::get_ethernet_fw_version is not implemented \n"); } - - /** - * @brief Get the total hugepage (host memory) size allocated for a device. - * This memory is not entirely accessible by device. To query the number of channels - * or memory per channel that is accessbile, see get_host_channel_size or get_num_host_channels - * \param src_device_id Device for which allocated host memory is being queried - * \returns Total memory allocated on host for a specific device - * - */ - virtual uint32_t dma_allocation_size(chip_id_t src_device_id = -1) { - throw std::runtime_error("---- tt_device::dma_allocation_size is not implemented\n"); - return 0; - } - - /** - * Get the address for the MMIO mapped region on Channel (as seen from host memory) - * \param offset Address in DRAM - * \param target chip-x-y struct specifying device and core of target DRAM - * \returns Host interpretation of MMIO mapped channel 0 address - */ - virtual void *channel_address(std::uint32_t offset, const tt_cxy_pair& target) { - throw std::runtime_error("---- tt_device::channel_address is not implemented\n"); - return nullptr; - } /** * @brief Query number of DRAM channels on a specific device * \param device_id Logical device id to query @@ -676,67 +601,6 @@ class tt_device std::unordered_map soc_descriptor_per_chip = {}; }; -class c_versim_core; -namespace nuapi {namespace device {template class Simulator;}} -namespace versim { - struct VersimSimulatorState; - using VersimSimulator = nuapi::device::Simulator; -} - -/** - * @brief Versim Backend Class, derived from the tt_device class - * Implements APIs to communicate with a simulated (using Verilator) Tenstorrent Device. 
-*/ -class tt_VersimDevice: public tt_device -{ - public: - virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); - virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_); - tt_VersimDevice(const std::string &sdesc_path, const std::string &ndesc_path); - virtual std::unordered_map& get_virtual_soc_descriptors(); - virtual void start(std::vector plusargs, std::vector dump_cores, bool no_checkers, bool init_device, bool skip_driver_allocs); - virtual void start_device(const tt_device_params &device_params); - virtual void close_device(); - virtual void deassert_risc_reset(); - virtual void deassert_risc_reset_at_core(tt_cxy_pair core); - virtual void assert_risc_reset(); - virtual void assert_risc_reset_at_core(tt_cxy_pair core); - virtual void write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& columns_to_exclude, const std::string& fallback_tlb); - virtual void rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); - virtual void read_from_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); - virtual void rolled_write_to_device(uint32_t* mem_ptr, uint32_t size_in_bytes, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb); - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void read_from_device(void *mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); - virtual void wait_for_non_mmio_flush(); - void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c); - virtual bool using_harvested_soc_descriptors(); - virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); - virtual bool noc_translation_en(); - virtual std::set get_target_mmio_device_ids(); - virtual std::set get_target_remote_device_ids(); - virtual ~tt_VersimDevice(); - virtual tt_ClusterDescriptor* get_cluster_description(); - virtual int get_number_of_chips_in_cluster(); - virtual std::unordered_set get_all_chips_in_cluster(); - static int detect_number_of_chips(); - virtual std::map get_clocks(); - virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id); - virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel); - virtual std::uint32_t get_num_host_channels(std::uint32_t device_id); - virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel); - private: - bool stop(); - tt_device_l1_address_params l1_address_params; - tt_device_dram_address_params 
dram_address_params; - versim::VersimSimulator* versim; - std::shared_ptr ndesc; - void* p_ca_soc_manager; -}; - #include "device/architecture_implementation.h" /** @@ -781,14 +645,10 @@ class tt_SiliconDevice: public tt_device virtual void close_device(); // Runtime Functions - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); + virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); + virtual void write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& columns_to_exclude, const std::string& fallback_tlb); - virtual void write_epoch_cmd_to_device(const uint32_t *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write); - virtual void write_epoch_cmd_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write); - virtual void rolled_write_to_device(uint32_t* mem_ptr, uint32_t size_in_bytes, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb); - virtual void rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); virtual void read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); virtual void read_from_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); virtual void write_to_sysmem(std::vector& vec, uint64_t addr, uint16_t channel, chip_id_t src_device_id); @@ -809,7 +669,7 @@ class tt_SiliconDevice: public tt_device /** * @brief This API allows you to write directly to device memory that is addressable by a static TLB */ - std::function get_fast_pcie_static_tlb_write_callable(int device_id); + std::function get_fast_pcie_static_tlb_write_callable(int device_id); /** * @brief Provide fast write access to a statically-mapped TLB. @@ -824,40 +684,30 @@ class tt_SiliconDevice: public tt_device */ tt::Writer get_static_tlb_writer(tt_cxy_pair target); - /** - * @brief Returns the DMA buf size - */ - uint32_t get_m_dma_buf_size() const; // Misc. 
Functions to Query/Set Device State virtual int arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr); virtual bool using_harvested_soc_descriptors(); virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); - virtual bool noc_translation_en(); virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c); virtual int get_number_of_chips_in_cluster(); virtual std::unordered_set get_all_chips_in_cluster(); virtual tt_ClusterDescriptor* get_cluster_description(); static int detect_number_of_chips(); static std::vector detect_available_device_ids(); - static std::unordered_map get_logical_to_physical_mmio_device_id_map(std::vector physical_device_ids); virtual std::set get_target_mmio_device_ids(); virtual std::set get_target_remote_device_ids(); virtual std::map get_clocks(); - virtual uint32_t dma_allocation_size(chip_id_t src_device_id = -1); - virtual void *channel_address(std::uint32_t offset, const tt_cxy_pair& target); virtual void *host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; virtual std::uint64_t get_pcie_base_addr_from_device() const; static std::vector extract_rows_to_remove(const tt::ARCH &arch, const int worker_grid_rows, const int harvested_rows); static void remove_worker_row_from_descriptor(tt_SocDescriptor& full_soc_descriptor, const std::vector& row_coordinates_to_remove); static void harvest_rows_in_soc_descriptor(tt::ARCH arch, tt_SocDescriptor& sdesc, uint32_t harvested_rows); static std::unordered_map create_harvested_coord_translation(const tt::ARCH arch, bool identity_map); - static std::unordered_map get_harvesting_masks_from_harvested_rows(std::unordered_map> harvested_rows); std::unordered_map get_harvested_coord_translation_map(chip_id_t logical_device_id); virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id); virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel); virtual std::uint32_t get_num_host_channels(std::uint32_t device_id); virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel); - virtual std::uint32_t get_pcie_speed(std::uint32_t device_id); virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id); virtual tt_version get_ethernet_fw_version() const; @@ -877,14 +727,9 @@ class tt_SiliconDevice: public tt_device void send_tensix_risc_reset_to_core(const tt_cxy_pair &core, const TensixSoftResetOptions &soft_resets); void perform_harvesting_and_populate_soc_descriptors(const std::string& sdesc_path, const bool perform_harvesting); void populate_cores(); - void init_pcie_iatus(); - void init_pcie_iatus_no_p2p(); + void init_pcie_iatus(); // No more p2p support. 
bool init_hugepage(chip_id_t device_id); - bool init_dmabuf(chip_id_t device_id); void check_pcie_device_initialized(int device_id); - bool init_dma_turbo_buf(struct PCIdevice* pci_device); - bool uninit_dma_turbo_buf(struct PCIdevice* pci_device); - static std::map get_physical_device_id_to_bus_id_map(std::vector physical_device_ids); void set_pcie_power_state(tt_DevicePowerState state); int set_remote_power_state(const chip_id_t &chip, tt_DevicePowerState device_state); void set_power_state(tt_DevicePowerState state); @@ -900,13 +745,11 @@ class tt_SiliconDevice: public tt_device int get_clock(int logical_device_id); // Communication Functions - void read_dma_buffer(void* mem_ptr, std::uint32_t address, std::uint16_t channel, std::uint32_t size_in_bytes, chip_id_t src_device_id); - void write_dma_buffer(const void *mem_ptr, std::uint32_t size, std::uint32_t address, std::uint16_t channel, chip_id_t src_device_id); + void read_buffer(void* mem_ptr, std::uint32_t address, std::uint16_t channel, std::uint32_t size_in_bytes, chip_id_t src_device_id); + void write_buffer(const void *mem_ptr, std::uint32_t size, std::uint32_t address, std::uint16_t channel, chip_id_t src_device_id); void write_device_memory(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair target, std::uint32_t address, const std::string& fallback_tlb); void write_to_non_mmio_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t address, bool broadcast = false, std::vector broadcast_header = {}); void read_device_memory(void *mem_ptr, tt_cxy_pair target, std::uint32_t address, std::uint32_t size_in_bytes, const std::string& fallback_tlb); - void write_to_non_mmio_device_send_epoch_cmd(const uint32_t *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t address, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write); - void rolled_write_to_non_mmio_device(const uint32_t *mem_ptr, uint32_t len, tt_cxy_pair core, uint64_t address, uint32_t unroll_count); void read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_t address, uint32_t size_in_bytes); void read_mmio_device_register(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); void write_mmio_device_register(const void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); @@ -930,9 +773,7 @@ class tt_SiliconDevice: public tt_device // Test functions void verify_eth_fw(); void verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector &fw_versions); - int test_pcie_tlb_setup (struct PCIdevice* pci_device); int test_setup_interface (); - int test_broadcast (int logical_device_id); // State variables tt_device_dram_address_params dram_address_params; @@ -962,17 +803,10 @@ class tt_SiliconDevice: public tt_device static constexpr std::uint32_t EPOCH_ETH_CORES_MASK = (EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS-1); int active_core = NON_EPOCH_ETH_CORES_START_ID; - int active_core_epoch = EPOCH_ETH_CORES_START_ID; - bool erisc_q_ptrs_initialized = false; - std::vector erisc_q_ptrs_epoch[NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS]; - bool erisc_q_wrptr_updated[NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS]; std::vector< std::vector > remote_transfer_ethernet_cores; bool flush_non_mmio = false; bool non_mmio_transfer_cores_customized = false; std::unordered_map active_eth_core_idx_per_chip = {}; - // Size of the PCIE DMA buffer - // The setting should not exceed MAX_DMA_BYTES - std::uint32_t m_dma_buf_size; std::unordered_map 
noc_translation_enabled_for_chip = {}; std::map> hardware_resource_mutex_map = {}; std::unordered_map> harvested_coord_translation = {}; @@ -991,9 +825,6 @@ class tt_SiliconDevice: public tt_device std::unordered_map dynamic_tlb_config = {}; std::unordered_map dynamic_tlb_ordering_modes = {}; std::map, std::unordered_map>>> bcast_header_cache = {}; - std::uint64_t buf_physical_addr = 0; - void * buf_mapping = nullptr; - int driver_id; bool perform_harvesting_on_sdesc = false; bool use_ethernet_ordered_writes = true; bool use_ethernet_broadcast = true; diff --git a/device/tt_silicon_driver.cpp b/device/tt_silicon_driver.cpp index a147f3e0..f406fc6f 100644 --- a/device/tt_silicon_driver.cpp +++ b/device/tt_silicon_driver.cpp @@ -84,17 +84,6 @@ void clr_printf(const char *clr, const char *fmt, ...) { int g_DEBUG_LEVEL; // /src/t6ifc/t6py/packages/tenstorrent/jlink/jtag_comm.cpp bool g_READ_CHECKING_ENABLED = true; -bool g_USE_MSI_FOR_DMA = false; // Whether to wait for MSI after DMA transfer, or poll a variable -uint32_t g_DMA_BLOCK_SIZE_READ_THRESHOLD_BYTES = 0; // 0 - never use DMA. Otherwise use DMA for all blocks larger than this size -uint32_t g_DMA_BLOCK_SIZE_WRITE_THRESHOLD_BYTES = 0; // 0 - never use DMA. Otherwise use DMA for all blocks larger than this size - -// Address in CSM where the DMA request structure resides -uint32_t c_CSM_PCIE_CTRL_DMA_REQUEST_OFFSET = 0; -// Address where the trigger for transfer resides -uint32_t c_DMA_TRIGGER_ADDRESS = 0; -// To trigger arc interrupt -uint32_t c_ARC_MISC_CNTL_ADDRESS = 0; - // Print all buffers smaller than this number of bytes uint32_t g_NUM_BYTES_TO_PRINT = 8; @@ -102,24 +91,15 @@ uint32_t g_NUM_BYTES_TO_PRINT = 8; const bool g_SINGLE_PIN_PAGE_PER_FD_WORKAROND = true; const uint32_t g_MAX_HOST_MEM_CHANNELS = 4; -volatile bool msi_interrupt_received = false; - const char device_name_pattern[] = "/dev/tenstorrent/%u"; -const std::string tlb_large_read_mutex_name_prefix = "mem_tlb_large_read_mutex_pci_interface_id_"; -const std::string tlb_large_write_mutex_name_prefix = "mem_tlb_large_write_mutex_pci_interface_id_"; -const std::string tlb_small_read_write_mutex_name_prefix = "mem_tlb_small_read_write_mutex_pci_interface_id_"; -const std::string arc_msg_mutex_name_prefix = "arc_msg_mutex_pci_interface_id_"; - static uint32_t GS_BAR0_WC_MAPPING_SIZE = (156<<20) + (10<<21) + (18<<24); static uint32_t BH_BAR0_WC_MAPPING_SIZE = 188<<21; // Defines the address for WC region. addresses 0 to BH_BAR0_WC_MAPPING_SIZE are in WC, above that are UC static const uint32_t GS_WH_ARC_SCRATCH_6_OFFSET = 0x1FF30078; static const uint32_t BH_NOC_NODE_ID_OFFSET = 0x1FD04044; -const uint32_t DMA_BUF_REGION_SIZE = 4 << 20; const uint32_t HUGEPAGE_REGION_SIZE = 1 << 30; // 1GB -const uint32_t DMA_MAP_MASK = DMA_BUF_REGION_SIZE - 1; const uint32_t HUGEPAGE_MAP_MASK = HUGEPAGE_REGION_SIZE - 1; static const uint32_t MSG_ERROR_REPLY = 0xFFFFFFFF; @@ -141,10 +121,6 @@ const uint64_t UNROLL_ATU_OFFSET_BAR = 0x1200; PCIdevice ttkmd_open(DWORD device_id, bool sharable /* = false */); int ttkmd_close(struct PCIdevice &device); -uint32_t pcie_dma_transfer_turbo (TTDevice *dev, uint32_t chip_addr, uint32_t host_phys_addr, uint32_t size_bytes, bool write); -DMAbuffer pci_allocate_dma_buffer(TTDevice *dev, uint32_t size); -void pcie_init_dma_transfer_turbo (PCIdevice* dev); - void write_regs(volatile uint32_t *dest, const uint32_t *src, uint32_t word_len); // Stash all the fields of TTDevice in TTDeviceBase to make moving simpler. 
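With the 4 MB DMA-buffer region gone, HUGEPAGE_MAP_MASK is the only mask left for reducing a host-memory address to an offset inside a mapped region. A minimal sketch of the masking arithmetic, where channel_base is a hypothetical stand-in for one entry of hugepage_mapping:

#include <cassert>
#include <cstdint>

constexpr uint32_t kHugepageRegionSize = 1u << 30;       // 1 GB, as above
constexpr uint32_t kHugepageMapMask = kHugepageRegionSize - 1;

// Reduce a sysmem address to a pointer inside one 1 GB hugepage mapping.
inline uint8_t* hugepage_scratchspace(uint8_t* channel_base, uint32_t address) {
    return channel_base + (address & kHugepageMapMask);  // keep only the low 30 bits
}

int main() {
    static uint8_t fake_channel[64];                     // tiny stand-in for the 1 GB map
    // 0x40000010 and 0x10 resolve to the same offset: bit 30 is masked away.
    assert(hugepage_scratchspace(fake_channel, 0x40000010u) ==
           hugepage_scratchspace(fake_channel, 0x10u));
    return 0;
}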
@@ -182,17 +158,8 @@ struct TTDeviceBase std::uint8_t pci_device; std::uint8_t pci_function; - unsigned int next_dma_buf = 0; - - DMAbuffer dma_completion_flag_buffer; // When DMA completes, it writes to this buffer - DMAbuffer dma_transfer_buffer; // Buffer for large DMA transfers - - std::uint32_t max_dma_buf_size_log2; - tenstorrent_get_device_info_out device_info; - std::vector dma_buffer_mappings; - std::uint32_t read_checking_offset; }; @@ -256,10 +223,6 @@ struct TTDevice : TTDeviceBase munmap(system_reg_mapping, system_reg_mapping_size); } - for (auto &&buf : dma_buffer_mappings) { - munmap(buf.pBuf, buf.size); - } - if (sysfs_config_fd != -1) { close(sysfs_config_fd); } @@ -274,7 +237,6 @@ struct TTDevice : TTDeviceBase bar2_uc = nullptr; bar4_wc = nullptr; system_reg_mapping = nullptr; - dma_buffer_mappings.clear(); sysfs_config_fd = -1; } @@ -469,8 +431,6 @@ void TTDevice::do_open() { this->device_info = device_info.out; - max_dma_buf_size_log2 = device_info.out.max_dma_buf_size_log2; - struct { tenstorrent_query_mappings query_mappings; tenstorrent_mapping mapping_array[8]; @@ -621,23 +581,6 @@ void TTDevice::do_open() { this->read_checking_offset = is_blackhole(device_info.out) ? BH_NOC_NODE_ID_OFFSET : GS_WH_ARC_SCRATCH_6_OFFSET; } -void set_debug_level(int dl) { - g_DEBUG_LEVEL = dl; -} - -std::uint64_t pci_dma_buffer_get_physical_addr(DMAbuffer &dma_buffer) { - log_assert (dma_buffer.pDma, "DMA Buffer not initialized"); - return reinterpret_cast(dma_buffer.pDma); -} - -std::uint64_t pci_dma_buffer_get_user_addr(DMAbuffer &dma_buffer) { - log_assert (dma_buffer.pBuf, "DMA Buffer not initialized"); - return reinterpret_cast(dma_buffer.pBuf); -} - -DWORD ttkmd_init() { return 0; } // 0 on success -DWORD ttkmd_uninit() { return 0; } // 0 on success - bool is_char_dev(const dirent *ent, const char *parent_dir) { if (ent->d_type == DT_UNKNOWN || ent->d_type == DT_LNK) { char name[2 * NAME_MAX + 2]; @@ -731,39 +674,6 @@ int get_revision_id(TTDevice *dev) { } } -int get_link_width(TTDevice *dev) { - - static const char pattern[] = "/sys/bus/pci/devices/%04x:%02x:%02x.%u/current_link_width"; - char buf[sizeof(pattern)]; - std::snprintf(buf, sizeof(buf), pattern, - (unsigned int)dev->pci_domain, (unsigned int)dev->pci_bus, (unsigned int)dev->pci_device, (unsigned int)dev->pci_function); - - std::ifstream linkwidth_file(buf); - std::string linkwidth_string; - if (std::getline(linkwidth_file, linkwidth_string)) { - return std::stoi(linkwidth_string, nullptr, 0); - } else { - throw std::runtime_error("Link width read failed for device"); - } -} - -int get_link_speed(TTDevice *dev) { - - static const char pattern[] = "/sys/bus/pci/devices/%04x:%02x:%02x.%u/current_link_speed"; - char buf[sizeof(pattern)]; - std::snprintf(buf, sizeof(buf), pattern, - (unsigned int)dev->pci_domain, (unsigned int)dev->pci_bus, (unsigned int)dev->pci_device, (unsigned int)dev->pci_function); - - std::ifstream linkspeed_file(buf); - std::string linkspeed_string; - int linkspeed; - if (std::getline(linkspeed_file, linkspeed_string) && sscanf(linkspeed_string.c_str(), "%d", &linkspeed) == 1) { - return linkspeed; - } else { - throw std::runtime_error("Link speed read failed for device"); - } -} - int get_numa_node(TTDevice *dev) { static const char pattern[] = "/sys/bus/pci/devices/%04x:%02x:%02x.%u/numa_node"; @@ -792,41 +702,6 @@ std::uint64_t read_bar0_base(TTDevice *dev) { return bar01 & bar_address_mask; } -DMAbuffer allocate_dma_buffer(TTDevice *ttdev, unsigned int buffer_index, std::size_t size) { - 
tenstorrent_allocate_dma_buf allocate_dma_buf; - - if (size > std::numeric_limits::max()) { - throw std::runtime_error(std::string("Requested DMA buffer size (" + std::to_string(allocate_dma_buf.in.requested_size) - + ") bytes exceeds interface size limit for device " + std::to_string(ttdev->index) + ", with error: " + std::strerror(errno))); - } - - memset(&allocate_dma_buf, 0, sizeof(allocate_dma_buf)); - allocate_dma_buf.in.requested_size = std::max(size, getpagesize()); - allocate_dma_buf.in.buf_index = buffer_index; - - if (ioctl(ttdev->device_fd, TENSTORRENT_IOCTL_ALLOCATE_DMA_BUF, &allocate_dma_buf) == -1) { - throw std::runtime_error(std::string("DMA buffer allocation failed (") + std::to_string(allocate_dma_buf.in.requested_size) - + " bytes) for device " + std::to_string(ttdev->index) + "."); - } - - void *mapping = mmap(NULL, allocate_dma_buf.out.size, PROT_READ | PROT_WRITE, MAP_SHARED, ttdev->device_fd, allocate_dma_buf.out.mapping_offset); - - log_trace(tt::LogSiliconDriver, "DMA buffer succeeded with size {} offset {} phy_addr {}", allocate_dma_buf.out.size, allocate_dma_buf.out.mapping_offset, allocate_dma_buf.out.physical_address); - - if (mapping == MAP_FAILED) { - throw std::runtime_error(std::string("DMA buffer memory mapping failed for device ") + std::to_string(ttdev->index) + "."); - } - - DMAbuffer dmabuf; - dmabuf.pBuf = mapping; - dmabuf.pDma = allocate_dma_buf.out.physical_address; - dmabuf.size = allocate_dma_buf.out.size; - - ttdev->dma_buffer_mappings.push_back(dmabuf); - - return dmabuf; -} - PCIdevice ttkmd_open(DWORD device_id, bool sharable /* = false */) { (void)sharable; // presently ignored @@ -1053,24 +928,7 @@ void memcpy_from_device(void *dest, const void *src, std::size_t num_bytes) { } } -void read_block(TTDevice *dev, uint64_t byte_addr, uint64_t num_bytes, uint8_t* buffer_addr, uint32_t dma_buf_size) { - if (num_bytes >= g_DMA_BLOCK_SIZE_READ_THRESHOLD_BYTES && g_DMA_BLOCK_SIZE_READ_THRESHOLD_BYTES > 0) { - record_access ("read_block_a", byte_addr, num_bytes, true, false, true, true); // addr, size, turbo, write, block, endline - - DMAbuffer &transfer_buffer = dev->dma_transfer_buffer; - - uint64_t host_phys_addr = pci_dma_buffer_get_physical_addr (transfer_buffer); - uint64_t host_user_addr = pci_dma_buffer_get_user_addr (transfer_buffer); - while (num_bytes > 0) { - uint32_t transfered_bytes = std::min(num_bytes, dma_buf_size); - pcie_dma_transfer_turbo (dev, byte_addr, host_phys_addr, transfered_bytes, false); - memcpy (buffer_addr, (void*)host_user_addr, transfered_bytes); - num_bytes -= transfered_bytes; - byte_addr += transfered_bytes; - buffer_addr += transfered_bytes; - } - return; - } +void read_block(TTDevice *dev, uint64_t byte_addr, uint64_t num_bytes, uint8_t* buffer_addr) { record_access("read_block_b", byte_addr, num_bytes, false, false, true, false); // addr, size, turbo, write, block, endline @@ -1116,24 +974,7 @@ void read_block(TTDevice *dev, uint64_t byte_addr, uint64_t num_bytes, uint8_t* print_buffer (buffer_addr, std::min((uint64_t)g_NUM_BYTES_TO_PRINT, num_bytes), true); } -void write_block(TTDevice *dev, uint64_t byte_addr, uint64_t num_bytes, const uint8_t* buffer_addr, uint32_t dma_buf_size) { - if (num_bytes >= g_DMA_BLOCK_SIZE_WRITE_THRESHOLD_BYTES && g_DMA_BLOCK_SIZE_WRITE_THRESHOLD_BYTES > 0) { - record_access ("write_block_a", byte_addr, num_bytes, true, true, true, true); // addr, size, turbo, write, block, endline - - DMAbuffer &transfer_buffer = dev->dma_transfer_buffer; - - uint64_t host_phys_addr = 
pci_dma_buffer_get_physical_addr (transfer_buffer); - uint64_t host_user_addr = pci_dma_buffer_get_user_addr (transfer_buffer); - while (num_bytes > 0) { - uint32_t transfered_bytes = std::min(num_bytes, dma_buf_size); - memcpy ( (void*)host_user_addr, buffer_addr, transfered_bytes); - pcie_dma_transfer_turbo (dev, byte_addr, host_phys_addr, transfered_bytes, true); - num_bytes -= transfered_bytes; - byte_addr += transfered_bytes; - buffer_addr += transfered_bytes; - } - return; - } +void write_block(TTDevice *dev, uint64_t byte_addr, uint64_t num_bytes, const uint8_t* buffer_addr) { record_access("write_block_b", byte_addr, num_bytes, false, true, true, false); // addr, size, turbo, write, block, endline @@ -1174,57 +1015,6 @@ void write_block(TTDevice *dev, uint64_t byte_addr, uint64_t num_bytes, const ui print_buffer (buffer_addr, std::min((uint64_t)g_NUM_BYTES_TO_PRINT, num_bytes), true); } -void read_checking_enable(bool enable = true) { - g_READ_CHECKING_ENABLED = enable; -} - -// Read/write to the configuration space of the device -// pData is a pointer to a buffer (see memory module) -DWORD read_cfg(TTDevice *dev, DWORD byte_offset, uint64_t pData, DWORD num_bytes) { - - if (pread(get_config_space_fd(dev), reinterpret_cast(pData), num_bytes, byte_offset) != num_bytes) { - throw std::runtime_error("Config space read failed for device "); - } - - return 0; -} - -DWORD write_cfg(TTDevice *dev, DWORD byte_offset, uint64_t pData, DWORD num_bytes) { - - if (pwrite(get_config_space_fd(dev), reinterpret_cast(pData), num_bytes, byte_offset) != num_bytes) { - throw std::runtime_error("Config space read failed for device "); - } - - return 0; -} - -DMAbuffer pci_allocate_dma_buffer(TTDevice *dev, uint32_t size) { - - uint32_t page_size = getpagesize(); - uint32_t page_aligned_size = (size + page_size - 1) & ~(page_size - 1); - - DMAbuffer ret_val = allocate_dma_buffer(dev, dev->next_dma_buf++, page_aligned_size); - LOG1 ("Allocated DMA buffer at 0x%lx 0x%lx size: %u\n", ret_val.pBuf, ret_val.pDma, size); - return ret_val; -} - -void pcie_init_dma_transfer_turbo (PCIdevice* dev) { - // From SHA 8cf7ff1bc7b3886a: - if (detect_arch(dev) == tt::ARCH::WORMHOLE_B0) { - c_CSM_PCIE_CTRL_DMA_REQUEST_OFFSET = 0x1fef84c8; // chip.AXI.get_path_info("ARC_CSM.ARC_PCIE_DMA_REQUEST") - } else { - c_CSM_PCIE_CTRL_DMA_REQUEST_OFFSET = 0x1fef84c0; // chip.AXI.get_path_info("ARC_CSM.ARC_PCIE_DMA_REQUEST") - } - c_DMA_TRIGGER_ADDRESS = 0x1ff30074; // chip.AXI.get_path_info("ARC_RESET.SCRATCH[5]") - c_ARC_MISC_CNTL_ADDRESS = 0x1ff30100; // chip.AXI.get_path_info("ARC_RESET.ARC_MISC_CNTL") -} - -void set_use_dma(bool msi, uint32_t dma_block_size_read_threshold_bytes, uint32_t dma_block_size_write_threshold_bytes) { - g_USE_MSI_FOR_DMA = msi; - g_DMA_BLOCK_SIZE_READ_THRESHOLD_BYTES = dma_block_size_read_threshold_bytes; - g_DMA_BLOCK_SIZE_WRITE_THRESHOLD_BYTES = dma_block_size_write_threshold_bytes; -} - void write_regs(volatile uint32_t *dest, const uint32_t *src, uint32_t word_len) { while (word_len-- != 0) { *dest++ = *src++; @@ -1286,66 +1076,6 @@ void read_regs(TTDevice *dev, uint32_t byte_addr, uint32_t word_len, void *data) print_buffer (data, std::min(g_NUM_BYTES_TO_PRINT, word_len * 4), true); } -void handle_dma_timeout(TTDevice *dev, uint32_t size_bytes, bool write) { - detect_ffffffff_read(dev); - throw std::runtime_error(std::string("DMA transfer timeout: ") - + std::to_string(size_bytes) - + (write ? " byte write." 
: " byte read.")); -} -uint32_t pcie_dma_transfer_turbo (TTDevice *dev, uint32_t chip_addr, uint32_t host_phys_addr, uint32_t size_bytes, bool write) { - // c_timer t (""); - - // t.now_in ("1. DMA setup"); - - if (c_CSM_PCIE_CTRL_DMA_REQUEST_OFFSET == 0) { - throw std::runtime_error ("pcie_init_dma_transfer_turbo must be called before pcie_dma_transfer_turbo"); - } - - arc_pcie_ctrl_dma_request_t req = { - .chip_addr = chip_addr, - .host_phys_addr = host_phys_addr, - .completion_flag_phys_addr = static_cast(pci_dma_buffer_get_physical_addr(dev->dma_completion_flag_buffer)), - .size_bytes = size_bytes, - .write = (write ? 1U : 0U), - .pcie_msi_on_done = g_USE_MSI_FOR_DMA ? 1U : 0U, - .pcie_write_on_done = g_USE_MSI_FOR_DMA ? 0U : 1U, - .trigger = 1U, - .repeat = 1 - }; - - volatile uint32_t *complete_flag = (uint32_t *)pci_dma_buffer_get_user_addr(dev->dma_completion_flag_buffer); - *complete_flag = 0; - - // Configure the DMA engine - msi_interrupt_received = false; - write_regs (dev, c_CSM_PCIE_CTRL_DMA_REQUEST_OFFSET, sizeof(req) / sizeof(uint32_t), &req); - - // Trigger ARC interrupt 0 on core 0 - int arc_misc_cntl_value = 0; - - // NOTE: Ideally, we should read the state of this register before writing to it, but that - // casues a lot of delay (reads have huge latencies) - arc_misc_cntl_value |= (1 << 16); // Cause IRQ0 on core 0 - write_regs (dev, c_ARC_MISC_CNTL_ADDRESS, 1, &arc_misc_cntl_value); - - if (!g_USE_MSI_FOR_DMA) { - // t.now_in ("2. DMA poll"); - int wait_loops = 0; - while (true) { - // The complete flag is set ty by ARC (see src/hardware/soc/tb/arc_fw/lib/pcie_dma.c) - if (*complete_flag == 0xfaca) break; - wait_loops++; - } - // LOG2 ("Waited %d iterations\n", wait_loops); - } else { - // t.now_in ("2. DMA wait for MSI"); - while (msi_interrupt_received == false) - ; - } - - return 0; // TODO: status -} - void print_device_info (struct PCIdevice &d) { LOG1("PCIEIntfId 0x%x\n", d.id); LOG1("VID:DID 0x%x:0x%x\n", d.vendor_id, d.device_id); @@ -1519,25 +1249,16 @@ void tt_SiliconDevice::initialize_interprocess_mutexes(int pci_interface_id, boo void tt_SiliconDevice::create_device(const std::unordered_set &target_mmio_device_ids, const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs, const bool clean_system_resources) { m_pci_log_level = 0; - m_dma_buf_size = 0; LOG1("---- tt_SiliconDevice::tt_SiliconDevice\n"); - static int unique_driver_id = 0; - driver_id = unique_driver_id++; // Set the log level for debugging const char* pci_log_level = std::getenv("TT_PCI_LOG_LEVEL"); if (pci_log_level) { m_pci_log_level = atoi (pci_log_level); } - set_debug_level(m_pci_log_level); + g_DEBUG_LEVEL = m_pci_log_level; LOG1 ("TT_PCI_LOG_LEVEL=%d\n", m_pci_log_level); - const char* dma_buf_size = std::getenv("TT_PCI_DMA_BUF_SIZE"); - if (dma_buf_size) { - m_dma_buf_size = atoi (dma_buf_size); - } - LOG1 ("TT_PCI_DMA_BUF_SIZE=%d\n", m_dma_buf_size); - // Don't buffer stdout. setbuf(stdout, NULL); @@ -1584,7 +1305,7 @@ void tt_SiliconDevice::create_device(const std::unordered_set &target print_device_info (*pci_device); // MT: Initial BH - hugepages will fail init - // For using silicon driver without workload to query mission mode params, no need for hugepage/dmabuf. + // For using silicon driver without workload to query mission mode params, no need for hugepage. if (!skip_driver_allocs){ bool hugepages_initialized = init_hugepage(logical_device_id); // Large writes to remote chips require hugepages to be initialized. 
@@ -1592,9 +1313,8 @@ void tt_SiliconDevice::create_device(const std::unordered_set &target if(target_remote_chips.size()) { log_assert(hugepages_initialized, "Hugepages must be successfully initialized if workload contains remote chips!"); } - uint16_t channel = 0; // Single channel sufficient for this? - if (not hugepage_mapping.at(logical_device_id).at(channel)) { - init_dmabuf(logical_device_id); + if (not hugepage_mapping.at(logical_device_id).at(0)) { + log_warning(LogSiliconDriver, "No hugepage mapping at device {}", logical_device_id); } } harvested_coord_translation.insert({logical_device_id, create_harvested_coord_translation(arch_name, true)}); //translation layer for harvested coords. Default is identity map @@ -1609,9 +1329,6 @@ void tt_SiliconDevice::create_device(const std::unordered_set &target } } -bool tt_SiliconDevice::noc_translation_en() { - return translation_tables_en; -} bool tt_SiliconDevice::using_harvested_soc_descriptors() { return perform_harvesting_on_sdesc && performed_harvesting; } @@ -1811,17 +1528,6 @@ void tt_SiliconDevice::populate_cores() { } } -std::unordered_map tt_SiliconDevice::get_harvesting_masks_from_harvested_rows(std::unordered_map> harvested_rows) { - std::unordered_map harvesting_masks = {}; - for(const auto& chip : harvested_rows) { - uint32_t harvesting_mask_per_chip = 0; - harvesting_masks.insert({chip.first, 0}); - for(const auto& row : chip.second) { - harvesting_masks.at(chip.first) |= (1 << row); - } - } - return harvesting_masks; -} std::vector tt_SiliconDevice::extract_rows_to_remove(const tt::ARCH &arch, const int worker_grid_rows, const int harvested_rows) { // Check if harvesting config is legal for GS and WH log_assert(!((harvested_rows & 1) || (harvested_rows & 64) || (harvested_rows & 0xFFFFF000)), "For grayskull and wormhole, only rows 1-5 and 7-11 can be harvested"); @@ -2031,47 +1737,11 @@ void tt_SiliconDevice::initialize_pcie_devices() { check_pcie_device_initialized(device_it.first); } - // If requires multi-channel or doesn't support mmio-p2p, init iatus without p2p. - if (m_num_host_mem_channels <= 1 && arch_name == tt::ARCH::GRAYSKULL) { - init_pcie_iatus(); - } else { - // TODO: Implement support for multiple host channels on BLACKHOLE. - log_assert(!(arch_name == tt::ARCH::BLACKHOLE && m_num_host_mem_channels > 1), - "More channels are not yet supported for Blackhole"); - init_pcie_iatus_no_p2p(); - } + // TODO: Implement support for multiple host channels on BLACKHOLE. + log_assert(!(arch_name == tt::ARCH::BLACKHOLE && m_num_host_mem_channels > 1), "More channels are not yet supported for Blackhole"); + init_pcie_iatus(); init_membars(); - - // https://yyz-gitlab.local.tenstorrent.com/ihamer/ll-sw/issues/25 - // Note: using pcie dma while device is idle is safe, mixing p2p is unsafe, see issue above - // TODO: disable pcie dma if p2p traffic is present, ie. 
chip-to-chip or chip-to-host - - for (auto &device_it : m_pci_device_map){ - struct PCIdevice* pci_device = device_it.second; - auto device_id = pci_device->device_id; - // MT Initial BH - Don't use PCIe DMA - bool enable_pcie_dma; - if (arch_name == tt::ARCH::BLACKHOLE) { - enable_pcie_dma = false; - } else { - enable_pcie_dma = m_dma_buf_size>0; - } - // Use DMA only for transfers that cross the size thresholds (empirically determined) - if (enable_pcie_dma) { - try { - log_trace(LogSiliconDriver, "Enable PCIE DMA with bufsize {}", m_dma_buf_size); - set_use_dma (false, 128, 0); // use dma for reads only - init_dma_turbo_buf(pci_device); - } catch (const std::exception &e) { - log_trace(LogSiliconDriver, "Disable PCIE DMA, fallback to MMIO transfers due to exepction {}", e.what()); - set_use_dma (false, 0, 0); - uninit_dma_turbo_buf(pci_device); - } - } else { - log_trace(LogSiliconDriver, "Disable PCIE DMA"); - } - } } void tt_SiliconDevice::broadcast_pcie_tensix_risc_reset(struct PCIdevice *device, const TensixSoftResetOptions &soft_resets) { @@ -2169,24 +1839,12 @@ std::vector tt_SiliconDevice::detect_available_device_ids() { return detected_device_ids; } -static bool check_dram_core_exists(const std::vector<std::vector<tt_xy_pair>> &all_dram_cores, tt_xy_pair target_core) { - bool dram_core_exists = false; - for (const auto &dram_cores_in_channel : all_dram_cores) { - for (auto dram_core : dram_cores_in_channel) { - if (dram_core.x == target_core.x && dram_core.y == target_core.y) { - return true; - } - } - } - return false; -} - -std::function<void(uint32_t, uint32_t, const uint8_t*, uint32_t)> tt_SiliconDevice::get_fast_pcie_static_tlb_write_callable(int device_id) { +std::function<void(uint32_t, uint32_t, const uint8_t*)> tt_SiliconDevice::get_fast_pcie_static_tlb_write_callable(int device_id) { struct PCIdevice* pci_device = get_pci_device(device_id); TTDevice* dev = pci_device->hdev; - const auto callable = [dev](uint32_t byte_addr, uint32_t num_bytes, const uint8_t* buffer_addr, uint32_t dma_buf_size) { - write_block(dev, byte_addr, num_bytes, buffer_addr, dma_buf_size); + const auto callable = [dev](uint32_t byte_addr, uint32_t num_bytes, const uint8_t* buffer_addr) { + write_block(dev, byte_addr, num_bytes, buffer_addr); }; return callable; @@ -2242,9 +1900,9 @@ void tt_SiliconDevice::write_device_memory(const void *mem_ptr, uint32_t size_in if (dev->bar4_wc != nullptr && tlb_size == BH_4GB_TLB_SIZE) { // This is only for Blackhole.
If we want to write to DRAM (BAR4 space), we add offset // to which we write so write_block knows it needs to target BAR4 - write_block(dev, (tlb_offset + address % tlb_size) + BAR0_BH_SIZE, size_in_bytes, buffer_addr, m_dma_buf_size); + write_block(dev, (tlb_offset + address % tlb_size) + BAR0_BH_SIZE, size_in_bytes, buffer_addr); } else { const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); @@ -2254,7 +1912,7 @@ void tt_SiliconDevice::write_device_memory(const void *mem_ptr, uint32_t size_in auto [mapped_address, tlb_size] = set_dynamic_tlb(pci_device, tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb)); uint32_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size); - write_block(dev, mapped_address, transfer_size, buffer_addr, m_dma_buf_size); + write_block(dev, mapped_address, transfer_size, buffer_addr); size_in_bytes -= transfer_size; address += transfer_size; @@ -2285,9 +1943,9 @@ void tt_SiliconDevice::read_device_memory(void *mem_ptr, tt_cxy_pair target, std if (dev->bar4_wc != nullptr && tlb_size == BH_4GB_TLB_SIZE) { // This is only for Blackhole. If we want to read from DRAM (BAR4 space), we add offset // from which we read so read_block knows it needs to target BAR4 - read_block(dev, (tlb_offset + address % tlb_size) + BAR0_BH_SIZE, size_in_bytes, buffer_addr, m_dma_buf_size); + read_block(dev, (tlb_offset + address % tlb_size) + BAR0_BH_SIZE, size_in_bytes, buffer_addr); } else { - read_block(dev, tlb_offset + address % tlb_size, size_in_bytes, buffer_addr, m_dma_buf_size); + read_block(dev, tlb_offset + address % tlb_size, size_in_bytes, buffer_addr); } LOG1 (" read_block called with tlb_offset: %d, tlb_size: %d\n", tlb_offset, tlb_size); } else { @@ -2298,7 +1956,7 @@ void tt_SiliconDevice::read_device_memory(void *mem_ptr, tt_cxy_pair target, std auto [mapped_address, tlb_size] = set_dynamic_tlb(pci_device, tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb)); uint32_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size); - read_block(dev, mapped_address, transfer_size, buffer_addr, m_dma_buf_size); + read_block(dev, mapped_address, transfer_size, buffer_addr); size_in_bytes -= transfer_size; address += transfer_size; @@ -2308,7 +1966,7 @@ void tt_SiliconDevice::read_device_memory(void *mem_ptr, tt_cxy_pair target, std } } -void tt_SiliconDevice::read_dma_buffer( +void tt_SiliconDevice::read_buffer( void* mem_ptr, std::uint32_t address, std::uint16_t channel, @@ -2321,20 +1979,18 @@ void tt_SiliconDevice::read_dma_buffer( if(hugepage_mapping.at(src_device_id).at(channel)) { user_scratchspace = static_cast(hugepage_mapping.at(src_device_id).at(channel)) + (address & HUGEPAGE_MAP_MASK); - } else if (buf_mapping) { - user_scratchspace = static_cast(buf_mapping) + (address & DMA_MAP_MASK); } else { - std::string err_msg = "write_dma_buffer: Hugepage or DMAbuffer are not allocated for src_device_id: " + std::to_string(src_device_id) + " ch: " + std::to_string(channel); + std::string err_msg = "read_buffer: Hugepages are not allocated for src_device_id: " + std::to_string(src_device_id) + " ch: " + std::to_string(channel); err_msg += " - Ensure sufficient number of Hugepages installed per device (1 per host mem ch, per device)"; throw std::runtime_error(err_msg); } -
LOG1("---- tt_SiliconDevice::read_dma_buffer (src_device_id: %d, ch: %d) from 0x%lx\n", src_device_id, channel, user_scratchspace); + LOG1("---- tt_SiliconDevice::read_buffer (src_device_id: %d, ch: %d) from 0x%lx\n", src_device_id, channel, user_scratchspace); memcpy(mem_ptr, user_scratchspace, size_in_bytes); } -void tt_SiliconDevice::write_dma_buffer( +void tt_SiliconDevice::write_buffer( const void *mem_ptr, std::uint32_t size, std::uint32_t address, @@ -2343,24 +1999,15 @@ void tt_SiliconDevice::write_dma_buffer( void * user_scratchspace = nullptr; if(hugepage_mapping.at(src_device_id).at(channel)) { - log_assert(size <= HUGEPAGE_REGION_SIZE, "write_dma_buffer data has larger size {} than destination buffer {}", size, HUGEPAGE_REGION_SIZE); + log_assert(size <= HUGEPAGE_REGION_SIZE, "write_buffer data has larger size {} than destination buffer {}", size, HUGEPAGE_REGION_SIZE); log_debug(LogSiliconDriver, "Using hugepage mapping at address {} offset {} chan {} size {}", hugepage_mapping.at(src_device_id).at(channel), (address & HUGEPAGE_MAP_MASK), channel, size); user_scratchspace = static_cast(hugepage_mapping.at(src_device_id).at(channel)) + (address & HUGEPAGE_MAP_MASK); - } - else if(buf_mapping) { - log_assert(size <= DMA_BUF_REGION_SIZE, "write_dma_buffer data has larger size {} than destination buffer {}", size, DMA_BUF_REGION_SIZE); - log_debug(LogSiliconDriver, "Using DMA Buffer at address {} offset {} size {}", - buf_mapping, - address, - size); - // we failed when initializing huge pages, we are using a 1MB DMA buffer as a stand-in - user_scratchspace = reinterpret_cast(buf_mapping); } else { - std::string err_msg = "write_dma_buffer: Hugepage or DMAbuffer are not allocated for src_device_id: " + std::to_string(src_device_id) + " ch: " + std::to_string(channel); + std::string err_msg = "write_buffer: Hugepage are not allocated for src_device_id: " + std::to_string(src_device_id) + " ch: " + std::to_string(channel); throw std::runtime_error(err_msg); } memcpy(user_scratchspace, mem_ptr, size); @@ -2435,46 +2082,6 @@ std::map tt_SiliconDevice::get_clocks() { return clock_freq_map; } -//! Simple test of communication to device/target. true if it passes. 
-// bool tt_SiliconDevice::test_write_read(tt_cxy_pair target) { -// WARN("---- tt_SiliconDevice::test_write_read not implemented\n"); -// return true; -// } - -// bool tt_SiliconDevice::test_write_speed (struct PCIdevice* pci_device) { -// TTDevice *dev = pci_device->hdev; - -// if (dev->bar0_uc == dev->bar0_wc) { -// WARN("---- tt_SiliconDevice::test_write_speed WC not configured\n"); -// } - -// std::byte fill_value{0x42}; -// std::vector write_buf(architecture_implementation->get_static_tlb_size(), fill_value); - -// auto before = std::chrono::high_resolution_clock::now(); -// for (std::uint32_t y = 1; y < architecture_implementation->get_grid_size_y(); y++) -// { -// for (std::uint32_t x = 1; x < architecture_implementation->get_grid_size_x(); x++) -// { -// auto tlb_index = map_core_to_tlb(tt_xy_pair(x, y)); -// if (tlb_index < 0) { continue; } - -// auto offset = tlb_index * architecture_implementation->get_static_tlb_size(); - -// memcpy(static_cast(dev->bar0_wc) + offset, write_buf.data(), write_buf.size()); -// } -// } -// auto after = std::chrono::high_resolution_clock::now(); - -// std::chrono::duration interval = after - before; - -// unsigned int write_bw = 120 * std::milli::den / interval.count(); - -// LOG1("---- tt_SiliconDevice::test_write_speed Wrote 120MB @ %u MB/s\n", write_bw); - -// return (write_bw >= 512); // L1 write BW scales with AICLK, for low AICLK it will be very slow. -// } - tt_SiliconDevice::~tt_SiliconDevice () { LOG1 ("---- tt_SiliconDevice::~tt_SiliconDevice\n"); @@ -2535,10 +2142,6 @@ std::optional> tt_SiliconDevice::get_tlb_data_fro return tlb_data; } -uint32_t tt_SiliconDevice::get_m_dma_buf_size() const { - return m_dma_buf_size; -} - void tt_SiliconDevice::configure_tlb(chip_id_t logical_device_id, tt_xy_pair core, std::int32_t tlb_index, std::int32_t address, uint64_t ordering) { log_assert(ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || ordering == TLB_DATA::Relaxed, "Invalid ordering specified in tt_SiliconDevice::configure_tlb"); struct PCIdevice* pci_device = get_pci_device(logical_device_id); @@ -2554,118 +2157,11 @@ void tt_SiliconDevice::set_fallback_tlb_ordering_mode(const std::string& fallbac log_assert(fallback_tlb != "LARGE_READ_TLB" && fallback_tlb != "LARGE_WRITE_TLB", "Ordering modes for LARGE_READ_TLB and LARGE_WRITE_TLB cannot be modified."); dynamic_tlb_ordering_modes.at(fallback_tlb) = ordering; } -// This function checks that all TLBs are properly setup. It should return 0 if all is good (i.e. if init_pcie_tlb is called prior) -// int tt_SiliconDevice::test_pcie_tlb_setup (struct PCIdevice* pci_device) { - // LOG1("---- tt_SiliconDevice::test_pcie_tlb_setup\n"); - // uint64_t tlb_data; - // int ret_val; - // // Check static TLBs (only active Tensix cores for GS ... 
Active tensix cores + ethernet cores for WH) - // for (uint32_t y = 0; y < architecture_implementation->get_grid_size_y() - num_rows_harvested; y++) { - // for (uint32_t x = 0; x < architecture_implementation->get_grid_size_x(); x++) { - // int tlb_index = get_static_tlb_index(tt_xy_pair(x, y)); - // auto translated_coords = harvested_coord_translation.at(pci_device -> id).at(tt_xy_pair(x, y)); - // if (tlb_index < 0) { continue; } - - // auto tlb_data_attempt = architecture_implementation->get_tlb_data(tlb_index, TLB_DATA { - // .x_end = translated_coords.x, - // .y_end = translated_coords.y, - // }); - // if (!tlb_data_attempt.has_value()) { - // throw std::runtime_error("Error setting up (" + std::to_string(x) + ", " + std::to_string(y) + ") in pcie_tlb_test."); - // } - // uint64_t expected_tlb_data = tlb_data_attempt.value(); - - // uint32_t tlb_setup_addr = architecture_implementation->get_static_tlb_cfg_addr() + 8 * tlb_index; // Each tlb setup takes 2 dwords, hence 8 bytes - // read_regs(pci_device->hdev, tlb_setup_addr, 2, &tlb_data); - - // } - // } - - // // Check 16MB TLBs 1-16 for peer-to-peer communication with DRAM channel 0 - // uint64_t peer_dram_offset = architecture_implementation->get_dram_channel_0_peer2peer_region_start(); - // for (uint32_t tlb_id = 1; tlb_id < 17; tlb_id++) { - // auto tlb_data_expected = architecture_implementation->get_tlb_data(architecture_implementation->get_tlb_base_index_16m() + tlb_id, TLB_DATA { - // .local_offset = peer_dram_offset / architecture_implementation->get_dynamic_tlb_16m_size(), - // .x_end = architecture_implementation->get_dram_channel_0_x(), - // .y_end = architecture_implementation->get_dram_channel_0_y(), - // .ordering = TLB_DATA::Posted, - // .static_vc = true, - // }); - // uint64_t tlb_data_observed; - // uint32_t tlb_setup_addr = architecture_implementation->get_dynamic_tlb_16m_cfg_addr() + 8 * tlb_id; // Each tlb setup takes 2 dwords, hence 8 bytes - // read_regs(pci_device->hdev, tlb_setup_addr, 2, &tlb_data_observed); - // ret_val = (tlb_data_expected == tlb_data_observed) ? 0 : 1; - // if (ret_val != 0) return ret_val; - // peer_dram_offset += architecture_implementation->get_dynamic_tlb_16m_size(); - // } - // return ret_val; -//} - -// Set up IATU for peer2peer -// Consider changing this function -void tt_SiliconDevice::init_pcie_iatus() { - - int starting_device_id = m_pci_device_map.begin()->first; - int ending_device_id = m_pci_device_map.rbegin()->first; - int num_enabled_devices = m_pci_device_map.size(); - - LOG1("---- tt_SiliconDevice::init_pcie_iatus() num_enabled_devices: %d starting_device_id: %d ending_device_id: %d\n", num_enabled_devices, starting_device_id, ending_device_id); - log_assert(m_num_host_mem_channels <= 1, "Maximum of 1x 1GB Host memory channels supported."); - - // Requirement for ring topology in GS, but since WH can share below code, check it again here for mmio mapped devices, - // otherwise us/ds device calculations will not be correct. Don't expect to see this for Wormhole today. 
- log_assert((starting_device_id + num_enabled_devices - 1) == ending_device_id, "The set of workload mmio-mapped target_device_id's must be sequential, without gaps."); - - for (auto &src_device_it : m_pci_device_map){ - int src_pci_id = src_device_it.first; - struct PCIdevice* src_pci_device = src_device_it.second; - - uint32_t current_peer_region = 0; - const int num_peer_ids = 3; // 0=HOST, 1=UPSTREAM Device, 2=DOWNSTREAM Device, 3=Unused - for (int peer_id = 0; peer_id < num_peer_ids; peer_id++) { - - //TODO: migrate this to huge pages when that support is in - if (peer_id == 0){ - LOG2 ("Setting up src_pci_id: %d peer_id: %d to Host. current_peer_region: %d\n", src_pci_id, peer_id, current_peer_region); - // Device to Host (peer_id==0) - const uint16_t host_memory_channel = 0; // Only single channel supported. - if (hugepage_mapping.at(src_pci_id).at(host_memory_channel)) { - iatu_configure_peer_region(src_pci_id, current_peer_region, hugepage_physical_address.at(src_pci_id).at(host_memory_channel), HUGEPAGE_REGION_SIZE); - host_channel_size.insert({(int)src_pci_device->logical_id, {HUGEPAGE_REGION_SIZE}}); - } else if(buf_mapping) { - // we failed when initializing huge pages, we are using a 1MB DMA buffer as a stand-in - iatu_configure_peer_region(src_pci_id, current_peer_region, buf_physical_addr, DMA_BUF_REGION_SIZE); - } - } else if (peer_id == 1 || peer_id == 2){ - // Device to Device (peer_id==1 : Upstream, peer_id==2 : Downstream) - // For determining upstream/downstream peers in ring topology - this matches is_target_device_downstream() in net2pipe - int upstream_peer_device_id = src_pci_id > starting_device_id ? src_pci_id - 1 : ending_device_id; - int downstream_peer_device_id = src_pci_id < (ending_device_id) ? src_pci_id + 1 : starting_device_id; - - int peer_device_id = peer_id == 1 ? upstream_peer_device_id : downstream_peer_device_id; - - struct PCIdevice* peer_pci_device = m_pci_device_map.at(peer_device_id); - uint64_t peer_BAR_addr = peer_pci_device->BAR_addr; - uint32_t peer_pci_interface_id = peer_pci_device->id; - uint32_t TLB1_16MB_OFFSET = 0; // Was 192MB offset to DRAM, now added by net2pipe since ATU maps to base of 512MB PCI Bar. - uint32_t PEER_REGION_SIZE = 1024 * 1024 * 1024; // Was 256MB. Want 512MB. Updated to 1024MB to match net2pipe more easily. - // FIXME - How to reduce PEER_REGION_SIZE=256 again, and make this still work? Need to make the ATU mappings non-contiguous 256MB chunks (every 1GB?) to match net2pipe? - - LOG2 ("Setting up src_pci_id: %d peer_id: %d to Device (upstream_peer_device_id: %d downstream_peer_device_id: %d) gives peer_device_id: %d (peer_pci_interface_id: %d) current_peer_region: %d\n", - src_pci_id, peer_id, upstream_peer_device_id, downstream_peer_device_id, peer_device_id, peer_pci_interface_id, current_peer_region ); - - iatu_configure_peer_region (src_pci_id, current_peer_region, peer_BAR_addr + TLB1_16MB_OFFSET, PEER_REGION_SIZE); - } - current_peer_region ++; - } - } -} // TT<->TT P2P support removed in favor of increased Host memory. 
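The replacement init_pcie_iatus below maps each host-memory channel straight to its hugepage region, and channel 3 is truncated to 768 MB (805306368 bytes) because of the iATU limitation noted in the hunk. A small sketch of just that sizing rule; the helper name here is invented, while iatu_configure_peer_region remains the driver's:

#include <cstdint>
#include <cstdio>

constexpr uint32_t kHugepageRegionSize = 1u << 30;    // 1 GB per host memory channel
constexpr uint32_t kChannel3RegionSize = 805306368u;  // 1 GB minus 256 MB for channel 3

// Hypothetical helper mirroring the per-channel sizing in init_pcie_iatus.
uint32_t iatu_region_size_for_channel(int channel_id) {
    return channel_id == 3 ? kChannel3RegionSize : kHugepageRegionSize;
}

int main() {
    for (int ch = 0; ch < 4; ch++) {  // up to g_MAX_HOST_MEM_CHANNELS = 4 channels
        std::printf("channel %d -> iATU region size 0x%08x\n", ch, iatu_region_size_for_channel(ch));
    }
    return 0;
}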
-void tt_SiliconDevice::init_pcie_iatus_no_p2p() { - +void tt_SiliconDevice::init_pcie_iatus() { int num_enabled_devices = m_pci_device_map.size(); - LOG1("---- tt_SiliconDevice::init_pcie_iatus_no_p2p() num_enabled_devices: %d\n", num_enabled_devices); + LOG1("---- tt_SiliconDevice::init_pcie_iatus() num_enabled_devices: %d\n", num_enabled_devices); log_assert(m_num_host_mem_channels <= g_MAX_HOST_MEM_CHANNELS, "Maximum of {} 1GB Host memory channels supported.", g_MAX_HOST_MEM_CHANNELS); for (auto &src_device_it : m_pci_device_map){ @@ -2674,7 +2170,6 @@ void tt_SiliconDevice::init_pcie_iatus_no_p2p() { // Device to Host (multiple channels) for (int channel_id = 0; channel_id < m_num_host_mem_channels; channel_id++) { - // TODO - Try to remove DMA buffer support. if (hugepage_mapping.at(src_pci_id).at(channel_id)) { std::uint32_t region_size = HUGEPAGE_REGION_SIZE; if(channel_id == 3) region_size = 805306368; // Remove 256MB from full 1GB for channel 3 (iATU limitation) @@ -2684,34 +2179,14 @@ void tt_SiliconDevice::init_pcie_iatus_no_p2p() { host_channel_size.insert({(int)src_pci_device->logical_id, {}}); } host_channel_size.at(src_pci_device -> logical_id).push_back(region_size); - } else if(buf_mapping) { - log_debug(LogSiliconDriver, "Configuring ATU channel {} to point to DMA buffer.", channel_id); - // we failed when initializing huge pages, we are using a 1MB DMA buffer as a stand-in - iatu_configure_peer_region(src_pci_id, channel_id, buf_physical_addr, DMA_BUF_REGION_SIZE); + } else { + std::string err_msg = "init_pcie_iatus: Hugepages are not allocated for src_pci_id: " + std::to_string(src_pci_id) + " ch: " + std::to_string(channel_id); + throw std::runtime_error(err_msg); } } } } -uint32_t tt_SiliconDevice::dma_allocation_size(chip_id_t src_device_id) -{ - - // Fall back to first device if no src_device_id is provided. Assumes all devices have the same size, which is true. - chip_id_t device_index = src_device_id == -1 ? 
m_pci_device_map.begin()->first : src_device_id; - - if (hugepage_mapping.at(device_index).at(0)) { - return HUGEPAGE_REGION_SIZE; - } else if (buf_mapping) { - return DMA_BUF_REGION_SIZE; - } else { - log_fatal("Nothing has been allocated yet"); - return 0; - } -} - - - - // Looks for hugetlbfs inside /proc/mounts matching desired pagesize (typically 1G) std::string find_hugepage_dir(std::size_t pagesize) { @@ -2799,52 +2274,6 @@ int tt_SiliconDevice::open_hugepage_file(const std::string &dir, chip_id_t physi return fd; } -bool tt_SiliconDevice::init_dmabuf(chip_id_t device_id) { - if (buf_mapping == nullptr) { - - TTDevice *dev = m_pci_device_map.begin()->second->hdev; - - DMAbuffer buf = pci_allocate_dma_buffer(dev, DMA_BUF_REGION_SIZE); - buf_mapping = static_cast(reinterpret_cast(pci_dma_buffer_get_user_addr(buf))); - buf_physical_addr= pci_dma_buffer_get_physical_addr(buf); - } - return true; -} - -bool tt_SiliconDevice::init_dma_turbo_buf (struct PCIdevice* pci_device) { - // Allocate buffers for DMA transfer data and flag - pci_device->hdev->dma_completion_flag_buffer = pci_allocate_dma_buffer(pci_device->hdev, sizeof(uint64_t)); - pci_device->hdev->dma_transfer_buffer = pci_allocate_dma_buffer(pci_device->hdev, m_dma_buf_size); - pcie_init_dma_transfer_turbo(pci_device); - return true; -} - -bool tt_SiliconDevice::uninit_dma_turbo_buf (struct PCIdevice* pci_device) { - struct DMAbuffer &flag_buffer = pci_device->hdev->dma_completion_flag_buffer; - struct DMAbuffer &xfer_buffer = pci_device->hdev->dma_transfer_buffer; - if (flag_buffer.pBuf) { - for (auto it = pci_device->hdev->dma_buffer_mappings.begin(); it != pci_device->hdev->dma_buffer_mappings.end();) { - if (it->pBuf == flag_buffer.pBuf) { - it = pci_device->hdev->dma_buffer_mappings.erase(it); - } else { - ++it; - } - } - munmap(flag_buffer.pBuf, flag_buffer.size); - } - if (xfer_buffer.pBuf) { - for (auto it = pci_device->hdev->dma_buffer_mappings.begin(); it != pci_device->hdev->dma_buffer_mappings.end();) { - if (it->pBuf == xfer_buffer.pBuf) { - it = pci_device->hdev->dma_buffer_mappings.erase(it); - } else { - ++it; - } - } - munmap(xfer_buffer.pBuf, xfer_buffer.size); - } - return true; -} - // For debug purposes when various stages fails. void print_file_contents(std::string filename, std::string hint = ""){ if (std::filesystem::exists(filename)){ @@ -2975,67 +2404,11 @@ int tt_SiliconDevice::test_setup_interface () { } } -// Code used to test non existent broadcast TLB -// Keep for now, in case we need to test broadcast TLB again. -// int tt_SiliconDevice::test_broadcast (int logical_device_id) { -// LOG1("---- tt_SiliconDevice::test_broadcast\n"); - -// int ret_val = 0; -// struct PCIdevice* pci_device = get_pci_device(logical_device_id); - -// assert (test_pcie_tlb_setup(pci_device) == 0); - -// std::vector fill_array (1024, 0); -// uint32_t broadcast_bar_offset = architecture_implementation->get_broadcast_tlb_index() * architecture_implementation->get_static_tlb_size(); -// LOG2 ("broadcast_bar_offset = 0x%x\n", broadcast_bar_offset); - -// uint64_t fill_array_ptr = (uint64_t)(&fill_array[0]); - -// // a. 
Fill with increasing numbers -// // -// for (size_t i = 0; i < fill_array.size(); i++) { -// fill_array[i] = i; -// } -// write_block(pci_device->hdev, broadcast_bar_offset, fill_array.size() * sizeof (std::uint32_t), fill_array_ptr, m_dma_buf_size); - -// // Check individual locations -// for (uint32_t xi = 0; xi < architecture_implementation->get_t6_x_locations().size(); xi++) { -// for (uint32_t yi = 0; yi < architecture_implementation->get_t6_y_locations().size(); yi++) { -// tt_cxy_pair read_loc(logical_device_id, architecture_implementation->get_t6_x_locations()[xi], architecture_implementation->get_t6_y_locations()[yi]); -// read_vector (fill_array, read_loc, 0, fill_array.size() * sizeof (fill_array[0]) ); -// for (size_t i = 0; i < fill_array.size(); i++) { -// ret_val = (fill_array[i] == i) ? 0 : 1; -// if (ret_val) return ret_val; -// } -// } -// } - -// // b. Test with zeroes -// // -// std::vector fill_array_zeroes (1024, 0); -// uint64_t fill_array_zeroes_ptr = (uint64_t)(&fill_array_zeroes[0]); -// write_block(pci_device->hdev, broadcast_bar_offset, fill_array.size() * sizeof (std::uint32_t), fill_array_zeroes_ptr, m_dma_buf_size); - -// // Check individual locations -// for (uint32_t xi = 0; xi < architecture_implementation->get_t6_x_locations().size(); xi++) { -// for (uint32_t yi = 0; yi < architecture_implementation->get_t6_y_locations().size(); yi++) { -// tt_cxy_pair read_loc(logical_device_id, architecture_implementation->get_t6_x_locations()[xi], architecture_implementation->get_t6_y_locations()[yi]); -// read_vector (fill_array, read_loc, 0, fill_array.size() * sizeof (fill_array_zeroes[0]) ); -// for (size_t i = 0; i < fill_array.size(); i++) { -// ret_val = (fill_array_zeroes[i] == 0) ? 0 : 1; -// if (ret_val) return ret_val; -// } -// } -// } - -// return ret_val; -// } - void tt_SiliconDevice::bar_write32 (int logical_device_id, uint32_t addr, uint32_t data) { TTDevice* dev = get_pci_device(logical_device_id)->hdev; if (addr < dev->bar0_uc_offset) { - write_block (dev, addr, sizeof(data), reinterpret_cast(&data), m_dma_buf_size); + write_block (dev, addr, sizeof(data), reinterpret_cast(&data)); } else { write_regs (dev, addr, 1, &data); } @@ -3046,7 +2419,7 @@ uint32_t tt_SiliconDevice::bar_read32 (int logical_device_id, uint32_t addr) { uint32_t data; if (addr < dev->bar0_uc_offset) { - read_block (dev, addr, sizeof(data), reinterpret_cast(&data), m_dma_buf_size); + read_block (dev, addr, sizeof(data), reinterpret_cast(&data)); } else { read_regs (dev, addr, 1, &data); } @@ -3228,37 +2601,9 @@ void tt_SiliconDevice::enable_local_ethernet_queue(const chip_id_t &device_id, i } } -void *tt_SiliconDevice::channel_address(std::uint32_t offset, const tt_cxy_pair& target) { - log_assert(ndesc->is_chip_mmio_capable(target.chip), "Cannot call channel_address for non-MMIO device"); - struct PCIdevice* pci_device = get_pci_device(target.chip); - auto architecture_implementation = pci_device->hdev->get_architecture_implementation(); - std::uint64_t bar0_offset; - - // Temporary hack for blackhole bringup. - if (arch_name == tt::ARCH::BLACKHOLE) { - // We use BAR4 segment for mapping for Blackhole. 
- log_assert(tlbs_init, "TLBs were not initialized."); - std::int32_t tlb_index = map_core_to_tlb(tt_xy_pair(target.x, target.y)); - auto [tlb_offset, tlb_size] = pci_device->hdev->get_architecture_implementation()->describe_tlb(tlb_index).value(); - - log_assert(pci_device->hdev->bar4_wc != nullptr && tlb_size == BH_4GB_TLB_SIZE, "BAR4 not initialized, or TLBs not initialized properly."); - return static_cast(pci_device->hdev->bar4_wc) + tlb_offset + offset; - } else { - // This hard-codes that we use 16MB TLB #1 onwards for the mapping. - bar0_offset = offset - architecture_implementation->get_dram_channel_0_peer2peer_region_start() - + architecture_implementation->get_dynamic_tlb_16m_base() + architecture_implementation->get_dynamic_tlb_16m_size(); - } - - return static_cast(pci_device->hdev->bar0_wc) + bar0_offset; -} - void *tt_SiliconDevice::host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const { - if (hugepage_mapping.at(src_device_id).at(channel) != nullptr) { return static_cast(hugepage_mapping.at(src_device_id).at(channel)) + offset; - } else if(buf_mapping) { - // we failed when initializing huge pages, we are using a 1MB DMA buffer as a stand-in - return static_cast(buf_mapping) + offset; } else { return nullptr; } @@ -3277,46 +2622,6 @@ std::shared_ptr tt_SiliconDevice::get_mutex(co return hardware_resource_mutex_map.at(mutex_name); } - -std::unordered_map tt_SiliconDevice::get_logical_to_physical_mmio_device_id_map(std::vector physical_device_ids){ - - std::unordered_map logical_to_physical_mmio_device_id_map; - - LOG1("get_logical_to_physical_mmio_device_id_map() -- num_physical_devices: %d\n", physical_device_ids.size()); - - for (int logical_device_idx=0; logical_device_idx < physical_device_ids.size(); logical_device_idx++){ - logical_to_physical_mmio_device_id_map.insert({logical_device_idx, physical_device_ids.at(logical_device_idx)}); - } - - return logical_to_physical_mmio_device_id_map; - -} - - -// Get PCI bus_id info for looking up TT devices in hwloc to find associated CPU package. 
-std::map<chip_id_t, std::string> tt_SiliconDevice::get_physical_device_id_to_bus_id_map(std::vector<chip_id_t> physical_device_ids){
-
-    std::map<chip_id_t, std::string> physical_device_id_to_bus_id_map;
-
-    for (auto &pci_interface_id : physical_device_ids){
-
-        auto ttdev = std::make_unique<TTDevice>(TTDevice::open(pci_interface_id));
-
-        std::ostringstream pci_bsf;
-        pci_bsf << std::hex << std::setw(2) << std::setfill('0') << (int) ttdev->pci_bus << ":";
-        pci_bsf << std::hex << std::setw(2) << std::setfill('0') << (int) ttdev->pci_device << ".";
-        pci_bsf << std::hex << (int) ttdev->pci_function;
-
-        std::string pci_bsf_str = pci_bsf.str();
-        LOG2("get_physical_device_id_to_bus_id_map() -- pci_interface_id: %d BSF: %s\n", pci_interface_id, pci_bsf_str.c_str());
-        physical_device_id_to_bus_id_map.insert({pci_interface_id, pci_bsf_str});
-
-    }
-
-    return physical_device_id_to_bus_id_map;
-
-}
-
 uint64_t tt_SiliconDevice::get_sys_addr(uint32_t chip_x, uint32_t chip_y, uint32_t noc_x, uint32_t noc_y, uint64_t offset) {
     uint64_t result = chip_y;
     uint64_t noc_addr_local_bits_mask = (1UL << eth_interface_params.noc_addr_local_bits) - 1;
@@ -3349,7 +2654,6 @@ bool tt_SiliconDevice::is_non_mmio_cmd_q_full(uint32_t curr_wptr, uint32_t curr_
 *
 * Relevant functions:
 * - write_to_non_mmio_device
-* - rolled_write_to_non_mmio_device
 * - read_from_non_mmio_device
 *
 * The non-MMIO read/write functions (excluding the `*_epoch_cmd` variants) are responsible for the
@@ -3583,282 +2887,6 @@ void tt_SiliconDevice::write_to_non_mmio_device(
     }
 }
-
-
-// Specialized function for small epoch commands:
-// 1) uses separate eth cores than other non-mmio transfers hence does not require mutex
-// 2) does not have the code paths for transfers larger than 32kB (1024 cmds)
-// 3) only reads erisc_q_ptrs_epoch once, or when the queues are full
-// 4) only updates wptr on eth command queues for the last epoch command or when the queue is full or when switching eth cores based on eth-ordered-writes policy, or when
-//    eth-ordered-writes are not supported but current write must be ordered (flush prev wrptr).
-// 5) When eth-ordered-write not supported, allow flush to be used as ordering mechanism when ordering is requested via arg. When eth-ordered-write is supported, always use it
-//    and ensure ordering to same remote chip destinations by always using same remote xfer eth core for a given destination based on noc xy. Must ensure wrptr is flushed on
-//    switch of eth cores, and copy of rdptr/wrptr maintained on host for each eth xfer core.
-void tt_SiliconDevice::write_to_non_mmio_device_send_epoch_cmd(const uint32_t *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t address, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {
-    log_assert(!non_mmio_transfer_cores_customized, "{} cannot be used if ethernet cores for host->cluster transfers are customized. The default Ethernet Core configuration must be used.", __FUNCTION__);
-    using data_word_t = uint32_t;
-    constexpr int DATA_WORD_SIZE = sizeof(data_word_t);
-
-    const auto &mmio_capable_chip = ndesc->get_closest_mmio_capable_chip(core.chip);
-    const auto target_chip = ndesc->get_chip_locations().at(core.chip);
-
-    std::string write_tlb = "LARGE_WRITE_TLB";
-    std::string read_tlb = "LARGE_READ_TLB";
-    std::string empty_tlb = "";
-    translate_to_noc_table_coords(core.chip, core.y, core.x);
-
-    const auto &mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(core.chip);
-    tt_cxy_pair remote_transfer_ethernet_core = remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_epoch];
-
-    // read all eth queue ptrs for the first time, and initialize wrptr_updated bool for strict ordering.
-    if (!erisc_q_ptrs_initialized) {
-        for (int core_epoch = EPOCH_ETH_CORES_START_ID; core_epoch < EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS + EPOCH_ETH_CORES_START_ID; core_epoch++) {
-            erisc_q_ptrs_epoch[core_epoch].reserve(eth_interface_params.remote_update_ptr_size_bytes*2/sizeof(uint32_t));
-            read_device_memory(erisc_q_ptrs_epoch[core_epoch].data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb);
-            erisc_q_wrptr_updated[core_epoch] = false;
-            erisc_q_ptrs_initialized = true;
-        }
-    }
-
-    std::vector<uint32_t> erisc_command(sizeof(routing_cmd_t)/DATA_WORD_SIZE);
-    routing_cmd_t *new_cmd = (routing_cmd_t *)&erisc_command[0];
-    std::vector<uint32_t> data_block;
-
-    // Two mechanisms for ordering depending on eth fw version.
-    if (use_ethernet_ordered_writes) {
-        // Feature in this function to ensure ordering via eth-ordered-writes by using same eth core for all epoch writes to same dest noc xy.
-        auto &soc_desc = get_soc_descriptor(mmio_capable_chip);
-        int core_id = core.x * soc_desc.grid_size.y + core.y;
-        int new_active_core_epoch = (core_id % EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS) + EPOCH_ETH_CORES_START_ID;
-
-        // Switch eth cores, and if wrptr was not flushed to device for previous eth core, do it now.
-        if (new_active_core_epoch != active_core_epoch) {
-            if (!erisc_q_wrptr_updated[active_core_epoch]) {
-                std::vector<uint32_t> erisc_q_wptr = { erisc_q_ptrs_epoch[active_core_epoch][0] };
-                write_device_memory(erisc_q_wptr.data(), erisc_q_wptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, write_tlb);
-                tt_driver_atomics::sfence();
-                erisc_q_wrptr_updated[active_core_epoch] = true;
-            }
-            active_core_epoch = new_active_core_epoch;
-            remote_transfer_ethernet_core = remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_epoch];
-        }
-    } else if (ordered_with_prev_remote_write) {
-        // Flush used as ordering mechanism when eth ordered writes are unsupported. If previous write requires flush,
-        // handle it here before setting flush_non_mmio for the current write.
-        if (!erisc_q_wrptr_updated[active_core_epoch]) {
-            std::vector<uint32_t> erisc_q_wptr = { erisc_q_ptrs_epoch[active_core_epoch][0] };
-            write_device_memory(erisc_q_wptr.data(), erisc_q_wptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, write_tlb);
-            tt_driver_atomics::sfence();
-            erisc_q_wrptr_updated[active_core_epoch] = true;
-        }
-        wait_for_non_mmio_flush();
-    }
-
-    flush_non_mmio = true;
-    uint32_t timestamp = 0; //CMD_TIMESTAMP;
-
-    bool use_dram = size_in_bytes > 256 * DATA_WORD_SIZE ? true : false;
-    uint32_t max_block_size = use_dram ? host_address_params.eth_routing_block_size : eth_interface_params.max_block_size;
-    uint32_t block_size;
-
-    // Ethernet ordered writes must originate from same erisc core, so prevent updating active core here.
-    while (is_non_mmio_cmd_q_full(erisc_q_ptrs_epoch[active_core_epoch][0], erisc_q_ptrs_epoch[active_core_epoch][4])) {
-        if (!use_ethernet_ordered_writes){
-            active_core_epoch++;
-            log_assert(active_core_epoch - EPOCH_ETH_CORES_START_ID >= 0, "Invalid ERISC core for sending epoch commands");
-            active_core_epoch = ((active_core_epoch - EPOCH_ETH_CORES_START_ID) % EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS) + EPOCH_ETH_CORES_START_ID;
-            remote_transfer_ethernet_core = remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_epoch];
-        }
-        read_device_memory(erisc_q_ptrs_epoch[active_core_epoch].data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb);
-    }
-
-    uint32_t req_wr_ptr = erisc_q_ptrs_epoch[active_core_epoch][0] & eth_interface_params.cmd_buf_size_mask;
-    if (address & 0x1F) { // address not 32-byte aligned
-        // can send it in one transfer, no need to break it up
-        log_assert(size_in_bytes == DATA_WORD_SIZE, "Non-mmio cmd queue update is too big");
-        block_size = DATA_WORD_SIZE;
-    } else {
-        // can send it in one transfer, no need to break it up
-        log_assert(size_in_bytes <= max_block_size, "Non-mmio cmd queue update is too big. size_in_bytes: {} exceeds max_block_size: {}", size_in_bytes, max_block_size);
-        block_size = size_in_bytes;
-    }
-    uint32_t req_flags = block_size > DATA_WORD_SIZE ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_req | timestamp) : eth_interface_params.cmd_wr_req;
-    if (use_ethernet_ordered_writes) {
-        req_flags |= eth_interface_params.cmd_ordered;
-    }
-
-    uint32_t resp_flags = block_size > DATA_WORD_SIZE ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_ack) : eth_interface_params.cmd_wr_ack;
-    timestamp = 0;
-
-    uint32_t host_dram_block_addr = host_address_params.eth_routing_buffers_start + (active_core_epoch * eth_interface_params.cmd_buf_size + req_wr_ptr) * max_block_size;
-    uint16_t host_dram_channel = 0; // This needs to be 0, since WH can only map ETH buffers to chan 0.
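The polling loop above and the wptr updates below both depend on is_non_mmio_cmd_q_full(), whose body sits outside this hunk. A minimal sketch of the check it is assumed to perform, given how cmd_buf_size_mask and cmd_buf_ptr_mask are used in this function (pointers range over twice the queue depth so that the full and empty states stay distinguishable):

#include <cstdint>

// Sketch only: pointers advance as ptr = (ptr + 1) & cmd_buf_ptr_mask, where
// cmd_buf_ptr_mask (== 2*depth - 1) carries one wrap bit beyond
// cmd_buf_size_mask (== depth - 1).
static bool cmd_q_full(uint32_t wptr, uint32_t rptr, uint32_t size_mask) {
    // Same slot but opposite wrap bit -> full; exactly equal -> empty.
    return (wptr != rptr) && ((wptr & size_mask) == (rptr & size_mask));
}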
- - // send the data - if (req_flags & eth_interface_params.cmd_data_block) { - // Copy data to sysmem or device DRAM for Block mode - if (use_dram) { - req_flags |= eth_interface_params.cmd_data_block_dram; - resp_flags |= eth_interface_params.cmd_data_block_dram; - size_buffer_to_capacity(data_block, block_size); - memcpy(&data_block[0], mem_ptr, block_size); - write_to_sysmem(data_block, host_dram_block_addr, host_dram_channel, mmio_capable_chip_logical); - } else { - uint32_t buf_address = eth_interface_params.eth_routing_data_buffer_addr + req_wr_ptr * max_block_size; - size_buffer_to_capacity(data_block, block_size); - memcpy(&data_block[0], mem_ptr, block_size); - write_device_memory(data_block.data(), data_block.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, buf_address, write_tlb); - } - tt_driver_atomics::sfence(); - } - - // send the write request - log_assert((req_flags & eth_interface_params.cmd_data_block) ? (address & 0x1F) == 0 : true, "Block mode address must be 32-byte aligned."); - - new_cmd->sys_addr = get_sys_addr(std::get<0>(target_chip), std::get<1>(target_chip), core.x, core.y, address); - new_cmd->rack = get_sys_rack(std::get<2>(target_chip), std::get<3>(target_chip)); - new_cmd->data = req_flags & eth_interface_params.cmd_data_block ? block_size : *mem_ptr; - new_cmd->flags = req_flags; - if (use_dram) { - new_cmd->src_addr_tag = host_dram_block_addr; - } - - write_device_memory(erisc_command.data(), erisc_command.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_routing_cmd_queue_base + (sizeof(routing_cmd_t) * req_wr_ptr), write_tlb); - tt_driver_atomics::sfence(); - - // update the wptr only if the eth queue is full or for the last command - erisc_q_ptrs_epoch[active_core_epoch][0] = (erisc_q_ptrs_epoch[active_core_epoch][0] + 1) & eth_interface_params.cmd_buf_ptr_mask; - if (last_send_epoch_cmd || is_non_mmio_cmd_q_full(erisc_q_ptrs_epoch[active_core_epoch][0], erisc_q_ptrs_epoch[active_core_epoch][4])) { - std::vector erisc_q_wptr = { erisc_q_ptrs_epoch[active_core_epoch][0] }; - write_device_memory(erisc_q_wptr.data(), erisc_q_wptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, write_tlb); - tt_driver_atomics::sfence(); - erisc_q_wrptr_updated[active_core_epoch] = true; - } else { - erisc_q_wrptr_updated[active_core_epoch] = false; - } -} - -/* - * Note that this function is required to acquire the `NON_MMIO_MUTEX_NAME` mutex for interacting with the ethernet core (host) command queue - * DO NOT issue any pcie reads/writes to the ethernet core prior to acquiring the mutex. 
For extra information, see the "NON_MMIO_MUTEX Usage" above - */ -void tt_SiliconDevice::rolled_write_to_non_mmio_device(const uint32_t *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t address, uint32_t unroll_count) { - using data_word_t = uint32_t; - constexpr int DATA_WORD_SIZE = sizeof(data_word_t); - - std::string write_tlb = "LARGE_WRITE_TLB"; - std::string read_tlb = "LARGE_READ_TLB"; - std::string empty_tlb = ""; - translate_to_noc_table_coords(core.chip, core.y, core.x); - - const eth_coord_t target_chip = ndesc->get_chip_locations().at(core.chip); - - - std::vector erisc_command; - std::vector erisc_q_rptr = std::vector(1); - std::vector erisc_q_ptrs = std::vector(eth_interface_params.remote_update_ptr_size_bytes*2 / sizeof(uint32_t)); - - std::vector data_block = std::vector(size_in_bytes / DATA_WORD_SIZE); - - routing_cmd_t *new_cmd; - - flush_non_mmio = true; - uint32_t transfer_size = size_in_bytes * unroll_count; - uint32_t buffer_id = 0; - uint32_t timestamp = 0; //CMD_TIMESTAMP; - - // - // MUTEX ACQUIRE (NON-MMIO) - // do not locate any ethernet core reads/writes before this acquire - // - const auto &mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(core.chip); - - if (non_mmio_transfer_cores_customized) { - log_assert(active_eth_core_idx_per_chip.find(mmio_capable_chip_logical) != active_eth_core_idx_per_chip.end(), "Ethernet Cores for Host to Cluster communication were not initialized for all MMIO devices."); - } - - const scoped_lock lock( - *get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->id)); - - erisc_command.resize(sizeof(routing_cmd_t)/DATA_WORD_SIZE); - new_cmd = (routing_cmd_t *)&erisc_command[0]; - int& active_core_for_txn = non_mmio_transfer_cores_customized ? active_eth_core_idx_per_chip.at(mmio_capable_chip_logical) : active_core; - read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn], eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); - - uint32_t offset = 0; - - bool full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_ptrs[4]); - erisc_q_rptr.resize(1); - erisc_q_rptr[0] = erisc_q_ptrs[4]; - - uint32_t unroll_offset = 0; - - while (offset < transfer_size) { - while (full) { - read_device_memory(erisc_q_rptr.data(), remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn], eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes + eth_interface_params.remote_update_ptr_size_bytes, DATA_WORD_SIZE, read_tlb); - full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0],erisc_q_rptr[0]); - } - //full = true; - // set full only if this command will make the q full. - // otherwise full stays false so that we do not poll the rd pointer in next iteration. - // As long as current command push does not fill up the queue completely, we do not want - // to poll rd pointer in every iteration. 
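Throughout these writers, the counter block read from request_cmd_queue_base + cmd_counters_size_bytes (remote_update_ptr_size_bytes*2 bytes) is indexed at word 0 for the write pointer and word 4 for the remote read pointer, i.e. erisc_q_ptrs[0] and erisc_q_ptrs[4]. A sketch of the device-side layout this indexing assumes; the struct and field names here are illustrative, not from the source:

#include <cstdint>

// Two 16-byte pointer entries, read back-to-back by read_device_memory() above.
struct remote_update_ptr_t {
    uint32_t ptr;      // live counter value
    uint32_t pad[3];   // pads the entry to remote_update_ptr_size_bytes (16)
};
struct eth_cmd_q_counters_t {
    remote_update_ptr_t wptr;  // word 0 -> erisc_q_ptrs[0]
    remote_update_ptr_t rptr;  // word 4 -> erisc_q_ptrs[4]
};
static_assert(sizeof(eth_cmd_q_counters_t) == 32, "matches the 32-byte read");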
- //full = is_non_mmio_cmd_q_full((erisc_q_ptrs[0] + 1) & CMD_BUF_PTR_MASK, erisc_q_rptr[0]); - - log_assert(((address + offset) & 0x1F) == 0, "Base address + offset in incorrect range!"); - - uint32_t req_wr_ptr = erisc_q_ptrs[0] & eth_interface_params.cmd_buf_size_mask; - - uint32_t req_flags = eth_interface_params.cmd_data_block_dram | eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_req; - timestamp = 0; - - uint32_t host_dram_block_addr = host_address_params.eth_routing_buffers_start + (active_core_for_txn * eth_interface_params.cmd_buf_size + req_wr_ptr) * host_address_params.eth_routing_block_size; - uint16_t host_dram_channel = 0; // This needs to be 0, since WH can only map ETH buffers to chan 0. - - memcpy(data_block.data(), mem_ptr, size_in_bytes); - uint32_t byte_increment = data_block.size() * DATA_WORD_SIZE; - uint32_t host_mem_offset = 0; - uint32_t i = 0; - for (i = 0; (i + unroll_offset) < unroll_count; i++) { - if ((host_mem_offset + byte_increment) > host_address_params.eth_routing_block_size) { - break; - } - data_block[0] = i + unroll_offset; - write_to_sysmem(data_block, host_dram_block_addr + host_mem_offset, host_dram_channel, mmio_capable_chip_logical); - host_mem_offset += byte_increment; - } - unroll_offset += i; - tt_driver_atomics::sfence(); - new_cmd->sys_addr = get_sys_addr(std::get<0>(target_chip), std::get<1>(target_chip), core.x, core.y, address + offset); - new_cmd->rack = get_sys_rack(std::get<2>(target_chip), std::get<3>(target_chip)); - new_cmd->data = host_mem_offset; - new_cmd->flags = req_flags; - new_cmd->src_addr_tag = host_dram_block_addr; - - write_device_memory(erisc_command.data(), erisc_command.size() * DATA_WORD_SIZE, remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn], eth_interface_params.request_routing_cmd_queue_base + (sizeof(routing_cmd_t) * req_wr_ptr), write_tlb); - tt_driver_atomics::sfence(); - erisc_q_ptrs[0] = (erisc_q_ptrs[0] + 1) & eth_interface_params.cmd_buf_ptr_mask; - std::vector erisc_q_wptr; - erisc_q_wptr.resize(1); - erisc_q_wptr[0] = erisc_q_ptrs[0]; - write_device_memory(erisc_q_wptr.data(), erisc_q_wptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn], eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, write_tlb); - tt_driver_atomics::sfence(); - offset += host_mem_offset; - - // If there is more data to send and this command will make the q full, switch to next Q. - // otherwise full stays false so that we do not poll the rd pointer in next iteration. - // As long as current command push does not fill up the queue completely, we do not want - // to poll rd pointer in every iteration. - - if (is_non_mmio_cmd_q_full((erisc_q_ptrs[0]) & eth_interface_params.cmd_buf_ptr_mask, erisc_q_rptr[0])) { - active_core_for_txn++; - uint32_t update_mask_for_chip = (remote_transfer_ethernet_cores[mmio_capable_chip_logical].size() - 1); - active_core_for_txn = non_mmio_transfer_cores_customized ? 
(active_core_for_txn & update_mask_for_chip) : ((active_core_for_txn & NON_EPOCH_ETH_CORES_MASK) + NON_EPOCH_ETH_CORES_START_ID); - read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn], eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); - full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_ptrs[4]); - erisc_q_rptr[0] = erisc_q_ptrs[4]; - } - } -} - /* * Note that this function is required to acquire the `NON_MMIO_MUTEX_NAME` mutex for interacting with the ethernet core (host) command queue * DO NOT use `active_core` or issue any pcie reads/writes to the ethernet core prior to acquiring the mutex. For extra information, see the "NON_MMIO_MUTEX Usage" above @@ -4198,7 +3226,7 @@ void tt_SiliconDevice::pcie_broadcast_write(chip_id_t chip, const void* mem_ptr, while(size_in_bytes > 0) { auto [mapped_address, tlb_size] = set_dynamic_tlb_broadcast(pci_device, tlb_index, addr, harvested_coord_translation, start, end, dynamic_tlb_ordering_modes.at(fallback_tlb)); uint64_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size); - write_block(dev, mapped_address, transfer_size, buffer_addr, m_dma_buf_size); + write_block(dev, mapped_address, transfer_size, buffer_addr); size_in_bytes -= transfer_size; addr += transfer_size; @@ -4423,18 +3451,18 @@ int tt_SiliconDevice::remote_arc_msg(int chip, uint32_t msg_code, bool wait_for_ } void tt_SiliconDevice::write_to_sysmem(const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id) { - write_dma_buffer(mem_ptr, size, addr, channel, src_device_id); + write_buffer(mem_ptr, size, addr, channel, src_device_id); } void tt_SiliconDevice::write_to_sysmem(std::vector& vec, uint64_t addr, uint16_t channel, chip_id_t src_device_id) { - write_dma_buffer(vec.data(), vec.size() * sizeof(uint32_t), addr, channel, src_device_id); + write_buffer(vec.data(), vec.size() * sizeof(uint32_t), addr, channel, src_device_id); } void tt_SiliconDevice::read_from_sysmem(void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id) { - read_dma_buffer(mem_ptr, addr, channel, size, src_device_id); + read_buffer(mem_ptr, addr, channel, size, src_device_id); } void tt_SiliconDevice::read_from_sysmem(std::vector &vec, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id) { size_buffer_to_capacity(vec, size); - read_dma_buffer(vec.data(), addr, channel, size, src_device_id); + read_buffer(vec.data(), addr, channel, size, src_device_id); } void tt_SiliconDevice::set_membar_flag(const chip_id_t chip, const std::unordered_set& cores, const uint32_t barrier_value, const uint32_t barrier_addr, const std::string& fallback_tlb) { @@ -4548,7 +3576,7 @@ void tt_SiliconDevice::dram_membar(const chip_id_t chip, const std::string& fall } } -void tt_SiliconDevice::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { +void tt_SiliconDevice::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) { bool target_is_mmio_capable = ndesc -> is_chip_mmio_capable(core.chip); if(target_is_mmio_capable) { if (fallback_tlb == "REG_TLB") { @@ -4556,60 +3584,16 @@ void tt_SiliconDevice::write_to_device(const void *mem_ptr, uint32_t size, tt_cx 
} else { write_device_memory(mem_ptr, size, core, addr, fallback_tlb); } - } - else if (!send_epoch_cmd) { + } else { log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO targets not supported in Blackhole"); log_assert((get_soc_descriptor(core.chip).ethernet_cores).size() > 0 && get_number_of_chips_in_cluster() > 1, "Cannot issue ethernet writes to a single chip cluster!"); write_to_non_mmio_device(mem_ptr, size, core, addr); - } else { - log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO targets not supported in Blackhole"); - // as long as epoch commands are sent single-threaded, no need to acquire mutex - log_assert(!(size % 4), "Epoch commands must be 4 byte aligned!"); - write_to_non_mmio_device_send_epoch_cmd((uint32_t*)mem_ptr, size, core, addr, last_send_epoch_cmd, ordered_with_prev_remote_write); } } - -void tt_SiliconDevice::write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - // Overloaded device writer that accepts a vector - write_to_device(vec.data(), vec.size() * sizeof(uint32_t), core, addr, fallback_tlb, send_epoch_cmd, last_send_epoch_cmd, ordered_with_prev_remote_write); -} - - -void tt_SiliconDevice::write_epoch_cmd_to_device(const uint32_t *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - bool target_is_mmio_capable = ndesc -> is_chip_mmio_capable(core.chip); - if(target_is_mmio_capable) { - write_device_memory(mem_ptr, size_in_bytes, core, addr, fallback_tlb); - } else { - log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO targets not supported in Blackhole"); // MT: Use only dynamic TLBs and never program static - write_to_non_mmio_device_send_epoch_cmd(mem_ptr, size_in_bytes, core, addr, last_send_epoch_cmd, ordered_with_prev_remote_write); - } -} - -void tt_SiliconDevice::write_epoch_cmd_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { +void tt_SiliconDevice::write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) { // Overloaded device writer that accepts a vector - write_epoch_cmd_to_device(vec.data(), vec.size() * sizeof(uint32_t), core, addr, fallback_tlb, last_send_epoch_cmd, ordered_with_prev_remote_write); -} - -void tt_SiliconDevice::rolled_write_to_device(uint32_t* mem_ptr, uint32_t size_in_bytes, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) { - log_assert(!(size_in_bytes % 4), "{} only supports 4-byte aligned data", __FUNCTION__); - bool target_is_mmio_capable = ndesc->is_chip_mmio_capable(core.chip); - - if (target_is_mmio_capable) { - for (int i=0; i 0 && get_number_of_chips_in_cluster() > 1, "Cannot issue ethernet writes to a single chip cluster!"); - rolled_write_to_non_mmio_device(mem_ptr, size_in_bytes, core, addr, unroll_count); - } -} - -void tt_SiliconDevice::rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) { - rolled_write_to_device(vec.data(), vec.size() * sizeof(uint32_t), unroll_count, core, addr, fallback_tlb); + write_to_device(vec.data(), vec.size() * sizeof(uint32_t), core, addr, fallback_tlb); } void tt_SiliconDevice::read_mmio_device_register(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, 
const std::string& fallback_tlb) {
@@ -4909,20 +3893,6 @@ std::uint32_t tt_SiliconDevice::get_host_channel_size(std::uint32_t device_id, s
     return host_channel_size.at(device_id).at(channel);
 }
 
-std::uint32_t tt_SiliconDevice::get_pcie_speed(std::uint32_t device_id) {
-    int link_width = 0;
-    int link_speed = 0;
-    if (ndesc->is_chip_mmio_capable(device_id)) {
-        PCIdevice *pci_device = get_pci_device(device_id);
-        link_width = get_link_width(pci_device->hdev);
-        link_speed = get_link_speed(pci_device->hdev);
-        log_debug(LogSiliconDriver, "Device {} PCIe link width: x{}, speed: {} Gb/s", device_id, link_width, link_speed);
-    } else {
-        log_debug(LogSiliconDriver, "Device {} is NOT a PCIe device, width: x{}, speed: {} Gb/s", device_id, link_width, link_speed);
-    }
-    return (link_width * link_speed);
-}
-
 std::uint32_t tt_SiliconDevice::get_numa_node_for_pcie_device(std::uint32_t device_id) {
     return get_numa_node(get_pci_device(device_id)->hdev);
 }
diff --git a/device/tt_silicon_driver_common.hpp b/device/tt_silicon_driver_common.hpp
index 1649bf70..9f275668 100644
--- a/device/tt_silicon_driver_common.hpp
+++ b/device/tt_silicon_driver_common.hpp
@@ -9,19 +9,6 @@
 #include
 #include
-
-typedef struct {
-    uint32_t chip_addr;
-    uint32_t host_phys_addr;
-    uint32_t completion_flag_phys_addr;
-    uint32_t size_bytes : 28;
-    uint32_t write : 1;
-    uint32_t pcie_msi_on_done : 1;
-    uint32_t pcie_write_on_done : 1;
-    uint32_t trigger : 1;
-    uint32_t repeat;
-} arc_pcie_ctrl_dma_request_t; // 5 * 4 = 20B
-
 enum class TensixSoftResetOptions: std::uint32_t {
     NONE = 0,
     BRISC = ((std::uint32_t) 1 << 11),
diff --git a/device/tt_soc_descriptor.cpp b/device/tt_soc_descriptor.cpp
index 4320b3ef..60958372 100644
--- a/device/tt_soc_descriptor.cpp
+++ b/device/tt_soc_descriptor.cpp
@@ -199,98 +199,20 @@ int tt_SocDescriptor::get_num_dram_channels() const {
     return num_channels;
 }
 
-std::vector<int> tt_SocDescriptor::get_dram_chan_map() {
-    std::vector<int> chan_map;
-    for (unsigned int i = 0; i < dram_cores.size(); i++) {
-        chan_map.push_back(i);
-    }
-    return chan_map;
-};
-
 bool tt_SocDescriptor::is_worker_core(const tt_xy_pair &core) const {
     return (
         routing_x_to_worker_x.find(core.x) != routing_x_to_worker_x.end() &&
         routing_y_to_worker_y.find(core.y) != routing_y_to_worker_y.end());
 }
 
-tt_xy_pair tt_SocDescriptor::get_worker_core(const tt_xy_pair &core) const {
-    tt_xy_pair worker_xy = {
-        static_cast<size_t>(routing_x_to_worker_x.at(core.x)), static_cast<size_t>(routing_y_to_worker_y.at(core.y))};
-    return worker_xy;
-}
-
-tt_xy_pair tt_SocDescriptor::get_routing_core(const tt_xy_pair& core) const {
-    tt_xy_pair routing_xy = {
-        static_cast<size_t>(worker_log_to_routing_x.at(core.x)), static_cast<size_t>(worker_log_to_routing_y.at(core.y))};
-    return routing_xy;
-}
-
 tt_xy_pair tt_SocDescriptor::get_core_for_dram_channel(int dram_chan, int subchannel) const {
     return this->dram_cores.at(dram_chan).at(subchannel);
 };
 
-tt_xy_pair tt_SocDescriptor::get_pcie_core(int pcie_id) const {
-    return this->pcie_cores.at(pcie_id);
-};
-
 bool tt_SocDescriptor::is_ethernet_core(const tt_xy_pair &core) const {
     return this->ethernet_core_channel_map.find(core) != ethernet_core_channel_map.end();
 }
 
-bool tt_SocDescriptor::is_dram_core(const tt_xy_pair &core) const {
-    static std::unordered_set<tt_xy_pair> cores = {};
-    if (cores.empty()) {
-        for (const std::vector<tt_xy_pair> &dram_chan : this->dram_cores) {
-            for (const tt_xy_pair &subchannel : dram_chan) {
-                cores.insert(subchannel);
-            }
-        }
-    }
-    return cores.find(core) != cores.end();
-}
-
-int
tt_SocDescriptor::get_channel_of_ethernet_core(const tt_xy_pair &core) const { - return this->ethernet_core_channel_map.at(core); -} - -int tt_SocDescriptor::get_num_dram_subchans() const { - int num_chan = 0; - for (const std::vector &core : this->dram_cores) { - num_chan += core.size(); - } - return num_chan; -} - -int tt_SocDescriptor::get_num_dram_blocks_per_channel() const { - int num_blocks = 0; - if (arch == tt::ARCH::GRAYSKULL) { - num_blocks = 1; - } else if (arch == tt::ARCH::WORMHOLE) { - num_blocks = 2; - } else if (arch == tt::ARCH::WORMHOLE_B0) { - num_blocks = 2; - } else if (arch == tt::ARCH::BLACKHOLE) { - num_blocks = 2; - } - return num_blocks; -} - -// Note: same as t_SiliconDevice::get_pcie_base_addr_from_device -uint64_t tt_SocDescriptor::get_noc2host_offset(uint16_t host_channel) const { - - const std::uint64_t PEER_REGION_SIZE = (1024 * 1024 * 1024); - - if (arch == tt::ARCH::GRAYSKULL) { - return (host_channel * PEER_REGION_SIZE); - }else if (arch == tt::ARCH::WORMHOLE || arch == tt::ARCH::WORMHOLE_B0) { - return (host_channel * PEER_REGION_SIZE) + 0x800000000; - } else if (arch == tt::ARCH::BLACKHOLE) { - return (host_channel * PEER_REGION_SIZE) + (1ULL << 60); - } else { - throw std::runtime_error("Unsupported architecture"); - } -} - std::ostream &operator<<(std::ostream &out, const tt::ARCH &arch_name) { if (arch_name == tt::ARCH::JAWBRIDGE) { out << "jawbridge"; diff --git a/device/tt_soc_descriptor.h b/device/tt_soc_descriptor.h index 2be98749..87ea1799 100644 --- a/device/tt_soc_descriptor.h +++ b/device/tt_soc_descriptor.h @@ -23,8 +23,6 @@ namespace YAML { class Node; } -static constexpr std::size_t DEFAULT_DRAM_SIZE_PER_CORE = 8 * 1024 * 1024; - std::ostream &operator<<(std::ostream &out, const tt::ARCH &arch_name); static inline std::string get_arch_str(const tt::ARCH arch_name){ @@ -132,18 +130,9 @@ class tt_SocDescriptor { uint64_t dram_bank_size; int get_num_dram_channels() const; - std::vector get_dram_chan_map(); bool is_worker_core(const tt_xy_pair &core) const; - tt_xy_pair get_worker_core(const tt_xy_pair& core) const; - tt_xy_pair get_routing_core(const tt_xy_pair& core) const; tt_xy_pair get_core_for_dram_channel(int dram_chan, int subchannel) const; - tt_xy_pair get_pcie_core(int pcie_id = 0) const; - bool is_dram_core(const tt_xy_pair& core) const; bool is_ethernet_core(const tt_xy_pair& core) const; - int get_channel_of_ethernet_core(const tt_xy_pair &core) const; - int get_num_dram_subchans() const; - int get_num_dram_blocks_per_channel() const; - uint64_t get_noc2host_offset(uint16_t host_channel) const; // Default constructor. Creates uninitialized object with public access to all of its attributes. tt_SocDescriptor() = default; diff --git a/device/wormhole/impl_device.hpp b/device/wormhole/impl_device.hpp deleted file mode 100644 index 227cac48..00000000 --- a/device/wormhole/impl_device.hpp +++ /dev/null @@ -1,82 +0,0 @@ -/* - * SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. - * - * SPDX-License-Identifier: Apache-2.0 - */ - -#pragma once - -#include "device/tt_silicon_driver_common.hpp" - -// See src/t6ifc/t6py/packages/tenstorrent/data/wormhole/pci/tlb.yaml -// local_offset: [ 0, 15, 0, "36-bit address prefix, prepended to the 20 LSBs of issued address to form a 56-bit NOC address. 
The 1MB TLB #n corresponds to the 1MB MMIO range starting at (0x0 + N*0x100000)."] -// x_end : [ 0, 21, 16, "" ] -// y_end : [ 0, 27, 22, "" ] -// x_start : [ 0, 33, 28, "" ] -// y_start : [ 0, 39, 34, "" ] -// noc_sel: [ 0, 40, 40, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 41, 41, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 43, 42, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 44, 44, "linked"] - -// local_offset: [ 0, 14, 0, "35-bit address prefix, prepended to the 21 LSBs of issued address to form a 56-bit NOC address. The 2MB TLB #n corresponds to the 2MB MMIO range starting at (0x9C00000 + N*0x200000)."] -// x_end : [ 0, 20, 15, "" ] -// y_end : [ 0, 26, 21, "" ] -// x_start : [ 0, 32, 27, "" ] -// y_start : [ 0, 38, 33, "" ] -// noc_sel: [ 0, 39, 39, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 40, 40, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 42, 41, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 43, 43, "linked"] - -// local_offset: [ 0, 11, 0, "32-bit address prefix, prepended to the 24 LSBs of issued address to form a 56-bit NOC address. The 16MB TLB #n corresponds to the 16MB MMIO range starting at (0xB000000 + N*0x1000000)."] -// x_end : [ 0, 17, 12, "" ] -// y_end : [ 0, 23, 18, "" ] -// x_start : [ 0, 29, 24, "" ] -// y_start : [ 0, 35, 30, "" ] -// noc_sel: [ 0, 36, 36, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 37, 37, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 39, 38, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 40, 40, "linked"] - -const auto TLB_1M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 16, - .y_end = 22, - .x_start = 28, - .y_start = 34, - .noc_sel = 40, - .mcast = 41, - .ordering = 42, - .linked = 44, - .static_vc = 45, - .static_vc_end = 46 -}; - -const auto TLB_2M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 15, - .y_end = 21, - .x_start = 27, - .y_start = 33, - .noc_sel = 39, - .mcast = 40, - .ordering = 41, - .linked = 43, - .static_vc = 44, - .static_vc_end = 45 -}; - -const auto TLB_16M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 12, - .y_end = 18, - .x_start = 24, - .y_start = 30, - .noc_sel = 36, - .mcast = 37, - .ordering = 38, - .linked = 40, - .static_vc = 41, - .static_vc_end = 42 -}; diff --git a/device/wormhole_implementation.cpp b/device/wormhole/wormhole_implementation.cpp similarity index 98% rename from device/wormhole_implementation.cpp rename to device/wormhole/wormhole_implementation.cpp index 9295e2de..96722311 100644 --- a/device/wormhole_implementation.cpp +++ b/device/wormhole/wormhole_implementation.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "device/wormhole_implementation.h" +#include "wormhole_implementation.h" namespace tt::umd { diff --git a/device/wormhole_implementation.h b/device/wormhole/wormhole_implementation.h similarity index 100% rename from device/wormhole_implementation.h rename to device/wormhole/wormhole_implementation.h diff --git a/tests/blackhole/test_silicon_driver_bh.cpp b/tests/blackhole/test_silicon_driver_bh.cpp index d6c938aa..23816841 100644 --- a/tests/blackhole/test_silicon_driver_bh.cpp +++ b/tests/blackhole/test_silicon_driver_bh.cpp @@ -12,7 +12,7 @@ #include #include -#include "device/blackhole_implementation.h" +#include 
"device/blackhole/blackhole_implementation.h" #include "device/tt_cluster_descriptor.h" #include "tests/test_utils/generate_cluster_desc.hpp" diff --git a/tests/emulation/test_emulation_device.cpp b/tests/emulation/test_emulation_device.cpp index e54fa8f0..aef96112 100644 --- a/tests/emulation/test_emulation_device.cpp +++ b/tests/emulation/test_emulation_device.cpp @@ -3,6 +3,8 @@ #include "device/tt_device.h" #include "device/tt_emulation_device.h" +// DEPRECATED TEST SUITE !!! + TEST(EmulationDeviceGS, BasicEmuTest) { tt_emulation_device device = tt_emulation_device("../../tests/soc_descs/grayskull_10x12.yaml"); tt_device_params default_params; diff --git a/tests/galaxy/test_umd_remote_api_stability.cpp b/tests/galaxy/test_umd_remote_api_stability.cpp index f6bd28e8..ecf99862 100644 --- a/tests/galaxy/test_umd_remote_api_stability.cpp +++ b/tests/galaxy/test_umd_remote_api_stability.cpp @@ -76,13 +76,11 @@ TEST_F(WormholeGalaxyStabilityTestFixture, MixedRemoteTransfers) { 100000 * scale_number_of_tests, seed, - transfer_type_weights_t{.write = 0.40, .rolled_write = 0.2, .read = 0.4, .epoch_cmd_write = 0.0}, + transfer_type_weights_t{.write = 0.40, .read = 0.4}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 30000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 30000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 30000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -108,13 +106,11 @@ TEST_F(WormholeGalaxyStabilityTestFixture, DISABLED_MultithreadedMixedRemoteTran 50000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0.50, .rolled_write = 0., .read = 0.50, .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 0.50, .read = 0.50}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 30000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 30000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 30000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -129,13 +125,11 @@ TEST_F(WormholeGalaxyStabilityTestFixture, DISABLED_MultithreadedMixedRemoteTran 50000 * scale_number_of_tests, 100, - transfer_type_weights_t{.write = 0.25, .rolled_write = 0.25, .read = 0.50, .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 0.25, .read = 0.50}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 30000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 30000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 
0.75, std::uniform_int_distribution(0x4, 30000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -150,13 +144,11 @@ TEST_F(WormholeGalaxyStabilityTestFixture, DISABLED_MultithreadedMixedRemoteTran 50000 * scale_number_of_tests, 23, - transfer_type_weights_t{.write = 0.5, .rolled_write = 0.25, .read = 0.25, .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 0.5, .read = 0.25}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 30000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 30000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 30000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -171,13 +163,11 @@ TEST_F(WormholeGalaxyStabilityTestFixture, DISABLED_MultithreadedMixedRemoteTran 100000 * scale_number_of_tests, 99, - transfer_type_weights_t{.write = 0.1, .rolled_write = 0, .read = 0.1, .epoch_cmd_write = 0.8}, + transfer_type_weights_t{.write = 0.1, .read = 0.1}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, diff --git a/tests/grayskull/test_silicon_driver.cpp b/tests/grayskull/test_silicon_driver.cpp index d8324f13..d890d8a9 100644 --- a/tests/grayskull/test_silicon_driver.cpp +++ b/tests/grayskull/test_silicon_driver.cpp @@ -7,7 +7,7 @@ #include "gtest/gtest.h" #include "tt_device.h" #include "device/tt_soc_descriptor.h" -#include "device/wormhole_implementation.h" +#include "device/wormhole/wormhole_implementation.h" #include "l1_address_map.h" #include "tests/test_utils/generate_cluster_desc.hpp" diff --git a/tests/test_utils/stimulus_generators.hpp b/tests/test_utils/stimulus_generators.hpp index 094f06cb..6d35afb8 100644 --- a/tests/test_utils/stimulus_generators.hpp +++ b/tests/test_utils/stimulus_generators.hpp @@ -36,7 +36,7 @@ namespace tt::umd::test::utils { static const std::string SOC_DESC_PATH = "tests/soc_descs/wormhole_b0_8x10.yaml"; -enum RemoteTransferType : uint8_t { WRITE = 0, ROLLED_WRITE, READ, EPOCH_CMD_WRITE }; +enum RemoteTransferType : uint8_t { WRITE = 0, READ }; template < typename SAMPLE_T, @@ -102,14 +102,6 @@ struct write_transfer_sample_t { std::string tlb_to_use; // (payload.data(), size, destination, address, tlb_to_use, false, false); }; -struct rolled_write_transfer_sample_t { - destination_t destination; - address_t address; - transfer_size_t size_in_bytes; - int unroll_count; - std::string tlb_to_use; - // (payload, 2, destination, address, tlb_to_use); -}; struct read_transfer_sample_t { destination_t destination; address_t address; @@ -117,17 +109,8 @@ struct read_transfer_sample_t { std::string tlb_to_use; // (payload.data(), destination, address, 
size, tlb_to_use); }; -struct write_epoch_cmd_sample_t { - destination_t destination; - address_t address; - transfer_size_t size_in_bytes; - std::string tlb_to_use; - bool last_epoch_command; - bool ordered_with_prev_remote_write; - // (payload.data(), size, destination, address, tlb_to_use, last_epoch_command, ordered_with_prev_remote_write); -}; -using remote_transfer_sample_t = std::tuple>; +using remote_transfer_sample_t = std::tuple>; template < template @@ -267,25 +250,6 @@ template < template class WRITE_SIZE_DISTR_T, - template - class WRITE_EPOCH_CMD_DEST_DISTR_T, - template - class WRITE_EPOCH_CMD_ADDR_DISTR_T, - template - class WRITE_EPOCH_CMD_SIZE_DISTR_T, - class WRITE_EPOCH_CMD_LAST_CMD_DISTR_T, - class WRITE_EPOCH_CMD_ORDERED_DISTR_T, - - template - class ROLLED_WRITE_DEST_DISTR_T, - template - class ROLLED_WRITE_ADDR_DISTR_T, - class ROLLED_WRITE_SIZE_DISTR_OUT_T, - template - class ROLLED_WRITE_SIZE_DISTR_T, - template - class ROLLED_WRITE_UNROLL_DISTR_T, - template class READ_DEST_DISTR_T, template @@ -299,8 +263,6 @@ class TestGenerator { using transfer_type_generator_t = DefaultTransferTypeGenerator; // ConstrainedTemplateTemplateGenerator; using write_command_generator_t = WriteCommandGenerator; - using write_epoch_cmd_command_generator_t = WriteEpochCmdCommandGenerator; - using rolled_write_command_generator_t = RolledWriteCommandGenerator; using read_command_generator_t = ReadCommandGenerator; public: @@ -308,14 +270,10 @@ class TestGenerator { int seed, transfer_type_generator_t const& transfer_type_distribution, write_command_generator_t const& write_command_generator, - rolled_write_command_generator_t const& rolled_write_command_generator, - write_epoch_cmd_command_generator_t const& write_epoch_cmd_command_generator, read_command_generator_t const& read_command_generator) : generator(seed), transfer_type_distribution(transfer_type_distribution), write_command_generator(write_command_generator), - rolled_write_command_generator(rolled_write_command_generator), - write_epoch_cmd_command_generator(write_epoch_cmd_command_generator), read_command_generator(read_command_generator) { } @@ -338,34 +296,6 @@ class TestGenerator { .tlb_to_use = "LARGE_WRITE_TLB"}}; } break; - case RemoteTransferType::ROLLED_WRITE: { - destination_t const& destination = rolled_write_command_generator.destination_generator.generate(); - address_t const& address = rolled_write_command_generator.address_generator.generate(); - transfer_size_t const& size_in_bytes = rolled_write_command_generator.size_generator.generate(); - int unroll_count = rolled_write_command_generator.unroll_generator.generate(); - return {transfer_type, rolled_write_transfer_sample_t{ - .destination = destination, - .address = address, - .size_in_bytes = size_in_bytes, - .unroll_count = unroll_count, - .tlb_to_use = "LARGE_WRITE_TLB"}}; - } break; - - case RemoteTransferType::EPOCH_CMD_WRITE: { - destination_t const& destination = write_epoch_cmd_command_generator.destination_generator.generate(); - address_t const& address = write_epoch_cmd_command_generator.address_generator.generate(); - transfer_size_t const& size_in_bytes = write_epoch_cmd_command_generator.size_generator.generate(); - bool last_epoch_cmd = write_epoch_cmd_command_generator.last_cmd_generator.generate(); - bool ordered_with_prev_remote_write = write_epoch_cmd_command_generator.ordered_generator.generate(); - return {transfer_type, write_epoch_cmd_sample_t{ - .destination = destination, - .address = address, - .size_in_bytes = size_in_bytes, - 
.tlb_to_use = "LARGE_WRITE_TLB", - .last_epoch_command = last_epoch_cmd, - .ordered_with_prev_remote_write = ordered_with_prev_remote_write}}; - } break; - case RemoteTransferType::READ: { destination_t const& destination = read_command_generator.destination_generator.generate(); address_t const& address = read_command_generator.address_generator.generate(); @@ -388,22 +318,17 @@ class TestGenerator { transfer_type_generator_t transfer_type_distribution; write_command_generator_t write_command_generator; - rolled_write_command_generator_t rolled_write_command_generator; - write_epoch_cmd_command_generator_t write_epoch_cmd_command_generator; read_command_generator_t read_command_generator; }; struct transfer_type_weights_t { double write; - double rolled_write; double read; - double epoch_cmd_write; }; static auto address_aligner = [](address_t addr) -> address_t { addr = (((addr - 1) / 32) + 1) * 32; assert(addr % 32 == 0); return addr;}; static auto transfer_size_aligner = [](transfer_size_t size) -> transfer_size_t { size = (((size - 1) / 4) + 1) * 4; assert(size > 0); assert(size % 4 == 0); return size; }; -static auto rolled_write_transfer_size_aligner = [](transfer_size_t size) -> transfer_size_t { size = (((size - 1) / 32) + 1) * 32; assert(size > 0); return size;}; static auto address_aligner_32B = [](transfer_size_t size) -> transfer_size_t { size = (((size - 1) / 32) + 1) * 32; assert(size > 0); return size;}; static auto size_aligner_32B = [](transfer_size_t size) -> transfer_size_t { size = (((size - 1) / 32) + 1) * 32; assert(size > 0); return size;}; template @@ -433,28 +358,12 @@ static void print_command(remote_transfer_sample_t const& command) { << ", y=" << command_args.destination.y << ", x=" << command_args.destination.x << "), address: " << command_args.address << ", size_in_bytes: " << command_args.size_in_bytes << std::endl; } break; - case RemoteTransferType::ROLLED_WRITE: { - rolled_write_transfer_sample_t const& command_args = - std::get(std::get<1>(command)); - std::cout << "Transfer type: ROLLED_WRITE, destination: (c=" << command_args.destination.chip - << ", y=" << command_args.destination.y << ", x=" << command_args.destination.x - << "), address: " << command_args.address << ", size_in_bytes: " << command_args.size_in_bytes - << ", unroll_count: " << command_args.unroll_count << std::endl; - } break; case RemoteTransferType::READ: { read_transfer_sample_t const& command_args = std::get(std::get<1>(command)); std::cout << "Transfer type: READ, destination: (c=" << command_args.destination.chip << ", y=" << command_args.destination.y << ", x=" << command_args.destination.x << "), address: " << command_args.address << ", size_in_bytes: " << command_args.size_in_bytes << std::endl; } break; - case RemoteTransferType::EPOCH_CMD_WRITE: { - write_epoch_cmd_sample_t const& command_args = std::get(std::get<1>(command)); - std::cout << "Transfer type: EPOCH_CMD_WRITE, destination: (c=" << command_args.destination.chip - << ", y=" << command_args.destination.y << ", x=" << command_args.destination.x - << "), address: " << command_args.address << ", size_in_bytes: " << command_args.size_in_bytes - << ", last_cmd: " << (command_args.last_epoch_command ? " True" : "False") - << ", ordered_w_prev_remote_write: " << (command_args.ordered_with_prev_remote_write ? 
" True" : "False") << std::endl; - } break; default: throw std::runtime_error("Invalid transfer type"); }; } @@ -479,14 +388,7 @@ static inline void dispatch_remote_transfer_command( write_transfer_sample_t const& command_args = std::get(std::get<1>(command)); assert(command_args.size_in_bytes >= sizeof(uint32_t)); resize_payload(payload,command_args.size_in_bytes); - driver.write_to_device(payload.data(), bytes_to_words(command_args.size_in_bytes), command_args.destination, command_args.address, command_args. - tlb_to_use, false, false); - } break; - case RemoteTransferType::ROLLED_WRITE: { - rolled_write_transfer_sample_t const& command_args = std::get(std::get<1>(command)); - assert(command_args.size_in_bytes >= sizeof(uint32_t)); - resize_payload(payload,command_args.size_in_bytes); - driver.rolled_write_to_device(payload, command_args.unroll_count, command_args.destination, command_args.address, command_args.tlb_to_use); + driver.write_to_device(payload.data(), bytes_to_words(command_args.size_in_bytes), command_args.destination, command_args.address, command_args.tlb_to_use); } break; case RemoteTransferType::READ: { read_transfer_sample_t const& command_args = std::get(std::get<1>(command)); @@ -494,12 +396,6 @@ static inline void dispatch_remote_transfer_command( resize_payload(payload,command_args.size_in_bytes); driver.read_from_device(payload.data(), command_args.destination, command_args.address, command_args.size_in_bytes, command_args.tlb_to_use); } break; - case RemoteTransferType::EPOCH_CMD_WRITE: { - write_epoch_cmd_sample_t const& command_args = std::get(std::get<1>(command)); - assert(command_args.size_in_bytes >= sizeof(uint32_t)); - resize_payload(payload,command_args.size_in_bytes); - driver.write_epoch_cmd_to_device(payload.data(), bytes_to_words(command_args.size_in_bytes), command_args.destination, command_args.address, command_args.tlb_to_use, command_args.last_epoch_command, command_args.ordered_with_prev_remote_write); - } break; default: throw std::runtime_error("Invalid transfer type"); }; @@ -524,16 +420,9 @@ static void print_command_executable_code(remote_transfer_sample_t const& comman std::cout << "assert(" << command_args.size_in_bytes << " >= sizeof(uint32_t));" << std::endl; emit_bytes_to_words_len_string("len", command_args.size_in_bytes, sizeof(uint32_t)); emit_payload_resize_string(command_args.size_in_bytes, sizeof(uint32_t)); - std::cout << "device->write_to_device(payload.data(), len, destination, " << command_args.address << ", \"" << command_args.tlb_to_use << "\", false, false);" << std::endl; + std::cout << "device->write_to_device(payload.data(), len, destination, " << command_args.address << ", \"" << command_args.tlb_to_use << "\");" << std::endl; // driver.write_to_device(payload.data(), command_args.size, command_args.destination, command_args.address, command_args.tlb_to_use, false, false); } break; - case RemoteTransferType::ROLLED_WRITE: { - rolled_write_transfer_sample_t const& command_args = std::get(std::get<1>(command)); - std::cout << "tt_cxy_pair const& destination = tt_cxy_pair(" << command_args.destination.chip << ", " << command_args.destination.x << ", " << command_args.destination.y << ");" << std::endl; - emit_payload_resize_string(command_args.size_in_bytes, sizeof(uint32_t)); - std::cout << "device->rolled_write_to_device(payload, " << command_args.unroll_count << ", destination, " << command_args.address << ", \"" << command_args.tlb_to_use << "\");" << std::endl; - // driver.rolled_write_to_device(payload, 
command_args.unroll_count, command_args.destination, command_args.address, command_args.tlb_to_use); - } break; case RemoteTransferType::READ: { read_transfer_sample_t const& command_args = std::get(std::get<1>(command)); std::cout << "tt_cxy_pair const& destination = tt_cxy_pair(" << command_args.destination.chip << ", " << command_args.destination.x << ", " << command_args.destination.y << ");" << std::endl; @@ -541,15 +430,6 @@ static void print_command_executable_code(remote_transfer_sample_t const& comman std::cout << "device->read_from_device(payload.data(), destination, " << command_args.address << ", " << command_args.size_in_bytes << ", \"" << command_args.tlb_to_use << "\");" << std::endl; // driver.read_from_device(payload.data(), command_args.destination, command_args.address, command_args.size, command_args.tlb_to_use); } break; - case RemoteTransferType::EPOCH_CMD_WRITE: { - write_epoch_cmd_sample_t const& command_args = std::get(std::get<1>(command)); - std::cout << "tt_cxy_pair const& destination = tt_cxy_pair(" << command_args.destination.chip << ", " << command_args.destination.x << ", " << command_args.destination.y << ");" << std::endl; - emit_payload_resize_string(command_args.size_in_bytes, sizeof(uint32_t)); - emit_bytes_to_words_len_string("len", command_args.size_in_bytes, sizeof(uint32_t)); - std::cout << "device->write_epoch_cmd_to_device(payload.data(), len, destination, " << command_args.address << ", \"" << command_args.tlb_to_use << "\", " << (command_args.last_epoch_command ? "true":"false") - << "\", " << (command_args.ordered_with_prev_remote_write ? "true":"false") << ");" << std::endl; - // driver.write_epoch_cmd_to_device(payload.data(), command_args.size, command_args.destination, command_args.address, command_args.tlb_to_use, command_args.last_epoch_command, command_args.ordered_with_prev_remote_write); - } break; default: throw std::runtime_error("Invalid transfer type"); }; @@ -572,18 +452,6 @@ template< template class WRITE_ADDR_DISTR_T, class WRITE_SIZE_DISTR_OUT_T, template class WRITE_SIZE_DISTR_T, - - template class ROLLED_WRITE_DEST_DISTR_T, - template class ROLLED_WRITE_ADDR_DISTR_T, - class ROLLED_WRITE_SIZE_DISTR_OUT_T, - template class ROLLED_WRITE_SIZE_DISTR_T, - template class ROLLED_WRITE_UNROLL_COUNT_DISTR_T, - - template class WRITE_EPOCH_CMD_DEST_DISTR_T, - template class WRITE_EPOCH_CMD_ADDR_DISTR_T, - template class WRITE_EPOCH_CMD_SIZE_DISTR_T, - class WRITE_EPOCH_CMD_LAST_CMD_DISTR_T, - class WRITE_EPOCH_CMD_ORDERED_DISTR_T, template class READ_DEST_DISTR_T, template class READ_ADDR_DISTR_T, @@ -598,8 +466,6 @@ void RunMixedTransfers( transfer_type_weights_t const& transfer_type_weights, WriteCommandGenerator const& write_command_generator, - RolledWriteCommandGenerator const& rolled_write_command_generator, - WriteEpochCmdCommandGenerator const& write_epoch_cmd_command_generator, ReadCommandGenerator const& read_command_generator, bool record_command_history = false, @@ -609,14 +475,12 @@ void RunMixedTransfers( auto test_generator = TestGenerator( seed, {seed, - {transfer_type_weights.write, transfer_type_weights.rolled_write, transfer_type_weights.read, transfer_type_weights.epoch_cmd_write}, + {transfer_type_weights.write, transfer_type_weights.read}, [](int transfer_type) -> RemoteTransferType { assert(transfer_type < 4); return static_cast(transfer_type); }}, write_command_generator, - rolled_write_command_generator, - write_epoch_cmd_command_generator, read_command_generator); if (record_command_history) { @@ -663,58 
+527,6 @@ static ConstrainedTemplateTemplateGenerator destination_t { return core_index_to_location.at(dest); }); } - -static RolledWriteCommandGenerator < - std::uniform_int_distribution, - std::uniform_int_distribution, - transfer_size_t, - std::uniform_int_distribution, - std::uniform_int_distribution -> - build_dummy_rolled_write_command_generator(tt_SiliconDevice &device) { - tt_ClusterDescriptor *cluster_desc = device.get_cluster_description(); - tt_SocDescriptor const& soc_desc = device.get_virtual_soc_descriptors().at(0); - std::vector core_index_to_location = generate_core_index_locations(*cluster_desc, soc_desc); - auto dest_generator = ConstrainedTemplateTemplateGenerator( - 0, - std::uniform_int_distribution(0, core_index_to_location.size() - 1), - [core_index_to_location](int dest) -> destination_t { return core_index_to_location.at(dest); }); - auto addr_generator_32B_aligned = ConstrainedTemplateTemplateGenerator(0, std::uniform_int_distribution(0,0), address_aligner_32B); - auto rolled_write_size_generator = ConstrainedTemplateTemplateGenerator( - 0, std::uniform_int_distribution(0,0), rolled_write_transfer_size_aligner); - auto unroll_count_generator = ConstrainedTemplateTemplateGenerator( - 0, std::uniform_int_distribution(0,0), [](int unroll_count) -> int { return unroll_count; }); - - return RolledWriteCommandGenerator( - dest_generator, addr_generator_32B_aligned, rolled_write_size_generator, unroll_count_generator); -} - -static WriteEpochCmdCommandGenerator < - std::uniform_int_distribution, - std::uniform_int_distribution, - std::uniform_int_distribution, - std::bernoulli_distribution, - std::bernoulli_distribution -> build_dummy_write_epoch_cmd_command_generator(tt_SiliconDevice &device) { - tt_ClusterDescriptor *cluster_desc = device.get_cluster_description(); - tt_SocDescriptor const& soc_desc = device.get_virtual_soc_descriptors().at(0); - std::vector core_index_to_location = generate_core_index_locations(*cluster_desc, soc_desc); - auto dest_generator = ConstrainedTemplateTemplateGenerator( - 0, - std::uniform_int_distribution(0, core_index_to_location.size() - 1), - [core_index_to_location](int dest) -> destination_t { return core_index_to_location.at(dest); }); - auto addr_generator_32B_aligned = ConstrainedTemplateTemplateGenerator(0, std::uniform_int_distribution(0,0), address_aligner_32B); - auto write_epoch_cmd_generator = ConstrainedTemplateTemplateGenerator( - 0, std::uniform_int_distribution(0,0), transfer_size_aligner); - auto last_epoch_cmd_generator = ConstrainedTemplateGenerator( - 0, std::bernoulli_distribution(1), [](bool last_epoch_cmd) -> bool { return last_epoch_cmd; }); - auto ordered_generator = ConstrainedTemplateGenerator( - 0, std::bernoulli_distribution(1), [](bool ordered_with_prev_remote_write) -> bool { return ordered_with_prev_remote_write; }); - - return WriteEpochCmdCommandGenerator( - dest_generator, addr_generator_32B_aligned, write_epoch_cmd_generator, last_epoch_cmd_generator, ordered_generator); -} - static WriteCommandGenerator< std::uniform_int_distribution, std::uniform_int_distribution, @@ -764,10 +576,6 @@ template< template class WRITE_SIZE_GENERATOR_T, template - class ROLLED_WRITE_SIZE_GENERATOR_T, - template - class WRITE_EPOCH_CMD_SIZE_GENERATOR_T, - template class READ_SIZE_GENERATOR_T, template class UNROLL_COUNT_GENERATOR_T @@ -780,9 +588,7 @@ void RunMixedTransfersUniformDistributions( transfer_type_weights_t const& transfer_type_weights, ADDR_GENERATOR_T const& address_distribution, WRITE_SIZE_GENERATOR_T const& 
write_size_distribution, - ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution, - WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, float percent_not_last_epoch_cmd, float percent_not_remote_ordered, READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -802,12 +608,8 @@ void RunMixedTransfersUniformDistributions( auto addr_generator_32B_aligned = ConstrainedTemplateTemplateGenerator(seed + 1, address_distribution, address_aligner_32B); auto write_size_generator = ConstrainedTemplateTemplateGenerator( seed + 2, write_size_distribution, transfer_size_aligner); - auto rolled_write_size_generator = ConstrainedTemplateTemplateGenerator( - seed + 2, rolled_write_size_distribution, rolled_write_transfer_size_aligner); auto read_size_generator = ConstrainedTemplateTemplateGenerator( seed + 2, read_size_distribution, transfer_size_aligner); - auto write_epoch_cmd_generator = ConstrainedTemplateTemplateGenerator( - seed + 2, write_epoch_cmd_size_distribution, transfer_size_aligner); auto last_epoch_cmd_generator = ConstrainedTemplateGenerator( seed + 3, std::bernoulli_distribution(percent_not_last_epoch_cmd), [](bool last_epoch_cmd) -> bool { return last_epoch_cmd; }); auto ordered_generator = ConstrainedTemplateGenerator( @@ -823,9 +625,6 @@ void RunMixedTransfersUniformDistributions( transfer_type_weights, WriteCommandGenerator(dest_generator, addr_generator, write_size_generator), - RolledWriteCommandGenerator(dest_generator, addr_generator_32B_aligned, rolled_write_size_generator, unroll_count_generator), - WriteEpochCmdCommandGenerator( - dest_generator, addr_generator_32B_aligned, write_epoch_cmd_generator, last_epoch_cmd_generator, ordered_generator), ReadCommandGenerator(dest_generator, addr_generator, read_size_generator), record_command_history, diff --git a/tests/wormhole/test_silicon_driver_wh.cpp b/tests/wormhole/test_silicon_driver_wh.cpp index df686dfa..6551b3cc 100644 --- a/tests/wormhole/test_silicon_driver_wh.cpp +++ b/tests/wormhole/test_silicon_driver_wh.cpp @@ -13,7 +13,7 @@ #include "host_mem_address_map.h" #include "device/tt_cluster_descriptor.h" -#include "device/wormhole_implementation.h" +#include "device/wormhole/wormhole_implementation.h" #include "tests/test_utils/generate_cluster_desc.hpp" void set_params_for_remote_txn(tt_SiliconDevice& device) { diff --git a/tests/wormhole/test_umd_remote_api_stability.cpp b/tests/wormhole/test_umd_remote_api_stability.cpp index 36c02914..96fef09a 100644 --- a/tests/wormhole/test_umd_remote_api_stability.cpp +++ b/tests/wormhole/test_umd_remote_api_stability.cpp @@ -73,13 +73,11 @@ TEST_F(WormholeNebulaX2TestFixture, MixedRemoteTransfersMediumSmall) { 100000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0.25, .rolled_write = 0.25, .read = 0.25, .epoch_cmd_write = 0.25}, + transfer_type_weights_t{.write = 0.25, .read = 0.25}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& 
diff --git a/tests/wormhole/test_umd_remote_api_stability.cpp b/tests/wormhole/test_umd_remote_api_stability.cpp
index 36c02914..96fef09a 100644
--- a/tests/wormhole/test_umd_remote_api_stability.cpp
+++ b/tests/wormhole/test_umd_remote_api_stability.cpp
@@ -73,13 +73,11 @@ TEST_F(WormholeNebulaX2TestFixture, MixedRemoteTransfersMediumSmall) {
         100000 * scale_number_of_tests,
         0,
-        transfer_type_weights_t{.write = 0.25, .rolled_write = 0.25, .read = 0.25, .epoch_cmd_write = 0.25},
+        transfer_type_weights_t{.write = 0.25, .read = 0.25},
         std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution
         std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution,
-        std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution,
         std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution
-        std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution,
         0.75,
         0.75,
         std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution,
@@ -108,13 +106,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersMediumSmall
         100000 * scale_number_of_tests,
         0,
-        transfer_type_weights_t{.write = 0.50, .rolled_write = 0., .read = 0.50, .epoch_cmd_write = 0.},
+        transfer_type_weights_t{.write = 0.50, .read = 0.50},
         std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution
         std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution,
-        std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution,
         std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution
-        std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution,
         0.75,
         0.75,
         std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution,
@@ -129,13 +125,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersMediumSmall
         100000 * scale_number_of_tests,
         100,
-        transfer_type_weights_t{.write = 0.25, .rolled_write = 0.25, .read = 0.50, .epoch_cmd_write = 0.},
+        transfer_type_weights_t{.write = 0.25, .read = 0.50},
         std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution
         std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution,
-        std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution,
         std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution
-        std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution,
         0.75,
         0.75,
         std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution,
@@ -150,13 +144,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersMediumSmall
         100000 * scale_number_of_tests,
         23,
-        transfer_type_weights_t{.write = 0.5, .rolled_write = 0.25, .read = 0.25, .epoch_cmd_write = 0.},
+        transfer_type_weights_t{.write = 0.5, .read = 0.25},
         std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution
         std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution,
-        std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution,
         std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution
-        std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution,
         0.75,
         0.75,
         std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution,
@@ -171,13 +163,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersMediumSmall
         100000 * scale_number_of_tests,
         99,
-        transfer_type_weights_t{.write = 1.0, .rolled_write = 0, .read = 0.0, .epoch_cmd_write = 0.0},
+        transfer_type_weights_t{.write = 1.0, .read = 0.0},
         std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution
         std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution,
-        std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution,
         std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution
-        std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution,
         0.75,
         0.75,
         std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution,
@@ -206,13 +196,11 @@ TEST_F(WormholeNebulaX2TestFixture, MixedRemoteTransfersLarge) {
         10000 * scale_number_of_tests,
         0,
-        transfer_type_weights_t{.write = 0.15, .rolled_write = 0, .read = 0.15, .epoch_cmd_write = 0.7},
+        transfer_type_weights_t{.write = 0.15, .read = 0.15},
         std::uniform_int_distribution(0x10000, 0x200000), // address generator distribution
         std::uniform_int_distribution(0x4, 300000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution,
-        std::uniform_int_distribution(0x4, 300000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution,
         std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution
-        std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution,
         0.75,
         0.75,
         std::uniform_int_distribution(0x4, 300000), //READ_SIZE_GENERATOR_T const& read_size_distribution,
@@ -247,11 +235,9 @@ TEST_F(WormholeNebulaX2TestFixture, WritesOnlyNormalDistributionMean10kStd3kMinS
         10000 * scale_number_of_tests,
         0,
-        transfer_type_weights_t{.write = 1., .rolled_write = 0., .read = 0., .epoch_cmd_write = 0.},
+        transfer_type_weights_t{.write = 1., .read = 0.},
         WriteCommandGenerator(dest_generator, address_generator, write_size_generator),
-        build_dummy_rolled_write_command_generator(*device),
-        build_dummy_write_epoch_cmd_command_generator(*device),
         build_dummy_read_command_generator(*device),
         false, // Set to true if you want to emit the command history code to command line
@@ -279,13 +265,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLMS) {
         100000 * scale_number_of_tests,
         0,
-        transfer_type_weights_t{.write = 0.50, .rolled_write = 0., .read = 0.50, .epoch_cmd_write = 0.},
+        transfer_type_weights_t{.write = 0.50, .read = 0.50},
         std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution
         std::uniform_int_distribution(4, 300000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution,
-        std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution,
         std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution
-        std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution,
         0.75,
         0.75,
         std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution,
@@ -300,13 +284,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLMS) {
         100000 * scale_number_of_tests,
         100,
-        transfer_type_weights_t{.write = 0.25, .rolled_write = 0.25, .read = 0.50, .epoch_cmd_write = 0.},
+        transfer_type_weights_t{.write = 0.25, .read = 0.50},
         std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution
         std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution,
-        std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution,
         std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution
-        std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution,
         0.75,
         0.75,
         std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution,
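One thing the surviving hunks make visible: the weights are relative rather than normalized (.write = 0.25 vs .read = 0.50 above). Assuming transfer_type_weights_t now carries only the two remaining members — the struct's own definition is presumably changed elsewhere in this patch and is not shown in this excerpt — the selection the weights drive can be pictured with std::discrete_distribution. This is an illustration of the semantics only, not the harness's actual selection code:

#include <iostream>
#include <random>

// Sketch only: a two-member weights struct matching the designated
// initializers used by the tests; the real definition is not in this excerpt.
struct transfer_type_weights_t {
    double write = 0.;
    double read = 0.;
};

int main() {
    transfer_type_weights_t w{.write = 0.25, .read = 0.50};  // weights from the seed-100 hunk above
    std::mt19937 rng(100);                                   // the tests seed each thread explicitly
    std::discrete_distribution<int> pick({w.write, w.read}); // normalizes the relative weights
    int writes = 0, reads = 0;
    for (int i = 0; i < 90000; ++i) (pick(rng) == 0 ? writes : reads)++;
    std::cout << "writes=" << writes << " reads=" << reads << "\n";  // roughly 1:2
    return 0;
}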
@@ -321,13 +303,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLMS) {
         100000 * scale_number_of_tests,
         23,
-        transfer_type_weights_t{.write = 0.5, .rolled_write = 0.25, .read = 0.25, .epoch_cmd_write = 0.},
+        transfer_type_weights_t{.write = 0.5, .read = 0.25},
         std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution
         std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution,
-        std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution,
         std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution
-        std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution,
         0.75,
         0.75,
         std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution,
@@ -342,13 +322,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLMS) {
         100000 * scale_number_of_tests,
         99,
-        transfer_type_weights_t{.write = 1.0, .rolled_write = 0, .read = 0.0, .epoch_cmd_write = 0.0},
+        transfer_type_weights_t{.write = 1.0, .read = 0.0},
         std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution
         std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution,
-        std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution,
         std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution
-        std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution,
         0.75,
         0.75,
         std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution,
@@ -387,11 +365,9 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLargeWrites
         10000 * scale_number_of_tests,
         0,
-        transfer_type_weights_t{.write = 1., .rolled_write = 0., .read = 0., .epoch_cmd_write = 0.},
+        transfer_type_weights_t{.write = 1., .read = 0.},
         WriteCommandGenerator(dest_generator, address_generator, write_size_generator),
-        build_dummy_rolled_write_command_generator(*device),
-        build_dummy_write_epoch_cmd_command_generator(*device),
         build_dummy_read_command_generator(*device),
         false, // Set to true if you want to emit the command history code to command line
@@ -404,11 +380,9 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLargeWrites
         10000 * scale_number_of_tests,
         0,
-        transfer_type_weights_t{.write = 1., .rolled_write = 0., .read = 0., .epoch_cmd_write = 0.},
+        transfer_type_weights_t{.write = 1., .read = 0.},
         WriteCommandGenerator(dest_generator, address_generator, write_size_generator),
-        build_dummy_rolled_write_command_generator(*device),
-        build_dummy_write_epoch_cmd_command_generator(*device),
         build_dummy_read_command_generator(*device),
         false, // Set to true if you want to emit the command history code to command line
@@ -421,11 +395,9 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLargeWrites
         10000 * scale_number_of_tests,
         0,
-        transfer_type_weights_t{.write = 0, .rolled_write = 0., .read = 1., .epoch_cmd_write = 0.},
+        transfer_type_weights_t{.write = 0, .read = 1.},
         build_dummy_write_command_generator(*device),
-        build_dummy_rolled_write_command_generator(*device),
-        build_dummy_write_epoch_cmd_command_generator(*device),
         ReadCommandGenerator(dest_generator, address_generator, read_size_generator),
         false, // Set to true if you want to emit the command history code to command line
@@ -438,11 +410,9 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLargeWrites
         10000 * scale_number_of_tests,
         0,
-        transfer_type_weights_t{.write = 0, .rolled_write = 0., .read = 1., .epoch_cmd_write = 0.},
+        transfer_type_weights_t{.write = 0, .read = 1.},
         build_dummy_write_command_generator(*device),
-        build_dummy_rolled_write_command_generator(*device),
-        build_dummy_write_epoch_cmd_command_generator(*device),
         ReadCommandGenerator(dest_generator, address_generator, read_size_generator),
         false, // Set to true if you want to emit the command history code to command line
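The two bare 0.75 arguments that survive every call are percent_not_last_epoch_cmd and percent_not_remote_ordered; the @@ -802,12 +608,8 @@ hunk keeps last_epoch_cmd_generator, a std::bernoulli_distribution over percent_not_last_epoch_cmd, as unchanged context, so those probabilities are still drawn after this change. A standalone illustration of that draw, with names copied from the hunk and everything else assumed:

#include <iostream>
#include <random>

int main() {
    std::mt19937 rng(3);  // mirrors the `seed + 3` offset used for last_epoch_cmd_generator above
    std::bernoulli_distribution not_last_epoch_cmd(0.75);  // p = percent_not_last_epoch_cmd
    int hits = 0;
    for (int i = 0; i < 10000; ++i) hits += not_last_epoch_cmd(rng) ? 1 : 0;
    std::cout << "true fraction: " << hits / 10000.0 << "\n";  // converges near 0.75
    return 0;
}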