From e8bf964a20e7c3b6588f0a4411f486d23e3978a6 Mon Sep 17 00:00:00 2001 From: Andrew Fuller Date: Wed, 11 Dec 2024 07:28:27 -0500 Subject: [PATCH 1/3] Correct the OUTPUT of add_custom_command so CMake/Ninja know when it doesn't need to run (#392) --- device/CMakeLists.txt | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/device/CMakeLists.txt b/device/CMakeLists.txt index c86d6704..151afff0 100644 --- a/device/CMakeLists.txt +++ b/device/CMakeLists.txt @@ -1,15 +1,13 @@ set(POSITION_INDEPENDENT_CODE ON) set(FBS_FILE ${PROJECT_SOURCE_DIR}/device/simulation/tt_simulation_device.fbs) -get_filename_component(FBS_FILE_NAME ${FBS_FILE} NAME) +get_filename_component(FBS_FILE_NAME ${FBS_FILE} NAME_WLE) set(FBS_GENERATED_HEADER "${CMAKE_CURRENT_BINARY_DIR}/${FBS_FILE_NAME}_generated.h") add_custom_command( OUTPUT ${FBS_GENERATED_HEADER} COMMAND - flatc - ARGS - --cpp -o "${CMAKE_CURRENT_BINARY_DIR}/" ${FBS_FILE} + flatc --cpp -o "${CMAKE_CURRENT_BINARY_DIR}/" ${FBS_FILE} DEPENDS flatc ${FBS_FILE} From 91cc73b17f52e23a17af863c0fc9f94fd46bb6d4 Mon Sep 17 00:00:00 2001 From: Pavle Janevski <165378935+pjanevskiTT@users.noreply.github.com> Date: Wed, 11 Dec 2024 18:28:33 +0100 Subject: [PATCH 2/3] Coordinate manager internal redesign (#374) --- .../umd/device/blackhole_coordinate_manager.h | 8 +- device/api/umd/device/coordinate_manager.h | 107 ++--- .../umd/device/grayskull_coordinate_manager.h | 2 +- device/api/umd/device/tt_core_coordinates.h | 22 + .../umd/device/wormhole_coordinate_manager.h | 4 +- .../blackhole_coordinate_manager.cpp | 170 +++++--- device/coordinate_manager.cpp | 405 ++++++------------ .../grayskull_coordinate_manager.cpp | 23 +- .../wormhole/wormhole_coordinate_manager.cpp | 46 +- tests/api/test_core_coord_translation_bh.cpp | 84 ++++ tests/api/test_core_coord_translation_gs.cpp | 56 +++ tests/api/test_core_coord_translation_wh.cpp | 90 +++- 12 files changed, 567 insertions(+), 450 deletions(-) diff --git a/device/api/umd/device/blackhole_coordinate_manager.h b/device/api/umd/device/blackhole_coordinate_manager.h index 1cdfc5e5..6d5acdd3 100644 --- a/device/api/umd/device/blackhole_coordinate_manager.h +++ b/device/api/umd/device/blackhole_coordinate_manager.h @@ -29,10 +29,10 @@ class BlackholeCoordinateManager : public CoordinateManager { void translate_dram_coords() override; void translate_tensix_coords() override; - void fill_tensix_logical_to_translated() override; - void fill_eth_logical_to_translated() override; - void fill_pcie_logical_to_translated() override; - void fill_dram_logical_to_translated() override; + void fill_tensix_physical_translated_mapping() override; + void fill_eth_physical_translated_mapping() override; + void fill_pcie_physical_translated_mapping() override; + void fill_dram_physical_translated_mapping() override; private: void map_column_of_dram_banks(const size_t start_bank, const size_t end_bank, const size_t x_coord); diff --git a/device/api/umd/device/coordinate_manager.h b/device/api/umd/device/coordinate_manager.h index 81916939..4e44ae71 100644 --- a/device/api/umd/device/coordinate_manager.h +++ b/device/api/umd/device/coordinate_manager.h @@ -16,20 +16,6 @@ class CoordinateManager { public: - CoordinateManager( - const tt_xy_pair& tensix_grid_size, - const std::vector& tensix_cores, - const size_t tensix_harvesting_mask, - const tt_xy_pair& dram_grid_size, - const std::vector& dram_cores, - const size_t dram_harvesting_mask, - const tt_xy_pair& eth_grid_size, - const std::vector& eth_cores, - const tt_xy_pair& arc_grid_size, - const std::vector& arc_cores, - const tt_xy_pair& pcie_grid_size, - const std::vector& pcie_cores); - static std::shared_ptr create_coordinate_manager( tt::ARCH arch, const tt_xy_pair& tensix_grid_size, @@ -59,21 +45,35 @@ class CoordinateManager { virtual ~CoordinateManager() = default; private: - tt::umd::CoreCoord to_physical(const tt::umd::CoreCoord core_coord); - tt::umd::CoreCoord to_logical(const tt::umd::CoreCoord core_coord); - tt::umd::CoreCoord to_virtual(const tt::umd::CoreCoord core_coord); - tt::umd::CoreCoord to_translated(const tt::umd::CoreCoord core_coord); - static void assert_create_coordinate_manager( const tt::ARCH arch, const size_t tensix_harvesting_mask, const size_t dram_harvesting_mask); protected: + CoordinateManager( + const tt_xy_pair& tensix_grid_size, + const std::vector& tensix_cores, + const size_t tensix_harvesting_mask, + const tt_xy_pair& dram_grid_size, + const std::vector& dram_cores, + const size_t dram_harvesting_mask, + const tt_xy_pair& eth_grid_size, + const std::vector& eth_cores, + const tt_xy_pair& arc_grid_size, + const std::vector& arc_cores, + const tt_xy_pair& pcie_grid_size, + const std::vector& pcie_cores); + + void initialize(); + virtual void translate_tensix_coords(); virtual void translate_dram_coords(); virtual void translate_eth_coords(); virtual void translate_arc_coords(); virtual void translate_pcie_coords(); + void identity_map_physical_cores(); + void add_core_translation(const tt::umd::CoreCoord& core_coord, const tt_xy_pair& physical_pair); + /* * Fills the logical to translated mapping for the tensix cores. * By default, translated coordinates are the same as physical coordinates. @@ -81,90 +81,45 @@ class CoordinateManager { * should override this method. Wormhole and Blackhole coordinate managers * override this method to implement different mapping. */ - virtual void fill_tensix_logical_to_translated(); + virtual void fill_tensix_physical_translated_mapping(); /* - * Fills the logical to translated mapping for the ethernet cores. + * Fills the physical to translated mapping for the ethernet cores. * By default, translated coordinates are the same as physical coordinates. * Derived coordinate managers that need to implement different mapping * should override this method. Wormhole and Blackhole coordinate managers * override this method to implement different mapping. */ - virtual void fill_eth_logical_to_translated(); + virtual void fill_eth_physical_translated_mapping(); /* - * Fills the logical to translated mapping for the DRAM cores. + * Fills the physical to translated mapping for the DRAM cores. * By default, translated coordinates are the same as physical coordinates. * Derived coordinate managers that need to implement different mapping * should override this method. Blackhole coordinate manager overrides * this method to implement different mapping. */ - virtual void fill_dram_logical_to_translated(); + virtual void fill_dram_physical_translated_mapping(); /* - * Fills the logical to translated mapping for the PCIE cores. + * Fills the physical to translated mapping for the PCIE cores. * By default, translated coordinates are the same as physical coordinates. * Derived coordinate managers that need to implement different mapping * should override this method. Blackhole coordinate manager overrides * this method to implement different mapping. */ - virtual void fill_pcie_logical_to_translated(); + virtual void fill_pcie_physical_translated_mapping(); /* - * Fills the logical to translated mapping for the ARC cores. + * Fills the physical to translated mapping for the ARC cores. * By default, translated coordinates are the same as physical coordinates. * Derived coordinate managers that need to implement different mapping * should override this method. */ - virtual void fill_arc_logical_to_translated(); - - std::map tensix_logical_to_translated; - std::map tensix_logical_to_virtual; - std::map tensix_logical_to_physical; - - std::map tensix_physical_to_logical; - std::map tensix_virtual_to_logical; - std::map tensix_translated_to_logical; + virtual void fill_arc_physical_translated_mapping(); - std::map dram_logical_to_translated; - std::map dram_logical_to_virtual; - std::map dram_logical_to_physical; - - std::map dram_physical_to_logical; - std::map dram_virtual_to_logical; - std::map dram_translated_to_logical; - - std::map eth_logical_to_translated; - std::map eth_logical_to_virtual; - std::map eth_logical_to_physical; - - std::map eth_physical_to_logical; - std::map eth_virtual_to_logical; - std::map eth_translated_to_logical; - - std::map arc_logical_to_translated; - std::map arc_logical_to_virtual; - std::map arc_logical_to_physical; - - std::map arc_physical_to_logical; - std::map arc_virtual_to_logical; - std::map arc_translated_to_logical; - - std::map pcie_logical_to_translated; - std::map pcie_logical_to_virtual; - std::map pcie_logical_to_physical; - - std::map pcie_physical_to_logical; - std::map pcie_virtual_to_logical; - std::map pcie_translated_to_logical; - - std::map& get_logical_to_translated(CoreType core_type); - std::map& get_logical_to_virtual(CoreType core_type); - std::map& get_logical_to_physical(CoreType core_type); - - std::map& get_physical_to_logical(CoreType core_type); - std::map& get_virtual_to_logical(CoreType core_type); - std::map& get_translated_to_logical(CoreType core_type); + std::map to_physical_map; + std::map, tt::umd::CoreCoord> from_physical_map; const tt_xy_pair tensix_grid_size; const std::vector& tensix_cores; @@ -183,3 +138,5 @@ class CoordinateManager { const tt_xy_pair pcie_grid_size; const std::vector& pcie_cores; }; + +// friend diff --git a/device/api/umd/device/grayskull_coordinate_manager.h b/device/api/umd/device/grayskull_coordinate_manager.h index b3a49ba0..20660bd7 100644 --- a/device/api/umd/device/grayskull_coordinate_manager.h +++ b/device/api/umd/device/grayskull_coordinate_manager.h @@ -26,5 +26,5 @@ class GrayskullCoordinateManager : public CoordinateManager { const std::vector& pcie_cores); protected: - void fill_eth_logical_to_translated() override; + void fill_eth_physical_translated_mapping() override; }; diff --git a/device/api/umd/device/tt_core_coordinates.h b/device/api/umd/device/tt_core_coordinates.h index cdcec0dc..3e0b5803 100644 --- a/device/api/umd/device/tt_core_coordinates.h +++ b/device/api/umd/device/tt_core_coordinates.h @@ -58,6 +58,28 @@ struct CoreCoord : public tt_xy_pair { return this->x == other.x && this->y == other.y && this->core_type == other.core_type && this->coord_system == other.coord_system; } + + bool operator<(const CoreCoord& o) const { + if (x < o.x) { + return true; + } + if (x > o.x) { + return false; + } + if (y < o.y) { + return true; + } + if (y > o.y) { + return false; + } + if (core_type < o.core_type) { + return true; + } + if (core_type > o.core_type) { + return false; + } + return coord_system < o.coord_system; + } }; } // namespace tt::umd diff --git a/device/api/umd/device/wormhole_coordinate_manager.h b/device/api/umd/device/wormhole_coordinate_manager.h index 82b74549..488f9f23 100644 --- a/device/api/umd/device/wormhole_coordinate_manager.h +++ b/device/api/umd/device/wormhole_coordinate_manager.h @@ -26,6 +26,6 @@ class WormholeCoordinateManager : public CoordinateManager { const std::vector& pcie_cores); protected: - void fill_tensix_logical_to_translated() override; - void fill_eth_logical_to_translated() override; + void fill_tensix_physical_translated_mapping() override; + void fill_eth_physical_translated_mapping() override; }; diff --git a/device/blackhole/blackhole_coordinate_manager.cpp b/device/blackhole/blackhole_coordinate_manager.cpp index 6ff3de50..f0bbbccd 100644 --- a/device/blackhole/blackhole_coordinate_manager.cpp +++ b/device/blackhole/blackhole_coordinate_manager.cpp @@ -33,111 +33,132 @@ BlackholeCoordinateManager::BlackholeCoordinateManager( arc_cores, pcie_grid_size, pcie_cores) { - this->translate_tensix_coords(); - this->translate_dram_coords(); - this->translate_eth_coords(); - this->translate_arc_coords(); - this->translate_pcie_coords(); + initialize(); } void BlackholeCoordinateManager::translate_tensix_coords() { - size_t num_harvested_x = __builtin_popcount(tensix_harvesting_mask); + size_t num_harvested_x = CoordinateManager::get_num_harvested(tensix_harvesting_mask); size_t grid_size_x = tensix_grid_size.x; size_t grid_size_y = tensix_grid_size.y; size_t logical_x = 0; + size_t x_index = grid_size_x - num_harvested_x; for (size_t x = 0; x < grid_size_x; x++) { - if (!(tensix_harvesting_mask & (1 << x))) { + if (tensix_harvesting_mask & (1 << x)) { + for (size_t y = 0; y < grid_size_y; y++) { + const tt_xy_pair& physical_core = tensix_cores[x + y * grid_size_x]; + const tt_xy_pair& virtual_core = tensix_cores[x_index + y * grid_size_x]; + + CoreCoord virtual_coord = + CoreCoord(virtual_core.x, virtual_core.y, CoreType::TENSIX, CoordSystem::VIRTUAL); + + add_core_translation(virtual_coord, physical_core); + } + x_index++; + } else { for (size_t y = 0; y < grid_size_y; y++) { const tt_xy_pair& tensix_core = tensix_cores[x + y * grid_size_x]; - tensix_logical_to_physical[{logical_x, y}] = - CoreCoord(tensix_core.x, tensix_core.y, CoreType::TENSIX, CoordSystem::PHYSICAL); - tensix_physical_to_logical[tensix_core] = - CoreCoord(logical_x, y, CoreType::TENSIX, CoordSystem::LOGICAL); + const tt_xy_pair& virtual_core = tensix_cores[logical_x + y * grid_size_x]; + + CoreCoord logical_coord = CoreCoord(logical_x, y, CoreType::TENSIX, CoordSystem::LOGICAL); + add_core_translation(logical_coord, tensix_core); + + CoreCoord virtual_coord = + CoreCoord(virtual_core.x, virtual_core.y, CoreType::TENSIX, CoordSystem::VIRTUAL); + add_core_translation(virtual_coord, tensix_core); } logical_x++; } } - for (size_t x = 0; x < grid_size_x - num_harvested_x; x++) { - for (size_t y = 0; y < grid_size_y; y++) { - const tt_xy_pair& tensix_core = tensix_cores[x + y * grid_size_x]; - tensix_logical_to_virtual[{x, y}] = - CoreCoord(tensix_core.x, tensix_core.y, CoreType::TENSIX, CoordSystem::VIRTUAL); - tensix_virtual_to_logical[tensix_core] = CoreCoord(x, y, CoreType::TENSIX, CoordSystem::LOGICAL); - } - } - - fill_tensix_logical_to_translated(); + fill_tensix_physical_translated_mapping(); } -void BlackholeCoordinateManager::fill_tensix_logical_to_translated() { - const size_t num_harvested_x = __builtin_popcount(tensix_harvesting_mask); - const size_t grid_size_x = tensix_grid_size.x; - const size_t grid_size_y = tensix_grid_size.y; - - for (size_t x = 0; x < grid_size_x - num_harvested_x; x++) { - for (size_t y = 0; y < grid_size_y; y++) { - const CoreCoord virtual_coord = tensix_logical_to_virtual[{x, y}]; - const size_t translated_x = virtual_coord.x; - const size_t translated_y = virtual_coord.y; - tensix_logical_to_translated[{x, y}] = - CoreCoord(translated_x, translated_y, CoreType::TENSIX, CoordSystem::TRANSLATED); - tensix_translated_to_logical[{translated_x, translated_y}] = - CoreCoord(x, y, CoreType::TENSIX, CoordSystem::LOGICAL); - } +void BlackholeCoordinateManager::fill_tensix_physical_translated_mapping() { + for (const tt_xy_pair& physical_core : tensix_cores) { + const CoreCoord virtual_coord = from_physical_map.at({physical_core, CoordSystem::VIRTUAL}); + const CoreCoord translated_coord = + CoreCoord(virtual_coord.x, virtual_coord.y, CoreType::TENSIX, CoordSystem::TRANSLATED); + + add_core_translation(translated_coord, physical_core); } } void BlackholeCoordinateManager::translate_dram_coords() { - size_t num_harvested_banks = __builtin_popcount(dram_harvesting_mask); + size_t num_harvested_banks = CoordinateManager::get_num_harvested(dram_harvesting_mask); + + size_t logical_x = 0; + for (size_t x = 0; x < dram_grid_size.x; x++) { + if (!(dram_harvesting_mask & (1 << x))) { + for (size_t y = 0; y < dram_grid_size.y; y++) { + const tt_xy_pair& dram_core = dram_cores[x * dram_grid_size.y + y]; + + CoreCoord logical_coord = CoreCoord(logical_x, y, CoreType::DRAM, CoordSystem::LOGICAL); + + add_core_translation(logical_coord, dram_core); + } + logical_x++; + } + } for (size_t x = 0; x < dram_grid_size.x - num_harvested_banks; x++) { for (size_t y = 0; y < dram_grid_size.y; y++) { const tt_xy_pair& dram_core = dram_cores[x * dram_grid_size.y + y]; - dram_logical_to_virtual[{x, y}] = CoreCoord(dram_core.x, dram_core.y, CoreType::DRAM, CoordSystem::VIRTUAL); - dram_virtual_to_logical[dram_core] = CoreCoord(x, y, CoreType::DRAM, CoordSystem::LOGICAL); + CoreCoord dram_logical = CoreCoord(x, y, CoreType::DRAM, CoordSystem::LOGICAL); + CoreCoord dram_virtual = CoreCoord(dram_core.x, dram_core.y, CoreType::DRAM, CoordSystem::VIRTUAL); + + const tt_xy_pair physical_pair = to_physical_map[dram_logical]; + + add_core_translation(dram_virtual, physical_pair); } } - size_t logical_x = 0; + size_t harvested_index = (dram_grid_size.x - num_harvested_banks) * dram_grid_size.y; for (size_t x = 0; x < dram_grid_size.x; x++) { - if (!(dram_harvesting_mask & (1 << x))) { + if (dram_harvesting_mask & (1 << x)) { for (size_t y = 0; y < dram_grid_size.y; y++) { const tt_xy_pair& dram_core = dram_cores[x * dram_grid_size.y + y]; - dram_logical_to_physical[{logical_x, y}] = - CoreCoord(dram_core.x, dram_core.y, CoreType::DRAM, CoordSystem::PHYSICAL); - dram_physical_to_logical[dram_core] = CoreCoord(logical_x, y, CoreType::DRAM, CoordSystem::LOGICAL); + const tt_xy_pair& virtual_core = dram_cores[harvested_index++]; + + CoreCoord virtual_coord = + CoreCoord(virtual_core.x, virtual_core.y, CoreType::DRAM, CoordSystem::VIRTUAL); + + add_core_translation(virtual_coord, dram_core); } - logical_x++; } } - fill_dram_logical_to_translated(); + fill_dram_physical_translated_mapping(); } -void BlackholeCoordinateManager::fill_eth_logical_to_translated() { +void BlackholeCoordinateManager::fill_eth_physical_translated_mapping() { for (size_t x = 0; x < eth_grid_size.x; x++) { for (size_t y = 0; y < eth_grid_size.y; y++) { const size_t translated_x = x + blackhole::eth_translated_coordinate_start_x; const size_t translated_y = y + blackhole::eth_translated_coordinate_start_y; - eth_logical_to_translated[{x, y}] = - CoreCoord(translated_x, translated_y, CoreType::ETH, CoordSystem::TRANSLATED); - eth_translated_to_logical[{translated_x, translated_y}] = - CoreCoord(x, y, CoreType::ETH, CoordSystem::LOGICAL); + + CoreCoord logical_coord = CoreCoord(x, y, CoreType::ETH, CoordSystem::LOGICAL); + const tt_xy_pair physical_pair = to_physical_map[logical_coord]; + + CoreCoord translated_coord = CoreCoord(translated_x, translated_y, CoreType::ETH, CoordSystem::TRANSLATED); + + add_core_translation(translated_coord, physical_pair); } } } -void BlackholeCoordinateManager::fill_pcie_logical_to_translated() { - pcie_logical_to_translated[{0, 0}] = CoreCoord( +void BlackholeCoordinateManager::fill_pcie_physical_translated_mapping() { + CoreCoord logical_coord = CoreCoord(0, 0, CoreType::PCIE, CoordSystem::LOGICAL); + + const tt_xy_pair physical_pair = to_physical_map[logical_coord]; + + CoreCoord translated_coord = CoreCoord( blackhole::pcie_translated_coordinate_start_x, blackhole::pcie_translated_coordinate_start_y, CoreType::PCIE, CoordSystem::TRANSLATED); - pcie_translated_to_logical[{ - blackhole::pcie_translated_coordinate_start_x, blackhole::pcie_translated_coordinate_start_y}] = - CoreCoord(0, 0, CoreType::PCIE, CoordSystem::LOGICAL); + + add_core_translation(translated_coord, physical_pair); } void BlackholeCoordinateManager::map_column_of_dram_banks( @@ -145,16 +166,19 @@ void BlackholeCoordinateManager::map_column_of_dram_banks( size_t translated_y = blackhole::dram_translated_coordinate_start_y; for (size_t bank = start_bank; bank < end_bank; bank++) { for (size_t port = 0; port < blackhole::NUM_NOC_PORTS_PER_DRAM_BANK; port++) { - dram_logical_to_translated[{bank, port}] = - CoreCoord(x_coord, translated_y, CoreType::DRAM, CoordSystem::TRANSLATED); - dram_translated_to_logical[{x_coord, translated_y}] = - CoreCoord(bank, port, CoreType::DRAM, CoordSystem::LOGICAL); + CoreCoord logical_coord = CoreCoord(bank, port, CoreType::DRAM, CoordSystem::LOGICAL); + const tt_xy_pair physical_pair = to_physical_map[logical_coord]; + + CoreCoord translated_coord = CoreCoord(x_coord, translated_y, CoreType::DRAM, CoordSystem::TRANSLATED); + + add_core_translation(translated_coord, physical_pair); + translated_y++; } } } -void BlackholeCoordinateManager::fill_dram_logical_to_translated() { +void BlackholeCoordinateManager::fill_dram_physical_translated_mapping() { const std::vector harvested_banks = CoordinateManager::get_harvested_indices(dram_harvesting_mask); if (harvested_banks.empty()) { @@ -184,4 +208,28 @@ void BlackholeCoordinateManager::fill_dram_logical_to_translated() { blackhole::NUM_DRAM_BANKS - 1, blackhole::dram_translated_coordinate_start_x + 1); } + + const size_t virtual_index = (dram_grid_size.x - 1) * dram_grid_size.y; + const size_t physical_index = harvested_bank * dram_grid_size.y; + + const size_t harvested_bank_translated_x = blackhole::dram_translated_coordinate_start_x + 1; + const size_t harvested_bank_translated_y = + blackhole::dram_translated_coordinate_start_y + (dram_grid_size.x / 2 - 1) * dram_grid_size.y; + + for (size_t noc_port = 0; noc_port < dram_grid_size.y; noc_port++) { + const tt_xy_pair& physical_core = dram_cores[physical_index + noc_port]; + const tt_xy_pair& virtual_core = dram_cores[virtual_index + noc_port]; + + CoreCoord virtual_coord = CoreCoord(virtual_core.x, virtual_core.y, CoreType::DRAM, CoordSystem::VIRTUAL); + + add_core_translation(virtual_coord, physical_core); + + CoreCoord translated_coord = CoreCoord( + harvested_bank_translated_x, + harvested_bank_translated_y + noc_port, + CoreType::DRAM, + CoordSystem::TRANSLATED); + + add_core_translation(translated_coord, physical_core); + } } diff --git a/device/coordinate_manager.cpp b/device/coordinate_manager.cpp index e936bdac..595421f3 100644 --- a/device/coordinate_manager.cpp +++ b/device/coordinate_manager.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. + * SPDX-FileCopyrightText: (c) 2024 Tenstorrent Inc. * * SPDX-License-Identifier: Apache-2.0 */ @@ -38,252 +38,118 @@ CoordinateManager::CoordinateManager( pcie_grid_size(pcie_grid_size), pcie_cores(pcie_cores) {} -std::map& CoordinateManager::get_logical_to_translated(CoreType core_type) { - switch (core_type) { - case CoreType::TENSIX: - return tensix_logical_to_translated; - case CoreType::DRAM: - return dram_logical_to_translated; - case CoreType::ACTIVE_ETH: - case CoreType::IDLE_ETH: - case CoreType::ETH: - return eth_logical_to_translated; - case CoreType::ARC: - return arc_logical_to_translated; - case CoreType::PCIE: - return pcie_logical_to_translated; - default: - throw std::runtime_error("Core type is not supported for getting logical to translated mapping"); - } -} - -std::map& CoordinateManager::get_logical_to_virtual(CoreType core_type) { - switch (core_type) { - case CoreType::TENSIX: - return tensix_logical_to_virtual; - case CoreType::DRAM: - return dram_logical_to_virtual; - case CoreType::ACTIVE_ETH: - case CoreType::IDLE_ETH: - case CoreType::ETH: - return eth_logical_to_virtual; - case CoreType::ARC: - return arc_logical_to_virtual; - case CoreType::PCIE: - return pcie_logical_to_virtual; - default: - throw std::runtime_error("Core type is not supported for getting logical to virtual mapping"); - } -} - -std::map& CoordinateManager::get_logical_to_physical(CoreType core_type) { - switch (core_type) { - case CoreType::TENSIX: - return tensix_logical_to_physical; - case CoreType::DRAM: - return dram_logical_to_physical; - case CoreType::ACTIVE_ETH: - case CoreType::IDLE_ETH: - case CoreType::ETH: - return eth_logical_to_physical; - case CoreType::ARC: - return arc_logical_to_physical; - case CoreType::PCIE: - return pcie_logical_to_physical; - default: - throw std::runtime_error("Core type is not supported for getting logical to physical mapping"); - } -} - -std::map& CoordinateManager::get_physical_to_logical(CoreType core_type) { - switch (core_type) { - case CoreType::TENSIX: - return tensix_physical_to_logical; - case CoreType::DRAM: - return dram_physical_to_logical; - case CoreType::ACTIVE_ETH: - case CoreType::IDLE_ETH: - case CoreType::ETH: - return eth_physical_to_logical; - case CoreType::ARC: - return arc_physical_to_logical; - case CoreType::PCIE: - return pcie_physical_to_logical; - default: - throw std::runtime_error("Core type is not supported for getting physical to logical mapping"); - } +void CoordinateManager::initialize() { + this->identity_map_physical_cores(); + this->translate_tensix_coords(); + this->translate_dram_coords(); + this->translate_eth_coords(); + this->translate_arc_coords(); + this->translate_pcie_coords(); } -std::map& CoordinateManager::get_virtual_to_logical(CoreType core_type) { - switch (core_type) { - case CoreType::TENSIX: - return tensix_virtual_to_logical; - case CoreType::DRAM: - return dram_virtual_to_logical; - case CoreType::ACTIVE_ETH: - case CoreType::IDLE_ETH: - case CoreType::ETH: - return eth_virtual_to_logical; - case CoreType::ARC: - return arc_virtual_to_logical; - case CoreType::PCIE: - return pcie_virtual_to_logical; - default: - throw std::runtime_error("Core type is not supported for getting virtual to logical mapping"); - } +void CoordinateManager::add_core_translation(const CoreCoord& core_coord, const tt_xy_pair& physical_pair) { + to_physical_map.insert({core_coord, physical_pair}); + from_physical_map.insert({{{physical_pair.x, physical_pair.y}, core_coord.coord_system}, core_coord}); } -std::map& CoordinateManager::get_translated_to_logical(CoreType core_type) { - switch (core_type) { - case CoreType::TENSIX: - return tensix_translated_to_logical; - case CoreType::DRAM: - return dram_translated_to_logical; - case CoreType::ACTIVE_ETH: - case CoreType::IDLE_ETH: - case CoreType::ETH: - return eth_translated_to_logical; - case CoreType::ARC: - return arc_translated_to_logical; - case CoreType::PCIE: - return pcie_translated_to_logical; - default: - throw std::runtime_error("Core type is not supported for getting translated to logical mapping"); +void CoordinateManager::identity_map_physical_cores() { + for (auto& core : tensix_cores) { + const CoreCoord core_coord = CoreCoord(core.x, core.y, CoreType::TENSIX, CoordSystem::PHYSICAL); + add_core_translation(core_coord, core); } -} -CoreCoord CoordinateManager::to_physical(const CoreCoord core_coord) { - switch (core_coord.coord_system) { - case CoordSystem::PHYSICAL: - return core_coord; - case CoordSystem::VIRTUAL: - case CoordSystem::TRANSLATED: - return to_physical(to_logical(core_coord)); - case CoordSystem::LOGICAL: { - auto& logical_mapping = get_logical_to_physical(core_coord.core_type); - return logical_mapping[{core_coord.x, core_coord.y}]; - } - default: - throw std::runtime_error( - "Unexpected CoordSystem value " + std::to_string((uint8_t)core_coord.coord_system)); + for (auto& core : dram_cores) { + const CoreCoord core_coord = CoreCoord(core.x, core.y, CoreType::DRAM, CoordSystem::PHYSICAL); + add_core_translation(core_coord, core); } -} -CoreCoord CoordinateManager::to_virtual(const CoreCoord core_coord) { - switch (core_coord.coord_system) { - case CoordSystem::TRANSLATED: - case CoordSystem::PHYSICAL: - return to_virtual(to_logical(core_coord)); - case CoordSystem::VIRTUAL: - return core_coord; - case CoordSystem::LOGICAL: { - auto& logical_mapping = get_logical_to_virtual(core_coord.core_type); - return logical_mapping[{core_coord.x, core_coord.y}]; - } - default: - throw std::runtime_error( - "Unexpected CoordSystem value " + std::to_string((uint8_t)core_coord.coord_system)); + for (auto& core : eth_cores) { + const CoreCoord core_coord = CoreCoord(core.x, core.y, CoreType::ETH, CoordSystem::PHYSICAL); + add_core_translation(core_coord, core); } -} -CoreCoord CoordinateManager::to_logical(const CoreCoord core_coord) { - switch (core_coord.coord_system) { - case CoordSystem::LOGICAL: - return core_coord; - case CoordSystem::PHYSICAL: { - auto& physical_mapping = get_physical_to_logical(core_coord.core_type); - return physical_mapping[{core_coord.x, core_coord.y}]; - } - case CoordSystem::VIRTUAL: { - auto& virtual_mapping = get_virtual_to_logical(core_coord.core_type); - return virtual_mapping[{core_coord.x, core_coord.y}]; - } - case CoordSystem::TRANSLATED: { - auto& translated_mapping = get_translated_to_logical(core_coord.core_type); - return translated_mapping[{core_coord.x, core_coord.y}]; - } - default: - throw std::runtime_error( - "Unexpected CoordSystem value " + std::to_string((uint8_t)core_coord.coord_system)); + for (auto& core : arc_cores) { + const CoreCoord core_coord = CoreCoord(core.x, core.y, CoreType::ARC, CoordSystem::PHYSICAL); + add_core_translation(core_coord, core); } -} -CoreCoord CoordinateManager::to_translated(const CoreCoord core_coord) { - switch (core_coord.coord_system) { - case CoordSystem::PHYSICAL: - case CoordSystem::VIRTUAL: - return to_translated(to_logical(core_coord)); - case CoordSystem::TRANSLATED: - return core_coord; - case CoordSystem::LOGICAL: { - auto& logical_mapping = get_logical_to_translated(core_coord.core_type); - return logical_mapping[{core_coord.x, core_coord.y}]; - } - default: - throw std::runtime_error( - "Unexpected CoordSystem value " + std::to_string((uint8_t)core_coord.coord_system)); + for (auto& core : pcie_cores) { + const CoreCoord core_coord = CoreCoord(core.x, core.y, CoreType::PCIE, CoordSystem::PHYSICAL); + add_core_translation(core_coord, core); } } CoreCoord CoordinateManager::to(const CoreCoord core_coord, const CoordSystem coord_system) { - switch (coord_system) { - case CoordSystem::LOGICAL: - return to_logical(core_coord); - case CoordSystem::PHYSICAL: - return to_physical(core_coord); - case CoordSystem::VIRTUAL: - return to_virtual(core_coord); - case CoordSystem::TRANSLATED: - return to_translated(core_coord); - default: - throw std::runtime_error( - "Unexpected CoordSystem value " + std::to_string((uint8_t)core_coord.coord_system)); - } + return from_physical_map.at({to_physical_map.at(core_coord), coord_system}); } void CoordinateManager::translate_tensix_coords() { - size_t num_harvested_y = __builtin_popcount(tensix_harvesting_mask); + size_t num_harvested_y = CoordinateManager::get_num_harvested(tensix_harvesting_mask); size_t grid_size_x = tensix_grid_size.x; size_t grid_size_y = tensix_grid_size.y; size_t logical_y = 0; + size_t harvested_index = (grid_size_y - num_harvested_y) * grid_size_x; for (size_t y = 0; y < grid_size_y; y++) { - if (!(tensix_harvesting_mask & (1 << y))) { + if (tensix_harvesting_mask & (1 << y)) { + for (size_t x = 0; x < grid_size_x; x++) { + const tt_xy_pair& physical_core = tensix_cores[y * grid_size_x + x]; + const tt_xy_pair& virtual_core = tensix_cores[harvested_index++]; + + CoreCoord virtual_coord = + CoreCoord(virtual_core.x, virtual_core.y, CoreType::TENSIX, CoordSystem::VIRTUAL); + + add_core_translation(virtual_coord, physical_core); + } + } else { for (size_t x = 0; x < grid_size_x; x++) { const tt_xy_pair& tensix_core = tensix_cores[y * grid_size_x + x]; - tensix_logical_to_physical[{x, logical_y}] = - CoreCoord(tensix_core.x, tensix_core.y, CoreType::TENSIX, CoordSystem::PHYSICAL); - tensix_physical_to_logical[tensix_core] = - CoreCoord(x, logical_y, CoreType::TENSIX, CoordSystem::LOGICAL); + const tt_xy_pair& virtual_core = tensix_cores[logical_y * grid_size_x + x]; + + CoreCoord logical_coord = CoreCoord(x, logical_y, CoreType::TENSIX, CoordSystem::LOGICAL); + add_core_translation(logical_coord, tensix_core); + + CoreCoord virtual_coord = + CoreCoord(virtual_core.x, virtual_core.y, CoreType::TENSIX, CoordSystem::VIRTUAL); + add_core_translation(virtual_coord, tensix_core); } logical_y++; } } - for (size_t y = 0; y < grid_size_y - num_harvested_y; y++) { - for (size_t x = 0; x < grid_size_x; x++) { - const tt_xy_pair& tensix_core = tensix_cores[y * grid_size_x + x]; - tensix_logical_to_virtual[{x, y}] = - CoreCoord(tensix_core.x, tensix_core.y, CoreType::TENSIX, CoordSystem::VIRTUAL); - tensix_virtual_to_logical[tensix_core] = CoreCoord(x, y, CoreType::TENSIX, CoordSystem::LOGICAL); - } - } - - fill_tensix_logical_to_translated(); + this->fill_tensix_physical_translated_mapping(); } -void CoordinateManager::fill_tensix_logical_to_translated() { - size_t num_harvested_y = __builtin_popcount(tensix_harvesting_mask); +void CoordinateManager::fill_tensix_physical_translated_mapping() { + size_t num_harvested_y = CoordinateManager::get_num_harvested(tensix_harvesting_mask); for (size_t x = 0; x < tensix_grid_size.x; x++) { for (size_t y = 0; y < tensix_grid_size.y - num_harvested_y; y++) { - const CoreCoord physical_coord = tensix_logical_to_physical[{x, y}]; - const size_t translated_x = physical_coord.x; - const size_t translated_y = physical_coord.y; - tensix_logical_to_translated[{x, y}] = + CoreCoord logical_coord = CoreCoord(x, y, CoreType::TENSIX, CoordSystem::LOGICAL); + const tt_xy_pair physical_pair = to_physical_map[logical_coord]; + const size_t translated_x = physical_pair.x; + const size_t translated_y = physical_pair.y; + + CoreCoord translated_coord = CoreCoord(translated_x, translated_y, CoreType::TENSIX, CoordSystem::TRANSLATED); - tensix_translated_to_logical[tt_xy_pair(translated_x, translated_y)] = - CoreCoord(x, y, CoreType::TENSIX, CoordSystem::LOGICAL); + + add_core_translation(translated_coord, physical_pair); + } + } + + size_t harvested_index = (tensix_grid_size.y - num_harvested_y) * tensix_grid_size.x; + for (size_t y = 0; y < tensix_grid_size.y; y++) { + if (tensix_harvesting_mask & (1 << y)) { + for (size_t x = 0; x < tensix_grid_size.x; x++) { + const tt_xy_pair& physical_core = tensix_cores[y * tensix_grid_size.x + x]; + const size_t translated_x = physical_core.x; + const size_t translated_y = physical_core.y; + + CoreCoord translated_coord = + CoreCoord(translated_x, translated_y, CoreType::TENSIX, CoordSystem::TRANSLATED); + + add_core_translation(translated_coord, physical_core); + } } } } @@ -292,120 +158,121 @@ void CoordinateManager::translate_dram_coords() { for (size_t x = 0; x < dram_grid_size.x; x++) { for (size_t y = 0; y < dram_grid_size.y; y++) { const tt_xy_pair dram_core = dram_cores[x * dram_grid_size.y + y]; - dram_logical_to_virtual[{x, y}] = CoreCoord(dram_core.x, dram_core.y, CoreType::DRAM, CoordSystem::VIRTUAL); - dram_virtual_to_logical[dram_core] = CoreCoord(x, y, CoreType::DRAM, CoordSystem::LOGICAL); - dram_logical_to_physical[{x, y}] = - CoreCoord(dram_core.x, dram_core.y, CoreType::DRAM, CoordSystem::PHYSICAL); - dram_physical_to_logical[dram_core] = CoreCoord(x, y, CoreType::DRAM, CoordSystem::LOGICAL); + CoreCoord logical_coord = CoreCoord(x, y, CoreType::DRAM, CoordSystem::LOGICAL); + CoreCoord virtual_coord = CoreCoord(dram_core.x, dram_core.y, CoreType::DRAM, CoordSystem::VIRTUAL); + + add_core_translation(logical_coord, dram_core); + add_core_translation(virtual_coord, dram_core); } } - fill_dram_logical_to_translated(); + fill_dram_physical_translated_mapping(); } void CoordinateManager::translate_eth_coords() { for (size_t x = 0; x < eth_grid_size.x; x++) { for (size_t y = 0; y < eth_grid_size.y; y++) { const tt_xy_pair eth_core = eth_cores[x * eth_grid_size.y + y]; - eth_logical_to_virtual[{x, y}] = CoreCoord(eth_core.x, eth_core.y, CoreType::ETH, CoordSystem::VIRTUAL); - eth_virtual_to_logical[eth_core] = CoreCoord(x, y, CoreType::ETH, CoordSystem::LOGICAL); - eth_logical_to_physical[{x, y}] = CoreCoord(eth_core.x, eth_core.y, CoreType::ETH, CoordSystem::PHYSICAL); - eth_physical_to_logical[eth_core] = CoreCoord(x, y, CoreType::ETH, CoordSystem::LOGICAL); + CoreCoord logical_coord = CoreCoord(x, y, CoreType::ETH, CoordSystem::LOGICAL); + CoreCoord virtual_coord = CoreCoord(eth_core.x, eth_core.y, CoreType::ETH, CoordSystem::VIRTUAL); + + add_core_translation(logical_coord, eth_core); + add_core_translation(virtual_coord, eth_core); } } - fill_eth_logical_to_translated(); + fill_eth_physical_translated_mapping(); } void CoordinateManager::translate_arc_coords() { for (size_t x = 0; x < arc_grid_size.x; x++) { for (size_t y = 0; y < arc_grid_size.y; y++) { const tt_xy_pair arc_core = arc_cores[x * arc_grid_size.y + y]; - arc_logical_to_virtual[{x, y}] = CoreCoord(arc_core.x, arc_core.y, CoreType::ARC, CoordSystem::VIRTUAL); - arc_virtual_to_logical[arc_core] = CoreCoord(x, y, CoreType::ARC, CoordSystem::LOGICAL); - arc_logical_to_physical[{x, y}] = CoreCoord(arc_core.x, arc_core.y, CoreType::ARC, CoordSystem::PHYSICAL); - arc_physical_to_logical[arc_core] = CoreCoord(x, y, CoreType::ARC, CoordSystem::LOGICAL); + CoreCoord logical_coord = CoreCoord(x, y, CoreType::ARC, CoordSystem::LOGICAL); + CoreCoord virtual_coord = CoreCoord(arc_core.x, arc_core.y, CoreType::ARC, CoordSystem::VIRTUAL); - arc_logical_to_translated[{x, y}] = - CoreCoord(arc_core.x, arc_core.y, CoreType::ARC, CoordSystem::TRANSLATED); - arc_translated_to_logical[arc_core] = CoreCoord(x, y, CoreType::ARC, CoordSystem::LOGICAL); + add_core_translation(logical_coord, arc_core); + add_core_translation(virtual_coord, arc_core); } } - fill_arc_logical_to_translated(); + fill_arc_physical_translated_mapping(); } void CoordinateManager::translate_pcie_coords() { for (size_t x = 0; x < pcie_grid_size.x; x++) { for (size_t y = 0; y < pcie_grid_size.y; y++) { const tt_xy_pair pcie_core = pcie_cores[x * pcie_grid_size.y + y]; - pcie_logical_to_virtual[{x, y}] = CoreCoord(pcie_core.x, pcie_core.y, CoreType::PCIE, CoordSystem::VIRTUAL); - pcie_virtual_to_logical[pcie_core] = CoreCoord(x, y, CoreType::PCIE, CoordSystem::LOGICAL); + CoreCoord logical_coord = CoreCoord(x, y, CoreType::PCIE, CoordSystem::LOGICAL); + CoreCoord virtual_coord = CoreCoord(pcie_core.x, pcie_core.y, CoreType::PCIE, CoordSystem::VIRTUAL); - pcie_logical_to_physical[{x, y}] = - CoreCoord(pcie_core.x, pcie_core.y, CoreType::PCIE, CoordSystem::PHYSICAL); - pcie_physical_to_logical[pcie_core] = CoreCoord(x, y, CoreType::PCIE, CoordSystem::LOGICAL); + add_core_translation(logical_coord, pcie_core); + add_core_translation(virtual_coord, pcie_core); } } - fill_pcie_logical_to_translated(); + fill_pcie_physical_translated_mapping(); } -void CoordinateManager::fill_eth_logical_to_translated() { +void CoordinateManager::fill_eth_physical_translated_mapping() { for (size_t x = 0; x < eth_grid_size.x; x++) { for (size_t y = 0; y < eth_grid_size.y; y++) { - const CoreCoord physical_coord = eth_logical_to_physical[{x, y}]; - const size_t translated_x = physical_coord.x; - const size_t translated_y = physical_coord.y; - eth_logical_to_translated[{x, y}] = - CoreCoord(translated_x, translated_y, CoreType::ETH, CoordSystem::TRANSLATED); - eth_translated_to_logical[{translated_x, translated_y}] = - CoreCoord(x, y, CoreType::ETH, CoordSystem::LOGICAL); + CoreCoord logical_coord = CoreCoord(x, y, CoreType::ETH, CoordSystem::LOGICAL); + const tt_xy_pair physical_pair = to_physical_map[logical_coord]; + const size_t translated_x = physical_pair.x; + const size_t translated_y = physical_pair.y; + + CoreCoord translated_coord = CoreCoord(translated_x, translated_y, CoreType::ETH, CoordSystem::TRANSLATED); + + add_core_translation(translated_coord, physical_pair); } } } -void CoordinateManager::fill_dram_logical_to_translated() { +void CoordinateManager::fill_dram_physical_translated_mapping() { for (size_t x = 0; x < dram_grid_size.x; x++) { for (size_t y = 0; y < dram_grid_size.y; y++) { - const CoreCoord physical_coord = dram_logical_to_physical[{x, y}]; - const size_t translated_x = physical_coord.x; - const size_t translated_y = physical_coord.y; - dram_logical_to_translated[{x, y}] = - CoreCoord(translated_x, translated_y, CoreType::DRAM, CoordSystem::TRANSLATED); - dram_translated_to_logical[{translated_x, translated_y}] = - CoreCoord(x, y, CoreType::DRAM, CoordSystem::LOGICAL); + CoreCoord logical_coord = CoreCoord(x, y, CoreType::DRAM, CoordSystem::LOGICAL); + const tt_xy_pair physical_pair = to_physical_map[logical_coord]; + const size_t translated_x = physical_pair.x; + const size_t translated_y = physical_pair.y; + + CoreCoord translated_coord = CoreCoord(translated_x, translated_y, CoreType::DRAM, CoordSystem::TRANSLATED); + + add_core_translation(translated_coord, physical_pair); } } } -void CoordinateManager::fill_pcie_logical_to_translated() { +void CoordinateManager::fill_pcie_physical_translated_mapping() { for (size_t x = 0; x < pcie_grid_size.x; x++) { for (size_t y = 0; y < pcie_grid_size.y; y++) { - const CoreCoord physical_coord = pcie_logical_to_physical[{x, y}]; - const size_t translated_x = physical_coord.x; - const size_t translated_y = physical_coord.y; - pcie_logical_to_translated[{x, y}] = - CoreCoord(translated_x, translated_y, CoreType::PCIE, CoordSystem::TRANSLATED); - pcie_translated_to_logical[{translated_x, translated_y}] = - CoreCoord(x, y, CoreType::PCIE, CoordSystem::LOGICAL); + CoreCoord logical_coord = CoreCoord(x, y, CoreType::PCIE, CoordSystem::LOGICAL); + const tt_xy_pair physical_pair = to_physical_map[logical_coord]; + const size_t translated_x = physical_pair.x; + const size_t translated_y = physical_pair.y; + + CoreCoord translated_coord = CoreCoord(translated_x, translated_y, CoreType::PCIE, CoordSystem::TRANSLATED); + + add_core_translation(translated_coord, physical_pair); } } } -void CoordinateManager::fill_arc_logical_to_translated() { +void CoordinateManager::fill_arc_physical_translated_mapping() { for (size_t x = 0; x < arc_grid_size.x; x++) { for (size_t y = 0; y < arc_grid_size.y; y++) { - const CoreCoord physical_coord = arc_logical_to_physical[{x, y}]; - const size_t translated_x = physical_coord.x; - const size_t translated_y = physical_coord.y; - arc_logical_to_translated[{x, y}] = - CoreCoord(translated_x, translated_y, CoreType::ARC, CoordSystem::TRANSLATED); - arc_translated_to_logical[{translated_x, translated_y}] = - CoreCoord(x, y, CoreType::ARC, CoordSystem::LOGICAL); + CoreCoord logical_coord = CoreCoord(x, y, CoreType::ARC, CoordSystem::LOGICAL); + const tt_xy_pair physical_pair = to_physical_map[logical_coord]; + const size_t translated_x = physical_pair.x; + const size_t translated_y = physical_pair.y; + + CoreCoord translated_coord = CoreCoord(translated_x, translated_y, CoreType::ARC, CoordSystem::TRANSLATED); + + add_core_translation(translated_coord, physical_pair); } } } @@ -538,9 +405,9 @@ std::shared_ptr CoordinateManager::create_coordinate_manager( pcie_cores); case tt::ARCH::Invalid: throw std::runtime_error("Invalid architecture for creating coordinate manager"); + default: + throw std::runtime_error("Unexpected ARCH value " + std::to_string((int)arch)); } - - throw std::runtime_error("Invalid architecture for creating coordinate manager"); } size_t CoordinateManager::get_num_harvested(const size_t harvesting_mask) { diff --git a/device/grayskull/grayskull_coordinate_manager.cpp b/device/grayskull/grayskull_coordinate_manager.cpp index 3232717f..2f4eef99 100644 --- a/device/grayskull/grayskull_coordinate_manager.cpp +++ b/device/grayskull/grayskull_coordinate_manager.cpp @@ -33,23 +33,20 @@ GrayskullCoordinateManager::GrayskullCoordinateManager( arc_cores, pcie_grid_size, pcie_cores) { - this->translate_tensix_coords(); - this->translate_dram_coords(); - this->translate_eth_coords(); - this->translate_arc_coords(); - this->translate_pcie_coords(); + initialize(); } -void GrayskullCoordinateManager::fill_eth_logical_to_translated() { +void GrayskullCoordinateManager::fill_eth_physical_translated_mapping() { for (size_t x = 0; x < eth_grid_size.x; x++) { for (size_t y = 0; y < eth_grid_size.y; y++) { - const CoreCoord physical_coord = eth_logical_to_physical[{x, y}]; - const size_t translated_x = physical_coord.x; - const size_t translated_y = physical_coord.y; - eth_logical_to_translated[{x, y}] = - CoreCoord(translated_x, translated_y, CoreType::ETH, CoordSystem::TRANSLATED); - eth_translated_to_logical[{translated_x, translated_y}] = - CoreCoord(x, y, CoreType::ETH, CoordSystem::LOGICAL); + CoreCoord logical_coord = CoreCoord(x, y, CoreType::ETH, CoordSystem::LOGICAL); + const tt_xy_pair physical_pair = to_physical_map[logical_coord]; + const size_t translated_x = physical_pair.x; + const size_t translated_y = physical_pair.y; + + CoreCoord translated_coord = CoreCoord(translated_x, translated_y, CoreType::ETH, CoordSystem::TRANSLATED); + + add_core_translation(translated_coord, physical_pair); } } } diff --git a/device/wormhole/wormhole_coordinate_manager.cpp b/device/wormhole/wormhole_coordinate_manager.cpp index 4ba6dc52..375aaaa1 100644 --- a/device/wormhole/wormhole_coordinate_manager.cpp +++ b/device/wormhole/wormhole_coordinate_manager.cpp @@ -33,37 +33,53 @@ WormholeCoordinateManager::WormholeCoordinateManager( arc_cores, pcie_grid_size, pcie_cores) { - this->translate_tensix_coords(); - this->translate_dram_coords(); - this->translate_eth_coords(); - this->translate_arc_coords(); - this->translate_pcie_coords(); + initialize(); } -void WormholeCoordinateManager::fill_tensix_logical_to_translated() { - size_t num_harvested_y = __builtin_popcount(tensix_harvesting_mask); +void WormholeCoordinateManager::fill_tensix_physical_translated_mapping() { + size_t num_harvested_y = CoordinateManager::get_num_harvested(tensix_harvesting_mask); for (size_t y = 0; y < tensix_grid_size.y - num_harvested_y; y++) { for (size_t x = 0; x < tensix_grid_size.x; x++) { + CoreCoord logical_coord = CoreCoord(x, y, CoreType::TENSIX, CoordSystem::LOGICAL); + const tt_xy_pair physical_pair = to_physical_map[logical_coord]; const size_t translated_x = x + wormhole::tensix_translated_coordinate_start_x; const size_t translated_y = y + wormhole::tensix_translated_coordinate_start_y; - tensix_logical_to_translated[{x, y}] = + + CoreCoord translated_coord = CoreCoord(translated_x, translated_y, CoreType::TENSIX, CoordSystem::TRANSLATED); - tensix_translated_to_logical[tt_xy_pair(translated_x, translated_y)] = - CoreCoord(x, y, CoreType::TENSIX, CoordSystem::LOGICAL); + + add_core_translation(translated_coord, physical_pair); + } + } + + size_t harvested_index = (tensix_grid_size.y - num_harvested_y) * tensix_grid_size.x; + size_t translated_y = wormhole::tensix_translated_coordinate_start_y + tensix_grid_size.y - num_harvested_y; + for (size_t y = 0; y < tensix_grid_size.y; y++) { + if (tensix_harvesting_mask & (1 << y)) { + for (size_t x = 0; x < tensix_grid_size.x; x++) { + const tt_xy_pair physical_core = tensix_cores[y * tensix_grid_size.x + x]; + const size_t translated_x = x + wormhole::tensix_translated_coordinate_start_x; + CoreCoord translated_coord = + CoreCoord(translated_x, translated_y, CoreType::TENSIX, CoordSystem::TRANSLATED); + + add_core_translation(translated_coord, physical_core); + } + translated_y++; } } } -void WormholeCoordinateManager::fill_eth_logical_to_translated() { +void WormholeCoordinateManager::fill_eth_physical_translated_mapping() { for (size_t x = 0; x < eth_grid_size.x; x++) { for (size_t y = 0; y < eth_grid_size.y; y++) { const size_t translated_x = x + wormhole::eth_translated_coordinate_start_x; const size_t translated_y = y + wormhole::eth_translated_coordinate_start_y; - eth_logical_to_translated[{x, y}] = - CoreCoord(translated_x, translated_y, CoreType::ETH, CoordSystem::TRANSLATED); - eth_translated_to_logical[{translated_x, translated_y}] = - CoreCoord(x, y, CoreType::ETH, CoordSystem::LOGICAL); + CoreCoord logical_coord = CoreCoord(x, y, CoreType::ETH, CoordSystem::LOGICAL); + const tt_xy_pair physical_pair = to_physical_map[logical_coord]; + CoreCoord translated_coord = CoreCoord(translated_x, translated_y, CoreType::ETH, CoordSystem::TRANSLATED); + + add_core_translation(translated_coord, physical_pair); } } } diff --git a/tests/api/test_core_coord_translation_bh.cpp b/tests/api/test_core_coord_translation_bh.cpp index 95cbe155..e468f590 100644 --- a/tests/api/test_core_coord_translation_bh.cpp +++ b/tests/api/test_core_coord_translation_bh.cpp @@ -201,6 +201,49 @@ TEST(CoordinateManager, CoordinateManagerBlackholeVirtualEqualTranslated) { } } +// Test mapping of the coordinates for harvested DRAM bank. +TEST(CoordinateManager, CoordinateManagerBlackholeTransltedMappingHarvested) { + const size_t harvesting_mask = 3; + std::shared_ptr coordinate_manager = + CoordinateManager::create_coordinate_manager(tt::ARCH::BLACKHOLE, harvesting_mask); + + const tt_xy_pair tensix_grid_size = tt::umd::blackhole::TENSIX_GRID_SIZE; + const std::vector tensix_cores = tt::umd::blackhole::TENSIX_CORES; + + size_t num_harvested_x = CoordinateManager::get_num_harvested(harvesting_mask); + + size_t index = 0; + size_t virtual_index = tensix_grid_size.x - num_harvested_x; + + for (size_t cnt = 0; cnt < num_harvested_x * tensix_grid_size.y; cnt++) { + CoreCoord physical_core = + CoreCoord(tensix_cores[index].x, tensix_cores[index].y, CoreType::TENSIX, CoordSystem::PHYSICAL); + const CoreCoord translated_core = coordinate_manager->to(physical_core, CoordSystem::TRANSLATED); + + const CoreCoord virtual_core = CoreCoord( + tensix_cores[virtual_index].x, tensix_cores[virtual_index].y, CoreType::TENSIX, CoordSystem::VIRTUAL); + const CoreCoord translated_core_from_virtual = coordinate_manager->to(virtual_core, CoordSystem::TRANSLATED); + + EXPECT_EQ(translated_core, translated_core_from_virtual); + + EXPECT_EQ(translated_core.x, tensix_cores[virtual_index].x); + EXPECT_EQ(translated_core.y, tensix_cores[virtual_index].y); + + index += tensix_grid_size.x; + virtual_index += tensix_grid_size.x; + + if (index >= tensix_cores.size()) { + index = index % tensix_cores.size(); + index++; + } + + if (virtual_index >= tensix_cores.size()) { + virtual_index = virtual_index % tensix_cores.size(); + virtual_index++; + } + } +} + // Test mapping of DRAM coordinates from logical to physical. When there is no DRAM harvesting, logical // coordinates should cover all physical coordinates. TEST(CoordinateManager, CoordinateManagerBlackholeDRAMNoHarvesting) { @@ -381,6 +424,47 @@ TEST(CoordinateManager, CoordinateManagerBlackholeDRAMTranslatedMapping) { } } +// Test DRAM translated/virtual/physical mapping +TEST(CoordinateManager, CoordinateManagerBlackholeDRAMVirtualPhysicalMapping) { + const size_t max_num_banks_harvested = tt::umd::blackhole::NUM_DRAM_BANKS; + const size_t num_dram_banks = tt::umd::blackhole::NUM_DRAM_BANKS; + const size_t num_noc_ports_per_bank = tt::umd::blackhole::NUM_NOC_PORTS_PER_DRAM_BANK; + + const std::vector dram_cores = tt::umd::blackhole::DRAM_CORES; + + const size_t dram_harvesting_mask = 1; + + std::shared_ptr coordinate_manager = + CoordinateManager::create_coordinate_manager(tt::ARCH::BLACKHOLE, 0, dram_harvesting_mask); + + const size_t physical_index = 0; + const size_t virtual_index = (num_dram_banks - 1) * num_noc_ports_per_bank; + + const size_t harvested_translated_bank_x = tt::umd::blackhole::dram_translated_coordinate_start_x + 1; + const size_t harvested_translated_bank_y = + tt::umd::blackhole::dram_translated_coordinate_start_y + 3 * num_noc_ports_per_bank; + + for (size_t noc_port = 0; noc_port < num_noc_ports_per_bank; noc_port++) { + const tt_xy_pair physical_pair = dram_cores[physical_index + noc_port]; + const tt_xy_pair virtual_pair = dram_cores[virtual_index + noc_port]; + + CoreCoord physical_core = CoreCoord(physical_pair.x, physical_pair.y, CoreType::DRAM, CoordSystem::PHYSICAL); + CoreCoord virtual_from_physical = coordinate_manager->to(physical_core, CoordSystem::VIRTUAL); + + CoreCoord virtual_core = CoreCoord(virtual_pair.x, virtual_pair.y, CoreType::DRAM, CoordSystem::VIRTUAL); + + EXPECT_EQ(virtual_from_physical, virtual_core); + + CoreCoord translated_core = coordinate_manager->to(physical_core, CoordSystem::TRANSLATED); + CoreCoord translated_from_virtual = coordinate_manager->to(virtual_core, CoordSystem::TRANSLATED); + + EXPECT_EQ(translated_core, translated_from_virtual); + + EXPECT_EQ(translated_core.x, harvested_translated_bank_x); + EXPECT_EQ(translated_core.y, harvested_translated_bank_y + noc_port); + } +} + // Test that we cannot create a coordinate manager with more than one DRAM bank harvested. TEST(CoordinateManager, CoordinateManagerBlackholeDRAMPMoreThanOneDRAMBankHarvested) { const size_t max_num_banks_harvested = tt::umd::blackhole::NUM_DRAM_BANKS; diff --git a/tests/api/test_core_coord_translation_gs.cpp b/tests/api/test_core_coord_translation_gs.cpp index 233cb9e3..1bbe2387 100644 --- a/tests/api/test_core_coord_translation_gs.cpp +++ b/tests/api/test_core_coord_translation_gs.cpp @@ -174,6 +174,62 @@ TEST(CoordinateManager, CoordinateManagerGrayskullLogicalVirtualMapping) { } } +// Test that harvested physical coordinates map to the last row of the virtual coordinates. +TEST(CoordinateManager, CoordinateManagerWormholePhysicalGrayskullHarvestedMapping) { + // Harvest first and second NOC layout row. + const size_t harvesting_mask = 3; + const size_t num_harvested = CoordinateManager::get_num_harvested(harvesting_mask); + std::shared_ptr coordinate_manager = + CoordinateManager::create_coordinate_manager(tt::ARCH::GRAYSKULL, 3); + + const std::vector tensix_cores = tt::umd::grayskull::TENSIX_CORES; + const tt_xy_pair tensix_grid_size = tt::umd::grayskull::TENSIX_GRID_SIZE; + + size_t virtual_index = (tensix_grid_size.y - num_harvested) * tensix_grid_size.x; + + for (size_t index = 0; index < num_harvested * tensix_grid_size.x; index++) { + const CoreCoord physical_core = + CoreCoord(tensix_cores[index].x, tensix_cores[index].y, CoreType::TENSIX, CoordSystem::PHYSICAL); + const CoreCoord virtual_core = coordinate_manager->to(physical_core, CoordSystem::VIRTUAL); + + EXPECT_EQ(virtual_core.x, tensix_cores[virtual_index].x); + EXPECT_EQ(virtual_core.y, tensix_cores[virtual_index].y); + + virtual_index++; + } +} + +// Test that harvested physical coordinates map to the last row of the virtual coordinates. +TEST(CoordinateManager, CoordinateManagerGrayskullPhysicalTranslatedHarvestedMapping) { + // Harvest first and second NOC layout row. + const size_t harvesting_mask = 3; + const size_t num_harvested = CoordinateManager::get_num_harvested(harvesting_mask); + std::shared_ptr coordinate_manager = + CoordinateManager::create_coordinate_manager(tt::ARCH::GRAYSKULL, 3); + + const std::vector tensix_cores = tt::umd::grayskull::TENSIX_CORES; + const tt_xy_pair tensix_grid_size = tt::umd::grayskull::TENSIX_GRID_SIZE; + + size_t virtual_index = (tensix_grid_size.y - num_harvested) * tensix_grid_size.x; + + for (size_t index = 0; index < num_harvested * tensix_grid_size.x; index++) { + const CoreCoord physical_core = + CoreCoord(tensix_cores[index].x, tensix_cores[index].y, CoreType::TENSIX, CoordSystem::PHYSICAL); + const CoreCoord translated_core = coordinate_manager->to(physical_core, CoordSystem::TRANSLATED); + + const CoreCoord virtual_core = CoreCoord( + tensix_cores[virtual_index].x, tensix_cores[virtual_index].y, CoreType::TENSIX, CoordSystem::VIRTUAL); + const CoreCoord translated_core_from_virtual = coordinate_manager->to(virtual_core, CoordSystem::TRANSLATED); + + EXPECT_EQ(translated_core, translated_core_from_virtual); + + EXPECT_EQ(physical_core.x, translated_core.x); + EXPECT_EQ(physical_core.y, translated_core.y); + + virtual_index++; + } +} + // Test mapping of DRAM coordinates from logical to physical. We have no DRAM harvesting on Grayskull, // so logical coordinates should cover all physical coordinates. TEST(CoordinateManager, CoordinateManagerGrayskullDRAMNoHarvesting) { diff --git a/tests/api/test_core_coord_translation_wh.cpp b/tests/api/test_core_coord_translation_wh.cpp index d19539a8..019d220b 100644 --- a/tests/api/test_core_coord_translation_wh.cpp +++ b/tests/api/test_core_coord_translation_wh.cpp @@ -8,6 +8,7 @@ #include "umd/device/wormhole_implementation.h" using namespace tt::umd; +#include // Tests that all physical coordinates are same as all virtual coordinates // when there is no harvesting. @@ -146,8 +147,8 @@ TEST(CoordinateManager, CoordinateManagerWormholeLogicalTranslatedTopLeft) { const size_t max_num_harvested_y = 10; - // We go up to numbers less than 2^10 - 1 to test all possible harvesting masks, we fon't want to try to convert if - // everything is harvested. + // We go up to numbers less than 2^10 - 1 to test all possible harvesting masks, we fon't want to try to convert + // if everything is harvested. for (size_t harvesting_mask = 0; harvesting_mask < (1 << max_num_harvested_y) - 1; harvesting_mask++) { std::shared_ptr coordinate_manager = CoordinateManager::create_coordinate_manager(tt::ARCH::WORMHOLE_B0, harvesting_mask); @@ -170,6 +171,75 @@ TEST(CoordinateManager, CoordinateManagerWormholeLogicalTranslatedTopLeft) { } } +// Test that harvested physical coordinates map to the last row of the virtual coordinates. +TEST(CoordinateManager, CoordinateManagerWormholePhysicalVirtualHarvestedMapping) { + // Harvest first and second NOC layout row. + const size_t harvesting_mask = 3; + const size_t num_harvested = CoordinateManager::get_num_harvested(harvesting_mask); + std::shared_ptr coordinate_manager = + CoordinateManager::create_coordinate_manager(tt::ARCH::WORMHOLE_B0, 3); + + const std::vector tensix_cores = tt::umd::wormhole::TENSIX_CORES; + const tt_xy_pair tensix_grid_size = tt::umd::wormhole::TENSIX_GRID_SIZE; + + size_t virtual_index = (tensix_grid_size.y - num_harvested) * tensix_grid_size.x; + + for (size_t index = 0; index < num_harvested * tensix_grid_size.x; index++) { + const CoreCoord physical_core = + CoreCoord(tensix_cores[index].x, tensix_cores[index].y, CoreType::TENSIX, CoordSystem::PHYSICAL); + const CoreCoord virtual_core = coordinate_manager->to(physical_core, CoordSystem::VIRTUAL); + + EXPECT_EQ(virtual_core.x, tensix_cores[virtual_index].x); + EXPECT_EQ(virtual_core.y, tensix_cores[virtual_index].y); + + virtual_index++; + } +} + +// Test that harvested physical coordinates map to the last row of the virtual coordinates. +TEST(CoordinateManager, CoordinateManagerWormholePhysicalTranslatedHarvestedMapping) { + // Harvest first and second NOC layout row. + const size_t harvesting_mask = 3; + const size_t num_harvested = CoordinateManager::get_num_harvested(harvesting_mask); + std::shared_ptr coordinate_manager = + CoordinateManager::create_coordinate_manager(tt::ARCH::WORMHOLE_B0, 3); + + const std::vector tensix_cores = tt::umd::wormhole::TENSIX_CORES; + const tt_xy_pair tensix_grid_size = tt::umd::wormhole::TENSIX_GRID_SIZE; + + size_t virtual_index = (tensix_grid_size.y - num_harvested) * tensix_grid_size.x; + + const size_t translated_x_start = tt::umd::wormhole::tensix_translated_coordinate_start_x; + const size_t translated_y_start = tt::umd::wormhole::tensix_translated_coordinate_start_y; + + size_t logical_x = 0; + size_t logical_y = tensix_grid_size.y - num_harvested; + + for (size_t index = 0; index < num_harvested * tensix_grid_size.x; index++) { + const CoreCoord physical_core = + CoreCoord(tensix_cores[index].x, tensix_cores[index].y, CoreType::TENSIX, CoordSystem::PHYSICAL); + const CoreCoord translated_core = coordinate_manager->to(physical_core, CoordSystem::TRANSLATED); + + const CoreCoord virtual_core = CoreCoord( + tensix_cores[virtual_index].x, tensix_cores[virtual_index].y, CoreType::TENSIX, CoordSystem::VIRTUAL); + const CoreCoord translated_core_from_virtual = coordinate_manager->to(virtual_core, CoordSystem::TRANSLATED); + + EXPECT_EQ(translated_core, translated_core_from_virtual); + + EXPECT_EQ(translated_core.x, translated_x_start + logical_x); + EXPECT_EQ(translated_core.y, translated_y_start + logical_y); + + logical_x++; + + if (logical_x == tensix_grid_size.x) { + logical_x = 0; + logical_y++; + } + + virtual_index++; + } +} + // Test translation of DRAM core coordinates. There is no DRAM harvesting on Wormhole, // so logical coordinates should cover all physical coordinates. TEST(CoordinateManager, CoordinateManagerWormholeDRAMNoHarvesting) { @@ -263,16 +333,16 @@ TEST(CoordinateManager, CoordinateManagerWormholePCIETranslation) { for (size_t x = 0; x < pcie_grid_size.x; x++) { for (size_t y = 0; y < pcie_grid_size.y; y++) { - const CoreCoord arc_logical = CoreCoord(x, y, CoreType::PCIE, CoordSystem::LOGICAL); - const CoreCoord arc_virtual = coordinate_manager->to(arc_logical, CoordSystem::VIRTUAL); - const CoreCoord arc_physical = coordinate_manager->to(arc_logical, CoordSystem::PHYSICAL); - const CoreCoord arc_translated = coordinate_manager->to(arc_logical, CoordSystem::TRANSLATED); + const CoreCoord pcie_logical = CoreCoord(x, y, CoreType::PCIE, CoordSystem::LOGICAL); + const CoreCoord pcie_virtual = coordinate_manager->to(pcie_logical, CoordSystem::VIRTUAL); + const CoreCoord pcie_physical = coordinate_manager->to(pcie_logical, CoordSystem::PHYSICAL); + const CoreCoord pcie_translated = coordinate_manager->to(pcie_logical, CoordSystem::TRANSLATED); - EXPECT_EQ(arc_virtual.x, arc_physical.x); - EXPECT_EQ(arc_virtual.y, arc_physical.y); + EXPECT_EQ(pcie_virtual.x, pcie_physical.x); + EXPECT_EQ(pcie_virtual.y, pcie_physical.y); - EXPECT_EQ(arc_physical.x, arc_translated.x); - EXPECT_EQ(arc_physical.y, arc_translated.y); + EXPECT_EQ(pcie_virtual.x, pcie_translated.x); + EXPECT_EQ(pcie_virtual.y, pcie_translated.y); } } } From bf740bd52406f9682f24c8380a4fb3bd5ee59d0e Mon Sep 17 00:00:00 2001 From: Joel Smith <140545543+joelsmithTT@users.noreply.github.com> Date: Wed, 11 Dec 2024 14:17:33 -0600 Subject: [PATCH 3/3] IOMMU support episode II (#393) ### Issue https://github.com/tenstorrent/tt-umd/issues/370 ### Description Adds IOMMU support for Blackhole in a way that should be transparent to the application. ### List of the changes * Allow Blackhole to have multiple hugepages / host memory channels * Add an API on TTDevice for iATU programming * Rehome Blackhole iATU programming code to blackhole_tt_device.cpp * Remove unnecessary logic to determine hugepage quantity (just use what the application passes to Cluster constructor) * Add sysmem tests for Blackhole. ### Testing Manual testing was performed for both IOMMU on and IOMMU off cases using the newly-added sysmem tests for Blackhole. With IOMMU on: ``` [==========] Running 2 tests from 1 test suite. [----------] Global test environment set-up. [----------] 2 tests from SiliconDriverBH [ RUN ] SiliconDriverBH.SysmemTestWithPcie Detecting chips (found 1) 2024-12-10 20:40:07.019 | WARNING | SiliconDriver - Unknown board type for chip 0. This might happen because chip is running old firmware. Defaulting to UNKNOWN 2024-12-10 20:40:07.020 | WARNING | SiliconDriver - Unknown board type for chip 0. This might happen because chip is running old firmware. Defaulting to UNKNOWN 2024-12-10 20:40:07.083 | INFO | SiliconDriver - Detected PCI devices: [0] 2024-12-10 20:40:07.083 | INFO | SiliconDriver - Using local chip ids: {0} and remote chip ids {} 2024-12-10 20:40:07.083 | INFO | SiliconDriver - Opened PCI device 0; KMD version: 1.30.0, IOMMU: enabled 2024-12-10 20:40:07.170 | INFO | SiliconDriver - Allocating sysmem without hugepages (size: 0x40000000). 2024-12-10 20:40:07.417 | INFO | SiliconDriver - Mapped sysmem without hugepages to IOVA 0x3ffffff80000000. 2024-12-10 20:40:07.418 | INFO | SiliconDriver - Device: 0 Mapping iATU region 0 from 0x0 to 0x3fffffff to 0x3ffffff80000000 [ OK ] SiliconDriverBH.SysmemTestWithPcie (658 ms) [ RUN ] SiliconDriverBH.RandomSysmemTestWithPcie 2024-12-10 20:40:07.672 | WARNING | SiliconDriver - Unknown board type for chip 0. This might happen because chip is running old firmware. Defaulting to UNKNOWN 2024-12-10 20:40:07.672 | WARNING | SiliconDriver - Unknown board type for chip 0. This might happen because chip is running old firmware. Defaulting to UNKNOWN 2024-12-10 20:40:07.731 | INFO | SiliconDriver - Detected PCI devices: [0] 2024-12-10 20:40:07.731 | INFO | SiliconDriver - Using local chip ids: {0} and remote chip ids {} 2024-12-10 20:40:07.731 | INFO | SiliconDriver - Opened PCI device 0; KMD version: 1.30.0, IOMMU: enabled 2024-12-10 20:40:07.818 | INFO | SiliconDriver - Allocating sysmem without hugepages (size: 0x40000000). 2024-12-10 20:40:08.081 | INFO | SiliconDriver - Mapped sysmem without hugepages to IOVA 0x3ffffff80000000. 2024-12-10 20:40:08.327 | WARNING | SiliconDriver - Unknown board type for chip 0. This might happen because chip is running old firmware. Defaulting to UNKNOWN 2024-12-10 20:40:08.327 | WARNING | SiliconDriver - Unknown board type for chip 0. This might happen because chip is running old firmware. Defaulting to UNKNOWN 2024-12-10 20:40:08.387 | INFO | SiliconDriver - Detected PCI devices: [0] 2024-12-10 20:40:08.387 | INFO | SiliconDriver - Using local chip ids: {0} and remote chip ids {} 2024-12-10 20:40:08.387 | INFO | SiliconDriver - Opened PCI device 0; KMD version: 1.30.0, IOMMU: enabled 2024-12-10 20:40:08.474 | INFO | SiliconDriver - Allocating sysmem without hugepages (size: 0x100000000). 2024-12-10 20:40:09.453 | INFO | SiliconDriver - Mapped sysmem without hugepages to IOVA 0x3fffffe00000000. 2024-12-10 20:40:09.453 | INFO | SiliconDriver - Device: 0 Mapping iATU region 0 from 0x0 to 0x3fffffff to 0x3fffffe00000000 2024-12-10 20:40:09.454 | INFO | SiliconDriver - Device: 0 Mapping iATU region 1 from 0x40000000 to 0x7fffffff to 0x3fffffe40000000 2024-12-10 20:40:09.454 | INFO | SiliconDriver - Device: 0 Mapping iATU region 2 from 0x80000000 to 0xbfffffff to 0x3fffffe80000000 2024-12-10 20:40:09.454 | INFO | SiliconDriver - Device: 0 Mapping iATU region 3 from 0xc0000000 to 0xffffffff to 0x3fffffec0000000 [ OK ] SiliconDriverBH.RandomSysmemTestWithPcie (7754 ms) [----------] 2 tests from SiliconDriverBH (8413 ms total) [----------] Global test environment tear-down [==========] 2 tests from 1 test suite ran. (8413 ms total) [ PASSED ] 2 tests. ``` With IOMMU in passthrough: ``` [==========] Running 2 tests from 1 test suite. [----------] Global test environment set-up. [----------] 2 tests from SiliconDriverBH [ RUN ] SiliconDriverBH.SysmemTestWithPcie Detecting chips (found 1) 2024-12-10 20:59:03.744 | WARNING | SiliconDriver - Unknown board type for chip 0. This might happen because chip is running old firmware. Defaulting to UNKNOWN 2024-12-10 20:59:03.745 | WARNING | SiliconDriver - Unknown board type for chip 0. This might happen because chip is running old firmware. Defaulting to UNKNOWN 2024-12-10 20:59:03.812 | INFO | SiliconDriver - Detected PCI devices: [0] 2024-12-10 20:59:03.812 | INFO | SiliconDriver - Using local chip ids: {0} and remote chip ids {} 2024-12-10 20:59:03.813 | INFO | SiliconDriver - Opened PCI device 0; KMD version: 1.30.0, IOMMU: disabled 2024-12-10 20:59:03.928 | INFO | SiliconDriver - Device: 0 Mapping iATU region 0 from 0x0 to 0x3fffffff to 0xe00000000 [ OK ] SiliconDriverBH.SysmemTestWithPcie (383 ms) [ RUN ] SiliconDriverBH.RandomSysmemTestWithPcie 2024-12-10 20:59:04.121 | WARNING | SiliconDriver - Unknown board type for chip 0. This might happen because chip is running old firmware. Defaulting to UNKNOWN 2024-12-10 20:59:04.121 | WARNING | SiliconDriver - Unknown board type for chip 0. This might happen because chip is running old firmware. Defaulting to UNKNOWN 2024-12-10 20:59:04.177 | INFO | SiliconDriver - Detected PCI devices: [0] 2024-12-10 20:59:04.177 | INFO | SiliconDriver - Using local chip ids: {0} and remote chip ids {} 2024-12-10 20:59:04.177 | INFO | SiliconDriver - Opened PCI device 0; KMD version: 1.30.0, IOMMU: disabled 2024-12-10 20:59:04.380 | WARNING | SiliconDriver - Unknown board type for chip 0. This might happen because chip is running old firmware. Defaulting to UNKNOWN 2024-12-10 20:59:04.380 | WARNING | SiliconDriver - Unknown board type for chip 0. This might happen because chip is running old firmware. Defaulting to UNKNOWN 2024-12-10 20:59:04.435 | INFO | SiliconDriver - Detected PCI devices: [0] 2024-12-10 20:59:04.435 | INFO | SiliconDriver - Using local chip ids: {0} and remote chip ids {} 2024-12-10 20:59:04.436 | INFO | SiliconDriver - Opened PCI device 0; KMD version: 1.30.0, IOMMU: disabled 2024-12-10 20:59:04.513 | INFO | SiliconDriver - Device: 0 Mapping iATU region 0 from 0x0 to 0x3fffffff to 0xe00000000 2024-12-10 20:59:04.513 | INFO | SiliconDriver - Device: 0 Mapping iATU region 1 from 0x40000000 to 0x7fffffff to 0xe40000000 2024-12-10 20:59:04.513 | INFO | SiliconDriver - Device: 0 Mapping iATU region 2 from 0x80000000 to 0xbfffffff to 0xe80000000 2024-12-10 20:59:04.513 | INFO | SiliconDriver - Device: 0 Mapping iATU region 3 from 0xc0000000 to 0xffffffff to 0xec0000000 [ OK ] SiliconDriverBH.RandomSysmemTestWithPcie (11055 ms) [----------] 2 tests from SiliconDriverBH (11438 ms total) [----------] Global test environment tear-down [==========] 2 tests from 1 test suite ran. (11438 ms total) [ PASSED ] 2 tests. ``` ### API Changes There are no API changes in this PR. --- device/api/umd/device/hugepage.h | 4 - device/api/umd/device/pci_device.hpp | 4 +- .../device/tt_device/blackhole_tt_device.h | 8 + device/api/umd/device/tt_device/tt_device.h | 29 +++ device/cluster.cpp | 179 +++++++----------- device/hugepage.cpp | 62 ------ device/pcie/pci_device.cpp | 6 +- device/tt_device/blackhole_tt_device.cpp | 77 ++++++-- device/tt_device/tt_device.cpp | 9 + tests/blackhole/test_silicon_driver_bh.cpp | 128 +++++++++++++ tests/microbenchmark/test_rw_tensix.cpp | 6 + 11 files changed, 320 insertions(+), 192 deletions(-) diff --git a/device/api/umd/device/hugepage.h b/device/api/umd/device/hugepage.h index f53246f9..9b4c9a5b 100644 --- a/device/api/umd/device/hugepage.h +++ b/device/api/umd/device/hugepage.h @@ -16,10 +16,6 @@ namespace tt::umd { // Get number of 1GB host hugepages installed. uint32_t get_num_hugepages(); -// Dynamically figure out how many host memory channels (based on hugepages installed) for each device, based on arch. -uint32_t get_available_num_host_mem_channels( - const uint32_t num_channels_per_device_target, const uint16_t device_id, const uint16_t revision_id); - // Looks for hugetlbfs inside /proc/mounts matching desired pagesize (typically 1G) std::string find_hugepage_dir(std::size_t pagesize); diff --git a/device/api/umd/device/pci_device.hpp b/device/api/umd/device/pci_device.hpp index 0c149422..ce4b6392 100644 --- a/device/api/umd/device/pci_device.hpp +++ b/device/api/umd/device/pci_device.hpp @@ -142,8 +142,8 @@ class PCIDevice { */ bool init_iommu(size_t size); - int get_num_host_mem_channels() const; - hugepage_mapping get_hugepage_mapping(int channel) const; + size_t get_num_host_mem_channels() const; + hugepage_mapping get_hugepage_mapping(size_t channel) const; /** * Map a buffer for DMA access by the device. diff --git a/device/api/umd/device/tt_device/blackhole_tt_device.h b/device/api/umd/device/tt_device/blackhole_tt_device.h index 27955749..dd22bded 100644 --- a/device/api/umd/device/tt_device/blackhole_tt_device.h +++ b/device/api/umd/device/tt_device/blackhole_tt_device.h @@ -6,6 +6,8 @@ #pragma once +#include + #include "umd/device/tt_device/tt_device.h" namespace tt::umd { @@ -13,5 +15,11 @@ class BlackholeTTDevice : public TTDevice { public: BlackholeTTDevice(std::unique_ptr pci_device); ~BlackholeTTDevice(); + + void configure_iatu_region(size_t region, uint64_t base, uint64_t target, size_t size) override; + +private: + static constexpr uint64_t ATU_OFFSET_IN_BH_BAR2 = 0x1200; + std::set iatu_regions_; }; } // namespace tt::umd diff --git a/device/api/umd/device/tt_device/tt_device.h b/device/api/umd/device/tt_device/tt_device.h index 31f9b818..056d0f08 100644 --- a/device/api/umd/device/tt_device/tt_device.h +++ b/device/api/umd/device/tt_device/tt_device.h @@ -85,6 +85,35 @@ class TTDevice { tt_xy_pair end, std::uint64_t ordering = tt::umd::tlb_data::Relaxed); + /** + * Configures a PCIe Address Translation Unit (iATU) region. + * + * Device software expects to be able to access memory that is shared with + * the host using the following NOC addresses at the PCIe core: + * - GS: 0x0 + * - WH: 0x8_0000_0000 + * - BH: 0x1000_0000_0000_0000 + * Without iATU configuration, these map to host PA 0x0. + * + * While modern hardware supports IOMMU with flexible IOVA mapping, we must + * maintain the iATU configuration to satisfy software that has hard-coded + * the above NOC addresses rather than using driver-provided IOVAs. + * + * This interface is only intended to be used for configuring sysmem with + * either 1GB hugepages or a compatible scheme. + * + * @param region iATU region index (0-15) + * @param base region * (1 << 30) + * @param target DMA address (PA or IOVA) to map to + * @param size size of the mapping window; must be (1 << 30) + * + * NOTE: Programming the iATU from userspace is architecturally incorrect: + * - iATU should be managed by KMD to ensure proper cleanup on process exit + * - Multiple processes can corrupt each other's iATU configurations + * We should fix this! + */ + virtual void configure_iatu_region(size_t region, uint64_t base, uint64_t target, size_t size); + protected: std::unique_ptr pci_device_; std::unique_ptr architecture_impl_; diff --git a/device/cluster.cpp b/device/cluster.cpp index 80859faa..e1981064 100644 --- a/device/cluster.cpp +++ b/device/cluster.cpp @@ -240,18 +240,7 @@ void Cluster::create_device( } auto pci_device = m_tt_device_map.at(logical_device_id)->get_pci_device(); - uint16_t pcie_device_id = pci_device->get_pci_device_id(); - uint32_t pcie_revision = pci_device->get_pci_revision(); - // TODO: get rid of this, it doesn't make any sense. - int num_host_mem_channels = - get_available_num_host_mem_channels(num_host_mem_ch_per_mmio_device, pcie_device_id, pcie_revision); - if (pci_device->get_arch() == tt::ARCH::BLACKHOLE && num_host_mem_channels > 1) { - // TODO: Implement support for multiple host channels on BLACKHOLE. - log_warning( - LogSiliconDriver, - "Forcing a single channel for Blackhole device. Multiple host channels not supported."); - num_host_mem_channels = 1; - } + int num_host_mem_channels = num_host_mem_ch_per_mmio_device; log_debug( LogSiliconDriver, @@ -268,11 +257,6 @@ void Cluster::create_device( // MT: Initial BH - hugepages will fail init // For using silicon driver without workload to query mission mode params, no need for hugepage. if (!skip_driver_allocs) { - // TODO: Implement support for multiple host channels on BLACKHOLE. - log_assert( - !(arch_name == tt::ARCH::BLACKHOLE && num_host_mem_channels > 1), - "More channels are not yet supported for Blackhole"); - // Same number of host channels per device for now bool hugepages_initialized = pci_device->init_hugepage(num_host_mem_channels); // Large writes to remote chips require hugepages to be initialized. // Conservative assert - end workload if remote chips present but hugepages not initialized (failures caused @@ -1403,43 +1387,68 @@ void Cluster::set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, ui dynamic_tlb_ordering_modes.at(fallback_tlb) = ordering; } -// TT<->TT P2P support removed in favor of increased Host memory. -// TODO: this is in the wrong place, it should be in the PCIDevice. +// TODO: this is in the wrong place, it should be in the TTDevice. +// It should also happen at the same time the huge pages or sysmem buffers are +// allocated/pinned/mapped. void Cluster::init_pcie_iatus() { int num_enabled_devices = m_tt_device_map.size(); log_debug(LogSiliconDriver, "Cluster::init_pcie_iatus() num_enabled_devices: {}", num_enabled_devices); - for (auto& src_device_it : m_tt_device_map) { - int logical_id = src_device_it.first; - PCIDevice* src_pci_device = src_device_it.second->get_pci_device(); + for (auto& [logical_id, tt_device] : m_tt_device_map) { + PCIDevice* pci_device = tt_device->get_pci_device(); // TODO: with the IOMMU case, I think we can get away with using just // one iATU region for WH. (On BH, we don't need iATU). We can only // cover slightly less than 4GB with WH, and the iATU can cover 4GB. // Splitting it into multiple regions is fine, but it's not necessary. // - // ... something to consider when this code is refactored into PCIDevice - // where it belongs. - - // Device to Host (multiple channels) - for (int channel_id = 0; channel_id < src_pci_device->get_num_host_mem_channels(); channel_id++) { - hugepage_mapping hugepage_map = src_pci_device->get_hugepage_mapping(channel_id); - if (hugepage_map.mapping) { - std::uint32_t region_size = hugepage_map.mapping_size; - if (channel_id == 3) { + // Update: unfortunately this turned out to be unrealistic. For the + // IOMMU case, the easiest thing to do is fake that we have hugepages + // so we can support the hugepage-inspired API that the user application + // has come to rely on. In that scenario, it's simpler to treat such + // fake hugepages the same way we treat real ones -- even if underneath + // there is only a single buffer. Simple is good. + // + // With respect to BH: it turns out that Metal has hard-coded NOC + // addressing assumptions for sysmem access. First step to fix this is + // have Metal ask us where sysmem is at runtime, and use that value in + // on-device code. Until then, we're stuck programming iATU. A more + // forward-looking solution is to abandon the sysmem API entirely, and + // have the application assume a more active role in managing the memory + // shared between host and device. UMD would be relegated to assisting + // the application set up and tear down the mappings. This is probably + // a unrealistic for GS/WH, but it's a good goal for BH. + // + // Until then... + // + // For every 1GB channel of memory mapped for DMA, program an iATU + // region to map it to the underlying buffer's IOVA (IOMMU case) or PA + // (non-IOMMU case). + for (size_t channel = 0; channel < pci_device->get_num_host_mem_channels(); channel++) { + hugepage_mapping hugepage_map = pci_device->get_hugepage_mapping(channel); + size_t region_size = hugepage_map.mapping_size; + + if (!hugepage_map.mapping) { + throw std::runtime_error( + fmt::format("Hugepages are not allocated for logical device id: {} ch: {}", logical_id, channel)); + } + + if (arch_name == tt::ARCH::BLACKHOLE) { + uint64_t base = channel * region_size; + uint64_t target = hugepage_map.physical_address; + tt_device->configure_iatu_region(channel, base, target, region_size); + } else { + // TODO: stop doing this. The intent was good, but it's not + // documented and nothing takes advantage of it. + if (channel == 3) { region_size = HUGEPAGE_CHANNEL_3_SIZE_LIMIT; } - // This log message doesn't look right. - log_debug( - LogSiliconDriver, "Configuring ATU channel {} to point to hugepage {}.", channel_id, logical_id); - iatu_configure_peer_region(logical_id, channel_id, hugepage_map.physical_address, region_size); - - } else { - throw std::runtime_error(fmt::format( - "init_pcie_iatus: Hugepages are not allocated for logical device id: {} ch: {}", - logical_id, - channel_id)); + // TODO: remove this and the Blackhole special case after ARC + // messaging is lowered to the TTDevice layer and we have a + // configure_iatu_region that works for GS/WH. Longer term it'd + // be nice to have KMD deal with iATU for us... + iatu_configure_peer_region(logical_id, channel, hugepage_map.physical_address, region_size); } } } @@ -1596,86 +1605,40 @@ int Cluster::pcie_arc_msg( return exit_code; } +// TODO: this method should be lowered into TTDevice, where a common +// implementation can be shared between GS/WH. The major obstacle to doing it +// (and the reason I'm leaving it alone for now) is the lack of ARC messaging +// support at that layer of abstraction. int Cluster::iatu_configure_peer_region( int logical_device_id, uint32_t peer_region_id, uint64_t bar_addr_64, uint32_t region_size) { + if (arch_name == tt::ARCH::BLACKHOLE) { + throw std::runtime_error("Don't call this for Blackhole"); + } + uint32_t dest_bar_lo = bar_addr_64 & 0xffffffff; uint32_t dest_bar_hi = (bar_addr_64 >> 32) & 0xffffffff; std::uint32_t region_id_to_use = peer_region_id; + + // TODO: stop doing this. It's related to HUGEPAGE_CHANNEL_3_SIZE_LIMIT. if (peer_region_id == 3) { region_id_to_use = 4; // Hack use region 4 for channel 3..this ensures that we have a smaller chan 3 address // space with the correct start offset } + TTDevice* tt_device = get_tt_device(logical_device_id); PCIDevice* pci_device = tt_device->get_pci_device(); auto architecture_implementation = tt_device->get_architecture_implementation(); - // BR: ARC doesn't work yet on Blackhole, so programming ATU directly. Should be removed when arc starts working. - // TODO: Remove when ARC is implemented on BH. - if (arch_name == tt::ARCH::BLACKHOLE) { - uint64_t base_addr = region_id_to_use * region_size; - uint64_t base_size = (region_id_to_use + 1) * region_size; - uint64_t limit_address = base_addr + base_size - 1; - - uint32_t region_ctrl_1 = 1 << 13; // INCREASE_REGION_SIZE = 1 - uint32_t region_ctrl_2 = 1 << 31; // REGION_EN = 1 - uint32_t region_ctrl_3 = 0; - uint32_t base_addr_lo = base_addr & 0xffffffff; - uint32_t base_addr_hi = (base_addr >> 32) & 0xffffffff; - uint32_t limit_address_lo = limit_address & 0xffffffff; - uint32_t limit_address_hi = (limit_address >> 32) & 0xffffffff; - - uint64_t iatu_index = 0; - uint64_t iatu_base = UNROLL_ATU_OFFSET_BAR + iatu_index * 0x200; - - tt_device->write_regs( - reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x00), - ®ion_ctrl_1, - 1); - tt_device->write_regs( - reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x04), - ®ion_ctrl_2, - 1); - tt_device->write_regs( - reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x08), - &base_addr_lo, - 1); - tt_device->write_regs( - reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x0c), - &base_addr_hi, - 1); - tt_device->write_regs( - reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x10), - &limit_address_lo, - 1); - tt_device->write_regs( - reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x14), - &dest_bar_lo, - 1); - tt_device->write_regs( - reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x18), - &dest_bar_hi, - 1); - tt_device->write_regs( - reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x1c), - ®ion_ctrl_3, - 1); - tt_device->write_regs( - reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x20), - &limit_address_hi, - 1); - } else { - bar_write32( - logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 0 * 4, region_id_to_use); - bar_write32(logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 1 * 4, dest_bar_lo); - bar_write32(logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 2 * 4, dest_bar_hi); - bar_write32(logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 3 * 4, region_size); - arc_msg( - logical_device_id, - 0xaa00 | architecture_implementation->get_arc_message_setup_iatu_for_peer_to_peer(), - true, - 0, - 0); - } + bar_write32(logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 0 * 4, region_id_to_use); + bar_write32(logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 1 * 4, dest_bar_lo); + bar_write32(logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 2 * 4, dest_bar_hi); + bar_write32(logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 3 * 4, region_size); + arc_msg( + logical_device_id, + 0xaa00 | architecture_implementation->get_arc_message_setup_iatu_for_peer_to_peer(), + true, + 0, + 0); // Print what just happened uint32_t peer_region_start = region_id_to_use * region_size; diff --git a/device/hugepage.cpp b/device/hugepage.cpp index 8883bff2..31aad9b3 100644 --- a/device/hugepage.cpp +++ b/device/hugepage.cpp @@ -37,68 +37,6 @@ uint32_t get_num_hugepages() { return num_hugepages; } -uint32_t get_available_num_host_mem_channels( - const uint32_t num_channels_per_device_target, const uint16_t device_id, const uint16_t revision_id) { - // To minimally support hybrid dev systems with mix of ARCH, get only devices matching current ARCH's device_id. - uint32_t total_num_tt_mmio_devices = tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices(); - uint32_t num_tt_mmio_devices_for_arch = - tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices_by_pci_device_id(device_id, revision_id); - uint32_t total_hugepages = get_num_hugepages(); - - // This shouldn't happen on silicon machines. - if (num_tt_mmio_devices_for_arch == 0) { - log_warning( - LogSiliconDriver, - "No TT devices found that match PCI device_id: 0x{:x} revision: {}, returning NumHostMemChannels:0", - device_id, - revision_id); - return 0; - } - - // GS will use P2P + 1 channel, others may support 4 host channels. Apply min of 1 to not completely break setups - // that were incomplete ie fewer hugepages than devices, which would partially work previously for some devices. - uint32_t num_channels_per_device_available = - std::min(num_channels_per_device_target, std::max((uint32_t)1, total_hugepages / num_tt_mmio_devices_for_arch)); - - // Perform some helpful assertion checks to guard against common pitfalls that would show up as runtime issues later - // on. - if (total_num_tt_mmio_devices > num_tt_mmio_devices_for_arch) { - log_warning( - LogSiliconDriver, - "Hybrid system mixing different TTDevices - this is not well supported. Ensure sufficient " - "Hugepages/HostMemChannels per device."); - } - - if (total_hugepages < num_tt_mmio_devices_for_arch) { - log_warning( - LogSiliconDriver, - "Insufficient NumHugepages: {} should be at least NumMMIODevices: {} for device_id: 0x{:x} revision: {}. " - "NumHostMemChannels would be 0, bumping to 1.", - total_hugepages, - num_tt_mmio_devices_for_arch, - device_id, - revision_id); - } - - if (num_channels_per_device_available < num_channels_per_device_target) { - log_warning( - LogSiliconDriver, - "NumHostMemChannels: {} used for device_id: 0x{:x} less than target: {}. Workload will fail if it exceeds " - "NumHostMemChannels. Increase Number of Hugepages.", - num_channels_per_device_available, - device_id, - num_channels_per_device_target); - } - - log_assert( - num_channels_per_device_available <= g_MAX_HOST_MEM_CHANNELS, - "NumHostMemChannels: {} exceeds supported maximum: {}, this is unexpected.", - num_channels_per_device_available, - g_MAX_HOST_MEM_CHANNELS); - - return num_channels_per_device_available; -} - std::string find_hugepage_dir(std::size_t pagesize) { static const std::regex hugetlbfs_mount_re( fmt::format("^(nodev|hugetlbfs) ({}) hugetlbfs ([^ ]+) 0 0$", hugepage_dir)); diff --git a/device/pcie/pci_device.cpp b/device/pcie/pci_device.cpp index 0d248741..ef112a89 100644 --- a/device/pcie/pci_device.cpp +++ b/device/pcie/pci_device.cpp @@ -526,10 +526,10 @@ bool PCIDevice::init_iommu(size_t size) { return true; } -int PCIDevice::get_num_host_mem_channels() const { return hugepage_mapping_per_channel.size(); } +size_t PCIDevice::get_num_host_mem_channels() const { return hugepage_mapping_per_channel.size(); } -hugepage_mapping PCIDevice::get_hugepage_mapping(int channel) const { - if (channel < 0 || hugepage_mapping_per_channel.size() <= channel) { +hugepage_mapping PCIDevice::get_hugepage_mapping(size_t channel) const { + if (hugepage_mapping_per_channel.size() <= channel) { return {nullptr, 0, 0}; } else { return hugepage_mapping_per_channel[channel]; diff --git a/device/tt_device/blackhole_tt_device.cpp b/device/tt_device/blackhole_tt_device.cpp index 304a3443..192f6b48 100644 --- a/device/tt_device/blackhole_tt_device.cpp +++ b/device/tt_device/blackhole_tt_device.cpp @@ -5,6 +5,7 @@ #include // for MAP_FAILED +#include "logger.hpp" #include "umd/device/blackhole_implementation.h" namespace tt::umd { @@ -13,20 +14,70 @@ BlackholeTTDevice::BlackholeTTDevice(std::unique_ptr pci_device) : TTDevice(std::move(pci_device), std::make_unique()) {} BlackholeTTDevice::~BlackholeTTDevice() { + // Turn off iATU for the regions we programmed. This won't happen if the + // application crashes -- this is a good example of why userspace should not + // be touching this hardware resource directly -- but it's a good idea to + // clean up after ourselves. if (pci_device_->bar2_uc != nullptr && pci_device_->bar2_uc != MAP_FAILED) { - // Disable ATU index 0 - // TODO: Implement disabling for all indexes, once more host channels are enabled. - - // This is not going to happen if the application crashes, so if it's - // essential for correctness then it needs to move to the driver. - uint64_t iatu_index = 0; - uint64_t iatu_base = UNROLL_ATU_OFFSET_BAR + iatu_index * 0x200; - uint32_t region_ctrl_2 = 0 << 31; // REGION_EN = 0 - - volatile uint32_t *dest = - reinterpret_cast(static_cast(pci_device_->bar2_uc) + iatu_base + 0x04); - const uint32_t *src = ®ion_ctrl_2; - *dest = *src; + auto *bar2 = static_cast(pci_device_->bar2_uc); + + for (size_t region : iatu_regions_) { + uint64_t iatu_base = ATU_OFFSET_IN_BH_BAR2 + (region * 0x200); + uint64_t region_ctrl_2 = 0; + *reinterpret_cast(bar2 + iatu_base + 0x04) = region_ctrl_2; + } + } +} + +void BlackholeTTDevice::configure_iatu_region(size_t region, uint64_t base, uint64_t target, size_t size) { + uint64_t iatu_base = ATU_OFFSET_IN_BH_BAR2 + (region * 0x200); + auto *bar2 = static_cast(pci_device_->bar2_uc); + + if (size % (1ULL << 30) != 0 || size > (1ULL << 32)) { + // If you hit this, the suggestion is to not use iATU: map your buffer + // with the driver, and use the IOVA it provides in your device code. + throw std::runtime_error("Constraint: size % (1ULL << 30) == 0; size <= (1ULL <<32)"); + } + + if (bar2 == nullptr || bar2 == MAP_FAILED) { + throw std::runtime_error("BAR2 not mapped"); } + + auto write_iatu_reg = [bar2](uint64_t offset, uint32_t value) { + *reinterpret_cast(bar2 + offset) = value; + }; + + uint64_t limit = (base + (size - 1)) & 0xffff'ffff; + uint32_t base_lo = (base >> 0x00) & 0xffff'ffff; + uint32_t base_hi = (base >> 0x20) & 0xffff'ffff; + uint32_t target_lo = (target >> 0x00) & 0xffff'ffff; + uint32_t target_hi = (target >> 0x20) & 0xffff'ffff; + + uint32_t region_ctrl_1 = 0; + uint32_t region_ctrl_2 = 1 << 31; // REGION_EN + uint32_t region_ctrl_3 = 0; + uint32_t limit_hi = 0; + + write_iatu_reg(iatu_base + 0x00, region_ctrl_1); + write_iatu_reg(iatu_base + 0x04, region_ctrl_2); + write_iatu_reg(iatu_base + 0x08, base_lo); + write_iatu_reg(iatu_base + 0x0c, base_hi); + write_iatu_reg(iatu_base + 0x10, limit); + write_iatu_reg(iatu_base + 0x14, target_lo); + write_iatu_reg(iatu_base + 0x18, target_hi); + write_iatu_reg(iatu_base + 0x1c, limit_hi); + write_iatu_reg(iatu_base + 0x20, region_ctrl_3); + + iatu_regions_.insert(region); + + log_info( + LogSiliconDriver, + "Device: {} Mapped iATU region {} from 0x{:x} to 0x{:x} to 0x{:x}", + this->pci_device_->get_device_num(), + region, + base, + limit, + target); } + } // namespace tt::umd diff --git a/device/tt_device/tt_device.cpp b/device/tt_device/tt_device.cpp index d9045bcd..e4ecca50 100644 --- a/device/tt_device/tt_device.cpp +++ b/device/tt_device/tt_device.cpp @@ -310,4 +310,13 @@ dynamic_tlb TTDevice::set_dynamic_tlb_broadcast( return set_dynamic_tlb(tlb_index, start, end, address, true, harvested_coord_translation, ordering); } +void TTDevice::configure_iatu_region(size_t region, uint64_t base, uint64_t target, size_t size) { + // TODO: The code to do this is still up in cluster.cpp. It should be moved + // here, but a prerequisite is to have an ARC messaging interface at this + // (TTDevice) level... it too is still up in cluster.cpp. + // + // For now, just throw an exception. + throw std::runtime_error("configure_iatu_region is not implemented for this device"); +} + } // namespace tt::umd diff --git a/tests/blackhole/test_silicon_driver_bh.cpp b/tests/blackhole/test_silicon_driver_bh.cpp index 3f6ac8b3..5b653d4d 100644 --- a/tests/blackhole/test_silicon_driver_bh.cpp +++ b/tests/blackhole/test_silicon_driver_bh.cpp @@ -878,3 +878,131 @@ TEST(SiliconDriverBH, DISABLED_VirtualCoordinateBroadcast) { // same problem as } device.close_device(); } + +/** + * Copied from the Wormhole test. + */ +TEST(SiliconDriverBH, SysmemTestWithPcie) { + auto target_devices = get_target_devices(); + + Cluster cluster( + 1, // one "host memory channel", + false, // skip driver allocs - no (don't skip) + true, // clean system resources - yes + true); // perform harvesting - yes + + set_params_for_remote_txn(cluster); + cluster.start_device(tt_device_params{}); // no special parameters + + const chip_id_t mmio_chip_id = 0; + const auto PCIE = cluster.get_soc_descriptor(mmio_chip_id).pcie_cores.at(0); + const tt_cxy_pair PCIE_CORE(mmio_chip_id, PCIE.x, PCIE.y); + const size_t test_size_bytes = 0x4000; // Arbitrarilly chosen, but small size so the test runs quickly. + + uint8_t* sysmem = (uint8_t*)cluster.host_dma_address(0, 0, 0); + ASSERT_NE(sysmem, nullptr); + + uint64_t base_address = cluster.get_pcie_base_addr_from_device(mmio_chip_id); + + // Buffer that we will use to read sysmem into, then write sysmem from. + std::vector buffer(test_size_bytes, 0x0); + + // Step 1: Fill sysmem with random bytes. + test_utils::fill_with_random_bytes(sysmem, test_size_bytes); + + // Step 2: Read sysmem into buffer. + cluster.read_from_device(&buffer[0], PCIE_CORE, base_address, buffer.size(), "REG_TLB"); + + // Step 3: Verify that buffer matches sysmem. + ASSERT_EQ(buffer, std::vector(sysmem, sysmem + test_size_bytes)); + + // Step 4: Fill buffer with random bytes. + test_utils::fill_with_random_bytes(&buffer[0], test_size_bytes); + + // Step 5: Write buffer into sysmem, overwriting what was there. + cluster.write_to_device(&buffer[0], buffer.size(), PCIE_CORE, base_address, "REG_TLB"); + + // Step 5b: Read back sysmem into a throwaway buffer. The intent is to + // ensure the write has completed before we check sysmem against buffer. + std::vector throwaway(test_size_bytes, 0x0); + cluster.read_from_device(&throwaway[0], PCIE_CORE, base_address, throwaway.size(), "REG_TLB"); + + // Step 6: Verify that sysmem matches buffer. + ASSERT_EQ(buffer, std::vector(sysmem, sysmem + test_size_bytes)); +} + +static bool is_iommu_available() { + const size_t num_channels = 1; + auto target_devices = get_target_devices(); + Cluster cluster( + test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), + target_devices, + num_channels, + false, // skip driver allocs - no (don't skip) + true, // clean system resources - yes + true); // perform harvesting - yes + return cluster.get_tt_device(0)->get_pci_device()->is_iommu_enabled(); +} + +/** + * Same idea as above, but with multiple channels of sysmem and random addresses. + * The hardware mechanism is too slow to sweep the entire range. + */ +TEST(SiliconDriverBH, RandomSysmemTestWithPcie) { + // How many hugepages will Blackhole CI systems allocate? Hopefully zero, + // and they'll have IOMMU instead. But if not, let's assume 2. + const size_t num_channels = is_iommu_available() ? 4 : 2; + auto target_devices = get_target_devices(); + + Cluster cluster( + test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), + target_devices, + num_channels, + false, // skip driver allocs - no (don't skip) + true, // clean system resources - yes + true); // perform harvesting - yes + + set_params_for_remote_txn(cluster); + cluster.start_device(tt_device_params{}); // no special parameters + + const chip_id_t mmio_chip_id = 0; + const auto PCIE = cluster.get_soc_descriptor(mmio_chip_id).pcie_cores.at(0); + const tt_cxy_pair PCIE_CORE(mmio_chip_id, PCIE.x, PCIE.y); + const size_t ONE_GIG = 1 << 30; + const size_t num_tests = 0x20000; // runs in a reasonable amount of time + + const uint64_t ALIGNMENT = sizeof(uint32_t); + auto generate_aligned_address = [&](uint64_t lo, uint64_t hi) -> uint64_t { + static std::random_device rd; + static std::mt19937_64 gen(rd()); + std::uniform_int_distribution dis(lo / ALIGNMENT, hi / ALIGNMENT); + return dis(gen) * ALIGNMENT; + }; + + uint64_t base_address = cluster.get_pcie_base_addr_from_device(mmio_chip_id); + for (size_t channel = 0; channel < num_channels; ++channel) { + uint8_t* sysmem = (uint8_t*)cluster.host_dma_address(0, 0, channel); + ASSERT_NE(sysmem, nullptr); + + test_utils::fill_with_random_bytes(sysmem, ONE_GIG); + + uint64_t lo = (ONE_GIG * channel); + uint64_t hi = (lo + ONE_GIG) - 1; + + for (size_t i = 0; i < num_tests; ++i) { + uint64_t address = generate_aligned_address(lo, hi); + uint64_t noc_addr = base_address + address; + uint64_t sysmem_address = address - lo; + + ASSERT_GE(address, lo) << "Address too low"; + ASSERT_LE(address, hi) << "Address too high"; + ASSERT_EQ(address % ALIGNMENT, 0) << "Address not properly aligned"; + + uint32_t value = 0; + cluster.read_from_device(&value, PCIE_CORE, noc_addr, sizeof(uint32_t), "LARGE_READ_TLB"); + + uint32_t expected = *reinterpret_cast(&sysmem[sysmem_address]); + ASSERT_EQ(value, expected) << fmt::format("Mismatch at address {:#x}", address); + } + } +} diff --git a/tests/microbenchmark/test_rw_tensix.cpp b/tests/microbenchmark/test_rw_tensix.cpp index 81f55819..f0d9a9d5 100644 --- a/tests/microbenchmark/test_rw_tensix.cpp +++ b/tests/microbenchmark/test_rw_tensix.cpp @@ -10,6 +10,10 @@ #include "nanobench.h" #include "tests/test_utils/device_test_utils.hpp" +// This isn't compiling and I'm too busy to figure out why. +// I'm also too busy to coax my environment into not building it, so... +#if 0 + std::uint32_t generate_random_address(std::uint32_t max, std::uint32_t min = 0) { ankerl::nanobench::Rng gen(80085); std::uniform_int_distribution<> dis(min, max); // between 0 and 1MB @@ -139,3 +143,5 @@ TEST_F(uBenchmarkFixture, Read32BytesRandomAddr) { bench.render(ankerl::nanobench::templates::csv(), results_csv); } + +#endif