diff --git a/device/api/umd/device/cluster.h b/device/api/umd/device/cluster.h index a96d4d94..2276997f 100644 --- a/device/api/umd/device/cluster.h +++ b/device/api/umd/device/cluster.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -97,6 +98,15 @@ class tt_device { throw std::runtime_error("---- tt_device::configure_tlb is not implemented\n"); } + virtual void configure_tlb( + chip_id_t logical_device_id, + tt::umd::CoreCoord core, + int32_t tlb_index, + uint64_t address, + uint64_t ordering = TLB_DATA::Relaxed) { + throw std::runtime_error("---- tt_device::configure_tlb is not implemented\n"); + } + /** * Set ordering mode for dynamic/fallback TLBs (passed into driver constructor). * @@ -134,6 +144,12 @@ class tt_device { "---- tt_device::configure_active_ethernet_cores_for_mmio_device is not implemented\n"); } + virtual void configure_active_ethernet_cores_for_mmio_device( + chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip) { + throw std::runtime_error( + "---- tt_device::configure_active_ethernet_cores_for_mmio_device is not implemented\n"); + } + /** * On Silicon: Assert soft Tensix reset, deassert RiscV reset, set power state to busy (ramp up AICLK), initialize * iATUs for PCIe devices and ethernet queues for remote chips. @@ -161,6 +177,13 @@ class tt_device { throw std::runtime_error("---- tt_device::deassert_risc_reset_at_core is not implemented\n"); } + virtual void deassert_risc_reset_at_core( + const chip_id_t chip, + const tt::umd::CoreCoord core, + const TensixSoftResetOptions& soft_resets = TENSIX_DEASSERT_SOFT_RESET) { + throw std::runtime_error("---- tt_device::deassert_risc_reset_at_core is not implemented\n"); + } + /** * Broadcast assert soft Tensix Reset to the entire device. 
*/ @@ -177,6 +200,10 @@ class tt_device { throw std::runtime_error("---- tt_device::assert_risc_reset_at_core is not implemented\n"); } + virtual void assert_risc_reset_at_core(const chip_id_t chip, const tt::umd::CoreCoord core) { + throw std::runtime_error("---- tt_device::assert_risc_reset_at_core is not implemented\n"); + } + /** * To be called at the end of a run. * Set power state to idle, assert tensix reset at all cores. @@ -216,6 +243,16 @@ class tt_device { throw std::runtime_error("---- tt_device::write_to_device is not implemented\n"); } + virtual void write_to_device( + const void* mem_ptr, + uint32_t size_in_bytes, + chip_id_t chip, + tt::umd::CoreCoord core, + uint64_t addr, + const std::string& tlb_to_use) { + throw std::runtime_error("---- tt_device::write_to_device is not implemented\n"); + } + virtual void broadcast_write_to_cluster( const void* mem_ptr, uint32_t size_in_bytes, @@ -242,6 +279,16 @@ class tt_device { throw std::runtime_error("---- tt_device::read_from_device is not implemented\n"); } + virtual void read_from_device( + void* mem_ptr, + chip_id_t chip, + tt::umd::CoreCoord core, + uint64_t addr, + uint32_t size, + const std::string& fallback_tlb) { + throw std::runtime_error("---- tt_device::read_from_device is not implemented\n"); + } + /** * Write uint32_t vector to specified address and channel on host (defined for Silicon). 
* @@ -452,6 +499,11 @@ class tt_device { virtual const tt_SocDescriptor& get_soc_descriptor(chip_id_t chip_id) const { return soc_descriptor_per_chip.at(chip_id); } + virtual tt::umd::CoreCoord to( + const chip_id_t chip, const tt::umd::CoreCoord core_coord, const CoordSystem coord_system) { + throw std::runtime_error("---- tt_device::to is not implemented\n"); + return tt::umd::CoreCoord(); + } bool performed_harvesting = false; std::unordered_map harvested_rows_per_target = {}; @@ -560,33 +612,84 @@ class Cluster : public tt_device { */ static std::unique_ptr create_mock_cluster(); - // Setup/Teardown Functions + // Existing API we want to keep. UMD is transitioning to use CoreCoord instead of tt_xy_pair. + // This set of functions shouldn't be removed even after the transition. + // TODO: regroup the functions from this set into setup/teardown, runtime, and misc functions. virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_); virtual void set_driver_host_address_params(const tt_driver_host_address_params& host_address_params_); virtual void set_driver_eth_interface_params(const tt_driver_eth_interface_params& eth_interface_params_); + virtual void set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, uint64_t ordering = TLB_DATA::Posted); + virtual void setup_core_to_tlb_map( + const chip_id_t logical_device_id, std::function mapping_function); + virtual void start_device(const tt_device_params& device_params); + virtual void assert_risc_reset(); + virtual void deassert_risc_reset(); + virtual void close_device(); + virtual void write_to_sysmem( + const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id); + virtual void read_from_sysmem( + void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id); + virtual void wait_for_non_mmio_flush(); 
+ virtual void wait_for_non_mmio_flush(const chip_id_t chip_id); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); + void bar_write32(int logical_device_id, uint32_t addr, uint32_t data); + uint32_t bar_read32(int logical_device_id, uint32_t addr); + /** + * This API allows you to write directly to device memory that is addressable by a static TLB + */ + std::function get_fast_pcie_static_tlb_write_callable(int device_id); + // Misc. Functions to Query/Set Device State + virtual int arc_msg( + int logical_device_id, + uint32_t msg_code, + bool wait_for_done = true, + uint32_t arg0 = 0, + uint32_t arg1 = 0, + int timeout = 1, + uint32_t* return_3 = nullptr, + uint32_t* return_4 = nullptr); + virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); + virtual int get_number_of_chips_in_cluster(); + virtual std::unordered_set get_all_chips_in_cluster(); + virtual tt_ClusterDescriptor* get_cluster_description(); + static int detect_number_of_chips(); + static std::vector detect_available_device_ids(); + virtual std::set get_target_mmio_device_ids(); + virtual std::set get_target_remote_device_ids(); + virtual std::map get_clocks(); + virtual void* host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; + virtual std::uint64_t get_pcie_base_addr_from_device(const chip_id_t chip_id) const; + virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id); + virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel); + virtual std::uint32_t get_num_host_channels(std::uint32_t device_id); + virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel); + virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id); + virtual tt_version get_ethernet_fw_version() const; + // TODO: This should be accessible through public API, probably to be moved to tt_device. 
+ PCIDevice* get_pci_device(int device_id) const; + TTDevice* get_tt_device(chip_id_t device_id) const; + const tt_SocDescriptor& get_soc_descriptor(chip_id_t chip_id) const; + + // Existing API we want to remove. UMD is transitioning to use CoreCoord instead of tt_xy_pair. + // This set of functions is supposed to be removed once the transition for clients (tt-metal, tt-lens) is complete. + // TODO: remove this set of functions once the transition for clients is completed. + std::unordered_map get_virtual_soc_descriptors(); virtual void configure_tlb( chip_id_t logical_device_id, tt_xy_pair core, int32_t tlb_index, uint64_t address, uint64_t ordering = TLB_DATA::Posted); - virtual void set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, uint64_t ordering = TLB_DATA::Posted); - virtual void setup_core_to_tlb_map( - const chip_id_t logical_device_id, std::function mapping_function); virtual void configure_active_ethernet_cores_for_mmio_device( chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip); - virtual void start_device(const tt_device_params& device_params); - virtual void assert_risc_reset(); - virtual void deassert_risc_reset(); virtual void deassert_risc_reset_at_core( tt_cxy_pair core, const TensixSoftResetOptions& soft_resets = TENSIX_DEASSERT_SOFT_RESET); virtual void assert_risc_reset_at_core(tt_cxy_pair core); - virtual void close_device(); - - // Runtime Functions virtual void write_to_device( const void* mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); + // TODO: Add CoreCoord API for this function. 
void broadcast_write_to_cluster( const void* mem_ptr, uint32_t size_in_bytes, @@ -595,34 +698,16 @@ class Cluster : public tt_device { std::set& rows_to_exclude, std::set& columns_to_exclude, const std::string& fallback_tlb); - virtual void read_from_device( void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); - virtual void write_to_sysmem( - const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id); - virtual void read_from_sysmem( - void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id); - virtual void wait_for_non_mmio_flush(); - virtual void wait_for_non_mmio_flush(const chip_id_t chip_id); void l1_membar( const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - void dram_membar( - const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); void dram_membar( const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - // These functions are used by Debuda, so make them public - void bar_write32(int logical_device_id, uint32_t addr, uint32_t data); - uint32_t bar_read32(int logical_device_id, uint32_t addr); - /** * If the tlbs are initialized, returns a tuple with the TLB base address and its size */ std::optional> get_tlb_data_from_target(const tt_cxy_pair& target); - /** - * This API allows you to write directly to device memory that is addressable by a static TLB - */ - std::function get_fast_pcie_static_tlb_write_callable(int device_id); - /** * Provide fast write access to a statically-mapped TLB. * It is the caller's responsibility to ensure that @@ -634,30 +719,8 @@ class Cluster : public tt_device { * @param target The target chip and core to write to. */ tt::Writer get_static_tlb_writer(tt_cxy_pair target); - - // Misc. 
Functions to Query/Set Device State - virtual int arc_msg( - int logical_device_id, - uint32_t msg_code, - bool wait_for_done = true, - uint32_t arg0 = 0, - uint32_t arg1 = 0, - int timeout = 1, - uint32_t* return_3 = nullptr, - uint32_t* return_4 = nullptr); virtual bool using_harvested_soc_descriptors(); - virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c); - virtual int get_number_of_chips_in_cluster(); - virtual std::unordered_set get_all_chips_in_cluster(); - virtual tt_ClusterDescriptor* get_cluster_description(); - static int detect_number_of_chips(); - static std::vector detect_available_device_ids(); - virtual std::set get_target_mmio_device_ids(); - virtual std::set get_target_remote_device_ids(); - virtual std::map get_clocks(); - virtual void* host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; - virtual std::uint64_t get_pcie_base_addr_from_device(const chip_id_t chip_id) const; static std::vector extract_rows_to_remove( const tt::ARCH& arch, const int worker_grid_rows, const int harvested_rows); static void remove_worker_row_from_descriptor( @@ -666,18 +729,45 @@ class Cluster : public tt_device { static std::unordered_map create_harvested_coord_translation( const tt::ARCH arch, bool identity_map); std::unordered_map get_harvested_coord_translation_map(chip_id_t logical_device_id); - virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id); - virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel); - virtual std::uint32_t get_num_host_channels(std::uint32_t device_id); - virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel); - virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id); - virtual tt_version get_ethernet_fw_version() const; - - TTDevice* get_tt_device(chip_id_t device_id) const; - 
const tt_SocDescriptor& get_soc_descriptor(chip_id_t chip_id) const; - // TODO: This function should be removed. - std::unordered_map get_virtual_soc_descriptors(); + // New API. UMD is transitioning to use CoreCoord instead of tt_xy_pair. + // This is new set of functions that should be used once the transition for clients (tt-metal, tt-lens) is complete. + virtual void configure_tlb( + chip_id_t logical_device_id, + tt::umd::CoreCoord core, + int32_t tlb_index, + uint64_t address, + uint64_t ordering = TLB_DATA::Posted); + virtual void deassert_risc_reset_at_core( + const chip_id_t chip, + const tt::umd::CoreCoord core, + const TensixSoftResetOptions& soft_resets = TENSIX_DEASSERT_SOFT_RESET); + virtual void assert_risc_reset_at_core(const chip_id_t chip, const tt::umd::CoreCoord core); + virtual void write_to_device( + const void* mem_ptr, + uint32_t size_in_bytes, + chip_id_t chip, + tt::umd::CoreCoord core, + uint64_t addr, + const std::string& tlb_to_use); + virtual void read_from_device( + void* mem_ptr, + chip_id_t chip, + tt::umd::CoreCoord core, + uint64_t addr, + uint32_t size, + const std::string& fallback_tlb); + std::optional> get_tlb_data_from_target( + const chip_id_t chip, const tt::umd::CoreCoord core); + tt::Writer get_static_tlb_writer(const chip_id_t chip, const tt::umd::CoreCoord target); + virtual tt::umd::CoreCoord to( + const chip_id_t chip, const tt::umd::CoreCoord core_coord, const CoordSystem coord_system); + virtual void configure_active_ethernet_cores_for_mmio_device( + chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip); + void l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores); // Destructor virtual ~Cluster(); diff --git a/device/api/umd/device/tt_core_coordinates.h b/device/api/umd/device/tt_core_coordinates.h index 3e0b5803..f6df10f6 100644 --- 
a/device/api/umd/device/tt_core_coordinates.h +++ b/device/api/umd/device/tt_core_coordinates.h @@ -83,3 +83,8 @@ struct CoreCoord : public tt_xy_pair { }; } // namespace tt::umd + +template <> +struct std::hash { + std::size_t operator()(const tt::umd::CoreCoord& core_range) const; +}; diff --git a/device/cluster.cpp b/device/cluster.cpp index b79c1158..07700804 100644 --- a/device/cluster.cpp +++ b/device/cluster.cpp @@ -36,6 +36,7 @@ #include #include +#include "api/umd/device/tt_core_coordinates.h" #include "logger.hpp" #include "umd/device/architecture_implementation.h" #include "umd/device/chip/local_chip.h" @@ -498,8 +499,8 @@ std::unique_ptr Cluster::construct_chip_from_cluster(chip_id_t chip_id, tt // true. // TODO: This should be changed, harvesting should be done in tt_socdescriptor's constructor and not as part of // cluster class. - // uint32_t harvesting_info = cluster_desc->get_harvesting_info().at(chip_id); - tt_SocDescriptor soc_desc = tt_SocDescriptor(soc_desc_path /*, harvesting_info*/); + uint32_t tensix_harvesting_mask = cluster_desc->get_harvesting_info().at(chip_id); + tt_SocDescriptor soc_desc = tt_SocDescriptor(soc_desc_path, tensix_harvesting_mask/*, harvesting_info*/); return construct_chip_from_cluster(chip_id, cluster_desc, soc_desc); } @@ -678,6 +679,17 @@ void Cluster::configure_active_ethernet_cores_for_mmio_device( non_mmio_transfer_cores_customized = true; } +void Cluster::configure_active_ethernet_cores_for_mmio_device( + chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip) { + std::unordered_set active_eth_cores_xy; + for (const auto& core : active_eth_cores_per_chip) { + CoreCoord virtual_coord = to(mmio_chip, core, CoordSystem::VIRTUAL); + active_eth_cores_xy.insert(tt_xy_pair(virtual_coord.x, virtual_coord.y)); + } + + configure_active_ethernet_cores_for_mmio_device(mmio_chip, active_eth_cores_xy); +} + void Cluster::populate_cores() { std::uint32_t count = 0; for (const auto& [chip_id, chip] : chips_) { 
@@ -1000,6 +1012,18 @@ void Cluster::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftRese } } +// CoreCoord overload: translates the core to the VIRTUAL coordinate system and forwards to the tt_cxy_pair overload. +void Cluster::deassert_risc_reset_at_core( + const chip_id_t chip, const CoreCoord core, const TensixSoftResetOptions& soft_resets) { + tt_cxy_pair virtual_core; + const CoreCoord virtual_coord = to(chip, core, CoordSystem::VIRTUAL); + // The tt_cxy_pair overload has no chip parameter of its own, so the chip must be carried in the pair. + virtual_core.chip = chip; + virtual_core.x = virtual_coord.x; + virtual_core.y = virtual_coord.y; + deassert_risc_reset_at_core(virtual_core, soft_resets); +} + void Cluster::assert_risc_reset_at_core(tt_cxy_pair core) { // Get Target Device to query soc descriptor and determine location in cluster std::uint32_t target_device = core.chip; @@ -1023,6 +1047,17 @@ void Cluster::assert_risc_reset_at_core(tt_cxy_pair core) { } } +// CoreCoord overload: translates the core to the VIRTUAL coordinate system and forwards to the tt_cxy_pair overload. +void Cluster::assert_risc_reset_at_core(const chip_id_t chip, const CoreCoord core) { + tt_cxy_pair virtual_core; + const CoreCoord virtual_coord = to(chip, core, CoordSystem::VIRTUAL); + // The tt_cxy_pair overload picks its target device from core.chip, so it must be set here. + virtual_core.chip = chip; + virtual_core.x = virtual_coord.x; + virtual_core.y = virtual_coord.y; + assert_risc_reset_at_core(virtual_core); +} + // Free memory during teardown, and remove (clean/unlock) from any leftover mutexes. 
void Cluster::cleanup_shared_host_state() { for (auto& mutex : hardware_resource_mutex_map) { @@ -1097,6 +1126,17 @@ tt::Writer Cluster::get_static_tlb_writer(tt_cxy_pair target) { return tt::Writer(base + tlb_offset, tlb_size); } +// CoreCoord overload: translates the target to the VIRTUAL coordinate system and forwards to the tt_cxy_pair overload. +tt::Writer Cluster::get_static_tlb_writer(const chip_id_t chip, const CoreCoord target) { + tt_cxy_pair virtual_core; + const CoreCoord virtual_coord = to(chip, target, CoordSystem::VIRTUAL); + // The tt_cxy_pair overload has no chip parameter of its own, so the chip must be carried in the pair. + virtual_core.chip = chip; + virtual_core.x = virtual_coord.x; + virtual_core.y = virtual_coord.y; + return get_static_tlb_writer(virtual_core); +} + void Cluster::write_device_memory( const void* mem_ptr, uint32_t size_in_bytes, @@ -1367,6 +1407,15 @@ std::optional> Cluster::get_tlb_data_from_target( return tlb_data; } +std::optional> Cluster::get_tlb_data_from_target(const chip_id_t chip, CoreCoord core) { + tt_cxy_pair virtual_core; + const CoreCoord virtual_coord = to(chip, core, CoordSystem::VIRTUAL); + virtual_core.chip = chip; + virtual_core.x = virtual_coord.x; + virtual_core.y = virtual_coord.y; + return get_tlb_data_from_target(virtual_core); +} + void Cluster::configure_tlb( chip_id_t logical_device_id, tt_xy_pair core, int32_t tlb_index, uint64_t address, uint64_t ordering) { log_assert( @@ -1381,6 +1430,15 @@ void Cluster::configure_tlb( tlb_config_map[logical_device_id].insert({tlb_index, (address / tlb_size) * tlb_size}); } +void Cluster::configure_tlb( + chip_id_t logical_device_id, tt::umd::CoreCoord core, int32_t tlb_index, uint64_t address, uint64_t ordering) { + tt_xy_pair virtual_core; + const CoreCoord virtual_coord = to(logical_device_id, core, CoordSystem::VIRTUAL); + virtual_core.x = virtual_coord.x; + virtual_core.y = virtual_coord.y; + configure_tlb(logical_device_id, virtual_core, tlb_index, address, ordering); +} + void Cluster::set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, uint64_t ordering) { log_assert( ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || ordering == TLB_DATA::Relaxed, @@ -1715,6 +1773,10 @@ 
uint32_t Cluster::get_harvested_noc_rows_for_chip(int logical_device_id) { return get_harvested_noc_rows(get_harvested_rows(logical_device_id)); } +CoreCoord Cluster::to(const chip_id_t chip, const CoreCoord core_coord, const CoordSystem coord_system) { + return get_soc_descriptor(chip).to(core_coord, coord_system); +} + void Cluster::enable_local_ethernet_queue(const chip_id_t& device_id, int timeout) { uint32_t msg_success = 0x0; auto timeout_seconds = std::chrono::seconds(timeout); @@ -2967,6 +3026,16 @@ void Cluster::l1_membar( } } +void Cluster::l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { + std::unordered_set cores_xy; + for (const auto& core : cores) { + const CoreCoord virtual_core = to(chip, core, CoordSystem::VIRTUAL); + cores_xy.insert({virtual_core.x, virtual_core.y}); + } + l1_membar(chip, fallback_tlb, cores_xy); +} + void Cluster::dram_membar( const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { if (cluster_desc->is_chip_mmio_capable(chip)) { @@ -2985,6 +3054,16 @@ void Cluster::dram_membar( } } +void Cluster::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { + std::unordered_set cores_xy; + for (const auto& core : cores) { + const CoreCoord virtual_core = to(chip, core, CoordSystem::VIRTUAL); + cores_xy.insert({virtual_core.x, virtual_core.y}); + } + dram_membar(chip, fallback_tlb, cores_xy); +} + void Cluster::dram_membar( const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) { if (cluster_desc->is_chip_mmio_capable(chip)) { @@ -3022,6 +3101,21 @@ void Cluster::write_to_device( } } +void Cluster::write_to_device( + const void* mem_ptr, + uint32_t size_in_bytes, + chip_id_t chip, + CoreCoord core, + uint64_t addr, + const std::string& tlb_to_use) { + tt_cxy_pair virtual_core; + virtual_core.chip = chip; + CoreCoord virtual_coord = to(chip, core, 
CoordSystem::VIRTUAL); + virtual_core.x = virtual_coord.x; + virtual_core.y = virtual_coord.y; + write_to_device(mem_ptr, size_in_bytes, virtual_core, addr, tlb_to_use); +} + void Cluster::read_mmio_device_register( void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { TTDevice* tt_device = get_tt_device(core.chip); @@ -3081,6 +3175,16 @@ void Cluster::read_from_device( } } +void Cluster::read_from_device( + void* mem_ptr, chip_id_t chip, CoreCoord core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { + tt_cxy_pair virtual_core; + virtual_core.chip = chip; + CoreCoord virtual_coord = to(chip, core, CoordSystem::VIRTUAL); + virtual_core.x = virtual_coord.x; + virtual_core.y = virtual_coord.y; + read_from_device(mem_ptr, virtual_core, addr, size, fallback_tlb); +} + int Cluster::arc_msg( int logical_device_id, uint32_t msg_code, diff --git a/tests/api/test_cluster.cpp b/tests/api/test_cluster.cpp index 4fcadafe..87d35c26 100644 --- a/tests/api/test_cluster.cpp +++ b/tests/api/test_cluster.cpp @@ -43,9 +43,8 @@ inline std::unique_ptr get_cluster() { // TODO: Should not be wormhole specific. // TODO: Offer default setup for what you can. 
void setup_wormhole_remote(Cluster* umd_cluster) { - if (!umd_cluster->get_target_remote_device_ids().empty() && - umd_cluster->get_soc_descriptor(*umd_cluster->get_all_chips_in_cluster().begin()).arch == - tt::ARCH::WORMHOLE_B0) { + if (umd_cluster->get_soc_descriptor(*umd_cluster->get_all_chips_in_cluster().begin()).arch == + tt::ARCH::WORMHOLE_B0) { // Populate address map and NOC parameters that the driver needs for remote transactions umd_cluster->set_device_l1_address_params( @@ -111,37 +110,27 @@ TEST(ApiClusterTest, SimpleIOAllChips) { for (auto chip_id : umd_cluster->get_all_chips_in_cluster()) { const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(chip_id); - // TODO: figure out if core locations should contain chip_id - tt_xy_pair any_core = soc_desc.workers[0]; - tt_cxy_pair any_core_global(chip_id, any_core); - - if (cluster_desc->is_chip_remote(chip_id) && soc_desc.arch != tt::ARCH::WORMHOLE_B0) { - std::cout << "Skipping remote chip " << chip_id << " because it is not a wormhole_b0 chip." << std::endl; - continue; - } + CoreCoord any_core = soc_desc.get_cores(CoreType::TENSIX)[0]; std::cout << "Writing to chip " << chip_id << " core " << any_core.str() << std::endl; - umd_cluster->write_to_device(data.data(), data_size, any_core_global, 0, "LARGE_WRITE_TLB"); + umd_cluster->write_to_device(data.data(), data_size, chip_id, any_core, 0, "LARGE_WRITE_TLB"); + + umd_cluster->wait_for_non_mmio_flush(chip_id); } // Now read back the data. for (auto chip_id : umd_cluster->get_all_chips_in_cluster()) { const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(chip_id); - // TODO: figure out if core locations should contain chip_id - tt_xy_pair any_core = soc_desc.workers[0]; - tt_cxy_pair any_core_global(chip_id, any_core); - - if (cluster_desc->is_chip_remote(chip_id) && soc_desc.arch != tt::ARCH::WORMHOLE_B0) { - std::cout << "Skipping remote chip " << chip_id << " because it is not a wormhole_b0 chip." 
<< std::endl; - continue; - } + CoreCoord any_core = soc_desc.get_cores(CoreType::TENSIX)[0]; std::cout << "Reading from chip " << chip_id << " core " << any_core.str() << std::endl; std::vector readback_data(data_size, 0); - umd_cluster->read_from_device(readback_data.data(), any_core_global, 0, data_size, "LARGE_READ_TLB"); + umd_cluster->read_from_device(readback_data.data(), chip_id, any_core, 0, data_size, "LARGE_READ_TLB"); + + // umd_cluster->wait_for_non_mmio_flush(chip_id); ASSERT_EQ(data, readback_data); } @@ -163,11 +152,10 @@ TEST(ApiClusterTest, RemoteFlush) { setup_wormhole_remote(umd_cluster.get()); for (auto chip_id : umd_cluster->get_target_remote_device_ids()) { + // const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(chip_id); const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(chip_id); - // TODO: figure out if core locations should contain chip_id - tt_xy_pair any_core = soc_desc.workers[0]; - tt_cxy_pair any_core_global(chip_id, any_core); + const CoreCoord any_core = soc_desc.get_cores(CoreType::TENSIX)[0]; if (!cluster_desc->is_chip_remote(chip_id)) { std::cout << "Chip " << chip_id << " skipped because it is not a remote chip." 
<< std::endl; @@ -180,31 +168,19 @@ TEST(ApiClusterTest, RemoteFlush) { } std::cout << "Writing to chip " << chip_id << " core " << any_core.str() << std::endl; - umd_cluster->write_to_device(data.data(), data_size, any_core_global, 0, "LARGE_WRITE_TLB"); + umd_cluster->write_to_device(data.data(), data_size, chip_id, any_core, 0, "LARGE_WRITE_TLB"); std::cout << "Waiting for remote chip flush " << chip_id << std::endl; umd_cluster->wait_for_non_mmio_flush(chip_id); - std::cout << "Waiting again for flush " << chip_id << ", should be no-op" << std::endl; - umd_cluster->wait_for_non_mmio_flush(chip_id); - } - - chip_id_t any_remote_chip = *umd_cluster->get_target_remote_device_ids().begin(); - const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(any_remote_chip); - tt_xy_pair any_core = soc_desc.workers[0]; - tt_cxy_pair any_core_global(any_remote_chip, any_core); - if (soc_desc.arch != tt::ARCH::WORMHOLE_B0) { - std::cout << "Skipping whole cluster wait because it is not a wormhole_b0 chip." << std::endl; - return; - } - std::cout << "Writing to chip " << any_remote_chip << " core " << any_core.str() << std::endl; - umd_cluster->write_to_device(data.data(), data_size, any_core_global, 0, "LARGE_WRITE_TLB"); + std::cout << "Reading from chip " << chip_id << " core " << any_core.str() << std::endl; + std::vector readback_data(data_size, 0); + umd_cluster->read_from_device(readback_data.data(), chip_id, any_core, 0, data_size, "LARGE_READ_TLB"); - std::cout << "Testing whole cluster wait for remote chip flush." << std::endl; - umd_cluster->wait_for_non_mmio_flush(); + // umd_cluster->wait_for_non_mmio_flush(chip_id); - std::cout << "Testing whole cluster wait for remote chip flush again, should be no-op." 
<< std::endl; - umd_cluster->wait_for_non_mmio_flush(); + ASSERT_EQ(data, readback_data); + } } TEST(ApiClusterTest, SimpleIOSpecificChips) { @@ -229,40 +205,88 @@ TEST(ApiClusterTest, SimpleIOSpecificChips) { setup_wormhole_remote(umd_cluster.get()); for (auto chip_id : umd_cluster->get_all_chips_in_cluster()) { + // const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(chip_id); const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(chip_id); - // TODO: figure out if core locations should contain chip_id - tt_xy_pair any_core = soc_desc.workers[0]; - tt_cxy_pair any_core_global(chip_id, any_core); - - if (cluster_desc->is_chip_remote(chip_id) && soc_desc.arch != tt::ARCH::WORMHOLE_B0) { - std::cout << "Skipping remote chip " << chip_id << " because it is not a wormhole_b0 chip." << std::endl; - continue; - } + const CoreCoord any_core = soc_desc.get_cores(CoreType::TENSIX)[0]; std::cout << "Writing to chip " << chip_id << " core " << any_core.str() << std::endl; - umd_cluster->write_to_device(data.data(), data_size, any_core_global, 0, "LARGE_WRITE_TLB"); + umd_cluster->write_to_device(data.data(), data_size, chip_id, any_core, 0, "LARGE_WRITE_TLB"); + + umd_cluster->wait_for_non_mmio_flush(chip_id); } // Now read back the data. for (auto chip_id : umd_cluster->get_all_chips_in_cluster()) { + // const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(chip_id); const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(chip_id); - // TODO: figure out if core locations should contain chip_id - tt_xy_pair any_core = soc_desc.workers[0]; - tt_cxy_pair any_core_global(chip_id, any_core); - - if (cluster_desc->is_chip_remote(chip_id) && soc_desc.arch != tt::ARCH::WORMHOLE_B0) { - std::cout << "Skipping remote chip " << chip_id << " because it is not a wormhole_b0 chip." 
<< std::endl; - continue; - } + const CoreCoord any_core = soc_desc.get_cores(CoreType::TENSIX)[0]; std::cout << "Reading from chip " << chip_id << " core " << any_core.str() << std::endl; std::vector readback_data(data_size, 0); - umd_cluster->read_from_device(readback_data.data(), any_core_global, 0, data_size, "LARGE_READ_TLB"); + umd_cluster->read_from_device(readback_data.data(), chip_id, any_core, 0, data_size, "LARGE_READ_TLB"); + + // umd_cluster->wait_for_non_mmio_flush(chip_id); ASSERT_EQ(data, readback_data); } } + +TEST(ClusterAPI, DynamicTLB_RW) { + // Don't use any static TLBs in this test. All writes go through a dynamic TLB that needs to be reconfigured for + // each transaction + + std::unique_ptr cluster = get_cluster(); + + setup_wormhole_remote(cluster.get()); + + tt_device_params default_params; + cluster->start_device(default_params); + cluster->deassert_risc_reset(); + + std::vector vector_to_write = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + std::vector zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + std::vector readback_vec = zeros; + + static const uint32_t num_loops = 100; + + std::unordered_set target_devices = cluster->get_all_chips_in_cluster(); + for (const chip_id_t chip : target_devices) { + std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; + // Write to each core 100 times, using a different address on every pass + const tt_SocDescriptor& soc_desc = cluster->get_soc_descriptor(chip); + std::vector tensix_cores = soc_desc.get_cores(CoreType::TENSIX); + for (uint32_t loop = 0; loop < num_loops; loop++) { + for (auto& core : tensix_cores) { + cluster->write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + chip, + core, + address, + "SMALL_READ_WRITE_TLB"); + + // Barrier to ensure that all writes over ethernet were committed + cluster->wait_for_non_mmio_flush(); + cluster->read_from_device(readback_vec.data(), chip, core, address, 40, "SMALL_READ_WRITE_TLB"); + + ASSERT_EQ(vector_to_write, 
readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << " does not match what was written"; + + cluster->wait_for_non_mmio_flush(); + + cluster->write_to_device( + zeros.data(), zeros.size() * sizeof(std::uint32_t), chip, core, address, "SMALL_READ_WRITE_TLB"); + + cluster->wait_for_non_mmio_flush(); + + readback_vec = zeros; + } + address += 0x20; // Advance by 0x20 bytes so every pass targets a different address + } + } + cluster->close_device(); +} diff --git a/tests/api/test_core_coord_translation_wh.cpp b/tests/api/test_core_coord_translation_wh.cpp index 933499b4..a4aa6364 100644 --- a/tests/api/test_core_coord_translation_wh.cpp +++ b/tests/api/test_core_coord_translation_wh.cpp @@ -13,10 +13,8 @@ using namespace tt::umd; // Tests that all physical coordinates are same as all virtual coordinates // when there is no harvesting. TEST(CoordinateManager, CoordinateManagerWormholeNoHarvesting) { - const size_t harvesting_mask = 0; - std::shared_ptr coordinate_manager = - CoordinateManager::create_coordinate_manager(tt::ARCH::WORMHOLE_B0, 0, 0); + CoordinateManager::create_coordinate_manager(tt::ARCH::WORMHOLE_B0); // We expect full grid size since there is no harvesting. 
tt_xy_pair tensix_grid_size = tt::umd::wormhole::TENSIX_GRID_SIZE; diff --git a/tests/blackhole/CMakeLists.txt b/tests/blackhole/CMakeLists.txt index d153cdda..e47a5655 100644 --- a/tests/blackhole/CMakeLists.txt +++ b/tests/blackhole/CMakeLists.txt @@ -1,4 +1,4 @@ -set(UNIT_TESTS_BH_SRCS test_silicon_driver_bh.cpp) +set(UNIT_TESTS_BH_SRCS test_cluster_bh.cpp) add_executable(unit_tests_blackhole ${UNIT_TESTS_BH_SRCS}) target_link_libraries(unit_tests_blackhole PRIVATE test_common) diff --git a/tests/blackhole/test_silicon_driver_bh.cpp b/tests/blackhole/test_cluster_bh.cpp similarity index 100% rename from tests/blackhole/test_silicon_driver_bh.cpp rename to tests/blackhole/test_cluster_bh.cpp diff --git a/tests/grayskull/CMakeLists.txt b/tests/grayskull/CMakeLists.txt index 9bd6ff74..5231b27d 100644 --- a/tests/grayskull/CMakeLists.txt +++ b/tests/grayskull/CMakeLists.txt @@ -1,4 +1,4 @@ -set(UNIT_TESTS_GS_SRCS test_silicon_driver.cpp) +set(UNIT_TESTS_GS_SRCS test_cluster_gs.cpp) add_executable(unit_tests_grayskull ${UNIT_TESTS_GS_SRCS}) target_link_libraries(unit_tests_grayskull PRIVATE test_common) diff --git a/tests/grayskull/test_silicon_driver.cpp b/tests/grayskull/test_cluster_gs.cpp similarity index 100% rename from tests/grayskull/test_silicon_driver.cpp rename to tests/grayskull/test_cluster_gs.cpp diff --git a/tests/wormhole/CMakeLists.txt b/tests/wormhole/CMakeLists.txt index b6886ffa..5602a558 100644 --- a/tests/wormhole/CMakeLists.txt +++ b/tests/wormhole/CMakeLists.txt @@ -1,5 +1,5 @@ set(UNIT_TESTS_WH_SRCS - test_silicon_driver_wh.cpp + test_cluster_wh.cpp test_umd_remote_api_stability.cpp ) diff --git a/tests/wormhole/test_silicon_driver_wh.cpp b/tests/wormhole/test_cluster_wh.cpp similarity index 100% rename from tests/wormhole/test_silicon_driver_wh.cpp rename to tests/wormhole/test_cluster_wh.cpp