diff --git a/tests/ttnn/unit_tests/test_multi_device.py b/tests/ttnn/unit_tests/test_multi_device.py
index 501840cfe5c..11308b0e6a1 100644
--- a/tests/ttnn/unit_tests/test_multi_device.py
+++ b/tests/ttnn/unit_tests/test_multi_device.py
@@ -65,6 +65,32 @@ def test_multi_device_open_close_using_context_manager(silicon_arch_name, silico
     pass
 
 
+def test_multi_device_open_close_galaxy_mesh(silicon_arch_name, silicon_arch_wormhole_b0):
+    if ttnn.get_num_devices() < 32:
+        pytest.skip("Test is only valid on Galaxy")
+
+    """Manually open and close multi-device"""
+    device_grid, device_ids = ttnn.DeviceGrid(1, 4), ttnn.get_device_ids()
+    multi_device = ttnn.open_device_mesh(device_grid, device_ids)
+    assert multi_device.get_num_devices() == 4
+    ttnn.close_device_mesh(multi_device)
+
+    device_grid, device_ids = ttnn.DeviceGrid(8, 1), ttnn.get_device_ids()
+    multi_device = ttnn.open_device_mesh(device_grid, device_ids)
+    assert multi_device.get_num_devices() == 8
+    ttnn.close_device_mesh(multi_device)
+
+    device_grid, device_ids = ttnn.DeviceGrid(8, 4), ttnn.get_device_ids()
+    multi_device = ttnn.open_device_mesh(device_grid, device_ids)
+    assert multi_device.get_num_devices() == 32
+    ttnn.close_device_mesh(multi_device)
+
+    device_grid = ttnn.DeviceGrid(3, 2)
+    multi_device = ttnn.open_device_mesh(device_grid, device_ids)
+    assert multi_device.get_num_devices() == 6
+    ttnn.close_device_mesh(multi_device)
+
+
 #######
 # Simple Multi-Device Tensor tests
 #######
diff --git a/tt_metal/impl/device/device_pool.cpp b/tt_metal/impl/device/device_pool.cpp
index a5a2ab0d661..50ec3ee76ca 100644
--- a/tt_metal/impl/device/device_pool.cpp
+++ b/tt_metal/impl/device/device_pool.cpp
@@ -126,8 +126,7 @@ void DevicePool::initialize_device(Device* dev) const {
 }
 
 void DevicePool::activate_device(chip_id_t id) {
-    TT_ASSERT(id < tt::tt_metal::GetNumAvailableDevices(), "Tried to add device id larger than available devices");
-
+    TT_ASSERT(id < tt::Cluster::instance().number_of_devices(), "Tried to add device id larger than available devices");
     const std::lock_guard lock(this->lock);
     if (this->devices.size() < id + 1) {
         this->devices.resize(id + 1);
@@ -237,7 +236,7 @@ DevicePool::DevicePool(
     log_debug(tt::LogMetal, "DevicePool constructor");
     bool use_numa_node_based_thread_binding = parse_env("TT_METAL_NUMA_BASED_AFFINITY", false);
     std::vector<chip_id_t> all_device_ids;
-    for (int i = 0; i < tt::tt_metal::GetNumAvailableDevices(); i++) {
+    for (int i = 0; i < tt::Cluster::instance().number_of_devices(); i++) {
         all_device_ids.emplace_back((chip_id_t)i);
     }
     std::unordered_set free_cores = {};
diff --git a/tt_metal/impl/device/multi_device.cpp b/tt_metal/impl/device/multi_device.cpp
index e51601506bd..bb52a65e2f9 100644
--- a/tt_metal/impl/device/multi_device.cpp
+++ b/tt_metal/impl/device/multi_device.cpp
@@ -13,7 +13,6 @@ namespace ttnn {
 
 namespace multi_device {
 
-
 DeviceMesh::DeviceMesh(const DeviceGrid& device_grid, const DeviceIds &device_ids, size_t l1_small_size, size_t trace_region_size)
     : device_grid(device_grid)
 {
@@ -23,43 +22,46 @@ DeviceMesh::DeviceMesh(const DeviceGrid& device_grid, const DeviceIds &device_id
     TT_ASSERT(num_requested_devices <= num_available_devices, "Requested more devices than available");
     TT_ASSERT(num_requested_devices <= device_ids.size(), "User provided insufficient number of device_ids for DeviceMesh");
 
-
-    //TODO: for DevicePool feature delete CreateDevices and merge with this function
-    //TODO: should there be an explicit CloseDevices call somewhere?
     bool is_galaxy = tt::Cluster::instance().is_galaxy_cluster();
-    std::vector<chip_id_t> mmio_device_ids = {};
-    if (is_galaxy) {
-        mmio_device_ids.push_back(0);
-        if (num_requested_devices > 8) {
-            mmio_device_ids.push_back(1);
-        }
-        if (num_requested_devices > 16) {
-            mmio_device_ids.push_back(2);
-        }
-        if (num_requested_devices > 24) {
-            mmio_device_ids.push_back(3);
-        }
-    } else {
-        mmio_device_ids = device_ids;
-    }
-    managed_devices = tt::tt_metal::detail::CreateDevices(mmio_device_ids, 1, l1_small_size, trace_region_size);
     if (is_galaxy) {
+        // Temp solution until we add an algorithmic way to determine chip connectivity
+        // Map col to tunnel depth and row to tunnel count
+        int cluster_tunnel_depth = tt::Cluster::instance().get_mmio_device_max_tunnel_depth(0);
+        int cluster_tunnel_count = tt::Cluster::instance().get_mmio_device_tunnel_count(0);
+        int num_mmio_devices = tt::Cluster::instance().number_of_pci_devices();
+        TT_ASSERT(num_cols <= cluster_tunnel_depth and num_rows <= cluster_tunnel_count * num_mmio_devices, "Unsupported Galaxy mesh shape");
+
         DeviceIds galaxy_device_ids;
-        for (const auto &[dev_id, dev]: managed_devices) {
-            galaxy_device_ids.emplace_back(dev_id);
+        for (int mmio_device_id = 0; mmio_device_id < num_mmio_devices; mmio_device_id++) {
+            auto tunnels_from_mmio = tt::Cluster::instance().get_tunnels_from_mmio_device(mmio_device_id);
+            for (uint32_t t = 0; t < tunnels_from_mmio.size(); t++) {
+                if (galaxy_device_ids.size() == num_requested_devices) {
+                    break;
+                }
+                int col_idx = 0;
+                for (uint32_t ts = 1; ts < tunnels_from_mmio[t].size(); ts++) {
+                    galaxy_device_ids.push_back(tunnels_from_mmio[t][ts]);
+                    col_idx++;
+                    if (col_idx == num_cols) {
+                        break;
+                    }
+                }
+            }
         }
+        managed_devices = tt::tt_metal::detail::CreateDevices(galaxy_device_ids, 1, l1_small_size, trace_region_size);
         for (int i = 0; i < num_requested_devices; i++) {
             mesh_devices.emplace_back(device_ids[i], managed_devices.at(galaxy_device_ids[i]));
         }
     } else {
-        for (int i = 0; i < num_requested_devices; i++) {
+        managed_devices = tt::tt_metal::detail::CreateDevices(device_ids, 1, l1_small_size, trace_region_size);
+        for (int i = 0; i < num_requested_devices; i++) {
             mesh_devices.emplace_back(device_ids[i], managed_devices.at(device_ids[i]));
-        }
+        }
     }
-    /*
+
     for (const auto& [dev_id, dev]: mesh_devices) {
-        std::cout << "dev_id " << dev_id << " dev " << dev->id() << std::endl;
-    }*/
+        log_debug(tt::LogMetal, "TTNN Dev {}: Metal Dev {}", dev_id, dev->id());
+    }
 }
diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp
index 941e703c67b..7dec1f5e951 100644
--- a/tt_metal/llrt/tt_cluster.cpp
+++ b/tt_metal/llrt/tt_cluster.cpp
@@ -44,6 +44,8 @@ Cluster::Cluster() {
 
     this->initialize_ethernet_sockets();
 
+    this->set_tunnels_from_mmio_device();
+
     this->assert_risc_reset();
 }
 
@@ -308,6 +310,7 @@ Cluster::~Cluster() {
     this->device_to_mmio_device_.clear();
     this->device_to_host_mem_channel_.clear();
     this->device_eth_routing_info_.clear();
+    this->tunnels_from_mmio_device.clear();
 }
 
 tt_device &Cluster::get_driver(chip_id_t device_id) const {
@@ -582,82 +585,94 @@ std::unordered_map> Cluster::get_ethernet_core
     return connected_chips;
 }
 
 #define MAX_TUNNEL_DEPTH 4
-std::vector<std::vector<chip_id_t>> Cluster::get_tunnels_from_mmio_device(chip_id_t mmio_chip_id) const {
-    std::vector<std::vector<chip_id_t>> tunnels_from_mmio = {};
-    const auto &all_eth_connections = this->cluster_desc_->get_ethernet_connections();
-    TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(mmio_chip_id));
-
-    if (all_eth_connections.find(mmio_chip_id) == all_eth_connections.end()) {
-        return {};
-    }
+void Cluster::set_tunnels_from_mmio_device() {
+    for (const auto &[mmio_chip_id, physical_chip_id] : this->cluster_desc_->get_chips_with_mmio()) {
+        std::vector<std::vector<chip_id_t>> tunnels_from_mmio = {};
+        const auto &all_eth_connections = this->cluster_desc_->get_ethernet_connections();
+        TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(mmio_chip_id));
+
+        if (all_eth_connections.find(mmio_chip_id) == all_eth_connections.end()) {
+            this->tunnels_from_mmio_device.insert({mmio_chip_id, {}});
+            continue;
+        }
 
-    std::set<chip_id_t> device_ids = get_devices_controlled_by_mmio_device(mmio_chip_id);
-    device_ids.erase(mmio_chip_id);
+        std::set<chip_id_t> device_ids = get_devices_controlled_by_mmio_device(mmio_chip_id);
+        device_ids.erase(mmio_chip_id);
 
-    if (device_ids.size() == 0) {
-        return {};
-    }
+        if (device_ids.size() == 0) {
+            this->tunnels_from_mmio_device.insert({mmio_chip_id, {}});
+            continue;
+        }
 
-    for (const auto &[eth_chan, connected_chip_chan] : all_eth_connections.at(mmio_chip_id)) {
-        const auto &other_chip_id = std::get<0>(connected_chip_chan);
-        if (device_ids.find(other_chip_id) != device_ids.end()) {
-            //mmio chip is connected to a remote chip in its mmio group.
-            //erase from the pool so multiple ethenret connections to same remote device do not
-            //pollute the counts.
-            device_ids.erase(other_chip_id);
-            std::vector<chip_id_t> first_stop = {other_chip_id};
-            auto it = std::find(tunnels_from_mmio.begin(), tunnels_from_mmio.end(), first_stop);
-            TT_ASSERT(it == tunnels_from_mmio.end(),"Duplicate first tunnel stop found when finding FD2 Tunnel devices.");
-            tunnels_from_mmio.push_back(first_stop);
+        for (const auto &[eth_chan, connected_chip_chan] : all_eth_connections.at(mmio_chip_id)) {
+            const auto &other_chip_id = std::get<0>(connected_chip_chan);
+            if (device_ids.find(other_chip_id) != device_ids.end()) {
+                // mmio chip is connected to a remote chip in its mmio group.
+                // erase from the pool so multiple ethernet connections to same remote device do not
+                // pollute the counts.
+                device_ids.erase(other_chip_id);
+                std::vector<chip_id_t> first_stop = {other_chip_id};
+                auto it = std::find(tunnels_from_mmio.begin(), tunnels_from_mmio.end(), first_stop);
+                TT_ASSERT(
+                    it == tunnels_from_mmio.end(),
+                    "Duplicate first tunnel stop found when finding FD2 Tunnel devices.");
+                tunnels_from_mmio.push_back(first_stop);
+            }
         }
-    }
-    log_debug(
-        tt::LogMetal, "Found {} FD Tunnels originating from MMIO Device {}", tunnels_from_mmio.size(), mmio_chip_id);
+        log_debug(
+            tt::LogMetal,
+            "Found {} FD Tunnels originating from MMIO Device {}",
+            tunnels_from_mmio.size(),
+            mmio_chip_id);
 
-    device_ids = get_devices_controlled_by_mmio_device(mmio_chip_id);
-    device_ids.erase(mmio_chip_id);
+        device_ids = get_devices_controlled_by_mmio_device(mmio_chip_id);
+        device_ids.erase(mmio_chip_id);
 
-    for (auto &tunnel : tunnels_from_mmio) {
-        TT_ASSERT(tunnel.size() == 1,"Tunnel depth must be 1 when it has only 1 stop in it.");
-        device_ids.erase(tunnel[0]);
-    }
+        for (auto &tunnel : tunnels_from_mmio) {
+            TT_ASSERT(tunnel.size() == 1, "Tunnel depth must be 1 when it has only 1 stop in it.");
+            device_ids.erase(tunnel[0]);
+        }
 
-    bool tunneled_device_hit;
-    for (auto it = device_ids.begin(); it != device_ids.end();) {
-        tunneled_device_hit = false;
-        for (auto &dev_vec : tunnels_from_mmio) {
-            for (const auto &[eth_chan, connected_chip_chan] : all_eth_connections.at(dev_vec.back())) {
-                const auto &other_chip_id = std::get<0>(connected_chip_chan);
-                auto id_iter = device_ids.find(other_chip_id);
-                if (id_iter != device_ids.end()) {
-                    it = device_ids.erase(id_iter);
-                    dev_vec.push_back(other_chip_id);
-                    tunneled_device_hit = true;
-                    break;
+        bool tunneled_device_hit;
+        for (auto it = device_ids.begin(); it != device_ids.end();) {
+            tunneled_device_hit = false;
+            for (auto &dev_vec : tunnels_from_mmio) {
+                for (const auto &[eth_chan, connected_chip_chan] : all_eth_connections.at(dev_vec.back())) {
+                    const auto &other_chip_id = std::get<0>(connected_chip_chan);
+                    auto id_iter = device_ids.find(other_chip_id);
+                    if (id_iter != device_ids.end()) {
+                        it = device_ids.erase(id_iter);
+                        dev_vec.push_back(other_chip_id);
+                        tunneled_device_hit = true;
+                        break;
+                    }
                 }
             }
+            TT_ASSERT(tunneled_device_hit || (it == device_ids.end()), "Loop Exit Error.");
         }
-        TT_ASSERT(tunneled_device_hit || (it == device_ids.end()),"Loop Exit Error.");
-    }
 
-    TT_ASSERT(tunnels_from_mmio.size() != 0,"Must have at least 1 tunnel from MMIO Device.");
-    uint32_t tunnel_depth = tunnels_from_mmio[0].size();
-    log_debug(tt::LogMetal, "Each FD Tunnel is {} deep.", tunnel_depth);
+        TT_ASSERT(tunnels_from_mmio.size() != 0, "Must have at least 1 tunnel from MMIO Device.");
+        uint32_t tunnel_depth = tunnels_from_mmio[0].size();
+        log_debug(tt::LogMetal, "Each FD Tunnel is {} deep.", tunnel_depth);
 
-    for (auto &dev_vec : tunnels_from_mmio) {
-        TT_ASSERT(dev_vec.size() == tunnel_depth,"All tunnels from mmio device must have same depth. Found {}. Expected {}.", dev_vec.size(), tunnel_depth);
-        //Now that all remotete chips have been added to respective tunnels,
-        //add mmio device at start of each of the tunnels.
-        if (dev_vec.size() > MAX_TUNNEL_DEPTH) {
-            dev_vec.resize(dev_vec.size() - (dev_vec.size() - MAX_TUNNEL_DEPTH));
+        for (auto &dev_vec : tunnels_from_mmio) {
+            TT_ASSERT(
+                dev_vec.size() == tunnel_depth,
+                "All tunnels from mmio device must have same depth. Found {}. Expected {}.",
+                dev_vec.size(),
+                tunnel_depth);
+            // Now that all remote chips have been added to respective tunnels,
+            // add mmio device at start of each of the tunnels.
+            if (dev_vec.size() > MAX_TUNNEL_DEPTH) {
+                dev_vec.resize(dev_vec.size() - (dev_vec.size() - MAX_TUNNEL_DEPTH));
+            }
+            dev_vec.insert(dev_vec.begin(), mmio_chip_id);
         }
-        dev_vec.insert(dev_vec.begin(), mmio_chip_id);
+        this->tunnels_from_mmio_device.insert({mmio_chip_id, tunnels_from_mmio});
     }
-    return tunnels_from_mmio;
 }
-
 // Ethernet cluster api
 void Cluster::initialize_ethernet_sockets() {
     for (const auto &chip_id : this->cluster_desc_->get_all_chips()) {
diff --git a/tt_metal/llrt/tt_cluster.hpp b/tt_metal/llrt/tt_cluster.hpp
index 804d357be71..bd9017461fb 100644
--- a/tt_metal/llrt/tt_cluster.hpp
+++ b/tt_metal/llrt/tt_cluster.hpp
@@ -49,7 +49,21 @@ class Cluster {
 
     static const Cluster &instance();
 
+    // For TG Galaxy systems, mmio chips are gateway chips that are only used for dispatch, so user_devices are meant
+    // for user-facing host APIs
+    size_t number_of_user_devices() const {
+        if (this->is_tg_cluster_) {
+            const auto &chips = this->cluster_desc_->get_all_chips();
+            return std::count_if(chips.begin(), chips.end(), [&](const auto &id) {
+                return this->cluster_desc_->get_board_type(id) == BoardType::GALAXY;
+            });
+        } else {
+            return this->cluster_desc_->get_number_of_chips();
+        }
+    }
+
     size_t number_of_devices() const { return this->cluster_desc_->get_number_of_chips(); }
+    size_t number_of_pci_devices() const { return this->cluster_desc_->get_chips_with_mmio().size(); }
 
     ARCH arch() const { return this->arch_; }
 
@@ -208,8 +222,10 @@ class Cluster {
     }
 
     // Returns vector of unique tunnels originating from mmio device.
-    // Each vecor entry is another vector of remote devices on that tunnel.
-    std::vector<std::vector<chip_id_t>> get_tunnels_from_mmio_device(chip_id_t mmio_chip_id) const;
+    // Each vector entry is another vector of remote devices on that tunnel.
+    std::vector<std::vector<chip_id_t>> get_tunnels_from_mmio_device(chip_id_t mmio_chip_id) const {
+        return this->tunnels_from_mmio_device.at(mmio_chip_id);
+    }
 
     bool is_galaxy_cluster() const;
 
@@ -241,6 +257,9 @@ class Cluster {
         chip_id_t chip_id) const;
     void initialize_ethernet_sockets();
 
+    // Set tunnels from mmio
+    void set_tunnels_from_mmio_device();
+
     ARCH arch_;
     TargetDevice target_type_;
 
@@ -266,6 +285,9 @@ class Cluster {
     // If any device has to board type of GALAXY, we are on a TG cluster.
     bool is_tg_cluster_;
 
+    // Tunnels set up in the cluster
+    std::map<chip_id_t, std::vector<std::vector<chip_id_t>>> tunnels_from_mmio_device = {};
+
     // Currently, each device is mapped to its own channel in host memory to enable fast dispatch
     // Channels are unique within a group of devices all controlled by a particular MMIO device
     // For example:
diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp
index d8c50a347ab..7bf6b4f058f 100644
--- a/tt_metal/tt_metal.cpp
+++ b/tt_metal/tt_metal.cpp
@@ -722,7 +722,7 @@ void EnableAllocs(Device *device) { tt::tt_metal::allocator::enable_allocs(*(dev
 
 size_t GetNumAvailableDevices() {
 #ifdef TT_METAL_VERSIM_DISABLED
-    return tt::Cluster::instance().number_of_devices();
+    return tt::Cluster::instance().number_of_user_devices();
 #else
     return 1;
 #endif
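
Reviewer note: below is a minimal standalone sketch of the row/column convention the new DeviceMesh code relies on ("map col to tunnel depth and row to tunnel count"). The helper name `pick_mesh_devices`, the chip ids, and the tunnel contents are illustrative only and not part of this PR; the one assumption taken from the diff is that each stored tunnel begins with its gateway (MMIO) chip, as set_tunnels_from_mmio_device arranges.

```cpp
// Illustrative only: not part of this PR. Shows how a (rows x cols) mesh request
// is expected to map onto tunnels, where each tunnel contributes one row and the
// tunnel depth bounds the number of columns. Tunnel vectors are assumed to start
// with the gateway (MMIO) chip.
#include <cstdint>
#include <iostream>
#include <vector>

using chip_id_t = int;  // stand-in for the real typedef

// Hypothetical helper: pick rows*cols user chips from the tunnels, row-major.
std::vector<chip_id_t> pick_mesh_devices(
    const std::vector<std::vector<chip_id_t>> &tunnels, uint32_t rows, uint32_t cols) {
    std::vector<chip_id_t> picked;
    for (uint32_t r = 0; r < rows && r < tunnels.size(); r++) {
        // Index 0 of each tunnel is the gateway chip, so remote chips start at index 1.
        for (uint32_t c = 1; c <= cols && c < tunnels[r].size(); c++) {
            picked.push_back(tunnels[r][c]);
        }
    }
    return picked;
}

int main() {
    // Two made-up tunnels hanging off gateway chips 0 and 1, depth 4 each.
    std::vector<std::vector<chip_id_t>> tunnels = {{0, 4, 5, 6, 7}, {1, 8, 9, 10, 11}};
    for (chip_id_t id : pick_mesh_devices(tunnels, /*rows=*/2, /*cols=*/4)) {
        std::cout << id << " ";  // prints 4 5 6 7 8 9 10 11
    }
    std::cout << "\n";
}
```

This mirrors why a 1x4, 8x1, 8x4, or 3x2 DeviceGrid in the new test can all be satisfied from the same tunnel map: rows consume tunnels, columns consume depth along each tunnel.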
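For context on the GetNumAvailableDevices() change, here is a hedged sketch of the user-device vs. total-device split on a TG machine. The board counts below (a handful of gateway boards plus 32 Galaxy chips) are assumptions for the example, not values read from a real cluster descriptor; only the counting rule (user devices = chips whose board type is GALAXY) comes from the diff.

```cpp
// Illustrative only: mirrors the intent of number_of_user_devices() vs number_of_devices().
#include <algorithm>
#include <iostream>
#include <vector>

enum class BoardType { GALAXY, GATEWAY };  // local stand-in, not the real enum

int main() {
    // Hypothetical TG-style cluster: 4 gateway boards used only for dispatch, 32 Galaxy chips.
    std::vector<BoardType> boards(4, BoardType::GATEWAY);
    boards.insert(boards.end(), 32, BoardType::GALAXY);

    auto total = boards.size();                                                  // what number_of_devices() reports
    auto user_visible = std::count(boards.begin(), boards.end(), BoardType::GALAXY);  // what number_of_user_devices() reports

    // With this PR, GetNumAvailableDevices() returns the user-visible count on TG,
    // while DevicePool keeps iterating over the full cluster via number_of_devices().
    std::cout << "total devices: " << total << "\n";
    std::cout << "user devices: " << user_visible << "\n";
}
```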