Skip to content

Commit

Permalink
#9451: add galaxy device mesh
Browse files Browse the repository at this point in the history
  • Loading branch information
aliuTT committed Jun 21, 2024
1 parent 4657af0 commit 36848ad
Show file tree
Hide file tree
Showing 6 changed files with 156 additions and 92 deletions.
26 changes: 26 additions & 0 deletions tests/ttnn/unit_tests/test_multi_device.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,32 @@ def test_multi_device_open_close_using_context_manager(silicon_arch_name, silico
pass


def test_multi_device_open_close_galaxy_mesh(silicon_arch_name, silicon_arch_wormhole_b0):
if ttnn.get_num_devices() < 32:
pytest.skip("Test is only valid on Galaxy")

"""Manually open and close multi-device"""
device_grid, device_ids = ttnn.DeviceGrid(1, 4), ttnn.get_device_ids()
multi_device = ttnn.open_device_mesh(device_grid, device_ids)
assert multi_device.get_num_devices() == 4
ttnn.close_device_mesh(multi_device)

device_grid, device_ids = ttnn.DeviceGrid(8, 1), ttnn.get_device_ids()
multi_device = ttnn.open_device_mesh(device_grid, device_ids)
assert multi_device.get_num_devices() == 8
ttnn.close_device_mesh(multi_device)

device_grid, device_ids = ttnn.DeviceGrid(8, 4), ttnn.get_device_ids()
multi_device = ttnn.open_device_mesh(device_grid, device_ids)
assert multi_device.get_num_devices() == 32
ttnn.close_device_mesh(multi_device)

device_grid = ttnn.DeviceGrid(3, 2)
multi_device = ttnn.open_device_mesh(device_grid, device_ids)
assert multi_device.get_num_devices() == 6
ttnn.close_device_mesh(multi_device)


#######
# Simple Multi-Device Tensor tests
#######
Expand Down
5 changes: 2 additions & 3 deletions tt_metal/impl/device/device_pool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,7 @@ void DevicePool::initialize_device(Device* dev) const {
}

void DevicePool::activate_device(chip_id_t id) {
TT_ASSERT(id < tt::tt_metal::GetNumAvailableDevices(), "Tried to add device id larger than available devices");

TT_ASSERT(id < tt::Cluster::instance().number_of_devices(), "Tried to add device id larger than available devices");
const std::lock_guard<std::mutex> lock(this->lock);
if (this->devices.size() < id + 1) {
this->devices.resize(id + 1);
Expand Down Expand Up @@ -237,7 +236,7 @@ DevicePool::DevicePool(
log_debug(tt::LogMetal, "DevicePool constructor");
bool use_numa_node_based_thread_binding = parse_env("TT_METAL_NUMA_BASED_AFFINITY", false);
std::vector<chip_id_t> all_device_ids;
for (int i = 0; i < tt::tt_metal::GetNumAvailableDevices(); i++) {
for (int i = 0; i < tt::Cluster::instance().number_of_devices(); i++) {
all_device_ids.emplace_back((chip_id_t)i);
}
std::unordered_set<uint32_t> free_cores = {};
Expand Down
56 changes: 29 additions & 27 deletions tt_metal/impl/device/multi_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ namespace ttnn {

namespace multi_device {


DeviceMesh::DeviceMesh(const DeviceGrid& device_grid, const DeviceIds &device_ids, size_t l1_small_size, size_t trace_region_size)
: device_grid(device_grid)
{
Expand All @@ -23,43 +22,46 @@ DeviceMesh::DeviceMesh(const DeviceGrid& device_grid, const DeviceIds &device_id
TT_ASSERT(num_requested_devices <= num_available_devices, "Requested more devices than available");
TT_ASSERT(num_requested_devices <= device_ids.size(), "User provided insufficient number of device_ids for DeviceMesh");


//TODO: for DevicePool feature delete CreateDevices and merge with this function
//TODO: should there be an explicit CloseDevices call somewhere?
bool is_galaxy = tt::Cluster::instance().is_galaxy_cluster();
std::vector<chip_id_t> mmio_device_ids = {};
if (is_galaxy) {
mmio_device_ids.push_back(0);
if (num_requested_devices > 8) {
mmio_device_ids.push_back(1);
}
if (num_requested_devices > 16) {
mmio_device_ids.push_back(2);
}
if (num_requested_devices > 24) {
mmio_device_ids.push_back(3);
}
} else {
mmio_device_ids = device_ids;
}
managed_devices = tt::tt_metal::detail::CreateDevices(mmio_device_ids, 1, l1_small_size, trace_region_size);
if (is_galaxy) {
// Temp solution until we add algorithmic way to determine chip connectivity
// Map col to tunnel depth and row to tunnel count
int cluster_tunnel_depth = tt::Cluster::instance().get_mmio_device_max_tunnel_depth(0);
int cluster_tunnel_count = tt::Cluster::instance().get_mmio_device_tunnel_count(0);
int num_mmio_devices = tt::Cluster::instance().number_of_pci_devices();
TT_ASSERT(num_cols <= cluster_tunnel_depth and num_rows <= cluster_tunnel_count * num_mmio_devices, "Unsupported Galaxy mesh shape");

DeviceIds galaxy_device_ids;
for (const auto &[dev_id, dev]: managed_devices) {
galaxy_device_ids.emplace_back(dev_id);
for (int mmio_device_id = 0; mmio_device_id < num_mmio_devices; mmio_device_id++) {
auto tunnels_from_mmio = tt::Cluster::instance().get_tunnels_from_mmio_device(mmio_device_id);
for (uint32_t t = 0; t < tunnels_from_mmio.size(); t++) {
if (galaxy_device_ids.size() == num_requested_devices) {
break;
}
int col_idx = 0;
for (uint32_t ts = 1; ts < tunnels_from_mmio[t].size(); ts++) {
galaxy_device_ids.push_back(tunnels_from_mmio[t][ts]);
col_idx ++;
if (col_idx == num_cols) {
break;
}
}
}
}
managed_devices = tt::tt_metal::detail::CreateDevices(galaxy_device_ids, 1, l1_small_size, trace_region_size);
for (int i = 0; i < num_requested_devices; i++) {
mesh_devices.emplace_back(device_ids[i], managed_devices.at(galaxy_device_ids[i]));
}
} else {
for (int i = 0; i < num_requested_devices; i++) {
managed_devices = tt::tt_metal::detail::CreateDevices(device_ids, 1, l1_small_size, trace_region_size);
for (int i = 0; i < num_requested_devices; i++) {
mesh_devices.emplace_back(device_ids[i], managed_devices.at(device_ids[i]));
}
}
}
/*

for (const auto& [dev_id, dev]: mesh_devices) {
std::cout << "dev_id " << dev_id << " dev " << dev->id() << std::endl;
}*/
log_debug(tt::LogMetal, "TTNN Dev {}: Metal Dev {}", dev_id, dev->id());
}
}


Expand Down
133 changes: 74 additions & 59 deletions tt_metal/llrt/tt_cluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ Cluster::Cluster() {

this->initialize_ethernet_sockets();

this->set_tunnels_from_mmio_device();

this->assert_risc_reset();
}

Expand Down Expand Up @@ -308,6 +310,7 @@ Cluster::~Cluster() {
this->device_to_mmio_device_.clear();
this->device_to_host_mem_channel_.clear();
this->device_eth_routing_info_.clear();
this->tunnels_from_mmio_device.clear();
}

tt_device &Cluster::get_driver(chip_id_t device_id) const {
Expand Down Expand Up @@ -582,82 +585,94 @@ std::unordered_map<chip_id_t, std::vector<CoreCoord>> Cluster::get_ethernet_core
return connected_chips;
}
#define MAX_TUNNEL_DEPTH 4
std::vector<std::vector<chip_id_t>> Cluster::get_tunnels_from_mmio_device(chip_id_t mmio_chip_id) const {
std::vector<std::vector<chip_id_t>> tunnels_from_mmio = {};
const auto &all_eth_connections = this->cluster_desc_->get_ethernet_connections();
TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(mmio_chip_id));

if (all_eth_connections.find(mmio_chip_id) == all_eth_connections.end()) {
return {};
}
void Cluster::set_tunnels_from_mmio_device() {
for (const auto &[mmio_chip_id, physical_chip_id] : this->cluster_desc_->get_chips_with_mmio()) {
std::vector<std::vector<chip_id_t>> tunnels_from_mmio = {};
const auto &all_eth_connections = this->cluster_desc_->get_ethernet_connections();
TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(mmio_chip_id));

if (all_eth_connections.find(mmio_chip_id) == all_eth_connections.end()) {
this->tunnels_from_mmio_device.insert({mmio_chip_id, {}});
continue;
}

std::set<chip_id_t> device_ids = get_devices_controlled_by_mmio_device(mmio_chip_id);
device_ids.erase(mmio_chip_id);
std::set<chip_id_t> device_ids = get_devices_controlled_by_mmio_device(mmio_chip_id);
device_ids.erase(mmio_chip_id);

if (device_ids.size() == 0) {
return {};
}
if (device_ids.size() == 0) {
this->tunnels_from_mmio_device.insert({mmio_chip_id, {}});
continue;
}

for (const auto &[eth_chan, connected_chip_chan] : all_eth_connections.at(mmio_chip_id)) {
const auto &other_chip_id = std::get<0>(connected_chip_chan);
if (device_ids.find(other_chip_id) != device_ids.end()) {
//mmio chip is connected to a remote chip in its mmio group.
//erase from the pool so multiple ethenret connections to same remote device do not
//pollute the counts.
device_ids.erase(other_chip_id);
std::vector<chip_id_t> first_stop = {other_chip_id};
auto it = std::find(tunnels_from_mmio.begin(), tunnels_from_mmio.end(), first_stop);
TT_ASSERT(it == tunnels_from_mmio.end(),"Duplicate first tunnel stop found when finding FD2 Tunnel devices.");
tunnels_from_mmio.push_back(first_stop);
for (const auto &[eth_chan, connected_chip_chan] : all_eth_connections.at(mmio_chip_id)) {
const auto &other_chip_id = std::get<0>(connected_chip_chan);
if (device_ids.find(other_chip_id) != device_ids.end()) {
// mmio chip is connected to a remote chip in its mmio group.
// erase from the pool so multiple ethenret connections to same remote device do not
// pollute the counts.
device_ids.erase(other_chip_id);
std::vector<chip_id_t> first_stop = {other_chip_id};
auto it = std::find(tunnels_from_mmio.begin(), tunnels_from_mmio.end(), first_stop);
TT_ASSERT(
it == tunnels_from_mmio.end(),
"Duplicate first tunnel stop found when finding FD2 Tunnel devices.");
tunnels_from_mmio.push_back(first_stop);
}
}
}

log_debug(
tt::LogMetal, "Found {} FD Tunnels originating from MMIO Device {}", tunnels_from_mmio.size(), mmio_chip_id);
log_debug(
tt::LogMetal,
"Found {} FD Tunnels originating from MMIO Device {}",
tunnels_from_mmio.size(),
mmio_chip_id);

device_ids = get_devices_controlled_by_mmio_device(mmio_chip_id);
device_ids.erase(mmio_chip_id);
device_ids = get_devices_controlled_by_mmio_device(mmio_chip_id);
device_ids.erase(mmio_chip_id);

for (auto &tunnel : tunnels_from_mmio) {
TT_ASSERT(tunnel.size() == 1,"Tunnel depth must be 1 when it has only 1 stop in it.");
device_ids.erase(tunnel[0]);
}
for (auto &tunnel : tunnels_from_mmio) {
TT_ASSERT(tunnel.size() == 1, "Tunnel depth must be 1 when it has only 1 stop in it.");
device_ids.erase(tunnel[0]);
}

bool tunneled_device_hit;
for (auto it = device_ids.begin(); it != device_ids.end();) {
tunneled_device_hit = false;
for (auto &dev_vec : tunnels_from_mmio) {
for (const auto &[eth_chan, connected_chip_chan] : all_eth_connections.at(dev_vec.back())) {
const auto &other_chip_id = std::get<0>(connected_chip_chan);
auto id_iter = device_ids.find(other_chip_id);
if (id_iter != device_ids.end()) {
it = device_ids.erase(id_iter);
dev_vec.push_back(other_chip_id);
tunneled_device_hit = true;
break;
bool tunneled_device_hit;
for (auto it = device_ids.begin(); it != device_ids.end();) {
tunneled_device_hit = false;
for (auto &dev_vec : tunnels_from_mmio) {
for (const auto &[eth_chan, connected_chip_chan] : all_eth_connections.at(dev_vec.back())) {
const auto &other_chip_id = std::get<0>(connected_chip_chan);
auto id_iter = device_ids.find(other_chip_id);
if (id_iter != device_ids.end()) {
it = device_ids.erase(id_iter);
dev_vec.push_back(other_chip_id);
tunneled_device_hit = true;
break;
}
}
}
TT_ASSERT(tunneled_device_hit || (it == device_ids.end()), "Loop Exit Error.");
}
TT_ASSERT(tunneled_device_hit || (it == device_ids.end()),"Loop Exit Error.");
}

TT_ASSERT(tunnels_from_mmio.size() != 0,"Must have at least 1 tunnel from MMIO Device.");
uint32_t tunnel_depth = tunnels_from_mmio[0].size();
log_debug(tt::LogMetal, "Each FD Tunnel is {} deep.", tunnel_depth);
TT_ASSERT(tunnels_from_mmio.size() != 0, "Must have at least 1 tunnel from MMIO Device.");
uint32_t tunnel_depth = tunnels_from_mmio[0].size();
log_debug(tt::LogMetal, "Each FD Tunnel is {} deep.", tunnel_depth);

for (auto &dev_vec : tunnels_from_mmio) {
TT_ASSERT(dev_vec.size() == tunnel_depth,"All tunnels from mmio device must have same depth. Found {}. Expected {}.", dev_vec.size(), tunnel_depth);
//Now that all remotete chips have been added to respective tunnels,
//add mmio device at start of each of the tunnels.
if (dev_vec.size() > MAX_TUNNEL_DEPTH) {
dev_vec.resize(dev_vec.size() - (dev_vec.size() - MAX_TUNNEL_DEPTH));
for (auto &dev_vec : tunnels_from_mmio) {
TT_ASSERT(
dev_vec.size() == tunnel_depth,
"All tunnels from mmio device must have same depth. Found {}. Expected {}.",
dev_vec.size(),
tunnel_depth);
// Now that all remotete chips have been added to respective tunnels,
// add mmio device at start of each of the tunnels.
if (dev_vec.size() > MAX_TUNNEL_DEPTH) {
dev_vec.resize(dev_vec.size() - (dev_vec.size() - MAX_TUNNEL_DEPTH));
}
dev_vec.insert(dev_vec.begin(), mmio_chip_id);
}
dev_vec.insert(dev_vec.begin(), mmio_chip_id);
this->tunnels_from_mmio_device.insert({mmio_chip_id, tunnels_from_mmio});
}
return tunnels_from_mmio;
}


// Ethernet cluster api
void Cluster::initialize_ethernet_sockets() {
for (const auto &chip_id : this->cluster_desc_->get_all_chips()) {
Expand Down
26 changes: 24 additions & 2 deletions tt_metal/llrt/tt_cluster.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,21 @@ class Cluster {

static const Cluster &instance();

// For TG Galaxy systems, mmio chips are gateway chips that are only used for dispatc, so user_devices are meant for
// user facing host apis
size_t number_of_user_devices() const {
if (this->is_tg_cluster_) {
const auto &chips = this->cluster_desc_->get_all_chips();
return std::count_if(chips.begin(), chips.end(), [&](const auto &id) {
return this->cluster_desc_->get_board_type(id) == BoardType::GALAXY;
});
} else {
return this->cluster_desc_->get_number_of_chips();
}
}

size_t number_of_devices() const { return this->cluster_desc_->get_number_of_chips(); }

size_t number_of_pci_devices() const { return this->cluster_desc_->get_chips_with_mmio().size(); }

ARCH arch() const { return this->arch_; }
Expand Down Expand Up @@ -208,8 +222,10 @@ class Cluster {
}

// Returns vector of unique tunnels originating from mmio device.
// Each vecor entry is another vector of remote devices on that tunnel.
std::vector<std::vector<chip_id_t>> get_tunnels_from_mmio_device(chip_id_t mmio_chip_id) const;
// Each vector entry is another vector of remote devices on that tunnel.
std::vector<std::vector<chip_id_t>> get_tunnels_from_mmio_device(chip_id_t mmio_chip_id) const {
return this->tunnels_from_mmio_device.at(mmio_chip_id);
}

bool is_galaxy_cluster() const;

Expand Down Expand Up @@ -241,6 +257,9 @@ class Cluster {
chip_id_t chip_id) const;
void initialize_ethernet_sockets();

// Set tunnels from mmio
void set_tunnels_from_mmio_device();

ARCH arch_;
TargetDevice target_type_;

Expand All @@ -266,6 +285,9 @@ class Cluster {
// If any device has to board type of GALAXY, we are on a TG cluster.
bool is_tg_cluster_;

// Tunnels setup in cluster
std::map<chip_id_t, std::vector<std::vector<chip_id_t>>> tunnels_from_mmio_device = {};

// Currently, each device is mapped to its own channel in host memory to enable fast dispatch
// Channels are unique within a group of devices all controlled by a particular MMIO device
// For example:
Expand Down
2 changes: 1 addition & 1 deletion tt_metal/tt_metal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -722,7 +722,7 @@ void EnableAllocs(Device *device) { tt::tt_metal::allocator::enable_allocs(*(dev

size_t GetNumAvailableDevices() {
#ifdef TT_METAL_VERSIM_DISABLED
return tt::Cluster::instance().number_of_devices();
return tt::Cluster::instance().number_of_user_devices();
#else
return 1;
#endif
Expand Down

0 comments on commit 36848ad

Please sign in to comment.