#0: T3K fixes
tt-dma committed Dec 6, 2024
1 parent 63d8f38 commit ba22769
Showing 4 changed files with 106 additions and 32 deletions.
1 change: 1 addition & 0 deletions tt_metal/impl/device/device.cpp
@@ -2189,6 +2189,7 @@ void Device::compile_command_queue_programs_new() {
if (this->is_mmio_capable()) {
auto command_queue_program_ptr = create_mmio_cq_program(this);
this->command_queue_programs.push_back(std::move(command_queue_program_ptr));
this->setup_tunnel_for_remote_devices();
} else {
auto command_queue_program_ptr = create_mmio_cq_program(this);
this->command_queue_programs.push_back(std::move(command_queue_program_ptr));
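The single added line above calls setup_tunnel_for_remote_devices() from the MMIO branch of compile_command_queue_programs_new(), so tunnel configuration for remote chips now happens right after the MMIO device's command-queue program is built. Below is a minimal, self-contained sketch of that control flow; Device, Program, and the method bodies here are simplified stand-ins, not the tt-metal implementation.

#include <iostream>
#include <memory>
#include <string>
#include <vector>

struct Program { std::string name; };

struct Device {
    bool mmio_capable = false;
    std::vector<std::unique_ptr<Program>> command_queue_programs;

    // Stand-in for the real tunnel setup between an MMIO chip and its remote chips.
    void setup_tunnel_for_remote_devices() {
        std::cout << "configuring tunnels to remote devices\n";
    }

    void compile_command_queue_programs() {
        // Build the command-queue program for this device.
        command_queue_programs.push_back(std::make_unique<Program>(Program{"cq"}));
        if (mmio_capable) {
            // As in the change above: tunnel setup happens in the MMIO branch,
            // immediately after the CQ program is created.
            setup_tunnel_for_remote_devices();
        }
    }
};

int main() {
    Device mmio_dev{true};
    mmio_dev.compile_command_queue_programs();
    return 0;
}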
24 changes: 17 additions & 7 deletions tt_metal/impl/device/device_pool.cpp
@@ -315,14 +315,12 @@ bool DevicePool::is_device_active(chip_id_t id) const {
}

void DevicePool::add_devices_to_pool(const std::vector<chip_id_t>& device_ids) {
populate_fd_kernels(device_ids.size(), this->num_hw_cqs);
std::set<chip_id_t> devices_to_activate;
if (this->skip_remote_devices) {
for (const auto& device_id : device_ids) {
const auto& mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id);
TT_ASSERT(device_id == mmio_device_id, "Skipping remote devices is only available for mmio devices");
if (not this->is_device_active(device_id)) {
this->activate_device(device_id);
}
devices_to_activate.insert(device_id);
}
} else {
std::vector<chip_id_t> all_device_ids = {};
@@ -331,12 +329,24 @@ void DevicePool::add_devices_to_pool(const std::vector<chip_id_t>& device_ids) {
const auto& mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id);
for (const auto& mmio_controlled_device_id :
tt::Cluster::instance().get_devices_controlled_by_mmio_device(mmio_device_id)) {
if (not this->is_device_active(mmio_controlled_device_id)) {
this->activate_device(mmio_controlled_device_id);
}
devices_to_activate.insert(mmio_controlled_device_id);
}
}
}

std::string ids_str = "";
for (auto id : devices_to_activate) {
ids_str += fmt::format("{}, ", id);
}
log_warning("Device Pool init, ids: {}skip_remote_devices: {}", ids_str, this->skip_remote_devices);
if (llrt::OptionsG.get_use_new_fd_init()) {
populate_fd_kernels(devices_to_activate, this->num_hw_cqs);
}
for (const auto& device_id : devices_to_activate) {
if (not this->is_device_active(device_id)) {
this->activate_device(device_id);
}
}
}

void DevicePool::register_worker_thread_for_device(v1::DeviceHandle device, std::thread::id worker_thread_id) {
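The restructuring above changes DevicePool::add_devices_to_pool() to first collect the complete set of devices that must be brought up, then (when the new fast-dispatch init path is enabled) pass that whole set to populate_fd_kernels(), and only afterwards activate each device. A self-contained sketch of this collect-then-configure-then-activate ordering follows; the *_stub functions are illustrative stand-ins, not the real tt-metal APIs.

#include <cstdint>
#include <iostream>
#include <set>
#include <vector>

using chip_id_t = int32_t;

// Stand-in for populate_fd_kernels(): it now receives the full set of active devices up
// front, so the dispatch topology can be chosen once before any device is activated.
void populate_fd_kernels_stub(const std::set<chip_id_t>& ids, uint32_t num_hw_cqs) {
    std::cout << "configuring dispatch for " << ids.size()
              << " devices with " << num_hw_cqs << " HW CQ(s)\n";
}

void activate_device_stub(chip_id_t id) {
    std::cout << "activating device " << id << "\n";
}

int main() {
    std::vector<chip_id_t> requested = {0, 1, 4, 5};
    uint32_t num_hw_cqs = 1;

    // 1. Collect every device that must be brought up (the requested ids plus, in the real
    //    code, any devices controlled by the same MMIO chip).
    std::set<chip_id_t> devices_to_activate(requested.begin(), requested.end());

    // 2. Decide the fast-dispatch topology from the complete set, before activation.
    populate_fd_kernels_stub(devices_to_activate, num_hw_cqs);

    // 3. Only then activate each device.
    for (chip_id_t id : devices_to_activate) {
        activate_device_stub(id);
    }
    return 0;
}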
111 changes: 87 additions & 24 deletions tt_metal/impl/dispatch/arch.cpp
@@ -47,6 +47,15 @@ static const std::vector<dispatch_kernel_node_t> single_card_arch_2cq = {
{3, 0, 1, DISPATCH_HD, { 2, x, x, x}, { x, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0},
};

static const std::vector<dispatch_kernel_node_t> single_card_arch_2cq_dispatch_s = {
{0, 0, 0, PREFETCH_HD, { x, x, x, x}, { 1, 4, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{1, 0, 0, DISPATCH_HD, { 0, x, x, x}, { 4, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0},
{2, 0, 1, PREFETCH_HD, { x, x, x, x}, { 3, 5, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{3, 0, 1, DISPATCH_HD, { 2, x, x, x}, { 5, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0},
{4, 0, 0, DISPATCH_S, { 0, x, x, x}, { 1, x, x, x}, NOC::NOC_1, NOC::NOC_1, NOC::NOC_1},
{5, 0, 1, DISPATCH_S, { 2, x, x, x}, { 3, x, x, x}, NOC::NOC_1, NOC::NOC_1, NOC::NOC_1},
};

static const std::vector<dispatch_kernel_node_t> two_card_arch_1cq = {
{ 0, 0, 0, PREFETCH_HD, { x, x, x, x}, { 1, 2, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{ 1, 0, 0, DISPATCH_HD, { 0, x, x, x}, { 2, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0},
@@ -94,31 +103,85 @@ std::vector<FDKernel *> node_id_to_kernel;
// Populate node_id_to_kernel and set up kernel objects. Do this once at the beginning since they (1) don't need a valid
// Device until fields are populated, (2) need to be connected to kernel objects for devices that aren't created yet,
// and (3) the table to choose depends on the total number of devices, which is not known at Device creation.
void populate_fd_kernels(uint32_t num_devices, uint32_t num_hw_cqs) {
tt::log_warning("FD Config: {} devices, {} HW CQs", num_devices, num_hw_cqs);
// Select/generate the right input table. TODO: read this out of YAML instead of the structs above?
void populate_fd_kernels(const std::set<chip_id_t> &device_ids, uint32_t num_hw_cqs) {
// Select/generate the right input table; this depends on (1) the board [detected from the total # of devices] and (2) the
// number of active devices. TODO: read this out of YAML instead of the structs above?
uint32_t total_devices = tt::Cluster::instance().number_of_user_devices();
uint32_t num_devices = device_ids.size();
TT_ASSERT(num_devices > 0, "Can't determine dispatch architecture with no active devices.");
TT_ASSERT(num_devices <= total_devices);
tt::log_warning("FD Config: {}/{} devices, {} HW CQs", num_devices, total_devices, num_hw_cqs);
std::vector<dispatch_kernel_node_t> nodes;
if (num_devices == 0)
num_devices = tt::Cluster::instance().number_of_user_devices();

if (num_devices == 1) { // E150, N150
nodes = (num_hw_cqs == 1) ? single_card_arch_1cq : single_card_arch_2cq;
} else if (num_devices == 2) { // N300
nodes = (num_hw_cqs == 1) ? two_card_arch_1cq : two_card_arch_2cq;
} else if (num_devices == 8) { // T3K
const std::vector<dispatch_kernel_node_t> *nodes_for_one_mmio = (num_hw_cqs == 1) ? &two_card_arch_1cq : &two_card_arch_2cq;
// TODO: specify replication + device id mapping from struct/yaml? Just to avoid having these huge graphs typed out
uint32_t num_mmio_devices = 4;
uint32_t num_nodes_for_one_mmio = nodes_for_one_mmio->size();
for (int mmio_device_id = 0; mmio_device_id < num_mmio_devices; mmio_device_id++) {
for (dispatch_kernel_node_t node : *nodes_for_one_mmio) {
TT_ASSERT(node.device_id == 0 || node.device_id == 1);
if (node.device_id == 0)
node.device_id = mmio_device_id;
else
node.device_id = mmio_device_id + num_mmio_devices;
increment_node_ids(node, mmio_device_id * num_nodes_for_one_mmio);
nodes.push_back(node);

// Helper function to get nodes for single device
auto populate_single_device = [&]() {
if (num_hw_cqs == 1) {
return single_card_arch_1cq;
} else {
// Special case here, single-device can either have dispatch_s or no dispatch_s, depending on the dispatch
// core type. This is only an issue for single-chip, since multi-chip always has ethernet dispatch (and
// therefore no dispatch_s). TODO: determine whether dispatch_s is inserted at this level, instead of inside
// Device::dispatch_s_enabled().
if (dispatch_core_manager::instance().get_dispatch_core_type(0) == CoreType::WORKER) {
return single_card_arch_2cq_dispatch_s;
} else {
return single_card_arch_2cq;
}
}
};

if (total_devices == 1) { // E150, N150
nodes = populate_single_device();
} else if (total_devices == 2) { // N300
if (num_devices == 1) {
nodes = populate_single_device();
} else {
nodes = (num_hw_cqs == 1) ? two_card_arch_1cq : two_card_arch_2cq;
}
} else if (total_devices == 8) { // T3K
// Need to determine the submesh of devices that are being used. TODO: user to pass in the correct architecture?
std::set<chip_id_t> mmio_devices;
std::set<chip_id_t> remote_devices;
for (auto id : device_ids) {
if (tt::Cluster::instance().get_associated_mmio_device(id) == id)
mmio_devices.insert(id);
else
remote_devices.insert(id);
}
tt::log_warning("T3000, mmio_count={}, remote_count={}", mmio_devices.size(), remote_devices.size());

// Supported grid either has one remote per mmio or none
TT_ASSERT(mmio_devices.size() == remote_devices.size() or remote_devices.empty(), "Unexpected device grid");
if (remote_devices.empty()) {
// All mmio chips, replicate as required
std::vector<dispatch_kernel_node_t> nodes_for_one_mmio = populate_single_device();
uint32_t index_offset = 0;
for (auto id : mmio_devices) {
for (auto node : nodes_for_one_mmio) {
node.device_id = id;
increment_node_ids(node, index_offset);
nodes.push_back(node);
}
index_offset += nodes_for_one_mmio.size();
}
} else {
// Paired mmio/remote chips
// Here we assume that the mmio chips are enumerated first, and the remote chips are enumerated afterwards
// in the same order as they are connected. TODO: This seems to always be the case, but may need to change in the future.
const std::vector<dispatch_kernel_node_t> *nodes_for_one_mmio = (num_hw_cqs == 1) ? &two_card_arch_1cq : &two_card_arch_2cq;
uint32_t num_mmio_devices = 4;
uint32_t index_offset = 0;
for (auto mmio_device_id : mmio_devices) {
for (dispatch_kernel_node_t node : *nodes_for_one_mmio) {
TT_ASSERT(node.device_id == 0 || node.device_id == 1);
if (node.device_id == 0)
node.device_id = mmio_device_id;
else
node.device_id = mmio_device_id + num_mmio_devices;
increment_node_ids(node, index_offset);
nodes.push_back(node);
}
index_offset += nodes_for_one_mmio->size();
}
}
} else { // TG, TGG
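The new populate_fd_kernels() above selects a dispatch-node table based on the board (total device count) and the set of active devices; for T3K it partitions the active chips into MMIO and remote sets, then replicates a per-device (or per MMIO/remote pair) template, shifting node ids by a running offset the way increment_node_ids() does so that ids stay globally unique. The sketch below is a self-contained illustration of that replication mechanism only; node_t, shift_ids(), and the two-node template are simplified stand-ins for dispatch_kernel_node_t and the tables above.

#include <array>
#include <iostream>
#include <vector>

constexpr int x = -1;  // "unused" marker, as in the tables above

struct node_t {
    int id;
    int device_id;
    std::array<int, 4> upstream;    // node ids this node receives from
    std::array<int, 4> downstream;  // node ids this node sends to
};

// Shift a node's id and every valid upstream/downstream reference by `offset`
// (analogous to increment_node_ids() in arch.cpp).
void shift_ids(node_t& n, int offset) {
    n.id += offset;
    for (int& u : n.upstream) { if (u != x) { u += offset; } }
    for (int& d : n.downstream) { if (d != x) { d += offset; } }
}

int main() {
    // Template for one device: a prefetcher (node 0) feeding a dispatcher (node 1).
    const std::vector<node_t> per_device_template = {
        {0, 0, {x, x, x, x}, {1, x, x, x}},
        {1, 0, {0, x, x, x}, {x, x, x, x}},
    };

    std::vector<int> active_devices = {0, 1, 2, 3};  // e.g. the MMIO chips of a T3K
    std::vector<node_t> nodes;
    int index_offset = 0;
    for (int dev : active_devices) {
        // Stamp out the template for this device, offsetting ids so they don't collide.
        for (node_t n : per_device_template) {
            n.device_id = dev;
            shift_ids(n, index_offset);
            nodes.push_back(n);
        }
        index_offset += static_cast<int>(per_device_template.size());
    }

    for (const auto& n : nodes) {
        std::cout << "node " << n.id << " on device " << n.device_id << "\n";
    }
    return 0;
}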
2 changes: 1 addition & 1 deletion tt_metal/impl/dispatch/arch.hpp
@@ -3,7 +3,7 @@
// SPDX-License-Identifier: Apache-2.0
#pragma once

void populate_fd_kernels(uint32_t num_devices, uint32_t num_hw_cqs);
void populate_fd_kernels(const std::set<chip_id_t> &device_ids, uint32_t num_hw_cqs);

std::unique_ptr<Program> create_mmio_cq_program(Device *device);

