Skip to content

Commit

Permalink
#0: T3K working
Browse files Browse the repository at this point in the history
  • Loading branch information
tt-dma committed Nov 22, 2024
1 parent 4f33694 commit 6422a3e
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 29 deletions.
4 changes: 2 additions & 2 deletions tests/scripts/run_tools_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ if [[ -z "$TT_METAL_SLOW_DISPATCH_MODE" ]] ; then
find . -name "kernel_args.csv" | xargs -I {} cp {} kernel_args_old.csv
TT_METAL_NEW=1 ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter=*WatcherRingBufferBrisc
find . -name "kernel_args.csv" | xargs -I {} cp {} kernel_args_new.csv
if cmp kernel_args_old.csv kernel_args_new.csv; then
if diff kernel_args_old.csv kernel_args_new.csv; then
echo "FD Compile Args Test - 1CQ PASS"
else
echo "FD Compile Args Test - 1CQ FAIL"
Expand All @@ -26,7 +26,7 @@ if [[ -z "$TT_METAL_SLOW_DISPATCH_MODE" ]] ; then
find . -name "kernel_args.csv" | xargs -I {} cp {} kernel_args_old.csv
TT_METAL_GTEST_ETH_DISPATCH=1 TT_METAL_GTEST_NUM_HW_CQS=2 TT_METAL_NEW=1 ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter=*WatcherRingBufferBrisc
find . -name "kernel_args.csv" | xargs -I {} cp {} kernel_args_new.csv
if cmp kernel_args_old.csv kernel_args_new.csv; then
if diff kernel_args_old.csv kernel_args_new.csv; then
echo "FD Compile Args Test - 2CQ PASS"
else
echo "FD Compile Args Test - 2CQ FAIL"
Expand Down
63 changes: 51 additions & 12 deletions tt_metal/impl/dispatch/arch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,17 @@ typedef struct {

// For readability, unset = x = -1
#define x -1

// Offsets every node id stored in `node` by `inc`: the node's own id, plus each
// entry of its upstream and downstream id lists. Entries equal to the unset
// marker `x` (-1) are left untouched so they stay recognizable as "no link".
void increment_node_ids(dispatch_kernel_node_t &node, uint32_t inc) {
    // Shift one linked id in place, skipping slots that are not populated.
    const auto shift_if_set = [inc](int &linked_id) {
        if (linked_id == x) {
            return;
        }
        linked_id += inc;
    };

    node.id += inc;
    for (int &upstream_id : node.upstream_ids) {
        shift_if_set(upstream_id);
    }
    for (int &downstream_id : node.downstream_ids) {
        shift_if_set(downstream_id);
    }
}

static const std::vector<dispatch_kernel_node_t> single_card_arch_1cq = {
{0, 0, 0, PREFETCH_HD, { x, x, x, x}, { 1, 2, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{1, 0, 0, DISPATCH_HD, { 0, x, x, x}, { 2, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0},
Expand Down Expand Up @@ -80,34 +91,61 @@ static const std::vector<dispatch_kernel_node_t> two_card_arch_2cq = {
std::vector<FDKernel *> node_id_to_kernel;

std::unique_ptr<Program> create_mmio_cq_program(Device *device) {
const std::vector<dispatch_kernel_node_t> *nodes;
if (tt::Cluster::instance().number_of_user_devices() == 1) {
nodes = (device->num_hw_cqs() == 1) ? &single_card_arch_1cq : &single_card_arch_2cq;
} else if (tt::Cluster::instance().number_of_user_devices() == 2) {
nodes = (device->num_hw_cqs() == 1) ? &two_card_arch_1cq : &two_card_arch_2cq;
} else {
std::vector<dispatch_kernel_node_t> nodes;
uint32_t num_devices = tt::Cluster::instance().number_of_user_devices();
if (num_devices == 1) { // E150, N150
nodes = (device->num_hw_cqs() == 1) ? single_card_arch_1cq : single_card_arch_2cq;
} else if (num_devices == 2) { // N300
nodes = (device->num_hw_cqs() == 1) ? two_card_arch_1cq : two_card_arch_2cq;
} else if (num_devices == 8) { // T3K
const std::vector<dispatch_kernel_node_t> *nodes_for_one_mmio = (device->num_hw_cqs() == 1) ? &two_card_arch_1cq : &two_card_arch_2cq;
// TODO: specify replication + device id mapping from struct/yaml? Just to avoid having these huge graphs typed out
uint32_t num_mmio_devices = 4;
uint32_t num_nodes_for_one_mmio = nodes_for_one_mmio->size();
for (int mmio_device_id = 0; mmio_device_id < num_mmio_devices; mmio_device_id++) {
for (dispatch_kernel_node_t node : *nodes_for_one_mmio) {
TT_ASSERT(node.device_id == 0 || node.device_id == 1);
if (node.device_id == 0)
node.device_id = mmio_device_id;
else
node.device_id = mmio_device_id + num_mmio_devices;
increment_node_ids(node, mmio_device_id * num_nodes_for_one_mmio);
nodes.push_back(node);
}
}
} else { // TG, TGG
TT_FATAL(false, "Not yet implemented!");
}
for (auto &node : nodes) {
std::string upstream = "";
for (int id : node.upstream_ids)
upstream += fmt::format("{}, ", id);
std::string downstream = "";
for (int id : node.downstream_ids)
downstream += fmt::format("{}, ", id);

// tt::log_info("[{}, {}, {}, {}, [{}], [{}], {}, {}, {}]", node.id, node.device_id, node.cq_id, node.kernel_type, upstream, downstream, node.my_noc, node.upstream_noc, node.downstream_noc);
}
if (node_id_to_kernel.empty()) {
// Do setup of kernel objects one time at the beginning, since they (1) don't need a valid Device until fields
// are populated, and (2) need to be connected to kernel objects for devices that aren't being created yet.
// Read the input table, create configs for each node
for (const auto &node : *nodes) {
for (const auto &node : nodes) {
node_id_to_kernel.push_back(FDKernel::Generate(
node.id, node.cq_id, {node.my_noc, node.upstream_noc, node.downstream_noc}, node.kernel_type));
}

// Connect the graph with upstream/downstream kernels
for (const auto &node : *nodes) {
for (const auto &node : nodes) {
for (int idx = 0; idx < DISPATCH_MAX_UPSTREAM; idx++) {
if (node.upstream_ids[idx] >= 0) {
tt::log_info("Node {} has upstream node: {}", node.id, node.upstream_ids[idx]);
// tt::log_info("Node {} has upstream node: {}", node.id, node.upstream_ids[idx]);
node_id_to_kernel.at(node.id)->AddUpstreamKernel(node_id_to_kernel.at(node.upstream_ids[idx]));
}
}
for (int idx = 0; idx < DISPATCH_MAX_DOWNSTREAM; idx++) {
if (node.downstream_ids[idx] >= 0) {
tt::log_info("Node {} has downstream node: {}", node.id, node.downstream_ids[idx]);
// tt::log_info("Node {} has downstream node: {}", node.id, node.downstream_ids[idx]);
node_id_to_kernel.at(node.id)->AddDownstreamKernel(node_id_to_kernel.at(node.downstream_ids[idx]));
}
}
Expand All @@ -118,16 +156,17 @@ std::unique_ptr<Program> create_mmio_cq_program(Device *device) {
auto cq_program_ptr = std::make_unique<Program>();
// for (auto &node_and_kernel : node_id_to_kernel) {
for (int idx = 0; idx < node_id_to_kernel.size(); idx++) {
if (nodes->at(idx).device_id == device->id()) {
if (nodes.at(idx).device_id == device->id()) {
node_id_to_kernel[idx]->AddDeviceAndProgram(device, cq_program_ptr.get());
node_id_to_kernel[idx]->GenerateStaticConfigs();
// tt::log_warning("Node {} has coord: {} (phys={})", idx, node_id_to_kernel[idx]->GetLogicalCore().str(), node_id_to_kernel[idx]->GetPhysicalCore().str());
}
}

// Third pass, populate dependent configs and create kernels for each node
// for (auto &node_and_kernel : node_id_to_kernel) {
for (int idx = 0; idx < node_id_to_kernel.size(); idx++) {
if (nodes->at(idx).device_id == device->id()) {
if (nodes.at(idx).device_id == device->id()) {
node_id_to_kernel[idx]->GenerateDependentConfigs();
node_id_to_kernel[idx]->CreateKernel();
}
Expand Down
31 changes: 17 additions & 14 deletions tt_metal/impl/dispatch/dispatch_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#include "impl/debug/dprint_server.hpp"

#define UNUSED_LOGICAL_CORE tt_cxy_pair(this->device->id(), 0, 0)
// TODO: Just to make match with previous implementation, remove later
#define UNUSED_LOGICAL_CORE_ADJUSTED tt_cxy_pair(this->device->id() + tt::Cluster::instance().number_of_pci_devices(), 0, 0)
#define UNUSED_SEM_ID 0

static std::vector<string> dispatch_kernel_file_names = {
Expand Down Expand Up @@ -193,7 +195,7 @@ void PrefetchKernel::GenerateStaticConfigs() {
uint32_t issue_queue_start_addr = command_queue_start_addr + cq_start;
uint32_t issue_queue_size = device->sysmem_manager().get_issue_queue_size(cq_id);

this->logical_core = dispatch_core_manager::instance().prefetcher_core(device->id() + 1, channel, cq_id); // TODO
this->logical_core = dispatch_core_manager::instance().prefetcher_core(device->id() + tt::Cluster::instance().number_of_pci_devices(), channel, cq_id); // TODO

this->config.downstream_cb_log_page_size = dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE;
this->config.downstream_cb_pages = my_dispatch_constants.mux_buffer_pages(device->num_hw_cqs());
Expand Down Expand Up @@ -352,7 +354,7 @@ void DispatchKernel::GenerateStaticConfigs() {
uint32_t completion_queue_start_addr = issue_queue_start_addr + issue_queue_size;
uint32_t completion_queue_size = device->sysmem_manager().get_completion_queue_size(cq_id);

this->logical_core = dispatch_core_manager::instance().dispatcher_core(device->id() + 1, channel, cq_id); // TODO
this->logical_core = dispatch_core_manager::instance().dispatcher_core(device->id() + tt::Cluster::instance().number_of_pci_devices(), channel, cq_id); // TODO
this->config.dispatch_cb_base = my_dispatch_constants.dispatch_buffer_base();
this->config.dispatch_cb_log_page_size = dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE;
this->config.dispatch_cb_pages = my_dispatch_constants.dispatch_buffer_pages();
Expand Down Expand Up @@ -501,8 +503,8 @@ void MuxKernel::GenerateStaticConfigs() {

void DemuxKernel::GenerateStaticConfigs() {
tt::log_warning("GenerateStaticConfigs: Demux");
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device->id() + 1); // TODO
this->logical_core = dispatch_core_manager::instance().demux_core(this->device->id() + 1, channel, this->cq_id);
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device->id() + tt::Cluster::instance().number_of_pci_devices()); // TODO: this is the downstream
this->logical_core = dispatch_core_manager::instance().demux_core(this->device->id() + tt::Cluster::instance().number_of_pci_devices(), channel, this->cq_id);
this->config.vc_count = downstream_kernels.size() + 1; // TODO: update for deeper tunnels?
this->config.endpoint_id_start_index = 0xD1;
this->config.rx_queue_start_addr_words = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED) >> 4;
Expand All @@ -529,9 +531,10 @@ void DemuxKernel::GenerateStaticConfigs() {

void EthTunnelerKernel::GenerateStaticConfigs() {
tt::log_warning("GenerateStaticConfigs: Tunneler");
this->downstream_device_id = device->id() + tt::Cluster::instance().number_of_pci_devices(); // TODO: update for galaxy...
if (this->IsRemote()) {
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(downstream_device_id);
this->logical_core = dispatch_core_manager::instance().tunneler_core(device->id(), downstream_device_id, channel, cq_id);
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(downstream_device_id.value());
this->logical_core = dispatch_core_manager::instance().tunneler_core(device->id(), downstream_device_id.value(), channel, cq_id);
} else {
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id());
this->logical_core = dispatch_core_manager::instance().us_tunneler_core_local(device->id(), channel, cq_id);
Expand All @@ -548,8 +551,8 @@ void EthRouterKernel::GenerateStaticConfigs() {
auto &my_dispatch_constants = dispatch_constants::get(GetCoreType());
if (this->as_mux) {
tt::log_warning("GenerateStaticConfigs: Router (MUX)");
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id() + 1); // TODO
this->logical_core = dispatch_core_manager::instance().mux_core(device->id() + 1, channel, cq_id);
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id() + tt::Cluster::instance().number_of_pci_devices()); // TODO: this is the downstream
this->logical_core = dispatch_core_manager::instance().mux_core(device->id() + tt::Cluster::instance().number_of_pci_devices(), channel, cq_id);
this->config.vc_count = upstream_kernels.size() + 1;
this->config.rx_queue_start_addr_words = my_dispatch_constants.dispatch_buffer_base() >> 4;
this->config.rx_queue_size_words = my_dispatch_constants.mux_buffer_size(device->num_hw_cqs()) >> 4;
Expand Down Expand Up @@ -648,15 +651,15 @@ void PrefetchKernel::GenerateDependentConfigs() {
} else if (this->config.is_h_variant.value()) {
// Upstream, just host so no dispatch core
TT_ASSERT(this->upstream_kernels.size() == 0);
this->config.upstream_logical_core = UNUSED_LOGICAL_CORE;
this->config.upstream_logical_core = UNUSED_LOGICAL_CORE_ADJUSTED;
this->config.upstream_cb_sem_id = 0; // Used in prefetch_d only

// Downstream, expect just one ROUTER
TT_ASSERT(this->downstream_kernels.size() == 1);
auto router_kernel = dynamic_cast<EthRouterKernel *>(this->downstream_kernels[0]);
TT_ASSERT(router_kernel);
this->config.downstream_logical_core = router_kernel->GetLogicalCore();
this->config.downstream_s_logical_core = UNUSED_LOGICAL_CORE;
this->config.downstream_s_logical_core = UNUSED_LOGICAL_CORE_ADJUSTED;
uint32_t router_idx = router_kernel->GetUpstreamPort(this); // Need the port that this connects to downstream
this->config.downstream_cb_base = my_dispatch_constants.dispatch_buffer_base() + my_dispatch_constants.mux_buffer_size(device->num_hw_cqs()) * router_idx;
this->config.downstream_cb_sem_id = router_kernel->GetConfig().input_packetize_local_sem[router_idx];
Expand Down Expand Up @@ -753,8 +756,8 @@ void DispatchKernel::GenerateDependentConfigs() {
TT_ASSERT(this->downstream_kernels.size() == 1);
auto prefetch_h_kernel = dynamic_cast<PrefetchKernel *>(this->downstream_kernels[0]);
TT_ASSERT(prefetch_h_kernel);
this->config.downstream_logical_core = UNUSED_LOGICAL_CORE;
this->config.downstream_s_logical_core = UNUSED_LOGICAL_CORE;
this->config.downstream_logical_core = UNUSED_LOGICAL_CORE_ADJUSTED;
this->config.downstream_s_logical_core = UNUSED_LOGICAL_CORE_ADJUSTED;
this->config.prefetch_h_noc_xy = NOC_XY_ENCODING(prefetch_h_kernel->GetPhysicalCore().x, prefetch_h_kernel->GetPhysicalCore().y);
this->config.prefetch_h_local_downstream_sem_addr = prefetch_h_kernel->GetConfig().my_downstream_cb_sem_id;
this->config.downstream_cb_base = my_dispatch_constants.dispatch_buffer_base(); // Unused
Expand Down Expand Up @@ -895,9 +898,9 @@ void EthTunnelerKernel::GenerateDependentConfigs() {
if (this->IsRemote()) {
// For remote tunneler, we don't actually have the device constructed for the paired tunneler, so can't pull
// info from it. Core coord can be computed without the device, and relevant fields match this tunneler.
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(downstream_device_id);
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(downstream_device_id.value());
tt_cxy_pair paired_logical_core =
dispatch_core_manager::instance().us_tunneler_core_local(downstream_device_id, channel, cq_id);
dispatch_core_manager::instance().us_tunneler_core_local(downstream_device_id.value(), channel, cq_id);
CoreCoord paired_physical_coord = tt::get_physical_core_coordinate(paired_logical_core, CoreType::ETH);

// Upstream, we expect a US_TUNNELER_LOCAL and a PACKET_ROUTER
Expand Down
2 changes: 1 addition & 1 deletion tt_metal/impl/dispatch/dispatch_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,7 @@ class EthTunnelerKernel : public FDKernel {
private:
eth_tunneler_config_t config;
uint32_t tunnel_stop;
chip_id_t downstream_device_id = 1; // TODO
std::optional<chip_id_t> downstream_device_id; // TODO
bool is_remote;
bool is_tunnel_start = true;
bool is_tunnel_end = true;
Expand Down

0 comments on commit 6422a3e

Please sign in to comment.