#0: T3K working

tenstorrent · Nov 22, 2024 · 6422a3e · 6422a3e
1 parent 4f33694
commit 6422a3e
Show file tree

Hide file tree

Showing 4 changed files with 71 additions and 29 deletions.
diff --git a/tests/scripts/run_tools_tests.sh b/tests/scripts/run_tools_tests.sh
@@ -14,7 +14,7 @@ if [[ -z "$TT_METAL_SLOW_DISPATCH_MODE" ]] ; then
     find . -name "kernel_args.csv" | xargs -I {} cp {} kernel_args_old.csv
     TT_METAL_NEW=1 ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter=*WatcherRingBufferBrisc
     find . -name "kernel_args.csv" | xargs -I {} cp {} kernel_args_new.csv
-    if cmp kernel_args_old.csv kernel_args_new.csv; then
+    if diff kernel_args_old.csv kernel_args_new.csv; then
         echo "FD Compile Args Test - 1CQ PASS"
     else
         echo "FD Compile Args Test - 1CQ FAIL"
@@ -26,7 +26,7 @@ if [[ -z "$TT_METAL_SLOW_DISPATCH_MODE" ]] ; then
         find . -name "kernel_args.csv" | xargs -I {} cp {} kernel_args_old.csv
         TT_METAL_GTEST_ETH_DISPATCH=1 TT_METAL_GTEST_NUM_HW_CQS=2 TT_METAL_NEW=1 ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter=*WatcherRingBufferBrisc
         find . -name "kernel_args.csv" | xargs -I {} cp {} kernel_args_new.csv
-        if cmp kernel_args_old.csv kernel_args_new.csv; then
+        if diff kernel_args_old.csv kernel_args_new.csv; then
             echo "FD Compile Args Test - 2CQ PASS"
         else
             echo "FD Compile Args Test - 2CQ FAIL"

diff --git a/tt_metal/impl/dispatch/arch.cpp b/tt_metal/impl/dispatch/arch.cpp
@@ -22,6 +22,17 @@ typedef struct {
 
 // For readability, unset = x = -1
 #define x -1
+// Adds `inc` to node.id and to every populated entry of node.upstream_ids / node.downstream_ids, modifying `node` in place.
+void increment_node_ids(dispatch_kernel_node_t &node, uint32_t inc) {  // called per copied node in the T3K branch of create_mmio_cq_program
+    node.id += inc;
+    for (int &id : node.upstream_ids)
+        if (id != x)  // x (-1) marks an unset slot; leave it alone so later `>= 0` checks still see it as unset
+            id += inc;
+    for (int &id : node.downstream_ids)
+        if (id != x)  // same unset-slot guard as for upstream_ids
+            id += inc;
+}
+
 static const std::vector<dispatch_kernel_node_t> single_card_arch_1cq = {
     {0, 0, 0, PREFETCH_HD,   { x,  x,  x,  x}, { 1,  2,  x,  x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
     {1, 0, 0, DISPATCH_HD,   { 0,  x,  x,  x}, { 2,  x,  x,  x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0},
@@ -80,34 +91,61 @@ static const std::vector<dispatch_kernel_node_t> two_card_arch_2cq = {
 std::vector<FDKernel *> node_id_to_kernel;
 
 std::unique_ptr<Program> create_mmio_cq_program(Device *device) {
-    const std::vector<dispatch_kernel_node_t> *nodes;
-    if (tt::Cluster::instance().number_of_user_devices() == 1) {
-        nodes = (device->num_hw_cqs() == 1) ? &single_card_arch_1cq : &single_card_arch_2cq;
-    } else if (tt::Cluster::instance().number_of_user_devices() == 2) {
-        nodes = (device->num_hw_cqs() == 1) ? &two_card_arch_1cq : &two_card_arch_2cq;
-    } else {
+    std::vector<dispatch_kernel_node_t> nodes;
+    uint32_t num_devices = tt::Cluster::instance().number_of_user_devices();
+    if (num_devices == 1) { // E150, N150
+        nodes = (device->num_hw_cqs() == 1) ? single_card_arch_1cq : single_card_arch_2cq;
+    } else if (num_devices == 2) { // N300
+        nodes = (device->num_hw_cqs() == 1) ? two_card_arch_1cq : two_card_arch_2cq;
+    } else if (num_devices == 8) { // T3K
+        const std::vector<dispatch_kernel_node_t> *nodes_for_one_mmio = (device->num_hw_cqs() == 1) ? &two_card_arch_1cq : &two_card_arch_2cq;
+        // TODO: specify replication + device id mapping from struct/yaml? Just to avoid having these huge graphs typed out
+        uint32_t num_mmio_devices = 4;
+        uint32_t num_nodes_for_one_mmio = nodes_for_one_mmio->size();
+        for (int mmio_device_id = 0; mmio_device_id < num_mmio_devices; mmio_device_id++) {
+            for (dispatch_kernel_node_t node : *nodes_for_one_mmio) {
+                TT_ASSERT(node.device_id == 0 || node.device_id == 1);
+                if (node.device_id == 0)
+                    node.device_id = mmio_device_id;
+                else
+                    node.device_id = mmio_device_id + num_mmio_devices;
+                increment_node_ids(node, mmio_device_id * num_nodes_for_one_mmio);
+                nodes.push_back(node);
+            }
+        }
+    } else { // TG, TGG
         TT_FATAL(false, "Not yet implemented!");
     }
+    for (auto &node : nodes) {
+        std::string upstream = "";
+        for (int id : node.upstream_ids)
+            upstream += fmt::format("{}, ", id);
+        std::string downstream = "";
+        for (int id : node.downstream_ids)
+            downstream += fmt::format("{}, ", id);
+
+        // tt::log_info("[{}, {}, {}, {}, [{}], [{}], {}, {}, {}]", node.id, node.device_id, node.cq_id, node.kernel_type, upstream, downstream, node.my_noc, node.upstream_noc, node.downstream_noc);
+    }
     if (node_id_to_kernel.empty()) {
         // Do setup of kernel objects one time at the beginning, since they (1) don't need a valid Device until fields
         // are populated, and (2) need to be connected to kernel objects for devices that aren't being created yet.
         // Read the input table, create configs for each node
-        for (const auto &node : *nodes) {
+        for (const auto &node : nodes) {
             node_id_to_kernel.push_back(FDKernel::Generate(
                 node.id, node.cq_id, {node.my_noc, node.upstream_noc, node.downstream_noc}, node.kernel_type));
         }
 
         // Connect the graph with upstream/downstream kernels
-        for (const auto &node : *nodes) {
+        for (const auto &node : nodes) {
             for (int idx = 0; idx < DISPATCH_MAX_UPSTREAM; idx++) {
                 if (node.upstream_ids[idx] >= 0) {
-                    tt::log_info("Node {} has upstream node: {}", node.id, node.upstream_ids[idx]);
+                    // tt::log_info("Node {} has upstream node: {}", node.id, node.upstream_ids[idx]);
                     node_id_to_kernel.at(node.id)->AddUpstreamKernel(node_id_to_kernel.at(node.upstream_ids[idx]));
                 }
             }
             for (int idx = 0; idx < DISPATCH_MAX_DOWNSTREAM; idx++) {
                 if (node.downstream_ids[idx] >= 0) {
-                    tt::log_info("Node {} has downstream node: {}", node.id, node.downstream_ids[idx]);
+                    // tt::log_info("Node {} has downstream node: {}", node.id, node.downstream_ids[idx]);
                     node_id_to_kernel.at(node.id)->AddDownstreamKernel(node_id_to_kernel.at(node.downstream_ids[idx]));
                 }
             }
@@ -118,16 +156,17 @@ std::unique_ptr<Program> create_mmio_cq_program(Device *device) {
     auto cq_program_ptr = std::make_unique<Program>();
     // for (auto &node_and_kernel : node_id_to_kernel) {
     for (int idx = 0; idx < node_id_to_kernel.size(); idx++) {
-        if (nodes->at(idx).device_id == device->id()) {
+        if (nodes.at(idx).device_id == device->id()) {
             node_id_to_kernel[idx]->AddDeviceAndProgram(device, cq_program_ptr.get());
             node_id_to_kernel[idx]->GenerateStaticConfigs();
+            // tt::log_warning("Node {} has coord: {} (phys={})", idx, node_id_to_kernel[idx]->GetLogicalCore().str(), node_id_to_kernel[idx]->GetPhysicalCore().str());
         }
     }
 
     // Third pass, populate dependent configs and create kernels for each node
     // for (auto &node_and_kernel : node_id_to_kernel) {
     for (int idx = 0; idx < node_id_to_kernel.size(); idx++) {
-        if (nodes->at(idx).device_id == device->id()) {
+        if (nodes.at(idx).device_id == device->id()) {
             node_id_to_kernel[idx]->GenerateDependentConfigs();
             node_id_to_kernel[idx]->CreateKernel();
         }

diff --git a/tt_metal/impl/dispatch/dispatch_kernels.cpp b/tt_metal/impl/dispatch/dispatch_kernels.cpp
@@ -7,6 +7,8 @@
 #include "impl/debug/dprint_server.hpp"
 
 #define UNUSED_LOGICAL_CORE tt_cxy_pair(this->device->id(), 0, 0)
+// TODO: Only here to match the previous implementation; remove later
+#define UNUSED_LOGICAL_CORE_ADJUSTED tt_cxy_pair(this->device->id() + tt::Cluster::instance().number_of_pci_devices(), 0, 0)
 #define UNUSED_SEM_ID 0
 
 static std::vector<string> dispatch_kernel_file_names = {
@@ -193,7 +195,7 @@ void PrefetchKernel::GenerateStaticConfigs() {
         uint32_t issue_queue_start_addr = command_queue_start_addr + cq_start;
         uint32_t issue_queue_size = device->sysmem_manager().get_issue_queue_size(cq_id);
 
-        this->logical_core = dispatch_core_manager::instance().prefetcher_core(device->id() + 1, channel, cq_id); // TODO
+        this->logical_core = dispatch_core_manager::instance().prefetcher_core(device->id() + tt::Cluster::instance().number_of_pci_devices(), channel, cq_id); // TODO
 
         this->config.downstream_cb_log_page_size = dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE;
         this->config.downstream_cb_pages = my_dispatch_constants.mux_buffer_pages(device->num_hw_cqs());
@@ -352,7 +354,7 @@ void DispatchKernel::GenerateStaticConfigs() {
         uint32_t completion_queue_start_addr = issue_queue_start_addr + issue_queue_size;
         uint32_t completion_queue_size = device->sysmem_manager().get_completion_queue_size(cq_id);
 
-        this->logical_core = dispatch_core_manager::instance().dispatcher_core(device->id() + 1, channel, cq_id); // TODO
+        this->logical_core = dispatch_core_manager::instance().dispatcher_core(device->id() + tt::Cluster::instance().number_of_pci_devices(), channel, cq_id); // TODO
         this->config.dispatch_cb_base = my_dispatch_constants.dispatch_buffer_base();
         this->config.dispatch_cb_log_page_size = dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE;
         this->config.dispatch_cb_pages = my_dispatch_constants.dispatch_buffer_pages();
@@ -501,8 +503,8 @@ void MuxKernel::GenerateStaticConfigs() {
 
 void DemuxKernel::GenerateStaticConfigs() {
     tt::log_warning("GenerateStaticConfigs: Demux");
-    uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device->id() + 1); // TODO
-    this->logical_core = dispatch_core_manager::instance().demux_core(this->device->id() + 1, channel, this->cq_id);
+    uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device->id() + tt::Cluster::instance().number_of_pci_devices()); // TODO: this is the downstream
+    this->logical_core = dispatch_core_manager::instance().demux_core(this->device->id() + tt::Cluster::instance().number_of_pci_devices(), channel, this->cq_id);
     this->config.vc_count = downstream_kernels.size() + 1; // TODO: update for deeper tunnels?
     this->config.endpoint_id_start_index = 0xD1;
     this->config.rx_queue_start_addr_words = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED) >> 4;
@@ -529,9 +531,10 @@ void DemuxKernel::GenerateStaticConfigs() {
 
 void EthTunnelerKernel::GenerateStaticConfigs() {
     tt::log_warning("GenerateStaticConfigs: Tunneler");
+    this->downstream_device_id = device->id() + tt::Cluster::instance().number_of_pci_devices(); // TODO: update for galaxy...
     if (this->IsRemote()) {
-        uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(downstream_device_id);
-        this->logical_core = dispatch_core_manager::instance().tunneler_core(device->id(), downstream_device_id, channel, cq_id);
+        uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(downstream_device_id.value());
+        this->logical_core = dispatch_core_manager::instance().tunneler_core(device->id(), downstream_device_id.value(), channel, cq_id);
     } else {
         uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id());
         this->logical_core = dispatch_core_manager::instance().us_tunneler_core_local(device->id(), channel, cq_id);
@@ -548,8 +551,8 @@ void EthRouterKernel::GenerateStaticConfigs() {
     auto &my_dispatch_constants = dispatch_constants::get(GetCoreType());
     if (this->as_mux) {
         tt::log_warning("GenerateStaticConfigs: Router (MUX)");
-        uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id() + 1); // TODO
-        this->logical_core = dispatch_core_manager::instance().mux_core(device->id() + 1, channel, cq_id);
+        uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id() + tt::Cluster::instance().number_of_pci_devices()); // TODO: this is the downstream
+        this->logical_core = dispatch_core_manager::instance().mux_core(device->id() + tt::Cluster::instance().number_of_pci_devices(), channel, cq_id);
         this->config.vc_count = upstream_kernels.size() + 1;
         this->config.rx_queue_start_addr_words = my_dispatch_constants.dispatch_buffer_base() >> 4;
         this->config.rx_queue_size_words = my_dispatch_constants.mux_buffer_size(device->num_hw_cqs()) >> 4;
@@ -648,15 +651,15 @@ void PrefetchKernel::GenerateDependentConfigs() {
     } else if (this->config.is_h_variant.value()) {
         // Upstream, just host so no dispatch core
         TT_ASSERT(this->upstream_kernels.size() == 0);
-        this->config.upstream_logical_core = UNUSED_LOGICAL_CORE;
+        this->config.upstream_logical_core = UNUSED_LOGICAL_CORE_ADJUSTED;
         this->config.upstream_cb_sem_id = 0;  // Used in prefetch_d only
 
         // Downstream, expect just one ROUTER
         TT_ASSERT(this->downstream_kernels.size() == 1);
         auto router_kernel = dynamic_cast<EthRouterKernel *>(this->downstream_kernels[0]);
         TT_ASSERT(router_kernel);
         this->config.downstream_logical_core = router_kernel->GetLogicalCore();
-        this->config.downstream_s_logical_core = UNUSED_LOGICAL_CORE;
+        this->config.downstream_s_logical_core = UNUSED_LOGICAL_CORE_ADJUSTED;
         uint32_t router_idx = router_kernel->GetUpstreamPort(this); // Need the port that this connects to downstream
         this->config.downstream_cb_base = my_dispatch_constants.dispatch_buffer_base() + my_dispatch_constants.mux_buffer_size(device->num_hw_cqs()) * router_idx;
         this->config.downstream_cb_sem_id = router_kernel->GetConfig().input_packetize_local_sem[router_idx];
@@ -753,8 +756,8 @@ void DispatchKernel::GenerateDependentConfigs() {
         TT_ASSERT(this->downstream_kernels.size() == 1);
         auto prefetch_h_kernel = dynamic_cast<PrefetchKernel *>(this->downstream_kernels[0]);
         TT_ASSERT(prefetch_h_kernel);
-        this->config.downstream_logical_core = UNUSED_LOGICAL_CORE;
-        this->config.downstream_s_logical_core = UNUSED_LOGICAL_CORE;
+        this->config.downstream_logical_core = UNUSED_LOGICAL_CORE_ADJUSTED;
+        this->config.downstream_s_logical_core = UNUSED_LOGICAL_CORE_ADJUSTED;
         this->config.prefetch_h_noc_xy = NOC_XY_ENCODING(prefetch_h_kernel->GetPhysicalCore().x, prefetch_h_kernel->GetPhysicalCore().y);
         this->config.prefetch_h_local_downstream_sem_addr = prefetch_h_kernel->GetConfig().my_downstream_cb_sem_id;
         this->config.downstream_cb_base = my_dispatch_constants.dispatch_buffer_base(); // Unused
@@ -895,9 +898,9 @@ void EthTunnelerKernel::GenerateDependentConfigs() {
     if (this->IsRemote()) {
         // For remote tunneler, we don't actually have the device constructed for the paired tunneler, so can't pull
         // info from it. Core coord can be computed without the device, and relevant fields match this tunneler.
-        uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(downstream_device_id);
+        uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(downstream_device_id.value());
         tt_cxy_pair paired_logical_core =
-            dispatch_core_manager::instance().us_tunneler_core_local(downstream_device_id, channel, cq_id);
+            dispatch_core_manager::instance().us_tunneler_core_local(downstream_device_id.value(), channel, cq_id);
         CoreCoord paired_physical_coord = tt::get_physical_core_coordinate(paired_logical_core, CoreType::ETH);
 
         // Upstream, we expect a US_TUNNELER_LOCAL and a PACKET_ROUTER

diff --git a/tt_metal/impl/dispatch/dispatch_kernels.hpp b/tt_metal/impl/dispatch/dispatch_kernels.hpp
@@ -380,7 +380,7 @@ class EthTunnelerKernel : public FDKernel {
    private:
     eth_tunneler_config_t config;
     uint32_t tunnel_stop;
-    chip_id_t downstream_device_id = 1; // TODO
+    std::optional<chip_id_t> downstream_device_id; // TODO
     bool is_remote;
     bool is_tunnel_start = true;
     bool is_tunnel_end = true;