From eee479885d9c22cd630038c6bafd65514bd7db0d Mon Sep 17 00:00:00 2001 From: Nigel Huang Date: Fri, 6 Dec 2024 22:55:53 +0000 Subject: [PATCH] #14427: packet queue improvements - Removed power of 2 constraint - Replaced registers with L1 buffers. Registers are still used for the handshaking process though. - Refactor packet queue into specialized variants - Removed unnecessary branching in packet queue and kernels - Removed the "if not valid then start next packet" logic in the input queue curr packet getters and made them const. This lowers the branches by 5 when looping - Move many checks to compile-time - Removed the timeout logic in kernels. It was only used for testing purposes and slows down code in prod. If it hangs then you know your test failed --- .../perf_microbenchmark/CMakeLists.txt | 5 - .../routing/kernels/traffic_gen_rx.cpp | 58 +- .../routing/kernels/traffic_gen_tx.cpp | 111 +- .../routing/test_bi_tunnel.cpp | 725 --------- .../routing/test_common.hpp | 39 +- .../routing/test_mux_demux.cpp | 72 +- .../routing/test_mux_demux_2level.cpp | 112 +- .../routing/test_tunnel_1cq.cpp | 832 ---------- .../routing/test_tunnel_2cq.cpp | 848 ---------- .../routing/test_tx_rx.cpp | 28 +- .../routing/test_uni_tunnel.cpp | 647 -------- .../routing/test_uni_tunnel_single_chip.cpp | 635 -------- .../routing/test_vc_bi_tunnel_2ep.cpp | 1123 +++++++------ .../routing/test_vc_bi_tunnel_4ep.cpp | 1414 +++++++++-------- .../routing/test_vc_loopback_tunnel.cpp | 253 ++- .../routing/test_vc_mux_demux.cpp | 104 +- .../routing/test_vc_uni_tunnel.cpp | 200 ++- .../impl/dispatch/kernels/eth_tunneler.cpp | 185 --- .../impl/dispatch/kernels/packet_demux.cpp | 180 ++- tt_metal/impl/dispatch/kernels/packet_mux.cpp | 198 ++- .../dispatch/kernels/packet_queue_ctrl.hpp | 149 +- .../dispatch/kernels/packet_queue_remotes.hpp | 244 +++ .../impl/dispatch/kernels/packet_queue_v2.hpp | 1379 ++++++++++++++++ .../impl/dispatch/kernels/vc_eth_tunneler.cpp | 294 +++- 
.../dispatch/kernels/vc_packet_router.cpp | 216 ++- 25 files changed, 4473 insertions(+), 5578 deletions(-) delete mode 100644 tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_bi_tunnel.cpp delete mode 100644 tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_1cq.cpp delete mode 100644 tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_2cq.cpp delete mode 100644 tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel.cpp delete mode 100644 tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel_single_chip.cpp delete mode 100644 tt_metal/impl/dispatch/kernels/eth_tunneler.cpp create mode 100644 tt_metal/impl/dispatch/kernels/packet_queue_remotes.hpp create mode 100644 tt_metal/impl/dispatch/kernels/packet_queue_v2.hpp diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt b/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt index 680e8f104592..3df6a286f43a 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt @@ -12,12 +12,7 @@ set(PERF_MICROBENCH_TESTS_SRCS routing/test_mux_demux.cpp routing/test_vc_mux_demux.cpp routing/test_mux_demux_2level.cpp - routing/test_tunnel_1cq.cpp - routing/test_tunnel_2cq.cpp - routing/test_uni_tunnel.cpp routing/test_vc_uni_tunnel.cpp - routing/test_uni_tunnel_single_chip.cpp - routing/test_bi_tunnel.cpp routing/test_vc_bi_tunnel_2ep.cpp routing/test_vc_bi_tunnel_4ep.cpp routing/test_vc_loopback_tunnel.cpp diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_rx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_rx.cpp index d97e81e4e977..b3fb60d10ba4 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_rx.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_rx.cpp @@ -3,11 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 #include "dataflow_api.h" 
-#include "debug/dprint.h" -#include "tt_metal/impl/dispatch/kernels/packet_queue.hpp" +#include "tt_metal/impl/dispatch/kernels/packet_queue_v2.hpp" #include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen.hpp" -packet_input_queue_state_t input_queues[MAX_SWITCH_FAN_IN]; +using namespace packet_queue; constexpr uint32_t endpoint_id = get_compile_time_arg_val(0); @@ -22,8 +21,6 @@ constexpr uint32_t input_queue_id = 0; constexpr uint32_t queue_start_addr_words = get_compile_time_arg_val(3); constexpr uint32_t queue_size_words = get_compile_time_arg_val(4); -static_assert(is_power_of_2(queue_size_words), "queue_size_words must be a power of 2"); - constexpr uint32_t remote_tx_x = get_compile_time_arg_val(5); constexpr uint32_t remote_tx_y = get_compile_time_arg_val(6); constexpr uint32_t remote_tx_queue_id = get_compile_time_arg_val(7); @@ -51,13 +48,34 @@ constexpr uint32_t timeout_cycles = get_compile_time_arg_val(17); constexpr uint32_t disable_header_check = get_compile_time_arg_val(18); +// Inputs - Update remote rptr +constexpr uint32_t traffic_gen_input_ptrs_addr = get_compile_time_arg_val(19); +constexpr uint32_t traffic_gen_input_remote_ptrs_addr = get_compile_time_arg_val(20); + +// Outputs +// None. 
This is a receiver to check data for testing purposes + // predicts size and payload of packets from each destination, should have // the same random seed as the corresponding traffic_gen_tx input_queue_rnd_state_t src_rnd_state[num_src_endpoints]; +PacketInputQueueVariant raw_input_queue; +constexpr init_params_t input_queue_init_params{ + .is_input = true, + .queue_id = input_queue_id, + .queue_start_addr_words = queue_start_addr_words, + .queue_size_words = queue_size_words, + .remote_queue_id = remote_tx_queue_id, + .remote_x = remote_tx_x, + .remote_y = remote_tx_y, + .ptrs_addr = traffic_gen_input_ptrs_addr, + .remote_ptrs_addr = traffic_gen_input_remote_ptrs_addr, +}; + +using input_queue_network_sequence = NetworkTypeSequence; +using input_queue_cb_mode_sequence = CBModeTypeSequence; void kernel_main() { - zero_l1_buf(test_results, test_results_size_bytes); test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_STARTED; test_results[PQ_TEST_MISC_INDEX] = 0xff000000; @@ -70,13 +88,13 @@ void kernel_main() { src_rnd_state[i].init(prng_seed, src_endpoint_start_id+i); } - packet_input_queue_state_t* input_queue = &(input_queues[input_queue_id]); + raw_input_queue.engage(); + // safe to use now - input_queue->init(input_queue_id, queue_start_addr_words, queue_size_words, - remote_tx_x, remote_tx_y, remote_tx_queue_id, - rx_rptr_update_network_type); + auto* input_queue = raw_input_queue.get(); + input_queue->init(&input_queue_init_params); - if (!wait_all_src_dest_ready(input_queue, 1, NULL, 0, timeout_cycles)) { + if (!wait_all_input_output_ready(&raw_input_queue, NULL, timeout_cycles)) { test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_TIMEOUT; return; } @@ -116,12 +134,12 @@ void kernel_main() { } #endif uint32_t num_words_available; - packet_available = input_queue->input_queue_full_packet_available_to_send(num_words_available); + packet_available = input_queue->full_packet_available_to_send(num_words_available); if (!packet_available) { // Mark works 
as "sent" immediately to keep pipeline from stalling. // This is OK since num_words_available comes from the call above, so // it's guaranteed to be smaller than the full next packet. - input_queue->input_queue_advance_words_sent(num_words_available); + input_queue->advance_words_sent(num_words_available); words_sent += num_words_available; } } @@ -180,13 +198,13 @@ void kernel_main() { } } - uint32_t num_words_available = input_queue->input_queue_curr_packet_num_words_available_to_send(); + uint32_t num_words_available = input_queue->get_curr_packet_num_words_available_to_send(); // we have the packet header info for checking, input queue can now switch to the next packet - input_queue->input_queue_advance_words_sent(num_words_available); + input_queue->advance_words_sent(num_words_available); words_sent += num_words_available; // move rptr_cleared to the packet payload - input_queue->input_queue_advance_words_cleared(1); + input_queue->advance_words_cleared(1); words_cleared++; // === parse packet payload === @@ -209,7 +227,7 @@ void kernel_main() { test_results[PQ_TEST_MISC_INDEX+5] = words_after_wrap; break; } - input_queue->input_queue_advance_words_cleared(words_before_wrap); + input_queue->advance_words_cleared(words_before_wrap); words_cleared += words_before_wrap; if (words_after_wrap > 0) { if (!check_packet_data(reinterpret_cast(input_queue->get_queue_rptr_cleared_addr_bytes()), @@ -222,11 +240,11 @@ void kernel_main() { test_results[PQ_TEST_MISC_INDEX+5] = words_after_wrap; break; } - input_queue->input_queue_advance_words_cleared(words_after_wrap); + input_queue->advance_words_cleared(words_after_wrap); words_cleared += words_after_wrap; } } else { - input_queue->input_queue_advance_words_cleared(curr_packet_payload_words); + input_queue->advance_words_cleared(curr_packet_payload_words); words_cleared += curr_packet_payload_words; } progress_timestamp = get_timestamp_32b(); @@ -257,13 +275,11 @@ void kernel_main() { test_results[PQ_TEST_STATUS_INDEX] = 
PACKET_QUEUE_TEST_TIMEOUT; set_64b_result(test_results, words_sent, PQ_TEST_MISC_INDEX+12); set_64b_result(test_results, words_cleared, PQ_TEST_MISC_INDEX+14); - input_queue->dprint_object(); } else if (check_failed) { test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_DATA_MISMATCH; test_results[PQ_TEST_MISC_INDEX+12] = mismatch_addr; test_results[PQ_TEST_MISC_INDEX+12] = mismatch_val; test_results[PQ_TEST_MISC_INDEX+12] = expected_val; - input_queue->dprint_object(); } else { test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_PASS; test_results[PQ_TEST_MISC_INDEX] = 0xff000005; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_tx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_tx.cpp index 902c012a36d1..0bb4a0c302df 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_tx.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_tx.cpp @@ -1,3 +1,4 @@ + // SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
// // SPDX-License-Identifier: Apache-2.0 @@ -5,10 +6,12 @@ // clang-format off #include "dataflow_api.h" #include "debug/dprint.h" -#include "tt_metal/impl/dispatch/kernels/packet_queue.hpp" +#include "tt_metal/impl/dispatch/kernels/packet_queue_v2.hpp" #include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen.hpp" // clang-format on +using namespace packet_queue; + constexpr uint32_t src_endpoint_id = get_compile_time_arg_val(0); constexpr uint32_t num_dest_endpoints = get_compile_time_arg_val(1); @@ -18,13 +21,9 @@ constexpr uint32_t queue_start_addr_words = get_compile_time_arg_val(2); constexpr uint32_t queue_size_words = get_compile_time_arg_val(3); constexpr uint32_t queue_size_bytes = queue_size_words * PACKET_WORD_SIZE_BYTES; -static_assert(is_power_of_2(queue_size_words), "queue_size_words must be a power of 2"); - constexpr uint32_t remote_rx_queue_start_addr_words = get_compile_time_arg_val(4); constexpr uint32_t remote_rx_queue_size_words = get_compile_time_arg_val(5); -static_assert(is_power_of_2(remote_rx_queue_size_words), "remote_rx_queue_size_words must be a power of 2"); - constexpr uint32_t remote_rx_x = get_compile_time_arg_val(6); constexpr uint32_t remote_rx_y = get_compile_time_arg_val(7); constexpr uint32_t remote_rx_queue_id = get_compile_time_arg_val(8); @@ -62,30 +61,68 @@ constexpr uint32_t data_sent_per_iter_high = get_compile_time_arg_val(21); constexpr uint32_t input_queue_id = 0; constexpr uint32_t output_queue_id = 1; -packet_input_queue_state_t input_queue; -packet_output_queue_state_t output_queue; +// Inputs +constexpr uint32_t traffic_gen_input_ptrs_addr = get_compile_time_arg_val(22); +constexpr uint32_t traffic_gen_input_mock_remote_ptrs_addr = get_compile_time_arg_val(23); -constexpr packet_input_queue_state_t* input_queue_ptr = &input_queue; -constexpr packet_output_queue_state_t* output_queue_ptr = &output_queue; +// Outputs - Update remote wptr +constexpr uint32_t traffic_gen_output_ptrs_addr = 
get_compile_time_arg_val(24); +constexpr uint32_t traffic_gen_output_remote_ptrs_addr = get_compile_time_arg_val(25); // input_queue_rnd_state_t input_queue_state; auto input_queue_state = select_input_queue(); +PacketInputQueueVariant raw_input_queue; +constexpr init_params_t input_queue_init_params{ + .is_input = true, + .queue_id = input_queue_id, + .queue_start_addr_words = queue_start_addr_words, + .queue_size_words = queue_size_words, + .remote_queue_id = 0, + .remote_x = 0, + .remote_y = 0, + .ptrs_addr = traffic_gen_input_ptrs_addr, + .remote_ptrs_addr = traffic_gen_input_mock_remote_ptrs_addr, +}; +using input_queue_network_sequence = NetworkTypeSequence; +using input_queue_cb_mode_sequence = CBModeTypeSequence; + + +PacketOutputQueueVariant raw_output_queue; +constexpr init_params_t output_queue_init_params{ + .queue_id = output_queue_id, + .queue_start_addr_words = remote_rx_queue_start_addr_words, + .queue_size_words = remote_rx_queue_size_words, + .remote_queue_id = remote_rx_queue_id, + .remote_x = remote_rx_x, + .remote_y = remote_rx_y, + .ptrs_addr = traffic_gen_output_ptrs_addr, + .remote_ptrs_addr = traffic_gen_output_remote_ptrs_addr, + + .input_queues = &raw_input_queue, + .num_input_queues = 1, +}; +using output_queue_network_sequence = NetworkTypeSequence; +using output_queue_cb_mode_sequence = CBModeTypeSequence; + + // generates packets with random size and payload on the input side inline bool input_queue_handler() { if (input_queue_state.all_packets_done()) { return true; } - uint32_t free_words = input_queue_ptr->get_queue_data_num_words_free(); + auto* input_queue = raw_input_queue.get(); + + uint32_t free_words = input_queue->get_queue_data_num_words_free(); if (free_words == 0) { return false; } // Each call to input_queue_handler initializes only up to the end // of the queue buffer, so we don't need to handle wrapping. 
- uint32_t byte_wr_addr = input_queue_ptr->get_queue_wptr_addr_bytes(); - uint32_t words_to_init = std::min(free_words, input_queue_ptr->get_queue_words_before_wptr_wrap()); + uint32_t byte_wr_addr = input_queue->get_queue_wptr_addr_bytes(); + uint32_t words_to_init = std::min(free_words, input_queue->get_queue_words_before_wptr_wrap()); uint32_t words_initialized = 0; while (words_initialized < words_to_init) { @@ -122,7 +159,8 @@ inline bool input_queue_handler() { byte_wr_addr += num_words * PACKET_WORD_SIZE_BYTES; } } - input_queue_ptr->advance_queue_local_wptr(words_initialized); + + input_queue->advance_queue_local_wptr(words_initialized); return false; } @@ -143,28 +181,17 @@ void kernel_main() { input_queue_state.init(src_endpoint_id, prng_seed); } - input_queue_ptr->init( - input_queue_id, - queue_start_addr_words, - queue_size_words, - // remote_x, remote_y, remote_queue_id, remote_update_network_type: - 0, - 0, - 0, - DispatchRemoteNetworkType::NONE); - - output_queue_ptr->init( - output_queue_id, - remote_rx_queue_start_addr_words, - remote_rx_queue_size_words, - remote_rx_x, - remote_rx_y, - remote_rx_queue_id, - tx_network_type, - input_queue_ptr, - 1); - - if (!wait_all_src_dest_ready(NULL, 0, output_queue_ptr, 1, timeout_cycles)) { + raw_input_queue.engage(); + raw_output_queue.engage(); + + // safe to use now + auto* input_queue = raw_input_queue.get(); + auto* new_output_queue = raw_output_queue.get(); + + input_queue->init(&input_queue_init_params); + new_output_queue->init(&output_queue_init_params); + + if (!wait_all_input_output_ready(NULL, &raw_output_queue, timeout_cycles)) { test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_TIMEOUT; return; } @@ -193,10 +220,9 @@ void kernel_main() { } #endif bool all_packets_initialized = input_queue_handler(); - if (input_queue_ptr->get_curr_packet_valid()) { + if (input_queue->get_curr_packet_valid()) { bool full_packet_sent; - uint32_t curr_data_words_sent = 
output_queue_ptr->forward_data_from_input( - input_queue_id, full_packet_sent, input_queue.get_end_of_cmd()); + uint32_t curr_data_words_sent = new_output_queue->forward_data_from_input(full_packet_sent, input_queue->get_end_of_cmd()); data_words_sent += curr_data_words_sent; if constexpr (!(data_sent_per_iter_low == 0 && data_sent_per_iter_high == 0)) { zero_data_sent_iter += static_cast(curr_data_words_sent <= 0); @@ -209,12 +235,12 @@ void kernel_main() { } else if (all_packets_initialized) { break; } - words_flushed += output_queue_ptr->prev_words_in_flight_check_flush(); + words_flushed += new_output_queue->prev_words_in_flight_check_flush(); } if (!timeout) { test_results[PQ_TEST_MISC_INDEX] = 0xff00002; - if (!output_queue_ptr->output_barrier(timeout_cycles)) { + if (!new_output_queue->output_barrier(timeout_cycles)) { timeout = true; } } @@ -224,7 +250,7 @@ void kernel_main() { if (!timeout) { test_results[PQ_TEST_MISC_INDEX] = 0xff00003; progress_timestamp = get_timestamp_32b(); - while (!output_queue_ptr->is_remote_finished()) { + while (!new_output_queue->is_remote_finished()) { if (timeout_cycles > 0) { uint32_t cycles_since_progress = get_timestamp_32b() - progress_timestamp; if (cycles_since_progress > timeout_cycles) { @@ -251,8 +277,5 @@ void kernel_main() { } else { test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_TIMEOUT; set_64b_result(test_results, words_flushed, TX_TEST_IDX_WORDS_FLUSHED); - // these calls lead to code size issues? - // input_queue_ptr->dprint_object(); - // output_queue_ptr->dprint_object(); } } diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_bi_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_bi_tunnel.cpp deleted file mode 100644 index 5c798d609e14..000000000000 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_bi_tunnel.cpp +++ /dev/null @@ -1,725 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include "tt_metal/host_api.hpp" -#include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/llrt/rtoptions.hpp" -#include "tt_metal/impl/dispatch/cq_commands.hpp" -#include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" -#include "kernels/traffic_gen_test.hpp" -#include "tt_metal/impl/device/device.hpp" - -using std::vector; -using namespace tt; - -int main(int argc, char** argv) { - constexpr uint32_t default_tx_x = 0; - constexpr uint32_t default_tx_y = 0; - constexpr uint32_t default_rx_x = 0; - constexpr uint32_t default_rx_y = 3; - - constexpr uint32_t default_mux_x = 0; - constexpr uint32_t default_mux_y = 1; - constexpr uint32_t default_demux_x = 0; - constexpr uint32_t default_demux_y = 2; - - constexpr uint32_t default_tunneler_x = 0; - constexpr uint32_t default_tunneler_y = 0; - - constexpr uint32_t default_prng_seed = 0x100; - constexpr uint32_t default_data_kb_per_tx = 16 * 1024; - constexpr uint32_t default_max_packet_size_words = 0x100; - - constexpr uint32_t default_tx_queue_start_addr = 0x80000; - constexpr uint32_t default_tx_queue_size_bytes = 0x10000; - constexpr uint32_t default_rx_queue_start_addr = 0xa0000; - constexpr uint32_t default_rx_queue_size_bytes = 0x20000; - constexpr uint32_t default_mux_queue_start_addr = 0x80000; - constexpr uint32_t default_mux_queue_size_bytes = 0x10000; - constexpr uint32_t default_demux_queue_start_addr = 0x90000; - constexpr uint32_t default_demux_queue_size_bytes = 0x20000; - - constexpr uint32_t default_tunneler_queue_start_addr = 0x19000; - constexpr uint32_t default_tunneler_queue_size_bytes = 0x10000; - - constexpr uint32_t default_test_results_addr = 0x100000; - constexpr uint32_t default_test_results_size = 0x40000; - - constexpr uint32_t default_tunneler_test_results_addr = 0x39000; - constexpr uint32_t default_tunneler_test_results_size = 0x7000; - - constexpr uint32_t default_timeout_mcycles = 1000; - constexpr uint32_t 
default_rx_disable_data_check = 0; - - constexpr uint32_t src_endpoint_start_id = 0xaa; - constexpr uint32_t dest_endpoint_start_id = 0xbb; - - constexpr uint32_t num_src_endpoints = 4; - constexpr uint32_t num_dest_endpoints = 4; - - constexpr uint32_t default_test_device_id = 0; - - std::vector input_args(argv, argv + argc); - if (test_args::has_command_option(input_args, "-h") || test_args::has_command_option(input_args, "--help")) { - log_info(LogTest, "Usage:"); - log_info(LogTest, " --prng_seed: PRNG seed, default = 0x{:x}", default_prng_seed); - log_info(LogTest, " --data_kb_per_tx: Total data in KB per TX endpoint, default = {}", default_data_kb_per_tx); - log_info( - LogTest, - " --max_packet_size_words: Max packet size in words, default = 0x{:x}", - default_max_packet_size_words); - log_info(LogTest, " --tx_x: X coordinate of the starting TX core, default = {}", default_tx_x); - log_info(LogTest, " --tx_y: Y coordinate of the starting TX core, default = {}", default_tx_y); - log_info(LogTest, " --rx_x: X coordinate of the starting RX core, default = {}", default_rx_x); - log_info(LogTest, " --rx_y: Y coordinate of the starting RX core, default = {}", default_rx_y); - log_info(LogTest, " --mux_x: X coordinate of the starting mux core, default = {}", default_mux_x); - log_info(LogTest, " --mux_y: Y coordinate of the starting mux core, default = {}", default_mux_y); - log_info(LogTest, " --demux_x: X coordinate of the starting demux core, default = {}", default_demux_x); - log_info(LogTest, " --demux_y: Y coordinate of the starting demux core, default = {}", default_demux_y); - log_info( - LogTest, " --tx_queue_start_addr: TX queue start address, default = 0x{:x}", default_tx_queue_start_addr); - log_info( - LogTest, " --tx_queue_size_bytes: TX queue size in bytes, default = 0x{:x}", default_tx_queue_size_bytes); - log_info( - LogTest, " --rx_queue_start_addr: RX queue start address, default = 0x{:x}", default_rx_queue_start_addr); - log_info( - LogTest, " 
--rx_queue_size_bytes: RX queue size in bytes, default = 0x{:x}", default_rx_queue_size_bytes); - log_info( - LogTest, - " --mux_queue_start_addr: MUX queue start address, default = 0x{:x}", - default_mux_queue_start_addr); - log_info( - LogTest, - " --mux_queue_size_bytes: MUX queue size in bytes, default = 0x{:x}", - default_mux_queue_size_bytes); - log_info( - LogTest, - " --demux_queue_start_addr: DEMUX queue start address, default = 0x{:x}", - default_demux_queue_start_addr); - log_info( - LogTest, - " --demux_queue_size_bytes: DEMUX queue size in bytes, default = 0x{:x}", - default_demux_queue_size_bytes); - log_info( - LogTest, " --test_results_addr: test results buf address, default = 0x{:x}", default_test_results_addr); - log_info(LogTest, " --test_results_size: test results buf size, default = 0x{:x}", default_test_results_size); - log_info(LogTest, " --timeout_mcycles: Timeout in MCycles, default = {}", default_timeout_mcycles); - log_info( - LogTest, - " --rx_disable_data_check: Disable data check on RX, default = {}", - default_rx_disable_data_check); - log_info(LogTest, " --device_id: Device on which the test will be run, default = {}", default_test_device_id); - return 0; - } - - uint32_t tx_x = test_args::get_command_option_uint32(input_args, "--tx_x", default_tx_x); - uint32_t tx_y = test_args::get_command_option_uint32(input_args, "--tx_y", default_tx_y); - uint32_t rx_x = test_args::get_command_option_uint32(input_args, "--rx_x", default_rx_x); - uint32_t rx_y = test_args::get_command_option_uint32(input_args, "--rx_y", default_rx_y); - uint32_t mux_x = test_args::get_command_option_uint32(input_args, "--mux_x", default_mux_x); - uint32_t mux_y = test_args::get_command_option_uint32(input_args, "--mux_y", default_mux_y); - uint32_t demux_x = test_args::get_command_option_uint32(input_args, "--demux_x", default_demux_x); - uint32_t demux_y = test_args::get_command_option_uint32(input_args, "--demux_y", default_demux_y); - uint32_t tunneler_x = 
test_args::get_command_option_uint32(input_args, "--tunneler_x", default_tunneler_x); - uint32_t tunneler_y = test_args::get_command_option_uint32(input_args, "--tunneler_y", default_tunneler_y); - uint32_t prng_seed = test_args::get_command_option_uint32(input_args, "--prng_seed", default_prng_seed); - uint32_t data_kb_per_tx = - test_args::get_command_option_uint32(input_args, "--data_kb_per_tx", default_data_kb_per_tx); - uint32_t max_packet_size_words = - test_args::get_command_option_uint32(input_args, "--max_packet_size_words", default_max_packet_size_words); - uint32_t tx_queue_start_addr = - test_args::get_command_option_uint32(input_args, "--tx_queue_start_addr", default_tx_queue_start_addr); - uint32_t tx_queue_size_bytes = - test_args::get_command_option_uint32(input_args, "--tx_queue_size_bytes", default_tx_queue_size_bytes); - uint32_t rx_queue_start_addr = - test_args::get_command_option_uint32(input_args, "--rx_queue_start_addr", default_rx_queue_start_addr); - uint32_t rx_queue_size_bytes = - test_args::get_command_option_uint32(input_args, "--rx_queue_size_bytes", default_rx_queue_size_bytes); - uint32_t mux_queue_start_addr = - test_args::get_command_option_uint32(input_args, "--mux_queue_start_addr", default_mux_queue_start_addr); - uint32_t mux_queue_size_bytes = - test_args::get_command_option_uint32(input_args, "--mux_queue_size_bytes", default_mux_queue_size_bytes); - uint32_t demux_queue_start_addr = - test_args::get_command_option_uint32(input_args, "--demux_queue_start_addr", default_demux_queue_start_addr); - uint32_t demux_queue_size_bytes = - test_args::get_command_option_uint32(input_args, "--demux_queue_size_bytes", default_demux_queue_size_bytes); - uint32_t tunneler_queue_start_addr = test_args::get_command_option_uint32( - input_args, "--tunneler_queue_start_addr", default_tunneler_queue_start_addr); - uint32_t tunneler_queue_size_bytes = test_args::get_command_option_uint32( - input_args, "--tunneler_queue_size_bytes", 
default_tunneler_queue_size_bytes); - uint32_t test_results_addr = - test_args::get_command_option_uint32(input_args, "--test_results_addr", default_test_results_addr); - uint32_t test_results_size = - test_args::get_command_option_uint32(input_args, "--test_results_size", default_test_results_size); - uint32_t tunneler_test_results_addr = test_args::get_command_option_uint32( - input_args, "--tunneler_test_results_addr", default_tunneler_test_results_addr); - uint32_t tunneler_test_results_size = test_args::get_command_option_uint32( - input_args, "--tunneler_test_results_size", default_tunneler_test_results_size); - uint32_t timeout_mcycles = - test_args::get_command_option_uint32(input_args, "--timeout_mcycles", default_timeout_mcycles); - uint32_t rx_disable_data_check = - test_args::get_command_option_uint32(input_args, "--rx_disable_data_check", default_rx_disable_data_check); - uint32_t test_device_id = test_args::get_command_option_uint32(input_args, "--device_id", default_test_device_id); - - bool pass = true; - - std::map defines = { - {"FD_CORE_TYPE", std::to_string(0)}, // todo, support dispatch on eth - }; - - try { - int num_devices = tt_metal::GetNumAvailableDevices(); - if (test_device_id >= num_devices) { - log_info(LogTest, "Device {} is not valid. Highest valid device id = {}.", test_device_id, num_devices - 1); - throw std::runtime_error("Invalid Device Id."); - } - int device_id_l = test_device_id; - - tt_metal::Device* device = tt_metal::CreateDevice(device_id_l); - auto const& device_active_eth_cores = device->get_active_ethernet_cores(); - - if (device_active_eth_cores.size() == 0) { - log_info( - LogTest, - "Device {} does not have enough active cores. 
Need 1 active ethernet core for this test.", - device_id_l); - tt_metal::CloseDevice(device); - throw std::runtime_error("Test cannot run on specified device."); - } - - auto eth_core_iter = device_active_eth_cores.begin(); - auto [device_id_r, eth_receiver_core] = device->get_connected_ethernet_core(*eth_core_iter); - - tt_metal::Device* device_r = tt_metal::CreateDevice(device_id_r); - - CoreCoord tunneler_logical_core = device->get_ethernet_sockets(device_id_r)[0]; - CoreCoord tunneler_phys_core = device->ethernet_core_from_logical_core(tunneler_logical_core); - - CoreCoord r_tunneler_logical_core = device_r->get_ethernet_sockets(device_id_l)[0]; - CoreCoord r_tunneler_phys_core = device_r->ethernet_core_from_logical_core(r_tunneler_logical_core); - - log_info(LogTest, "Tx/Rx Device {}. Tunneling Ethernet core = {}.", device_id_l, tunneler_logical_core.str()); - - log_info( - LogTest, "Loopback Device {}. Tunneling Ethernet core = {}.", device_id_r, r_tunneler_logical_core.str()); - - // std::cout<<"Left Tunneler = "< mux_compile_args = { - 0, // 0: reserved - (mux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 2: rx_queue_size_words - num_src_endpoints, // 3: mux_fan_in - packet_switch_4B_pack( - (uint32_t)tx_phys_core[0].x, - (uint32_t)tx_phys_core[0].y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: src 0 info - packet_switch_4B_pack( - (uint32_t)tx_phys_core[1].x, - (uint32_t)tx_phys_core[1].y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: src 1 info - packet_switch_4B_pack( - (uint32_t)tx_phys_core[2].x, - (uint32_t)tx_phys_core[2].y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 6: src 2 info - packet_switch_4B_pack( - (uint32_t)tx_phys_core[3].x, - (uint32_t)tx_phys_core[3].y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 7: src 3 info - (tunneler_queue_start_addr >> 4), // 8: remote_tx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words 
- (uint32_t)tunneler_phys_core.x, // 10: remote_tx_x - (uint32_t)tunneler_phys_core.y, // 11: remote_tx_y - 0, // 12: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 13: tx_network_type - test_results_addr, // 14: test_results_addr - test_results_size, // 15: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 16: timeout_cycles - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0 // 17-24: packetize/depacketize settings - }; - - log_info(LogTest, "run mux at x={},y={}", mux_core.x, mux_core.y); - auto mux_kernel = tt_metal::CreateKernel( - program, - "tt_metal/impl/dispatch/kernels/packet_mux.cpp", - {mux_core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = mux_compile_args, - .defines = defines}); - - std::vector tunneler_l_compile_args = { - dest_endpoint_start_id, // 0: endpoint_id_start_index - 2, // 1: tunnel_lanes. 1 => Unidirectional. 2 => Bidirectional. - (tunneler_queue_start_addr >> 4), // 2: rx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 3: rx_queue_size_words - packet_switch_4B_pack( - r_tunneler_phys_core.x, - r_tunneler_phys_core.y, - 0, - (uint32_t)DispatchRemoteNetworkType::ETH), // 4: remote_receiver_0_info - packet_switch_4B_pack( - demux_phys_core.x, - demux_phys_core.y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: remote_receiver_1_info - (tunneler_queue_start_addr >> 4), // 6: remote_receiver_queue_start_addr_words 0 - (tunneler_queue_size_bytes >> 4), // 7: remote_receiver_queue_size_words 0 - (demux_queue_start_addr >> 4), // 8: remote_receiver_queue_start_addr_words 1 - (demux_queue_size_bytes >> 4), // 9: remote_receiver_queue_size_words 1 - packet_switch_4B_pack( - mux_phys_core.x, - mux_phys_core.y, - num_dest_endpoints, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 10: remote_sender_0_info - packet_switch_4B_pack( - r_tunneler_phys_core.x, - r_tunneler_phys_core.y, - 3, - 
(uint32_t)DispatchRemoteNetworkType::ETH), // 11: remote_sender_1_info - tunneler_test_results_addr, // 12: test_results_addr - tunneler_test_results_size, // 13: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 14: timeout_cycles - 0}; - - auto tunneler_l_kernel = tt_metal::CreateKernel( - program, - "tt_metal/impl/dispatch/kernels/eth_tunneler.cpp", - tunneler_logical_core, - tt_metal::EthernetConfig{ - .noc = tt_metal::NOC::NOC_0, .compile_args = tunneler_l_compile_args, .defines = defines}); - - std::vector tunneler_r_compile_args = { - dest_endpoint_start_id, // 0: endpoint_id_start_index - 2, // 1: tunnel_lanes. 1 => Unidirectional. 2 => Bidirectional. - (tunneler_queue_start_addr >> 4), // 2: rx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 3: rx_queue_size_words - packet_switch_4B_pack( - loopback_mux_phys_core.x, - loopback_mux_phys_core.y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: remote_receiver_0_info - packet_switch_4B_pack( - tunneler_phys_core.x, - tunneler_phys_core.y, - 1, - (uint32_t)DispatchRemoteNetworkType::ETH), // 5: remote_receiver_1_info - (mux_queue_start_addr >> 4), // 6: remote_receiver_queue_start_addr_words 0 - (mux_queue_size_bytes >> 4), // 7: remote_receiver_queue_size_words 0 - ((tunneler_queue_start_addr + tunneler_queue_size_bytes) >> - 4), // 8: remote_receiver_queue_start_addr_words 0 - (tunneler_queue_size_bytes >> 4), // 9: remote_receiver_queue_size_words 0 - - packet_switch_4B_pack( - tunneler_phys_core.x, - tunneler_phys_core.y, - 2, - (uint32_t)DispatchRemoteNetworkType::ETH), // 10: remote_sender_0_info - packet_switch_4B_pack( - loopback_mux_phys_core.x, - loopback_mux_phys_core.y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 11: remote_sender_1_info - tunneler_test_results_addr, // 12: test_results_addr - tunneler_test_results_size, // 13: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 14: timeout_cycles - 0}; - - auto tunneler_r_kernel = 
tt_metal::CreateKernel( - program_r, - "tt_metal/impl/dispatch/kernels/eth_tunneler.cpp", - r_tunneler_logical_core, - tt_metal::EthernetConfig{ - .noc = tt_metal::NOC::NOC_0, .compile_args = tunneler_r_compile_args, .defines = defines}); - - // Loopback 1-1 Mux On R Chip - std::vector loopback_mux_compile_args = { - 0, // 0: reserved - (mux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 2: rx_queue_size_words - 1, // 3: mux_fan_in - packet_switch_4B_pack( - (uint32_t)r_tunneler_phys_core.x, - (uint32_t)r_tunneler_phys_core.y, - 2, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: src 0 info - packet_switch_4B_pack( - 0, - 0, - 0, - 0), // 5: src 1 info - packet_switch_4B_pack( - 0, - 0, - 0, - 0), // 6: src 2 info - packet_switch_4B_pack( - 0, - 0, - 0, - 0), // 7: src 3 info - ((tunneler_queue_start_addr + tunneler_queue_size_bytes) >> 4), // 8: remote_tx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words - (uint32_t)r_tunneler_phys_core.x, // 10: remote_tx_x - (uint32_t)r_tunneler_phys_core.y, // 11: remote_tx_y - 1, // 12: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 13: tx_network_type - test_results_addr, // 14: test_results_addr - test_results_size, // 15: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 16: timeout_cycles - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0 // 17-24: packetize/depacketize settings - }; - - log_info(LogTest, "run loopback mux at x={},y={}", loopback_mux_core.x, loopback_mux_core.y); - auto loopback_mux_kernel = tt_metal::CreateKernel( - program_r, - "tt_metal/impl/dispatch/kernels/packet_mux.cpp", - {loopback_mux_core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = loopback_mux_compile_args, - .defines = defines}); - - std::vector rx_phys_core; - for (uint32_t i = 0; i < num_dest_endpoints; i++) { - CoreCoord core = 
{rx_x+i, rx_y}; - rx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = - { - dest_endpoint_start_id + i, // 0: dest_endpoint_id - num_src_endpoints, // 1: num_src_endpoints - num_dest_endpoints, // 2: num_dest_endpoints - (rx_queue_start_addr >> 4), // 3: queue_start_addr_words - (rx_queue_size_bytes >> 4), // 4: queue_size_words - (uint32_t)demux_phys_core.x, // 5: remote_tx_x - (uint32_t)demux_phys_core.y, // 6: remote_tx_y - i + 1, // 7: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type - test_results_addr, // 9: test_results_addr - test_results_size, // 10: test_results_size - prng_seed, // 11: prng_seed - 0, // 12: reserved - max_packet_size_words, // 13: max_packet_size_words - rx_disable_data_check, // 14: disable data check - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; - - log_info(LogTest, "run traffic_gen_rx at x={},y={}", core.x, core.y); - auto kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_rx.cpp", - {core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = compile_args, - .defines = defines}); - } - - // Demux - uint32_t dest_map_array[4] = {0, 1, 2, 3}; - uint64_t dest_endpoint_output_map = packet_switch_dest_pack(dest_map_array, 4); - std::vector demux_compile_args = { - dest_endpoint_start_id, // 0: endpoint_id_start_index - (demux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words - (demux_queue_size_bytes >> 4), // 2: rx_queue_size_words - num_dest_endpoints, // 3: demux_fan_out - packet_switch_4B_pack( - rx_phys_core[0].x, - rx_phys_core[0].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: remote_tx_0_info - packet_switch_4B_pack( - 
rx_phys_core[1].x, - rx_phys_core[1].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: remote_tx_1_info - packet_switch_4B_pack( - rx_phys_core[2].x, - rx_phys_core[2].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 6: remote_tx_2_info - packet_switch_4B_pack( - rx_phys_core[3].x, - rx_phys_core[3].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 7: remote_tx_3_info - (rx_queue_start_addr >> 4), // 8: remote_tx_queue_start_addr_words 0 - (rx_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words 0 - (rx_queue_start_addr >> 4), // 10: remote_tx_queue_start_addr_words 1 - (rx_queue_size_bytes >> 4), // 11: remote_tx_queue_size_words 1 - (rx_queue_start_addr >> 4), // 12: remote_tx_queue_start_addr_words 2 - (rx_queue_size_bytes >> 4), // 13: remote_tx_queue_size_words 2 - (rx_queue_start_addr >> 4), // 14: remote_tx_queue_start_addr_words 3 - (rx_queue_size_bytes >> 4), // 15: remote_tx_queue_size_words 3 - (uint32_t)tunneler_phys_core.x, // 16: remote_rx_x - (uint32_t)tunneler_phys_core.y, // 17: remote_rx_y - 3, // 18: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 19: tx_network_type - (uint32_t)(dest_endpoint_output_map >> 32), // 20: dest_endpoint_output_map_hi - (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF), // 21: dest_endpoint_output_map_lo - test_results_addr, // 22: test_results_addr - test_results_size, // 23: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 24: timeout_cycles - 0, - 0, - 0, - 0, - 0 // 25-29: packetize/depacketize settings - }; - - log_info(LogTest, "run demux at x={},y={}", demux_core.x, demux_core.y); - auto demux_kernel = tt_metal::CreateKernel( - program, - "tt_metal/impl/dispatch/kernels/packet_demux.cpp", - {demux_core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = demux_compile_args, - .defines = defines}); - - log_info(LogTest, "Starting test..."); - - auto start = 
std::chrono::system_clock::now(); - tt_metal::detail::LaunchProgram(device, program, false); - tt_metal::detail::LaunchProgram(device_r, program_r, false); - tt_metal::detail::WaitProgramDone(device, program); - tt_metal::detail::WaitProgramDone(device_r, program_r); - auto end = std::chrono::system_clock::now(); - - std::chrono::duration elapsed_seconds = (end - start); - log_info(LogTest, "Ran in {:.2f}us", elapsed_seconds.count() * 1000 * 1000); - - vector> tx_results; - vector> rx_results; - - for (uint32_t i = 0; i < num_src_endpoints; i++) { - tx_results.push_back( - tt::llrt::read_hex_vec_from_core(device->id(), tx_phys_core[i], test_results_addr, test_results_size)); - log_info( - LogTest, - "TX{} status = {}", - i, - packet_queue_test_status_to_string(tx_results[i][PQ_TEST_STATUS_INDEX])); - pass &= (tx_results[i][PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - } - - for (uint32_t i = 0; i < num_dest_endpoints; i++) { - rx_results.push_back( - tt::llrt::read_hex_vec_from_core(device->id(), rx_phys_core[i], test_results_addr, test_results_size)); - log_info( - LogTest, - "RX{} status = {}", - i, - packet_queue_test_status_to_string(rx_results[i][PQ_TEST_STATUS_INDEX])); - pass &= (rx_results[i][PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - } - - vector mux_results = - tt::llrt::read_hex_vec_from_core(device->id(), mux_phys_core, test_results_addr, test_results_size); - log_info(LogTest, "MUX status = {}", packet_queue_test_status_to_string(mux_results[PQ_TEST_STATUS_INDEX])); - pass &= (mux_results[PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - - vector demux_results = - tt::llrt::read_hex_vec_from_core(device->id(), demux_phys_core, test_results_addr, test_results_size); - log_info(LogTest, "DEMUX status = {}", packet_queue_test_status_to_string(demux_results[PQ_TEST_STATUS_INDEX])); - pass &= (demux_results[0] == PACKET_QUEUE_TEST_PASS); - - pass &= tt_metal::CloseDevice(device); - pass &= tt_metal::CloseDevice(device_r); - - if (pass) { - 
double total_tx_bw = 0.0; - uint64_t total_tx_words_sent = 0; - uint64_t total_rx_words_checked = 0; - for (uint32_t i = 0; i < num_src_endpoints; i++) { - uint64_t tx_words_sent = get_64b_result(tx_results[i], PQ_TEST_WORD_CNT_INDEX); - total_tx_words_sent += tx_words_sent; - uint64_t tx_elapsed_cycles = get_64b_result(tx_results[i], PQ_TEST_CYCLES_INDEX); - double tx_bw = ((double)tx_words_sent) * PACKET_WORD_SIZE_BYTES / tx_elapsed_cycles; - log_info( - LogTest, - "TX {} words sent = {}, elapsed cycles = {} -> BW = {:.2f} B/cycle", - i, - tx_words_sent, - tx_elapsed_cycles, - tx_bw); - total_tx_bw += tx_bw; - } - log_info(LogTest, "Total TX BW = {:.2f} B/cycle", total_tx_bw); - double total_rx_bw = 0.0; - for (uint32_t i = 0; i < num_dest_endpoints; i++) { - uint64_t rx_words_checked = get_64b_result(rx_results[i], PQ_TEST_WORD_CNT_INDEX); - total_rx_words_checked += rx_words_checked; - uint64_t rx_elapsed_cycles = get_64b_result(rx_results[i], PQ_TEST_CYCLES_INDEX); - double rx_bw = ((double)rx_words_checked) * PACKET_WORD_SIZE_BYTES / rx_elapsed_cycles; - log_info( - LogTest, - "RX {} words checked = {}, elapsed cycles = {} -> BW = {:.2f} B/cycle", - i, - rx_words_checked, - rx_elapsed_cycles, - rx_bw); - total_rx_bw += rx_bw; - } - log_info(LogTest, "Total RX BW = {:.2f} B/cycle", total_rx_bw); - if (total_tx_words_sent != total_rx_words_checked) { - log_error( - LogTest, - "Total TX words sent = {} != Total RX words checked = {}", - total_tx_words_sent, - total_rx_words_checked); - pass = false; - } else { - log_info( - LogTest, - "Total TX words sent = {} == Total RX words checked = {} -> OK", - total_tx_words_sent, - total_rx_words_checked); - } - uint64_t mux_words_sent = get_64b_result(mux_results, PQ_TEST_WORD_CNT_INDEX); - uint64_t mux_elapsed_cycles = get_64b_result(mux_results, PQ_TEST_CYCLES_INDEX); - uint64_t mux_iter = get_64b_result(mux_results, PQ_TEST_ITER_INDEX); - double mux_bw = ((double)mux_words_sent) * PACKET_WORD_SIZE_BYTES / 
mux_elapsed_cycles; - double mux_cycles_per_iter = ((double)mux_elapsed_cycles) / mux_iter; - log_info( - LogTest, - "MUX words sent = {}, elapsed cycles = {} -> BW = {:.2f} B/cycle", - mux_words_sent, - mux_elapsed_cycles, - mux_bw); - log_info(LogTest, "MUX iters = {} -> cycles/iter = {:.1f}", mux_iter, mux_cycles_per_iter); - if (mux_words_sent != total_rx_words_checked) { - log_error( - LogTest, - "MUX words sent = {} != Total RX words checked = {}", - mux_words_sent, - total_rx_words_checked); - pass = false; - } else { - log_info( - LogTest, - "MUX words sent = {} == Total RX words checked = {} -> OK", - mux_words_sent, - total_rx_words_checked); - } - - uint64_t demux_words_sent = get_64b_result(demux_results, PQ_TEST_WORD_CNT_INDEX); - uint64_t demux_elapsed_cycles = get_64b_result(demux_results, PQ_TEST_CYCLES_INDEX); - double demux_bw = ((double)demux_words_sent) * PACKET_WORD_SIZE_BYTES / demux_elapsed_cycles; - uint64_t demux_iter = get_64b_result(demux_results, PQ_TEST_ITER_INDEX); - double demux_cycles_per_iter = ((double)demux_elapsed_cycles) / demux_iter; - log_info( - LogTest, - "DEMUX words sent = {}, elapsed cycles = {} -> BW = {:.2f} B/cycle", - demux_words_sent, - demux_elapsed_cycles, - demux_bw); - log_info(LogTest, "DEMUX iters = {} -> cycles/iter = {:.1f}", demux_iter, demux_cycles_per_iter); - if (demux_words_sent != total_rx_words_checked) { - log_error( - LogTest, - "DEMUX words sent = {} != Total RX words checked = {}", - demux_words_sent, - total_rx_words_checked); - pass = false; - } else { - log_info( - LogTest, - "DEMUX words sent = {} == Total RX words checked = {} -> OK", - demux_words_sent, - total_rx_words_checked); - } - } - - } catch (const std::exception& e) { - pass = false; - log_fatal(e.what()); - } - - tt::llrt::RunTimeOptions::get_instance().set_kernels_nullified(false); - - if (pass) { - log_info(LogTest, "Test Passed"); - return 0; - } else { - log_fatal(LogTest, "Test Failed\n"); - return 1; - } -} diff --git 
a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp index 3bdf79115cda..1a6981099519 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp @@ -4,8 +4,11 @@ #pragma once -#include "tt_metal/third_party/json/json.hpp" +#include "kernels/traffic_gen_test.hpp" #include "tt_metal/common/core_coord.hpp" +#include "tt_metal/impl/dispatch/cq_commands.hpp" +#include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" +#include "tt_metal/third_party/json/json.hpp" static inline std::string to_string(pkt_dest_size_choices_t choice) { switch (choice) { @@ -24,3 +27,37 @@ static inline void log_phys_coord_to_json(nlohmann::json& config, const std::vec static inline void log_phys_coord_to_json(nlohmann::json& config, const CoreCoord& phys_core, const std::string& name) { config[name] = fmt::format("({}, {})", phys_core.x, phys_core.y); } + +static std::vector> allocated; + +// Make buffer address for test. 
The addresses are created contiguously in the order of the stage_config +std::map> make_buffer_addresses_for_test(uint32_t base_address, uint32_t per_buffer_size, const std::vector>& stage_config) { + // Use a vector to make the addresses to keep everything contiguous + std::vector> scratch_buffers; + + for (int stage = 0; stage < stage_config.size(); stage++) { + assert(stage_config[stage].second > 0 && "Each stage must consist of at least 1 kernel"); + + std::vector addresses_for_stage; + if (stage == 0) { + addresses_for_stage.push_back(base_address); + } else { + addresses_for_stage.push_back(scratch_buffers.back().back() + per_buffer_size); + } + + // Start at kernel 1 as the address for kernel 0 is already in the addresses_for_stage + for (int kernel = 1; kernel < stage_config[stage].second; kernel++) { + addresses_for_stage.push_back(addresses_for_stage.back() + per_buffer_size); + } + + scratch_buffers.push_back(std::move(addresses_for_stage)); + } + + // Convert the vector to a map indexed by the stage name + std::map> scratch_buffers_by_stage; + for (int stage = 0; stage < stage_config.size(); stage++) { + scratch_buffers_by_stage[stage_config[stage].first] = std::move(scratch_buffers[stage]); + } + + return scratch_buffers_by_stage; +} diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp index 7a92606736a6..baf2009321aa 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp @@ -5,18 +5,14 @@ #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/llrt/rtoptions.hpp" -#include "tt_metal/impl/dispatch/cq_commands.hpp" #include "tt_metal/impl/device/device.hpp" -#include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" -#include "kernels/traffic_gen_test.hpp" #include
"tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp" -using std::vector; using namespace tt; using json = nlohmann::json; -int main(int argc, char **argv) { +int main(int argc, char **argv) { constexpr uint32_t default_tx_x = 0; constexpr uint32_t default_tx_y = 0; constexpr uint32_t default_rx_x = 0; @@ -31,6 +27,9 @@ int main(int argc, char **argv) { constexpr uint32_t default_data_kb_per_tx = 1024*1024; constexpr uint32_t default_max_packet_size_words = 0x100; + constexpr uint32_t default_input_scratch_buffer_base_addr = 0x60000; + constexpr uint32_t default_output_scratch_buffer_base_addr = 0x70000; + constexpr uint32_t default_tx_queue_start_addr = 0x80000; constexpr uint32_t default_tx_queue_size_bytes = 0x10000; constexpr uint32_t default_rx_queue_start_addr = 0xa0000; @@ -98,6 +97,8 @@ int main(int argc, char **argv) { log_info(LogTest, " --tx_data_sent_per_iter_high: the criteria to determine the amount of tx data sent per iter is high (unit: words); if both 0, then disable counting it in tx kernel, default = {}", default_tx_data_sent_per_iter_high); log_info(LogTest, " --dump_stat_json: Dump stats in json to output_dir, default = {}", default_dump_stat_json); log_info(LogTest, " --output_dir: Output directory, default = {}", default_output_dir); + log_info(LogTest, " --input_scratch_buffer_base_addr: Scratch buffer for input queues base address, default = {:#x}", default_input_scratch_buffer_base_addr); + log_info(LogTest, " --output_scratch_buffer_base_addr: Scratch buffer for output queues base address, default = {:#x}", default_output_scratch_buffer_base_addr); return 0; } @@ -133,12 +134,28 @@ int main(int argc, char **argv) { uint8_t tx_pkt_dest_size_choice = (uint8_t) test_args::get_command_option_uint32(input_args, "--tx_pkt_dest_size_choice", default_tx_pkt_dest_size_choice); uint32_t tx_data_sent_per_iter_low = test_args::get_command_option_uint32(input_args, "--tx_data_sent_per_iter_low", default_tx_data_sent_per_iter_low); 
uint32_t tx_data_sent_per_iter_high = test_args::get_command_option_uint32(input_args, "--tx_data_sent_per_iter_high", default_tx_data_sent_per_iter_high); + uint32_t input_scratch_buffer_base_addr = test_args::get_command_option_uint32(input_args, "--input_scratch_buffer_base_addr", default_input_scratch_buffer_base_addr); + uint32_t output_scratch_buffer_base_addr = test_args::get_command_option_uint32(input_args, "--output_scratch_buffer_base_addr", default_output_scratch_buffer_base_addr); assert((pkt_dest_size_choices_t)tx_pkt_dest_size_choice == pkt_dest_size_choices_t::SAME_START_RNDROBIN_FIX_SIZE && rx_disable_header_check || (pkt_dest_size_choices_t)tx_pkt_dest_size_choice == pkt_dest_size_choices_t::RANDOM); uint32_t num_src_endpoints = num_endpoints; uint32_t num_dest_endpoints = num_endpoints; + const auto input_scratch_buffers = make_buffer_addresses_for_test(input_scratch_buffer_base_addr, packet_queue_ptr_buffer_size, { + { "traffic_gen_tx", num_src_endpoints }, + { "mux", num_src_endpoints }, + { "demux", 1 }, + { "traffic_gen_rx", num_dest_endpoints}, + }); + + const auto output_scratch_buffers = make_buffer_addresses_for_test(output_scratch_buffer_base_addr, packet_queue_ptr_buffer_size, { + { "traffic_gen_tx_mock", num_src_endpoints }, + { "traffic_gen_tx", num_src_endpoints }, + { "mux", 1 }, + { "demux", num_dest_endpoints }, + }); + bool pass = true; std::map defines = { @@ -188,7 +205,11 @@ int main(int argc, char **argv) { tx_skip_pkt_content_gen, // 18: skip_pkt_content_gen tx_pkt_dest_size_choice, // 19: pkt_dest_size_choice tx_data_sent_per_iter_low, // 20: data_sent_per_iter_low - tx_data_sent_per_iter_high // 21: data_sent_per_iter_high + tx_data_sent_per_iter_high, // 21: data_sent_per_iter_high + input_scratch_buffers.at("traffic_gen_tx")[i], // 22: traffic_gen_input_ptrs_addr + output_scratch_buffers.at("traffic_gen_tx_mock")[i], // 23: traffic_gen_input_mock_remote_ptrs_addr + output_scratch_buffers.at("traffic_gen_tx")[i], // 24: 
traffic_gen_output_ptrs_addr + input_scratch_buffers.at("mux")[i], // 25: traffic_gen_output_remote_ptrs_addr }; log_info(LogTest, "run traffic_gen_tx at x={},y={}", core.x, core.y); @@ -229,7 +250,9 @@ int main(int argc, char **argv) { src_endpoint_start_id, // 15: src_endpoint_start_id dest_endpoint_start_id, // 16: dest_endpoint_start_id timeout_mcycles * 1000 * 1000, // 17: timeout_cycles - rx_disable_header_check // 18: disable_header_check + rx_disable_header_check, // 18: disable_header_check + input_scratch_buffers.at("traffic_gen_rx")[i], // 19: traffic_gen_input_ptrs_addr + output_scratch_buffers.at("demux")[i], // 20: traffic_gen_input_remote_ptrs_addr }; log_info(LogTest, "run traffic_gen_rx at x={},y={}", core.x, core.y); @@ -282,7 +305,19 @@ int main(int argc, char **argv) { test_results_addr, // 14: test_results_addr test_results_size, // 15: test_results_size timeout_mcycles * 1000 * 1000, // 16: timeout_cycles, - 0, 0, 0, 0, 0, 0, 0, 0 // 17-24: packetize/depacketize settings + 0, 0, 0, 0, 0, 0, 0, 0, // 17-24: packetize/depacketize settings + input_scratch_buffers.at("mux")[0], // 25: mux_input_ptr_buffers[0] + num_src_endpoints > 1 ? input_scratch_buffers.at("mux")[1] : 0, // 26: mux_input_ptr_buffers[1] + num_src_endpoints > 2 ? input_scratch_buffers.at("mux")[2] : 0, // 27: mux_input_ptr_buffers[2] + num_src_endpoints > 3 ? input_scratch_buffers.at("mux")[3] : 0, // 28: mux_input_ptr_buffers[3] + + output_scratch_buffers.at("traffic_gen_tx")[0], // 29: mux_input_remote_ptr_buffers[0] + num_src_endpoints > 1 ? output_scratch_buffers.at("traffic_gen_tx")[1] : 0, // 30: mux_input_remote_ptr_buffers[1] + num_src_endpoints > 2 ? output_scratch_buffers.at("traffic_gen_tx")[2] : 0, // 31: mux_input_remote_ptr_buffers[2] + num_src_endpoints > 3 ? 
output_scratch_buffers.at("traffic_gen_tx")[3] : 0, // 32: mux_input_remote_ptr_buffers[3] + + output_scratch_buffers.at("mux")[0], // 33: mux_output_ptr_buffer + input_scratch_buffers.at("demux")[0], // 34: mux_output_remote_ptr_buffer }; log_info(LogTest, "run mux at x={},y={}", mux_core.x, mux_core.y); @@ -340,7 +375,18 @@ int main(int argc, char **argv) { test_results_addr, // 22: test_results_addr test_results_size, // 23: test_results_size timeout_mcycles * 1000 * 1000, // 24: timeout_cycles - 0, 0, 0, 0, 0 // 25-29: packetize/depacketize settings + 0, 0, 0, 0, 0, // 25-29: packetize/depacketize settings + input_scratch_buffers.at("demux")[0], // 30: demux_input_ptr_buffer + output_scratch_buffers.at("mux")[0], // 31: demux_input_remote_ptr_buffer + output_scratch_buffers.at("demux")[0], // 32: demux_output_ptr_buffers[0] + num_dest_endpoints > 1 ? output_scratch_buffers.at("demux")[1] : 0, // 33: demux_output_ptr_buffers[1] + num_dest_endpoints > 2 ? output_scratch_buffers.at("demux")[2] : 0, // 34: demux_output_ptr_buffers[2] + num_dest_endpoints > 3 ? output_scratch_buffers.at("demux")[3] : 0, // 35: demux_output_ptr_buffers[3] + + input_scratch_buffers.at("traffic_gen_rx")[0], // 36: demux_output_remote_ptr_buffers[0] + num_dest_endpoints > 1 ? input_scratch_buffers.at("traffic_gen_rx")[1] : 0, // 37: demux_output_remote_ptr_buffers[1] + num_dest_endpoints > 2 ? input_scratch_buffers.at("traffic_gen_rx")[2] : 0, // 38: demux_output_remote_ptr_buffers[2] + num_dest_endpoints > 3 ? 
input_scratch_buffers.at("traffic_gen_rx")[3] : 0, // 39: demux_output_remote_ptr_buffers[3] }; log_info(LogTest, "run demux at x={},y={}", demux_core.x, demux_core.y); @@ -365,8 +411,8 @@ int main(int argc, char **argv) { std::chrono::duration elapsed_seconds = (end-start); log_info(LogTest, "Ran in {:.2f}us", elapsed_seconds.count() * 1000 * 1000); - vector> tx_results; - vector> rx_results; + std::vector> tx_results; + std::vector> rx_results; for (uint32_t i = 0; i < num_src_endpoints; i++) { tx_results.push_back( @@ -384,13 +430,13 @@ int main(int argc, char **argv) { pass &= (rx_results[i][PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); } - vector mux_results = + std::vector mux_results = tt::llrt::read_hex_vec_from_core( device->id(), mux_phys_core, test_results_addr, test_results_size); log_info(LogTest, "MUX status = {}", packet_queue_test_status_to_string(mux_results[PQ_TEST_STATUS_INDEX])); pass &= (mux_results[PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - vector demux_results = + std::vector demux_results = tt::llrt::read_hex_vec_from_core( device->id(), demux_phys_core, test_results_addr, test_results_size); log_info(LogTest, "DEMUX status = {}", packet_queue_test_status_to_string(demux_results[PQ_TEST_STATUS_INDEX])); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp index 13810530fdf6..af52874c2002 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp @@ -6,19 +6,19 @@ #include "tt_metal/impl/device/device.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/llrt/rtoptions.hpp" -#include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" -#include "kernels/traffic_gen_test.hpp" +#include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp" -using std::vector; using 
namespace tt; int main(int argc, char **argv) { - constexpr uint32_t default_prng_seed = 0x100; constexpr uint32_t default_data_kb_per_tx = 64*1024; constexpr uint32_t default_max_packet_size_words = 0x100; + constexpr uint32_t default_input_scratch_buffer_base_addr = 0x60000; + constexpr uint32_t default_output_scratch_buffer_base_addr = 0x70000; + constexpr uint32_t default_tx_queue_start_addr = 0x80000; constexpr uint32_t default_tx_queue_size_bytes = 0x10000; constexpr uint32_t default_rx_queue_start_addr = 0xa0000; @@ -56,6 +56,8 @@ int main(int argc, char **argv) { log_info(LogTest, " --test_results_size: test results buffer size, default = 0x{:x}", default_test_results_size); log_info(LogTest, " --timeout_mcycles: Timeout in MCycles, default = {}", default_timeout_mcycles); log_info(LogTest, " --rx_disable_data_check: Disable data check on RX, default = {}", default_rx_disable_data_check); + log_info(LogTest, " --input_scratch_buffer_base_addr: Scratch buffer for input queues base address, default = {:#x}", default_input_scratch_buffer_base_addr); + log_info(LogTest, " --output_scratch_buffer_base_addr: Scratch buffer for output queues base address, default = {:#x}", default_output_scratch_buffer_base_addr); return 0; } @@ -74,6 +76,8 @@ int main(int argc, char **argv) { uint32_t test_results_size = test_args::get_command_option_uint32(input_args, "--test_results_size", default_test_results_size); uint32_t timeout_mcycles = test_args::get_command_option_uint32(input_args, "--timeout_mcycles", default_timeout_mcycles); uint32_t rx_disable_data_check = test_args::get_command_option_uint32(input_args, "--rx_disable_data_check", default_rx_disable_data_check); + uint32_t input_scratch_buffer_base_addr = test_args::get_command_option_uint32(input_args, "--input_scratch_buffer_base_addr", default_input_scratch_buffer_base_addr); + uint32_t output_scratch_buffer_base_addr = test_args::get_command_option_uint32(input_args, "--output_scratch_buffer_base_addr", 
default_output_scratch_buffer_base_addr); constexpr uint32_t num_src_endpoints = 16; constexpr uint32_t num_dest_endpoints = 16; @@ -91,6 +95,26 @@ int main(int argc, char **argv) { {"FD_CORE_TYPE", std::to_string(0)}, // todo, support dispatch on eth }; + const auto input_scratch_buffers = make_buffer_addresses_for_test(input_scratch_buffer_base_addr, packet_queue_ptr_buffer_size, { + { "traffic_gen_tx", num_src_endpoints }, + { "mux_l1", num_src_endpoints }, + { "mux_l2", num_mux_l1 }, + { "demux_l1", num_demux_l1 }, + { "demux_l2", num_demux_l2 }, + { "traffic_gen_rx", num_dest_endpoints}, + }); + + const auto output_scratch_buffers = make_buffer_addresses_for_test(output_scratch_buffer_base_addr, packet_queue_ptr_buffer_size, { + { "traffic_gen_tx", num_src_endpoints }, + { "traffic_gen_tx_mock", num_src_endpoints }, + { "mux_l1", num_mux_l1 }, + { "mux_l2", num_mux_l2 }, + { "demux_l1", num_demux_l2 }, + { "demux_l2", num_dest_endpoints }, + { "demux", num_dest_endpoints }, + // RX Gen has no output + }); + try { int device_id = 0; tt_metal::Device *device = tt_metal::CreateDevice(device_id); @@ -172,7 +196,11 @@ int main(int argc, char **argv) { 0, // 18: skip_pkt_content_gen 0, // 19: pkt_dest_size_choice 0, // 20: data_sent_per_iter_low - 0 // 21: data_sent_per_iter_high + 0, // 21: data_sent_per_iter_high + input_scratch_buffers.at("traffic_gen_tx")[i], // 22: traffic_gen_input_ptrs_addr + output_scratch_buffers.at("traffic_gen_tx_mock")[i], // 23: traffic_gen_input_mock_remote_ptrs_addr + output_scratch_buffers.at("traffic_gen_tx")[i], // 24: traffic_gen_output_ptrs_addr + input_scratch_buffers.at("mux_l1")[i], // 25: traffic_gen_output_remote_ptrs_addr }; log_info(LogTest, "run TX {} at x={},y={} (phys x={},y={})", i, tx_core[i].x, tx_core[i].y, tx_phys_core[i].x, tx_phys_core[i].y); @@ -212,7 +240,9 @@ int main(int argc, char **argv) { src_endpoint_start_id, // 15: src_endpoint_start_id dest_endpoint_start_id, // 16: dest_endpoint_start_id 
timeout_mcycles * 1000 * 1000, // 17: timeout_cycles - 0 // 18: disable_header_check + 0, // 18: disable_header_check + input_scratch_buffers.at("traffic_gen_rx")[i], // 19: traffic_gen_input_ptrs_addr + output_scratch_buffers.at("demux_l2")[i], // 20: traffic_gen_input_remote_ptrs_addr }; log_info(LogTest, "run RX {} at x={},y={} (phys x={},y={})", i, rx_core[i].x, rx_core[i].y, rx_phys_core[i].x, rx_phys_core[i].y); @@ -263,7 +293,19 @@ int main(int argc, char **argv) { test_results_addr, // 14: test_results_addr test_results_size, // 15: test_results_size timeout_mcycles * 1000 * 1000, // 16: timeout_cycles - 0, 0, 0, 0, 0, 0, 0, 0 // 17-24: packetize/depacketize settings + 0, 0, 0, 0, 0, 0, 0, 0, // 17-24: packetize/depacketize settings + input_scratch_buffers.at("mux_l1")[i * MAX_SWITCH_FAN_IN], // 25: mux_input_ptr_buffers[0] + input_scratch_buffers.at("mux_l1")[i * MAX_SWITCH_FAN_IN + 1], // 26: mux_input_ptr_buffers[1] + input_scratch_buffers.at("mux_l1")[i * MAX_SWITCH_FAN_IN + 2], // 27: mux_input_ptr_buffers[2] + input_scratch_buffers.at("mux_l1")[i * MAX_SWITCH_FAN_IN + 3], // 28: mux_input_ptr_buffers[3] + + output_scratch_buffers.at("traffic_gen_tx")[i * MAX_SWITCH_FAN_IN], // 29: mux_input_remote_ptr_buffers[0] + output_scratch_buffers.at("traffic_gen_tx")[i * MAX_SWITCH_FAN_IN + 1], // 30: mux_input_remote_ptr_buffers[1] + output_scratch_buffers.at("traffic_gen_tx")[i * MAX_SWITCH_FAN_IN + 2], // 31: mux_input_remote_ptr_buffers[2] + output_scratch_buffers.at("traffic_gen_tx")[i * MAX_SWITCH_FAN_IN + 3], // 32: mux_input_remote_ptr_buffers[3] + + output_scratch_buffers.at("mux_l1")[i], // 33: mux_output_ptr_buffer + input_scratch_buffers.at("mux_l2")[mux_l2_port_index], // 34: mux_output_remote_ptr_buffer }; log_info(LogTest, "run L1 MUX {} at x={},y={} (phys x={},y={})", i, mux_l1_core[i].x, mux_l1_core[i].y, mux_l1_phys_core[i].x, mux_l1_phys_core[i].y); @@ -311,7 +353,19 @@ int main(int argc, char **argv) { test_results_addr, // 14: 
test_results_addr test_results_size, // 15: test_results_size timeout_mcycles * 1000 * 1000, // 16: timeout_cycles - 0, 0, 0, 0, 0, 0, 0, 0 // 17-24: packetize/depacketize settings + 0, 0, 0, 0, 0, 0, 0, 0, // 17-24: packetize/depacketize settings + input_scratch_buffers.at("mux_l2")[0], // 25: mux_input_ptr_buffers[0] + input_scratch_buffers.at("mux_l2")[1], // 26: mux_input_ptr_buffers[1] + input_scratch_buffers.at("mux_l2")[2], // 27: mux_input_ptr_buffers[2] + input_scratch_buffers.at("mux_l2")[3], // 28: mux_input_ptr_buffers[3] + + output_scratch_buffers.at("mux_l1")[0], // 29: mux_input_remote_ptr_buffers[0] + output_scratch_buffers.at("mux_l1")[1], // 30: mux_input_remote_ptr_buffers[1] + output_scratch_buffers.at("mux_l1")[2], // 31: mux_input_remote_ptr_buffers[2] + output_scratch_buffers.at("mux_l1")[3], // 32: mux_input_remote_ptr_buffers[3] + + output_scratch_buffers.at("mux_l2")[0], // 33: mux_output_ptr_buffer + input_scratch_buffers.at("demux_l1")[0], // 34: mux_output_remote_ptr_buffer }; log_info(LogTest, "run L2 MUX at x={},y={} (phys x={},y={})", mux_l2_core.x, mux_l2_core.y, mux_l2_phys_core.x, mux_l2_phys_core.y); @@ -371,7 +425,18 @@ int main(int argc, char **argv) { test_results_addr, // 22: test_results_addr test_results_size, // 23: test_results_size timeout_mcycles * 1000 * 1000, // 24: timeout_cycles - 0, 0, 0, 0, 0 // 25-29: packetize/depacketize settings + 0, 0, 0, 0, 0, // 25-29: packetize/depacketize settings + input_scratch_buffers.at("demux_l1")[0], // 30: demux_input_ptr_buffer + output_scratch_buffers.at("mux_l2")[0], // 31: demux_input_remote_ptr_buffer + output_scratch_buffers.at("demux_l1")[0], // 32: demux_output_ptr_buffers[0] + output_scratch_buffers.at("demux_l1")[1], // 33: demux_output_ptr_buffers[1] + output_scratch_buffers.at("demux_l1")[2], // 34: demux_output_ptr_buffers[2] + output_scratch_buffers.at("demux_l1")[3], // 35: demux_output_ptr_buffers[3] + + input_scratch_buffers.at("demux_l2")[0], // 36: 
demux_output_remote_ptr_buffers[0] + input_scratch_buffers.at("demux_l2")[1], // 37: demux_output_remote_ptr_buffers[1] + input_scratch_buffers.at("demux_l2")[2], // 38: demux_output_remote_ptr_buffers[2] + input_scratch_buffers.at("demux_l2")[3], // 39: demux_output_remote_ptr_buffers[3] }; log_info(LogTest, "run L1 DEMUX at x={},y={} (phys x={},y={})", @@ -436,11 +501,22 @@ int main(int argc, char **argv) { test_results_addr, // 22: test_results_addr test_results_size, // 23: test_results_size timeout_mcycles * 1000 * 1000, // 24: timeout_cycles - 0, 0, 0, 0, 0 // 25-29: packetize/depacketize settings + 0, 0, 0, 0, 0, // 25-29: packetize/depacketize settings + input_scratch_buffers.at("demux_l2")[i], // 30: demux_input_ptr_buffer + output_scratch_buffers.at("demux_l1")[i], // 31: demux_input_remote_ptr_buffer + output_scratch_buffers.at("demux_l2")[i * num_demux_l2], // 32: demux_output_ptr_buffers[0] + output_scratch_buffers.at("demux_l2")[i * num_demux_l2 + 1], // 33: demux_output_ptr_buffers[1] + output_scratch_buffers.at("demux_l2")[i * num_demux_l2 + 2], // 34: demux_output_ptr_buffers[2] + output_scratch_buffers.at("demux_l2")[i * num_demux_l2 + 3], // 35: demux_output_ptr_buffers[3] + + input_scratch_buffers.at("traffic_gen_rx")[i * num_demux_l2], // 36: demux_output_remote_ptr_buffers[0] + input_scratch_buffers.at("traffic_gen_rx")[i * num_demux_l2 + 1], // 37: demux_output_remote_ptr_buffers[1] + input_scratch_buffers.at("traffic_gen_rx")[i * num_demux_l2 + 2], // 38: demux_output_remote_ptr_buffers[2] + input_scratch_buffers.at("traffic_gen_rx")[i * num_demux_l2 + 3], // 39: demux_output_remote_ptr_buffers[3] }; - log_info(LogTest, "run L2 DEMUX at x={},y={} (phys x={},y={})", - demux_l2_core[i].x, demux_l2_core[i].y, demux_l2_phys_core[i].x, demux_l2_phys_core[i].y); + log_info(LogTest, "run L2 DEMUX {} at x={},y={} (phys x={},y={})", + i, demux_l2_core[i].x, demux_l2_core[i].y, demux_l2_phys_core[i].x, demux_l2_phys_core[i].y); auto demux_kernel = 
tt_metal::CreateKernel( program, "tt_metal/impl/dispatch/kernels/packet_demux.cpp", @@ -464,12 +540,12 @@ int main(int argc, char **argv) { std::chrono::duration elapsed_seconds = (end-start); log_info(LogTest, "Ran in {:.2f}us", elapsed_seconds.count() * 1000 * 1000); - vector> tx_results; - vector> rx_results; - vector> mux_l1_results; - vector mux_l2_results; - vector> demux_l2_results; - vector demux_l1_results; + std::vector> tx_results; + std::vector> rx_results; + std::vector> mux_l1_results; + std::vector mux_l2_results; + std::vector> demux_l2_results; + std::vector demux_l1_results; for (uint32_t i = 0; i < num_src_endpoints; i++) { tx_results.push_back( diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_1cq.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_1cq.cpp deleted file mode 100644 index f883facbe080..000000000000 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_1cq.cpp +++ /dev/null @@ -1,832 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include "tt_metal/host_api.hpp" -#include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/llrt/rtoptions.hpp" -#include "tt_metal/impl/device/device.hpp" -#include "tt_metal/impl/dispatch/cq_commands.hpp" -#include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" -#include "kernels/traffic_gen_test.hpp" - -using std::vector; -using namespace tt; - -int main(int argc, char** argv) { - constexpr uint32_t default_tx_x = 0; - constexpr uint32_t default_tx_y = 0; - constexpr uint32_t default_rx_x = 0; - constexpr uint32_t default_rx_y = 3; - - constexpr uint32_t default_mux_x = 0; - constexpr uint32_t default_mux_y = 1; - constexpr uint32_t default_demux_x = 0; - constexpr uint32_t default_demux_y = 2; - - constexpr uint32_t default_tunneler_x = 0; - constexpr uint32_t default_tunneler_y = 0; - - constexpr uint32_t default_prng_seed = 0x100; - constexpr uint32_t default_data_kb_per_tx = 16 * 1024; - constexpr uint32_t default_max_packet_size_words = 0x100; - - constexpr uint32_t default_tx_queue_start_addr = 0x80000; - constexpr uint32_t default_tx_queue_size_bytes = 0x10000; - constexpr uint32_t default_rx_queue_start_addr = 0xa0000; - constexpr uint32_t default_rx_queue_size_bytes = 0x20000; - constexpr uint32_t default_mux_queue_start_addr = 0x80000; - constexpr uint32_t default_mux_queue_size_bytes = 0x10000; - constexpr uint32_t default_demux_queue_start_addr = 0x90000; - constexpr uint32_t default_demux_queue_size_bytes = 0x20000; - - constexpr uint32_t default_tunneler_queue_start_addr = 0x19000; - constexpr uint32_t default_tunneler_queue_size_bytes = 0x10000; - - constexpr uint32_t default_test_results_addr = 0x100000; - constexpr uint32_t default_test_results_size = 0x40000; - - constexpr uint32_t default_tunneler_test_results_addr = 0x39000; - constexpr uint32_t default_tunneler_test_results_size = 0x7000; - - constexpr uint32_t default_timeout_mcycles = 1000; - constexpr uint32_t 
default_rx_disable_data_check = 0; - - constexpr uint32_t src_endpoint_start_id = 0xaa; - constexpr uint32_t dest_endpoint_start_id = 0xbb; - - constexpr uint32_t num_src_endpoints = 1; - constexpr uint32_t num_dest_endpoints = 1; - - constexpr uint32_t default_test_device_id = 0; - - constexpr uint32_t default_tunnel_mode = 0; - - std::vector input_args(argv, argv + argc); - if (test_args::has_command_option(input_args, "-h") || test_args::has_command_option(input_args, "--help")) { - log_info(LogTest, "Usage:"); - log_info(LogTest, " --prng_seed: PRNG seed, default = 0x{:x}", default_prng_seed); - log_info(LogTest, " --data_kb_per_tx: Total data in KB per TX endpoint, default = {}", default_data_kb_per_tx); - log_info( - LogTest, - " --max_packet_size_words: Max packet size in words, default = 0x{:x}", - default_max_packet_size_words); - log_info(LogTest, " --tx_x: X coordinate of the starting TX core, default = {}", default_tx_x); - log_info(LogTest, " --tx_y: Y coordinate of the starting TX core, default = {}", default_tx_y); - log_info(LogTest, " --rx_x: X coordinate of the starting RX core, default = {}", default_rx_x); - log_info(LogTest, " --rx_y: Y coordinate of the starting RX core, default = {}", default_rx_y); - log_info(LogTest, " --mux_x: X coordinate of the starting mux core, default = {}", default_mux_x); - log_info(LogTest, " --mux_y: Y coordinate of the starting mux core, default = {}", default_mux_y); - log_info(LogTest, " --demux_x: X coordinate of the starting demux core, default = {}", default_demux_x); - log_info(LogTest, " --demux_y: Y coordinate of the starting demux core, default = {}", default_demux_y); - log_info( - LogTest, " --tx_queue_start_addr: TX queue start address, default = 0x{:x}", default_tx_queue_start_addr); - log_info( - LogTest, " --tx_queue_size_bytes: TX queue size in bytes, default = 0x{:x}", default_tx_queue_size_bytes); - log_info( - LogTest, " --rx_queue_start_addr: RX queue start address, default = 0x{:x}", 
default_rx_queue_start_addr); - log_info( - LogTest, " --rx_queue_size_bytes: RX queue size in bytes, default = 0x{:x}", default_rx_queue_size_bytes); - log_info( - LogTest, - " --mux_queue_start_addr: MUX queue start address, default = 0x{:x}", - default_mux_queue_start_addr); - log_info( - LogTest, - " --mux_queue_size_bytes: MUX queue size in bytes, default = 0x{:x}", - default_mux_queue_size_bytes); - log_info( - LogTest, - " --demux_queue_start_addr: DEMUX queue start address, default = 0x{:x}", - default_demux_queue_start_addr); - log_info( - LogTest, - " --demux_queue_size_bytes: DEMUX queue size in bytes, default = 0x{:x}", - default_demux_queue_size_bytes); - log_info( - LogTest, " --test_results_addr: test results buf address, default = 0x{:x}", default_test_results_addr); - log_info(LogTest, " --test_results_size: test results buf size, default = 0x{:x}", default_test_results_size); - log_info(LogTest, " --timeout_mcycles: Timeout in MCycles, default = {}", default_timeout_mcycles); - log_info( - LogTest, - " --rx_disable_data_check: Disable data check on RX, default = {}", - default_rx_disable_data_check); - log_info(LogTest, " --device_id: Device on which the test will be run, default = {}", default_test_device_id); - log_info(LogTest, " --tunnel_mode: 0: Bidirectional. 
1 L->R, 2 R->L, default = {}", default_tunnel_mode); - - return 0; - } - - uint32_t tx_x = test_args::get_command_option_uint32(input_args, "--tx_x", default_tx_x); - uint32_t tx_y = test_args::get_command_option_uint32(input_args, "--tx_y", default_tx_y); - uint32_t rx_x = test_args::get_command_option_uint32(input_args, "--rx_x", default_rx_x); - uint32_t rx_y = test_args::get_command_option_uint32(input_args, "--rx_y", default_rx_y); - uint32_t mux_x = test_args::get_command_option_uint32(input_args, "--mux_x", default_mux_x); - uint32_t mux_y = test_args::get_command_option_uint32(input_args, "--mux_y", default_mux_y); - uint32_t demux_x = test_args::get_command_option_uint32(input_args, "--demux_x", default_demux_x); - uint32_t demux_y = test_args::get_command_option_uint32(input_args, "--demux_y", default_demux_y); - uint32_t tunneler_x = test_args::get_command_option_uint32(input_args, "--tunneler_x", default_tunneler_x); - uint32_t tunneler_y = test_args::get_command_option_uint32(input_args, "--tunneler_y", default_tunneler_y); - uint32_t prng_seed = test_args::get_command_option_uint32(input_args, "--prng_seed", default_prng_seed); - uint32_t data_kb_per_tx = - test_args::get_command_option_uint32(input_args, "--data_kb_per_tx", default_data_kb_per_tx); - uint32_t max_packet_size_words = - test_args::get_command_option_uint32(input_args, "--max_packet_size_words", default_max_packet_size_words); - uint32_t tx_queue_start_addr = - test_args::get_command_option_uint32(input_args, "--tx_queue_start_addr", default_tx_queue_start_addr); - uint32_t tx_queue_size_bytes = - test_args::get_command_option_uint32(input_args, "--tx_queue_size_bytes", default_tx_queue_size_bytes); - uint32_t rx_queue_start_addr = - test_args::get_command_option_uint32(input_args, "--rx_queue_start_addr", default_rx_queue_start_addr); - uint32_t rx_queue_size_bytes = - test_args::get_command_option_uint32(input_args, "--rx_queue_size_bytes", default_rx_queue_size_bytes); - uint32_t 
mux_queue_start_addr = - test_args::get_command_option_uint32(input_args, "--mux_queue_start_addr", default_mux_queue_start_addr); - uint32_t mux_queue_size_bytes = - test_args::get_command_option_uint32(input_args, "--mux_queue_size_bytes", default_mux_queue_size_bytes); - uint32_t demux_queue_start_addr = - test_args::get_command_option_uint32(input_args, "--demux_queue_start_addr", default_demux_queue_start_addr); - uint32_t demux_queue_size_bytes = - test_args::get_command_option_uint32(input_args, "--demux_queue_size_bytes", default_demux_queue_size_bytes); - uint32_t tunneler_queue_start_addr = test_args::get_command_option_uint32( - input_args, "--tunneler_queue_start_addr", default_tunneler_queue_start_addr); - uint32_t tunneler_queue_size_bytes = test_args::get_command_option_uint32( - input_args, "--tunneler_queue_size_bytes", default_tunneler_queue_size_bytes); - uint32_t test_results_addr = - test_args::get_command_option_uint32(input_args, "--test_results_addr", default_test_results_addr); - uint32_t test_results_size = - test_args::get_command_option_uint32(input_args, "--test_results_size", default_test_results_size); - uint32_t tunneler_test_results_addr = test_args::get_command_option_uint32( - input_args, "--tunneler_test_results_addr", default_tunneler_test_results_addr); - uint32_t tunneler_test_results_size = test_args::get_command_option_uint32( - input_args, "--tunneler_test_results_size", default_tunneler_test_results_size); - uint32_t timeout_mcycles = - test_args::get_command_option_uint32(input_args, "--timeout_mcycles", default_timeout_mcycles); - uint32_t rx_disable_data_check = - test_args::get_command_option_uint32(input_args, "--rx_disable_data_check", default_rx_disable_data_check); - uint32_t test_device_id = test_args::get_command_option_uint32(input_args, "--device_id", default_test_device_id); - uint32_t tunnel_mode = test_args::get_command_option_uint32(input_args, "--tunnel_mode", default_tunnel_mode); - - bool pass = true; - 
bool l_to_r = tunnel_mode == 1 || tunnel_mode == 0; - bool r_to_l = tunnel_mode == 2 || tunnel_mode == 0; - try { - int num_devices = tt_metal::GetNumAvailableDevices(); - if (test_device_id >= num_devices) { - log_info(LogTest, "Device {} is not valid. Highest valid device id = {}.", test_device_id, num_devices - 1); - throw std::runtime_error("Invalid Device Id."); - } - int device_id_l = test_device_id; - - tt_metal::Device* device = tt_metal::CreateDevice(device_id_l); - auto const& device_active_eth_cores = device->get_active_ethernet_cores(); - - if (device_active_eth_cores.size() == 0) { - log_info( - LogTest, - "Device {} does not have enough active cores. Need 1 active ethernet core for this test.", - device_id_l); - tt_metal::CloseDevice(device); - throw std::runtime_error("Test cannot run on specified device."); - } - - auto eth_core_iter = device_active_eth_cores.begin(); - auto [device_id_r, eth_receiver_core] = device->get_connected_ethernet_core(*eth_core_iter); - - tt_metal::Device* device_r = tt_metal::CreateDevice(device_id_r); - - CoreCoord tunneler_logical_core = device->get_ethernet_sockets(device_id_r)[0]; - CoreCoord tunneler_phys_core = device->ethernet_core_from_logical_core(tunneler_logical_core); - - CoreCoord r_tunneler_logical_core = device_r->get_ethernet_sockets(device_id_l)[0]; - CoreCoord r_tunneler_phys_core = device_r->ethernet_core_from_logical_core(r_tunneler_logical_core); - - std::cout << "Left Tunneler = " << tunneler_logical_core.str() << std::endl; - std::cout << "Right Tunneler = " << r_tunneler_logical_core.str() << std::endl; - - tt_metal::Program program = tt_metal::CreateProgram(); - tt_metal::Program program_r = tt_metal::CreateProgram(); - - CoreCoord mux_core = {mux_x, mux_y}; - CoreCoord mux_phys_core = device->worker_core_from_logical_core(mux_core); - - CoreCoord demux_core = {demux_x, demux_y}; - CoreCoord demux_phys_core = device_r->worker_core_from_logical_core(demux_core); - - // tx on left chip - std::vector 
l_tx_phys_core; - for (uint32_t i = 0; i < num_src_endpoints; i++) { - CoreCoord core = {tx_x+i, tx_y}; - l_tx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = - { - src_endpoint_start_id + i, // 0: src_endpoint_id - num_dest_endpoints, // 1: num_dest_endpoints - (tx_queue_start_addr >> 4), // 2: queue_start_addr_words - (tx_queue_size_bytes >> 4), // 3: queue_size_words - ((mux_queue_start_addr + i*mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words - (uint32_t)mux_phys_core.x, // 6: remote_rx_x - (uint32_t)mux_phys_core.y, // 7: remote_rx_y - i, // 8: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type - test_results_addr, // 10: test_results_addr - test_results_size, // 11: test_results_size - prng_seed, // 12: prng_seed - data_kb_per_tx, // 13: total_data_kb - max_packet_size_words, // 14: max_packet_size_words - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; - if (l_to_r) { - log_info(LogTest, "run traffic_gen_tx at x={},y={}", core.x, core.y); - auto kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_tx.cpp", - {core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = compile_args, - .defines = {}}); - } - } - - // tx on right chip - std::vector r_tx_phys_core; - for (uint32_t i = 0; i < num_src_endpoints; i++) { - CoreCoord core = {tx_x+i, tx_y}; - r_tx_phys_core.push_back(device_r->worker_core_from_logical_core(core)); - std::vector compile_args = - { - src_endpoint_start_id + i, // 0: src_endpoint_id - num_dest_endpoints, // 1: num_dest_endpoints - (tx_queue_start_addr >> 4), // 2: queue_start_addr_words - 
(tx_queue_size_bytes >> 4), // 3: queue_size_words - ((mux_queue_start_addr + i*mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words - (uint32_t)mux_phys_core.x, // 6: remote_rx_x - (uint32_t)mux_phys_core.y, // 7: remote_rx_y - i, // 8: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type - test_results_addr, // 10: test_results_addr - test_results_size, // 11: test_results_size - prng_seed, // 12: prng_seed - data_kb_per_tx, // 13: total_data_kb - max_packet_size_words, // 14: max_packet_size_words - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; - if (r_to_l) { - log_info(LogTest, "run traffic_gen_tx at x={},y={}", core.x, core.y); - auto kernel = tt_metal::CreateKernel( - program_r, - "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_tx.cpp", - {core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = compile_args, - .defines = {}}); - } - } - - // Mux Left - std::vector l_mux_compile_args = { - 0, // 0: reserved - (mux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 2: rx_queue_size_words - num_src_endpoints, // 3: mux_fan_in - packet_switch_4B_pack( - (uint32_t)l_tx_phys_core[0].x, - (uint32_t)l_tx_phys_core[0].y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: src 0 info - 0, // 5: src 1 info - 0, // 6: src 2 info - 0, // 7: src 3 info - (tunneler_queue_start_addr >> 4), // 8: remote_tx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words - (uint32_t)tunneler_phys_core.x, // 10: remote_tx_x - (uint32_t)tunneler_phys_core.y, // 11: remote_tx_y - 0, // 12: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, 
// 13: tx_network_type - test_results_addr, // 14: test_results_addr - test_results_size, // 15: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 16: timeout_cycles - }; - if (l_to_r) { - log_info(LogTest, "run mux at x={},y={}", mux_core.x, mux_core.y); - auto l_mux_kernel = tt_metal::CreateKernel( - program, - "tt_metal/impl/dispatch/kernels/packet_mux.cpp", - {mux_core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = l_mux_compile_args, - .defines = {}}); - } - // Mux Right - std::vector r_mux_compile_args = { - 0, // 0: reserved - (mux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 2: rx_queue_size_words - num_src_endpoints, // 3: mux_fan_in - packet_switch_4B_pack( - (uint32_t)r_tx_phys_core[0].x, - (uint32_t)r_tx_phys_core[0].y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: src 0 info - 0, // 5: src 1 info - 0, // 6: src 2 info - 0, // 7: src 3 info - l_to_r ? ((tunneler_queue_start_addr + tunneler_queue_size_bytes) >> 4) - : (tunneler_queue_start_addr >> 4), // 8: remote_tx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words - (uint32_t)r_tunneler_phys_core.x, // 10: remote_tx_x - (uint32_t)r_tunneler_phys_core.y, // 11: remote_tx_y - l_to_r ? 
(uint32_t)1 : (uint32_t)0, // 12: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 13: tx_network_type - test_results_addr, // 14: test_results_addr - test_results_size, // 15: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 16: timeout_cycles - }; - if (r_to_l) { - log_info(LogTest, "run mux at x={},y={}", mux_core.x, mux_core.y); - auto r_mux_kernel = tt_metal::CreateKernel( - program_r, - "tt_metal/impl/dispatch/kernels/packet_mux.cpp", - {mux_core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = r_mux_compile_args, - .defines = {}}); - } - - uint32_t tunnel_receiver_eth = packet_switch_4B_pack( - r_tunneler_phys_core.x, r_tunneler_phys_core.y, 0, (uint32_t)DispatchRemoteNetworkType::ETH); - uint32_t receiver_eth_q_addr = tunneler_queue_start_addr >> 4; - uint32_t receiver_eth_q_size = tunneler_queue_size_bytes >> 4; - - uint32_t tunnel_receiver_noc = packet_switch_4B_pack( - demux_phys_core.x, demux_phys_core.y, num_dest_endpoints, (uint32_t)DispatchRemoteNetworkType::NOC0); - uint32_t receiver_noc_q_addr = demux_queue_start_addr >> 4; - uint32_t receiver_noc_q_size = demux_queue_size_bytes >> 4; - - uint32_t tunnel_sender_noc = packet_switch_4B_pack( - mux_phys_core.x, mux_phys_core.y, num_dest_endpoints, (uint32_t)DispatchRemoteNetworkType::NOC0); - uint32_t tunnel_sender_eth = packet_switch_4B_pack( - r_tunneler_phys_core.x, - r_tunneler_phys_core.y, - l_to_r ? (uint32_t)3 : (uint32_t)2, - (uint32_t)DispatchRemoteNetworkType::ETH); - - std::vector tunneler_l_compile_args = { - dest_endpoint_start_id, // 0: endpoint_id_start_index - l_to_r && r_to_l ? (uint32_t)2 : (uint32_t)1, // 1: tunnel_lanes. 1 => Unidirectional. 2 => Bidirectional. - (tunneler_queue_start_addr >> 4), // 2: rx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 3: rx_queue_size_words - l_to_r ? 
tunnel_receiver_eth : tunnel_receiver_noc, // 4: remote_receiver_0_info - tunnel_receiver_noc, // 5: remote_receiver_1_info - l_to_r ? receiver_eth_q_addr : receiver_noc_q_addr, // 6: remote_receiver_queue_start_addr_words 0 - l_to_r ? receiver_eth_q_size : receiver_noc_q_size, // 7: remote_receiver_queue_size_words 0 - receiver_noc_q_addr, // 8: remote_receiver_queue_start_addr_words 1 - receiver_noc_q_size, // 9: remote_receiver_queue_size_words 1 - l_to_r ? tunnel_sender_noc : tunnel_sender_eth, // 10: remote_sender_0_info - tunnel_sender_eth, // 11: remote_sender_1_info - tunneler_test_results_addr, // 12: test_results_addr - tunneler_test_results_size, // 13: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 14: timeout_cycles - }; - - auto tunneler_l_kernel = tt_metal::CreateKernel( - program, - "tt_metal/impl/dispatch/kernels/eth_tunneler.cpp", - tunneler_logical_core, - tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0, .compile_args = tunneler_l_compile_args}); - - tunnel_receiver_eth = packet_switch_4B_pack( - tunneler_phys_core.x, tunneler_phys_core.y, l_to_r ? 1 : 0, (uint32_t)DispatchRemoteNetworkType::ETH); - receiver_eth_q_addr = - l_to_r ? ((tunneler_queue_start_addr + tunneler_queue_size_bytes) >> 4) : tunneler_queue_start_addr >> 4; - - tunnel_sender_eth = packet_switch_4B_pack( - tunneler_phys_core.x, tunneler_phys_core.y, 2, (uint32_t)DispatchRemoteNetworkType::ETH); - std::vector tunneler_r_compile_args = { - dest_endpoint_start_id, // 0: endpoint_id_start_index - l_to_r && r_to_l ? (uint32_t)2 : (uint32_t)1, // 1: tunnel_lanes. 1 => Unidirectional. 2 => Bidirectional. - (tunneler_queue_start_addr >> 4), // 2: rx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 3: rx_queue_size_words - l_to_r ? tunnel_receiver_noc : tunnel_receiver_eth, // 4: remote_receiver_0_info - tunnel_receiver_eth, // 5: remote_receiver_1_info - l_to_r ? 
receiver_noc_q_addr : receiver_eth_q_addr, // 6: remote_receiver_queue_start_addr_words 0 - l_to_r ? receiver_noc_q_size : receiver_eth_q_size, // 7: remote_receiver_queue_size_words 0 - receiver_eth_q_addr, // 8: remote_receiver_queue_start_addr_words 1 - receiver_eth_q_size, // 9: remote_receiver_queue_size_words 1 - l_to_r ? tunnel_sender_eth : tunnel_sender_noc, // 10: remote_sender_0_info - tunnel_sender_noc, // 11: remote_sender_1_info - tunneler_test_results_addr, // 12: test_results_addr - tunneler_test_results_size, // 13: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 14: timeout_cycles - }; - - auto tunneler_r_kernel = tt_metal::CreateKernel( - program_r, - "tt_metal/impl/dispatch/kernels/eth_tunneler.cpp", - r_tunneler_logical_core, - tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0, .compile_args = tunneler_r_compile_args}); - - // Rx Right - std::vector r_rx_phys_core; - for (uint32_t i = 0; i < num_dest_endpoints; i++) { - CoreCoord core = {rx_x+i, rx_y}; - r_rx_phys_core.push_back(device_r->worker_core_from_logical_core(core)); - std::vector compile_args = - { - dest_endpoint_start_id + i, // 0: dest_endpoint_id - num_src_endpoints, // 1: num_src_endpoints - num_dest_endpoints, // 2: num_dest_endpoints - (rx_queue_start_addr >> 4), // 3: queue_start_addr_words - (rx_queue_size_bytes >> 4), // 4: queue_size_words - (uint32_t)demux_phys_core.x, // 5: remote_tx_x - (uint32_t)demux_phys_core.y, // 6: remote_tx_y - i, // 7: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type - test_results_addr, // 9: test_results_addr - test_results_size, // 10: test_results_size - prng_seed, // 11: prng_seed - 0, // 12: reserved - max_packet_size_words, // 13: max_packet_size_words - rx_disable_data_check, // 14: disable data check - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; - if 
(l_to_r) { - log_info(LogTest, "run traffic_gen_rx at x={},y={}", core.x, core.y); - auto kernel = tt_metal::CreateKernel( - program_r, - "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_rx.cpp", - {core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = compile_args, - .defines = {}}); - } - } - - // Rx Left - std::vector l_rx_phys_core; - for (uint32_t i = 0; i < num_dest_endpoints; i++) { - CoreCoord core = {rx_x+i, rx_y}; - l_rx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = - { - dest_endpoint_start_id + i, // 0: dest_endpoint_id - num_src_endpoints, // 1: num_src_endpoints - num_dest_endpoints, // 2: num_dest_endpoints - (rx_queue_start_addr >> 4), // 3: queue_start_addr_words - (rx_queue_size_bytes >> 4), // 4: queue_size_words - (uint32_t)demux_phys_core.x, // 5: remote_tx_x - (uint32_t)demux_phys_core.y, // 6: remote_tx_y - i, // 7: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type - test_results_addr, // 9: test_results_addr - test_results_size, // 10: test_results_size - prng_seed, // 11: prng_seed - 0, // 12: reserved - max_packet_size_words, // 13: max_packet_size_words - rx_disable_data_check, // 14: disable data check - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; - if (r_to_l) { - log_info(LogTest, "run traffic_gen_rx at x={},y={}", core.x, core.y); - auto kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_rx.cpp", - {core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = compile_args, - .defines = {}}); - } - } - - // Demux Right - uint32_t 
dest_map_array[4] = {0, 1, 2, 3}; - uint64_t dest_endpoint_output_map = packet_switch_dest_pack(dest_map_array, 4); - std::vector r_demux_compile_args = { - dest_endpoint_start_id, // 0: endpoint_id_start_index - (demux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words - (demux_queue_size_bytes >> 4), // 2: rx_queue_size_words - num_dest_endpoints, // 3: demux_fan_out - packet_switch_4B_pack( - r_rx_phys_core[0].x, - r_rx_phys_core[0].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: remote_tx_0_info - 0, // 5: remote_tx_1_info - 0, // 6: remote_tx_2_info - 0, // 7: remote_tx_3_info - (rx_queue_start_addr >> 4), // 8: remote_tx_queue_start_addr_words 0 - (rx_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words 0 - (rx_queue_start_addr >> 4), // 10: remote_tx_queue_start_addr_words 1 - (rx_queue_size_bytes >> 4), // 11: remote_tx_queue_size_words 1 - (rx_queue_start_addr >> 4), // 12: remote_tx_queue_start_addr_words 2 - (rx_queue_size_bytes >> 4), // 13: remote_tx_queue_size_words 2 - (rx_queue_start_addr >> 4), // 14: remote_tx_queue_start_addr_words 3 - (rx_queue_size_bytes >> 4), // 15: remote_tx_queue_size_words 3 - (uint32_t)r_tunneler_phys_core.x, // 16: remote_rx_x - (uint32_t)r_tunneler_phys_core.y, // 17: remote_rx_y - 2, // 18: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 19: tx_network_type - (uint32_t)(dest_endpoint_output_map >> 32), // 20: dest_endpoint_output_map_hi - (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF), // 21: dest_endpoint_output_map_lo - test_results_addr, // 22: test_results_addr - test_results_size, // 23: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 24: timeout_cycles - }; - if (l_to_r) { - log_info(LogTest, "run demux at x={},y={}", demux_core.x, demux_core.y); - auto r_demux_kernel = tt_metal::CreateKernel( - program_r, - "tt_metal/impl/dispatch/kernels/packet_demux.cpp", - {demux_core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, 
- .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = r_demux_compile_args, - .defines = {}}); - } - // Demux Left - std::vector l_demux_compile_args = { - dest_endpoint_start_id, // 0: endpoint_id_start_index - (demux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words - (demux_queue_size_bytes >> 4), // 2: rx_queue_size_words - num_dest_endpoints, // 3: demux_fan_out - packet_switch_4B_pack( - l_rx_phys_core[0].x, - l_rx_phys_core[0].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: remote_tx_0_info - 0, // 5: remote_tx_1_info - 0, // 6: remote_tx_2_info - 0, // 7: remote_tx_3_info - (rx_queue_start_addr >> 4), // 8: remote_tx_queue_start_addr_words 0 - (rx_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words 0 - (rx_queue_start_addr >> 4), // 10: remote_tx_queue_start_addr_words 1 - (rx_queue_size_bytes >> 4), // 11: remote_tx_queue_size_words 1 - (rx_queue_start_addr >> 4), // 12: remote_tx_queue_start_addr_words 2 - (rx_queue_size_bytes >> 4), // 13: remote_tx_queue_size_words 2 - (rx_queue_start_addr >> 4), // 14: remote_tx_queue_start_addr_words 3 - (rx_queue_size_bytes >> 4), // 15: remote_tx_queue_size_words 3 - (uint32_t)tunneler_phys_core.x, // 16: remote_rx_x - (uint32_t)tunneler_phys_core.y, // 17: remote_rx_y - l_to_r ? 
(uint32_t)3 : (uint32_t)2, // 18: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 19: tx_network_type - (uint32_t)(dest_endpoint_output_map >> 32), // 20: dest_endpoint_output_map_hi - (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF), // 21: dest_endpoint_output_map_lo - test_results_addr, // 22: test_results_addr - test_results_size, // 23: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 24: timeout_cycles - }; - if (r_to_l) { - log_info(LogTest, "run demux at x={},y={}", demux_core.x, demux_core.y); - auto l_demux_kernel = tt_metal::CreateKernel( - program, - "tt_metal/impl/dispatch/kernels/packet_demux.cpp", - {demux_core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = l_demux_compile_args, - .defines = {}}); - } - - log_info(LogTest, "Starting test..."); - - auto start = std::chrono::system_clock::now(); - tt_metal::detail::LaunchProgram(device, program, false); - tt_metal::detail::LaunchProgram(device_r, program_r, false); - tt_metal::detail::WaitProgramDone(device, program); - tt_metal::detail::WaitProgramDone(device_r, program_r); - auto end = std::chrono::system_clock::now(); - - std::chrono::duration elapsed_seconds = (end - start); - log_info(LogTest, "Ran in {:.2f}us", elapsed_seconds.count() * 1000 * 1000); - - vector> l_tx_results; - vector> l_rx_results; - vector l_mux_results; - vector l_demux_results; - vector> r_tx_results; - vector> r_rx_results; - vector r_mux_results; - vector r_demux_results; - - if (l_to_r) { - for (uint32_t i = 0; i < num_src_endpoints; i++) { - l_tx_results.push_back(tt::llrt::read_hex_vec_from_core( - device->id(), l_tx_phys_core[i], test_results_addr, test_results_size)); - log_info( - LogTest, - "TX{} status = {}", - i, - packet_queue_test_status_to_string(l_tx_results[i][PQ_TEST_STATUS_INDEX])); - pass &= (l_tx_results[i][PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - } - - for 
(uint32_t i = 0; i < num_dest_endpoints; i++) { - r_rx_results.push_back(tt::llrt::read_hex_vec_from_core( - device_r->id(), r_rx_phys_core[i], test_results_addr, test_results_size)); - log_info( - LogTest, - "RX{} status = {}", - i, - packet_queue_test_status_to_string(r_rx_results[i][PQ_TEST_STATUS_INDEX])); - pass &= (r_rx_results[i][PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - } - - l_mux_results = - tt::llrt::read_hex_vec_from_core(device->id(), mux_phys_core, test_results_addr, test_results_size); - log_info( - LogTest, "MUX status = {}", packet_queue_test_status_to_string(l_mux_results[PQ_TEST_STATUS_INDEX])); - pass &= (l_mux_results[PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - - r_demux_results = - tt::llrt::read_hex_vec_from_core(device_r->id(), demux_phys_core, test_results_addr, test_results_size); - log_info( - LogTest, - "DEMUX status = {}", - packet_queue_test_status_to_string(r_demux_results[PQ_TEST_STATUS_INDEX])); - pass &= (r_demux_results[0] == PACKET_QUEUE_TEST_PASS); - } - - if (r_to_l) { - for (uint32_t i = 0; i < num_src_endpoints; i++) { - r_tx_results.push_back(tt::llrt::read_hex_vec_from_core( - device_r->id(), r_tx_phys_core[i], test_results_addr, test_results_size)); - log_info( - LogTest, - "TX{} status = {}", - i, - packet_queue_test_status_to_string(r_tx_results[i][PQ_TEST_STATUS_INDEX])); - pass &= (r_tx_results[i][PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - } - - for (uint32_t i = 0; i < num_dest_endpoints; i++) { - l_rx_results.push_back(tt::llrt::read_hex_vec_from_core( - device->id(), l_rx_phys_core[i], test_results_addr, test_results_size)); - log_info( - LogTest, - "RX{} status = {}", - i, - packet_queue_test_status_to_string(l_rx_results[i][PQ_TEST_STATUS_INDEX])); - pass &= (l_rx_results[i][PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - } - - r_mux_results = - tt::llrt::read_hex_vec_from_core(device_r->id(), mux_phys_core, test_results_addr, test_results_size); - log_info( - LogTest, "MUX status = 
{}", packet_queue_test_status_to_string(r_mux_results[PQ_TEST_STATUS_INDEX])); - pass &= (r_mux_results[PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - - l_demux_results = - tt::llrt::read_hex_vec_from_core(device->id(), demux_phys_core, test_results_addr, test_results_size); - log_info( - LogTest, - "DEMUX status = {}", - packet_queue_test_status_to_string(l_demux_results[PQ_TEST_STATUS_INDEX])); - pass &= (l_demux_results[0] == PACKET_QUEUE_TEST_PASS); - } - - pass &= tt_metal::CloseDevice(device); - pass &= tt_metal::CloseDevice(device_r); - - if (pass && l_to_r) { - double total_tx_bw = 0.0; - uint64_t total_tx_words_sent = 0; - uint64_t total_rx_words_checked = 0; - for (uint32_t i = 0; i < num_src_endpoints; i++) { - uint64_t tx_words_sent = get_64b_result(l_tx_results[i], PQ_TEST_WORD_CNT_INDEX); - total_tx_words_sent += tx_words_sent; - uint64_t tx_elapsed_cycles = get_64b_result(l_tx_results[i], PQ_TEST_CYCLES_INDEX); - double tx_bw = ((double)tx_words_sent) * PACKET_WORD_SIZE_BYTES / tx_elapsed_cycles; - log_info( - LogTest, - "TX {} words sent = {}, elapsed cycles = {} -> BW = {:.2f} B/cycle", - i, - tx_words_sent, - tx_elapsed_cycles, - tx_bw); - total_tx_bw += tx_bw; - } - log_info(LogTest, "Total TX BW = {:.2f} B/cycle", total_tx_bw); - double total_rx_bw = 0.0; - for (uint32_t i = 0; i < num_dest_endpoints; i++) { - uint64_t rx_words_checked = get_64b_result(r_rx_results[i], PQ_TEST_WORD_CNT_INDEX); - total_rx_words_checked += rx_words_checked; - uint64_t rx_elapsed_cycles = get_64b_result(r_rx_results[i], PQ_TEST_CYCLES_INDEX); - double rx_bw = ((double)rx_words_checked) * PACKET_WORD_SIZE_BYTES / rx_elapsed_cycles; - log_info( - LogTest, - "RX {} words checked = {}, elapsed cycles = {} -> BW = {:.2f} B/cycle", - i, - rx_words_checked, - rx_elapsed_cycles, - rx_bw); - total_rx_bw += rx_bw; - } - log_info(LogTest, "Total RX BW = {:.2f} B/cycle", total_rx_bw); - if (total_tx_words_sent != total_rx_words_checked) { - log_error( - LogTest, - "Total 
TX words sent = {} != Total RX words checked = {}", - total_tx_words_sent, - total_rx_words_checked); - pass = false; - } else { - log_info( - LogTest, - "Total TX words sent = {} == Total RX words checked = {} -> OK", - total_tx_words_sent, - total_rx_words_checked); - } - uint64_t mux_words_sent = get_64b_result(l_mux_results, PQ_TEST_WORD_CNT_INDEX); - uint64_t mux_elapsed_cycles = get_64b_result(l_mux_results, PQ_TEST_CYCLES_INDEX); - uint64_t mux_iter = get_64b_result(l_mux_results, PQ_TEST_ITER_INDEX); - double mux_bw = ((double)mux_words_sent) * PACKET_WORD_SIZE_BYTES / mux_elapsed_cycles; - double mux_cycles_per_iter = ((double)mux_elapsed_cycles) / mux_iter; - log_info( - LogTest, - "MUX words sent = {}, elapsed cycles = {} -> BW = {:.2f} B/cycle", - mux_words_sent, - mux_elapsed_cycles, - mux_bw); - log_info(LogTest, "MUX iters = {} -> cycles/iter = {:.1f}", mux_iter, mux_cycles_per_iter); - if (mux_words_sent != total_rx_words_checked) { - log_error( - LogTest, - "MUX words sent = {} != Total RX words checked = {}", - mux_words_sent, - total_rx_words_checked); - pass = false; - } else { - log_info( - LogTest, - "MUX words sent = {} == Total RX words checked = {} -> OK", - mux_words_sent, - total_rx_words_checked); - } - - uint64_t demux_words_sent = get_64b_result(r_demux_results, PQ_TEST_WORD_CNT_INDEX); - uint64_t demux_elapsed_cycles = get_64b_result(r_demux_results, PQ_TEST_CYCLES_INDEX); - double demux_bw = ((double)demux_words_sent) * PACKET_WORD_SIZE_BYTES / demux_elapsed_cycles; - uint64_t demux_iter = get_64b_result(r_demux_results, PQ_TEST_ITER_INDEX); - double demux_cycles_per_iter = ((double)demux_elapsed_cycles) / demux_iter; - log_info( - LogTest, - "DEMUX words sent = {}, elapsed cycles = {} -> BW = {:.2f} B/cycle", - demux_words_sent, - demux_elapsed_cycles, - demux_bw); - log_info(LogTest, "DEMUX iters = {} -> cycles/iter = {:.1f}", demux_iter, demux_cycles_per_iter); - if (demux_words_sent != total_rx_words_checked) { - log_error( - 
LogTest, - "DEMUX words sent = {} != Total RX words checked = {}", - demux_words_sent, - total_rx_words_checked); - pass = false; - } else { - log_info( - LogTest, - "DEMUX words sent = {} == Total RX words checked = {} -> OK", - demux_words_sent, - total_rx_words_checked); - } - } - - } catch (const std::exception& e) { - pass = false; - log_fatal(e.what()); - } - - tt::llrt::RunTimeOptions::get_instance().set_kernels_nullified(false); - - if (pass) { - log_info(LogTest, "Test Passed"); - return 0; - } else { - log_fatal(LogTest, "Test Failed\n"); - return 1; - } -} diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_2cq.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_2cq.cpp deleted file mode 100644 index b455914329ee..000000000000 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_2cq.cpp +++ /dev/null @@ -1,848 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include "tt_metal/host_api.hpp" -#include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/llrt/rtoptions.hpp" -#include "tt_metal/impl/dispatch/cq_commands.hpp" -#include "tt_metal/impl/device/device.hpp" -#include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" -#include "kernels/traffic_gen_test.hpp" - -using std::vector; -using namespace tt; - -int main(int argc, char** argv) { - constexpr uint32_t default_tx_x = 0; - constexpr uint32_t default_tx_y = 0; - constexpr uint32_t default_rx_x = 0; - constexpr uint32_t default_rx_y = 3; - - constexpr uint32_t default_mux_x = 0; - constexpr uint32_t default_mux_y = 1; - constexpr uint32_t default_demux_x = 0; - constexpr uint32_t default_demux_y = 2; - - constexpr uint32_t default_tunneler_x = 0; - constexpr uint32_t default_tunneler_y = 0; - - constexpr uint32_t default_prng_seed = 0x100; - constexpr uint32_t default_data_kb_per_tx = 16 * 1024; - constexpr uint32_t default_max_packet_size_words = 0x100; - - constexpr uint32_t 
default_tx_queue_start_addr = 0x80000; - constexpr uint32_t default_tx_queue_size_bytes = 0x10000; - constexpr uint32_t default_rx_queue_start_addr = 0xa0000; - constexpr uint32_t default_rx_queue_size_bytes = 0x20000; - constexpr uint32_t default_mux_queue_start_addr = 0x80000; - constexpr uint32_t default_mux_queue_size_bytes = 0x10000; - constexpr uint32_t default_demux_queue_start_addr = 0x90000; - constexpr uint32_t default_demux_queue_size_bytes = 0x20000; - - constexpr uint32_t default_tunneler_queue_start_addr = 0x19000; - constexpr uint32_t default_tunneler_queue_size_bytes = 0x10000; - - constexpr uint32_t default_test_results_addr = 0x100000; - constexpr uint32_t default_test_results_size = 0x40000; - - constexpr uint32_t default_tunneler_test_results_addr = 0x39000; - constexpr uint32_t default_tunneler_test_results_size = 0x7000; - - constexpr uint32_t default_timeout_mcycles = 1000; - constexpr uint32_t default_rx_disable_data_check = 0; - - constexpr uint32_t src_endpoint_start_id = 0xaa; - constexpr uint32_t dest_endpoint_start_id = 0xbb; - - constexpr uint32_t num_src_endpoints = 2; - constexpr uint32_t num_dest_endpoints = 2; - - constexpr uint32_t default_test_device_id = 0; - - constexpr uint32_t default_tunnel_mode = 0; - - std::vector input_args(argv, argv + argc); - if (test_args::has_command_option(input_args, "-h") || test_args::has_command_option(input_args, "--help")) { - log_info(LogTest, "Usage:"); - log_info(LogTest, " --prng_seed: PRNG seed, default = 0x{:x}", default_prng_seed); - log_info(LogTest, " --data_kb_per_tx: Total data in KB per TX endpoint, default = {}", default_data_kb_per_tx); - log_info( - LogTest, - " --max_packet_size_words: Max packet size in words, default = 0x{:x}", - default_max_packet_size_words); - log_info(LogTest, " --tx_x: X coordinate of the starting TX core, default = {}", default_tx_x); - log_info(LogTest, " --tx_y: Y coordinate of the starting TX core, default = {}", default_tx_y); - log_info(LogTest, " 
--rx_x: X coordinate of the starting RX core, default = {}", default_rx_x); - log_info(LogTest, " --rx_y: Y coordinate of the starting RX core, default = {}", default_rx_y); - log_info(LogTest, " --mux_x: X coordinate of the starting mux core, default = {}", default_mux_x); - log_info(LogTest, " --mux_y: Y coordinate of the starting mux core, default = {}", default_mux_y); - log_info(LogTest, " --demux_x: X coordinate of the starting demux core, default = {}", default_demux_x); - log_info(LogTest, " --demux_y: Y coordinate of the starting demux core, default = {}", default_demux_y); - log_info( - LogTest, " --tx_queue_start_addr: TX queue start address, default = 0x{:x}", default_tx_queue_start_addr); - log_info( - LogTest, " --tx_queue_size_bytes: TX queue size in bytes, default = 0x{:x}", default_tx_queue_size_bytes); - log_info( - LogTest, " --rx_queue_start_addr: RX queue start address, default = 0x{:x}", default_rx_queue_start_addr); - log_info( - LogTest, " --rx_queue_size_bytes: RX queue size in bytes, default = 0x{:x}", default_rx_queue_size_bytes); - log_info( - LogTest, - " --mux_queue_start_addr: MUX queue start address, default = 0x{:x}", - default_mux_queue_start_addr); - log_info( - LogTest, - " --mux_queue_size_bytes: MUX queue size in bytes, default = 0x{:x}", - default_mux_queue_size_bytes); - log_info( - LogTest, - " --demux_queue_start_addr: DEMUX queue start address, default = 0x{:x}", - default_demux_queue_start_addr); - log_info( - LogTest, - " --demux_queue_size_bytes: DEMUX queue size in bytes, default = 0x{:x}", - default_demux_queue_size_bytes); - log_info( - LogTest, " --test_results_addr: test results buf address, default = 0x{:x}", default_test_results_addr); - log_info(LogTest, " --test_results_size: test results buf size, default = 0x{:x}", default_test_results_size); - log_info(LogTest, " --timeout_mcycles: Timeout in MCycles, default = {}", default_timeout_mcycles); - log_info( - LogTest, - " --rx_disable_data_check: Disable data 
check on RX, default = {}", - default_rx_disable_data_check); - log_info(LogTest, " --device_id: Device on which the test will be run, default = {}", default_test_device_id); - log_info(LogTest, " --tunnel_mode: 0: Bidirectional. 1 L->R, 2 R->L, default = {}", default_tunnel_mode); - - return 0; - } - - uint32_t tx_x = test_args::get_command_option_uint32(input_args, "--tx_x", default_tx_x); - uint32_t tx_y = test_args::get_command_option_uint32(input_args, "--tx_y", default_tx_y); - uint32_t rx_x = test_args::get_command_option_uint32(input_args, "--rx_x", default_rx_x); - uint32_t rx_y = test_args::get_command_option_uint32(input_args, "--rx_y", default_rx_y); - uint32_t mux_x = test_args::get_command_option_uint32(input_args, "--mux_x", default_mux_x); - uint32_t mux_y = test_args::get_command_option_uint32(input_args, "--mux_y", default_mux_y); - uint32_t demux_x = test_args::get_command_option_uint32(input_args, "--demux_x", default_demux_x); - uint32_t demux_y = test_args::get_command_option_uint32(input_args, "--demux_y", default_demux_y); - uint32_t tunneler_x = test_args::get_command_option_uint32(input_args, "--tunneler_x", default_tunneler_x); - uint32_t tunneler_y = test_args::get_command_option_uint32(input_args, "--tunneler_y", default_tunneler_y); - uint32_t prng_seed = test_args::get_command_option_uint32(input_args, "--prng_seed", default_prng_seed); - uint32_t data_kb_per_tx = - test_args::get_command_option_uint32(input_args, "--data_kb_per_tx", default_data_kb_per_tx); - uint32_t max_packet_size_words = - test_args::get_command_option_uint32(input_args, "--max_packet_size_words", default_max_packet_size_words); - uint32_t tx_queue_start_addr = - test_args::get_command_option_uint32(input_args, "--tx_queue_start_addr", default_tx_queue_start_addr); - uint32_t tx_queue_size_bytes = - test_args::get_command_option_uint32(input_args, "--tx_queue_size_bytes", default_tx_queue_size_bytes); - uint32_t rx_queue_start_addr = - 
test_args::get_command_option_uint32(input_args, "--rx_queue_start_addr", default_rx_queue_start_addr); - uint32_t rx_queue_size_bytes = - test_args::get_command_option_uint32(input_args, "--rx_queue_size_bytes", default_rx_queue_size_bytes); - uint32_t mux_queue_start_addr = - test_args::get_command_option_uint32(input_args, "--mux_queue_start_addr", default_mux_queue_start_addr); - uint32_t mux_queue_size_bytes = - test_args::get_command_option_uint32(input_args, "--mux_queue_size_bytes", default_mux_queue_size_bytes); - uint32_t demux_queue_start_addr = - test_args::get_command_option_uint32(input_args, "--demux_queue_start_addr", default_demux_queue_start_addr); - uint32_t demux_queue_size_bytes = - test_args::get_command_option_uint32(input_args, "--demux_queue_size_bytes", default_demux_queue_size_bytes); - uint32_t tunneler_queue_start_addr = test_args::get_command_option_uint32( - input_args, "--tunneler_queue_start_addr", default_tunneler_queue_start_addr); - uint32_t tunneler_queue_size_bytes = test_args::get_command_option_uint32( - input_args, "--tunneler_queue_size_bytes", default_tunneler_queue_size_bytes); - uint32_t test_results_addr = - test_args::get_command_option_uint32(input_args, "--test_results_addr", default_test_results_addr); - uint32_t test_results_size = - test_args::get_command_option_uint32(input_args, "--test_results_size", default_test_results_size); - uint32_t tunneler_test_results_addr = test_args::get_command_option_uint32( - input_args, "--tunneler_test_results_addr", default_tunneler_test_results_addr); - uint32_t tunneler_test_results_size = test_args::get_command_option_uint32( - input_args, "--tunneler_test_results_size", default_tunneler_test_results_size); - uint32_t timeout_mcycles = - test_args::get_command_option_uint32(input_args, "--timeout_mcycles", default_timeout_mcycles); - uint32_t rx_disable_data_check = - test_args::get_command_option_uint32(input_args, "--rx_disable_data_check", default_rx_disable_data_check); 
- uint32_t test_device_id = test_args::get_command_option_uint32(input_args, "--device_id", default_test_device_id); - uint32_t tunnel_mode = test_args::get_command_option_uint32(input_args, "--tunnel_mode", default_tunnel_mode); - - bool pass = true; - bool l_to_r = tunnel_mode == 1 || tunnel_mode == 0; - bool r_to_l = tunnel_mode == 2 || tunnel_mode == 0; - try { - int num_devices = tt_metal::GetNumAvailableDevices(); - if (test_device_id >= num_devices) { - log_info(LogTest, "Device {} is not valid. Highest valid device id = {}.", test_device_id, num_devices - 1); - throw std::runtime_error("Invalid Device Id."); - } - int device_id_l = test_device_id; - - tt_metal::Device* device = tt_metal::CreateDevice(device_id_l); - auto const& device_active_eth_cores = device->get_active_ethernet_cores(); - - if (device_active_eth_cores.size() == 0) { - log_info( - LogTest, - "Device {} does not have enough active cores. Need 1 active ethernet core for this test.", - device_id_l); - tt_metal::CloseDevice(device); - throw std::runtime_error("Test cannot run on specified device."); - } - - auto eth_core_iter = device_active_eth_cores.begin(); - auto [device_id_r, eth_receiver_core] = device->get_connected_ethernet_core(*eth_core_iter); - - tt_metal::Device* device_r = tt_metal::CreateDevice(device_id_r); - - CoreCoord tunneler_logical_core = device->get_ethernet_sockets(device_id_r)[0]; - CoreCoord tunneler_phys_core = device->ethernet_core_from_logical_core(tunneler_logical_core); - - CoreCoord r_tunneler_logical_core = device_r->get_ethernet_sockets(device_id_l)[0]; - CoreCoord r_tunneler_phys_core = device_r->ethernet_core_from_logical_core(r_tunneler_logical_core); - - std::cout << "Left Tunneler = " << tunneler_logical_core.str() << std::endl; - std::cout << "Right Tunneler = " << r_tunneler_logical_core.str() << std::endl; - - tt_metal::Program program = tt_metal::CreateProgram(); - tt_metal::Program program_r = tt_metal::CreateProgram(); - - CoreCoord mux_core = 
{mux_x, mux_y}; - CoreCoord mux_phys_core = device->worker_core_from_logical_core(mux_core); - - CoreCoord demux_core = {demux_x, demux_y}; - CoreCoord demux_phys_core = device_r->worker_core_from_logical_core(demux_core); - - // tx on left chip - std::vector l_tx_phys_core; - for (uint32_t i = 0; i < num_src_endpoints; i++) { - CoreCoord core = {tx_x+i, tx_y}; - l_tx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = - { - src_endpoint_start_id + i, // 0: src_endpoint_id - num_dest_endpoints, // 1: num_dest_endpoints - (tx_queue_start_addr >> 4), // 2: queue_start_addr_words - (tx_queue_size_bytes >> 4), // 3: queue_size_words - ((mux_queue_start_addr + i*mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words - (uint32_t)mux_phys_core.x, // 6: remote_rx_x - (uint32_t)mux_phys_core.y, // 7: remote_rx_y - i, // 8: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type - test_results_addr, // 10: test_results_addr - test_results_size, // 11: test_results_size - prng_seed, // 12: prng_seed - data_kb_per_tx, // 13: total_data_kb - max_packet_size_words, // 14: max_packet_size_words - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; - if (l_to_r) { - log_info(LogTest, "run traffic_gen_tx at x={},y={}", core.x, core.y); - auto kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_tx.cpp", - {core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = compile_args, - .defines = {}}); - } - } - - // tx on right chip - std::vector r_tx_phys_core; - for (uint32_t i = 0; i < num_src_endpoints; i++) { - CoreCoord core = {tx_x+i, tx_y}; - 
r_tx_phys_core.push_back(device_r->worker_core_from_logical_core(core)); - std::vector compile_args = - { - src_endpoint_start_id + i, // 0: src_endpoint_id - num_dest_endpoints, // 1: num_dest_endpoints - (tx_queue_start_addr >> 4), // 2: queue_start_addr_words - (tx_queue_size_bytes >> 4), // 3: queue_size_words - ((mux_queue_start_addr + i*mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words - (uint32_t)mux_phys_core.x, // 6: remote_rx_x - (uint32_t)mux_phys_core.y, // 7: remote_rx_y - i, // 8: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type - test_results_addr, // 10: test_results_addr - test_results_size, // 11: test_results_size - prng_seed, // 12: prng_seed - data_kb_per_tx, // 13: total_data_kb - max_packet_size_words, // 14: max_packet_size_words - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; - if (r_to_l) { - log_info(LogTest, "run traffic_gen_tx at x={},y={}", core.x, core.y); - auto kernel = tt_metal::CreateKernel( - program_r, - "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_tx.cpp", - {core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = compile_args, - .defines = {}}); - } - } - - // Mux Left - std::vector l_mux_compile_args = { - 0, // 0: reserved - (mux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 2: rx_queue_size_words - num_src_endpoints, // 3: mux_fan_in - packet_switch_4B_pack( - (uint32_t)l_tx_phys_core[0].x, - (uint32_t)l_tx_phys_core[0].y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: src 0 info - packet_switch_4B_pack( - (uint32_t)l_tx_phys_core[1].x, - (uint32_t)l_tx_phys_core[1].y, - 1, - 
(uint32_t)DispatchRemoteNetworkType::NOC0), // 5: src 1 info - 0, // 6: src 2 info - 0, // 7: src 3 info - (tunneler_queue_start_addr >> 4), // 8: remote_tx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words - (uint32_t)tunneler_phys_core.x, // 10: remote_tx_x - (uint32_t)tunneler_phys_core.y, // 11: remote_tx_y - 0, // 12: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 13: tx_network_type - test_results_addr, // 14: test_results_addr - test_results_size, // 15: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 16: timeout_cycles - }; - if (l_to_r) { - log_info(LogTest, "run mux at x={},y={}", mux_core.x, mux_core.y); - auto l_mux_kernel = tt_metal::CreateKernel( - program, - "tt_metal/impl/dispatch/kernels/packet_mux.cpp", - {mux_core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = l_mux_compile_args, - .defines = {}}); - } - // Mux Right - std::vector r_mux_compile_args = { - 0, // 0: reserved - (mux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 2: rx_queue_size_words - num_src_endpoints, // 3: mux_fan_in - packet_switch_4B_pack( - (uint32_t)r_tx_phys_core[0].x, - (uint32_t)r_tx_phys_core[0].y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: src 0 info - packet_switch_4B_pack( - (uint32_t)r_tx_phys_core[1].x, - (uint32_t)r_tx_phys_core[1].y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: src 1 info - 0, // 6: src 2 info - 0, // 7: src 3 info - l_to_r ? ((tunneler_queue_start_addr + tunneler_queue_size_bytes) >> 4) - : (tunneler_queue_start_addr >> 4), // 8: remote_tx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words - (uint32_t)r_tunneler_phys_core.x, // 10: remote_tx_x - (uint32_t)r_tunneler_phys_core.y, // 11: remote_tx_y - l_to_r ? 
(uint32_t)1 : (uint32_t)0, // 12: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 13: tx_network_type - test_results_addr, // 14: test_results_addr - test_results_size, // 15: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 16: timeout_cycles - }; - if (r_to_l) { - log_info(LogTest, "run mux at x={},y={}", mux_core.x, mux_core.y); - auto r_mux_kernel = tt_metal::CreateKernel( - program_r, - "tt_metal/impl/dispatch/kernels/packet_mux.cpp", - {mux_core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = r_mux_compile_args, - .defines = {}}); - } - - uint32_t tunnel_receiver_eth = packet_switch_4B_pack( - r_tunneler_phys_core.x, r_tunneler_phys_core.y, 0, (uint32_t)DispatchRemoteNetworkType::ETH); - uint32_t receiver_eth_q_addr = tunneler_queue_start_addr >> 4; - uint32_t receiver_eth_q_size = tunneler_queue_size_bytes >> 4; - - uint32_t tunnel_receiver_noc = packet_switch_4B_pack( - demux_phys_core.x, demux_phys_core.y, num_dest_endpoints, (uint32_t)DispatchRemoteNetworkType::NOC0); - uint32_t receiver_noc_q_addr = demux_queue_start_addr >> 4; - uint32_t receiver_noc_q_size = demux_queue_size_bytes >> 4; - - uint32_t tunnel_sender_noc = packet_switch_4B_pack( - mux_phys_core.x, mux_phys_core.y, num_dest_endpoints, (uint32_t)DispatchRemoteNetworkType::NOC0); - uint32_t tunnel_sender_eth = packet_switch_4B_pack( - r_tunneler_phys_core.x, - r_tunneler_phys_core.y, - l_to_r ? (uint32_t)3 : (uint32_t)2, - (uint32_t)DispatchRemoteNetworkType::ETH); - - std::vector tunneler_l_compile_args = { - dest_endpoint_start_id, // 0: endpoint_id_start_index - l_to_r && r_to_l ? (uint32_t)2 : (uint32_t)1, // 1: tunnel_lanes. 1 => Unidirectional. 2 => Bidirectional. - (tunneler_queue_start_addr >> 4), // 2: rx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 3: rx_queue_size_words - l_to_r ? 
tunnel_receiver_eth : tunnel_receiver_noc, // 4: remote_receiver_0_info - tunnel_receiver_noc, // 5: remote_receiver_1_info - l_to_r ? receiver_eth_q_addr : receiver_noc_q_addr, // 6: remote_receiver_queue_start_addr_words 0 - l_to_r ? receiver_eth_q_size : receiver_noc_q_size, // 7: remote_receiver_queue_size_words 0 - receiver_noc_q_addr, // 8: remote_receiver_queue_start_addr_words 1 - receiver_noc_q_size, // 9: remote_receiver_queue_size_words 1 - l_to_r ? tunnel_sender_noc : tunnel_sender_eth, // 10: remote_sender_0_info - tunnel_sender_eth, // 11: remote_sender_1_info - tunneler_test_results_addr, // 12: test_results_addr - tunneler_test_results_size, // 13: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 14: timeout_cycles - }; - - auto tunneler_l_kernel = tt_metal::CreateKernel( - program, - "tt_metal/impl/dispatch/kernels/eth_tunneler.cpp", - tunneler_logical_core, - tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0, .compile_args = tunneler_l_compile_args}); - - tunnel_receiver_eth = packet_switch_4B_pack( - tunneler_phys_core.x, tunneler_phys_core.y, l_to_r ? 1 : 0, (uint32_t)DispatchRemoteNetworkType::ETH); - receiver_eth_q_addr = - l_to_r ? ((tunneler_queue_start_addr + tunneler_queue_size_bytes) >> 4) : tunneler_queue_start_addr >> 4; - - tunnel_sender_eth = packet_switch_4B_pack( - tunneler_phys_core.x, tunneler_phys_core.y, 2, (uint32_t)DispatchRemoteNetworkType::ETH); - std::vector tunneler_r_compile_args = { - dest_endpoint_start_id, // 0: endpoint_id_start_index - l_to_r && r_to_l ? (uint32_t)2 : (uint32_t)1, // 1: tunnel_lanes. 1 => Unidirectional. 2 => Bidirectional. - (tunneler_queue_start_addr >> 4), // 2: rx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 3: rx_queue_size_words - l_to_r ? tunnel_receiver_noc : tunnel_receiver_eth, // 4: remote_receiver_0_info - tunnel_receiver_eth, // 5: remote_receiver_1_info - l_to_r ? 
receiver_noc_q_addr : receiver_eth_q_addr, // 6: remote_receiver_queue_start_addr_words 0 - l_to_r ? receiver_noc_q_size : receiver_eth_q_size, // 7: remote_receiver_queue_size_words 0 - receiver_eth_q_addr, // 8: remote_receiver_queue_start_addr_words 1 - receiver_eth_q_size, // 9: remote_receiver_queue_size_words 1 - l_to_r ? tunnel_sender_eth : tunnel_sender_noc, // 10: remote_sender_0_info - tunnel_sender_noc, // 11: remote_sender_1_info - tunneler_test_results_addr, // 12: test_results_addr - tunneler_test_results_size, // 13: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 14: timeout_cycles - }; - - auto tunneler_r_kernel = tt_metal::CreateKernel( - program_r, - "tt_metal/impl/dispatch/kernels/eth_tunneler.cpp", - r_tunneler_logical_core, - tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0, .compile_args = tunneler_r_compile_args}); - - // Rx Right - std::vector r_rx_phys_core; - for (uint32_t i = 0; i < num_dest_endpoints; i++) { - CoreCoord core = {rx_x+i, rx_y}; - r_rx_phys_core.push_back(device_r->worker_core_from_logical_core(core)); - std::vector compile_args = - { - dest_endpoint_start_id + i, // 0: dest_endpoint_id - num_src_endpoints, // 1: num_src_endpoints - num_dest_endpoints, // 2: num_dest_endpoints - (rx_queue_start_addr >> 4), // 3: queue_start_addr_words - (rx_queue_size_bytes >> 4), // 4: queue_size_words - (uint32_t)demux_phys_core.x, // 5: remote_tx_x - (uint32_t)demux_phys_core.y, // 6: remote_tx_y - i, // 7: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type - test_results_addr, // 9: test_results_addr - test_results_size, // 10: test_results_size - prng_seed, // 11: prng_seed - 0, // 12: reserved - max_packet_size_words, // 13: max_packet_size_words - rx_disable_data_check, // 14: disable data check - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; - if 
(l_to_r) { - log_info(LogTest, "run traffic_gen_rx at x={},y={}", core.x, core.y); - auto kernel = tt_metal::CreateKernel( - program_r, - "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_rx.cpp", - {core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = compile_args, - .defines = {}}); - } - } - - // Rx Left - std::vector l_rx_phys_core; - for (uint32_t i = 0; i < num_dest_endpoints; i++) { - CoreCoord core = {rx_x+i, rx_y}; - l_rx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = - { - dest_endpoint_start_id + i, // 0: dest_endpoint_id - num_src_endpoints, // 1: num_src_endpoints - num_dest_endpoints, // 2: num_dest_endpoints - (rx_queue_start_addr >> 4), // 3: queue_start_addr_words - (rx_queue_size_bytes >> 4), // 4: queue_size_words - (uint32_t)demux_phys_core.x, // 5: remote_tx_x - (uint32_t)demux_phys_core.y, // 6: remote_tx_y - i, // 7: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type - test_results_addr, // 9: test_results_addr - test_results_size, // 10: test_results_size - prng_seed, // 11: prng_seed - 0, // 12: reserved - max_packet_size_words, // 13: max_packet_size_words - rx_disable_data_check, // 14: disable data check - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; - if (r_to_l) { - log_info(LogTest, "run traffic_gen_rx at x={},y={}", core.x, core.y); - auto kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_rx.cpp", - {core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = compile_args, - .defines = {}}); - } - } - - // Demux Right - uint32_t 
dest_map_array[4] = {0, 1, 2, 3}; - uint64_t dest_endpoint_output_map = packet_switch_dest_pack(dest_map_array, 4); - std::vector r_demux_compile_args = { - dest_endpoint_start_id, // 0: endpoint_id_start_index - (demux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words - (demux_queue_size_bytes >> 4), // 2: rx_queue_size_words - num_dest_endpoints, // 3: demux_fan_out - packet_switch_4B_pack( - r_rx_phys_core[0].x, - r_rx_phys_core[0].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: remote_tx_0_info - packet_switch_4B_pack( - r_rx_phys_core[1].x, - r_rx_phys_core[1].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: remote_tx_1_info - 0, // 6: remote_tx_2_info - 0, // 7: remote_tx_3_info - (rx_queue_start_addr >> 4), // 8: remote_tx_queue_start_addr_words 0 - (rx_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words 0 - (rx_queue_start_addr >> 4), // 10: remote_tx_queue_start_addr_words 1 - (rx_queue_size_bytes >> 4), // 11: remote_tx_queue_size_words 1 - (rx_queue_start_addr >> 4), // 12: remote_tx_queue_start_addr_words 2 - (rx_queue_size_bytes >> 4), // 13: remote_tx_queue_size_words 2 - (rx_queue_start_addr >> 4), // 14: remote_tx_queue_start_addr_words 3 - (rx_queue_size_bytes >> 4), // 15: remote_tx_queue_size_words 3 - (uint32_t)r_tunneler_phys_core.x, // 16: remote_rx_x - (uint32_t)r_tunneler_phys_core.y, // 17: remote_rx_y - 2, // 18: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 19: tx_network_type - (uint32_t)(dest_endpoint_output_map >> 32), // 20: dest_endpoint_output_map_hi - (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF), // 21: dest_endpoint_output_map_lo - test_results_addr, // 22: test_results_addr - test_results_size, // 23: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 24: timeout_cycles - }; - if (l_to_r) { - log_info(LogTest, "run demux at x={},y={}", demux_core.x, demux_core.y); - auto r_demux_kernel = tt_metal::CreateKernel( - program_r, - 
"tt_metal/impl/dispatch/kernels/packet_demux.cpp", - {demux_core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = r_demux_compile_args, - .defines = {}}); - } - // Demux Left - std::vector l_demux_compile_args = { - dest_endpoint_start_id, // 0: endpoint_id_start_index - (demux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words - (demux_queue_size_bytes >> 4), // 2: rx_queue_size_words - num_dest_endpoints, // 3: demux_fan_out - packet_switch_4B_pack( - l_rx_phys_core[0].x, - l_rx_phys_core[0].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: remote_tx_0_info - packet_switch_4B_pack( - l_rx_phys_core[1].x, - l_rx_phys_core[1].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: remote_tx_1_info - 0, // 6: remote_tx_2_info - 0, // 7: remote_tx_3_info - (rx_queue_start_addr >> 4), // 8: remote_tx_queue_start_addr_words 0 - (rx_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words 0 - (rx_queue_start_addr >> 4), // 10: remote_tx_queue_start_addr_words 1 - (rx_queue_size_bytes >> 4), // 11: remote_tx_queue_size_words 1 - (rx_queue_start_addr >> 4), // 12: remote_tx_queue_start_addr_words 2 - (rx_queue_size_bytes >> 4), // 13: remote_tx_queue_size_words 2 - (rx_queue_start_addr >> 4), // 14: remote_tx_queue_start_addr_words 3 - (rx_queue_size_bytes >> 4), // 15: remote_tx_queue_size_words 3 - (uint32_t)tunneler_phys_core.x, // 16: remote_rx_x - (uint32_t)tunneler_phys_core.y, // 17: remote_rx_y - l_to_r ? 
(uint32_t)3 : (uint32_t)2, // 18: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 19: tx_network_type - (uint32_t)(dest_endpoint_output_map >> 32), // 20: dest_endpoint_output_map_hi - (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF), // 21: dest_endpoint_output_map_lo - test_results_addr, // 22: test_results_addr - test_results_size, // 23: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 24: timeout_cycles - }; - if (r_to_l) { - log_info(LogTest, "run demux at x={},y={}", demux_core.x, demux_core.y); - auto l_demux_kernel = tt_metal::CreateKernel( - program, - "tt_metal/impl/dispatch/kernels/packet_demux.cpp", - {demux_core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = l_demux_compile_args, - .defines = {}}); - } - - log_info(LogTest, "Starting test..."); - - auto start = std::chrono::system_clock::now(); - tt_metal::detail::LaunchProgram(device, program, false); - tt_metal::detail::LaunchProgram(device_r, program_r, false); - tt_metal::detail::WaitProgramDone(device, program); - tt_metal::detail::WaitProgramDone(device_r, program_r); - auto end = std::chrono::system_clock::now(); - - std::chrono::duration elapsed_seconds = (end - start); - log_info(LogTest, "Ran in {:.2f}us", elapsed_seconds.count() * 1000 * 1000); - - vector> l_tx_results; - vector> l_rx_results; - vector l_mux_results; - vector l_demux_results; - vector> r_tx_results; - vector> r_rx_results; - vector r_mux_results; - vector r_demux_results; - - if (l_to_r) { - for (uint32_t i = 0; i < num_src_endpoints; i++) { - l_tx_results.push_back(tt::llrt::read_hex_vec_from_core( - device->id(), l_tx_phys_core[i], test_results_addr, test_results_size)); - log_info( - LogTest, - "TX{} status = {}", - i, - packet_queue_test_status_to_string(l_tx_results[i][PQ_TEST_STATUS_INDEX])); - pass &= (l_tx_results[i][PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - } - - for 
(uint32_t i = 0; i < num_dest_endpoints; i++) { - r_rx_results.push_back(tt::llrt::read_hex_vec_from_core( - device_r->id(), r_rx_phys_core[i], test_results_addr, test_results_size)); - log_info( - LogTest, - "RX{} status = {}", - i, - packet_queue_test_status_to_string(r_rx_results[i][PQ_TEST_STATUS_INDEX])); - pass &= (r_rx_results[i][PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - } - - l_mux_results = - tt::llrt::read_hex_vec_from_core(device->id(), mux_phys_core, test_results_addr, test_results_size); - log_info( - LogTest, "MUX status = {}", packet_queue_test_status_to_string(l_mux_results[PQ_TEST_STATUS_INDEX])); - pass &= (l_mux_results[PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - - r_demux_results = - tt::llrt::read_hex_vec_from_core(device_r->id(), demux_phys_core, test_results_addr, test_results_size); - log_info( - LogTest, - "DEMUX status = {}", - packet_queue_test_status_to_string(r_demux_results[PQ_TEST_STATUS_INDEX])); - pass &= (r_demux_results[0] == PACKET_QUEUE_TEST_PASS); - } - - if (r_to_l) { - for (uint32_t i = 0; i < num_src_endpoints; i++) { - r_tx_results.push_back(tt::llrt::read_hex_vec_from_core( - device_r->id(), r_tx_phys_core[i], test_results_addr, test_results_size)); - log_info( - LogTest, - "TX{} status = {}", - i, - packet_queue_test_status_to_string(r_tx_results[i][PQ_TEST_STATUS_INDEX])); - pass &= (r_tx_results[i][PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - } - - for (uint32_t i = 0; i < num_dest_endpoints; i++) { - l_rx_results.push_back(tt::llrt::read_hex_vec_from_core( - device->id(), l_rx_phys_core[i], test_results_addr, test_results_size)); - log_info( - LogTest, - "RX{} status = {}", - i, - packet_queue_test_status_to_string(l_rx_results[i][PQ_TEST_STATUS_INDEX])); - pass &= (l_rx_results[i][PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - } - - r_mux_results = - tt::llrt::read_hex_vec_from_core(device_r->id(), mux_phys_core, test_results_addr, test_results_size); - log_info( - LogTest, "MUX status = 
{}", packet_queue_test_status_to_string(r_mux_results[PQ_TEST_STATUS_INDEX])); - pass &= (r_mux_results[PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - - l_demux_results = - tt::llrt::read_hex_vec_from_core(device->id(), demux_phys_core, test_results_addr, test_results_size); - log_info( - LogTest, - "DEMUX status = {}", - packet_queue_test_status_to_string(l_demux_results[PQ_TEST_STATUS_INDEX])); - pass &= (l_demux_results[0] == PACKET_QUEUE_TEST_PASS); - } - - pass &= tt_metal::CloseDevice(device); - pass &= tt_metal::CloseDevice(device_r); - - if (pass && l_to_r) { - double total_tx_bw = 0.0; - uint64_t total_tx_words_sent = 0; - uint64_t total_rx_words_checked = 0; - for (uint32_t i = 0; i < num_src_endpoints; i++) { - uint64_t tx_words_sent = get_64b_result(l_tx_results[i], PQ_TEST_WORD_CNT_INDEX); - total_tx_words_sent += tx_words_sent; - uint64_t tx_elapsed_cycles = get_64b_result(l_tx_results[i], PQ_TEST_CYCLES_INDEX); - double tx_bw = ((double)tx_words_sent) * PACKET_WORD_SIZE_BYTES / tx_elapsed_cycles; - log_info( - LogTest, - "TX {} words sent = {}, elapsed cycles = {} -> BW = {:.2f} B/cycle", - i, - tx_words_sent, - tx_elapsed_cycles, - tx_bw); - total_tx_bw += tx_bw; - } - log_info(LogTest, "Total TX BW = {:.2f} B/cycle", total_tx_bw); - double total_rx_bw = 0.0; - for (uint32_t i = 0; i < num_dest_endpoints; i++) { - uint64_t rx_words_checked = get_64b_result(r_rx_results[i], PQ_TEST_WORD_CNT_INDEX); - total_rx_words_checked += rx_words_checked; - uint64_t rx_elapsed_cycles = get_64b_result(r_rx_results[i], PQ_TEST_CYCLES_INDEX); - double rx_bw = ((double)rx_words_checked) * PACKET_WORD_SIZE_BYTES / rx_elapsed_cycles; - log_info( - LogTest, - "RX {} words checked = {}, elapsed cycles = {} -> BW = {:.2f} B/cycle", - i, - rx_words_checked, - rx_elapsed_cycles, - rx_bw); - total_rx_bw += rx_bw; - } - log_info(LogTest, "Total RX BW = {:.2f} B/cycle", total_rx_bw); - if (total_tx_words_sent != total_rx_words_checked) { - log_error( - LogTest, - "Total 
TX words sent = {} != Total RX words checked = {}", - total_tx_words_sent, - total_rx_words_checked); - pass = false; - } else { - log_info( - LogTest, - "Total TX words sent = {} == Total RX words checked = {} -> OK", - total_tx_words_sent, - total_rx_words_checked); - } - uint64_t mux_words_sent = get_64b_result(l_mux_results, PQ_TEST_WORD_CNT_INDEX); - uint64_t mux_elapsed_cycles = get_64b_result(l_mux_results, PQ_TEST_CYCLES_INDEX); - uint64_t mux_iter = get_64b_result(l_mux_results, PQ_TEST_ITER_INDEX); - double mux_bw = ((double)mux_words_sent) * PACKET_WORD_SIZE_BYTES / mux_elapsed_cycles; - double mux_cycles_per_iter = ((double)mux_elapsed_cycles) / mux_iter; - log_info( - LogTest, - "MUX words sent = {}, elapsed cycles = {} -> BW = {:.2f} B/cycle", - mux_words_sent, - mux_elapsed_cycles, - mux_bw); - log_info(LogTest, "MUX iters = {} -> cycles/iter = {:.1f}", mux_iter, mux_cycles_per_iter); - if (mux_words_sent != total_rx_words_checked) { - log_error( - LogTest, - "MUX words sent = {} != Total RX words checked = {}", - mux_words_sent, - total_rx_words_checked); - pass = false; - } else { - log_info( - LogTest, - "MUX words sent = {} == Total RX words checked = {} -> OK", - mux_words_sent, - total_rx_words_checked); - } - - uint64_t demux_words_sent = get_64b_result(r_demux_results, PQ_TEST_WORD_CNT_INDEX); - uint64_t demux_elapsed_cycles = get_64b_result(r_demux_results, PQ_TEST_CYCLES_INDEX); - double demux_bw = ((double)demux_words_sent) * PACKET_WORD_SIZE_BYTES / demux_elapsed_cycles; - uint64_t demux_iter = get_64b_result(r_demux_results, PQ_TEST_ITER_INDEX); - double demux_cycles_per_iter = ((double)demux_elapsed_cycles) / demux_iter; - log_info( - LogTest, - "DEMUX words sent = {}, elapsed cycles = {} -> BW = {:.2f} B/cycle", - demux_words_sent, - demux_elapsed_cycles, - demux_bw); - log_info(LogTest, "DEMUX iters = {} -> cycles/iter = {:.1f}", demux_iter, demux_cycles_per_iter); - if (demux_words_sent != total_rx_words_checked) { - log_error( - 
LogTest, - "DEMUX words sent = {} != Total RX words checked = {}", - demux_words_sent, - total_rx_words_checked); - pass = false; - } else { - log_info( - LogTest, - "DEMUX words sent = {} == Total RX words checked = {} -> OK", - demux_words_sent, - total_rx_words_checked); - } - } - - } catch (const std::exception& e) { - pass = false; - log_fatal(e.what()); - } - - tt::llrt::RunTimeOptions::get_instance().set_kernels_nullified(false); - - if (pass) { - log_info(LogTest, "Test Passed"); - return 0; - } else { - log_fatal(LogTest, "Test Failed\n"); - return 1; - } -} diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp index 09de8d75e784..c518765895d5 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp @@ -10,12 +10,10 @@ #include "kernels/traffic_gen_test.hpp" #include "utils.hpp" -using std::vector; using namespace tt; int main(int argc, char **argv) { - bool pass = true; try { constexpr uint32_t default_tx_x = 0; @@ -30,11 +28,18 @@ int main(int argc, char **argv) { int device_id = 0; tt_metal::Device *device = tt_metal::CreateDevice(device_id); uint32_t l1_unreserved_base = device->get_base_allocator_addr(HalMemType::L1); - uint32_t default_test_result_buf_addr = l1_unreserved_base; + + uint32_t tx_input_ptrs_addr = l1_unreserved_base; + uint32_t tx_output_ptrs_addr = l1_unreserved_base + packet_queue_ptr_buffer_size; + uint32_t tx_output_mock_ptrs_addr = tx_output_ptrs_addr + packet_queue_ptr_buffer_size; + + uint32_t rx_input_ptrs_addr = tx_output_mock_ptrs_addr + packet_queue_ptr_buffer_size; + + uint32_t default_test_result_buf_addr = rx_input_ptrs_addr + packet_queue_ptr_buffer_size; constexpr uint32_t default_test_result_buf_size = 1024; - uint32_t default_tx_queue_start_addr = l1_unreserved_base + default_test_result_buf_size; + uint32_t 
default_tx_queue_start_addr = default_test_result_buf_addr + default_test_result_buf_size; constexpr uint32_t default_tx_queue_size_bytes = 0x10000; - uint32_t default_rx_queue_start_addr = l1_unreserved_base + 0x2000; + uint32_t default_rx_queue_start_addr = default_tx_queue_start_addr + default_tx_queue_size_bytes; constexpr uint32_t default_rx_queue_size_bytes = 0x20000; constexpr uint32_t default_timeout_mcycles = 1000; @@ -96,9 +101,6 @@ int main(int argc, char **argv) { uint32_t tx_data_sent_per_iter_low = test_args::get_command_option_uint32(input_args, "--tx_data_sent_per_iter_low", default_tx_data_sent_per_iter_low); uint32_t tx_data_sent_per_iter_high = test_args::get_command_option_uint32(input_args, "--tx_data_sent_per_iter_high", default_tx_data_sent_per_iter_high); - assert(is_power_of_2(tx_queue_size_bytes) && (tx_queue_size_bytes >= 1024)); - assert(is_power_of_2(rx_queue_size_bytes) && (rx_queue_size_bytes >= 1024)); - tt_metal::Program program = tt_metal::CreateProgram(); CoreCoord traffic_gen_tx_core = {tx_x, tx_y}; @@ -131,6 +133,10 @@ int main(int argc, char **argv) { tx_pkt_dest_size_choice, // 19: pkt_dest_size_choice tx_data_sent_per_iter_low, // 20: data_sent_per_iter_low tx_data_sent_per_iter_high, // 21: data_sent_per_iter_high + tx_input_ptrs_addr, // 22: traffic_gen_input_ptrs_addr + tx_output_mock_ptrs_addr, // 23: tx_output_mock_ptrs_addr + tx_output_ptrs_addr, // 24: traffic_gen_output_ptrs_addr + rx_input_ptrs_addr, // 25: traffic_gen_output_remote_ptrs_addr }; std::vector traffic_gen_rx_compile_args = @@ -154,6 +160,8 @@ int main(int argc, char **argv) { 0xbb, // 16: dest_endpoint_start_id timeout_mcycles * 1000 * 1000, // 17: timeout_cycles rx_disable_header_check, // 18: disable_header_check + rx_input_ptrs_addr, // 19: traffic_gen_input_ptrs_addr + tx_output_ptrs_addr, // 20: traffic_gen_input_remote_ptrs_addr }; std::map common_defines = { @@ -195,11 +203,11 @@ int main(int argc, char **argv) { std::chrono::duration 
elapsed_seconds = (end-start); log_info(LogTest, "Ran in {:.2f}us", elapsed_seconds.count() * 1000 * 1000); - vector tx_results = + std::vector tx_results = tt::llrt::read_hex_vec_from_core( device->id(), phys_traffic_gen_tx_core, test_result_buf_addr, test_result_buf_size); - vector rx_results = + std::vector rx_results = tt::llrt::read_hex_vec_from_core( device->id(), phys_traffic_gen_rx_core, test_result_buf_addr, test_result_buf_size); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel.cpp deleted file mode 100644 index ce36770ab4d8..000000000000 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel.cpp +++ /dev/null @@ -1,647 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include "tt_metal/host_api.hpp" -#include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/impl/device/device.hpp" -#include "tt_metal/llrt/rtoptions.hpp" -#include "tt_metal/impl/dispatch/cq_commands.hpp" -#include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" -#include "kernels/traffic_gen_test.hpp" - -using std::vector; -using namespace tt; - -int main(int argc, char** argv) { - constexpr uint32_t default_tx_x = 0; - constexpr uint32_t default_tx_y = 0; - constexpr uint32_t default_rx_x = 0; - constexpr uint32_t default_rx_y = 3; - - constexpr uint32_t default_mux_x = 0; - constexpr uint32_t default_mux_y = 1; - constexpr uint32_t default_demux_x = 0; - constexpr uint32_t default_demux_y = 2; - - constexpr uint32_t default_tunneler_x = 0; - constexpr uint32_t default_tunneler_y = 0; - - constexpr uint32_t default_prng_seed = 0x100; - constexpr uint32_t default_data_kb_per_tx = 16 * 1024; - constexpr uint32_t default_max_packet_size_words = 0x100; - - constexpr uint32_t default_tx_queue_start_addr = 0x80000; - constexpr uint32_t default_tx_queue_size_bytes = 0x10000; - constexpr uint32_t 
default_rx_queue_start_addr = 0xa0000; - constexpr uint32_t default_rx_queue_size_bytes = 0x20000; - constexpr uint32_t default_mux_queue_start_addr = 0x80000; - constexpr uint32_t default_mux_queue_size_bytes = 0x10000; - constexpr uint32_t default_demux_queue_start_addr = 0x90000; - constexpr uint32_t default_demux_queue_size_bytes = 0x20000; - - constexpr uint32_t default_tunneler_queue_start_addr = 0x19000; - constexpr uint32_t default_tunneler_queue_size_bytes = 0x10000; - - constexpr uint32_t default_test_results_addr = 0x100000; - constexpr uint32_t default_test_results_size = 0x40000; - - constexpr uint32_t default_tunneler_test_results_addr = 0x29000; - constexpr uint32_t default_tunneler_test_results_size = 0x8000; - - constexpr uint32_t default_timeout_mcycles = 1000; - constexpr uint32_t default_rx_disable_data_check = 0; - - constexpr uint32_t src_endpoint_start_id = 0xaa; - constexpr uint32_t dest_endpoint_start_id = 0xbb; - - constexpr uint32_t num_src_endpoints = 4; - constexpr uint32_t num_dest_endpoints = 4; - - constexpr uint32_t default_test_device_id = 0; - - std::vector input_args(argv, argv + argc); - if (test_args::has_command_option(input_args, "-h") || test_args::has_command_option(input_args, "--help")) { - log_info(LogTest, "Usage:"); - log_info(LogTest, " --prng_seed: PRNG seed, default = 0x{:x}", default_prng_seed); - log_info(LogTest, " --data_kb_per_tx: Total data in KB per TX endpoint, default = {}", default_data_kb_per_tx); - log_info( - LogTest, - " --max_packet_size_words: Max packet size in words, default = 0x{:x}", - default_max_packet_size_words); - log_info(LogTest, " --tx_x: X coordinate of the starting TX core, default = {}", default_tx_x); - log_info(LogTest, " --tx_y: Y coordinate of the starting TX core, default = {}", default_tx_y); - log_info(LogTest, " --rx_x: X coordinate of the starting RX core, default = {}", default_rx_x); - log_info(LogTest, " --rx_y: Y coordinate of the starting RX core, default = {}", 
default_rx_y); - log_info(LogTest, " --mux_x: X coordinate of the starting mux core, default = {}", default_mux_x); - log_info(LogTest, " --mux_y: Y coordinate of the starting mux core, default = {}", default_mux_y); - log_info(LogTest, " --demux_x: X coordinate of the starting demux core, default = {}", default_demux_x); - log_info(LogTest, " --demux_y: Y coordinate of the starting demux core, default = {}", default_demux_y); - log_info( - LogTest, " --tx_queue_start_addr: TX queue start address, default = 0x{:x}", default_tx_queue_start_addr); - log_info( - LogTest, " --tx_queue_size_bytes: TX queue size in bytes, default = 0x{:x}", default_tx_queue_size_bytes); - log_info( - LogTest, " --rx_queue_start_addr: RX queue start address, default = 0x{:x}", default_rx_queue_start_addr); - log_info( - LogTest, " --rx_queue_size_bytes: RX queue size in bytes, default = 0x{:x}", default_rx_queue_size_bytes); - log_info( - LogTest, - " --mux_queue_start_addr: MUX queue start address, default = 0x{:x}", - default_mux_queue_start_addr); - log_info( - LogTest, - " --mux_queue_size_bytes: MUX queue size in bytes, default = 0x{:x}", - default_mux_queue_size_bytes); - log_info( - LogTest, - " --demux_queue_start_addr: DEMUX queue start address, default = 0x{:x}", - default_demux_queue_start_addr); - log_info( - LogTest, - " --demux_queue_size_bytes: DEMUX queue size in bytes, default = 0x{:x}", - default_demux_queue_size_bytes); - log_info( - LogTest, " --test_results_addr: test results buf address, default = 0x{:x}", default_test_results_addr); - log_info(LogTest, " --test_results_size: test results buf size, default = 0x{:x}", default_test_results_size); - log_info(LogTest, " --timeout_mcycles: Timeout in MCycles, default = {}", default_timeout_mcycles); - log_info( - LogTest, - " --rx_disable_data_check: Disable data check on RX, default = {}", - default_rx_disable_data_check); - log_info(LogTest, " --device_id: Device on which the test will be run, default = {}", 
default_test_device_id); - return 0; - } - - uint32_t tx_x = test_args::get_command_option_uint32(input_args, "--tx_x", default_tx_x); - uint32_t tx_y = test_args::get_command_option_uint32(input_args, "--tx_y", default_tx_y); - uint32_t rx_x = test_args::get_command_option_uint32(input_args, "--rx_x", default_rx_x); - uint32_t rx_y = test_args::get_command_option_uint32(input_args, "--rx_y", default_rx_y); - uint32_t mux_x = test_args::get_command_option_uint32(input_args, "--mux_x", default_mux_x); - uint32_t mux_y = test_args::get_command_option_uint32(input_args, "--mux_y", default_mux_y); - uint32_t demux_x = test_args::get_command_option_uint32(input_args, "--demux_x", default_demux_x); - uint32_t demux_y = test_args::get_command_option_uint32(input_args, "--demux_y", default_demux_y); - uint32_t tunneler_x = test_args::get_command_option_uint32(input_args, "--tunneler_x", default_tunneler_x); - uint32_t tunneler_y = test_args::get_command_option_uint32(input_args, "--tunneler_y", default_tunneler_y); - uint32_t prng_seed = test_args::get_command_option_uint32(input_args, "--prng_seed", default_prng_seed); - uint32_t data_kb_per_tx = - test_args::get_command_option_uint32(input_args, "--data_kb_per_tx", default_data_kb_per_tx); - uint32_t max_packet_size_words = - test_args::get_command_option_uint32(input_args, "--max_packet_size_words", default_max_packet_size_words); - uint32_t tx_queue_start_addr = - test_args::get_command_option_uint32(input_args, "--tx_queue_start_addr", default_tx_queue_start_addr); - uint32_t tx_queue_size_bytes = - test_args::get_command_option_uint32(input_args, "--tx_queue_size_bytes", default_tx_queue_size_bytes); - uint32_t rx_queue_start_addr = - test_args::get_command_option_uint32(input_args, "--rx_queue_start_addr", default_rx_queue_start_addr); - uint32_t rx_queue_size_bytes = - test_args::get_command_option_uint32(input_args, "--rx_queue_size_bytes", default_rx_queue_size_bytes); - uint32_t mux_queue_start_addr = - 
test_args::get_command_option_uint32(input_args, "--mux_queue_start_addr", default_mux_queue_start_addr); - uint32_t mux_queue_size_bytes = - test_args::get_command_option_uint32(input_args, "--mux_queue_size_bytes", default_mux_queue_size_bytes); - uint32_t demux_queue_start_addr = - test_args::get_command_option_uint32(input_args, "--demux_queue_start_addr", default_demux_queue_start_addr); - uint32_t demux_queue_size_bytes = - test_args::get_command_option_uint32(input_args, "--demux_queue_size_bytes", default_demux_queue_size_bytes); - uint32_t tunneler_queue_start_addr = test_args::get_command_option_uint32( - input_args, "--tunneler_queue_start_addr", default_tunneler_queue_start_addr); - uint32_t tunneler_queue_size_bytes = test_args::get_command_option_uint32( - input_args, "--tunneler_queue_size_bytes", default_tunneler_queue_size_bytes); - uint32_t test_results_addr = - test_args::get_command_option_uint32(input_args, "--test_results_addr", default_test_results_addr); - uint32_t test_results_size = - test_args::get_command_option_uint32(input_args, "--test_results_size", default_test_results_size); - uint32_t tunneler_test_results_addr = test_args::get_command_option_uint32( - input_args, "--tunneler_test_results_addr", default_tunneler_test_results_addr); - uint32_t tunneler_test_results_size = test_args::get_command_option_uint32( - input_args, "--tunneler_test_results_size", default_tunneler_test_results_size); - uint32_t timeout_mcycles = - test_args::get_command_option_uint32(input_args, "--timeout_mcycles", default_timeout_mcycles); - uint32_t rx_disable_data_check = - test_args::get_command_option_uint32(input_args, "--rx_disable_data_check", default_rx_disable_data_check); - uint32_t test_device_id = test_args::get_command_option_uint32(input_args, "--device_id", default_test_device_id); - - bool pass = true; - - std::map defines = { - {"FD_CORE_TYPE", std::to_string(0)}, // todo, support dispatch on eth - }; - - try { - int num_devices = 
tt_metal::GetNumAvailableDevices(); - if (test_device_id >= num_devices) { - log_info(LogTest, "Device {} is not valid. Highest valid device id = {}.", test_device_id, num_devices - 1); - throw std::runtime_error("Invalid Device Id."); - } - int device_id_l = test_device_id; - - tt_metal::Device* device = tt_metal::CreateDevice(device_id_l); - auto const& device_active_eth_cores = device->get_active_ethernet_cores(); - - if (device_active_eth_cores.size() == 0) { - log_info( - LogTest, - "Device {} does not have enough active cores. Need 1 active ethernet core for this test.", - device_id_l); - tt_metal::CloseDevice(device); - throw std::runtime_error("Test cannot run on specified device."); - } - - auto eth_core_iter = device_active_eth_cores.begin(); - auto [device_id_r, eth_receiver_core] = device->get_connected_ethernet_core(*eth_core_iter); - - tt_metal::Device* device_r = tt_metal::CreateDevice(device_id_r); - - CoreCoord tunneler_logical_core = device->get_ethernet_sockets(device_id_r)[0]; - CoreCoord tunneler_phys_core = device->ethernet_core_from_logical_core(tunneler_logical_core); - - CoreCoord r_tunneler_logical_core = device_r->get_ethernet_sockets(device_id_l)[0]; - CoreCoord r_tunneler_phys_core = device_r->ethernet_core_from_logical_core(r_tunneler_logical_core); - - std::cout << "Left Tunneler = " << tunneler_logical_core.str() << std::endl; - std::cout << "Right Tunneler = " << r_tunneler_logical_core.str() << std::endl; - - tt_metal::Program program = tt_metal::CreateProgram(); - tt_metal::Program program_r = tt_metal::CreateProgram(); - - CoreCoord mux_core = {mux_x, mux_y}; - CoreCoord mux_phys_core = device->worker_core_from_logical_core(mux_core); - - CoreCoord demux_core = {demux_x, demux_y}; - CoreCoord demux_phys_core = device_r->worker_core_from_logical_core(demux_core); - - std::vector tx_phys_core; - for (uint32_t i = 0; i < num_src_endpoints; i++) { - CoreCoord core = {tx_x+i, tx_y}; - 
tx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = - { - src_endpoint_start_id + i, // 0: src_endpoint_id - num_dest_endpoints, // 1: num_dest_endpoints - (tx_queue_start_addr >> 4), // 2: queue_start_addr_words - (tx_queue_size_bytes >> 4), // 3: queue_size_words - ((mux_queue_start_addr + i*mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words - (uint32_t)mux_phys_core.x, // 6: remote_rx_x - (uint32_t)mux_phys_core.y, // 7: remote_rx_y - i, // 8: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type - test_results_addr, // 10: test_results_addr - test_results_size, // 11: test_results_size - prng_seed, // 12: prng_seed - data_kb_per_tx, // 13: total_data_kb - max_packet_size_words, // 14: max_packet_size_words - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; - - log_info(LogTest, "run traffic_gen_tx at x={},y={}", core.x, core.y); - auto kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_tx.cpp", - {core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = compile_args, - .defines = defines}); - } - - // Mux - std::vector mux_compile_args = { - 0, // 0: reserved - (mux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 2: rx_queue_size_words - num_src_endpoints, // 3: mux_fan_in - packet_switch_4B_pack( - (uint32_t)tx_phys_core[0].x, - (uint32_t)tx_phys_core[0].y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: src 0 info - packet_switch_4B_pack( - (uint32_t)tx_phys_core[1].x, - (uint32_t)tx_phys_core[1].y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: src 1 info - 
packet_switch_4B_pack( - (uint32_t)tx_phys_core[2].x, - (uint32_t)tx_phys_core[2].y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 6: src 2 info - packet_switch_4B_pack( - (uint32_t)tx_phys_core[3].x, - (uint32_t)tx_phys_core[3].y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 7: src 3 info - (tunneler_queue_start_addr >> 4), // 8: remote_tx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words - (uint32_t)tunneler_phys_core.x, // 10: remote_tx_x - (uint32_t)tunneler_phys_core.y, // 11: remote_tx_y - 0, // 12: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 13: tx_network_type - test_results_addr, // 14: test_results_addr - test_results_size, // 15: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 16: timeout_cycles - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0 // 17-24: packetize/depacketize settings - }; - - log_info(LogTest, "run mux at x={},y={}", mux_core.x, mux_core.y); - auto mux_kernel = tt_metal::CreateKernel( - program, - "tt_metal/impl/dispatch/kernels/packet_mux.cpp", - {mux_core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = mux_compile_args, - .defines = defines}); - - std::vector tunneler_l_compile_args = { - dest_endpoint_start_id, // 0: endpoint_id_start_index - 1, // 1: tunnel_lanes. 1 => Unidirectional. 2 => Bidirectional. 
- (tunneler_queue_start_addr >> 4), // 2: rx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 3: rx_queue_size_words - packet_switch_4B_pack( - r_tunneler_phys_core.x, - r_tunneler_phys_core.y, - 0, - (uint32_t)DispatchRemoteNetworkType::ETH), // 4: remote_receiver_0_info - 0, // 5: remote_receiver_1_info - (tunneler_queue_start_addr >> 4), // 6: remote_receiver_queue_start_addr_words 0 - (tunneler_queue_size_bytes >> 4), // 7: remote_receiver_queue_size_words 0 - 0, // 8: remote_receiver_queue_start_addr_words 1 - 2, // 9: remote_receiver_queue_size_words 1. - // Unused. Setting to 2 to get around size check assertion that does not allow 0. - packet_switch_4B_pack( - mux_phys_core.x, - mux_phys_core.y, - num_dest_endpoints, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 10: remote_sender_0_info - 0, // 11: remote_sender_1_info - tunneler_test_results_addr, // 12: test_results_addr - tunneler_test_results_size, // 13: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 14: timeout_cycles - 0, // 15: - }; - - auto tunneler_l_kernel = tt_metal::CreateKernel( - program, - "tt_metal/impl/dispatch/kernels/eth_tunneler.cpp", - tunneler_logical_core, - tt_metal::EthernetConfig{ - .noc = tt_metal::NOC::NOC_0, .compile_args = tunneler_l_compile_args, .defines = defines}); - - std::vector tunneler_r_compile_args = { - dest_endpoint_start_id, // 0: endpoint_id_start_index - 1, // 1: tunnel_lanes. 1 => Unidirectional. 2 => Bidirectional. 
- (tunneler_queue_start_addr >> 4), // 2: rx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 3: rx_queue_size_words - packet_switch_4B_pack( - demux_phys_core.x, - demux_phys_core.y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: remote_receiver_0_info - 0, // 5: remote_receiver_1_info - (demux_queue_start_addr >> 4), // 6: remote_receiver_queue_start_addr_words 0 - (demux_queue_size_bytes >> 4), // 7: remote_receiver_queue_size_words 0 - 0, // 8: remote_receiver_queue_start_addr_words 1 - 2, // 9: remote_receiver_queue_size_words 1 - // Unused. Setting to 2 to get around size check assertion that does not allow 0. - packet_switch_4B_pack( - tunneler_phys_core.x, - tunneler_phys_core.y, - 2, - (uint32_t)DispatchRemoteNetworkType::ETH), // 10: remote_sender_0_info - 0, // 11: remote_sender_1_info - tunneler_test_results_addr, // 12: test_results_addr - tunneler_test_results_size, // 13: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 14: timeout_cycles - 0, // 15: - }; - - auto tunneler_r_kernel = tt_metal::CreateKernel( - program_r, - "tt_metal/impl/dispatch/kernels/eth_tunneler.cpp", - r_tunneler_logical_core, - tt_metal::EthernetConfig{ - .noc = tt_metal::NOC::NOC_0, .compile_args = tunneler_r_compile_args, .defines = defines}); - - std::vector rx_phys_core; - for (uint32_t i = 0; i < num_dest_endpoints; i++) { - CoreCoord core = {rx_x+i, rx_y}; - rx_phys_core.push_back(device_r->worker_core_from_logical_core(core)); - std::vector compile_args = - { - dest_endpoint_start_id + i, // 0: dest_endpoint_id - num_src_endpoints, // 1: num_src_endpoints - num_dest_endpoints, // 2: num_dest_endpoints - (rx_queue_start_addr >> 4), // 3: queue_start_addr_words - (rx_queue_size_bytes >> 4), // 4: queue_size_words - (uint32_t)demux_phys_core.x, // 5: remote_tx_x - (uint32_t)demux_phys_core.y, // 6: remote_tx_y - i + 1, // 7: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type - test_results_addr, 
// 9: test_results_addr - test_results_size, // 10: test_results_size - prng_seed, // 11: prng_seed - 0, // 12: reserved - max_packet_size_words, // 13: max_packet_size_words - rx_disable_data_check, // 14: disable data check - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; - - log_info(LogTest, "run traffic_gen_rx at x={},y={}", core.x, core.y); - auto kernel = tt_metal::CreateKernel( - program_r, - "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_rx.cpp", - {core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = compile_args, - .defines = defines}); - } - - // Demux - uint32_t dest_map_array[4] = {0, 1, 2, 3}; - uint64_t dest_endpoint_output_map = packet_switch_dest_pack(dest_map_array, 4); - std::vector demux_compile_args = { - dest_endpoint_start_id, // 0: endpoint_id_start_index - (demux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words - (demux_queue_size_bytes >> 4), // 2: rx_queue_size_words - num_dest_endpoints, // 3: demux_fan_out - packet_switch_4B_pack( - rx_phys_core[0].x, - rx_phys_core[0].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: remote_tx_0_info - packet_switch_4B_pack( - rx_phys_core[1].x, - rx_phys_core[1].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: remote_tx_1_info - packet_switch_4B_pack( - rx_phys_core[2].x, - rx_phys_core[2].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 6: remote_tx_2_info - packet_switch_4B_pack( - rx_phys_core[3].x, - rx_phys_core[3].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 7: remote_tx_3_info - (rx_queue_start_addr >> 4), // 8: remote_tx_queue_start_addr_words 0 - (rx_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words 0 - (rx_queue_start_addr >> 4), // 10: remote_tx_queue_start_addr_words 1 - 
(rx_queue_size_bytes >> 4), // 11: remote_tx_queue_size_words 1 - (rx_queue_start_addr >> 4), // 12: remote_tx_queue_start_addr_words 2 - (rx_queue_size_bytes >> 4), // 13: remote_tx_queue_size_words 2 - (rx_queue_start_addr >> 4), // 14: remote_tx_queue_start_addr_words 3 - (rx_queue_size_bytes >> 4), // 15: remote_tx_queue_size_words 3 - (uint32_t)r_tunneler_phys_core.x, // 16: remote_rx_x - (uint32_t)r_tunneler_phys_core.y, // 17: remote_rx_y - 2, // 18: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 19: tx_network_type - (uint32_t)(dest_endpoint_output_map >> 32), // 20: dest_endpoint_output_map_hi - (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF), // 21: dest_endpoint_output_map_lo - test_results_addr, // 22: test_results_addr - test_results_size, // 23: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 24: timeout_cycles - 0, - 0, - 0, - 0, - 0 // 25-29: packetize/depacketize settings - }; - - log_info(LogTest, "run demux at x={},y={}", demux_core.x, demux_core.y); - auto demux_kernel = tt_metal::CreateKernel( - program_r, - "tt_metal/impl/dispatch/kernels/packet_demux.cpp", - {demux_core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = demux_compile_args, - .defines = defines}); - - log_info(LogTest, "Starting test..."); - - auto start = std::chrono::system_clock::now(); - tt_metal::detail::LaunchProgram(device, program, false); - tt_metal::detail::LaunchProgram(device_r, program_r, false); - tt_metal::detail::WaitProgramDone(device, program); - tt_metal::detail::WaitProgramDone(device_r, program_r); - auto end = std::chrono::system_clock::now(); - - std::chrono::duration elapsed_seconds = (end - start); - log_info(LogTest, "Ran in {:.2f}us", elapsed_seconds.count() * 1000 * 1000); - - vector> tx_results; - vector> rx_results; - - for (uint32_t i = 0; i < num_src_endpoints; i++) { - tx_results.push_back( - 
tt::llrt::read_hex_vec_from_core(device->id(), tx_phys_core[i], test_results_addr, test_results_size)); - log_info( - LogTest, - "TX{} status = {}", - i, - packet_queue_test_status_to_string(tx_results[i][PQ_TEST_STATUS_INDEX])); - pass &= (tx_results[i][PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - } - - for (uint32_t i = 0; i < num_dest_endpoints; i++) { - rx_results.push_back(tt::llrt::read_hex_vec_from_core( - device_r->id(), rx_phys_core[i], test_results_addr, test_results_size)); - log_info( - LogTest, - "RX{} status = {}", - i, - packet_queue_test_status_to_string(rx_results[i][PQ_TEST_STATUS_INDEX])); - pass &= (rx_results[i][PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - } - - vector mux_results = - tt::llrt::read_hex_vec_from_core(device->id(), mux_phys_core, test_results_addr, test_results_size); - log_info(LogTest, "MUX status = {}", packet_queue_test_status_to_string(mux_results[PQ_TEST_STATUS_INDEX])); - pass &= (mux_results[PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - - vector demux_results = - tt::llrt::read_hex_vec_from_core(device_r->id(), demux_phys_core, test_results_addr, test_results_size); - log_info(LogTest, "DEMUX status = {}", packet_queue_test_status_to_string(demux_results[PQ_TEST_STATUS_INDEX])); - pass &= (demux_results[0] == PACKET_QUEUE_TEST_PASS); - - pass &= tt_metal::CloseDevice(device); - pass &= tt_metal::CloseDevice(device_r); - - if (pass) { - double total_tx_bw = 0.0; - uint64_t total_tx_words_sent = 0; - uint64_t total_rx_words_checked = 0; - for (uint32_t i = 0; i < num_src_endpoints; i++) { - uint64_t tx_words_sent = get_64b_result(tx_results[i], PQ_TEST_WORD_CNT_INDEX); - total_tx_words_sent += tx_words_sent; - uint64_t tx_elapsed_cycles = get_64b_result(tx_results[i], PQ_TEST_CYCLES_INDEX); - double tx_bw = ((double)tx_words_sent) * PACKET_WORD_SIZE_BYTES / tx_elapsed_cycles; - log_info( - LogTest, - "TX {} words sent = {}, elapsed cycles = {} -> BW = {:.2f} B/cycle", - i, - tx_words_sent, - 
tx_elapsed_cycles, - tx_bw); - total_tx_bw += tx_bw; - } - log_info(LogTest, "Total TX BW = {:.2f} B/cycle", total_tx_bw); - double total_rx_bw = 0.0; - for (uint32_t i = 0; i < num_dest_endpoints; i++) { - uint64_t rx_words_checked = get_64b_result(rx_results[i], PQ_TEST_WORD_CNT_INDEX); - total_rx_words_checked += rx_words_checked; - uint64_t rx_elapsed_cycles = get_64b_result(rx_results[i], PQ_TEST_CYCLES_INDEX); - double rx_bw = ((double)rx_words_checked) * PACKET_WORD_SIZE_BYTES / rx_elapsed_cycles; - log_info( - LogTest, - "RX {} words checked = {}, elapsed cycles = {} -> BW = {:.2f} B/cycle", - i, - rx_words_checked, - rx_elapsed_cycles, - rx_bw); - total_rx_bw += rx_bw; - } - log_info(LogTest, "Total RX BW = {:.2f} B/cycle", total_rx_bw); - if (total_tx_words_sent != total_rx_words_checked) { - log_error( - LogTest, - "Total TX words sent = {} != Total RX words checked = {}", - total_tx_words_sent, - total_rx_words_checked); - pass = false; - } else { - log_info( - LogTest, - "Total TX words sent = {} == Total RX words checked = {} -> OK", - total_tx_words_sent, - total_rx_words_checked); - } - uint64_t mux_words_sent = get_64b_result(mux_results, PQ_TEST_WORD_CNT_INDEX); - uint64_t mux_elapsed_cycles = get_64b_result(mux_results, PQ_TEST_CYCLES_INDEX); - uint64_t mux_iter = get_64b_result(mux_results, PQ_TEST_ITER_INDEX); - double mux_bw = ((double)mux_words_sent) * PACKET_WORD_SIZE_BYTES / mux_elapsed_cycles; - double mux_cycles_per_iter = ((double)mux_elapsed_cycles) / mux_iter; - log_info( - LogTest, - "MUX words sent = {}, elapsed cycles = {} -> BW = {:.2f} B/cycle", - mux_words_sent, - mux_elapsed_cycles, - mux_bw); - log_info(LogTest, "MUX iters = {} -> cycles/iter = {:.1f}", mux_iter, mux_cycles_per_iter); - if (mux_words_sent != total_rx_words_checked) { - log_error( - LogTest, - "MUX words sent = {} != Total RX words checked = {}", - mux_words_sent, - total_rx_words_checked); - pass = false; - } else { - log_info( - LogTest, - "MUX words sent = {} 
== Total RX words checked = {} -> OK", - mux_words_sent, - total_rx_words_checked); - } - - uint64_t demux_words_sent = get_64b_result(demux_results, PQ_TEST_WORD_CNT_INDEX); - uint64_t demux_elapsed_cycles = get_64b_result(demux_results, PQ_TEST_CYCLES_INDEX); - double demux_bw = ((double)demux_words_sent) * PACKET_WORD_SIZE_BYTES / demux_elapsed_cycles; - uint64_t demux_iter = get_64b_result(demux_results, PQ_TEST_ITER_INDEX); - double demux_cycles_per_iter = ((double)demux_elapsed_cycles) / demux_iter; - log_info( - LogTest, - "DEMUX words sent = {}, elapsed cycles = {} -> BW = {:.2f} B/cycle", - demux_words_sent, - demux_elapsed_cycles, - demux_bw); - log_info(LogTest, "DEMUX iters = {} -> cycles/iter = {:.1f}", demux_iter, demux_cycles_per_iter); - if (demux_words_sent != total_rx_words_checked) { - log_error( - LogTest, - "DEMUX words sent = {} != Total RX words checked = {}", - demux_words_sent, - total_rx_words_checked); - pass = false; - } else { - log_info( - LogTest, - "DEMUX words sent = {} == Total RX words checked = {} -> OK", - demux_words_sent, - total_rx_words_checked); - } - } - - } catch (const std::exception& e) { - pass = false; - log_fatal(e.what()); - } - - tt::llrt::RunTimeOptions::get_instance().set_kernels_nullified(false); - - if (pass) { - log_info(LogTest, "Test Passed"); - return 0; - } else { - log_fatal(LogTest, "Test Failed\n"); - return 1; - } -} diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel_single_chip.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel_single_chip.cpp deleted file mode 100644 index aed8e50c833a..000000000000 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel_single_chip.cpp +++ /dev/null @@ -1,635 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include "tt_metal/host_api.hpp" -#include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/impl/device/device.hpp" -#include "tt_metal/llrt/rtoptions.hpp" -#include "tt_metal/impl/dispatch/cq_commands.hpp" -#include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" -#include "kernels/traffic_gen_test.hpp" - -using std::vector; -using namespace tt; - -int main(int argc, char** argv) { - constexpr uint32_t default_tx_x = 0; - constexpr uint32_t default_tx_y = 0; - constexpr uint32_t default_rx_x = 0; - constexpr uint32_t default_rx_y = 3; - - constexpr uint32_t default_mux_x = 0; - constexpr uint32_t default_mux_y = 1; - constexpr uint32_t default_demux_x = 0; - constexpr uint32_t default_demux_y = 2; - - constexpr uint32_t default_tunneler_x = 0; - constexpr uint32_t default_tunneler_y = 0; - - constexpr uint32_t default_prng_seed = 0x100; - constexpr uint32_t default_data_kb_per_tx = 16 * 1024; - constexpr uint32_t default_max_packet_size_words = 0x100; - - constexpr uint32_t default_tx_queue_start_addr = 0x80000; - constexpr uint32_t default_tx_queue_size_bytes = 0x10000; - constexpr uint32_t default_rx_queue_start_addr = 0xa0000; - constexpr uint32_t default_rx_queue_size_bytes = 0x20000; - constexpr uint32_t default_mux_queue_start_addr = 0x80000; - constexpr uint32_t default_mux_queue_size_bytes = 0x10000; - constexpr uint32_t default_demux_queue_start_addr = 0x90000; - constexpr uint32_t default_demux_queue_size_bytes = 0x20000; - - constexpr uint32_t default_tunneler_queue_start_addr = 0x19000; - constexpr uint32_t default_tunneler_queue_size_bytes = 0x10000; - - constexpr uint32_t default_test_results_addr = 0x100000; - constexpr uint32_t default_test_results_size = 0x40000; - - constexpr uint32_t default_tunneler_test_results_addr = 0x29000; - constexpr uint32_t default_tunneler_test_results_size = 0x8000; - - constexpr uint32_t default_timeout_mcycles = 1000; - constexpr uint32_t 
default_rx_disable_data_check = 0; - - constexpr uint32_t src_endpoint_start_id = 0xaa; - constexpr uint32_t dest_endpoint_start_id = 0xbb; - - constexpr uint32_t num_src_endpoints = 4; - constexpr uint32_t num_dest_endpoints = 4; - - constexpr uint32_t default_test_device_id = 0; - - std::vector input_args(argv, argv + argc); - if (test_args::has_command_option(input_args, "-h") || test_args::has_command_option(input_args, "--help")) { - log_info(LogTest, "Usage:"); - log_info(LogTest, " --prng_seed: PRNG seed, default = 0x{:x}", default_prng_seed); - log_info(LogTest, " --data_kb_per_tx: Total data in KB per TX endpoint, default = {}", default_data_kb_per_tx); - log_info( - LogTest, - " --max_packet_size_words: Max packet size in words, default = 0x{:x}", - default_max_packet_size_words); - log_info(LogTest, " --tx_x: X coordinate of the starting TX core, default = {}", default_tx_x); - log_info(LogTest, " --tx_y: Y coordinate of the starting TX core, default = {}", default_tx_y); - log_info(LogTest, " --rx_x: X coordinate of the starting RX core, default = {}", default_rx_x); - log_info(LogTest, " --rx_y: Y coordinate of the starting RX core, default = {}", default_rx_y); - log_info(LogTest, " --mux_x: X coordinate of the starting mux core, default = {}", default_mux_x); - log_info(LogTest, " --mux_y: Y coordinate of the starting mux core, default = {}", default_mux_y); - log_info(LogTest, " --demux_x: X coordinate of the starting demux core, default = {}", default_demux_x); - log_info(LogTest, " --demux_y: Y coordinate of the starting demux core, default = {}", default_demux_y); - log_info( - LogTest, " --tx_queue_start_addr: TX queue start address, default = 0x{:x}", default_tx_queue_start_addr); - log_info( - LogTest, " --tx_queue_size_bytes: TX queue size in bytes, default = 0x{:x}", default_tx_queue_size_bytes); - log_info( - LogTest, " --rx_queue_start_addr: RX queue start address, default = 0x{:x}", default_rx_queue_start_addr); - log_info( - LogTest, " 
--rx_queue_size_bytes: RX queue size in bytes, default = 0x{:x}", default_rx_queue_size_bytes); - log_info( - LogTest, - " --mux_queue_start_addr: MUX queue start address, default = 0x{:x}", - default_mux_queue_start_addr); - log_info( - LogTest, - " --mux_queue_size_bytes: MUX queue size in bytes, default = 0x{:x}", - default_mux_queue_size_bytes); - log_info( - LogTest, - " --demux_queue_start_addr: DEMUX queue start address, default = 0x{:x}", - default_demux_queue_start_addr); - log_info( - LogTest, - " --demux_queue_size_bytes: DEMUX queue size in bytes, default = 0x{:x}", - default_demux_queue_size_bytes); - log_info( - LogTest, " --test_results_addr: test results buf address, default = 0x{:x}", default_test_results_addr); - log_info(LogTest, " --test_results_size: test results buf size, default = 0x{:x}", default_test_results_size); - log_info(LogTest, " --timeout_mcycles: Timeout in MCycles, default = {}", default_timeout_mcycles); - log_info( - LogTest, - " --rx_disable_data_check: Disable data check on RX, default = {}", - default_rx_disable_data_check); - log_info(LogTest, " --device_id: Device on which the test will be run, default = {}", default_test_device_id); - - return 0; - } - - uint32_t tx_x = test_args::get_command_option_uint32(input_args, "--tx_x", default_tx_x); - uint32_t tx_y = test_args::get_command_option_uint32(input_args, "--tx_y", default_tx_y); - uint32_t rx_x = test_args::get_command_option_uint32(input_args, "--rx_x", default_rx_x); - uint32_t rx_y = test_args::get_command_option_uint32(input_args, "--rx_y", default_rx_y); - uint32_t mux_x = test_args::get_command_option_uint32(input_args, "--mux_x", default_mux_x); - uint32_t mux_y = test_args::get_command_option_uint32(input_args, "--mux_y", default_mux_y); - uint32_t demux_x = test_args::get_command_option_uint32(input_args, "--demux_x", default_demux_x); - uint32_t demux_y = test_args::get_command_option_uint32(input_args, "--demux_y", default_demux_y); - uint32_t tunneler_x = 
test_args::get_command_option_uint32(input_args, "--tunneler_x", default_tunneler_x); - uint32_t tunneler_y = test_args::get_command_option_uint32(input_args, "--tunneler_y", default_tunneler_y); - uint32_t prng_seed = test_args::get_command_option_uint32(input_args, "--prng_seed", default_prng_seed); - uint32_t data_kb_per_tx = - test_args::get_command_option_uint32(input_args, "--data_kb_per_tx", default_data_kb_per_tx); - uint32_t max_packet_size_words = - test_args::get_command_option_uint32(input_args, "--max_packet_size_words", default_max_packet_size_words); - uint32_t tx_queue_start_addr = - test_args::get_command_option_uint32(input_args, "--tx_queue_start_addr", default_tx_queue_start_addr); - uint32_t tx_queue_size_bytes = - test_args::get_command_option_uint32(input_args, "--tx_queue_size_bytes", default_tx_queue_size_bytes); - uint32_t rx_queue_start_addr = - test_args::get_command_option_uint32(input_args, "--rx_queue_start_addr", default_rx_queue_start_addr); - uint32_t rx_queue_size_bytes = - test_args::get_command_option_uint32(input_args, "--rx_queue_size_bytes", default_rx_queue_size_bytes); - uint32_t mux_queue_start_addr = - test_args::get_command_option_uint32(input_args, "--mux_queue_start_addr", default_mux_queue_start_addr); - uint32_t mux_queue_size_bytes = - test_args::get_command_option_uint32(input_args, "--mux_queue_size_bytes", default_mux_queue_size_bytes); - uint32_t demux_queue_start_addr = - test_args::get_command_option_uint32(input_args, "--demux_queue_start_addr", default_demux_queue_start_addr); - uint32_t demux_queue_size_bytes = - test_args::get_command_option_uint32(input_args, "--demux_queue_size_bytes", default_demux_queue_size_bytes); - uint32_t tunneler_queue_start_addr = test_args::get_command_option_uint32( - input_args, "--tunneler_queue_start_addr", default_tunneler_queue_start_addr); - uint32_t tunneler_queue_size_bytes = test_args::get_command_option_uint32( - input_args, "--tunneler_queue_size_bytes", 
default_tunneler_queue_size_bytes); - uint32_t test_results_addr = - test_args::get_command_option_uint32(input_args, "--test_results_addr", default_test_results_addr); - uint32_t test_results_size = - test_args::get_command_option_uint32(input_args, "--test_results_size", default_test_results_size); - uint32_t tunneler_test_results_addr = test_args::get_command_option_uint32( - input_args, "--tunneler_test_results_addr", default_tunneler_test_results_addr); - uint32_t tunneler_test_results_size = test_args::get_command_option_uint32( - input_args, "--tunneler_test_results_size", default_tunneler_test_results_size); - uint32_t timeout_mcycles = - test_args::get_command_option_uint32(input_args, "--timeout_mcycles", default_timeout_mcycles); - uint32_t rx_disable_data_check = - test_args::get_command_option_uint32(input_args, "--rx_disable_data_check", default_rx_disable_data_check); - uint32_t test_device_id = test_args::get_command_option_uint32(input_args, "--device_id", default_test_device_id); - - bool pass = true; - try { - int num_devices = tt_metal::GetNumAvailableDevices(); - if (test_device_id >= num_devices) { - log_info(LogTest, "Device {} is not valid. Highest valid device id = {}.", test_device_id, num_devices - 1); - throw std::runtime_error("Invalid Device Id."); - } - int device_id = test_device_id; - - tt_metal::Device* device = tt_metal::CreateDevice(device_id); - auto const& device_active_eth_cores = device->get_active_ethernet_cores(); - - if (device_active_eth_cores.size() < 2) { - log_info( - LogTest, - "Device {} does not have enough active cores. 
Need 2 active ethernet cores for this test.", - device_id); - tt_metal::CloseDevice(device); - throw std::runtime_error("Test cannot run on specified device."); - } - - auto eth_core_iter = device_active_eth_cores.begin(); - // CoreCoord tunneler_logical_core = device->get_ethernet_sockets(5)[0]; - CoreCoord tunneler_logical_core = *eth_core_iter; - CoreCoord tunneler_phys_core = device->ethernet_core_from_logical_core(tunneler_logical_core); - - // CoreCoord r_tunneler_logical_core = device->get_ethernet_sockets(5)[1]; - eth_core_iter++; - CoreCoord r_tunneler_logical_core = *eth_core_iter; - CoreCoord r_tunneler_phys_core = device->ethernet_core_from_logical_core(r_tunneler_logical_core); - - std::cout << "Left Tunneler = " << tunneler_logical_core.str() << std::endl; - std::cout << "Right Tunneler = " << r_tunneler_logical_core.str() << std::endl; - - tt_metal::Program program = tt_metal::CreateProgram(); - - CoreCoord mux_core = {mux_x, mux_y}; - CoreCoord mux_phys_core = device->worker_core_from_logical_core(mux_core); - - CoreCoord demux_core = {demux_x, demux_y}; - CoreCoord demux_phys_core = device->worker_core_from_logical_core(demux_core); - - std::vector tx_phys_core; - for (uint32_t i = 0; i < num_src_endpoints; i++) { - CoreCoord core = {tx_x+i, tx_y}; - tx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = - { - src_endpoint_start_id + i, // 0: src_endpoint_id - num_dest_endpoints, // 1: num_dest_endpoints - (tx_queue_start_addr >> 4), // 2: queue_start_addr_words - (tx_queue_size_bytes >> 4), // 3: queue_size_words - ((mux_queue_start_addr + i*mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words - (uint32_t)mux_phys_core.x, // 6: remote_rx_x - (uint32_t)mux_phys_core.y, // 7: remote_rx_y - i, // 8: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type - test_results_addr, // 10: test_results_addr - 
test_results_size, // 11: test_results_size - prng_seed, // 12: prng_seed - data_kb_per_tx, // 13: total_data_kb - max_packet_size_words, // 14: max_packet_size_words - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000, // 17: timeout_cycles - }; - - log_info(LogTest, "run traffic_gen_tx at x={},y={}", core.x, core.y); - auto kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_tx.cpp", - {core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = compile_args, - .defines = {}}); - } - - // Mux - std::vector mux_compile_args = { - 0, // 0: reserved - (mux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 2: rx_queue_size_words - num_src_endpoints, // 3: mux_fan_in - packet_switch_4B_pack( - (uint32_t)tx_phys_core[0].x, - (uint32_t)tx_phys_core[0].y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: src 0 info - packet_switch_4B_pack( - (uint32_t)tx_phys_core[1].x, - (uint32_t)tx_phys_core[1].y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: src 1 info - packet_switch_4B_pack( - (uint32_t)tx_phys_core[2].x, - (uint32_t)tx_phys_core[2].y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 6: src 2 info - packet_switch_4B_pack( - (uint32_t)tx_phys_core[3].x, - (uint32_t)tx_phys_core[3].y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 7: src 3 info - (tunneler_queue_start_addr >> 4), // 8: remote_tx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words - (uint32_t)tunneler_phys_core.x, // 10: remote_tx_x - (uint32_t)tunneler_phys_core.y, // 11: remote_tx_y - 0, // 12: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 13: tx_network_type - test_results_addr, // 14: test_results_addr - 
test_results_size, // 15: test_results_size - timeout_mcycles * 1000 * 1000, // 16: timeout_cycles - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0 // 17-24: packetize/depacketize settings - }; - - log_info(LogTest, "run mux at x={},y={}", mux_core.x, mux_core.y); - auto mux_kernel = tt_metal::CreateKernel( - program, - "tt_metal/impl/dispatch/kernels/packet_mux.cpp", - {mux_core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = mux_compile_args, - .defines = {}}); - - std::vector tunneler_l_compile_args = { - dest_endpoint_start_id, // 0: endpoint_id_start_index - 1, // 1: tunnel_lanes. 1 => Unidirectional. 2 => Bidirectional. - (tunneler_queue_start_addr >> 4), // 2: rx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 3: rx_queue_size_words - packet_switch_4B_pack( - r_tunneler_phys_core.x, - r_tunneler_phys_core.y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: remote_receiver_0_info - 0, // 5: remote_receiver_1_info - (tunneler_queue_start_addr >> 4), // 6: remote_receiver_queue_start_addr_words 0 - (tunneler_queue_size_bytes >> 4), // 7: remote_receiver_queue_size_words 0 - 0, // 8: remote_receiver_queue_start_addr_words 1 - 2, // 9: remote_receiver_queue_size_words 1. - // Unused. Setting to 2 to get around size check assertion that does not allow 0. 
- packet_switch_4B_pack( - mux_phys_core.x, - mux_phys_core.y, - num_dest_endpoints, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 10: remote_sender_0_info - 0, // 11: remote_sender_1_info - tunneler_test_results_addr, // 12: test_results_addr - tunneler_test_results_size, // 13: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 14: timeout_cycles - }; - - auto tunneler_l_kernel = tt_metal::CreateKernel( - program, - "tt_metal/impl/dispatch/kernels/eth_tunneler.cpp", - tunneler_logical_core, - tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0, .compile_args = tunneler_l_compile_args}); - - std::vector tunneler_r_compile_args = { - dest_endpoint_start_id, // 0: endpoint_id_start_index - 1, // 1: tunnel_lanes. 1 => Unidirectional. 2 => Bidirectional. - (tunneler_queue_start_addr >> 4), // 2: rx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 3: rx_queue_size_words - packet_switch_4B_pack( - demux_phys_core.x, - demux_phys_core.y, - num_dest_endpoints, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: remote_receiver_0_info - 0, // 5: remote_receiver_1_info - (demux_queue_start_addr >> 4), // 6: remote_receiver_queue_start_addr_words 0 - (demux_queue_size_bytes >> 4), // 7: remote_receiver_queue_size_words 0 - 0, // 8: remote_receiver_queue_start_addr_words 1 - 2, // 9: remote_receiver_queue_size_words 1 - // Unused. Setting to 2 to get around size check assertion that does not allow 0. 
- packet_switch_4B_pack( - tunneler_phys_core.x, - tunneler_phys_core.y, - 2, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 10: remote_sender_0_info - 0, // 11: remote_sender_1_info - tunneler_test_results_addr, // 12: test_results_addr - tunneler_test_results_size, // 13: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 14: timeout_cycles - }; - - auto tunneler_r_kernel = tt_metal::CreateKernel( - program, - "tt_metal/impl/dispatch/kernels/eth_tunneler.cpp", - r_tunneler_logical_core, - tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0, .compile_args = tunneler_r_compile_args}); - - std::vector rx_phys_core; - for (uint32_t i = 0; i < num_dest_endpoints; i++) { - CoreCoord core = {rx_x+i, rx_y}; - rx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = - { - dest_endpoint_start_id + i, // 0: dest_endpoint_id - num_src_endpoints, // 1: num_src_endpoints - num_dest_endpoints, // 2: num_dest_endpoints - (rx_queue_start_addr >> 4), // 3: queue_start_addr_words - (rx_queue_size_bytes >> 4), // 4: queue_size_words - (uint32_t)demux_phys_core.x, // 5: remote_tx_x - (uint32_t)demux_phys_core.y, // 6: remote_tx_y - i, // 7: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type - test_results_addr, // 9: test_results_addr - test_results_size, // 10: test_results_size - prng_seed, // 11: prng_seed - 0, // 12: reserved - max_packet_size_words, // 13: max_packet_size_words - rx_disable_data_check, // 14: disable data check - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000, // 17: timeout_cycles - }; - - log_info(LogTest, "run traffic_gen_rx at x={},y={}", core.x, core.y); - auto kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_rx.cpp", - {core}, - tt_metal::DataMovementConfig{ - .processor = 
tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = compile_args, - .defines = {}}); - } - - // Demux - uint32_t dest_map_array[4] = {0, 1, 2, 3}; - uint64_t dest_endpoint_output_map = packet_switch_dest_pack(dest_map_array, 4); - std::vector demux_compile_args = { - dest_endpoint_start_id, // 0: endpoint_id_start_index - (demux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words - (demux_queue_size_bytes >> 4), // 2: rx_queue_size_words - num_dest_endpoints, // 3: demux_fan_out - packet_switch_4B_pack( - rx_phys_core[0].x, - rx_phys_core[0].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: remote_tx_0_info - packet_switch_4B_pack( - rx_phys_core[1].x, - rx_phys_core[1].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: remote_tx_1_info - packet_switch_4B_pack( - rx_phys_core[2].x, - rx_phys_core[2].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 6: remote_tx_2_info - packet_switch_4B_pack( - rx_phys_core[3].x, - rx_phys_core[3].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 7: remote_tx_3_info - (rx_queue_start_addr >> 4), // 8: remote_tx_queue_start_addr_words 0 - (rx_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words 0 - (rx_queue_start_addr >> 4), // 10: remote_tx_queue_start_addr_words 1 - (rx_queue_size_bytes >> 4), // 11: remote_tx_queue_size_words 1 - (rx_queue_start_addr >> 4), // 12: remote_tx_queue_start_addr_words 2 - (rx_queue_size_bytes >> 4), // 13: remote_tx_queue_size_words 2 - (rx_queue_start_addr >> 4), // 14: remote_tx_queue_start_addr_words 3 - (rx_queue_size_bytes >> 4), // 15: remote_tx_queue_size_words 3 - //(uint32_t)mux_phys_core.x, // 16: remote_rx_x - //(uint32_t)mux_phys_core.y, // 17: remote_rx_y - (uint32_t)r_tunneler_phys_core.x, // 16: remote_rx_x - (uint32_t)r_tunneler_phys_core.y, // 17: remote_rx_y - 2, // 18: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 19: tx_network_type - (uint32_t)(dest_endpoint_output_map 
>> 32), // 20: dest_endpoint_output_map_hi - (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF), // 21: dest_endpoint_output_map_lo - test_results_addr, // 22: test_results_addr - test_results_size, // 23: test_results_size - timeout_mcycles * 1000 * 1000, // 24: timeout_cycles - 0, - 0, - 0, - 0, - 0 // 25-29: packetize/depacketize settings - }; - - log_info(LogTest, "run demux at x={},y={}", demux_core.x, demux_core.y); - auto demux_kernel = tt_metal::CreateKernel( - program, - "tt_metal/impl/dispatch/kernels/packet_demux.cpp", - {demux_core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = demux_compile_args, - .defines = {}}); - - log_info(LogTest, "Starting test..."); - - auto start = std::chrono::system_clock::now(); - tt_metal::detail::LaunchProgram(device, program); - auto end = std::chrono::system_clock::now(); - - std::chrono::duration elapsed_seconds = (end - start); - log_info(LogTest, "Ran in {:.2f}us", elapsed_seconds.count() * 1000 * 1000); - - vector> tx_results; - vector> rx_results; - - for (uint32_t i = 0; i < num_src_endpoints; i++) { - tx_results.push_back( - tt::llrt::read_hex_vec_from_core(device->id(), tx_phys_core[i], test_results_addr, test_results_size)); - log_info( - LogTest, - "TX{} status = {}", - i, - packet_queue_test_status_to_string(tx_results[i][PQ_TEST_STATUS_INDEX])); - pass &= (tx_results[i][PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - } - - for (uint32_t i = 0; i < num_dest_endpoints; i++) { - rx_results.push_back( - tt::llrt::read_hex_vec_from_core(device->id(), rx_phys_core[i], test_results_addr, test_results_size)); - log_info( - LogTest, - "RX{} status = {}", - i, - packet_queue_test_status_to_string(rx_results[i][PQ_TEST_STATUS_INDEX])); - pass &= (rx_results[i][PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - } - - vector mux_results = - tt::llrt::read_hex_vec_from_core(device->id(), mux_phys_core, 
test_results_addr, test_results_size); - log_info(LogTest, "MUX status = {}", packet_queue_test_status_to_string(mux_results[PQ_TEST_STATUS_INDEX])); - pass &= (mux_results[PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - - vector demux_results = - tt::llrt::read_hex_vec_from_core(device->id(), demux_phys_core, test_results_addr, test_results_size); - log_info(LogTest, "DEMUX status = {}", packet_queue_test_status_to_string(demux_results[PQ_TEST_STATUS_INDEX])); - pass &= (demux_results[0] == PACKET_QUEUE_TEST_PASS); - - pass &= tt_metal::CloseDevice(device); - - if (pass) { - double total_tx_bw = 0.0; - uint64_t total_tx_words_sent = 0; - uint64_t total_rx_words_checked = 0; - for (uint32_t i = 0; i < num_src_endpoints; i++) { - uint64_t tx_words_sent = get_64b_result(tx_results[i], PQ_TEST_WORD_CNT_INDEX); - total_tx_words_sent += tx_words_sent; - uint64_t tx_elapsed_cycles = get_64b_result(tx_results[i], PQ_TEST_CYCLES_INDEX); - double tx_bw = ((double)tx_words_sent) * PACKET_WORD_SIZE_BYTES / tx_elapsed_cycles; - log_info( - LogTest, - "TX {} words sent = {}, elapsed cycles = {} -> BW = {:.2f} B/cycle", - i, - tx_words_sent, - tx_elapsed_cycles, - tx_bw); - total_tx_bw += tx_bw; - } - log_info(LogTest, "Total TX BW = {:.2f} B/cycle", total_tx_bw); - double total_rx_bw = 0.0; - for (uint32_t i = 0; i < num_dest_endpoints; i++) { - uint64_t rx_words_checked = get_64b_result(rx_results[i], PQ_TEST_WORD_CNT_INDEX); - total_rx_words_checked += rx_words_checked; - uint64_t rx_elapsed_cycles = get_64b_result(rx_results[i], PQ_TEST_CYCLES_INDEX); - double rx_bw = ((double)rx_words_checked) * PACKET_WORD_SIZE_BYTES / rx_elapsed_cycles; - log_info( - LogTest, - "RX {} words checked = {}, elapsed cycles = {} -> BW = {:.2f} B/cycle", - i, - rx_words_checked, - rx_elapsed_cycles, - rx_bw); - total_rx_bw += rx_bw; - } - log_info(LogTest, "Total RX BW = {:.2f} B/cycle", total_rx_bw); - if (total_tx_words_sent != total_rx_words_checked) { - log_error( - LogTest, - "Total TX 
words sent = {} != Total RX words checked = {}", - total_tx_words_sent, - total_rx_words_checked); - pass = false; - } else { - log_info( - LogTest, - "Total TX words sent = {} == Total RX words checked = {} -> OK", - total_tx_words_sent, - total_rx_words_checked); - } - uint64_t mux_words_sent = get_64b_result(mux_results, PQ_TEST_WORD_CNT_INDEX); - uint64_t mux_elapsed_cycles = get_64b_result(mux_results, PQ_TEST_CYCLES_INDEX); - uint64_t mux_iter = get_64b_result(mux_results, PQ_TEST_ITER_INDEX); - double mux_bw = ((double)mux_words_sent) * PACKET_WORD_SIZE_BYTES / mux_elapsed_cycles; - double mux_cycles_per_iter = ((double)mux_elapsed_cycles) / mux_iter; - log_info( - LogTest, - "MUX words sent = {}, elapsed cycles = {} -> BW = {:.2f} B/cycle", - mux_words_sent, - mux_elapsed_cycles, - mux_bw); - log_info(LogTest, "MUX iters = {} -> cycles/iter = {:.1f}", mux_iter, mux_cycles_per_iter); - if (mux_words_sent != total_rx_words_checked) { - log_error( - LogTest, - "MUX words sent = {} != Total RX words checked = {}", - mux_words_sent, - total_rx_words_checked); - pass = false; - } else { - log_info( - LogTest, - "MUX words sent = {} == Total RX words checked = {} -> OK", - mux_words_sent, - total_rx_words_checked); - } - - uint64_t demux_words_sent = get_64b_result(demux_results, PQ_TEST_WORD_CNT_INDEX); - uint64_t demux_elapsed_cycles = get_64b_result(demux_results, PQ_TEST_CYCLES_INDEX); - double demux_bw = ((double)demux_words_sent) * PACKET_WORD_SIZE_BYTES / demux_elapsed_cycles; - uint64_t demux_iter = get_64b_result(demux_results, PQ_TEST_ITER_INDEX); - double demux_cycles_per_iter = ((double)demux_elapsed_cycles) / demux_iter; - log_info( - LogTest, - "DEMUX words sent = {}, elapsed cycles = {} -> BW = {:.2f} B/cycle", - demux_words_sent, - demux_elapsed_cycles, - demux_bw); - log_info(LogTest, "DEMUX iters = {} -> cycles/iter = {:.1f}", demux_iter, demux_cycles_per_iter); - if (demux_words_sent != total_rx_words_checked) { - log_error( - LogTest, - "DEMUX 
words sent = {} != Total RX words checked = {}", - demux_words_sent, - total_rx_words_checked); - pass = false; - } else { - log_info( - LogTest, - "DEMUX words sent = {} == Total RX words checked = {} -> OK", - demux_words_sent, - total_rx_words_checked); - } - } - - } catch (const std::exception& e) { - pass = false; - log_fatal(e.what()); - } - - tt::llrt::RunTimeOptions::get_instance().set_kernels_nullified(false); - - if (pass) { - log_info(LogTest, "Test Passed"); - return 0; - } else { - log_fatal(LogTest, "Test Failed\n"); - return 1; - } -} diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep.cpp index c0594af1ff9f..e93038161894 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep.cpp @@ -11,7 +11,6 @@ #include "tt_metal/impl/device/device.hpp" #include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp" -using std::vector; using namespace tt; using json = nlohmann::json; @@ -30,6 +29,9 @@ int main(int argc, char** argv) { constexpr uint32_t default_data_kb_per_tx = 1024 * 1024; constexpr uint32_t default_max_packet_size_words = 0x100; + constexpr uint32_t default_input_scratch_buffer_base_addr = 0x50000; + constexpr uint32_t default_output_scratch_buffer_base_addr = 0x60000; + constexpr uint32_t default_tx_queue_start_addr = 0x80000; constexpr uint32_t default_tx_queue_size_bytes = 0x10000; constexpr uint32_t default_rx_queue_start_addr = 0xa0000; @@ -46,7 +48,8 @@ int main(int argc, char** argv) { constexpr uint32_t default_tunneler_queue_size_bytes = 0x8000; // times 8, as it is birectional, maximum queue size for ecore L1 (power of 2) constexpr uint32_t default_tunneler_test_results_addr = 0x39000; - constexpr uint32_t default_tunneler_test_results_size = 0x7000; + constexpr uint32_t 
default_tunneler_test_results_size = 0x1000; + constexpr uint32_t default_tunneler_buffer_base_addr = 0x3A000; constexpr uint32_t default_timeout_mcycles = 1000; constexpr uint32_t default_rx_disable_data_check = 0; @@ -151,6 +154,9 @@ int main(int argc, char** argv) { default_tx_data_sent_per_iter_high); log_info(LogTest, " --dump_stat_json: Dump stats in json to output_dir, default = {}", default_dump_stat_json); log_info(LogTest, " --output_dir: Output directory, default = {}", default_output_dir); + log_info(LogTest, " --input_scratch_buffer_base_addr: Scratch buffer for input queues base address, default = {:#x}", default_input_scratch_buffer_base_addr); + log_info(LogTest, " --output_scratch_buffer_base_addr: Scratch buffer for output queues base address, default = {:#x}", default_output_scratch_buffer_base_addr); + log_info(LogTest, " --tunneler_scratch_buffer_base_addr: Scratch buffer for tunneler queues base address, default = {:#x}", default_tunneler_buffer_base_addr); return 0; } @@ -216,10 +222,11 @@ int main(int argc, char** argv) { uint32_t tx_data_sent_per_iter_high = test_args::get_command_option_uint32( input_args, "--tx_data_sent_per_iter_high", default_tx_data_sent_per_iter_high); - assert( - (pkt_dest_size_choices_t)tx_pkt_dest_size_choice == pkt_dest_size_choices_t::SAME_START_RNDROBIN_FIX_SIZE && - rx_disable_header_check || - (pkt_dest_size_choices_t)tx_pkt_dest_size_choice == pkt_dest_size_choices_t::RANDOM); + uint32_t input_scratch_buffer_base_addr = test_args::get_command_option_uint32(input_args, "--input_scratch_buffer_base_addr", default_input_scratch_buffer_base_addr); + uint32_t output_scratch_buffer_base_addr = test_args::get_command_option_uint32(input_args, "--output_scratch_buffer_base_addr", default_output_scratch_buffer_base_addr); + uint32_t tunneler_buffer_base_addr = test_args::get_command_option_uint32(input_args, "--tunneler_buffer_base_addr", default_tunneler_buffer_base_addr); + + 
assert((pkt_dest_size_choices_t)tx_pkt_dest_size_choice == pkt_dest_size_choices_t::SAME_START_RNDROBIN_FIX_SIZE && rx_disable_header_check || (pkt_dest_size_choices_t)tx_pkt_dest_size_choice == pkt_dest_size_choices_t::RANDOM); bool pass = true; @@ -227,6 +234,40 @@ int main(int argc, char** argv) { {"FD_CORE_TYPE", std::to_string(0)}, // todo, support dispatch on eth }; + // Same kernel layout on both devices + // The addresses will be the same but on remote noc, + // however, still make two arrays for clarity in the compile args + // TODO(nhuang): Refactor / cleanup the duplicate code for the kernel setups + using topology = std::vector>; + + const topology input_topology = { + { "traffic_gen_tx", num_src_endpoints }, + { "mux", MAX_SWITCH_FAN_OUT }, + { "demux", MAX_SWITCH_FAN_OUT }, + { "traffic_gen_rx", MAX_SWITCH_FAN_OUT }, + }; + + const topology output_topology = { + { "traffic_gen_tx", num_src_endpoints }, + { "traffic_gen_tx_mock", num_src_endpoints }, + { "mux", MAX_SWITCH_FAN_OUT }, + { "demux", MAX_SWITCH_FAN_OUT }, + { "traffic_gen_rx", MAX_SWITCH_FAN_OUT }, + }; + + const topology tunneler_topology = { + { "input", MAX_TUNNEL_LANES }, + { "output", MAX_TUNNEL_LANES }, + }; + + const auto input_scratch_buffers_left = make_buffer_addresses_for_test(input_scratch_buffer_base_addr, packet_queue_ptr_buffer_size, input_topology); + const auto output_scratch_buffers_left = make_buffer_addresses_for_test(output_scratch_buffer_base_addr, packet_queue_ptr_buffer_size, output_topology); + const auto tunneler_buffers_left = make_buffer_addresses_for_test(tunneler_buffer_base_addr, packet_queue_ptr_buffer_size, tunneler_topology); + + const auto input_scratch_buffers_right = make_buffer_addresses_for_test(input_scratch_buffer_base_addr, packet_queue_ptr_buffer_size, input_topology); + const auto output_scratch_buffers_right = make_buffer_addresses_for_test(output_scratch_buffer_base_addr, packet_queue_ptr_buffer_size, output_topology); + const auto 
tunneler_buffers_right = make_buffer_addresses_for_test(tunneler_buffer_base_addr, packet_queue_ptr_buffer_size, tunneler_topology); + try { int num_devices = tt_metal::GetNumAvailableDevices(); if (test_device_id >= num_devices) { @@ -282,30 +323,35 @@ int main(int argc, char** argv) { for (uint32_t i = 0; i < num_src_endpoints; i++) { CoreCoord core = {tx_x + i, tx_y}; tx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = { - src_endpoint_start_id + i, // 0: src_endpoint_id - 1, // 1: num_dest_endpoints - (tx_queue_start_addr >> 4), // 2: queue_start_addr_words - (tx_queue_size_bytes >> 4), // 3: queue_size_words - ((mux_queue_start_addr + i * mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words - (uint32_t)mux_phys_core.x, // 6: remote_rx_x - (uint32_t)mux_phys_core.y, // 7: remote_rx_y - i, // 8: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type - test_results_addr, // 10: test_results_addr - test_results_size, // 11: test_results_size - prng_seed, // 12: prng_seed - data_kb_per_tx, // 13: total_data_kb - max_packet_size_words, // 14: max_packet_size_words - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id + i, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - tx_skip_pkt_content_gen, // 18: skip_pkt_content_gen - tx_pkt_dest_size_choice, // 19: pkt_dest_size_choice - tx_data_sent_per_iter_low, // 20: data_sent_per_iter_low - tx_data_sent_per_iter_high // 21: data_sent_per_iter_high - }; + std::vector compile_args = + { + src_endpoint_start_id + i, // 0: src_endpoint_id + 1, // 1: num_dest_endpoints + (tx_queue_start_addr >> 4), // 2: queue_start_addr_words + (tx_queue_size_bytes >> 4), // 3: queue_size_words + ((mux_queue_start_addr + i*mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words + (mux_queue_size_bytes >> 4), // 
5: remote_rx_queue_size_words + (uint32_t)mux_phys_core.x, // 6: remote_rx_x + (uint32_t)mux_phys_core.y, // 7: remote_rx_y + i, // 8: remote_rx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type + test_results_addr, // 10: test_results_addr + test_results_size, // 11: test_results_size + prng_seed, // 12: prng_seed + data_kb_per_tx, // 13: total_data_kb + max_packet_size_words, // 14: max_packet_size_words + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id+i, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + tx_skip_pkt_content_gen, // 18: skip_pkt_content_gen + tx_pkt_dest_size_choice, // 19: pkt_dest_size_choice + tx_data_sent_per_iter_low, // 20: data_sent_per_iter_low + tx_data_sent_per_iter_high, // 21: data_sent_per_iter_high + input_scratch_buffers_left.at("traffic_gen_tx")[i], // 22: traffic_gen_input_ptrs_addr + output_scratch_buffers_left.at("traffic_gen_tx_mock")[i], // 23: traffic_gen_input_mock_remote_ptrs_addr + output_scratch_buffers_left.at("traffic_gen_tx")[i], // 24: traffic_gen_output_ptrs_addr + input_scratch_buffers_left.at("mux")[i], // 25: traffic_gen_output_remote_ptrs_addr + }; log_info(LogTest, "run traffic_gen_tx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( @@ -323,30 +369,35 @@ int main(int argc, char** argv) { for (uint32_t i = 0; i < num_src_endpoints; i++) { CoreCoord core = {tx_x + i, tx_y}; tx_phys_core_r.push_back(device_r->worker_core_from_logical_core(core)); - std::vector compile_args = { - src_endpoint_start_id + i, // 0: src_endpoint_id - 1, // 1: num_dest_endpoints - (tx_queue_start_addr >> 4), // 2: queue_start_addr_words - (tx_queue_size_bytes >> 4), // 3: queue_size_words - ((mux_queue_start_addr + i * mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words - (uint32_t)mux_phys_core_r.x, // 6: remote_rx_x - 
(uint32_t)mux_phys_core_r.y, // 7: remote_rx_y - i, // 8: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type - test_results_addr, // 10: test_results_addr - test_results_size, // 11: test_results_size - prng_seed, // 12: prng_seed - data_kb_per_tx, // 13: total_data_kb - max_packet_size_words, // 14: max_packet_size_words - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id + i, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - tx_skip_pkt_content_gen, // 18: skip_pkt_content_gen - tx_pkt_dest_size_choice, // 19: pkt_dest_size_choice - tx_data_sent_per_iter_low, // 20: data_sent_per_iter_low - tx_data_sent_per_iter_high // 21: data_sent_per_iter_high - }; + std::vector compile_args = + { + src_endpoint_start_id + i, // 0: src_endpoint_id + 1, // 1: num_dest_endpoints + (tx_queue_start_addr >> 4), // 2: queue_start_addr_words + (tx_queue_size_bytes >> 4), // 3: queue_size_words + ((mux_queue_start_addr + i*mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words + (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words + (uint32_t)mux_phys_core_r.x, // 6: remote_rx_x + (uint32_t)mux_phys_core_r.y, // 7: remote_rx_y + i, // 8: remote_rx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type + test_results_addr, // 10: test_results_addr + test_results_size, // 11: test_results_size + prng_seed, // 12: prng_seed + data_kb_per_tx, // 13: total_data_kb + max_packet_size_words, // 14: max_packet_size_words + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id+i, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + tx_skip_pkt_content_gen, // 18: skip_pkt_content_gen + tx_pkt_dest_size_choice, // 19: pkt_dest_size_choice + tx_data_sent_per_iter_low, // 20: data_sent_per_iter_low + tx_data_sent_per_iter_high, // 21: data_sent_per_iter_high + 
input_scratch_buffers_right.at("traffic_gen_tx")[i], // 22: traffic_gen_input_ptrs_addr + output_scratch_buffers_right.at("traffic_gen_tx_mock")[i], // 23: traffic_gen_input_mock_remote_ptrs_addr + output_scratch_buffers_right.at("traffic_gen_tx")[i], // 24: traffic_gen_output_ptrs_addr + input_scratch_buffers_right.at("mux")[i], // 25: traffic_gen_output_remote_ptrs_addr + }; log_info(LogTest, "run traffic_gen_tx_r at x={},y={}", core.x, core.y); auto kernel_r = tt_metal::CreateKernel( @@ -364,27 +415,30 @@ int main(int argc, char** argv) { for (uint32_t i = 0; i < num_dest_endpoints; i++) { CoreCoord core = {rx_x + i, rx_y}; rx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = { - dest_endpoint_start_id + i, // 0: dest_endpoint_id - 1, // num_src_endpoints, // 1: num_src_endpoints - 1, // num_dest_endpoints, // 2: num_dest_endpoints - (rx_queue_start_addr >> 4), // 3: queue_start_addr_words - (rx_queue_size_bytes >> 4), // 4: queue_size_words - (uint32_t)demux_phys_core.x, // 5: remote_tx_x - (uint32_t)demux_phys_core.y, // 6: remote_tx_y - num_dest_endpoints + i, // 7: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type - test_results_addr, // 9: test_results_addr - test_results_size, // 10: test_results_size - prng_seed, // 11: prng_seed - 0, // 12: reserved - max_packet_size_words, // 13: max_packet_size_words - rx_disable_data_check, // 14: disable data check - src_endpoint_start_id + i, // 15: src_endpoint_start_id - dest_endpoint_start_id + i, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - rx_disable_header_check // 18: disable_header_check - }; + std::vector compile_args = + { + dest_endpoint_start_id + i, // 0: dest_endpoint_id + 1, //num_src_endpoints, // 1: num_src_endpoints + 1, //num_dest_endpoints, // 2: num_dest_endpoints + (rx_queue_start_addr >> 4), // 3: queue_start_addr_words + (rx_queue_size_bytes >> 4), // 4: 
queue_size_words + (uint32_t)demux_phys_core.x, // 5: remote_tx_x + (uint32_t)demux_phys_core.y, // 6: remote_tx_y + num_dest_endpoints + i, // 7: remote_tx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type + test_results_addr, // 9: test_results_addr + test_results_size, // 10: test_results_size + prng_seed, // 11: prng_seed + 0, // 12: reserved + max_packet_size_words, // 13: max_packet_size_words + rx_disable_data_check, // 14: disable data check + src_endpoint_start_id + i, // 15: src_endpoint_start_id + dest_endpoint_start_id + i, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + rx_disable_header_check, // 18: disable_header_check + input_scratch_buffers_left.at("traffic_gen_rx")[i], // 19: traffic_gen_input_ptrs_addr + output_scratch_buffers_left.at("demux")[i], // 20: traffic_gen_input_remote_ptrs_addr + }; log_info(LogTest, "run traffic_gen_rx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( @@ -402,27 +456,30 @@ int main(int argc, char** argv) { for (uint32_t i = 0; i < num_dest_endpoints; i++) { CoreCoord core = {rx_x + i, rx_y}; rx_phys_core_r.push_back(device_r->worker_core_from_logical_core(core)); - std::vector compile_args = { - dest_endpoint_start_id + i, // 0: dest_endpoint_id - 1, // num_src_endpoints, // 1: num_src_endpoints - 1, // num_dest_endpoints, // 2: num_dest_endpoints - (rx_queue_start_addr >> 4), // 3: queue_start_addr_words - (rx_queue_size_bytes >> 4), // 4: queue_size_words - (uint32_t)demux_phys_core_r.x, // 5: remote_tx_x - (uint32_t)demux_phys_core_r.y, // 6: remote_tx_y - num_dest_endpoints + i, // 7: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type - test_results_addr, // 9: test_results_addr - test_results_size, // 10: test_results_size - prng_seed, // 11: prng_seed - 0, // 12: reserved - max_packet_size_words, // 13: max_packet_size_words - rx_disable_data_check, // 14: disable 
data check - src_endpoint_start_id + i, // 15: src_endpoint_start_id - dest_endpoint_start_id + i, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - rx_disable_header_check // 18: disable_header_check - }; + std::vector compile_args = + { + dest_endpoint_start_id + i, // 0: dest_endpoint_id + 1, //num_src_endpoints, // 1: num_src_endpoints + 1, //num_dest_endpoints, // 2: num_dest_endpoints + (rx_queue_start_addr >> 4), // 3: queue_start_addr_words + (rx_queue_size_bytes >> 4), // 4: queue_size_words + (uint32_t)demux_phys_core_r.x, // 5: remote_tx_x + (uint32_t)demux_phys_core_r.y, // 6: remote_tx_y + num_dest_endpoints + i, // 7: remote_tx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type + test_results_addr, // 9: test_results_addr + test_results_size, // 10: test_results_size + prng_seed, // 11: prng_seed + 0, // 12: reserved + max_packet_size_words, // 13: max_packet_size_words + rx_disable_data_check, // 14: disable data check + src_endpoint_start_id + i, // 15: src_endpoint_start_id + dest_endpoint_start_id + i, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + rx_disable_header_check, // 18: disable_header_check + input_scratch_buffers_right.at("traffic_gen_rx")[i], // 19: traffic_gen_input_ptrs_addr + output_scratch_buffers_right.at("demux")[i], // 20: traffic_gen_input_remote_ptrs_addr + }; log_info(LogTest, "run traffic_gen_rx_r at x={},y={}", core.x, core.y); auto kernel_r = tt_metal::CreateKernel( @@ -441,61 +498,66 @@ int main(int argc, char** argv) { } // Mux - std::vector mux_compile_args = { - 0, // 0: reserved - (mux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 2: rx_queue_size_words - num_src_endpoints, // 3: mux_fan_in - packet_switch_4B_pack( - (uint32_t)tunneler_phys_core.x, - (uint32_t)tunneler_phys_core.y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: dest 0 info - 
packet_switch_4B_pack( - (uint32_t)tunneler_phys_core.x, - (uint32_t)tunneler_phys_core.y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: dest 1 info - 0, // 6: dest 2 info - 0, // 7: dest 3 info - - (tunneler_queue_start_addr >> 4), // 8: remote_tx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words - ((tunneler_queue_start_addr + tunneler_queue_size_bytes) >> 4), // 10: remote_tx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 11: remote_tx_queue_size_words - 0, // 12: remote_tx_queue_start_addr_words - 0, // 13: remote_tx_queue_size_words - 0, // 14: remote_tx_queue_start_addr_words - 0, // 15: remote_tx_queue_size_words - packet_switch_4B_pack( - (uint32_t)tx_phys_core[0].x, - (uint32_t)tx_phys_core[0].y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 16: src 0 info - packet_switch_4B_pack( - (uint32_t)tx_phys_core[1].x, - (uint32_t)tx_phys_core[1].y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 17: src 1 info - 0, // 18: src 2 info - 0, // 19: src 3 info - 0, - 0, // 20, 21 - test_results_addr, // 22: test_results_addr - test_results_size, // 23: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 24: timeout_cycles - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0 // 25-35: packetize/depacketize settings - }; + std::vector mux_compile_args = + { + 0, // 0: reserved + (mux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words + (mux_queue_size_bytes >> 4), // 2: rx_queue_size_words + num_src_endpoints, // 3: mux_fan_in + packet_switch_4B_pack((uint32_t)tunneler_phys_core.x, + (uint32_t)tunneler_phys_core.y, + 0, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: dest 0 info + packet_switch_4B_pack((uint32_t)tunneler_phys_core.x, + (uint32_t)tunneler_phys_core.y, + 1, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: dest 1 info + 0, // 6: dest 2 info + 0, // 7: dest 3 info + (tunneler_queue_start_addr >> 4), // 8: remote_tx_queue_start_addr_words + 
(tunneler_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words + ((tunneler_queue_start_addr + tunneler_queue_size_bytes) >> 4), // 10: remote_tx_queue_start_addr_words + (tunneler_queue_size_bytes >> 4), // 11: remote_tx_queue_size_words + 0, // 12: remote_tx_queue_start_addr_words + 0, // 13: remote_tx_queue_size_words + 0, // 14: remote_tx_queue_start_addr_words + 0, // 15: remote_tx_queue_size_words + packet_switch_4B_pack((uint32_t)tx_phys_core[0].x, + (uint32_t)tx_phys_core[0].y, + 1, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 16: src 0 info + packet_switch_4B_pack((uint32_t)tx_phys_core[1].x, + (uint32_t)tx_phys_core[1].y, + 1, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 17: src 1 info + 0, // 18: src 2 info + 0, // 19: src 3 info + 0, 0, // 20, 21 + test_results_addr, // 22: test_results_addr + test_results_size, // 23: test_results_size + timeout_mcycles * 1000 * 1000 * 4, // 24: timeout_cycles + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 25 - 35: packetize/depacketize settings + + input_scratch_buffers_left.at("mux")[0], // 36: vc_packet_router_input_scratch_buffers[0] + input_scratch_buffers_left.at("mux")[1], // 37: vc_packet_router_input_scratch_buffers[1] + input_scratch_buffers_left.at("mux")[2], // 38: vc_packet_router_input_scratch_buffers[2] + input_scratch_buffers_left.at("mux")[3], // 39: vc_packet_router_input_scratch_buffers[3] + + output_scratch_buffers_left.at("traffic_gen_tx")[0], // 40: vc_packet_router_input_remote_scratch_buffers[0] + output_scratch_buffers_left.at("traffic_gen_tx")[1], // 41: vc_packet_router_input_remote_scratch_buffers[1] + 0, // 42: vc_packet_router_input_remote_scratch_buffers[2] + 0, // 43: vc_packet_router_input_remote_scratch_buffers[3] + + output_scratch_buffers_left.at("mux")[0], // 44: vc_packet_router_output_scratch_buffers[0] + output_scratch_buffers_left.at("mux")[1], // 45: vc_packet_router_output_scratch_buffers[1] + output_scratch_buffers_left.at("mux")[2], // 46: 
vc_packet_router_output_scratch_buffers[2] + output_scratch_buffers_left.at("mux")[3], // 47: vc_packet_router_output_scratch_buffers[3] + + tunneler_buffers_left.at("input")[0], // 48: vc_packet_router_output_remote_scratch_buffers[0] + tunneler_buffers_left.at("input")[1], // 49: vc_packet_router_output_remote_scratch_buffers[1] + 0, // 50: vc_packet_router_output_remote_scratch_buffers[2] + 0, // 51: vc_packet_router_output_remote_scratch_buffers[3] + }; log_info(LogTest, "run mux at x={},y={}", mux_core.x, mux_core.y); auto mux_kernel = tt_metal::CreateKernel( @@ -506,63 +568,69 @@ int main(int argc, char** argv) { .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default, .compile_args = mux_compile_args, - .defines = defines}); - - std::vector mux_compile_args_r = { - 0, // 0: reserved - (mux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 2: rx_queue_size_words - num_src_endpoints, // 3: mux_fan_in - packet_switch_4B_pack( - (uint32_t)r_tunneler_phys_core.x, - (uint32_t)r_tunneler_phys_core.y, - 2, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: dest 0 info - packet_switch_4B_pack( - (uint32_t)r_tunneler_phys_core.x, - (uint32_t)r_tunneler_phys_core.y, - 3, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: dest 1 info - 0, // 6: dest 2 info - 0, // 7: dest 3 info - - ((tunneler_queue_start_addr + 2 * tunneler_queue_size_bytes) >> 4), // 8: remote_tx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words - ((tunneler_queue_start_addr + 3 * tunneler_queue_size_bytes) >> 4), // 10: remote_tx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 11: remote_tx_queue_size_words - 0, // 12: remote_tx_queue_start_addr_words - 0, // 13: remote_tx_queue_size_words - 0, // 14: remote_tx_queue_start_addr_words - 0, // 15: remote_tx_queue_size_words - packet_switch_4B_pack( - (uint32_t)tx_phys_core_r[0].x, - (uint32_t)tx_phys_core_r[0].y, - 1, - 
(uint32_t)DispatchRemoteNetworkType::NOC0), // 16: src 0 info - packet_switch_4B_pack( - (uint32_t)tx_phys_core_r[1].x, - (uint32_t)tx_phys_core_r[1].y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 17: src 1 info - 0, // 18: src 2 info - 0, // 19: src 3 info - 0, - 0, // 20, 21 - test_results_addr, // 22: test_results_addr - test_results_size, // 23: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 24: timeout_cycles - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0 // 25-35: packetize/depacketize settings - }; + .defines = defines + } + ); + + std::vector mux_compile_args_r = + { + 0, // 0: reserved + (mux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words + (mux_queue_size_bytes >> 4), // 2: rx_queue_size_words + num_src_endpoints, // 3: mux_fan_in + packet_switch_4B_pack((uint32_t)r_tunneler_phys_core.x, + (uint32_t)r_tunneler_phys_core.y, + 2, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: dest 0 info + packet_switch_4B_pack((uint32_t)r_tunneler_phys_core.x, + (uint32_t)r_tunneler_phys_core.y, + 3, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: dest 1 info + 0, // 6: dest 2 info + 0, // 7: dest 3 info + ((tunneler_queue_start_addr + 2 * tunneler_queue_size_bytes) >> 4), // 8: remote_tx_queue_start_addr_words + (tunneler_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words + ((tunneler_queue_start_addr + 3 * tunneler_queue_size_bytes) >> 4), // 10: remote_tx_queue_start_addr_words + (tunneler_queue_size_bytes >> 4), // 11: remote_tx_queue_size_words + 0, // 12: remote_tx_queue_start_addr_words + 0, // 13: remote_tx_queue_size_words + 0, // 14: remote_tx_queue_start_addr_words + 0, // 15: remote_tx_queue_size_words + packet_switch_4B_pack((uint32_t)tx_phys_core_r[0].x, + (uint32_t)tx_phys_core_r[0].y, + 1, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 16: src 0 info + packet_switch_4B_pack((uint32_t)tx_phys_core_r[1].x, + (uint32_t)tx_phys_core_r[1].y, + 1, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 17: src 1 info 
+ 0, // 18: src 2 info + 0, // 19: src 3 info + 0, 0, // 20, 21 + test_results_addr, // 22: test_results_addr + test_results_size, // 23: test_results_size + timeout_mcycles * 1000 * 1000 * 4, // 24: timeout_cycles + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 25 - 35: packetize/depacketize settings + input_scratch_buffers_right.at("mux")[0], // 36: vc_packet_router_input_scratch_buffers[0] + input_scratch_buffers_right.at("mux")[1], // 37: vc_packet_router_input_scratch_buffers[1] + input_scratch_buffers_right.at("mux")[2], // 38: vc_packet_router_input_scratch_buffers[2] + input_scratch_buffers_right.at("mux")[3], // 39: vc_packet_router_input_scratch_buffers[3] + + output_scratch_buffers_right.at("traffic_gen_tx")[0], // 40: vc_packet_router_input_remote_scratch_buffers[0] + output_scratch_buffers_right.at("traffic_gen_tx")[1], // 41: vc_packet_router_input_remote_scratch_buffers[1] + 0, // 42: vc_packet_router_input_remote_scratch_buffers[2] + 0, // 43: vc_packet_router_input_remote_scratch_buffers[3] + + output_scratch_buffers_right.at("mux")[0], // 44: vc_packet_router_output_scratch_buffers[0] + output_scratch_buffers_right.at("mux")[1], // 45: vc_packet_router_output_scratch_buffers[1] + output_scratch_buffers_right.at("mux")[2], // 46: vc_packet_router_output_scratch_buffers[2] + output_scratch_buffers_right.at("mux")[3], // 47: vc_packet_router_output_scratch_buffers[3] + + tunneler_buffers_right.at("input")[2], // 48: vc_packet_router_output_remote_scratch_buffers[0] + tunneler_buffers_right.at("input")[3], // 49: vc_packet_router_output_remote_scratch_buffers[1] + 0, // 50: vc_packet_router_output_remote_scratch_buffers[2] + 0, // 51: vc_packet_router_output_remote_scratch_buffers[3] + }; log_info(LogTest, "run mux at x={},y={}", mux_core.x, mux_core.y); auto mux_kernel_r = tt_metal::CreateKernel( @@ -573,264 +641,258 @@ int main(int argc, char** argv) { .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default, .compile_args 
= mux_compile_args_r, - .defines = defines}); - - std::vector tunneler_l_compile_args = { - dest_endpoint_start_id, // 0: endpoint_id_start_index - 2 * num_endpoints, // 1: tunnel_lanes. 1 => Unidirectional. 2 => Bidirectional. - (tunneler_queue_start_addr >> 4), // 2: rx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 3: rx_queue_size_words - - packet_switch_4B_pack( - r_tunneler_phys_core.x, - r_tunneler_phys_core.y, - 0, - (uint32_t)DispatchRemoteNetworkType::ETH), // 4: remote_receiver_0_info - packet_switch_4B_pack( - r_tunneler_phys_core.x, - r_tunneler_phys_core.y, - 1, - (uint32_t)DispatchRemoteNetworkType::ETH), // 5: remote_receiver_1_info - packet_switch_4B_pack( - demux_phys_core.x, - demux_phys_core.y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 6: remote_receiver_2_info - packet_switch_4B_pack( - demux_phys_core.x, - demux_phys_core.y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 7: remote_receiver_3_info - 0, - 0, // 8 - 9: remote_receiver 4 - 5 - 0, - 0, // 10 - 11: remote_receiver 6 - 7 - 0, - 0, // 12 - 13: remote_receiver 8 - 9 - - (tunneler_queue_start_addr >> 4), // 14: remote_receiver_queue_start_addr_words 0 - (tunneler_queue_size_bytes >> 4), // 15: remote_receiver_queue_size_words 0 - ((tunneler_queue_start_addr + tunneler_queue_size_bytes) >> - 4), // 16: remote_receiver_queue_start_addr_words 1 - (tunneler_queue_size_bytes >> 4), // 17: remote_receiver_queue_size_words 1 - (demux_queue_start_addr >> 4), // 18: remote_receiver_queue_start_addr_words 4 - (demux_queue_size_bytes >> 4), // 19: remote_receiver_queue_size_words 4 - ((demux_queue_start_addr + demux_queue_size_bytes) >> 4), // 20: remote_receiver_queue_start_addr_words 5 - (demux_queue_size_bytes >> 4), // 21: remote_receiver_queue_size_words 5 - 0, - 2, // 22 - 23 Settings for remote reciver 4 - 0, - 2, // 24 - 25 Settings for remote reciver 5 - 0, - 2, // 26 - 27 Settings for remote reciver 6 - 0, - 2, // 28 - 29 Settings for remote reciver 7 - 
0, - 2, // 30 - 31 Settings for remote reciver 8 - 0, // 32: remote_receiver_queue_start_addr_words 9 - 2, // 33: remote_receiver_queue_size_words 9. - // Unused. Setting to 2 to get around size check assertion that does not allow 0. - - packet_switch_4B_pack( - mux_phys_core.x, - mux_phys_core.y, - num_dest_endpoints, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 34: remote_sender_0_info - packet_switch_4B_pack( - mux_phys_core.x, - mux_phys_core.y, - num_dest_endpoints + 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 35: remote_sender_1_info - packet_switch_4B_pack( - r_tunneler_phys_core.x, - r_tunneler_phys_core.y, - 6, - (uint32_t)DispatchRemoteNetworkType::ETH), // 36: remote_sender_2_info - packet_switch_4B_pack( - r_tunneler_phys_core.x, - r_tunneler_phys_core.y, - 7, - (uint32_t)DispatchRemoteNetworkType::ETH), // 37: remote_sender_3_info - 0, - 0, // 38 - 39: remote_sender 4 - 5 - 0, - 0, // 40 - 41: remote_sender 6 - 7 - 0, - 0, // 42 - 43: remote_sender 8 - 9 - - tunneler_test_results_addr, // 44: test_results_addr - tunneler_test_results_size, // 45: test_results_size - 0, // 46: timeout_cycles - 0, // 47: inner_stop_mux_d_bypass - }; + .defines = defines + } + ); + + std::vector tunneler_l_compile_args = + { + dest_endpoint_start_id, // 0: endpoint_id_start_index + 2*num_endpoints, // 1: tunnel_lanes. 1 => Unidirectional. 2 => Bidirectional. 
+ (tunneler_queue_start_addr >> 4), // 2: rx_queue_start_addr_words + (tunneler_queue_size_bytes >> 4), // 3: rx_queue_size_words + + packet_switch_4B_pack(r_tunneler_phys_core.x, + r_tunneler_phys_core.y, + 0, + (uint32_t)DispatchRemoteNetworkType::ETH), // 4: remote_receiver_0_info + packet_switch_4B_pack(r_tunneler_phys_core.x, + r_tunneler_phys_core.y, + 1, + (uint32_t)DispatchRemoteNetworkType::ETH), // 5: remote_receiver_1_info + packet_switch_4B_pack(demux_phys_core.x, + demux_phys_core.y, + 0, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 6: remote_receiver_2_info + packet_switch_4B_pack(demux_phys_core.x, + demux_phys_core.y, + 1, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 7: remote_receiver_3_info + 0, 0, 0, 0, 0, 0, // 8 - 13: remote_receiver 4 - 9 + (tunneler_queue_start_addr >> 4), // 14: remote_receiver_queue_start_addr_words 0 + (tunneler_queue_size_bytes >> 4), // 15: remote_receiver_queue_size_words 0 + ((tunneler_queue_start_addr + tunneler_queue_size_bytes) >> 4), // 16: remote_receiver_queue_start_addr_words 1 + (tunneler_queue_size_bytes >> 4), // 17: remote_receiver_queue_size_words 1 + (demux_queue_start_addr >> 4), // 18: remote_receiver_queue_start_addr_words 2 + (demux_queue_size_bytes >> 4), // 19: remote_receiver_queue_size_words 2 + ((demux_queue_start_addr + demux_queue_size_bytes) >> 4), // 20: remote_receiver_queue_start_addr_words 3 + (demux_queue_size_bytes >> 4), // 21: remote_receiver_queue_size_words 3 + 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, // 22 - 31: Settings for remote receiver[4 - 8] + 0, // 32: remote_receiver_queue_start_addr_words 9 + 2, // 33: remote_receiver_queue_size_words 9. Unused. Setting to 2 to get around size check assertion that does not allow 0.
+ packet_switch_4B_pack(mux_phys_core.x, + mux_phys_core.y, + num_dest_endpoints, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 34: remote_sender_0_info + packet_switch_4B_pack(mux_phys_core.x, + mux_phys_core.y, + num_dest_endpoints + 1, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 35: remote_sender_1_info + packet_switch_4B_pack(r_tunneler_phys_core.x, + r_tunneler_phys_core.y, + 6, + (uint32_t)DispatchRemoteNetworkType::ETH), // 36: remote_sender_2_info + packet_switch_4B_pack(r_tunneler_phys_core.x, + r_tunneler_phys_core.y, + 7, + (uint32_t)DispatchRemoteNetworkType::ETH), // 37: remote_sender_3_info + 0, 0, 0, 0, 0, 0, // 38 - 43: remote_sender[4 - 9] + tunneler_test_results_addr, // 44: test_results_addr + tunneler_test_results_size, // 45: test_results_size + timeout_mcycles * 1000 * 1000 * 4, // 46: timeout_cycles + 0, // 47: inner_stop_mux_d_bypass + tunneler_buffers_left.at("input")[0], // 48: vc_eth_tunneler_input_scratch_buffers[0] + tunneler_buffers_left.at("input")[1], // 49: vc_eth_tunneler_input_scratch_buffers[1] + tunneler_buffers_left.at("input")[2], // 50: vc_eth_tunneler_input_scratch_buffers[2] + tunneler_buffers_left.at("input")[3], // 51: vc_eth_tunneler_input_scratch_buffers[3] + 0, 0, 0, 0, 0, 0, // 52 - 57: vc_eth_tunneler_input_scratch_buffers[4 - 9] + + output_scratch_buffers_left.at("mux")[0], // 58: vc_eth_tunneler_input_remote_scratch_buffers[0] + output_scratch_buffers_left.at("mux")[1], // 59: vc_eth_tunneler_input_remote_scratch_buffers[1] + tunneler_buffers_right.at("output")[2], // 60: vc_eth_tunneler_input_remote_scratch_buffers[2] + tunneler_buffers_right.at("output")[3], // 61: vc_eth_tunneler_input_remote_scratch_buffers[3] + 0, 0, 0, 0, 0, 0, // 62 - 67: vc_eth_tunneler_input_remote_scratch_buffers[4 - 9] + + tunneler_buffers_left.at("output")[0], // 68: vc_eth_tunneler_output_scratch_buffers[0] + tunneler_buffers_left.at("output")[1], // 69: vc_eth_tunneler_output_scratch_buffers[1] +
tunneler_buffers_left.at("output")[2], // 70: vc_eth_tunneler_output_scratch_buffers[2] + tunneler_buffers_left.at("output")[3], // 71: vc_eth_tunneler_output_scratch_buffers[3] + 0, 0, 0, 0, 0, 0, // 72 - 77: vc_eth_tunneler_output_scratch_buffers[4 - 9] + + tunneler_buffers_right.at("input")[0], // 78: vc_eth_tunneler_output_remote_scratch_buffers[0] + tunneler_buffers_right.at("input")[1], // 79: vc_eth_tunneler_output_remote_scratch_buffers[1] + input_scratch_buffers_left.at("demux")[0], // 80: vc_eth_tunneler_output_remote_scratch_buffers[2] + input_scratch_buffers_left.at("demux")[1], // 81: vc_eth_tunneler_output_remote_scratch_buffers[3] + 0, 0, 0, 0, 0, 0, // 82 - 87: vc_eth_tunneler_output_remote_scratch_buffers[4 - 9] + }; auto tunneler_l_kernel = tt_metal::CreateKernel( program, "tt_metal/impl/dispatch/kernels/vc_eth_tunneler.cpp", tunneler_logical_core, tt_metal::EthernetConfig{ - .noc = tt_metal::NOC::NOC_0, .compile_args = tunneler_l_compile_args, .defines = defines}); - - std::vector tunneler_r_compile_args = { - dest_endpoint_start_id, // 0: endpoint_id_start_index - 2 * num_endpoints, // 1: tunnel_lanes. 1 => Unidirectional. 2 => Bidirectional. 
- (tunneler_queue_start_addr >> 4), // 2: rx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 3: rx_queue_size_words - - packet_switch_4B_pack( - demux_phys_core_r.x, - demux_phys_core_r.y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: remote_receiver_0_info - packet_switch_4B_pack( - demux_phys_core_r.x, - demux_phys_core_r.y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: remote_receiver_1_info - packet_switch_4B_pack( - tunneler_phys_core.x, - tunneler_phys_core.y, - 2, - (uint32_t)DispatchRemoteNetworkType::ETH), // 6: remote_receiver_2_info - packet_switch_4B_pack( - tunneler_phys_core.x, - tunneler_phys_core.y, - 3, - (uint32_t)DispatchRemoteNetworkType::ETH), // 7: remote_receiver_3_info - 0, - 0, // 8 - 9: remote_receiver 4 - 5 - 0, - 0, // 10 - 11: remote_receiver 6 - 7 - 0, - 0, // 12 - 13: remote_receiver 8 - 9 - - (demux_queue_start_addr >> 4), // 14: remote_receiver_queue_start_addr_words 0 - (demux_queue_size_bytes >> 4), // 15: remote_receiver_queue_size_words 0 - ((demux_queue_start_addr + demux_queue_size_bytes) >> 4), // 16: remote_receiver_queue_start_addr_words 1 - (demux_queue_size_bytes >> 4), // 17: remote_receiver_queue_size_words 1 - ((tunneler_queue_start_addr + 2 * tunneler_queue_size_bytes) >> - 4), // 18: remote_receiver_queue_start_addr_words 4 - (tunneler_queue_size_bytes >> 4), // 19: remote_receiver_queue_size_words 4 - ((tunneler_queue_start_addr + 3 * tunneler_queue_size_bytes) >> - 4), // 20: remote_receiver_queue_start_addr_words 5 - (tunneler_queue_size_bytes >> 4), // 21: remote_receiver_queue_size_words 5 - 0, - 2, // 22 - 23 Settings for remote reciver 4 - 0, - 2, // 24 - 25 Settings for remote reciver 5 - 0, - 2, // 26 - 27 Settings for remote reciver 6 - 0, - 2, // 28 - 29 Settings for remote reciver 7 - 0, - 2, // 30 - 31 Settings for remote reciver 8 - 0, // 32: remote_receiver_queue_start_addr_words 9 - 2, // 33: remote_receiver_queue_size_words 9. - // Unused. 
Setting to 2 to get around size check assertion that does not allow 0. - - packet_switch_4B_pack( - tunneler_phys_core.x, - tunneler_phys_core.y, - 4, - (uint32_t)DispatchRemoteNetworkType::ETH), // 34: remote_sender_0_info - packet_switch_4B_pack( - tunneler_phys_core.x, - tunneler_phys_core.y, - 5, - (uint32_t)DispatchRemoteNetworkType::ETH), // 35: remote_sender_1_info - packet_switch_4B_pack( - mux_phys_core_r.x, - mux_phys_core_r.y, - num_dest_endpoints, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 36: remote_sender_2_info - packet_switch_4B_pack( - mux_phys_core_r.x, - mux_phys_core_r.y, - num_dest_endpoints + 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 37: remote_sender_3_info - 0, - 0, // 38 - 39: remote_sender 4 - 5 - 0, - 0, // 40 - 41: remote_sender 6 - 7 - 0, - 0, // 42 - 43: remote_sender 8 - 9 - - tunneler_test_results_addr, // 44: test_results_addr - tunneler_test_results_size, // 45: test_results_size - 0, // 46: timeout_cycles - 0, // 47: inner_stop_mux_d_bypass - }; + .noc = tt_metal::NOC::NOC_0, + .compile_args = tunneler_l_compile_args, + .defines = defines + } + ); + + + std::vector tunneler_r_compile_args = + { + dest_endpoint_start_id, // 0: endpoint_id_start_index + 2*num_endpoints, // 1: tunnel_lanes. 1 => Unidirectional. 2 => Bidirectional. 
+ (tunneler_queue_start_addr >> 4), // 2: rx_queue_start_addr_words + (tunneler_queue_size_bytes >> 4), // 3: rx_queue_size_words + + packet_switch_4B_pack(demux_phys_core_r.x, + demux_phys_core_r.y, + 0, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: remote_receiver_0_info + packet_switch_4B_pack(demux_phys_core_r.x, + demux_phys_core_r.y, + 1, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: remote_receiver_1_info + packet_switch_4B_pack(tunneler_phys_core.x, + tunneler_phys_core.y, + 2, + (uint32_t)DispatchRemoteNetworkType::ETH), // 6: remote_receiver_2_info + packet_switch_4B_pack(tunneler_phys_core.x, + tunneler_phys_core.y, + 3, + (uint32_t)DispatchRemoteNetworkType::ETH), // 7: remote_receiver_3_info + 0, 0, 0, 0, 0, 0, // 8 - 13: remote_receiver 4 - 9 + (demux_queue_start_addr >> 4), // 14: remote_receiver_queue_start_addr_words 0 + (demux_queue_size_bytes >> 4), // 15: remote_receiver_queue_size_words 0 + ((demux_queue_start_addr + demux_queue_size_bytes) >> 4), // 16: remote_receiver_queue_start_addr_words 1 + (demux_queue_size_bytes >> 4), // 17: remote_receiver_queue_size_words 1 + ((tunneler_queue_start_addr + 2 * tunneler_queue_size_bytes) >> 4), // 18: remote_receiver_queue_start_addr_words 2 + (tunneler_queue_size_bytes >> 4), // 19: remote_receiver_queue_size_words 2 + ((tunneler_queue_start_addr + 3 * tunneler_queue_size_bytes) >> 4), // 20: remote_receiver_queue_start_addr_words 3 + (tunneler_queue_size_bytes >> 4), // 21: remote_receiver_queue_size_words 3 + 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, // 22 - 31: Settings for remote receiver[4 - 8] + 0, // 32: remote_receiver_queue_start_addr_words 9 + 2, // 33: remote_receiver_queue_size_words 9. Unused. Setting to 2 to get around size check assertion that does not allow 0.
+ packet_switch_4B_pack(tunneler_phys_core.x, + tunneler_phys_core.y, + 4, + (uint32_t)DispatchRemoteNetworkType::ETH), // 34: remote_sender_0_info + packet_switch_4B_pack(tunneler_phys_core.x, + tunneler_phys_core.y, + 5, + (uint32_t)DispatchRemoteNetworkType::ETH), // 35: remote_sender_1_info + packet_switch_4B_pack(mux_phys_core_r.x, + mux_phys_core_r.y, + num_dest_endpoints, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 36: remote_sender_2_info + packet_switch_4B_pack(mux_phys_core_r.x, + mux_phys_core_r.y, + num_dest_endpoints+1, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 37: remote_sender_3_info + 0, 0, 0, 0, 0, 0, // 38 - 43: remote_sender[4 - 9] + tunneler_test_results_addr, // 44: test_results_addr + tunneler_test_results_size, // 45: test_results_size + timeout_mcycles * 1000 * 1000 * 4, // 46: timeout_cycles + 0, // 47: inner_stop_mux_d_bypass + tunneler_buffers_right.at("input")[0], // 48: vc_eth_tunneler_input_scratch_buffers[0] + tunneler_buffers_right.at("input")[1], // 49: vc_eth_tunneler_input_scratch_buffers[1] + tunneler_buffers_right.at("input")[2], // 50: vc_eth_tunneler_input_scratch_buffers[2] + tunneler_buffers_right.at("input")[3], // 51: vc_eth_tunneler_input_scratch_buffers[3] + 0, 0, 0, 0, 0, 0, // 52 - 57: vc_eth_tunneler_input_scratch_buffers[4 - 9] + + tunneler_buffers_left.at("output")[0], // 58: vc_eth_tunneler_input_remote_scratch_buffers[0] + tunneler_buffers_left.at("output")[1], // 59: vc_eth_tunneler_input_remote_scratch_buffers[1] + output_scratch_buffers_right.at("mux")[0], // 60: vc_eth_tunneler_input_remote_scratch_buffers[2] + output_scratch_buffers_right.at("mux")[1], // 61: vc_eth_tunneler_input_remote_scratch_buffers[3] + 0, 0, 0, 0, 0, 0, // 62 - 67: vc_eth_tunneler_input_remote_scratch_buffers[4 - 9] + + tunneler_buffers_right.at("output")[0], // 68: vc_eth_tunneler_output_scratch_buffers[0] + tunneler_buffers_right.at("output")[1], // 69: vc_eth_tunneler_output_scratch_buffers[1] + 
tunneler_buffers_right.at("output")[2], // 70: vc_eth_tunneler_output_scratch_buffers[2] + tunneler_buffers_right.at("output")[3], // 71: vc_eth_tunneler_output_scratch_buffers[3] + 0, 0, 0, 0, 0, 0, // 72 - 77: vc_eth_tunneler_output_scratch_buffers[4 - 9] + + input_scratch_buffers_right.at("demux")[0], // 78: vc_eth_tunneler_output_remote_scratch_buffers[0] + input_scratch_buffers_right.at("demux")[1], // 79: vc_eth_tunneler_output_remote_scratch_buffers[1] + tunneler_buffers_left.at("input")[2], // 80: vc_eth_tunneler_output_remote_scratch_buffers[2] + tunneler_buffers_left.at("input")[3], // 81: vc_eth_tunneler_output_remote_scratch_buffers[3] + 0, 0, 0, 0, 0, 0, // 82 - 87: vc_eth_tunneler_output_remote_scratch_buffers[4 - 9] + }; auto tunneler_r_kernel = tt_metal::CreateKernel( program_r, "tt_metal/impl/dispatch/kernels/vc_eth_tunneler.cpp", r_tunneler_logical_core, tt_metal::EthernetConfig{ - .noc = tt_metal::NOC::NOC_0, .compile_args = tunneler_r_compile_args, .defines = defines}); + .noc = tt_metal::NOC::NOC_0, + .compile_args = tunneler_r_compile_args, + .defines = defines + } + ); // Demux uint32_t dest_map_array[4] = {0, 1, 2, 3}; uint64_t dest_endpoint_output_map = packet_switch_dest_pack(dest_map_array, 4); - std::vector demux_compile_args = { - dest_endpoint_start_id, // 0: endpoint_id_start_index - (demux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words - (demux_queue_size_bytes >> 4), // 2: rx_queue_size_words - num_dest_endpoints, // 3: demux_fan_out - packet_switch_4B_pack( - rx_phys_core[0].x, - rx_phys_core[0].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: remote_tx_0_info - packet_switch_4B_pack( - rx_phys_core[1].x, - rx_phys_core[1].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: remote_tx_1_info - 0, // 6: remote_tx_2_info - 0, // 7: remote_tx_3_info - (rx_queue_start_addr >> 4), // 8: remote_tx_queue_start_addr_words 0 - (rx_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words 0 - (rx_queue_start_addr >> 
4), // 10: remote_tx_queue_start_addr_words 1 - (rx_queue_size_bytes >> 4), // 11: remote_tx_queue_size_words 1 - 0, // 12: remote_tx_queue_start_addr_words 2 - 0, // 13: remote_tx_queue_size_words 2 - 0, // 14: remote_tx_queue_start_addr_words 3 - 0, // 15: remote_tx_queue_size_words 3 - //(uint32_t)tunneler_phys_core.x, // 16: remote_rx_x - //(uint32_t)tunneler_phys_core.y, // 17: remote_rx_y - // 3, // 18: remote_rx_queue_id - //(uint32_t)DispatchRemoteNetworkType::NOC0, // 19: tx_network_type - - packet_switch_4B_pack( - tunneler_phys_core.x, - tunneler_phys_core.y, - 6, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 16: remote_rx_0_info - packet_switch_4B_pack( - tunneler_phys_core.x, - tunneler_phys_core.y, - 7, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 17: remote_rx_1_info - 0, // 18: remote_rx_2_info - 0, // 19: remote_rx_3_info - - (uint32_t)(dest_endpoint_output_map >> 32), // 20: dest_endpoint_output_map_hi - (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF), // 21: dest_endpoint_output_map_lo - test_results_addr, // 22: test_results_addr - test_results_size, // 23: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 24: timeout_cycles - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0 // 25-35: packetize/depacketize settings - }; + std::vector demux_compile_args = + { + dest_endpoint_start_id, // 0: endpoint_id_start_index + (demux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words + (demux_queue_size_bytes >> 4), // 2: rx_queue_size_words + num_dest_endpoints, // 3: demux_fan_out + packet_switch_4B_pack(rx_phys_core[0].x, + rx_phys_core[0].y, + 0, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: remote_tx_0_info + packet_switch_4B_pack(rx_phys_core[1].x, + rx_phys_core[1].y, + 0, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: remote_tx_1_info + 0, // 6: remote_tx_2_info + 0, // 7: remote_tx_3_info + (rx_queue_start_addr >> 4), // 8: remote_tx_queue_start_addr_words 0 + (rx_queue_size_bytes >> 4), // 9: 
remote_tx_queue_size_words 0 + (rx_queue_start_addr >> 4), // 10: remote_tx_queue_start_addr_words 1 + (rx_queue_size_bytes >> 4), // 11: remote_tx_queue_size_words 1 + 0, // 12: remote_tx_queue_start_addr_words 2 + 0, // 13: remote_tx_queue_size_words 2 + 0, // 14: remote_tx_queue_start_addr_words 3 + 0, // 15: remote_tx_queue_size_words 3 + packet_switch_4B_pack(tunneler_phys_core.x, + tunneler_phys_core.y, + 6, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 16: remote_rx_0_info + packet_switch_4B_pack(tunneler_phys_core.x, + tunneler_phys_core.y, + 7, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 17: remote_rx_1_info + 0, // 18: remote_rx_2_info + 0, // 19: remote_rx_3_info + (uint32_t)(dest_endpoint_output_map >> 32), // 20: dest_endpoint_output_map_hi + (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF), // 21: dest_endpoint_output_map_lo + test_results_addr, // 22: test_results_addr + test_results_size, // 23: test_results_size + timeout_mcycles * 1000 * 1000 * 4, // 24: timeout_cycles + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 25 - 35: packetize/depacketize settings + input_scratch_buffers_left.at("demux")[0], // 36: vc_packet_router_input_scratch_buffers[0] + input_scratch_buffers_left.at("demux")[1], // 37: vc_packet_router_input_scratch_buffers[1] + input_scratch_buffers_left.at("demux")[2], // 38: vc_packet_router_input_scratch_buffers[2] + input_scratch_buffers_left.at("demux")[3], // 39: vc_packet_router_input_scratch_buffers[3] + + tunneler_buffers_left.at("output")[2], // 40: vc_packet_router_input_remote_scratch_buffers[0] + tunneler_buffers_left.at("output")[3], // 41: vc_packet_router_input_remote_scratch_buffers[1] + 0, // 42: vc_packet_router_input_remote_scratch_buffers[2] + 0, // 43: vc_packet_router_input_remote_scratch_buffers[3] + + output_scratch_buffers_left.at("demux")[0], // 44: vc_packet_router_output_scratch_buffers[0] + output_scratch_buffers_left.at("demux")[1], // 45: vc_packet_router_output_scratch_buffers[1] + 
output_scratch_buffers_left.at("demux")[2], // 46: vc_packet_router_output_scratch_buffers[2] + output_scratch_buffers_left.at("demux")[3], // 47: vc_packet_router_output_scratch_buffers[3] + + input_scratch_buffers_left.at("traffic_gen_rx")[0], // 48: vc_packet_router_output_remote_scratch_buffers[0] + input_scratch_buffers_left.at("traffic_gen_rx")[1], // 49: vc_packet_router_output_remote_scratch_buffers[1] + 0, // 50: vc_packet_router_output_remote_scratch_buffers[2] + 0, // 51: vc_packet_router_output_remote_scratch_buffers[3] + }; log_info(LogTest, "run demux at x={},y={}", demux_core.x, demux_core.y); log_info(LogTest, "run demux at physical x={},y={}", demux_phys_core.x, demux_phys_core.y); @@ -843,68 +905,70 @@ int main(int argc, char** argv) { .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default, .compile_args = demux_compile_args, - .defines = defines}); - - std::vector demux_compile_args_r = { - dest_endpoint_start_id, // 0: endpoint_id_start_index - (demux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words - (demux_queue_size_bytes >> 4), // 2: rx_queue_size_words - num_dest_endpoints, // 3: demux_fan_out - packet_switch_4B_pack( - rx_phys_core_r[0].x, - rx_phys_core_r[0].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: remote_tx_0_info - packet_switch_4B_pack( - rx_phys_core_r[1].x, - rx_phys_core_r[1].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: remote_tx_1_info - 0, // 6: remote_tx_2_info - 0, // 7: remote_tx_3_info - (rx_queue_start_addr >> 4), // 8: remote_tx_queue_start_addr_words 0 - (rx_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words 0 - (rx_queue_start_addr >> 4), // 10: remote_tx_queue_start_addr_words 1 - (rx_queue_size_bytes >> 4), // 11: remote_tx_queue_size_words 1 - 0, // 12: remote_tx_queue_start_addr_words 2 - 0, // 13: remote_tx_queue_size_words 2 - 0, // 14: remote_tx_queue_start_addr_words 3 - 0, // 15: remote_tx_queue_size_words 3 - 
//(uint32_t)tunneler_phys_core.x, // 16: remote_rx_x - //(uint32_t)tunneler_phys_core.y, // 17: remote_rx_y - // 3, // 18: remote_rx_queue_id - //(uint32_t)DispatchRemoteNetworkType::NOC0, // 19: tx_network_type - - packet_switch_4B_pack( - r_tunneler_phys_core.x, - r_tunneler_phys_core.y, - 4, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 16: remote_rx_0_info - packet_switch_4B_pack( - r_tunneler_phys_core.x, - r_tunneler_phys_core.y, - 5, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 17: remote_rx_1_info - 0, // 18: remote_rx_2_info - 0, // 19: remote_rx_3_info - - (uint32_t)(dest_endpoint_output_map >> 32), // 20: dest_endpoint_output_map_hi - (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF), // 21: dest_endpoint_output_map_lo - test_results_addr, // 22: test_results_addr - test_results_size, // 23: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 24: timeout_cycles - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0 // 25-35: packetize/depacketize settings - }; + .defines = defines + } + ); + + std::vector demux_compile_args_r = + { + dest_endpoint_start_id, // 0: endpoint_id_start_index + (demux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words + (demux_queue_size_bytes >> 4), // 2: rx_queue_size_words + num_dest_endpoints, // 3: demux_fan_out + packet_switch_4B_pack(rx_phys_core_r[0].x, + rx_phys_core_r[0].y, + 0, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: remote_tx_0_info + packet_switch_4B_pack(rx_phys_core_r[1].x, + rx_phys_core_r[1].y, + 0, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: remote_tx_1_info + 0, // 6: remote_tx_2_info + 0, // 7: remote_tx_3_info + (rx_queue_start_addr >> 4), // 8: remote_tx_queue_start_addr_words 0 + (rx_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words 0 + (rx_queue_start_addr >> 4), // 10: remote_tx_queue_start_addr_words 1 + (rx_queue_size_bytes >> 4), // 11: remote_tx_queue_size_words 1 + 0, // 12: remote_tx_queue_start_addr_words 2 + 0, // 13: remote_tx_queue_size_words 2 + 0, 
// 14: remote_tx_queue_start_addr_words 3 + 0, // 15: remote_tx_queue_size_words 3 + packet_switch_4B_pack(r_tunneler_phys_core.x, + r_tunneler_phys_core.y, + 4, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 16: remote_rx_0_info + packet_switch_4B_pack(r_tunneler_phys_core.x, + r_tunneler_phys_core.y, + 5, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 17: remote_rx_1_info + 0, // 18: remote_rx_2_info + 0, // 19: remote_rx_3_info + (uint32_t)(dest_endpoint_output_map >> 32), // 20: dest_endpoint_output_map_hi + (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF), // 21: dest_endpoint_output_map_lo + test_results_addr, // 22: test_results_addr + test_results_size, // 23: test_results_size + timeout_mcycles * 1000 * 1000 * 4, // 24: timeout_cycles + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 25 - 35: packetize/depacketize settings + input_scratch_buffers_right.at("demux")[0], // 36: vc_packet_router_input_scratch_buffers[0] + input_scratch_buffers_right.at("demux")[1], // 37: vc_packet_router_input_scratch_buffers[1] + input_scratch_buffers_right.at("demux")[2], // 38: vc_packet_router_input_scratch_buffers[2] + input_scratch_buffers_right.at("demux")[3], // 39: vc_packet_router_input_scratch_buffers[3] + + tunneler_buffers_right.at("output")[0], // 40: vc_packet_router_input_remote_scratch_buffers[0] + tunneler_buffers_right.at("output")[1], // 41: vc_packet_router_input_remote_scratch_buffers[1] + 0, // 42: vc_packet_router_input_remote_scratch_buffers[2] + 0, // 43: vc_packet_router_input_remote_scratch_buffers[3] + + output_scratch_buffers_right.at("demux")[0], // 44: vc_packet_router_output_scratch_buffers[0] + output_scratch_buffers_right.at("demux")[1], // 45: vc_packet_router_output_scratch_buffers[1] + output_scratch_buffers_right.at("demux")[2], // 46: vc_packet_router_output_scratch_buffers[2] + output_scratch_buffers_right.at("demux")[3], // 47: vc_packet_router_output_scratch_buffers[3] + + input_scratch_buffers_right.at("traffic_gen_rx")[0], // 48: 
vc_packet_router_output_remote_scratch_buffers[0] + input_scratch_buffers_right.at("traffic_gen_rx")[1], // 49: vc_packet_router_output_remote_scratch_buffers[1] + 0, // 50: vc_packet_router_output_remote_scratch_buffers[2] + 0, // 51: vc_packet_router_output_remote_scratch_buffers[3] + }; log_info(LogTest, "run remote demux at x={},y={}", demux_core.x, demux_core.y); log_info(LogTest, "run remote demux at physical x={},y={}", demux_phys_core_r.x, demux_phys_core_r.y); @@ -917,7 +981,9 @@ int main(int argc, char** argv) { .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default, .compile_args = demux_compile_args_r, - .defines = defines}); + .defines = defines + } + ); log_info(LogTest, "Starting test..."); @@ -931,10 +997,10 @@ int main(int argc, char** argv) { std::chrono::duration elapsed_seconds = (end - start); log_info(LogTest, "Ran in {:.2f}us", elapsed_seconds.count() * 1000 * 1000); - vector> tx_results; - vector> tx_results_r; - vector> rx_results; - vector> rx_results_r; + std::vector> tx_results; + std::vector> tx_results_r; + std::vector> rx_results; + std::vector> rx_results_r; for (uint32_t i = 0; i < num_src_endpoints; i++) { tx_results.push_back( @@ -979,25 +1045,28 @@ int main(int argc, char** argv) { pass &= (rx_results_r[i][PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); } - vector mux_results = - tt::llrt::read_hex_vec_from_core(device->id(), mux_phys_core, test_results_addr, test_results_size); + std::vector mux_results = + tt::llrt::read_hex_vec_from_core( + device->id(), mux_phys_core, test_results_addr, test_results_size); log_info(LogTest, "MUX status = {}", packet_queue_test_status_to_string(mux_results[PQ_TEST_STATUS_INDEX])); pass &= (mux_results[PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - vector mux_results_r = - tt::llrt::read_hex_vec_from_core(device_r->id(), mux_phys_core_r, test_results_addr, test_results_size); + std::vector mux_results_r = + tt::llrt::read_hex_vec_from_core( + 
device_r->id(), mux_phys_core_r, test_results_addr, test_results_size); log_info(LogTest, "R MUX status = {}", packet_queue_test_status_to_string(mux_results_r[PQ_TEST_STATUS_INDEX])); pass &= (mux_results_r[PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - vector demux_results = - tt::llrt::read_hex_vec_from_core(device->id(), demux_phys_core, test_results_addr, test_results_size); + std::vector demux_results = + tt::llrt::read_hex_vec_from_core( + device->id(), demux_phys_core, test_results_addr, test_results_size); log_info(LogTest, "DEMUX status = {}", packet_queue_test_status_to_string(demux_results[PQ_TEST_STATUS_INDEX])); pass &= (demux_results[0] == PACKET_QUEUE_TEST_PASS); - vector demux_results_r = - tt::llrt::read_hex_vec_from_core(device_r->id(), demux_phys_core_r, test_results_addr, test_results_size); - log_info( - LogTest, "R DEMUX status = {}", packet_queue_test_status_to_string(demux_results_r[PQ_TEST_STATUS_INDEX])); + std::vector demux_results_r = + tt::llrt::read_hex_vec_from_core( + device_r->id(), demux_phys_core_r, test_results_addr, test_results_size); + log_info(LogTest, "R DEMUX status = {}", packet_queue_test_status_to_string(demux_results_r[PQ_TEST_STATUS_INDEX])); pass &= (demux_results_r[0] == PACKET_QUEUE_TEST_PASS); pass &= tt_metal::CloseDevice(device); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep.cpp index ab830d20bb83..0725dd8dc428 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep.cpp @@ -11,7 +11,6 @@ #include "tt_metal/impl/device/device.hpp" #include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp" -using std::vector; using namespace tt; using json = nlohmann::json; @@ -30,6 +29,9 @@ int main(int argc, char** argv) { constexpr uint32_t default_data_kb_per_tx = 1024 * 
1024; constexpr uint32_t default_max_packet_size_words = 0x100; + constexpr uint32_t default_input_scratch_buffer_base_addr = 0x50000; + constexpr uint32_t default_output_scratch_buffer_base_addr = 0x60000; + constexpr uint32_t default_tx_queue_start_addr = 0x80000; constexpr uint32_t default_tx_queue_size_bytes = 0x10000; constexpr uint32_t default_rx_queue_start_addr = 0xa0000; @@ -46,7 +48,8 @@ int main(int argc, char** argv) { constexpr uint32_t default_tunneler_queue_size_bytes = 0x4000; // * 8 as it is birectional, maximum queue size for ecore L1 (power of 2) constexpr uint32_t default_tunneler_test_results_addr = 0x39000; - constexpr uint32_t default_tunneler_test_results_size = 0x7000; + constexpr uint32_t default_tunneler_test_results_size = 0x1000; + constexpr uint32_t default_tunneler_buffer_base_addr = 0x3A000; constexpr uint32_t default_timeout_mcycles = 1000; constexpr uint32_t default_rx_disable_data_check = 0; @@ -151,6 +154,9 @@ int main(int argc, char** argv) { default_tx_data_sent_per_iter_high); log_info(LogTest, " --dump_stat_json: Dump stats in json to output_dir, default = {}", default_dump_stat_json); log_info(LogTest, " --output_dir: Output directory, default = {}", default_output_dir); + log_info(LogTest, " --input_scratch_buffer_base_addr: Scratch buffer for input queues base address, default = {:#x}", default_input_scratch_buffer_base_addr); + log_info(LogTest, " --output_scratch_buffer_base_addr: Scratch buffer for output queues base address, default = {:#x}", default_output_scratch_buffer_base_addr); + log_info(LogTest, " --tunneler_scratch_buffer_base_addr: Scratch buffer for tunneler queues base address, default = {:#x}", default_tunneler_buffer_base_addr); return 0; } @@ -207,19 +213,15 @@ int main(int argc, char** argv) { uint32_t dump_stat_json = test_args::get_command_option_uint32(input_args, "--dump_stat_json", default_dump_stat_json); std::string output_dir = test_args::get_command_option(input_args, "--output_dir", 
std::string(default_output_dir)); - uint32_t check_txrx_timeout = - test_args::get_command_option_uint32(input_args, "--check_txrx_timeout", default_check_txrx_timeout); - uint8_t tx_pkt_dest_size_choice = (uint8_t)test_args::get_command_option_uint32( - input_args, "--tx_pkt_dest_size_choice", default_tx_pkt_dest_size_choice); - uint32_t tx_data_sent_per_iter_low = test_args::get_command_option_uint32( - input_args, "--tx_data_sent_per_iter_low", default_tx_data_sent_per_iter_low); - uint32_t tx_data_sent_per_iter_high = test_args::get_command_option_uint32( - input_args, "--tx_data_sent_per_iter_high", default_tx_data_sent_per_iter_high); - - assert( - (pkt_dest_size_choices_t)tx_pkt_dest_size_choice == pkt_dest_size_choices_t::SAME_START_RNDROBIN_FIX_SIZE && - rx_disable_header_check || - (pkt_dest_size_choices_t)tx_pkt_dest_size_choice == pkt_dest_size_choices_t::RANDOM); + uint32_t check_txrx_timeout = test_args::get_command_option_uint32(input_args, "--check_txrx_timeout", default_check_txrx_timeout); + uint8_t tx_pkt_dest_size_choice = (uint8_t) test_args::get_command_option_uint32(input_args, "--tx_pkt_dest_size_choice", default_tx_pkt_dest_size_choice); + uint32_t tx_data_sent_per_iter_low = test_args::get_command_option_uint32(input_args, "--tx_data_sent_per_iter_low", default_tx_data_sent_per_iter_low); + uint32_t tx_data_sent_per_iter_high = test_args::get_command_option_uint32(input_args, "--tx_data_sent_per_iter_high", default_tx_data_sent_per_iter_high); + uint32_t input_scratch_buffer_base_addr = test_args::get_command_option_uint32(input_args, "--input_scratch_buffer_base_addr", default_input_scratch_buffer_base_addr); + uint32_t output_scratch_buffer_base_addr = test_args::get_command_option_uint32(input_args, "--output_scratch_buffer_base_addr", default_output_scratch_buffer_base_addr); + uint32_t tunneler_buffer_base_addr = test_args::get_command_option_uint32(input_args, "--tunneler_buffer_base_addr", default_tunneler_buffer_base_addr); + + 
assert((pkt_dest_size_choices_t)tx_pkt_dest_size_choice == pkt_dest_size_choices_t::SAME_START_RNDROBIN_FIX_SIZE && rx_disable_header_check || (pkt_dest_size_choices_t)tx_pkt_dest_size_choice == pkt_dest_size_choices_t::RANDOM); bool pass = true; @@ -227,6 +229,40 @@ int main(int argc, char** argv) { {"FD_CORE_TYPE", std::to_string(0)}, // todo, support dispatch on eth }; + // Same kernel layout on both devices + // The addresses will be the same but on remote noc, + // however, still make two arrays for clarity in the compile args + // TODO(nhuang): Refactor / cleanup the duplicate code for the kernel setups + using topology = std::vector>; + + const topology input_topology = { + { "traffic_gen_tx", num_src_endpoints }, + { "mux", MAX_SWITCH_FAN_OUT }, + { "demux", MAX_SWITCH_FAN_OUT }, + { "traffic_gen_rx", MAX_SWITCH_FAN_OUT }, + }; + + const topology output_topology = { + { "traffic_gen_tx", num_src_endpoints }, + { "traffic_gen_tx_mock", num_src_endpoints }, + { "mux", MAX_SWITCH_FAN_OUT }, + { "demux", MAX_SWITCH_FAN_OUT }, + { "traffic_gen_rx", MAX_SWITCH_FAN_OUT }, + }; + + const topology tunneler_topology = { + { "input", MAX_TUNNEL_LANES }, + { "output", MAX_TUNNEL_LANES }, + }; + + const auto input_scratch_buffers_left = make_buffer_addresses_for_test(input_scratch_buffer_base_addr, packet_queue_ptr_buffer_size, input_topology); + const auto output_scratch_buffers_left = make_buffer_addresses_for_test(output_scratch_buffer_base_addr, packet_queue_ptr_buffer_size, output_topology); + const auto tunneler_buffers_left = make_buffer_addresses_for_test(tunneler_buffer_base_addr, packet_queue_ptr_buffer_size, tunneler_topology); + + const auto input_scratch_buffers_right = make_buffer_addresses_for_test(input_scratch_buffer_base_addr, packet_queue_ptr_buffer_size, input_topology); + const auto output_scratch_buffers_right = make_buffer_addresses_for_test(output_scratch_buffer_base_addr, packet_queue_ptr_buffer_size, output_topology); + const auto 
tunneler_buffers_right = make_buffer_addresses_for_test(tunneler_buffer_base_addr, packet_queue_ptr_buffer_size, tunneler_topology); + try { int num_devices = tt_metal::GetNumAvailableDevices(); if (test_device_id >= num_devices) { @@ -282,30 +318,35 @@ int main(int argc, char** argv) { for (uint32_t i = 0; i < num_src_endpoints; i++) { CoreCoord core = {tx_x + i, tx_y}; tx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = { - src_endpoint_start_id + i, // 0: src_endpoint_id - 1, // 1: num_dest_endpoints - (tx_queue_start_addr >> 4), // 2: queue_start_addr_words - (tx_queue_size_bytes >> 4), // 3: queue_size_words - ((mux_queue_start_addr + i * mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words - (uint32_t)mux_phys_core.x, // 6: remote_rx_x - (uint32_t)mux_phys_core.y, // 7: remote_rx_y - i, // 8: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type - test_results_addr, // 10: test_results_addr - test_results_size, // 11: test_results_size - prng_seed, // 12: prng_seed - data_kb_per_tx, // 13: total_data_kb - max_packet_size_words, // 14: max_packet_size_words - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id + i, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - tx_skip_pkt_content_gen, // 18: skip_pkt_content_gen - tx_pkt_dest_size_choice, // 19: pkt_dest_size_choice - tx_data_sent_per_iter_low, // 20: data_sent_per_iter_low - tx_data_sent_per_iter_high // 21: data_sent_per_iter_high - }; + std::vector compile_args = + { + src_endpoint_start_id + i, // 0: src_endpoint_id + 1, // 1: num_dest_endpoints + (tx_queue_start_addr >> 4), // 2: queue_start_addr_words + (tx_queue_size_bytes >> 4), // 3: queue_size_words + ((mux_queue_start_addr + i*mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words + (mux_queue_size_bytes >> 4), // 
5: remote_rx_queue_size_words + (uint32_t)mux_phys_core.x, // 6: remote_rx_x + (uint32_t)mux_phys_core.y, // 7: remote_rx_y + i, // 8: remote_rx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type + test_results_addr, // 10: test_results_addr + test_results_size, // 11: test_results_size + prng_seed, // 12: prng_seed + data_kb_per_tx, // 13: total_data_kb + max_packet_size_words, // 14: max_packet_size_words + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id+i, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + tx_skip_pkt_content_gen, // 18: skip_pkt_content_gen + tx_pkt_dest_size_choice, // 19: pkt_dest_size_choice + tx_data_sent_per_iter_low, // 20: data_sent_per_iter_low + tx_data_sent_per_iter_high, // 21: data_sent_per_iter_high + input_scratch_buffers_left.at("traffic_gen_tx")[i], // 22: traffic_gen_input_ptrs_addr + output_scratch_buffers_left.at("traffic_gen_tx_mock")[i], // 23: traffic_gen_input_mock_remote_ptrs_addr + output_scratch_buffers_left.at("traffic_gen_tx")[i], // 24: traffic_gen_output_ptrs_addr + input_scratch_buffers_left.at("mux")[i], // 25: traffic_gen_output_remote_ptrs_addr + }; log_info(LogTest, "run traffic_gen_tx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( @@ -323,30 +364,35 @@ int main(int argc, char** argv) { for (uint32_t i = 0; i < num_src_endpoints; i++) { CoreCoord core = {tx_x + i, tx_y}; tx_phys_core_r.push_back(device_r->worker_core_from_logical_core(core)); - std::vector compile_args = { - src_endpoint_start_id + i, // 0: src_endpoint_id - 1, // 1: num_dest_endpoints - (tx_queue_start_addr >> 4), // 2: queue_start_addr_words - (tx_queue_size_bytes >> 4), // 3: queue_size_words - ((mux_queue_start_addr + i * mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words - (uint32_t)mux_phys_core_r.x, // 6: remote_rx_x - 
(uint32_t)mux_phys_core_r.y, // 7: remote_rx_y - i, // 8: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type - test_results_addr, // 10: test_results_addr - test_results_size, // 11: test_results_size - prng_seed, // 12: prng_seed - data_kb_per_tx, // 13: total_data_kb - max_packet_size_words, // 14: max_packet_size_words - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id + i, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - tx_skip_pkt_content_gen, // 18: skip_pkt_content_gen - tx_pkt_dest_size_choice, // 19: pkt_dest_size_choice - tx_data_sent_per_iter_low, // 20: data_sent_per_iter_low - tx_data_sent_per_iter_high // 21: data_sent_per_iter_high - }; + std::vector compile_args = + { + src_endpoint_start_id + i, // 0: src_endpoint_id + 1, // 1: num_dest_endpoints + (tx_queue_start_addr >> 4), // 2: queue_start_addr_words + (tx_queue_size_bytes >> 4), // 3: queue_size_words + ((mux_queue_start_addr + i*mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words + (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words + (uint32_t)mux_phys_core_r.x, // 6: remote_rx_x + (uint32_t)mux_phys_core_r.y, // 7: remote_rx_y + i, // 8: remote_rx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type + test_results_addr, // 10: test_results_addr + test_results_size, // 11: test_results_size + prng_seed, // 12: prng_seed + data_kb_per_tx, // 13: total_data_kb + max_packet_size_words, // 14: max_packet_size_words + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id+i, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + tx_skip_pkt_content_gen, // 18: skip_pkt_content_gen + tx_pkt_dest_size_choice, // 19: pkt_dest_size_choice + tx_data_sent_per_iter_low, // 20: data_sent_per_iter_low + tx_data_sent_per_iter_high, // 21: data_sent_per_iter_high + 
input_scratch_buffers_right.at("traffic_gen_tx")[i], // 22: traffic_gen_input_ptrs_addr + output_scratch_buffers_right.at("traffic_gen_tx_mock")[i], // 23: traffic_gen_input_mock_remote_ptrs_addr + output_scratch_buffers_right.at("traffic_gen_tx")[i], // 24: traffic_gen_output_ptrs_addr + input_scratch_buffers_right.at("mux")[i], // 25: traffic_gen_output_remote_ptrs_addr + }; log_info(LogTest, "run traffic_gen_tx_r at x={},y={}", core.x, core.y); auto kernel_r = tt_metal::CreateKernel( @@ -364,27 +410,30 @@ int main(int argc, char** argv) { for (uint32_t i = 0; i < num_dest_endpoints; i++) { CoreCoord core = {rx_x + i, rx_y}; rx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = { - dest_endpoint_start_id + i, // 0: dest_endpoint_id - 1, // num_src_endpoints, // 1: num_src_endpoints - 1, // num_dest_endpoints, // 2: num_dest_endpoints - (rx_queue_start_addr >> 4), // 3: queue_start_addr_words - (rx_queue_size_bytes >> 4), // 4: queue_size_words - (uint32_t)demux_phys_core.x, // 5: remote_tx_x - (uint32_t)demux_phys_core.y, // 6: remote_tx_y - num_dest_endpoints + i, // 7: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type - test_results_addr, // 9: test_results_addr - test_results_size, // 10: test_results_size - prng_seed, // 11: prng_seed - 0, // 12: reserved - max_packet_size_words, // 13: max_packet_size_words - rx_disable_data_check, // 14: disable data check - src_endpoint_start_id + i, // 15: src_endpoint_start_id - dest_endpoint_start_id + i, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - rx_disable_header_check // 18: disable_header_check - }; + std::vector compile_args = + { + dest_endpoint_start_id + i, // 0: dest_endpoint_id + 1, //num_src_endpoints, // 1: num_src_endpoints + 1, //num_dest_endpoints, // 2: num_dest_endpoints + (rx_queue_start_addr >> 4), // 3: queue_start_addr_words + (rx_queue_size_bytes >> 4), // 4: 
queue_size_words + (uint32_t)demux_phys_core.x, // 5: remote_tx_x + (uint32_t)demux_phys_core.y, // 6: remote_tx_y + num_dest_endpoints + i, // 7: remote_tx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type + test_results_addr, // 9: test_results_addr + test_results_size, // 10: test_results_size + prng_seed, // 11: prng_seed + 0, // 12: reserved + max_packet_size_words, // 13: max_packet_size_words + rx_disable_data_check, // 14: disable data check + src_endpoint_start_id + i, // 15: src_endpoint_start_id + dest_endpoint_start_id + i, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + rx_disable_header_check, // 18: disable_header_check + input_scratch_buffers_left.at("traffic_gen_rx")[i], // 19: traffic_gen_input_ptrs_addr + output_scratch_buffers_left.at("demux")[i], // 20: traffic_gen_input_remote_ptrs_addr + }; log_info(LogTest, "run traffic_gen_rx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( @@ -402,27 +451,30 @@ int main(int argc, char** argv) { for (uint32_t i = 0; i < num_dest_endpoints; i++) { CoreCoord core = {rx_x + i, rx_y}; rx_phys_core_r.push_back(device_r->worker_core_from_logical_core(core)); - std::vector compile_args = { - dest_endpoint_start_id + i, // 0: dest_endpoint_id - 1, // num_src_endpoints, // 1: num_src_endpoints - 1, // num_dest_endpoints, // 2: num_dest_endpoints - (rx_queue_start_addr >> 4), // 3: queue_start_addr_words - (rx_queue_size_bytes >> 4), // 4: queue_size_words - (uint32_t)demux_phys_core_r.x, // 5: remote_tx_x - (uint32_t)demux_phys_core_r.y, // 6: remote_tx_y - num_dest_endpoints + i, // 7: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type - test_results_addr, // 9: test_results_addr - test_results_size, // 10: test_results_size - prng_seed, // 11: prng_seed - 0, // 12: reserved - max_packet_size_words, // 13: max_packet_size_words - rx_disable_data_check, // 14: disable 
data check - src_endpoint_start_id + i, // 15: src_endpoint_start_id - dest_endpoint_start_id + i, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - rx_disable_header_check // 18: disable_header_check - }; + std::vector compile_args = + { + dest_endpoint_start_id + i, // 0: dest_endpoint_id + 1, //num_src_endpoints, // 1: num_src_endpoints + 1, //num_dest_endpoints, // 2: num_dest_endpoints + (rx_queue_start_addr >> 4), // 3: queue_start_addr_words + (rx_queue_size_bytes >> 4), // 4: queue_size_words + (uint32_t)demux_phys_core_r.x, // 5: remote_tx_x + (uint32_t)demux_phys_core_r.y, // 6: remote_tx_y + num_dest_endpoints + i, // 7: remote_tx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type + test_results_addr, // 9: test_results_addr + test_results_size, // 10: test_results_size + prng_seed, // 11: prng_seed + 0, // 12: reserved + max_packet_size_words, // 13: max_packet_size_words + rx_disable_data_check, // 14: disable data check + src_endpoint_start_id + i, // 15: src_endpoint_start_id + dest_endpoint_start_id + i, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + rx_disable_header_check, // 18: disable_header_check + input_scratch_buffers_right.at("traffic_gen_rx")[i], // 19: traffic_gen_input_ptrs_addr + output_scratch_buffers_right.at("demux")[i], // 20: traffic_gen_input_remote_ptrs_addr + }; log_info(LogTest, "run traffic_gen_rx_r at x={},y={}", core.x, core.y); auto kernel_r = tt_metal::CreateKernel( @@ -441,77 +493,78 @@ int main(int argc, char** argv) { } // Mux - std::vector mux_compile_args = { - 0, // 0: reserved - (mux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 2: rx_queue_size_words - num_src_endpoints, // 3: mux_fan_in - packet_switch_4B_pack( - (uint32_t)tunneler_phys_core.x, - (uint32_t)tunneler_phys_core.y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: dest 0 info - 
packet_switch_4B_pack( - (uint32_t)tunneler_phys_core.x, - (uint32_t)tunneler_phys_core.y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: dest 0 info - packet_switch_4B_pack( - (uint32_t)tunneler_phys_core.x, - (uint32_t)tunneler_phys_core.y, - 2, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 6: dest 0 info - packet_switch_4B_pack( - (uint32_t)tunneler_phys_core.x, - (uint32_t)tunneler_phys_core.y, - 3, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 7: dest 0 info - - (tunneler_queue_start_addr >> 4), // 8: remote_tx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words - ((tunneler_queue_start_addr + tunneler_queue_size_bytes) >> 4), // 10: remote_tx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 11: remote_tx_queue_size_words - ((tunneler_queue_start_addr + 2 * tunneler_queue_size_bytes) >> 4), // 12: remote_tx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 13: remote_tx_queue_size_words - ((tunneler_queue_start_addr + 3 * tunneler_queue_size_bytes) >> 4), // 14: remote_tx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 15: remote_tx_queue_size_words - packet_switch_4B_pack( - (uint32_t)tx_phys_core[0].x, - (uint32_t)tx_phys_core[0].y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 16: src 0 info - packet_switch_4B_pack( - (uint32_t)tx_phys_core[1].x, - (uint32_t)tx_phys_core[1].y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 17: src 1 info - packet_switch_4B_pack( - (uint32_t)tx_phys_core[2].x, - (uint32_t)tx_phys_core[2].y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 18: src 2 info - packet_switch_4B_pack( - (uint32_t)tx_phys_core[3].x, - (uint32_t)tx_phys_core[3].y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 19: src 3 info - 0, - 0, // 20, 21 - test_results_addr, // 22: test_results_addr - test_results_size, // 23: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 24: timeout_cycles - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 
0, - 0, - 0, - 0 // 25-35: packetize/depacketize settings - }; + std::vector mux_compile_args = + { + 0, // 0: reserved + (mux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words + (mux_queue_size_bytes >> 4), // 2: rx_queue_size_words + num_src_endpoints, // 3: mux_fan_in + packet_switch_4B_pack((uint32_t)tunneler_phys_core.x, + (uint32_t)tunneler_phys_core.y, + 0, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: dest 0 info + packet_switch_4B_pack((uint32_t)tunneler_phys_core.x, + (uint32_t)tunneler_phys_core.y, + 1, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: dest 0 info + packet_switch_4B_pack((uint32_t)tunneler_phys_core.x, + (uint32_t)tunneler_phys_core.y, + 2, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 6: dest 0 info + packet_switch_4B_pack((uint32_t)tunneler_phys_core.x, + (uint32_t)tunneler_phys_core.y, + 3, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 7: dest 0 info + (tunneler_queue_start_addr >> 4), // 8: remote_tx_queue_start_addr_words + (tunneler_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words + ((tunneler_queue_start_addr + tunneler_queue_size_bytes) >> 4), // 10: remote_tx_queue_start_addr_words + (tunneler_queue_size_bytes >> 4), // 11: remote_tx_queue_size_words + ((tunneler_queue_start_addr + 2 * tunneler_queue_size_bytes) >> 4), // 12: remote_tx_queue_start_addr_words + (tunneler_queue_size_bytes >> 4), // 13: remote_tx_queue_size_words + ((tunneler_queue_start_addr + 3 * tunneler_queue_size_bytes) >> 4), // 14: remote_tx_queue_start_addr_words + (tunneler_queue_size_bytes >> 4), // 15: remote_tx_queue_size_words + packet_switch_4B_pack((uint32_t)tx_phys_core[0].x, + (uint32_t)tx_phys_core[0].y, + 1, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 16: src 0 info + packet_switch_4B_pack((uint32_t)tx_phys_core[1].x, + (uint32_t)tx_phys_core[1].y, + 1, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 17: src 1 info + packet_switch_4B_pack((uint32_t)tx_phys_core[2].x, + (uint32_t)tx_phys_core[2].y, + 1, + 
(uint32_t)DispatchRemoteNetworkType::NOC0), // 18: src 2 info + packet_switch_4B_pack((uint32_t)tx_phys_core[3].x, + (uint32_t)tx_phys_core[3].y, + 1, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 19: src 3 info + 0, 0, // 20, 21 + test_results_addr, // 22: test_results_addr + test_results_size, // 23: test_results_size + timeout_mcycles * 1000 * 1000 * 4, // 24: timeout_cycles + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 25 - 35: packetize/depacketize settings + + input_scratch_buffers_left.at("mux")[0], // 36: vc_packet_router_input_scratch_buffers[0] + input_scratch_buffers_left.at("mux")[1], // 37: vc_packet_router_input_scratch_buffers[1] + input_scratch_buffers_left.at("mux")[2], // 38: vc_packet_router_input_scratch_buffers[2] + input_scratch_buffers_left.at("mux")[3], // 39: vc_packet_router_input_scratch_buffers[3] + + output_scratch_buffers_left.at("traffic_gen_tx")[0], // 40: vc_packet_router_input_remote_scratch_buffers[0] + output_scratch_buffers_left.at("traffic_gen_tx")[1], // 41: vc_packet_router_input_remote_scratch_buffers[1] + output_scratch_buffers_left.at("traffic_gen_tx")[2], // 42: vc_packet_router_input_remote_scratch_buffers[2] + output_scratch_buffers_left.at("traffic_gen_tx")[3], // 43: vc_packet_router_input_remote_scratch_buffers[3] + + output_scratch_buffers_left.at("mux")[0], // 44: vc_packet_router_output_scratch_buffers[0] + output_scratch_buffers_left.at("mux")[1], // 45: vc_packet_router_output_scratch_buffers[1] + output_scratch_buffers_left.at("mux")[2], // 46: vc_packet_router_output_scratch_buffers[2] + output_scratch_buffers_left.at("mux")[3], // 47: vc_packet_router_output_scratch_buffers[3] + + tunneler_buffers_left.at("input")[0], // 48: vc_packet_router_output_remote_scratch_buffers[0] + tunneler_buffers_left.at("input")[1], // 49: vc_packet_router_output_remote_scratch_buffers[1] + tunneler_buffers_left.at("input")[2], // 50: vc_packet_router_output_remote_scratch_buffers[2] + tunneler_buffers_left.at("input")[3], // 51: 
vc_packet_router_output_remote_scratch_buffers[3] + }; log_info(LogTest, "run mux at x={},y={}", mux_core.x, mux_core.y); auto mux_kernel = tt_metal::CreateKernel( @@ -522,79 +575,82 @@ int main(int argc, char** argv) { .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default, .compile_args = mux_compile_args, - .defines = defines}); - - std::vector mux_compile_args_r = { - 0, // 0: reserved - (mux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 2: rx_queue_size_words - num_src_endpoints, // 3: mux_fan_in - packet_switch_4B_pack( - (uint32_t)r_tunneler_phys_core.x, - (uint32_t)r_tunneler_phys_core.y, - 4, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: dest 0 info - packet_switch_4B_pack( - (uint32_t)r_tunneler_phys_core.x, - (uint32_t)r_tunneler_phys_core.y, - 5, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: dest 0 info - packet_switch_4B_pack( - (uint32_t)r_tunneler_phys_core.x, - (uint32_t)r_tunneler_phys_core.y, - 6, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 6: dest 0 info - packet_switch_4B_pack( - (uint32_t)r_tunneler_phys_core.x, - (uint32_t)r_tunneler_phys_core.y, - 7, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 7: dest 0 info - - ((tunneler_queue_start_addr + 4 * tunneler_queue_size_bytes) >> 4), // 8: remote_tx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words - ((tunneler_queue_start_addr + 5 * tunneler_queue_size_bytes) >> 4), // 10: remote_tx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 11: remote_tx_queue_size_words - ((tunneler_queue_start_addr + 6 * tunneler_queue_size_bytes) >> 4), // 12: remote_tx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 13: remote_tx_queue_size_words - ((tunneler_queue_start_addr + 7 * tunneler_queue_size_bytes) >> 4), // 14: remote_tx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 15: remote_tx_queue_size_words - packet_switch_4B_pack( - 
(uint32_t)tx_phys_core_r[0].x, - (uint32_t)tx_phys_core_r[0].y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 16: src 0 info - packet_switch_4B_pack( - (uint32_t)tx_phys_core_r[1].x, - (uint32_t)tx_phys_core_r[1].y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 17: src 1 info - packet_switch_4B_pack( - (uint32_t)tx_phys_core_r[2].x, - (uint32_t)tx_phys_core_r[2].y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 18: src 2 info - packet_switch_4B_pack( - (uint32_t)tx_phys_core_r[3].x, - (uint32_t)tx_phys_core_r[3].y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 19: src 3 info - 0, - 0, // 20, 21 - test_results_addr, // 22: test_results_addr - test_results_size, // 23: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 24: timeout_cycles - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0 // 25-35: packetize/depacketize settings - }; + .defines = defines + } + ); + + std::vector mux_compile_args_r = + { + 0, // 0: reserved + (mux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words + (mux_queue_size_bytes >> 4), // 2: rx_queue_size_words + num_src_endpoints, // 3: mux_fan_in + packet_switch_4B_pack((uint32_t)r_tunneler_phys_core.x, + (uint32_t)r_tunneler_phys_core.y, + 4, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: dest 0 info + packet_switch_4B_pack((uint32_t)r_tunneler_phys_core.x, + (uint32_t)r_tunneler_phys_core.y, + 5, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: dest 0 info + packet_switch_4B_pack((uint32_t)r_tunneler_phys_core.x, + (uint32_t)r_tunneler_phys_core.y, + 6, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 6: dest 0 info + packet_switch_4B_pack((uint32_t)r_tunneler_phys_core.x, + (uint32_t)r_tunneler_phys_core.y, + 7, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 7: dest 0 info + + ((tunneler_queue_start_addr + 4 * tunneler_queue_size_bytes) >> 4), // 8: remote_tx_queue_start_addr_words + (tunneler_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words + ((tunneler_queue_start_addr + 5 
* tunneler_queue_size_bytes) >> 4), // 10: remote_tx_queue_start_addr_words + (tunneler_queue_size_bytes >> 4), // 11: remote_tx_queue_size_words + ((tunneler_queue_start_addr + 6 * tunneler_queue_size_bytes) >> 4), // 12: remote_tx_queue_start_addr_words + (tunneler_queue_size_bytes >> 4), // 13: remote_tx_queue_size_words + ((tunneler_queue_start_addr + 7 * tunneler_queue_size_bytes) >> 4), // 14: remote_tx_queue_start_addr_words + (tunneler_queue_size_bytes >> 4), // 15: remote_tx_queue_size_words + packet_switch_4B_pack((uint32_t)tx_phys_core_r[0].x, + (uint32_t)tx_phys_core_r[0].y, + 1, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 16: src 0 info + packet_switch_4B_pack((uint32_t)tx_phys_core_r[1].x, + (uint32_t)tx_phys_core_r[1].y, + 1, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 17: src 1 info + packet_switch_4B_pack((uint32_t)tx_phys_core_r[2].x, + (uint32_t)tx_phys_core_r[2].y, + 1, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 18: src 2 info + packet_switch_4B_pack((uint32_t)tx_phys_core_r[3].x, + (uint32_t)tx_phys_core_r[3].y, + 1, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 19: src 3 info + 0, 0, // 20, 21 + test_results_addr, // 22: test_results_addr + test_results_size, // 23: test_results_size + timeout_mcycles * 1000 * 1000 * 4, // 24: timeout_cycles + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 25 - 35: packetize/depacketize settings + input_scratch_buffers_right.at("mux")[0], // 36: vc_packet_router_input_scratch_buffers[0] + input_scratch_buffers_right.at("mux")[1], // 37: vc_packet_router_input_scratch_buffers[1] + input_scratch_buffers_right.at("mux")[2], // 38: vc_packet_router_input_scratch_buffers[2] + input_scratch_buffers_right.at("mux")[3], // 39: vc_packet_router_input_scratch_buffers[3] + + output_scratch_buffers_right.at("traffic_gen_tx")[0], // 40: vc_packet_router_input_remote_scratch_buffers[0] + output_scratch_buffers_right.at("traffic_gen_tx")[1], // 41: vc_packet_router_input_remote_scratch_buffers[1] + 
output_scratch_buffers_right.at("traffic_gen_tx")[2], // 42: vc_packet_router_input_remote_scratch_buffers[2] + output_scratch_buffers_right.at("traffic_gen_tx")[3], // 43: vc_packet_router_input_remote_scratch_buffers[3] + + output_scratch_buffers_right.at("mux")[0], // 44: vc_packet_router_output_scratch_buffers[0] + output_scratch_buffers_right.at("mux")[1], // 45: vc_packet_router_output_scratch_buffers[1] + output_scratch_buffers_right.at("mux")[2], // 46: vc_packet_router_output_scratch_buffers[2] + output_scratch_buffers_right.at("mux")[3], // 47: vc_packet_router_output_scratch_buffers[3] + + tunneler_buffers_right.at("input")[4], // 48: vc_packet_router_output_remote_scratch_buffers[0] + tunneler_buffers_right.at("input")[5], // 49: vc_packet_router_output_remote_scratch_buffers[1] + tunneler_buffers_right.at("input")[6], // 50: vc_packet_router_output_remote_scratch_buffers[2] + tunneler_buffers_right.at("input")[7], // 51: vc_packet_router_output_remote_scratch_buffers[3] + }; log_info(LogTest, "run mux at x={},y={}", mux_core.x, mux_core.y); auto mux_kernel_r = tt_metal::CreateKernel( @@ -605,267 +661,295 @@ int main(int argc, char** argv) { .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default, .compile_args = mux_compile_args_r, - .defines = defines}); - - std::vector tunneler_l_compile_args = { - dest_endpoint_start_id, // 0: endpoint_id_start_index - 2 * num_endpoints, // 1: tunnel_lanes. 1 => Unidirectional. 2 => Bidirectional. 
- (tunneler_queue_start_addr >> 4), // 2: rx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 3: rx_queue_size_words - - packet_switch_4B_pack( - r_tunneler_phys_core.x, - r_tunneler_phys_core.y, - 0, - (uint32_t)DispatchRemoteNetworkType::ETH), // 4: remote_receiver_0_info - packet_switch_4B_pack( - r_tunneler_phys_core.x, - r_tunneler_phys_core.y, - 1, - (uint32_t)DispatchRemoteNetworkType::ETH), // 5: remote_receiver_1_info - packet_switch_4B_pack( - r_tunneler_phys_core.x, - r_tunneler_phys_core.y, - 2, - (uint32_t)DispatchRemoteNetworkType::ETH), // 6: remote_receiver_2_info - packet_switch_4B_pack( - r_tunneler_phys_core.x, - r_tunneler_phys_core.y, - 3, - (uint32_t)DispatchRemoteNetworkType::ETH), // 7: remote_receiver_3_info - - packet_switch_4B_pack( - demux_phys_core.x, - demux_phys_core.y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 8: remote_receiver_1_info - packet_switch_4B_pack( - demux_phys_core.x, - demux_phys_core.y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 9: remote_receiver_1_info - packet_switch_4B_pack( - demux_phys_core.x, - demux_phys_core.y, - 2, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 10: remote_receiver_1_info - packet_switch_4B_pack( - demux_phys_core.x, - demux_phys_core.y, - 3, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 11: remote_receiver_1_info - 0, - 0, // 12 - 13: remote_receiver 8 - 9 - - (tunneler_queue_start_addr >> 4), // 14: remote_receiver_queue_start_addr_words 0 - (tunneler_queue_size_bytes >> 4), // 15: remote_receiver_queue_size_words 0 - ((tunneler_queue_start_addr + tunneler_queue_size_bytes) >> - 4), // 16: remote_receiver_queue_start_addr_words 1 - (tunneler_queue_size_bytes >> 4), // 17: remote_receiver_queue_size_words 1 - ((tunneler_queue_start_addr + 2 * tunneler_queue_size_bytes) >> - 4), // 18: remote_receiver_queue_start_addr_words 2 - (tunneler_queue_size_bytes >> 4), // 19: remote_receiver_queue_size_words 2 - ((tunneler_queue_start_addr + 3 * 
tunneler_queue_size_bytes) >> - 4), // 20: remote_receiver_queue_start_addr_words 3 - (tunneler_queue_size_bytes >> 4), // 21: remote_receiver_queue_size_words 3 - (demux_queue_start_addr >> 4), // 22: remote_receiver_queue_start_addr_words 4 - (demux_queue_size_bytes >> 4), // 23: remote_receiver_queue_size_words 4 - ((demux_queue_start_addr + demux_queue_size_bytes) >> 4), // 24: remote_receiver_queue_start_addr_words 5 - (demux_queue_size_bytes >> 4), // 25: remote_receiver_queue_size_words 5 - ((demux_queue_start_addr + 2 * demux_queue_size_bytes) >> - 4), // 26: remote_receiver_queue_start_addr_words 6 - (demux_queue_size_bytes >> 4), // 27: remote_receiver_queue_size_words 6 - ((demux_queue_start_addr + 3 * demux_queue_size_bytes) >> - 4), // 28: remote_receiver_queue_start_addr_words 7 - (demux_queue_size_bytes >> 4), // 29: remote_receiver_queue_size_words 7 - - 0, - 2, // 30 - 31 Settings for remote reciver 8 - 0, // 32: remote_receiver_queue_start_addr_words 9 - 2, // 33: remote_receiver_queue_size_words 9. - // Unused. Setting to 2 to get around size check assertion that does not allow 0. 
- - packet_switch_4B_pack( - mux_phys_core.x, - mux_phys_core.y, - num_dest_endpoints, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 34: remote_sender_0_info - packet_switch_4B_pack( - mux_phys_core.x, - mux_phys_core.y, - num_dest_endpoints + 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 35: remote_sender_1_info - packet_switch_4B_pack( - mux_phys_core.x, - mux_phys_core.y, - num_dest_endpoints + 2, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 36: remote_sender_2_info - packet_switch_4B_pack( - mux_phys_core.x, - mux_phys_core.y, - num_dest_endpoints + 3, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 37: remote_sender_3_info - packet_switch_4B_pack( - r_tunneler_phys_core.x, - r_tunneler_phys_core.y, - 12, - (uint32_t)DispatchRemoteNetworkType::ETH), // 38: remote_sender_4_info - packet_switch_4B_pack( - r_tunneler_phys_core.x, - r_tunneler_phys_core.y, - 13, - (uint32_t)DispatchRemoteNetworkType::ETH), // 39: remote_sender_5_info - packet_switch_4B_pack( - r_tunneler_phys_core.x, - r_tunneler_phys_core.y, - 14, - (uint32_t)DispatchRemoteNetworkType::ETH), // 40: remote_sender_6_info - packet_switch_4B_pack( - r_tunneler_phys_core.x, - r_tunneler_phys_core.y, - 15, - (uint32_t)DispatchRemoteNetworkType::ETH), // 41: remote_sender_7_info - 0, - 0, // 42 - 43: remote_sender 8 - 9 - - tunneler_test_results_addr, // 44: test_results_addr - tunneler_test_results_size, // 45: test_results_size - 0, // 46: timeout_cycles - 0, // 47: inner_stop_mux_d_bypass - }; + .defines = defines + } + ); + + std::vector tunneler_l_compile_args = + { + dest_endpoint_start_id, // 0: endpoint_id_start_index + 2*num_endpoints, // 1: tunnel_lanes. 1 => Unidirectional. 2 => Bidirectional. 
+ (tunneler_queue_start_addr >> 4), // 2: rx_queue_start_addr_words + (tunneler_queue_size_bytes >> 4), // 3: rx_queue_size_words + packet_switch_4B_pack(r_tunneler_phys_core.x, + r_tunneler_phys_core.y, + 0, + (uint32_t)DispatchRemoteNetworkType::ETH), // 4: remote_receiver_0_info + packet_switch_4B_pack(r_tunneler_phys_core.x, + r_tunneler_phys_core.y, + 1, + (uint32_t)DispatchRemoteNetworkType::ETH), // 5: remote_receiver_1_info + packet_switch_4B_pack(r_tunneler_phys_core.x, + r_tunneler_phys_core.y, + 2, + (uint32_t)DispatchRemoteNetworkType::ETH), // 6: remote_receiver_2_info + packet_switch_4B_pack(r_tunneler_phys_core.x, + r_tunneler_phys_core.y, + 3, + (uint32_t)DispatchRemoteNetworkType::ETH), // 7: remote_receiver_3_info + + packet_switch_4B_pack(demux_phys_core.x, + demux_phys_core.y, + 0, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 8: remote_receiver_4_info + packet_switch_4B_pack(demux_phys_core.x, + demux_phys_core.y, + 1, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 9: remote_receiver_5_info + packet_switch_4B_pack(demux_phys_core.x, + demux_phys_core.y, + 2, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 10: remote_receiver_6_info + packet_switch_4B_pack(demux_phys_core.x, + demux_phys_core.y, + 3, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 11: remote_receiver_7_info + 0, 0, // 12 - 13: remote_receiver_info[8 - 9] + (tunneler_queue_start_addr >> 4), // 14: remote_receiver_queue_start_addr_words 0 + (tunneler_queue_size_bytes >> 4), // 15: remote_receiver_queue_size_words 0 + ((tunneler_queue_start_addr + tunneler_queue_size_bytes) >> 4), // 16: remote_receiver_queue_start_addr_words 1 + (tunneler_queue_size_bytes >> 4), // 17: remote_receiver_queue_size_words 1 + ((tunneler_queue_start_addr + 2 * tunneler_queue_size_bytes) >> 4), // 18: remote_receiver_queue_start_addr_words 2 + (tunneler_queue_size_bytes >> 4), // 19: remote_receiver_queue_size_words 2 + ((tunneler_queue_start_addr + 3 * tunneler_queue_size_bytes) >> 4), // 20: 
remote_receiver_queue_start_addr_words 3 + (tunneler_queue_size_bytes >> 4), // 21: remote_receiver_queue_size_words 3 + (demux_queue_start_addr >> 4), // 22: remote_receiver_queue_start_addr_words 4 + (demux_queue_size_bytes >> 4), // 23: remote_receiver_queue_size_words 4 + ((demux_queue_start_addr + demux_queue_size_bytes) >> 4), // 24: remote_receiver_queue_start_addr_words 5 + (demux_queue_size_bytes >> 4), // 25: remote_receiver_queue_size_words 5 + ((demux_queue_start_addr + 2 * demux_queue_size_bytes) >> 4), // 26: remote_receiver_queue_start_addr_words 6 + (demux_queue_size_bytes >> 4), // 27: remote_receiver_queue_size_words 6 + ((demux_queue_start_addr + 3 * demux_queue_size_bytes) >> 4), // 28: remote_receiver_queue_start_addr_words 7 + (demux_queue_size_bytes >> 4), // 29: remote_receiver_queue_size_words 7 + 0, 2, // 30 - 31 Settings for remote receiver 8 + 0, // 32: remote_receiver_queue_start_addr_words 9 + 2, // 33: remote_receiver_queue_size_words 9. Unused. Setting to 2 to get around size check assertion that does not allow 0. 
+ packet_switch_4B_pack(mux_phys_core.x, + mux_phys_core.y, + num_dest_endpoints, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 34: remote_sender_0_info + packet_switch_4B_pack(mux_phys_core.x, + mux_phys_core.y, + num_dest_endpoints + 1, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 35: remote_sender_1_info + packet_switch_4B_pack(mux_phys_core.x, + mux_phys_core.y, + num_dest_endpoints + 2, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 36: remote_sender_2_info + packet_switch_4B_pack(mux_phys_core.x, + mux_phys_core.y, + num_dest_endpoints + 3, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 37: remote_sender_3_info + packet_switch_4B_pack(r_tunneler_phys_core.x, + r_tunneler_phys_core.y, + 12, + (uint32_t)DispatchRemoteNetworkType::ETH), // 38: remote_sender_4_info + packet_switch_4B_pack(r_tunneler_phys_core.x, + r_tunneler_phys_core.y, + 13, + (uint32_t)DispatchRemoteNetworkType::ETH), // 39: remote_sender_5_info + packet_switch_4B_pack(r_tunneler_phys_core.x, + r_tunneler_phys_core.y, + 14, + (uint32_t)DispatchRemoteNetworkType::ETH), // 40: remote_sender_6_info + packet_switch_4B_pack(r_tunneler_phys_core.x, + r_tunneler_phys_core.y, + 15, + (uint32_t)DispatchRemoteNetworkType::ETH), // 41: remote_sender_7_info + 0, 0, // 42 - 43: remote_sender[8 - 9] + tunneler_test_results_addr, // 44: test_results_addr + tunneler_test_results_size, // 45: test_results_size + timeout_mcycles * 1000 * 1000 * 4, // 46: timeout_cycles + 0, // 47: inner_stop_mux_d_bypass + tunneler_buffers_left.at("input")[0], // 48: vc_eth_tunneler_input_scratch_buffers[0] + tunneler_buffers_left.at("input")[1], // 49: vc_eth_tunneler_input_scratch_buffers[1] + tunneler_buffers_left.at("input")[2], // 50: vc_eth_tunneler_input_scratch_buffers[2] + tunneler_buffers_left.at("input")[3], // 51: vc_eth_tunneler_input_scratch_buffers[3] + tunneler_buffers_left.at("input")[4], // 52: vc_eth_tunneler_input_scratch_buffers[4] + tunneler_buffers_left.at("input")[5], // 53: 
vc_eth_tunneler_input_scratch_buffers[5] + tunneler_buffers_left.at("input")[6], // 54: vc_eth_tunneler_input_scratch_buffers[6] + tunneler_buffers_left.at("input")[7], // 55: vc_eth_tunneler_input_scratch_buffers[7] + 0, 0, // 56 - 57: vc_eth_tunneler_input_scratch_buffers[8 - 9] + + output_scratch_buffers_left.at("mux")[0], // 58: vc_eth_tunneler_input_remote_scratch_buffers[0] + output_scratch_buffers_left.at("mux")[1], // 59: vc_eth_tunneler_input_remote_scratch_buffers[1] + output_scratch_buffers_left.at("mux")[2], // 60: vc_eth_tunneler_input_remote_scratch_buffers[2] + output_scratch_buffers_left.at("mux")[3], // 61: vc_eth_tunneler_input_remote_scratch_buffers[3] + tunneler_buffers_right.at("output")[4], // 62: vc_eth_tunneler_input_remote_scratch_buffers[4] + tunneler_buffers_right.at("output")[5], // 63: vc_eth_tunneler_input_remote_scratch_buffers[5] + tunneler_buffers_right.at("output")[6], // 64: vc_eth_tunneler_input_remote_scratch_buffers[6] + tunneler_buffers_right.at("output")[7], // 65: vc_eth_tunneler_input_remote_scratch_buffers[7] + 0, 0, // 66 - 67: vc_eth_tunneler_input_remote_scratch_buffers[8 - 9] + + tunneler_buffers_left.at("output")[0], // 68: vc_eth_tunneler_output_scratch_buffers[0] + tunneler_buffers_left.at("output")[1], // 69: vc_eth_tunneler_output_scratch_buffers[1] + tunneler_buffers_left.at("output")[2], // 70: vc_eth_tunneler_output_scratch_buffers[2] + tunneler_buffers_left.at("output")[3], // 71: vc_eth_tunneler_output_scratch_buffers[3] + tunneler_buffers_left.at("output")[4], // 72: vc_eth_tunneler_output_scratch_buffers[4] + tunneler_buffers_left.at("output")[5], // 73: vc_eth_tunneler_output_scratch_buffers[5] + tunneler_buffers_left.at("output")[6], // 74: vc_eth_tunneler_output_scratch_buffers[6] + tunneler_buffers_left.at("output")[7], // 75: vc_eth_tunneler_output_scratch_buffers[7] + 0, 0, // 76 - 77: vc_eth_tunneler_output_scratch_buffers[8 - 9] + + tunneler_buffers_right.at("input")[0], // 78:
vc_eth_tunneler_output_remote_scratch_buffers[0] + tunneler_buffers_right.at("input")[1], // 79: vc_eth_tunneler_output_remote_scratch_buffers[1] + tunneler_buffers_right.at("input")[2], // 80: vc_eth_tunneler_output_remote_scratch_buffers[2] + tunneler_buffers_right.at("input")[3], // 81: vc_eth_tunneler_output_remote_scratch_buffers[3] + input_scratch_buffers_left.at("demux")[0], // 82: vc_eth_tunneler_output_remote_scratch_buffers[4] + input_scratch_buffers_left.at("demux")[1], // 83: vc_eth_tunneler_output_remote_scratch_buffers[5] + input_scratch_buffers_left.at("demux")[2], // 84: vc_eth_tunneler_output_remote_scratch_buffers[6] + input_scratch_buffers_left.at("demux")[3], // 85: vc_eth_tunneler_output_remote_scratch_buffers[7] + 0, 0, // 86 - 87: vc_eth_tunneler_output_remote_scratch_buffers[8 - 9] + }; auto tunneler_l_kernel = tt_metal::CreateKernel( program, "tt_metal/impl/dispatch/kernels/vc_eth_tunneler.cpp", tunneler_logical_core, tt_metal::EthernetConfig{ - .noc = tt_metal::NOC::NOC_0, .compile_args = tunneler_l_compile_args, .defines = defines}); - - std::vector tunneler_r_compile_args = { - dest_endpoint_start_id, // 0: endpoint_id_start_index - 2 * num_endpoints, // 1: tunnel_lanes. 1 => Unidirectional. 2 => Bidirectional. 
- (tunneler_queue_start_addr >> 4), // 2: rx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 3: rx_queue_size_words - - packet_switch_4B_pack( - demux_phys_core_r.x, - demux_phys_core_r.y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: remote_receiver_0_info - packet_switch_4B_pack( - demux_phys_core_r.x, - demux_phys_core_r.y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: remote_receiver_1_info - packet_switch_4B_pack( - demux_phys_core_r.x, - demux_phys_core_r.y, - 2, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 6: remote_receiver_2_info - packet_switch_4B_pack( - demux_phys_core_r.x, - demux_phys_core_r.y, - 3, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 7: remote_receiver_3_info - packet_switch_4B_pack( - tunneler_phys_core.x, - tunneler_phys_core.y, - 4, - (uint32_t)DispatchRemoteNetworkType::ETH), // 8: remote_receiver_4_info - packet_switch_4B_pack( - tunneler_phys_core.x, - tunneler_phys_core.y, - 5, - (uint32_t)DispatchRemoteNetworkType::ETH), // 9: remote_receiver_5_info - packet_switch_4B_pack( - tunneler_phys_core.x, - tunneler_phys_core.y, - 6, - (uint32_t)DispatchRemoteNetworkType::ETH), // 10: remote_receiver_6_info - packet_switch_4B_pack( - tunneler_phys_core.x, - tunneler_phys_core.y, - 7, - (uint32_t)DispatchRemoteNetworkType::ETH), // 11: remote_receiver_7_info - 0, - 0, // 12 - 13: remote_receiver 8 - 9 - - (demux_queue_start_addr >> 4), // 14: remote_receiver_queue_start_addr_words 0 - (demux_queue_size_bytes >> 4), // 15: remote_receiver_queue_size_words 0 - ((demux_queue_start_addr + demux_queue_size_bytes) >> 4), // 16: remote_receiver_queue_start_addr_words 1 - (demux_queue_size_bytes >> 4), // 17: remote_receiver_queue_size_words 1 - ((demux_queue_start_addr + 2 * demux_queue_size_bytes) >> - 4), // 18: remote_receiver_queue_start_addr_words 2 - (demux_queue_size_bytes >> 4), // 19: remote_receiver_queue_size_words 2 - ((demux_queue_start_addr + 3 * demux_queue_size_bytes) >> - 4), // 20: 
remote_receiver_queue_start_addr_words 3 - (demux_queue_size_bytes >> 4), // 21: remote_receiver_queue_size_words 3 - ((tunneler_queue_start_addr + 4 * tunneler_queue_size_bytes) >> - 4), // 22: remote_receiver_queue_start_addr_words 4 - (tunneler_queue_size_bytes >> 4), // 23: remote_receiver_queue_size_words 4 - ((tunneler_queue_start_addr + 5 * tunneler_queue_size_bytes) >> - 4), // 24: remote_receiver_queue_start_addr_words 5 - (tunneler_queue_size_bytes >> 4), // 25: remote_receiver_queue_size_words 5 - ((tunneler_queue_start_addr + 6 * tunneler_queue_size_bytes) >> - 4), // 26: remote_receiver_queue_start_addr_words 6 - (tunneler_queue_size_bytes >> 4), // 27: remote_receiver_queue_size_words 6 - ((tunneler_queue_start_addr + 7 * tunneler_queue_size_bytes) >> - 4), // 28: remote_receiver_queue_start_addr_words 7 - (tunneler_queue_size_bytes >> 4), // 29: remote_receiver_queue_size_words 7 - 0, - 2, // 30 - 31 Settings for remote reciver 8 - 0, // 32: remote_receiver_queue_start_addr_words 9 - 2, // 33: remote_receiver_queue_size_words 9. - // Unused. Setting to 2 to get around size check assertion that does not allow 0. 
- - packet_switch_4B_pack( - tunneler_phys_core.x, - tunneler_phys_core.y, - 8, - (uint32_t)DispatchRemoteNetworkType::ETH), // 34: remote_sender_0_info - packet_switch_4B_pack( - tunneler_phys_core.x, - tunneler_phys_core.y, - 9, - (uint32_t)DispatchRemoteNetworkType::ETH), // 35: remote_sender_1_info - packet_switch_4B_pack( - tunneler_phys_core.x, - tunneler_phys_core.y, - 10, - (uint32_t)DispatchRemoteNetworkType::ETH), // 36: remote_sender_2_info - packet_switch_4B_pack( - tunneler_phys_core.x, - tunneler_phys_core.y, - 11, - (uint32_t)DispatchRemoteNetworkType::ETH), // 37: remote_sender_3_info - packet_switch_4B_pack( - mux_phys_core_r.x, - mux_phys_core_r.y, - 4, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 38: remote_sender_4_info - packet_switch_4B_pack( - mux_phys_core_r.x, - mux_phys_core_r.y, - 5, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 39: remote_sender_5_info - packet_switch_4B_pack( - mux_phys_core_r.x, - mux_phys_core_r.y, - 6, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 40: remote_sender_6_info - packet_switch_4B_pack( - mux_phys_core_r.x, - mux_phys_core_r.y, - 7, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 41: remote_sender_7_info - 0, - 0, // 42 - 43: remote_sender 8 - 9 - - tunneler_test_results_addr, // 44: test_results_addr - tunneler_test_results_size, // 45: test_results_size - 0, // 46: timeout_cycles - 0, // 47: inner_stop_mux_d_bypass - }; + .noc = tt_metal::NOC::NOC_0, + .compile_args = tunneler_l_compile_args, + .defines = defines + } + ); + + + std::vector tunneler_r_compile_args = + { + dest_endpoint_start_id, // 0: endpoint_id_start_index + 2*num_endpoints, // 1: tunnel_lanes. 1 => Unidirectional. 2 => Bidirectional. 
+ (tunneler_queue_start_addr >> 4), // 2: rx_queue_start_addr_words + (tunneler_queue_size_bytes >> 4), // 3: rx_queue_size_words + packet_switch_4B_pack(demux_phys_core_r.x, + demux_phys_core_r.y, + 0, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: remote_receiver_0_info + packet_switch_4B_pack(demux_phys_core_r.x, + demux_phys_core_r.y, + 1, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: remote_receiver_1_info + packet_switch_4B_pack(demux_phys_core_r.x, + demux_phys_core_r.y, + 2, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 6: remote_receiver_2_info + packet_switch_4B_pack(demux_phys_core_r.x, + demux_phys_core_r.y, + 3, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 7: remote_receiver_3_info + packet_switch_4B_pack(tunneler_phys_core.x, + tunneler_phys_core.y, + 4, + (uint32_t)DispatchRemoteNetworkType::ETH), // 8: remote_receiver_4_info + packet_switch_4B_pack(tunneler_phys_core.x, + tunneler_phys_core.y, + 5, + (uint32_t)DispatchRemoteNetworkType::ETH), // 9: remote_receiver_5_info + packet_switch_4B_pack(tunneler_phys_core.x, + tunneler_phys_core.y, + 6, + (uint32_t)DispatchRemoteNetworkType::ETH), // 10: remote_receiver_6_info + packet_switch_4B_pack(tunneler_phys_core.x, + tunneler_phys_core.y, + 7, + (uint32_t)DispatchRemoteNetworkType::ETH), // 11: remote_receiver_7_info + 0, 0, // 12 - 13: remote_receiver_info[8 - 9] + (demux_queue_start_addr >> 4), // 14: remote_receiver_queue_start_addr_words 0 + (demux_queue_size_bytes >> 4), // 15: remote_receiver_queue_size_words 0 + ((demux_queue_start_addr + demux_queue_size_bytes) >> 4), // 16: remote_receiver_queue_start_addr_words 1 + (demux_queue_size_bytes >> 4), // 17: remote_receiver_queue_size_words 1 + ((demux_queue_start_addr + 2 * demux_queue_size_bytes) >> 4), // 18: remote_receiver_queue_start_addr_words 2 + (demux_queue_size_bytes >> 4), // 19: remote_receiver_queue_size_words 2 + ((demux_queue_start_addr + 3 * demux_queue_size_bytes) >> 4), // 20: 
remote_receiver_queue_start_addr_words 3 + (demux_queue_size_bytes >> 4), // 21: remote_receiver_queue_size_words 3 + ((tunneler_queue_start_addr + 4 * tunneler_queue_size_bytes) >> 4), // 22: remote_receiver_queue_start_addr_words 4 + (tunneler_queue_size_bytes >> 4), // 23: remote_receiver_queue_size_words 4 + ((tunneler_queue_start_addr + 5 * tunneler_queue_size_bytes) >> 4), // 24: remote_receiver_queue_start_addr_words 5 + (tunneler_queue_size_bytes >> 4), // 25: remote_receiver_queue_size_words 5 + ((tunneler_queue_start_addr + 6 * tunneler_queue_size_bytes) >> 4), // 26: remote_receiver_queue_start_addr_words 6 + (tunneler_queue_size_bytes >> 4), // 27: remote_receiver_queue_size_words 6 + ((tunneler_queue_start_addr + 7 * tunneler_queue_size_bytes) >> 4), // 28: remote_receiver_queue_start_addr_words 7 + (tunneler_queue_size_bytes >> 4), // 29: remote_receiver_queue_size_words 7 + 0, 2, // 30 - 31 Settings for remote receiver 8 + 0, // 32: remote_receiver_queue_start_addr_words 9 + 2, // 33: remote_receiver_queue_size_words 9. Unused. Setting to 2 to get around size check assertion that does not allow 0. 
+ packet_switch_4B_pack(tunneler_phys_core.x, + tunneler_phys_core.y, + 8, + (uint32_t)DispatchRemoteNetworkType::ETH), // 34: remote_sender_0_info + packet_switch_4B_pack(tunneler_phys_core.x, + tunneler_phys_core.y, + 9, + (uint32_t)DispatchRemoteNetworkType::ETH), // 35: remote_sender_1_info + packet_switch_4B_pack(tunneler_phys_core.x, + tunneler_phys_core.y, + 10, + (uint32_t)DispatchRemoteNetworkType::ETH), // 36: remote_sender_2_info + packet_switch_4B_pack(tunneler_phys_core.x, + tunneler_phys_core.y, + 11, + (uint32_t)DispatchRemoteNetworkType::ETH), // 37: remote_sender_3_info + packet_switch_4B_pack(mux_phys_core_r.x, + mux_phys_core_r.y, + 4, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 38: remote_sender_4_info + packet_switch_4B_pack(mux_phys_core_r.x, + mux_phys_core_r.y, + 5, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 39: remote_sender_5_info + packet_switch_4B_pack(mux_phys_core_r.x, + mux_phys_core_r.y, + 6, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 40: remote_sender_6_info + packet_switch_4B_pack(mux_phys_core_r.x, + mux_phys_core_r.y, + 7, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 41: remote_sender_7_info + 0, 0, // 42 - 43: remote_sender[8 - 9] + + tunneler_test_results_addr, // 44: test_results_addr + tunneler_test_results_size, // 45: test_results_size + timeout_mcycles * 1000 * 1000 * 4, // 46: timeout_cycles + 0, // 47: inner_stop_mux_d_bypass + tunneler_buffers_right.at("input")[0], // 48: vc_eth_tunneler_input_scratch_buffers[0] + tunneler_buffers_right.at("input")[1], // 49: vc_eth_tunneler_input_scratch_buffers[1] + tunneler_buffers_right.at("input")[2], // 50: vc_eth_tunneler_input_scratch_buffers[2] + tunneler_buffers_right.at("input")[3], // 51: vc_eth_tunneler_input_scratch_buffers[3] + tunneler_buffers_right.at("input")[4], // 52: vc_eth_tunneler_input_scratch_buffers[4] + tunneler_buffers_right.at("input")[5], // 53: vc_eth_tunneler_input_scratch_buffers[5] + tunneler_buffers_right.at("input")[6], // 54:
vc_eth_tunneler_input_scratch_buffers[6] + tunneler_buffers_right.at("input")[7], // 55: vc_eth_tunneler_input_scratch_buffers[7] + 0, 0, // 56 - 57: vc_eth_tunneler_input_scratch_buffers[8 - 9] + + tunneler_buffers_left.at("output")[0], // 58: vc_eth_tunneler_input_remote_scratch_buffers[0] + tunneler_buffers_left.at("output")[1], // 59: vc_eth_tunneler_input_remote_scratch_buffers[1] + tunneler_buffers_left.at("output")[2], // 60: vc_eth_tunneler_input_remote_scratch_buffers[2] + tunneler_buffers_left.at("output")[3], // 61: vc_eth_tunneler_input_remote_scratch_buffers[3] + output_scratch_buffers_right.at("mux")[0], // 62: vc_eth_tunneler_input_remote_scratch_buffers[4] + output_scratch_buffers_right.at("mux")[1], // 63: vc_eth_tunneler_input_remote_scratch_buffers[5] + output_scratch_buffers_right.at("mux")[2], // 64: vc_eth_tunneler_input_remote_scratch_buffers[6] + output_scratch_buffers_right.at("mux")[3], // 65: vc_eth_tunneler_input_remote_scratch_buffers[7] + 0, 0, // 66 - 67: vc_eth_tunneler_input_remote_scratch_buffers[8 - 9] + + tunneler_buffers_right.at("output")[0], // 68: vc_eth_tunneler_output_scratch_buffers[0] + tunneler_buffers_right.at("output")[1], // 69: vc_eth_tunneler_output_scratch_buffers[1] + tunneler_buffers_right.at("output")[2], // 70: vc_eth_tunneler_output_scratch_buffers[2] + tunneler_buffers_right.at("output")[3], // 71: vc_eth_tunneler_output_scratch_buffers[3] + tunneler_buffers_right.at("output")[4], // 72: vc_eth_tunneler_output_scratch_buffers[4] + tunneler_buffers_right.at("output")[5], // 73: vc_eth_tunneler_output_scratch_buffers[5] + tunneler_buffers_right.at("output")[6], // 74: vc_eth_tunneler_output_scratch_buffers[6] + tunneler_buffers_right.at("output")[7], // 75: vc_eth_tunneler_output_scratch_buffers[7] + 0, 0, // 76 - 77: vc_eth_tunneler_output_scratch_buffers[8 - 9] + + input_scratch_buffers_right.at("demux")[0], // 78: vc_eth_tunneler_output_remote_scratch_buffers[0] + input_scratch_buffers_right.at("demux")[1], 
// 79: vc_eth_tunneler_output_remote_scratch_buffers[1] + input_scratch_buffers_right.at("demux")[2], // 80: vc_eth_tunneler_output_remote_scratch_buffers[2] + input_scratch_buffers_right.at("demux")[3], // 81: vc_eth_tunneler_output_remote_scratch_buffers[3] + tunneler_buffers_left.at("input")[4], // 82: vc_eth_tunneler_output_remote_scratch_buffers[4] + tunneler_buffers_left.at("input")[5], // 83: vc_eth_tunneler_output_remote_scratch_buffers[5] + tunneler_buffers_left.at("input")[6], // 84: vc_eth_tunneler_output_remote_scratch_buffers[6] + tunneler_buffers_left.at("input")[7], // 85: vc_eth_tunneler_output_remote_scratch_buffers[7] + 0, 0, // 86 - 87: vc_eth_tunneler_output_remote_scratch_buffers[8 - 9] + }; auto tunneler_r_kernel = tt_metal::CreateKernel( program_r, @@ -877,82 +961,78 @@ int main(int argc, char** argv) { // Demux uint32_t dest_map_array[4] = {0, 1, 2, 3}; uint64_t dest_endpoint_output_map = packet_switch_dest_pack(dest_map_array, 4); - std::vector demux_compile_args = { - dest_endpoint_start_id, // 0: endpoint_id_start_index - (demux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words - (demux_queue_size_bytes >> 4), // 2: rx_queue_size_words - num_dest_endpoints, // 3: demux_fan_out - packet_switch_4B_pack( - rx_phys_core[0].x, - rx_phys_core[0].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: remote_tx_0_info - packet_switch_4B_pack( - rx_phys_core[1].x, - rx_phys_core[1].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: remote_tx_1_info - packet_switch_4B_pack( - rx_phys_core[2].x, - rx_phys_core[2].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 6: remote_tx_2_info - packet_switch_4B_pack( - rx_phys_core[3].x, - rx_phys_core[3].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 7: remote_tx_3_info - (rx_queue_start_addr >> 4), // 8: remote_tx_queue_start_addr_words 0 - (rx_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words 0 - (rx_queue_start_addr >> 4), // 10: remote_tx_queue_start_addr_words 
1 - (rx_queue_size_bytes >> 4), // 11: remote_tx_queue_size_words 1 - (rx_queue_start_addr >> 4), // 12: remote_tx_queue_start_addr_words 2 - (rx_queue_size_bytes >> 4), // 13: remote_tx_queue_size_words 2 - (rx_queue_start_addr >> 4), // 14: remote_tx_queue_start_addr_words 3 - (rx_queue_size_bytes >> 4), // 15: remote_tx_queue_size_words 3 - //(uint32_t)tunneler_phys_core.x, // 16: remote_rx_x - //(uint32_t)tunneler_phys_core.y, // 17: remote_rx_y - // 3, // 18: remote_rx_queue_id - //(uint32_t)DispatchRemoteNetworkType::NOC0, // 19: tx_network_type - - packet_switch_4B_pack( - tunneler_phys_core.x, - tunneler_phys_core.y, - 12, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 16: remote_rx_0_info - packet_switch_4B_pack( - tunneler_phys_core.x, - tunneler_phys_core.y, - 13, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 17: remote_rx_1_info - packet_switch_4B_pack( - tunneler_phys_core.x, - tunneler_phys_core.y, - 14, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 18: remote_rx_2_info - packet_switch_4B_pack( - tunneler_phys_core.x, - tunneler_phys_core.y, - 15, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 19: remote_rx_3_info - - (uint32_t)(dest_endpoint_output_map >> 32), // 20: dest_endpoint_output_map_hi - (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF), // 21: dest_endpoint_output_map_lo - test_results_addr, // 22: test_results_addr - test_results_size, // 23: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 24: timeout_cycles - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0 // 25-35: packetize/depacketize settings - }; + std::vector demux_compile_args = + { + dest_endpoint_start_id, // 0: endpoint_id_start_index + (demux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words + (demux_queue_size_bytes >> 4), // 2: rx_queue_size_words + num_dest_endpoints, // 3: demux_fan_out + packet_switch_4B_pack(rx_phys_core[0].x, + rx_phys_core[0].y, + 0, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: remote_tx_0_info + 
packet_switch_4B_pack(rx_phys_core[1].x, + rx_phys_core[1].y, + 0, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: remote_tx_1_info + packet_switch_4B_pack(rx_phys_core[2].x, + rx_phys_core[2].y, + 0, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 6: remote_tx_2_info + packet_switch_4B_pack(rx_phys_core[3].x, + rx_phys_core[3].y, + 0, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 7: remote_tx_3_info + (rx_queue_start_addr >> 4), // 8: remote_tx_queue_start_addr_words 0 + (rx_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words 0 + (rx_queue_start_addr >> 4), // 10: remote_tx_queue_start_addr_words 1 + (rx_queue_size_bytes >> 4), // 11: remote_tx_queue_size_words 1 + (rx_queue_start_addr >> 4), // 12: remote_tx_queue_start_addr_words 2 + (rx_queue_size_bytes >> 4), // 13: remote_tx_queue_size_words 2 + (rx_queue_start_addr >> 4), // 14: remote_tx_queue_start_addr_words 3 + (rx_queue_size_bytes >> 4), // 15: remote_tx_queue_size_words 3 + packet_switch_4B_pack(tunneler_phys_core.x, + tunneler_phys_core.y, + 12, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 16: remote_rx_0_info + packet_switch_4B_pack(tunneler_phys_core.x, + tunneler_phys_core.y, + 13, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 17: remote_rx_1_info + packet_switch_4B_pack(tunneler_phys_core.x, + tunneler_phys_core.y, + 14, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 18: remote_rx_2_info + packet_switch_4B_pack(tunneler_phys_core.x, + tunneler_phys_core.y, + 15, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 19: remote_rx_3_info + (uint32_t)(dest_endpoint_output_map >> 32), // 20: dest_endpoint_output_map_hi + (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF), // 21: dest_endpoint_output_map_lo + test_results_addr, // 22: test_results_addr + test_results_size, // 23: test_results_size + timeout_mcycles * 1000 * 1000 * 4, // 24: timeout_cycles + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 25 - 35: packetize/depacketize settings + input_scratch_buffers_left.at("demux")[0], // 36: 
vc_packet_router_input_scratch_buffers[0] + input_scratch_buffers_left.at("demux")[1], // 37: vc_packet_router_input_scratch_buffers[1] + input_scratch_buffers_left.at("demux")[2], // 38: vc_packet_router_input_scratch_buffers[2] + input_scratch_buffers_left.at("demux")[3], // 39: vc_packet_router_input_scratch_buffers[3] + + tunneler_buffers_left.at("output")[4], // 40: vc_packet_router_input_remote_scratch_buffers[0] + tunneler_buffers_left.at("output")[5], // 41: vc_packet_router_input_remote_scratch_buffers[1] + tunneler_buffers_left.at("output")[6], // 42: vc_packet_router_input_remote_scratch_buffers[2] + tunneler_buffers_left.at("output")[7], // 43: vc_packet_router_input_remote_scratch_buffers[3] + + output_scratch_buffers_left.at("demux")[0], // 44: vc_packet_router_output_scratch_buffers[0] + output_scratch_buffers_left.at("demux")[1], // 45: vc_packet_router_output_scratch_buffers[1] + output_scratch_buffers_left.at("demux")[2], // 46: vc_packet_router_output_scratch_buffers[2] + output_scratch_buffers_left.at("demux")[3], // 47: vc_packet_router_output_scratch_buffers[3] + + input_scratch_buffers_left.at("traffic_gen_rx")[0], // 48: vc_packet_router_output_remote_scratch_buffers[0] + input_scratch_buffers_left.at("traffic_gen_rx")[1], // 49: vc_packet_router_output_remote_scratch_buffers[1] + input_scratch_buffers_left.at("traffic_gen_rx")[2], // 50: vc_packet_router_output_remote_scratch_buffers[2] + input_scratch_buffers_left.at("traffic_gen_rx")[3], // 51: vc_packet_router_output_remote_scratch_buffers[3] + }; log_info(LogTest, "run demux at x={},y={}", demux_core.x, demux_core.y); log_info(LogTest, "run demux at physical x={},y={}", demux_phys_core.x, demux_phys_core.y); @@ -965,84 +1045,82 @@ int main(int argc, char** argv) { .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default, .compile_args = demux_compile_args, - .defines = defines}); - - std::vector demux_compile_args_r = { - dest_endpoint_start_id, // 0: 
endpoint_id_start_index - (demux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words - (demux_queue_size_bytes >> 4), // 2: rx_queue_size_words - num_dest_endpoints, // 3: demux_fan_out - packet_switch_4B_pack( - rx_phys_core_r[0].x, - rx_phys_core_r[0].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: remote_tx_0_info - packet_switch_4B_pack( - rx_phys_core_r[1].x, - rx_phys_core_r[1].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: remote_tx_1_info - packet_switch_4B_pack( - rx_phys_core_r[2].x, - rx_phys_core_r[2].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 6: remote_tx_2_info - packet_switch_4B_pack( - rx_phys_core_r[3].x, - rx_phys_core_r[3].y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 7: remote_tx_3_info - (rx_queue_start_addr >> 4), // 8: remote_tx_queue_start_addr_words 0 - (rx_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words 0 - (rx_queue_start_addr >> 4), // 10: remote_tx_queue_start_addr_words 1 - (rx_queue_size_bytes >> 4), // 11: remote_tx_queue_size_words 1 - (rx_queue_start_addr >> 4), // 12: remote_tx_queue_start_addr_words 2 - (rx_queue_size_bytes >> 4), // 13: remote_tx_queue_size_words 2 - (rx_queue_start_addr >> 4), // 14: remote_tx_queue_start_addr_words 3 - (rx_queue_size_bytes >> 4), // 15: remote_tx_queue_size_words 3 - //(uint32_t)tunneler_phys_core.x, // 16: remote_rx_x - //(uint32_t)tunneler_phys_core.y, // 17: remote_rx_y - // 3, // 18: remote_rx_queue_id - //(uint32_t)DispatchRemoteNetworkType::NOC0, // 19: tx_network_type - - packet_switch_4B_pack( - r_tunneler_phys_core.x, - r_tunneler_phys_core.y, - 8, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 16: remote_rx_0_info - packet_switch_4B_pack( - r_tunneler_phys_core.x, - r_tunneler_phys_core.y, - 9, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 17: remote_rx_1_info - packet_switch_4B_pack( - r_tunneler_phys_core.x, - r_tunneler_phys_core.y, - 10, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 18: remote_rx_2_info - 
packet_switch_4B_pack( - r_tunneler_phys_core.x, - r_tunneler_phys_core.y, - 11, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 19: remote_rx_3_info - - (uint32_t)(dest_endpoint_output_map >> 32), // 20: dest_endpoint_output_map_hi - (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF), // 21: dest_endpoint_output_map_lo - test_results_addr, // 22: test_results_addr - test_results_size, // 23: test_results_size - timeout_mcycles * 1000 * 1000 * 4, // 24: timeout_cycles - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0 // 25-35: packetize/depacketize settings - }; + .defines = defines + } + ); + + std::vector demux_compile_args_r = + { + dest_endpoint_start_id, // 0: endpoint_id_start_index + (demux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words + (demux_queue_size_bytes >> 4), // 2: rx_queue_size_words + num_dest_endpoints, // 3: demux_fan_out + packet_switch_4B_pack(rx_phys_core_r[0].x, + rx_phys_core_r[0].y, + 0, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: remote_tx_0_info + packet_switch_4B_pack(rx_phys_core_r[1].x, + rx_phys_core_r[1].y, + 0, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: remote_tx_1_info + packet_switch_4B_pack(rx_phys_core_r[2].x, + rx_phys_core_r[2].y, + 0, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 6: remote_tx_2_info + packet_switch_4B_pack(rx_phys_core_r[3].x, + rx_phys_core_r[3].y, + 0, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 7: remote_tx_3_info + (rx_queue_start_addr >> 4), // 8: remote_tx_queue_start_addr_words 0 + (rx_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words 0 + (rx_queue_start_addr >> 4), // 10: remote_tx_queue_start_addr_words 1 + (rx_queue_size_bytes >> 4), // 11: remote_tx_queue_size_words 1 + (rx_queue_start_addr >> 4), // 12: remote_tx_queue_start_addr_words 2 + (rx_queue_size_bytes >> 4), // 13: remote_tx_queue_size_words 2 + (rx_queue_start_addr >> 4), // 14: remote_tx_queue_start_addr_words 3 + (rx_queue_size_bytes >> 4), // 15: remote_tx_queue_size_words 3 + 
packet_switch_4B_pack(r_tunneler_phys_core.x, + r_tunneler_phys_core.y, + 8, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 16: remote_rx_0_info + packet_switch_4B_pack(r_tunneler_phys_core.x, + r_tunneler_phys_core.y, + 9, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 17: remote_rx_1_info + packet_switch_4B_pack(r_tunneler_phys_core.x, + r_tunneler_phys_core.y, + 10, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 18: remote_rx_2_info + packet_switch_4B_pack(r_tunneler_phys_core.x, + r_tunneler_phys_core.y, + 11, + (uint32_t)DispatchRemoteNetworkType::NOC0), // 19: remote_rx_3_info + (uint32_t)(dest_endpoint_output_map >> 32), // 20: dest_endpoint_output_map_hi + (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF), // 21: dest_endpoint_output_map_lo + test_results_addr, // 22: test_results_addr + test_results_size, // 23: test_results_size + timeout_mcycles * 1000 * 1000 * 4, // 24: timeout_cycles + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 25 - 35: packetize/depacketize settings + input_scratch_buffers_right.at("demux")[0], // 36: vc_packet_router_input_scratch_buffers[0] + input_scratch_buffers_right.at("demux")[1], // 37: vc_packet_router_input_scratch_buffers[1] + input_scratch_buffers_right.at("demux")[2], // 38: vc_packet_router_input_scratch_buffers[2] + input_scratch_buffers_right.at("demux")[3], // 39: vc_packet_router_input_scratch_buffers[3] + + tunneler_buffers_right.at("output")[0], // 40: vc_packet_router_input_remote_scratch_buffers[0] + tunneler_buffers_right.at("output")[1], // 41: vc_packet_router_input_remote_scratch_buffers[1] + tunneler_buffers_right.at("output")[2], // 42: vc_packet_router_input_remote_scratch_buffers[2] + tunneler_buffers_right.at("output")[3], // 43: vc_packet_router_input_remote_scratch_buffers[3] + + output_scratch_buffers_right.at("demux")[0], // 44: vc_packet_router_output_scratch_buffers[0] + output_scratch_buffers_right.at("demux")[1], // 45: vc_packet_router_output_scratch_buffers[1] + 
output_scratch_buffers_right.at("demux")[2], // 46: vc_packet_router_output_scratch_buffers[2] + output_scratch_buffers_right.at("demux")[3], // 47: vc_packet_router_output_scratch_buffers[3] + + input_scratch_buffers_right.at("traffic_gen_rx")[0], // 48: vc_packet_router_output_remote_scratch_buffers[0] + input_scratch_buffers_right.at("traffic_gen_rx")[1], // 49: vc_packet_router_output_remote_scratch_buffers[1] + input_scratch_buffers_right.at("traffic_gen_rx")[2], // 50: vc_packet_router_output_remote_scratch_buffers[2] + input_scratch_buffers_right.at("traffic_gen_rx")[3], // 51: vc_packet_router_output_remote_scratch_buffers[3] + }; log_info(LogTest, "run remote demux at x={},y={}", demux_core.x, demux_core.y); log_info(LogTest, "run remote demux at physical x={},y={}", demux_phys_core_r.x, demux_phys_core_r.y); @@ -1055,7 +1133,9 @@ int main(int argc, char** argv) { .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default, .compile_args = demux_compile_args_r, - .defines = defines}); + .defines = defines + } + ); log_info(LogTest, "Starting test..."); @@ -1069,10 +1149,10 @@ int main(int argc, char** argv) { std::chrono::duration elapsed_seconds = (end - start); log_info(LogTest, "Ran in {:.2f}us", elapsed_seconds.count() * 1000 * 1000); - vector> tx_results; - vector> tx_results_r; - vector> rx_results; - vector> rx_results_r; + std::vector> tx_results; + std::vector> tx_results_r; + std::vector> rx_results; + std::vector> rx_results_r; for (uint32_t i = 0; i < num_src_endpoints; i++) { tx_results.push_back( @@ -1117,22 +1197,22 @@ int main(int argc, char** argv) { pass &= (rx_results_r[i][PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); } - vector mux_results = + std::vector mux_results = tt::llrt::read_hex_vec_from_core(device->id(), mux_phys_core, test_results_addr, test_results_size); log_info(LogTest, "MUX status = {}", packet_queue_test_status_to_string(mux_results[PQ_TEST_STATUS_INDEX])); pass &= 
(mux_results[PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - vector mux_results_r = + std::vector mux_results_r = tt::llrt::read_hex_vec_from_core(device_r->id(), mux_phys_core_r, test_results_addr, test_results_size); log_info(LogTest, "R MUX status = {}", packet_queue_test_status_to_string(mux_results_r[PQ_TEST_STATUS_INDEX])); pass &= (mux_results_r[PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - vector demux_results = + std::vector demux_results = tt::llrt::read_hex_vec_from_core(device->id(), demux_phys_core, test_results_addr, test_results_size); log_info(LogTest, "DEMUX status = {}", packet_queue_test_status_to_string(demux_results[PQ_TEST_STATUS_INDEX])); pass &= (demux_results[0] == PACKET_QUEUE_TEST_PASS); - vector demux_results_r = + std::vector demux_results_r = tt::llrt::read_hex_vec_from_core(device_r->id(), demux_phys_core_r, test_results_addr, test_results_size); log_info( LogTest, "R DEMUX status = {}", packet_queue_test_status_to_string(demux_results_r[PQ_TEST_STATUS_INDEX])); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel.cpp index 03119c6e726c..acf1dd9185c0 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel.cpp @@ -5,18 +5,14 @@ #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/llrt/rtoptions.hpp" -#include "tt_metal/impl/dispatch/cq_commands.hpp" -#include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" -#include "kernels/traffic_gen_test.hpp" #include "tt_metal/impl/device/device.hpp" #include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp" -using std::vector; using namespace tt; using json = nlohmann::json; -int main(int argc, char **argv) { +int main(int argc, char **argv) { constexpr uint32_t default_tx_x = 0; constexpr 
uint32_t default_tx_y = 0; constexpr uint32_t default_rx_x = 0; @@ -31,6 +27,9 @@ int main(int argc, char **argv) { constexpr uint32_t default_data_kb_per_tx = 1024*1024; constexpr uint32_t default_max_packet_size_words = 0x100; + constexpr uint32_t default_input_scratch_buffer_base_addr = 0x50000; + constexpr uint32_t default_output_scratch_buffer_base_addr = 0x60000; + constexpr uint32_t default_tx_queue_start_addr = 0x80000; constexpr uint32_t default_tx_queue_size_bytes = 0x10000; constexpr uint32_t default_rx_queue_start_addr = 0xa0000; @@ -46,7 +45,8 @@ int main(int argc, char **argv) { constexpr uint32_t default_tunneler_queue_start_addr = 0x19000; constexpr uint32_t default_tunneler_queue_size_bytes = 0x4000; // * 8 as it is birectional, maximum queue size for ecore L1 (power of 2) constexpr uint32_t default_tunneler_test_results_addr = 0x39000; - constexpr uint32_t default_tunneler_test_results_size = 0x7000; + constexpr uint32_t default_tunneler_test_results_size = 0x1000; + constexpr uint32_t default_tunneler_buffer_base_addr = 0x3A000; constexpr uint32_t default_timeout_mcycles = 1000; constexpr uint32_t default_rx_disable_data_check = 0; @@ -107,6 +107,9 @@ int main(int argc, char **argv) { log_info(LogTest, " --tx_data_sent_per_iter_high: the criteria to determine the amount of tx data sent per iter is high (unit: words); if both 0, then disable counting it in tx kernel, default = {}", default_tx_data_sent_per_iter_high); log_info(LogTest, " --dump_stat_json: Dump stats in json to output_dir, default = {}", default_dump_stat_json); log_info(LogTest, " --output_dir: Output directory, default = {}", default_output_dir); + log_info(LogTest, " --input_scratch_buffer_base_addr: Scratch buffer for input queues base address, default = {:#x}", default_input_scratch_buffer_base_addr); + log_info(LogTest, " --output_scratch_buffer_base_addr: Scratch buffer for output queues base address, default = {:#x}", default_output_scratch_buffer_base_addr); + 
log_info(LogTest, " --tunneler_scratch_buffer_base_addr: Scratch buffer for tunneler queues base address, default = {:#x}", default_tunneler_buffer_base_addr); return 0; } @@ -146,6 +149,9 @@ int main(int argc, char **argv) { uint8_t tx_pkt_dest_size_choice = (uint8_t) test_args::get_command_option_uint32(input_args, "--tx_pkt_dest_size_choice", default_tx_pkt_dest_size_choice); uint32_t tx_data_sent_per_iter_low = test_args::get_command_option_uint32(input_args, "--tx_data_sent_per_iter_low", default_tx_data_sent_per_iter_low); uint32_t tx_data_sent_per_iter_high = test_args::get_command_option_uint32(input_args, "--tx_data_sent_per_iter_high", default_tx_data_sent_per_iter_high); + uint32_t input_scratch_buffer_base_addr = test_args::get_command_option_uint32(input_args, "--input_scratch_buffer_base_addr", default_input_scratch_buffer_base_addr); + uint32_t output_scratch_buffer_base_addr = test_args::get_command_option_uint32(input_args, "--output_scratch_buffer_base_addr", default_output_scratch_buffer_base_addr); + uint32_t tunneler_buffer_base_addr = test_args::get_command_option_uint32(input_args, "--tunneler_buffer_base_addr", default_tunneler_buffer_base_addr); assert((pkt_dest_size_choices_t)tx_pkt_dest_size_choice == pkt_dest_size_choices_t::SAME_START_RNDROBIN_FIX_SIZE && rx_disable_header_check || (pkt_dest_size_choices_t)tx_pkt_dest_size_choice == pkt_dest_size_choices_t::RANDOM); @@ -155,6 +161,40 @@ int main(int argc, char **argv) { {"FD_CORE_TYPE", std::to_string(0)}, // todo, support dispatch on eth }; + // ----- Left Chip ----- + const auto input_scratch_buffers_left = make_buffer_addresses_for_test(input_scratch_buffer_base_addr, packet_queue_ptr_buffer_size, { + { "traffic_gen_tx", num_src_endpoints }, + { "mux", MAX_SWITCH_FAN_OUT }, + { "demux", MAX_SWITCH_FAN_OUT }, + { "traffic_gen_rx", num_dest_endpoints }, + }); + + const auto tunneler_buffers_left = make_buffer_addresses_for_test(tunneler_buffer_base_addr, packet_queue_ptr_buffer_size, { 
+ { "input", MAX_TUNNEL_LANES }, + { "output", MAX_TUNNEL_LANES }, + }); + + const auto output_scratch_buffers_left = make_buffer_addresses_for_test(output_scratch_buffer_base_addr, packet_queue_ptr_buffer_size, { + { "traffic_gen_tx", num_src_endpoints }, + { "traffic_gen_tx_mock", num_src_endpoints }, + { "mux", MAX_SWITCH_FAN_OUT }, + { "demux", MAX_SWITCH_FAN_OUT }, + }); + + // ----- Right Chip ----- + const auto input_scratch_buffers_right = make_buffer_addresses_for_test(input_scratch_buffer_base_addr, packet_queue_ptr_buffer_size, { + { "loopback_mux", MAX_SWITCH_FAN_OUT }, + }); + + const auto tunneler_buffers_right = make_buffer_addresses_for_test(tunneler_buffer_base_addr, packet_queue_ptr_buffer_size, { + { "input", MAX_TUNNEL_LANES }, + { "output", MAX_TUNNEL_LANES }, + }); + + const auto output_scratch_buffers_right = make_buffer_addresses_for_test(output_scratch_buffer_base_addr, packet_queue_ptr_buffer_size, { + { "loopback_mux", MAX_SWITCH_FAN_OUT }, + }); + try { int num_devices = tt_metal::GetNumAvailableDevices(); if (test_device_id >= num_devices) { @@ -240,7 +280,11 @@ int main(int argc, char **argv) { tx_skip_pkt_content_gen, // 18: skip_pkt_content_gen tx_pkt_dest_size_choice, // 19: pkt_dest_size_choice tx_data_sent_per_iter_low, // 20: data_sent_per_iter_low - tx_data_sent_per_iter_high // 21: data_sent_per_iter_high + tx_data_sent_per_iter_high, // 21: data_sent_per_iter_high + input_scratch_buffers_left.at("traffic_gen_tx")[i], // 22: traffic_gen_input_ptrs_addr + output_scratch_buffers_left.at("traffic_gen_tx_mock")[i], // 23: traffic_gen_input_mock_remote_ptrs_addr + output_scratch_buffers_left.at("traffic_gen_tx")[i], // 24: traffic_gen_output_ptrs_addr + input_scratch_buffers_left.at("mux")[i], // 25: traffic_gen_output_remote_ptrs_addr }; log_info(LogTest, "run traffic_gen_tx at x={},y={}", core.x, core.y); @@ -281,7 +325,9 @@ int main(int argc, char **argv) { src_endpoint_start_id + i, // 15: src_endpoint_start_id 
dest_endpoint_start_id + i, // 16: dest_endpoint_start_id timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - rx_disable_header_check // 18: disable_header_check + rx_disable_header_check, // 18: disable_header_check + input_scratch_buffers_left.at("traffic_gen_rx")[i], // 19: traffic_gen_input_ptrs_addr + output_scratch_buffers_left.at("demux")[i], // 20: traffic_gen_input_remote_ptrs_addr }; log_info(LogTest, "run traffic_gen_rx at x={},y={}", core.x, core.y); @@ -354,7 +400,27 @@ int main(int argc, char **argv) { test_results_addr, // 22: test_results_addr test_results_size, // 23: test_results_size timeout_mcycles * 1000 * 1000 * 4, // 24: timeout_cycles - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 25-35: packetize/depacketize settings + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 25-35: packetize/depacketize settings + + input_scratch_buffers_left.at("mux")[0], // 36: vc_packet_router_input_scratch_buffers[0] + input_scratch_buffers_left.at("mux")[1], // 37: vc_packet_router_input_scratch_buffers[1] + input_scratch_buffers_left.at("mux")[2], // 38: vc_packet_router_input_scratch_buffers[2] + input_scratch_buffers_left.at("mux")[3], // 39: vc_packet_router_input_scratch_buffers[3] + + output_scratch_buffers_left.at("traffic_gen_tx")[0], // 40: vc_packet_router_input_remote_scratch_buffers[0] + output_scratch_buffers_left.at("traffic_gen_tx")[1], // 41: vc_packet_router_input_remote_scratch_buffers[1] + output_scratch_buffers_left.at("traffic_gen_tx")[2], // 42: vc_packet_router_input_remote_scratch_buffers[2] + output_scratch_buffers_left.at("traffic_gen_tx")[3], // 43: vc_packet_router_input_remote_scratch_buffers[3] + + output_scratch_buffers_left.at("mux")[0], // 44: vc_packet_router_output_scratch_buffers[0] + output_scratch_buffers_left.at("mux")[1], // 45: vc_packet_router_output_scratch_buffers[1] + output_scratch_buffers_left.at("mux")[2], // 46: vc_packet_router_output_scratch_buffers[2] + output_scratch_buffers_left.at("mux")[3], // 47: 
vc_packet_router_output_scratch_buffers[3] + + tunneler_buffers_left.at("input")[0], // 48: vc_packet_router_output_remote_scratch_buffers[0] + tunneler_buffers_left.at("input")[1], // 49: vc_packet_router_output_remote_scratch_buffers[1] + tunneler_buffers_left.at("input")[2], // 50: vc_packet_router_output_remote_scratch_buffers[2] + tunneler_buffers_left.at("input")[3], // 51: vc_packet_router_output_remote_scratch_buffers[3] }; log_info(LogTest, "run mux at x={},y={}", mux_core.x, mux_core.y); @@ -429,12 +495,11 @@ int main(int argc, char **argv) { ((demux_queue_start_addr + 3 * demux_queue_size_bytes) >> 4), // 28: remote_receiver_queue_start_addr_words 7 (demux_queue_size_bytes >> 4), // 29: remote_receiver_queue_size_words 7 - 0, 2, // 30 - 31 Settings for remote reciver 8 + 0, 2, // 30 - 31 Settings for remote receiver 8 0, // 32: remote_receiver_queue_start_addr_words 9 2, // 33: remote_receiver_queue_size_words 9. // Unused. Setting to 2 to get around size check assertion that does not allow 0. 
- packet_switch_4B_pack(mux_phys_core.x, mux_phys_core.y, num_dest_endpoints, @@ -472,7 +537,50 @@ int main(int argc, char **argv) { tunneler_test_results_addr, // 44: test_results_addr tunneler_test_results_size, // 45: test_results_size timeout_mcycles * 1000 * 1000 * 4, // 46: timeout_cycles - 0, //47: inner_stop_mux_d_bypass + 0, // 47: inner_stop_mux_d_bypass + tunneler_buffers_left.at("input")[0], // 48: vc_eth_tunneler_input_scratch_buffers[0] + tunneler_buffers_left.at("input")[1], // 49: vc_eth_tunneler_input_scratch_buffers[1] + tunneler_buffers_left.at("input")[2], // 50: vc_eth_tunneler_input_scratch_buffers[2] + tunneler_buffers_left.at("input")[3], // 51: vc_eth_tunneler_input_scratch_buffers[3] + tunneler_buffers_left.at("input")[4], // 52: vc_eth_tunneler_input_scratch_buffers[4] + tunneler_buffers_left.at("input")[5], // 53: vc_eth_tunneler_input_scratch_buffers[5] + tunneler_buffers_left.at("input")[6], // 54: vc_eth_tunneler_input_scratch_buffers[6] + tunneler_buffers_left.at("input")[7], // 55: vc_eth_tunneler_input_scratch_buffers[7] + 0, 0, // 56 - 57: vc_eth_tunneler_input_scratch_buffers[8 - 9] + + // from left vc packet router mux + output_scratch_buffers_left.at("mux")[0], // 58: vc_eth_tunneler_input_remote_scratch_buffers[0] + output_scratch_buffers_left.at("mux")[1], // 59: vc_eth_tunneler_input_remote_scratch_buffers[1] + output_scratch_buffers_left.at("mux")[2], // 60: vc_eth_tunneler_input_remote_scratch_buffers[2] + output_scratch_buffers_left.at("mux")[3], // 61: vc_eth_tunneler_input_remote_scratch_buffers[3] + // from right tunneler + tunneler_buffers_right.at("output")[4], // 62: vc_eth_tunneler_input_remote_scratch_buffers[4] + tunneler_buffers_right.at("output")[5], // 63: vc_eth_tunneler_input_remote_scratch_buffers[5] + tunneler_buffers_right.at("output")[6], // 64: vc_eth_tunneler_input_remote_scratch_buffers[6] + tunneler_buffers_right.at("output")[7], // 65: vc_eth_tunneler_input_remote_scratch_buffers[7] + 0, 0, // 66 - 
67: vc_eth_tunneler_input_remote_scratch_buffers[8 - 9] + + tunneler_buffers_left.at("output")[0], // 68: vc_eth_tunneler_output_scratch_buffers[0] + tunneler_buffers_left.at("output")[1], // 69: vc_eth_tunneler_output_scratch_buffers[1] + tunneler_buffers_left.at("output")[2], // 70: vc_eth_tunneler_output_scratch_buffers[2] + tunneler_buffers_left.at("output")[3], // 71: vc_eth_tunneler_output_scratch_buffers[3] + tunneler_buffers_left.at("output")[4], // 72: vc_eth_tunneler_output_scratch_buffers[4] + tunneler_buffers_left.at("output")[5], // 73: vc_eth_tunneler_output_scratch_buffers[5] + tunneler_buffers_left.at("output")[6], // 74: vc_eth_tunneler_output_scratch_buffers[6] + tunneler_buffers_left.at("output")[7], // 75: vc_eth_tunneler_output_scratch_buffers[7] + 0, 0, // 76 - 77: vc_eth_tunneler_output_scratch_buffers[8 - 9] + + // to right tunneler + tunneler_buffers_right.at("input")[0], // 78: vc_eth_tunneler_output_remote_scratch_buffers[0] + tunneler_buffers_right.at("input")[1], // 79: vc_eth_tunneler_output_remote_scratch_buffers[1] + tunneler_buffers_right.at("input")[2], // 80: vc_eth_tunneler_output_remote_scratch_buffers[2] + tunneler_buffers_right.at("input")[3], // 81: vc_eth_tunneler_output_remote_scratch_buffers[3] + // to left vc packet router demux + input_scratch_buffers_left.at("demux")[0], // 82: vc_eth_tunneler_output_remote_scratch_buffers[4] + input_scratch_buffers_left.at("demux")[1], // 83: vc_eth_tunneler_output_remote_scratch_buffers[5] + input_scratch_buffers_left.at("demux")[2], // 84: vc_eth_tunneler_output_remote_scratch_buffers[6] + input_scratch_buffers_left.at("demux")[3], // 85: vc_eth_tunneler_output_remote_scratch_buffers[7] + 0, 0, // 86 - 87: vc_eth_tunneler_output_remote_scratch_buffers[8 - 9] }; auto tunneler_l_kernel = tt_metal::CreateKernel( @@ -526,8 +634,7 @@ int main(int argc, char **argv) { tunneler_phys_core.y, 7, (uint32_t)DispatchRemoteNetworkType::ETH), // 11: remote_receiver_7_info - 0, 0, // 12 - 13:
remote_receiver 8 - 9 - + 0, 0, // 12 - 13: remote_receiver_info[8 - 9] (mux_queue_start_addr >> 4), // 14: remote_receiver_queue_start_addr_words 0 (mux_queue_size_bytes >> 4), // 15: remote_receiver_queue_size_words 0 ((mux_queue_start_addr + mux_queue_size_bytes) >> 4), // 16: remote_receiver_queue_start_addr_words 1 @@ -544,11 +651,9 @@ int main(int argc, char **argv) { (tunneler_queue_size_bytes >> 4), // 27: remote_receiver_queue_size_words 6 ((tunneler_queue_start_addr + 7 * tunneler_queue_size_bytes) >> 4), // 28: remote_receiver_queue_start_addr_words 7 (tunneler_queue_size_bytes >> 4), // 29: remote_receiver_queue_size_words 7 - 0, 2, // 30 - 31 Settings for remote reciver 8 + 0, 2, // 30 - 31 Settings for remote receiver 8 0, // 32: remote_receiver_queue_start_addr_words 9 - 2, // 33: remote_receiver_queue_size_words 9. - // Unused. Setting to 2 to get around size check assertion that does not allow 0. - + 2, // 33: remote_receiver_queue_size_words 9. Unused. Setting to 2 to get around size check assertion that does not allow 0.
packet_switch_4B_pack(tunneler_phys_core.x, tunneler_phys_core.y, 8, @@ -581,12 +686,53 @@ int main(int argc, char **argv) { loopback_mux_phys_core.y, 7, (uint32_t)DispatchRemoteNetworkType::NOC0), // 41: remote_sender_7_info - 0, 0, // 42 - 43: remote_sender 8 - 9 - + 0, 0, // 42 - 43: remote_sender_info[8 - 9] tunneler_test_results_addr, // 44: test_results_addr tunneler_test_results_size, // 45: test_results_size timeout_mcycles * 1000 * 1000 * 4, // 46: timeout_cycles - 0, //47: inner_stop_mux_d_bypass + 0, // 47: inner_stop_mux_d_bypass + tunneler_buffers_right.at("input")[0], // 48: vc_eth_tunneler_input_scratch_buffers[0] + tunneler_buffers_right.at("input")[1], // 49: vc_eth_tunneler_input_scratch_buffers[1] + tunneler_buffers_right.at("input")[2], // 50: vc_eth_tunneler_input_scratch_buffers[2] + tunneler_buffers_right.at("input")[3], // 51: vc_eth_tunneler_input_scratch_buffers[3] + tunneler_buffers_right.at("input")[4], // 52: vc_eth_tunneler_input_scratch_buffers[4] + tunneler_buffers_right.at("input")[5], // 53: vc_eth_tunneler_input_scratch_buffers[5] + tunneler_buffers_right.at("input")[6], // 54: vc_eth_tunneler_input_scratch_buffers[6] + tunneler_buffers_right.at("input")[7], // 55: vc_eth_tunneler_input_scratch_buffers[7] + 0, 0, // 56 - 57: vc_eth_tunneler_input_scratch_buffers[8 - 9] + // from left tunneler + tunneler_buffers_left.at("output")[0], // 58: vc_eth_tunneler_input_remote_scratch_buffers[0] + tunneler_buffers_left.at("output")[1], // 59: vc_eth_tunneler_input_remote_scratch_buffers[1] + tunneler_buffers_left.at("output")[2], // 60: vc_eth_tunneler_input_remote_scratch_buffers[2] + tunneler_buffers_left.at("output")[3], // 61: vc_eth_tunneler_input_remote_scratch_buffers[3] + // from loopback mux on the right + output_scratch_buffers_right.at("loopback_mux")[0], // 62: vc_eth_tunneler_input_remote_scratch_buffers[4] + output_scratch_buffers_right.at("loopback_mux")[1], // 63: vc_eth_tunneler_input_remote_scratch_buffers[5] + 
output_scratch_buffers_right.at("loopback_mux")[2], // 64: vc_eth_tunneler_input_remote_scratch_buffers[6] + output_scratch_buffers_right.at("loopback_mux")[3], // 65: vc_eth_tunneler_input_remote_scratch_buffers[7] + 0, 0, // 66 - 67: vc_eth_tunneler_input_remote_scratch_buffers[8 - 9] + + tunneler_buffers_right.at("output")[0], // 68: vc_eth_tunneler_output_scratch_buffers[0] + tunneler_buffers_right.at("output")[1], // 69: vc_eth_tunneler_output_scratch_buffers[1] + tunneler_buffers_right.at("output")[2], // 70: vc_eth_tunneler_output_scratch_buffers[2] + tunneler_buffers_right.at("output")[3], // 71: vc_eth_tunneler_output_scratch_buffers[3] + tunneler_buffers_right.at("output")[4], // 72: vc_eth_tunneler_output_scratch_buffers[4] + tunneler_buffers_right.at("output")[5], // 73: vc_eth_tunneler_output_scratch_buffers[5] + tunneler_buffers_right.at("output")[6], // 74: vc_eth_tunneler_output_scratch_buffers[6] + tunneler_buffers_right.at("output")[7], // 75: vc_eth_tunneler_output_scratch_buffers[7] + 0, 0, // 76 - 77: vc_eth_tunneler_output_scratch_buffers[8 - 9] + + // to loopback mux on the right + input_scratch_buffers_right.at("loopback_mux")[0], // 78: vc_eth_tunneler_output_remote_scratch_buffers[0] + input_scratch_buffers_right.at("loopback_mux")[1], // 79: vc_eth_tunneler_output_remote_scratch_buffers[1] + input_scratch_buffers_right.at("loopback_mux")[2], // 80: vc_eth_tunneler_output_remote_scratch_buffers[2] + input_scratch_buffers_right.at("loopback_mux")[3], // 81: vc_eth_tunneler_output_remote_scratch_buffers[3] + // to left tunneler + tunneler_buffers_left.at("input")[4], // 82: vc_eth_tunneler_output_remote_scratch_buffers[4] + tunneler_buffers_left.at("input")[5], // 83: vc_eth_tunneler_output_remote_scratch_buffers[5] + tunneler_buffers_left.at("input")[6], // 84: vc_eth_tunneler_output_remote_scratch_buffers[6] + tunneler_buffers_left.at("input")[7], // 85: vc_eth_tunneler_output_remote_scratch_buffers[7] + 0, 0, // 86 - 87: 
vc_eth_tunneler_output_remote_scratch_buffers[8 - 9] }; auto tunneler_r_kernel = tt_metal::CreateKernel( @@ -651,7 +797,29 @@ int main(int argc, char **argv) { test_results_addr, // 22: test_results_addr test_results_size, // 23: test_results_size timeout_mcycles * 1000 * 1000 * 4, // 24: timeout_cycles - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 25-35: packetize/depacketize settings + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 25 - 35: packetize/depacketize settings + + input_scratch_buffers_right.at("loopback_mux")[0], // 36: vc_packet_router_input_scratch_buffers[0] + input_scratch_buffers_right.at("loopback_mux")[1], // 37: vc_packet_router_input_scratch_buffers[1] + input_scratch_buffers_right.at("loopback_mux")[2], // 38: vc_packet_router_input_scratch_buffers[2] + input_scratch_buffers_right.at("loopback_mux")[3], // 39: vc_packet_router_input_scratch_buffers[3] + + // from right tunneler + tunneler_buffers_right.at("output")[0], // 40: vc_packet_router_input_remote_scratch_buffers[0] + tunneler_buffers_right.at("output")[1], // 41: vc_packet_router_input_remote_scratch_buffers[1] + tunneler_buffers_right.at("output")[2], // 42: vc_packet_router_input_remote_scratch_buffers[2] + tunneler_buffers_right.at("output")[3], // 43: vc_packet_router_input_remote_scratch_buffers[3] + + output_scratch_buffers_right.at("loopback_mux")[0], // 44: vc_packet_router_output_scratch_buffers[0] + output_scratch_buffers_right.at("loopback_mux")[1], // 45: vc_packet_router_output_scratch_buffers[1] + output_scratch_buffers_right.at("loopback_mux")[2], // 46: vc_packet_router_output_scratch_buffers[2] + output_scratch_buffers_right.at("loopback_mux")[3], // 47: vc_packet_router_output_scratch_buffers[3] + + // to right tunneler + tunneler_buffers_right.at("input")[4], // 48: vc_packet_router_output_remote_scratch_buffers[0] + tunneler_buffers_right.at("input")[5], // 49: vc_packet_router_output_remote_scratch_buffers[1] + tunneler_buffers_right.at("input")[6], // 50: 
vc_packet_router_output_remote_scratch_buffers[2] + tunneler_buffers_right.at("input")[7], // 51: vc_packet_router_output_remote_scratch_buffers[3] }; @@ -668,8 +836,6 @@ int main(int argc, char **argv) { } ); - - // Demux uint32_t dest_map_array[4] = {0, 1, 2, 3}; uint64_t dest_endpoint_output_map = packet_switch_dest_pack(dest_map_array, 4); @@ -703,11 +869,6 @@ int main(int argc, char **argv) { (rx_queue_size_bytes >> 4), // 13: remote_tx_queue_size_words 2 (rx_queue_start_addr >> 4), // 14: remote_tx_queue_start_addr_words 3 (rx_queue_size_bytes >> 4), // 15: remote_tx_queue_size_words 3 - //(uint32_t)tunneler_phys_core.x, // 16: remote_rx_x - //(uint32_t)tunneler_phys_core.y, // 17: remote_rx_y - //3, // 18: remote_rx_queue_id - //(uint32_t)DispatchRemoteNetworkType::NOC0, // 19: tx_network_type - packet_switch_4B_pack(tunneler_phys_core.x, tunneler_phys_core.y, 12, @@ -730,7 +891,29 @@ int main(int argc, char **argv) { test_results_addr, // 22: test_results_addr test_results_size, // 23: test_results_size timeout_mcycles * 1000 * 1000 * 4, // 24: timeout_cycles - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 25-35: packetize/depacketize settings + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 25 - 35: packetize/depacketize settings + + input_scratch_buffers_left.at("demux")[0], // 36: vc_packet_router_input_scratch_buffers[0] + input_scratch_buffers_left.at("demux")[1], // 37: vc_packet_router_input_scratch_buffers[1] + input_scratch_buffers_left.at("demux")[2], // 38: vc_packet_router_input_scratch_buffers[2] + input_scratch_buffers_left.at("demux")[3], // 39: vc_packet_router_input_scratch_buffers[3] + + // from left tunneler + tunneler_buffers_left.at("output")[4], // 40: vc_packet_router_input_remote_scratch_buffers[0] + tunneler_buffers_left.at("output")[5], // 41: vc_packet_router_input_remote_scratch_buffers[1] + tunneler_buffers_left.at("output")[6], // 42: vc_packet_router_input_remote_scratch_buffers[2] + tunneler_buffers_left.at("output")[7], // 43: 
vc_packet_router_input_remote_scratch_buffers[3] + + output_scratch_buffers_left.at("demux")[0], // 44: vc_packet_router_output_scratch_buffers[0] + output_scratch_buffers_left.at("demux")[1], // 45: vc_packet_router_output_scratch_buffers[1] + output_scratch_buffers_left.at("demux")[2], // 46: vc_packet_router_output_scratch_buffers[2] + output_scratch_buffers_left.at("demux")[3], // 47: vc_packet_router_output_scratch_buffers[3] + + // to rx + input_scratch_buffers_left.at("traffic_gen_rx")[0], // 48: vc_packet_router_output_remote_scratch_buffers[0] + input_scratch_buffers_left.at("traffic_gen_rx")[1], // 49: vc_packet_router_output_remote_scratch_buffers[1] + input_scratch_buffers_left.at("traffic_gen_rx")[2], // 50: vc_packet_router_output_remote_scratch_buffers[2] + input_scratch_buffers_left.at("traffic_gen_rx")[3], // 51: vc_packet_router_output_remote_scratch_buffers[3] }; log_info(LogTest, "run demux at x={},y={}", demux_core.x, demux_core.y); @@ -761,8 +944,8 @@ int main(int argc, char **argv) { std::chrono::duration elapsed_seconds = (end-start); log_info(LogTest, "Ran in {:.2f}us", elapsed_seconds.count() * 1000 * 1000); - vector> tx_results; - vector> rx_results; + std::vector> tx_results; + std::vector> rx_results; for (uint32_t i = 0; i < num_src_endpoints; i++) { tx_results.push_back( @@ -780,19 +963,19 @@ int main(int argc, char **argv) { pass &= (rx_results[i][PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); } - vector mux_results = + std::vector mux_results = tt::llrt::read_hex_vec_from_core( device->id(), mux_phys_core, test_results_addr, test_results_size); log_info(LogTest, "MUX status = {}", packet_queue_test_status_to_string(mux_results[PQ_TEST_STATUS_INDEX])); pass &= (mux_results[PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - vector loopback_mux_results = + std::vector loopback_mux_results = tt::llrt::read_hex_vec_from_core( device_r->id(), loopback_mux_phys_core, test_results_addr, test_results_size); log_info(LogTest, "LOOPBACK 
MUX status = {}", packet_queue_test_status_to_string(loopback_mux_results[PQ_TEST_STATUS_INDEX])); pass &= (loopback_mux_results[PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - vector demux_results = + std::vector demux_results = tt::llrt::read_hex_vec_from_core( device->id(), demux_phys_core, test_results_addr, test_results_size); log_info(LogTest, "DEMUX status = {}", packet_queue_test_status_to_string(demux_results[PQ_TEST_STATUS_INDEX])); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp index afed803037c3..71fe41e70e94 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp @@ -5,18 +5,14 @@ #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/llrt/rtoptions.hpp" -#include "tt_metal/impl/dispatch/cq_commands.hpp" #include "tt_metal/impl/device/device.hpp" -#include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" -#include "kernels/traffic_gen_test.hpp" #include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp" -using std::vector; using namespace tt; using json = nlohmann::json; -int main(int argc, char **argv) { +int main(int argc, char **argv) { constexpr uint32_t default_tx_x = 0; constexpr uint32_t default_tx_y = 0; constexpr uint32_t default_rx_x = 0; @@ -31,6 +27,9 @@ int main(int argc, char **argv) { constexpr uint32_t default_data_kb_per_tx = 1024*1024; constexpr uint32_t default_max_packet_size_words = 0x100; + constexpr uint32_t default_input_scratch_buffer_base_addr = 0x60000; + constexpr uint32_t default_output_scratch_buffer_base_addr = 0x70000; + constexpr uint32_t default_tx_queue_start_addr = 0x80000; constexpr uint32_t default_tx_queue_size_bytes = 0x10000; constexpr uint32_t default_rx_queue_start_addr = 0xa0000; @@ -98,6 +97,8 @@ int main(int argc, char 
**argv) { log_info(LogTest, " --tx_data_sent_per_iter_high: the criteria to determine the amount of tx data sent per iter is high (unit: words); if both 0, then disable counting it in tx kernel, default = {}", default_tx_data_sent_per_iter_high); log_info(LogTest, " --dump_stat_json: Dump stats in json to output_dir, default = {}", default_dump_stat_json); log_info(LogTest, " --output_dir: Output directory, default = {}", default_output_dir); + log_info(LogTest, " --input_scratch_buffer_base_addr: Scratch buffer for input queues base address, default = {:#x}", default_input_scratch_buffer_base_addr); + log_info(LogTest, " --output_scratch_buffer_base_addr: Scratch buffer for output queues base address, default = {:#x}", default_output_scratch_buffer_base_addr); return 0; } @@ -133,12 +134,29 @@ int main(int argc, char **argv) { uint8_t tx_pkt_dest_size_choice = (uint8_t) test_args::get_command_option_uint32(input_args, "--tx_pkt_dest_size_choice", default_tx_pkt_dest_size_choice); uint32_t tx_data_sent_per_iter_low = test_args::get_command_option_uint32(input_args, "--tx_data_sent_per_iter_low", default_tx_data_sent_per_iter_low); uint32_t tx_data_sent_per_iter_high = test_args::get_command_option_uint32(input_args, "--tx_data_sent_per_iter_high", default_tx_data_sent_per_iter_high); + uint32_t input_scratch_buffer_base_addr = test_args::get_command_option_uint32(input_args, "--input_scratch_buffer_base_addr", default_input_scratch_buffer_base_addr); + uint32_t output_scratch_buffer_base_addr = test_args::get_command_option_uint32(input_args, "--output_scratch_buffer_base_addr", default_output_scratch_buffer_base_addr); assert((pkt_dest_size_choices_t)tx_pkt_dest_size_choice == pkt_dest_size_choices_t::SAME_START_RNDROBIN_FIX_SIZE && rx_disable_header_check || (pkt_dest_size_choices_t)tx_pkt_dest_size_choice == pkt_dest_size_choices_t::RANDOM); uint32_t num_src_endpoints = num_endpoints; uint32_t num_dest_endpoints = num_endpoints; + const auto 
input_scratch_buffers = make_buffer_addresses_for_test(input_scratch_buffer_base_addr, packet_queue_ptr_buffer_size, { + { "traffic_gen_tx", num_src_endpoints }, + { "vc_packet_router_l", MAX_SWITCH_FAN_OUT }, + { "vc_packet_router_r", MAX_SWITCH_FAN_OUT }, + { "traffic_gen_rx", num_dest_endpoints }, + + }); + + const auto output_scratch_buffers = make_buffer_addresses_for_test(output_scratch_buffer_base_addr, packet_queue_ptr_buffer_size, { + { "traffic_gen_tx_mock", num_src_endpoints }, + { "traffic_gen_tx", num_src_endpoints }, + { "vc_packet_router_l", MAX_SWITCH_FAN_OUT}, + { "vc_packet_router_r", MAX_SWITCH_FAN_OUT }, + }); + bool pass = true; std::map defines = { @@ -188,10 +206,14 @@ int main(int argc, char **argv) { tx_skip_pkt_content_gen, // 18: skip_pkt_content_gen tx_pkt_dest_size_choice, // 19: pkt_dest_size_choice tx_data_sent_per_iter_low, // 20: data_sent_per_iter_low - tx_data_sent_per_iter_high // 21: data_sent_per_iter_high + tx_data_sent_per_iter_high, // 21: data_sent_per_iter_high + input_scratch_buffers.at("traffic_gen_tx")[i], // 22: traffic_gen_input_ptrs_addr + output_scratch_buffers.at("traffic_gen_tx_mock")[i], // 23: traffic_gen_input_mock_remote_ptrs_addr + output_scratch_buffers.at("traffic_gen_tx")[i], // 24: traffic_gen_output_ptrs_addr + input_scratch_buffers.at("vc_packet_router_l")[i], // 25: traffic_gen_output_remote_ptrs_addr }; - log_info(LogTest, "run traffic_gen_tx at x={},y={}", core.x, core.y); + log_info(LogTest, "run traffic_gen_tx at x={},y={}", tx_phys_core.back().x, tx_phys_core.back().y); auto kernel = tt_metal::CreateKernel( program, "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_tx.cpp", @@ -229,10 +251,12 @@ int main(int argc, char **argv) { src_endpoint_start_id + i, // 15: src_endpoint_start_id dest_endpoint_start_id + i, // 16: dest_endpoint_start_id timeout_mcycles * 1000 * 1000, // 17: timeout_cycles - rx_disable_header_check // 18: disable_header_check + rx_disable_header_check, // 
18: disable_header_check + input_scratch_buffers.at("traffic_gen_rx")[i], // 19: traffic_gen_input_ptrs_addr + output_scratch_buffers.at("vc_packet_router_r")[i], // 20: traffic_gen_input_remote_ptrs_addr }; - log_info(LogTest, "run traffic_gen_rx at x={},y={}", core.x, core.y); + log_info(LogTest, "run traffic_gen_rx at x={},y={}", rx_phys_core.back().x, rx_phys_core.back().y); auto kernel = tt_metal::CreateKernel( program, "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_rx.cpp", @@ -302,10 +326,30 @@ int main(int argc, char **argv) { test_results_addr, // 22: test_results_addr test_results_size, // 23: test_results_size timeout_mcycles * 1000 * 1000, // 24: timeout_cycles, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 25-35: packetize/depacketize settings + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 25-35: packetize/depacketize settings + + input_scratch_buffers.at("vc_packet_router_l")[0], // 36: vc_packet_router_input_scratch_buffers[0] + input_scratch_buffers.at("vc_packet_router_l")[1], // 37: vc_packet_router_input_scratch_buffers[1] + input_scratch_buffers.at("vc_packet_router_l")[2], // 38: vc_packet_router_input_scratch_buffers[2] + input_scratch_buffers.at("vc_packet_router_l")[3], // 39: vc_packet_router_input_scratch_buffers[3] + + output_scratch_buffers.at("traffic_gen_tx")[0], // 40: vc_packet_router_input_remote_scratch_buffers[0] + output_scratch_buffers.at("traffic_gen_tx")[1], // 41: vc_packet_router_input_remote_scratch_buffers[1] + output_scratch_buffers.at("traffic_gen_tx")[2], // 42: vc_packet_router_input_remote_scratch_buffers[2] + output_scratch_buffers.at("traffic_gen_tx")[3], // 43: vc_packet_router_input_remote_scratch_buffers[3] + + output_scratch_buffers.at("vc_packet_router_l")[0], // 44: vc_packet_router_output_scratch_buffers[0] + output_scratch_buffers.at("vc_packet_router_l")[1], // 45: vc_packet_router_output_scratch_buffers[1] + output_scratch_buffers.at("vc_packet_router_l")[2], // 46: 
vc_packet_router_output_scratch_buffers[2] + output_scratch_buffers.at("vc_packet_router_l")[3], // 47: vc_packet_router_output_scratch_buffers[3] + + input_scratch_buffers.at("vc_packet_router_r")[0], // 48: vc_packet_router_output_remote_scratch_buffers[0] + input_scratch_buffers.at("vc_packet_router_r")[1], // 49: vc_packet_router_output_remote_scratch_buffers[1] + input_scratch_buffers.at("vc_packet_router_r")[2], // 50: vc_packet_router_output_remote_scratch_buffers[2] + input_scratch_buffers.at("vc_packet_router_r")[3], // 51: vc_packet_router_output_remote_scratch_buffers[3] }; - log_info(LogTest, "run mux at x={},y={}", mux_core.x, mux_core.y); + log_info(LogTest, "run mux at x={},y={}", mux_phys_core.x, mux_phys_core.y); auto mux_kernel = tt_metal::CreateKernel( program, "tt_metal/impl/dispatch/kernels/vc_packet_router.cpp", @@ -350,10 +394,6 @@ int main(int argc, char **argv) { (rx_queue_size_bytes >> 4), // 13: remote_tx_queue_size_words 2 (rx_queue_start_addr >> 4), // 14: remote_tx_queue_start_addr_words 3 (rx_queue_size_bytes >> 4), // 15: remote_tx_queue_size_words 3 - //(uint32_t)mux_phys_core.x, // 16: remote_rx_x - //(uint32_t)mux_phys_core.y, // 17: remote_rx_y - //num_dest_endpoints, // 18: remote_rx_queue_id - //(uint32_t)DispatchRemoteNetworkType::NOC0, // 19: tx_network_type packet_switch_4B_pack(mux_phys_core.x, mux_phys_core.y, num_dest_endpoints, @@ -375,11 +415,31 @@ int main(int argc, char **argv) { test_results_addr, // 22: test_results_addr test_results_size, // 23: test_results_size timeout_mcycles * 1000 * 1000, // 24: timeout_cycles - 0, 0, 0, 0, 0, // 25-29: depacketize settings - 0, 0, 0, 0, 0, 0// 30-35: packetize settings + 0, 0, 0, 0, 0, // 25 - 29: depacketize settings + 0, 0, 0, 0, 0, 0, // 30 - 35: packetize settings + + input_scratch_buffers.at("vc_packet_router_r")[0], // 36: vc_packet_router_input_scratch_buffers[0] + input_scratch_buffers.at("vc_packet_router_r")[1], // 37: vc_packet_router_input_scratch_buffers[1] + 
input_scratch_buffers.at("vc_packet_router_r")[2], // 38: vc_packet_router_input_scratch_buffers[2] + input_scratch_buffers.at("vc_packet_router_r")[3], // 39: vc_packet_router_input_scratch_buffers[3] + + output_scratch_buffers.at("vc_packet_router_l")[0], // 40: vc_packet_router_input_remote_scratch_buffers[0] + output_scratch_buffers.at("vc_packet_router_l")[1], // 41: vc_packet_router_input_remote_scratch_buffers[1] + output_scratch_buffers.at("vc_packet_router_l")[2], // 42: vc_packet_router_input_remote_scratch_buffers[2] + output_scratch_buffers.at("vc_packet_router_l")[3], // 43: vc_packet_router_input_remote_scratch_buffers[3] + + output_scratch_buffers.at("vc_packet_router_r")[0], // 44: vc_packet_router_output_scratch_buffers[0] + output_scratch_buffers.at("vc_packet_router_r")[1], // 45: vc_packet_router_output_scratch_buffers[1] + output_scratch_buffers.at("vc_packet_router_r")[2], // 46: vc_packet_router_output_scratch_buffers[2] + output_scratch_buffers.at("vc_packet_router_r")[3], // 47: vc_packet_router_output_scratch_buffers[3] + + input_scratch_buffers.at("traffic_gen_rx")[0], // 48: vc_packet_router_output_remote_scratch_buffers[0] + input_scratch_buffers.at("traffic_gen_rx")[1], // 49: vc_packet_router_output_remote_scratch_buffers[1] + input_scratch_buffers.at("traffic_gen_rx")[2], // 50: vc_packet_router_output_remote_scratch_buffers[2] + input_scratch_buffers.at("traffic_gen_rx")[3], // 51: vc_packet_router_output_remote_scratch_buffers[3] }; - log_info(LogTest, "run demux at x={},y={}", demux_core.x, demux_core.y); + log_info(LogTest, "run demux at x={},y={}", demux_phys_core.x, demux_phys_core.y); auto demux_kernel = tt_metal::CreateKernel( program, "tt_metal/impl/dispatch/kernels/vc_packet_router.cpp", @@ -401,8 +461,8 @@ int main(int argc, char **argv) { std::chrono::duration elapsed_seconds = (end-start); log_info(LogTest, "Ran in {:.2f}us", elapsed_seconds.count() * 1000 * 1000); - vector> tx_results; - vector> rx_results; + 
std::vector> tx_results; + std::vector> rx_results; for (uint32_t i = 0; i < num_src_endpoints; i++) { tx_results.push_back( @@ -420,13 +480,13 @@ int main(int argc, char **argv) { pass &= (rx_results[i][PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); } - vector mux_results = + std::vector mux_results = tt::llrt::read_hex_vec_from_core( device->id(), mux_phys_core, test_results_addr, test_results_size); log_info(LogTest, "MUX status = {}", packet_queue_test_status_to_string(mux_results[PQ_TEST_STATUS_INDEX])); pass &= (mux_results[PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - vector demux_results = + std::vector demux_results = tt::llrt::read_hex_vec_from_core( device->id(), demux_phys_core, test_results_addr, test_results_size); log_info(LogTest, "DEMUX status = {}", packet_queue_test_status_to_string(demux_results[PQ_TEST_STATUS_INDEX])); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp index 302de15eb4c8..c8d8083b1462 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp @@ -6,18 +6,13 @@ #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/impl/device/device.hpp" #include "tt_metal/llrt/rtoptions.hpp" -#include "tt_metal/impl/dispatch/cq_commands.hpp" -#include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" -#include "kernels/traffic_gen_test.hpp" #include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp" -using std::vector; using namespace tt; using json = nlohmann::json; int main(int argc, char **argv) { - constexpr uint32_t default_tx_x = 0; constexpr uint32_t default_tx_y = 0; constexpr uint32_t default_rx_x = 0; @@ -32,6 +27,9 @@ int main(int argc, char **argv) { constexpr uint32_t default_data_kb_per_tx = 1024*1024; constexpr uint32_t default_max_packet_size_words = 0x100; + constexpr 
uint32_t default_input_scratch_buffer_base_addr = 0x50000; + constexpr uint32_t default_output_scratch_buffer_base_addr = 0x60000; + constexpr uint32_t default_tx_queue_start_addr = 0x80000; constexpr uint32_t default_tx_queue_size_bytes = 0x10000; constexpr uint32_t default_rx_queue_start_addr = 0xa0000; @@ -46,8 +44,9 @@ int main(int argc, char **argv) { constexpr uint32_t default_tunneler_queue_start_addr = 0x19000; constexpr uint32_t default_tunneler_queue_size_bytes = 0x8000; // maximum queue (power of 2) - constexpr uint32_t default_tunneler_test_results_addr = 0x39000; // 0x8000 * 4 + 0x19000; 0x10000 * 4 + 0x19000 = 0x59000 > 0x40000 (256kB) - constexpr uint32_t default_tunneler_test_results_size = 0x7000; // 256kB total L1 in ethernet core - 0x39000 + constexpr uint32_t default_tunneler_test_results_addr = 0x39000; + constexpr uint32_t default_tunneler_test_results_size = 0x1000; + constexpr uint32_t default_tunneler_buffer_base_addr = 0x3A000; constexpr uint32_t default_timeout_mcycles = 1000; constexpr uint32_t default_rx_disable_data_check = 0; @@ -108,6 +107,9 @@ int main(int argc, char **argv) { log_info(LogTest, " --dump_stat_json: Dump stats in json to output_dir, default = {}", default_dump_stat_json); log_info(LogTest, " --output_dir: Output directory, default = {}", default_output_dir); log_info(LogTest, " --device_id: Device on which the test will be run, default = {}", default_test_device_id); + log_info(LogTest, " --input_scratch_buffer_base_addr: Scratch buffer for input queues base address, default = {:#x}", default_input_scratch_buffer_base_addr); + log_info(LogTest, " --output_scratch_buffer_base_addr: Scratch buffer for output queues base address, default = {:#x}", default_output_scratch_buffer_base_addr); + log_info(LogTest, " --tunneler_scratch_buffer_base_addr: Scratch buffer for tunneler queues base address, default = {:#x}", default_tunneler_buffer_base_addr); return 0; } @@ -146,6 +148,9 @@ int main(int argc, char **argv) { uint8_t 
tx_pkt_dest_size_choice = (uint8_t) test_args::get_command_option_uint32(input_args, "--tx_pkt_dest_size_choice", default_tx_pkt_dest_size_choice); uint32_t tx_data_sent_per_iter_low = test_args::get_command_option_uint32(input_args, "--tx_data_sent_per_iter_low", default_tx_data_sent_per_iter_low); uint32_t tx_data_sent_per_iter_high = test_args::get_command_option_uint32(input_args, "--tx_data_sent_per_iter_high", default_tx_data_sent_per_iter_high); + uint32_t input_scratch_buffer_base_addr = test_args::get_command_option_uint32(input_args, "--input_scratch_buffer_base_addr", default_input_scratch_buffer_base_addr); + uint32_t output_scratch_buffer_base_addr = test_args::get_command_option_uint32(input_args, "--output_scratch_buffer_base_addr", default_output_scratch_buffer_base_addr); + uint32_t tunneler_buffer_base_addr = test_args::get_command_option_uint32(input_args, "--tunneler_buffer_base_addr", default_tunneler_buffer_base_addr); assert((pkt_dest_size_choices_t)tx_pkt_dest_size_choice == pkt_dest_size_choices_t::SAME_START_RNDROBIN_FIX_SIZE && rx_disable_header_check || (pkt_dest_size_choices_t)tx_pkt_dest_size_choice == pkt_dest_size_choices_t::RANDOM); @@ -157,6 +162,38 @@ int main(int argc, char **argv) { {"FD_CORE_TYPE", std::to_string(0)}, // todo, support dispatch on eth }; + // ----- Left Chip ----- + const auto input_scratch_buffers_left = make_buffer_addresses_for_test(input_scratch_buffer_base_addr, packet_queue_ptr_buffer_size, { + { "traffic_gen_tx", num_src_endpoints }, + { "vc_packet_router", MAX_SWITCH_FAN_OUT }, + }); + + const auto tunneler_buffers_left = make_buffer_addresses_for_test(tunneler_buffer_base_addr, packet_queue_ptr_buffer_size, { + { "input", num_src_endpoints }, + { "output", num_src_endpoints }, + }); + + const auto output_scratch_buffers_left = make_buffer_addresses_for_test(output_scratch_buffer_base_addr, packet_queue_ptr_buffer_size, { + { "traffic_gen_tx", num_src_endpoints }, + { "traffic_gen_tx_mock", 
num_src_endpoints }, + { "vc_packet_router", MAX_SWITCH_FAN_OUT }, + }); + + // ----- Right Chip ----- + const auto input_scratch_buffers_right = make_buffer_addresses_for_test(input_scratch_buffer_base_addr, packet_queue_ptr_buffer_size, { + { "vc_packet_router", MAX_SWITCH_FAN_OUT }, + { "traffic_gen_rx", num_dest_endpoints }, + }); + + const auto tunneler_buffers_right = make_buffer_addresses_for_test(tunneler_buffer_base_addr, packet_queue_ptr_buffer_size, { + { "input", num_dest_endpoints }, + { "output", num_dest_endpoints }, + }); + + const auto output_scratch_buffers_right = make_buffer_addresses_for_test(output_scratch_buffer_base_addr, packet_queue_ptr_buffer_size, { + { "vc_packet_router", MAX_SWITCH_FAN_OUT }, + }); + try { int num_devices = tt_metal::GetNumAvailableDevices(); if (test_device_id >= num_devices) { @@ -189,10 +226,8 @@ int main(int argc, char **argv) { CoreCoord r_tunneler_logical_core = device_r->get_ethernet_sockets(device_id_l)[0]; CoreCoord r_tunneler_phys_core = device_r->ethernet_core_from_logical_core(r_tunneler_logical_core); - - - std::cout<<"Left Tunneler = "<> 4), // 14: remote_receiver_queue_start_addr_words 0 (tunneler_queue_size_bytes >> 4), // 15: remote_receiver_queue_size_words 0 ((tunneler_queue_start_addr + tunneler_queue_size_bytes) >> 4), // 16: remote_receiver_queue_start_addr_words 1 @@ -394,10 +455,9 @@ int main(int argc, char **argv) { (tunneler_queue_size_bytes >> 4), // 19: remote_receiver_queue_size_words 2 ((tunneler_queue_start_addr + 3 * tunneler_queue_size_bytes) >> 4), // 20: remote_receiver_queue_start_addr_words 3 (tunneler_queue_size_bytes >> 4), // 21: remote_receiver_queue_size_words 3 - 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, // 22 - 31 Settings for remote reciver 4 - 8 + 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, // 22 - 31 Settings for remote receiver[4 - 9] 0, // 32: remote_receiver_queue_start_addr_words 9 - 2, // 33: remote_receiver_queue_size_words 9. - // Unused. 
Setting to 2 to get around size check assertion that does not allow 0. + 2, // 33: remote_receiver_queue_size_words 9. Unused. Setting to 2 to get around size check assertion that does not allow 0. packet_switch_4B_pack(mux_phys_core.x, mux_phys_core.y, num_dest_endpoints, @@ -414,11 +474,34 @@ int main(int argc, char **argv) { mux_phys_core.y, num_dest_endpoints + 3, (uint32_t)DispatchRemoteNetworkType::NOC0), // 37: remote_sender_3_info - 0, 0, 0, 0, 0, 0, // 38 - 43: remote_sender 4 - 9 + 0, 0, 0, 0, 0, 0, // 38 - 43: remote_sender_info[4 - 9] tunneler_test_results_addr, // 44: test_results_addr tunneler_test_results_size, // 45: test_results_size - 0, // 46: timeout_cycles - 0, //47: inner_stop_mux_d_bypass + timeout_mcycles * 1000 * 1000 * 4, // 46: timeout_cycles + 0, // 47: inner_stop_mux_d_bypass + tunneler_buffers_left.at("input")[0], // 48: vc_eth_tunneler_input_scratch_buffers[0] + tunneler_buffers_left.at("input")[1], // 49: vc_eth_tunneler_input_scratch_buffers[1] + tunneler_buffers_left.at("input")[2], // 50: vc_eth_tunneler_input_scratch_buffers[2] + tunneler_buffers_left.at("input")[3], // 51: vc_eth_tunneler_input_scratch_buffers[3] + 0, 0, 0, 0, 0, 0, // 52 - 57: vc_eth_tunneler_input_scratch_buffers[4 - 9] + + output_scratch_buffers_left.at("vc_packet_router")[0], // 58: vc_eth_tunneler_input_remote_scratch_buffers[0] + output_scratch_buffers_left.at("vc_packet_router")[1], // 59: vc_eth_tunneler_input_remote_scratch_buffers[1] + output_scratch_buffers_left.at("vc_packet_router")[2], // 60: vc_eth_tunneler_input_remote_scratch_buffers[2] + output_scratch_buffers_left.at("vc_packet_router")[3], // 61: vc_eth_tunneler_input_remote_scratch_buffers[3] + 0, 0, 0, 0, 0, 0, // 62 - 67: vc_eth_tunneler_input_remote_scratch_buffers[4 - 9] + + tunneler_buffers_left.at("output")[0], // 68: vc_eth_tunneler_output_scratch_buffers[0] + tunneler_buffers_left.at("output")[1], // 69: vc_eth_tunneler_output_scratch_buffers[1] + 
tunneler_buffers_left.at("output")[2], // 70: vc_eth_tunneler_output_scratch_buffers[2] + tunneler_buffers_left.at("output")[3], // 71: vc_eth_tunneler_output_scratch_buffers[3] + 0, 0, 0, 0, 0, 0, // 72 - 77: vc_eth_tunneler_output_scratch_buffers[4 - 9] + + tunneler_buffers_right.at("input")[0], // 78: vc_eth_tunneler_output_remote_scratch_buffers[0] + tunneler_buffers_right.at("input")[1], // 79: vc_eth_tunneler_output_remote_scratch_buffers[1] + tunneler_buffers_right.at("input")[2], // 80: vc_eth_tunneler_output_remote_scratch_buffers[2] + tunneler_buffers_right.at("input")[3], // 81: vc_eth_tunneler_output_remote_scratch_buffers[3] + 0, 0, 0, 0, 0, 0, // 82 - 87: vc_eth_tunneler_output_remote_scratch_buffers[4 - 9] }; auto tunneler_l_kernel = tt_metal::CreateKernel( @@ -432,11 +515,10 @@ int main(int argc, char **argv) { } ); - std::vector tunneler_r_compile_args = { dest_endpoint_start_id, // 0: endpoint_id_start_index - 4, // 1: tunnel_lanes. 1 => Unidirectional. 2 => Bidirectional. + 4, // 1: tunnel_lanes. 1 => Unidirectional. 2 => Bidirectional. 
(tunneler_queue_start_addr >> 4), // 2: rx_queue_start_addr_words (tunneler_queue_size_bytes >> 4), // 3: rx_queue_size_words packet_switch_4B_pack(demux_phys_core.x, @@ -455,7 +537,7 @@ int main(int argc, char **argv) { demux_phys_core.y, 3, //num_dest_endpoints + 3, (uint32_t)DispatchRemoteNetworkType::NOC0), // 7: remote_receiver_3_info - 0, 0, 0, 0, 0, 0, // 8 - 13: remote_receiver 4 - 9 + 0, 0, 0, 0, 0, 0, // 8 - 13: remote_receiver_3_info[4 - 9] (demux_queue_start_addr >> 4), // 14: remote_receiver_queue_start_addr_words 0 (demux_queue_size_bytes >> 4), // 15: remote_receiver_queue_size_words 0 ((demux_queue_start_addr + demux_queue_size_bytes) >> 4), // 16: remote_receiver_queue_start_addr_words 1 @@ -464,10 +546,9 @@ int main(int argc, char **argv) { (demux_queue_size_bytes >> 4), // 19: remote_receiver_queue_size_words 2 ((demux_queue_start_addr + 3 * demux_queue_size_bytes) >> 4), // 20: remote_receiver_queue_start_addr_words 3 (demux_queue_size_bytes >> 4), // 21: remote_receiver_queue_size_words 3 - 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, // 22 - 31 Settings for remote reciver 4 - 8 + 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, // 22 - 31 Settings for remote receiver 4 - 8 0, // 32: remote_receiver_queue_start_addr_words 9 - 2, // 33: remote_receiver_queue_size_words 9 - // Unused. Setting to 2 to get around size check assertion that does not allow 0. + 2, // 33: remote_receiver_queue_size_words 9. Unused. Setting to 2 to get around size check assertion that does not allow 0. 
packet_switch_4B_pack(tunneler_phys_core.x, tunneler_phys_core.y, 4, @@ -484,11 +565,34 @@ int main(int argc, char **argv) { tunneler_phys_core.y, 7, (uint32_t)DispatchRemoteNetworkType::ETH), // 37: remote_sender_3_info - 0, 0, 0, 0, 0, 0, // 38 - 43: remote_sender 4 - 9 + 0, 0, 0, 0, 0, 0, // 38 - 43: remote_sender_3_info[4 - 9] tunneler_test_results_addr, // 44: test_results_addr tunneler_test_results_size, // 45: test_results_size - 0, // 46: timeout_cycles - 0, //47: inner_stop_mux_d_bypass + timeout_mcycles * 1000 * 1000 * 4, // 46: timeout_cycles + 0, // 47: inner_stop_mux_d_bypass + tunneler_buffers_right.at("input")[0], // 48: vc_eth_tunneler_input_scratch_buffers[0] + tunneler_buffers_right.at("input")[1], // 49: vc_eth_tunneler_input_scratch_buffers[1] + tunneler_buffers_right.at("input")[2], // 50: vc_eth_tunneler_input_scratch_buffers[2] + tunneler_buffers_right.at("input")[3], // 51: vc_eth_tunneler_input_scratch_buffers[3] + 0, 0, 0, 0, 0, 0, // 52 - 57: vc_eth_tunneler_input_scratch_buffers[4 - 9] + + tunneler_buffers_left.at("output")[0], // 58: vc_eth_tunneler_input_remote_scratch_buffers[0] + tunneler_buffers_left.at("output")[1], // 59: vc_eth_tunneler_input_remote_scratch_buffers[1] + tunneler_buffers_left.at("output")[2], // 60: vc_eth_tunneler_input_remote_scratch_buffers[2] + tunneler_buffers_left.at("output")[3], // 61: vc_eth_tunneler_input_remote_scratch_buffers[3] + 0, 0, 0, 0, 0, 0, // 62 - 67: vc_eth_tunneler_input_remote_scratch_buffers[4 - 9] + + tunneler_buffers_right.at("output")[0], // 68: vc_eth_tunneler_output_scratch_buffers[0] + tunneler_buffers_right.at("output")[1], // 69: vc_eth_tunneler_output_scratch_buffers[1] + tunneler_buffers_right.at("output")[2], // 70: vc_eth_tunneler_output_scratch_buffers[2] + tunneler_buffers_right.at("output")[3], // 71: vc_eth_tunneler_output_scratch_buffers[3] + 0, 0, 0, 0, 0, 0, // 72 - 77: vc_eth_tunneler_output_scratch_buffers[4 - 9] + + 
input_scratch_buffers_right.at("vc_packet_router")[0], // 78: vc_eth_tunneler_output_remote_scratch_buffers[0] + input_scratch_buffers_right.at("vc_packet_router")[1], // 79: vc_eth_tunneler_output_remote_scratch_buffers[1] + input_scratch_buffers_right.at("vc_packet_router")[2], // 80: vc_eth_tunneler_output_remote_scratch_buffers[2] + input_scratch_buffers_right.at("vc_packet_router")[3], // 81: vc_eth_tunneler_output_remote_scratch_buffers[3] + 0, 0, 0, 0, 0, 0, // 82 - 87: vc_eth_tunneler_output_remote_scratch_buffers[4 - 9] }; auto tunneler_r_kernel = tt_metal::CreateKernel( @@ -502,7 +606,6 @@ int main(int argc, char **argv) { } ); - // Demux uint32_t dest_map_array[4] = {0, 1, 2, 3}; uint64_t dest_endpoint_output_map = packet_switch_dest_pack(dest_map_array, 4); @@ -536,10 +639,6 @@ int main(int argc, char **argv) { (rx_queue_size_bytes >> 4), // 13: remote_tx_queue_size_words 2 (rx_queue_start_addr >> 4), // 14: remote_tx_queue_start_addr_words 3 (rx_queue_size_bytes >> 4), // 15: remote_tx_queue_size_words 3 - //(uint32_t)r_tunneler_phys_core.x, // 16: remote_rx_x - //(uint32_t)r_tunneler_phys_core.y, // 17: remote_rx_y - //2, // 18: remote_rx_queue_id - //(uint32_t)DispatchRemoteNetworkType::NOC0, // 19: tx_network_type packet_switch_4B_pack(r_tunneler_phys_core.x, r_tunneler_phys_core.y, 4, @@ -561,7 +660,27 @@ int main(int argc, char **argv) { test_results_addr, // 22: test_results_addr test_results_size, // 23: test_results_size timeout_mcycles * 1000 * 1000 * 4, // 24: timeout_cycles - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 25-35: packetize/depacketize settings + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 25-35: packetize/depacketize settings + + input_scratch_buffers_right.at("vc_packet_router")[0], // 36: vc_packet_router_input_scratch_buffers[0] + input_scratch_buffers_right.at("vc_packet_router")[1], // 37: vc_packet_router_input_scratch_buffers[1] + input_scratch_buffers_right.at("vc_packet_router")[2], // 38: vc_packet_router_input_scratch_buffers[2] + 
input_scratch_buffers_right.at("vc_packet_router")[3], // 39: vc_packet_router_input_scratch_buffers[3] + + tunneler_buffers_right.at("output")[0], // 40: vc_packet_router_input_remote_scratch_buffers[0] + tunneler_buffers_right.at("output")[1], // 41: vc_packet_router_input_remote_scratch_buffers[1] + tunneler_buffers_right.at("output")[2], // 42: vc_packet_router_input_remote_scratch_buffers[2] + tunneler_buffers_right.at("output")[3], // 43: vc_packet_router_input_remote_scratch_buffers[3] + + output_scratch_buffers_right.at("vc_packet_router")[0], // 44: vc_packet_router_output_scratch_buffers[0] + output_scratch_buffers_right.at("vc_packet_router")[1], // 45: vc_packet_router_output_scratch_buffers[1] + output_scratch_buffers_right.at("vc_packet_router")[2], // 46: vc_packet_router_output_scratch_buffers[2] + output_scratch_buffers_right.at("vc_packet_router")[3], // 47: vc_packet_router_output_scratch_buffers[3] + + input_scratch_buffers_right.at("traffic_gen_rx")[0], // 48: vc_packet_router_output_remote_scratch_buffers[0] + input_scratch_buffers_right.at("traffic_gen_rx")[1], // 49: vc_packet_router_output_remote_scratch_buffers[1] + input_scratch_buffers_right.at("traffic_gen_rx")[2], // 50: vc_packet_router_output_remote_scratch_buffers[2] + input_scratch_buffers_right.at("traffic_gen_rx")[3], // 51: vc_packet_router_output_remote_scratch_buffers[3] }; log_info(LogTest, "run demux at x={},y={}", demux_core.x, demux_core.y); @@ -589,8 +708,9 @@ int main(int argc, char **argv) { std::chrono::duration elapsed_seconds = (end-start); log_info(LogTest, "Ran in {:.2f}us", elapsed_seconds.count() * 1000 * 1000); - vector> tx_results; - vector> rx_results; + std::vector> tx_results; + std::vector> rx_results; + for (uint32_t i = 0; i < num_src_endpoints; i++) { tx_results.push_back( tt::llrt::read_hex_vec_from_core( @@ -607,13 +727,13 @@ int main(int argc, char **argv) { pass &= (rx_results[i][PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); } - vector 
mux_results = + std::vector mux_results = tt::llrt::read_hex_vec_from_core( device->id(), mux_phys_core, test_results_addr, test_results_size); log_info(LogTest, "MUX status = {}", packet_queue_test_status_to_string(mux_results[PQ_TEST_STATUS_INDEX])); pass &= (mux_results[PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - vector demux_results = + std::vector demux_results = tt::llrt::read_hex_vec_from_core( device_r->id(), demux_phys_core, test_results_addr, test_results_size); log_info(LogTest, "DEMUX status = {}", packet_queue_test_status_to_string(demux_results[PQ_TEST_STATUS_INDEX])); diff --git a/tt_metal/impl/dispatch/kernels/eth_tunneler.cpp b/tt_metal/impl/dispatch/kernels/eth_tunneler.cpp deleted file mode 100644 index 3e37adebc804..000000000000 --- a/tt_metal/impl/dispatch/kernels/eth_tunneler.cpp +++ /dev/null @@ -1,185 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -// clang-format off -#include "dataflow_api.h" -#include "tt_metal/impl/dispatch/kernels/packet_queue.hpp" -// clang-format on - -#define NUM_BIDIR_TUNNELS 1 -#define NUM_TUNNEL_QUEUES (NUM_BIDIR_TUNNELS * 2) - -packet_input_queue_state_t input_queues[NUM_TUNNEL_QUEUES]; -packet_output_queue_state_t output_queues[NUM_TUNNEL_QUEUES]; - -constexpr uint32_t endpoint_id_start_index = get_compile_time_arg_val(0); -constexpr uint32_t tunnel_lanes = get_compile_time_arg_val(1); -constexpr uint32_t in_queue_start_addr_words = get_compile_time_arg_val(2); -constexpr uint32_t in_queue_size_words = get_compile_time_arg_val(3); -constexpr uint32_t in_queue_size_bytes = in_queue_size_words * PACKET_WORD_SIZE_BYTES; -static_assert(is_power_of_2(in_queue_size_words), "in_queue_size_words must be a power of 2"); -static_assert(tunnel_lanes <= NUM_TUNNEL_QUEUES, "cannot have more than 2 tunnel directions."); -static_assert(tunnel_lanes, "tunnel directions cannot be 0. 1 => Unidirectional. 
2 => Bidirectional"); - -constexpr uint32_t remote_receiver_x[NUM_TUNNEL_QUEUES] = { - (get_compile_time_arg_val(4) & 0xFF), (get_compile_time_arg_val(5) & 0xFF)}; - -constexpr uint32_t remote_receiver_y[NUM_TUNNEL_QUEUES] = { - (get_compile_time_arg_val(4) >> 8) & 0xFF, (get_compile_time_arg_val(5) >> 8) & 0xFF}; - -constexpr uint32_t remote_receiver_queue_id[NUM_TUNNEL_QUEUES] = { - (get_compile_time_arg_val(4) >> 16) & 0xFF, (get_compile_time_arg_val(5) >> 16) & 0xFF}; - -constexpr DispatchRemoteNetworkType remote_receiver_network_type[NUM_TUNNEL_QUEUES] = { - static_cast((get_compile_time_arg_val(4) >> 24) & 0xFF), - static_cast((get_compile_time_arg_val(5) >> 24) & 0xFF)}; - -constexpr uint32_t remote_receiver_queue_start_addr_words[NUM_TUNNEL_QUEUES] = { - get_compile_time_arg_val(6), get_compile_time_arg_val(8)}; - -constexpr uint32_t remote_receiver_queue_size_words[NUM_TUNNEL_QUEUES] = { - get_compile_time_arg_val(7), get_compile_time_arg_val(9)}; - -static_assert( - is_power_of_2(remote_receiver_queue_size_words[0]), "remote_receiver_queue_size_words must be a power of 2"); -static_assert( - is_power_of_2(remote_receiver_queue_size_words[1]), "remote_receiver_queue_size_words must be a power of 2"); - -constexpr uint32_t remote_sender_x[NUM_TUNNEL_QUEUES] = { - (get_compile_time_arg_val(10) & 0xFF), (get_compile_time_arg_val(11) & 0xFF)}; - -constexpr uint32_t remote_sender_y[NUM_TUNNEL_QUEUES] = { - (get_compile_time_arg_val(10) >> 8) & 0xFF, (get_compile_time_arg_val(11) >> 8) & 0xFF}; - -constexpr uint32_t remote_sender_queue_id[NUM_TUNNEL_QUEUES] = { - (get_compile_time_arg_val(10) >> 16) & 0xFF, (get_compile_time_arg_val(11) >> 16) & 0xFF}; - -constexpr DispatchRemoteNetworkType remote_sender_network_type[NUM_TUNNEL_QUEUES] = { - static_cast((get_compile_time_arg_val(10) >> 24) & 0xFF), - static_cast((get_compile_time_arg_val(11) >> 24) & 0xFF)}; - -constexpr uint32_t test_results_buf_addr_arg = get_compile_time_arg_val(12); -constexpr uint32_t 
test_results_buf_size_bytes = get_compile_time_arg_val(13); - -// careful, may be null -tt_l1_ptr uint32_t* const test_results = reinterpret_cast(test_results_buf_addr_arg); - -constexpr uint32_t timeout_cycles = get_compile_time_arg_val(14); -constexpr uint32_t inner_stop_mux_d_bypass = get_compile_time_arg_val(15); - -void kernel_main() { - rtos_context_switch_ptr = (void (*)())RtosTable[0]; - - write_test_results(test_results, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_STARTED); - write_test_results(test_results, PQ_TEST_MISC_INDEX, 0xff000000); - write_test_results(test_results, PQ_TEST_MISC_INDEX + 1, 0xbb000000); - write_test_results(test_results, PQ_TEST_MISC_INDEX + 2, 0xAABBCCDD); - write_test_results(test_results, PQ_TEST_MISC_INDEX + 3, 0xDDCCBBAA); - write_test_results(test_results, PQ_TEST_MISC_INDEX + 4, endpoint_id_start_index); - - for (uint32_t i = 0; i < tunnel_lanes; i++) { - input_queues[i].init( - i, - in_queue_start_addr_words + i * in_queue_size_words, - in_queue_size_words, - remote_sender_x[i], - remote_sender_y[i], - remote_sender_queue_id[i], - remote_sender_network_type[i]); - } - - for (uint32_t i = 0; i < tunnel_lanes; i++) { - output_queues[i].init( - i + NUM_TUNNEL_QUEUES, - remote_receiver_queue_start_addr_words[i], - remote_receiver_queue_size_words[i], - remote_receiver_x[i], - remote_receiver_y[i], - remote_receiver_queue_id[i], - remote_receiver_network_type[i], - &input_queues[i], - 1); - } - - if (!wait_all_src_dest_ready(input_queues, tunnel_lanes, output_queues, tunnel_lanes, timeout_cycles)) { - write_test_results(test_results, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_TIMEOUT); - return; - } - - write_test_results(test_results, PQ_TEST_MISC_INDEX, 0xff000001); - - bool timeout = false; - bool all_outputs_finished = false; - uint64_t data_words_sent = 0; - uint64_t iter = 0; - uint64_t start_timestamp = get_timestamp(); - uint32_t progress_timestamp = start_timestamp & 0xFFFFFFFF; - while (!all_outputs_finished && !timeout) { - 
iter++; - if (timeout_cycles > 0) { - uint32_t cycles_since_progress = get_timestamp_32b() - progress_timestamp; - if (cycles_since_progress > timeout_cycles) { - timeout = true; - break; - } - } - all_outputs_finished = true; - for (uint32_t i = 0; i < tunnel_lanes; i++) { - if (input_queues[i].get_curr_packet_valid()) { - bool full_packet_sent; - uint32_t words_sent = - output_queues[i].forward_data_from_input(0, full_packet_sent, input_queues[i].get_end_of_cmd()); - // data_words_sent += words_sent; - // if ((words_sent > 0) && (timeout_cycles > 0)) { - progress_timestamp = get_timestamp_32b(); - //} - } - output_queues[i].prev_words_in_flight_check_flush(); - bool output_finished = output_queues[i].is_remote_finished(); - if (output_finished) { - if ((i == 1) && (inner_stop_mux_d_bypass != 0)) { - input_queues[1].remote_x = inner_stop_mux_d_bypass & 0xFF; - input_queues[1].remote_y = (inner_stop_mux_d_bypass >> 8) & 0xFF; - input_queues[1].set_remote_ready_status_addr((inner_stop_mux_d_bypass >> 16) & 0xFF); - } - input_queues[i].send_remote_finished_notification(); - } - all_outputs_finished &= output_finished; - } - uint32_t launch_msg_rd_ptr = *GET_MAILBOX_ADDRESS_DEV(launch_msg_rd_ptr); - tt_l1_ptr launch_msg_t * const launch_msg = GET_MAILBOX_ADDRESS_DEV(launch[launch_msg_rd_ptr]); - if (launch_msg->kernel_config.exit_erisc_kernel) { - return; - } - // need to optimize this. - // context switch to base fw is very costly. 
- internal_::risc_context_switch(); - } - - if (!timeout) { - write_test_results(test_results, PQ_TEST_MISC_INDEX, 0xff000002); - for (uint32_t i = 0; i < tunnel_lanes; i++) { - if (!output_queues[i].output_barrier(timeout_cycles)) { - timeout = true; - break; - } - } - } - - uint64_t cycles_elapsed = get_timestamp() - start_timestamp; - if (!timeout) { - write_test_results(test_results, PQ_TEST_MISC_INDEX, 0xff000003); - } - - set_64b_result(test_results, data_words_sent, PQ_TEST_WORD_CNT_INDEX); - set_64b_result(test_results, cycles_elapsed, PQ_TEST_CYCLES_INDEX); - set_64b_result(test_results, iter, PQ_TEST_ITER_INDEX); - - if (timeout) { - write_test_results(test_results, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_TIMEOUT); - } else { - write_test_results(test_results, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_PASS); - write_test_results(test_results, PQ_TEST_MISC_INDEX, 0xff00005); - } -} diff --git a/tt_metal/impl/dispatch/kernels/packet_demux.cpp b/tt_metal/impl/dispatch/kernels/packet_demux.cpp index 56d67ed8da04..20add6cda9c5 100644 --- a/tt_metal/impl/dispatch/kernels/packet_demux.cpp +++ b/tt_metal/impl/dispatch/kernels/packet_demux.cpp @@ -3,12 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 #include "dataflow_api.h" -#include "debug/dprint.h" -#include "tt_metal/impl/dispatch/kernels/packet_queue.hpp" +#include "packet_queue_ctrl.hpp" +#include "tt_metal/impl/dispatch/kernels/packet_queue_v2.hpp" #include "tt_metal/impl/dispatch/kernels/cq_helpers.hpp" -packet_input_queue_state_t input_queue; -packet_output_queue_state_t output_queues[MAX_SWITCH_FAN_OUT]; +using namespace packet_queue; constexpr uint32_t endpoint_id_start_index = get_compile_time_arg_val(0); @@ -16,8 +15,6 @@ constexpr uint32_t rx_queue_start_addr_words = get_compile_time_arg_val(1); constexpr uint32_t rx_queue_size_words = get_compile_time_arg_val(2); constexpr uint32_t rx_queue_size_bytes = rx_queue_size_words*PACKET_WORD_SIZE_BYTES; -static_assert(is_power_of_2(rx_queue_size_words), 
"rx_queue_size_words must be a power of 2"); - constexpr uint32_t demux_fan_out = get_compile_time_arg_val(3); // FIXME imatosevic - is there a way to do this without explicit indexes? @@ -74,11 +71,6 @@ constexpr uint32_t remote_tx_queue_size_words[MAX_SWITCH_FAN_OUT] = get_compile_time_arg_val(15) }; -static_assert(is_power_of_2(remote_tx_queue_size_words[0]), "remote_tx_queue_size_words must be a power of 2"); -static_assert((demux_fan_out < 2) || is_power_of_2(remote_tx_queue_size_words[1]), "remote_tx_queue_size_words must be a power of 2"); -static_assert((demux_fan_out < 3) || is_power_of_2(remote_tx_queue_size_words[2]), "remote_tx_queue_size_words must be a power of 2"); -static_assert((demux_fan_out < 4) || is_power_of_2(remote_tx_queue_size_words[3]), "remote_tx_queue_size_words must be a power of 2"); - constexpr uint32_t remote_rx_x = get_compile_time_arg_val(16); constexpr uint32_t remote_rx_y = get_compile_time_arg_val(17); constexpr uint32_t remote_rx_queue_id = get_compile_time_arg_val(18); @@ -180,15 +172,70 @@ constexpr uint32_t output_depacketize_remove_header[MAX_SWITCH_FAN_OUT] = (get_compile_time_arg_val(29) >> 24) & 0x1 }; +constexpr uint32_t demux_input_ptr_buffer = get_compile_time_arg_val(30); +constexpr uint32_t demux_input_remote_ptr_buffer = get_compile_time_arg_val(31); + +constexpr uint32_t demux_output_ptr_buffers[MAX_SWITCH_FAN_IN] = + { + get_compile_time_arg_val(32), + get_compile_time_arg_val(33), + get_compile_time_arg_val(34), + get_compile_time_arg_val(35) + }; +constexpr uint32_t demux_output_remote_ptr_buffers[MAX_SWITCH_FAN_IN] = + { + get_compile_time_arg_val(36), + get_compile_time_arg_val(37), + get_compile_time_arg_val(38), + get_compile_time_arg_val(39) + }; + +PacketOutputQueueVariant raw_output_queues[MAX_SWITCH_FAN_OUT]; +using output_queue_network_sequence = NetworkTypeSequence; +using output_queue_cb_mode_sequence = CBModeTypeSequence; +PacketInputQueueVariant raw_input_queue; +using input_queue_network_sequence 
= NetworkTypeSequence; +using input_queue_cb_mode_sequence = CBModeTypeSequence; inline uint8_t dest_output_queue_id(uint32_t dest_endpoint_id) { uint32_t dest_endpoint_index = dest_endpoint_id - endpoint_id_start_index; return dest_output_queue_id_map[dest_endpoint_index]; } -void kernel_main() { +inline void initialize_output_queues() { + init_params_t init_params{ + .input_queues = &raw_input_queue, + .num_input_queues = 1, + }; + process_queues([&](auto i) -> bool { + raw_output_queues[i].template engage(); + auto* active_output_queue = raw_output_queues[i].template get(); + + init_params.queue_id = (uint8_t)sequence_i + 1; + init_params.queue_start_addr_words = remote_tx_queue_start_addr_words[sequence_i]; + init_params.queue_size_words = remote_tx_queue_size_words[sequence_i]; + init_params.remote_queue_id = (uint8_t)remote_tx_queue_id[sequence_i]; + init_params.remote_x = (uint8_t)remote_tx_x[sequence_i]; + init_params.remote_y = (uint8_t)remote_tx_y[sequence_i]; + + init_params.ptrs_addr = demux_output_ptr_buffers[sequence_i]; + init_params.remote_ptrs_addr = demux_output_remote_ptr_buffers[sequence_i]; + + init_params.cb_mode = output_depacketize[sequence_i]; + init_params.local_sem_id = (uint8_t)output_depacketize_local_sem[sequence_i]; + init_params.remote_sem_id = (uint8_t)output_depacketize_downstream_sem[sequence_i]; + init_params.log_page_size = (uint8_t)output_depacketize_log_page_size[sequence_i]; + + init_params.unpacketizer_output_remove_header = (uint16_t)output_depacketize_remove_header[sequence_i]; + + active_output_queue->init(&init_params); + return true; + }); +} + +void kernel_main() { write_test_results(test_results, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_STARTED); write_test_results(test_results, PQ_TEST_MISC_INDEX, 0xff000000); write_test_results(test_results, PQ_TEST_MISC_INDEX+1, 0xbb000000 | demux_fan_out); @@ -196,85 +243,96 @@ void kernel_main() { write_test_results(test_results, PQ_TEST_MISC_INDEX+3, dest_endpoint_output_map_lo); 
write_test_results(test_results, PQ_TEST_MISC_INDEX+4, endpoint_id_start_index); - for (uint32_t i = 0; i < demux_fan_out; i++) { - output_queues[i].init(i + 1, remote_tx_queue_start_addr_words[i], remote_tx_queue_size_words[i], - remote_tx_x[i], remote_tx_y[i], remote_tx_queue_id[i], remote_tx_network_type[i], - &input_queue, 1, - output_depacketize[i], output_depacketize_log_page_size[i], - output_depacketize_local_sem[i], output_depacketize_downstream_sem[i], - output_depacketize_remove_header[i]); + raw_input_queue.engage(); + auto* input_queue = raw_input_queue.get(); + { + constexpr init_params_t input_queue_init_params{ + .is_input = true, + .queue_id = 0, + .queue_start_addr_words = rx_queue_start_addr_words, + .queue_size_words = rx_queue_size_words, + .remote_queue_id = remote_rx_queue_id, + .remote_x = remote_rx_x, + .remote_y = remote_rx_y, + .ptrs_addr = demux_input_ptr_buffer, + .remote_ptrs_addr = demux_input_remote_ptr_buffer, + }; + input_queue->init(&input_queue_init_params); } - input_queue.init(0, rx_queue_start_addr_words, rx_queue_size_words, - remote_rx_x, remote_rx_y, remote_rx_queue_id, remote_rx_network_type); + initialize_output_queues(); - if (!wait_all_src_dest_ready(&input_queue, 1, output_queues, demux_fan_out, timeout_cycles)) { + if (!wait_all_input_output_ready(&raw_input_queue, raw_output_queues, timeout_cycles)) { write_test_results(test_results, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_TIMEOUT); return; } write_test_results(test_results, PQ_TEST_MISC_INDEX, 0xff000001); - bool timeout = false; + uint64_t start_timestamp = get_timestamp(); bool all_outputs_finished = false; uint64_t data_words_sent = 0; uint64_t iter = 0; - uint64_t start_timestamp = get_timestamp(); - uint32_t progress_timestamp = start_timestamp & 0xFFFFFFFF; uint32_t heartbeat = 0; - while (!all_outputs_finished && !timeout) { + while (!all_outputs_finished) { IDLE_ERISC_HEARTBEAT_AND_RETURN(heartbeat); iter++; - if (timeout_cycles > 0) { - uint32_t 
cycles_since_progress = get_timestamp_32b() - progress_timestamp; - if (cycles_since_progress > timeout_cycles) { - timeout = true; - break; - } - } - if (input_queue.get_curr_packet_valid()) { - uint32_t dest = input_queue.get_curr_packet_dest(); - uint8_t output_queue_id = dest_output_queue_id(dest); + if (input_queue->get_curr_packet_valid()) { + uint32_t output_queue_id = dest_output_queue_id(input_queue->get_curr_packet_dest()); bool full_packet_sent; - uint32_t words_sent = output_queues[output_queue_id].forward_data_from_input(0, full_packet_sent, input_queue.get_end_of_cmd()); - data_words_sent += words_sent; - if ((words_sent > 0) && (timeout_cycles > 0)) { - progress_timestamp = get_timestamp_32b(); + switch(output_queue_id) { + case 0: + data_words_sent += (raw_output_queues[output_queue_id].get())->forward_data_from_input<0>(full_packet_sent, input_queue->get_end_of_cmd()); + break; + case 1: + data_words_sent += (raw_output_queues[output_queue_id].get())->forward_data_from_input<0>(full_packet_sent, input_queue->get_end_of_cmd()); + break; + case 2: + data_words_sent += (raw_output_queues[output_queue_id].get())->forward_data_from_input<0>(full_packet_sent, input_queue->get_end_of_cmd()); + break; + case 3: + data_words_sent += (raw_output_queues[output_queue_id].get())->forward_data_from_input<0>(full_packet_sent, input_queue->get_end_of_cmd()); + break; + default: + break; } } + all_outputs_finished = true; - for (uint32_t i = 0; i < demux_fan_out; i++) { - output_queues[i].prev_words_in_flight_check_flush(); - all_outputs_finished &= output_queues[i].is_remote_finished(); - } + process_queues([&](auto) -> bool { + auto* active_output_queue = raw_output_queues[sequence_i].template get(); + active_output_queue->prev_words_in_flight_check_flush(); + all_outputs_finished &= active_output_queue->is_remote_finished(); + return true; + }); } - if (!timeout) { - write_test_results(test_results, PQ_TEST_MISC_INDEX, 0xff000002); - for (uint32_t i = 0; i < 
demux_fan_out; i++) { - if (!output_queues[i].output_barrier(timeout_cycles)) { - timeout = true; - break; - } + write_test_results(test_results, PQ_TEST_MISC_INDEX, 0xff000002); + + bool timed_out = false; + process_queues([&](auto) -> bool { + auto* active_output_queue = raw_output_queues[sequence_i].template get(); + if (!active_output_queue->output_barrier(timeout_cycles)) { + timed_out = true; + return false; } - } + return true; + }); uint64_t cycles_elapsed = get_timestamp() - start_timestamp; - if (!timeout) { - write_test_results(test_results, PQ_TEST_MISC_INDEX, 0xff000003); - input_queue.send_remote_finished_notification(); - } + write_test_results(test_results, PQ_TEST_MISC_INDEX, 0xff000003); + input_queue->send_remote_finished_notification(); set_64b_result(test_results, data_words_sent, PQ_TEST_WORD_CNT_INDEX); set_64b_result(test_results, cycles_elapsed, PQ_TEST_CYCLES_INDEX); set_64b_result(test_results, iter, PQ_TEST_ITER_INDEX); - if (timeout) { + if (timed_out) { write_test_results(test_results, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_TIMEOUT); - // DPRINT << "demux timeout" << ENDL(); - // // input_queue.dprint_object(); - // output_queues[0].dprint_object(); } else { write_test_results(test_results, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_PASS); - write_test_results(test_results, PQ_TEST_MISC_INDEX, 0xff00005); } + write_test_results(test_results, PQ_TEST_MISC_INDEX, 0xff00005); } diff --git a/tt_metal/impl/dispatch/kernels/packet_mux.cpp b/tt_metal/impl/dispatch/kernels/packet_mux.cpp index 8fdd03532e15..b23c6f916f17 100644 --- a/tt_metal/impl/dispatch/kernels/packet_mux.cpp +++ b/tt_metal/impl/dispatch/kernels/packet_mux.cpp @@ -3,12 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 #include "dataflow_api.h" -#include "debug/dprint.h" -#include "tt_metal/impl/dispatch/kernels/packet_queue.hpp" +#include "tt_metal/impl/dispatch/kernels/packet_queue_v2.hpp" #include "tt_metal/impl/dispatch/kernels/cq_helpers.hpp" -packet_input_queue_state_t 
input_queues[MAX_SWITCH_FAN_IN]; -packet_output_queue_state_t output_queue; +using namespace packet_queue; constexpr uint32_t reserved = get_compile_time_arg_val(0); @@ -18,8 +16,6 @@ constexpr uint32_t rx_queue_start_addr_words = get_compile_time_arg_val(1); constexpr uint32_t rx_queue_size_words = get_compile_time_arg_val(2); constexpr uint32_t rx_queue_size_bytes = rx_queue_size_words*PACKET_WORD_SIZE_BYTES; -static_assert(is_power_of_2(rx_queue_size_words), "rx_queue_size_words must be a power of 2"); - constexpr uint32_t mux_fan_in = get_compile_time_arg_val(3); // FIXME imatosevic - is there a way to do this without explicit indexes? @@ -63,8 +59,6 @@ constexpr DispatchRemoteNetworkType remote_rx_network_type[MAX_SWITCH_FAN_IN] = constexpr uint32_t remote_tx_queue_start_addr_words = get_compile_time_arg_val(8); constexpr uint32_t remote_tx_queue_size_words = get_compile_time_arg_val(9); -static_assert(is_power_of_2(remote_tx_queue_size_words), "remote_tx_queue_size_words must be a power of 2"); - constexpr uint32_t remote_tx_x = get_compile_time_arg_val(10); constexpr uint32_t remote_tx_y = get_compile_time_arg_val(11); constexpr uint32_t remote_tx_queue_id = get_compile_time_arg_val(12); @@ -137,99 +131,155 @@ constexpr uint32_t input_packetize_dest_endpoint[MAX_SWITCH_FAN_IN] = (get_compile_time_arg_val(24) >> 24) & 0xFF }; +constexpr uint32_t mux_input_ptr_buffers[MAX_SWITCH_FAN_IN] = + { + get_compile_time_arg_val(25), + get_compile_time_arg_val(26), + get_compile_time_arg_val(27), + get_compile_time_arg_val(28) + }; +constexpr uint32_t mux_input_remote_ptr_buffers[MAX_SWITCH_FAN_IN] = + { + get_compile_time_arg_val(29), + get_compile_time_arg_val(30), + get_compile_time_arg_val(31), + get_compile_time_arg_val(32) + }; -void kernel_main() { +constexpr uint32_t mux_output_ptr_buffer = get_compile_time_arg_val(33); +constexpr uint32_t mux_output_remote_ptr_buffer = get_compile_time_arg_val(34); + +PacketInputQueueVariant raw_input_queues[MAX_SWITCH_FAN_IN]; 
+using input_queue_network_sequence = NetworkTypeSequence; +using input_queue_cb_mode_sequence = CBModeTypeSequence; + +PacketOutputQueueVariant raw_output_queue; +constexpr init_params_t output_queue_init_params{ + .queue_id = mux_fan_in, + .queue_start_addr_words = remote_tx_queue_start_addr_words, + .queue_size_words = remote_tx_queue_size_words, + .remote_queue_id = remote_tx_queue_id, + .remote_x = remote_tx_x, + .remote_y = remote_tx_y, + .ptrs_addr = mux_output_ptr_buffer, + .remote_ptrs_addr = mux_output_remote_ptr_buffer, + + .cb_mode = output_depacketize, + .local_sem_id = output_depacketize_local_sem, + .remote_sem_id = output_depacketize_downstream_sem, + .log_page_size = output_depacketize_log_page_size, + + .input_queues = raw_input_queues, + .num_input_queues = mux_fan_in, + .unpacketizer_output_remove_header = output_depacketize_remove_header, +}; +using output_queue_network_sequence = NetworkTypeSequence; +using output_queue_cb_mode_sequence = CBModeTypeSequence; + +inline void initialize_input_queues() { + init_params_t init_params{ + .is_input = true, + }; + process_queues([&](auto) -> bool { + raw_input_queues[sequence_i].template engage(); - write_test_results(test_results, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_STARTED); - write_test_results(test_results, PQ_TEST_MISC_INDEX, 0xff000000); - write_test_results(test_results, PQ_TEST_MISC_INDEX+1, 0xaa000000 | mux_fan_in); + auto* active_input_queue = raw_input_queues[sequence_i].template get(); - for (uint32_t i = 0; i < mux_fan_in; i++) { - input_queues[i].init(i, rx_queue_start_addr_words + i*rx_queue_size_words, rx_queue_size_words, - remote_rx_x[i], remote_rx_y[i], remote_rx_queue_id[i], remote_rx_network_type[i], - input_packetize[i], input_packetize_log_page_size[i], - input_packetize_local_sem[i], input_packetize_upstream_sem[i], - input_packetize_src_endpoint[i], input_packetize_dest_endpoint[i]); - } + init_params.queue_id = (uint8_t)sequence_i; + init_params.queue_start_addr_words = 
rx_queue_start_addr_words + sequence_i * rx_queue_size_words; + init_params.queue_size_words = rx_queue_size_words; + init_params.remote_queue_id = (uint8_t)remote_rx_queue_id[sequence_i]; + init_params.remote_x = (uint8_t)remote_rx_x[sequence_i]; + init_params.remote_y = (uint8_t)remote_rx_y[sequence_i]; + + init_params.ptrs_addr = mux_input_ptr_buffers[sequence_i]; + init_params.remote_ptrs_addr = mux_input_remote_ptr_buffers[sequence_i]; + + init_params.cb_mode = input_packetize[sequence_i]; + init_params.local_sem_id = (uint8_t)input_packetize_local_sem[sequence_i]; + init_params.remote_sem_id = (uint8_t)input_packetize_upstream_sem[sequence_i]; + init_params.log_page_size = (uint8_t)input_packetize_log_page_size[sequence_i]; + + init_params.packetizer_input_src = (uint16_t)input_packetize_src_endpoint[sequence_i]; + init_params.packetizer_input_dest = (uint16_t)input_packetize_dest_endpoint[sequence_i]; + + active_input_queue->init(&init_params); + + return true; + }); +} - output_queue.init(mux_fan_in, remote_tx_queue_start_addr_words, remote_tx_queue_size_words, - remote_tx_x, remote_tx_y, remote_tx_queue_id, tx_network_type, - input_queues, mux_fan_in, - output_depacketize, output_depacketize_log_page_size, - output_depacketize_downstream_sem, output_depacketize_local_sem, - output_depacketize_remove_header); +inline void initialize_output_queues() { + raw_output_queue.engage(); + auto* output_queue = raw_output_queue.get(); + output_queue->init(&output_queue_init_params); +} + +void kernel_main() { + initialize_input_queues(); + initialize_output_queues(); + + auto* output_queue = raw_output_queue.get(); - if (!wait_all_src_dest_ready(input_queues, mux_fan_in, &output_queue, 1, timeout_cycles)) { + if (!wait_all_input_output_ready(raw_input_queues, &raw_output_queue, timeout_cycles)) { write_test_results(test_results, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_TIMEOUT); return; } - write_test_results(test_results, PQ_TEST_MISC_INDEX, 0xff000001); - - uint32_t 
curr_input = 0; - bool timeout = false; bool dest_finished = false; bool curr_input_partial_packet_sent = false; + uint32_t partial_packet_sent_index = 0; uint64_t data_words_sent = 0; uint64_t iter = 0; uint64_t start_timestamp = get_timestamp(); - uint32_t progress_timestamp = start_timestamp & 0xFFFFFFFF; uint32_t heartbeat = 0; - while (!dest_finished && !timeout) { + while (!dest_finished) { IDLE_ERISC_HEARTBEAT_AND_RETURN(heartbeat); iter++; - if (timeout_cycles > 0) { - uint32_t cycles_since_progress = get_timestamp_32b() - progress_timestamp; - if (cycles_since_progress > timeout_cycles) { - timeout = true; - break; - } - } - if (input_queues[curr_input].get_curr_packet_valid()) { - bool full_packet_sent; - uint32_t words_sent = output_queue.forward_data_from_input(curr_input, full_packet_sent, input_queues[curr_input].get_end_of_cmd()); - data_words_sent += words_sent; - if ((words_sent > 0) && (timeout_cycles > 0)) { - progress_timestamp = get_timestamp_32b(); + + process_queues([&](auto i) -> bool { + if (curr_input_partial_packet_sent && partial_packet_sent_index != i) return true; + auto* active_input_queue = raw_input_queues[i].template get(); + curr_input_partial_packet_sent = false; + if (active_input_queue->get_curr_packet_valid()) { + bool full_packet_sent; + uint32_t words_sent = output_queue->forward_data_from_input(full_packet_sent, active_input_queue->get_end_of_cmd()); + data_words_sent += words_sent; + curr_input_partial_packet_sent = !full_packet_sent; } - curr_input_partial_packet_sent = !full_packet_sent; - } - if (!curr_input_partial_packet_sent) { - curr_input++; - if (curr_input == mux_fan_in) { - curr_input = 0; + + if (curr_input_partial_packet_sent) { + partial_packet_sent_index = i; + // stop looping at this queue. 
come back to it at the next iteration from the outer while loop + return false; } - } - output_queue.prev_words_in_flight_check_flush(); - dest_finished = output_queue.is_remote_finished(); + + return true; // keep looping + }); + + output_queue->prev_words_in_flight_check_flush(); + dest_finished = output_queue->is_remote_finished(); } - if (!timeout) { - write_test_results(test_results, PQ_TEST_MISC_INDEX, 0xff000002); - if (!output_queue.output_barrier(timeout_cycles)) { - timeout = true; - } + if (!output_queue->output_barrier(timeout_cycles)) { + write_test_results(test_results, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_TIMEOUT); + return; } uint64_t cycles_elapsed = get_timestamp() - start_timestamp; - if (!timeout) { - write_test_results(test_results, PQ_TEST_MISC_INDEX, 0xff000003); - for (uint32_t i = 0; i < mux_fan_in; i++) { - input_queues[i].send_remote_finished_notification(); - } - } + process_queues([&](auto i) -> bool { + auto* active_input_queue = raw_input_queues[i].template get(); + active_input_queue->send_remote_finished_notification(); + return true; + }); set_64b_result(test_results, data_words_sent, PQ_TEST_WORD_CNT_INDEX); set_64b_result(test_results, cycles_elapsed, PQ_TEST_CYCLES_INDEX); set_64b_result(test_results, iter, PQ_TEST_ITER_INDEX); - if (timeout) { - write_test_results(test_results, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_TIMEOUT); - // DPRINT << "mux timeout" << ENDL(); - // input_queues[0].dprint_object(); - // output_queue.dprint_object(); - } else { - write_test_results(test_results, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_PASS); - write_test_results(test_results, PQ_TEST_MISC_INDEX, 0xff00005); - } + write_test_results(test_results, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_PASS); + write_test_results(test_results, PQ_TEST_MISC_INDEX, 0xff00005); } diff --git a/tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp b/tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp index 1d0607812110..3a118c66762e 100644 --- 
a/tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp +++ b/tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp @@ -4,6 +4,10 @@ #pragma once +#include + +constexpr uint32_t NUM_TUNNEL_QUEUES_BIDIR = 2; + constexpr uint32_t PACKET_WORD_SIZE_BYTES = 16; constexpr uint32_t MAX_SWITCH_FAN_IN = 4; constexpr uint32_t MAX_SWITCH_FAN_OUT = 4; @@ -18,11 +22,11 @@ constexpr uint32_t OUTPUT_QUEUE_START_ID = MAX_SWITCH_FAN_IN; constexpr uint32_t PACKET_QUEUE_REMOTE_READY_FLAG = 0xA; constexpr uint32_t PACKET_QUEUE_REMOTE_FINISHED_FLAG = 0xB; -constexpr uint32_t PACKET_QUEUE_STAUS_MASK = 0xabc00000; -constexpr uint32_t PACKET_QUEUE_TEST_STARTED = PACKET_QUEUE_STAUS_MASK | 0x0; -constexpr uint32_t PACKET_QUEUE_TEST_PASS = PACKET_QUEUE_STAUS_MASK | 0x1; -constexpr uint32_t PACKET_QUEUE_TEST_TIMEOUT = PACKET_QUEUE_STAUS_MASK | 0x2; -constexpr uint32_t PACKET_QUEUE_TEST_DATA_MISMATCH = PACKET_QUEUE_STAUS_MASK | 0x3; +constexpr uint32_t PACKET_QUEUE_STATUS_MASK = 0xabc00000; +constexpr uint32_t PACKET_QUEUE_TEST_STARTED = PACKET_QUEUE_STATUS_MASK | 0x0; +constexpr uint32_t PACKET_QUEUE_TEST_PASS = PACKET_QUEUE_STATUS_MASK | 0x1; +constexpr uint32_t PACKET_QUEUE_TEST_TIMEOUT = PACKET_QUEUE_STATUS_MASK | 0x2; +constexpr uint32_t PACKET_QUEUE_TEST_DATA_MISMATCH = PACKET_QUEUE_STATUS_MASK | 0x3; // indexes of return values in test results buffer constexpr uint32_t PQ_TEST_STATUS_INDEX = 0; @@ -31,7 +35,6 @@ constexpr uint32_t PQ_TEST_CYCLES_INDEX = 4; constexpr uint32_t PQ_TEST_ITER_INDEX = 6; constexpr uint32_t PQ_TEST_MISC_INDEX = 16; - enum DispatchPacketFlag : uint32_t { PACKET_CMD_START = (0x1 << 1), PACKET_CMD_END = (0x1 << 2), @@ -40,54 +43,132 @@ enum DispatchPacketFlag : uint32_t { }; enum DispatchRemoteNetworkType : uint8_t { - NOC0 = 0, - NOC1 = 1, - ETH = 2, - NONE = 3 + SKIP = 0, // No queue. 
Will be skipped during queue looping + NOC0 = 1, + NOC1 = 2, + ETH = 3, + NONE = 4, }; -inline bool is_remote_network_type_noc(DispatchRemoteNetworkType type) { - return type == NOC0 || type == NOC1; + +#define is_power_of_2(x) (((x) > 0) && (((x) & ((x) - 1)) == 0)) + +inline uint32_t packet_switch_4B_pack(uint32_t b0, uint32_t b1, uint32_t b2, uint32_t b3) { + return (b3 << 24) | (b2 << 16) | (b1 << 8) | b0; +} + +static_assert(MAX_DEST_ENDPOINTS <= 32, + "MAX_DEST_ENDPOINTS must be <= 32 for the packing funcitons below to work"); + +static_assert(MAX_SWITCH_FAN_OUT <= 4, + "MAX_SWITCH_FAN_OUT must be <= 4 for the packing funcitons below to work"); + +inline uint64_t packet_switch_dest_pack(uint32_t* dest_output_map_array, uint32_t num_dests) { + uint64_t result = 0; + for (uint32_t i = 0; i < num_dests; i++) { + result |= ((uint64_t)(dest_output_map_array[i])) << (2*i); + } + return result; } -struct dispatch_packet_header_t { +struct dispatch_packet_header_t { uint32_t packet_size_bytes; uint16_t packet_src; uint16_t packet_dest; uint16_t packet_flags; uint16_t num_cmds; uint32_t tag; +}; - inline bool check_packet_flags(uint32_t flags) const { - return (packet_flags & flags) == flags; +// Packet Queue Scratch Buffer +struct packet_queue_ptr_buffer_t { + // padding to make each entry == 1 packet (16B) + uint32_t wptr; + uint8_t padding0[12]; + + uint32_t rptr_sent; + uint8_t padding1[12]; + + uint32_t rptr_cleared; + uint8_t padding2[12]; + + // Due to the lack of an inc command and we are not using semaphores, + // a copy of the remote value is stored on the owner queue. It gets incremented here first, + // and then we write this value to the remote L1. + // For an input queue, it owns and updates the rptr. And wptr for output queue. 
+ uint32_t shadow_remote_wptr; + uint8_t padding3[12]; + + uint32_t shadow_remote_rptr_sent; + uint8_t padding4[12]; + + uint32_t shadow_remote_rptr_cleared; + uint8_t padding5[12]; + + // Sent and Recv value for ethernet acks + uint64_t eth_sent; + uint8_t padding6[8]; + + uint64_t eth_recv; + uint8_t padding7[8]; +} __attribute__((aligned(16))); + +constexpr uint32_t packet_queue_ptr_buffer_size = sizeof(packet_queue_ptr_buffer_t); + +// Do not modify the scratch buffer without updating the usages of it +static_assert(packet_queue_ptr_buffer_size == 128 && "packet_queue_ptr_buffer_size expected to be 128B"); + +// Packet Queue Scratch Buffer Memory Layout +struct packet_queue_ptr_buffer_layout_t { + static constexpr uint32_t WPTR_OFFSET = 0; + static constexpr uint32_t RPTR_SENT_OFFSET = 16; + static constexpr uint32_t RPTR_CLEARED_OFFSET = 32; + static constexpr uint32_t SHADOW_REMOTE_WPTR_OFFSET = 48; + static constexpr uint32_t SHADOW_REMOTE_RPTR_SENT_OFFSET = 64; + static constexpr uint32_t SHADOW_REMOTE_RPTR_CLEARED_OFFSET = 80; + static constexpr uint32_t ETH_SENT_OFFSET = 96; + static constexpr uint32_t ETH_RECV_OFFSET = 112; + + static volatile uint32_t* get_wptr(uint32_t base_addr) { + return reinterpret_cast(base_addr + WPTR_OFFSET); } - inline void set_packet_flags(uint32_t flags) { - packet_flags |= flags; + static volatile uint32_t* get_rptr_sent(uint32_t base_addr) { + return reinterpret_cast(base_addr + RPTR_SENT_OFFSET); } - inline void clear_packet_flags(uint32_t flags) { - packet_flags &= ~flags; + static volatile uint32_t* get_rptr_cleared(uint32_t base_addr) { + return reinterpret_cast(base_addr + RPTR_CLEARED_OFFSET); } -}; -#define is_power_of_2(x) (((x) > 0) && (((x) & ((x) - 1)) == 0)) + static uint32_t* get_shadow_remote_wptr(uint32_t base_addr) { + return reinterpret_cast(base_addr + SHADOW_REMOTE_WPTR_OFFSET); + } -inline uint32_t packet_switch_4B_pack(uint32_t b0, uint32_t b1, uint32_t b2, uint32_t b3) { - return (b3 << 24) | (b2 << 
16) | (b1 << 8) | b0; -} + static uint32_t* get_shadow_remote_rptr_sent(uint32_t base_addr) { + return reinterpret_cast(base_addr + SHADOW_REMOTE_RPTR_SENT_OFFSET); + } -static_assert(MAX_DEST_ENDPOINTS <= 32, - "MAX_DEST_ENDPOINTS must be <= 32 for the packing funcitons below to work"); + static uint32_t* get_shadow_remote_rptr_cleared(uint32_t base_addr) { + return reinterpret_cast(base_addr + SHADOW_REMOTE_RPTR_CLEARED_OFFSET); + } -static_assert(MAX_SWITCH_FAN_OUT <= 4, - "MAX_SWITCH_FAN_OUT must be <= 4 for the packing funcitons below to work"); + static volatile uint32_t* get_eth_sent(uint32_t base_addr) { + return reinterpret_cast(base_addr + ETH_SENT_OFFSET); + } -inline uint64_t packet_switch_dest_pack(uint32_t* dest_output_map_array, uint32_t num_dests) { - uint64_t result = 0; - for (uint32_t i = 0; i < num_dests; i++) { - result |= ((uint64_t)(dest_output_map_array[i])) << (2*i); + static volatile uint32_t* get_eth_recv(uint32_t base_addr) { + return reinterpret_cast(base_addr + ETH_RECV_OFFSET); } - return result; -} + + // Is this layout correct? 
+ static_assert(offsetof(packet_queue_ptr_buffer_t, wptr) == WPTR_OFFSET, "wptr offset mismatch"); + static_assert(offsetof(packet_queue_ptr_buffer_t, rptr_sent) == RPTR_SENT_OFFSET, "rptr_sent offset mismatch"); + static_assert(offsetof(packet_queue_ptr_buffer_t, rptr_cleared) == RPTR_CLEARED_OFFSET, "rptr_cleared offset mismatch"); + static_assert(offsetof(packet_queue_ptr_buffer_t, shadow_remote_wptr) == SHADOW_REMOTE_WPTR_OFFSET, "shadow_remote_wptr offset mismatch"); + static_assert(offsetof(packet_queue_ptr_buffer_t, shadow_remote_rptr_sent) == SHADOW_REMOTE_RPTR_SENT_OFFSET, "shadow_remote_rptr_sent offset mismatch"); + static_assert(offsetof(packet_queue_ptr_buffer_t, shadow_remote_rptr_cleared) == SHADOW_REMOTE_RPTR_CLEARED_OFFSET, "shadow_remote_rptr_cleared offset mismatch"); + static_assert(offsetof(packet_queue_ptr_buffer_t, eth_sent) == ETH_SENT_OFFSET, "eth_sent offset mismatch"); + static_assert(offsetof(packet_queue_ptr_buffer_t, eth_recv) == ETH_RECV_OFFSET, "eth_recv offset mismatch"); +}; diff --git a/tt_metal/impl/dispatch/kernels/packet_queue_remotes.hpp b/tt_metal/impl/dispatch/kernels/packet_queue_remotes.hpp new file mode 100644 index 000000000000..909aeeb146fb --- /dev/null +++ b/tt_metal/impl/dispatch/kernels/packet_queue_remotes.hpp @@ -0,0 +1,244 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include "dataflow_api.h" +#include "ethernet/dataflow_api.h" +#include "noc_parameters.h" + +namespace packet_queue { + +constexpr uint32_t NUM_WR_CMD_BUFS = 4; + +constexpr uint32_t DEFAULT_MAX_NOC_SEND_WORDS = + (NUM_WR_CMD_BUFS - 1) * (NOC_MAX_BURST_WORDS * NOC_WORD_BYTES) / PACKET_WORD_SIZE_BYTES; +constexpr uint32_t DEFAULT_MAX_ETH_SEND_WORDS = 2 * 1024; + +// Max 2 bits +enum PtrUpdateType : uint8_t { + NONE = 0, + WPTR = 1, + RPTR_SENT = 2, + RPTR_CLEARED = 3, +}; + +// Base for remote network controller. +// T is the implementation. 
+template +class packet_queue_remote_control_t { +private: + inline T& impl() noexcept { return static_cast(*this); } + + inline const T& impl() const noexcept { return static_cast(*this); } + +protected: + packet_queue_remote_control_t() = default; + ~packet_queue_remote_control_t() = default; + +public: + // Kernel init + void init( + uint32_t queue_id, + uint32_t remote_queue_id, + uint8_t remote_x, + uint8_t remote_y, + uint32_t local_ptrs_addr, + uint32_t remote_ptrs_addr) { + this->impl()._init(queue_id, remote_queue_id, remote_x, remote_y, local_ptrs_addr, remote_ptrs_addr); + } + + // Set stream register value + inline void reg_update(uint32_t reg_addr, uint32_t val) { this->impl()._reg_update(reg_addr, val); } + + // Update a pointer on the remote + inline void ptr_update(uint32_t src_addr, uint32_t dest_addr, PtrUpdateType update_type) { + this->impl()._ptr_update(src_addr, dest_addr, update_type); + } + + // Send data to the remote + inline void send_data(uint32_t src_addr, uint32_t dest_addr, uint32_t num_words) { + this->impl()._send_data(src_addr, dest_addr, num_words); + } + + // Returns true if the controller is busy and cannot be used yet + inline bool busy() const { return this->impl()._busy(); } + + // Handle any pending acks from the remote sender + inline void handle_recv() { this->impl()._handle_recv(); } +}; // packet_queue_remote_control_t + +// Remote updates over NOC0. 
+class packet_queue_remote_noc0_impl final : public packet_queue_remote_control_t { +private: + uint8_t remote_x; + uint8_t remote_y; + +public: + void _init( + uint32_t queue_id, + uint32_t remote_queue_id, + uint8_t remote_x, + uint8_t remote_y, + uint32_t local_ptrs_addr, + uint32_t remote_ptrs_addr) { + this->remote_x = remote_x; + this->remote_y = remote_y; + } + + inline void _reg_update(uint32_t reg_addr, uint32_t val) { + noc_inline_dw_write(get_noc_addr(this->remote_x, this->remote_y, reg_addr), val); + } + + inline void _ptr_update(uint32_t src_addr, uint32_t dest_addr, PtrUpdateType update_type) { + noc_inline_dw_write( + get_noc_addr(this->remote_x, this->remote_y, dest_addr), *reinterpret_cast(src_addr)); + } + + inline void _send_data(uint32_t src_addr, uint32_t dest_addr, uint32_t num_words) { + noc_async_write( + src_addr, + get_noc_addr(this->remote_x, this->remote_y, dest_addr), + num_words * 16 // bytes + ); + } + + inline bool _busy() const { return false; } + + inline void _handle_recv() {} +}; // packet_queue_remote_noc0_impl + +// Remote updates over Ethernet. 
+class packet_queue_remote_eth_impl final : public packet_queue_remote_control_t { +private: + struct ptr_reg_fields_t { + uint32_t ptr_value; + uint32_t dest_addr; + }; + + union ptr_value_reg_t { + uint64_t raw; + ptr_reg_fields_t fields; + }; + + volatile ptr_value_reg_t* sent; + volatile ptr_value_reg_t* recv; + uint32_t remote_sent_addr; + uint32_t remote_recv_addr; + + ptr_value_reg_t outgoing_data_0; + +public: + void _init( + uint32_t queue_id, + uint32_t remote_queue_id, + uint8_t remote_x, + uint8_t remote_y, + uint32_t ptrs_addr, + uint32_t remote_ptrs_addr) { + this->sent = + reinterpret_cast(packet_queue_ptr_buffer_layout_t::get_eth_sent(ptrs_addr)); + this->recv = + reinterpret_cast(packet_queue_ptr_buffer_layout_t::get_eth_recv(ptrs_addr)); + this->remote_sent_addr = + reinterpret_cast(packet_queue_ptr_buffer_layout_t::get_eth_sent(remote_ptrs_addr)); + this->remote_recv_addr = + reinterpret_cast(packet_queue_ptr_buffer_layout_t::get_eth_recv(remote_ptrs_addr)); + this->sent->raw = 0; + this->recv->raw = 0; + } + + inline void _reg_update(uint32_t reg_addr, uint32_t val) { internal_::eth_write_remote_reg(0, reg_addr, val); } + + inline void _ptr_update(uint32_t src_addr, uint32_t dest_addr, PtrUpdateType update_type) { + // Need to replace this with sending only 1 packet + internal_::eth_send_packet( + 0, // txq + src_addr >> 4, // source in words + dest_addr >> 4, // dest in words + 1 // words + ); + this->outgoing_data_0.fields.ptr_value = *reinterpret_cast(src_addr); + this->outgoing_data_0.fields.dest_addr = dest_addr; + internal_::eth_send_packet( + 0, // txq + (uint32_t)&this->outgoing_data_0 >> 4, // source in words + this->remote_recv_addr >> 4, // dest in words + 1 // words + ); + this->sent->raw = 1; + } + + inline void _send_data(uint32_t src_addr, uint32_t dest_addr, uint32_t num_words) { + internal_::eth_send_packet(0, src_addr >> 4, dest_addr >> 4, num_words); + } + + inline bool _busy() const { return (bool)this->sent->raw; /* there is 
pending data in the sent buffer */ } + + inline void _handle_recv() { + if (!this->recv->raw) { + return; + } + + // uint32_t value = this->recv->fields.ptr_value; + // uint32_t dest_addr = this->recv->fields.dest_addr; + // *reinterpret_cast(dest_addr) = value; + this->recv->raw = 0; + internal_::eth_send_packet( + 0, // txq + (uint32_t)this->recv >> 4, // source in words + this->remote_sent_addr >> 4, // dest in words + 1 // words + ); + } +}; // packet_queue_remote_eth_impl + +// Dummy remote update class for testing. +class packet_queue_remote_nop_impl final : public packet_queue_remote_control_t { +public: + void _init( + uint32_t queue_id, + uint32_t remote_queue_id, + uint8_t remote_x, + uint8_t remote_y, + uint32_t local_ptrs_addr, + uint32_t remote_ptrs_addr) {} + + inline void _reg_update(uint32_t reg_addr, uint32_t val) {} + + inline void _ptr_update(uint32_t src_addr, uint32_t dest_addr, PtrUpdateType update_type) {} + + inline void _send_data(uint32_t src_addr, uint32_t dest_addr, uint32_t num_words) {} + + inline bool _busy() const { return false; } + + inline void _handle_recv() {} +}; // packet_queue_remote_nop_impl + +// Query the maximum words that can be sent through each remote type +template +struct remote_max_send_words { + static constexpr uint32_t value = 0; +}; + +template <> +struct remote_max_send_words { + static constexpr uint32_t value = DEFAULT_MAX_NOC_SEND_WORDS; +}; + +template <> +struct remote_max_send_words { + static constexpr uint32_t value = DEFAULT_MAX_ETH_SEND_WORDS; +}; + +template <> +struct remote_max_send_words { + static constexpr uint32_t value = DEFAULT_MAX_NOC_SEND_WORDS; +}; + +template +constexpr uint32_t remote_max_send_words_v = remote_max_send_words::value; + +}; // namespace packet_queue diff --git a/tt_metal/impl/dispatch/kernels/packet_queue_v2.hpp b/tt_metal/impl/dispatch/kernels/packet_queue_v2.hpp new file mode 100644 index 000000000000..3f1a312f5a06 --- /dev/null +++ 
b/tt_metal/impl/dispatch/kernels/packet_queue_v2.hpp @@ -0,0 +1,1379 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include +#include + +#include "debug/assert.h" +#include "debug/dprint.h" +#include "debug/waypoint.h" + +#include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" +#include "tt_metal/impl/dispatch/kernels/packet_queue_remotes.hpp" + +namespace packet_queue { + +static constexpr uint32_t NUM_PTR_REGS_PER_INPUT_QUEUE = 1; +static constexpr uint32_t NUM_PTR_REGS_PER_OUTPUT_QUEUE = 2; + +constexpr ProgrammableCoreType fd_core_type = static_cast(FD_CORE_TYPE); + +static constexpr uint32_t k_MaxInputOutputQueues = 10; +static_assert(k_MaxInputOutputQueues >= MAX_SWITCH_FAN_IN); +static_assert(k_MaxInputOutputQueues >= MAX_SWITCH_FAN_OUT); +static_assert(k_MaxInputOutputQueues >= MAX_TUNNEL_LANES); + +/* +********************** +* * +* Functions * +* * +********************** +*/ +void zero_l1_buf(tt_l1_ptr uint32_t* buf, uint32_t size_bytes) { + for (uint32_t i = 0; i < size_bytes / 4; i++) { + buf[i] = 0; + } +} + +void write_test_results(tt_l1_ptr uint32_t* const buf, uint32_t i, uint32_t val) { + if (buf != nullptr) { + buf[i] = val; + } +} + +void write_kernel_status(tt_l1_ptr uint32_t* const buf, uint32_t i, uint32_t val) { + if (buf != nullptr) { + buf[i] = val; + } +} + +void set_64b_result(uint32_t* buf, uint64_t val, uint32_t index = 0) { + if (buf != nullptr) { + buf[index] = val >> 32; + buf[index + 1] = val & 0xFFFFFFFF; + } +} + +// Get 64 bit riscv timestamp +inline uint64_t get_timestamp() { + uint32_t timestamp_low = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); + uint32_t timestamp_high = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H); + return (((uint64_t)timestamp_high) << 32) | timestamp_low; +} + +// Get lower 32 bits of riscv timestamp +inline uint64_t get_timestamp_32b() { return reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); } + +// Helper 
function to call the lambda N times, passing in the template arguments and index +template +bool process_queues(F&& func, std::index_sequence) { + static_assert(NetworkTypeSequence::size == CBModeSequence::size); + bool result = true; + (([&]() { + if constexpr (NetworkTypeSequence::values[Index] == DispatchRemoteNetworkType::SKIP) { + return true; + } else { + return std::forward(func) + .template operator()(Index); + } + }() && + (result = true)), + ...); + return result; +} + +// F is a lambda that will be called over each queue from index 0 to NetworkTypeSequence::size +// If any function returns false it will stop. +template +bool process_queues(F&& func) { + static_assert(NetworkTypeSequence::size == CBModeSequence::size); + return process_queues( + std::forward(func), std::make_index_sequence()); +} + +/* +**************************** +* * +* Classes/Structs/Storage * +* * +**************************** +*/ + +// CB Mode configuration info +struct cb_mode_config_t { + uint32_t page_size_words; // Must be a power of 2 + uint32_t page_size_mask; + uint8_t local_sem_id; + uint8_t remote_sem_id; + + // Initialize cb mode config options + void init(uint32_t log_page_size, uint8_t local_sem_id, uint8_t remote_sem_id) { + this->page_size_words = (((uint32_t)0x1) << log_page_size) / PACKET_WORD_SIZE_BYTES; + this->page_size_mask = this->page_size_words - 1; + this->local_sem_id = local_sem_id; + this->remote_sem_id = remote_sem_id; + } +}; + +// Dummy placeholder queue +class packet_queue_nop { +public: + template + uint32_t forward_data_from_input(bool, bool) { + return 0; + } +}; + +// Base for all packet queues. +// T is the implementation. R is the remote controller for pointer updates. +// DisableRemotePtr will disable the remote ptr updates if the implementation uses a custom method. +template +class packet_queue_base_t; + +// Base for all input packet queues. +// T is the implementation. R is the remote controller for pointer updates. 
+// DisableRemotePtr will disable the remote ptr updates if the implementation uses a custom method. +template +class packet_queue_input_t; + +// Copies data to local L1 from a remote queue. +// R is the specialized remote control type. +template +class input_queue_impl_t; + +// Copies paged data to local L1 from a remote queue. +// R is the specialized remote control type. +template +class input_queue_cb_mode_impl_t; + +// Base for all output packet queues. +// T is the implementation. R is the remote controller for pointer updates. +// DisableRemotePtr will disable the remote ptr updates if the implementation uses a custom method. +template +class packet_queue_output_t; + +// Writes paged data to a remote queue. +// R is the specialized remote control type. +template +class output_queue_cb_mode_impl_t; + +// Writes data to a remote queue. +// R is the specialized remote control type. +template +class output_queue_impl_t; + +template +struct select_network { + using type = std::conditional_t< + NetworkType == DispatchRemoteNetworkType::NOC0, + packet_queue_remote_noc0_impl, + std::conditional_t< + NetworkType == DispatchRemoteNetworkType::ETH, + packet_queue_remote_eth_impl, + packet_queue_remote_nop_impl>>; +}; + +// Provides a concrete remote network implementation given a NetworkType +template +using select_network_t = typename select_network::type; + +template +struct select_input_queue_type { + using type = std::conditional_t< + NetworkType == DispatchRemoteNetworkType::SKIP, + packet_queue_nop, + std::conditional_t< + CBMode == true, + input_queue_cb_mode_impl_t>, + input_queue_impl_t>>>; +}; + +// Provides a concrete input queue implementation based on the network type and CB mode. 
+template +using select_input_queue_t = typename select_input_queue_type::type; + +template < + DispatchRemoteNetworkType NetworkType, + bool CBMode, + typename InputNetworkTypeSequence, + typename InputCBSequence> +struct select_output_queue { + using type = std::conditional_t< + NetworkType == DispatchRemoteNetworkType::SKIP, + packet_queue_nop, + std::conditional_t< + CBMode == true, + output_queue_cb_mode_impl_t, InputNetworkTypeSequence, InputCBSequence>, + output_queue_impl_t, InputNetworkTypeSequence, InputCBSequence>>>; +}; + +// Provides a concrete output queue implementation based on the network type and CB mode. +template < + DispatchRemoteNetworkType NetworkType, + bool CBMode, + typename InputNetworkTypeSequence, + typename InputCBSequence> +using select_output_queue_t = + typename select_output_queue::type; + +// Stores an input queue in raw memory. Engage must be called once +// before using the accessors. +class PacketInputQueueVariant { +private: + // Static assert to ensure the specialized classes do not exceed this hardcoded storage size + static constexpr size_t k_QueueSize = 144; + alignas(16) std::byte storage[k_QueueSize]; + +public: + // Initialize memory contents + template + void engage() { + using T = select_input_queue_t; + static_assert(sizeof(T) <= k_QueueSize); + new (storage) T(); + } + + // Get pointer to concrete instance + template + inline select_input_queue_t* get() { + return std::launder(reinterpret_cast*>(storage)); + } + + // Get pointer to a known type + template + inline T* get_known_type() { + return std::launder(reinterpret_cast(storage)); + } +}; + +// Stores an output queue in raw memory. Engage must be called once +// before using the accessors. 
+class PacketOutputQueueVariant { +private: + // Output queue is expected to be larger than the input queue due to the additional + // state tracking that's needed + static constexpr size_t k_QueueSize = 224; // 192; + alignas(16) std::byte storage[k_QueueSize]; + +public: + // Initialize memory contents + template < + DispatchRemoteNetworkType NetworkType, + bool CBMode, + typename InputNetworkTypeSequence, + typename InputCBSequence> + void engage() { + using T = select_output_queue_t; + static_assert(sizeof(T) <= k_QueueSize); + new (storage) T(); + } + + // Get pointer to concrete instance + template < + DispatchRemoteNetworkType NetworkType, + bool CBMode, + typename InputNetworkTypeSequence, + typename InputCBSequence> + inline select_output_queue_t* get() { + return std::launder( + reinterpret_cast*>( + storage)); + } + + // Get pointer to a known type + template + inline T* get_known_type() { + return std::launder(reinterpret_cast(storage)); + } +}; + +// A sequence of DispatchRemoteNetworkType's used for stamping out multiple input/output queue templates +template +struct NetworkTypeSequence { + static constexpr size_t size = sizeof...(NetworkTypes); + static constexpr std::array values = {NetworkTypes...}; +}; +// When there is no queue +using NoNetworkTypeSequence = NetworkTypeSequence<>; + +// A sequence of CB Mode enabled or disabled used for stamping out multiple input/output queue templates +template +struct CBModeTypeSequence { + static constexpr size_t size = sizeof...(CBModeEnabled); + static constexpr std::array values = {CBModeEnabled...}; +}; +// When there is no queue +using NoCBModeTypeSequence = CBModeTypeSequence<>; + +// Common Initialization parameters for packet queues +struct init_params_t { + // Local Queue + bool is_input{false}; + uint8_t queue_id; + uint32_t queue_start_addr_words; + uint32_t queue_size_words; + + // Remote Queue + uint8_t remote_queue_id; + uint8_t remote_x; + uint8_t remote_y; + + // Scratch Buffers + uint32_t 
ptrs_addr; + uint32_t remote_ptrs_addr; + + // CB Mode + bool cb_mode{false}; + uint8_t local_sem_id{0}; + uint8_t remote_sem_id{0}; + uint8_t log_page_size{0}; + + // Input Queue + // bool packetizer_input{false}; can be determined using std::derived_from + uint16_t packetizer_input_src{0}; // ex cb mode + uint16_t packetizer_input_dest{0}; + + // Output Queue + PacketInputQueueVariant* input_queues{nullptr}; + uint32_t num_input_queues{0}; // number of elements in input_queues + // bool unpacketizer_output{false}; can be determined using std::derived_from + bool unpacketizer_output_remove_header{false}; +}; + +// (CRTP) Base Class for all packet queues. Do not create instances of this directly (UB). +// T is the implementation +// R is the remote type +// DisableRemotePtr will disable the remote ptr updates if the implementation uses a custom method. +template +class packet_queue_base_t { +private: + static constexpr uint32_t k_ReservedWords = 1; + + // All pointers are in the units of words + // to get the actual address, need to multiply it by the word size. + volatile uint32_t* wptr; + volatile uint32_t* rptr_sent; + volatile uint32_t* rptr_cleared; + volatile uint32_t* local_ready_status; // stream reg + + uint32_t* shadow_remote_wptr; + uint32_t* shadow_remote_rptr_sent; + uint32_t* shadow_remote_rptr_cleared; + + uint32_t remote_wptr_addr; + uint32_t remote_rptr_sent_addr; + uint32_t remote_rptr_cleared_addr; + + uint32_t queue_id; + uint32_t queue_start_addr_words; + uint32_t queue_size_words; + + uint32_t remote_queue_id; + uint8_t remote_x; + uint8_t remote_y; + bool cb_mode; + bool is_input; + + uint32_t remote_ready_status_addr; // stream reg + + inline T* impl() { return static_cast(this); } + + // Increments that wrap over the queue size are not supported. + // to support them we need to add a case for if value >= 2*queue_size_words then + // use the modulus operator. 
+ inline uint32_t wrap_ptr(uint32_t v) const { + if (v >= this->queue_size_words) { + v -= this->queue_size_words; + } + return v; + } + + // Returns the distance between two positions, end and start, accounting + // for any wrapping. + // without wrapping, end is the pointer on the right. start is the pointer on the left. + inline uint32_t distance(uint32_t end, uint32_t start) const { + if (end >= start) { + return end - start; + } else { + return this->queue_size_words - start + end; + } + } + +protected: + R remote; + + packet_queue_base_t() = default; + ~packet_queue_base_t() = default; + +public: + void init(const init_params_t* params) { + WAYPOINT("PQA0"); + ASSERT(params->ptrs_addr != 0); + if constexpr (!DisableRemotePtr) { + // If remote ptr will be accessed we need to make sure not to overwrite L1[0] + WAYPOINT("PQA1"); + ASSERT(params->remote_ptrs_addr != 0); + } + WAYPOINT("PQA2"); + this->queue_id = params->queue_id; + this->queue_start_addr_words = params->queue_start_addr_words; + this->queue_size_words = params->queue_size_words; + this->remote_queue_id = params->remote_queue_id; + this->remote_x = params->remote_x; + this->remote_y = params->remote_y; + this->cb_mode = params->cb_mode; + this->is_input = params->is_input; + + this->remote_wptr_addr = + reinterpret_cast(packet_queue_ptr_buffer_layout_t::get_wptr(params->remote_ptrs_addr)); + this->remote_rptr_sent_addr = + reinterpret_cast(packet_queue_ptr_buffer_layout_t::get_rptr_sent(params->remote_ptrs_addr)); + this->remote_rptr_cleared_addr = + reinterpret_cast(packet_queue_ptr_buffer_layout_t::get_rptr_cleared(params->remote_ptrs_addr)); + + this->wptr = packet_queue_ptr_buffer_layout_t::get_wptr(params->ptrs_addr); + this->rptr_sent = packet_queue_ptr_buffer_layout_t::get_rptr_sent(params->ptrs_addr); + this->rptr_cleared = packet_queue_ptr_buffer_layout_t::get_rptr_cleared(params->ptrs_addr); + this->shadow_remote_wptr = 
packet_queue_ptr_buffer_layout_t::get_shadow_remote_wptr(params->ptrs_addr); + this->shadow_remote_rptr_sent = + packet_queue_ptr_buffer_layout_t::get_shadow_remote_rptr_sent(params->ptrs_addr); + this->shadow_remote_rptr_cleared = + packet_queue_ptr_buffer_layout_t::get_shadow_remote_rptr_cleared(params->ptrs_addr); + + // Reset local values + *this->wptr = 0; + *this->rptr_sent = 0; + *this->rptr_cleared = 0; + *this->shadow_remote_wptr = 0; + *this->shadow_remote_rptr_sent = 0; + *this->shadow_remote_rptr_cleared = 0; + + this->remote_ready_status_addr = STREAM_REG_ADDR(params->remote_queue_id, STREAM_REMOTE_SRC_REG_INDEX); + this->local_ready_status = + reinterpret_cast(STREAM_REG_ADDR(params->queue_id, STREAM_REMOTE_SRC_REG_INDEX)); + + this->remote.init( + params->queue_id, + params->remote_queue_id, + params->remote_x, + params->remote_y, + params->ptrs_addr, + params->remote_ptrs_addr); + + // Call child init + this->impl()->_init(params); + + // Queue is ready + this->reset_ready_flag(); + } + + inline uint32_t get_queue_id() { return this->queue_id; } + + inline uint8_t get_remote_x() { return this->remote_x; } + + inline uint8_t get_remote_y() { return this->remote_y; } + + inline uint32_t get_queue_start_addr_words() const { return this->queue_start_addr_words; } + + inline uint32_t get_queue_size_words() const { return this->queue_size_words; } + + inline uint32_t get_queue_local_wptr() const { return *this->wptr; } + + inline uint32_t get_queue_local_rptr_sent() const { return *this->rptr_sent; } + + inline uint32_t get_queue_local_rptr_cleared() const { return *this->rptr_cleared; } + + inline uint32_t get_queue_data_num_words_occupied() const { + return distance(this->get_queue_local_wptr(), this->get_queue_local_rptr_cleared()); + } + + inline uint32_t get_queue_data_num_words_free() const { + return this->queue_size_words - this->get_queue_data_num_words_occupied() - k_ReservedWords; + } + + inline uint32_t get_num_words_sent_not_cleared() const { 
+ return distance(this->get_queue_local_rptr_sent(), this->get_queue_local_rptr_cleared()); + } + + inline uint32_t get_num_words_written_not_sent() const { + return distance(this->get_queue_local_wptr(), this->get_queue_local_rptr_sent()); + } + + inline uint32_t get_queue_rptr_sent_addr_bytes() const { + return (this->queue_start_addr_words + this->get_queue_local_rptr_sent()) * PACKET_WORD_SIZE_BYTES; + } + + inline uint32_t get_queue_rptr_cleared_addr_bytes() const { + return (this->queue_start_addr_words + this->get_queue_local_rptr_cleared()) * PACKET_WORD_SIZE_BYTES; + } + + inline uint32_t get_queue_wptr_addr_bytes() const { + return (this->queue_start_addr_words + this->get_queue_local_wptr()) * PACKET_WORD_SIZE_BYTES; + } + + inline uint32_t get_queue_words_before_rptr_sent_wrap() const { + return this->queue_size_words - this->get_queue_local_rptr_sent(); + } + + inline uint32_t get_queue_words_before_rptr_cleared_wrap() const { + return this->queue_size_words - this->get_queue_local_rptr_cleared(); + } + + inline uint32_t get_queue_words_before_wptr_wrap() const { + return this->queue_size_words - this->get_queue_local_wptr(); + } + + inline uint32_t get_remote_ready_status() const { return *this->local_ready_status; } + + inline bool is_remote_ready() const { return *this->local_ready_status == PACKET_QUEUE_REMOTE_READY_FLAG; } + + inline bool is_remote_finished() const { return *this->local_ready_status == PACKET_QUEUE_REMOTE_FINISHED_FLAG; } + + inline void reset_ready_flag() { *this->local_ready_status = 0; } + + inline void set_queue_finished() { *this->local_ready_status = PACKET_QUEUE_REMOTE_FINISHED_FLAG; } + + inline void set_final_remote_xy(uint8_t x, uint8_t y) { + this->remote.init(this->queue_id, this->remote_queue_id, x, y, 0, 0); + } + + inline void set_remote_ready_status_addr(uint8_t remote_queue_id) { + this->remote_ready_status_addr = STREAM_REG_ADDR(remote_queue_id, STREAM_REMOTE_SRC_REG_INDEX); + } + + inline void 
send_remote_finished_notification() { + this->remote.reg_update(this->remote_ready_status_addr, PACKET_QUEUE_REMOTE_FINISHED_FLAG); + } + + inline void send_remote_ready_notification() { + this->remote.reg_update(this->remote_ready_status_addr, PACKET_QUEUE_REMOTE_READY_FLAG); + } + + inline void advance_queue_local_wptr(uint32_t num_words) { *this->wptr = wrap_ptr(*this->wptr + num_words); } + + inline void advance_queue_local_rptr_sent(uint32_t num_words) { + *this->rptr_sent = wrap_ptr(*this->rptr_sent + num_words); + } + + inline void advance_queue_local_rptr_cleared(uint32_t num_words) { + *this->rptr_cleared = wrap_ptr(*this->rptr_cleared + num_words); + } + + inline void advance_queue_remote_wptr(uint32_t num_words) { + if constexpr (!DisableRemotePtr) { + *this->shadow_remote_wptr = wrap_ptr(*this->shadow_remote_wptr + num_words); + this->remote.ptr_update((uint32_t)this->shadow_remote_wptr, this->remote_wptr_addr, PtrUpdateType::WPTR); + } + } + + inline void advance_queue_remote_rptr_sent(uint32_t num_words) { + if constexpr (!DisableRemotePtr) { + *this->shadow_remote_rptr_sent = wrap_ptr(*this->shadow_remote_rptr_sent + num_words); + this->remote.ptr_update( + (uint32_t)this->shadow_remote_rptr_sent, this->remote_rptr_sent_addr, PtrUpdateType::RPTR_SENT); + } + } + + inline void advance_queue_remote_rptr_cleared(uint32_t num_words) { + if constexpr (!DisableRemotePtr) { + *this->shadow_remote_rptr_cleared = wrap_ptr(*this->shadow_remote_rptr_cleared + num_words); + this->remote.ptr_update( + (uint32_t)this->shadow_remote_rptr_cleared, + this->remote_rptr_cleared_addr, + PtrUpdateType::RPTR_CLEARED); + } + } + + inline void handle_recv() { this->remote.handle_recv(); } + + inline bool busy() const { return this->remote.busy(); } + + inline uint32_t cb_mode_get_local_sem_val(uint32_t sem_id) { + invalidate_l1_cache(); + volatile tt_l1_ptr uint32_t* local_sem_addr = + reinterpret_cast(get_semaphore(sem_id)); + // semaphore underflow is currently used to 
signal path teardown with minimal prefetcher changes + uint32_t val = *local_sem_addr; + if (val & 0x80000000) { + val &= 0x7fffffff; + this->set_queue_finished(); + } + return val; + } + + inline bool cb_mode_local_sem_downstream_complete(uint32_t sem_id) { + invalidate_l1_cache(); + volatile tt_l1_ptr uint32_t* local_sem_addr = + reinterpret_cast(get_semaphore(sem_id)); + // semaphore underflow is currently used to signal path teardown with minimal prefetcher changes + uint32_t val = *local_sem_addr; + return (val & 0x80000000); + } + + inline void cb_mode_inc_local_sem_val(uint32_t sem_id, uint32_t val) { + const auto sem_l1_addr = get_semaphore(sem_id); + const auto sem_noc_addr = get_noc_addr(sem_l1_addr); + noc_semaphore_inc(sem_noc_addr, val); + noc_async_atomic_barrier(); + } + + inline void cb_mode_inc_remote_sem_val(uint32_t sem_id, uint32_t val) { + const auto sem_l1_addr = get_semaphore(sem_id); + const auto sem_noc_addr = get_noc_addr(this->get_remote_x(), this->get_remote_y(), sem_l1_addr); + if (val > 0) { + noc_semaphore_inc(sem_noc_addr, val); + } + } + + inline uint32_t cb_mode_rptr_sent_advance_page_align(uint32_t page_size_words) { + const auto rptr_val = this->get_queue_local_rptr_sent(); + const auto page_size_words_mask = page_size_words - 1; + const auto num_words_past_page_boundary = rptr_val & page_size_words_mask; + uint32_t input_pad_words_skipped = 0; + if (num_words_past_page_boundary > 0) { + input_pad_words_skipped = page_size_words - num_words_past_page_boundary; + this->advance_queue_local_rptr_sent(input_pad_words_skipped); + } + return input_pad_words_skipped; + } + + inline void cb_mode_local_sem_wptr_update(uint32_t sem_id, uint32_t page_size_words) { + const auto local_sem_val = this->cb_mode_get_local_sem_val(sem_id); + for (uint32_t i = 0; i < local_sem_val; ++i) { + this->advance_queue_local_wptr(page_size_words); + } + this->cb_mode_inc_local_sem_val(sem_id, -local_sem_val); + } + + inline void 
cb_mode_local_sem_rptr_cleared_update(uint32_t sem_id, uint32_t page_size_words) { + const auto local_sem_val = this->cb_mode_get_local_sem_val(sem_id); + for (uint32_t i = 0; i < local_sem_val; ++i) { + this->advance_queue_local_rptr_cleared(page_size_words); + } + this->cb_mode_inc_local_sem_val(sem_id, -local_sem_val); + } +}; // packet_queue_base_t + +// (CRTP CRTP) Base Class for all input queues. Do not create instances of this directly (UB). +// T is the implementation +// R is the remote type +// DisableRemotePtr will disable the remote ptr updates if the implementation uses a custom method. +template +class packet_queue_input_t + : public packet_queue_base_t, R, DisableRemotePtr> { +private: + friend packet_queue_base_t, R, DisableRemotePtr>; + friend input_queue_impl_t; + + tt_l1_ptr dispatch_packet_header_t* curr_packet_header; + bool curr_packet_valid; + uint16_t curr_packet_src; + uint16_t curr_packet_dest; + uint32_t curr_packet_size_words; + uint32_t curr_packet_words_sent; + uint32_t curr_packet_tag; + uint16_t curr_packet_flags; + uint16_t end_of_cmd; + + inline T* impl() { return static_cast(this); } + + // Update the state of this queue to the next packet. This function is only valid if get_num_words_written_not_sent + // > 0. 
_set_next_packet will be called with the next packet header + inline void start_next_packet() { + WAYPOINT("PQI1"); + if (!this->get_num_words_written_not_sent()) { + return; + } + const uint32_t next_header_addr = + (this->get_queue_start_addr_words() + this->get_queue_local_rptr_sent()) * PACKET_WORD_SIZE_BYTES; + + auto next_packet_header = reinterpret_cast(next_header_addr); + + this->curr_packet_header = next_packet_header; + + uint32_t packet_size_and_flags = next_packet_header->packet_size_bytes; + uint32_t packet_size_bytes = packet_size_and_flags & 0xFFFFFFFE; + + this->end_of_cmd = !(packet_size_and_flags & 1); + this->curr_packet_size_words = packet_size_bytes / PACKET_WORD_SIZE_BYTES; + + // Round up to next word to not truncate the data + static_assert(is_power_of_2(PACKET_WORD_SIZE_BYTES)); + if (packet_size_bytes & (PACKET_WORD_SIZE_BYTES - 1)) { + this->curr_packet_size_words++; + } + + // Set this to true right now so if the downstream _set_next_packet will not call + // start_next_packet again + this->curr_packet_valid = true; + this->curr_packet_words_sent = 0; + this->impl()->_set_next_packet(this->curr_packet_header); + } + +protected: + packet_queue_input_t() = default; + ~packet_queue_input_t() = default; + + void set_curr_packet_src(uint16_t src) { this->curr_packet_src = src; } + + void set_curr_packet_dest(uint16_t dest) { this->curr_packet_dest = dest; } + + void set_curr_packet_tag(uint32_t tag) { this->curr_packet_tag = tag; } + + void set_curr_packet_flags(uint16_t flags) { this->curr_packet_flags = flags; } + + void _init(const init_params_t* params) { + this->curr_packet_valid = false; + this->impl()->_init(params); + }; + +public: + // Returns true if the current packet is valid + inline bool get_curr_packet_valid() { + this->impl()->_update_local_wptr_val(); + + if (!this->curr_packet_valid) { + this->start_next_packet(); + } + + return this->curr_packet_valid; + } + + // Returns the packet source identifier + inline uint16_t 
get_curr_packet_src() { + if (!this->curr_packet_valid) { + this->start_next_packet(); + } + return this->curr_packet_src; + } + + // Returns the packet destination identifier + inline uint16_t get_curr_packet_dest() { + if (!this->curr_packet_valid) { + this->start_next_packet(); + } + return this->curr_packet_dest; + } + + // Returns the packet size in words + inline uint32_t get_curr_packet_size_words() { + if (!this->curr_packet_valid) { + this->start_next_packet(); + } + return this->curr_packet_size_words; + } + + // Returns the packet's tags. + inline uint32_t get_curr_packet_tag() { + if (!this->curr_packet_valid) { + this->start_next_packet(); + } + return this->curr_packet_tag; + } + + // Returns the packet's flags + inline uint16_t get_curr_packet_flags() { + if (!this->curr_packet_valid) { + this->start_next_packet(); + } + return this->curr_packet_flags; + } + + // Returns how many words left to send. This function will try to advance to the next + // packet if the current one is not valid. 
+ inline uint32_t get_curr_packet_words_remaining() { + if (!this->curr_packet_valid) { + this->start_next_packet(); + } + + return this->curr_packet_size_words - this->curr_packet_words_sent; + } + + // Returns a pointer to the current packet's header + inline tt_l1_ptr dispatch_packet_header_t* get_curr_packet_header_ptr() const { return this->curr_packet_header; } + + // Get end of cmd + inline uint16_t get_end_of_cmd() const { return this->end_of_cmd; } + + // Returns true if data for current packet is partially sent + inline bool partial_packet_sent() const { return this->curr_packet_valid && (this->curr_packet_words_sent > 0); } + + // Returns true if data for current packet has not started to be sent yet + inline bool curr_packet_start() const { return this->curr_packet_valid && (this->curr_packet_words_sent == 0); } + + // Returns true if the buffer has enough space for the current packet can be sent in one shot + // num_words_available_to_send will be sent to the words to send + inline bool full_packet_available_to_send(uint32_t& num_words_available_to_send) { + num_words_available_to_send = this->get_num_words_written_not_sent(); + + if (!num_words_available_to_send) { + return false; + } + return num_words_available_to_send >= this->get_curr_packet_words_remaining(); + } + + // Returns the number of words that can be written to the destination buffer. 
+ inline uint32_t get_curr_packet_num_words_available_to_send() { + this->impl()->_update_local_wptr_val(); + uint32_t num_words = this->get_num_words_written_not_sent(); + if (num_words == 0) { + return 0; + } + num_words = std::min(num_words, this->get_curr_packet_words_remaining()); + return num_words; + } + + // Advance the sent read pointer indicating copy has started + // Returns the number of words that were potentially skipped (e.g., alignment) + inline uint32_t advance_words_sent(uint32_t num_words) { + if (!num_words) { + return 0; + } + + this->advance_queue_local_rptr_sent(num_words); + this->advance_queue_remote_rptr_sent(num_words); + this->curr_packet_words_sent += num_words; + + // Current packet is done. Move to next packet. Maybe there will be an adjustment + // for alignment. + if (!this->get_curr_packet_words_remaining()) { + const auto adjustment = this->impl()->_align_rptr_sent(num_words); + this->curr_packet_valid = false; + this->start_next_packet(); + return adjustment; + } + + // No adjustment needed + return 0; + } + + // Advance the cleared read pointer indicating copy is complete + inline void advance_words_cleared(uint32_t num_words) { + if (!num_words) { + return; + } + + this->advance_queue_local_rptr_cleared(num_words); + this->advance_queue_remote_rptr_cleared(num_words); + + this->impl()->_align_rptr_cleared(num_words); + } + + // Advance cleared pointer to match sent + inline void clear_all_words_sent() { + uint32_t num_words = this->get_num_words_sent_not_cleared(); + if (num_words > 0) { + this->advance_words_cleared(num_words); + } + } +}; // packet_queue_input_t + +// Regular Input Queue with remote R +// R is the remote type +template +class input_queue_impl_t final : public packet_queue_input_t, R, false> { + friend packet_queue_input_t, R, false>; + +protected: + void _init(const init_params_t* params) {} + + inline void _set_next_packet(tt_l1_ptr dispatch_packet_header_t* header) { + 
this->set_curr_packet_dest(header->packet_dest); + this->set_curr_packet_src(header->packet_src); + this->set_curr_packet_tag(header->tag); + this->set_curr_packet_flags(header->packet_flags); + } + + inline void _update_local_wptr_val() {} + + inline uint32_t _align_rptr_sent(uint32_t num_words) { return 0; } + + inline void _align_rptr_cleared(uint32_t num_words) {} + +public: + input_queue_impl_t() = default; + ~input_queue_impl_t() = default; +}; // input_queue_impl_t + +// Paged Input Queue with remote R +// R is the remote type +template +class input_queue_cb_mode_impl_t final : public packet_queue_input_t, R, true> { +private: + friend packet_queue_input_t, R, true>; + + uint32_t packetizer_page_words_cleared; + + cb_mode_config_t config; + +protected: + inline void _init(const init_params_t* params) { + this->packetizer_page_words_cleared = 0; + this->set_curr_packet_src(params->packetizer_input_src); + this->set_curr_packet_dest(params->packetizer_input_dest); + this->set_curr_packet_flags(0); + this->set_curr_packet_tag(0xabcd); + + this->config.init(params->log_page_size, params->local_sem_id, params->remote_sem_id); + } + + inline void _set_next_packet(tt_l1_ptr dispatch_packet_header_t* header) { + // Update the current packet header to continue forwarding data as is + // to the original location by updating the headers to match + // prefetcher has size in bytes + header->packet_dest = this->get_curr_packet_dest(); + header->packet_src = this->get_curr_packet_src(); + header->tag = this->get_curr_packet_tag(); + header->packet_flags = this->get_curr_packet_flags(); + } + + inline void _update_local_wptr_val() { + this->cb_mode_local_sem_wptr_update(this->config.local_sem_id, this->config.page_size_words); + } + + inline uint32_t _align_rptr_sent(uint32_t num_words) { + return this->cb_mode_rptr_sent_advance_page_align(this->config.page_size_words); + } + + inline void _align_rptr_cleared(uint32_t num_words) { + this->packetizer_page_words_cleared += 
num_words; + uint32_t remote_sem_inc = 0; + while (this->packetizer_page_words_cleared >= this->config.page_size_words) { + remote_sem_inc++; + this->packetizer_page_words_cleared -= this->config.page_size_words; + } + this->cb_mode_inc_remote_sem_val(this->config.remote_sem_id, remote_sem_inc); + } +}; // input_queue_cb_mode_impl_t + +// (CRTP CRTP) Base Class for all output queues. Do not create instances of this directly (UB). +// T is the implementation +// R is the remote type +// InputNetworkTypeSequence types[i] is the network type for input queue i +// InputCBSequence cb_mode_enabled[i] is cb mode enabled for input queue i +// DisableRemotePtr will disable the remote ptr updates if the implementation uses a custom method. +template +class packet_queue_output_t + : public packet_queue_base_t< + packet_queue_output_t, + R, + DisableRemotePtr> { +private: + static_assert(InputNetworkTypeSequence::size == InputCBSequence::size); + static_assert(InputNetworkTypeSequence::size <= k_MaxInputOutputQueues); + + friend packet_queue_base_t< + packet_queue_output_t, + R, + DisableRemotePtr>; + + // How do we keep track of which variant is at each index? + // Number of output queues and output queue configuration does not change during runtime. + // This output_queue is templated to pass in a sequence of the input queue types. 
+ // Depending on which input_queues[i] is accessed, we can get the type info from the sequence + // template args + PacketInputQueueVariant* input_queues; + uint32_t num_input_queues; + + uint32_t words_in_flight[2 * k_MaxInputOutputQueues]; // 2X for curr and prev + uint32_t unpacketizer_page_words_sent; + bool unpacketizer_remove_header; + + // Pointer to words_in_flight[current queue] and [previous queue] + uint32_t* curr_input_queue_words_in_flight; + uint32_t* prev_input_queue_words_in_flight; + uint32_t curr_output_total_words_in_flight; + uint32_t prev_output_total_words_in_flight; + + inline T* impl() { return static_cast(this); } + +protected: + packet_queue_output_t() = default; + ~packet_queue_output_t() = default; + + void _init(const init_params_t* params) { + WAYPOINT("PQO0"); + ASSERT(params->num_input_queues <= k_MaxInputOutputQueues); + WAYPOINT("PQO1"); + + this->unpacketizer_page_words_sent = 0; + this->unpacketizer_remove_header = params->unpacketizer_output_remove_header; + + this->num_input_queues = params->num_input_queues; + this->input_queues = params->input_queues; + + this->curr_input_queue_words_in_flight = &(this->words_in_flight[0]); + this->prev_input_queue_words_in_flight = &(this->words_in_flight[k_MaxInputOutputQueues]); + this->curr_output_total_words_in_flight = 0; + this->prev_output_total_words_in_flight = 0; + for (uint32_t i = 0; i < 2 * k_MaxInputOutputQueues; i++) { // zero both the curr and prev halves; the pointers swap on every flush + this->words_in_flight[i] = 0; + } + + this->impl()->_init(params); + } + + // Set the unpacketizer page words sent + inline void set_unpacketizer_page_words_sent(uint32_t num_words) { this->unpacketizer_page_words_sent = num_words; } + + // Increment unpacketizer page words sent + inline void inc_unpacketizer_page_words_sent(uint32_t num_words) { + this->unpacketizer_page_words_sent += num_words; + } + + // Decrement unpacketizer page words sent + inline void dec_unpacketizer_page_words_sent(uint32_t num_words) { + this->unpacketizer_page_words_sent -= num_words; 
+ } + +public: + // Return unpacketizer remove header + inline bool get_unpacketizer_remove_header() const { return this->unpacketizer_remove_header; } + + // Returns the page words set for unpacketize header mode + inline uint32_t get_unpacketizer_page_words_sent() const { return this->unpacketizer_page_words_sent; } + + // Returns the total words in flight of the current output queue + inline uint32_t get_curr_output_total_words_in_flight() const { return this->curr_output_total_words_in_flight; } + + // Return the total words in flight of the previous output queue + inline uint32_t get_prev_output_total_words_in_flight() const { return this->prev_output_total_words_in_flight; } + + // Return the number of words that can be forwarded. The number of words that + // can be sent is the minimum of + // 1. words available in the input queue + // 2. words available in the input queue before the rptr will wrap + // 3. space available in the output buffer + // 4. space available in the output buffer before the wptr will wrap + // 5. 
maximum transmission size for the remote type + // InputQueueIndex is the queue index + template + inline uint32_t get_num_words_to_send() const { + static_assert(InputQueueIndex < k_MaxInputOutputQueues && InputQueueIndex < InputNetworkTypeSequence::size); + static constexpr auto k_max_send_words = remote_max_send_words_v; + + // Based on the InputQueueIndex, cast the variant element at that index to the correct + // type in the InputNetworkTypeSequence X InputCBSequence combo + auto* active_input_queue = this->input_queues[InputQueueIndex] + .template get< + InputNetworkTypeSequence::values[InputQueueIndex], + InputCBSequence::values[InputQueueIndex]>(); + + uint32_t num_words_available_in_input = active_input_queue->get_curr_packet_num_words_available_to_send(); + uint32_t num_words_before_input_rptr_wrap = active_input_queue->get_queue_words_before_rptr_sent_wrap(); + uint32_t num_words_free_in_output = this->get_queue_data_num_words_free(); + uint32_t output_buf_words_before_wptr_wrap = this->get_queue_words_before_wptr_wrap(); + + uint32_t num_words_to_forward = std::min(num_words_available_in_input, num_words_before_input_rptr_wrap); + num_words_to_forward = std::min(num_words_to_forward, num_words_free_in_output); + + if (num_words_to_forward == 0) { + return 0; + } + + num_words_to_forward = std::min(num_words_to_forward, output_buf_words_before_wptr_wrap); + num_words_to_forward = std::min(num_words_to_forward, k_max_send_words); + + return num_words_to_forward; + } + + // Advance words sent to cleared for all input queues + inline uint32_t prev_words_in_flight_flush() { + uint32_t words_flushed = this->get_prev_output_total_words_in_flight(); + if (words_flushed > 0) { + process_queues( + [&](auto index) -> bool { + auto active_input_queue = input_queues[index].template get(); + active_input_queue->advance_words_cleared(this->prev_input_queue_words_in_flight[index]); + this->prev_input_queue_words_in_flight[index] = 0; + return true; + }); + } + + // 
Swapping current to previous + std::swap(this->prev_input_queue_words_in_flight, this->curr_input_queue_words_in_flight); + this->prev_output_total_words_in_flight = this->curr_output_total_words_in_flight; + this->curr_output_total_words_in_flight = 0; + + return words_flushed; + } + + // Check if any words need to be flushed and flush + inline uint32_t prev_words_in_flight_check_flush() { return this->impl()->_prev_words_in_flight_check_flush(); } + + // Set words as in flight (sent) for an input queue + // InputQueueIndex is the queue index + template + inline void register_words_in_flight(uint32_t num_words) { + static_assert(InputQueueIndex < k_MaxInputOutputQueues && InputQueueIndex < InputNetworkTypeSequence::size); + auto active_input_queue = input_queues[InputQueueIndex] + .template get< + InputNetworkTypeSequence::values[InputQueueIndex], + InputCBSequence::values[InputQueueIndex]>(); + + uint32_t input_pad_words_skipped = active_input_queue->advance_words_sent(num_words); + + this->curr_input_queue_words_in_flight[InputQueueIndex] += (num_words + input_pad_words_skipped); + this->curr_output_total_words_in_flight += num_words; + } + + // Forward data from input to the remote output + // InputQueueIndex is the input queue index to forward + template + inline uint32_t forward_data_from_input(bool& full_packet_sent, uint16_t end_of_cmd) { + static_assert(InputQueueIndex < k_MaxInputOutputQueues && InputQueueIndex < InputNetworkTypeSequence::size); + uint32_t num_words = this->get_num_words_to_send(); + auto* active_input_queue = input_queues[InputQueueIndex] + .template get< + InputNetworkTypeSequence::values[InputQueueIndex], + InputCBSequence::values[InputQueueIndex]>(); + + // It will be possible to send the full packet one shot + full_packet_sent = (num_words == active_input_queue->get_curr_packet_words_remaining()); + if (!num_words) { + return 0; + } + + if (this->get_unpacketizer_remove_header() && active_input_queue->curr_packet_start()) { + // 
remove 1 word == header + num_words--; + this->register_words_in_flight(1); + if (!num_words) { + return 0; + } + } + + const uint32_t src_addr = + (active_input_queue->get_queue_start_addr_words() + active_input_queue->get_queue_local_rptr_sent()) * + PACKET_WORD_SIZE_BYTES; // Local + const uint32_t dest_addr = + (this->get_queue_start_addr_words() + this->get_queue_local_wptr()) * PACKET_WORD_SIZE_BYTES; // Remote + + this->remote.send_data(src_addr, dest_addr, num_words); + + this->register_words_in_flight(num_words); + + this->advance_queue_local_wptr(num_words); + this->impl()->_forward_data_complete(num_words, full_packet_sent, end_of_cmd); + + return num_words; + } + + // Block until all outputs are complete + inline bool output_barrier(uint32_t timeout_cycles = 0) { + uint32_t start_timestamp = 0; + if (timeout_cycles > 0) { + start_timestamp = get_timestamp_32b(); + } + + this->impl()->_barrier_setup(); + + while (this->get_queue_data_num_words_occupied() > 0) { + this->impl()->_barrier_process(); + + if (timeout_cycles > 0) { + uint32_t cycles_elapsed = get_timestamp_32b() - start_timestamp; + if (cycles_elapsed > timeout_cycles) { + return false; + } + } + } + + // Advance to cleared + this->prev_words_in_flight_flush(); + return true; + } +}; // packet_queue_output_t + +// Regular Output Queue with remote R +// R is the remote type +template +class output_queue_impl_t final : public packet_queue_output_t< + output_queue_impl_t, + R, + InputNetworkTypeSequence, + InputCBSequence, + false> { + friend packet_queue_output_t< + output_queue_impl_t, + R, + InputNetworkTypeSequence, + InputCBSequence, + false>; + +protected: + void _init(const init_params_t* params) {} + + inline uint32_t _prev_words_in_flight_check_flush() { + if (this->get_num_words_written_not_sent() <= this->get_curr_output_total_words_in_flight()) { + return this->prev_words_in_flight_flush(); + } else { + return 0; + } + } + + inline void _forward_data_complete(uint32_t words_forwarded, 
bool full_packet_sent, uint16_t end_of_cmd) { + this->advance_queue_remote_wptr(words_forwarded); + } + + inline void _barrier_setup() {} + + inline void _barrier_process() {} +}; // output_queue_impl_t + +// Paged Output Queue with remote R +// R is the remote type +template +class output_queue_cb_mode_impl_t final : public packet_queue_output_t< + output_queue_cb_mode_impl_t, + R, + InputNetworkTypeSequence, + InputCBSequence, + true> { +private: + friend packet_queue_output_t< + output_queue_cb_mode_impl_t, + R, + InputNetworkTypeSequence, + InputCBSequence, + true>; + cb_mode_config_t config; + +protected: + void _init(const init_params_t* params) { + this->config.init(params->log_page_size, params->local_sem_id, params->remote_sem_id); + } + + inline uint32_t _prev_words_in_flight_check_flush() { + uint32_t words_written_not_sent = this->get_num_words_written_not_sent(); + noc_async_writes_flushed(); + this->advance_queue_local_rptr_sent(words_written_not_sent); + + uint32_t words_flushed = this->prev_words_in_flight_flush(); + this->cb_mode_local_sem_rptr_cleared_update(this->config.local_sem_id, this->config.page_size_words); + return words_flushed; + } + + inline void _forward_data_complete(uint32_t words_forwarded, bool full_packet_sent, uint16_t end_of_cmd) { + this->inc_unpacketizer_page_words_sent(words_forwarded); + + if (full_packet_sent && end_of_cmd) { + uint32_t unpacketizer_page_words_sent_past_page_bound = + this->get_unpacketizer_page_words_sent() & this->config.page_size_mask; + if (unpacketizer_page_words_sent_past_page_bound > 0) { + uint32_t pad_words = this->config.page_size_words - unpacketizer_page_words_sent_past_page_bound; + this->inc_unpacketizer_page_words_sent(pad_words); + this->advance_queue_local_wptr(pad_words); + } + } + + uint32_t remote_sem_inc = 0; + while (this->get_unpacketizer_page_words_sent() >= this->config.page_size_words) { + this->dec_unpacketizer_page_words_sent(this->config.page_size_words); + remote_sem_inc++; + 
} + this->cb_mode_inc_remote_sem_val(this->config.remote_sem_id, remote_sem_inc); + } + + inline void _barrier_setup() { noc_async_writes_flushed(); } + + inline void _barrier_process() { + this->cb_mode_local_sem_rptr_cleared_update(this->config.local_sem_id, this->config.page_size_words); + if (this->cb_mode_local_sem_downstream_complete(this->config.local_sem_id)) { + // There is no guarantee that dispatch_h will increment semaphore for all commands + // (specifically the final terminate command). + // So just clear whatever remains once the completion signal is received. + uint32_t words_occupied = this->get_queue_data_num_words_occupied(); + this->advance_queue_local_rptr_cleared(words_occupied); + } + } +}; // output_queue_cb_mode_impl_t + +/* +********************** +* * +* Functions * +* * +********************** +*/ + +// Wait for all input and output queues and their remotes to signal Ready on the remote ready status +template < + typename InputNetworkTypeSequence, + typename InputCBSequence, + typename OutputNetworkTypeSequence, + typename OutputCBSequence> +bool wait_all_input_output_ready( + PacketInputQueueVariant* input_queues, PacketOutputQueueVariant* output_queues, uint32_t timeout_cycles = 0) { + static_assert(InputNetworkTypeSequence::size == InputCBSequence::size); + static_assert(OutputNetworkTypeSequence::size == OutputCBSequence::size); + + bool src_ready[InputNetworkTypeSequence::size]; + bool dest_ready[OutputNetworkTypeSequence::size]; + bool all_src_dest_ready = false; + uint32_t iters = 0; + + std::fill_n(src_ready, InputNetworkTypeSequence::size, false); + std::fill_n(dest_ready, OutputNetworkTypeSequence::size, false); + + uint32_t start_timestamp = get_timestamp_32b(); + while (!all_src_dest_ready) { + iters++; + if (timeout_cycles > 0) { + uint32_t cycles_since_start = get_timestamp_32b() - start_timestamp; + if (cycles_since_start > timeout_cycles) { + return false; + } + } + + all_src_dest_ready = true; + // checking input 
queues + process_queues( + [&](auto index) -> bool { + if (!src_ready[index]) { + auto* active_input_queue = input_queues[index].template get(); + src_ready[index] = cb_mode || active_input_queue->is_remote_ready(); + if (!src_ready[index]) { + active_input_queue->send_remote_ready_notification(); + all_src_dest_ready = false; + } else { + // handshake with src complete + } + } + + return true; // keep looping through other queues + }); + + // checking output queues + process_queues( + [&](auto index) -> bool { + if (!dest_ready[index]) { + auto* active_output_queue = + output_queues[index] + .template get(); + dest_ready[index] = cb_mode || active_output_queue->is_remote_ready(); + if (dest_ready[index]) { + active_output_queue->send_remote_ready_notification(); + } else { + all_src_dest_ready = false; + } + } + return true; + }); + +#if defined(COMPILE_FOR_ERISC) + // Just for init purposes it's ok to keep context switching + internal_::risc_context_switch(); +#endif + } + return true; +} + +}; // namespace packet_queue diff --git a/tt_metal/impl/dispatch/kernels/vc_eth_tunneler.cpp b/tt_metal/impl/dispatch/kernels/vc_eth_tunneler.cpp index dc5cd06c75db..a5096d81d6d7 100644 --- a/tt_metal/impl/dispatch/kernels/vc_eth_tunneler.cpp +++ b/tt_metal/impl/dispatch/kernels/vc_eth_tunneler.cpp @@ -2,20 +2,20 @@ // // SPDX-License-Identifier: Apache-2.0 -// clang-format off #include "dataflow_api.h" -#include "tt_metal/impl/dispatch/kernels/packet_queue.hpp" -// clang-format on +#include "packet_queue_ctrl.hpp" +#include "tt_metal/impl/dispatch/kernels/packet_queue_v2.hpp" +#include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" +#include "tt_metal/impl/dispatch/kernels/cq_helpers.hpp" +#include "debug/dprint.h" -packet_input_queue_state_t input_queues[MAX_TUNNEL_LANES]; -packet_output_queue_state_t output_queues[MAX_TUNNEL_LANES]; +using namespace packet_queue; constexpr uint32_t endpoint_id_start_index = get_compile_time_arg_val(0); constexpr uint32_t 
tunnel_lanes = get_compile_time_arg_val(1); constexpr uint32_t in_queue_start_addr_words = get_compile_time_arg_val(2); constexpr uint32_t in_queue_size_words = get_compile_time_arg_val(3); constexpr uint32_t in_queue_size_bytes = in_queue_size_words * PACKET_WORD_SIZE_BYTES; -static_assert(is_power_of_2(in_queue_size_words), "in_queue_size_words must be a power of 2"); static_assert(tunnel_lanes <= MAX_TUNNEL_LANES, "cannot have more than 2 tunnel directions."); static_assert(tunnel_lanes, "tunnel directions cannot be 0. 1 => Unidirectional. 2 => Bidirectional"); @@ -103,17 +103,6 @@ constexpr uint32_t remote_receiver_queue_size_words[MAX_TUNNEL_LANES] = get_compile_time_arg_val(33) }; -static_assert(is_power_of_2(remote_receiver_queue_size_words[0]), "remote_receiver_queue_size_words must be a power of 2"); -static_assert(is_power_of_2(remote_receiver_queue_size_words[1]), "remote_receiver_queue_size_words must be a power of 2"); -static_assert(is_power_of_2(remote_receiver_queue_size_words[2]), "remote_receiver_queue_size_words must be a power of 2"); -static_assert(is_power_of_2(remote_receiver_queue_size_words[3]), "remote_receiver_queue_size_words must be a power of 2"); -static_assert(is_power_of_2(remote_receiver_queue_size_words[4]), "remote_receiver_queue_size_words must be a power of 2"); -static_assert(is_power_of_2(remote_receiver_queue_size_words[5]), "remote_receiver_queue_size_words must be a power of 2"); -static_assert(is_power_of_2(remote_receiver_queue_size_words[6]), "remote_receiver_queue_size_words must be a power of 2"); -static_assert(is_power_of_2(remote_receiver_queue_size_words[7]), "remote_receiver_queue_size_words must be a power of 2"); -static_assert(is_power_of_2(remote_receiver_queue_size_words[8]), "remote_receiver_queue_size_words must be a power of 2"); -static_assert(is_power_of_2(remote_receiver_queue_size_words[9]), "remote_receiver_queue_size_words must be a power of 2"); - constexpr uint32_t 
remote_sender_x[MAX_TUNNEL_LANES] = { (get_compile_time_arg_val(34) & 0xFF), @@ -170,7 +159,6 @@ constexpr DispatchRemoteNetworkType remote_sender_network_type[MAX_TUNNEL_LANES] static_cast((get_compile_time_arg_val(43) >> 24) & 0xFF) }; - constexpr uint32_t kernel_status_buf_addr_arg = get_compile_time_arg_val(44); constexpr uint32_t kernel_status_buf_size_bytes = get_compile_time_arg_val(45); @@ -180,42 +168,172 @@ tt_l1_ptr uint32_t* const kernel_status = reinterpret_cast( constexpr uint32_t timeout_cycles = get_compile_time_arg_val(46); constexpr uint32_t inner_stop_mux_d_bypass = get_compile_time_arg_val(47); +constexpr uint32_t vc_eth_tunneler_input_scratch_buffers[MAX_TUNNEL_LANES] = { + get_compile_time_arg_val(48), + get_compile_time_arg_val(49), + get_compile_time_arg_val(50), + get_compile_time_arg_val(51), + get_compile_time_arg_val(52), + get_compile_time_arg_val(53), + get_compile_time_arg_val(54), + get_compile_time_arg_val(55), + get_compile_time_arg_val(56), + get_compile_time_arg_val(57), +}; + +constexpr uint32_t vc_eth_tunneler_input_remote_scratch_buffers[MAX_TUNNEL_LANES] = { + get_compile_time_arg_val(58), + get_compile_time_arg_val(59), + get_compile_time_arg_val(60), + get_compile_time_arg_val(61), + get_compile_time_arg_val(62), + get_compile_time_arg_val(63), + get_compile_time_arg_val(64), + get_compile_time_arg_val(65), + get_compile_time_arg_val(66), + get_compile_time_arg_val(67), +}; + +constexpr uint32_t vc_eth_tunneler_output_scratch_buffers[MAX_TUNNEL_LANES] = { + get_compile_time_arg_val(68), + get_compile_time_arg_val(69), + get_compile_time_arg_val(70), + get_compile_time_arg_val(71), + get_compile_time_arg_val(72), + get_compile_time_arg_val(73), + get_compile_time_arg_val(74), + get_compile_time_arg_val(75), + get_compile_time_arg_val(76), + get_compile_time_arg_val(77), +}; + +constexpr uint32_t vc_eth_tunneler_output_remote_scratch_buffers[MAX_TUNNEL_LANES] = { + get_compile_time_arg_val(78), + get_compile_time_arg_val(79), 
+ get_compile_time_arg_val(80), + get_compile_time_arg_val(81), + get_compile_time_arg_val(82), + get_compile_time_arg_val(83), + get_compile_time_arg_val(84), + get_compile_time_arg_val(85), + get_compile_time_arg_val(86), + get_compile_time_arg_val(87), +}; + +PacketInputQueueVariant raw_input_queues[MAX_TUNNEL_LANES]; +using input_queue_network_sequence = NetworkTypeSequence; +using input_queue_cb_mode_sequence = CBModeTypeSequence; + +PacketOutputQueueVariant raw_output_queues[MAX_TUNNEL_LANES]; +using output_queue_network_sequence = NetworkTypeSequence; +using output_queue_cb_mode_sequence = CBModeTypeSequence; + + +void initialize_input_queues() { + init_params_t init_params{ + .is_input = true, + }; + process_queues([&](auto) -> bool { + raw_input_queues[sequence_i].template engage(); + + auto* active_input_queue = raw_input_queues[sequence_i].template get(); + init_params.queue_id = (uint8_t)sequence_i; + init_params.queue_start_addr_words = in_queue_start_addr_words + sequence_i * in_queue_size_words; + init_params.queue_size_words = in_queue_size_words; + init_params.remote_queue_id = (uint8_t)remote_sender_queue_id[sequence_i]; + init_params.remote_x = remote_sender_x[sequence_i]; + init_params.remote_y = remote_sender_y[sequence_i]; + init_params.ptrs_addr = vc_eth_tunneler_input_scratch_buffers[sequence_i]; + init_params.remote_ptrs_addr = vc_eth_tunneler_input_remote_scratch_buffers[sequence_i]; + + active_input_queue->init(&init_params); + + return true; + }); +} + +void initialize_output_queue() { + init_params_t init_params{}; + process_queues([&](auto) -> bool { + // Sequence number for input queues should line up with the output queues + // input network/cb mode sequence in here is not the same as the global one + // each output queue only has 1 input queue connected to it + using this_input_networks = NetworkTypeSequence; + using this_input_cb_mode = CBModeTypeSequence; + + raw_output_queues[sequence_i].template engage(); + + auto* 
active_output_queue = raw_output_queues[sequence_i].template get(); + + init_params.queue_id = (uint8_t)sequence_i + tunnel_lanes; + init_params.queue_start_addr_words = remote_receiver_queue_start_addr_words[sequence_i]; + init_params.queue_size_words = remote_receiver_queue_size_words[sequence_i]; + init_params.remote_queue_id = (uint8_t)remote_receiver_queue_id[sequence_i]; + init_params.remote_x = remote_receiver_x[sequence_i]; + init_params.remote_y = remote_receiver_y[sequence_i]; + init_params.ptrs_addr = vc_eth_tunneler_output_scratch_buffers[sequence_i]; + init_params.remote_ptrs_addr = vc_eth_tunneler_output_remote_scratch_buffers[sequence_i]; + + init_params.input_queues = &raw_input_queues[sequence_i]; + init_params.num_input_queues = 1; + + active_output_queue->init(&init_params); + + return true; + }); +} + #define SWITCH_THRESHOLD 16 void kernel_main() { rtos_context_switch_ptr = (void (*)())RtosTable[0]; write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_STARTED); write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000000); - write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX + 1, 0xbb000000); - write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX + 2, 0xAABBCCDD); - write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX + 3, 0xDDCCBBAA); write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX + 4, endpoint_id_start_index); - for (uint32_t i = 0; i < tunnel_lanes; i++) { - input_queues[i].init( - i, - in_queue_start_addr_words + i * in_queue_size_words, - in_queue_size_words, - remote_sender_x[i], - remote_sender_y[i], - remote_sender_queue_id[i], - remote_sender_network_type[i]); - } - - for (uint32_t i = 0; i < tunnel_lanes; i++) { - output_queues[i].init( - i + tunnel_lanes, //MAX_TUNNEL_LANES, - remote_receiver_queue_start_addr_words[i], - remote_receiver_queue_size_words[i], - remote_receiver_x[i], - remote_receiver_y[i], - remote_receiver_queue_id[i], - remote_receiver_network_type[i], - &input_queues[i], - 1); 
- } + initialize_input_queues(); + initialize_output_queue(); - if (!wait_all_src_dest_ready(input_queues, tunnel_lanes, output_queues, tunnel_lanes, timeout_cycles)) { + if (!wait_all_input_output_ready(raw_input_queues, raw_output_queues, timeout_cycles)) { write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_TIMEOUT); return; } @@ -226,37 +344,58 @@ void kernel_main() { uint64_t data_words_sent = 0; uint64_t iter = 0; uint64_t start_timestamp = get_timestamp(); + uint32_t heartbeat = 0; uint32_t switch_counter = 0; while (!all_outputs_finished) { + IDLE_ERISC_HEARTBEAT_AND_RETURN(heartbeat); iter++; switch_counter++; all_outputs_finished = switch_counter >= SWITCH_THRESHOLD; - for (uint32_t i = 0; i < tunnel_lanes; i++) { - if (input_queues[i].get_curr_packet_valid()) { - bool full_packet_sent; - uint32_t words_sent = - output_queues[i].forward_data_from_input(0, full_packet_sent, input_queues[i].get_end_of_cmd()); - data_words_sent += words_sent; - if (words_sent > 0) { - switch_counter = 0; - all_outputs_finished = false; + process_queues([&](auto) -> bool { + using this_input_networks = NetworkTypeSequence; + using this_input_cb_mode = CBModeTypeSequence; + + auto* active_input_queue = raw_input_queues[sequence_i].template get(); + auto* active_output_queue = raw_output_queues[sequence_i].template get(); + + active_input_queue->handle_recv(); + active_output_queue->handle_recv(); + + // No progress will be made when either of the queues are waiting + // If we are waiting too long, no progress will be made, and + // the while loop will exit + if (active_input_queue->busy() || active_output_queue->busy()) { + all_outputs_finished = false; + } else { + if (active_input_queue->get_curr_packet_valid()) { + bool full_packet_sent; + uint32_t words_sent = active_output_queue->template forward_data_from_input<0>(full_packet_sent, active_input_queue->get_end_of_cmd()); + data_words_sent += words_sent; + if (words_sent > 0) { + switch_counter = 0; 
+ all_outputs_finished = false; + } } - } - output_queues[i].prev_words_in_flight_check_flush(); - if (switch_counter >= SWITCH_THRESHOLD) { - bool output_finished = output_queues[i].is_remote_finished(); - if (output_finished) { - uint32_t return_vc = (inner_stop_mux_d_bypass >> 24) & 0xFF; - if ((i == return_vc) && (inner_stop_mux_d_bypass != 0)) { - input_queues[i].remote_x = inner_stop_mux_d_bypass & 0xFF; - input_queues[i].remote_y = (inner_stop_mux_d_bypass >> 8) & 0xFF; - input_queues[i].set_remote_ready_status_addr((inner_stop_mux_d_bypass >> 16) & 0xFF); + + active_output_queue->prev_words_in_flight_check_flush(); + + if (switch_counter >= SWITCH_THRESHOLD) { + bool output_finished = active_output_queue->is_remote_finished(); + if (output_finished) { + uint32_t return_vc = (inner_stop_mux_d_bypass >> 24) & 0xFF; + if ((sequence_i == return_vc) && (inner_stop_mux_d_bypass != 0)) { + active_input_queue->set_final_remote_xy(inner_stop_mux_d_bypass & 0xFF, (inner_stop_mux_d_bypass >> 8) & 0xFF); + active_input_queue->set_remote_ready_status_addr((inner_stop_mux_d_bypass >> 16) & 0xFF); + } + active_input_queue->send_remote_finished_notification(); } - input_queues[i].send_remote_finished_notification(); + all_outputs_finished &= output_finished; } - all_outputs_finished &= output_finished; } - } + + return true; + }); + uint32_t launch_msg_rd_ptr = *GET_MAILBOX_ADDRESS_DEV(launch_msg_rd_ptr); tt_l1_ptr launch_msg_t * const launch_msg = GET_MAILBOX_ADDRESS_DEV(launch[launch_msg_rd_ptr]); if (launch_msg->kernel_config.exit_erisc_kernel) { @@ -268,13 +407,18 @@ void kernel_main() { internal_::risc_context_switch(); switch_counter = SWITCH_THRESHOLD; } - } + bool timeout = false; write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000002); - for (uint32_t i = 0; i < tunnel_lanes; i++) { - output_queues[i].output_barrier(); - } + process_queues([&](auto) -> bool { + auto* active_output_queue = raw_output_queues[sequence_i].template get(); + if 
(!active_output_queue->output_barrier(timeout_cycles)) { + timeout = true; + return false; + } + return true; + }); uint64_t cycles_elapsed = get_timestamp() - start_timestamp; write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000003); @@ -283,6 +427,10 @@ void kernel_main() { set_64b_result(kernel_status, cycles_elapsed, PQ_TEST_CYCLES_INDEX); set_64b_result(kernel_status, iter, PQ_TEST_ITER_INDEX); - write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_PASS); - write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff00005); + if (timeout) { + write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_TIMEOUT); + } else { + write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_PASS); + write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff00005); + } } diff --git a/tt_metal/impl/dispatch/kernels/vc_packet_router.cpp b/tt_metal/impl/dispatch/kernels/vc_packet_router.cpp index 34ff7feac268..86d95319d294 100644 --- a/tt_metal/impl/dispatch/kernels/vc_packet_router.cpp +++ b/tt_metal/impl/dispatch/kernels/vc_packet_router.cpp @@ -4,15 +4,16 @@ #include "dataflow_api.h" #include "debug/dprint.h" -#include "tt_metal/impl/dispatch/kernels/packet_queue.hpp" #include "tt_metal/impl/dispatch/kernels/cq_helpers.hpp" +#include "packet_queue_ctrl.hpp" +#include "tt_metal/impl/dispatch/kernels/packet_queue_v2.hpp" + +using namespace packet_queue; constexpr uint32_t rx_queue_start_addr_words = get_compile_time_arg_val(1); constexpr uint32_t rx_queue_size_words = get_compile_time_arg_val(2); constexpr uint32_t rx_queue_size_bytes = rx_queue_size_words*PACKET_WORD_SIZE_BYTES; -static_assert(is_power_of_2(rx_queue_size_words), "rx_queue_size_words must be a power of 2"); - constexpr uint32_t router_lanes = get_compile_time_arg_val(3); // FIXME imatosevic - is there a way to do this without explicit indexes? 
@@ -69,11 +70,6 @@ constexpr uint32_t remote_tx_queue_size_words[MAX_SWITCH_FAN_OUT] = get_compile_time_arg_val(15) }; -static_assert(is_power_of_2(remote_tx_queue_size_words[0]), "remote_tx_queue_size_words must be a power of 2"); -static_assert((router_lanes < 2) || is_power_of_2(remote_tx_queue_size_words[1]), "remote_tx_queue_size_words must be a power of 2"); -static_assert((router_lanes < 3) || is_power_of_2(remote_tx_queue_size_words[2]), "remote_tx_queue_size_words must be a power of 2"); -static_assert((router_lanes < 4) || is_power_of_2(remote_tx_queue_size_words[3]), "remote_tx_queue_size_words must be a power of 2"); - constexpr uint8_t remote_rx_x[MAX_SWITCH_FAN_OUT] = { (get_compile_time_arg_val(16) & 0xFF), @@ -203,37 +199,136 @@ constexpr uint8_t input_packetize_dest_endpoint[MAX_SWITCH_FAN_IN] = (get_compile_time_arg_val(35) >> 24) & 0xFF }; -packet_input_queue_state_t input_queues[MAX_SWITCH_FAN_IN]; -packet_output_queue_state_t output_queues[MAX_SWITCH_FAN_OUT]; +constexpr uint32_t vc_packet_router_input_scratch_buffers[MAX_SWITCH_FAN_IN] = /* Compile-time args 36..39, one per input lane; handed to each input queue as init_params.ptrs_addr. NOTE(review): presumably the L1 buffers that replace the pointer registers (see commit message); confirm. */ + { + get_compile_time_arg_val(36), + get_compile_time_arg_val(37), + get_compile_time_arg_val(38), + get_compile_time_arg_val(39), + }; + +constexpr uint32_t vc_packet_router_input_remote_scratch_buffers[MAX_SWITCH_FAN_IN] = /* Compile-time args 40..43, one per input lane; handed to each input queue as init_params.remote_ptrs_addr. */ + { + get_compile_time_arg_val(40), + get_compile_time_arg_val(41), + get_compile_time_arg_val(42), + get_compile_time_arg_val(43), + }; + +constexpr uint32_t vc_packet_router_output_scratch_buffers[MAX_SWITCH_FAN_OUT] = /* Compile-time args 44..47, one per output lane; handed to each output queue as init_params.ptrs_addr. */ + { + get_compile_time_arg_val(44), + get_compile_time_arg_val(45), + get_compile_time_arg_val(46), + get_compile_time_arg_val(47), + }; + +constexpr uint32_t vc_packet_router_output_remote_scratch_buffers[MAX_SWITCH_FAN_OUT] = /* Compile-time args 48..51, one per output lane; handed to each output queue as init_params.remote_ptrs_addr. */ + { + get_compile_time_arg_val(48), + get_compile_time_arg_val(49), + get_compile_time_arg_val(50), + get_compile_time_arg_val(51), + }; + +PacketInputQueueVariant raw_input_queues[MAX_SWITCH_FAN_IN]; /* One variant slot per input lane; the concrete queue type is selected by engage() in initialize_input_queues(). */ +using input_queue_network_sequence =
NetworkTypeSequence; +using input_queue_cb_mode_sequence = CBModeTypeSequence; + +PacketOutputQueueVariant raw_output_queues[MAX_SWITCH_FAN_OUT]; +using output_queue_network_sequence = NetworkTypeSequence; +using output_queue_cb_mode_sequence = CBModeTypeSequence; + +inline void initialize_input_queues() { /* Engages the variant slot of every input lane and initialises it from the compile-time rx and packetizer tables. One init_params_t is reused for all lanes; every lane-specific field is rewritten before each init() call. */ + init_params_t init_params{ + .is_input = true, + .queue_size_words = rx_queue_size_words, + }; + + process_queues([&](auto) -> bool { + raw_input_queues[sequence_i].template engage(); + + auto* active_input_queue = raw_input_queues[sequence_i].template get(); + init_params.queue_id = (uint8_t)sequence_i; + init_params.queue_start_addr_words = rx_queue_start_addr_words + sequence_i * rx_queue_size_words; /* input queues are laid out back to back, rx_queue_size_words apart */ + init_params.queue_size_words = rx_queue_size_words; + init_params.remote_queue_id = (uint8_t)remote_rx_queue_id[sequence_i]; + init_params.remote_x = remote_rx_x[sequence_i]; + init_params.remote_y = remote_rx_y[sequence_i]; + init_params.ptrs_addr = vc_packet_router_input_scratch_buffers[sequence_i]; + init_params.remote_ptrs_addr = vc_packet_router_input_remote_scratch_buffers[sequence_i]; + + init_params.cb_mode = input_packetize[sequence_i]; + init_params.local_sem_id = (uint8_t)input_packetize_local_sem[sequence_i]; + init_params.remote_sem_id = (uint8_t)input_packetize_upstream_sem[sequence_i]; + init_params.log_page_size = (uint8_t)input_packetize_log_page_size[sequence_i]; + + init_params.packetizer_input_src = input_packetize_src_endpoint[sequence_i]; + init_params.packetizer_input_dest = input_packetize_dest_endpoint[sequence_i]; + + active_input_queue->init(&init_params); + + return true; /* keep iterating over the remaining lanes */ + }); +} + +inline void initialize_output_queues() { /* Engages the variant slot of every output lane and initialises it from the compile-time tx and depacketizer tables. Output lane sequence_i is fed by exactly one input queue, raw_input_queues[sequence_i]. Each field is assigned in its own statement so the init() call does not depend on comma-operator chaining. */ + init_params_t init_params{}; + + process_queues([&](auto) -> bool { + // Sequence number for input queues should line up with the output queues + // input network/cb mode sequence in here is not the same as the global one + // each output queue only has 1 input queue connected to it + using this_input_networks
= NetworkTypeSequence; + using this_input_cb_mode = CBModeTypeSequence; + + raw_output_queues[sequence_i].template engage(); + + auto* active_output_queue = raw_output_queues[sequence_i].template get(); + + init_params.queue_id = (uint8_t)sequence_i + router_lanes; /* output queue ids are offset by router_lanes; the input queues use sequence_i itself */ + init_params.queue_start_addr_words = remote_tx_queue_start_addr_words[sequence_i]; + init_params.queue_size_words = remote_tx_queue_size_words[sequence_i]; + init_params.remote_queue_id = (uint8_t)remote_tx_queue_id[sequence_i]; + init_params.remote_x = remote_tx_x[sequence_i]; + init_params.remote_y = remote_tx_y[sequence_i]; + init_params.ptrs_addr = vc_packet_router_output_scratch_buffers[sequence_i]; + init_params.remote_ptrs_addr = vc_packet_router_output_remote_scratch_buffers[sequence_i]; + + init_params.cb_mode = output_depacketize[sequence_i]; + init_params.local_sem_id = (uint8_t)output_depacketize_local_sem[sequence_i]; + init_params.remote_sem_id = (uint8_t)output_depacketize_downstream_sem[sequence_i]; + init_params.log_page_size = (uint8_t)output_depacketize_log_page_size[sequence_i]; + + init_params.input_queues = &raw_input_queues[sequence_i]; + init_params.num_input_queues = 1; + + init_params.unpacketizer_output_remove_header = output_depacketize_remove_header[sequence_i]; + + active_output_queue->init(&init_params); + + return true; /* keep iterating over the remaining lanes */ + }); +} void kernel_main() { write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_STARTED); write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000000); write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX+1, 0xbb000000 | router_lanes); - for (uint32_t i = 0; i < router_lanes; i++) { - input_queues[i].init(i, rx_queue_start_addr_words + i*rx_queue_size_words, rx_queue_size_words, - remote_rx_x[i], remote_rx_y[i], remote_rx_queue_id[i], remote_rx_network_type[i], - input_packetize[i], input_packetize_log_page_size[i], - input_packetize_local_sem[i], input_packetize_upstream_sem[i], - input_packetize_src_endpoint[i],
input_packetize_dest_endpoint[i]); - - output_queues[i].init(i + router_lanes, remote_tx_queue_start_addr_words[i], remote_tx_queue_size_words[i], - remote_tx_x[i], remote_tx_y[i], remote_tx_queue_id[i], remote_tx_network_type[i], - &input_queues[i], 1, - output_depacketize[i], output_depacketize_log_page_size[i], - output_depacketize_local_sem[i], output_depacketize_downstream_sem[i], - output_depacketize_remove_header[i]); - } + initialize_input_queues(); + initialize_output_queues(); - if (!wait_all_src_dest_ready(input_queues, router_lanes, output_queues, router_lanes, timeout_cycles)) { + if (!wait_all_input_output_ready(raw_input_queues, raw_output_queues, timeout_cycles)) { write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_TIMEOUT); return; } write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000001); - uint32_t curr_input = 0; bool timeout = false; bool all_outputs_finished = false; uint64_t data_words_sent = 0; @@ -241,56 +336,55 @@ void kernel_main() { uint64_t start_timestamp = get_timestamp(); uint32_t progress_timestamp = start_timestamp & 0xFFFFFFFF; uint32_t heartbeat = 0; - while (!all_outputs_finished && !timeout) { + while (!all_outputs_finished) { IDLE_ERISC_HEARTBEAT_AND_RETURN(heartbeat); iter++; - if (timeout_cycles > 0) { - uint32_t cycles_since_progress = get_timestamp_32b() - progress_timestamp; - if (cycles_since_progress > timeout_cycles) { - timeout = true; - break; - } - } - if (input_queues[curr_input].get_curr_packet_valid()) { - bool full_packet_sent; - uint32_t words_sent = output_queues[curr_input].forward_data_from_input(0, full_packet_sent, input_queues[curr_input].get_end_of_cmd()); - data_words_sent += words_sent; - if ((words_sent > 0) && (timeout_cycles > 0)) { - progress_timestamp = get_timestamp_32b(); + + process_queues([&](auto) -> bool { + using this_input_networks = NetworkTypeSequence; + using this_input_cb_mode = CBModeTypeSequence; + + auto* active_input_queue = 
raw_input_queues[sequence_i].template get(); + auto* active_output_queue = raw_output_queues[sequence_i].template get(); + + if (active_input_queue->get_curr_packet_valid()) { + bool full_packet_sent; + data_words_sent += active_output_queue->template forward_data_from_input<0>(full_packet_sent, active_input_queue->get_end_of_cmd()); } - } - output_queues[curr_input].prev_words_in_flight_check_flush(); + active_output_queue->prev_words_in_flight_check_flush(); + + return true; + }); if ((iter & 0xFF) == 0) { all_outputs_finished = true; - for (uint32_t i = 0; i < router_lanes; i++) { - all_outputs_finished &= output_queues[i].is_remote_finished(); - } - } - - curr_input++; - if (curr_input == router_lanes) { - curr_input = 0; + process_queues([&](auto) -> bool { + auto* active_output_queue = raw_output_queues[sequence_i].template get(); + all_outputs_finished &= active_output_queue->is_remote_finished(); + return true; + }); } } - if (!timeout) { - write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000002); - for (uint32_t i = 0; i < router_lanes; i++) { - if (!output_queues[i].output_barrier(timeout_cycles)) { - timeout = true; - break; - } + write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000002); + process_queues([&](auto) -> bool { + auto* active_output_queue = raw_output_queues[sequence_i].template get(); + if (!active_output_queue->output_barrier(timeout_cycles)) { + timeout = true; + return false; } - } + return true; + }); uint64_t cycles_elapsed = get_timestamp() - start_timestamp; if (!timeout) { write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000003); - for (uint32_t i = 0; i < router_lanes; i++) { - input_queues[i].send_remote_finished_notification(); - } + process_queues([&](auto i) -> bool { + auto* active_input_queue = raw_input_queues[i].template get(); + active_input_queue->send_remote_finished_notification(); + return true; + }); } set_64b_result(kernel_status, data_words_sent, PQ_TEST_WORD_CNT_INDEX);