Skip to content

Commit

Permalink
#14427: packet queue improvements
Browse files Browse the repository at this point in the history
    - Removed power of 2 constraint

    - Replaced registers with L1 buffers. Registers are still used for
      the handshaking process though.

    - Refactor packet queue into specialized variants

    - Removed unnecessary branching in packet queue and kernels

    - Removed the "if not valid then start next packet" logic in the
      input queue curr packet getters and made them const.
      This lowers the branches by 5 when looping

    - Move many checks to compile-time

    - Removed the timeout logic in kernels. It was only used for testing
      purposes and slows down code in prod. If it hangs then you
      know your test failed
  • Loading branch information
nhuang-tt committed Dec 16, 2024
1 parent 6d7cc2c commit ccf7535
Show file tree
Hide file tree
Showing 32 changed files with 4,896 additions and 5,750 deletions.
5 changes: 0 additions & 5 deletions tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,7 @@ set(PERF_MICROBENCH_TESTS_SRCS
routing/test_mux_demux.cpp
routing/test_vc_mux_demux.cpp
routing/test_mux_demux_2level.cpp
routing/test_tunnel_1cq.cpp
routing/test_tunnel_2cq.cpp
routing/test_uni_tunnel.cpp
routing/test_vc_uni_tunnel.cpp
routing/test_uni_tunnel_single_chip.cpp
routing/test_bi_tunnel.cpp
routing/test_vc_bi_tunnel_2ep.cpp
routing/test_vc_bi_tunnel_4ep.cpp
routing/test_vc_loopback_tunnel.cpp
Expand Down
211 changes: 120 additions & 91 deletions tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@

#pragma once

#include "debug/dprint.h"
#include <cstdint>
#include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_test.hpp"
#include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp"

inline uint32_t prng_next(uint32_t n) {
uint32_t x = n;
Expand Down Expand Up @@ -59,7 +60,7 @@ struct input_queue_raw_state_t {
uint32_t max_packet_size_words,
uint64_t total_data_words) {
this->curr_packet_dest = this->num_dests_sent_last_packet + dest_endpoint_start_id;
this->curr_packet_flags = DispatchPacketFlag::PACKET_TEST_LAST;
this->curr_packet_flags = static_cast<uint32_t>(packet_queue::DispatchPacketFlag::PACKET_TEST_LAST);
this->curr_packet_size_words = 2;
this->curr_packet_words_remaining = this->curr_packet_size_words;
this->data_words_input += 2;
Expand Down Expand Up @@ -174,10 +175,10 @@ constexpr auto select_input_queue() {
}

// Stamps an incrementing test pattern into a packet buffer.
//
//   start_addr - first 32-bit word of the buffer to fill.
//   num_words  - number of packet words to stamp.
//   start_val  - value written for the first packet word; every following
//                packet word receives the previous value plus one.
//
// Only the last 32-bit word of each packet word (PACKET_WORD_SIZE_BYTES bytes
// per packet word) is written; the other 32-bit words are left untouched.
inline void fill_packet_data(tt_l1_ptr uint32_t* start_addr, uint32_t num_words, uint32_t start_val) {
    // Point at the last uint32_t of the first packet word.
    tt_l1_ptr uint32_t* addr = start_addr + (packet_queue::PACKET_WORD_SIZE_BYTES / 4 - 1);
    for (uint32_t i = 0; i < num_words; i++) {
        *addr = start_val++;
        // Advance by exactly one packet word per iteration.
        addr += (packet_queue::PACKET_WORD_SIZE_BYTES / 4);
    }
}

Expand All @@ -188,7 +189,7 @@ inline bool check_packet_data(
uint32_t& mismatch_addr,
uint32_t& mismatch_val,
uint32_t& expected_val) {
tt_l1_ptr uint32_t* addr = start_addr + (PACKET_WORD_SIZE_BYTES / 4 - 1);
tt_l1_ptr uint32_t* addr = start_addr + (packet_queue::PACKET_WORD_SIZE_BYTES / 4 - 1);
for (uint32_t i = 0; i < num_words; i++) {
if (*addr != start_val) {
mismatch_addr = reinterpret_cast<uint32_t>(addr);
Expand All @@ -197,7 +198,7 @@ inline bool check_packet_data(
return false;
}
start_val++;
addr += (PACKET_WORD_SIZE_BYTES / 4);
addr += (packet_queue::PACKET_WORD_SIZE_BYTES / 4);
}
return true;
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,10 @@
// SPDX-License-Identifier: Apache-2.0

#include "dataflow_api.h"
#include "debug/dprint.h"
#include "tt_metal/impl/dispatch/kernels/packet_queue.hpp"
#include "tt_metal/impl/dispatch/kernels/packet_queue_v2.hpp"
#include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen.hpp"

packet_input_queue_state_t input_queues[MAX_SWITCH_FAN_IN];
using namespace packet_queue;

constexpr uint32_t endpoint_id = get_compile_time_arg_val(0);

Expand All @@ -22,8 +21,6 @@ constexpr uint32_t input_queue_id = 0;
constexpr uint32_t queue_start_addr_words = get_compile_time_arg_val(3);
constexpr uint32_t queue_size_words = get_compile_time_arg_val(4);

static_assert(is_power_of_2(queue_size_words), "queue_size_words must be a power of 2");

constexpr uint32_t remote_tx_x = get_compile_time_arg_val(5);
constexpr uint32_t remote_tx_y = get_compile_time_arg_val(6);
constexpr uint32_t remote_tx_queue_id = get_compile_time_arg_val(7);
Expand Down Expand Up @@ -51,13 +48,33 @@ constexpr uint32_t timeout_cycles = get_compile_time_arg_val(17);

constexpr uint32_t disable_header_check = get_compile_time_arg_val(18);

// Inputs - Update remote rptr
constexpr uint32_t traffic_gen_input_ptrs_addr = get_compile_time_arg_val(19);
constexpr uint32_t traffic_gen_input_remote_ptrs_addr = get_compile_time_arg_val(20);

// Outputs
// None. This is a receiver to check data for testing purposes

// predicts size and payload of packets from each destination, should have
// the same random seed as the corresponding traffic_gen_tx
input_queue_rnd_state_t src_rnd_state[num_src_endpoints];

// Storage for the single input queue this receiver reads from. kernel_main()
// engages it with rx_rptr_update_network_type and cb mode disabled before
// obtaining a typed pointer to it.
PacketInputQueueVariant raw_input_queue;
// Parameters passed to input_queue->init() in kernel_main(). Every field is
// taken from a compile-time kernel argument or constant declared above.
constexpr init_params_t input_queue_init_params{
.queue_id = input_queue_id,
.queue_start_addr_words = queue_start_addr_words,
.queue_size_words = queue_size_words,
.remote_queue_id = remote_tx_queue_id,
.remote_x = remote_tx_x,
.remote_y = remote_tx_y,
.ptrs_addr = traffic_gen_input_ptrs_addr,
.remote_ptrs_addr = traffic_gen_input_remote_ptrs_addr,
};

// Type lists describing the input queue (its network type, and cb mode = false).
// They are supplied as template arguments to wait_all_input_output_ready() in
// kernel_main().
using input_queue_network_sequence = NetworkTypeSequence<rx_rptr_update_network_type>;
using input_queue_cb_mode_sequence = CBModeTypeSequence<false>;

void kernel_main() {

zero_l1_buf(test_results, test_results_size_bytes);
test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_STARTED;
test_results[PQ_TEST_MISC_INDEX] = 0xff000000;
Expand All @@ -70,13 +87,13 @@ void kernel_main() {
src_rnd_state[i].init(prng_seed, src_endpoint_start_id+i);
}

packet_input_queue_state_t* input_queue = &(input_queues[input_queue_id]);
raw_input_queue.engage<rx_rptr_update_network_type, false /*cb mode*/>();
// safe to use now

input_queue->init(input_queue_id, queue_start_addr_words, queue_size_words,
remote_tx_x, remote_tx_y, remote_tx_queue_id,
rx_rptr_update_network_type);
auto* input_queue = raw_input_queue.get<rx_rptr_update_network_type, false>();
input_queue->init(&input_queue_init_params);

if (!wait_all_src_dest_ready(input_queue, 1, NULL, 0, timeout_cycles)) {
if (!wait_all_input_output_ready<input_queue_network_sequence, input_queue_cb_mode_sequence, NoNetworkTypeSequence, NoCBModeTypeSequence>(&raw_input_queue, NULL, timeout_cycles)) {
test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_TIMEOUT;
return;
}
Expand Down Expand Up @@ -116,12 +133,12 @@ void kernel_main() {
}
#endif
uint32_t num_words_available;
packet_available = input_queue->input_queue_full_packet_available_to_send(num_words_available);
packet_available = input_queue->full_packet_available_to_send(num_words_available);
if (!packet_available) {
// Mark words as "sent" immediately to keep pipeline from stalling.
// This is OK since num_words_available comes from the call above, so
// it's guaranteed to be smaller than the full next packet.
input_queue->input_queue_advance_words_sent(num_words_available);
input_queue->advance_words_sent(num_words_available);
words_sent += num_words_available;
}
}
Expand Down Expand Up @@ -151,7 +168,7 @@ void kernel_main() {
break;
}

if (curr_packet_flags & PACKET_TEST_LAST) {
if (curr_packet_flags & static_cast<uint32_t>(packet_queue::DispatchPacketFlag::PACKET_TEST_LAST)) {
if (src_endpoint_last_packet[src_endpoint_index] ||
curr_packet_size_words != 2 ||
curr_packet_tag != 0xffffffff) {
Expand Down Expand Up @@ -180,13 +197,13 @@ void kernel_main() {
}
}

uint32_t num_words_available = input_queue->input_queue_curr_packet_num_words_available_to_send();
uint32_t num_words_available = input_queue->get_curr_packet_num_words_available_to_send();
// we have the packet header info for checking, input queue can now switch to the next packet
input_queue->input_queue_advance_words_sent(num_words_available);
input_queue->advance_words_sent(num_words_available);
words_sent += num_words_available;

// move rptr_cleared to the packet payload
input_queue->input_queue_advance_words_cleared(1);
input_queue->advance_words_cleared(1);
words_cleared++;

// === parse packet payload ===
Expand All @@ -209,7 +226,7 @@ void kernel_main() {
test_results[PQ_TEST_MISC_INDEX+5] = words_after_wrap;
break;
}
input_queue->input_queue_advance_words_cleared(words_before_wrap);
input_queue->advance_words_cleared(words_before_wrap);
words_cleared += words_before_wrap;
if (words_after_wrap > 0) {
if (!check_packet_data(reinterpret_cast<tt_l1_ptr uint32_t*>(input_queue->get_queue_rptr_cleared_addr_bytes()),
Expand All @@ -222,11 +239,11 @@ void kernel_main() {
test_results[PQ_TEST_MISC_INDEX+5] = words_after_wrap;
break;
}
input_queue->input_queue_advance_words_cleared(words_after_wrap);
input_queue->advance_words_cleared(words_after_wrap);
words_cleared += words_after_wrap;
}
} else {
input_queue->input_queue_advance_words_cleared(curr_packet_payload_words);
input_queue->advance_words_cleared(curr_packet_payload_words);
words_cleared += curr_packet_payload_words;
}
progress_timestamp = get_timestamp_32b();
Expand Down Expand Up @@ -257,13 +274,11 @@ void kernel_main() {
test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_TIMEOUT;
set_64b_result(test_results, words_sent, PQ_TEST_MISC_INDEX+12);
set_64b_result(test_results, words_cleared, PQ_TEST_MISC_INDEX+14);
input_queue->dprint_object();
} else if (check_failed) {
test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_DATA_MISMATCH;
test_results[PQ_TEST_MISC_INDEX+12] = mismatch_addr;
test_results[PQ_TEST_MISC_INDEX+12] = mismatch_val;
test_results[PQ_TEST_MISC_INDEX+12] = expected_val;
input_queue->dprint_object();
} else {
test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_PASS;
test_results[PQ_TEST_MISC_INDEX] = 0xff000005;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@

// Maps a packet-queue test status code to a short human-readable label.
//
//   status - one of the packet_queue::PACKET_QUEUE_TEST_* status values.
//
// Returns a pointer to a string literal; any value other than the four
// statuses handled below yields "UNKNOWN".
inline const char* packet_queue_test_status_to_string(uint32_t status) {
    switch (status) {
        case packet_queue::PACKET_QUEUE_TEST_STARTED: return "STARTED";
        case packet_queue::PACKET_QUEUE_TEST_PASS: return "DONE/OK";
        case packet_queue::PACKET_QUEUE_TEST_TIMEOUT: return "TIMEOUT";
        case packet_queue::PACKET_QUEUE_TEST_DATA_MISMATCH: return "DATA_MISMATCH";
        default: return "UNKNOWN";
    }
}
Expand Down
Loading

0 comments on commit ccf7535

Please sign in to comment.