Skip to content

Commit

Permalink
#14427: packet queue improvements
Browse files Browse the repository at this point in the history
    - Removed power of 2 constraint

    - Replaced registers with L1 buffers. Registers are still used for
      the handshaking process though.

    - Refactor packet queue into specialized variants

    - Removed unnecessary branching in packet queue and kernels

    - Removed the "if not valid then start next packet" logic from the
      input queue's current-packet getters and made them const.
      This reduces the number of branches in the loop by 5.

    - Move many checks to compile-time

    - Removed the timeout logic from the kernels. It was only used for
      testing and slowed down production code. If a test hangs, you
      know it failed.
  • Loading branch information
nhuang-tt committed Dec 12, 2024
1 parent 52cc437 commit eee4798
Show file tree
Hide file tree
Showing 25 changed files with 4,473 additions and 5,578 deletions.
5 changes: 0 additions & 5 deletions tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,7 @@ set(PERF_MICROBENCH_TESTS_SRCS
routing/test_mux_demux.cpp
routing/test_vc_mux_demux.cpp
routing/test_mux_demux_2level.cpp
routing/test_tunnel_1cq.cpp
routing/test_tunnel_2cq.cpp
routing/test_uni_tunnel.cpp
routing/test_vc_uni_tunnel.cpp
routing/test_uni_tunnel_single_chip.cpp
routing/test_bi_tunnel.cpp
routing/test_vc_bi_tunnel_2ep.cpp
routing/test_vc_bi_tunnel_4ep.cpp
routing/test_vc_loopback_tunnel.cpp
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,10 @@
// SPDX-License-Identifier: Apache-2.0

#include "dataflow_api.h"
#include "debug/dprint.h"
#include "tt_metal/impl/dispatch/kernels/packet_queue.hpp"
#include "tt_metal/impl/dispatch/kernels/packet_queue_v2.hpp"
#include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen.hpp"

packet_input_queue_state_t input_queues[MAX_SWITCH_FAN_IN];
using namespace packet_queue;

constexpr uint32_t endpoint_id = get_compile_time_arg_val(0);

Expand All @@ -22,8 +21,6 @@ constexpr uint32_t input_queue_id = 0;
constexpr uint32_t queue_start_addr_words = get_compile_time_arg_val(3);
constexpr uint32_t queue_size_words = get_compile_time_arg_val(4);

static_assert(is_power_of_2(queue_size_words), "queue_size_words must be a power of 2");

constexpr uint32_t remote_tx_x = get_compile_time_arg_val(5);
constexpr uint32_t remote_tx_y = get_compile_time_arg_val(6);
constexpr uint32_t remote_tx_queue_id = get_compile_time_arg_val(7);
Expand Down Expand Up @@ -51,13 +48,34 @@ constexpr uint32_t timeout_cycles = get_compile_time_arg_val(17);

constexpr uint32_t disable_header_check = get_compile_time_arg_val(18);

// Inputs - Update remote rptr
constexpr uint32_t traffic_gen_input_ptrs_addr = get_compile_time_arg_val(19);
constexpr uint32_t traffic_gen_input_remote_ptrs_addr = get_compile_time_arg_val(20);

// Outputs
// None. This is a receiver to check data for testing purposes

// predicts size and payload of packets from each destination, should have
// the same random seed as the corresponding traffic_gen_tx
input_queue_rnd_state_t src_rnd_state[num_src_endpoints];

// Storage for the receiver's single input queue. kernel_main() engages it
// with <rx_rptr_update_network_type, cb mode = false> and then fetches the
// typed queue with get<>() using the same template arguments.
PacketInputQueueVariant raw_input_queue;
// Compile-time parameters handed to input_queue->init() in kernel_main().
// Every field is a compile-time kernel argument or a constant declared above.
constexpr init_params_t input_queue_init_params{
.is_input = true,
.queue_id = input_queue_id,
.queue_start_addr_words = queue_start_addr_words,
.queue_size_words = queue_size_words,
.remote_queue_id = remote_tx_queue_id,
.remote_x = remote_tx_x,
.remote_y = remote_tx_y,
.ptrs_addr = traffic_gen_input_ptrs_addr,
.remote_ptrs_addr = traffic_gen_input_remote_ptrs_addr,
};

// Type-level description of the input queue (network type and cb mode),
// passed as template arguments to wait_all_input_output_ready<>().
using input_queue_network_sequence = NetworkTypeSequence<rx_rptr_update_network_type>;
using input_queue_cb_mode_sequence = CBModeTypeSequence<false>;

void kernel_main() {

zero_l1_buf(test_results, test_results_size_bytes);
test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_STARTED;
test_results[PQ_TEST_MISC_INDEX] = 0xff000000;
Expand All @@ -70,13 +88,13 @@ void kernel_main() {
src_rnd_state[i].init(prng_seed, src_endpoint_start_id+i);
}

packet_input_queue_state_t* input_queue = &(input_queues[input_queue_id]);
raw_input_queue.engage<rx_rptr_update_network_type, false /*cb mode*/>();
// raw_input_queue is engaged; get<>() is safe to use now

input_queue->init(input_queue_id, queue_start_addr_words, queue_size_words,
remote_tx_x, remote_tx_y, remote_tx_queue_id,
rx_rptr_update_network_type);
auto* input_queue = raw_input_queue.get<rx_rptr_update_network_type, false>();
input_queue->init(&input_queue_init_params);

if (!wait_all_src_dest_ready(input_queue, 1, NULL, 0, timeout_cycles)) {
if (!wait_all_input_output_ready<input_queue_network_sequence, input_queue_cb_mode_sequence, NoNetworkTypeSequence, NoCBModeTypeSequence>(&raw_input_queue, NULL, timeout_cycles)) {
test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_TIMEOUT;
return;
}
Expand Down Expand Up @@ -116,12 +134,12 @@ void kernel_main() {
}
#endif
uint32_t num_words_available;
packet_available = input_queue->input_queue_full_packet_available_to_send(num_words_available);
packet_available = input_queue->full_packet_available_to_send(num_words_available);
if (!packet_available) {
// Mark words as "sent" immediately to keep pipeline from stalling.
// This is OK since num_words_available comes from the call above, so
// it's guaranteed to be smaller than the full next packet.
input_queue->input_queue_advance_words_sent(num_words_available);
input_queue->advance_words_sent(num_words_available);
words_sent += num_words_available;
}
}
Expand Down Expand Up @@ -180,13 +198,13 @@ void kernel_main() {
}
}

uint32_t num_words_available = input_queue->input_queue_curr_packet_num_words_available_to_send();
uint32_t num_words_available = input_queue->get_curr_packet_num_words_available_to_send();
// we have the packet header info for checking, input queue can now switch to the next packet
input_queue->input_queue_advance_words_sent(num_words_available);
input_queue->advance_words_sent(num_words_available);
words_sent += num_words_available;

// move rptr_cleared to the packet payload
input_queue->input_queue_advance_words_cleared(1);
input_queue->advance_words_cleared(1);
words_cleared++;

// === parse packet payload ===
Expand All @@ -209,7 +227,7 @@ void kernel_main() {
test_results[PQ_TEST_MISC_INDEX+5] = words_after_wrap;
break;
}
input_queue->input_queue_advance_words_cleared(words_before_wrap);
input_queue->advance_words_cleared(words_before_wrap);
words_cleared += words_before_wrap;
if (words_after_wrap > 0) {
if (!check_packet_data(reinterpret_cast<tt_l1_ptr uint32_t*>(input_queue->get_queue_rptr_cleared_addr_bytes()),
Expand All @@ -222,11 +240,11 @@ void kernel_main() {
test_results[PQ_TEST_MISC_INDEX+5] = words_after_wrap;
break;
}
input_queue->input_queue_advance_words_cleared(words_after_wrap);
input_queue->advance_words_cleared(words_after_wrap);
words_cleared += words_after_wrap;
}
} else {
input_queue->input_queue_advance_words_cleared(curr_packet_payload_words);
input_queue->advance_words_cleared(curr_packet_payload_words);
words_cleared += curr_packet_payload_words;
}
progress_timestamp = get_timestamp_32b();
Expand Down Expand Up @@ -257,13 +275,11 @@ void kernel_main() {
test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_TIMEOUT;
set_64b_result(test_results, words_sent, PQ_TEST_MISC_INDEX+12);
set_64b_result(test_results, words_cleared, PQ_TEST_MISC_INDEX+14);
input_queue->dprint_object();
} else if (check_failed) {
test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_DATA_MISMATCH;
test_results[PQ_TEST_MISC_INDEX+12] = mismatch_addr;
test_results[PQ_TEST_MISC_INDEX+12] = mismatch_val;
test_results[PQ_TEST_MISC_INDEX+12] = expected_val;
input_queue->dprint_object();
} else {
test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_PASS;
test_results[PQ_TEST_MISC_INDEX] = 0xff000005;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@

// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

// clang-format off
#include "dataflow_api.h"
#include "debug/dprint.h"
#include "tt_metal/impl/dispatch/kernels/packet_queue.hpp"
#include "tt_metal/impl/dispatch/kernels/packet_queue_v2.hpp"
#include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen.hpp"
// clang-format on

using namespace packet_queue;

constexpr uint32_t src_endpoint_id = get_compile_time_arg_val(0);
constexpr uint32_t num_dest_endpoints = get_compile_time_arg_val(1);

Expand All @@ -18,13 +21,9 @@ constexpr uint32_t queue_start_addr_words = get_compile_time_arg_val(2);
constexpr uint32_t queue_size_words = get_compile_time_arg_val(3);
constexpr uint32_t queue_size_bytes = queue_size_words * PACKET_WORD_SIZE_BYTES;

static_assert(is_power_of_2(queue_size_words), "queue_size_words must be a power of 2");

constexpr uint32_t remote_rx_queue_start_addr_words = get_compile_time_arg_val(4);
constexpr uint32_t remote_rx_queue_size_words = get_compile_time_arg_val(5);

static_assert(is_power_of_2(remote_rx_queue_size_words), "remote_rx_queue_size_words must be a power of 2");

constexpr uint32_t remote_rx_x = get_compile_time_arg_val(6);
constexpr uint32_t remote_rx_y = get_compile_time_arg_val(7);
constexpr uint32_t remote_rx_queue_id = get_compile_time_arg_val(8);
Expand Down Expand Up @@ -62,30 +61,68 @@ constexpr uint32_t data_sent_per_iter_high = get_compile_time_arg_val(21);
constexpr uint32_t input_queue_id = 0;
constexpr uint32_t output_queue_id = 1;

packet_input_queue_state_t input_queue;
packet_output_queue_state_t output_queue;
// Inputs
constexpr uint32_t traffic_gen_input_ptrs_addr = get_compile_time_arg_val(22);
constexpr uint32_t traffic_gen_input_mock_remote_ptrs_addr = get_compile_time_arg_val(23);

constexpr packet_input_queue_state_t* input_queue_ptr = &input_queue;
constexpr packet_output_queue_state_t* output_queue_ptr = &output_queue;
// Outputs - Update remote wptr
constexpr uint32_t traffic_gen_output_ptrs_addr = get_compile_time_arg_val(24);
constexpr uint32_t traffic_gen_output_remote_ptrs_addr = get_compile_time_arg_val(25);

// input_queue_rnd_state_t input_queue_state;
auto input_queue_state = select_input_queue<pkt_dest_size_choice>();

// Storage for the generator's local input queue. kernel_main() engages it
// with <DispatchRemoteNetworkType::NONE, cb mode = false>.
PacketInputQueueVariant raw_input_queue;
// Compile-time parameters handed to input_queue->init() in kernel_main().
// The remote queue id and coordinates are all 0; remote_ptrs_addr is taken
// from the "mock remote" compile-time argument.
constexpr init_params_t input_queue_init_params{
.is_input = true,
.queue_id = input_queue_id,
.queue_start_addr_words = queue_start_addr_words,
.queue_size_words = queue_size_words,
.remote_queue_id = 0,
.remote_x = 0,
.remote_y = 0,
.ptrs_addr = traffic_gen_input_ptrs_addr,
.remote_ptrs_addr = traffic_gen_input_mock_remote_ptrs_addr,
};
// Type-level description of the input queue (network type and cb mode).
// Also used as template arguments when engaging and getting the output queue.
using input_queue_network_sequence = NetworkTypeSequence<DispatchRemoteNetworkType::NONE>;
using input_queue_cb_mode_sequence = CBModeTypeSequence<false>;


// Storage for the generator's output queue. kernel_main() engages it with
// <tx_network_type, cb mode = false> plus the input queue type sequences.
PacketOutputQueueVariant raw_output_queue;
// Compile-time parameters handed to new_output_queue->init() in kernel_main().
// The queue start address and size are the remote_rx_* compile-time arguments.
// Unlike the input queue parameters, .is_input is not set here.
constexpr init_params_t output_queue_init_params{
.queue_id = output_queue_id,
.queue_start_addr_words = remote_rx_queue_start_addr_words,
.queue_size_words = remote_rx_queue_size_words,
.remote_queue_id = remote_rx_queue_id,
.remote_x = remote_rx_x,
.remote_y = remote_rx_y,
.ptrs_addr = traffic_gen_output_ptrs_addr,
.remote_ptrs_addr = traffic_gen_output_remote_ptrs_addr,

// The output queue is given a pointer to the single input queue declared above.
.input_queues = &raw_input_queue,
.num_input_queues = 1,
};
// Type-level description of the output queue (network type and cb mode),
// passed as template arguments to wait_all_input_output_ready<>().
using output_queue_network_sequence = NetworkTypeSequence<tx_network_type>;
using output_queue_cb_mode_sequence = CBModeTypeSequence<false>;


// generates packets with random size and payload on the input side
inline bool input_queue_handler() {
if (input_queue_state.all_packets_done()) {
return true;
}

uint32_t free_words = input_queue_ptr->get_queue_data_num_words_free();
auto* input_queue = raw_input_queue.get<DispatchRemoteNetworkType::NONE, false>();

uint32_t free_words = input_queue->get_queue_data_num_words_free();
if (free_words == 0) {
return false;
}

// Each call to input_queue_handler initializes only up to the end
// of the queue buffer, so we don't need to handle wrapping.
uint32_t byte_wr_addr = input_queue_ptr->get_queue_wptr_addr_bytes();
uint32_t words_to_init = std::min(free_words, input_queue_ptr->get_queue_words_before_wptr_wrap());
uint32_t byte_wr_addr = input_queue->get_queue_wptr_addr_bytes();
uint32_t words_to_init = std::min(free_words, input_queue->get_queue_words_before_wptr_wrap());
uint32_t words_initialized = 0;

while (words_initialized < words_to_init) {
Expand Down Expand Up @@ -122,7 +159,8 @@ inline bool input_queue_handler() {
byte_wr_addr += num_words * PACKET_WORD_SIZE_BYTES;
}
}
input_queue_ptr->advance_queue_local_wptr(words_initialized);

input_queue->advance_queue_local_wptr(words_initialized);
return false;
}

Expand All @@ -143,28 +181,17 @@ void kernel_main() {
input_queue_state.init(src_endpoint_id, prng_seed);
}

input_queue_ptr->init(
input_queue_id,
queue_start_addr_words,
queue_size_words,
// remote_x, remote_y, remote_queue_id, remote_update_network_type:
0,
0,
0,
DispatchRemoteNetworkType::NONE);

output_queue_ptr->init(
output_queue_id,
remote_rx_queue_start_addr_words,
remote_rx_queue_size_words,
remote_rx_x,
remote_rx_y,
remote_rx_queue_id,
tx_network_type,
input_queue_ptr,
1);

if (!wait_all_src_dest_ready(NULL, 0, output_queue_ptr, 1, timeout_cycles)) {
raw_input_queue.engage<DispatchRemoteNetworkType::NONE, false>();
raw_output_queue.engage<tx_network_type, false, input_queue_network_sequence, input_queue_cb_mode_sequence>();

// both queue variants are engaged; get<>() is safe to use now
auto* input_queue = raw_input_queue.get<DispatchRemoteNetworkType::NONE, false>();
auto* new_output_queue = raw_output_queue.get<tx_network_type, false, input_queue_network_sequence, input_queue_cb_mode_sequence>();

input_queue->init(&input_queue_init_params);
new_output_queue->init(&output_queue_init_params);

if (!wait_all_input_output_ready<NoNetworkTypeSequence, NoCBModeTypeSequence, output_queue_network_sequence, output_queue_cb_mode_sequence>(NULL, &raw_output_queue, timeout_cycles)) {
test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_TIMEOUT;
return;
}
Expand Down Expand Up @@ -193,10 +220,9 @@ void kernel_main() {
}
#endif
bool all_packets_initialized = input_queue_handler();
if (input_queue_ptr->get_curr_packet_valid()) {
if (input_queue->get_curr_packet_valid()) {
bool full_packet_sent;
uint32_t curr_data_words_sent = output_queue_ptr->forward_data_from_input(
input_queue_id, full_packet_sent, input_queue.get_end_of_cmd());
uint32_t curr_data_words_sent = new_output_queue->forward_data_from_input<input_queue_id>(full_packet_sent, input_queue->get_end_of_cmd());
data_words_sent += curr_data_words_sent;
if constexpr (!(data_sent_per_iter_low == 0 && data_sent_per_iter_high == 0)) {
zero_data_sent_iter += static_cast<uint64_t>(curr_data_words_sent <= 0);
Expand All @@ -209,12 +235,12 @@ void kernel_main() {
} else if (all_packets_initialized) {
break;
}
words_flushed += output_queue_ptr->prev_words_in_flight_check_flush();
words_flushed += new_output_queue->prev_words_in_flight_check_flush();
}

if (!timeout) {
test_results[PQ_TEST_MISC_INDEX] = 0xff00002;
if (!output_queue_ptr->output_barrier(timeout_cycles)) {
if (!new_output_queue->output_barrier(timeout_cycles)) {
timeout = true;
}
}
Expand All @@ -224,7 +250,7 @@ void kernel_main() {
if (!timeout) {
test_results[PQ_TEST_MISC_INDEX] = 0xff00003;
progress_timestamp = get_timestamp_32b();
while (!output_queue_ptr->is_remote_finished()) {
while (!new_output_queue->is_remote_finished()) {
if (timeout_cycles > 0) {
uint32_t cycles_since_progress = get_timestamp_32b() - progress_timestamp;
if (cycles_since_progress > timeout_cycles) {
Expand All @@ -251,8 +277,5 @@ void kernel_main() {
} else {
test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_TIMEOUT;
set_64b_result(test_results, words_flushed, TX_TEST_IDX_WORDS_FLUSHED);
// these calls lead to code size issues?
// input_queue_ptr->dprint_object();
// output_queue_ptr->dprint_object();
}
}
Loading

0 comments on commit eee4798

Please sign in to comment.