Skip to content

Commit

Permalink
Stress NOC mcast test
Browse files Browse the repository at this point in the history
  • Loading branch information
pgkeller committed Jan 13, 2025
1 parent 9162a0c commit 3738051
Show file tree
Hide file tree
Showing 4 changed files with 348 additions and 0 deletions.
1 change: 1 addition & 0 deletions tests/tt_metal/tt_metal/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ set(TT_METAL_TESTS_SRCS
# test_eltwise_unary.cpp <- not tested in run_tt_metal.py
test_matmul_single_tile_bfp8b.cpp
test_matmul_single_tile_output_in_l1.cpp
test_stress_noc_mcast.cpp
test_dram_loopback_single_core.cpp
test_datacopy_bfp8b.cpp
test_datacopy.cpp
Expand Down
58 changes: 58 additions & 0 deletions tests/tt_metal/tt_metal/test_kernels/stress_noc_mcast.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <c_tensix_core.h>

constexpr bool mcaster = get_compile_time_arg_val(0);
constexpr uint32_t tlx = get_compile_time_arg_val(1);
constexpr uint32_t tly = get_compile_time_arg_val(2);
constexpr uint32_t width = get_compile_time_arg_val(3);
constexpr uint32_t height = get_compile_time_arg_val(4);
constexpr uint64_t duration = (uint64_t)get_compile_time_arg_val(5) * 1000 * 1000 * 1000;
constexpr uint32_t ucast_size = get_compile_time_arg_val(6);
constexpr uint32_t mcast_size = get_compile_time_arg_val(7);
constexpr uint32_t virtual_grid_offset = get_compile_time_arg_val(8);
constexpr uint32_t nrands = get_compile_time_arg_val(9);
constexpr bool enable_rnd_delay = get_compile_time_arg_val(10);
constexpr uint32_t ucast_l1_addr = get_compile_time_arg_val(11);
constexpr uint32_t mcast_l1_addr = get_compile_time_arg_val(12);

inline uint32_t next_rand(tt_l1_ptr uint8_t* rnds, uint32_t& rnd_index) {
uint32_t rnd = rnds[rnd_index];
rnd_index = (rnd_index + 1) & (nrands - 1);
return rnd;
}

void kernel_main() {
uint64_t done_time = c_tensix_core::read_wall_clock() + duration;
tt_l1_ptr uint8_t* rnds = (tt_l1_ptr uint8_t*)(get_arg_addr(0));
uint32_t rnd_index = 0;

uint64_t stall_time = 0;
while (c_tensix_core::read_wall_clock() < done_time) {
for (uint32_t count = 0; count < 1000; count++) {
if (enable_rnd_delay) {
// reading time here biases us to have more ~0 cycle stalls as this
// includes the write time
while (c_tensix_core::read_wall_clock() < stall_time);
stall_time = c_tensix_core::read_wall_clock() + next_rand(rnds, rnd_index);
}

if (mcaster) {
uint64_t dst_noc_multicast_addr =
get_noc_multicast_addr(tlx, tly, tlx + width - 1, tly + height - 1, ucast_l1_addr);
noc_async_write_multicast(mcast_l1_addr, dst_noc_multicast_addr, mcast_size, width * height, false);
} else {
uint32_t dst_x, dst_y;
uint8_t noc_addr = next_rand(rnds, rnd_index);
dst_x = (noc_addr & 0xf) + virtual_grid_offset;
dst_y = (noc_addr >> 4) + virtual_grid_offset;
uint64_t noc_write_addr = NOC_XY_ADDR(NOC_X(dst_x), NOC_Y(dst_y), ucast_l1_addr);
noc_async_write(ucast_l1_addr, noc_write_addr, ucast_size);
}
}
}

noc_async_write_barrier();
}
53 changes: 53 additions & 0 deletions tests/tt_metal/tt_metal/test_kernels/sweep_stress_noc_mcast.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#/bin/bash

if [ "$ARCH_NAME" = "wormhole_b0" ]; then
echo "Configured core range for wormhole_b0"
width=8
height=8
elif [ "$ARCH_NAME" = "blackhole" ]; then
echo "Configured core range for blackhole"
width=12
height=10
else
echo "Unknown arch: $ARCH_NAME"
exit 1
fi

function run_set() {
echo "running: $@"
TT_METAL_SLOW_DISPATCH_MODE=1 build/test/tt_metal/test_stress_noc_mcast -t 120 $@
}

function run_all() {
run_set $@ -u 32 -m 32
run_set $@ -u 32 -m 256
run_set $@ -u 32 -m 2048
run_set $@ -u 32 -m 4096
run_set $@ -u 32 -m 8192
run_set $@ -u 256 -m 32
run_set $@ -u 2048 -m 256
run_set $@ -u 4096 -m 2048
run_set $@ -u 8192 -m 4096
}

# sweep w/ randomized noc address, tensix mcast
for (( i=0; i<$width; i++ )); do
h=$((height -1))
run_all -x 0 -y 0 -width $width -height $h -mx $i -my $h
done

# sweep w/ randomized delay+noc address, tensix mcast
for (( i=0; i<=11; i++ )); do
h=$((height -1))
run_all -x 0 -y 0 -width $width -height $h -mx $i -my $h -rdelay
done

# sweep w/ randomized noc address, eth mcast
for (( i=0; i<=11; i++ )); do
run_all -e $i -width $width -height $height
done

# sweep w/ randomized delay+noc address, eth mcast
for (( i=0; i<=11; i++ )); do
run_all -e $i -width $width -height $height -rdelay
done
236 changes: 236 additions & 0 deletions tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

// This test stresses NOC mcast by:
// - using 1 mcast core (future work to add multiple) either tensix or eth
// - rapidly mcast into a grid of tensix workers
// - rapidly grid of tensix workers generates random noc traffic
// - does not verify correct transactions, just runs til termination

#include <algorithm>
#include <cstdint>
#include <functional>
#include <random>
#include <string>

#include "core_coord.hpp"
#include "logger.hpp"
#include "tt_metal/host_api.hpp"
#include "tt_metal/detail/tt_metal.hpp"
#include "tt_metal/llrt/rtoptions.hpp"
#include "tt_metal/common/metal_soc_descriptor.h"
#include "tt_metal/impl/event/event.hpp"
#include "tt_metal/impl/dispatch/command_queue.hpp"
#include "tt_metal/impl/device/device.hpp"
#include "tt_metal/common/metal_soc_descriptor.h"
#include "llrt/hal.hpp"

using namespace tt;

const uint32_t CB_ELEMENTS = 2048;
const uint32_t DEFAULT_SECONDS = 10;
const uint32_t DEFAULT_TARGET_WIDTH = 1;
const uint32_t DEFAULT_TARGET_HEIGHT = 1;
const uint32_t N_RANDS = 512;

uint32_t device_num_g = 0;
uint32_t time_secs_g = DEFAULT_SECONDS;
uint32_t tlx_g = 0;
uint32_t tly_g = 0;
uint32_t width_g = DEFAULT_TARGET_WIDTH;
uint32_t height_g = DEFAULT_TARGET_HEIGHT;
uint32_t mcast_x_g = 0;
uint32_t mcast_y_g = 0;
uint32_t mcast_size_g = 16;
uint32_t ucast_size_g = 8192;
uint32_t mcast_from_n_eth_g;
bool mcast_from_eth_g;
bool rnd_delay_g = false;
bool rnd_coord_g = true;

void init(int argc, char** argv) {
std::vector<std::string> input_args(argv, argv + argc);

if (test_args::has_command_option(input_args, "-h") || test_args::has_command_option(input_args, "--help")) {
log_info(LogTest, "Usage:");
log_info(LogTest, " -v: device number to run on (default 0) ", DEFAULT_SECONDS);
log_info(LogTest, " -t: time in seconds (default {})", DEFAULT_SECONDS);
log_info(LogTest, " -x: grid top left x");
log_info(LogTest, " -y: grid top left y");
log_info(LogTest, " -width: unicast grid width (default {})", DEFAULT_TARGET_WIDTH);
log_info(LogTest, "-height: unicast grid height (default {})", DEFAULT_TARGET_HEIGHT);
log_info(LogTest, " -mx: mcast core x");
log_info(LogTest, " -my: mcast core y");
log_info(LogTest, " -e: mcast from nth idle eth core (ignores -mx,-my)");
log_info(LogTest, " -m: mcast packet size");
log_info(LogTest, " -u: ucast packet size");
log_info(LogTest, "-rdelay: insert random delay between noc transactions");
log_info(LogTest, " -s: seed random number generator");
exit(0);
}

device_num_g = test_args::get_command_option_uint32(input_args, "-v", 0);
time_secs_g = test_args::get_command_option_uint32(input_args, "-t", DEFAULT_SECONDS);
tlx_g = test_args::get_command_option_uint32(input_args, "-x", 0);
tly_g = test_args::get_command_option_uint32(input_args, "-y", 0);
width_g = test_args::get_command_option_uint32(input_args, "-width", DEFAULT_TARGET_WIDTH);
height_g = test_args::get_command_option_uint32(input_args, "-height", DEFAULT_TARGET_HEIGHT);
mcast_x_g = test_args::get_command_option_uint32(input_args, "-mx", 0);
mcast_y_g = test_args::get_command_option_uint32(input_args, "-my", 0);
mcast_from_n_eth_g = test_args::get_command_option_uint32(input_args, "-e", 0xffff);
mcast_size_g = test_args::get_command_option_uint32(input_args, "-m", 16);
ucast_size_g = test_args::get_command_option_uint32(input_args, "-u", 8192);
mcast_from_eth_g = (mcast_from_n_eth_g != 0xffff);
rnd_delay_g = test_args::has_command_option(input_args, "-rdelay");
uint32_t seed = test_args::get_command_option_uint32(input_args, "-s", 0);
srand(seed);

if (!mcast_from_eth_g && mcast_x_g >= tlx_g && mcast_x_g <= tlx_g + width_g - 1 && mcast_y_g >= tly_g &&
mcast_y_g <= tly_g + height_g - 1) {
log_fatal("Mcast core can't be within mcast grid");
exit(-1);
}
}

int main(int argc, char** argv) {
init(argc, argv);

tt_metal::IDevice* device = tt_metal::CreateDevice(device_num_g);
tt_metal::Program program = tt_metal::CreateProgram();

const auto& eth_cores = device->get_inactive_ethernet_cores();

CoreRange workers_logical({tlx_g, tly_g}, {tlx_g + width_g - 1, tly_g + height_g - 1});
CoreCoord mcast_logical(mcast_x_g, mcast_y_g);
CoreCoord tl_core = device->worker_core_from_logical_core({tlx_g, tly_g});

if (mcast_from_eth_g) {
CoreCoord eth_logical(0, mcast_from_n_eth_g);
bool found = false;
for (const auto& eth_core : eth_cores) {
if (eth_logical == eth_core) {
found = true;
break;
}
}
if (!found) {
log_fatal("{} not found in the list of idle eth cores", mcast_from_n_eth_g);
tt_metal::CloseDevice(device);
exit(-1);
}
mcast_logical = eth_logical;
}

CoreCoord virtual_offset = device->worker_core_from_logical_core({0, 0});
TT_ASSERT(virtual_offset.x == virtual_offset.y);
std::vector<uint32_t> compile_args = {
false,
tl_core.x,
tl_core.y,
width_g,
height_g,
time_secs_g,
ucast_size_g,
mcast_size_g,
virtual_offset.x,
N_RANDS,
rnd_delay_g,
tt::tt_metal::hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED),
tt::tt_metal::hal.get_dev_addr(
mcast_from_eth_g ? HalProgrammableCoreType::IDLE_ETH : HalProgrammableCoreType::TENSIX,
HalL1MemAddrType::UNRESERVED),
};

KernelHandle ucast_kernel = tt_metal::CreateKernel(
program,
"tests/tt_metal/tt_metal/test_kernels/stress_noc_mcast.cpp",
workers_logical,
tt_metal::DataMovementConfig{
.processor = tt_metal::DataMovementProcessor::RISCV_0,
.noc = tt_metal::NOC::RISCV_0_default,
.compile_args = compile_args,
});

for (CoreCoord coord : workers_logical) {
std::vector<uint32_t> runtime_args;
// Not particularly random since all cores are getting the same data
// N_RANDS in bytes
CoreCoord grid_size = device->logical_grid_size();
for (int i = 0; i < N_RANDS / sizeof(uint32_t); i++) {
uint32_t rnd = 0;
for (int j = 0; j < sizeof(uint32_t); j++) {
uint32_t x = rand() % grid_size.x;
uint32_t y = rand() % grid_size.y;
rnd = (rnd << 8) | (y << 4) | x;
}
runtime_args.push_back(rnd);
}
tt::tt_metal::SetRuntimeArgs(program, ucast_kernel, coord, runtime_args);
}

compile_args[0] = true;
KernelHandle mcast_kernel;
if (mcast_from_eth_g) {
mcast_kernel = tt_metal::CreateKernel(
program,
"tests/tt_metal/tt_metal/test_kernels/stress_noc_mcast.cpp",
mcast_logical,
tt_metal::EthernetConfig{
.eth_mode = Eth::IDLE,
.noc = tt_metal::NOC::NOC_0,
.compile_args = compile_args,
});
} else {
mcast_kernel = tt_metal::CreateKernel(
program,
"tests/tt_metal/tt_metal/test_kernels/stress_noc_mcast.cpp",
mcast_logical,
tt_metal::DataMovementConfig{
.processor = tt_metal::DataMovementProcessor::RISCV_0,
.noc = tt_metal::NOC::RISCV_0_default,
.compile_args = compile_args,
});
}
std::vector<uint32_t> runtime_args;
for (int i = 0; i < 128; i++) {
runtime_args.push_back(rand());
}
tt::tt_metal::SetRuntimeArgs(program, mcast_kernel, mcast_logical, runtime_args);

CoreCoord mcast_virtual;
CoreCoord mcast_physical;
if (mcast_from_eth_g) {
mcast_virtual = device->ethernet_core_from_logical_core(mcast_logical);
mcast_physical =
tt::Cluster::instance().get_soc_desc(device_num_g).get_physical_ethernet_core_from_logical(mcast_logical);
} else {
mcast_virtual = device->worker_core_from_logical_core(mcast_logical);
mcast_physical =
tt::Cluster::instance().get_soc_desc(device_num_g).get_physical_tensix_core_from_logical(mcast_logical);
}

log_info(
LogTest,
"MCast {} core: {}, virtual {}, physical {}, writing {} bytes per xfer",
mcast_from_eth_g ? "ETH" : "TENSIX",
mcast_logical,
mcast_virtual,
mcast_physical,
mcast_size_g);
log_info(LogTest, "Unicast grid: {}, writing {} bytes per xfer", workers_logical.str(), ucast_size_g);

if (rnd_coord_g) {
log_info("Randomizing ucast noc write destinations");
} else {
log_info("Non-random ucast noc write destinations TBD");
}

if (rnd_delay_g) {
log_info("Randomizing delay");
}
log_info(LogTest, "Running for {} seconds", time_secs_g);

tt::tt_metal::detail::LaunchProgram(device, program, true);
tt_metal::CloseDevice(device);
}

0 comments on commit 3738051

Please sign in to comment.