Skip to content

Commit

Permalink
#9044: Move dispatch core x y to be part of launch msg, which is set …
Browse files Browse the repository at this point in the history
…in assemble_device_commands. This enables running programs on any cq
  • Loading branch information
tt-aho committed Jun 27, 2024
1 parent 6456f2e commit 956e2bb
Show file tree
Hide file tree
Showing 15 changed files with 463 additions and 122 deletions.
3 changes: 2 additions & 1 deletion tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ void kernel_main() {
//Need to signal completion to dispatcher before hanging so that
//Dispatcher Kernel is able to finish.
//Device Close () requires fast dispatch kernels to finish.
uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(DISPATCH_CORE_X), NOC_Y(DISPATCH_CORE_Y), DISPATCH_MESSAGE_ADDR);
tt_l1_ptr mailboxes_t* const mailboxes = (tt_l1_ptr mailboxes_t*)(MEM_MAILBOX_BASE);
uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(mailboxes->launch.dispatch_core_x), NOC_Y(mailboxes->launch.dispatch_core_y), DISPATCH_MESSAGE_ADDR);
noc_fast_atomic_increment(noc_index, NCRISC_AT_CMD_BUF, dispatch_addr, NOC_UNICAST_WRITE_VC, 1, 31, false);
#endif

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ void MAIN {
//Need to signal completion to dispatcher before hanging so that
//Dispatcher Kernel is able to finish.
//Device Close () requires fast dispatch kernels to finish.
uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(DISPATCH_CORE_X), NOC_Y(DISPATCH_CORE_Y), DISPATCH_MESSAGE_ADDR);
tt_l1_ptr mailboxes_t* const mailboxes = (tt_l1_ptr mailboxes_t*)(MEM_MAILBOX_BASE);
uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(mailboxes->launch.dispatch_core_x), NOC_Y(mailboxes->launch.dispatch_core_y), DISPATCH_MESSAGE_ADDR);
noc_fast_atomic_increment(noc_index, NCRISC_AT_CMD_BUF, dispatch_addr, NOC_UNICAST_WRITE_VC, 1, 31 /*wrap*/, false /*linked*/);
}
#else
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@

set(UNIT_TESTS_FD_SINGLEC_MULTIQ_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_EnqueueProgram.cpp
${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_EnqueueTrace.cpp
${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_EnqueueWaitForEvent.cpp
${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,267 @@
// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <memory>
#include "command_queue_fixture.hpp"
#include "command_queue_test_utils.hpp"
#include "gtest/gtest.h"
#include "impl/buffers/buffer.hpp"
#include "tt_metal/common/bfloat16.hpp"
#include "tt_metal/common/scoped_timer.hpp"
#include "tt_metal/host_api.hpp"
#include "tt_metal/detail/tt_metal.hpp"

using namespace tt::tt_metal;

struct CBConfig {
uint32_t cb_id;
uint32_t num_pages;
uint32_t page_size;
tt::DataFormat data_format;
};

struct DummyProgramConfig {
CoreRangeSet cr_set;
CBConfig cb_config;
uint32_t num_cbs;
uint32_t num_sems;
};

struct DummyProgramMultiCBConfig {
CoreRangeSet cr_set;
std::vector<CBConfig> cb_config_vector;
uint32_t num_sems;
};


namespace local_test_functions {

// Create randomly sized pair of unique and common runtime args vectors, with careful not to exceed max between the two.
// Optionally force the max size for one of the vectors.
std::pair<std::vector<uint32_t>, std::vector<uint32_t>> create_runtime_args(bool force_max_size = false, uint32_t unique_base = 0, uint32_t common_base = 100){

constexpr uint32_t MAX_RUNTIME_ARGS = 255;

// Generate Unique Runtime Args. Common RT args starting address must be L1 Aligned, so account for that here via padding
uint32_t num_rt_args_unique = num_rt_args_unique = rand() % (MAX_RUNTIME_ARGS + 1);
uint32_t num_rt_args_unique_padded = align(num_rt_args_unique, L1_ALIGNMENT / sizeof(uint32_t));
uint32_t num_rt_args_common = num_rt_args_unique_padded < MAX_RUNTIME_ARGS ? rand() % (MAX_RUNTIME_ARGS - num_rt_args_unique_padded + 1) : 0;

if (force_max_size) {
if (rand() % 2) {
num_rt_args_unique = MAX_RUNTIME_ARGS;
num_rt_args_common = 0;
} else {
num_rt_args_common = MAX_RUNTIME_ARGS;
num_rt_args_unique = 0;
}
}

vector<uint32_t> rt_args_common;
for (uint32_t i = 0; i < num_rt_args_common; i++) {
rt_args_common.push_back(common_base + i);
}

vector<uint32_t> rt_args_unique;
for (uint32_t i = 0; i < num_rt_args_unique; i++) {
rt_args_unique.push_back(unique_base + i);
}

log_trace(tt::LogTest, "{} - num_rt_args_unique: {} num_rt_args_common: {} force_max_size: {}", __FUNCTION__, num_rt_args_unique, num_rt_args_common, force_max_size);
return std::make_pair(rt_args_unique, rt_args_common);
}


} // namespace local_test_functions

namespace stress_tests {

TEST_F(MultiCommandQueueSingleDeviceFixture, TestRandomizedProgram) {
uint32_t NUM_PROGRAMS = 100;
uint32_t MAX_LOOP = 100;
uint32_t page_size = 1024;

// Make random
auto random_seed = 0; // (unsigned int)time(NULL);
uint32_t seed = tt::parse_env("SEED", random_seed);
log_info(tt::LogTest, "Using Test Seed: {}", seed);
srand(seed);

CoreCoord worker_grid_size = this->device_->compute_with_storage_grid_size();
CoreRange cr({0, 0}, {worker_grid_size.x - 1, worker_grid_size.y - 1});
CoreRangeSet cr_set({cr});

log_info(tt::LogTest, "Starting compile of {} programs now.", NUM_PROGRAMS);

vector<Program> programs;
for (uint32_t i = 0; i < NUM_PROGRAMS; i++) {
programs.push_back(Program());
Program& program = programs.back();

std::map<string, string> data_movement_defines = {{"DATA_MOVEMENT", "1"}};
std::map<string, string> compute_defines = {{"COMPUTE", "1"}};

// brisc
uint32_t BRISC_OUTER_LOOP, BRISC_MIDDLE_LOOP, BRISC_INNER_LOOP, NUM_CBS, NUM_SEMS;
bool USE_MAX_RT_ARGS;

if (i == 0) {
// Ensures that we get at least one compilation with the max amount to
// ensure it compiles and runs
BRISC_OUTER_LOOP = MAX_LOOP;
BRISC_MIDDLE_LOOP = MAX_LOOP;
BRISC_INNER_LOOP = MAX_LOOP;
NUM_CBS = NUM_CIRCULAR_BUFFERS;
NUM_SEMS = NUM_SEMAPHORES;
USE_MAX_RT_ARGS = true;
} else {
BRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1;
BRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1;
BRISC_INNER_LOOP = rand() % (MAX_LOOP) + 1;
NUM_CBS = rand() % (NUM_CIRCULAR_BUFFERS) + 1;
NUM_SEMS = rand() % (NUM_SEMAPHORES) + 1;
USE_MAX_RT_ARGS = false;
}

log_debug(tt::LogTest, "Compiling program {}/{} w/ BRISC_OUTER_LOOP: {} BRISC_MIDDLE_LOOP: {} BRISC_INNER_LOOP: {} NUM_CBS: {} NUM_SEMS: {} USE_MAX_RT_ARGS: {}",
i+1, NUM_PROGRAMS, BRISC_OUTER_LOOP, BRISC_MIDDLE_LOOP, BRISC_INNER_LOOP, NUM_CBS, NUM_SEMS, USE_MAX_RT_ARGS);

for (uint32_t j = 0; j < NUM_CBS; j++) {
CircularBufferConfig cb_config = CircularBufferConfig(page_size * (j + 1), {{j, tt::DataFormat::Float16_b}}).set_page_size(j, page_size * (j + 1));
auto cb = CreateCircularBuffer(program, cr_set, cb_config);
}

for (uint32_t j = 0; j < NUM_SEMS; j++) {
CreateSemaphore(program, cr_set, j + 1);
}

auto [brisc_unique_rtargs, brisc_common_rtargs] = local_test_functions::create_runtime_args(USE_MAX_RT_ARGS);
uint32_t num_brisc_unique_rtargs = brisc_unique_rtargs.size();
uint32_t num_brisc_common_rtargs = brisc_common_rtargs.size();
vector<uint32_t> brisc_compile_args = {BRISC_OUTER_LOOP, BRISC_MIDDLE_LOOP, BRISC_INNER_LOOP, NUM_CBS, NUM_SEMS, num_brisc_unique_rtargs, num_brisc_common_rtargs, page_size};

// ncrisc
uint32_t NCRISC_OUTER_LOOP, NCRISC_MIDDLE_LOOP, NCRISC_INNER_LOOP;
if (i == 0) {
NCRISC_OUTER_LOOP = MAX_LOOP;
NCRISC_MIDDLE_LOOP = MAX_LOOP;
NCRISC_INNER_LOOP = MAX_LOOP;
} else {
NCRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1;
NCRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1;
NCRISC_INNER_LOOP = rand() % (MAX_LOOP) + 1;
}

auto [ncrisc_unique_rtargs, ncrisc_common_rtargs] = local_test_functions::create_runtime_args(USE_MAX_RT_ARGS);
uint32_t num_ncrisc_unique_rtargs = ncrisc_unique_rtargs.size();
uint32_t num_ncrisc_common_rtargs = ncrisc_common_rtargs.size();
vector<uint32_t> ncrisc_compile_args = {NCRISC_OUTER_LOOP, NCRISC_MIDDLE_LOOP, NCRISC_INNER_LOOP, NUM_CBS, NUM_SEMS, num_ncrisc_unique_rtargs, num_ncrisc_common_rtargs, page_size};

// trisc
uint32_t TRISC_OUTER_LOOP, TRISC_MIDDLE_LOOP, TRISC_INNER_LOOP;
if (i == 0) {
TRISC_OUTER_LOOP = MAX_LOOP;
TRISC_MIDDLE_LOOP = MAX_LOOP;
TRISC_INNER_LOOP = MAX_LOOP;
} else {
TRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1;
TRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1;
TRISC_INNER_LOOP = rand() % (MAX_LOOP) + 1;
}

auto [trisc_unique_rtargs, trisc_common_rtargs] = local_test_functions::create_runtime_args(USE_MAX_RT_ARGS);
uint32_t num_trisc_unique_rtargs = trisc_unique_rtargs.size();
uint32_t num_trisc_common_rtargs = trisc_common_rtargs.size();
vector<uint32_t> trisc_compile_args = {TRISC_OUTER_LOOP, TRISC_MIDDLE_LOOP, TRISC_INNER_LOOP, NUM_CBS, NUM_SEMS, num_trisc_unique_rtargs, num_trisc_common_rtargs, page_size};

bool at_least_one_kernel = false;
if (i == 0 or ((rand() % 2) == 0)) {
auto dummy_brisc_kernel = CreateKernel(
program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, DataMovementConfig{
.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default, .compile_args = brisc_compile_args, .defines = data_movement_defines});
SetRuntimeArgs(program, dummy_brisc_kernel, cr_set, brisc_unique_rtargs);
SetCommonRuntimeArgs(program, dummy_brisc_kernel, brisc_common_rtargs);
at_least_one_kernel = true;
}

if (i == 0 or ((rand() % 2) == 0)) {
auto dummy_ncrisc_kernel = CreateKernel(
program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, DataMovementConfig{
.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default, .compile_args = ncrisc_compile_args, .defines = data_movement_defines});
SetRuntimeArgs(program, dummy_ncrisc_kernel, cr_set, ncrisc_unique_rtargs);
SetCommonRuntimeArgs(program, dummy_ncrisc_kernel, ncrisc_common_rtargs);
at_least_one_kernel = true;
}

if (i == 0 or ((rand() % 2) == 0)) {
auto dummy_trisc_kernel = CreateKernel(
program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, ComputeConfig{
.math_approx_mode = false,
.compile_args = trisc_compile_args,
.defines = compute_defines
});
SetRuntimeArgs(program, dummy_trisc_kernel, cr_set, trisc_unique_rtargs);
SetCommonRuntimeArgs(program, dummy_trisc_kernel, trisc_common_rtargs);
at_least_one_kernel = true;
}

if (not at_least_one_kernel) {
uint32_t random_risc = rand() % 3 + 1;
if (random_risc == 1) {
auto dummy_brisc_kernel = CreateKernel(
program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, DataMovementConfig{
.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default, .compile_args = brisc_compile_args, .defines = data_movement_defines});
SetRuntimeArgs(program, dummy_brisc_kernel, cr_set, brisc_unique_rtargs);
SetCommonRuntimeArgs(program, dummy_brisc_kernel, brisc_common_rtargs);
} else if (random_risc == 2) {
auto dummy_ncrisc_kernel = CreateKernel(
program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, DataMovementConfig{
.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default, .compile_args = ncrisc_compile_args, .defines = data_movement_defines});
SetRuntimeArgs(program, dummy_ncrisc_kernel, cr_set, ncrisc_unique_rtargs);
SetCommonRuntimeArgs(program, dummy_ncrisc_kernel, ncrisc_common_rtargs);
} else if (random_risc == 3) {
auto dummy_trisc_kernel = CreateKernel(
program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, ComputeConfig{
.math_approx_mode = false,
.compile_args = trisc_compile_args,
.defines = compute_defines
});
SetRuntimeArgs(program, dummy_trisc_kernel, cr_set, trisc_unique_rtargs);
SetCommonRuntimeArgs(program, dummy_trisc_kernel, trisc_common_rtargs);
} else {
TT_ASSERT("Invalid");
}
}

tt::tt_metal::detail::CompileProgram(this->device_, program);
}

for (uint8_t cq_id = 0; cq_id < this->device_->num_hw_cqs(); ++cq_id) {
log_info(tt::LogTest, "Running {} programs on cq {} for cache warmup.", programs.size(), (uint32_t)cq_id);
// This loop caches program and runs
for (Program& program: programs) {
EnqueueProgram(this->device_->command_queue(cq_id), program, false);
}

// This loops assumes already cached
uint32_t NUM_ITERATIONS = 500; // TODO(agrebenisan): Bump this to 5000, saw hangs for very large number of iterations, need to come back to that

log_info(tt::LogTest, "Running {} programs on cq {} for {} iterations now.", programs.size(), (uint32_t)cq_id, NUM_ITERATIONS);
for (uint32_t i = 0; i < NUM_ITERATIONS; i++) {
auto rng = std::default_random_engine {};
std::shuffle(std::begin(programs), std::end(programs), rng);
if (i % 10 == 0) {
log_debug(tt::LogTest, "Enqueueing {} programs on cq {} for iter: {}/{} now.", programs.size(), (uint32_t)cq_id, i+1, NUM_ITERATIONS);
}
for (Program& program: programs) {
EnqueueProgram(this->device_->command_queue(cq_id), program, false);
}
}

log_info(tt::LogTest, "Calling Finish.");
Finish(this->device_->command_queue(cq_id));
}
}

} // namespace stress_tests
15 changes: 1 addition & 14 deletions tt_metal/detail/tt_metal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -399,18 +399,6 @@ namespace tt::tt_metal{
pcie_chan_end_addr += tt::Cluster::instance().get_host_channel_size(device->id(), pcie_chan);
}

uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id());
const uint8_t cq_id = 0; // Currently, only the first command queue is responsible for enqueuing programs
tt_cxy_pair enqueue_program_dispatch_core;
if (device->is_mmio_capable()) {
enqueue_program_dispatch_core = dispatch_core_manager::get(device->num_hw_cqs()).dispatcher_core(device->id(), channel, cq_id);
} else {
enqueue_program_dispatch_core = dispatch_core_manager::get(device->num_hw_cqs()).dispatcher_d_core(device->id(), channel, cq_id);
}

CoreType core_type = dispatch_core_manager::get(device->num_hw_cqs()).get_dispatch_core_type(device->id());
CoreCoord physical_enqueue_program_dispatch_core = get_physical_core_coordinate(enqueue_program_dispatch_core, core_type);

jit_build_genfiles_noc_addr_ranges_header(
path,
pcie_chan_base_addr,
Expand All @@ -421,8 +409,7 @@ namespace tt::tt_metal{
soc_d.get_dram_cores(),
soc_d.get_physical_ethernet_cores(),
soc_d.grid_size,
harvested_rows,
physical_enqueue_program_dispatch_core);
harvested_rows);
}

inline void CheckDataMovementConfig(Program &program, const std::string &file_name, const CoreRangeSet &core_ranges) {
Expand Down
2 changes: 1 addition & 1 deletion tt_metal/hw/firmware/src/brisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,7 @@ int main() {
// Notify dispatcher core that it has completed
if (mailboxes->launch.mode == DISPATCH_MODE_DEV) {
uint64_t dispatch_addr =
NOC_XY_ADDR(NOC_X(DISPATCH_CORE_X), NOC_Y(DISPATCH_CORE_Y), DISPATCH_MESSAGE_ADDR);
NOC_XY_ADDR(NOC_X(mailboxes->launch.dispatch_core_x), NOC_Y(mailboxes->launch.dispatch_core_y), DISPATCH_MESSAGE_ADDR);
DEBUG_SANITIZE_NOC_ADDR(dispatch_addr, 4);
noc_fast_atomic_increment(
noc_index,
Expand Down
2 changes: 1 addition & 1 deletion tt_metal/hw/firmware/src/erisck.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ void __attribute__((section("erisc_l1_code"))) kernel_launch() {

kernel_main();
mailboxes->launch.run = RUN_MSG_DONE;
uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(DISPATCH_CORE_X), NOC_Y(DISPATCH_CORE_Y), DISPATCH_MESSAGE_ADDR);
uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(mailboxes->launch.dispatch_core_x), NOC_Y(mailboxes->launch.dispatch_core_y), DISPATCH_MESSAGE_ADDR);
if (routing_info->routing_enabled and mailboxes->launch.mode == DISPATCH_MODE_DEV) {
internal_::notify_dispatch_core_done(dispatch_addr);
}
Expand Down
2 changes: 1 addition & 1 deletion tt_metal/hw/firmware/src/idle_erisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ int main() {

// Notify dispatcher core that it has completed
if (mailboxes->launch.mode == DISPATCH_MODE_DEV) {
uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(DISPATCH_CORE_X), NOC_Y(DISPATCH_CORE_Y), DISPATCH_MESSAGE_ADDR);
uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(mailboxes->launch.dispatch_core_x), NOC_Y(mailboxes->launch.dispatch_core_y), DISPATCH_MESSAGE_ADDR);
DEBUG_SANITIZE_NOC_ADDR(dispatch_addr, 4);
noc_fast_atomic_increment(noc_index, NCRISC_AT_CMD_BUF, dispatch_addr, NOC_UNICAST_WRITE_VC, 1, 31 /*wrap*/, false /*linked*/);
}
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/hw/inc/dev_msgs.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ struct launch_msg_t { // must be cacheline aligned
volatile uint8_t brisc_noc_id;
volatile uint8_t enables[DISPATCH_CLASS_MAX_PROC];
volatile uint8_t max_cb_index;
volatile uint8_t dispatch_core_x;
volatile uint8_t dispatch_core_y;
volatile uint8_t run; // must be in last cacheline of this msg
};

Expand Down
Loading

0 comments on commit 956e2bb

Please sign in to comment.