Skip to content

Commit

Permalink
#11669: Profiler slow dispatch unit test
Browse files Browse the repository at this point in the history
  • Loading branch information
mo-tenstorrent committed Aug 20, 2024
1 parent 42cca16 commit a402d6d
Show file tree
Hide file tree
Showing 6 changed files with 215 additions and 2 deletions.
29 changes: 27 additions & 2 deletions tests/tt_metal/tools/profiler/test_device_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,16 @@
PROG_EXMP_DIR = "programming_examples/profiler"


def run_device_profiler_test(testName=None, setup=False):
def run_device_profiler_test(testName=None, setup=False, slowDispatch=False):
name = inspect.stack()[1].function
testCommand = f"build/{PROG_EXMP_DIR}/{name}"
if testName:
testCommand = testName
clear_profiler_runtime_artifacts()
profilerRun = os.system(f"cd {TT_METAL_HOME} && {testCommand}")
slowDispatchEnv = ""
if slowDispatch:
slowDispatchEnv = "TT_METAL_SLOW_DISPATCH_MODE=1 "
profilerRun = os.system(f"cd {TT_METAL_HOME} && {slowDispatchEnv}{testCommand}")
assert profilerRun == 0

setupStr = ""
Expand Down Expand Up @@ -77,6 +80,28 @@ def test_multi_op():
assert stats[statName]["stats"]["Count"] in REF_COUNT_DICT[ENV_VAR_ARCH_NAME], "Wrong Marker Repeat count"


def test_custom_cycle_count_slow_dispatch():
REF_CYCLE_COUNT_PER_LOOP = 52
LOOP_COUNT = 2000
REF_CYCLE_COUNT = REF_CYCLE_COUNT_PER_LOOP * LOOP_COUNT
REF_CYCLE_COUNT_HIGH_MULTIPLIER = 10
REF_CYCLE_COUNT_LOW_MULTIPLIER = 5

REF_CYCLE_COUNT_MAX = REF_CYCLE_COUNT * REF_CYCLE_COUNT_HIGH_MULTIPLIER
REF_CYCLE_COUNT_MIN = REF_CYCLE_COUNT // REF_CYCLE_COUNT_LOW_MULTIPLIER

devicesData = run_device_profiler_test(setup=True, slowDispatch=True)

stats = devicesData["data"]["devices"]["0"]["cores"]["DEVICE"]["analysis"]

for risc in ["BRISC", "NCRISC", "TRISC_0", "TRISC_1", "TRISC_2"]:
statName = f"{risc} KERNEL_START->KERNEL_END"

assert statName in stats.keys(), "Wrong device analysis format"
assert stats[statName]["stats"]["Average"] < REF_CYCLE_COUNT_MAX, "Wrong cycle count, too high"
assert stats[statName]["stats"]["Average"] > REF_CYCLE_COUNT_MIN, "Wrong cycle count, too low"


def test_custom_cycle_count():
REF_CYCLE_COUNT_PER_LOOP = 52
LOOP_COUNT = 2000
Expand Down
1 change: 1 addition & 0 deletions tt_metal/programming_examples/profiler/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@

set(PROFILER_EXAMPLES_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/test_custom_cycle_count/test_custom_cycle_count.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_custom_cycle_count_slow_dispatch/test_custom_cycle_count_slow_dispatch.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_full_buffer/test_full_buffer.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_multi_op/test_multi_op.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_dispatch_cores/test_dispatch_cores.cpp
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <cstdint>
#include "compute_kernel_api.h"

/**
* LOOP_COUNT and LOOP_SIZE provide the ability to decide how many cycles this kernel takes.
* With a large enough LOOP_COUNT and a LOOP_SIZEs within icache size, cycle count will be
* very close to LOOP_COUNT x (LOOP_SIZE + loop_overhead). loop_overhead is 2 cycle 1 for
* addi and 1 for branch if not zero.
*
* Keeping LOOP_SIZE constant and suitable for all 5 risc ichahes, The diff between to runs
* with LOOP_COUNT and LOOP_COUNT + 1 should be the same across all riscs and it should be
* LOOP_COUNT + 2 cycles
*
* More info on tt-metal issue #515
*
* https://github.com/tenstorrent/tt-metal/issues/515#issuecomment-1548434301
*/

namespace NAMESPACE {
void MAIN {
for (int i = 0; i < LOOP_COUNT; i ++)
{
//Max unroll size
#pragma GCC unroll 65534
for (int j = 0 ; j < LOOP_SIZE; j++)
{
asm("nop");
}
}
}
} // NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <cstdint>

/**
* LOOP_COUNT and LOOP_SIZE provide the ability to decide how many cycles this kernel takes.
* With a large enough LOOP_COUNT and a LOOP_SIZEs within icache size, cycle count will be
* very close to LOOP_COUNT x (LOOP_SIZE + loop_overhead). loop_overhead is 2 cycle 1 for
* addi and 1 for branch if not zero.
*
* Keeping LOOP_SIZE constant and suitable for all 5 risc ichahes, The diff between to runs
* with LOOP_COUNT and LOOP_COUNT + 1 should be the same across all riscs and it should be
* LOOP_COUNT + 2 cycles
*
* More info on tt-metal issue #515
*
* https://github.com/tenstorrent/tt-metal/issues/515#issuecomment-1548434301
*/

void kernel_main() {
for (int i = 0; i < LOOP_COUNT; i ++)
{
//Max unroll size
#pragma GCC unroll 65534
for (int j = 0 ; j < LOOP_SIZE; j++)
{
asm("nop");
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include "tt_metal/host_api.hpp"
#include "tt_metal/detail/tt_metal.hpp"
#include "tt_metal/impl/device/device.hpp"

using namespace tt;

bool RunCustomCycle(tt_metal::Device *device, int loop_count)
{
bool pass = true;

CoreCoord compute_with_storage_size = device->compute_with_storage_grid_size();
CoreCoord start_core = {0, 0};
CoreCoord end_core = {compute_with_storage_size.x - 1, compute_with_storage_size.y - 1};
CoreRange all_cores(start_core, end_core);

tt_metal::Program program = tt_metal::CreateProgram();

constexpr int loop_size = 50;
constexpr bool profile_device = true;
std::map<string, string> kernel_defines = {
{"LOOP_COUNT", std::to_string(loop_count)},
{"LOOP_SIZE", std::to_string(loop_size)}
};

tt_metal::KernelHandle brisc_kernel = tt_metal::CreateKernel(
program, "tt_metal/programming_examples/profiler/test_custom_cycle_count_slow_dispatch/kernels/custom_cycle_count_slow_dispatch.cpp",
all_cores,
tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default, .defines = kernel_defines});

tt_metal::KernelHandle ncrisc_kernel = tt_metal::CreateKernel(
program, "tt_metal/programming_examples/profiler/test_custom_cycle_count_slow_dispatch/kernels/custom_cycle_count_slow_dispatch.cpp",
all_cores,
tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default, .defines = kernel_defines});

vector<uint32_t> trisc_kernel_args = {};
tt_metal::KernelHandle trisc_kernel = tt_metal::CreateKernel(
program, "tt_metal/programming_examples/profiler/test_custom_cycle_count_slow_dispatch/kernels/custom_cycle_count_compute_slow_dispatch.cpp",
all_cores,
tt_metal::ComputeConfig{.compile_args = trisc_kernel_args, .defines = kernel_defines}
);

tt_metal::detail::LaunchProgram(device, program);

return pass;
}

int main(int argc, char **argv) {
bool pass = true;

try {
////////////////////////////////////////////////////////////////////////////
// Device Setup
////////////////////////////////////////////////////////////////////////////
int device_id = 0;
tt_metal::Device *device =
tt_metal::CreateDevice(device_id);

int loop_count = 2000;
pass &= RunCustomCycle(device, loop_count);

pass &= tt_metal::CloseDevice(device);

} catch (const std::exception &e) {
pass = false;
// Capture the exception error message
log_error(LogTest, "{}", e.what());
// Capture system call errors that may have returned from driver/kernel
log_error(LogTest, "System error message: {}", std::strerror(errno));
}

if (pass) {
log_info(LogTest, "Test Passed");
} else {
TT_THROW("Test Failed");
}

TT_FATAL(pass);

return 0;
}
36 changes: 36 additions & 0 deletions tt_metal/tools/profiler/device_post_proc_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,42 @@ class test_custom_cycle_count(default_setup):
detectOps = False


class test_custom_cycle_count_slow_dispatch(default_setup):
timerAnalysis = {
"BRISC KERNEL_START->KERNEL_END": {
"across": "core",
"type": "adjacent",
"start": {"core": "ANY", "risc": "BRISC", "zone_name": "BRISC-KERNEL"},
"end": {"core": "ANY", "risc": "BRISC", "zone_name": "BRISC-KERNEL"},
},
"NCRISC KERNEL_START->KERNEL_END": {
"across": "core",
"type": "adjacent",
"start": {"core": "ANY", "risc": "NCRISC", "zone_name": "NCRISC-KERNEL"},
"end": {"core": "ANY", "risc": "NCRISC", "zone_name": "NCRISC-KERNEL"},
},
"TRISC_0 KERNEL_START->KERNEL_END": {
"across": "core",
"type": "adjacent",
"start": {"core": "ANY", "risc": "TRISC_0", "zone_name": "TRISC-KERNEL"},
"end": {"core": "ANY", "risc": "TRISC_0", "zone_name": "TRISC-KERNEL"},
},
"TRISC_1 KERNEL_START->KERNEL_END": {
"across": "core",
"type": "adjacent",
"start": {"core": "ANY", "risc": "TRISC_1", "zone_name": "TRISC-KERNEL"},
"end": {"core": "ANY", "risc": "TRISC_1", "zone_name": "TRISC-KERNEL"},
},
"TRISC_2 KERNEL_START->KERNEL_END": {
"across": "core",
"type": "adjacent",
"start": {"core": "ANY", "risc": "TRISC_2", "zone_name": "TRISC-KERNEL"},
"end": {"core": "ANY", "risc": "TRISC_2", "zone_name": "TRISC-KERNEL"},
},
}
detectOps = False


class test_full_buffer(default_setup):
timerAnalysis = {
"Marker Repeat": {
Expand Down

0 comments on commit a402d6d

Please sign in to comment.