Skip to content

Commit

Permalink
#14849: Profiler sub-device post proc fix and smoke test
Browse files Browse the repository at this point in the history
  • Loading branch information
mo-tenstorrent committed Dec 9, 2024
1 parent 7aed5ea commit 36b3164
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 30 deletions.
55 changes: 44 additions & 11 deletions tests/tt_metal/tools/profiler/test_device_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import re
import inspect
import pytest
import subprocess

import pandas as pd

Expand All @@ -24,6 +25,30 @@
PROG_EXMP_DIR = "programming_examples/profiler"


def get_device_data(setupStr=""):
    """Post-process the device profiler log and return the parsed analysis data.

    Runs ``./process_device_log.py`` from ``PROFILER_SCRIPTS_ROOT`` with the
    ``--no-artifacts --no-print-stats`` flags, then loads
    ``output/device/device_analysis_data.json`` from ``PROFILER_ARTIFACTS_DIR``.

    Args:
        setupStr: Extra command-line arguments placed before the fixed flags
            (for example ``"-s <setup name>"``). Empty by default.

    Returns:
        The object decoded from ``device_analysis_data.json``.

    Raises:
        AssertionError: If the post-processing script exits with a non-zero code.
    """
    # subprocess.run reports the real exit code. os.system returns an encoded
    # wait status on POSIX (exit code 1 shows up as 256), which made the
    # "exit code" in the assertion message misleading.
    postProcessRun = subprocess.run(
        f"cd {PROFILER_SCRIPTS_ROOT} && " f"./process_device_log.py {setupStr} --no-artifacts --no-print-stats",
        shell=True,
    ).returncode

    assert postProcessRun == 0, f"Log process script crashed with exit code {postProcessRun}"

    with open(f"{PROFILER_ARTIFACTS_DIR}/output/device/device_analysis_data.json", "r") as devicesDataJson:
        return json.load(devicesDataJson)


def run_gtest_profiler_test(testbin, testname):
    """Run a single gtest case and post-process the profiler log it produced.

    Clears the profiler runtime artifacts, runs ``testbin`` from
    ``TT_METAL_HOME`` filtered to ``testname``, prints the combined
    stdout/stderr, and calls ``get_device_data()`` unless the output contains
    ``SKIPPED``.

    Args:
        testbin: Command used to launch the gtest binary, resolved from
            ``TT_METAL_HOME``.
        testname: Value passed to ``--gtest_filter``.

    Raises:
        subprocess.CalledProcessError: If the gtest binary exits non-zero.
        AssertionError: Propagated from ``get_device_data()`` when log
            post-processing fails.
    """
    clear_profiler_runtime_artifacts()
    try:
        output = subprocess.check_output(
            f"cd {TT_METAL_HOME} && {testbin} --gtest_filter={testname}", stderr=subprocess.STDOUT, shell=True
        ).decode("UTF-8")
    except subprocess.CalledProcessError as err:
        # check_output raises before the normal print below is reached, so
        # surface the captured gtest log here; otherwise a failing run leaves
        # only the command line and return code in the test report.
        print(err.output.decode("UTF-8", errors="replace"))
        raise
    print(output)
    # A skipped gtest is not expected to have produced a log to post-process.
    if "SKIPPED" not in output:
        get_device_data()


def run_device_profiler_test(testName=None, setup=False, slowDispatch=False):
name = inspect.stack()[1].function
testCommand = f"build/{PROG_EXMP_DIR}/{name}"
Expand All @@ -41,17 +66,7 @@ def run_device_profiler_test(testName=None, setup=False, slowDispatch=False):
if setup:
setupStr = f"-s {name}"

postProcessRun = os.system(
f"cd {PROFILER_SCRIPTS_ROOT} && " f"./process_device_log.py {setupStr} --no-artifacts --no-print-stats"
)

assert postProcessRun == 0, f"Log process script crashed with exit code {postProcessRun}"

devicesData = {}
with open(f"{PROFILER_ARTIFACTS_DIR}/output/device/device_analysis_data.json", "r") as devicesDataJson:
devicesData = json.load(devicesDataJson)

return devicesData
return get_device_data(setupStr)


def get_function_name():
Expand Down Expand Up @@ -231,6 +246,8 @@ def test_profiler_host_device_sync():
assert freq < (reportedFreq * (1 + TOLERANCE)), f"Frequency too large on device {device}"
assert freq > (reportedFreq * (1 - TOLERANCE)), f"Frequency too small on device {device}"

os.environ["TT_METAL_PROFILER_SYNC"] = "0"


def test_timestamped_events():
OP_COUNT = 2
Expand Down Expand Up @@ -268,3 +285,19 @@ def test_timestamped_events():
devicesData["data"]["devices"]["0"]["cores"]["DEVICE"]["riscs"]["TENSIX"]["events"]["all_events"]
)
assert eventCount in REF_COUNT_DICT[ENV_VAR_ARCH_NAME], "Wrong event count"


def test_sub_device_profiler():
    """Smoke-test profiler post-processing for the sub-device gtest cases.

    Runs three gtest cases through ``run_gtest_profiler_test``: the Tensix
    sub-device programs, the Tensix + active-ethernet sub-device programs with
    ``TT_METAL_PROFILER_SYNC`` set to ``"1"``, and the traced sub-device
    programs with the variable back at ``"0"``.
    """
    run_gtest_profiler_test(
        "./build/test/tt_metal/unit_tests_dispatch", "CommandQueueSingleCardFixture.TensixTestSubDeviceBasicPrograms"
    )
    os.environ["TT_METAL_PROFILER_SYNC"] = "1"
    try:
        run_gtest_profiler_test(
            "./build/test/tt_metal/unit_tests_dispatch",
            "CommandQueueSingleCardFixture.TensixActiveEthTestSubDeviceBasicEthPrograms",
        )
    finally:
        # Always switch sync back off, even when the run above fails, so the
        # setting does not leak into tests that execute later in the session.
        os.environ["TT_METAL_PROFILER_SYNC"] = "0"
    run_gtest_profiler_test(
        "./build/test/tt_metal/unit_tests_dispatch_trace",
        "CommandQueueSingleCardTraceFixture.TensixTestSubDeviceTraceBasicPrograms",
    )
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ TEST_F(CommandQueueSingleCardFixture, TensixTestSubDeviceBasicPrograms) {
EnqueueProgram(device->command_queue(), incrementer_program, false);
}
Synchronize(device);
detail::DumpDeviceProfileResults(device);
}
}

Expand Down Expand Up @@ -136,5 +137,6 @@ TEST_F(CommandQueueSingleCardFixture, TensixActiveEthTestSubDeviceBasicEthProgra
EnqueueProgram(device->command_queue(), incrementer_program, false);
}
Synchronize(device);
detail::DumpDeviceProfileResults(device);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ TEST_F(CommandQueueSingleCardTraceFixture, TensixTestSubDeviceTraceBasicPrograms
ReplayTrace(device, device->command_queue().id(), tid_2, false);
}
Synchronize(device);
detail::DumpDeviceProfileResults(device);
}
}

Expand Down
5 changes: 2 additions & 3 deletions tt_metal/tools/profiler/process_device_log.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,7 @@ def get_ops(timeseries):
opsDict[opID].append(ts)

ordered_ops = list(opsDict.keys())
# sort over timestamps
ordered_ops.sort(key=lambda x: opsDict[x][0][1])

ops = []
Expand All @@ -327,9 +328,7 @@ def get_ops(timeseries):
if (risc == "BRISC" and timerID["zone_name"] == "BRISC-FW" and timerID["type"] == "ZONE_START") or (
risc == "ERISC" and timerID["zone_name"] == "ERISC-FW" and timerID["type"] == "ZONE_START"
):
for opDuration in coresOp.values():
assert len(opDuration) == 2, "Unexpected FW start"

assert len(coresOp[core]) == 2, "Unexpected FW end"
ops.append({"timeseries": []})
coresOp = {}
elif (risc == "BRISC" and timerID["zone_name"] == "BRISC-FW" and timerID["type"] == "ZONE_END") or (
Expand Down
32 changes: 16 additions & 16 deletions tt_metal/tools/profiler/tt_metal_profiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,22 +99,22 @@ void syncDeviceHost(
smallestHostime.emplace(device_id, 0);

constexpr uint16_t sampleCount = 249;
if (sync_program == nullptr) {
sync_program = std::make_shared<tt_metal::Program>();

std::map<string, string> kernel_defines = {
{"SAMPLE_COUNT", std::to_string(sampleCount)},
};

tt_metal::KernelHandle brisc_kernel = tt_metal::CreateKernel(
*sync_program,
"tt_metal/tools/profiler/sync/sync_kernel.cpp",
logical_core,
tt_metal::DataMovementConfig{
.processor = tt_metal::DataMovementProcessor::RISCV_0,
.noc = tt_metal::NOC::RISCV_0_default,
.defines = kernel_defines});
}
// TODO(MO): Always recreate a new program until subdevice
// allows using the first program generated by default manager
sync_program = std::make_shared<tt_metal::Program>();

std::map<string, string> kernel_defines = {
{"SAMPLE_COUNT", std::to_string(sampleCount)},
};

tt_metal::KernelHandle brisc_kernel = tt_metal::CreateKernel(
*sync_program,
"tt_metal/tools/profiler/sync/sync_kernel.cpp",
logical_core,
tt_metal::DataMovementConfig{
.processor = tt_metal::DataMovementProcessor::RISCV_0,
.noc = tt_metal::NOC::RISCV_0_default,
.defines = kernel_defines});

EnqueueProgram(device->command_queue(), *sync_program, false);

Expand Down

0 comments on commit 36b3164

Please sign in to comment.