Fix benchmark result parsing. Improve pytest
RattataKing committed Aug 21, 2024
1 parent 8b707e9 commit c516eb9
Showing 2 changed files with 104 additions and 109 deletions.
60 changes: 23 additions & 37 deletions tuning/libtuner.py
@@ -831,14 +831,16 @@ def parse_dispatch_benchmark_results(
) -> tuple[list[ParsedDisptachBenchmarkResult], list[str]]:
benchmark_result_configs = []
dump_list = []
incomplete_list = []

for benchmark_result in benchmark_results:
res_str = benchmark_result.result.stdout
candidate_id = benchmark_result.candidate_id
if res_str is None:
continue
res = IREEBenchmarkResult(candidate_id, res_str)
benchmark_time = res.get_mean_time()
if benchmark_time is None:
incomplete_list.append(candidate_id)
continue
assert benchmark_time is not None
candidate_trackers[candidate_id].first_benchmark_time = benchmark_time
candidate_trackers[candidate_id].spec_path = (
@@ -860,6 +862,10 @@ def parse_dispatch_benchmark_results(
)
)
)

if incomplete_list:
dump_list += [f"Candidate {i} not completed" for i in incomplete_list]

return benchmark_result_configs, dump_list
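
Editorial sketch (not part of the diff): a minimal illustration of the new incomplete-result path, assuming IREEBenchmarkResult.get_mean_time() returns None when the benchmark stdout carries no mean-time line (the case the updated test below exercises with an empty stdout).

    import libtuner  # the module under test, as imported by tuning/test_libtuner.py

    incomplete_list: list[int] = []
    dump_list: list[str] = []

    res = libtuner.IREEBenchmarkResult(3, "")  # candidate 3 with empty benchmark output
    if res.get_mean_time() is None:  # assumed: nothing to parse -> None
        incomplete_list.append(3)  # flagged as incomplete instead of tripping an assert

    if incomplete_list:
        dump_list += [f"Candidate {i} not completed" for i in incomplete_list]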


@@ -1055,22 +1061,6 @@ def compile_models(
)


def sort_candidates_by_first_benchmark_times(
candidate_indexes: list[int], candidate_trackers: list[CandidateTracker]
) -> list[int]:
"""Sorts candidate indexes based on their first benchmark times in ascending order"""
# Get the first benchmark times, defaulting to a large number if None
first_benchmark_times = [
candidate_trackers[index].first_benchmark_time or float("inf")
for index in candidate_indexes
]
combined = list(zip(candidate_indexes, first_benchmark_times))
combined_sorted = sorted(combined, key=lambda x: x[1])
sorted_indexes, _ = zip(*combined_sorted)

return list(sorted_indexes)


def group_benchmark_results_by_device_id(
benchmark_results: list[TaskResult],
) -> list[list[TaskResult]]:
@@ -1101,6 +1091,7 @@ def parse_model_benchmark_results(
candidate_results: list[TaskResult],
baseline_results: list[TaskResult],
):
"""Update candidate_tracker and format a list of result strings to be saved later."""
candidate_results = sorted(candidate_results, key=lambda br: br.device_id)
baseline_results = sorted(baseline_results, key=lambda tr: tr.device_id)

@@ -1115,7 +1106,7 @@
dump_list = []
incomplete_list: list[tuple[int, Optional[str]]] = (
[]
) # format: [(candidate_id, device_id)], baseline will have candidate_id=0
) # format: [(candidate_id, device_id)]

baseline_time = None
for same_device_results in grouped_benchmark_results:
@@ -1128,17 +1119,12 @@
# Check if benchmarking has completed
if result_str is None:
incomplete_list.append((candidate_id, device_id))
if candidate_id == 0:
baseline_time = None
continue

res = IREEBenchmarkResult(candidate_id, result_str)
benchmark_time = res.get_mean_time()
if benchmark_time == None:
handle_error(
condition=True,
msg="Failed to extract benchmark time for candidate {candidate_id}",
level=logging.WARNING,
)
continue
assert benchmark_time is not None

# Record baseline benchmarking result and skip the rest of the processing
@@ -1163,17 +1149,17 @@
candidate_trackers[candidate_id].model_benchmark_time = benchmark_time
candidate_trackers[candidate_id].model_benchmark_device_id = device_id

# Skip improvement calculation if no baseline data.
if baseline_time is None:
dump_unsort_list.append((benchmark_time, result_str))
continue

# Calculate candidate improvement based on baseline.
candidate_trackers[candidate_id].baseline_benchmark_time = baseline_time
calibrated_benchmark_diff = (benchmark_time - baseline_time) / baseline_time
candidate_trackers[candidate_id].calibrated_benchmark_diff = (
calibrated_benchmark_diff
)
if baseline_time:
candidate_trackers[candidate_id].baseline_benchmark_time = baseline_time
calibrated_benchmark_diff = (
benchmark_time - baseline_time
) / baseline_time
candidate_trackers[candidate_id].calibrated_benchmark_diff = (
calibrated_benchmark_diff
)
else:
calibrated_benchmark_diff = None

# Collect candidate dump str
candidate_vmfb_path = candidate_trackers[candidate_id].compiled_model_path
@@ -1199,7 +1185,7 @@
for index, device in incomplete_list:
file_path = candidate_trackers[index].compiled_model_path
assert file_path is not None
error_msg = f"Benchmarking result of {file_path.as_posix()} on deivce {device} is incomplete"
error_msg = f"Benchmarking result of {file_path.as_posix()} on device {device} is incomplete"
handle_error(condition=True, msg=error_msg, level=logging.WARNING)
dump_list.append(error_msg + "\n")
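
Editorial sketch (not part of the diff): with the reworked baseline handling, a device whose baseline run is incomplete is reported via the warning above, and its candidates still get dump entries, just without the relative-improvement percentage. For a device that does have a baseline, the calibrated diff is the relative change against it; a quick check with the figures used in test_parse_model_benchmark_results below:

    # Candidate 1 vs. the baseline on device1, values taken from the updated test
    baseline_time, benchmark_time = 0.98, 1.23
    calibrated_diff = (benchmark_time - baseline_time) / baseline_time
    print(f"{calibrated_diff:+.3%}")  # +25.510%, matching the "(+25.510%)" suffix in the expected dump entry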

153 changes: 81 additions & 72 deletions tuning/test_libtuner.py
@@ -39,23 +39,6 @@ def test_group_benchmark_results_by_device_id():
assert grouped_results[1][0].device_id == "device_2"


def test_sort_candidates_by_first_benchmark_times():
candidate_trackers = [libtuner.CandidateTracker(i) for i in range(5)]
candidate_trackers[0].first_benchmark_time = 35
candidate_trackers[1].first_benchmark_time = 2141
candidate_trackers[2].first_benchmark_time = 231
candidate_trackers[3].first_benchmark_time = 231.23
candidate_trackers[4].first_benchmark_time = 58
test_input = [i for i in range(5)]
expect_output = [0, 4, 2, 3, 1]
assert (
libtuner.sort_candidates_by_first_benchmark_times(
test_input, candidate_trackers
)
== expect_output
)


def test_find_collisions():
input = [(1, "abc"), (2, "def"), (3, "abc")]
assert libtuner.find_collisions(input) == (True, [("abc", [1, 3]), ("def", [2])])
@@ -106,17 +89,17 @@ def test_IREEBenchmarkResult_get():

def test_generate_display_BR():
output = libtuner.generate_display_DBR(1, 3.14)
expected = f"1\tMean Time: 3.1\n"
expected = f"1\tMean Time: 3.1"
assert output == expected, "DispatchBenchmarkResult generates invalid sample string"

output = libtuner.generate_display_MBR("baseline.vmfb", str(1), 567.89)
expected = "Benchmarking: baseline.vmfb on device 1: 568\n\n"
expected = "Benchmarking: baseline.vmfb on device 1: 568"
assert output == expected, "ModelBenchmarkResult generates invalid sample string"
output = libtuner.generate_display_MBR("baseline.vmfb", str(1), 567.89, 0.0314)
expected = "Benchmarking: baseline.vmfb on device 1: 568 (+3.140%)\n\n"
expected = "Benchmarking: baseline.vmfb on device 1: 568 (+3.140%)"
assert output == expected, "ModelBenchmarkResult generates invalid sample string"
output = libtuner.generate_display_MBR("baseline.vmfb", str(1), 567.89, -3.14)
expected = "Benchmarking: baseline.vmfb on device 1: 568 (-314.000%)\n\n"
expected = "Benchmarking: baseline.vmfb on device 1: 568 (-314.000%)"
assert output == expected, "ModelBenchmarkResult generates invalid sample string"
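
Editorial note (not part of the diff): the display helpers no longer return trailing newlines. Judging by the expected dump strings further down (and the error path in libtuner.py above, which appends "\n" itself), the separators appear to be added where the entries are collected; a hedged illustration:

    # Assumed caller-side behavior, inferred from the expected dump_list entries below
    display = libtuner.generate_display_MBR("baseline.vmfb", "1", 567.89)  # "Benchmarking: baseline.vmfb on device 1: 568"
    dump_entry = display + "\n\n"  # blank-line separator appended when the entry is stored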


@@ -132,15 +115,25 @@ def test_parse_dispatch_benchmark_results():
mock_result_2 = MagicMock()
mock_result_2.result.stdout = "process_time/real_time_mean 200.0 us"
mock_result_2.candidate_id = 2
benchmark_results = [mock_result_1, mock_result_2]
mock_result_3 = MagicMock()
mock_result_3.result.stdout = "" # Incomplete result
mock_result_3.candidate_id = 3
benchmark_results = [mock_result_1, mock_result_2, mock_result_3]

candidate_tracker_0 = libtuner.CandidateTracker(candidate_id=0)
candidate_tracker_0.dispatch_mlir_path = libtuner.Path("/mock/mlir/path/0.mlir")
candidate_tracker_1 = libtuner.CandidateTracker(candidate_id=1)
candidate_tracker_1.dispatch_mlir_path = libtuner.Path("/mock/mlir/path/1.mlir")
candidate_tracker_2 = libtuner.CandidateTracker(candidate_id=2)
candidate_tracker_2.dispatch_mlir_path = libtuner.Path("/mock/mlir/path/2.mlir")
candidate_trackers = [candidate_tracker_0, candidate_tracker_1, candidate_tracker_2]
candidate_tracker_3 = libtuner.CandidateTracker(candidate_id=3)
candidate_tracker_3.dispatch_mlir_path = libtuner.Path("/mock/mlir/path/3.mlir")
candidate_trackers = [
candidate_tracker_0,
candidate_tracker_1,
candidate_tracker_2,
candidate_tracker_3,
]

expected_parsed_results = [
libtuner.ParsedDisptachBenchmarkResult(
@@ -159,6 +152,7 @@
expected_dump_list = [
"1\tMean Time: 100.0\n",
"2\tMean Time: 200.0\n",
"Candidate 3 not completed",
]

parsed_results, dump_list = libtuner.parse_dispatch_benchmark_results(
@@ -179,22 +173,19 @@

def test_parse_model_benchmark_results():
# Setup mock data for candidate_trackers
tracker0 = MagicMock(spec=libtuner.CandidateTracker)
tracker0.compiled_model_path = MagicMock(
return_value=libtuner.Path("/path/to/baseline.vmfb")
)
tracker0 = libtuner.CandidateTracker(0)
tracker0.compiled_model_path = libtuner.Path("/path/to/baseline.vmfb")

tracker1 = MagicMock(spec=libtuner.CandidateTracker)
tracker1.compiled_model_path = MagicMock(
return_value=libtuner.Path("/path/to/model_1.vmfb")
)
tracker1 = libtuner.CandidateTracker(1)
tracker1.compiled_model_path = libtuner.Path("/path/to/model_1.vmfb")

tracker2 = MagicMock(spec=libtuner.CandidateTracker)
tracker2.compiled_model_path = MagicMock(
return_value=libtuner.Path("/path/to/model_2.vmfb")
)
tracker2 = libtuner.CandidateTracker(2)
tracker2.compiled_model_path = libtuner.Path("/path/to/model_2.vmfb")

tracker3 = libtuner.CandidateTracker(3)
tracker3.compiled_model_path = libtuner.Path("/path/to/model_3.vmfb")

candidate_trackers = [tracker0, tracker1, tracker2]
candidate_trackers = [tracker0, tracker1, tracker2, tracker3]

# Setup mock data for task results
result1 = MagicMock(spec=libtuner.TaskResult)
@@ -217,49 +208,67 @@
result4.candidate_id = 0
result4.device_id = "device2"

candidate_results = [result1, result2]
baseline_results = [result3, result4]
# Incomplete baseline on device3
result5 = MagicMock(spec=libtuner.TaskResult)
result5.result = MagicMock(stdout=None)
result5.candidate_id = 0
result5.device_id = "device3"

# Mock IREEBenchmarkResult to return float value from stdout
result6 = MagicMock(spec=libtuner.TaskResult)
result6.result = MagicMock(stdout="3.38")
result6.candidate_id = 3
result6.device_id = "device3"

candidate_results = [result1, result2, result6]
baseline_results = [result3, result4, result5]

# Skip real benchmark extraction; use the values given above directly
def mock_get_mean_time(self):
return float(self.result_str)
return float(self.result_str) if self.result_str else None

# Mock IREEBenchmarkResult to return specific benchmark times
# Mock IREEBenchmarkResult to return wanted benchmark times
with patch("libtuner.IREEBenchmarkResult.get_mean_time", new=mock_get_mean_time):
# Mock generate_display_MBR to return a fixed display string
with patch(
"libtuner.generate_display_MBR",
side_effect=lambda *args, **kwargs: "display_str",
):
# Mock handle_error to avoid actual logging during tests
with patch("libtuner.handle_error") as mock_handle_error:
# Call the function
result = libtuner.parse_model_benchmark_results(
candidate_trackers, candidate_results, baseline_results
)
# Mock handle_error to avoid actual logging during tests
with patch("libtuner.handle_error") as mock_handle_error:
dump_list = libtuner.parse_model_benchmark_results(
candidate_trackers, candidate_results, baseline_results
)

# Verify interactions with candidate_trackers
assert tracker1.model_benchmark_time == 1.23
assert tracker1.model_benchmark_device_id == "device1"
assert tracker1.baseline_benchmark_time == 0.98
assert tracker1.calibrated_benchmark_diff == pytest.approx(
(1.23 - 0.98) / 0.98, rel=1e-6
)
# Verify interactions with candidate_trackers
assert tracker1.model_benchmark_time == 1.23
assert tracker1.model_benchmark_device_id == "device1"
assert tracker1.baseline_benchmark_time == 0.98
assert tracker1.calibrated_benchmark_diff == pytest.approx(
(1.23 - 0.98) / 0.98, rel=1e-6
)

assert tracker2.model_benchmark_time == 4.56
assert tracker2.model_benchmark_device_id == "device2"
assert tracker2.baseline_benchmark_time == 4.13
assert tracker2.calibrated_benchmark_diff == pytest.approx(
(4.56 - 4.13) / 4.13, rel=1e-6
)
assert tracker2.model_benchmark_time == 4.56
assert tracker2.model_benchmark_device_id == "device2"
assert tracker2.baseline_benchmark_time == 4.13
assert tracker2.calibrated_benchmark_diff == pytest.approx(
(4.56 - 4.13) / 4.13, rel=1e-6
)

assert result == [
"display_str",
"display_str",
"display_str",
"display_str",
] # Adjust this based on the actual expected result
assert len(result) == 4 # Update based on expected result length
assert tracker3.model_benchmark_time == 3.38
assert tracker3.model_benchmark_device_id == "device3"

assert dump_list == [
"Benchmarking: /path/to/baseline.vmfb on device device1: 0.98\n" "\n",
"Benchmarking: /path/to/model_1.vmfb on device device1: 1.23 (+25.510%)\n"
"\n",
"Benchmarking: /path/to/baseline.vmfb on device device2: 4.13\n" "\n",
"Benchmarking: /path/to/model_2.vmfb on device device2: 4.56 (+10.412%)\n"
"\n",
"Benchmarking: /path/to/model_3.vmfb on device device3: 3.38\n" "\n",
"Benchmarking result of /path/to/baseline.vmfb on device device3 is incomplete\n",
]

# Verify handle_error was called correctly
mock_handle_error.assert_called_once_with(
condition=True,
msg="Benchmarking result of /path/to/baseline.vmfb on device device3 is incomplete",
level=libtuner.logging.WARNING,
)
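
Editorial aside (not part of the diff): the test replaces IREEBenchmarkResult.get_mean_time by passing a plain function via patch(..., new=...), so the replacement is bound like an ordinary method and receives self (hence self.result_str above). A minimal standalone illustration of that pattern, using hypothetical names:

    from unittest.mock import patch

    class Sensor:  # hypothetical stand-in for IREEBenchmarkResult
        def __init__(self, raw: str):
            self.raw = raw

        def read(self) -> float:
            raise RuntimeError("real parsing unavailable in this sketch")

    def fake_read(self) -> float:  # plain function; becomes the bound method under patch
        return float(self.raw)

    # patch.object(...) here is equivalent to the string-target form used in the test
    with patch.object(Sensor, "read", new=fake_read):
        assert Sensor("3.38").read() == 3.38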


def test_extract_driver_names():
