Skip to content

Commit

Permalink
BaselineEvaluator: refactor to unify baseline evaluators
Browse files Browse the repository at this point in the history
Now that MeasurementResults export a `metrics` property, Baseline
evaluation can be refactored and unified to work with almost any
MeasurementResult in a much simpler manner.

This commit does just that, additionally there are two additional
related changes:
* MetricComparison class is extended to contain more information so that
  we can unify it and remove the use of a "comparison dictionary"
* BaselineEvaluationResult class is added to store all of the grouped
  MetricComparisons and to generate a nice description for them.

This does change the Result description format as it was previously more
customized to the individual MeasurementResults, but this IMO actually
is more accurate although maybe a little less nice to look at.

Finally this gives us a bunch more power to actually machine post
process the baseline evaluations so I've also removed the "good
direction -> improvement warning" override which IMO should be done as a
post processing step and not directly as a part of the evaluation
method.

The unification of the BaselineEvaluator means that we can completely
remove:
* BaselineFlowAverageEvaluator
* BaselineRDMABandwidthAverageEvaluator
* BaselineTcRunAverageEvaluator

We need to keep the BaselineCPUAverageEvaluator as this does still
override some methods wrt. how filtering and result grouping works for
CPU Measurements.

Signed-off-by: Ondrej Lichtner <[email protected]>
  • Loading branch information
olichtne committed Nov 27, 2023
1 parent bf278f7 commit 27cc1dd
Show file tree
Hide file tree
Showing 6 changed files with 115 additions and 377 deletions.
90 changes: 6 additions & 84 deletions lnst/RecipeCommon/Perf/Evaluators/BaselineCPUAverageEvaluator.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,23 @@
from __future__ import division
from typing import List, Dict
from typing import List, Dict, Optional


from lnst.Controller.Recipe import BaseRecipe
from lnst.Controller.RecipeResults import ResultType

from lnst.RecipeCommon.Perf.Recipe import RecipeConf as PerfRecipeConf
from lnst.RecipeCommon.Perf.Results import result_averages_difference
from lnst.RecipeCommon.Perf.Measurements.Results import (
BaseMeasurementResults as PerfMeasurementResults,
)
from lnst.RecipeCommon.Perf.Evaluators.BaselineEvaluator import (
BaselineEvaluator, MetricComparison
)
from lnst.RecipeCommon.Perf.Evaluators.BaselineEvaluator import BaselineEvaluator


class BaselineCPUAverageEvaluator(BaselineEvaluator):
def __init__(
self, thresholds: dict, evaluation_filter: Dict[str, str] = None
self,
metrics_to_evaluate: Optional[List[str]] = None,
evaluation_filter: Optional[Dict[str, str]] = None,
):
self._thresholds = thresholds
super().__init__(metrics_to_evaluate)
self._evaluation_filter = evaluation_filter

def filter_results(
Expand Down Expand Up @@ -57,79 +55,3 @@ def _divide_results_by_host(self, results: List[PerfMeasurementResults]):
results_by_host[result.host] = []
results_by_host[result.host].append(result)
return results_by_host

def describe_group_results(
self,
recipe: BaseRecipe,
recipe_conf: PerfRecipeConf,
results: List[PerfMeasurementResults],
) -> List[str]:
return [
"CPU Baseline average evaluation for Host {hostid}:".format(
hostid=results[0].host.hostid
)
]

def compare_result_with_baseline(
self,
recipe: BaseRecipe,
recipe_conf: PerfRecipeConf,
result: PerfMeasurementResults,
baseline: PerfMeasurementResults,
result_index: int = 0
) -> List[MetricComparison]:
comparison = ResultType.FAIL

metric_name = f"{result_index}_utilization"

if baseline is None:
return [
MetricComparison(
metric_name=metric_name,
result=ResultType.FAIL,
text=f"FAIL: CPU {result.cpu}: no baseline found",
)
]
elif (threshold := self._thresholds.get(metric_name, None)) is not None:
try:
difference = result_averages_difference(
result.utilization, baseline.utilization
)

text = (
"CPU {cpuid}: {metric_name} {diff:.2f}% {direction} than baseline. "
"Allowed difference: {threshold}%".format(
cpuid=result.cpu,
metric_name=metric_name,
diff=abs(difference),
direction="higher" if difference >= 0 else "lower",
threshold=threshold
)
)

if difference < -threshold:
comparison = ResultType.WARNING
text = "IMPROVEMENT: " + text
elif difference <= threshold:
comparison = ResultType.PASS
text = "PASS: " + text
else:
comparison = ResultType.FAIL
text = "FAIL: " + text
except ZeroDivisionError:
text = f"CPU {result.cpu}: {metric_name} zero division by baseline"
return [
MetricComparison(
metric_name=metric_name,
result=comparison,
text=text,
)
]
else:
return [
MetricComparison(
metric_name=metric_name,
result=ResultType.FAIL,
text=f"FAIL: CPU {result.cpu}: {metric_name} no threshold found",
)
]
155 changes: 109 additions & 46 deletions lnst/RecipeCommon/Perf/Evaluators/BaselineEvaluator.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,63 @@
from typing import List
from typing import List, Optional
from functools import reduce
from dataclasses import dataclass

from lnst.Controller.Recipe import BaseRecipe
from lnst.Controller.RecipeResults import ResultType, Result
from lnst.RecipeCommon.BaseResultEvaluator import BaseResultEvaluator
from lnst.RecipeCommon.Perf.Recipe import RecipeConf as PerfRecipeConf
from lnst.RecipeCommon.Perf.Results import result_averages_difference
from lnst.RecipeCommon.Perf.Measurements.Results import (
BaseMeasurementResults as PerfMeasurementResults,
)


class BaselineEvaluationResult(Result):
pass


@dataclass
class MetricComparison:
measurement_type: str
current_result: PerfMeasurementResults
baseline_result: Optional[PerfMeasurementResults]
threshold: float
metric_name: str
result: ResultType
difference: float
comparison_result: ResultType
text: str


class BaselineEvaluationResult(Result):
def __init__(
self, comparisons: list[MetricComparison], recipe_conf: PerfRecipeConf
):
super().__init__(ResultType.PASS)
self.comparisons = comparisons
self.recipe_conf = recipe_conf

@property
def result(self) -> ResultType:
return reduce(
ResultType.max_severity,
[comparison.comparison_result for comparison in self.comparisons],
ResultType.PASS,
)

@property
def description(self) -> str:
res = []
current_result = None
for comparison in self.comparisons:
if comparison.current_result != current_result:
res.append(comparison.current_result.describe())
current_result = comparison.current_result
res.append(comparison.text)
return "\n".join(
["Baseline evaluation of"] + res
)


class BaselineEvaluator(BaseResultEvaluator):
def __init__(self, metrics_to_evaluate: Optional[List[str]] = None):
self._metrics_to_evaluate = metrics_to_evaluate

def evaluate_results(
self,
recipe: BaseRecipe,
Expand Down Expand Up @@ -59,53 +94,22 @@ def evaluate_group_results(
):
cumulative_result = ResultType.PASS
comparisons = []
result_text = self.describe_group_results(recipe, recipe_conf, results)

baselines = self.get_baselines(recipe, recipe_conf, results)
result_index = len(recipe.current_run.results)
for i, (result, baseline) in enumerate(zip(results, baselines)):
metric_comparisons = self.compare_result_with_baseline(
recipe, recipe_conf, result, baseline, result_index
)
cumulative_result = reduce(
ResultType.max_severity,
[metric.result for metric in metric_comparisons],
cumulative_result,
)
result_text.extend(
[metric.text for metric in metric_comparisons]
)
for result, baseline in zip(results, baselines):
comparisons.extend(
[
{
"measurement_type": result.measurement.__class__.__name__,
"current_result": result,
"baseline_result": baseline,
"comparison_result": metric.result,
"metric_name": metric.metric_name,
"text": metric.text,
"recipe_conf": recipe_conf,
}
for metric in metric_comparisons
]
self.compare_result_with_baseline(
recipe, recipe_conf, result, baseline
)
)

recipe.add_custom_result(
BaselineEvaluationResult(
cumulative_result,
"\n".join(result_text),
data={"comparisons": comparisons},
comparisons=comparisons,
recipe_conf=recipe_conf,
)
)

def describe_group_results(
self,
recipe: BaseRecipe,
recipe_conf: PerfRecipeConf,
results: List[PerfMeasurementResults],
) -> List[str]:
return []

def get_baselines(
self,
recipe: BaseRecipe,
Expand All @@ -121,7 +125,14 @@ def get_baseline(
recipe: BaseRecipe,
recipe_conf: PerfRecipeConf,
result: PerfMeasurementResults,
) -> PerfMeasurementResults:
) -> Optional[PerfMeasurementResults]:
return None

def get_threshold(
self,
baseline: PerfMeasurementResults,
metric_name: str,
) -> Optional[float]:
return None

def compare_result_with_baseline(
Expand All @@ -130,6 +141,58 @@ def compare_result_with_baseline(
recipe_conf: PerfRecipeConf,
result: PerfMeasurementResults,
baseline: PerfMeasurementResults,
result_index: int = 0,
) -> List[MetricComparison]:
raise NotImplementedError("Result to baseline metric comparison not implemented")
comparisons = []

if self._metrics_to_evaluate:
metrics_to_evaluate = [
i for i in result.metrics if i in self._metrics_to_evaluate
]
else:
metrics_to_evaluate = result.metrics

for metric in metrics_to_evaluate:
comparisons.append(
self.compare_metrics_with_threshold(
result=result,
baseline=baseline,
metric_name=metric,
)
)
return comparisons

def compare_metrics_with_threshold(self, result, baseline, metric_name):
threshold = None
diff = None

if not baseline:
comparison_result = ResultType.FAIL
text = "No baseline found"
elif (threshold := self.get_threshold(baseline, metric_name)) is None:
comparison_result = ResultType.FAIL
text = "No threshold found"
else:
diff = result_averages_difference(
getattr(result, metric_name),
getattr(baseline, metric_name),
)
direction = "higher" if diff >= 0 else "lower"

comparison_result = (
ResultType.PASS if diff <= threshold else ResultType.FAIL
)
text = (
f"New {metric_name} average is {abs(diff):.2f}% {direction} from the baseline. "
f"Allowed difference: {threshold}%"
)

return MetricComparison(
measurement_type=result.measurement.__class__.__name__,
current_result=result,
baseline_result=baseline,
threshold=threshold,
metric_name=metric_name,
difference=diff,
comparison_result=comparison_result,
text=text,
)
Loading

0 comments on commit 27cc1dd

Please sign in to comment.