Refactor measurements evaluations #351

Merged
7 changes: 3 additions & 4 deletions lnst/Common/Utils.py
@@ -21,7 +21,7 @@
import errno
import ast
import collections
import math
import statistics
import itertools
from collections.abc import Iterable, Callable
from contextlib import AbstractContextManager
@@ -287,10 +287,9 @@ def dict_to_dot(original_dict, prefix="")
return return_list

def std_deviation(values):
if len(values) <= 0:
if len(values) <= 1:
return 0.0
avg = sum(values) / float(len(values))
return math.sqrt(sum([(float(i) - avg)**2 for i in values])/len(values))
return statistics.stdev(values)

def deprecated(func):
"""
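Side note on the `std_deviation()` change (not part of the diff itself): the helper now delegates to `statistics.stdev`, which computes the *sample* standard deviation (N-1 denominator) rather than the *population* standard deviation the old code computed, and the guard returns `0.0` for fewer than two values because `stdev` needs at least two data points. A minimal sketch of the behavioural difference:

```python
# Sketch only: compares the old manual formula with the new statistics.stdev call.
import math
import statistics

values = [1.0, 2.0, 3.0, 4.0]

avg = sum(values) / len(values)
# old approach: population standard deviation (divide by N)
population_sd = math.sqrt(sum((v - avg) ** 2 for v in values) / len(values))
# new approach: sample standard deviation (divide by N-1)
sample_sd = statistics.stdev(values)

print(round(population_sd, 4), round(sample_sd, 4))  # 1.118 vs 1.291
```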
90 changes: 6 additions & 84 deletions lnst/RecipeCommon/Perf/Evaluators/BaselineCPUAverageEvaluator.py
@@ -1,25 +1,23 @@
from __future__ import division
from typing import List, Dict
from typing import List, Dict, Optional


from lnst.Controller.Recipe import BaseRecipe
from lnst.Controller.RecipeResults import ResultType

from lnst.RecipeCommon.Perf.Recipe import RecipeConf as PerfRecipeConf
from lnst.RecipeCommon.Perf.Results import result_averages_difference
from lnst.RecipeCommon.Perf.Measurements.Results import (
BaseMeasurementResults as PerfMeasurementResults,
)
from lnst.RecipeCommon.Perf.Evaluators.BaselineEvaluator import (
BaselineEvaluator, MetricComparison
)
from lnst.RecipeCommon.Perf.Evaluators.BaselineEvaluator import BaselineEvaluator


class BaselineCPUAverageEvaluator(BaselineEvaluator):
def __init__(
self, thresholds: dict, evaluation_filter: Dict[str, str] = None
self,
metrics_to_evaluate: Optional[List[str]] = None,
evaluation_filter: Optional[Dict[str, str]] = None,
):
self._thresholds = thresholds
super().__init__(metrics_to_evaluate)
self._evaluation_filter = evaluation_filter

def filter_results(
@@ -57,79 +55,3 @@ def _divide_results_by_host(self, results: List[PerfMeasurementResults]):
results_by_host[result.host] = []
results_by_host[result.host].append(result)
return results_by_host

def describe_group_results(
self,
recipe: BaseRecipe,
recipe_conf: PerfRecipeConf,
results: List[PerfMeasurementResults],
) -> List[str]:
return [
"CPU Baseline average evaluation for Host {hostid}:".format(
hostid=results[0].host.hostid
)
]

def compare_result_with_baseline(
self,
recipe: BaseRecipe,
recipe_conf: PerfRecipeConf,
result: PerfMeasurementResults,
baseline: PerfMeasurementResults,
result_index: int = 0
) -> List[MetricComparison]:
comparison = ResultType.FAIL

metric_name = f"{result_index}_utilization"

if baseline is None:
return [
MetricComparison(
metric_name=metric_name,
result=ResultType.FAIL,
text=f"FAIL: CPU {result.cpu}: no baseline found",
)
]
elif (threshold := self._thresholds.get(metric_name, None)) is not None:
try:
difference = result_averages_difference(
result.utilization, baseline.utilization
)

text = (
"CPU {cpuid}: {metric_name} {diff:.2f}% {direction} than baseline. "
"Allowed difference: {threshold}%".format(
cpuid=result.cpu,
metric_name=metric_name,
diff=abs(difference),
direction="higher" if difference >= 0 else "lower",
threshold=threshold
)
)

if difference < -threshold:
comparison = ResultType.WARNING
text = "IMPROVEMENT: " + text
elif difference <= threshold:
comparison = ResultType.PASS
text = "PASS: " + text
else:
comparison = ResultType.FAIL
text = "FAIL: " + text
except ZeroDivisionError:
text = f"CPU {result.cpu}: {metric_name} zero division by baseline"
return [
MetricComparison(
metric_name=metric_name,
result=comparison,
text=text,
)
]
else:
return [
MetricComparison(
metric_name=metric_name,
result=ResultType.FAIL,
text=f"FAIL: CPU {result.cpu}: {metric_name} no threshold found",
)
]
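With this refactor `BaselineCPUAverageEvaluator` no longer carries its own threshold table or comparison logic; it only narrows which metrics and results get evaluated and delegates the per-metric comparison to the base class. A hypothetical construction under the new signature (the `"utilization"` metric name is taken from the attribute used in the removed code above and is illustrative only):

```python
# Illustrative usage of the new constructor; thresholds are no longer passed
# here -- they now come from the baseline via BaselineEvaluator.get_threshold().
evaluator = BaselineCPUAverageEvaluator(metrics_to_evaluate=["utilization"])
```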
155 changes: 109 additions & 46 deletions lnst/RecipeCommon/Perf/Evaluators/BaselineEvaluator.py
@@ -1,28 +1,63 @@
from typing import List
from typing import List, Optional
from functools import reduce
from dataclasses import dataclass

from lnst.Controller.Recipe import BaseRecipe
from lnst.Controller.RecipeResults import ResultType, Result
from lnst.RecipeCommon.BaseResultEvaluator import BaseResultEvaluator
from lnst.RecipeCommon.Perf.Recipe import RecipeConf as PerfRecipeConf
from lnst.RecipeCommon.Perf.Results import result_averages_difference
from lnst.RecipeCommon.Perf.Measurements.Results import (
BaseMeasurementResults as PerfMeasurementResults,
)


class BaselineEvaluationResult(Result):
pass


@dataclass
class MetricComparison:
measurement_type: str
current_result: PerfMeasurementResults
baseline_result: Optional[PerfMeasurementResults]
threshold: Optional[float]
metric_name: str
result: ResultType
difference: Optional[float]
comparison_result: ResultType
text: str


class BaselineEvaluationResult(Result):
def __init__(
self, comparisons: list[MetricComparison], recipe_conf: PerfRecipeConf
):
super().__init__(ResultType.PASS)
self.comparisons = comparisons
self.recipe_conf = recipe_conf

@property
def result(self) -> ResultType:
return reduce(
ResultType.max_severity,
[comparison.comparison_result for comparison in self.comparisons],
ResultType.PASS,
)

@property
def description(self) -> str:
res = []
current_result = None
for comparison in self.comparisons:
if comparison.current_result != current_result:
res.append(comparison.current_result.describe())
current_result = comparison.current_result
res.append(f"{comparison.comparison_result}: {comparison.text}")
return "\n".join(
["Baseline evaluation of"] + res
)


class BaselineEvaluator(BaseResultEvaluator):
def __init__(self, metrics_to_evaluate: Optional[List[str]] = None):
self._metrics_to_evaluate = metrics_to_evaluate

def evaluate_results(
self,
recipe: BaseRecipe,
@@ -59,53 +94,22 @@ def evaluate_group_results(
):
cumulative_result = ResultType.PASS
comparisons = []
result_text = self.describe_group_results(recipe, recipe_conf, results)

baselines = self.get_baselines(recipe, recipe_conf, results)
result_index = len(recipe.current_run.results)
for i, (result, baseline) in enumerate(zip(results, baselines)):
metric_comparisons = self.compare_result_with_baseline(
recipe, recipe_conf, result, baseline, result_index
)
cumulative_result = reduce(
ResultType.max_severity,
[metric.result for metric in metric_comparisons],
cumulative_result,
)
result_text.extend(
[metric.text for metric in metric_comparisons]
)
for result, baseline in zip(results, baselines):
comparisons.extend(
[
{
"measurement_type": result.measurement.__class__.__name__,
"current_result": result,
"baseline_result": baseline,
"comparison_result": metric.result,
"metric_name": metric.metric_name,
"text": metric.text,
"recipe_conf": recipe_conf,
}
for metric in metric_comparisons
]
self.compare_result_with_baseline(
recipe, recipe_conf, result, baseline
)
)

recipe.add_custom_result(
BaselineEvaluationResult(
cumulative_result,
"\n".join(result_text),
data={"comparisons": comparisons},
comparisons=comparisons,
recipe_conf=recipe_conf,
)
)

def describe_group_results(
self,
recipe: BaseRecipe,
recipe_conf: PerfRecipeConf,
results: List[PerfMeasurementResults],
) -> List[str]:
return []

def get_baselines(
self,
recipe: BaseRecipe,
@@ -121,7 +125,14 @@ def get_baseline(
recipe: BaseRecipe,
recipe_conf: PerfRecipeConf,
result: PerfMeasurementResults,
) -> PerfMeasurementResults:
) -> Optional[PerfMeasurementResults]:
return None

def get_threshold(
self,
baseline: PerfMeasurementResults,
metric_name: str,
) -> Optional[float]:
return None

def compare_result_with_baseline(
@@ -130,6 +141,58 @@ def compare_result_with_baseline(
recipe_conf: PerfRecipeConf,
result: PerfMeasurementResults,
baseline: PerfMeasurementResults,
result_index: int = 0,
) -> List[MetricComparison]:
raise NotImplementedError("Result to baseline metric comparison not implemented")
comparisons = []

if self._metrics_to_evaluate:
metrics_to_evaluate = [
i for i in result.metrics if i in self._metrics_to_evaluate
]
else:
metrics_to_evaluate = result.metrics

for metric in metrics_to_evaluate:
comparisons.append(
self.compare_metrics_with_threshold(
result=result,
baseline=baseline,
metric_name=metric,
)
)
return comparisons

def compare_metrics_with_threshold(self, result, baseline, metric_name):
threshold = None
diff = None

if not baseline:
comparison_result = ResultType.FAIL
text = "No baseline found"
elif (threshold := self.get_threshold(baseline, metric_name)) is None:
comparison_result = ResultType.FAIL
text = "No threshold found"
else:
diff = result_averages_difference(
getattr(result, metric_name),
getattr(baseline, metric_name),
)
direction = "higher" if diff >= 0 else "lower"

comparison_result = (
ResultType.PASS if abs(diff) <= threshold else ResultType.FAIL
)
text = (
f"New {metric_name} average is {abs(diff):.2f}% {direction} from the baseline. "
f"Allowed difference: {threshold}%"
)

return MetricComparison(
measurement_type=result.measurement.__class__.__name__,
current_result=result,
baseline_result=baseline,
threshold=threshold,
metric_name=metric_name,
difference=diff,
comparison_result=comparison_result,
text=text,
)
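For reference, after this change a concrete evaluator only needs to provide the two hooks; the inherited `compare_result_with_baseline()` iterates `result.metrics` (optionally filtered by `metrics_to_evaluate`) and compares each metric's average against the baseline using `result_averages_difference()`. A minimal sketch, assuming a simple in-memory baseline store and a flat example threshold:

```python
# Sketch only: the dict-based baseline store and the flat 5% threshold are
# assumptions for illustration, not part of this PR.
class ExampleEvaluator(BaselineEvaluator):
    def __init__(self, baselines: dict, metrics_to_evaluate=None):
        super().__init__(metrics_to_evaluate)
        self._baselines = baselines

    def get_baseline(self, recipe, recipe_conf, result):
        # hypothetical store: one baseline result per measurement class name
        return self._baselines.get(result.measurement.__class__.__name__)

    def get_threshold(self, baseline, metric_name):
        # assumed example policy: allow a flat 5% difference for every metric
        return 5.0
```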