From a36089cd9e1497b3167248b11b58b066976e7ba7 Mon Sep 17 00:00:00 2001 From: Ondrej Lichtner Date: Fri, 24 Nov 2023 16:36:27 +0100 Subject: [PATCH] BaselineEvaluator: refactor to unify baseline evaluators Now that MeasurementResults export a `metrics` property, Baseline evaluation can be refactored and unified to work with almost any MeasurementResult in a much simpler manner. This commit does just that, additionally there are two additional related changes: * MetricComparison class is extended to contain more information so that we can unify it and remove the use of a "comparison dictionary" * BaselineEvaluationResult class is added to store all of the grouped MetricComparisons and to generate a nice description for them. This does change the Result description format as it was previously more customized to the individual MeasurementResults, but this IMO actually is more accurate although maybe a little less nice to look at. Finally this gives us a bunch more power to actually machine post process the baseline evaluations so I've also removed the "good direction -> improvement warning" override which IMO should be done as a post processing step and not directly as a part of the evaluation method. The unification of the BaselineEvaluator means that we can completely remove: * BaselineFlowAverageEvaluator * BaselineRDMABandwidthAverageEvaluator * BaselineTcRunAverageEvaluator We need to keep the BaselineCPUAverageEvaluator as this does still override some methods wrt. how filtering and result grouping works for CPU Measurements. Signed-off-by: Ondrej Lichtner --- .../Evaluators/BaselineCPUAverageEvaluator.py | 90 +--------- .../Perf/Evaluators/BaselineEvaluator.py | 155 ++++++++++++------ .../BaselineFlowAverageEvaluator.py | 111 ------------- .../BaselineRDMABandwidthAverageEvaluator.py | 64 -------- .../BaselineTcRunAverageEvaluator.py | 67 -------- lnst/RecipeCommon/Perf/Evaluators/__init__.py | 6 +- 6 files changed, 116 insertions(+), 377 deletions(-) delete mode 100644 lnst/RecipeCommon/Perf/Evaluators/BaselineFlowAverageEvaluator.py delete mode 100644 lnst/RecipeCommon/Perf/Evaluators/BaselineRDMABandwidthAverageEvaluator.py delete mode 100644 lnst/RecipeCommon/Perf/Evaluators/BaselineTcRunAverageEvaluator.py diff --git a/lnst/RecipeCommon/Perf/Evaluators/BaselineCPUAverageEvaluator.py b/lnst/RecipeCommon/Perf/Evaluators/BaselineCPUAverageEvaluator.py index 520e71c61..8232fca60 100644 --- a/lnst/RecipeCommon/Perf/Evaluators/BaselineCPUAverageEvaluator.py +++ b/lnst/RecipeCommon/Perf/Evaluators/BaselineCPUAverageEvaluator.py @@ -1,25 +1,23 @@ from __future__ import division -from typing import List, Dict +from typing import List, Dict, Optional from lnst.Controller.Recipe import BaseRecipe -from lnst.Controller.RecipeResults import ResultType from lnst.RecipeCommon.Perf.Recipe import RecipeConf as PerfRecipeConf -from lnst.RecipeCommon.Perf.Results import result_averages_difference from lnst.RecipeCommon.Perf.Measurements.Results import ( BaseMeasurementResults as PerfMeasurementResults, ) -from lnst.RecipeCommon.Perf.Evaluators.BaselineEvaluator import ( - BaselineEvaluator, MetricComparison -) +from lnst.RecipeCommon.Perf.Evaluators.BaselineEvaluator import BaselineEvaluator class BaselineCPUAverageEvaluator(BaselineEvaluator): def __init__( - self, thresholds: dict, evaluation_filter: Dict[str, str] = None + self, + metrics_to_evaluate: Optional[List[str]] = None, + evaluation_filter: Optional[Dict[str, str]] = None, ): - self._thresholds = thresholds + super().__init__(metrics_to_evaluate) self._evaluation_filter = evaluation_filter def filter_results( @@ -57,79 +55,3 @@ def _divide_results_by_host(self, results: List[PerfMeasurementResults]): results_by_host[result.host] = [] results_by_host[result.host].append(result) return results_by_host - - def describe_group_results( - self, - recipe: BaseRecipe, - recipe_conf: PerfRecipeConf, - results: List[PerfMeasurementResults], - ) -> List[str]: - return [ - "CPU Baseline average evaluation for Host {hostid}:".format( - hostid=results[0].host.hostid - ) - ] - - def compare_result_with_baseline( - self, - recipe: BaseRecipe, - recipe_conf: PerfRecipeConf, - result: PerfMeasurementResults, - baseline: PerfMeasurementResults, - result_index: int = 0 - ) -> List[MetricComparison]: - comparison = ResultType.FAIL - - metric_name = f"{result_index}_utilization" - - if baseline is None: - return [ - MetricComparison( - metric_name=metric_name, - result=ResultType.FAIL, - text=f"FAIL: CPU {result.cpu}: no baseline found", - ) - ] - elif (threshold := self._thresholds.get(metric_name, None)) is not None: - try: - difference = result_averages_difference( - result.utilization, baseline.utilization - ) - - text = ( - "CPU {cpuid}: {metric_name} {diff:.2f}% {direction} than baseline. " - "Allowed difference: {threshold}%".format( - cpuid=result.cpu, - metric_name=metric_name, - diff=abs(difference), - direction="higher" if difference >= 0 else "lower", - threshold=threshold - ) - ) - - if difference < -threshold: - comparison = ResultType.WARNING - text = "IMPROVEMENT: " + text - elif difference <= threshold: - comparison = ResultType.PASS - text = "PASS: " + text - else: - comparison = ResultType.FAIL - text = "FAIL: " + text - except ZeroDivisionError: - text = f"CPU {result.cpu}: {metric_name} zero division by baseline" - return [ - MetricComparison( - metric_name=metric_name, - result=comparison, - text=text, - ) - ] - else: - return [ - MetricComparison( - metric_name=metric_name, - result=ResultType.FAIL, - text=f"FAIL: CPU {result.cpu}: {metric_name} no threshold found", - ) - ] diff --git a/lnst/RecipeCommon/Perf/Evaluators/BaselineEvaluator.py b/lnst/RecipeCommon/Perf/Evaluators/BaselineEvaluator.py index 6c072b55f..4504976a1 100644 --- a/lnst/RecipeCommon/Perf/Evaluators/BaselineEvaluator.py +++ b/lnst/RecipeCommon/Perf/Evaluators/BaselineEvaluator.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Optional from functools import reduce from dataclasses import dataclass @@ -6,23 +6,58 @@ from lnst.Controller.RecipeResults import ResultType, Result from lnst.RecipeCommon.BaseResultEvaluator import BaseResultEvaluator from lnst.RecipeCommon.Perf.Recipe import RecipeConf as PerfRecipeConf +from lnst.RecipeCommon.Perf.Results import result_averages_difference from lnst.RecipeCommon.Perf.Measurements.Results import ( BaseMeasurementResults as PerfMeasurementResults, ) -class BaselineEvaluationResult(Result): - pass - - @dataclass class MetricComparison: + measurement_type: str + current_result: PerfMeasurementResults + baseline_result: Optional[PerfMeasurementResults] + threshold: float metric_name: str - result: ResultType + difference: float + comparison_result: ResultType text: str +class BaselineEvaluationResult(Result): + def __init__( + self, comparisons: list[MetricComparison], recipe_conf: PerfRecipeConf + ): + super().__init__(ResultType.PASS) + self.comparisons = comparisons + self.recipe_conf = recipe_conf + + @property + def result(self) -> ResultType: + return reduce( + ResultType.max_severity, + [comparison.comparison_result for comparison in self.comparisons], + ResultType.PASS, + ) + + @property + def description(self) -> str: + res = [] + current_result = None + for comparison in self.comparisons: + if comparison.current_result != current_result: + res.append(comparison.current_result.describe()) + current_result = comparison.current_result + res.append(f"{comparison.comparison_result}: {comparison.text}") + return "\n".join( + ["Baseline evaluation of"] + res + ) + + class BaselineEvaluator(BaseResultEvaluator): + def __init__(self, metrics_to_evaluate: Optional[List[str]] = None): + self._metrics_to_evaluate = metrics_to_evaluate + def evaluate_results( self, recipe: BaseRecipe, @@ -59,53 +94,22 @@ def evaluate_group_results( ): cumulative_result = ResultType.PASS comparisons = [] - result_text = self.describe_group_results(recipe, recipe_conf, results) baselines = self.get_baselines(recipe, recipe_conf, results) - result_index = len(recipe.current_run.results) - for i, (result, baseline) in enumerate(zip(results, baselines)): - metric_comparisons = self.compare_result_with_baseline( - recipe, recipe_conf, result, baseline, result_index - ) - cumulative_result = reduce( - ResultType.max_severity, - [metric.result for metric in metric_comparisons], - cumulative_result, - ) - result_text.extend( - [metric.text for metric in metric_comparisons] - ) + for result, baseline in zip(results, baselines): comparisons.extend( - [ - { - "measurement_type": result.measurement.__class__.__name__, - "current_result": result, - "baseline_result": baseline, - "comparison_result": metric.result, - "metric_name": metric.metric_name, - "text": metric.text, - "recipe_conf": recipe_conf, - } - for metric in metric_comparisons - ] + self.compare_result_with_baseline( + recipe, recipe_conf, result, baseline + ) ) recipe.add_custom_result( BaselineEvaluationResult( - cumulative_result, - "\n".join(result_text), - data={"comparisons": comparisons}, + comparisons=comparisons, + recipe_conf=recipe_conf, ) ) - def describe_group_results( - self, - recipe: BaseRecipe, - recipe_conf: PerfRecipeConf, - results: List[PerfMeasurementResults], - ) -> List[str]: - return [] - def get_baselines( self, recipe: BaseRecipe, @@ -121,7 +125,14 @@ def get_baseline( recipe: BaseRecipe, recipe_conf: PerfRecipeConf, result: PerfMeasurementResults, - ) -> PerfMeasurementResults: + ) -> Optional[PerfMeasurementResults]: + return None + + def get_threshold( + self, + baseline: PerfMeasurementResults, + metric_name: str, + ) -> Optional[float]: return None def compare_result_with_baseline( @@ -130,6 +141,58 @@ def compare_result_with_baseline( recipe_conf: PerfRecipeConf, result: PerfMeasurementResults, baseline: PerfMeasurementResults, - result_index: int = 0, ) -> List[MetricComparison]: - raise NotImplementedError("Result to baseline metric comparison not implemented") + comparisons = [] + + if self._metrics_to_evaluate: + metrics_to_evaluate = [ + i for i in result.metrics if i in self._metrics_to_evaluate + ] + else: + metrics_to_evaluate = result.metrics + + for metric in metrics_to_evaluate: + comparisons.append( + self.compare_metrics_with_threshold( + result=result, + baseline=baseline, + metric_name=metric, + ) + ) + return comparisons + + def compare_metrics_with_threshold(self, result, baseline, metric_name): + threshold = None + diff = None + + if not baseline: + comparison_result = ResultType.FAIL + text = "No baseline found" + elif (threshold := self.get_threshold(baseline, metric_name)) is None: + comparison_result = ResultType.FAIL + text = "No threshold found" + else: + diff = result_averages_difference( + getattr(result, metric_name), + getattr(baseline, metric_name), + ) + direction = "higher" if diff >= 0 else "lower" + + comparison_result = ( + ResultType.PASS if abs(diff) <= threshold else ResultType.FAIL + ) + text = ( + f"New {metric_name} average is {abs(diff):.2f}% {direction} from the baseline. " + f"Allowed difference: {threshold}%" + ) + + return MetricComparison( + measurement_type=result.measurement.__class__.__name__, + current_result=result, + baseline_result=baseline, + threshold=threshold, + metric_name=metric_name, + difference=diff, + comparison_result=comparison_result, + text=text, + ) diff --git a/lnst/RecipeCommon/Perf/Evaluators/BaselineFlowAverageEvaluator.py b/lnst/RecipeCommon/Perf/Evaluators/BaselineFlowAverageEvaluator.py deleted file mode 100644 index 193ab56c8..000000000 --- a/lnst/RecipeCommon/Perf/Evaluators/BaselineFlowAverageEvaluator.py +++ /dev/null @@ -1,111 +0,0 @@ -from __future__ import division -from typing import List - -from lnst.Controller.Recipe import BaseRecipe -from lnst.Controller.RecipeResults import ResultType - -from lnst.RecipeCommon.Perf.Recipe import RecipeConf as PerfRecipeConf -from lnst.RecipeCommon.Perf.Results import result_averages_difference -from lnst.RecipeCommon.Perf.Results import SequentialPerfResult -from lnst.RecipeCommon.Perf.Measurements.Results import ( - BaseMeasurementResults as PerfMeasurementResults, -) -from lnst.RecipeCommon.Perf.Evaluators.BaselineEvaluator import ( - BaselineEvaluator, MetricComparison -) - - -class BaselineFlowAverageEvaluator(BaselineEvaluator): - def __init__( - self, thresholds: dict, metrics_to_evaluate: List[str] = None - ): - self._thresholds = thresholds - - if metrics_to_evaluate is not None: - self._metrics_to_evaluate = metrics_to_evaluate - else: - self._metrics_to_evaluate = [ - "generator_results", - "generator_cpu_stats", - "receiver_results", - "receiver_cpu_stats", - ] - - def describe_group_results( - self, - recipe: BaseRecipe, - recipe_conf: PerfRecipeConf, - results: List[PerfMeasurementResults], - ) -> List[str]: - result = results[0] - return [ - "Baseline average evaluation of flow:", - "{}".format(result.flow) - ] - - def compare_result_with_baseline( - self, - recipe: BaseRecipe, - recipe_conf: PerfRecipeConf, - result: PerfMeasurementResults, - baseline: PerfMeasurementResults, - result_index: int = 0, - ) -> List[MetricComparison]: - metric_comparisons = [] - for i in self._metrics_to_evaluate: - metric = f"{result_index}_{i}" - if baseline is None: - comparison_result = ResultType.FAIL - text = f"FAIL: Metric {metric} baseline not found for this flow" - elif (threshold := self._thresholds.get(metric, None)) is not None: - comparison_result, text = self._average_diff_comparison( - name=metric, - target=getattr(result, i), - baseline=getattr(baseline, i), - threshold=threshold - ) - else: - comparison_result = ResultType.FAIL - text = f"FAIL: Metric {metric} threshold not found" - - metric_comparisons.append( - MetricComparison( - metric_name=metric, - result=comparison_result, - text=text, - ) - ) - return metric_comparisons - - def _average_diff_comparison( - self, - name: str, - target: SequentialPerfResult, - baseline: SequentialPerfResult, - threshold: int - ): - difference = result_averages_difference(target, baseline) - result_text = "New {name} average is {diff:.2f}% {direction} from the baseline. " \ - "Allowed difference: {threshold}%".format( - name=name, - diff=abs(difference), - direction="higher" if difference >= 0 else "lower", - threshold=threshold - ) - - cpu = "_cpu_" in name - - # ( flow metrics ) or ( cpu metrics ) - if (not cpu and difference > threshold) or (cpu and difference < -threshold): - comparison = ResultType.WARNING - elif (not cpu and difference >= -threshold) or (cpu and difference <= threshold): - comparison = ResultType.PASS - else: - comparison = ResultType.FAIL - - if comparison == ResultType.WARNING: - result_text = f"IMPROVEMENT: {result_text}" - else: - result_text = f"{comparison}: {result_text}" - - return comparison, result_text diff --git a/lnst/RecipeCommon/Perf/Evaluators/BaselineRDMABandwidthAverageEvaluator.py b/lnst/RecipeCommon/Perf/Evaluators/BaselineRDMABandwidthAverageEvaluator.py deleted file mode 100644 index 81e2b3b56..000000000 --- a/lnst/RecipeCommon/Perf/Evaluators/BaselineRDMABandwidthAverageEvaluator.py +++ /dev/null @@ -1,64 +0,0 @@ -from lnst.Controller.Recipe import BaseRecipe -from lnst.Controller.RecipeResults import ResultType -from lnst.RecipeCommon.Perf.Evaluators.BaselineEvaluator import BaselineEvaluator, MetricComparison -from lnst.RecipeCommon.Perf.Measurements.Results import RDMABandwidthMeasurementResults - - -class BaselineRDMABandwidthAverageEvaluator(BaselineEvaluator): - - def __init__(self, thresholds: dict): - self._thresholds = thresholds - - def compare_result_with_baseline( - self, - recipe: BaseRecipe, - recipe_conf: "EnrtConfiguration", - result: RDMABandwidthMeasurementResults, - baseline: RDMABandwidthMeasurementResults, - result_index: int = 0, - ) -> list[MetricComparison]: - metric_name = f"{result_index}_bandwidth" - - if baseline is None: - return [ - MetricComparison( - metric_name=metric_name, - result=ResultType.FAIL, - text=f"{self.__class__.__name__} FAIL:\n Metric {metric_name} baseline not found", - ) - ] - elif (threshold := self._thresholds.get(metric_name, None)) is None: - return [ - MetricComparison( - metric_name=metric_name, - result=ResultType.FAIL, - text=f"{self.__class__.__name__}\nFAIL: Metric {metric_name} threshold not found", - ) - ] - - difference = ((result.bandwidth.average / baseline.bandwidth.average) * 100) - 100 - direction = "higher" if difference >= 0 else "lower" - text = [ - f"{self.__class__.__name__} of {metric_name}", - f"Baseline: {baseline.bandwidth.average} MiB/s", - f"Measured: {result.bandwidth.average} MiB/s", - f"{abs(difference):2f}% {direction} than baseline", - f"Allowed difference: {threshold}%", - ] - if difference > threshold: - comparison = ResultType.WARNING - text[0] = f"IMPROVEMENT: {text[0]}" - elif difference >= -threshold: - comparison = ResultType.PASS - text[0] = f"PASS: {text[0]}" - else: - comparison = ResultType.FAIL - text[0] = f"FAIL: {text[0]}" - - return [ - MetricComparison( - metric_name=metric_name, - result=comparison, - text="\n".join(text) - ) - ] diff --git a/lnst/RecipeCommon/Perf/Evaluators/BaselineTcRunAverageEvaluator.py b/lnst/RecipeCommon/Perf/Evaluators/BaselineTcRunAverageEvaluator.py deleted file mode 100644 index 01c30a3fe..000000000 --- a/lnst/RecipeCommon/Perf/Evaluators/BaselineTcRunAverageEvaluator.py +++ /dev/null @@ -1,67 +0,0 @@ -from lnst.Controller.RecipeResults import ResultType -from lnst.RecipeCommon.Perf.Evaluators.BaselineEvaluator import BaselineEvaluator, MetricComparison -from lnst.RecipeCommon.Perf.Measurements.Results import TcRunMeasurementResults -from lnst.Recipes.ENRT.TrafficControlRecipe import TrafficControlRecipe, TcRecipeConfiguration - - -class BaselineTcRunAverageEvaluator(BaselineEvaluator): - - def __init__(self, thresholds: dict): - self._thresholds = thresholds - - def compare_result_with_baseline( - self, - recipe: TrafficControlRecipe, - recipe_conf: TcRecipeConfiguration, - result: TcRunMeasurementResults, - baseline: TcRunMeasurementResults, - result_index: int = 0, - ) -> list[MetricComparison]: - - metric_name = f"{result_index}_rule_install_rate" - - if baseline is None: - return [ - MetricComparison( - metric_name=metric_name, - result=ResultType.FAIL, - text=f"{self.__class__.__name__} FAIL:\n {result.device.name} {metric_name} baseline not found", - ) - ] - elif (threshold := self._thresholds.get(metric_name, None)) is not None: - difference = ((result.rule_install_rate.average / baseline.rule_install_rate.average) * 100) - 100 - direction = "higher" if difference >= 0 else "lower" - text = [ - f"{self.__class__.__name__} of tc run with {metric_name}", - f"{result.description}", - f"Baseline: {baseline.rule_install_rate.average} rules/sec", - f"Measured: {result.rule_install_rate.average} rules/sec", - f"{abs(difference):2f}% {direction} than baseline ", - f"Allowed difference: {threshold}% ", - ] - if difference > threshold: - comparison = ResultType.WARNING - text[0] = f"IMPROVEMENT: {text[0]}" - elif difference >= -threshold: - comparison = ResultType.PASS - text[0] = f"PASS: {text[0]}" - else: - comparison = ResultType.FAIL - text[0] = f"FAIL: {text[0]}" - - return [ - MetricComparison( - metric_name=metric_name, - result=comparison, - text="\n".join(text) - ) - ] - else: - return [ - MetricComparison( - metric_name=metric_name, - result=ResultType.FAIL, - text=f"{self.__class__.__name__}\nFAIL: {result.device.name} {metric_name} no threshold found", - ) - ] - diff --git a/lnst/RecipeCommon/Perf/Evaluators/__init__.py b/lnst/RecipeCommon/Perf/Evaluators/__init__.py index 15a4fd7f4..1bac79501 100644 --- a/lnst/RecipeCommon/Perf/Evaluators/__init__.py +++ b/lnst/RecipeCommon/Perf/Evaluators/__init__.py @@ -1,8 +1,4 @@ from lnst.RecipeCommon.Perf.Evaluators.NonzeroFlowEvaluator import NonzeroFlowEvaluator -from lnst.RecipeCommon.Perf.Evaluators.BaselineFlowAverageEvaluator import BaselineFlowAverageEvaluator - +from lnst.RecipeCommon.Perf.Evaluators.BaselineEvaluator import BaselineEvaluator from lnst.RecipeCommon.Perf.Evaluators.BaselineCPUAverageEvaluator import BaselineCPUAverageEvaluator - from lnst.RecipeCommon.Perf.Evaluators.MaxTimeTakenEvaluator import MaxTimeTakenEvaluator -from lnst.RecipeCommon.Perf.Evaluators.BaselineTcRunAverageEvaluator import BaselineTcRunAverageEvaluator -from lnst.RecipeCommon.Perf.Evaluators.BaselineRDMABandwidthAverageEvaluator import BaselineRDMABandwidthAverageEvaluator