diff --git a/lnst/Common/Utils.py b/lnst/Common/Utils.py index 5c3a80ab6..044330187 100644 --- a/lnst/Common/Utils.py +++ b/lnst/Common/Utils.py @@ -21,7 +21,7 @@ import errno import ast import collections -import math +import statistics import itertools from collections.abc import Iterable, Callable from contextlib import AbstractContextManager @@ -287,10 +287,9 @@ def dict_to_dot(original_dict, prefix=""): return return_list def std_deviation(values): - if len(values) <= 0: + if len(values) <= 1: return 0.0 - avg = sum(values) / float(len(values)) - return math.sqrt(sum([(float(i) - avg)**2 for i in values])/len(values)) + return statistics.stdev(values) def deprecated(func): """ diff --git a/lnst/RecipeCommon/Perf/Evaluators/BaselineCPUAverageEvaluator.py b/lnst/RecipeCommon/Perf/Evaluators/BaselineCPUAverageEvaluator.py index 520e71c61..8232fca60 100644 --- a/lnst/RecipeCommon/Perf/Evaluators/BaselineCPUAverageEvaluator.py +++ b/lnst/RecipeCommon/Perf/Evaluators/BaselineCPUAverageEvaluator.py @@ -1,25 +1,23 @@ from __future__ import division -from typing import List, Dict +from typing import List, Dict, Optional from lnst.Controller.Recipe import BaseRecipe -from lnst.Controller.RecipeResults import ResultType from lnst.RecipeCommon.Perf.Recipe import RecipeConf as PerfRecipeConf -from lnst.RecipeCommon.Perf.Results import result_averages_difference from lnst.RecipeCommon.Perf.Measurements.Results import ( BaseMeasurementResults as PerfMeasurementResults, ) -from lnst.RecipeCommon.Perf.Evaluators.BaselineEvaluator import ( - BaselineEvaluator, MetricComparison -) +from lnst.RecipeCommon.Perf.Evaluators.BaselineEvaluator import BaselineEvaluator class BaselineCPUAverageEvaluator(BaselineEvaluator): def __init__( - self, thresholds: dict, evaluation_filter: Dict[str, str] = None + self, + metrics_to_evaluate: Optional[List[str]] = None, + evaluation_filter: Optional[Dict[str, str]] = None, ): - self._thresholds = thresholds + super().__init__(metrics_to_evaluate) self._evaluation_filter = evaluation_filter def filter_results( @@ -57,79 +55,3 @@ def _divide_results_by_host(self, results: List[PerfMeasurementResults]): results_by_host[result.host] = [] results_by_host[result.host].append(result) return results_by_host - - def describe_group_results( - self, - recipe: BaseRecipe, - recipe_conf: PerfRecipeConf, - results: List[PerfMeasurementResults], - ) -> List[str]: - return [ - "CPU Baseline average evaluation for Host {hostid}:".format( - hostid=results[0].host.hostid - ) - ] - - def compare_result_with_baseline( - self, - recipe: BaseRecipe, - recipe_conf: PerfRecipeConf, - result: PerfMeasurementResults, - baseline: PerfMeasurementResults, - result_index: int = 0 - ) -> List[MetricComparison]: - comparison = ResultType.FAIL - - metric_name = f"{result_index}_utilization" - - if baseline is None: - return [ - MetricComparison( - metric_name=metric_name, - result=ResultType.FAIL, - text=f"FAIL: CPU {result.cpu}: no baseline found", - ) - ] - elif (threshold := self._thresholds.get(metric_name, None)) is not None: - try: - difference = result_averages_difference( - result.utilization, baseline.utilization - ) - - text = ( - "CPU {cpuid}: {metric_name} {diff:.2f}% {direction} than baseline. 
" - "Allowed difference: {threshold}%".format( - cpuid=result.cpu, - metric_name=metric_name, - diff=abs(difference), - direction="higher" if difference >= 0 else "lower", - threshold=threshold - ) - ) - - if difference < -threshold: - comparison = ResultType.WARNING - text = "IMPROVEMENT: " + text - elif difference <= threshold: - comparison = ResultType.PASS - text = "PASS: " + text - else: - comparison = ResultType.FAIL - text = "FAIL: " + text - except ZeroDivisionError: - text = f"CPU {result.cpu}: {metric_name} zero division by baseline" - return [ - MetricComparison( - metric_name=metric_name, - result=comparison, - text=text, - ) - ] - else: - return [ - MetricComparison( - metric_name=metric_name, - result=ResultType.FAIL, - text=f"FAIL: CPU {result.cpu}: {metric_name} no threshold found", - ) - ] diff --git a/lnst/RecipeCommon/Perf/Evaluators/BaselineEvaluator.py b/lnst/RecipeCommon/Perf/Evaluators/BaselineEvaluator.py index 6c072b55f..07e401725 100644 --- a/lnst/RecipeCommon/Perf/Evaluators/BaselineEvaluator.py +++ b/lnst/RecipeCommon/Perf/Evaluators/BaselineEvaluator.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Optional from functools import reduce from dataclasses import dataclass @@ -6,23 +6,58 @@ from lnst.Controller.RecipeResults import ResultType, Result from lnst.RecipeCommon.BaseResultEvaluator import BaseResultEvaluator from lnst.RecipeCommon.Perf.Recipe import RecipeConf as PerfRecipeConf +from lnst.RecipeCommon.Perf.Results import result_averages_difference from lnst.RecipeCommon.Perf.Measurements.Results import ( BaseMeasurementResults as PerfMeasurementResults, ) -class BaselineEvaluationResult(Result): - pass - - @dataclass class MetricComparison: + measurement_type: str + current_result: PerfMeasurementResults + baseline_result: Optional[PerfMeasurementResults] + threshold: Optional[float] metric_name: str - result: ResultType + difference: Optional[float] + comparison_result: ResultType text: str +class BaselineEvaluationResult(Result): + def __init__( + self, comparisons: list[MetricComparison], recipe_conf: PerfRecipeConf + ): + super().__init__(ResultType.PASS) + self.comparisons = comparisons + self.recipe_conf = recipe_conf + + @property + def result(self) -> ResultType: + return reduce( + ResultType.max_severity, + [comparison.comparison_result for comparison in self.comparisons], + ResultType.PASS, + ) + + @property + def description(self) -> str: + res = [] + current_result = None + for comparison in self.comparisons: + if comparison.current_result != current_result: + res.append(comparison.current_result.describe()) + current_result = comparison.current_result + res.append(f"{comparison.comparison_result}: {comparison.text}") + return "\n".join( + ["Baseline evaluation of"] + res + ) + + class BaselineEvaluator(BaseResultEvaluator): + def __init__(self, metrics_to_evaluate: Optional[List[str]] = None): + self._metrics_to_evaluate = metrics_to_evaluate + def evaluate_results( self, recipe: BaseRecipe, @@ -59,53 +94,22 @@ def evaluate_group_results( ): cumulative_result = ResultType.PASS comparisons = [] - result_text = self.describe_group_results(recipe, recipe_conf, results) baselines = self.get_baselines(recipe, recipe_conf, results) - result_index = len(recipe.current_run.results) - for i, (result, baseline) in enumerate(zip(results, baselines)): - metric_comparisons = self.compare_result_with_baseline( - recipe, recipe_conf, result, baseline, result_index - ) - cumulative_result = reduce( - ResultType.max_severity, - 
[metric.result for metric in metric_comparisons], - cumulative_result, - ) - result_text.extend( - [metric.text for metric in metric_comparisons] - ) + for result, baseline in zip(results, baselines): comparisons.extend( - [ - { - "measurement_type": result.measurement.__class__.__name__, - "current_result": result, - "baseline_result": baseline, - "comparison_result": metric.result, - "metric_name": metric.metric_name, - "text": metric.text, - "recipe_conf": recipe_conf, - } - for metric in metric_comparisons - ] + self.compare_result_with_baseline( + recipe, recipe_conf, result, baseline + ) ) recipe.add_custom_result( BaselineEvaluationResult( - cumulative_result, - "\n".join(result_text), - data={"comparisons": comparisons}, + comparisons=comparisons, + recipe_conf=recipe_conf, ) ) - def describe_group_results( - self, - recipe: BaseRecipe, - recipe_conf: PerfRecipeConf, - results: List[PerfMeasurementResults], - ) -> List[str]: - return [] - def get_baselines( self, recipe: BaseRecipe, @@ -121,7 +125,14 @@ def get_baseline( recipe: BaseRecipe, recipe_conf: PerfRecipeConf, result: PerfMeasurementResults, - ) -> PerfMeasurementResults: + ) -> Optional[PerfMeasurementResults]: + return None + + def get_threshold( + self, + baseline: PerfMeasurementResults, + metric_name: str, + ) -> Optional[float]: return None def compare_result_with_baseline( @@ -130,6 +141,58 @@ def compare_result_with_baseline( recipe_conf: PerfRecipeConf, result: PerfMeasurementResults, baseline: PerfMeasurementResults, - result_index: int = 0, ) -> List[MetricComparison]: - raise NotImplementedError("Result to baseline metric comparison not implemented") + comparisons = [] + + if self._metrics_to_evaluate: + metrics_to_evaluate = [ + i for i in result.metrics if i in self._metrics_to_evaluate + ] + else: + metrics_to_evaluate = result.metrics + + for metric in metrics_to_evaluate: + comparisons.append( + self.compare_metrics_with_threshold( + result=result, + baseline=baseline, + metric_name=metric, + ) + ) + return comparisons + + def compare_metrics_with_threshold(self, result, baseline, metric_name): + threshold = None + diff = None + + if not baseline: + comparison_result = ResultType.FAIL + text = "No baseline found" + elif (threshold := self.get_threshold(baseline, metric_name)) is None: + comparison_result = ResultType.FAIL + text = "No threshold found" + else: + diff = result_averages_difference( + getattr(result, metric_name), + getattr(baseline, metric_name), + ) + direction = "higher" if diff >= 0 else "lower" + + comparison_result = ( + ResultType.PASS if abs(diff) <= threshold else ResultType.FAIL + ) + text = ( + f"New {metric_name} average is {abs(diff):.2f}% {direction} from the baseline. 
" + f"Allowed difference: {threshold}%" + ) + + return MetricComparison( + measurement_type=result.measurement.__class__.__name__, + current_result=result, + baseline_result=baseline, + threshold=threshold, + metric_name=metric_name, + difference=diff, + comparison_result=comparison_result, + text=text, + ) diff --git a/lnst/RecipeCommon/Perf/Evaluators/BaselineFlowAverageEvaluator.py b/lnst/RecipeCommon/Perf/Evaluators/BaselineFlowAverageEvaluator.py deleted file mode 100644 index 193ab56c8..000000000 --- a/lnst/RecipeCommon/Perf/Evaluators/BaselineFlowAverageEvaluator.py +++ /dev/null @@ -1,111 +0,0 @@ -from __future__ import division -from typing import List - -from lnst.Controller.Recipe import BaseRecipe -from lnst.Controller.RecipeResults import ResultType - -from lnst.RecipeCommon.Perf.Recipe import RecipeConf as PerfRecipeConf -from lnst.RecipeCommon.Perf.Results import result_averages_difference -from lnst.RecipeCommon.Perf.Results import SequentialPerfResult -from lnst.RecipeCommon.Perf.Measurements.Results import ( - BaseMeasurementResults as PerfMeasurementResults, -) -from lnst.RecipeCommon.Perf.Evaluators.BaselineEvaluator import ( - BaselineEvaluator, MetricComparison -) - - -class BaselineFlowAverageEvaluator(BaselineEvaluator): - def __init__( - self, thresholds: dict, metrics_to_evaluate: List[str] = None - ): - self._thresholds = thresholds - - if metrics_to_evaluate is not None: - self._metrics_to_evaluate = metrics_to_evaluate - else: - self._metrics_to_evaluate = [ - "generator_results", - "generator_cpu_stats", - "receiver_results", - "receiver_cpu_stats", - ] - - def describe_group_results( - self, - recipe: BaseRecipe, - recipe_conf: PerfRecipeConf, - results: List[PerfMeasurementResults], - ) -> List[str]: - result = results[0] - return [ - "Baseline average evaluation of flow:", - "{}".format(result.flow) - ] - - def compare_result_with_baseline( - self, - recipe: BaseRecipe, - recipe_conf: PerfRecipeConf, - result: PerfMeasurementResults, - baseline: PerfMeasurementResults, - result_index: int = 0, - ) -> List[MetricComparison]: - metric_comparisons = [] - for i in self._metrics_to_evaluate: - metric = f"{result_index}_{i}" - if baseline is None: - comparison_result = ResultType.FAIL - text = f"FAIL: Metric {metric} baseline not found for this flow" - elif (threshold := self._thresholds.get(metric, None)) is not None: - comparison_result, text = self._average_diff_comparison( - name=metric, - target=getattr(result, i), - baseline=getattr(baseline, i), - threshold=threshold - ) - else: - comparison_result = ResultType.FAIL - text = f"FAIL: Metric {metric} threshold not found" - - metric_comparisons.append( - MetricComparison( - metric_name=metric, - result=comparison_result, - text=text, - ) - ) - return metric_comparisons - - def _average_diff_comparison( - self, - name: str, - target: SequentialPerfResult, - baseline: SequentialPerfResult, - threshold: int - ): - difference = result_averages_difference(target, baseline) - result_text = "New {name} average is {diff:.2f}% {direction} from the baseline. 
" \ - "Allowed difference: {threshold}%".format( - name=name, - diff=abs(difference), - direction="higher" if difference >= 0 else "lower", - threshold=threshold - ) - - cpu = "_cpu_" in name - - # ( flow metrics ) or ( cpu metrics ) - if (not cpu and difference > threshold) or (cpu and difference < -threshold): - comparison = ResultType.WARNING - elif (not cpu and difference >= -threshold) or (cpu and difference <= threshold): - comparison = ResultType.PASS - else: - comparison = ResultType.FAIL - - if comparison == ResultType.WARNING: - result_text = f"IMPROVEMENT: {result_text}" - else: - result_text = f"{comparison}: {result_text}" - - return comparison, result_text diff --git a/lnst/RecipeCommon/Perf/Evaluators/BaselineRDMABandwidthAverageEvaluator.py b/lnst/RecipeCommon/Perf/Evaluators/BaselineRDMABandwidthAverageEvaluator.py deleted file mode 100644 index 81e2b3b56..000000000 --- a/lnst/RecipeCommon/Perf/Evaluators/BaselineRDMABandwidthAverageEvaluator.py +++ /dev/null @@ -1,64 +0,0 @@ -from lnst.Controller.Recipe import BaseRecipe -from lnst.Controller.RecipeResults import ResultType -from lnst.RecipeCommon.Perf.Evaluators.BaselineEvaluator import BaselineEvaluator, MetricComparison -from lnst.RecipeCommon.Perf.Measurements.Results import RDMABandwidthMeasurementResults - - -class BaselineRDMABandwidthAverageEvaluator(BaselineEvaluator): - - def __init__(self, thresholds: dict): - self._thresholds = thresholds - - def compare_result_with_baseline( - self, - recipe: BaseRecipe, - recipe_conf: "EnrtConfiguration", - result: RDMABandwidthMeasurementResults, - baseline: RDMABandwidthMeasurementResults, - result_index: int = 0, - ) -> list[MetricComparison]: - metric_name = f"{result_index}_bandwidth" - - if baseline is None: - return [ - MetricComparison( - metric_name=metric_name, - result=ResultType.FAIL, - text=f"{self.__class__.__name__} FAIL:\n Metric {metric_name} baseline not found", - ) - ] - elif (threshold := self._thresholds.get(metric_name, None)) is None: - return [ - MetricComparison( - metric_name=metric_name, - result=ResultType.FAIL, - text=f"{self.__class__.__name__}\nFAIL: Metric {metric_name} threshold not found", - ) - ] - - difference = ((result.bandwidth.average / baseline.bandwidth.average) * 100) - 100 - direction = "higher" if difference >= 0 else "lower" - text = [ - f"{self.__class__.__name__} of {metric_name}", - f"Baseline: {baseline.bandwidth.average} MiB/s", - f"Measured: {result.bandwidth.average} MiB/s", - f"{abs(difference):2f}% {direction} than baseline", - f"Allowed difference: {threshold}%", - ] - if difference > threshold: - comparison = ResultType.WARNING - text[0] = f"IMPROVEMENT: {text[0]}" - elif difference >= -threshold: - comparison = ResultType.PASS - text[0] = f"PASS: {text[0]}" - else: - comparison = ResultType.FAIL - text[0] = f"FAIL: {text[0]}" - - return [ - MetricComparison( - metric_name=metric_name, - result=comparison, - text="\n".join(text) - ) - ] diff --git a/lnst/RecipeCommon/Perf/Evaluators/BaselineTcRunAverageEvaluator.py b/lnst/RecipeCommon/Perf/Evaluators/BaselineTcRunAverageEvaluator.py deleted file mode 100644 index 01c30a3fe..000000000 --- a/lnst/RecipeCommon/Perf/Evaluators/BaselineTcRunAverageEvaluator.py +++ /dev/null @@ -1,67 +0,0 @@ -from lnst.Controller.RecipeResults import ResultType -from lnst.RecipeCommon.Perf.Evaluators.BaselineEvaluator import BaselineEvaluator, MetricComparison -from lnst.RecipeCommon.Perf.Measurements.Results import TcRunMeasurementResults -from lnst.Recipes.ENRT.TrafficControlRecipe import 
TrafficControlRecipe, TcRecipeConfiguration - - -class BaselineTcRunAverageEvaluator(BaselineEvaluator): - - def __init__(self, thresholds: dict): - self._thresholds = thresholds - - def compare_result_with_baseline( - self, - recipe: TrafficControlRecipe, - recipe_conf: TcRecipeConfiguration, - result: TcRunMeasurementResults, - baseline: TcRunMeasurementResults, - result_index: int = 0, - ) -> list[MetricComparison]: - - metric_name = f"{result_index}_rule_install_rate" - - if baseline is None: - return [ - MetricComparison( - metric_name=metric_name, - result=ResultType.FAIL, - text=f"{self.__class__.__name__} FAIL:\n {result.device.name} {metric_name} baseline not found", - ) - ] - elif (threshold := self._thresholds.get(metric_name, None)) is not None: - difference = ((result.rule_install_rate.average / baseline.rule_install_rate.average) * 100) - 100 - direction = "higher" if difference >= 0 else "lower" - text = [ - f"{self.__class__.__name__} of tc run with {metric_name}", - f"{result.description}", - f"Baseline: {baseline.rule_install_rate.average} rules/sec", - f"Measured: {result.rule_install_rate.average} rules/sec", - f"{abs(difference):2f}% {direction} than baseline ", - f"Allowed difference: {threshold}% ", - ] - if difference > threshold: - comparison = ResultType.WARNING - text[0] = f"IMPROVEMENT: {text[0]}" - elif difference >= -threshold: - comparison = ResultType.PASS - text[0] = f"PASS: {text[0]}" - else: - comparison = ResultType.FAIL - text[0] = f"FAIL: {text[0]}" - - return [ - MetricComparison( - metric_name=metric_name, - result=comparison, - text="\n".join(text) - ) - ] - else: - return [ - MetricComparison( - metric_name=metric_name, - result=ResultType.FAIL, - text=f"{self.__class__.__name__}\nFAIL: {result.device.name} {metric_name} no threshold found", - ) - ] - diff --git a/lnst/RecipeCommon/Perf/Evaluators/__init__.py b/lnst/RecipeCommon/Perf/Evaluators/__init__.py index 15a4fd7f4..1bac79501 100644 --- a/lnst/RecipeCommon/Perf/Evaluators/__init__.py +++ b/lnst/RecipeCommon/Perf/Evaluators/__init__.py @@ -1,8 +1,4 @@ from lnst.RecipeCommon.Perf.Evaluators.NonzeroFlowEvaluator import NonzeroFlowEvaluator -from lnst.RecipeCommon.Perf.Evaluators.BaselineFlowAverageEvaluator import BaselineFlowAverageEvaluator - +from lnst.RecipeCommon.Perf.Evaluators.BaselineEvaluator import BaselineEvaluator from lnst.RecipeCommon.Perf.Evaluators.BaselineCPUAverageEvaluator import BaselineCPUAverageEvaluator - from lnst.RecipeCommon.Perf.Evaluators.MaxTimeTakenEvaluator import MaxTimeTakenEvaluator -from lnst.RecipeCommon.Perf.Evaluators.BaselineTcRunAverageEvaluator import BaselineTcRunAverageEvaluator -from lnst.RecipeCommon.Perf.Evaluators.BaselineRDMABandwidthAverageEvaluator import BaselineRDMABandwidthAverageEvaluator diff --git a/lnst/RecipeCommon/Perf/Measurements/BaseFlowMeasurement.py b/lnst/RecipeCommon/Perf/Measurements/BaseFlowMeasurement.py index e24cfccbe..176a751fd 100644 --- a/lnst/RecipeCommon/Perf/Measurements/BaseFlowMeasurement.py +++ b/lnst/RecipeCommon/Perf/Measurements/BaseFlowMeasurement.py @@ -92,7 +92,6 @@ def _report_flow_results(cls, recipe, flow_results): receiver_cpu = flow_results.receiver_cpu_stats desc = [] - desc.append(str(flow_results.flow)) desc.append(flow_results.describe()) recipe_result = ResultType.PASS diff --git a/lnst/RecipeCommon/Perf/Measurements/Results/BaseMeasurementResults.py b/lnst/RecipeCommon/Perf/Measurements/Results/BaseMeasurementResults.py index e5e26eda4..a8840376c 100644 --- 
a/lnst/RecipeCommon/Perf/Measurements/Results/BaseMeasurementResults.py +++ b/lnst/RecipeCommon/Perf/Measurements/Results/BaseMeasurementResults.py @@ -14,6 +14,10 @@ def measurement(self) -> BaseMeasurement: def warmup_duration(self): return self._warmup_duration + @property + def metrics(self) -> list[str]: + return [] + def align_data(self, start, end): return self diff --git a/lnst/RecipeCommon/Perf/Measurements/Results/CPUMeasurementResults.py b/lnst/RecipeCommon/Perf/Measurements/Results/CPUMeasurementResults.py index 1b4c5a6ea..7f7a4fb21 100644 --- a/lnst/RecipeCommon/Perf/Measurements/Results/CPUMeasurementResults.py +++ b/lnst/RecipeCommon/Perf/Measurements/Results/CPUMeasurementResults.py @@ -26,6 +26,10 @@ def utilization(self): def utilization(self, value): self._utilization = value + @property + def metrics(self) -> list[str]: + return ['utilization'] + def describe(self): return "host {host} cpu '{cpu}' utilization: {average:.2f} +-{deviation:.2f} {unit} per second".format( host=self.host.hostid, diff --git a/lnst/RecipeCommon/Perf/Measurements/Results/FlowMeasurementResults.py b/lnst/RecipeCommon/Perf/Measurements/Results/FlowMeasurementResults.py index eb503e73e..5d45b9c60 100644 --- a/lnst/RecipeCommon/Perf/Measurements/Results/FlowMeasurementResults.py +++ b/lnst/RecipeCommon/Perf/Measurements/Results/FlowMeasurementResults.py @@ -1,16 +1,29 @@ from lnst.RecipeCommon.Perf.Results import ParallelPerfResult -from lnst.RecipeCommon.Perf.Measurements.Results.BaseMeasurementResults import BaseMeasurementResults +from lnst.RecipeCommon.Perf.Measurements.Results.BaseMeasurementResults import ( + BaseMeasurementResults, +) class FlowMeasurementResults(BaseMeasurementResults): def __init__(self, measurement, flow, warmup_duration=0): - super(FlowMeasurementResults, self).__init__(measurement, warmup_duration) + super(FlowMeasurementResults, self).__init__( + measurement, warmup_duration + ) self._flow = flow self._generator_results = None self._generator_cpu_stats = None self._receiver_results = None self._receiver_cpu_stats = None + @property + def metrics(self) -> list[str]: + return [ + "generator_results", + "generator_cpu_stats", + "receiver_results", + "receiver_cpu_stats", + ] + @property def flow(self): return self._flow @@ -77,7 +90,10 @@ def warmup_end(self): return max( [ parallel[self.warmup_duration - 1].end_timestamp - for parallel in (*self.generator_results, *self.receiver_results) + for parallel in ( + *self.generator_results, + *self.receiver_results, + ) ] ) @@ -89,18 +105,31 @@ def warmdown_start(self): return min( [ parallel[-self.warmup_duration].start_timestamp - for parallel in (*self.generator_results, *self.receiver_results) + for parallel in ( + *self.generator_results, + *self.receiver_results, + ) ] ) def time_slice(self, start, end): - result_copy = FlowMeasurementResults(self.measurement, self.flow, warmup_duration=0) + result_copy = FlowMeasurementResults( + self.measurement, self.flow, warmup_duration=0 + ) - result_copy.generator_cpu_stats = self.generator_cpu_stats.time_slice(start, end) - result_copy.receiver_cpu_stats = self.receiver_cpu_stats.time_slice(start, end) + result_copy.generator_cpu_stats = self.generator_cpu_stats.time_slice( + start, end + ) + result_copy.receiver_cpu_stats = self.receiver_cpu_stats.time_slice( + start, end + ) - result_copy.generator_results = self.generator_results.time_slice(start, end) - result_copy.receiver_results = self.receiver_results.time_slice(start, end) + result_copy.generator_results = 
self.generator_results.time_slice( + start, end + ) + result_copy.receiver_results = self.receiver_results.time_slice( + start, end + ) return result_copy @@ -110,29 +139,42 @@ def describe(self): receiver = self.receiver_results receiver_cpu = self.receiver_cpu_stats desc = [] - desc.append("Generator measured throughput: {tput:.2f} +-{deviation:.2f}({percentage:.2f}%) {unit} per second." - .format(tput=generator.average, - deviation=generator.std_deviation, - percentage=self._deviation_percentage(generator), - unit=generator.unit)) - desc.append("Generator process CPU data: {cpu:.2f} +-{cpu_deviation:.2f} {cpu_unit} per second." - .format(cpu=generator_cpu.average, - cpu_deviation=generator_cpu.std_deviation, - cpu_unit=generator_cpu.unit)) - desc.append("Receiver measured throughput: {tput:.2f} +-{deviation:.2f}({percentage:.2f}%) {unit} per second." - .format(tput=receiver.average, - deviation=receiver.std_deviation, - percentage=self._deviation_percentage(receiver), - unit=receiver.unit)) - desc.append("Receiver process CPU data: {cpu:.2f} +-{cpu_deviation:.2f} {cpu_unit} per second." - .format(cpu=receiver_cpu.average, - cpu_deviation=receiver_cpu.std_deviation, - cpu_unit=receiver_cpu.unit)) + desc.append(str(self.flow)) + desc.append( + "Generator measured throughput (generator_results): {tput:.2f} +-{deviation:.2f}({percentage:.2f}%) {unit} per second.".format( + tput=generator.average, + deviation=generator.std_deviation, + percentage=self._deviation_percentage(generator), + unit=generator.unit, + ) + ) + desc.append( + "Generator process CPU data (generator_cpu_stats): {cpu:.2f} +-{cpu_deviation:.2f} {cpu_unit} per second.".format( + cpu=generator_cpu.average, + cpu_deviation=generator_cpu.std_deviation, + cpu_unit=generator_cpu.unit, + ) + ) + desc.append( + "Receiver measured throughput (receiver_results): {tput:.2f} +-{deviation:.2f}({percentage:.2f}%) {unit} per second.".format( + tput=receiver.average, + deviation=receiver.std_deviation, + percentage=self._deviation_percentage(receiver), + unit=receiver.unit, + ) + ) + desc.append( + "Receiver process CPU data (receiver_cpu_stats): {cpu:.2f} +-{cpu_deviation:.2f} {cpu_unit} per second.".format( + cpu=receiver_cpu.average, + cpu_deviation=receiver_cpu.std_deviation, + cpu_unit=receiver_cpu.unit, + ) + ) return "\n".join(desc) @staticmethod def _deviation_percentage(result): try: - return (result.std_deviation/result.average) * 100 + return (result.std_deviation / result.average) * 100 except ZeroDivisionError: - return float('inf') if result.std_deviation >= 0 else float("-inf") + return float("inf") if result.std_deviation >= 0 else float("-inf") diff --git a/lnst/RecipeCommon/Perf/Measurements/Results/RDMABandwidthMeasurementResults.py b/lnst/RecipeCommon/Perf/Measurements/Results/RDMABandwidthMeasurementResults.py index 84df3215f..a0fa82e09 100644 --- a/lnst/RecipeCommon/Perf/Measurements/Results/RDMABandwidthMeasurementResults.py +++ b/lnst/RecipeCommon/Perf/Measurements/Results/RDMABandwidthMeasurementResults.py @@ -9,6 +9,10 @@ def __init__(self, measurement: BaseMeasurement, flow: "Flow"): self._flow = flow + @property + def metrics(self) -> list[str]: + return ['bandwidth'] + @property def flow(self): return self._flow diff --git a/lnst/RecipeCommon/Perf/Measurements/Results/TcRunMeasurementResults.py b/lnst/RecipeCommon/Perf/Measurements/Results/TcRunMeasurementResults.py index aa26ebeb2..98aac5ef8 100644 --- a/lnst/RecipeCommon/Perf/Measurements/Results/TcRunMeasurementResults.py +++ 
b/lnst/RecipeCommon/Perf/Measurements/Results/TcRunMeasurementResults.py @@ -18,6 +18,10 @@ def __init__( self._rule_install_rate: ParallelPerfResult = None self._run_success: bool = None + @property + def metrics(self) -> list[str]: + return ['rule_install_rate'] + @property def device(self) -> Device: return self._device @@ -48,7 +52,7 @@ def description(self): f" tc run with {self.rule_install_rate.value} rules" \ f" num_instances={self.measurement.num_instances}" \ f" took {self.rule_install_rate.duration} seconds " \ - f"({self.rule_install_rate.average} rules/sec)" + f"(rule_install_rate={self.rule_install_rate.average} rules/sec)" @property def time_taken(self): diff --git a/lnst/RecipeCommon/Perf/Measurements/Results/XDPBenchMeasurementResults.py b/lnst/RecipeCommon/Perf/Measurements/Results/XDPBenchMeasurementResults.py index 4ff535051..244592ac4 100644 --- a/lnst/RecipeCommon/Perf/Measurements/Results/XDPBenchMeasurementResults.py +++ b/lnst/RecipeCommon/Perf/Measurements/Results/XDPBenchMeasurementResults.py @@ -11,6 +11,10 @@ def __init__(self, *args, **kwargs): self._generator_results = ParallelPerfResult() # multiple instances of pktgen self._receiver_results = ParallelPerfResult() # single instance of xdpbench + @property + def metrics(self) -> list[str]: + return ['generator_results', 'receiver_results'] + def add_results(self, results): if results is None: return @@ -37,12 +41,12 @@ def describe(self) -> str: desc = [] desc.append(str(self.flow)) desc.append( - "Generator generated: {tput:,f} {unit} per second.".format( + "Generator generated (generator_results): {tput:,f} {unit} per second.".format( tput=generator.average, unit=generator.unit ) ) desc.append( - "Receiver processed: {tput:,f} {unit} per second.".format( + "Receiver processed (receiver_results): {tput:,f} {unit} per second.".format( tput=receiver.average, unit=receiver.unit ) ) diff --git a/lnst/Recipes/ENRT/BaseEnrtRecipe.py b/lnst/Recipes/ENRT/BaseEnrtRecipe.py index 4d20083df..68979680b 100644 --- a/lnst/Recipes/ENRT/BaseEnrtRecipe.py +++ b/lnst/Recipes/ENRT/BaseEnrtRecipe.py @@ -165,12 +165,6 @@ class BaseEnrtRecipe( specify how many times should each performance measurement be repeated to generate cumulative results which can be statistically analyzed. :type perf_iterations: :any:`IntParam` (default 5) - - :param perf_evaluation_strategy: - Parameter used by the :any:`evaluator_by_measurement` selector to - pick correct performance measurement evaluators based on the strategy - specified. - :type perf_evaluation_strategy: :any:`StrParam` (default "all") """ driver = StrParam() @@ -187,7 +181,6 @@ class BaseEnrtRecipe( # generic perf test params perf_iterations = IntParam(default=5) - perf_evaluation_strategy = StrParam(default="all") def test(self): """Main test loop shared by all the Enrt recipes @@ -472,27 +465,14 @@ def evaluator_by_measurement(self, measurement): The selector looks at the input measurement to pick appropriate evaluator. - If :any: `perf_evaluation_strategy` property is set - to either "none" or "nonzero", selector returns - given evaluators based on their strategy. 
- :return: list of Result evaluators :rtype: List[:any:`BaseResultEvaluator`] """ - if self.params.perf_evaluation_strategy == "none": - return [] - if isinstance(measurement, BaseCPUMeasurement): - if self.params.perf_evaluation_strategy in ["nonzero", "none"]: - evaluators = [] - else: - evaluators = self.cpu_perf_evaluators + evaluators = self.cpu_perf_evaluators elif isinstance(measurement, BaseFlowMeasurement): - if self.params.perf_evaluation_strategy == "nonzero": - evaluators = [NonzeroFlowEvaluator()] - else: - evaluators = self.net_perf_evaluators + evaluators = self.net_perf_evaluators else: evaluators = [] diff --git a/lnst/Recipes/ENRT/OvS_DPDK_PvP.py b/lnst/Recipes/ENRT/OvS_DPDK_PvP.py index 50d9e58c1..a388e170e 100644 --- a/lnst/Recipes/ENRT/OvS_DPDK_PvP.py +++ b/lnst/Recipes/ENRT/OvS_DPDK_PvP.py @@ -37,13 +37,13 @@ def __init__(self): class OvSDPDKPvPRecipe(BasePvPRecipe): - m1 = HostReq() - m1.eth0 = DeviceReq(label="net1", driver=RecipeParam("driver")) - m1.eth1 = DeviceReq(label="net1", driver=RecipeParam("driver")) + host1 = HostReq() + host1.eth0 = DeviceReq(label="net1", driver=RecipeParam("driver")) + host1.eth1 = DeviceReq(label="net1", driver=RecipeParam("driver")) - m2 = HostReq(with_guest="yes") - m2.eth0 = DeviceReq(label="net1", driver=RecipeParam("driver")) - m2.eth1 = DeviceReq(label="net1", driver=RecipeParam("driver")) + host2 = HostReq(with_guest="yes") + host2.eth0 = DeviceReq(label="net1", driver=RecipeParam("driver")) + host2.eth1 = DeviceReq(label="net1", driver=RecipeParam("driver")) net_ipv4 = IPv4NetworkParam(default="192.168.1.0/24") @@ -76,33 +76,33 @@ def check_dependencies(self): def gen_ping_config(self): return [ - (self.matched.m1, self.matched.m1.eth0, self.matched.m2.eth0), - (self.matched.m1, self.matched.m1.eth1, self.matched.m2.eth1), - (self.matched.m2, self.matched.m2.eth0, self.matched.m1.eth0), - (self.matched.m2, self.matched.m2.eth1, self.matched.m2.eth1) + (self.matched.host1, self.matched.host1.eth0, self.matched.host2.eth0), + (self.matched.host1, self.matched.host1.eth1, self.matched.host2.eth1), + (self.matched.host2, self.matched.host2.eth0, self.matched.host1.eth0), + (self.matched.host2, self.matched.host2.eth1, self.matched.host2.eth1) ] def test_wide_configuration(self, config): - config.generator.host = self.matched.m1 - config.generator.nics.append(self.matched.m1.eth0) - config.generator.nics.append(self.matched.m1.eth1) + config.generator.host = self.matched.host1 + config.generator.nics.append(self.matched.host1.eth0) + config.generator.nics.append(self.matched.host1.eth1) ipv4_addr = interface_addresses(self.params.net_ipv4) nic_addrs = { - self.matched.m1.eth0: next(ipv4_addr), - self.matched.m2.eth0: next(ipv4_addr), - self.matched.m1.eth1: next(ipv4_addr), - self.matched.m2.eth1: next(ipv4_addr), + self.matched.host1.eth0: next(ipv4_addr), + self.matched.host2.eth0: next(ipv4_addr), + self.matched.host1.eth1: next(ipv4_addr), + self.matched.host2.eth1: next(ipv4_addr), } - self.matched.m1.eth0.ip_add(nic_addrs[self.matched.m1.eth0]) - self.matched.m1.eth1.ip_add(nic_addrs[self.matched.m1.eth1]) + self.matched.host1.eth0.ip_add(nic_addrs[self.matched.host1.eth0]) + self.matched.host1.eth1.ip_add(nic_addrs[self.matched.host1.eth1]) self.base_dpdk_configuration(config.generator) - config.dut.host = self.matched.m2 - config.dut.nics.append(self.matched.m2.eth0) - config.dut.nics.append(self.matched.m2.eth1) - self.matched.m2.eth0.ip_add(nic_addrs[self.matched.m2.eth0]) - 
self.matched.m2.eth1.ip_add(nic_addrs[self.matched.m2.eth1]) + config.dut.host = self.matched.host2 + config.dut.nics.append(self.matched.host2.eth0) + config.dut.nics.append(self.matched.host2.eth1) + self.matched.host2.eth0.ip_add(nic_addrs[self.matched.host2.eth0]) + self.matched.host2.eth1.ip_add(nic_addrs[self.matched.host2.eth1]) self.base_dpdk_configuration(config.dut) self.ovs_dpdk_bridge_configuration(config.dut) @@ -228,7 +228,7 @@ def ovs_dpdk_bridge_configuration(self, host_conf): host.run("systemctl restart openvswitch") # TODO use an actual OvS Device object - # TODO config.dut.nics.append(CachedRemoteDevice(m2.ovs)) + # TODO config.dut.nics.append(CachedRemoteDevice(host2.ovs)) host.run("ovs-vsctl add-br br0 -- set bridge br0 datapath_type=netdev") host_conf.dpdk_ports = [] diff --git a/lnst/Recipes/ENRT/VhostNetPvPRecipe.py b/lnst/Recipes/ENRT/VhostNetPvPRecipe.py index be0630bb8..fe2f5bcad 100644 --- a/lnst/Recipes/ENRT/VhostNetPvPRecipe.py +++ b/lnst/Recipes/ENRT/VhostNetPvPRecipe.py @@ -23,13 +23,13 @@ def __init__(self): class VhostNetPvPRecipe(BasePvPRecipe): - generator_req = HostReq() - generator_req.eth0 = DeviceReq(label="net1", driver=RecipeParam("driver")) - generator_req.eth1 = DeviceReq(label="net1", driver=RecipeParam("driver")) + host1 = HostReq() + host1.eth0 = DeviceReq(label="net1", driver=RecipeParam("driver")) + host1.eth1 = DeviceReq(label="net1", driver=RecipeParam("driver")) - host_req = HostReq(with_guest="yes") - host_req.eth0 = DeviceReq(label="net1", driver=RecipeParam("driver")) - host_req.eth1 = DeviceReq(label="net1", driver=RecipeParam("driver")) + host2 = HostReq(with_guest="yes") + host2.eth0 = DeviceReq(label="net1", driver=RecipeParam("driver")) + host2.eth1 = DeviceReq(label="net1", driver=RecipeParam("driver")) net_ipv4 = IPv4NetworkParam(default="192.168.101.0/24") @@ -64,39 +64,39 @@ def check_params(self): def gen_ping_config(self): return [ - (self.matched.generator_req, - self.matched.generator_req.eth0, - self.matched.host_req.eth0), - (self.matched.generator_req, - self.matched.generator_req.eth1, - self.matched.host_req.eth1), - (self.matched.host_req, - self.matched.host_req.eth0, - self.matched.generator_req.eth0), - (self.matched.host_req, - self.matched.host_req.eth1, - self.matched.host_req.eth1) + (self.matched.host1, + self.matched.host1.eth0, + self.matched.host2.eth0), + (self.matched.host1, + self.matched.host1.eth1, + self.matched.host2.eth1), + (self.matched.host2, + self.matched.host2.eth0, + self.matched.host1.eth0), + (self.matched.host2, + self.matched.host2.eth1, + self.matched.host2.eth1) ] def test_wide_configuration(self, config): - config.generator.host = self.matched.generator_req - config.generator.nics.append(self.matched.generator_req.eth0) - config.generator.nics.append(self.matched.generator_req.eth1) + config.generator.host = self.matched.host1 + config.generator.nics.append(self.matched.host1.eth0) + config.generator.nics.append(self.matched.host1.eth1) ipv4_addr = interface_addresses(self.params.net_ipv4) - self.matched.generator_req.eth0.ip_add(next(ipv4_addr)) - self.matched.generator_req.eth1.ip_add(next(ipv4_addr)) - self.matched.generator_req.eth0.up() - self.matched.generator_req.eth1.up() + self.matched.host1.eth0.ip_add(next(ipv4_addr)) + self.matched.host1.eth1.ip_add(next(ipv4_addr)) + self.matched.host1.eth0.up() + self.matched.host1.eth1.up() self.base_dpdk_configuration(config.generator) - config.dut.host = self.matched.host_req - config.dut.nics.append(self.matched.host_req.eth0) - 
config.dut.nics.append(self.matched.host_req.eth1) - self.matched.host_req.eth0.up() - self.matched.host_req.eth1.up() + config.dut.host = self.matched.host2 + config.dut.nics.append(self.matched.host2.eth0) + config.dut.nics.append(self.matched.host2.eth1) + self.matched.host2.eth0.up() + self.matched.host2.eth1.up() self.host_forwarding_configuration(config.dut)
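
Note on the std_deviation() change in lnst/Common/Utils.py above: statistics.stdev() computes the sample standard deviation (dividing by n - 1), whereas the removed formula computed the population standard deviation (dividing by n), and it raises StatisticsError for fewer than two data points, which is why the guard was tightened from len(values) <= 0 to len(values) <= 1. A minimal sketch of the difference, using made-up sample values:

    import math
    import statistics

    values = [10.0, 12.0, 14.0]

    # Old behaviour: population standard deviation (divide by n).
    avg = sum(values) / len(values)
    population_sd = math.sqrt(sum((v - avg) ** 2 for v in values) / len(values))

    # New behaviour: sample standard deviation (divide by n - 1);
    # statistics.stdev() needs at least two values, hence the new guard.
    sample_sd = statistics.stdev(values)

    print(f"population: {population_sd:.4f}")  # ~1.6330
    print(f"sample:     {sample_sd:.4f}")      # 2.0000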
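
With the comparison logic hoisted into BaselineEvaluator.compare_result_with_baseline() and compare_metrics_with_threshold(), a concrete evaluator only has to supply a baseline lookup and a per-metric threshold; the metrics to compare come from the result's metrics property (e.g. FlowMeasurementResults lists generator_results, generator_cpu_stats, receiver_results and receiver_cpu_stats), optionally narrowed by the metrics_to_evaluate constructor argument. A minimal sketch of such a subclass, assuming hypothetical baselines/thresholds dicts that are not part of this diff:

    from typing import Optional

    from lnst.Controller.Recipe import BaseRecipe
    from lnst.RecipeCommon.Perf.Recipe import RecipeConf as PerfRecipeConf
    from lnst.RecipeCommon.Perf.Measurements.Results import (
        BaseMeasurementResults as PerfMeasurementResults,
    )
    from lnst.RecipeCommon.Perf.Evaluators.BaselineEvaluator import BaselineEvaluator


    class DictBaselineEvaluator(BaselineEvaluator):
        """Hypothetical evaluator fed by plain dicts, for illustration only."""

        def __init__(self, baselines, thresholds, metrics_to_evaluate=None):
            super().__init__(metrics_to_evaluate)
            # keyed by whatever result.measurement.__class__.__name__ yields
            self._baselines = baselines
            # keyed by metric name, e.g. {"generator_results": 5.0}
            self._thresholds = thresholds

        def get_baseline(
            self,
            recipe: BaseRecipe,
            recipe_conf: PerfRecipeConf,
            result: PerfMeasurementResults,
        ) -> Optional[PerfMeasurementResults]:
            # Returning None makes compare_metrics_with_threshold() report
            # "No baseline found" as a FAIL for every metric of this result.
            return self._baselines.get(result.measurement.__class__.__name__)

        def get_threshold(
            self,
            baseline: PerfMeasurementResults,
            metric_name: str,
        ) -> Optional[float]:
            # Returning None here is reported as "No threshold found".
            return self._thresholds.get(metric_name)

The evaluator then emits one MetricComparison per metric, and BaselineEvaluationResult aggregates them into a single recipe result whose severity is the maximum severity of the individual comparisons.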