added evaluation to sflkit

uds-se · Jul 27, 2024 · 03de84d · 03de84d
1 parent c4fcc59
commit 03de84d
Show file tree

Hide file tree

Showing 4 changed files with 303 additions and 3 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sflkit"
-version = "0.2.21"
+version = "0.2.22"
 authors = [
     { name = "Marius Smytzek", email = "[email protected]" },
 ]

diff --git a/src/sflkit/analysis/analyzer.py b/src/sflkit/analysis/analyzer.py
@@ -19,6 +19,9 @@ def __init__(
         self.irrelevant_event_files = irrelevant_event_files
         self.model = Model(factory)
         self.paths: Dict[int, os.PathLike] = dict()
+        self.max_suspiciousness = 0
+        self.min_suspiciousness = 0
+        self.avg_suspiciousness = 0
 
     def _analyze(self, event_file):
         self.model.prepare(event_file)
@@ -48,20 +51,35 @@ def get_analysis_by_type(self, type_: AnalysisType) -> Set[AnalysisObject]:
 
     def get_sorted_suggestions(
         self, base_dir, metric: Callable = None, type_: AnalysisType = None
-    ):
+    ) -> List[Suggestion]:
+        """Return the suggestions for the collected analysis objects.
+
+        The objects come from ``get_analysis_by_type(type_)`` when ``type_`` is
+        truthy and from ``get_analysis()`` otherwise; building the suggestion
+        list is delegated to ``get_sorted_suggestions_from_analysis``.
+        """
         if type_:
             objects = self.get_analysis_by_type(type_)
         else:
             objects = self.get_analysis()
+        return self.get_sorted_suggestions_from_analysis(base_dir, objects, metric)
+
+    def get_sorted_suggestions_from_analysis(
+        self, base_dir, analysis: Set[AnalysisObject], metric: Callable = None
+    ) -> List[Suggestion]:
         suggestions = dict()
+        max_suspiciousness = float("-inf")
+        min_suspiciousness = float("inf")
+        avg_suspiciousness = 0
         for suggestion in map(
-            lambda p: p.get_suggestion(metric=metric, base_dir=base_dir), objects
+            lambda p: p.get_suggestion(metric=metric, base_dir=base_dir), analysis
         ):
+            max_suspiciousness = max(max_suspiciousness, suggestion.suspiciousness)
+            min_suspiciousness = min(min_suspiciousness, suggestion.suspiciousness)
+            avg_suspiciousness += suggestion.suspiciousness
             if suggestion.suspiciousness not in suggestions:
                 suggestions[suggestion.suspiciousness] = set(suggestion.lines)
             else:
                 suggestions[suggestion.suspiciousness] |= set(suggestion.lines)
 
+        self.max_suspiciousness = max_suspiciousness
+        self.min_suspiciousness = min_suspiciousness
+        self.avg_suspiciousness = avg_suspiciousness / len(analysis)
+
         return sorted(
             [
                 Suggestion(list(lines), suspiciousness)

diff --git a/src/sflkit/evaluation.py b/src/sflkit/evaluation.py
@@ -0,0 +1,116 @@
+import enum
+import random
+from typing import List, Dict, Callable, Set, Optional
+
+from sflkit.analysis.suggestion import Suggestion, Location
+
+
+class Average:
+    """Incremental mean of suspiciousness values.
+
+    The instance only remembers how many values were merged so far; the
+    caller hands the current mean back in on every call.
+    """
+
+    def __init__(self):
+        # Number of values that have been folded into the mean so far.
+        self.number_of_locations = 0
+
+    def average(self, suspiciousness: float, current_suspiciousness: float):
+        """Return the mean after adding ``suspiciousness`` to the current mean.
+
+        ``current_suspiciousness`` is treated as the mean of the
+        ``number_of_locations`` values counted before this call; the counter
+        is incremented as a side effect.
+        """
+        # Recover the running total from the mean before the count changes.
+        previous_total = current_suspiciousness * self.number_of_locations
+        self.number_of_locations += 1
+        return (previous_total + suspiciousness) / self.number_of_locations
+
+
+class Scenario(enum.Enum):
+    """Scenarios that select how ``Rank`` aggregates over the faulty locations."""
+
+    # ``Rank.get_rank`` takes the smallest rank among the faulty locations;
+    # ``Rank._top_n`` scores 1 as soon as one faulty location is listed.
+    BEST_CASE = "best_case"
+    # ``Rank.get_rank`` picks a middle element of the sorted faulty ranks;
+    # ``Rank._top_n`` relates the hits to half of the faulty locations.
+    AVG_CASE = "avg_case"
+    # ``Rank.get_rank`` takes the largest rank among the faulty locations;
+    # ``Rank._top_n`` returns the fraction of faulty locations listed.
+    WORST_CASE = "worst_case"
+
+
+class Rank:
+    """Rank suggested locations and score the ranking against known faults.
+
+    Suggestions are processed in the order produced by
+    ``sorted(suggestions, reverse=True)``, i.e. according to the ordering that
+    ``Suggestion`` itself defines. A suggestion with a single location takes
+    the next free rank; a suggestion with several (tied) locations shares the
+    rank ``len(lines) / 2 + (current_rank - 1)`` and uses up ``len(lines)``
+    rank positions. Suggestions without locations are ignored.
+
+    Attributes:
+        suggestions: The suggestions in processing order.
+        suspiciousness: For each location, the suspiciousness values of all
+            suggestions listing it, combined with ``metric``.
+        ranks: Maps each assigned rank to the locations sharing it.
+        locations: Maps each location to its rank.
+        number_of_locations: Number of distinct ranked locations.
+    """
+
+    def __init__(
+        self,
+        suggestions: List[Suggestion],
+        metric: Callable[[float, float], float] = max,
+        default_suspiciousness: float = float("-inf"),
+    ):
+        """Build the rank tables from ``suggestions``.
+
+        Args:
+            suggestions: Suggestions to rank.
+            metric: Combines a suggestion's suspiciousness with the value
+                already stored for the same location.
+            default_suspiciousness: Stored value assumed for a location that
+                has not been seen before.
+        """
+        self.suggestions = sorted(suggestions, reverse=True)
+        self.suspiciousness: Dict[Location, float] = dict()
+        self.ranks: Dict[float, List[Location]] = dict()
+        self.locations: Dict[Location, float] = dict()
+        current_rank = 1
+        for suggestion in self.suggestions:
+            lines = suggestion.lines
+            if len(lines) == 0:
+                continue
+            elif len(lines) == 1:
+                rank = current_rank
+                current_rank += 1
+            else:
+                # Tied locations share one rank but consume one position each.
+                rank = (len(lines)) / 2 + (current_rank - 1)
+                current_rank += len(lines)
+            self.ranks[rank] = lines
+            for line in lines:
+                # Combine with the previously stored *suspiciousness* of this
+                # location; looking it up in ``self.locations`` would feed a
+                # rank into ``metric`` instead.
+                self.suspiciousness[line] = metric(
+                    suggestion.suspiciousness,
+                    self.suspiciousness.get(line, default_suspiciousness),
+                )
+                # NOTE: a location listed by several suggestions ends up with
+                # the rank of the last suggestion processed.
+                self.locations[line] = rank
+        self.number_of_locations = len(self.locations)
+
+    def top_n(
+        self,
+        faulty: Set[Location],
+        n: int,
+        scenario: Optional[Scenario] = None,
+        repeat: int = 1000,
+    ) -> float:
+        """Score the first ``n`` ranked locations against ``faulty``.
+
+        Whole suggestions are collected in ranking order until at least ``n``
+        distinct locations are gathered. If more than ``n`` locations were
+        gathered, the score is the mean of ``_top_n`` over ``repeat`` random
+        samples of ``n`` of the gathered locations (uses the global ``random``
+        state); otherwise ``_top_n`` is evaluated once on all of them.
+
+        Args:
+            faulty: The locations known to be faulty.
+            n: Number of top locations to inspect.
+            scenario: Scoring scenario, see ``_top_n``.
+            repeat: Number of random samples drawn when sampling is needed.
+
+        Returns:
+            The (possibly averaged) score.
+        """
+        top_n_locations = list()
+        # Set mirror of the list so the duplicate check stays O(1).
+        seen = set()
+        for suggestion in self.suggestions:
+            if len(top_n_locations) >= n:
+                break
+            for line in suggestion.lines:
+                if line not in seen:
+                    seen.add(line)
+                    top_n_locations.append(line)
+        if len(top_n_locations) <= n:
+            return self._top_n(faulty, top_n_locations, scenario)
+        sum_ = 0
+        for _ in range(repeat):
+            sum_ += self._top_n(
+                faulty, random.sample(top_n_locations, k=n), scenario
+            )
+        return sum_ / repeat
+
+    @staticmethod
+    def _top_n(
+        faulty: Set[Location],
+        top_n_locations: List[Location],
+        scenario: Optional[Scenario] = None,
+    ) -> float:
+        """Score one list of inspected locations against ``faulty``.
+
+        Returns, depending on ``scenario``:
+            * ``BEST_CASE``: 1 if at least one faulty location is listed, else 0.
+            * ``WORST_CASE``: the fraction of faulty locations that are listed.
+            * ``AVG_CASE``: the hits relative to half of the faulty locations,
+              capped at 1.
+            * ``None``: the fraction of listed locations that are faulty.
+
+        With no faulty locations or no listed locations nothing can be hit,
+        so the score is 0.0 (this also avoids dividing by zero).
+        """
+        if not faulty or not top_n_locations:
+            return 0.0
+        found = len(faulty.intersection(top_n_locations))
+        if scenario == Scenario.BEST_CASE:
+            return 1 if found > 0 else 0
+        elif scenario == Scenario.WORST_CASE:
+            return found / len(faulty)
+        elif scenario == Scenario.AVG_CASE:
+            return min(found / (len(faulty) / 2), 1)
+        else:
+            return found / len(top_n_locations)
+
+    def get_rank(
+        self, faulty: Set[Location], scenario: Optional[Scenario] = None
+    ) -> float:
+        """Return the rank of the faulty locations under ``scenario``.
+
+        ``BEST_CASE`` yields the smallest and ``WORST_CASE`` the largest rank
+        of the faulty locations; ``AVG_CASE`` yields the element at index
+        ``max(len(faulty) // 2 - 1, 0)`` of their sorted ranks; without a
+        scenario the mean rank is returned.
+
+        Raises:
+            KeyError: If a faulty location was never ranked.
+        """
+        if scenario == Scenario.BEST_CASE:
+            rank = min(self.locations[location] for location in faulty)
+        elif scenario == Scenario.WORST_CASE:
+            rank = max(self.locations[location] for location in faulty)
+        elif scenario == Scenario.AVG_CASE:
+            rank = sorted([self.locations[location] for location in faulty])[
+                max(len(faulty) // 2 - 1, 0)
+            ]
+        else:
+            rank = sum(self.locations[location] for location in faulty) / len(faulty)
+        return rank
+
+    def exam(self, faulty: Set[Location], scenario: Optional[Scenario] = None) -> float:
+        """Return ``get_rank`` divided by the number of ranked locations."""
+        return self.get_rank(faulty, scenario) / self.number_of_locations
+
+    def wasted_effort(
+        self, faulty: Set[Location], scenario: Optional[Scenario] = None
+    ) -> float:
+        """Return the rank from ``get_rank`` as the wasted-effort score."""
+        return self.get_rank(faulty, scenario)
diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py
@@ -0,0 +1,166 @@
+import random
+from typing import Optional
+
+from sflkit.analysis.suggestion import Suggestion, Location
+from sflkit.evaluation import Rank, Scenario
+from utils import BaseTest
+
+
+class TestEvaluation(BaseTest):
+    """Exercises the ``Rank`` metrics on a small, fixed set of suggestions."""
+
+    def setUp(self):
+        # A fixed seed keeps the random sampling inside ``Rank.top_n`` stable.
+        random.seed(0)
+        # Suspiciousness of lines 1..10 of ``a.py``, in line order.
+        scores = [0.5, 0.3, 0.7, 0.1, 0.9, 0.2, 0.8, 0.4, 0.6, 0.0]
+        self.suggestions = [
+            Suggestion([Location("a.py", number)], score)
+            for number, score in enumerate(scores, start=1)
+        ]
+
+    def get_rank(self, multi: bool = False):
+        """Return a ``Rank`` plus the faulty locations used by every test.
+
+        With ``multi`` a suggestion holding three tied locations is appended.
+        """
+        suggestions = self.suggestions
+        if multi:
+            tied = Suggestion(
+                [Location("a.py", 11), Location("a.py", 12), Location("a.py", 13)],
+                0.55,
+            )
+            suggestions = suggestions + [tied]
+        ranking = Rank(suggestions)
+        faulty = {Location("a.py", number) for number in (2, 6, 7, 8)}
+        return ranking, faulty
+
+    def get_top_n(
+        self, n: int, scenario: Optional[Scenario] = None, multi: bool = False
+    ):
+        """Evaluate ``Rank.top_n`` with 10000 sampling repetitions."""
+        ranking, faulty = self.get_rank(multi=multi)
+        return ranking.top_n(faulty, n, scenario=scenario, repeat=10000)
+
+    def get_exam(self, scenario: Optional[Scenario] = None):
+        """Evaluate ``Rank.exam`` on the ranking that includes the tie."""
+        ranking, faulty = self.get_rank(multi=True)
+        return ranking.exam(faulty, scenario=scenario)
+
+    def get_wasted_effort(self, scenario: Optional[Scenario] = None):
+        """Evaluate ``Rank.wasted_effort`` on the ranking that includes the tie."""
+        ranking, faulty = self.get_rank(multi=True)
+        return ranking.wasted_effort(faulty, scenario=scenario)
+
+    # --- top-n without a scenario ---
+
+    def test_top_1(self):
+        self.assertAlmostEqual(0, self.get_top_n(1), delta=self.DELTA)
+
+    def test_top_5(self):
+        self.assertAlmostEqual(0.2, self.get_top_n(5), delta=self.DELTA)
+
+    def test_top_10(self):
+        self.assertAlmostEqual(0.4, self.get_top_n(10), delta=self.DELTA)
+
+    def test_top_5_multi(self):
+        self.assertAlmostEqual(0.14285, self.get_top_n(5, multi=True), delta=0.05)
+
+    # --- top-n, average case ---
+
+    def test_top_1_avg(self):
+        self.assertAlmostEqual(
+            0, self.get_top_n(1, scenario=Scenario.AVG_CASE), delta=self.DELTA
+        )
+
+    def test_top_5_avg(self):
+        self.assertAlmostEqual(
+            0.5, self.get_top_n(5, scenario=Scenario.AVG_CASE), delta=self.DELTA
+        )
+
+    def test_top_10_avg(self):
+        self.assertAlmostEqual(
+            1, self.get_top_n(10, scenario=Scenario.AVG_CASE), delta=self.DELTA
+        )
+
+    def test_top_5_avg_multi(self):
+        self.assertAlmostEqual(
+            0.35714,
+            self.get_top_n(5, scenario=Scenario.AVG_CASE, multi=True),
+            delta=0.005,
+        )
+
+    # --- top-n, best case ---
+
+    def test_top_1_best(self):
+        self.assertAlmostEqual(
+            0, self.get_top_n(1, scenario=Scenario.BEST_CASE), delta=self.DELTA
+        )
+
+    def test_top_5_best(self):
+        self.assertAlmostEqual(
+            1, self.get_top_n(5, scenario=Scenario.BEST_CASE), delta=self.DELTA
+        )
+
+    def test_top_10_best(self):
+        self.assertAlmostEqual(
+            1, self.get_top_n(10, scenario=Scenario.BEST_CASE), delta=self.DELTA
+        )
+
+    def test_top_5_best_multi(self):
+        self.assertAlmostEqual(
+            0.71429,
+            self.get_top_n(5, scenario=Scenario.BEST_CASE, multi=True),
+            delta=0.005,
+        )
+
+    # --- top-n, worst case ---
+
+    def test_top_1_worst(self):
+        self.assertAlmostEqual(
+            0, self.get_top_n(1, scenario=Scenario.WORST_CASE), delta=self.DELTA
+        )
+
+    def test_top_5_worst(self):
+        self.assertAlmostEqual(
+            0.25, self.get_top_n(5, scenario=Scenario.WORST_CASE), delta=self.DELTA
+        )
+
+    def test_top_10_worst(self):
+        self.assertAlmostEqual(
+            1, self.get_top_n(10, scenario=Scenario.WORST_CASE), delta=self.DELTA
+        )
+
+    def test_top_5_worst_multi(self):
+        self.assertAlmostEqual(
+            0.17857,
+            self.get_top_n(5, scenario=Scenario.WORST_CASE, multi=True),
+            delta=0.005,
+        )
+
+    # --- EXAM score ---
+
+    def test_exam_avg(self):
+        self.assertAlmostEqual(
+            9 / 13, self.get_exam(scenario=Scenario.AVG_CASE), delta=self.DELTA
+        )
+
+    def test_exam_best(self):
+        self.assertAlmostEqual(
+            2 / 13, self.get_exam(scenario=Scenario.BEST_CASE), delta=self.DELTA
+        )
+
+    def test_exam_worst(self):
+        self.assertAlmostEqual(
+            11 / 13, self.get_exam(scenario=Scenario.WORST_CASE), delta=self.DELTA
+        )
+
+    def test_exam(self):
+        self.assertAlmostEqual(
+            (2 + 9 + 10 + 11) / (13 * 4), self.get_exam(), delta=self.DELTA
+        )
+
+    # --- wasted effort ---
+
+    def test_wasted_effort_avg(self):
+        self.assertAlmostEqual(
+            9, self.get_wasted_effort(scenario=Scenario.AVG_CASE), delta=self.DELTA
+        )
+
+    def test_wasted_effort_best(self):
+        self.assertAlmostEqual(
+            2, self.get_wasted_effort(scenario=Scenario.BEST_CASE), delta=self.DELTA
+        )
+
+    def test_wasted_effort_worst(self):
+        self.assertAlmostEqual(
+            11, self.get_wasted_effort(scenario=Scenario.WORST_CASE), delta=self.DELTA
+        )
+
+    def test_wasted_effort(self):
+        self.assertAlmostEqual(
+            (2 + 9 + 10 + 11) / 4, self.get_wasted_effort(), delta=self.DELTA
+        )