Merge pull request #113 from dilyabareeva/output_format
Fix Output Structure
dilyabareeva authored Aug 21, 2024
2 parents 8e9cecc + 5cc6689 commit cf21d46
Showing 15 changed files with 83 additions and 70 deletions.
2 changes: 1 addition & 1 deletion quanda/metrics/localization/class_detection.py
@@ -38,7 +38,7 @@ def compute(self):
"""
Used to aggregate current results and return a metric score.
"""
- return torch.cat(self.scores).mean()
+ return {"score": torch.cat(self.scores).mean().item()}

def reset(self, *args, **kwargs):
"""
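With this change, ClassDetectionMetric.compute() returns a dictionary rather than a bare tensor, so callers index into "score". A minimal usage sketch (not part of the diff; the toy model, dataset, and tensor shapes are illustrative assumptions, while the constructor and update() keywords mirror the tests further below):

import torch
from quanda.metrics.localization.class_detection import ClassDetectionMetric

model = torch.nn.Linear(8, 3)  # stand-in classifier, purely for illustration
train_set = torch.utils.data.TensorDataset(torch.randn(10, 8), torch.randint(0, 3, (10,)))

metric = ClassDetectionMetric(model=model, train_dataset=train_set, device="cpu")
# One attribution row per test sample over the 10 training points (assumed layout).
metric.update(test_labels=torch.randint(0, 3, (4,)), explanations=torch.randn(4, 10))

result = metric.compute()
print(result["score"])  # plain Python float; before this PR compute() returned a 0-dim tensor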
2 changes: 1 addition & 1 deletion quanda/metrics/localization/mislabeling_detection.py
@@ -86,7 +86,7 @@ def compute(self, *args, **kwargs):
normalized_curve = torch.cumsum(success_arr * 1.0, dim=0) / len(self.poisoned_indices)
score = torch.trapezoid(normalized_curve) / len(self.poisoned_indices)
return {
"success_arr": success_arr,
"score": score.item(),
"success_arr": success_arr,
"curve": normalized_curve / len(self.poisoned_indices),
}
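This compute() already returned a dictionary; the change promotes "score" to the first key. A short sketch of how a caller unpacks the result (not from the PR; metric stands for an already-updated mislabeling-detection metric, whose class name is not shown in this hunk):

# Hypothetical consumer of the mislabeling-detection output.
result = metric.compute()
score = result["score"]              # scalar summary (torch.trapezoid over the curve, as computed above)
success_arr = result["success_arr"]  # raw per-step success indicators
curve = result["curve"]              # the (re)normalized cumulative detection curve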
4 changes: 2 additions & 2 deletions quanda/metrics/randomization/model_randomization.py
@@ -84,8 +84,8 @@ def explain_update(
)
self.update(test_data=test_data, explanations=explanations, explanation_targets=explanation_targets)

- def compute(self) -> float:
- return torch.cat(self.results["scores"]).mean().item()
+ def compute(self):
+ return {"score": torch.cat(self.results["scores"]).mean().item()}

def reset(self):
self.results = {"scores": []}
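Here compute() drops its -> float annotation and wraps the mean of the accumulated scores in a dictionary. A self-contained stub that mirrors the compute()/reset() bodies above, so the new call pattern can be run without the full quanda setup (the class and the fake per-batch scores are invented for illustration):

import torch

class RandomizationMetricStub:  # invented stand-in; the real class name is not shown in this hunk
    def __init__(self):
        self.reset()

    def reset(self):
        self.results = {"scores": []}

    def compute(self):
        return {"score": torch.cat(self.results["scores"]).mean().item()}

m = RandomizationMetricStub()
m.results["scores"].append(torch.tensor([0.2, -0.1, 0.4]))  # pretend per-batch scores
out = m.compute()
assert -1.0 <= out["score"] <= 1.0  # the range asserted by tests/metrics/test_randomization_metrics.py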
2 changes: 1 addition & 1 deletion quanda/metrics/unnamed/dataset_cleaning.py
@@ -151,4 +151,4 @@ def compute(self, *args, **kwargs):

clean_accuracy = class_accuracy(self.model, clean_dl, self.device)

- return original_accuracy - clean_accuracy
+ return {"score": (original_accuracy - clean_accuracy)}
2 changes: 1 addition & 1 deletion quanda/metrics/unnamed/top_k_overlap.py
@@ -27,7 +27,7 @@ def update(
self.all_top_k_examples = torch.concat((self.all_top_k_examples, top_k_indices), dim=0)

def compute(self, *args, **kwargs):
- return len(torch.unique(self.all_top_k_examples))
+ return {"score": len(torch.unique(self.all_top_k_examples))}

def reset(self, *args, **kwargs):
self.all_top_k_examples = torch.empty(0, self.top_k)
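The score is the number of distinct training examples among the accumulated top-k indices (all_top_k_examples is built in update() above by concatenating one row of top-k indices per explained test sample). A small self-contained illustration with made-up indices:

import torch

all_top_k_examples = torch.tensor([[0, 4, 7], [4, 7, 9]])  # made-up top_k=3 rows for two test samples
result = {"score": len(torch.unique(all_top_k_examples))}
print(result)  # {'score': 4} -- the distinct training indices are 0, 4, 7 and 9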
63 changes: 30 additions & 33 deletions tests/explainers/wrappers/test_captum_influence.py
@@ -168,26 +168,24 @@ def test_captum_influence_explain_functional(
{"batch_size": 1, "projection_dim": 10, "arnoldi_dim": 10},
),
(
"mnist",
"load_mnist_model",
"load_mnist_dataset",
"load_mnist_test_samples_1",
"load_mnist_test_labels_1",
{
"batch_size": 1,
"projection_dim": 10,
"arnoldi_dim": 20,
"arnoldi_tol": 2e-1,
"hessian_reg": 2e-3,
"hessian_inverse_tol": 2e-4,
"projection_on_cpu": True,
},
"mnist",
"load_mnist_model",
"load_mnist_dataset",
"load_mnist_test_samples_1",
"load_mnist_test_labels_1",
{
"batch_size": 1,
"projection_dim": 10,
"arnoldi_dim": 20,
"arnoldi_tol": 2e-1,
"hessian_reg": 2e-3,
"hessian_inverse_tol": 2e-4,
"projection_on_cpu": True,
},
),
],
)
- def test_captum_arnoldi(
- test_id, model, dataset, test_tensor, test_labels, method_kwargs, request
- ):
+ def test_captum_arnoldi(test_id, model, dataset, test_tensor, test_labels, method_kwargs, request):
model = request.getfixturevalue(model)
dataset = request.getfixturevalue(dataset)
test_tensor = request.getfixturevalue(test_tensor)
@@ -236,21 +234,21 @@ def test_captum_arnoldi(
},
),
(
"mnist",
"load_mnist_model",
"load_mnist_dataset",
"load_mnist_test_samples_1",
"load_mnist_test_labels_1",
{
"batch_size": 1,
"seed": 42,
"projection_dim": 10,
"arnoldi_dim": 20,
"arnoldi_tol": 1e-1,
"hessian_reg": 1e-3,
"hessian_inverse_tol": 1e-4,
"projection_on_cpu": True,
},
"mnist",
"load_mnist_model",
"load_mnist_dataset",
"load_mnist_test_samples_1",
"load_mnist_test_labels_1",
{
"batch_size": 1,
"seed": 42,
"projection_dim": 10,
"arnoldi_dim": 20,
"arnoldi_tol": 1e-1,
"hessian_reg": 1e-3,
"hessian_inverse_tol": 1e-4,
"projection_on_cpu": True,
},
),
],
)
@@ -263,7 +261,6 @@ def test_captum_arnoldi_explain_functional(
test_labels = request.getfixturevalue(test_labels)
hessian_dataset = torch.utils.data.Subset(dataset, [0, 1])

-
explainer_captum = ArnoldiInfluenceFunction(
model=model,
train_dataset=dataset,
14 changes: 8 additions & 6 deletions tests/metrics/test_localization_metrics.py
@@ -1,3 +1,5 @@
+ import math
+
import pytest

from quanda.explainers import SumAggregator
@@ -41,13 +43,13 @@ def test_identical_class_metrics(
tda = request.getfixturevalue(explanations)
metric = ClassDetectionMetric(model=model, train_dataset=dataset, device="cpu")
metric.update(test_labels=test_labels, explanations=tda)
- score = metric.compute()
+ score = metric.compute()["score"]
# TODO: introduce a more meaningfull test, where the score is not zero
# Note from Galip:
# one idea could be: a random attributor should get approximately 1/( # of classes).
# With a big test dataset, the probability of failing a truly random test
# should diminish.
- assert score == expected_score
+ assert math.isclose(score, expected_score, abs_tol=0.00001)


@pytest.mark.localization_metrics
@@ -89,8 +91,8 @@ def test_identical_subclass_metrics(
device="cpu",
)
metric.update(test_subclasses=test_labels, explanations=tda)
- score = metric.compute()
- assert score == expected_score
+ score = metric.compute()["score"]
+ assert math.isclose(score, expected_score, abs_tol=0.00001)


@pytest.mark.localization_metrics
@@ -158,6 +160,6 @@ def test_poisoning_detection_metric(
expl_kwargs=expl_kwargs,
device="cpu",
)
- score = metric.compute()
+ score = metric.compute()["score"]

- assert score["score"] == expected_score
+ assert math.isclose(score, expected_score, abs_tol=0.00001)
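The switch from == to math.isclose reflects that the metrics now return Python floats produced via .item() and tensor arithmetic, where bit-exact equality is unreliable. A self-contained demonstration of the tolerance used throughout these tests (the numbers are invented):

import math
import torch

expected_score = 0.3
score = torch.tensor([0.1, 0.2, 0.3, 0.4, 0.5]).mean().item()  # float32 arithmetic: close to, but not exactly, 0.3
assert score != expected_score                                 # exact comparison fails
assert math.isclose(score, expected_score, abs_tol=0.00001)    # the tolerance used in the updated tests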
2 changes: 1 addition & 1 deletion tests/metrics/test_randomization_metrics.py
@@ -94,7 +94,7 @@ def test_randomization_metric(
else:
metric.update(test_data=test_data, explanations=tda, explanation_targets=test_labels)

- out = metric.compute()
+ out = metric.compute()["score"]
assert (out >= -1.0) & (out <= 1.0), "Test failed."


18 changes: 10 additions & 8 deletions tests/metrics/test_unnamed_metrics.py
@@ -1,3 +1,5 @@
+ import math
+
import pytest

from quanda.explainers.wrappers.captum_influence import CaptumSimilarity
@@ -37,8 +39,8 @@ def test_top_k_overlap_metrics(
explanations = request.getfixturevalue(explanations)
metric = TopKOverlapMetric(model=model, train_dataset=dataset, top_k=top_k, device="cpu")
metric.update(explanations=explanations)
- score = metric.compute()
- assert score == expected_score
+ score = metric.compute()["score"]
+ assert math.isclose(score, expected_score, abs_tol=0.00001)


@pytest.mark.unnamed_metrics
@@ -135,9 +137,9 @@ def test_dataset_cleaning(
device="cpu",
)

- score = metric.compute()
+ score = metric.compute()["score"]

- assert score == expected_score
+ assert math.isclose(score, expected_score, abs_tol=0.00001)


@pytest.mark.unnamed_metrics
@@ -202,9 +204,9 @@ def test_dataset_cleaning_self_influence_based(
device="cpu",
)

- score = metric.compute()
+ score = metric.compute()["score"]

- assert score == expected_score
+ assert math.isclose(score, expected_score, abs_tol=0.00001)


@pytest.mark.unnamed_metrics
@@ -263,6 +265,6 @@ def test_dataset_cleaning_aggr_based(

metric.update(explanations=explanations)

- score = metric.compute()
+ score = metric.compute()["score"]

- assert score == expected_score
+ assert math.isclose(score, expected_score, abs_tol=0.00001)
6 changes: 4 additions & 2 deletions tests/toy_benchmarks/localization/test_class_detection.py
@@ -1,3 +1,5 @@
+ import math
+
import pytest

from quanda.explainers.wrappers import CaptumSimilarity
@@ -112,6 +114,6 @@ def test_class_detection(
model_id="default_model_id",
batch_size=batch_size,
device="cpu",
- )
+ )["score"]

- assert score == expected_score
+ assert math.isclose(score, expected_score, abs_tol=0.00001)
6 changes: 4 additions & 2 deletions tests/toy_benchmarks/localization/test_mislabeling_detection.py
@@ -1,3 +1,5 @@
+ import math
+
import lightning as L
import pytest

@@ -166,7 +168,7 @@ def test_mislabeling_detection(
device="cpu",
)["score"]

- assert score == expected_score
+ assert math.isclose(score, expected_score, abs_tol=0.00001)


@pytest.mark.toy_benchmarks
@@ -240,4 +242,4 @@ def test_mislabeling_detection_generate_from_pl_module(
device="cpu",
)["score"]

- assert score == expected_score
+ assert math.isclose(score, expected_score, abs_tol=0.00001)
10 changes: 6 additions & 4 deletions tests/toy_benchmarks/localization/test_subclass_detection.py
@@ -1,3 +1,5 @@
+ import math
+
import lightning as L
import pytest

@@ -152,9 +154,9 @@ def test_subclass_detection(
use_predictions=use_pred,
batch_size=batch_size,
device="cpu",
- )
+ )["score"]

- assert score == expected_score
+ assert math.isclose(score, expected_score, abs_tol=0.00001)


@pytest.mark.toy_benchmarks
@@ -228,6 +230,6 @@ def test_subclass_detection_generate_lightning_model(
use_predictions=use_pred,
batch_size=batch_size,
device="cpu",
- )
+ )["score"]

- assert score == expected_score
+ assert math.isclose(score, expected_score, abs_tol=0.00001)
6 changes: 4 additions & 2 deletions tests/toy_benchmarks/randomization/test_model_randomization.py
@@ -1,3 +1,5 @@
+ import math
+
import pytest

from quanda.explainers.wrappers import CaptumSimilarity
@@ -112,6 +114,6 @@ def test_model_randomization(
model_id="default_model_id",
batch_size=batch_size,
device="cpu",
- )
+ )["score"]

- assert score == expected_score
+ assert math.isclose(score, expected_score, abs_tol=0.00001)
10 changes: 6 additions & 4 deletions tests/toy_benchmarks/unnamed/test_dataset_cleaning.py
@@ -1,3 +1,5 @@
+ import math
+
import lightning as L
import pytest

@@ -146,9 +148,9 @@ def test_dataset_cleaning(
global_method=global_method,
batch_size=batch_size,
device="cpu",
- )
+ )["score"]

- assert score == expected_score
+ assert math.isclose(score, expected_score, abs_tol=0.00001)


@pytest.mark.toy_benchmarks
@@ -218,6 +220,6 @@ def test_dataset_cleaning_generate_from_pl_module(
global_method=global_method,
batch_size=batch_size,
device="cpu",
- )
+ )["score"]

- assert score == expected_score
+ assert math.isclose(score, expected_score, abs_tol=0.00001)
6 changes: 4 additions & 2 deletions tests/toy_benchmarks/unnamed/test_top_k_overlap.py
@@ -1,3 +1,5 @@
+ import math
+
import pytest

from quanda.explainers.wrappers import CaptumSimilarity
@@ -136,6 +138,6 @@ def test_class_detection(
model_id="default_model_id",
batch_size=batch_size,
device="cpu",
- )
+ )["score"]

- assert score == expected_score
+ assert math.isclose(score, expected_score, abs_tol=0.00001)
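The toy-benchmark tests follow the same pattern: the benchmark's evaluation call now yields a dictionary that is indexed with ["score"] before the tolerance check. A runnable sketch using a fake benchmark object (the evaluate name, its keyword arguments, and the returned value are invented stand-ins; only the call pattern mirrors the diff):

import math

class FakeBenchmark:  # invented stand-in for a quanda toy benchmark
    def evaluate(self, model_id="default_model_id", batch_size=8, device="cpu"):
        return {"score": 0.4999999999}  # invented value

score = FakeBenchmark().evaluate(model_id="default_model_id", batch_size=8, device="cpu")["score"]
assert math.isclose(score, 0.5, abs_tol=0.00001)  # the comparison style used by the updated tests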
