Commit
tests passing, most todos removed
annahedstroem committed Dec 7, 2023
1 parent 79cf6d8 commit fb3d5ef
Showing 8 changed files with 107 additions and 126 deletions.
4 changes: 0 additions & 4 deletions CONTRIBUTING.md
@@ -89,11 +89,7 @@ It is possible to limit the scope of testing to specific sections of the codebase
Faithfulness metrics using python3.9 (make sure the python versions match in your environment):

```bash
<<<<<<< HEAD
python3 -m tox run -e py39 -- -m faithfulness -s
=======
python3 -m tox run -e py39 -- -m evaluate_func -s
>>>>>>> c33f4039e2f53332a8eff2207cdbe6686600e67c
```

For a complete overview of the possible testing scopes, please refer to `pytest.ini`.
8 changes: 6 additions & 2 deletions quantus/helpers/constants.py
@@ -74,11 +74,15 @@
},
}

# Perturbation steps with 'masking', based on attribution order/ ranking.
# Quantus metrics that include a step-wise 'masking'/perturbation that is
# based on attribution order/ranking (and not magnitude).
AVAILABLE_INVERSE_ESTIMATION_METRICS = {
"Pixel-Flipping": PixelFlipping,
"Region Perturbation": RegionPerturbation,
"Region Perturbation": RegionPerturbation, # order = 'morf'
"ROAD": ROAD, # return_only_values = True
"Selectivity": Selectivity,
}
#

AVAILABLE_PERTURBATION_FUNCTIONS = {
"baseline_replacement_by_indices": baseline_replacement_by_indices,
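The registry above lists the metrics whose perturbation order follows the attribution ranking, which is what makes negating the attributions a valid way to invert the experiment. Below is a minimal sketch of consuming it; the class name `InverseEstimation`, the `metric_init` keyword, and `disable_warnings` are taken from other files in this commit or assumed, not guaranteed by this hunk alone.

```python
# Sketch only: run every ranking-based metric through the inverse-estimation wrapper.
# Import paths and constructor defaults are assumptions.
from quantus.helpers.constants import AVAILABLE_INVERSE_ESTIMATION_METRICS
from quantus.metrics.inverse_estimation import InverseEstimation


def run_inverse_suite(model, x_batch, y_batch, a_batch):
    """Return {metric name: inverse-estimation scores} for one batch."""
    results = {}
    for name, metric_cls in AVAILABLE_INVERSE_ESTIMATION_METRICS.items():
        metric = metric_cls(disable_warnings=True)
        wrapper = InverseEstimation(metric_init=metric)
        results[name] = wrapper(
            model=model, x_batch=x_batch, y_batch=y_batch, a_batch=a_batch
        )
    return results
```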
1 change: 1 addition & 0 deletions quantus/metrics/faithfulness/infidelity.py
@@ -331,6 +331,7 @@ def evaluate_instance(

for i_x, top_left_x in enumerate(range(0, x.shape[1], patch_size)):
for i_y, top_left_y in enumerate(range(0, x.shape[2], patch_size)):

# Perturb input patch-wise.
x_perturbed_pad = utils._pad_array(
x_perturbed, pad_width, mode="edge", padded_axes=self.a_axes
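For context on the loop shown above: the two `enumerate(range(...))` calls walk the input in non-overlapping patches of `patch_size` along height and width. A small standalone illustration with example shapes (not the metric's defaults):

```python
# Illustrative only: enumerate patch indices and their top-left corners the same
# way as the nested loops in evaluate_instance (example shapes, not defaults).
import numpy as np

x = np.zeros((3, 224, 224))  # (channels, height, width)
patch_size = 56

coords = [
    (i_x, i_y, top_left_x, top_left_y)
    for i_x, top_left_x in enumerate(range(0, x.shape[1], patch_size))
    for i_y, top_left_y in enumerate(range(0, x.shape[2], patch_size))
]
print(len(coords))  # 16 patches: a 4 x 4 grid of 56 x 56 patches
```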
7 changes: 2 additions & 5 deletions quantus/metrics/faithfulness/pixel_flipping.py
@@ -287,11 +287,8 @@ def evaluate_instance(
# Reshape attributions.
a = a.flatten()

# Get indices of sorted attributions.
if self.inverse_estimation is None or self.inverse_estimation is False:
a_indices = np.argsort(-a) # Order is descending.
elif self.inverse_estimation is True:
a_indices = np.argsort(a) # Order is ascending.
# Get indices of sorted attributions (descending).
a_indices = np.argsort(-a)

# Prepare lists.
n_perturbations = len(range(0, len(a_indices), self.features_in_step))
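This simplification works because the wrapper now passes negated attributions instead of toggling a flag: sorting the negated map "descending" yields the same order as sorting the original map ascending. A quick NumPy check (illustrative, not part of the diff):

```python
import numpy as np

a = np.array([0.2, 0.9, 0.1, 0.5])

# Order used by the metric itself: most important feature first.
descending = np.argsort(-a)           # -> [1, 3, 0, 2]

# What happens when the inverse-estimation wrapper hands in -a instead of a flag.
a_inv = -a
inverse_order = np.argsort(-a_inv)    # the metric still "sorts descending"

assert np.array_equal(inverse_order, np.argsort(a))  # == ascending order of a
```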
7 changes: 0 additions & 7 deletions quantus/metrics/faithfulness/region_perturbation.py
@@ -76,7 +76,6 @@ def __init__(
perturb_func: Optional[Callable] = None,
perturb_baseline: str = "black",
perturb_func_kwargs: Optional[Dict[str, Any]] = None,
inverse_estimation: Optional[bool] = None,
return_aggregate: bool = False,
aggregate_func: Optional[Callable] = None,
default_plot_func: Optional[Callable] = None,
@@ -147,7 +146,6 @@ def __init__(
self.patch_size = patch_size
self.order = order.lower()
self.regions_evaluation = regions_evaluation
self.inverse_estimation = inverse_estimation
self.perturb_func = make_perturb_func(
perturb_func, perturb_func_kwargs, perturb_baseline=perturb_baseline
)
@@ -343,11 +341,6 @@ def evaluate_instance(
)
patches.append(patch_slice)

if self.inverse_estimation == True:
self.order = "lerf"
elif self.inverse_estimation == False:
self.order = "morf"

if self.order == "random":
# Order attributions randomly.
order = np.arange(len(patches))
17 changes: 14 additions & 3 deletions quantus/metrics/faithfulness/road.py
@@ -65,6 +65,7 @@ def __init__(
self,
percentages: Optional[List[float]] = None,
noise: float = 0.01,
return_only_values: Optional[bool] = None,
abs: bool = False,
normalise: bool = True,
normalise_func: Optional[Callable[[np.ndarray], np.ndarray]] = None,
@@ -81,8 +82,12 @@
"""
Parameters
----------
percentages (list): The list of percentages of the image to be removed, default=list(range(1, 100, 2)).
noise (noise): Noise added, default=0.01.
percentages: list of ints
The list of percentages of the image to be removed, default=list(range(1, 100, 2)).
noise: float
Noise added, default=0.01.
return_only_values: bool
Indicates whether only the evaluation scores (a list of floats) are returned, rather than the dictionary that also includes the percentages, default=None.
abs: boolean
Indicates whether absolute operation is applied on the attribution, default=False.
normalise: boolean
@@ -131,9 +136,11 @@ def __init__(
perturb_func = noisy_linear_imputation

self.percentages = percentages
self.noise = noise
self.return_only_values = return_only_values
self.a_size = None
self.perturb_func = make_perturb_func(
perturb_func, perturb_func_kwargs, noise=noise
perturb_func, perturb_func_kwargs, noise=self.noise
)

# Asserts and warnings.
@@ -335,6 +342,10 @@ def custom_postprocess(
for p_ix, percentage in enumerate(self.percentages)
}

# Return only the evaluation scores (and not percentages).
if self.return_only_values:
self.evaluation_scores = list(self.evaluation_scores.values())

def evaluate_batch(
self,
model: ModelInterface,
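A hedged usage sketch of the new flag: `percentages`, `noise`, and `return_only_values` come from the constructor above, while the top-level export `quantus.ROAD` and the placeholder inputs are assumptions.

```python
# Sketch only: ROAD returning a flat list of scores instead of the
# percentage-keyed dictionary assembled in custom_postprocess.
from quantus import ROAD

road = ROAD(
    percentages=list(range(1, 100, 2)),
    noise=0.01,
    return_only_values=True,
)

# scores = road(model=model, x_batch=x_batch, y_batch=y_batch, a_batch=a_batch)
# With return_only_values=True, evaluation_scores becomes list(dict.values()):
# plain per-percentage scores that the inverse-estimation wrapper can subtract
# elementwise, instead of the dictionary that also carries the percentages.
```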
95 changes: 31 additions & 64 deletions quantus/metrics/inverse_estimation.py
@@ -104,12 +104,7 @@ def __init__(
# TODO. Create specific plot.
default_plot_func = plotting.plot_pixel_flipping_experiment

abs = metric_init.abs
normalise = metric_init.normalise
return_aggregate = metric_init.return_aggregate
aggregate_func = metric_init.aggregate_func
display_progressbar = metric_init.display_progressbar
disable_warnings = metric_init.disable_warnings
self.return_aggregate = return_aggregate

super().__init__(
abs=abs,
@@ -125,9 +120,10 @@
)

# Asserts and warnings.
# assert hasattr(
# metric_init, "inverse_estimation"
# ), "The metric must have 'inverse_estimation' (bool) attribute"
if metric_init.name == "ROAD":
metric_init.return_only_values = True
if metric_init.name == "Region-Perturbation":
metric_init.order = "morf"

# TODO. Update warnings.
if not self.disable_warnings:
@@ -138,8 +134,6 @@
)

self.metric_init = metric_init
self.all_evaluation_scores_meta = []
self.all_evaluation_scores_meta_inverse = []

def __call__(
self,
@@ -231,17 +225,18 @@ def __call__(
>>> metric = Metric(abs=True, normalise=False)
>>> scores = metric(model=model, x_batch=x_batch, y_batch=y_batch, a_batch=a_batch_saliency)
"""
if self.metric_init.return_aggregate != False:
if self.metric_init.return_aggregate:
print(
"The metric is not designed to return an aggregate score, setting return_aggregate=False."
)
self.metric_init.return_aggregate = False

# TODO. Check compatibility with different normalisation functions.
# TODO. Check compatibility with different batch sizes (think ok).
# TODO. Check that we use "most important feature first" type.
assert (
a_batch is not None
), "'a_batch' must be provided to run the inverse estimation."

# TODO. Implement ranking assumption. see: https://github.com/annahedstroem/eval-project/blob/febe271a78c6efc16a51372ab58fcba676e0eb88/src/xai_faithfulness_experiments_lib_edits.py#L403
# TODO. Do we want to turn the attributions to rankings?
# See: https://github.com/annahedstroem/eval-project/blob/febe271a78c6efc16a51372ab58fcba676e0eb88/src/xai_faithfulness_experiments_lib_edits.py#L403
self.scores = self.metric_init(
model=model,
x_batch=x_batch,
@@ -261,15 +256,12 @@ def __call__(
"To run the inverse estimation, the number of evaluation scores "
"must match the number of instances in the batch."
)
self.all_evaluation_scores_meta.extend(self.metric_init.evaluation_scores)

# Empty the evaluation scores before re-scoring with the metric.
self.metric_init.evaluation_scores = []

# Run inverse experiment.
# TODO. Check if the metric only relies on ordering.
a_batch_inv = -np.array(a_batch) / np.min(
-np.array(a_batch)
) # [1, 2, 3], [-1, -2, -3]
a_batch_inv = -np.array(a_batch) # / np.min(-np.array(a_batch))
self.scores_inv = self.metric_init(
model=model,
x_batch=x_batch,
@@ -285,54 +277,29 @@ def __call__(
model_predict_kwargs=model_predict_kwargs,
**kwargs,
)
self.all_evaluation_scores_meta_inverse.extend(
self.metric_init.evaluation_scores
)

# Compute the inverse, empty the evaluation scores again and overwrite with the inverse scores.
inv_scores = np.array(self.scores) - np.array(self.scores_inv)
self.metric_init.evaluation_scores = []
self.evaluation_scores.extend(inv_scores)

# TODO. If all_evaluation_scores is empty, overwrite with inverse scores
# for the those last samples (keep iterator). Or skip and throw a warning.
# if self.all_evaluation_scores:
# self.all_evaluation_scores[-1] = []
self.all_evaluation_scores.append(inv_scores)
inv_scores = (np.array(self.scores) - np.array(self.scores_inv)).tolist()
self.evaluation_scores = inv_scores

if self.return_aggregate:
    self.evaluation_scores = self.get_mean_score

self.all_evaluation_scores.extend(self.metric_init.evaluation_scores)

return inv_scores

def custom_postprocess(
    self,
    model: ModelInterface,
    x_batch: np.ndarray,
    y_batch: Optional[np.ndarray],
    a_batch: Optional[np.ndarray],
    s_batch: np.ndarray,
    **kwargs,
) -> None:
    """
    Post-process the evaluation results.

    Parameters
    ----------
    model: torch.nn.Module, tf.keras.Model
        A torch or tensorflow model e.g., torchvision.models that is subject to explanation.
    x_batch: np.ndarray
        A np.ndarray which contains the input data that are explained.
    y_batch: np.ndarray
        A np.ndarray which contains the output labels that are explained.
    a_batch: np.ndarray, optional
        A np.ndarray which contains pre-computed attributions i.e., explanations.
    s_batch: np.ndarray, optional
        A np.ndarray which contains segmentation masks that matches the input.

    Returns
    -------
    None
    """
    # TODO. Is this needed?
    pass

def convert_attributions_to_rankings(self):
pass

@property
def get_mean_score(self):
"""Calculate the area under the curve (AUC) score for several test samples."""
return np.mean(np.array(self.evaluation_scores), axis=1)

@property
def get_auc_score(self):
"""Calculate the area under the curve (AUC) score for several test samples."""
return np.mean(
[utils.calculate_auc(np.array(curve)) for curve in self.evaluation_scores]
)
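Taken together, the wrapper now returns the difference curve between the original run and the attribution-inverted run, and exposes the two aggregations above. A rough end-to-end sketch; the class name, the top-level `PixelFlipping` export, and the constructor values are assumptions.

```python
# Sketch only: wrap PixelFlipping in the inverse-estimation routine.
from quantus import PixelFlipping
from quantus.metrics.inverse_estimation import InverseEstimation

pf = PixelFlipping(features_in_step=28, disable_warnings=True)
inverse = InverseEstimation(metric_init=pf)

# inv_scores[i] is the per-step difference between the pixel-flipping curve for
# the original attributions and the curve for the negated attributions of sample i.
# inv_scores = inverse(model=model, x_batch=x_batch, y_batch=y_batch, a_batch=a_batch)

# Aggregations defined above:
# inverse.get_mean_score  -> one mean difference per sample
# inverse.get_auc_score   -> mean AUC of the difference curves across samples
```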
