diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9ba3485d..c77983dc 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -89,11 +89,7 @@ It is possible to limit the scope of testing to specific sections of the codebas Faithfulness metrics using python3.9 (make sure the python versions match in your environment): ```bash -<<<<<<< HEAD python3 -m tox run -e py39 -- -m faithfulness -s -======= -python3 -m tox run -e py39 -- -m evaluate_func -s ->>>>>>> c33f4039e2f53332a8eff2207cdbe6686600e67c ``` For a complete overview of the possible testing scopes, please refer to `pytest.ini`. diff --git a/quantus/helpers/constants.py b/quantus/helpers/constants.py index 7da2c790..b8ba74e0 100644 --- a/quantus/helpers/constants.py +++ b/quantus/helpers/constants.py @@ -74,11 +74,15 @@ }, } -# Perturbation steps with 'masking', based on attribution order/ ranking. +# Quantus metrics that include a step-wise 'masking'/ perturbation that is +# based on attribution order/ ranking (and not magnitude). AVAILABLE_INVERSE_ESTIMATION_METRICS = { "Pixel-Flipping": PixelFlipping, - "Region Perturbation": RegionPerturbation, + "Region Perturbation": RegionPerturbation, # order = 'morf' + "ROAD": ROAD, # return_only_values = True + "Selectivity": Selectivity, } +# AVAILABLE_PERTURBATION_FUNCTIONS = { "baseline_replacement_by_indices": baseline_replacement_by_indices, diff --git a/quantus/metrics/faithfulness/infidelity.py b/quantus/metrics/faithfulness/infidelity.py index baf48cb9..ef621a60 100644 --- a/quantus/metrics/faithfulness/infidelity.py +++ b/quantus/metrics/faithfulness/infidelity.py @@ -331,6 +331,7 @@ def evaluate_instance( for i_x, top_left_x in enumerate(range(0, x.shape[1], patch_size)): for i_y, top_left_y in enumerate(range(0, x.shape[2], patch_size)): + # Perturb input patch-wise. x_perturbed_pad = utils._pad_array( x_perturbed, pad_width, mode="edge", padded_axes=self.a_axes diff --git a/quantus/metrics/faithfulness/pixel_flipping.py b/quantus/metrics/faithfulness/pixel_flipping.py index 850ebad7..6f220c9c 100644 --- a/quantus/metrics/faithfulness/pixel_flipping.py +++ b/quantus/metrics/faithfulness/pixel_flipping.py @@ -287,11 +287,8 @@ def evaluate_instance( # Reshape attributions. a = a.flatten() - # Get indices of sorted attributions. - if self.inverse_estimation is None or self.inverse_estimation is False: - a_indices = np.argsort(-a) # Order is descending. - elif self.inverse_estimation is True: - a_indices = np.argsort(a) # Order is ascending. + # Get indices of sorted attributions (descending). + a_indices = np.argsort(-a) # Prepare lists. 
         n_perturbations = len(range(0, len(a_indices), self.features_in_step))
diff --git a/quantus/metrics/faithfulness/region_perturbation.py b/quantus/metrics/faithfulness/region_perturbation.py
index 6c95a035..6c65d14d 100644
--- a/quantus/metrics/faithfulness/region_perturbation.py
+++ b/quantus/metrics/faithfulness/region_perturbation.py
@@ -76,7 +76,6 @@ def __init__(
         perturb_func: Optional[Callable] = None,
         perturb_baseline: str = "black",
         perturb_func_kwargs: Optional[Dict[str, Any]] = None,
-        inverse_estimation: Optional[bool] = None,
         return_aggregate: bool = False,
         aggregate_func: Optional[Callable] = None,
         default_plot_func: Optional[Callable] = None,
@@ -147,7 +146,6 @@
         self.patch_size = patch_size
         self.order = order.lower()
         self.regions_evaluation = regions_evaluation
-        self.inverse_estimation = inverse_estimation
         self.perturb_func = make_perturb_func(
             perturb_func, perturb_func_kwargs, perturb_baseline=perturb_baseline
         )
@@ -343,11 +341,6 @@
                 )
                 patches.append(patch_slice)

-        if self.inverse_estimation == True:
-            self.order = "lerf"
-        elif self.inverse_estimation == False:
-            self.order = "morf"
-
         if self.order == "random":
             # Order attributions randomly.
             order = np.arange(len(patches))
diff --git a/quantus/metrics/faithfulness/road.py b/quantus/metrics/faithfulness/road.py
index 8c54dc92..a3763659 100644
--- a/quantus/metrics/faithfulness/road.py
+++ b/quantus/metrics/faithfulness/road.py
@@ -65,6 +65,7 @@ def __init__(
         self,
         percentages: Optional[List[float]] = None,
         noise: float = 0.01,
+        return_only_values: Optional[bool] = None,
         abs: bool = False,
         normalise: bool = True,
         normalise_func: Optional[Callable[[np.ndarray], np.ndarray]] = None,
@@ -81,8 +82,12 @@
         """
         Parameters
         ----------
-        percentages (list): The list of percentages of the image to be removed, default=list(range(1, 100, 2)).
-        noise (noise): Noise added, default=0.01.
+        percentages: list of floats
+            The list of percentages of the image to be removed, default=list(range(1, 100, 2)).
+        noise: float
+            Noise added, default=0.01.
+        return_only_values: bool
+            Indicates whether to return only the evaluation scores (list of floats) and not the dictionary that also includes the percentages, default=None.
         abs: boolean
             Indicates whether absolute operation is applied on the attribution, default=False.
         normalise: boolean
@@ -131,9 +136,11 @@
             perturb_func = noisy_linear_imputation

         self.percentages = percentages
+        self.noise = noise
+        self.return_only_values = return_only_values
         self.a_size = None
         self.perturb_func = make_perturb_func(
-            perturb_func, perturb_func_kwargs, noise=noise
+            perturb_func, perturb_func_kwargs, noise=self.noise
         )

         # Asserts and warnings.
@@ -335,6 +342,10 @@
             for p_ix, percentage in enumerate(self.percentages)
         }

+        # Return only the evaluation scores (and not percentages).
+        if self.return_only_values:
+            self.evaluation_scores = list(self.evaluation_scores.values())
+
     def evaluate_batch(
         self,
         model: ModelInterface,
diff --git a/quantus/metrics/inverse_estimation.py b/quantus/metrics/inverse_estimation.py
index e8a53961..73e12ed5 100644
--- a/quantus/metrics/inverse_estimation.py
+++ b/quantus/metrics/inverse_estimation.py
@@ -104,12 +104,7 @@ def __init__(
         # TODO. Create specific plot.
default_plot_func = plotting.plot_pixel_flipping_experiment - abs = metric_init.abs - normalise = metric_init.normalise - return_aggregate = metric_init.return_aggregate - aggregate_func = metric_init.aggregate_func - display_progressbar = metric_init.display_progressbar - disable_warnings = metric_init.disable_warnings + self.return_aggregate = return_aggregate super().__init__( abs=abs, @@ -125,9 +120,10 @@ def __init__( ) # Asserts and warnings. - # assert hasattr( - # metric_init, "inverse_estimation" - # ), "The metric must have 'inverse_estimation' (bool) attribute" + if metric_init.name == "ROAD": + metric_init.return_only_values = True + if metric_init.name == "Region-Perturbation": + metric_init.order = "morf" # TODO. Update warnings. if not self.disable_warnings: @@ -138,8 +134,6 @@ def __init__( ) self.metric_init = metric_init - self.all_evaluation_scores_meta = [] - self.all_evaluation_scores_meta_inverse = [] def __call__( self, @@ -231,17 +225,18 @@ def __call__( >>> metric = Metric(abs=True, normalise=False) >>> scores = metric(model=model, x_batch=x_batch, y_batch=y_batch, a_batch=a_batch_saliency} """ - if self.metric_init.return_aggregate != False: + if self.metric_init.return_aggregate: print( "The metric is not designed to return an aggregate score, setting return_aggregate=False." ) self.metric_init.return_aggregate = False - # TODO. Check compatibility with different normalisation functions. - # TODO. Check compatibility with different batch sizes (think ok). - # TODO. Check that we use 2most important feature first" type. + assert ( + a_batch is not None + ), "'a_batch' must be provided to run the inverse estimation." - # TODO. Implement ranking assumption. see: https://github.com/annahedstroem/eval-project/blob/febe271a78c6efc16a51372ab58fcba676e0eb88/src/xai_faithfulness_experiments_lib_edits.py#L403 + # TODO. Do we want to turn the attributions to rankings? + # See: https://github.com/annahedstroem/eval-project/blob/febe271a78c6efc16a51372ab58fcba676e0eb88/src/xai_faithfulness_experiments_lib_edits.py#L403 self.scores = self.metric_init( model=model, x_batch=x_batch, @@ -261,15 +256,12 @@ def __call__( "To run the inverse estimation, the number of evaluation scores " "must match the number of instances in the batch." ) - self.all_evaluation_scores_meta.extend(self.metric_init.evaluation_scores) + # Empty the evaluation scores before re-scoring with the metric. self.metric_init.evaluation_scores = [] # Run inverse experiment. - # TODO. Check if the metric only relies on ordering. - a_batch_inv = -np.array(a_batch) / np.min( - -np.array(a_batch) - ) # [1, 2, 3], [-1, -2, -3] + a_batch_inv = -np.array(a_batch) # / np.min(-np.array(a_batch)) self.scores_inv = self.metric_init( model=model, x_batch=x_batch, @@ -285,54 +277,29 @@ def __call__( model_predict_kwargs=model_predict_kwargs, **kwargs, ) - self.all_evaluation_scores_meta_inverse.extend( - self.metric_init.evaluation_scores - ) # Compute the inverse, empty the evaluation scores again and overwrite with the inverse scores. - inv_scores = np.array(self.scores) - np.array(self.scores_inv) - self.metric_init.evaluation_scores = [] - self.evaluation_scores.extend(inv_scores) - - # TODO. If all_evaluation_scores is empty, overwrite with inverse scores - # for the those last samples (keep iterator). Or skip and throw a warning. 
- # if self.all_evaluation_scores: - # self.all_evaluation_scores[-1] = [] - self.all_evaluation_scores.append(inv_scores) + inv_scores = (np.array(self.scores) - np.array(self.scores_inv)).tolist() + self.evaluation_scores = inv_scores - return inv_scores - - def custom_postprocess( - self, - model: ModelInterface, - x_batch: np.ndarray, - y_batch: Optional[np.ndarray], - a_batch: Optional[np.ndarray], - s_batch: np.ndarray, - **kwargs, - ) -> None: - """ - Post-process the evaluation results. + if self.return_aggregate: + self.evaluation_scores = self.get_mean_score - Parameters - ---------- - model: torch.nn.Module, tf.keras.Model - A torch or tensorflow model e.g., torchvision.models that is subject to explanation. - x_batch: np.ndarray - A np.ndarray which contains the input data that are explained. - y_batch: np.ndarray - A np.ndarray which contains the output labels that are explained. - a_batch: np.ndarray, optional - A np.ndarray which contains pre-computed attributions i.e., explanations. - s_batch: np.ndarray, optional - A np.ndarray which contains segmentation masks that matches the input. + self.all_evaluation_scores.extend(self.metric_init.evaluation_scores) - Returns - ------- - None - """ - # TODO. Is this needed? - pass + return inv_scores def convert_attributions_to_rankings(self): pass + + @property + def get_mean_score(self): + """Calculate the area under the curve (AUC) score for several test samples.""" + return np.mean(np.array(self.evaluation_scores), axis=1) + + @property + def get_auc_score(self): + """Calculate the area under the curve (AUC) score for several test samples.""" + return np.mean( + [utils.calculate_auc(np.array(curve)) for curve in self.evaluation_scores] + ) diff --git a/tests/metrics/test_inverse_estimation.py b/tests/metrics/test_inverse_estimation.py index 6b471b2f..8d68b53c 100644 --- a/tests/metrics/test_inverse_estimation.py +++ b/tests/metrics/test_inverse_estimation.py @@ -40,7 +40,7 @@ }, }, }, - {"min": 0.0, "max": 1.0}, + {"min": -1000.0, "max": 1000.0}, ), ( lazy_fixture("load_mnist_model"), @@ -61,7 +61,7 @@ }, }, }, - {"min": 0.0, "max": 1.0}, + {"min": -1000.0, "max": 1000.0}, ), ( lazy_fixture("load_mnist_model"), @@ -82,7 +82,7 @@ }, }, }, - {"min": 0.0, "max": 1.0}, + {"min": -1000.0, "max": 1000.0}, ), ( lazy_fixture("load_mnist_model"), @@ -103,12 +103,13 @@ }, }, }, - {"min": 0.0, "max": 1.0}, + {"exception": AssertionError}, ), ( lazy_fixture("load_mnist_model"), lazy_fixture("load_mnist_images"), { + "a_batch_generate": True, "init": { "perturb_baseline": "mean", "features_in_step": 28, @@ -124,7 +125,7 @@ }, }, }, - {"min": 0.0, "max": 1.0}, + {"min": -1000.0, "max": 1000.0}, ), ( lazy_fixture("load_1d_3ch_conv_model"), @@ -140,12 +141,13 @@ }, "call": {}, }, - {"min": 0.0, "max": 1.0}, + {"exception": AssertionError}, ), ( lazy_fixture("load_mnist_model"), lazy_fixture("load_mnist_images"), { + "a_batch_generate": True, "init": { "perturb_baseline": "uniform", "features_in_step": 56, @@ -158,16 +160,17 @@ "explain_func": explain, "explain_func_kwargs": { "method": "Saliency", + "softmax": False, }, }, }, - {"min": 0.0, "max": 14.0}, + {"min": -1000.0, "max": 1000.0}, ), ( lazy_fixture("load_1d_3ch_conv_model"), lazy_fixture("almost_uniform_1d"), { - "a_batch_generate": False, + "a_batch_generate": True, "init": { "features_in_step": 10, "normalise": False, @@ -175,13 +178,20 @@ "perturb_baseline": "mean", "disable_warnings": True, }, - "call": {}, + "call": { + "explain_func": explain, + "explain_func_kwargs": { + "method": 
"IntegratedGradients", + "xai_lib": "captum", + }, + "softmax": False, + }, }, - {"min": 0.0, "max": 10.0}, + {"exception": AssertionError}, ), ], ) -def test_pixel_flipping( +def test_inverse_estimation_with_pixel_flipping( model, data: np.ndarray, params: dict, @@ -195,7 +205,9 @@ def test_pixel_flipping( init_params = params.get("init", {}) call_params = params.get("call", {}) - if params.get("a_batch_generate", True): + if "a_batch" in data: + a_batch = data["a_batch"] + elif params.get("a_batch_generate", True): explain = call_params["explain_func"] explain_func_kwargs = call_params.get("explain_func_kwargs", {}) a_batch = explain( @@ -204,37 +216,37 @@ def test_pixel_flipping( targets=y_batch, **explain_func_kwargs, ) - elif "a_batch" in data: - a_batch = data["a_batch"] + assert a_batch is not None else: a_batch = None metric_init = PixelFlipping(**init_params) - inverse_estimation = InverseEstimation(metric_init=metric_init) - scores = inverse_estimation( - model=model, - x_batch=x_batch, - y_batch=y_batch, - a_batch=a_batch, - **call_params, - ) - print("final scores!!!", scores[0][:10]) - print( - "scores_inv!!!", - ( - np.shape(inverse_estimation.scores_inv), - inverse_estimation.scores_inv[0][:10], - ), - ) - print( - "scores!!!", - (np.shape(inverse_estimation.scores), inverse_estimation.scores[0][:10]), - ) + metric_init.softmax = True + + try: + inv = InverseEstimation(metric_init=metric_init, return_aggregate=True) + scores = inv( + model=model, + x_batch=x_batch, + y_batch=y_batch, + a_batch=a_batch, + **call_params, + ) + print(f"\n\n\tscores: {np.shape(inv.scores)},\n{inv.scores}") + print(f"\n\n\tscores_inv: {np.shape(inv.scores_inv)},\n{inv.scores_inv}") + print( + f"\n\n\tall_evaluation_scores: {np.shape(inv.all_evaluation_scores)},\n{inv.all_evaluation_scores}" + ) + print(f"\n\n\tscores: {np.shape(scores)},\n{scores}") - assert all( - [ - (s >= expected["min"] and s <= expected["max"]) - for s_list in scores - for s in s_list - ] - ), "Test failed." + if "exception" not in expected: + assert all( + [ + (s >= expected["min"] and s <= expected["max"]) + for s_list in scores + for s in s_list + ] + ), "Test failed." + except expected["exception"] as e: + print(f'Raised exception type {expected["exception"]}', e) + return