diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9ba3485d..c77983dc 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -89,11 +89,7 @@ It is possible to limit the scope of testing to specific sections of the codebas Faithfulness metrics using python3.9 (make sure the python versions match in your environment): ```bash -<<<<<<< HEAD python3 -m tox run -e py39 -- -m faithfulness -s -======= -python3 -m tox run -e py39 -- -m evaluate_func -s ->>>>>>> c33f4039e2f53332a8eff2207cdbe6686600e67c ``` For a complete overview of the possible testing scopes, please refer to `pytest.ini`. diff --git a/quantus/helpers/constants.py b/quantus/helpers/constants.py index 7da2c790..b8ba74e0 100644 --- a/quantus/helpers/constants.py +++ b/quantus/helpers/constants.py @@ -74,11 +74,15 @@ }, } -# Perturbation steps with 'masking', based on attribution order/ ranking. +# Quantus metrics that include a step-wise 'masking'/ perturbation that is +# based on attribution order/ ranking (and not magnitude). AVAILABLE_INVERSE_ESTIMATION_METRICS = { "Pixel-Flipping": PixelFlipping, - "Region Perturbation": RegionPerturbation, + "Region Perturbation": RegionPerturbation, # order = 'morf' + "ROAD": ROAD, # return_only_values = True + "Selectivity": Selectivity, } +# AVAILABLE_PERTURBATION_FUNCTIONS = { "baseline_replacement_by_indices": baseline_replacement_by_indices, diff --git a/quantus/metrics/faithfulness/infidelity.py b/quantus/metrics/faithfulness/infidelity.py index baf48cb9..ef621a60 100644 --- a/quantus/metrics/faithfulness/infidelity.py +++ b/quantus/metrics/faithfulness/infidelity.py @@ -331,6 +331,7 @@ def evaluate_instance( for i_x, top_left_x in enumerate(range(0, x.shape[1], patch_size)): for i_y, top_left_y in enumerate(range(0, x.shape[2], patch_size)): + # Perturb input patch-wise. x_perturbed_pad = utils._pad_array( x_perturbed, pad_width, mode="edge", padded_axes=self.a_axes diff --git a/quantus/metrics/faithfulness/pixel_flipping.py b/quantus/metrics/faithfulness/pixel_flipping.py index 850ebad7..6f220c9c 100644 --- a/quantus/metrics/faithfulness/pixel_flipping.py +++ b/quantus/metrics/faithfulness/pixel_flipping.py @@ -287,11 +287,8 @@ def evaluate_instance( # Reshape attributions. a = a.flatten() - # Get indices of sorted attributions. - if self.inverse_estimation is None or self.inverse_estimation is False: - a_indices = np.argsort(-a) # Order is descending. - elif self.inverse_estimation is True: - a_indices = np.argsort(a) # Order is ascending. + # Get indices of sorted attributions (descending). + a_indices = np.argsort(-a) # Prepare lists. 
         n_perturbations = len(range(0, len(a_indices), self.features_in_step))
diff --git a/quantus/metrics/faithfulness/region_perturbation.py b/quantus/metrics/faithfulness/region_perturbation.py
index 6c95a035..6c65d14d 100644
--- a/quantus/metrics/faithfulness/region_perturbation.py
+++ b/quantus/metrics/faithfulness/region_perturbation.py
@@ -76,7 +76,6 @@ def __init__(
         perturb_func: Optional[Callable] = None,
         perturb_baseline: str = "black",
         perturb_func_kwargs: Optional[Dict[str, Any]] = None,
-        inverse_estimation: Optional[bool] = None,
         return_aggregate: bool = False,
         aggregate_func: Optional[Callable] = None,
         default_plot_func: Optional[Callable] = None,
@@ -147,7 +146,6 @@
         self.patch_size = patch_size
         self.order = order.lower()
         self.regions_evaluation = regions_evaluation
-        self.inverse_estimation = inverse_estimation
         self.perturb_func = make_perturb_func(
             perturb_func, perturb_func_kwargs, perturb_baseline=perturb_baseline
         )
@@ -343,11 +341,6 @@
                 )
                 patches.append(patch_slice)

-        if self.inverse_estimation == True:
-            self.order = "lerf"
-        elif self.inverse_estimation == False:
-            self.order = "morf"
-
         if self.order == "random":
             # Order attributions randomly.
             order = np.arange(len(patches))
diff --git a/quantus/metrics/faithfulness/road.py b/quantus/metrics/faithfulness/road.py
index 8c54dc92..a3763659 100644
--- a/quantus/metrics/faithfulness/road.py
+++ b/quantus/metrics/faithfulness/road.py
@@ -65,6 +65,7 @@ def __init__(
         self,
         percentages: Optional[List[float]] = None,
         noise: float = 0.01,
+        return_only_values: Optional[bool] = None,
         abs: bool = False,
         normalise: bool = True,
         normalise_func: Optional[Callable[[np.ndarray], np.ndarray]] = None,
@@ -81,8 +82,12 @@
         """
         Parameters
         ----------
-        percentages (list): The list of percentages of the image to be removed, default=list(range(1, 100, 2)).
-        noise (noise): Noise added, default=0.01.
+        percentages: list of floats
+            The list of percentages of the image to be removed, default=list(range(1, 100, 2)).
+        noise: float
+            Noise added, default=0.01.
+        return_only_values: bool
+            Indicates whether to return only the evaluation scores (list of floats) and not the dictionary that also includes the percentages, default=None.
         abs: boolean
             Indicates whether absolute operation is applied on the attribution, default=False.
         normalise: boolean
@@ -131,9 +136,11 @@
             perturb_func = noisy_linear_imputation

         self.percentages = percentages
+        self.noise = noise
+        self.return_only_values = return_only_values
         self.a_size = None
         self.perturb_func = make_perturb_func(
-            perturb_func, perturb_func_kwargs, noise=noise
+            perturb_func, perturb_func_kwargs, noise=self.noise
         )

         # Asserts and warnings.
@@ -335,6 +342,10 @@
             for p_ix, percentage in enumerate(self.percentages)
         }

+        # Return only the evaluation scores (and not percentages).
+        if self.return_only_values:
+            self.evaluation_scores = list(self.evaluation_scores.values())
+
     def evaluate_batch(
         self,
         model: ModelInterface,
diff --git a/quantus/metrics/inverse_estimation.py b/quantus/metrics/inverse_estimation.py
index e8a53961..73e12ed5 100644
--- a/quantus/metrics/inverse_estimation.py
+++ b/quantus/metrics/inverse_estimation.py
@@ -104,12 +104,7 @@ def __init__(
         # TODO. Create specific plot.
default_plot_func = plotting.plot_pixel_flipping_experiment - abs = metric_init.abs - normalise = metric_init.normalise - return_aggregate = metric_init.return_aggregate - aggregate_func = metric_init.aggregate_func - display_progressbar = metric_init.display_progressbar - disable_warnings = metric_init.disable_warnings + self.return_aggregate = return_aggregate super().__init__( abs=abs, @@ -125,9 +120,10 @@ def __init__( ) # Asserts and warnings. - # assert hasattr( - # metric_init, "inverse_estimation" - # ), "The metric must have 'inverse_estimation' (bool) attribute" + if metric_init.name == "ROAD": + metric_init.return_only_values = True + if metric_init.name == "Region-Perturbation": + metric_init.order = "morf" # TODO. Update warnings. if not self.disable_warnings: @@ -138,8 +134,6 @@ def __init__( ) self.metric_init = metric_init - self.all_evaluation_scores_meta = [] - self.all_evaluation_scores_meta_inverse = [] def __call__( self, @@ -231,17 +225,18 @@ def __call__( >>> metric = Metric(abs=True, normalise=False) >>> scores = metric(model=model, x_batch=x_batch, y_batch=y_batch, a_batch=a_batch_saliency} """ - if self.metric_init.return_aggregate != False: + if self.metric_init.return_aggregate: print( "The metric is not designed to return an aggregate score, setting return_aggregate=False." ) self.metric_init.return_aggregate = False - # TODO. Check compatibility with different normalisation functions. - # TODO. Check compatibility with different batch sizes (think ok). - # TODO. Check that we use 2most important feature first" type. + assert ( + a_batch is not None + ), "'a_batch' must be provided to run the inverse estimation." - # TODO. Implement ranking assumption. see: https://github.com/annahedstroem/eval-project/blob/febe271a78c6efc16a51372ab58fcba676e0eb88/src/xai_faithfulness_experiments_lib_edits.py#L403 + # TODO. Do we want to turn the attributions to rankings? + # See: https://github.com/annahedstroem/eval-project/blob/febe271a78c6efc16a51372ab58fcba676e0eb88/src/xai_faithfulness_experiments_lib_edits.py#L403 self.scores = self.metric_init( model=model, x_batch=x_batch, @@ -261,15 +256,12 @@ def __call__( "To run the inverse estimation, the number of evaluation scores " "must match the number of instances in the batch." ) - self.all_evaluation_scores_meta.extend(self.metric_init.evaluation_scores) + # Empty the evaluation scores before re-scoring with the metric. self.metric_init.evaluation_scores = [] # Run inverse experiment. - # TODO. Check if the metric only relies on ordering. - a_batch_inv = -np.array(a_batch) / np.min( - -np.array(a_batch) - ) # [1, 2, 3], [-1, -2, -3] + a_batch_inv = -np.array(a_batch) # / np.min(-np.array(a_batch)) self.scores_inv = self.metric_init( model=model, x_batch=x_batch, @@ -285,54 +277,29 @@ def __call__( model_predict_kwargs=model_predict_kwargs, **kwargs, ) - self.all_evaluation_scores_meta_inverse.extend( - self.metric_init.evaluation_scores - ) # Compute the inverse, empty the evaluation scores again and overwrite with the inverse scores. - inv_scores = np.array(self.scores) - np.array(self.scores_inv) - self.metric_init.evaluation_scores = [] - self.evaluation_scores.extend(inv_scores) - - # TODO. If all_evaluation_scores is empty, overwrite with inverse scores - # for the those last samples (keep iterator). Or skip and throw a warning. 
- # if self.all_evaluation_scores: - # self.all_evaluation_scores[-1] = [] - self.all_evaluation_scores.append(inv_scores) + inv_scores = (np.array(self.scores) - np.array(self.scores_inv)).tolist() + self.evaluation_scores = inv_scores - return inv_scores - - def custom_postprocess( - self, - model: ModelInterface, - x_batch: np.ndarray, - y_batch: Optional[np.ndarray], - a_batch: Optional[np.ndarray], - s_batch: np.ndarray, - **kwargs, - ) -> None: - """ - Post-process the evaluation results. + if self.return_aggregate: + self.evaluation_scores = self.get_mean_score - Parameters - ---------- - model: torch.nn.Module, tf.keras.Model - A torch or tensorflow model e.g., torchvision.models that is subject to explanation. - x_batch: np.ndarray - A np.ndarray which contains the input data that are explained. - y_batch: np.ndarray - A np.ndarray which contains the output labels that are explained. - a_batch: np.ndarray, optional - A np.ndarray which contains pre-computed attributions i.e., explanations. - s_batch: np.ndarray, optional - A np.ndarray which contains segmentation masks that matches the input. + self.all_evaluation_scores.extend(self.metric_init.evaluation_scores) - Returns - ------- - None - """ - # TODO. Is this needed? - pass + return inv_scores def convert_attributions_to_rankings(self): pass + + @property + def get_mean_score(self): + """Calculate the area under the curve (AUC) score for several test samples.""" + return np.mean(np.array(self.evaluation_scores), axis=1) + + @property + def get_auc_score(self): + """Calculate the area under the curve (AUC) score for several test samples.""" + return np.mean( + [utils.calculate_auc(np.array(curve)) for curve in self.evaluation_scores] + ) diff --git a/tests/metrics/test_inverse_estimation.py b/tests/metrics/test_inverse_estimation.py index 6b471b2f..8d68b53c 100644 --- a/tests/metrics/test_inverse_estimation.py +++ b/tests/metrics/test_inverse_estimation.py @@ -40,7 +40,7 @@ }, }, }, - {"min": 0.0, "max": 1.0}, + {"min": -1000.0, "max": 1000.0}, ), ( lazy_fixture("load_mnist_model"), @@ -61,7 +61,7 @@ }, }, }, - {"min": 0.0, "max": 1.0}, + {"min": -1000.0, "max": 1000.0}, ), ( lazy_fixture("load_mnist_model"), @@ -82,7 +82,7 @@ }, }, }, - {"min": 0.0, "max": 1.0}, + {"min": -1000.0, "max": 1000.0}, ), ( lazy_fixture("load_mnist_model"), @@ -103,12 +103,13 @@ }, }, }, - {"min": 0.0, "max": 1.0}, + {"exception": AssertionError}, ), ( lazy_fixture("load_mnist_model"), lazy_fixture("load_mnist_images"), { + "a_batch_generate": True, "init": { "perturb_baseline": "mean", "features_in_step": 28, @@ -124,7 +125,7 @@ }, }, }, - {"min": 0.0, "max": 1.0}, + {"min": -1000.0, "max": 1000.0}, ), ( lazy_fixture("load_1d_3ch_conv_model"), @@ -140,12 +141,13 @@ }, "call": {}, }, - {"min": 0.0, "max": 1.0}, + {"exception": AssertionError}, ), ( lazy_fixture("load_mnist_model"), lazy_fixture("load_mnist_images"), { + "a_batch_generate": True, "init": { "perturb_baseline": "uniform", "features_in_step": 56, @@ -158,16 +160,17 @@ "explain_func": explain, "explain_func_kwargs": { "method": "Saliency", + "softmax": False, }, }, }, - {"min": 0.0, "max": 14.0}, + {"min": -1000.0, "max": 1000.0}, ), ( lazy_fixture("load_1d_3ch_conv_model"), lazy_fixture("almost_uniform_1d"), { - "a_batch_generate": False, + "a_batch_generate": True, "init": { "features_in_step": 10, "normalise": False, @@ -175,13 +178,20 @@ "perturb_baseline": "mean", "disable_warnings": True, }, - "call": {}, + "call": { + "explain_func": explain, + "explain_func_kwargs": { + "method": 
"IntegratedGradients", + "xai_lib": "captum", + }, + "softmax": False, + }, }, - {"min": 0.0, "max": 10.0}, + {"exception": AssertionError}, ), ], ) -def test_pixel_flipping( +def test_inverse_estimation_with_pixel_flipping( model, data: np.ndarray, params: dict, @@ -195,7 +205,9 @@ def test_pixel_flipping( init_params = params.get("init", {}) call_params = params.get("call", {}) - if params.get("a_batch_generate", True): + if "a_batch" in data: + a_batch = data["a_batch"] + elif params.get("a_batch_generate", True): explain = call_params["explain_func"] explain_func_kwargs = call_params.get("explain_func_kwargs", {}) a_batch = explain( @@ -204,37 +216,37 @@ def test_pixel_flipping( targets=y_batch, **explain_func_kwargs, ) - elif "a_batch" in data: - a_batch = data["a_batch"] + assert a_batch is not None else: a_batch = None metric_init = PixelFlipping(**init_params) - inverse_estimation = InverseEstimation(metric_init=metric_init) - scores = inverse_estimation( - model=model, - x_batch=x_batch, - y_batch=y_batch, - a_batch=a_batch, - **call_params, - ) - print("final scores!!!", scores[0][:10]) - print( - "scores_inv!!!", - ( - np.shape(inverse_estimation.scores_inv), - inverse_estimation.scores_inv[0][:10], - ), - ) - print( - "scores!!!", - (np.shape(inverse_estimation.scores), inverse_estimation.scores[0][:10]), - ) + metric_init.softmax = True + + try: + inv = InverseEstimation(metric_init=metric_init, return_aggregate=True) + scores = inv( + model=model, + x_batch=x_batch, + y_batch=y_batch, + a_batch=a_batch, + **call_params, + ) + print(f"\n\n\tscores: {np.shape(inv.scores)},\n{inv.scores}") + print(f"\n\n\tscores_inv: {np.shape(inv.scores_inv)},\n{inv.scores_inv}") + print( + f"\n\n\tall_evaluation_scores: {np.shape(inv.all_evaluation_scores)},\n{inv.all_evaluation_scores}" + ) + print(f"\n\n\tscores: {np.shape(scores)},\n{scores}") - assert all( - [ - (s >= expected["min"] and s <= expected["max"]) - for s_list in scores - for s in s_list - ] - ), "Test failed." + if "exception" not in expected: + assert all( + [ + (s >= expected["min"] and s <= expected["max"]) + for s_list in scores + for s in s_list + ] + ), "Test failed." + except expected["exception"] as e: + print(f'Raised exception type {expected["exception"]}', e) + return