Commit
tests passing, most todos removed
annahedstroem committed Dec 7, 2023
1 parent 79cf6d8 commit fb3d5ef
Showing 8 changed files with 107 additions and 126 deletions.
4 changes: 0 additions & 4 deletions CONTRIBUTING.md
@@ -89,11 +89,7 @@ It is possible to limit the scope of testing to specific sections of the codebase
Faithfulness metrics using python3.9 (make sure the python versions match in your environment):

```bash
<<<<<<< HEAD
python3 -m tox run -e py39 -- -m faithfulness -s
=======
python3 -m tox run -e py39 -- -m evaluate_func -s
>>>>>>> c33f4039e2f53332a8eff2207cdbe6686600e67c
```

For a complete overview of the possible testing scopes, please refer to `pytest.ini`.
8 changes: 6 additions & 2 deletions quantus/helpers/constants.py
@@ -74,11 +74,15 @@
},
}

# Perturbation steps with 'masking', based on attribution order/ ranking.
# Quantus metrics that include a step-wise 'masking'/perturbation that is
# based on attribution order/ranking (and not magnitude).
AVAILABLE_INVERSE_ESTIMATION_METRICS = {
"Pixel-Flipping": PixelFlipping,
"Region Perturbation": RegionPerturbation,
"Region Perturbation": RegionPerturbation, # order = 'morf'
"ROAD": ROAD, # return_only_values = True
"Selectivity": Selectivity,
}
#

AVAILABLE_PERTURBATION_FUNCTIONS = {
"baseline_replacement_by_indices": baseline_replacement_by_indices,
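The registry above lists the metrics whose perturbation order follows the attribution ranking, which is what makes negating the attributions a valid way to invert the experiment. Below is a minimal sketch of consuming it; the class name `InverseEstimation`, the `metric_init` keyword, and `disable_warnings` are taken from other files in this commit or assumed, not guaranteed by this hunk alone.

```python
# Sketch only: run every ranking-based metric through the inverse-estimation wrapper.
# Import paths and constructor defaults are assumptions.
from quantus.helpers.constants import AVAILABLE_INVERSE_ESTIMATION_METRICS
from quantus.metrics.inverse_estimation import InverseEstimation


def run_inverse_suite(model, x_batch, y_batch, a_batch):
    """Return {metric name: inverse-estimation scores} for one batch."""
    results = {}
    for name, metric_cls in AVAILABLE_INVERSE_ESTIMATION_METRICS.items():
        metric = metric_cls(disable_warnings=True)
        wrapper = InverseEstimation(metric_init=metric)
        results[name] = wrapper(
            model=model, x_batch=x_batch, y_batch=y_batch, a_batch=a_batch
        )
    return results
```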
1 change: 1 addition & 0 deletions quantus/metrics/faithfulness/infidelity.py
@@ -331,6 +331,7 @@ def evaluate_instance(

for i_x, top_left_x in enumerate(range(0, x.shape[1], patch_size)):
for i_y, top_left_y in enumerate(range(0, x.shape[2], patch_size)):

# Perturb input patch-wise.
x_perturbed_pad = utils._pad_array(
x_perturbed, pad_width, mode="edge", padded_axes=self.a_axes
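For context on the loop shown above: the two `enumerate(range(...))` calls walk the input in non-overlapping patches of `patch_size` along height and width. A small standalone illustration with example shapes (not the metric's defaults):

```python
# Illustrative only: enumerate patch indices and their top-left corners the same
# way as the nested loops in evaluate_instance (example shapes, not defaults).
import numpy as np

x = np.zeros((3, 224, 224))  # (channels, height, width)
patch_size = 56

coords = [
    (i_x, i_y, top_left_x, top_left_y)
    for i_x, top_left_x in enumerate(range(0, x.shape[1], patch_size))
    for i_y, top_left_y in enumerate(range(0, x.shape[2], patch_size))
]
print(len(coords))  # 16 patches: a 4 x 4 grid of 56 x 56 patches
```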
7 changes: 2 additions & 5 deletions quantus/metrics/faithfulness/pixel_flipping.py
@@ -287,11 +287,8 @@ def evaluate_instance(
# Reshape attributions.
a = a.flatten()

# Get indices of sorted attributions.
if self.inverse_estimation is None or self.inverse_estimation is False:
a_indices = np.argsort(-a) # Order is descending.
elif self.inverse_estimation is True:
a_indices = np.argsort(a) # Order is ascending.
# Get indices of sorted attributions (descending).
a_indices = np.argsort(-a)

# Prepare lists.
n_perturbations = len(range(0, len(a_indices), self.features_in_step))
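This simplification works because the wrapper now passes negated attributions instead of toggling a flag: sorting the negated map "descending" yields the same order as sorting the original map ascending. A quick NumPy check (illustrative, not part of the diff):

```python
import numpy as np

a = np.array([0.2, 0.9, 0.1, 0.5])

# Order used by the metric itself: most important feature first.
descending = np.argsort(-a)           # -> [1, 3, 0, 2]

# What happens when the inverse-estimation wrapper hands in -a instead of a flag.
a_inv = -a
inverse_order = np.argsort(-a_inv)    # the metric still "sorts descending"

assert np.array_equal(inverse_order, np.argsort(a))  # == ascending order of a
```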
7 changes: 0 additions & 7 deletions quantus/metrics/faithfulness/region_perturbation.py
@@ -76,7 +76,6 @@ def __init__(
perturb_func: Optional[Callable] = None,
perturb_baseline: str = "black",
perturb_func_kwargs: Optional[Dict[str, Any]] = None,
inverse_estimation: Optional[bool] = None,
return_aggregate: bool = False,
aggregate_func: Optional[Callable] = None,
default_plot_func: Optional[Callable] = None,
@@ -147,7 +146,6 @@ def __init__(
self.patch_size = patch_size
self.order = order.lower()
self.regions_evaluation = regions_evaluation
self.inverse_estimation = inverse_estimation
self.perturb_func = make_perturb_func(
perturb_func, perturb_func_kwargs, perturb_baseline=perturb_baseline
)
@@ -343,11 +341,6 @@ def evaluate_instance(
)
patches.append(patch_slice)

if self.inverse_estimation == True:
self.order = "lerf"
elif self.inverse_estimation == False:
self.order = "morf"

if self.order == "random":
# Order attributions randomly.
order = np.arange(len(patches))
17 changes: 14 additions & 3 deletions quantus/metrics/faithfulness/road.py
@@ -65,6 +65,7 @@ def __init__(
self,
percentages: Optional[List[float]] = None,
noise: float = 0.01,
return_only_values: Optional[bool] = None,
abs: bool = False,
normalise: bool = True,
normalise_func: Optional[Callable[[np.ndarray], np.ndarray]] = None,
@@ -81,8 +82,12 @@
"""
Parameters
----------
percentages (list): The list of percentages of the image to be removed, default=list(range(1, 100, 2)).
noise (noise): Noise added, default=0.01.
percentages: list of ints
The list of percentages of the image to be removed, default=list(range(1, 100, 2)).
noise: float
Noise added, default=0.01.
return_only_values: bool
Indicates whether only the evaluation scores (a list of floats) are returned, rather than the dictionary that also includes the percentages, default=None.
abs: boolean
Indicates whether absolute operation is applied on the attribution, default=False.
normalise: boolean
@@ -131,9 +136,11 @@ def __init__(
perturb_func = noisy_linear_imputation

self.percentages = percentages
self.noise = noise
self.return_only_values = return_only_values
self.a_size = None
self.perturb_func = make_perturb_func(
perturb_func, perturb_func_kwargs, noise=noise
perturb_func, perturb_func_kwargs, noise=self.noise
)

# Asserts and warnings.
@@ -335,6 +342,10 @@ def custom_postprocess(
for p_ix, percentage in enumerate(self.percentages)
}

# Return only the evaluation scores (and not percentages).
if self.return_only_values:
self.evaluation_scores = list(self.evaluation_scores.values())

def evaluate_batch(
self,
model: ModelInterface,
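A hedged usage sketch of the new flag: `percentages`, `noise`, and `return_only_values` come from the constructor above, while the top-level export `quantus.ROAD` and the placeholder inputs are assumptions.

```python
# Sketch only: ROAD returning a flat list of scores instead of the
# percentage-keyed dictionary assembled in custom_postprocess.
from quantus import ROAD

road = ROAD(
    percentages=list(range(1, 100, 2)),
    noise=0.01,
    return_only_values=True,
)

# scores = road(model=model, x_batch=x_batch, y_batch=y_batch, a_batch=a_batch)
# With return_only_values=True, evaluation_scores becomes list(dict.values()):
# plain per-percentage scores that the inverse-estimation wrapper can subtract
# elementwise, instead of the dictionary that also carries the percentages.
```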
95 changes: 31 additions & 64 deletions quantus/metrics/inverse_estimation.py
@@ -104,12 +104,7 @@ def __init__(
# TODO. Create specific plot.
default_plot_func = plotting.plot_pixel_flipping_experiment

abs = metric_init.abs
normalise = metric_init.normalise
return_aggregate = metric_init.return_aggregate
aggregate_func = metric_init.aggregate_func
display_progressbar = metric_init.display_progressbar
disable_warnings = metric_init.disable_warnings
self.return_aggregate = return_aggregate

super().__init__(
abs=abs,
@@ -125,9 +120,10 @@
)

# Asserts and warnings.
# assert hasattr(
# metric_init, "inverse_estimation"
# ), "The metric must have 'inverse_estimation' (bool) attribute"
if metric_init.name == "ROAD":
metric_init.return_only_values = True
if metric_init.name == "Region-Perturbation":
metric_init.order = "morf"

# TODO. Update warnings.
if not self.disable_warnings:
@@ -138,8 +134,6 @@
)

self.metric_init = metric_init
self.all_evaluation_scores_meta = []
self.all_evaluation_scores_meta_inverse = []

def __call__(
self,
@@ -231,17 +225,18 @@ def __call__(
>>> metric = Metric(abs=True, normalise=False)
>>> scores = metric(model=model, x_batch=x_batch, y_batch=y_batch, a_batch=a_batch_saliency)
"""
if self.metric_init.return_aggregate != False:
if self.metric_init.return_aggregate:
print(
"The metric is not designed to return an aggregate score, setting return_aggregate=False."
)
self.metric_init.return_aggregate = False

# TODO. Check compatibility with different normalisation functions.
# TODO. Check compatibility with different batch sizes (think ok).
# TODO. Check that we use "most important feature first" type.
assert (
a_batch is not None
), "'a_batch' must be provided to run the inverse estimation."

# TODO. Implement ranking assumption. see: https://github.com/annahedstroem/eval-project/blob/febe271a78c6efc16a51372ab58fcba676e0eb88/src/xai_faithfulness_experiments_lib_edits.py#L403
# TODO. Do we want to turn the attributions to rankings?
# See: https://github.com/annahedstroem/eval-project/blob/febe271a78c6efc16a51372ab58fcba676e0eb88/src/xai_faithfulness_experiments_lib_edits.py#L403
self.scores = self.metric_init(
model=model,
x_batch=x_batch,
@@ -261,15 +256,12 @@ def __call__(
"To run the inverse estimation, the number of evaluation scores "
"must match the number of instances in the batch."
)
self.all_evaluation_scores_meta.extend(self.metric_init.evaluation_scores)

# Empty the evaluation scores before re-scoring with the metric.
self.metric_init.evaluation_scores = []

# Run inverse experiment.
# TODO. Check if the metric only relies on ordering.
a_batch_inv = -np.array(a_batch) / np.min(
-np.array(a_batch)
) # [1, 2, 3], [-1, -2, -3]
a_batch_inv = -np.array(a_batch) # / np.min(-np.array(a_batch))
self.scores_inv = self.metric_init(
model=model,
x_batch=x_batch,
@@ -285,54 +277,29 @@ def __call__(
model_predict_kwargs=model_predict_kwargs,
**kwargs,
)
self.all_evaluation_scores_meta_inverse.extend(
self.metric_init.evaluation_scores
)

# Compute the inverse, empty the evaluation scores again and overwrite with the inverse scores.
inv_scores = np.array(self.scores) - np.array(self.scores_inv)
self.metric_init.evaluation_scores = []
self.evaluation_scores.extend(inv_scores)

# TODO. If all_evaluation_scores is empty, overwrite with inverse scores
# for the those last samples (keep iterator). Or skip and throw a warning.
# if self.all_evaluation_scores:
# self.all_evaluation_scores[-1] = []
self.all_evaluation_scores.append(inv_scores)
inv_scores = (np.array(self.scores) - np.array(self.scores_inv)).tolist()
self.evaluation_scores = inv_scores

if self.return_aggregate:
    self.evaluation_scores = self.get_mean_score

self.all_evaluation_scores.extend(self.metric_init.evaluation_scores)

return inv_scores

def custom_postprocess(
    self,
    model: ModelInterface,
    x_batch: np.ndarray,
    y_batch: Optional[np.ndarray],
    a_batch: Optional[np.ndarray],
    s_batch: np.ndarray,
    **kwargs,
) -> None:
    """
    Post-process the evaluation results.

    Parameters
    ----------
    model: torch.nn.Module, tf.keras.Model
        A torch or tensorflow model e.g., torchvision.models that is subject to explanation.
    x_batch: np.ndarray
        A np.ndarray which contains the input data that are explained.
    y_batch: np.ndarray
        A np.ndarray which contains the output labels that are explained.
    a_batch: np.ndarray, optional
        A np.ndarray which contains pre-computed attributions i.e., explanations.
    s_batch: np.ndarray, optional
        A np.ndarray which contains segmentation masks that matches the input.

    Returns
    -------
    None
    """
    # TODO. Is this needed?
    pass

def convert_attributions_to_rankings(self):
pass

@property
def get_mean_score(self):
"""Calculate the area under the curve (AUC) score for several test samples."""
return np.mean(np.array(self.evaluation_scores), axis=1)

@property
def get_auc_score(self):
"""Calculate the area under the curve (AUC) score for several test samples."""
return np.mean(
[utils.calculate_auc(np.array(curve)) for curve in self.evaluation_scores]
)
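Taken together, the wrapper now returns the difference curve between the original run and the attribution-inverted run, and exposes the two aggregations above. A rough end-to-end sketch; the class name, the top-level `PixelFlipping` export, and the constructor values are assumptions.

```python
# Sketch only: wrap PixelFlipping in the inverse-estimation routine.
from quantus import PixelFlipping
from quantus.metrics.inverse_estimation import InverseEstimation

pf = PixelFlipping(features_in_step=28, disable_warnings=True)
inverse = InverseEstimation(metric_init=pf)

# inv_scores[i] is the per-step difference between the pixel-flipping curve for
# the original attributions and the curve for the negated attributions of sample i.
# inv_scores = inverse(model=model, x_batch=x_batch, y_batch=y_batch, a_batch=a_batch)

# Aggregations defined above:
# inverse.get_mean_score  -> one mean difference per sample
# inverse.get_auc_score   -> mean AUC of the difference curves across samples
```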
