diff --git a/spectrum_fundamentals/metrics/fragments_ratio.py b/spectrum_fundamentals/metrics/fragments_ratio.py index 0c0de6f..8b0e830 100644 --- a/spectrum_fundamentals/metrics/fragments_ratio.py +++ b/spectrum_fundamentals/metrics/fragments_ratio.py @@ -3,6 +3,9 @@ import numpy as np import scipy.sparse +import pandas as pd +from warnings import simplefilter +simplefilter(action="ignore", category=pd.errors.PerformanceWarning) from .. import constants from .metric import Metric @@ -49,7 +52,7 @@ def count_with_ion_mask( if xl: array_size = 348 else: - array_size = 174 + array_size = boolean_array.shape[1] if ion_mask is None: ion_mask = scipy.sparse.csr_matrix(np.ones((array_size, 1))) @@ -136,7 +139,7 @@ def get_observation_state( ) return observation_state - def calc(self, xl: bool = False): + def calc(self, xl: bool = False, ion_dict: pd.DataFrame = None): """Adds columns with count, fraction and fraction_predicted features to metrics_val dataframe.""" if self.true_intensities is None or self.pred_intensities is None: return None @@ -418,6 +421,129 @@ def calc(self, xl: bool = False): self.metrics_val["fraction_observed_but_not_predicted_y_vs_predicted_y_b"] = ( self.metrics_val["count_observed_but_not_predicted_y_b"] / num_predicted_ions_y_b ) + + elif ion_dict is not None: + unique_ions = ion_dict['ion'].unique() + ION_MASKS = {ion: (ion_dict['ion'] == ion).to_numpy().astype(int) for ion in unique_ions} + + mask_observed_valid = FragmentsRatio.get_mask_observed_valid(self.true_intensities) + observed_boolean = FragmentsRatio.make_boolean(self.true_intensities, mask_observed_valid) + predicted_boolean = FragmentsRatio.make_boolean(self.pred_intensities, mask_observed_valid, cutoff=0.05) + observation_state = FragmentsRatio.get_observation_state( + observed_boolean, predicted_boolean, mask_observed_valid + ) + + # counting metrics + # - count_predicted + # - count_observed + # - count_not_observed_and_not_predicted + # - count_observed_and_predicted + # - count_observed_but_not_predicted + # - count_not_observed_but_predicted + + self.metrics_val["count_predicted"] = FragmentsRatio.count_with_ion_mask(predicted_boolean) + self.metrics_val["count_observed"] = FragmentsRatio.count_with_ion_mask(observed_boolean) + self.metrics_val["count_observed_and_predicted"] = FragmentsRatio.count_observation_states( + observation_state, ObservationState.OBS_AND_PRED + ) + self.metrics_val["count_not_observed_and_not_predicted"] = FragmentsRatio.count_observation_states( + observation_state, ObservationState.NOT_OBS_AND_NOT_PRED + ) + self.metrics_val["count_observed_but_not_predicted"] = FragmentsRatio.count_observation_states( + observation_state, ObservationState.OBS_BUT_NOT_PRED + ) + self.metrics_val["count_not_observed_but_predicted"] = FragmentsRatio.count_observation_states( + observation_state, ObservationState.NOT_OBS_BUT_PRED + ) + for ion, mask in ION_MASKS.items(): + self.metrics_val[f"count_predicted_{ion}"] = FragmentsRatio.count_with_ion_mask( + predicted_boolean, mask + ) + self.metrics_val[f"count_observed_{ion}"] = FragmentsRatio.count_with_ion_mask( + observed_boolean, mask + ) + self.metrics_val[f"count_observed_and_predicted_{ion}"] = FragmentsRatio.count_observation_states( + observation_state, ObservationState.OBS_AND_PRED, mask + ) + self.metrics_val[f"count_not_observed_and_not_predicted_{ion}"] = FragmentsRatio.count_observation_states( + observation_state, ObservationState.NOT_OBS_AND_NOT_PRED, mask + ) + self.metrics_val[f"count_observed_but_not_predicted_{ion}"] = FragmentsRatio.count_observation_states( + observation_state, ObservationState.OBS_BUT_NOT_PRED, mask + ) + self.metrics_val[f"count_not_observed_but_predicted_{ion}"] = FragmentsRatio.count_observation_states( + observation_state, ObservationState.NOT_OBS_BUT_PRED, mask + ) + + # fractional count metrics + + valid_ions = np.maximum(1, FragmentsRatio.count_with_ion_mask(mask_observed_valid)) + self.metrics_val["fraction_predicted"] = self.metrics_val["count_predicted"].values / valid_ions + self.metrics_val["fraction_observed"] = self.metrics_val["count_observed"].values / valid_ions + self.metrics_val["fraction_observed_and_predicted"] = ( + self.metrics_val["count_observed_and_predicted"].values / valid_ions + ) + self.metrics_val["fraction_not_observed_and_not_predicted"] = ( + self.metrics_val["count_not_observed_and_not_predicted"].values / valid_ions + ) + self.metrics_val["fraction_observed_but_not_predicted"] = ( + self.metrics_val["count_observed_but_not_predicted"].values / valid_ions + ) + self.metrics_val["fraction_not_observed_but_predicted"] = ( + self.metrics_val["count_not_observed_but_predicted"].values / valid_ions + ) + for ion, mask in ION_MASKS.items(): + valid_ions = np.maximum(1, FragmentsRatio.count_with_ion_mask(mask_observed_valid, mask)) + self.metrics_val[f"fraction_predicted_{ion}"] = self.metrics_val[f"count_predicted_{ion}"].values / valid_ions + self.metrics_val[f"fraction_observed_{ion}"] = self.metrics_val[f"count_observed_{ion}"].values / valid_ions + self.metrics_val[f"fraction_observed_and_predicted_{ion}"] = ( + self.metrics_val[f"count_observed_and_predicted_{ion}"].values / valid_ions + ) + self.metrics_val[f"fraction_not_observed_and_not_predicted_{ion}"] = ( + self.metrics_val[f"count_not_observed_and_not_predicted_{ion}"].values / valid_ions + ) + self.metrics_val[f"fraction_observed_but_not_predicted_{ion}"] = ( + self.metrics_val[f"count_observed_but_not_predicted_{ion}"].values / valid_ions + ) + self.metrics_val[f"fraction_not_observed_but_predicted_{ion}"] = ( + self.metrics_val[f"count_not_observed_but_predicted_{ion}"].values / valid_ions + ) + + # fractional count metrics relative to predictions + + num_predicted_ions = np.maximum(1, self.metrics_val["count_predicted"]) + self.metrics_val["fraction_observed_and_predicted_vs_predicted"] = ( + self.metrics_val["count_observed_and_predicted"].values / num_predicted_ions + ) + self.metrics_val["fraction_not_observed_and_not_predicted_vs_predicted"] = ( + self.metrics_val["count_not_observed_and_not_predicted"].values / num_predicted_ions + ) + self.metrics_val["fraction_observed_but_not_predicted_vs_predicted"] = ( + self.metrics_val["count_observed_but_not_predicted"].values / num_predicted_ions + ) + for ion, mask in ION_MASKS.items(): + num_predicted_ions = np.maximum(1, self.metrics_val[f"count_predicted_{ion}"]) + self.metrics_val[f"fraction_observed_and_predicted_{ion}_vs_predicted_{ion}"] = ( + self.metrics_val[f"count_observed_and_predicted_{ion}"].values / num_predicted_ions + ) + self.metrics_val[f"fraction_not_observed_and_not_predicted_{ion}_vs_predicted_{ion}"] = ( + self.metrics_val[f"count_not_observed_and_not_predicted_{ion}"].values / num_predicted_ions + ) + self.metrics_val[f"fraction_observed_but_not_predicted_{ion}_vs_predicted_{ion}"] = ( + self.metrics_val[f"count_observed_but_not_predicted_{ion}"].values / num_predicted_ions + ) + + # not needed, as these are simply (1 - fraction_observed_and_predicted_vs_predicted) + + num_predicted_ions = np.maximum(1, self.metrics_val["count_predicted"]) + self.metrics_val["fraction_not_observed_but_predicted_vs_predicted"] = ( + self.metrics_val["count_not_observed_but_predicted"].values / num_predicted_ions + ) + for ion, mask in ION_MASKS.items(): + num_predicted_ions = np.maximum(1, self.metrics_val[f"count_predicted_{ion}"]) + self.metrics_val[f"fraction_not_observed_but_predicted_{ion}_vs_predicted"] = ( + self.metrics_val[f"count_not_observed_but_predicted_{ion}"].values / num_predicted_ions + ) else: mask_observed_valid = FragmentsRatio.get_mask_observed_valid(self.true_intensities) diff --git a/spectrum_fundamentals/metrics/percolator.py b/spectrum_fundamentals/metrics/percolator.py index e10909d..3edaf0f 100644 --- a/spectrum_fundamentals/metrics/percolator.py +++ b/spectrum_fundamentals/metrics/percolator.py @@ -445,7 +445,7 @@ def _reorder_columns_for_percolator(self): new_columns = first_columns + sorted(mid_columns) + last_columns self.metrics_val = self.metrics_val[new_columns] - def calc(self): # noqa: C901 + def calc(self, ion_dict): # noqa: C901 """Adds percolator metadata and feature columns to metrics_val based on PSM metadata.""" self.add_common_features() self.target_decoy_labels = self.metadata["REVERSE"].apply(Percolator.get_target_decoy_label).to_numpy() @@ -455,7 +455,7 @@ def calc(self): # noqa: C901 # add additional features self.add_additional_features() fragments_ratio = fr.FragmentsRatio(self.pred_intensities, self.true_intensities) - fragments_ratio.calc(xl=self.xl) + fragments_ratio.calc(xl=self.xl, ion_dict=ion_dict) similarity = sim.SimilarityMetrics(self.pred_intensities, self.true_intensities, self.mz) similarity.calc(self.all_features_flag, xl=self.xl)