From 84c2b8052d093c2afe13c0b2910a8777a6784fad Mon Sep 17 00:00:00 2001 From: Arne van den Berg Date: Thu, 25 Jul 2024 17:01:30 +0200 Subject: [PATCH 1/8] created function to add additional features from custom search results --- spectrum_fundamentals/metrics/percolator.py | 25 +++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/spectrum_fundamentals/metrics/percolator.py b/spectrum_fundamentals/metrics/percolator.py index 33fc3f8..e178395 100644 --- a/spectrum_fundamentals/metrics/percolator.py +++ b/spectrum_fundamentals/metrics/percolator.py @@ -51,6 +51,7 @@ def __init__( self, metadata: pd.DataFrame, input_type: str, + additional_columns: Union[str, list], pred_intensities: Optional[Union[np.ndarray, scipy.sparse.csr_matrix]] = None, true_intensities: Optional[Union[np.ndarray, scipy.sparse.csr_matrix]] = None, mz: Optional[Union[np.ndarray, scipy.sparse.csr_matrix]] = None, @@ -62,9 +63,16 @@ def __init__( self.metadata = metadata self.input_type = input_type self.all_features_flag = all_features_flag + self.additional_columns = additional_columns self.regression_method = regression_method self.fdr_cutoff = fdr_cutoff self.xl = "CROSSLINKER_TYPE" in self.metadata.columns + self.base_columns = ['RAW_FILE','SCAN_NUMBER','MODIFIED_SEQUENCE','PRECURSOR_CHARGE', + 'SCAN_EVENT_NUMBER','MASS','SCORE','REVERSE','SEQUENCE', + 'PEPTIDE_LENGTH','FRAGMENTATION','CALCULATED_MASS','SEQUENCE_A', + 'SEQUENCE_B','MODIFIED_SEQUENCE_A','MODIFIED_SEQUENCE_B','RETENTION_TIME', + 'PREDICTED_IRT','INSTRUMENT_TYPES', + 'MASS_ANALYZER','MZ_RANGE'] super().__init__(pred_intensities, true_intensities, mz) @@ -274,6 +282,22 @@ def add_common_features(self): self.metrics_val["HCD"] = (self.metadata["FRAGMENTATION"] == "HCD").astype(int) self.metrics_val["CID"] = (self.metadata["FRAGMENTATION"] == "CID").astype(int) + def add_additional_features(self): + """Add additional features from custom serch results if specified""" + feature_cols=[] + if isinstance(self.additional_columns, list): + feature_cols = self.additional_columns + elif isinstance(self.additional_columns, list) and (self.additional_columns.lower() == "all"): + feature_cols = list(self.metadata.columns.difference(self.base_columns)) + + for col in feature_cols: + if col not in self.metadata.columns: + raise ValueError(f"provided column: {col} cannot be found in search results.") + elif not pd.api.types.is_numeric_dtype(self.metadata[col]): + raise ValueError(f"wrong datatype for column: {col}, datatype must be numerical.") + else: + self.metrics_val[col] = self.metadata[col] + def add_percolator_metadata_columns(self): """Add metadata columns needed by percolator, e.g. to identify a PSM.""" if self.xl: @@ -401,6 +425,7 @@ def _reorder_columns_for_percolator(self): def calc(self): """Adds percolator metadata and feature columns to metrics_val based on PSM metadata.""" self.add_common_features() + self.add_additional_features() self.target_decoy_labels = self.metadata["REVERSE"].apply(Percolator.get_target_decoy_label).to_numpy() np.random.seed(1) # add Prosit or Andromeda features From dc2c942826917cb63509bf188cb98f21c43cc8bf Mon Sep 17 00:00:00 2001 From: Arne van den Berg Date: Thu, 25 Jul 2024 17:36:22 +0200 Subject: [PATCH 2/8] added . to conform to documentation requirements --- spectrum_fundamentals/metrics/percolator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spectrum_fundamentals/metrics/percolator.py b/spectrum_fundamentals/metrics/percolator.py index e178395..974e2d9 100644 --- a/spectrum_fundamentals/metrics/percolator.py +++ b/spectrum_fundamentals/metrics/percolator.py @@ -283,7 +283,7 @@ def add_common_features(self): self.metrics_val["CID"] = (self.metadata["FRAGMENTATION"] == "CID").astype(int) def add_additional_features(self): - """Add additional features from custom serch results if specified""" + """Add additional features from custom serch results if specified.""" feature_cols=[] if isinstance(self.additional_columns, list): feature_cols = self.additional_columns From 59b0f5bb707f0c3bbd5260a85726d087bb1e407f Mon Sep 17 00:00:00 2001 From: Arne van den Berg Date: Thu, 25 Jul 2024 17:42:05 +0200 Subject: [PATCH 3/8] made additional_columns argument optional --- spectrum_fundamentals/metrics/percolator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spectrum_fundamentals/metrics/percolator.py b/spectrum_fundamentals/metrics/percolator.py index 974e2d9..27b1a06 100644 --- a/spectrum_fundamentals/metrics/percolator.py +++ b/spectrum_fundamentals/metrics/percolator.py @@ -51,7 +51,7 @@ def __init__( self, metadata: pd.DataFrame, input_type: str, - additional_columns: Union[str, list], + additional_columns: Optional[Union[str, list]] = None, pred_intensities: Optional[Union[np.ndarray, scipy.sparse.csr_matrix]] = None, true_intensities: Optional[Union[np.ndarray, scipy.sparse.csr_matrix]] = None, mz: Optional[Union[np.ndarray, scipy.sparse.csr_matrix]] = None, From 21822071e2c53e52913219bd0eb19c9b6363edbd Mon Sep 17 00:00:00 2001 From: Arne van den Berg Date: Thu, 25 Jul 2024 17:59:07 +0200 Subject: [PATCH 4/8] fixed bug in logic --- spectrum_fundamentals/metrics/percolator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spectrum_fundamentals/metrics/percolator.py b/spectrum_fundamentals/metrics/percolator.py index 27b1a06..7a4c525 100644 --- a/spectrum_fundamentals/metrics/percolator.py +++ b/spectrum_fundamentals/metrics/percolator.py @@ -287,7 +287,7 @@ def add_additional_features(self): feature_cols=[] if isinstance(self.additional_columns, list): feature_cols = self.additional_columns - elif isinstance(self.additional_columns, list) and (self.additional_columns.lower() == "all"): + elif isinstance(self.additional_columns, str) and (self.additional_columns.lower() == "all"): feature_cols = list(self.metadata.columns.difference(self.base_columns)) for col in feature_cols: From b8c42ef764127d3c7d80a2f7139c5989504182f0 Mon Sep 17 00:00:00 2001 From: Arne van den Berg Date: Tue, 30 Jul 2024 12:56:15 +0000 Subject: [PATCH 5/8] removed unnamed columns from features --- spectrum_fundamentals/metrics/percolator.py | 36 ++++++++++++++++----- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/spectrum_fundamentals/metrics/percolator.py b/spectrum_fundamentals/metrics/percolator.py index 7a4c525..888696c 100644 --- a/spectrum_fundamentals/metrics/percolator.py +++ b/spectrum_fundamentals/metrics/percolator.py @@ -67,12 +67,30 @@ def __init__( self.regression_method = regression_method self.fdr_cutoff = fdr_cutoff self.xl = "CROSSLINKER_TYPE" in self.metadata.columns - self.base_columns = ['RAW_FILE','SCAN_NUMBER','MODIFIED_SEQUENCE','PRECURSOR_CHARGE', - 'SCAN_EVENT_NUMBER','MASS','SCORE','REVERSE','SEQUENCE', - 'PEPTIDE_LENGTH','FRAGMENTATION','CALCULATED_MASS','SEQUENCE_A', - 'SEQUENCE_B','MODIFIED_SEQUENCE_A','MODIFIED_SEQUENCE_B','RETENTION_TIME', - 'PREDICTED_IRT','INSTRUMENT_TYPES', - 'MASS_ANALYZER','MZ_RANGE'] + self.base_columns = [ + "raw_file", + "scan_number", + "modified_sequence", + "precursor_charge", + "scan_event_number", + "mass", + "score", + "reverse", + "sequence", + "peptide_length", + "fragmentation", + "calculated_mass", + "sequence_a", + "sequence_b", + "modified_sequence_a", + "modified_sequence_b", + "retention_time", + "predicted_irt", + "instrument_types", + "mass_analyzer", + "mz_range", + "collision_energy", + ] super().__init__(pred_intensities, true_intensities, mz) @@ -284,11 +302,13 @@ def add_common_features(self): def add_additional_features(self): """Add additional features from custom serch results if specified.""" - feature_cols=[] + feature_cols = [] + if isinstance(self.additional_columns, list): feature_cols = self.additional_columns elif isinstance(self.additional_columns, str) and (self.additional_columns.lower() == "all"): - feature_cols = list(self.metadata.columns.difference(self.base_columns)) + feature_cols = [x for x in self.metadata.columns if x.lower() not in set(self.base_columns)] + feature_cols = [x for x in feature_cols if not x.lower().startswith("unnamed")] # remove Unnamed cols for col in feature_cols: if col not in self.metadata.columns: From 3a0660033cf450866e2ec44a6ec8890dafd58b7e Mon Sep 17 00:00:00 2001 From: Arne van den Berg Date: Tue, 30 Jul 2024 12:57:14 +0000 Subject: [PATCH 6/8] added unittests for all three possible parameter configurations --- tests/unit_tests/test_percolator.py | 35 +++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/tests/unit_tests/test_percolator.py b/tests/unit_tests/test_percolator.py index e2e6ae8..5e736f4 100644 --- a/tests/unit_tests/test_percolator.py +++ b/tests/unit_tests/test_percolator.py @@ -315,6 +315,41 @@ def test_get_delta_score(self): perc.Percolator.get_delta_score(df, "spectral_angle"), np.array([20, 40, 0, 40, 250, 0]) ) + def test_add_additional_features(self): + """Test add_additional_features.""" + types = { + "RAW_FILE": str, + "SCAN_NUMBER": int, + "MODIFIED_SEQUENCE": str, + "PRECURSOR_CHARGE": int, + "SCAN_EVENT_NUMBER": int, + "MASS": float, + "SCORE": float, + "REVERSE": int, + "SEQUENCE": str, + "PEPTIDE_LENGTH": float, + "A": float, + "B": float, + "precursor_charge": float, + "Unnamed 1": int, + } + + perc_input = pd.DataFrame(columns=types).astype(types) + + percolator_all = perc.Percolator(metadata=perc_input, input_type="rescore", additional_columns="all") + percolator_all.add_additional_features() + pd.testing.assert_frame_equal( + pd.DataFrame(columns=["A", "B"]).astype({"A": float, "B": float}), percolator_all.metrics_val + ) + + percolator_list = perc.Percolator(metadata=perc_input, input_type="rescore", additional_columns=["A"]) + percolator_list.add_additional_features() + pd.testing.assert_frame_equal(pd.DataFrame(columns=["A"]).astype({"A": float}), percolator_list.metrics_val) + + percolator_none = perc.Percolator(metadata=perc_input, input_type="rescore", additional_columns="none") + percolator_none.add_additional_features() + pd.testing.assert_frame_equal(pd.DataFrame(), percolator_none.metrics_val) + def test_calc(self): """Test calc.""" perc_input = pd.read_csv(Path(__file__).parent / "data/perc_input.csv") From 052a5650b211bf64a6ed73fc8f0f9b6fb57f3e3c Mon Sep 17 00:00:00 2001 From: Arne van den Berg Date: Tue, 30 Jul 2024 16:19:00 +0200 Subject: [PATCH 7/8] moved function call so that it only applies to rescore searchtype --- spectrum_fundamentals/metrics/percolator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spectrum_fundamentals/metrics/percolator.py b/spectrum_fundamentals/metrics/percolator.py index 888696c..62c3953 100644 --- a/spectrum_fundamentals/metrics/percolator.py +++ b/spectrum_fundamentals/metrics/percolator.py @@ -445,11 +445,12 @@ def _reorder_columns_for_percolator(self): def calc(self): """Adds percolator metadata and feature columns to metrics_val based on PSM metadata.""" self.add_common_features() - self.add_additional_features() self.target_decoy_labels = self.metadata["REVERSE"].apply(Percolator.get_target_decoy_label).to_numpy() np.random.seed(1) # add Prosit or Andromeda features if self.input_type == "rescore": + # add additional features + self.add_additional_features() fragments_ratio = fr.FragmentsRatio(self.pred_intensities, self.true_intensities) fragments_ratio.calc(xl=self.xl) similarity = sim.SimilarityMetrics(self.pred_intensities, self.true_intensities, self.mz) From 4377e20ea71790ab40ee80d1d13cd5f7d27469d3 Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Wed, 31 Jul 2024 14:47:08 +0200 Subject: [PATCH 8/8] added additional column option to original mode --- spectrum_fundamentals/metrics/percolator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spectrum_fundamentals/metrics/percolator.py b/spectrum_fundamentals/metrics/percolator.py index 62c3953..72f22ef 100644 --- a/spectrum_fundamentals/metrics/percolator.py +++ b/spectrum_fundamentals/metrics/percolator.py @@ -51,13 +51,13 @@ def __init__( self, metadata: pd.DataFrame, input_type: str, - additional_columns: Optional[Union[str, list]] = None, pred_intensities: Optional[Union[np.ndarray, scipy.sparse.csr_matrix]] = None, true_intensities: Optional[Union[np.ndarray, scipy.sparse.csr_matrix]] = None, mz: Optional[Union[np.ndarray, scipy.sparse.csr_matrix]] = None, all_features_flag: bool = False, regression_method: str = "lowess", fdr_cutoff: float = 0.01, + additional_columns: Optional[Union[str, list]] = None, ): """Initialize a Percolator obj.""" self.metadata = metadata @@ -503,6 +503,7 @@ def calc(self): ) else: + self.add_additional_features() self.metrics_val["andromeda"] = self.metadata["SCORE"] self.add_percolator_metadata_columns()