diff --git a/spectrum_fundamentals/metrics/percolator.py b/spectrum_fundamentals/metrics/percolator.py index bd41373..3a71fba 100644 --- a/spectrum_fundamentals/metrics/percolator.py +++ b/spectrum_fundamentals/metrics/percolator.py @@ -1,8 +1,5 @@ import enum -import hashlib import logging -import re -import subprocess from typing import Optional, Tuple, Union import numpy as np @@ -60,7 +57,6 @@ def __init__( all_features_flag: bool = False, regression_method: str = "lowess", fdr_cutoff: float = 0.01, - percolator_version: Optional[float] = 3.05, ): """Initialize a Percolator obj.""" self.metadata = metadata @@ -68,18 +64,8 @@ def __init__( self.all_features_flag = all_features_flag self.regression_method = regression_method self.fdr_cutoff = fdr_cutoff - - self._resolve_percolator_compatibility(percolator_version) super().__init__(pred_intensities, true_intensities, mz) - def _resolve_percolator_compatibility(self, percolator_version: Optional[float] = None): - if percolator_version is None: - result = subprocess.run(["percolator", "-h"], capture_output=True, text=True) - version_line = result.stderr.splitlines()[0].strip() - version = version_line.split("version ")[1] - percolator_version = float(re.sub(r"\.[^.]+$", "", version)) - self.prot_col_name = "Proteins" if percolator_version >= 3.06 else "Protein" - @staticmethod def sample_balanced_over_bins(retention_time_df: pd.DataFrame, sample_size: int = 5000) -> pd.Index: """ @@ -170,18 +156,6 @@ def get_aligned_predicted_retention_times( return aligned_rts_predicted - @staticmethod - def get_scannr(metadata_subset: Union[pd.Series, Tuple[str, int]]) -> int: - """ - Creates a hash of the raw_file and scan number to use as a unique scan number in percolator. - - :param metadata_subset: tuple of (raw_file, scan_number) - :return: hashed unique id - """ - raw_file, scan_number = metadata_subset - s = f"{raw_file}{scan_number}".encode() - return int(hashlib.sha224(s).hexdigest()[:6], 16) - @staticmethod def get_delta_score(scores_df: pd.DataFrame, scoring_feature: str) -> np.ndarray: """ @@ -296,11 +270,11 @@ def add_percolator_metadata_columns(self): spec_id_cols.append("SCAN_EVENT_NUMBER") self.metrics_val["SpecId"] = self.metadata[spec_id_cols].apply(Percolator.get_specid, axis=1) self.metrics_val["Label"] = self.target_decoy_labels - self.metrics_val["ScanNr"] = self.metadata[["RAW_FILE", "SCAN_NUMBER"]].apply(Percolator.get_scannr, axis=1) - + self.metrics_val["ScanNr"] = self.metadata["SCAN_NUMBER"] + self.metrics_val["filename"] = self.metadata["RAW_FILE"] self.metrics_val["Peptide"] = self.metadata["MODIFIED_SEQUENCE"].apply(lambda x: "_." + x + "._") - self.metrics_val[self.prot_col_name] = self.metadata[ + self.metrics_val["Proteins"] = self.metadata[ "MODIFIED_SEQUENCE" ] # we don't need the protein ID to get PSM / peptide results, fill with peptide sequence @@ -397,8 +371,8 @@ def fdrs_to_qvals(fdrs: np.ndarray) -> np.ndarray: def _reorder_columns_for_percolator(self): all_columns = self.metrics_val.columns - first_columns = ["SpecId", "Label", "ScanNr"] - last_columns = ["Peptide", "Protein"] if "Protein" in all_columns else ["Peptide", "Proteins"] + first_columns = ["SpecId", "Label", "ScanNr", "filename"] + last_columns = ["Peptide", "Proteins"] mid_columns = list(set(all_columns) - set(first_columns) - set(last_columns)) new_columns = first_columns + sorted(mid_columns) + last_columns self.metrics_val = self.metrics_val[new_columns] diff --git a/tests/unit_tests/data/perc_output.csv b/tests/unit_tests/data/perc_output.csv index 1309d95..55ed3bb 100644 --- a/tests/unit_tests/data/perc_output.csv +++ b/tests/unit_tests/data/perc_output.csv @@ -1,4 +1,4 @@ -,SpecId,Label,ScanNr,CID,Charge1,Charge2,Charge3,Charge4,Charge5,Charge6,HCD,KR,Mass,RT,UnknownFragmentationMethod,abs_rt_diff,collision_energy_aligned,count_not_observed_and_not_predicted,count_not_observed_and_not_predicted_b,count_not_observed_and_not_predicted_y,count_not_observed_but_predicted,count_not_observed_but_predicted_b,count_not_observed_but_predicted_y,count_observed,count_observed_and_predicted,count_observed_and_predicted_b,count_observed_and_predicted_y,count_observed_b,count_observed_but_not_predicted,count_observed_but_not_predicted_b,count_observed_but_not_predicted_y,count_observed_y,count_predicted,count_predicted_b,count_predicted_y,fraction_not_observed_and_not_predicted,fraction_not_observed_and_not_predicted_b,fraction_not_observed_and_not_predicted_b_vs_predicted_b,fraction_not_observed_and_not_predicted_vs_predicted,fraction_not_observed_and_not_predicted_y,fraction_not_observed_and_not_predicted_y_vs_predicted_y,fraction_not_observed_but_predicted,fraction_not_observed_but_predicted_b,fraction_not_observed_but_predicted_b_vs_predicted,fraction_not_observed_but_predicted_vs_predicted,fraction_not_observed_but_predicted_y,fraction_not_observed_but_predicted_y_vs_predicted,fraction_observed,fraction_observed_and_predicted,fraction_observed_and_predicted_b,fraction_observed_and_predicted_b_vs_predicted_b,fraction_observed_and_predicted_vs_predicted,fraction_observed_and_predicted_y,fraction_observed_and_predicted_y_vs_predicted_y,fraction_observed_b,fraction_observed_but_not_predicted,fraction_observed_but_not_predicted_b,fraction_observed_but_not_predicted_b_vs_predicted_b,fraction_observed_but_not_predicted_vs_predicted,fraction_observed_but_not_predicted_y,fraction_observed_but_not_predicted_y_vs_predicted_y,fraction_observed_y,fraction_predicted,fraction_predicted_b,fraction_predicted_y,iRT,lda_scores,missedCleavages,modified_cosine,pearson_corr,pred_RT,sequence_length,spectral_angle,Peptide,Protein +,SpecId,Label,ScanNr,CID,Charge1,Charge2,Charge3,Charge4,Charge5,Charge6,HCD,KR,Mass,RT,UnknownFragmentationMethod,abs_rt_diff,collision_energy_aligned,count_not_observed_and_not_predicted,count_not_observed_and_not_predicted_b,count_not_observed_and_not_predicted_y,count_not_observed_but_predicted,count_not_observed_but_predicted_b,count_not_observed_but_predicted_y,count_observed,count_observed_and_predicted,count_observed_and_predicted_b,count_observed_and_predicted_y,count_observed_b,count_observed_but_not_predicted,count_observed_but_not_predicted_b,count_observed_but_not_predicted_y,count_observed_y,count_predicted,count_predicted_b,count_predicted_y,fraction_not_observed_and_not_predicted,fraction_not_observed_and_not_predicted_b,fraction_not_observed_and_not_predicted_b_vs_predicted_b,fraction_not_observed_and_not_predicted_vs_predicted,fraction_not_observed_and_not_predicted_y,fraction_not_observed_and_not_predicted_y_vs_predicted_y,fraction_not_observed_but_predicted,fraction_not_observed_but_predicted_b,fraction_not_observed_but_predicted_b_vs_predicted,fraction_not_observed_but_predicted_vs_predicted,fraction_not_observed_but_predicted_y,fraction_not_observed_but_predicted_y_vs_predicted,fraction_observed,fraction_observed_and_predicted,fraction_observed_and_predicted_b,fraction_observed_and_predicted_b_vs_predicted_b,fraction_observed_and_predicted_vs_predicted,fraction_observed_and_predicted_y,fraction_observed_and_predicted_y_vs_predicted_y,fraction_observed_b,fraction_observed_but_not_predicted,fraction_observed_but_not_predicted_b,fraction_observed_but_not_predicted_b_vs_predicted_b,fraction_observed_but_not_predicted_vs_predicted,fraction_observed_but_not_predicted_y,fraction_observed_but_not_predicted_y_vs_predicted_y,fraction_observed_y,fraction_predicted,fraction_predicted_b,fraction_predicted_y,iRT,lda_scores,missedCleavages,modified_cosine,pearson_corr,pred_RT,sequence_length,spectral_angle,Peptide,Proteins 0,20210122_0263_TMUCLHan_Peiru_DDA_IP_C797S_02-7978-AAIGEATRL-2-1,1,10203379,0,0,1,0,0,0,0,1,1,900.50288029264,0.5000000183883155,0,0.0,0.3,2.0,1,1,2.0,1,1,4.0,2.0,1,1,1,2.0,0,2,3,4.0,2,2,0.25,0.3333333333333333,0.5,0.5,0.2,0.5,0.25,0.3333333333333333,0.5,0.5,0.2,0.5,0.5,0.25,0.3333333333333333,0.5,0.5,0.2,0.5,0.3333333333333333,0.25,0.0,0.0,0.5,0.4,1.0,0.6,0.5,0.6666666666666666,0.4,0.5000000183883155,1.7027402478448157e+30,1,0.8959462788501606,0.8831116463997931,0.5000000323590892,9,0.7449534694193893,_.AAIGEATRL._,AAIGEATRL 1,20210122_0263_TMUCLHan_Peiru_DDA_IP_C797S_02-12304-AAVPRAAFL-2-2,-1,5831230,0,0,1,0,0,0,0,1,1,914.53379,1.000000038995633,0,2.533209730870567e-08,0.3,1.0,0,1,1.0,0,1,2.0,1.0,0,1,1,1.0,1,0,1,2.0,0,2,0.25,0.0,0.0,0.5,0.3333333333333333,0.5,0.25,0.0,0.0,0.5,0.3333333333333333,0.5,0.5,0.25,0.0,0.0,0.5,0.3333333333333333,0.5,1.0,0.25,1.0,1.0,0.5,0.0,0.0,0.3333333333333333,0.5,0.0,0.6666666666666666,1.0000000136635356,-1.7027402478448157e+30,1,1.0,0.0,1.5000000246189773,9,0.5903344918223923,_.AAVPRAAFL._,AAVPRAAFL 2,20210122_0263_TMUCLHan_Peiru_DDA_IP_C797S_02-12398-AAYFGVYDTAK-2-3,-1,16059765,0,0,1,0,0,0,0,1,1,1204.5764,1.5000000993570155,0,1.0185683851915428e-07,0.3,1.0,0,1,1.0,0,1,2.0,1.0,0,1,1,1.0,1,0,1,2.0,0,2,0.25,0.0,0.0,0.5,0.3333333333333333,0.5,0.25,0.0,0.0,0.5,0.3333333333333333,0.5,0.5,0.25,0.0,0.0,0.5,0.3333333333333333,0.5,1.0,0.25,1.0,1.0,0.5,0.0,0.0,0.3333333333333333,0.5,0.0,0.6666666666666666,1.499999997500177,-1.7027402478448157e+30,0,1.0,0.0,2.500000056694346,11,0.5903344918223923,_.AAYFGVYDTAK._,AAYFGVYDTAK diff --git a/tests/unit_tests/test_percolator.py b/tests/unit_tests/test_percolator.py index 4216c7a..6a2de87 100644 --- a/tests/unit_tests/test_percolator.py +++ b/tests/unit_tests/test_percolator.py @@ -274,12 +274,6 @@ def _create_noisy_logistic_data(): class TestPercolator: """Class to test percolator.""" - def test_get_scannr(self): - """Test get_scannr.""" - np.testing.assert_equal( - perc.Percolator.get_scannr(("20210122_0263_TMUCLHan_Peiru_DDA_IP_C797S_02", 7978)), 10203379 - ) - def test_get_specid(self): """Test get_specid.""" np.testing.assert_string_equal( @@ -361,11 +355,12 @@ def test_calc(self): percolator.metrics_val["SpecId"][0], "20210122_0263_TMUCLHan_Peiru_DDA_IP_C797S_02-7978-AAIGEATRL-2-1" ) np.testing.assert_equal(percolator.metrics_val["Label"][0], 1) - np.testing.assert_equal(percolator.metrics_val["ScanNr"][0], 10203379) + np.testing.assert_equal(percolator.metrics_val["ScanNr"][0], 7978) + np.testing.assert_equal(percolator.metrics_val["filename"][0], "20210122_0263_TMUCLHan_Peiru_DDA_IP_C797S_02") # np.testing.assert_almost_equal(percolator.metrics_val['ExpMass'][0], 900.50345678) np.testing.assert_string_equal(percolator.metrics_val["Peptide"][0], "_.AAIGEATRL._") np.testing.assert_string_equal( - percolator.metrics_val["Protein"][0], "AAIGEATRL" + percolator.metrics_val["Proteins"][0], "AAIGEATRL" ) # we don't need the protein ID to get PSM / peptide results # features