diff --git a/spectrum_io/search_result/sage.py b/spectrum_io/search_result/sage.py index 289e367..5aa4caa 100644 --- a/spectrum_io/search_result/sage.py +++ b/spectrum_io/search_result/sage.py @@ -21,13 +21,13 @@ def read_result(self, tmt_labeled: str = "") -> pd.DataFrame: :param tmt_labeled: tmt label as str :return: pd.DataFrame with the formatted data """ - logger.info("Reading msms.tsv file") + logger.info(f"Reading {self.path}") df = pd.read_csv( self.path, - usecols=["filename", "scannr", "peptide", "charge", "hyperscore", "calcmass", "proteins"], + usecols=["filename", "scannr", "peptide", "charge", "hyperscore", "calcmass", "label", "proteins"], sep="\t", ) - logger.info("Finished reading msms.tsv file") + logger.info(f"Finished reading {self.path}") # Standardize column names df.columns = df.columns.str.upper() @@ -51,6 +51,9 @@ def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str) -> pd.DataFram "SCANNR": "SCAN_NUMBER", "PEPTIDE": "MODIFIED_SEQUENCE", "CHARGE": "PRECURSOR_CHARGE", + "CALCMASS": "MASS", + "HYPERSCORE": "SCORE", + "LABEL": "REVERSE", } ) @@ -59,16 +62,12 @@ def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str) -> pd.DataFram # extracting only the scan number df["SCAN_NUMBER"] = [int(x.rsplit("=", 1)[-1]) for x in df["SCAN_NUMBER"]] # creating a column of decoys and targets - df["REVERSE"] = df["PROTEINS"].str.startswith("rev_") + df["REVERSE"] = df["REVERSE"] < 0 # removing modification to create the unmodified sequences - df["SEQUENCE"] = df["MODIFIED_SEQUENCE"].str.replace(r"\[.*?\]", "", regex=True) + df["SEQUENCE"] = df["MODIFIED_SEQUENCE"].str.replace(r"\-|\[.*?\]", "", regex=True) # length of the peptide df["PEPTIDE_LENGTH"] = df["SEQUENCE"].str.len() - # mass of the peptide - df["MASS"] = df["CALCMASS"] - # score of the peptide - df["SCORE"] = df["HYPERSCORE"] - # converting proforma to unimode + # converting sage to unimod df["MODIFIED_SEQUENCE"] = sage_to_internal(df["MODIFIED_SEQUENCE"]) return df diff --git a/tests/unit_tests/data/sage_output_internal.csv b/tests/unit_tests/data/sage_output_internal.csv index 2a15b5b..9b6c7ff 100644 --- a/tests/unit_tests/data/sage_output_internal.csv +++ b/tests/unit_tests/data/sage_output_internal.csv @@ -1,16 +1,18 @@ -,MODIFIED_SEQUENCE,PROTEINS,RAW_FILE,SCAN_NUMBER,CALCMASS,PRECURSOR_CHARGE,HYPERSCORE,REVERSE,SEQUENCE,PEPTIDE_LENGTH,MASS,SCORE -0,[UNIMOD:737]-HLDGGAEQSLLFVAGM[UNIMOD:35]R,rev_sp|P26006-1|ITA3_HUMAN;rev_sp|P26006|ITA3_HUMAN,GN20170722_SK_HLA_G0103_R1_01,50989,2045.0568,2,18.084579770792818,True,-HLDGGAEQSLLFVAGMR,18,2045.0568,18.084579770792818 -1,[UNIMOD:737]-GRFVEPLSNVQEEWNQK[UNIMOD:737],rev_sp|Q9H1A3-2|METL9_HUMAN;rev_sp|Q9H1A3|METL9_HUMAN;rev_tr|H3BN86|H3BN86_HUMAN;rev_tr|Q8TD49|Q8TD49_HUMAN,GN20170722_SK_HLA_G0103_R1_01,37495,2517.338,2,21.44256145543081,True,-GRFVEPLSNVQEEWNQK,18,2517.338,21.44256145543081 -3,[UNIMOD:737]-VNM[UNIMOD:35]RTSSSIQNEDEATSMELIAPGP,sp|Q9Y5Y9|SCNAA_HUMAN;tr|A0A2R8Y6J6|A0A2R8Y6J6_HUMAN;tr|A0A590UJM0|A0A590UJM0_HUMAN,GN20170722_SK_HLA_G0103_R1_01,45493,2921.3948,3,19.97971639308644,False,-VNMRTSSSIQNEDEATSMELIAPGP,26,2921.3948,19.97971639308644 -4,[UNIMOD:737]-VGEQEAPHEGGHPGSDSARASMADWLR,sp|Q9H093|NUAK2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,41946,3075.4417,3,20.936078174830037,False,-VGEQEAPHEGGHPGSDSARASMADWLR,28,3075.4417,20.936078174830037 -5,[UNIMOD:737]-EM[UNIMOD:35]VSPTDSC[UNIMOD:4]VRVSVRDLPQFHVSVVDM[UNIMOD:35]DR,rev_sp|Q9HCN3|PGAP6_HUMAN;rev_tr|K4DI83|K4DI83_HUMAN,GN20170722_SK_HLA_G0103_R1_01,39750,3620.7583,3,17.954540767022678,True,-EMVSPTDSCVRVSVRDLPQFHVSVVDMDR,30,3620.7583,17.954540767022678 -6,[UNIMOD:737]-K[UNIMOD:737]M[UNIMOD:35]EEDIYTNLSK[UNIMOD:737]METVLGQSMSSLPLSYR,sp|Q8WXH0-2|SYNE2_HUMAN;sp|Q8WXH0|SYNE2_HUMAN;tr|A0A0A0MRE3|A0A0A0MRE3_HUMAN;tr|G3V5X4|G3V5X4_HUMAN,GN20170722_SK_HLA_G0103_R1_01,45441,4053.1077,4,19.962357128713123,False,-KMEEDIYTNLSKMETVLGQSMSSLPLSYR,30,4053.1077,19.962357128713123 -7,[UNIMOD:737]-TC[UNIMOD:4]SK[UNIMOD:737]SQGSWGNREIVIIDTPDMFSWK[UNIMOD:737],sp|Q9UG22|GIMA2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,36758,3728.9263,3,18.776753308950084,False,-TCSKSQGSWGNREIVIIDTPDMFSWK,27,3728.9263,18.776753308950084 -8,[UNIMOD:737]-M[UNIMOD:35]SLGRAAPSAPGR,rev_sp|P51693-2|APLP1_HUMAN;rev_sp|P51693|APLP1_HUMAN;rev_tr|K7EMS1|K7EMS1_HUMAN,GN20170722_SK_HLA_G0103_R1_01,49100,1514.819,2,20.66450790410668,True,-MSLGRAAPSAPGR,14,1514.819,20.66450790410668 -9,[UNIMOD:737]-C[UNIMOD:4]LIQM[UNIMOD:35]GAAVEAK[UNIMOD:737]AYNGNTALHVAASLQYR,tr|H7C5S1|H7C5S1_HUMAN,GN20170722_SK_HLA_G0103_R1_01,37376,3593.885,3,21.88178474607965,False,-CLIQMGAAVEAKAYNGNTALHVAASLQYR,30,3593.885,21.88178474607965 -11,[UNIMOD:737]-LNVEGTERGSC[UNIMOD:4]GRK[UNIMOD:737],sp|O75078-2|ADA11_HUMAN;sp|O75078|ADA11_HUMAN;tr|B4DKD2|B4DKD2_HUMAN;tr|K7EKA8|K7EKA8_HUMAN,GN20170722_SK_HLA_G0103_R1_01,40784,2020.0887,3,21.842498887625368,False,-LNVEGTERGSCGRK,15,2020.0887,21.842498887625368 -12,[UNIMOD:737]-C[UNIMOD:4]NRGWTALHESVSR,sp|Q96Q27-1|ASB2_HUMAN;sp|Q96Q27|ASB2_HUMAN;tr|G3V2Z2|G3V2Z2_HUMAN;tr|G3V4B2|G3V4B2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,51002,1900.9526,2,17.823081883875247,False,-CNRGWTALHESVSR,15,1900.9526,17.823081883875247 -13,[UNIMOD:737]-AEVDNQMHVVDK[UNIMOD:737]NPVSLVSK[UNIMOD:737]TR,rev_sp|O75151|PHF2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,40715,3152.7583,3,19.971965322167343,True,-AEVDNQMHVVDKNPVSLVSKTR,23,3152.7583,19.971965322167343 -14,[UNIMOD:737]-YLLSLEEERPALMDDR,sp|Q86TB9-2|PATL1_HUMAN;sp|Q86TB9-4|PATL1_HUMAN;sp|Q86TB9|PATL1_HUMAN,GN20170722_SK_HLA_G0103_R1_01,37506,2178.1194,2,19.99761378859208,False,-YLLSLEEERPALMDDR,17,2178.1194,19.99761378859208 -15,[UNIMOD:737]-EGRGAGSQSPPRGR,sp|Q6ZSN1|YI023_HUMAN,GN20170722_SK_HLA_G0103_R1_01,45427,1639.8704,2,19.22966416224777,False,-EGRGAGSQSPPRGR,15,1639.8704,19.22966416224777 -16,[UNIMOD:737]-TASASRRSAR,sp|P08729|K2C7_HUMAN,GN20170722_SK_HLA_G0103_R1_01,30700,1290.7319,2,19.376349012983997,False,-TASASRRSAR,11,1290.7319,19.376349012983997 +MODIFIED_SEQUENCE,PROTEINS,RAW_FILE,SCAN_NUMBER,REVERSE,MASS,PRECURSOR_CHARGE,SCORE,SEQUENCE,PEPTIDE_LENGTH +[UNIMOD:737]-HLDGGAEQSLLFVAGM[UNIMOD:35]R,rev_sp|P26006-1|ITA3_HUMAN;rev_sp|P26006|ITA3_HUMAN,GN20170722_SK_HLA_G0103_R1_01,50989,True,2045.0568,2,18.084579770792818,HLDGGAEQSLLFVAGMR,17 +[UNIMOD:737]-GRFVEPLSNVQEEWNQK[UNIMOD:737],rev_sp|Q9H1A3-2|METL9_HUMAN;rev_sp|Q9H1A3|METL9_HUMAN;rev_tr|H3BN86|H3BN86_HUMAN;rev_tr|Q8TD49|Q8TD49_HUMAN,GN20170722_SK_HLA_G0103_R1_01,37495,True,2517.338,2,21.44256145543081,GRFVEPLSNVQEEWNQK,17 +[UNIMOD:737]-LTVEC[UNIMOD:4]MPTIASDDLPVGTLQESEVSM[UNIMOD:35]TGPG,rev_tr|C9JVX2|C9JVX2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,45450,True,3378.6082,3,18.628243636678995,LTVECMPTIASDDLPVGTLQESEVSMTGPG,30 +[UNIMOD:737]-VNM[UNIMOD:35]RTSSSIQNEDEATSMELIAPGP,sp|Q9Y5Y9|SCNAA_HUMAN;tr|A0A2R8Y6J6|A0A2R8Y6J6_HUMAN;tr|A0A590UJM0|A0A590UJM0_HUMAN,GN20170722_SK_HLA_G0103_R1_01,45493,False,2921.3948,3,19.97971639308644,VNMRTSSSIQNEDEATSMELIAPGP,25 +[UNIMOD:737]-VGEQEAPHEGGHPGSDSARASMADWLR,sp|Q9H093|NUAK2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,41946,False,3075.4417,3,20.936078174830037,VGEQEAPHEGGHPGSDSARASMADWLR,27 +[UNIMOD:737]-EM[UNIMOD:35]VSPTDSC[UNIMOD:4]VRVSVRDLPQFHVSVVDM[UNIMOD:35]DR,rev_sp|Q9HCN3|PGAP6_HUMAN;rev_tr|K4DI83|K4DI83_HUMAN,GN20170722_SK_HLA_G0103_R1_01,39750,True,3620.7583,3,17.954540767022678,EMVSPTDSCVRVSVRDLPQFHVSVVDMDR,29 +[UNIMOD:737]-K[UNIMOD:737]M[UNIMOD:35]EEDIYTNLSK[UNIMOD:737]METVLGQSMSSLPLSYR,sp|Q8WXH0-2|SYNE2_HUMAN;sp|Q8WXH0|SYNE2_HUMAN;tr|A0A0A0MRE3|A0A0A0MRE3_HUMAN;tr|G3V5X4|G3V5X4_HUMAN,GN20170722_SK_HLA_G0103_R1_01,45441,False,4053.1077,4,19.962357128713123,KMEEDIYTNLSKMETVLGQSMSSLPLSYR,29 +[UNIMOD:737]-TC[UNIMOD:4]SK[UNIMOD:737]SQGSWGNREIVIIDTPDMFSWK[UNIMOD:737],sp|Q9UG22|GIMA2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,36758,False,3728.9263,3,18.776753308950084,TCSKSQGSWGNREIVIIDTPDMFSWK,26 +[UNIMOD:737]-M[UNIMOD:35]SLGRAAPSAPGR,rev_sp|P51693-2|APLP1_HUMAN;rev_sp|P51693|APLP1_HUMAN;rev_tr|K7EMS1|K7EMS1_HUMAN,GN20170722_SK_HLA_G0103_R1_01,49100,True,1514.819,2,20.66450790410668,MSLGRAAPSAPGR,13 +[UNIMOD:737]-C[UNIMOD:4]LIQM[UNIMOD:35]GAAVEAK[UNIMOD:737]AYNGNTALHVAASLQYR,tr|H7C5S1|H7C5S1_HUMAN,GN20170722_SK_HLA_G0103_R1_01,37376,False,3593.885,3,21.88178474607965,CLIQMGAAVEAKAYNGNTALHVAASLQYR,29 +[UNIMOD:737]-M[UNIMOD:35]EESLNIVK[UNIMOD:737]YTAFLYNDQLIWSGLEQDDMR,sp|P86790|CCZ1B_HUMAN;sp|P86791|CCZ1_HUMAN,GN20170722_SK_HLA_G0103_R1_01,41296,False,4095.037,3,22.91659667770412,MEESLNIVKYTAFLYNDQLIWSGLEQDDMR,30 +[UNIMOD:737]-LNVEGTERGSC[UNIMOD:4]GRK[UNIMOD:737],sp|O75078-2|ADA11_HUMAN;sp|O75078|ADA11_HUMAN;tr|B4DKD2|B4DKD2_HUMAN;tr|K7EKA8|K7EKA8_HUMAN,GN20170722_SK_HLA_G0103_R1_01,40784,False,2020.0887,3,21.842498887625368,LNVEGTERGSCGRK,14 +[UNIMOD:737]-C[UNIMOD:4]NRGWTALHESVSR,sp|Q96Q27-1|ASB2_HUMAN;sp|Q96Q27|ASB2_HUMAN;tr|G3V2Z2|G3V2Z2_HUMAN;tr|G3V4B2|G3V4B2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,51002,False,1900.9526,2,17.823081883875247,CNRGWTALHESVSR,14 +[UNIMOD:737]-AEVDNQMHVVDK[UNIMOD:737]NPVSLVSK[UNIMOD:737]TR,rev_sp|O75151|PHF2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,40715,True,3152.7583,3,19.971965322167343,AEVDNQMHVVDKNPVSLVSKTR,22 +[UNIMOD:737]-YLLSLEEERPALMDDR,sp|Q86TB9-2|PATL1_HUMAN;sp|Q86TB9-4|PATL1_HUMAN;sp|Q86TB9|PATL1_HUMAN,GN20170722_SK_HLA_G0103_R1_01,37506,False,2178.1194,2,19.99761378859208,YLLSLEEERPALMDDR,16 +[UNIMOD:737]-EGRGAGSQSPPRGR,sp|Q6ZSN1|YI023_HUMAN,GN20170722_SK_HLA_G0103_R1_01,45427,False,1639.8704,2,19.22966416224777,EGRGAGSQSPPRGR,14 +[UNIMOD:737]-TASASRRSAR,sp|P08729|K2C7_HUMAN,GN20170722_SK_HLA_G0103_R1_01,30700,False,1290.7319,2,19.376349012983997,TASASRRSAR,10 diff --git a/tests/unit_tests/test_sage.py b/tests/unit_tests/test_sage.py index a2a69e6..40dcfb0 100644 --- a/tests/unit_tests/test_sage.py +++ b/tests/unit_tests/test_sage.py @@ -12,8 +12,7 @@ class TestSage(unittest.TestCase): def test_read_sage(self): """Test function for reading sage results and transforming to Prosit format.""" expected_sage_internal_path = Path(__file__).parent / "data" / "sage_output_internal.csv" - internal_search_results_df = Sage(Path(__file__).parent / "data" / "sage_output.tsv").read_result() - expected_df = pd.read_csv(expected_sage_internal_path, index_col=0) + expected_df = pd.read_csv(expected_sage_internal_path) pd.testing.assert_frame_equal(internal_search_results_df, expected_df)