diff --git a/spectrum_io/search_result/msfragger.py b/spectrum_io/search_result/msfragger.py index a55ac15..244485b 100644 --- a/spectrum_io/search_result/msfragger.py +++ b/spectrum_io/search_result/msfragger.py @@ -5,7 +5,7 @@ import pandas as pd import spectrum_fundamentals.constants as c from pyteomics import pepxml -from spectrum_fundamentals.mod_string import internal_without_mods +from spectrum_fundamentals.mod_string import internal_without_mods, msfragger_to_internal from tqdm import tqdm from .search_results import SearchResults, filter_valid_prosit_sequences @@ -42,7 +42,7 @@ def read_result(path: Union[str, Path], tmt_labeled: str) -> pd.DataFrame: df = pd.concat(ms_frag_results) - df = update_columns_for_prosit(df, "") + df = update_columns_for_prosit(df, tmt_labeled) return filter_valid_prosit_sequences(df) @@ -58,7 +58,17 @@ def update_columns_for_prosit(df, tmt_labeled: str) -> pd.DataFrame: df["RAW_FILE"] = df["spectrum"].apply(lambda x: x.split(".")[0]) df["MASS"] = df["precursor_neutral_mass"] df["PEPTIDE_LENGTH"] = df["peptide"].apply(lambda x: len(x)) - df["MODIFIED_SEQUENCE"] = msfragger_to_internal(df["modified_peptide"]) + + if tmt_labeled != "": + unimod_tag = c.TMT_MODS[tmt_labeled] + logger.info("Adding TMT fixed modifications") + df["MODIFIED_SEQUENCE"] = msfragger_to_internal( + df["modified_peptide"].to_list(), + fixed_mods={"C": "C[UNIMOD:4]", r"n[\d+]": f"{unimod_tag}-", "K": f"K{unimod_tag}"}, + ) + else: + df["MODIFIED_SEQUENCE"] = msfragger_to_internal(df["modified_peptide"].to_list()) + df.rename( columns={ "assumed_charge": "PRECURSOR_CHARGE", @@ -84,20 +94,3 @@ def update_columns_for_prosit(df, tmt_labeled: str) -> pd.DataFrame: "PEPTIDE_LENGTH", ] ] - - -def msfragger_to_internal(modstrings: pd.Series): - """ - Transform modstring from msfragger format to internal format. 
- - This function takes a modstrings column from a pandas dataframe and converts each - supported modification (M[147] and C[160]) to the internal representation that is - M[UNIMOD:35] and C[UNIMOD:4], respectively. Since C is considered a fixed modification, - every occurence of a C is transformed to C[UNIMOD:4] as well. - - :param modstrings: pd.Series containing the msfragger modstrings - :return: pd.Series with internal modstrings - """ - modstrings = modstrings.str.replace("M[147]", "M[UNIMOD:35]", regex=False) - modstrings = modstrings.str.replace(r"C\[160\]|C", "C[UNIMOD:4]", regex=True) - return modstrings diff --git a/tests/unit_tests/data/psm_tmt.pepXML b/tests/unit_tests/data/psm_tmt.pepXML new file mode 100644 index 0000000..b31bdf0 --- /dev/null +++ b/tests/unit_tests/data/psm_tmt.pepXML @@ -0,0 +1,175 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/unit_tests/data/psm_tmt_internal.csv b/tests/unit_tests/data/psm_tmt_internal.csv new file mode 100644 index 0000000..4e3f7be --- /dev/null +++ b/tests/unit_tests/data/psm_tmt_internal.csv @@ -0,0 +1,5 @@ +,RAW_FILE,SCAN_NUMBER,MODIFIED_SEQUENCE,PRECURSOR_CHARGE,SCAN_EVENT_NUMBER,MASS,SCORE,REVERSE,SEQUENCE,PEPTIDE_LENGTH +0,Ecl1_0277_R0096-01_S004436_D_H01_TMT18_01,2459,[UNIMOD:2016]-GQAVLAFQEQVGTGR,5,34,1863.023,8.221,True,GQAVLAFQEQVGTGR,15 +1,Ecl1_0277_R0096-01_S004436_D_H01_TMT18_01,2486,[UNIMOD:2016]-TEVPM[UNIMOD:35]GLSLRTTSAR,5,42,1937.0531,7.083,False,TEVPMGLSLRTTSAR,15 +2,Ecl1_0277_R0096-01_S004436_D_H01_TMT18_01,2980,[UNIMOD:2016]-YSGNC[UNIMOD:4]DRQSVER,3,193,1773.8123,3.932,False,YSGNCDRQSVER,12 
+3,Ecl1_0277_R0096-01_S004436_D_H01_TMT18_01,3945,[UNIMOD:2016]-ESTK[UNIMOD:2016]SAAER,3,647,1586.8887,10.839,True,ESTKSAAER,9 diff --git a/tests/unit_tests/test_msfragger.py b/tests/unit_tests/test_msfragger.py index cc6ce14..6b869c8 100644 --- a/tests/unit_tests/test_msfragger.py +++ b/tests/unit_tests/test_msfragger.py @@ -24,3 +24,13 @@ def test_read_result(self): self.assertTrue("REVERSE" in df.columns) self.assertTrue("SEQUENCE" in df.columns) self.assertTrue("PEPTIDE_LENGTH" in df.columns) + + def test_read_msfragger(self): + """Test function for reading MSFragger results and transforming to Prosit format.""" + msfragger_output_path = Path(__file__).parent / "data" / "psm_tmt.pepXML" + expected_msfragger_internal_path = Path(__file__).parent / "data" / "psm_tmt_internal.csv" + + internal_search_results_df = MSFragger.read_result(msfragger_output_path, tmt_labeled="tmtpro") + expected_df = pd.read_csv(expected_msfragger_internal_path, index_col=0) + + pd.testing.assert_frame_equal(internal_search_results_df, expected_df)