From 07f8d287cba717fbe36c3930098e9f66c1d0d4f8 Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Tue, 7 Nov 2023 11:26:56 +0100 Subject: [PATCH 1/3] moved msfragger_to_internal to fundamentals --- spectrum_io/search_result/msfragger.py | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/spectrum_io/search_result/msfragger.py b/spectrum_io/search_result/msfragger.py index a55ac15..e1926d6 100644 --- a/spectrum_io/search_result/msfragger.py +++ b/spectrum_io/search_result/msfragger.py @@ -5,7 +5,7 @@ import pandas as pd import spectrum_fundamentals.constants as c from pyteomics import pepxml -from spectrum_fundamentals.mod_string import internal_without_mods +from spectrum_fundamentals.mod_string import internal_without_mods, msfragger_to_internal from tqdm import tqdm from .search_results import SearchResults, filter_valid_prosit_sequences @@ -84,20 +84,3 @@ def update_columns_for_prosit(df, tmt_labeled: str) -> pd.DataFrame: "PEPTIDE_LENGTH", ] ] - - -def msfragger_to_internal(modstrings: pd.Series): - """ - Transform modstring from msfragger format to internal format. - - This function takes a modstrings column from a pandas dataframe and converts each - supported modification (M[147] and C[160]) to the internal representation that is - M[UNIMOD:35] and C[UNIMOD:4], respectively. Since C is considered a fixed modification, - every occurence of a C is transformed to C[UNIMOD:4] as well. 
- - :param modstrings: pd.Series containing the msfragger modstrings - :return: pd.Series with internal modstrings - """ - modstrings = modstrings.str.replace("M[147]", "M[UNIMOD:35]", regex=False) - modstrings = modstrings.str.replace(r"C\[160\]|C", "C[UNIMOD:4]", regex=True) - return modstrings From c1f4b4ac1994f85310665f9711fe326c7311bc3f Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Thu, 9 Nov 2023 17:29:46 +0100 Subject: [PATCH 2/3] added support for tmt for msfragger --- spectrum_io/search_result/msfragger.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/spectrum_io/search_result/msfragger.py b/spectrum_io/search_result/msfragger.py index e1926d6..244485b 100644 --- a/spectrum_io/search_result/msfragger.py +++ b/spectrum_io/search_result/msfragger.py @@ -42,7 +42,7 @@ def read_result(path: Union[str, Path], tmt_labeled: str) -> pd.DataFrame: df = pd.concat(ms_frag_results) - df = update_columns_for_prosit(df, "") + df = update_columns_for_prosit(df, tmt_labeled) return filter_valid_prosit_sequences(df) @@ -58,7 +58,17 @@ def update_columns_for_prosit(df, tmt_labeled: str) -> pd.DataFrame: df["RAW_FILE"] = df["spectrum"].apply(lambda x: x.split(".")[0]) df["MASS"] = df["precursor_neutral_mass"] df["PEPTIDE_LENGTH"] = df["peptide"].apply(lambda x: len(x)) - df["MODIFIED_SEQUENCE"] = msfragger_to_internal(df["modified_peptide"]) + + if tmt_labeled != "": + unimod_tag = c.TMT_MODS[tmt_labeled] + logger.info("Adding TMT fixed modifications") + df["MODIFIED_SEQUENCE"] = msfragger_to_internal( + df["modified_peptide"].to_list(), + fixed_mods={"C": "C[UNIMOD:4]", r"n[\d+]": f"{unimod_tag}-", "K": f"K{unimod_tag}"}, + ) + else: + df["MODIFIED_SEQUENCE"] = msfragger_to_internal(df["modified_peptide"].to_list()) + df.rename( columns={ "assumed_charge": "PRECURSOR_CHARGE", From ab9cffac1bab692fca4e7d581b64592337fa5e06 Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Fri, 10 Nov 2023 18:09:18 +0100 Subject: [PATCH 3/3] added 
tmt test --- tests/unit_tests/data/psm_tmt.pepXML | 175 +++++++++++++++++++++ tests/unit_tests/data/psm_tmt_internal.csv | 5 + tests/unit_tests/test_msfragger.py | 10 ++ 3 files changed, 190 insertions(+) create mode 100644 tests/unit_tests/data/psm_tmt.pepXML create mode 100644 tests/unit_tests/data/psm_tmt_internal.csv diff --git a/tests/unit_tests/data/psm_tmt.pepXML b/tests/unit_tests/data/psm_tmt.pepXML new file mode 100644 index 0000000..b31bdf0 --- /dev/null +++ b/tests/unit_tests/data/psm_tmt.pepXML @@ -0,0 +1,175 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/unit_tests/data/psm_tmt_internal.csv b/tests/unit_tests/data/psm_tmt_internal.csv new file mode 100644 index 0000000..4e3f7be --- /dev/null +++ b/tests/unit_tests/data/psm_tmt_internal.csv @@ -0,0 +1,5 @@ +,RAW_FILE,SCAN_NUMBER,MODIFIED_SEQUENCE,PRECURSOR_CHARGE,SCAN_EVENT_NUMBER,MASS,SCORE,REVERSE,SEQUENCE,PEPTIDE_LENGTH +0,Ecl1_0277_R0096-01_S004436_D_H01_TMT18_01,2459,[UNIMOD:2016]-GQAVLAFQEQVGTGR,5,34,1863.023,8.221,True,GQAVLAFQEQVGTGR,15 +1,Ecl1_0277_R0096-01_S004436_D_H01_TMT18_01,2486,[UNIMOD:2016]-TEVPM[UNIMOD:35]GLSLRTTSAR,5,42,1937.0531,7.083,False,TEVPMGLSLRTTSAR,15 +2,Ecl1_0277_R0096-01_S004436_D_H01_TMT18_01,2980,[UNIMOD:2016]-YSGNC[UNIMOD:4]DRQSVER,3,193,1773.8123,3.932,False,YSGNCDRQSVER,12 +3,Ecl1_0277_R0096-01_S004436_D_H01_TMT18_01,3945,[UNIMOD:2016]-ESTK[UNIMOD:2016]SAAER,3,647,1586.8887,10.839,True,ESTKSAAER,9 diff --git a/tests/unit_tests/test_msfragger.py b/tests/unit_tests/test_msfragger.py index cc6ce14..6b869c8 100644 --- a/tests/unit_tests/test_msfragger.py +++ b/tests/unit_tests/test_msfragger.py @@ -24,3 +24,13 @@ def test_read_result(self): 
self.assertTrue("REVERSE" in df.columns) self.assertTrue("SEQUENCE" in df.columns) self.assertTrue("PEPTIDE_LENGTH" in df.columns) + + def test_read_msfragger(self): + """Test function for reading MSFragger results and transforming to Prosit format.""" + msfragger_output_path = Path(__file__).parent / "data" / "psm_tmt.pepXML" + expected_msfragger_internal_path = Path(__file__).parent / "data" / "psm_tmt_internal.csv" + + internal_search_results_df = MSFragger.read_result(msfragger_output_path, tmt_labeled="tmtpro") + expected_df = pd.read_csv(expected_msfragger_internal_path, index_col=0) + + pd.testing.assert_frame_equal(internal_search_results_df, expected_df)