updated custom_modification feature

wilhelm-lab · Jul 18, 2024 · f44b95a · f44b95a
1 parent da4ab8b
commit f44b95a
Show file tree

Hide file tree

Showing 5 changed files with 95 additions and 83 deletions.
diff --git a/spectrum_io/search_result/mascot.py b/spectrum_io/search_result/mascot.py
@@ -6,7 +6,7 @@
 
 import pandas as pd
 import spectrum_fundamentals.constants as c
-from spectrum_fundamentals.mod_string import internal_without_mods
+from spectrum_fundamentals.mod_string import internal_without_mods, custom_regex_escape
 
 from .search_results import SearchResults, filter_valid_prosit_sequences
 
@@ -16,12 +16,14 @@
 class Mascot(SearchResults):
     """Handle search results from Mascot."""
 
-    def read_result(self, tmt_labeled: str, custom_stat_mods: Dict[str, Tuple[str, float]] = None, 
-                    custom_var_mods: Dict[str, Tuple[str, float]] = None) -> pd.DataFrame:
+    def read_result(self, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None, 
+                    var_mods: Optional[Dict[str, str]] = None) -> pd.DataFrame:
         """
         Function to read a mascot msf file and perform some basic formatting.
 
         :param tmt_labeled: tmt label as str
+        :param var_mods: dict mapping each custom variable modification identifier to its respective internal equivalent
+        :param stat_mods: dict mapping each custom static modification identifier to its respective internal equivalent
         :return: pd.DataFrame with the formatted data
         """
         logger.info("Reading mascot msf file")
@@ -76,59 +78,35 @@ def read_result(self, tmt_labeled: str, custom_stat_mods: Dict[str, Tuple[str, f
             ["SCAN_NUMBER", "PRECURSOR_CHARGE", "SCORE", "RAW_FILE", "SEQUENCE", "REVERSE"],
             as_index=False,
         ).agg({"MODIFICATIONS": "|".join})
-        mod_masses_reverse = {round(float(v), 3): k for k, v in c.MOD_MASSES.items()}
-
-
-        def custom_regex_escape(key: str) -> str:
-            """
-            Subfunction to escape only normal brackets in the modstring.
-
-            :param key: The match to escape
-            :return: match with escaped special characters
-            """
-            for k, v in {"[": r"\[", "]": r"\]", "(": r"\(", ")": r"\)"}.items():
-                key = key.replace(k, v)
-            return key
+        MOD_MASSES = c.update_mod_masses()
+        mod_masses_reverse = {round(float(v), 3): k for k, v in MOD_MASSES.items()}
 
-        def find_replacement(match: re.Match, seq: str) -> str:
+        def find_replacement(match: re.Match, sequence: str) -> str:
             """
-        Subfunction to find the corresponding substitution for a match.
+            Subfunction to find the corresponding substitution for a match.
 
-        :param match: an re.Match object found by re.sub
-        :return: substitution string for the given match
-        """
+            :param match: an re.Match object found by re.sub
+            :return: substitution string for the given match
+            """
             key = match.string[match.start() : match.end()]
-            if custom_stat_mods is not None and key in custom_stat_mods.keys():
-                assert isinstance(custom_mods[key][0], str), f"Provided illegal custom mod format, expected dict-values are (str, float), 
-                recieved {(type(custom_mods[key][0]).__name__), (type(custom_mods[key][1]).__name__)}."
-                end = match.span()[1]
-                if end < len(seq) and (seq[end] == "[" or seq[end]== "("):
-                    return key
-                if not custom_mods[key][0].startswith(key):
-                        return key + custom_mods[key][0]
-                return custom_mods[key][0]
-            elif custom_var_mods is not None and key in custom_var_mods.keys():
-                assert isinstance(custom_mods[key][0], str), f"Provided illegal custom mod format, expected dict-values are (str, float), 
-                recieved {(type(custom_mods[key][0]).__name__), (type(custom_mods[key][1]).__name__)}."
-                return custom_mods[key][0]
-            return custom_mods[key]
+            return mods[key]
 
 
-        custom_mods = {}
+        mods = {}
 
-        if custom_var_mods is not None:
-            custom_mods.update(custom_var_mods)
-        if custom_stat_mods is not None:
-            custom_mods.update(custom_stat_mods)
+        if var_mods is not None:
+            mods.update(var_mods)
+        if stat_mods is not None:
+            mods.update(stat_mods)
 
-        if custom_mods:
-            regex = re.compile("|".join(map(custom_regex_escape, custom_mods.keys())))
+        if mods:
+            regex = re.compile("|".join(map(custom_regex_escape, mods.keys())))
 
         sequences = []
         for _, row in df.iterrows():
             modifications = row["MODIFICATIONS"].split("|")
             sequence = row["SEQUENCE"]
-            if custom_mods:
+            if mods:
                 sequence = regex.sub(lambda match: find_replacement(match, sequence), sequence) 
 
             if len(modifications) == 0:

diff --git a/spectrum_io/search_result/maxquant.py b/spectrum_io/search_result/maxquant.py
@@ -1,10 +1,11 @@
 import logging
 from pathlib import Path
-from typing import Union, Dict, Tuple
+from typing import Optional, Union, Dict, Tuple
 
 import pandas as pd
 import spectrum_fundamentals.constants as c
 from spectrum_fundamentals.mod_string import internal_without_mods, maxquant_to_internal
+from spectrum_fundamentals.constants import MAXQUANT_VAR_MODS
 
 from .search_results import SearchResults, filter_valid_prosit_sequences
 
@@ -41,12 +42,14 @@ def add_tmt_mod(mass: float, seq: str, unimod_tag: str) -> float:
         mass += num_of_tmt * c.MOD_MASSES[f"{unimod_tag}"]
         return mass
 
-    def read_result(self, tmt_labeled: str, custom_stat_mods: Dict[str, Tuple[str, float]] = None, 
-                    custom_var_mods: Dict[str, Tuple[str, float]] = None) -> pd.DataFrame:
+    def read_result(self, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None, 
+                    var_mods: Optional[Dict[str, str]] = None) -> pd.DataFrame:
         """
         Function to read a msms txt and perform some basic formatting.
 
         :param tmt_labeled: tmt label as str
+        :param var_mods: dict mapping each custom variable modification identifier to its respective internal equivalent
+        :param stat_mods: dict mapping each custom static modification identifier to its respective internal equivalent
         :return: pd.DataFrame with the formatted data
         """
         logger.info("Reading msms.txt file")
@@ -73,54 +76,59 @@ def read_result(self, tmt_labeled: str, custom_stat_mods: Dict[str, Tuple[str, f
         df.columns = df.columns.str.upper()
         df.columns = df.columns.str.replace(" ", "_")
 
-        df = MaxQuant.update_columns_for_prosit(df, tmt_labeled, custom_stat_mods, custom_var_mods)
+        df = MaxQuant.update_columns_for_prosit(df, tmt_labeled, stat_mods, var_mods)
         return filter_valid_prosit_sequences(df)
 
     @staticmethod
-    def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str, custom_stat_mods: Dict[str, Tuple[str, float]] = None, 
-                                  custom_var_mods: Dict[str, Tuple[str, float]] = None) -> pd.DataFrame:
+    def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None, 
+                                  var_mods: Optional[Dict[str, str]] = None) -> pd.DataFrame:
         """
         Update columns of df to work with Prosit.
 
         :param df: df to modify
         :param tmt_labeled: True if tmt labeled
+        :param var_mods: dict mapping each custom variable modification identifier to its respective internal equivalent
+        :param stat_mods: dict mapping each custom static modification identifier to its respective internal equivalent
         :return: modified df as pd.DataFrame
         """
         df.rename(columns={"CHARGE": "PRECURSOR_CHARGE"}, inplace=True)
 
+        mods = {**(MAXQUANT_VAR_MODS), **(stat_mods or {}), **(var_mods or {})}
+
         df["REVERSE"].fillna(False, inplace=True)
         df["REVERSE"].replace("+", True, inplace=True)
         logger.info("Converting MaxQuant peptide sequence to internal format")
         if tmt_labeled != "":
             unimod_tag = c.TMT_MODS[tmt_labeled]
             logger.info("Adding TMT fixed modifications")
             df["MODIFIED_SEQUENCE"] = maxquant_to_internal(
-                df["MODIFIED_SEQUENCE"].to_numpy(),
-                fixed_mods={"C": "C[UNIMOD:4]", "^_": f"_{unimod_tag}-", "K": f"K{unimod_tag}"}, 
-                stat_custom_mods=custom_stat_mods, var_custom_mods=custom_var_mods)
+                df["MODIFIED_SEQUENCE"].to_numpy(), mods=
+                {**{"C": "C[UNIMOD:4]", "^_": f"_{unimod_tag}-", "K": f"K{unimod_tag}"}, **mods}
+                )
             df["MASS"] = df.apply(lambda x: MaxQuant.add_tmt_mod(x.MASS, x.MODIFIED_SEQUENCE, unimod_tag), axis=1)
             if "msa" in tmt_labeled:
                 logger.info("Replacing phospho by dehydration for Phospho-MSA")
                 df["MODIFIED_SEQUENCE_MSA"] = df["MODIFIED_SEQUENCE"].str.replace(
                     "[UNIMOD:21]", "[UNIMOD:23]", regex=False
                 )
+                fixed_mods = {"C": "C[UNIMOD:4]"}
         elif "LABELING_STATE" in df.columns:
             logger.info("Adding SILAC fixed modifications")
+
             df.loc[df["LABELING_STATE"] == 1, "MODIFIED_SEQUENCE"] = maxquant_to_internal(
-                df[df["LABELING_STATE"] == 1]["MODIFIED_SEQUENCE"].to_numpy(),
-                fixed_mods={"C": "C[UNIMOD:4]", "K": "K[UNIMOD:259]", "R": "R[UNIMOD:267]"}, stat_custom_mods=custom_stat_mods, 
-                var_custom_mods=custom_var_mods
+                df[df["LABELING_STATE"] == 1]["MODIFIED_SEQUENCE"].to_numpy(), mods = 
+                {**{"C": "C[UNIMOD:4]", "K": "K[UNIMOD:259]", "R": "R[UNIMOD:267]"}, **mods}
             )
             df.loc[df["LABELING_STATE"] != 1, "MODIFIED_SEQUENCE"] = maxquant_to_internal(
-                df[df["LABELING_STATE"] != 1]["MODIFIED_SEQUENCE"].to_numpy(), stat_custom_mods=custom_stat_mods, 
-                var_custom_mods=custom_var_mods
+                df[df["LABELING_STATE"] != 1]["MODIFIED_SEQUENCE"].to_numpy(), mods=
+                {**{"C": "C[UNIMOD:4]"}, **mods}
             )
             df["MASS"] = df.apply(lambda x: MaxQuant.add_tmt_mod(x.MASS, x.MODIFIED_SEQUENCE, "[UNIMOD:259]"), axis=1)
             df["MASS"] = df.apply(lambda x: MaxQuant.add_tmt_mod(x.MASS, x.MODIFIED_SEQUENCE, "[UNIMOD:267]"), axis=1)
             df.drop(columns=["LABELING_STATE"], inplace=True)
         else:
-            df["MODIFIED_SEQUENCE"] = maxquant_to_internal(df["MODIFIED_SEQUENCE"].to_numpy(), stat_custom_mods=custom_stat_mods, 
-                                                           var_custom_mods=custom_var_mods)
+            df["MODIFIED_SEQUENCE"] = maxquant_to_internal(df["MODIFIED_SEQUENCE"].to_numpy(), mods=
+                                                           {**{"C": "C[UNIMOD:4]"}, **mods})
         df["SEQUENCE"] = internal_without_mods(df["MODIFIED_SEQUENCE"])
         df["PEPTIDE_LENGTH"] = df["SEQUENCE"].apply(lambda x: len(x))
         df["PROTEINS"].fillna("UNKNOWN", inplace=True)

diff --git a/spectrum_io/search_result/msfragger.py b/spectrum_io/search_result/msfragger.py
@@ -1,11 +1,12 @@
 import logging
 from pathlib import Path
-from typing import Union, Dict, Tuple
+from typing import Optional, Union, Dict, Tuple
 
 import pandas as pd
 import spectrum_fundamentals.constants as c
 from pyteomics import pepxml
-from spectrum_fundamentals.mod_string import internal_without_mods, msfragger_to_internal
+from spectrum_fundamentals.mod_string import internal_without_mods, msfragger_or_custom_to_internal
+from spectrum_fundamentals.constants import MSFRAGGER_VAR_MODS
 from tqdm import tqdm
 
 from .search_results import SearchResults, filter_valid_prosit_sequences
@@ -16,13 +17,14 @@
 class MSFragger(SearchResults):
     """Handle search results from MSFragger."""
 
-    def read_result(self, tmt_labeled: str, custom_stat_mods: Dict[str, Tuple[str, float]] = None, 
-                    custom_var_mods: Dict[str, Tuple[str, float]] = None) -> pd.DataFrame:
+    def read_result(self, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None, 
+                    var_mods: Optional[Dict[str, str]] = None) -> pd.DataFrame:
         """
         Function to read a msms txt and perform some basic formatting.
 
         :param tmt_labeled: tmt label as str
-        :raises FileNotFoundError: in case the given path is neither a file, nor a directory.
+        :param var_mods: dict with custom variable identifier and respective internal equivalent
+        :param stat_mods: dict with custom static identifier and respective internal equivalent:raises FileNotFoundError: in case the given path is neither a file, nor a directory.
         :return: pd.DataFrame with the formatted data
         """
         if self.path.is_file():
@@ -38,17 +40,19 @@ def read_result(self, tmt_labeled: str, custom_stat_mods: Dict[str, Tuple[str, f
 
         df = pd.concat(ms_frag_results)
 
-        df = update_columns_for_prosit(df, tmt_labeled, custom_stat_mods=custom_stat_mods, custom_var_mods=custom_var_mods)
+        df = update_columns_for_prosit(df, tmt_labeled, stat_mods=stat_mods, var_mods=var_mods)
         return filter_valid_prosit_sequences(df)
 
 
-def update_columns_for_prosit(df, tmt_labeled: str, custom_stat_mods: Dict[str, Tuple[str, float]] = None, 
-                                  custom_var_mods: Dict[str, Tuple[str, float]] = None) -> pd.DataFrame:
+def update_columns_for_prosit(df, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None, 
+                              var_mods: Optional[Dict[str, str]] = None) -> pd.DataFrame:
     """
     Update columns of df to work with Prosit.
 
     :param df: df to modify
     :param tmt_labeled: True if tmt labeled
+    :param var_mods: dict mapping each custom variable modification identifier to its respective internal equivalent
+    :param stat_mods: dict mapping each custom static modification identifier to its respective internal equivalent
     :return: modified df as pd.DataFrame
     """
     df["PROTEINS"] = df["protein"]
@@ -58,16 +62,20 @@ def update_columns_for_prosit(df, tmt_labeled: str, custom_stat_mods: Dict[str,
     df["MASS"] = df["precursor_neutral_mass"]
     df["PEPTIDE_LENGTH"] = df["peptide"].apply(lambda x: len(x))
 
+    mods = {**(MSFRAGGER_VAR_MODS), **(stat_mods or {}), **(var_mods or {})}
+
+
     if tmt_labeled != "":
         unimod_tag = c.TMT_MODS[tmt_labeled]
         logger.info("Adding TMT fixed modifications")
-        df["MODIFIED_SEQUENCE"] = msfragger_to_internal(
-            df["modified_peptide"].to_list(),
-            fixed_mods={"C": "C[UNIMOD:4]", r"n[\d+]": f"{unimod_tag}-", "K": f"K{unimod_tag}"}, stat_custom_mods=custom_stat_mods, 
-            var_custom_mods=custom_var_mods)
+        mods = {**{"C": "C[UNIMOD:4]", r"n[\d+]": f"{unimod_tag}-", "K": f"K{unimod_tag}"}, **mods}
+        df["MODIFIED_SEQUENCE"] = msfragger_or_custom_to_internal(
+            df["modified_peptide"].to_list(), mods=mods)
     else:
-        df["MODIFIED_SEQUENCE"] = msfragger_to_internal(df["modified_peptide"].to_list(), stat_custom_mods=custom_stat_mods, 
-                                                        var_custom_mods=custom_var_mods)
+        # By default, carbamidomethylation on cysteine ("C" -> "C[UNIMOD:4]") is included as a fixed modification.
+        # Because the merged mods are unpacked last, an entry for "C" in stat_mods or var_mods overrides this default.
+        mods = {**{"C": "C[UNIMOD:4]"}, **mods}
+        df["MODIFIED_SEQUENCE"] = msfragger_or_custom_to_internal(df["modified_peptide"].to_list(), mods=mods)
 
     df.rename(
         columns={

diff --git a/spectrum_io/search_result/sage.py b/spectrum_io/search_result/sage.py
@@ -1,9 +1,9 @@
 import logging
 from pathlib import Path
-from typing import Union, Dict, Tuple
+from typing import Optional, Union, Dict, Tuple
 
 import pandas as pd
-import spectrum_fundamentals.constants as c
+from spectrum_fundamentals.constants import MOD_MASSES_SAGE
 from spectrum_fundamentals.mod_string import sage_to_internal
 
 from .search_results import SearchResults, filter_valid_prosit_sequences
@@ -14,11 +14,14 @@
 class Sage(SearchResults):
     """Handle search results from Sage."""
 
-    def read_result(self, tmt_labeled: str = "", custom_mods: Dict[str, str] = None) -> pd.DataFrame:
+    def read_result(self, tmt_labeled: str = "", stat_mods: Optional[Dict[str, str]] = None, 
+                    var_mods: Optional[Dict[str, str]] = None) -> pd.DataFrame:
         """
         Function to read a msms tsv and perform some basic formatting.
 
         :param tmt_labeled: tmt label as str
+        :param var_mods: Variable modifications with custom identifiers and their respective internal equivalents
+        :param stat_mods: Static modifications with custom identifiers and their respective internal equivalents
         :return: pd.DataFrame with the formatted data
         """
         logger.info(f"Reading {self.path}")
@@ -33,16 +36,19 @@ def read_result(self, tmt_labeled: str = "", custom_mods: Dict[str, str] = None)
         df.columns = df.columns.str.upper()
         df.columns = df.columns.str.replace(" ", "_")
 
-        df = Sage.update_columns_for_prosit(df, tmt_labeled, custom_mods)
+        df = Sage.update_columns_for_prosit(df, tmt_labeled, stat_mods=stat_mods, var_mods=var_mods)
         return filter_valid_prosit_sequences(df)
 
     @staticmethod
-    def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str, custom_stat_mods: Dict[str, Tuple[str, float]] = None, custom_var_mods: Dict[str, Tuple[str, float]] = None) -> pd.DataFrame:
+    def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None, 
+                                  var_mods: Optional[Dict[str, str]] = None) -> pd.DataFrame:
         """
         Update columns of df to work with Prosit.
 
         :param df: df to modify
         :param tmt_labeled: True if tmt labeled, ignored
+        :param var_mods: Variable modifications with custom identifiers and their respective internal equivalents
+        :param stat_mods: Static modifications with custom identifiers and their respective internal equivalents
         :return: modified df as pd.DataFrame
         """
         df = df.rename(
@@ -56,6 +62,7 @@ def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str, custom_stat_mo
                 "LABEL": "REVERSE",
             }
         )
+        mods = {**(MOD_MASSES_SAGE), **(stat_mods or {}), **(var_mods or {})}
 
         # removing .mzML
         df["RAW_FILE"] = df["RAW_FILE"].str.replace(r"\.mz[M|m][l|L]", "", regex=True)
@@ -68,7 +75,7 @@ def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str, custom_stat_mo
         # length of the peptide
         df["PEPTIDE_LENGTH"] = df["SEQUENCE"].str.len()
         # converting sage to unimod
-        df["MODIFIED_SEQUENCE"] = sage_to_internal(df["MODIFIED_SEQUENCE"], stat_custom_mods=custom_stat_mods, var_custom_mods=custom_var_mods)
+        df["MODIFIED_SEQUENCE"] = sage_to_internal(df["MODIFIED_SEQUENCE"], mods=mods)
         df["PROTEINS"].fillna("UNKNOWN", inplace=True)
 
         return df