tests passed spectrum_io

wilhelm-lab · Jul 31, 2024 · 9143d1d · 9143d1d
1 parent 1368743
commit 9143d1d
Show file tree

Hide file tree

Showing 13 changed files with 334 additions and 84 deletions.
diff --git a/spectrum_io/search_result/mascot.py b/spectrum_io/search_result/mascot.py
@@ -2,11 +2,11 @@
 import re
 import sqlite3
 from pathlib import Path
-from typing import Optional, Union, Dict, Tuple
+from typing import Dict, Optional, Tuple, Union
 
 import pandas as pd
 import spectrum_fundamentals.constants as c
-from spectrum_fundamentals.mod_string import internal_without_mods, custom_regex_escape
+from spectrum_fundamentals.mod_string import custom_regex_escape, internal_without_mods
 
 from .search_results import SearchResults, filter_valid_prosit_sequences
 
@@ -16,14 +16,16 @@
 class Mascot(SearchResults):
     """Handle search results from Mascot."""
 
-    def read_result(self, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None, 
-                    var_mods: Optional[Dict[str, str]] = None) -> pd.DataFrame:
+    def read_result(
+        self,
+        tmt_labeled: str,
+        custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
+    ) -> pd.DataFrame:
         """
         Function to read a mascot msf file and perform some basic formatting.
 
         :param tmt_labeled: tmt label as str
-        :param var_mods: dict with custom variable identifier and respecitve internal equivalent 
-        :param stat_mods: dict with custom static identifier and respecitve internal equivalent
+        :param custom_mods: dict with custom variable and static identifier and respective internal equivalent and mass
         :return: pd.DataFrame with the formatted data
         """
         logger.info("Reading mascot msf file")
@@ -78,10 +80,10 @@ def read_result(self, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = No
             ["SCAN_NUMBER", "PRECURSOR_CHARGE", "SCORE", "RAW_FILE", "SEQUENCE", "REVERSE"],
             as_index=False,
         ).agg({"MODIFICATIONS": "|".join})
-        MOD_MASSES = c.update_mod_masses()
-        mod_masses_reverse = {round(float(v), 3): k for k, v in MOD_MASSES.items()}
-        
-        def find_replacement(match: re.Match, sequence: str) -> str:
+        mod_masses = c.update_mod_masses()
+        mod_masses_reverse = {round(float(v), 3): k for k, v in mod_masses.items()}
+
+        def find_replacement(match: re.Match) -> str:
             """
             Subfunction to find the corresponding substitution for a match.
 
@@ -90,7 +92,13 @@ def find_replacement(match: re.Match, sequence: str) -> str:
             """
             key = match.string[match.start() : match.end()]
             return mods[key]
-
+
+        stat_mods: Dict[str, str] = {}
+        var_mods: Dict[str, str] = {}
+
+        if custom_mods is not None:
+            stat_mods = {key: value[0] for key, value in (custom_mods.get("stat_mods") or {}).items()}
+            var_mods = {key: value[0] for key, value in (custom_mods.get("var_mods") or {}).items()}
 
         mods = {}
 
@@ -107,11 +115,11 @@ def find_replacement(match: re.Match, sequence: str) -> str:
             modifications = row["MODIFICATIONS"].split("|")
             sequence = row["SEQUENCE"]
             if mods:
-                sequence = regex.sub(lambda match: find_replacement(match, sequence), sequence) 
+                sequence = regex.sub(lambda match: find_replacement(match), sequence)
 
             if len(modifications) == 0:
                 sequences.append(sequence)
-            else:                 
+            else:
                 skip = 0
                 for mod in modifications:
                     pos, mass = mod.split("$")

diff --git a/spectrum_io/search_result/maxquant.py b/spectrum_io/search_result/maxquant.py
@@ -1,11 +1,10 @@
 import logging
 from pathlib import Path
-from typing import Optional, Union, Dict, Tuple
+from typing import Dict, Optional, Tuple, Union
 
 import pandas as pd
 import spectrum_fundamentals.constants as c
 from spectrum_fundamentals.mod_string import internal_without_mods, maxquant_to_internal
-from spectrum_fundamentals.constants import MAXQUANT_VAR_MODS
 
 from .search_results import SearchResults, filter_valid_prosit_sequences
 
@@ -42,14 +41,16 @@ def add_tmt_mod(mass: float, seq: str, unimod_tag: str) -> float:
         mass += num_of_tmt * c.MOD_MASSES[f"{unimod_tag}"]
         return mass
 
-    def read_result(self, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None, 
-                    var_mods: Optional[Dict[str, str]] = None) -> pd.DataFrame:
+    def read_result(
+        self,
+        tmt_labeled: str,
+        custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
+    ) -> pd.DataFrame:
         """
         Function to read a msms txt and perform some basic formatting.
 
         :param tmt_labeled: tmt label as str
-        :param var_mods: dict with custom variable identifier and respecitve internal equivalent 
-        :param stat_mods: dict with custom static identifier and respecitve internal equivalent
+        :param custom_mods: dict with custom variable and static identifier and respective internal equivalent and mass
         :return: pd.DataFrame with the formatted data
         """
         logger.info("Reading msms.txt file")
@@ -76,24 +77,35 @@ def read_result(self, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = No
         df.columns = df.columns.str.upper()
         df.columns = df.columns.str.replace(" ", "_")
 
+        stat_mods: Dict[str, str] = {}
+        var_mods: Dict[str, str] = {}
+
+        if custom_mods is not None:
+            stat_mods = {key: value[0] for key, value in (custom_mods.get("stat_mods") or {}).items()}
+            var_mods = {key: value[0] for key, value in (custom_mods.get("var_mods") or {}).items()}
+
         df = MaxQuant.update_columns_for_prosit(df, tmt_labeled, stat_mods, var_mods)
         return filter_valid_prosit_sequences(df)
 
     @staticmethod
-    def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None, 
-                                  var_mods: Optional[Dict[str, str]] = None) -> pd.DataFrame:
+    def update_columns_for_prosit(
+        df: pd.DataFrame,
+        tmt_labeled: str,
+        stat_mods: Optional[Dict[str, str]] = None,
+        var_mods: Optional[Dict[str, str]] = None,
+    ) -> pd.DataFrame:
         """
         Update columns of df to work with Prosit.
 
         :param df: df to modify
         :param tmt_labeled: True if tmt labeled
-        :param var_mods: dict with custom variable identifier and respecitve internal equivalent 
+        :param var_mods: dict with custom variable identifier and respective internal equivalent
         :param stat_mods: dict with custom static identifier and respecitve internal equivalent
         :return: modified df as pd.DataFrame
         """
         df.rename(columns={"CHARGE": "PRECURSOR_CHARGE"}, inplace=True)
 
-        mods = {**(MAXQUANT_VAR_MODS), **(stat_mods or {}), **(var_mods or {})}
+        mods = {**(c.MAXQUANT_VAR_MODS), **(stat_mods or {}), **(var_mods or {})}
 
         df["REVERSE"].fillna(False, inplace=True)
         df["REVERSE"].replace("+", True, inplace=True)
@@ -102,33 +114,32 @@ def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str, stat_mods: Opt
             unimod_tag = c.TMT_MODS[tmt_labeled]
             logger.info("Adding TMT fixed modifications")
             df["MODIFIED_SEQUENCE"] = maxquant_to_internal(
-                df["MODIFIED_SEQUENCE"].to_numpy(), mods=
-                {**{"C": "C[UNIMOD:4]", "^_": f"_{unimod_tag}-", "K": f"K{unimod_tag}"}, **mods}
-                )
+                df["MODIFIED_SEQUENCE"].to_numpy(),
+                mods={**{"C": "C[UNIMOD:4]", "^_": f"_{unimod_tag}-", "K": f"K{unimod_tag}"}, **mods},
+            )
             df["MASS"] = df.apply(lambda x: MaxQuant.add_tmt_mod(x.MASS, x.MODIFIED_SEQUENCE, unimod_tag), axis=1)
             if "msa" in tmt_labeled:
                 logger.info("Replacing phospho by dehydration for Phospho-MSA")
                 df["MODIFIED_SEQUENCE_MSA"] = df["MODIFIED_SEQUENCE"].str.replace(
                     "[UNIMOD:21]", "[UNIMOD:23]", regex=False
                 )
-                fixed_mods = {"C": "C[UNIMOD:4]"}
         elif "LABELING_STATE" in df.columns:
             logger.info("Adding SILAC fixed modifications")
-    
+
             df.loc[df["LABELING_STATE"] == 1, "MODIFIED_SEQUENCE"] = maxquant_to_internal(
-                df[df["LABELING_STATE"] == 1]["MODIFIED_SEQUENCE"].to_numpy(), mods = 
-                {**{"C": "C[UNIMOD:4]", "K": "K[UNIMOD:259]", "R": "R[UNIMOD:267]"}, **mods}
+                df[df["LABELING_STATE"] == 1]["MODIFIED_SEQUENCE"].to_numpy(),
+                mods={**{"C": "C[UNIMOD:4]", "K": "K[UNIMOD:259]", "R": "R[UNIMOD:267]"}, **mods},
             )
             df.loc[df["LABELING_STATE"] != 1, "MODIFIED_SEQUENCE"] = maxquant_to_internal(
-                df[df["LABELING_STATE"] != 1]["MODIFIED_SEQUENCE"].to_numpy(), mods=
-                {**{"C": "C[UNIMOD:4]"}, **mods}
+                df[df["LABELING_STATE"] != 1]["MODIFIED_SEQUENCE"].to_numpy(), mods={**{"C": "C[UNIMOD:4]"}, **mods}
             )
             df["MASS"] = df.apply(lambda x: MaxQuant.add_tmt_mod(x.MASS, x.MODIFIED_SEQUENCE, "[UNIMOD:259]"), axis=1)
             df["MASS"] = df.apply(lambda x: MaxQuant.add_tmt_mod(x.MASS, x.MODIFIED_SEQUENCE, "[UNIMOD:267]"), axis=1)
             df.drop(columns=["LABELING_STATE"], inplace=True)
         else:
-            df["MODIFIED_SEQUENCE"] = maxquant_to_internal(df["MODIFIED_SEQUENCE"].to_numpy(), mods=
-                                                           {**{"C": "C[UNIMOD:4]"}, **mods})
+            df["MODIFIED_SEQUENCE"] = maxquant_to_internal(
+                df["MODIFIED_SEQUENCE"].to_numpy(), mods={**{"C": "C[UNIMOD:4]"}, **mods}
+            )
         df["SEQUENCE"] = internal_without_mods(df["MODIFIED_SEQUENCE"])
         df["PEPTIDE_LENGTH"] = df["SEQUENCE"].apply(lambda x: len(x))
         df["PROTEINS"].fillna("UNKNOWN", inplace=True)

diff --git a/spectrum_io/search_result/msfragger.py b/spectrum_io/search_result/msfragger.py
@@ -1,12 +1,12 @@
 import logging
 from pathlib import Path
-from typing import Optional, Union, Dict, Tuple
+from typing import Dict, Optional, Tuple, Union
 
 import pandas as pd
 import spectrum_fundamentals.constants as c
 from pyteomics import pepxml
-from spectrum_fundamentals.mod_string import internal_without_mods, msfragger_to_internal
 from spectrum_fundamentals.constants import MSFRAGGER_VAR_MODS
+from spectrum_fundamentals.mod_string import internal_without_mods, msfragger_to_internal
 from tqdm import tqdm
 
 from .search_results import SearchResults, filter_valid_prosit_sequences
@@ -17,14 +17,17 @@
 class MSFragger(SearchResults):
     """Handle search results from MSFragger."""
 
-    def read_result(self, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None, 
-                    var_mods: Optional[Dict[str, str]] = None) -> pd.DataFrame:
+    def read_result(
+        self,
+        tmt_labeled: str,
+        custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
+    ) -> pd.DataFrame:
         """
         Function to read a msms txt and perform some basic formatting.
 
         :param tmt_labeled: tmt label as str
-        :param var_mods: dict with custom variable identifier and respecitve internal equivalent 
-        :param stat_mods: dict with custom static identifier and respecitve internal equivalent:raises FileNotFoundError: in case the given path is neither a file, nor a directory.
+        :param custom_mods: dict with custom variable and static identifier and respective internal equivalent and mass
+        :raises FileNotFoundError: in case the given path is neither a file, nor a directory.
         :return: pd.DataFrame with the formatted data
         """
         if self.path.is_file():
@@ -40,18 +43,26 @@ def read_result(self, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = No
 
         df = pd.concat(ms_frag_results)
 
+        stat_mods: Dict[str, str] = {}
+        var_mods: Dict[str, str] = {}
+
+        if custom_mods is not None:
+            stat_mods = {key: value[0] for key, value in (custom_mods.get("stat_mods") or {}).items()}
+            var_mods = {key: value[0] for key, value in (custom_mods.get("var_mods") or {}).items()}
+
         df = update_columns_for_prosit(df, tmt_labeled, stat_mods=stat_mods, var_mods=var_mods)
         return filter_valid_prosit_sequences(df)
 
 
-def update_columns_for_prosit(df, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None, 
-                              var_mods: Optional[Dict[str, str]] = None) -> pd.DataFrame:
+def update_columns_for_prosit(
+    df, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None, var_mods: Optional[Dict[str, str]] = None
+) -> pd.DataFrame:
     """
     Update columns of df to work with Prosit.
 
     :param df: df to modify
     :param tmt_labeled: True if tmt labeled
-    :param var_mods: dict with custom variable identifier and respecitve internal equivalent 
+    :param var_mods: dict with custom variable identifier and respective internal equivalent
     :param stat_mods: dict with custom static identifier and respecitve internal equivalent
     :return: modified df as pd.DataFrame
     """
@@ -64,15 +75,13 @@ def update_columns_for_prosit(df, tmt_labeled: str, stat_mods: Optional[Dict[str
 
     mods = {**(MSFRAGGER_VAR_MODS), **(stat_mods or {}), **(var_mods or {})}
 
-
     if tmt_labeled != "":
         unimod_tag = c.TMT_MODS[tmt_labeled]
         logger.info("Adding TMT fixed modifications")
         mods = {**{"C": "C[UNIMOD:4]", r"n[\d+]": f"{unimod_tag}-", "K": f"K{unimod_tag}"}, **mods}
-        df["MODIFIED_SEQUENCE"] = msfragger_to_internal(
-            df["modified_peptide"].to_list(), mods=mods)
+        df["MODIFIED_SEQUENCE"] = msfragger_to_internal(df["modified_peptide"].to_list(), mods=mods)
     else:
-        #By default, i.e. if nothing is supplied to fixed_mods, carbamidomethylation on cystein will be included
+        # By default, i.e. if nothing is supplied to fixed_mods, carbamidomethylation on cysteine will be included
         # in the fixed modifications. If you want to have no fixed modifictions at all, supply fixed_mods={}
         mods = {**{"C": "C[UNIMOD:4]"}, **mods}
         df["MODIFIED_SEQUENCE"] = msfragger_to_internal(df["modified_peptide"].to_list(), mods=mods)

diff --git a/spectrum_io/search_result/sage.py b/spectrum_io/search_result/sage.py
@@ -1,6 +1,6 @@
 import logging
 from pathlib import Path
-from typing import Optional, Union, Dict, Tuple
+from typing import Dict, Optional, Tuple, Union
 
 import pandas as pd
 from spectrum_fundamentals.constants import MOD_MASSES_SAGE
@@ -14,14 +14,16 @@
 class Sage(SearchResults):
     """Handle search results from Sage."""
 
-    def read_result(self, tmt_labeled: str = "", stat_mods: Optional[Dict[str, str]] = None, 
-                    var_mods: Optional[Dict[str, str]] = None) -> pd.DataFrame:
+    def read_result(
+        self,
+        tmt_labeled: str = "",
+        custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
+    ) -> pd.DataFrame:
         """
         Function to read a msms tsv and perform some basic formatting.
 
         :param tmt_labeled: tmt label as str
-        :param var_mods: Variable modifications with custom identifiers and their respective internal equivalents
-        :param stat_mods: Static modifications with custom identifiers and their respective internal equivalents
+        :param custom_mods: dict with custom variable and static identifier and respective internal equivalent and mass
         :return: pd.DataFrame with the formatted data
         """
         logger.info(f"Reading {self.path}")
@@ -36,12 +38,23 @@ def read_result(self, tmt_labeled: str = "", stat_mods: Optional[Dict[str, str]]
         df.columns = df.columns.str.upper()
         df.columns = df.columns.str.replace(" ", "_")
 
+        stat_mods: Dict[str, str] = {}
+        var_mods: Dict[str, str] = {}
+
+        if custom_mods is not None:
+            stat_mods = {key: value[0] for key, value in (custom_mods.get("stat_mods") or {}).items()}
+            var_mods = {key: value[0] for key, value in (custom_mods.get("var_mods") or {}).items()}
+
         df = Sage.update_columns_for_prosit(df, tmt_labeled, stat_mods=stat_mods, var_mods=var_mods)
         return filter_valid_prosit_sequences(df)
 
     @staticmethod
-    def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None, 
-                                  var_mods: Optional[Dict[str, str]] = None) -> pd.DataFrame:
+    def update_columns_for_prosit(
+        df: pd.DataFrame,
+        tmt_labeled: str,
+        stat_mods: Optional[Dict[str, str]] = None,
+        var_mods: Optional[Dict[str, str]] = None,
+    ) -> pd.DataFrame:
         """
         Update columns of df to work with Prosit.