fixed and cleaned up unit tests

wilhelm-lab · Aug 1, 2024 · ce752a3 · ce752a3
1 parent 9143d1d
commit ce752a3
Show file tree

Hide file tree

Showing 15 changed files with 420 additions and 370 deletions.
diff --git a/spectrum_io/search_result/maxquant.py b/spectrum_io/search_result/maxquant.py
@@ -4,9 +4,9 @@
 
 import pandas as pd
 import spectrum_fundamentals.constants as c
-from spectrum_fundamentals.mod_string import internal_without_mods, maxquant_to_internal
+from spectrum_fundamentals.mod_string import internal_without_mods
 
-from .search_results import SearchResults, filter_valid_prosit_sequences
+from .search_results import SearchResults, filter_valid_prosit_sequences, parse_mods
 
 logger = logging.getLogger(__name__)
 
@@ -43,108 +43,74 @@ def add_tmt_mod(mass: float, seq: str, unimod_tag: str) -> float:
 
     def read_result(
         self,
-        tmt_labeled: str,
-        custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
+        tmt_label: str = "",
+        custom_mods: Optional[Dict[str, int]] = None,
     ) -> pd.DataFrame:
         """
         Function to read a msms txt and perform some basic formatting.
 
-        :param tmt_labeled: tmt label as str
-        :param custom_mods: dict with custom variable and static identifier and respecitve internal equivalent and mass
+        :param tmt_label: optional tmt label as str
+        :param custom_mods: optional dictionary mapping MaxQuant-specific mod pattern to UNIMOD IDs.
+            If None, static carbamidomethylation of cysteine and variable oxidation of methionine
+            are mapped automatically. To avoid this, explicitly provide an empty dictionary.
         :return: pd.DataFrame with the formatted data
         """
-        logger.info("Reading msms.txt file")
-        df = pd.read_csv(
-            self.path / "msms.txt",
-            usecols=lambda x: x.upper()
-            in [
-                "RAW FILE",
-                "SCAN NUMBER",
-                "MODIFIED SEQUENCE",
-                "CHARGE",
-                "SCAN EVENT NUMBER",
-                "LABELING STATE",
-                "MASS",  # = Calculated Precursor mass; TODO get column with experimental Precursor mass instead
-                "SCORE",
-                "REVERSE",
-                "PROTEINS",
-            ],
-            sep="\t",
-        )
-        logger.info("Finished reading msms.txt file")
-
-        # Standardize column names
-        df.columns = df.columns.str.upper()
-        df.columns = df.columns.str.replace(" ", "_")
+        if custom_mods is None:
+            custom_mods = {
+                "C": 4,
+                "M(ox)": 35,
+                "M(Oxidation (M))": 35,
+            }
+        parsed_mods = parse_mods(custom_mods)
+        if tmt_label:
+            unimod_tag = c.TMT_MODS[tmt_label]
+            parsed_mods["K"] = f"K{unimod_tag}"
+            parsed_mods["^_"] = f"_{unimod_tag}-"
 
-        stat_mods: Dict[str, str] = {}
-        var_mods: Dict[str, str] = {}
+        logger.info("Reading msms.txt file")
+        self.results = pd.read_csv(self.path / "msms.txt", sep="\t")
 
-        if custom_mods is not None:
-            stat_mods = {key: value[0] for key, value in (custom_mods.get("stat_mods") or {}).items()}
-            var_mods = {key: value[0] for key, value in (custom_mods.get("var_mods") or {}).items()}
+        logger.info("Finished reading msms.txt file")
 
-        df = MaxQuant.update_columns_for_prosit(df, tmt_labeled, stat_mods, var_mods)
-        return filter_valid_prosit_sequences(df)
+        self.convert_to_internal(mods=parsed_mods)
+        return filter_valid_prosit_sequences(self.results)
 
-    @staticmethod
-    def update_columns_for_prosit(
-        df: pd.DataFrame,
-        tmt_labeled: str,
-        stat_mods: Optional[Dict[str, str]] = None,
-        var_mods: Optional[Dict[str, str]] = None,
-    ) -> pd.DataFrame:
+    def convert_to_internal(self, mods: Dict[str, str]) -> pd.DataFrame:
         """
-        Update columns of df to work with Prosit.
+        Convert all columns in the MaxQuant output to the internal format used by Oktoberfest.
 
-        :param df: df to modify
-        :param tmt_labeled: True if tmt labeled
-        :param var_mods: dict with custom variable identifier and respecitve internal equivalent
-        :param stat_mods: dict with custom static identifier and respecitve internal equivalent
-        :return: modified df as pd.DataFrame
+        :param mods: dictionary mapping MaxQuant-specific mod patterns (keys) to ProForma standard (values)
         """
-        df.rename(columns={"CHARGE": "PRECURSOR_CHARGE"}, inplace=True)
-
-        mods = {**(c.MAXQUANT_VAR_MODS), **(stat_mods or {}), **(var_mods or {})}
-
-        df["REVERSE"].fillna(False, inplace=True)
-        df["REVERSE"].replace("+", True, inplace=True)
-        logger.info("Converting MaxQuant peptide sequence to internal format")
-        if tmt_labeled != "":
-            unimod_tag = c.TMT_MODS[tmt_labeled]
-            logger.info("Adding TMT fixed modifications")
-            df["MODIFIED_SEQUENCE"] = maxquant_to_internal(
-                df["MODIFIED_SEQUENCE"].to_numpy(),
-                mods={**{"C": "C[UNIMOD:4]", "^_": f"_{unimod_tag}-", "K": f"K{unimod_tag}"}, **mods},
-            )
-            df["MASS"] = df.apply(lambda x: MaxQuant.add_tmt_mod(x.MASS, x.MODIFIED_SEQUENCE, unimod_tag), axis=1)
-            if "msa" in tmt_labeled:
-                logger.info("Replacing phospho by dehydration for Phospho-MSA")
-                df["MODIFIED_SEQUENCE_MSA"] = df["MODIFIED_SEQUENCE"].str.replace(
-                    "[UNIMOD:21]", "[UNIMOD:23]", regex=False
-                )
-        elif "LABELING_STATE" in df.columns:
-            logger.info("Adding SILAC fixed modifications")
-
-            df.loc[df["LABELING_STATE"] == 1, "MODIFIED_SEQUENCE"] = maxquant_to_internal(
-                df[df["LABELING_STATE"] == 1]["MODIFIED_SEQUENCE"].to_numpy(),
-                mods={**{"C": "C[UNIMOD:4]", "K": "K[UNIMOD:259]", "R": "R[UNIMOD:267]"}, **mods},
-            )
-            df.loc[df["LABELING_STATE"] != 1, "MODIFIED_SEQUENCE"] = maxquant_to_internal(
-                df[df["LABELING_STATE"] != 1]["MODIFIED_SEQUENCE"].to_numpy(), mods={**{"C": "C[UNIMOD:4]"}, **mods}
-            )
-            df["MASS"] = df.apply(lambda x: MaxQuant.add_tmt_mod(x.MASS, x.MODIFIED_SEQUENCE, "[UNIMOD:259]"), axis=1)
-            df["MASS"] = df.apply(lambda x: MaxQuant.add_tmt_mod(x.MASS, x.MODIFIED_SEQUENCE, "[UNIMOD:267]"), axis=1)
-            df.drop(columns=["LABELING_STATE"], inplace=True)
-        else:
-            df["MODIFIED_SEQUENCE"] = maxquant_to_internal(
-                df["MODIFIED_SEQUENCE"].to_numpy(), mods={**{"C": "C[UNIMOD:4]"}, **mods}
-            )
-        df["SEQUENCE"] = internal_without_mods(df["MODIFIED_SEQUENCE"])
-        df["PEPTIDE_LENGTH"] = df["SEQUENCE"].apply(lambda x: len(x))
-        df["PROTEINS"].fillna("UNKNOWN", inplace=True)
-
-        return df
+        df = self.results
+        # Standardize column names
+        # df.columns = df.columns.str.upper()
+        # df.columns = df.columns.str.replace(" ", "_")
+        # df.rename(columns={"CHARGE": "PRECURSOR_CHARGE"}, inplace=True)
+
+        mods["_"] = ""
+
+        df.fillna({"Reverse": "", "Proteins": "UNKNOWN"}, inplace=True)
+        df["Reverse"] = df["Reverse"].astype(bool)
+        df.replace({"Modified sequence": mods}, regex=True, inplace=True)
+
+        df["Sequence"] = internal_without_mods(df["Modified sequence"])
+        df["PEPTIDE_LENGTH"] = df["Sequence"].str.len()
+
+        df.rename(
+            columns={
+                "Reverse": "REVERSE",
+                "Sequence": "SEQUENCE",
+                "Modified sequence": "MODIFIED_SEQUENCE",
+                "Proteins": "PROTEINS",
+                "Charge": "PRECURSOR_CHARGE",
+                "Raw file": "RAW_FILE",
+                "Scan number": "SCAN_NUMBER",
+                "Scan event number": "SCAN_EVENT_NUMBER",
+                "Mass": "MASS",
+                "Score": "SCORE",
+            },
+            inplace=True,
+        )
 
     def generate_internal_timstof_metadata(self):
         """

diff --git a/spectrum_io/search_result/msfragger.py b/spectrum_io/search_result/msfragger.py
@@ -6,10 +6,10 @@
 import spectrum_fundamentals.constants as c
 from pyteomics import pepxml
 from spectrum_fundamentals.constants import MSFRAGGER_VAR_MODS
-from spectrum_fundamentals.mod_string import internal_without_mods, msfragger_to_internal
+from spectrum_fundamentals.mod_string import internal_without_mods
 from tqdm import tqdm
 
-from .search_results import SearchResults, filter_valid_prosit_sequences
+from .search_results import SearchResults, filter_valid_prosit_sequences, parse_mods
 
 logger = logging.getLogger(__name__)
 
@@ -19,17 +19,26 @@ class MSFragger(SearchResults):
 
     def read_result(
         self,
-        tmt_labeled: str,
-        custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
+        tmt_label: str = "",
+        custom_mods: Optional[Dict[str, int]] = None,
     ) -> pd.DataFrame:
         """
         Function to read a msms txt and perform some basic formatting.
 
-        :param tmt_labeled: tmt label as str
-        :param custom_mods: dict with custom variable and static identifier and respecitve internal equivalent and mass
+        :param tmt_label: optional tmt label as str
+        :param custom_mods: optional dictionary mapping MSFragger-specific mod pattern to UNIMOD IDs.
+            If None, static carbamidomethylation of cysteine and variable oxidation of methionine
+            are mapped automatically. To avoid this, explicitly provide an empty dictionary.
         :raises FileNotFoundError: in case the given path is neither a file, nor a directory.
         :return: pd.DataFrame with the formatted data
         """
+        if custom_mods is None:
+            custom_mods = {"C": 4, "M[147]": 35}
+        parsed_mods = parse_mods(custom_mods)
+        if tmt_label:
+            unimod_tag = c.TMT_MODS[tmt_label]
+            parsed_mods["K"] = f"K{unimod_tag}"
+            parsed_mods[r"^n\[\d+\]"] = f"{unimod_tag}-"
         if self.path.is_file():
             file_list = [self.path]
         elif self.path.is_dir():
@@ -41,76 +50,57 @@ def read_result(
         for pep_xml_file in tqdm(file_list):
             ms_frag_results.append(pepxml.DataFrame(str(pep_xml_file)))
 
-        df = pd.concat(ms_frag_results)
-
-        stat_mods: Dict[str, str] = {}
-        var_mods: Dict[str, str] = {}
-
-        if custom_mods is not None:
-            stat_mods = {key: value[0] for key, value in (custom_mods.get("stat_mods") or {}).items()}
-            var_mods = {key: value[0] for key, value in (custom_mods.get("var_mods") or {}).items()}
-
-        df = update_columns_for_prosit(df, tmt_labeled, stat_mods=stat_mods, var_mods=var_mods)
-        return filter_valid_prosit_sequences(df)
-
-
-def update_columns_for_prosit(
-    df, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None, var_mods: Optional[Dict[str, str]] = None
-) -> pd.DataFrame:
-    """
-    Update columns of df to work with Prosit.
-
-    :param df: df to modify
-    :param tmt_labeled: True if tmt labeled
-    :param var_mods: dict with custom variable identifier and respecitve internal equivalent
-    :param stat_mods: dict with custom static identifier and respecitve internal equivalent
-    :return: modified df as pd.DataFrame
-    """
-    df["PROTEINS"] = df["protein"]
-    df["PROTEINS"].fillna("UNKNOWN", inplace=True)
-    df["REVERSE"] = df["protein"].apply(lambda x: "rev" in str(x))
-    df["RAW_FILE"] = df["spectrum"].apply(lambda x: x.split(".")[0])
-    df["MASS"] = df["precursor_neutral_mass"]
-    df["PEPTIDE_LENGTH"] = df["peptide"].apply(lambda x: len(x))
-
-    mods = {**(MSFRAGGER_VAR_MODS), **(stat_mods or {}), **(var_mods or {})}
-
-    if tmt_labeled != "":
-        unimod_tag = c.TMT_MODS[tmt_labeled]
-        logger.info("Adding TMT fixed modifications")
-        mods = {**{"C": "C[UNIMOD:4]", r"n[\d+]": f"{unimod_tag}-", "K": f"K{unimod_tag}"}, **mods}
-        df["MODIFIED_SEQUENCE"] = msfragger_to_internal(df["modified_peptide"].to_list(), mods=mods)
-    else:
-        # By default, i.e. if nothing is supplied to fixed_mods, carbamidomethylation on cystein will be included
-        # in the fixed modifications. If you want to have no fixed modifictions at all, supply fixed_mods={}
-        mods = {**{"C": "C[UNIMOD:4]"}, **mods}
-        df["MODIFIED_SEQUENCE"] = msfragger_to_internal(df["modified_peptide"].to_list(), mods=mods)
-
-    df.rename(
-        columns={
-            "assumed_charge": "PRECURSOR_CHARGE",
-            "index": "SCAN_EVENT_NUMBER",
-            "peptide": "SEQUENCE",
-            "start_scan": "SCAN_NUMBER",
-            "hyperscore": "SCORE",
-        },
-        inplace=True,
-    )
-    df["SEQUENCE"] = internal_without_mods(df["MODIFIED_SEQUENCE"])
-    df["PROTEINS"] = df["PROTEINS"].apply(lambda x: ";".join(x))
-
-    return df[
-        [
-            "RAW_FILE",
-            "SCAN_NUMBER",
-            "MODIFIED_SEQUENCE",
-            "PRECURSOR_CHARGE",
-            "SCAN_EVENT_NUMBER",
-            "MASS",
-            "SCORE",
-            "REVERSE",
-            "SEQUENCE",
-            "PEPTIDE_LENGTH",
-            "PROTEINS",
+        self.results = pd.concat(ms_frag_results)
+
+        self.convert_to_internal(mods=parsed_mods)
+        return filter_valid_prosit_sequences(self.results)
+
+    def convert_to_internal(self, mods: Dict[str, str]) -> pd.DataFrame:
+        """
+        Convert all columns in the MSFragger output to the internal format used by Oktoberfest.
+
+        :param mods: dictionary mapping MSFragger-specific mod patterns (keys) to ProForma standard (values)
+        """
+        df = self.results
+        df["protein"] = df["protein"].fillna("UNKNOWN").apply(lambda x: ";".join(x))
+
+        df["REVERSE"] = df["protein"].apply(lambda x: "rev" in str(x))
+        df["spectrum"] = df["spectrum"].str.split(pat=".", n=1).str[0]
+        df["PEPTIDE_LENGTH"] = df["peptide"].str.len()
+
+        df.replace({"modified_peptide": mods}, regex=True, inplace=True)
+        df["peptide"] = internal_without_mods(df["modified_peptide"])
+
+        df.rename(
+            columns={
+                "assumed_charge": "PRECURSOR_CHARGE",
+                "index": "SCAN_EVENT_NUMBER",
+                "peptide": "SEQUENCE",
+                "start_scan": "SCAN_NUMBER",
+                "hyperscore": "SCORE",
+                "modified_peptide": "MODIFIED_SEQUENCE",
+                "protein": "PROTEINS",
+                "peptide": "SEQUENCE",
+                "precursor_neutral_mass": "MASS",
+                "spectrum": "RAW_FILE",
+            },
+            inplace=True,
+        )
+
+        """
+        return df[
+            [
+                "RAW_FILE",
+                "SCAN_NUMBER",
+                "MODIFIED_SEQUENCE",
+                "PRECURSOR_CHARGE",
+                "SCAN_EVENT_NUMBER",
+                "MASS",
+                "SCORE",
+                "REVERSE",
+                "SEQUENCE",
+                "PEPTIDE_LENGTH",
+                "PROTEINS",
+            ]
         ]
-    ]
+        """