From f5de6a32c15571a60edd510b1b0869aa9084af65 Mon Sep 17 00:00:00 2001
From: Mario Picciani <mario.picciani@tum.de>
Date: Wed, 6 Dec 2023 13:38:43 +0100
Subject: [PATCH 1/2] added MSAmanda class structure

---
 spectrum_io/search_result/__init__.py |   1 +
 spectrum_io/search_result/msamanda.py | 226 +++++++++++++-------------
 2 files changed, 116 insertions(+), 111 deletions(-)

diff --git a/spectrum_io/search_result/__init__.py b/spectrum_io/search_result/__init__.py
index 53e4dd0..6736c70 100644
--- a/spectrum_io/search_result/__init__.py
+++ b/spectrum_io/search_result/__init__.py
@@ -1,5 +1,6 @@
 """Initialize seach result."""
 from .mascot import Mascot
 from .maxquant import MaxQuant
+from .msamanda import MSAmanda
 from .msfragger import MSFragger
 from .sage import Sage
diff --git a/spectrum_io/search_result/msamanda.py b/spectrum_io/search_result/msamanda.py
index 4be13e6..f4c48ba 100644
--- a/spectrum_io/search_result/msamanda.py
+++ b/spectrum_io/search_result/msamanda.py
@@ -5,121 +5,125 @@
 import pandas as pd
 from spectrum_fundamentals.constants import PARTICLE_MASSES
 
-from .search_results import filter_valid_prosit_sequences
+from .search_results import SearchResults, filter_valid_prosit_sequences
 
 logger = logging.getLogger(__name__)
 
 
-def _update_columns_for_prosit(df: pd.DataFrame):
-    """
-    Update columns of df to work with Prosit.
-
-    :param df: df to modify
-    :return: modified df as pd.DataFrame
-    """
-
-    def _check_unsupported_mods(modstring: str):
-        if modstring:
-            for mod in modstring.split(";"):
-                if not mod.split("(", 1)[1].startswith(("Oxidation", "Carbamidomethyl")):
-                    raise AssertionError(f"Unsupported modification: {mod}")
-
-    def _make_mod_seq(seq: str):
-        seq = seq.replace("m", "M[UNIMOD:35]")
-        seq = seq.replace("c", "C[UNIMOD:4]")
-        return seq
-
-    df["REVERSE"] = df["Protein Accessions"].str.startswith("REV_")
-    df["MASS"] = df["m/z"] * df["PRECURSOR_CHARGE"] - (df["PRECURSOR_CHARGE"] * PARTICLE_MASSES["PROTON"])
-    df["Modifications"].fillna("").apply(_check_unsupported_mods)
-    df["MODIFIED_SEQUENCE"] = df["SEQUENCE"].apply(_make_mod_seq)
-    df["SEQUENCE"] = df["SEQUENCE"].str.upper()
-    df["PEPTIDE_LENGTH"] = df["SEQUENCE"].str.len()
-    df["RAW_FILE"] = df["RAW_FILE"].str.replace(r"\.\w+$", "", regex=True)
-
-    return df[
-        [
-            "SCORE",
-            "MASS",
-            "PRECURSOR_CHARGE",
-            "RAW_FILE",
-            "SCAN_NUMBER",
-            "REVERSE",
-            "MODIFIED_SEQUENCE",
-            "SEQUENCE",
-            "PEPTIDE_LENGTH",
+class MSAmanda(SearchResults):
+    """Handle search results from MSAmanda."""
+
+    @staticmethod
+    def _update_columns_for_prosit(df: pd.DataFrame):
+        """
+        Update columns of df to work with Prosit.
+
+        :param df: df to modify
+        :return: modified df as pd.DataFrame
+        """
+
+        def _check_unsupported_mods(modstring: str):
+            if modstring:
+                for mod in modstring.split(";"):
+                    if not mod.split("(", 1)[1].startswith(("Oxidation", "Carbamidomethyl")):
+                        raise AssertionError(f"Unsupported modification: {mod}")
+
+        def _make_mod_seq(seq: str):
+            seq = seq.replace("m", "M[UNIMOD:35]")
+            seq = seq.replace("c", "C[UNIMOD:4]")
+            return seq
+
+        df["REVERSE"] = df["Protein Accessions"].str.startswith("REV_")
+        df["MASS"] = df["m/z"] * df["PRECURSOR_CHARGE"] - (df["PRECURSOR_CHARGE"] * PARTICLE_MASSES["PROTON"])
+        df["Modifications"].fillna("").apply(_check_unsupported_mods)
+        df["MODIFIED_SEQUENCE"] = df["SEQUENCE"].apply(_make_mod_seq)
+        df["SEQUENCE"] = df["SEQUENCE"].str.upper()
+        df["PEPTIDE_LENGTH"] = df["SEQUENCE"].str.len()
+        df["RAW_FILE"] = df["RAW_FILE"].str.replace(r"\.\w+$", "", regex=True)
+
+        return df[
+            [
+                "SCORE",
+                "MASS",
+                "PRECURSOR_CHARGE",
+                "RAW_FILE",
+                "SCAN_NUMBER",
+                "REVERSE",
+                "MODIFIED_SEQUENCE",
+                "SEQUENCE",
+                "PEPTIDE_LENGTH",
+            ]
         ]
-    ]
-
-
-def _remove_decoys_in_targets(full_df):
-    duplicated_psms = full_df[["SCAN_NUMBER", "RAW_FILE", "MODIFIED_SEQUENCE"]].duplicated(keep=False)
-    logger.info(f"Removing {sum(duplicated_psms)} duplicated PSMs...")
-    full_df = full_df[~(full_df["REVERSE"] & duplicated_psms)]
-    if any(full_df[["SCAN_NUMBER", "RAW_FILE", "MODIFIED_SEQUENCE"]].duplicated(keep=False)):
-        raise AssertionError("There are duplicate target PSMs. This is a bug of msamanda!")
-    logger.info(f"{len(full_df)} remaining PSMs")
-
-    return full_df
-
-
-def read_result(path: Union[str, Path], suffix: str = "output.csv") -> pd.DataFrame:
-    """
-    Function to read a msms txt and perform some basic formatting.
-
-    :param path: path to msms.txt to read
-    :param suffix: Optional suffix to determine which fileresult files should be taken from the supplied path
-    :raises FileNotFoundError: If the supplied path is not found
-    :raises AssertionError: If the supplied path does not contain any files matching the provided suffix.
-    :return: pd.DataFrame with the formatted data
-    """
-    if isinstance(path, str):
-        path = Path(path)
-    if path.is_file():
-        pathlist = [path]
-    elif path.is_dir():
-        pathlist = list(path.glob(f"*{suffix}"))
-        if not pathlist:
-            raise AssertionError(f"The directory does not contain any files that match the pattern *{suffix}")
-    else:
-        raise FileNotFoundError(f"{path} does not exist.")
-
-    df_list = []
-    for output_file in pathlist:
-        logger.info(f"Reading {output_file}...")
-        df = pd.read_csv(
-            output_file,
-            sep="\t",
-            skiprows=1,
-            usecols=[
-                "Scan Number",
-                "Sequence",
+
+    @staticmethod
+    def _remove_decoys_in_targets(full_df):
+        duplicated_psms = full_df[["SCAN_NUMBER", "RAW_FILE", "MODIFIED_SEQUENCE"]].duplicated(keep=False)
+        logger.info(f"Removing {sum(duplicated_psms)} duplicated PSMs...")
+        full_df = full_df[~(full_df["REVERSE"] & duplicated_psms)]
+        if any(full_df[["SCAN_NUMBER", "RAW_FILE", "MODIFIED_SEQUENCE"]].duplicated(keep=False)):
+            raise AssertionError("There are duplicate target PSMs. This is a bug of msamanda!")
+        logger.info(f"{len(full_df)} remaining PSMs")
+
+        return full_df
+
+    @staticmethod
+    def read_result(path: Union[str, Path], suffix: str = "output.csv") -> pd.DataFrame:
+        """
+        Function to read a msms txt and perform some basic formatting.
+
+        :param path: path to msms.txt to read
+        :param suffix: Optional suffix to determine which fileresult files should be taken from the supplied path
+        :raises FileNotFoundError: If the supplied path is not found
+        :raises AssertionError: If the supplied path does not contain any files matching the provided suffix.
+        :return: pd.DataFrame with the formatted data
+        """
+        if isinstance(path, str):
+            path = Path(path)
+        if path.is_file():
+            pathlist = [path]
+        elif path.is_dir():
+            pathlist = list(path.glob(f"*{suffix}"))
+            if not pathlist:
+                raise AssertionError(f"The directory does not contain any files that match the pattern *{suffix}")
+        else:
+            raise FileNotFoundError(f"{path} does not exist.")
+
+        df_list = []
+        for output_file in pathlist:
+            logger.info(f"Reading {output_file}...")
+            df = pd.read_csv(
+                output_file,
+                sep="\t",
+                skiprows=1,
+                usecols=[
+                    "Scan Number",
+                    "Sequence",
+                    "Modifications",
+                    "Filename",
+                    "Amanda Score",
+                    "m/z",
+                    "Charge",
+                    "Protein Accessions",
+                ],
+            )
+            df.columns = [
+                "SCAN_NUMBER",
+                "SEQUENCE",
                 "Modifications",
-                "Filename",
-                "Amanda Score",
-                "m/z",
-                "Charge",
                 "Protein Accessions",
-            ],
-        )
-        df.columns = [
-            "SCAN_NUMBER",
-            "SEQUENCE",
-            "Modifications",
-            "Protein Accessions",
-            "SCORE",
-            "m/z",
-            "PRECURSOR_CHARGE",
-            "RAW_FILE",
-        ]
-        logger.info(f"Finished reading {output_file}")
-
-        df = _update_columns_for_prosit(df)
-        df = filter_valid_prosit_sequences(df)
-        df_list.append(df)
-
-    if len(df_list) == 1:
-        full_df = df_list[0]
-    full_df = pd.concat(df_list, axis=0)
-    full_df = _remove_decoys_in_targets(full_df)
-    return full_df
+                "SCORE",
+                "m/z",
+                "PRECURSOR_CHARGE",
+                "RAW_FILE",
+            ]
+            logger.info(f"Finished reading {output_file}")
+
+            df = MSAmanda._update_columns_for_prosit(df)
+            df = filter_valid_prosit_sequences(df)
+            df_list.append(df)
+
+        if len(df_list) == 1:
+            full_df = df_list[0]
+        full_df = pd.concat(df_list, axis=0)
+        full_df = MSAmanda._remove_decoys_in_targets(full_df)
+        return full_df

From 5c3eaf4058b85693fd1366b1740baeb8f21ad58d Mon Sep 17 00:00:00 2001
From: Mario Picciani <mario.picciani@tum.de>
Date: Thu, 12 Sep 2024 13:06:00 +0200
Subject: [PATCH 2/2] msamanda with new class structure and unit test

---
 spectrum_io/search_result/msamanda.py       | 167 ++++++++------------
 spectrum_io/search_result/search_results.py |   2 +-
 tests/unit_tests/test_sage.py               |   6 +-
 3 files changed, 73 insertions(+), 102 deletions(-)

diff --git a/spectrum_io/search_result/msamanda.py b/spectrum_io/search_result/msamanda.py
index f4c48ba..0d667f5 100644
--- a/spectrum_io/search_result/msamanda.py
+++ b/spectrum_io/search_result/msamanda.py
@@ -1,11 +1,10 @@
 import logging
-from pathlib import Path
-from typing import Union
+from typing import Dict, Optional
 
 import pandas as pd
 from spectrum_fundamentals.constants import PARTICLE_MASSES
 
-from .search_results import SearchResults, filter_valid_prosit_sequences
+from .search_results import SearchResults, filter_valid_prosit_sequences, parse_mods
 
 logger = logging.getLogger(__name__)
 
@@ -13,117 +12,91 @@
 class MSAmanda(SearchResults):
     """Handle search results from MSAmanda."""
 
-    @staticmethod
-    def _update_columns_for_prosit(df: pd.DataFrame):
-        """
-        Update columns of df to work with Prosit.
-
-        :param df: df to modify
-        :return: modified df as pd.DataFrame
-        """
-
-        def _check_unsupported_mods(modstring: str):
-            if modstring:
-                for mod in modstring.split(";"):
-                    if not mod.split("(", 1)[1].startswith(("Oxidation", "Carbamidomethyl")):
-                        raise AssertionError(f"Unsupported modification: {mod}")
-
-        def _make_mod_seq(seq: str):
-            seq = seq.replace("m", "M[UNIMOD:35]")
-            seq = seq.replace("c", "C[UNIMOD:4]")
-            return seq
-
-        df["REVERSE"] = df["Protein Accessions"].str.startswith("REV_")
-        df["MASS"] = df["m/z"] * df["PRECURSOR_CHARGE"] - (df["PRECURSOR_CHARGE"] * PARTICLE_MASSES["PROTON"])
-        df["Modifications"].fillna("").apply(_check_unsupported_mods)
-        df["MODIFIED_SEQUENCE"] = df["SEQUENCE"].apply(_make_mod_seq)
-        df["SEQUENCE"] = df["SEQUENCE"].str.upper()
-        df["PEPTIDE_LENGTH"] = df["SEQUENCE"].str.len()
-        df["RAW_FILE"] = df["RAW_FILE"].str.replace(r"\.\w+$", "", regex=True)
-
-        return df[
-            [
-                "SCORE",
-                "MASS",
-                "PRECURSOR_CHARGE",
-                "RAW_FILE",
-                "SCAN_NUMBER",
-                "REVERSE",
-                "MODIFIED_SEQUENCE",
-                "SEQUENCE",
-                "PEPTIDE_LENGTH",
-            ]
-        ]
+    @property
+    def standard_mods(self):
+        """Standard modifications that are always applied if not otherwise specified."""
+        return {"m": 35, "c": 4}
 
-    @staticmethod
-    def _remove_decoys_in_targets(full_df):
-        duplicated_psms = full_df[["SCAN_NUMBER", "RAW_FILE", "MODIFIED_SEQUENCE"]].duplicated(keep=False)
-        logger.info(f"Removing {sum(duplicated_psms)} duplicated PSMs...")
-        full_df = full_df[~(full_df["REVERSE"] & duplicated_psms)]
-        if any(full_df[["SCAN_NUMBER", "RAW_FILE", "MODIFIED_SEQUENCE"]].duplicated(keep=False)):
-            raise AssertionError("There are duplicate target PSMs. This is a bug of msamanda!")
-        logger.info(f"{len(full_df)} remaining PSMs")
-
-        return full_df
-
-    @staticmethod
-    def read_result(path: Union[str, Path], suffix: str = "output.csv") -> pd.DataFrame:
+    def read_result(
+        self, tmt_label: str = "", custom_mods: Optional[Dict[str, int]] = None, suffix: str = "output.csv"
+    ) -> pd.DataFrame:
         """
         Function to read a msms txt and perform some basic formatting.
 
-        :param path: path to msms.txt to read
+        :param tmt_label: optional tmt label as str
+        :param custom_mods: optional dictionary mapping Sage-specific mod pattern to UNIMOD IDs.
+            If None, static carbamidomethylation of cytein and variable oxidation of methionine
+            are mapped automatically. To avoid this, explicitely provide an empty dictionary.
         :param suffix: Optional suffix to determine which fileresult files should be taken from the supplied path
         :raises FileNotFoundError: If the supplied path is not found
         :raises AssertionError: If the supplied path does not contain any files matching the provided suffix.
+        :raises NotImplementedError: If tmt label was supplied.
         :return: pd.DataFrame with the formatted data
         """
-        if isinstance(path, str):
-            path = Path(path)
-        if path.is_file():
-            pathlist = [path]
-        elif path.is_dir():
-            pathlist = list(path.glob(f"*{suffix}"))
+        parsed_mods = parse_mods(self.standard_mods | (custom_mods or {}))
+        print(parsed_mods)
+        if tmt_label:
+            raise NotImplementedError("TMT is not supported for MSAmanda. Please open an issue on github.")
+
+        if self.path.is_file():
+            pathlist = [self.path]
+            print("hooray")
+        elif self.path.is_dir():
+            pathlist = list(self.path.glob(f"*{suffix}"))
             if not pathlist:
                 raise AssertionError(f"The directory does not contain any files that match the pattern *{suffix}")
         else:
-            raise FileNotFoundError(f"{path} does not exist.")
+            raise FileNotFoundError(f"{self.path} does not exist.")
 
         df_list = []
         for output_file in pathlist:
             logger.info(f"Reading {output_file}...")
-            df = pd.read_csv(
-                output_file,
-                sep="\t",
-                skiprows=1,
-                usecols=[
-                    "Scan Number",
-                    "Sequence",
-                    "Modifications",
-                    "Filename",
-                    "Amanda Score",
-                    "m/z",
-                    "Charge",
-                    "Protein Accessions",
-                ],
+            df_list.append(
+                pd.read_csv(
+                    output_file,
+                    sep="\t",
+                    skiprows=1,
+                    usecols=[
+                        "Scan Number",
+                        "Sequence",
+                        # "Modifications",
+                        "Protein Accessions",
+                        "Amanda Score",
+                        "m/z",
+                        "Charge",
+                        "Filename",
+                    ],
+                )
             )
-            df.columns = [
-                "SCAN_NUMBER",
-                "SEQUENCE",
-                "Modifications",
-                "Protein Accessions",
-                "SCORE",
-                "m/z",
-                "PRECURSOR_CHARGE",
-                "RAW_FILE",
-            ]
             logger.info(f"Finished reading {output_file}")
 
-            df = MSAmanda._update_columns_for_prosit(df)
-            df = filter_valid_prosit_sequences(df)
-            df_list.append(df)
+        self.results = pd.concat(df_list)
+
+        self.convert_to_internal(mods=parsed_mods)
+        return filter_valid_prosit_sequences(self.results)
+
+    def convert_to_internal(self, mods: Dict[str, str]):
+        """
+        Convert all columns in the Sage output to the internal format used by Oktoberfest.
+
+        :param mods: dictionary mapping Sage-specific mod patterns (keys) to ProForma standard (values)
+        """
+        df = self.results
+        df["REVERSE"] = df["Protein Accessions"].str.startswith("REV_")
+        df["m/z"] = df["Charge"] * (df["m/z"] - PARTICLE_MASSES["PROTON"])
+        df["SEQUENCE"] = df["Sequence"].str.upper()
+        df.replace({"RAW_FILE": {r"\.\w+$": ""}, "Sequence": mods}, regex=True, inplace=True)
+        df["PEPTIDE_LENGTH"] = df["SEQUENCE"].str.len()
 
-        if len(df_list) == 1:
-            full_df = df_list[0]
-        full_df = pd.concat(df_list, axis=0)
-        full_df = MSAmanda._remove_decoys_in_targets(full_df)
-        return full_df
+        df.rename(
+            columns={
+                "Filename": "RAW_FILE",
+                "Scan Number": "SCAN_NUMBER",
+                "Sequence": "MODIFIED_SEQUENCE",
+                "Charge": "PRECURSOR_CHARGE",
+                "m/z": "MASS",
+                "Amanda Score": "SCORE",
+                "Protein Accessions": "PROTEINS",
+            },
+            inplace=True,
+        )
diff --git a/spectrum_io/search_result/search_results.py b/spectrum_io/search_result/search_results.py
index 5cda972..720abde 100644
--- a/spectrum_io/search_result/search_results.py
+++ b/spectrum_io/search_result/search_results.py
@@ -78,7 +78,7 @@ def parse_mods(mods: Dict[str, int]) -> Dict[str, str]:
                 f"Replacement {k} not understood. Replacements must be strings and follow " f"the pattern {key_pattern}"
             )
         if k[0].isalpha():
-            unimod_regex_map[re.escape(k)] = f"{k[0]}[UNIMOD:{v}]"
+            unimod_regex_map[re.escape(k)] = f"{k[0].upper()}[UNIMOD:{v}]"
             continue
         if k[0] == "^":
             unimod_regex_map[f"^{re.escape(k[1:])}"] = f"[UNIMOD:{v}]-"
diff --git a/tests/unit_tests/test_sage.py b/tests/unit_tests/test_sage.py
index 679f9da..4ec4a49 100644
--- a/tests/unit_tests/test_sage.py
+++ b/tests/unit_tests/test_sage.py
@@ -25,11 +25,9 @@ class TestSage(unittest.TestCase):
     def test_read_sage(self):
         """Test function for reading sage results and transforming to Prosit format."""
         expected_sage_internal_path = Path(__file__).parent / "data" / "sage_output_internal.csv"
-        internal_search_results_df = Sage(Path(__file__).parent / "data" / "sage_output.tsv").read_result(
-            tmt_label="tmt"
-        )
+        search_results_df = Sage(Path(__file__).parent / "data" / "sage_output.tsv").read_result(tmt_label="tmt")
         expected_df = pd.read_csv(expected_sage_internal_path)
-        pd.testing.assert_frame_equal(internal_search_results_df[COLUMNS], expected_df[COLUMNS])
+        pd.testing.assert_frame_equal(search_results_df[COLUMNS], expected_df[COLUMNS])
 
     def test_read_sage_with_custom_mods(self):
         """Test function for reading sage results with custom mods and transforming to Prosit format ."""