added sage.py parser - tmt label in the update_columns_for_prosit fun…

…ction
wilhelm-lab · Oct 24, 2023 · 85e8900 · 85e8900
1 parent 3a01722
commit 85e8900
Showing 1 changed file with 84 additions and 1 deletion.
diff --git a/spectrum_io/search_result/sage.py b/spectrum_io/search_result/sage.py
@@ -1 +1,84 @@
-s
+import logging
+from pathlib import Path
+from typing import Union
+
+import pandas as pd
+import spectrum_fundamentals.constants as c
+from spectrum_fundamentals.mod_string import internal_without_mods, maxquant_to_internal
+
+from .search_results import SearchResults, filter_valid_prosit_sequences
+
+logger = logging.getLogger(__name__)
+
+
+class Sage(SearchResults):
+
+    @staticmethod
+    def add_tmt_mod(mass: float, seq: str, unimod_tag: str) -> float:
+        """
+        Add tmt modification.
+
+        :param mass: mass without tmt modification
+        :param seq: sequence of the peptide
+        :param unimod_tag: UNIMOD tag for the modification
+        :return: mass as float
+        """
+        num_of_tmt = seq.count(unimod_tag)
+        mass += num_of_tmt * c.MOD_MASSES[f"{unimod_tag}"]
+        return mass
+
+    @staticmethod
+    def read_result(path: Union[str, Path], tmt_labeled: str) -> pd.DataFrame:
+        """
+        Function to read a msms txt and perform some basic formatting.
+
+        :param path: path to msms.txt to read
+        :param tmt_labeled: tmt label as str
+        :return: pd.DataFrame with the formatted data
+        """
+        logger.info("Reading msms.txt file")
+        df = pd.read_csv(
+            path,
+            usecols=lambda x: x.upper()
+            in [
+                "filename",
+                "scannr",
+                "peptide",
+                "charge",
+                "hyperscore",
+                "calcmass",
+            ],
+            sep="\t",
+        )
+        logger.info("Finished reading msms.txt file")
+
+        # Standardize column names
+        df.columns = df.columns.str.upper()
+        df.columns = df.columns.str.replace(" ", "_")
+
+        df = Sage.modify_columns_for_prosit(df, tmt_labeled)
+        return filter_valid_prosit_sequences(df)
+
+    @staticmethod
+    def update_columns_for_prosit(df: pd.DataFrame , tmt_labeled: str) -> pd.DataFrame:
+        df = df.rename (columns={'FILENAME':'RAW_FILE','SCANNR':'SCAN_NUMBER','PEPTIDE':'MODIFIED_SEQUENCE','CHARGE':'PRECURSOR_CHARGE'})
+        # TODO modified sequence should be changed from proforma to unimod
+        # removing .mzML
+        df['RAW_FILE'] = df['RAW_FILE'].str.replace(".mzML","")
+        # extracting only the scan number 
+        df['SCAN_NUMBER'] = df['SCAN_NUMBER'].str.split('=').str[3:].str.join('=')
+        # creating a column of decoys and targets
+        df['REVERSE'] = df['PROTEINS'].str.startswith('rev_')
+        # removing modification to create the unmodified sequences
+        df['SEQUENCE'] = df['MODIFIED_SEQUENCE'].str.replace(r'\[.*?\]', '', regex=True)
+        #length of the peptide
+        df['PEPTIDE_LENGTH'] = df['SEQUENCE'].str.len()
+        # mass of the peptide
+        df['MASS'] = df['CALCMASS']
+        # score of the peptide
+        df['SCORE'] = df['HYPERSCORE']
+
+        return df
+
+
+