Added api for spectrum io

wilhelm-lab · Aug 16, 2023 · 89dae69 · 89dae69
1 parent e98c491
commit 89dae69
Show file tree

Hide file tree

Showing 12 changed files with 296 additions and 313 deletions.
diff --git a/.flake8 b/.flake8
@@ -10,3 +10,4 @@ per-file-ignores =
 	spectrum_io/raw/thermo_raw.py:S603,S404
 	spectrum_io/raw/msraw.py:S405,S314
         docs/conf.py:S404,S607,S603
+        spectrum_io/search_result/__init__.py:F403
diff --git a/spectrum_io/__init__.py b/spectrum_io/__init__.py
@@ -10,7 +10,6 @@
 import time
 
 from . import file, raw
-from .search_result import MaxQuant
 from .spectral_library import DLib, Spectronaut
 
 CONSOLE_LOG_LEVEL = logging.INFO

diff --git a/spectrum_io/search_result/__init__.py b/spectrum_io/search_result/__init__.py
@@ -1,4 +1,2 @@
 """Initialize search result."""
-from .mascot import Mascot
-from .maxquant import MaxQuant
-from .msfragger import MSFragger
+from .process import *
diff --git a/spectrum_io/search_result/filter.py b/spectrum_io/search_result/filter.py
@@ -0,0 +1,29 @@
+import logging
+import re
+
+import pandas as pd
+
+logger = logging.getLogger(__name__)
+
+
+def filter_valid_prosit_sequences(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Keep only the rows of df that pass the Prosit length, modification, residue and charge filters.
+
+    :param df: df to filter; must provide PEPTIDE_LENGTH, MODIFIED_SEQUENCE, SEQUENCE and PRECURSOR_CHARGE
+    :return: df after filtering out unsupported peptides (row counts before/after are logged at INFO)
+    """
+    logger.info(f"#sequences before filtering for valid prosit sequences: {len(df.index)}")
+    # retain only peptides whose length lies in the inclusive range [7, 30] supported by Prosit
+    df = df[(df["PEPTIDE_LENGTH"] <= 30) & (df["PEPTIDE_LENGTH"] >= 7)]
+    # drop rows whose MODIFIED_SEQUENCE matches any of these regexes (note: "ac" matches anywhere)
+    unsupported_mods = [r"Acetyl \(Protein N\-term\)", "ac", r"\[[0-9]+\]"]
+    exclude_mods_pattern = re.compile("|".join(unsupported_mods))
+    df = df[~df["MODIFIED_SEQUENCE"].str.contains(exclude_mods_pattern)]
+    # remove sequences containing the non-canonical amino acids U or O
+    df = df[(~df["SEQUENCE"].str.contains("U|O"))]
+    # keep only precursor charges up to and including 6
+    df = df[df["PRECURSOR_CHARGE"] <= 6]
+    logger.info(f"#sequences after filtering for valid prosit sequences: {len(df.index)}")
+
+    return df
diff --git a/spectrum_io/search_result/mascot.py b/spectrum_io/search_result/mascot.py
@@ -7,98 +7,92 @@
 import spectrum_fundamentals.constants as c
 from spectrum_fundamentals.mod_string import internal_without_mods
 
-from .search_results import SearchResults, filter_valid_prosit_sequences
+from .filter import filter_valid_prosit_sequences
 
 logger = logging.getLogger(__name__)
 
 
-class Mascot(SearchResults):
-    """Handle search results from Mascot."""
+def read_mascot(path: Union[str, Path], tmt_labeled: str) -> pd.DataFrame:
+    """
+    Function to read a mascot msf file and perform some basic formatting.
 
-    @staticmethod
-    def read_result(path: Union[str, Path], tmt_labeled: str) -> pd.DataFrame:
-        """
-        Function to read a mascot msf file and perform some basic formatting.
+    :param path: path to the mascot msf file to read (opened as an SQLite database)
+    :param tmt_labeled: tmt label as str (currently not used by this function)
+    :return: pd.DataFrame with the formatted data
+    """
+    logger.info("Reading mascot msf file")
+    connection = sqlite3.connect(path)
+    # cursor = connection.cursor()
+    # cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
+    df = pd.read_sql("SELECT * FROM MSnSpectrumInfo", connection)[
+        ["SpectrumID", "SpectrumFileName", "RetentionTime", "Mass", "Charge"]
+    ]
+    df_id_map = pd.read_sql("SELECT * FROM TargetPsmsMSnSpectrumInfo", connection)[
+        ["MSnSpectrumInfoSpectrumID", "TargetPsmsPeptideID"]
+    ]
+    df = df.merge(df_id_map, left_on="SpectrumID", right_on="MSnSpectrumInfoSpectrumID")
+    df_target_psms = pd.read_sql("SELECT * FROM TargetPsms", connection)[
+        ["PeptideID", "Sequence", "ModifiedSequence", "Modifications", "XCorr"]
+    ]
+    df = df.merge(df_target_psms, left_on="TargetPsmsPeptideID", right_on="PeptideID")
+    df_modif = pd.read_sql("SELECT * FROM TargetPsmsFoundModifications", connection)[
+        ["TargetPsmsPeptideID", "FoundModificationsModificationID", "Position"]
+    ]
+    df_modif_mass = pd.read_sql("SELECT * FROM FoundModifications", connection)[
+        ["ModificationID", "DeltaMonoisotopicMass"]
+    ]
+    df_modif = df_modif.merge(df_modif_mass, left_on="FoundModificationsModificationID", right_on="ModificationID")
+    df = df.merge(df_modif, on="TargetPsmsPeptideID")
 
-        :param path: path to msms.txt to read
-        :param tmt_labeled: tmt label as str
-        :return: pd.DataFrame with the formatted data
-        """
-        logger.info("Reading mascot msf file")
-        connection = sqlite3.connect(path)
-        # cursor = connection.cursor()
-        # cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
-        df = pd.read_sql("SELECT * FROM MSnSpectrumInfo", connection)[
-            ["SpectrumID", "SpectrumFileName", "RetentionTime", "Mass", "Charge"]
-        ]
-        df_id_map = pd.read_sql("SELECT * FROM TargetPsmsMSnSpectrumInfo", connection)[
-            ["MSnSpectrumInfoSpectrumID", "TargetPsmsPeptideID"]
-        ]
-        df = df.merge(df_id_map, left_on="SpectrumID", right_on="MSnSpectrumInfoSpectrumID")
-        df_target_psms = pd.read_sql("SELECT * FROM TargetPsms", connection)[
-            ["PeptideID", "Sequence", "ModifiedSequence", "Modifications", "XCorr"]
-        ]
-        df = df.merge(df_target_psms, left_on="TargetPsmsPeptideID", right_on="PeptideID")
-        df_modif = pd.read_sql("SELECT * FROM TargetPsmsFoundModifications", connection)[
-            ["TargetPsmsPeptideID", "FoundModificationsModificationID", "Position"]
-        ]
-        df_modif_mass = pd.read_sql("SELECT * FROM FoundModifications", connection)[
-            ["ModificationID", "DeltaMonoisotopicMass"]
-        ]
-        df_modif = df_modif.merge(df_modif_mass, left_on="FoundModificationsModificationID", right_on="ModificationID")
-        df = df.merge(df_modif, on="TargetPsmsPeptideID")
+    logger.info("Finished reading mascot msf file.")
 
-        logger.info("Finished reading mascot msf file.")
+    df.rename(
+        columns={
+            "SpectrumID": "SCAN NUMBER",
+            "ModifiedSequence": "MODIFIED SEQUENCE",
+            "Charge": "PRECURSOR CHARGE",
+            "XCorr": "SCORE",
+            "SpectrumFileName": "RAW FILE",
+        },
+        inplace=True,
+    )
 
-        df.rename(
-            columns={
-                "SpectrumID": "SCAN NUMBER",
-                "ModifiedSequence": "MODIFIED SEQUENCE",
-                "Charge": "PRECURSOR CHARGE",
-                "XCorr": "SCORE",
-                "SpectrumFileName": "RAW FILE",
-            },
-            inplace=True,
-        )
+    # Standardize column names
+    df.columns = df.columns.str.upper()
+    df.columns = df.columns.str.replace(" ", "_")
+    # TODO reverse: REVERSE is currently set from whether SEQUENCE contains "Reverse"; revisit
+    df["REVERSE"] = df["SEQUENCE"].str.contains("Reverse")
+    logger.info("Converting MSFragger  peptide sequence to internal format")
+    df["RAW_FILE"] = df["RAW_FILE"].str.replace(".raw", "")
+    df["MODIFICATIONS"] = (df["POSITION"].astype(int) - 1).astype(str) + "$" + df["DELTAMONOISOTOPICMASS"].astype(str)
+    df = df.groupby("SCAN_NUMBER", as_index=False).apply(lambda x: x.sort_values("POSITION"))
+    df = df.groupby(
+        ["SCAN_NUMBER", "PRECURSOR_CHARGE", "SCORE", "RAW_FILE", "SEQUENCE", "REVERSE"],
+        as_index=False,
+    ).agg({"MODIFICATIONS": "|".join})
+    mod_masses_reverse = {round(float(v), 3): k for k, v in c.MOD_MASSES.items()}
 
-        # Standardize column names
-        df.columns = df.columns.str.upper()
-        df.columns = df.columns.str.replace(" ", "_")
-        # TODO reverse
-        df["REVERSE"] = df["SEQUENCE"].str.contains("Reverse")
-        logger.info("Converting MSFragger  peptide sequence to internal format")
-        df["RAW_FILE"] = df["RAW_FILE"].str.replace(".raw", "")
-        df["MODIFICATIONS"] = (
-            (df["POSITION"].astype(int) - 1).astype(str) + "$" + df["DELTAMONOISOTOPICMASS"].astype(str)
-        )
-        df = df.groupby("SCAN_NUMBER", as_index=False).apply(lambda x: x.sort_values("POSITION"))
-        df = df.groupby(
-            ["SCAN_NUMBER", "PRECURSOR_CHARGE", "SCORE", "RAW_FILE", "SEQUENCE", "REVERSE"],
-            as_index=False,
-        ).agg({"MODIFICATIONS": "|".join})
-        mod_masses_reverse = {round(float(v), 3): k for k, v in c.MOD_MASSES.items()}
+    sequences = []
+    for _, row in df.iterrows():
+        modifications = row["MODIFICATIONS"].split("|")
+        if len(modifications) == 0:
+            sequences.append(row["SEQUENCE"])
+        else:
+            sequence = row["SEQUENCE"]
+            skip = 0
+            for mod in modifications:
+                pos, mass = mod.split("$")
+                sequence = (
+                    sequence[: int(pos) + 1 + skip]
+                    + mod_masses_reverse[round(float(mass), 3)]
+                    + sequence[int(pos) + 1 + skip :]
+                )
+                skip = skip + len(mod_masses_reverse[round(float(mass), 3)])
+            sequences.append(sequence)
 
-        sequences = []
-        for _, row in df.iterrows():
-            modifications = row["MODIFICATIONS"].split("|")
-            if len(modifications) == 0:
-                sequences.append(row["SEQUENCE"])
-            else:
-                sequence = row["SEQUENCE"]
-                skip = 0
-                for mod in modifications:
-                    pos, mass = mod.split("$")
-                    sequence = (
-                        sequence[: int(pos) + 1 + skip]
-                        + mod_masses_reverse[round(float(mass), 3)]
-                        + sequence[int(pos) + 1 + skip :]
-                    )
-                    skip = skip + len(mod_masses_reverse[round(float(mass), 3)])
-                sequences.append(sequence)
+    df["MODIFIED_SEQUENCE"] = sequences
 
-        df["MODIFIED_SEQUENCE"] = sequences
+    df["SEQUENCE"] = internal_without_mods(df["MODIFIED_SEQUENCE"])
+    df["PEPTIDE_LENGTH"] = df["SEQUENCE"].apply(lambda x: len(x))
 
-        df["SEQUENCE"] = internal_without_mods(df["MODIFIED_SEQUENCE"])
-        df["PEPTIDE_LENGTH"] = df["SEQUENCE"].apply(lambda x: len(x))
-
-        return filter_valid_prosit_sequences(df)
+    return filter_valid_prosit_sequences(df)