Merge pull request #156 from wilhelm-lab/feature/xl
Feature/xl
Showing 18 changed files with 6,961 additions and 4,489 deletions.
@@ -15,5 +15,5 @@ full_name: Mario Picciani
 email: [email protected]
 project_name: spectrum_io
 project_short_description: IO related functionalities for oktoberfest.
-version: 0.6.3
+version: 0.6.4
 license: MIT
@@ -0,0 +1,237 @@
from __future__ import annotations

import glob
import logging
import os
import re
from pathlib import Path
from typing import Dict, Optional, Union

import numpy as np
import pandas as pd
import spectrum_fundamentals.constants as c
from spectrum_fundamentals.mod_string import xisearch_to_internal

from .search_results import SearchResults

logger = logging.getLogger(__name__)


class Scout(SearchResults):
    """Handle search results from Scout."""

    def read_result(
        self,
        tmt_label: str = "",
        custom_mods: dict[str, int] | None = None,
        ptm_unimod_id: int | None = 0,
        ptm_sites: list[str] | None = None,
    ) -> pd.DataFrame:
        """
        Read a CSV of CSMs and perform some basic formatting.

        :param tmt_label: TMT label as str
        :param custom_mods: dict with custom variable and static identifiers and their respective internal equivalents and masses
        :param ptm_unimod_id: unimod id used for site localization
        :param ptm_sites: possible sites that the ptm can exist on
        :raises NotImplementedError: if a TMT label is provided
        :return: pd.DataFrame with the formatted data
        """
        if tmt_label != "":
            raise NotImplementedError("TMT is not supported for Scout")

logger.info("Reading search results file...") | ||
columns_to_read = [ | ||
"ScanNumber", | ||
"Charge", | ||
"ExperimentalMZ", | ||
"AlphaPeptide", | ||
"BetaPeptide", | ||
"AlphaPos", | ||
"BetaPos", | ||
"AlphaMappings", | ||
"BetaMappings", | ||
"ClassificationScore", | ||
"Peptide Position 1", | ||
"Peptide Position 2", | ||
"Protein 1", | ||
"Protein 2", | ||
"FileName", | ||
] | ||
|
||
self.results = pd.read_csv(self.path, usecols=columns_to_read) | ||
logger.info("Finished reading search results file.") | ||
# Standardize column names | ||
self.convert_to_internal(mods={}) | ||
return self.filter_valid_prosit_sequences() | ||
# df = Scout._filter_duplicates(df) | ||
|
||
    @staticmethod
    def _filter_duplicates(df: pd.DataFrame) -> pd.DataFrame:
        """
        Keep the CSM with the higher score and remove duplicates (keep only top ranks).

        :param df: df to filter
        :return: filtered df as pd.DataFrame
        """
        repetitive_combinations = df[df.duplicated(subset=["ScanNumber", "RAW_FILE"], keep=False)]
        filtered_df = repetitive_combinations.groupby(["ScanNumber", "RAW_FILE"]).apply(
            lambda x: x.loc[x["ClassificationScore"].idxmax()]
        )
        filtered_df.reset_index(drop=True, inplace=True)
        final_df = pd.concat([df.drop_duplicates(subset=["ScanNumber", "RAW_FILE"], keep=False), filtered_df])
        final_df.reset_index(drop=True, inplace=True)
        df = final_df
        return df

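    # A sketch of an equivalent de-duplication, assuming ClassificationScore is the only
    # ranking criterion (matches the helper above up to tie-breaking and row order):
    #   df.sort_values("ClassificationScore", ascending=False).drop_duplicates(
    #       subset=["ScanNumber", "RAW_FILE"], keep="first"
    #   )
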
    @staticmethod
    def _extract_modifications(peptide_seq: str):
        """Extract carbamidomethyl (cm) and oxidation (ox) modifications from a Scout peptide string."""
        modifications = []
        # Find all matches of modifications
        matches = re.findall(r"([CM])\(\+([\d.]+)\)", peptide_seq)
        for match in matches:
            mod, _ = match
            # Add modification to the list
            if mod == "C":
                modifications.append("cm")
            elif mod == "M":
                modifications.append("ox")
        return ";".join(modifications)

    @staticmethod
    def _extract_modification_positions(peptide_seq: str):
        """Return 1-based positions of modified residues in a Scout peptide string."""
        pattern = r"([A-Z])(\(\+\d+\.\d+\))?"
        matches = re.findall(pattern, peptide_seq)
        split_peptide = []
        for match in matches:
            amino_acid = match[0]
            modification = match[1] if match[1] else ""
            split_peptide.append(amino_acid + modification)
        positions = [str(i + 1) for i, component in enumerate(split_peptide) if "+" in component]
        return ";".join(positions)

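    # Hypothetical example for the two helpers above (the sequence is made up for illustration):
    #   Scout._extract_modifications("MC(+57.021)KM(+15.994)K")           -> "cm;ox"
    #   Scout._extract_modification_positions("MC(+57.021)KM(+15.994)K")  -> "2;4"
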
    @staticmethod
    def _self_or_between_mp(df: pd.DataFrame) -> pd.DataFrame:
        """Assign each CSM to the 'self' or 'between' FDR group based on its protein mappings."""
        df["tmp_id"] = df.index
        df_expl = df.copy()
        df_expl.loc[:, "AlphaMappings"] = df_expl["AlphaMappings"].str.split(";")
        df_expl.loc[:, "BetaMappings"] = df_expl["BetaMappings"].str.split(";")
        df_expl = df_expl.explode("AlphaMappings")
        df_expl = df_expl.explode("BetaMappings")
        df_expl.loc[:, "self"] = False
        df_expl.loc[df_expl["AlphaMappings"] == df_expl["BetaMappings"], "self"] = True
        id_to_self = df_expl.groupby("tmp_id", dropna=False).agg({"self": "max"}).reset_index()
        df = df.drop(["self"], axis=1, errors="ignore").merge(id_to_self, on=["tmp_id"], validate="1:1")
        df.loc[:, "fdr_group"] = df["self"].apply(lambda x: "self" if x else "between")
        return df

    def convert_to_internal(
        self, mods: dict[str, str], ptm_unimod_id: int | None = None, ptm_sites: list[str] | None = None
    ):
        """
        Convert all columns in the search engine-specific output to the internal format used by Oktoberfest.

        :param mods: dictionary mapping search engine-specific mod patterns (keys) to ProForma standard (values)
        :param ptm_unimod_id: unimod id used for site localization
        :param ptm_sites: possible sites that the ptm can exist on
        """
        # Filter out CSMs whose peptides do not contain any "K"
        df = self.results
        df = df[(df["AlphaPeptide"].str.contains("K")) & (df["BetaPeptide"].str.contains("K"))]
        df["decoy_p1"] = df["AlphaMappings"].str.contains("Reverse").astype(bool)
        df["decoy_p2"] = df["BetaMappings"].str.contains("Reverse").astype(bool)
        df["protein_p1"] = df["AlphaMappings"]
        df["protein_p2"] = df["BetaMappings"]
        df["decoy"] = df["decoy_p1"] | df["decoy_p2"]
        df["REVERSE"] = df["decoy"]
        df["RAW_FILE"] = df["FileName"].apply(lambda x: x.split("\\")[-1])
        df["MASS"] = df["ExperimentalMZ"]
        df["PRECURSOR_CHARGE"] = df["Charge"]
        df["CROSSLINKER_TYPE"] = "DSSO"
        df["crosslinker_name"] = "DSSO"
        df["linked_aa_p1"] = "K"
        df["linked_aa_p2"] = "K"
        df["linear"] = "False"
        df["match_score"] = "ClassificationScore"
        df["SCORE"] = df["ClassificationScore"]
        df["SCAN_NUMBER"] = df["ScanNumber"]
        df["SEQUENCE_A"] = df["AlphaPeptide"].apply(lambda x: re.sub(r"\([^)]*\)", "", x))
        df["SEQUENCE_B"] = df["BetaPeptide"].apply(lambda x: re.sub(r"\([^)]*\)", "", x))
        df["base_sequence_p1"] = df["SEQUENCE_A"]
        df["base_sequence_p2"] = df["SEQUENCE_B"]
        df = df[df.apply(lambda row: row["SEQUENCE_A"][row["AlphaPos"]] == "K", axis=1)]
        df = df[df.apply(lambda row: row["SEQUENCE_B"][row["BetaPos"]] == "K", axis=1)]
        df["Modifications_A"] = df["AlphaPeptide"].apply(Scout._extract_modifications)
        df["Modifications_B"] = df["BetaPeptide"].apply(Scout._extract_modifications)
        df["mods_p1"] = df["AlphaPeptide"].apply(Scout._extract_modifications)
        df["mods_p2"] = df["BetaPeptide"].apply(Scout._extract_modifications)
        df["ModificationPositions1"] = df["AlphaPeptide"].apply(Scout._extract_modification_positions)
        df["ModificationPositions2"] = df["BetaPeptide"].apply(Scout._extract_modification_positions)
        df["CROSSLINKER_POSITION_A"] = df["AlphaPos"] + 1
        df["CROSSLINKER_POSITION_B"] = df["BetaPos"] + 1
        df["mod_pos_p1"] = df["AlphaPos"] + 1
        df["mod_pos_p2"] = df["BetaPos"] + 1
        df["link_pos_p1"] = df["AlphaPos"] + 1
        df["link_pos_p2"] = df["BetaPos"] + 1
        df["PEPTIDE_LENGTH_A"] = df["SEQUENCE_A"].apply(len)
        df["PEPTIDE_LENGTH_B"] = df["SEQUENCE_B"].apply(len)
        df["aa_len_p1"] = df["SEQUENCE_A"].apply(len)
        df["aa_len_p2"] = df["SEQUENCE_B"].apply(len)
        df = Scout._self_or_between_mp(df)
        df["fdr_group"] = np.where(
            df["AlphaMappings"].str.replace("Reverse_", "") == df["BetaMappings"].str.replace("Reverse_", ""),
            "self",
            "between",
        )
        df.drop(columns=["self"], inplace=True)
        df.drop(columns=["tmp_id"], inplace=True)
        logger.info("Converting Scout peptide sequence to internal format...")
        df["RAW_FILE"] = df["RAW_FILE"].str.replace(".raw", "")
        df["MODIFIED_SEQUENCE_A"] = df.apply(
            lambda row: xisearch_to_internal(
                xl=row["CROSSLINKER_TYPE"],
                seq=row["SEQUENCE_A"],
                mod=row["Modifications_A"],
                crosslinker_position=row["CROSSLINKER_POSITION_A"],
                mod_positions=row["ModificationPositions1"],
            ),
            axis=1,
            result_type="expand",
        )
        df["MODIFIED_SEQUENCE_B"] = df.apply(
            lambda row: xisearch_to_internal(
                xl=row["CROSSLINKER_TYPE"],
                seq=row["SEQUENCE_B"],
                mod=row["Modifications_B"],
                crosslinker_position=row["CROSSLINKER_POSITION_B"],
                mod_positions=row["ModificationPositions2"],
            ),
            axis=1,
            result_type="expand",
        )
        new_column_names = {
            "FileName": "run_name",
            "ScanNumber": "scan_number",
            "ExperimentalMZ": "precursor_mass",
            "Charge": "precursor_charge",
            "scan_number": "ScanNumber",
        }
        self.results = df.rename(columns=new_column_names)

    def filter_valid_prosit_sequences(self) -> pd.DataFrame:
        """
        Filter valid Prosit sequences.

        :return: df after filtration
        """
        logger.info(f"#sequences before filtering for valid prosit sequences: {len(self.results)}")
        self.results = self.results[(self.results["PEPTIDE_LENGTH_A"] <= 30)]
        self.results = self.results[self.results["PEPTIDE_LENGTH_A"] >= 6]
        self.results = self.results[(self.results["PEPTIDE_LENGTH_B"] <= 30)]
        self.results = self.results[self.results["PEPTIDE_LENGTH_B"] >= 6]
        self.results = self.results[(~self.results["SEQUENCE_A"].str.contains(r"B|\*|\.|U|O|X|Z|\(|\)"))]
        self.results = self.results[(~self.results["SEQUENCE_B"].str.contains(r"B|\*|\.|U|O|X|Z|\(|\)"))]
        self.results = self.results[self.results["PRECURSOR_CHARGE"] <= 6]
        logger.info(f"#sequences after filtering for valid prosit sequences: {len(self.results)}")

        return self.results
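A minimal usage sketch for the new reader. The module path, input file name, and constructor call are assumptions made for illustration: the relative import suggests the class sits next to the other spectrum_io search-result readers, and since the reader accesses the CSM file via self.path, the path is presumably passed to the SearchResults constructor.

from spectrum_io.search_result.scout import Scout

# Hypothetical path to a Scout CSM export (CSV).
scout = Scout("path/to/scout_csms.csv")
csms = scout.read_result()  # read, convert to internal format, filter valid Prosit sequences
print(csms[["RAW_FILE", "SCAN_NUMBER", "MODIFIED_SEQUENCE_A", "MODIFIED_SEQUENCE_B", "SCORE", "fdr_group"]].head())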