Merge pull request #151 from wilhelm-lab/release/0.6.3

Release/0.6.3
wilhelm-lab · Oct 1, 2024 · d1eeab7 · d1eeab7
2 parents 236425a + 93d93bd
commit d1eeab7
Show file tree

Hide file tree

Showing 17 changed files with 223 additions and 104 deletions.
diff --git a/.cookietemple.yml b/.cookietemple.yml
@@ -15,5 +15,5 @@ full_name: Mario Picciani
 email: [email protected]
 project_name: spectrum_io
 project_short_description: IO related functionalities for oktoberfest.
-version: 0.6.2
+version: 0.6.3
 license: MIT
diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml
@@ -1,5 +1,5 @@
-name-template: "0.6.2 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
-tag-template: 0.6.2 # <<COOKIETEMPLE_FORCE_BUMP>>
+name-template: "0.6.3 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
+tag-template: 0.6.3 # <<COOKIETEMPLE_FORCE_BUMP>>
 exclude-labels:
     - "skip-changelog"
 

diff --git a/cookietemple.cfg b/cookietemple.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.6.2
+current_version = 0.6.3
 
 [bumpversion_files_whitelisted]
 init_file = spectrum_io/__init__.py

diff --git a/docs/conf.py b/docs/conf.py
@@ -52,9 +52,9 @@
 # the built documents.
 #
 # The short X.Y version.
-version = "0.6.2"
+version = "0.6.3"
 # The full version, including alpha/beta/rc tags.
-release = "0.6.2"
+release = "0.6.3"
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "spectrum_io"
-version = "0.6.2"  # <<COOKIETEMPLE_FORCE_BUMP>>
+version = "0.6.3"  # <<COOKIETEMPLE_FORCE_BUMP>>
 description = "IO related functionalities for oktoberfest."
 authors = ["Wilhelmlab at Technical University of Munich"]
 license = "MIT"
@@ -29,7 +29,7 @@ pyarrow = ">=16.0.0"
 pymzml = "^2.5.0"
 pyteomics = "^4.3.3"
 lxml= '>=4.5.2,<6.0.0'
-spectrum-fundamentals = ">=0.7.1,<0.8.0"
+spectrum-fundamentals = ">=0.7.4,<0.8.0"
 alphatims = "^1.0.8"
 sortedcontainers = "^2.4.0"
 

diff --git a/spectrum_io/__init__.py b/spectrum_io/__init__.py
@@ -5,7 +5,7 @@
 __author__ = """The Oktoberfest development team (Wilhelmlab at Technical University of Munich)"""
 __copyright__ = f"Copyright {datetime.now():%Y}, Wilhelmlab at Technical University of Munich"
 __license__ = "MIT"
-__version__ = "0.6.2"
+__version__ = "0.6.3"
 
 import logging
 import logging.handlers

diff --git a/spectrum_io/__main__.py b/spectrum_io/__main__.py
@@ -5,7 +5,7 @@
 
 
 @click.command()
-@click.version_option(version="0.6.2", message=click.style("spectrum_io Version: 0.6.2"))
+@click.version_option(version="0.6.3", message=click.style("spectrum_io Version: 0.6.3"))
 def main() -> None:
     """spectrum_io."""
 

diff --git a/spectrum_io/search_result/mascot.py b/spectrum_io/search_result/mascot.py
@@ -1,14 +1,12 @@
 import logging
-import re
 import sqlite3
-from pathlib import Path
-from typing import Dict, Optional, Tuple, Union
+from typing import Dict, Optional
 
 import pandas as pd
 import spectrum_fundamentals.constants as c
 from spectrum_fundamentals.mod_string import internal_without_mods
 
-from .search_results import SearchResults, filter_valid_prosit_sequences
+from .search_results import SearchResults
 
 logger = logging.getLogger(__name__)
 
@@ -25,12 +23,16 @@ def read_result(
         self,
         tmt_label: str = "",
         custom_mods: Optional[Dict[str, int]] = None,
+        ptm_unimod_id: Optional[int] = 0,
+        ptm_sites: Optional[list[str]] = None,
     ) -> pd.DataFrame:
         """
         Function to read a mascot msf file and perform some basic formatting.
 
         :param tmt_label: tmt label as str
         :param custom_mods: dict with custom variable and static identifier and respective internal equivalent and mass
+        :param ptm_unimod_id: unimod id used for site localization
+        :param ptm_sites: possible sites that the ptm can exist on
         :raises NotImplementedError: always
         :return: pd.DataFrame with the formatted data
         """
@@ -119,4 +121,4 @@ def read_result(
         df["SEQUENCE"] = internal_without_mods(df["MODIFIED_SEQUENCE"])
         df["PEPTIDE_LENGTH"] = df["SEQUENCE"].apply(lambda x: len(x))
 
-        return filter_valid_prosit_sequences(df)
+        return self.filter_valid_prosit_sequences()
diff --git a/spectrum_io/search_result/maxquant.py b/spectrum_io/search_result/maxquant.py
@@ -1,20 +1,22 @@
+from __future__ import annotations
+
 import logging
 from pathlib import Path
-from typing import Dict, Optional, Tuple, Union
+from typing import Dict, Optional, Union
 
 import pandas as pd
 import spectrum_fundamentals.constants as c
-from spectrum_fundamentals.mod_string import internal_without_mods
+from spectrum_fundamentals.mod_string import add_permutations, internal_without_mods
 
-from .search_results import SearchResults, filter_valid_prosit_sequences, parse_mods
+from .search_results import SearchResults, parse_mods
 
 logger = logging.getLogger(__name__)
 
 
 class MaxQuant(SearchResults):
     """Handle search results from MaxQuant."""
 
-    def __init__(self, path: Union[str, Path]):
+    def __init__(self, path: str | Path):
         """
         Init Searchresults object.
 
@@ -34,6 +36,9 @@ def standard_mods(self):
             "C": 4,
             "M(ox)": 35,
             "M(Oxidation (M))": 35,
+            "R(Citrullination)": 7,
+            "Q(Deamidation (NQ))": 7,
+            "N(Deamidation (NQ))": 7,
         }
 
     @staticmethod
@@ -50,10 +55,25 @@ def add_tmt_mod(mass: float, seq: str, unimod_tag: str) -> float:
         mass += num_of_tmt * c.MOD_MASSES[f"{unimod_tag}"]
         return mass
 
+    def filter_valid_prosit_sequences(self):  # reassigns self.results to the filtered frame and returns it
+        """Filter valid Prosit sequences."""
+        logger.info(f"#sequences before filtering for valid prosit sequences: {len(self.results.index)}")
+        # keep only rows with 7 <= PEPTIDE_LENGTH <= 30, the length range supported by Prosit
+        self.results = self.results[(self.results["PEPTIDE_LENGTH"] <= 30) & (self.results["PEPTIDE_LENGTH"] >= 7)]
+        # drop rows whose MODIFIED_SEQUENCE still contains a literal "(" (treated here as an unsupported mod)
+        self.results = self.results[~self.results["MODIFIED_SEQUENCE"].str.contains(r"\(", regex=True)]
+        # keep only rows with PRECURSOR_CHARGE <= 6
+        self.results = self.results[self.results["PRECURSOR_CHARGE"] <= 6]
+        logger.info(f"#sequences after filtering for valid prosit sequences: {len(self.results.index)}")
+
+        return self.results
+
     def read_result(
         self,
         tmt_label: str = "",
-        custom_mods: Optional[Dict[str, int]] = None,
+        custom_mods: dict[str, int] | None = None,
+        ptm_unimod_id: int | None = 0,
+        ptm_sites: list[str] | None = None,
     ) -> pd.DataFrame:
         """
         Function to read a msms txt and perform some basic formatting.
@@ -62,6 +82,8 @@ def read_result(
         :param custom_mods: optional dictionary mapping MaxQuant-specific mod pattern to UNIMOD IDs.
             If None, static carbamidomethylation of cysteine and variable oxidation of methionine
             are mapped automatically. To avoid this, explicitly provide an empty dictionary.
+        :param ptm_unimod_id: unimod id used for site localization
+        :param ptm_sites: possible sites that the ptm can exist on
         :return: pd.DataFrame with the formatted data
         """
         parsed_mods = parse_mods(self.standard_mods | (custom_mods or {}))
@@ -89,14 +111,16 @@ def read_result(
 
         logger.info("Finished reading msms.txt file")
 
-        self.convert_to_internal(mods=parsed_mods)
-        return filter_valid_prosit_sequences(self.results)
+        self.convert_to_internal(mods=parsed_mods, ptm_unimod_id=ptm_unimod_id, ptm_sites=ptm_sites)
+        return self.filter_valid_prosit_sequences()
 
-    def convert_to_internal(self, mods: Dict[str, str]):
+    def convert_to_internal(self, mods: dict[str, str], ptm_unimod_id: int | None, ptm_sites: list[str] | None):
         """
         Convert all columns in the MaxQuant output to the internal format used by Oktoberfest.
 
         :param mods: dictionary mapping MaxQuant-specific mod patterns (keys) to ProForma standard (values)
+        :param ptm_unimod_id: unimod id used for site localization
+        :param ptm_sites: possible sites that the ptm can exist on
         """
         df = self.results
         # Standardize column names
@@ -112,6 +136,21 @@ def convert_to_internal(self, mods: Dict[str, str]):
 
         df["Sequence"] = internal_without_mods(df["Modified sequence"])
         df["PEPTIDE_LENGTH"] = df["Sequence"].str.len()
+        if ptm_unimod_id != 0:
+
+            # PTM permutation generation
+            if ptm_unimod_id == 7:
+                allow_one_less_modification = True
+            else:
+                allow_one_less_modification = False
+
+            df["Modified sequence"] = df["Modified sequence"].apply(
+                add_permutations,
+                unimod_id=ptm_unimod_id,
+                residues=ptm_sites,
+                allow_one_less_modification=allow_one_less_modification,
+            )
+            df = df.explode("Modified sequence", ignore_index=True)
 
         df.rename(
             columns={
@@ -128,6 +167,7 @@ def convert_to_internal(self, mods: Dict[str, str]):
             },
             inplace=True,
         )
+        self.results = df
 
     def generate_internal_timstof_metadata(self):
         """

diff --git a/spectrum_io/search_result/msamanda.py b/spectrum_io/search_result/msamanda.py
@@ -1,10 +1,12 @@
+from __future__ import annotations
+
 import logging
 from typing import Dict, Optional
 
 import pandas as pd
 from spectrum_fundamentals.constants import PARTICLE_MASSES
 
-from .search_results import SearchResults, filter_valid_prosit_sequences, parse_mods
+from .search_results import SearchResults, parse_mods
 
 logger = logging.getLogger(__name__)
 
@@ -18,7 +20,12 @@ def standard_mods(self):
         return {"m": 35, "c": 4}
 
     def read_result(
-        self, tmt_label: str = "", custom_mods: Optional[Dict[str, int]] = None, suffix: str = "output.csv"
+        self,
+        tmt_label: str = "",
+        custom_mods: dict[str, int] | None = None,
+        ptm_unimod_id: int | None = 0,
+        ptm_sites: list[str] | None = None,
+        suffix: str = "output.csv",
     ) -> pd.DataFrame:
         """
         Function to read a msms txt and perform some basic formatting.
@@ -28,6 +35,8 @@ def read_result(
             If None, static carbamidomethylation of cysteine and variable oxidation of methionine
             are mapped automatically. To avoid this, explicitly provide an empty dictionary.
         :param suffix: Optional suffix to determine which fileresult files should be taken from the supplied path
+        :param ptm_unimod_id: unimod id used for site localization
+        :param ptm_sites: possible sites that the ptm can exist on
         :raises FileNotFoundError: If the supplied path is not found
         :raises AssertionError: If the supplied path does not contain any files matching the provided suffix.
         :raises NotImplementedError: If tmt label was supplied.
@@ -72,14 +81,29 @@ def read_result(
 
         self.results = pd.concat(df_list)
 
-        self.convert_to_internal(mods=parsed_mods)
-        return filter_valid_prosit_sequences(self.results)
+        self.convert_to_internal(mods=parsed_mods, ptm_unimod_id=ptm_unimod_id, ptm_sites=ptm_sites)
+        return self.filter_valid_prosit_sequences()
+
+    def filter_valid_prosit_sequences(self):  # reassigns self.results to the filtered frame and returns it
+        """Filter valid Prosit sequences."""
+        logger.info(f"#sequences before filtering for valid prosit sequences: {len(self.results.index)}")
+        # keep only rows with 7 <= PEPTIDE_LENGTH <= 30, the length range supported by Prosit
+        self.results = self.results[(self.results["PEPTIDE_LENGTH"] <= 30) & (self.results["PEPTIDE_LENGTH"] >= 7)]
+        # drop rows whose MODIFIED_SEQUENCE contains any lowercase letter (treated here as an unsupported mod)
+        self.results = self.results[~self.results["MODIFIED_SEQUENCE"].str.contains(r"[a-z]+", regex=True)]
+        # keep only rows with PRECURSOR_CHARGE <= 6
+        self.results = self.results[self.results["PRECURSOR_CHARGE"] <= 6]
+        logger.info(f"#sequences after filtering for valid prosit sequences: {len(self.results.index)}")
+
+        return self.results
 
-    def convert_to_internal(self, mods: Dict[str, str]):
+    def convert_to_internal(self, mods: dict[str, str], ptm_unimod_id: int | None, ptm_sites: list[str] | None):
         """
         Convert all columns in the MSAmanda output to the internal format used by Oktoberfest.
 
         :param mods: dictionary mapping MSAmanda-specific mod patterns (keys) to ProForma standard (values)
+        :param ptm_unimod_id: unimod id used for site localization
+        :param ptm_sites: possible sites that the ptm can exist on
         """
         df = self.results
         df["REVERSE"] = df["Protein Accessions"].str.startswith("REV_")