diff --git a/.cookietemple.yml b/.cookietemple.yml index f404bae..040b846 100644 --- a/.cookietemple.yml +++ b/.cookietemple.yml @@ -15,5 +15,5 @@ full_name: Mario Picciani email: mario.picciani@tum.de project_name: spectrum_io project_short_description: IO related functionalities for oktoberfest. -version: 0.6.2 +version: 0.6.3 license: MIT diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml index 4a68881..b612d77 100644 --- a/.github/release-drafter.yml +++ b/.github/release-drafter.yml @@ -1,5 +1,5 @@ -name-template: "0.6.2 🌈" # <> -tag-template: 0.6.2 # <> +name-template: "0.6.3 🌈" # <> +tag-template: 0.6.3 # <> exclude-labels: - "skip-changelog" diff --git a/cookietemple.cfg b/cookietemple.cfg index 3f096ee..6c223ea 100644 --- a/cookietemple.cfg +++ b/cookietemple.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.6.2 +current_version = 0.6.3 [bumpversion_files_whitelisted] init_file = spectrum_io/__init__.py diff --git a/docs/conf.py b/docs/conf.py index 34b4479..5e84d5b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -52,9 +52,9 @@ # the built documents. # # The short X.Y version. -version = "0.6.2" +version = "0.6.3" # The full version, including alpha/beta/rc tags. -release = "0.6.2" +release = "0.6.3" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/pyproject.toml b/pyproject.toml index e1e786f..ad48670 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "spectrum_io" -version = "0.6.2" # <> +version = "0.6.3" # <> description = "IO related functionalities for oktoberfest." authors = ["Wilhelmlab at Technical University of Munich"] license = "MIT" @@ -29,7 +29,7 @@ pyarrow = ">=16.0.0" pymzml = "^2.5.0" pyteomics = "^4.3.3" lxml= '>=4.5.2,<6.0.0' -spectrum-fundamentals = ">=0.7.1,<0.8.0" +spectrum-fundamentals = ">=0.7.4,<0.8.0" alphatims = "^1.0.8" sortedcontainers = "^2.4.0" diff --git a/spectrum_io/__init__.py b/spectrum_io/__init__.py index 747edd5..026753a 100644 --- a/spectrum_io/__init__.py +++ b/spectrum_io/__init__.py @@ -5,7 +5,7 @@ __author__ = """The Oktoberfest development team (Wilhelmlab at Technical University of Munich)""" __copyright__ = f"Copyright {datetime.now():%Y}, Wilhelmlab at Technical University of Munich" __license__ = "MIT" -__version__ = "0.6.2" +__version__ = "0.6.3" import logging import logging.handlers diff --git a/spectrum_io/__main__.py b/spectrum_io/__main__.py index e762e37..2c8b279 100644 --- a/spectrum_io/__main__.py +++ b/spectrum_io/__main__.py @@ -5,7 +5,7 @@ @click.command() -@click.version_option(version="0.6.2", message=click.style("spectrum_io Version: 0.6.2")) +@click.version_option(version="0.6.3", message=click.style("spectrum_io Version: 0.6.3")) def main() -> None: """spectrum_io.""" diff --git a/spectrum_io/search_result/mascot.py b/spectrum_io/search_result/mascot.py index cea93f4..eb0c86c 100644 --- a/spectrum_io/search_result/mascot.py +++ b/spectrum_io/search_result/mascot.py @@ -1,14 +1,12 @@ import logging -import re import sqlite3 -from pathlib import Path -from typing import Dict, Optional, Tuple, Union +from typing import Dict, Optional import pandas as pd import spectrum_fundamentals.constants as c from spectrum_fundamentals.mod_string import internal_without_mods -from .search_results import SearchResults, filter_valid_prosit_sequences +from .search_results import SearchResults logger = logging.getLogger(__name__) @@ -25,12 +23,16 @@ def read_result( self, tmt_label: str = "", custom_mods: Optional[Dict[str, int]] = None, + ptm_unimod_id: Optional[int] = 0, + ptm_sites: Optional[list[str]] = None, ) -> pd.DataFrame: """ Function to read a mascot msf file and perform some basic formatting. :param tmt_label: tmt label as str :param custom_mods: dict with custom variable and static identifier and respecitve internal equivalent and mass + :param ptm_unimod_id: unimod id used for site localization + :param ptm_sites: possible sites that the ptm can exist on :raises NotImplementedError: always :return: pd.DataFrame with the formatted data """ @@ -119,4 +121,4 @@ def read_result( df["SEQUENCE"] = internal_without_mods(df["MODIFIED_SEQUENCE"]) df["PEPTIDE_LENGTH"] = df["SEQUENCE"].apply(lambda x: len(x)) - return filter_valid_prosit_sequences(df) + return self.filter_valid_prosit_sequences() diff --git a/spectrum_io/search_result/maxquant.py b/spectrum_io/search_result/maxquant.py index 10ed7e3..51af2b3 100644 --- a/spectrum_io/search_result/maxquant.py +++ b/spectrum_io/search_result/maxquant.py @@ -1,12 +1,14 @@ +from __future__ import annotations + import logging from pathlib import Path -from typing import Dict, Optional, Tuple, Union +from typing import Dict, Optional, Union import pandas as pd import spectrum_fundamentals.constants as c -from spectrum_fundamentals.mod_string import internal_without_mods +from spectrum_fundamentals.mod_string import add_permutations, internal_without_mods -from .search_results import SearchResults, filter_valid_prosit_sequences, parse_mods +from .search_results import SearchResults, parse_mods logger = logging.getLogger(__name__) @@ -14,7 +16,7 @@ class MaxQuant(SearchResults): """Handle search results from MaxQuant.""" - def __init__(self, path: Union[str, Path]): + def __init__(self, path: str | Path): """ Init Searchresults object. @@ -34,6 +36,9 @@ def standard_mods(self): "C": 4, "M(ox)": 35, "M(Oxidation (M))": 35, + "R(Citrullination)": 7, + "Q(Deamidation (NQ))": 7, + "N(Deamidation (NQ))": 7, } @staticmethod @@ -50,10 +55,25 @@ def add_tmt_mod(mass: float, seq: str, unimod_tag: str) -> float: mass += num_of_tmt * c.MOD_MASSES[f"{unimod_tag}"] return mass + def filter_valid_prosit_sequences(self): + """Filter valid Prosit sequences.""" + logger.info(f"#sequences before filtering for valid prosit sequences: {len(self.results.index)}") + # retain only peptides that fall within [7, 30] length supported by Prosit + self.results = self.results[(self.results["PEPTIDE_LENGTH"] <= 30) & (self.results["PEPTIDE_LENGTH"] >= 7)] + # remove unsupported mods to exclude + self.results = self.results[~self.results["MODIFIED_SEQUENCE"].str.contains(r"\(", regex=True)] + # remove precursor charges greater than 6 + self.results = self.results[self.results["PRECURSOR_CHARGE"] <= 6] + logger.info(f"#sequences after filtering for valid prosit sequences: {len(self.results.index)}") + + return self.results + def read_result( self, tmt_label: str = "", - custom_mods: Optional[Dict[str, int]] = None, + custom_mods: dict[str, int] | None = None, + ptm_unimod_id: int | None = 0, + ptm_sites: list[str] | None = None, ) -> pd.DataFrame: """ Function to read a msms txt and perform some basic formatting. @@ -62,6 +82,8 @@ def read_result( :param custom_mods: optional dictionary mapping MaxQuant-specific mod pattern to UNIMOD IDs. If None, static carbamidomethylation of cytein and variable oxidation of methionine are mapped automatically. To avoid this, explicitely provide an empty dictionary. + :param ptm_unimod_id: unimod id used for site localization + :param ptm_sites: possible sites that the ptm can exist on :return: pd.DataFrame with the formatted data """ parsed_mods = parse_mods(self.standard_mods | (custom_mods or {})) @@ -89,14 +111,16 @@ def read_result( logger.info("Finished reading msms.txt file") - self.convert_to_internal(mods=parsed_mods) - return filter_valid_prosit_sequences(self.results) + self.convert_to_internal(mods=parsed_mods, ptm_unimod_id=ptm_unimod_id, ptm_sites=ptm_sites) + return self.filter_valid_prosit_sequences() - def convert_to_internal(self, mods: Dict[str, str]): + def convert_to_internal(self, mods: dict[str, str], ptm_unimod_id: int | None, ptm_sites: list[str] | None): """ Convert all columns in the MaxQuant output to the internal format used by Oktoberfest. :param mods: dictionary mapping MaxQuant-specific mod patterns (keys) to ProForma standard (values) + :param ptm_unimod_id: unimod id used for site localization + :param ptm_sites: possible sites that the ptm can exist on """ df = self.results # Standardize column names @@ -112,6 +136,21 @@ def convert_to_internal(self, mods: Dict[str, str]): df["Sequence"] = internal_without_mods(df["Modified sequence"]) df["PEPTIDE_LENGTH"] = df["Sequence"].str.len() + if ptm_unimod_id != 0: + + # PTM permutation generation + if ptm_unimod_id == 7: + allow_one_less_modification = True + else: + allow_one_less_modification = False + + df["Modified sequence"] = df["Modified sequence"].apply( + add_permutations, + unimod_id=ptm_unimod_id, + residues=ptm_sites, + allow_one_less_modification=allow_one_less_modification, + ) + df = df.explode("Modified sequence", ignore_index=True) df.rename( columns={ @@ -128,6 +167,7 @@ def convert_to_internal(self, mods: Dict[str, str]): }, inplace=True, ) + self.results = df def generate_internal_timstof_metadata(self): """ diff --git a/spectrum_io/search_result/msamanda.py b/spectrum_io/search_result/msamanda.py index 0d667f5..8258858 100644 --- a/spectrum_io/search_result/msamanda.py +++ b/spectrum_io/search_result/msamanda.py @@ -1,10 +1,12 @@ +from __future__ import annotations + import logging from typing import Dict, Optional import pandas as pd from spectrum_fundamentals.constants import PARTICLE_MASSES -from .search_results import SearchResults, filter_valid_prosit_sequences, parse_mods +from .search_results import SearchResults, parse_mods logger = logging.getLogger(__name__) @@ -18,7 +20,12 @@ def standard_mods(self): return {"m": 35, "c": 4} def read_result( - self, tmt_label: str = "", custom_mods: Optional[Dict[str, int]] = None, suffix: str = "output.csv" + self, + tmt_label: str = "", + custom_mods: dict[str, int] | None = None, + ptm_unimod_id: int | None = 0, + ptm_sites: list[str] | None = None, + suffix: str = "output.csv", ) -> pd.DataFrame: """ Function to read a msms txt and perform some basic formatting. @@ -28,6 +35,8 @@ def read_result( If None, static carbamidomethylation of cytein and variable oxidation of methionine are mapped automatically. To avoid this, explicitely provide an empty dictionary. :param suffix: Optional suffix to determine which fileresult files should be taken from the supplied path + :param ptm_unimod_id: unimod id used for site localization + :param ptm_sites: possible sites that the ptm can exist on :raises FileNotFoundError: If the supplied path is not found :raises AssertionError: If the supplied path does not contain any files matching the provided suffix. :raises NotImplementedError: If tmt label was supplied. @@ -72,14 +81,29 @@ def read_result( self.results = pd.concat(df_list) - self.convert_to_internal(mods=parsed_mods) - return filter_valid_prosit_sequences(self.results) + self.convert_to_internal(mods=parsed_mods, ptm_unimod_id=ptm_unimod_id, ptm_sites=ptm_sites) + return self.filter_valid_prosit_sequences() + + def filter_valid_prosit_sequences(self): + """Filter valid Prosit sequences.""" + logger.info(f"#sequences before filtering for valid prosit sequences: {len(self.results.index)}") + # retain only peptides that fall within [7, 30] length supported by Prosit + self.results = self.results[(self.results["PEPTIDE_LENGTH"] <= 30) & (self.results["PEPTIDE_LENGTH"] >= 7)] + # remove unsupported mods to exclude + self.results = self.results[~self.results["MODIFIED_SEQUENCE"].str.contains(r"[a-z]+", regex=True)] + # remove precursor charges greater than 6 + self.results = self.results[self.results["PRECURSOR_CHARGE"] <= 6] + logger.info(f"#sequences after filtering for valid prosit sequences: {len(self.results.index)}") + + return self.results - def convert_to_internal(self, mods: Dict[str, str]): + def convert_to_internal(self, mods: dict[str, str], ptm_unimod_id: int | None, ptm_sites: list[str] | None): """ Convert all columns in the Sage output to the internal format used by Oktoberfest. :param mods: dictionary mapping Sage-specific mod patterns (keys) to ProForma standard (values) + :param ptm_unimod_id: unimod id used for site localization + :param ptm_sites: possible sites that the ptm can exist on """ df = self.results df["REVERSE"] = df["Protein Accessions"].str.startswith("REV_") diff --git a/spectrum_io/search_result/msfragger.py b/spectrum_io/search_result/msfragger.py index 47ce5fb..6debc66 100644 --- a/spectrum_io/search_result/msfragger.py +++ b/spectrum_io/search_result/msfragger.py @@ -1,15 +1,16 @@ +from __future__ import annotations + import logging -from pathlib import Path -from typing import Dict, Optional, Tuple, Union +from typing import Dict, Optional import pandas as pd import spectrum_fundamentals.constants as c from pyteomics import pepxml from spectrum_fundamentals.constants import MSFRAGGER_VAR_MODS -from spectrum_fundamentals.mod_string import internal_without_mods +from spectrum_fundamentals.mod_string import add_permutations, internal_without_mods from tqdm import tqdm -from .search_results import SearchResults, filter_valid_prosit_sequences, parse_mods +from .search_results import SearchResults, parse_mods logger = logging.getLogger(__name__) @@ -20,12 +21,27 @@ class MSFragger(SearchResults): @property def standard_mods(self): """Standard modifications that are always applied if not otherwise specified.""" - return {"C": 4, "M[147]": 35} + return {"C[160]": 4, "M[147]": 35, "R[157]": 7, "Q[129]": 7, "N[115]": 7} + + def filter_valid_prosit_sequences(self): + """Filter valid Prosit sequences.""" + logger.info(f"#sequences before filtering for valid prosit sequences: {len(self.results.index)}") + # retain only peptides that fall within [7, 30] length supported by Prosit + self.results = self.results[(self.results["PEPTIDE_LENGTH"] <= 30) & (self.results["PEPTIDE_LENGTH"] >= 7)] + # remove unsupported mods to exclude + self.results = self.results[~self.results["MODIFIED_SEQUENCE"].str.contains(r"\[\d+\]", regex=True)] + # remove precursor charges greater than 6 + self.results = self.results[self.results["PRECURSOR_CHARGE"] <= 6] + logger.info(f"#sequences after filtering for valid prosit sequences: {len(self.results.index)}") + + return self.results def read_result( self, tmt_label: str = "", - custom_mods: Optional[Dict[str, int]] = None, + custom_mods: dict[str, int] | None = None, + ptm_unimod_id: int | None = 0, + ptm_sites: list[str] | None = None, ) -> pd.DataFrame: """ Function to read a msms txt and perform some basic formatting. @@ -34,6 +50,8 @@ def read_result( :param custom_mods: optional dictionary mapping MSFragger-specific mod pattern to UNIMOD IDs. If None, static carbamidomethylation of cytein and variable oxidation of methionine are mapped automatically. To avoid this, explicitely provide an empty dictionary. + :param ptm_unimod_id: unimod id used for site localization + :param ptm_sites: possible sites that the ptm can exist on :raises FileNotFoundError: in case the given path is neither a file, nor a directory. :return: pd.DataFrame with the formatted data """ @@ -55,24 +73,59 @@ def read_result( self.results = pd.concat(ms_frag_results) - self.convert_to_internal(mods=parsed_mods) - return filter_valid_prosit_sequences(self.results) + self.convert_to_internal(mods=parsed_mods, ptm_unimod_id=ptm_unimod_id, ptm_sites=ptm_sites) + return self.filter_valid_prosit_sequences() + + @staticmethod + def check_decoys(protein_names: str): + """ + Check if all protein names in a given string correspond to decoy proteins. - def convert_to_internal(self, mods: Dict[str, str]): + :param protein_names: A string containing one or more protein names separated by semicolons (';'). + Each protein name is checked for the presence of the substring 'rev'. + :return: `True` if all proteins are decoy proteins (i.e., if all protein names contain 'rev'), + otherwise `False`. + """ + all_proteins = protein_names.split(";") + reverse = True + for protein in all_proteins: + if "rev" not in protein: + reverse = False + break + return reverse + + def convert_to_internal(self, mods: dict[str, str], ptm_unimod_id: int | None, ptm_sites: list[str] | None): """ Convert all columns in the MSFragger output to the internal format used by Oktoberfest. :param mods: dictionary mapping MSFragger-specific mod patterns (keys) to ProForma standard (values) + :param ptm_unimod_id: unimod id used for site localization + :param ptm_sites: possible sites that the ptm can exist on """ df = self.results df["protein"] = df["protein"].fillna("UNKNOWN").apply(lambda x: ";".join(x)) - df["REVERSE"] = df["protein"].apply(lambda x: "rev" in str(x)) + df["REVERSE"] = df["protein"].apply(lambda x: MSFragger.check_decoys(x)) + df["spectrum"] = df["spectrum"].str.split(pat=".", n=1).str[0] df["PEPTIDE_LENGTH"] = df["peptide"].str.len() - df.replace({"modified_peptide": mods}, regex=True, inplace=True) df["peptide"] = internal_without_mods(df["modified_peptide"]) + if ptm_unimod_id != 0: + + # PTM permutation generation + if ptm_unimod_id == 7: + allow_one_less_modification = True + else: + allow_one_less_modification = False + + df["modified_peptide"] = df["modified_peptide"].apply( + add_permutations, + unimod_id=ptm_unimod_id, + residues=ptm_sites, + allow_one_less_modification=allow_one_less_modification, + ) + df = df.explode("modified_peptide", ignore_index=True) df.rename( columns={ @@ -89,7 +142,7 @@ def convert_to_internal(self, mods: Dict[str, str]): }, inplace=True, ) - + self.results = df """ return df[ [ diff --git a/spectrum_io/search_result/sage.py b/spectrum_io/search_result/sage.py index beeb1df..c02ae89 100644 --- a/spectrum_io/search_result/sage.py +++ b/spectrum_io/search_result/sage.py @@ -1,12 +1,13 @@ +from __future__ import annotations + import logging -from pathlib import Path -from typing import Dict, Optional, Tuple, Union +from typing import Dict, Optional import pandas as pd import spectrum_fundamentals.constants as c from spectrum_fundamentals.mod_string import internal_without_mods -from .search_results import SearchResults, filter_valid_prosit_sequences, parse_mods +from .search_results import SearchResults, parse_mods logger = logging.getLogger(__name__) @@ -22,7 +23,9 @@ def standard_mods(self): def read_result( self, tmt_label: str = "", - custom_mods: Optional[Dict[str, int]] = None, + custom_mods: dict[str, int] | None = None, + ptm_unimod_id: int | None = 0, + ptm_sites: list[str] | None = None, ) -> pd.DataFrame: """ Function to read a msms tsv and perform some basic formatting. @@ -31,6 +34,8 @@ def read_result( :param custom_mods: optional dictionary mapping Sage-specific mod pattern to UNIMOD IDs. If None, static carbamidomethylation of cytein and variable oxidation of methionine are mapped automatically. To avoid this, explicitely provide an empty dictionary. + :param ptm_unimod_id: unimod id used for site localization + :param ptm_sites: possible sites that the ptm can exist on :return: pd.DataFrame with the formatted data """ parsed_mods = parse_mods(self.standard_mods | (custom_mods or {})) @@ -47,14 +52,29 @@ def read_result( ) logger.info(f"Finished reading {self.path}") - self.convert_to_internal(mods=parsed_mods) - return filter_valid_prosit_sequences(self.results) + self.convert_to_internal(mods=parsed_mods, ptm_unimod_id=ptm_unimod_id, ptm_sites=ptm_sites) + return self.filter_valid_prosit_sequences() + + def filter_valid_prosit_sequences(self): + """Filter valid Prosit sequences.""" + logger.info(f"#sequences before filtering for valid prosit sequences: {len(self.results.index)}") + # retain only peptides that fall within [7, 30] length supported by Prosit + self.results = self.results[(self.results["PEPTIDE_LENGTH"] <= 30) & (self.results["PEPTIDE_LENGTH"] >= 7)] + # remove unsupported mods to exclude + self.results = self.results[~self.results["MODIFIED_SEQUENCE"].str.contains(r"\[\d+\]", regex=True)] + # remove precursor charges greater than 6 + self.results = self.results[self.results["PRECURSOR_CHARGE"] <= 6] + logger.info(f"#sequences after filtering for valid prosit sequences: {len(self.results.index)}") + + return self.results - def convert_to_internal(self, mods: Dict[str, str]): + def convert_to_internal(self, mods: dict[str, str], ptm_unimod_id: int | None, ptm_sites: list[str] | None): """ Convert all columns in the Sage output to the internal format used by Oktoberfest. :param mods: dictionary mapping Sage-specific mod patterns (keys) to ProForma standard (values) + :param ptm_unimod_id: unimod id used for site localization + :param ptm_sites: possible sites that the ptm can exist on """ df = self.results diff --git a/spectrum_io/search_result/search_results.py b/spectrum_io/search_result/search_results.py index 1a62f92..e7d65e1 100644 --- a/spectrum_io/search_result/search_results.py +++ b/spectrum_io/search_result/search_results.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import logging import re from abc import abstractmethod @@ -25,27 +27,7 @@ ] -def filter_valid_prosit_sequences(df: pd.DataFrame) -> pd.DataFrame: - """ - Filter valid Prosit sequences. - - :param df: df to filter - :return: df after filtering out unsupported peptides - """ - logger.info(f"#sequences before filtering for valid prosit sequences: {len(df.index)}") - # retain only peptides that fall within [7, 30] length supported by Prosit - df = df[(df["PEPTIDE_LENGTH"] <= 30) & (df["PEPTIDE_LENGTH"] >= 7)] - # remove unsupported mods to exclude - supported_pattern = re.compile(r"^(?:\[UNIMOD:\d+\]\-)?(?:[ACDEFGHIKLMNPQRSTVWY]+(?:\[UNIMOD:\d+\])?)*$") - df = df[df["MODIFIED_SEQUENCE"].str.match(supported_pattern)] - # remove precursor charges greater than 6 - df = df[df["PRECURSOR_CHARGE"] <= 6] - logger.info(f"#sequences after filtering for valid prosit sequences: {len(df.index)}") - - return df - - -def parse_mods(mods: Dict[str, int]) -> Dict[str, str]: +def parse_mods(mods: dict[str, int]) -> dict[str, str]: """ Parse provided mapping of custom modification pattern to ProForma standard. @@ -97,7 +79,7 @@ class SearchResults: orig_res: pd.DataFrame fake_msms: pd.DataFrame - def __init__(self, path: Union[str, Path]): + def __init__(self, path: str | Path): """ Init Searchresults object. @@ -107,24 +89,35 @@ def __init__(self, path: Union[str, Path]): path = Path(path) self.path = path + @abstractmethod + def filter_valid_prosit_sequences(self): + """Filter valid Prosit sequences.""" + raise NotImplementedError + @abstractmethod def read_result( self, tmt_label: str = "", - custom_mods: Optional[Dict[str, int]] = None, + custom_mods: dict[str, int] | None = None, + ptm_unimod_id: int | None = 0, + ptm_sites: list[str] | None = None, ): """Read result. :param tmt_label: tmt label as str :param custom_mods: dict with custom variable and static identifier and respecitve internal equivalent and mass + :param ptm_unimod_id: unimod id used for site localization + :param ptm_sites: possible sites that the ptm can exist on """ raise NotImplementedError def generate_internal( self, tmt_label: str = "", - out_path: Optional[Union[str, Path]] = None, - custom_mods: Optional[Dict[str, int]] = None, + out_path: str | Path | None = None, + custom_mods: dict[str, int] | None = None, + ptm_unimod_id: int | None = 0, + ptm_sites: list[str] | None = None, ) -> pd.DataFrame: """ Generate df and save to out_path if provided. @@ -132,11 +125,15 @@ def generate_internal( :param out_path: path to output :param tmt_label: tmt label as str :param custom_mods: dict with static and variable custom modifications, their internal identifier and mass + :param ptm_unimod_id: unimod id used for site localization + :param ptm_sites: possible sites that the ptm can exist on :return: path to output file """ if out_path is None: # convert and return - filtered_df = self.read_result(tmt_label, custom_mods=custom_mods) + filtered_df = self.read_result( + tmt_label, custom_mods=custom_mods, ptm_unimod_id=ptm_unimod_id, ptm_sites=ptm_sites + ) return filtered_df[COLUMNS] if isinstance(out_path, str): out_path = Path(out_path) @@ -148,7 +145,9 @@ def generate_internal( return csv.read_file(out_path) # convert, save and return - df = self.read_result(tmt_label, custom_mods=custom_mods)[COLUMNS] + df = self.read_result(tmt_label, custom_mods=custom_mods, ptm_unimod_id=ptm_unimod_id, ptm_sites=ptm_sites)[ + COLUMNS + ] csv.write_file(df, out_path) return df @@ -161,10 +160,12 @@ def read_internal(self) -> pd.DataFrame: return csv.read_file(self.path) @abstractmethod - def convert_to_internal(self, mods: Dict[str, str]): + def convert_to_internal(self, mods: dict[str, str], ptm_unimod_id: int | None, ptm_sites: list[str] | None): """ Convert all columns in the search engine-specific output to the internal format used by Oktoberfest. :param mods: dictionary mapping search engine-specific mod patterns (keys) to ProForma standard (values) + :param ptm_unimod_id: unimod id used for site localization + :param ptm_sites: possible sites that the ptm can exist on """ raise NotImplementedError diff --git a/spectrum_io/search_result/xisearch.py b/spectrum_io/search_result/xisearch.py index 688fc3e..f8ff130 100644 --- a/spectrum_io/search_result/xisearch.py +++ b/spectrum_io/search_result/xisearch.py @@ -1,12 +1,7 @@ -import glob import logging -import os -import re -from pathlib import Path -from typing import Dict, Optional, Tuple, Union +from typing import Dict, Optional import pandas as pd -import spectrum_fundamentals.constants as c from spectrum_fundamentals.mod_string import xisearch_to_internal from .search_results import SearchResults @@ -21,12 +16,16 @@ def read_result( self, tmt_label: str = "", custom_mods: Optional[Dict[str, int]] = None, + ptm_unimod_id: Optional[int] = 0, + ptm_sites: Optional[list[str]] = None, ) -> pd.DataFrame: """ Function to read a csv of CSMs and perform some basic formatting. :param tmt_label: tmt label as str :param custom_mods: dict with custom variable and static identifier and respecitve internal equivalent and mass + :param ptm_unimod_id: unimod id used for site localization + :param ptm_sites: possible sites that the ptm can exist on :raises NotImplementedError: if a tmt label is provided :return: pd.DataFrame with the formatted data """ @@ -65,7 +64,7 @@ def read_result( # Standardize column names df = Xisearch.filter_xisearch_result(df) df = Xisearch.update_columns_for_prosit(df) - df = Xisearch.filter_valid_prosit_sequences(df) + df = Xisearch.filter_valid_prosit_sequences_xl(df) return df @staticmethod @@ -141,7 +140,7 @@ def update_columns_for_prosit(df: pd.DataFrame) -> pd.DataFrame: return df @staticmethod - def filter_valid_prosit_sequences(df: pd.DataFrame) -> pd.DataFrame: + def filter_valid_prosit_sequences_xl(df: pd.DataFrame) -> pd.DataFrame: """ Filter valid Prosit sequences. diff --git a/tests/unit_tests/data/psm_tmt.pepXML b/tests/unit_tests/data/psm_tmt.pepXML index b31bdf0..7faa555 100644 --- a/tests/unit_tests/data/psm_tmt.pepXML +++ b/tests/unit_tests/data/psm_tmt.pepXML @@ -136,24 +136,6 @@ - - - - - - - - - - - - - - - - - - diff --git a/tests/unit_tests/data/psm_tmt_internal.csv b/tests/unit_tests/data/psm_tmt_internal.csv index fb5d141..7edac20 100644 --- a/tests/unit_tests/data/psm_tmt_internal.csv +++ b/tests/unit_tests/data/psm_tmt_internal.csv @@ -1,5 +1,4 @@ ,RAW_FILE,SCAN_NUMBER,MODIFIED_SEQUENCE,PRECURSOR_CHARGE,SCAN_EVENT_NUMBER,MASS,SCORE,REVERSE,SEQUENCE,PEPTIDE_LENGTH,PROTEINS 0,Ecl1_0277_R0096-01_S004436_D_H01_TMT18_01,2459,[UNIMOD:2016]-GQAVLAFQEQVGTGR,5,34,1863.023,8.221,True,GQAVLAFQEQVGTGR,15,rev_tr|E9Q8J5|E9Q8J5_MOUSE 1,Ecl1_0277_R0096-01_S004436_D_H01_TMT18_01,2486,[UNIMOD:2016]-TEVPM[UNIMOD:35]GLSLRTTSAR,5,42,1937.0531,7.083,False,TEVPMGLSLRTTSAR,15,tr|A0A0N4SW17|A0A0N4SW17_MOUSE -2,Ecl1_0277_R0096-01_S004436_D_H01_TMT18_01,2980,[UNIMOD:2016]-YSGNC[UNIMOD:4]DRQSVER,3,193,1773.8123,3.932,False,YSGNCDRQSVER,12,sp|Q9D413-2|SH2D6_MOUSE;sp|Q9D413|SH2D6_MOUSE;tr|A0A3Q4EBW9|A0A3Q4EBW9_MOUSE;tr|A0A3Q4ECA8|A0A3Q4ECA8_MOUSE;tr|A0A3Q4EGG3|A0A3Q4EGG3_MOUSE;tr|E0CYY5|E0CYY5_MOUSE;tr|E9QJU1|E9QJU1_MOUSE -3,Ecl1_0277_R0096-01_S004436_D_H01_TMT18_01,3945,[UNIMOD:2016]-ESTK[UNIMOD:2016]SAAER,3,647,1586.8887,10.839,True,ESTKSAAER,9,rev_sp|Q3TLH4-5|PRC2C_MOUSE;rev_sp|Q3TLH4|PRC2C_MOUSE;rev_tr|A0A0A0MQ79|A0A0A0MQ79_MOUSE;rev_tr|S4R209|S4R209_MOUSE;rev_tr|S4R294|S4R294_MOUSE;rev_tr|S4R2J9|S4R2J9_MOUSE +2,Ecl1_0277_R0096-01_S004436_D_H01_TMT18_01,3945,[UNIMOD:2016]-ESTK[UNIMOD:2016]SAAER,3,647,1586.8887,10.839,True,ESTKSAAER,9,rev_sp|Q3TLH4-5|PRC2C_MOUSE;rev_sp|Q3TLH4|PRC2C_MOUSE;rev_tr|A0A0A0MQ79|A0A0A0MQ79_MOUSE;rev_tr|S4R209|S4R209_MOUSE;rev_tr|S4R294|S4R294_MOUSE;rev_tr|S4R2J9|S4R2J9_MOUSE diff --git a/tests/unit_tests/test_maxquant.py b/tests/unit_tests/test_maxquant.py index d407067..481091b 100644 --- a/tests/unit_tests/test_maxquant.py +++ b/tests/unit_tests/test_maxquant.py @@ -7,7 +7,6 @@ import pytest from spectrum_io.search_result.maxquant import MaxQuant -from spectrum_io.search_result.search_results import filter_valid_prosit_sequences COLUMNS = [ "RAW_FILE",