From 9143d1d85ef4e5cef65006cb1a1fc76a6e6bbbdb Mon Sep 17 00:00:00 2001 From: Fabian Basso Date: Wed, 31 Jul 2024 12:02:07 +0000 Subject: [PATCH] tests passed spectrum_io --- spectrum_io/search_result/mascot.py | 34 ++-- spectrum_io/search_result/maxquant.py | 53 +++--- spectrum_io/search_result/msfragger.py | 35 ++-- spectrum_io/search_result/sage.py | 27 ++- spectrum_io/search_result/search_results.py | 30 +-- spectrum_io/search_result/xisearch.py | 10 +- spectrum_io/spectral_library/dlib.py | 9 +- spectrum_io/spectral_library/msp.py | 5 +- .../spectral_library/spectral_library.py | 13 +- spectrum_io/spectral_library/spectronaut.py | 5 +- tests/unit_tests/data/psm_mods.pepXML | 175 ++++++++++++++++++ .../unit_tests/data/psm_tmt_internal_mods.csv | 5 + tests/unit_tests/test_msfragger.py | 17 +- 13 files changed, 334 insertions(+), 84 deletions(-) create mode 100644 tests/unit_tests/data/psm_mods.pepXML create mode 100644 tests/unit_tests/data/psm_tmt_internal_mods.csv diff --git a/spectrum_io/search_result/mascot.py b/spectrum_io/search_result/mascot.py index e3f7adc..7e6f2eb 100644 --- a/spectrum_io/search_result/mascot.py +++ b/spectrum_io/search_result/mascot.py @@ -2,11 +2,11 @@ import re import sqlite3 from pathlib import Path -from typing import Optional, Union, Dict, Tuple +from typing import Dict, Optional, Tuple, Union import pandas as pd import spectrum_fundamentals.constants as c -from spectrum_fundamentals.mod_string import internal_without_mods, custom_regex_escape +from spectrum_fundamentals.mod_string import custom_regex_escape, internal_without_mods from .search_results import SearchResults, filter_valid_prosit_sequences @@ -16,14 +16,16 @@ class Mascot(SearchResults): """Handle search results from Mascot.""" - def read_result(self, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None, - var_mods: Optional[Dict[str, str]] = None) -> pd.DataFrame: + def read_result( + self, + tmt_labeled: str, + custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None, + ) -> pd.DataFrame: """ Function to read a mascot msf file and perform some basic formatting. :param tmt_labeled: tmt label as str - :param var_mods: dict with custom variable identifier and respecitve internal equivalent - :param stat_mods: dict with custom static identifier and respecitve internal equivalent + :param custom_mods: dict with custom variable and static identifier and respective internal equivalent and mass :return: pd.DataFrame with the formatted data """ logger.info("Reading mascot msf file") @@ -78,10 +80,10 @@ def read_result(self, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = No ["SCAN_NUMBER", "PRECURSOR_CHARGE", "SCORE", "RAW_FILE", "SEQUENCE", "REVERSE"], as_index=False, ).agg({"MODIFICATIONS": "|".join}) - MOD_MASSES = c.update_mod_masses() - mod_masses_reverse = {round(float(v), 3): k for k, v in MOD_MASSES.items()} - - def find_replacement(match: re.Match, sequence: str) -> str: + mod_masses = c.update_mod_masses() + mod_masses_reverse = {round(float(v), 3): k for k, v in mod_masses.items()} + + def find_replacement(match: re.Match) -> str: """ Subfunction to find the corresponding substitution for a match.
@@ -90,7 +92,13 @@ def find_replacement(match: re.Match, sequence: str) -> str: """ key = match.string[match.start() : match.end()] return mods[key] - + + stat_mods: Dict[str, str] = {} + var_mods: Dict[str, str] = {} + + if custom_mods is not None: + stat_mods = {key: value[0] for key, value in (custom_mods.get("stat_mods") or {}).items()} + var_mods = {key: value[0] for key, value in (custom_mods.get("var_mods") or {}).items()} mods = {} @@ -107,11 +115,11 @@ def find_replacement(match: re.Match, sequence: str) -> str: modifications = row["MODIFICATIONS"].split("|") sequence = row["SEQUENCE"] if mods: - sequence = regex.sub(lambda match: find_replacement(match, sequence), sequence) + sequence = regex.sub(lambda match: find_replacement(match), sequence) if len(modifications) == 0: sequences.append(sequence) - else: + else: skip = 0 for mod in modifications: pos, mass = mod.split("$") diff --git a/spectrum_io/search_result/maxquant.py b/spectrum_io/search_result/maxquant.py index 2180979..410102e 100644 --- a/spectrum_io/search_result/maxquant.py +++ b/spectrum_io/search_result/maxquant.py @@ -1,11 +1,10 @@ import logging from pathlib import Path -from typing import Optional, Union, Dict, Tuple +from typing import Dict, Optional, Tuple, Union import pandas as pd import spectrum_fundamentals.constants as c from spectrum_fundamentals.mod_string import internal_without_mods, maxquant_to_internal -from spectrum_fundamentals.constants import MAXQUANT_VAR_MODS from .search_results import SearchResults, filter_valid_prosit_sequences @@ -42,14 +41,16 @@ def add_tmt_mod(mass: float, seq: str, unimod_tag: str) -> float: mass += num_of_tmt * c.MOD_MASSES[f"{unimod_tag}"] return mass - def read_result(self, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None, - var_mods: Optional[Dict[str, str]] = None) -> pd.DataFrame: + def read_result( + self, + tmt_labeled: str, + custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None, + ) -> pd.DataFrame: """ Function to read a msms txt and perform some basic formatting. :param tmt_labeled: tmt label as str - :param var_mods: dict with custom variable identifier and respecitve internal equivalent - :param stat_mods: dict with custom static identifier and respecitve internal equivalent + :param custom_mods: dict with custom variable and static identifier and respecitve internal equivalent and mass :return: pd.DataFrame with the formatted data """ logger.info("Reading msms.txt file") @@ -76,24 +77,35 @@ def read_result(self, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = No df.columns = df.columns.str.upper() df.columns = df.columns.str.replace(" ", "_") + stat_mods: Dict[str, str] = {} + var_mods: Dict[str, str] = {} + + if custom_mods is not None: + stat_mods = {key: value[0] for key, value in (custom_mods.get("stat_mods") or {}).items()} + var_mods = {key: value[0] for key, value in (custom_mods.get("var_mods") or {}).items()} + df = MaxQuant.update_columns_for_prosit(df, tmt_labeled, stat_mods, var_mods) return filter_valid_prosit_sequences(df) @staticmethod - def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None, - var_mods: Optional[Dict[str, str]] = None) -> pd.DataFrame: + def update_columns_for_prosit( + df: pd.DataFrame, + tmt_labeled: str, + stat_mods: Optional[Dict[str, str]] = None, + var_mods: Optional[Dict[str, str]] = None, + ) -> pd.DataFrame: """ Update columns of df to work with Prosit. 
:param df: df to modify :param tmt_labeled: True if tmt labeled - :param var_mods: dict with custom variable identifier and respecitve internal equivalent + :param var_mods: dict with custom variable identifier and respecitve internal equivalent :param stat_mods: dict with custom static identifier and respecitve internal equivalent :return: modified df as pd.DataFrame """ df.rename(columns={"CHARGE": "PRECURSOR_CHARGE"}, inplace=True) - mods = {**(MAXQUANT_VAR_MODS), **(stat_mods or {}), **(var_mods or {})} + mods = {**(c.MAXQUANT_VAR_MODS), **(stat_mods or {}), **(var_mods or {})} df["REVERSE"].fillna(False, inplace=True) df["REVERSE"].replace("+", True, inplace=True) @@ -102,33 +114,32 @@ def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str, stat_mods: Opt unimod_tag = c.TMT_MODS[tmt_labeled] logger.info("Adding TMT fixed modifications") df["MODIFIED_SEQUENCE"] = maxquant_to_internal( - df["MODIFIED_SEQUENCE"].to_numpy(), mods= - {**{"C": "C[UNIMOD:4]", "^_": f"_{unimod_tag}-", "K": f"K{unimod_tag}"}, **mods} - ) + df["MODIFIED_SEQUENCE"].to_numpy(), + mods={**{"C": "C[UNIMOD:4]", "^_": f"_{unimod_tag}-", "K": f"K{unimod_tag}"}, **mods}, + ) df["MASS"] = df.apply(lambda x: MaxQuant.add_tmt_mod(x.MASS, x.MODIFIED_SEQUENCE, unimod_tag), axis=1) if "msa" in tmt_labeled: logger.info("Replacing phospho by dehydration for Phospho-MSA") df["MODIFIED_SEQUENCE_MSA"] = df["MODIFIED_SEQUENCE"].str.replace( "[UNIMOD:21]", "[UNIMOD:23]", regex=False ) - fixed_mods = {"C": "C[UNIMOD:4]"} elif "LABELING_STATE" in df.columns: logger.info("Adding SILAC fixed modifications") - + df.loc[df["LABELING_STATE"] == 1, "MODIFIED_SEQUENCE"] = maxquant_to_internal( - df[df["LABELING_STATE"] == 1]["MODIFIED_SEQUENCE"].to_numpy(), mods = - {**{"C": "C[UNIMOD:4]", "K": "K[UNIMOD:259]", "R": "R[UNIMOD:267]"}, **mods} + df[df["LABELING_STATE"] == 1]["MODIFIED_SEQUENCE"].to_numpy(), + mods={**{"C": "C[UNIMOD:4]", "K": "K[UNIMOD:259]", "R": "R[UNIMOD:267]"}, **mods}, ) df.loc[df["LABELING_STATE"] != 1, "MODIFIED_SEQUENCE"] = maxquant_to_internal( - df[df["LABELING_STATE"] != 1]["MODIFIED_SEQUENCE"].to_numpy(), mods= - {**{"C": "C[UNIMOD:4]"}, **mods} + df[df["LABELING_STATE"] != 1]["MODIFIED_SEQUENCE"].to_numpy(), mods={**{"C": "C[UNIMOD:4]"}, **mods} ) df["MASS"] = df.apply(lambda x: MaxQuant.add_tmt_mod(x.MASS, x.MODIFIED_SEQUENCE, "[UNIMOD:259]"), axis=1) df["MASS"] = df.apply(lambda x: MaxQuant.add_tmt_mod(x.MASS, x.MODIFIED_SEQUENCE, "[UNIMOD:267]"), axis=1) df.drop(columns=["LABELING_STATE"], inplace=True) else: - df["MODIFIED_SEQUENCE"] = maxquant_to_internal(df["MODIFIED_SEQUENCE"].to_numpy(), mods= - {**{"C": "C[UNIMOD:4]"}, **mods}) + df["MODIFIED_SEQUENCE"] = maxquant_to_internal( + df["MODIFIED_SEQUENCE"].to_numpy(), mods={**{"C": "C[UNIMOD:4]"}, **mods} + ) df["SEQUENCE"] = internal_without_mods(df["MODIFIED_SEQUENCE"]) df["PEPTIDE_LENGTH"] = df["SEQUENCE"].apply(lambda x: len(x)) df["PROTEINS"].fillna("UNKNOWN", inplace=True) diff --git a/spectrum_io/search_result/msfragger.py b/spectrum_io/search_result/msfragger.py index df98d79..3911700 100644 --- a/spectrum_io/search_result/msfragger.py +++ b/spectrum_io/search_result/msfragger.py @@ -1,12 +1,12 @@ import logging from pathlib import Path -from typing import Optional, Union, Dict, Tuple +from typing import Dict, Optional, Tuple, Union import pandas as pd import spectrum_fundamentals.constants as c from pyteomics import pepxml -from spectrum_fundamentals.mod_string import internal_without_mods, msfragger_to_internal from 
spectrum_fundamentals.constants import MSFRAGGER_VAR_MODS +from spectrum_fundamentals.mod_string import internal_without_mods, msfragger_to_internal from tqdm import tqdm from .search_results import SearchResults, filter_valid_prosit_sequences @@ -17,14 +17,17 @@ class MSFragger(SearchResults): """Handle search results from MSFragger.""" - def read_result(self, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None, - var_mods: Optional[Dict[str, str]] = None) -> pd.DataFrame: + def read_result( + self, + tmt_labeled: str, + custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None, + ) -> pd.DataFrame: """ Function to read a msms txt and perform some basic formatting. :param tmt_labeled: tmt label as str - :param var_mods: dict with custom variable identifier and respecitve internal equivalent - :param stat_mods: dict with custom static identifier and respecitve internal equivalent:raises FileNotFoundError: in case the given path is neither a file, nor a directory. + :param custom_mods: dict with custom variable and static identifier and respecitve internal equivalent and mass + :raises FileNotFoundError: in case the given path is neither a file, nor a directory. :return: pd.DataFrame with the formatted data """ if self.path.is_file(): @@ -40,18 +43,26 @@ def read_result(self, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = No df = pd.concat(ms_frag_results) + stat_mods: Dict[str, str] = {} + var_mods: Dict[str, str] = {} + + if custom_mods is not None: + stat_mods = {key: value[0] for key, value in (custom_mods.get("stat_mods") or {}).items()} + var_mods = {key: value[0] for key, value in (custom_mods.get("var_mods") or {}).items()} + df = update_columns_for_prosit(df, tmt_labeled, stat_mods=stat_mods, var_mods=var_mods) return filter_valid_prosit_sequences(df) -def update_columns_for_prosit(df, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None, - var_mods: Optional[Dict[str, str]] = None) -> pd.DataFrame: +def update_columns_for_prosit( + df, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None, var_mods: Optional[Dict[str, str]] = None +) -> pd.DataFrame: """ Update columns of df to work with Prosit. :param df: df to modify :param tmt_labeled: True if tmt labeled - :param var_mods: dict with custom variable identifier and respecitve internal equivalent + :param var_mods: dict with custom variable identifier and respecitve internal equivalent :param stat_mods: dict with custom static identifier and respecitve internal equivalent :return: modified df as pd.DataFrame """ @@ -64,15 +75,13 @@ def update_columns_for_prosit(df, tmt_labeled: str, stat_mods: Optional[Dict[str mods = {**(MSFRAGGER_VAR_MODS), **(stat_mods or {}), **(var_mods or {})} - if tmt_labeled != "": unimod_tag = c.TMT_MODS[tmt_labeled] logger.info("Adding TMT fixed modifications") mods = {**{"C": "C[UNIMOD:4]", r"n[\d+]": f"{unimod_tag}-", "K": f"K{unimod_tag}"}, **mods} - df["MODIFIED_SEQUENCE"] = msfragger_to_internal( - df["modified_peptide"].to_list(), mods=mods) + df["MODIFIED_SEQUENCE"] = msfragger_to_internal(df["modified_peptide"].to_list(), mods=mods) else: - #By default, i.e. if nothing is supplied to fixed_mods, carbamidomethylation on cystein will be included + # By default, i.e. if nothing is supplied to fixed_mods, carbamidomethylation on cystein will be included # in the fixed modifications. 
If you want to have no fixed modifictions at all, supply fixed_mods={} mods = {**{"C": "C[UNIMOD:4]"}, **mods} df["MODIFIED_SEQUENCE"] = msfragger_to_internal(df["modified_peptide"].to_list(), mods=mods) diff --git a/spectrum_io/search_result/sage.py b/spectrum_io/search_result/sage.py index 10be092..50438b9 100644 --- a/spectrum_io/search_result/sage.py +++ b/spectrum_io/search_result/sage.py @@ -1,6 +1,6 @@ import logging from pathlib import Path -from typing import Optional, Union, Dict, Tuple +from typing import Dict, Optional, Tuple, Union import pandas as pd from spectrum_fundamentals.constants import MOD_MASSES_SAGE @@ -14,14 +14,16 @@ class Sage(SearchResults): """Handle search results from Sage.""" - def read_result(self, tmt_labeled: str = "", stat_mods: Optional[Dict[str, str]] = None, - var_mods: Optional[Dict[str, str]] = None) -> pd.DataFrame: + def read_result( + self, + tmt_labeled: str = "", + custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None, + ) -> pd.DataFrame: """ Function to read a msms tsv and perform some basic formatting. :param tmt_labeled: tmt label as str - :param var_mods: Variable modifications with custom identifiers and their respective internal equivalents - :param stat_mods: Static modifications with custom identifiers and their respective internal equivalents + :param custom_mods: dict with custom variable and static identifier and respecitve internal equivalent and mass :return: pd.DataFrame with the formatted data """ logger.info(f"Reading {self.path}") @@ -36,12 +38,23 @@ def read_result(self, tmt_labeled: str = "", stat_mods: Optional[Dict[str, str]] df.columns = df.columns.str.upper() df.columns = df.columns.str.replace(" ", "_") + stat_mods: Dict[str, str] = {} + var_mods: Dict[str, str] = {} + + if custom_mods is not None: + stat_mods = {key: value[0] for key, value in (custom_mods.get("stat_mods") or {}).items()} + var_mods = {key: value[0] for key, value in (custom_mods.get("var_mods") or {}).items()} + df = Sage.update_columns_for_prosit(df, tmt_labeled, stat_mods=stat_mods, var_mods=var_mods) return filter_valid_prosit_sequences(df) @staticmethod - def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None, - var_mods: Optional[Dict[str, str]] = None) -> pd.DataFrame: + def update_columns_for_prosit( + df: pd.DataFrame, + tmt_labeled: str, + stat_mods: Optional[Dict[str, str]] = None, + var_mods: Optional[Dict[str, str]] = None, + ) -> pd.DataFrame: """ Update columns of df to work with Prosit. diff --git a/spectrum_io/search_result/search_results.py b/spectrum_io/search_result/search_results.py index 9bccdfc..0d9c476 100644 --- a/spectrum_io/search_result/search_results.py +++ b/spectrum_io/search_result/search_results.py @@ -2,7 +2,7 @@ import re from abc import abstractmethod from pathlib import Path -from typing import Optional, Union, Dict, Tuple +from typing import Dict, Optional, Tuple, Union import pandas as pd @@ -51,33 +51,35 @@ def __init__(self, path: Union[str, Path]): self.path = path @abstractmethod - def read_result(self, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None, - var_mods: Optional[Dict[str, str]] = None): + def read_result( + self, + tmt_labeled: str, + custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None, + ): """Read result. 
:param tmt_labeled: tmt label as str - :param var_mods: variable modifications with custom identifier and respecitve internal equivalent - :param stat_mods: static modifications with custom identifier and respecitve internal equivalent - + :param custom_mods: dict with custom variable and static identifier and respecitve internal equivalent and mass """ raise NotImplementedError - def generate_internal(self, tmt_labeled: str, out_path: Optional[Union[str, Path]] = None, custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None) -> pd.DataFrame: + def generate_internal( + self, + tmt_labeled: str, + out_path: Optional[Union[str, Path]] = None, + custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None, + ) -> pd.DataFrame: """ Generate df and save to out_path if provided. :param out_path: path to output :param tmt_labeled: tmt label as str :param custom_mods: dict with static and variable custom modifications, their internal identifier and mass - :raises AssertionError: if custom modification with illegal mass was provided :return: path to output file """ - stat_mods: Dict[str, str] = {key: value[0] for key, value in (custom_mods.get("stat_mods") or {}).items()} - var_mods: Dict[str, str] = {key: value[0] for key, value in (custom_mods.get("var_mods") or {}).items()} - if out_path is None: # convert and return - return self.read_result(tmt_labeled, stat_mods=stat_mods, var_mods=var_mods) + return self.read_result(tmt_labeled, custom_mods=custom_mods) if isinstance(out_path, str): out_path = Path(out_path) @@ -85,11 +87,11 @@ def generate_internal(self, tmt_labeled: str, out_path: Optional[Union[str, Path if out_path.is_file(): # only read converted and return logger.info(f"Found search results in internal format at {out_path}, skipping conversion") - #TODO: internal_to_unimod + # TODO: internal_to_unimod return csv.read_file(out_path) # convert, save and return - df = self.read_result(tmt_labeled, stat_mods=stat_mods, var_mods=var_mods) + df = self.read_result(tmt_labeled, custom_mods=custom_mods) csv.write_file(df, out_path) return df diff --git a/spectrum_io/search_result/xisearch.py b/spectrum_io/search_result/xisearch.py index 5e1bbd9..ffeb5ef 100644 --- a/spectrum_io/search_result/xisearch.py +++ b/spectrum_io/search_result/xisearch.py @@ -3,9 +3,8 @@ import os import re from pathlib import Path -from typing import Union +from typing import Dict, Optional, Tuple, Union -import numpy as np import pandas as pd import spectrum_fundamentals.constants as c from spectrum_fundamentals.mod_string import xisearch_to_internal @@ -18,11 +17,16 @@ class Xisearch(SearchResults): """Handle search results from xisearch.""" - def read_result(self, tmt_labeled: str = "") -> pd.DataFrame: + def read_result( + self, + tmt_labeled: str = "", + custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None, + ) -> pd.DataFrame: """ Function to read a csv of CSMs and perform some basic formatting. 
:param tmt_labeled: tmt label as str + :param custom_mods: dict with custom variable and static identifier and respecitve internal equivalent and mass :raises NotImplementedError: if a tmt label is provided :return: pd.DataFrame with the formatted data """ diff --git a/spectrum_io/spectral_library/dlib.py b/spectrum_io/spectral_library/dlib.py index 703926f..ba9558c 100644 --- a/spectrum_io/spectral_library/dlib.py +++ b/spectrum_io/spectral_library/dlib.py @@ -1,7 +1,7 @@ import sqlite3 import zlib from pathlib import Path -from typing import IO, Dict, Union +from typing import IO, Dict, Union, Tuple, Optional import numpy as np import pandas as pd @@ -125,12 +125,13 @@ def _create_database(conn: sqlite3.Connection): c.execute(sql_insert_meta, ["staleProteinMapping", "true"]) conn.commit() - def _write(self, out: Union[IO, sqlite3.Connection], data: Dict[str, np.ndarray], metadata: pd.DataFrame): + def _write(self, out: Union[IO, sqlite3.Connection], data: Dict[str, np.ndarray], metadata: pd.DataFrame, + custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None): if isinstance(out, IO): raise TypeError("Not supported. Use msp/spectronaut if you want to write a text file.") seqs = metadata["SEQUENCE"] - modseqs = metadata["MODIFIED_SEQUENCE"] - mass_mod_sequences = internal_to_mod_mass(modseqs) + modseqs = metadata["MODIFIED_SEQUENCE"] + mass_mod_sequences = internal_to_mod_mass(modseqs, custom_mods) p_charges = metadata["PRECURSOR_CHARGE"] p_mzs = (metadata["MASS"] + (p_charges * PARTICLE_MASSES["PROTON"])) / p_charges diff --git a/spectrum_io/spectral_library/msp.py b/spectrum_io/spectral_library/msp.py index ffe42b4..1412375 100644 --- a/spectrum_io/spectral_library/msp.py +++ b/spectrum_io/spectral_library/msp.py @@ -1,5 +1,5 @@ from sqlite3 import Connection -from typing import IO, Dict, Union +from typing import IO, Dict, Union, Tuple, Optional import numpy as np import pandas as pd @@ -17,7 +17,8 @@ def _assemble_fragment_string(f_mz: float, f_int: float, f_a: bytes): annot = f_a[:-2].decode() if f_a.endswith(b"1") else f_a.replace(b"+", b"^").decode() return f'{f_mz:.8f}\t{f_int:.4f}\t"{annot}/0.0ppm"\n' - def _write(self, out: Union[IO, Connection], data: Dict[str, np.ndarray], metadata: pd.DataFrame): + def _write(self, out: Union[IO, Connection], data: Dict[str, np.ndarray], metadata: pd.DataFrame, + custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None): # prepare metadata if isinstance(out, Connection): raise TypeError("Not supported. Use DLib if you want to write a database file.") diff --git a/spectrum_io/spectral_library/spectral_library.py b/spectrum_io/spectral_library/spectral_library.py index 1d8bbc9..b037c14 100644 --- a/spectrum_io/spectral_library/spectral_library.py +++ b/spectrum_io/spectral_library/spectral_library.py @@ -3,7 +3,7 @@ from multiprocessing.managers import ValueProxy from pathlib import Path from sqlite3 import Connection -from typing import IO, Dict, Optional, Union +from typing import IO, Dict, Optional, Union, Tuple import numpy as np import pandas as pd @@ -52,12 +52,14 @@ def write(self, *args, **kwargs): def _get_handle(self): return open(self.out_path, self.mode) - def async_write(self, queue: Queue, progress: ValueProxy): + def async_write(self, queue: Queue, progress: ValueProxy, + custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None): """ Asynchronously write content to the output file from a queue. :param queue: A queue from which content will be retrieved for writing. 
:param progress: An integer value representing the progress of the writing process. + :param custom_mods: dict with custom variable and static identifier and respecitve internal equivalent and mass """ with self._get_handle() as out: self._initialize(out) @@ -65,7 +67,7 @@ def async_write(self, queue: Queue, progress: ValueProxy): content = queue.get() if content is None: break - self._write(out, *content) + self._write(out, *content, custom_mods=custom_mods) progress.value += 1 def _fragment_filter_passed( @@ -86,7 +88,8 @@ def _fragment_filter_passed( return (f_mz != -1) & (f_int >= self.min_intensity_threshold) @abstractmethod - def _write(self, out: Union[IO, Connection], data: Dict[str, np.ndarray], metadata: pd.DataFrame): + def _write(self, out: Union[IO, Connection], data: Dict[str, np.ndarray], metadata: pd.DataFrame, + custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None): """ Internal writer function. @@ -97,6 +100,8 @@ def _write(self, out: Union[IO, Connection], data: Dict[str, np.ndarray], metada :param out: file handle accepting the data to be written to disk :param data: Dictionary containing TODO keys and corresponding values as numpy array :param metadata: a dataframe that contains the columns TODO + :param custom_mods: dict with custom variable and static identifier and respecitve internal equivalent and mass + """ pass diff --git a/spectrum_io/spectral_library/spectronaut.py b/spectrum_io/spectral_library/spectronaut.py index 4adb612..c4632f3 100644 --- a/spectrum_io/spectral_library/spectronaut.py +++ b/spectrum_io/spectral_library/spectronaut.py @@ -2,7 +2,7 @@ import re from itertools import chain, cycle from sqlite3 import Connection -from typing import IO, Dict, Tuple, Union +from typing import IO, Dict, Tuple, Union, Optional import numpy as np import pandas as pd @@ -26,7 +26,8 @@ def _assemble_fragment_string(f_int: float, f_mz: float, f_annot: bytes): f"{f_int:.4f},{f_mz:.8f},{m.group(2)},{m.group(1)},{m.group(3)},{m.group(4) if m.group(4) else 'noloss'}\n" ) - def _write(self, out: Union[IO, Connection], data: Dict[str, np.ndarray], metadata: pd.DataFrame): + def _write(self, out: Union[IO, Connection], data: Dict[str, np.ndarray], metadata: pd.DataFrame, + custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None): # prepare metadata if isinstance(out, Connection): raise TypeError("Not supported. 
Use DLib if you want to write a database file.") diff --git a/tests/unit_tests/data/psm_mods.pepXML b/tests/unit_tests/data/psm_mods.pepXML new file mode 100644 index 0000000..b2a5ca5 --- /dev/null +++ b/tests/unit_tests/data/psm_mods.pepXML @@ -0,0 +1,175 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/unit_tests/data/psm_tmt_internal_mods.csv b/tests/unit_tests/data/psm_tmt_internal_mods.csv new file mode 100644 index 0000000..166040e --- /dev/null +++ b/tests/unit_tests/data/psm_tmt_internal_mods.csv @@ -0,0 +1,5 @@ +,RAW_FILE,SCAN_NUMBER,MODIFIED_SEQUENCE,PRECURSOR_CHARGE,SCAN_EVENT_NUMBER,MASS,SCORE,REVERSE,SEQUENCE,PEPTIDE_LENGTH,PROTEINS +0,Ecl1_0277_R0096-01_S004436_D_H01_TMT18_01,2459,[UNIMOD:2016]-GQAVLAFQEQVGTGR,5,34,1863.023,8.221,True,GQAVLAFQEQVGTGR,15,rev_tr|E9Q8J5|E9Q8J5_MOUSE +1,Ecl1_0277_R0096-01_S004436_D_H01_TMT18_01,2486,[UNIMOD:2016]-TEVPM[UNIMOD:35]GLSLRTTSAR,5,42,1937.0531,7.083,False,TEVPMGLSLRTTSAR,15,tr|A0A0N4SW17|A0A0N4SW17_MOUSE +2,Ecl1_0277_R0096-01_S004436_D_H01_TMT18_01,2980,[UNIMOD:2016]-YSGN[UNIMOD:41]C[UNIMOD:4]DRQSVER,3,193,1773.8123,3.932,False,YSGNCDRQSVER,12,sp|Q9D413-2|SH2D6_MOUSE;sp|Q9D413|SH2D6_MOUSE;tr|A0A3Q4EBW9|A0A3Q4EBW9_MOUSE;tr|A0A3Q4ECA8|A0A3Q4ECA8_MOUSE;tr|A0A3Q4EGG3|A0A3Q4EGG3_MOUSE;tr|E0CYY5|E0CYY5_MOUSE;tr|E9QJU1|E9QJU1_MOUSE +3,Ecl1_0277_R0096-01_S004436_D_H01_TMT18_01,3945,[UNIMOD:2016]-ESTK[UNIMOD:2016]SAAER,3,647,1586.8887,10.839,True,ESTKSAAER,9,rev_sp|Q3TLH4-5|PRC2C_MOUSE;rev_sp|Q3TLH4|PRC2C_MOUSE;rev_tr|A0A0A0MQ79|A0A0A0MQ79_MOUSE;rev_tr|S4R209|S4R209_MOUSE;rev_tr|S4R294|S4R294_MOUSE;rev_tr|S4R2J9|S4R2J9_MOUSE diff --git a/tests/unit_tests/test_msfragger.py b/tests/unit_tests/test_msfragger.py index 2eac2b7..86be885 100644 --- a/tests/unit_tests/test_msfragger.py +++ b/tests/unit_tests/test_msfragger.py @@ -27,7 +27,7 @@ def test_read_result(self): self.assertTrue("PROTEINS" in df.columns) def test_read_msfragger(self): - """Test function for reading sage results and transforming to Prosit format.""" + """Test function for reading msfragger results and transforming to Prosit format.""" expected_msfragger_internal_path = Path(__file__).parent / "data" / "psm_tmt_internal.csv" internal_search_results_df = MSFragger(Path(__file__).parent / "data" / "psm_tmt.pepXML").read_result( @@ -38,3 +38,18 @@ def test_read_msfragger(self): print("Expected Columns:", expected_df.columns) pd.testing.assert_frame_equal(internal_search_results_df, expected_df) + + def test_read_msfragger_mods(self): + """Test function for reading msfragger results and transforming to Prosit format with custom mods.""" + expected_msfragger_internal_path = Path(__file__).parent / "data" / "psm_tmt_internal_mods.csv" + # mass values are placeholders; read_result only uses the internal identifier (first tuple element) + custom_mods = {"stat_mods": {"M[35]": ("[UNIMOD:35]", 15.994915)}, "var_mods": {"[41]": ("[UNIMOD:41]", 162.052824)}} + + internal_search_results_df = MSFragger(Path(__file__).parent / "data" / "psm_mods.pepXML").read_result( + tmt_labeled="tmtpro", custom_mods=custom_mods + ) + expected_df = pd.read_csv(expected_msfragger_internal_path, index_col=0) + print("Internal Search Results Columns:", internal_search_results_df.columns) + print("Expected Columns:", expected_df.columns) + + pd.testing.assert_frame_equal(internal_search_results_df, expected_df)
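
The refactoring above replaces the separate stat_mods/var_mods arguments of read_result with a single custom_mods dictionary: its "stat_mods" and "var_mods" entries each map a search-engine modification token to a tuple of the internal UNIMOD identifier and its mass, and the readers extract the identifier (first tuple element) themselves. A minimal usage sketch under that assumption follows; the token/identifier pairs are taken from the new unit test, while the output path and the mass values are illustrative placeholders, not values from this patch.

from pathlib import Path

from spectrum_io.search_result.msfragger import MSFragger

# custom_mods maps "stat_mods"/"var_mods" to {search-engine token: (internal identifier, mass)}.
# The mass values below are placeholders; read_result only uses the internal identifier.
custom_mods = {
    "stat_mods": {"M[35]": ("[UNIMOD:35]", 15.994915)},
    "var_mods": {"[41]": ("[UNIMOD:41]", 162.052824)},
}

search_results = MSFragger(Path("tests/unit_tests/data/psm_mods.pepXML"))

# Convert directly to the internal Prosit format ...
df = search_results.read_result(tmt_labeled="tmtpro", custom_mods=custom_mods)

# ... or convert once and cache the internal CSV (illustrative output path).
df = search_results.generate_internal(
    tmt_labeled="tmtpro",
    out_path=Path("psm_mods_internal.csv"),
    custom_mods=custom_mods,
)

The other readers (Mascot, MaxQuant, Sage, Xisearch) accept the same custom_mods structure, so generate_internal simply forwards the dictionary instead of splitting it into stat_mods and var_mods itself.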