From ce752a3e915b047b5a6184961302e39efb2df9fa Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Thu, 1 Aug 2024 21:21:03 +0200 Subject: [PATCH] fixed and cleaned up unit tests --- spectrum_io/search_result/maxquant.py | 148 +++++++---------- spectrum_io/search_result/msfragger.py | 146 ++++++++--------- spectrum_io/search_result/sage.py | 105 ++++++------ spectrum_io/search_result/search_results.py | 60 ++++++- spectrum_io/spectral_library/dlib.py | 13 +- spectrum_io/spectral_library/msp.py | 11 +- .../spectral_library/spectral_library.py | 16 +- spectrum_io/spectral_library/spectronaut.py | 11 +- tests/unit_tests/data/msms.txt | 10 ++ tests/unit_tests/data/msms_internal.csv | 10 ++ tests/unit_tests/data/msms_internal_tmt.csv | 10 ++ tests/unit_tests/data/psm_mods.pepXML | 2 +- tests/unit_tests/test_maxquant.py | 155 ++++++------------ tests/unit_tests/test_msfragger.py | 50 ++++-- tests/unit_tests/test_sage.py | 43 ++++- 15 files changed, 420 insertions(+), 370 deletions(-) create mode 100644 tests/unit_tests/data/msms.txt create mode 100644 tests/unit_tests/data/msms_internal.csv create mode 100644 tests/unit_tests/data/msms_internal_tmt.csv diff --git a/spectrum_io/search_result/maxquant.py b/spectrum_io/search_result/maxquant.py index 410102e..a378a1c 100644 --- a/spectrum_io/search_result/maxquant.py +++ b/spectrum_io/search_result/maxquant.py @@ -4,9 +4,9 @@ import pandas as pd import spectrum_fundamentals.constants as c -from spectrum_fundamentals.mod_string import internal_without_mods, maxquant_to_internal +from spectrum_fundamentals.mod_string import internal_without_mods -from .search_results import SearchResults, filter_valid_prosit_sequences +from .search_results import SearchResults, filter_valid_prosit_sequences, parse_mods logger = logging.getLogger(__name__) @@ -43,108 +43,74 @@ def add_tmt_mod(mass: float, seq: str, unimod_tag: str) -> float: def read_result( self, - tmt_labeled: str, - custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None, + tmt_label: str = "", + custom_mods: Optional[Dict[str, int]] = None, ) -> pd.DataFrame: """ Function to read a msms txt and perform some basic formatting. - :param tmt_labeled: tmt label as str - :param custom_mods: dict with custom variable and static identifier and respecitve internal equivalent and mass + :param tmt_label: optional tmt label as str + :param custom_mods: optional dictionary mapping MaxQuant-specific mod pattern to UNIMOD IDs. + If None, static carbamidomethylation of cytein and variable oxidation of methionine + are mapped automatically. To avoid this, explicitely provide an empty dictionary. :return: pd.DataFrame with the formatted data """ - logger.info("Reading msms.txt file") - df = pd.read_csv( - self.path / "msms.txt", - usecols=lambda x: x.upper() - in [ - "RAW FILE", - "SCAN NUMBER", - "MODIFIED SEQUENCE", - "CHARGE", - "SCAN EVENT NUMBER", - "LABELING STATE", - "MASS", # = Calculated Precursor mass; TODO get column with experimental Precursor mass instead - "SCORE", - "REVERSE", - "PROTEINS", - ], - sep="\t", - ) - logger.info("Finished reading msms.txt file") - - # Standardize column names - df.columns = df.columns.str.upper() - df.columns = df.columns.str.replace(" ", "_") + if custom_mods is None: + custom_mods = { + "C": 4, + "M(ox)": 35, + "M(Oxidation (M))": 35, + } + parsed_mods = parse_mods(custom_mods) + if tmt_label: + unimod_tag = c.TMT_MODS[tmt_label] + parsed_mods["K"] = f"K{unimod_tag}" + parsed_mods["^_"] = f"_{unimod_tag}-" - stat_mods: Dict[str, str] = {} - var_mods: Dict[str, str] = {} + logger.info("Reading msms.txt file") + self.results = pd.read_csv(self.path / "msms.txt", sep="\t") - if custom_mods is not None: - stat_mods = {key: value[0] for key, value in (custom_mods.get("stat_mods") or {}).items()} - var_mods = {key: value[0] for key, value in (custom_mods.get("var_mods") or {}).items()} + logger.info("Finished reading msms.txt file") - df = MaxQuant.update_columns_for_prosit(df, tmt_labeled, stat_mods, var_mods) - return filter_valid_prosit_sequences(df) + self.convert_to_internal(mods=parsed_mods) + return filter_valid_prosit_sequences(self.results) - @staticmethod - def update_columns_for_prosit( - df: pd.DataFrame, - tmt_labeled: str, - stat_mods: Optional[Dict[str, str]] = None, - var_mods: Optional[Dict[str, str]] = None, - ) -> pd.DataFrame: + def convert_to_internal(self, mods: Dict[str, str]) -> pd.DataFrame: """ - Update columns of df to work with Prosit. + Convert all columns in the MaxQuant output to the internal format used by Oktoberfest. - :param df: df to modify - :param tmt_labeled: True if tmt labeled - :param var_mods: dict with custom variable identifier and respecitve internal equivalent - :param stat_mods: dict with custom static identifier and respecitve internal equivalent - :return: modified df as pd.DataFrame + :param mods: dictionary mapping MaxQuant-specific mod patterns (keys) to ProForma standard (values) """ - df.rename(columns={"CHARGE": "PRECURSOR_CHARGE"}, inplace=True) - - mods = {**(c.MAXQUANT_VAR_MODS), **(stat_mods or {}), **(var_mods or {})} - - df["REVERSE"].fillna(False, inplace=True) - df["REVERSE"].replace("+", True, inplace=True) - logger.info("Converting MaxQuant peptide sequence to internal format") - if tmt_labeled != "": - unimod_tag = c.TMT_MODS[tmt_labeled] - logger.info("Adding TMT fixed modifications") - df["MODIFIED_SEQUENCE"] = maxquant_to_internal( - df["MODIFIED_SEQUENCE"].to_numpy(), - mods={**{"C": "C[UNIMOD:4]", "^_": f"_{unimod_tag}-", "K": f"K{unimod_tag}"}, **mods}, - ) - df["MASS"] = df.apply(lambda x: MaxQuant.add_tmt_mod(x.MASS, x.MODIFIED_SEQUENCE, unimod_tag), axis=1) - if "msa" in tmt_labeled: - logger.info("Replacing phospho by dehydration for Phospho-MSA") - df["MODIFIED_SEQUENCE_MSA"] = df["MODIFIED_SEQUENCE"].str.replace( - "[UNIMOD:21]", "[UNIMOD:23]", regex=False - ) - elif "LABELING_STATE" in df.columns: - logger.info("Adding SILAC fixed modifications") - - df.loc[df["LABELING_STATE"] == 1, "MODIFIED_SEQUENCE"] = maxquant_to_internal( - df[df["LABELING_STATE"] == 1]["MODIFIED_SEQUENCE"].to_numpy(), - mods={**{"C": "C[UNIMOD:4]", "K": "K[UNIMOD:259]", "R": "R[UNIMOD:267]"}, **mods}, - ) - df.loc[df["LABELING_STATE"] != 1, "MODIFIED_SEQUENCE"] = maxquant_to_internal( - df[df["LABELING_STATE"] != 1]["MODIFIED_SEQUENCE"].to_numpy(), mods={**{"C": "C[UNIMOD:4]"}, **mods} - ) - df["MASS"] = df.apply(lambda x: MaxQuant.add_tmt_mod(x.MASS, x.MODIFIED_SEQUENCE, "[UNIMOD:259]"), axis=1) - df["MASS"] = df.apply(lambda x: MaxQuant.add_tmt_mod(x.MASS, x.MODIFIED_SEQUENCE, "[UNIMOD:267]"), axis=1) - df.drop(columns=["LABELING_STATE"], inplace=True) - else: - df["MODIFIED_SEQUENCE"] = maxquant_to_internal( - df["MODIFIED_SEQUENCE"].to_numpy(), mods={**{"C": "C[UNIMOD:4]"}, **mods} - ) - df["SEQUENCE"] = internal_without_mods(df["MODIFIED_SEQUENCE"]) - df["PEPTIDE_LENGTH"] = df["SEQUENCE"].apply(lambda x: len(x)) - df["PROTEINS"].fillna("UNKNOWN", inplace=True) - - return df + df = self.results + # Standardize column names + # df.columns = df.columns.str.upper() + # df.columns = df.columns.str.replace(" ", "_") + # df.rename(columns={"CHARGE": "PRECURSOR_CHARGE"}, inplace=True) + + mods["_"] = "" + + df.fillna({"Reverse": "", "Proteins": "UNKNOWN"}, inplace=True) + df["Reverse"] = df["Reverse"].astype(bool) + df.replace({"Modified sequence": mods}, regex=True, inplace=True) + + df["Sequence"] = internal_without_mods(df["Modified sequence"]) + df["PEPTIDE_LENGTH"] = df["Sequence"].str.len() + + df.rename( + columns={ + "Reverse": "REVERSE", + "Sequence": "SEQUENCE", + "Modified sequence": "MODIFIED_SEQUENCE", + "Proteins": "PROTEINS", + "Charge": "PRECURSOR_CHARGE", + "Raw file": "RAW_FILE", + "Scan number": "SCAN_NUMBER", + "Scan event number": "SCAN_EVENT_NUMBER", + "Mass": "MASS", + "Score": "SCORE", + }, + inplace=True, + ) def generate_internal_timstof_metadata(self): """ diff --git a/spectrum_io/search_result/msfragger.py b/spectrum_io/search_result/msfragger.py index 3911700..aa9032b 100644 --- a/spectrum_io/search_result/msfragger.py +++ b/spectrum_io/search_result/msfragger.py @@ -6,10 +6,10 @@ import spectrum_fundamentals.constants as c from pyteomics import pepxml from spectrum_fundamentals.constants import MSFRAGGER_VAR_MODS -from spectrum_fundamentals.mod_string import internal_without_mods, msfragger_to_internal +from spectrum_fundamentals.mod_string import internal_without_mods from tqdm import tqdm -from .search_results import SearchResults, filter_valid_prosit_sequences +from .search_results import SearchResults, filter_valid_prosit_sequences, parse_mods logger = logging.getLogger(__name__) @@ -19,17 +19,26 @@ class MSFragger(SearchResults): def read_result( self, - tmt_labeled: str, - custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None, + tmt_label: str = "", + custom_mods: Optional[Dict[str, int]] = None, ) -> pd.DataFrame: """ Function to read a msms txt and perform some basic formatting. - :param tmt_labeled: tmt label as str - :param custom_mods: dict with custom variable and static identifier and respecitve internal equivalent and mass + :param tmt_label: optional tmt label as str + :param custom_mods: optional dictionary mapping MSFragger-specific mod pattern to UNIMOD IDs. + If None, static carbamidomethylation of cytein and variable oxidation of methionine + are mapped automatically. To avoid this, explicitely provide an empty dictionary. :raises FileNotFoundError: in case the given path is neither a file, nor a directory. :return: pd.DataFrame with the formatted data """ + if custom_mods is None: + custom_mods = {"C": 4, "M[147]": 35} + parsed_mods = parse_mods(custom_mods) + if tmt_label: + unimod_tag = c.TMT_MODS[tmt_label] + parsed_mods["K"] = f"K{unimod_tag}" + parsed_mods[r"^n\[\d+\]"] = f"{unimod_tag}-" if self.path.is_file(): file_list = [self.path] elif self.path.is_dir(): @@ -41,76 +50,57 @@ def read_result( for pep_xml_file in tqdm(file_list): ms_frag_results.append(pepxml.DataFrame(str(pep_xml_file))) - df = pd.concat(ms_frag_results) - - stat_mods: Dict[str, str] = {} - var_mods: Dict[str, str] = {} - - if custom_mods is not None: - stat_mods = {key: value[0] for key, value in (custom_mods.get("stat_mods") or {}).items()} - var_mods = {key: value[0] for key, value in (custom_mods.get("var_mods") or {}).items()} - - df = update_columns_for_prosit(df, tmt_labeled, stat_mods=stat_mods, var_mods=var_mods) - return filter_valid_prosit_sequences(df) - - -def update_columns_for_prosit( - df, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None, var_mods: Optional[Dict[str, str]] = None -) -> pd.DataFrame: - """ - Update columns of df to work with Prosit. - - :param df: df to modify - :param tmt_labeled: True if tmt labeled - :param var_mods: dict with custom variable identifier and respecitve internal equivalent - :param stat_mods: dict with custom static identifier and respecitve internal equivalent - :return: modified df as pd.DataFrame - """ - df["PROTEINS"] = df["protein"] - df["PROTEINS"].fillna("UNKNOWN", inplace=True) - df["REVERSE"] = df["protein"].apply(lambda x: "rev" in str(x)) - df["RAW_FILE"] = df["spectrum"].apply(lambda x: x.split(".")[0]) - df["MASS"] = df["precursor_neutral_mass"] - df["PEPTIDE_LENGTH"] = df["peptide"].apply(lambda x: len(x)) - - mods = {**(MSFRAGGER_VAR_MODS), **(stat_mods or {}), **(var_mods or {})} - - if tmt_labeled != "": - unimod_tag = c.TMT_MODS[tmt_labeled] - logger.info("Adding TMT fixed modifications") - mods = {**{"C": "C[UNIMOD:4]", r"n[\d+]": f"{unimod_tag}-", "K": f"K{unimod_tag}"}, **mods} - df["MODIFIED_SEQUENCE"] = msfragger_to_internal(df["modified_peptide"].to_list(), mods=mods) - else: - # By default, i.e. if nothing is supplied to fixed_mods, carbamidomethylation on cystein will be included - # in the fixed modifications. If you want to have no fixed modifictions at all, supply fixed_mods={} - mods = {**{"C": "C[UNIMOD:4]"}, **mods} - df["MODIFIED_SEQUENCE"] = msfragger_to_internal(df["modified_peptide"].to_list(), mods=mods) - - df.rename( - columns={ - "assumed_charge": "PRECURSOR_CHARGE", - "index": "SCAN_EVENT_NUMBER", - "peptide": "SEQUENCE", - "start_scan": "SCAN_NUMBER", - "hyperscore": "SCORE", - }, - inplace=True, - ) - df["SEQUENCE"] = internal_without_mods(df["MODIFIED_SEQUENCE"]) - df["PROTEINS"] = df["PROTEINS"].apply(lambda x: ";".join(x)) - - return df[ - [ - "RAW_FILE", - "SCAN_NUMBER", - "MODIFIED_SEQUENCE", - "PRECURSOR_CHARGE", - "SCAN_EVENT_NUMBER", - "MASS", - "SCORE", - "REVERSE", - "SEQUENCE", - "PEPTIDE_LENGTH", - "PROTEINS", + self.results = pd.concat(ms_frag_results) + + self.convert_to_internal(mods=parsed_mods) + return filter_valid_prosit_sequences(self.results) + + def convert_to_internal(self, mods: Dict[str, str]) -> pd.DataFrame: + """ + Convert all columns in the MSFragger output to the internal format used by Oktoberfest. + + :param mods: dictionary mapping MSFragger-specific mod patterns (keys) to ProForma standard (values) + """ + df = self.results + df["protein"] = df["protein"].fillna("UNKNOWN").apply(lambda x: ";".join(x)) + + df["REVERSE"] = df["protein"].apply(lambda x: "rev" in str(x)) + df["spectrum"] = df["spectrum"].str.split(pat=".", n=1).str[0] + df["PEPTIDE_LENGTH"] = df["peptide"].str.len() + + df.replace({"modified_peptide": mods}, regex=True, inplace=True) + df["peptide"] = internal_without_mods(df["modified_peptide"]) + + df.rename( + columns={ + "assumed_charge": "PRECURSOR_CHARGE", + "index": "SCAN_EVENT_NUMBER", + "peptide": "SEQUENCE", + "start_scan": "SCAN_NUMBER", + "hyperscore": "SCORE", + "modified_peptide": "MODIFIED_SEQUENCE", + "protein": "PROTEINS", + "peptide": "SEQUENCE", + "precursor_neutral_mass": "MASS", + "spectrum": "RAW_FILE", + }, + inplace=True, + ) + + """ + return df[ + [ + "RAW_FILE", + "SCAN_NUMBER", + "MODIFIED_SEQUENCE", + "PRECURSOR_CHARGE", + "SCAN_EVENT_NUMBER", + "MASS", + "SCORE", + "REVERSE", + "SEQUENCE", + "PEPTIDE_LENGTH", + "PROTEINS", + ] ] - ] + """ diff --git a/spectrum_io/search_result/sage.py b/spectrum_io/search_result/sage.py index 50438b9..2451efe 100644 --- a/spectrum_io/search_result/sage.py +++ b/spectrum_io/search_result/sage.py @@ -3,10 +3,10 @@ from typing import Dict, Optional, Tuple, Union import pandas as pd -from spectrum_fundamentals.constants import MOD_MASSES_SAGE -from spectrum_fundamentals.mod_string import sage_to_internal +import spectrum_fundamentals.constants as c +from spectrum_fundamentals.mod_string import internal_without_mods -from .search_results import SearchResults, filter_valid_prosit_sequences +from .search_results import SearchResults, filter_valid_prosit_sequences, parse_mods logger = logging.getLogger(__name__) @@ -16,79 +16,68 @@ class Sage(SearchResults): def read_result( self, - tmt_labeled: str = "", - custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None, + tmt_label: str = "", + custom_mods: Optional[Dict[str, int]] = None, ) -> pd.DataFrame: """ Function to read a msms tsv and perform some basic formatting. - :param tmt_labeled: tmt label as str - :param custom_mods: dict with custom variable and static identifier and respecitve internal equivalent and mass + :param tmt_label: optional tmt label as str + :param custom_mods: optional dictionary mapping Sage-specific mod pattern to UNIMOD IDs. + If None, static carbamidomethylation of cytein and variable oxidation of methionine + are mapped automatically. To avoid this, explicitely provide an empty dictionary. :return: pd.DataFrame with the formatted data """ + if custom_mods is None: + custom_mods = { + "C[+57.0215]": 4, + "M[+15.9949]": 35, + "M[+15.994]": 35, + } + parsed_mods = parse_mods(custom_mods) + if tmt_label: + unimod_tag = c.TMT_MODS[tmt_label] + parsed_mods[r"K\[\+\d+\.\d+\]"] = f"K{unimod_tag}" + parsed_mods[r"^\[\+\d+\.\d+\]"] = f"{unimod_tag}" + logger.info(f"Reading {self.path}") - df = pd.read_csv( + self.results = pd.read_csv( self.path, usecols=["filename", "scannr", "peptide", "charge", "hyperscore", "calcmass", "label", "proteins"], sep="\t", ) logger.info(f"Finished reading {self.path}") - # Standardize column names - df.columns = df.columns.str.upper() - df.columns = df.columns.str.replace(" ", "_") - - stat_mods: Dict[str, str] = {} - var_mods: Dict[str, str] = {} + self.convert_to_internal(mods=parsed_mods) + print(parsed_mods, self.results) + return filter_valid_prosit_sequences(self.results) - if custom_mods is not None: - stat_mods = {key: value[0] for key, value in (custom_mods.get("stat_mods") or {}).items()} - var_mods = {key: value[0] for key, value in (custom_mods.get("var_mods") or {}).items()} - - df = Sage.update_columns_for_prosit(df, tmt_labeled, stat_mods=stat_mods, var_mods=var_mods) - return filter_valid_prosit_sequences(df) - - @staticmethod - def update_columns_for_prosit( - df: pd.DataFrame, - tmt_labeled: str, - stat_mods: Optional[Dict[str, str]] = None, - var_mods: Optional[Dict[str, str]] = None, - ) -> pd.DataFrame: + def convert_to_internal(self, mods: Dict[str, str]) -> pd.DataFrame: """ - Update columns of df to work with Prosit. + Convert all columns in the Sage output to the internal format used by Oktoberfest. - :param df: df to modify - :param tmt_labeled: True if tmt labeled, ignored - :param var_mods: Variable modifications with custom identifiers and their respective internal equivalents - :param stat_mods: Static modifications with custom identifiers and their respective internal equivalents - :return: modified df as pd.DataFrame + :param mods: dictionary mapping Sage-specific mod patterns (keys) to ProForma standard (values) """ - df = df.rename( - columns={ - "FILENAME": "RAW_FILE", - "SCANNR": "SCAN_NUMBER", - "PEPTIDE": "MODIFIED_SEQUENCE", - "CHARGE": "PRECURSOR_CHARGE", - "CALCMASS": "MASS", - "HYPERSCORE": "SCORE", - "LABEL": "REVERSE", - } - ) - mods = {**(MOD_MASSES_SAGE), **(stat_mods or {}), **(var_mods or {})} + df = self.results # removing .mzML - df["RAW_FILE"] = df["RAW_FILE"].str.replace(r"\.mz[M|m][l|L]", "", regex=True) - # extracting only the scan number - df["SCAN_NUMBER"] = [int(x.rsplit("=", 1)[-1]) for x in df["SCAN_NUMBER"]] - # creating a column of decoys and targets - df["REVERSE"] = df["REVERSE"] < 0 - # removing modification to create the unmodified sequences - df["SEQUENCE"] = df["MODIFIED_SEQUENCE"].str.replace(r"\-|\[.*?\]", "", regex=True) - # length of the peptide + df.fillna({"proteins": "UNKNOWN"}, inplace=True) + df.replace({"filename": {r"\.mz[M|m][l|L]": ""}, "peptide": mods}, regex=True, inplace=True) + df["scannr"] = df["scannr"].str.rsplit(pat="=", n=1).str[1].astype(int) + df["label"] = df["label"] < 0 + df["SEQUENCE"] = internal_without_mods(df["peptide"]) df["PEPTIDE_LENGTH"] = df["SEQUENCE"].str.len() - # converting sage to unimod - df["MODIFIED_SEQUENCE"] = sage_to_internal(df["MODIFIED_SEQUENCE"], mods=mods) - df["PROTEINS"].fillna("UNKNOWN", inplace=True) - return df + df.rename( + columns={ + "filename": "RAW_FILE", + "scannr": "SCAN_NUMBER", + "peptide": "MODIFIED_SEQUENCE", + "charge": "PRECURSOR_CHARGE", + "calcmass": "MASS", + "hyperscore": "SCORE", + "label": "REVERSE", + "proteins": "PROTEINS", + }, + inplace=True, + ) diff --git a/spectrum_io/search_result/search_results.py b/spectrum_io/search_result/search_results.py index 0d9c476..4c7619a 100644 --- a/spectrum_io/search_result/search_results.py +++ b/spectrum_io/search_result/search_results.py @@ -34,6 +34,51 @@ def filter_valid_prosit_sequences(df: pd.DataFrame) -> pd.DataFrame: return df +def parse_mods(mods: Dict[str, int]) -> Dict[str, str]: + """ + Parse provided mapping of custom modification pattern to ProForma standard. + + This function takes a dictionary mapping custom modification pattern for specific aminoacids (keys) to a + UNIMOD ID (values). The pattern is translated to ProForma standard and a new dictionary mapping the custom + modification patterns to the ProForma standard is returned. + The pattern for the custom modifications must start with the one-letter code for an aminoacid or '^' / '$', + to describe n- / c-terminal modifications, respectively, followed by an optional pattern (which can be + empty). + This means that 'X' or 'X(custom_pattern)', is both mapped to 'X[UNIMOD:#]'. + For the n-terminus, an additional dash will be added automatically, which maps 'X(custom_pattern)' to + 'X[UNIMOD:#]-'. If the sequence to apply the transformation on already contains the dash, it needs to be part + of the custom_pattern (i.e. 'X(custom_pattern)-'), to avoid adding an additional dash. + + :param mods: Dictionary mapping custom modification patterns (keys) to UNIMOD IDs (values) + :raises TypeError: if keys are not strings or values are not integers + :raises ValueError: if the keys do not start with [A-Z,a-z,^,$] + :return: A dictionary mapping custom modification patterns (keys) to the ProForma standard (values) + """ + key_pattern = ( + "'X' or 'X' where X is either the one-letter code of an aminoacid or '^' / '$' defining the" + " n- or c-terminus, respectively, followed by an optional pattern identifying a specific modification." + ) + unimod_regex_map = {} + for k, v in mods.items(): + if not isinstance(v, int): + raise TypeError(f"UNIMOD id {v} for replacement {k} not understood. UNIMOD IDs must be integers.") + if not isinstance(k, str): + raise TypeError( + f"Replacement {k} not understood. Replacements must be strings and follow " f"the pattern {key_pattern}" + ) + if k[0].isalpha(): + unimod_regex_map[re.escape(k)] = f"{k[0]}[UNIMOD:{v}]" + continue + if k[0] == "^": + unimod_regex_map[f"^{re.escape(k[1:])}"] = f"[UNIMOD:{v}]-" + continue + raise ValueError( + f"Replacement {k} not understood. {k[0]} is not a valid aminoacid. " + f"Replacements most follow the pattern {key_pattern}" + ) + return unimod_regex_map + + class SearchResults: """Handle search results from different software.""" @@ -53,12 +98,12 @@ def __init__(self, path: Union[str, Path]): @abstractmethod def read_result( self, - tmt_labeled: str, - custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None, + tmt_label: str = "", + custom_mods: Optional[Dict[str, int]] = None, ): """Read result. - :param tmt_labeled: tmt label as str + :param tmt_label: tmt label as str :param custom_mods: dict with custom variable and static identifier and respecitve internal equivalent and mass """ raise NotImplementedError @@ -102,3 +147,12 @@ def read_internal(self) -> pd.DataFrame: :return: dataframe after reading the file """ return csv.read_file(self.path) + + @abstractmethod + def convert_to_internal(self, mods: Dict[str, int]): + """ + Convert all columns in the search engine-specific output to the internal format used by Oktoberfest. + + :param mods: dictionary mapping search engine-specific mod patterns (keys) to ProForma standard (values) + """ + raise NotImplementedError diff --git a/spectrum_io/spectral_library/dlib.py b/spectrum_io/spectral_library/dlib.py index ba9558c..205bda8 100644 --- a/spectrum_io/spectral_library/dlib.py +++ b/spectrum_io/spectral_library/dlib.py @@ -1,7 +1,7 @@ import sqlite3 import zlib from pathlib import Path -from typing import IO, Dict, Union, Tuple, Optional +from typing import IO, Dict, Optional, Tuple, Union import numpy as np import pandas as pd @@ -125,12 +125,17 @@ def _create_database(conn: sqlite3.Connection): c.execute(sql_insert_meta, ["staleProteinMapping", "true"]) conn.commit() - def _write(self, out: Union[IO, sqlite3.Connection], data: Dict[str, np.ndarray], metadata: pd.DataFrame, - custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None): + def _write( + self, + out: Union[IO, sqlite3.Connection], + data: Dict[str, np.ndarray], + metadata: pd.DataFrame, + custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None, + ): if isinstance(out, IO): raise TypeError("Not supported. Use msp/spectronaut if you want to write a text file.") seqs = metadata["SEQUENCE"] - modseqs = metadata["MODIFIED_SEQUENCE"] + modseqs = metadata["MODIFIED_SEQUENCE"] mass_mod_sequences = internal_to_mod_mass(modseqs, custom_mods) p_charges = metadata["PRECURSOR_CHARGE"] diff --git a/spectrum_io/spectral_library/msp.py b/spectrum_io/spectral_library/msp.py index 1412375..a83fa7c 100644 --- a/spectrum_io/spectral_library/msp.py +++ b/spectrum_io/spectral_library/msp.py @@ -1,5 +1,5 @@ from sqlite3 import Connection -from typing import IO, Dict, Union, Tuple, Optional +from typing import IO, Dict, Optional, Tuple, Union import numpy as np import pandas as pd @@ -17,8 +17,13 @@ def _assemble_fragment_string(f_mz: float, f_int: float, f_a: bytes): annot = f_a[:-2].decode() if f_a.endswith(b"1") else f_a.replace(b"+", b"^").decode() return f'{f_mz:.8f}\t{f_int:.4f}\t"{annot}/0.0ppm"\n' - def _write(self, out: Union[IO, Connection], data: Dict[str, np.ndarray], metadata: pd.DataFrame, - custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None): + def _write( + self, + out: Union[IO, Connection], + data: Dict[str, np.ndarray], + metadata: pd.DataFrame, + custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None, + ): # prepare metadata if isinstance(out, Connection): raise TypeError("Not supported. Use DLib if you want to write a database file.") diff --git a/spectrum_io/spectral_library/spectral_library.py b/spectrum_io/spectral_library/spectral_library.py index b037c14..9abf0f6 100644 --- a/spectrum_io/spectral_library/spectral_library.py +++ b/spectrum_io/spectral_library/spectral_library.py @@ -3,7 +3,7 @@ from multiprocessing.managers import ValueProxy from pathlib import Path from sqlite3 import Connection -from typing import IO, Dict, Optional, Union, Tuple +from typing import IO, Dict, Optional, Tuple, Union import numpy as np import pandas as pd @@ -52,8 +52,9 @@ def write(self, *args, **kwargs): def _get_handle(self): return open(self.out_path, self.mode) - def async_write(self, queue: Queue, progress: ValueProxy, - custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None): + def async_write( + self, queue: Queue, progress: ValueProxy, custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None + ): """ Asynchronously write content to the output file from a queue. @@ -88,8 +89,13 @@ def _fragment_filter_passed( return (f_mz != -1) & (f_int >= self.min_intensity_threshold) @abstractmethod - def _write(self, out: Union[IO, Connection], data: Dict[str, np.ndarray], metadata: pd.DataFrame, - custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None): + def _write( + self, + out: Union[IO, Connection], + data: Dict[str, np.ndarray], + metadata: pd.DataFrame, + custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None, + ): """ Internal writer function. diff --git a/spectrum_io/spectral_library/spectronaut.py b/spectrum_io/spectral_library/spectronaut.py index c4632f3..59ca248 100644 --- a/spectrum_io/spectral_library/spectronaut.py +++ b/spectrum_io/spectral_library/spectronaut.py @@ -2,7 +2,7 @@ import re from itertools import chain, cycle from sqlite3 import Connection -from typing import IO, Dict, Tuple, Union, Optional +from typing import IO, Dict, Optional, Tuple, Union import numpy as np import pandas as pd @@ -26,8 +26,13 @@ def _assemble_fragment_string(f_int: float, f_mz: float, f_annot: bytes): f"{f_int:.4f},{f_mz:.8f},{m.group(2)},{m.group(1)},{m.group(3)},{m.group(4) if m.group(4) else 'noloss'}\n" ) - def _write(self, out: Union[IO, Connection], data: Dict[str, np.ndarray], metadata: pd.DataFrame, - custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None): + def _write( + self, + out: Union[IO, Connection], + data: Dict[str, np.ndarray], + metadata: pd.DataFrame, + custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None, + ): # prepare metadata if isinstance(out, Connection): raise TypeError("Not supported. Use DLib if you want to write a database file.") diff --git a/tests/unit_tests/data/msms.txt b/tests/unit_tests/data/msms.txt new file mode 100644 index 0000000..083aaec --- /dev/null +++ b/tests/unit_tests/data/msms.txt @@ -0,0 +1,10 @@ +Raw file Scan number Scan index Sequence Length Missed cleavages Modifications Modified sequence Oxidation (M) Probabilities Oxidation (M) Score diffs Oxidation (M) Proteins Charge Fragmentation Mass analyzer Type Scan event number Isotope index m/z Mass Mass error [ppm] Mass error [Da] Simple mass error [ppm] Retention time PEP Score Delta score Score diff Localization prob Combinatorics PIF Fraction of total spectrum Base peak fraction Precursor full scan number Precursor Intensity Precursor apex fraction Precursor apex offset Precursor apex offset time Matches Intensities Mass deviations [Da] Mass deviations [ppm] Masses Number of matches Intensity coverage Peak coverage Unfragmented precursor intensity Unfragmented precursor fraction Neutral loss level ETD identification type Reverse All scores All sequences All modified sequences MS3 scan numbers Reporter PIF Reporter fraction id Protein group IDs Peptide ID Mod. peptide ID Evidence ID Oxidation (M) site IDs Mass deficit +GN20170722_SK_HLA_G0103_R1_02 39581 34737 AAAAAVAGVGRGG 13 Unmodified _AAAAAVAGVGRGG_ 0 uc009yqv.3;uc001ogi.3;C11orf68_p.V135L_uc001ogi.2|C11orf68 2 CID FTMS MULTI-MSMS 8 0 514.2858 1026.557 3.036 0.0015614 8.3832641 88.186 0.71744 13.77 3.3555 NaN NaN 1 0 0 0 39573 1384891.5 0.7223182599737116 -3 0.08632659912109375 y5;y7;a2;b9 2905.48046875;9943.1416015625;3013.589599609375;37873.2109375 -0.002829233181444124;0.008368573916527566;-0.00031301558594520884;-0.00275202985596934 -6.354192112149343;14.597148465958854;-2.7198193653715848;-4.0329222465724435 445.2545865011814;573.3019662054834;115.08690243568594;682.391002779256 4 0.016147311455613267 0.04395604395604396 0 0 None Unknown 13.769603755808198;10.41414173872871;9.986874898151301 AAAAAVAGVGRGG;RGLQQPVVM;PLFPVAGVPM _AAAAAVAGVGRGG_;_RGLQQPVVM_;_PLFPVAGVPM_ 0 153 0 0 0 0.044824929062770025 +GN20170722_SK_HLA_G0103_R1_01 24282 21318 AAADRNLIYVLK 12 Unmodified _AAADRNLIYVLK_ 0 2 CID FTMS MULTI-MSMS 8 1 673.89317 1345.7718 -4.268 -0.0028761 747.2514 52.702 0.56055 28.727 0.32818 NaN NaN 1 0 0 0 24274 4397484 0.3726964352547448 10 -0.23571395874023438 y1;y2;y3;y4;y5;y7;y2-NH3;a2 3799.669677734375;138293.265625;18070.681640625;58923.94140625;107806.1171875;70635.7734375;15442.98046875;9986.8359375 -0.000463241690255245;0.00019436417716178767;0.003219772849433866;0.0019772452006918684;0.0012221122285609454;0.002896620233173053;0.0009968129016613148;-0.000993683999936934 -3.148877721252655;0.7469894765873476;8.962184397850063;3.785457364047471;1.9233399457992681;3.358257103880474;4.099254348751434;-8.634154729256231 147.11326741229024;260.1966737868229;359.26206229435064;522.3266333602993;635.4114524736714;862.5367693932668;243.1693222365984;115.08758310409993 8 0.06633611121645898 0.07407407407407407 0 0 None Unknown + 28.726836482416193;28.398655567304367;27.254056396965822 AAADRNLIYVLK;LVRGIAHIFSPH;LVYQEINLARK _AAADRNLIYVLK_;_LVRGIAHIFSPH_;_LVYQEINLARK_ 1 6213 1 1 1 0.11272994931368885 +GN20170722_SK_HLA_G0103_R1_01 32608 28449 AAAGRIAIPGL 11 Unmodified _AAAGRIAIPGL_ 0 uc002lpr.2;uc002lpq.2;uc002lpp.2 2 CID FTMS MSMS 11 505.31128 1008.608 NaN NaN 9.1541616 72.138 0.35461 15.662 1.8867 NaN NaN 1 0 0 0 32597 334279.59375 1 0 0 y1;y4;y6;a2;b2;b7;b10 17585.08984375;3773.285888671875;47387.72265625;14307.5595703125;3126.1455078125;103400.8515625;2367.97900390625 0.0001248233497790352;-0.0018376640455812776;0.009915978956996696;-0.00043868453276729724;-0.0012951289365901175;-0.004575538846552263;0.004011688101854816 0.9449028581307712;-4.602651610884071;16.997710143135873;-3.8117634975199977;-9.051604693874186;-7.484112246043633;4.566433773179205 132.10178030995021;399.2620343533456;583.3714584785429;115.08702810463276;143.0827991711366;611.3669458887465;878.5166502177981 7 0.06293662075192183 0.045454545454545456 0 0 None Unknown 15.66199446558733;13.775330524044264;13.64772730687463 AAAGRIAIPGL;PVARLFPPL;VPVGRLQGAL _AAAGRIAIPGL_;_PVARLFPPL_;_VPVGRLQGAL_ 2 16295 2 2 2 0.1040544195111579 +GN20170722_SK_HLA_G0103_R1_01 35464 30971 AAAIVFLVDRF 11 Unmodified _AAAIVFLVDRF_ 0 uc011cfx.2;uc003iap.4;uc003ian.4 2 CID FTMS MSMS 3 611.35315 1220.6917 NaN NaN 7.2936771 78.767 0.35461 15.662 1.4552 NaN NaN 1 0 0 0 35461 1150588.5 0.46417705227480194 -2 0.02997589111328125 y5;a2;b2;b4;b5 2324.119140625;7897.48388671875;3402.477783203125;8944.435546875;3646.345458984375 0.00016820457972244185;-0.0002067135477687998;-0.0006781696607163212;0.001769698386226537;-0.0026612948673232495 0.2590286824841407;-1.7961534660218355;-4.739721258319645;5.408598572673599;-6.243159057969467 649.3666188212202;115.08679613364777;143.08218221186073;327.2009121120137;426.2737570214673 5 0.018596876451864083 0.05 0 0 None Unknown 15.66199446558733;14.20677841292237;11.49006305901108 AAAIVFLVDRF;VGAVGHKAANVAK;GRRRGGPPVNR _AAAIVFLVDRF_;_VGAVGHKAANVAK_;_GRRRGGPPVNR_ 3 18307 3 3 3 0.09022553328168215 +GN20170722_SK_HLA_G0103_R2_01 26762 22942 AAALGAVVRLA 11 Unmodified _AAALGAVVRLA_ 0 2 CID FTMS MSMS 3 506.31911 1010.6237 NaN NaN -2.2315064 60.764 0.357 14.865 0.089257 NaN NaN 1 0 0 0 -1 NaN NaN 0 NaN y6;y9;y6-NH3;y9-NH3;b9 6704.11572265625;3121.10546875;5720.6767578125;36383.8828125;2903.078125 -7.786020262301463E-05;0.004498815625538555;-0.003963494129493483;-0.012215085613661358;-0.006804483999644617 -0.1238995059129184;5.173715335129975;-6.482743414813657;-14.327833916408876;-8.40572395662594 628.4141494296026;869.5522142455745;611.3914859620295;852.5423790453137;809.5060026662998 5 0.006509061445123882 0.035211267605633804 0 0 None Unknown + 14.864876125193954;14.775619335231177;14.535294575441029 AAALGAVVRLA;LSGIIKRQP;AAIQILRNL _AAALGAVVRLA_;_LSGIIKRQP_;_AAIQILRNL_ 4 5820 4 4 4 0.1187772846816415 +GN20170722_SK_HLA_G0103_R1_02 36957 32348 AAALLGPSAQVKPS 14 Unmodified _AAALLGPSAQVKPS_ 0 2 CID FTMS MSMS 12 655.37735 1308.7402 NaN NaN 7.953341 82.433 0.44365 16.632 5.1418 NaN NaN 1 0 0 0 36945 2145926.75 0.8411612144806966 5 -0.1116790771484375 y8;a2;b2;b4;b10 4655.591796875;16715.123046875;2262.689453125;5398.7060546875;4745.7470703125 -0.007113766089901219;-0.0003434307455876251;0.0001157588298212886;0.0013808909956765092;-0.004194096411310966 -8.745140500668365;-2.9840985164902825;0.8090418412213772;4.220310224306376;-4.763350700971752 813.4536076758899;115.08693285084559;143.0813882833702;327.2013009194043;880.4928871719113 5 0.0204413445189944 0.03496503496503497 0 0 None Unknown + 16.631834798470333;11.49006305901108;10.82951774543324 AAALLGPSAQVKPS;ILRNHVMVRVG;VGGLVILAMPQVP _AAALLGPSAQVKPS_;_ILRNHVM(Oxidation (M))VRVG_;_VGGLVILAM(Oxidation (M))PQVP_ 5 7349 5 5 5 0.09813002177429553 +GN20170722_SK_HLA_G0103_R1_01 35853 31312 AAALVFFVTA 10 Unmodified _AAALVFFVTA_ 0 tr|A0A097I0R1|A0A097I0R1_HHV3;tr|A0A097I0Y1|A0A097I0Y1_HHV3;tr|G9IXM9|G9IXM9_HHV3;tr|G9IX13|G9IX13_HHV3;tr|Q0QA15|Q0QA15_HHV3;tr|Q0QA88|Q0QA88_HHV3;tr|Q2PJ42|Q2PJ42_HHV3;tr|R4P4T3|R4P4T3_HHV3;tr|G9IXV1|G9IXV1_HHV3;tr|A0A1C9CWW4|A0A1C9CWW4_HHV3;tr|Q0Q9E6|Q0Q9E6_HHV3;tr|Q0Q973|Q0Q973_HHV3;tr|G9IUZ3|G9IUZ3_HHV3;tr|A0A0F7GK62|A0A0F7GK62_HHV3;tr|A0A0F7CV33|A0A0F7CV33_HHV3;tr|A0A0F7CVA6|A0A0F7CVA6_HHV3;tr|A0A0F7CVQ2|A0A0F7CVQ2_HHV3;tr|A0A0F7CW33|A0A0F7CW33_HHV3;tr|Q0Q9L9|Q0Q9L9_HHV3;sp|P09278|LTP_VZVD;tr|I2CNP0|I2CNP0_HHV3;tr|A0A0F7GJA5|A0A0F7GJA5_HHV3;tr|Q6QCN3|Q6QCN3_HHV3;tr|W5RSJ9|W5RSJ9_HHV3;tr|Q0Q9U2|Q0Q9U2_HHV3;tr|Q0Q858|Q0Q858_HHV3;tr|A0A1B1JF83|A0A1B1JF83_HHV3;tr|G0ZLC3|G0ZLC3_HHV3;tr|Q6QCV5|Q6QCV5_HHV3;tr|A8I3I4|A8I3I4_HHV3;tr|G9IUC7|G9IUC7_HHV3;tr|G9IVD7|G9IVD7_HHV3;tr|G9IW03|G9IW03_HHV3;tr|G9IWE7|G9IWE7_HHV3;tr|G9IWL9|G9IWL9_HHV3;tr|G9IXF7|G9IXF7_HHV3;tr|L7X8T0|L7X8T0_HHV3;tr|A0A0F7GIU8|A0A0F7GIU8_HHV3;tr|A0A0F7GKY0|A0A0F7GKY0_HHV3;tr|A0A1C9CX67|A0A1C9CX67_HHV3;tr|U5NQF8|U5NQF8_HHV3;tr|U5NT66|U5NT66_HHV3;tr|U5NPW9|U5NPW9_HHV3;tr|A0A1B1JGR2|A0A1B1JGR2_HHV3;tr|Q0Q8D1|Q0Q8D1_HHV3;tr|A0A075X7J7|A0A075X7J7_HHV3;tr|A0A076N2D1|A0A076N2D1_HHV3;tr|Q0Q8S7|Q0Q8S7_HHV3;tr|A0A1B1JGW3|A0A1B1JGW3_HHV3;tr|W6EHL9|W6EHL9_HHV3;tr|B0CMS9|B0CMS9_HHV3;tr|A0A1B1JFX5|A0A1B1JFX5_HHV3;tr|A0A1B1JG34|A0A1B1JG34_HHV3;tr|A0A075X1N3|A0A075X1N3_HHV3;tr|G9IY23|G9IY23_HHV3;tr|A4GE96|A4GE96_HHV3;tr|U5NSY8|U5NSY8_HHV3;tr|A0A0F7GLI9|A0A0F7GLI9_HHV3;tr|G9IU55|G9IU55_HHV3;tr|A0A0F7GQA2|A0A0F7GQA2_HHV3;tr|U5NTP5|U5NTP5_HHV3;tr|A0A0F7GLZ1|A0A0F7GLZ1_HHV3;sp|Q4JQX9|LTP_VZVO;tr|A0A0F7GNC3|A0A0F7GNC3_HHV3;tr|A0A0F7GNZ4|A0A0F7GNZ4_HHV3;tr|A0A0F7GPS9|A0A0F7GPS9_HHV3;tr|I7D980|I7D980_HHV3;tr|U5NUP5|U5NUP5_HHV3;tr|G9IW75|G9IW75_HHV3;tr|Q0Q8K4|Q0Q8K4_HHV3;tr|A6XEB1|A6XEB1_HHV3;tr|I7DJA6|I7DJA6_HHV3;tr|W6E6K8|W6E6K8_HHV3;tr|W6E684|W6E684_HHV3;tr|Q0Q900|Q0Q900_HHV3 2 CID FTMS MULTI-MSMS 3 0 505.28949 1008.5644 2.236 0.0011298 4.7881027 79.666 0.62222 15.381 0.60553 NaN NaN 1 0 0 0 35850 15369293 1 0 0 y4;y6;y4-H2O;b4 8194.3076171875;9210.5302734375;12286.7236328125;9231.583984375 -0.008737849023532362;-0.006874988596337062;0.0027794625577257648;0.005002444408432893 -19.983727872620598;-10.060225299391472;6.6299842599590635;15.28875271403542 437.2481990962236;683.3831640681964;419.2261170983423;327.1976793659915 4 0.005030056466088641 0.05263157894736842 0 0 None Unknown 15.381154005585476;14.775619335231177;12.988381952877914 AAALVFFVTA;VPLSPPWLT;PTKLYKLM _AAALVFFVTA_;_VPLSPPWLT_;_PTKLYKLM(Oxidation (M))_ 6 11912 6 6 6 0.06047832453987212 +GN20170722_SK_HLA_G0103_R1_01 37465 32769 AAAMAFG 7 Unmodified _AAAMAFG_ 0 tr|U5NU36|U5NU36_HHV3;tr|Q0Q9Y9|Q0Q9Y9_HHV3;tr|I7EGL3|I7EGL3_HHV3;tr|I2CNR6|I2CNR6_HHV3;tr|G9IXB0|G9IXB0_HHV3;tr|G9IVV6|G9IVV6_HHV3;tr|G9IVG2|G9IVG2_HHV3;tr|G9IUM4|G9IUM4_HHV3;tr|B0CMV6|B0CMV6_HHV3;tr|A0A1C9CX38|A0A1C9CX38_HHV3;tr|A0A0F7GMU7|A0A0F7GMU7_HHV3;tr|A0A0F7GKV3|A0A0F7GKV3_HHV3;tr|A0A0F7GJZ8|A0A0F7GJZ8_HHV3;sp|Q4JQS7|AN_VZVO;sp|P09253|AN_VZVD 1 CID FTMS MULTI-MSMS 5 0 638.29666 637.28938 -2.6899 -0.0017169 2.2618452 83.465 0.69237 3.3758 3.3758 NaN NaN 1 0 0 0 37460 49799916 0.6104119988864497 -6 0.15149688720703125 y4 32493.693359375 0.00040581233241709924 0.9544372849576236 425.18491137436763 1 0.00043692510291909703 0.01020408163265306 0 0 None Unknown 3.3758226685180976;0;0 AAAMAFG;PFFGGGG;GFGPFGG _AAAMAFG_;_PFFGGGG_;_GFGPFGG_ 7 11602 7 7 7 -0.0437710322582916 +GN20170722_SK_HLA_G0103_R1_01 4878 3972 AAAMLRK 7 Unmodified _AAAMLRK_ 0 uc001vne.3;uc010tik.1;uc010til.1;uc001vnf.1 2 CID FTMS MSMS 5 380.72854 759.44253 NaN NaN -0.89752058 11.884 1.0478 38.202 10.885 NaN NaN 1 0 0 0 4873 3145282 1 0 0 y3;y4;y3-NH3;y4-NH3;b3 92125.1015625;475564.0625;5908.86328125;26057.03515625;47402.09765625 -0.00042149807143232465;-0.00030228005061871954;-0.0030104841815727923;0.0005637190507741252;-5.349210823624162E-06 -1.012490249173898;-0.5522723208365405;-7.539887044452489;1.0629963883454507;-0.024982464132262953 416.29840067717146;547.3387660653506;399.2744405617816;530.3113509647492;214.11862317921083 5 0.20388849272384799 0.0625 0 0 None Unknown 38.20205643361177;27.3166592096155;27.3166592096155 AAAMLRK;GGVRMLK;GGVLRMK _AAAMLRK_;_GGVRMLK_;_GGVLRMK_ 8 14949 8 8 8 0.05318511890629907 diff --git a/tests/unit_tests/data/msms_internal.csv b/tests/unit_tests/data/msms_internal.csv new file mode 100644 index 0000000..0baf93d --- /dev/null +++ b/tests/unit_tests/data/msms_internal.csv @@ -0,0 +1,10 @@ +RAW_FILE,SCAN_NUMBER,Scan index,SEQUENCE,Length,Missed cleavages,Modifications,MODIFIED_SEQUENCE,Oxidation (M) Probabilities,Oxidation (M) Score diffs,Oxidation (M),PROTEINS,PRECURSOR_CHARGE,Fragmentation,Mass analyzer,Type,SCAN_EVENT_NUMBER,Isotope index,m/z,MASS,Mass error [ppm],Mass error [Da],Simple mass error [ppm],Retention time,PEP,SCORE,Delta score,Score diff,Localization prob,Combinatorics,PIF,Fraction of total spectrum,Base peak fraction,Precursor full scan number,Precursor Intensity,Precursor apex fraction,Precursor apex offset,Precursor apex offset time,Matches,Intensities,Mass deviations [Da],Mass deviations [ppm],Masses,Number of matches,Intensity coverage,Peak coverage,Unfragmented precursor intensity,Unfragmented precursor fraction,Neutral loss level,ETD identification type,REVERSE,All scores,All sequences,All modified sequences,MS3 scan numbers,Reporter PIF,Reporter fraction,id,Protein group IDs,Peptide ID,Mod. peptide ID,Evidence ID,Oxidation (M) site IDs,Mass deficit,PEPTIDE_LENGTH +GN20170722_SK_HLA_G0103_R1_02,39581,34737,AAAAAVAGVGRGG,13,,Unmodified,AAAAAVAGVGRGG,,,0,uc009yqv.3;uc001ogi.3;C11orf68_p.V135L_uc001ogi.2|C11orf68,2,CID,FTMS,MULTI-MSMS,8,0.0,514.2858,1026.557,3.036,0.0015614,8.3832641,88.186,0.71744,13.77,3.3555,,,1,0,0,0,39573,1384891.5,0.7223182599737116,-3,0.0863265991210937,y5;y7;a2;b9,2905.48046875;9943.1416015625;3013.589599609375;37873.2109375,-0.002829233181444124;0.008368573916527566;-0.00031301558594520884;-0.00275202985596934,-6.354192112149343;14.597148465958854;-2.7198193653715848;-4.0329222465724435,445.2545865011814;573.3019662054834;115.08690243568594;682.391002779256,4,0.0161473114556132,0.0439560439560439,0,0,,Unknown,False,13.769603755808198;10.41414173872871;9.986874898151301,AAAAAVAGVGRGG;RGLQQPVVM;PLFPVAGVPM,_AAAAAVAGVGRGG_;_RGLQQPVVM_;_PLFPVAGVPM_,,,,0,153,0,0,0,,0.04482492906277,13 +GN20170722_SK_HLA_G0103_R1_01,24282,21318,AAADRNLIYVLK,12,,Unmodified,AAADRNLIYVLK,,,0,UNKNOWN,2,CID,FTMS,MULTI-MSMS,8,1.0,673.89317,1345.7718,-4.268,-0.0028761,747.2514,52.702,0.56055,28.727,0.32818,,,1,0,0,0,24274,4397484.0,0.3726964352547448,10,-0.2357139587402343,y1;y2;y3;y4;y5;y7;y2-NH3;a2,3799.669677734375;138293.265625;18070.681640625;58923.94140625;107806.1171875;70635.7734375;15442.98046875;9986.8359375,-0.000463241690255245;0.00019436417716178767;0.003219772849433866;0.0019772452006918684;0.0012221122285609454;0.002896620233173053;0.0009968129016613148;-0.000993683999936934,-3.148877721252655;0.7469894765873476;8.962184397850063;3.785457364047471;1.9233399457992681;3.358257103880474;4.099254348751434;-8.634154729256231,147.11326741229024;260.1966737868229;359.26206229435064;522.3266333602993;635.4114524736714;862.5367693932668;243.1693222365984;115.08758310409993,8,0.0663361112164589,0.074074074074074,0,0,,Unknown,True,28.726836482416193;28.398655567304367;27.254056396965822,AAADRNLIYVLK;LVRGIAHIFSPH;LVYQEINLARK,_AAADRNLIYVLK_;_LVRGIAHIFSPH_;_LVYQEINLARK_,,,,1,6213,1,1,1,,0.1127299493136888,12 +GN20170722_SK_HLA_G0103_R1_01,32608,28449,AAAGRIAIPGL,11,,Unmodified,AAAGRIAIPGL,,,0,uc002lpr.2;uc002lpq.2;uc002lpp.2,2,CID,FTMS,MSMS,11,,505.31128,1008.608,,,9.1541616,72.138,0.35461,15.662,1.8867,,,1,0,0,0,32597,334279.59375,1.0,0,0.0,y1;y4;y6;a2;b2;b7;b10,17585.08984375;3773.285888671875;47387.72265625;14307.5595703125;3126.1455078125;103400.8515625;2367.97900390625,0.0001248233497790352;-0.0018376640455812776;0.009915978956996696;-0.00043868453276729724;-0.0012951289365901175;-0.004575538846552263;0.004011688101854816,0.9449028581307712;-4.602651610884071;16.997710143135873;-3.8117634975199977;-9.051604693874186;-7.484112246043633;4.566433773179205,132.10178030995021;399.2620343533456;583.3714584785429;115.08702810463276;143.0827991711366;611.3669458887465;878.5166502177981,7,0.0629366207519218,0.0454545454545454,0,0,,Unknown,False,15.66199446558733;13.775330524044264;13.64772730687463,AAAGRIAIPGL;PVARLFPPL;VPVGRLQGAL,_AAAGRIAIPGL_;_PVARLFPPL_;_VPVGRLQGAL_,,,,2,16295,2,2,2,,0.1040544195111579,11 +GN20170722_SK_HLA_G0103_R1_01,35464,30971,AAAIVFLVDRF,11,,Unmodified,AAAIVFLVDRF,,,0,uc011cfx.2;uc003iap.4;uc003ian.4,2,CID,FTMS,MSMS,3,,611.35315,1220.6917,,,7.2936771,78.767,0.35461,15.662,1.4552,,,1,0,0,0,35461,1150588.5,0.4641770522748019,-2,0.0299758911132812,y5;a2;b2;b4;b5,2324.119140625;7897.48388671875;3402.477783203125;8944.435546875;3646.345458984375,0.00016820457972244185;-0.0002067135477687998;-0.0006781696607163212;0.001769698386226537;-0.0026612948673232495,0.2590286824841407;-1.7961534660218355;-4.739721258319645;5.408598572673599;-6.243159057969467,649.3666188212202;115.08679613364777;143.08218221186073;327.2009121120137;426.2737570214673,5,0.018596876451864,0.05,0,0,,Unknown,False,15.66199446558733;14.20677841292237;11.49006305901108,AAAIVFLVDRF;VGAVGHKAANVAK;GRRRGGPPVNR,_AAAIVFLVDRF_;_VGAVGHKAANVAK_;_GRRRGGPPVNR_,,,,3,18307,3,3,3,,0.0902255332816821,11 +GN20170722_SK_HLA_G0103_R2_01,26762,22942,AAALGAVVRLA,11,,Unmodified,AAALGAVVRLA,,,0,UNKNOWN,2,CID,FTMS,MSMS,3,,506.31911,1010.6237,,,-2.2315064,60.764,0.357,14.865,0.089257,,,1,0,0,0,-1,,,0,,y6;y9;y6-NH3;y9-NH3;b9,6704.11572265625;3121.10546875;5720.6767578125;36383.8828125;2903.078125,-7.786020262301463E-05;0.004498815625538555;-0.003963494129493483;-0.012215085613661358;-0.006804483999644617,-0.1238995059129184;5.173715335129975;-6.482743414813657;-14.327833916408876;-8.40572395662594,628.4141494296026;869.5522142455745;611.3914859620295;852.5423790453137;809.5060026662998,5,0.0065090614451238,0.0352112676056338,0,0,,Unknown,True,14.864876125193954;14.775619335231177;14.535294575441029,AAALGAVVRLA;LSGIIKRQP;AAIQILRNL,_AAALGAVVRLA_;_LSGIIKRQP_;_AAIQILRNL_,,,,4,5820,4,4,4,,0.1187772846816415,11 +GN20170722_SK_HLA_G0103_R1_02,36957,32348,AAALLGPSAQVKPS,14,,Unmodified,AAALLGPSAQVKPS,,,0,UNKNOWN,2,CID,FTMS,MSMS,12,,655.37735,1308.7402,,,7.953341,82.433,0.44365,16.632,5.1418,,,1,0,0,0,36945,2145926.75,0.8411612144806966,5,-0.1116790771484375,y8;a2;b2;b4;b10,4655.591796875;16715.123046875;2262.689453125;5398.7060546875;4745.7470703125,-0.007113766089901219;-0.0003434307455876251;0.0001157588298212886;0.0013808909956765092;-0.004194096411310966,-8.745140500668365;-2.9840985164902825;0.8090418412213772;4.220310224306376;-4.763350700971752,813.4536076758899;115.08693285084559;143.0813882833702;327.2013009194043;880.4928871719113,5,0.0204413445189944,0.0349650349650349,0,0,,Unknown,True,16.631834798470333;11.49006305901108;10.82951774543324,AAALLGPSAQVKPS;ILRNHVMVRVG;VGGLVILAMPQVP,_AAALLGPSAQVKPS_;_ILRNHVM(Oxidation (M))VRVG_;_VGGLVILAM(Oxidation (M))PQVP_,,,,5,7349,5,5,5,,0.0981300217742955,14 +GN20170722_SK_HLA_G0103_R1_01,35853,31312,AAALVFFVTA,10,,Unmodified,AAALVFFVTA,,,0,tr|A0A097I0R1|A0A097I0R1_HHV3;tr|A0A097I0Y1|A0A097I0Y1_HHV3;tr|G9IXM9|G9IXM9_HHV3;tr|G9IX13|G9IX13_HHV3;tr|Q0QA15|Q0QA15_HHV3;tr|Q0QA88|Q0QA88_HHV3;tr|Q2PJ42|Q2PJ42_HHV3;tr|R4P4T3|R4P4T3_HHV3;tr|G9IXV1|G9IXV1_HHV3;tr|A0A1C9CWW4|A0A1C9CWW4_HHV3;tr|Q0Q9E6|Q0Q9E6_HHV3;tr|Q0Q973|Q0Q973_HHV3;tr|G9IUZ3|G9IUZ3_HHV3;tr|A0A0F7GK62|A0A0F7GK62_HHV3;tr|A0A0F7CV33|A0A0F7CV33_HHV3;tr|A0A0F7CVA6|A0A0F7CVA6_HHV3;tr|A0A0F7CVQ2|A0A0F7CVQ2_HHV3;tr|A0A0F7CW33|A0A0F7CW33_HHV3;tr|Q0Q9L9|Q0Q9L9_HHV3;sp|P09278|LTP_VZVD;tr|I2CNP0|I2CNP0_HHV3;tr|A0A0F7GJA5|A0A0F7GJA5_HHV3;tr|Q6QCN3|Q6QCN3_HHV3;tr|W5RSJ9|W5RSJ9_HHV3;tr|Q0Q9U2|Q0Q9U2_HHV3;tr|Q0Q858|Q0Q858_HHV3;tr|A0A1B1JF83|A0A1B1JF83_HHV3;tr|G0ZLC3|G0ZLC3_HHV3;tr|Q6QCV5|Q6QCV5_HHV3;tr|A8I3I4|A8I3I4_HHV3;tr|G9IUC7|G9IUC7_HHV3;tr|G9IVD7|G9IVD7_HHV3;tr|G9IW03|G9IW03_HHV3;tr|G9IWE7|G9IWE7_HHV3;tr|G9IWL9|G9IWL9_HHV3;tr|G9IXF7|G9IXF7_HHV3;tr|L7X8T0|L7X8T0_HHV3;tr|A0A0F7GIU8|A0A0F7GIU8_HHV3;tr|A0A0F7GKY0|A0A0F7GKY0_HHV3;tr|A0A1C9CX67|A0A1C9CX67_HHV3;tr|U5NQF8|U5NQF8_HHV3;tr|U5NT66|U5NT66_HHV3;tr|U5NPW9|U5NPW9_HHV3;tr|A0A1B1JGR2|A0A1B1JGR2_HHV3;tr|Q0Q8D1|Q0Q8D1_HHV3;tr|A0A075X7J7|A0A075X7J7_HHV3;tr|A0A076N2D1|A0A076N2D1_HHV3;tr|Q0Q8S7|Q0Q8S7_HHV3;tr|A0A1B1JGW3|A0A1B1JGW3_HHV3;tr|W6EHL9|W6EHL9_HHV3;tr|B0CMS9|B0CMS9_HHV3;tr|A0A1B1JFX5|A0A1B1JFX5_HHV3;tr|A0A1B1JG34|A0A1B1JG34_HHV3;tr|A0A075X1N3|A0A075X1N3_HHV3;tr|G9IY23|G9IY23_HHV3;tr|A4GE96|A4GE96_HHV3;tr|U5NSY8|U5NSY8_HHV3;tr|A0A0F7GLI9|A0A0F7GLI9_HHV3;tr|G9IU55|G9IU55_HHV3;tr|A0A0F7GQA2|A0A0F7GQA2_HHV3;tr|U5NTP5|U5NTP5_HHV3;tr|A0A0F7GLZ1|A0A0F7GLZ1_HHV3;sp|Q4JQX9|LTP_VZVO;tr|A0A0F7GNC3|A0A0F7GNC3_HHV3;tr|A0A0F7GNZ4|A0A0F7GNZ4_HHV3;tr|A0A0F7GPS9|A0A0F7GPS9_HHV3;tr|I7D980|I7D980_HHV3;tr|U5NUP5|U5NUP5_HHV3;tr|G9IW75|G9IW75_HHV3;tr|Q0Q8K4|Q0Q8K4_HHV3;tr|A6XEB1|A6XEB1_HHV3;tr|I7DJA6|I7DJA6_HHV3;tr|W6E6K8|W6E6K8_HHV3;tr|W6E684|W6E684_HHV3;tr|Q0Q900|Q0Q900_HHV3,2,CID,FTMS,MULTI-MSMS,3,0.0,505.28949,1008.5644,2.236,0.0011298,4.7881027,79.666,0.62222,15.381,0.60553,,,1,0,0,0,35850,15369293.0,1.0,0,0.0,y4;y6;y4-H2O;b4,8194.3076171875;9210.5302734375;12286.7236328125;9231.583984375,-0.008737849023532362;-0.006874988596337062;0.0027794625577257648;0.005002444408432893,-19.983727872620598;-10.060225299391472;6.6299842599590635;15.28875271403542,437.2481990962236;683.3831640681964;419.2261170983423;327.1976793659915,4,0.0050300564660886,0.0526315789473684,0,0,,Unknown,False,15.381154005585476;14.775619335231177;12.988381952877914,AAALVFFVTA;VPLSPPWLT;PTKLYKLM,_AAALVFFVTA_;_VPLSPPWLT_;_PTKLYKLM(Oxidation (M))_,,,,6,11912,6,6,6,,0.0604783245398721,10 +GN20170722_SK_HLA_G0103_R1_01,37465,32769,AAAMAFG,7,,Unmodified,AAAMAFG,,,0,tr|U5NU36|U5NU36_HHV3;tr|Q0Q9Y9|Q0Q9Y9_HHV3;tr|I7EGL3|I7EGL3_HHV3;tr|I2CNR6|I2CNR6_HHV3;tr|G9IXB0|G9IXB0_HHV3;tr|G9IVV6|G9IVV6_HHV3;tr|G9IVG2|G9IVG2_HHV3;tr|G9IUM4|G9IUM4_HHV3;tr|B0CMV6|B0CMV6_HHV3;tr|A0A1C9CX38|A0A1C9CX38_HHV3;tr|A0A0F7GMU7|A0A0F7GMU7_HHV3;tr|A0A0F7GKV3|A0A0F7GKV3_HHV3;tr|A0A0F7GJZ8|A0A0F7GJZ8_HHV3;sp|Q4JQS7|AN_VZVO;sp|P09253|AN_VZVD,1,CID,FTMS,MULTI-MSMS,5,0.0,638.29666,637.28938,-2.6899,-0.0017169,2.2618452,83.465,0.69237,3.3758,3.3758,,,1,0,0,0,37460,49799916.0,0.6104119988864497,-6,0.1514968872070312,y4,32493.693359375,0.00040581233241709924,0.9544372849576236,425.18491137436763,1,0.000436925102919,0.010204081632653,0,0,,Unknown,False,3.3758226685180976;0;0,AAAMAFG;PFFGGGG;GFGPFGG,_AAAMAFG_;_PFFGGGG_;_GFGPFGG_,,,,7,11602,7,7,7,,-0.0437710322582916,7 +GN20170722_SK_HLA_G0103_R1_01,4878,3972,AAAMLRK,7,,Unmodified,AAAMLRK,,,0,uc001vne.3;uc010tik.1;uc010til.1;uc001vnf.1,2,CID,FTMS,MSMS,5,,380.72854,759.44253,,,-0.89752058,11.884,1.0478,38.202,10.885,,,1,0,0,0,4873,3145282.0,1.0,0,0.0,y3;y4;y3-NH3;y4-NH3;b3,92125.1015625;475564.0625;5908.86328125;26057.03515625;47402.09765625,-0.00042149807143232465;-0.00030228005061871954;-0.0030104841815727923;0.0005637190507741252;-5.349210823624162E-06,-1.012490249173898;-0.5522723208365405;-7.539887044452489;1.0629963883454507;-0.024982464132262953,416.29840067717146;547.3387660653506;399.2744405617816;530.3113509647492;214.11862317921083,5,0.2038884927238479,0.0625,0,0,,Unknown,False,38.20205643361177;27.3166592096155;27.3166592096155,AAAMLRK;GGVRMLK;GGVLRMK,_AAAMLRK_;_GGVRMLK_;_GGVLRMK_,,,,8,14949,8,8,8,,0.053185118906299,7 diff --git a/tests/unit_tests/data/msms_internal_tmt.csv b/tests/unit_tests/data/msms_internal_tmt.csv new file mode 100644 index 0000000..ba520ac --- /dev/null +++ b/tests/unit_tests/data/msms_internal_tmt.csv @@ -0,0 +1,10 @@ +RAW_FILE,SCAN_NUMBER,Scan index,SEQUENCE,Length,Missed cleavages,Modifications,MODIFIED_SEQUENCE,Oxidation (M) Probabilities,Oxidation (M) Score diffs,Oxidation (M),PROTEINS,PRECURSOR_CHARGE,Fragmentation,Mass analyzer,Type,SCAN_EVENT_NUMBER,Isotope index,m/z,MASS,Mass error [ppm],Mass error [Da],Simple mass error [ppm],Retention time,PEP,SCORE,Delta score,Score diff,Localization prob,Combinatorics,PIF,Fraction of total spectrum,Base peak fraction,Precursor full scan number,Precursor Intensity,Precursor apex fraction,Precursor apex offset,Precursor apex offset time,Matches,Intensities,Mass deviations [Da],Mass deviations [ppm],Masses,Number of matches,Intensity coverage,Peak coverage,Unfragmented precursor intensity,Unfragmented precursor fraction,Neutral loss level,ETD identification type,REVERSE,All scores,All sequences,All modified sequences,MS3 scan numbers,Reporter PIF,Reporter fraction,id,Protein group IDs,Peptide ID,Mod. peptide ID,Evidence ID,Oxidation (M) site IDs,Mass deficit,PEPTIDE_LENGTH +GN20170722_SK_HLA_G0103_R1_02,39581,34737,AAAAAVAGVGRGG,13,,Unmodified,[UNIMOD:2016]-AAAAAVAGVGRGG,,,0,uc009yqv.3;uc001ogi.3;C11orf68_p.V135L_uc001ogi.2|C11orf68,2,CID,FTMS,MULTI-MSMS,8,0.0,514.2858,1026.557,3.036,0.0015614,8.3832641,88.186,0.71744,13.77,3.3555,,,1,0,0,0,39573,1384891.5,0.7223182599737116,-3,0.0863265991210937,y5;y7;a2;b9,2905.48046875;9943.1416015625;3013.589599609375;37873.2109375,-0.002829233181444124;0.008368573916527566;-0.00031301558594520884;-0.00275202985596934,-6.354192112149343;14.597148465958854;-2.7198193653715848;-4.0329222465724435,445.2545865011814;573.3019662054834;115.08690243568594;682.391002779256,4,0.0161473114556132,0.0439560439560439,0,0,,Unknown,False,13.769603755808198;10.41414173872871;9.986874898151301,AAAAAVAGVGRGG;RGLQQPVVM;PLFPVAGVPM,_AAAAAVAGVGRGG_;_RGLQQPVVM_;_PLFPVAGVPM_,,,,0,153,0,0,0,,0.04482492906277,13 +GN20170722_SK_HLA_G0103_R1_01,24282,21318,AAADRNLIYVLK,12,,Unmodified,[UNIMOD:2016]-AAADRNLIYVLK[UNIMOD:2016],,,0,UNKNOWN,2,CID,FTMS,MULTI-MSMS,8,1.0,673.89317,1345.7718,-4.268,-0.0028761,747.2514,52.702,0.56055,28.727,0.32818,,,1,0,0,0,24274,4397484.0,0.3726964352547448,10,-0.2357139587402343,y1;y2;y3;y4;y5;y7;y2-NH3;a2,3799.669677734375;138293.265625;18070.681640625;58923.94140625;107806.1171875;70635.7734375;15442.98046875;9986.8359375,-0.000463241690255245;0.00019436417716178767;0.003219772849433866;0.0019772452006918684;0.0012221122285609454;0.002896620233173053;0.0009968129016613148;-0.000993683999936934,-3.148877721252655;0.7469894765873476;8.962184397850063;3.785457364047471;1.9233399457992681;3.358257103880474;4.099254348751434;-8.634154729256231,147.11326741229024;260.1966737868229;359.26206229435064;522.3266333602993;635.4114524736714;862.5367693932668;243.1693222365984;115.08758310409993,8,0.0663361112164589,0.074074074074074,0,0,,Unknown,True,28.726836482416193;28.398655567304367;27.254056396965822,AAADRNLIYVLK;LVRGIAHIFSPH;LVYQEINLARK,_AAADRNLIYVLK_;_LVRGIAHIFSPH_;_LVYQEINLARK_,,,,1,6213,1,1,1,,0.1127299493136888,12 +GN20170722_SK_HLA_G0103_R1_01,32608,28449,AAAGRIAIPGL,11,,Unmodified,[UNIMOD:2016]-AAAGRIAIPGL,,,0,uc002lpr.2;uc002lpq.2;uc002lpp.2,2,CID,FTMS,MSMS,11,,505.31128,1008.608,,,9.1541616,72.138,0.35461,15.662,1.8867,,,1,0,0,0,32597,334279.59375,1.0,0,0.0,y1;y4;y6;a2;b2;b7;b10,17585.08984375;3773.285888671875;47387.72265625;14307.5595703125;3126.1455078125;103400.8515625;2367.97900390625,0.0001248233497790352;-0.0018376640455812776;0.009915978956996696;-0.00043868453276729724;-0.0012951289365901175;-0.004575538846552263;0.004011688101854816,0.9449028581307712;-4.602651610884071;16.997710143135873;-3.8117634975199977;-9.051604693874186;-7.484112246043633;4.566433773179205,132.10178030995021;399.2620343533456;583.3714584785429;115.08702810463276;143.0827991711366;611.3669458887465;878.5166502177981,7,0.0629366207519218,0.0454545454545454,0,0,,Unknown,False,15.66199446558733;13.775330524044264;13.64772730687463,AAAGRIAIPGL;PVARLFPPL;VPVGRLQGAL,_AAAGRIAIPGL_;_PVARLFPPL_;_VPVGRLQGAL_,,,,2,16295,2,2,2,,0.1040544195111579,11 +GN20170722_SK_HLA_G0103_R1_01,35464,30971,AAAIVFLVDRF,11,,Unmodified,[UNIMOD:2016]-AAAIVFLVDRF,,,0,uc011cfx.2;uc003iap.4;uc003ian.4,2,CID,FTMS,MSMS,3,,611.35315,1220.6917,,,7.2936771,78.767,0.35461,15.662,1.4552,,,1,0,0,0,35461,1150588.5,0.4641770522748019,-2,0.0299758911132812,y5;a2;b2;b4;b5,2324.119140625;7897.48388671875;3402.477783203125;8944.435546875;3646.345458984375,0.00016820457972244185;-0.0002067135477687998;-0.0006781696607163212;0.001769698386226537;-0.0026612948673232495,0.2590286824841407;-1.7961534660218355;-4.739721258319645;5.408598572673599;-6.243159057969467,649.3666188212202;115.08679613364777;143.08218221186073;327.2009121120137;426.2737570214673,5,0.018596876451864,0.05,0,0,,Unknown,False,15.66199446558733;14.20677841292237;11.49006305901108,AAAIVFLVDRF;VGAVGHKAANVAK;GRRRGGPPVNR,_AAAIVFLVDRF_;_VGAVGHKAANVAK_;_GRRRGGPPVNR_,,,,3,18307,3,3,3,,0.0902255332816821,11 +GN20170722_SK_HLA_G0103_R2_01,26762,22942,AAALGAVVRLA,11,,Unmodified,[UNIMOD:2016]-AAALGAVVRLA,,,0,UNKNOWN,2,CID,FTMS,MSMS,3,,506.31911,1010.6237,,,-2.2315064,60.764,0.357,14.865,0.089257,,,1,0,0,0,-1,,,0,,y6;y9;y6-NH3;y9-NH3;b9,6704.11572265625;3121.10546875;5720.6767578125;36383.8828125;2903.078125,-7.786020262301463E-05;0.004498815625538555;-0.003963494129493483;-0.012215085613661358;-0.006804483999644617,-0.1238995059129184;5.173715335129975;-6.482743414813657;-14.327833916408876;-8.40572395662594,628.4141494296026;869.5522142455745;611.3914859620295;852.5423790453137;809.5060026662998,5,0.0065090614451238,0.0352112676056338,0,0,,Unknown,True,14.864876125193954;14.775619335231177;14.535294575441029,AAALGAVVRLA;LSGIIKRQP;AAIQILRNL,_AAALGAVVRLA_;_LSGIIKRQP_;_AAIQILRNL_,,,,4,5820,4,4,4,,0.1187772846816415,11 +GN20170722_SK_HLA_G0103_R1_02,36957,32348,AAALLGPSAQVKPS,14,,Unmodified,[UNIMOD:2016]-AAALLGPSAQVK[UNIMOD:2016]PS,,,0,UNKNOWN,2,CID,FTMS,MSMS,12,,655.37735,1308.7402,,,7.953341,82.433,0.44365,16.632,5.1418,,,1,0,0,0,36945,2145926.75,0.8411612144806966,5,-0.1116790771484375,y8;a2;b2;b4;b10,4655.591796875;16715.123046875;2262.689453125;5398.7060546875;4745.7470703125,-0.007113766089901219;-0.0003434307455876251;0.0001157588298212886;0.0013808909956765092;-0.004194096411310966,-8.745140500668365;-2.9840985164902825;0.8090418412213772;4.220310224306376;-4.763350700971752,813.4536076758899;115.08693285084559;143.0813882833702;327.2013009194043;880.4928871719113,5,0.0204413445189944,0.0349650349650349,0,0,,Unknown,True,16.631834798470333;11.49006305901108;10.82951774543324,AAALLGPSAQVKPS;ILRNHVMVRVG;VGGLVILAMPQVP,_AAALLGPSAQVKPS_;_ILRNHVM(Oxidation (M))VRVG_;_VGGLVILAM(Oxidation (M))PQVP_,,,,5,7349,5,5,5,,0.0981300217742955,14 +GN20170722_SK_HLA_G0103_R1_01,35853,31312,AAALVFFVTA,10,,Unmodified,[UNIMOD:2016]-AAALVFFVTA,,,0,tr|A0A097I0R1|A0A097I0R1_HHV3;tr|A0A097I0Y1|A0A097I0Y1_HHV3;tr|G9IXM9|G9IXM9_HHV3;tr|G9IX13|G9IX13_HHV3;tr|Q0QA15|Q0QA15_HHV3;tr|Q0QA88|Q0QA88_HHV3;tr|Q2PJ42|Q2PJ42_HHV3;tr|R4P4T3|R4P4T3_HHV3;tr|G9IXV1|G9IXV1_HHV3;tr|A0A1C9CWW4|A0A1C9CWW4_HHV3;tr|Q0Q9E6|Q0Q9E6_HHV3;tr|Q0Q973|Q0Q973_HHV3;tr|G9IUZ3|G9IUZ3_HHV3;tr|A0A0F7GK62|A0A0F7GK62_HHV3;tr|A0A0F7CV33|A0A0F7CV33_HHV3;tr|A0A0F7CVA6|A0A0F7CVA6_HHV3;tr|A0A0F7CVQ2|A0A0F7CVQ2_HHV3;tr|A0A0F7CW33|A0A0F7CW33_HHV3;tr|Q0Q9L9|Q0Q9L9_HHV3;sp|P09278|LTP_VZVD;tr|I2CNP0|I2CNP0_HHV3;tr|A0A0F7GJA5|A0A0F7GJA5_HHV3;tr|Q6QCN3|Q6QCN3_HHV3;tr|W5RSJ9|W5RSJ9_HHV3;tr|Q0Q9U2|Q0Q9U2_HHV3;tr|Q0Q858|Q0Q858_HHV3;tr|A0A1B1JF83|A0A1B1JF83_HHV3;tr|G0ZLC3|G0ZLC3_HHV3;tr|Q6QCV5|Q6QCV5_HHV3;tr|A8I3I4|A8I3I4_HHV3;tr|G9IUC7|G9IUC7_HHV3;tr|G9IVD7|G9IVD7_HHV3;tr|G9IW03|G9IW03_HHV3;tr|G9IWE7|G9IWE7_HHV3;tr|G9IWL9|G9IWL9_HHV3;tr|G9IXF7|G9IXF7_HHV3;tr|L7X8T0|L7X8T0_HHV3;tr|A0A0F7GIU8|A0A0F7GIU8_HHV3;tr|A0A0F7GKY0|A0A0F7GKY0_HHV3;tr|A0A1C9CX67|A0A1C9CX67_HHV3;tr|U5NQF8|U5NQF8_HHV3;tr|U5NT66|U5NT66_HHV3;tr|U5NPW9|U5NPW9_HHV3;tr|A0A1B1JGR2|A0A1B1JGR2_HHV3;tr|Q0Q8D1|Q0Q8D1_HHV3;tr|A0A075X7J7|A0A075X7J7_HHV3;tr|A0A076N2D1|A0A076N2D1_HHV3;tr|Q0Q8S7|Q0Q8S7_HHV3;tr|A0A1B1JGW3|A0A1B1JGW3_HHV3;tr|W6EHL9|W6EHL9_HHV3;tr|B0CMS9|B0CMS9_HHV3;tr|A0A1B1JFX5|A0A1B1JFX5_HHV3;tr|A0A1B1JG34|A0A1B1JG34_HHV3;tr|A0A075X1N3|A0A075X1N3_HHV3;tr|G9IY23|G9IY23_HHV3;tr|A4GE96|A4GE96_HHV3;tr|U5NSY8|U5NSY8_HHV3;tr|A0A0F7GLI9|A0A0F7GLI9_HHV3;tr|G9IU55|G9IU55_HHV3;tr|A0A0F7GQA2|A0A0F7GQA2_HHV3;tr|U5NTP5|U5NTP5_HHV3;tr|A0A0F7GLZ1|A0A0F7GLZ1_HHV3;sp|Q4JQX9|LTP_VZVO;tr|A0A0F7GNC3|A0A0F7GNC3_HHV3;tr|A0A0F7GNZ4|A0A0F7GNZ4_HHV3;tr|A0A0F7GPS9|A0A0F7GPS9_HHV3;tr|I7D980|I7D980_HHV3;tr|U5NUP5|U5NUP5_HHV3;tr|G9IW75|G9IW75_HHV3;tr|Q0Q8K4|Q0Q8K4_HHV3;tr|A6XEB1|A6XEB1_HHV3;tr|I7DJA6|I7DJA6_HHV3;tr|W6E6K8|W6E6K8_HHV3;tr|W6E684|W6E684_HHV3;tr|Q0Q900|Q0Q900_HHV3,2,CID,FTMS,MULTI-MSMS,3,0.0,505.28949,1008.5644,2.236,0.0011298,4.7881027,79.666,0.62222,15.381,0.60553,,,1,0,0,0,35850,15369293.0,1.0,0,0.0,y4;y6;y4-H2O;b4,8194.3076171875;9210.5302734375;12286.7236328125;9231.583984375,-0.008737849023532362;-0.006874988596337062;0.0027794625577257648;0.005002444408432893,-19.983727872620598;-10.060225299391472;6.6299842599590635;15.28875271403542,437.2481990962236;683.3831640681964;419.2261170983423;327.1976793659915,4,0.0050300564660886,0.0526315789473684,0,0,,Unknown,False,15.381154005585476;14.775619335231177;12.988381952877914,AAALVFFVTA;VPLSPPWLT;PTKLYKLM,_AAALVFFVTA_;_VPLSPPWLT_;_PTKLYKLM(Oxidation (M))_,,,,6,11912,6,6,6,,0.0604783245398721,10 +GN20170722_SK_HLA_G0103_R1_01,37465,32769,AAAMAFG,7,,Unmodified,[UNIMOD:2016]-AAAMAFG,,,0,tr|U5NU36|U5NU36_HHV3;tr|Q0Q9Y9|Q0Q9Y9_HHV3;tr|I7EGL3|I7EGL3_HHV3;tr|I2CNR6|I2CNR6_HHV3;tr|G9IXB0|G9IXB0_HHV3;tr|G9IVV6|G9IVV6_HHV3;tr|G9IVG2|G9IVG2_HHV3;tr|G9IUM4|G9IUM4_HHV3;tr|B0CMV6|B0CMV6_HHV3;tr|A0A1C9CX38|A0A1C9CX38_HHV3;tr|A0A0F7GMU7|A0A0F7GMU7_HHV3;tr|A0A0F7GKV3|A0A0F7GKV3_HHV3;tr|A0A0F7GJZ8|A0A0F7GJZ8_HHV3;sp|Q4JQS7|AN_VZVO;sp|P09253|AN_VZVD,1,CID,FTMS,MULTI-MSMS,5,0.0,638.29666,637.28938,-2.6899,-0.0017169,2.2618452,83.465,0.69237,3.3758,3.3758,,,1,0,0,0,37460,49799916.0,0.6104119988864497,-6,0.1514968872070312,y4,32493.693359375,0.00040581233241709924,0.9544372849576236,425.18491137436763,1,0.000436925102919,0.010204081632653,0,0,,Unknown,False,3.3758226685180976;0;0,AAAMAFG;PFFGGGG;GFGPFGG,_AAAMAFG_;_PFFGGGG_;_GFGPFGG_,,,,7,11602,7,7,7,,-0.0437710322582916,7 +GN20170722_SK_HLA_G0103_R1_01,4878,3972,AAAMLRK,7,,Unmodified,[UNIMOD:2016]-AAAMLRK[UNIMOD:2016],,,0,uc001vne.3;uc010tik.1;uc010til.1;uc001vnf.1,2,CID,FTMS,MSMS,5,,380.72854,759.44253,,,-0.89752058,11.884,1.0478,38.202,10.885,,,1,0,0,0,4873,3145282.0,1.0,0,0.0,y3;y4;y3-NH3;y4-NH3;b3,92125.1015625;475564.0625;5908.86328125;26057.03515625;47402.09765625,-0.00042149807143232465;-0.00030228005061871954;-0.0030104841815727923;0.0005637190507741252;-5.349210823624162E-06,-1.012490249173898;-0.5522723208365405;-7.539887044452489;1.0629963883454507;-0.024982464132262953,416.29840067717146;547.3387660653506;399.2744405617816;530.3113509647492;214.11862317921083,5,0.2038884927238479,0.0625,0,0,,Unknown,False,38.20205643361177;27.3166592096155;27.3166592096155,AAAMLRK;GGVRMLK;GGVLRMK,_AAAMLRK_;_GGVRMLK_;_GGVLRMK_,,,,8,14949,8,8,8,,0.053185118906299,7 diff --git a/tests/unit_tests/data/psm_mods.pepXML b/tests/unit_tests/data/psm_mods.pepXML index b2a5ca5..a33aed5 100644 --- a/tests/unit_tests/data/psm_mods.pepXML +++ b/tests/unit_tests/data/psm_mods.pepXML @@ -127,7 +127,7 @@ - + diff --git a/tests/unit_tests/test_maxquant.py b/tests/unit_tests/test_maxquant.py index 4bb4783..03cb6cf 100644 --- a/tests/unit_tests/test_maxquant.py +++ b/tests/unit_tests/test_maxquant.py @@ -1,12 +1,28 @@ import io +import unittest +from pathlib import Path import numpy as np import pandas as pd import pytest -import spectrum_io.search_result.maxquant as mq +from spectrum_io.search_result.maxquant import MaxQuant from spectrum_io.search_result.search_results import filter_valid_prosit_sequences +COLUMNS = [ + "RAW_FILE", + "SCAN_NUMBER", + "MODIFIED_SEQUENCE", + "PRECURSOR_CHARGE", + "SCAN_EVENT_NUMBER", + "MASS", + "SCORE", + "REVERSE", + "SEQUENCE", + "PEPTIDE_LENGTH", + "PROTEINS", +] + class TestAddTMTMod: """Class to test tmt modification addition.""" @@ -14,115 +30,48 @@ class TestAddTMTMod: def test_add_tmt_mod(self): """Test addition of tmt modification.""" assert ( - mq.MaxQuant.add_tmt_mod(1.0, "[UNIMOD:2016]ABC[UNIMOD:4]K[UNIMOD:2016]", "[UNIMOD:2016]") + MaxQuant.add_tmt_mod(1.0, "[UNIMOD:2016]ABC[UNIMOD:4]K[UNIMOD:2016]", "[UNIMOD:2016]") == 1.0 + 2 * 304.207146 ) -class TestUpdateColumns: - """Class to test update columns.""" - - def test_update_columns(self, maxquant_df: pd.DataFrame): - """ - Test column update. - - :param maxquant_df: maxquant df as pd.DataFrame - """ - prosit_df = mq.MaxQuant.update_columns_for_prosit(maxquant_df, tmt_labeled="") - assert not prosit_df["REVERSE"][0] - assert prosit_df["REVERSE"][3] - - assert prosit_df["MODIFIED_SEQUENCE"][0] == "DS[UNIMOD:21]DS[UNIMOD:21]WDADAFSVEDPVRK" - assert prosit_df["MODIFIED_SEQUENCE"][3] == "SS[UNIMOD:21]PTPES[UNIMOD:21]PTMLTK" - - assert prosit_df["SEQUENCE"][0] == "DSDSWDADAFSVEDPVRK" - assert prosit_df["SEQUENCE"][3] == "SSPTPESPTMLTK" - - assert prosit_df["PEPTIDE_LENGTH"][0] == 18 - assert prosit_df["PEPTIDE_LENGTH"][3] == 13 - - assert prosit_df["PROTEINS"][0] == "P12345" - assert prosit_df["PROTEINS"][3] == "Q67890" - - def test_update_columns_silac(self, maxquant_df: pd.DataFrame): - """ - Test column update silac. +class TestMaxQuant(unittest.TestCase): + """Class to test MSFragger.""" - :param maxquant_df: maxquant df as pd.DataFrame - """ - maxquant_df["LABELING_STATE"] = [1, 1, 1, 2, 2] - prosit_df = mq.MaxQuant.update_columns_for_prosit(maxquant_df, tmt_labeled="") - assert prosit_df["MODIFIED_SEQUENCE"][0] == "DS[UNIMOD:21]DS[UNIMOD:21]WDADAFSVEDPVR[UNIMOD:267]K[UNIMOD:259]" - assert prosit_df["MODIFIED_SEQUENCE"][3] == "SS[UNIMOD:21]PTPES[UNIMOD:21]PTMLTK" + def test_read_maxquant(self): + """Test function for reading msfragger results and transforming to Prosit format.""" + expected_df_path = Path(__file__).parent / "data" / "msms_internal.csv" - assert prosit_df["MASS"][0] == 1.0 + 8.014199 + 10.008269 - assert prosit_df["MASS"][3] == 2.0 + internal_search_results_df = MaxQuant(Path(__file__).parent / "data" / "msms.txt").read_result() + expected_df = pd.read_csv(expected_df_path) - def test_update_columns_tmt(self, maxquant_df: pd.DataFrame): - """ - Test column update tmt. + pd.testing.assert_frame_equal(internal_search_results_df[COLUMNS], expected_df[COLUMNS]) - :param maxquant_df: maxquant df as pd.DataFrame - """ - prosit_df = mq.MaxQuant.update_columns_for_prosit(maxquant_df, tmt_labeled="tmt") - assert prosit_df["MODIFIED_SEQUENCE"][0] == "[UNIMOD:737]-DS[UNIMOD:21]DS[UNIMOD:21]WDADAFSVEDPVRK[UNIMOD:737]" - assert prosit_df["MODIFIED_SEQUENCE"][3] == "[UNIMOD:737]-SS[UNIMOD:21]PTPES[UNIMOD:21]PTMLTK[UNIMOD:737]" - - assert prosit_df["MASS"][0] == 1.0 + 2 * 229.162932 - assert prosit_df["MASS"][3] == 2.0 + 2 * 229.162932 - - def test_update_columns_tmt_msa(self, maxquant_df: pd.DataFrame): - """ - Test column update tmt msa. - - :param maxquant_df: maxquant df as pd.DataFrame - """ - prosit_df = mq.MaxQuant.update_columns_for_prosit(maxquant_df, tmt_labeled="tmt_msa") - assert ( - prosit_df["MODIFIED_SEQUENCE_MSA"][0] == "[UNIMOD:737]-DS[UNIMOD:23]DS[UNIMOD:23]WDADAFSVEDPVRK[UNIMOD:737]" + def test_read_maxquant_with_custom_mods(self): + """Test function for reading msfragger results and transforming to Prosit format with custom mods.""" + expected_df_path = Path(__file__).parent / "data" / "msms_internal.csv" + custom_mods = {"M(Oxidation (M)": 35, "C": 4, "S(Phospho (STY))": 21} + internal_search_results_df = MaxQuant(Path(__file__).parent / "data" / "msms.txt").read_result( + custom_mods=custom_mods ) - assert prosit_df["MODIFIED_SEQUENCE_MSA"][3] == "[UNIMOD:737]-SS[UNIMOD:23]PTPES[UNIMOD:23]PTMLTK[UNIMOD:737]" - - def test_filter_valid_prosit_sequences(self, invalid_df: pd.DataFrame): - """Test filter_valid_prosit_sequences.""" - filtered_df = filter_valid_prosit_sequences(invalid_df) - assert filtered_df["MODIFIED_SEQUENCE"][0] == "ABCDEFG" - assert len(filtered_df) == 1 - assert "(ac)" not in filtered_df["MODIFIED_SEQUENCE"] - assert "(Acetyl (Protein N-term))" not in filtered_df["MODIFIED_SEQUENCE"] - assert "U" not in filtered_df["SEQUENCE"] - assert filtered_df["PEPTIDE_LENGTH"].min() >= 7 - assert filtered_df["PRECURSOR_CHARGE"].max() <= 6 - - -@pytest.fixture -def maxquant_df(): - """Create dataframes from strings: https://towardsdatascience.com/67b0c2b71e6a.""" - df_string = """ MODIFIED_SEQUENCE; REVERSE; MASS; PROTEINS; -_DS(Phospho (STY))DS(Phospho (STY))WDADAFSVEDPVRK_; ; 1.0; P12345; -_DS(Phospho (STY))DS(Phospho (STY))WDADAFSVEDPVRK_; ; 1.0; P12345; -_DS(Phospho (STY))DSWDADAFS(Phospho (STY))VEDPVRK_; ; 1.0; P12345; - _SS(Phospho (STY))PTPES(Phospho (STY))PTMLTK_; +; 2.0; Q67890; - _SS(Phospho (STY))PTPES(Phospho (STY))PTMLTK_; +; 2.0; Q67890;""" - df = pd.read_csv(io.StringIO(df_string), delimiter=";", skipinitialspace=True) - df["Charge"] = 2 - return df - - -@pytest.fixture -def invalid_df(): - """Create invalid df.""" - df = pd.DataFrame( - { - "PEPTIDE_LENGTH": [7, 7, 6, 32], - "MODIFIED_SEQUENCE": [ - "ABCDEFG", - "GHD(ac)IJKL", - "MN(Acetyl (Protein N-term))OPQR", - "STUVWDEFSTUVWDEFSTUVWDEFSTUVWDEF", - ], - "SEQUENCE": ["ABCDEFG", "GHDIJKL", "MNOPQR", "STUVWDEFSTUVWDEFSTUVWDEFSTUVWDEF"], - "PRECURSOR_CHARGE": [2, 5, 7, 6], + expected_df = pd.read_csv(expected_df_path) + + pd.testing.assert_frame_equal(internal_search_results_df[COLUMNS], expected_df[COLUMNS]) + + def test_read_maxquant_with_custom_mods_with_tmt(self): + """Test function for reading msfragger results and transforming to Prosit format with custom mods.""" + expected_df_path = Path(__file__).parent / "data" / "msms_internal_tmt.csv" + internal_search_results_df = MaxQuant(Path(__file__).parent / "data" / "msms.txt").read_result(tmt_label="") + custom_mods = { + "M(Oxidation (M)": 35, + "C": 4, + "K": 2016, + "^": 2016, } - ) - return df + + internal_search_results_df = MaxQuant(Path(__file__).parent / "data" / "msms.txt").read_result( + custom_mods=custom_mods + ) + expected_df = pd.read_csv(expected_df_path) + + pd.testing.assert_frame_equal(internal_search_results_df[COLUMNS], expected_df[COLUMNS]) diff --git a/tests/unit_tests/test_msfragger.py b/tests/unit_tests/test_msfragger.py index 86be885..e07c4ac 100644 --- a/tests/unit_tests/test_msfragger.py +++ b/tests/unit_tests/test_msfragger.py @@ -5,6 +5,20 @@ from spectrum_io.search_result.msfragger import MSFragger +COLUMNS = [ + "RAW_FILE", + "SCAN_NUMBER", + "MODIFIED_SEQUENCE", + "PRECURSOR_CHARGE", + "SCAN_EVENT_NUMBER", + "MASS", + "SCORE", + "REVERSE", + "SEQUENCE", + "PEPTIDE_LENGTH", + "PROTEINS", +] + class TestMSFragger(unittest.TestCase): """Class to test MSFragger.""" @@ -31,25 +45,35 @@ def test_read_msfragger(self): expected_msfragger_internal_path = Path(__file__).parent / "data" / "psm_tmt_internal.csv" internal_search_results_df = MSFragger(Path(__file__).parent / "data" / "psm_tmt.pepXML").read_result( - tmt_labeled="tmtpro" + tmt_label="tmtpro" ) expected_df = pd.read_csv(expected_msfragger_internal_path, index_col=0) - print("Internal Search Results Columns:", internal_search_results_df.columns) - print("Expected Columns:", expected_df.columns) - pd.testing.assert_frame_equal(internal_search_results_df, expected_df) + pd.testing.assert_frame_equal(internal_search_results_df[COLUMNS], expected_df[COLUMNS]) - def test_read_msfragger_mods(self): + def test_read_msfragger_with_custom_mods(self): """Test function for reading msfragger results and transforming to Prosit format with custom mods.""" - expected_msfragger_internal_path = Path(__file__).parent / "data" / "psm_tmt_internal_mods.csv" - stat_mod = {"M[35]": "[UNIMOD:35]"} - var_mod = {"[41]": "[UNIMOD:41]"} + expected_msfragger_internal_path = Path(__file__).parent / "data" / "psm_tmt_internal.csv" + custom_mods = { + "M[147]": 35, + "C": 4, + } + + internal_search_results_df = MSFragger(Path(__file__).parent / "data" / "psm_tmt.pepXML").read_result( + tmt_label="tmtpro", custom_mods=custom_mods + ) + expected_df = pd.read_csv(expected_msfragger_internal_path, index_col=0) + + pd.testing.assert_frame_equal(internal_search_results_df[COLUMNS], expected_df[COLUMNS]) - internal_search_results_df = MSFragger(Path(__file__).parent / "data" / "psm_mods.pepXML").read_result( - tmt_labeled="tmtpro", stat_mods=stat_mod, var_mods=var_mod + def test_read_msfragger_with_custom_mods_with_tmt(self): + """Test function for reading msfragger results and transforming to Prosit format with custom mods and explicit TMT.""" + expected_msfragger_internal_path = Path(__file__).parent / "data" / "psm_tmt_internal.csv" + custom_mods = {"M[147]": 35, "C": 4, "^n[305]": 2016, "K": 2016} + + internal_search_results_df = MSFragger(Path(__file__).parent / "data" / "psm_tmt.pepXML").read_result( + custom_mods=custom_mods ) expected_df = pd.read_csv(expected_msfragger_internal_path, index_col=0) - print("Internal Search Results Columns:", internal_search_results_df.columns) - print("Expected Columns:", expected_df.columns) - pd.testing.assert_frame_equal(internal_search_results_df, expected_df) + pd.testing.assert_frame_equal(internal_search_results_df[COLUMNS], expected_df[COLUMNS]) diff --git a/tests/unit_tests/test_sage.py b/tests/unit_tests/test_sage.py index f571923..0169777 100644 --- a/tests/unit_tests/test_sage.py +++ b/tests/unit_tests/test_sage.py @@ -5,25 +5,52 @@ from spectrum_io.search_result import Sage +COLUMNS = [ + "RAW_FILE", + "SCAN_NUMBER", + "MODIFIED_SEQUENCE", + "PRECURSOR_CHARGE", + "MASS", + "SCORE", + "REVERSE", + "SEQUENCE", + "PEPTIDE_LENGTH", + "PROTEINS", +] + class TestSage(unittest.TestCase): - """Test vlass to check Sage search result processing.""" + """Test class to check Sage search result processing.""" def test_read_sage(self): """Test function for reading sage results and transforming to Prosit format.""" expected_sage_internal_path = Path(__file__).parent / "data" / "sage_output_internal.csv" - internal_search_results_df = ( - Sage(Path(__file__).parent / "data" / "sage_output.tsv").read_result().reset_index(drop=True) + internal_search_results_df = Sage(Path(__file__).parent / "data" / "sage_output.tsv").read_result( + tmt_label="tmt" + ) + expected_df = pd.read_csv(expected_sage_internal_path) + pd.testing.assert_frame_equal(internal_search_results_df[COLUMNS], expected_df[COLUMNS]) + + def test_read_msfragger_with_custom_mods(self): + """Test function for reading sage results with custom mods and transforming to Prosit format .""" + custom_mods = { + "C[+57.0215]": 4, + "M[+15.9948]": 35, + "M[+15.994]": 35, + } + expected_sage_internal_path = Path(__file__).parent / "data" / "sage_output_internal_mods.csv" + internal_search_results_df = Sage(Path(__file__).parent / "data" / "sage_output_mods.tsv").read_result( + custom_mods=custom_mods, tmt_label="tmt" ) expected_df = pd.read_csv(expected_sage_internal_path) pd.testing.assert_frame_equal(internal_search_results_df, expected_df) - def test_read_sage_custom(self): + def test_read_msfragger_with_custom_mods_with_tmt(self): """Test function for reading sage results with custom mods and transforming to Prosit format .""" - stat_mods = {"15.9948": "[UNIMOD:35]"} - expected_sage_internal_path = Path.cwd() / "data" / "sage_output_internal_mods.csv" - internal_search_results_df = ( - Sage(Path.cwd() / "data" / "sage_output_mods.tsv").read_result(stat_mods=stat_mods).reset_index(drop=True) + custom_mods = {"C[+57.0215]": 4, "M[+15.9948]": 35, "M[+15.994]": 35, "K[+229.1629]": 737, "^[+229.1629]-": 737} + expected_sage_internal_path = Path(__file__).parent / "data" / "sage_output_internal_mods.csv" + internal_search_results_df = Sage(Path(__file__).parent / "data" / "sage_output_mods.tsv").read_result( + custom_mods=custom_mods ) expected_df = pd.read_csv(expected_sage_internal_path) pd.testing.assert_frame_equal(internal_search_results_df, expected_df)