diff --git a/.flake8 b/.flake8
index e9108c3..6a8814f 100644
--- a/.flake8
+++ b/.flake8
@@ -10,3 +10,4 @@ per-file-ignores =
     spectrum_io/raw/thermo_raw.py:S603,S404
     spectrum_io/raw/msraw.py:S405,S314
     docs/conf.py:S404,S607,S603
+    spectrum_io/search_result/__init__.py:F403
diff --git a/spectrum_io/__init__.py b/spectrum_io/__init__.py
index 3489132..c6bf6f2 100644
--- a/spectrum_io/__init__.py
+++ b/spectrum_io/__init__.py
@@ -10,7 +10,6 @@ import time
 
 from . import file, raw
-from .search_result import MaxQuant
 from .spectral_library import DLib, Spectronaut
 
 CONSOLE_LOG_LEVEL = logging.INFO
diff --git a/spectrum_io/search_result/__init__.py b/spectrum_io/search_result/__init__.py
index 1b87493..14660b1 100644
--- a/spectrum_io/search_result/__init__.py
+++ b/spectrum_io/search_result/__init__.py
@@ -1,4 +1,2 @@
 """Initialize seach result."""
-from .mascot import Mascot
-from .maxquant import MaxQuant
-from .msfragger import MSFragger
+from .process import *
diff --git a/spectrum_io/search_result/filter.py b/spectrum_io/search_result/filter.py
new file mode 100644
index 0000000..ef9afde
--- /dev/null
+++ b/spectrum_io/search_result/filter.py
@@ -0,0 +1,29 @@
+import logging
+import re
+
+import pandas as pd
+
+logger = logging.getLogger(__name__)
+
+
+def filter_valid_prosit_sequences(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Filter valid Prosit sequences.
+
+    :param df: df to filter
+    :return: df after filtering out unsupported peptides
+    """
+    logger.info(f"#sequences before filtering for valid prosit sequences: {len(df.index)}")
+    # retain only peptides that fall within [7, 30] length supported by Prosit
+    df = df[(df["PEPTIDE_LENGTH"] <= 30) & (df["PEPTIDE_LENGTH"] >= 7)]
+    # remove unsupported mods to exclude
+    unsupported_mods = [r"Acetyl \(Protein N\-term\)", "ac", r"\[[0-9]+\]"]
+    exclude_mods_pattern = re.compile("|".join(unsupported_mods))
+    df = df[~df["MODIFIED_SEQUENCE"].str.contains(exclude_mods_pattern)]
+    # remove non-canonical aas
+    df = df[(~df["SEQUENCE"].str.contains("U|O"))]
+    # remove precursor charges greater than 6
+    df = df[df["PRECURSOR_CHARGE"] <= 6]
+    logger.info(f"#sequences after filtering for valid prosit sequences: {len(df.index)}")
+
+    return df
diff --git a/spectrum_io/search_result/mascot.py b/spectrum_io/search_result/mascot.py
index 9bfb51a..8915513 100644
--- a/spectrum_io/search_result/mascot.py
+++ b/spectrum_io/search_result/mascot.py
@@ -7,98 +7,92 @@ import spectrum_fundamentals.constants as c
 from spectrum_fundamentals.mod_string import internal_without_mods
 
-from .search_results import SearchResults, filter_valid_prosit_sequences
+from .filter import filter_valid_prosit_sequences
 
 logger = logging.getLogger(__name__)
 
 
-class Mascot(SearchResults):
-    """Handle search results from Mascot."""
+def read_mascot(path: Union[str, Path], tmt_labeled: str) -> pd.DataFrame:
+    """
+    Function to read a mascot msf file and perform some basic formatting.
 
-        :param path: path to msms.txt to read
-        :param tmt_labeled: tmt label as str
-        :return: pd.DataFrame with the formatted data
-        """
-        logger.info("Reading mascot msf file")
-        connection = sqlite3.connect(path)
-        # cursor = connection.cursor()
-        # cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
-        df = pd.read_sql("SELECT * FROM MSnSpectrumInfo", connection)[
-            ["SpectrumID", "SpectrumFileName", "RetentionTime", "Mass", "Charge"]
-        ]
-        df_id_map = pd.read_sql("SELECT * FROM TargetPsmsMSnSpectrumInfo", connection)[
-            ["MSnSpectrumInfoSpectrumID", "TargetPsmsPeptideID"]
-        ]
-        df = df.merge(df_id_map, left_on="SpectrumID", right_on="MSnSpectrumInfoSpectrumID")
-        df_target_psms = pd.read_sql("SELECT * FROM TargetPsms", connection)[
-            ["PeptideID", "Sequence", "ModifiedSequence", "Modifications", "XCorr"]
-        ]
-        df = df.merge(df_target_psms, left_on="TargetPsmsPeptideID", right_on="PeptideID")
-        df_modif = pd.read_sql("SELECT * FROM TargetPsmsFoundModifications", connection)[
-            ["TargetPsmsPeptideID", "FoundModificationsModificationID", "Position"]
-        ]
-        df_modif_mass = pd.read_sql("SELECT * FROM FoundModifications", connection)[
-            ["ModificationID", "DeltaMonoisotopicMass"]
-        ]
-        df_modif = df_modif.merge(df_modif_mass, left_on="FoundModificationsModificationID", right_on="ModificationID")
-        df = df.merge(df_modif, on="TargetPsmsPeptideID")
+    :param path: path to msms.txt to read
+    :param tmt_labeled: tmt label as str
+    :return: pd.DataFrame with the formatted data
+    """
+    logger.info("Reading mascot msf file")
+    connection = sqlite3.connect(path)
+    # cursor = connection.cursor()
+    # cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
+    df = pd.read_sql("SELECT * FROM MSnSpectrumInfo", connection)[
+        ["SpectrumID", "SpectrumFileName", "RetentionTime", "Mass", "Charge"]
+    ]
+    df_id_map = pd.read_sql("SELECT * FROM TargetPsmsMSnSpectrumInfo", connection)[
+        ["MSnSpectrumInfoSpectrumID", "TargetPsmsPeptideID"]
+    ]
+    df = df.merge(df_id_map, left_on="SpectrumID", right_on="MSnSpectrumInfoSpectrumID")
+    df_target_psms = pd.read_sql("SELECT * FROM TargetPsms", connection)[
+        ["PeptideID", "Sequence", "ModifiedSequence", "Modifications", "XCorr"]
+    ]
+    df = df.merge(df_target_psms, left_on="TargetPsmsPeptideID", right_on="PeptideID")
+    df_modif = pd.read_sql("SELECT * FROM TargetPsmsFoundModifications", connection)[
+        ["TargetPsmsPeptideID", "FoundModificationsModificationID", "Position"]
+    ]
+    df_modif_mass = pd.read_sql("SELECT * FROM FoundModifications", connection)[
+        ["ModificationID", "DeltaMonoisotopicMass"]
+    ]
+    df_modif = df_modif.merge(df_modif_mass, left_on="FoundModificationsModificationID", right_on="ModificationID")
+    df = df.merge(df_modif, on="TargetPsmsPeptideID")
 
-        logger.info("Finished reading mascot msf file.")
+    logger.info("Finished reading mascot msf file.")
 
-        df.rename(
-            columns={
-                "SpectrumID": "SCAN NUMBER",
-                "ModifiedSequence": "MODIFIED SEQUENCE",
-                "Charge": "PRECURSOR CHARGE",
-                "XCorr": "SCORE",
-                "SpectrumFileName": "RAW FILE",
-            },
-            inplace=True,
-        )
+    df.rename(
+        columns={
+            "SpectrumID": "SCAN NUMBER",
+            "ModifiedSequence": "MODIFIED SEQUENCE",
+            "Charge": "PRECURSOR CHARGE",
+            "XCorr": "SCORE",
+            "SpectrumFileName": "RAW FILE",
+        },
+        inplace=True,
+    )
 
-        # Standardize column names
-        df.columns = df.columns.str.upper()
-        df.columns = df.columns.str.replace(" ", "_")
-        # TODO reverse
-        df["REVERSE"] = df["SEQUENCE"].str.contains("Reverse")
-        logger.info("Converting MSFragger peptide sequence to internal format")
-        df["RAW_FILE"] = df["RAW_FILE"].str.replace(".raw", "")
-        df["MODIFICATIONS"] = (
-            (df["POSITION"].astype(int) - 1).astype(str) + "$" + df["DELTAMONOISOTOPICMASS"].astype(str)
-        )
-        df = df.groupby("SCAN_NUMBER", as_index=False).apply(lambda x: x.sort_values("POSITION"))
-        df = df.groupby(
-            ["SCAN_NUMBER", "PRECURSOR_CHARGE", "SCORE", "RAW_FILE", "SEQUENCE", "REVERSE"],
-            as_index=False,
-        ).agg({"MODIFICATIONS": "|".join})
-        mod_masses_reverse = {round(float(v), 3): k for k, v in c.MOD_MASSES.items()}
+    # Standardize column names
+    df.columns = df.columns.str.upper()
+    df.columns = df.columns.str.replace(" ", "_")
+    # TODO reverse
+    df["REVERSE"] = df["SEQUENCE"].str.contains("Reverse")
+    logger.info("Converting MSFragger peptide sequence to internal format")
+    df["RAW_FILE"] = df["RAW_FILE"].str.replace(".raw", "")
+    df["MODIFICATIONS"] = (df["POSITION"].astype(int) - 1).astype(str) + "$" + df["DELTAMONOISOTOPICMASS"].astype(str)
+    df = df.groupby("SCAN_NUMBER", as_index=False).apply(lambda x: x.sort_values("POSITION"))
+    df = df.groupby(
+        ["SCAN_NUMBER", "PRECURSOR_CHARGE", "SCORE", "RAW_FILE", "SEQUENCE", "REVERSE"],
+        as_index=False,
+    ).agg({"MODIFICATIONS": "|".join})
+    mod_masses_reverse = {round(float(v), 3): k for k, v in c.MOD_MASSES.items()}
 
-        sequences = []
-        for _, row in df.iterrows():
-            modifications = row["MODIFICATIONS"].split("|")
-            if len(modifications) == 0:
-                sequences.append(row["SEQUENCE"])
-            else:
-                sequence = row["SEQUENCE"]
-                skip = 0
-                for mod in modifications:
-                    pos, mass = mod.split("$")
-                    sequence = (
-                        sequence[: int(pos) + 1 + skip]
-                        + mod_masses_reverse[round(float(mass), 3)]
-                        + sequence[int(pos) + 1 + skip :]
-                    )
-                    skip = skip + len(mod_masses_reverse[round(float(mass), 3)])
-                sequences.append(sequence)
+    sequences = []
+    for _, row in df.iterrows():
+        modifications = row["MODIFICATIONS"].split("|")
+        if len(modifications) == 0:
+            sequences.append(row["SEQUENCE"])
+        else:
+            sequence = row["SEQUENCE"]
+            skip = 0
+            for mod in modifications:
+                pos, mass = mod.split("$")
+                sequence = (
+                    sequence[: int(pos) + 1 + skip]
+                    + mod_masses_reverse[round(float(mass), 3)]
+                    + sequence[int(pos) + 1 + skip :]
+                )
+                skip = skip + len(mod_masses_reverse[round(float(mass), 3)])
+            sequences.append(sequence)
 
-        df["MODIFIED_SEQUENCE"] = sequences
+    df["MODIFIED_SEQUENCE"] = sequences
 
-        df["SEQUENCE"] = internal_without_mods(df["MODIFIED_SEQUENCE"])
-        df["PEPTIDE_LENGTH"] = df["SEQUENCE"].apply(lambda x: len(x))
-
-        return filter_valid_prosit_sequences(df)
+    df["SEQUENCE"] = internal_without_mods(df["MODIFIED_SEQUENCE"])
+    df["PEPTIDE_LENGTH"] = df["SEQUENCE"].apply(lambda x: len(x))
+
+    return filter_valid_prosit_sequences(df)
diff --git a/spectrum_io/search_result/maxquant.py b/spectrum_io/search_result/maxquant.py
index 49533d2..dd79058 100644
--- a/spectrum_io/search_result/maxquant.py
+++ b/spectrum_io/search_result/maxquant.py
@@ -6,105 +6,99 @@ import spectrum_fundamentals.constants as c
 from spectrum_fundamentals.mod_string import internal_without_mods, maxquant_to_internal
 
-from .search_results import SearchResults, filter_valid_prosit_sequences
+from .filter import filter_valid_prosit_sequences
 
 logger = logging.getLogger(__name__)
 
 
-class MaxQuant(SearchResults):
-    """Handle search results from MaxQuant."""
-
-    @staticmethod
-    def add_tmt_mod(mass: float, seq: str, unimod_tag: str) -> float:
-        """
-        Add tmt modification.
-
-        :param mass: mass without tmt modification
-        :param seq: sequence of the peptide
-        :param unimod_tag: UNIMOD tag for the modification
-        :return: mass as float
-        """
-        num_of_tmt = seq.count(unimod_tag)
-        mass += num_of_tmt * c.MOD_MASSES[f"{unimod_tag}"]
-        return mass
-
-    @staticmethod
-    def read_result(path: Union[str, Path], tmt_labeled: str) -> pd.DataFrame:
-        """
-        Function to read a msms txt and perform some basic formatting.
-
-        :param path: path to msms.txt to read
-        :param tmt_labeled: tmt label as str
-        :return: pd.DataFrame with the formatted data
-        """
-        logger.info("Reading msms.txt file")
-        df = pd.read_csv(
-            path,
-            usecols=lambda x: x.upper()
-            in [
-                "RAW FILE",
-                "SCAN NUMBER",
-                "MODIFIED SEQUENCE",
-                "CHARGE",
-                "SCAN EVENT NUMBER",
-                "LABELING STATE",
-                "MASS",  # = Calculated Precursor mass; TODO get column with experimental Precursor mass instead
-                "SCORE",
-                "REVERSE",
-            ],
-            sep="\t",
+def read_maxquant(path: Union[str, Path], tmt_labeled: str) -> pd.DataFrame:
+    """
+    Function to read a msms txt and perform some basic formatting.
+
+    :param path: path to msms.txt to read
+    :param tmt_labeled: tmt label as str
+    :return: pd.DataFrame with the formatted data
+    """
+    logger.info("Reading msms.txt file")
+    df = pd.read_csv(
+        path,
+        usecols=lambda x: x.upper()
+        in [
+            "RAW FILE",
+            "SCAN NUMBER",
+            "MODIFIED SEQUENCE",
+            "CHARGE",
+            "SCAN EVENT NUMBER",
+            "LABELING STATE",
+            "MASS",  # = Calculated Precursor mass; TODO get column with experimental Precursor mass instead
+            "SCORE",
+            "REVERSE",
+        ],
+        sep="\t",
+    )
+    logger.info("Finished reading msms.txt file")
+
+    # Standardize column names
+    df.columns = df.columns.str.upper()
+    df.columns = df.columns.str.replace(" ", "_")
+
+    df = update_columns_for_prosit(df, tmt_labeled)
+    return filter_valid_prosit_sequences(df)
+
+
+def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str) -> pd.DataFrame:
+    """
+    Update columns of df to work with Prosit.
+
+    :param df: df to modify
+    :param tmt_labeled: True if tmt labeled
+    :return: modified df as pd.DataFrame
+    """
+    df.rename(columns={"CHARGE": "PRECURSOR_CHARGE"}, inplace=True)
+
+    df["REVERSE"].fillna(False, inplace=True)
+    df["REVERSE"].replace("+", True, inplace=True)
+    logger.info("Converting MaxQuant peptide sequence to internal format")
+    if tmt_labeled != "":
+        unimod_tag = c.TMT_MODS[tmt_labeled]
+        logger.info("Adding TMT fixed modifications")
+        df["MODIFIED_SEQUENCE"] = maxquant_to_internal(
+            df["MODIFIED_SEQUENCE"].to_numpy(),
+            fixed_mods={"C": "C[UNIMOD:4]", "^_": f"_{unimod_tag}", "K": f"K{unimod_tag}"},
         )
-        logger.info("Finished reading msms.txt file")
-
-        # Standardize column names
-        df.columns = df.columns.str.upper()
-        df.columns = df.columns.str.replace(" ", "_")
-
-        df = MaxQuant.update_columns_for_prosit(df, tmt_labeled)
-        return filter_valid_prosit_sequences(df)
-
-    @staticmethod
-    def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str) -> pd.DataFrame:
-        """
-        Update columns of df to work with Prosit.
-
-        :param df: df to modify
-        :param tmt_labeled: True if tmt labeled
-        :return: modified df as pd.DataFrame
-        """
-        df.rename(columns={"CHARGE": "PRECURSOR_CHARGE"}, inplace=True)
-
-        df["REVERSE"].fillna(False, inplace=True)
-        df["REVERSE"].replace("+", True, inplace=True)
-        logger.info("Converting MaxQuant peptide sequence to internal format")
-        if tmt_labeled != "":
-            unimod_tag = c.TMT_MODS[tmt_labeled]
-            logger.info("Adding TMT fixed modifications")
-            df["MODIFIED_SEQUENCE"] = maxquant_to_internal(
-                df["MODIFIED_SEQUENCE"].to_numpy(),
-                fixed_mods={"C": "C[UNIMOD:4]", "^_": f"_{unimod_tag}", "K": f"K{unimod_tag}"},
-            )
-            df["MASS"] = df.apply(lambda x: MaxQuant.add_tmt_mod(x.MASS, x.MODIFIED_SEQUENCE, unimod_tag), axis=1)
-            if "msa" in tmt_labeled:
-                logger.info("Replacing phospho by dehydration for Phospho-MSA")
-                df["MODIFIED_SEQUENCE_MSA"] = df["MODIFIED_SEQUENCE"].str.replace(
-                    "[UNIMOD:21]", "[UNIMOD:23]", regex=False
-                )
-        elif "LABELING_STATE" in df.columns:
-            logger.info("Adding SILAC fixed modifications")
-            df.loc[df["LABELING_STATE"] == 1, "MODIFIED_SEQUENCE"] = maxquant_to_internal(
-                df[df["LABELING_STATE"] == 1]["MODIFIED_SEQUENCE"].to_numpy(),
-                fixed_mods={"C": "C[UNIMOD:4]", "K": "K[UNIMOD:259]", "R": "R[UNIMOD:267]"},
-            )
-            df.loc[df["LABELING_STATE"] != 1, "MODIFIED_SEQUENCE"] = maxquant_to_internal(
-                df[df["LABELING_STATE"] != 1]["MODIFIED_SEQUENCE"].to_numpy()
-            )
-            df["MASS"] = df.apply(lambda x: MaxQuant.add_tmt_mod(x.MASS, x.MODIFIED_SEQUENCE, "[UNIMOD:259]"), axis=1)
-            df["MASS"] = df.apply(lambda x: MaxQuant.add_tmt_mod(x.MASS, x.MODIFIED_SEQUENCE, "[UNIMOD:267]"), axis=1)
-            df.drop(columns=["LABELING_STATE"], inplace=True)
-        else:
-            df["MODIFIED_SEQUENCE"] = maxquant_to_internal(df["MODIFIED_SEQUENCE"].to_numpy())
-        df["SEQUENCE"] = internal_without_mods(df["MODIFIED_SEQUENCE"])
-        df["PEPTIDE_LENGTH"] = df["SEQUENCE"].apply(lambda x: len(x))
-
-        return df
+        df["MASS"] = df.apply(lambda x: add_tmt_mod(x.MASS, x.MODIFIED_SEQUENCE, unimod_tag), axis=1)
+        if "msa" in tmt_labeled:
+            logger.info("Replacing phospho by dehydration for Phospho-MSA")
+            df["MODIFIED_SEQUENCE_MSA"] = df["MODIFIED_SEQUENCE"].str.replace("[UNIMOD:21]", "[UNIMOD:23]", regex=False)
+    elif "LABELING_STATE" in df.columns:
+        logger.info("Adding SILAC fixed modifications")
+        df.loc[df["LABELING_STATE"] == 1, "MODIFIED_SEQUENCE"] = maxquant_to_internal(
+            df[df["LABELING_STATE"] == 1]["MODIFIED_SEQUENCE"].to_numpy(),
+            fixed_mods={"C": "C[UNIMOD:4]", "K": "K[UNIMOD:259]", "R": "R[UNIMOD:267]"},
+        )
+        df.loc[df["LABELING_STATE"] != 1, "MODIFIED_SEQUENCE"] = maxquant_to_internal(
+            df[df["LABELING_STATE"] != 1]["MODIFIED_SEQUENCE"].to_numpy()
+        )
+        df["MASS"] = df.apply(lambda x: add_tmt_mod(x.MASS, x.MODIFIED_SEQUENCE, "[UNIMOD:259]"), axis=1)
+        df["MASS"] = df.apply(lambda x: add_tmt_mod(x.MASS, x.MODIFIED_SEQUENCE, "[UNIMOD:267]"), axis=1)
+        df.drop(columns=["LABELING_STATE"], inplace=True)
+    else:
+        df["MODIFIED_SEQUENCE"] = maxquant_to_internal(df["MODIFIED_SEQUENCE"].to_numpy())
+    df["SEQUENCE"] = internal_without_mods(df["MODIFIED_SEQUENCE"])
+    df["PEPTIDE_LENGTH"] = df["SEQUENCE"].apply(lambda x: len(x))
+
+    return df
+
+
+def add_tmt_mod(mass: float, seq: str, unimod_tag: str) -> float:
+    """
+    Add tmt modification.
+
+    :param mass: mass without tmt modification
+    :param seq: sequence of the peptide
+    :param unimod_tag: UNIMOD tag for the modification
+    :return: mass as float
+    """
+    num_of_tmt = seq.count(unimod_tag)
+    mass += num_of_tmt * c.MOD_MASSES[f"{unimod_tag}"]
+    return mass
diff --git a/spectrum_io/search_result/msamanda.py b/spectrum_io/search_result/msamanda.py
index 4be13e6..563086c 100644
--- a/spectrum_io/search_result/msamanda.py
+++ b/spectrum_io/search_result/msamanda.py
@@ -5,7 +5,7 @@ import pandas as pd
 from spectrum_fundamentals.constants import PARTICLE_MASSES
 
-from .search_results import filter_valid_prosit_sequences
+from .filter import filter_valid_prosit_sequences
 
 logger = logging.getLogger(__name__)
 
@@ -63,7 +63,7 @@ def _remove_decoys_in_targets(full_df):
     return full_df
 
 
-def read_result(path: Union[str, Path], suffix: str = "output.csv") -> pd.DataFrame:
+def read_msamanda(path: Union[str, Path], suffix: str = "output.csv") -> pd.DataFrame:
     """
     Function to read a msms txt and perform some basic formatting.
diff --git a/spectrum_io/search_result/msfragger.py b/spectrum_io/search_result/msfragger.py
index a55ac15..f87953d 100644
--- a/spectrum_io/search_result/msfragger.py
+++ b/spectrum_io/search_result/msfragger.py
@@ -8,42 +8,38 @@ from spectrum_fundamentals.mod_string import internal_without_mods
 from tqdm import tqdm
 
-from .search_results import SearchResults, filter_valid_prosit_sequences
+from .filter import filter_valid_prosit_sequences
 
 logger = logging.getLogger(__name__)
 
 
-class MSFragger(SearchResults):
-    """Handle search results from MSFragger."""
-
-    @staticmethod
-    def read_result(path: Union[str, Path], tmt_labeled: str) -> pd.DataFrame:
-        """
-        Function to read a msms txt and perform some basic formatting.
+def read_msfragger(path: Union[str, Path], tmt_labeled: str) -> pd.DataFrame:
+    """
+    Function to read a msms txt and perform some basic formatting.
 
-        :param path: path to pepXML folder or single pepXML file to read
-        :param tmt_labeled: tmt label as str
-        :raises FileNotFoundError: in case the given path is neither a file, nor a directory.
+    :param path: path to pepXML folder or single pepXML file to read
+    :param tmt_labeled: tmt label as str
+    :raises FileNotFoundError: in case the given path is neither a file, nor a directory.
+    :return: pd.DataFrame with the formatted data
+    """
+    if isinstance(path, str):
+        path = Path(path)
 
-        if path.is_file():
-            file_list = [path]
-        elif path.is_dir():
-            file_list = list(path.rglob("*.pepXML"))
-        else:
-            raise FileNotFoundError(f"{path} could not be found.")
+    if path.is_file():
+        file_list = [path]
+    elif path.is_dir():
+        file_list = list(path.rglob("*.pepXML"))
+    else:
+        raise FileNotFoundError(f"{path} could not be found.")
 
-        ms_frag_results = []
-        for pep_xml_file in tqdm(file_list):
-            ms_frag_results.append(pepxml.DataFrame(str(pep_xml_file)))
+    ms_frag_results = []
+    for pep_xml_file in tqdm(file_list):
+        ms_frag_results.append(pepxml.DataFrame(str(pep_xml_file)))
 
-        df = pd.concat(ms_frag_results)
+    df = pd.concat(ms_frag_results)
 
-        df = update_columns_for_prosit(df, "")
-        return filter_valid_prosit_sequences(df)
+    df = update_columns_for_prosit(df, "")
+    return filter_valid_prosit_sequences(df)
 
 
 def update_columns_for_prosit(df, tmt_labeled: str) -> pd.DataFrame:
diff --git a/spectrum_io/search_result/process.py b/spectrum_io/search_result/process.py
new file mode 100644
index 0000000..b5de1b7
--- /dev/null
+++ b/spectrum_io/search_result/process.py
@@ -0,0 +1,63 @@
+import logging
+import re
+from abc import abstractmethod
+from pathlib import Path
+from typing import Optional, Union
+
+import pandas as pd
+
+from spectrum_io.file import csv
+
+from .mascot import read_mascot
+from .maxquant import read_maxquant
+from .msamanda import read_msamanda
+from .msfragger import read_msfragger
+
+logger = logging.getLogger(__name__)
+
+
+def read_search_results(search_results: Union[str, Path], search_type: str, tmt_labeled: str):
+    """Read seach results."""
+    if search_type.lower() == "maxquant":
+        read_maxquant(search_results, tmt_labeled)
+    elif search_type.lower() == "mascot":
+        read_mascot(search_results, tmt_labeled)
+    elif search_type.lower() == "msfragger":
+        read_msfragger(search_results, tmt_labeled)
+    elif search_type.lower() == "msamanda":
+        read_msamanda(search_results)
+    else:
+        raise ValueError(f"Unknown search_type provided: {search_type}")
+
+
+def generate_internal(self, tmt_labeled: str, out_path: Optional[Union[str, Path]] = None) -> Path:
+    """
+    Generate df and save to out_path.
+
+    :param out_path: path to output
+    :param tmt_labeled: tmt label as str
+    :return: path to output file
+    """
+    if out_path is None:
+        out_path = self.path.with_suffix(".prosit")
+    if isinstance(out_path, str):
+        out_path = Path(out_path)
+
+    if out_path.is_file():
+        logger.info(f"Found search results in internal format at {out_path}, skipping conversion")
+        return out_path
+
+    df = self.read_result(self.path, tmt_labeled)
+    csv.write_file(df, out_path)
+
+    return out_path
+
+
+def read_internal(self, path: Union[str, Path]) -> pd.DataFrame:
+    """
+    Read file from path.
+
+    :param path: path to file
+    :return: dataframe after reading the file
+    """
+    return csv.read_file(path)
diff --git a/spectrum_io/search_result/search_results.py b/spectrum_io/search_result/search_results.py
deleted file mode 100644
index f1ed69d..0000000
--- a/spectrum_io/search_result/search_results.py
+++ /dev/null
@@ -1,87 +0,0 @@
-import logging
-import re
-from abc import abstractmethod
-from pathlib import Path
-from typing import Optional, Union
-
-import pandas as pd
-
-from spectrum_io.file import csv
-
-logger = logging.getLogger(__name__)
-
-
-def filter_valid_prosit_sequences(df: pd.DataFrame) -> pd.DataFrame:
-    """
-    Filter valid Prosit sequences.
-
-    :param df: df to filter
-    :return: df after filtering out unsupported peptides
-    """
-    logger.info(f"#sequences before filtering for valid prosit sequences: {len(df.index)}")
-    # retain only peptides that fall within [7, 30] length supported by Prosit
-    df = df[(df["PEPTIDE_LENGTH"] <= 30) & (df["PEPTIDE_LENGTH"] >= 7)]
-    # remove unsupported mods to exclude
-    unsupported_mods = [r"Acetyl \(Protein N\-term\)", "ac", r"\[[0-9]+\]"]
-    exclude_mods_pattern = re.compile("|".join(unsupported_mods))
-    df = df[~df["MODIFIED_SEQUENCE"].str.contains(exclude_mods_pattern)]
-    # remove non-canonical aas
-    df = df[(~df["SEQUENCE"].str.contains("U|O"))]
-    # remove precursor charges greater than 6
-    df = df[df["PRECURSOR_CHARGE"] <= 6]
-    logger.info(f"#sequences after filtering for valid prosit sequences: {len(df.index)}")
-
-    return df
-
-
-class SearchResults:
-    """Handle search results from different software."""
-
-    orig_res: pd.DataFrame
-    fake_msms: pd.DataFrame
-
-    def __init__(self, path: Union[str, Path]):
-        """
-        Init Searchresults object.
-
-        :param path: path to file
-        """
-        if isinstance(path, str):
-            path = Path(path)
-        self.path = path
-
-    @abstractmethod
-    def read_result(self, path: Union[str, Path], tmt_labeled: str):
-        """Read result."""
-        raise NotImplementedError
-
-    def generate_internal(self, tmt_labeled: str, out_path: Optional[Union[str, Path]] = None) -> Path:
-        """
-        Generate df and save to out_path.
-
-        :param out_path: path to output
-        :param tmt_labeled: tmt label as str
-        :return: path to output file
-        """
-        if out_path is None:
-            out_path = self.path.with_suffix(".prosit")
-        if isinstance(out_path, str):
-            out_path = Path(out_path)
-
-        if out_path.is_file():
-            logger.info(f"Found search results in internal format at {out_path}, skipping conversion")
-            return out_path
-
-        df = self.read_result(self.path, tmt_labeled)
-        csv.write_file(df, out_path)
-
-        return out_path
-
-    def read_internal(self, path: Union[str, Path]) -> pd.DataFrame:
-        """
-        Read file from path.
-
-        :param path: path to file
-        :return: dataframe after reading the file
-        """
-        return csv.read_file(path)
diff --git a/tests/unit_tests/test_maxquant.py b/tests/unit_tests/test_maxquant.py
index af45d6c..7a53bec 100644
--- a/tests/unit_tests/test_maxquant.py
+++ b/tests/unit_tests/test_maxquant.py
@@ -4,8 +4,8 @@ import pandas as pd
 import pytest
 
-import spectrum_io.search_result.maxquant as mq
-from spectrum_io.search_result.search_results import filter_valid_prosit_sequences
+from spectrum_io.search_result.filter import filter_valid_prosit_sequences
+from spectrum_io.search_result.maxquant import add_tmt_mod, update_columns_for_prosit
 
 
 class TestAddTMTMod:
@@ -13,10 +13,7 @@ class TestAddTMTMod:
     def test_add_tmt_mod(self):
         """Test addition of tmt modification."""
-        assert (
-            mq.MaxQuant.add_tmt_mod(1.0, "[UNIMOD:2016]ABC[UNIMOD:4]K[UNIMOD:2016]", "[UNIMOD:2016]")
-            == 1.0 + 2 * 304.207146
-        )
+        assert add_tmt_mod(1.0, "[UNIMOD:2016]ABC[UNIMOD:4]K[UNIMOD:2016]", "[UNIMOD:2016]") == 1.0 + 2 * 304.207146
 
 
 class TestUpdateColumns:
@@ -28,7 +25,7 @@ def test_update_columns(self, maxquant_df: pd.DataFrame):
 
         :param maxquant_df: maxquant df as pd.DataFrame
         """
-        prosit_df = mq.MaxQuant.update_columns_for_prosit(maxquant_df, tmt_labeled="")
+        prosit_df = update_columns_for_prosit(maxquant_df, tmt_labeled="")
 
         assert not prosit_df["REVERSE"][0]
         assert prosit_df["REVERSE"][3]
@@ -48,7 +45,7 @@ def test_update_columns_silac(self, maxquant_df: pd.DataFrame):
         :param maxquant_df: maxquant df as pd.DataFrame
         """
         maxquant_df["LABELING_STATE"] = [1, 1, 1, 2, 2]
-        prosit_df = mq.MaxQuant.update_columns_for_prosit(maxquant_df, tmt_labeled="")
+        prosit_df = update_columns_for_prosit(maxquant_df, tmt_labeled="")
 
         assert prosit_df["MODIFIED_SEQUENCE"][0] == "DS[UNIMOD:21]DS[UNIMOD:21]WDADAFSVEDPVR[UNIMOD:267]K[UNIMOD:259]"
         assert prosit_df["MODIFIED_SEQUENCE"][3] == "SS[UNIMOD:21]PTPES[UNIMOD:21]PTMLTK"
@@ -61,7 +58,7 @@ def test_update_columns_tmt(self, maxquant_df: pd.DataFrame):
 
         :param maxquant_df: maxquant df as pd.DataFrame
         """
-        prosit_df = mq.MaxQuant.update_columns_for_prosit(maxquant_df, tmt_labeled="tmt")
+        prosit_df = update_columns_for_prosit(maxquant_df, tmt_labeled="tmt")
 
         assert prosit_df["MODIFIED_SEQUENCE"][0] == "[UNIMOD:737]DS[UNIMOD:21]DS[UNIMOD:21]WDADAFSVEDPVRK[UNIMOD:737]"
         assert prosit_df["MODIFIED_SEQUENCE"][3] == "[UNIMOD:737]SS[UNIMOD:21]PTPES[UNIMOD:21]PTMLTK[UNIMOD:737]"
@@ -74,7 +71,7 @@ def test_update_columns_tmt_msa(self, maxquant_df: pd.DataFrame):
 
         :param maxquant_df: maxquant df as pd.DataFrame
         """
-        prosit_df = mq.MaxQuant.update_columns_for_prosit(maxquant_df, tmt_labeled="tmt_msa")
+        prosit_df = update_columns_for_prosit(maxquant_df, tmt_labeled="tmt_msa")
         assert (
             prosit_df["MODIFIED_SEQUENCE_MSA"][0] == "[UNIMOD:737]DS[UNIMOD:23]DS[UNIMOD:23]WDADAFSVEDPVRK[UNIMOD:737]"
         )
diff --git a/tests/unit_tests/test_msfragger.py b/tests/unit_tests/test_msfragger.py
index cc6ce14..8af6a19 100644
--- a/tests/unit_tests/test_msfragger.py
+++ b/tests/unit_tests/test_msfragger.py
@@ -3,7 +3,7 @@ import pandas as pd
 
-from spectrum_io.search_result.msfragger import MSFragger
+from spectrum_io.search_result.msfragger import read_msfragger
 
 
 class TestMSFragger(unittest.TestCase):
@@ -11,8 +11,7 @@ class TestMSFragger(unittest.TestCase):
     def test_read_result(self):
         """Test read_result for MSFragger."""
-        msfragger = MSFragger(Path(__file__).parent / "data/")
-        df = msfragger.read_result(Path(__file__).parent / "data/psm.pepXML", "")
+        df = read_msfragger(Path(__file__).parent / "data/psm.pepXML", "")
         self.assertIsInstance(df, pd.DataFrame)
         self.assertTrue("RAW_FILE" in df.columns)
         self.assertTrue("SCAN_NUMBER" in df.columns)