From f761c8d4515833b779bc157be437278aeab2d18c Mon Sep 17 00:00:00 2001 From: victorgiurcoiu Date: Thu, 16 Feb 2023 17:02:17 +0000 Subject: [PATCH 1/4] Changed msfragger to work with the new output format --- spectrum_io/search_result/msfragger.py | 71 ++++++++++++++------------ 1 file changed, 38 insertions(+), 33 deletions(-) diff --git a/spectrum_io/search_result/msfragger.py b/spectrum_io/search_result/msfragger.py index 0c24a1e..16e510a 100644 --- a/spectrum_io/search_result/msfragger.py +++ b/spectrum_io/search_result/msfragger.py @@ -1,8 +1,8 @@ import logging +import re import pandas as pd import spectrum_fundamentals.constants as c -from spectrum_fundamentals.mod_string import internal_without_mods from .search_results import SearchResults @@ -21,31 +21,35 @@ def read_result(path: str, tmt_labeled: str) -> pd.DataFrame: :param tmt_labeled: tmt label as str :return: pd.DataFrame with the formatted data """ - logger.info("Reading msfragger xlsx file") - df = pd.read_excel( + logger.info("Reading msfragger tsv file") + df = pd.read_csv( path, usecols=lambda x: x.upper() in [ - "SCANID", - "PEPTIDE SEQUENCE", - "PRECURSOR CHARGE", - "PRECURSOR NEUTRAL MASS (DA)", - "HYPERSCORE", + "PEPTIDE", "PROTEIN", - "RETENTION TIME (MINUTES)", - "VARIABLE MODIFICATIONS DETECTED (STARTS WITH M, SEPARATED BY |, FORMATED AS POSITION,MASS)", + "PEPTIDE LENGTH", + "SPECTRUM FILE", + "SPECTRUM", + "ASSIGNED MODIFICATIONS", + "CHARGE", + "LABELING STATE", + "OBSERVED MASS", # = Calculated Precursor mass; TODO get column with experimental Precursor mass instead + "HYPERSCORE", + "RETENTION", ], + sep="\t", ) - logger.info("Finished reading msfragger xlsx file") + logger.info("Finished reading msfragger tsv file") df.rename( columns={ - "ScanID": "SCAN NUMBER", - "Peptide Sequence": "MODIFIED SEQUENCE", - "Precursor neutral mass (Da)": "MASS", + "Peptide": "SEQUENCE", + "Assigned Modifications": "MODIFICATIONS", + "Observed Mass": "MASS", "Hyperscore": "SCORE", - "Retention time (minutes)": "RETENTION TIME", - "Variable modifications detected (starts with M, separated by |, formated as position,mass)": "MODIFICATIONS", + "Retention": "RETENTION TIME", + "Spectrum File": "RAW FILE", }, inplace=True, ) @@ -53,38 +57,39 @@ def read_result(path: str, tmt_labeled: str) -> pd.DataFrame: # Standardize column names df.columns = df.columns.str.upper() df.columns = df.columns.str.replace(" ", "_") - df.rename(columns={"CHARGE": "PRECURSOR_CHARGE"}, inplace=True) - + df[["RAW_FILE", "SCAN_NUMBER"]] = df["SPECTRUM"].str.split(".", expand=True, n=2)[[0, 1]] df["REVERSE"] = df["PROTEIN"].str.contains("Reverse") - # df["RAW_FILE"] = df.iloc[0]["PROTEIN"] - df["RAW_FILE"] = "01625b_GA6-TUM_first_pool_41_01_01-DDA-1h-R2" - logger.info("Converting MSFragger peptide sequence to internal format") + df["MODIFICATIONS"] = df["MODIFICATIONS"].fillna(0) mod_masses_reverse = {round(float(v), 3): k for k, v in c.MOD_MASSES.items()} sequences = [] for _, row in df.iterrows(): - modifications = row["MODIFICATIONS"].split("|")[1:] - if len(modifications) == 0: - sequences.append(row["MODIFIED_SEQUENCE"]) + modifications = row["MODIFICATIONS"] + if modifications == 0: + sequences.append(row["SEQUENCE"]) else: - sequence = row["MODIFIED_SEQUENCE"] + modifications = modifications.split(", ") + sequence = row["SEQUENCE"] skip = 0 - for mod in modifications: - pos, mass = mod.split("$") + for mod in sorted( + modifications, key=lambda s: 0 if s.startswith("N") else int(re.sub("[^0-9]", "", s.split("(")[0])) + ): + pos, mass = mod.split("(") + mass = mass.replace(")", "") + if pos == "N-term": + continue + else: + pos = re.sub("[^0-9]", "", pos) sequence = ( - sequence[: int(pos) + 1 + skip] + sequence[: int(pos) + skip] + mod_masses_reverse[round(float(mass), 3)] - + sequence[int(pos) + 1 + skip :] + + sequence[int(pos) + skip :] ) skip = skip + len(mod_masses_reverse[round(float(mass), 3)]) sequences.append(sequence) - df["MODIFIED_SEQUENCE"] = sequences - df["SEQUENCE"] = internal_without_mods(df["MODIFIED_SEQUENCE"]) - df["PEPTIDE_LENGTH"] = df["SEQUENCE"].apply(lambda x: len(x)) - logger.info(f"No of sequences before Filtering is {len(df['PEPTIDE_LENGTH'])}") df = df[(df["PEPTIDE_LENGTH"] <= 30)] df = df[(~df["MODIFIED_SEQUENCE"].str.contains(r"\(ac\)"))] From d4e1586a73f8804403e08e2159d91781ec1e3007 Mon Sep 17 00:00:00 2001 From: victorgiurcoiu Date: Tue, 18 Apr 2023 14:27:51 +0200 Subject: [PATCH 2/4] Added unit test for msfragger --- tests/unit_tests/data/psm.tsv | 11 +++++++++++ tests/unit_tests/test_msfragger.py | 27 +++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 tests/unit_tests/data/psm.tsv create mode 100644 tests/unit_tests/test_msfragger.py diff --git a/tests/unit_tests/data/psm.tsv b/tests/unit_tests/data/psm.tsv new file mode 100644 index 0000000..0f5a753 --- /dev/null +++ b/tests/unit_tests/data/psm.tsv @@ -0,0 +1,11 @@ +Spectrum Spectrum File Peptide Modified Peptide Prev AA Next AA Peptide Length Charge Retention Observed Mass Calibrated Observed Mass Observed M/Z Calibrated Observed M/Z Calculated Peptide Mass Calculated M/Z Delta Mass Expectation Hyperscore Nextscore PeptideProphet Probability Number of Enzymatic Termini Number of Missed Cleavages Protein Start Protein End Intensity Assigned Modifications Observed Modifications Is Unique Protein Protein ID Entry Name Gene Protein Description Mapped Genes Mapped Proteins +YIG_085_L006_30_01.00375.00375.3 U:\internal_projects\L006_peptidome\MQ_searches\230203_L006_85_MetOH-ACN_PROCAL\MSFragger\output\interact-YIG_085_L006_30_01.pep.xml DEAGSEADHEGTHSTKRGHA A K 20 3 100.2141 2090.9014 2090.9014 697.9744 697.9744 2090.9001 697.9740 0.0012 0.03333000000000 19.4340 0.0000 0.9571 0 0 605 624 0.0000 true sp|P02671|FIBA_HUMAN P02671 FIBA_HUMAN FGA Fibrinogen alpha chain +YIG_085_L006_30_01.00420.00420.3 U:\internal_projects\L006_peptidome\MQ_searches\230203_L006_85_MetOH-ACN_PROCAL\MSFragger\output\interact-YIG_085_L006_30_01.pep.xml DEAGSEADHEGTHSTKRG A H 18 3 112.1268 1882.8008 1882.8008 628.6075 628.6075 1882.8041 628.6086 -0.0033 0.00000017520000 43.2000 12.3400 1.0000 0 0 605 622 0.0000 true sp|P02671|FIBA_HUMAN P02671 FIBA_HUMAN FGA Fibrinogen alpha chain +YIG_085_L006_30_01.00423.00423.4 U:\internal_projects\L006_peptidome\MQ_searches\230203_L006_85_MetOH-ACN_PROCAL\MSFragger\output\interact-YIG_085_L006_30_01.pep.xml DEAGSEADHEGTHSTKRG A H 18 4 112.8499 1882.8019 1882.8019 471.7078 471.7078 1882.8041 471.7083 -0.0022 0.00000003950000 38.3490 13.4780 1.0000 0 0 605 622 0.0000 true sp|P02671|FIBA_HUMAN P02671 FIBA_HUMAN FGA Fibrinogen alpha chain +YIG_085_L006_30_01.00424.00424.3 U:\internal_projects\L006_peptidome\MQ_searches\230203_L006_85_MetOH-ACN_PROCAL\MSFragger\output\interact-YIG_085_L006_30_01.pep.xml DEAGSEADHEGTHSTKR A G 17 3 113.1629 1825.7810 1825.7810 609.6009 609.6009 1825.7826 609.6015 -0.0015 0.00000341200000 41.5720 17.8670 1.0000 0 0 605 621 0.0000 true sp|P02671|FIBA_HUMAN P02671 FIBA_HUMAN FGA Fibrinogen alpha chain +YIG_085_L006_30_01.00428.00428.3 U:\internal_projects\L006_peptidome\MQ_searches\230203_L006_85_MetOH-ACN_PROCAL\MSFragger\output\interact-YIG_085_L006_30_01.pep.xml NRVGKVEHGSVA Y L 12 3 114.2047 1251.6663 1251.6663 418.2294 418.2294 1251.6683 418.2300 -0.0020 0.00121700000000 20.9290 11.5700 1.0000 0 0 437 448 0.0000 true sp|Q9Y490|TLN1_HUMAN Q9Y490 TLN1_HUMAN TLN1 Talin-1 +YIG_085_L006_30_01.00432.00432.3 U:\internal_projects\L006_peptidome\MQ_searches\230203_L006_85_MetOH-ACN_PROCAL\MSFragger\output\interact-YIG_085_L006_30_01.pep.xml EEGKRHPYKMNLA EEGKRHPYKM[147]NLA T S 13 3 115.1439 1587.7778 1587.7778 530.2665 530.2665 1587.7826 530.2681 -0.0047 0.00089230000000 22.2240 14.5200 1.0000 0 0 95 107 0.0000 10M(15.9949) true sp|O00151|PDLI1_HUMAN O00151 PDLI1_HUMAN PDLIM1 PDZ and LIM domain protein 1 +YIG_085_L006_30_01.00439.00439.4 U:\internal_projects\L006_peptidome\MQ_searches\230203_L006_85_MetOH-ACN_PROCAL\MSFragger\output\interact-YIG_085_L006_30_01.pep.xml DEAGSEADHEGTHSTKRGHAKSRPV A R 25 4 117.2938 2658.2427 2658.2427 665.5680 665.5680 2658.2493 665.5696 -0.0065 0.00009422000000 28.6670 11.8630 1.0000 0 0 605 629 0.0000 true sp|P02671|FIBA_HUMAN P02671 FIBA_HUMAN FGA Fibrinogen alpha chain +YIG_085_L006_30_01.00449.00449.3 U:\internal_projects\L006_peptidome\MQ_searches\230203_L006_85_MetOH-ACN_PROCAL\MSFragger\output\interact-YIG_085_L006_30_01.pep.xml KPEFHEDTR Q S 9 3 119.9727 1157.5449 1157.5449 386.8556 386.8556 1157.5464 386.8561 -0.0014 0.00225800000000 17.7940 0.0000 0.9999 0 0 54 62 0.0000 true sp|Q9NS28|RGS18_HUMAN Q9NS28 RGS18_HUMAN RGS18 Regulator of G-protein signaling 18 +YIG_085_L006_30_01.00469.00469.2 U:\internal_projects\L006_peptidome\MQ_searches\230203_L006_85_MetOH-ACN_PROCAL\MSFragger\output\interact-YIG_085_L006_30_01.pep.xml VGKVEHGSVA R L 10 2 125.7287 981.5227 981.5227 491.7686 491.7686 981.5243 491.7694 -0.0015 0.00000236400000 31.2090 12.4440 1.0000 0 0 439 448 0.0000 true sp|Q9Y490|TLN1_HUMAN Q9Y490 TLN1_HUMAN TLN1 Talin-1 +YIG_085_L006_30_01.00490.00490.2 U:\internal_projects\L006_peptidome\MQ_searches\230203_L006_85_MetOH-ACN_PROCAL\MSFragger\output\interact-YIG_085_L006_30_01.pep.xml DEAGSEADHEGTHS A T 14 2 131.7005 1440.5374 1440.5374 721.2760 721.2760 1440.5388 721.2767 -0.0014 0.00000604300000 34.3750 10.4530 1.0000 0 0 605 618 0.0000 true sp|P02671|FIBA_HUMAN P02671 FIBA_HUMAN FGA Fibrinogen alpha chain diff --git a/tests/unit_tests/test_msfragger.py b/tests/unit_tests/test_msfragger.py new file mode 100644 index 0000000..5df8a28 --- /dev/null +++ b/tests/unit_tests/test_msfragger.py @@ -0,0 +1,27 @@ +import unittest +from pathlib import Path + +import pandas as pd + +from spectrum_io.search_result.msfragger import MSFragger + + +class TestMSFragger(unittest.TestCase): + """Class to test MSFragger.""" + + def test_read_result(self): + """Test read_result for MSFragger.""" + msfragger = MSFragger(str(Path(__file__).parent / "data/")) + df = msfragger.read_result(str(Path(__file__).parent / "data/psm.tsv"), "") + self.assertIsInstance(df, pd.DataFrame) + self.assertTrue("SEQUENCE" in df.columns) + self.assertTrue("MODIFIED_SEQUENCE" in df.columns) + self.assertTrue("MASS" in df.columns) + self.assertTrue("SCORE" in df.columns) + self.assertTrue("PRECURSOR_CHARGE" in df.columns) + self.assertTrue("RAW_FILE" in df.columns) + self.assertTrue("SCAN_NUMBER" in df.columns) + self.assertTrue("REVERSE" in df.columns) + self.assertTrue("PEPTIDE_LENGTH" in df.columns) + self.assertTrue("SPECTRUM" in df.columns) + self.assertTrue("RETENTION_TIME" in df.columns) From 8b0c87d5bc2df85f89139d56dc92c1a0ce593978 Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Mon, 7 Aug 2023 17:10:05 +0200 Subject: [PATCH 3/4] use pyteomics and pepXML to read msfragger --- spectrum_io/search_result/msfragger.py | 160 +++++++++----------- spectrum_io/search_result/search_results.py | 4 +- 2 files changed, 75 insertions(+), 89 deletions(-) diff --git a/spectrum_io/search_result/msfragger.py b/spectrum_io/search_result/msfragger.py index 66e0727..a066acf 100644 --- a/spectrum_io/search_result/msfragger.py +++ b/spectrum_io/search_result/msfragger.py @@ -1,11 +1,12 @@ import logging -import re from pathlib import Path from typing import Union import pandas as pd import spectrum_fundamentals.constants as c +from pyteomics import pepxml from spectrum_fundamentals.mod_string import internal_without_mods +from tqdm import tqdm from .search_results import SearchResults, filter_valid_prosit_sequences @@ -20,95 +21,80 @@ def read_result(path: Union[str, Path], tmt_labeled: str) -> pd.DataFrame: """ Function to read a msms txt and perform some basic formatting. - :param path: path to msms.txt to read + :param path: path to pepXML folder or single pepXML file to read :param tmt_labeled: tmt label as str + :raises FileNotFoundError: in case the given path is neither a file, nor a directory. :return: pd.DataFrame with the formatted data """ - logger.info("Reading msfragger tsv file") - df = pd.read_csv( - path, - usecols=lambda x: x.upper() - in [ - "PEPTIDE", - "PROTEIN", - "PEPTIDE LENGTH", - "SPECTRUM FILE", - "SPECTRUM", - "ASSIGNED MODIFICATIONS", - "CHARGE", - "LABELING STATE", - "OBSERVED MASS", # = Calculated Precursor mass; TODO get column with experimental Precursor mass instead - "HYPERSCORE", - "RETENTION", - "VARIABLE MODIFICATIONS DETECTED (STARTS WITH M, SEPARATED BY |, FORMATED AS POSITION,MASS)", - ], - sep="\t", - ) - logger.info("Finished reading msfragger tsv file") - - df.rename( - columns={ - "Peptide": "SEQUENCE", - "Assigned Modifications": "MODIFICATIONS", - "Observed Mass": "MASS", - "Hyperscore": "SCORE", - "Retention": "RETENTION TIME", - "Spectrum File": "RAW FILE", - "Variable modifications detected (starts with M, separated by |, formated as position,mass)": "MODIFICATIONS", - }, - inplace=True, - ) - - # Standardize column names - df.columns = df.columns.str.upper() - df.columns = df.columns.str.replace(" ", "_") - - df = MSFragger.update_columns_for_prosit(df, tmt_labeled) + if path.is_file(): + file_list = [path] + elif path.is_dir(): + file_list = path.rglob("*.pepXML") + else: + raise FileNotFoundError(f"{path} could not be found.") + + ms_frag_results = [] + for pep_xml_file in tqdm(list(file_list)): + ms_frag_results.append(pepxml.DataFrame(str(pep_xml_file))) + + df = pd.concat(ms_frag_results) + + df = update_columns_for_prosit(df, "") return filter_valid_prosit_sequences(df) - @staticmethod - def update_columns_for_prosit(df, tmt_labeled: str) -> pd.DataFrame: - """ - Update columns of df to work with Prosit. - :param df: df to modify - :param tmt_labeled: True if tmt labeled - :return: modified df as pd.DataFrame - """ - df.rename(columns={"CHARGE": "PRECURSOR_CHARGE"}, inplace=True) - df[["RAW_FILE", "SCAN_NUMBER"]] = df["SPECTRUM"].str.split(".", expand=True, n=2)[[0, 1]] - df["REVERSE"] = df["PROTEIN"].str.contains("Reverse") - - df["MODIFICATIONS"] = df["MODIFICATIONS"].fillna(0) - mod_masses_reverse = {round(float(v), 3): k for k, v in c.MOD_MASSES.items()} - sequences = [] - for _, row in df.iterrows(): - modifications = row["MODIFICATIONS"] - if modifications == 0: - sequences.append(row["SEQUENCE"]) - else: - modifications = modifications.split(", ") - sequence = row["SEQUENCE"] - skip = 0 - for mod in sorted( - modifications, key=lambda s: 0 if s.startswith("N") else int(re.sub("[^0-9]", "", s.split("(")[0])) - ): - pos, mass = mod.split("(") - mass = mass.replace(")", "") - if pos == "N-term": - continue - else: - pos = re.sub("[^0-9]", "", pos) - sequence = ( - sequence[: int(pos) + skip] - + mod_masses_reverse[round(float(mass), 3)] - + sequence[int(pos) + skip :] - ) - skip = skip + len(mod_masses_reverse[round(float(mass), 3)]) - sequences.append(sequence) - df["MODIFIED_SEQUENCE"] = sequences - - df["SEQUENCE"] = internal_without_mods(df["MODIFIED_SEQUENCE"]) - df["PEPTIDE_LENGTH"] = df["SEQUENCE"].apply(lambda x: len(x)) - - return df +def update_columns_for_prosit(df, tmt_labeled: str) -> pd.DataFrame: + """ + Update columns of df to work with Prosit. + + :param df: df to modify + :param tmt_labeled: True if tmt labeled + :return: modified df as pd.DataFrame + """ + df["REVERSE"] = df["protein"].apply(lambda x: "rev" in str(x)) + df["RAW_FILE"] = df["spectrum"].apply(lambda x: x.split(".")[0]) + df["MASS"] = df["precursor_neutral_mass"] + df["PEPTIDE_LENGTH"] = df["peptide"].apply(lambda x: len(x)) + df["MODIFIED_SEQUENCE"] = msfragger_to_internal(df["modified_peptide"]) + df.rename( + columns={ + "assumed_charge": "PRECURSOR_CHARGE", + "index": "SCAN_EVENT_NUMBER", + "peptide": "SEQUENCE", + "start_scan": "SCAN_NUMBER", + "hyperscore": "SCORE", + }, + inplace=True, + ) + df["SEQUENCE"] = internal_without_mods(df["MODIFIED_SEQUENCE"]) + return df[ + [ + "RAW_FILE", + "SCAN_NUMBER", + "MODIFIED_SEQUENCE", + "PRECURSOR_CHARGE", + "SCAN_EVENT_NUMBER", + "MASS", + "SCORE", + "REVERSE", + "SEQUENCE", + "PEPTIDE_LENGTH", + ] + ] + + +def msfragger_to_internal(modstrings: pd.Series): + """ + Transform modstring from msfragger format to internal format. + + This function takes a modstrings column from a pandas dataframe and converts each + supported modification (M[147] and C[160]) to the internal representation that is + M[UNIMOD:35] and C[UNIMOD:4], respectively. Since C is considered a fixed modification, + every occurence of a C is transformed to C[UNIMOD:4] as well. + + :param modstrings: pd.Series containing the msfragger modstrings + :return: pd.Series with internal modstrings + """ + modstrings = modstrings.str.replace("M[147]", "M[UNIMOD:35]", regex=False) + modstrings = modstrings.str.replace(r"C\[160\]|C", "C[UNIMOD:4]", regex=True) + return modstrings diff --git a/spectrum_io/search_result/search_results.py b/spectrum_io/search_result/search_results.py index c71a5d9..f1ed69d 100644 --- a/spectrum_io/search_result/search_results.py +++ b/spectrum_io/search_result/search_results.py @@ -22,8 +22,8 @@ def filter_valid_prosit_sequences(df: pd.DataFrame) -> pd.DataFrame: # retain only peptides that fall within [7, 30] length supported by Prosit df = df[(df["PEPTIDE_LENGTH"] <= 30) & (df["PEPTIDE_LENGTH"] >= 7)] # remove unsupported mods to exclude - unsupported_mods = ["Acetyl (Protein N-term)", "ac"] - exclude_mods_pattern = re.compile("|".join(map(re.escape, unsupported_mods))) + unsupported_mods = [r"Acetyl \(Protein N\-term\)", "ac", r"\[[0-9]+\]"] + exclude_mods_pattern = re.compile("|".join(unsupported_mods)) df = df[~df["MODIFIED_SEQUENCE"].str.contains(exclude_mods_pattern)] # remove non-canonical aas df = df[(~df["SEQUENCE"].str.contains("U|O"))] From f995b82873288c53b49f8bebca5a9b1754bf89c6 Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Mon, 7 Aug 2023 17:40:59 +0200 Subject: [PATCH 4/4] fixed unit test and mypy/typeguard --- spectrum_io/search_result/msfragger.py | 7 +- tests/unit_tests/data/psm.pepXML | 161 +++++++++++++++++++++++++ tests/unit_tests/data/psm.tsv | 11 -- tests/unit_tests/test_msfragger.py | 15 ++- 4 files changed, 173 insertions(+), 21 deletions(-) create mode 100644 tests/unit_tests/data/psm.pepXML delete mode 100644 tests/unit_tests/data/psm.tsv diff --git a/spectrum_io/search_result/msfragger.py b/spectrum_io/search_result/msfragger.py index a066acf..a55ac15 100644 --- a/spectrum_io/search_result/msfragger.py +++ b/spectrum_io/search_result/msfragger.py @@ -26,15 +26,18 @@ def read_result(path: Union[str, Path], tmt_labeled: str) -> pd.DataFrame: :raises FileNotFoundError: in case the given path is neither a file, nor a directory. :return: pd.DataFrame with the formatted data """ + if isinstance(path, str): + path = Path(path) + if path.is_file(): file_list = [path] elif path.is_dir(): - file_list = path.rglob("*.pepXML") + file_list = list(path.rglob("*.pepXML")) else: raise FileNotFoundError(f"{path} could not be found.") ms_frag_results = [] - for pep_xml_file in tqdm(list(file_list)): + for pep_xml_file in tqdm(file_list): ms_frag_results.append(pepxml.DataFrame(str(pep_xml_file))) df = pd.concat(ms_frag_results) diff --git a/tests/unit_tests/data/psm.pepXML b/tests/unit_tests/data/psm.pepXML new file mode 100644 index 0000000..9922557 --- /dev/null +++ b/tests/unit_tests/data/psm.pepXML @@ -0,0 +1,161 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/unit_tests/data/psm.tsv b/tests/unit_tests/data/psm.tsv deleted file mode 100644 index 0f5a753..0000000 --- a/tests/unit_tests/data/psm.tsv +++ /dev/null @@ -1,11 +0,0 @@ -Spectrum Spectrum File Peptide Modified Peptide Prev AA Next AA Peptide Length Charge Retention Observed Mass Calibrated Observed Mass Observed M/Z Calibrated Observed M/Z Calculated Peptide Mass Calculated M/Z Delta Mass Expectation Hyperscore Nextscore PeptideProphet Probability Number of Enzymatic Termini Number of Missed Cleavages Protein Start Protein End Intensity Assigned Modifications Observed Modifications Is Unique Protein Protein ID Entry Name Gene Protein Description Mapped Genes Mapped Proteins -YIG_085_L006_30_01.00375.00375.3 U:\internal_projects\L006_peptidome\MQ_searches\230203_L006_85_MetOH-ACN_PROCAL\MSFragger\output\interact-YIG_085_L006_30_01.pep.xml DEAGSEADHEGTHSTKRGHA A K 20 3 100.2141 2090.9014 2090.9014 697.9744 697.9744 2090.9001 697.9740 0.0012 0.03333000000000 19.4340 0.0000 0.9571 0 0 605 624 0.0000 true sp|P02671|FIBA_HUMAN P02671 FIBA_HUMAN FGA Fibrinogen alpha chain -YIG_085_L006_30_01.00420.00420.3 U:\internal_projects\L006_peptidome\MQ_searches\230203_L006_85_MetOH-ACN_PROCAL\MSFragger\output\interact-YIG_085_L006_30_01.pep.xml DEAGSEADHEGTHSTKRG A H 18 3 112.1268 1882.8008 1882.8008 628.6075 628.6075 1882.8041 628.6086 -0.0033 0.00000017520000 43.2000 12.3400 1.0000 0 0 605 622 0.0000 true sp|P02671|FIBA_HUMAN P02671 FIBA_HUMAN FGA Fibrinogen alpha chain -YIG_085_L006_30_01.00423.00423.4 U:\internal_projects\L006_peptidome\MQ_searches\230203_L006_85_MetOH-ACN_PROCAL\MSFragger\output\interact-YIG_085_L006_30_01.pep.xml DEAGSEADHEGTHSTKRG A H 18 4 112.8499 1882.8019 1882.8019 471.7078 471.7078 1882.8041 471.7083 -0.0022 0.00000003950000 38.3490 13.4780 1.0000 0 0 605 622 0.0000 true sp|P02671|FIBA_HUMAN P02671 FIBA_HUMAN FGA Fibrinogen alpha chain -YIG_085_L006_30_01.00424.00424.3 U:\internal_projects\L006_peptidome\MQ_searches\230203_L006_85_MetOH-ACN_PROCAL\MSFragger\output\interact-YIG_085_L006_30_01.pep.xml DEAGSEADHEGTHSTKR A G 17 3 113.1629 1825.7810 1825.7810 609.6009 609.6009 1825.7826 609.6015 -0.0015 0.00000341200000 41.5720 17.8670 1.0000 0 0 605 621 0.0000 true sp|P02671|FIBA_HUMAN P02671 FIBA_HUMAN FGA Fibrinogen alpha chain -YIG_085_L006_30_01.00428.00428.3 U:\internal_projects\L006_peptidome\MQ_searches\230203_L006_85_MetOH-ACN_PROCAL\MSFragger\output\interact-YIG_085_L006_30_01.pep.xml NRVGKVEHGSVA Y L 12 3 114.2047 1251.6663 1251.6663 418.2294 418.2294 1251.6683 418.2300 -0.0020 0.00121700000000 20.9290 11.5700 1.0000 0 0 437 448 0.0000 true sp|Q9Y490|TLN1_HUMAN Q9Y490 TLN1_HUMAN TLN1 Talin-1 -YIG_085_L006_30_01.00432.00432.3 U:\internal_projects\L006_peptidome\MQ_searches\230203_L006_85_MetOH-ACN_PROCAL\MSFragger\output\interact-YIG_085_L006_30_01.pep.xml EEGKRHPYKMNLA EEGKRHPYKM[147]NLA T S 13 3 115.1439 1587.7778 1587.7778 530.2665 530.2665 1587.7826 530.2681 -0.0047 0.00089230000000 22.2240 14.5200 1.0000 0 0 95 107 0.0000 10M(15.9949) true sp|O00151|PDLI1_HUMAN O00151 PDLI1_HUMAN PDLIM1 PDZ and LIM domain protein 1 -YIG_085_L006_30_01.00439.00439.4 U:\internal_projects\L006_peptidome\MQ_searches\230203_L006_85_MetOH-ACN_PROCAL\MSFragger\output\interact-YIG_085_L006_30_01.pep.xml DEAGSEADHEGTHSTKRGHAKSRPV A R 25 4 117.2938 2658.2427 2658.2427 665.5680 665.5680 2658.2493 665.5696 -0.0065 0.00009422000000 28.6670 11.8630 1.0000 0 0 605 629 0.0000 true sp|P02671|FIBA_HUMAN P02671 FIBA_HUMAN FGA Fibrinogen alpha chain -YIG_085_L006_30_01.00449.00449.3 U:\internal_projects\L006_peptidome\MQ_searches\230203_L006_85_MetOH-ACN_PROCAL\MSFragger\output\interact-YIG_085_L006_30_01.pep.xml KPEFHEDTR Q S 9 3 119.9727 1157.5449 1157.5449 386.8556 386.8556 1157.5464 386.8561 -0.0014 0.00225800000000 17.7940 0.0000 0.9999 0 0 54 62 0.0000 true sp|Q9NS28|RGS18_HUMAN Q9NS28 RGS18_HUMAN RGS18 Regulator of G-protein signaling 18 -YIG_085_L006_30_01.00469.00469.2 U:\internal_projects\L006_peptidome\MQ_searches\230203_L006_85_MetOH-ACN_PROCAL\MSFragger\output\interact-YIG_085_L006_30_01.pep.xml VGKVEHGSVA R L 10 2 125.7287 981.5227 981.5227 491.7686 491.7686 981.5243 491.7694 -0.0015 0.00000236400000 31.2090 12.4440 1.0000 0 0 439 448 0.0000 true sp|Q9Y490|TLN1_HUMAN Q9Y490 TLN1_HUMAN TLN1 Talin-1 -YIG_085_L006_30_01.00490.00490.2 U:\internal_projects\L006_peptidome\MQ_searches\230203_L006_85_MetOH-ACN_PROCAL\MSFragger\output\interact-YIG_085_L006_30_01.pep.xml DEAGSEADHEGTHS A T 14 2 131.7005 1440.5374 1440.5374 721.2760 721.2760 1440.5388 721.2767 -0.0014 0.00000604300000 34.3750 10.4530 1.0000 0 0 605 618 0.0000 true sp|P02671|FIBA_HUMAN P02671 FIBA_HUMAN FGA Fibrinogen alpha chain diff --git a/tests/unit_tests/test_msfragger.py b/tests/unit_tests/test_msfragger.py index 5df8a28..cc6ce14 100644 --- a/tests/unit_tests/test_msfragger.py +++ b/tests/unit_tests/test_msfragger.py @@ -11,17 +11,16 @@ class TestMSFragger(unittest.TestCase): def test_read_result(self): """Test read_result for MSFragger.""" - msfragger = MSFragger(str(Path(__file__).parent / "data/")) - df = msfragger.read_result(str(Path(__file__).parent / "data/psm.tsv"), "") + msfragger = MSFragger(Path(__file__).parent / "data/") + df = msfragger.read_result(Path(__file__).parent / "data/psm.pepXML", "") self.assertIsInstance(df, pd.DataFrame) - self.assertTrue("SEQUENCE" in df.columns) + self.assertTrue("RAW_FILE" in df.columns) + self.assertTrue("SCAN_NUMBER" in df.columns) + self.assertTrue("PRECURSOR_CHARGE" in df.columns) + self.assertTrue("SCAN_EVENT_NUMBER" in df.columns) self.assertTrue("MODIFIED_SEQUENCE" in df.columns) self.assertTrue("MASS" in df.columns) self.assertTrue("SCORE" in df.columns) - self.assertTrue("PRECURSOR_CHARGE" in df.columns) - self.assertTrue("RAW_FILE" in df.columns) - self.assertTrue("SCAN_NUMBER" in df.columns) self.assertTrue("REVERSE" in df.columns) + self.assertTrue("SEQUENCE" in df.columns) self.assertTrue("PEPTIDE_LENGTH" in df.columns) - self.assertTrue("SPECTRUM" in df.columns) - self.assertTrue("RETENTION_TIME" in df.columns)