From 3fe1f92a323cbb156813787f3089111162bd05d5 Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Sun, 26 May 2024 11:02:32 +0200 Subject: [PATCH 1/4] fix unknown mods not filtered out for sage --- spectrum_io/search_result/sage.py | 2 +- spectrum_io/search_result/search_results.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spectrum_io/search_result/sage.py b/spectrum_io/search_result/sage.py index 5aa4caa..f03b504 100644 --- a/spectrum_io/search_result/sage.py +++ b/spectrum_io/search_result/sage.py @@ -58,7 +58,7 @@ def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str) -> pd.DataFram ) # removing .mzML - df["RAW_FILE"] = df["RAW_FILE"].str.replace(".mzML", "", regex=True) + df["RAW_FILE"] = df["RAW_FILE"].str.replace(r"\.mz[M|m][l|L]", "", regex=True) # extracting only the scan number df["SCAN_NUMBER"] = [int(x.rsplit("=", 1)[-1]) for x in df["SCAN_NUMBER"]] # creating a column of decoys and targets diff --git a/spectrum_io/search_result/search_results.py b/spectrum_io/search_result/search_results.py index 2160873..8e9a6b5 100644 --- a/spectrum_io/search_result/search_results.py +++ b/spectrum_io/search_result/search_results.py @@ -22,7 +22,7 @@ def filter_valid_prosit_sequences(df: pd.DataFrame) -> pd.DataFrame: # retain only peptides that fall within [7, 30] length supported by Prosit df = df[(df["PEPTIDE_LENGTH"] <= 30) & (df["PEPTIDE_LENGTH"] >= 7)] # remove unsupported mods to exclude - unsupported_mods = [r"Acetyl \(Protein N\-term\)", "ac", r"\[[0-9]+\]"] + unsupported_mods = [r"Acetyl \(Protein N\-term\)", "ac", r"\[[0-9]+\]", r"\+"] exclude_mods_pattern = re.compile("|".join(unsupported_mods)) df = df[~df["MODIFIED_SEQUENCE"].str.contains(exclude_mods_pattern)] # remove non-canonical aas From ff37316338b264041ba01fcfe7ca778d7cee7a48 Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Sun, 26 May 2024 11:08:30 +0200 Subject: [PATCH 2/4] fix mod masses for unit test --- tests/unit_tests/data/sage_output.tsv | 12 ++++++------ tests/unit_tests/test_sage.py | 5 +++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/unit_tests/data/sage_output.tsv b/tests/unit_tests/data/sage_output.tsv index 043e04d..0b1596e 100644 --- a/tests/unit_tests/data/sage_output.tsv +++ b/tests/unit_tests/data/sage_output.tsv @@ -1,17 +1,17 @@ peptide proteins num_proteins filename scannr rank label expmass calcmass charge peptide_len missed_cleavages isotope_error precursor_ppm fragment_ppm hyperscore delta_next delta_best rt aligned_rt predicted_rt delta_rt_model matched_peaks longest_b longest_y longest_y_pct matched_intensity_pct scored_candidates poisson sage_discriminant_score posterior_error spectrum_q peptide_q protein_q ms1_intensity ms2_intensity [+229.1629]-HLDGGAEQSLLFVAGM[+15.9949]R rev_sp|P26006-1|ITA3_HUMAN;rev_sp|P26006|ITA3_HUMAN 2 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=50989 1 -1 2199.172 2045.0568 2 17 0 0.0 72623.49 7.826053 18.084579770792818 0.20642824189008735 0.0 113.94822 0.9495685 0.0 0.9495685 4 2 1 0.05882353 1.318411 84602 -2.0562283019710534 -0.37137848 -0.35820845 0.30232558 0.5880901 0.5815684 926232.0 7361.109 [+229.1629]-GRFVEPLSNVQEEWNQK[+229.1629] rev_sp|Q9H1A3-2|METL9_HUMAN;rev_sp|Q9H1A3|METL9_HUMAN;rev_tr|H3BN86|H3BN86_HUMAN;rev_tr|Q8TD49|Q8TD49_HUMAN 4 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=37495 1 -1 2671.5417 2517.338 2 17 1 0.0 59436.277 4.4063077 21.44256145543081 0.24493444146309074 0.0 83.519325 0.6959944 0.0 0.6959944 4 1 2 0.11764706 3.139837 130027 -2.014281373814444 -0.37227735 -0.35820845 0.30232558 0.5880901 0.5815684 2674831.2 42405.516 -[+229.1629]-LTVEC[+57.0214]MPTIASDDLPVGTLQESEVSM[+15.9949]TGPG rev_tr|C9JVX2|C9JVX2_HUMAN 1 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=45450 1 -1 3533.4688 3378.6082 3 30 0 0.0 44808.7 2.583788 18.628243636678995 0.4911733270242351 0.0 102.02221 0.8501851 0.0 0.8501851 4 1 2 0.06666667 1.6194082 80638 -2.046675752396668 -0.3724502 -0.35820845 0.30232558 0.5880901 0.5815684 643726.75 8588.239 +[+229.1629]-LTVEC[+57.0215]MPTIASDDLPVGTLQESEVSM[+15.9949]TGPG rev_tr|C9JVX2|C9JVX2_HUMAN 1 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=45450 1 -1 3533.4688 3378.6082 3 30 0 0.0 44808.7 2.583788 18.628243636678995 0.4911733270242351 0.0 102.02221 0.8501851 0.0 0.8501851 4 1 2 0.06666667 1.6194082 80638 -2.046675752396668 -0.3724502 -0.35820845 0.30232558 0.5880901 0.5815684 643726.75 8588.239 [+229.1629]-VNM[+15.9949]RTSSSIQNEDEATSMELIAPGP sp|Q9Y5Y9|SCNAA_HUMAN;tr|A0A2R8Y6J6|A0A2R8Y6J6_HUMAN;tr|A0A590UJM0|A0A590UJM0_HUMAN 3 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=45493 1 1 3074.5525 2921.3948 3 25 1 0.0 51087.08 5.391072 19.979716393086438 0.2293833747943026 0.0 102.12942 0.85107845 0.0 0.85107845 5 1 2 0.08 1.4697478 154744 -2.71207222963538 -0.3737817 -0.35820845 0.30232558 0.5880901 0.5815684 1085421.8 8865.469 [+229.1629]-VGEQEAPHEGGHPGSDSARASMADWLR sp|Q9H093|NUAK2_HUMAN 1 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=41946 1 1 3229.3804 3075.4417 3 27 1 0.0 48832.06 2.6081188 20.936078174830033 0.05364075290029291 0.0 93.63916 0.7803263 0.0 0.7803263 4 1 1 0.037037037 6.790568 109802 -2.034297610829216 -0.37500644 -0.35820845 0.30232558 0.5880901 0.5815684 604286.9 36628.875 -[+229.1629]-EM[+15.9949]VSPTDSC[+57.0214]VRVSVRDLPQFHVSVVDM[+15.9949]DR rev_sp|Q9HCN3|PGAP6_HUMAN;rev_tr|K4DI83|K4DI83_HUMAN 2 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=39750 1 -1 3924.602 3620.7583 3 29 2 0.0 80537.9 4.2269964 17.954540767022678 0.13827490088740646 0.0 88.40008 0.73666734 0.0 0.73666734 4 1 2 0.06896552 2.1663432 50127 -2.0296029706779146 -0.375757 -0.35820845 0.30232558 0.5880901 0.5815684 1711349.0 6906.8516 +[+229.1629]-EM[+15.9949]VSPTDSC[+57.0215]VRVSVRDLPQFHVSVVDM[+15.9949]DR rev_sp|Q9HCN3|PGAP6_HUMAN;rev_tr|K4DI83|K4DI83_HUMAN 2 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=39750 1 -1 3924.602 3620.7583 3 29 2 0.0 80537.9 4.2269964 17.954540767022678 0.13827490088740646 0.0 88.40008 0.73666734 0.0 0.73666734 4 1 2 0.06896552 2.1663432 50127 -2.0296029706779146 -0.375757 -0.35820845 0.30232558 0.5880901 0.5815684 1711349.0 6906.8516 [+229.1629]-K[+229.1629]M[+15.9949]EEDIYTNLSK[+229.1629]METVLGQSMSSLPLSYR sp|Q8WXH0-2|SYNE2_HUMAN;sp|Q8WXH0|SYNE2_HUMAN;tr|A0A0A0MRE3|A0A0A0MRE3_HUMAN;tr|G3V5X4|G3V5X4_HUMAN 4 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=45441 2 1 4205.6133 4053.1077 4 29 2 0.0 36932.02 7.9742827 19.962357128713123 0.0 0.0 101.99851 0.8499876 0.0 0.8499876 4 1 2 0.06896552 2.6182032 65126 -1.9825140292603154 -0.37760013 -0.35820845 0.30232558 0.5880901 0.5815684 1558514.0 15372.994 -[+229.1629]-TC[+57.0214]SK[+229.1629]SQGSWGNREIVIIDTPDMFSWK[+229.1629] sp|Q9UG22|GIMA2_HUMAN 1 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=36758 1 1 3881.789 3728.9263 3 26 2 0.0 40170.414 8.588704 18.776753308950084 2.853361183504054 0.0 81.83095 0.6819246 0.0 0.6819246 4 1 1 0.03846154 2.170038 54463 -2.02115128698497 -0.3785715 -0.35820845 0.30232558 0.5880901 0.5815684 1264436.8 13284.256 +[+229.1629]-TC[+57.0215]SK[+229.1629]SQGSWGNREIVIIDTPDMFSWK[+229.1629] sp|Q9UG22|GIMA2_HUMAN 1 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=36758 1 1 3881.789 3728.9263 3 26 2 0.0 40170.414 8.588704 18.776753308950084 2.853361183504054 0.0 81.83095 0.6819246 0.0 0.6819246 4 1 1 0.03846154 2.170038 54463 -2.02115128698497 -0.3785715 -0.35820845 0.30232558 0.5880901 0.5815684 1264436.8 13284.256 [+229.1629]-M[+15.9949]SLGRAAPSAPGR rev_sp|P51693-2|APLP1_HUMAN;rev_sp|P51693|APLP1_HUMAN;rev_tr|K7EMS1|K7EMS1_HUMAN 3 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=49100 1 -1 1669.267 1514.819 2 13 1 0.0 97012.45 4.073574 20.66450790410668 0.2021918361478079 0.0 109.23304 0.91027534 0.0 0.91027534 5 1 3 0.23076923 1.8854505 176106 -2.684809584132347 -0.37956935 -0.35820845 0.30232558 0.5880901 0.5815684 611105.25 12195.32 -[+229.1629]-C[+57.0214]LIQM[+15.9949]GAAVEAK[+229.1629]AYNGNTALHVAASLQYR tr|H7C5S1|H7C5S1_HUMAN 1 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=37376 1 1 3897.3843 3593.885 3 29 1 0.0 81027.46 5.0039053 21.88178474607965 0.07848479239995854 0.0 83.277466 0.6939789 0.0 0.6939789 4 1 2 0.06896552 2.1495197 58331 -2.045800246831915 -0.37965888 -0.35820845 0.30232558 0.5880901 0.5815684 18796730.0 52520.473 +[+229.1629]-C[+57.0215]LIQM[+15.9949]GAAVEAK[+229.1629]AYNGNTALHVAASLQYR tr|H7C5S1|H7C5S1_HUMAN 1 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=37376 1 1 3897.3843 3593.885 3 29 1 0.0 81027.46 5.0039053 21.88178474607965 0.07848479239995854 0.0 83.277466 0.6939789 0.0 0.6939789 4 1 2 0.06896552 2.1495197 58331 -2.045800246831915 -0.37965888 -0.35820845 0.30232558 0.5880901 0.5815684 18796730.0 52520.473 [+229.1629]-M[+15.9949]EESLNIVK[+229.1629]YTAFLYNDQLIWSGLEQDDMR sp|P86790|CCZ1B_HUMAN;sp|P86791|CCZ1_HUMAN 2 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=41296 1 1 4398.7207 4095.037 3 30 1 0.0 71507.48 3.7571762 22.91659667770412 2.159702635767953 0.0 92.108765 0.76757306 0.0 0.76757306 4 2 2 0.06666667 18.10546 11995 -2.0440490580574484 -0.38045537 -0.35820845 0.30232558 0.5880901 0.5815684 1328658.9 71828.805 -[+229.1629]-LNVEGTERGSC[+57.0214]GRK[+229.1629] sp|O75078-2|ADA11_HUMAN;sp|O75078|ADA11_HUMAN;tr|B4DKD2|B4DKD2_HUMAN;tr|K7EKA8|K7EKA8_HUMAN 4 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=40784 2 1 2174.1533 2020.0887 3 14 2 0.0 73464.8 5.374492 21.842498887625368 0.6170763228642393 0.0 90.874275 0.75728565 0.0 0.75728565 5 3 2 0.14285715 2.8490458 237683 -2.6522210241481585 -0.3812077 -0.35820845 0.30232558 0.5880901 0.5815684 1290087.5 21134.262 -[+229.1629]-C[+57.0214]NRGWTALHESVSR sp|Q96Q27-1|ASB2_HUMAN;sp|Q96Q27|ASB2_HUMAN;tr|G3V2Z2|G3V2Z2_HUMAN;tr|G3V4B2|G3V4B2_HUMAN 4 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=51002 2 1 2055.9688 1900.9526 2 14 1 0.0 78351.875 5.277066 17.823081883875247 0.012034867562086049 0.0 113.98048 0.9498373 0.0 0.9498373 4 1 1 0.071428575 1.2503061 115129 -2.0307522097044646 -0.3828116 -0.35820845 0.30232558 0.5880901 0.5815684 977832.06 5506.2334 +[+229.1629]-LNVEGTERGSC[+57.0215]GRK[+229.1629] sp|O75078-2|ADA11_HUMAN;sp|O75078|ADA11_HUMAN;tr|B4DKD2|B4DKD2_HUMAN;tr|K7EKA8|K7EKA8_HUMAN 4 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=40784 2 1 2174.1533 2020.0887 3 14 2 0.0 73464.8 5.374492 21.842498887625368 0.6170763228642393 0.0 90.874275 0.75728565 0.0 0.75728565 5 3 2 0.14285715 2.8490458 237683 -2.6522210241481585 -0.3812077 -0.35820845 0.30232558 0.5880901 0.5815684 1290087.5 21134.262 +[+229.1629]-C[+57.0215]NRGWTALHESVSR sp|Q96Q27-1|ASB2_HUMAN;sp|Q96Q27|ASB2_HUMAN;tr|G3V2Z2|G3V2Z2_HUMAN;tr|G3V4B2|G3V4B2_HUMAN 4 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=51002 2 1 2055.9688 1900.9526 2 14 1 0.0 78351.875 5.277066 17.823081883875247 0.012034867562086049 0.0 113.98048 0.9498373 0.0 0.9498373 4 1 1 0.071428575 1.2503061 115129 -2.0307522097044646 -0.3828116 -0.35820845 0.30232558 0.5880901 0.5815684 977832.06 5506.2334 [+229.1629]-AEVDNQMHVVDK[+229.1629]NPVSLVSK[+229.1629]TR rev_sp|O75151|PHF2_HUMAN 1 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=40715 2 -1 3307.599 3152.7583 3 22 2 0.0 47935.68 2.6617033 19.971965322167343 1.5655510246394897 0.0 90.70834 0.7559029 0.0 0.7559029 4 2 1 0.045454547 2.7450855 107384 -2.0235551137874417 -0.38507006 -0.35820845 0.30232558 0.5880901 0.5815684 923512.1 14331.008 [+229.1629]-YLLSLEEERPALMDDR sp|Q86TB9-2|PATL1_HUMAN;sp|Q86TB9-4|PATL1_HUMAN;sp|Q86TB9|PATL1_HUMAN 3 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=37506 1 1 2331.2417 2178.1194 2 16 0 0.0 67913.086 2.5317562 19.997613788592076 0.39086524401945155 0.0 83.542175 0.6961848 0.0 0.6961848 4 1 2 0.125 4.0383697 80445 -2.062325498085077 -0.38619247 -0.35820845 0.30232558 0.5880901 0.5815684 3893642.5 14758.541 [+229.1629]-EGRGAGSQSPPRGR sp|Q6ZSN1|YI023_HUMAN 1 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=45427 1 1 1942.9047 1639.8704 2 14 2 0.0 169161.78 4.145797 19.229664162247772 0.4602391019734071 0.0 101.96355 0.8496962 0.0 0.8496962 4 1 1 0.071428575 4.303446 136956 -2.026177700336816 -0.3904131 -0.35820845 0.30232558 0.5880901 0.5815684 993124.44 14842.607 diff --git a/tests/unit_tests/test_sage.py b/tests/unit_tests/test_sage.py index 40dcfb0..9faf7e7 100644 --- a/tests/unit_tests/test_sage.py +++ b/tests/unit_tests/test_sage.py @@ -12,7 +12,8 @@ class TestSage(unittest.TestCase): def test_read_sage(self): """Test function for reading sage results and transforming to Prosit format.""" expected_sage_internal_path = Path(__file__).parent / "data" / "sage_output_internal.csv" - internal_search_results_df = Sage(Path(__file__).parent / "data" / "sage_output.tsv").read_result() + internal_search_results_df = ( + Sage(Path(__file__).parent / "data" / "sage_output.tsv").read_result().reset_index(drop=True) + ) expected_df = pd.read_csv(expected_sage_internal_path) - pd.testing.assert_frame_equal(internal_search_results_df, expected_df) From 77aa3999884cc93511ead14068bc542d6f1ad40f Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Sun, 26 May 2024 11:11:12 +0200 Subject: [PATCH 3/4] added TIMSTOF instrument type to output --- spectrum_io/d/bruker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spectrum_io/d/bruker.py b/spectrum_io/d/bruker.py index 60cc6b2..2f6da95 100644 --- a/spectrum_io/d/bruker.py +++ b/spectrum_io/d/bruker.py @@ -184,5 +184,6 @@ def read_and_aggregate_timstof(source: Path, tims_meta_file: Path) -> pd.DataFra df_combined["RAW_FILE"] = source.stem df_combined["MASS_ANALYZER"] = "TOF" df_combined["FRAGMENTATION"] = "HCD" + df_combined["INSTRUMENT_TYPES"] = "TIMSTOF" return df_combined From 5a9c28b73acae4a413c99dd413dc5dbdea394767 Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Sun, 26 May 2024 11:12:18 +0200 Subject: [PATCH 4/4] updated spectrum_fundamentals min dep --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0651f6e..72624b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ h5py = "^3.1.0" pymzml = "^2.5.0" pyteomics = "^4.3.3" lxml= '^4.5.2' -spectrum-fundamentals = ">=0.5.2,<0.6.0" +spectrum-fundamentals = ">=0.5.4,<0.6.0" alphatims = "^1.0.8" sortedcontainers = "^2.4.0"