Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/sage #77

Merged
merged 17 commits into from
Nov 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions spectrum_io/search_result/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
from .mascot import Mascot
from .maxquant import MaxQuant
from .msfragger import MSFragger
from .sage import Sage
78 changes: 78 additions & 0 deletions spectrum_io/search_result/sage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import logging
from pathlib import Path
from typing import Union

import pandas as pd
import spectrum_fundamentals.constants as c
from spectrum_fundamentals.mod_string import sage_to_internal

from .search_results import SearchResults, filter_valid_prosit_sequences

# Module-level logger named after this module (via __name__).
logger = logging.getLogger(__name__)


class Sage(SearchResults):
    """Handle search results from Sage."""

    @staticmethod
    def read_result(path: Union[str, Path], tmt_labeled: str = "") -> pd.DataFrame:
        """
        Function to read a Sage result tsv and perform some basic formatting.

        Only the columns needed for the conversion are loaded from the tab-separated file.
        Their names are upper-cased (spaces replaced by underscores) before the frame is
        handed to update_columns_for_prosit and filter_valid_prosit_sequences.

        :param path: path to the tab-separated Sage result file to read
        :param tmt_labeled: tmt label as str, forwarded to update_columns_for_prosit
        :return: pd.DataFrame with the formatted data
        """
        logger.info("Reading msms.tsv file")
        df = pd.read_csv(
            path,
            usecols=["filename", "scannr", "peptide", "charge", "hyperscore", "calcmass", "proteins"],
            sep="\t",
        )
        logger.info("Finished reading msms.tsv file")

        # Standardize column names
        df.columns = df.columns.str.upper().str.replace(" ", "_")

        df = Sage.update_columns_for_prosit(df, tmt_labeled)
        return filter_valid_prosit_sequences(df)

    @staticmethod
    def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str) -> pd.DataFrame:
        """
        Update columns of df to work with Prosit.

        Expects the upper-cased columns FILENAME, SCANNR, PEPTIDE, CHARGE, PROTEINS,
        CALCMASS and HYPERSCORE. The input frame itself is not modified because the
        rename below returns a new frame that all further assignments operate on.

        :param df: df to modify
        :param tmt_labeled: tmt label as str, currently not used by this method
        :return: modified df as pd.DataFrame
        """
        df = df.rename(
            columns={
                "FILENAME": "RAW_FILE",
                "SCANNR": "SCAN_NUMBER",
                "PEPTIDE": "MODIFIED_SEQUENCE",
                "CHARGE": "PRECURSOR_CHARGE",
            }
        )

        # removing .mzML; literal match on purpose, as a regex the unescaped "." would
        # also strip any other character directly in front of "mzML"
        df["RAW_FILE"] = df["RAW_FILE"].str.replace(".mzML", "", regex=False)
        # extracting only the scan number, i.e. whatever follows the last "="
        # (e.g. "controllerType=0 controllerNumber=1 scan=50989" -> 50989);
        # the cast to str keeps this working when the column was parsed as plain integers
        df["SCAN_NUMBER"] = df["SCAN_NUMBER"].astype(str).str.rsplit("=", n=1).str[-1].astype("int64")
        # creating a column of decoys and targets
        df["REVERSE"] = df["PROTEINS"].str.startswith("rev_")
        # removing modification to create the unmodified sequences
        # NOTE(review): only the bracketed masses are removed, so the "-" that follows an
        # n-terminal modification stays in SEQUENCE and is counted in PEPTIDE_LENGTH.
        # Confirm that this is intended before changing it, downstream data relies on it.
        df["SEQUENCE"] = df["MODIFIED_SEQUENCE"].str.replace(r"\[.*?\]", "", regex=True)
        # length of the peptide
        df["PEPTIDE_LENGTH"] = df["SEQUENCE"].str.len()
        # mass of the peptide
        df["MASS"] = df["CALCMASS"]
        # score of the peptide
        df["SCORE"] = df["HYPERSCORE"]
        # converting the bracketed mass notation to the internal UNIMOD notation
        df["MODIFIED_SEQUENCE"] = sage_to_internal(df["MODIFIED_SEQUENCE"])

        return df
18 changes: 18 additions & 0 deletions tests/unit_tests/data/sage_output.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
peptide proteins num_proteins filename scannr rank label expmass calcmass charge peptide_len missed_cleavages isotope_error precursor_ppm fragment_ppm hyperscore delta_next delta_best rt aligned_rt predicted_rt delta_rt_model matched_peaks longest_b longest_y longest_y_pct matched_intensity_pct scored_candidates poisson sage_discriminant_score posterior_error spectrum_q peptide_q protein_q ms1_intensity ms2_intensity
[+229.1629]-HLDGGAEQSLLFVAGM[+15.9949]R rev_sp|P26006-1|ITA3_HUMAN;rev_sp|P26006|ITA3_HUMAN 2 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=50989 1 -1 2199.172 2045.0568 2 17 0 0.0 72623.49 7.826053 18.084579770792818 0.20642824189008735 0.0 113.94822 0.9495685 0.0 0.9495685 4 2 1 0.05882353 1.318411 84602 -2.0562283019710534 -0.37137848 -0.35820845 0.30232558 0.5880901 0.5815684 926232.0 7361.109
[+229.1629]-GRFVEPLSNVQEEWNQK[+229.1629] rev_sp|Q9H1A3-2|METL9_HUMAN;rev_sp|Q9H1A3|METL9_HUMAN;rev_tr|H3BN86|H3BN86_HUMAN;rev_tr|Q8TD49|Q8TD49_HUMAN 4 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=37495 1 -1 2671.5417 2517.338 2 17 1 0.0 59436.277 4.4063077 21.44256145543081 0.24493444146309074 0.0 83.519325 0.6959944 0.0 0.6959944 4 1 2 0.11764706 3.139837 130027 -2.014281373814444 -0.37227735 -0.35820845 0.30232558 0.5880901 0.5815684 2674831.2 42405.516
[+229.1629]-LTVEC[+57.0214]MPTIASDDLPVGTLQESEVSM[+15.9949]TGPG rev_tr|C9JVX2|C9JVX2_HUMAN 1 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=45450 1 -1 3533.4688 3378.6082 3 30 0 0.0 44808.7 2.583788 18.628243636678995 0.4911733270242351 0.0 102.02221 0.8501851 0.0 0.8501851 4 1 2 0.06666667 1.6194082 80638 -2.046675752396668 -0.3724502 -0.35820845 0.30232558 0.5880901 0.5815684 643726.75 8588.239
[+229.1629]-VNM[+15.9949]RTSSSIQNEDEATSMELIAPGP sp|Q9Y5Y9|SCNAA_HUMAN;tr|A0A2R8Y6J6|A0A2R8Y6J6_HUMAN;tr|A0A590UJM0|A0A590UJM0_HUMAN 3 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=45493 1 1 3074.5525 2921.3948 3 25 1 0.0 51087.08 5.391072 19.979716393086438 0.2293833747943026 0.0 102.12942 0.85107845 0.0 0.85107845 5 1 2 0.08 1.4697478 154744 -2.71207222963538 -0.3737817 -0.35820845 0.30232558 0.5880901 0.5815684 1085421.8 8865.469
[+229.1629]-VGEQEAPHEGGHPGSDSARASMADWLR sp|Q9H093|NUAK2_HUMAN 1 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=41946 1 1 3229.3804 3075.4417 3 27 1 0.0 48832.06 2.6081188 20.936078174830033 0.05364075290029291 0.0 93.63916 0.7803263 0.0 0.7803263 4 1 1 0.037037037 6.790568 109802 -2.034297610829216 -0.37500644 -0.35820845 0.30232558 0.5880901 0.5815684 604286.9 36628.875
[+229.1629]-EM[+15.9949]VSPTDSC[+57.0214]VRVSVRDLPQFHVSVVDM[+15.9949]DR rev_sp|Q9HCN3|PGAP6_HUMAN;rev_tr|K4DI83|K4DI83_HUMAN 2 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=39750 1 -1 3924.602 3620.7583 3 29 2 0.0 80537.9 4.2269964 17.954540767022678 0.13827490088740646 0.0 88.40008 0.73666734 0.0 0.73666734 4 1 2 0.06896552 2.1663432 50127 -2.0296029706779146 -0.375757 -0.35820845 0.30232558 0.5880901 0.5815684 1711349.0 6906.8516
[+229.1629]-K[+229.1629]M[+15.9949]EEDIYTNLSK[+229.1629]METVLGQSMSSLPLSYR sp|Q8WXH0-2|SYNE2_HUMAN;sp|Q8WXH0|SYNE2_HUMAN;tr|A0A0A0MRE3|A0A0A0MRE3_HUMAN;tr|G3V5X4|G3V5X4_HUMAN 4 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=45441 2 1 4205.6133 4053.1077 4 29 2 0.0 36932.02 7.9742827 19.962357128713123 0.0 0.0 101.99851 0.8499876 0.0 0.8499876 4 1 2 0.06896552 2.6182032 65126 -1.9825140292603154 -0.37760013 -0.35820845 0.30232558 0.5880901 0.5815684 1558514.0 15372.994
[+229.1629]-TC[+57.0214]SK[+229.1629]SQGSWGNREIVIIDTPDMFSWK[+229.1629] sp|Q9UG22|GIMA2_HUMAN 1 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=36758 1 1 3881.789 3728.9263 3 26 2 0.0 40170.414 8.588704 18.776753308950084 2.853361183504054 0.0 81.83095 0.6819246 0.0 0.6819246 4 1 1 0.03846154 2.170038 54463 -2.02115128698497 -0.3785715 -0.35820845 0.30232558 0.5880901 0.5815684 1264436.8 13284.256
[+229.1629]-M[+15.9949]SLGRAAPSAPGR rev_sp|P51693-2|APLP1_HUMAN;rev_sp|P51693|APLP1_HUMAN;rev_tr|K7EMS1|K7EMS1_HUMAN 3 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=49100 1 -1 1669.267 1514.819 2 13 1 0.0 97012.45 4.073574 20.66450790410668 0.2021918361478079 0.0 109.23304 0.91027534 0.0 0.91027534 5 1 3 0.23076923 1.8854505 176106 -2.684809584132347 -0.37956935 -0.35820845 0.30232558 0.5880901 0.5815684 611105.25 12195.32
[+229.1629]-C[+57.0214]LIQM[+15.9949]GAAVEAK[+229.1629]AYNGNTALHVAASLQYR tr|H7C5S1|H7C5S1_HUMAN 1 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=37376 1 1 3897.3843 3593.885 3 29 1 0.0 81027.46 5.0039053 21.88178474607965 0.07848479239995854 0.0 83.277466 0.6939789 0.0 0.6939789 4 1 2 0.06896552 2.1495197 58331 -2.045800246831915 -0.37965888 -0.35820845 0.30232558 0.5880901 0.5815684 18796730.0 52520.473
[+229.1629]-M[+15.9949]EESLNIVK[+229.1629]YTAFLYNDQLIWSGLEQDDMR sp|P86790|CCZ1B_HUMAN;sp|P86791|CCZ1_HUMAN 2 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=41296 1 1 4398.7207 4095.037 3 30 1 0.0 71507.48 3.7571762 22.91659667770412 2.159702635767953 0.0 92.108765 0.76757306 0.0 0.76757306 4 2 2 0.06666667 18.10546 11995 -2.0440490580574484 -0.38045537 -0.35820845 0.30232558 0.5880901 0.5815684 1328658.9 71828.805
[+229.1629]-LNVEGTERGSC[+57.0214]GRK[+229.1629] sp|O75078-2|ADA11_HUMAN;sp|O75078|ADA11_HUMAN;tr|B4DKD2|B4DKD2_HUMAN;tr|K7EKA8|K7EKA8_HUMAN 4 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=40784 2 1 2174.1533 2020.0887 3 14 2 0.0 73464.8 5.374492 21.842498887625368 0.6170763228642393 0.0 90.874275 0.75728565 0.0 0.75728565 5 3 2 0.14285715 2.8490458 237683 -2.6522210241481585 -0.3812077 -0.35820845 0.30232558 0.5880901 0.5815684 1290087.5 21134.262
[+229.1629]-C[+57.0214]NRGWTALHESVSR sp|Q96Q27-1|ASB2_HUMAN;sp|Q96Q27|ASB2_HUMAN;tr|G3V2Z2|G3V2Z2_HUMAN;tr|G3V4B2|G3V4B2_HUMAN 4 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=51002 2 1 2055.9688 1900.9526 2 14 1 0.0 78351.875 5.277066 17.823081883875247 0.012034867562086049 0.0 113.98048 0.9498373 0.0 0.9498373 4 1 1 0.071428575 1.2503061 115129 -2.0307522097044646 -0.3828116 -0.35820845 0.30232558 0.5880901 0.5815684 977832.06 5506.2334
[+229.1629]-AEVDNQMHVVDK[+229.1629]NPVSLVSK[+229.1629]TR rev_sp|O75151|PHF2_HUMAN 1 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=40715 2 -1 3307.599 3152.7583 3 22 2 0.0 47935.68 2.6617033 19.971965322167343 1.5655510246394897 0.0 90.70834 0.7559029 0.0 0.7559029 4 2 1 0.045454547 2.7450855 107384 -2.0235551137874417 -0.38507006 -0.35820845 0.30232558 0.5880901 0.5815684 923512.1 14331.008
[+229.1629]-YLLSLEEERPALMDDR sp|Q86TB9-2|PATL1_HUMAN;sp|Q86TB9-4|PATL1_HUMAN;sp|Q86TB9|PATL1_HUMAN 3 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=37506 1 1 2331.2417 2178.1194 2 16 0 0.0 67913.086 2.5317562 19.997613788592076 0.39086524401945155 0.0 83.542175 0.6961848 0.0 0.6961848 4 1 2 0.125 4.0383697 80445 -2.062325498085077 -0.38619247 -0.35820845 0.30232558 0.5880901 0.5815684 3893642.5 14758.541
[+229.1629]-EGRGAGSQSPPRGR sp|Q6ZSN1|YI023_HUMAN 1 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=45427 1 1 1942.9047 1639.8704 2 14 2 0.0 169161.78 4.145797 19.229664162247772 0.4602391019734071 0.0 101.96355 0.8496962 0.0 0.8496962 4 1 1 0.071428575 4.303446 136956 -2.026177700336816 -0.3904131 -0.35820845 0.30232558 0.5880901 0.5815684 993124.44 14842.607
[+229.1629]-TASASRRSAR sp|P08729|K2C7_HUMAN 1 GN20170722_SK_HLA_G0103_R1_01.mzML controllerType=0 controllerNumber=1 scan=30700 1 1 1444.0938 1290.7319 2 10 2 0.0 112154.734 2.7423766 19.376349012983994 0.42494292662552624 0.0 67.56542 0.5630452 0.0 0.5630452 4 1 2 0.2 4.1968794 97885 -2.029718921235703 -0.3916049 -0.35820845 0.30232558 0.5880901 0.5815684 266738.5 11889.021
16 changes: 16 additions & 0 deletions tests/unit_tests/data/sage_output_internal.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
,MODIFIED_SEQUENCE,PROTEINS,RAW_FILE,SCAN_NUMBER,CALCMASS,PRECURSOR_CHARGE,HYPERSCORE,REVERSE,SEQUENCE,PEPTIDE_LENGTH,MASS,SCORE
0,[UNIMOD:737]-HLDGGAEQSLLFVAGM[UNIMOD:35]R,rev_sp|P26006-1|ITA3_HUMAN;rev_sp|P26006|ITA3_HUMAN,GN20170722_SK_HLA_G0103_R1_01,50989,2045.0568,2,18.084579770792818,True,-HLDGGAEQSLLFVAGMR,18,2045.0568,18.084579770792818
1,[UNIMOD:737]-GRFVEPLSNVQEEWNQK[UNIMOD:737],rev_sp|Q9H1A3-2|METL9_HUMAN;rev_sp|Q9H1A3|METL9_HUMAN;rev_tr|H3BN86|H3BN86_HUMAN;rev_tr|Q8TD49|Q8TD49_HUMAN,GN20170722_SK_HLA_G0103_R1_01,37495,2517.338,2,21.44256145543081,True,-GRFVEPLSNVQEEWNQK,18,2517.338,21.44256145543081
3,[UNIMOD:737]-VNM[UNIMOD:35]RTSSSIQNEDEATSMELIAPGP,sp|Q9Y5Y9|SCNAA_HUMAN;tr|A0A2R8Y6J6|A0A2R8Y6J6_HUMAN;tr|A0A590UJM0|A0A590UJM0_HUMAN,GN20170722_SK_HLA_G0103_R1_01,45493,2921.3948,3,19.97971639308644,False,-VNMRTSSSIQNEDEATSMELIAPGP,26,2921.3948,19.97971639308644
4,[UNIMOD:737]-VGEQEAPHEGGHPGSDSARASMADWLR,sp|Q9H093|NUAK2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,41946,3075.4417,3,20.936078174830037,False,-VGEQEAPHEGGHPGSDSARASMADWLR,28,3075.4417,20.936078174830037
5,[UNIMOD:737]-EM[UNIMOD:35]VSPTDSC[UNIMOD:4]VRVSVRDLPQFHVSVVDM[UNIMOD:35]DR,rev_sp|Q9HCN3|PGAP6_HUMAN;rev_tr|K4DI83|K4DI83_HUMAN,GN20170722_SK_HLA_G0103_R1_01,39750,3620.7583,3,17.954540767022678,True,-EMVSPTDSCVRVSVRDLPQFHVSVVDMDR,30,3620.7583,17.954540767022678
6,[UNIMOD:737]-K[UNIMOD:737]M[UNIMOD:35]EEDIYTNLSK[UNIMOD:737]METVLGQSMSSLPLSYR,sp|Q8WXH0-2|SYNE2_HUMAN;sp|Q8WXH0|SYNE2_HUMAN;tr|A0A0A0MRE3|A0A0A0MRE3_HUMAN;tr|G3V5X4|G3V5X4_HUMAN,GN20170722_SK_HLA_G0103_R1_01,45441,4053.1077,4,19.962357128713123,False,-KMEEDIYTNLSKMETVLGQSMSSLPLSYR,30,4053.1077,19.962357128713123
7,[UNIMOD:737]-TC[UNIMOD:4]SK[UNIMOD:737]SQGSWGNREIVIIDTPDMFSWK[UNIMOD:737],sp|Q9UG22|GIMA2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,36758,3728.9263,3,18.776753308950084,False,-TCSKSQGSWGNREIVIIDTPDMFSWK,27,3728.9263,18.776753308950084
8,[UNIMOD:737]-M[UNIMOD:35]SLGRAAPSAPGR,rev_sp|P51693-2|APLP1_HUMAN;rev_sp|P51693|APLP1_HUMAN;rev_tr|K7EMS1|K7EMS1_HUMAN,GN20170722_SK_HLA_G0103_R1_01,49100,1514.819,2,20.66450790410668,True,-MSLGRAAPSAPGR,14,1514.819,20.66450790410668
9,[UNIMOD:737]-C[UNIMOD:4]LIQM[UNIMOD:35]GAAVEAK[UNIMOD:737]AYNGNTALHVAASLQYR,tr|H7C5S1|H7C5S1_HUMAN,GN20170722_SK_HLA_G0103_R1_01,37376,3593.885,3,21.88178474607965,False,-CLIQMGAAVEAKAYNGNTALHVAASLQYR,30,3593.885,21.88178474607965
11,[UNIMOD:737]-LNVEGTERGSC[UNIMOD:4]GRK[UNIMOD:737],sp|O75078-2|ADA11_HUMAN;sp|O75078|ADA11_HUMAN;tr|B4DKD2|B4DKD2_HUMAN;tr|K7EKA8|K7EKA8_HUMAN,GN20170722_SK_HLA_G0103_R1_01,40784,2020.0887,3,21.842498887625368,False,-LNVEGTERGSCGRK,15,2020.0887,21.842498887625368
12,[UNIMOD:737]-C[UNIMOD:4]NRGWTALHESVSR,sp|Q96Q27-1|ASB2_HUMAN;sp|Q96Q27|ASB2_HUMAN;tr|G3V2Z2|G3V2Z2_HUMAN;tr|G3V4B2|G3V4B2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,51002,1900.9526,2,17.823081883875247,False,-CNRGWTALHESVSR,15,1900.9526,17.823081883875247
13,[UNIMOD:737]-AEVDNQMHVVDK[UNIMOD:737]NPVSLVSK[UNIMOD:737]TR,rev_sp|O75151|PHF2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,40715,3152.7583,3,19.971965322167343,True,-AEVDNQMHVVDKNPVSLVSKTR,23,3152.7583,19.971965322167343
14,[UNIMOD:737]-YLLSLEEERPALMDDR,sp|Q86TB9-2|PATL1_HUMAN;sp|Q86TB9-4|PATL1_HUMAN;sp|Q86TB9|PATL1_HUMAN,GN20170722_SK_HLA_G0103_R1_01,37506,2178.1194,2,19.99761378859208,False,-YLLSLEEERPALMDDR,17,2178.1194,19.99761378859208
15,[UNIMOD:737]-EGRGAGSQSPPRGR,sp|Q6ZSN1|YI023_HUMAN,GN20170722_SK_HLA_G0103_R1_01,45427,1639.8704,2,19.22966416224777,False,-EGRGAGSQSPPRGR,15,1639.8704,19.22966416224777
16,[UNIMOD:737]-TASASRRSAR,sp|P08729|K2C7_HUMAN,GN20170722_SK_HLA_G0103_R1_01,30700,1290.7319,2,19.376349012983997,False,-TASASRRSAR,11,1290.7319,19.376349012983997
20 changes: 20 additions & 0 deletions tests/unit_tests/test_sage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import unittest
from pathlib import Path

import pandas as pd

from spectrum_io.search_result import Sage


class TestSage(unittest.TestCase):
    """Test class to check Sage search result processing."""

    def test_read_sage(self):
        """Convert the Sage fixture and compare the result with the stored internal-format table."""
        data_dir = Path(__file__).parent / "data"

        # Expected frame: the first csv column holds the row index.
        expected_df = pd.read_csv(data_dir / "sage_output_internal.csv", index_col=0)
        internal_search_results_df = Sage.read_result(data_dir / "sage_output.tsv")

        pd.testing.assert_frame_equal(internal_search_results_df, expected_df)
Loading