Skip to content

Commit

Permalink
Merge pull request #98 from wilhelm-lab/fix/sage_read_tmt_and_decoys
Browse files (browse the repository at this point in the history)
Fix/sage read tmt and decoys
  • Loading branch information
picciama authored Apr 27, 2024
2 parents 275ef32 + e14f86e commit 080a28a
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 28 deletions.
19 changes: 9 additions & 10 deletions spectrum_io/search_result/sage.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@ def read_result(self, tmt_labeled: str = "") -> pd.DataFrame:
:param tmt_labeled: tmt label as str
:return: pd.DataFrame with the formatted data
"""
logger.info("Reading msms.tsv file")
logger.info(f"Reading {self.path}")
df = pd.read_csv(
self.path,
usecols=["filename", "scannr", "peptide", "charge", "hyperscore", "calcmass", "proteins"],
usecols=["filename", "scannr", "peptide", "charge", "hyperscore", "calcmass", "label", "proteins"],
sep="\t",
)
logger.info("Finished reading msms.tsv file")
logger.info(f"Finished reading {self.path}")

# Standardize column names
df.columns = df.columns.str.upper()
Expand All @@ -51,6 +51,9 @@ def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str) -> pd.DataFram
"SCANNR": "SCAN_NUMBER",
"PEPTIDE": "MODIFIED_SEQUENCE",
"CHARGE": "PRECURSOR_CHARGE",
"CALCMASS": "MASS",
"HYPERSCORE": "SCORE",
"LABEL": "REVERSE",
}
)

Expand All @@ -59,16 +62,12 @@ def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str) -> pd.DataFram
# extracting only the scan number
df["SCAN_NUMBER"] = [int(x.rsplit("=", 1)[-1]) for x in df["SCAN_NUMBER"]]
# creating a column of decoys and targets
df["REVERSE"] = df["PROTEINS"].str.startswith("rev_")
df["REVERSE"] = df["REVERSE"] < 0
# removing modification to create the unmodified sequences
df["SEQUENCE"] = df["MODIFIED_SEQUENCE"].str.replace(r"\[.*?\]", "", regex=True)
df["SEQUENCE"] = df["MODIFIED_SEQUENCE"].str.replace(r"\-|\[.*?\]", "", regex=True)
# length of the peptide
df["PEPTIDE_LENGTH"] = df["SEQUENCE"].str.len()
# mass of the peptide
df["MASS"] = df["CALCMASS"]
# score of the peptide
df["SCORE"] = df["HYPERSCORE"]
# converting proforma to unimode
# converting sage to unimod
df["MODIFIED_SEQUENCE"] = sage_to_internal(df["MODIFIED_SEQUENCE"])

return df
34 changes: 18 additions & 16 deletions tests/unit_tests/data/sage_output_internal.csv
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
,MODIFIED_SEQUENCE,PROTEINS,RAW_FILE,SCAN_NUMBER,CALCMASS,PRECURSOR_CHARGE,HYPERSCORE,REVERSE,SEQUENCE,PEPTIDE_LENGTH,MASS,SCORE
0,[UNIMOD:737]-HLDGGAEQSLLFVAGM[UNIMOD:35]R,rev_sp|P26006-1|ITA3_HUMAN;rev_sp|P26006|ITA3_HUMAN,GN20170722_SK_HLA_G0103_R1_01,50989,2045.0568,2,18.084579770792818,True,-HLDGGAEQSLLFVAGMR,18,2045.0568,18.084579770792818
1,[UNIMOD:737]-GRFVEPLSNVQEEWNQK[UNIMOD:737],rev_sp|Q9H1A3-2|METL9_HUMAN;rev_sp|Q9H1A3|METL9_HUMAN;rev_tr|H3BN86|H3BN86_HUMAN;rev_tr|Q8TD49|Q8TD49_HUMAN,GN20170722_SK_HLA_G0103_R1_01,37495,2517.338,2,21.44256145543081,True,-GRFVEPLSNVQEEWNQK,18,2517.338,21.44256145543081
3,[UNIMOD:737]-VNM[UNIMOD:35]RTSSSIQNEDEATSMELIAPGP,sp|Q9Y5Y9|SCNAA_HUMAN;tr|A0A2R8Y6J6|A0A2R8Y6J6_HUMAN;tr|A0A590UJM0|A0A590UJM0_HUMAN,GN20170722_SK_HLA_G0103_R1_01,45493,2921.3948,3,19.97971639308644,False,-VNMRTSSSIQNEDEATSMELIAPGP,26,2921.3948,19.97971639308644
4,[UNIMOD:737]-VGEQEAPHEGGHPGSDSARASMADWLR,sp|Q9H093|NUAK2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,41946,3075.4417,3,20.936078174830037,False,-VGEQEAPHEGGHPGSDSARASMADWLR,28,3075.4417,20.936078174830037
5,[UNIMOD:737]-EM[UNIMOD:35]VSPTDSC[UNIMOD:4]VRVSVRDLPQFHVSVVDM[UNIMOD:35]DR,rev_sp|Q9HCN3|PGAP6_HUMAN;rev_tr|K4DI83|K4DI83_HUMAN,GN20170722_SK_HLA_G0103_R1_01,39750,3620.7583,3,17.954540767022678,True,-EMVSPTDSCVRVSVRDLPQFHVSVVDMDR,30,3620.7583,17.954540767022678
6,[UNIMOD:737]-K[UNIMOD:737]M[UNIMOD:35]EEDIYTNLSK[UNIMOD:737]METVLGQSMSSLPLSYR,sp|Q8WXH0-2|SYNE2_HUMAN;sp|Q8WXH0|SYNE2_HUMAN;tr|A0A0A0MRE3|A0A0A0MRE3_HUMAN;tr|G3V5X4|G3V5X4_HUMAN,GN20170722_SK_HLA_G0103_R1_01,45441,4053.1077,4,19.962357128713123,False,-KMEEDIYTNLSKMETVLGQSMSSLPLSYR,30,4053.1077,19.962357128713123
7,[UNIMOD:737]-TC[UNIMOD:4]SK[UNIMOD:737]SQGSWGNREIVIIDTPDMFSWK[UNIMOD:737],sp|Q9UG22|GIMA2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,36758,3728.9263,3,18.776753308950084,False,-TCSKSQGSWGNREIVIIDTPDMFSWK,27,3728.9263,18.776753308950084
8,[UNIMOD:737]-M[UNIMOD:35]SLGRAAPSAPGR,rev_sp|P51693-2|APLP1_HUMAN;rev_sp|P51693|APLP1_HUMAN;rev_tr|K7EMS1|K7EMS1_HUMAN,GN20170722_SK_HLA_G0103_R1_01,49100,1514.819,2,20.66450790410668,True,-MSLGRAAPSAPGR,14,1514.819,20.66450790410668
9,[UNIMOD:737]-C[UNIMOD:4]LIQM[UNIMOD:35]GAAVEAK[UNIMOD:737]AYNGNTALHVAASLQYR,tr|H7C5S1|H7C5S1_HUMAN,GN20170722_SK_HLA_G0103_R1_01,37376,3593.885,3,21.88178474607965,False,-CLIQMGAAVEAKAYNGNTALHVAASLQYR,30,3593.885,21.88178474607965
11,[UNIMOD:737]-LNVEGTERGSC[UNIMOD:4]GRK[UNIMOD:737],sp|O75078-2|ADA11_HUMAN;sp|O75078|ADA11_HUMAN;tr|B4DKD2|B4DKD2_HUMAN;tr|K7EKA8|K7EKA8_HUMAN,GN20170722_SK_HLA_G0103_R1_01,40784,2020.0887,3,21.842498887625368,False,-LNVEGTERGSCGRK,15,2020.0887,21.842498887625368
12,[UNIMOD:737]-C[UNIMOD:4]NRGWTALHESVSR,sp|Q96Q27-1|ASB2_HUMAN;sp|Q96Q27|ASB2_HUMAN;tr|G3V2Z2|G3V2Z2_HUMAN;tr|G3V4B2|G3V4B2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,51002,1900.9526,2,17.823081883875247,False,-CNRGWTALHESVSR,15,1900.9526,17.823081883875247
13,[UNIMOD:737]-AEVDNQMHVVDK[UNIMOD:737]NPVSLVSK[UNIMOD:737]TR,rev_sp|O75151|PHF2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,40715,3152.7583,3,19.971965322167343,True,-AEVDNQMHVVDKNPVSLVSKTR,23,3152.7583,19.971965322167343
14,[UNIMOD:737]-YLLSLEEERPALMDDR,sp|Q86TB9-2|PATL1_HUMAN;sp|Q86TB9-4|PATL1_HUMAN;sp|Q86TB9|PATL1_HUMAN,GN20170722_SK_HLA_G0103_R1_01,37506,2178.1194,2,19.99761378859208,False,-YLLSLEEERPALMDDR,17,2178.1194,19.99761378859208
15,[UNIMOD:737]-EGRGAGSQSPPRGR,sp|Q6ZSN1|YI023_HUMAN,GN20170722_SK_HLA_G0103_R1_01,45427,1639.8704,2,19.22966416224777,False,-EGRGAGSQSPPRGR,15,1639.8704,19.22966416224777
16,[UNIMOD:737]-TASASRRSAR,sp|P08729|K2C7_HUMAN,GN20170722_SK_HLA_G0103_R1_01,30700,1290.7319,2,19.376349012983997,False,-TASASRRSAR,11,1290.7319,19.376349012983997
MODIFIED_SEQUENCE,PROTEINS,RAW_FILE,SCAN_NUMBER,REVERSE,MASS,PRECURSOR_CHARGE,SCORE,SEQUENCE,PEPTIDE_LENGTH
[UNIMOD:737]-HLDGGAEQSLLFVAGM[UNIMOD:35]R,rev_sp|P26006-1|ITA3_HUMAN;rev_sp|P26006|ITA3_HUMAN,GN20170722_SK_HLA_G0103_R1_01,50989,True,2045.0568,2,18.084579770792818,HLDGGAEQSLLFVAGMR,17
[UNIMOD:737]-GRFVEPLSNVQEEWNQK[UNIMOD:737],rev_sp|Q9H1A3-2|METL9_HUMAN;rev_sp|Q9H1A3|METL9_HUMAN;rev_tr|H3BN86|H3BN86_HUMAN;rev_tr|Q8TD49|Q8TD49_HUMAN,GN20170722_SK_HLA_G0103_R1_01,37495,True,2517.338,2,21.44256145543081,GRFVEPLSNVQEEWNQK,17
[UNIMOD:737]-LTVEC[UNIMOD:4]MPTIASDDLPVGTLQESEVSM[UNIMOD:35]TGPG,rev_tr|C9JVX2|C9JVX2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,45450,True,3378.6082,3,18.628243636678995,LTVECMPTIASDDLPVGTLQESEVSMTGPG,30
[UNIMOD:737]-VNM[UNIMOD:35]RTSSSIQNEDEATSMELIAPGP,sp|Q9Y5Y9|SCNAA_HUMAN;tr|A0A2R8Y6J6|A0A2R8Y6J6_HUMAN;tr|A0A590UJM0|A0A590UJM0_HUMAN,GN20170722_SK_HLA_G0103_R1_01,45493,False,2921.3948,3,19.97971639308644,VNMRTSSSIQNEDEATSMELIAPGP,25
[UNIMOD:737]-VGEQEAPHEGGHPGSDSARASMADWLR,sp|Q9H093|NUAK2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,41946,False,3075.4417,3,20.936078174830037,VGEQEAPHEGGHPGSDSARASMADWLR,27
[UNIMOD:737]-EM[UNIMOD:35]VSPTDSC[UNIMOD:4]VRVSVRDLPQFHVSVVDM[UNIMOD:35]DR,rev_sp|Q9HCN3|PGAP6_HUMAN;rev_tr|K4DI83|K4DI83_HUMAN,GN20170722_SK_HLA_G0103_R1_01,39750,True,3620.7583,3,17.954540767022678,EMVSPTDSCVRVSVRDLPQFHVSVVDMDR,29
[UNIMOD:737]-K[UNIMOD:737]M[UNIMOD:35]EEDIYTNLSK[UNIMOD:737]METVLGQSMSSLPLSYR,sp|Q8WXH0-2|SYNE2_HUMAN;sp|Q8WXH0|SYNE2_HUMAN;tr|A0A0A0MRE3|A0A0A0MRE3_HUMAN;tr|G3V5X4|G3V5X4_HUMAN,GN20170722_SK_HLA_G0103_R1_01,45441,False,4053.1077,4,19.962357128713123,KMEEDIYTNLSKMETVLGQSMSSLPLSYR,29
[UNIMOD:737]-TC[UNIMOD:4]SK[UNIMOD:737]SQGSWGNREIVIIDTPDMFSWK[UNIMOD:737],sp|Q9UG22|GIMA2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,36758,False,3728.9263,3,18.776753308950084,TCSKSQGSWGNREIVIIDTPDMFSWK,26
[UNIMOD:737]-M[UNIMOD:35]SLGRAAPSAPGR,rev_sp|P51693-2|APLP1_HUMAN;rev_sp|P51693|APLP1_HUMAN;rev_tr|K7EMS1|K7EMS1_HUMAN,GN20170722_SK_HLA_G0103_R1_01,49100,True,1514.819,2,20.66450790410668,MSLGRAAPSAPGR,13
[UNIMOD:737]-C[UNIMOD:4]LIQM[UNIMOD:35]GAAVEAK[UNIMOD:737]AYNGNTALHVAASLQYR,tr|H7C5S1|H7C5S1_HUMAN,GN20170722_SK_HLA_G0103_R1_01,37376,False,3593.885,3,21.88178474607965,CLIQMGAAVEAKAYNGNTALHVAASLQYR,29
[UNIMOD:737]-M[UNIMOD:35]EESLNIVK[UNIMOD:737]YTAFLYNDQLIWSGLEQDDMR,sp|P86790|CCZ1B_HUMAN;sp|P86791|CCZ1_HUMAN,GN20170722_SK_HLA_G0103_R1_01,41296,False,4095.037,3,22.91659667770412,MEESLNIVKYTAFLYNDQLIWSGLEQDDMR,30
[UNIMOD:737]-LNVEGTERGSC[UNIMOD:4]GRK[UNIMOD:737],sp|O75078-2|ADA11_HUMAN;sp|O75078|ADA11_HUMAN;tr|B4DKD2|B4DKD2_HUMAN;tr|K7EKA8|K7EKA8_HUMAN,GN20170722_SK_HLA_G0103_R1_01,40784,False,2020.0887,3,21.842498887625368,LNVEGTERGSCGRK,14
[UNIMOD:737]-C[UNIMOD:4]NRGWTALHESVSR,sp|Q96Q27-1|ASB2_HUMAN;sp|Q96Q27|ASB2_HUMAN;tr|G3V2Z2|G3V2Z2_HUMAN;tr|G3V4B2|G3V4B2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,51002,False,1900.9526,2,17.823081883875247,CNRGWTALHESVSR,14
[UNIMOD:737]-AEVDNQMHVVDK[UNIMOD:737]NPVSLVSK[UNIMOD:737]TR,rev_sp|O75151|PHF2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,40715,True,3152.7583,3,19.971965322167343,AEVDNQMHVVDKNPVSLVSKTR,22
[UNIMOD:737]-YLLSLEEERPALMDDR,sp|Q86TB9-2|PATL1_HUMAN;sp|Q86TB9-4|PATL1_HUMAN;sp|Q86TB9|PATL1_HUMAN,GN20170722_SK_HLA_G0103_R1_01,37506,False,2178.1194,2,19.99761378859208,YLLSLEEERPALMDDR,16
[UNIMOD:737]-EGRGAGSQSPPRGR,sp|Q6ZSN1|YI023_HUMAN,GN20170722_SK_HLA_G0103_R1_01,45427,False,1639.8704,2,19.22966416224777,EGRGAGSQSPPRGR,14
[UNIMOD:737]-TASASRRSAR,sp|P08729|K2C7_HUMAN,GN20170722_SK_HLA_G0103_R1_01,30700,False,1290.7319,2,19.376349012983997,TASASRRSAR,10
3 changes: 1 addition & 2 deletions tests/unit_tests/test_sage.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@ class TestSage(unittest.TestCase):
def test_read_sage(self):
"""Test function for reading sage results and transforming to Prosit format."""
expected_sage_internal_path = Path(__file__).parent / "data" / "sage_output_internal.csv"

internal_search_results_df = Sage(Path(__file__).parent / "data" / "sage_output.tsv").read_result()
expected_df = pd.read_csv(expected_sage_internal_path, index_col=0)
expected_df = pd.read_csv(expected_sage_internal_path)

pd.testing.assert_frame_equal(internal_search_results_df, expected_df)

0 comments on commit 080a28a

Please sign in to comment.