Skip to content

Commit

Permalink
Fix: remove the N-terminal dash from TMT-labeled sequences when deriving the unmodified SEQUENCE
Browse files Browse the repository at this point in the history
  • Loading branch information
picciama committed Apr 27, 2024
1 parent 318008c commit 80484f6
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 17 deletions.
3 changes: 2 additions & 1 deletion spectrum_io/search_result/sage.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,9 @@ def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str) -> pd.DataFram
df["SCAN_NUMBER"] = [int(x.rsplit("=", 1)[-1]) for x in df["SCAN_NUMBER"]]
# creating a column of decoys and targets
df["REVERSE"] = df["LABEL"] < 0
df.drop(columns=["LABEL"], inplace=True)
# removing modification to create the unmodified sequences
df["SEQUENCE"] = df["MODIFIED_SEQUENCE"].str.replace(r"\[.*?\]", "", regex=True)
df["SEQUENCE"] = df["MODIFIED_SEQUENCE"].str.replace(r"\-|\[.*?\]", "", regex=True)
# length of the peptide
df["PEPTIDE_LENGTH"] = df["SEQUENCE"].str.len()
# mass of the peptide
Expand Down
32 changes: 17 additions & 15 deletions tests/unit_tests/data/sage_output_internal.csv
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
,MODIFIED_SEQUENCE,PROTEINS,RAW_FILE,SCAN_NUMBER,CALCMASS,PRECURSOR_CHARGE,HYPERSCORE,REVERSE,SEQUENCE,PEPTIDE_LENGTH,MASS,SCORE
0,[UNIMOD:737]-HLDGGAEQSLLFVAGM[UNIMOD:35]R,rev_sp|P26006-1|ITA3_HUMAN;rev_sp|P26006|ITA3_HUMAN,GN20170722_SK_HLA_G0103_R1_01,50989,2045.0568,2,18.084579770792818,True,-HLDGGAEQSLLFVAGMR,18,2045.0568,18.084579770792818
1,[UNIMOD:737]-GRFVEPLSNVQEEWNQK[UNIMOD:737],rev_sp|Q9H1A3-2|METL9_HUMAN;rev_sp|Q9H1A3|METL9_HUMAN;rev_tr|H3BN86|H3BN86_HUMAN;rev_tr|Q8TD49|Q8TD49_HUMAN,GN20170722_SK_HLA_G0103_R1_01,37495,2517.338,2,21.44256145543081,True,-GRFVEPLSNVQEEWNQK,18,2517.338,21.44256145543081
3,[UNIMOD:737]-VNM[UNIMOD:35]RTSSSIQNEDEATSMELIAPGP,sp|Q9Y5Y9|SCNAA_HUMAN;tr|A0A2R8Y6J6|A0A2R8Y6J6_HUMAN;tr|A0A590UJM0|A0A590UJM0_HUMAN,GN20170722_SK_HLA_G0103_R1_01,45493,2921.3948,3,19.97971639308644,False,-VNMRTSSSIQNEDEATSMELIAPGP,26,2921.3948,19.97971639308644
4,[UNIMOD:737]-VGEQEAPHEGGHPGSDSARASMADWLR,sp|Q9H093|NUAK2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,41946,3075.4417,3,20.936078174830037,False,-VGEQEAPHEGGHPGSDSARASMADWLR,28,3075.4417,20.936078174830037
5,[UNIMOD:737]-EM[UNIMOD:35]VSPTDSC[UNIMOD:4]VRVSVRDLPQFHVSVVDM[UNIMOD:35]DR,rev_sp|Q9HCN3|PGAP6_HUMAN;rev_tr|K4DI83|K4DI83_HUMAN,GN20170722_SK_HLA_G0103_R1_01,39750,3620.7583,3,17.954540767022678,True,-EMVSPTDSCVRVSVRDLPQFHVSVVDMDR,30,3620.7583,17.954540767022678
6,[UNIMOD:737]-K[UNIMOD:737]M[UNIMOD:35]EEDIYTNLSK[UNIMOD:737]METVLGQSMSSLPLSYR,sp|Q8WXH0-2|SYNE2_HUMAN;sp|Q8WXH0|SYNE2_HUMAN;tr|A0A0A0MRE3|A0A0A0MRE3_HUMAN;tr|G3V5X4|G3V5X4_HUMAN,GN20170722_SK_HLA_G0103_R1_01,45441,4053.1077,4,19.962357128713123,False,-KMEEDIYTNLSKMETVLGQSMSSLPLSYR,30,4053.1077,19.962357128713123
7,[UNIMOD:737]-TC[UNIMOD:4]SK[UNIMOD:737]SQGSWGNREIVIIDTPDMFSWK[UNIMOD:737],sp|Q9UG22|GIMA2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,36758,3728.9263,3,18.776753308950084,False,-TCSKSQGSWGNREIVIIDTPDMFSWK,27,3728.9263,18.776753308950084
8,[UNIMOD:737]-M[UNIMOD:35]SLGRAAPSAPGR,rev_sp|P51693-2|APLP1_HUMAN;rev_sp|P51693|APLP1_HUMAN;rev_tr|K7EMS1|K7EMS1_HUMAN,GN20170722_SK_HLA_G0103_R1_01,49100,1514.819,2,20.66450790410668,True,-MSLGRAAPSAPGR,14,1514.819,20.66450790410668
9,[UNIMOD:737]-C[UNIMOD:4]LIQM[UNIMOD:35]GAAVEAK[UNIMOD:737]AYNGNTALHVAASLQYR,tr|H7C5S1|H7C5S1_HUMAN,GN20170722_SK_HLA_G0103_R1_01,37376,3593.885,3,21.88178474607965,False,-CLIQMGAAVEAKAYNGNTALHVAASLQYR,30,3593.885,21.88178474607965
11,[UNIMOD:737]-LNVEGTERGSC[UNIMOD:4]GRK[UNIMOD:737],sp|O75078-2|ADA11_HUMAN;sp|O75078|ADA11_HUMAN;tr|B4DKD2|B4DKD2_HUMAN;tr|K7EKA8|K7EKA8_HUMAN,GN20170722_SK_HLA_G0103_R1_01,40784,2020.0887,3,21.842498887625368,False,-LNVEGTERGSCGRK,15,2020.0887,21.842498887625368
12,[UNIMOD:737]-C[UNIMOD:4]NRGWTALHESVSR,sp|Q96Q27-1|ASB2_HUMAN;sp|Q96Q27|ASB2_HUMAN;tr|G3V2Z2|G3V2Z2_HUMAN;tr|G3V4B2|G3V4B2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,51002,1900.9526,2,17.823081883875247,False,-CNRGWTALHESVSR,15,1900.9526,17.823081883875247
13,[UNIMOD:737]-AEVDNQMHVVDK[UNIMOD:737]NPVSLVSK[UNIMOD:737]TR,rev_sp|O75151|PHF2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,40715,3152.7583,3,19.971965322167343,True,-AEVDNQMHVVDKNPVSLVSKTR,23,3152.7583,19.971965322167343
14,[UNIMOD:737]-YLLSLEEERPALMDDR,sp|Q86TB9-2|PATL1_HUMAN;sp|Q86TB9-4|PATL1_HUMAN;sp|Q86TB9|PATL1_HUMAN,GN20170722_SK_HLA_G0103_R1_01,37506,2178.1194,2,19.99761378859208,False,-YLLSLEEERPALMDDR,17,2178.1194,19.99761378859208
15,[UNIMOD:737]-EGRGAGSQSPPRGR,sp|Q6ZSN1|YI023_HUMAN,GN20170722_SK_HLA_G0103_R1_01,45427,1639.8704,2,19.22966416224777,False,-EGRGAGSQSPPRGR,15,1639.8704,19.22966416224777
16,[UNIMOD:737]-TASASRRSAR,sp|P08729|K2C7_HUMAN,GN20170722_SK_HLA_G0103_R1_01,30700,1290.7319,2,19.376349012983997,False,-TASASRRSAR,11,1290.7319,19.376349012983997
0,[UNIMOD:737]-HLDGGAEQSLLFVAGM[UNIMOD:35]R,rev_sp|P26006-1|ITA3_HUMAN;rev_sp|P26006|ITA3_HUMAN,GN20170722_SK_HLA_G0103_R1_01,50989,2045.0568,2,18.084579770792818,True,HLDGGAEQSLLFVAGMR,17,2045.0568,18.084579770792818
1,[UNIMOD:737]-GRFVEPLSNVQEEWNQK[UNIMOD:737],rev_sp|Q9H1A3-2|METL9_HUMAN;rev_sp|Q9H1A3|METL9_HUMAN;rev_tr|H3BN86|H3BN86_HUMAN;rev_tr|Q8TD49|Q8TD49_HUMAN,GN20170722_SK_HLA_G0103_R1_01,37495,2517.338,2,21.44256145543081,True,GRFVEPLSNVQEEWNQK,17,2517.338,21.44256145543081
2,[UNIMOD:737]-LTVEC[UNIMOD:4]MPTIASDDLPVGTLQESEVSM[UNIMOD:35]TGPG,rev_tr|C9JVX2|C9JVX2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,45450,3378.6082,3,18.628243636678995,True,LTVECMPTIASDDLPVGTLQESEVSMTGPG,30,3378.6082,18.628243636678995
3,[UNIMOD:737]-VNM[UNIMOD:35]RTSSSIQNEDEATSMELIAPGP,sp|Q9Y5Y9|SCNAA_HUMAN;tr|A0A2R8Y6J6|A0A2R8Y6J6_HUMAN;tr|A0A590UJM0|A0A590UJM0_HUMAN,GN20170722_SK_HLA_G0103_R1_01,45493,2921.3948,3,19.97971639308644,False,VNMRTSSSIQNEDEATSMELIAPGP,25,2921.3948,19.97971639308644
4,[UNIMOD:737]-VGEQEAPHEGGHPGSDSARASMADWLR,sp|Q9H093|NUAK2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,41946,3075.4417,3,20.936078174830037,False,VGEQEAPHEGGHPGSDSARASMADWLR,27,3075.4417,20.936078174830037
5,[UNIMOD:737]-EM[UNIMOD:35]VSPTDSC[UNIMOD:4]VRVSVRDLPQFHVSVVDM[UNIMOD:35]DR,rev_sp|Q9HCN3|PGAP6_HUMAN;rev_tr|K4DI83|K4DI83_HUMAN,GN20170722_SK_HLA_G0103_R1_01,39750,3620.7583,3,17.954540767022678,True,EMVSPTDSCVRVSVRDLPQFHVSVVDMDR,29,3620.7583,17.954540767022678
6,[UNIMOD:737]-K[UNIMOD:737]M[UNIMOD:35]EEDIYTNLSK[UNIMOD:737]METVLGQSMSSLPLSYR,sp|Q8WXH0-2|SYNE2_HUMAN;sp|Q8WXH0|SYNE2_HUMAN;tr|A0A0A0MRE3|A0A0A0MRE3_HUMAN;tr|G3V5X4|G3V5X4_HUMAN,GN20170722_SK_HLA_G0103_R1_01,45441,4053.1077,4,19.962357128713123,False,KMEEDIYTNLSKMETVLGQSMSSLPLSYR,29,4053.1077,19.962357128713123
7,[UNIMOD:737]-TC[UNIMOD:4]SK[UNIMOD:737]SQGSWGNREIVIIDTPDMFSWK[UNIMOD:737],sp|Q9UG22|GIMA2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,36758,3728.9263,3,18.776753308950084,False,TCSKSQGSWGNREIVIIDTPDMFSWK,26,3728.9263,18.776753308950084
8,[UNIMOD:737]-M[UNIMOD:35]SLGRAAPSAPGR,rev_sp|P51693-2|APLP1_HUMAN;rev_sp|P51693|APLP1_HUMAN;rev_tr|K7EMS1|K7EMS1_HUMAN,GN20170722_SK_HLA_G0103_R1_01,49100,1514.819,2,20.66450790410668,True,MSLGRAAPSAPGR,13,1514.819,20.66450790410668
9,[UNIMOD:737]-C[UNIMOD:4]LIQM[UNIMOD:35]GAAVEAK[UNIMOD:737]AYNGNTALHVAASLQYR,tr|H7C5S1|H7C5S1_HUMAN,GN20170722_SK_HLA_G0103_R1_01,37376,3593.885,3,21.88178474607965,False,CLIQMGAAVEAKAYNGNTALHVAASLQYR,29,3593.885,21.88178474607965
10,[UNIMOD:737]-M[UNIMOD:35]EESLNIVK[UNIMOD:737]YTAFLYNDQLIWSGLEQDDMR,sp|P86790|CCZ1B_HUMAN;sp|P86791|CCZ1_HUMAN,GN20170722_SK_HLA_G0103_R1_01,41296,4095.037,3,22.91659667770412,False,MEESLNIVKYTAFLYNDQLIWSGLEQDDMR,30,4095.037,22.91659667770412
11,[UNIMOD:737]-LNVEGTERGSC[UNIMOD:4]GRK[UNIMOD:737],sp|O75078-2|ADA11_HUMAN;sp|O75078|ADA11_HUMAN;tr|B4DKD2|B4DKD2_HUMAN;tr|K7EKA8|K7EKA8_HUMAN,GN20170722_SK_HLA_G0103_R1_01,40784,2020.0887,3,21.842498887625368,False,LNVEGTERGSCGRK,14,2020.0887,21.842498887625368
12,[UNIMOD:737]-C[UNIMOD:4]NRGWTALHESVSR,sp|Q96Q27-1|ASB2_HUMAN;sp|Q96Q27|ASB2_HUMAN;tr|G3V2Z2|G3V2Z2_HUMAN;tr|G3V4B2|G3V4B2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,51002,1900.9526,2,17.823081883875247,False,CNRGWTALHESVSR,14,1900.9526,17.823081883875247
13,[UNIMOD:737]-AEVDNQMHVVDK[UNIMOD:737]NPVSLVSK[UNIMOD:737]TR,rev_sp|O75151|PHF2_HUMAN,GN20170722_SK_HLA_G0103_R1_01,40715,3152.7583,3,19.971965322167343,True,AEVDNQMHVVDKNPVSLVSKTR,22,3152.7583,19.971965322167343
14,[UNIMOD:737]-YLLSLEEERPALMDDR,sp|Q86TB9-2|PATL1_HUMAN;sp|Q86TB9-4|PATL1_HUMAN;sp|Q86TB9|PATL1_HUMAN,GN20170722_SK_HLA_G0103_R1_01,37506,2178.1194,2,19.99761378859208,False,YLLSLEEERPALMDDR,16,2178.1194,19.99761378859208
15,[UNIMOD:737]-EGRGAGSQSPPRGR,sp|Q6ZSN1|YI023_HUMAN,GN20170722_SK_HLA_G0103_R1_01,45427,1639.8704,2,19.22966416224777,False,EGRGAGSQSPPRGR,14,1639.8704,19.22966416224777
16,[UNIMOD:737]-TASASRRSAR,sp|P08729|K2C7_HUMAN,GN20170722_SK_HLA_G0103_R1_01,30700,1290.7319,2,19.376349012983997,False,TASASRRSAR,10,1290.7319,19.376349012983997
3 changes: 2 additions & 1 deletion tests/unit_tests/test_sage.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@ class TestSage(unittest.TestCase):
def test_read_sage(self):
"""Test function for reading sage results and transforming to Prosit format."""
expected_sage_internal_path = Path(__file__).parent / "data" / "sage_output_internal.csv"

internal_search_results_df = Sage(Path(__file__).parent / "data" / "sage_output.tsv").read_result()
print(internal_search_results_df.columns)
expected_df = pd.read_csv(expected_sage_internal_path, index_col=0)
print(expected_df.columns)

pd.testing.assert_frame_equal(internal_search_results_df, expected_df)

0 comments on commit 80484f6

Please sign in to comment.