Skip to content

Commit

Permalink
Added protein ids for msfragger, sage, fixed tests and typeguard
Browse files Browse the repository at this point in the history
  • Loading branch information
victorgiurcoiu committed May 30, 2024
1 parent 2a824f8 commit 0dbd1d6
Show file tree
Hide file tree
Showing 9 changed files with 39 additions and 27 deletions.
5 changes: 5 additions & 0 deletions spectrum_io/search_result/msfragger.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ def update_columns_for_prosit(df, tmt_labeled: str) -> pd.DataFrame:
:param tmt_labeled: True if tmt labeled
:return: modified df as pd.DataFrame
"""
df["PROTEINS"] = df["protein"]
df["PROTEINS"].fillna("UNKNOWN", inplace=True)
df["REVERSE"] = df["protein"].apply(lambda x: "rev" in str(x))
df["RAW_FILE"] = df["spectrum"].apply(lambda x: x.split(".")[0])
df["MASS"] = df["precursor_neutral_mass"]
Expand All @@ -75,6 +77,8 @@ def update_columns_for_prosit(df, tmt_labeled: str) -> pd.DataFrame:
inplace=True,
)
df["SEQUENCE"] = internal_without_mods(df["MODIFIED_SEQUENCE"])
df["PROTEINS"] = df["PROTEINS"].apply(lambda x: ";".join(x))

return df[
[
"RAW_FILE",
Expand All @@ -87,5 +91,6 @@ def update_columns_for_prosit(df, tmt_labeled: str) -> pd.DataFrame:
"REVERSE",
"SEQUENCE",
"PEPTIDE_LENGTH",
"PROTEINS",
]
]
1 change: 1 addition & 0 deletions spectrum_io/search_result/sage.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,5 +69,6 @@ def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str) -> pd.DataFram
df["PEPTIDE_LENGTH"] = df["SEQUENCE"].str.len()
# converting sage to unimod
df["MODIFIED_SEQUENCE"] = sage_to_internal(df["MODIFIED_SEQUENCE"])
df["PROTEINS"].fillna("UNKNOWN", inplace=True)

return df
8 changes: 3 additions & 5 deletions spectrum_io/spectral_library/dlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
"PeptideSeq",
"RTInSeconds",
"PrecursorMz",
"ProteinIds",
]


Expand Down Expand Up @@ -103,8 +102,7 @@ def _create_database(conn: sqlite3.Connection):
RTInSecondsStop REAL,
MedianChromatogramEncodedLength INTEGER,
MedianChromatogramArray BLOB,
SourceFile TEXT NOT NULL DEFAULT 'Oktoberfest',
ProteinIds TEXT NOT NULL
SourceFile TEXT NOT NULL DEFAULT 'Oktoberfest'
)
"""
sql_create_p2p = """
Expand Down Expand Up @@ -148,9 +146,9 @@ def _write(self, out: Union[IO, sqlite3.Connection], data: Dict[str, np.ndarray]

masked_values = self._calculate_masked_values(f_mzss, f_intss)

data_list = [*masked_values, p_charges, mass_mod_sequences, seqs, irts, p_mzs, pr_ids]
data_list = [*masked_values, p_charges, mass_mod_sequences, seqs, irts, p_mzs]
entries = pd.DataFrame(dict(zip(DLIB_COL_NAMES, data_list)))
p2p = pd.DataFrame({"PeptideSeq": seqs})
p2p = pd.DataFrame({"PeptideSeq": seqs, "ProteinAccession": pr_ids})

out.execute("BEGIN")

Expand Down
2 changes: 1 addition & 1 deletion spectrum_io/spectral_library/msp.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def _write(self, out: Union[IO, Connection], data: Dict[str, np.ndarray], metada
):
lines.append(f"Name: {stripped_peptide}/{p_charge}\nMW: {p_mz}\n")
lines.append(
f"Comment: Parent={p_mz:.8f} Collision_energy={ce} Pritein_ids={pr_id} Mods={mods[0]} "
f"Comment: Parent={p_mz:.8f} Collision_energy={ce} Protein_ids={pr_id} Mods={mods[0]} "
f"ModString={mods[1]}/{p_charge} iRT={irt:.2f}\n"
)

Expand Down
2 changes: 1 addition & 1 deletion spectrum_io/spectral_library/spectronaut.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def _write(self, out: Union[IO, Connection], data: Dict[str, np.ndarray], metada
f_intss, f_mzss, modseqs, seqs, p_charges, p_mzs, irts, ces, pr_ids, f_annotss
):
cond = self._fragment_filter_passed(f_mzs, f_ints)
line_start = [f"{modseq},{seq},{seq},{p_charge},{p_mz:.8f},{irt:.2f},{ce},{pr_id}"]
line_start = [f"{modseq},{seq},{seq},{p_charge},{p_mz:.8f},{irt:.2f},{ce},{pr_id},"]
fragment_list = vec_assemble(f_ints[cond], f_mzs[cond], f_annots[cond])
out.writelines(chain.from_iterable(zip(cycle(line_start), fragment_list)))

Expand Down
10 changes: 5 additions & 5 deletions tests/unit_tests/data/psm_tmt_internal.csv
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
,RAW_FILE,SCAN_NUMBER,MODIFIED_SEQUENCE,PRECURSOR_CHARGE,SCAN_EVENT_NUMBER,MASS,SCORE,REVERSE,SEQUENCE,PEPTIDE_LENGTH
0,Ecl1_0277_R0096-01_S004436_D_H01_TMT18_01,2459,[UNIMOD:2016]-GQAVLAFQEQVGTGR,5,34,1863.023,8.221,True,GQAVLAFQEQVGTGR,15
1,Ecl1_0277_R0096-01_S004436_D_H01_TMT18_01,2486,[UNIMOD:2016]-TEVPM[UNIMOD:35]GLSLRTTSAR,5,42,1937.0531,7.083,False,TEVPMGLSLRTTSAR,15
2,Ecl1_0277_R0096-01_S004436_D_H01_TMT18_01,2980,[UNIMOD:2016]-YSGNC[UNIMOD:4]DRQSVER,3,193,1773.8123,3.932,False,YSGNCDRQSVER,12
3,Ecl1_0277_R0096-01_S004436_D_H01_TMT18_01,3945,[UNIMOD:2016]-ESTK[UNIMOD:2016]SAAER,3,647,1586.8887,10.839,True,ESTKSAAER,9
,RAW_FILE,SCAN_NUMBER,MODIFIED_SEQUENCE,PRECURSOR_CHARGE,SCAN_EVENT_NUMBER,MASS,SCORE,REVERSE,SEQUENCE,PEPTIDE_LENGTH,PROTEINS
0,Ecl1_0277_R0096-01_S004436_D_H01_TMT18_01,2459,[UNIMOD:2016]-GQAVLAFQEQVGTGR,5,34,1863.023,8.221,True,GQAVLAFQEQVGTGR,15,rev_tr|E9Q8J5|E9Q8J5_MOUSE
1,Ecl1_0277_R0096-01_S004436_D_H01_TMT18_01,2486,[UNIMOD:2016]-TEVPM[UNIMOD:35]GLSLRTTSAR,5,42,1937.0531,7.083,False,TEVPMGLSLRTTSAR,15,tr|A0A0N4SW17|A0A0N4SW17_MOUSE
2,Ecl1_0277_R0096-01_S004436_D_H01_TMT18_01,2980,[UNIMOD:2016]-YSGNC[UNIMOD:4]DRQSVER,3,193,1773.8123,3.932,False,YSGNCDRQSVER,12,sp|Q9D413-2|SH2D6_MOUSE;sp|Q9D413|SH2D6_MOUSE;tr|A0A3Q4EBW9|A0A3Q4EBW9_MOUSE;tr|A0A3Q4ECA8|A0A3Q4ECA8_MOUSE;tr|A0A3Q4EGG3|A0A3Q4EGG3_MOUSE;tr|E0CYY5|E0CYY5_MOUSE;tr|E9QJU1|E9QJU1_MOUSE
3,Ecl1_0277_R0096-01_S004436_D_H01_TMT18_01,3945,[UNIMOD:2016]-ESTK[UNIMOD:2016]SAAER,3,647,1586.8887,10.839,True,ESTKSAAER,9,rev_sp|Q3TLH4-5|PRC2C_MOUSE;rev_sp|Q3TLH4|PRC2C_MOUSE;rev_tr|A0A0A0MQ79|A0A0A0MQ79_MOUSE;rev_tr|S4R209|S4R209_MOUSE;rev_tr|S4R294|S4R294_MOUSE;rev_tr|S4R2J9|S4R2J9_MOUSE
15 changes: 9 additions & 6 deletions tests/unit_tests/test_maxquant.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ def test_update_columns(self, maxquant_df: pd.DataFrame):
assert prosit_df["PEPTIDE_LENGTH"][0] == 18
assert prosit_df["PEPTIDE_LENGTH"][3] == 13

assert prosit_df["PROTEINS"][0] == "P12345"
assert prosit_df["PROTEINS"][3] == "Q67890"

def test_update_columns_silac(self, maxquant_df: pd.DataFrame):
"""
Test column update silac.
Expand Down Expand Up @@ -95,12 +98,12 @@ def test_filter_valid_prosit_sequences(self, invalid_df: pd.DataFrame):
@pytest.fixture
def maxquant_df():
"""Create dataframes from strings: https://towardsdatascience.com/67b0c2b71e6a."""
df_string = """ MODIFIED_SEQUENCE; REVERSE; MASS;
_DS(Phospho (STY))DS(Phospho (STY))WDADAFSVEDPVRK_; ; 1.0;
_DS(Phospho (STY))DS(Phospho (STY))WDADAFSVEDPVRK_; ; 1.0;
_DS(Phospho (STY))DSWDADAFS(Phospho (STY))VEDPVRK_; ; 1.0;
_SS(Phospho (STY))PTPES(Phospho (STY))PTMLTK_; +; 2.0;
_SS(Phospho (STY))PTPES(Phospho (STY))PTMLTK_; +; 2.0;"""
df_string = """ MODIFIED_SEQUENCE; REVERSE; MASS; PROTEINS;
_DS(Phospho (STY))DS(Phospho (STY))WDADAFSVEDPVRK_; ; 1.0; P12345;
_DS(Phospho (STY))DS(Phospho (STY))WDADAFSVEDPVRK_; ; 1.0; P12345;
_DS(Phospho (STY))DSWDADAFS(Phospho (STY))VEDPVRK_; ; 1.0; P12345;
_SS(Phospho (STY))PTPES(Phospho (STY))PTMLTK_; +; 2.0; Q67890;
_SS(Phospho (STY))PTPES(Phospho (STY))PTMLTK_; +; 2.0; Q67890;"""
df = pd.read_csv(io.StringIO(df_string), delimiter=";", skipinitialspace=True)
df["Charge"] = 2
return df
Expand Down
3 changes: 3 additions & 0 deletions tests/unit_tests/test_msfragger.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def test_read_result(self):
self.assertTrue("REVERSE" in df.columns)
self.assertTrue("SEQUENCE" in df.columns)
self.assertTrue("PEPTIDE_LENGTH" in df.columns)
self.assertTrue("PROTEINS" in df.columns)

def test_read_msfragger(self):
"""Test function for reading MSFragger results and transforming to Prosit format."""
Expand All @@ -33,5 +34,7 @@ def test_read_msfragger(self):
tmt_labeled="tmtpro"
)
expected_df = pd.read_csv(expected_msfragger_internal_path, index_col=0)
print("Internal Search Results Columns:", internal_search_results_df.columns)
print("Expected Columns:", expected_df.columns)

pd.testing.assert_frame_equal(internal_search_results_df, expected_df)
20 changes: 11 additions & 9 deletions tests/unit_tests/test_spectral_library.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,15 @@ def test_write(self, data, metadata):
anticipated_content = (
"Name: AAACCCCKR/1\n"
"MW: 124.407276467\n"
"Comment: Parent=124.40727647 Collision_energy=10.0 Mods=2/3,C,Carbamidomethyl/5,C,Carbamidomethyl "
"Comment: Parent=124.40727647 Collision_energy=10.0 "
"Protein_ids=ProteinA Mods=2/3,C,Carbamidomethyl/5,C,Carbamidomethyl "
"ModString=AAACCCCKR//Carbamidomethyl@C3; Carbamidomethyl@C5/1 iRT=982.12\n"
"Num peaks: 2\n"
'0.80000000 0.2000 "b1/0.0ppm"\n'
'0.30000000 0.8000 "b2^2/0.0ppm"\n'
"Name: AAACILKKR/2\n"
"MW: 1617.057276467\n"
"Comment: Parent=1617.05727647 Collision_energy=20.0 Mods=0 ModString=AAACILKKR///2 iRT=382.12\n"
"Comment: Parent=1617.05727647 Collision_energy=20.0 Protein_ids=ProteinB Mods=0 ModString=AAACILKKR///2 iRT=382.12\n"
"Num peaks: 3\n"
'0.50000000 0.5000 "b1/0.0ppm"\n'
'0.40000000 0.6000 "y2^2/0.0ppm"\n'
Expand All @@ -54,15 +55,15 @@ def test_write(self, data, metadata):

file_content = file_content.replace("\r", "") # explicitly remove to work for windows
anticipated_content = (
"ModifiedPeptide,LabeledPeptide,StrippedPeptide,PrecursorCharge,PrecursorMz,iRT,CollisionEnergy,"
"ModifiedPeptide,LabeledPeptide,StrippedPeptide,PrecursorCharge,PrecursorMz,iRT,CollisionEnergy,ProteinIds,"
"RelativeFragmentIntensity,FragmentMz,FragmentNumber,FragmentType,FragmentCharge,FragmentLossType\n"
"_AAAC[Carbamidomethyl (C)]CC[Carbamidomethyl (C)]CKR_,AAACCCCKR,AAACCCCKR,1,124.40727647,982.12,"
"10.0,0.2000,0.80000000,1,b,1,noloss\n"
"10.0,ProteinA,0.2000,0.80000000,1,b,1,noloss\n"
"_AAAC[Carbamidomethyl (C)]CC[Carbamidomethyl (C)]CKR_,AAACCCCKR,AAACCCCKR,1,124.40727647,982.12,"
"10.0,0.8000,0.30000000,2,b,2,noloss\n"
"_AAACILKKR_,AAACILKKR,AAACILKKR,2,1617.05727647,382.12,20.0,0.5000,0.50000000,1,b,1,noloss\n"
"_AAACILKKR_,AAACILKKR,AAACILKKR,2,1617.05727647,382.12,20.0,0.6000,0.40000000,2,y,2,noloss\n"
"_AAACILKKR_,AAACILKKR,AAACILKKR,2,1617.05727647,382.12,20.0,0.0010,0.30000000,2,b,2,noloss\n"
"10.0,ProteinA,0.8000,0.30000000,2,b,2,noloss\n"
"_AAACILKKR_,AAACILKKR,AAACILKKR,2,1617.05727647,382.12,20.0,ProteinB,0.5000,0.50000000,1,b,1,noloss\n"
"_AAACILKKR_,AAACILKKR,AAACILKKR,2,1617.05727647,382.12,20.0,ProteinB,0.6000,0.40000000,2,y,2,noloss\n"
"_AAACILKKR_,AAACILKKR,AAACILKKR,2,1617.05727647,382.12,20.0,ProteinB,0.0010,0.30000000,2,b,2,noloss\n"
)
assert file_content == anticipated_content

Expand Down Expand Up @@ -114,7 +115,7 @@ def test_write(self, data, metadata):
)

df_expected_p2p = pd.DataFrame(
{"PeptideSeq": ["AAACCCCKR", "AAACILKKR"], "isDecoy": 0, "ProteinAccession": "UNKNOWN"}
{"PeptideSeq": ["AAACCCCKR", "AAACILKKR"], "isDecoy": 0, "ProteinAccession": ["ProteinA", "ProteinB"]}
)

df_expected_meta = pd.DataFrame({"Key": ["version", "staleProteinMapping"], "Value": ["0.1.14", "true"]})
Expand Down Expand Up @@ -147,5 +148,6 @@ def metadata():
"PRECURSOR_CHARGE": [1, 2],
"MASS": [123.4, 3232.1],
"COLLISION_ENERGY": [10.0, 20.0],
"PROTEINS": ["ProteinA", "ProteinB"],
}
)

0 comments on commit 0dbd1d6

Please sign in to comment.