Skip to content

Commit

Permalink
added a few lines and edits for Sage integration
Browse files Browse the repository at this point in the history
  • Loading branch information
“Karim committed Oct 27, 2023
1 parent 85e8900 commit c781611
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 19 deletions.
1 change: 1 addition & 0 deletions spectrum_io/search_result/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
from .mascot import Mascot
from .maxquant import MaxQuant
from .msfragger import MSFragger
from .sage import Sage
Empty file removed spectrum_io/search_result/comit.py
Empty file.
16 changes: 7 additions & 9 deletions spectrum_io/search_result/maxquant.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,16 +42,14 @@ def read_result(path: Union[str, Path], tmt_labeled: str) -> pd.DataFrame:
path,
usecols=lambda x: x.upper()
in [
"RAW FILE",
"SCAN NUMBER",
"MODIFIED SEQUENCE",
"FILENAME",
"SCANNR",
"PEPTIDE",
"CHARGE",
"SCAN EVENT NUMBER",
"LABELING STATE",
"MASS", # = Calculated Precursor mass; TODO get column with experimental Precursor mass instead
"SCORE",
"REVERSE",
],
"HYPERSCORE",
"CALCMASS",
"PROTEINS"
],
sep="\t",
)
logger.info("Finished reading msms.txt file")
Expand Down
26 changes: 16 additions & 10 deletions spectrum_io/search_result/sage.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import pandas as pd
import spectrum_fundamentals.constants as c
from spectrum_fundamentals.mod_string import internal_without_mods, maxquant_to_internal
from spectrum_fundamentals.mod_string import sage_to_internal

from .search_results import SearchResults, filter_valid_prosit_sequences

Expand Down Expand Up @@ -39,14 +39,14 @@ def read_result(path: Union[str, Path], tmt_labeled: str) -> pd.DataFrame:
logger.info("Reading msms.txt file")
df = pd.read_csv(
path,
usecols=lambda x: x.upper()
in [
usecols=[
"filename",
"scannr",
"peptide",
"charge",
"hyperscore",
"calcmass",
"proteins"
],
sep="\t",
)
Expand All @@ -55,14 +55,16 @@ def read_result(path: Union[str, Path], tmt_labeled: str) -> pd.DataFrame:
# Standardize column names
df.columns = df.columns.str.upper()
df.columns = df.columns.str.replace(" ", "_")

df = Sage.modify_columns_for_prosit(df, tmt_labeled)
print(df)
df = Sage.update_columns_for_prosit(df)
return filter_valid_prosit_sequences(df)

@staticmethod
def update_columns_for_prosit(df: pd.DataFrame , tmt_labeled: str) -> pd.DataFrame:
df = df.rename (columns={'FILENAME':'RAW_FILE','SCANNR':'SCAN_NUMBER','PEPTIDE':'MODIFIED_SEQUENCE','CHARGE':'PRECURSOR_CHARGE'})
# TODO modified sequence should be changed from proforma to unimod
def update_columns_for_prosit(df: pd.DataFrame ) -> pd.DataFrame:
# renaming input columns
print(df.columns)
df = df.rename(columns={'FILENAME':'RAW_FILE','SCANNR':'SCAN_NUMBER','PEPTIDE':'MODIFIED_SEQUENCE','CHARGE':'PRECURSOR_CHARGE'})
print(df.columns)
# removing .mzML
df['RAW_FILE'] = df['RAW_FILE'].str.replace(".mzML","")
# extracting only the scan number
Expand All @@ -71,14 +73,18 @@ def update_columns_for_prosit(df: pd.DataFrame , tmt_labeled: str) -> pd.DataFra
df['REVERSE'] = df['PROTEINS'].str.startswith('rev_')
# removing modification to create the unmodified sequences
df['SEQUENCE'] = df['MODIFIED_SEQUENCE'].str.replace(r'\[.*?\]', '', regex=True)
#length of the peptide
# length of the peptide
df['PEPTIDE_LENGTH'] = df['SEQUENCE'].str.len()
# mass of the peptide
df['MASS'] = df['CALCMASS']
# score of the peptide
df['SCORE'] = df['HYPERSCORE']
# converting proforma to unimod
print(df)
df['MODIFIED_SEQUENCE'] = sage_to_internal(df['MODIFIED_SEQUENCE'].to_numpy())

return df
print(df.columns)
return df



0 comments on commit c781611

Please sign in to comment.