Skip to content

Commit

Permalink
added sage.py parser - tmt label in the update_columns_for_prosit fun…
Browse files Browse the repository at this point in the history
…ction
  • Loading branch information
“Karim committed Oct 24, 2023
1 parent 3a01722 commit 85e8900
Showing 1 changed file with 84 additions and 1 deletion.
85 changes: 84 additions & 1 deletion spectrum_io/search_result/sage.py
Original file line number Diff line number Diff line change
@@ -1 +1,84 @@
s
import logging
from pathlib import Path
from typing import Union

import pandas as pd
import spectrum_fundamentals.constants as c
from spectrum_fundamentals.mod_string import internal_without_mods, maxquant_to_internal

from .search_results import SearchResults, filter_valid_prosit_sequences

logger = logging.getLogger(__name__)


class Sage(SearchResults):

@staticmethod
def add_tmt_mod(mass: float, seq: str, unimod_tag: str) -> float:
"""
Add tmt modification.
:param mass: mass without tmt modification
:param seq: sequence of the peptide
:param unimod_tag: UNIMOD tag for the modification
:return: mass as float
"""
num_of_tmt = seq.count(unimod_tag)
mass += num_of_tmt * c.MOD_MASSES[f"{unimod_tag}"]
return mass

@staticmethod
def read_result(path: Union[str, Path], tmt_labeled: str) -> pd.DataFrame:
"""
Function to read a msms txt and perform some basic formatting.
:param path: path to msms.txt to read
:param tmt_labeled: tmt label as str
:return: pd.DataFrame with the formatted data
"""
logger.info("Reading msms.txt file")
df = pd.read_csv(
path,
usecols=lambda x: x.upper()
in [
"filename",
"scannr",
"peptide",
"charge",
"hyperscore",
"calcmass",
],
sep="\t",
)
logger.info("Finished reading msms.txt file")

# Standardize column names
df.columns = df.columns.str.upper()
df.columns = df.columns.str.replace(" ", "_")

df = Sage.modify_columns_for_prosit(df, tmt_labeled)
return filter_valid_prosit_sequences(df)

@staticmethod
def update_columns_for_prosit(df: pd.DataFrame , tmt_labeled: str) -> pd.DataFrame:
df = df.rename (columns={'FILENAME':'RAW_FILE','SCANNR':'SCAN_NUMBER','PEPTIDE':'MODIFIED_SEQUENCE','CHARGE':'PRECURSOR_CHARGE'})
# TODO modified sequence should be changed from proforma to unimod
# removing .mzML
df['RAW_FILE'] = df['RAW_FILE'].str.replace(".mzML","")
# extracting only the scan number
df['SCAN_NUMBER'] = df['SCAN_NUMBER'].str.split('=').str[3:].str.join('=')
# creating a column of decoys and targets
df['REVERSE'] = df['PROTEINS'].str.startswith('rev_')
# removing modification to create the unmodified sequences
df['SEQUENCE'] = df['MODIFIED_SEQUENCE'].str.replace(r'\[.*?\]', '', regex=True)
#length of the peptide
df['PEPTIDE_LENGTH'] = df['SEQUENCE'].str.len()
# mass of the peptide
df['MASS'] = df['CALCMASS']
# score of the peptide
df['SCORE'] = df['HYPERSCORE']

return df



0 comments on commit 85e8900

Please sign in to comment.