-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
added sage.py parser - tmt label in the update_columns_for_prosit fun…
…ction
- Loading branch information
“Karim
committed
Oct 24, 2023
1 parent
3a01722
commit 85e8900
Showing
1 changed file
with
84 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,84 @@ | ||
s | ||
import logging | ||
from pathlib import Path | ||
from typing import Union | ||
|
||
import pandas as pd | ||
import spectrum_fundamentals.constants as c | ||
from spectrum_fundamentals.mod_string import internal_without_mods, maxquant_to_internal | ||
|
||
from .search_results import SearchResults, filter_valid_prosit_sequences | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class Sage(SearchResults): | ||
|
||
@staticmethod | ||
def add_tmt_mod(mass: float, seq: str, unimod_tag: str) -> float: | ||
""" | ||
Add tmt modification. | ||
:param mass: mass without tmt modification | ||
:param seq: sequence of the peptide | ||
:param unimod_tag: UNIMOD tag for the modification | ||
:return: mass as float | ||
""" | ||
num_of_tmt = seq.count(unimod_tag) | ||
mass += num_of_tmt * c.MOD_MASSES[f"{unimod_tag}"] | ||
return mass | ||
|
||
@staticmethod | ||
def read_result(path: Union[str, Path], tmt_labeled: str) -> pd.DataFrame: | ||
""" | ||
Function to read a msms txt and perform some basic formatting. | ||
:param path: path to msms.txt to read | ||
:param tmt_labeled: tmt label as str | ||
:return: pd.DataFrame with the formatted data | ||
""" | ||
logger.info("Reading msms.txt file") | ||
df = pd.read_csv( | ||
path, | ||
usecols=lambda x: x.upper() | ||
in [ | ||
"filename", | ||
"scannr", | ||
"peptide", | ||
"charge", | ||
"hyperscore", | ||
"calcmass", | ||
], | ||
sep="\t", | ||
) | ||
logger.info("Finished reading msms.txt file") | ||
|
||
# Standardize column names | ||
df.columns = df.columns.str.upper() | ||
df.columns = df.columns.str.replace(" ", "_") | ||
|
||
df = Sage.modify_columns_for_prosit(df, tmt_labeled) | ||
return filter_valid_prosit_sequences(df) | ||
|
||
@staticmethod | ||
def update_columns_for_prosit(df: pd.DataFrame , tmt_labeled: str) -> pd.DataFrame: | ||
df = df.rename (columns={'FILENAME':'RAW_FILE','SCANNR':'SCAN_NUMBER','PEPTIDE':'MODIFIED_SEQUENCE','CHARGE':'PRECURSOR_CHARGE'}) | ||
# TODO modified sequence should be changed from proforma to unimod | ||
# removing .mzML | ||
df['RAW_FILE'] = df['RAW_FILE'].str.replace(".mzML","") | ||
# extracting only the scan number | ||
df['SCAN_NUMBER'] = df['SCAN_NUMBER'].str.split('=').str[3:].str.join('=') | ||
# creating a column of decoys and targets | ||
df['REVERSE'] = df['PROTEINS'].str.startswith('rev_') | ||
# removing modification to create the unmodified sequences | ||
df['SEQUENCE'] = df['MODIFIED_SEQUENCE'].str.replace(r'\[.*?\]', '', regex=True) | ||
#length of the peptide | ||
df['PEPTIDE_LENGTH'] = df['SEQUENCE'].str.len() | ||
# mass of the peptide | ||
df['MASS'] = df['CALCMASS'] | ||
# score of the peptide | ||
df['SCORE'] = df['HYPERSCORE'] | ||
|
||
return df | ||
|
||
|
||
|