Skip to content

Commit

Permalink
tests passed spectrum_io
Browse files Browse the repository at this point in the history
  • Loading branch information
Fabian Basso committed Jul 31, 2024
1 parent 1368743 commit 9143d1d
Show file tree
Hide file tree
Showing 13 changed files with 334 additions and 84 deletions.
34 changes: 21 additions & 13 deletions spectrum_io/search_result/mascot.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
import re
import sqlite3
from pathlib import Path
from typing import Optional, Union, Dict, Tuple
from typing import Dict, Optional, Tuple, Union

import pandas as pd
import spectrum_fundamentals.constants as c
from spectrum_fundamentals.mod_string import internal_without_mods, custom_regex_escape
from spectrum_fundamentals.mod_string import custom_regex_escape, internal_without_mods

from .search_results import SearchResults, filter_valid_prosit_sequences

Expand All @@ -16,14 +16,16 @@
class Mascot(SearchResults):
"""Handle search results from Mascot."""

def read_result(self, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None,
var_mods: Optional[Dict[str, str]] = None) -> pd.DataFrame:
def read_result(
self,
tmt_labeled: str,
custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
) -> pd.DataFrame:
"""
Function to read a mascot msf file and perform some basic formatting.
:param tmt_labeled: tmt label as str
:param var_mods: dict with custom variable identifier and respecitve internal equivalent
:param stat_mods: dict with custom static identifier and respecitve internal equivalent
:param custom_mods: dict with custom variable and static identifier and respective internal equivalent and mass
:return: pd.DataFrame with the formatted data
"""
logger.info("Reading mascot msf file")
Expand Down Expand Up @@ -78,10 +80,10 @@ def read_result(self, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = No
["SCAN_NUMBER", "PRECURSOR_CHARGE", "SCORE", "RAW_FILE", "SEQUENCE", "REVERSE"],
as_index=False,
).agg({"MODIFICATIONS": "|".join})
MOD_MASSES = c.update_mod_masses()
mod_masses_reverse = {round(float(v), 3): k for k, v in MOD_MASSES.items()}
def find_replacement(match: re.Match, sequence: str) -> str:
mod_masses = c.update_mod_masses()
mod_masses_reverse = {round(float(v), 3): k for k, v in mod_masses.items()}

def find_replacement(match: re.Match) -> str:
"""
Subfunction to find the corresponding substitution for a match.
Expand All @@ -90,7 +92,13 @@ def find_replacement(match: re.Match, sequence: str) -> str:
"""
key = match.string[match.start() : match.end()]
return mods[key]


stat_mods: Dict[str, str] = {}
var_mods: Dict[str, str] = {}

if custom_mods is not None:
stat_mods = {key: value[0] for key, value in (custom_mods.get("stat_mods") or {}).items()}
var_mods = {key: value[0] for key, value in (custom_mods.get("var_mods") or {}).items()}

mods = {}

Expand All @@ -107,11 +115,11 @@ def find_replacement(match: re.Match, sequence: str) -> str:
modifications = row["MODIFICATIONS"].split("|")
sequence = row["SEQUENCE"]
if mods:
sequence = regex.sub(lambda match: find_replacement(match, sequence), sequence)
sequence = regex.sub(lambda match: find_replacement(match), sequence)

if len(modifications) == 0:
sequences.append(sequence)
else:
else:
skip = 0
for mod in modifications:
pos, mass = mod.split("$")
Expand Down
53 changes: 32 additions & 21 deletions spectrum_io/search_result/maxquant.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
import logging
from pathlib import Path
from typing import Optional, Union, Dict, Tuple
from typing import Dict, Optional, Tuple, Union

import pandas as pd
import spectrum_fundamentals.constants as c
from spectrum_fundamentals.mod_string import internal_without_mods, maxquant_to_internal
from spectrum_fundamentals.constants import MAXQUANT_VAR_MODS

from .search_results import SearchResults, filter_valid_prosit_sequences

Expand Down Expand Up @@ -42,14 +41,16 @@ def add_tmt_mod(mass: float, seq: str, unimod_tag: str) -> float:
mass += num_of_tmt * c.MOD_MASSES[f"{unimod_tag}"]
return mass

def read_result(self, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None,
var_mods: Optional[Dict[str, str]] = None) -> pd.DataFrame:
def read_result(
self,
tmt_labeled: str,
custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
) -> pd.DataFrame:
"""
Function to read a msms txt and perform some basic formatting.
:param tmt_labeled: tmt label as str
:param var_mods: dict with custom variable identifier and respecitve internal equivalent
:param stat_mods: dict with custom static identifier and respecitve internal equivalent
:param custom_mods: dict with custom variable and static identifier and respective internal equivalent and mass
:return: pd.DataFrame with the formatted data
"""
logger.info("Reading msms.txt file")
Expand All @@ -76,24 +77,35 @@ def read_result(self, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = No
df.columns = df.columns.str.upper()
df.columns = df.columns.str.replace(" ", "_")

stat_mods: Dict[str, str] = {}
var_mods: Dict[str, str] = {}

if custom_mods is not None:
stat_mods = {key: value[0] for key, value in (custom_mods.get("stat_mods") or {}).items()}
var_mods = {key: value[0] for key, value in (custom_mods.get("var_mods") or {}).items()}

df = MaxQuant.update_columns_for_prosit(df, tmt_labeled, stat_mods, var_mods)
return filter_valid_prosit_sequences(df)

@staticmethod
def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None,
var_mods: Optional[Dict[str, str]] = None) -> pd.DataFrame:
def update_columns_for_prosit(
df: pd.DataFrame,
tmt_labeled: str,
stat_mods: Optional[Dict[str, str]] = None,
var_mods: Optional[Dict[str, str]] = None,
) -> pd.DataFrame:
"""
Update columns of df to work with Prosit.
:param df: df to modify
:param tmt_labeled: True if tmt labeled
:param var_mods: dict with custom variable identifier and respecitve internal equivalent
:param var_mods: dict with custom variable identifier and respecitve internal equivalent
:param stat_mods: dict with custom static identifier and respecitve internal equivalent
:return: modified df as pd.DataFrame
"""
df.rename(columns={"CHARGE": "PRECURSOR_CHARGE"}, inplace=True)

mods = {**(MAXQUANT_VAR_MODS), **(stat_mods or {}), **(var_mods or {})}
mods = {**(c.MAXQUANT_VAR_MODS), **(stat_mods or {}), **(var_mods or {})}

df["REVERSE"].fillna(False, inplace=True)
df["REVERSE"].replace("+", True, inplace=True)
Expand All @@ -102,33 +114,32 @@ def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str, stat_mods: Opt
unimod_tag = c.TMT_MODS[tmt_labeled]
logger.info("Adding TMT fixed modifications")
df["MODIFIED_SEQUENCE"] = maxquant_to_internal(
df["MODIFIED_SEQUENCE"].to_numpy(), mods=
{**{"C": "C[UNIMOD:4]", "^_": f"_{unimod_tag}-", "K": f"K{unimod_tag}"}, **mods}
)
df["MODIFIED_SEQUENCE"].to_numpy(),
mods={**{"C": "C[UNIMOD:4]", "^_": f"_{unimod_tag}-", "K": f"K{unimod_tag}"}, **mods},
)
df["MASS"] = df.apply(lambda x: MaxQuant.add_tmt_mod(x.MASS, x.MODIFIED_SEQUENCE, unimod_tag), axis=1)
if "msa" in tmt_labeled:
logger.info("Replacing phospho by dehydration for Phospho-MSA")
df["MODIFIED_SEQUENCE_MSA"] = df["MODIFIED_SEQUENCE"].str.replace(
"[UNIMOD:21]", "[UNIMOD:23]", regex=False
)
fixed_mods = {"C": "C[UNIMOD:4]"}
elif "LABELING_STATE" in df.columns:
logger.info("Adding SILAC fixed modifications")

df.loc[df["LABELING_STATE"] == 1, "MODIFIED_SEQUENCE"] = maxquant_to_internal(
df[df["LABELING_STATE"] == 1]["MODIFIED_SEQUENCE"].to_numpy(), mods =
{**{"C": "C[UNIMOD:4]", "K": "K[UNIMOD:259]", "R": "R[UNIMOD:267]"}, **mods}
df[df["LABELING_STATE"] == 1]["MODIFIED_SEQUENCE"].to_numpy(),
mods={**{"C": "C[UNIMOD:4]", "K": "K[UNIMOD:259]", "R": "R[UNIMOD:267]"}, **mods},
)
df.loc[df["LABELING_STATE"] != 1, "MODIFIED_SEQUENCE"] = maxquant_to_internal(
df[df["LABELING_STATE"] != 1]["MODIFIED_SEQUENCE"].to_numpy(), mods=
{**{"C": "C[UNIMOD:4]"}, **mods}
df[df["LABELING_STATE"] != 1]["MODIFIED_SEQUENCE"].to_numpy(), mods={**{"C": "C[UNIMOD:4]"}, **mods}
)
df["MASS"] = df.apply(lambda x: MaxQuant.add_tmt_mod(x.MASS, x.MODIFIED_SEQUENCE, "[UNIMOD:259]"), axis=1)
df["MASS"] = df.apply(lambda x: MaxQuant.add_tmt_mod(x.MASS, x.MODIFIED_SEQUENCE, "[UNIMOD:267]"), axis=1)
df.drop(columns=["LABELING_STATE"], inplace=True)
else:
df["MODIFIED_SEQUENCE"] = maxquant_to_internal(df["MODIFIED_SEQUENCE"].to_numpy(), mods=
{**{"C": "C[UNIMOD:4]"}, **mods})
df["MODIFIED_SEQUENCE"] = maxquant_to_internal(
df["MODIFIED_SEQUENCE"].to_numpy(), mods={**{"C": "C[UNIMOD:4]"}, **mods}
)
df["SEQUENCE"] = internal_without_mods(df["MODIFIED_SEQUENCE"])
df["PEPTIDE_LENGTH"] = df["SEQUENCE"].apply(lambda x: len(x))
df["PROTEINS"].fillna("UNKNOWN", inplace=True)
Expand Down
35 changes: 22 additions & 13 deletions spectrum_io/search_result/msfragger.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import logging
from pathlib import Path
from typing import Optional, Union, Dict, Tuple
from typing import Dict, Optional, Tuple, Union

import pandas as pd
import spectrum_fundamentals.constants as c
from pyteomics import pepxml
from spectrum_fundamentals.mod_string import internal_without_mods, msfragger_to_internal
from spectrum_fundamentals.constants import MSFRAGGER_VAR_MODS
from spectrum_fundamentals.mod_string import internal_without_mods, msfragger_to_internal
from tqdm import tqdm

from .search_results import SearchResults, filter_valid_prosit_sequences
Expand All @@ -17,14 +17,17 @@
class MSFragger(SearchResults):
"""Handle search results from MSFragger."""

def read_result(self, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None,
var_mods: Optional[Dict[str, str]] = None) -> pd.DataFrame:
def read_result(
self,
tmt_labeled: str,
custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
) -> pd.DataFrame:
"""
Function to read a msms txt and perform some basic formatting.
:param tmt_labeled: tmt label as str
:param var_mods: dict with custom variable identifier and respecitve internal equivalent
:param stat_mods: dict with custom static identifier and respecitve internal equivalent:raises FileNotFoundError: in case the given path is neither a file, nor a directory.
:param custom_mods: dict with custom variable and static identifier and respective internal equivalent and mass
:raises FileNotFoundError: in case the given path is neither a file, nor a directory.
:return: pd.DataFrame with the formatted data
"""
if self.path.is_file():
Expand All @@ -40,18 +43,26 @@ def read_result(self, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = No

df = pd.concat(ms_frag_results)

stat_mods: Dict[str, str] = {}
var_mods: Dict[str, str] = {}

if custom_mods is not None:
stat_mods = {key: value[0] for key, value in (custom_mods.get("stat_mods") or {}).items()}
var_mods = {key: value[0] for key, value in (custom_mods.get("var_mods") or {}).items()}

df = update_columns_for_prosit(df, tmt_labeled, stat_mods=stat_mods, var_mods=var_mods)
return filter_valid_prosit_sequences(df)


def update_columns_for_prosit(df, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None,
var_mods: Optional[Dict[str, str]] = None) -> pd.DataFrame:
def update_columns_for_prosit(
df, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None, var_mods: Optional[Dict[str, str]] = None
) -> pd.DataFrame:
"""
Update columns of df to work with Prosit.
:param df: df to modify
:param tmt_labeled: True if tmt labeled
:param var_mods: dict with custom variable identifier and respecitve internal equivalent
:param var_mods: dict with custom variable identifier and respecitve internal equivalent
:param stat_mods: dict with custom static identifier and respecitve internal equivalent
:return: modified df as pd.DataFrame
"""
Expand All @@ -64,15 +75,13 @@ def update_columns_for_prosit(df, tmt_labeled: str, stat_mods: Optional[Dict[str

mods = {**(MSFRAGGER_VAR_MODS), **(stat_mods or {}), **(var_mods or {})}


if tmt_labeled != "":
unimod_tag = c.TMT_MODS[tmt_labeled]
logger.info("Adding TMT fixed modifications")
mods = {**{"C": "C[UNIMOD:4]", r"n[\d+]": f"{unimod_tag}-", "K": f"K{unimod_tag}"}, **mods}
df["MODIFIED_SEQUENCE"] = msfragger_to_internal(
df["modified_peptide"].to_list(), mods=mods)
df["MODIFIED_SEQUENCE"] = msfragger_to_internal(df["modified_peptide"].to_list(), mods=mods)
else:
#By default, i.e. if nothing is supplied to fixed_mods, carbamidomethylation on cystein will be included
# By default, i.e. if nothing is supplied to fixed_mods, carbamidomethylation on cysteine will be included
# in the fixed modifications. If you want to have no fixed modifications at all, supply fixed_mods={}
mods = {**{"C": "C[UNIMOD:4]"}, **mods}
df["MODIFIED_SEQUENCE"] = msfragger_to_internal(df["modified_peptide"].to_list(), mods=mods)
Expand Down
27 changes: 20 additions & 7 deletions spectrum_io/search_result/sage.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
from pathlib import Path
from typing import Optional, Union, Dict, Tuple
from typing import Dict, Optional, Tuple, Union

import pandas as pd
from spectrum_fundamentals.constants import MOD_MASSES_SAGE
Expand All @@ -14,14 +14,16 @@
class Sage(SearchResults):
"""Handle search results from Sage."""

def read_result(self, tmt_labeled: str = "", stat_mods: Optional[Dict[str, str]] = None,
var_mods: Optional[Dict[str, str]] = None) -> pd.DataFrame:
def read_result(
self,
tmt_labeled: str = "",
custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
) -> pd.DataFrame:
"""
Function to read a msms tsv and perform some basic formatting.
:param tmt_labeled: tmt label as str
:param var_mods: Variable modifications with custom identifiers and their respective internal equivalents
:param stat_mods: Static modifications with custom identifiers and their respective internal equivalents
:param custom_mods: dict with custom variable and static identifier and respective internal equivalent and mass
:return: pd.DataFrame with the formatted data
"""
logger.info(f"Reading {self.path}")
Expand All @@ -36,12 +38,23 @@ def read_result(self, tmt_labeled: str = "", stat_mods: Optional[Dict[str, str]]
df.columns = df.columns.str.upper()
df.columns = df.columns.str.replace(" ", "_")

stat_mods: Dict[str, str] = {}
var_mods: Dict[str, str] = {}

if custom_mods is not None:
stat_mods = {key: value[0] for key, value in (custom_mods.get("stat_mods") or {}).items()}
var_mods = {key: value[0] for key, value in (custom_mods.get("var_mods") or {}).items()}

df = Sage.update_columns_for_prosit(df, tmt_labeled, stat_mods=stat_mods, var_mods=var_mods)
return filter_valid_prosit_sequences(df)

@staticmethod
def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None,
var_mods: Optional[Dict[str, str]] = None) -> pd.DataFrame:
def update_columns_for_prosit(
df: pd.DataFrame,
tmt_labeled: str,
stat_mods: Optional[Dict[str, str]] = None,
var_mods: Optional[Dict[str, str]] = None,
) -> pd.DataFrame:
"""
Update columns of df to work with Prosit.
Expand Down
Loading

0 comments on commit 9143d1d

Please sign in to comment.