Skip to content

Commit

Permalink
fixed and cleaned up unit tests
Browse files Browse the repository at this point in the history
  • Loading branch information
picciama committed Aug 1, 2024
1 parent 9143d1d commit ce752a3
Show file tree
Hide file tree
Showing 15 changed files with 420 additions and 370 deletions.
148 changes: 57 additions & 91 deletions spectrum_io/search_result/maxquant.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@

import pandas as pd
import spectrum_fundamentals.constants as c
from spectrum_fundamentals.mod_string import internal_without_mods, maxquant_to_internal
from spectrum_fundamentals.mod_string import internal_without_mods

from .search_results import SearchResults, filter_valid_prosit_sequences
from .search_results import SearchResults, filter_valid_prosit_sequences, parse_mods

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -43,108 +43,74 @@ def add_tmt_mod(mass: float, seq: str, unimod_tag: str) -> float:

def read_result(
self,
tmt_labeled: str,
custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
tmt_label: str = "",
custom_mods: Optional[Dict[str, int]] = None,
) -> pd.DataFrame:
"""
Function to read a msms txt and perform some basic formatting.
:param tmt_labeled: tmt label as str
:param custom_mods: dict with custom variable and static identifier and respecitve internal equivalent and mass
:param tmt_label: optional tmt label as str
:param custom_mods: optional dictionary mapping MaxQuant-specific mod patterns to UNIMOD IDs.
If None, static carbamidomethylation of cysteine and variable oxidation of methionine
are mapped automatically. To avoid this, explicitly provide an empty dictionary.
:return: pd.DataFrame with the formatted data
"""
logger.info("Reading msms.txt file")
df = pd.read_csv(
self.path / "msms.txt",
usecols=lambda x: x.upper()
in [
"RAW FILE",
"SCAN NUMBER",
"MODIFIED SEQUENCE",
"CHARGE",
"SCAN EVENT NUMBER",
"LABELING STATE",
"MASS", # = Calculated Precursor mass; TODO get column with experimental Precursor mass instead
"SCORE",
"REVERSE",
"PROTEINS",
],
sep="\t",
)
logger.info("Finished reading msms.txt file")

# Standardize column names
df.columns = df.columns.str.upper()
df.columns = df.columns.str.replace(" ", "_")
if custom_mods is None:
custom_mods = {
"C": 4,
"M(ox)": 35,
"M(Oxidation (M))": 35,
}
parsed_mods = parse_mods(custom_mods)
if tmt_label:
unimod_tag = c.TMT_MODS[tmt_label]
parsed_mods["K"] = f"K{unimod_tag}"
parsed_mods["^_"] = f"_{unimod_tag}-"

stat_mods: Dict[str, str] = {}
var_mods: Dict[str, str] = {}
logger.info("Reading msms.txt file")
self.results = pd.read_csv(self.path / "msms.txt", sep="\t")

if custom_mods is not None:
stat_mods = {key: value[0] for key, value in (custom_mods.get("stat_mods") or {}).items()}
var_mods = {key: value[0] for key, value in (custom_mods.get("var_mods") or {}).items()}
logger.info("Finished reading msms.txt file")

df = MaxQuant.update_columns_for_prosit(df, tmt_labeled, stat_mods, var_mods)
return filter_valid_prosit_sequences(df)
self.convert_to_internal(mods=parsed_mods)
return filter_valid_prosit_sequences(self.results)

@staticmethod
def update_columns_for_prosit(
df: pd.DataFrame,
tmt_labeled: str,
stat_mods: Optional[Dict[str, str]] = None,
var_mods: Optional[Dict[str, str]] = None,
) -> pd.DataFrame:
def convert_to_internal(self, mods: Dict[str, str]) -> pd.DataFrame:
"""
Update columns of df to work with Prosit.
Convert all columns in the MaxQuant output to the internal format used by Oktoberfest.
:param df: df to modify
:param tmt_labeled: True if tmt labeled
:param var_mods: dict with custom variable identifier and respecitve internal equivalent
:param stat_mods: dict with custom static identifier and respecitve internal equivalent
:return: modified df as pd.DataFrame
:param mods: dictionary mapping MaxQuant-specific mod patterns (keys) to ProForma standard (values)
"""
df.rename(columns={"CHARGE": "PRECURSOR_CHARGE"}, inplace=True)

mods = {**(c.MAXQUANT_VAR_MODS), **(stat_mods or {}), **(var_mods or {})}

df["REVERSE"].fillna(False, inplace=True)
df["REVERSE"].replace("+", True, inplace=True)
logger.info("Converting MaxQuant peptide sequence to internal format")
if tmt_labeled != "":
unimod_tag = c.TMT_MODS[tmt_labeled]
logger.info("Adding TMT fixed modifications")
df["MODIFIED_SEQUENCE"] = maxquant_to_internal(
df["MODIFIED_SEQUENCE"].to_numpy(),
mods={**{"C": "C[UNIMOD:4]", "^_": f"_{unimod_tag}-", "K": f"K{unimod_tag}"}, **mods},
)
df["MASS"] = df.apply(lambda x: MaxQuant.add_tmt_mod(x.MASS, x.MODIFIED_SEQUENCE, unimod_tag), axis=1)
if "msa" in tmt_labeled:
logger.info("Replacing phospho by dehydration for Phospho-MSA")
df["MODIFIED_SEQUENCE_MSA"] = df["MODIFIED_SEQUENCE"].str.replace(
"[UNIMOD:21]", "[UNIMOD:23]", regex=False
)
elif "LABELING_STATE" in df.columns:
logger.info("Adding SILAC fixed modifications")

df.loc[df["LABELING_STATE"] == 1, "MODIFIED_SEQUENCE"] = maxquant_to_internal(
df[df["LABELING_STATE"] == 1]["MODIFIED_SEQUENCE"].to_numpy(),
mods={**{"C": "C[UNIMOD:4]", "K": "K[UNIMOD:259]", "R": "R[UNIMOD:267]"}, **mods},
)
df.loc[df["LABELING_STATE"] != 1, "MODIFIED_SEQUENCE"] = maxquant_to_internal(
df[df["LABELING_STATE"] != 1]["MODIFIED_SEQUENCE"].to_numpy(), mods={**{"C": "C[UNIMOD:4]"}, **mods}
)
df["MASS"] = df.apply(lambda x: MaxQuant.add_tmt_mod(x.MASS, x.MODIFIED_SEQUENCE, "[UNIMOD:259]"), axis=1)
df["MASS"] = df.apply(lambda x: MaxQuant.add_tmt_mod(x.MASS, x.MODIFIED_SEQUENCE, "[UNIMOD:267]"), axis=1)
df.drop(columns=["LABELING_STATE"], inplace=True)
else:
df["MODIFIED_SEQUENCE"] = maxquant_to_internal(
df["MODIFIED_SEQUENCE"].to_numpy(), mods={**{"C": "C[UNIMOD:4]"}, **mods}
)
df["SEQUENCE"] = internal_without_mods(df["MODIFIED_SEQUENCE"])
df["PEPTIDE_LENGTH"] = df["SEQUENCE"].apply(lambda x: len(x))
df["PROTEINS"].fillna("UNKNOWN", inplace=True)

return df
df = self.results
# Standardize column names
# df.columns = df.columns.str.upper()
# df.columns = df.columns.str.replace(" ", "_")
# df.rename(columns={"CHARGE": "PRECURSOR_CHARGE"}, inplace=True)

mods["_"] = ""

df.fillna({"Reverse": "", "Proteins": "UNKNOWN"}, inplace=True)
df["Reverse"] = df["Reverse"].astype(bool)
df.replace({"Modified sequence": mods}, regex=True, inplace=True)

df["Sequence"] = internal_without_mods(df["Modified sequence"])
df["PEPTIDE_LENGTH"] = df["Sequence"].str.len()

df.rename(
columns={
"Reverse": "REVERSE",
"Sequence": "SEQUENCE",
"Modified sequence": "MODIFIED_SEQUENCE",
"Proteins": "PROTEINS",
"Charge": "PRECURSOR_CHARGE",
"Raw file": "RAW_FILE",
"Scan number": "SCAN_NUMBER",
"Scan event number": "SCAN_EVENT_NUMBER",
"Mass": "MASS",
"Score": "SCORE",
},
inplace=True,
)

def generate_internal_timstof_metadata(self):
"""
Expand Down
146 changes: 68 additions & 78 deletions spectrum_io/search_result/msfragger.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
import spectrum_fundamentals.constants as c
from pyteomics import pepxml
from spectrum_fundamentals.constants import MSFRAGGER_VAR_MODS
from spectrum_fundamentals.mod_string import internal_without_mods, msfragger_to_internal
from spectrum_fundamentals.mod_string import internal_without_mods
from tqdm import tqdm

from .search_results import SearchResults, filter_valid_prosit_sequences
from .search_results import SearchResults, filter_valid_prosit_sequences, parse_mods

logger = logging.getLogger(__name__)

Expand All @@ -19,17 +19,26 @@ class MSFragger(SearchResults):

def read_result(
self,
tmt_labeled: str,
custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
tmt_label: str = "",
custom_mods: Optional[Dict[str, int]] = None,
) -> pd.DataFrame:
"""
Function to read MSFragger pepXML results and perform some basic formatting.
:param tmt_labeled: tmt label as str
:param custom_mods: dict with custom variable and static identifier and respecitve internal equivalent and mass
:param tmt_label: optional tmt label as str
:param custom_mods: optional dictionary mapping MSFragger-specific mod patterns to UNIMOD IDs.
If None, static carbamidomethylation of cysteine and variable oxidation of methionine
are mapped automatically. To avoid this, explicitly provide an empty dictionary.
:raises FileNotFoundError: in case the given path is neither a file, nor a directory.
:return: pd.DataFrame with the formatted data
"""
if custom_mods is None:
custom_mods = {"C": 4, "M[147]": 35}
parsed_mods = parse_mods(custom_mods)
if tmt_label:
unimod_tag = c.TMT_MODS[tmt_label]
parsed_mods["K"] = f"K{unimod_tag}"
parsed_mods[r"^n\[\d+\]"] = f"{unimod_tag}-"
if self.path.is_file():
file_list = [self.path]
elif self.path.is_dir():
Expand All @@ -41,76 +50,57 @@ def read_result(
for pep_xml_file in tqdm(file_list):
ms_frag_results.append(pepxml.DataFrame(str(pep_xml_file)))

df = pd.concat(ms_frag_results)

stat_mods: Dict[str, str] = {}
var_mods: Dict[str, str] = {}

if custom_mods is not None:
stat_mods = {key: value[0] for key, value in (custom_mods.get("stat_mods") or {}).items()}
var_mods = {key: value[0] for key, value in (custom_mods.get("var_mods") or {}).items()}

df = update_columns_for_prosit(df, tmt_labeled, stat_mods=stat_mods, var_mods=var_mods)
return filter_valid_prosit_sequences(df)


def update_columns_for_prosit(
df, tmt_labeled: str, stat_mods: Optional[Dict[str, str]] = None, var_mods: Optional[Dict[str, str]] = None
) -> pd.DataFrame:
"""
Update columns of df to work with Prosit.
:param df: df to modify
:param tmt_labeled: True if tmt labeled
:param var_mods: dict with custom variable identifier and respecitve internal equivalent
:param stat_mods: dict with custom static identifier and respecitve internal equivalent
:return: modified df as pd.DataFrame
"""
df["PROTEINS"] = df["protein"]
df["PROTEINS"].fillna("UNKNOWN", inplace=True)
df["REVERSE"] = df["protein"].apply(lambda x: "rev" in str(x))
df["RAW_FILE"] = df["spectrum"].apply(lambda x: x.split(".")[0])
df["MASS"] = df["precursor_neutral_mass"]
df["PEPTIDE_LENGTH"] = df["peptide"].apply(lambda x: len(x))

mods = {**(MSFRAGGER_VAR_MODS), **(stat_mods or {}), **(var_mods or {})}

if tmt_labeled != "":
unimod_tag = c.TMT_MODS[tmt_labeled]
logger.info("Adding TMT fixed modifications")
mods = {**{"C": "C[UNIMOD:4]", r"n[\d+]": f"{unimod_tag}-", "K": f"K{unimod_tag}"}, **mods}
df["MODIFIED_SEQUENCE"] = msfragger_to_internal(df["modified_peptide"].to_list(), mods=mods)
else:
# By default, i.e. if nothing is supplied to fixed_mods, carbamidomethylation on cystein will be included
# in the fixed modifications. If you want to have no fixed modifictions at all, supply fixed_mods={}
mods = {**{"C": "C[UNIMOD:4]"}, **mods}
df["MODIFIED_SEQUENCE"] = msfragger_to_internal(df["modified_peptide"].to_list(), mods=mods)

df.rename(
columns={
"assumed_charge": "PRECURSOR_CHARGE",
"index": "SCAN_EVENT_NUMBER",
"peptide": "SEQUENCE",
"start_scan": "SCAN_NUMBER",
"hyperscore": "SCORE",
},
inplace=True,
)
df["SEQUENCE"] = internal_without_mods(df["MODIFIED_SEQUENCE"])
df["PROTEINS"] = df["PROTEINS"].apply(lambda x: ";".join(x))

return df[
[
"RAW_FILE",
"SCAN_NUMBER",
"MODIFIED_SEQUENCE",
"PRECURSOR_CHARGE",
"SCAN_EVENT_NUMBER",
"MASS",
"SCORE",
"REVERSE",
"SEQUENCE",
"PEPTIDE_LENGTH",
"PROTEINS",
self.results = pd.concat(ms_frag_results)

self.convert_to_internal(mods=parsed_mods)
return filter_valid_prosit_sequences(self.results)

def convert_to_internal(self, mods: Dict[str, str]) -> None:
    """
    Convert all columns in the MSFragger output to the internal format used by Oktoberfest.

    The conversion is applied in place to ``self.results``; nothing is returned.

    :param mods: dictionary mapping MSFragger-specific mod patterns (keys) to ProForma standard (values)
    """

    def _join_proteins(entry) -> str:
        """Collapse one 'protein' cell into a ';'-separated string, or 'UNKNOWN' if it is missing."""
        if isinstance(entry, str):
            # Already a single name; joining a str would interleave ';' between its characters.
            return entry
        if pd.api.types.is_list_like(entry):
            return ";".join(map(str, entry))
        # Anything else (NaN / None) is a missing protein assignment.
        return "UNKNOWN"

    df = self.results

    # Resolve missing entries and list entries in one pass. Filling with "UNKNOWN" first and joining
    # afterwards would turn the placeholder into "U;N;K;N;O;W;N".
    df["protein"] = df["protein"].apply(_join_proteins)

    df["REVERSE"] = df["protein"].apply(lambda x: "rev" in str(x))
    # Keep only the part of the spectrum identifier before the first '.'.
    df["spectrum"] = df["spectrum"].str.split(pat=".", n=1).str[0]
    df["PEPTIDE_LENGTH"] = df["peptide"].str.len()

    # Keys of `mods` are applied as regular expressions to the modified peptide strings.
    df.replace({"modified_peptide": mods}, regex=True, inplace=True)
    df["peptide"] = internal_without_mods(df["modified_peptide"])

    df.rename(
        columns={
            "assumed_charge": "PRECURSOR_CHARGE",
            "index": "SCAN_EVENT_NUMBER",
            "peptide": "SEQUENCE",
            "start_scan": "SCAN_NUMBER",
            "hyperscore": "SCORE",
            "modified_peptide": "MODIFIED_SEQUENCE",
            "protein": "PROTEINS",
            "precursor_neutral_mass": "MASS",
            "spectrum": "RAW_FILE",
        },
        inplace=True,
    )
Loading

0 comments on commit ce752a3

Please sign in to comment.