Skip to content

Commit

Permalink
Merge pull request #120 from wilhelm-lab/feature/custom-modifications
Browse files Browse the repository at this point in the history
Feature/custom modifications
  • Loading branch information
picciama authored Jul 31, 2024
2 parents 5fb9de2 + 623f586 commit 82d5fed
Show file tree
Hide file tree
Showing 6 changed files with 325 additions and 138 deletions.
28 changes: 25 additions & 3 deletions spectrum_fundamentals/annotation/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ def annotate_spectra(
un_annot_spectra: pd.DataFrame,
mass_tolerance: Optional[float] = None,
unit_mass_tolerance: Optional[str] = None,
custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
fragmentation_method: str = "HCD",
) -> pd.DataFrame:
"""
Expand All @@ -141,12 +142,20 @@ def annotate_spectra(
:param mass_tolerance: mass tolerance to calculate min and max mass
:param unit_mass_tolerance: unit for the mass tolerance (da or ppm)
:param fragmentation_method: fragmentation method that was used
:param custom_mods: Custom Modifications with the identifier, the unimod equivalent and the respective mass
:return: a Pandas DataFrame containing the annotated spectra with meta data
"""
raw_file_annotations = []
index_columns = {col: un_annot_spectra.columns.get_loc(col) for col in un_annot_spectra.columns}
for row in un_annot_spectra.values:
results = parallel_annotate(row, index_columns, mass_tolerance, unit_mass_tolerance, fragmentation_method)
results = parallel_annotate(
row,
index_columns,
mass_tolerance,
unit_mass_tolerance,
fragmentation_method=fragmentation_method,
custom_mods=custom_mods,
)
if not results:
continue
raw_file_annotations.append(results)
Expand Down Expand Up @@ -336,6 +345,7 @@ def parallel_annotate(
index_columns: Dict[str, int],
mass_tolerance: Optional[float] = None,
unit_mass_tolerance: Optional[str] = None,
custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
fragmentation_method: str = "HCD",
) -> Optional[
Union[
Expand All @@ -357,6 +367,7 @@ def parallel_annotate(
:param index_columns: a dictionary that contains the index columns of the spectrum
:param mass_tolerance: mass tolerance to calculate min and max mass
:param unit_mass_tolerance: unit for the mass tolerance (da or ppm)
:param custom_mods: Custom Modifications with the identifier, the unimod equivalent and the respective mass
:param fragmentation_method: fragmentation method that was used
:return: a tuple containing intensity values (np.ndarray), masses (np.ndarray), calculated mass (float),
and any removed peaks (List[str])
Expand All @@ -366,13 +377,18 @@ def parallel_annotate(
if spectrum[index_columns["PEPTIDE_LENGTH"]] > 30: # this was in initialize peaks but can be checked prior
return None
return _annotate_linear_spectrum(
spectrum, index_columns, mass_tolerance, unit_mass_tolerance, fragmentation_method
spectrum,
index_columns,
mass_tolerance,
unit_mass_tolerance,
fragmentation_method=fragmentation_method,
custom_mods=custom_mods,
)

if (spectrum[index_columns["PEPTIDE_LENGTH_A"]] > 30) or (spectrum[index_columns["PEPTIDE_LENGTH_B"]] > 30):
return None
return _annotate_crosslinked_spectrum(
spectrum, index_columns, spectrum[xl_type_col], mass_tolerance, unit_mass_tolerance
spectrum, index_columns, spectrum[xl_type_col], mass_tolerance, unit_mass_tolerance, custom_mods=custom_mods
)


Expand All @@ -381,6 +397,7 @@ def _annotate_linear_spectrum(
index_columns: Dict[str, int],
mass_tolerance: Optional[float],
unit_mass_tolerance: Optional[str],
custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
fragmentation_method: str = "HCD",
):
"""
Expand All @@ -390,6 +407,7 @@ def _annotate_linear_spectrum(
:param index_columns: Index columns of the spectrum
:param mass_tolerance: Mass tolerance for calculating min and max mass
:param unit_mass_tolerance: Unit for the mass tolerance (da or ppm)
:param custom_mods: Custom Modifications with the identifier, the unimod equivalent and the respective mass
:param fragmentation_method: fragmentation method that was used
:return: Annotated spectrum
"""
Expand All @@ -403,6 +421,7 @@ def _annotate_linear_spectrum(
mass_tolerance=mass_tolerance,
unit_mass_tolerance=unit_mass_tolerance,
fragmentation_method=fragmentation_method,
custom_mods=custom_mods,
)
matched_peaks = match_peaks(
fragments_meta_data,
Expand Down Expand Up @@ -435,6 +454,7 @@ def _annotate_crosslinked_spectrum(
crosslinker_type: str,
mass_tolerance: Optional[float] = None,
unit_mass_tolerance: Optional[str] = None,
custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
):
"""
Annotate a crosslinked peptide spectrum.
Expand All @@ -444,6 +464,7 @@ def _annotate_crosslinked_spectrum(
:param crosslinker_type: Type of crosslinker used
:param mass_tolerance: Mass tolerance for calculating min and max mass
:param unit_mass_tolerance: Unit for the mass tolerance (da or ppm)
:param custom_mods: Custom Modifications with the identifier, the unimod equivalent and the respective mass
:raises ValueError: if unsupported crosslinker type was supplied.
:return: Annotated spectrum
Expand All @@ -470,6 +491,7 @@ def _xl_annotation_workflow(seq_id: str, non_cl_xl: bool):

else:
array_size = 348
inputs.append(custom_mods)
fragments_meta_data, tmt_n_term, unmod_sequence, calc_mass = initialize_peaks_xl(*inputs)
matched_peaks = match_peaks(
fragments_meta_data,
Expand Down
52 changes: 40 additions & 12 deletions spectrum_fundamentals/constants.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from enum import Enum
from typing import Dict, List, Optional, Tuple

import numpy as np

Expand Down Expand Up @@ -233,19 +234,46 @@
"[UNIMOD:747]": 86.000394, # Malonylation
}


def update_mod_masses(mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None) -> Dict[str, float]:
"""
Function to update MOD_MASSES with custom modifications.
:param mods: Modifications with respective mass
:raises AssertionError: if mass of modification was not provided as float.
:return: updated MOD_MASSES
"""
mod_masses = {}
if mods:
try:
stat_mods: List[Tuple[str, float]] = [
(value[0], float(value[1])) for key, value in (mods.get("stat_mods") or {}).items()
]
var_mods: List[Tuple[str, float]] = [
(value[0], float(value[1])) for key, value in (mods.get("var_mods") or {}).items()
]
except ValueError as e:
raise AssertionError("All custom modifications value entries must be of type Tuple[str, float]") from e

for value in stat_mods + var_mods:
mod_masses[value[0]] = float(value[1])

return MOD_MASSES | mod_masses


MOD_MASSES_SAGE = {
229.1629: "[UNIMOD:737]",
304.2071: "[UNIMOD:2016]",
144.1020: "[UNIMOD:214]",
304.2053: "[UNIMOD:730]",
8.0141: "[UNIMOD:259]",
10.0082: "[UNIMOD:267]",
79.9663: "[UNIMOD:21]",
-18.0105: "[UNIMOD:23]",
57.0215: "[UNIMOD:4]",
15.9949: "[UNIMOD:35]",
15.994: "[UNIMOD:35]",
42.0105: "[UNIMOD:1]",
"229.1629": "[UNIMOD:737]",
"304.2071": "[UNIMOD:2016]",
"144.1020": "[UNIMOD:214]",
"304.2053": "[UNIMOD:730]",
"8.0141": "[UNIMOD:259]",
"10.0082": "[UNIMOD:267]",
"79.9663": "[UNIMOD:21]",
"-18.0105": "[UNIMOD:23]",
"57.0215": "[UNIMOD:4]",
"15.9949": "[UNIMOD:35]",
"15.994": "[UNIMOD:35]",
"42.0105": "[UNIMOD:1]",
}
# these are only used for prosit_grpc, oktoberfest uses the masses from MOD_MASSES
AA_MOD_MASSES = {
Expand Down
27 changes: 19 additions & 8 deletions spectrum_fundamentals/fragments.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
logger = logging.getLogger(__name__)


def _get_modifications(peptide_sequence: str) -> Dict[int, float]:
def _get_modifications(
peptide_sequence: str, custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None
) -> Dict[int, float]:
"""
Get modification masses and position in a peptide sequence.
Expand All @@ -24,6 +26,7 @@ def _get_modifications(peptide_sequence: str) -> Dict[int, float]:
modification was present are returned.
:param peptide_sequence: Modified peptide sequence
:param custom_mods: Custom Modifications with the identifier, the unimod equivalent and the respective mass
:return: modification_deltas
"""
modification_deltas = {}
Expand All @@ -37,25 +40,28 @@ def _get_modifications(peptide_sequence: str) -> Dict[int, float]:
pattern = re.compile(r"\[.{8}[^\]]*\]")
matches = pattern.finditer(peptide_sequence)

mod_masses = constants.update_mod_masses(mods=custom_mods)

for match in matches:
start_pos = match.start()
end_pos = match.end()
modification_deltas[start_pos - offset] = constants.MOD_MASSES[peptide_sequence[start_pos:end_pos]]
modification_deltas[start_pos - offset] = mod_masses[peptide_sequence[start_pos:end_pos]]
offset += end_pos - start_pos

return modification_deltas


def compute_peptide_mass(sequence: str) -> float:
def compute_peptide_mass(sequence: str, custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None) -> float:
"""
Compute the theoretical mass of the peptide sequence.
:param sequence: Modified peptide sequence
:param custom_mods: Custom Modifications with the identifier, the unimod equivalent and the respective mass
:return: Theoretical mass of the sequence
"""
terminal_masses = 2 * constants.ATOM_MASSES["H"] + constants.ATOM_MASSES["O"] # add terminal masses HO- and H-

modification_deltas = _get_modifications(sequence)
modification_deltas = _get_modifications(sequence, custom_mods=custom_mods)
if modification_deltas: # there were modifictions
sequence = internal_without_mods([sequence])[0]
terminal_masses += modification_deltas.get(-2, 0.0) # prime with n_term_mod delta if present
Expand All @@ -67,7 +73,7 @@ def compute_peptide_mass(sequence: str) -> float:

def _xl_sanity_check(noncl_xl: int, peptide_beta_mass: float, xl_pos: float):
"""
Checks input validity for initialize_peacks when used with xl mode.
Checks input validity for initialize_peaks when used with xl mode.
:param noncl_xl: whether the function is called with a non-cleavable xl modification
:param peptide_beta_mass: the mass of the second peptide to be considered for non-cleavable XL
Expand Down Expand Up @@ -135,6 +141,7 @@ def initialize_peaks(
peptide_beta_mass: float = 0.0,
xl_pos: int = -1,
fragmentation_method: str = "HCD",
custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
) -> Tuple[List[dict], int, str, float]:
"""
Generate theoretical peaks for a modified peptide sequence.
Expand All @@ -148,13 +155,14 @@ def initialize_peaks(
:param peptide_beta_mass: the mass of the second peptide to be considered for non-cleavable XL
:param xl_pos: the position of the crosslinker for non-cleavable XL
:param fragmentation_method: fragmentation method that was used
:param custom_mods: Custom Modifications with the identifier, the unimod equivalent and the respective mass
:return: List of theoretical peaks, Flag to indicate if there is a tmt on n-terminus, Un modified peptide sequence
"""
_xl_sanity_check(noncl_xl, peptide_beta_mass, xl_pos)

max_charge = min(3, charge)
ion_types = retrieve_ion_types(fragmentation_method)
modification_deltas = _get_modifications(sequence)
modification_deltas = _get_modifications(sequence, custom_mods=custom_mods)

fragments_meta_data = []
n_term_mod = 1
Expand Down Expand Up @@ -230,6 +238,7 @@ def initialize_peaks_xl(
mass_tolerance: Optional[float] = None,
unit_mass_tolerance: Optional[str] = None,
sequence_beta: Optional[str] = None,
custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
) -> Tuple[List[dict], int, str, float]:
"""
Generate theoretical peaks for a modified (potentially cleavable cross-linked) peptide sequence.
Expand All @@ -243,6 +252,7 @@ def initialize_peaks_xl(
:param mass_tolerance: mass tolerance to calculate min and max mass
:param unit_mass_tolerance: unit for the mass tolerance (da or ppm)
:param sequence_beta: optional second peptide to be considered for non-cleavable XL
:param custom_mods: Custom Modifications with the identifier, the unimod equivalent and the respective mass
:raises ValueError: if crosslinker_type is unkown
:raises AssertionError: if the short and long XL sequence (the one with the short / long crosslinker mod)
has a tmt n term while the other one does not
Expand Down Expand Up @@ -278,10 +288,10 @@ def initialize_peaks_xl(
# percolator!

list_out_s, tmt_n_term_s, peptide_sequence, _ = initialize_peaks(
sequence_s, mass_analyzer, charge, mass_tolerance, unit_mass_tolerance
sequence_s, mass_analyzer, charge, mass_tolerance, unit_mass_tolerance, custom_mods=custom_mods
)
list_out_l, tmt_n_term_l, peptide_sequence, _ = initialize_peaks(
sequence_l, mass_analyzer, charge, mass_tolerance, unit_mass_tolerance
sequence_l, mass_analyzer, charge, mass_tolerance, unit_mass_tolerance, custom_mods=custom_mods
)

tmt_n_term = tmt_n_term_s
Expand Down Expand Up @@ -324,6 +334,7 @@ def initialize_peaks_xl(
True,
sequence_beta_mass if sequence_beta_mass is not None else None,
crosslinker_position,
custom_mods=custom_mods,
)
df_out = pd.DataFrame(list_out)

Expand Down
Loading

0 comments on commit 82d5fed

Please sign in to comment.