Merge pull request #120 from wilhelm-lab/feature/custom-modifications

Feature/custom modifications
wilhelm-lab · Jul 31, 2024 · 82d5fed · 82d5fed
2 parents 5fb9de2 + 623f586
commit 82d5fed
Show file tree

Hide file tree

Showing 6 changed files with 325 additions and 138 deletions.
diff --git a/spectrum_fundamentals/annotation/annotation.py b/spectrum_fundamentals/annotation/annotation.py
@@ -121,6 +121,7 @@ def annotate_spectra(
     un_annot_spectra: pd.DataFrame,
     mass_tolerance: Optional[float] = None,
     unit_mass_tolerance: Optional[str] = None,
+    custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
     fragmentation_method: str = "HCD",
 ) -> pd.DataFrame:
     """
@@ -141,12 +142,20 @@ def annotate_spectra(
     :param mass_tolerance: mass tolerance to calculate min and max mass
     :param unit_mass_tolerance: unit for the mass tolerance (da or ppm)
     :param fragmentation_method: fragmentation method that was used
+    :param custom_mods: Custom Modifications with the identifier, the unimod equivalent and the respective mass
     :return: a Pandas DataFrame containing the annotated spectra with meta data
     """
     raw_file_annotations = []
     index_columns = {col: un_annot_spectra.columns.get_loc(col) for col in un_annot_spectra.columns}
     for row in un_annot_spectra.values:
-        results = parallel_annotate(row, index_columns, mass_tolerance, unit_mass_tolerance, fragmentation_method)
+        results = parallel_annotate(
+            row,
+            index_columns,
+            mass_tolerance,
+            unit_mass_tolerance,
+            fragmentation_method=fragmentation_method,
+            custom_mods=custom_mods,
+        )
         if not results:
             continue
         raw_file_annotations.append(results)
@@ -336,6 +345,7 @@ def parallel_annotate(
     index_columns: Dict[str, int],
     mass_tolerance: Optional[float] = None,
     unit_mass_tolerance: Optional[str] = None,
+    custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
     fragmentation_method: str = "HCD",
 ) -> Optional[
     Union[
@@ -357,6 +367,7 @@ def parallel_annotate(
     :param index_columns: a dictionary that contains the index columns of the spectrum
     :param mass_tolerance: mass tolerance to calculate min and max mass
     :param unit_mass_tolerance: unit for the mass tolerance (da or ppm)
+    :param custom_mods: Custom Modifications with the identifier, the unimod equivalent and the respective mass
     :param fragmentation_method: fragmentation method that was used
     :return: a tuple containing intensity values (np.ndarray), masses (np.ndarray), calculated mass (float),
              and any removed peaks (List[str])
@@ -366,13 +377,18 @@ def parallel_annotate(
         if spectrum[index_columns["PEPTIDE_LENGTH"]] > 30:  # this was in initialize peaks but can be checked prior
             return None
         return _annotate_linear_spectrum(
-            spectrum, index_columns, mass_tolerance, unit_mass_tolerance, fragmentation_method
+            spectrum,
+            index_columns,
+            mass_tolerance,
+            unit_mass_tolerance,
+            fragmentation_method=fragmentation_method,
+            custom_mods=custom_mods,
         )
 
     if (spectrum[index_columns["PEPTIDE_LENGTH_A"]] > 30) or (spectrum[index_columns["PEPTIDE_LENGTH_B"]] > 30):
         return None
     return _annotate_crosslinked_spectrum(
-        spectrum, index_columns, spectrum[xl_type_col], mass_tolerance, unit_mass_tolerance
+        spectrum, index_columns, spectrum[xl_type_col], mass_tolerance, unit_mass_tolerance, custom_mods=custom_mods
     )
 
 
@@ -381,6 +397,7 @@ def _annotate_linear_spectrum(
     index_columns: Dict[str, int],
     mass_tolerance: Optional[float],
     unit_mass_tolerance: Optional[str],
+    custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
     fragmentation_method: str = "HCD",
 ):
     """
@@ -390,6 +407,7 @@ def _annotate_linear_spectrum(
     :param index_columns: Index columns of the spectrum
     :param mass_tolerance: Mass tolerance for calculating min and max mass
     :param unit_mass_tolerance: Unit for the mass tolerance (da or ppm)
+    :param custom_mods: Custom Modifications with the identifier, the unimod equivalent and the respective mass
     :param fragmentation_method: fragmentation method that was used
     :return: Annotated spectrum
     """
@@ -403,6 +421,7 @@ def _annotate_linear_spectrum(
         mass_tolerance=mass_tolerance,
         unit_mass_tolerance=unit_mass_tolerance,
         fragmentation_method=fragmentation_method,
+        custom_mods=custom_mods,
     )
     matched_peaks = match_peaks(
         fragments_meta_data,
@@ -435,6 +454,7 @@ def _annotate_crosslinked_spectrum(
     crosslinker_type: str,
     mass_tolerance: Optional[float] = None,
     unit_mass_tolerance: Optional[str] = None,
+    custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
 ):
     """
     Annotate a crosslinked peptide spectrum.
@@ -444,6 +464,7 @@ def _annotate_crosslinked_spectrum(
     :param crosslinker_type: Type of crosslinker used
     :param mass_tolerance: Mass tolerance for calculating min and max mass
     :param unit_mass_tolerance: Unit for the mass tolerance (da or ppm)
+    :param custom_mods: Custom Modifications with the identifier, the unimod equivalent and the respective mass
     :raises ValueError: if unsupported crosslinker type was supplied.
 
     :return: Annotated spectrum
@@ -470,6 +491,7 @@ def _xl_annotation_workflow(seq_id: str, non_cl_xl: bool):
 
         else:
             array_size = 348
+        inputs.append(custom_mods)
         fragments_meta_data, tmt_n_term, unmod_sequence, calc_mass = initialize_peaks_xl(*inputs)
         matched_peaks = match_peaks(
             fragments_meta_data,

diff --git a/spectrum_fundamentals/constants.py b/spectrum_fundamentals/constants.py
@@ -1,4 +1,5 @@
 from enum import Enum
+from typing import Dict, List, Optional, Tuple
 
 import numpy as np
 
@@ -233,19 +234,46 @@
     "[UNIMOD:747]": 86.000394,  # Malonylation
 }
 
+
+def update_mod_masses(mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None) -> Dict[str, float]:
+    """
+    Function to update MOD_MASSES with custom modifications.
+
+    :param mods: Modifications with respective mass
+    :raises AssertionError: if mass of modification was not provided as float.
+    :return: updated MOD_MASSES
+    """
+    mod_masses = {}
+    if mods:
+        try:
+            stat_mods: List[Tuple[str, float]] = [
+                (value[0], float(value[1])) for key, value in (mods.get("stat_mods") or {}).items()
+            ]
+            var_mods: List[Tuple[str, float]] = [
+                (value[0], float(value[1])) for key, value in (mods.get("var_mods") or {}).items()
+            ]
+        except ValueError as e:
+            raise AssertionError("All custom modifications value entries must be of type Tuple[str, float]") from e
+
+        for value in stat_mods + var_mods:
+            mod_masses[value[0]] = float(value[1])
+
+    return MOD_MASSES | mod_masses
+
+
 MOD_MASSES_SAGE = {
-    229.1629: "[UNIMOD:737]",
-    304.2071: "[UNIMOD:2016]",
-    144.1020: "[UNIMOD:214]",
-    304.2053: "[UNIMOD:730]",
-    8.0141: "[UNIMOD:259]",
-    10.0082: "[UNIMOD:267]",
-    79.9663: "[UNIMOD:21]",
-    -18.0105: "[UNIMOD:23]",
-    57.0215: "[UNIMOD:4]",
-    15.9949: "[UNIMOD:35]",
-    15.994: "[UNIMOD:35]",
-    42.0105: "[UNIMOD:1]",
+    "229.1629": "[UNIMOD:737]",
+    "304.2071": "[UNIMOD:2016]",
+    "144.1020": "[UNIMOD:214]",
+    "304.2053": "[UNIMOD:730]",
+    "8.0141": "[UNIMOD:259]",
+    "10.0082": "[UNIMOD:267]",
+    "79.9663": "[UNIMOD:21]",
+    "-18.0105": "[UNIMOD:23]",
+    "57.0215": "[UNIMOD:4]",
+    "15.9949": "[UNIMOD:35]",
+    "15.994": "[UNIMOD:35]",
+    "42.0105": "[UNIMOD:1]",
 }
 # these are only used for prosit_grpc, oktoberfest uses the masses from MOD_MASSES
 AA_MOD_MASSES = {

diff --git a/spectrum_fundamentals/fragments.py b/spectrum_fundamentals/fragments.py
@@ -12,7 +12,9 @@
 logger = logging.getLogger(__name__)
 
 
-def _get_modifications(peptide_sequence: str) -> Dict[int, float]:
+def _get_modifications(
+    peptide_sequence: str, custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None
+) -> Dict[int, float]:
     """
     Get modification masses and position in a peptide sequence.
 
@@ -24,6 +26,7 @@ def _get_modifications(peptide_sequence: str) -> Dict[int, float]:
     modification was present are returned.
 
     :param peptide_sequence: Modified peptide sequence
+    :param custom_mods: Custom Modifications with the identifier, the unimod equivalent and the respective mass
     :return: modification_deltas
     """
     modification_deltas = {}
@@ -37,25 +40,28 @@ def _get_modifications(peptide_sequence: str) -> Dict[int, float]:
     pattern = re.compile(r"\[.{8}[^\]]*\]")
     matches = pattern.finditer(peptide_sequence)
 
+    mod_masses = constants.update_mod_masses(mods=custom_mods)
+
     for match in matches:
         start_pos = match.start()
         end_pos = match.end()
-        modification_deltas[start_pos - offset] = constants.MOD_MASSES[peptide_sequence[start_pos:end_pos]]
+        modification_deltas[start_pos - offset] = mod_masses[peptide_sequence[start_pos:end_pos]]
         offset += end_pos - start_pos
 
     return modification_deltas
 
 
-def compute_peptide_mass(sequence: str) -> float:
+def compute_peptide_mass(sequence: str, custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None) -> float:
     """
     Compute the theoretical mass of the peptide sequence.
 
     :param sequence: Modified peptide sequence
+    :param custom_mods: Custom Modifications with the identifier, the unimod equivalent and the respective mass
     :return: Theoretical mass of the sequence
     """
     terminal_masses = 2 * constants.ATOM_MASSES["H"] + constants.ATOM_MASSES["O"]  # add terminal masses HO- and H-
 
-    modification_deltas = _get_modifications(sequence)
+    modification_deltas = _get_modifications(sequence, custom_mods=custom_mods)
     if modification_deltas:  # there were modifictions
         sequence = internal_without_mods([sequence])[0]
         terminal_masses += modification_deltas.get(-2, 0.0)  # prime with n_term_mod delta if present
@@ -67,7 +73,7 @@ def compute_peptide_mass(sequence: str) -> float:
 
 def _xl_sanity_check(noncl_xl: int, peptide_beta_mass: float, xl_pos: float):
     """
-    Checks input validity for initialize_peacks when used with xl mode.
+    Checks input validity for initialize_peaks when used with xl mode.
 
     :param noncl_xl: whether the function is called with a non-cleavable xl modification
     :param peptide_beta_mass: the mass of the second peptide to be considered for non-cleavable XL
@@ -135,6 +141,7 @@ def initialize_peaks(
     peptide_beta_mass: float = 0.0,
     xl_pos: int = -1,
     fragmentation_method: str = "HCD",
+    custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
 ) -> Tuple[List[dict], int, str, float]:
     """
     Generate theoretical peaks for a modified peptide sequence.
@@ -148,13 +155,14 @@ def initialize_peaks(
     :param peptide_beta_mass: the mass of the second peptide to be considered for non-cleavable XL
     :param xl_pos: the position of the crosslinker for non-cleavable XL
     :param fragmentation_method: fragmentation method that was used
+    :param custom_mods: Custom Modifications with the identifier, the unimod equivalent and the respective mass
     :return: List of theoretical peaks, Flag to indicate if there is a tmt on n-terminus, Un modified peptide sequence
     """
     _xl_sanity_check(noncl_xl, peptide_beta_mass, xl_pos)
 
     max_charge = min(3, charge)
     ion_types = retrieve_ion_types(fragmentation_method)
-    modification_deltas = _get_modifications(sequence)
+    modification_deltas = _get_modifications(sequence, custom_mods=custom_mods)
 
     fragments_meta_data = []
     n_term_mod = 1
@@ -230,6 +238,7 @@ def initialize_peaks_xl(
     mass_tolerance: Optional[float] = None,
     unit_mass_tolerance: Optional[str] = None,
     sequence_beta: Optional[str] = None,
+    custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
 ) -> Tuple[List[dict], int, str, float]:
     """
     Generate theoretical peaks for a modified (potentially cleavable cross-linked) peptide sequence.
@@ -243,6 +252,7 @@ def initialize_peaks_xl(
     :param mass_tolerance: mass tolerance to calculate min and max mass
     :param unit_mass_tolerance: unit for the mass tolerance (da or ppm)
     :param sequence_beta: optional second peptide to be considered for non-cleavable XL
+    :param custom_mods: Custom Modifications with the identifier, the unimod equivalent and the respective mass
     :raises ValueError: if crosslinker_type is unkown
     :raises AssertionError: if the short and long XL sequence (the one with the short / long crosslinker mod)
         has a tmt n term while the other one does not
@@ -278,10 +288,10 @@ def initialize_peaks_xl(
         # percolator!
 
         list_out_s, tmt_n_term_s, peptide_sequence, _ = initialize_peaks(
-            sequence_s, mass_analyzer, charge, mass_tolerance, unit_mass_tolerance
+            sequence_s, mass_analyzer, charge, mass_tolerance, unit_mass_tolerance, custom_mods=custom_mods
         )
         list_out_l, tmt_n_term_l, peptide_sequence, _ = initialize_peaks(
-            sequence_l, mass_analyzer, charge, mass_tolerance, unit_mass_tolerance
+            sequence_l, mass_analyzer, charge, mass_tolerance, unit_mass_tolerance, custom_mods=custom_mods
         )
 
         tmt_n_term = tmt_n_term_s
@@ -324,6 +334,7 @@ def initialize_peaks_xl(
             True,
             sequence_beta_mass if sequence_beta_mass is not None else None,
             crosslinker_position,
+            custom_mods=custom_mods,
         )
         df_out = pd.DataFrame(list_out)