Skip to content

Commit

Permalink
rm compute_ion_mass function + custom mod cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
picciama committed Aug 6, 2024
1 parent 645ec62 commit 39c18f5
Show file tree
Hide file tree
Showing 7 changed files with 55 additions and 193 deletions.
16 changes: 8 additions & 8 deletions spectrum_fundamentals/annotation/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def annotate_spectra(
un_annot_spectra: pd.DataFrame,
mass_tolerance: Optional[float] = None,
unit_mass_tolerance: Optional[str] = None,
custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
custom_mods: Optional[Dict[str, float]] = None,
fragmentation_method: str = "HCD",
) -> pd.DataFrame:
"""
Expand All @@ -142,7 +142,7 @@ def annotate_spectra(
:param mass_tolerance: mass tolerance to calculate min and max mass
:param unit_mass_tolerance: unit for the mass tolerance (da or ppm)
:param fragmentation_method: fragmentation method that was used
:param custom_mods: Custom Modifications with the identifier, the unimod equivalent and the respective mass
:param custom_mods: mapping of custom UNIMOD string identifiers ('[UNIMOD:xyz]') to their mass
:return: a Pandas DataFrame containing the annotated spectra with meta data
"""
raw_file_annotations = []
Expand Down Expand Up @@ -345,7 +345,7 @@ def parallel_annotate(
index_columns: Dict[str, int],
mass_tolerance: Optional[float] = None,
unit_mass_tolerance: Optional[str] = None,
custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
custom_mods: Optional[Dict[str, float]] = None,
fragmentation_method: str = "HCD",
) -> Optional[
Union[
Expand All @@ -367,7 +367,7 @@ def parallel_annotate(
:param index_columns: a dictionary that contains the index columns of the spectrum
:param mass_tolerance: mass tolerance to calculate min and max mass
:param unit_mass_tolerance: unit for the mass tolerance (da or ppm)
:param custom_mods: Custom Modifications with the identifier, the unimod equivalent and the respective mass
:param custom_mods: mapping of custom UNIMOD string identifiers ('[UNIMOD:xyz]') to their mass
:param fragmentation_method: fragmentation method that was used
:return: a tuple containing intensity values (np.ndarray), masses (np.ndarray), calculated mass (float),
and any removed peaks (List[str])
Expand Down Expand Up @@ -397,7 +397,7 @@ def _annotate_linear_spectrum(
index_columns: Dict[str, int],
mass_tolerance: Optional[float],
unit_mass_tolerance: Optional[str],
custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
custom_mods: Optional[Dict[str, float]] = None,
fragmentation_method: str = "HCD",
):
"""
Expand All @@ -407,7 +407,7 @@ def _annotate_linear_spectrum(
:param index_columns: Index columns of the spectrum
:param mass_tolerance: Mass tolerance for calculating min and max mass
:param unit_mass_tolerance: Unit for the mass tolerance (da or ppm)
:param custom_mods: Custom Modifications with the identifier, the unimod equivalent and the respective mass
:param custom_mods: mapping of custom UNIMOD string identifiers ('[UNIMOD:xyz]') to their mass
:param fragmentation_method: fragmentation method that was used
:return: Annotated spectrum
"""
Expand Down Expand Up @@ -454,7 +454,7 @@ def _annotate_crosslinked_spectrum(
crosslinker_type: str,
mass_tolerance: Optional[float] = None,
unit_mass_tolerance: Optional[str] = None,
custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
custom_mods: Optional[Dict[str, float]] = None,
):
"""
Annotate a crosslinked peptide spectrum.
Expand All @@ -464,7 +464,7 @@ def _annotate_crosslinked_spectrum(
:param crosslinker_type: Type of crosslinker used
:param mass_tolerance: Mass tolerance for calculating min and max mass
:param unit_mass_tolerance: Unit for the mass tolerance (da or ppm)
:param custom_mods: Custom Modifications with the identifier, the unimod equivalent and the respective mass
:param custom_mods: mapping of custom UNIMOD string identifiers ('[UNIMOD:xyz]') to their mass
:raises ValueError: if unsupported crosslinker type was supplied.
:return: Annotated spectrum
Expand Down
29 changes: 2 additions & 27 deletions spectrum_fundamentals/constants.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from enum import Enum
from typing import Dict, List, Optional, Tuple

import numpy as np

Expand Down Expand Up @@ -235,32 +234,6 @@
}


def update_mod_masses(mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None) -> Dict[str, float]:
"""
Function to update MOD_MASSES with custom modifications.
:param mods: Modifications with respective mass
:raises AssertionError: if mass of modification was not provided as float.
:return: updated MOD_MASSES
"""
mod_masses = {}
if mods:
try:
stat_mods: List[Tuple[str, float]] = [
(value[0], float(value[1])) for key, value in (mods.get("stat_mods") or {}).items()
]
var_mods: List[Tuple[str, float]] = [
(value[0], float(value[1])) for key, value in (mods.get("var_mods") or {}).items()
]
except ValueError as e:
raise AssertionError("All custom modifications value entries must be of type Tuple[str, float]") from e

for value in stat_mods + var_mods:
mod_masses[value[0]] = float(value[1])

return MOD_MASSES | mod_masses


MOD_MASSES_SAGE = {
"229.1629": "[UNIMOD:737]",
"304.2071": "[UNIMOD:2016]",
Expand All @@ -276,6 +249,8 @@ def update_mod_masses(mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] =
"42.0105": "[UNIMOD:1]",
}
# these are only used for prosit_grpc, oktoberfest uses the masses from MOD_MASSES


AA_MOD_MASSES = {
"K[UNIMOD:737]": AA_MASSES["K"] + MOD_MASSES["[UNIMOD:737]"],
"M[UNIMOD:35]": AA_MASSES["M"] + MOD_MASSES["[UNIMOD:35]"],
Expand Down
152 changes: 21 additions & 131 deletions spectrum_fundamentals/fragments.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,13 @@
import numpy as np
import pandas as pd

from . import constants as constants
from .constants import AA_MASSES, ATOM_MASSES, MOD_MASSES, PARTICLE_MASSES
from .mod_string import internal_without_mods

logger = logging.getLogger(__name__)


def _get_modifications(
peptide_sequence: str, custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None
) -> Dict[int, float]:
def _get_modifications(peptide_sequence: str, custom_mods: Optional[Dict[str, float]] = None) -> Dict[int, float]:
"""
Get modification masses and position in a peptide sequence.
Expand All @@ -26,7 +24,7 @@ def _get_modifications(
modification was present are returned.
:param peptide_sequence: Modified peptide sequence
:param custom_mods: Custom Modifications with the identifier, the unimod equivalent and the respective mass
:param custom_mods: mapping of custom UNIMOD string identifiers ('[UNIMOD:xyz]') to their mass
:return: modification_deltas
"""
modification_deltas = {}
Expand All @@ -40,33 +38,32 @@ def _get_modifications(
pattern = re.compile(r"\[.{8}[^\]]*\]")
matches = pattern.finditer(peptide_sequence)

mod_masses = constants.update_mod_masses(mods=custom_mods)
mod_masses = MOD_MASSES | (custom_mods or {})

for match in matches:
start_pos = match.start()
end_pos = match.end()
modification_deltas[start_pos - offset] = mod_masses[peptide_sequence[start_pos:end_pos]]
start_pos, end_pos = match.span()
modification_deltas[start_pos - offset] = mod_masses[match.group()]
offset += end_pos - start_pos

return modification_deltas


def compute_peptide_mass(sequence: str, custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None) -> float:
def compute_peptide_mass(sequence: str, custom_mods: Optional[Dict[str, float]] = None) -> float:
"""
Compute the theoretical mass of the peptide sequence.
:param sequence: Modified peptide sequence
:param custom_mods: Custom Modifications with the identifier, the unimod equivalent and the respective mass
:return: Theoretical mass of the sequence
"""
terminal_masses = 2 * constants.ATOM_MASSES["H"] + constants.ATOM_MASSES["O"] # add terminal masses HO- and H-
terminal_masses = 2 * ATOM_MASSES["H"] + ATOM_MASSES["O"] # add terminal masses HO- and H-

modification_deltas = _get_modifications(sequence, custom_mods=custom_mods)
if modification_deltas: # there were modifictions
sequence = internal_without_mods([sequence])[0]
terminal_masses += modification_deltas.get(-2, 0.0) # prime with n_term_mod delta if present

peptide_sum = sum([constants.AA_MASSES[c] + modification_deltas.get(i, 0.0) for i, c in enumerate(sequence)])
peptide_sum = sum([AA_MASSES[c] + modification_deltas.get(i, 0.0) for i, c in enumerate(sequence)])

return terminal_masses + peptide_sum

Expand Down Expand Up @@ -141,12 +138,12 @@ def get_ion_delta(ion_types: List[str]) -> np.ndarray:
:return: numpy array with masses of the ions
"""
ion_type_offsets = {
"a": -constants.ATOM_MASSES["O"] - constants.ATOM_MASSES["C"],
"a": -ATOM_MASSES["O"] - ATOM_MASSES["C"],
"b": 0.0,
"c": 3 * constants.ATOM_MASSES["H"] + constants.ATOM_MASSES["N"],
"x": 2 * constants.ATOM_MASSES["O"] + constants.ATOM_MASSES["C"],
"y": constants.ATOM_MASSES["O"] + 2 * constants.ATOM_MASSES["H"],
"z": constants.ATOM_MASSES["O"] - constants.ATOM_MASSES["N"] - constants.ATOM_MASSES["H"],
"c": 3 * ATOM_MASSES["H"] + ATOM_MASSES["N"],
"x": 2 * ATOM_MASSES["O"] + ATOM_MASSES["C"],
"y": ATOM_MASSES["O"] + 2 * ATOM_MASSES["H"],
"z": ATOM_MASSES["O"] - ATOM_MASSES["N"] - ATOM_MASSES["H"],
}
# I think list comprehension is fastest way
deltas = np.array([ion_type_offsets[ion_type] for ion_type in ion_types]).reshape(len(ion_types), 1)
Expand All @@ -164,7 +161,7 @@ def initialize_peaks(
peptide_beta_mass: float = 0.0,
xl_pos: int = -1,
fragmentation_method: str = "HCD",
custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
custom_mods: Optional[Dict[str, float]] = None,
) -> Tuple[List[dict], int, str, float]:
"""
Generate theoretical peaks for a modified peptide sequence.
Expand All @@ -178,7 +175,7 @@ def initialize_peaks(
:param peptide_beta_mass: the mass of the second peptide to be considered for non-cleavable XL
:param xl_pos: the position of the crosslinker for non-cleavable XL
:param fragmentation_method: fragmentation method that was used
:param custom_mods: Custom Modifications with the identifier, the unimod equivalent and the respective mass
:param custom_mods: mapping of custom UNIMOD string identifiers ('[UNIMOD:xyz]') to their mass
:return: List of theoretical peaks, Flag to indicate if there is a tmt on n-terminus, Un modified peptide sequence
"""
_xl_sanity_check(noncl_xl, peptide_beta_mass, xl_pos)
Expand All @@ -205,7 +202,7 @@ def initialize_peaks(
# add n_term mass to first aa for easy processing in the following calculation
modification_deltas[0] = modification_deltas.get(0, 0.0) + n_term_delta

mass_arr = np.array([constants.AA_MASSES[_] for _ in sequence])
mass_arr = np.array([AA_MASSES[_] for _ in sequence])
for pos, mod_mass in modification_deltas.items():
mass_arr[pos] += mod_mass

Expand All @@ -224,7 +221,7 @@ def initialize_peaks(
# calculate for m/z for charges 1, 2, 3
# shape of ion_mzs: (n_ions, n_fragments, max_charge)
charges = np.arange(1, max_charge + 1)
ion_mzs = (sum_array[..., np.newaxis] + charges * constants.PARTICLE_MASSES["PROTON"]) / charges
ion_mzs = (sum_array[..., np.newaxis] + charges * PARTICLE_MASSES["PROTON"]) / charges

min_mzs, max_mzs = get_min_max_mass(mass_analyzer, ion_mzs, mass_tolerance, unit_mass_tolerance)

Expand All @@ -249,7 +246,7 @@ def initialize_peaks(
fragments_meta_data,
n_term_mod,
sequence,
(peptide_mass + constants.ATOM_MASSES["O"] + 2 * constants.ATOM_MASSES["H"]),
(peptide_mass + ATOM_MASSES["O"] + 2 * ATOM_MASSES["H"]),
)


Expand All @@ -261,7 +258,7 @@ def initialize_peaks_xl(
mass_tolerance: Optional[float] = None,
unit_mass_tolerance: Optional[str] = None,
sequence_beta: Optional[str] = None,
custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None,
custom_mods: Optional[Dict[str, float]] = None,
) -> Tuple[List[dict], int, str, float]:
"""
Generate theoretical peaks for a modified (potentially cleavable cross-linked) peptide sequence.
Expand All @@ -275,7 +272,7 @@ def initialize_peaks_xl(
:param mass_tolerance: mass tolerance to calculate min and max mass
:param unit_mass_tolerance: unit for the mass tolerance (da or ppm)
:param sequence_beta: optional second peptide to be considered for non-cleavable XL
:param custom_mods: Custom Modifications with the identifier, the unimod equivalent and the respective mass
:param custom_mods: mapping of custom UNIMOD string identifiers ('[UNIMOD:xyz]') to their mass
:raises ValueError: if crosslinker_type is unkown
:raises AssertionError: if the short and long XL sequence (the one with the short / long crosslinker mod)
has a tmt n term while the other one does not
Expand Down Expand Up @@ -425,110 +422,3 @@ def get_min_max_mass(
else:
raise ValueError(f"Unsupported mass_analyzer: {mass_analyzer}")
return (min_mass, max_mass)


def compute_ion_masses(seq_int: List[int], charge_onehot: List[int], tmt: str = "") -> Optional[np.ndarray]:
"""
Collects an integer sequence e.g. [1,2,3] with charge 2 and returns array with 174 positions for ion masses.
Invalid masses are set to -1.
:param seq_int: TODO
:param charge_onehot: is a onehot representation of charge with 6 elems for charges 1 to 6
:param tmt: TODO
:return: list of masses as floats
"""
charge = list(charge_onehot).index(1) + 1
if not (charge in (1, 2, 3, 4, 5, 6) and len(charge_onehot) == 6):
print("[ERROR] One-hot-enconded Charge is not in valid range 1 to 6")
return None

if not len(seq_int) == constants.SEQ_LEN:
print(f"[ERROR] Sequence length {len(seq_int)} is not desired length of {constants.SEQ_LEN}")
return None

idx = list(seq_int).index(0) if 0 in seq_int else constants.SEQ_LEN
masses = np.ones((constants.SEQ_LEN - 1) * 2 * 3, dtype=np.float32) * -1
mass_b = 0
mass_y = 0
j = 0 # iterate over masses

# Iterate over sequence, sequence should have length 30
for i in range(idx - 1): # only 29 possible ions
j = i * 6 # index for masses array at position

# MASS FOR Y IONS
# print("Added", constants.VEC_MZ[seq_int[l-1-i]])
mass_y += constants.VEC_MZ[seq_int[idx - 1 - i]]

# Compute charge +1
masses[j] = (
mass_y
+ 1 * constants.PARTICLE_MASSES["PROTON"]
+ constants.MASSES["C_TERMINUS"]
+ constants.ATOM_MASSES["H"]
) / 1.0
# Compute charge +2
masses[j + 1] = (
(
mass_y
+ 2 * constants.PARTICLE_MASSES["PROTON"]
+ constants.MASSES["C_TERMINUS"]
+ constants.ATOM_MASSES["H"]
)
/ 2.0
if charge >= 2
else -1.0
)
# Compute charge +3
masses[j + 2] = (
(
mass_y
+ 3 * constants.PARTICLE_MASSES["PROTON"]
+ constants.MASSES["C_TERMINUS"]
+ constants.ATOM_MASSES["H"]
)
/ 3.0
if charge >= 3.0
else -1.0
)

# MASS FOR B IONS
if i == 0 and tmt != "":
mass_b += constants.VEC_MZ[seq_int[i]] + constants.MOD_MASSES[constants.TMT_MODS[tmt]]
else:
mass_b += constants.VEC_MZ[seq_int[i]]

# Compute charge +1
masses[j + 3] = (
mass_b
+ 1 * constants.PARTICLE_MASSES["PROTON"]
+ constants.MASSES["N_TERMINUS"]
- constants.ATOM_MASSES["H"]
) / 1.0
# Compute charge +2
masses[j + 4] = (
(
mass_b
+ 2 * constants.PARTICLE_MASSES["PROTON"]
+ constants.MASSES["N_TERMINUS"]
- constants.ATOM_MASSES["H"]
)
/ 2.0
if charge >= 2
else -1.0
)
# Compute charge +3
masses[j + 5] = (
(
mass_b
+ 3 * constants.PARTICLE_MASSES["PROTON"]
+ constants.MASSES["N_TERMINUS"]
- constants.ATOM_MASSES["H"]
)
/ 3.0
if charge >= 3.0
else -1.0
)

return masses
Loading

0 comments on commit 39c18f5

Please sign in to comment.