From 847ddceab6db8cf95f8ca6942a2fb2f91894b32e Mon Sep 17 00:00:00 2001
From: WassimG <wassim.gabriel@gmail.com>
Date: Thu, 19 Sep 2024 12:37:04 +0000
Subject: [PATCH] Added neutral losses annotation

---
 spectrum_fundamentals/constants.py |  59 ++++++++++++++++
 spectrum_fundamentals/fragments.py | 106 ++++++++++++++++++++++++++++-
 tests/unit_tests/test_fragments.py |  48 +++++++++++++
 3 files changed, 212 insertions(+), 1 deletion(-)

diff --git a/spectrum_fundamentals/constants.py b/spectrum_fundamentals/constants.py
index 56b46b3..69b4894 100644
--- a/spectrum_fundamentals/constants.py
+++ b/spectrum_fundamentals/constants.py
@@ -144,6 +144,8 @@
     "C": 12.0,
     "O": 15.9949146,
     "N": 14.003074,
+    'S': 31.9720712,
+    'P': 30.9737619
 }
 
 MASSES = {**PARTICLE_MASSES, **ATOM_MASSES}
@@ -283,6 +285,63 @@
 
 AA_MOD = {**AA_MASSES, **AA_MOD_MASSES}
 
+
+AA_Neutral_losses = {
+    'R': ['NH3', 'CH2N2', 'C3H9N3'],
+    'N': ['NH3', 'CH3NO', 'C2H5NO', 'C3H5NO'],
+    'D': ['H2O', 'CO2', 'C2H4O2'],
+    'C': ['CH2S'],
+    'E': ['H2O', 'C2H4O2'],
+    'Q': ['NH3', 'CH3NO', 'C2H5NO', 'C3H5NO'],
+    'I': ['C2H4'],
+    'L': ['C3H6', 'C4H8'],
+    'K': ['C2H5N', 'C4H9N', 'C4H11N', 'C3H9N'],
+    'M': ['C2H4S', 'C3H6S'],
+    'M[UNIMOD:35]': ['CH4SO', 'C3H8SO', 'C3H6SO'],
+    'S': ['H2O', 'CH4O'],
+    'T': ['H2O', 'C2H4O'],
+    'W': ['C8H7N', 'C9H9N'],
+    'V': ['C3H6'],
+    '[]-': ['NH3'],
+    '-[]': ['H2O'],
+}
+
+Mod_Neutral_losses = {
+    'R[UNIMOD:7]': ['CHNO'],
+    'S[UNIMOD:21]': ['H3O4P']
+}
+
+Neutral_losses_Mass = {
+ 'C2H4': (ATOM_MASSES['C']*2) + (ATOM_MASSES['H']*4),
+ 'C2H4O': (ATOM_MASSES['C']*2) + (ATOM_MASSES['H']*4) + ATOM_MASSES['O'],
+ 'C2H4O2': (ATOM_MASSES['C']*2) + (ATOM_MASSES['H']*4) + (ATOM_MASSES['O']*2),
+ 'C2H4S': (ATOM_MASSES['C']*2) + (ATOM_MASSES['H']*4) + ATOM_MASSES['S'],
+ 'C2H5N':  (ATOM_MASSES['C']*2) + (ATOM_MASSES['H']*5) + ATOM_MASSES['N'],
+ 'CHNO': (ATOM_MASSES['C']) + (ATOM_MASSES['H']) + ATOM_MASSES['N'] + ATOM_MASSES['O'],
+ 'C2H5NO': (ATOM_MASSES['C']*2) + (ATOM_MASSES['H']*5) + ATOM_MASSES['N'] + ATOM_MASSES['O'],
+ 'C3H5NO': (ATOM_MASSES['C']*3) + (ATOM_MASSES['H']*5) + ATOM_MASSES['N'] + ATOM_MASSES['O'],
+ 'C3H6': (ATOM_MASSES['C']*3) + (ATOM_MASSES['H']*6),
+ 'C3H6S': (ATOM_MASSES['C']*3) + (ATOM_MASSES['H']*6) + ATOM_MASSES['S'],
+ 'C3H6SO': (ATOM_MASSES['C']*3) + (ATOM_MASSES['H']*6) + ATOM_MASSES['S'] + ATOM_MASSES['O'],
+ 'C3H8SO': (ATOM_MASSES['C']*3) + (ATOM_MASSES['H']*8) + ATOM_MASSES['S'] + ATOM_MASSES['O'],
+ 'C3H9N': (ATOM_MASSES['C']*3) + (ATOM_MASSES['H']*9) + ATOM_MASSES['N'],
+ 'C3H9N3': (ATOM_MASSES['C']*3) + (ATOM_MASSES['H']*9) + (ATOM_MASSES['N']*3),
+ 'C4H11N': (ATOM_MASSES['C']*4) + (ATOM_MASSES['H']*11) + ATOM_MASSES['N'],
+ 'C4H8': (ATOM_MASSES['C']*4) + (ATOM_MASSES['H']*8),
+ 'C4H9N': (ATOM_MASSES['C']*4) + (ATOM_MASSES['H']*9) + ATOM_MASSES['N'],
+ 'C8H7N': (ATOM_MASSES['C']*8) + (ATOM_MASSES['H']*7) + ATOM_MASSES['N'],
+ 'C9H9N': (ATOM_MASSES['C']*9) + (ATOM_MASSES['H']*9) + ATOM_MASSES['N'],
+ 'CH2N2': ATOM_MASSES['C'] + (ATOM_MASSES['H']*2) + (ATOM_MASSES['N']*2),
+ 'CH2S': ATOM_MASSES['C'] + (ATOM_MASSES['H']*2) + ATOM_MASSES['S'],
+ 'CH3NO': ATOM_MASSES['C'] + (ATOM_MASSES['H']*3) + ATOM_MASSES['N'] + ATOM_MASSES['O'],
+ 'CH4O': ATOM_MASSES['C'] + (ATOM_MASSES['H']*4) + ATOM_MASSES['O'],
+ 'CH4SO': ATOM_MASSES['C'] + (ATOM_MASSES['H']*4) + ATOM_MASSES['S'] + ATOM_MASSES['O'],
+ 'CO2': ATOM_MASSES['C'] + (ATOM_MASSES['O']*2),
+ 'H2O': (ATOM_MASSES['H']*2) + ATOM_MASSES['O'],
+ 'NH3': ATOM_MASSES['N'] + (ATOM_MASSES['H']*3),
+ 'H3O4P': (ATOM_MASSES['H']*3) + (ATOM_MASSES['O']*4) + ATOM_MASSES['P']
+}
+
 #######################################
 # HELPERS FOR FRAGMENT MZ CALCULATION #
 #######################################
diff --git a/spectrum_fundamentals/fragments.py b/spectrum_fundamentals/fragments.py
index 24741c4..7be9e41 100644
--- a/spectrum_fundamentals/fragments.py
+++ b/spectrum_fundamentals/fragments.py
@@ -130,6 +130,83 @@ def get_ion_delta(ion_types: List[str]) -> np.ndarray:
     return np.array([c.ION_DELTAS[ion_type] for ion_type in ion_types]).reshape(len(ion_types), 1)
 
 
+def _add_nl(neutral_losses, nl_dict, start_aa_index, end_aa_index):
+    """
+    Adds neutral losses (NL) to a dictionary of neutral losses for specific amino acid indices.
+
+    This function updates the `nl_dict` by incorporating the provided `neutral_losses` into 
+    the amino acid indices between `start_aa_index` and `end_aa_index`. 
+
+    :param list neutral_losses: A list of neutral losses to be added to the amino acids.
+    :param dict nl_dict: A dictionary where the keys are amino acid indices and the values are lists of neutral 
+        losses associated with each index.
+    :param int start_aa_index: The starting index of the amino acid range to which the neutral losses should be added.
+    :param int end_aa_index: The ending index of the amino acid range to which the neutral losses should be added.
+    :returns: Updated dictionary with the added neutral losses for the specified amino acid indices.
+    """
+    first_nl = True
+    new_nls = {}
+    for nl in neutral_losses:
+        for i in range(start_aa_index, end_aa_index):
+            current_aa_nl = nl_dict[i]
+            if first_nl:
+                new_nls[i] = list(set(neutral_losses) - set(current_aa_nl))
+            if nl not in current_aa_nl:
+                current_aa_nl.append(nl)
+        first_nl = False
+    return nl_dict
+
+
+def _get_neutral_losses(peptide_sequence, modifications):
+    """
+    Get possible neutral losses and position in a peptide sequence.
+    :param peptide_sequence: Unmodified peptide sequence
+    :modifications: modifications dict generated by _get_modifications from modified petide sequence.
+    :return: Dict with neutral losses position as an ID and composition as its value.
+    """
+    sequence_length = len(peptide_sequence)
+    keys = range(0, sequence_length - 1)
+
+    NL_b_ions = dict([(key, []) for key in keys])
+    NL_y_ions = dict([(key, []) for key in keys])
+
+    for i in range(0, sequence_length):
+        aa = peptide_sequence[i]
+        if aa in c.AA_Neutral_losses:
+            if i in modifications:
+                if aa == 'M' and modifications[i] == 15.9949146:
+                    NL_b_ions = _add_nl(c.AA_Neutral_losses['M[UNIMOD:35]'], NL_b_ions, i, sequence_length - 1)
+                    NL_y_ions = _add_nl(c.AA_Neutral_losses['M[UNIMOD:35]'], NL_y_ions, sequence_length - i - 1,
+                                        sequence_length - 1)
+                elif aa == 'R' and modifications[i] == 0.984016:
+                    NL_b_ions = _add_nl(c.Mod_Neutral_losses['R[UNIMOD:7]'], NL_b_ions, i, sequence_length - 1)
+                    NL_y_ions = _add_nl(c.Mod_Neutral_losses['R[UNIMOD:7]'], NL_y_ions, sequence_length - i - 1,
+                                        sequence_length - 1)
+                elif (aa == 'S' or aa=='T') and modifications[i] == 79.9663:
+                    NL_b_ions = _add_nl(c.Mod_Neutral_losses['R[UNIMOD:7]'], NL_b_ions, i, sequence_length - 1)
+                    NL_y_ions = _add_nl(c.Mod_Neutral_losses['R[UNIMOD:7]'], NL_y_ions, sequence_length - i - 1,
+                                        sequence_length - 1)
+            else:
+                NL_b_ions = _add_nl(c.AA_Neutral_losses[aa], NL_b_ions, i, sequence_length - 1)
+                NL_y_ions = _add_nl(c.AA_Neutral_losses[aa], NL_y_ions, sequence_length - i - 1, sequence_length - 1)
+    return NL_b_ions, NL_y_ions
+
+def _calculate_nl_score_mass(neutral_loss):
+    """
+    Calculates the score and mass for a given neutral loss (NL).
+    :param str neutral_loss: The type of neutral loss for which to calculate the score and mass.
+    This should be a key present in the `Neutral_losses_Mass` dictionary.    
+    :returns: A tuple containing the adjusted score and the mass of the specified neutral loss.
+    """
+    score = 100
+    mass = 0
+    mass = c.Neutral_losses_Mass[neutral_loss]
+    if neutral_loss == 'H2O' or neutral_loss == 'NH3':
+        score -= 5
+    else:
+        score -= 30
+    return score, mass
+
 def initialize_peaks(
     sequence: str,
     mass_analyzer: str,
@@ -141,6 +218,7 @@ def initialize_peaks(
     xl_pos: int = -1,
     fragmentation_method: str = "HCD",
     custom_mods: Optional[Dict[str, float]] = None,
+    add_neutral_losses: Optional[bool] = False
 ) -> Tuple[List[dict], int, str, float]:
     """
     Generate theoretical peaks for a modified peptide sequence.
@@ -155,6 +233,7 @@ def initialize_peaks(
     :param xl_pos: the position of the crosslinker for non-cleavable XL
     :param fragmentation_method: fragmentation method that was used
     :param custom_mods: mapping of custom UNIMOD string identifiers ('[UNIMOD:xyz]') to their mass
+    :param add_neutral_losses: Flag to indicate whether to annotate neutral losses or not
     :return: List of theoretical peaks, Flag to indicate if there is a tmt on n-terminus, Un modified peptide sequence
     """
     _xl_sanity_check(noncl_xl, peptide_beta_mass, xl_pos)
@@ -181,6 +260,9 @@ def initialize_peaks(
             # add n_term mass to first aa for easy processing in the following calculation
             modification_deltas[0] = modification_deltas.get(0, 0.0) + n_term_delta
 
+    if add_neutral_losses:
+        nl_b_ions, nl_y_ions = _get_neutral_losses(sequence, modification_deltas)
+        nl_ions = [nl_y_ions,nl_b_ions]
     mass_arr = np.array([c.AA_MASSES[_] for _ in sequence])
     for pos, mod_mass in modification_deltas.items():
         mass_arr[pos] += mod_mass
@@ -201,7 +283,6 @@ def initialize_peaks(
     # shape of ion_mzs: (n_ions, n_fragments, max_charge)
     charges = np.arange(1, max_charge + 1)
     ion_mzs = (sum_array[..., np.newaxis] + charges * c.PARTICLE_MASSES["PROTON"]) / charges
-
     min_mzs, max_mzs = get_min_max_mass(mass_analyzer, ion_mzs, mass_tolerance, unit_mass_tolerance)
 
     # write mz together with min and max value in output list with one dictionary for each ion
@@ -216,8 +297,31 @@ def initialize_peaks(
                         "mass": ion_mzs[ion_type, number, charge],  # mz
                         "min_mass": min_mzs[ion_type, number, charge],  # min mz
                         "max_mass": max_mzs[ion_type, number, charge],  # max mz
+                        "neutral_loss": '',
+                        "fragment_score": 100,
                     }
                 )
+                if not add_neutral_losses:
+                    continue
+                for nl in nl_ions[ion_type][number]:
+                    nl_score, nl_mass = _calculate_nl_score_mass(nl)
+                    ion_mass = sum_array[ion_type,number] - nl_mass
+                    ion_mz = (ion_mass + (charge+1) * c.PARTICLE_MASSES["PROTON"]) / (charge+1)
+                    min_mz, max_mz = get_min_max_mass(mass_analyzer, ion_mz, mass_tolerance, unit_mass_tolerance)
+                    
+                    fragments_meta_data.append(
+                    {
+                        "ion_type": ion_types[ion_type],  # ion type
+                        "no": number + 1,  # no
+                        "charge": charge + 1,  # charge
+                        "mass": ion_mz,  # mz
+                        "min_mass": min_mz,  # min mz
+                        "max_mass": max_mz,  # max mz
+                        "neutral_loss": nl,
+                        "fragment_score": 100- nl_score,
+                    }
+                ) 
+
 
     fragments_meta_data = sorted(fragments_meta_data, key=itemgetter("mass"))
 
diff --git a/tests/unit_tests/test_fragments.py b/tests/unit_tests/test_fragments.py
index 6b5f095..9c1f5b7 100644
--- a/tests/unit_tests/test_fragments.py
+++ b/tests/unit_tests/test_fragments.py
@@ -176,3 +176,51 @@ def test_catches_redundant_order(self):
             _ = fragments.generate_fragment_ion_annotations(
                 ion_types=["y", "b"], order=("ion_type", "position", "ion_type")
             )
+
+
+class TestNeutralLossFunctions(unittest.TestCase):
+
+    def test_add_nl(self):
+        """
+        Test the _add_nl function with a range of amino acids and neutral losses.
+        """
+        neutral_losses = ['H2O', 'NH3']
+        nl_dict = {
+            0: ['CO2'],     # Initial dictionary contains CO2 for index 0
+            1: ['H2O'],     # Initial dictionary contains H2O for index 1
+        }
+        start_aa_index = 0
+        end_aa_index = 2  # Only indices 0 and 1 will be modified
+
+        # Expected output after the neutral losses are added
+        expected_nl_dict = {
+            0: ['CO2', 'H2O', 'NH3'],  # Both H2O and NH3 added
+            1: ['H2O', 'NH3']          # NH3 added, H2O already present
+        }
+
+        result = fragments._add_nl(neutral_losses, nl_dict, start_aa_index, end_aa_index)
+        self.assertEqual(result, expected_nl_dict)
+
+    def test_calculate_nl_score_mass(self):
+        """
+        Test the _calculate_nl_score_mass function with various neutral losses.
+        """
+        # Test for H2O, which should reduce the score by 5
+        score, mass = fragments._calculate_nl_score_mass('H2O')
+        self.assertEqual(score, 95)  # Starting score of 100 - 5
+        self.assertEqual(mass, 18.01056467)  # Mass of H2O
+
+        # Test for NH3, which should also reduce the score by 5
+        score, mass = fragments._calculate_nl_score_mass('NH3')
+        self.assertEqual(score, 95)  # Starting score of 100 - 5
+        self.assertEqual(mass, 17.026549105)  # Mass of NH3
+
+        # Test for CO2, which should reduce the score by 30
+        score, mass = fragments._calculate_nl_score_mass('CO2')
+        self.assertEqual(score, 70)  # Starting score of 100 - 30
+        self.assertEqual(mass, 43.9898292)  # Mass of CO2
+
+        # Test for C2H4O2, which should reduce the score by 30
+        score, mass = fragments._calculate_nl_score_mass('C2H4O2')
+        self.assertEqual(score, 70)  # Starting score of 100 - 30
+        self.assertEqual(mass, 60.02112934)
\ No newline at end of file