From 4091ed77640ab3e61422237ed70f7986c9377f6a Mon Sep 17 00:00:00 2001
From: WassimG <wassim.gabriel@gmail.com>
Date: Fri, 20 Sep 2024 15:00:31 +0000
Subject: [PATCH] Add neutral losses annotation and features

---
 .../annotation/annotation.py                  |  46 ++++----
 spectrum_fundamentals/fragments.py            | 111 ++++++++++--------
 spectrum_fundamentals/metrics/percolator.py   |   5 +
 spectrum_fundamentals/mod_string.py           |  12 +-
 4 files changed, 99 insertions(+), 75 deletions(-)

diff --git a/spectrum_fundamentals/annotation/annotation.py b/spectrum_fundamentals/annotation/annotation.py
index 1d42d31..750f664 100644
--- a/spectrum_fundamentals/annotation/annotation.py
+++ b/spectrum_fundamentals/annotation/annotation.py
@@ -17,7 +17,7 @@ def match_peaks(
     tmt_n_term: int,
     unmod_sequence: str,
     charge: int,
-) -> List[Dict[str, Union[str, int, float]]]:
+) -> Tuple[List[Dict[str, Union[str, int, float]]], int]:
     """
     Matching experimental peaks with theoretical fragment ions.
 
@@ -36,6 +36,7 @@ def match_peaks(
     temp_list = []
     next_start_peak = 0
     matched_peak = False
+    count_annotated_nl = 0
     fragment_no: float
     for fragment in fragments_meta_data:
         min_mass = fragment["min_mass"]
@@ -56,27 +57,32 @@ def match_peaks(
             if (
                 not (fragment["ion_type"][0] == "b" and fragment_no == 1)
                 or (unmod_sequence[0] == "R" or unmod_sequence[0] == "H" or unmod_sequence[0] == "K")
-                and (tmt_n_term == 1)
+                or (tmt_n_term == 2)
             ):
-                row_list.append(
-                    {
-                        "ion_type": fragment["ion_type"],
-                        "no": fragment_no,
-                        "charge": fragment["charge"],
-                        "exp_mass": peak_mass,
-                        "theoretical_mass": fragment["mass"],
-                        "intensity": peak_intensity,
-                    }
-                )
-                if peak_intensity > max_intensity:
-                    max_intensity = float(peak_intensity)
+                # For now only counting neutral loss peaks this can change with different models later
+                if fragment["neutral_loss"] == "":
+                    row_list.append(
+                        {
+                            "ion_type": fragment["ion_type"],
+                            "no": fragment_no,
+                            "charge": fragment["charge"],
+                            "exp_mass": peak_mass,
+                            "theoretical_mass": fragment["mass"],
+                            "intensity": peak_intensity,
+                        }
+                    )
+                    if peak_intensity > max_intensity:
+                        max_intensity = float(peak_intensity)
+                else:
+                    count_annotated_nl += 1
+
             matched_peak = True
             next_start_peak = start_peak
             start_peak += 1
     for row in row_list:
         row["intensity"] = float(row["intensity"]) / max_intensity
         temp_list.append(row)
-    return temp_list
+    return temp_list, count_annotated_nl
 
 
 def handle_multiple_matches(
@@ -162,7 +168,7 @@ def annotate_spectra(
     results_df = pd.DataFrame(raw_file_annotations)
 
     if "CROSSLINKER_TYPE" not in index_columns:
-        results_df.columns = ["INTENSITIES", "MZ", "CALCULATED_MASS", "removed_peaks"]
+        results_df.columns = ["INTENSITIES", "MZ", "CALCULATED_MASS", "removed_peaks", "count_nl_peaks"]
     else:
         results_df.columns = [
             "INTENSITIES_A",
@@ -414,7 +420,7 @@ def _annotate_linear_spectrum(
     mod_seq_column = "MODIFIED_SEQUENCE"
     if "MODIFIED_SEQUENCE_MSA" in index_columns:
         mod_seq_column = "MODIFIED_SEQUENCE_MSA"
-    fragments_meta_data, tmt_n_term, unmod_sequence, calc_mass = initialize_peaks(
+    fragments_meta_data, tmt_n_term, unmod_sequence, calc_mass, _ = initialize_peaks(
         sequence=spectrum[index_columns[mod_seq_column]],
         mass_analyzer=spectrum[index_columns["MASS_ANALYZER"]],
         charge=spectrum[index_columns["PRECURSOR_CHARGE"]],
@@ -423,7 +429,7 @@ def _annotate_linear_spectrum(
         fragmentation_method=fragmentation_method,
         custom_mods=custom_mods,
     )
-    matched_peaks = match_peaks(
+    matched_peaks, count_annotated_nl = match_peaks(
         fragments_meta_data,
         spectrum[index_columns["INTENSITIES"]],
         spectrum[index_columns["MZ"]],
@@ -445,7 +451,7 @@ def _annotate_linear_spectrum(
     intensities, mass = generate_annotation_matrix(
         matched_peaks, unmod_sequence, spectrum[index_columns["PRECURSOR_CHARGE"]], fragmentation_method
     )
-    return intensities, mass, calc_mass, removed_peaks
+    return intensities, mass, calc_mass, removed_peaks, count_annotated_nl
 
 
 def _annotate_crosslinked_spectrum(
@@ -493,7 +499,7 @@ def _xl_annotation_workflow(seq_id: str, non_cl_xl: bool):
             array_size = 348
         inputs.append(custom_mods)
         fragments_meta_data, tmt_n_term, unmod_sequence, calc_mass = initialize_peaks_xl(*inputs)
-        matched_peaks = match_peaks(
+        matched_peaks, annotated_nl = match_peaks(
             fragments_meta_data,
             np.array(spectrum[index_columns["INTENSITIES"]]),
             np.array(spectrum[index_columns["MZ"]]),  # Convert to numpy array
diff --git a/spectrum_fundamentals/fragments.py b/spectrum_fundamentals/fragments.py
index 7be9e41..85c3cff 100644
--- a/spectrum_fundamentals/fragments.py
+++ b/spectrum_fundamentals/fragments.py
@@ -130,18 +130,18 @@ def get_ion_delta(ion_types: List[str]) -> np.ndarray:
     return np.array([c.ION_DELTAS[ion_type] for ion_type in ion_types]).reshape(len(ion_types), 1)
 
 
-def _add_nl(neutral_losses, nl_dict, start_aa_index, end_aa_index):
+def _add_nl(neutral_losses: List[str], nl_dict: dict, start_aa_index: int, end_aa_index: int):
     """
     Adds neutral losses (NL) to a dictionary of neutral losses for specific amino acid indices.
 
-    This function updates the `nl_dict` by incorporating the provided `neutral_losses` into 
-    the amino acid indices between `start_aa_index` and `end_aa_index`. 
+    This function updates the `nl_dict` by incorporating the provided `neutral_losses` into
+    the amino acid indices between `start_aa_index` and `end_aa_index`.
 
-    :param list neutral_losses: A list of neutral losses to be added to the amino acids.
-    :param dict nl_dict: A dictionary where the keys are amino acid indices and the values are lists of neutral 
+    :param neutral_losses: A list of neutral losses to be added to the amino acids.
+    :param nl_dict: A dictionary where the keys are amino acid indices and the values are lists of neutral
         losses associated with each index.
-    :param int start_aa_index: The starting index of the amino acid range to which the neutral losses should be added.
-    :param int end_aa_index: The ending index of the amino acid range to which the neutral losses should be added.
+    :param start_aa_index: The starting index of the amino acid range to which the neutral losses should be added.
+    :param end_aa_index: The ending index of the amino acid range to which the neutral losses should be added.
     :returns: Updated dictionary with the added neutral losses for the specified amino acid indices.
     """
     first_nl = True
@@ -160,53 +160,59 @@ def _add_nl(neutral_losses, nl_dict, start_aa_index, end_aa_index):
 def _get_neutral_losses(peptide_sequence, modifications):
     """
     Get possible neutral losses and position in a peptide sequence.
+
     :param peptide_sequence: Unmodified peptide sequence
-    :modifications: modifications dict generated by _get_modifications from modified petide sequence.
+    :param modifications: modifications dict generated by _get_modifications from modified petide sequence.
     :return: Dict with neutral losses position as an ID and composition as its value.
     """
     sequence_length = len(peptide_sequence)
     keys = range(0, sequence_length - 1)
 
-    NL_b_ions = dict([(key, []) for key in keys])
-    NL_y_ions = dict([(key, []) for key in keys])
+    nl_b_ions = {key: [] for key in keys}
+    nl_y_ions = {key: [] for key in keys}
 
     for i in range(0, sequence_length):
         aa = peptide_sequence[i]
         if aa in c.AA_Neutral_losses:
             if i in modifications:
-                if aa == 'M' and modifications[i] == 15.9949146:
-                    NL_b_ions = _add_nl(c.AA_Neutral_losses['M[UNIMOD:35]'], NL_b_ions, i, sequence_length - 1)
-                    NL_y_ions = _add_nl(c.AA_Neutral_losses['M[UNIMOD:35]'], NL_y_ions, sequence_length - i - 1,
-                                        sequence_length - 1)
-                elif aa == 'R' and modifications[i] == 0.984016:
-                    NL_b_ions = _add_nl(c.Mod_Neutral_losses['R[UNIMOD:7]'], NL_b_ions, i, sequence_length - 1)
-                    NL_y_ions = _add_nl(c.Mod_Neutral_losses['R[UNIMOD:7]'], NL_y_ions, sequence_length - i - 1,
-                                        sequence_length - 1)
-                elif (aa == 'S' or aa=='T') and modifications[i] == 79.9663:
-                    NL_b_ions = _add_nl(c.Mod_Neutral_losses['R[UNIMOD:7]'], NL_b_ions, i, sequence_length - 1)
-                    NL_y_ions = _add_nl(c.Mod_Neutral_losses['R[UNIMOD:7]'], NL_y_ions, sequence_length - i - 1,
-                                        sequence_length - 1)
+                if aa == "M" and modifications[i] == 15.9949146:
+                    nl_b_ions = _add_nl(c.AA_Neutral_losses["M[UNIMOD:35]"], nl_b_ions, i, sequence_length - 1)
+                    nl_y_ions = _add_nl(
+                        c.AA_Neutral_losses["M[UNIMOD:35]"], nl_y_ions, sequence_length - i - 1, sequence_length - 1
+                    )
+                elif aa == "R" and modifications[i] == 0.984016:
+                    nl_b_ions = _add_nl(c.Mod_Neutral_losses["R[UNIMOD:7]"], nl_b_ions, i, sequence_length - 1)
+                    nl_y_ions = _add_nl(
+                        c.Mod_Neutral_losses["R[UNIMOD:7]"], nl_y_ions, sequence_length - i - 1, sequence_length - 1
+                    )
+                elif (aa == "S" or aa == "T") and modifications[i] == 79.9663:
+                    nl_b_ions = _add_nl(c.Mod_Neutral_losses["R[UNIMOD:7]"], nl_b_ions, i, sequence_length - 1)
+                    nl_y_ions = _add_nl(
+                        c.Mod_Neutral_losses["R[UNIMOD:7]"], nl_y_ions, sequence_length - i - 1, sequence_length - 1
+                    )
             else:
-                NL_b_ions = _add_nl(c.AA_Neutral_losses[aa], NL_b_ions, i, sequence_length - 1)
-                NL_y_ions = _add_nl(c.AA_Neutral_losses[aa], NL_y_ions, sequence_length - i - 1, sequence_length - 1)
-    return NL_b_ions, NL_y_ions
+                nl_b_ions = _add_nl(c.AA_Neutral_losses[aa], nl_b_ions, i, sequence_length - 1)
+                nl_y_ions = _add_nl(c.AA_Neutral_losses[aa], nl_y_ions, sequence_length - i - 1, sequence_length - 1)
+    return nl_b_ions, nl_y_ions
+
 
 def _calculate_nl_score_mass(neutral_loss):
     """
     Calculates the score and mass for a given neutral loss (NL).
-    :param str neutral_loss: The type of neutral loss for which to calculate the score and mass.
-    This should be a key present in the `Neutral_losses_Mass` dictionary.    
+
+    :param neutral_loss: The type of neutral loss for which to calculate the score and mass.
     :returns: A tuple containing the adjusted score and the mass of the specified neutral loss.
     """
     score = 100
     mass = 0
     mass = c.Neutral_losses_Mass[neutral_loss]
-    if neutral_loss == 'H2O' or neutral_loss == 'NH3':
+    if neutral_loss == "H2O" or neutral_loss == "NH3":
         score -= 5
     else:
         score -= 30
     return score, mass
 
+
 def initialize_peaks(
     sequence: str,
     mass_analyzer: str,
@@ -218,8 +224,8 @@ def initialize_peaks(
     xl_pos: int = -1,
     fragmentation_method: str = "HCD",
     custom_mods: Optional[Dict[str, float]] = None,
-    add_neutral_losses: Optional[bool] = False
-) -> Tuple[List[dict], int, str, float]:
+    add_neutral_losses: Optional[bool] = False,
+) -> Tuple[List[dict], int, str, float, int]:
     """
     Generate theoretical peaks for a modified peptide sequence.
 
@@ -234,7 +240,8 @@ def initialize_peaks(
     :param fragmentation_method: fragmentation method that was used
     :param custom_mods: mapping of custom UNIMOD string identifiers ('[UNIMOD:xyz]') to their mass
     :param add_neutral_losses: Flag to indicate whether to annotate neutral losses or not
-    :return: List of theoretical peaks, Flag to indicate if there is a tmt on n-terminus, Un modified peptide sequence
+    :return: List of theoretical peaks, Flag to indicate if there is a tmt on n-terminus, Un modified peptide sequence,
+        number of expected nl peaks
     """
     _xl_sanity_check(noncl_xl, peptide_beta_mass, xl_pos)
 
@@ -262,7 +269,9 @@ def initialize_peaks(
 
     if add_neutral_losses:
         nl_b_ions, nl_y_ions = _get_neutral_losses(sequence, modification_deltas)
-        nl_ions = [nl_y_ions,nl_b_ions]
+        nl_ions = [nl_y_ions, nl_b_ions]
+
+    expected_nl_count = 0
     mass_arr = np.array([c.AA_MASSES[_] for _ in sequence])
     for pos, mod_mass in modification_deltas.items():
         mass_arr[pos] += mod_mass
@@ -297,7 +306,7 @@ def initialize_peaks(
                         "mass": ion_mzs[ion_type, number, charge],  # mz
                         "min_mass": min_mzs[ion_type, number, charge],  # min mz
                         "max_mass": max_mzs[ion_type, number, charge],  # max mz
-                        "neutral_loss": '',
+                        "neutral_loss": "",
                         "fragment_score": 100,
                     }
                 )
@@ -305,23 +314,22 @@ def initialize_peaks(
                     continue
                 for nl in nl_ions[ion_type][number]:
                     nl_score, nl_mass = _calculate_nl_score_mass(nl)
-                    ion_mass = sum_array[ion_type,number] - nl_mass
-                    ion_mz = (ion_mass + (charge+1) * c.PARTICLE_MASSES["PROTON"]) / (charge+1)
+                    ion_mass = sum_array[ion_type, number] - nl_mass
+                    ion_mz = (ion_mass + (charge + 1) * c.PARTICLE_MASSES["PROTON"]) / (charge + 1)
                     min_mz, max_mz = get_min_max_mass(mass_analyzer, ion_mz, mass_tolerance, unit_mass_tolerance)
-                    
+                    expected_nl_count += 1
                     fragments_meta_data.append(
-                    {
-                        "ion_type": ion_types[ion_type],  # ion type
-                        "no": number + 1,  # no
-                        "charge": charge + 1,  # charge
-                        "mass": ion_mz,  # mz
-                        "min_mass": min_mz,  # min mz
-                        "max_mass": max_mz,  # max mz
-                        "neutral_loss": nl,
-                        "fragment_score": 100- nl_score,
-                    }
-                ) 
-
+                        {
+                            "ion_type": ion_types[ion_type],  # ion type
+                            "no": number + 1,  # no
+                            "charge": charge + 1,  # charge
+                            "mass": ion_mz,  # mz
+                            "min_mass": min_mz,  # min mz
+                            "max_mass": max_mz,  # max mz
+                            "neutral_loss": nl,
+                            "fragment_score": 100 - nl_score,
+                        }
+                    )
 
     fragments_meta_data = sorted(fragments_meta_data, key=itemgetter("mass"))
 
@@ -330,6 +338,7 @@ def initialize_peaks(
         n_term_mod,
         sequence,
         (peptide_mass + c.ATOM_MASSES["O"] + 2 * c.ATOM_MASSES["H"]),
+        expected_nl_count,
     )
 
 
@@ -390,10 +399,10 @@ def initialize_peaks_xl(
         # the crosslinker is returned! This needs to be fixed, because mass is used as CALCULATED_MASS in
         # percolator!
 
-        list_out_s, tmt_n_term_s, peptide_sequence, _ = initialize_peaks(
+        list_out_s, tmt_n_term_s, peptide_sequence, _, _ = initialize_peaks(
             sequence_s, mass_analyzer, charge, mass_tolerance, unit_mass_tolerance, custom_mods=custom_mods
         )
-        list_out_l, tmt_n_term_l, peptide_sequence, _ = initialize_peaks(
+        list_out_l, tmt_n_term_l, peptide_sequence, _, _ = initialize_peaks(
             sequence_l, mass_analyzer, charge, mass_tolerance, unit_mass_tolerance, custom_mods=custom_mods
         )
 
@@ -428,7 +437,7 @@ def initialize_peaks_xl(
         sequence_mass = compute_peptide_mass(sequence_without_crosslinker)
         sequence_beta_mass = compute_peptide_mass(sequence_beta_without_crosslinker)
 
-        list_out, tmt_n_term, peptide_sequence, _ = initialize_peaks(
+        list_out, tmt_n_term, peptide_sequence, _, _ = initialize_peaks(
             sequence,
             mass_analyzer,
             charge,
diff --git a/spectrum_fundamentals/metrics/percolator.py b/spectrum_fundamentals/metrics/percolator.py
index 72f22ef..5ba9f73 100644
--- a/spectrum_fundamentals/metrics/percolator.py
+++ b/spectrum_fundamentals/metrics/percolator.py
@@ -58,6 +58,7 @@ def __init__(
         regression_method: str = "lowess",
         fdr_cutoff: float = 0.01,
         additional_columns: Optional[Union[str, list]] = None,
+        ptm_localization_flag: Optional[bool] = False,
     ):
         """Initialize a Percolator obj."""
         self.metadata = metadata
@@ -66,6 +67,7 @@ def __init__(
         self.additional_columns = additional_columns
         self.regression_method = regression_method
         self.fdr_cutoff = fdr_cutoff
+        self.ptm_localization_flag = ptm_localization_flag
         self.xl = "CROSSLINKER_TYPE" in self.metadata.columns
         self.base_columns = [
             "raw_file",
@@ -459,6 +461,9 @@ def calc(self):
             self.metrics_val = pd.concat(
                 [self.metrics_val, fragments_ratio.metrics_val, similarity.metrics_val], axis=1
             )
+            if self.ptm_localization_flag:
+                self.metrics_val["ANNOTATED_NL_COUNT"] = self.metadata["ANNOTATED_NL_COUNT"]
+                self.metrics_val["EXPECTED_NL_COUNT"] = self.metadata["EXPECTED_NL_COUNT"]
             if self.xl:
                 self.metrics_val["collision_energy_aligned"] = self.metadata["COLLISION_ENERGY"] / 100.0
             else:
diff --git a/spectrum_fundamentals/mod_string.py b/spectrum_fundamentals/mod_string.py
index 9715859..0e68f1a 100644
--- a/spectrum_fundamentals/mod_string.py
+++ b/spectrum_fundamentals/mod_string.py
@@ -351,15 +351,17 @@ def get_all_tokens(sequences: List[str]) -> Set[str]:
     return tokens
 
 
-def add_permutations(modified_sequence: str, unimod_id: int, residues: List[str], allow_one_less_modification: bool = False):
+def add_permutations(
+    modified_sequence: str, unimod_id: int, residues: List[str], allow_one_less_modification: bool = False
+):
     """
     Generate different peptide sequences with moving the modification to all possible residues.
 
     :param modified_sequence: Peptide sequence
     :param unimod_id: modification unimod id to be used for generating different permutations.
     :param residues: possible amino acids where this mod can exist
-    :param allow_one_less_modification: Flag to indicate if permutations with one less modification should be generated to check 
-                                whether the modification mass was mistakenly picked as the monoisotopic peak. Mainly used for Citrullination.
+    :param allow_one_less_modification: Flag to indicate if permutations with one less modification should be generated to check
+        whether the modification mass was mistakenly picked as the monoisotopic peak. Mainly used for Citrullination.
     :return: list of possible sequence permutations
     """
     sequence = modified_sequence.replace("[unimod:" + str(unimod_id) + "]", "")
@@ -371,7 +373,9 @@ def add_permutations(modified_sequence: str, unimod_id: int, residues: List[str]
     all_combinations = [list(each_permutation) for each_permutation in combinations(possible_positions, modifications)]
 
     if allow_one_less_modification:
-        all_combinations_1 = [list(each_permutation) for each_permutation in combinations(possible_positions, modifications-1)]
+        all_combinations_1 = [
+            list(each_permutation) for each_permutation in combinations(possible_positions, modifications - 1)
+        ]
         all_combinations = all_combinations + all_combinations_1
 
     modified_sequences_comb = []