From 4091ed77640ab3e61422237ed70f7986c9377f6a Mon Sep 17 00:00:00 2001 From: WassimG Date: Fri, 20 Sep 2024 15:00:31 +0000 Subject: [PATCH] Add neutral losses annotation and features --- .../annotation/annotation.py | 46 ++++---- spectrum_fundamentals/fragments.py | 111 ++++++++++-------- spectrum_fundamentals/metrics/percolator.py | 5 + spectrum_fundamentals/mod_string.py | 12 +- 4 files changed, 99 insertions(+), 75 deletions(-) diff --git a/spectrum_fundamentals/annotation/annotation.py b/spectrum_fundamentals/annotation/annotation.py index 1d42d31..750f664 100644 --- a/spectrum_fundamentals/annotation/annotation.py +++ b/spectrum_fundamentals/annotation/annotation.py @@ -17,7 +17,7 @@ def match_peaks( tmt_n_term: int, unmod_sequence: str, charge: int, -) -> List[Dict[str, Union[str, int, float]]]: +) -> Tuple[List[Dict[str, Union[str, int, float]]], int]: """ Matching experimental peaks with theoretical fragment ions. @@ -36,6 +36,7 @@ def match_peaks( temp_list = [] next_start_peak = 0 matched_peak = False + count_annotated_nl = 0 fragment_no: float for fragment in fragments_meta_data: min_mass = fragment["min_mass"] @@ -56,27 +57,32 @@ def match_peaks( if ( not (fragment["ion_type"][0] == "b" and fragment_no == 1) or (unmod_sequence[0] == "R" or unmod_sequence[0] == "H" or unmod_sequence[0] == "K") - and (tmt_n_term == 1) + or (tmt_n_term == 2) ): - row_list.append( - { - "ion_type": fragment["ion_type"], - "no": fragment_no, - "charge": fragment["charge"], - "exp_mass": peak_mass, - "theoretical_mass": fragment["mass"], - "intensity": peak_intensity, - } - ) - if peak_intensity > max_intensity: - max_intensity = float(peak_intensity) + # For now only counting neutral loss peaks this can change with different models later + if fragment["neutral_loss"] == "": + row_list.append( + { + "ion_type": fragment["ion_type"], + "no": fragment_no, + "charge": fragment["charge"], + "exp_mass": peak_mass, + "theoretical_mass": fragment["mass"], + "intensity": peak_intensity, + } + ) + if peak_intensity > max_intensity: + max_intensity = float(peak_intensity) + else: + count_annotated_nl += 1 + matched_peak = True next_start_peak = start_peak start_peak += 1 for row in row_list: row["intensity"] = float(row["intensity"]) / max_intensity temp_list.append(row) - return temp_list + return temp_list, count_annotated_nl def handle_multiple_matches( @@ -162,7 +168,7 @@ def annotate_spectra( results_df = pd.DataFrame(raw_file_annotations) if "CROSSLINKER_TYPE" not in index_columns: - results_df.columns = ["INTENSITIES", "MZ", "CALCULATED_MASS", "removed_peaks"] + results_df.columns = ["INTENSITIES", "MZ", "CALCULATED_MASS", "removed_peaks", "count_nl_peaks"] else: results_df.columns = [ "INTENSITIES_A", @@ -414,7 +420,7 @@ def _annotate_linear_spectrum( mod_seq_column = "MODIFIED_SEQUENCE" if "MODIFIED_SEQUENCE_MSA" in index_columns: mod_seq_column = "MODIFIED_SEQUENCE_MSA" - fragments_meta_data, tmt_n_term, unmod_sequence, calc_mass = initialize_peaks( + fragments_meta_data, tmt_n_term, unmod_sequence, calc_mass, _ = initialize_peaks( sequence=spectrum[index_columns[mod_seq_column]], mass_analyzer=spectrum[index_columns["MASS_ANALYZER"]], charge=spectrum[index_columns["PRECURSOR_CHARGE"]], @@ -423,7 +429,7 @@ def _annotate_linear_spectrum( fragmentation_method=fragmentation_method, custom_mods=custom_mods, ) - matched_peaks = match_peaks( + matched_peaks, count_annotated_nl = match_peaks( fragments_meta_data, spectrum[index_columns["INTENSITIES"]], spectrum[index_columns["MZ"]], @@ -445,7 +451,7 @@ def _annotate_linear_spectrum( intensities, mass = generate_annotation_matrix( matched_peaks, unmod_sequence, spectrum[index_columns["PRECURSOR_CHARGE"]], fragmentation_method ) - return intensities, mass, calc_mass, removed_peaks + return intensities, mass, calc_mass, removed_peaks, count_annotated_nl def _annotate_crosslinked_spectrum( @@ -493,7 +499,7 @@ def _xl_annotation_workflow(seq_id: str, non_cl_xl: bool): array_size = 348 inputs.append(custom_mods) fragments_meta_data, tmt_n_term, unmod_sequence, calc_mass = initialize_peaks_xl(*inputs) - matched_peaks = match_peaks( + matched_peaks, annotated_nl = match_peaks( fragments_meta_data, np.array(spectrum[index_columns["INTENSITIES"]]), np.array(spectrum[index_columns["MZ"]]), # Convert to numpy array diff --git a/spectrum_fundamentals/fragments.py b/spectrum_fundamentals/fragments.py index 7be9e41..85c3cff 100644 --- a/spectrum_fundamentals/fragments.py +++ b/spectrum_fundamentals/fragments.py @@ -130,18 +130,18 @@ def get_ion_delta(ion_types: List[str]) -> np.ndarray: return np.array([c.ION_DELTAS[ion_type] for ion_type in ion_types]).reshape(len(ion_types), 1) -def _add_nl(neutral_losses, nl_dict, start_aa_index, end_aa_index): +def _add_nl(neutral_losses: List[str], nl_dict: dict, start_aa_index: int, end_aa_index: int): """ Adds neutral losses (NL) to a dictionary of neutral losses for specific amino acid indices. - This function updates the `nl_dict` by incorporating the provided `neutral_losses` into - the amino acid indices between `start_aa_index` and `end_aa_index`. + This function updates the `nl_dict` by incorporating the provided `neutral_losses` into + the amino acid indices between `start_aa_index` and `end_aa_index`. - :param list neutral_losses: A list of neutral losses to be added to the amino acids. - :param dict nl_dict: A dictionary where the keys are amino acid indices and the values are lists of neutral + :param neutral_losses: A list of neutral losses to be added to the amino acids. + :param nl_dict: A dictionary where the keys are amino acid indices and the values are lists of neutral losses associated with each index. - :param int start_aa_index: The starting index of the amino acid range to which the neutral losses should be added. - :param int end_aa_index: The ending index of the amino acid range to which the neutral losses should be added. + :param start_aa_index: The starting index of the amino acid range to which the neutral losses should be added. + :param end_aa_index: The ending index of the amino acid range to which the neutral losses should be added. :returns: Updated dictionary with the added neutral losses for the specified amino acid indices. """ first_nl = True @@ -160,53 +160,59 @@ def _add_nl(neutral_losses, nl_dict, start_aa_index, end_aa_index): def _get_neutral_losses(peptide_sequence, modifications): """ Get possible neutral losses and position in a peptide sequence. + :param peptide_sequence: Unmodified peptide sequence - :modifications: modifications dict generated by _get_modifications from modified petide sequence. + :param modifications: modifications dict generated by _get_modifications from modified petide sequence. :return: Dict with neutral losses position as an ID and composition as its value. """ sequence_length = len(peptide_sequence) keys = range(0, sequence_length - 1) - NL_b_ions = dict([(key, []) for key in keys]) - NL_y_ions = dict([(key, []) for key in keys]) + nl_b_ions = {key: [] for key in keys} + nl_y_ions = {key: [] for key in keys} for i in range(0, sequence_length): aa = peptide_sequence[i] if aa in c.AA_Neutral_losses: if i in modifications: - if aa == 'M' and modifications[i] == 15.9949146: - NL_b_ions = _add_nl(c.AA_Neutral_losses['M[UNIMOD:35]'], NL_b_ions, i, sequence_length - 1) - NL_y_ions = _add_nl(c.AA_Neutral_losses['M[UNIMOD:35]'], NL_y_ions, sequence_length - i - 1, - sequence_length - 1) - elif aa == 'R' and modifications[i] == 0.984016: - NL_b_ions = _add_nl(c.Mod_Neutral_losses['R[UNIMOD:7]'], NL_b_ions, i, sequence_length - 1) - NL_y_ions = _add_nl(c.Mod_Neutral_losses['R[UNIMOD:7]'], NL_y_ions, sequence_length - i - 1, - sequence_length - 1) - elif (aa == 'S' or aa=='T') and modifications[i] == 79.9663: - NL_b_ions = _add_nl(c.Mod_Neutral_losses['R[UNIMOD:7]'], NL_b_ions, i, sequence_length - 1) - NL_y_ions = _add_nl(c.Mod_Neutral_losses['R[UNIMOD:7]'], NL_y_ions, sequence_length - i - 1, - sequence_length - 1) + if aa == "M" and modifications[i] == 15.9949146: + nl_b_ions = _add_nl(c.AA_Neutral_losses["M[UNIMOD:35]"], nl_b_ions, i, sequence_length - 1) + nl_y_ions = _add_nl( + c.AA_Neutral_losses["M[UNIMOD:35]"], nl_y_ions, sequence_length - i - 1, sequence_length - 1 + ) + elif aa == "R" and modifications[i] == 0.984016: + nl_b_ions = _add_nl(c.Mod_Neutral_losses["R[UNIMOD:7]"], nl_b_ions, i, sequence_length - 1) + nl_y_ions = _add_nl( + c.Mod_Neutral_losses["R[UNIMOD:7]"], nl_y_ions, sequence_length - i - 1, sequence_length - 1 + ) + elif (aa == "S" or aa == "T") and modifications[i] == 79.9663: + nl_b_ions = _add_nl(c.Mod_Neutral_losses["R[UNIMOD:7]"], nl_b_ions, i, sequence_length - 1) + nl_y_ions = _add_nl( + c.Mod_Neutral_losses["R[UNIMOD:7]"], nl_y_ions, sequence_length - i - 1, sequence_length - 1 + ) else: - NL_b_ions = _add_nl(c.AA_Neutral_losses[aa], NL_b_ions, i, sequence_length - 1) - NL_y_ions = _add_nl(c.AA_Neutral_losses[aa], NL_y_ions, sequence_length - i - 1, sequence_length - 1) - return NL_b_ions, NL_y_ions + nl_b_ions = _add_nl(c.AA_Neutral_losses[aa], nl_b_ions, i, sequence_length - 1) + nl_y_ions = _add_nl(c.AA_Neutral_losses[aa], nl_y_ions, sequence_length - i - 1, sequence_length - 1) + return nl_b_ions, nl_y_ions + def _calculate_nl_score_mass(neutral_loss): """ Calculates the score and mass for a given neutral loss (NL). - :param str neutral_loss: The type of neutral loss for which to calculate the score and mass. - This should be a key present in the `Neutral_losses_Mass` dictionary. + + :param neutral_loss: The type of neutral loss for which to calculate the score and mass. :returns: A tuple containing the adjusted score and the mass of the specified neutral loss. """ score = 100 mass = 0 mass = c.Neutral_losses_Mass[neutral_loss] - if neutral_loss == 'H2O' or neutral_loss == 'NH3': + if neutral_loss == "H2O" or neutral_loss == "NH3": score -= 5 else: score -= 30 return score, mass + def initialize_peaks( sequence: str, mass_analyzer: str, @@ -218,8 +224,8 @@ def initialize_peaks( xl_pos: int = -1, fragmentation_method: str = "HCD", custom_mods: Optional[Dict[str, float]] = None, - add_neutral_losses: Optional[bool] = False -) -> Tuple[List[dict], int, str, float]: + add_neutral_losses: Optional[bool] = False, +) -> Tuple[List[dict], int, str, float, int]: """ Generate theoretical peaks for a modified peptide sequence. @@ -234,7 +240,8 @@ def initialize_peaks( :param fragmentation_method: fragmentation method that was used :param custom_mods: mapping of custom UNIMOD string identifiers ('[UNIMOD:xyz]') to their mass :param add_neutral_losses: Flag to indicate whether to annotate neutral losses or not - :return: List of theoretical peaks, Flag to indicate if there is a tmt on n-terminus, Un modified peptide sequence + :return: List of theoretical peaks, Flag to indicate if there is a tmt on n-terminus, Un modified peptide sequence, + number of expected nl peaks """ _xl_sanity_check(noncl_xl, peptide_beta_mass, xl_pos) @@ -262,7 +269,9 @@ def initialize_peaks( if add_neutral_losses: nl_b_ions, nl_y_ions = _get_neutral_losses(sequence, modification_deltas) - nl_ions = [nl_y_ions,nl_b_ions] + nl_ions = [nl_y_ions, nl_b_ions] + + expected_nl_count = 0 mass_arr = np.array([c.AA_MASSES[_] for _ in sequence]) for pos, mod_mass in modification_deltas.items(): mass_arr[pos] += mod_mass @@ -297,7 +306,7 @@ def initialize_peaks( "mass": ion_mzs[ion_type, number, charge], # mz "min_mass": min_mzs[ion_type, number, charge], # min mz "max_mass": max_mzs[ion_type, number, charge], # max mz - "neutral_loss": '', + "neutral_loss": "", "fragment_score": 100, } ) @@ -305,23 +314,22 @@ def initialize_peaks( continue for nl in nl_ions[ion_type][number]: nl_score, nl_mass = _calculate_nl_score_mass(nl) - ion_mass = sum_array[ion_type,number] - nl_mass - ion_mz = (ion_mass + (charge+1) * c.PARTICLE_MASSES["PROTON"]) / (charge+1) + ion_mass = sum_array[ion_type, number] - nl_mass + ion_mz = (ion_mass + (charge + 1) * c.PARTICLE_MASSES["PROTON"]) / (charge + 1) min_mz, max_mz = get_min_max_mass(mass_analyzer, ion_mz, mass_tolerance, unit_mass_tolerance) - + expected_nl_count += 1 fragments_meta_data.append( - { - "ion_type": ion_types[ion_type], # ion type - "no": number + 1, # no - "charge": charge + 1, # charge - "mass": ion_mz, # mz - "min_mass": min_mz, # min mz - "max_mass": max_mz, # max mz - "neutral_loss": nl, - "fragment_score": 100- nl_score, - } - ) - + { + "ion_type": ion_types[ion_type], # ion type + "no": number + 1, # no + "charge": charge + 1, # charge + "mass": ion_mz, # mz + "min_mass": min_mz, # min mz + "max_mass": max_mz, # max mz + "neutral_loss": nl, + "fragment_score": 100 - nl_score, + } + ) fragments_meta_data = sorted(fragments_meta_data, key=itemgetter("mass")) @@ -330,6 +338,7 @@ def initialize_peaks( n_term_mod, sequence, (peptide_mass + c.ATOM_MASSES["O"] + 2 * c.ATOM_MASSES["H"]), + expected_nl_count, ) @@ -390,10 +399,10 @@ def initialize_peaks_xl( # the crosslinker is returned! This needs to be fixed, because mass is used as CALCULATED_MASS in # percolator! - list_out_s, tmt_n_term_s, peptide_sequence, _ = initialize_peaks( + list_out_s, tmt_n_term_s, peptide_sequence, _, _ = initialize_peaks( sequence_s, mass_analyzer, charge, mass_tolerance, unit_mass_tolerance, custom_mods=custom_mods ) - list_out_l, tmt_n_term_l, peptide_sequence, _ = initialize_peaks( + list_out_l, tmt_n_term_l, peptide_sequence, _, _ = initialize_peaks( sequence_l, mass_analyzer, charge, mass_tolerance, unit_mass_tolerance, custom_mods=custom_mods ) @@ -428,7 +437,7 @@ def initialize_peaks_xl( sequence_mass = compute_peptide_mass(sequence_without_crosslinker) sequence_beta_mass = compute_peptide_mass(sequence_beta_without_crosslinker) - list_out, tmt_n_term, peptide_sequence, _ = initialize_peaks( + list_out, tmt_n_term, peptide_sequence, _, _ = initialize_peaks( sequence, mass_analyzer, charge, diff --git a/spectrum_fundamentals/metrics/percolator.py b/spectrum_fundamentals/metrics/percolator.py index 72f22ef..5ba9f73 100644 --- a/spectrum_fundamentals/metrics/percolator.py +++ b/spectrum_fundamentals/metrics/percolator.py @@ -58,6 +58,7 @@ def __init__( regression_method: str = "lowess", fdr_cutoff: float = 0.01, additional_columns: Optional[Union[str, list]] = None, + ptm_localization_flag: Optional[bool] = False, ): """Initialize a Percolator obj.""" self.metadata = metadata @@ -66,6 +67,7 @@ def __init__( self.additional_columns = additional_columns self.regression_method = regression_method self.fdr_cutoff = fdr_cutoff + self.ptm_localization_flag = ptm_localization_flag self.xl = "CROSSLINKER_TYPE" in self.metadata.columns self.base_columns = [ "raw_file", @@ -459,6 +461,9 @@ def calc(self): self.metrics_val = pd.concat( [self.metrics_val, fragments_ratio.metrics_val, similarity.metrics_val], axis=1 ) + if self.ptm_localization_flag: + self.metrics_val["ANNOTATED_NL_COUNT"] = self.metadata["ANNOTATED_NL_COUNT"] + self.metrics_val["EXPECTED_NL_COUNT"] = self.metadata["EXPECTED_NL_COUNT"] if self.xl: self.metrics_val["collision_energy_aligned"] = self.metadata["COLLISION_ENERGY"] / 100.0 else: diff --git a/spectrum_fundamentals/mod_string.py b/spectrum_fundamentals/mod_string.py index 9715859..0e68f1a 100644 --- a/spectrum_fundamentals/mod_string.py +++ b/spectrum_fundamentals/mod_string.py @@ -351,15 +351,17 @@ def get_all_tokens(sequences: List[str]) -> Set[str]: return tokens -def add_permutations(modified_sequence: str, unimod_id: int, residues: List[str], allow_one_less_modification: bool = False): +def add_permutations( + modified_sequence: str, unimod_id: int, residues: List[str], allow_one_less_modification: bool = False +): """ Generate different peptide sequences with moving the modification to all possible residues. :param modified_sequence: Peptide sequence :param unimod_id: modification unimod id to be used for generating different permutations. :param residues: possible amino acids where this mod can exist - :param allow_one_less_modification: Flag to indicate if permutations with one less modification should be generated to check - whether the modification mass was mistakenly picked as the monoisotopic peak. Mainly used for Citrullination. + :param allow_one_less_modification: Flag to indicate if permutations with one less modification should be generated to check + whether the modification mass was mistakenly picked as the monoisotopic peak. Mainly used for Citrullination. :return: list of possible sequence permutations """ sequence = modified_sequence.replace("[unimod:" + str(unimod_id) + "]", "") @@ -371,7 +373,9 @@ def add_permutations(modified_sequence: str, unimod_id: int, residues: List[str] all_combinations = [list(each_permutation) for each_permutation in combinations(possible_positions, modifications)] if allow_one_less_modification: - all_combinations_1 = [list(each_permutation) for each_permutation in combinations(possible_positions, modifications-1)] + all_combinations_1 = [ + list(each_permutation) for each_permutation in combinations(possible_positions, modifications - 1) + ] all_combinations = all_combinations + all_combinations_1 modified_sequences_comb = []