Skip to content

Commit

Permalink
Add neutral losses annotation and features
Browse files Browse the repository at this point in the history
  • Loading branch information
WassimG committed Sep 20, 2024
1 parent 6e8ca39 commit 4091ed7
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 75 deletions.
46 changes: 26 additions & 20 deletions spectrum_fundamentals/annotation/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def match_peaks(
tmt_n_term: int,
unmod_sequence: str,
charge: int,
) -> List[Dict[str, Union[str, int, float]]]:
) -> Tuple[List[Dict[str, Union[str, int, float]]], int]:
"""
Matching experimental peaks with theoretical fragment ions.
Expand All @@ -36,6 +36,7 @@ def match_peaks(
temp_list = []
next_start_peak = 0
matched_peak = False
count_annotated_nl = 0
fragment_no: float
for fragment in fragments_meta_data:
min_mass = fragment["min_mass"]
Expand All @@ -56,27 +57,32 @@ def match_peaks(
if (
not (fragment["ion_type"][0] == "b" and fragment_no == 1)
or (unmod_sequence[0] == "R" or unmod_sequence[0] == "H" or unmod_sequence[0] == "K")
and (tmt_n_term == 1)
or (tmt_n_term == 2)
):
row_list.append(
{
"ion_type": fragment["ion_type"],
"no": fragment_no,
"charge": fragment["charge"],
"exp_mass": peak_mass,
"theoretical_mass": fragment["mass"],
"intensity": peak_intensity,
}
)
if peak_intensity > max_intensity:
max_intensity = float(peak_intensity)
# For now only counting neutral loss peaks this can change with different models later
if fragment["neutral_loss"] == "":
row_list.append(
{
"ion_type": fragment["ion_type"],
"no": fragment_no,
"charge": fragment["charge"],
"exp_mass": peak_mass,
"theoretical_mass": fragment["mass"],
"intensity": peak_intensity,
}
)
if peak_intensity > max_intensity:
max_intensity = float(peak_intensity)
else:
count_annotated_nl += 1

matched_peak = True
next_start_peak = start_peak
start_peak += 1
for row in row_list:
row["intensity"] = float(row["intensity"]) / max_intensity
temp_list.append(row)
return temp_list
return temp_list, count_annotated_nl


def handle_multiple_matches(
Expand Down Expand Up @@ -162,7 +168,7 @@ def annotate_spectra(
results_df = pd.DataFrame(raw_file_annotations)

if "CROSSLINKER_TYPE" not in index_columns:
results_df.columns = ["INTENSITIES", "MZ", "CALCULATED_MASS", "removed_peaks"]
results_df.columns = ["INTENSITIES", "MZ", "CALCULATED_MASS", "removed_peaks", "count_nl_peaks"]
else:
results_df.columns = [
"INTENSITIES_A",
Expand Down Expand Up @@ -414,7 +420,7 @@ def _annotate_linear_spectrum(
mod_seq_column = "MODIFIED_SEQUENCE"
if "MODIFIED_SEQUENCE_MSA" in index_columns:
mod_seq_column = "MODIFIED_SEQUENCE_MSA"
fragments_meta_data, tmt_n_term, unmod_sequence, calc_mass = initialize_peaks(
fragments_meta_data, tmt_n_term, unmod_sequence, calc_mass, _ = initialize_peaks(
sequence=spectrum[index_columns[mod_seq_column]],
mass_analyzer=spectrum[index_columns["MASS_ANALYZER"]],
charge=spectrum[index_columns["PRECURSOR_CHARGE"]],
Expand All @@ -423,7 +429,7 @@ def _annotate_linear_spectrum(
fragmentation_method=fragmentation_method,
custom_mods=custom_mods,
)
matched_peaks = match_peaks(
matched_peaks, count_annotated_nl = match_peaks(
fragments_meta_data,
spectrum[index_columns["INTENSITIES"]],
spectrum[index_columns["MZ"]],
Expand All @@ -445,7 +451,7 @@ def _annotate_linear_spectrum(
intensities, mass = generate_annotation_matrix(
matched_peaks, unmod_sequence, spectrum[index_columns["PRECURSOR_CHARGE"]], fragmentation_method
)
return intensities, mass, calc_mass, removed_peaks
return intensities, mass, calc_mass, removed_peaks, count_annotated_nl


def _annotate_crosslinked_spectrum(
Expand Down Expand Up @@ -493,7 +499,7 @@ def _xl_annotation_workflow(seq_id: str, non_cl_xl: bool):
array_size = 348
inputs.append(custom_mods)
fragments_meta_data, tmt_n_term, unmod_sequence, calc_mass = initialize_peaks_xl(*inputs)
matched_peaks = match_peaks(
matched_peaks, annotated_nl = match_peaks(
fragments_meta_data,
np.array(spectrum[index_columns["INTENSITIES"]]),
np.array(spectrum[index_columns["MZ"]]), # Convert to numpy array
Expand Down
111 changes: 60 additions & 51 deletions spectrum_fundamentals/fragments.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,18 +130,18 @@ def get_ion_delta(ion_types: List[str]) -> np.ndarray:
return np.array([c.ION_DELTAS[ion_type] for ion_type in ion_types]).reshape(len(ion_types), 1)


def _add_nl(neutral_losses, nl_dict, start_aa_index, end_aa_index):
def _add_nl(neutral_losses: List[str], nl_dict: dict, start_aa_index: int, end_aa_index: int):
"""
Adds neutral losses (NL) to a dictionary of neutral losses for specific amino acid indices.
This function updates the `nl_dict` by incorporating the provided `neutral_losses` into
the amino acid indices between `start_aa_index` and `end_aa_index`.
This function updates the `nl_dict` by incorporating the provided `neutral_losses` into
the amino acid indices between `start_aa_index` and `end_aa_index`.
:param list neutral_losses: A list of neutral losses to be added to the amino acids.
:param dict nl_dict: A dictionary where the keys are amino acid indices and the values are lists of neutral
:param neutral_losses: A list of neutral losses to be added to the amino acids.
:param nl_dict: A dictionary where the keys are amino acid indices and the values are lists of neutral
losses associated with each index.
:param int start_aa_index: The starting index of the amino acid range to which the neutral losses should be added.
:param int end_aa_index: The ending index of the amino acid range to which the neutral losses should be added.
:param start_aa_index: The starting index of the amino acid range to which the neutral losses should be added.
:param end_aa_index: The ending index of the amino acid range to which the neutral losses should be added.
:returns: Updated dictionary with the added neutral losses for the specified amino acid indices.
"""
first_nl = True
Expand All @@ -160,53 +160,59 @@ def _add_nl(neutral_losses, nl_dict, start_aa_index, end_aa_index):
def _get_neutral_losses(peptide_sequence, modifications):
"""
Get possible neutral losses and position in a peptide sequence.
:param peptide_sequence: Unmodified peptide sequence
:modifications: modifications dict generated by _get_modifications from modified petide sequence.
:param modifications: modifications dict generated by _get_modifications from modified petide sequence.
:return: Dict with neutral losses position as an ID and composition as its value.
"""
sequence_length = len(peptide_sequence)
keys = range(0, sequence_length - 1)

NL_b_ions = dict([(key, []) for key in keys])
NL_y_ions = dict([(key, []) for key in keys])
nl_b_ions = {key: [] for key in keys}
nl_y_ions = {key: [] for key in keys}

for i in range(0, sequence_length):
aa = peptide_sequence[i]
if aa in c.AA_Neutral_losses:
if i in modifications:
if aa == 'M' and modifications[i] == 15.9949146:
NL_b_ions = _add_nl(c.AA_Neutral_losses['M[UNIMOD:35]'], NL_b_ions, i, sequence_length - 1)
NL_y_ions = _add_nl(c.AA_Neutral_losses['M[UNIMOD:35]'], NL_y_ions, sequence_length - i - 1,
sequence_length - 1)
elif aa == 'R' and modifications[i] == 0.984016:
NL_b_ions = _add_nl(c.Mod_Neutral_losses['R[UNIMOD:7]'], NL_b_ions, i, sequence_length - 1)
NL_y_ions = _add_nl(c.Mod_Neutral_losses['R[UNIMOD:7]'], NL_y_ions, sequence_length - i - 1,
sequence_length - 1)
elif (aa == 'S' or aa=='T') and modifications[i] == 79.9663:
NL_b_ions = _add_nl(c.Mod_Neutral_losses['R[UNIMOD:7]'], NL_b_ions, i, sequence_length - 1)
NL_y_ions = _add_nl(c.Mod_Neutral_losses['R[UNIMOD:7]'], NL_y_ions, sequence_length - i - 1,
sequence_length - 1)
if aa == "M" and modifications[i] == 15.9949146:
nl_b_ions = _add_nl(c.AA_Neutral_losses["M[UNIMOD:35]"], nl_b_ions, i, sequence_length - 1)
nl_y_ions = _add_nl(
c.AA_Neutral_losses["M[UNIMOD:35]"], nl_y_ions, sequence_length - i - 1, sequence_length - 1
)
elif aa == "R" and modifications[i] == 0.984016:
nl_b_ions = _add_nl(c.Mod_Neutral_losses["R[UNIMOD:7]"], nl_b_ions, i, sequence_length - 1)
nl_y_ions = _add_nl(
c.Mod_Neutral_losses["R[UNIMOD:7]"], nl_y_ions, sequence_length - i - 1, sequence_length - 1
)
elif (aa == "S" or aa == "T") and modifications[i] == 79.9663:
nl_b_ions = _add_nl(c.Mod_Neutral_losses["R[UNIMOD:7]"], nl_b_ions, i, sequence_length - 1)
nl_y_ions = _add_nl(
c.Mod_Neutral_losses["R[UNIMOD:7]"], nl_y_ions, sequence_length - i - 1, sequence_length - 1
)
else:
NL_b_ions = _add_nl(c.AA_Neutral_losses[aa], NL_b_ions, i, sequence_length - 1)
NL_y_ions = _add_nl(c.AA_Neutral_losses[aa], NL_y_ions, sequence_length - i - 1, sequence_length - 1)
return NL_b_ions, NL_y_ions
nl_b_ions = _add_nl(c.AA_Neutral_losses[aa], nl_b_ions, i, sequence_length - 1)
nl_y_ions = _add_nl(c.AA_Neutral_losses[aa], nl_y_ions, sequence_length - i - 1, sequence_length - 1)
return nl_b_ions, nl_y_ions


def _calculate_nl_score_mass(neutral_loss):
"""
Calculates the score and mass for a given neutral loss (NL).
:param str neutral_loss: The type of neutral loss for which to calculate the score and mass.
This should be a key present in the `Neutral_losses_Mass` dictionary.
:param neutral_loss: The type of neutral loss for which to calculate the score and mass.
:returns: A tuple containing the adjusted score and the mass of the specified neutral loss.
"""
score = 100
mass = 0
mass = c.Neutral_losses_Mass[neutral_loss]
if neutral_loss == 'H2O' or neutral_loss == 'NH3':
if neutral_loss == "H2O" or neutral_loss == "NH3":
score -= 5
else:
score -= 30
return score, mass


def initialize_peaks(
sequence: str,
mass_analyzer: str,
Expand All @@ -218,8 +224,8 @@ def initialize_peaks(
xl_pos: int = -1,
fragmentation_method: str = "HCD",
custom_mods: Optional[Dict[str, float]] = None,
add_neutral_losses: Optional[bool] = False
) -> Tuple[List[dict], int, str, float]:
add_neutral_losses: Optional[bool] = False,
) -> Tuple[List[dict], int, str, float, int]:
"""
Generate theoretical peaks for a modified peptide sequence.
Expand All @@ -234,7 +240,8 @@ def initialize_peaks(
:param fragmentation_method: fragmentation method that was used
:param custom_mods: mapping of custom UNIMOD string identifiers ('[UNIMOD:xyz]') to their mass
:param add_neutral_losses: Flag to indicate whether to annotate neutral losses or not
:return: List of theoretical peaks, Flag to indicate if there is a tmt on n-terminus, Un modified peptide sequence
:return: List of theoretical peaks, Flag to indicate if there is a tmt on n-terminus, Un modified peptide sequence,
number of expected nl peaks
"""
_xl_sanity_check(noncl_xl, peptide_beta_mass, xl_pos)

Expand Down Expand Up @@ -262,7 +269,9 @@ def initialize_peaks(

if add_neutral_losses:
nl_b_ions, nl_y_ions = _get_neutral_losses(sequence, modification_deltas)
nl_ions = [nl_y_ions,nl_b_ions]
nl_ions = [nl_y_ions, nl_b_ions]

expected_nl_count = 0
mass_arr = np.array([c.AA_MASSES[_] for _ in sequence])
for pos, mod_mass in modification_deltas.items():
mass_arr[pos] += mod_mass
Expand Down Expand Up @@ -297,31 +306,30 @@ def initialize_peaks(
"mass": ion_mzs[ion_type, number, charge], # mz
"min_mass": min_mzs[ion_type, number, charge], # min mz
"max_mass": max_mzs[ion_type, number, charge], # max mz
"neutral_loss": '',
"neutral_loss": "",
"fragment_score": 100,
}
)
if not add_neutral_losses:
continue
for nl in nl_ions[ion_type][number]:
nl_score, nl_mass = _calculate_nl_score_mass(nl)
ion_mass = sum_array[ion_type,number] - nl_mass
ion_mz = (ion_mass + (charge+1) * c.PARTICLE_MASSES["PROTON"]) / (charge+1)
ion_mass = sum_array[ion_type, number] - nl_mass
ion_mz = (ion_mass + (charge + 1) * c.PARTICLE_MASSES["PROTON"]) / (charge + 1)
min_mz, max_mz = get_min_max_mass(mass_analyzer, ion_mz, mass_tolerance, unit_mass_tolerance)

expected_nl_count += 1
fragments_meta_data.append(
{
"ion_type": ion_types[ion_type], # ion type
"no": number + 1, # no
"charge": charge + 1, # charge
"mass": ion_mz, # mz
"min_mass": min_mz, # min mz
"max_mass": max_mz, # max mz
"neutral_loss": nl,
"fragment_score": 100- nl_score,
}
)

{
"ion_type": ion_types[ion_type], # ion type
"no": number + 1, # no
"charge": charge + 1, # charge
"mass": ion_mz, # mz
"min_mass": min_mz, # min mz
"max_mass": max_mz, # max mz
"neutral_loss": nl,
"fragment_score": 100 - nl_score,
}
)

fragments_meta_data = sorted(fragments_meta_data, key=itemgetter("mass"))

Expand All @@ -330,6 +338,7 @@ def initialize_peaks(
n_term_mod,
sequence,
(peptide_mass + c.ATOM_MASSES["O"] + 2 * c.ATOM_MASSES["H"]),
expected_nl_count,
)


Expand Down Expand Up @@ -390,10 +399,10 @@ def initialize_peaks_xl(
# the crosslinker is returned! This needs to be fixed, because mass is used as CALCULATED_MASS in
# percolator!

list_out_s, tmt_n_term_s, peptide_sequence, _ = initialize_peaks(
list_out_s, tmt_n_term_s, peptide_sequence, _, _ = initialize_peaks(
sequence_s, mass_analyzer, charge, mass_tolerance, unit_mass_tolerance, custom_mods=custom_mods
)
list_out_l, tmt_n_term_l, peptide_sequence, _ = initialize_peaks(
list_out_l, tmt_n_term_l, peptide_sequence, _, _ = initialize_peaks(
sequence_l, mass_analyzer, charge, mass_tolerance, unit_mass_tolerance, custom_mods=custom_mods
)

Expand Down Expand Up @@ -428,7 +437,7 @@ def initialize_peaks_xl(
sequence_mass = compute_peptide_mass(sequence_without_crosslinker)
sequence_beta_mass = compute_peptide_mass(sequence_beta_without_crosslinker)

list_out, tmt_n_term, peptide_sequence, _ = initialize_peaks(
list_out, tmt_n_term, peptide_sequence, _, _ = initialize_peaks(
sequence,
mass_analyzer,
charge,
Expand Down
5 changes: 5 additions & 0 deletions spectrum_fundamentals/metrics/percolator.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ def __init__(
regression_method: str = "lowess",
fdr_cutoff: float = 0.01,
additional_columns: Optional[Union[str, list]] = None,
ptm_localization_flag: Optional[bool] = False,
):
"""Initialize a Percolator obj."""
self.metadata = metadata
Expand All @@ -66,6 +67,7 @@ def __init__(
self.additional_columns = additional_columns
self.regression_method = regression_method
self.fdr_cutoff = fdr_cutoff
self.ptm_localization_flag = ptm_localization_flag
self.xl = "CROSSLINKER_TYPE" in self.metadata.columns
self.base_columns = [
"raw_file",
Expand Down Expand Up @@ -459,6 +461,9 @@ def calc(self):
self.metrics_val = pd.concat(
[self.metrics_val, fragments_ratio.metrics_val, similarity.metrics_val], axis=1
)
if self.ptm_localization_flag:
self.metrics_val["ANNOTATED_NL_COUNT"] = self.metadata["ANNOTATED_NL_COUNT"]
self.metrics_val["EXPECTED_NL_COUNT"] = self.metadata["EXPECTED_NL_COUNT"]
if self.xl:
self.metrics_val["collision_energy_aligned"] = self.metadata["COLLISION_ENERGY"] / 100.0
else:
Expand Down
12 changes: 8 additions & 4 deletions spectrum_fundamentals/mod_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,15 +351,17 @@ def get_all_tokens(sequences: List[str]) -> Set[str]:
return tokens


def add_permutations(modified_sequence: str, unimod_id: int, residues: List[str], allow_one_less_modification: bool = False):
def add_permutations(
modified_sequence: str, unimod_id: int, residues: List[str], allow_one_less_modification: bool = False
):
"""
Generate different peptide sequences with moving the modification to all possible residues.
:param modified_sequence: Peptide sequence
:param unimod_id: modification unimod id to be used for generating different permutations.
:param residues: possible amino acids where this mod can exist
:param allow_one_less_modification: Flag to indicate if permutations with one less modification should be generated to check
whether the modification mass was mistakenly picked as the monoisotopic peak. Mainly used for Citrullination.
:param allow_one_less_modification: Flag to indicate if permutations with one less modification should be generated to check
whether the modification mass was mistakenly picked as the monoisotopic peak. Mainly used for Citrullination.
:return: list of possible sequence permutations
"""
sequence = modified_sequence.replace("[unimod:" + str(unimod_id) + "]", "")
Expand All @@ -371,7 +373,9 @@ def add_permutations(modified_sequence: str, unimod_id: int, residues: List[str]
all_combinations = [list(each_permutation) for each_permutation in combinations(possible_positions, modifications)]

if allow_one_less_modification:
all_combinations_1 = [list(each_permutation) for each_permutation in combinations(possible_positions, modifications-1)]
all_combinations_1 = [
list(each_permutation) for each_permutation in combinations(possible_positions, modifications - 1)
]
all_combinations = all_combinations + all_combinations_1

modified_sequences_comb = []
Expand Down

0 comments on commit 4091ed7

Please sign in to comment.