From ca09c0bbd41f812763996fa6e2dad1e8bd58b13d Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Mon, 5 Aug 2024 16:52:39 +0200 Subject: [PATCH 1/2] added new internal_to_msp function --- docs/reference.rst | 2 +- spectrum_fundamentals/mod_string.py | 39 ++++++++++++++++++++++++++--- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/docs/reference.rst b/docs/reference.rst index ff7ad07..0323c01 100644 --- a/docs/reference.rst +++ b/docs/reference.rst @@ -1,7 +1,7 @@ How to cite =========== -If you want to cite spectrum_fundamentals in your work, please cite the main Oktoberfest publication following: +If you want to cite spectrum_fundamentals in your work, please cite the main Oktoberfest publication: .. [1] Picciani M, Gabriel W, Giurcoiu VG et al. (2023), *Oktoberfest: Open-source spectral library generation and rescoring pipeline based on Prosit*, diff --git a/spectrum_fundamentals/mod_string.py b/spectrum_fundamentals/mod_string.py index 4c14f21..e3cec21 100644 --- a/spectrum_fundamentals/mod_string.py +++ b/spectrum_fundamentals/mod_string.py @@ -200,9 +200,7 @@ def internal_without_mods(sequences: List[str]) -> List[str]: return [re.sub(regex, "", seq) for seq in sequences] -def internal_to_mod_mass( - sequences: List[str], custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None -) -> List[str]: +def internal_to_mod_mass(sequences: List[str], custom_mods: Optional[Dict[str, float]] = None) -> List[str]: """ Function to exchange the internal mod identifiers with the masses of the specific modifiction. @@ -217,6 +215,41 @@ def internal_to_mod_mass( return [regex.sub(replacement_func, seq) for seq in sequences] +def internal_to_msp( + sequences: Union[List[str], pd.Series], + mods: Dict[str, str], +) -> List[Tuple[str, str]]: + """ + Function to translate an internal modstring to modstring and Mods for MSP format. + + :param sequences: sequences to translate + :param mods: dictionary mapping from internal unimod format (keys) to MSP format (values). + :return: a tuple for each sequence, containing (Mods, mod_string) for the MSP format + """ + ret_vals = [] + p = re.compile("|".join(mods.keys())) + for seq in sequences: + offset = 0 + mod_list = [] + matches = p.finditer(seq) + for match in matches: + replacement = mods[re.escape(match.group())] + start, end = match.span() + actual_start = start - offset + mod_list.append((actual_start, replacement)) + + offset += end - start - 1 + + mod_string = "; ".join([f"{mod[2:]}@{mod[0]}{pos}" for pos, mod in mod_list]) + n_mods = len(mod_list) + if n_mods > 0: + mods = f"{n_mods}/{'/'.join([f'{pos},{mod}' for pos, mod in mod_list])}" + else: + mods = "0" + ret_vals.append((mods, mod_string)) + return ret_vals + + def internal_to_mod_names( sequences: List[str], ) -> List[Tuple[str, str]]: From 347979c2f22f8a208708e53c6c213e4dc071ef7b Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Mon, 5 Aug 2024 17:28:20 +0200 Subject: [PATCH 2/2] fixed mypy / typeguard --- spectrum_fundamentals/mod_string.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/spectrum_fundamentals/mod_string.py b/spectrum_fundamentals/mod_string.py index e3cec21..2016c02 100644 --- a/spectrum_fundamentals/mod_string.py +++ b/spectrum_fundamentals/mod_string.py @@ -200,7 +200,9 @@ def internal_without_mods(sequences: List[str]) -> List[str]: return [re.sub(regex, "", seq) for seq in sequences] -def internal_to_mod_mass(sequences: List[str], custom_mods: Optional[Dict[str, float]] = None) -> List[str]: +def internal_to_mod_mass( + sequences: List[str], custom_mods: Optional[Dict[str, Dict[str, Tuple[str, float]]]] = None +) -> List[str]: """ Function to exchange the internal mod identifiers with the masses of the specific modifiction. @@ -243,10 +245,10 @@ def internal_to_msp( mod_string = "; ".join([f"{mod[2:]}@{mod[0]}{pos}" for pos, mod in mod_list]) n_mods = len(mod_list) if n_mods > 0: - mods = f"{n_mods}/{'/'.join([f'{pos},{mod}' for pos, mod in mod_list])}" + mods_field = f"{n_mods}/{'/'.join([f'{pos},{mod}' for pos, mod in mod_list])}" else: - mods = "0" - ret_vals.append((mods, mod_string)) + mods_field = "0" + ret_vals.append((mods_field, mod_string)) return ret_vals