From 9d10c3476907e6d5af41b30647d81aad359c3e66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CKarim?= <“karimwael48@gmail.com”> Date: Tue, 24 Oct 2023 14:06:37 +0200 Subject: [PATCH 1/8] added sage reversed unimod dictionary --- spectrum_fundamentals/constants.py | 15 ++++++++++++++- spectrum_fundamentals/mod_string.py | 2 ++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/spectrum_fundamentals/constants.py b/spectrum_fundamentals/constants.py index e23207f..9141f59 100644 --- a/spectrum_fundamentals/constants.py +++ b/spectrum_fundamentals/constants.py @@ -18,7 +18,17 @@ ############# # ALPHABETS # ############# - +MOD_MASSES_SAGE = {+229.1629: '[UNIMOD:737]', + +304.2071: '[UNIMOD:2016]', + +144.1020: '[UNIMOD:214]', + +304.2053: '[UNIMOD:730]', + +8.0141: '[UNIMOD:259]', + +10.0082: '[UNIMOD:267]', + +79.9663: '[UNIMOD:21]', + -18.0105: '[UNIMOD:23]', + +57.0214: '[UNIMOD:4]', + +15.9949: '[UNIMOD:35]', + +42.0105: '[UNIMOD:1]'} AA_ALPHABET = { "A": 1, "C": 24, @@ -276,3 +286,6 @@ class RescoreType(Enum): PROSIT = "prosit" ANDROMEDA = "andromeda" + + + diff --git a/spectrum_fundamentals/mod_string.py b/spectrum_fundamentals/mod_string.py index 334a5a7..0f8b32b 100644 --- a/spectrum_fundamentals/mod_string.py +++ b/spectrum_fundamentals/mod_string.py @@ -239,3 +239,5 @@ def get_mods_list(mods_variable: str, mods_fixed: str): return mods_variable.split(";") else: return mods_variable.split(";") + mods_fixed.split(";") + +def \ No newline at end of file From 393ca1219e0b1261a973024d000f10d74fadfcbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CKarim?= <“karimwael48@gmail.com”> Date: Thu, 26 Oct 2023 11:18:34 +0200 Subject: [PATCH 2/8] fixed some issues --- spectrum_fundamentals/constants.py | 24 ++++++++++++------------ spectrum_fundamentals/mod_string.py | 4 ++-- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/spectrum_fundamentals/constants.py b/spectrum_fundamentals/constants.py index 9141f59..613e714 100644 --- 
a/spectrum_fundamentals/constants.py +++ b/spectrum_fundamentals/constants.py @@ -18,17 +18,7 @@ ############# # ALPHABETS # ############# -MOD_MASSES_SAGE = {+229.1629: '[UNIMOD:737]', - +304.2071: '[UNIMOD:2016]', - +144.1020: '[UNIMOD:214]', - +304.2053: '[UNIMOD:730]', - +8.0141: '[UNIMOD:259]', - +10.0082: '[UNIMOD:267]', - +79.9663: '[UNIMOD:21]', - -18.0105: '[UNIMOD:23]', - +57.0214: '[UNIMOD:4]', - +15.9949: '[UNIMOD:35]', - +42.0105: '[UNIMOD:1]'} + AA_ALPHABET = { "A": 1, "C": 24, @@ -163,7 +153,17 @@ "[UNIMOD:35]": 15.9949146, # Oxidation "[UNIMOD:1]": 42.010565, # Acetylation } - +MOD_MASSES_SAGE = {+229.1629: '[UNIMOD:737]', + +304.2071: '[UNIMOD:2016]', + +144.1020: '[UNIMOD:214]', + +304.2053: '[UNIMOD:730]', + +8.0141: '[UNIMOD:259]', + +10.0082: '[UNIMOD:267]', + +79.9663: '[UNIMOD:21]', + -18.0105: '[UNIMOD:23]', + +57.0214: '[UNIMOD:4]', + +15.9949: '[UNIMOD:35]', + +42.0105: '[UNIMOD:1]'} # these are only used for prosit_grpc, oktoberfest uses the masses from MOD_MASSES AA_MOD_MASSES = { "K[UNIMOD:737]": AA_MASSES["K"] + MOD_MASSES["[UNIMOD:737]"], diff --git a/spectrum_fundamentals/mod_string.py b/spectrum_fundamentals/mod_string.py index 0f8b32b..9118438 100644 --- a/spectrum_fundamentals/mod_string.py +++ b/spectrum_fundamentals/mod_string.py @@ -5,7 +5,8 @@ from .constants import MAXQUANT_VAR_MODS, MOD_MASSES, MOD_NAMES, SPECTRONAUT_MODS - +def sage_to_internal(): + def internal_to_spectronaut(sequences: List[str]) -> List[str]: """ Function to translate a modstring from the internal format to the spectronaut format. @@ -240,4 +241,3 @@ def get_mods_list(mods_variable: str, mods_fixed: str): else: return mods_variable.split(";") + mods_fixed.split(";") -def \ No newline at end of file From c8ce0d11ffbcbaf638ae4418a23167a3bac9236f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CKarim?= <“karimwael48@gmail.com”> Date: Thu, 26 Oct 2023 11:36:07 +0200 Subject: [PATCH 3/8] added final function for sage. 
--- spectrum_fundamentals/mod_string.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/spectrum_fundamentals/mod_string.py b/spectrum_fundamentals/mod_string.py index 9118438..6e57257 100644 --- a/spectrum_fundamentals/mod_string.py +++ b/spectrum_fundamentals/mod_string.py @@ -3,9 +3,29 @@ from itertools import repeat from typing import Dict, List, Optional, Tuple -from .constants import MAXQUANT_VAR_MODS, MOD_MASSES, MOD_NAMES, SPECTRONAUT_MODS +from .constants import MAXQUANT_VAR_MODS, MOD_MASSES, MOD_NAMES, SPECTRONAUT_MODS , MOD_MASSES_SAGE + +def sage_to_internal(sequences: List[str])->List[str]: + # Find the number within square brackets (as a float) + start_idx = sequences.find('[') + 1 + end_idx = sequences.find(']') + + # Extract the number string + number_str = sequences[start_idx:end_idx] + + try: + # Attempt to convert the number to a float + number = float(number_str) + except ValueError: + # If conversion fails, keep the original text + return sequences + + # Replace with the corresponding value from the dictionary + if number in MOD_MASSES_SAGE: + return sequences.replace(f'[{number_str}]', MOD_MASSES_SAGE[number]) + else: + return sequences -def sage_to_internal(): def internal_to_spectronaut(sequences: List[str]) -> List[str]: """ From df44fb198aff25620fb78a90ac4f31f341ebf645 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CKarim?= <“karimwael48@gmail.com”> Date: Fri, 27 Oct 2023 10:00:20 +0200 Subject: [PATCH 4/8] added a few lines for sage --- spectrum_fundamentals/constants.py | 20 ++++++------ spectrum_fundamentals/mod_string.py | 50 +++++++++++++++++------------ 2 files changed, 40 insertions(+), 30 deletions(-) diff --git a/spectrum_fundamentals/constants.py b/spectrum_fundamentals/constants.py index 613e714..9d54ff8 100644 --- a/spectrum_fundamentals/constants.py +++ b/spectrum_fundamentals/constants.py @@ -153,17 +153,17 @@ "[UNIMOD:35]": 15.9949146, # Oxidation "[UNIMOD:1]": 42.010565, # 
Acetylation } -MOD_MASSES_SAGE = {+229.1629: '[UNIMOD:737]', - +304.2071: '[UNIMOD:2016]', - +144.1020: '[UNIMOD:214]', - +304.2053: '[UNIMOD:730]', - +8.0141: '[UNIMOD:259]', - +10.0082: '[UNIMOD:267]', - +79.9663: '[UNIMOD:21]', +MOD_MASSES_SAGE = {229.1629: '[UNIMOD:737]', + 304.2071: '[UNIMOD:2016]', + 144.1020: '[UNIMOD:214]', + 304.2053: '[UNIMOD:730]', + 8.0141: '[UNIMOD:259]', + 10.0082: '[UNIMOD:267]', + 79.9663: '[UNIMOD:21]', -18.0105: '[UNIMOD:23]', - +57.0214: '[UNIMOD:4]', - +15.9949: '[UNIMOD:35]', - +42.0105: '[UNIMOD:1]'} + 57.0214: '[UNIMOD:4]', + 15.9949: '[UNIMOD:35]', + 42.0105: '[UNIMOD:1]'} # these are only used for prosit_grpc, oktoberfest uses the masses from MOD_MASSES AA_MOD_MASSES = { "K[UNIMOD:737]": AA_MASSES["K"] + MOD_MASSES["[UNIMOD:737]"], diff --git a/spectrum_fundamentals/mod_string.py b/spectrum_fundamentals/mod_string.py index 6e57257..71011ee 100644 --- a/spectrum_fundamentals/mod_string.py +++ b/spectrum_fundamentals/mod_string.py @@ -5,26 +5,36 @@ from .constants import MAXQUANT_VAR_MODS, MOD_MASSES, MOD_NAMES, SPECTRONAUT_MODS , MOD_MASSES_SAGE -def sage_to_internal(sequences: List[str])->List[str]: - # Find the number within square brackets (as a float) - start_idx = sequences.find('[') + 1 - end_idx = sequences.find(']') - - # Extract the number string - number_str = sequences[start_idx:end_idx] - - try: - # Attempt to convert the number to a float - number = float(number_str) - except ValueError: - # If conversion fails, keep the original text - return sequences - - # Replace with the corresponding value from the dictionary - if number in MOD_MASSES_SAGE: - return sequences.replace(f'[{number_str}]', MOD_MASSES_SAGE[number]) - else: - return sequences +def sage_to_internal(strings: List[str]) -> List[str]: + modified_strings = [] + + for string in strings: + # Find the number within square brackets (as a float) + start_idx = string.find('[') + 1 + end_idx = string.find(']') + + if start_idx > 0 and end_idx > start_idx: + 
# Extract the number string + number_str = string[start_idx:end_idx] + + try: + # Attempt to convert the number to a float + number = float(number_str) + except ValueError: + # If conversion fails, keep the original text + modified_strings.append(string) + continue + + # Replace with the corresponding value from the dictionary + if number in MOD_MASSES_SAGE: + modified_value = string.replace(f'[{number_str}]', MOD_MASSES_SAGE[number]) + modified_strings.append(modified_value) + else: + modified_strings.append(string) + else: + modified_strings.append(string) + + return modified_strings def internal_to_spectronaut(sequences: List[str]) -> List[str]: From 8c04d717582908eeb6e1fb6c676c85a77ff1fc90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CKarim?= <“karimwael48@gmail.com”> Date: Fri, 10 Nov 2023 13:42:20 +0100 Subject: [PATCH 5/8] formatting using black --- spectrum_fundamentals/constants.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/spectrum_fundamentals/constants.py b/spectrum_fundamentals/constants.py index 9d54ff8..249adb4 100644 --- a/spectrum_fundamentals/constants.py +++ b/spectrum_fundamentals/constants.py @@ -153,17 +153,19 @@ "[UNIMOD:35]": 15.9949146, # Oxidation "[UNIMOD:1]": 42.010565, # Acetylation } -MOD_MASSES_SAGE = {229.1629: '[UNIMOD:737]', - 304.2071: '[UNIMOD:2016]', - 144.1020: '[UNIMOD:214]', - 304.2053: '[UNIMOD:730]', - 8.0141: '[UNIMOD:259]', - 10.0082: '[UNIMOD:267]', - 79.9663: '[UNIMOD:21]', - -18.0105: '[UNIMOD:23]', - 57.0214: '[UNIMOD:4]', - 15.9949: '[UNIMOD:35]', - 42.0105: '[UNIMOD:1]'} +MOD_MASSES_SAGE = { + 229.1629: "[UNIMOD:737]", + 304.2071: "[UNIMOD:2016]", + 144.1020: "[UNIMOD:214]", + 304.2053: "[UNIMOD:730]", + 8.0141: "[UNIMOD:259]", + 10.0082: "[UNIMOD:267]", + 79.9663: "[UNIMOD:21]", + -18.0105: "[UNIMOD:23]", + 57.0214: "[UNIMOD:4]", + 15.9949: "[UNIMOD:35]", + 42.0105: "[UNIMOD:1]", +} # these are only used for prosit_grpc, oktoberfest uses the masses from 
MOD_MASSES AA_MOD_MASSES = { "K[UNIMOD:737]": AA_MASSES["K"] + MOD_MASSES["[UNIMOD:737]"], @@ -286,6 +288,3 @@ class RescoreType(Enum): PROSIT = "prosit" ANDROMEDA = "andromeda" - - - From 03829483fe2a35a422e9653100055fb2ed58a37b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CKarim?= <“karimwael48@gmail.com”> Date: Fri, 10 Nov 2023 13:47:39 +0100 Subject: [PATCH 6/8] Fixed doc string for sage to internal. --- spectrum_fundamentals/mod_string.py | 67 ++++++++++++++++------------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/spectrum_fundamentals/mod_string.py b/spectrum_fundamentals/mod_string.py index 71011ee..3f4edf8 100644 --- a/spectrum_fundamentals/mod_string.py +++ b/spectrum_fundamentals/mod_string.py @@ -3,40 +3,50 @@ from itertools import repeat from typing import Dict, List, Optional, Tuple -from .constants import MAXQUANT_VAR_MODS, MOD_MASSES, MOD_NAMES, SPECTRONAUT_MODS , MOD_MASSES_SAGE +from .constants import MAXQUANT_VAR_MODS, MOD_MASSES, MOD_MASSES_SAGE, MOD_NAMES, SPECTRONAUT_MODS -def sage_to_internal(strings: List[str]) -> List[str]: + +def sage_to_internal(sequences: List[str]) -> List[str]: + """ + Convert mod string from sage to the internal format. + + This function converts sequences using the mass change of a modification in + square brackets as done by Sage to the internal format by replacing the mass + shift with the corresponding UNIMOD identifier of known and supported + modifications defined in the constants. + + :param sequences: A list of sequences with values inside square brackets. + :return: A list of modified sequences with values converted to internal format. + """ + # Define a regular expression pattern to match values within square brackets, like [+1.0] or [-2.0]. + pattern = r"\[([\+\-]\d+\.\d+)\]" + + # Define a function 'replace' that takes a regex match object. + def replace(match): + # Extract the value inside the square brackets as a float. 
+ value = float(match.group(1)) + + # Check if the 'MOD_MASSES_SAGE' dictionary has a replacement value for the extracted value. + # If it does, use the replacement value; otherwise, use the original value from the match. + unimod_expression = MOD_MASSES_SAGE.get(value, match.group(0)) + + return unimod_expression + + # Create an empty list 'modified_strings' to store the modified sequences. modified_strings = [] - for string in strings: - # Find the number within square brackets (as a float) - start_idx = string.find('[') + 1 - end_idx = string.find(']') - - if start_idx > 0 and end_idx > start_idx: - # Extract the number string - number_str = string[start_idx:end_idx] - - try: - # Attempt to convert the number to a float - number = float(number_str) - except ValueError: - # If conversion fails, keep the original text - modified_strings.append(string) - continue - - # Replace with the corresponding value from the dictionary - if number in MOD_MASSES_SAGE: - modified_value = string.replace(f'[{number_str}]', MOD_MASSES_SAGE[number]) - modified_strings.append(modified_value) - else: - modified_strings.append(string) - else: - modified_strings.append(string) + # Iterate through the input 'sequences'. + for string in sequences: + # Use 're.sub' to search and replace values within square brackets in the 'string' using the 'replace' function. + modified_string = re.sub(pattern, replace, string) + # Append the modified string to the 'modified_strings' list. + modified_strings.append(modified_string) + + # Return the list of modified sequences. return modified_strings - + def internal_to_spectronaut(sequences: List[str]) -> List[str]: """ Function to translate a modstring from the internal format to the spectronaut format. 
@@ -270,4 +280,3 @@ def get_mods_list(mods_variable: str, mods_fixed: str): return mods_variable.split(";") else: return mods_variable.split(";") + mods_fixed.split(";") - From d849161e5fb34965b5673a1f777c7aebca44cdd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CKarim?= <“karimwael48@gmail.com”> Date: Fri, 10 Nov 2023 14:24:44 +0100 Subject: [PATCH 7/8] added test mod string --- spectrum_fundamentals/mod_string.py | 13 ++++++++++--- tests/unit_tests/test_mod_string.py | 20 ++++++++++++++++++++ 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/spectrum_fundamentals/mod_string.py b/spectrum_fundamentals/mod_string.py index 3f4edf8..c6df9e7 100644 --- a/spectrum_fundamentals/mod_string.py +++ b/spectrum_fundamentals/mod_string.py @@ -19,17 +19,24 @@ def sage_to_internal(sequences: List[str]) -> List[str]: :return: A list of modified sequences with values converted to internal format. """ # Define a regular expression pattern to match values within square brackets, like [+1.0] or [-2.0]. - pattern = r"\[([\+\-]\d+\.\d+)\]" + pattern = r"[A-Z]?\[([\+\-]\d+\.\d+)\]-?" # Define a function 'replace' that takes a regex match object. def replace(match): # Extract the value inside the square brackets as a float. value = float(match.group(1)) + key = match.string[match.start() : match.end()] + if key.endswith("-"): + unimod_expression = f"{MOD_MASSES_SAGE.get(value, match.group(0))}-" + elif key.startswith("C"): + unimod_expression = f"C{MOD_MASSES_SAGE.get(value, match.group(0))}" + elif key.startswith("K"): + unimod_expression = f"K{MOD_MASSES_SAGE.get(value, match.group(0))}" + elif key.startswith("M"): + unimod_expression = f"M{MOD_MASSES_SAGE.get(value, match.group(0))}" # Check if the 'MOD_MASSES_SAGE' dictionary has a replacement value for the extracted value. # If it does, use the replacement value; otherwise, use the original value from the match. 
- unimod_expression = MOD_MASSES_SAGE.get(value, match.group(0)) - return unimod_expression # Create an empty list 'modified_strings' to store the modified sequences. diff --git a/tests/unit_tests/test_mod_string.py b/tests/unit_tests/test_mod_string.py index 89dc7de..dae047f 100644 --- a/tests/unit_tests/test_mod_string.py +++ b/tests/unit_tests/test_mod_string.py @@ -18,6 +18,26 @@ def test_internal_to_mod_names(self): ] +class TestSageToInternal(unittest.TestCase): + """Class to test MaxQuant to internal.""" + + def test_sage_to_internal_carbamidomethylation(self): + """Test maxquant_to_internal_carbamidomethylation.""" + self.assertEqual(mod.sage_to_internal(["ABC[+57.0214]DEFGH"]), ["ABC[UNIMOD:4]DEFGH"]) + + def test_sage_to_internal_variable_oxidation(self): + """Test maxquant_to_internal_variable_oxidation.""" + self.assertEqual(mod.sage_to_internal(["ABC[+57.0214]DM[+15.9949]EFGH"]), ["ABC[UNIMOD:4]DM[UNIMOD:35]EFGH"]) + + def test_sage_to_internal_tmt(self): + """Test maxquant_to_internal_tmt.""" + # fixed_mods = {"C": "C[UNIMOD:4]", "^_": "_[UNIMOD:737]-", "K": "K[UNIMOD:737]"} + self.assertEqual( + mod.sage_to_internal(["[+229.1629]-ABC[+57.0214]DEFGHK[+229.1629]"]), + ["[UNIMOD:737]-ABC[UNIMOD:4]DEFGHK[UNIMOD:737]"], + ) + + class TestMaxQuantToInternal(unittest.TestCase): """Class to test MaxQuant to internal.""" From 04b2b60dbd6af1251bcbd97ba3f1a86d50e71ec9 Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Fri, 10 Nov 2023 15:23:11 +0100 Subject: [PATCH 8/8] removed unnecessary fixed_mods --- tests/unit_tests/test_mod_string.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit_tests/test_mod_string.py b/tests/unit_tests/test_mod_string.py index dae047f..43b338a 100644 --- a/tests/unit_tests/test_mod_string.py +++ b/tests/unit_tests/test_mod_string.py @@ -31,7 +31,6 @@ def test_sage_to_internal_variable_oxidation(self): def test_sage_to_internal_tmt(self): """Test maxquant_to_internal_tmt.""" - # fixed_mods = {"C": "C[UNIMOD:4]", "^_": 
"_[UNIMOD:737]-", "K": "K[UNIMOD:737]"} self.assertEqual( mod.sage_to_internal(["[+229.1629]-ABC[+57.0214]DEFGHK[+229.1629]"]), ["[UNIMOD:737]-ABC[UNIMOD:4]DEFGHK[UNIMOD:737]"],