From 5df1448232e7fa6922e9f778f9cb0356ca2e07ff Mon Sep 17 00:00:00 2001 From: Fabian Basso Date: Fri, 9 Aug 2024 12:12:46 +0000 Subject: [PATCH 1/4] fixed tokens for custom mods --- spectrum_fundamentals/mod_string.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/spectrum_fundamentals/mod_string.py b/spectrum_fundamentals/mod_string.py index 99b3702..685cb24 100644 --- a/spectrum_fundamentals/mod_string.py +++ b/spectrum_fundamentals/mod_string.py @@ -311,10 +311,17 @@ def split_modstring(sequence: str, r_pattern): # Ugly and fast fix for reading modifications as is from maxquant we should reconsider how to fix it. # sequence = sequence.replace('M(ox)','M(U:35)') # sequence = sequence.replace('C','C(U:4)') + val = max(alphabet.values()) + 1 split_seq = r_pattern.findall(sequence) if "".join(split_seq) == sequence: if translate: - return [alphabet[aa] for aa in split_seq] + results = [] + for aa in split_seq: + if aa not in alphabet: #does not exist + alphabet[aa] = val + val += 1 + results.append(alphabet[aa]) + return results else: return split_seq elif filter: @@ -327,9 +334,10 @@ def split_modstring(sequence: str, r_pattern): f"The element(s) [{not_parsable_elements}] " f"in the sequence [{sequence}] could not be parsed" ) - pattern = sorted(alphabet, key=len, reverse=True) + pattern = r"[A-Z]\[UNIMOD:\d+\]" + alphabet_pattern = [re.escape(i) for i in sorted(alphabet, key=len, reverse=True)] - pattern = [re.escape(i) for i in pattern] + pattern = [pattern] + alphabet_pattern regex_pattern = re.compile("|".join(pattern)) return map(split_modstring, sequences, repeat(regex_pattern)) From 4e38d9d693ed0d4770435f4f033afbc58c2beab5 Mon Sep 17 00:00:00 2001 From: Fabian Basso Date: Fri, 9 Aug 2024 12:15:13 +0000 Subject: [PATCH 2/4] fixed tokens for custom mods --- spectrum_fundamentals/mod_string.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spectrum_fundamentals/mod_string.py b/spectrum_fundamentals/mod_string.py index 685cb24..8456124 100644 --- a/spectrum_fundamentals/mod_string.py +++ b/spectrum_fundamentals/mod_string.py @@ -317,7 +317,7 @@ def split_modstring(sequence: str, r_pattern): if translate: results = [] for aa in split_seq: - if aa not in alphabet: #does not exist + if aa not in alphabet: # does not exist alphabet[aa] = val val += 1 results.append(alphabet[aa]) From 04bec442b9400a2c5287d122395c25ff36b01a24 Mon Sep 17 00:00:00 2001 From: Fabian Basso Date: Fri, 9 Aug 2024 12:22:01 +0000 Subject: [PATCH 3/4] Mypy test passed --- spectrum_fundamentals/mod_string.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spectrum_fundamentals/mod_string.py b/spectrum_fundamentals/mod_string.py index 8456124..47a839e 100644 --- a/spectrum_fundamentals/mod_string.py +++ b/spectrum_fundamentals/mod_string.py @@ -334,10 +334,10 @@ def split_modstring(sequence: str, r_pattern): f"The element(s) [{not_parsable_elements}] " f"in the sequence [{sequence}] could not be parsed" ) - pattern = r"[A-Z]\[UNIMOD:\d+\]" + unimod_pattern = r"[A-Z]\[UNIMOD:\d+\]" alphabet_pattern = [re.escape(i) for i in sorted(alphabet, key=len, reverse=True)] - pattern = [pattern] + alphabet_pattern + pattern = [unimod_pattern] + alphabet_pattern regex_pattern = re.compile("|".join(pattern)) return map(split_modstring, sequences, repeat(regex_pattern)) From 3babf78f1fda12a43e4604e3697bcf98aca59d44 Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Fri, 9 Aug 2024 18:52:21 +0200 Subject: [PATCH 4/4] added get_all_token method --- spectrum_fundamentals/mod_string.py | 11 ++++++++++- tests/unit_tests/test_mod_string.py | 7 +++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/spectrum_fundamentals/mod_string.py b/spectrum_fundamentals/mod_string.py index 47a839e..cbbe4f6 100644 --- a/spectrum_fundamentals/mod_string.py +++ b/spectrum_fundamentals/mod_string.py @@ -1,7 +1,7 @@ import difflib import re from itertools import combinations, repeat -from typing import Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Set, Tuple, Union import numpy as np import pandas as pd @@ -342,6 +342,15 @@ def split_modstring(sequence: str, r_pattern): return map(split_modstring, sequences, repeat(regex_pattern)) +def get_all_tokens(sequences: List[str]) -> Set[str]: + """Parse given sequences in UNIMOD ProForma standard into a set of all tokens.""" + pattern = r"[ACDEFGHIKLMNPQRSTVWY](\[UNIMOD:\d+\])?" + tokens = set() + for seq in sequences: + tokens |= {match.group() for match in re.finditer(pattern, seq)} + return tokens + + def add_permutations(modified_sequence: str, unimod_id: int, residues: List[str]): """ Generate different peptide sequences with moving the modification to all possible residues. diff --git a/tests/unit_tests/test_mod_string.py b/tests/unit_tests/test_mod_string.py index cee6c5c..d03c736 100644 --- a/tests/unit_tests/test_mod_string.py +++ b/tests/unit_tests/test_mod_string.py @@ -309,6 +309,13 @@ def test_parse_modstrings_invalid_with_filtering(self): invalid_seq = "testing" self.assertEqual(next(mod.parse_modstrings([invalid_seq], alphabet=c.ALPHABET, filter=True)), [0]) + def test_get_all_tokens(self): + """Test parsing of any UNIMOD sequence into tokens.""" + seqs = ["ACKC[UNIMOD:4]AD", "PEPTIDE", "PEM[UNIMOD:35]"] + + result = mod.get_all_tokens(seqs) + self.assertEqual(result, {"A", "C", "C[UNIMOD:4]", "D", "E", "I", "K", "M[UNIMOD:35]", "P", "T"}) + class TestCustomToInternal(unittest.TestCase): """Class to test custom to internal."""