wilhelm-lab · picciama · Aug 9, 2024 · Aug 9, 2024 · Aug 9, 2024 · Aug 9, 2024
diff --git a/spectrum_fundamentals/mod_string.py b/spectrum_fundamentals/mod_string.py
@@ -1,7 +1,7 @@
 import difflib
 import re
 from itertools import combinations, repeat
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Set, Tuple, Union
 
 import numpy as np
 import pandas as pd
@@ -311,10 +311,17 @@ def split_modstring(sequence: str, r_pattern):
         # Ugly and fast fix for reading modifications as is from maxquant we should reconsider how to fix it.
         # sequence = sequence.replace('M(ox)','M(U:35)')
         # sequence = sequence.replace('C','C(U:4)')
+        val = max(alphabet.values()) + 1
         split_seq = r_pattern.findall(sequence)
         if "".join(split_seq) == sequence:
             if translate:
-                return [alphabet[aa] for aa in split_seq]
+                results = []
+                for aa in split_seq:
+                    if aa not in alphabet:  # does not exist
+                        alphabet[aa] = val
+                        val += 1
+                    results.append(alphabet[aa])
+                return results
             else:
                 return split_seq
         elif filter:
@@ -327,13 +334,23 @@ def split_modstring(sequence: str, r_pattern):
                 f"The element(s) [{not_parsable_elements}] " f"in the sequence [{sequence}] could not be parsed"
             )
 
-    pattern = sorted(alphabet, key=len, reverse=True)
+    unimod_pattern = r"[A-Z]\[UNIMOD:\d+\]"
+    alphabet_pattern = [re.escape(i) for i in sorted(alphabet, key=len, reverse=True)]
 
-    pattern = [re.escape(i) for i in pattern]
+    pattern = [unimod_pattern] + alphabet_pattern
     regex_pattern = re.compile("|".join(pattern))
     return map(split_modstring, sequences, repeat(regex_pattern))
 
 
+def get_all_tokens(sequences: List[str]) -> Set[str]:
+    """Parse given sequences in UNIMOD ProForma standard into a set of all tokens."""
+    pattern = r"[ACDEFGHIKLMNPQRSTVWY](\[UNIMOD:\d+\])?"
+    tokens = set()
+    for seq in sequences:
+        tokens |= {match.group() for match in re.finditer(pattern, seq)}
+    return tokens
+
+
 def add_permutations(modified_sequence: str, unimod_id: int, residues: List[str]):
     """
     Generate different peptide sequences with moving the modification to all possible residues.

diff --git a/tests/unit_tests/test_mod_string.py b/tests/unit_tests/test_mod_string.py
@@ -309,6 +309,13 @@ def test_parse_modstrings_invalid_with_filtering(self):
         invalid_seq = "testing"
         self.assertEqual(next(mod.parse_modstrings([invalid_seq], alphabet=c.ALPHABET, filter=True)), [0])
 
+    def test_get_all_tokens(self):
+        """Test parsing of any UNIMOD sequence into tokens."""
+        seqs = ["ACKC[UNIMOD:4]AD", "PEPTIDE", "PEM[UNIMOD:35]"]
+
+        result = mod.get_all_tokens(seqs)
+        self.assertEqual(result, {"A", "C", "C[UNIMOD:4]", "D", "E", "I", "K", "M[UNIMOD:35]", "P", "T"})
+
 
 class TestCustomToInternal(unittest.TestCase):
     """Class to test custom to internal."""