diff --git a/spectrum_fundamentals/mod_string.py b/spectrum_fundamentals/mod_string.py
index 47a839e..cbbe4f6 100644
--- a/spectrum_fundamentals/mod_string.py
+++ b/spectrum_fundamentals/mod_string.py
@@ -1,7 +1,7 @@
 import difflib
 import re
 from itertools import combinations, repeat
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Set, Tuple, Union
 
 import numpy as np
 import pandas as pd
@@ -342,6 +342,28 @@ def split_modstring(sequence: str, r_pattern):
     return map(split_modstring, sequences, repeat(regex_pattern))
 
 
+def get_all_tokens(sequences: List[str]) -> Set[str]:
+    """
+    Parse given sequences in UNIMOD ProForma standard into a set of all tokens.
+
+    A token is one of the 20 standard one-letter amino acid codes, optionally followed directly by one
+    "[UNIMOD:n]" tag. A tag that does not directly follow a residue (e.g. a terminal modification
+    such as "[UNIMOD:1]-") is consumed as a whole and not reported, so the letters inside it are never
+    mistaken for residues.
+
+    :param sequences: sequences to tokenize
+    :return: set of the distinct tokens found in the given sequences
+    """
+    # NOTE(review): standalone tags are skipped, not returned; confirm whether callers need terminal
+    # modifications reported as tokens of their own.
+    # Group 1 is only set by the residue alternative; the second alternative swallows a bare tag.
+    token_regex = re.compile(r"([ACDEFGHIKLMNPQRSTVWY](?:\[UNIMOD:\d+\])?)|\[UNIMOD:\d+\]")
+    tokens: Set[str] = set()
+    for seq in sequences:
+        tokens.update(match.group(1) for match in token_regex.finditer(seq) if match.group(1))
+    return tokens
+
+
 def add_permutations(modified_sequence: str, unimod_id: int, residues: List[str]):
     """
     Generate different peptide sequences with moving the modification to all possible residues.
diff --git a/tests/unit_tests/test_mod_string.py b/tests/unit_tests/test_mod_string.py
index cee6c5c..d03c736 100644
--- a/tests/unit_tests/test_mod_string.py
+++ b/tests/unit_tests/test_mod_string.py
@@ -309,6 +309,20 @@ def test_parse_modstrings_invalid_with_filtering(self):
         invalid_seq = "testing"
         self.assertEqual(next(mod.parse_modstrings([invalid_seq], alphabet=c.ALPHABET, filter=True)), [0])
 
+    def test_get_all_tokens(self):
+        """Test parsing of any UNIMOD sequence into tokens."""
+        seqs = ["ACKC[UNIMOD:4]AD", "PEPTIDE", "PEM[UNIMOD:35]"]
+
+        result = mod.get_all_tokens(seqs)
+        self.assertEqual(result, {"A", "C", "C[UNIMOD:4]", "D", "E", "I", "K", "M[UNIMOD:35]", "P", "T"})
+
+    def test_get_all_tokens_standalone_tags(self):
+        """Test that letters inside terminal modification tags are not reported as residues."""
+        seqs = ["[UNIMOD:1]-PEK", "PEK-[UNIMOD:2]"]
+
+        result = mod.get_all_tokens(seqs)
+        self.assertEqual(result, {"E", "K", "P"})
+
 
 class TestCustomToInternal(unittest.TestCase):
     """Class to test custom to internal."""