diff --git a/pyproject.toml b/pyproject.toml index 3777c29..65271a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -110,6 +110,7 @@ module = [ "test_core", "test_caches", "test_parsers_basics", + "test_fa_simplifier", ] #check_untyped_defs = false diff --git a/src/ua_parser/__init__.py b/src/ua_parser/__init__.py index a9a09b4..85ac75a 100644 --- a/src/ua_parser/__init__.py +++ b/src/ua_parser/__init__.py @@ -57,6 +57,7 @@ UserAgent, ) from .loaders import load_builtins, load_lazy_builtins +from .utils import IS_GRAAL Re2Resolver: Optional[Callable[[Matchers], Resolver]] = None if importlib.util.find_spec("re2"): @@ -132,10 +133,11 @@ def parse_device(self: Resolver, ua: str) -> Optional[Device]: def __getattr__(name: str) -> Parser: global parser if name == "parser": - parser = Parser.from_matchers( - load_builtins() if Re2Resolver is None else load_lazy_builtins() - ) - return parser + if Re2Resolver or IS_GRAAL: + matchers = load_lazy_builtins() + else: + matchers = load_builtins() + return Parser.from_matchers(matchers) raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/src/ua_parser/basic.py b/src/ua_parser/basic.py index bdc1e69..00b49e1 100644 --- a/src/ua_parser/basic.py +++ b/src/ua_parser/basic.py @@ -1,7 +1,9 @@ __all__ = ["Resolver"] +import re +from itertools import chain from operator import methodcaller -from typing import List +from typing import Any, List from .core import ( Device, @@ -12,6 +14,7 @@ PartialResult, UserAgent, ) +from .utils import IS_GRAAL, fa_simplifier class Resolver: @@ -30,6 +33,24 @@ def __init__( matchers: Matchers, ) -> None: self.user_agent_matchers, self.os_matchers, self.device_matchers = matchers + if IS_GRAAL: + matcher: Any + kind = next( + ( + "eager" if hasattr(type(m), "regex") else "lazy" + for m in chain.from_iterable(matchers) + ), + None, + ) + if kind == "eager": + for matcher in chain.from_iterable(matchers): + matcher.pattern = re.compile( + fa_simplifier(matcher.pattern.pattern), + flags=matcher.pattern.flags, + ) + elif kind == "lazy": + for matcher in chain.from_iterable(matchers): + matcher.regex = fa_simplifier(matcher.pattern.pattern) def __call__(self, ua: str, domains: Domain, /) -> PartialResult: parse = methodcaller("__call__", ua) diff --git a/src/ua_parser/re2.py b/src/ua_parser/re2.py index 83a4a14..1f17e22 100644 --- a/src/ua_parser/re2.py +++ b/src/ua_parser/re2.py @@ -14,6 +14,7 @@ PartialResult, UserAgent, ) +from .utils import fa_simplifier class DummyFilter: @@ -38,7 +39,7 @@ def __init__( if self.user_agent_matchers: self.ua = re2.Filter() for u in self.user_agent_matchers: - self.ua.Add(u.regex) + self.ua.Add(fa_simplifier(u.regex)) self.ua.Compile() else: self.ua = DummyFilter() @@ -46,7 +47,7 @@ def __init__( if self.os_matchers: self.os = re2.Filter() for o in self.os_matchers: - self.os.Add(o.regex) + self.os.Add(fa_simplifier(o.regex)) self.os.Compile() else: self.os = DummyFilter() @@ -58,9 +59,9 @@ def __init__( # no pattern uses global flags, but since they're not # supported in JS that seems safe. if d.flags & re.IGNORECASE: - self.devices.Add("(?i)" + d.regex) + self.devices.Add("(?i)" + fa_simplifier(d.regex)) else: - self.devices.Add(d.regex) + self.devices.Add(fa_simplifier(d.regex)) self.devices.Compile() else: self.devices = DummyFilter() diff --git a/src/ua_parser/utils.py b/src/ua_parser/utils.py index f3afa48..ac11c5a 100644 --- a/src/ua_parser/utils.py +++ b/src/ua_parser/utils.py @@ -1,6 +1,9 @@ +import platform import re from typing import Match, Optional +IS_GRAAL: bool = platform.python_implementation() == "GraalVM" + def get(m: Match[str], idx: int) -> Optional[str]: return (m[idx] or None) if 0 < idx <= m.re.groups else None @@ -28,3 +31,33 @@ def replacer(repl: str, m: Match[str]) -> Optional[str]: return None return re.sub(r"\$(\d)", lambda n: get(m, int(n[1])) or "", repl).strip() or None + + +REPETITION_PATTERN = re.compile(r"\{(0|1)\s*,\s*\d{3,}\}") +CLASS_PATTERN = re.compile( + r""" +\[[^]]*\\(d|w)[^]]*\] +| +\\(d|w) +""", + re.VERBOSE, +) + + +def class_replacer(m: re.Match[str]) -> str: + d, w = ("0-9", "A-Za-z0-9_") if m[1] else ("[0-9]", "[A-Za-z0-9_]") + return m[0].replace(r"\d", d).replace(r"\w", w) + + +def fa_simplifier(pattern: str) -> str: + """uap-core makes significant use of large bounded repetitions, to + mitigate catastrophic backtracking. + + However this explodes the number of states (and thus graph size) + for finite automaton engines, which significantly increases their + memory use, and for those which use JITs it can exceed the JIT + threshold and force fallback to a slower engine (seems to be the + case for graal's TRegex). + """ + pattern = REPETITION_PATTERN.sub(lambda m: "*" if m[1] == "0" else "+", pattern) + return CLASS_PATTERN.sub(class_replacer, pattern) diff --git a/tests/test_fa_simplifier.py b/tests/test_fa_simplifier.py new file mode 100644 index 0000000..1c66050 --- /dev/null +++ b/tests/test_fa_simplifier.py @@ -0,0 +1,15 @@ +import pytest # type: ignore + +from ua_parser.utils import fa_simplifier + + +@pytest.mark.parametrize( + ("from_", "to"), + [ + (r"\d", "[0-9]"), + (r"[\d]", "[0-9]"), + (r"[\d\.]", r"[0-9\.]"), + ], +) +def test_classes(from_, to): + assert fa_simplifier(from_) == to