From e0a079bd202748cd4fc6cc2d40ddffc0b8f425ad Mon Sep 17 00:00:00 2001 From: masklinn Date: Mon, 28 Oct 2024 21:33:39 +0100 Subject: [PATCH] Add finite-automaton simplifier, for re2 and graal As I've discovered a while ago, finite automaton engines are not very fond of large bounded repetitions. In re2 and regex, that mostly translates to increased memory consumption (e.g. in their default modes, converting `.*` to `.{0,500}` increases the pattern's size by 115x in re2 and 84x in regex, if a capture is added on top then regex balloons to 219x), there is a performance impact but it's high single digit to low double, in regex at least (didn't test re2). However as it turns out Graal uses a JIT-ed DFA, and it *really* doesn't like these patterns, it spends a lot of time JIT-compiling (this is apparently the source of the extra 300% CPU use I could observe on what are purely single-threaded workloads, the JIT desperately trying to optimise regexes) them with no gain in performance: down-converting the regex back to the sensible increases performances by ~25%, though it doesn't seem to impact memory use... So... do that: `fa_simplifier` is the same idea as ua-parser/uap-rust@29b9195d886a5e1d13dc7109a002a7f8f12e5406 but from the Python side, and applied to graal and re2 (not regex because it does that internally as linked above). Also switch Graal over to the lazy builtins, it kinda spreads the cost but it seems stupid to compile the regexes only to immediately swap (fa_simplifier) and recompile them... so don't do that, especially as I couldn't be arsed to make the replacement conditional (so every eager regex is recompiled, even though only those which actually got modified by `fa_simplifier` need it...). Fixes #228 --- pyproject.toml | 1 + src/ua_parser/__init__.py | 10 ++++++---- src/ua_parser/basic.py | 23 ++++++++++++++++++++++- src/ua_parser/re2.py | 9 +++++---- src/ua_parser/utils.py | 33 +++++++++++++++++++++++++++++++++ tests/test_fa_simplifier.py | 15 +++++++++++++++ 6 files changed, 82 insertions(+), 9 deletions(-) create mode 100644 tests/test_fa_simplifier.py diff --git a/pyproject.toml b/pyproject.toml index 3777c29..65271a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -110,6 +110,7 @@ module = [ "test_core", "test_caches", "test_parsers_basics", + "test_fa_simplifier", ] #check_untyped_defs = false diff --git a/src/ua_parser/__init__.py b/src/ua_parser/__init__.py index a9a09b4..85ac75a 100644 --- a/src/ua_parser/__init__.py +++ b/src/ua_parser/__init__.py @@ -57,6 +57,7 @@ UserAgent, ) from .loaders import load_builtins, load_lazy_builtins +from .utils import IS_GRAAL Re2Resolver: Optional[Callable[[Matchers], Resolver]] = None if importlib.util.find_spec("re2"): @@ -132,10 +133,11 @@ def parse_device(self: Resolver, ua: str) -> Optional[Device]: def __getattr__(name: str) -> Parser: global parser if name == "parser": - parser = Parser.from_matchers( - load_builtins() if Re2Resolver is None else load_lazy_builtins() - ) - return parser + if Re2Resolver or IS_GRAAL: + matchers = load_lazy_builtins() + else: + matchers = load_builtins() + return Parser.from_matchers(matchers) raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/src/ua_parser/basic.py b/src/ua_parser/basic.py index bdc1e69..00b49e1 100644 --- a/src/ua_parser/basic.py +++ b/src/ua_parser/basic.py @@ -1,7 +1,9 @@ __all__ = ["Resolver"] +import re +from itertools import chain from operator import methodcaller -from typing import List +from typing import Any, List from .core import ( Device, @@ -12,6 +14,7 @@ PartialResult, UserAgent, ) +from .utils import IS_GRAAL, fa_simplifier class Resolver: @@ -30,6 +33,24 @@ def __init__( matchers: Matchers, ) -> None: self.user_agent_matchers, self.os_matchers, self.device_matchers = matchers + if IS_GRAAL: + matcher: Any + kind = next( + ( + "eager" if hasattr(type(m), "regex") else "lazy" + for m in chain.from_iterable(matchers) + ), + None, + ) + if kind == "eager": + for matcher in chain.from_iterable(matchers): + matcher.pattern = re.compile( + fa_simplifier(matcher.pattern.pattern), + flags=matcher.pattern.flags, + ) + elif kind == "lazy": + for matcher in chain.from_iterable(matchers): + matcher.regex = fa_simplifier(matcher.pattern.pattern) def __call__(self, ua: str, domains: Domain, /) -> PartialResult: parse = methodcaller("__call__", ua) diff --git a/src/ua_parser/re2.py b/src/ua_parser/re2.py index 83a4a14..1f17e22 100644 --- a/src/ua_parser/re2.py +++ b/src/ua_parser/re2.py @@ -14,6 +14,7 @@ PartialResult, UserAgent, ) +from .utils import fa_simplifier class DummyFilter: @@ -38,7 +39,7 @@ def __init__( if self.user_agent_matchers: self.ua = re2.Filter() for u in self.user_agent_matchers: - self.ua.Add(u.regex) + self.ua.Add(fa_simplifier(u.regex)) self.ua.Compile() else: self.ua = DummyFilter() @@ -46,7 +47,7 @@ def __init__( if self.os_matchers: self.os = re2.Filter() for o in self.os_matchers: - self.os.Add(o.regex) + self.os.Add(fa_simplifier(o.regex)) self.os.Compile() else: self.os = DummyFilter() @@ -58,9 +59,9 @@ def __init__( # no pattern uses global flags, but since they're not # supported in JS that seems safe. if d.flags & re.IGNORECASE: - self.devices.Add("(?i)" + d.regex) + self.devices.Add("(?i)" + fa_simplifier(d.regex)) else: - self.devices.Add(d.regex) + self.devices.Add(fa_simplifier(d.regex)) self.devices.Compile() else: self.devices = DummyFilter() diff --git a/src/ua_parser/utils.py b/src/ua_parser/utils.py index f3afa48..ac11c5a 100644 --- a/src/ua_parser/utils.py +++ b/src/ua_parser/utils.py @@ -1,6 +1,9 @@ +import platform import re from typing import Match, Optional +IS_GRAAL: bool = platform.python_implementation() == "GraalVM" + def get(m: Match[str], idx: int) -> Optional[str]: return (m[idx] or None) if 0 < idx <= m.re.groups else None @@ -28,3 +31,33 @@ def replacer(repl: str, m: Match[str]) -> Optional[str]: return None return re.sub(r"\$(\d)", lambda n: get(m, int(n[1])) or "", repl).strip() or None + + +REPETITION_PATTERN = re.compile(r"\{(0|1)\s*,\s*\d{3,}\}") +CLASS_PATTERN = re.compile( + r""" +\[[^]]*\\(d|w)[^]]*\] +| +\\(d|w) +""", + re.VERBOSE, +) + + +def class_replacer(m: re.Match[str]) -> str: + d, w = ("0-9", "A-Za-z0-9_") if m[1] else ("[0-9]", "[A-Za-z0-9_]") + return m[0].replace(r"\d", d).replace(r"\w", w) + + +def fa_simplifier(pattern: str) -> str: + """uap-core makes significant use of large bounded repetitions, to + mitigate catastrophic backtracking. + + However this explodes the number of states (and thus graph size) + for finite automaton engines, which significantly increases their + memory use, and for those which use JITs it can exceed the JIT + threshold and force fallback to a slower engine (seems to be the + case for graal's TRegex). + """ + pattern = REPETITION_PATTERN.sub(lambda m: "*" if m[1] == "0" else "+", pattern) + return CLASS_PATTERN.sub(class_replacer, pattern) diff --git a/tests/test_fa_simplifier.py b/tests/test_fa_simplifier.py new file mode 100644 index 0000000..1c66050 --- /dev/null +++ b/tests/test_fa_simplifier.py @@ -0,0 +1,15 @@ +import pytest # type: ignore + +from ua_parser.utils import fa_simplifier + + +@pytest.mark.parametrize( + ("from_", "to"), + [ + (r"\d", "[0-9]"), + (r"[\d]", "[0-9]"), + (r"[\d\.]", r"[0-9\.]"), + ], +) +def test_classes(from_, to): + assert fa_simplifier(from_) == to