Skip to content

Commit

Permalink
Add finite-automaton simplifier, for re2 and graal
Browse files Browse the repository at this point in the history
As I've discovered a while ago, finite automaton engines are not very
fond of large bounded repetitions.

In re2 and regex, that mostly translates to increased memory
consumption (e.g. in their default modes, converting `.*` to
`.{0,500}` increases the pattern's size by 115x in re2 and 84x in
regex; if a capture is added on top then regex balloons to 219x).
There is also a performance impact, but it's high single digits to
low double digits, in regex at least (didn't test re2).

However as it turns out Graal uses a JIT-ed DFA, and it *really*
doesn't like these patterns: it spends a lot of time JIT-compiling
them (this is apparently the source of the extra 300% CPU use I could
observe on what are purely single-threaded workloads, the JIT
desperately trying to optimise regexes) with no gain in
performance. Down-converting the regexes back to the sensible
unbounded form increases performance by ~25%, though it doesn't seem
to impact memory use...

So... do that: `fa_simplifier` is the same idea as
ua-parser/uap-rust@29b9195 but from
the Python side, and applied to graal and re2 (not regex because it
does that internally as linked above).

Also switch Graal over to the lazy builtins, it kinda spreads the cost
but it seems stupid to compile the regexes only to immediately swap
(fa_simplifier) and recompile them... so don't do that, especially as
I couldn't be arsed to make the replacement conditional (so every
eager regex is recompiled, even though only those which actually got
modified by `fa_simplifier` need it...).

Fixes #228
  • Loading branch information
masklinn committed Oct 29, 2024
1 parent 1358e75 commit 6fb7b58
Show file tree
Hide file tree
Showing 6 changed files with 82 additions and 9 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ module = [
"test_core",
"test_caches",
"test_parsers_basics",
"test_fa_simplifier",
]

#check_untyped_defs = false
Expand Down
10 changes: 6 additions & 4 deletions src/ua_parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
UserAgent,
)
from .loaders import load_builtins, load_lazy_builtins
from .utils import IS_GRAAL

Re2Resolver: Optional[Callable[[Matchers], Resolver]] = None
if importlib.util.find_spec("re2"):
Expand Down Expand Up @@ -132,10 +133,11 @@ def parse_device(self: Resolver, ua: str) -> Optional[Device]:
def __getattr__(name: str) -> Parser:
    """Lazily create the module-level ``parser`` attribute on first access.

    Only the name ``"parser"`` is handled. The lazy builtin matchers are
    used when an re2 resolver is available or when running on Graal, and
    the eagerly compiled builtins are used otherwise.

    :param name: the module attribute being looked up.
    :returns: the default :class:`Parser` built from the builtin matchers.
    :raises AttributeError: for any attribute other than ``parser``.
    """
    global parser
    if name == "parser":
        if Re2Resolver or IS_GRAAL:
            matchers = load_lazy_builtins()
        else:
            matchers = load_builtins()
        # Bind the module global so that later lookups of ``parser`` find
        # it directly instead of rebuilding a parser on every access.
        parser = Parser.from_matchers(matchers)
        return parser
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


Expand Down
23 changes: 22 additions & 1 deletion src/ua_parser/basic.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
__all__ = ["Resolver"]

import re
from itertools import chain
from operator import methodcaller
from typing import List
from typing import Any, List

from .core import (
Device,
Expand All @@ -12,6 +14,7 @@
PartialResult,
UserAgent,
)
from .utils import IS_GRAAL, fa_simplifier


class Resolver:
Expand All @@ -30,6 +33,24 @@ def __init__(
matchers: Matchers,
) -> None:
self.user_agent_matchers, self.os_matchers, self.device_matchers = matchers
if IS_GRAAL:
matcher: Any
kind = next(
(
"eager" if hasattr(type(m), "regex") else "lazy"
for m in chain.from_iterable(matchers)
),
None,
)
if kind == "eager":
for matcher in chain.from_iterable(matchers):
matcher.pattern = re.compile(
fa_simplifier(matcher.pattern.pattern),
flags=matcher.pattern.flags,
)
elif kind == "lazy":
for matcher in chain.from_iterable(matchers):
matcher.regex = fa_simplifier(matcher.pattern.pattern)

def __call__(self, ua: str, domains: Domain, /) -> PartialResult:
parse = methodcaller("__call__", ua)
Expand Down
9 changes: 5 additions & 4 deletions src/ua_parser/re2.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
PartialResult,
UserAgent,
)
from .utils import fa_simplifier


class DummyFilter:
Expand All @@ -38,15 +39,15 @@ def __init__(
if self.user_agent_matchers:
self.ua = re2.Filter()
for u in self.user_agent_matchers:
self.ua.Add(u.regex)
self.ua.Add(fa_simplifier(u.regex))
self.ua.Compile()
else:
self.ua = DummyFilter()

if self.os_matchers:
self.os = re2.Filter()
for o in self.os_matchers:
self.os.Add(o.regex)
self.os.Add(fa_simplifier(o.regex))
self.os.Compile()
else:
self.os = DummyFilter()
Expand All @@ -58,9 +59,9 @@ def __init__(
# no pattern uses global flags, but since they're not
# supported in JS that seems safe.
if d.flags & re.IGNORECASE:
self.devices.Add("(?i)" + d.regex)
self.devices.Add("(?i)" + fa_simplifier(d.regex))
else:
self.devices.Add(d.regex)
self.devices.Add(fa_simplifier(d.regex))
self.devices.Compile()
else:
self.devices = DummyFilter()
Expand Down
33 changes: 33 additions & 0 deletions src/ua_parser/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import platform
import re
from typing import Match, Optional

# True when ``platform.python_implementation()`` reports "GraalVM".
# Imported by other modules of the package to enable Graal-specific handling.
IS_GRAAL: bool = platform.python_implementation() == "GraalVM"


def get(m: Match[str], idx: int) -> Optional[str]:
    """Return capture group ``idx`` of ``m``, or ``None`` if unavailable.

    ``None`` is returned when ``idx`` is not a valid group number for the
    match's pattern (below 1 or above the pattern's group count), and also
    when the group did not participate in the match or matched the empty
    string.
    """
    if idx <= 0 or idx > m.re.groups:
        return None
    captured = m[idx]
    return captured if captured else None
Expand Down Expand Up @@ -28,3 +31,33 @@ def replacer(repl: str, m: Match[str]) -> Optional[str]:
return None

return re.sub(r"\$(\d)", lambda n: get(m, int(n[1])) or "", repl).strip() or None


# Matches a bounded repetition whose lower bound is 0 or 1 and whose upper
# bound has at least three digits (e.g. ``{0,500}``); group 1 captures the
# lower bound.
REPETITION_PATTERN = re.compile(r"\{(0|1)\s*,\s*\d{3,}\}")
# Matches either a bracketed character class containing ``\d`` or ``\w``
# (group 1 is set in that case), or a bare ``\d`` / ``\w`` (group 2 is set).
# Compiled in verbose mode, so the whitespace inside the pattern is ignored.
CLASS_PATTERN = re.compile(
r"""
\[[^]]*\\(d|w)[^]]*\]
|
\\(d|w)
""",
re.VERBOSE,
)


def class_replacer(m: re.Match[str]) -> str:
    """Expand ``\\d`` and ``\\w`` in the matched text to explicit ASCII ranges.

    When group 1 of ``m`` is set, the bare range text is substituted
    (``0-9``, ``A-Za-z0-9_``); otherwise the substitution is a complete
    bracketed class (``[0-9]``, ``[A-Za-z0-9_]``).
    """
    if m[1]:
        digits, word = "0-9", "A-Za-z0-9_"
    else:
        digits, word = "[0-9]", "[A-Za-z0-9_]"
    expanded = m[0].replace(r"\d", digits)
    return expanded.replace(r"\w", word)


def fa_simplifier(pattern: str) -> str:
    """Rewrite ``pattern`` to be cheaper for finite-automaton regex engines.

    uap-core makes significant use of large bounded repetitions, to
    mitigate catastrophic backtracking. However this explodes the number
    of states (and thus graph size) for finite automaton engines, which
    significantly increases their memory use, and for those which use
    JITs it can exceed the JIT threshold and force fallback to a slower
    engine (seems to be the case for graal's TRegex).

    Two rewrites are applied in order: every match of
    ``REPETITION_PATTERN`` becomes ``*`` (lower bound 0) or ``+`` (lower
    bound 1), then every match of ``CLASS_PATTERN`` is passed through
    ``class_replacer``.
    """

    def unbound(rep: re.Match[str]) -> str:
        # Group 1 holds the lower bound of the repetition.
        if rep[1] == "0":
            return "*"
        return "+"

    without_bounds = REPETITION_PATTERN.sub(unbound, pattern)
    return CLASS_PATTERN.sub(class_replacer, without_bounds)
15 changes: 15 additions & 0 deletions tests/test_fa_simplifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import pytest # type: ignore

from ua_parser.utils import fa_simplifier


@pytest.mark.parametrize(
    ("from_", "to"),
    [
        (r"\d", "[0-9]"),
        (r"[\d]", "[0-9]"),
        (r"[\d\.]", r"[0-9\.]"),
    ],
)
def test_classes(from_, to):
    """Digit shorthands are expanded, both bare and inside a class."""
    simplified = fa_simplifier(from_)
    assert simplified == to

0 comments on commit 6fb7b58

Please sign in to comment.