From df7ffe9d6e956ab7ef032183290b332344d946fa Mon Sep 17 00:00:00 2001
From: masklinn <github.com@masklinn.net>
Date: Thu, 2 Nov 2023 21:45:02 +0100
Subject: [PATCH] Add an re2-based parser

Requires splitting out some of the testenvs, as re2 is not available
for pypy at all, and not yet for 3.12.

Only uses re2.Set which turns out to be not great, at least according
to `pytest --durations` on 3.11:

- re2 is sometimes faster for UA tests
  - `pgts_browser_list.yaml` goes from 2.5s to 1.5s
  - `firefox_user_agent_strings.yaml` goes from 0.05s to 0.04s (not
    really significant)
  - though `test_ua.yaml` goes from 0.18s to 0.65s
- re2 is *way* slower for devices tests
  - `test_device.yaml` goes from 2.5s to 8s

Obviously tests might not be representative at all, implementing a
proper benchmark on a real-life test-set (#163) would likely provide
better information.

It's possible that `FilteredRE2` would offer better
performance, *but* it requires additional memory and more importantly
it requires a fast literal string matcher e.g. a fast implementation
of Aho-Corasick, or possibly Hyperscan's Teddy (via
[python-hyperscan][5]?). [According to burntsushi, Commentz-Walter is
not great in practice][1], at least as you increase the number of
patterns, so that one looks like a dead end.

Either way this would likely be an *additional* dependency to make it
usable, although there seems to be [a well-maintained Python version
with impressive performance (for pure Python)][2], [a native
module][3], and [a wrapper for burntsushi's Rust implementation][4]
which claims even better performance than the native module.

Linked to (but probably can't be argued to fix) #149.

[1]: https://news.ycombinator.com/item?id=26913349
[2]: https://github.com/abusix/ahocorapy
[3]: https://github.com/WojciechMula/pyahocorasick/
[4]: https://github.com/G-Research/ahocorasick_rs/
[5]: https://python-hyperscan.readthedocs.io
---
 pyproject.toml       |  2 +-
 src/ua_parser/re2.py | 80 ++++++++++++++++++++++++++++++++++++++++++++
 tests/test_core.py   |  4 +++
 tox.ini              |  6 ++++
 4 files changed, 91 insertions(+), 1 deletion(-)
 create mode 100644 src/ua_parser/re2.py

diff --git a/pyproject.toml b/pyproject.toml
index b42d432..9acef8e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,7 @@ version = "1.0.0a1"
 readme = "README.rst"
 requires-python = ">=3.8"
 dependencies = []
-optional-dependencies = { yaml = ["PyYaml"] }
+optional-dependencies = { yaml = ["PyYaml"], re2 = ["google-re2"] }
 
 license = {text = "Apache 2.0"}
 urls = {repository = "https://github.com/ua-parser/uap-python"}
diff --git a/src/ua_parser/re2.py b/src/ua_parser/re2.py
new file mode 100644
index 0000000..fb23a9a
--- /dev/null
+++ b/src/ua_parser/re2.py
@@ -0,0 +1,80 @@
+from __future__ import annotations
+
+import io
+import os
+import re
+from typing import List, Tuple, Union
+
+import re2  # type: ignore
+
+from .core import (
+    Parser as BaseParser,
+    PartialParseResult,
+    Device,
+    Domain,
+    OS,
+    UserAgent,
+    Matchers,
+    UserAgentMatcher,
+    OSMatcher,
+    DeviceMatcher,
+)
+
+
+RE_OPTS = re2.Options()
+# As of uap-core 0.18 the devices set needs at least 28MB, more than
+# the default budget of 8: raise it to 32MB.
+RE_OPTS.max_mem = 32 * 1024 * 1024
+# re2 might write errors directly to stdout? Not great, suppress.
+RE_OPTS.log_errors = False
+
+
+class Parser(BaseParser):
+    """Parser which prefilters every domain through an ``re2.Set``."""
+
+    ua: re2.Set
+    user_agent_parsers: List[UserAgentMatcher]
+    os: re2.Set
+    os_parsers: List[OSMatcher]
+    devices: re2.Set
+    device_parsers: List[DeviceMatcher]
+
+    def __init__(self, matchers: Matchers) -> None:
+        """Compile one search set per matcher list in ``matchers``."""
+        self.user_agent_parsers, self.os_parsers, self.device_parsers = matchers
+
+        self.ua = self._new_set([m.regex.pattern for m in self.user_agent_parsers])
+        self.os = self._new_set([m.regex.pattern for m in self.os_parsers])
+        # Prepend the i global flag if IGNORECASE is set. Assumes no
+        # pattern uses global flags, but since they're not supported
+        # in JS that seems safe.
+        self.devices = self._new_set(
+            [
+                ("(?i)" if m.regex.flags & re.IGNORECASE else "") + m.regex.pattern
+                for m in self.device_parsers
+            ]
+        )
+
+    @staticmethod
+    def _new_set(patterns: List[str]) -> re2.Set:
+        """Return a compiled search set holding ``patterns`` in order."""
+        compiled = re2.Set.SearchSet(RE_OPTS)
+        for pattern in patterns:
+            compiled.Add(pattern)
+        compiled.Compile()
+        return compiled
+
+    def __call__(self, ua: str, domains: Domain, /) -> PartialParseResult:
+        """Parse ``ua`` for each of the requested ``domains``."""
+        # Match() results index into the matcher lists: the lowest index is
+        # the first matcher in list order, which is then applied to ``ua``.
+        user_agent = os_ = device = None
+        if Domain.USER_AGENT in domains and (hits := self.ua.Match(ua)):
+            user_agent = self.user_agent_parsers[min(hits)](ua)
+        if Domain.OS in domains and (hits := self.os.Match(ua)):
+            os_ = self.os_parsers[min(hits)](ua)
+        if Domain.DEVICE in domains and (hits := self.devices.Match(ua)):
+            device = self.device_parsers[min(hits)](ua)
+        return PartialParseResult(
+            domains=domains, string=ua, user_agent=user_agent, os=os_, device=device
+        )
diff --git a/tests/test_core.py b/tests/test_core.py
index 76743c8..16eb43f 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -1,5 +1,6 @@
 """Tests UAP-Python using the UAP-core test suite
 """
+import contextlib
 import dataclasses
 import logging
 import pathlib
@@ -36,7 +37,10 @@
 PARSERS = [
     pytest.param(BasicParser(load_builtins()), id="basic"),
 ]
+with contextlib.suppress(ImportError):
+    from ua_parser import re2
 
+    PARSERS.append(pytest.param(re2.Parser(load_builtins()), id="re2"))
 
 UA_FIELDS = {f.name for f in dataclasses.fields(UserAgent)}
 
diff --git a/tox.ini b/tox.ini
index ca73951..094cf67 100644
--- a/tox.ini
+++ b/tox.ini
@@ -19,9 +19,15 @@ wheel_build_env = .pkg
 deps =
      pytest
      pyyaml
+     google-re2
 commands =
     pytest -Werror --doctest-glob="*.rst" {posargs}
 
+[testenv:pypy3.{8,9,10},py312]
+deps =
+     pytest
+     pyyaml
+
 [testenv:flake8]
 package = skip
 deps = flake8