Add an re2-based parser

Requires splitting out some of the testenvs, as re2 is not available for pypy at all, and not yet for 3.12.

Uses `re2.Filter`, which unlike the C++ `FilteredRE2` bundles prefiltering, using an `re2.Set`, so it is likely less efficient than providing one's own (e.g. aho-corasick), but avoids having to do that.

At first glance, according to pytest's `--durations 0`, this is quite successful (unlike using `re2.Set`, which was more of a mixed bag):

```
2.54s call tests/test_core.py::test_devices[test_device.yaml-basic]
2.51s call tests/test_core.py::test_ua[pgts_browser_list.yaml-basic]
2.48s call tests/test_legacy.py::TestParse::testPGTSStrings
2.43s call tests/test_legacy.py::TestParse::testStringsDevice
0.95s call tests/test_core.py::test_devices[test_device.yaml-re2]
0.55s call tests/test_core.py::test_ua[pgts_browser_list.yaml-re2]
0.18s call tests/test_core.py::test_ua[test_ua.yaml-basic]
0.16s call tests/test_legacy.py::TestParse::testBrowserscopeStrings
0.10s call tests/test_core.py::test_ua[test_ua.yaml-re2]
```

While the "basic" parser for the new API is slightly slower than the legacy API (browserscope does use test_ua.yaml so that matches), the re2 parser is significantly faster than both:

- 60% faster on test_device.yaml (~2.5s -> 1s)
- 80% faster on pgts (2.5s -> 0.5s)
- 40% faster on browserscope (0.16 -> 0.1)

This is very encouraging, although the memory consumption has not been checked (yet).

Fixes #149, kind-of
ua-parser · Nov 3, 2023 · 7ff511e · 7ff511e
1 parent 7f90746
commit 7ff511e
Show file tree

Hide file tree

Showing 4 changed files with 86 additions and 1 deletion.
diff --git a/pyproject.toml b/pyproject.toml
@@ -9,7 +9,7 @@ version = "1.0.0a1"
 readme = "README.rst"
 requires-python = ">=3.8"
 dependencies = []
-optional-dependencies = { yaml = ["PyYaml"] }
+optional-dependencies = { yaml = ["PyYaml"], re2 = ["google-re2"] }
 
 license = {text = "Apache 2.0"}
 urls = {repository = "https://github.com/ua-parser/uap-python"}

diff --git a/src/ua_parser/re2.py b/src/ua_parser/re2.py
@@ -0,0 +1,75 @@
+from __future__ import annotations
+
+import io
+import os
+import re
+from typing import List, Tuple, Union
+
+import re2  # type: ignore
+
+from .core import (
+    Parser as AbstractParser,
+    PartialParseResult,
+    Device,
+    Domain,
+    OS,
+    UserAgent,
+    Matchers,
+    UserAgentMatcher,
+    OSMatcher,
+    DeviceMatcher,
+)
+
+
+class Parser(AbstractParser):
+    """Parser which uses ``re2.Filter`` to select the matcher to apply.
+
+    One filter is built per domain (user agent, OS, device) from the
+    patterns of the corresponding matchers. On a call, the filter
+    reports which patterns match the string, and the matcher with the
+    lowest index among them is then applied to that string.
+    """
+
+    # Filter built from the user-agent patterns, and the matchers its
+    # match indices are used to look up.
+    ua: re2.Filter
+    user_agent_parsers: List[UserAgentMatcher]
+    # Same pairing for the OS domain.
+    os: re2.Filter
+    os_parsers: List[OSMatcher]
+    # Same pairing for the device domain.
+    devices: re2.Filter
+    device_parsers: List[DeviceMatcher]
+
+    def __init__(
+        self,
+        matchers: Matchers,
+    ) -> None:
+        """Build and compile one filter per domain from ``matchers``.
+
+        ``matchers`` is unpacked into the user-agent, OS and device
+        matcher lists, in that order.
+        """
+        self.user_agent_parsers, self.os_parsers, self.device_parsers = matchers
+
+        # Patterns are added in list order: ``__call__`` indexes the
+        # matcher lists with the indices returned by ``Match``, so this
+        # relies on the filter numbering patterns in insertion order.
+        # NOTE(review): unlike the device patterns below, regex flags of
+        # user-agent and OS matchers are not forwarded to re2 --
+        # presumably those never set any; confirm.
+        self.ua = re2.Filter()
+        for u in self.user_agent_parsers:
+            self.ua.Add(u.regex.pattern)
+        self.ua.Compile()
+
+        self.os = re2.Filter()
+        for o in self.os_parsers:
+            self.os.Add(o.regex.pattern)
+        self.os.Compile()
+
+        self.devices = re2.Filter()
+        for d in self.device_parsers:
+            # Prepend the i global flag if IGNORECASE is set. Assumes
+            # no pattern uses global flags, but since they're not
+            # supported in JS that seems safe.
+            if d.regex.flags & re.IGNORECASE:
+                self.devices.Add("(?i)" + d.regex.pattern)
+            else:
+                self.devices.Add(d.regex.pattern)
+        self.devices.Compile()
+
+    def __call__(self, ua: str, domains: Domain, /) -> PartialParseResult:
+        """Parse ``ua`` for the requested ``domains``.
+
+        A domain which was not requested, or for which the filter
+        reports no match, is left as ``None`` in the returned
+        ``PartialParseResult``.
+        """
+        # Note: the local ``os`` shadows the module-level ``os`` import
+        # for the duration of this method.
+        user_agent = os = device = None
+        if Domain.USER_AGENT in domains:
+            if matches := self.ua.Match(ua):
+                # Set/Filter does not return the match in index order
+                # (position order?) so to fit UAP semantics we need to
+                # extract the first matching regex (lowest index).
+                user_agent = self.user_agent_parsers[min(matches)](ua)
+        if Domain.OS in domains:
+            if matches := self.os.Match(ua):
+                # Lowest index first, same as for user agents above.
+                os = self.os_parsers[min(matches)](ua)
+        if Domain.DEVICE in domains:
+            if matches := self.devices.Match(ua):
+                # Lowest index first, same as for user agents above.
+                device = self.device_parsers[min(matches)](ua)
+        return PartialParseResult(
+            domains=domains, string=ua, user_agent=user_agent, os=os, device=device
+        )
diff --git a/tests/test_core.py b/tests/test_core.py
@@ -1,5 +1,6 @@
 """Tests UAP-Python using the UAP-core test suite
 """
+import contextlib
 import dataclasses
 import logging
 import pathlib
@@ -36,7 +37,10 @@
 PARSERS = [
     pytest.param(BasicParser(load_builtins()), id="basic"),
 ]
+with contextlib.suppress(ImportError):
+    from ua_parser import re2
 
+    PARSERS.append(pytest.param(re2.Parser(load_builtins()), id="re2"))
 
 UA_FIELDS = {f.name for f in dataclasses.fields(UserAgent)}
 

diff --git a/tox.ini b/tox.ini
@@ -19,9 +19,15 @@ wheel_build_env = .pkg
 deps =
      pytest
      pyyaml
+     google-re2
 commands =
     pytest -Werror --doctest-glob="*.rst" {posargs}
 
+[testenv:pypy3.{8,9,10},py312]
+deps =
+     pytest
+     pyyaml
+
 [testenv:flake8]
 package = skip
 deps = flake8