From df7ffe9d6e956ab7ef032183290b332344d946fa Mon Sep 17 00:00:00 2001 From: masklinn Date: Thu, 2 Nov 2023 21:45:02 +0100 Subject: [PATCH] Add an re2-based parser Requires splitting out some of the testenvs, as re2 is not available for pypy at all, and not yet for 3.12. Only uses re2.Set which turns out to be not great, at least according to `pytest --durations` on 3.11: - re2 is sometimes faster for UA tests - `pgts_browser_list.yaml` goes from 2.5s to 1.5s - `firefox_user_agent_strings.yaml` goes from 0.05s to 0.04s (not really significant) - though `test_ua.yaml` goes from 0.18s to 0.65s - re2 is *way* slower for devices tests - `test_device.yaml` goes from 2.5s to 8s Obviously tests might not be representative at all; implementing a proper benchmark on a real-life test-set (#163) would likely provide better information. It's possible that `FilteredRE2` would offer better performance, *but* it requires additional memory and more importantly it requires a fast literal string matcher e.g. a fast implementation of Aho-Corasick, or possibly Hyperscan's Teddy (via [python-hyperscan][5]?). [According to burntsushi Commentz-Walter is not great in practice][1], at least as you increase the number of patterns, so that one looks like a dead end. Either way this would likely be an *additional* dependency to make it usable, although there seems to be [a well-maintained Python version with impressive performance (for pure Python)][2], [a native module][3], and [a wrapper for burntsushi's Rust implementation][4] which claims even better performance than the native module. Linked to (but probably can't be argued to fix) #149. 
[1]: https://news.ycombinator.com/item?id=26913349 [2]: https://github.com/abusix/ahocorapy [3]: https://github.com/WojciechMula/pyahocorasick/ [4]: https://github.com/G-Research/ahocorasick_rs/ [5]: https://python-hyperscan.readthedocs.io --- pyproject.toml | 2 +- src/ua_parser/re2.py | 80 ++++++++++++++++++++++++++++++++++++++++++++ tests/test_core.py | 4 +++ tox.ini | 6 ++++ 4 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 src/ua_parser/re2.py diff --git a/pyproject.toml b/pyproject.toml index b42d432..9acef8e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ version = "1.0.0a1" readme = "README.rst" requires-python = ">=3.8" dependencies = [] -optional-dependencies = { yaml = ["PyYaml"] } +optional-dependencies = { yaml = ["PyYaml"], re2 = ["google-re2"] } license = {text = "Apache 2.0"} urls = {repository = "https://github.com/ua-parser/uap-python"} diff --git a/src/ua_parser/re2.py b/src/ua_parser/re2.py new file mode 100644 index 0000000..fb23a9a --- /dev/null +++ b/src/ua_parser/re2.py @@ -0,0 +1,80 @@ +from __future__ import annotations + +import io +import os +import re +from typing import List, Tuple, Union + +import re2 # type: ignore + +from .core import ( + Parser as BaseParser, + PartialParseResult, + Device, + Domain, + OS, + UserAgent, + Matchers, + UserAgentMatcher, + OSMatcher, + DeviceMatcher, +) + + +RE_OPTS = re2.Options() +# as of uap-core 0.18, the devices set needs at least 28MB (up from +# the default 8), set to 32 +RE_OPTS.max_mem = 8 << 22 +# might write directly to stdout? 
not great, suppress +RE_OPTS.log_errors = False + + +class Parser(BaseParser): + ua: re2.Set + user_agent_parsers: List[UserAgentMatcher] + os: re2.Set + os_parsers: List[OSMatcher] + devices: re2.Set + device_parsers: List[DeviceMatcher] + + def __init__( + self, + matchers: Matchers, + ) -> None: + self.user_agent_parsers, self.os_parsers, self.device_parsers = matchers + + self.ua = re2.Set.SearchSet(RE_OPTS) + for u in self.user_agent_parsers: + self.ua.Add(u.regex.pattern) + self.ua.Compile() + + self.os = re2.Set.SearchSet(RE_OPTS) + for o in self.os_parsers: + self.os.Add(o.regex.pattern) + self.os.Compile() + + self.devices = re2.Set.SearchSet(RE_OPTS) + for d in self.device_parsers: + # Prepend the i global flag if IGNORECASE is set. Assumes + # no pattern uses global flags, but since they're not + # supported in JS that seems safe. + if d.regex.flags & re.IGNORECASE: + self.devices.Add("(?i)" + d.regex.pattern) + else: + self.devices.Add(d.regex.pattern) + self.devices.Compile() + + def __call__(self, ua: str, domains: Domain, /) -> PartialParseResult: + user_agent = os = device = None + if Domain.USER_AGENT in domains: + if matches := self.ua.Match(ua): + user_agent = self.user_agent_parsers[min(matches)](ua) + if Domain.OS in domains: + if matches := self.os.Match(ua): + os = self.os_parsers[min(matches)](ua) + if Domain.DEVICE in domains: + if matches := self.devices.Match(ua): + device = self.device_parsers[min(matches)](ua) + return PartialParseResult( + domains=domains, string=ua, user_agent=user_agent, os=os, device=device + ) diff --git a/tests/test_core.py b/tests/test_core.py index 76743c8..16eb43f 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,5 +1,6 @@ """Tests UAP-Python using the UAP-core test suite """ +import contextlib import dataclasses import logging import pathlib @@ -36,7 +37,10 @@ PARSERS = [ pytest.param(BasicParser(load_builtins()), id="basic"), ] +with contextlib.suppress(ImportError): + from ua_parser import re2 + 
PARSERS.append(pytest.param(re2.Parser(load_builtins()), id="re2")) UA_FIELDS = {f.name for f in dataclasses.fields(UserAgent)} diff --git a/tox.ini b/tox.ini index ca73951..094cf67 100644 --- a/tox.ini +++ b/tox.ini @@ -19,9 +19,15 @@ wheel_build_env = .pkg deps = pytest pyyaml + google-re2 commands = pytest -Werror --doctest-glob="*.rst" {posargs} +[testenv:pypy3.{8,9,10},py312] +deps = + pytest + pyyaml + [testenv:flake8] package = skip deps = flake8