diff --git a/pyproject.toml b/pyproject.toml
index b42d432..9acef8e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,7 @@ version = "1.0.0a1"
 readme = "README.rst"
 requires-python = ">=3.8"
 dependencies = []
-optional-dependencies = { yaml = ["PyYaml"] }
+optional-dependencies = { yaml = ["PyYaml"], re2 = ["google-re2"] }
 
 license = {text = "Apache 2.0"}
 urls = {repository = "https://github.com/ua-parser/uap-python"}
diff --git a/src/ua_parser/re2.py b/src/ua_parser/re2.py
new file mode 100644
index 0000000..fb23a9a
--- /dev/null
+++ b/src/ua_parser/re2.py
@@ -0,0 +1,108 @@
+"""RE2-backed parser for ua-parser.
+
+Uses ``re2.Set`` (from the optional ``google-re2`` dependency) to find
+which matchers apply to a user agent string in one pass per domain,
+then delegates extraction to the first (lowest-index) matcher.
+"""
+from __future__ import annotations
+
+import io
+import os
+import re
+from typing import Iterable, List, Tuple, Union
+
+import re2  # type: ignore
+
+from .core import (
+    Parser as BaseParser,
+    PartialParseResult,
+    Device,
+    Domain,
+    OS,
+    UserAgent,
+    Matchers,
+    UserAgentMatcher,
+    OSMatcher,
+    DeviceMatcher,
+)
+
+
+RE_OPTS = re2.Options()
+# as of uap-core 0.18, the devices set needs at least 28MB (up from
+# the default 8), set to 32 (MiB)
+RE_OPTS.max_mem = 32 << 20
+# might write directly to stdout? not great, suppress
+RE_OPTS.log_errors = False
+
+
+def _compile_set(
+    matchers: Iterable[Union[UserAgentMatcher, OSMatcher, DeviceMatcher]],
+) -> re2.Set:
+    """Build and compile an unanchored ``re2.Set`` from ``matchers``.
+
+    Patterns are added in iteration order, so an index reported by
+    ``Match`` is the index of the corresponding matcher.
+    """
+    regex_set = re2.Set.SearchSet(RE_OPTS)
+    for matcher in matchers:
+        pattern = matcher.regex.pattern
+        # Prepend the i global flag if IGNORECASE is set. Assumes
+        # no pattern uses global flags, but since they're not
+        # supported in JS that seems safe.
+        if matcher.regex.flags & re.IGNORECASE:
+            pattern = "(?i)" + pattern
+        regex_set.Add(pattern)
+    regex_set.Compile()
+    return regex_set
+
+
+class Parser(BaseParser):
+    """Parser prefiltering each domain's matchers through an ``re2.Set``."""
+
+    ua: re2.Set
+    user_agent_parsers: List[UserAgentMatcher]
+    os: re2.Set
+    os_parsers: List[OSMatcher]
+    devices: re2.Set
+    device_parsers: List[DeviceMatcher]
+
+    def __init__(
+        self,
+        matchers: Matchers,
+    ) -> None:
+        """Compile one regex set per domain from ``matchers``.
+
+        :param matchers: the user agent, OS and device matchers, in
+                         that order.
+        """
+        self.user_agent_parsers, self.os_parsers, self.device_parsers = matchers
+
+        # The case-insensitivity flag is honoured for every domain,
+        # not only devices, so the sets always agree with the
+        # matchers' own compiled regexes.
+        self.ua = _compile_set(self.user_agent_parsers)
+        self.os = _compile_set(self.os_parsers)
+        self.devices = _compile_set(self.device_parsers)
+
+    def __call__(self, ua: str, domains: Domain, /) -> PartialParseResult:
+        """Parse ``ua`` for the requested ``domains``.
+
+        Domains which were not requested, or for which no pattern of
+        the set matches, are left as ``None`` in the result.
+        """
+        # ``os_`` rather than ``os`` so the imported module is not shadowed.
+        user_agent = os_ = device = None
+        if Domain.USER_AGENT in domains:
+            if matches := self.ua.Match(ua):
+                # the lowest matching index is the first matcher in
+                # rule order
+                user_agent = self.user_agent_parsers[min(matches)](ua)
+        if Domain.OS in domains:
+            if matches := self.os.Match(ua):
+                os_ = self.os_parsers[min(matches)](ua)
+        if Domain.DEVICE in domains:
+            if matches := self.devices.Match(ua):
+                device = self.device_parsers[min(matches)](ua)
+        return PartialParseResult(
+            domains=domains, string=ua, user_agent=user_agent, os=os_, device=device
+        )
diff --git a/tests/test_core.py b/tests/test_core.py
index 76743c8..16eb43f 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -1,5 +1,6 @@
 """Tests UAP-Python using the UAP-core test suite
 """
+import contextlib
 import dataclasses
 import logging
 import pathlib
@@ -36,7 +37,10 @@ PARSERS = [
     pytest.param(BasicParser(load_builtins()), id="basic"),
 ]
 
+with contextlib.suppress(ImportError):
+    from ua_parser import re2
 
+    PARSERS.append(pytest.param(re2.Parser(load_builtins()), id="re2"))
 UA_FIELDS = {f.name for f in dataclasses.fields(UserAgent)}
 
 
diff --git a/tox.ini b/tox.ini
index ca73951..094cf67 100644
--- a/tox.ini
+++ b/tox.ini
@@ -19,9 +19,15 @@ wheel_build_env = .pkg
 deps =
     pytest
     pyyaml
+    google-re2
 commands =
     pytest -Werror --doctest-glob="*.rst" {posargs}
 
+[testenv:pypy3.{8,9,10},py312]
+deps =
+    pytest
+    pyyaml
+
 [testenv:flake8]
 package = skip
 deps = flake8