Skip to content

Commit

Permalink
Add an re2-based parser
Browse files Browse the repository at this point in the history
Requires splitting out some of the testenvs, as re2 is not available
for pypy at all, and not yet for 3.12.

Only uses re2.Set which turns out to be not great, at least according
to `pytest --durations` on 3.11:

- re2 is sometimes faster for UA tests
  - `pgts_browser_list.yaml` goes from 2.5s to 1.5
  - `firefox_user_agent_strings.yaml` goes from 0.05 to 0.04 (not
    really significant)
  - though `test_ua.yaml` goes from 0.18 to 0.65
- re2 is *way* slower for devices tests
  - `test_device.yaml` goes from 2.5 to 8s

Obviously tests might not be representative at all, implementing a
proper benchmark on a real-life test-set (#163) would likely provide
better information.

It's possible that `FilteredRE2` would offer better
performances, *but* it requires additional memory and more importantly
it requires a fast literal string matcher e.g. a fast implementation
of Aho-Corasick, or possibly Hyperscan's Teddy (via
[python-hyperscan][5]?). [According to burntsushi commentz-walter is
not great in practice][1], at least as you increase the number of
patterns, so that one looks like a dead end.

Either way this would likely be an *additional* dependency to make it
usable, although there seems to be [a well-maintained Python version
with impressive performances (for pure python)][2], [a native
module][3], and [a wrapper for burntsushi's rust implementation][4]
which claims even better performances than the native module.

Linked to (but probably can't be argued to fix) #149.

[1]: https://news.ycombinator.com/item?id=26913349
[2]: https://github.com/abusix/ahocorapy
[3]: https://github.com/WojciechMula/pyahocorasick/
[4]: https://github.com/G-Research/ahocorasick_rs/
[5]: https://python-hyperscan.readthedocs.io
  • Loading branch information
masklinn committed Nov 2, 2023
1 parent dbcee8c commit df7ffe9
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 1 deletion.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ version = "1.0.0a1"
readme = "README.rst"
requires-python = ">=3.8"
dependencies = []
optional-dependencies = { yaml = ["PyYaml"] }
optional-dependencies = { yaml = ["PyYaml"], re2 = ["google-re2"] }

license = {text = "Apache 2.0"}
urls = {repository = "https://github.com/ua-parser/uap-python"}
Expand Down
80 changes: 80 additions & 0 deletions src/ua_parser/re2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
from __future__ import annotations

import io
import os
import re
from typing import List, Tuple, Union

import re2 # type: ignore

from .core import (
Parser as BaseParser,
PartialParseResult,
Device,
Domain,
OS,
UserAgent,
Matchers,
UserAgentMatcher,
OSMatcher,
DeviceMatcher,
)


# Shared re2 options used for every pattern set built by this module.
RE_OPTS = re2.Options()
# Memory budget for a compiled set: as of uap-core 0.18 the devices set
# needs at least 28MB (the default budget is 8MB), so allow 32MB.
RE_OPTS.max_mem = 32 << 20
# re2 might otherwise report errors by writing directly to the
# process output, which is not great for a library: suppress it.
RE_OPTS.log_errors = False


class Parser(BaseParser):
    """Parser which pre-filters the matchers of each domain using ``re2.Set``.

    For each domain (user agent, OS, device) every matcher's regex
    pattern is added to a single ``re2.Set``. On a call, the set is
    matched against the input and the matcher at the smallest
    reported index is then invoked on the string to produce the
    domain's result.
    """

    # One compiled set per domain, alongside the matchers it was built
    # from (a set's match results are used as indices into that list).
    ua: re2.Set
    user_agent_parsers: List[UserAgentMatcher]
    os: re2.Set
    os_parsers: List[OSMatcher]
    devices: re2.Set
    device_parsers: List[DeviceMatcher]

    def __init__(
        self,
        matchers: Matchers,
    ) -> None:
        """Build and compile one ``re2.Set`` per domain.

        :param matchers: a triplet of (user agent matchers, OS
                         matchers, device matchers), each a list in
                         priority order.
        """
        self.user_agent_parsers, self.os_parsers, self.device_parsers = matchers

        self.ua = ua_set = re2.Set.SearchSet(RE_OPTS)
        for ua_matcher in self.user_agent_parsers:
            ua_set.Add(ua_matcher.regex.pattern)
        ua_set.Compile()

        self.os = os_set = re2.Set.SearchSet(RE_OPTS)
        for os_matcher in self.os_parsers:
            os_set.Add(os_matcher.regex.pattern)
        os_set.Compile()

        self.devices = device_set = re2.Set.SearchSet(RE_OPTS)
        for device_matcher in self.device_parsers:
            compiled = device_matcher.regex
            # Only the pattern text is handed to re2, so a
            # case-insensitive matcher has to carry its flag inline.
            # This assumes no pattern uses global flags itself, which
            # seems safe since they are not supported in JS.
            prefix = "(?i)" if compiled.flags & re.IGNORECASE else ""
            device_set.Add(prefix + compiled.pattern)
        device_set.Compile()

    def __call__(self, ua: str, domains: Domain, /) -> PartialParseResult:
        """Parse ``ua`` for the requested ``domains``.

        Domains which were not requested, or for which no pattern of
        the corresponding set matches, are left as ``None`` in the
        result.

        :param ua: the user agent string to parse
        :param domains: the set of domains to resolve
        :returns: a partial result holding the requested domains, the
                  input string, and whatever was matched
        """
        ua_result = os_result = device_result = None

        if Domain.USER_AGENT in domains:
            hits = self.ua.Match(ua)
            if hits:
                # smallest index = earliest matcher in the list
                ua_result = self.user_agent_parsers[min(hits)](ua)

        if Domain.OS in domains:
            hits = self.os.Match(ua)
            if hits:
                os_result = self.os_parsers[min(hits)](ua)

        if Domain.DEVICE in domains:
            hits = self.devices.Match(ua)
            if hits:
                device_result = self.device_parsers[min(hits)](ua)

        return PartialParseResult(
            domains=domains,
            string=ua,
            user_agent=ua_result,
            os=os_result,
            device=device_result,
        )
4 changes: 4 additions & 0 deletions tests/test_core.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Tests UAP-Python using the UAP-core test suite
"""
import contextlib
import dataclasses
import logging
import pathlib
Expand Down Expand Up @@ -36,7 +37,10 @@
PARSERS = [
pytest.param(BasicParser(load_builtins()), id="basic"),
]
with contextlib.suppress(ImportError):
from ua_parser import re2

PARSERS.append(pytest.param(re2.Parser(load_builtins()), id="re2"))

UA_FIELDS = {f.name for f in dataclasses.fields(UserAgent)}

Expand Down
6 changes: 6 additions & 0 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,15 @@ wheel_build_env = .pkg
deps =
pytest
pyyaml
google-re2
commands =
pytest -Werror --doctest-glob="*.rst" {posargs}

[testenv:pypy3.{8,9,10},py312]
deps =
pytest
pyyaml

[testenv:flake8]
package = skip
deps = flake8
Expand Down

0 comments on commit df7ffe9

Please sign in to comment.