Skip to content

Commit

Permalink
Add an re2-based parser
Browse files Browse the repository at this point in the history
Requires splitting out some of the testenvs, as re2 is not available
for pypy at all, and not yet for 3.12.

Uses `re2.Filter` which, unlike the C++ `FilteredRE2`, bundles the
prefiltering itself by way of an `re2.Set`. That is likely less efficient
than providing one's own prefilter (e.g. aho-corasick), but avoids having to do that.

At first glance according to pytest's `--durations 0` this is quite
successful (unlike using `re2.Set` which was more of a mixed bag):

```
2.54s call     tests/test_core.py::test_devices[test_device.yaml-basic]
2.51s call     tests/test_core.py::test_ua[pgts_browser_list.yaml-basic]
2.48s call     tests/test_legacy.py::TestParse::testPGTSStrings
2.43s call     tests/test_legacy.py::TestParse::testStringsDevice
0.95s call     tests/test_core.py::test_devices[test_device.yaml-re2]
0.55s call     tests/test_core.py::test_ua[pgts_browser_list.yaml-re2]
0.18s call     tests/test_core.py::test_ua[test_ua.yaml-basic]
0.16s call     tests/test_legacy.py::TestParse::testBrowserscopeStrings
0.10s call     tests/test_core.py::test_ua[test_ua.yaml-re2]
```

While the "basic" parser for the new API is slightly slower than the
legacy API (browserscope does use test_ua.yaml so that matches) the
re2 parser is significantly faster than both:

- 60% faster on test_device.yaml (~2.5s -> 1s)
- 80% faster on pgts (2.5s -> 0.5s)
- 40% faster on browserscope (0.16 -> 0.1)

This is very encouraging, although the memory consumption has not been
checked (yet).

Fixes #149, kind-of
  • Loading branch information
masklinn committed Nov 3, 2023
1 parent 7f90746 commit 7ff511e
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 1 deletion.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ version = "1.0.0a1"
readme = "README.rst"
requires-python = ">=3.8"
dependencies = []
optional-dependencies = { yaml = ["PyYaml"] }
optional-dependencies = { yaml = ["PyYaml"], re2 = ["google-re2"] }

license = {text = "Apache 2.0"}
urls = {repository = "https://github.com/ua-parser/uap-python"}
Expand Down
75 changes: 75 additions & 0 deletions src/ua_parser/re2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
from __future__ import annotations

import io
import os
import re
from typing import List, Tuple, Union

import re2 # type: ignore

from .core import (
Parser as AbstractParser,
PartialParseResult,
Device,
Domain,
OS,
UserAgent,
Matchers,
UserAgentMatcher,
OSMatcher,
DeviceMatcher,
)


class Parser(AbstractParser):
    """Parser which prefilters the matchers' regexes through ``re2.Filter``.

    One filter is built per domain (user agent, OS, device) from the
    patterns of the supplied matchers, added in list order so that a
    filter index maps back to the matcher at the same list position.
    When called, the filter reports the indices of the matching patterns
    and the matcher with the lowest index is then applied to the string.
    """

    ua: re2.Filter
    user_agent_parsers: List[UserAgentMatcher]
    os: re2.Filter
    os_parsers: List[OSMatcher]
    devices: re2.Filter
    device_parsers: List[DeviceMatcher]

    def __init__(
        self,
        matchers: Matchers,
    ) -> None:
        """Store the three matcher lists and compile one filter for each.

        :param matchers: unpacked as (user agent matchers, OS matchers,
            device matchers).
        """
        self.user_agent_parsers, self.os_parsers, self.device_parsers = matchers

        self.ua = self._build_filter(
            [u.regex for u in self.user_agent_parsers]
        )
        self.os = self._build_filter([o.regex for o in self.os_parsers])
        self.devices = self._build_filter(
            [d.regex for d in self.device_parsers]
        )

    @staticmethod
    def _build_filter(regexes: List[re.Pattern[str]]) -> re2.Filter:
        """Return a compiled ``re2.Filter`` holding ``regexes`` in order.

        The filter only receives the pattern source, so a compiled
        ``re.IGNORECASE`` flag would be lost. It is therefore carried over
        by prepending the ``(?i)`` global flag. This assumes no pattern
        uses global flags itself, but since they're not supported in JS
        that seems safe.
        """
        prefilter = re2.Filter()
        for regex in regexes:
            if regex.flags & re.IGNORECASE:
                prefilter.Add("(?i)" + regex.pattern)
            else:
                prefilter.Add(regex.pattern)
        prefilter.Compile()
        return prefilter

    def __call__(self, ua: str, domains: Domain, /) -> PartialParseResult:
        """Parse ``ua`` for each of the requested ``domains``.

        A domain which was not requested, or for which no pattern
        matched, is left as ``None`` in the result.
        """
        user_agent = os = device = None
        if Domain.USER_AGENT in domains:
            if matches := self.ua.Match(ua):
                # Set/Filter does not return the match in index order
                # (position order?) so to fit UAP semantics we need to
                # extract the first matching regex (lowest index).
                user_agent = self.user_agent_parsers[min(matches)](ua)
        if Domain.OS in domains:
            if matches := self.os.Match(ua):
                os = self.os_parsers[min(matches)](ua)
        if Domain.DEVICE in domains:
            if matches := self.devices.Match(ua):
                device = self.device_parsers[min(matches)](ua)
        return PartialParseResult(
            domains=domains, string=ua, user_agent=user_agent, os=os, device=device
        )
4 changes: 4 additions & 0 deletions tests/test_core.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Tests UAP-Python using the UAP-core test suite
"""
import contextlib
import dataclasses
import logging
import pathlib
Expand Down Expand Up @@ -36,7 +37,10 @@
PARSERS = [
pytest.param(BasicParser(load_builtins()), id="basic"),
]
with contextlib.suppress(ImportError):
from ua_parser import re2

PARSERS.append(pytest.param(re2.Parser(load_builtins()), id="re2"))

UA_FIELDS = {f.name for f in dataclasses.fields(UserAgent)}

Expand Down
6 changes: 6 additions & 0 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,15 @@ wheel_build_env = .pkg
deps =
pytest
pyyaml
google-re2
commands =
pytest -Werror --doctest-glob="*.rst" {posargs}

[testenv:pypy3.{8,9,10},py312]
deps =
pytest
pyyaml

[testenv:flake8]
package = skip
deps = flake8
Expand Down

0 comments on commit 7ff511e

Please sign in to comment.