m

ua-parser · Jul 1, 2023 · a61cc22 · a61cc22
1 parent 63e9749
commit a61cc22
Show file tree

Hide file tree

Showing 3 changed files with 177 additions and 3 deletions.
diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -0,0 +1,113 @@
+Load factor: 2/3
+Internal capacity (IntCap) is always a power of two, so size steps occur at 2/3*(1<<n) entries
+
+Necessary space is
+
+    o + (1<<n) * (n//8+1) + (1<<n+1) // 3 * 24
+
+where `o` is 104 from 3.8 to 3.10, and 96 in 3.11.
+
+Note: with `o` = 112 (104 in 3.11) the formula overshoots the measurements
+below by 8 bytes, e.g. for a 100 entries dict we need n=8 (170 external
+entries) and it gives 4704 rather than the measured 4696.
+
+From this, the internal capacity (as its log2, `n`) can be approximated from the size:
+
+    def internal_capacity(d):
+        s = sys.getsizeof(d) // 17  # ~17 bytes per slot: 1 index byte + 2/3 of a 24-byte entry
+        n=3  # smallest table is 1<<3 = 8 slots
+        while 1<<n+1 <= s:  # parses as 1 << (n+1): grow while the next power of two still fits
+            n+=1
+        return n  # log2 of the internal capacity, not the capacity itself
+
+Smallest internal capacity = 8 (5 entries)
+
+Python 3.8~3.10:
+
+- 1 entry = 232
+- 5 entries = 232
+- 10 entries = 360
+- 20 entries = 640
+- 50 entries = 2272
+- 100 entries = 4696
+- 200 entries = 9312
+- 500 entries = 18520
+- 1000 entries = 36960
+- 2000 entries = 73816
+- 5000 entries = 147552
+
+Python 3.11: fixed overhead reduced by 8 bytes (that's it), same entry counts as above:
+
+- 224
+- 224
+- 352
+- 632
+- 2264
+- 4688
+- 9304
+- 18512
+- 36952
+- 73808
+- 147544
+
+```text
+PyDictObject { = 64
+    PyObject { // 32
+        PyObject_HEAD_EXTRA {
+            PyObject *_ob_next;
+            PyObject *_ob_prev;
+        };
+        Py_ssize_t ob_refcnt;
+        PyTypeObject *ob_type;
+    };
+    // 32
+    Py_ssize_t ma_used;
+    uint64_t ma_version_tag;
+    PyDictKeysObject *ma_keys;
+    PyDictValues *ma_values; 
+};
+
+// PyDictKeysObject header: 32 (excluding indices & entries)
++---------------------+
+| dk_refcnt           | 8
+| dk_log2_size        | 1
+| dk_log2_index_bytes | 1
+| dk_kind             | 1
+| dk_version          | 4
+| dk_usable           | 8
+| dk_nentries         | 8
++---------------------+
+| dk_indices[]        |
+|                     |
++---------------------+
+| dk_entries[]        |
+|                     |
++---------------------+
+
+indices are
+* int8  for          dk_size <= 128
+* int16 for 256   <= dk_size <= 2**15
+* int32 for 2**16 <= dk_size <= 2**31
+* int64 for 2**32 <= dk_size
+
+len(dk_entries) = 2/3*dk_size
+
+entries = 3 pointers if DICT_KEYS_GENERAL, 2 if DICT_KEYS_UNICODE or DICT_KEYS_SPLIT
+```
+
+entries: 75158
+unique: 20322
+ideal hit rate: 72.9%
+
+Cache size    20: hit rate 1.0% throughput: 1952 ua/s
+Cache size    50: hit rate 2.0% throughput: 2002 ua/s
+Cache size   100: hit rate 3.8% throughput: 2055 ua/s
+Cache size   200: hit rate 7.2% throughput: 2127 ua/s
+Cache size   500: hit rate 15.4% throughput: 2390 ua/s
+Cache size  1000: hit rate 24.8% throughput: 2697 ua/s
+Cache size  2000: hit rate 36.2% throughput: 3182 ua/s
+Cache size  5000: hit rate 51.3% throughput: 4135 ua/s
+
+legacy, maxsize=20: Mean +- std dev: 37.1 sec +- 0.4 sec
+legacy, maxsize=200: Mean +- std dev: 34.6 sec +- 0.3 sec
+legacy, maxsize=2000: Mean +- std dev: 23.8 sec +- 0.3 sec
diff --git a/benchmarks/bench.py b/benchmarks/bench.py
@@ -1,15 +1,34 @@
 import pathlib
+from typing import Optional
 
 import pyperf
-from ua_parser.user_agent_parser import Parse, _PARSE_CACHE
+from ua_parser import user_agent_parser
 
 
 f = pathlib.Path(__file__).parent / 'useragents.txt'
 uas = f.read_text('utf-8').splitlines(keepends=False)
 
+Parse = user_agent_parser.Parse  # module-level alias so the timeit setup can import Parse from __main__
+
 runner = pyperf.Runner()
-runner.timeit("legacy", """
+
+user_agent_parser.MAX_CACHE_SIZE = 20  # cache size under test (matches the benchmark label)
+user_agent_parser._PARSE_CACHE.clear()  # emptied once here; the timed statement never clears it
+runner.timeit("legacy, maxsize=20", """
 for ua in uas:
     Parse(ua)
-""", "from __main__ import Parse, uas, _PARSE_CACHE")
+""", "from __main__ import Parse, uas")
 
+user_agent_parser.MAX_CACHE_SIZE = 200  # cache size under test (matches the benchmark label)
+user_agent_parser._PARSE_CACHE.clear()  # emptied once here; the timed statement never clears it
+runner.timeit("legacy, maxsize=200", """
+for ua in uas:
+    Parse(ua)
+""", "from __main__ import Parse, uas")
+
+user_agent_parser.MAX_CACHE_SIZE = 2000  # cache size under test (matches the benchmark label)
+user_agent_parser._PARSE_CACHE.clear()  # emptied once here; the timed statement never clears it
+runner.timeit("legacy, maxsize=2000", """
+for ua in uas:
+    Parse(ua)
+""", "from __main__ import Parse, uas")
diff --git a/benchmarks/hitrate.py b/benchmarks/hitrate.py
@@ -0,0 +1,42 @@
+import pathlib
+import time
+from typing import Optional
+
+import pyperf
+from ua_parser import user_agent_parser
+
+
+f = pathlib.Path(__file__).parent / 'useragents.txt'  # sample file expected next to this script
+uas = f.read_text('utf-8').splitlines(keepends=False)  # each line is later passed to Parse() as one user agent
+
+hits = misses = 0  # module-level counters updated by the _lookup wrapper
+def _lookup(ua: str, old=user_agent_parser._lookup):  # `old` binds the original function at definition time
+    global hits, misses
+    r = old(ua)
+    if len(r) == 1:  # counted as a miss; presumably a fresh entry holds a single key -- verify against user_agent_parser._lookup
+        misses += 1
+    else:
+        hits += 1
+    return r
+
+user_agent_parser._lookup = _lookup  # install the counting wrapper in place of the original
+
+unique = set(uas)  # distinct user-agent strings in the sample
+r = (len(uas) - len(unique)) / len(uas)  # share of entries that repeat an already-seen string
+print("Total entries:", len(uas))
+print("Unique entries:", len(unique))
+print(f"Ideal hit rate: {r:.1%}\n")
+
+
+for cache_size in [20, 50, 100, 200, 500, 1000, 2000, 5000]:
+    print(f"Cache size {cache_size: >5}: ", end='', flush=True)
+    user_agent_parser.MAX_CACHE_SIZE = cache_size
+    user_agent_parser._PARSE_CACHE.clear()
+    start = time.time()
+    for ua in uas:
+        user_agent_parser.Parse(ua)
+    end = time.time()
+    r = hits / (hits + misses)
+    t = int(len(uas) / (end - start))
+    print(f"hit rate {r:.1%} throughput: {t} ua/s")
+    hits = misses = 0