masklinn committed Jul 1, 2023 (commit a61cc22, 1 parent: 63e9749)
Showing 3 changed files with 177 additions and 3 deletions.
113 changes: 113 additions & 0 deletions benchmarks/README.md
Load factor: 2/3. The internal capacity is a power of two, so resize steps occur at 2/3 * (1<<n) entries.
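Concretely, the usable-entry thresholds for successive capacities can be tabulated (a quick sketch):

```python
# capacity -> usable entries before the next resize (2/3 load factor)
for n in range(3, 10):
    print(f"1<<{n} = {1 << n:4}: {2 * (1 << n) // 3} usable entries")
```

which gives 5, 10, 21, 42, 85, 170, 341 usable entries for capacities 8 through 512.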

Necessary space is

```text
o + (1<<n) * (n//8+1) + (1<<n+1)//3 * 24
```

where `o` is the fixed overhead (112 from 3.8 to 3.10, 104 in 3.11), `(1<<n) * (n//8+1)` approximates the indices array, and `(1<<n+1)//3 * 24` is the entries array (2/3 of the slots at 3 pointers each).

Note: the formula doesn't quite match the measurements below, e.g. a 100-entry dict needs n=8 (170 usable entries), but the formula gives 4435 rather than the measured 4696.
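The formula is easy to check against live objects (a sketch; `predicted_size` is a name made up here, and it inherits the rough `n//8+1` index-width approximation rather than the exact int8/int16/int32/int64 steps listed further down):

```python
import sys

def predicted_size(n, o=112):
    # o: fixed overhead, 112 on 3.8-3.10 and 104 on 3.11
    # (1 << n) index slots, each approximated as n//8 + 1 bytes wide
    # 2/3 of the slots hold 24-byte entries (3 pointers each)
    return o + (1 << n) * (n // 8 + 1) + (1 << (n + 1)) // 3 * 24

d = dict.fromkeys(range(100))  # 100 entries need n=8
print(predicted_size(8), sys.getsizeof(d))
```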

From this, an approximation of the internal capacity (as `n`, i.e. `dk_log2_size`) can be recovered from the size:

```python
import sys

def internal_capacity(d):
    # each slot costs roughly 17 bytes: a 1-byte index plus
    # 2/3 of a 24-byte entry, so size // 17 approximates the slot count
    s = sys.getsizeof(d) // 17
    n = 3  # smallest capacity is 1 << 3 == 8
    while 1 << (n + 1) <= s:
        n += 1
    return n
```
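For example, using the 3.8~3.10 sizes from the table below:

```python
d = dict.fromkeys(range(100))  # getsizeof(d) == 4696 on 3.8-3.10
print(internal_capacity(d))    # -> 8, i.e. 1 << 8 == 256 slots
```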

Smallest internal capacity = 8 (5 usable entries)

Python 3.8~3.10 (`sys.getsizeof`, in bytes):

- 1 entry = 232
- 5 entries = 232
- 10 entries = 360
- 20 entries = 640
- 50 entries = 2272
- 100 entries = 4696
- 200 entries = 9312
- 500 entries = 18520
- 1000 entries = 36960
- 2000 entries = 73816
- 5000 entries = 147552

Python 3.11: fixed overhead reduced by 8 bytes (that's it):

- 1 entry = 224
- 5 entries = 224
- 10 entries = 352
- 20 entries = 632
- 50 entries = 2264
- 100 entries = 4688
- 200 entries = 9304
- 500 entries = 18512
- 1000 entries = 36952
- 2000 entries = 73808
- 5000 entries = 147544
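Both tables can be reproduced with something along these lines (assuming a regular, non-debug build of the relevant Python version):

```python
import sys

for k in (1, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000):
    d = dict.fromkeys(range(k))
    print(f"{k} entries = {sys.getsizeof(d)}")
```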

```text
PyDictObject { // = 64
    PyObject { // 32
        PyObject_HEAD_EXTRA {
            PyObject *_ob_next;
            PyObject *_ob_prev;
        };
        Py_ssize_t ob_refcnt;
        PyTypeObject *ob_type;
    };
    // 32
    Py_ssize_t ma_used;
    uint64_t ma_version_tag;
    PyDictKeysObject *ma_keys;
    PyDictValues *ma_values;
};

PyDictKeysObject: 32 (excluding indices & entries)
+---------------------+
| dk_refcnt           | 8
| dk_log2_size        | 1
| dk_log2_index_bytes | 1
| dk_kind             | 1
| dk_version          | 4
| dk_usable           | 8
| dk_nentries         | 8
+---------------------+
| dk_indices[]        |
|                     |
+---------------------+
| dk_entries[]        |
|                     |
+---------------------+

indices are:
* int8  for dk_size <= 128
* int16 for 256 <= dk_size <= 2**15
* int32 for 2**16 <= dk_size <= 2**31
* int64 for 2**32 <= dk_size

len(dk_entries) = 2/3 * dk_size
entries are 3 pointers if DICT_KEYS_GENERAL, 2 if DICT_KEYS_UNICODE or DICT_KEYS_SPLIT
```
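The index-width selection mirrors this small helper (a sketch; `index_bytes` is a made-up name, not CPython's):

```python
def index_bytes(dk_size: int) -> int:
    # dk_size is a power of two, so these thresholds reproduce the
    # int8/int16/int32/int64 ranges listed above
    if dk_size <= 0xFF:
        return 1
    if dk_size <= 0xFFFF:
        return 2
    if dk_size <= 0xFFFF_FFFF:
        return 4
    return 8
```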

Hit-rate measurements from `hitrate.py` over `useragents.txt`:

- entries: 75158
- unique: 20322
- ideal hit rate: 72.9% (= (75158 - 20322) / 75158)

```text
Cache size    20: hit rate 1.0% throughput: 1952 ua/s
Cache size    50: hit rate 2.0% throughput: 2002 ua/s
Cache size   100: hit rate 3.8% throughput: 2055 ua/s
Cache size   200: hit rate 7.2% throughput: 2127 ua/s
Cache size   500: hit rate 15.4% throughput: 2390 ua/s
Cache size  1000: hit rate 24.8% throughput: 2697 ua/s
Cache size  2000: hit rate 36.2% throughput: 3182 ua/s
Cache size  5000: hit rate 51.3% throughput: 4135 ua/s
```

pyperf timings from `bench.py`:

```text
legacy, maxsize=20: Mean +- std dev: 37.1 sec +- 0.4 sec
legacy, maxsize=200: Mean +- std dev: 34.6 sec +- 0.3 sec
legacy, maxsize=2000: Mean +- std dev: 23.8 sec +- 0.3 sec
```
25 changes: 22 additions & 3 deletions benchmarks/bench.py
```diff
@@ -1,15 +1,34 @@
 import pathlib
 from typing import Optional
 
 import pyperf
-from ua_parser.user_agent_parser import Parse, _PARSE_CACHE
+from ua_parser import user_agent_parser
 
 
 f = pathlib.Path(__file__).parent / 'useragents.txt'
 uas = f.read_text('utf-8').splitlines(keepends=False)
 
+Parse = user_agent_parser.Parse
+
 runner = pyperf.Runner()
-runner.timeit("legacy", """
+
+user_agent_parser.MAX_CACHE_SIZE = 20
+user_agent_parser._PARSE_CACHE.clear()
+runner.timeit("legacy, maxsize=20", """
 for ua in uas:
     Parse(ua)
-""", "from __main__ import Parse, uas, _PARSE_CACHE")
+""", "from __main__ import Parse, uas")
+
+user_agent_parser.MAX_CACHE_SIZE = 200
+user_agent_parser._PARSE_CACHE.clear()
+runner.timeit("legacy, maxsize=200", """
+for ua in uas:
+    Parse(ua)
+""", "from __main__ import Parse, uas")
+
+user_agent_parser.MAX_CACHE_SIZE = 2000
+user_agent_parser._PARSE_CACHE.clear()
+runner.timeit("legacy, maxsize=2000", """
+for ua in uas:
+    Parse(ua)
+""", "from __main__ import Parse, uas")
```
42 changes: 42 additions & 0 deletions benchmarks/hitrate.py
```python
import pathlib
import time
from typing import Optional

import pyperf
from ua_parser import user_agent_parser


f = pathlib.Path(__file__).parent / 'useragents.txt'
uas = f.read_text('utf-8').splitlines(keepends=False)

hits = misses = 0
def _lookup(ua: str, old=user_agent_parser._lookup):
    # wrap the module's _lookup, inferring hit/miss from its result's arity
    global hits, misses
    r = old(ua)
    if len(r) == 1:
        misses += 1
    else:
        hits += 1
    return r

user_agent_parser._lookup = _lookup

unique = set(uas)
r = (len(uas) - len(unique)) / len(uas)
print("Total entries:", len(uas))
print("Unique entries:", len(unique))
print(f"Ideal hit rate: {r:.1%}\n")


for cache_size in [20, 50, 100, 200, 500, 1000, 2000, 5000]:
    print(f"Cache size {cache_size: >5}: ", end='', flush=True)
    user_agent_parser.MAX_CACHE_SIZE = cache_size
    user_agent_parser._PARSE_CACHE.clear()
    start = time.time()
    for ua in uas:
        user_agent_parser.Parse(ua)
    end = time.time()
    r = hits / (hits + misses)
    t = int(len(uas) / (end - start))
    print(f"hit rate {r:.1%} throughput: {t} ua/s")
    hits = misses = 0
```
