-
Notifications
You must be signed in to change notification settings - Fork 153
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
177 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
Load factor: 2/3 | ||
The internal capacity (IntCap) is always a power of two, so size steps occur at 2/3*(1<<n) entries | ||
|
||
Necessary space is | ||
|
||
o + (1<<n) * (n//8+1) + (1<<n+1) // 3 * 24 | ||
|
||
where `o` is 112 from 3.8 to 3.10, and 104 in 3.11. | ||
|
||
Note: the formula doesn't quite match the measurements below, e.g. a | ||
100-entry dict needs n=8 (170 external entries), but the formula above | ||
gives 4435 rather than the measured 4696. | ||
|
||
From this an approximation of the internal capacity from the size is: | ||
|
||
def internal_capacity(d, bytes_per_slot=17):
    """Estimate the log2 of a dict's internal capacity from its size.

    The estimate divides ``sys.getsizeof(d)`` by an approximate per-slot
    cost and returns the exponent of the largest power of two that fits,
    never going below 3 (the smallest internal capacity is 8 slots).

    Note that the return value is the exponent ``n``; the capacity
    itself is ``1 << n``.

    :param d: the dict to inspect.
    :param bytes_per_slot: approximate bytes consumed per slot.  The
        default of 17 is roughly one index byte plus 2/3 of a 24-byte
        (three-pointer) entry.  Dicts whose entries are two pointers
        wide need a smaller value; the fixed header is deliberately
        ignored, so this stays an approximation.
    :returns: ``n`` such that the internal capacity is about ``1 << n``.
    """
    slots = sys.getsizeof(d) // bytes_per_slot
    # bit_length() - 1 is floor(log2(slots)): the largest n for which
    # (1 << n) <= slots.  Clamp to the minimum table size of 1 << 3.
    return max(3, slots.bit_length() - 1)
|
||
Smallest internal capacity = 8 (5 entries) | ||
|
||
Python 3.8~3.10: | ||
|
||
- 1 entry = 232 | ||
- 5 entries = 232 | ||
- 10 entries = 360 | ||
- 20 entries = 640 | ||
- 50 entries = 2272 | ||
- 100 entries = 4696 | ||
- 200 entries = 9312 | ||
- 500 entries = 18520 | ||
- 1000 entries = 36960 | ||
- 2000 entries = 73816 | ||
- 5000 entries = 147552 | ||
|
||
Python 3.11: overhead reduced by 8 bytes (that is the only change) | ||
|
||
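- 1 entry = 224 | ||
- 5 entries = 224 | ||
- 10 entries = 352 | ||
- 20 entries = 632 | ||
- 50 entries = 2264 | ||
- 100 entries = 4688 | ||
- 200 entries = 9304 | ||
- 500 entries = 18512 | ||
- 1000 entries = 36952 | ||
- 2000 entries = 73808 | ||
- 5000 entries = 147544 | ||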
|
||
```text | ||
PyDictObject { = 64 | ||
PyObject { // 32 | ||
PyObject_HEAD_EXTRA { | ||
PyObject *_ob_next; | ||
PyObject *_ob_prev; | ||
}; | ||
Py_ssize_t ob_refcnt; | ||
PyTypeObject *ob_type; | ||
}; | ||
// 32 | ||
Py_ssize_t ma_used; | ||
uint64_t ma_version_tag; | ||
PyDictKeysObject *ma_keys; | ||
PyDictValues *ma_values; | ||
}; | ||
// 32 (excluding indices & entries) | ||
+---------------------+ | ||
| dk_refcnt | 8 | ||
| dk_log2_size | 1 | ||
| dk_log2_index_bytes | 1 | ||
| dk_kind | 1 | ||
| dk_version | 4 | ||
| dk_usable | 8 | ||
| dk_nentries | 8 | ||
+---------------------+ | ||
| dk_indices[] | | ||
| | | ||
+---------------------+ | ||
| dk_entries[] | | ||
| | | ||
+---------------------+ | ||
indices are | ||
* int8 for dk_size <= 128 | ||
* int16 for 256 <= dk_size <= 2**15 | ||
* int32 for 2**16 <= dk_size <= 2**31 | ||
* int64 for 2**32 <= dk_size | ||
len(dk_entries) = 2/3*dk_size | ||
entries = 3 pointers if DICT_KEYS_GENERAL, 2 if DICT_KEYS_UNICODE or DICT_KEYS_SPLIT | ||
``` | ||
|
||
entries: 75158 | ||
unique: 20322 | ||
ideal hit rate: 72.9% | ||
|
||
Cache size 20: hit rate 1.0% throughput: 1952 ua/s | ||
Cache size 50: hit rate 2.0% throughput: 2002 ua/s | ||
Cache size 100: hit rate 3.8% throughput: 2055 ua/s | ||
Cache size 200: hit rate 7.2% throughput: 2127 ua/s | ||
Cache size 500: hit rate 15.4% throughput: 2390 ua/s | ||
Cache size 1000: hit rate 24.8% throughput: 2697 ua/s | ||
Cache size 2000: hit rate 36.2% throughput: 3182 ua/s | ||
Cache size 5000: hit rate 51.3% throughput: 4135 ua/s | ||
|
||
legacy, maxsize=20: Mean +- std dev: 37.1 sec +- 0.4 sec | ||
legacy, maxsize=200: Mean +- std dev: 34.6 sec +- 0.3 sec | ||
legacy, maxsize=2000: Mean +- std dev: 23.8 sec +- 0.3 sec |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,15 +1,34 @@ | ||
import pathlib | ||
from typing import Optional | ||
|
||
import pyperf | ||
from ua_parser.user_agent_parser import Parse, _PARSE_CACHE | ||
from ua_parser import user_agent_parser | ||
|
||
|
||
# Corpus of user-agent strings, one per line, stored next to this script.
f = pathlib.Path(__file__).parent / 'useragents.txt'
uas = f.read_text('utf-8').splitlines(keepends=False)

# Exposed at module level so the timed statement can import it from
# __main__ (see the setup string passed to runner.timeit below).
Parse = user_agent_parser.Parse

runner = pyperf.Runner()

# Run the same parse-everything benchmark once per cache size.  The cache
# limit is set and the cache emptied immediately before each benchmark is
# registered, so every configuration starts from a cold cache.
for maxsize in (20, 200, 2000):
    user_agent_parser.MAX_CACHE_SIZE = maxsize
    user_agent_parser._PARSE_CACHE.clear()
    runner.timeit(
        f"legacy, maxsize={maxsize}",
        """
for ua in uas:
    Parse(ua)
""",
        "from __main__ import Parse, uas",
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
import pathlib | ||
import time | ||
from typing import Optional | ||
|
||
import pyperf | ||
from ua_parser import user_agent_parser | ||
|
||
|
||
# Corpus of user-agent strings, one per line, stored next to this script.
f = pathlib.Path(__file__).parent / 'useragents.txt'
uas = f.read_text(encoding='utf-8').splitlines()

# Running cache statistics, updated by the wrapper below.
hits = misses = 0


def _lookup(ua: str, old=user_agent_parser._lookup):
    """Call the parser's original ``_lookup`` and tally the outcome.

    ``old`` is bound to the original function when this wrapper is
    defined, so it keeps delegating to it after the module attribute is
    replaced below.

    A result of length 1 is counted as a miss and any other length as a
    hit (presumably a miss yields only the cache key and a hit also
    carries the cached value -- confirm against ``user_agent_parser``).
    The result is returned unchanged.
    """
    global hits, misses
    result = old(ua)
    if len(result) != 1:
        hits += 1
    else:
        misses += 1
    return result


# Route the parser's cache lookups through the counting wrapper.
user_agent_parser._lookup = _lookup
|
||
# Every ratio below divides by the corpus size, so fail early with a
# readable message instead of a ZeroDivisionError on an empty file.
if not uas:
    raise SystemExit("useragents.txt contains no user agents")

# Best achievable hit rate: every repeat of an already-seen string hits.
unique = set(uas)
ideal_hit_rate = (len(uas) - len(unique)) / len(uas)
print("Total entries:", len(uas))
print("Unique entries:", len(unique))
print(f"Ideal hit rate: {ideal_hit_rate:.1%}\n")


for cache_size in [20, 50, 100, 200, 500, 1000, 2000, 5000]:
    print(f"Cache size {cache_size: >5}: ", end='', flush=True)
    user_agent_parser.MAX_CACHE_SIZE = cache_size
    user_agent_parser._PARSE_CACHE.clear()
    # Start every configuration from zeroed counters and a cold cache.
    hits = misses = 0
    # perf_counter() is monotonic and high-resolution; time.time() can
    # jump when the system clock is adjusted.
    start = time.perf_counter()
    for ua in uas:
        user_agent_parser.Parse(ua)
    elapsed = time.perf_counter() - start
    hit_rate = hits / (hits + misses)
    throughput = int(len(uas) / elapsed)
    print(f"hit rate {hit_rate:.1%} throughput: {throughput} ua/s")