add script to run hash algorithm benchmark (#336)
* add script to run hash algorithm experiment

Signed-off-by: Spencer Schrock <[email protected]>

* use randomly generated data instead of models for hash experiment

As far as hashing is concerned, bytes are bytes. By generating our own
bytes, we avoid I/O associated with reading models from disk. While we
could read the model into memory, recreating the filesystem seems
complicated.

Signed-off-by: Spencer Schrock <[email protected]>

* use default data sizes better for log scale

Signed-off-by: Spencer Schrock <[email protected]>

* alter output per PR feedback

Signed-off-by: Spencer Schrock <[email protected]>

* generate data as needed for each size

Signed-off-by: Spencer Schrock <[email protected]>

* right align the measurement and limit to 5 decimals

Signed-off-by: Spencer Schrock <[email protected]>

---------

Signed-off-by: Spencer Schrock <[email protected]>
spencerschrock authored Jan 15, 2025
1 parent 2847a1b commit 3006f76
Showing 2 changed files with 126 additions and 0 deletions.
125 changes: 125 additions & 0 deletions benchmarks/exp_hash.py
@@ -0,0 +1,125 @@
# Copyright 2025 The Sigstore Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""Script for running a benchmark to pick a hashing algorithm."""

import argparse
import timeit
from typing import Final

import numpy as np

from model_signing.hashing import hashing
from model_signing.hashing import memory


KB: Final[int] = 1024
MB: Final[int] = 1024 * KB
GB: Final[int] = 1024 * MB


def build_parser() -> argparse.ArgumentParser:
"""Builds the command line parser for the hash experiment."""
parser = argparse.ArgumentParser(
description="hash algorithm benchmark data for model signing"
)

parser.add_argument(
"--repeat",
help="how many times to repeat each algorithm",
type=int,
default=5,
)

parser.add_argument(
"--methods",
help="hash methods to benchmark",
nargs="+",
type=str,
default=["sha256", "blake2"],
)

parser.add_argument(
"--data-sizes", help="hash methods to benchmark", nargs="+", type=int
)

return parser


def _human_size(size: int) -> str:
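    # True division yields a float, so e.g. 2 * GB is rendered as "2.0 GB".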
    if size >= GB:
        return str(size / GB) + " GB"
    elif size >= MB:
        return str(size / MB) + " MB"
    elif size >= KB:
        return str(size / KB) + " KB"
    return str(size) + " B"


def _get_hasher(hash_algorithm: str) -> hashing.StreamingHashEngine:
    match hash_algorithm:
        case "sha256":
            return memory.SHA256()
        case "blake2":
            return memory.BLAKE2()
    raise ValueError(f"Cannot convert {hash_algorithm} to a hash engine")


def _generate_data(size: int) -> bytes:
    if size < 0:
        raise ValueError("Cannot generate negative bytes")
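    # np.random.randint's upper bound is exclusive, so this draws bytes 0..255.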
    return np.random.randint(0, 256, size, dtype=np.uint8).tobytes()


def _default_sizes() -> list[int]:
"""Generates sizes following 1, 2, 5 pattern, useful for log scale."""
    sizes = []
    for scale in [KB, MB, GB]:
        for d in [1, 2, 5, 10, 20, 50, 100, 200, 500]:
            if scale == GB and d > 20:
                break
            sizes.append(d * scale)
    return sizes


def _get_padding(methods: list[str], sizes: list[int]) -> int:
"""Calculates the necessary padding by looking at longest output.
E.g. "sha256/1024: " would require 13 characters of padding.
"""
return len(f"{max(methods, key=len)}/{max(sizes)}: ")


if __name__ == "__main__":
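    # Fixed seed so every run benchmarks identical pseudo-random data.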
    np.random.seed(42)
    args = build_parser().parse_args()
    sizes = args.data_sizes or _default_sizes()
    padding = _get_padding(args.methods, sizes)

    for size in sizes:
        data = _generate_data(size)
        for algorithm in args.methods:

            def hash_once(algorithm=algorithm, data=data):
                # Create a fresh engine for every run; reusing one engine
                # would accumulate data across repeats and skew the timings.
                hasher = _get_hasher(algorithm)
                hasher.update(data)
                return hasher.compute()

            times = timeit.repeat(hash_once, number=1, repeat=args.repeat)

            # Grab the min time, as suggested by the docs
            # https://docs.python.org/3/library/timeit.html#timeit.Timer.repeat
            measurement = min(times)
            print(f"{f'{algorithm}/{size}: ':<{padding}}{measurement:10.4f}")
1 change: 1 addition & 0 deletions pyproject.toml
@@ -81,6 +81,7 @@ python = ["3.10", "3.11", "3.12", "3.13"]
[tool.hatch.envs.bench.scripts]
generate = "python benchmarks/generate.py {args}"
serialize = "python benchmarks/serialize.py {args}"
hash = "python benchmarks/exp_hash.py {args}"

[tool.hatch.envs.docs]
description = """Custom environment for pdoc.
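With the script registered in the bench environment above, the benchmark can be invoked through hatch. A usage sketch, assuming hatch's standard "hatch run env:script" invocation form; the flags mirror the argparse options defined in benchmarks/exp_hash.py:

hatch run bench:hash --repeat 10 --methods sha256 blake2
hatch run bench:hash --data-sizes 1048576 10485760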