add script to run hash algorithm benchmark (#336)
* add script to run hash algorithm experiment

Signed-off-by: Spencer Schrock <[email protected]>

* use randomly generated data instead of models for hash experiment

As far as hashing is concerned, bytes are bytes. By generating our own
bytes, we avoid I/O associated with reading models from disk. While we
could read the model into memory, recreating the filesystem seems
complicated.

Signed-off-by: Spencer Schrock <[email protected]>

* use default data sizes better for log scale

Signed-off-by: Spencer Schrock <[email protected]>

* alter output per PR feedback

Signed-off-by: Spencer Schrock <[email protected]>

* generate data as needed for each size

Signed-off-by: Spencer Schrock <[email protected]>

* right align the measurement and limit to 5 decimals

Signed-off-by: Spencer Schrock <[email protected]>

---------

Signed-off-by: Spencer Schrock <[email protected]>
spencerschrock authored Jan 15, 2025
1 parent 2847a1b commit 3006f76
Showing 2 changed files with 126 additions and 0 deletions.
125 changes: 125 additions & 0 deletions benchmarks/exp_hash.py
@@ -0,0 +1,125 @@
# Copyright 2025 The Sigstore Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""Script for running a benchmark to pick a hashing algorithm."""

import argparse
import timeit
from typing import Final

import numpy as np

from model_signing.hashing import hashing
from model_signing.hashing import memory


KB: Final[int] = 1024
MB: Final[int] = 1024 * KB
GB: Final[int] = 1024 * MB


def build_parser() -> argparse.ArgumentParser:
"""Builds the command line parser for the hash experiment."""
parser = argparse.ArgumentParser(
description="hash algorithm benchmark data for model signing"
)

parser.add_argument(
"--repeat",
help="how many times to repeat each algorithm",
type=int,
default=5,
)

parser.add_argument(
"--methods",
help="hash methods to benchmark",
nargs="+",
type=str,
default=["sha256", "blake2"],
)

parser.add_argument(
"--data-sizes", help="hash methods to benchmark", nargs="+", type=int
)

return parser


def _human_size(size: int) -> str:
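    # True division yields a float, so e.g. 2 * GB is rendered as "2.0 GB".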
    if size >= GB:
        return str(size / GB) + " GB"
    elif size >= MB:
        return str(size / MB) + " MB"
    elif size >= KB:
        return str(size / KB) + " KB"
    return str(size) + " B"


def _get_hasher(hash_algorithm: str) -> hashing.StreamingHashEngine:
    match hash_algorithm:
        case "sha256":
            return memory.SHA256()
        case "blake2":
            return memory.BLAKE2()
    raise ValueError(f"Cannot convert {hash_algorithm} to a hash engine")


def _generate_data(size: int) -> bytes:
    if size < 0:
        raise ValueError("Cannot generate negative bytes")
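    # np.random.randint's upper bound is exclusive, so this draws bytes 0..255.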
    return np.random.randint(0, 256, size, dtype=np.uint8).tobytes()


def _default_sizes() -> list[int]:
"""Generates sizes following 1, 2, 5 pattern, useful for log scale."""
    sizes = []
    for scale in [KB, MB, GB]:
        for d in [1, 2, 5, 10, 20, 50, 100, 200, 500]:
            if scale == GB and d > 20:
                break
            sizes.append(d * scale)
    return sizes


def _get_padding(methods: list[str], sizes: list[int]) -> int:
"""Calculates the necessary padding by looking at longest output.
E.g. "sha256/1024: " would require 13 characters of padding.
"""
return len(f"{max(methods, key=len)}/{max(sizes)}: ")


if __name__ == "__main__":
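    # Fixed seed so every run benchmarks identical pseudo-random data.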
    np.random.seed(42)
    args = build_parser().parse_args()
    sizes = args.data_sizes or _default_sizes()
    padding = _get_padding(args.methods, sizes)

    for size in sizes:
        data = _generate_data(size)
        for algorithm in args.methods:

            def hash_once(algorithm=algorithm, data=data):
                # Create a fresh engine for every run; reusing one engine
                # would accumulate data across repeats and skew the timings.
                hasher = _get_hasher(algorithm)
                hasher.update(data)
                return hasher.compute()

            times = timeit.repeat(hash_once, number=1, repeat=args.repeat)

            # Grab the min time, as suggested by the docs
            # https://docs.python.org/3/library/timeit.html#timeit.Timer.repeat
            measurement = min(times)
            print(f"{f'{algorithm}/{size}: ':<{padding}}{measurement:10.4f}")
1 change: 1 addition & 0 deletions pyproject.toml
@@ -81,6 +81,7 @@ python = ["3.10", "3.11", "3.12", "3.13"]
[tool.hatch.envs.bench.scripts]
generate = "python benchmarks/generate.py {args}"
serialize = "python benchmarks/serialize.py {args}"
hash = "python benchmarks/exp_hash.py {args}"

[tool.hatch.envs.docs]
description = """Custom environment for pdoc.
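With the script registered in the bench environment above, the benchmark can be invoked through hatch. A usage sketch, assuming hatch's standard "hatch run env:script" invocation form; the flags mirror the argparse options defined in benchmarks/exp_hash.py:

hatch run bench:hash --repeat 10 --methods sha256 blake2
hatch run bench:hash --data-sizes 1048576 10485760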