Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added multimanager example and benchmark #209

Merged
merged 1 commit into from
Jul 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 104 additions & 0 deletions examples/multi_mgr_benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import h5pyd
import numpy as np
import random
import time

# path of the domain that the script opens with h5pyd.File in append mode
DOMAIN_PATH = "/home/test_user1/test/multi_mgr_benchmark.h5"
# number of datasets run_benchmark makes sure exist before timing anything
DATASET_COUNT = 200
# shape and element type of the random data written to each new dataset
DSET_SHAPE = (10,)
DSET_DTYPE = np.int32


def generate_range(ds_shape: tuple):
    """Pick one random in-bounds index along every axis of a dataset shape.

    Returns a tuple with one integer per entry of ``ds_shape``; each value
    is drawn with ``random.randint`` from ``0`` to ``axis_length - 1``
    inclusive.
    """
    return tuple(random.randint(0, extent - 1) for extent in ds_shape)


def generate_index_query(h5file):
    """Build one random index tuple per dataset in ``h5file``.

    The returned list follows the iteration order of ``h5file.values()``;
    each entry is the result of ``generate_range`` applied to that
    dataset's ``shape``.
    """
    return [generate_range(dset.shape) for dset in h5file.values()]


def benchmark_multimanager(h5file, num=10):
    """
    Benchmark retrieving one random entry from every dataset in an h5file
    using the MultiManager.

    ``num`` is the number of timed read passes and must be at least 1,
    since the printed mean divides the total runtime by it.
    """
    ds_names = list(h5file.keys())
    datasets = [h5file[name] for name in ds_names]
    mm = h5pyd.MultiManager(datasets)

    # prepare queries up front to exclude index generation from the runtime
    queries = [generate_index_query(h5file) for _ in range(num)]

    # accessing the data; perf_counter is used instead of time.time()
    # because it is a monotonic, high-resolution clock meant for measuring
    # short intervals and is unaffected by system clock adjustments
    t0 = time.perf_counter()
    for query in queries:
        _ = mm[query]  # result is discarded; only the read is being timed

    runtime = time.perf_counter() - t0
    print(f"Mean runtime multimanager: {runtime/num:.4f} s")
    # author's note: 100ms for case with 6 datasets


def benchmark_sequential_ds(h5file, num=10):
    """
    Benchmark retrieving one random entry from every dataset in
    an h5file by sequentially looping through the datasets

    ``num`` is the number of timed read passes and must be at least 1,
    since the printed mean divides the total runtime by it.
    """
    # collect the dataset objects once, before timing starts, so the timed
    # loop contains only the reads themselves rather than also iterating
    # h5file.values() on every pass
    datasets = list(h5file.values())

    # prepare queries to exclude this code from runtime
    index_lists = [
        [generate_range(ds.shape) for ds in datasets]
        for _ in range(num)
    ]

    # accessing the data; perf_counter is used instead of time.time()
    # because it is a monotonic, high-resolution clock meant for measuring
    # short intervals and is unaffected by system clock adjustments
    t0 = time.perf_counter()
    for index_list in index_lists:
        for indices, ds in zip(index_list, datasets):
            _ = ds[indices]  # result is discarded; only the read is timed

    runtime = time.perf_counter() - t0
    print(f"Mean runtime sequentially: {runtime/num:.4f} s")
    # author's note: ~ 400ms for case with 6 datasets


def run_benchmark(f):
    """
    Make sure the benchmark datasets exist, then time both access methods.

    Any dataset named ``dset_0000`` .. ``dset_<DATASET_COUNT - 1>`` that is
    missing from ``f`` is created with random integers in [0, 100).
    Afterwards the sequential benchmark runs first, followed by the
    MultiManager benchmark.
    """
    for number in range(DATASET_COUNT):
        name = f"dset_{number:04d}"
        if name in f:
            # already present from an earlier run, keep it as is
            continue
        values = np.random.randint(0, 100, size=DSET_SHAPE, dtype=DSET_DTYPE)
        f.create_dataset(name, data=values)

    benchmark_sequential_ds(f)
    benchmark_multimanager(f)


#
# main
#

# open the domain in append mode (creating it when it does not exist yet)
# and run both benchmarks; the with-block closes the file afterwards
with h5pyd.File(DOMAIN_PATH, mode="a") as f:
    run_benchmark(f)
280 changes: 280 additions & 0 deletions examples/notebooks/multi_manager_example.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,280 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"USE_H5PY = False # set to True to use h5py/hdf5lib instead\n",
"if USE_H5PY:\n",
" import h5py\n",
" from h5py import MultiManager\n",
"else:\n",
" import h5pyd as h5py # Use the \"as\" syntax for code compatibility\n",
" from h5pyd import MultiManager\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# create a new file\n",
"f = h5py.File(\"/home/test_user1/multi_try.h5\", mode=\"w\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# create some datasets\n",
"DSET_SHAPE = (10,)\n",
"DSET_DTYPE = np.int32\n",
"\n",
"# create 4 datasets\n",
"DSET_COUNT = 4\n",
"datasets = []\n",
"for i in range(DSET_COUNT):\n",
" dset = f.create_dataset(f\"dset_{i}\", shape=DSET_SHAPE, dtype=DSET_DTYPE)\n",
" datasets.append(dset)\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# initialize some data to write\n",
"data_in = []\n",
"for n in range(DSET_COUNT):\n",
" arr = np.zeros(DSET_SHAPE, dtype=DSET_DTYPE)\n",
" arr[...] = list(range(n*100, n*100+DSET_SHAPE[0]))\n",
" data_in.append(arr)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# instantiate a MultiManager and use it to write to all the datasets simultaneously\n",
"mm = MultiManager(datasets)\n",
"mm[...] = data_in"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# verify what gets saved to the first dataset\n",
"dset = f[\"dset_0\"]\n",
"dset[...]"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109], dtype=int32)"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# and the second dataset\n",
"dset = f[\"dset_1\"]\n",
"dset[...]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read all the data from all the datasets using the same MultiManager instance\n",
"data_out = mm[...]\n",
"len(data_out)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get the first item from the returned list\n",
"data_out[0]"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109], dtype=int32)"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# and the second item\n",
"data_out[1]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 1, 2, 3], dtype=int32)"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# rather than reading all the data for a dataset, you can read a given selection\n",
"data_out = mm[0:4]\n",
"data_out[0]"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# it's also possible to pass a list of selections and have each selection\n",
"# read from the corresponding dataset\n",
"selections = []\n",
"for n in range(DSET_COUNT):\n",
" s = slice(n, n+2, 1)\n",
" selections.append(s)\n",
"\n",
"data_out = mm.__getitem__(selections)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 1], dtype=int32)"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_out[0]"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([101, 102], dtype=int32)"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_out[1]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading