diff --git a/examples/multi_mgr_benchmark.py b/examples/multi_mgr_benchmark.py
new file mode 100644
index 0000000..edcb8b2
--- /dev/null
+++ b/examples/multi_mgr_benchmark.py
@@ -0,0 +1,104 @@
+import h5pyd
+import numpy as np
+import random
+import time
+
+DOMAIN_PATH = "/home/test_user1/test/multi_mgr_benchmark.h5"
+DATASET_COUNT = 200
+DSET_SHAPE = (10,)
+DSET_DTYPE = np.int32
+
+
+def generate_range(ds_shape: tuple):
+    # generate a tuple of random indices for one dataset
+    indices = []
+    for axis_length in ds_shape:
+        index = random.randint(0, axis_length - 1)
+        indices.append(index)
+    return tuple(indices)
+
+
+def generate_index_query(h5file):
+    # generate a list of index tuples, one per dataset in the file
+    query = []
+    for ds in h5file.values():
+        ds_shape = ds.shape
+        indices = generate_range(ds_shape)
+        query.append(indices)
+    return query
+
+
+def benchmark_multimanager(h5file, num=10):
+    """
+    Benchmark retrieving one random entry from every dataset in an h5file
+    using the MultiManager.
+    """
+    ds_names = list(h5file.keys())
+    datasets = [h5file[name] for name in ds_names]
+    mm = h5pyd.MultiManager(datasets)
+
+    # prepare queries in advance to exclude this code from the timed section
+    queries = []
+    for i in range(num):
+        query = generate_index_query(h5file)
+        queries.append(query)
+
+    # access the data
+    t0 = time.time()
+    for query in queries:
+        mm[query]
+
+    runtime = time.time() - t0
+    print(f"Mean runtime multimanager: {runtime/num:.4f} s")
+    # ~100 ms for a case with 6 datasets
+
+
+def benchmark_sequential_ds(h5file, num=10):
+    """
+    Benchmark retrieving one random entry from every dataset in
+    an h5file by sequentially looping through the datasets.
+    """
+    # prepare queries in advance to exclude this code from the timed section
+    index_lists = []
+    for i in range(num):
+        index_list = []
+        for ds in h5file.values():
+            indices = generate_range(ds.shape)
+            index_list.append(indices)
+        index_lists.append(index_list)
+
+    # access the data
+    t0 = time.time()
+    for index_list in index_lists:
+        for indices, ds in zip(index_list, h5file.values()):
+            ds[indices]
+
+    runtime = time.time() - t0
+    print(f"Mean runtime sequentially: {runtime/num:.4f} s")
+    # ~400 ms for a case with 6 datasets
+
+
+def run_benchmark(f):
+    """
+    Initialize datasets if not created previously,
+    then run the sequential and MultiManager benchmarks.
+    """
+
+    for i in range(DATASET_COUNT):
+        dset_name = f"dset_{i:04d}"
+        if dset_name not in f:
+            data = np.random.randint(0, 100, size=DSET_SHAPE, dtype=DSET_DTYPE)
+            f.create_dataset(dset_name, data=data)
+
+    benchmark_sequential_ds(f)
+
+    benchmark_multimanager(f)
+
+
+#
+# main
+#
+
+# create domain if it does not exist already
+with h5pyd.File(DOMAIN_PATH, "a") as f:
+    run_benchmark(f)
diff --git a/examples/notebooks/multi_manager_example.ipynb b/examples/notebooks/multi_manager_example.ipynb
new file mode 100644
index 0000000..10c71cd
--- /dev/null
+++ b/examples/notebooks/multi_manager_example.ipynb
@@ -0,0 +1,280 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "USE_H5PY = False  # set to True to use h5py/hdf5lib instead\n",
+    "if USE_H5PY:\n",
+    "    import h5py\n",
+    "    from h5py import MultiManager\n",
+    "else:\n",
+    "    import h5pyd as h5py  # Use the \"as\" syntax for code compatibility\n",
+    "    from h5pyd import MultiManager\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create a new file\n",
+    "f = h5py.File(\"/home/test_user1/multi_try.h5\", mode=\"w\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create some datasets\n",
+    "DSET_SHAPE = (10,)\n",
+    "DSET_DTYPE = np.int32\n",
+    "\n",
+    "# create 4 datasets\n",
+    "DSET_COUNT = 4\n",
+    "datasets = []\n",
+    "for i in range(DSET_COUNT):\n",
+    "    dset = f.create_dataset(f\"dset_{i}\", shape=DSET_SHAPE, dtype=DSET_DTYPE)\n",
+    "    datasets.append(dset)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# initialize some data to write\n",
+    "data_in = []\n",
+    "for n in range(DSET_COUNT):\n",
+    "    arr = np.zeros(DSET_SHAPE, dtype=DSET_DTYPE)\n",
+    "    arr[...] = list(range(n*100, n*100+DSET_SHAPE[0]))\n",
+    "    data_in.append(arr)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# instantiate a MultiManager and use it to write to all the datasets simultaneously\n",
+    "mm = MultiManager(datasets)\n",
+    "mm[...] = data_in"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# verify what got saved to the first dataset\n",
+    "dset = f[\"dset_0\"]\n",
+    "dset[...]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109], dtype=int32)"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# and the second dataset\n",
+    "dset = f[\"dset_1\"]\n",
+    "dset[...]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "4"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Read all the data from all the datasets using the same MultiManager instance\n",
+    "data_out = mm[...]\n",
+    "len(data_out)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# get the first item from the returned list\n",
+    "data_out[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109], dtype=int32)"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# and the second item\n",
+    "data_out[1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([0, 1, 2, 3], dtype=int32)"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# rather than reading all the data from each dataset, you can read a given selection\n",
+    "data_out = mm[0:4]\n",
+    "data_out[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# it's also possible to pass a list of selections and have each selection\n",
+    "# read from the corresponding dataset\n",
+    "selections = []\n",
+    "for n in range(DSET_COUNT):\n",
+    "    s = slice(n, n+2, 1)\n",
+    "    selections.append(s)\n",
+    "\n",
+    "data_out = mm.__getitem__(selections)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([0, 1], dtype=int32)"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data_out[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([101, 102], dtype=int32)"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data_out[1]"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}