added multimanager example and benchmark
jreadey committed Jul 9, 2024
1 parent 5e5b864 commit 4511062
Showing 2 changed files with 384 additions and 0 deletions.
104 changes: 104 additions & 0 deletions examples/multi_mgr_benchmark.py
@@ -0,0 +1,104 @@
import h5pyd
import numpy as np
import random
import time

DOMAIN_PATH = "/home/test_user1/test/multi_mgr_benchmark.h5"
DATASET_COUNT = 200
DSET_SHAPE = (10,)
DSET_DTYPE = np.int32


def generate_range(ds_shape: tuple):
    # generate a tuple of random indices for one dataset
    indices = []
    for axis_length in ds_shape:
        index = random.randint(0, axis_length - 1)
        indices.append(index)
    return tuple(indices)


def generate_index_query(h5file):
    # generate a list of index tuples, one per dataset
    query = []
    for ds in h5file.values():
        ds_shape = ds.shape
        indices = generate_range(ds_shape)
        query.append(indices)
    return query


def benchmark_multimanager(h5file, num=10):
    """
    Benchmark retrieving one random entry from every dataset in an h5file
    using the MultiManager.
    """
    ds_names = list(h5file.keys())
    datasets = [h5file[name] for name in ds_names]
    mm = h5pyd.MultiManager(datasets)

    # prepare the queries up front to exclude this setup from the runtime
    queries = []
    for i in range(num):
        query = generate_index_query(h5file)
        queries.append(query)

    # access the data
    t0 = time.time()
    for query in queries:
        mm[query]

    runtime = time.time() - t0
    print(f"Mean runtime with MultiManager: {runtime/num:.4f} s")
    # ~100 ms for a case with 6 datasets


def benchmark_sequential_ds(h5file, num=10):
    """
    Benchmark retrieving one random entry from every dataset in
    an h5file by sequentially looping through the datasets.
    """
    # prepare the queries up front to exclude this setup from the runtime
    index_lists = []
    for i in range(num):
        index_list = []
        for ds in h5file.values():
            indices = generate_range(ds.shape)
            index_list.append(indices)
        index_lists.append(index_list)

    # access the data
    t0 = time.time()
    for index_list in index_lists:
        for indices, ds in zip(index_list, h5file.values()):
            ds[indices]

    runtime = time.time() - t0
    print(f"Mean runtime sequential: {runtime/num:.4f} s")
    # ~400 ms for a case with 6 datasets


def run_benchmark(f):
    """
    Initialize the datasets if not created previously,
    then run the sequential and MultiManager benchmarks.
    """

    for i in range(DATASET_COUNT):
        dset_name = f"dset_{i:04d}"
        if dset_name not in f:
            data = np.random.randint(0, 100, size=DSET_SHAPE, dtype=DSET_DTYPE)
            f.create_dataset(dset_name, data=data)

    benchmark_sequential_ds(f)

    benchmark_multimanager(f)


#
# main
#

# create the domain if it does not exist already
with h5pyd.File(DOMAIN_PATH, "a") as f:
    run_benchmark(f)
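
For comparison, the same MultiManager can batch writes as well as reads (the notebook below uses the mm[...] = data form). The following write-benchmark sketch is not part of the commit; the function name benchmark_multimanager_write is hypothetical, and it assumes the same imports and constants as the script above:

def benchmark_multimanager_write(h5file, num=10):
    # Sketch: time writing one full array to every dataset in a single
    # MultiManager assignment, mirroring the read benchmark above.
    datasets = [h5file[name] for name in h5file.keys()]
    mm = h5pyd.MultiManager(datasets)

    # prepare the data outside the timed section
    data_in = [np.random.randint(0, 100, size=DSET_SHAPE, dtype=DSET_DTYPE)
               for _ in datasets]

    t0 = time.time()
    for _ in range(num):
        mm[...] = data_in
    runtime = time.time() - t0
    print(f"Mean runtime MultiManager writes: {runtime/num:.4f} s")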
280 changes: 280 additions & 0 deletions examples/notebooks/multi_manager_example.ipynb
@@ -0,0 +1,280 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"USE_H5PY = False # set to True to use h5py/hdf5lib instead\n",
"if USE_H5PY:\n",
" import h5py\n",
" from h5py import MultiManager\n",
"else:\n",
" import h5pyd as h5py # Use the \"as\" syntax for code compatibility\n",
" from h5pyd import MultiManager\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# create a new file\n",
"f = h5py.File(\"/home/test_user1/multi_try.h5\", mode=\"w\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# create some datasets\n",
"DSET_SHAPE = (10,)\n",
"DSET_DTYPE = np.int32\n",
"\n",
"# create 4 datasets\n",
"DSET_COUNT = 4\n",
"datasets = []\n",
"for i in range(DSET_COUNT):\n",
" dset = f.create_dataset(f\"dset_{i}\", shape=DSET_SHAPE, dtype=DSET_DTYPE)\n",
" datasets.append(dset)\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# initialize some data to write\n",
"data_in = []\n",
"for n in range(DSET_COUNT):\n",
" arr = np.zeros(DSET_SHAPE, dtype=DSET_DTYPE)\n",
" arr[...] = list(range(n*100, n*100+DSET_SHAPE[0]))\n",
" data_in.append(arr)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# instantiate a MultiManager and use it to write to all the datasets simultaneously\n",
"mm = MultiManager(datasets)\n",
"mm[...] = data_in"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# verify what get saved to the first dataset\n",
"dset = f[\"dset_0\"]\n",
"dset[...]"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109], dtype=int32)"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# and the second dataset\n",
"dset = f[\"dset_1\"]\n",
"dset[...]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read all the data from all the daasets using the same MultiManager instance\n",
"data_out = mm[...]\n",
"len(data_out)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get the first item from the returned list\n",
"data_out[0]"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109], dtype=int32)"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# and the second item\n",
"data_out[1]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 1, 2, 3], dtype=int32)"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# rather than reading all the data for a dataset, you can read a given selection\n",
"data_out = mm[0:4]\n",
"data_out[0]"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# it's also possible to pass a list of selections and have each selection\n",
"# read from the corresponding dataset\n",
"selections = []\n",
"for n in range(DSET_COUNT):\n",
" s = slice(n, n+2, 1)\n",
" selections.append(s)\n",
"\n",
"data_out = mm.__getitem__(selections)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 1], dtype=int32)"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_out[0]"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([101, 102], dtype=int32)"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_out[1]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
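
As a point of reference (not part of the notebook), passing a list of selections reads selection n from dataset n, as the outputs above show ([0, 1] from dset_0, [101, 102] from dset_1). A minimal sketch of the equivalent sequential loop, assuming the datasets, selections, and mm objects from the cells above:

# apply each selection to the dataset at the same position, one request at a time
data_out_seq = [ds[sel] for ds, sel in zip(datasets, selections)]

# the batched MultiManager read should return the same arrays
for a, b in zip(data_out_seq, mm.__getitem__(selections)):
    assert (a == b).all()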
