Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add MultiManager to h5pyd __init__.py #185

Merged
merged 4 commits into from
Apr 24, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion h5pyd/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from ._hl.files import File, is_hdf5
from ._hl.folders import Folder
from ._hl.group import Group, SoftLink, ExternalLink, UserDefinedLink, HardLink
from ._hl.dataset import Dataset
from ._hl.dataset import Dataset, MultiManager
from ._hl.table import Table
from ._hl.datatype import Datatype
from ._hl.attrs import AttributeManager
Expand Down
7 changes: 6 additions & 1 deletion h5pyd/_hl/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import base64
import numpy
import os
import logging
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import as_completed

Expand Down Expand Up @@ -1741,10 +1742,14 @@ class MultiManager():
# Avoid overtaxing HSDS
max_workers = 16

def __init__(self, datasets=None):
def __init__(self, datasets=None, logger=None):
if (datasets is None) or (len(datasets) == 0):
raise ValueError("MultiManager requires non-empty list of datasets")
self.datasets = datasets
if logger is None:
self.log = logging
else:
self.log = logging.getLogger(logger)

def read_dset_tl(self, args):
"""
Expand Down
52 changes: 43 additions & 9 deletions test/hl/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@

from __future__ import absolute_import

import sys
import os
import os.path as op
import tempfile
Expand All @@ -39,6 +38,32 @@
del testfile


def getTestFileName(basename, subfolder=None):
"""
Get filepath for a test case given a testname
"""

if config.get("use_h5py"):
filename = "out"
if not op.isdir(filename):
os.mkdir(filename)
if subfolder:
filename = op.join(filename, subfolder)
if not op.isdir(filename):
os.mkdir(filename)
filename = op.join(filename, f"{basename}.h5")
else:
if "H5PYD_TEST_FOLDER" in os.environ:
filename = os.environ["H5PYD_TEST_FOLDER"]
else:
# default to the root folder
filename = "/"
if subfolder:
filename = op.join(filename, subfolder)
filename = op.join(filename, f"{basename}.h5")
return filename


class TestCase(ut.TestCase):

"""
Expand Down Expand Up @@ -201,23 +226,32 @@ def assertNumpyBehavior(self, dset, arr, s):
with self.assertRaises(exc):
dset[s]

def getFileName(self, basename):
def getFileName(self, basename, subfolder=None):
"""
Get filepath for a test case given a testname
"""

# Just call the external function
filename = getTestFileName(basename, subfolder=subfolder)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

getFileName invokes getTestFileName, but then overwrites the result to redo the same work.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I made the change so getTestFileName can be invoked outside the TestCase class, but forgot to remove the duplicate code. Fixed now.


if config.get("use_h5py"):
if not op.isdir("out"):
os.mkdir("out")
filename = "out/" + basename + ".h5"
filename = "out"
if not op.isdir(filename):
os.mkdir(filename)
if subfolder:
filename = op.join(filename, subfolder)
if not op.isdir(filename):
os.mkdir(filename)
filename = op.join(filename, f"{basename}.h5")
else:
if "H5PYD_TEST_FOLDER" in os.environ:
domain = os.environ["H5PYD_TEST_FOLDER"]
filename = os.environ["H5PYD_TEST_FOLDER"]
else:
# default to the root folder
domain = "/"
filename = op.join(domain, basename)
filename += ".h5"
filename = "/"
if subfolder:
filename = op.join(filename, subfolder)
filename = op.join(filename, f"{basename}.h5")
return filename

def getPathFromDomain(self, domain):
Expand Down
14 changes: 11 additions & 3 deletions test/hl/multi_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@
import subprocess
import re

from h5pyd._hl.dataset import MultiManager
from h5pyd import MultiManager
import h5pyd as h5py
from common import getTestFileName

# Flag to stop resource usage collection thread after a benchmark finishes
stop_stat_collection = False
Expand Down Expand Up @@ -249,7 +250,13 @@ def run_benchmark(test_name, test_func, stats, datasets, num_iters):
dt = np.int32
stats = {}

fs = [h5py.File("/home/test_user1/h5pyd_multi_bm_" + str(i), mode='w') for i in range(count)]
fs = []

for i in range(count):
filename = getTestFileName(f"bm_{i:04d}", subfolder="multi_bm")
f = h5py.File(filename, mode='w')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the user hasn't specifically created the multi_bm, this will fail. We should either create that here or check for its existence to throw a helpful error

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated to print message on IOError.

fs.append(f)

data_in = np.zeros(shape, dtype=dt)
datasets = [f.create_dataset("data", shape, dtype=dt, data=data_in) for f in fs]

Expand All @@ -266,7 +273,8 @@ def run_benchmark(test_name, test_func, stats, datasets, num_iters):

print("Testing with shared HTTP connection...")

f = h5py.File("/home/test_user1/h5pyd_multi_bm_shared", mode='w')
filename = getTestFileName("bm_shared", subfolder="multi_bm")
f = h5py.File(filename, mode='w')
datasets = [f.create_dataset("data" + str(i), data=data_in, dtype=dt) for i in range(count)]

run_benchmark("Read Multi (Shared HttpConn)", read_datasets_multi, stats, datasets, num_iters)
Expand Down
4 changes: 2 additions & 2 deletions test/hl/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,10 @@
import sys
import numpy as np
import platform
import warnings

from common import ut, TestCase
from h5pyd._hl.dataset import MultiManager
import config
from h5pyd import MultiManager

if config.get("use_h5py"):
from h5py import File, Dataset
Expand All @@ -39,6 +38,7 @@

def is_empty_dataspace(obj):
shape_json = obj.shape_json

if "class" not in shape_json:
raise KeyError()
if shape_json["class"] == 'H5S_NULL':
Expand Down
2 changes: 0 additions & 2 deletions test/hl/test_dataset_extend.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@
##############################################################################

import logging
import numpy as np
import math

import config

Expand Down
1 change: 0 additions & 1 deletion test/hl/test_dataset_fancyselect.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
##############################################################################

import numpy as np
import math

import config

Expand Down
1 change: 0 additions & 1 deletion test/hl/test_datatype.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
##############################################################################

import numpy as np
import math
import logging
import config

Expand Down