From 5710666029bbcd37f6a3c226dd1022427ff52248 Mon Sep 17 00:00:00 2001
From: John Readey
Date: Mon, 29 Apr 2024 08:47:58 -0700
Subject: [PATCH] Add codespace configuration for h5pyd (#187)

* added codespace config
* added requirements.txt
* revised examples
* move requirements.txt to .devcontainer
* support s3fs for anonymous creds
* set hsds loglevel to ERROR
* updated notebook environment
* updated notebook markdown
* remove extraneous .domain.json file
---
 .devcontainer/devcontainer.json               |  29 ++
 .devcontainer/docker-compose.yaml             | 107 +++++++
 .devcontainer/requirements.txt                |  40 +++
 .gitignore                                    |   3 +
 .hsds/config/override.yml                     |   9 +
 .hsds/config/passwd.txt                       |  13 +
 .hsds/data/hsdstest/home/.domain.json         |   1 +
 .../hsdstest/home/test_user1/.domain.json     |   1 +
 .../home/test_user1/h5pyd_test/.domain.json   |   1 +
 README.rst                                    |  17 +-
 environment.yml                               |  10 +-
 examples/notebooks/read_nrel_nsrdb.ipynb      | 276 ++++++++++++++++++
 examples/notebooks/tasmax.ipynb               | 153 ----------
 examples/read_example.py                      |  68 ++++-
 examples/write_example.py                     |  56 ++--
 h5pyd/_apps/hsload.py                         |  14 +-
 16 files changed, 594 insertions(+), 204 deletions(-)
 create mode 100644 .devcontainer/devcontainer.json
 create mode 100644 .devcontainer/docker-compose.yaml
 create mode 100644 .devcontainer/requirements.txt
 create mode 100644 .hsds/config/override.yml
 create mode 100644 .hsds/config/passwd.txt
 create mode 100644 .hsds/data/hsdstest/home/.domain.json
 create mode 100644 .hsds/data/hsdstest/home/test_user1/.domain.json
 create mode 100644 .hsds/data/hsdstest/home/test_user1/h5pyd_test/.domain.json
 create mode 100644 examples/notebooks/read_nrel_nsrdb.ipynb
 delete mode 100644 examples/notebooks/tasmax.ipynb

diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 00000000..95af1f66
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,29 @@
+{
+    "name": "h5pyd with HSDS",
+    "hostRequirements": {
+        "cpus": 4
+    },
+    "dockerComposeFile": "docker-compose.yaml",
+    "updateContentCommand": "python3 -m pip install -r .devcontainer/requirements.txt",
+    "postCreateCommand": "",
+    "service": "app",
+    "workspaceFolder": "/workspace",
+    "forwardPorts": [5101],
+    "portsAttributes": {
+        "5101": {"label": "HSDS port", "onAutoForward": "silent"}
+    },
+    "features": {
+        "ghcr.io/devcontainers/features/docker-outside-of-docker": {}
+    },
+    "customizations": {
+        "codespaces": {
+            "openFiles": []
+        },
+        "vscode": {
+            "extensions": [
+                "ms-python.python",
+                "ms-toolsai.jupyter"
+            ]
+        }
+    }
+}
\ No newline at end of file
diff --git a/.devcontainer/docker-compose.yaml b/.devcontainer/docker-compose.yaml
new file mode 100644
index 00000000..933dbea8
--- /dev/null
+++ b/.devcontainer/docker-compose.yaml
@@ -0,0 +1,107 @@
+version: "3"
+services:
+  app:
+    image: "mcr.microsoft.com/devcontainers/universal:2"
+    environment:
+      - HS_ENDPOINT=http://localhost:5101
+      - HS_USERNAME=test_user1
+      - HS_PASSWORD=test
+      - H5PYD_TEST_FOLDER=/home/test_user1/h5pyd_test/
+    volumes:
+      - ..:/workspace:cached
+
+    # Overrides default command so things don't shut down after the process ends.
+    command: sleep infinity
+
+    # Runs app on the same network as the SN container; this allows "forwardPorts" in devcontainer.json to function.
+ network_mode: service:sn + head: + image: hdfgroup/hsds:master + restart: on-failure + mem_limit: 512m + environment: + - TARGET_SN_COUNT=1 + - TARGET_DN_COUNT=4 + - NODE_TYPE=head_node + ports: + - 5100 + volumes: + - ../.hsds/data:/data + - ../.hsds/config/:/config/ + dn1: + image: hdfgroup/hsds:master + restart: on-failure + mem_limit: 1g + environment: + - NODE_TYPE=dn + - ROOT_DIR=/data + ports: + - 6101 + depends_on: + - head + volumes: + - ../.hsds/data:/data + - ../.hsds/config/:/config/ + links: + - head + dn2: + image: hdfgroup/hsds:master + restart: on-failure + mem_limit: 1g + environment: + - NODE_TYPE=dn + ports: + - 6102 + depends_on: + - head + volumes: + - ../.hsds/data:/data + - ../.hsds/config/:/config/ + links: + - head + dn3: + image: hdfgroup/hsds:master + restart: on-failure + mem_limit: 1g + environment: + - NODE_TYPE=dn + ports: + - 6103 + depends_on: + - head + volumes: + - ../.hsds/data:/data + - ../.hsds/config/:/config/ + links: + - head + dn4: + image: hdfgroup/hsds:master + restart: on-failure + mem_limit: 1g + environment: + - NODE_TYPE=dn + ports: + - 6104 + depends_on: + - head + volumes: + - ../.hsds/data:/data + - ../.hsds/config/:/config/ + links: + - head + sn: + image: hdfgroup/hsds:master + restart: on-failure + mem_limit: 1g + environment: + - SN_PORT=5101 + - NODE_TYPE=sn + ports: + - 5101:5101 + depends_on: + - head + volumes: + - ../.hsds/data:/data + - ../.hsds/config/:/config/ + links: + - head diff --git a/.devcontainer/requirements.txt b/.devcontainer/requirements.txt new file mode 100644 index 00000000..7d41991c --- /dev/null +++ b/.devcontainer/requirements.txt @@ -0,0 +1,40 @@ +aiobotocore==2.12.3 +aiohttp==3.9.5 +aioitertools==0.11.0 +aiosignal==1.3.1 +attrs==23.2.0 +botocore==1.34.69 +build==1.2.1 +certifi==2024.2.2 +charset-normalizer==3.3.2 +contourpy==1.2.1 +cycler==0.12.1 +flake8==7.0.0 +fonttools==4.51.0 +frozenlist==1.4.1 +fsspec==2024.3.1 +h5py==3.11.0 +h5pyd @ git+https://github.com/hdfgroup/h5pyd +idna==3.7 +jmespath==1.0.1 +kiwisolver==1.4.5 +matplotlib==3.8.4 +mccabe==0.7.0 +multidict==6.0.5 +numpy==1.26.4 +packaging==24.0 +pillow==10.3.0 +pycodestyle==2.11.1 +pyflakes==3.2.0 +PyJWT==2.8.0 +pyparsing==3.1.2 +pyproject_hooks==1.0.0 +python-dateutil==2.9.0.post0 +pytz==2024.1 +requests==2.31.0 +requests-unixsocket==0.3.0 +s3fs==2024.3.1 +six==1.16.0 +urllib3==2.2.1 +wrapt==1.16.0 +yarl==1.9.4 diff --git a/.gitignore b/.gitignore index 9f4c2f4f..03c3526c 100644 --- a/.gitignore +++ b/.gitignore @@ -66,3 +66,6 @@ target/ # macOS stuff .DS_Store + +# hsds data +data/ diff --git a/.hsds/config/override.yml b/.hsds/config/override.yml new file mode 100644 index 00000000..d94b9c5b --- /dev/null +++ b/.hsds/config/override.yml @@ -0,0 +1,9 @@ +log_level: ERROR +bucket_name: hsdstest +server_name: "HSDS for Github codespaces" +aws_region: us-west-2 # (original was us-east-1) +aws_s3_gateway: http://s3.us-west-2.amazonaws.com/ # (original was null) +aws_s3_no_sign_request: True # (original was false) +root_dir: /data + + diff --git a/.hsds/config/passwd.txt b/.hsds/config/passwd.txt new file mode 100644 index 00000000..fc4dc113 --- /dev/null +++ b/.hsds/config/passwd.txt @@ -0,0 +1,13 @@ +# HSDS password file template +# +# +# This file contains a list of usernames/passwords that will be used to authenticate +# requests to HSDS. +# If using HTTP Basic Auth, copy file to "passwd.txt" in the same directory before deploying HSDS. 
+# Otherwise, if using Azure Active Directory or Kerberos, don't copy this file - usernames will be
+# authenticated using those identity providers.
+# For production use, replace the "test" passwords below with secret passwords and add
+# any new accounts desired.
+admin:admin
+test_user1:test
+test_user2:test
diff --git a/.hsds/data/hsdstest/home/.domain.json b/.hsds/data/hsdstest/home/.domain.json
new file mode 100644
index 00000000..d6f91997
--- /dev/null
+++ b/.hsds/data/hsdstest/home/.domain.json
@@ -0,0 +1 @@
+{"owner": "admin", "acls": {"admin": {"create": true, "read": true, "update": true, "delete": true, "readACL": true, "updateACL": true}, "default": {"create": false, "read": true, "update": false, "delete": false, "readACL": false, "updateACL": false}}, "created": 1708897646.0599918, "lastModified": 1708897646.0599918}
\ No newline at end of file
diff --git a/.hsds/data/hsdstest/home/test_user1/.domain.json b/.hsds/data/hsdstest/home/test_user1/.domain.json
new file mode 100644
index 00000000..9bd46f8d
--- /dev/null
+++ b/.hsds/data/hsdstest/home/test_user1/.domain.json
@@ -0,0 +1 @@
+{"owner": "test_user1", "acls": {"test_user1": {"create": true, "read": true, "update": true, "delete": true, "readACL": true, "updateACL": true}, "default": {"create": false, "read": true, "update": false, "delete": false, "readACL": false, "updateACL": false}}, "created": 1711992550.3733413, "lastModified": 1711992550.3733413}
\ No newline at end of file
diff --git a/.hsds/data/hsdstest/home/test_user1/h5pyd_test/.domain.json b/.hsds/data/hsdstest/home/test_user1/h5pyd_test/.domain.json
new file mode 100644
index 00000000..ce715d4a
--- /dev/null
+++ b/.hsds/data/hsdstest/home/test_user1/h5pyd_test/.domain.json
@@ -0,0 +1 @@
+{"owner": "test_user1", "acls": {"test_user1": {"create": true, "read": true, "update": true, "delete": true, "readACL": true, "updateACL": true}, "default": {"create": false, "read": true, "update": false, "delete": false, "readACL": false, "updateACL": false}}, "created": 1711992550.3733413, "lastModified": 1711992550.3733413}
diff --git a/README.rst b/README.rst
index c4b75ecf..1d6554ba 100644
--- a/README.rst
+++ b/README.rst
@@ -1,16 +1,19 @@
+.. image:: https://github.com/codespaces/badge.svg
+   :target: https://codespaces.new/hdfgroup/h5pyd
+
 h5pyd
 =====
 
 .. image:: https://travis-ci.org/HDFGroup/h5pyd.svg?branch=master
     :target: https://travis-ci.org/HDFGroup/h5pyd
 
-Python client library for HDF5 REST interface
+Python client library for HSDS
 
 Introduction
 ------------
-This repository contains library, test, and examples of h5pyd - a Python package for the
-HDF REST interface.
+This repository contains the library, tests, and examples for h5pyd - a Python package for HSDS
+(Highly Scalable Data Service) or other HDF REST-compatible web services.
 
 The h5pyd library provides a high-level interface to the REST specification that
 is generally easier to use than invoking http calls directly.
@@ -31,7 +34,6 @@ Related Projects
 ----------------
 
 * HSDS: https://github.com/HDFGroup/hsds
-* HDF Server: https://github.com/HDFGroup/h5serv
 * HDF5-JSON: https://github.com/HDFGroup/hdf5-json
 * h5py: https://github.com/h5py/h5py
 * REST API Documentation: https://github.com/HDFGroup/hdf-rest-api
@@ -50,10 +52,17 @@ From a release tarball or Git checkout::
 
 By default the examples look for a local instance of HSDS.
 See https://github.com/HDFGroup/hsds for instructions on installing and running HSDS.
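+For example, once HSDS is running you can verify the connection with the
+``hsinfo`` command that h5pyd installs::
+
+    $ hsinfo
+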
-These tests are also to designed to work with HSDS (see https://github.com/HDFGroup/hsds). Install HSDS locally, or set environment variables (see next section)
-to point to an existing HSDS instance.
+These tests are also designed to work with HSDS (see https://github.com/HDFGroup/hsds).
+Install HSDS locally, or set environment variables (see next section)
+to point to an existing HSDS instance. Alternatively, if you create a GitHub codespace
+(see link above), HSDS will be automatically set up as part of the codespace.
 
-h5pyd can all be run in serverless mode with either AWS Lambda or direct mode (storage system accessed directly).
+h5pyd can also be run in serverless mode with either AWS Lambda or direct mode (storage system accessed directly).
 To use with AWS Lambda, set the HS_ENDPOINT to: "http+lambda://hslambda" where "hslambda" is the name of the lambda function.
 When using AWS Lambda some additional environment variables need to be set:
diff --git a/environment.yml b/environment.yml
index eff3ee91..574ae93f 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,15 +1,15 @@
-name: hsds
+name: h5pyd-tutorial
 channels:
   - conda-forge
   - defaults
 dependencies:
-  - python=3.6
+  - python=3.11
   - h5py
   - jupyter
+  - matplotlib
   - nb_conda_kernels
   - pytz
   - requests
-  - matplotlib
+  - s3fs
   - pip:
-    - h5pyd #latest release on PyPi
-# - git+https://github.com/HDFGroup/h5pyd.git #latest from master
+    - h5pyd @ git+https://github.com/hdfgroup/h5pyd  # latest from master
diff --git a/examples/notebooks/read_nrel_nsrdb.ipynb b/examples/notebooks/read_nrel_nsrdb.ipynb
new file mode 100644
index 00000000..a02adce9
--- /dev/null
+++ b/examples/notebooks/read_nrel_nsrdb.ipynb
@@ -0,0 +1,276 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## HDF5 data in the cloud\n",
+    "\n",
+    "Many large data collections are hosted in the cloud and are freely available.\n",
+    "See, for example: https://registry.opendata.aws/\n",
+    "\n",
+    "HDF5 data stored in AWS S3 can be accessed directly with h5py and s3fs, or\n",
+    "through HSDS (Highly Scalable Data Service) and h5pyd (an h5py-API-compatible package for HSDS).\n",
+    "\n",
+    "This notebook illustrates accessing the NREL NSRDB (National Solar Radiation Database) using both h5pyd\n",
+    "and h5py.\n",
+    "\n",
+    "By running this notebook in codespaces, data access will generally be faster, since the bulk of\n",
+    "the data transfer happens on a high-speed internet backbone. The data is physically located in\n",
+    "the AWS us-west-2 region, so access might be somewhat faster if you select us-west when creating\n",
+    "the codespace.\n",
+    "\n",
+    "Once the codespace environment is ready, you can start running the notebook cells\n",
+    "(place the cursor in a code cell and press `Ctrl+Enter` or `Shift+Enter`).\n",
+    "When prompted for a Python kernel, select\n",
+    "\n",
+    "```\n",
+    "Python 3.10.13 ~/python/current/bin/python3 Recommended\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%matplotlib inline\n",
+    "USE_H5PY = False  # set to True to use h5py/hdf5lib instead\n",
+    "if USE_H5PY:\n",
+    "    import h5py\n",
+    "    import s3fs  # This package enables h5py to \"see\" S3 files as read-only posix files\n",
+    "else:\n",
+    "    import h5pyd as h5py  # Use the \"as\" syntax for code compatibility\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# hsls is an h5pyd utility to list HSDS domains\n",
+    "# In the shell, use the --bucket option to list files from NREL's S3 bucket\n",
+    "# run with \"-r\" option to see all domains\n",
+    "! hsls --bucket s3://nrel-pds-hsds /nrel/nsrdb/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Drill down to the conus directory. Use -H and -v options to show the file sizes\n",
+    "# Downloading one of these files would take over a month with a standard\n",
+    "# broadband internet connection!\n",
+    "\n",
+    "! hsls -H -v --bucket s3://nrel-pds-hsds /nrel/nsrdb/conus/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "# Open one of the nsrdb files. Use the bucket param to get the data from NREL's S3 bucket\n",
+    "if USE_H5PY:\n",
+    "    s3 = s3fs.S3FileSystem(anon=True)\n",
+    "    f = h5py.File(s3.open(\"s3://nrel-pds-nsrdb/conus/nsrdb_conus_pv_2022.h5\", \"rb\"), \"r\")\n",
+    "else:\n",
+    "    f = h5py.File(\"/nrel/nsrdb/conus/nsrdb_conus_2022.h5\", bucket=\"s3://nrel-pds-hsds\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# attributes can be used to provide descriptions of the content\n",
+    "%time f.attrs['version']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "list(f)  # datasets under root group"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dset = f[\"air_temperature\"]\n",
+    "dset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# each dataset has an id\n",
+    "dset.id.id"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dset.shape  # two-dimensional: time x station_index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get the chunk shape\n",
+    "dset.chunks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# compute the number of bytes per chunk (about 2 MB)\n",
+    "np.prod(dset.chunks) * dset.dtype.itemsize"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# compute the number of chunks in the dataset\n",
+    "(dset.shape[0] // dset.chunks[0]) * (dset.shape[1] // dset.chunks[1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# read one year of measurements for a given station index\n",
+    "# this will require reading ~100MB from S3\n",
+    "%time tseries = dset[:, 1234567]\n",
+    "tseries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get min, max, and mean values\n",
+    "tseries.min(), tseries.max(), tseries.mean()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# plot the data\n",
+    "x = range(len(tseries))\n",
+    "plt.plot(x, tseries)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# This dataset is actually linked from an HDF5 file in a different bucket\n",
+    "if USE_H5PY:\n",
+    "    # this property doesn't exist for h5py\n",
+    "    layout = None\n",
+    "else:\n",
+    "    layout = dset.id.layout\n",
+    "layout"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# The HSDS domain actually maps to several different HDF5 files\n",
+    "# compile a list of all the files\n",
+    "hdf5_files = set()\n",
+    "if not USE_H5PY:\n",
+    "    for k in f:\n",
+    "        dset = f[k]\n",
+    "        layout = dset.id.layout\n",
+    "        if \"file_uri\" in layout:\n",
+    "            hdf5_files.add(layout[\"file_uri\"])\n",
+    "hdf5_files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "py39",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/examples/notebooks/tasmax.ipynb b/examples/notebooks/tasmax.ipynb
deleted file mode 100644
index 32e617f6..00000000
--- a/examples/notebooks/tasmax.ipynb
+++ /dev/null
@@ -1,153 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "%matplotlib inline\n",
-    "import h5pyd\n",
-    "import numpy as np\n",
-    "import matplotlib.pyplot as plt\n",
-    "import matplotlib.image as mpimg"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "domain = \"tasmax_day_BCSD_rcp45_r1i1p1_CanESM2_2050.nasa.data.hdfgroup.org\"\n",
-    "endpoint = \"https://data.hdfgroup.org:7258\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "f = h5pyd.File(domain, 'r', endpoint=endpoint)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "dset = f[\"/tasmax\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "dset.shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "dset.dtype"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "data = dset[0,:,:] \n",
-    "data = np.squeeze(data) # reduce to 2-d\n",
-    "data.shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "plt.imshow(np.clip(data, 200, 320), origin='lower')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "data = dset[200,:,:] \n",
-    "data = np.squeeze(data) # reduce to 2-d\n",
-    "data.shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "plt.imshow(np.clip(data, 200, 320), origin='lower')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.4.5"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}
diff --git a/examples/read_example.py b/examples/read_example.py
index eeb66ba9..3f32f533 100755
--- a/examples/read_example.py
+++ b/examples/read_example.py
@@ -12,32 +12,68 @@
 
 import h5pyd as h5py
+from pathlib import Path
+import subprocess
+import sys
+
+DOMAIN_PATH = "/home/test_user1/test/tall.h5"
+
+
+def load_file():
+    """ Load the HDF5 file from the S3 sample bucket to HSDS """
+
+    path = Path(DOMAIN_PATH)
+    parent_path = str(path.parent) + "/"  # HSDS folders must end with a '/'
+    try:
+        h5py.Folder(parent_path, mode="x")  # 'x' mode will create the folder if not present
+    except IOError as ioe:
+        print("ioe:", ioe)
+        sys.exit(1)
+
+    # run hsload
+    s3_uri = "s3://hdf5.sample/data/hdf5test/tall.h5"
+    run_cmd = ["hsload", s3_uri, parent_path]
+    print("running command:", " ".join(run_cmd))
+    result = subprocess.run(run_cmd)
+    if result.returncode != 0:
+        print(f"unable to hsload {s3_uri}, error: {result.returncode}")
+        sys.exit(1)
+    print("hsload complete")
+    # now we should be able to open the domain
+    f = h5py.File(DOMAIN_PATH)
+    return f
 
 
 def visit_item(name):
-    print("visit:", name)
+    print(f"    {name}")
     return None
 
 
 def find_g1_2(name):
-    print("visit:", name)
+    print(f"    {name}")
     if name.endswith("g1.2"):
         return True  # stop iteration
 
 
 def visit_item_obj(name, obj):
-    print("visit:", name, obj.id.id)
+    print(f"    {name:20s} id: {obj.id.id}")
     return None
 
 
-print("version:", h5py.version.version)
+# print the h5pyd version
+print("h5pyd version:", h5py.version.version)
 
-# this is the path specified in the "post-install instructions"
-# see: "Test Data Setup" in:
-# https://github.com/HDFGroup/hsds/blob/master/docs/post_install.md
-DOMAIN_PATH = "/home/test_user1/test/tall.h5"
 print("opening domain:", DOMAIN_PATH)
 
-f = h5py.File(DOMAIN_PATH, "r")
+try:
+    f = h5py.File(DOMAIN_PATH, "r")
+except IOError as ioe:
+    if ioe.errno in (404, 410):
+        # file hasn't been loaded into HSDS, get it now
+        f = load_file()
+    else:
+        print(f"unexpected error opening {DOMAIN_PATH}: {ioe}")
+        sys.exit(1)
 
 print("name:", f.name)
 print("id:", f.id.id)
@@ -71,20 +107,20 @@
 
 arr = dset111[...]
 print("dset111 values:", arr)
-
 attr1 = dset111.attrs['attr1']
 print("attr1:", attr1)
 print("num attrs of dset1.1.1:", len(dset111.attrs))
-print("attr keys:", dset111.attrs.keys())
+print("dset1.1.1 attributes:")
 
-for attr in dset111.attrs:
-    print("name:", attr)
+for k in dset111.attrs:
+    v = dset111.attrs[k]
+    print(f"    {k}: {v}")
 
-print("visit...")
+print("\nvisit...")
 f.visit(visit_item)
 
-print("visititems...")
+print("\nvisititems...")
 f.visititems(visit_item_obj)
 
-print("search g1.2:")
+print("\nsearch g1.2:")
 f.visit(find_g1_2)
diff --git a/examples/write_example.py b/examples/write_example.py
index 13f72289..ae01ecf9 100644
--- a/examples/write_example.py
+++ b/examples/write_example.py
@@ -10,7 +10,6 @@
 #     request a copy from help@hdfgroup.org.                                 #
 ##############################################################################
 from __future__ import print_function
-import math
 import numpy as np
 import logging
 import time
@@ -29,8 +28,9 @@
     s = f"usage: python {sys.argv[0]} "
     s += "[--ncols=n] "
     s += "[--nrows=m] "
-    s += "[--comp=none|gzip]"
-    s += "[--loglevel=debug|info|warn|error]"
+    s += "[--comp=none|gzip] "
+    s += "[--loglevel=debug|info|warn|error] "
+    s += "<filepath>"
     print(s)
     print("for filepath, use hdf5:// prefix for h5pyd, posix path for h5py")
     print("defaults...")
@@ -101,40 +101,54 @@
 
 if "dset2d" not in f:
     # create dataset
-    f.create_dataset("dset2d", (nrows, ncols), dtype=dt, chunks=CHUNKS, compression=compression)
+    shape = (nrows, ncols)
+    chunks = [nrows, ncols]
+    # chunk shape can't be larger than the dataset extents
+    for i in range(2):
+        if shape[i] > CHUNKS[i]:
+            chunks[i] = CHUNKS[i]
+    chunks = tuple(chunks)
+
+    f.create_dataset("dset2d", shape, dtype=dt, chunks=chunks, compression=compression)
 
 dset2d = f["dset2d"]
+print(dset2d)
+
+rng = np.random.default_rng()  # initialize random number generator
+
 # initialize numpy array to test values
 print("initializing data")
+ts = time.time()  # starting time for the write
+
+for s in dset2d.iter_chunks():
+    # s is a pair of slices (one per dimension)
+    n = s[0].stop - s[0].start
+    m = s[1].stop - s[1].start
+    element_cnt = n * m
+    # get random values less than 1000 so the values are compressible
+    arr = rng.integers(low=0, high=1000, size=element_cnt)
+    arr2d = arr.reshape((n, m))  # reshape to two-dimensional
+    logging.info(f"writing selection: {s} data range: {arr2d.min()} - {arr2d.max()}")
+    dset2d[s] = arr2d[:, :]
 
-arr = np.zeros((nrows, ncols), dtype=dt)
-exp = int(math.log10(ncols)) + 1
-for i in range(nrows):
-    row_start_value = i * 10 ** exp
-    for j in range(ncols):
-        arr[i, j] = row_start_value + j + 1
-
-print("writing...")
-num_bytes = nrows * ncols * dt.itemsize
-ts = time.time()
-dset2d[:, :] = arr[:, :]
 elapsed = time.time() - ts
+print("done writing")
+num_bytes = nrows * ncols * dt.itemsize
+
 mb_per_sec = num_bytes / (1024 * 1024 * elapsed)
 print(f"  elapsed: {elapsed:.2f} s, {mb_per_sec:.2f} MB/s")
 
 # read back the data as binary
 print("reading...")
+
 ts = time.time()
-arr_copy = dset2d[:, :]
+for s in dset2d.iter_chunks():
+    arr = dset2d[s]
+    logging.info(f"reading selection: {s} data range: {arr.min()} - {arr.max()}")
 elapsed = time.time() - ts
 
 mb_per_sec = num_bytes / (1024 * 1024 * elapsed)
 print(f"  elapsed: {elapsed:.2f} s, {mb_per_sec:.2f} MB/s")
 
-if not np.array_equal(arr, arr_copy):
-    print("arrays don't match!")
-else:
-    print("passed!")
-
 f.close()
diff --git a/h5pyd/_apps/hsload.py b/h5pyd/_apps/hsload.py
index 8fb72eec..f56ea78f 100755
--- a/h5pyd/_apps/hsload.py
+++ b/h5pyd/_apps/hsload.py
@@ -192,14 +192,24 @@ def main():
         sys.exit(1)
 
     if not s3:
+        kwargs = {"use_ssl": False}
         key = os.environ.get("AWS_ACCESS_KEY_ID")
         secret = os.environ.get("AWS_SECRET_ACCESS_KEY")
-        aws_s3_gateway = os.environ.get("AWS_GATEWAY")
+
+        if not key or not secret:
+            kwargs["anon"] = True
+        else:
+            kwargs["key"] = key
+            kwargs["secret"] = secret
         client_kwargs = {}
+        aws_s3_gateway = os.environ.get("AWS_GATEWAY")
         if aws_s3_gateway:
             client_kwargs["endpoint_url"] = aws_s3_gateway
+        if client_kwargs:
+            kwargs["client_kwargs"] = client_kwargs
 
-        s3 = s3fs.S3FileSystem(use_ssl=False, key=key, secret=secret, client_kwargs=client_kwargs)
+        s3 = s3fs.S3FileSystem(**kwargs)
     try:
         fin = h5py.File(s3.open(src_file, "rb"), mode="r")
     except IOError as ioe:
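
Note on the s3fs change above: when AWS credentials aren't present in the
environment, hsload now falls back to anonymous (unsigned) S3 access. A
minimal standalone sketch of the same pattern, reusing the public sample
object that read_example.py loads (this assumes the bucket permits anonymous
reads; the object path is otherwise arbitrary):

    import os

    import h5py
    import s3fs

    # Mirror the credential logic from the patched hsload: use explicit
    # credentials when both are present, otherwise request anonymous access.
    kwargs = {"use_ssl": False}
    key = os.environ.get("AWS_ACCESS_KEY_ID")
    secret = os.environ.get("AWS_SECRET_ACCESS_KEY")
    if not key or not secret:
        kwargs["anon"] = True  # only works for publicly readable buckets
    else:
        kwargs["key"] = key
        kwargs["secret"] = secret
    s3 = s3fs.S3FileSystem(**kwargs)

    # h5py can read from any seekable file-like object, so open the S3
    # object with s3fs and hand it to h5py.File directly.
    with h5py.File(s3.open("s3://hdf5.sample/data/hdf5test/tall.h5", "rb"), mode="r") as f:
        print(list(f))  # names of the root group's members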