Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactoring tests to log io behavior #31

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -109,3 +109,7 @@ venv.bak/
*.hdf5
*.nc
*.tif

*.log
notebooks/logs
notebooks/results
23 changes: 23 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
name: h5cloud
channels:
- conda-forge
dependencies:
- jupyterlab
- boto3
- tqdm
- matplotlib-base
- pandas
- numpy
- s3fs
- xarray
- dask
- distributed
- geopandas
- h5py>=3.10
- zarr
- kerchunk
- h5netcdf
- pip
- pip:
- git+https://github.com/betolink/filesystem_spec.git
- git+https://github.com/ICESat2-SlideRule/h5coro.git
43 changes: 28 additions & 15 deletions h5tests/h5coro_arr_mean.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,40 @@
from .h5test import H5Test, timer_decorator
import numpy as np
import subprocess

import numpy as np
from h5test import H5Test, timer_decorator

try:
import h5coro
except:
completed_process = subprocess.run([
'mamba', 'install', '-c', 'conda-forge', 'h5coro', '--yes'
])
completed_process = subprocess.run(
["pip", "install", "git+https://github.com/ICESat2-SlideRule/h5coro.git@main"]
)
import h5coro

from h5coro import h5coro, s3driver, filedriver
h5coro.config(errorChecking=True, verbose=False, enableAttributes=False)

from h5coro import h5coro, s3driver

driver = s3driver.S3Driver


class H5CoroArrMean(H5Test):
@timer_decorator
def run(self):
group = '/gt1l/heights'
variable = 'h_ph'
def run(self, dataset="/gt1l/heights", variable="h_ph"):
group = dataset
variable = variable
final_h5coro_array = []

for file in self.files:
h5obj = h5coro.H5Coro(file.replace("s3://", ""), s3driver.S3Driver)
output = h5obj.readDatasets(datasets=[f'{group}/{variable}'], block=True)
data = h5obj[f'{group}/{variable}'].values
final_h5coro_array = np.insert(final_h5coro_array, len(final_h5coro_array), data, axis=None)
if link.startswith("s3://nasa-cryo-persistent/"):
h5obj = h5coro.H5Coro(link.replace("s3://", ""), s3driver.S3Driver)
else:
h5obj = h5coro.H5Coro(
link.replace("s3://", ""),
s3driver.S3Driver,
credentials={"annon": True},
)
ds = h5obj.readDatasets(datasets=[f"{group}/{variable}"], block=True)
data = ds[f"{group}/{variable}"][:]
final_h5coro_array = np.insert(
final_h5coro_array, len(final_h5coro_array), data, axis=None
)
return np.mean(final_h5coro_array)
33 changes: 19 additions & 14 deletions h5tests/h5py_arr_mean.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,26 @@
from .h5test import H5Test, timer_decorator
import h5py
import numpy as np
from h5test import H5Test, fsspec_logging_decorator, timer_decorator


class H5pyArrMean(H5Test):
@timer_decorator
def run(self):
final_h5py_array = []
# TODO: Do we need to make this configurable or consistent?
group = '/gt1l/heights'
variable = 'h_ph'
@fsspec_logging_decorator
def run(self, io_params={}, dataset="/gt1l/heights", variable="h_ph"):
final_h5py_array = []
fsspec_params = {}
h5py_params = {}
if "fsspec_params" in io_params:
fsspec_params = io_params["fsspec_params"]
if "h5py_params" in io_params:
h5py_params = io_params["h5py_params"]
self.file_sizes = [self.s3_fs.info(file)["size"] for file in self.files]
for file in self.files:
with h5py.File(self.s3_fs.open(file, 'rb')) as f:
data = f[f'{group}/{variable}'][:]
# Need to test if using concatenate is faster
final_h5py_array = np.insert(
final_h5py_array,
len(final_h5py_array),
data, axis=None
)
with self.s3_fs.open(file, mode="rb", **fsspec_params) as fo:
print("h5py params: ", h5py_params)
with h5py.File(fo, **h5py_params) as f:
data = f[f"{dataset}/{variable}"][:]
final_h5py_array = np.insert(
final_h5py_array, len(final_h5py_array), data, axis=None
)
return np.mean(final_h5py_array)
44 changes: 23 additions & 21 deletions h5tests/h5py_arr_subset_mean.py
Original file line number Diff line number Diff line change
@@ -1,49 +1,51 @@
import os
import sys

from .h5test import H5Test, timer_decorator
import h5py
import numpy as np
from h5test import H5Test, fsspec_logging_decorator, timer_decorator

current = os.path.abspath('..')
current = os.path.abspath("..")
sys.path.append(current)
from helpers.geospatial import get_subset_region, get_subset_indices
from helpers.geospatial import get_subset_indices, get_subset_region


class H5pyArrSubsetMean(H5Test):

def __init__(self, data_format, geometry=None):
"""
geometry : path to geojson file containing geometry
**Could be list containing [lonmin, lonmax, latmin, latmax]**
"""
super().__init__(data_format)
self.bounds = get_subset_region(geometry)

@timer_decorator
def run(self):
final_h5py_array = []
@fsspec_logging_decorator
def run(self, io_params={}, dataset="/gt1l/heights", variable="h_ph"):
final_h5py_array = []
# TODO: Do we need to make this configurable or consistent?
group = '/gt1l/heights'
variable = 'h_ph'
if "fsspec_params" in io_params:
fsspec_params = io_params["fsspec_params"]
if "h5py_params" in io_params:
h5py_params = io_params["h5py_params"]
for file in self.files:
with h5py.File(self.s3_fs.open(file, 'rb')) as f:

lat = f[f'{group}/lat_ph'][:]
lon = f[f'{group}/lon_ph'][:]

with h5py.File(
self.s3_fs.open(file, "rb", **fsspec_params), **h5py_params
) as f:
lat = f[f"{dataset}/lat_ph"][:]
lon = f[f"{dataset}/lon_ph"][:]

idx_start, idx_end = get_subset_indices(lat, lon, self.bounds)

# Leaving this code here so that we can create a DataFrame or
# Dataset at a later date. Suggest creating dict which can be
# Dataset at a later date. Suggest creating dict which can be
# passed to xarray or (geo)pandas
# lat[idx_start:idx_end])
# lon[idx_start:idx_end])

data = f[f'{group}/{variable}'][idx_start:idx_end]
data = f[f"{dataset}/{variable}"][idx_start:idx_end]
# Need to test if using concatenate is faster
final_h5py_array = np.insert(
final_h5py_array,
len(final_h5py_array),
data, axis=None
final_h5py_array, len(final_h5py_array), data, axis=None
)
return np.mean(final_h5py_array)
return np.mean(final_h5py_array)
Loading