From 77e11c269c12e77770c22323468c6a964452a2fd Mon Sep 17 00:00:00 2001 From: betolink Date: Sun, 28 Jan 2024 20:25:09 -0600 Subject: [PATCH 01/11] refactoring tests to log io behavior --- h5tests/h5py_arr_mean.py | 32 +- h5tests/h5test.py | 88 +++++- h5tests/single-test.ipynb | 44 +-- h5tests/xarray_arr_mean.py | 49 +++- helpers/links-old.json | 53 ++++ helpers/links.py | 2 +- helpers/s3filelinks.json | 103 ++++--- notebooks/logs-fsspec.ipynb | 568 ++++++++++++++++++++++++++++++++++++ 8 files changed, 822 insertions(+), 117 deletions(-) create mode 100644 helpers/links-old.json create mode 100644 notebooks/logs-fsspec.ipynb diff --git a/h5tests/h5py_arr_mean.py b/h5tests/h5py_arr_mean.py index 7c35407..1d5a01c 100644 --- a/h5tests/h5py_arr_mean.py +++ b/h5tests/h5py_arr_mean.py @@ -1,21 +1,27 @@ -from .h5test import H5Test, timer_decorator import h5py import numpy as np +from .h5test import H5Test, timer_decorator + + class H5pyArrMean(H5Test): @timer_decorator - def run(self): - final_h5py_array = [] + def run(self, io_params={}): + final_h5py_array = [] # TODO: Do we need to make this configurable or consistent? 
- group = '/gt1l/heights' - variable = 'h_ph' + group = "/gt1l/heights" + variable = "h_ph" + fsspec_params = {} + h5py_params = {} + if "fsspec_params" in io_params: + fsspec_params = io_params["fsspec_params"] + if "h5py_params" in io_params: + h5py_params = io_params["h5py_params"] for file in self.files: - with h5py.File(self.s3_fs.open(file, 'rb')) as f: - data = f[f'{group}/{variable}'][:] - # Need to test if using concatenate is faster - final_h5py_array = np.insert( - final_h5py_array, - len(final_h5py_array), - data, axis=None - ) + with self.s3_fs.open(file, mode="rb", **fsspec_params) as fo: + with h5py.File(fo, **h5py_params) as f: + data = f[f"{group}/{variable}"][:] + final_h5py_array = np.insert( + final_h5py_array, len(final_h5py_array), data, axis=None + ) return np.mean(final_h5py_array) diff --git a/h5tests/h5test.py b/h5tests/h5test.py index 74bb4de..60f7ac7 100644 --- a/h5tests/h5test.py +++ b/h5tests/h5test.py @@ -1,52 +1,109 @@ -import boto3 import csv -from io import StringIO +import logging +import os +import re +import sys import time from datetime import datetime -import os +from io import StringIO + +import boto3 import s3fs -import sys -current = os.path.abspath('..') +current = os.path.abspath("..") sys.path.append(current) + from helpers.links import S3Links -def generate_timestamp(): - return datetime.now().strftime('%Y-%m-%d-%H%M%S') + +class RegexFilter(logging.Filter): + """ + This class will filter a logstream based on a regex expression + The idea is to target a particular library as they usually have a consistent signature. + """ + + def __init__(self, regex_pattern): + super(RegexFilter, self).__init__() + self.regex_pattern = re.compile(regex_pattern) + + def filter(self, record): + # Apply the regex pattern to the log message + return not bool(self.regex_pattern.search(record.msg)) + def timer_decorator(func): """ A decorator to measure the execution time of the wrapped function. 
+ It also writes logs to local disk if a regex expression is used in the + subclass instance. """ + + def __setup_logging(self, tstamp): + log_filename = f"logs/{self.data_format}-{tstamp}.log" + logger = logging.getLogger("fsspec") + logger.setLevel(logging.DEBUG) + self.regex_filter = RegexFilter(self.logs_regex) + # add regerx to root logger + logging.getLogger().addFilter(self.regex_filter) + self._file_handler = logging.FileHandler(log_filename) + self._file_handler.setLevel(logging.DEBUG) + # Add the handler to the root logger + logging.getLogger().addHandler(self._file_handler) + + def __turnoff_logging(self): + logging.getLogger().removeFilter(self.regex_filter) + logging.getLogger().removeHandler(self._file_handler) + self._file_handler.close() + def wrapper(self, *args, **kwargs): + tstamp = datetime.now().strftime("%Y-%m-%d-%H%M%S") + if self.logs_regex: + __setup_logging(self, tstamp) start_time = time.time() result = func(self, *args, **kwargs) end_time = time.time() + if self.logs_regex: + __turnoff_logging(self) execution_time = end_time - start_time # Call the store method here if self.store_results: - results_key = f"{generate_timestamp()}_{self.name}_{self.data_format}_results.csv" + results_key = f"{tstamp}_{self.name}_{self.data_format}_results.csv" s3_key = f"{self.results_directory}/{results_key}" - self.store(run_time=execution_time, result=result, bucket=self.bucket, s3_key=s3_key) + self.store( + run_time=execution_time, + result=result, + bucket=self.bucket, + s3_key=s3_key, + ) return result, execution_time + return wrapper + class H5Test: - def __init__(self, data_format: str, files=None, store_results=True): + def __init__( + self, data_format: str, files=None, store_results=True, logs_regex=None + ): self.name = self.__class__.__name__ self.data_format = data_format + self.logs_regex = logs_regex if files: self.files = files else: self.files = S3Links().get_links_by_format(data_format) - self.s3_client = boto3.client('s3') # Ensure AWS 
credentials are configured + self.s3_client = boto3.client("s3") # Ensure AWS credentials are configured self.s3_fs = s3fs.S3FileSystem(anon=False) self.store_results = store_results - self.bucket = "nasa-cryo-scratch" - self.results_directory = "h5cloud/benchmark_results" + self.bucket = "nasa-cryo-persistent" + self.results_directory = "h5cloud/benchmark_results" @timer_decorator - def run(self): + def run(self, io_params={}): + """ + When implemented we can pass io_params as runtime tweaks to the underlying + libraries e.g. fsspec. + """ + raise NotImplementedError("The run method has not been implemented") def store(self, run_time: float, result: str, bucket: str, s3_key: str): @@ -61,7 +118,7 @@ def store(self, run_time: float, result: str, bucket: str, s3_key: str): # Create a CSV in-memory csv_buffer = StringIO() csv_writer = csv.writer(csv_buffer) - csv_writer.writerow(['Name', 'Data Format', 'Run Time', 'Result']) # Headers + csv_writer.writerow(["Name", "Data Format", "Run Time", "Result"]) # Headers csv_writer.writerow([self.name, self.data_format, run_time, result]) # Reset the buffer's position to the beginning @@ -70,6 +127,7 @@ def store(self, run_time: float, result: str, bucket: str, s3_key: str): # Upload the CSV to S3 self.s3_client.put_object(Bucket=bucket, Key=s3_key, Body=csv_buffer.getvalue()) + ## Example subclass # class SampleTest(H5Test): # @timer_decorator diff --git a/h5tests/single-test.ipynb b/h5tests/single-test.ipynb index 1ea5439..ee4e966 100644 --- a/h5tests/single-test.ipynb +++ b/h5tests/single-test.ipynb @@ -2,21 +2,12 @@ "cells": [ { "cell_type": "code", - "execution_count": 73, + "execution_count": null, "id": "48daa283-8e1e-46e3-b4ce-1a0271b86d37", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. 
To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], + "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload \n", @@ -25,44 +16,41 @@ "import os\n", "current = os.path.abspath('..')\n", "sys.path.append(current)\n", - "from h5tests.xarray_arr_len import XarrayArrLen\n", + "from xarray_arr_mean import XarrayArrMean\n", "from helpers.links import S3Links" ] }, { "cell_type": "code", - "execution_count": 74, + "execution_count": null, "id": "d6ce77fd-f9cd-48b1-94cd-1fe57f52e11f", "metadata": { "tags": [] }, "outputs": [], "source": [ - "test = XarrayArrLen('kerchunk-repacked', store_results=False)" + "test = XarrayArrMean('atl03-midsize-original', store_results=False)" ] }, { "cell_type": "code", - "execution_count": 75, + "execution_count": null, "id": "60eeeb1b-9531-4fec-a847-3ca5304c4685", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "(338294671, 69.48884391784668)" - ] - }, - "execution_count": 75, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "test.run()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64c4584f-c527-44bb-8c05-68a96820d1ff", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -81,7 +69,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.9.18" } }, "nbformat": 4, diff --git a/h5tests/xarray_arr_mean.py b/h5tests/xarray_arr_mean.py index 0c1fc92..0341210 100644 --- a/h5tests/xarray_arr_mean.py +++ b/h5tests/xarray_arr_mean.py @@ -1,31 +1,48 @@ -from .h5test import H5Test, timer_decorator import fsspec -import xarray as xr import numpy as np +import xarray as xr +from h5test import H5Test, timer_decorator + class XarrayArrMean(H5Test): def open_reference_ds(self, file): fs = fsspec.filesystem( - 'reference', - fo=file, - remote_protocol='s3', - remote_options=dict(anon=False), - skip_instance_cache=True + "reference", + fo=file, + 
remote_protocol="s3", + remote_options=dict(anon=False), + skip_instance_cache=True, + ) + return xr.open_dataset( + fs.get_mapper(""), engine="zarr", consolidated=False, group="gt1l/heights" ) - return xr.open_dataset(fs.get_mapper(""), engine='zarr', consolidated=False, group='gt1l/heights') @timer_decorator - def run(self): - group = '/gt1l/heights' - variable = 'h_ph' - if 'kerchunk' in self.data_format: + def run(self, io_params={}): + group = "/gt1l/heights" + variable = "h_ph" + + if "kerchunk" in self.data_format: datasets = [self.open_reference_ds(file) for file in self.files] h_ph_values = [] for dataset in datasets: - h_ph_values = np.append(h_ph_values, dataset['h_ph'].values) + h_ph_values = np.append(h_ph_values, dataset["h_ph"].values) return np.mean(h_ph_values) else: - s3_fileset = [self.s3_fs.open(file) for file in self.files] - xrds = xr.open_mfdataset(s3_fileset, group=group, combine='by_coords', engine='h5netcdf') - h_ph_values = xrds['h_ph'] + fsspec_params = {} + h5py_params = {} + if "fsspec_params" in io_params: + fsspec_params = io_params["fsspec_params"] + if "h5py_params" in io_params: + h5py_params = io_params["h5py_params"] + + s3_fileset = [self.s3_fs.open(file, **fsspec_params) for file in self.files] + xrds = xr.open_mfdataset( + s3_fileset, + group=group, + combine="by_coords", + engine="h5netcdf", + **h5py_params + ) + h_ph_values = xrds["h_ph"] return float(np.mean(h_ph_values).values) diff --git a/helpers/links-old.json b/helpers/links-old.json new file mode 100644 index 0000000..1f0b836 --- /dev/null +++ b/helpers/links-old.json @@ -0,0 +1,53 @@ +{ + "flatgeobuf": { + "ATL03_20181120182818_08110112_006_02.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf/ATL03_20181120182818_08110112_006_02.fgb", + "ATL03_20190219140808_08110212_006_02.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf/ATL03_20190219140808_08110212_006_02.fgb", + "ATL03_20200217204710_08110612_006_01.fgb": 
"s3://nasa-cryo-scratch/h5cloud/flatgeobuf/ATL03_20200217204710_08110612_006_01.fgb", + "ATL03_20211114142614_08111312_006_01.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf/ATL03_20211114142614_08111312_006_01.fgb", + "ATL03_20230211164520_08111812_006_01.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf/ATL03_20230211164520_08111812_006_01.fgb" + }, + "flatgeobuf_no_sindex": { + "ATL03_20181120182818_08110112_006_02_no_sindex.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf_no_sindex/ATL03_20181120182818_08110112_006_02_no_sindex.fgb", + "ATL03_20190219140808_08110212_006_02_no_sindex.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf_no_sindex/ATL03_20190219140808_08110212_006_02_no_sindex.fgb", + "ATL03_20200217204710_08110612_006_01_no_sindex.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf_no_sindex/ATL03_20200217204710_08110612_006_01_no_sindex.fgb", + "ATL03_20211114142614_08111312_006_01_no_sindex.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf_no_sindex/ATL03_20211114142614_08111312_006_01_no_sindex.fgb", + "ATL03_20230211164520_08111812_006_01_no_sindex.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf_no_sindex/ATL03_20230211164520_08111812_006_01_no_sindex.fgb" + }, + "geoparquet": { + "ATL03_20181120182818_08110112_006_02.h5.gpq": "s3://nasa-cryo-scratch/h5cloud/geoparquet/ATL03_20181120182818_08110112_006_02.h5.gpq", + "ATL03_20190219140808_08110212_006_02.h5.gpq": "s3://nasa-cryo-scratch/h5cloud/geoparquet/ATL03_20190219140808_08110212_006_02.h5.gpq", + "ATL03_20200217204710_08110612_006_01.h5.gpq": "s3://nasa-cryo-scratch/h5cloud/geoparquet/ATL03_20200217204710_08110612_006_01.h5.gpq", + "ATL03_20211114142614_08111312_006_01.h5.gpq": "s3://nasa-cryo-scratch/h5cloud/geoparquet/ATL03_20211114142614_08111312_006_01.h5.gpq", + "ATL03_20230211164520_08111812_006_01.h5.gpq": "s3://nasa-cryo-scratch/h5cloud/geoparquet/ATL03_20230211164520_08111812_006_01.h5.gpq", + "['ATL03_20200217204710_08110612_006_01.h5'].gpq": 
"s3://nasa-cryo-scratch/h5cloud/geoparquet/['ATL03_20200217204710_08110612_006_01.h5'].gpq" + }, + + "h5repack": { + "ATL03_20181120182818_08110112_006_02_repacked.h5": "s3://nasa-cryo-scratch/h5cloud/h5repack/ATL03_20181120182818_08110112_006_02_repacked.h5", + "ATL03_20190219140808_08110212_006_02_repacked.h5": "s3://nasa-cryo-scratch/h5cloud/h5repack/ATL03_20190219140808_08110212_006_02_repacked.h5", + "ATL03_20200217204710_08110612_006_01_repacked.h5": "s3://nasa-cryo-scratch/h5cloud/h5repack/ATL03_20200217204710_08110612_006_01_repacked.h5", + "ATL03_20211114142614_08111312_006_01_repacked.h5": "s3://nasa-cryo-scratch/h5cloud/h5repack/ATL03_20211114142614_08111312_006_01_repacked.h5", + "ATL03_20230211164520_08111812_006_01_repacked.h5": "s3://nasa-cryo-scratch/h5cloud/h5repack/ATL03_20230211164520_08111812_006_01_repacked.h5" + }, + "kerchunk-original": { + "original_ATL03_20181120182818_08110112_006_02.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-original/original_ATL03_20181120182818_08110112_006_02.json", + "original_ATL03_20190219140808_08110212_006_02.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-original/original_ATL03_20190219140808_08110212_006_02.json", + "original_ATL03_20200217204710_08110612_006_01.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-original/original_ATL03_20200217204710_08110612_006_01.json", + "original_ATL03_20211114142614_08111312_006_01.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-original/original_ATL03_20211114142614_08111312_006_01.json", + "original_ATL03_20230211164520_08111812_006_01.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-original/original_ATL03_20230211164520_08111812_006_01.json" + }, + "kerchunk-repacked": { + "h5repack_ATL03_20181120182818_08110112_006_02_repacked.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-repacked/h5repack_ATL03_20181120182818_08110112_006_02_repacked.json", + "h5repack_ATL03_20190219140808_08110212_006_02_repacked.json": 
"s3://nasa-cryo-scratch/h5cloud/kerchunk-repacked/h5repack_ATL03_20190219140808_08110212_006_02_repacked.json", + "h5repack_ATL03_20200217204710_08110612_006_01_repacked.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-repacked/h5repack_ATL03_20200217204710_08110612_006_01_repacked.json", + "h5repack_ATL03_20211114142614_08111312_006_01_repacked.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-repacked/h5repack_ATL03_20211114142614_08111312_006_01_repacked.json", + "h5repack_ATL03_20230211164520_08111812_006_01_repacked.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-repacked/h5repack_ATL03_20230211164520_08111812_006_01_repacked.json" + }, + "original": { + "ATL03_20181120182818_08110112_006_02.h5": "s3://nasa-cryo-permanent/h5cloud/original/ATL03_20181120182818_08110112_006_02.h5", + "ATL03_20190219140808_08110212_006_02.h5": "s3://nasa-cryo-permanent/h5cloud/original/ATL03_20190219140808_08110212_006_02.h5", + "ATL03_20200217204710_08110612_006_01.h5": "s3://nasa-cryo-permanent/h5cloud/original/ATL03_20200217204710_08110612_006_01.h5", + "ATL03_20211114142614_08111312_006_01.h5": "s3://nasa-cryo-permanent/h5cloud/original/ATL03_20211114142614_08111312_006_01.h5", + "ATL03_20230211164520_08111812_006_01.h5": "s3://nasa-cryo-permanent/h5cloud/original/ATL03_20230211164520_08111812_006_01.h5" + } +} \ No newline at end of file diff --git a/helpers/links.py b/helpers/links.py index 5590042..c56e285 100644 --- a/helpers/links.py +++ b/helpers/links.py @@ -4,7 +4,7 @@ import s3fs -S3LINK = "s3://nasa-cryo-scratch/h5cloud/" +S3LINK = "s3://nasa-cryo-permanent/h5cloud/" S3FILELINKS = Path("../helpers/s3filelinks.json") diff --git a/helpers/s3filelinks.json b/helpers/s3filelinks.json index 2a4ab87..2818b4b 100644 --- a/helpers/s3filelinks.json +++ b/helpers/s3filelinks.json @@ -1,52 +1,67 @@ { "flatgeobuf": { - "ATL03_20181120182818_08110112_006_02.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf/ATL03_20181120182818_08110112_006_02.fgb", - 
"ATL03_20190219140808_08110212_006_02.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf/ATL03_20190219140808_08110212_006_02.fgb", - "ATL03_20200217204710_08110612_006_01.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf/ATL03_20200217204710_08110612_006_01.fgb", - "ATL03_20211114142614_08111312_006_01.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf/ATL03_20211114142614_08111312_006_01.fgb", - "ATL03_20230211164520_08111812_006_01.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf/ATL03_20230211164520_08111812_006_01.fgb" + }, "flatgeobuf_no_sindex": { - "ATL03_20181120182818_08110112_006_02_no_sindex.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf_no_sindex/ATL03_20181120182818_08110112_006_02_no_sindex.fgb", - "ATL03_20190219140808_08110212_006_02_no_sindex.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf_no_sindex/ATL03_20190219140808_08110212_006_02_no_sindex.fgb", - "ATL03_20200217204710_08110612_006_01_no_sindex.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf_no_sindex/ATL03_20200217204710_08110612_006_01_no_sindex.fgb", - "ATL03_20211114142614_08111312_006_01_no_sindex.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf_no_sindex/ATL03_20211114142614_08111312_006_01_no_sindex.fgb", - "ATL03_20230211164520_08111812_006_01_no_sindex.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf_no_sindex/ATL03_20230211164520_08111812_006_01_no_sindex.fgb" + }, "geoparquet": { - "ATL03_20181120182818_08110112_006_02.h5.gpq": "s3://nasa-cryo-scratch/h5cloud/geoparquet/ATL03_20181120182818_08110112_006_02.h5.gpq", - "ATL03_20190219140808_08110212_006_02.h5.gpq": "s3://nasa-cryo-scratch/h5cloud/geoparquet/ATL03_20190219140808_08110212_006_02.h5.gpq", - "ATL03_20200217204710_08110612_006_01.h5.gpq": "s3://nasa-cryo-scratch/h5cloud/geoparquet/ATL03_20200217204710_08110612_006_01.h5.gpq", - "ATL03_20211114142614_08111312_006_01.h5.gpq": "s3://nasa-cryo-scratch/h5cloud/geoparquet/ATL03_20211114142614_08111312_006_01.h5.gpq", - "ATL03_20230211164520_08111812_006_01.h5.gpq": 
"s3://nasa-cryo-scratch/h5cloud/geoparquet/ATL03_20230211164520_08111812_006_01.h5.gpq", - "['ATL03_20200217204710_08110612_006_01.h5'].gpq": "s3://nasa-cryo-scratch/h5cloud/geoparquet/['ATL03_20200217204710_08110612_006_01.h5'].gpq" - }, - "h5repack": { - "ATL03_20181120182818_08110112_006_02_repacked.h5": "s3://nasa-cryo-scratch/h5cloud/h5repack/ATL03_20181120182818_08110112_006_02_repacked.h5", - "ATL03_20190219140808_08110212_006_02_repacked.h5": "s3://nasa-cryo-scratch/h5cloud/h5repack/ATL03_20190219140808_08110212_006_02_repacked.h5", - "ATL03_20200217204710_08110612_006_01_repacked.h5": "s3://nasa-cryo-scratch/h5cloud/h5repack/ATL03_20200217204710_08110612_006_01_repacked.h5", - "ATL03_20211114142614_08111312_006_01_repacked.h5": "s3://nasa-cryo-scratch/h5cloud/h5repack/ATL03_20211114142614_08111312_006_01_repacked.h5", - "ATL03_20230211164520_08111812_006_01_repacked.h5": "s3://nasa-cryo-scratch/h5cloud/h5repack/ATL03_20230211164520_08111812_006_01_repacked.h5" - }, - "kerchunk-original": { - "original_ATL03_20181120182818_08110112_006_02.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-original/original_ATL03_20181120182818_08110112_006_02.json", - "original_ATL03_20190219140808_08110212_006_02.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-original/original_ATL03_20190219140808_08110212_006_02.json", - "original_ATL03_20200217204710_08110612_006_01.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-original/original_ATL03_20200217204710_08110612_006_01.json", - "original_ATL03_20211114142614_08111312_006_01.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-original/original_ATL03_20211114142614_08111312_006_01.json", - "original_ATL03_20230211164520_08111812_006_01.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-original/original_ATL03_20230211164520_08111812_006_01.json" - }, - "kerchunk-repacked": { - "h5repack_ATL03_20181120182818_08110112_006_02_repacked.json": 
"s3://nasa-cryo-scratch/h5cloud/kerchunk-repacked/h5repack_ATL03_20181120182818_08110112_006_02_repacked.json", - "h5repack_ATL03_20190219140808_08110212_006_02_repacked.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-repacked/h5repack_ATL03_20190219140808_08110212_006_02_repacked.json", - "h5repack_ATL03_20200217204710_08110612_006_01_repacked.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-repacked/h5repack_ATL03_20200217204710_08110612_006_01_repacked.json", - "h5repack_ATL03_20211114142614_08111312_006_01_repacked.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-repacked/h5repack_ATL03_20211114142614_08111312_006_01_repacked.json", - "h5repack_ATL03_20230211164520_08111812_006_01_repacked.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-repacked/h5repack_ATL03_20230211164520_08111812_006_01_repacked.json" - }, - "original": { - "ATL03_20181120182818_08110112_006_02.h5": "s3://nasa-cryo-scratch/h5cloud/original/ATL03_20181120182818_08110112_006_02.h5", - "ATL03_20190219140808_08110212_006_02.h5": "s3://nasa-cryo-scratch/h5cloud/original/ATL03_20190219140808_08110212_006_02.h5", - "ATL03_20200217204710_08110612_006_01.h5": "s3://nasa-cryo-scratch/h5cloud/original/ATL03_20200217204710_08110612_006_01.h5", - "ATL03_20211114142614_08111312_006_01.h5": "s3://nasa-cryo-scratch/h5cloud/original/ATL03_20211114142614_08111312_006_01.h5", - "ATL03_20230211164520_08111812_006_01.h5": "s3://nasa-cryo-scratch/h5cloud/original/ATL03_20230211164520_08111812_006_01.h5" + + }, + "atl03-bigsize-original": { + "ATL03_20181120182818_08110112_006_02.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5", + "ATL03_20190219140808_08110212_006_02.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20190219140808_08110212_006_02.h5", + "ATL03_20200217204710_08110612_006_01.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20200217204710_08110612_006_01.h5", + "ATL03_20211114142614_08111312_006_01.h5": 
"s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20211114142614_08111312_006_01.h5", + "ATL03_20230211164520_08111812_006_01.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20230211164520_08111812_006_01.h5" + }, + "atl03-bigsize-h5repack": { + "ATL03_20181120182818_08110112_006_02_repacked.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5", + "ATL03_20190219140808_08110212_006_02_repacked.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20190219140808_08110212_006_02_repacked.h5", + "ATL03_20200217204710_08110612_006_01_repacked.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20200217204710_08110612_006_01_repacked.h5", + "ATL03_20211114142614_08111312_006_01_repacked.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20211114142614_08111312_006_01_repacked.h5", + "ATL03_20230211164520_08111812_006_01_repacked.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20230211164520_08111812_006_01_repacked.h5" + }, + "atl03-kerchunk-bigsize-original": { + "atl03_ATL03_20190219140808_08110212_006_02.json": "s3://nasa-cryo-persistent/h5cloud/atl03/big/original/kerchunk/atl03_ATL03_20190219140808_08110212_006_02.json", + "atl03_ATL03_20230211164520_08111812_006_01.json": "s3://nasa-cryo-persistent/h5cloud/atl03/big/original/kerchunk/atl03_ATL03_20230211164520_08111812_006_01.json", + "atl03_ATL03_20200217204710_08110612_006_01.json": "s3://nasa-cryo-persistent/h5cloud/atl03/big/original/kerchunk/atl03_ATL03_20200217204710_08110612_006_01.json", + "atl03_ATL03_20181120182818_08110112_006_02.json": "s3://nasa-cryo-persistent/h5cloud/atl03/big/original/kerchunk/atl03_ATL03_20181120182818_08110112_006_02.json", + "atl03_ATL03_20211114142614_08111312_006_01.json": "s3://nasa-cryo-persistent/h5cloud/atl03/big/original/kerchunk/atl03_ATL03_20211114142614_08111312_006_01.json" + }, + "atl03-kerchunk-bigsize-repacked": { + 
"atl03_ATL03_20181120182818_08110112_006_02_repacked.json": "s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/kerchunk/atl03_ATL03_20181120182818_08110112_006_02_repacked.json", + "atl03_ATL03_20190219140808_08110212_006_02_repacked.json": "s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/kerchunk/atl03_ATL03_20190219140808_08110212_006_02_repacked.json", + "atl03_ATL03_20211114142614_08111312_006_01_repacked.json": "s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/kerchunk/atl03_ATL03_20211114142614_08111312_006_01_repacked.json", + "atl03_ATL03_20200217204710_08110612_006_01_repacked.json": "s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/kerchunk/atl03_ATL03_20200217204710_08110612_006_01_repacked.json", + "atl03_ATL03_20230211164520_08111812_006_01_repacked.json": "s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/kerchunk/atl03_ATL03_20230211164520_08111812_006_01_repacked.json" + }, + "atl03-midsize-original": { + "ATL03_20191225111315_13680501_006_01.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20191225111315_13680501_006_01.h5", + "ATL03_20200922221235_13680801_006_02.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20200922221235_13680801_006_02.h5", + "ATL03_20220620155150_13681501_006_01.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20220620155150_13681501_006_01.h5", + "ATL03_20220919113142_13681601_006_01.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20220919113142_13681601_006_01.h5", + "ATL03_20230618223036_13681901_006_01.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5" + }, + "atl03-midsize-h5repack":{ + "ATL03_20191225111315_13680501_006_01.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20191225111315_13680501_006_01.h5", + "ATL03_20200922221235_13680801_006_02.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20200922221235_13680801_006_02.h5", 
+ "ATL03_20220620155150_13681501_006_01.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20220620155150_13681501_006_01.h5", + "ATL03_20220919113142_13681601_006_01.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20220919113142_13681601_006_01.h5", + "ATL03_20230618223036_13681901_006_01.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5" + }, + "atl03-kerchunk-midsize-original": { + "atl03_ATL03_20220919113142_13681601_006_01.json": "s3://nasa-cryo-persistent/h5cloud/atl03/average/original/kerchunk/atl03_ATL03_20220919113142_13681601_006_01.json", + "atl03_ATL03_20191225111315_13680501_006_01.json": "s3://nasa-cryo-persistent/h5cloud/atl03/average/original/kerchunk/atl03_ATL03_20191225111315_13680501_006_01.json", + "atl03_ATL03_20220620155150_13681501_006_01.json": "s3://nasa-cryo-persistent/h5cloud/atl03/average/original/kerchunk/atl03_ATL03_20220620155150_13681501_006_01.json", + "atl03_ATL03_20200922221235_13680801_006_02.json": "s3://nasa-cryo-persistent/h5cloud/atl03/average/original/kerchunk/atl03_ATL03_20200922221235_13680801_006_02.json", + "atl03_ATL03_20230618223036_13681901_006_01.json": "s3://nasa-cryo-persistent/h5cloud/atl03/average/original/kerchunk/atl03_ATL03_20230618223036_13681901_006_01.json" + }, + "atl03-kerchunk-midsize-repacked": { + "atl03_ATL03_20220620155150_13681501_006_01.json": "s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/kerchunk/atl03_ATL03_20220620155150_13681501_006_01.json", + "atl03_ATL03_20191225111315_13680501_006_01.json": "s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/kerchunk/atl03_ATL03_20191225111315_13680501_006_01.json", + "atl03_ATL03_20220919113142_13681601_006_01.json": "s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/kerchunk/atl03_ATL03_20220919113142_13681601_006_01.json", + "atl03_ATL03_20200922221235_13680801_006_02.json": 
"s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/kerchunk/atl03_ATL03_20200922221235_13680801_006_02.json", + "atl03_ATL03_20230618223036_13681901_006_01.json": "s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/kerchunk/atl03_ATL03_20230618223036_13681901_006_01.json" } } \ No newline at end of file diff --git a/notebooks/logs-fsspec.ipynb b/notebooks/logs-fsspec.ipynb new file mode 100644 index 0000000..fa25aa4 --- /dev/null +++ b/notebooks/logs-fsspec.ipynb @@ -0,0 +1,568 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6c9b37e2-2daa-4283-a228-ea581498de0c", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "## AB testing access time for ICESat-2 HDF5 files on the cloud.\n", + "\n", + "This notebook requires that we have 2 versions of the same file:\n", + " * Original A: The original file with no modifications on a S3 location.\n", + " * Test Case B: A modified version of the orignal file to test for metadata consolidation, rechunking and other strategies to speed up access to the data in the file.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "3b78fb94-10ae-48cb-8e30-521b2c8b7822", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "xarray v2023.12.0\n", + "h5py v3.10.0\n", + "fsspec v2023.6.0\n" + ] + } + ], + "source": [ + "import xarray as xr\n", + "import h5py\n", + "import fsspec\n", + "import s3fs\n", + "import boto3\n", + "import logging\n", + "import re\n", + "import time\n", + "from datetime import datetime\n", + "import pandas as pd\n", + "import numpy as np\n", + "import os\n", + "\n", + "class RegexFilter(logging.Filter):\n", + " def __init__(self, regex_pattern):\n", + " super(RegexFilter, self).__init__()\n", + " self.regex_pattern = re.compile(regex_pattern)\n", + "\n", + " def filter(self, record):\n", + " # Apply the regex pattern to the log message\n", + " return not 
bool(self.regex_pattern.search(record.msg))\n", + "\n", + " \n", + "def timer_decorator(func):\n", + " \"\"\"\n", + " A decorator to measure the execution time of the wrapped function.\n", + " \"\"\"\n", + " def __setup_logging(self, tstamp):\n", + " log_filename = f\"logs/{self.data_format}-{tstamp}.log\"\n", + " logger = logging.getLogger(\"fsspec\")\n", + " logger.setLevel(logging.DEBUG)\n", + " self.regex_filter = RegexFilter(self.logs_regex)\n", + " # add regerx to root logger\n", + " logging.getLogger().addFilter(self.regex_filter )\n", + " self._file_handler = logging.FileHandler(log_filename)\n", + " self._file_handler.setLevel(logging.DEBUG)\n", + " # Add the handler to the root logger\n", + " logging.getLogger().addHandler(self._file_handler)\n", + " \n", + " def __turnoff_logging(self):\n", + " logging.getLogger().removeFilter(self.regex_filter)\n", + " logging.getLogger().removeHandler(self._file_handler)\n", + " self._file_handler.close()\n", + " \n", + " def wrapper(self, *args, **kwargs):\n", + " tstamp = datetime.now().strftime('%Y-%m-%d-%H%M%S')\n", + " if self.logs_regex:\n", + " __setup_logging(self, tstamp)\n", + " start_time = time.time()\n", + " result = func(self, *args, **kwargs)\n", + " end_time = time.time()\n", + " if self.logs_regex:\n", + " __turnoff_logging(self)\n", + " execution_time = end_time - start_time\n", + " # Call the store method here\n", + " if self.store_results:\n", + " results_key = f\"{tstamp}_{self.name}_{self.data_format}_results.csv\"\n", + " s3_key = f\"{self.results_directory}/{results_key}\"\n", + " self.store(run_time=execution_time, result=result, bucket=self.bucket, s3_key=s3_key)\n", + " return result, execution_time\n", + " return wrapper \n", + "\n", + "\n", + " \n", + "class H5Test:\n", + " def __init__(self,\n", + " data_format: str,\n", + " files=None,\n", + " store_results=True,\n", + " logs_regex=None):\n", + " self.name = self.__class__.__name__\n", + " self.data_format = data_format\n", + " 
self.logs_regex = logs_regex\n", + " if files:\n", + " self.files = files\n", + " else:\n", + " self.files = S3Links().get_links_by_format(data_format)\n", + " self.s3_client = boto3.client('s3') # Ensure AWS credentials are configured\n", + " self.s3_fs = s3fs.S3FileSystem(anon=False)\n", + " self.store_results = store_results\n", + " self.bucket = \"nasa-cryo-persistent\"\n", + " self.results_directory = \"h5cloud/benchmark_results\"\n", + " \n", + " \n", + "\n", + " @timer_decorator\n", + " def run(self, io_params):\n", + " raise NotImplementedError(\"The run method has not been implemented\")\n", + "\n", + " def store(self, run_time: float, result: str, bucket: str, s3_key: str):\n", + " \"\"\"\n", + " Store test results to an S3 bucket as a CSV file.\n", + "\n", + " :param run_time: The runtime of the test\n", + " :param result: The result of the test\n", + " :param bucket: The name of the S3 bucket where the CSV will be uploaded\n", + " :param s3_key: The S3 key (filename) where the CSV will be stored\n", + " \"\"\"\n", + " # Create a CSV in-memory\n", + " csv_buffer = StringIO()\n", + " csv_writer = csv.writer(csv_buffer)\n", + " csv_writer.writerow(['Name', 'Data Format', 'Run Time', 'Result']) # Headers\n", + " csv_writer.writerow([self.name, self.data_format, run_time, result])\n", + "\n", + " # Reset the buffer's position to the beginning\n", + " csv_buffer.seek(0)\n", + "\n", + " # Upload the CSV to S3\n", + " self.s3_client.put_object(Bucket=bucket, Key=s3_key, Body=csv_buffer.getvalue())\n", + "\n", + "for library in (xr, h5py, fsspec):\n", + " print(f'{library.__name__} v{library.__version__}')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d6ed86c7-a919-4532-b7d3-1ca3cf4e25d1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "class H5pyArrMean(H5Test):\n", + " \n", + " @timer_decorator\n", + " def run(self, io_params):\n", + " final_h5py_array = [] \n", + " # TODO: Do we need to make this configurable or 
consistent?\n", + " group = '/gt1l/heights'\n", + " variable = 'h_ph'\n", + " fsspec_params = io_params[\"fsspec_params\"]\n", + " h5py_params = io_params[\"h5py_params\"]\n", + " for file in self.files:\n", + " with self.s3_fs.open(file, mode=\"rb\", **fsspec_params) as fo:\n", + " with h5py.File(fo, **h5py_params) as f:\n", + " data = f[f\"{group}/{variable}\"][:]\n", + " final_h5py_array = np.insert(\n", + " final_h5py_array,\n", + " len(final_h5py_array),\n", + " data, axis=None\n", + " )\n", + " return np.mean(final_h5py_array)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7db7c600-d362-45c2-bb9f-3670de9ddf4d", + "metadata": {}, + "outputs": [], + "source": [ + "class H5pyROS3ArrMean(H5Test):\n", + " \"\"\"\n", + " This will only work for public buckets for now\n", + " \"\"\"\n", + " \n", + " @timer_decorator\n", + " def run(self, io_params):\n", + " final_h5py_array = [] \n", + " # TODO: Do we need to make this configurable or consistent?\n", + " group = '/gt1l/heights'\n", + " variable = 'h_ph'\n", + " h5py_params = io_params[\"h5py_params\"]\n", + " for file in self.files:\n", + " with h5py.File(file, driver=\"ros3\", **h5py_params) as f:\n", + " data = f[f\"{group}/{variable}\"][:]\n", + " final_h5py_array = np.insert(\n", + " final_h5py_array,\n", + " len(final_h5py_array),\n", + " data, axis=None\n", + " )\n", + " return np.mean(final_h5py_array)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "621f6014-fda5-40a4-be23-dcaed47b6fbd", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "class XarrayArrMean(H5Test):\n", + " def open_reference_ds(self, file):\n", + " fs = fsspec.filesystem(\n", + " 'reference', \n", + " fo=file, \n", + " remote_protocol='s3', \n", + " remote_options=dict(anon=False), \n", + " skip_instance_cache=True\n", + " )\n", + " return xr.open_dataset(fs.get_mapper(\"\"), engine='zarr', consolidated=False, group='gt1l/heights')\n", + "\n", + " @timer_decorator\n", + " def 
run(self, io_params):\n", + " group = '/gt1l/heights'\n", + " variable = 'h_ph'\n", + "\n", + " if 'kerchunk' in self.data_format: \n", + " datasets = [self.open_reference_ds(file) for file in self.files]\n", + " h_ph_values = []\n", + " for dataset in datasets:\n", + " h_ph_values = np.append(h_ph_values, dataset['h_ph'].values)\n", + " return np.mean(h_ph_values)\n", + " else:\n", + " if \"repacked\" in self.data_format:\n", + " fsspec_params = {\n", + " # \"skip_instance_cache\": True\n", + " \"cache_type\": \"first\",\n", + " \"block_size\": 16*1024*1024\n", + " }\n", + " h5py_params = {\n", + " \"driver_kwds\" :{\n", + " \"page_buf_size\": 32*1024*1024,\n", + " \"rdcc_nbytes\": 8*1024*1024\n", + " }\n", + " } \n", + " s3_fileset = [self.s3_fs.open(file, **fsspec_params) for file in self.files]\n", + " xrds = xr.open_mfdataset(s3_fileset, group=group, combine='by_coords', engine='h5netcdf', **h5py_params)\n", + " h_ph_values = xrds['h_ph']\n", + " return float(np.mean(h_ph_values).values)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "a6d4b6ce-10f1-4031-987d-fcfd43422ae6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "repacked_granules = [\n", + " \"s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5\",\n", + " \"s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20190219140808_08110212_006_02_repacked.h5\",\n", + "]\n", + "test_cloud = H5pyArrMean('atl03-bigsize-repacked',\n", + " files=repacked_granules,\n", + " store_results=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "4dfe5ab6-2e88-46d3-bb19-d6ebcec2d341", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "\n", + "original_granules = [\n", + " \"s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5\",\n", + " \"s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20190219140808_08110212_006_02.h5\",\n", + "]\n", 
+ "\n", + "logs_regex = r\"\\s*(read: \\d+ - \\d+)\"\n", + "\n", + "test_original = H5pyArrMean('atl03-bigsize-original',\n", + " files=original_granules,\n", + " store_results=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "40ef2c85-f4d2-4a03-9839-bc192dda0c02", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1032.9840463639412, 12.149354219436646)\n", + "(1032.9840463639412, 12.194729566574097)\n", + "(1032.9840463639412, 12.10885739326477)\n", + "(1032.9840463639412, 11.940461874008179)\n", + "(1032.9840463639412, 12.063915252685547)\n" + ] + } + ], + "source": [ + "# logger = logging.getLogger()\n", + "# logger.setLevel(logging.DEBUG)\n", + "io_params ={\n", + " \"fsspec_params\": {\n", + " \"skip_instance_cache\": True\n", + " # \"cache_type\": \"blockcache\",\n", + " # \"block_size\": 4*1024*1024\n", + " },\n", + " \"h5py_params\": {\n", + " # \"rdcc_nbytes\": 2*1024*1024 \n", + " }\n", + "}\n", + "for runs in range(5):\n", + " print(test_original.run(io_params))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb2f92bf-4b1f-4bfe-a037-8061bfa9b127", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1032.9840463639412, 34.1772985458374)\n", + "(1032.9840463639412, 30.96499228477478)\n", + "(1032.9840463639412, 31.00865602493286)\n", + "(1032.9840463639412, 31.207276821136475)\n" + ] + } + ], + "source": [ + "io_params ={\n", + " \"fsspec_params\": {\n", + " # \"skip_instance_cache\": True\n", + " # \"cache_type\": \"blockcache\",\n", + " # \"block_size\": 4*1024*1024\n", + " },\n", + " \"h5py_params\": {\n", + " # \"page_buf_size\": 32*1024*1024,\n", + " # \"rdcc_nbytes\": 2*1024*1024\n", + " }\n", + "}\n", + "for runs in range(5):\n", + " print(test_cloud.run(io_params))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": 
"262a6b25-8b23-46bb-8301-8965aaf155d2", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Registered drivers: frozenset({'mpio', 'family', 'ros3', 'split', 'core', 'sec2', 'fileobj', 'direct', 'stdio'})\n" + ] + } + ], + "source": [ + "print(f'Registered drivers: {h5py.registered_drivers()}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02258a85-4951-48c2-a295-75a47c0e38c1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "df = pd.DataFrame.from_dict(benchmarks)\n", + "\n", + "fig, ax = plt.subplots(figsize=(10, 6))\n", + "\n", + "for name, group in df.groupby(['tool', 'dataset', 'format']):\n", + " tool, dataset, formated = name\n", + " x = f'{tool}, {dataset}, {formated}'\n", + " y = group['time'].mean()\n", + " ax.bar(f'{tool}, {dataset}, {formated}', group['time'].mean(), label=f'{tool}, {dataset}, {formated}', align='center')\n", + " ax.text(x, y + 0.05, f'{group[\"time\"].mean():.2f}', ha='center', va='bottom', color='black', fontsize=8)\n", + "\n", + "# Set labels and title\n", + "ax.set_xlabel('Combination')\n", + "ax.set_ylabel('Time in Seconds')\n", + "ax.set_title('mean() on photon data for a single IS2 track, less is better')\n", + "\n", + "# Rotate x-axis labels for better readability\n", + "plt.xticks(rotation=45, ha='right')\n", + "\n", + "# # Show legend\n", + "# ax.legend()\n", + "\n", + "# Show the plot\n", + "with plt.xkcd():\n", + " # This figure will be in XKCD-style\n", + " fig1 = plt.figure()\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d5d779b-5bc6-4208-ae26-05c5a473d9b9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64bcc5de-aae3-46aa-9474-1c90b9ff20a9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as 
plt\n", + "\n", + "df = pd.DataFrame.from_dict(benchmarks)\n", + "\n", + "fig, ax = plt.subplots(figsize=(10, 6))\n", + "\n", + "for name, group in df.groupby(['tool', 'dataset', 'format']):\n", + " tool, dataset, formated = name\n", + " x = f'{tool}, {dataset}, {formated}'\n", + " y = group['time'].mean()\n", + " ax.bar(f'{tool}, {dataset}, {formated}', group['time'].mean(), label=f'{tool}, {dataset}, {formated}', align='center')\n", + " ax.text(x, y + 0.05, f'{group[\"time\"].mean():.2f}', ha='center', va='bottom', color='black', fontsize=8)\n", + "\n", + "# Set labels and title\n", + "ax.set_xlabel('Combination')\n", + "ax.set_ylabel('Time in Seconds')\n", + "ax.set_title('mean() on photon data for a single IS2 track, less is better')\n", + "\n", + "# Rotate x-axis labels for better readability\n", + "plt.xticks(rotation=45, ha='right')\n", + "\n", + "# # Show legend\n", + "# ax.legend()\n", + "\n", + "# Show the plot\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7b2871e1-d700-4d22-b01f-5e5a9acd1006", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89aabece-d942-418c-b387-77ea2de1e561", + "metadata": {}, + "outputs": [], + "source": [ + "# def normalize_log(log_file):\n", + "# with open(log_file, 'r') as input_file:\n", + "# # Open the output file in write mode\n", + "# with open(f'{log_file.replace(\".log\", \"-ros-compatible.log\")}', 'w') as output_file:\n", + "# # Iterate through each line in the input file\n", + "# for line in input_file:\n", + "# # Strip leading and trailing whitespaces from the line\n", + "# stripped_line = line.strip()\n", + "\n", + "# # Write the stripped line to the output file\n", + "# output_file.write(stripped_line + '\\n') " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8411f4a8-22b4-424e-b817-0d31e7ac9e93", + "metadata": {}, + "outputs": [], + "source": [ + " # 
\"ATL08\": {\n", + " # \"links\": {\n", + " # \"original\": \"s3://nasa-cryo-persistent/h5cloud/atl08/original/ATL08_20200404075919_01340707_006_03.h5\",\n", + " # \"optimized\": \"s3://nasa-cryo-persistent/h5cloud/atl08/repacked/ATL08_20200404075919_01340707_006_03_repacked.h5\",\n", + " # },\n", + " # \"group\": \"/gt1l/signal_photons\",\n", + " # \"variable\": \"ph_h\",\n", + " # \"processing\": [\n", + " # \"h5repack -S PAGE -G 4000000\"\n", + " # ]\n", + " # },\n", + " # \"ATL03\": {\n", + " # \"links\": {\n", + " # \"original\": \"s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5\",\n", + " # \"optimized\": \"s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5\"\n", + " # },\n", + " # \"group\": \"/gt1l/heights\",\n", + " # \"variable\": \"h_ph\",\n", + " # \"processing\": [\n", + " # \"h5repack -S PAGE -G 4000000\"\n", + " # ]\n", + " # }," + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 5c0482717ecc5cd07f3332254100e34230cb8d97 Mon Sep 17 00:00:00 2001 From: betolink Date: Tue, 30 Jan 2024 14:41:51 -0600 Subject: [PATCH 02/11] testing with out of region access --- environment.yml | 21 + h5tests/h5test.py | 14 +- h5tests/single-test.ipynb | 54 +- h5tests/xarray_arr_mean.py | 11 +- helpers/links.py | 13 +- helpers/s3itslive.json | 50 ++ notebooks/cloud-optimized-hdf5.ipynb | 1062 ++++++++++++++++++++++++++ notebooks/fsspec-logs.ipynb | 2 +- notebooks/logs-fsspec.ipynb | 2 +- 9 files changed, 1201 insertions(+), 28 deletions(-) create mode 100644 helpers/s3itslive.json create mode 
100644 notebooks/cloud-optimized-hdf5.ipynb diff --git a/environment.yml b/environment.yml index e69de29..05d9104 100644 --- a/environment.yml +++ b/environment.yml @@ -0,0 +1,21 @@ +name: h5cloud +channels: + - conda-forge +dependencies: + - jupyterlab + - matplotlib-base + - pandas + - numpy + - s3fs + - xarray + - fsspec + - dask + - distributed + - geopandas + - h5py>3.9 + - zarr + - kerchunk + - h5netcdf + - pip + - pip: + - h5coro diff --git a/h5tests/h5test.py b/h5tests/h5test.py index 60f7ac7..3be885b 100644 --- a/h5tests/h5test.py +++ b/h5tests/h5test.py @@ -82,7 +82,7 @@ def wrapper(self, *args, **kwargs): class H5Test: def __init__( - self, data_format: str, files=None, store_results=True, logs_regex=None + self, data_format: str, files=None, store_results=True, logs_regex=None, anon_access=False, source="cryocloud" ): self.name = self.__class__.__name__ self.data_format = data_format @@ -90,9 +90,13 @@ def __init__( if files: self.files = files else: - self.files = S3Links().get_links_by_format(data_format) - self.s3_client = boto3.client("s3") # Ensure AWS credentials are configured - self.s3_fs = s3fs.S3FileSystem(anon=False) + if source == "cryocloud": + links = "../helpers/s3filelinks.json" + else: + links = "../helpers/itslivelinks.json" + self.files = S3Links(links).get_links_by_format(data_format) + self.s3_fs = s3fs.S3FileSystem(anon=anon_access) + self.store_results = store_results self.bucket = "nasa-cryo-persistent" self.results_directory = "h5cloud/benchmark_results" @@ -115,6 +119,8 @@ def store(self, run_time: float, result: str, bucket: str, s3_key: str): :param bucket: The name of the S3 bucket where the CSV will be uploaded :param s3_key: The S3 key (filename) where the CSV will be stored """ + self.s3_client = boto3.client("s3") # Ensure AWS credentials are configured + # Create a CSV in-memory csv_buffer = StringIO() csv_writer = csv.writer(csv_buffer) diff --git a/h5tests/single-test.ipynb b/h5tests/single-test.ipynb index 
ee4e966..be39a69 100644 --- a/h5tests/single-test.ipynb +++ b/h5tests/single-test.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "48daa283-8e1e-46e3-b4ce-1a0271b86d37", "metadata": { "tags": [] @@ -16,20 +16,23 @@ "import os\n", "current = os.path.abspath('..')\n", "sys.path.append(current)\n", - "from xarray_arr_mean import XarrayArrMean\n", - "from helpers.links import S3Links" + "from xarray_arr_mean import XarrayArrMean" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "d6ce77fd-f9cd-48b1-94cd-1fe57f52e11f", "metadata": { "tags": [] }, "outputs": [], "source": [ - "test = XarrayArrMean('atl03-midsize-original', store_results=False)" + "files = [\n", + " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5\",\n", + " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20190219140808_08110212_006_02.h5\",\n", + "]\n", + "test_original = XarrayArrMean('atl03-bigsize-original', files=files, store_results=False, anon_access=True)" ] }, { @@ -41,7 +44,12 @@ }, "outputs": [], "source": [ - "test.run()" + "# don't even try this out of region.... 
more than 30 minutes \n", + "io_params ={\n", + " \"fsspec_params\": {},\n", + " \"h5py_params\" : {}\n", + "}\n", + "test_original.run(io_params)" ] }, { @@ -50,7 +58,37 @@ "id": "64c4584f-c527-44bb-8c05-68a96820d1ff", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "files = [\n", + " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5\",\n", + " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20190219140808_08110212_006_02_repacked.h5\",\n", + "]\n", + "test_cloud = XarrayArrMean('atl03-bigsize-repacked', files=files, store_results=False, anon_access=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dfd4e404-0412-4d2f-8eba-ca39a670e369", + "metadata": {}, + "outputs": [], + "source": [ + "io_params ={\n", + " \"fsspec_params\": {\n", + " # \"skip_instance_cache\": True\n", + " \"cache_type\": \"blockcache\",\n", + " \"block_size\": 8*1024*1024\n", + " },\n", + " \"h5py_params\" : {\n", + " \"driver_kwds\": {\n", + " \"page_buf_size\": 32*1024*1024,\n", + " \"rdcc_nbytes\": 8*1024*1024\n", + " }\n", + "\n", + " }\n", + "}\n", + "test_cloud.run(io_params)" + ] } ], "metadata": { @@ -69,7 +107,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.12.1" } }, "nbformat": 4, diff --git a/h5tests/xarray_arr_mean.py b/h5tests/xarray_arr_mean.py index 0341210..8bee141 100644 --- a/h5tests/xarray_arr_mean.py +++ b/h5tests/xarray_arr_mean.py @@ -35,14 +35,9 @@ def run(self, io_params={}): fsspec_params = io_params["fsspec_params"] if "h5py_params" in io_params: h5py_params = io_params["h5py_params"] + print(h5py_params) s3_fileset = [self.s3_fs.open(file, **fsspec_params) for file in self.files] - xrds = xr.open_mfdataset( - s3_fileset, - group=group, - combine="by_coords", - engine="h5netcdf", - **h5py_params - ) - h_ph_values = xrds["h_ph"] + xrds = 
xr.open_mfdataset(s3_fileset, group=group, combine='by_coords', engine='h5netcdf', **h5py_params) + h_ph_values = xrds['h_ph'] return float(np.mean(h_ph_values).values) diff --git a/helpers/links.py b/helpers/links.py index c56e285..e212ef0 100644 --- a/helpers/links.py +++ b/helpers/links.py @@ -5,7 +5,6 @@ import s3fs S3LINK = "s3://nasa-cryo-permanent/h5cloud/" -S3FILELINKS = Path("../helpers/s3filelinks.json") class S3Links: @@ -41,9 +40,11 @@ class S3Links: 'h5cloud/original/ATL03_20181120182818_08110112_006_02.h5' """ - def __init__(self): - self.json_file = S3FILELINKS - self.table = load_s3testfile(S3FILELINKS) + def __init__(self, file="../helpers/s3filelinks.json"): + self.S3FILELINKS = Path(file) + + self.json_file = self.S3FILELINKS + self.table = load_s3testfile(self.S3FILELINKS) self.formats = list(self.table.keys()) def get_links_by_format(self, file_format): @@ -86,9 +87,9 @@ def update_links(self, write_to_file=True): print("Differences between self.table and S3 buckets: updating self.table") self.table = filelinks self.formats = list(self.table.keys()) - response = input(f"Update {S3FILELINKS} (y or n)?") + response = input(f"Update {self.S3FILELINKS} (y or n)?") if response.lower() == "y": - print(f"Updating {S3FILELINKS}") + print(f"Updating {self.S3FILELINKS}") write_s3links(filelinks) diff --git a/helpers/s3itslive.json b/helpers/s3itslive.json new file mode 100644 index 0000000..4f0a16d --- /dev/null +++ b/helpers/s3itslive.json @@ -0,0 +1,50 @@ +{ + "flatgeobuf": { + + }, + "flatgeobuf_no_sindex": { + + }, + "geoparquet": { + + }, + "atl03-bigsize-original": { + "ATL03_20181120182818_08110112_006_02.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5", + "ATL03_20190219140808_08110212_006_02.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20190219140808_08110212_006_02.h5", + "ATL03_20200217204710_08110612_006_01.h5": 
"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20200217204710_08110612_006_01.h5", + "ATL03_20211114142614_08111312_006_01.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20211114142614_08111312_006_01.h5", + "ATL03_20230211164520_08111812_006_01.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20230211164520_08111812_006_01.h5" + }, + "atl03-bigsize-h5repack": { + "ATL03_20181120182818_08110112_006_02_repacked.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5", + "ATL03_20190219140808_08110212_006_02_repacked.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20190219140808_08110212_006_02_repacked.h5", + "ATL03_20200217204710_08110612_006_01_repacked.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20200217204710_08110612_006_01_repacked.h5", + "ATL03_20211114142614_08111312_006_01_repacked.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20211114142614_08111312_006_01_repacked.h5", + "ATL03_20230211164520_08111812_006_01_repacked.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20230211164520_08111812_006_01_repacked.h5" + }, + "atl03-kerchunk-bigsize-original": { + + }, + "atl03-kerchunk-bigsize-repacked": { + }, + "atl03-midsize-original": { + "ATL03_20191225111315_13680501_006_01.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/average/original/ATL03_20191225111315_13680501_006_01.h5", + "ATL03_20200922221235_13680801_006_02.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/average/original/ATL03_20200922221235_13680801_006_02.h5", + "ATL03_20220620155150_13681501_006_01.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/average/original/ATL03_20220620155150_13681501_006_01.h5", + "ATL03_20220919113142_13681601_006_01.h5": 
"s3://its-live-data/cloud-experiments/h5cloud/atl03/average/original/ATL03_20220919113142_13681601_006_01.h5", + "ATL03_20230618223036_13681901_006_01.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5" + }, + "atl03-midsize-h5repack":{ + "ATL03_20191225111315_13680501_006_01.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/average/repacked/ATL03_20191225111315_13680501_006_01.h5", + "ATL03_20200922221235_13680801_006_02.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/average/repacked/ATL03_20200922221235_13680801_006_02.h5", + "ATL03_20220620155150_13681501_006_01.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/average/repacked/ATL03_20220620155150_13681501_006_01.h5", + "ATL03_20220919113142_13681601_006_01.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/average/repacked/ATL03_20220919113142_13681601_006_01.h5", + "ATL03_20230618223036_13681901_006_01.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5" + }, + "atl03-kerchunk-midsize-original": { + + }, + "atl03-kerchunk-midsize-repacked": { + + } +} \ No newline at end of file diff --git a/notebooks/cloud-optimized-hdf5.ipynb b/notebooks/cloud-optimized-hdf5.ipynb new file mode 100644 index 0000000..98eaa91 --- /dev/null +++ b/notebooks/cloud-optimized-hdf5.ipynb @@ -0,0 +1,1062 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "11f9a9cb-c049-461e-8578-7090a644508e", + "metadata": {}, + "source": [ + "# Cloud Optimized HDF: or How I Learned to Stop Worrying and Love the Format\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "6332a484-8fd6-4448-827f-aa48e6322f8f", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "id": "2d37475f-42b0-4105-b34c-529f627d9066", + "metadata": {}, + "source": [ + "## The big ol list of \"ifs\"\n", + "\n", + "* We use the most recent versions of h5py, xarray and 
fsspec\n", + "* We create the HDF5 files with [cloud optimized flags](https://www.youtube.com/watch?v=rcS5vt-mKok)\n", + " * if the files are out there we can repack them, consolidating the metadata and perhaps increasing the chunk sizes\n", + "* We know how to \"tweak the knobs\" (or have a fair understanding of what the I/O libraries are doing)." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "736bb5fb-c5cd-42bf-be4e-6b81ae6eb865", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "xarray v2024.1.1\n", + "h5py v3.10.0\n", + "s3fs v2023.12.2\n" + ] + } + ], + "source": [ + "import xarray as xr\n", + "import h5py\n", + "import s3fs\n", + "\n", + "fs = s3fs.S3FileSystem(anon=True)\n", + "\n", + "for library in (xr, h5py, s3fs):\n", + " print(f'{library.__name__} v{library.__version__}')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "78d6697b-9f84-4edf-b426-fde27560bc68", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'ETag': '\"237bbd5828745b9e1a1e0ba88486e43c-835\"',\n", + " 'LastModified': datetime.datetime(2024, 1, 29, 4, 48, 24, tzinfo=tzutc()),\n", + " 'size': 6997123664,\n", + " 'name': 'its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20190219140808_08110212_006_02.h5',\n", + " 'type': 'file',\n", + " 'StorageClass': 'INTELLIGENT_TIERING',\n", + " 'VersionId': None,\n", + " 'ContentType': 'application/x-hdf5'}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# a \"big\" ATL03 file from the ICESat-2 mission\n", + "original_granule = \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20190219140808_08110212_006_02.h5\"\n", + "# the same \"big\" ATL03 file from the ICESat-2 mission, metadata consolidated in 8MB-size pages.\n", + "cloud_optimized = 
\"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20190219140808_08110212_006_02_repacked.h5\"\n", + "\n", + "fs.info(original_granule)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "e94bb01e-a325-4ab3-8f6a-ac5799d14f02", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'ETag': '\"08af0688f787f10eee1ccfb13f7eb66d-836\"',\n", + " 'LastModified': datetime.datetime(2024, 1, 29, 4, 52, 44, tzinfo=tzutc()),\n", + " 'size': 7008000000,\n", + " 'name': 'its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20190219140808_08110212_006_02_repacked.h5',\n", + " 'type': 'file',\n", + " 'StorageClass': 'INTELLIGENT_TIERING',\n", + " 'VersionId': None,\n", + " 'ContentType': 'application/x-hdf5'}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fs.info(cloud_optimized)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec2bce8f-bcf4-4982-8556-d3a71209af74", + "metadata": {}, + "outputs": [], + "source": [ + "# don't even try this out of region (us-west-2) will take forever, forever >= 30 minutes\n", + "ds = xr.open_dataset(fs.open(original_granule),\n", + " group=\"/gt1l/heights\",\n", + " engine=\"h5netcdf\")\n", + "ds" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9b5701b-6a8b-41ac-a56a-34a4f42125e1", + "metadata": {}, + "outputs": [], + "source": [ + "# again... 
don't even try this out of region (us-west-2) will take forever, forever >= 30 minutes\n", + "ds = xr.open_dataset(fs.open(cloud_optimized),\n", + " group=\"/gt1l/heights\",\n", + " engine=\"h5netcdf\")\n", + "ds" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "0def8b43-7616-4e01-a502-3f44811ae47e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 4.16 s, sys: 3.04 s, total: 7.2 s\n", + "Wall time: 20.6 s\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset>\n",
+       "Dimensions:         (delta_time: 73765028, ds_surf_type: 5)\n",
+       "Coordinates:\n",
+       "  * delta_time      (delta_time) datetime64[ns] 2019-02-19T14:08:08.557345384...\n",
+       "    lat_ph          (delta_time) float64 ...\n",
+       "    lon_ph          (delta_time) float64 ...\n",
+       "Dimensions without coordinates: ds_surf_type\n",
+       "Data variables:\n",
+       "    dist_ph_across  (delta_time) float32 ...\n",
+       "    dist_ph_along   (delta_time) float32 ...\n",
+       "    h_ph            (delta_time) float32 ...\n",
+       "    pce_mframe_cnt  (delta_time) uint32 ...\n",
+       "    ph_id_channel   (delta_time) uint8 ...\n",
+       "    ph_id_count     (delta_time) uint8 ...\n",
+       "    ph_id_pulse     (delta_time) uint8 ...\n",
+       "    quality_ph      (delta_time) int8 ...\n",
+       "    signal_conf_ph  (delta_time, ds_surf_type) int8 ...\n",
+       "    weight_ph       (delta_time) uint8 ...\n",
+       "Attributes:\n",
+       "    Description:  Contains arrays of the parameters for each received photon.\n",
+       "    data_rate:    Data are stored at the photon detection rate.
" + ], + "text/plain": [ + "\n", + "Dimensions: (delta_time: 73765028, ds_surf_type: 5)\n", + "Coordinates:\n", + " * delta_time (delta_time) datetime64[ns] 2019-02-19T14:08:08.557345384...\n", + " lat_ph (delta_time) float64 ...\n", + " lon_ph (delta_time) float64 ...\n", + "Dimensions without coordinates: ds_surf_type\n", + "Data variables:\n", + " dist_ph_across (delta_time) float32 ...\n", + " dist_ph_along (delta_time) float32 ...\n", + " h_ph (delta_time) float32 ...\n", + " pce_mframe_cnt (delta_time) uint32 ...\n", + " ph_id_channel (delta_time) uint8 ...\n", + " ph_id_count (delta_time) uint8 ...\n", + " ph_id_pulse (delta_time) uint8 ...\n", + " quality_ph (delta_time) int8 ...\n", + " signal_conf_ph (delta_time, ds_surf_type) int8 ...\n", + " weight_ph (delta_time) uint8 ...\n", + "Attributes:\n", + " Description: Contains arrays of the parameters for each received photon.\n", + " data_rate: Data are stored at the photon detection rate." + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "\n", + "# this one is different! 
you can try this at home (cloud optimized HDF5!)\n", + "\n", + "io_params ={\n", + " \"fsspec_params\": {\n", + " # \"skip_instance_cache\": True\n", + " \"cache_type\": \"blockcache\", # or \"first\" with enough space\n", + " \"block_size\": 8*1024*1024 # could be bigger\n", + " },\n", + " \"h5py_params\" : {\n", + " \"driver_kwds\": { # only recent versions of xarray and h5netcdf allow this correctly\n", + " \"page_buf_size\": 32*1024*1024, # this one only works in repacked files\n", + " \"rdcc_nbytes\": 8*1024*1024 # this one is to read the chunks \n", + " }\n", + "\n", + " }\n", + "}\n", + "ds = xr.open_dataset(fs.open(cloud_optimized, **io_params[\"fsspec_params\"]),\n", + " group=\"/gt1l/heights\",\n", + " engine=\"h5netcdf\",\n", + " **io_params[\"h5py_params\"])\n", + "ds" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "da959721-2f9d-4151-b361-6f9f38fa5b8c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 11 s, sys: 2.02 s, total: 13 s\n", + "Wall time: 1min 25s\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.DataArray 'h_ph' ()>\n",
+       "array(1031.6101, dtype=float32)
" + ], + "text/plain": [ + "\n", + "array(1031.6101, dtype=float32)" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "\n", + "# takes about ~2 minutes\n", + "ds.h_ph.mean()" + ] + }, + { + "cell_type": "markdown", + "id": "35caf411-afe7-44f7-9264-5e7b892456d0", + "metadata": {}, + "source": [ + "
" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/fsspec-logs.ipynb b/notebooks/fsspec-logs.ipynb index 96bfe63..87d7f05 100644 --- a/notebooks/fsspec-logs.ipynb +++ b/notebooks/fsspec-logs.ipynb @@ -317,7 +317,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.12.1" } }, "nbformat": 4, diff --git a/notebooks/logs-fsspec.ipynb b/notebooks/logs-fsspec.ipynb index fa25aa4..768cf69 100644 --- a/notebooks/logs-fsspec.ipynb +++ b/notebooks/logs-fsspec.ipynb @@ -560,7 +560,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.12.1" } }, "nbformat": 4, From 80b035356c2814b91326a2bca327b88375c5cb02 Mon Sep 17 00:00:00 2001 From: betolink Date: Mon, 5 Feb 2024 19:31:02 -0600 Subject: [PATCH 03/11] updating notebooks, portable can quickly test and visualize results by access pattern --- .gitignore | 4 + environment.yml | 2 + h5tests/h5py_arr_mean.py | 10 +- h5tests/h5test.py | 156 +++++--- h5tests/single-test.ipynb | 239 ++++++++++- h5tests/xarray_arr_mean.py | 33 +- notebooks/logs-fsspec.ipynb | 568 --------------------------- notebooks/portable-h5py-test.ipynb | 329 ++++++++++++++++ notebooks/portable-xarray-test.ipynb | 305 ++++++++++++++ 9 files changed, 995 insertions(+), 651 deletions(-) delete mode 100644 notebooks/logs-fsspec.ipynb create mode 100644 notebooks/portable-h5py-test.ipynb create mode 100644 notebooks/portable-xarray-test.ipynb diff --git a/.gitignore b/.gitignore index ee11d40..97e5249 100644 --- 
a/.gitignore +++ b/.gitignore @@ -109,3 +109,7 @@ venv.bak/ *.hdf5 *.nc *.tif + +*.log +notebooks/logs +notebooks/results diff --git a/environment.yml b/environment.yml index 05d9104..0c9d76a 100644 --- a/environment.yml +++ b/environment.yml @@ -3,6 +3,8 @@ channels: - conda-forge dependencies: - jupyterlab + - boto3 + - tqdm - matplotlib-base - pandas - numpy diff --git a/h5tests/h5py_arr_mean.py b/h5tests/h5py_arr_mean.py index 1d5a01c..d49431f 100644 --- a/h5tests/h5py_arr_mean.py +++ b/h5tests/h5py_arr_mean.py @@ -1,16 +1,13 @@ import h5py import numpy as np -from .h5test import H5Test, timer_decorator +from h5test import H5Test, timer_decorator class H5pyArrMean(H5Test): @timer_decorator - def run(self, io_params={}): + def run(self, io_params={}, dataset="/gt1l/heights", variable="h_ph"): final_h5py_array = [] - # TODO: Do we need to make this configurable or consistent? - group = "/gt1l/heights" - variable = "h_ph" fsspec_params = {} h5py_params = {} if "fsspec_params" in io_params: @@ -19,8 +16,9 @@ def run(self, io_params={}): h5py_params = io_params["h5py_params"] for file in self.files: with self.s3_fs.open(file, mode="rb", **fsspec_params) as fo: + print("h5py params: ", h5py_params) with h5py.File(fo, **h5py_params) as f: - data = f[f"{group}/{variable}"][:] + data = f[f"{dataset}/{variable}"][:] final_h5py_array = np.insert( final_h5py_array, len(final_h5py_array), data, axis=None ) diff --git a/h5tests/h5test.py b/h5tests/h5test.py index 3be885b..e7310cc 100644 --- a/h5tests/h5test.py +++ b/h5tests/h5test.py @@ -13,15 +13,27 @@ current = os.path.abspath("..") sys.path.append(current) -from helpers.links import S3Links +import csv +import logging +import os +import pathlib +import re +import time +from datetime import datetime +from io import StringIO + +import boto3 +import fsspec +import h5py +import numpy as np +import pandas as pd +import s3fs +import xarray as xr +from tqdm import tqdm -class RegexFilter(logging.Filter): - """ - This class will 
filter a logstream based on a regex expression - The idea is to target a particular library as they usually have a consistent signature. - """ +class RegexFilter(logging.Filter): def __init__(self, regex_pattern): super(RegexFilter, self).__init__() self.regex_pattern = re.compile(regex_pattern) @@ -34,25 +46,45 @@ def filter(self, record): def timer_decorator(func): """ A decorator to measure the execution time of the wrapped function. - It also writes logs to local disk if a regex expression is used in the - subclass instance. """ + def fsspec_stats(log_file): + with open(log_file, "r") as input_file: + num_requests = 0 + total_requested_bytes = 0 + for line in input_file: + # Strip leading and trailing whitespaces from the line + + try: + read_range = line.split("read:")[1].split(" - ") + request_size = int(read_range[1]) - int(read_range[0]) + total_requested_bytes += request_size + num_requests += 1 + except Exception: + pass + stats = { + "total_reqs": num_requests, + "total_reqs_bytes": total_requested_bytes, + "avg_req_size": int(round(total_requested_bytes / num_requests, 2)), + } + return stats + def __setup_logging(self, tstamp): - log_filename = f"logs/{self.data_format}-{tstamp}.log" + pathlib.Path(f"./logs").mkdir(exist_ok=True) + self.log_filename = f"logs/{self.data_format}-{tstamp}.log" logger = logging.getLogger("fsspec") logger.setLevel(logging.DEBUG) self.regex_filter = RegexFilter(self.logs_regex) # add regerx to root logger - logging.getLogger().addFilter(self.regex_filter) - self._file_handler = logging.FileHandler(log_filename) + logging.getLogger("fsspec").addFilter(self.regex_filter) + self._file_handler = logging.FileHandler(self.log_filename) self._file_handler.setLevel(logging.DEBUG) # Add the handler to the root logger - logging.getLogger().addHandler(self._file_handler) + logging.getLogger("fsspec").addHandler(self._file_handler) def __turnoff_logging(self): - logging.getLogger().removeFilter(self.regex_filter) - 
logging.getLogger().removeHandler(self._file_handler) + logging.getLogger("fsspec").removeFilter(self.regex_filter) + logging.getLogger("fsspec").removeHandler(self._file_handler) self._file_handler.close() def wrapper(self, *args, **kwargs): @@ -66,72 +98,100 @@ def wrapper(self, *args, **kwargs): __turnoff_logging(self) execution_time = end_time - start_time # Call the store method here + self.io_stats = fsspec_stats(self.log_filename) if self.store_results: results_key = f"{tstamp}_{self.name}_{self.data_format}_results.csv" - s3_key = f"{self.results_directory}/{results_key}" - self.store( - run_time=execution_time, - result=result, - bucket=self.bucket, - s3_key=s3_key, - ) - return result, execution_time + self.store(run_time=execution_time, result=result, file_name=results_key) + return result, execution_time, self.log_filename, self.io_stats return wrapper class H5Test: def __init__( - self, data_format: str, files=None, store_results=True, logs_regex=None, anon_access=False, source="cryocloud" + self, + data_format: str, + files=[], + store_results=True, + logs_regex=r"\s*(read: \d+ - \d+)", ): self.name = self.__class__.__name__ + self.io_stats = {} + self.log_filename = "" self.data_format = data_format self.logs_regex = logs_regex - if files: + if len(files) > 0: self.files = files else: - if source == "cryocloud": - links = "../helpers/s3filelinks.json" - else: - links = "../helpers/itslivelinks.json" - self.files = S3Links(links).get_links_by_format(data_format) - self.s3_fs = s3fs.S3FileSystem(anon=anon_access) - + raise ValueError("We need at least 1 ATL03 granule URL hosted in S3") + self.store_results = store_results - self.bucket = "nasa-cryo-persistent" - self.results_directory = "h5cloud/benchmark_results" - @timer_decorator - def run(self, io_params={}): - """ - When implemented we can pass io_params as runtime tweaks to the underlying - libraries e.g. fsspec. 
- """ + if files[0].startswith("s3://nasa-cryo-persistent"): + self.s3_client = boto3.client("s3") # + self.annon_access = False + self.results_bucket = "s3://nasa-cryo-persistent/" + self.results_directory = "h5cloud/benchmark_results" + self.results_store_type = "S3" + else: + self.annon_access = True + self.results_path = "results" + pathlib.Path(f"./{self.results_path}").mkdir(exist_ok=True) + self.results_store_type = "Local" + self.s3_fs = s3fs.S3FileSystem(anon=self.annon_access) + + @timer_decorator + def run(self, io_params, dataset, variable): raise NotImplementedError("The run method has not been implemented") - def store(self, run_time: float, result: str, bucket: str, s3_key: str): + def store(self, run_time: float, result: str, file_name: str): """ Store test results to an S3 bucket as a CSV file. - :param run_time: The runtime of the test :param result: The result of the test - :param bucket: The name of the S3 bucket where the CSV will be uploaded - :param s3_key: The S3 key (filename) where the CSV will be stored + :param file_name: file to store the results """ - self.s3_client = boto3.client("s3") # Ensure AWS credentials are configured - # Create a CSV in-memory csv_buffer = StringIO() csv_writer = csv.writer(csv_buffer) - csv_writer.writerow(["Name", "Data Format", "Run Time", "Result"]) # Headers - csv_writer.writerow([self.name, self.data_format, run_time, result]) + csv_writer.writerow( + [ + "Name", + "Data Format", + "Run Time", + "Result", + "Access Log", + "Total Bytes Tranferred", + "Total Requests", + ] + ) # Headers + csv_writer.writerow( + [ + self.name, + self.data_format, + run_time, + result, + self.log_filename, + self.io_stats["total_reqs_bytes"], + self.io_stats["total_reqs"], + ] + ) # Reset the buffer's position to the beginning csv_buffer.seek(0) # Upload the CSV to S3 - self.s3_client.put_object(Bucket=bucket, Key=s3_key, Body=csv_buffer.getvalue()) + if self.results_store_type == "S3": + # assumes s3 can write to bucket + 
self.s3_client.put_object( + Bucket=self.results_bucket, + Key=f"{self.results_directory}/{file_name}", + Body=csv_buffer.getvalue(), + ) + else: + with open(f"{self.results_path}/{file_name}", "w", newline="") as csv_file: + csv_file.write(csv_buffer.getvalue()) ## Example subclass diff --git a/h5tests/single-test.ipynb b/h5tests/single-test.ipynb index be39a69..165aabc 100644 --- a/h5tests/single-test.ipynb +++ b/h5tests/single-test.ipynb @@ -2,12 +2,21 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 10, "id": "48daa283-8e1e-46e3-b4ce-1a0271b86d37", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], "source": [ "%load_ext autoreload\n", "%autoreload \n", @@ -16,12 +25,15 @@ "import os\n", "current = os.path.abspath('..')\n", "sys.path.append(current)\n", - "from xarray_arr_mean import XarrayArrMean" + "from xarray_arr_mean import XarrayArrMean\n", + "import pandas as pd\n", + "\n", + "benchmarks = []" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 11, "id": "d6ce77fd-f9cd-48b1-94cd-1fe57f52e11f", "metadata": { "tags": [] @@ -32,29 +44,62 @@ " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5\",\n", " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20190219140808_08110212_006_02.h5\",\n", "]\n", - "test_original = XarrayArrMean('atl03-bigsize-original', files=files, store_results=False, anon_access=True)" + "xarray_original = XarrayArrMean('atl03-bigsize-original', files=files, store_results=True)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "id": "60eeeb1b-9531-4fec-a847-3ca5304c4685", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[{'library': 'xarray',\n", + 
" 'format': 'cloud',\n", + " 'mean': 1032.984130859375,\n", + " 'time': 176.90762186050415,\n", + " 'total_requested_bytes': 720001152,\n", + " 'total_requests': 100,\n", + " 'avg_req_size': 7200011},\n", + " {'library': 'xarray',\n", + " 'format': 'original',\n", + " 'mean': 1032.984130859375,\n", + " 'time': 1456.8166418075562,\n", + " 'total_requested_bytes': 438520591,\n", + " 'total_requests': 26988,\n", + " 'avg_req_size': 16248}]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# don't even try this out of region.... more than 30 minutes \n", + "# don't even try this out of region...\n", + "# takes about ~10 minutes per granule out of region (6+ GB granules)\n", "io_params ={\n", " \"fsspec_params\": {},\n", " \"h5py_params\" : {}\n", "}\n", - "test_original.run(io_params)" + "results = xarray_original.run(io_params)\n", + "benchmarks.append({\"library\": \"xarray\",\n", + " \"format\": \"original\",\n", + " \"mean\": results[0],\n", + " \"time\": results[1],\n", + " \"total_requested_bytes\": results[3][\"total_reqs_bytes\"],\n", + " \"total_requests\": results[3][\"total_reqs\"],\n", + " \"avg_req_size\": results[3][\"avg_req_size\"]})\n", + "benchmarks" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "64c4584f-c527-44bb-8c05-68a96820d1ff", "metadata": {}, "outputs": [], @@ -63,16 +108,34 @@ " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5\",\n", " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20190219140808_08110212_006_02_repacked.h5\",\n", "]\n", - "test_cloud = XarrayArrMean('atl03-bigsize-repacked', files=files, store_results=False, anon_access=True)" + "xarray_cloud = XarrayArrMean('atl03-bigsize-repacked', files=files, store_results=True)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": 
"dfd4e404-0412-4d2f-8eba-ca39a670e369", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[{'library': 'xarray',\n", + " 'format': 'cloud',\n", + " 'mean': 1032.984130859375,\n", + " 'time': 176.90762186050415,\n", + " 'total_requested_bytes': 720001152,\n", + " 'total_requests': 100,\n", + " 'avg_req_size': 7200011}]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "# takes about ~90 seconds per granule out of region\n", "io_params ={\n", " \"fsspec_params\": {\n", " # \"skip_instance_cache\": True\n", @@ -87,7 +150,155 @@ "\n", " }\n", "}\n", - "test_cloud.run(io_params)" + "\n", + "results = xarray_cloud.run(io_params)\n", + "\n", + "benchmarks.append({\"library\": \"xarray\",\n", + " \"format\": \"cloud\",\n", + " \"mean\": results[0],\n", + " \"time\": results[1],\n", + " \"total_requested_bytes\": results[3][\"total_reqs_bytes\"],\n", + " \"total_requests\": results[3][\"total_reqs\"],\n", + " \"avg_req_size\": results[3][\"avg_req_size\"]})\n", + "benchmarks" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "9d30d92b-4192-4da1-8b60-41cc94ca2db1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
libraryformatmeantimetotal_requested_bytestotal_requestsavg_req_size
0xarraycloud1032.984131176.9076227200011521007200011
1xarrayoriginal1032.9841311456.8166424385205912698816248
\n", + "
" + ], + "text/plain": [ + " library format mean time total_requested_bytes \\\n", + "0 xarray cloud 1032.984131 176.907622 720001152 \n", + "1 xarray original 1032.984131 1456.816642 438520591 \n", + "\n", + " total_requests avg_req_size \n", + "0 100 7200011 \n", + "1 26988 16248 " + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame.from_dict(benchmarks)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "3ff4c22f-7f77-4c69-a84c-f13b0fbba1f2", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "fig, ax = plt.subplots(figsize=(10, 6))\n", + "\n", + "for name, group in df.groupby(['library', 'format']):\n", + " library, format = name\n", + " x = f'{library}, {format}'\n", + " y = group['time'].mean()\n", + " ax.bar(f'{library}, {format}', group['time'].mean(), label=f'{library}, {format}', align='center')\n", + " ax.text(x, y + 0.05, f'{group[\"time\"].mean():.2f}', ha='center', va='bottom', color='black', fontsize=12)\n", + " ax.text(x, y - (y/2) - 10, f'Total Requests: {group[\"total_requests\"].mean()}', ha='center', va='bottom', color='black', fontsize=8)\n", + " ax.text(x, y - (y/2.5), f'Total Req Bytes (MB): {round(group[\"total_requested_bytes\"].mean() / (1024*1024) , 2)}', ha='center', va='bottom', color='black', fontsize=8)\n", + "\n", + "# Set labels and title\n", + "ax.set_xlabel('Access Pattern')\n", + "ax.set_ylabel('Time in Seconds')\n", + "ax.set_title(f'mean() on photon data for runs on ATL03, less is better ')\n", + "\n", + "# Rotate x-axis labels for better readability\n", + "plt.xticks(rotation=45, ha='right')\n", + "\n", + "# # Show legend\n", + "# ax.legend()\n", + "\n", + "# Show the plot\n", + "with plt.xkcd():\n", + " # This figure will be in XKCD-style\n", + " fig1 = plt.figure()" ] } ], diff --git a/h5tests/xarray_arr_mean.py b/h5tests/xarray_arr_mean.py index 8bee141..59d636d 100644 --- a/h5tests/xarray_arr_mean.py +++ b/h5tests/xarray_arr_mean.py @@ -1,43 +1,46 @@ import fsspec import numpy as np import xarray as xr + from h5test import H5Test, timer_decorator class XarrayArrMean(H5Test): - def open_reference_ds(self, file): + def open_reference_ds(self, file: str, dataset: str): fs = fsspec.filesystem( "reference", fo=file, remote_protocol="s3", - remote_options=dict(anon=False), + remote_options=dict(anon=self.anon_access), skip_instance_cache=True, ) return xr.open_dataset( - fs.get_mapper(""), 
engine="zarr", consolidated=False, group="gt1l/heights" + fs.get_mapper(""), engine="zarr", consolidated=False, group=dataset ) @timer_decorator - def run(self, io_params={}): - group = "/gt1l/heights" - variable = "h_ph" - + def run(self, io_params={}, dataset="/gt1l/heights", variable="h_ph"): if "kerchunk" in self.data_format: - datasets = [self.open_reference_ds(file) for file in self.files] + datasets_ref = [ + self.open_reference_ds(file, dataset) for file in self.files + ] h_ph_values = [] - for dataset in datasets: - h_ph_values = np.append(h_ph_values, dataset["h_ph"].values) + for ds in datasets_ref: + h_ph_values = np.append(h_ph_values, ds[variable].values) return np.mean(h_ph_values) else: - fsspec_params = {} - h5py_params = {} if "fsspec_params" in io_params: fsspec_params = io_params["fsspec_params"] if "h5py_params" in io_params: h5py_params = io_params["h5py_params"] - print(h5py_params) s3_fileset = [self.s3_fs.open(file, **fsspec_params) for file in self.files] - xrds = xr.open_mfdataset(s3_fileset, group=group, combine='by_coords', engine='h5netcdf', **h5py_params) - h_ph_values = xrds['h_ph'] + xrds = xr.open_mfdataset( + s3_fileset, + group=dataset, + combine="by_coords", + engine="h5netcdf", + **h5py_params + ) + h_ph_values = xrds[variable] return float(np.mean(h_ph_values).values) diff --git a/notebooks/logs-fsspec.ipynb b/notebooks/logs-fsspec.ipynb deleted file mode 100644 index 768cf69..0000000 --- a/notebooks/logs-fsspec.ipynb +++ /dev/null @@ -1,568 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "6c9b37e2-2daa-4283-a228-ea581498de0c", - "metadata": { - "tags": [], - "user_expressions": [] - }, - "source": [ - "## AB testing access time for ICESat-2 HDF5 files on the cloud.\n", - "\n", - "This notebook requires that we have 2 versions of the same file:\n", - " * Original A: The original file with no modifications on a S3 location.\n", - " * Test Case B: A modified version of the orignal file to test for metadata 
consolidation, rechunking and other strategies to speed up access to the data in the file.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "3b78fb94-10ae-48cb-8e30-521b2c8b7822", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "xarray v2023.12.0\n", - "h5py v3.10.0\n", - "fsspec v2023.6.0\n" - ] - } - ], - "source": [ - "import xarray as xr\n", - "import h5py\n", - "import fsspec\n", - "import s3fs\n", - "import boto3\n", - "import logging\n", - "import re\n", - "import time\n", - "from datetime import datetime\n", - "import pandas as pd\n", - "import numpy as np\n", - "import os\n", - "\n", - "class RegexFilter(logging.Filter):\n", - " def __init__(self, regex_pattern):\n", - " super(RegexFilter, self).__init__()\n", - " self.regex_pattern = re.compile(regex_pattern)\n", - "\n", - " def filter(self, record):\n", - " # Apply the regex pattern to the log message\n", - " return not bool(self.regex_pattern.search(record.msg))\n", - "\n", - " \n", - "def timer_decorator(func):\n", - " \"\"\"\n", - " A decorator to measure the execution time of the wrapped function.\n", - " \"\"\"\n", - " def __setup_logging(self, tstamp):\n", - " log_filename = f\"logs/{self.data_format}-{tstamp}.log\"\n", - " logger = logging.getLogger(\"fsspec\")\n", - " logger.setLevel(logging.DEBUG)\n", - " self.regex_filter = RegexFilter(self.logs_regex)\n", - " # add regerx to root logger\n", - " logging.getLogger().addFilter(self.regex_filter )\n", - " self._file_handler = logging.FileHandler(log_filename)\n", - " self._file_handler.setLevel(logging.DEBUG)\n", - " # Add the handler to the root logger\n", - " logging.getLogger().addHandler(self._file_handler)\n", - " \n", - " def __turnoff_logging(self):\n", - " logging.getLogger().removeFilter(self.regex_filter)\n", - " logging.getLogger().removeHandler(self._file_handler)\n", - " self._file_handler.close()\n", - " \n", - " def wrapper(self, *args, 
**kwargs):\n", - " tstamp = datetime.now().strftime('%Y-%m-%d-%H%M%S')\n", - " if self.logs_regex:\n", - " __setup_logging(self, tstamp)\n", - " start_time = time.time()\n", - " result = func(self, *args, **kwargs)\n", - " end_time = time.time()\n", - " if self.logs_regex:\n", - " __turnoff_logging(self)\n", - " execution_time = end_time - start_time\n", - " # Call the store method here\n", - " if self.store_results:\n", - " results_key = f\"{tstamp}_{self.name}_{self.data_format}_results.csv\"\n", - " s3_key = f\"{self.results_directory}/{results_key}\"\n", - " self.store(run_time=execution_time, result=result, bucket=self.bucket, s3_key=s3_key)\n", - " return result, execution_time\n", - " return wrapper \n", - "\n", - "\n", - " \n", - "class H5Test:\n", - " def __init__(self,\n", - " data_format: str,\n", - " files=None,\n", - " store_results=True,\n", - " logs_regex=None):\n", - " self.name = self.__class__.__name__\n", - " self.data_format = data_format\n", - " self.logs_regex = logs_regex\n", - " if files:\n", - " self.files = files\n", - " else:\n", - " self.files = S3Links().get_links_by_format(data_format)\n", - " self.s3_client = boto3.client('s3') # Ensure AWS credentials are configured\n", - " self.s3_fs = s3fs.S3FileSystem(anon=False)\n", - " self.store_results = store_results\n", - " self.bucket = \"nasa-cryo-persistent\"\n", - " self.results_directory = \"h5cloud/benchmark_results\"\n", - " \n", - " \n", - "\n", - " @timer_decorator\n", - " def run(self, io_params):\n", - " raise NotImplementedError(\"The run method has not been implemented\")\n", - "\n", - " def store(self, run_time: float, result: str, bucket: str, s3_key: str):\n", - " \"\"\"\n", - " Store test results to an S3 bucket as a CSV file.\n", - "\n", - " :param run_time: The runtime of the test\n", - " :param result: The result of the test\n", - " :param bucket: The name of the S3 bucket where the CSV will be uploaded\n", - " :param s3_key: The S3 key (filename) where the CSV will be 
stored\n", - " \"\"\"\n", - " # Create a CSV in-memory\n", - " csv_buffer = StringIO()\n", - " csv_writer = csv.writer(csv_buffer)\n", - " csv_writer.writerow(['Name', 'Data Format', 'Run Time', 'Result']) # Headers\n", - " csv_writer.writerow([self.name, self.data_format, run_time, result])\n", - "\n", - " # Reset the buffer's position to the beginning\n", - " csv_buffer.seek(0)\n", - "\n", - " # Upload the CSV to S3\n", - " self.s3_client.put_object(Bucket=bucket, Key=s3_key, Body=csv_buffer.getvalue())\n", - "\n", - "for library in (xr, h5py, fsspec):\n", - " print(f'{library.__name__} v{library.__version__}')" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "d6ed86c7-a919-4532-b7d3-1ca3cf4e25d1", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "class H5pyArrMean(H5Test):\n", - " \n", - " @timer_decorator\n", - " def run(self, io_params):\n", - " final_h5py_array = [] \n", - " # TODO: Do we need to make this configurable or consistent?\n", - " group = '/gt1l/heights'\n", - " variable = 'h_ph'\n", - " fsspec_params = io_params[\"fsspec_params\"]\n", - " h5py_params = io_params[\"h5py_params\"]\n", - " for file in self.files:\n", - " with self.s3_fs.open(file, mode=\"rb\", **fsspec_params) as fo:\n", - " with h5py.File(fo, **h5py_params) as f:\n", - " data = f[f\"{group}/{variable}\"][:]\n", - " final_h5py_array = np.insert(\n", - " final_h5py_array,\n", - " len(final_h5py_array),\n", - " data, axis=None\n", - " )\n", - " return np.mean(final_h5py_array)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7db7c600-d362-45c2-bb9f-3670de9ddf4d", - "metadata": {}, - "outputs": [], - "source": [ - "class H5pyROS3ArrMean(H5Test):\n", - " \"\"\"\n", - " This will only work for public buckets for now\n", - " \"\"\"\n", - " \n", - " @timer_decorator\n", - " def run(self, io_params):\n", - " final_h5py_array = [] \n", - " # TODO: Do we need to make this configurable or consistent?\n", - " group = 
'/gt1l/heights'\n", - " variable = 'h_ph'\n", - " h5py_params = io_params[\"h5py_params\"]\n", - " for file in self.files:\n", - " with h5py.File(file, driver=\"ros3\", **h5py_params) as f:\n", - " data = f[f\"{group}/{variable}\"][:]\n", - " final_h5py_array = np.insert(\n", - " final_h5py_array,\n", - " len(final_h5py_array),\n", - " data, axis=None\n", - " )\n", - " return np.mean(final_h5py_array)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "621f6014-fda5-40a4-be23-dcaed47b6fbd", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "class XarrayArrMean(H5Test):\n", - " def open_reference_ds(self, file):\n", - " fs = fsspec.filesystem(\n", - " 'reference', \n", - " fo=file, \n", - " remote_protocol='s3', \n", - " remote_options=dict(anon=False), \n", - " skip_instance_cache=True\n", - " )\n", - " return xr.open_dataset(fs.get_mapper(\"\"), engine='zarr', consolidated=False, group='gt1l/heights')\n", - "\n", - " @timer_decorator\n", - " def run(self, io_params):\n", - " group = '/gt1l/heights'\n", - " variable = 'h_ph'\n", - "\n", - " if 'kerchunk' in self.data_format: \n", - " datasets = [self.open_reference_ds(file) for file in self.files]\n", - " h_ph_values = []\n", - " for dataset in datasets:\n", - " h_ph_values = np.append(h_ph_values, dataset['h_ph'].values)\n", - " return np.mean(h_ph_values)\n", - " else:\n", - " if \"repacked\" in self.data_format:\n", - " fsspec_params = {\n", - " # \"skip_instance_cache\": True\n", - " \"cache_type\": \"first\",\n", - " \"block_size\": 16*1024*1024\n", - " }\n", - " h5py_params = {\n", - " \"driver_kwds\" :{\n", - " \"page_buf_size\": 32*1024*1024,\n", - " \"rdcc_nbytes\": 8*1024*1024\n", - " }\n", - " } \n", - " s3_fileset = [self.s3_fs.open(file, **fsspec_params) for file in self.files]\n", - " xrds = xr.open_mfdataset(s3_fileset, group=group, combine='by_coords', engine='h5netcdf', **h5py_params)\n", - " h_ph_values = xrds['h_ph']\n", - " return 
float(np.mean(h_ph_values).values)" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "id": "a6d4b6ce-10f1-4031-987d-fcfd43422ae6", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "repacked_granules = [\n", - " \"s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5\",\n", - " \"s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20190219140808_08110212_006_02_repacked.h5\",\n", - "]\n", - "test_cloud = H5pyArrMean('atl03-bigsize-repacked',\n", - " files=repacked_granules,\n", - " store_results=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "4dfe5ab6-2e88-46d3-bb19-d6ebcec2d341", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "\n", - "original_granules = [\n", - " \"s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5\",\n", - " \"s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20190219140808_08110212_006_02.h5\",\n", - "]\n", - "\n", - "logs_regex = r\"\\s*(read: \\d+ - \\d+)\"\n", - "\n", - "test_original = H5pyArrMean('atl03-bigsize-original',\n", - " files=original_granules,\n", - " store_results=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "40ef2c85-f4d2-4a03-9839-bc192dda0c02", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(1032.9840463639412, 12.149354219436646)\n", - "(1032.9840463639412, 12.194729566574097)\n", - "(1032.9840463639412, 12.10885739326477)\n", - "(1032.9840463639412, 11.940461874008179)\n", - "(1032.9840463639412, 12.063915252685547)\n" - ] - } - ], - "source": [ - "# logger = logging.getLogger()\n", - "# logger.setLevel(logging.DEBUG)\n", - "io_params ={\n", - " \"fsspec_params\": {\n", - " \"skip_instance_cache\": True\n", - " # \"cache_type\": \"blockcache\",\n", - " # \"block_size\": 4*1024*1024\n", - " },\n", - " \"h5py_params\": {\n", - " # 
\"rdcc_nbytes\": 2*1024*1024 \n", - " }\n", - "}\n", - "for runs in range(5):\n", - " print(test_original.run(io_params))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eb2f92bf-4b1f-4bfe-a037-8061bfa9b127", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(1032.9840463639412, 34.1772985458374)\n", - "(1032.9840463639412, 30.96499228477478)\n", - "(1032.9840463639412, 31.00865602493286)\n", - "(1032.9840463639412, 31.207276821136475)\n" - ] - } - ], - "source": [ - "io_params ={\n", - " \"fsspec_params\": {\n", - " # \"skip_instance_cache\": True\n", - " # \"cache_type\": \"blockcache\",\n", - " # \"block_size\": 4*1024*1024\n", - " },\n", - " \"h5py_params\": {\n", - " # \"page_buf_size\": 32*1024*1024,\n", - " # \"rdcc_nbytes\": 2*1024*1024\n", - " }\n", - "}\n", - "for runs in range(5):\n", - " print(test_cloud.run(io_params))" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "262a6b25-8b23-46bb-8301-8965aaf155d2", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Registered drivers: frozenset({'mpio', 'family', 'ros3', 'split', 'core', 'sec2', 'fileobj', 'direct', 'stdio'})\n" - ] - } - ], - "source": [ - "print(f'Registered drivers: {h5py.registered_drivers()}')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "02258a85-4951-48c2-a295-75a47c0e38c1", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "\n", - "df = pd.DataFrame.from_dict(benchmarks)\n", - "\n", - "fig, ax = plt.subplots(figsize=(10, 6))\n", - "\n", - "for name, group in df.groupby(['tool', 'dataset', 'format']):\n", - " tool, dataset, formated = name\n", - " x = f'{tool}, {dataset}, {formated}'\n", - " y = group['time'].mean()\n", - " ax.bar(f'{tool}, {dataset}, {formated}', group['time'].mean(), label=f'{tool}, {dataset}, {formated}', 
align='center')\n", - " ax.text(x, y + 0.05, f'{group[\"time\"].mean():.2f}', ha='center', va='bottom', color='black', fontsize=8)\n", - "\n", - "# Set labels and title\n", - "ax.set_xlabel('Combination')\n", - "ax.set_ylabel('Time in Seconds')\n", - "ax.set_title('mean() on photon data for a single IS2 track, less is better')\n", - "\n", - "# Rotate x-axis labels for better readability\n", - "plt.xticks(rotation=45, ha='right')\n", - "\n", - "# # Show legend\n", - "# ax.legend()\n", - "\n", - "# Show the plot\n", - "with plt.xkcd():\n", - " # This figure will be in XKCD-style\n", - " fig1 = plt.figure()\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1d5d779b-5bc6-4208-ae26-05c5a473d9b9", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "64bcc5de-aae3-46aa-9474-1c90b9ff20a9", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "\n", - "df = pd.DataFrame.from_dict(benchmarks)\n", - "\n", - "fig, ax = plt.subplots(figsize=(10, 6))\n", - "\n", - "for name, group in df.groupby(['tool', 'dataset', 'format']):\n", - " tool, dataset, formated = name\n", - " x = f'{tool}, {dataset}, {formated}'\n", - " y = group['time'].mean()\n", - " ax.bar(f'{tool}, {dataset}, {formated}', group['time'].mean(), label=f'{tool}, {dataset}, {formated}', align='center')\n", - " ax.text(x, y + 0.05, f'{group[\"time\"].mean():.2f}', ha='center', va='bottom', color='black', fontsize=8)\n", - "\n", - "# Set labels and title\n", - "ax.set_xlabel('Combination')\n", - "ax.set_ylabel('Time in Seconds')\n", - "ax.set_title('mean() on photon data for a single IS2 track, less is better')\n", - "\n", - "# Rotate x-axis labels for better readability\n", - "plt.xticks(rotation=45, ha='right')\n", - "\n", - "# # Show legend\n", - "# ax.legend()\n", - "\n", - "# Show the plot\n", - "plt.show()\n" - ] - }, - { - "cell_type": 
"code", - "execution_count": null, - "id": "7b2871e1-d700-4d22-b01f-5e5a9acd1006", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "89aabece-d942-418c-b387-77ea2de1e561", - "metadata": {}, - "outputs": [], - "source": [ - "# def normalize_log(log_file):\n", - "# with open(log_file, 'r') as input_file:\n", - "# # Open the output file in write mode\n", - "# with open(f'{log_file.replace(\".log\", \"-ros-compatible.log\")}', 'w') as output_file:\n", - "# # Iterate through each line in the input file\n", - "# for line in input_file:\n", - "# # Strip leading and trailing whitespaces from the line\n", - "# stripped_line = line.strip()\n", - "\n", - "# # Write the stripped line to the output file\n", - "# output_file.write(stripped_line + '\\n') " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8411f4a8-22b4-424e-b817-0d31e7ac9e93", - "metadata": {}, - "outputs": [], - "source": [ - " # \"ATL08\": {\n", - " # \"links\": {\n", - " # \"original\": \"s3://nasa-cryo-persistent/h5cloud/atl08/original/ATL08_20200404075919_01340707_006_03.h5\",\n", - " # \"optimized\": \"s3://nasa-cryo-persistent/h5cloud/atl08/repacked/ATL08_20200404075919_01340707_006_03_repacked.h5\",\n", - " # },\n", - " # \"group\": \"/gt1l/signal_photons\",\n", - " # \"variable\": \"ph_h\",\n", - " # \"processing\": [\n", - " # \"h5repack -S PAGE -G 4000000\"\n", - " # ]\n", - " # },\n", - " # \"ATL03\": {\n", - " # \"links\": {\n", - " # \"original\": \"s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5\",\n", - " # \"optimized\": \"s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5\"\n", - " # },\n", - " # \"group\": \"/gt1l/heights\",\n", - " # \"variable\": \"h_ph\",\n", - " # \"processing\": [\n", - " # \"h5repack -S PAGE -G 4000000\"\n", - " # ]\n", - " # }," - ] - } - ], - "metadata": { - 
"kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.1" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/portable-h5py-test.ipynb b/notebooks/portable-h5py-test.ipynb new file mode 100644 index 0000000..82d88f5 --- /dev/null +++ b/notebooks/portable-h5py-test.ipynb @@ -0,0 +1,329 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "48daa283-8e1e-46e3-b4ce-1a0271b86d37", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload \n", + "\n", + "import sys\n", + "import os\n", + "classes_path = os.path.abspath('../h5tests/')\n", + "sys.path.append(classes_path)\n", + "from h5py_arr_mean import H5pyArrMean\n", + "import pandas as pd\n", + "\n", + "benchmarks = []" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d6ce77fd-f9cd-48b1-94cd-1fe57f52e11f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "original_granules = [\n", + " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5\",\n", + " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20190219140808_08110212_006_02.h5\",\n", + "]\n", + "h5py_original = H5pyArrMean('atl03-bigsize-original', files=original_granules, store_results=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "60eeeb1b-9531-4fec-a847-3ca5304c4685", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "h5py params: {}\n", + "h5py params: {}\n" + ] + }, + { + "data": { + "text/plain": [ + "[{'library': 'h5py',\n", + " 'format': 'original',\n", + " 'mean': 
1032.9840463639412,\n", + " 'time': 51.46329092979431,\n", + " 'total_requested_bytes': 414028873,\n", + " 'total_requests': 12295,\n", + " 'avg_req_size': 33674}]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# takes about ~30 seconds per granule out of region (6+ GB granules)\n", + "io_params ={\n", + " \"fsspec_params\": {},\n", + " \"h5py_params\" : {}\n", + "}\n", + "results = h5py_original.run(io_params)\n", + "benchmarks.append({\"library\": \"h5py\",\n", + " \"format\": \"original\",\n", + " \"mean\": results[0],\n", + " \"time\": results[1],\n", + " \"total_requested_bytes\": results[3][\"total_reqs_bytes\"],\n", + " \"total_requests\": results[3][\"total_reqs\"],\n", + " \"avg_req_size\": results[3][\"avg_req_size\"]})\n", + "benchmarks" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "64c4584f-c527-44bb-8c05-68a96820d1ff", + "metadata": {}, + "outputs": [], + "source": [ + "cloud_optimized_granules = [\n", + " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5\",\n", + " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20190219140808_08110212_006_02_repacked.h5\",\n", + "]\n", + "h5py_cloud = H5pyArrMean('atl03-bigsize-repacked', files=cloud_optimized_granules, store_results=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "dfd4e404-0412-4d2f-8eba-ca39a670e369", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "h5py params: {'page_buf_size': 33554432, 'rdcc_nbytes': 1048576}\n", + "h5py params: {'page_buf_size': 33554432, 'rdcc_nbytes': 1048576}\n" + ] + }, + { + "data": { + "text/plain": [ + "[{'library': 'h5py',\n", + " 'format': 'original',\n", + " 'mean': 1032.9840463639412,\n", + " 'time': 51.46329092979431,\n", + " 'total_requested_bytes': 414028873,\n", + " 'total_requests': 12295,\n", + " 
'avg_req_size': 33674},\n", + " {'library': 'h5py',\n", + " 'format': 'cloud',\n", + " 'mean': 1032.9840463639412,\n", + " 'time': 42.97014808654785,\n", + " 'total_requested_bytes': 560001136,\n", + " 'total_requests': 78,\n", + " 'avg_req_size': 7179501}]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# takes about ~30 seconds per granule out of region\n", + "io_params ={\n", + " \"fsspec_params\": {\n", + " # \"skip_instance_cache\": True\n", + " \"cache_type\": \"first\",\n", + " \"block_size\": 16*1024*1024\n", + " },\n", + " \"h5py_params\" : {\n", + " \"page_buf_size\": 32*1024*1024,\n", + " \"rdcc_nbytes\": 1024*1024\n", + " }\n", + "}\n", + "\n", + "results = h5py_cloud.run(io_params)\n", + "\n", + "benchmarks.append({\"library\": \"h5py\",\n", + " \"format\": \"cloud\",\n", + " \"mean\": results[0],\n", + " \"time\": results[1],\n", + " \"total_requested_bytes\": results[3][\"total_reqs_bytes\"],\n", + " \"total_requests\": results[3][\"total_reqs\"],\n", + " \"avg_req_size\": results[3][\"avg_req_size\"]})\n", + "benchmarks" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "9d30d92b-4192-4da1-8b60-41cc94ca2db1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
libraryformatmeantimetotal_requested_bytestotal_requestsavg_req_size
0h5pyoriginal1032.98404651.4632914140288731229533674
1h5pycloud1032.98404642.970148560001136787179501
\n", + "
" + ], + "text/plain": [ + " library format mean time total_requested_bytes \\\n", + "0 h5py original 1032.984046 51.463291 414028873 \n", + "1 h5py cloud 1032.984046 42.970148 560001136 \n", + "\n", + " total_requests avg_req_size \n", + "0 12295 33674 \n", + "1 78 7179501 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame.from_dict(benchmarks)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "3ff4c22f-7f77-4c69-a84c-f13b0fbba1f2", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "fig, ax = plt.subplots(figsize=(10, 6))\n", + "\n", + "for name, group in df.groupby(['library', 'format']):\n", + " library, format = name\n", + " x = f'{library}, {format}'\n", + " y = group['time'].mean()\n", + " ax.bar(f'{library}, {format}', group['time'].mean(), label=f'{library}, {format}', align='center')\n", + " ax.text(x, y + 0.05, f'{group[\"time\"].mean():.2f}', ha='center', va='bottom', color='black', fontsize=12)\n", + " ax.text(x, y - (y/2) - 10, f'Total Requests: {group[\"total_requests\"].mean()}', ha='center', va='bottom', color='black', fontsize=8)\n", + " ax.text(x, y - (y/2.5), f'Total Req Bytes (MB): {round(group[\"total_requested_bytes\"].mean() / (1024*1024) , 2)}', ha='center', va='bottom', color='black', fontsize=8)\n", + "\n", + "# Set labels and title\n", + "ax.set_xlabel('Access Pattern')\n", + "ax.set_ylabel('Time in Seconds')\n", + "ax.set_title(f'mean() on photon data for runs on ATL03, less is better ')\n", + "\n", + "# Rotate x-axis labels for better readability\n", + "plt.xticks(rotation=45, ha='right')\n", + "\n", + "# # Show legend\n", + "# ax.legend()\n", + "\n", + "# Show the plot\n", + "with plt.xkcd():\n", + " # This figure will be in XKCD-style\n", + " fig1 = plt.figure()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/portable-xarray-test.ipynb b/notebooks/portable-xarray-test.ipynb new file mode 100644 index 0000000..4e67969 --- /dev/null +++ b/notebooks/portable-xarray-test.ipynb 
@@ -0,0 +1,305 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "id": "48daa283-8e1e-46e3-b4ce-1a0271b86d37", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n", + "xarray v2024.1.1\n", + "h5py v3.10.0\n", + "s3fs v2024.2.0\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload \n", + "\n", + "import sys\n", + "import os\n", + "classes_path = os.path.abspath('../h5tests/')\n", + "sys.path.append(classes_path)\n", + "from xarray_arr_mean import XarrayArrMean\n", + "import pandas as pd\n", + "\n", + "import xarray as xr\n", + "import h5py\n", + "import s3fs\n", + "\n", + "benchmarks = []\n", + "\n", + "for library in (xr, h5py, s3fs):\n", + " print(f'{library.__name__} v{library.__version__}')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d6ce77fd-f9cd-48b1-94cd-1fe57f52e11f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "files = [\n", + " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5\",\n", + " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20190219140808_08110212_006_02.h5\",\n", + "]\n", + "xarray_original = XarrayArrMean('atl03-bigsize-original', files=files, store_results=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "60eeeb1b-9531-4fec-a847-3ca5304c4685", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'library': 'xarray',\n", + " 'format': 'cloud',\n", + " 'mean': 1032.984130859375,\n", + " 'time': 176.90762186050415,\n", + " 'total_requested_bytes': 720001152,\n", + " 'total_requests': 100,\n", + " 'avg_req_size': 7200011},\n", + " {'library': 'xarray',\n", + " 'format': 'original',\n", + " 'mean': 1032.984130859375,\n", + " 'time': 1456.8166418075562,\n", + " 
'total_requested_bytes': 438520591,\n", + " 'total_requests': 26988,\n", + " 'avg_req_size': 16248}]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# don't even try this out of region...\n", + "# takes about ~10 minutes per granule out of region (6+ GB granules)\n", + "io_params ={\n", + " \"fsspec_params\": {},\n", + " \"h5py_params\" : {}\n", + "}\n", + "results = xarray_original.run(io_params)\n", + "benchmarks.append({\"library\": \"xarray\",\n", + " \"format\": \"original\",\n", + " \"mean\": results[0],\n", + " \"time\": results[1],\n", + " \"total_requested_bytes\": results[3][\"total_reqs_bytes\"],\n", + " \"total_requests\": results[3][\"total_reqs\"],\n", + " \"avg_req_size\": results[3][\"avg_req_size\"]})\n", + "benchmarks" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "64c4584f-c527-44bb-8c05-68a96820d1ff", + "metadata": {}, + "outputs": [], + "source": [ + "files = [\n", + " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5\",\n", + " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20190219140808_08110212_006_02_repacked.h5\",\n", + "]\n", + "xarray_cloud = XarrayArrMean('atl03-bigsize-repacked', files=files, store_results=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "dfd4e404-0412-4d2f-8eba-ca39a670e369", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'library': 'xarray',\n", + " 'format': 'cloud',\n", + " 'mean': 1032.984130859375,\n", + " 'time': 176.90762186050415,\n", + " 'total_requested_bytes': 720001152,\n", + " 'total_requests': 100,\n", + " 'avg_req_size': 7200011}]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# takes about ~90 seconds per granule out of region\n", + "io_params ={\n", + " \"fsspec_params\": {\n", + " # \"skip_instance_cache\": 
True\n", + " \"cache_type\": \"blockcache\",\n", + " \"block_size\": 8*1024*1024\n", + " },\n", + " \"h5py_params\" : {\n", + " \"driver_kwds\": {\n", + " \"page_buf_size\": 32*1024*1024,\n", + " \"rdcc_nbytes\": 8*1024*1024\n", + " }\n", + "\n", + " }\n", + "}\n", + "\n", + "results = xarray_cloud.run(io_params)\n", + "\n", + "benchmarks.append({\"library\": \"xarray\",\n", + " \"format\": \"cloud\",\n", + " \"mean\": results[0],\n", + " \"time\": results[1],\n", + " \"total_requested_bytes\": results[3][\"total_reqs_bytes\"],\n", + " \"total_requests\": results[3][\"total_reqs\"],\n", + " \"avg_req_size\": results[3][\"avg_req_size\"]})\n", + "benchmarks" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9d30d92b-4192-4da1-8b60-41cc94ca2db1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: []\n", + "Index: []" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame.from_dict(benchmarks)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "3ff4c22f-7f77-4c69-a84c-f13b0fbba1f2", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "fig, ax = plt.subplots(figsize=(10, 6))\n", + "\n", + "for name, group in df.groupby(['library', 'format']):\n", + " library, format = name\n", + " x = f'{library}, {format}'\n", + " y = group['time'].mean()\n", + " ax.bar(f'{library}, {format}', group['time'].mean(), label=f'{library}, {format}', align='center')\n", + " ax.text(x, y + 0.05, f'{group[\"time\"].mean():.2f}', ha='center', va='bottom', color='black', fontsize=12)\n", + " ax.text(x, y - (y/2) - 10, f'Total Requests: {group[\"total_requests\"].mean()}', ha='center', va='bottom', color='black', fontsize=8)\n", + " ax.text(x, y - (y/2.5), f'Total Req Bytes (MB): {round(group[\"total_requested_bytes\"].mean() / (1024*1024) , 2)}', ha='center', va='bottom', color='black', fontsize=8)\n", + "\n", + "# Set labels and title\n", + "ax.set_xlabel('Access Pattern')\n", + "ax.set_ylabel('Time in Seconds')\n", + "ax.set_title(f'mean() on photon data for runs on ATL03, less is better ')\n", + "\n", + "# Rotate x-axis labels for better readability\n", + "plt.xticks(rotation=45, ha='right')\n", + "\n", + "# # Show legend\n", + "# ax.legend()\n", + "\n", + "# Show the plot\n", + "with plt.xkcd():\n", + " # This figure will be in XKCD-style\n", + " fig1 = plt.figure()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From a5fe2e281fcf84156d76a8ffbfd112acb428ad0e Mon Sep 17 00:00:00 2001 From: betolink Date: Mon, 5 Feb 2024 19:57:25 -0600 Subject: [PATCH 04/11] h5coro needs some upstream changes to work 
with annon=True access --- h5tests/h5coro_arr_mean.py | 17 ++- notebooks/portable-h5coro-test.ipynb | 162 +++++++++++++++++++++++++++ 2 files changed, 174 insertions(+), 5 deletions(-) create mode 100644 notebooks/portable-h5coro-test.ipynb diff --git a/h5tests/h5coro_arr_mean.py b/h5tests/h5coro_arr_mean.py index 58b3f76..5f93fcc 100644 --- a/h5tests/h5coro_arr_mean.py +++ b/h5tests/h5coro_arr_mean.py @@ -1,4 +1,4 @@ -from .h5test import H5Test, timer_decorator +from h5test import H5Test, timer_decorator import numpy as np import subprocess @@ -12,15 +12,22 @@ from h5coro import h5coro, s3driver, filedriver h5coro.config(errorChecking=True, verbose=False, enableAttributes=False) + +driver = s3driver.S3Driver class H5CoroArrMean(H5Test): @timer_decorator - def run(self): - group = '/gt1l/heights' - variable = 'h_ph' + def run(self, dataset="/gt1l/heights", variable="h_ph"): + group = dataset + variable = variable final_h5coro_array = [] + if self.files[0].startswith("s3://cryo"): + credentials = {} + else: + credentials = {"region_name": "us-west-2", + "anon": True} for file in self.files: - h5obj = h5coro.H5Coro(file.replace("s3://", ""), s3driver.S3Driver) + h5obj = h5coro.H5Coro(file.replace("s3://", ""), s3driver.S3Driver, credentials=credentials) output = h5obj.readDatasets(datasets=[f'{group}/{variable}'], block=True) data = h5obj[f'{group}/{variable}'].values final_h5coro_array = np.insert(final_h5coro_array, len(final_h5coro_array), data, axis=None) diff --git a/notebooks/portable-h5coro-test.ipynb b/notebooks/portable-h5coro-test.ipynb new file mode 100644 index 0000000..df2394f --- /dev/null +++ b/notebooks/portable-h5coro-test.ipynb @@ -0,0 +1,162 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "48daa283-8e1e-46e3-b4ce-1a0271b86d37", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload \n", + "\n", + "import sys\n", + "import os\n", + "classes_path = 
os.path.abspath('../h5tests/')\n", + "sys.path.append(classes_path)\n", + "from h5coro_arr_mean import H5CoroArrMean\n", + "import pandas as pd\n", + "\n", + "benchmarks = []" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d6ce77fd-f9cd-48b1-94cd-1fe57f52e11f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "original_granules = [\n", + " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5\",\n", + " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20190219140808_08110212_006_02.h5\",\n", + "]\n", + "h5coro_original = H5CoroArrMean('atl03-bigsize-original', files=original_granules, store_results=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "60eeeb1b-9531-4fec-a847-3ca5304c4685", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "ename": "FatalError", + "evalue": "invalid credential keys provided, looking for: aws_access_key_id, aws_secret_access_key, and aws_session_token", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFatalError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# takes about ~30 seconds per granule out of region (6+ GB granules)\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[43mh5coro_original\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m benchmarks\u001b[38;5;241m.\u001b[39mappend({\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlibrary\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mh5coro\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 4\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mformat\u001b[39m\u001b[38;5;124m\"\u001b[39m: 
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moriginal\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 5\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmean\u001b[39m\u001b[38;5;124m\"\u001b[39m: results[\u001b[38;5;241m0\u001b[39m],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtotal_requests\u001b[39m\u001b[38;5;124m\"\u001b[39m: results[\u001b[38;5;241m3\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtotal_reqs\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[1;32m 9\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mavg_req_size\u001b[39m\u001b[38;5;124m\"\u001b[39m: results[\u001b[38;5;241m3\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mavg_req_size\u001b[39m\u001b[38;5;124m\"\u001b[39m]})\n\u001b[1;32m 10\u001b[0m benchmarks\n", + "File \u001b[0;32m~/work/openscapes/h5cloud/h5tests/h5test.py:95\u001b[0m, in \u001b[0;36mtimer_decorator..wrapper\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 93\u001b[0m __setup_logging(\u001b[38;5;28mself\u001b[39m, tstamp)\n\u001b[1;32m 94\u001b[0m start_time \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m---> 95\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 96\u001b[0m end_time \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[1;32m 97\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlogs_regex:\n", + "File \u001b[0;32m~/work/openscapes/h5cloud/h5tests/h5coro_arr_mean.py:30\u001b[0m, in \u001b[0;36mH5CoroArrMean.run\u001b[0;34m(self, dataset, variable)\u001b[0m\n\u001b[1;32m 27\u001b[0m 
credentials \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mregion_name\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mus-west-2\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 28\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124manon\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;28;01mTrue\u001b[39;00m}\n\u001b[1;32m 29\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfiles:\n\u001b[0;32m---> 30\u001b[0m h5obj \u001b[38;5;241m=\u001b[39m \u001b[43mh5coro\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mH5Coro\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreplace\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43ms3://\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43ms3driver\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mS3Driver\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcredentials\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcredentials\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 31\u001b[0m output \u001b[38;5;241m=\u001b[39m h5obj\u001b[38;5;241m.\u001b[39mreadDatasets(datasets\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mgroup\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvariable\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m], block\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 32\u001b[0m data \u001b[38;5;241m=\u001b[39m 
h5obj[\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mgroup\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvariable\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mvalues\n", + "File \u001b[0;32m~/.pyenv/versions/mambaforge/envs/h5cloud/lib/python3.12/site-packages/h5coro/h5coro.py:2020\u001b[0m, in \u001b[0;36mH5Coro.__init__\u001b[0;34m(self, resource, driver_class, credentials, datasets, block)\u001b[0m\n\u001b[1;32m 2018\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, resource, driver_class, credentials\u001b[38;5;241m=\u001b[39m{}, datasets\u001b[38;5;241m=\u001b[39m[], block\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m):\n\u001b[1;32m 2019\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mresource \u001b[38;5;241m=\u001b[39m resource\n\u001b[0;32m-> 2020\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdriver \u001b[38;5;241m=\u001b[39m \u001b[43mdriver_class\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresource\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcredentials\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2022\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcache \u001b[38;5;241m=\u001b[39m {}\n\u001b[1;32m 2023\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmetaDataTable \u001b[38;5;241m=\u001b[39m {}\n", + "File \u001b[0;32m~/.pyenv/versions/mambaforge/envs/h5cloud/lib/python3.12/site-packages/h5coro/s3driver.py:43\u001b[0m, in \u001b[0;36mS3Driver.__init__\u001b[0;34m(self, resource, credentials)\u001b[0m\n\u001b[1;32m 41\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msession \u001b[38;5;241m=\u001b[39m boto3\u001b[38;5;241m.\u001b[39mSession()\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 43\u001b[0m 
\u001b[38;5;28;01mraise\u001b[39;00m FatalError(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124minvalid credential keys provided, looking for: aws_access_key_id, aws_secret_access_key, and aws_session_token\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 45\u001b[0m \u001b[38;5;66;03m# open resource\u001b[39;00m\n\u001b[1;32m 46\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mobj \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msession\u001b[38;5;241m.\u001b[39mresource(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124ms3\u001b[39m\u001b[38;5;124m'\u001b[39m)\u001b[38;5;241m.\u001b[39mObject(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mresourcePath[\u001b[38;5;241m0\u001b[39m], \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mresourcePath[\u001b[38;5;241m1\u001b[39m:]))\n", + "\u001b[0;31mFatalError\u001b[0m: invalid credential keys provided, looking for: aws_access_key_id, aws_secret_access_key, and aws_session_token" + ] + } + ], + "source": [ + "# takes about ~30 seconds per granule out of region (6+ GB granules)\n", + "results = h5coro_original.run()\n", + "benchmarks.append({\"library\": \"h5coro\",\n", + " \"format\": \"original\",\n", + " \"mean\": results[0],\n", + " \"time\": results[1],\n", + " \"total_requested_bytes\": results[3][\"total_reqs_bytes\"],\n", + " \"total_requests\": results[3][\"total_reqs\"],\n", + " \"avg_req_size\": results[3][\"avg_req_size\"]})\n", + "benchmarks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64c4584f-c527-44bb-8c05-68a96820d1ff", + "metadata": {}, + "outputs": [], + "source": [ + "cloud_optimized_granules = [\n", + " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5\",\n", + " 
\"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20190219140808_08110212_006_02_repacked.h5\",\n", + "]\n", + "h5py_cloud = H5pyArrMean('atl03-bigsize-repacked', files=cloud_optimized_granules, store_results=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d30d92b-4192-4da1-8b60-41cc94ca2db1", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame.from_dict(benchmarks)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ff4c22f-7f77-4c69-a84c-f13b0fbba1f2", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "fig, ax = plt.subplots(figsize=(10, 6))\n", + "\n", + "for name, group in df.groupby(['library', 'format']):\n", + " library, format = name\n", + " x = f'{library}, {format}'\n", + " y = group['time'].mean()\n", + " ax.bar(f'{library}, {format}', group['time'].mean(), label=f'{library}, {format}', align='center')\n", + " ax.text(x, y + 0.05, f'{group[\"time\"].mean():.2f}', ha='center', va='bottom', color='black', fontsize=12)\n", + " ax.text(x, y - (y/2) - 10, f'Total Requests: {group[\"total_requests\"].mean()}', ha='center', va='bottom', color='black', fontsize=8)\n", + " ax.text(x, y - (y/2.5), f'Total Req Bytes (MB): {round(group[\"total_requested_bytes\"].mean() / (1024*1024) , 2)}', ha='center', va='bottom', color='black', fontsize=8)\n", + "\n", + "# Set labels and title\n", + "ax.set_xlabel('Access Pattern')\n", + "ax.set_ylabel('Time in Seconds')\n", + "ax.set_title(f'mean() on photon data for runs on ATL03, less is better ')\n", + "\n", + "# Rotate x-axis labels for better readability\n", + "plt.xticks(rotation=45, ha='right')\n", + "\n", + "# # Show legend\n", + "# ax.legend()\n", + "\n", + "# Show the plot\n", + "with plt.xkcd():\n", + " # This figure will be in XKCD-style\n", + " fig1 = plt.figure()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": 
"python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From a7a891dbf5002028dbdea8d39c5610a3a259d907 Mon Sep 17 00:00:00 2001 From: betolink Date: Mon, 12 Feb 2024 10:19:30 -0600 Subject: [PATCH 05/11] refactoring the whole thing --- h5tests/h5coro_arr_mean.py | 44 ++- h5tests/h5py_arr_mean.py | 5 +- h5tests/h5py_arr_subset_mean.py | 44 +-- h5tests/h5test.py | 191 +++++----- h5tests/single-test.ipynb | 249 +++++-------- h5tests/xarray_arr_mean.py | 4 +- .../01_data-selection.ipynb | 0 .../arr_mean_bar_plot.png | Bin .../benchmark-h5repack.ipynb | 0 .../benchmark-small-file-h5repack.ipynb | 0 .../benchmarks-outline.ipynb | 0 .../cloud-optimized-hdf5.ipynb | 0 .../convert_h5dataframe2flatgeobuf.ipynb | 0 .../example-list-test-files.ipynb | 0 .../format-preprocessing-times.ipynb | 0 .../{ => data-wrangling}/fsspec-logs.ipynb | 0 .../h5coro_benchmarks.ipynb | 0 ...ng_original_repacked_with_subsetting.ipynb | 0 .../{ => data-wrangling}/kerchunker.ipynb | 0 .../{ => data-wrangling}/read-results.ipynb | 0 .../data-wrangling/ros3vfd-log-info.ipynb | 344 ++++++++++++++++++ .../{ => data-wrangling}/run-tests.ipynb | 0 .../sliderule2geoparquet.ipynb | 0 .../xarray-h5coro-backend.ipynb | 0 notebooks/portable-full-comparison.ipynb | 1 + notebooks/portable-h5py-test.ipynb | 192 ++-------- 26 files changed, 628 insertions(+), 446 deletions(-) rename notebooks/{ => data-wrangling}/01_data-selection.ipynb (100%) rename notebooks/{ => data-wrangling}/arr_mean_bar_plot.png (100%) rename notebooks/{ => data-wrangling}/benchmark-h5repack.ipynb (100%) rename notebooks/{ => data-wrangling}/benchmark-small-file-h5repack.ipynb (100%) rename notebooks/{ => data-wrangling}/benchmarks-outline.ipynb (100%) rename 
notebooks/{ => data-wrangling}/cloud-optimized-hdf5.ipynb (100%) rename notebooks/{ => data-wrangling}/convert_h5dataframe2flatgeobuf.ipynb (100%) rename notebooks/{ => data-wrangling}/example-list-test-files.ipynb (100%) rename notebooks/{ => data-wrangling}/format-preprocessing-times.ipynb (100%) rename notebooks/{ => data-wrangling}/fsspec-logs.ipynb (100%) rename notebooks/{ => data-wrangling}/h5coro_benchmarks.ipynb (100%) rename notebooks/{ => data-wrangling}/h5py_testing_original_repacked_with_subsetting.ipynb (100%) rename notebooks/{ => data-wrangling}/kerchunker.ipynb (100%) rename notebooks/{ => data-wrangling}/read-results.ipynb (100%) create mode 100644 notebooks/data-wrangling/ros3vfd-log-info.ipynb rename notebooks/{ => data-wrangling}/run-tests.ipynb (100%) rename notebooks/{ => data-wrangling}/sliderule2geoparquet.ipynb (100%) rename notebooks/{ => data-wrangling}/xarray-h5coro-backend.ipynb (100%) create mode 100644 notebooks/portable-full-comparison.ipynb diff --git a/h5tests/h5coro_arr_mean.py b/h5tests/h5coro_arr_mean.py index 5f93fcc..6969c4a 100644 --- a/h5tests/h5coro_arr_mean.py +++ b/h5tests/h5coro_arr_mean.py @@ -1,34 +1,40 @@ -from h5test import H5Test, timer_decorator -import numpy as np import subprocess +import numpy as np +from h5test import H5Test, timer_decorator + try: import h5coro except: - completed_process = subprocess.run([ - 'mamba', 'install', '-c', 'conda-forge', 'h5coro', '--yes' - ]) + completed_process = subprocess.run( + ["pip", "install", "git+https://github.com/ICESat2-SlideRule/h5coro.git@main"] + ) import h5coro -from h5coro import h5coro, s3driver, filedriver -h5coro.config(errorChecking=True, verbose=False, enableAttributes=False) +from h5coro import h5coro, s3driver + +driver = s3driver.S3Driver + -driver = s3driver.S3Driver - class H5CoroArrMean(H5Test): @timer_decorator def run(self, dataset="/gt1l/heights", variable="h_ph"): group = dataset - variable = variable + variable = variable final_h5coro_array = [] - 
if self.files[0].startswith("s3://cryo"): - credentials = {} - else: - credentials = {"region_name": "us-west-2", - "anon": True} + for file in self.files: - h5obj = h5coro.H5Coro(file.replace("s3://", ""), s3driver.S3Driver, credentials=credentials) - output = h5obj.readDatasets(datasets=[f'{group}/{variable}'], block=True) - data = h5obj[f'{group}/{variable}'].values - final_h5coro_array = np.insert(final_h5coro_array, len(final_h5coro_array), data, axis=None) + if link.startswith("s3://nasa-cryo-persistent/"): + h5obj = h5coro.H5Coro(link.replace("s3://", ""), s3driver.S3Driver) + else: + h5obj = h5coro.H5Coro( + link.replace("s3://", ""), + s3driver.S3Driver, + credentials={"annon": True}, + ) + ds = h5obj.readDatasets(datasets=[f"{group}/{variable}"], block=True) + data = ds[f"{group}/{variable}"][:] + final_h5coro_array = np.insert( + final_h5coro_array, len(final_h5coro_array), data, axis=None + ) return np.mean(final_h5coro_array) diff --git a/h5tests/h5py_arr_mean.py b/h5tests/h5py_arr_mean.py index d49431f..8e059cf 100644 --- a/h5tests/h5py_arr_mean.py +++ b/h5tests/h5py_arr_mean.py @@ -1,11 +1,11 @@ import h5py import numpy as np - -from h5test import H5Test, timer_decorator +from h5test import H5Test, fsspec_logging_decorator, timer_decorator class H5pyArrMean(H5Test): @timer_decorator + @fsspec_logging_decorator def run(self, io_params={}, dataset="/gt1l/heights", variable="h_ph"): final_h5py_array = [] fsspec_params = {} @@ -14,6 +14,7 @@ def run(self, io_params={}, dataset="/gt1l/heights", variable="h_ph"): fsspec_params = io_params["fsspec_params"] if "h5py_params" in io_params: h5py_params = io_params["h5py_params"] + self.file_sizes = [self.s3_fs.info(file)["size"] for file in self.files] for file in self.files: with self.s3_fs.open(file, mode="rb", **fsspec_params) as fo: print("h5py params: ", h5py_params) diff --git a/h5tests/h5py_arr_subset_mean.py b/h5tests/h5py_arr_subset_mean.py index e8ceeea..f2c2629 100644 --- 
a/h5tests/h5py_arr_subset_mean.py +++ b/h5tests/h5py_arr_subset_mean.py @@ -1,16 +1,16 @@ import os import sys -from .h5test import H5Test, timer_decorator import h5py import numpy as np +from h5test import H5Test, fsspec_logging_decorator, timer_decorator -current = os.path.abspath('..') +current = os.path.abspath("..") sys.path.append(current) -from helpers.geospatial import get_subset_region, get_subset_indices +from helpers.geospatial import get_subset_indices, get_subset_region + class H5pyArrSubsetMean(H5Test): - def __init__(self, data_format, geometry=None): """ geometry : path to geojson file containing geometry @@ -18,32 +18,34 @@ def __init__(self, data_format, geometry=None): """ super().__init__(data_format) self.bounds = get_subset_region(geometry) - + @timer_decorator - def run(self): - final_h5py_array = [] + @fsspec_logging_decorator + def run(self, io_params={}, dataset="/gt1l/heights", variable="h_ph"): + final_h5py_array = [] # TODO: Do we need to make this configurable or consistent? - group = '/gt1l/heights' - variable = 'h_ph' + if "fsspec_params" in io_params: + fsspec_params = io_params["fsspec_params"] + if "h5py_params" in io_params: + h5py_params = io_params["h5py_params"] for file in self.files: - with h5py.File(self.s3_fs.open(file, 'rb')) as f: - - lat = f[f'{group}/lat_ph'][:] - lon = f[f'{group}/lon_ph'][:] - + with h5py.File( + self.s3_fs.open(file, "rb", **fsspec_params), **h5py_params + ) as f: + lat = f[f"{dataset}/lat_ph"][:] + lon = f[f"{dataset}/lon_ph"][:] + idx_start, idx_end = get_subset_indices(lat, lon, self.bounds) - + # Leaving this code here so that we can create a DataFrame or - # Dataset at a later date. Suggest creating dict which can be + # Dataset at a later date. 
Suggest creating dict which can be # passsed to xarray or (geo)pandas # lat[idx_start:idx_end]) # lon[idx_start:idx_end]) - data = f[f'{group}/{variable}'][idx_start:idx_end] + data = f[f"{dataset}/{variable}"][idx_start:idx_end] # Need to test if using concatenate is faster final_h5py_array = np.insert( - final_h5py_array, - len(final_h5py_array), - data, axis=None + final_h5py_array, len(final_h5py_array), data, axis=None ) - return np.mean(final_h5py_array) \ No newline at end of file + return np.mean(final_h5py_array) diff --git a/h5tests/h5test.py b/h5tests/h5test.py index e7310cc..b77e81c 100644 --- a/h5tests/h5test.py +++ b/h5tests/h5test.py @@ -1,7 +1,7 @@ import csv import logging import os -import re +import pathlib import sys import time from datetime import datetime @@ -14,47 +14,34 @@ sys.path.append(current) -import csv -import logging -import os -import pathlib -import re -import time -from datetime import datetime -from io import StringIO - -import boto3 -import fsspec -import h5py -import numpy as np -import pandas as pd -import s3fs -import xarray as xr -from tqdm import tqdm - - -class RegexFilter(logging.Filter): - def __init__(self, regex_pattern): - super(RegexFilter, self).__init__() - self.regex_pattern = re.compile(regex_pattern) - - def filter(self, record): - # Apply the regex pattern to the log message - return not bool(self.regex_pattern.search(record.msg)) - - -def timer_decorator(func): +def fsspec_logging_decorator(func): """ - A decorator to measure the execution time of the wrapped function. 
+ It will store the fsspec logs inside ./logs and will get some stats from file access + Will pass values to timer_decorator """ + def __setup_logging(self): + pathlib.Path(f"./logs").mkdir(exist_ok=True) + logger = logging.getLogger("fsspec") + logger.setLevel(logging.DEBUG) + self._file_handler = logging.FileHandler(self.log_filename) + self._file_handler.setLevel(logging.DEBUG) + logging.getLogger("fsspec").addHandler(self._file_handler) + + def __turnoff_logging(self): + [ + logging.getLogger("fsspec").debug(f"FileSize: {size}") + for size in self.file_sizes + ] + logging.getLogger("fsspec").removeHandler(self._file_handler) + self._file_handler.close() + def fsspec_stats(log_file): + stats = None with open(log_file, "r") as input_file: num_requests = 0 total_requested_bytes = 0 for line in input_file: - # Strip leading and trailing whitespaces from the line - try: read_range = line.split("read:")[1].split(" - ") request_size = int(read_range[1]) - int(read_range[0]) @@ -62,47 +49,57 @@ def fsspec_stats(log_file): num_requests += 1 except Exception: pass - stats = { - "total_reqs": num_requests, - "total_reqs_bytes": total_requested_bytes, - "avg_req_size": int(round(total_requested_bytes / num_requests, 2)), - } + if total_requested_bytes > 0: + stats = { + "total_reqs": num_requests, + "total_reqs_bytes": total_requested_bytes, + "avg_req_size": int(round(total_requested_bytes / num_requests, 2)), + } return stats - def __setup_logging(self, tstamp): - pathlib.Path(f"./logs").mkdir(exist_ok=True) + def wrapper(self, *args, **kwargs): + tstamp = datetime.now().strftime("%Y-%m-%d-%H%M%S") self.log_filename = f"logs/{self.data_format}-{tstamp}.log" - logger = logging.getLogger("fsspec") - logger.setLevel(logging.DEBUG) - self.regex_filter = RegexFilter(self.logs_regex) - # add regerx to root logger - logging.getLogger("fsspec").addFilter(self.regex_filter) - self._file_handler = logging.FileHandler(self.log_filename) - self._file_handler.setLevel(logging.DEBUG) 
- # Add the handler to the root logger - logging.getLogger("fsspec").addHandler(self._file_handler) - def __turnoff_logging(self): - logging.getLogger("fsspec").removeFilter(self.regex_filter) - logging.getLogger("fsspec").removeHandler(self._file_handler) - self._file_handler.close() + __setup_logging(self) + result = func(self, *args, **kwargs) + __turnoff_logging(self) + + self.io_stats = fsspec_stats(self.log_filename) + return result, {"logs": self.log_filename, "io_stats": self.io_stats} + + return wrapper + + +def timer_decorator(func): + """ + A decorator to measure the execution time of the wrapped function. + """ def wrapper(self, *args, **kwargs): tstamp = datetime.now().strftime("%Y-%m-%d-%H%M%S") - if self.logs_regex: - __setup_logging(self, tstamp) + start_time = time.time() result = func(self, *args, **kwargs) end_time = time.time() - if self.logs_regex: - __turnoff_logging(self) execution_time = end_time - start_time + if "io_params" in kwargs: + self.runtime_params = kwargs["io_params"] + if len(args) > 0: + self.runtime_params = args[0] + # Call the store method here - self.io_stats = fsspec_stats(self.log_filename) if self.store_results: + if type(result) in [list, dict, tuple]: + # unpack + func_result, _ = result + else: + func_result = result results_key = f"{tstamp}_{self.name}_{self.data_format}_results.csv" - self.store(run_time=execution_time, result=result, file_name=results_key) - return result, execution_time, self.log_filename, self.io_stats + self.store( + run_time=execution_time, result=func_result, file_name=results_key + ) + return result, {"execution_time": execution_time} return wrapper @@ -113,13 +110,12 @@ def __init__( data_format: str, files=[], store_results=True, - logs_regex=r"\s*(read: \d+ - \d+)", ): self.name = self.__class__.__name__ - self.io_stats = {} + self.io_stats = None + self.runtime_params = None self.log_filename = "" self.data_format = data_format - self.logs_regex = logs_regex if len(files) > 0: self.files 
= files else: @@ -140,6 +136,7 @@ def __init__( self.results_store_type = "Local" self.s3_fs = s3fs.S3FileSystem(anon=self.annon_access) + self.file_sizes = [self.s3_fs.info(file)["size"] for file in self.files] @timer_decorator def run(self, io_params, dataset, variable): @@ -155,28 +152,50 @@ def store(self, run_time: float, result: str, file_name: str): # Create a CSV in-memory csv_buffer = StringIO() csv_writer = csv.writer(csv_buffer) - csv_writer.writerow( - [ - "Name", - "Data Format", - "Run Time", - "Result", - "Access Log", - "Total Bytes Tranferred", - "Total Requests", - ] - ) # Headers - csv_writer.writerow( - [ - self.name, - self.data_format, - run_time, - result, - self.log_filename, - self.io_stats["total_reqs_bytes"], - self.io_stats["total_reqs"], - ] - ) + if self.io_stats: # if we are using the fsspec logger decorator + csv_writer.writerow( + [ + "Name", + "Data Format", + "Run Time", + "Result", + "Runtime Params", + "Access Log", + "Total Bytes Tranferred", + "Total Requests", + "Average Request Size", + ] + ) # Headers + csv_writer.writerow( + [ + self.name, + self.data_format, + run_time, + result, + self.runtime_params, + self.log_filename, + self.io_stats["total_reqs_bytes"], + self.io_stats["total_reqs"], + self.io_stats["avg_req_size"], + ] + ) + else: + csv_writer.writerow( + [ + "Name", + "Data Format", + "Run Time", + "Result", + ] + ) # Headers + csv_writer.writerow( + [ + self.name, + self.data_format, + run_time, + result, + ] + ) # Reset the buffer's position to the beginning csv_buffer.seek(0) diff --git a/h5tests/single-test.ipynb b/h5tests/single-test.ipynb index 165aabc..cd7a3e0 100644 --- a/h5tests/single-test.ipynb +++ b/h5tests/single-test.ipynb @@ -1,22 +1,27 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "a1039b23-f008-4740-adbd-bafb8eaccfd2", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "## Testing access time on ICESat-2 ATL03 HDF5 files in AWS S3.\n", + "\n", + "This notebook 
runs a single test from the different access patterns and stores the results in `results/` and `logs/`\n", + "If we use files in the CryoCloud the results will be sent to the S3 bucket `s3://nasa-cryo-persistent/h5cloud/benchmark_results/`\n" + ] + }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 1, "id": "48daa283-8e1e-46e3-b4ce-1a0271b86d37", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], + "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload \n", @@ -33,141 +38,50 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 2, "id": "d6ce77fd-f9cd-48b1-94cd-1fe57f52e11f", "metadata": { "tags": [] }, "outputs": [], "source": [ - "files = [\n", - " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5\",\n", - " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20190219140808_08110212_006_02.h5\",\n", + "granules = [\n", + " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/average/original/ATL03_20200922221235_13680801_006_02.h5\",\n", + " # \"s3://its-live-data/cloud-experiments/h5cloud/atl03/average/original/ATL03_20191225111315_13680501_006_01.h5\",\n", "]\n", - "xarray_original = XarrayArrMean('atl03-bigsize-original', files=files, store_results=True)" + "\n", + "# We create the test cases for each kind of granule.\n", + "xarray_test = XarrayArrMean('atl03-xarray-original', files=granules, store_results=True)" ] }, { - "cell_type": "code", - "execution_count": 24, - "id": "60eeeb1b-9531-4fec-a847-3ca5304c4685", + "cell_type": "markdown", + "id": "33dcde98-df71-4e49-b051-f67c865981c6", "metadata": { - "tags": [] + "tags": [], + "user_expressions": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'library': 'xarray',\n", - " 'format': 'cloud',\n", - " 
'mean': 1032.984130859375,\n", - " 'time': 176.90762186050415,\n", - " 'total_requested_bytes': 720001152,\n", - " 'total_requests': 100,\n", - " 'avg_req_size': 7200011},\n", - " {'library': 'xarray',\n", - " 'format': 'original',\n", - " 'mean': 1032.984130859375,\n", - " 'time': 1456.8166418075562,\n", - " 'total_requested_bytes': 438520591,\n", - " 'total_requests': 26988,\n", - " 'avg_req_size': 16248}]" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# don't even try this out of region...\n", - "# takes about ~10 minutes per granule out of region (6+ GB granules)\n", - "io_params ={\n", - " \"fsspec_params\": {},\n", - " \"h5py_params\" : {}\n", - "}\n", - "results = xarray_original.run(io_params)\n", - "benchmarks.append({\"library\": \"xarray\",\n", - " \"format\": \"original\",\n", - " \"mean\": results[0],\n", - " \"time\": results[1],\n", - " \"total_requested_bytes\": results[3][\"total_reqs_bytes\"],\n", - " \"total_requests\": results[3][\"total_reqs\"],\n", - " \"avg_req_size\": results[3][\"avg_req_size\"]})\n", - "benchmarks" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "64c4584f-c527-44bb-8c05-68a96820d1ff", - "metadata": {}, - "outputs": [], "source": [ - "files = [\n", - " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5\",\n", - " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20190219140808_08110212_006_02_repacked.h5\",\n", - "]\n", - "xarray_cloud = XarrayArrMean('atl03-bigsize-repacked', files=files, store_results=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "dfd4e404-0412-4d2f-8eba-ca39a670e369", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'library': 'xarray',\n", - " 'format': 'cloud',\n", - " 'mean': 1032.984130859375,\n", - " 'time': 176.90762186050415,\n", - " 'total_requested_bytes': 720001152,\n", - 
" 'total_requests': 100,\n", - " 'avg_req_size': 7200011}]" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# takes about ~90 seconds per granule out of region\n", - "io_params ={\n", - " \"fsspec_params\": {\n", - " # \"skip_instance_cache\": True\n", - " \"cache_type\": \"blockcache\",\n", - " \"block_size\": 8*1024*1024\n", - " },\n", - " \"h5py_params\" : {\n", - " \"driver_kwds\": {\n", - " \"page_buf_size\": 32*1024*1024,\n", - " \"rdcc_nbytes\": 8*1024*1024\n", - " }\n", + "### Benchmarking access patterns \n", "\n", - " }\n", + "```python\n", + "io_params ={\n", + " \"fsspec_params\": {}, # if we use fsspec we can pass io params here\n", + " \"h5py_params\" : {} # if we use h5py we can pass io params here\n", "}\n", + "```\n", "\n", - "results = xarray_cloud.run(io_params)\n", - "\n", - "benchmarks.append({\"library\": \"xarray\",\n", - " \"format\": \"cloud\",\n", - " \"mean\": results[0],\n", - " \"time\": results[1],\n", - " \"total_requested_bytes\": results[3][\"total_reqs_bytes\"],\n", - " \"total_requests\": results[3][\"total_reqs\"],\n", - " \"avg_req_size\": results[3][\"avg_req_size\"]})\n", - "benchmarks" + "Accessing ATL03 with Xarray takes considerably longer than using h5py directly, this is mainly due to the decoding and metadata that Xarray uses to represent the data.\n", + "Using Xarray it takes approx ~10 minutes per granule out of region (6+ GB granules) and ~2 minutes per granule in-region (6+ GB granules) when we access the non-optimized granules.\n" ] }, { "cell_type": "code", - "execution_count": 25, - "id": "9d30d92b-4192-4da1-8b60-41cc94ca2db1", - "metadata": {}, + "execution_count": 3, + "id": "60eeeb1b-9531-4fec-a847-3ca5304c4685", + "metadata": { + "tags": [] + }, "outputs": [ { "data": { @@ -203,56 +117,85 @@ " \n", " 0\n", " xarray\n", - " cloud\n", - " 1032.984131\n", - " 176.907622\n", - " 720001152\n", - " 100\n", - " 7200011\n", - " \n", - " \n", - " 1\n", - " 
xarray\n", " original\n", - " 1032.984131\n", - " 1456.816642\n", - " 438520591\n", - " 26988\n", - " 16248\n", + " 18.128136\n", + " 70.16828\n", + " 50852992\n", + " 3828\n", + " 13284\n", " \n", " \n", "\n", "" ], "text/plain": [ - " library format mean time total_requested_bytes \\\n", - "0 xarray cloud 1032.984131 176.907622 720001152 \n", - "1 xarray original 1032.984131 1456.816642 438520591 \n", + " library format mean time total_requested_bytes \\\n", + "0 xarray original 18.128136 70.16828 50852992 \n", "\n", " total_requests avg_req_size \n", - "0 100 7200011 \n", - "1 26988 16248 " + "0 3828 13284 " ] }, - "execution_count": 25, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "# we don't need this when using the original granules.\n", + "io_params ={\n", + " \"fsspec_params\": {\n", + " # \"cache_type\": \"blockcache\",\n", + " # \"block_size\": 8*1024*1024\n", + " },\n", + " \"h5py_params\" : {\n", + "# \"driver_kwds\": {\n", + "# \"page_buf_size\": 64*1024*1024,\n", + "# \"rdcc_nbytes\": 8*1024*1024\n", + "# }\n", + "\n", + " }\n", + "}\n", + "\n", + "# this info gets stored in logs and csv files as usual but we want to plot them here too.\n", + "execution_info, execution_time = xarray_test.run(io_params)\n", + "\n", + "io_stats = execution_info[1][\"io_stats\"]\n", + "\n", + "benchmarks.append({\"library\": \"xarray\",\n", + " \"format\": \"original\",\n", + " \"mean\": execution_info[0],\n", + " \"time\": execution_time[\"execution_time\"],\n", + " \"total_requested_bytes\": io_stats[\"total_reqs_bytes\"],\n", + " \"total_requests\": io_stats[\"total_reqs\"],\n", + " \"avg_req_size\": io_stats[\"avg_req_size\"]})\n", + "\n", "df = pd.DataFrame.from_dict(benchmarks)\n", "df" ] }, + { + "cell_type": "markdown", + "id": "3d41bada-8735-4973-8536-bd9050b2e31f", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "## Plotting the results\n" + ] + }, { "cell_type": "code", - 
"execution_count": 26, + "execution_count": 4, "id": "3ff4c22f-7f77-4c69-a84c-f13b0fbba1f2", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [ { "data": { - "image/png": "", + "image/png": "\n", "text/plain": [ "
" ] @@ -281,8 +224,8 @@ " y = group['time'].mean()\n", " ax.bar(f'{library}, {format}', group['time'].mean(), label=f'{library}, {format}', align='center')\n", " ax.text(x, y + 0.05, f'{group[\"time\"].mean():.2f}', ha='center', va='bottom', color='black', fontsize=12)\n", - " ax.text(x, y - (y/2) - 10, f'Total Requests: {group[\"total_requests\"].mean()}', ha='center', va='bottom', color='black', fontsize=8)\n", - " ax.text(x, y - (y/2.5), f'Total Req Bytes (MB): {round(group[\"total_requested_bytes\"].mean() / (1024*1024) , 2)}', ha='center', va='bottom', color='black', fontsize=8)\n", + " ax.text(x, y - (y/2), f'Total Requests: {group[\"total_requests\"].mean()}', ha='center', va='bottom', color='black', fontsize=8)\n", + " ax.text(x, y - (y/2.7), f'Total Req Bytes (MB): {round(group[\"total_requested_bytes\"].mean() / (1024*1024) , 2)}', ha='center', va='bottom', color='black', fontsize=8)\n", "\n", "# Set labels and title\n", "ax.set_xlabel('Access Pattern')\n", @@ -318,7 +261,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.1" + "version": "3.10.13" } }, "nbformat": 4, diff --git a/h5tests/xarray_arr_mean.py b/h5tests/xarray_arr_mean.py index 59d636d..6287a39 100644 --- a/h5tests/xarray_arr_mean.py +++ b/h5tests/xarray_arr_mean.py @@ -1,8 +1,7 @@ import fsspec import numpy as np import xarray as xr - -from h5test import H5Test, timer_decorator +from h5test import H5Test, fsspec_logging_decorator class XarrayArrMean(H5Test): @@ -19,6 +18,7 @@ def open_reference_ds(self, file: str, dataset: str): ) @timer_decorator + @fsspec_logging_decorator def run(self, io_params={}, dataset="/gt1l/heights", variable="h_ph"): if "kerchunk" in self.data_format: datasets_ref = [ diff --git a/notebooks/01_data-selection.ipynb b/notebooks/data-wrangling/01_data-selection.ipynb similarity index 100% rename from notebooks/01_data-selection.ipynb rename to notebooks/data-wrangling/01_data-selection.ipynb diff --git 
a/notebooks/arr_mean_bar_plot.png b/notebooks/data-wrangling/arr_mean_bar_plot.png similarity index 100% rename from notebooks/arr_mean_bar_plot.png rename to notebooks/data-wrangling/arr_mean_bar_plot.png diff --git a/notebooks/benchmark-h5repack.ipynb b/notebooks/data-wrangling/benchmark-h5repack.ipynb similarity index 100% rename from notebooks/benchmark-h5repack.ipynb rename to notebooks/data-wrangling/benchmark-h5repack.ipynb diff --git a/notebooks/benchmark-small-file-h5repack.ipynb b/notebooks/data-wrangling/benchmark-small-file-h5repack.ipynb similarity index 100% rename from notebooks/benchmark-small-file-h5repack.ipynb rename to notebooks/data-wrangling/benchmark-small-file-h5repack.ipynb diff --git a/notebooks/benchmarks-outline.ipynb b/notebooks/data-wrangling/benchmarks-outline.ipynb similarity index 100% rename from notebooks/benchmarks-outline.ipynb rename to notebooks/data-wrangling/benchmarks-outline.ipynb diff --git a/notebooks/cloud-optimized-hdf5.ipynb b/notebooks/data-wrangling/cloud-optimized-hdf5.ipynb similarity index 100% rename from notebooks/cloud-optimized-hdf5.ipynb rename to notebooks/data-wrangling/cloud-optimized-hdf5.ipynb diff --git a/notebooks/convert_h5dataframe2flatgeobuf.ipynb b/notebooks/data-wrangling/convert_h5dataframe2flatgeobuf.ipynb similarity index 100% rename from notebooks/convert_h5dataframe2flatgeobuf.ipynb rename to notebooks/data-wrangling/convert_h5dataframe2flatgeobuf.ipynb diff --git a/notebooks/example-list-test-files.ipynb b/notebooks/data-wrangling/example-list-test-files.ipynb similarity index 100% rename from notebooks/example-list-test-files.ipynb rename to notebooks/data-wrangling/example-list-test-files.ipynb diff --git a/notebooks/format-preprocessing-times.ipynb b/notebooks/data-wrangling/format-preprocessing-times.ipynb similarity index 100% rename from notebooks/format-preprocessing-times.ipynb rename to notebooks/data-wrangling/format-preprocessing-times.ipynb diff --git 
a/notebooks/fsspec-logs.ipynb b/notebooks/data-wrangling/fsspec-logs.ipynb similarity index 100% rename from notebooks/fsspec-logs.ipynb rename to notebooks/data-wrangling/fsspec-logs.ipynb diff --git a/notebooks/h5coro_benchmarks.ipynb b/notebooks/data-wrangling/h5coro_benchmarks.ipynb similarity index 100% rename from notebooks/h5coro_benchmarks.ipynb rename to notebooks/data-wrangling/h5coro_benchmarks.ipynb diff --git a/notebooks/h5py_testing_original_repacked_with_subsetting.ipynb b/notebooks/data-wrangling/h5py_testing_original_repacked_with_subsetting.ipynb similarity index 100% rename from notebooks/h5py_testing_original_repacked_with_subsetting.ipynb rename to notebooks/data-wrangling/h5py_testing_original_repacked_with_subsetting.ipynb diff --git a/notebooks/kerchunker.ipynb b/notebooks/data-wrangling/kerchunker.ipynb similarity index 100% rename from notebooks/kerchunker.ipynb rename to notebooks/data-wrangling/kerchunker.ipynb diff --git a/notebooks/read-results.ipynb b/notebooks/data-wrangling/read-results.ipynb similarity index 100% rename from notebooks/read-results.ipynb rename to notebooks/data-wrangling/read-results.ipynb diff --git a/notebooks/data-wrangling/ros3vfd-log-info.ipynb b/notebooks/data-wrangling/ros3vfd-log-info.ipynb new file mode 100644 index 0000000..faa57d1 --- /dev/null +++ b/notebooks/data-wrangling/ros3vfd-log-info.ipynb @@ -0,0 +1,344 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c1d4cad4-c84c-4104-981c-9eb0a20f75fd", + "metadata": {}, + "source": [ + "# ROS3 VFD Log Analysis Dashboard" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f53c6f1c-e624-4952-a3e8-69302152da81", + "metadata": {}, + "outputs": [], + "source": [ + "from dataclasses import dataclass\n", + "import io\n", + "import re\n", + "import numpy as np\n", + "from bokeh.models import HoverTool\n", + "import holoviews as hv\n", + "import panel as pn\n", + "hv.extension('bokeh')\n", + "pn.extension()" + ] + }, + { + "cell_type": 
"markdown", + "id": "bc1a4585-45e7-48c7-a806-d7096f0f82bc", + "metadata": {}, + "source": [ + "## Log Parser\n", + "\n", + "The class that represents information of one HTTP range GET request:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2cef30c0-71b2-405f-83d4-a440df748a52", + "metadata": {}, + "outputs": [], + "source": [ + "@dataclass(frozen=True)\n", + "class ByteRange:\n", + " start: int\n", + " end: int\n", + " filesize: int\n", + "\n", + " def __post_init__(self):\n", + " if self.start < 0 or self.end <= 0 or self.filesize <= 0:\n", + " raise ValueError('Start, end, and file size values must be positive integers')\n", + " elif self.end > self.filesize:\n", + " raise ValueError('End value must be smaller or equal to file size')\n", + " elif self.start > self.end:\n", + " raise ValueError('Start value must be smaller or equal to end value')\n", + "\n", + " @property\n", + " def size(self):\n", + " return self.end - self.start + 1\n", + "\n", + " def __len__(self):\n", + " return self.size" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b205fe75-e4a3-4bb3-86ac-a24b630580f8", + "metadata": {}, + "outputs": [], + "source": [ + "def parse_fsspec_log(content: bytes) -> list[ByteRange]:\n", + " head_line = re.compile('read: 0 - ')\n", + " fsize_line = re.compile('FileSize: ([0-9]+)')\n", + " range_line = re.compile('\\s*(read: \\d+ - \\d+)')\n", + "\n", + " ranges = list()\n", + " with io.TextIOWrapper(io.BytesIO(content)) as logtxt:\n", + " for line in logtxt:\n", + " if head_line.match(line):\n", + " break\n", + " else:\n", + " raise RuntimeError('HEAD line not found in the log file')\n", + "\n", + " for line in logtxt:\n", + " match = fsize_line.match(line)\n", + " if match:\n", + " fsize = int(match.group(1))\n", + " break\n", + " else:\n", + " raise RuntimeError('FILESIZE line not found in the log file')\n", + "\n", + " for line in logtxt:\n", + " match = range_line.search(line)\n", + " if match:\n", + " 
range = ByteRange(start=int(match.group('start')), \n", + " end=int(match.group('end')),\n", + " filesize=fsize)\n", + " if range.size != int(match.group('size')):\n", + " raise ValueError(f'Reported size different for {match.group()}')\n", + " ranges.append(range)\n", + " \n", + " return ranges" + ] + }, + { + "cell_type": "markdown", + "id": "a5954181-f1d7-40f3-8594-3afe62bcd2aa", + "metadata": {}, + "source": [ + "Log file parser:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a4e4e020-c058-4a6c-b2dd-ed12962ae785", + "metadata": {}, + "outputs": [], + "source": [ + "def parse_ros3vfd_log(content: bytes) -> list[ByteRange]:\n", + " head_line = re.compile('HEAD: Bytes 0 - ')\n", + " fsize_line = re.compile('FILESIZE: ([0-9]+)')\n", + " range_line = re.compile('GET: Bytes (?P<start>[0-9]+) - (?P<end>[0-9]+), Request Size: (?P<size>[0-9]+)')\n", + "\n", + " ranges = list()\n", + " with io.TextIOWrapper(io.BytesIO(content)) as logtxt:\n", + " for line in logtxt:\n", + " if head_line.match(line):\n", + " break\n", + " else:\n", + " raise RuntimeError('HEAD line not found in the log file')\n", + "\n", + " for line in logtxt:\n", + " match = fsize_line.match(line)\n", + " if match:\n", + " fsize = int(match.group(1))\n", + " break\n", + " else:\n", + " raise RuntimeError('FILESIZE line not found in the log file')\n", + "\n", + " for line in logtxt:\n", + " match = range_line.search(line)\n", + " if match:\n", + " range = ByteRange(start=int(match.group('start')), \n", + " end=int(match.group('end')),\n", + " filesize=fsize)\n", + " if range.size != int(match.group('size')):\n", + " raise ValueError(f'Reported size different for {match.group()}')\n", + " ranges.append(range)\n", + " \n", + " return ranges" + ] + }, + { + "cell_type": "markdown", + "id": "502ba537-da68-4bc2-95bc-ccf73b1f3322", + "metadata": {}, + "source": [ + "## Dashboard\n", + "\n", + "Function for generating log stats and plots:" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + 
"id": "8738d59c-3ff6-4d83-9ef2-4eeeb93ee851", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_ros3vfd_log(from_file):\n", + " if from_file is None:\n", + " return\n", + " elif len(from_file) == 0:\n", + " return [pn.pane.Alert('ros3vfd log file empty.', alert_type='danger')]\n", + " try:\n", + " ranges = parse_ros3vfd_log(from_file)\n", + " except Exception as e:\n", + " return [pn.pane.Alert(f'Error: {str(e)}', alert_type='danger')]\n", + " if len(ranges) == 0:\n", + " return [pn.pane.Alert('No range `GET` info found.', alert_type='info')]\n", + " start = np.fromiter([r.start for r in ranges], dtype=np.uint64)\n", + " end = np.fromiter([r.end for r in ranges], dtype=np.uint64)\n", + " req_no = np.arange(len(ranges)) + 1\n", + " sizes = np.fromiter([r.size for r in ranges], np.uint64)\n", + " info = pn.pane.Markdown(f\"\"\"\n", + "# ros3vfd Log Information\n", + "\n", + "Log size: {len(from_file):,} bytes\n", + "\n", + "HDF5 file size: {ranges[0].filesize:,} bytes\n", + "\n", + "Number of range _GET_ requests: {len(ranges):,}\n", + "\n", + "Overall range _GET_ requests stats:\n", + "\n", + "* Smallest: {np.min(sizes):,} bytes
\n", + "* Median: {int(np.median(sizes)):,} bytes
\n", + "* Largest: {np.max(sizes):,} bytes\n", + "\n", + "Maximum file byte read: {end.max():,}\n", + "\n", + "Total of file content read: {sizes.sum():,} bytes\n", + "\n", + "Percentage of content read to file size: {100 * (sizes.sum() / ranges[0].filesize) :.2f} %\n", + "\"\"\")\n", + " data = dict(start=start, end=end, start_event=req_no, end_event=req_no)\n", + " max_offset_range = min(16_000_000, np.max(end))\n", + " req_range = np.where(end <= max_offset_range)[0]\n", + " if req_range.size == 0:\n", + " max_req_range = req_no[-1]\n", + " else:\n", + " max_req_range = req_no[np.where(end <= max_offset_range)[0][-1]] + 1\n", + " ros3plt = hv.Segments(\n", + " data, \n", + " [\n", + " hv.Dimension('start', label='File offset', range=(0, max_offset_range)),\n", + " hv.Dimension('start_event', label='Req. No.', range=(0, max_req_range)), \n", + " 'end', \n", + " 'end_event'\n", + " ]\n", + " )\n", + " hvrtip = HoverTool(\n", + " tooltips = [\n", + " ('req no', '@start_event'),\n", + " ('start byte', '@start'),\n", + " ('end byte', '@end')\n", + " ]\n", + " )\n", + " ros3plt.opts(width=700, height=600, invert_axes=True, color='blue', \n", + " line_width=3, tools=[hvrtip])\n", + " size_hist = hv.Histogram(np.histogram(sizes, bins=512))\n", + " size_hist.opts(color='blue', line_color=None, tools=['hover'],\n", + " xlabel='Size (bytes)', ylabel='Number of requests')\n", + " \n", + " return [pn.Row(info, size_hist), ros3plt]" + ] + }, + { + "cell_type": "markdown", + "id": "d1578b9a-1a8f-43cb-9902-96e21c83cf3a", + "metadata": {}, + "source": [ + "### Dashboard Components" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "553c70f5-6f78-4f28-9d6b-7998cbbc7ec2", + "metadata": {}, + "outputs": [], + "source": [ + "log_file = pn.widgets.FileInput()\n", + "upld_form = pn.Row(\n", + " pn.pane.Markdown('Please select a ros3vfd log file (limit 10MB):'),\n", + " log_file\n", + ")\n", + "res = pn.Column()\n", + "app = pn.WidgetBox(upld_form, res)" + ] + }, + 
{ + "cell_type": "markdown", + "id": "15126686-d315-4ae4-8666-054dd6127ba5", + "metadata": {}, + "source": [ + "Callback function for interactive log processing invocation:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "322e3372-be21-4d65-b97e-9db5a552aaf0", + "metadata": {}, + "outputs": [], + "source": [ + "def callback(value):\n", + " res.objects = plot_ros3vfd_log(value)" + ] + }, + { + "cell_type": "markdown", + "id": "dd2b2c83-1d2e-4db4-a23f-db9ad5d6f84d", + "metadata": {}, + "source": [ + "Register callback with the appropriate dashboard object:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "580279df-8b79-426f-bb22-002f117f200a", + "metadata": {}, + "outputs": [], + "source": [ + "log_file.param.watch_values(callback, ['value']);" + ] + }, + { + "cell_type": "markdown", + "id": "a3f35ce7-0698-4b02-a95e-75107138a29a", + "metadata": {}, + "source": [ + "Run the dashboard:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56c63583-922a-4b67-8cb0-8f3751ed7c28", + "metadata": {}, + "outputs": [], + "source": [ + "app.servable()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/run-tests.ipynb b/notebooks/data-wrangling/run-tests.ipynb similarity index 100% rename from notebooks/run-tests.ipynb rename to notebooks/data-wrangling/run-tests.ipynb diff --git a/notebooks/sliderule2geoparquet.ipynb b/notebooks/data-wrangling/sliderule2geoparquet.ipynb similarity index 100% rename from notebooks/sliderule2geoparquet.ipynb rename to notebooks/data-wrangling/sliderule2geoparquet.ipynb diff 
--git a/notebooks/xarray-h5coro-backend.ipynb b/notebooks/data-wrangling/xarray-h5coro-backend.ipynb similarity index 100% rename from notebooks/xarray-h5coro-backend.ipynb rename to notebooks/data-wrangling/xarray-h5coro-backend.ipynb diff --git a/notebooks/portable-full-comparison.ipynb b/notebooks/portable-full-comparison.ipynb new file mode 100644 index 0000000..334bddc --- /dev/null +++ b/notebooks/portable-full-comparison.ipynb @@ -0,0 +1 @@ +{"metadata":{"kernelspec":{"display_name":"Python 3 (ipykernel)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.13"}},"nbformat_minor":5,"nbformat":4,"cells":[{"cell_type":"markdown","source":"## AB testing access time for ICESat-2 ATL03 HDF5 files in the cloud.\n\nThis notebook requires that we have 2 versions of the same file:\n * Original A: The original file with no modifications on a S3 location.\n * Test Case B: A modified version of the orignal file to test for metadata consolidation, rechunking and other strategies to speed up access to the data in the file.\n","metadata":{"tags":[],"user_expressions":[]},"id":"6c9b37e2-2daa-4283-a228-ea581498de0c"},{"cell_type":"code","source":"import xarray as xr\nimport h5py\nimport fsspec\nimport logging\nimport re\nimport time\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\n\nfrom h5coro import h5coro, s3driver, filedriver\ndriver = s3driver.S3Driver\n\nlogger = logging.getLogger('fsspec')\nlogger.setLevel(logging.DEBUG)","metadata":{"trusted":true,"tags":[]},"execution_count":1,"outputs":[],"id":"3b78fb94-10ae-48cb-8e30-521b2c8b7822"},{"cell_type":"code","source":"for library in (xr, h5py, fsspec, h5coro):\n print(f'{library.__name__} 
v{library.__version__}')","metadata":{"trusted":true,"tags":[]},"execution_count":2,"outputs":[{"name":"stdout","output_type":"stream","text":"xarray v2023.12.0\nh5py v3.9.0\nfsspec v2023.6.0\nh5coro v0.0.6\n"}],"id":"431d900d-0656-4b75-af6b-82f0f171d5f8"},{"cell_type":"markdown","source":"For listing files in CryoCloud\n\n```bash\naws s3 ls s3://nasa-cryo-persistent/h5cloud/ --recursive\n```","metadata":{"tags":[],"user_expressions":[]},"id":"7998cd99-6034-4a1b-9ae5-d651bc265bff"},{"cell_type":"code","source":"test_dict = {\n \"ATL03-1GB\": {\n \"links\": {\n \"original\": \"s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5\",\n \"optimized\": \"s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5\"\n },\n \"group\": \"/gt1l/heights\",\n \"variable\": \"h_ph\",\n \"processing\": [\n \"h5repack -S PAGE -G 8000000\"\n ]\n },\n \"ATL03-7GB\": {\n \"links\": {\n \"original\": \"s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5\",\n \"optimized\": \"s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5\",\n },\n \"group\": \"/gt1l/heights\",\n \"variable\": \"h_ph\",\n \"processing\": [\n \"h5repack -S PAGE -G 8000000\"\n ]\n },\n \"ATL03-7GB-kerchunk\": {\n \"links\": {\n \"original\": \"s3://nasa-cryo-persistent/h5cloud/atl03/big/original/kerchunk/atl03_ATL03_20181120182818_08110112_006_02.json\",\n \"optimized\": \"s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/kerchunk/atl03_ATL03_20181120182818_08110112_006_02_repacked.json\",\n },\n \"group\": \"/gt1l/heights\",\n \"variable\": \"h_ph\",\n \"processing\": [\n \"h5repack -S PAGE -G 8000000\"\n ]\n }, \n \"ATL03-2GB\": {\n \"links\": {\n \"original\": \"s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20210402143840_01341107_006_02.h5\",\n \"optimized\": 
\"s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20210402143840_01341107_006_02_repacked.h5\",\n },\n \"group\": \"/gt1l/heights\",\n \"variable\": \"h_ph\",\n \"processing\": [\n \"h5repack -S PAGE -G 8000000\"\n ]\n }\n}\n\ndef kerchunk_result(file: str, dataset: str, variable: str):\n fs = fsspec.filesystem(\n \"reference\",\n fo=file,\n remote_protocol=\"s3\",\n remote_options=dict(anon=False),\n skip_instance_cache=True,\n )\n ds = xr.open_dataset(\n fs.get_mapper(\"\"), engine=\"zarr\", consolidated=False, group=dataset\n )\n return ds[variable].mean()\n\n# This will use the embedded credentials in the hub to access the s3://nasa-cryo-persistent bucket\nfs = fsspec.filesystem('s3')\n","metadata":{"trusted":true,"tags":[]},"execution_count":3,"outputs":[],"id":"9850faac-f534-4bc2-9214-c8dababe0f52"},{"cell_type":"markdown","source":"## [h5coro](https://github.com/ICESat2-SlideRule/h5coro/)\n\n**h5coro** is optimized for reading HDF5 data in high-latency high-throughput environments. It accomplishes this through a few key design decisions:\n* __All reads are concurrent.__ Each dataset and/or attribute read by **h5coro** is performed in its own thread.\n* __Intelligent range gets__ are used to read as many dataset chunks as possible in each read operation. This drastically reduces the number of HTTP requests to S3 and means there is no longer a need to re-chunk the data (it actually works better on smaller chunk sizes due to the granularity of the request).\n* __Block caching__ is used to minimize the number of GET requests made to S3. S3 has a large first-byte latency (we've measured it at ~60ms on our systems), which means there is a large penalty for each read operation performed. **h5coro** performs all reads to S3 as large block reads and then maintains data in a local cache for access to smaller amounts of data within those blocks.\n* __The system is serverless__ and does not depend on any external services to read the data. 
This means it scales naturally as the user application scales, and it reduces overall system complexity.\n* __No metadata repository is needed.__ The structure of the file are cached as they are read so that successive reads to other datasets in the same file will not have to re-read and re-build the directory structure of the file.\n","metadata":{"tags":[],"user_expressions":[]},"id":"4d166627-6144-40bf-884d-2188e5c764ba"},{"cell_type":"code","source":"h5coro_beanchmarks = []\n\nfor key, dataset in test_dict.items():\n for k, link in dataset[\"links\"].items():\n print (f\"Processing: {link}\")\n if \"kerchunk\" in link:\n continue\n group = dataset[\"group\"]\n variable = dataset['variable'] \n final_h5coro_array = []\n start = time.time()\n if link.startswith(\"s3://nasa-cryo-persistent/\"):\n h5obj = h5coro.H5Coro(link.replace(\"s3://\", \"\"), s3driver.S3Driver)\n else:\n h5obj = h5coro.H5Coro(link.replace(\"s3://\", \"\"), s3driver.S3Driver, credentials={\"annon\": True})\n ds = h5obj.readDatasets(datasets=[f'{group}/{variable}'], block=True)\n data = ds[f'{group}/{variable}'][:]\n data_mean = np.mean(data)\n elapsed = time.time() - start\n \n h5coro_beanchmarks.append({\"tool\": \"h5coro\",\n \"dataset\": key,\n \"cloud-aware\": \"no\",\n \"format\": k,\n \"file\": link,\n \"time\": elapsed,\n \"mean\": data_mean})\n\n\ndf = pd.DataFrame.from_dict(h5coro_beanchmarks)\n\npivot_df = df.pivot_table(index=['tool','dataset'], columns=['format'], values='time', aggfunc='mean')\n\n# Plotting\npivot_df.plot(kind='bar', figsize=(10, 6))\nplt.title('h5coro cloud optimized HDF5 performance')\nplt.xlabel('Tool')\nplt.ylabel('Mean Time')\nplt.xticks(rotation=90)\nplt.legend(title='Format')\nplt.grid(True)\nplt.show()","metadata":{"trusted":true,"tags":[]},"execution_count":33,"outputs":[{"name":"stdout","output_type":"stream","text":"Processing: s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5\nProcessing: 
s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/original/kerchunk/atl03_ATL03_20181120182818_08110112_006_02.json\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/kerchunk/atl03_ATL03_20181120182818_08110112_006_02_repacked.json\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20210402143840_01341107_006_02.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20210402143840_01341107_006_02_repacked.h5\n"},{"output_type":"display_data","data":{"image/png":"\n","text/plain":"
"},"metadata":{}}],"id":"efe41d4a-1947-438b-a3c3-7ab954d75e13"},{"cell_type":"markdown","source":"### Xarray + kerchunk, out of the box performance.","metadata":{"tags":[],"user_expressions":[]},"id":"8f0ba64d-d89c-4879-b965-f00d70956360"},{"cell_type":"code","source":"# this is going to keep our numbers without modifying the i/o paramters\nregular_xarray_benchmarks = []\nkerchunk_benchmarks = []\n\nfor key, dataset in test_dict.items():\n for k, link in dataset[\"links\"].items():\n print (f\"Processing: {link}\")\n try:\n log_filename = f\"logs/fsspec-xarray-{key}-{k}-default.log\"\n \n # Create a new FileHandler for each iteration\n file_handler = logging.FileHandler(log_filename)\n file_handler.setLevel(logging.DEBUG)\n\n # Add the handler to the root logger\n logging.getLogger().addHandler(file_handler)\n \n start = time.time()\n if \"kerchunk\" in link:\n data_mean = kerchunk_result(link, dataset[\"group\"], dataset[\"variable\"])\n elapsed = time.time() - start\n kerchunk_benchmarks.append(\n {\"tool\": \"kerchunk\",\n \"dataset\": key,\n \"cloud-aware\": \"no\",\n \"format\": k,\n \"file\": link,\n \"time\": elapsed,\n \"mean\": data_mean}) \n else:\n ds = xr.open_dataset(fs.open(link, mode='rb'), group=dataset[\"group\"], engine=\"h5netcdf\", decode_cf=False)\n data_mean = ds[dataset[\"variable\"]].mean() \n elapsed = time.time() - start\n regular_xarray_benchmarks.append(\n {\"tool\": \"xarray\",\n \"dataset\": key,\n \"cloud-aware\": \"no\",\n \"format\": k,\n \"file\": link,\n \"time\": elapsed,\n \"mean\": data_mean}) \n \n logging.getLogger().removeHandler(file_handler)\n file_handler.close()\n\n except Exception as e:\n print(e)","metadata":{"trusted":true,"tags":[]},"execution_count":22,"outputs":[{"name":"stdout","output_type":"stream","text":"Processing: s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5\nProcessing: 
s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/original/kerchunk/atl03_ATL03_20181120182818_08110112_006_02.json\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/kerchunk/atl03_ATL03_20181120182818_08110112_006_02_repacked.json\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20210402143840_01341107_006_02.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20210402143840_01341107_006_02_repacked.h5\n"}],"id":"ff56958f-8c1d-4fd7-b885-6efb81af8da7"},{"cell_type":"markdown","source":"### Plotting Results","metadata":{"tags":[],"user_expressions":[]},"id":"92a8e67d-026e-4c6b-aa7d-b19dc10f4afd"},{"cell_type":"code","source":"df = pd.DataFrame.from_dict(kerchunk_benchmarks + regular_xarray_benchmarks)\n\npivot_df = df.pivot_table(index=['tool','dataset'], columns=['format'], values='time', aggfunc='mean')\n\n# Plotting\npivot_df.plot(kind='bar', figsize=(10, 6))\n\nplt.title(\"Out of the box I/O parameters\", fontsize=10)\nplt.suptitle('Cloud-optimized HDF5 performance (less is better)', fontsize=14)\n\nplt.xlabel('Tool')\nplt.ylabel('Mean Time')\nplt.xticks(rotation=90)\nplt.legend(title='Format')\nplt.grid(True)\nplt.show()","metadata":{"trusted":true,"tags":[]},"execution_count":24,"outputs":[{"output_type":"display_data","data":{"image/png":"\n","text/plain":"
"},"metadata":{}}],"id":"149d5972-c5b9-4f29-979a-cf46c9654a06"},{"cell_type":"markdown","source":"## h5py out of the box performance.","metadata":{"tags":[],"user_expressions":[]},"id":"fa6ac2b9-989c-4246-bb89-b54b711dd695"},{"cell_type":"code","source":"regular_h5py_benchmarks = []\n\nfor key, dataset in test_dict.items():\n for k, link in dataset[\"links\"].items():\n try:\n if \"kerchunk\" in link:\n continue \n print (f\"Processing: {link}\")\n log_filename = f\"logs/fsspec-h5py-{key}-{k}_default.log\"\n \n # Create a new FileHandler for each iteration\n file_handler = logging.FileHandler(log_filename)\n file_handler.setLevel(logging.DEBUG)\n\n # Add the handler to the root logger\n logging.getLogger().addHandler(file_handler)\n # this is mostly IO so no perf_counter is needed\n start = time.time()\n with h5py.File(fs.open(link, mode=\"rb\")) as f:\n path = f\"{dataset['group']}/{dataset['variable']}\"\n data = f[path][:]\n data_mean = data.mean()\n elapsed = time.time() - start\n regular_h5py_benchmarks.append(\n {\"tool\": \"h5py\",\n \"dataset\": key,\n \"cloud-aware\": \"no\",\n \"format\": k,\n \"file\": link,\n \"time\": elapsed,\n \"mean\": data_mean})\n\n logging.getLogger().removeHandler(file_handler) \n file_handler.close()\n \n except Exception as e:\n print(e)","metadata":{"trusted":true,"tags":[]},"execution_count":25,"outputs":[{"name":"stdout","output_type":"stream","text":"Processing: s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20210402143840_01341107_006_02.h5\nProcessing: 
s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20210402143840_01341107_006_02_repacked.h5\n"}],"id":"98c29558-de50-44af-87e9-074092fcd0ac"},{"cell_type":"markdown","source":"### Plotting Results","metadata":{"tags":[],"user_expressions":[]},"id":"f4232e98-1159-45eb-ba11-0f0dbb905d83"},{"cell_type":"code","source":"df = pd.DataFrame.from_dict(regular_h5py_benchmarks)\n\npivot_df = df.pivot_table(index=['tool','dataset'], columns=['format'], values='time', aggfunc='mean')\n\n# Plotting\npivot_df.plot(kind='bar', figsize=(10, 6))\nplt.suptitle('Cloud-optimized HDF5 performance (less is better)', fontsize=14)\nplt.title(\"Out of the box I/O parameters\", fontsize=10)\n\nplt.xlabel('Tool')\nplt.ylabel('Mean Time')\nplt.xticks(rotation=45)\nplt.legend(title='Format')\nplt.grid(True)\nplt.show()","metadata":{"trusted":true,"tags":[]},"execution_count":26,"outputs":[{"output_type":"display_data","data":{"image/png":"\n","text/plain":"
"},"metadata":{}}],"id":"d8fa6dca-f408-4298-beca-f2839d4c3b67"},{"cell_type":"markdown","source":"## Aggregated plot by tool and different file sizes","metadata":{"tags":[],"user_expressions":[]},"id":"b20b2032-9ab4-46e1-b1f8-2e62b656a265"},{"cell_type":"code","source":"df = pd.DataFrame.from_dict(regular_h5py_benchmarks + kerchunk_benchmarks + regular_xarray_benchmarks + h5coro_beanchmarks)\n\npivot_df = df.pivot_table(index=['tool','dataset'], columns=['format'], values='time', aggfunc='mean')\n\n# Plotting\npivot_df.plot(kind='bar', figsize=(10, 6))\nplt.suptitle('Cloud-optimized HDF5 performance (less is better)', fontsize=14)\nplt.title(\"Out of the box I/O parameters\", fontsize=10)\nplt.xlabel('Tool')\nplt.ylabel('Mean Time')\nplt.xticks(rotation=90)\nplt.legend(title='Format')\nplt.grid(True)\nplt.show()","metadata":{"trusted":true,"tags":[]},"execution_count":27,"outputs":[{"output_type":"display_data","data":{"image/png":"\n","text/plain":"
"},"metadata":{}}],"id":"64bcc5de-aae3-46aa-9474-1c90b9ff20a9"},{"cell_type":"markdown","source":"## Now let's run the tests with \"informed\" parameters, i.e. I/O that aligns with the cloud-optimized granules' chunking strategy and consolidated metadata.\n","metadata":{"tags":[],"user_expressions":[]},"id":"0ea67b0b-5e7f-4d1f-bca9-1f3cae7fe309"},{"cell_type":"code","source":"optimized_h5py_benchmarks = []\noptimized_xarray_benchmarks = []\n\nfor key, dataset in test_dict.items():\n for k, link in dataset[\"links\"].items():\n print(f\"Processing: {link}\")\n try:\n log_filename = f\"logs/fsspec-xarray-{key}-{k}.log\"\n \n # Create a new FileHandler for each iteration\n file_handler = logging.FileHandler(log_filename)\n file_handler.setLevel(logging.DEBUG)\n\n # Add the handler to the root logger\n logging.getLogger().addHandler(file_handler)\n \n io_params = {\n \"fsspec_params\": {},\n \"h5py_params\": {}\n }\n \n if \"repacked\" in link: \n io_params ={\n \"fsspec_params\": {\n \"cache_type\": \"blockcache\",\n \"block_size\": 8*1024*1024\n },\n \"h5py_params\" : {\n \"driver_kwds\": {\n \"page_buf_size\": 64*1024*1024,\n \"rdcc_nbytes\": 8*1024*1024\n }\n\n }\n }\n\n if \"kerchunk\" in link:\n continue\n \n start = time.time()\n ds = xr.open_dataset(fs.open(link, mode='rb', **io_params[\"fsspec_params\"]), group=dataset[\"group\"], engine=\"h5netcdf\", decode_cf=False)\n data_mean = ds[dataset[\"variable\"]].mean()\n elapsed = time.time() - start\n optimized_xarray_benchmarks.append(\n {\"tool\": \"xarray\",\n \"dataset\": key,\n \"cloud-aware\": \"yes\",\n \"format\": k,\n \"file\": link,\n \"time\": elapsed,\n \"mean\": data_mean})\n \n logging.getLogger().removeHandler(file_handler)\n file_handler.close()\n\n except Exception as e:\n print(e)\n \nfor key, dataset in test_dict.items():\n for k, link in dataset[\"links\"].items():\n try:\n if \"kerchunk\" in link:\n continue \n print (f\"Processing: {link}\")\n log_filename = 
f\"logs/fsspec-h5py-{key}-{k}_default.log\"\n \n # Create a new FileHandler for each iteration\n file_handler = logging.FileHandler(log_filename)\n file_handler.setLevel(logging.DEBUG)\n\n # Add the handler to the root logger\n logging.getLogger().addHandler(file_handler)\n # this is mostly IO so no perf_counter is needed\n start = time.time()\n io_params = {\n \"fsspec_params\": {},\n \"h5py_params\": {}\n }\n \n if \"repacked\" in link: \n io_params ={\n \"fsspec_params\": {\n \"cache_type\": \"blockcache\",\n \"block_size\": 8*1024*1024\n },\n \"h5py_params\" : {\n \"page_buf_size\": 64*1024*1024,\n \"rdcc_nbytes\": 8*1024*1024\n }\n } \n with h5py.File(fs.open(link, mode=\"rb\", **io_params[\"fsspec_params\"]), **io_params[\"h5py_params\"]) as f:\n path = f\"{dataset['group']}/{dataset['variable']}\"\n data = f[path][:]\n data_mean = data.mean()\n elapsed = time.time() - start\n optimized_h5py_benchmarks.append(\n {\"tool\": \"h5py\",\n \"dataset\": key,\n \"cloud-aware\": \"yes\",\n \"format\": k,\n \"file\": link,\n \"time\": elapsed,\n \"mean\": data_mean})\n\n logging.getLogger().removeHandler(file_handler) \n file_handler.close()\n \n\n except Exception as e:\n print(e)","metadata":{"trusted":true,"tags":[]},"execution_count":28,"outputs":[{"name":"stdout","output_type":"stream","text":"Processing: s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/original/kerchunk/atl03_ATL03_20181120182818_08110112_006_02.json\nProcessing: 
s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/kerchunk/atl03_ATL03_20181120182818_08110112_006_02_repacked.json\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20210402143840_01341107_006_02.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20210402143840_01341107_006_02_repacked.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20210402143840_01341107_006_02.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20210402143840_01341107_006_02_repacked.h5\n"}],"id":"8151834b-0b57-4a3d-98b5-8cfaffa37dc4"},{"cell_type":"markdown","source":"## Plotting results","metadata":{"tags":[],"user_expressions":[]},"id":"04414c2e-0666-4701-8ecc-7842727ede22"},{"cell_type":"code","source":"df = pd.DataFrame.from_dict(optimized_h5py_benchmarks+h5coro_beanchmarks+optimized_xarray_benchmarks+kerchunk_benchmarks)\n\npivot_df = df.pivot_table(index=['tool','dataset'], columns=['format'], values='time', aggfunc='mean')\n\n# Plotting\npivot_df.plot(kind='bar', figsize=(10, 6))\n\nplt.suptitle('Cloud-optimized HDF5 performance (less is better)', fontsize=14)\nplt.title(\"Informed I/O parameters\", fontsize=10)\nplt.xlabel('Tool')\nplt.ylabel('Mean Time')\nplt.xticks(rotation=90)\nplt.legend(title='Format')\nplt.grid(True)\nplt.show()","metadata":{"trusted":true,"tags":[]},"execution_count":29,"outputs":[{"output_type":"display_data","data":{"image/png":"\n","text/plain":"
"},"metadata":{}}],"id":"2db2535a-8d3a-4e65-b21c-8db6b48074c8"},{"cell_type":"markdown","source":"## Plotting tool specific performance","metadata":{"tags":[],"user_expressions":[]},"id":"ea0db03e-5653-4908-ada1-16d723666e18"},{"cell_type":"code","source":"df = pd.DataFrame.from_dict(regular_xarray_benchmarks+optimized_xarray_benchmarks)\n\npivot_df = df.pivot_table(index=['dataset','cloud-aware'], columns=['format'], values='time', aggfunc='mean')\n\n# Plotting\npivot_df.plot(kind='bar', figsize=(10, 6))\nplt.title('Xarray \"Cloud-awared\" Access Pattern Performance (less is better)')\nplt.xlabel('Tool')\nplt.ylabel('Mean Time')\nplt.xticks(rotation=90)\nplt.legend(title='Format')\nplt.grid(True)\nplt.show()","metadata":{"trusted":true,"tags":[]},"execution_count":32,"outputs":[{"output_type":"display_data","data":{"image/png":"\n","text/plain":"
"},"metadata":{}}],"id":"47444e8a-6d59-42c2-baff-a3c85c447eb2"}]} \ No newline at end of file diff --git a/notebooks/portable-h5py-test.ipynb b/notebooks/portable-h5py-test.ipynb index 82d88f5..05c60b8 100644 --- a/notebooks/portable-h5py-test.ipynb +++ b/notebooks/portable-h5py-test.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "48daa283-8e1e-46e3-b4ce-1a0271b86d37", "metadata": { "tags": [] @@ -24,7 +24,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "d6ce77fd-f9cd-48b1-94cd-1fe57f52e11f", "metadata": { "tags": [] @@ -32,52 +32,29 @@ "outputs": [], "source": [ "original_granules = [\n", - " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5\",\n", - " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20190219140808_08110212_006_02.h5\",\n", + " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/average/original/ATL03_20191225111315_13680501_006_01.h5\",\n", + " # \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20190219140808_08110212_006_02.h5\",\n", "]\n", "h5py_original = H5pyArrMean('atl03-bigsize-original', files=original_granules, store_results=True)" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "60eeeb1b-9531-4fec-a847-3ca5304c4685", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "h5py params: {}\n", - "h5py params: {}\n" - ] - }, - { - "data": { - "text/plain": [ - "[{'library': 'h5py',\n", - " 'format': 'original',\n", - " 'mean': 1032.9840463639412,\n", - " 'time': 51.46329092979431,\n", - " 'total_requested_bytes': 414028873,\n", - " 'total_requests': 12295,\n", - " 'avg_req_size': 33674}]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# takes about ~30 seconds per 
granule out of region (6+ GB granules)\n", "io_params ={\n", " \"fsspec_params\": {},\n", " \"h5py_params\" : {}\n", "}\n", + "\n", "results = h5py_original.run(io_params)\n", + "\n", "benchmarks.append({\"library\": \"h5py\",\n", " \"format\": \"original\",\n", " \"mean\": results[0],\n", @@ -90,63 +67,31 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "64c4584f-c527-44bb-8c05-68a96820d1ff", "metadata": {}, "outputs": [], "source": [ "cloud_optimized_granules = [\n", - " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5\",\n", - " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20190219140808_08110212_006_02_repacked.h5\",\n", + " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/average/repacked/ATL03_20191225111315_13680501_006_01.h5\",\n", + " # \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20190219140808_08110212_006_02_repacked.h5\",\n", "]\n", "h5py_cloud = H5pyArrMean('atl03-bigsize-repacked', files=cloud_optimized_granules, store_results=True)" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "dfd4e404-0412-4d2f-8eba-ca39a670e369", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "h5py params: {'page_buf_size': 33554432, 'rdcc_nbytes': 1048576}\n", - "h5py params: {'page_buf_size': 33554432, 'rdcc_nbytes': 1048576}\n" - ] - }, - { - "data": { - "text/plain": [ - "[{'library': 'h5py',\n", - " 'format': 'original',\n", - " 'mean': 1032.9840463639412,\n", - " 'time': 51.46329092979431,\n", - " 'total_requested_bytes': 414028873,\n", - " 'total_requests': 12295,\n", - " 'avg_req_size': 33674},\n", - " {'library': 'h5py',\n", - " 'format': 'cloud',\n", - " 'mean': 1032.9840463639412,\n", - " 'time': 42.97014808654785,\n", - " 'total_requested_bytes': 560001136,\n", - " 'total_requests': 78,\n", - " 'avg_req_size': 7179501}]" - 
] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# takes about ~30 seconds per granule out of region\n", "io_params ={\n", " \"fsspec_params\": {\n", " # \"skip_instance_cache\": True\n", " \"cache_type\": \"first\",\n", - " \"block_size\": 16*1024*1024\n", + " \"block_size\": 8*1024*1024\n", " },\n", " \"h5py_params\" : {\n", " \"page_buf_size\": 32*1024*1024,\n", @@ -168,80 +113,21 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, + "id": "3059ebd8-b110-49c5-9250-2a2cd009338f", + "metadata": {}, + "outputs": [], + "source": [ + "for run in range(5):\n", + " results = h5py_cloud.run(io_params)" + ] + }, + { + "cell_type": "code", + "execution_count": null, "id": "9d30d92b-4192-4da1-8b60-41cc94ca2db1", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
libraryformatmeantimetotal_requested_bytestotal_requestsavg_req_size
0h5pyoriginal1032.98404651.4632914140288731229533674
1h5pycloud1032.98404642.970148560001136787179501
\n", - "
" - ], - "text/plain": [ - " library format mean time total_requested_bytes \\\n", - "0 h5py original 1032.984046 51.463291 414028873 \n", - "1 h5py cloud 1032.984046 42.970148 560001136 \n", - "\n", - " total_requests avg_req_size \n", - "0 12295 33674 \n", - "1 78 7179501 " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df = pd.DataFrame.from_dict(benchmarks)\n", "df" @@ -249,30 +135,10 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "3ff4c22f-7f77-4c69-a84c-f13b0fbba1f2", "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "\n", From 2e47c3044e1f4353354d2a1dbb5716ae85632d7a Mon Sep 17 00:00:00 2001 From: betolink Date: Sun, 18 Feb 2024 23:26:48 -0600 Subject: [PATCH 06/11] update environment --- environment.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/environment.yml b/environment.yml index 0c9d76a..555ab7f 100644 --- a/environment.yml +++ b/environment.yml @@ -10,14 +10,14 @@ dependencies: - numpy - s3fs - xarray - - fsspec - dask - distributed - geopandas - - h5py>3.9 + - h5py>=3.10 - zarr - kerchunk - h5netcdf - pip - pip: - - h5coro + - git+https://github.com/betolink/filesystem_spec.git + - git+https://github.com/ICESat2-SlideRule/h5coro.git From d7e8aae1040cca3f2c2f3705fe7272b8b9dc675d Mon Sep 17 00:00:00 2001 From: Andy Barrett Date: Wed, 28 Feb 2024 17:51:53 +0000 Subject: [PATCH 07/11] Add plot --- notebooks/access_time.summary.png | Bin 0 -> 49179 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 notebooks/access_time.summary.png diff --git a/notebooks/access_time.summary.png b/notebooks/access_time.summary.png new file mode 100644 index 0000000000000000000000000000000000000000..b8c3e4e00e52afc123b6eba137434eada48d9a7a GIT binary patch literal 49179 zcmbrm1zgqH+6Kz#7-Np3*ubcu0s@L40s>mVcw&> z{0DwDb#Sn?7vbZx`uiJrZR||=ex06sfe-n~_O!M=6VrMl^4}lgCF0DOn2P_ToI0*{ zDP*YCMaz#dH~)#gX6f#TU$5ql`iPpF#~O#c&U*9uj;wi}`URE@o6sjiegWEX3M>^0 zH@Ed@=)@XdxDXrrfx60?ooj#WZr&f`hhERjbqH;&U9+;}*Zoyxj*cvaqi!S99_^pQ zf;uh4Kj*F8e6w^nE@t5$C9aoayx;$FdGLoTZiloEZ2okTiRm}VmnO_i5xYC>SEWYS zFflQQW>j%4ndE+~bs+z1Pq1SJPtM2V>zU&3eo0=!q;}q2;TiMzTBe9d?xBqGr#nX^ zY3^U2%*@QNQv!MQscC68!kRxaT^18h!Y6GDSG8T{v6LykZ`E`%zrTqx{b6)8``0gD z{G7Kj`#d|Nbo%t4R4R3=sYt==gT{O{(t}WGhCxf zQcX7Bl)R9kjsM+zGp3+|bK~-B$Af}`%3r<8v##1OB2BJQ{{7h4SY5pGp5}0SJG=6{ z$T6dXThAZd+V%dua$C&8ujJxfGRc28FfgtnkdIqbl&13g2{SXZ@`?)XQGVOokH}Mc 
zDV))mJU=_?MU5UGcfh&`u$SzvIM1J8UY|HNF~K{Eb&Y@g*Ic0mcXxH|7Zg;-wWud) z_{YV?tyzB3B~U%~n3x#%u&GS$og(s_dtYD3s068Z9*x6z(Kyr0AGEcVc4V+J5cXuQoO|-90`0 z%Qlc-(~-EF*=bKk;{l4l=lNYsOxOGwo8{x+MN;DZp+mVoAsmC!{7IfainX!s|hkojkB}?kk&q=$u zx^jt&8?<{&h8E4wx!v9&q_F!63C&Zz~3g^u{8{!tf}1e4U;S?7yg!W1UhZJ}u|z>1ph` zbMbjGFTLU>ENmyNI?ZQ{-DzBTHr=!#SsTl@`d#|D&^u>F+X`t09t?G{uU+@2`l5X6 zGE5ZfQ}t=D&Bs1}rg81c&B@8xE@-aSQ?_^)N{4>VjglLFmwxi6pMF}kYE{jj8}_Vf z4GUcvH4hG* zQ%lhiZ1#L!Qex#5#&7J?dPsbJ#=bs5RobEug-bzp5JIXANb-0;V zFTUjZ`Xm{Du7SpkxV;J?`z0hKWZz&0v@GXQgTl!+x3N#3?)kB4jJ6jW#wmtt&{Or9w{A6Umab1! zzp4H?fJZm;Z0RaNn>Iatnp3?^@oa2;lICF7tNRbdJ@WU;Ka3t4vdpq>)!D+U_m=!y zues0R3ok9xtakS#%hs*&SpLHLCMnG&B_-LfEst2dytS<<_hOX0hX?ca?TLx%Ndr0U zp0(H6JmdTNObaKPES!Y$Zr`|3t*R&%^ZKw%YxRic-wE-`l zMa)%=TKB}bg4^1nb?>fS$Mf>?oPL)yA4tg^Bt6`RChc z%6)cUM~={l|{uvooKoutIA;zAv$=K#NMm228|v zJv?Id5IdBef*R%8?XS5Bi^8Vps}vr7TJ+52IR@QO6D@bI+=DYh-o9B1-)>ILQYxz2 z`f^MXDyHs8^`9TOXET;>+J#a_Rl~2uy{^6g=0e8sMGf*fJeGm%Y;0`uVaK#lf!n?2 z3!h|Uv})Lgi+fla(HL6Jt&V~0{VjR+t^)~i_==yz@NINO2Zv1AGEujQWCQn4lHJ|i z{V{-Mc>rn*X?P=}+s81h%UmKC1Q1`b*jc+sC?B-8Ql9X;F?nu=!@|{ALqf3jfwM zPJ`C8;Um|FcLw`f4U=q{wJn745LjeP*N`D&ojadGzR!ll@ECuR=yC z28H?5`}bwtlarGV$Oj*hfA#8>>&&OJ`Ya30=KLsWUoP|?p|(p=`tPiQ%UkCAYVy#~gKmKlqJnZ>Y0?ZO3oT`SJ1l#lCc0 zo9eo+I6hfCua6?&$BvJX|J+-p`P*-o1F*wh-rmui%_eB>7poAOXx*Css=dAVm_kBL ze?v+zDf(C?W*=VMkX+tBNKeZV>q!F!tjjc0z20q773R;rY}vBBgbzvQKKHyqSs)nE zz@?k{`cu6rdjXI8d*xFv9o9%fW0sk+cbGBMPTR6rcP5kG9_}eCE8`Xx*2b^MTHVZn zVy-|ytZ}-QL&wt}n#cb9b0wwb#f^Wq19to+#ccwNNtYdag}QA*ZT{PN2$4&!~wK(X(k`$uzfMhY0Y?P#8S*w1HRssyQKq^B4R=b-lP8N$j!9gA zR>0uzxg0u1y1Uhrhm)@uYiI;yTQ%>cDD*nz@HV4Ux~+P_jJ7W}*<#NhBu8o`l|y#t z)sHNqPNq$nW>2&ZxPEzm-AZUbk7=n7vxr^SpTnc1tWCB$Q@Ehm))FV{S0___CLe?PyjqWg<${|QqW-&vd*Tybow&zd=l~U zfRk0+-3#!n&kJl9bQ;oX^_rhmn5>RdWRnj$s*D;$4G#~W7->!F>N+oI-J(Uhe{(jr zt(OObltNHm+5P^DOOiep)OgvG|O>N0~@H`C>YIouu^o?kNx&e zVHJQBonp^IsoUFQ&|V2bm64ZMKYKRvmyH|ifpuD)nysjFGgmtk;=*c3Ut|o@ldytB zoQ57KgdBY&I+A-bNi(e|v3Jd4tJuseVmtZqtrjH&Fmw+kk=EOj32o5LdoF3o<6wMeTrt-Hxr6mL-I 
zYDPW?6~@vsd1iK&`IlcJfgu{7J0)SQnpcJhlai3>HSdXcBa@<&RpZC0>M+(tS>pWd zaYWqOdcZ5RFK>y6L*HG}#qaM^Yzn{*F(2=%cI~<+t(Il3PN_lB90x>8vhS^&!gdQC z!80|Gk&&rEjYB<6;bxs4>#oTDO^BCQL0nwi+dBCE{V0jKFDHNa;RgkqtDM~{;svi= zGv?RmYjpRsI5PM=&mQ$lv!$W7R#jM7*jS|uiyX^08V@4i)ts_t`4b!4*+`oZFrKDj zuj1$#!&h6mznHwPzl#qBXQ+=?K51ZJu|?xC)uqO7+mFjvgR}ayUBC{omi8Z!uBWjtHKW;#7Ftdnc|2+Yk& zG56V-{SIC+W z?n#qG`IahfFyy0Lmu%MDf2B4wIWib}{rdH**RFLBht5}|+`W6(E_Ee+_tbJQ?IHal z_r{4{?%Ci({wSNFYN67T?N8Om6}o@fw5cm#_N~ywDo+h&wMR=QL8}Jydc#z~p%~TD zY>h?L)-Ac|7hXSjghpkK&O^|xAb@`J3R@|u_w6(1FIruVPxd_%E>3nA2Sjo(8|&b3U%=M^Ff3{JagA=J^0&>BbIN$ zdC4W!r|4=@9eQ4Gi<_2JRMgbbd6=k~HZnFdEGfvmb_LV^^~)r@z8rXBlCEV*kM0P+ z@*#48hPN3?#N{5Tyl6$9OG(z1q`pv}H?38-X<;@~FAa?7ktbG+P$(3tn(0w%{V*zZ zH^sar`bI%PfnP{S@?n$8qb zkkp*EUsJIGsqA3Nj-ze*)bWz_5=Sj<^f$Azj*O@*mSHA;J@0ibZj&QClqrDMtn;sa zp*a#-F_cloPMMva%;8}>eBl;*KY-!ePn~5}6;Y9qnGXFbU%!4e`|Iv5V?E);Umx)# z;jLJ5;7AAeX zLStw2*GhX#jdNC?TfUw(`PsDqs!xng2!x%`wxQ(i{K z2OBXC&uj3G=!jCW=d8fl7b^=H4Y~lv#fe$qDpKIH=*jzQJ5NbUavnRTmvP|@S8b=? 
zi+YjCVh9R!R2_i= z`2iGgpR7V!xZ%RZUmla{Fjxn{L2j?A_gs~!gN?;dQ*0)~-<_~RwYKh~^#uXp7uFAg8fXnXZT%PVg7I|e`lFV$g>KG;Qv*q^ zW3Tp+3Y7q?nF#iraL0`UN{A-MkCVXI;Be(LN9k96Y*}zcA!yTC=JmTRo$o=``q|^# z3hviC$8cjtc^cUPF)Y$UC#%7x>K+|MWpXup{ooJ~$3v@WysYN%IJuy+&LLG2^Lma0 zb=6o5<1csE^q~jTK?|`QG{H*+XBY@|Zd3lhY8?Os=bZ}x;^l+*CGqBKp&<pKSL^17m`4NHl_h=A5)Y)@=LW&~-^qFQlh3Gzo=7ma3Cw9s}yX*`=iz z*r5hjl7Pz`08)6Y3y@l6hGsSJE&wRPa zqmy~sZDL3nb)HCcdG;m*|AELH$jk-d|^P zz>Wiq)rG)Bzx=~eb_zbM`s%XP4qrYRVO=u%s!xKY*iGHM6B4r1%85unkbIUH7V)L3& zZugu{`Ow+v4{g|$F_54bE>5ssqFRE=P^VT}bEM?WOw;P!DY`idpt!0^N_VHm`=eH{ zUV3We1t?fYgcl%Du0w~Ex9}Ncig{tF6H&qESS82=K!!v=_U-^E+J(xH1aSFTd<`{M z4KSQnFZWkPMMWZ*;qCfD$r^rKMJ?DzUg3&$JC&rQq#lh@H1N`E!QTnf0&dX0a^(sU z-=ut5Q!Za-qS-&;8k()+;^$Z4)<{+Yisa|#-vks{JUeW&UEDp_-Q9hN)xu+~2Plci zrH>H-e1@q&6&0oe=0Kj*=-+?;k-WJT>sj&2dLe;CMMV5cD&HQdTS|Z(p%U*G3WVRD zCC+{c5JF|k*IT6ugr0_SnT2f!C9;IqHitY=QY6Vwf@r8iY$=+XvKVfqX^s|7hVi`Z zr>BKO(o2H`nS>3oYR#I=;i%j__i`*7Bbm81;$1%X3^ZjWc}@=tLHp1l&lcES(5!~t z*u*5saj+rZ_Wf1zZ)ZP;&({I?N5{$sZ_XP9`2l69>KCzSWo2c}HilVn_`>VASP|WW zgGq1SDygUFs29&p1dGpnJSizHJ(xu|$Ov|p07Os}WBBnOJ`AE}FKGEr2?}B@w7&Xc zulck3ODTyeF+N~{QJ|}MyVaXhAsIDZow>GRJr_53#b&M8=;-~>-|+ArdkuLKPrOQa|58Sx)Yj=gcQ*j=g&aTAkaoR=+*W&p!^}gU|68o1E0d z(li#>?g%}w5OM&aAakShb5$enbh9n1i{}`8i#iOdea)kj*T>(T^X-ofw&r5kpJ9a0 z&q$+QQv(A7MVv>HxzrQ)i-_p3d5qu7#*%WMat z8-%tS$Q~+eF9m%v*1RtM(I0Df5OGOHL7@R4C(EW?JoM~Fu|VBIda8W1Kylggdj)+{HQ@bTlv31=mXp6E9AJ+ITf8%sSTX4~iI;y?&rRaD5L ziUdF)JA3Y2mhJl!M8gM)cbb}`0aTJjKirg6D=*XRr>l-ptwIE?a!Pr0>l)wx0VAN|8 z`F9+|Se82c5ble{+}zx{ER*x+`|33f)gc@~+>3M@$t%kqso*QFn;tDr(=>3A1R4y$ z@@+krW7D3hYn?|{nEm$s;9jrF%A|>Ohu+LSD=ad1Ur}5c4&59h&8yLM5-Vrn!J-`E z9%=LvB7C4?B-peU5g=tnKghu$Ni=(`$GRx#^}2V%AnIemtD?N|6@7`cI^|?-o!aL9 zS)OLd>Gu+)Fw?xA3*@-Jcz!kw`jCxiAfP099>CDHQ>A)FfJx_%1T$~g@Cfy(snWjO zkFzPm#23n;ne+Ug!8S$O&TTGn1NBKJIc$Sz=U{h5;R|K&j}r9|7AG-(Iy%nK93cs~ zqWwCStQC6aaXjF`gLqEWSkw2W?FmnxMy&1EthWq7}*PnJtzY;Wk=G-<&N=_4W1Ls~C3R+Or7aNMW;$aO$W)0Una#nf~&n52%9b zCt))QgT{EvA3*48upF@bcv(iFBDj7k-9nHbFBl?nfB}HF8tX|MChgIhKYC!UI 
z@aAyO{#Gt%iA2#_zIOXjxz5iW*Vc2fvqzuG^7i&_oUkwsRpbylzhshnaQki7 zo*YT>WSie_H$EXT_i4TEJ6;O3B$bK)9>@33m$f?7$~zSf4h~uj zwqC8OC7*=4m;`mw74K!Z=<&*xD<9vs=2@9Ykb6U;QkzH7~HFkC-V4}@2jsxL|! zat%lC#!`6zz=k@Gt=;6IeP;M3paQCFQ@_~pLwdZlNa+DO|=yzDf62E-8 zgl3kwseZ2ij~&*wS{K$*QCx?6ZUv6nJl837b>!H$FYt>NG+s}a6x%$~v7pqa8SfhN!Gb*OTrT1WyKX|Ye?>o-^p%7XsL4F`KA3l7L8Gqc;yns!DCI{-jCYa z+OqBU|2IGDv$n}DljBeof%UTWuRl-Ke0Hz8R2dEj0TJY`p^cw7ae_#`s0^=61&G52 zp09!$2WamArj=z{{k-44FY$S5p5v)3nrEw0-NlgEfyjdmy@aAE&~*bv2OGJ?BtR7u zu9Zbj)#j&mI6j$ZKm&;)VCQY8v9WRE+|Wpy-Co#;Q1Af94>s2broX z84PdltP!kt^dG%XQpH}~m2F&a4z;M48MdFqeKW=?__$$5!h+QR1tNkypzB{}4)*zL z8dSrE4K147#F+6Iua+jO^ZE1V0vBFy1&=ueIb;mJ1z`v#VJ@+9?N!NDdDvjq$dT;M z;Tfq@ZSX)gL>0iB(@uBr&AHWFL#|Nri6T)wH#<$LAnGN7zza&`rB7!G_ECtNSrAYn z3P?Txdsm)YGZnFvV1bJtUhKX$jv~{{+9?N>z4pvU^ek#L2%eCdJHxWAK$|FIKmAnA zomTMS=LfQ0cl4}kqF*uFyfilsZGIG$=po7G9r~m4k0JrM>?ML&av+A5a&vItagI@ZSD3y*y9+w3@4He(k;L zhWwQ2X7~a%H8pifntZoeL>r8zK@GP|=Ptu`l$456hLr2@V9CpRkAUA9O%LT_&lZR4 z-N^EV)zq4;$c=uD!W0j1M;+^R^a97s^qh6|0C;kRuixT&Y=N}4T=_{^IX2&UG!tAwa??!uGMy z10vz4iG+shmlq$3$BK=NjC24HKRetk;yi_V_~D?=5mYmLEZ4;j=G zh>gEJNgbPBc!rBL&{SXmM#O#m@TpV32OQA;@N?4;2n-PK{nE45P|huB8#)YAw9=pR z_Lcwj*IU|X5jT{XAC@kECog}7C1DESFYuYRzXGJ+Ft0fmRG|K^z2TaWg;uvVfE?jf zX1$pmvOIqg9Yp``J3J8Ld=g4q6C*pEtq+&R%E6YRW#j$;MibE(@gq*3|Ge()%?UP- zds2l&>?7voCg~X_vj>W?pn2WfB9CbTjcJa+iF9z#O*?nGmzs%{t3U&tnjB3-9~b~0 zwHML+3RfW0W7?^?9qAo)v~t2INTSNh1$vTN8vI7t=>rM(_DGjQnn4}N4xZjjG%@l>tsDIf;s8{rjg}3BR>vLx0(3u) zt{NC$iHiQx>~itBvUQY^yl>hzlkpW!W2K(|iwo>M0um_x@aV-zh?(q&b&*Yi&XWFzI5K4jA z&jAP}2qnf|DD~k_b&Is;Z8Fzgt8Si`fybbENA# zo8gYP!zCh*7H$)&I1s8N?D~tB-WXC4#Q<1LJaq15Jp5Ti5lG`8T@}By@q{Pmd>{vU zLN%Vb0CqIZ>^_WQ;+@B(HFLXkTv=f`>Wz}Gh6N4aMHFFTeoUcH$re;m(f4YB)Ki7+ zd-~Yszg&l)91Ee7Q&dzJE}G2g(@MCu;nXt!HbtP1Bs|`Juz32HPo4FVlB-D?5IGAC zbPvK^V0@T$JiSty`WeQ5J+V{qsEJ)q8XNdiD!O7K(t+9cz9oRw1fc0hyYQ58F5~if z5<5{rN8Xkx6cBV*W21;ZOd(`C=&C+nvsRotpQyW zstfT2AdwTjaC5dU-mnH_ngwG7xML8SJrP<6)yE~|@66@X$^2`gw>Z^)qW8lCyl0}t zK>$@mOep|~TU*JbO*p@dfvLF`azlThJE24z^PFtA}BQc?ow 
z-_%624GAPkQiXbNUPDsVtmf6Is_KS(r>*5ltKX2~!@{jNQHL%_ zl5zGfu4*@mIOU#Ik%;6gI$iUZJuPV>8Nw2O@gALblmv~v$(;Q?KR>o1H*o25%Y{e4 z#FRUyHe}SW_!lOmBAszyE(u+5aCy`TzXw56Wqm%l|SJ zC^t=h*Y5b>%>}5vIU|Qr*+{Pi$?}6X1U`x6sVcFgA^-0_U*2J>;E~|&=`pD+3{!mu4*5jid7}s_g3Zc zIJ9S31R8jLHAZ!$K$Rla7yu_T8(Z3F(X;{yhLUwg%o?7u#ix5Y>GsoEF3@Lw*R784 zZ=Ng{3pRkyX`I*{^{_+-bSt03RsuRU$=Ko2rAYTRUyBnSaY>kusMM$^ES zOM5rH4r>~g`(7j&Rp|Ydi(V#Ey!*FrW|`6z9Hu(x|YGB3I!m zjJTo>G7#?>K+jS^3=t~k@Y&&Zj2u94uuiBConAEJmA$t!aIf1HT1X&_eCehH5D2$(&PPv)gU1k1spX1mreumgOYTf%{Lp; z4AV${q3f@EME6_1W@`j78PJFvrf0q^uGjK~C~US(J1+$Wd`_^`%OD8w7O06$xD{59 zab=n#g4Ef7J5i4xA0!ykt1mZRH!T*Ky z7TABA#wMkz1@XGBOzD+-dPBJdpnG5oHK@)Y2C(`iMkIoFv48>SII} zFI7$M$C3T24Yo8WFUU3J%YzLd05-g2%Fy$xB>J*C+(#eB77#2mz#Bwa!ic#*);85) zVn}Q6954?XAwYkk^deJbcpG@2U0$6)!{-Fxp-3SAjbRyW7Y*oQ!JV<-2* zTfaac5l}dL_IJ^8ztXem)9oKDxhc!9J+s!w!o2D68f&KXaaWu83xz^AW^qZb8Kv^O!@leB=w{VIm9t56~6%8#m>%74N<)78+xe3qQotch-qnC z96OQ2^bJVJ?d#`kUHI#LNa@f4;8wR9y1KgFxbv+>;@*o@W~gTav&{fCr2ywC~QjefzdQJCc#_jM!Ze zw3cxnD!xPIi$A(&MD`q_nI!87X+M%|1@t%4ug>!ajpObtpez|1{9T2TkYa*ml#e`n zXlg}8MaG-j))?6>G)GXF8W2@dVE`szHOivL5-SUpGDX~D%Fku^FFNeN6umqZkWG@h zWw|fg6HyVm;435r;fr(CACljDAA2$eE!Py}kA!rF8f=;} z6V#^p)(tucR>nSW%)NLn=zIBpT5ob-bcvoZORFc4^efYu&jMgy>=dNVGa+~bwe3U5 zK#s^%@Kw^xQ~ZOFuC%Wb+lPjXBF3Zrb73x9eI-%x zGtaK7jLyrzLgDML63-&j8;DTo_LMD-L)kLa`!cg@6K?1VF7p5D9}W zNmwcEJY_@3hmco*Q2K3FFd0L})i2XeRQvb0~@TK`zjmWqEd065uzHn)(mKIWp5 zo}RuPp%`R}buN^V0Zja(B%b=}>1-LB7g1k`)CO(qT>!~+g9Sh&L|`Yoq!a&QPk&Xg zeZW3dp71f=M@q)Z%`lPUAuS9hr4`knp>JYlj#+Hz&MgwWGd_@xECm|5R0!j8SyK`0@`>V}uRE*4w5vBIk{e5ILlIMa=_;-9P^(6g# zM-g~Cr3TICTuzJYVHp$H9?&u3K?sl`u8)xo?CR~+%=Q*BIudMLc{r)(v4-aCI}W4D z!xffCXpZ4P1luJ%3tsxN84i`{aXv_eq4vl=(hYMR$i}Yw{IcxDi~VTGxD>vIBGSMh zR&3HKfkqa}YXgwm<~eN@H>fAPSdcIK3_o1mQWYv12bJbkNy$m_2wL-D%2KE~InIxK z;cX>C(8B!9h)Rd`!L52aCNM7?M`na+F1_5lGa6^FOxJG)I)Tw#ets(B&Bl!z8A#!r zrdopxjB%}HYDw(qRlF#ElWXI(XJ=@8^>+CE2yppUsB)HJ?=y&@%YVk3&Jq$a9&9m~ z+Z=M$v%fO!-0l%qP;ji(u7Zb%%en|wvNO+i>E>LN1jED#(rW#1*hr|(Xg$PM=v`UOlKw<>ev{2a&>t95mv{QCi=hlGUu 
zGCHl()8DVXt11s>DF!%I5u}0iOEx<+&qU|ZHd0eDz^B{Bp<&LPWuugiIalTxNifY3 z!THr3&W}C!h9iOYcluZ*q3Duy_t}G6`*!$k_2>*?1pZayKQFJo?Vsjjdz;Si2eoXb z%GQwa75{4sADPV6_i$AUXbCjrJx~qfYOmD&oq6?z@Ahw*SIzYCkyc$GKi*M(rij;9h2sT!CY@drZ>8fdqA!$?vWcZ8|<6{0-2Vp7ea^F{%Rt0xen`et#M>H zD0bqZxBR>^>PA{=vQ*D6?^j%BG#CRk8mqD|)C!=0n*0k(CegY3QEgyA2k@vl!(BZ@ zjb1PQ<+n0lwj3T`AD0(MIxRrOv%?--xYt z5~@H0D5U^`@+8KE#zbU)R5V?i!kEjKFDI#e2LusMa`!HogyTadIH?Y>g$*xGZ!x%Z z9I8?SvRD`=+?md71 z`QIj7o%I9(S{@47grRn%5oLlz`jC$p#|3kWi??;p?%lhW3j!l_0$#`4BQ1g41i_L> z=&4hB^_|2f#v|qM)C1;*X*>|Hy~Se~zQi~rUvrGd5^Ihb7#oxz&$ykwThp~7w2E)x zK7KQ{m2VMXxFRKLETAzoDuS>t{?j~>^S_;ez1)1~Mhyij7Q%hZn>WXzs|7j>b7Z@J zwIk91K^@3zlLdo)6X>jcSr=E`mr*5Nd${UY6p$0Xd*M>lTz?$VL^D;hee;n2A=gBV zP;bvbi}z~lpIX5*{ynP7WjpsDlB(5n*REY-0OG!CZl={`<=Y$5smS)|0_Jz?T%)DU z!s=(IfUcz?BR3e}p>bp(x@y7v?WIcYq2!gLHQw5LUqNj4))U!7w`H`)%$w90ZLX?P zQR#J8Xb@jP9sMv?04r65JDkz=yvcbYA zH1-4u{#XbVyOD4=pb}D zJi5$+jCHBwqyf4ruCEM2Nxw0M9b`L#BtXT zH}(yszK4D^4%w^^NkBN1b%cjPYaFFO3Mn4kqW8_CdJYSthss!N>59AJ9nExi@G(_MnnIozj$(5450FB-c zP5^!;0p(O7yttUPta18J?ljH8!7l6xb+{*F&XVQWB_-$^LM^t}2x%npIJzLw0ZD)g zWgU?W#r0z2N{=Sx|Mc~xBl)li-V*368EL{t*JJp=bg=zuFFS?!yRG7FF1;+k?yn{$ zAsCXEMo0|WaWaWZ4mGM0_Yz};t>;qG9s!+6qa?!`yFn-$Zp;E#K!Vd`D1k#Zz!ZYF z2VtRl==s-BjC3)zrSkp4hJPh}tbDy)8(-0gG6ZnnftH>om>mX-+^Z`{i6}V*V z-m`~+K#IuPlh7vwG2@Mm&k2VgWYJ;ewO$AX1Pj3#Aj5c>PSWcJxn#p!NYYjd(Q<$+@=mu_Ycz?X}KMk9= z^Hg19=P@YI!glh<^b%edT%I&CMnWnVO0@ujSj(0#4=@c!MzMh)PB3cXtdIaHMtDZKRWoi1?@(L?l0#iIWh3#j0V&91APEdKzsADNFu6UV1=8qnkmQLJgaBF` zCZdmnjuY;M`y59ust=8`0diA)ww11~qFlJRKBm;1(s($FZ6L6ylUP004pF8R!o^bv z1p#X&ndIS{PXIF1$y6Y0MbtbKniPnE6Wt5u?Zd++caS(CZx^08d6r~?gIpWjl6Wwi zb-l!hBGF8QA4v|&5!)R(lTa}?4UA|gL75OUD z(b>jYWD(@@EpOLnm@c=r9Y%sn2<*Bx`xum065c1rE5OS{a3uMKqn7s4$%P;# zMx`Rr#K=@9Fq|gBH9WRCcT|S!lLB#i$)o8)a+(FA(6cCf=I+mYU&7 z&T}z;{`*f(wL*k>uHM4y(4RkF4?mckZ~#`>Co%uk7Yl)hQ{u7ba27rDVy6^ISGa!d z4wTJohw9f4w^bAN6FaAvm_9aqE%ADgrlj28+n5b)xR4AqVa+rSpM&J@`rdoXIJlkZ zrw^qHZUv7g@6{9#%SRU-_SHl|55NG?{QzW1K#V9W2%%&%USs()aOvjwF{dkTa7Xcp 
z9DICy#29iaBgG6~h{3SxNi6b5?EUFwtg0~Nu}#Q4a$u{EEa|T?zJrwJ!sH{4$S}ts!@6DuR$RZ~{8S)0p@7y6tuqGxC0T9#@Of#i`H))o z^YK*~eKVHTHpc{4Ey=wK5=l`(rA*CD5mXWgev@6s3?2F7+3Uk9o-6lKPs21oVaqDik&c^!lKdeTct>( zetGzkmym+5tYEzhuf0TsQzG6uj+nGgs`{uTWGd~RpM<}+s9#sp77pZ7Fegn^NGK^5 zTgv~qeti(nQUjs~a7pL*C&go*PTf$P_f{b%RxVO@!lM#JLwMga0l2S#o*g&!&ta0BKP091; zLt&Z-Ubq=3!h%Jlv? z$XNp<&Y2qY?95wY!o%9NLi`6bXPqzR2xy32zRI?GW>~KUutPi1vhiE|ZS46<)UTVR zXD5qG8#So4_Y%b1CJszoL?jp3h&!s9OfU;47h`cUAiQsK^CBI|VXKP`e~=omAB(uV z7P4;ZR1fb2KqZDPjAUFaGM0Y#?(L$8paI^n*dGrYlS~4mePftG4PiJKzZGW=#_Q%- zBaN4}+AhV*$)p2|C1j*Zr0IyTa3fFI#qK82bF)i2Y`ms}tv1;n$6BaHRXrQLv@ili zStOLLAY$QYjUYzFN|V84AxEA|p(MhI+SGcf^Y*&W7P2a5&O{J+i1peS=4?@z)8?8~ zGFbMHLo-BGw)^jP`%m@LXZ=TS34Dm$?egQ$6xGvrH5xk7Q zwgWt-k>-gJz|yIDvaC>Si0}RZ94m!Oqs-Y^dYuZuc zg#a{@|FZcLQbCy1pu?OWaTxD1j9HM^uY}zGH5K4D-8UG+AtG9XW<#4r z^yqB*Bqs$&ZYTum_Aci;x=IK_8Io)Huzd{s=8%Jvb#ONUJ{Si&YwEe^i&1#gL}IW5 z5SU}Gl!QBgr>*8Ds52@ZGVu8O1#+F7JYHMl`*Rf2>d3|L0>incDsr7&Pzm@l`j@w51)j~yp0?#{WV6;M&Ingf>+?d6mBYMKW8 ze!Ku$CI*4m3y@DPS9cTOJ>ut+19@U^-0nCDspp z`0E#a4l?ouHnytO8jlPD`hED%@IN9{*yylvvBOA68F;s33ftd*YOChiwYj146_0@~ za#-R}Jjuux%AV6m3nwJs)%7W=VyC2~124@v4F+y%P<}$c2{Gw36#>_d@fO&j`#8Od zhQ@H7!@rz@axR%M2iUf5PdX}dYs2S;!AW#1;bd6~g~Wz%*oW+~=a34#oU{V&-i-kJ zD~I~O`Kqp=W@W5-9+HMutmD*toiuu+R`1oEKjur7>jvvwy}Ze`S$n_22b2O@7#~uX zU%q_d7u&p`lQ3_?_VD|6v*pFEt;E~~BHIf_hsl#^3NAwjnG!M|97+@q(7RjMo49z82 zPDQCFCKpKvsW|{c`bP_qwj=!wFF+lROht+mAQTy_$7ZWp2+&0}K(`$_G4P*`VX@j% zC2Iko!oU4CIXDe3;Jj`a^d;yR+|Z?STI^&W*#~a9@LvvvX&&DT+dUAv88UsX@^bKo z#?e3!8h#V=IsaiA{W(dbkdbVdJ!!Cbt~1ahbub%&=4V-vT+F_I=#Ny)p_4}oY0FZm zfM+4QntoRD#sJQMVhjCa038NC?ZxKVZxMmq&4*&vlcxU8a-!b++Th z)Dcnu(f0a=p$3;05L*(yKUIO;+vqu((_-+Bna{#;t)KVY!4uPb>K{!^uZQXA_KQ!L2?8K3XeLRYH|<< zN;~2@ga;Wr-dNbe@gI>>xuJPTdRyT2|4lwm1NL-_s~qY8$oL=;1(1A;z(Ps_E!ob0 z{!ghpI}<<(YpvOHX-G5lLkp1GB=uZVCzt>2ded z$;1=k`ljg2B$HvY@obDxJ0>FL4()Gv4Q5J#DH?Un7 z%x&-q!q{PiA`ePJ=oqKK^Ow+lXHDb-^60|d$8L~eWH^GPNRvYf6~d172Rzj;pXZN& 
zFhSHmP}N;z_85WRc9eMMu&0R`l6E;8#4aiRegj41MuAzD!lQQ-nx87pzRBO-f5J?}?UHM@EgVqVsp zbcstJ&yesUA#0HE<4{7tE8xIsL4C+UAV^lN6L~&oGUN~an5;-(Zf2y0Nce^X>|9zG zdV2h=H37v66RSlL@yzz#0m+37>pW`i!SSZh5}=c;6Y9^zb6u6Ht@?7eHLK2s2ln=k zBhQxwz3Cx`^4X_x96EH!btt`zNM^H;-grmtxMD2QZ8|)`KyWmaN_sCP&L#v2hgwJy z?A>0kdT?@1C9{9=)$mua;1XdyM-2bW*FQgr+VyfutQCn0%3Jx`NZJ$I`8 zD6FvCMdTzHi)+9bB)L7aJ!v+OU$0H^A1OOkqzEgfA1$0?OM=x{Au?xwpH?>^9_;gF z+io+vO}S?@>LouejJyfc9l*hu>*F$MkZdBCQfeSd;JBQ{EKF|Y&$)ru2030AZFWcd zCr31)JTXJ`hB(HFvjwo_1_0Kfa0zkrVX+rJhsM4>JxUH9_NzXu}$MZ*sV~1H5z_wf(r$fo&h*q{ z$9OwjJHbmpA;2t1^cP2rC2oou{BrdN95Y0Y)xFnf82~Mj9i%!(MbQcavvId>t-rV^UY+Likpr=FVvom)wdJlZ}(^DnO z#h(W$!U1R0F|TDE@T4o57uJrF@P@w+H}*mLM@EdP((;JmRkBizJ`g zfJe~h*CNhyEsiYt8~^^n5bQest!wi?mhS2flXrHJh}98K6i)WPCAH`8r4TWO^kQmA zZD0lM)~YJ4w(ou~ldag_L5hDrz4u?v;rX`In3#nA+qbK_Y*EsKnpKO;BPqI=J}-wL z2za9jPk>S7HCJmijp!jEd~jRtL!==CPvjsWJn5Z@-;WU18%@2jVT446j@m|$lV^hD z_L7s|5KdPmM*&rZk3OCS)FEdJ44~AJyaB-pl2TIU2pB?_Yps`9+?Y&H9n+rs+A8Ak zGEAK$19p)&j}Ta1x?S}~#<|8{I5j;t7##tC%rV-msM!H0`=;=bQ}{Eu8SD<|h? 
zdL?9JWr+pa`E$0c3QkTKnq%FtVHbsrMBv0x62>=!EK0_5aE69Ktl;;&E>}2C!itNH zHAmuwXj>x)KXF@!W9xvwQK&!yqmSG2Em3Ypcs4EmR;7ORK2BpYijV6jo;ZSe5;GsK zks}!pupvj+E?vGx3CE3CVG8*Yb}KjLMS+f;-WekG#ZEzY(;WU@ADH}4;*2Waox-~* ziP{;l#81T8M2#t4z|{=6Yi3U`|3HmKKu4BZ3#AFNmKw%euwjXTj#FH@OTM3x%;ZnO zCxLgfQ`+4>--hEOfYcIsRXF>AP7HV`o|s;f0&r4BXGGS8mqJd*b_gd}8)v^j8qiCgTN3qeVKGsVHkEE{r>8E?(D;2)QVcx(0 z?)Q(QwEM^rcQ^)J{piAZ=cG8XhO6T2wRcfAKGzreOLGset1a80SXp#Dmyt}Mqv_X=B0wH!4GWh{@Uzv*bXO6 z+gd4|{Z=%&E{Wxedv`YBa5qv!-*wt2;vglG$a&vH4xTIR8R?D9ta6r2ii_MowDXo` z-ko2}aHLf9OkPW9=u`NayZWoD2k_Ei^#)?9IrNVaG=6II+qZ9@a4j?*ENmr;df!tq z3=EL^fJ1)a9}hlm6Y(eGtL)wF2eOcPrbYu5KQnS0eDd+*NB+cbG8KMQ8$YT~#X&v4 zQb;V?O112C4FCg{+b#->#s^kkaYz$81=HkLaFmL*E-R8~(Em~BtDHWof8Wb3F(<0t z^HRL#r`yT=&Md62?FY|XM%0f`dTtyVif*?sFoAi>=IqhxsO#h$9de2*_}&2i3h{zu zOmGxDv_M%Ppl;_Ai=TdZP{n&a87x^aS21Tvlvk2%1{ww}CB=aZs+61~11e;i7Xu+G zh)d(mk_kkNF_HAY=KS}Z4HLa^72E&Q-kU&Uy|-<{S4y>|nP|4_Qih^gDWMWqB4bj9 z<^c&ALdHs^L0uU(6_SiaYM0AYhz5!>RtQO^5E(MR$BC=G@8{k3^WJNH@3+?Xe(T-q zUe9{==5qaq-|zgL=W!m#adK34TDu@;GxTncHhJKWVVVrr2>Hv#KNbZ3<$nt%?xujVAzO>uy|IbqF{mc zdAsq$%OZ7UgJ&csb1!)Yl1V->Fkq~d{8>FEB?Zl|F=x;rA^$z}H4LOIK#oCVJ9Mi7 zCw?>m`-{l10if9b?DcFI!%a3^(*z>MH_l7bFzILc*c(}^cajdS#9LCUoL_I@cO z=j9#7kc^S15fBP>B6S4Xf1Uma9(fsO=qgE+8Otd{3z%$6@K+9lJ55+GQyjqq6+2A_ zKvgY6p8uR-dM$&&S2c-K4$BkB>8{{$1NVl*L?rYH%m(Y&sM%uf&+T5(gC4OVsdDq@Z`A4+vphrwJy5IG0yfpaGF}O@BY42MNbZTuJPx{OG%ko( zKxowydR;Q+{v-86zy(CBw@GVjK_Dk3*W_?;i7=VRp}f*^@^{NRGnBdMyHtv{=vVWn zQ0hWMNoECUV{Xh1q)nU2xHjie+Wut;E4|md^e00zW9XcWfIu4NJ@t`dgO~X&wGXgY z5oQq(90D&V*{v^@fmB8+23&kAz!6ixgJjshVo&<0VoB)@w4i_jg5s{uHj~kmgAgyj z5i|#SZNNv8*14=bwmAoNWX$+27l$4oFJ57|g#it?q4_J0NG~TA4?aEoH_uOz?@Rki zy*s*MhSJMZah$d9;ie5kU+}MO&)#4!s}P z1r`AaqW&l(oJSB~nxcOWsKRv&YyeRfiUuF=xgCH7h8=0wJx^S;Ze3|(J9o`X#P1!` z1STCj6x63RcT3P;KzU2Z8={{on@n9uMvhCzevlFr0{I6B-CF)!dHdE-@`%Q&r$M~3z6@;NQ@&f^3NV`F%D{O0c*7k^9inl6CRCc=NKXi*cNpI3TuDc?_da8Z83D zA9y#sLXR9T(+8$RWgVowL8FGr0z1I^SRr?v>MRirxYd8_?3_F@wZk&0hp>r6z%(cb 
zzb|VBHUEU6Viz0xLhPl|B|O|7I)1vzb?OAS9`xV`34B=ccBbFShYhU?ad2fZZpf9e`1r<| zLzkuS4iFaq=2KE9EmaZ@<>P%s`g)m6FcrQ1PNibP>dHwRutW`?_O@PtGuK_<33ZsW zqKW<#+UCu9BdE276pt$sy6b$JCH6n#CZPtH14RmZD2jj2nFtAHrxpo4>+$u9QMFf4AGK**9 z6DvJtWw{R|K?2*mM}mRk1*nZ+Kdk#?mi=YI-?! z*+4HD!Hj+OLfl82+71(J?!WxMT)Hhu=uU+a+7RHIERJVbNRi+d9L~gttGJgCGhK8- z>At;t+XyQHuxFdy)824Mh$Br@(66sG5g*8n0kR?ACx73gYT^&k$R_n2S{QIaiR-I* z4Xbi*sEp_$o>QlfoR1MR1QJsK*w7)&$^(T*t9uJK zwT(#ioXyD%k~fAIC7~TOp@EDR!TwZQABJ^^oC>bCyCZ*zU4rLH+EzRPB_44NwnE#{ z9g{BF2TkTY7AgKrxms6-#U(M@3TJqVu?Xz*OgWC`4J!{m9;!Qr+S|DBVqcuJ2$*ap zF$8XgLyMay7#_&Ix{958IOuW8j@^v=A$~o$7+PN*BOxrJ-sm852ft{oL<#O!M^t#q z{qBg9(;7*414v;c1YLa;-1tQ>}(NEEXCku&-3uNG}eQ07!yB4T5F0p!H>!s<|hb#!??K3sSr-KJ10b zDd4i+Fs}jq$Fl1A5xCken{h$Tbhr+US#~XQQ_B_du7rAdd?6$C(Y=i32r%9erwYOM zCacxit1T2TR|{MxBBoOip{5zKtOJ@u!Wc$r>d@Ia7!{rTsS!0$vyq|UpU^7^qKy`k zokEnac11&X=Ng^*ugBvjufL8*>{N-Ayn%ybhu&22-ERO{pJ!l18^gkIxBIX6Qkr^a za+6DM%UQ%I(kM3oLWJln>GM8{{CX3(RdDLcTbtZ^@DSSI8AK)|nS39{{WU_*AQ!Kf z>`8zTk<;CxX{!1~I-!V8cwGwQvV#!@5M!O)5O4TTCHI%jvD_sUkLBGD>JKrg*`a{3 zOn-i_L~7}SMlp~mjf1@Xg~%{tpoy>YEFp7OV2Xa|T#(HU8Y={;BPSwmG+I~31*5pP zpfe5o6C4aNz*(?5CAOCXE;0i7=n*g-xT4Z8w)lI08%d0?_B=UR0cOY&YrOvihaNlh{rmUJ zA_a#YJm-`D)-bJsLo24O4Hy+OJu~o`6ZFKiRUt+ud!h*Rz7FX7T>@7RNJs?UNS*(o zz=Y2ops4sV!~aRUT2tgOw4GfWeBR}1`zEi$RFvBVJibTqU{1V7RaD0}IFR{4P@BWFZrGK3Y8XZx5swV35@fSb#-8EsSJ=+)_=)4(up;YsW z=p)*Njty#{XP`RrPK;?dpJc3j$LZ+yjA9$56Hmpz{lr2NtpjdAv)kHU&iAS%j0x0f z5?k!&9F_ZN8k@46y^+Pn5TN$*fqp%#n|zSmFSH9cjQcZUTvPpp)i6^+2Tw|_!$Alk z^^<7Dj7>xHNCD)Q^uIo&TUk5%?{%IN)UgSx=okm!@A&|Z?$f7g*Y~}ZIneBo{9Lf| z#&EgZ2SK(2Y}MM4&!}Vt?dn5*r?eg{W6mrN46qDQWf154LZ5KMS&UTKe5-^;s$>Nj zg)zZP+0y$*13H1j6IOq&nv#hEm?n_kecHtSdDqJ0A0FioxC&q&W+(BF0bNYb+}zyW z&uyMg#;04yW+7LNADwk4F2;qk!K3P>zzcYpZahjW%NtqE?9 zgRV!DzvJn$nNElrB&ot96lEdpu}sWGc?Z{sNc^tUoAR825&vN?AVjyIA?>(si*82E z*(a7?ni@1eni@L>WXKUh|k|+%=TssMBQ){rvrp z>_UfN;Xlk|`8qIAAPQY7e>4|1(^-Xu0&jr@h&b@|Xuy6E&kity*TPLHDpjdvjxw?q z3Gw}1H84a#ecS+v&O73qoSozF(mcfmqaD6K5c{hGG5tbPC9F1>ENqFz5BHWH_-}=) 
z+T1|z+MpSy%ozj11i~r}{K75s%xwDQ>S?(APC`TQpJ0Vn_y z0|)6c;$#7U7cvF_JR{lM$_?qo{(HSF<^n4s1AVeEM3;1T!sSV5_&DGCktDQYsEqt6 zp)+duBN5*`<!{h~00G!QFHq70Jfx7U;ox^SDrnX1CcXI?@ zlKp^dSHg%ng4c;7e1|XhsWL3oBs4eZiq|z|f^x zAmJxOWpXdj;-1wEPI0tevq>@cJ5yI)5!}MPJlrYM1;}=UxIN* zEXpt6X{cl!xLSn(-$;{1UTxIJ0G|Qfz(lLn+*~HLJq84Z0UUhcsQHOBXq)Y~NIwLW zI#VRRBzwbzF(wOa&U|F<#5qFs9)--B9f}zOYuPB^;``i9o4V`12wlTxj{(f^y zUR|i8*PNq*ViNU;69BCZo_VAG`lBCLu<~%G%VpZKXt#C4To5R419-{z z+MKz|-NesTn&bC;iMm~RrG*L`;4x5s2^PYX-ucFHe$YVN^>i3c(istpP0Z4q(%gAJ zVju)LCPKKZ$Nm$#c(THS(wbyz=bd&wHWO!k1bf7%V>o$ZOzP!v=I`7A7#5fKp*4f4ZFUW8NlhkFlH$?KkF zs^sY(x_L*>*hv>{oQ_4=uU=hcg7Mg8wh+buXkmfD+!9@3{_sOq%C;qv>*n4N@*ZP3 z(q)p&6G6F!tFB|N-I$CCy^5u&b2Bd4eTmGQEc0oy%$;!sKhhipfAX~S{cgVHLH9|W zwpn97_e>pYS=I1!Wmp9afUH^Mdg$C;w=m^L&RVEfg3#^>1IDV#-kR^N<1k6UyvhrV zQY8A_DK1M?Ra7FHq=HA!Gq0HVIgD7kxldo|KAF7s`_R>7UJcLTx740xsl6OFeY$o+ z8Y3sm_(%)qT8boxg(O5p%+R)wCbcoZv%}D|l63(f+GCNR!I9o_;fNp%$*AcE%IIzF zaKgWprhpjt@M7>^7_I^aA`u(p4+h)*PT&7LK7C;_)5=HYAMX74>RBM!-D@)Uw!p+6 zuoZReWHdY#F3$cO#!M5W6;^oU7j=i>0uF1>Qt{p#RHGk+Kgw&k+Q8lw>bic^pzprjB@UJWt zP%ibB(#RdttXnZIqw_3B{8jxA&3DIVoon7El5r`vYNl@KL{;M2vVm`ivt*l-e|)i) zCDr4j+9#0Wvb1nK720rHTpE%EweCGY-+2H7q7#BPrnpvcq z=Q6xR6&feM-NzX~RWt*$d86k;GISf<_MAwDCO@1UZQT$zh0= zn`C#)(7CG`tO`BQvP#Y4$qfpuVV&{|+$$Y*2Uj{!x6OsD#zOHtOd&Xug%)SlgtIy8 zd(6mj6X}gME_BPcPEsr*(&91rf(yDozJz>^0g3ORb>X4M{i9=TM;KbjV+ayv3|lWHT79o!!X?o*nQ;9E?YIH@L{RlZdc zG7Lq)jK%@2+bh9cCyx{r$}2HRD5!yS&XlX=udvA!)T?EyO1&f3c1jzt*9L91`^c;d z@#Y8;yu@!0vdN$uL$3!#Ss>L#SNd@Oth*upY?H-m0tsr-4JOkExok8t+aVq$C}xn4 zgu~sw2#h>Z<$6M+50#T#4^?rf0cayjrL<|0s>0KlOYRW)gSTyV6BKPlnC7T)7yf?` z6}%nn=MgapJlu5+B*J0Ai_jvjkBAZE;Rs`2QUFV)G!I{(>qpVxaZPtvTeh&7I7i8U zfsN6i2BZ{ZqlVPo8Kf7b?$aFxin*!MLB5Yq502pGN}Jay$g6x6@rL?GIdeh zHhe&oMKZ<*&c8l&7M}`&`yn$DZG0=)o9s}sU1b;VO=6M(CE!y$%xl@zQ302Keddg7 z;EF;&pt7bQlv=HxMO?5$_HgxJ!ZqP_%@f${P-2JwfbuHK$?W=@?M+VZ`juT3$-(Q+ z&HQmI5*SyMXV)nC#qT^pe|F>h90NwVzm@2?8p`gxv_llEy^%$!{X}uB>xBWW;#O$ zCk|C@EnC~iHRWF<%FoTl*SpD|yZ-s5_8||B_+?I{KZM#*N#i2i&Rx$Ex`^*G(tc;4 
zt8+<$GZ!8pcpL@5#7N0qhTJBzNN7O?Z0>%VRl&`jtv*p&+X%2#tLf9-z)`&+g0?;54r-uK_yeakC@fv!y z1SBde_pH2eQ73Ngms9>n0&dLoo;jG_hziQhy)n0ZwaniWjGD03xCl3g^(SI^j76Tv zXQ;8Z2YGo0Rw}2ZNRI=f=hzaDGiN~hRcD97#LjPrG3PFd zWS=5#b*a!O@Rqj|KPOusGU{U)p zDM6rN7%AqRaCRjz5!8hL$q8Gi$_*F=Q*apOW@WYZCf)zu^k%--y3gXUnI>~4w9pR& z=OfBZnYkV$M+6N(r*cc{ug7tdC$VBZ%!oibDTsB#%c21cmR{Vy{#`2g#V_Nl;IO|R zpF9pK!$3;#g5~@hJv3V!jmpg9kp}R}@o4z}IzxO?m+)skf01pjn3j^-SakIwzh{$z z>uv5qti$tc$$lCB@gEZP);`$QU8er<?B6%q`iJS? zT7_qg|0xY8KC3Swa|zzOgnwgLiH?DR!I=`PAS=l#i_uC$&v3UmvK-eLX-oJ&@L#`t zGFgqcZjE#NzGIVay??@5#(!!!@3}942dZ#M>q_A5-jN^ZaVxmuZ{qz%MhctLKH5IC z41zZ=Q8<^U_>+#siyJ+xug-T`6$43)~4AtTF zse$76hC125=+rxpo3`jJ6b*j@O`}T1`kfMa0z->stR%->#sv#T7+aux3c=LC2CK&9 z@bN{H%@9<^dh`sb7=~vHdhYc^>H{-y7b}HZ>lbXtN1y)ue1V3p;~OIYGU7wWBTGn$ z@K+K+=9(-m&g@oxIzvmy7kdlf?Kbo*D!N+p- zsSvE{La~Z*P1Q6b3eE;BEePJW+--)ijS3qi6!dQJkIq^|i{)mrtu3nrLu-DPT~1Lv zy|-tsJJ%_`YQpEb6zWy^T|K#Rf1x|WEr>I`p)oq&GsVgqHhJ9W~a3! z-hG}x30Ug12)qRG!IyBUs(u4|c&bzQm~H|xXL`oW_}im4Zt-8w^Ae2S zh2i;;uU~JCRq90Yy@eW9(TS#mI|r_3B0_j0@GXba8jJnsiI&e(8pioSU3^a>iMvlD zc<~UWSP=7EVFDkcp~fwXwn1ow1fQtf0CaS;xv4lKO`_t3#Q^9|n8lv4HyfN`8w_Z$ z?{|TBU0_lXxDi(E&E>{0Q__InfDQUolJfG#;DICRQBw=kF}2SOwTdSl=6`l&kIuI_ zE0EsBmmJkZ2B5HH?9aj64t4?>W=PsHeWA0Pwrk+OEz{XUXdTbBJ;I)sy!=w!c{_RH zcPTGhCYA^o&kFz^&=$!xYZ9_L24THXGXHY^7jFLh`Ojd?LEswP`N7M+{gnVPCT(&z z&(j%xvW%#a|NC=*j`|@;VARy4GO7F(k_8?mjlLy~k0X@vtq~GKXIv@|&N%Dq5VJ*F z>}t|BKQJpN7<3?p#x5GnVWiogd>v;lX}fx*!OqI_OurdA5SK?5<4cH5nd(NVHx18 zZr4?&9-ndI(l`J8dve`p4n4uK3ifbv{Q?#w&VAfQ-;Jg`-;w&x%`S>3L(ioA=DgWv zD4D=RaNV6cd+-^FED*q5C}a-BXpQKlw@9dScf;USDcpO0dN2OU4UN2iy3E|~NPwWpvqszOM%!Hl^YLftpNO3TrKTbj$pu|jr+MB z7KYkM;#1IH?}UCccymaF(KBI>Tkn6K_^?=vBg83_WoyRKCmu{+HM{G?b-ujgcy6CW zHpDqy8+vSGdEDBnme8jex@X)S$L|yN_~voz_g$SP${nnyB}b-B4?Dm3`i|`tQYyKc z{chj*;zu$fx_BC2>>2z_6d!Nr4EoihRyQM>%<)h+1oy8VEJqX$3}(R+yz&zUw}Q!a zPx%IJ!TxX4pQWWEbY}jwCJ!!C8vGu{s8Kiqqi+1V_fp>;?_JZ+F2}wT(^hi%QZ;5h z{i1?C@|XxKtlNX{t-n+ql~BoTNt*3=YHVhQ8hekhKpIk@La&p(se`*tY#iQNVVH=V 
za@Uv#(MdBI?l%PGTU$oX#S0B6YjR;N03km;5N~@Ez3l?N-k10sk79CujJCNp(7ouU z7zRUP#PI%fmC!1l6Ef*p*o*b*`MGscPe0cB3ddfRySAulIwO^jo2Ey-$Ip(gxl&#H zL?}afXX(rW0m12>DtlMY3zit%AR{hG)9Q(66!sh&u?9Ev=p62w|MrHXoHFwT{G&8P zd0JGJlr9WR3A&Bn6O)(pn8y~rJT?Mf;kGk<^SZz-lQs)Cj&fU-{=O1ZIT<-Th+o&R3pujY4@5A&Nj*wq(Wr1|t+Cb$T=3%zL&X z?!eUP9*VcdD){_%T3T>3$MEr$NsTUhLtAYdTa#wjbk#I`D>kk={^#hplE!6f4>uhB zu0tx%B$=_TXGJ8SuY^SVylZ~qfD7kSsA`0b?%O(#pm{`S$_Q2 zw7<%1#=Spg=4hLlnVso=rm+csRN>7eZ{AK|IfNz<7X)6}F;BWpX!as1<$Ty=Mnh5* z4e+V3Obs9Cb=m%A(-d*tGQA%Y=5&A{k<6ZO>y2u|=7!Q+=y2Cg5=sI!g=4o+>hg%d;Xr{b9&^9kQE zIU6j=si~=P^sIx2Y>PmNDCtzm^%N!UZP@gYQAc4RUDpZCcL|$6-?Y7HbsaRPz%wcI zdDrDrt$91~JK`9U0E~S*(n8g{%^x0yLzEZ8%`nFgEXx2hG0eXU71&Cj>Y z_^|v{`XG%jV!V4y#S^mbZ`}!zFR*gK5`=q3T1}0$-{V;?TO9|+eV(8FX|ST`5B~g6 zi}w$*s$|2*bYm=zdm;0|ZAI53=-xD;6TKyt7D%hb?SQ7(=(l{cia#psQ9bprk6vX< zNx^S=rz;1VMTItbY+=uupnS$iyYk89&6*F^^fi0QNu{&XP? zQs{mSh-A5EvP_zHwA97bH?w!G^xV$LA1Jm~Jn0?knW?VQTi0;*?X<=Fdj^W9^joYt z>X4XnWs-kbo!#0rG2yDs4;I*-n^}JDdLHkVZ=-ewvN@}yR7DJ`MwynX6rKF~-Cq0n zD6LWzgPFwwX~HsE-wkKZT>ok2b^S8Ahwov76b(1b!x;QTOYnR<&1(YPm zQEY;OHnI^0QhIT$9U~ZHM}3iCD2UA8mNqwIJVWGdSC+1#Oz76iaceCEXtvSFP{4HC?f&}2g4!kvKd50f z$b*$W2;*y z(MeO?7Oh=dGIgDlUv|ye9cs%lAHoS~Yo@P{&lk}0R{*vG=8f8qCJ#a6?XyRimId8; zZ|_NRCW9D<+%yiAr^|ClGl4;BY|OfkCZHCLC^oSNX-HYo(;YkZ!dS$iFQTXWWcO?B47KWU&pB5)&lS5YokqG`jg|EWgM0Jdzh#-B zl%9>MiwfTntY$5xMT#+oYAj%E~J3rztyVL|Rw-=$U}L zDO+>n_!hINsAyoCx{W;C;cRaG+^G&8Q~3Wv@%mtwe#;e$Op=WV! 
zrgzx);x?N3q7 z=dkjyvUc0|M4ZI?yT%0AgLSHck0!=G=_b87e*X>{pkMXXsnRqu zsLy%gCXZXmMa)sC0OjL4lh6_0$Jeo{;R|Dm>3miR28s1i(Vl&`Zhu9C=dT6|o>db9 z^fhJ~20~>1ukWA_rlagoGDfCLf!{sD*%^}f5z#p8}5Sg9^TAD%`IlFhvS z+g&=3{!8y=jpayVhzPtP#zcWb>WIvu$%9Ys!IfwYqc0NF^C-lHc{HE}VW7KG3{#2G zG+_fDDQ+Kt{`3*Pe{VK`vf{tC@E6-8GJZkt=rG!NXn?hJk2#^w&u7JM5P$CaVCh9>{Gp{Qe`VWI5L=kh_K_fCTsV5hpS zPUv7sdKPq)#v;P`YtiFHAS6@cg=`gZWp(f|{WG{Y!i)xr!99uKA|h#{IlV2eJ!AYy z@f+j&yu4zufO%`)v+(Y+>a3}Y_z}JPJF}{89M8GRBu7`Rk7sQRr2NPWgt#sgz*VJF zzO-A;|2+EBB5)AdbgDsRQZ+S>m!U{h(Te8zasd3v|-glMN_ zZ72<%R4fxAW~EygQgM0tGN>2mtRX`+h#fw6jTFzSwmJebLACi>JTb6`{FpYlnGi!I z(KT2BW3nG5FkwOE99I8xkE70g09lbgG)r^i-gWB3g6}${^w>| zIE|FN5*;<|T3C@5^0|8u0PZl;2knA{O3wpd7HaVrGG0Cl4Vqe^ZnHY)L>)IA&Ld`@I!C)#;fRR=pX(n44Es-!o ze!COin4$SokMy?3y0j;jk;(%Itt!HXLFJc4q#EUa*4dnrF`n5f(7Q2a16~D%r0TSl z6S6Ke%-rE-p!-OSQ>+E|V)*QDj5`o^WZEuYe}8>5C-f!B6oq`;$svqJ2;g+w!loH! zU|-nalpsP0GJ@nec3}2r_RG`c2RO6o>KWd_45Xi>6APs!C9{7tr|)kb>_4$ur&ylK zN9g=>|Eno1lJn_&?Z_BEbRc3e(s7TYekd?&eu;`Yv|KI!Zj<_`2jClRo5TU!x_R^V z1=7M7vvzT|@KF5*3NUjvq)$6Q6t$vb-ZiqAbWo4+-rhHjRKk>Qr zuQwPxZHbyT8PJ(yh91pQz;eweeJ030+v_c5kiZd64=Gk5q&s(!+%`Z{QjE6zqrI7Y zu|2cooegl6Zx_dxhEV2<1xVdenyC%RXl_lMr@E{S?DmM^18r3SksZ*Z-&+FDacT_I z<3#Rn3zv_ijsXHi;K^22A3Vr?14lAk+oNP-1>Y>ix+N<;JRrTnUqd&7^e*s^c$ncl z*QxC)aL~fIeaaZU(QQ`VXxk7K9m5y(O2n;x36c@eSGVACJ5*hg@w~SSnIdgek|p9B z6Th%k3t+6+M`jD?+81{OI@F89R|?ASK=zRUvgB{#R82tGrB_C?Q{o!oVbNiC+Uusrv~Lad1KI2xN9}b0CKWEu5irZE`;=PSFU8;zkTfs_rtAxD>Q%u;ceiJn2?VQN>HgMKlhTdm3A75fgx{7t+BkIzT{?5 zP%Nfr7=zwl59vW%?x}QBFL>S2KxRx;^fgaz*-RhA1#~0F5@o@?fV@E|@QN;67>zEo z3DQTYq;Jax$2=pEJrHp4ymK;INHn?{(}*?d?U6AA?(1(kE*KFhtTlS96;i%Iw=B?D zcG@OUhY50*B19aTZ$Oj2>3phwF^h(wyy%!eZypY0I(^A81m8$z{1+PIq1Gku3Y?7I z%rhPyS?2MkfT3~n#L||+tr}v%wb_3oqAaG1zQvSExgH_%QKXO2Oybb|sxI{8>u|7= zmjzjY$@R766N^zdd?aJ{t@@XyXXqWsLB2!-Z%|2pvrQt-0*IRibSbBQn@UNi!{evm z;gVvJ7%MxIPsEu&={gwO7z4#K1gmKIdg*hqyxC()~FpE;RK>4*YSQI`62T^)4)`Y{lb~ehPyE5 z9b?g}#G{0a_BBDOjpQ#MEATb~%cH~ZLN@ha55XXy2+K`y#teyPODOAfLK;pl6RL(# 
z06XbX+Pdjj46z_1NrJIRdUQ2yti-$SD{G+ zm7X?Gi)jD}UQb$03w(n?-jS#3%RF9k*|NL_TM(#p!NNnuHUI3hThLkTZE(T7Fj!7I z1rFHB-UJ&5q>KWvVgE#6A|p#A(+A^npVL$zF4Y!Gx%*oKHdq*d5jsq%7|ksnS->o< zoG>bZs1hWiGP180#NJXzB9jk^9T`-x^57Oh33hBx)BCuMnA#ph_MNmhD6A1Kflv54 zyvAYqOxtWA+qgdjEDzZ1s$tYT@fYFKbaRUF49bEWYJaC8J9w7l^+~g9X+{XyiBgOO zbBV^c(xjK%eUzf=;We)S)i$>uf_z`=M3v>JOtDV4(QpAWRwZXCav!4_KLSci>c2qM zjCa<$TrQL0Dxovm{vOSKthQL)hNRmRCllGr;l0v~4a7|o=+FY^UJ`{lZr+dTgu`*o zUVO!d_SHVJ4R;^rL`XB#m#b-$P!8dSnovV`=@&}g)f6OAg|%Vt)2JT$!+<4Lz?cP2 zHd^Yyjh7)Nu|SEY=?Ak_6xc!hEd)7@Zpg97KaA!dizDO1 z5&RDIj%Q;DY;Gwca1jo=MnI1A9HRL)oKQv-H?r=w{O>6PgPQZDb0LHKWcCAhmyT8E|6d(af*TrvrvHH|+usIbn2tOv2ZIDLn%X z#shf>;1bx4mD-^;mDU45#q-;;}gA@-E?A3HgO|uP~pC+ zv)&@>v~&slRfvwjR+sM|%^15*_hzvt<}ifPBv*LrW*>aWVq+&n;}1!KHZ0|*3Jk+M z&WTQCRvrr3kZeMWB%rfKMhJTZNy|=hsG|{w^p^u0W+#lA;i^)YbGsN&R zh^LLkw1{Q2FsvDQ9Po2c;2m{XGsXgS7!+)5hZ^VRC%`9Dm5U`JfI zX9}4y`NSYjn&P4Z*Se8C5#?(rv4QIcraNeYxN@bW5vKH86Xw)OFS^`@YzSDAbliIQ zDxgsIWHY9@Njb-!5w5C1I0Cx_#%#OS|>BBDTq#3UDdWH7) zXi|thpOzBFC^$UPnD#Yb11yIqUuHJj+0jfM#9{I=hAc>U{O*rJ9RG{R`|#FN~ZUGX_Jwgfh! 
zlF~issAph#25J=jnDqkS?#i&j&)TVN+a~=u1}6rEd26^2#2#;ajmaJCb^ACY)7+Pq zx{2tWc=1>U@Wd7CZu7JdDDT6jluVnpjB@t6n53pWWGO40dry=(G~A=(=ui?JW$Xe- z-;t9AYR-ja<~rkflpqF`z=l;nukKYf`~*Hfs>p*VCa4F#Cke0j!2*Sa{X$3z0L86& zV_X^9cuiu*?89pl5l9=5RgrUaj$EffZX$v+#X}-BVGiax-Zl9vR@yZgu(U8h9+n56 zA38Axu0dIl)cj-@l;^ieodaQ1uJ#{NQTHd&kRkh+`6=QzRumR9Ps0z|Y=J=}1>DW(rKj_CU0yR>D4UPw{06x8E|^{*Ya6>j)zSe~X|53_>^S>A{XtMfugO3@{?>NawHQOSN4Gvz$2qsEuX#^II zu;PA!zHeeTf)2v6pc{lu&o@542ttTA8>ZOB&;Mz0cI~7uC)F;_d$s<^c}tJH=p1 zgVgqlwX7m0l~3E{f=I~H0yQ>=^9B)uMy9fL>Q8TlrDrY}tBIH5lTPtrI%DYTeWyJ< zqOb`vACF;-x*U~bs^sG<@9CV-yYF&hQ(^-(705ILDQ^3s0YuA382+|El{f1M60mNV z&mfD8LQHl9lc}}zh(E3pM_x~F?EQ1qri;EHmGAkcx_PrlXscmQ|JA}E)9Px|i|y(A z{A_y>HB52dvektI#2=6RwC1LIN^6#6-~jB;FH_9y=|q^YM_L{o@iGRAhRTg3pkB3(fP1FVTJK?6l2=?^Ygt>bxOQh=Et;en=0e6N<7&zWG4D%Hy{`B^F8BYluF-Ob%SZ8C zTZU^uPa} zaZKhsymur(v}2o!%6I;w+HJBC77;0!F@-ChY>l9NUk8_h(=X{pBxHyV>|PAK`+7Vn zq#>uRxI2We*;%?#7*9S>`C#9g(F~Cj__DV(i6Q%Z9kS;loYV1x{yQxU&~_Y}Y&%!3 zTES=rtZ@fHh$b!2Tx=MyB!8;bKT%wbsSG+toWzqm7SDQ-5+$(s3erx8vggG&l(&OW z9k9;+L85hC2#NBZ^fW~m{AT19K-hNR+;)`r2Li&VGS8j*SxIohLr5Bgn!E))HMQa z!J)!!3=Dz{&90|go`ev^;wd! 
z-Z+Rb`pvk0(`ALD*8oxjol#}O1GgQyZar)ki=g*}0Y=0WyE8)lsm-SIwDVV}_NklR z2?>&Z(yjedE@cHo>k05{2mzVD7P8OynkL%?s!I1mDkcOAAM!T1d%|ROBvm+2!4qX-G7i288(_|NpXlKD{$R=yMt{#O20Gv=H?Pm4^vX2~0d!akgJ>PB~ zzl$fmv#A zQHgstYOjt>M;4X*xC>^VKC+_%eLS6;JBsE4?!kWc`EdEo!5{11b>fG0^jw)u=E~pc zss2Gmd&q4HA^g1{CkvOdSQ`6Ciwvm{ zCBFLWJ7HufS*O%kCM*m*ksiAbaHFcykDZ!uHiOJR7^=8+XL@nlCg47FW1R^i>jnL` zt8@T~jV9s+jrl?!Xt}<`Znw}-CN-F$8gpwyQ@D z=QqdFU-IigWWImGNUl=8#QyMqhdAV1LiP$_mN z?ukG}Ce5&;qAUFUQ^;)PdcQw|ur}$yi?vfhc%yNhL?=Nzo$Ql!ZH1&JMNk=mM~yZZ zZYpQ_ha=#LtSdiXexhpp(GL$YALa_`8T`)b27j2wBcAYhP0I`NmUa-6eT0kdJTN#> zc_xr`3nb$s7~%8bp%n5C%6rjq&xyE)oq_X=Cy$>v(J#BYRUmx)GhEy$QkI-Jx(^CrLcU1O4YwW>h>9Thy>cWVkae&B#90SS z>>+FhU|v16%Ve0eVdF;B@av2N(6FM)AD~}X`$rI226HfkFB*0fc;fSKIEHo(sgYWf zA0tLYqds$5fF6I zJR6`%8-fHQ<-_&W^ zA)XYmW@)O8|54J`0N`ycEpw)ZZD7F&_wja|kz!$hPtjxchQm>vy-qt6ZZymdbzT#~ z5SdIOQR@7#8HgYdyW0dg(I{7V?aYVD_+_O<1cdwb`Qct@8#K4SCx0%?Y>$H0Mhqr2 zDAmdTm*ne|72^A&-Znr~{T5WGC<5v+&38LcO5#pD%#l?^7=K7;D4dz}YK0lDse6=+ zB&`wNO!|AfD8s6j#mm(V9J&xA62`m6E<-^+bT+Qwx%UE?P%SdruoUJQd&ZGcu`b$syDSFh?XVT%C+0 z;nG`cvszi?4}38d{5TRwtAOqNWJ(EWn20rMaE2#zqrW5mDND9bhVZY3tIBgUf!H8X zL}Eekf{@V(MQDVh@+xb|9UZSA651>Xwzn%Px*^L6$4;MyDI_#dP)XIGdqSJ~m5+dKx4W;qc#6F_o2y2@-w-b{_=}br8PO+4cHXpO5 z!P|ZS6f#G)VG)k~x8HrD#W;Rx2OrkK?Krlk9Ov{5E5S(|I;Y7`fPh!rB=ZP_T@XDu zuk7THkR7@OlP6>nfz<&K>N#-o#p~LbZR+6<0~|;1c%~N Date: Wed, 28 Feb 2024 17:52:44 +0000 Subject: [PATCH 08/11] Add benchmarks summary csv Saves results of experiments to results --- notebooks/benchmarks.csv | 47 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 notebooks/benchmarks.csv diff --git a/notebooks/benchmarks.csv b/notebooks/benchmarks.csv new file mode 100644 index 0000000..bda4468 --- /dev/null +++ b/notebooks/benchmarks.csv @@ -0,0 +1,47 @@ +,tool,dataset,cloud-aware,format,file,time,mean,size,product 
+0,h5py,ATL03-1GB,no,original,s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5,2.843794107437134,386.06738,1GB,ATL03 +1,h5py,ATL03-1GB,no,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5,4.157144546508789,386.06738,1GB,ATL03 +2,h5py,ATL03-7GB,no,original,s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5,6.9494102001190186,1035.1631,7GB,ATL03 +3,h5py,ATL03-7GB,no,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5,13.6586012840271,1035.1631,7GB,ATL03 +4,h5py,ATL03-2GB,no,original,s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20210402143840_01341107_006_02.h5,1.4053022861480713,2049.7554,2GB,ATL03 +5,h5py,ATL03-2GB,no,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20210402143840_01341107_006_02_repacked.h5,1.0851728916168213,2049.7554,2GB,ATL03 +6,kerchunk,ATL03-7GB-kerchunk,no,original,s3://nasa-cryo-persistent/h5cloud/atl03/big/original/kerchunk/atl03_ATL03_20181120182818_08110112_006_02.json,10.746918678283691," +array(1035.1631, dtype=float32)",7GB,ATL03 +7,kerchunk,ATL03-7GB-kerchunk,no,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/kerchunk/atl03_ATL03_20181120182818_08110112_006_02_repacked.json,8.8134024143219," +array(1035.1631, dtype=float32)",7GB,ATL03 +8,xarray,ATL03-1GB,no,original,s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5,46.50308704376221," +array(386.06738, dtype=float32)",1GB,ATL03 +9,xarray,ATL03-1GB,no,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5,10.25867509841919," +array(386.06738, dtype=float32)",1GB,ATL03 +10,xarray,ATL03-7GB,no,original,s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5,62.89623713493347," +array(1035.1631, 
dtype=float32)",7GB,ATL03 +11,xarray,ATL03-7GB,no,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5,81.67518210411072," +array(1035.1631, dtype=float32)",7GB,ATL03 +12,xarray,ATL03-2GB,no,original,s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20210402143840_01341107_006_02.h5,47.506706953048706," +array(2049.7554, dtype=float32)",2GB,ATL03 +13,xarray,ATL03-2GB,no,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20210402143840_01341107_006_02_repacked.h5,18.109654188156128," +array(2049.7554, dtype=float32)",2GB,ATL03 +14,h5coro,ATL03-1GB,no,original,s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5,4.562052011489868,386.06738,1GB,ATL03 +15,h5coro,ATL03-1GB,no,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5,4.286046743392944,386.06738,1GB,ATL03 +16,h5coro,ATL03-7GB,no,original,s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5,14.072925567626953,1035.1631,7GB,ATL03 +17,h5coro,ATL03-7GB,no,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5,11.79448390007019,1035.1631,7GB,ATL03 +18,h5coro,ATL03-2GB,no,original,s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20210402143840_01341107_006_02.h5,3.1101267337799072,2049.7554,2GB,ATL03 +19,h5coro,ATL03-2GB,no,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20210402143840_01341107_006_02_repacked.h5,1.8120653629302979,2049.7554,2GB,ATL03 +20,h5py,ATL03-1GB,yes,original,s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5,1.8618409633636475,386.06738,1GB,ATL03 +21,h5py,ATL03-1GB,yes,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5,1.9302234649658203,386.06738,1GB,ATL03 
+22,h5py,ATL03-7GB,yes,original,s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5,6.602761507034302,1035.1631,7GB,ATL03 +23,h5py,ATL03-7GB,yes,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5,5.758350849151611,1035.1631,7GB,ATL03 +24,h5py,ATL03-2GB,yes,original,s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20210402143840_01341107_006_02.h5,1.2604756355285645,2049.7554,2GB,ATL03 +25,h5py,ATL03-2GB,yes,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20210402143840_01341107_006_02_repacked.h5,0.8633284568786621,2049.7554,2GB,ATL03 +26,xarray,ATL03-1GB,yes,original,s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5,42.18248891830444," +array(386.06738, dtype=float32)",1GB,ATL03 +27,xarray,ATL03-1GB,yes,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5,2.5429904460906982," +array(386.06738, dtype=float32)",1GB,ATL03 +28,xarray,ATL03-7GB,yes,original,s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5,48.71459078788757," +array(1035.1631, dtype=float32)",7GB,ATL03 +29,xarray,ATL03-7GB,yes,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5,6.6719231605529785," +array(1035.1631, dtype=float32)",7GB,ATL03 +30,xarray,ATL03-2GB,yes,original,s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20210402143840_01341107_006_02.h5,40.31614112854004," +array(2049.7554, dtype=float32)",2GB,ATL03 +31,xarray,ATL03-2GB,yes,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20210402143840_01341107_006_02_repacked.h5,2.156572103500366," +array(2049.7554, dtype=float32)",2GB,ATL03 From d7b57b16ee8a06effe1cfc061a6a251ad51fff98 Mon Sep 17 00:00:00 2001 From: Andy Barrett Date: Wed, 28 Feb 2024 19:54:28 +0000 Subject: [PATCH 
09/11] Create dedicated notebook to plot results Copied plotting from portable-full-comparison --- notebooks/plot_benchmark_results.ipynb | 449 +++++++++++++++++++++++++ 1 file changed, 449 insertions(+) create mode 100644 notebooks/plot_benchmark_results.ipynb diff --git a/notebooks/plot_benchmark_results.ipynb b/notebooks/plot_benchmark_results.ipynb new file mode 100644 index 0000000..a2195b3 --- /dev/null +++ b/notebooks/plot_benchmark_results.ipynb @@ -0,0 +1,449 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e2d85d97-bd23-4af1-b302-1ab55921a30b", + "metadata": { + "user_expressions": [] + }, + "source": [ + "# Plot Benchmarking Results\n", + "\n", + "Plots the results in `benchmarks.csv`" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a7c05ac8-7256-42f7-a351-4b498da62ffc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import re\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "markdown", + "id": "7804d7bc-a01e-46bb-807b-fb5081616d8f", + "metadata": { + "user_expressions": [] + }, + "source": [ + "## Read `benchmarks.csv`\n", + "\n", + "This file is generated using [portable-full-comparison.ipynb](https://hub.cryointhecloud.com/hub/user-redirect/lab/tree/h5cloud/notebooks/portable-full-comparison.ipynb)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "1fa7d33b-ef5f-419d-a0a6-8213e075ede5", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tooldatasetcloud-awareformatfiletimemeansizeproduct
0h5pyATL03-1GBnooriginals3://nasa-cryo-persistent/h5cloud/atl03/averag...2.843794386.067381GBATL03
1h5pyATL03-1GBnooptimizeds3://nasa-cryo-persistent/h5cloud/atl03/averag...4.157145386.067381GBATL03
2h5pyATL03-7GBnooriginals3://nasa-cryo-persistent/h5cloud/atl03/big/or...6.9494101035.16317GBATL03
3h5pyATL03-7GBnooptimizeds3://nasa-cryo-persistent/h5cloud/atl03/big/re...13.6586011035.16317GBATL03
4h5pyATL03-2GBnooriginals3://nasa-cryo-persistent/h5cloud/atl03/big/or...1.4053022049.75542GBATL03
\n", + "
" + ], + "text/plain": [ + " tool dataset cloud-aware format \\\n", + "0 h5py ATL03-1GB no original \n", + "1 h5py ATL03-1GB no optimized \n", + "2 h5py ATL03-7GB no original \n", + "3 h5py ATL03-7GB no optimized \n", + "4 h5py ATL03-2GB no original \n", + "\n", + " file time mean \\\n", + "0 s3://nasa-cryo-persistent/h5cloud/atl03/averag... 2.843794 386.06738 \n", + "1 s3://nasa-cryo-persistent/h5cloud/atl03/averag... 4.157145 386.06738 \n", + "2 s3://nasa-cryo-persistent/h5cloud/atl03/big/or... 6.949410 1035.1631 \n", + "3 s3://nasa-cryo-persistent/h5cloud/atl03/big/re... 13.658601 1035.1631 \n", + "4 s3://nasa-cryo-persistent/h5cloud/atl03/big/or... 1.405302 2049.7554 \n", + "\n", + " size product \n", + "0 1GB ATL03 \n", + "1 1GB ATL03 \n", + "2 7GB ATL03 \n", + "3 7GB ATL03 \n", + "4 2GB ATL03 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(\"benchmarks.csv\", index_col=0)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "8e4e2660-437b-48f7-896f-795e93ae1644", + "metadata": { + "user_expressions": [] + }, + "source": [ + "## Reformat data for plotting" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "5c69cca4-430f-4552-b9e7-88bd07deea33", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
formatoptimizedoriginal
cloud-awarenoyesnoyes
toolsize
h5coro1GB4.286047NaN4.562052NaN
2GB1.812065NaN3.110127NaN
7GB11.794484NaN14.072926NaN
h5py1GB4.1571451.9302232.8437941.861841
2GB1.0851730.8633281.4053021.260476
7GB13.6586015.7583516.9494106.602762
kerchunk7GB8.813402NaN10.746919NaN
xarray1GB10.2586752.54299046.50308742.182489
2GB18.1096542.15657247.50670740.316141
7GB81.6751826.67192362.89623748.714591
\n", + "
" + ], + "text/plain": [ + "format optimized original \n", + "cloud-aware no yes no yes\n", + "tool size \n", + "h5coro 1GB 4.286047 NaN 4.562052 NaN\n", + " 2GB 1.812065 NaN 3.110127 NaN\n", + " 7GB 11.794484 NaN 14.072926 NaN\n", + "h5py 1GB 4.157145 1.930223 2.843794 1.861841\n", + " 2GB 1.085173 0.863328 1.405302 1.260476\n", + " 7GB 13.658601 5.758351 6.949410 6.602762\n", + "kerchunk 7GB 8.813402 NaN 10.746919 NaN\n", + "xarray 1GB 10.258675 2.542990 46.503087 42.182489\n", + " 2GB 18.109654 2.156572 47.506707 40.316141\n", + " 7GB 81.675182 6.671923 62.896237 48.714591" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pivot_df = df.pivot_table(index=[\"tool\", \"size\"], columns=[\"format\", \"cloud-aware\"], values=\"time\", aggfunc=\"mean\")\n", + "pivot_df" + ] + }, + { + "cell_type": "markdown", + "id": "97f6a3dc-e5eb-46e7-ac46-611a08143d07", + "metadata": { + "user_expressions": [] + }, + "source": [ + "## Plot results" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "33602bbd-41c0-4133-bab5-76f45e9fe1a5", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Set seaborn plot style\n", + "sns.set_style(\"darkgrid\", rc={'axes.facecolor': '0.9'})\n", + "\n", + "tool_order = [\"h5py\", \"xarray\", \"h5coro\", \"kerchunk\"]\n", + "# Create figure and axis to \"contain\" plot - allows customization via ax object\n", + "fig, ax = plt.subplots(figsize=(15,6), layout=\"constrained\")\n", + "\n", + "# Plot results\n", + "pivot_df.loc[tool_order,:].plot(kind=\"bar\", ax=ax, \n", + " color=[\"tab:cyan\", \"tab:blue\", \"tab:pink\", \"tab:red\"],\n", + " xlabel=\"\", fontsize=15);\n", + "ax.legend(labels = [\"Optimized\", \"Optimized with informed io parameters\", \"Original\", \"Original with informed io parameters\"], fontsize=15)\n", + "ax.set_ylabel(\"Time (s)\", fontsize=20)\n", + "\n", + "## Make two level axis\n", + "\n", + "# helper to create axis labels\n", + "def parse_text(s):\n", + " return re.sub(r\"[()]\", \"\", s).split(\", \")\n", + "\n", + "# Retrieve and parse axis labels and position\n", + "tool, size, x, y = map(np.array, zip(*[(*parse_text(l.get_text()), *l.get_position()) for l in ax.get_xticklabels()]))\n", + "# Make labels and x-positions for seconary axis\n", + "sec_x, sec_label = zip(*[(x[tool == tool_name].mean(), \"\\n\"+tool_name) for tool_name in np.unique(tool)])\n", + "# Assign ticks and labels\n", + "ax.set_xticks(x, size, rotation=0);\n", + "sec = ax.secondary_xaxis(location=0);\n", + "sec.set_xticks(sec_x, sec_label, fontsize=18);\n", + "sec.tick_params(length=0)\n", + "\n", + "sepa_x = np.array([x[tool == tool_name].min()-0.5 for tool_name in np.unique(tool)] + [x.max()+0.5])\n", + "[ax.axvline(xs, c='k', ymin=-.1, clip_on=False, zorder=3) for xs in sepa_x];\n", + "\n", + "# Uncomment to save figure\n", + "# fig.savefig(\"access_time.summary.png\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "815347b5-f23c-4d25-9104-42523e9de093", + "metadata": {}, + "outputs": [], + "source": [] 
+ } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 8d8996af477958ebe7b609768a3c9166b249f6e1 Mon Sep 17 00:00:00 2001 From: Andy Barrett Date: Wed, 28 Feb 2024 19:57:42 +0000 Subject: [PATCH 10/11] Remove outputs and add plotting Add plotting for end to end running --- notebooks/portable-full-comparison.ipynb | 800 ++++++++++++++++++++++- 1 file changed, 799 insertions(+), 1 deletion(-) diff --git a/notebooks/portable-full-comparison.ipynb b/notebooks/portable-full-comparison.ipynb index 334bddc..8b1dfb8 100644 --- a/notebooks/portable-full-comparison.ipynb +++ b/notebooks/portable-full-comparison.ipynb @@ -1 +1,799 @@ -{"metadata":{"kernelspec":{"display_name":"Python 3 (ipykernel)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.13"}},"nbformat_minor":5,"nbformat":4,"cells":[{"cell_type":"markdown","source":"## AB testing access time for ICESat-2 ATL03 HDF5 files in the cloud.\n\nThis notebook requires that we have 2 versions of the same file:\n * Original A: The original file with no modifications on a S3 location.\n * Test Case B: A modified version of the orignal file to test for metadata consolidation, rechunking and other strategies to speed up access to the data in the file.\n","metadata":{"tags":[],"user_expressions":[]},"id":"6c9b37e2-2daa-4283-a228-ea581498de0c"},{"cell_type":"code","source":"import xarray as xr\nimport h5py\nimport fsspec\nimport logging\nimport re\nimport 
time\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\n\nfrom h5coro import h5coro, s3driver, filedriver\ndriver = s3driver.S3Driver\n\nlogger = logging.getLogger('fsspec')\nlogger.setLevel(logging.DEBUG)","metadata":{"trusted":true,"tags":[]},"execution_count":1,"outputs":[],"id":"3b78fb94-10ae-48cb-8e30-521b2c8b7822"},{"cell_type":"code","source":"for library in (xr, h5py, fsspec, h5coro):\n print(f'{library.__name__} v{library.__version__}')","metadata":{"trusted":true,"tags":[]},"execution_count":2,"outputs":[{"name":"stdout","output_type":"stream","text":"xarray v2023.12.0\nh5py v3.9.0\nfsspec v2023.6.0\nh5coro v0.0.6\n"}],"id":"431d900d-0656-4b75-af6b-82f0f171d5f8"},{"cell_type":"markdown","source":"For listing files in CryoCloud\n\n```bash\naws s3 ls s3://nasa-cryo-persistent/h5cloud/ --recursive\n```","metadata":{"tags":[],"user_expressions":[]},"id":"7998cd99-6034-4a1b-9ae5-d651bc265bff"},{"cell_type":"code","source":"test_dict = {\n \"ATL03-1GB\": {\n \"links\": {\n \"original\": \"s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5\",\n \"optimized\": \"s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5\"\n },\n \"group\": \"/gt1l/heights\",\n \"variable\": \"h_ph\",\n \"processing\": [\n \"h5repack -S PAGE -G 8000000\"\n ]\n },\n \"ATL03-7GB\": {\n \"links\": {\n \"original\": \"s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5\",\n \"optimized\": \"s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5\",\n },\n \"group\": \"/gt1l/heights\",\n \"variable\": \"h_ph\",\n \"processing\": [\n \"h5repack -S PAGE -G 8000000\"\n ]\n },\n \"ATL03-7GB-kerchunk\": {\n \"links\": {\n \"original\": \"s3://nasa-cryo-persistent/h5cloud/atl03/big/original/kerchunk/atl03_ATL03_20181120182818_08110112_006_02.json\",\n \"optimized\": 
\"s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/kerchunk/atl03_ATL03_20181120182818_08110112_006_02_repacked.json\",\n },\n \"group\": \"/gt1l/heights\",\n \"variable\": \"h_ph\",\n \"processing\": [\n \"h5repack -S PAGE -G 8000000\"\n ]\n }, \n \"ATL03-2GB\": {\n \"links\": {\n \"original\": \"s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20210402143840_01341107_006_02.h5\",\n \"optimized\": \"s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20210402143840_01341107_006_02_repacked.h5\",\n },\n \"group\": \"/gt1l/heights\",\n \"variable\": \"h_ph\",\n \"processing\": [\n \"h5repack -S PAGE -G 8000000\"\n ]\n }\n}\n\ndef kerchunk_result(file: str, dataset: str, variable: str):\n fs = fsspec.filesystem(\n \"reference\",\n fo=file,\n remote_protocol=\"s3\",\n remote_options=dict(anon=False),\n skip_instance_cache=True,\n )\n ds = xr.open_dataset(\n fs.get_mapper(\"\"), engine=\"zarr\", consolidated=False, group=dataset\n )\n return ds[variable].mean()\n\n# This will use the embedded credentials in the hub to access the s3://nasa-cryo-persistent bucket\nfs = fsspec.filesystem('s3')\n","metadata":{"trusted":true,"tags":[]},"execution_count":3,"outputs":[],"id":"9850faac-f534-4bc2-9214-c8dababe0f52"},{"cell_type":"markdown","source":"## [h5coro](https://github.com/ICESat2-SlideRule/h5coro/)\n\n**h5coro** is optimized for reading HDF5 data in high-latency high-throughput environments. It accomplishes this through a few key design decisions:\n* __All reads are concurrent.__ Each dataset and/or attribute read by **h5coro** is performed in its own thread.\n* __Intelligent range gets__ are used to read as many dataset chunks as possible in each read operation. This drastically reduces the number of HTTP requests to S3 and means there is no longer a need to re-chunk the data (it actually works better on smaller chunk sizes due to the granularity of the request).\n* __Block caching__ is used to minimize the number of GET requests made to S3. 
S3 has a large first-byte latency (we've measured it at ~60ms on our systems), which means there is a large penalty for each read operation performed. **h5coro** performs all reads to S3 as large block reads and then maintains data in a local cache for access to smaller amounts of data within those blocks.\n* __The system is serverless__ and does not depend on any external services to read the data. This means it scales naturally as the user application scales, and it reduces overall system complexity.\n* __No metadata repository is needed.__ The structure of the file are cached as they are read so that successive reads to other datasets in the same file will not have to re-read and re-build the directory structure of the file.\n","metadata":{"tags":[],"user_expressions":[]},"id":"4d166627-6144-40bf-884d-2188e5c764ba"},{"cell_type":"code","source":"h5coro_beanchmarks = []\n\nfor key, dataset in test_dict.items():\n for k, link in dataset[\"links\"].items():\n print (f\"Processing: {link}\")\n if \"kerchunk\" in link:\n continue\n group = dataset[\"group\"]\n variable = dataset['variable'] \n final_h5coro_array = []\n start = time.time()\n if link.startswith(\"s3://nasa-cryo-persistent/\"):\n h5obj = h5coro.H5Coro(link.replace(\"s3://\", \"\"), s3driver.S3Driver)\n else:\n h5obj = h5coro.H5Coro(link.replace(\"s3://\", \"\"), s3driver.S3Driver, credentials={\"annon\": True})\n ds = h5obj.readDatasets(datasets=[f'{group}/{variable}'], block=True)\n data = ds[f'{group}/{variable}'][:]\n data_mean = np.mean(data)\n elapsed = time.time() - start\n \n h5coro_beanchmarks.append({\"tool\": \"h5coro\",\n \"dataset\": key,\n \"cloud-aware\": \"no\",\n \"format\": k,\n \"file\": link,\n \"time\": elapsed,\n \"mean\": data_mean})\n\n\ndf = pd.DataFrame.from_dict(h5coro_beanchmarks)\n\npivot_df = df.pivot_table(index=['tool','dataset'], columns=['format'], values='time', aggfunc='mean')\n\n# Plotting\npivot_df.plot(kind='bar', figsize=(10, 6))\nplt.title('h5coro cloud optimized 
HDF5 performance')\nplt.xlabel('Tool')\nplt.ylabel('Mean Time')\nplt.xticks(rotation=90)\nplt.legend(title='Format')\nplt.grid(True)\nplt.show()","metadata":{"trusted":true,"tags":[]},"execution_count":33,"outputs":[{"name":"stdout","output_type":"stream","text":"Processing: s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/original/kerchunk/atl03_ATL03_20181120182818_08110112_006_02.json\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/kerchunk/atl03_ATL03_20181120182818_08110112_006_02_repacked.json\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20210402143840_01341107_006_02.h5\nProcessing: 
s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20210402143840_01341107_006_02_repacked.h5\n"},{"output_type":"display_data","data":{"image/png":"iVBORw0KGgoAAAANSUhEUgAAA0oAAAKjCAYAAAAj5v8TAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAABYzUlEQVR4nO3deVhUdf//8deAw7644haCSynuGlouKaZobllalmaubZr7banVnWilpdXtnX5NLVNbzDa1xcwolzQtdyu1vDO3FEITQUURmfP7o4v5OQdQkIGDw/NxXXPlfM6ZM+85zLzh1TnnMzbDMAwBAAAAAJy8rC4AAAAAAIobghIAAAAAmBCUAAAAAMCEoAQAAAAAJgQlAAAAADAhKAEAAACACUEJAAAAAEwISgAAAABgQlACAAAAABOCEoA8iYuLk81m08mTJ6+6bkxMjGw2W7bbHXfcUQSVWsNmsykuLs6S546JiVFMTEyRP+/x48cVFxenXbt2ZVuW9X5xp8LYZl7kdf9GRkaqW7duOS7btm2bbDabFi1a5BxbtGiRy+fDz89PlSpVUrt27TRt2jQlJSVl207WPsjpNnv2bJdaclrnsccey/frLwoffPCB6tWrJ39/f9lsthzfUwBQ1EpZXQAAz1SjRg299957LmOlS5e2phgUiuPHj2vy5MmKjIxU48aNXZY99NBDbg/GhbHN4mDhwoWqU6eOMjIylJSUpI0bN+qll17Syy+/rA8++EAdOnTI9pivvvpKoaGhLmPVq1d3ud+qVSu9/PLLLmMVK1Z0/wsooBMnTujBBx/UHXfcoTlz5sjX11c33XST1WUBAEEJQOHw9/fXrbfeamkNmZmZunTpknx9fS2toyS64YYbdMMNNxT7bRYH9evXV3R0tPN+r169NGbMGLVu3Vo9e/bU//73v2wB5+abb1b58uWvuN3SpUtb/hm8kvPnz8vPz0/79+9XRkaG+vXrp7Zt27pl22lpaQoICHDLtgCUXJx6ByBf/vrrL/Xp00ehoaGqWLGiBg8erJSUlGve3rFjx/TII48oPDxcPj4+qlKliu655x799ddfznWOHDmifv36KSwsTL6+voqKitIrr7wih8PhXOfQoUOy2WyaPn26nn/+eVWvXl2+vr5au3atJOmzzz5TixYtFBAQoODgYMXGxmrz5s15qvH06dP617/+pRo1asjX11dhYWHq0qWLfv311ys+7pdfflGPHj1UpkwZ+fn5qXHjxlq8eLHLOlmnXx06dMhlfN26dbLZbFq3bp1zzDAMTZ8+XREREfLz81PTpk21atWqPL0GSbpw4YImTpyo6tWry8fHR1WrVtXjjz+u06dPu6yXdQrZ8uXL1bBhQ/n5+alGjRp67bXXXOpr1qyZJGnQoEHOU7uyTj/M6TS5rO1+8cUXatKkifz9/RUVFaUvvvjCuS+ioqIUGBio5s2ba9u2bS6PN2/TfOra5bfLT5UzDENz5sxR48aN5e/vrzJlyuiee+7RH3/84bL9gu5fd6pWrZpeeeUVnTlzRvPmzSvS585677377rsaO3asKlWqJH9/f7Vt21Y7d+7Mtv62bdt05513qmzZsvLz81OTJk304YcfuqyT9bP6+uuvNXjwYFWoUEEBAQHq06ePWrduLUm67777sv3s8vK5zXpf7NixQ/fcc4/KlCmjmjVrSir4e27btm26//77FRkZKX9/f0VGRqpPnz46fPhwjq9v7dq1Gjp0qMqXL69y5cqpZ8+eOn78eLZ9tmTJErVo0UJBQUEKCgpS48aNtWDBApd1vvnmG7Vv314hISEKCAhQq1at9O23317pRwfAzQhKAPKlV69euum
mm/TJJ59owoQJWrJkicaMGZNtvQMHDqhs2bIqVaqUatasqaefflrnz593WefYsWNq1qyZli9frrFjx2rVqlWaOXOmQkNDlZycLOmf03Jatmypr7/+Ws8995w+++wzdejQQePGjdPw4cOzPe9rr72mNWvW6OWXX9aqVatUp04dLVmyRD169FBISIjef/99LViwQMnJyYqJidHGjRuv+HrPnDmj1q1ba968eRo0aJA+//xzzZ07VzfddJMSEhJyfdxvv/2mli1bas+ePXrttde0bNky1a1bVwMHDtT06dPzsquzmTx5ssaPH6/Y2FitWLFCQ4cO1cMPP6zffvvtqo81DEN33XWXXn75ZT344INauXKlxo4dq8WLF+v2229Xenq6y/q7du3S6NGjNWbMGC1fvlwtW7bUqFGjnKdyNW3aVAsXLpQkPfPMM9q8ebM2b96shx566Ip17N69WxMnTtT48eO1bNkyhYaGqmfPnpo0aZLefPNNTZ06Ve+9955SUlLUrVu3bO+Zy3Xt2tX5vFm3V199VZJUr14953qPPvqoRo8erQ4dOmjFihWaM2eO9uzZo5YtW7oE8oLs38v386VLl7LdMjMz87yNLF26dJG3t7e+++67bMuyjpZeafvfffedgoODZbfbVbduXb3yyiv5quOpp57SH3/8oTfffFNvvvmmjh8/rpiYGJeAuXbtWrVq1UqnT5/W3Llz9emnn6px48a67777XK7HyjJ48GDZ7Xa98847+vjjj/XCCy/o//7v/yRJU6dO1ebNmzVnzhxJyvfntmfPnqpVq5Y++ugjzZ071zlekPfcoUOHVLt2bc2cOVOrV6/WSy+9pISEBDVr1izH6zUfeugh2e12LVmyRNOnT9e6devUr18/l3WeffZZPfDAA6pSpYoWLVqk5cuXa8CAAS7h691331XHjh0VEhKixYsX68MPP1TZsmXVqVMnwhJQlAwAyINJkyYZkozp06e7jA8bNszw8/MzHA6Hc+zpp5825syZY6xZs8ZYuXKlMXz4cKNUqVJGmzZtjMzMTOd6gwcPNux2u7F3795cn3fChAmGJOPHH390GR86dKhhs9mM3377zTAMwzh48KAhyahZs6Zx8eJF53qZmZlGlSpVjAYNGrg895kzZ4ywsDCjZcuWV3zdU6ZMMSQZ8fHxV1xPkjFp0iTn/fvvv9/w9fU1jhw54rJe586djYCAAOP06dOGYRjGwoULDUnGwYMHXdZbu3atIclYu3atYRiGkZycbPj5+Rl33323y3rff/+9Iclo27btFev76quvcvz5ffDBB4YkY/78+c6xiIgIw2azGbt27XJZNzY21ggJCTHOnTtnGIZhbN261ZBkLFy4MNvzZb1fLhcREWH4+/sbf/75p3Ns165dhiSjcuXKzu0ahmGsWLHCkGR89tlnV9zm5X799VejXLlyRrt27Yz09HTDMAxj8+bNhiTjlVdecVn36NGjhr+/v/Hkk08ahlHw/Zv1+iRd8Xb5vsr62W/dujXXbVasWNGIiorKtg/Mt6pVq7o8btiwYcZbb71lrF+/3lixYoXxwAMPGJKMfv36XfV1ZL33mjZt6vK5PnTokGG3242HHnrIOVanTh2jSZMmRkZGhss2unXrZlSuXNn5mct6rf3798/1+T766CPnWH4+t1n75Nlnn8227YK+58wuXbpknD171ggMDDT++9//OsezXt+wYcNc1p8+fbohyUhISDAMwzD++OMPw9vb23jggQdyfY5z584ZZcuWNbp37+4ynpmZaTRq1Mho3rx5ro8F4F4cUQKQL3feeafL/YYNG+rChQsuM3Q9//zzGjp0qNq1a6cuXbpo1qxZevHFF/Xdd9/p008/da63atUqtWvXTlFRUbk+35o1a1S3bl01b97cZXzgwIEyDENr1qzJVp/dbnfe/+2333T8+HE9+OCD8vL6/y0vKChIvXr10g8//KC0tLRcn3/VqlW66aabcryg/krWrFmj9u3bKzw
8PFvdaWlpeT7tL8vmzZt14cIFPfDAAy7jLVu2VERERJ7qyXr+y917770KDAzM9n+p69Wrp0aNGrmM9e3bV6mpqdqxY0e+ar9c48aNVbVqVef9rJ99TEyMyzUlWePmU5xyk5iYqDvuuEOVK1fW8uXL5ePjI0n64osvZLPZ1K9fP5cjMJUqVVKjRo2cpzYWdP9mad26tbZu3Zrt9vbbb+d5G5czDCPH8W+++cZl+19++aXL8v/7v//ToEGD1KZNG/Xo0UPvvvuuhg8frnfffTfH0+dy0rdvX5dTHSMiItSyZUvn6ay///67fv31V+c+u3z/dunSRQkJCdmOxvXq1StPz30tn9vctl2Q99zZs2c1fvx41apVS6VKlVKpUqUUFBSkc+fOad++fdmeK6f+ePk24+PjlZmZqccffzzX175p0yadOnVKAwYMcNmnDodDd9xxh7Zu3apz587l+ngA7sNkDgDypVy5ci73syZKuNIpUpLUr18/jRs3Tj/88IPuvvtuSf+cVne1i/P//vtvRUZGZhuvUqWKc/nlKleunO3xOY1nbcPhcCg5OTnXC79PnDihatWqXbHG3OrO7Tlzqjsv25OkSpUqZVuW01hOjy9VqpQqVKjgMm6z2VSpUqVs9VzpefJb++XKli3rcj8r0OQ2fuHChatu88yZM+rSpYsyMjK0atUql9ng/vrrLxmGketsbzVq1JBU8P2bJTQ01GVihoI4d+6c/v77bzVo0CDbskaNGl11Mgezfv36afbs2frhhx/UpEmTq66f277YvXu3JDlPWxw3bpzGjRuX4zbMp6fl9JnIybV8bnPbdkHec3379tW3336rf//732rWrJlCQkJks9nUpUuXHHve1frjiRMnJOmKfS9rv95zzz25rnPq1CkFBgbmuhyAexCUABSpy//vcIUKFfTnn39ecf1y5crleC1Q1gXS5j8WzRMIZP3hkts2vLy8VKZMmVyfPy81FqRuPz8/Scp2jZD5D8ys15GYmJhtm4mJiTmGSfPjL126pBMnTriEJcMwlJiY6JyY4fJt5vQ8l9dSHGRkZKhXr146cOCANmzYkO0P0PLly8tms2nDhg05zn6YNVbQ/VsYVq5cqczMTLd9R1bW0anLP4NXktu+yNpXWe/hiRMnqmfPnjluo3bt2i738/o9WNfyuXX3d2ylpKToiy++0KRJkzRhwgTneHp6uk6dOnVN28z67P3555/ZjjZnydqvs2bNynXWwuI4zTvgiTj1DkCRyJrt7fJf/J07d9batWuveLF8+/bttXfv3myne7399tuy2Wxq167dFZ+3du3aqlq1qpYsWeJyGtO5c+f0ySefOGfUyk3nzp21f//+bKf4XU379u21Zs2abDNevf322woICHDuh6w/wH/66SeX9T777DOX+7feeqv8/PyyfTfVpk2b8nR6Wvv27SX9c5H45T755BOdO3fOuTzLnj17nEcOsixZskTBwcFq2rSppLwfTSxMQ4YM0bp167Rs2TLnaU6X69atmwzD0LFjxxQdHZ3tlnW0pqD7192OHDmicePGKTQ0VI8++qhbtpl1+l9epwx///33XT4zhw8f1qZNm5zBrXbt2rrxxhu1e/fuHPdtdHS0goODr6nWgn5u3cFms8kwjGwB+80337ymyTkkqWPHjvL29tbrr7+e6zqtWrVS6dKltXfv3lz3a9bRLwCFiyNKANxqw4YNeuGFF3T33XerRo0aunDhglatWqX58+fr9ttvV/fu3Z3rTpkyRatWrVKbNm301FNPqUGDBjp9+rS++uorjR07VnXq1NGYMWP09ttvq2vXrpoyZYoiIiK0cuVKzZkzR0OHDr3qF1N6eXlp+vTpeuCBB9StWzc9+uijSk9P14wZM3T69Gm9+OKLV3z86NGj9cEHH6hHjx6aMGGCmjdvrvPnz2v9+vXq1q1brkFt0qRJ+uKLL9SuXTs9++yzKlu2rN577z2tXLlS06dPd54e1qxZM9WuXVvjxo3
TpUuXVKZMGS1fvjzbrF5lypTRuHHj9Pzzz+uhhx7Svffeq6NHjyouLi5Pp4bFxsaqU6dOGj9+vFJTU9WqVSv99NNPmjRpkpo0aaIHH3zQZf0qVarozjvvVFxcnCpXrqx3331X8fHxeumll5x/oNasWVP+/v567733FBUVpaCgIFWpUsV5emFhmzFjht555x2NGDFCgYGB+uGHH5zLQkJCVLduXbVq1UqPPPKIBg0apG3btqlNmzYKDAxUQkKCNm7cqAYNGmjo0KEF3r8F8csvvzivQ0lKStKGDRu0cOFCeXt7a/ny5dlOl7yaJUuWaNmyZeratasiIiJ0+vRpffTRR1q6dKkGDhyY7dqz3CQlJenuu+/Www8/rJSUFE2aNEl+fn6aOHGic5158+apc+fO6tSpkwYOHKiqVavq1KlT2rdvn3bs2KGPPvooX7VnKejn1h1CQkLUpk0bzZgxQ+XLl1dkZKTWr1+vBQsWXPOXZ0dGRuqpp57Sc889p/Pnzzu/amHv3r06efKkJk+erKCgIM2aNUsDBgzQqVOndM899ygsLEwnTpzQ7t27deLEiSsGLQBuZNUsEgCuL1kzS504ccJl3Dxr2//+9z+jS5cuRtWqVQ1fX1/Dz8/PaNCggfHCCy8YFy5cyLbdo0ePGoMHDzYqVapk2O12o0qVKkbv3r2Nv/76y7nO4cOHjb59+xrlypUz7Ha7Ubt2bWPGjBkus2FlzXo3Y8aMHOtfsWKFccsttxh+fn5GYGCg0b59e+P777/P02tPTk42Ro0aZVSrVs2w2+1GWFiY0bVrV+PXX391riPTrHeGYRg///yz0b17dyM0NNTw8fExGjVqlOMMcfv37zc6duxohISEGBUqVDBGjBhhrFy50mXWO8MwDIfDYUybNs0IDw83fHx8jIYNGxqff/650bZt2zzNynb+/Hlj/PjxRkREhGG3243KlSsbQ4cONZKTk13Wi4iIMLp27Wp8/PHHRr169QwfHx8jMjLSePXVV7Nt8/333zfq1Klj2O12l32Q26x3Xbt2zbYNScbjjz/uMpbTz9O8zQEDBuQ6u5x5f7z11lvGLbfcYgQGBhr+/v5GzZo1jf79+xvbtm1zrlPQ/Zvb6zOMnGcIzPrsZN18fHyMsLAwo23btsbUqVONpKSkbNvJ7XN4uc2bNxvt27d3fqYCAgKMZs2aGXPmzHH5zOQmaxa6d955xxg5cqRRoUIFw9fX17jttttc9leW3bt3G7179zbCwsIMu91uVKpUybj99tuNuXPnZnutOc3wl9Osd1ny8rm90j4p6Hvuzz//NHr16mWUKVPGCA4ONu644w7jl19+MSIiIowBAwZc9fWZZ6/M8vbbbxvNmjUz/Pz8jKCgIKNJkybZesP69euNrl27GmXLljXsdrtRtWpVo2vXrjnuJwCFw2YYuUypAwAokSIjI1W/fn3nl3KiZFm3bp3atWunjz766IoTCgCAp+MaJQAAAAAwISgBAAAAgAmn3gEAAACACUeUAAAAAMCEoAQAAAAAJgQlAAAAADDx+C+cdTgcOn78uIKDg2Wz2awuBwAAAIBFDMPQmTNnVKVKFXl5XfmYkccHpePHjys8PNzqMgAAAAAUE0ePHtUNN9xwxXU8PigFBwdL+mdnhISEWFwNrJCRkaGvv/5aHTt2lN1ut7ocABahFwCgDyA1NVXh4eHOjHAlHh+Usk63CwkJISiVUBkZGQoICFBISAhNESjB6AUA6APIkpdLcpjMAQAAAABMCEoAAAAAYEJQAgAAAAATj79GKS8Mw9ClS5eUmZlpdSlwA29vb5UqVYrp4AEAAHDNSnxQunjxohISEpSWlmZ1KXCjgIAAVa5cWT4+PlaXAgAAgOtQiQ5KDodDBw8elLe3t6pUqSIfHx+OQlznDMPQxYsXdeLECR08eFA33nij1SUBAADgOlSig9LFixflcDgUHh6ugIAAq8uBm/j7+8tut+vw4cO6ePGivL29rS4JAAAA1xk
mc5Dk5cVu8DT8TAEAAFAQ/DUJAAAAACYEJQAAAAAwISgBAAAAgAlBqZgZOHCgbDZbttvvv/9udWkuDh06JJvNpl27dlldCgAAAOB2JXrWu+Lqjjvu0MKFC13GKlSokO/tXLx4ke8RAgAAAK4BR5SKIV9fX1WqVMnl5u3trfXr16t58+by9fVV5cqVNWHCBF26dMn5uJiYGA0fPlxjx45V+fLlFRsbq3Xr1slms2n16tVq0qSJ/P39dfvttyspKUmrVq1SVFSUQkJC1KdPH5cv3f3qq6/UunVrlS5dWuXKlVO3bt104MAB5/Lq1atLkpo0aSKbzaaYmJgi2z8AAABAYSMoXSeOHTumLl26qFmzZtq9e7def/11LViwQM8//7zLeosXL1apUqX0/fffa968ec7xuLg4zZ49W5s2bdLRo0fVu3dvzZw5U0uWLNHKlSsVHx+vWbNmOdc/d+6cxo4dq61bt+rbb7+Vl5eX7r77bjkcDknSli1bJEnffPONEhIStGzZsiLYCwAAAEDR4NS7YuiLL75QUFCQ837nzp110003KTw8XLNnz5bNZlOdOnV0/PhxjR8/Xs8++6zze4Nq1aql6dOnOx+bmJgoSXr++efVqlUrSdKQIUM0ceJEHThwQDVq1JAk3XPPPVq7dq3Gjx8vSerVq5dLTQsWLFBYWJj27t2r+vXrO08FLFeunCpVqlRIewIAAACwBkeUiqF27dpp165dzttrr72mffv2qUWLFrLZbM71WrVqpbNnz+rPP/90jkVHR+e4zYYNGzr/XbFiRQUEBDhDUtZYUlKS8/6BAwfUt29f1ahRQyEhIc5T7Y4cOeK21wkAAAAUVxxRKoYCAwNVq1YtlzHDMFxCUtaYJJfxwMDAHLdpt9ud/7bZbC73s8ayTquTpO7duys8PFxvvPGGqlSpIofDofr16+vixYvX9qIAAACA6whHlK4TdevW1aZNm5zhSJI2bdqk4OBgVa1a1a3P9ffff2vfvn165pln1L59e0VFRSk5OdllnazZ9DIzM9363AAAAEBxwBGl68SwYcM0c+ZMjRgxQsOHD9dvv/2mSZMmaezYsc7rk9ylTJkyKleunObPn6/KlSvryJEjmjBhgss6YWFh8vf311dffaUbbrhBfn5+Cg0NdWsdAAAAhWLaDZLjgtVVWCMuxeoKrhscUbpOVK1aVV9++aW2bNmiRo0a6bHHHtOQIUP0zDPPuP25vLy8tHTpUm3fvl3169fXmDFjNGPGDJd1SpUqpddee03z5s1TlSpV1KNHD7fXAQAAAFjFZlx+LpcHSk1NVWhoqFJSUhQSEuKy7MKFCzp48KCqV68uPz8/iypEYbj8Z+vt7a0vv/xSXbp0yXZtFoCSIyMjg14AlHDOPrD7Edk5olQiXSkbmHFECQAAAABMCEoAAAAAYEJQAgAAAAATghIAAAAAmBCUAAAAAMCEoAQAAAAAJgQlAAAAADAhKAEAAACACUEJAAAAAExKWV1AcRU5YWWRPdehF7sW2XNdTWRkpEaPHq3Ro0df8zbi4uK0YsUK7dq1y211mcXExKhx48aaOXNmoT0HAAAASi6OKJVQixYtUunSpbONb926VY888kiBtj1u3Dh9++23BdoGAAAAYCWOKMFFhQoVCryNoKAgBQUFuaEaAAAAwBocUbpOpaena+TIkQoLC5Ofn59at26trVu3SpLWrVsnm82mlStXqlGjRvLz89Mtt9yin3/+2bl80KBBSklJkc1mk81mU1xcnKR/Tr27/HQ2m82mefPmqVu3bgoICFBUVJQ2b96s33//XTExMQoMDFSLFi104MAB52Pi4uLUuHFjl22Yb5GRkc7le/fuVZcuXRQUFKSKFSvqwQcf1MmTJ53Lz507p/79+ysoKEiVK1fWK6+84v4dCgAAAFyGoHSdevLJJ/XJJ59o8eLF2rFjh2rVqqVOnTrp1KlTznWeeOIJvfzyy9q6davCwsJ05513KiMjQy1bttTMmTM
VEhKihIQEJSQkaNy4cbk+13PPPaf+/ftr165dqlOnjvr27atHH31UEydO1LZt2yRJw4cPz/XxWc+RkJCg33//XbVq1VKbNm2cy9q2bavGjRtr27Zt+uqrr/TXX3+pd+/eLq9j7dq1Wr58ub7++mutW7dO27dvL+guBAAAAHLFqXfXoXPnzun111/XokWL1LlzZ0nSG2+8ofj4eC1YsEDNmjWTJE2aNEmxsbGSpMWLF+uGG27Q8uXL1bt3b4WGhspms6lSpUpXfb5BgwY5g8v48ePVokUL/fvf/1anTp0kSaNGjdKgQYNyfXzWcxiGoV69eik0NFTz5s2TJL3++utq2rSppk6d6lz/rbfeUnh4uPbv368qVapowYIFevvtt7O9FgAAAKCwEJSuQwcOHFBGRoZatWrlHLPb7WrevLn27dvnDEotWrRwLi9btqxq166tffv25fv5GjZs6Px3xYoVJUkNGjRwGbtw4YJSU1MVEhKS63aeeuopbd68WVu3bpW/v78kafv27Vq7dm2O1zQdOHBA58+f18WLF3N8LQAAAEBhsfTUu++++07du3dXlSpVZLPZtGLFCpflhmEoLi5OVapUkb+/v2JiYrRnzx5rii1GDMOQ9M+1P+Zx85jZ1ZbnxG63Z3t8TmMOhyPXbbz77rv6z3/+o+XLl7scDXI4HOrevbt27drlcvvf//6nNm3aOF8rAAAAUJQsDUrnzp1To0aNNHv27ByXT58+Xa+++qpmz56trVu3qlKlSoqNjdWZM2eKuNLipVatWvLx8dHGjRudYxkZGdq2bZuioqKcYz/88IPz38nJydq/f7/q1KkjSfLx8VFmZmaR1Lt582Y99NBDmjdvnm699VaXZU2bNtWePXsUGRmpWrVqudwCAwNVq1Yt2e32HF8LAAAAUFgsDUqdO3fW888/r549e2ZbZhiGZs6cqaefflo9e/ZU/fr1tXjxYqWlpWnJkiUWVFt8BAYGaujQoXriiSf01Vdfae/evXr44YeVlpamIUOGONebMmWKvv32W/3yyy8aOHCgypcvr7vuukvSP7PbnT17Vt9++61OnjyptLS0Qqk1MTFRd999t+6//3516tRJiYmJSkxM1IkTJyRJjz/+uE6dOqU+ffpoy5Yt+uOPP/T1119r8ODByszMVFBQkIYMGaInnnjC5bV4eTEPCQAAAApPsb1G6eDBg0pMTFTHjh2dY76+vmrbtq02bdqkRx99NMfHpaenKz093Xk/NTVV0j9HXDIyMlzWzcjIkGEYcjgc2U4b+2NqZ3e9lKu60ilruZk6daoyMzP14IMP6syZM4qOjtaqVasUGhrq3N7UqVM1atQo/e9//1OjRo20YsUKlSpVSg6HQ7feeqseffRR3Xffffr777/17LPPatKkSZLk3CeX15d1//L/5jaWdbqcw+HQ3r179ddff2nx4sVavHixc5sRERH6448/VKlSJW3YsEETJkxQp06dlJ6eroiICOdEEQ6HQy+99JLOnDmjO++8U8HBwRo7dqxSUlKy1Wnep4ZhKCMjw7mO+ecPoGTJ6gH0AqDkcvYBLz+LK7FQCe+B+fkdYDOKyUUgNptNy5cvdx7x2LRpk1q1aqVjx46pSpUqzvUeeeQRHT58WKtXr85xO3FxcZo8eXK28SVLliggIMBlrFSpUqpUqZLCw8Pl4+PjvhdjsY0bN6p79+46dOiQQkNDrS7HEhcvXtTRo0eVmJioS5cuWV0OAAAAioG0tDT17dtXKSkpV5yETCrGR5Sy5HfCgokTJ2rs2LHO+6mpqQoPD1fHjh2z7YwLFy7o6NGjCgoKkp+f5/yfhaxAGBwcfNU3gKe6cOGC/P391aZNG3l7eys+Pl6xsbEuk1AAKFkyMjLoBUAJ5+wDP4+U3XHB6nKsMfFPqyuwVNbZZnlRbINS1nfvJCYmqnLlys7xpKQk5xTVOfH19ZWvr2+2cbvdnu0XY2Zmpmw2m7y8vDzqmpes1+Jprys/vLy8ZLP
ZZLfb5e3tLSnn9wCAkodeAMDuuFByg1IJ73/56f/F9q/o6tWrq1KlSoqPj3eOXbx4UevXr1fLli0trKz4i4mJkWEYKl26tNWlAAAAANclS48onT17Vr///rvz/sGDB7Vr1y6VLVtW1apV0+jRozV16lTdeOONuvHGGzV16lQFBASob9++FlYNAAAAwNNZGpS2bdumdu3aOe9nXVs0YMAALVq0SE8++aTOnz+vYcOGKTk5Wbfccou+/vprBQcHW1UyAAAAgBLA0qCUdYpYbmw2m+Li4hQXF1d0RQEAAAAo8YrtNUoAAAAAYBWCEgAAAACYEJQAAAAAwKTYfo+S5eJCi/C5UormaeLitGLFCu3atSvPj4mJiVHjxo01c+ZMS+sAAAAAihJBqQQZN26cRowYka/HLFu2jC9mBAAAQIlDUCoBDMNQZmamgoKCFBQUlK/Hli1btpCqAgAAAIovrlG6TqWnp2vkyJEKCwuTn5+fWrdura1bt0qS1q1bJ5vNptWrVys6Olq+vr7asGGD4uLi1LhxY+c2Ll26pJEjR6p06dIqV66cxo8frwEDBuiuu+5yrhMTE6PRo0c770dGRmrq1KkaPHiwgoODVa1aNc2fP9+ltvHjx+umm25SQECAatSooX//+9/KyMgozN0BAAAAuBVB6Tr15JNP6pNPPtHixYu1Y8cO1apVS506ddKpU6dc1pk2bZr27dunhg0bZtvGSy+9pPfee08LFy7U999/r9TUVK1YseKqz/3KK68oOjpaO3fu1LBhwzR06FD9+uuvzuXBwcFatGiR9u7dq//+979644039J///MctrxsAAAAoCgSl69C5c+f0+uuva8aMGercubPq1q2rN954Q/7+/lqwYIFzvSlTpig2NlY1a9ZUuXLlsm1n1qxZmjhxou6++27VqVNHs2fPVunSpa/6/F26dNGwYcNUq1YtjR8/XuXLl9e6deucy5955hm1bNlSkZGR6t69u/71r3/pww8/dMdLBwAAAIoE1yhdhw4cOKCMjAy1atXKOWa329W8eXPt27dPzZo1kyRFR0fnuo2UlBT99ddfat68uXPM29tbN998sxwOxxWf//KjUzabTZUqVVJSUpJz7OOPP9bMmTP1+++/6+zZs7p06ZJCQkLy/ToBAAAAq3BE6TpkGIakf0KKefzyscDAwKtuK6dtXI15FjybzeYMVz/88IPuv/9+de7cWV988YV27typp59+WhcvXrzqdgEAAIDigqB0HapVq5Z8fHy0ceNG51hGRoa2bdumqKioPG0jNDRUFStW1JYtW5xjmZmZ2rlzZ4Fq+/777xUREaGnn35a0dHRuvHGG3X48OECbRMAAAAoapx6dx0KDAzU0KFD9cQTT6hs2bKqVq2apk+frrS0NA0ZMkS7d+/O03ZGjBihadOmqVatWqpTp45mzZql5OTkbEeZ8qNWrVo6cuSIli5dqmbNmmnlypVavnz5NW8PAAAAsAJBKTdxKVZXcEUvvviiHA6HHnzwQZ05c0bR0dFavXq1ypQpk+dtjB8/XomJierfv7+8vb31yCOPqFOnTvL29r7munr06KExY8Zo+PDhSk9PV9euXfXvf/9bcXFx17xNAAAAoKjZjLxclHIdS01NVWhoqFJSUrJNKHDhwgUdPHhQ1atXl5+fn0UVFh8Oh0NRUVHq3bu3nnvuOavLKZDLf7be3t768ssv1aVLl2zXVwEoOTIyMugFQAnn7AO7H5HdccHqcqxRzA8GFLYrZQMzjiiVYIcPH9bXX3+ttm3bKj09XbNnz9bBgwfVt29fq0sDAAAALMVkDiWYl5eXFi1apGbNmqlVq1b6+eef9c033+R5QggAAADAU3FEqQQLDw/X999/b3UZAAAAQLHDESUAAAAAMCEoKW9fsorrCz9TAAAAFESJDkpZsx6lpaVZXAncLetnysxWAAAAuBYl+holb29vlS5dWklJSZKkgICAAn3ZKqxnGIbS0tKUlJSk0qVLy9v
bWw6Hw+qyAAAAcJ0p0UFJkipVqiRJzrAEz1C6dGnnzxYAAADIrxIflGw2mypXrqywsDBlZGRYXQ7cwG63y9vb2+oyAAAAcB0r8UEpi7e3N39cAwAAAJBUwidzAAAAAICcEJQAAAAAwIRT7wAAAEqIyAkrrS7BUr7ehqY3t7oKXC84ogQAAAAAJgQlAAAAADAhKAEAAACACUEJAAAAAEwISgAAAABgQlACAAAAABOCEgAAAACYEJQAAAAAwISgBAAAAAAmBCUAAAAAMCEoAQAAAIAJQQkAAAAATAhKAAAAAGBCUAIAAAAAE4ISAAAAAJgQlAAAAADAhKAEAAAAACYEJQAAAAAwISgBAAAAgAlBCQAAAABMCEoAAAAAYEJQAgAAAAATghIAAAAAmBCUAAAAAMCEoAQAAAAAJgQlAAAAADAhKAEAAACACUEJAAAAAEwISgAAAABgQlACAAAAABOCEgAAAACYEJQAAAAAwISgBAAAAAAmBCUAAAAAMCEoAQAAAIAJQQkAAAAATAhKAAAAAGBCUAIAAAAAE4ISAAAAAJgQlAAAAADAhKAEAAAAACYEJQAAAAAwISgBAAAAgAlBCQAAAABMinVQunTpkp555hlVr15d/v7+qlGjhqZMmSKHw2F1aQAAAAA8WCmrC7iSl156SXPnztXixYtVr149bdu2TYMGDVJoaKhGjRpldXkAAAAAPFSxDkqbN29Wjx491LVrV0lSZGSk3n//fW3bts3iygAAAAB4smIdlFq3bq25c+dq//79uummm7R7925t3LhRM2fOzPUx6enpSk9Pd95PTU2VJGVkZCgjI6OwS0YxlPVz5+cPlGz0AkDy9TasLsFSvl7/vP4MLz+LK7FQCe+B+fkdYDMMo9h+YgzD0FNPPaWXXnpJ3t7eyszM1AsvvKCJEyfm+pi4uDhNnjw52/iSJUsUEBBQmOUCAAAAKMbS0tLUt29fpaSkKCQk5IrrFuugtHTpUj3xxBOaMWOG6tWrp127dmn06NF69dVXNWDAgBwfk9MRpfDwcJ08efKqOwOeKSMjQ/Hx8YqNjZXdbre6HAAWoRcAUv241VaXYClfL0PPRTsU+/NI2R0XrC7HGhP/tLoCS6Wmpqp8+fJ5CkrF+tS7J554QhMmTND9998vSWrQoIEOHz6sadOm5RqUfH195evrm23cbrfzi7GE4z0AQKIXoGRLz7RZXUKxYHdcKLlBqYT3v/z0/2I9PXhaWpq8vFxL9Pb2ZnpwAAAAAIWqWB9R6t69u1544QVVq1ZN9erV086dO/Xqq69q8ODBVpcGAAAAwIMV66A0a9Ys/fvf/9awYcOUlJSkKlWq6NFHH9Wzzz5rdWkAAAAAPFixDkrBwcGaOXPmFacDBwAAAAB3K9bXKAEAAACAFQhKAAAAAGBCUAIAAAAAE4ISAAAAAJgQlAAAAADAhKAEAAAAACYEJQAAAAAwISgBAAAAgAlBCQAAAABMCEoAAAAAYEJQAgAAAAATghIAAAAAmBCUAAAAAMCEoAQAAAAAJgQlAAAAADAhKAEAAACACUEJAAAAAEwISgAAAABgQlACAAAAABOCEgAAAACYEJQAAAAAwISgBAAAAAAmBCUAAAAAMCEoAQAAAIAJQQkAAAAATAhKAAAAAGBCUAIAAAAAE4ISAAAAAJgQlAAAAADAhKAEAAAAACYEJQAAAAAwISgBAAAAgAlBCQAAAABMCEoAAAAAYEJQAgAAAAATghIAAAAAmBCUAAAAAMCEoAQAAAAAJgQlAAAAADAhKAEAAACACUEJAAAAAEwISgAAAABgQlACAAAAABOCEgAAAACYEJQAAAAAwISgBAAAAAAmBCUAAAAAMCEoAQAAAIAJQQkAAAAATAhKAAAAAGBCUAIAAAAAE4ISAAAAAJgQlAAAAADAhKAEAAAAACYEJQAAAAAwISgBAAAAgAlBCQAAAABMCEoAAAAAYEJQAgAAAAATghIAAAAAmFxTUDpw4ICeeeYZ9en
TR0lJSZKkr776Snv27HFrcQAAAABghXwHpfXr16tBgwb68ccftWzZMp09e1aS9NNPP2nSpEluLxAAAAAAilq+g9KECRP0/PPPKz4+Xj4+Ps7xdu3aafPmzW4tDgAAAACskO+g9PPPP+vuu+/ONl6hQgX9/fffbikKAAAAAKyU76BUunRpJSQkZBvfuXOnqlat6paiAAAAAMBK+Q5Kffv21fjx45WYmCibzSaHw6Hvv/9e48aNU//+/QujRgAAAAAoUvkOSi+88IKqVaumqlWr6uzZs6pbt67atGmjli1b6plnnimMGgEAAACgSJXK7wPsdrvee+89TZkyRTt37pTD4VCTJk104403FkZ9AAAAAFDk8h2UstSsWVM1a9Z0Zy0AAAAAUCzkOygZhqGPP/5Ya9euVVJSkhwOh8vyZcuWua04AAAAALBCvoPSqFGjNH/+fLVr104VK1aUzWYrjLoAAAAAwDL5Dkrvvvuuli1bpi5duhRGPQAAAABguXzPehcaGqoaNWoURi0AAAAAUCzkOyjFxcVp8uTJOn/+fGHUk82xY8fUr18/lStXTgEBAWrcuLG2b99eJM8NAAAAoGTK96l39957r95//32FhYUpMjJSdrvdZfmOHTvcVlxycrJatWqldu3aadWqVQoLC9OBAwdUunRptz0HAAAAAJjlOygNHDhQ27dvV79+/Qp9MoeXXnpJ4eHhWrhwoXMsMjKy0J4PAAAAAKRrCEorV67U6tWr1bp168Kox8Vnn32mTp066d5779X69etVtWpVDRs2TA8//HCuj0lPT1d6errzfmpqqiQpIyNDGRkZhV4zip+snzs/f6BkoxcAkq+3YXUJlvL1+uf1Z3j5WVyJhUp4D8zP7wCbYRj5+sTUqVNHH374oRo2bJjvwvLLz++fN/HYsWN17733asuWLRo9erTmzZun/v375/iYrGuozJYsWaKAgIBCrRcAAABA8ZWWlqa+ffsqJSVFISEhV1w330Fp5cqVmjVrlubOnVvop8H5+PgoOjpamzZtco6NHDlSW7du1ebNm3N8TE5HlMLDw3Xy5Mmr7gx4poyMDMXHxys2NjbbNXUASg56ASDVj1ttdQmW8vUy9Fy0Q7E/j5TdccHqcqwx8U+rK7BUamqqypcvn6eglO9T7/r166e0tDTVrFlTAQEB2X7ZnDp1Kr+bzFXlypVVt25dl7GoqCh98sknuT7G19dXvr6+2cbtdju/GEs43gMAJHoBSrb0zMK7tvx6YndcKLlBqYT3v/z0/3wHpZkzZ+b3IdesVatW+u2331zG9u/fr4iIiCKrAQAAAEDJk++gNGDAgMKoI0djxoxRy5YtNXXqVPXu3VtbtmzR/PnzNX/+/CKrAQAAAEDJk6eglJqa6jyHL2sWudy48zqgZs2aafny5Zo4caKmTJmi6tWra+bMmXrggQfc9hwAAAAAYJanoFSmTBklJCQoLCxMpUuXzvG7kwzDkM1mU2ZmplsL7Natm7p16+bWbQIAAADAleQpKK1Zs0Zly5aVJK1du7ZQCwIAAAAAq+UpKLVt21Y1atTQ1q1b1bZt28KuCQAAAAAs5ZXXFQ8dOuT20+oAAAAAoDjKc1ACAAAAgJIiX9OD7927V4mJiVdcp2HDhgUqCAAAAACslq+g1L59exmGkW3cZrMV2qx3AAAAAFDU8hWUfvzxR1WoUKGwagEAAACAYiFfQalatWoKCwsrrFoAAAAAoFhgMgcAAAAAMMlzUGrbtq18fHwKsxYAAAAAKBbyfOrd2rVrC7MOAAAAACg2OPUOAAAAAEwISgAAAABgQlACAAAAABOCEgAAAACY5Ot7lCQpMzNTixYt0rfffqukpCQ5HA6X5WvWrHFbcQAAAABghXwHpVGjRmnRokXq2rWr6tevL5vNVhh1AQAAAIBl8h2Uli5dqg8//FBdunQpjHoAAAAAwHL5vkbJx8dHtWrVKoxaAAAAAKBYyHdQ+te//qX//ve/MgyjMOoBAAA
AAMvl+9S7jRs3au3atVq1apXq1asnu93usnzZsmVuKw4AAAAArJDvoFS6dGndfffdhVELAAAAABQL+Q5KCxcuLIw6AAAAAKDY4AtnAQAAAMAk30eUJOnjjz/Whx9+qCNHjujixYsuy3bs2OGWwgAAAADAKvk+ovTaa69p0KBBCgsL086dO9W8eXOVK1dOf/zxhzp37lwYNQIAAABAkcp3UJozZ47mz5+v2bNny8fHR08++aTi4+M1cuRIpaSkFEaNAAAAAFCk8h2Ujhw5opYtW0qS/P39debMGUnSgw8+qPfff9+91QEAAACABfIdlCpVqqS///5bkhQREaEffvhBknTw4EG+hBYAAACAR8h3ULr99tv1+eefS5KGDBmiMWPGKDY2Vvfddx/frwQAAADAI+R71rv58+fL4XBIkh577DGVLVtWGzduVPfu3fXYY4+5vUAAAAAAKGr5DkpeXl7y8vr/B6J69+6t3r17u7UoAAAAALDSNX3h7IYNG9SvXz+1aNFCx44dkyS988472rhxo1uLAwAAAAAr5DsoffLJJ+rUqZP8/f21c+dOpaenS5LOnDmjqVOnur1AAAAAAChq+Q5Kzz//vObOnas33nhDdrvdOd6yZUvt2LHDrcUBAAAAgBXyHZR+++03tWnTJtt4SEiITp8+7Y6aAAAAAMBS+Q5KlStX1u+//55tfOPGjapRo4ZbigIAAAAAK+U7KD366KMaNWqUfvzxR9lsNh0/flzvvfeexo0bp2HDhhVGjQAAAABQpPI9PfiTTz6plJQUtWvXThcuXFCbNm3k6+urcePGafjw4YVRIwAAAAAUqXwHJUl64YUX9PTTT2vv3r1yOByqW7eugoKC3F0bAAAAAFjimoKSJAUEBCg6OtqdtQAAAABAsZDnoDR48OA8rffWW29dczEAAAAAUBzkOSgtWrRIERERatKkiQzDKMyaAAAAAMBSeQ5Kjz32mJYuXao//vhDgwcPVr9+/VS2bNnCrA0AAAAALJHn6cHnzJmjhIQEjR8/Xp9//rnCw8PVu3dvrV69miNMAAAAADxKvr5HydfXV3369FF8fLz27t2revXqadiwYYqIiNDZs2cLq0YAAAAAKFL5/sLZLDabTTabTYZhyOFwuLMmAAAAALBUvoJSenq63n//fcXGxqp27dr6+eefNXv2bB05coTvUQIAAADgMfI8mcOwYcO0dOlSVatWTYMGDdLSpUtVrly5wqwNAAAAACyR56A0d+5cVatWTdWrV9f69eu1fv36HNdbtmyZ24oDAAAAACvkOSj1799fNputMGsBAAAAgGIhX184CwAAAAAlwTXPegcAAAAAnoqgBAAAAAAmBCUAAAAAMCEoAQAAAIAJQQkAAAAATAhKAAAAAGBCUAIAAAAAE4ISAAAAAJgQlAAAAADAhKAEAAAAACYEJQAAAAAwISgBAAAAgAlBCQAAAABMCEoAAAAAYEJQAgAAAAATghIAAAAAmBCUAAAAAMCEoAQAAAAAJgQlAAAAADAhKAEAAACAyXUVlKZNmyabzabRo0dbXQoAAAAAD3bdBKWtW7dq/vz5atiwodWlAAAAAPBw10VQOnv2rB544AG98cYbKlOmjNXlAAAAAPBwpawuIC8ef/xxde3aVR06dNDzzz9/xXXT09OVnp7uvJ+amipJysjIUEZGRqHWieIp6+fOzx8o2egFgOTrbVhdgqV8vf55/RlefhZXYqES3gPz8zug2AelpUuXaseOHdq6dWue1p82bZomT56cbfzrr79WQECAu8vDdSQ+Pt7qEgAUA/QClGTTm1tdQfEQ3+A1q0uwzpdfWl2BpdLS0vK8rs0wjGL7vxaOHj2q6Ohoff3112rUqJEkKSYmRo0bN9bMmTNzfExOR5TCw8N18uRJhYSEFEXZKGYyMjIUHx+v2NhY2e12q8sBYBF6ASDVj1ttdQmW8vUy9Fy0Q7E/j5TdccHqcqwx8U+rK7BUamqqypcvr5SUlKtmg2J
9RGn79u1KSkrSzTff7BzLzMzUd999p9mzZys9PV3e3t4uj/H19ZWvr2+2bdntdn4xlnC8BwBI9AKUbOmZNqtLKBbsjgslNyiV8P6Xn/5frINS+/bt9fPPP7uMDRo0SHXq1NH48eOzhSQAAAAAcIdiHZSCg4NVv359l7HAwECVK1cu2zgAAAAAuMt1MT04AAAAABSlYn1EKSfr1q2zugQAAAAAHo4jSgAAAABgQlACAAAAABOCEgAAAACYEJQAAAAAwISgBAAAAAAmBCUAAAAAMCEoAQAAAIAJQQkAAAAATAhKAAAAAGBCUAIAAAAAE4ISAAAAAJgQlAAAAADAhKAEAAAAACYEJQAAAAAwISgBAAAAgEkpqwsAisy0GyTHBaursE5citUVAAAAXDc4ogQAAAAAJgQlAAAAADAhKAEAAACACUEJAAAAAEwISgAAAABgQlACAAAAABOCEgAAAACYEJQAAAAAwISgBAAAAAAmBCUAAAAAMCEoAQAAAIAJQQkAAAAATAhKAAAAAGBCUAIAAAAAE4ISAAAAAJgQlAAAAADAhKAEAAAAACYEJQAAAAAwISgBAAAAgAlBCQAAAABMCEoAAAAAYEJQAgAAAAATghIAAAAAmBCUAAAAAMCEoAQAAAAAJgQlAAAAADAhKAEAAACACUEJAAAAAEwISgAAAABgQlACAAAAABOCEgAAAACYEJQAAAAAwISgBAAAAAAmBCUAAAAAMCEoAQAAAIAJQQkAAAAATAhKAAAAAGBCUAIAAAAAk1JWFwAAQJGadoPkuGB1FdaIS7G6AgC4bnBECQAAAABMCEoAAAAAYEJQAgAAAAATghIAAAAAmDCZQwkROWGl1SVYxtfb0PTmVlcBAACA6wlHlAAAAADAhKAEAAAAACYEJQAAAAAwISgBAAAAgAlBCQAAAABMCEoAAAAAYEJQAgAAAAATghIAAAAAmBCUAAAAAMCEoAQAAAAAJgQlAAAAADAhKAEAAACASbEOStOmTVOzZs0UHByssLAw3XXXXfrtt9+sLgsAAACAhyvWQWn9+vV6/PHH9cMPPyg+Pl6XLl1Sx44dde7cOatLAwAAAODBSlldwJV89dVXLvcXLlyosLAwbd++XW3atLGoKgAAAACerlgHJbOUlBRJUtmyZXNdJz09Xenp6c77qampkqSMjAxlZGQUboHFmK+3YXUJlvH1+ue1Z3j5WVyJxUrw+x+Q5PwdUKJ7AX2gxCvJfw9I/E0gqcT3gfzkAZthGNfFJ8YwDPXo0UPJycnasGFDruvFxcVp8uTJ2caXLFmigICAwiwRAAAAQDGWlpamvn37KiUlRSEhIVdc97oJSo8//rhWrlypjRs36oYbbsh1vZyOKIWHh+vkyZNX3RmerH7caqtLsIyvl6Hnoh2K/Xmk7I4LVpdjnYl/Wl0BYKmMjAzFx8eX7F5AHyjxSvLfAxJ/E0gq8X0gNTVV5cuXz1NQui5OvRsxYoQ+++wzfffdd1cMSZLk6+srX1/fbON2u112u72wSiz20jNtVpdgObvjQsltipJUgt//wOVKdC+gD5R4/D3wD/pAyZWfPFCsg5JhGBoxYoSWL1+udevWqXr16laXBAAAAKAEKNZB6fHHH9eSJUv06aefKjg4WImJiZKk0NBQ+fv7W1wdAAAAAE9VrL9H6fXXX1dKSopiYmJUuXJl5+2DDz6wujQAAAAAHqxYH1G6TuaZAAAAAOBhivURJQAAAACwAkEJAAAAAEwISgAAAABgQlACAAAAABOCEgAAAACYEJQAAAAAwISgBAAAAAAmBCUAAAAAMCEoAQAAAIAJQQkAAAAATAhKAAAAAGBCUAIAAAAAE4ISAAAAAJgQlAAAAADAhKAEAAAAACYEJQAAAAAwISgBAAAAgAlBCQAAAABMCEoAAAAAYFLK6gIAAEUjcsJKq0uwlK+3oenNra4CAHC94IgSAAAAAJgQlAAAAADAhKAEAAAAACYEJQAAAAAwISgBAAAAgAl
BCQAAAABMCEoAAAAAYEJQAgAAAAATghIAAAAAmBCUAAAAAMCEoAQAAAAAJgQlAAAAADAhKAEAAACACUEJAAAAAEwISgAAAABgQlACAAAAABOCEgAAAACYEJQAAAAAwISgBAAAAAAmBCUAAAAAMCEoAQAAAIAJQQkAAAAATAhKAAAAAGBCUAIAAAAAE4ISAAAAAJgQlAAAAADAhKAEAAAAACYEJQAAAAAwISgBAAAAgAlBCQAAAABMCEoAAAAAYEJQAgAAAAATghIAAAAAmBCUAAAAAMCEoAQAAAAAJgQlAAAAADAhKAEAAACACUEJAAAAAEwISgAAAABgQlACAAAAABOCEgAAAACYEJQAAAAAwISgBAAAAAAmBCUAAAAAMCEoAQAAAIAJQQkAAAAATAhKAAAAAGBCUAIAAAAAE4ISAAAAAJgQlAAAAADAhKAEAAAAACYEJQAAAAAwISgBAAAAgMl1EZTmzJmj6tWry8/PTzfffLM2bNhgdUkAAAAAPFixD0offPCBRo8eraefflo7d+7Ubbfdps6dO+vIkSNWlwYAAADAQxX7oPTqq69qyJAheuihhxQVFaWZM2cqPDxcr7/+utWlAQAAAPBQpawu4EouXryo7du3a8KECS7jHTt21KZNm3J8THp6utLT0533U1JSJEmnTp1SRkZG4RVbzJW6dM7qEixTymEoLc2hvy/6yO5wWF2Odf7+2+oKYLGS3AckeoEk+gDoA/SBEt8Hzpw5I0kyDOOq6xbroHTy5EllZmaqYsWKLuMVK1ZUYmJijo+ZNm2aJk+enG28evXqhVIjrg99rS6gOJhW3uoKAMuV+F5AHwDoA/QBSf8EptDQ0CuuU6yDUhabzeZy3zCMbGNZJk6cqLFjxzrvOxwOnTp1SuXKlcv1MfBsqampCg8P19GjRxUSEmJ1OQAsQi8AQB+AYRg6c+aMqlSpctV1i3VQKl++vLy9vbMdPUpKSsp2lCmLr6+vfH19XcZKly5dWCXiOhISEkJTBEAvAEAfKOGudiQpS7GezMHHx0c333yz4uPjXcbj4+PVsmVLi6oCAAAA4OmK9RElSRo7dqwefPBBRUdHq0WLFpo/f76OHDmixx57zOrSAAAAAHioYh+U7rvvPv3999+aMmWKEhISVL9+fX355ZeKiIiwujRcJ3x9fTVp0qRsp2QCKFnoBQDoA8gPm5GXufEAAAAAoAQp1tcoAQAAAIAVCEoAAAAAYEJQAgAAAAATghIAAAAAmBCUAAAAAMCk2E8PDuRHSkqKli9frg0bNujQoUNKS0tThQoV1KRJE3Xq1IkvKgZKCHoBAPoACoojSvAICQkJevjhh1W5cmVNmTJF586dU+PGjdW+fXvdcMMNWrt2rWJjY1W3bl198MEHVpcLoJDQCwDQB+AuHFGCR2jUqJH69++vLVu2qH79+jmuc/78ea1YsUKvvvqqjh49qnHjxhVxlQAKG70AAH0A7sIXzsIjnDhxQhUqVCi09QFcH+gFAOgDcBeCEgAAAACYcOodPM7ff/+tcuXKSZKOHj2qN954Q+fPn9edd96p2267zeLqABQVegEA+gAKgiNK8Bg///yzunfvrqNHj+rGG2/U0qVLdccdd+jcuXPy8vLSuXPn9PHHH+uuu+6yulQAhYheAIA+AHdg1jt4jCeffFINGjTQ+vXrFRMTo27duqlLly5KSUlRcnKyHn30Ub344otWlwmgkNELANAH4A4cUYLHKF++vNasWaOGDRvq7NmzCgkJ0ZYtWxQdHS1J+vXXX3Xrrbfq9OnT1hYKoFDRCwDQB+AOHFGCxzh16pQqVaokSQoKClJgYKDKli3rXF6mTBmdOXPGqvIAFBF6AQD6ANyBoASPYrPZrngfQMlALwBAH0BBMesdPMrAgQPl6+srSbpw4YIee+wxBQYGSpLS09OtLA1AEaIXAKAPoKC4RgkeY9CgQXlab+HChYVcCQAr0QsA0AfgDgQlAAAAADDhGiUAAAAAMOE
aJXiMhIQEzZ49Wy+88IIkqXXr1kpLS3Mu9/b21ooVK1S1alWrSgRQBOgFAOgDcAeOKMFjzJkzx+X7EHbv3q3bbrtNPXr0UI8ePeTt7a3//Oc/1hUIoEjQCwDQB+AOXKMEj9G4cWPNmDFDsbGxkqTg4GDt3r1bNWrUkCStXr1aY8eO1Z49e6wsE0AhoxcAoA/AHTiiBI9x6NAh1axZ03k/NjbWOQ2oJNWuXVsHDx60ojQARYheAIA+AHfgGiV4jEuXLiklJcV5f9myZS7Lk5OT5eXF/xsAPB29AAB9AO7AOwQeo3bt2tq0aVOuyzds2KCbbrqpCCsCYAV6AQD6ANyBoASPcf/99+vZZ5/VTz/9lG3Z7t27NXnyZPXp08eCygAUJXoBAPoA3IHJHOAxMjIy1KFDB23atEmxsbGqXbu2bDabfv31V8XHx6tFixb69ttvZbfbrS4VQCGiFwCgD8AdCErwKBcvXtSrr76qpUuXav/+/ZKkG2+8UX369NGYMWPk6+trcYUAigK9AAB9AAVFUAIAAAAAE65RAgAAAAATghJKjN27d8vb29vqMgAUgZUrV+qhhx7Sk08+qX379rksS05O1u23325RZQCKQnBwsIYMGXLFme+AqyEooUThTFPA8y1ZskQ9evRQYmKiNm/erKZNm+q9995zLr948aLWr19vYYUACtu5c+f0448/qnXr1oqKitIrr7yipKQkq8vCdYZrlOAxevbsecXlKSkpWrdunTIzM4uoIgBWaNq0qQYNGqQRI0ZIkj7++GMNGjRIM2fO1JAhQ/TXX3+pSpUq9ALAg3l5eSkxMVEJCQl68803tWTJEp09e1bdunXTQw89pDvuuEM2m83qMlHMcUQJHuPzzz/XhQsXFBoamuMtKCjI6hIBFIH9+/erW7duzvv33HOPPv/8c40ZM0Zz5861sDIARa1Ro0aaNWuWEhIStGjRIqWkpKhbt26qVq2ann32WavLQzFXyuoCAHeJiopSr169NGTIkByX79q1S1988UURVwWgqIWEhOivv/5S9erVnWMxMTH6/PPP1a1bN/35558WVgegKJiPFvn4+KhPnz7q06ePDh06pAULFmjRokWaMmWKRRXiesARJXiMm2++WTt27Mh1ua+vr6pVq1aEFQGwQvPmzbVq1aps423bttXnn3+umTNnFn1RAIrUla4siYyM1HPPPafDhw8XYUW4HnFECR5j7ty5V7zmICoqSgcPHizCigBYYcyYMbnOdBUTE6MvvvhCixcvLuKqABSlSZMmXfWUe65RwtUwmQMAAAAAmHDqHTxa165dlZCQYHUZACxGLwBAH0B+EZTg0b777judP3/e6jIAWIxeAIA+gPwiKAEAAACACUEJHi0iIkJ2u93qMgBYjF4AgD6A/GIyBwAAAAAw4YgSPI55ivAff/xR3333nTIyMiyqCIAV6AUA6AMoCIISPEZCQoJat24tX19ftW3bVsnJyerWrZtatGihmJgY1a9fn9lugBKAXgCAPgB3ICjBY4wfP16GYWj58uWqXLmyunXrptTUVB09elSHDx9WxYoV9cILL1hdJoBCRi8AQB+AO3CNEjxGlSpVtGzZMt166606deqUypcvr/j4eLVv316StHbtWj300EM6cOCAxZUCKEz0AgD0AbgDR5TgMZKTk1W1alVJUtmyZRUQEKCIiAjn8po1a3KYHSgB6AUA6ANwB4ISPEZYWJhL0xs+fLjKli3rvJ+cnKzAwEArSgNQhOgFAOgDcAeCEjxG48aNtXnzZuf9F1980aUpbty4UQ0bNrSiNABFiF4AgD4Ad+AaJZQYW7dulb+/v+rXr291KQAsRC8AQB9AXhCUAAAAAMCklNUFAO5kGIa++eYbbdq0SYmJibLZbKpYsaJatWql9u3by2azWV0igCJALwBAH0BBcUQJHuPYsWPq1q2bfv75Z9WvX18VK1aUYRhKSkrSL7/8okaNGumzzz5zzoIDwDPRCwDQB+AOBCV
4jB49eujs2bN69913VblyZZdlCQkJ6tevn4KDg7VixQprCgRQJOgFAOgDcAeCEjxGUFCQvv/+ezVq1CjH5Tt37tRtt92ms2fPFnFlAIoSvQAAfQDuwPTg8Bj+/v46depUrsuTk5Pl7+9fhBUBsAK9AAB9AO5AUILHuP/++zVgwAB9/PHHSklJcY6npKTo448/1qBBg9S3b18LKwRQFOgFAOgDcAdmvYPHeOWVV3Tp0iU98MADunTpknx8fCRJFy9eVKlSpTRkyBBNnz7d4ioBFDZ6AQD6ANyBa5TgcVJTU7V9+3YlJiZKkipVqqSbb75ZISEhFlcGoCjRCwDQB1AQBCWUGEePHtWkSZP01ltvWV0KAAvRCwDQB5AXBCWUGLt371bTpk2VmZlpdSkALEQvAEAfQF4wmQMAAAAAmBCUAAAAAMCEoAQAAAAAJkwPDo/Rs2fPKy4/ffp00RQCwFL0AgD0AbgDQQkeIyQkRDabLdfloaGh6t+/fxFWBMAK9AIA9AG4A7PeAQAAAIAJ1yjBY3h7eyspKcnqMgBYjF4AgD4AdyAowWNwcBSARC8AQB+AexCUAAAAAMCEyRzgUVavXq3Q0NArrnPnnXcWUTUArEIvAEAfQEExmQM8hpfX1Q+Q2mw2ZWZmFkE1AKxCLwBAH4A7cOodPEpiYqIcDkeuNxoiUDLQCwDQB1BQBCV4jCt9XwKAkoNeAIA+AHcgKMFj5OUs0l27dhV+IQAsRS8AQB+AOxCU4DEGDBggf3//bOMpKSmaM2eOmjZtqptvvtmCygAUJXoBAPoA3IHJHOCx1qxZo7feekvLli1TRESEevXqpV69eqlJkyZWlwagCNELANAHcC2YHhwe5c8//9SiRYv01ltv6dy5c+rdu7cyMjL0ySefqG7dulaXB6CI0AsA0AdQUJx6B4/RpUsX1a1bV3v37tWsWbN0/PhxzZo1y+qyABQxegEA+gDcgSNK8Bhff/21Ro4cqaFDh+rGG2+0uhwAFqEXAKAPwB04ogSPsWHDBp05c0bR0dG65ZZbNHv2bJ04ccLqsgAUMXoBAPoA3IHJHOBx0tLStHTpUr311lvasmWLMjMz9eqrr2rw4MEKDg62ujwARYReAIA+gIIgKMGj/fbbb1qwYIHeeecdnT59WrGxsfrss8+sLgtAEaMXAKAPIL8ISigRMjMz9fnnn+utt96iKQIlGL0AAH0AeUVQAgAAAAATJnMAAAAAABOCEgAAAACYEJQAAAAAwISgBAAAAAAmBCWUKN99951SUlKsLgOAxegFAOgDuBqCEkqUmJgY1ahRQ6+88orVpQCwEL0AAH0AV0NQQoly8OBBffLJJzp58qTVpQCwEL0AAH0AV8P3KAEAAACASSmrCwAKy/bt27Vv3z7ZbDZFRUWpadOmVpcEwAL0AgD0AVwLghI8TlJSku6//36tW7dOpUuXlmEYSklJUbt27bR06VJVqFDB6hIBFAF6AQD6AAqCa5TgcUaMGKHU1FTt2bNHp06dUnJysn755RelpqZq5MiRVpcHoIjQCwDQB1AQXKMEjxMaGqpvvvlGzZo1cxnfsmWLOnbsqNOnT1tTGIAiRS8AQB9AQXBECR7H4XDIbrdnG7fb7XI4HBZUBMAK9AIA9AEUBEEJHuf222/XqFGjdPz4cefYsWPHNGbMGLVv397CygAUJXoBAPoACoJT7+Bxjh49qh49euiXX35ReHi4bDabjhw5ogYNGujTTz/VDTfcYHWJAIoAvQAAfQAFQVCCx4qPj9evv/4qwzBUt25ddejQweqSAFiAXgCAPoBrQVCCR7l06ZL8/Py0a9cu1a9f3+pyAFiEXgCAPoCC4holeJRSpUopIiJCmZmZVpcCwEL0AgD0ARQUQQke55lnntHEiRN16tQpq0sBYCF6AQD6AAqCU+/gcZo0aaLff/9dGRkZioiIUGBgoMvyHTt2WFQZgKJELwBAH0BBlLK6AMDd7rrrLqtLAFAM0AsA0AdQEBx
RAgAAAAATjijBY23fvl379u2TzWZT3bp11aRJE6tLAmABegEA+gCuBUEJHicpKUn333+/1q1bp9KlS8swDKWkpKhdu3ZaunSpKlSoYHWJAIoAvQAAfQAFwax38DgjRoxQamqq9uzZo1OnTik5OVm//PKLUlNTNXLkSKvLA1BE6AUA6AMoCK5RgscJDQ3VN998o2bNmrmMb9myRR07dtTp06etKQxAkaIXAKAPoCA4ogSP43A4ZLfbs43b7XY5HA4LKgJgBXoBAPoACoKgBI9z++23a9SoUTp+/Lhz7NixYxozZozat29vYWUAihK9AAB9AAXBqXfwOEePHlWPHj30yy+/KDw8XDabTUeOHFGDBg306aef6oYbbrC6RABFgF4AgD6AgiAowWPFx8fr119/lWEYqlu3rjp06GB1SQAsQC8AQB/AtSAoAQAAAIAJ1yjB44wcOVKvvfZatvHZs2dr9OjRRV8QAEvQCwDQB1AQBCV4nE8++UStWrXKNt6yZUt9/PHHFlQEwAr0AgD0ARQEQQke5++//1ZoaGi28ZCQEJ08edKCigBYgV4AgD6AgiAowePUqlVLX331VbbxVatWqUaNGhZUBMAK9AIA9AEURCmrCwDcbezYsRo+fLhOnDih22+/XZL07bff6pVXXtHMmTOtLQ5AkaEXAKAPoCCY9Q4e6fXXX9cLL7zg/IK5yMhIxcXFqX///hZXBqAo0QsA0AdwrQhK8GgnTpyQv7+/goKCrC4FgIXoBQDoA8gvghIAAAAAmDCZAzzG/v37dXnu37hxo+666y7Vq1dPHTp00KeffmphdQCKCr0AAH0A7kBQgseIiorSiRMnJEnr1q1T27Zt5XA49MADD6h06dLq2bOnVq9ebXGVAAobvQAAfQDuwKl38BheXl5KTExUWFiYOnTooNq1a+v//u//nMsnTpyoTZs2af369RZWCaCw0QsA0AfgDhxRgkfau3dvttlsHnzwQe3Zs8eiigBYgV4AgD6Aa8X3KMGjnDlzRn5+fvL395evr6/LMh8fH50/f96iygAUJXoBAPoACoqgBI9y0003SZIMw9D27dvVuHFj57I9e/aoatWqFlUGoCjRCwDQB1BQBCV4jLVr17rcr1y5ssv9Q4cO6eGHHy7KkgBYgF4AgD4Ad2AyBwAAAAAw4YgSPNbFixeVlJQkh8PhMl6tWjWLKgJgBXoBAPoArgVBCR5n//79GjJkiDZt2uQybhiGbDabMjMzLaoMQFGiFwCgD6AgCErwOIMGDVKpUqX0xRdfqHLlyrLZbFaXBMAC9AIA9AEUBNcoweMEBgZq+/btqlOnjtWlALAQvQAAfQAFwRfOwuPUrVtXJ0+etLoMABajFwCgD6AgOKIEj5Camur897Zt2/TMM89o6tSpatCggex2u8u6ISEhRV0egCJCLwBAH4C7EJTgEby8vFzOO866SPNyXLgJeD56AQD6ANyFyRzgEcxfLAegZKIXAKAPwF04ogQAAAAAJkzmAI9w5MiRfK1/7NixQqoEgJXoBQDoA3AXghI8QrNmzfTwww9ry5Ytua6TkpKiN954Q/Xr19eyZcuKsDoARYVeAIA+AHfhGiV4hH379mnq1Km64447ZLfbFR0drSpVqsjPz0/Jycnau3ev9uzZo+joaM2YMUOdO3e2umQAhYBeAIA+AHfhGiV4lAsXLujLL7/Uhg0bdOjQIZ0/f17ly5dXkyZN1KlTJ9WvX9/qEgEUAXoBAPoACoqgBAAAAAAmXKMEAAAAACYEJQAAAAAwISgBAAAAgAlBCQAAAABMCEoAAAAAYEJQAgDAJC4uTo0bN7a6DACAhQhKAIDrjs1mu+Jt4MCBVpcIALjOlbK6AAAA8ishIcH57w8++EDPPvusfvvtN+eYv7+/FWUBADwIR5QAANedSpUqOW+hoaGy2WwuY0uWLFHNmjXl4+Oj2rVr65133nF5/JEjR9SjRw8FBQUpJCREvXv31l9//WXRqwEAFEcEJQC
AR1m+fLlGjRqlf/3rX/rll1/06KOPatCgQVq7dq0kyTAM3XXXXTp16pTWr1+v+Ph4HThwQPfdd5/FlQMAihNOvQMAeJSXX35ZAwcO1LBhwyRJY8eO1Q8//KCXX35Z7dq10zfffKOffvpJBw8eVHh4uCTpnXfeUb169bR161Y1a9bMyvIBAMUER5QAAB5l3759atWqlctYq1attG/fPufy8PBwZ0iSpLp166p06dLOdQAAICgBADyOzWZzuW8YhnPs8n/ntg4AAAQlAIBHiYqK0saNG13GNm3apKioKEn/HD06cuSIjh496ly+d+9epaSkONcBAIBrlAAAHuWJJ55Q79691bRpU7Vv316ff/65li1bpm+++UaS1KFDBzVs2FAPPPCAZs6cqUuXLmnYsGFq27atoqOjLa4eAFBccEQJAOBR7rrrLv33v//VjBkzVK9ePc2bN08LFy5UTEyMpH9Oy1uxYoXKlCmjNm3aqEOHDqpRo4Y++OADawsHABQrNsMwDKuLAAAAAIDihCNKAAAAAGBCUAIAAAAAE4ISAAAAAJgQlAAAAADAhKAEAAAAACYEJQAAAAAwISgBAAAAgAlBCQAAAABMCEoAAAAAYEJQAgAAAAATghIAAAAAmPw/xrMnNnSQQIUAAAAASUVORK5CYII=\n","text/plain":"
"},"metadata":{}}],"id":"efe41d4a-1947-438b-a3c3-7ab954d75e13"},{"cell_type":"markdown","source":"### Xarray + kerchunk, out of the box performance.","metadata":{"tags":[],"user_expressions":[]},"id":"8f0ba64d-d89c-4879-b965-f00d70956360"},{"cell_type":"code","source":"# this is going to keep our numbers without modifying the i/o parameters\nregular_xarray_benchmarks = []\nkerchunk_benchmarks = []\n\nfor key, dataset in test_dict.items():\n for k, link in dataset[\"links\"].items():\n print (f\"Processing: {link}\")\n try:\n log_filename = f\"logs/fsspec-xarray-{key}-{k}-default.log\"\n \n # Create a new FileHandler for each iteration\n file_handler = logging.FileHandler(log_filename)\n file_handler.setLevel(logging.DEBUG)\n\n # Add the handler to the root logger\n logging.getLogger().addHandler(file_handler)\n \n start = time.time()\n if \"kerchunk\" in link:\n data_mean = kerchunk_result(link, dataset[\"group\"], dataset[\"variable\"])\n elapsed = time.time() - start\n kerchunk_benchmarks.append(\n {\"tool\": \"kerchunk\",\n \"dataset\": key,\n \"cloud-aware\": \"no\",\n \"format\": k,\n \"file\": link,\n \"time\": elapsed,\n \"mean\": data_mean}) \n else:\n ds = xr.open_dataset(fs.open(link, mode='rb'), group=dataset[\"group\"], engine=\"h5netcdf\", decode_cf=False)\n data_mean = ds[dataset[\"variable\"]].mean() \n elapsed = time.time() - start\n regular_xarray_benchmarks.append(\n {\"tool\": \"xarray\",\n \"dataset\": key,\n \"cloud-aware\": \"no\",\n \"format\": k,\n \"file\": link,\n \"time\": elapsed,\n \"mean\": data_mean}) \n \n logging.getLogger().removeHandler(file_handler)\n file_handler.close()\n\n except Exception as e:\n print(e)","metadata":{"trusted":true,"tags":[]},"execution_count":22,"outputs":[{"name":"stdout","output_type":"stream","text":"Processing: s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5\nProcessing:
s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/original/kerchunk/atl03_ATL03_20181120182818_08110112_006_02.json\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/kerchunk/atl03_ATL03_20181120182818_08110112_006_02_repacked.json\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20210402143840_01341107_006_02.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20210402143840_01341107_006_02_repacked.h5\n"}],"id":"ff56958f-8c1d-4fd7-b885-6efb81af8da7"},{"cell_type":"markdown","source":"### Plotting Results","metadata":{"tags":[],"user_expressions":[]},"id":"92a8e67d-026e-4c6b-aa7d-b19dc10f4afd"},{"cell_type":"code","source":"df = pd.DataFrame.from_dict(kerchunk_benchmarks + regular_xarray_benchmarks)\n\npivot_df = df.pivot_table(index=['tool','dataset'], columns=['format'], values='time', aggfunc='mean')\n\n# Plotting\npivot_df.plot(kind='bar', figsize=(10, 6))\n\nplt.title(\"Out of the box I/O parameters\", fontsize=10)\nplt.suptitle('Cloud-optimized HDF5 performance (less is better)', fontsize=14)\n\nplt.xlabel('Tool')\nplt.ylabel('Mean Time')\nplt.xticks(rotation=90)\nplt.legend(title='Format')\nplt.grid(True)\nplt.show()","metadata":{"trusted":true,"tags":[]},"execution_count":24,"outputs":[{"output_type":"display_data","data":{"image/png":"\n","text/plain":"
"},"metadata":{}}],"id":"149d5972-c5b9-4f29-979a-cf46c9654a06"},{"cell_type":"markdown","source":"## h5py out of the box performance.","metadata":{"tags":[],"user_expressions":[]},"id":"fa6ac2b9-989c-4246-bb89-b54b711dd695"},{"cell_type":"code","source":"regular_h5py_benchmarks = []\n\nfor key, dataset in test_dict.items():\n for k, link in dataset[\"links\"].items():\n try:\n if \"kerchunk\" in link:\n continue \n print (f\"Processing: {link}\")\n log_filename = f\"logs/fsspec-h5py-{key}-{k}_default.log\"\n \n # Create a new FileHandler for each iteration\n file_handler = logging.FileHandler(log_filename)\n file_handler.setLevel(logging.DEBUG)\n\n # Add the handler to the root logger\n logging.getLogger().addHandler(file_handler)\n # this is mostly IO so no perf_counter is needed\n start = time.time()\n with h5py.File(fs.open(link, mode=\"rb\")) as f:\n path = f\"{dataset['group']}/{dataset['variable']}\"\n data = f[path][:]\n data_mean = data.mean()\n elapsed = time.time() - start\n regular_h5py_benchmarks.append(\n {\"tool\": \"h5py\",\n \"dataset\": key,\n \"cloud-aware\": \"no\",\n \"format\": k,\n \"file\": link,\n \"time\": elapsed,\n \"mean\": data_mean})\n\n logging.getLogger().removeHandler(file_handler) \n file_handler.close()\n \n except Exception as e:\n print(e)","metadata":{"trusted":true,"tags":[]},"execution_count":25,"outputs":[{"name":"stdout","output_type":"stream","text":"Processing: s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20210402143840_01341107_006_02.h5\nProcessing: 
s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20210402143840_01341107_006_02_repacked.h5\n"}],"id":"98c29558-de50-44af-87e9-074092fcd0ac"},{"cell_type":"markdown","source":"### Plotting Results","metadata":{"tags":[],"user_expressions":[]},"id":"f4232e98-1159-45eb-ba11-0f0dbb905d83"},{"cell_type":"code","source":"df = pd.DataFrame.from_dict(regular_h5py_benchmarks)\n\npivot_df = df.pivot_table(index=['tool','dataset'], columns=['format'], values='time', aggfunc='mean')\n\n# Plotting\npivot_df.plot(kind='bar', figsize=(10, 6))\nplt.suptitle('Cloud-optimized HDF5 performance (less is better)', fontsize=14)\nplt.title(\"Out of the box I/O parameters\", fontsize=10)\n\nplt.xlabel('Tool')\nplt.ylabel('Mean Time')\nplt.xticks(rotation=45)\nplt.legend(title='Format')\nplt.grid(True)\nplt.show()","metadata":{"trusted":true,"tags":[]},"execution_count":26,"outputs":[{"output_type":"display_data","data":{"image/png":"\n","text/plain":"
"},"metadata":{}}],"id":"d8fa6dca-f408-4298-beca-f2839d4c3b67"},{"cell_type":"markdown","source":"## Aggregated plot by tool and different file sizes","metadata":{"tags":[],"user_expressions":[]},"id":"b20b2032-9ab4-46e1-b1f8-2e62b656a265"},{"cell_type":"code","source":"df = pd.DataFrame.from_dict(regular_h5py_benchmarks + kerchunk_benchmarks + regular_xarray_benchmarks + h5coro_beanchmarks)\n\npivot_df = df.pivot_table(index=['tool','dataset'], columns=['format'], values='time', aggfunc='mean')\n\n# Plotting\npivot_df.plot(kind='bar', figsize=(10, 6))\nplt.suptitle('Cloud-optimized HDF5 performance (less is better)', fontsize=14)\nplt.title(\"Out of the box I/O parameters\", fontsize=10)\nplt.xlabel('Tool')\nplt.ylabel('Mean Time')\nplt.xticks(rotation=90)\nplt.legend(title='Format')\nplt.grid(True)\nplt.show()","metadata":{"trusted":true,"tags":[]},"execution_count":27,"outputs":[{"output_type":"display_data","data":{"image/png":"\n","text/plain":"
"},"metadata":{}}],"id":"64bcc5de-aae3-46aa-9474-1c90b9ff20a9"},{"cell_type":"markdown","source":"## Now let's run the tests with \"informed\" parameters: I/O settings that align with the cloud-optimized granules' chunking strategy and consolidated metadata.\n","metadata":{"tags":[],"user_expressions":[]},"id":"0ea67b0b-5e7f-4d1f-bca9-1f3cae7fe309"},{"cell_type":"code","source":"optimized_h5py_benchmarks = []\noptimized_xarray_benchmarks = []\n\nfor key, dataset in test_dict.items():\n for k, link in dataset[\"links\"].items():\n print(f\"Processing: {link}\")\n try:\n log_filename = f\"logs/fsspec-xarray-{key}-{k}.log\"\n \n # Create a new FileHandler for each iteration\n file_handler = logging.FileHandler(log_filename)\n file_handler.setLevel(logging.DEBUG)\n\n # Add the handler to the root logger\n logging.getLogger().addHandler(file_handler)\n \n io_params = {\n \"fsspec_params\": {},\n \"h5py_params\": {}\n }\n \n if \"repacked\" in link: \n io_params ={\n \"fsspec_params\": {\n \"cache_type\": \"blockcache\",\n \"block_size\": 8*1024*1024\n },\n \"h5py_params\" : {\n \"driver_kwds\": {\n \"page_buf_size\": 64*1024*1024,\n \"rdcc_nbytes\": 8*1024*1024\n }\n\n }\n }\n\n if \"kerchunk\" in link:\n continue\n \n start = time.time()\n ds = xr.open_dataset(fs.open(link, mode='rb', **io_params[\"fsspec_params\"]), group=dataset[\"group\"], engine=\"h5netcdf\", decode_cf=False)\n data_mean = ds[dataset[\"variable\"]].mean()\n elapsed = time.time() - start\n optimized_xarray_benchmarks.append(\n {\"tool\": \"xarray\",\n \"dataset\": key,\n \"cloud-aware\": \"yes\",\n \"format\": k,\n \"file\": link,\n \"time\": elapsed,\n \"mean\": data_mean})\n \n logging.getLogger().removeHandler(file_handler)\n file_handler.close()\n\n except Exception as e:\n print(e)\n \nfor key, dataset in test_dict.items():\n for k, link in dataset[\"links\"].items():\n try:\n if \"kerchunk\" in link:\n continue \n print (f\"Processing: {link}\")\n log_filename =
f\"logs/fsspec-h5py-{key}-{k}_default.log\"\n \n # Create a new FileHandler for each iteration\n file_handler = logging.FileHandler(log_filename)\n file_handler.setLevel(logging.DEBUG)\n\n # Add the handler to the root logger\n logging.getLogger().addHandler(file_handler)\n # this is mostly IO so no perf_counter is needed\n start = time.time()\n io_params = {\n \"fsspec_params\": {},\n \"h5py_params\": {}\n }\n \n if \"repacked\" in link: \n io_params ={\n \"fsspec_params\": {\n \"cache_type\": \"blockcache\",\n \"block_size\": 8*1024*1024\n },\n \"h5py_params\" : {\n \"page_buf_size\": 64*1024*1024,\n \"rdcc_nbytes\": 8*1024*1024\n }\n } \n with h5py.File(fs.open(link, mode=\"rb\", **io_params[\"fsspec_params\"]), **io_params[\"h5py_params\"]) as f:\n path = f\"{dataset['group']}/{dataset['variable']}\"\n data = f[path][:]\n data_mean = data.mean()\n elapsed = time.time() - start\n optimized_h5py_benchmarks.append(\n {\"tool\": \"h5py\",\n \"dataset\": key,\n \"cloud-aware\": \"yes\",\n \"format\": k,\n \"file\": link,\n \"time\": elapsed,\n \"mean\": data_mean})\n\n logging.getLogger().removeHandler(file_handler) \n file_handler.close()\n \n\n except Exception as e:\n print(e)","metadata":{"trusted":true,"tags":[]},"execution_count":28,"outputs":[{"name":"stdout","output_type":"stream","text":"Processing: s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/original/kerchunk/atl03_ATL03_20181120182818_08110112_006_02.json\nProcessing: 
s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/kerchunk/atl03_ATL03_20181120182818_08110112_006_02_repacked.json\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20210402143840_01341107_006_02.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20210402143840_01341107_006_02_repacked.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20210402143840_01341107_006_02.h5\nProcessing: s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20210402143840_01341107_006_02_repacked.h5\n"}],"id":"8151834b-0b57-4a3d-98b5-8cfaffa37dc4"},{"cell_type":"markdown","source":"## Plotting results","metadata":{"tags":[],"user_expressions":[]},"id":"04414c2e-0666-4701-8ecc-7842727ede22"},{"cell_type":"code","source":"df = pd.DataFrame.from_dict(optimized_h5py_benchmarks+h5coro_beanchmarks+optimized_xarray_benchmarks+kerchunk_benchmarks)\n\npivot_df = df.pivot_table(index=['tool','dataset'], columns=['format'], values='time', aggfunc='mean')\n\n# Plotting\npivot_df.plot(kind='bar', figsize=(10, 6))\n\nplt.suptitle('Cloud-optimized HDF5 performance (less is better)', fontsize=14)\nplt.title(\"Informed I/O parameters\", fontsize=10)\nplt.xlabel('Tool')\nplt.ylabel('Mean Time')\nplt.xticks(rotation=90)\nplt.legend(title='Format')\nplt.grid(True)\nplt.show()","metadata":{"trusted":true,"tags":[]},"execution_count":29,"outputs":[{"output_type":"display_data","data":{"image/png":"\n","text/plain":"
"},"metadata":{}}],"id":"2db2535a-8d3a-4e65-b21c-8db6b48074c8"},{"cell_type":"markdown","source":"## Pliting tool specific performance","metadata":{"tags":[],"user_expressions":[]},"id":"ea0db03e-5653-4908-ada1-16d723666e18"},{"cell_type":"code","source":"df = pd.DataFrame.from_dict(regular_xarray_benchmarks+optimized_xarray_benchmarks)\n\npivot_df = df.pivot_table(index=['dataset','cloud-aware'], columns=['format'], values='time', aggfunc='mean')\n\n# Plotting\npivot_df.plot(kind='bar', figsize=(10, 6))\nplt.title('Xarray \"Cloud-awared\" Access Pattern Performance (less is better)')\nplt.xlabel('Tool')\nplt.ylabel('Mean Time')\nplt.xticks(rotation=90)\nplt.legend(title='Format')\nplt.grid(True)\nplt.show()","metadata":{"trusted":true,"tags":[]},"execution_count":32,"outputs":[{"output_type":"display_data","data":{"image/png":"\n","text/plain":"
"},"metadata":{}}],"id":"47444e8a-6d59-42c2-baff-a3c85c447eb2"}]} \ No newline at end of file +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6c9b37e2-2daa-4283-a228-ea581498de0c", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "## AB testing access time for ICESat-2 ATL03 HDF5 files in the cloud.\n", + "\n", + "This notebook requires that we have 2 versions of the same file:\n", + " * Original A: The original file with no modifications at an S3 location.\n", + " * Test Case B: A modified version of the original file to test for metadata consolidation, rechunking and other strategies to speed up access to the data in the file.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aaca84b1-46e9-4b41-a494-24da3a368f38", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!mamba uninstall -y h5coro \n", + "%pip install git+https://github.com/ICESat2-SlideRule/h5coro.git" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b78fb94-10ae-48cb-8e30-521b2c8b7822", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import xarray as xr\n", + "import h5py\n", + "import fsspec\n", + "import logging\n", + "import re\n", + "import time\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "\n", + "from h5coro import h5coro, s3driver, filedriver\n", + "driver = s3driver.S3Driver\n", + "\n", + "logger = logging.getLogger('fsspec')\n", + "logger.setLevel(logging.DEBUG)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "431d900d-0656-4b75-af6b-82f0f171d5f8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "for library in (xr, h5py, fsspec, h5coro):\n", + " print(f'{library.__name__} v{library.__version__}')" + ] + }, + { + "cell_type": "markdown", + "id": "7998cd99-6034-4a1b-9ae5-d651bc265bff", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "For listing
files in CryoCloud\n", + "\n", + "```bash\n", + "aws s3 ls s3://nasa-cryo-persistent/h5cloud/ --recursive\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9850faac-f534-4bc2-9214-c8dababe0f52", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "test_dict = {\n", + " \"ATL03-1GB\": {\n", + " \"links\": {\n", + " \"original\": \"s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5\",\n", + " \"optimized\": \"s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5\"\n", + " },\n", + " \"group\": \"/gt1l/heights\",\n", + " \"variable\": \"h_ph\",\n", + " \"processing\": [\n", + " \"h5repack -S PAGE -G 8000000\"\n", + " ]\n", + " },\n", + " \"ATL03-7GB\": {\n", + " \"links\": {\n", + " \"original\": \"s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5\",\n", + " \"optimized\": \"s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5\",\n", + " },\n", + " \"group\": \"/gt1l/heights\",\n", + " \"variable\": \"h_ph\",\n", + " \"processing\": [\n", + " \"h5repack -S PAGE -G 8000000\"\n", + " ]\n", + " },\n", + " \"ATL03-7GB-kerchunk\": {\n", + " \"links\": {\n", + " \"original\": \"s3://nasa-cryo-persistent/h5cloud/atl03/big/original/kerchunk/atl03_ATL03_20181120182818_08110112_006_02.json\",\n", + " \"optimized\": \"s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/kerchunk/atl03_ATL03_20181120182818_08110112_006_02_repacked.json\",\n", + " },\n", + " \"group\": \"/gt1l/heights\",\n", + " \"variable\": \"h_ph\",\n", + " \"processing\": [\n", + " \"h5repack -S PAGE -G 8000000\"\n", + " ]\n", + " }, \n", + " \"ATL03-2GB\": {\n", + " \"links\": {\n", + " \"original\": \"s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20210402143840_01341107_006_02.h5\",\n", + " \"optimized\": 
\"s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20210402143840_01341107_006_02_repacked.h5\",\n", + " },\n", + " \"group\": \"/gt1l/heights\",\n", + " \"variable\": \"h_ph\",\n", + " \"processing\": [\n", + " \"h5repack -S PAGE -G 8000000\"\n", + " ]\n", + " }\n", + "}\n", + "\n", + "def kerchunk_result(file: str, dataset: str, variable: str):\n", + " fs = fsspec.filesystem(\n", + " \"reference\",\n", + " fo=file,\n", + " remote_protocol=\"s3\",\n", + " remote_options=dict(anon=False),\n", + " skip_instance_cache=True,\n", + " )\n", + " ds = xr.open_dataset(\n", + " fs.get_mapper(\"\"), engine=\"zarr\", consolidated=False, group=dataset\n", + " )\n", + " return ds[variable].mean()\n", + "\n", + "# This will use the embedded credentials in the hub to access the s3://nasa-cryo-persistent bucket\n", + "fs = fsspec.filesystem('s3')\n" + ] + }, + { + "cell_type": "markdown", + "id": "4d166627-6144-40bf-884d-2188e5c764ba", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "## [h5coro](https://github.com/ICESat2-SlideRule/h5coro/)\n", + "\n", + "**h5coro** is optimized for reading HDF5 data in high-latency high-throughput environments. It accomplishes this through a few key design decisions:\n", + "* __All reads are concurrent.__ Each dataset and/or attribute read by **h5coro** is performed in its own thread.\n", + "* __Intelligent range gets__ are used to read as many dataset chunks as possible in each read operation. This drastically reduces the number of HTTP requests to S3 and means there is no longer a need to re-chunk the data (it actually works better on smaller chunk sizes due to the granularity of the request).\n", + "* __Block caching__ is used to minimize the number of GET requests made to S3. S3 has a large first-byte latency (we've measured it at ~60ms on our systems), which means there is a large penalty for each read operation performed. 
**h5coro** performs all reads to S3 as large block reads and then maintains data in a local cache for access to smaller amounts of data within those blocks.\n", + "* __The system is serverless__ and does not depend on any external services to read the data. This means it scales naturally as the user application scales, and it reduces overall system complexity.\n", + "* __No metadata repository is needed.__ The structure of the file is cached as it is read so that successive reads to other datasets in the same file will not have to re-read and re-build the directory structure of the file.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "efe41d4a-1947-438b-a3c3-7ab954d75e13", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "h5coro_beanchmarks = []\n", + "\n", + "for key, dataset in test_dict.items():\n", + " for k, link in dataset[\"links\"].items():\n", + " print (f\"Processing: {link}\")\n", + " if \"kerchunk\" in link:\n", + " continue\n", + " group = dataset[\"group\"]\n", + " variable = dataset['variable'] \n", + " final_h5coro_array = []\n", + " start = time.time()\n", + " if link.startswith(\"s3://nasa-cryo-persistent/\"):\n", + " h5obj = h5coro.H5Coro(link.replace(\"s3://\", \"\"), s3driver.S3Driver)\n", + " else:\n", + " h5obj = h5coro.H5Coro(link.replace(\"s3://\", \"\"), s3driver.S3Driver, credentials={\"annon\": True})\n", + " ds = h5obj.readDatasets(datasets=[f'{group}/{variable}'], block=True)\n", + " data = ds[f'{group}/{variable}'][:]\n", + " data_mean = np.mean(data)\n", + " elapsed = time.time() - start\n", + " \n", + " h5coro_beanchmarks.append({\"tool\": \"h5coro\",\n", + " \"dataset\": key,\n", + " \"cloud-aware\": \"no\",\n", + " \"format\": k,\n", + " \"file\": link,\n", + " \"time\": elapsed,\n", + " \"mean\": data_mean})\n", + "\n", + "\n", + "df = pd.DataFrame.from_dict(h5coro_beanchmarks)\n", + "\n", + "pivot_df = df.pivot_table(index=['tool','dataset'], columns=['format'], values='time',
aggfunc='mean')\n", + "\n", + "# Plotting\n", + "pivot_df.plot(kind='bar', figsize=(10, 6))\n", + "plt.title('h5coro cloud optimized HDF5 performance')\n", + "plt.xlabel('Tool')\n", + "plt.ylabel('Mean Time')\n", + "plt.xticks(rotation=90)\n", + "plt.legend(title='Format')\n", + "plt.grid(True)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "8f0ba64d-d89c-4879-b965-f00d70956360", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "### Xarray + kerchunk, out of the box performance." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff56958f-8c1d-4fd7-b885-6efb81af8da7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# this is going to keep our numbers without modifying the I/O parameters\n", + "regular_xarray_benchmarks = []\n", + "kerchunk_benchmarks = []\n", + "\n", + "for key, dataset in test_dict.items():\n", + " for k, link in dataset[\"links\"].items():\n", + " print (f\"Processing: {link}\")\n", + " try:\n", + " log_filename = f\"logs/fsspec-xarray-{key}-{k}-default.log\"\n", + " \n", + " # Create a new FileHandler for each iteration\n", + " file_handler = logging.FileHandler(log_filename)\n", + " file_handler.setLevel(logging.DEBUG)\n", + "\n", + " # Add the handler to the root logger\n", + " logging.getLogger().addHandler(file_handler)\n", + " \n", + " start = time.time()\n", + " if \"kerchunk\" in link:\n", + " data_mean = kerchunk_result(link, dataset[\"group\"], dataset[\"variable\"])\n", + " elapsed = time.time() - start\n", + " kerchunk_benchmarks.append(\n", + " {\"tool\": \"kerchunk\",\n", + " \"dataset\": key,\n", + " \"cloud-aware\": \"no\",\n", + " \"format\": k,\n", + " \"file\": link,\n", + " \"time\": elapsed,\n", + " \"mean\": data_mean}) \n", + " else:\n", + " ds = xr.open_dataset(fs.open(link, mode='rb'), group=dataset[\"group\"], engine=\"h5netcdf\", decode_cf=False)\n", + " data_mean = ds[dataset[\"variable\"]].mean() \n", + " elapsed = time.time() -
start\n", + " regular_xarray_benchmarks.append(\n", + " {\"tool\": \"xarray\",\n", + " \"dataset\": key,\n", + " \"cloud-aware\": \"no\",\n", + " \"format\": k,\n", + " \"file\": link,\n", + " \"time\": elapsed,\n", + " \"mean\": data_mean}) \n", + " \n", + " logging.getLogger().removeHandler(file_handler)\n", + " file_handler.close()\n", + "\n", + " except Exception as e:\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "id": "92a8e67d-026e-4c6b-aa7d-b19dc10f4afd", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "### Plotting Results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "149d5972-c5b9-4f29-979a-cf46c9654a06", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df = pd.DataFrame.from_dict(kerchunk_benchmarks + regular_xarray_benchmarks)\n", + "\n", + "pivot_df = df.pivot_table(index=['tool','dataset'], columns=['format'], values='time', aggfunc='mean')\n", + "\n", + "# Plotting\n", + "pivot_df.plot(kind='bar', figsize=(10, 6))\n", + "\n", + "plt.title(\"Out of the box I/O parameters\", fontsize=10)\n", + "plt.suptitle('Cloud-optimized HDF5 performance (less is better)', fontsize=14)\n", + "\n", + "plt.xlabel('Tool')\n", + "plt.ylabel('Mean Time')\n", + "plt.xticks(rotation=90)\n", + "plt.legend(title='Format')\n", + "plt.grid(True)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "fa6ac2b9-989c-4246-bb89-b54b711dd695", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "## h5py out of the box performance." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98c29558-de50-44af-87e9-074092fcd0ac", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "regular_h5py_benchmarks = []\n", + "\n", + "for key, dataset in test_dict.items():\n", + " for k, link in dataset[\"links\"].items():\n", + " try:\n", + " if \"kerchunk\" in link:\n", + " continue \n", + " print (f\"Processing: {link}\")\n", + " log_filename = f\"logs/fsspec-h5py-{key}-{k}_default.log\"\n", + " \n", + " # Create a new FileHandler for each iteration\n", + " file_handler = logging.FileHandler(log_filename)\n", + " file_handler.setLevel(logging.DEBUG)\n", + "\n", + " # Add the handler to the root logger\n", + " logging.getLogger().addHandler(file_handler)\n", + " # this is mostly IO so no perf_counter is needed\n", + " start = time.time()\n", + " with h5py.File(fs.open(link, mode=\"rb\")) as f:\n", + " path = f\"{dataset['group']}/{dataset['variable']}\"\n", + " data = f[path][:]\n", + " data_mean = data.mean()\n", + " elapsed = time.time() - start\n", + " regular_h5py_benchmarks.append(\n", + " {\"tool\": \"h5py\",\n", + " \"dataset\": key,\n", + " \"cloud-aware\": \"no\",\n", + " \"format\": k,\n", + " \"file\": link,\n", + " \"time\": elapsed,\n", + " \"mean\": data_mean})\n", + "\n", + " logging.getLogger().removeHandler(file_handler) \n", + " file_handler.close()\n", + " \n", + " except Exception as e:\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "id": "f4232e98-1159-45eb-ba11-0f0dbb905d83", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "### Plotting Results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8fa6dca-f408-4298-beca-f2839d4c3b67", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df = pd.DataFrame.from_dict(regular_h5py_benchmarks)\n", + "\n", + "pivot_df = df.pivot_table(index=['tool','dataset'], columns=['format'], values='time', aggfunc='mean')\n", + "\n", + "# 
Plotting\n", + "pivot_df.plot(kind='bar', figsize=(10, 6))\n", + "plt.suptitle('Cloud-optimized HDF5 performance (less is better)', fontsize=14)\n", + "plt.title(\"Out of the box I/O parameters\", fontsize=10)\n", + "\n", + "plt.xlabel('Tool')\n", + "plt.ylabel('Mean Time')\n", + "plt.xticks(rotation=45)\n", + "plt.legend(title='Format')\n", + "plt.grid(True)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "b20b2032-9ab4-46e1-b1f8-2e62b656a265", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "## Aggregated plot by tool and different file sizes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64bcc5de-aae3-46aa-9474-1c90b9ff20a9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df = pd.DataFrame.from_dict(regular_h5py_benchmarks + kerchunk_benchmarks + regular_xarray_benchmarks + h5coro_beanchmarks)\n", + "\n", + "pivot_df = df.pivot_table(index=['tool','dataset'], columns=['format'], values='time', aggfunc='mean')\n", + "\n", + "# Plotting\n", + "pivot_df.plot(kind='bar', figsize=(10, 6))\n", + "plt.suptitle('Cloud-optimized HDF5 performance (less is better)', fontsize=14)\n", + "plt.title(\"Out of the box I/O parameters\", fontsize=10)\n", + "plt.xlabel('Tool')\n", + "plt.ylabel('Mean Time')\n", + "plt.xticks(rotation=90)\n", + "plt.legend(title='Format')\n", + "plt.grid(True)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "0ea67b0b-5e7f-4d1f-bca9-1f3cae7fe309", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "## Now let's run the tests with \"informed\" parameters: I/O settings that align with the cloud-optimized granules' chunking strategy and consolidated metadata.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8151834b-0b57-4a3d-98b5-8cfaffa37dc4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "optimized_h5py_benchmarks = []\n", + "optimized_xarray_benchmarks = []\n", + "\n",
+ "for key, dataset in test_dict.items():\n", + " for k, link in dataset[\"links\"].items():\n", + " print(f\"Processing: {link}\")\n", + " try:\n", + " log_filename = f\"logs/fsspec-xarray-{key}-{k}.log\"\n", + " \n", + " # Create a new FileHandler for each iteration\n", + " file_handler = logging.FileHandler(log_filename)\n", + " file_handler.setLevel(logging.DEBUG)\n", + "\n", + " # Add the handler to the root logger\n", + " logging.getLogger().addHandler(file_handler)\n", + " \n", + " io_params = {\n", + " \"fsspec_params\": {},\n", + " \"h5py_params\": {}\n", + " }\n", + " \n", + " if \"repacked\" in link: \n", + " io_params ={\n", + " \"fsspec_params\": {\n", + " \"cache_type\": \"blockcache\",\n", + " \"block_size\": 8*1024*1024\n", + " },\n", + " \"h5py_params\" : {\n", + " \"driver_kwds\": {\n", + " \"page_buf_size\": 64*1024*1024,\n", + " \"rdcc_nbytes\": 8*1024*1024\n", + " }\n", + "\n", + " }\n", + " }\n", + "\n", + " if \"kerchunk\" in link:\n", + " continue\n", + " \n", + " start = time.time()\n", + " ds = xr.open_dataset(fs.open(link, mode='rb', **io_params[\"fsspec_params\"]), group=dataset[\"group\"], engine=\"h5netcdf\", decode_cf=False)\n", + " data_mean = ds[dataset[\"variable\"]].mean()\n", + " elapsed = time.time() - start\n", + " optimized_xarray_benchmarks.append(\n", + " {\"tool\": \"xarray\",\n", + " \"dataset\": key,\n", + " \"cloud-aware\": \"yes\",\n", + " \"format\": k,\n", + " \"file\": link,\n", + " \"time\": elapsed,\n", + " \"mean\": data_mean})\n", + " \n", + " logging.getLogger().removeHandler(file_handler)\n", + " file_handler.close()\n", + "\n", + " except Exception as e:\n", + " print(e)\n", + " \n", + "for key, dataset in test_dict.items():\n", + " for k, link in dataset[\"links\"].items():\n", + " try:\n", + " if \"kerchunk\" in link:\n", + " continue \n", + " print (f\"Processing: {link}\")\n", + " log_filename = f\"logs/fsspec-h5py-{key}-{k}_default.log\"\n", + " \n", + " # Create a new FileHandler for each iteration\n", + 
" file_handler = logging.FileHandler(log_filename)\n", + " file_handler.setLevel(logging.DEBUG)\n", + "\n", + " # Add the handler to the root logger\n", + " logging.getLogger().addHandler(file_handler)\n", + " # this is mostly IO so no perf_counter is needed\n", + " start = time.time()\n", + " io_params = {\n", + " \"fsspec_params\": {},\n", + " \"h5py_params\": {}\n", + " }\n", + " \n", + " if \"repacked\" in link: \n", + " io_params ={\n", + " \"fsspec_params\": {\n", + " \"cache_type\": \"blockcache\",\n", + " \"block_size\": 8*1024*1024\n", + " },\n", + " \"h5py_params\" : {\n", + " \"page_buf_size\": 64*1024*1024,\n", + " \"rdcc_nbytes\": 8*1024*1024\n", + " }\n", + " } \n", + " with h5py.File(fs.open(link, mode=\"rb\", **io_params[\"fsspec_params\"]), **io_params[\"h5py_params\"]) as f:\n", + " path = f\"{dataset['group']}/{dataset['variable']}\"\n", + " data = f[path][:]\n", + " data_mean = data.mean()\n", + " elapsed = time.time() - start\n", + " optimized_h5py_benchmarks.append(\n", + " {\"tool\": \"h5py\",\n", + " \"dataset\": key,\n", + " \"cloud-aware\": \"yes\",\n", + " \"format\": k,\n", + " \"file\": link,\n", + " \"time\": elapsed,\n", + " \"mean\": data_mean})\n", + "\n", + " logging.getLogger().removeHandler(file_handler) \n", + " file_handler.close()\n", + " \n", + "\n", + " except Exception as e:\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "id": "04414c2e-0666-4701-8ecc-7842727ede22", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "## Plotting results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2db2535a-8d3a-4e65-b21c-8db6b48074c8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df = pd.DataFrame.from_dict(optimized_h5py_benchmarks+h5coro_beanchmarks+optimized_xarray_benchmarks+kerchunk_benchmarks)\n", + "\n", + "pivot_df = df.pivot_table(index=['tool','dataset'], columns=['format'], values='time', aggfunc='mean')\n", + "\n", + "# Plotting\n", + 
"pivot_df.plot(kind='bar', figsize=(10, 6))\n", + "\n", + "plt.suptitle('Cloud-optimized HDF5 performance (less is better)', fontsize=14)\n", + "plt.title(\"Informed I/O parameters\", fontsize=10)\n", + "plt.xlabel('Tool')\n", + "plt.ylabel('Mean Time')\n", + "plt.xticks(rotation=90)\n", + "plt.legend(title='Format')\n", + "plt.grid(True)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "ea0db03e-5653-4908-ada1-16d723666e18", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "## Plotting tool-specific performance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47444e8a-6d59-42c2-baff-a3c85c447eb2", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df = pd.DataFrame.from_dict(regular_xarray_benchmarks+optimized_xarray_benchmarks)\n", + "\n", + "pivot_df = df.pivot_table(index=['dataset','cloud-aware'], columns=['format'], values='time', aggfunc='mean')\n", + "\n", + "# Plotting\n", + "pivot_df.plot(kind='bar', figsize=(10, 6))\n", + "plt.title('Xarray \"Cloud-Aware\" Access Pattern Performance (less is better)')\n", + "plt.xlabel('Tool')\n", + "plt.ylabel('Mean Time')\n", + "plt.xticks(rotation=90)\n", + "plt.legend(title='Format')\n", + "plt.grid(True)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "8395f794-0ea7-4c26-8f64-0d2f9659d841", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "## Make one comparison plot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbe17a07-22e3-4b99-a50a-d3183425d15c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df = pd.DataFrame.from_dict(regular_h5py_benchmarks + \n", + " kerchunk_benchmarks + \n", + " regular_xarray_benchmarks + \n", + " h5coro_beanchmarks + \n", + " optimized_h5py_benchmarks + \n", + " optimized_xarray_benchmarks)\n", + "df[\"size\"] = df.dataset.str.extract(r\"-(\\dGB)\")\n", + "df[\"product\"] =
df.dataset.str.extract(r\"(ATL\\d{2})\")\n", + "df.to_csv(\"benchmarks.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90486527-a1f2-4a92-bee6-1b2f934aa24d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "pivot_df = df.pivot_table(index=[\"tool\", \"size\"], columns=[\"format\", \"cloud-aware\"], values=\"time\", aggfunc=\"mean\")\n", + "pivot_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88badbc0-a277-4aee-9236-d74327032d0d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "\n", + "sns.set_style(\"darkgrid\", rc={'axes.facecolor': '0.9'})\n", + "# sns.set_palette(\"bright\", 4)\n", + "\n", + "fig, ax = plt.subplots(figsize=(15,6), layout=\"constrained\")\n", + "\n", + "pivot_df.plot(kind=\"bar\", ax=ax, \n", + " color=[\"tab:cyan\", \"tab:blue\", \"tab:pink\", \"tab:red\"],\n", + " xlabel=\"\", fontsize=15);\n", + "ax.legend(labels = [\"Optimized\", \"Optimized with informed io parameters\", \"Original\", \"Original with informed io parameters\"], fontsize=15)\n", + "ax.set_ylabel(\"Time (s)\", fontsize=20)\n", + "\n", + "# Make a two-level axis\n", + "def parse_text(s):\n", + " return re.sub(r\"[()]\", \"\", s).split(\", \")\n", + "\n", + "# Retrieve and parse axis labels and position\n", + "tool, size, x, y = map(np.array, zip(*[(*parse_text(l.get_text()), *l.get_position()) for l in ax.get_xticklabels()]))\n", + "# Make labels and x-positions for secondary axis\n", + "sec_x, sec_label = zip(*[(x[tool == tool_name].mean(), \"\\n\"+tool_name) for tool_name in np.unique(tool)])\n", + "# Assign ticks and labels\n", + "ax.set_xticks(x, size, rotation=0);\n", + "sec = ax.secondary_xaxis(location=0);\n", + "sec.set_xticks(sec_x, sec_label, fontsize=18);\n", + "sec.tick_params(length=0)\n", + "\n", + "sepa_x = np.array([x[tool == tool_name].min()-0.5 for tool_name in np.unique(tool)] + [x.max()+0.5])\n", + "[ax.axvline(xs, c='k', ymin=-.1,
clip_on=False, zorder=3) for xs in sepa_x];\n", + "\n", + "fig.savefig(\"access_time.summary.png\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8dd30a5-952e-428f-b908-9897fac81aa7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae595359-3e8a-4072-89b4-bd2e52d9ec12", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From aa5dd0b3dfa917b9a15d1d2b5009168d2bfe6452 Mon Sep 17 00:00:00 2001 From: Andy Barrett Date: Wed, 28 Feb 2024 20:00:22 +0000 Subject: [PATCH 11/11] Remove savefig so that canonical plotting is in plot_benchmark_results.ipynb --- notebooks/portable-full-comparison.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/portable-full-comparison.ipynb b/notebooks/portable-full-comparison.ipynb index 8b1dfb8..400a027 100644 --- a/notebooks/portable-full-comparison.ipynb +++ b/notebooks/portable-full-comparison.ipynb @@ -753,7 +753,7 @@ "sepa_x = np.array([x[tool == tool_name].min()-0.5 for tool_name in np.unique(tool)] + [x.max()+0.5])\n", "[ax.axvline(xs, c='k', ymin=-.1, clip_on=False, zorder=3) for xs in sepa_x];\n", "\n", - "fig.savefig(\"access_time.summary.png\")" + "# Use plot_benchmark_results.ipynb to generate saveable png" ] }, {