From 14e4fa2ee92b970d1ed8557a0724cff33a6aafcb Mon Sep 17 00:00:00 2001 From: Charles Stern <62192187+cisaacstern@users.noreply.github.com> Date: Thu, 31 Aug 2023 14:38:30 -0700 Subject: [PATCH 01/20] add dynamic_chunking_fn field to storetozarr --- pangeo_forge_recipes/transforms.py | 26 +++++++++++--- tests/test_transforms.py | 55 +++++++++++++++++++++++++----- 2 files changed, 68 insertions(+), 13 deletions(-) diff --git a/pangeo_forge_recipes/transforms.py b/pangeo_forge_recipes/transforms.py index 226025ec..0dff2cb4 100644 --- a/pangeo_forge_recipes/transforms.py +++ b/pangeo_forge_recipes/transforms.py @@ -451,15 +451,25 @@ def expand(self, reference: beam.PCollection) -> beam.PCollection: class StoreToZarr(beam.PTransform, ZarrWriterMixin): """Store a PCollection of Xarray datasets to Zarr. - :param combine_dims: The dimensions to combine + :param combine_dims: The dimensions to combine. :param target_chunks: Dictionary mapping dimension names to chunks sizes. - If a dimension is a not named, the chunks will be inferred from the data. + If a dimension is a not named, the chunks will be inferred from the data. + :param dynamic_chunking_fn: Optionally provide a function that takes an ``XarraySchema`` + as its first argument and returns a dynamically generated chunking dict. If provided, + ``target_chunks`` cannot also be passed. + :param dynamic_chunking_fn_kwargs: Optional keyword arguments for ``dynamic_chunking_fn``. """ # TODO: make it so we don't have to explicitly specify combine_dims # Could be inferred from the pattern instead combine_dims: List[Dimension] target_chunks: Dict[str, int] = field(default_factory=dict) + dynamic_chunking_fn: Optional[Callable[[XarraySchema], dict]] = None + dynamic_chunking_fn_kwargs: Optional[dict] = field(default_factory=dict) + + def __post_init__(self): + if self.target_chunks and self.dynamic_chunking_fn: + raise ValueError("Passing both `target_chunks` and `dynamic_chunking_fn` not allowed.") def expand( self, @@ -467,11 +477,17 @@ def expand( ) -> beam.PCollection[zarr.storage.FSStore]: schema = datasets | DetermineSchema(combine_dims=self.combine_dims) indexed_datasets = datasets | IndexItems(schema=schema) - rechunked_datasets = indexed_datasets | Rechunk( - target_chunks=self.target_chunks, schema=schema + target_chunks = ( + self.target_chunks + if not self.dynamic_chunking_fn + else beam.pvalue.AsSingleton( + schema | beam.Map(self.dynamic_chunking_fn, **self.dynamic_chunking_fn_kwargs) + ) ) + rechunked_datasets = indexed_datasets | Rechunk(target_chunks=target_chunks, schema=schema) target_store = schema | PrepareZarrTarget( - target=self.get_full_target(), target_chunks=self.target_chunks + target=self.get_full_target(), + target_chunks=target_chunks, ) n_target_stores = rechunked_datasets | StoreDatasetFragments(target_store=target_store) singleton_target_store = ( diff --git a/tests/test_transforms.py b/tests/test_transforms.py index 296e5786..c2ea56ec 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -6,7 +6,7 @@ from apache_beam.testing.util import BeamAssertException, assert_that, is_not_empty from pytest_lazyfixture import lazy_fixture -from pangeo_forge_recipes.aggregation import dataset_to_schema +from pangeo_forge_recipes.aggregation import XarraySchema, dataset_to_schema from pangeo_forge_recipes.patterns import FilePattern, FileType from pangeo_forge_recipes.storage import CacheFSSpecTarget from pangeo_forge_recipes.transforms import ( @@ -234,18 +234,20 @@ def _check_chunks(actual): assert_that(rechunked, correct_chunks()) +class OpenZarrStore(beam.PTransform): + @staticmethod + def _open_zarr(store): + return xr.open_dataset(store, engine="zarr", chunks={}) + + def expand(self, pcoll): + return pcoll | beam.Map(self._open_zarr) + + def test_StoreToZarr_emits_openable_fsstore( pipeline, netcdf_local_file_pattern_sequential, tmp_target_url, ): - def _open_zarr(store): - return xr.open_dataset(store, engine="zarr") - - class OpenZarrStore(beam.PTransform): - def expand(self, pcoll): - return pcoll | beam.Map(_open_zarr) - def is_xrdataset(): def _is_xr_dataset(actual): assert len(actual) == 1 @@ -264,3 +266,40 @@ def _is_xr_dataset(actual): ) open_store = target_store | OpenZarrStore() assert_that(open_store, is_xrdataset()) + + +def test_StoreToZarr_dynamic_chunking_interface( + pipeline: beam.Pipeline, + netcdf_local_file_pattern_sequential: FilePattern, + tmp_target_url: str, + daily_xarray_dataset: xr.Dataset, +): + def has_dynamically_set_chunks(): + def _has_dynamically_set_chunks(actual): + assert len(actual) == 1 + item = actual[0] + assert isinstance(item, xr.Dataset) + # we've dynamically set the number of timesteps per chunk to be equal to + # the length of the full time dimension of the aggregate dataset, therefore + # if this worked, there should only be one chunk + assert len(item.chunks["time"]) == 1 + + return _has_dynamically_set_chunks + + pattern: FilePattern = netcdf_local_file_pattern_sequential + + time_len = len(daily_xarray_dataset.time) + + def dynamic_chunking_fn(schema: XarraySchema): + return {"time": time_len} + + with pipeline as p: + datasets = p | beam.Create(pattern.items()) | OpenWithXarray() + target_store = datasets | StoreToZarr( + target_root=tmp_target_url, + store_name="test.zarr", + combine_dims=pattern.combine_dim_keys, + dynamic_chunking_fn=dynamic_chunking_fn, + ) + open_store = target_store | OpenZarrStore() + assert_that(open_store, has_dynamically_set_chunks()) From 1eda370be2761b03c76ff9b04b7e640abe674109 Mon Sep 17 00:00:00 2001 From: Charles Stern <62192187+cisaacstern@users.noreply.github.com> Date: Thu, 31 Aug 2023 14:46:19 -0700 Subject: [PATCH 02/20] test dynamic chunking kws --- tests/test_transforms.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/tests/test_transforms.py b/tests/test_transforms.py index c2ea56ec..a07f7b17 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -268,21 +268,27 @@ def _is_xr_dataset(actual): assert_that(open_store, is_xrdataset()) +@pytest.mark.parametrize("with_kws", [True, False]) def test_StoreToZarr_dynamic_chunking_interface( pipeline: beam.Pipeline, netcdf_local_file_pattern_sequential: FilePattern, tmp_target_url: str, daily_xarray_dataset: xr.Dataset, + with_kws: bool, ): def has_dynamically_set_chunks(): def _has_dynamically_set_chunks(actual): assert len(actual) == 1 item = actual[0] assert isinstance(item, xr.Dataset) - # we've dynamically set the number of timesteps per chunk to be equal to - # the length of the full time dimension of the aggregate dataset, therefore - # if this worked, there should only be one chunk - assert len(item.chunks["time"]) == 1 + if not with_kws: + # we've dynamically set the number of timesteps per chunk to be equal to + # the length of the full time dimension of the aggregate dataset, therefore + # if this worked, there should only be one chunk + assert len(item.chunks["time"]) == 1 + else: + # in this case, we've passed the kws {"divisor": 2}, so we expect two time chunks + assert len(item.chunks["time"]) == 2 return _has_dynamically_set_chunks @@ -290,8 +296,10 @@ def _has_dynamically_set_chunks(actual): time_len = len(daily_xarray_dataset.time) - def dynamic_chunking_fn(schema: XarraySchema): - return {"time": time_len} + def dynamic_chunking_fn(schema: XarraySchema, divisor: int = 1): + return {"time": int(time_len / divisor)} + + kws = {} if not with_kws else {"dynamic_chunking_fn_kwargs": {"divisor": 2}} with pipeline as p: datasets = p | beam.Create(pattern.items()) | OpenWithXarray() @@ -300,6 +308,7 @@ def dynamic_chunking_fn(schema: XarraySchema): store_name="test.zarr", combine_dims=pattern.combine_dim_keys, dynamic_chunking_fn=dynamic_chunking_fn, + **kws, ) open_store = target_store | OpenZarrStore() assert_that(open_store, has_dynamically_set_chunks()) From 1765f67ba4ab737b0199b60d6f7f045a50a023f9 Mon Sep 17 00:00:00 2001 From: Charles Stern <62192187+cisaacstern@users.noreply.github.com> Date: Thu, 31 Aug 2023 15:03:20 -0700 Subject: [PATCH 03/20] test dynamic chunking with target chunks raises --- tests/test_transforms.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/test_transforms.py b/tests/test_transforms.py index a07f7b17..ffa7cffc 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -312,3 +312,24 @@ def dynamic_chunking_fn(schema: XarraySchema, divisor: int = 1): ) open_store = target_store | OpenZarrStore() assert_that(open_store, has_dynamically_set_chunks()) + + +def test_StoreToZarr_dynamic_chunking_with_target_chunks_raises( + netcdf_local_file_pattern_sequential: FilePattern, +): + def fn(schema): + pass + + pattern: FilePattern = netcdf_local_file_pattern_sequential + + with pytest.raises( + ValueError, + match="Passing both `target_chunks` and `dynamic_chunking_fn` not allowed", + ): + _ = StoreToZarr( + target_root="target_root", + store_name="test.zarr", + combine_dims=pattern.combine_dim_keys, + target_chunks={"time": 1}, + dynamic_chunking_fn=fn, + ) From 74b8ebf774ca93693408a68c2f15cdb531e1aba8 Mon Sep 17 00:00:00 2001 From: Charles Stern <62192187+cisaacstern@users.noreply.github.com> Date: Fri, 1 Sep 2023 16:38:24 -0700 Subject: [PATCH 04/20] xr.Datasets are more familiar than XarraySchemas, so let users generate dynamic chunks from ds --- pangeo_forge_recipes/transforms.py | 14 ++++++++------ tests/test_transforms.py | 7 ++++--- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/pangeo_forge_recipes/transforms.py b/pangeo_forge_recipes/transforms.py index 2bcc10eb..6a2e1b37 100644 --- a/pangeo_forge_recipes/transforms.py +++ b/pangeo_forge_recipes/transforms.py @@ -17,7 +17,7 @@ import xarray as xr import zarr -from .aggregation import XarraySchema, dataset_to_schema, schema_to_zarr +from .aggregation import XarraySchema, dataset_to_schema, schema_to_template_ds, schema_to_zarr from .combiners import CombineMultiZarrToZarr, CombineXarraySchemas from .openers import open_url, open_with_kerchunk, open_with_xarray from .patterns import CombineOp, Dimension, FileType, Index, augment_index_with_start_stop @@ -479,9 +479,9 @@ class StoreToZarr(beam.PTransform, ZarrWriterMixin): `store_name` will be appended to this prefix to create a full path. :param target_chunks: Dictionary mapping dimension names to chunks sizes. If a dimension is a not named, the chunks will be inferred from the data. - :param dynamic_chunking_fn: Optionally provide a function that takes an ``XarraySchema`` - as its first argument and returns a dynamically generated chunking dict. If provided, - ``target_chunks`` cannot also be passed. + :param dynamic_chunking_fn: Optionally provide a function that takes an ``xarray.Dataset`` + template dataset as its first argument and returns a dynamically generated chunking dict. + If provided, ``target_chunks`` cannot also be passed. :param dynamic_chunking_fn_kwargs: Optional keyword arguments for ``dynamic_chunking_fn``. """ @@ -493,7 +493,7 @@ class StoreToZarr(beam.PTransform, ZarrWriterMixin): default_factory=RequiredAtRuntimeDefault ) target_chunks: Dict[str, int] = field(default_factory=dict) - dynamic_chunking_fn: Optional[Callable[[XarraySchema], dict]] = None + dynamic_chunking_fn: Optional[Callable[[xr.Dataset], dict]] = None dynamic_chunking_fn_kwargs: Optional[dict] = field(default_factory=dict) def __post_init__(self): @@ -510,7 +510,9 @@ def expand( self.target_chunks if not self.dynamic_chunking_fn else beam.pvalue.AsSingleton( - schema | beam.Map(self.dynamic_chunking_fn, **self.dynamic_chunking_fn_kwargs) + schema + | beam.Map(schema_to_template_ds) + | beam.Map(self.dynamic_chunking_fn, **self.dynamic_chunking_fn_kwargs) ) ) rechunked_datasets = indexed_datasets | Rechunk(target_chunks=target_chunks, schema=schema) diff --git a/tests/test_transforms.py b/tests/test_transforms.py index 39f898cc..607fd3f1 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -6,7 +6,7 @@ from apache_beam.testing.util import BeamAssertException, assert_that, is_not_empty from pytest_lazyfixture import lazy_fixture -from pangeo_forge_recipes.aggregation import XarraySchema, dataset_to_schema +from pangeo_forge_recipes.aggregation import dataset_to_schema from pangeo_forge_recipes.patterns import FilePattern, FileType from pangeo_forge_recipes.storage import CacheFSSpecTarget from pangeo_forge_recipes.transforms import ( @@ -296,7 +296,8 @@ def _has_dynamically_set_chunks(actual): time_len = len(daily_xarray_dataset.time) - def dynamic_chunking_fn(schema: XarraySchema, divisor: int = 1): + def dynamic_chunking_fn(template_ds: xr.Dataset, divisor: int = 1): + assert isinstance(template_ds, xr.Dataset) return {"time": int(time_len / divisor)} kws = {} if not with_kws else {"dynamic_chunking_fn_kwargs": {"divisor": 2}} @@ -317,7 +318,7 @@ def dynamic_chunking_fn(schema: XarraySchema, divisor: int = 1): def test_StoreToZarr_dynamic_chunking_with_target_chunks_raises( netcdf_local_file_pattern_sequential: FilePattern, ): - def fn(schema): + def fn(template_ds): pass pattern: FilePattern = netcdf_local_file_pattern_sequential From 147cd20161029bcd648b6acd103a9a68dc1e4381 Mon Sep 17 00:00:00 2001 From: Charles Stern <62192187+cisaacstern@users.noreply.github.com> Date: Tue, 19 Sep 2023 11:37:08 -0400 Subject: [PATCH 05/20] fix transforms test for attrs --- tests/test_transforms.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_transforms.py b/tests/test_transforms.py index 607fd3f1..fb698e3c 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -308,6 +308,7 @@ def dynamic_chunking_fn(template_ds: xr.Dataset, divisor: int = 1): target_root=tmp_target_url, store_name="test.zarr", combine_dims=pattern.combine_dim_keys, + attrs={}, dynamic_chunking_fn=dynamic_chunking_fn, **kws, ) From 9e7dd67bfcf64b7e6f32dd59b8d73332d039f1f4 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Tue, 14 Nov 2023 16:12:53 -0800 Subject: [PATCH 06/20] Added integration test for dynamic chunking plugin --- .../feedstock/gpcp_from_gcs_dynamic_chunks.py | 49 +++++++++++++++++++ examples/feedstock/meta.yaml | 2 + 2 files changed, 51 insertions(+) create mode 100644 examples/feedstock/gpcp_from_gcs_dynamic_chunks.py diff --git a/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py b/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py new file mode 100644 index 00000000..cccdcdd0 --- /dev/null +++ b/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py @@ -0,0 +1,49 @@ +import apache_beam as beam +import pandas as pd +import xarray as xr +import zarr +from typing import Dict + +from pangeo_forge_recipes.patterns import ConcatDim, FilePattern +from pangeo_forge_recipes.transforms import OpenURLWithFSSpec, OpenWithXarray, StoreToZarr + +dates = [ + d.to_pydatetime().strftime("%Y%m%d") + for d in pd.date_range("1996-10-01", "1999-02-01", freq="D") +] + + +def make_url(time): + url_base = "https://storage.googleapis.com/pforge-test-data" + return f"{url_base}/gpcp/v01r03_daily_d{time}.nc" + + +concat_dim = ConcatDim("time", dates, nitems_per_file=1) +pattern = FilePattern(make_url, concat_dim) + + +def test_ds(store: zarr.storage.FSStore) -> zarr.storage.FSStore: + # This fails integration test if not imported here + # TODO: see if --setup-file option for runner fixes this + import xarray as xr + + ds = xr.open_dataset(store, engine="zarr", chunks={}) + assert ds.title == ( + "Global Precipitation Climatatology Project (GPCP) " "Climate Data Record (CDR), Daily V1.3" + ) + return store + +def chunk_func(ds: xr.Dataset) -> Dict[str, int]: + return {'time': 3} + +recipe = ( + beam.Create(pattern.items()) + | OpenURLWithFSSpec() + | OpenWithXarray(file_type=pattern.file_type, xarray_open_kwargs={"decode_coords": "all"}) + | StoreToZarr( + dynamic_chunking_fn=chunk_func, + store_name="gpcp.zarr", + combine_dims=pattern.combine_dim_keys, + ) + | "Test dataset" >> beam.Map(test_ds) +) diff --git a/examples/feedstock/meta.yaml b/examples/feedstock/meta.yaml index cb9f2943..29fc5db0 100644 --- a/examples/feedstock/meta.yaml +++ b/examples/feedstock/meta.yaml @@ -1,6 +1,8 @@ recipes: - id: "gpcp-from-gcs" object: "gpcp_from_gcs:recipe" + - id: "gpcp-from-gcs-dynamic-chunks" + object: "gpcp_from_gcs_dynamic_chunks:recipe" - id: "noaa-oisst" object: "noaa_oisst:recipe" - id: "terraclimate" From 9770cf511fe95014de4e3bb55a8c0b2bc765f676 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 15 Nov 2023 00:13:52 +0000 Subject: [PATCH 07/20] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/feedstock/gpcp_from_gcs_dynamic_chunks.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py b/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py index cccdcdd0..b5260fb0 100644 --- a/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py +++ b/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py @@ -1,8 +1,9 @@ +from typing import Dict + import apache_beam as beam import pandas as pd import xarray as xr import zarr -from typing import Dict from pangeo_forge_recipes.patterns import ConcatDim, FilePattern from pangeo_forge_recipes.transforms import OpenURLWithFSSpec, OpenWithXarray, StoreToZarr @@ -33,8 +34,10 @@ def test_ds(store: zarr.storage.FSStore) -> zarr.storage.FSStore: ) return store + def chunk_func(ds: xr.Dataset) -> Dict[str, int]: - return {'time': 3} + return {"time": 3} + recipe = ( beam.Create(pattern.items()) From f7aee4e503c0c3d586cd4343f7df2160ddb425ab Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Tue, 14 Nov 2023 16:24:27 -0800 Subject: [PATCH 08/20] Expanded docstring --- pangeo_forge_recipes/transforms.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pangeo_forge_recipes/transforms.py b/pangeo_forge_recipes/transforms.py index a8121d35..0ef8f055 100644 --- a/pangeo_forge_recipes/transforms.py +++ b/pangeo_forge_recipes/transforms.py @@ -483,7 +483,10 @@ class StoreToZarr(beam.PTransform, ZarrWriterMixin): If a dimension is a not named, the chunks will be inferred from the data. :param dynamic_chunking_fn: Optionally provide a function that takes an ``xarray.Dataset`` template dataset as its first argument and returns a dynamically generated chunking dict. - If provided, ``target_chunks`` cannot also be passed. + If provided, ``target_chunks`` cannot also be passed. You can use this to determine chunking + based on the full dataset (e.g. divide along a certain dimension based on a desired chunk + size in memory). For more advanced chunking strategies, check + out https://github.com/jbusecke/dynamic_chunks :param dynamic_chunking_fn_kwargs: Optional keyword arguments for ``dynamic_chunking_fn``. :param attrs: Extra group-level attributes to inject into the dataset. """ From 81350ace7a799619a77c946d9ce4689f539d3136 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 15 Nov 2023 00:25:05 +0000 Subject: [PATCH 09/20] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pangeo_forge_recipes/transforms.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pangeo_forge_recipes/transforms.py b/pangeo_forge_recipes/transforms.py index 0ef8f055..4df9c0dd 100644 --- a/pangeo_forge_recipes/transforms.py +++ b/pangeo_forge_recipes/transforms.py @@ -483,9 +483,9 @@ class StoreToZarr(beam.PTransform, ZarrWriterMixin): If a dimension is a not named, the chunks will be inferred from the data. :param dynamic_chunking_fn: Optionally provide a function that takes an ``xarray.Dataset`` template dataset as its first argument and returns a dynamically generated chunking dict. - If provided, ``target_chunks`` cannot also be passed. You can use this to determine chunking - based on the full dataset (e.g. divide along a certain dimension based on a desired chunk - size in memory). For more advanced chunking strategies, check + If provided, ``target_chunks`` cannot also be passed. You can use this to determine chunking + based on the full dataset (e.g. divide along a certain dimension based on a desired chunk + size in memory). For more advanced chunking strategies, check out https://github.com/jbusecke/dynamic_chunks :param dynamic_chunking_fn_kwargs: Optional keyword arguments for ``dynamic_chunking_fn``. :param attrs: Extra group-level attributes to inject into the dataset. From 6d3e9650aa1c50e44c0c3468d420f4d4258a07dd Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Tue, 14 Nov 2023 16:35:24 -0800 Subject: [PATCH 10/20] Test patching runner to not depend on recipes --- .github/workflows/test-integration.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test-integration.yaml b/.github/workflows/test-integration.yaml index 325a4b5c..4842fb9d 100644 --- a/.github/workflows/test-integration.yaml +++ b/.github/workflows/test-integration.yaml @@ -49,7 +49,9 @@ jobs: shell: bash -l {0} run: | python -m pip install -e ".[dev]" - python -m pip install ${{ matrix.runner-version }} + python -m pip install ${{ matrix.runner-version }} --no-deps + python -m pip install jupyter-repo2docker ruamel.yaml escapism jsonschema traitlets importlib-metadata + #install runner dependecies minus -recipes (JUST FOR TESTING) - name: 🏄‍♂️ Run Tests shell: bash -l {0} run: | From ad2e237f426e496eb5a6bf26ba298049c32349e7 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Tue, 14 Nov 2023 16:42:49 -0800 Subject: [PATCH 11/20] Did we need that patch at all? --- .github/workflows/test-integration.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-integration.yaml b/.github/workflows/test-integration.yaml index bc443c52..786198cd 100644 --- a/.github/workflows/test-integration.yaml +++ b/.github/workflows/test-integration.yaml @@ -58,8 +58,11 @@ jobs: shell: bash -l {0} run: | python -m pip install -e ".[test,minio]" - python -m pip install ${{ matrix.runner-version }} --no-deps - python -m pip install jupyter-repo2docker ruamel.yaml escapism jsonschema traitlets importlib-metadata + python -m pip install ${{ matrix.runner-version }} + # run: | + # python -m pip install -e ".[test,minio]" + # python -m pip install ${{ matrix.runner-version }} --no-deps + # python -m pip install jupyter-repo2docker ruamel.yaml escapism jsonschema traitlets importlib-metadata #install runner dependecies minus -recipes (JUST FOR TESTING) - name: 🏄‍♂️ Run Tests shell: bash -l {0} From 798364395a54d32355d9004edf5c00c7975ea1e1 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Tue, 14 Nov 2023 17:00:56 -0800 Subject: [PATCH 12/20] reverse install order --- .github/workflows/test-integration.yaml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test-integration.yaml b/.github/workflows/test-integration.yaml index 786198cd..d34bb5e1 100644 --- a/.github/workflows/test-integration.yaml +++ b/.github/workflows/test-integration.yaml @@ -57,13 +57,11 @@ jobs: - name: 🌈 Install pangeo-forge-recipes & pangeo-forge-runner shell: bash -l {0} run: | - python -m pip install -e ".[test,minio]" python -m pip install ${{ matrix.runner-version }} - # run: | - # python -m pip install -e ".[test,minio]" - # python -m pip install ${{ matrix.runner-version }} --no-deps - # python -m pip install jupyter-repo2docker ruamel.yaml escapism jsonschema traitlets importlib-metadata - #install runner dependecies minus -recipes (JUST FOR TESTING) + python -m pip install -e ".[test,minio]" + + # order reversed to fix https://github.com/pangeo-forge/pangeo-forge-recipes/pull/595#issuecomment-1811630921 + # this should however be fixed in the runner itself - name: 🏄‍♂️ Run Tests shell: bash -l {0} run: | From c7c5206117bffd25d771ffa428afed397ef45300 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 15 Nov 2023 01:01:25 +0000 Subject: [PATCH 13/20] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .github/workflows/test-integration.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-integration.yaml b/.github/workflows/test-integration.yaml index d34bb5e1..ef10be91 100644 --- a/.github/workflows/test-integration.yaml +++ b/.github/workflows/test-integration.yaml @@ -59,7 +59,7 @@ jobs: run: | python -m pip install ${{ matrix.runner-version }} python -m pip install -e ".[test,minio]" - + # order reversed to fix https://github.com/pangeo-forge/pangeo-forge-recipes/pull/595#issuecomment-1811630921 # this should however be fixed in the runner itself - name: 🏄‍♂️ Run Tests From d177dc4c1209cf5383757e3956f9b9e0b91a3a23 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Wed, 15 Nov 2023 10:38:01 -0800 Subject: [PATCH 14/20] Add chunking test --- examples/feedstock/gpcp_from_gcs_dynamic_chunks.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py b/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py index b5260fb0..eedf468a 100644 --- a/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py +++ b/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py @@ -32,11 +32,13 @@ def test_ds(store: zarr.storage.FSStore) -> zarr.storage.FSStore: assert ds.title == ( "Global Precipitation Climatatology Project (GPCP) " "Climate Data Record (CDR), Daily V1.3" ) + + assert ds.chunks['time'][0] == 7 return store def chunk_func(ds: xr.Dataset) -> Dict[str, int]: - return {"time": 3} + return {"time": 7} recipe = ( From cbbfe40f3a07df716dc7bcf1af7391f704acd9ae Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 15 Nov 2023 18:38:17 +0000 Subject: [PATCH 15/20] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/feedstock/gpcp_from_gcs_dynamic_chunks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py b/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py index eedf468a..ecb434a3 100644 --- a/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py +++ b/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py @@ -33,7 +33,7 @@ def test_ds(store: zarr.storage.FSStore) -> zarr.storage.FSStore: "Global Precipitation Climatatology Project (GPCP) " "Climate Data Record (CDR), Daily V1.3" ) - assert ds.chunks['time'][0] == 7 + assert ds.chunks["time"][0] == 7 return store From dcb1aa7fa2b0622680e952d75d61d8fa6de50c29 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Wed, 15 Nov 2023 10:54:17 -0800 Subject: [PATCH 16/20] reduce chunking --- examples/feedstock/gpcp_from_gcs_dynamic_chunks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py b/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py index eedf468a..c1c51b9e 100644 --- a/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py +++ b/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py @@ -33,12 +33,12 @@ def test_ds(store: zarr.storage.FSStore) -> zarr.storage.FSStore: "Global Precipitation Climatatology Project (GPCP) " "Climate Data Record (CDR), Daily V1.3" ) - assert ds.chunks['time'][0] == 7 + assert ds.chunks['time'][0] == 3 return store def chunk_func(ds: xr.Dataset) -> Dict[str, int]: - return {"time": 7} + return {"time": 3} recipe = ( From 7dc9be7859c2e11547dd75e6e5275c96478595f6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 15 Nov 2023 18:55:18 +0000 Subject: [PATCH 17/20] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/feedstock/gpcp_from_gcs_dynamic_chunks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py b/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py index c1c51b9e..1e4fb792 100644 --- a/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py +++ b/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py @@ -33,7 +33,7 @@ def test_ds(store: zarr.storage.FSStore) -> zarr.storage.FSStore: "Global Precipitation Climatatology Project (GPCP) " "Climate Data Record (CDR), Daily V1.3" ) - assert ds.chunks['time'][0] == 3 + assert ds.chunks["time"][0] == 3 return store From ff5fa4ce203b394a9de7f4ab8ab82d3ea55dedbc Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Wed, 15 Nov 2023 11:08:33 -0800 Subject: [PATCH 18/20] Added checks to non-chunked gpcp recipe and reduced dynamic chunks to 2 --- examples/feedstock/gpcp_from_gcs.py | 3 +++ examples/feedstock/gpcp_from_gcs_dynamic_chunks.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/feedstock/gpcp_from_gcs.py b/examples/feedstock/gpcp_from_gcs.py index caea9a11..a29da23b 100644 --- a/examples/feedstock/gpcp_from_gcs.py +++ b/examples/feedstock/gpcp_from_gcs.py @@ -29,6 +29,9 @@ def test_ds(store: zarr.storage.FSStore) -> zarr.storage.FSStore: assert ds.title == ( "Global Precipitation Climatatology Project (GPCP) " "Climate Data Record (CDR), Daily V1.3" ) + # Making sure that the native chunking is different from the dynamic chunking + assert ds.chunks['time'][0] == 1 + return store diff --git a/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py b/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py index c1c51b9e..99c96613 100644 --- a/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py +++ b/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py @@ -33,12 +33,12 @@ def test_ds(store: zarr.storage.FSStore) -> zarr.storage.FSStore: "Global Precipitation Climatatology Project (GPCP) " "Climate Data Record (CDR), Daily V1.3" ) - assert ds.chunks['time'][0] == 3 + assert ds.chunks['time'][0] == 2 return store def chunk_func(ds: xr.Dataset) -> Dict[str, int]: - return {"time": 3} + return {"time": 2} recipe = ( From 5ee3f8ae7702e939a256bbed26b8bcaf2ce81b33 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 15 Nov 2023 19:12:01 +0000 Subject: [PATCH 19/20] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/feedstock/gpcp_from_gcs.py | 2 +- examples/feedstock/gpcp_from_gcs_dynamic_chunks.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/feedstock/gpcp_from_gcs.py b/examples/feedstock/gpcp_from_gcs.py index a29da23b..9b13d59d 100644 --- a/examples/feedstock/gpcp_from_gcs.py +++ b/examples/feedstock/gpcp_from_gcs.py @@ -30,7 +30,7 @@ def test_ds(store: zarr.storage.FSStore) -> zarr.storage.FSStore: "Global Precipitation Climatatology Project (GPCP) " "Climate Data Record (CDR), Daily V1.3" ) # Making sure that the native chunking is different from the dynamic chunking - assert ds.chunks['time'][0] == 1 + assert ds.chunks["time"][0] == 1 return store diff --git a/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py b/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py index 99c96613..bc9c3fd0 100644 --- a/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py +++ b/examples/feedstock/gpcp_from_gcs_dynamic_chunks.py @@ -33,7 +33,7 @@ def test_ds(store: zarr.storage.FSStore) -> zarr.storage.FSStore: "Global Precipitation Climatatology Project (GPCP) " "Climate Data Record (CDR), Daily V1.3" ) - assert ds.chunks['time'][0] == 2 + assert ds.chunks["time"][0] == 2 return store From f19acfb59c85823708ec93248eb9c2d3740d1c19 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Wed, 15 Nov 2023 11:17:13 -0800 Subject: [PATCH 20/20] Add release notes --- docs/release_notes.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/release_notes.md b/docs/release_notes.md index ccf43b05..83066d47 100644 --- a/docs/release_notes.md +++ b/docs/release_notes.md @@ -1,5 +1,9 @@ # Release Notes +## v0.10.4 - 2023-11-15 + +- Add `dynamic_chunking_fn`/`dynamic_chunking_fn_kwargs` keywords to StoreToZarr. This allows users to pass a function that will be called at runtime to determine the target chunks for the resulting datasets based on the in memory representation/size of the recipe dataset. {pull}`595` + ## v0.10.3 - 2023-10-03 - Assign injection spec values for command line interface {pull}`566`