From c8e03aeaf13d720560421f0d4471708a9ccdbac6 Mon Sep 17 00:00:00 2001 From: Charles Stern <62192187+cisaacstern@users.noreply.github.com> Date: Thu, 21 Sep 2023 11:32:03 -0400 Subject: [PATCH 1/4] add & test verify_existing option to cache --- pangeo_forge_recipes/storage.py | 11 ++++++++-- tests/test_storage.py | 37 +++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/pangeo_forge_recipes/storage.py b/pangeo_forge_recipes/storage.py index 42e888f6..f8cdcdad 100644 --- a/pangeo_forge_recipes/storage.py +++ b/pangeo_forge_recipes/storage.py @@ -171,19 +171,26 @@ def _full_path(self, path: str) -> str: return os.path.join(self.root_path, new_path) +@dataclass class CacheFSSpecTarget(FlatFSSpecTarget): """Alias for FlatFSSpecTarget""" + verify_existing: bool = True + def cache_file(self, fname: str, secrets: Optional[dict], **open_kwargs) -> None: # check and see if the file already exists in the cache logger.info(f"Caching file '{fname}'") - if self.exists(fname): + exists = self.exists(fname) + if exists and self.verify_existing: cached_size = self.size(fname) remote_size = _get_url_size(fname, secrets, **open_kwargs) if cached_size == remote_size: # TODO: add checksumming here - logger.info(f"File '{fname}' is already cached") + logger.info(f"File '{fname}' is already cached, and matches remote size.") return + elif exists and not self.verify_existing: + logger.info(f"File '{fname}' is already cached, skipping verification.") + return input_opener = _get_opener(fname, secrets, **open_kwargs) target_opener = self.open(fname, mode="wb") diff --git a/tests/test_storage.py b/tests/test_storage.py index 1a1e2da7..ca7b0174 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -91,6 +91,43 @@ def test_caching_only_truncates_long_fnames_for_local_fs(fs_cls, fname_longer_th assert len(fname_in_full_path) > POSIX_MAX_FNAME_LENGTH +@pytest.mark.parametrize("verify_existing", [True, False]) +def test_cache_no_verify_existing(tmpdir_factory: pytest.TempdirFactory, verify_existing: bool): + tmp_src = tmpdir_factory.mktemp("src") + tmp_dst = tmpdir_factory.mktemp("dst") + cache = CacheFSSpecTarget(LocalFileSystem(), tmp_dst, verify_existing=verify_existing) + src_fname = str(tmp_src / "source.txt") + + # write the source file + with open(src_fname, mode="w") as f: + f.write("0") + + # cache it + cache.cache_file(src_fname, secrets=None) + + # overwrite source with new data + with open(src_fname, mode="w") as f: + f.write("00") + + # cache it again + cache.cache_file(src_fname, secrets=None) + + # open from cache + cached_fname = cache._full_path(src_fname) + with open(cached_fname) as f: + if not verify_existing: + # if we *do not* verify the existing cache, the second caching operation will be + # skipped due to the presence of the cached filename already existing in the cache. + # we expect the data to reflect the data contained in the initial source file. + assert f.read() == "0" + else: + # if we *do verify* the length of the existing data, we will recognize that the source + # file has changed since the first caching operation, and therefore the second caching + # operation will recognize the inconsistent lengths of the source data between the first + # and second caching operations, and re-cache the data the second time around. + assert f.read() == "00" + + def test_suffix(tmp_path): assert str((FSSpecTarget(LocalFileSystem(), tmp_path) / "test").root_path) == str( tmp_path / "test" From 9a739b9f9afc518853e93806f01c225f46df59f4 Mon Sep 17 00:00:00 2001 From: Charles Stern <62192187+cisaacstern@users.noreply.github.com> Date: Tue, 3 Oct 2023 15:51:55 -0400 Subject: [PATCH 2/4] test against runner 0.9.0 --- .github/workflows/test-integration.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-integration.yaml b/.github/workflows/test-integration.yaml index 325a4b5c..174537e7 100644 --- a/.github/workflows/test-integration.yaml +++ b/.github/workflows/test-integration.yaml @@ -32,7 +32,7 @@ jobs: python-version: ["3.9"] # , "3.10", "3.11"] runner-version: [ "pangeo-forge-runner==0.8.0", - "git+https://github.com/pangeo-forge/pangeo-forge-runner.git@injections#egg=pangeo_forge_runner", + "pangeo-forge-runner==0.9.0", ] steps: - uses: actions/checkout@v4 From 086275ae7a7f67c83058ca27fcad6a5c88e8b26f Mon Sep 17 00:00:00 2001 From: Charles Stern <62192187+cisaacstern@users.noreply.github.com> Date: Tue, 3 Oct 2023 16:03:39 -0400 Subject: [PATCH 3/4] add release notes --- docs/release_notes.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/release_notes.md b/docs/release_notes.md index 8d40402a..ccf43b05 100644 --- a/docs/release_notes.md +++ b/docs/release_notes.md @@ -1,5 +1,12 @@ # Release Notes +## v0.10.3 - 2023-10-03 + +- Assign injection spec values for command line interface {pull}`566` +- Docs rewrite {pull}`610` +- Simplify reference recipe composition by nesting transforms {pull}`635` +- Reference transforms dependency and bugfixes {pull}`631` {pull}`632` + ## v0.10.2 - 2023-09-19 - Fix bug preventing use of multiple merge dims {pull}`591` From 8a0f3fe877b12a992c07685671cf9ba318588533 Mon Sep 17 00:00:00 2001 From: Charles Stern <62192187+cisaacstern@users.noreply.github.com> Date: Tue, 3 Oct 2023 16:05:55 -0400 Subject: [PATCH 4/4] Revert "add & test verify_existing option to cache" This reverts commit c8e03aeaf13d720560421f0d4471708a9ccdbac6. --- pangeo_forge_recipes/storage.py | 11 ++-------- tests/test_storage.py | 37 --------------------------------- 2 files changed, 2 insertions(+), 46 deletions(-) diff --git a/pangeo_forge_recipes/storage.py b/pangeo_forge_recipes/storage.py index f8cdcdad..42e888f6 100644 --- a/pangeo_forge_recipes/storage.py +++ b/pangeo_forge_recipes/storage.py @@ -171,26 +171,19 @@ def _full_path(self, path: str) -> str: return os.path.join(self.root_path, new_path) -@dataclass class CacheFSSpecTarget(FlatFSSpecTarget): """Alias for FlatFSSpecTarget""" - verify_existing: bool = True - def cache_file(self, fname: str, secrets: Optional[dict], **open_kwargs) -> None: # check and see if the file already exists in the cache logger.info(f"Caching file '{fname}'") - exists = self.exists(fname) - if exists and self.verify_existing: + if self.exists(fname): cached_size = self.size(fname) remote_size = _get_url_size(fname, secrets, **open_kwargs) if cached_size == remote_size: # TODO: add checksumming here - logger.info(f"File '{fname}' is already cached, and matches remote size.") + logger.info(f"File '{fname}' is already cached") return - elif exists and not self.verify_existing: - logger.info(f"File '{fname}' is already cached, skipping verification.") - return input_opener = _get_opener(fname, secrets, **open_kwargs) target_opener = self.open(fname, mode="wb") diff --git a/tests/test_storage.py b/tests/test_storage.py index ca7b0174..1a1e2da7 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -91,43 +91,6 @@ def test_caching_only_truncates_long_fnames_for_local_fs(fs_cls, fname_longer_th assert len(fname_in_full_path) > POSIX_MAX_FNAME_LENGTH -@pytest.mark.parametrize("verify_existing", [True, False]) -def test_cache_no_verify_existing(tmpdir_factory: pytest.TempdirFactory, verify_existing: bool): - tmp_src = tmpdir_factory.mktemp("src") - tmp_dst = tmpdir_factory.mktemp("dst") - cache = CacheFSSpecTarget(LocalFileSystem(), tmp_dst, verify_existing=verify_existing) - src_fname = str(tmp_src / "source.txt") - - # write the source file - with open(src_fname, mode="w") as f: - f.write("0") - - # cache it - cache.cache_file(src_fname, secrets=None) - - # overwrite source with new data - with open(src_fname, mode="w") as f: - f.write("00") - - # cache it again - cache.cache_file(src_fname, secrets=None) - - # open from cache - cached_fname = cache._full_path(src_fname) - with open(cached_fname) as f: - if not verify_existing: - # if we *do not* verify the existing cache, the second caching operation will be - # skipped due to the presence of the cached filename already existing in the cache. - # we expect the data to reflect the data contained in the initial source file. - assert f.read() == "0" - else: - # if we *do verify* the length of the existing data, we will recognize that the source - # file has changed since the first caching operation, and therefore the second caching - # operation will recognize the inconsistent lengths of the source data between the first - # and second caching operations, and re-cache the data the second time around. - assert f.read() == "00" - - def test_suffix(tmp_path): assert str((FSSpecTarget(LocalFileSystem(), tmp_path) / "test").root_path) == str( tmp_path / "test"