From 63fd6bbc2038a9f6810f83b015429fdc46d6cc71 Mon Sep 17 00:00:00 2001 From: William Moore Date: Wed, 3 Jul 2024 23:03:43 +0100 Subject: [PATCH 1/9] Add new dev2/resave.py with sharding example --- dev2/resave.py | 108 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 dev2/resave.py diff --git a/dev2/resave.py b/dev2/resave.py new file mode 100644 index 0000000..6d2cd7b --- /dev/null +++ b/dev2/resave.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python +import numpy as np +import zarr +import sys +import os + +import tensorstore as ts + +import argparse +parser = argparse.ArgumentParser() +parser.add_argument("input_path") +parser.add_argument("output_path") +ns = parser.parse_args() + + +if os.path.exists(ns.output_path): + print(f"{ns.output_path} exists. Exiting") + sys.exit(1) + + +def convert_array(input_path, output_path): + read = ts.open({ + 'driver': 'zarr', + 'kvstore': { + 'driver': 'file', + 'path': input_path, + }, + }).result() + + shape = read.shape + chunks = read.schema.chunk_layout.read_chunk.shape + + # bigger_chunk includes 2 of the regular chunks + bigger_chunk = list(chunks[:]) + bigger_chunk[0] = bigger_chunk[0] * 2 + + # sharding breaks bigger_chunk down into regular chunks + # https://google.github.io/tensorstore/driver/zarr3/index.html#json-driver/zarr3/Codec/sharding_indexed + sharding_codec = { + "name": "sharding_indexed", + "configuration": { + "chunk_shape": chunks, + "codecs": [{"name": "bytes", "configuration": {"endian": "little"}}, + {"name": "gzip", "configuration": {"level": 5}}], + "index_codecs": [{"name": "bytes", "configuration": {"endian": "little"}}, + {"name": "crc32c"}], + "index_location": "end" + } + } + + codecs = [sharding_codec] + + # Alternative without sharding... + # blosc_codec = {"name": "blosc", "configuration": { + # "cname": "lz4", "clevel": 5}} + # codecs = [blosc_codec] + + write = ts.open({ + "driver": "zarr3", + "kvstore": { + "driver": "file", + "path": output_path + }, + "metadata": { + "shape": shape, + "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": bigger_chunk}}, + "chunk_key_encoding": {"name": "default"}, + "codecs": codecs, + "data_type": read.dtype, + }, + "create": True, + }).result() + + future = write.write(read) + future.result() + + +store_class = zarr.store.LocalStore +if ns.input_path.startswith("http"): + # TypeError: Can't instantiate abstract class RemoteStore with abstract methods get_partial_values, list, list_dir, list_prefix, set_partial_values + store_class = zarr.store.RemoteStore +read_store = store_class(ns.input_path, mode="r") +# Needs zarr_format=2 or we get ValueError("store mode does not support writing") +read_root = zarr.open_group(store=read_store, zarr_format=2) + +# Create new Image... +write_store = zarr.store.LocalStore(ns.output_path, mode="w") +root = zarr.Group.create(write_store) +# top-level version... 
+ome_attrs = {"version": "0.5-dev2"} +for key, value in read_root.attrs.items(): + # ...replaces all other versions - remove + if "version" in value: + del (value["version"]) + if key == "multiscales" and "version" in value[0]: + del (value[0]["version"]) + ome_attrs[key] = value +# dev2: everything is under 'ome' key +root.attrs["ome"] = ome_attrs + +# convert arrays +multiscales = read_root.attrs.get("multiscales") +for ds in multiscales[0]["datasets"]: + ds_path = ds["path"] + convert_array( + os.path.join(ns.input_path, ds_path), + os.path.join(ns.output_path, ds_path) + ) From 628e962430d595a5d34c23144354928a83abfb82 Mon Sep 17 00:00:00 2001 From: William Moore Date: Thu, 4 Jul 2024 11:39:45 +0100 Subject: [PATCH 2/9] Add dimension_names support to resave.py --- dev2/resave.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/dev2/resave.py b/dev2/resave.py index 6d2cd7b..57c9c8b 100644 --- a/dev2/resave.py +++ b/dev2/resave.py @@ -18,7 +18,7 @@ sys.exit(1) -def convert_array(input_path, output_path): +def convert_array(input_path, output_path, dimension_names): read = ts.open({ 'driver': 'zarr', 'kvstore': { @@ -67,6 +67,7 @@ def convert_array(input_path, output_path): "chunk_key_encoding": {"name": "default"}, "codecs": codecs, "data_type": read.dtype, + "dimension_names": dimension_names, }, "create": True, }).result() @@ -86,14 +87,17 @@ def convert_array(input_path, output_path): # Create new Image... write_store = zarr.store.LocalStore(ns.output_path, mode="w") root = zarr.Group.create(write_store) +dimension_names = None # top-level version... ome_attrs = {"version": "0.5-dev2"} for key, value in read_root.attrs.items(): # ...replaces all other versions - remove if "version" in value: del (value["version"]) - if key == "multiscales" and "version" in value[0]: - del (value[0]["version"]) + if key == "multiscales": + dimension_names = [axis["name"] for axis in value[0]["axes"]] + if "version" in value[0]: + del (value[0]["version"]) ome_attrs[key] = value # dev2: everything is under 'ome' key root.attrs["ome"] = ome_attrs @@ -104,5 +108,6 @@ def convert_array(input_path, output_path): ds_path = ds["path"] convert_array( os.path.join(ns.input_path, ds_path), - os.path.join(ns.output_path, ds_path) + os.path.join(ns.output_path, ds_path), + dimension_names, ) From cb06538dbfb2b84e923ea20aa6dfed0b71a9464a Mon Sep 17 00:00:00 2001 From: William Moore Date: Thu, 4 Jul 2024 15:28:31 +0100 Subject: [PATCH 3/9] Don't use sharding in dev2/resave.py --- dev2/resave.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dev2/resave.py b/dev2/resave.py index 57c9c8b..290ae8f 100644 --- a/dev2/resave.py +++ b/dev2/resave.py @@ -48,12 +48,12 @@ def convert_array(input_path, output_path, dimension_names): } } - codecs = [sharding_codec] + # codecs = [sharding_codec] # Alternative without sharding... 
- # blosc_codec = {"name": "blosc", "configuration": { - # "cname": "lz4", "clevel": 5}} - # codecs = [blosc_codec] + blosc_codec = {"name": "blosc", "configuration": { + "cname": "lz4", "clevel": 5}} + codecs = [blosc_codec] write = ts.open({ "driver": "zarr3", From cbc7b91e2786fee7ca59cbb0a305a1aabf8144f5 Mon Sep 17 00:00:00 2001 From: William Moore Date: Sun, 7 Jul 2024 13:48:52 +0100 Subject: [PATCH 4/9] TEMP: WIP to handle Plates conversion --- dev2/resave.py | 104 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 75 insertions(+), 29 deletions(-) diff --git a/dev2/resave.py b/dev2/resave.py index 290ae8f..fed7cbd 100644 --- a/dev2/resave.py +++ b/dev2/resave.py @@ -14,8 +14,10 @@ if os.path.exists(ns.output_path): - print(f"{ns.output_path} exists. Exiting") - sys.exit(1) + # print(f"{ns.output_path} exists. Exiting") + # sys.exit(1) + import shutil + shutil.rmtree(ns.output_path) def convert_array(input_path, output_path, dimension_names): @@ -75,6 +77,35 @@ def convert_array(input_path, output_path, dimension_names): future = write.write(read) future.result() +# Create new Image... +def convert_image(read_root, input_path, output_path): + write_store = zarr.store.LocalStore(output_path, mode="w") + root = zarr.Group.create(write_store) + dimension_names = None + # top-level version... + ome_attrs = {"version": "0.5-dev2"} + for key, value in read_root.attrs.items(): + # ...replaces all other versions - remove + if "version" in value: + del (value["version"]) + if key == "multiscales": + dimension_names = [axis["name"] for axis in value[0]["axes"]] + if "version" in value[0]: + del (value[0]["version"]) + ome_attrs[key] = value + # dev2: everything is under 'ome' key + root.attrs["ome"] = ome_attrs + + # convert arrays + multiscales = read_root.attrs.get("multiscales") + for ds in multiscales[0]["datasets"]: + ds_path = ds["path"] + convert_array( + os.path.join(input_path, ds_path), + os.path.join(output_path, ds_path), + dimension_names, + ) + store_class = zarr.store.LocalStore if ns.input_path.startswith("http"): @@ -84,30 +115,45 @@ def convert_array(input_path, output_path, dimension_names): # Needs zarr_format=2 or we get ValueError("store mode does not support writing") read_root = zarr.open_group(store=read_store, zarr_format=2) -# Create new Image... -write_store = zarr.store.LocalStore(ns.output_path, mode="w") -root = zarr.Group.create(write_store) -dimension_names = None -# top-level version... -ome_attrs = {"version": "0.5-dev2"} -for key, value in read_root.attrs.items(): - # ...replaces all other versions - remove - if "version" in value: - del (value["version"]) - if key == "multiscales": - dimension_names = [axis["name"] for axis in value[0]["axes"]] - if "version" in value[0]: - del (value[0]["version"]) - ome_attrs[key] = value -# dev2: everything is under 'ome' key -root.attrs["ome"] = ome_attrs - -# convert arrays -multiscales = read_root.attrs.get("multiscales") -for ds in multiscales[0]["datasets"]: - ds_path = ds["path"] - convert_array( - os.path.join(ns.input_path, ds_path), - os.path.join(ns.output_path, ds_path), - dimension_names, - ) +# image... +if read_root.attrs.get("multiscales"): + convert_image(read_root, ns.input_path, ns.output_path) + +# plate... +elif read_root.attrs.get("plate"): + # convert Wells.. 
+ write_store = zarr.store.LocalStore(ns.output_path, mode="w") + root = zarr.Group.create(write_store) + + ome_attrs = {"version": "0.5-dev2"} + for key, value in read_root.attrs.items(): + # ...replaces all other versions - remove + if "version" in value: + del (value["version"]) + ome_attrs[key] = value + # dev2: everything is under 'ome' key + root.attrs["ome"] = ome_attrs + + plate_attrs = read_root.attrs.get("plate") + for well in plate_attrs.get("wells"): + well_path = well["path"] + well_v2 = zarr.open_group(store=read_store, path=well_path, zarr_format=2) + well_group = root.create_group(well_path) + # well_attrs = { k:v for (k,v) in well_v2.attrs.items()} + # TODO: do we store 'version' in well? + well_attrs = {} + for key, value in well_v2.attrs.items(): + if "version" in value: + del (value["version"]) + well_attrs[key] = value + well_group.attrs["ome"] = well_attrs + + for img in well_attrs["well"]["images"]: + img_path = well_path + "/" + img["path"] + out_path = os.path.join(ns.output_path, img_path) + input_path = os.path.join(ns.input_path, img_path) + print("img_path", img_path) + img_v2 = zarr.open_group(store=read_store, path=img_path, zarr_format=2) + # print('img_v2', { k:v for (k,v) in img_v2.attrs.items()}) + print(input_path, out_path) + convert_image(img_v2, input_path, out_path) From d8e0f324a62c6a6a21ee61dca3ac65ccac21c42b Mon Sep 17 00:00:00 2001 From: Josh Moore Date: Wed, 10 Jul 2024 12:36:31 +0200 Subject: [PATCH 5/9] Working remote read/write Example: ``` ./resave.py zarr/v0.4/idr0062A/6001240.zarr challenge/dev2/6001240.zarr/ \ --input-overwrite \ --input-bucket=idr \ --input-endpoint=https://uk1s3.embassy.ebi.ac.uk \ --input-anon \ --output-bucket=EXAMPLE \ --output-endpoint=https://MYHOST \ --output-overwrite ``` --- dev2/resave.py | 112 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 81 insertions(+), 31 deletions(-) mode change 100644 => 100755 dev2/resave.py diff --git a/dev2/resave.py b/dev2/resave.py old mode 100644 new mode 100755 index fed7cbd..8746066 --- a/dev2/resave.py +++ b/dev2/resave.py @@ -8,25 +8,65 @@ import argparse parser = argparse.ArgumentParser() +parser.add_argument("--input-bucket") +parser.add_argument("--input-endpoint") +parser.add_argument("--input-anon", action="store_true") +parser.add_argument("--input-region", default="us-east-1") +parser.add_argument("--input-overwrite", action="store_true") +parser.add_argument("--output-bucket") +parser.add_argument("--output-endpoint") +parser.add_argument("--output-anon", action="store_true") +parser.add_argument("--output-region", default="us-east-1") +parser.add_argument("--output-overwrite", action="store_true") parser.add_argument("input_path") parser.add_argument("output_path") ns = parser.parse_args() if os.path.exists(ns.output_path): - # print(f"{ns.output_path} exists. Exiting") - # sys.exit(1) - import shutil - shutil.rmtree(ns.output_path) + if ns.input_overwrite: + import shutil + shutil.rmtree(ns.output_path) + else: + print(f"{ns.output_path} exists. 
Exiting") + sys.exit(1) + + +def create_configs(ns): + configs = [] + for selection in ("input", "output"): + anon = getattr(ns, f"{selection}_anon") + bucket = getattr(ns, f"{selection}_bucket") + endpoint = getattr(ns, f"{selection}_endpoint") + region = getattr(ns, f"{selection}_region") + + if bucket: + store = { + 'driver': 's3', + 'bucket': bucket, + 'aws_region': region, + } + if anon: + store['aws_credentials'] = { 'anonymous': anon } + if endpoint: + store["endpoint"] = endpoint + else: + store = { + 'driver': 'file', + } + configs.append(store) + return configs + +CONFIGS = create_configs(ns) + +def convert_array(input_path: str, output_path: str, dimension_names): + + CONFIGS[0]["path"] = input_path + CONFIGS[1]["path"] = output_path - -def convert_array(input_path, output_path, dimension_names): read = ts.open({ 'driver': 'zarr', - 'kvstore': { - 'driver': 'file', - 'path': input_path, - }, + 'kvstore': CONFIGS[0], }).result() shape = read.shape @@ -59,10 +99,8 @@ def convert_array(input_path, output_path, dimension_names): write = ts.open({ "driver": "zarr3", - "kvstore": { - "driver": "file", - "path": output_path - }, + "kvstore": CONFIGS[1], + "delete_existing": ns.output_overwrite, "metadata": { "shape": shape, "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": bigger_chunk}}, @@ -77,10 +115,7 @@ def convert_array(input_path, output_path, dimension_names): future = write.write(read) future.result() -# Create new Image... -def convert_image(read_root, input_path, output_path): - write_store = zarr.store.LocalStore(output_path, mode="w") - root = zarr.Group.create(write_store) +def convert_image(read_root, input_path, write_root, output_path): dimension_names = None # top-level version... ome_attrs = {"version": "0.5-dev2"} @@ -94,7 +129,7 @@ def convert_image(read_root, input_path, output_path): del (value[0]["version"]) ome_attrs[key] = value # dev2: everything is under 'ome' key - root.attrs["ome"] = ome_attrs + write_root.attrs["ome"] = ome_attrs # convert arrays multiscales = read_root.attrs.get("multiscales") @@ -107,23 +142,38 @@ def convert_image(read_root, input_path, output_path): ) -store_class = zarr.store.LocalStore -if ns.input_path.startswith("http"): - # TypeError: Can't instantiate abstract class RemoteStore with abstract methods get_partial_values, list, list_dir, list_prefix, set_partial_values - store_class = zarr.store.RemoteStore -read_store = store_class(ns.input_path, mode="r") + +STORES = [] +for config, path, mode in ( + (CONFIGS[0], ns.input_path, "r"), + (CONFIGS[1], ns.output_path, "w") + ): + if config["bucket"]: + store_class = zarr.store.RemoteStore + anon = config.get("aws_credentials", {}).get("anonymous", False) + store = store_class( + url=f's3://{config["bucket"]}/{path}', + anon=anon, + endpoint_url=config.get("endpoint", None), + mode=mode, + ) + else: + store_class = zarr.store.LocalStore + store = store_class(path, mode=mode) + STORES.append(store) + # Needs zarr_format=2 or we get ValueError("store mode does not support writing") -read_root = zarr.open_group(store=read_store, zarr_format=2) +read_root = zarr.open_group(store=STORES[0], zarr_format=2) + +write_store = STORES[1] +write_root = zarr.Group.create(write_store) # image... if read_root.attrs.get("multiscales"): - convert_image(read_root, ns.input_path, ns.output_path) + convert_image(read_root, ns.input_path, write_root, ns.output_path) # plate... elif read_root.attrs.get("plate"): - # convert Wells.. 
- write_store = zarr.store.LocalStore(ns.output_path, mode="w") - root = zarr.Group.create(write_store) ome_attrs = {"version": "0.5-dev2"} for key, value in read_root.attrs.items(): @@ -132,13 +182,13 @@ def convert_image(read_root, input_path, output_path): del (value["version"]) ome_attrs[key] = value # dev2: everything is under 'ome' key - root.attrs["ome"] = ome_attrs + write_root.attrs["ome"] = ome_attrs plate_attrs = read_root.attrs.get("plate") for well in plate_attrs.get("wells"): well_path = well["path"] well_v2 = zarr.open_group(store=read_store, path=well_path, zarr_format=2) - well_group = root.create_group(well_path) + well_group = write_root.create_group(well_path) # well_attrs = { k:v for (k,v) in well_v2.attrs.items()} # TODO: do we store 'version' in well? well_attrs = {} From ca77edae4e30a7e6af1f8bef5e76b281cde32b60 Mon Sep 17 00:00:00 2001 From: William Moore Date: Wed, 10 Jul 2024 11:22:35 +0100 Subject: [PATCH 6/9] Fix resave.py by properly commenting-out sharding code --- dev2/resave.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/dev2/resave.py b/dev2/resave.py index 8746066..f3fbc0d 100755 --- a/dev2/resave.py +++ b/dev2/resave.py @@ -72,25 +72,26 @@ def convert_array(input_path: str, output_path: str, dimension_names): shape = read.shape chunks = read.schema.chunk_layout.read_chunk.shape - # bigger_chunk includes 2 of the regular chunks - bigger_chunk = list(chunks[:]) - bigger_chunk[0] = bigger_chunk[0] * 2 - - # sharding breaks bigger_chunk down into regular chunks - # https://google.github.io/tensorstore/driver/zarr3/index.html#json-driver/zarr3/Codec/sharding_indexed - sharding_codec = { - "name": "sharding_indexed", - "configuration": { - "chunk_shape": chunks, - "codecs": [{"name": "bytes", "configuration": {"endian": "little"}}, - {"name": "gzip", "configuration": {"level": 5}}], - "index_codecs": [{"name": "bytes", "configuration": {"endian": "little"}}, - {"name": "crc32c"}], - "index_location": "end" - } - } + # # bigger_chunk includes 2 of the regular chunks + # bigger_chunk = list(chunks[:]) + # bigger_chunk[0] = bigger_chunk[0] * 2 + + # # sharding breaks bigger_chunk down into regular chunks + # # https://google.github.io/tensorstore/driver/zarr3/index.html#json-driver/zarr3/Codec/sharding_indexed + # sharding_codec = { + # "name": "sharding_indexed", + # "configuration": { + # "chunk_shape": chunks, + # "codecs": [{"name": "bytes", "configuration": {"endian": "little"}}, + # {"name": "gzip", "configuration": {"level": 5}}], + # "index_codecs": [{"name": "bytes", "configuration": {"endian": "little"}}, + # {"name": "crc32c"}], + # "index_location": "end" + # } + # } # codecs = [sharding_codec] + # chunks = bigger_chunk # Alternative without sharding... 
blosc_codec = {"name": "blosc", "configuration": { @@ -103,7 +104,7 @@ def convert_array(input_path: str, output_path: str, dimension_names): "delete_existing": ns.output_overwrite, "metadata": { "shape": shape, - "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": bigger_chunk}}, + "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": chunks}}, "chunk_key_encoding": {"name": "default"}, "codecs": codecs, "data_type": read.dtype, @@ -205,5 +206,4 @@ def convert_image(read_root, input_path, write_root, output_path): print("img_path", img_path) img_v2 = zarr.open_group(store=read_store, path=img_path, zarr_format=2) # print('img_v2', { k:v for (k,v) in img_v2.attrs.items()}) - print(input_path, out_path) convert_image(img_v2, input_path, out_path) From 940c78320482761cdf6d41d93cff75766d6c5930 Mon Sep 17 00:00:00 2001 From: William Moore Date: Wed, 10 Jul 2024 12:02:33 +0100 Subject: [PATCH 7/9] Handle no output bucket arguments --- dev2/resave.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev2/resave.py b/dev2/resave.py index f3fbc0d..3500af9 100755 --- a/dev2/resave.py +++ b/dev2/resave.py @@ -149,7 +149,7 @@ def convert_image(read_root, input_path, write_root, output_path): (CONFIGS[0], ns.input_path, "r"), (CONFIGS[1], ns.output_path, "w") ): - if config["bucket"]: + if "bucket" in config: store_class = zarr.store.RemoteStore anon = config.get("aws_credentials", {}).get("anonymous", False) store = store_class( @@ -188,7 +188,7 @@ def convert_image(read_root, input_path, write_root, output_path): plate_attrs = read_root.attrs.get("plate") for well in plate_attrs.get("wells"): well_path = well["path"] - well_v2 = zarr.open_group(store=read_store, path=well_path, zarr_format=2) + well_v2 = zarr.open_group(store=STORES[0], path=well_path, zarr_format=2) well_group = write_root.create_group(well_path) # well_attrs = { k:v for (k,v) in well_v2.attrs.items()} # TODO: do we store 'version' in well? From d81a3863b17bab6780bd0b7d803e06c56c6fbd7e Mon Sep 17 00:00:00 2001 From: Josh Moore Date: Wed, 10 Jul 2024 15:21:09 +0200 Subject: [PATCH 8/9] Verify data and re-enable sharding --- dev2/resave.py | 108 ++++++++++++++++++++++++++++++------------------- 1 file changed, 67 insertions(+), 41 deletions(-) diff --git a/dev2/resave.py b/dev2/resave.py index 3500af9..d748f9a 100755 --- a/dev2/resave.py +++ b/dev2/resave.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +import random import numpy as np import zarr import sys @@ -12,26 +13,17 @@ parser.add_argument("--input-endpoint") parser.add_argument("--input-anon", action="store_true") parser.add_argument("--input-region", default="us-east-1") -parser.add_argument("--input-overwrite", action="store_true") parser.add_argument("--output-bucket") parser.add_argument("--output-endpoint") parser.add_argument("--output-anon", action="store_true") parser.add_argument("--output-region", default="us-east-1") parser.add_argument("--output-overwrite", action="store_true") +parser.add_argument("--sharding", action="store_true") parser.add_argument("input_path") parser.add_argument("output_path") ns = parser.parse_args() -if os.path.exists(ns.output_path): - if ns.input_overwrite: - import shutil - shutil.rmtree(ns.output_path) - else: - print(f"{ns.output_path} exists. 
Exiting") - sys.exit(1) - - def create_configs(ns): configs = [] for selection in ("input", "output"): @@ -72,36 +64,37 @@ def convert_array(input_path: str, output_path: str, dimension_names): shape = read.shape chunks = read.schema.chunk_layout.read_chunk.shape - # # bigger_chunk includes 2 of the regular chunks - # bigger_chunk = list(chunks[:]) - # bigger_chunk[0] = bigger_chunk[0] * 2 - - # # sharding breaks bigger_chunk down into regular chunks - # # https://google.github.io/tensorstore/driver/zarr3/index.html#json-driver/zarr3/Codec/sharding_indexed - # sharding_codec = { - # "name": "sharding_indexed", - # "configuration": { - # "chunk_shape": chunks, - # "codecs": [{"name": "bytes", "configuration": {"endian": "little"}}, - # {"name": "gzip", "configuration": {"level": 5}}], - # "index_codecs": [{"name": "bytes", "configuration": {"endian": "little"}}, - # {"name": "crc32c"}], - # "index_location": "end" - # } - # } - - # codecs = [sharding_codec] - # chunks = bigger_chunk - - # Alternative without sharding... - blosc_codec = {"name": "blosc", "configuration": { - "cname": "lz4", "clevel": 5}} - codecs = [blosc_codec] - - write = ts.open({ + if ns.sharding: + # bigger_chunk includes 2 of the regular chunks + bigger_chunk = list(chunks[:]) + bigger_chunk[0] = bigger_chunk[0] * 2 + + # sharding breaks bigger_chunk down into regular chunks + # https://google.github.io/tensorstore/driver/zarr3/index.html#json-driver/zarr3/Codec/sharding_indexed + sharding_codec = { + "name": "sharding_indexed", + "configuration": { + "chunk_shape": chunks, + "codecs": [{"name": "bytes", "configuration": {"endian": "little"}}, + {"name": "gzip", "configuration": {"level": 5}}], + "index_codecs": [{"name": "bytes", "configuration": {"endian": "little"}}, + {"name": "crc32c"}], + "index_location": "end" + } + } + + codecs = [sharding_codec] + chunks = bigger_chunk + + else: + # Alternative without sharding... + blosc_codec = {"name": "blosc", "configuration": { + "cname": "lz4", "clevel": 5}} + codecs = [blosc_codec] + + base_config = { "driver": "zarr3", "kvstore": CONFIGS[1], - "delete_existing": ns.output_overwrite, "metadata": { "shape": shape, "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": chunks}}, @@ -109,13 +102,31 @@ def convert_array(input_path: str, output_path: str, dimension_names): "codecs": codecs, "data_type": read.dtype, "dimension_names": dimension_names, - }, - "create": True, - }).result() + } + } + + write_config = base_config.copy() + write_config["create"] = True + write_config["delete_existing"] = ns.output_overwrite + + verify_config = base_config.copy() + + write = ts.open(write_config).result() future = write.write(read) future.result() + verify = ts.open(verify_config).result() + print(f"Verifying <{output_path}>\t{read.shape}\t", end="") + for x in range(10): + r = tuple([random.randint(0, y-1) for y in read.shape]) + before = read[r].read().result() + after = verify[r].read().result() + assert before == after + print(".", end="") + print("ok") + + def convert_image(read_root, input_path, write_root, output_path): dimension_names = None # top-level version... @@ -161,6 +172,21 @@ def convert_image(read_root, input_path, write_root, output_path): else: store_class = zarr.store.LocalStore store = store_class(path, mode=mode) + + if STORES: + # If more than one element, then we are configuring + # the output path. If this is local, then delete. + # + # TODO: This should really be an option on zarr-python + # as with tensorstore. 
+ if os.path.exists(ns.output_path): + if ns.output_overwrite: + import shutil + shutil.rmtree(ns.output_path) + else: + print(f"{ns.output_path} exists. Exiting") + sys.exit(1) + STORES.append(store) # Needs zarr_format=2 or we get ValueError("store mode does not support writing") From cce3ef2befcca27b2d746509754189c3342f17a9 Mon Sep 17 00:00:00 2001 From: William Moore Date: Thu, 11 Jul 2024 06:24:39 +0100 Subject: [PATCH 9/9] Fix Well Image attrs and version to 0.5 --- dev2/resave.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/dev2/resave.py b/dev2/resave.py index d748f9a..6dcfcfb 100755 --- a/dev2/resave.py +++ b/dev2/resave.py @@ -24,6 +24,9 @@ ns = parser.parse_args() +NGFF_VERSION = "0.5" + + def create_configs(ns): configs = [] for selection in ("input", "output"): @@ -130,7 +133,7 @@ def convert_array(input_path: str, output_path: str, dimension_names): def convert_image(read_root, input_path, write_root, output_path): dimension_names = None # top-level version... - ome_attrs = {"version": "0.5-dev2"} + ome_attrs = {"version": NGFF_VERSION} for key, value in read_root.attrs.items(): # ...replaces all other versions - remove if "version" in value: @@ -202,7 +205,7 @@ def convert_image(read_root, input_path, write_root, output_path): # plate... elif read_root.attrs.get("plate"): - ome_attrs = {"version": "0.5-dev2"} + ome_attrs = {"version": NGFF_VERSION} for key, value in read_root.attrs.items(): # ...replaces all other versions - remove if "version" in value: @@ -230,6 +233,7 @@ def convert_image(read_root, input_path, write_root, output_path): out_path = os.path.join(ns.output_path, img_path) input_path = os.path.join(ns.input_path, img_path) print("img_path", img_path) - img_v2 = zarr.open_group(store=read_store, path=img_path, zarr_format=2) + img_v2 = zarr.open_group(store=STORES[0], path=img_path, zarr_format=2) + image_group = write_root.create_group(img_path) # print('img_v2', { k:v for (k,v) in img_v2.attrs.items()}) - convert_image(img_v2, input_path, out_path) + convert_image(img_v2, input_path, image_group, out_path)
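

Usage notes (editorial addition, not part of the patch series): with all nine patches applied, a local-to-local conversion runs as below. `--sharding` (added in patch 8) opts into the `sharding_indexed` layout and `--output-overwrite` replaces an existing output; `--input-overwrite`, which still appears in the patch 5 commit-message example, was removed again in patch 8. The paths here are illustrative.

```
./resave.py input-v2.zarr output-v3.zarr --sharding --output-overwrite
```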
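
The sharded layout the series converges on stores one shard per two read-chunks along the first axis: the zarr v3 `chunk_grid` carries the shard shape, while the `chunk_shape` inside the `sharding_indexed` codec carries the inner read-chunk shape, which is exactly what `convert_array` does after patch 8. Below is a minimal standalone sketch of that arrangement with tensorstore, assuming a throwaway local path and a small synthetic array; the path, shapes and dtype are illustrative, not taken from the series.

```python
import numpy as np
import tensorstore as ts

arr = np.arange(4 * 64 * 64, dtype="uint16").reshape(4, 64, 64)
chunks = [1, 32, 32]   # inner (read) chunk shape
shard = [2, 32, 32]    # one shard holds 2 read-chunks along axis 0

# same codec JSON as the patch series
sharding_codec = {
    "name": "sharding_indexed",
    "configuration": {
        "chunk_shape": chunks,
        "codecs": [{"name": "bytes", "configuration": {"endian": "little"}},
                   {"name": "gzip", "configuration": {"level": 5}}],
        "index_codecs": [{"name": "bytes", "configuration": {"endian": "little"}},
                         {"name": "crc32c"}],
        "index_location": "end",
    },
}

write = ts.open({
    "driver": "zarr3",
    "kvstore": {"driver": "file", "path": "/tmp/demo.zarr"},
    "metadata": {
        "shape": arr.shape,
        # the chunk grid holds the *shard* shape; the codec above
        # subdivides each shard into the inner read chunks
        "chunk_grid": {"name": "regular",
                       "configuration": {"chunk_shape": shard}},
        "chunk_key_encoding": {"name": "default"},
        "codecs": [sharding_codec],
        "data_type": "uint16",
    },
    "create": True,
    "delete_existing": True,
}).result()
write.write(arr).result()
```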
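
For remote reads and writes, `create_configs()` (patch 5) assembles the tensorstore kvstore from the `--input-*`/`--output-*` flags. Written out by hand, the input side of the patch 5 commit-message example corresponds roughly to the dict below; the bucket and endpoint values come from that example, and the `aws_credentials` shape is reproduced as the patch has it rather than independently verified.

```python
kvstore = {
    "driver": "s3",
    "bucket": "idr",                                 # --input-bucket
    "aws_region": "us-east-1",                       # --input-region default
    "aws_credentials": {"anonymous": True},          # --input-anon
    "endpoint": "https://uk1s3.embassy.ebi.ac.uk",   # --input-endpoint
}
```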