Skip to content

Commit

Permalink
Enable XYZ chunking at conversion (#178)
Browse files Browse the repository at this point in the history
* add xyz chunking at conversion

* bugfixes

* revamp ndtiff converter

* remove cli debugging

* style

* bugfix

* write metadata at the end

* pos name fixes

* style

* better timepoint handling

* fix tests

* move the ndtiff conversion path into a method

* Update iohub/cli/cli.py

Co-authored-by: Talon Chandler <[email protected]>

* better handling of missing timepoints

* prettier T/C bar

* better data slicing

* Revert "prettier T/C bar"

This reverts commit 9b20964.

---------

Co-authored-by: Ziwen Liu <[email protected]>
Co-authored-by: Talon Chandler <[email protected]>
  • Loading branch information
3 people authored Aug 26, 2023
1 parent e19d1f1 commit fc1db13
Show file tree
Hide file tree
Showing 4 changed files with 190 additions and 62 deletions.
11 changes: 10 additions & 1 deletion iohub/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,13 +82,22 @@ def info(files, verbose):
is_flag=True,
help="Arrange FOVs in a row/column grid layout for tiled acquisition",
)
def convert(input, output, format, scale_voxels, grid_layout):
@click.option(
"--chunks",
"-c",
required=False,
default="XY",
help="Zarr chunk size given as 'XY', 'XYZ', or a tuple of chunk "
"dimensions. If 'XYZ', chunk size will be limited to 500 MB.",
)
def convert(input, output, format, scale_voxels, grid_layout, chunks):
"""Converts Micro-Manager TIFF datasets to OME-Zarr"""
converter = TIFFConverter(
input_dir=input,
output_dir=output,
data_type=format,
scale_voxels=scale_voxels,
grid_layout=grid_layout,
chunks=chunks,
)
converter.run()
217 changes: 167 additions & 50 deletions iohub/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
from typing import Literal, Union

import numpy as np
from dask.array import to_zarr
from numpy.typing import NDArray
from tqdm import tqdm
from tqdm.contrib.itertools import product

from iohub._version import version as iohub_version
from iohub.ngff import Position, TransformationMeta, open_ome_zarr
Expand All @@ -18,6 +20,7 @@
)

__all__ = ["TIFFConverter"]
MAX_CHUNK_SIZE = 500e6 # in bytes


def _create_grid_from_coordinates(
Expand Down Expand Up @@ -84,7 +87,7 @@ class TIFFConverter:
Whether to lay out the positions in a grid-like format
based on how the data was acquired
(useful for tiled acquisitions), by default False
chunks : tuple[int], optional
chunks : tuple[int] or Literal['XY', 'XYZ'], optional
Chunk size of the output Zarr arrays, by default None
(chunk by XY planes, this is the fastest at converting time)
scale_voxels : bool, optional
Expand All @@ -109,7 +112,7 @@ def __init__(
output_dir: str,
data_type: Literal["singlepagetiff", "ometiff", "ndtiff"] = None,
grid_layout: int = False,
chunks: tuple[int] = None,
chunks: Union[tuple[int], Literal["XY", "XYZ"]] = None,
scale_voxels: bool = True,
hcs_plate: bool = None,
):
Expand Down Expand Up @@ -148,6 +151,7 @@ def __init__(
self.z = self.reader.slices
self.y = self.reader.height
self.x = self.reader.width
self.dtype = self.reader.dtype
self.dim = (self.p, self.t, self.c, self.z, self.y, self.x)
self.prefix_list = []
self.hcs_plate = hcs_plate
Expand Down Expand Up @@ -175,7 +179,7 @@ def __init__(
self._make_default_grid()
else:
self._make_default_grid()
self.chunks = chunks if chunks else (1, 1, 1, self.y, self.x)
self.chunks = self._gen_chunks(chunks)
self.transform = self._scale_voxels() if scale_voxels else None

def _check_hcs_sites(self):
Expand Down Expand Up @@ -331,14 +335,35 @@ def _get_coord_reorder(self, coord):
]
return tuple(reordered)

def _normalize_ndtiff_coord(
    self, p: int, t: int, c: int, z: int
) -> tuple[Union[str, int], ...]:
    """Translate integer coordinates into ND-TIFF axis keys.

    When the dataset indexes its position and/or channel axes by
    string labels rather than integers, the corresponding integer
    indices are mapped to their labels; other axes pass through.
    """
    reader = self.reader
    pos = self.pos_names[p] if reader.str_position_axis else p
    chan = reader.channel_names[c] if reader.str_channel_axis else c
    return pos, t, chan, z
def _gen_chunks(self, input_chunks):
    """Resolve the user-supplied chunk specification into a concrete
    5D (T, C, Z, Y, X) chunk size for the output Zarr arrays.

    Parameters
    ----------
    input_chunks : tuple[int] | list[int] | Literal["XY", "XYZ"] | None
        Explicit chunk dimensions, a named preset ("XY" for one plane
        per chunk, "XYZ" for one volume per chunk), or a falsy value
        for the default XY-plane chunking.

    Returns
    -------
    tuple[int]
        Final chunk size; the Z chunk is halved as needed so a chunk
        stays under ``MAX_CHUNK_SIZE`` bytes.

    Raises
    ------
    ValueError
        If a string other than "XY"/"XYZ" (case-insensitive) is given.
    TypeError
        If ``input_chunks`` is neither a sequence, a string, nor falsy.
    """
    if not input_chunks:
        # default: chunk by single XY planes (fastest to convert)
        chunks = [1, 1, 1, self.y, self.x]
    elif isinstance(input_chunks, (tuple, list)):
        # accept any explicit sequence of chunk dimensions
        # (lists are equivalent to tuples here)
        chunks = list(input_chunks)
    elif isinstance(input_chunks, str):
        if input_chunks.lower() == "xy":
            chunks = [1, 1, 1, self.y, self.x]
        elif input_chunks.lower() == "xyz":
            chunks = [1, 1, self.z, self.y, self.x]
        else:
            raise ValueError(f"{input_chunks} chunks are not supported.")
    else:
        raise TypeError(
            f"Chunk type {type(input_chunks)} is not supported."
        )

    # limit chunks to MAX_CHUNK_SIZE bytes by repeatedly halving the
    # Z chunk dimension
    bytes_per_pixel = np.dtype(self.dtype).itemsize
    # it's OK if a single image is larger than MAX_CHUNK_SIZE
    # (the Z chunk never shrinks below 1)
    while (
        chunks[-3] > 1
        and np.prod(chunks) * bytes_per_pixel > MAX_CHUNK_SIZE
    ):
        # plain int() keeps NumPy scalars out of the chunk tuple
        chunks[-3] = int(np.ceil(chunks[-3] / 2))

    logging.debug(f"Zarr store chunk size will be set to {chunks}.")

    return tuple(chunks)

def _get_channel_names(self):
cns = self.reader.channel_names
Expand Down Expand Up @@ -397,7 +422,7 @@ def _init_zarr_arrays(self):
self.y,
self.x,
),
"dtype": self.reader.dtype,
"dtype": self.dtype,
"chunks": self.chunks,
"transform": self.transform,
}
Expand Down Expand Up @@ -430,6 +455,108 @@ def _create_zeros_array(
]
pos.dump_meta()

def _convert_ndtiff(self):
    """Convert an ND-TIFF dataset into the pre-initialized Zarr store.

    Iterates positions, then (T, C) pairs, copying each timepoint/
    channel slab from the reader's dask array into the matching Zarr
    array region, and collects per-plane image metadata keyed by
    "<array path>/T/C/Z" into ``image_plane_metadata.json``.
    Positions or T/C combinations missing from the raw data are left
    zero-filled with a logged warning rather than raising.
    """
    # tqdm bar formats for the outer (positions) and inner
    # (timepoints/channels) progress bars
    bar_format_positions = (
        "Converting Positions: |{bar:16}|{n_fmt}/{total_fmt} "
        "(Time Remaining: {remaining}), {rate_fmt}{postfix}]"
    )
    bar_format_time_channel = (
        "Converting Timepoints/Channels: |{bar:16}|{n_fmt}/{total_fmt} "
        "(Time Remaining: {remaining}), {rate_fmt}{postfix}]"
    )
    # accumulates plane metadata across all positions; written once at
    # the end
    all_ndtiff_metadata = {}
    for p_idx in tqdm(range(self.p), bar_format=bar_format_positions):
        # ndtiff_pos_idx, ndtiff_t_idx, and ndtiff_channel_idx
        # may be None
        ndtiff_pos_idx = (
            self.pos_names[p_idx]
            if self.reader.str_position_axis
            else p_idx
        )
        try:
            # normalizes the position key (drops axes the dataset did
            # not acquire); raises ValueError for unknown positions
            ndtiff_pos_idx, *_ = self.reader._check_coordinates(
                ndtiff_pos_idx, 0, 0, 0
            )
        except ValueError:
            # Log warning and continue if some positions were not
            # acquired in the dataset
            logging.warning(
                f"Cannot load data at position {ndtiff_pos_idx}, "
                "filling with zeros. Raw data may be incomplete."
            )
            continue

        # lazy dask view of this position's raw data, and the
        # destination Zarr array ("0" = full-resolution level)
        dask_arr = self.reader.get_zarr(position=ndtiff_pos_idx)
        zarr_pos_name = self.zarr_position_names[p_idx]
        zarr_arr = self.writer[zarr_pos_name]["0"]

        # nested T/C progress bar (position=1, leave=False keeps it
        # below the position bar and clears it when done)
        for t_idx, c_idx in product(
            range(self.t),
            range(self.c),
            bar_format=bar_format_time_channel,
            position=1,
            leave=False,
        ):
            ndtiff_channel_idx = (
                self.reader.channel_names[c_idx]
                if self.reader.str_channel_axis
                else c_idx
            )
            # set ndtiff_t_idx and ndtiff_z_idx to None if these axes were
            # not acquired
            (
                _,
                ndtiff_t_idx,
                ndtiff_channel_idx,
                ndtiff_z_idx,
            ) = self.reader._check_coordinates(
                ndtiff_pos_idx, t_idx, ndtiff_channel_idx, 0
            )
            # Log warning and continue if some T/C were not acquired in the
            # dataset
            if not self.reader.dataset.has_image(
                position=ndtiff_pos_idx,
                time=ndtiff_t_idx,
                channel=ndtiff_channel_idx,
                z=ndtiff_z_idx,
            ):
                logging.warning(
                    f"Cannot load data at timepoint {t_idx}, channel "
                    f"{c_idx}, filling with zeros. Raw data may be "
                    "incomplete."
                )
                continue

            # copy one (T, C) slab; rechunk to self.chunks so the
            # dask-to-zarr write aligns with the destination array's
            # chunk grid
            data_slice = (slice(t_idx, t_idx + 1), slice(c_idx, c_idx + 1))
            to_zarr(
                dask_arr[data_slice].rechunk(self.chunks),
                zarr_arr,
                region=data_slice,
            )

            for z_idx in range(self.z):
                # this function will handle z_idx=0 when no z stacks
                # acquired
                image_metadata = self.reader.get_image_metadata(
                    ndtiff_pos_idx,
                    ndtiff_t_idx,
                    ndtiff_channel_idx,
                    z_idx,
                )
                # row/well/fov/img/T/C/Z
                frame_key = "/".join(
                    [zarr_arr.path]
                    + [str(i) for i in (t_idx, c_idx, z_idx)]
                )
                all_ndtiff_metadata[frame_key] = image_metadata

    logging.info("Writing ND-TIFF image plane metadata...")
    # mode="x" (exclusive create) fails loudly if the metadata file
    # already exists instead of silently overwriting it
    with open(
        os.path.join(self.output_dir, "image_plane_metadata.json"),
        mode="x",
    ) as metadata_file:
        json.dump(all_ndtiff_metadata, metadata_file, indent=4)

def run(self, check_image: bool = True):
"""Runs the conversion.
Expand All @@ -441,50 +568,40 @@ def run(self, check_image: bool = True):
"""
logging.debug("Setting up Zarr store.")
self._init_zarr_arrays()
bar_format = (
"Status: |{bar:16}|{n_fmt}/{total_fmt} "
bar_format_images = (
"Converting Images: |{bar:16}|{n_fmt}/{total_fmt} "
"(Time Remaining: {remaining}), {rate_fmt}{postfix}]"
)
# Run through every coordinate and convert in acquisition order
logging.info("Converting Images...")
ndtiff = False
if isinstance(self.reader, NDTiffReader):
ndtiff = True
all_ndtiff_metadata = {}
for coord in tqdm(self.coords, bar_format=bar_format):
coord_reorder = self._get_coord_reorder(coord)
if isinstance(self.reader, NDTiffReader):
p, t, c, z = self._normalize_ndtiff_coord(*coord_reorder)
else:
p, t, c, z = coord_reorder
img_raw = self._get_image_array(p, t, c, z)
if img_raw is None or not getattr(img_raw, "shape", ()):
# Leave incomplete datasets zero-filled
logging.warning(
f"Cannot load image at PTCZ={(p, t, c, z)}, "
"filling with zeros. Check if the raw data is incomplete."
)
continue
else:
pos_idx = coord_reorder[0]
pos_name = self.zarr_position_names[pos_idx]
zarr_img = self.writer[pos_name]["0"]
zarr_img[coord_reorder[1:]] = img_raw
if check_image:
self._perform_image_check(zarr_img[coord_reorder[1:]], img_raw)
if ndtiff:
image_metadata = self.reader.get_image_metadata(p, t, c, z)
# row/well/fov/img/T/C/Z
frame_key = "/".join(
[zarr_img.path] + [str(i) for i in (t, c, z)]
logging.info(
"Checking converted image is not supported for ND-TIFF. "
"Ignoring..."
)
all_ndtiff_metadata[frame_key] = image_metadata
self._convert_ndtiff()
else:
for coord in tqdm(self.coords, bar_format=bar_format_images):
coord_reorder = self._get_coord_reorder(coord)
p, t, c, z = coord_reorder
img_raw = self._get_image_array(p, t, c, z)
if img_raw is None or not getattr(img_raw, "shape", ()):
# Leave incomplete datasets zero-filled
logging.warning(
f"Cannot load image at PTCZ={(p, t, c, z)}, filling "
"with zeros. Check if the raw data is incomplete."
)
continue
else:
pos_idx = coord_reorder[0]
ndtiff_pos_idx = self.zarr_position_names[pos_idx]
zarr_img = self.writer[ndtiff_pos_idx]["0"]
zarr_img[coord_reorder[1:]] = img_raw
if check_image:
self._perform_image_check(
zarr_img[coord_reorder[1:]], img_raw
)

self.writer.zgroup.attrs.update(self.metadata)
if ndtiff:
logging.info("Writing ND-TIFF image plane metadata...")
with open(
os.path.join(self.output_dir, "image_plane_metadata.json"),
mode="x",
) as metadata_file:
json.dump(all_ndtiff_metadata, metadata_file, indent=4)
self.writer.close()
22 changes: 12 additions & 10 deletions iohub/ndtiff.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,16 +149,18 @@ def _check_coordinates(

# The axis is not part of the dataset axes
else:
# If coord = 0 is requested, the coordinate will be replaced
# with None
if coord == 0:
coords[i] = None
# If coord != 0 is requested and the axis is not part of the
# dataset, ValueError will be raised
else:
raise ValueError(
f"Axis {axis} is not part of this dataset"
)
# Nothing to do if coord == None
if coord is not None:
# If coord = 0 is requested, the coordinate will be
# replaced with None
if coord == 0:
coords[i] = None
# If coord != 0 is requested and the axis is not part of
# the dataset, ValueError will be raised
else:
raise ValueError(
f"Axis {axis} is not part of this dataset"
)

return (*coords,)

Expand Down
2 changes: 1 addition & 1 deletion tests/cli/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,4 +110,4 @@ def test_cli_convert_ome_tiff(
cmd += ["-g"]
result = runner.invoke(cli, cmd)
assert result.exit_code == 0
assert "Status" in result.output
assert "Converting" in result.output

0 comments on commit fc1db13

Please sign in to comment.