From 4b7752a85b11804522ee201e308003d64e519ce6 Mon Sep 17 00:00:00 2001
From: Flurin Sturzenegger
Date: Fri, 29 Sep 2023 18:12:20 +0200
Subject: [PATCH] Cleaned up MetaSeriesUtils

---
 src/fractal_faim_hcs/MetaSeriesUtils_dask.py | 295 +++++++++++--------
 1 file changed, 170 insertions(+), 125 deletions(-)

diff --git a/src/fractal_faim_hcs/MetaSeriesUtils_dask.py b/src/fractal_faim_hcs/MetaSeriesUtils_dask.py
index f7e5261..a2f7d58 100644
--- a/src/fractal_faim_hcs/MetaSeriesUtils_dask.py
+++ b/src/fractal_faim_hcs/MetaSeriesUtils_dask.py
@@ -16,19 +16,16 @@ from tifffile import tifffile
 
-
-def get_well_image_CZYX_lazy(  # noqa: C901
+def create_filename_structure(
     well_files: pd.DataFrame,
     channels: list[str],
-    assemble_fn: Callable = montage_grid_image_YX,
-) -> tuple[ArrayLike, list[UIntHistogram], list[dict], dict]:
-    """Assemble image data for the given well-files."""
-    # TODO: This function assumes that there are no gaps in data (i.e. that
-    #       there are no images missing to fill a full FCZ-stack)
-    #       -> maybe implement checks
-
+) -> ArrayLike:
+    """
+    Assemble filenames in a numpy array with ordering (field, channel, plane).
+    This allows us to later easily map over the filenames to create a
+    dask array of the images.
+    """
     planes = sorted(well_files["z"].unique(), key=int)
-
     fields = sorted(
         well_files["field"].unique(),
         key=lambda s: int(re.findall(r"(\d+)", s)[0])
@@ -52,6 +49,132 @@ def get_well_image_CZYX_lazy(  # noqa: C901
                 fns_np[s, c, z] = list(plane_files["path"])[0]
             elif len(plane_files) > 1:
                 raise RuntimeError("Multiple files found for one FCZ")
+
+    return fns_np
+
+
+def get_planes_params(
+    stage_positions_da: ArrayLike,
+    channels_full_planes: list[int],
+):
+    """Calculate z_sampling, lowest plane min_z, and number of planes n_z."""
+    z_positions_full_planes = da.mean(
+        stage_positions_da[:, channels_full_planes, :, 2],
+        axis=0
+    ).compute()
+    z_sampling = compute_z_sampling(z_positions_full_planes)
+    min_z = z_positions_full_planes.min(axis=1).mean()
+    n_z = z_positions_full_planes.shape[1]
+    return z_sampling, min_z, n_z
+
+
+def roll_single_planes(
+    stage_positions_da: ArrayLike,
+    channels_one_plane: list[int],
+    z_sampling: float,
+    min_z: float,
+    n_z: int,
+    fns_np: ArrayLike,
+):
+    """
+    Calculate the z-position of single-plane channels and roll the filenames
+    to the correct position.
+    """
+    z_positions_one_plane = da.mean(
+        stage_positions_da[:, channels_one_plane, 0, 2],
+        axis=0
+    ).compute()
+
+    for c, z_position in zip(channels_one_plane, z_positions_one_plane):
+        shift_z = round((z_position - min_z) / z_sampling)
+        # if plane lies outside region of z-stack: clip to lowest/highest plane
+        if shift_z < 0:
+            shift_z = 0
+        elif n_z <= shift_z:
+            shift_z = n_z - 1
+        fns_np[:, c, :] = np.roll(fns_np[:, c, :], shift_z, axis=1)
+
+def load_UIntHistograms(imgs_fused_da, channels_empty):
+    # TODO: See if this can be done more efficiently
+    def _load_histo(x):
+        histo = UIntHistogram(x)
+        return np.array([[histo]])
+
+    histos_da = da.map_blocks(
+        _load_histo,
+        imgs_fused_da,
+        chunks=(1, 1),
+        drop_axis=(
+            2,
+            3,
+        ),
+        meta=np.asanyarray([UIntHistogram()]),
+    )
+    histos = histos_da.compute()
+
+    channel_histograms = []
+    for c in range(histos.shape[0]):
+        channel_histogram = UIntHistogram()
+        for z in range(histos.shape[1]):
+            channel_histogram.combine(histos[c, z])
+        channel_histograms.append(channel_histogram)
+    for c in channels_empty:
+        channel_histograms[c] = UIntHistogram()
+
+    return channel_histograms
+
+def _da_read_stage_positions(x: ArrayLike):
+    """
+    Load stage positions of a file (used to map over a dask array).
+    x: One filename in an array
+    returns: Array of xyz positions, in the shape of the input array (plus 3)
+    """
+    fn = x.flatten()[0]
+    if fn == '':
+        raise RuntimeError(
+            'File not found. This channel probably either only has projections'
+            ' or only one plane.'
+        )
+    ms_metadata = load_metaseries_tiff_metadata(fn)
+    x_pos = ms_metadata["stage-position-x"]
+    y_pos = ms_metadata["stage-position-y"]
+    z_pos = ms_metadata["z-position"]
+    output = np.array([x_pos, y_pos, z_pos], dtype="float64")
+    newshape = x.shape + output.shape
+    return np.reshape(output, newshape)
+
+def _fuse_xy(
+    x: ArrayLike,
+    assemble_fn: Callable,
+    img_shape: tuple[int] = None,
+    img_dtype: np.dtype = None
+) -> ArrayLike:
+    """
+    Load images and fuse them.
+    """
+    fns = x.flatten()
+    if '' not in fns:
+        # TODO: Rewrite get_img_YX to reduce unnecessary computations
+        files = pd.DataFrame(fns, columns=["path"])
+        _, img, *_ = get_img_YX(assemble_fn, files)
+    else:
+        # return zeros if plane is empty
+        if img_shape is not None and img_dtype is not None:
+            img = np.zeros(img_shape, dtype=img_dtype)
+        else:
+            raise RuntimeError(
+                "'img_shape' and 'img_dtype' must be provided if plane is"
+                " empty"
+            )
+    return img.reshape(x.shape[1:] + img.shape)
+
+def get_well_image_CZYX_lazy(  # noqa: C901
+    well_files: pd.DataFrame,
+    channels: list[str],
+    assemble_fn: Callable = montage_grid_image_YX,
+) -> tuple[ArrayLike, list[UIntHistogram], list[dict], dict]:
+    """Assemble image data for the given well-files."""
+
+    fns_np = create_filename_structure(well_files, channels)
 
     # distinguish between channels with
     # - all z-planes
@@ -69,115 +192,65 @@ def get_well_image_CZYX_lazy(  # noqa: C901
             channels_empty.append(c)
 
-    # LOAD STAGE POSITIONS:
+    # LOAD STAGE POSITIONS, CALCULATE Z_SAMPLING & ARRANGE SINGLE PLANES:
     # TODO: maybe only read z-positions
-    def _da_read_stage_positions(x):
-        fn = x.flatten()[0]
-        if fn=='':
-            raise RuntimeError('File not found. This channel probably either only has projections or only one plane.')
-        ms_metadata = load_metaseries_tiff_metadata(fn)
-        x_pos = ms_metadata["stage-position-x"]
-        y_pos = ms_metadata["stage-position-y"]
-        z_pos = ms_metadata["z-position"]
-        output = np.array([x_pos, y_pos, z_pos], dtype="float64")
-        newshape = x.shape + output.shape
-        return np.reshape(output, newshape)
-
-    fns_da = da.from_array(
-        fns_np,
-        chunks=((1,)*len(fns_np.shape))
-    )
-
     stage_positions_da = da.map_blocks(
         _da_read_stage_positions,
-        fns_da,
+        da.from_array(fns_np, chunks=1),
         chunks=da.core.normalize_chunks(
-            (1,) * len(fns_da.shape) + (3,), (*fns_da.shape, 3)
+            (1,) * len(fns_np.shape) + (3,), (*fns_np.shape, 3)
         ),
-        new_axis=len(fns_da.shape),
+        new_axis=len(fns_np.shape),
         meta=np.asanyarray([]).astype("float64"),
     )
 
-    z_positions_full_planes = da.mean(
-        stage_positions_da[:,channels_full_planes,:,2],
-        axis=0
-    ).compute()
-    z_positions_one_plane = da.mean(
-        stage_positions_da[:,channels_one_plane,0,2],
-        axis=0
-    ).compute()
+    z_sampling, min_z, n_z = get_planes_params(
+        stage_positions_da, channels_full_planes
+    )
 
-    z_sampling = compute_z_sampling(z_positions_full_planes)
+    roll_single_planes(
+        stage_positions_da, channels_one_plane, z_sampling, min_z, n_z, fns_np
+    )
 
-    # roll single-plane channels to correct position
-    min_z = z_positions_full_planes.min(axis = 1).mean()
-    max_z = z_positions_full_planes.max(axis = 1).mean()
-    step_size = (max_z - min_z) / len(z_positions_full_planes[0])
-    for c, z_positions in zip(channels_one_plane, z_positions_one_plane):
-        shift_z = int((z_positions - min_z) // step_size)
-        fns_np[:,c,:] = np.roll(fns_np[:,c,:], shift_z, axis=1)
+    # GENERATE ROI_TABLES AND PX_METADATA:
+    # (by only loading one plane)
+    # TODO: Could also be done without loading images of entire plane
+    fns_sel = fns_np[:, channels_full_planes[0], 0]
+    files = pd.DataFrame(fns_sel, columns=["path"])
+    px_metadata, sample_fused, *_, roi_tables = get_img_YX(assemble_fn, files)
+
+    px_metadata["z-scaling"] = z_sampling
+
+    for roi_table in roi_tables.values():
+        roi_table["len_z_micrometer"] = z_sampling * (fns_np.shape[2] - 1)
 
-    # MONTAGE PLANES
-    def _fuse_xy(
-        x: ArrayLike,
-        img_shape: tuple[int] = None,
-        img_dtype: np.dtype = None
-    ) -> ArrayLike:
-        fns = x.flatten()
-        if '' not in fns:
-            # TODO: Rewrite get_img_YX to reduce unnecessary computations
-            files = pd.DataFrame(fns, columns=["path"])
-            _, img, *_ = get_img_YX(assemble_fn, files)
-        else:
-            # return zeros if plane is empty
-            if img_shape != None and img_dtype != None:
-                img = np.zeros(img_shape, dtype=img_dtype)
-            else:
-                raise RuntimeError("'img_dimension' and 'img_dtype' must be provided if plane is empty")
-        return img.reshape(x.shape[1:] + img.shape)
-
-    # calculate one plane to get dimensions and dtype
-    # (choose a chanel which is non-empyt)
-    # TODO: could be done without loading images
-    sample_fused = _fuse_xy(fns_np[:, channels_full_planes[0], 0])
     (nx_tot, ny_tot) = sample_fused.shape
     img_dtype = sample_fused.dtype
 
-    # create dask array of assembled planes
-    fns_da = da.from_array(
-        fns_np,
-        chunks=((fns_np.shape[0],)+(1,)*len(fns_np.shape[1:]))
-    )
+
+    # MONTAGE PLANES
     imgs_fused_da = da.map_blocks(
         _fuse_xy,
-        fns_da,
+        da.from_array(
+            fns_np,
+            chunks=((fns_np.shape[0],) + (1,) * len(fns_np.shape[1:]))
+        ),
         chunks=da.core.normalize_chunks(
-            (1,) * len(fns_da.shape[1:]) + (nx_tot, ny_tot),
-            shape=fns_da.shape[1:] + (nx_tot, ny_tot),
+            (1,) * len(fns_np.shape[1:]) + (nx_tot, ny_tot),
+            shape=fns_np.shape[1:] + (nx_tot, ny_tot),
         ),
         drop_axis=0,
-        new_axis=range(len(fns_da.shape) - 1, len(fns_da.shape) + 1),
+        new_axis=range(len(fns_np.shape) - 1, len(fns_np.shape) + 1),
         meta=np.asanyarray([]).astype(sample_fused.dtype),
-
+
+
+        assemble_fn=assemble_fn,
         img_shape=(nx_tot, ny_tot),
         img_dtype=img_dtype
     )
 
-    # GENERATE ROI_TABLES AND PX_METADATA:
-    # (by only loading one plane)
-    # TODO: Could also be done without loading images of entire plane
-    fns_sel = fns_da[:, channels_full_planes[0], 0].compute()
-    files = pd.DataFrame(fns_sel, columns=["path"])
-    px_metadata, _, _, _, roi_tables = get_img_YX(assemble_fn, files)
-
-    px_metadata["z-scaling"] = z_sampling
-
-    for roi_table in roi_tables.values():
-        roi_table["len_z_micrometer"] = z_sampling * (imgs_fused_da.shape[1] - 1)
-
-
     # LOAD CHANNEL-METADATA
     def load_channel_metadata(fn):
         ms_metadata = load_metaseries_tiff_metadata(fn)
@@ -200,33 +273,9 @@ def load_channel_metadata(fn):
             "ShadingCorrection": ms_metadata["ShadingCorrection"] == "On",
         }
     )
 
-    # LOAD UINTHISTOGRAMS
-    # TODO: See if this can be done more efficiently
-    def _load_histo(x):
-        histo = UIntHistogram(x)
-        return np.array([[histo]])
-
-    histos_da = da.map_blocks(
-        _load_histo,
-        imgs_fused_da,
-        chunks=(1, 1),
-        drop_axis=(
-            2,
-            3,
-        ),
-        meta=np.asanyarray([UIntHistogram()]),
-    )
-    histos = histos_da.compute()
-
-    channel_histograms = []
-    for c, _ in enumerate(channels):
-        channel_histogram = UIntHistogram()
-        for z, _ in enumerate(planes):
-            channel_histogram.combine(histos[c, z])
-        channel_histograms.append(channel_histogram)
-    for c in channels_empty:
-        channel_histograms[c] = UIntHistogram()
+
+    # LOAD UINTHISTOGRAMS
+    channel_histograms = load_UIntHistograms(imgs_fused_da, channels_empty)
 
     return (
         imgs_fused_da,
@@ -240,6 +289,8 @@ def _load_histo(x):
 def load_metaseries_tiff_metadata(path: Path) -> dict:
     """Load metaseries tiff file and return parts of its metadata.
 
+    Only the metadata is loaded, not the entire image, for faster loading;
+    PixelType is therefore not returned.
 
     The following metadata is collected:
 
     * _IllumSetting_
@@ -253,11 +304,6 @@ def load_metaseries_tiff_metadata(path: Path) -> dict:
     * _MagNA_
     * _MagSetting_
     * Exposure Time
-    * Lumencor Cyan Intensity
-    * Lumencor Green Intensity
-    * Lumencor Red Intensity
-    * Lumencor Violet Intensity
-    * Lumencor Yellow Intensity
     * ShadingCorrection
     * stage-label
     * SiteX
@@ -297,6 +343,5 @@ def load_metaseries_tiff_metadata(path: Path) -> dict:
     for metadata_key in plane_info:
         if metadata_key.endswith("Intensity"):
            metadata[metadata_key] = plane_info[metadata_key]
-    # metadata["PixelType"] = str(data.dtype)
 
     return metadata
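
A minimal usage sketch of the refactored lazy loader. The "path", "field" and "z"
columns follow what the functions in this patch index; the channel names, the
placeholder DataFrame and the zarr path are illustrative assumptions, not part
of the fractal-faim-hcs API.

    import dask.array as da
    import pandas as pd

    from fractal_faim_hcs.MetaSeriesUtils_dask import get_well_image_CZYX_lazy

    # One row per TIFF of a single well, with at least the "path", "field"
    # and "z" columns used by create_filename_structure().
    well_files = ...  # pd.DataFrame built by the existing file-parsing step
    channels = ["w1", "w2"]  # assumed MetaXpress-style channel labels

    result = get_well_image_CZYX_lazy(well_files, channels)
    imgs_da = result[0]  # first element: the lazy CZYX dask array

    # Nothing has been read from disk yet; computing a slice only touches
    # the files needed for that channel/plane.
    plane = imgs_da[0, 0].compute()           # one fused YX plane
    da.to_zarr(imgs_da, "well_B03.zarr")      # or stream the whole stack out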