From 7b7bc7ef338cc4781f829ff8ab16f79471f07401 Mon Sep 17 00:00:00 2001 From: Marco Varrone Date: Mon, 28 Oct 2024 08:41:04 +0100 Subject: [PATCH 1/4] Save reader elements incrementally --- src/spatialdata_io/readers/xenium.py | 135 +++++++++++++++++++++++---- 1 file changed, 116 insertions(+), 19 deletions(-) diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py index 74034f37..6ecd8851 100644 --- a/src/spatialdata_io/readers/xenium.py +++ b/src/spatialdata_io/readers/xenium.py @@ -25,6 +25,7 @@ from datatree.datatree import DataTree from geopandas import GeoDataFrame from joblib import Parallel, delayed +from ome_zarr.io import parse_url from pyarrow import Table from shapely import Polygon from spatialdata import SpatialData @@ -48,7 +49,6 @@ __all__ = ["xenium", "xenium_aligned_image", "xenium_explorer_selection"] - @deprecation_alias(cells_as_shapes="cells_as_circles", cell_boundaries="cells_boundaries", cell_labels="cells_labels") @inject_docs(xx=XeniumKeys) def xenium( @@ -68,6 +68,7 @@ def xenium( imread_kwargs: Mapping[str, Any] = MappingProxyType({}), image_models_kwargs: Mapping[str, Any] = MappingProxyType({}), labels_models_kwargs: Mapping[str, Any] = MappingProxyType({}), + output_path: Path | None = None, ) -> SpatialData: """ Read a *10X Genomics Xenium* dataset into a SpatialData object. @@ -124,10 +125,13 @@ def xenium( Keyword arguments to pass to the image models. labels_models_kwargs Keyword arguments to pass to the labels models. + output_path + Path of the Zarr store to write the elements to as they are read. This can decrease the memory requirement. If not provided, + nothing is written to disk and all elements are kept in the returned :class:`spatialdata.SpatialData` object. Returns ------- - :class:`spatialdata.SpatialData` + :class:`spatialdata.SpatialData`. If `output_path` is provided, the elements written to that Zarr store are removed from the returned object. 
Notes ----- @@ -160,6 +164,8 @@ def xenium( image_models_kwargs, labels_models_kwargs ) path = Path(path) + output_path = Path(output_path) if output_path is not None else None + with open(path / XeniumKeys.XENIUM_SPECS) as f: specs = json.load(f) # to trigger the warning if the version cannot be parsed @@ -204,11 +210,14 @@ def xenium( table.obs[XeniumKeys.Z_LEVEL] = cell_summary_table[XeniumKeys.Z_LEVEL] table.obs[XeniumKeys.NUCLEUS_COUNT] = cell_summary_table[XeniumKeys.NUCLEUS_COUNT] - polygons = {} - labels = {} - tables = {} - points = {} - images = {} + sdata = SpatialData() + + if output_path is not None: + sdata.path = output_path + sdata._validate_can_safely_write_to_path(output_path, overwrite=False) + store = parse_url(output_path, mode="w").store + _ = zarr.group(store=store, overwrite=False) + store.close() # From the public release notes here: # https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/release-notes/release-notes-for-xoa @@ -217,7 +226,7 @@ def xenium( # nuclei to cells. Therefore for the moment we only link the table to the cell labels, and not to the nucleus # labels. 
if nucleus_labels: - labels["nucleus_labels"], _ = _get_labels_and_indices_mapping( + sdata.labels["nucleus_labels"], _ = _get_labels_and_indices_mapping( path, XeniumKeys.CELLS_ZARR, specs, @@ -225,8 +234,17 @@ def xenium( labels_name="nucleus_labels", labels_models_kwargs=labels_models_kwargs, ) + if output_path is not None: + sdata._write_element( + element=sdata.labels["nucleus_labels"], + zarr_container_path=output_path, + element_type="labels", + element_name="nucleus_labels", + overwrite=False, + ) + del sdata.labels["nucleus_labels"] if cells_labels: - labels["cell_labels"], cell_labels_indices_mapping = _get_labels_and_indices_mapping( + sdata.labels["cell_labels"], cell_labels_indices_mapping = _get_labels_and_indices_mapping( path, XeniumKeys.CELLS_ZARR, specs, @@ -234,6 +252,15 @@ def xenium( labels_name="cell_labels", labels_models_kwargs=labels_models_kwargs, ) + if output_path is not None: + sdata._write_element( + element=sdata.labels["cell_labels"], + zarr_container_path=output_path, + element_type="labels", + element_name="cell_labels", + overwrite=False, + ) + del sdata.labels["cell_labels"] if cell_labels_indices_mapping is not None and table is not None: if not pd.DataFrame.equals(cell_labels_indices_mapping["cell_id"], table.obs[str(XeniumKeys.CELL_ID)]): warnings.warn( @@ -249,41 +276,86 @@ def xenium( table.uns[TableModel.ATTRS_KEY][TableModel.INSTANCE_KEY] = "cell_labels" if nucleus_boundaries: - polygons["nucleus_boundaries"] = _get_polygons( + sdata.shapes["nucleus_boundaries"] = _get_polygons( path, XeniumKeys.NUCLEUS_BOUNDARIES_FILE, specs, n_jobs, idx=table.obs[str(XeniumKeys.CELL_ID)].copy(), ) + if output_path is not None: + sdata._write_element( + element=sdata.shapes["nucleus_boundaries"], + zarr_container_path=output_path, + element_type="shapes", + element_name="nucleus_boundaries", + overwrite=False, + ) + del sdata.shapes["nucleus_boundaries"] if cells_boundaries: - polygons["cell_boundaries"] = _get_polygons( + 
sdata.shapes["cell_boundaries"] = _get_polygons( path, XeniumKeys.CELL_BOUNDARIES_FILE, specs, n_jobs, idx=table.obs[str(XeniumKeys.CELL_ID)].copy(), ) + if output_path is not None: + sdata._write_element( + element=sdata.shapes["cell_boundaries"], + zarr_container_path=output_path, + element_type="shapes", + element_name="cell_boundaries", + overwrite=False, + ) + del sdata.shapes["cell_boundaries"] if transcripts: - points["transcripts"] = _get_points(path, specs) + sdata.points["transcripts"] = _get_points(path, specs) + if output_path is not None: + sdata._write_element( + element=sdata.points["transcripts"], + zarr_container_path=output_path, + element_type="points", + element_name="transcripts", + overwrite=False, + ) + del sdata.points["transcripts"] if version is None or version < packaging.version.parse("2.0.0"): if morphology_mip: - images["morphology_mip"] = _get_images( + sdata.images["morphology_mip"] = _get_images( path, XeniumKeys.MORPHOLOGY_MIP_FILE, imread_kwargs, image_models_kwargs, ) + if output_path is not None: + sdata._write_element( + element=sdata.images["morphology_mip"], + zarr_container_path=output_path, + element_type="images", + element_name="morphology_mip", + overwrite=False, + ) + del sdata.images["morphology_mip"] if morphology_focus: - images["morphology_focus"] = _get_images( + sdata.images["morphology_focus"] = _get_images( path, XeniumKeys.MORPHOLOGY_FOCUS_FILE, imread_kwargs, image_models_kwargs, ) + if output_path is not None: + sdata._write_element( + element=sdata.images["morphology_focus"], + zarr_container_path=output_path, + element_type="images", + element_name="morphology_focus", + overwrite=False, + ) + del sdata.images["morphology_focus"] else: if morphology_focus: morphology_focus_dir = path / XeniumKeys.MORPHOLOGY_FOCUS_DIR @@ -331,28 +403,53 @@ def filter(self, record: logging.LogRecord) -> bool: "c_coords" not in image_models_kwargs ), "The channel names for the morphology focus images are handled internally" 
image_models_kwargs["c_coords"] = list(channel_names.values()) - images["morphology_focus"] = _get_images( + sdata.images["morphology_focus"] = _get_images( morphology_focus_dir, XeniumKeys.MORPHOLOGY_FOCUS_CHANNEL_IMAGE.format(0), imread_kwargs, image_models_kwargs, ) del image_models_kwargs["c_coords"] + if output_path is not None: + sdata._write_element( + element=sdata.images["morphology_focus"], + zarr_container_path=output_path, + element_type="images", + element_name="morphology_focus", + overwrite=False, + ) + del sdata.images["morphology_focus"] logger.removeFilter(IgnoreSpecificMessage()) if table is not None: - tables["table"] = table + sdata.tables["table"] = table + if output_path is not None: + sdata._write_element( + element=sdata.tables["table"], + zarr_container_path=output_path, + element_type="tables", + element_name="table", + overwrite=False, + ) + del sdata.tables["table"] - elements_dict = {"images": images, "labels": labels, "points": points, "tables": tables, "shapes": polygons} if cells_as_circles: - elements_dict["shapes"][specs["region"]] = circles - sdata = SpatialData(**elements_dict) + sdata.shapes[specs["region"]] = circles # find and add additional aligned images if aligned_images: extra_images = _add_aligned_images(path, imread_kwargs, image_models_kwargs) for key, value in extra_images.items(): sdata.images[key] = value + if output_path is not None: + sdata._write_element( + element=sdata.images[key], + zarr_container_path=output_path, + element_type="images", + element_name=key, + overwrite=False, + ) + del sdata.images[key] return sdata From d0fed4dc9c66b28230026f98187c7ce75c5dc465 Mon Sep 17 00:00:00 2001 From: Marco Varrone Date: Mon, 28 Oct 2024 09:26:29 +0100 Subject: [PATCH 2/4] Consolidated metadata --- src/spatialdata_io/readers/xenium.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py index 6ecd8851..6a0b564e 100644 --- 
a/src/spatialdata_io/readers/xenium.py +++ b/src/spatialdata_io/readers/xenium.py @@ -451,6 +451,8 @@ def filter(self, record: logging.LogRecord) -> bool: ) del sdata.images[key] + sdata.write_consolidated_metadata() + return sdata From d13f581a6bbb935fe3ae76f579303607cfd852f9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 28 Oct 2024 08:34:40 +0000 Subject: [PATCH 3/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/spatialdata_io/readers/xenium.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py index 6a0b564e..96d992d5 100644 --- a/src/spatialdata_io/readers/xenium.py +++ b/src/spatialdata_io/readers/xenium.py @@ -49,6 +49,7 @@ __all__ = ["xenium", "xenium_aligned_image", "xenium_explorer_selection"] + @deprecation_alias(cells_as_shapes="cells_as_circles", cell_boundaries="cells_boundaries", cell_labels="cells_labels") @inject_docs(xx=XeniumKeys) def xenium( @@ -211,13 +212,13 @@ def xenium( table.obs[XeniumKeys.NUCLEUS_COUNT] = cell_summary_table[XeniumKeys.NUCLEUS_COUNT] sdata = SpatialData() - + if output_path is not None: sdata.path = output_path sdata._validate_can_safely_write_to_path(output_path, overwrite=False) store = parse_url(output_path, mode="w").store _ = zarr.group(store=store, overwrite=False) - store.close() + store.close() # From the public release notes here: # https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/release-notes/release-notes-for-xoa From 43f0d04b8da16b5794e7d7c4aa847e45c0cbc776 Mon Sep 17 00:00:00 2001 From: Marco Varrone Date: Mon, 28 Oct 2024 09:53:59 +0100 Subject: [PATCH 4/4] Write consolidated data only when output_path is set --- src/spatialdata_io/readers/xenium.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/spatialdata_io/readers/xenium.py 
b/src/spatialdata_io/readers/xenium.py index 96d992d5..6677a307 100644 --- a/src/spatialdata_io/readers/xenium.py +++ b/src/spatialdata_io/readers/xenium.py @@ -452,7 +452,8 @@ def filter(self, record: logging.LogRecord) -> bool: ) del sdata.images[key] - sdata.write_consolidated_metadata() + if output_path is not None: + sdata.write_consolidated_metadata() return sdata