From 601a1223d7255e754b1f90046b08d2d1d8d0df75 Mon Sep 17 00:00:00 2001 From: LLehner Date: Thu, 24 Oct 2024 16:10:13 +0200 Subject: [PATCH 01/11] update seqfish reader --- src/spatialdata_io/_constants/_constants.py | 11 +- src/spatialdata_io/readers/_utils/_utils.py | 9 +- src/spatialdata_io/readers/seqfish.py | 136 +++++++++++++------- 3 files changed, 99 insertions(+), 57 deletions(-) diff --git a/src/spatialdata_io/_constants/_constants.py b/src/spatialdata_io/_constants/_constants.py index 34848137..57f50db0 100644 --- a/src/spatialdata_io/_constants/_constants.py +++ b/src/spatialdata_io/_constants/_constants.py @@ -66,14 +66,15 @@ class SeqfishKeys(ModeEnum): # file extensions CSV_FILE = ".csv" TIFF_FILE = ".tiff" - OME_TIFF_FILE = ".ome.tiff" + GEOJSON_FILE = ".geojson" # file identifiers - SECTION = "section" - TRANSCRIPT_COORDINATES = "TranscriptCoordinates" + SECTION = "_section" + TRANSCRIPT_COORDINATES = "TranscriptList" DAPI = "DAPI" - COUNTS_FILE = "CxG" - CELL_MASK_FILE = "CellMask" + COUNTS_FILE = "CellxGene" + SEGMENTATION = "Segmentation" CELL_COORDINATES = "CellCoordinates" + BOUNDARIES = "Boundaries" # transcripts TRANSCRIPTS_X = "x" TRANSCRIPTS_Y = "y" diff --git a/src/spatialdata_io/readers/_utils/_utils.py b/src/spatialdata_io/readers/_utils/_utils.py index 4623fced..8c9a412a 100644 --- a/src/spatialdata_io/readers/_utils/_utils.py +++ b/src/spatialdata_io/readers/_utils/_utils.py @@ -3,7 +3,7 @@ import os from collections.abc import Mapping from pathlib import Path -from typing import Any, Optional, Union +from typing import TYPE_CHECKING, Any, Optional, Union import numpy as np from anndata import AnnData, read_text @@ -13,15 +13,14 @@ PathLike = Union[os.PathLike, str] # type:ignore[type-arg] -try: +if TYPE_CHECKING: from numpy.typing import NDArray - NDArrayA = NDArray[Any] -except (ImportError, TypeError): - NDArray = np.ndarray +else: NDArrayA = np.ndarray + def _read_counts( path: str | Path, counts_file: str, diff --git a/src/spatialdata_io/readers/seqfish.py b/src/spatialdata_io/readers/seqfish.py index b82b2eef..e1f97f42 100644 --- a/src/spatialdata_io/readers/seqfish.py +++ b/src/spatialdata_io/readers/seqfish.py @@ -5,7 +5,7 @@ from collections.abc import Mapping from pathlib import Path from types import MappingProxyType -from typing import Any +from typing import Any, Literal, Optional, Union import anndata as ad import numpy as np @@ -19,7 +19,7 @@ ShapesModel, TableModel, ) -from spatialdata.transformations import Identity +from spatialdata.transformations.transformations import Identity from spatialdata_io._constants._constants import SeqfishKeys as SK from spatialdata_io._docs import inject_docs @@ -33,7 +33,9 @@ def seqfish( load_images: bool = True, load_labels: bool = True, load_points: bool = True, - sections: list[int] | None = None, + load_shapes: bool = True, + load_additional_shapes: Union[Literal["segmentation", "boundaries", "all"], str, None] = None, + sections: Optional[list[int]] = None, imread_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> SpatialData: """ @@ -43,8 +45,8 @@ def seqfish( - ```{vx.COUNTS_FILE!r}{vx.SECTION!r}{vx.CSV_FILE!r}```: Counts and metadata file. - ```{vx.CELL_COORDINATES!r}{vx.SECTION!r}{vx.CSV_FILE!r}```: Cell coordinates file. - - ```{vx.DAPI!r}{vx.SECTION!r}{vx.OME_TIFF_FILE!r}```: High resolution tiff image. - - ```{vx.CELL_MASK_FILE!r}{vx.SECTION!r}{vx.TIFF_FILE!r}```: Cell mask file. + - ```{vx.DAPI!r}{vx.SECTION!r}{vx.TIFF_FILE!r}```: High resolution tiff image. + - ```{vx.SEGMENTATION!r}{vx.SECTION!r}{vx.TIFF_FILE!r}```: Cell mask file. - ```{vx.TRANSCRIPT_COORDINATES!r}{vx.SECTION!r}{vx.CSV_FILE!r}```: Transcript coordinates file. .. seealso:: @@ -58,11 +60,16 @@ def seqfish( load_images Whether to load the images. load_labels - Whether to load the labels. + Whether to load cell segmentation. load_points - Whether to load the points. + Whether to load the transcript locations. + load_shapes + Whether to load cells as shape. + load_additional_shapes + Whether to load additional shapes such as segmentation or boundaries for both cells and nuclei. sections - Which sections (specified as integers) to load. By default, all sections are loaded. + Which sections (specified as integers) to load. Only necessary if multiple sections present. + If "all" is specified, reads all remaining .tiff and .geojson files in the directory. imread_kwargs Keyword arguments to pass to :func:`dask_image.imread.imread`. @@ -71,7 +78,7 @@ def seqfish( :class:`spatialdata.SpatialData` """ path = Path(path) - count_file_pattern = re.compile(rf"(.*?)_{SK.CELL_COORDINATES}_{SK.SECTION}[0-9]+" + re.escape(SK.CSV_FILE)) + count_file_pattern = re.compile(rf"(.*?){re.escape(SK.CELL_COORDINATES)}.*{re.escape(SK.CSV_FILE)}$") count_files = [i for i in os.listdir(path) if count_file_pattern.match(i)] if not count_files: # no file matching tbe pattern found @@ -85,52 +92,56 @@ def seqfish( n = len(count_files) all_sections = list(range(1, n + 1)) - if sections is None: - sections = all_sections - else: + sections_str: list[Optional[str]] = [] + if sections is not None: for section in sections: if section not in all_sections: raise ValueError(f"Section {section} not found in the data.") - sections_str = [f"{SK.SECTION}{x}" for x in sections] + else: + sections_str.append(f"{SK.SECTION}{section}") + elif sections is None: + sections_str.append(None) + else: + raise ValueError("Invalid type for 'section'. Must be list[int] or None.") - def get_cell_file(section: str) -> str: - return f"{prefix}_{SK.CELL_COORDINATES}_{section}{SK.CSV_FILE}" + def get_cell_file(section: str | None) -> str: + return f"{prefix}{SK.CELL_COORDINATES}{section or ''}{SK.CSV_FILE}" - def get_count_file(section: str) -> str: - return f"{prefix}_{SK.COUNTS_FILE}_{section}{SK.CSV_FILE}" + def get_count_file(section: str | None) -> str: + return f"{prefix}{SK.COUNTS_FILE}{section or ''}{SK.CSV_FILE}" - def get_dapi_file(section: str) -> str: - return f"{prefix}_{SK.DAPI}_{section}{SK.OME_TIFF_FILE}" + def get_dapi_file(section: str | None) -> str: + return f"{prefix}{SK.DAPI}{section or ''}{SK.TIFF_FILE}" - def get_cell_mask_file(section: str) -> str: - return f"{prefix}_{SK.CELL_MASK_FILE}_{section}{SK.TIFF_FILE}" + def get_cell_mask_file(section: str | None) -> str: + return f"{prefix}{SK.SEGMENTATION}{section or ''}{SK.TIFF_FILE}" - def get_transcript_file(section: str) -> str: - return f"{prefix}_{SK.TRANSCRIPT_COORDINATES}_{section}{SK.CSV_FILE}" + def get_transcript_file(section: str | None) -> str: + return f"{prefix}{SK.TRANSCRIPT_COORDINATES}{section or ''}{SK.CSV_FILE}" - adatas: dict[str, ad.AnnData] = {} - for section in sections_str: # type: ignore[assignment] - assert isinstance(section, str) - cell_file = get_cell_file(section) - count_matrix = get_count_file(section) + adatas: dict[Optional[str], ad.AnnData] = {} + for section_str in sections_str: + assert isinstance(section_str, str) or section_str is None + cell_file = get_cell_file(section_str) + count_matrix = get_count_file(section_str) adata = ad.read_csv(path / count_matrix, delimiter=",") cell_info = pd.read_csv(path / cell_file, delimiter=",") adata.obsm[SK.SPATIAL_KEY] = cell_info[[SK.CELL_X, SK.CELL_Y]].to_numpy() adata.obs[SK.AREA] = np.reshape(cell_info[SK.AREA].to_numpy(), (-1, 1)) - region = f"cells_{section}" + region = f"cells_{section_str}" adata.obs[SK.REGION_KEY] = region adata.obs[SK.INSTANCE_KEY_TABLE] = adata.obs.index.astype(int) - adatas[section] = adata + adatas[section_str] = adata scale_factors = [2, 2, 2, 2] if load_images: images = { - f"image_{x}": Image2DModel.parse( + f"{os.path.splitext(get_dapi_file(x))[0]}": Image2DModel.parse( imread(path / get_dapi_file(x), **imread_kwargs), dims=("c", "y", "x"), scale_factors=scale_factors, - transformations={x: Identity()}, + transformations={"global": Identity()}, ) for x in sections_str } @@ -139,11 +150,11 @@ def get_transcript_file(section: str) -> str: if load_labels: labels = { - f"labels_{x}": Labels2DModel.parse( + f"{os.path.splitext(get_cell_mask_file(x))[0]}": Labels2DModel.parse( imread(path / get_cell_mask_file(x), **imread_kwargs).squeeze(), dims=("y", "x"), scale_factors=scale_factors, - transformations={x: Identity()}, + transformations={"global": Identity()}, ) for x in sections_str } @@ -152,12 +163,12 @@ def get_transcript_file(section: str) -> str: if load_points: points = { - f"transcripts_{x}": PointsModel.parse( + f"{os.path.splitext(get_transcript_file(x))[0]}": PointsModel.parse( pd.read_csv(path / get_transcript_file(x), delimiter=","), coordinates={"x": SK.TRANSCRIPTS_X, "y": SK.TRANSCRIPTS_Y}, feature_key=SK.FEATURE_KEY.value, - instance_key=SK.INSTANCE_KEY_POINTS.value, - transformations={x: Identity()}, + # instance_key=SK.INSTANCE_KEY_POINTS.value, # TODO: should be optional but parameter might get deprecated anyway + transformations={"global": Identity()}, ) for x in sections_str } @@ -174,16 +185,47 @@ def get_transcript_file(section: str) -> str: instance_key=SK.INSTANCE_KEY_TABLE.value, ) - shapes = { - f"cells_{x}": ShapesModel.parse( - adata.obsm[SK.SPATIAL_KEY], - geometry=0, - radius=np.sqrt(adata.obs[SK.AREA].to_numpy() / np.pi), - index=adata.obs[SK.INSTANCE_KEY_TABLE].copy(), - transformations={x: Identity()}, - ) - for x, adata in adatas.items() - } + if load_shapes: + shapes = { + f"{os.path.splitext(get_cell_file(x))[0]}": ShapesModel.parse( + adata.obsm[SK.SPATIAL_KEY], + geometry=0, + radius=np.sqrt(adata.obs[SK.AREA].to_numpy() / np.pi), + index=adata.obs[SK.INSTANCE_KEY_TABLE].copy(), + transformations={"global": Identity()}, + ) + for x, adata in adatas.items() + } + else: + shapes = {} + + if load_additional_shapes is not None: + shape_file_names = [] + for filename in os.listdir(path): + if filename.endswith((SK.TIFF_FILE, SK.GEOJSON_FILE)): + if load_additional_shapes == "all": + if not any(key in filename for key in images.keys()) and not any( + key in filename for key in labels.keys() + ): + shape_file_names.append(filename) + elif load_additional_shapes == "segmentation": + if SK.SEGMENTATION in filename and not any(key in filename for key in labels.keys()): + shape_file_names.append(filename) + elif load_additional_shapes == "boundaries": + if SK.BOUNDARIES in filename: + shape_file_names.append(filename) + elif isinstance(load_additional_shapes, str): + if load_additional_shapes in filename: + shape_file_names.append(filename) + else: + raise ValueError(f"No file found with identifier {load_additional_shapes}") + + for x in range(len(shape_file_names)): + shapes[f"{os.path.splitext(shape_file_names[x])[0]}"] = ShapesModel.parse( + path / shape_file_names[x], + index=adata.obs[SK.INSTANCE_KEY_TABLE].copy(), + transformations={"global": Identity()}, + ) sdata = SpatialData(images=images, labels=labels, points=points, table=table, shapes=shapes) From 044ab9d168c2d755e44f4480e9f1f77b8d75a4d0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 24 Oct 2024 14:19:08 +0000 Subject: [PATCH 02/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/spatialdata_io/readers/_utils/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/spatialdata_io/readers/_utils/_utils.py b/src/spatialdata_io/readers/_utils/_utils.py index 8c9a412a..f7f3abe5 100644 --- a/src/spatialdata_io/readers/_utils/_utils.py +++ b/src/spatialdata_io/readers/_utils/_utils.py @@ -15,12 +15,12 @@ if TYPE_CHECKING: from numpy.typing import NDArray + NDArrayA = NDArray[Any] else: NDArrayA = np.ndarray - def _read_counts( path: str | Path, counts_file: str, From 233946ae30b97665274bf197544929aaab601252 Mon Sep 17 00:00:00 2001 From: Laurens Lehner Date: Tue, 12 Nov 2024 18:32:59 +0100 Subject: [PATCH 03/11] Update prefixes; Add scalefactors --- src/spatialdata_io/_constants/_constants.py | 4 +- src/spatialdata_io/readers/seqfish.py | 129 ++++++++++++-------- 2 files changed, 78 insertions(+), 55 deletions(-) diff --git a/src/spatialdata_io/_constants/_constants.py b/src/spatialdata_io/_constants/_constants.py index 57f50db0..92d26db3 100644 --- a/src/spatialdata_io/_constants/_constants.py +++ b/src/spatialdata_io/_constants/_constants.py @@ -68,7 +68,7 @@ class SeqfishKeys(ModeEnum): TIFF_FILE = ".tiff" GEOJSON_FILE = ".geojson" # file identifiers - SECTION = "_section" + ROI = "Roi" TRANSCRIPT_COORDINATES = "TranscriptList" DAPI = "DAPI" COUNTS_FILE = "CellxGene" @@ -88,6 +88,8 @@ class SeqfishKeys(ModeEnum): SPATIAL_KEY = "spatial" REGION_KEY = "region" INSTANCE_KEY_TABLE = "instance_id" + SCALEFEFACTOR_X = "PhysicalSizeX" + SCALEFEFACTOR_Y = "PhysicalSizeY" @unique diff --git a/src/spatialdata_io/readers/seqfish.py b/src/spatialdata_io/readers/seqfish.py index e1f97f42..0ce6f144 100644 --- a/src/spatialdata_io/readers/seqfish.py +++ b/src/spatialdata_io/readers/seqfish.py @@ -2,6 +2,7 @@ import os import re +import xml.etree.ElementTree as ET from collections.abc import Mapping from pathlib import Path from types import MappingProxyType @@ -10,6 +11,7 @@ import anndata as ad import numpy as np import pandas as pd +import tifffile from dask_image.imread import imread from spatialdata import SpatialData from spatialdata.models import ( @@ -19,7 +21,7 @@ ShapesModel, TableModel, ) -from spatialdata.transformations.transformations import Identity +from spatialdata.transformations.transformations import Identity, Scale from spatialdata_io._constants._constants import SeqfishKeys as SK from spatialdata_io._docs import inject_docs @@ -35,7 +37,7 @@ def seqfish( load_points: bool = True, load_shapes: bool = True, load_additional_shapes: Union[Literal["segmentation", "boundaries", "all"], str, None] = None, - sections: Optional[list[int]] = None, + ROIs: Optional[list[int]] = None, imread_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> SpatialData: """ @@ -43,11 +45,11 @@ def seqfish( This function reads the following files: - - ```{vx.COUNTS_FILE!r}{vx.SECTION!r}{vx.CSV_FILE!r}```: Counts and metadata file. - - ```{vx.CELL_COORDINATES!r}{vx.SECTION!r}{vx.CSV_FILE!r}```: Cell coordinates file. - - ```{vx.DAPI!r}{vx.SECTION!r}{vx.TIFF_FILE!r}```: High resolution tiff image. - - ```{vx.SEGMENTATION!r}{vx.SECTION!r}{vx.TIFF_FILE!r}```: Cell mask file. - - ```{vx.TRANSCRIPT_COORDINATES!r}{vx.SECTION!r}{vx.CSV_FILE!r}```: Transcript coordinates file. + - ```{vx.ROI!r}{vx.COUNTS_FILE!r}{vx.CSV_FILE!r}```: Counts and metadata file. + - ```{vx.ROI!r}{vx.CELL_COORDINATES!r}{vx.CSV_FILE!r}```: Cell coordinates file. + - ```{vx.ROI!r}{vx.DAPI!r}{vx.TIFF_FILE!r}```: High resolution tiff image. + - ```{vx.ROI!r}{vx.SEGMENTATION!r}{vx.TIFF_FILE!r}```: Cell mask file. + - ```{vx.ROI!r}{vx.TRANSCRIPT_COORDINATES!r}{vx.CSV_FILE!r}```: Transcript coordinates file. .. seealso:: @@ -67,8 +69,8 @@ def seqfish( Whether to load cells as shape. load_additional_shapes Whether to load additional shapes such as segmentation or boundaries for both cells and nuclei. - sections - Which sections (specified as integers) to load. Only necessary if multiple sections present. + ROIs + Which ROIs (specified as integers) to load. Only necessary if multiple ROIs present. If "all" is specified, reads all remaining .tiff and .geojson files in the directory. imread_kwargs Keyword arguments to pass to :func:`dask_image.imread.imread`. @@ -80,6 +82,9 @@ def seqfish( path = Path(path) count_file_pattern = re.compile(rf"(.*?){re.escape(SK.CELL_COORDINATES)}.*{re.escape(SK.CSV_FILE)}$") count_files = [i for i in os.listdir(path) if count_file_pattern.match(i)] + roi_pattern = re.compile(f"^{SK.ROI}(\\d+)") + # n_rois = {roi_pattern.match(i).group(1) for i in os.listdir(path) if roi_pattern.match(i)} + n_rois = {m.group(1) for i in os.listdir(path) if (m := roi_pattern.match(i))} if not count_files: # no file matching tbe pattern found raise ValueError( @@ -88,50 +93,53 @@ def seqfish( matched = count_file_pattern.match(count_files[0]) if matched is None: raise ValueError(f"File {count_files[0]} does not match the pattern {count_file_pattern}") - prefix = matched.group(1) - - n = len(count_files) - all_sections = list(range(1, n + 1)) - sections_str: list[Optional[str]] = [] - if sections is not None: - for section in sections: - if section not in all_sections: - raise ValueError(f"Section {section} not found in the data.") - else: - sections_str.append(f"{SK.SECTION}{section}") - elif sections is None: - sections_str.append(None) + # prefix = matched.group(1) + + rois_str: list[str] = [] + if ROIs is None: + for roi in n_rois: + rois_str.append(f"{SK.ROI}{roi}") + elif isinstance(ROIs, list): + for roi in ROIs: + if str(roi) not in n_rois: + raise ValueError(f"ROI{roi} not found.") + rois_str.append(f"{SK.ROI}{roi}") else: - raise ValueError("Invalid type for 'section'. Must be list[int] or None.") + raise ValueError("Invalid type for 'roi'. Must be list[int] or None.") - def get_cell_file(section: str | None) -> str: - return f"{prefix}{SK.CELL_COORDINATES}{section or ''}{SK.CSV_FILE}" + def get_cell_file(roi: str) -> str: + return f"{roi}_{SK.CELL_COORDINATES}{SK.CSV_FILE}" - def get_count_file(section: str | None) -> str: - return f"{prefix}{SK.COUNTS_FILE}{section or ''}{SK.CSV_FILE}" + def get_count_file(roi: str) -> str: + return f"{roi}_{SK.COUNTS_FILE}{SK.CSV_FILE}" - def get_dapi_file(section: str | None) -> str: - return f"{prefix}{SK.DAPI}{section or ''}{SK.TIFF_FILE}" + def get_dapi_file(roi: str) -> str: + return f"{roi}_{SK.DAPI}{SK.TIFF_FILE}" - def get_cell_mask_file(section: str | None) -> str: - return f"{prefix}{SK.SEGMENTATION}{section or ''}{SK.TIFF_FILE}" + def get_cell_mask_file(roi: str) -> str: + return f"{roi}_{SK.SEGMENTATION}{SK.TIFF_FILE}" - def get_transcript_file(section: str | None) -> str: - return f"{prefix}{SK.TRANSCRIPT_COORDINATES}{section or ''}{SK.CSV_FILE}" + def get_transcript_file(roi: str) -> str: + return f"{roi}_{SK.TRANSCRIPT_COORDINATES}{SK.CSV_FILE}" + scaled = {} adatas: dict[Optional[str], ad.AnnData] = {} - for section_str in sections_str: - assert isinstance(section_str, str) or section_str is None - cell_file = get_cell_file(section_str) - count_matrix = get_count_file(section_str) + for roi_str in rois_str: + assert isinstance(roi_str, str) or roi_str is None + cell_file = get_cell_file(roi_str) + count_matrix = get_count_file(roi_str) adata = ad.read_csv(path / count_matrix, delimiter=",") cell_info = pd.read_csv(path / cell_file, delimiter=",") adata.obsm[SK.SPATIAL_KEY] = cell_info[[SK.CELL_X, SK.CELL_Y]].to_numpy() adata.obs[SK.AREA] = np.reshape(cell_info[SK.AREA].to_numpy(), (-1, 1)) - region = f"cells_{section_str}" + region = f"cells_{roi_str}" adata.obs[SK.REGION_KEY] = region adata.obs[SK.INSTANCE_KEY_TABLE] = adata.obs.index.astype(int) - adatas[section_str] = adata + adatas[roi_str] = adata + scaled[roi_str] = Scale( + np.array(_get_scale_factors(path / get_dapi_file(roi_str), SK.SCALEFEFACTOR_X, SK.SCALEFEFACTOR_Y)), + axes=("y", "x"), + ) scale_factors = [2, 2, 2, 2] @@ -141,9 +149,9 @@ def get_transcript_file(section: str | None) -> str: imread(path / get_dapi_file(x), **imread_kwargs), dims=("c", "y", "x"), scale_factors=scale_factors, - transformations={"global": Identity()}, + transformations={"global": scaled[x]}, ) - for x in sections_str + for x in rois_str } else: images = {} @@ -156,7 +164,7 @@ def get_transcript_file(section: str | None) -> str: scale_factors=scale_factors, transformations={"global": Identity()}, ) - for x in sections_str + for x in rois_str } else: labels = {} @@ -170,20 +178,22 @@ def get_transcript_file(section: str | None) -> str: # instance_key=SK.INSTANCE_KEY_POINTS.value, # TODO: should be optional but parameter might get deprecated anyway transformations={"global": Identity()}, ) - for x in sections_str + for x in rois_str } else: points = {} - adata = ad.concat(adatas.values()) - adata.obs[SK.REGION_KEY] = adata.obs[SK.REGION_KEY].astype("category") - adata.obs = adata.obs.reset_index(drop=True) - table = TableModel.parse( - adata, - region=[f"cells_{x}" for x in sections_str], - region_key=SK.REGION_KEY.value, - instance_key=SK.INSTANCE_KEY_TABLE.value, - ) + # adata = ad.concat(adatas.values()) + tables = {} + for name, adata in adatas.items(): + adata.obs[SK.REGION_KEY] = adata.obs[SK.REGION_KEY].astype("category") + adata.obs = adata.obs.reset_index(drop=True) + tables[name] = TableModel.parse( + adata, + region=f"cells_{name}", + region_key=SK.REGION_KEY.value, + instance_key=SK.INSTANCE_KEY_TABLE.value, + ) if load_shapes: shapes = { @@ -194,7 +204,7 @@ def get_transcript_file(section: str | None) -> str: index=adata.obs[SK.INSTANCE_KEY_TABLE].copy(), transformations={"global": Identity()}, ) - for x, adata in adatas.items() + for x, adata in zip(rois_str, adatas.values()) } else: shapes = {} @@ -227,6 +237,17 @@ def get_transcript_file(section: str | None) -> str: transformations={"global": Identity()}, ) - sdata = SpatialData(images=images, labels=labels, points=points, table=table, shapes=shapes) + sdata = SpatialData(images=images, labels=labels, points=points, tables=tables, shapes=shapes) return sdata + + +def _get_scale_factors(DAPI_path: Path, scalefactor_x_key: str, scalefactor_y_key: str) -> list[float]: + with tifffile.TiffFile(DAPI_path) as tif: + ome_metadata = tif.ome_metadata + root = ET.fromstring(ome_metadata) + for element in root.iter(): + if scalefactor_x_key in element.attrib.keys(): + scalefactor_x = element.attrib[scalefactor_x_key] + scalefactor_y = element.attrib[scalefactor_y_key] + return [float(scalefactor_x), float(scalefactor_y)] From d9ec1f51dcd889ab8fd6919b22e3e42df78467b6 Mon Sep 17 00:00:00 2001 From: Laurens Lehner Date: Tue, 12 Nov 2024 18:42:56 +0100 Subject: [PATCH 04/11] Remove unused code --- src/spatialdata_io/readers/seqfish.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/spatialdata_io/readers/seqfish.py b/src/spatialdata_io/readers/seqfish.py index 0ce6f144..dac52f99 100644 --- a/src/spatialdata_io/readers/seqfish.py +++ b/src/spatialdata_io/readers/seqfish.py @@ -83,17 +83,14 @@ def seqfish( count_file_pattern = re.compile(rf"(.*?){re.escape(SK.CELL_COORDINATES)}.*{re.escape(SK.CSV_FILE)}$") count_files = [i for i in os.listdir(path) if count_file_pattern.match(i)] roi_pattern = re.compile(f"^{SK.ROI}(\\d+)") - # n_rois = {roi_pattern.match(i).group(1) for i in os.listdir(path) if roi_pattern.match(i)} n_rois = {m.group(1) for i in os.listdir(path) if (m := roi_pattern.match(i))} if not count_files: - # no file matching tbe pattern found raise ValueError( f"No files matching the pattern {count_file_pattern} were found. Cannot infer the naming scheme." ) matched = count_file_pattern.match(count_files[0]) if matched is None: raise ValueError(f"File {count_files[0]} does not match the pattern {count_file_pattern}") - # prefix = matched.group(1) rois_str: list[str] = [] if ROIs is None: @@ -183,7 +180,6 @@ def get_transcript_file(roi: str) -> str: else: points = {} - # adata = ad.concat(adatas.values()) tables = {} for name, adata in adatas.items(): adata.obs[SK.REGION_KEY] = adata.obs[SK.REGION_KEY].astype("category") From c8fc779ca51559e2c2593953de93b50114b57ea6 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Tue, 10 Dec 2024 16:13:33 +0100 Subject: [PATCH 05/11] rename variable --- src/spatialdata_io/readers/seqfish.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/spatialdata_io/readers/seqfish.py b/src/spatialdata_io/readers/seqfish.py index 028753fb..187c178b 100644 --- a/src/spatialdata_io/readers/seqfish.py +++ b/src/spatialdata_io/readers/seqfish.py @@ -37,7 +37,7 @@ def seqfish( load_points: bool = True, load_shapes: bool = True, load_additional_shapes: Literal["segmentation", "boundaries", "all"] | str | None = None, - ROIs: list[int] | None = None, + rois: list[int] | None = None, imread_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> SpatialData: """ @@ -69,7 +69,7 @@ def seqfish( Whether to load cells as shape. load_additional_shapes Whether to load additional shapes such as segmentation or boundaries for both cells and nuclei. - ROIs + rois Which ROIs (specified as integers) to load. Only necessary if multiple ROIs present. If "all" is specified, reads all remaining .tiff and .geojson files in the directory. imread_kwargs @@ -93,11 +93,11 @@ def seqfish( raise ValueError(f"File {count_files[0]} does not match the pattern {count_file_pattern}") rois_str: list[str] = [] - if ROIs is None: + if rois is None: for roi in n_rois: rois_str.append(f"{SK.ROI}{roi}") - elif isinstance(ROIs, list): - for roi in ROIs: + elif isinstance(rois, list): + for roi in rois: if str(roi) not in n_rois: raise ValueError(f"ROI{roi} not found.") rois_str.append(f"{SK.ROI}{roi}") From f219088cc055f5c6bf66f14232ffdbad05450b5a Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Wed, 11 Dec 2024 20:16:54 +0100 Subject: [PATCH 06/11] fix scale factor labels; fix table --- src/spatialdata_io/readers/seqfish.py | 133 ++++++++++++++++++-------- 1 file changed, 93 insertions(+), 40 deletions(-) diff --git a/src/spatialdata_io/readers/seqfish.py b/src/spatialdata_io/readers/seqfish.py index 187c178b..d7aa656a 100644 --- a/src/spatialdata_io/readers/seqfish.py +++ b/src/spatialdata_io/readers/seqfish.py @@ -37,8 +37,10 @@ def seqfish( load_points: bool = True, load_shapes: bool = True, load_additional_shapes: Literal["segmentation", "boundaries", "all"] | str | None = None, + cells_as_circles: bool = False, rois: list[int] | None = None, imread_kwargs: Mapping[str, Any] = MappingProxyType({}), + raster_models_scale_factors: list[float] | None = None, ) -> SpatialData: """ Read *seqfish* formatted dataset. @@ -69,38 +71,47 @@ def seqfish( Whether to load cells as shape. load_additional_shapes Whether to load additional shapes such as segmentation or boundaries for both cells and nuclei. + If "all" is specified, reads all remaining .tiff and .geojson files in the directory. + cells_as_circles + Whether to read cells also as circles instead of labels. rois Which ROIs (specified as integers) to load. Only necessary if multiple ROIs present. - If "all" is specified, reads all remaining .tiff and .geojson files in the directory. imread_kwargs Keyword arguments to pass to :func:`dask_image.imread.imread`. Returns ------- :class:`spatialdata.SpatialData` + + Examples + -------- + This code shows how to change the annotation target of the table from the cell labels to the cell cirlces. + Please check that Roi1 is present in your dataset, otherwise adjust the code below. + >>> from spatialdata_io import seqfish + >>> sdata = seqfish("path/to/raw/data") + >>> sdata["table"].obs["region"] = "Roi1_CellCoordinates" + >>> sdata.set_table_annotates_spatialelement( + ... table_name="Roi1", region="Roi1_CellCoordinates", region_key="region", instance_key="instance_id" + ... ) + >>> sdata.write("path/to/data.zarr") """ path = Path(path) - count_file_pattern = re.compile(rf"(.*?){re.escape(SK.CELL_COORDINATES)}.*{re.escape(SK.CSV_FILE)}$") - count_files = [i for i in os.listdir(path) if count_file_pattern.match(i)] - roi_pattern = re.compile(f"^{SK.ROI}(\\d+)") - n_rois = {m.group(1) for i in os.listdir(path) if (m := roi_pattern.match(i))} + count_file_pattern = re.compile(rf"(.*?){re.escape(SK.CELL_COORDINATES)}{re.escape(SK.CSV_FILE)}$") + count_files = [f for f in os.listdir(path) if count_file_pattern.match(f)] if not count_files: raise ValueError( f"No files matching the pattern {count_file_pattern} were found. Cannot infer the naming scheme." ) - matched = count_file_pattern.match(count_files[0]) - if matched is None: - raise ValueError(f"File {count_files[0]} does not match the pattern {count_file_pattern}") - rois_str: list[str] = [] + roi_pattern = re.compile(f"^{SK.ROI}(\\d+)") + found_rois = {m.group(1) for i in os.listdir(path) if (m := roi_pattern.match(i))} if rois is None: - for roi in n_rois: - rois_str.append(f"{SK.ROI}{roi}") + rois_str = [f"{SK.ROI}{roi}" for roi in found_rois] elif isinstance(rois, list): for roi in rois: - if str(roi) not in n_rois: + if str(roi) not in found_rois: raise ValueError(f"ROI{roi} not found.") - rois_str.append(f"{SK.ROI}{roi}") + rois_str = [f"{SK.ROI}{roi}" for roi in rois] else: raise ValueError("Invalid type for 'roi'. Must be list[int] or None.") @@ -119,33 +130,54 @@ def get_cell_mask_file(roi: str) -> str: def get_transcript_file(roi: str) -> str: return f"{roi}_{SK.TRANSCRIPT_COORDINATES}{SK.CSV_FILE}" - scaled = {} - adatas: dict[str, ad.AnnData] = {} + # parse table information + tables: dict[str, ad.AnnData] = {} for roi_str in rois_str: - assert isinstance(roi_str, str) or roi_str is None - cell_file = get_cell_file(roi_str) + # parse cell gene expression data count_matrix = get_count_file(roi_str) - adata = ad.read_csv(path / count_matrix, delimiter=",") + df = pd.read_csv(path / count_matrix, delimiter=",") + instance_id = df.iloc[:, 0].astype(str) + expression = df.drop(columns=["Unnamed: 0"]) + expression.set_index(instance_id, inplace=True) + adata = ad.AnnData(expression) + + # parse cell spatial information + cell_file = get_cell_file(roi_str) cell_info = pd.read_csv(path / cell_file, delimiter=",") + cell_info["label"] = cell_info["label"].astype("str") + # below, the obsm are assigned by position, not by index. Here we check that we can do it + assert cell_info["label"].to_numpy().tolist() == adata.obs.index.to_numpy().tolist() + cell_info.set_index("label", inplace=True) + adata.obs[SK.AREA] = cell_info[SK.AREA] adata.obsm[SK.SPATIAL_KEY] = cell_info[[SK.CELL_X, SK.CELL_Y]].to_numpy() - adata.obs[SK.AREA] = np.reshape(cell_info[SK.AREA].to_numpy(), (-1, 1)) - region = f"cells_{roi_str}" + + # map tables to cell labels (defined later) + region = os.path.splitext(get_cell_mask_file(roi_str))[0] adata.obs[SK.REGION_KEY] = region - adata.obs[SK.INSTANCE_KEY_TABLE] = adata.obs.index.astype(int) - adatas[roi_str] = adata + adata.obs[SK.REGION_KEY] = adata.obs[SK.REGION_KEY].astype("category") + adata.obs[SK.INSTANCE_KEY_TABLE] = instance_id.to_numpy().astype(np.uint16) + adata.obs = adata.obs.reset_index(drop=True) + tables[f"table_{roi_str}"] = TableModel.parse( + adata, + region=region, + region_key=SK.REGION_KEY.value, + instance_key=SK.INSTANCE_KEY_TABLE.value, + ) + + # parse scale factors to scale images and labels + scaled = {} + for roi_str in rois_str: scaled[roi_str] = Scale( np.array(_get_scale_factors(path / get_dapi_file(roi_str), SK.SCALEFEFACTOR_X, SK.SCALEFEFACTOR_Y)), axes=("y", "x"), ) - scale_factors = [2, 2, 2, 2] - if load_images: images = { f"{os.path.splitext(get_dapi_file(x))[0]}": Image2DModel.parse( imread(path / get_dapi_file(x), **imread_kwargs), dims=("c", "y", "x"), - scale_factors=scale_factors, + scale_factors=raster_models_scale_factors, transformations={"global": scaled[x]}, ) for x in rois_str @@ -158,8 +190,8 @@ def get_transcript_file(roi: str) -> str: f"{os.path.splitext(get_cell_mask_file(x))[0]}": Labels2DModel.parse( imread(path / get_cell_mask_file(x), **imread_kwargs).squeeze(), dims=("y", "x"), - scale_factors=scale_factors, - transformations={"global": Identity()}, + scale_factors=raster_models_scale_factors, + transformations={"global": scaled[x]}, ) for x in rois_str } @@ -172,7 +204,7 @@ def get_transcript_file(roi: str) -> str: pd.read_csv(path / get_transcript_file(x), delimiter=","), coordinates={"x": SK.TRANSCRIPTS_X, "y": SK.TRANSCRIPTS_Y}, feature_key=SK.FEATURE_KEY.value, - # instance_key=SK.INSTANCE_KEY_POINTS.value, # TODO: should be optional but parameter might get deprecated anyway + instance_key=None, transformations={"global": Identity()}, ) for x in rois_str @@ -180,17 +212,6 @@ def get_transcript_file(roi: str) -> str: else: points = {} - tables = {} - for name, adata in adatas.items(): - adata.obs[SK.REGION_KEY] = adata.obs[SK.REGION_KEY].astype("category") - adata.obs = adata.obs.reset_index(drop=True) - tables[name] = TableModel.parse( - adata, - region=f"cells_{name}", - region_key=SK.REGION_KEY.value, - instance_key=SK.INSTANCE_KEY_TABLE.value, - ) - if load_shapes: shapes = { f"{os.path.splitext(get_cell_file(x))[0]}": ShapesModel.parse( @@ -200,7 +221,7 @@ def get_transcript_file(roi: str) -> str: index=adata.obs[SK.INSTANCE_KEY_TABLE].copy(), transformations={"global": Identity()}, ) - for x, adata in zip(rois_str, adatas.values()) + for x, adata in zip(rois_str, tables.values()) } else: shapes = {} @@ -247,3 +268,35 @@ def _get_scale_factors(DAPI_path: Path, scalefactor_x_key: str, scalefactor_y_ke scalefactor_x = element.attrib[scalefactor_x_key] scalefactor_y = element.attrib[scalefactor_y_key] return [float(scalefactor_x), float(scalefactor_y)] + + +if __name__ == "__main__": + path_read = "/Users/macbook/ssd/biodata/seqfish/instrument 2 official" + sdata = seqfish( + path=path_read, + # load_images=True, + # load_labels=True, + # load_points=True, + # rois=[1], + ) + # sdata.pl.render_labels(color='Arg1').pl.show() + import matplotlib.pyplot as plt + + # plt.show() + # sdata["table_Roi1"].obs["region"] = "Roi1_CellCoordinates" + # sdata.set_table_annotates_spatialelement( + # table_name="table_Roi1", region="Roi1_CellCoordinates", region_key="region", instance_key="instance_id" + # ) + + gene_name = "Arg1" + ( + sdata.pl.render_images("Roi1_DAPI", cmap="gray") + .pl.render_labels("Roi1_Segmentation", color=gene_name) + .pl.render_points("Roi1_TranscriptList", color="name", groups=gene_name, palette="orange") + .pl.show(title=f"{gene_name} expression over DAPI image", coordinate_systems="global", figsize=(10, 5)) + ) + plt.show() + + from napari_spatialdata import Interactive + + Interactive(sdata) From ff14eed2b0b144085c4758f1a286c3c7d3b0b0f2 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Thu, 12 Dec 2024 11:24:57 +0100 Subject: [PATCH 07/11] before removing 'load_additional_shapes' --- src/spatialdata_io/readers/seqfish.py | 90 ++++++++++++++++----------- 1 file changed, 55 insertions(+), 35 deletions(-) diff --git a/src/spatialdata_io/readers/seqfish.py b/src/spatialdata_io/readers/seqfish.py index d7aa656a..c41a940c 100644 --- a/src/spatialdata_io/readers/seqfish.py +++ b/src/spatialdata_io/readers/seqfish.py @@ -36,7 +36,7 @@ def seqfish( load_labels: bool = True, load_points: bool = True, load_shapes: bool = True, - load_additional_shapes: Literal["segmentation", "boundaries", "all"] | str | None = None, + load_additional_geometries: Literal["segmentation", "boundaries", "all"] | str | None = None, cells_as_circles: bool = False, rois: list[int] | None = None, imread_kwargs: Mapping[str, Any] = MappingProxyType({}), @@ -69,9 +69,11 @@ def seqfish( Whether to load the transcript locations. load_shapes Whether to load cells as shape. - load_additional_shapes - Whether to load additional shapes such as segmentation or boundaries for both cells and nuclei. - If "all" is specified, reads all remaining .tiff and .geojson files in the directory. + load_additional_geometries + Whether to load additional shapes such as segmentation or boundaries for both cells and nuclei. If "all" is + specified (default), it reads all remaining .tiff and .geojson files in the directory. Other options are + "segmentation", "boundaries", or a string to match in the filename (as a subset). If `None` is passed, + no additional geometry is loaded. cells_as_circles Whether to read cells also as circles instead of labels. rois @@ -198,19 +200,24 @@ def get_transcript_file(roi: str) -> str: else: labels = {} + points = {} if load_points: - points = { - f"{os.path.splitext(get_transcript_file(x))[0]}": PointsModel.parse( - pd.read_csv(path / get_transcript_file(x), delimiter=","), + for x in rois_str: + + + # prepare data + name = f"{os.path.splitext(get_transcript_file(x))[0]}" + p = pd.read_csv(path / get_transcript_file(x), delimiter=",") + instance_key_points = SK.INSTANCE_KEY_POINTS.value if SK.INSTANCE_KEY_POINTS.value in p.columns else None + + # call parser + points[name] = PointsModel.parse( + p, coordinates={"x": SK.TRANSCRIPTS_X, "y": SK.TRANSCRIPTS_Y}, feature_key=SK.FEATURE_KEY.value, - instance_key=None, + instance_key=instance_key_points, transformations={"global": Identity()}, ) - for x in rois_str - } - else: - points = {} if load_shapes: shapes = { @@ -226,33 +233,45 @@ def get_transcript_file(roi: str) -> str: else: shapes = {} - if load_additional_shapes is not None: - shape_file_names = [] + if load_additional_geometries is not None: + labels_filenames = [] + shapes_filenames = [] + for filename in os.listdir(path): - if filename.endswith((SK.TIFF_FILE, SK.GEOJSON_FILE)): - if load_additional_shapes == "all": - if not any(key in filename for key in images.keys()) and not any( - key in filename for key in labels.keys() - ): - shape_file_names.append(filename) - elif load_additional_shapes == "segmentation": - if SK.SEGMENTATION in filename and not any(key in filename for key in labels.keys()): - shape_file_names.append(filename) - elif load_additional_shapes == "boundaries": - if SK.BOUNDARIES in filename: - shape_file_names.append(filename) - elif isinstance(load_additional_shapes, str): - if load_additional_shapes in filename: - shape_file_names.append(filename) - else: - raise ValueError(f"No file found with identifier {load_additional_shapes}") - - for x in range(len(shape_file_names)): - shapes[f"{os.path.splitext(shape_file_names[x])[0]}"] = ShapesModel.parse( - path / shape_file_names[x], + if filename.endswith(SK.TIFF_FILE): + if ( + load_additional_geometries == "all" + or load_additional_geometries == "segmentation" and SK.SEGMENTATION in filename + or isinstance(load_additional_geometries, str) and load_additional_geometries in filename + ): + labels_filenames.append(filename) + continue + if filename.endswith(SK.GEOJSON_FILE): + if ( + load_additional_geometries == "all" + or load_additional_geometries == "boundaries" and SK.BOUNDARIES in filename + or isinstance(load_additional_geometries, str) and load_additional_geometries in filename + ): + shapes_filenames.append(filename) + continue + raise ValueError(f"No file found with identifier {load_additional_geometries}") + + for labels_filename in labels_filenames: + labels[f"{os.path.splitext(labels_filename)[0]}"] = Labels2DModel.parse( + imread(path / labels_filename, **imread_kwargs).squeeze(), + dims=("y", "x"), + scale_factors=raster_models_scale_factors, + transformations={"global": scaled[x]}, + ) + + for shape_filename in shapes_filenames: + # TODO: check that indices of newly parsed shapes match the indices of the existing shapes + shapes[f"{os.path.splitext(shape_filename)[0]}"] = ShapesModel.parse( + path / shape_filename, index=adata.obs[SK.INSTANCE_KEY_TABLE].copy(), transformations={"global": Identity()}, ) + pass sdata = SpatialData(images=images, labels=labels, points=points, tables=tables, shapes=shapes) @@ -287,6 +306,7 @@ def _get_scale_factors(DAPI_path: Path, scalefactor_x_key: str, scalefactor_y_ke # sdata.set_table_annotates_spatialelement( # table_name="table_Roi1", region="Roi1_CellCoordinates", region_key="region", instance_key="instance_id" # ) + import spatialdata_plot gene_name = "Arg1" ( From a4434695d18829171cd804a1eb932aef9ec71491 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Dec 2024 10:25:18 +0000 Subject: [PATCH 08/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/spatialdata_io/readers/seqfish.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/spatialdata_io/readers/seqfish.py b/src/spatialdata_io/readers/seqfish.py index c41a940c..996c6d4d 100644 --- a/src/spatialdata_io/readers/seqfish.py +++ b/src/spatialdata_io/readers/seqfish.py @@ -204,7 +204,6 @@ def get_transcript_file(roi: str) -> str: if load_points: for x in rois_str: - # prepare data name = f"{os.path.splitext(get_transcript_file(x))[0]}" p = pd.read_csv(path / get_transcript_file(x), delimiter=",") @@ -241,16 +240,20 @@ def get_transcript_file(roi: str) -> str: if filename.endswith(SK.TIFF_FILE): if ( load_additional_geometries == "all" - or load_additional_geometries == "segmentation" and SK.SEGMENTATION in filename - or isinstance(load_additional_geometries, str) and load_additional_geometries in filename + or load_additional_geometries == "segmentation" + and SK.SEGMENTATION in filename + or isinstance(load_additional_geometries, str) + and load_additional_geometries in filename ): labels_filenames.append(filename) continue if filename.endswith(SK.GEOJSON_FILE): if ( load_additional_geometries == "all" - or load_additional_geometries == "boundaries" and SK.BOUNDARIES in filename - or isinstance(load_additional_geometries, str) and load_additional_geometries in filename + or load_additional_geometries == "boundaries" + and SK.BOUNDARIES in filename + or isinstance(load_additional_geometries, str) + and load_additional_geometries in filename ): shapes_filenames.append(filename) continue @@ -271,7 +274,6 @@ def get_transcript_file(roi: str) -> str: index=adata.obs[SK.INSTANCE_KEY_TABLE].copy(), transformations={"global": Identity()}, ) - pass sdata = SpatialData(images=images, labels=labels, points=points, tables=tables, shapes=shapes) @@ -306,7 +308,6 @@ def _get_scale_factors(DAPI_path: Path, scalefactor_x_key: str, scalefactor_y_ke # sdata.set_table_annotates_spatialelement( # table_name="table_Roi1", region="Roi1_CellCoordinates", region_key="region", instance_key="instance_id" # ) - import spatialdata_plot gene_name = "Arg1" ( From 375fa5cc25b3666d4fc7077d18bf256e0f1e7903 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Thu, 12 Dec 2024 13:21:48 +0100 Subject: [PATCH 09/11] finished code review --- src/spatialdata_io/readers/seqfish.py | 115 +++++--------------------- 1 file changed, 22 insertions(+), 93 deletions(-) diff --git a/src/spatialdata_io/readers/seqfish.py b/src/spatialdata_io/readers/seqfish.py index c41a940c..27431c09 100644 --- a/src/spatialdata_io/readers/seqfish.py +++ b/src/spatialdata_io/readers/seqfish.py @@ -6,7 +6,7 @@ from collections.abc import Mapping from pathlib import Path from types import MappingProxyType -from typing import Any, Literal +from typing import Any import anndata as ad import numpy as np @@ -36,7 +36,6 @@ def seqfish( load_labels: bool = True, load_points: bool = True, load_shapes: bool = True, - load_additional_geometries: Literal["segmentation", "boundaries", "all"] | str | None = None, cells_as_circles: bool = False, rois: list[int] | None = None, imread_kwargs: Mapping[str, Any] = MappingProxyType({}), @@ -69,11 +68,6 @@ def seqfish( Whether to load the transcript locations. load_shapes Whether to load cells as shape. - load_additional_geometries - Whether to load additional shapes such as segmentation or boundaries for both cells and nuclei. If "all" is - specified (default), it reads all remaining .tiff and .geojson files in the directory. Other options are - "segmentation", "boundaries", or a string to match in the filename (as a subset). If `None` is passed, - no additional geometry is loaded. cells_as_circles Whether to read cells also as circles instead of labels. rois @@ -87,13 +81,13 @@ def seqfish( Examples -------- - This code shows how to change the annotation target of the table from the cell labels to the cell cirlces. - Please check that Roi1 is present in your dataset, otherwise adjust the code below. + This code shows how to change the annotation target of the table from the cell labels to the cell boundaries. + Please check that the string Roi1 is used in the naming of your dataset, otherwise adjust the code below. >>> from spatialdata_io import seqfish >>> sdata = seqfish("path/to/raw/data") - >>> sdata["table"].obs["region"] = "Roi1_CellCoordinates" + >>> sdata["table_Roi1"].obs["region"] = "Roi1_Boundaries" >>> sdata.set_table_annotates_spatialelement( - ... table_name="Roi1", region="Roi1_CellCoordinates", region_key="region", instance_key="instance_id" + ... table_name="table_Roi1", region="Roi1_Boundaries", region_key="region", instance_key="instance_id" ... ) >>> sdata.write("path/to/data.zarr") """ @@ -126,9 +120,12 @@ def get_count_file(roi: str) -> str: def get_dapi_file(roi: str) -> str: return f"{roi}_{SK.DAPI}{SK.TIFF_FILE}" - def get_cell_mask_file(roi: str) -> str: + def get_cell_segmentation_labels_file(roi: str) -> str: return f"{roi}_{SK.SEGMENTATION}{SK.TIFF_FILE}" + def get_cell_segmentation_shapes_file(roi: str) -> str: + return f"{roi}_{SK.BOUNDARIES}{SK.GEOJSON_FILE}" + def get_transcript_file(roi: str) -> str: return f"{roi}_{SK.TRANSCRIPT_COORDINATES}{SK.CSV_FILE}" @@ -154,7 +151,7 @@ def get_transcript_file(roi: str) -> str: adata.obsm[SK.SPATIAL_KEY] = cell_info[[SK.CELL_X, SK.CELL_Y]].to_numpy() # map tables to cell labels (defined later) - region = os.path.splitext(get_cell_mask_file(roi_str))[0] + region = os.path.splitext(get_cell_segmentation_labels_file(roi_str))[0] adata.obs[SK.REGION_KEY] = region adata.obs[SK.REGION_KEY] = adata.obs[SK.REGION_KEY].astype("category") adata.obs[SK.INSTANCE_KEY_TABLE] = instance_id.to_numpy().astype(np.uint16) @@ -189,8 +186,8 @@ def get_transcript_file(roi: str) -> str: if load_labels: labels = { - f"{os.path.splitext(get_cell_mask_file(x))[0]}": Labels2DModel.parse( - imread(path / get_cell_mask_file(x), **imread_kwargs).squeeze(), + f"{os.path.splitext(get_cell_segmentation_labels_file(x))[0]}": Labels2DModel.parse( + imread(path / get_cell_segmentation_labels_file(x), **imread_kwargs).squeeze(), dims=("y", "x"), scale_factors=raster_models_scale_factors, transformations={"global": scaled[x]}, @@ -204,7 +201,6 @@ def get_transcript_file(roi: str) -> str: if load_points: for x in rois_str: - # prepare data name = f"{os.path.splitext(get_transcript_file(x))[0]}" p = pd.read_csv(path / get_transcript_file(x), delimiter=",") @@ -219,59 +215,25 @@ def get_transcript_file(roi: str) -> str: transformations={"global": Identity()}, ) - if load_shapes: - shapes = { - f"{os.path.splitext(get_cell_file(x))[0]}": ShapesModel.parse( + shapes = {} + if cells_as_circles: + for x, adata in zip(rois_str, tables.values()): + shapes[f"{os.path.splitext(get_cell_file(x))[0]}"] = ShapesModel.parse( adata.obsm[SK.SPATIAL_KEY], geometry=0, radius=np.sqrt(adata.obs[SK.AREA].to_numpy() / np.pi), index=adata.obs[SK.INSTANCE_KEY_TABLE].copy(), transformations={"global": Identity()}, ) - for x, adata in zip(rois_str, tables.values()) - } - else: - shapes = {} - - if load_additional_geometries is not None: - labels_filenames = [] - shapes_filenames = [] - - for filename in os.listdir(path): - if filename.endswith(SK.TIFF_FILE): - if ( - load_additional_geometries == "all" - or load_additional_geometries == "segmentation" and SK.SEGMENTATION in filename - or isinstance(load_additional_geometries, str) and load_additional_geometries in filename - ): - labels_filenames.append(filename) - continue - if filename.endswith(SK.GEOJSON_FILE): - if ( - load_additional_geometries == "all" - or load_additional_geometries == "boundaries" and SK.BOUNDARIES in filename - or isinstance(load_additional_geometries, str) and load_additional_geometries in filename - ): - shapes_filenames.append(filename) - continue - raise ValueError(f"No file found with identifier {load_additional_geometries}") - - for labels_filename in labels_filenames: - labels[f"{os.path.splitext(labels_filename)[0]}"] = Labels2DModel.parse( - imread(path / labels_filename, **imread_kwargs).squeeze(), - dims=("y", "x"), - scale_factors=raster_models_scale_factors, + if load_shapes: + for x in rois_str: + # this assumes that the index matches the instance key of the table. A more robust approach could be + # implemented, as described here https://github.com/scverse/spatialdata-io/issues/249 + shapes[f"{os.path.splitext(get_cell_segmentation_shapes_file(x))[0]}"] = ShapesModel.parse( + path / get_cell_segmentation_shapes_file(x), transformations={"global": scaled[x]}, - ) - - for shape_filename in shapes_filenames: - # TODO: check that indices of newly parsed shapes match the indices of the existing shapes - shapes[f"{os.path.splitext(shape_filename)[0]}"] = ShapesModel.parse( - path / shape_filename, index=adata.obs[SK.INSTANCE_KEY_TABLE].copy(), - transformations={"global": Identity()}, ) - pass sdata = SpatialData(images=images, labels=labels, points=points, tables=tables, shapes=shapes) @@ -287,36 +249,3 @@ def _get_scale_factors(DAPI_path: Path, scalefactor_x_key: str, scalefactor_y_ke scalefactor_x = element.attrib[scalefactor_x_key] scalefactor_y = element.attrib[scalefactor_y_key] return [float(scalefactor_x), float(scalefactor_y)] - - -if __name__ == "__main__": - path_read = "/Users/macbook/ssd/biodata/seqfish/instrument 2 official" - sdata = seqfish( - path=path_read, - # load_images=True, - # load_labels=True, - # load_points=True, - # rois=[1], - ) - # sdata.pl.render_labels(color='Arg1').pl.show() - import matplotlib.pyplot as plt - - # plt.show() - # sdata["table_Roi1"].obs["region"] = "Roi1_CellCoordinates" - # sdata.set_table_annotates_spatialelement( - # table_name="table_Roi1", region="Roi1_CellCoordinates", region_key="region", instance_key="instance_id" - # ) - import spatialdata_plot - - gene_name = "Arg1" - ( - sdata.pl.render_images("Roi1_DAPI", cmap="gray") - .pl.render_labels("Roi1_Segmentation", color=gene_name) - .pl.render_points("Roi1_TranscriptList", color="name", groups=gene_name, palette="orange") - .pl.show(title=f"{gene_name} expression over DAPI image", coordinate_systems="global", figsize=(10, 5)) - ) - plt.show() - - from napari_spatialdata import Interactive - - Interactive(sdata) From 28b0eb3b0d42bc014e75f53b61c316504d84c375 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Thu, 12 Dec 2024 16:30:48 +0100 Subject: [PATCH 10/11] tests for seqfish --- tests/test_seqfish.py | 27 +++++++++++++++++++++++++++ tests/test_xenium.py | 8 ++------ 2 files changed, 29 insertions(+), 6 deletions(-) create mode 100644 tests/test_seqfish.py diff --git a/tests/test_seqfish.py b/tests/test_seqfish.py new file mode 100644 index 00000000..e0d66655 --- /dev/null +++ b/tests/test_seqfish.py @@ -0,0 +1,27 @@ +import math +from pathlib import Path + +import pytest + +from spatialdata_io.readers.seqfish import seqfish +from tests._utils import skip_if_below_python_version + + +# See https://github.com/scverse/spatialdata-io/blob/main/.github/workflows/prepare_test_data.yaml for instructions on +# how to download and place the data on disk +@skip_if_below_python_version() +@pytest.mark.parametrize("dataset,expected", [("instrument 2 official", "{'y': (0, 108), 'x': (0, 108)}")]) +@pytest.mark.parametrize("rois", [[1], None]) +@pytest.mark.parametrize("cells_as_circles", [False, True]) +def test_example_data(dataset: str, expected: str, rois: list[int] | None, cells_as_circles: bool) -> None: + f = Path("./data") / dataset + assert f.is_dir() + sdata = seqfish(f, cells_as_circles=cells_as_circles, rois=rois) + from spatialdata import get_extent + + extent = get_extent(sdata, exact=False) + extent = {ax: (math.floor(extent[ax][0]), math.ceil(extent[ax][1])) for ax in extent} + if cells_as_circles: + # manual correction required to take into account for the circle radii + expected = "{'y': (-2, 109), 'x': (-2, 109)}" + assert str(extent) == expected diff --git a/tests/test_xenium.py b/tests/test_xenium.py index 2fb35ce1..32408323 100644 --- a/tests/test_xenium.py +++ b/tests/test_xenium.py @@ -40,12 +40,8 @@ def test_roundtrip_with_data_limits() -> None: assert np.array_equal(cell_id_str, f0(*f1(cell_id_str))) -# The datasets should be downloaded from -# https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/resources/xenium-example-data#test-data -# and placed in the "data" directory; if you run the tests locally you may need to create a symlink in "tests/data" -# pointing to "data". -# The GitHub workflow "prepare_test_data.yaml" takes care of downloading the datasets and uploading an artifact for the -# tests to use +# See https://github.com/scverse/spatialdata-io/blob/main/.github/workflows/prepare_test_data.yaml for instructions on +# how to download and place the data on disk @skip_if_below_python_version() @pytest.mark.parametrize( "dataset,expected", From 91efd4519072aab66b405cd69ec40c8a59b229a0 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Thu, 12 Dec 2024 17:01:04 +0100 Subject: [PATCH 11/11] fix seqfish test path --- tests/test_seqfish.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_seqfish.py b/tests/test_seqfish.py index e0d66655..bdbf2a60 100644 --- a/tests/test_seqfish.py +++ b/tests/test_seqfish.py @@ -10,7 +10,9 @@ # See https://github.com/scverse/spatialdata-io/blob/main/.github/workflows/prepare_test_data.yaml for instructions on # how to download and place the data on disk @skip_if_below_python_version() -@pytest.mark.parametrize("dataset,expected", [("instrument 2 official", "{'y': (0, 108), 'x': (0, 108)}")]) +@pytest.mark.parametrize( + "dataset,expected", [("seqfish-2-test-dataset/instrument 2 official", "{'y': (0, 108), 'x': (0, 108)}")] +) @pytest.mark.parametrize("rois", [[1], None]) @pytest.mark.parametrize("cells_as_circles", [False, True]) def test_example_data(dataset: str, expected: str, rois: list[int] | None, cells_as_circles: bool) -> None: