From b0a0b359bf08ca1db835de77b7a1c1a8bcfeb41e Mon Sep 17 00:00:00 2001
From: Nicolas Kaenzig
Date: Wed, 9 Oct 2024 09:30:39 +0000
Subject: [PATCH 1/6] add patch coordinates to dataset metadata & manifest

---
 .../offline/classification/camelyon16.yaml       |  2 +-
 .../classification/camelyon16_small.yaml         |  2 +-
 .../pathology/offline/classification/panda.yaml  |  2 +-
 .../offline/classification/panda_small.yaml      |  2 +-
 .../data/datasets/classification/camelyon16.py   |  4 +++-
 .../data/datasets/classification/panda.py        |  4 +++-
 src/eva/vision/data/datasets/wsi.py              | 17 ++++++++++++++++-
 7 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/configs/vision/pathology/offline/classification/camelyon16.yaml b/configs/vision/pathology/offline/classification/camelyon16.yaml
index e39a86cd..6ea5ed59 100644
--- a/configs/vision/pathology/offline/classification/camelyon16.yaml
+++ b/configs/vision/pathology/offline/classification/camelyon16.yaml
@@ -34,7 +34,7 @@ trainer:
             0: train
             1: val
             2: test
-          metadata_keys: ["wsi_id"]
+          metadata_keys: ["wsi_id", "x", "y", "width", "height", "level_idx"]
           backbone:
             class_path: eva.vision.models.ModelFromRegistry
             init_args:
diff --git a/configs/vision/pathology/offline/classification/camelyon16_small.yaml b/configs/vision/pathology/offline/classification/camelyon16_small.yaml
index 490a5038..ca18e84e 100644
--- a/configs/vision/pathology/offline/classification/camelyon16_small.yaml
+++ b/configs/vision/pathology/offline/classification/camelyon16_small.yaml
@@ -34,7 +34,7 @@ trainer:
             0: train
             1: val
             2: test
-          metadata_keys: ["wsi_id"]
+          metadata_keys: ["wsi_id", "x", "y", "width", "height", "level_idx"]
           backbone:
             class_path: eva.vision.models.ModelFromRegistry
             init_args:
diff --git a/configs/vision/pathology/offline/classification/panda.yaml b/configs/vision/pathology/offline/classification/panda.yaml
index dab70371..404de521 100644
--- a/configs/vision/pathology/offline/classification/panda.yaml
+++ b/configs/vision/pathology/offline/classification/panda.yaml
@@ -33,7 +33,7 @@ trainer:
             0: train
             1: val
             2: test
-          metadata_keys: ["wsi_id"]
+          metadata_keys: ["wsi_id", "x", "y", "width", "height", "level_idx"]
           backbone:
             class_path: eva.vision.models.ModelFromRegistry
             init_args:
diff --git a/configs/vision/pathology/offline/classification/panda_small.yaml b/configs/vision/pathology/offline/classification/panda_small.yaml
index a846a45c..9230036a 100644
--- a/configs/vision/pathology/offline/classification/panda_small.yaml
+++ b/configs/vision/pathology/offline/classification/panda_small.yaml
@@ -33,7 +33,7 @@ trainer:
             0: train
             1: val
             2: test
-          metadata_keys: ["wsi_id"]
+          metadata_keys: ["wsi_id", "x", "y", "width", "height", "level_idx"]
           backbone:
             class_path: eva.vision.models.ModelFromRegistry
             init_args:
diff --git a/src/eva/vision/data/datasets/classification/camelyon16.py b/src/eva/vision/data/datasets/classification/camelyon16.py
index a7ace11c..09685ff8 100644
--- a/src/eva/vision/data/datasets/classification/camelyon16.py
+++ b/src/eva/vision/data/datasets/classification/camelyon16.py
@@ -207,7 +207,9 @@ def load_target(self, index: int) -> torch.Tensor:
 
     @override
     def load_metadata(self, index: int) -> Dict[str, Any]:
-        return {"wsi_id": self.filename(index).split(".")[0]}
+        dataset_index, sample_index = self._get_dataset_idx(index), self._get_sample_idx(index)
+        patch_metadata = self.datasets[dataset_index].load_metadata(sample_index)
+        return {"wsi_id": self.filename(index).split(".")[0]} | patch_metadata
 
     def _load_file_paths(self, split: Literal["train", "val", "test"] | None = None) -> List[str]:
         """Loads the file paths of the corresponding dataset split."""
diff --git a/src/eva/vision/data/datasets/classification/panda.py b/src/eva/vision/data/datasets/classification/panda.py
index ffa00ab3..a68c13f9 100644
--- a/src/eva/vision/data/datasets/classification/panda.py
+++ b/src/eva/vision/data/datasets/classification/panda.py
@@ -132,7 +132,9 @@ def load_target(self, index: int) -> torch.Tensor:
 
     @override
     def load_metadata(self, index: int) -> Dict[str, Any]:
-        return {"wsi_id": self.filename(index).split(".")[0]}
+        dataset_index, sample_index = self._get_dataset_idx(index), self._get_sample_idx(index)
+        patch_metadata = self.datasets[dataset_index].load_metadata(sample_index)
+        return {"wsi_id": self.filename(index).split(".")[0]} | patch_metadata
 
     def _load_file_paths(self, split: Literal["train", "val", "test"] | None = None) -> List[str]:
         """Loads the file paths of the corresponding dataset split."""
diff --git a/src/eva/vision/data/datasets/wsi.py b/src/eva/vision/data/datasets/wsi.py
index fe83ca63..92177568 100644
--- a/src/eva/vision/data/datasets/wsi.py
+++ b/src/eva/vision/data/datasets/wsi.py
@@ -2,7 +2,7 @@
 
 import bisect
 import os
-from typing import Callable, List
+from typing import Any, Callable, Dict, List
 
 from loguru import logger
 from torch.utils.data import dataset as torch_datasets
@@ -85,6 +85,17 @@ def __getitem__(self, index: int) -> tv_tensors.Image:
         patch = self._apply_transforms(patch)
         return patch
 
+    def load_metadata(self, index: int) -> Dict[str, Any]:
+        """Loads the metadata for the patch at the specified index."""
+        x, y = self._coords.x_y[index]
+        return {
+            "x": x,
+            "y": y,
+            "width": self._coords.width,
+            "height": self._coords.height,
+            "level_idx": self._coords.level_idx,
+        }
+
     def _apply_transforms(self, image: tv_tensors.Image) -> tv_tensors.Image:
         if self._image_transforms is not None:
             image = self._image_transforms(image)
@@ -185,3 +196,7 @@ def _load_datasets(self) -> list[WsiDataset]:
 
     def _get_dataset_idx(self, index: int) -> int:
         return bisect.bisect_right(self.cumulative_sizes, index)
+
+    def _get_sample_idx(self, index: int) -> int:
+        dataset_idx = self._get_dataset_idx(index)
+        return index if dataset_idx == 0 else index - self.cumulative_sizes[dataset_idx - 1]

From 201e0031140ff89f9df0293d00c6aff426999ea2 Mon Sep 17 00:00:00 2001
From: Nicolas Kaenzig
Date: Wed, 9 Oct 2024 09:37:05 +0000
Subject: [PATCH 2/6] updated unit tests

---
 src/eva/vision/data/datasets/classification/wsi.py          | 4 +++-
 .../vision/data/datasets/classification/test_camelyon16.py  | 5 +++++
 tests/eva/vision/data/datasets/classification/test_panda.py | 5 +++++
 tests/eva/vision/data/datasets/classification/test_wsi.py   | 5 +++++
 4 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/src/eva/vision/data/datasets/classification/wsi.py b/src/eva/vision/data/datasets/classification/wsi.py
index 9e1cae52..23192ac8 100644
--- a/src/eva/vision/data/datasets/classification/wsi.py
+++ b/src/eva/vision/data/datasets/classification/wsi.py
@@ -88,7 +88,9 @@ def load_target(self, index: int) -> np.ndarray:
 
     @override
     def load_metadata(self, index: int) -> Dict[str, Any]:
-        return {"wsi_id": self.filename(index).split(".")[0]}
+        dataset_index, sample_index = self._get_dataset_idx(index), self._get_sample_idx(index)
+        patch_metadata = self.datasets[dataset_index].load_metadata(sample_index)
+        return {"wsi_id": self.filename(index).split(".")[0]} | patch_metadata
 
     def _load_manifest(self, manifest_path: str) -> pd.DataFrame:
         df = pd.read_csv(manifest_path)
diff --git a/tests/eva/vision/data/datasets/classification/test_camelyon16.py b/tests/eva/vision/data/datasets/classification/test_camelyon16.py
index c7c7277c..58594fb3 100644
--- a/tests/eva/vision/data/datasets/classification/test_camelyon16.py
+++ b/tests/eva/vision/data/datasets/classification/test_camelyon16.py
@@ -69,6 +69,11 @@ def _check_batch_shape(batch: Any):
     assert isinstance(target, torch.Tensor)
     assert isinstance(metadata, dict)
     assert "wsi_id" in metadata
+    assert "x" in metadata
+    assert "y" in metadata
+    assert "width" in metadata
+    assert "height" in metadata
+    assert "level_idx" in metadata
 
 
 @pytest.fixture
diff --git a/tests/eva/vision/data/datasets/classification/test_panda.py b/tests/eva/vision/data/datasets/classification/test_panda.py
index 783cc341..ce993a88 100644
--- a/tests/eva/vision/data/datasets/classification/test_panda.py
+++ b/tests/eva/vision/data/datasets/classification/test_panda.py
@@ -102,6 +102,11 @@ def _check_batch_shape(batch: Any):
     assert isinstance(target, torch.Tensor)
     assert isinstance(metadata, dict)
     assert "wsi_id" in metadata
+    assert "x" in metadata
+    assert "y" in metadata
+    assert "width" in metadata
+    assert "height" in metadata
+    assert "level_idx" in metadata
 
 
 @pytest.fixture
diff --git a/tests/eva/vision/data/datasets/classification/test_wsi.py b/tests/eva/vision/data/datasets/classification/test_wsi.py
index d14573d8..c2dc4bcc 100644
--- a/tests/eva/vision/data/datasets/classification/test_wsi.py
+++ b/tests/eva/vision/data/datasets/classification/test_wsi.py
@@ -79,6 +79,11 @@ def _check_batch_shape(batch: Any):
     assert isinstance(metadata, dict)
     assert "wsi_id" in metadata
+    assert "x" in metadata
+    assert "y" in metadata
+    assert "width" in metadata
+    assert "height" in metadata
+    assert "level_idx" in metadata
 
 
 @pytest.fixture

From ccf34d9d1d642f5740ee3a1ce68dd91acece52eb Mon Sep 17 00:00:00 2001
From: Nicolas Kaenzig
Date: Wed, 9 Oct 2024 09:39:36 +0000
Subject: [PATCH 3/6] update tests

---
 configs/vision/tests/offline/panda.yaml                        | 2 +-
 .../core/callbacks/writers/embeddings/test_classification.py   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/configs/vision/tests/offline/panda.yaml b/configs/vision/tests/offline/panda.yaml
index 28844dd1..e5d5ecd1 100644
--- a/configs/vision/tests/offline/panda.yaml
+++ b/configs/vision/tests/offline/panda.yaml
@@ -14,7 +14,7 @@ trainer:
             0: train
             1: val
             2: test
-          metadata_keys: ["wsi_id"]
+          metadata_keys: ["wsi_id", "x", "y", "width", "height", "level_idx"]
           backbone:
             class_path: eva.models.ModelFromFunction
             init_args:
diff --git a/tests/eva/core/callbacks/writers/embeddings/test_classification.py b/tests/eva/core/callbacks/writers/embeddings/test_classification.py
index 55488fdb..c88e0a3f 100644
--- a/tests/eva/core/callbacks/writers/embeddings/test_classification.py
+++ b/tests/eva/core/callbacks/writers/embeddings/test_classification.py
@@ -29,7 +29,7 @@
         (5, 7, None, None),
         (5, 7, ["wsi_id"], None),
         (8, 16, None, None),
-        (8, 32, ["wsi_id"], ["slide_1", "slide_2"]),
+        (8, 32, ["wsi_id", "x", "y"], ["slide_1", "slide_2"]),
     ],
 )
 def test_embeddings_writer(datamodule: datamodules.DataModule, model: modules.HeadModule) -> None:

From a5c8f18dc35471b67289e8a9bbaca55ac4a3b5d5 Mon Sep 17 00:00:00 2001
From: Nicolas Kaenzig
Date: Wed, 9 Oct 2024 11:07:15 +0000
Subject: [PATCH 4/6] remove coordinates from manifest file

---
 configs/vision/pathology/offline/classification/camelyon16.yaml | 2 +-
 .../pathology/offline/classification/camelyon16_small.yaml      | 2 +-
 configs/vision/pathology/offline/classification/panda.yaml      | 2 +-
 .../vision/pathology/offline/classification/panda_small.yaml    | 2 +-
 configs/vision/tests/offline/panda.yaml                         | 2 +-
 .../core/callbacks/writers/embeddings/test_classification.py    | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/configs/vision/pathology/offline/classification/camelyon16.yaml b/configs/vision/pathology/offline/classification/camelyon16.yaml
index 6ea5ed59..e39a86cd 100644
--- a/configs/vision/pathology/offline/classification/camelyon16.yaml
+++ b/configs/vision/pathology/offline/classification/camelyon16.yaml
@@ -34,7 +34,7 @@ trainer:
             0: train
             1: val
             2: test
-          metadata_keys: ["wsi_id", "x", "y", "width", "height", "level_idx"]
+          metadata_keys: ["wsi_id"]
           backbone:
             class_path: eva.vision.models.ModelFromRegistry
             init_args:
diff --git a/configs/vision/pathology/offline/classification/camelyon16_small.yaml b/configs/vision/pathology/offline/classification/camelyon16_small.yaml
index ca18e84e..490a5038 100644
--- a/configs/vision/pathology/offline/classification/camelyon16_small.yaml
+++ b/configs/vision/pathology/offline/classification/camelyon16_small.yaml
@@ -34,7 +34,7 @@ trainer:
             0: train
             1: val
             2: test
-          metadata_keys: ["wsi_id", "x", "y", "width", "height", "level_idx"]
+          metadata_keys: ["wsi_id"]
           backbone:
             class_path: eva.vision.models.ModelFromRegistry
             init_args:
diff --git a/configs/vision/pathology/offline/classification/panda.yaml b/configs/vision/pathology/offline/classification/panda.yaml
index 404de521..dab70371 100644
--- a/configs/vision/pathology/offline/classification/panda.yaml
+++ b/configs/vision/pathology/offline/classification/panda.yaml
@@ -33,7 +33,7 @@ trainer:
             0: train
             1: val
             2: test
-          metadata_keys: ["wsi_id", "x", "y", "width", "height", "level_idx"]
+          metadata_keys: ["wsi_id"]
           backbone:
             class_path: eva.vision.models.ModelFromRegistry
             init_args:
diff --git a/configs/vision/pathology/offline/classification/panda_small.yaml b/configs/vision/pathology/offline/classification/panda_small.yaml
index 9230036a..a846a45c 100644
--- a/configs/vision/pathology/offline/classification/panda_small.yaml
+++ b/configs/vision/pathology/offline/classification/panda_small.yaml
@@ -33,7 +33,7 @@ trainer:
             0: train
             1: val
             2: test
-          metadata_keys: ["wsi_id", "x", "y", "width", "height", "level_idx"]
+          metadata_keys: ["wsi_id"]
           backbone:
             class_path: eva.vision.models.ModelFromRegistry
             init_args:
diff --git a/configs/vision/tests/offline/panda.yaml b/configs/vision/tests/offline/panda.yaml
index e5d5ecd1..28844dd1 100644
--- a/configs/vision/tests/offline/panda.yaml
+++ b/configs/vision/tests/offline/panda.yaml
@@ -14,7 +14,7 @@ trainer:
             0: train
             1: val
             2: test
-          metadata_keys: ["wsi_id", "x", "y", "width", "height", "level_idx"]
+          metadata_keys: ["wsi_id"]
           backbone:
             class_path: eva.models.ModelFromFunction
             init_args:
diff --git a/tests/eva/core/callbacks/writers/embeddings/test_classification.py b/tests/eva/core/callbacks/writers/embeddings/test_classification.py
index c88e0a3f..55488fdb 100644
--- a/tests/eva/core/callbacks/writers/embeddings/test_classification.py
+++ b/tests/eva/core/callbacks/writers/embeddings/test_classification.py
@@ -29,7 +29,7 @@
         (5, 7, None, None),
         (5, 7, ["wsi_id"], None),
         (8, 16, None, None),
-        (8, 32, ["wsi_id", "x", "y"], ["slide_1", "slide_2"]),
+        (8, 32, ["wsi_id"], ["slide_1", "slide_2"]),
     ],
 )
 def test_embeddings_writer(datamodule: datamodules.DataModule, model: modules.HeadModule) -> None:

From c41872da4e1a963caeec211415b5f56307768f1c Mon Sep 17 00:00:00 2001
From: Nicolas Kaenzig
Date: Wed, 9 Oct 2024 13:19:10 +0000
Subject: [PATCH 5/6] add option save coords file to wsi dataset classes

---
 .../offline/classification/camelyon16.yaml       |  1 +
 .../classification/camelyon16_small.yaml         |  1 +
 .../offline/classification/panda.yaml            |  1 +
 .../offline/classification/panda_small.yaml      |  1 +
 .../core/callbacks/writers/embeddings/base.py    |  7 +++----
 .../datasets/classification/camelyon16.py        |  7 ++++---
 .../data/datasets/classification/panda.py        |  7 ++++---
 .../data/datasets/classification/wsi.py          |  7 ++++---
 src/eva/vision/data/datasets/wsi.py              | 21 +++++++++++++++++++
 .../vision/data/wsi/patching/coordinates.py      | 10 ++++++++-
 tests/eva/vision/data/datasets/test_wsi.py       | 12 ++++++++++-
 11 files changed, 60 insertions(+), 15 deletions(-)

diff --git a/configs/vision/pathology/offline/classification/camelyon16.yaml b/configs/vision/pathology/offline/classification/camelyon16.yaml
index e39a86cd..620bc993 100644
--- a/configs/vision/pathology/offline/classification/camelyon16.yaml
+++ b/configs/vision/pathology/offline/classification/camelyon16.yaml
@@ -108,6 +108,7 @@ data:
           height: 224
           target_mpp: 0.25
           split: train
+          coords_path: ${data.init_args.datasets.train.init_args.root}/coords_${.split}.csv
          image_transforms:
            class_path: eva.vision.data.transforms.common.ResizeAndCrop
            init_args:
diff --git a/configs/vision/pathology/offline/classification/camelyon16_small.yaml b/configs/vision/pathology/offline/classification/camelyon16_small.yaml
index 490a5038..e4c5abda 100644
--- a/configs/vision/pathology/offline/classification/camelyon16_small.yaml
+++ b/configs/vision/pathology/offline/classification/camelyon16_small.yaml
@@ -108,6 +108,7 @@ data:
           height: 224
           target_mpp: 0.25
           split: train
+          coords_path: ${data.init_args.datasets.train.init_args.root}/coords_${.split}.csv
          image_transforms:
            class_path: eva.vision.data.transforms.common.ResizeAndCrop
            init_args:
diff --git a/configs/vision/pathology/offline/classification/panda.yaml b/configs/vision/pathology/offline/classification/panda.yaml
index dab70371..d2b86fd5 100644
--- a/configs/vision/pathology/offline/classification/panda.yaml
+++ b/configs/vision/pathology/offline/classification/panda.yaml
@@ -107,6 +107,7 @@ data:
           height: 224
           target_mpp: 0.5
           split: train
+          coords_path: ${data.init_args.datasets.train.init_args.root}/coords_${.split}.csv
          image_transforms:
            class_path: eva.vision.data.transforms.common.ResizeAndCrop
            init_args:
diff --git a/configs/vision/pathology/offline/classification/panda_small.yaml b/configs/vision/pathology/offline/classification/panda_small.yaml
index a846a45c..f067110c 100644
--- a/configs/vision/pathology/offline/classification/panda_small.yaml
+++ b/configs/vision/pathology/offline/classification/panda_small.yaml
@@ -107,6 +107,7 @@ data:
           height: 224
           target_mpp: 0.5
           split: train
+          coords_path: ${data.init_args.datasets.train.init_args.root}/coords_${.split}.csv
          image_transforms:
            class_path: eva.vision.data.transforms.common.ResizeAndCrop
            init_args:
diff --git a/src/eva/core/callbacks/writers/embeddings/base.py b/src/eva/core/callbacks/writers/embeddings/base.py
index f4930cd7..6cdde5ab 100644
--- a/src/eva/core/callbacks/writers/embeddings/base.py
+++ b/src/eva/core/callbacks/writers/embeddings/base.py
@@ -172,15 +172,14 @@ def _get_item_metadata(
 
     def _check_if_exists(self) -> None:
         """Checks if the output directory already exists and if it should be overwritten."""
-        try:
-            os.makedirs(self._output_dir, exist_ok=self._overwrite)
-        except FileExistsError as e:
+        os.makedirs(self._output_dir, exist_ok=True)
+        if os.path.exists(os.path.join(self._output_dir, "manifest.csv")) and not self._overwrite:
            raise FileExistsError(
                f"The embeddings output directory already exists: {self._output_dir}. This "
                "either means that they have been computed before or that a wrong output "
                "directory is being used. Consider using `eva fit` instead, selecting a "
                "different output directory or setting overwrite=True."
-            ) from e
+            )
 
         os.makedirs(self._output_dir, exist_ok=True)
diff --git a/src/eva/vision/data/datasets/classification/camelyon16.py b/src/eva/vision/data/datasets/classification/camelyon16.py
index 09685ff8..e8abb527 100644
--- a/src/eva/vision/data/datasets/classification/camelyon16.py
+++ b/src/eva/vision/data/datasets/classification/camelyon16.py
@@ -87,6 +87,7 @@ def __init__(
         target_mpp: float = 0.5,
         backend: str = "openslide",
         image_transforms: Callable | None = None,
+        coords_path: str | None = None,
         seed: int = 42,
     ) -> None:
         """Initializes the dataset.
@@ -100,6 +101,7 @@ def __init__(
             target_mpp: Target microns per pixel (mpp) for the patches.
             backend: The backend to use for reading the whole-slide images.
             image_transforms: Transforms to apply to the extracted image patches.
+            coords_path: File path to save the patch coordinates as .csv.
             seed: Random seed for reproducibility.
         """
         self._split = split
@@ -119,6 +121,7 @@ def __init__(
             target_mpp=target_mpp,
             backend=backend,
             image_transforms=image_transforms,
+            coords_path=coords_path,
         )
 
     @property
@@ -207,9 +210,7 @@ def load_target(self, index: int) -> torch.Tensor:
 
     @override
     def load_metadata(self, index: int) -> Dict[str, Any]:
-        dataset_index, sample_index = self._get_dataset_idx(index), self._get_sample_idx(index)
-        patch_metadata = self.datasets[dataset_index].load_metadata(sample_index)
-        return {"wsi_id": self.filename(index).split(".")[0]} | patch_metadata
+        return wsi.MultiWsiDataset.load_metadata(self, index)
 
     def _load_file_paths(self, split: Literal["train", "val", "test"] | None = None) -> List[str]:
         """Loads the file paths of the corresponding dataset split."""
diff --git a/src/eva/vision/data/datasets/classification/panda.py b/src/eva/vision/data/datasets/classification/panda.py
index a68c13f9..fb089c47 100644
--- a/src/eva/vision/data/datasets/classification/panda.py
+++ b/src/eva/vision/data/datasets/classification/panda.py
@@ -49,6 +49,7 @@ def __init__(
         target_mpp: float = 0.5,
         backend: str = "openslide",
         image_transforms: Callable | None = None,
+        coords_path: str | None = None,
         seed: int = 42,
     ) -> None:
         """Initializes the dataset.
@@ -62,6 +63,7 @@ def __init__(
             target_mpp: Target microns per pixel (mpp) for the patches.
             backend: The backend to use for reading the whole-slide images.
             image_transforms: Transforms to apply to the extracted image patches.
+            coords_path: File path to save the patch coordinates as .csv.
             seed: Random seed for reproducibility.
""" self._split = split @@ -80,6 +82,7 @@ def __init__( target_mpp=target_mpp, backend=backend, image_transforms=image_transforms, + coords_path=coords_path, ) @property @@ -132,9 +135,7 @@ def load_target(self, index: int) -> torch.Tensor: @override def load_metadata(self, index: int) -> Dict[str, Any]: - dataset_index, sample_index = self._get_dataset_idx(index), self._get_sample_idx(index) - patch_metadata = self.datasets[dataset_index].load_metadata(sample_index) - return {"wsi_id": self.filename(index).split(".")[0]} | patch_metadata + return wsi.MultiWsiDataset.load_metadata(self, index) def _load_file_paths(self, split: Literal["train", "val", "test"] | None = None) -> List[str]: """Loads the file paths of the corresponding dataset split.""" diff --git a/src/eva/vision/data/datasets/classification/wsi.py b/src/eva/vision/data/datasets/classification/wsi.py index 23192ac8..e0b4f83a 100644 --- a/src/eva/vision/data/datasets/classification/wsi.py +++ b/src/eva/vision/data/datasets/classification/wsi.py @@ -35,6 +35,7 @@ def __init__( split: Literal["train", "val", "test"] | None = None, image_transforms: Callable | None = None, column_mapping: Dict[str, str] = default_column_mapping, + coords_path: str | None = None, ): """Initializes the dataset. @@ -51,6 +52,7 @@ def __init__( split: The split of the dataset to load. image_transforms: Transforms to apply to the extracted image patches. column_mapping: Mapping of the columns in the manifest file. + coords_path: File path to save the patch coordinates as .csv. """ self._split = split self._column_mapping = self.default_column_mapping | column_mapping @@ -66,6 +68,7 @@ def __init__( target_mpp=target_mpp, backend=backend, image_transforms=image_transforms, + coords_path=coords_path, ) @override @@ -88,9 +91,7 @@ def load_target(self, index: int) -> np.ndarray: @override def load_metadata(self, index: int) -> Dict[str, Any]: - dataset_index, sample_index = self._get_dataset_idx(index), self._get_sample_idx(index) - patch_metadata = self.datasets[dataset_index].load_metadata(sample_index) - return {"wsi_id": self.filename(index).split(".")[0]} | patch_metadata + return wsi.MultiWsiDataset.load_metadata(self, index) def _load_manifest(self, manifest_path: str) -> pd.DataFrame: df = pd.read_csv(manifest_path) diff --git a/src/eva/vision/data/datasets/wsi.py b/src/eva/vision/data/datasets/wsi.py index 92177568..803493ab 100644 --- a/src/eva/vision/data/datasets/wsi.py +++ b/src/eva/vision/data/datasets/wsi.py @@ -4,6 +4,7 @@ import os from typing import Any, Callable, Dict, List +import pandas as pd from loguru import logger from torch.utils.data import dataset as torch_datasets from torchvision import tv_tensors @@ -116,6 +117,7 @@ def __init__( overwrite_mpp: float | None = None, backend: str = "openslide", image_transforms: Callable | None = None, + coords_path: str | None = None, ): """Initializes a new dataset instance. @@ -129,6 +131,7 @@ def __init__( sampler: The sampler to use for sampling patch coordinates. backend: The backend to use for reading the whole-slide images. image_transforms: Transforms to apply to the extracted image patches. + coords_path: File path to save the patch coordinates as .csv. 
""" super().__init__() @@ -141,6 +144,7 @@ def __init__( self._sampler = sampler self._backend = backend self._image_transforms = image_transforms + self._coords_path = coords_path self._concat_dataset: torch_datasets.ConcatDataset @@ -157,6 +161,7 @@ def cumulative_sizes(self) -> List[int]: @override def configure(self) -> None: self._concat_dataset = torch_datasets.ConcatDataset(datasets=self._load_datasets()) + self._save_coords_to_file() @override def __len__(self) -> int: @@ -170,6 +175,12 @@ def __getitem__(self, index: int) -> tv_tensors.Image: def filename(self, index: int) -> str: return os.path.basename(self._file_paths[self._get_dataset_idx(index)]) + def load_metadata(self, index: int) -> Dict[str, Any]: + """Loads the metadata for the patch at the specified index.""" + dataset_index, sample_index = self._get_dataset_idx(index), self._get_sample_idx(index) + patch_metadata = self.datasets[dataset_index].load_metadata(sample_index) + return {"wsi_id": self.filename(index).split(".")[0]} | patch_metadata + def _load_datasets(self) -> list[WsiDataset]: logger.info(f"Initializing dataset with {len(self._file_paths)} WSIs ...") wsi_datasets = [] @@ -200,3 +211,13 @@ def _get_dataset_idx(self, index: int) -> int: def _get_sample_idx(self, index: int) -> int: dataset_idx = self._get_dataset_idx(index) return index if dataset_idx == 0 else index - self.cumulative_sizes[dataset_idx - 1] + + def _save_coords_to_file(self): + if self._coords_path is not None: + coords = [ + {"file": self._file_paths[i]} | dataset._coords.to_dict() + for i, dataset in enumerate(self.datasets) + ] + os.makedirs(os.path.abspath(os.path.join(self._coords_path, os.pardir)), exist_ok=True) + pd.DataFrame(coords).to_csv(self._coords_path, index=False) + logger.info(f"Saved patch coordinates to: {self._coords_path}") diff --git a/src/eva/vision/data/wsi/patching/coordinates.py b/src/eva/vision/data/wsi/patching/coordinates.py index bab7e0be..0152115f 100644 --- a/src/eva/vision/data/wsi/patching/coordinates.py +++ b/src/eva/vision/data/wsi/patching/coordinates.py @@ -2,7 +2,7 @@ import dataclasses import functools -from typing import List, Tuple +from typing import Any, Dict, List, Tuple from eva.vision.data.wsi import backends from eva.vision.data.wsi.patching import samplers @@ -75,6 +75,14 @@ def from_file( return cls(x_y, scaled_width, scaled_height, level_idx, sample_args.get("mask")) + def to_dict(self, include_keys: List[str] | None = None) -> Dict[str, Any]: + """Convert the coordinates to a dictionary.""" + include_keys = include_keys or ["x_y", "width", "height", "level_idx"] + coord_dict = dataclasses.asdict(self) + if include_keys: + coord_dict = {key: coord_dict[key] for key in include_keys} + return coord_dict + @functools.lru_cache(LRU_CACHE_SIZE) def get_cached_coords( diff --git a/tests/eva/vision/data/datasets/test_wsi.py b/tests/eva/vision/data/datasets/test_wsi.py index 87959a60..6548d710 100644 --- a/tests/eva/vision/data/datasets/test_wsi.py +++ b/tests/eva/vision/data/datasets/test_wsi.py @@ -1,8 +1,10 @@ """WsiDataset & MultiWsiDataset tests.""" import os +import pathlib from typing import Tuple +import pandas as pd import pytest from eva.vision.data import datasets @@ -69,7 +71,7 @@ def test_patch_shape(width: int, height: int, target_mpp: float, root: str, back assert dataset[0].shape == (3, scaled_width, scaled_height) -def test_multi_dataset(root: str): +def test_multi_dataset(root: str, tmp_path: pathlib.Path): """Test MultiWsiDataset with multiple whole-slide image paths.""" 
     file_paths = [
         os.path.join(root, "0/a.tiff"),
         os.path.join(root, "0/b.tiff"),
         os.path.join(root, "1/a.tiff"),
     ]
 
+    # get tmp csv file path for coords
+    coords_path = (tmp_path / "coords.csv").as_posix()
     width, height = 32, 32
     dataset = datasets.MultiWsiDataset(
         root=root,
@@ -86,6 +90,7 @@ def test_multi_dataset(root: str, tmp_path: pathlib.Path):
         target_mpp=0.25,
         sampler=samplers.GridSampler(max_samples=None),
         backend="openslide",
+        coords_path=coords_path,
     )
     dataset.setup()
 
@@ -94,6 +99,11 @@ def test_multi_dataset(root: str, tmp_path: pathlib.Path):
     assert len(dataset) == _expected_n_patches(layer_shape, width, height, (0, 0)) * len(file_paths)
     assert dataset.cumulative_sizes == [64, 128, 192]
 
+    assert os.path.exists(coords_path)
+    df_coords = pd.read_csv(coords_path)
+    assert "file" in df_coords.columns
+    assert "x_y" in df_coords.columns
+
 
 def _expected_n_patches(layer_shape, width, height, overlap):
     """Calculate the expected number of patches."""

From ad27ddaf65fb9d4107b8cadb5aa77468bbe29911 Mon Sep 17 00:00:00 2001
From: Nicolas Kaenzig
Date: Wed, 9 Oct 2024 13:25:17 +0000
Subject: [PATCH 6/6] remove comment line

---
 tests/eva/vision/data/datasets/test_wsi.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/eva/vision/data/datasets/test_wsi.py b/tests/eva/vision/data/datasets/test_wsi.py
index 6548d710..5c01a59d 100644
--- a/tests/eva/vision/data/datasets/test_wsi.py
+++ b/tests/eva/vision/data/datasets/test_wsi.py
@@ -73,14 +73,12 @@ def test_patch_shape(width: int, height: int, target_mpp: float, root: str, back
 
 def test_multi_dataset(root: str, tmp_path: pathlib.Path):
     """Test MultiWsiDataset with multiple whole-slide image paths."""
+    coords_path = (tmp_path / "coords.csv").as_posix()
     file_paths = [
         os.path.join(root, "0/a.tiff"),
         os.path.join(root, "0/b.tiff"),
         os.path.join(root, "1/a.tiff"),
     ]
-
-    # get tmp csv file path for coords
-    coords_path = (tmp_path / "coords.csv").as_posix()
     width, height = 32, 32
     dataset = datasets.MultiWsiDataset(
         root=root,
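
After the full series, the patch coordinates surface in two places: per sample through MultiWsiDataset.load_metadata, and per slide through the CSV written when coords_path is set. The snippet below is a minimal usage sketch modelled on the unit test above; the ./wsi_data directory, the slide file names and the 224-pixel patch size are illustrative assumptions, not values defined by these patches.

import os

import pandas as pd

from eva.vision.data import datasets
from eva.vision.data.wsi.patching import samplers

root = "./wsi_data"  # assumption: a local directory containing the slides listed below
file_paths = [os.path.join(root, name) for name in ("0/a.tiff", "0/b.tiff", "1/a.tiff")]
coords_path = os.path.join(root, "coords.csv")

dataset = datasets.MultiWsiDataset(
    root=root,
    file_paths=file_paths,
    width=224,
    height=224,
    target_mpp=0.25,
    sampler=samplers.GridSampler(max_samples=None),
    backend="openslide",
    coords_path=coords_path,  # the coordinates CSV is written during setup()
)
dataset.setup()

# Per-patch metadata now carries the sampling coordinates alongside the slide id,
# e.g. {"wsi_id": "a", "x": ..., "y": ..., "width": ..., "height": ..., "level_idx": ...}.
print(dataset.load_metadata(0))

# The coords file has one row per WSI with columns "file", "x_y", "width", "height", "level_idx".
df_coords = pd.read_csv(coords_path)
print(df_coords[["file", "x_y"]].head())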