Add PANDASmall dataset (#664)

kaiko-ai · Oct 8, 2024 · be6dc72 · be6dc72
1 parent 78f5a52
commit be6dc72
Show file tree

Hide file tree

Showing 15 changed files with 307 additions and 54 deletions.
diff --git a/configs/vision/pathology/offline/classification/panda_small.yaml b/configs/vision/pathology/offline/classification/panda_small.yaml
@@ -0,0 +1,133 @@
+---
+trainer:
+  class_path: eva.Trainer
+  init_args:
+    n_runs: &N_RUNS ${oc.env:N_RUNS, 5}
+    default_root_dir: &OUTPUT_ROOT ${oc.env:OUTPUT_ROOT, logs/${oc.env:MODEL_NAME, dino_vits16}/offline/panda_small}  # own log dir for the small variant; a shared ".../panda" default would mix runs (presumably used by the full PANDA config - verify)
+    max_epochs: &MAX_EPOCHS ${oc.env:MAX_EPOCHS, 49}  # also reused as T_max of the LR scheduler
+    callbacks:
+      - class_path: eva.callbacks.ConfigurationLogger
+      - class_path: lightning.pytorch.callbacks.TQDMProgressBar
+        init_args:
+          refresh_rate: ${oc.env:TQDM_REFRESH_RATE, 1}
+      - class_path: lightning.pytorch.callbacks.LearningRateMonitor
+        init_args:
+          logging_interval: epoch
+      - class_path: lightning.pytorch.callbacks.ModelCheckpoint
+        init_args:
+          filename: best
+          save_last: true
+          save_top_k: 1
+          monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/MulticlassAccuracy}
+          mode: &MONITOR_METRIC_MODE ${oc.env:MONITOR_METRIC_MODE, max}
+      - class_path: lightning.pytorch.callbacks.EarlyStopping
+        init_args:
+          min_delta: 0
+          patience: ${oc.env:PATIENCE, 8}
+          monitor: *MONITOR_METRIC  # early stopping watches the same metric/mode as checkpointing
+          mode: *MONITOR_METRIC_MODE
+      - class_path: eva.callbacks.ClassificationEmbeddingsWriter
+        init_args:
+          output_dir: &DATASET_EMBEDDINGS_ROOT ${oc.env:EMBEDDINGS_ROOT, ./data/embeddings/${oc.env:MODEL_NAME, dino_vits16}/panda_small}  # own embeddings dir: the small splits must not share a manifest with another dataset, especially with overwrite=false
+          dataloader_idx_map:  # indices follow the order of data.init_args.datasets.predict (train, val, test)
+            0: train
+            1: val
+            2: test
+          metadata_keys: ["wsi_id"]
+          backbone:
+            class_path: eva.vision.models.ModelFromRegistry
+            init_args:
+              model_name: ${oc.env:MODEL_NAME, universal/vit_small_patch16_224_dino}
+              model_extra_kwargs: ${oc.env:MODEL_EXTRA_KWARGS, null}
+          overwrite: false
+    logger:
+      - class_path: lightning.pytorch.loggers.TensorBoardLogger
+        init_args:
+          save_dir: *OUTPUT_ROOT
+          name: ""
+model:
+  class_path: eva.HeadModule
+  init_args:
+    head:
+      class_path: eva.vision.models.networks.ABMIL
+      init_args:
+        input_size: ${oc.env:IN_FEATURES, 384}  # presumably the backbone embedding size; set IN_FEATURES when changing MODEL_NAME - verify
+        output_size: &NUM_CLASSES 6  # reused below as num_classes of the metrics
+        projected_input_size: 128
+    criterion: torch.nn.CrossEntropyLoss
+    optimizer:
+      class_path: torch.optim.AdamW
+      init_args:
+        lr: ${oc.env:LR_VALUE, 0.001}
+        betas: [0.9, 0.999]
+    lr_scheduler:
+      class_path: torch.optim.lr_scheduler.CosineAnnealingLR
+      init_args:
+        T_max: *MAX_EPOCHS  # same value as trainer.init_args.max_epochs
+        eta_min: 0.0
+    metrics:
+      common:
+        - class_path: eva.metrics.AverageLoss
+        - class_path: eva.metrics.MulticlassClassificationMetrics
+          init_args:
+            num_classes: *NUM_CLASSES
+data:
+  class_path: eva.DataModule
+  init_args:
+    datasets:
+      train:
+        class_path: eva.datasets.MultiEmbeddingsClassificationDataset
+        init_args: &DATASET_ARGS
+          root: *DATASET_EMBEDDINGS_ROOT  # same directory as the ClassificationEmbeddingsWriter output_dir
+          manifest_file: manifest.csv
+          split: train
+          embeddings_transforms:
+            class_path: eva.core.data.transforms.Pad2DTensor
+            init_args:
+              pad_size: &N_PATCHES 200  # also used as max_samples of the patch sampler below
+      val:
+        class_path: eva.datasets.MultiEmbeddingsClassificationDataset
+        init_args:
+          <<: *DATASET_ARGS  # train arguments, with only the split overridden
+          split: val
+      test:
+        class_path: eva.datasets.MultiEmbeddingsClassificationDataset
+        init_args:
+          <<: *DATASET_ARGS
+          split: test
+      predict:  # order (train, val, test) matches dataloader_idx_map 0/1/2 of the embeddings writer
+        - class_path: eva.vision.datasets.PANDASmall
+          init_args: &PREDICT_DATASET_ARGS
+            root: ${oc.env:DATA_ROOT, ./data/panda/prostate-cancer-grade-assessment}
+            sampler:
+              class_path: eva.vision.data.wsi.patching.samplers.ForegroundGridSampler
+              init_args:
+                max_samples: *N_PATCHES
+            width: 224
+            height: 224
+            target_mpp: 0.5
+            split: train
+            image_transforms:
+              class_path: eva.vision.data.transforms.common.ResizeAndCrop
+              init_args:
+                size: ${oc.env:RESIZE_DIM, 224}
+                mean: ${oc.env:NORMALIZE_MEAN, [0.485, 0.456, 0.406]}  # default used when NORMALIZE_MEAN is unset
+                std: ${oc.env:NORMALIZE_STD, [0.229, 0.224, 0.225]}
+        - class_path: eva.vision.datasets.PANDASmall
+          init_args:
+            <<: *PREDICT_DATASET_ARGS  # first predict dataset's arguments, with only the split overridden
+            split: val
+        - class_path: eva.vision.datasets.PANDASmall
+          init_args:
+            <<: *PREDICT_DATASET_ARGS
+            split: test
+    dataloaders:
+      train:
+        batch_size: &BATCH_SIZE ${oc.env:BATCH_SIZE, 32}
+        shuffle: true
+      val:
+        batch_size: *BATCH_SIZE
+      test:
+        batch_size: *BATCH_SIZE
+      predict:
+        batch_size: &PREDICT_BATCH_SIZE ${oc.env:PREDICT_BATCH_SIZE, 64}
diff --git a/docs/user-guide/advanced/replicate_evaluations.md b/docs/user-guide/advanced/replicate_evaluations.md
@@ -144,8 +144,8 @@ were released on [HuggingFace](https://huggingface.co/bioptimus/H-optimus-0).
 
 ```
 MODEL_NAME=pathology/bioptimus_h_optimus_0 \
-NORMALIZE_MEAN=[0.707223, 0.578729, 0.703617] \
-NORMALIZE_STD=[0.211883, 0.230117, 0.177517] \
+NORMALIZE_MEAN=[0.707223,0.578729,0.703617] \
+NORMALIZE_STD=[0.211883,0.230117,0.177517] \
 IN_FEATURES=1024 \
 eva predict_fit --config configs/vision/pathology/offline/<task>.yaml
 ```

diff --git a/src/eva/core/data/splitting/random.py b/src/eva/core/data/splitting/random.py
@@ -24,12 +24,13 @@ def random_split(
     Returns:
         The indices of the train, validation, and test sets as lists.
     """
-    if train_ratio + val_ratio + (test_ratio or 0) != 1:
-        raise ValueError("The sum of the ratios must be equal to 1.")
+    total_ratio = train_ratio + val_ratio + test_ratio
+    if total_ratio > 1.0:
+        raise ValueError("The sum of the ratios must be lower or equal to 1.")
 
     random_generator = np.random.default_rng(seed)
-    n_samples = len(samples)
-    indices = random_generator.permutation(n_samples)
+    n_samples = int(total_ratio * len(samples))
+    indices = random_generator.permutation(len(samples))[:n_samples]
 
     n_train = int(np.floor(train_ratio * n_samples))
     n_val = n_samples - n_train if test_ratio == 0.0 else int(np.floor(val_ratio * n_samples)) or 1

diff --git a/src/eva/core/data/splitting/stratified.py b/src/eva/core/data/splitting/stratified.py
@@ -28,29 +28,35 @@ def stratified_split(
     """
     if len(samples) != len(targets):
         raise ValueError("The number of samples and targets must be equal.")
-    if train_ratio + val_ratio + (test_ratio or 0) != 1:
-        raise ValueError("The sum of the ratios must be equal to 1.")
+    if train_ratio + val_ratio + (test_ratio or 0) > 1.0:
+        raise ValueError("The sum of the ratios must be lower or equal to 1.")
 
-    np.random.seed(seed)
+    use_all_samples = train_ratio + val_ratio + test_ratio == 1
+    random_generator = np.random.default_rng(seed)
     unique_classes, y_indices = np.unique(targets, return_inverse=True)
     n_classes = unique_classes.shape[0]
 
     train_indices, val_indices, test_indices = [], [], []
 
     for c in range(n_classes):
         class_indices = np.where(y_indices == c)[0]
-        np.random.shuffle(class_indices)
+        random_generator.shuffle(class_indices)
 
         n_train = int(np.floor(train_ratio * len(class_indices))) or 1
         n_val = (
             len(class_indices) - n_train
-            if test_ratio == 0.0
+            if test_ratio == 0.0 and use_all_samples
             else int(np.floor(val_ratio * len(class_indices))) or 1
         )
 
         train_indices.extend(class_indices[:n_train])
         val_indices.extend(class_indices[n_train : n_train + n_val])
         if test_ratio > 0.0:
-            test_indices.extend(class_indices[n_train + n_val :])
+            n_test = (
+                len(class_indices) - n_train - n_val
+                if use_all_samples
+                else int(np.floor(test_ratio * len(class_indices))) or 1
+            )
+            test_indices.extend(class_indices[n_train + n_val : n_train + n_val + n_test])
 
     return train_indices, val_indices, test_indices or None
diff --git a/src/eva/vision/data/datasets/__init__.py b/src/eva/vision/data/datasets/__init__.py
@@ -6,6 +6,7 @@
     MHIST,
     PANDA,
     Camelyon16,
+    PANDASmall,
     PatchCamelyon,
     WsiClassificationDataset,
 )
@@ -28,6 +29,7 @@
     "CRC",
     "MHIST",
     "PANDA",
+    "PANDASmall",
     "Camelyon16",
     "PatchCamelyon",
     "WsiClassificationDataset",

diff --git a/src/eva/vision/data/datasets/classification/__init__.py b/src/eva/vision/data/datasets/classification/__init__.py
@@ -4,7 +4,7 @@
 from eva.vision.data.datasets.classification.camelyon16 import Camelyon16
 from eva.vision.data.datasets.classification.crc import CRC
 from eva.vision.data.datasets.classification.mhist import MHIST
-from eva.vision.data.datasets.classification.panda import PANDA
+from eva.vision.data.datasets.classification.panda import PANDA, PANDASmall
 from eva.vision.data.datasets.classification.patch_camelyon import PatchCamelyon
 from eva.vision.data.datasets.classification.wsi import WsiClassificationDataset
 
@@ -15,5 +15,6 @@
     "PatchCamelyon",
     "WsiClassificationDataset",
     "PANDA",
+    "PANDASmall",
     "Camelyon16",
 ]
diff --git a/src/eva/vision/data/datasets/classification/panda.py b/src/eva/vision/data/datasets/classification/panda.py
@@ -182,3 +182,16 @@ def _get_target_from_path(self, file_path: str) -> int:
 
     def _get_id_from_path(self, file_path: str) -> str:
         return os.path.basename(file_path).replace(".tiff", "")
+
+
+class PANDASmall(PANDA):
+    """Small version of the PANDA dataset (split ratios sum to 0.2) for quicker benchmarking."""
+
+    _train_split_ratio: float = 0.1
+    """Train split ratio."""
+
+    _val_split_ratio: float = 0.05
+    """Validation split ratio."""
+
+    _test_split_ratio: float = 0.05
+    """Test split ratio."""
diff --git a/src/eva/vision/data/wsi/patching/samplers/_utils.py b/src/eva/vision/data/wsi/patching/samplers/_utils.py
@@ -1,14 +1,8 @@
-import random
 from typing import Tuple
 
 import numpy as np
 
 
-def set_seed(seed: int) -> None:
-    random.seed(seed)
-    np.random.seed(seed)
-
-
 def get_grid_coords_and_indices(
     layer_shape: Tuple[int, int],
     width: int,
@@ -33,8 +27,8 @@ def get_grid_coords_and_indices(
 
     indices = list(range(len(x_y)))
     if shuffle:
-        set_seed(seed)
-        np.random.shuffle(indices)
+        random_generator = np.random.default_rng(seed)
+        random_generator.shuffle(indices)
     return x_y, indices
 
 

diff --git a/src/eva/vision/data/wsi/patching/samplers/random.py b/src/eva/vision/data/wsi/patching/samplers/random.py
@@ -18,6 +18,7 @@ def __init__(self, n_samples: int = 1, seed: int = 42):
         """Initializes the sampler."""
         self.seed = seed
         self.n_samples = n_samples
+        self.random_generator = random.Random(seed)  # nosec
 
     def sample(
         self,
@@ -33,9 +34,10 @@ def sample(
             layer_shape: The shape of the layer.
         """
         _utils.validate_dimensions(width, height, layer_shape)
-        _utils.set_seed(self.seed)
 
         x_max, y_max = layer_shape[0], layer_shape[1]
         for _ in range(self.n_samples):
-            x, y = random.randint(0, x_max - width), random.randint(0, y_max - height)  # nosec
+            x, y = self.random_generator.randint(0, x_max - width), self.random_generator.randint(
+                0, y_max - height
+            )
             yield x, y
diff --git a/tests/eva/core/data/splitting/test_random.py b/tests/eva/core/data/splitting/test_random.py
@@ -1,5 +1,7 @@
 """Tests for the random split function."""
 
+from typing import List
+
 import pytest
 
 from eva.core.data import splitting
@@ -32,11 +34,11 @@ def test_split_ratios(n_samples: int, train_ratio: float, val_ratio: float, test
     assert len(train_indices) + len(val_indices) + len(test_indices or []) == n_samples
 
 
-@pytest.mark.parametrize("train_ratio, val_ratio, test_ratio", [(0.6, 0.3, 0.0), (0.6, 0.4, 0.3)])
+@pytest.mark.parametrize("train_ratio, val_ratio, test_ratio", [(0.6, 0.7, 0.0), (0.6, 0.4, 0.3)])
 def test_invalid_ratio_sums(train_ratio: float, val_ratio: float, test_ratio: float):
-    """Tests if the function raises an error when the ratios do not sum to 1."""
+    """Tests if the function raises an error when the ratios sum to more than 1."""
     samples = list(range(100))
-    expected_error = "The sum of the ratios must be equal to 1."
+    expected_error = "The sum of the ratios must be lower or equal to 1"
     with pytest.raises(ValueError, match=expected_error):
         splitting.random_split(samples, train_ratio, val_ratio, test_ratio)
 
@@ -53,8 +55,20 @@ def test_different_seeds_produce_different_outputs(seed1, seed2):
     assert test1 != test2, "Different seeds should produce different test indices"
 
 
-@pytest.mark.parametrize("seed", [42, 123, 999])
-def test_same_seed_produces_same_outputs(seed):
+@pytest.mark.parametrize(
+    "seed, train_expected_indices, val_expected_indices, test_expected_indices",
+    [
+        (42, [59, 21, 56, 18], [69, 15, 48, 55], [49, 6, 90, 11]),
+        (123, [21, 71, 92, 23], [89, 14, 64, 4], [45, 75, 62, 6]),
+        (999, [47, 42, 57, 50], [41, 3, 81, 61], [45, 6, 56, 67]),
+    ],
+)
+def test_same_seed_produces_same_outputs(
+    seed: int,
+    train_expected_indices: List[int],
+    val_expected_indices: List[int],
+    test_expected_indices: List[int],
+):
     """Tests if the same seed produces the same train, validation, and test indices."""
     samples = list(range(100))
     train1, val1, test1 = splitting.random_split(samples, 0.6, 0.2, 0.2, seed=seed)
@@ -63,6 +77,11 @@ def test_same_seed_produces_same_outputs(seed):
     assert train1 == train2, "Same seed should produce the same train indices"
     assert val1 == val2, "Same seed should produce the same validation indices"
     assert test1 == test2, "Same seed should produce the same test indices"
+    assert isinstance(test1, list)
+
+    assert train1[: len(train_expected_indices)] == train_expected_indices, "Unexpected indices"
+    assert val1[: len(val_expected_indices)] == val_expected_indices, "Unexpected indices"
+    assert test1[: len(test_expected_indices)] == test_expected_indices, "Unexpected indices"
 
 
 def test_no_test_set():