From 83b751df14347843def6a133c0f2588446a5c8fd Mon Sep 17 00:00:00 2001 From: Nicolas Kaenzig Date: Mon, 7 Oct 2024 09:12:40 +0000 Subject: [PATCH 01/11] updated stratified split logic to accepts ratios that don't sum up to 1 --- src/eva/core/data/splitting/stratified.py | 18 ++++++---- .../core/data/splitting/test_stratified.py | 35 +++++++++++++++---- 2 files changed, 40 insertions(+), 13 deletions(-) diff --git a/src/eva/core/data/splitting/stratified.py b/src/eva/core/data/splitting/stratified.py index ad9377a7..8537e7a3 100644 --- a/src/eva/core/data/splitting/stratified.py +++ b/src/eva/core/data/splitting/stratified.py @@ -28,10 +28,11 @@ def stratified_split( """ if len(samples) != len(targets): raise ValueError("The number of samples and targets must be equal.") - if train_ratio + val_ratio + (test_ratio or 0) != 1: - raise ValueError("The sum of the ratios must be equal to 1.") + if train_ratio + val_ratio + (test_ratio or 0) > 1.0: + raise ValueError("The sum of the ratios must be lower or equal to 1.") - np.random.seed(seed) + use_all_samples = train_ratio + val_ratio + test_ratio == 1 + random_generator = np.random.default_rng(seed) unique_classes, y_indices = np.unique(targets, return_inverse=True) n_classes = unique_classes.shape[0] @@ -39,18 +40,23 @@ def stratified_split( for c in range(n_classes): class_indices = np.where(y_indices == c)[0] - np.random.shuffle(class_indices) + random_generator.shuffle(class_indices) n_train = int(np.floor(train_ratio * len(class_indices))) or 1 n_val = ( len(class_indices) - n_train - if test_ratio == 0.0 + if test_ratio == 0.0 and use_all_samples else int(np.floor(val_ratio * len(class_indices))) or 1 ) train_indices.extend(class_indices[:n_train]) val_indices.extend(class_indices[n_train : n_train + n_val]) if test_ratio > 0.0: - test_indices.extend(class_indices[n_train + n_val :]) + n_test = ( + len(class_indices) - n_train - n_val + if use_all_samples + else int(np.floor(test_ratio * len(class_indices))) or 1 + ) + test_indices.extend(class_indices[n_train + n_val : n_train + n_val + n_test]) return train_indices, val_indices, test_indices or None diff --git a/tests/eva/core/data/splitting/test_stratified.py b/tests/eva/core/data/splitting/test_stratified.py index 2b65ccd8..ef3857fb 100644 --- a/tests/eva/core/data/splitting/test_stratified.py +++ b/tests/eva/core/data/splitting/test_stratified.py @@ -1,5 +1,7 @@ """Tests for the stratified split function.""" +from typing import List + import pytest from eva.core.data import splitting @@ -12,10 +14,11 @@ ([0] * 50 + [1] * 50, 0.7, 0.15, 0.15), ([0] * 30 + [1] * 70, 0.8, 0.2, 0.0), ([0] * 30 + [1] * 70, 0.7, 0.15, 0.15), + ([0] * 30 + [1] * 70, 0.2, 0.1, 0.15), ], ) def test_stratification( - targets: list[int], train_ratio: float, val_ratio: float, test_ratio: float + targets: List[int], train_ratio: float, val_ratio: float, test_ratio: float ): """Tests if the stratified split maintains the class proportions.""" samples = list(range(len(targets))) @@ -31,21 +34,22 @@ def test_stratification( assert train_classes.count(c) == pytest.approx(expected_train_proportion, abs=1) assert val_classes.count(c) == pytest.approx(expected_val_proportion, abs=1) - assert len(train_indices) + len(val_indices) + len(test_indices or []) == len(samples) + if train_ratio + val_ratio + test_ratio == 1: + assert len(train_indices) + len(val_indices) + len(test_indices or []) == len(samples) -@pytest.mark.parametrize("train_ratio, val_ratio, test_ratio", [(0.6, 0.3, 0.0), (0.6, 0.4, 0.3)]) 
+@pytest.mark.parametrize("train_ratio, val_ratio, test_ratio", [(0.6, 0.5, 0.0), (0.6, 0.0, 0.7)]) def test_invalid_ratio_sums(train_ratio: float, val_ratio: float, test_ratio: float): """Tests if the function raises an error when the ratios do not sum to 1.""" samples = list(range(100)) targets = [0] * 50 + [1] * 50 - expected_error = "The sum of the ratios must be equal to 1." + expected_error = "The sum of the ratios must be lower or equal to 1" with pytest.raises(ValueError, match=expected_error): splitting.stratified_split(samples, targets, train_ratio, val_ratio, test_ratio) @pytest.mark.parametrize("seed1, seed2", [(42, 43), (123, 124), (999, 1000)]) -def test_different_seeds_produce_different_outputs(seed1, seed2): +def test_different_seeds_produce_different_outputs(seed1: int, seed2: int): """Tests if different seeds produce different train, validation, and test indices.""" samples = list(range(100)) targets = [0] * 50 + [1] * 50 @@ -57,8 +61,20 @@ def test_different_seeds_produce_different_outputs(seed1, seed2): assert test1 != test2, "Different seeds should produce different test indices" -@pytest.mark.parametrize("seed", [42, 123, 999]) -def test_same_seed_produces_same_outputs(seed): +@pytest.mark.parametrize( + "seed, train_expected_indices, val_expected_indices, test_expected_indices", + [ + (42, [5, 25, 20, 49], [3, 44, 30, 10], [0, 12, 14, 48]), + (123, [15, 38, 41, 7], [29, 44, 46, 37], [16, 9, 12, 45]), + (999, [49, 10, 1, 25], [24, 39, 3, 37], [0, 28, 13, 16]), + ], +) +def test_same_seed_produces_same_outputs( + seed: int, + train_expected_indices: List[int], + val_expected_indices: List[int], + test_expected_indices: List[int], +): """Tests if the same seed produces the same train, validation, and test indices.""" samples = list(range(100)) targets = [0] * 50 + [1] * 50 @@ -68,3 +84,8 @@ def test_same_seed_produces_same_outputs(seed): assert train1 == train2, "Same seed should produce the same train indices" assert val1 == val2, "Same seed should produce the same validation indices" assert test1 == test2, "Same seed should produce the same test indices" + assert isinstance(test1, list) + + assert train1[: len(train_expected_indices)] == train_expected_indices, "Unexpected indices" + assert val1[: len(val_expected_indices)] == val_expected_indices, "Unexpected indices" + assert test1[: len(test_expected_indices)] == test_expected_indices, "Unexpected indices" From 7704c3571a1dca7211c8933756e3d18954dc7b61 Mon Sep 17 00:00:00 2001 From: Nicolas Kaenzig Date: Mon, 7 Oct 2024 09:13:18 +0000 Subject: [PATCH 02/11] added PANDATiny dataset class --- .../offline/classification/panda_tiny.yaml | 133 ++++++++++++++++++ src/eva/vision/data/datasets/__init__.py | 2 + .../data/datasets/classification/__init__.py | 3 +- .../data/datasets/classification/panda.py | 13 ++ 4 files changed, 150 insertions(+), 1 deletion(-) create mode 100644 configs/vision/pathology/offline/classification/panda_tiny.yaml diff --git a/configs/vision/pathology/offline/classification/panda_tiny.yaml b/configs/vision/pathology/offline/classification/panda_tiny.yaml new file mode 100644 index 00000000..94db80d1 --- /dev/null +++ b/configs/vision/pathology/offline/classification/panda_tiny.yaml @@ -0,0 +1,133 @@ +--- +trainer: + class_path: eva.Trainer + init_args: + n_runs: &N_RUNS ${oc.env:N_RUNS, 5} + default_root_dir: &OUTPUT_ROOT ${oc.env:OUTPUT_ROOT, logs/${oc.env:MODEL_NAME, dino_vits16}/offline/panda} + max_epochs: &MAX_EPOCHS ${oc.env:MAX_EPOCHS, 49} + callbacks: + - class_path: 
eva.callbacks.ConfigurationLogger + - class_path: lightning.pytorch.callbacks.TQDMProgressBar + init_args: + refresh_rate: ${oc.env:TQDM_REFRESH_RATE, 1} + - class_path: lightning.pytorch.callbacks.LearningRateMonitor + init_args: + logging_interval: epoch + - class_path: lightning.pytorch.callbacks.ModelCheckpoint + init_args: + filename: best + save_last: true + save_top_k: 1 + monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/MulticlassAccuracy} + mode: &MONITOR_METRIC_MODE ${oc.env:MONITOR_METRIC_MODE, max} + - class_path: lightning.pytorch.callbacks.EarlyStopping + init_args: + min_delta: 0 + patience: ${oc.env:PATIENCE, 8} + monitor: *MONITOR_METRIC + mode: *MONITOR_METRIC_MODE + - class_path: eva.callbacks.ClassificationEmbeddingsWriter + init_args: + output_dir: &DATASET_EMBEDDINGS_ROOT ${oc.env:EMBEDDINGS_ROOT, ./data/embeddings/${oc.env:MODEL_NAME, dino_vits16}/panda} + dataloader_idx_map: + 0: train + 1: val + 2: test + metadata_keys: ["wsi_id"] + backbone: + class_path: eva.vision.models.ModelFromRegistry + init_args: + model_name: ${oc.env:MODEL_NAME, universal/vit_small_patch16_224_dino} + model_extra_kwargs: ${oc.env:MODEL_EXTRA_KWARGS, null} + overwrite: false + logger: + - class_path: lightning.pytorch.loggers.TensorBoardLogger + init_args: + save_dir: *OUTPUT_ROOT + name: "" +model: + class_path: eva.HeadModule + init_args: + head: + class_path: eva.vision.models.networks.ABMIL + init_args: + input_size: ${oc.env:IN_FEATURES, 384} + output_size: &NUM_CLASSES 6 + projected_input_size: 128 + criterion: torch.nn.CrossEntropyLoss + optimizer: + class_path: torch.optim.AdamW + init_args: + lr: ${oc.env:LR_VALUE, 0.001} + betas: [0.9, 0.999] + lr_scheduler: + class_path: torch.optim.lr_scheduler.CosineAnnealingLR + init_args: + T_max: *MAX_EPOCHS + eta_min: 0.0 + metrics: + common: + - class_path: eva.metrics.AverageLoss + - class_path: eva.metrics.MulticlassClassificationMetrics + init_args: + num_classes: *NUM_CLASSES +data: + class_path: eva.DataModule + init_args: + datasets: + train: + class_path: eva.datasets.MultiEmbeddingsClassificationDataset + init_args: &DATASET_ARGS + root: *DATASET_EMBEDDINGS_ROOT + manifest_file: manifest.csv + split: train + embeddings_transforms: + class_path: eva.core.data.transforms.Pad2DTensor + init_args: + pad_size: &N_PATCHES 200 + val: + class_path: eva.datasets.MultiEmbeddingsClassificationDataset + init_args: + <<: *DATASET_ARGS + split: val + test: + class_path: eva.datasets.MultiEmbeddingsClassificationDataset + init_args: + <<: *DATASET_ARGS + split: test + predict: + - class_path: eva.vision.datasets.PANDATiny + init_args: &PREDICT_DATASET_ARGS + root: ${oc.env:DATA_ROOT, ./data/panda/prostate-cancer-grade-assessment} + sampler: + class_path: eva.vision.data.wsi.patching.samplers.ForegroundGridSampler + init_args: + max_samples: *N_PATCHES + width: 224 + height: 224 + target_mpp: 0.5 + split: train + image_transforms: + class_path: eva.vision.data.transforms.common.ResizeAndCrop + init_args: + size: ${oc.env:RESIZE_DIM, 224} + mean: ${oc.env:NORMALIZE_MEAN, [0.485, 0.456, 0.406]} + std: ${oc.env:NORMALIZE_STD, [0.229, 0.224, 0.225]} + - class_path: eva.vision.datasets.PANDATiny + init_args: + <<: *PREDICT_DATASET_ARGS + split: val + - class_path: eva.vision.datasets.PANDATiny + init_args: + <<: *PREDICT_DATASET_ARGS + split: test + dataloaders: + train: + batch_size: &BATCH_SIZE ${oc.env:BATCH_SIZE, 32} + shuffle: true + val: + batch_size: *BATCH_SIZE + test: + batch_size: *BATCH_SIZE + predict: + batch_size: &PREDICT_BATCH_SIZE 
${oc.env:PREDICT_BATCH_SIZE, 64} diff --git a/src/eva/vision/data/datasets/__init__.py b/src/eva/vision/data/datasets/__init__.py index bec918af..a70a5477 100644 --- a/src/eva/vision/data/datasets/__init__.py +++ b/src/eva/vision/data/datasets/__init__.py @@ -6,6 +6,7 @@ MHIST, PANDA, Camelyon16, + PANDATiny, PatchCamelyon, WsiClassificationDataset, ) @@ -28,6 +29,7 @@ "CRC", "MHIST", "PANDA", + "PANDATiny", "Camelyon16", "PatchCamelyon", "WsiClassificationDataset", diff --git a/src/eva/vision/data/datasets/classification/__init__.py b/src/eva/vision/data/datasets/classification/__init__.py index c9daabbe..76b9241b 100644 --- a/src/eva/vision/data/datasets/classification/__init__.py +++ b/src/eva/vision/data/datasets/classification/__init__.py @@ -4,7 +4,7 @@ from eva.vision.data.datasets.classification.camelyon16 import Camelyon16 from eva.vision.data.datasets.classification.crc import CRC from eva.vision.data.datasets.classification.mhist import MHIST -from eva.vision.data.datasets.classification.panda import PANDA +from eva.vision.data.datasets.classification.panda import PANDA, PANDATiny from eva.vision.data.datasets.classification.patch_camelyon import PatchCamelyon from eva.vision.data.datasets.classification.wsi import WsiClassificationDataset @@ -15,5 +15,6 @@ "PatchCamelyon", "WsiClassificationDataset", "PANDA", + "PANDATiny", "Camelyon16", ] diff --git a/src/eva/vision/data/datasets/classification/panda.py b/src/eva/vision/data/datasets/classification/panda.py index a7e180f6..fa5ddc74 100644 --- a/src/eva/vision/data/datasets/classification/panda.py +++ b/src/eva/vision/data/datasets/classification/panda.py @@ -182,3 +182,16 @@ def _get_target_from_path(self, file_path: str) -> int: def _get_id_from_path(self, file_path: str) -> str: return os.path.basename(file_path).replace(".tiff", "") + + +class PANDATiny(PANDA): + """Tiny version of the PANDA dataset for quicker benchmarking.""" + + _train_split_ratio: float = 0.1 + """Train split ratio.""" + + _val_split_ratio: float = 0.05 + """Validation split ratio.""" + + _test_split_ratio: float = 0.05 + """Test split ratio.""" From 8edee37ecabb810f1d5569f6e34804e40a12d7e1 Mon Sep 17 00:00:00 2001 From: Nicolas Kaenzig Date: Mon, 7 Oct 2024 11:26:07 +0000 Subject: [PATCH 03/11] updated random split logic & test --- .../advanced/replicate_evaluations.md | 6 ++--- src/eva/core/data/splitting/random.py | 11 +++++--- tests/eva/core/data/splitting/test_random.py | 26 ++++++++++++++++--- 3 files changed, 32 insertions(+), 11 deletions(-) diff --git a/docs/user-guide/advanced/replicate_evaluations.md b/docs/user-guide/advanced/replicate_evaluations.md index eb3bcb58..6a47135a 100644 --- a/docs/user-guide/advanced/replicate_evaluations.md +++ b/docs/user-guide/advanced/replicate_evaluations.md @@ -144,10 +144,10 @@ were released on [HuggingFace](https://huggingface.co/bioptimus/H-optimus-0). 
``` MODEL_NAME=pathology/bioptimus_h_optimus_0 \ -NORMALIZE_MEAN=[0.707223, 0.578729, 0.703617] \ -NORMALIZE_STD=[0.211883, 0.230117, 0.177517] \ +NORMALIZE_MEAN=[0.707223,0.578729,0.703617] \ +NORMALIZE_STD=[0.211883,0.230117,0.177517] \ IN_FEATURES=1024 \ -eva predict_fit --config configs/vision/pathology/offline/.yaml +eva predict_fit --config configs/vision/pathology/offline/panda_tiny.yaml ``` diff --git a/src/eva/core/data/splitting/random.py b/src/eva/core/data/splitting/random.py index 274a1412..922716bb 100644 --- a/src/eva/core/data/splitting/random.py +++ b/src/eva/core/data/splitting/random.py @@ -24,12 +24,15 @@ def random_split( Returns: The indices of the train, validation, and test sets as lists. """ - if train_ratio + val_ratio + (test_ratio or 0) != 1: - raise ValueError("The sum of the ratios must be equal to 1.") + total_ratio = train_ratio + val_ratio + test_ratio + if total_ratio > 1.0: + raise ValueError("The sum of the ratios must be lower or equal to 1.") random_generator = np.random.default_rng(seed) - n_samples = len(samples) - indices = random_generator.permutation(n_samples) + n_samples = int(total_ratio*len(samples)) + indices = random_generator.permutation(len(samples))[:n_samples] + + n_samples = int(total_ratio*len(samples)) n_train = int(np.floor(train_ratio * n_samples)) n_val = n_samples - n_train if test_ratio == 0.0 else int(np.floor(val_ratio * n_samples)) or 1 diff --git a/tests/eva/core/data/splitting/test_random.py b/tests/eva/core/data/splitting/test_random.py index e31396f3..21e57e9b 100644 --- a/tests/eva/core/data/splitting/test_random.py +++ b/tests/eva/core/data/splitting/test_random.py @@ -1,6 +1,7 @@ """Tests for the random split function.""" import pytest +from typing import List from eva.core.data import splitting @@ -32,11 +33,11 @@ def test_split_ratios(n_samples: int, train_ratio: float, val_ratio: float, test assert len(train_indices) + len(val_indices) + len(test_indices or []) == n_samples -@pytest.mark.parametrize("train_ratio, val_ratio, test_ratio", [(0.6, 0.3, 0.0), (0.6, 0.4, 0.3)]) +@pytest.mark.parametrize("train_ratio, val_ratio, test_ratio", [(0.6, 0.7, 0.0), (0.6, 0.4, 0.3)]) def test_invalid_ratio_sums(train_ratio: float, val_ratio: float, test_ratio: float): """Tests if the function raises an error when the ratios do not sum to 1.""" samples = list(range(100)) - expected_error = "The sum of the ratios must be equal to 1." 
+ expected_error = "The sum of the ratios must be lower or equal to 1" with pytest.raises(ValueError, match=expected_error): splitting.random_split(samples, train_ratio, val_ratio, test_ratio) @@ -53,8 +54,20 @@ def test_different_seeds_produce_different_outputs(seed1, seed2): assert test1 != test2, "Different seeds should produce different test indices" -@pytest.mark.parametrize("seed", [42, 123, 999]) -def test_same_seed_produces_same_outputs(seed): +@pytest.mark.parametrize( + "seed, train_expected_indices, val_expected_indices, test_expected_indices", + [ + (42, [59, 21, 56, 18], [69, 15, 48, 55], [49, 6, 90, 11]), + (123, [21, 71, 92, 23], [89, 14, 64, 4], [45, 75, 62, 6]), + (999, [47, 42, 57, 50], [41, 3, 81, 61], [45, 6, 56, 67]), + ], +) +def test_same_seed_produces_same_outputs( + seed: int, + train_expected_indices: List[int], + val_expected_indices: List[int], + test_expected_indices: List[int], +): """Tests if the same seed produces the same train, validation, and test indices.""" samples = list(range(100)) train1, val1, test1 = splitting.random_split(samples, 0.6, 0.2, 0.2, seed=seed) @@ -63,6 +76,11 @@ def test_same_seed_produces_same_outputs(seed): assert train1 == train2, "Same seed should produce the same train indices" assert val1 == val2, "Same seed should produce the same validation indices" assert test1 == test2, "Same seed should produce the same test indices" + assert isinstance(test1, list) + + assert train1[: len(train_expected_indices)] == train_expected_indices, "Unexpected indices" + assert val1[: len(val_expected_indices)] == val_expected_indices, "Unexpected indices" + assert test1[: len(test_expected_indices)] == test_expected_indices, "Unexpected indices" def test_no_test_set(): From ad4b53987f6aa2496d13442aca2efd1d82ba6f4b Mon Sep 17 00:00:00 2001 From: Nicolas Kaenzig Date: Mon, 7 Oct 2024 12:15:06 +0000 Subject: [PATCH 04/11] fix linting --- src/eva/core/data/splitting/random.py | 4 +--- tests/eva/core/data/splitting/test_random.py | 3 ++- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/eva/core/data/splitting/random.py b/src/eva/core/data/splitting/random.py index 922716bb..101b9315 100644 --- a/src/eva/core/data/splitting/random.py +++ b/src/eva/core/data/splitting/random.py @@ -29,11 +29,9 @@ def random_split( raise ValueError("The sum of the ratios must be lower or equal to 1.") random_generator = np.random.default_rng(seed) - n_samples = int(total_ratio*len(samples)) + n_samples = int(total_ratio * len(samples)) indices = random_generator.permutation(len(samples))[:n_samples] - n_samples = int(total_ratio*len(samples)) - n_train = int(np.floor(train_ratio * n_samples)) n_val = n_samples - n_train if test_ratio == 0.0 else int(np.floor(val_ratio * n_samples)) or 1 diff --git a/tests/eva/core/data/splitting/test_random.py b/tests/eva/core/data/splitting/test_random.py index 21e57e9b..21a376c7 100644 --- a/tests/eva/core/data/splitting/test_random.py +++ b/tests/eva/core/data/splitting/test_random.py @@ -1,8 +1,9 @@ """Tests for the random split function.""" -import pytest from typing import List +import pytest + from eva.core.data import splitting From 2403486fb7070664542c0528643a24794db826e0 Mon Sep 17 00:00:00 2001 From: Nicolas Kaenzig Date: Mon, 7 Oct 2024 13:19:40 +0000 Subject: [PATCH 05/11] fixed docs --- docs/user-guide/advanced/replicate_evaluations.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-guide/advanced/replicate_evaluations.md b/docs/user-guide/advanced/replicate_evaluations.md index 
6a47135a..b080b15e 100644 --- a/docs/user-guide/advanced/replicate_evaluations.md +++ b/docs/user-guide/advanced/replicate_evaluations.md @@ -147,7 +147,7 @@ MODEL_NAME=pathology/bioptimus_h_optimus_0 \ NORMALIZE_MEAN=[0.707223,0.578729,0.703617] \ NORMALIZE_STD=[0.211883,0.230117,0.177517] \ IN_FEATURES=1024 \ -eva predict_fit --config configs/vision/pathology/offline/panda_tiny.yaml +eva predict_fit --config configs/vision/pathology/offline/.yaml ``` From 09d4c08de220ac4e81972ae7e424180af85fa954 Mon Sep 17 00:00:00 2001 From: Nicolas Kaenzig Date: Mon, 7 Oct 2024 13:30:21 +0000 Subject: [PATCH 06/11] renamed tiny to small --- .../classification/{panda_tiny.yaml => panda_small.yaml} | 6 +++--- src/eva/vision/data/datasets/__init__.py | 4 ++-- src/eva/vision/data/datasets/classification/__init__.py | 4 ++-- src/eva/vision/data/datasets/classification/panda.py | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) rename configs/vision/pathology/offline/classification/{panda_tiny.yaml => panda_small.yaml} (96%) diff --git a/configs/vision/pathology/offline/classification/panda_tiny.yaml b/configs/vision/pathology/offline/classification/panda_small.yaml similarity index 96% rename from configs/vision/pathology/offline/classification/panda_tiny.yaml rename to configs/vision/pathology/offline/classification/panda_small.yaml index 94db80d1..74a0e438 100644 --- a/configs/vision/pathology/offline/classification/panda_tiny.yaml +++ b/configs/vision/pathology/offline/classification/panda_small.yaml @@ -96,7 +96,7 @@ data: <<: *DATASET_ARGS split: test predict: - - class_path: eva.vision.datasets.PANDATiny + - class_path: eva.vision.datasets.PANDASmall init_args: &PREDICT_DATASET_ARGS root: ${oc.env:DATA_ROOT, ./data/panda/prostate-cancer-grade-assessment} sampler: @@ -113,11 +113,11 @@ data: size: ${oc.env:RESIZE_DIM, 224} mean: ${oc.env:NORMALIZE_MEAN, [0.485, 0.456, 0.406]} std: ${oc.env:NORMALIZE_STD, [0.229, 0.224, 0.225]} - - class_path: eva.vision.datasets.PANDATiny + - class_path: eva.vision.datasets.PANDASmall init_args: <<: *PREDICT_DATASET_ARGS split: val - - class_path: eva.vision.datasets.PANDATiny + - class_path: eva.vision.datasets.PANDASmall init_args: <<: *PREDICT_DATASET_ARGS split: test diff --git a/src/eva/vision/data/datasets/__init__.py b/src/eva/vision/data/datasets/__init__.py index a70a5477..5c31edc8 100644 --- a/src/eva/vision/data/datasets/__init__.py +++ b/src/eva/vision/data/datasets/__init__.py @@ -6,7 +6,7 @@ MHIST, PANDA, Camelyon16, - PANDATiny, + PANDASmall, PatchCamelyon, WsiClassificationDataset, ) @@ -29,7 +29,7 @@ "CRC", "MHIST", "PANDA", - "PANDATiny", + "PANDASmall", "Camelyon16", "PatchCamelyon", "WsiClassificationDataset", diff --git a/src/eva/vision/data/datasets/classification/__init__.py b/src/eva/vision/data/datasets/classification/__init__.py index 76b9241b..33b4c775 100644 --- a/src/eva/vision/data/datasets/classification/__init__.py +++ b/src/eva/vision/data/datasets/classification/__init__.py @@ -4,7 +4,7 @@ from eva.vision.data.datasets.classification.camelyon16 import Camelyon16 from eva.vision.data.datasets.classification.crc import CRC from eva.vision.data.datasets.classification.mhist import MHIST -from eva.vision.data.datasets.classification.panda import PANDA, PANDATiny +from eva.vision.data.datasets.classification.panda import PANDA, PANDASmall from eva.vision.data.datasets.classification.patch_camelyon import PatchCamelyon from eva.vision.data.datasets.classification.wsi import WsiClassificationDataset @@ -15,6 +15,6 @@ "PatchCamelyon", 
"WsiClassificationDataset", "PANDA", - "PANDATiny", + "PANDASmall", "Camelyon16", ] diff --git a/src/eva/vision/data/datasets/classification/panda.py b/src/eva/vision/data/datasets/classification/panda.py index fa5ddc74..df65aede 100644 --- a/src/eva/vision/data/datasets/classification/panda.py +++ b/src/eva/vision/data/datasets/classification/panda.py @@ -184,7 +184,7 @@ def _get_id_from_path(self, file_path: str) -> str: return os.path.basename(file_path).replace(".tiff", "") -class PANDATiny(PANDA): +class PANDASmall(PANDA): """Tiny version of the PANDA dataset for quicker benchmarking.""" _train_split_ratio: float = 0.1 From e36f0053f2c200a524071eff18deee3c9b3d04b0 Mon Sep 17 00:00:00 2001 From: Nicolas Kaenzig Date: Mon, 7 Oct 2024 13:30:56 +0000 Subject: [PATCH 07/11] tiny -> small --- src/eva/vision/data/datasets/classification/panda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/eva/vision/data/datasets/classification/panda.py b/src/eva/vision/data/datasets/classification/panda.py index df65aede..ffa00ab3 100644 --- a/src/eva/vision/data/datasets/classification/panda.py +++ b/src/eva/vision/data/datasets/classification/panda.py @@ -185,7 +185,7 @@ def _get_id_from_path(self, file_path: str) -> str: class PANDASmall(PANDA): - """Tiny version of the PANDA dataset for quicker benchmarking.""" + """Small version of the PANDA dataset for quicker benchmarking.""" _train_split_ratio: float = 0.1 """Train split ratio.""" From f4c6510d8eb6afd8ed11c365b7f0ddb381611997 Mon Sep 17 00:00:00 2001 From: Nicolas Kaenzig Date: Mon, 7 Oct 2024 13:54:32 +0000 Subject: [PATCH 08/11] use local random generator in samplers --- src/eva/vision/data/wsi/patching/samplers/_utils.py | 10 ++-------- src/eva/vision/data/wsi/patching/samplers/random.py | 6 ++++-- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/eva/vision/data/wsi/patching/samplers/_utils.py b/src/eva/vision/data/wsi/patching/samplers/_utils.py index af8418df..f1fa3b7e 100644 --- a/src/eva/vision/data/wsi/patching/samplers/_utils.py +++ b/src/eva/vision/data/wsi/patching/samplers/_utils.py @@ -1,14 +1,8 @@ -import random from typing import Tuple import numpy as np -def set_seed(seed: int) -> None: - random.seed(seed) - np.random.seed(seed) - - def get_grid_coords_and_indices( layer_shape: Tuple[int, int], width: int, @@ -33,8 +27,8 @@ def get_grid_coords_and_indices( indices = list(range(len(x_y))) if shuffle: - set_seed(seed) - np.random.shuffle(indices) + random_generator = np.random.default_rng(seed) + random_generator.shuffle(indices) return x_y, indices diff --git a/src/eva/vision/data/wsi/patching/samplers/random.py b/src/eva/vision/data/wsi/patching/samplers/random.py index 09ae5729..b37a3a3e 100644 --- a/src/eva/vision/data/wsi/patching/samplers/random.py +++ b/src/eva/vision/data/wsi/patching/samplers/random.py @@ -18,6 +18,7 @@ def __init__(self, n_samples: int = 1, seed: int = 42): """Initializes the sampler.""" self.seed = seed self.n_samples = n_samples + self.random_generator = random.Random(seed) # nosec def sample( self, @@ -33,9 +34,10 @@ def sample( layer_shape: The shape of the layer. 
""" _utils.validate_dimensions(width, height, layer_shape) - _utils.set_seed(self.seed) x_max, y_max = layer_shape[0], layer_shape[1] for _ in range(self.n_samples): - x, y = random.randint(0, x_max - width), random.randint(0, y_max - height) # nosec + x, y = self.random_generator.randint(0, x_max - width), self.random_generator.randint( + 0, y_max - height + ) yield x, y From 0532e3a33a76137ad6e2776ded85178e6978880a Mon Sep 17 00:00:00 2001 From: Nicolas Kaenzig Date: Mon, 7 Oct 2024 14:10:27 +0000 Subject: [PATCH 09/11] updated grid sampler unit tests --- .../patching/samplers/test_foreground_grid.py | 20 +++++++++----- .../data/wsi/patching/samplers/test_grid.py | 26 +++++++++++++++++-- 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/tests/eva/vision/data/wsi/patching/samplers/test_foreground_grid.py b/tests/eva/vision/data/wsi/patching/samplers/test_foreground_grid.py index 9a5510ac..c87ee8f1 100644 --- a/tests/eva/vision/data/wsi/patching/samplers/test_foreground_grid.py +++ b/tests/eva/vision/data/wsi/patching/samplers/test_foreground_grid.py @@ -38,11 +38,17 @@ def test_length(min_foreground_ratio: float, max_samples: int, expected_n_sample assert len(x_y) == expected_n_samples -@pytest.mark.parametrize("n_samples, seed", [(10, 8), (22, 42)]) -def test_same_seed(n_samples: int, seed: int) -> None: +@pytest.mark.parametrize( + "max_samples, seed, x_y_expected", + [ + (7, 42, [(12, 0), (24, 12), (12, 12), (0, 12), (12, 24)]), + (10, 8, [(12, 0), (12, 24), (24, 12), (0, 12), (12, 12)]), + ], +) +def test_same_seed(max_samples: int, seed: int, x_y_expected: list) -> None: """Tests if the sampler returns the same samples for the same seed.""" sampler = samplers.ForegroundGridSampler( - max_samples=n_samples, seed=seed, min_foreground_ratio=0.5 + max_samples=max_samples, seed=seed, min_foreground_ratio=0.5 ) x_y_1 = list(sampler.sample(**TEST_ARGS)) @@ -51,11 +57,11 @@ def test_same_seed(n_samples: int, seed: int) -> None: assert x_y_1 == x_y_2 -@pytest.mark.parametrize("n_samples, seed_1, seed_2", [(3, 1, 2), (5, 3, 4)]) -def test_different_seed(n_samples: int, seed_1: int, seed_2: int) -> None: +@pytest.mark.parametrize("max_samples, seed_1, seed_2", [(3, 1, 2), (5, 3, 4)]) +def test_different_seed(max_samples: int, seed_1: int, seed_2: int) -> None: """Tests if the sampler returns different samples for different seeds.""" - sampler_1 = samplers.ForegroundGridSampler(max_samples=n_samples, seed=seed_1) - sampler_2 = samplers.ForegroundGridSampler(max_samples=n_samples, seed=seed_2) + sampler_1 = samplers.ForegroundGridSampler(max_samples=max_samples, seed=seed_1) + sampler_2 = samplers.ForegroundGridSampler(max_samples=max_samples, seed=seed_2) x_y_1 = list(sampler_1.sample(**TEST_ARGS)) x_y_2 = list(sampler_2.sample(**TEST_ARGS)) diff --git a/tests/eva/vision/data/wsi/patching/samplers/test_grid.py b/tests/eva/vision/data/wsi/patching/samplers/test_grid.py index efeecf54..d88ea0e3 100644 --- a/tests/eva/vision/data/wsi/patching/samplers/test_grid.py +++ b/tests/eva/vision/data/wsi/patching/samplers/test_grid.py @@ -19,8 +19,29 @@ def test_length(max_samples: int, expected_n_samples: int) -> None: assert len(x_y) == expected_n_samples -@pytest.mark.parametrize("max_samples, seed", [(10, 8), (22, 42)]) -def test_same_seed(max_samples: int, seed: int) -> None: +@pytest.mark.parametrize( + "max_samples, seed, x_y_expected", + [ + (7, 42, [(50, 90), (20, 10), (50, 60), (10, 80), (30, 30), (40, 20), (50, 0)]), + ( + 10, + 8, + [ + (10, 50), + (20, 60), + (40, 20), + (90, 30), 
+ (10, 60), + (0, 40), + (90, 40), + (70, 20), + (80, 0), + (60, 30), + ], + ), + ], +) +def test_same_seed(max_samples: int, seed: int, x_y_expected: list) -> None: """Tests if the sampler returns the same samples for the same seed.""" sampler = samplers.GridSampler(max_samples=max_samples, seed=seed) @@ -28,6 +49,7 @@ def test_same_seed(max_samples: int, seed: int) -> None: x_y_2 = list(sampler.sample(**TEST_ARGS)) assert x_y_1 == x_y_2 + assert x_y_1 == x_y_expected @pytest.mark.parametrize("max_samples, seed_1, seed_2", [(3, 1, 2), (5, 3, 4)]) From adbd2498815be7bd11448df5523469c41ce824a4 Mon Sep 17 00:00:00 2001 From: Nicolas Kaenzig Date: Mon, 7 Oct 2024 14:13:49 +0000 Subject: [PATCH 10/11] fixed sampler unit tests --- .../data/wsi/patching/samplers/test_grid.py | 7 ++-- .../data/wsi/patching/samplers/test_random.py | 32 ++++++++++++++++--- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/tests/eva/vision/data/wsi/patching/samplers/test_grid.py b/tests/eva/vision/data/wsi/patching/samplers/test_grid.py index d88ea0e3..42cbdbf8 100644 --- a/tests/eva/vision/data/wsi/patching/samplers/test_grid.py +++ b/tests/eva/vision/data/wsi/patching/samplers/test_grid.py @@ -43,10 +43,11 @@ def test_length(max_samples: int, expected_n_samples: int) -> None: ) def test_same_seed(max_samples: int, seed: int, x_y_expected: list) -> None: """Tests if the sampler returns the same samples for the same seed.""" - sampler = samplers.GridSampler(max_samples=max_samples, seed=seed) + sampler_1 = samplers.GridSampler(max_samples=max_samples, seed=seed) + sampler_2 = samplers.GridSampler(max_samples=max_samples, seed=seed) - x_y_1 = list(sampler.sample(**TEST_ARGS)) - x_y_2 = list(sampler.sample(**TEST_ARGS)) + x_y_1 = list(sampler_1.sample(**TEST_ARGS)) + x_y_2 = list(sampler_2.sample(**TEST_ARGS)) assert x_y_1 == x_y_2 assert x_y_1 == x_y_expected diff --git a/tests/eva/vision/data/wsi/patching/samplers/test_random.py b/tests/eva/vision/data/wsi/patching/samplers/test_random.py index 85110a6c..09c1d279 100644 --- a/tests/eva/vision/data/wsi/patching/samplers/test_random.py +++ b/tests/eva/vision/data/wsi/patching/samplers/test_random.py @@ -17,13 +17,35 @@ def test_length(n_samples: int) -> None: assert len(x_y) == n_samples -@pytest.mark.parametrize("n_samples, seed", [(10, 8), (22, 42)]) -def test_same_seed(n_samples: int, seed: int) -> None: +@pytest.mark.parametrize( + "n_samples, seed, x_y_expected", + [ + (7, 42, [(81, 14), (3, 35), (31, 28), (17, 13), (86, 69), (11, 75), (54, 4)]), + ( + 10, + 8, + [ + (29, 47), + (48, 16), + (24, 90), + (5, 10), + (17, 31), + (64, 26), + (51, 82), + (3, 58), + (62, 58), + (49, 63), + ], + ), + ], +) +def test_same_seed(n_samples: int, seed: int, x_y_expected: int) -> None: """Tests if the sampler returns the same samples for the same seed.""" - sampler = samplers.RandomSampler(n_samples=n_samples, seed=seed) + sampler_1 = samplers.RandomSampler(n_samples=n_samples, seed=seed) + sampler_2 = samplers.RandomSampler(n_samples=n_samples, seed=seed) - x_y_1 = list(sampler.sample(**TEST_ARGS)) - x_y_2 = list(sampler.sample(**TEST_ARGS)) + x_y_1 = list(sampler_1.sample(**TEST_ARGS)) + x_y_2 = list(sampler_2.sample(**TEST_ARGS)) assert x_y_1 == x_y_2 From e2762e0c5671a8eaaf4d51bdf00c84376387b66f Mon Sep 17 00:00:00 2001 From: Nicolas Kaenzig Date: Mon, 7 Oct 2024 14:30:11 +0000 Subject: [PATCH 11/11] updated panda unittest --- .../data/datasets/classification/test_panda.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git 
a/tests/eva/vision/data/datasets/classification/test_panda.py b/tests/eva/vision/data/datasets/classification/test_panda.py index 6b901344..783cc341 100644 --- a/tests/eva/vision/data/datasets/classification/test_panda.py +++ b/tests/eva/vision/data/datasets/classification/test_panda.py @@ -58,10 +58,16 @@ def test_filenames(root: str, split: Literal["train", "val", "test"]): assert len(filenames) == len(dataset.datasets) -def test_same_split_same_seed(root: str): +def test_same_split_same_seed(root: str, seed: int = 42): """Test that the generated split is deterministic when using the same seed.""" - dataset1 = datasets.PANDA(root=root, split="train", seed=42, **DEFAULT_ARGS) - dataset2 = datasets.PANDA(root=root, split="train", seed=42, **DEFAULT_ARGS) + sampler1 = samplers.GridSampler(seed=seed) + sampler2 = samplers.GridSampler(seed=seed) + dataset1 = datasets.PANDA( + root=root, split="train", seed=seed, **(DEFAULT_ARGS | {"sampler": sampler1}) + ) + dataset2 = datasets.PANDA( + root=root, split="train", seed=seed, **(DEFAULT_ARGS | {"sampler": sampler2}) + ) _setup_datasets(dataset1, dataset2) assert len(dataset1) == len(dataset2) @@ -70,6 +76,10 @@ def test_same_split_same_seed(root: str): for i in range(len(dataset1)): assert np.allclose(dataset1[i][1], dataset2[i][1]) + expected_coords = [[(96, 160), (160, 64), (64, 64), (96, 0), (0, 224)]] * len(dataset1.datasets) + for i in range(len(dataset1.datasets)): + assert dataset1.datasets[i]._coords.x_y[: len(expected_coords[i])] == expected_coords[i] + def test_different_seed_different_split(root: str): """Test that the generated split is different when using a different seed."""
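For reference, the idea shared by the `random_split` and `stratified_split` changes in this series is: validate that the ratios sum to at most 1, draw a seeded permutation from a local `np.random.default_rng` generator, and keep only the first `total_ratio * n` indices, so ratios below 1 subsample the dataset (this is what the `PANDASmall` variant relies on, with the stratified version applying the same logic per class). The sketch below is a minimal, self-contained illustration of that behaviour using plain NumPy; `subsample_split` and its exact handling of the test remainder are illustrative only and not part of eva's API.

```python
# Standalone sketch of the seeded, ratio-based subsampling split introduced above.
# Function name and the handling of the test remainder are illustrative.
from typing import List, Optional, Sequence, Tuple

import numpy as np


def subsample_split(
    samples: Sequence,
    train_ratio: float,
    val_ratio: float,
    test_ratio: float = 0.0,
    seed: int = 42,
) -> Tuple[List[int], List[int], Optional[List[int]]]:
    """Splits `samples` into train/val/test index lists.

    The ratios may sum to less than 1, in which case only the corresponding
    fraction of the data is used at all.
    """
    total_ratio = train_ratio + val_ratio + test_ratio
    if total_ratio > 1.0:
        raise ValueError("The sum of the ratios must be lower or equal to 1.")

    # A local Generator keeps the split reproducible without touching global state.
    generator = np.random.default_rng(seed)
    n_used = int(total_ratio * len(samples))
    indices = generator.permutation(len(samples))[:n_used]

    n_train = int(np.floor(train_ratio * n_used))
    n_val = n_used - n_train if test_ratio == 0.0 else int(np.floor(val_ratio * n_used)) or 1

    train = indices[:n_train].tolist()
    val = indices[n_train : n_train + n_val].tolist()
    test = indices[n_train + n_val :].tolist() if test_ratio > 0.0 else None
    return train, val, test


# Same seed -> same indices; with ratios 0.1/0.05/0.05 only 20 of 100 samples are used.
train_a, val_a, test_a = subsample_split(list(range(100)), 0.1, 0.05, 0.05, seed=42)
train_b, val_b, test_b = subsample_split(list(range(100)), 0.1, 0.05, 0.05, seed=42)
assert (train_a, val_a, test_a) == (train_b, val_b, test_b)
assert len(train_a) + len(val_a) + len(test_a) == 20
```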