Skip to content

Commit

Permalink
Add PANDASmall dataset (#664)
Browse files Browse the repository at this point in the history
  • Loading branch information
nkaenzig authored Oct 8, 2024
1 parent 78f5a52 commit be6dc72
Show file tree
Hide file tree
Showing 15 changed files with 307 additions and 54 deletions.
133 changes: 133 additions & 0 deletions configs/vision/pathology/offline/classification/panda_small.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
---
# Offline (pre-computed embeddings) classification config for PANDASmall.
#
# Values written as ${oc.env:VAR, default} are read from the environment
# variable VAR and fall back to `default` when it is unset.
#
# The `predict` datasets produce patch embeddings which are written by the
# ClassificationEmbeddingsWriter callback; the train/val/test datasets then
# read those embeddings back from the same directory (DATASET_EMBEDDINGS_ROOT).
trainer:
  class_path: eva.Trainer
  init_args:
    n_runs: &N_RUNS ${oc.env:N_RUNS, 5}
    # NOTE(review): uses a `panda_small` suffix so logs are not mixed with runs
    # of the full PANDA config (presumably `.../offline/panda` -- confirm).
    default_root_dir: &OUTPUT_ROOT ${oc.env:OUTPUT_ROOT, logs/${oc.env:MODEL_NAME, dino_vits16}/offline/panda_small}
    max_epochs: &MAX_EPOCHS ${oc.env:MAX_EPOCHS, 49}
    callbacks:
      - class_path: eva.callbacks.ConfigurationLogger
      - class_path: lightning.pytorch.callbacks.TQDMProgressBar
        init_args:
          refresh_rate: ${oc.env:TQDM_REFRESH_RATE, 1}
      - class_path: lightning.pytorch.callbacks.LearningRateMonitor
        init_args:
          logging_interval: epoch
      - class_path: lightning.pytorch.callbacks.ModelCheckpoint
        init_args:
          filename: best
          save_last: true
          save_top_k: 1
          monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/MulticlassAccuracy}
          mode: &MONITOR_METRIC_MODE ${oc.env:MONITOR_METRIC_MODE, max}
      # Early stopping tracks the same metric/mode as the checkpoint callback.
      - class_path: lightning.pytorch.callbacks.EarlyStopping
        init_args:
          min_delta: 0
          patience: ${oc.env:PATIENCE, 8}
          monitor: *MONITOR_METRIC
          mode: *MONITOR_METRIC_MODE
      - class_path: eva.callbacks.ClassificationEmbeddingsWriter
        init_args:
          # Separate cache directory for the small subset: with
          # `overwrite: false` below, sharing a directory with embeddings of
          # another dataset variant would clash with / reuse its files.
          output_dir: &DATASET_EMBEDDINGS_ROOT ${oc.env:EMBEDDINGS_ROOT, ./data/embeddings/${oc.env:MODEL_NAME, dino_vits16}/panda_small}
          # Index i refers to the i-th entry of `data.init_args.datasets.predict`.
          dataloader_idx_map:
            0: train
            1: val
            2: test
          metadata_keys: ["wsi_id"]
          backbone:
            class_path: eva.vision.models.ModelFromRegistry
            init_args:
              model_name: ${oc.env:MODEL_NAME, universal/vit_small_patch16_224_dino}
              model_extra_kwargs: ${oc.env:MODEL_EXTRA_KWARGS, null}
          overwrite: false
    logger:
      - class_path: lightning.pytorch.loggers.TensorBoardLogger
        init_args:
          save_dir: *OUTPUT_ROOT
          name: ""
model:
  class_path: eva.HeadModule
  init_args:
    head:
      class_path: eva.vision.models.networks.ABMIL
      init_args:
        # Must match the embedding dimension of the selected backbone.
        input_size: ${oc.env:IN_FEATURES, 384}
        output_size: &NUM_CLASSES 6
        projected_input_size: 128
    criterion: torch.nn.CrossEntropyLoss
    optimizer:
      class_path: torch.optim.AdamW
      init_args:
        lr: ${oc.env:LR_VALUE, 0.001}
        betas: [0.9, 0.999]
    lr_scheduler:
      class_path: torch.optim.lr_scheduler.CosineAnnealingLR
      init_args:
        T_max: *MAX_EPOCHS
        eta_min: 0.0
    metrics:
      common:
        - class_path: eva.metrics.AverageLoss
        - class_path: eva.metrics.MulticlassClassificationMetrics
          init_args:
            num_classes: *NUM_CLASSES
data:
  class_path: eva.DataModule
  init_args:
    datasets:
      # train/val/test read the embeddings written during the predict stage.
      train:
        class_path: eva.datasets.MultiEmbeddingsClassificationDataset
        init_args: &DATASET_ARGS
          root: *DATASET_EMBEDDINGS_ROOT
          manifest_file: manifest.csv
          split: train
          embeddings_transforms:
            class_path: eva.core.data.transforms.Pad2DTensor
            init_args:
              # Shared with the sampler's `max_samples` below via the anchor.
              pad_size: &N_PATCHES 200
      val:
        class_path: eva.datasets.MultiEmbeddingsClassificationDataset
        init_args:
          <<: *DATASET_ARGS
          split: val
      test:
        class_path: eva.datasets.MultiEmbeddingsClassificationDataset
        init_args:
          <<: *DATASET_ARGS
          split: test
      # Order (train, val, test) must agree with `dataloader_idx_map` above.
      predict:
        - class_path: eva.vision.datasets.PANDASmall
          init_args: &PREDICT_DATASET_ARGS
            root: ${oc.env:DATA_ROOT, ./data/panda/prostate-cancer-grade-assessment}
            sampler:
              class_path: eva.vision.data.wsi.patching.samplers.ForegroundGridSampler
              init_args:
                max_samples: *N_PATCHES
            width: 224
            height: 224
            target_mpp: 0.5
            split: train
            image_transforms:
              class_path: eva.vision.data.transforms.common.ResizeAndCrop
              init_args:
                size: ${oc.env:RESIZE_DIM, 224}
                mean: ${oc.env:NORMALIZE_MEAN, [0.485, 0.456, 0.406]}
                std: ${oc.env:NORMALIZE_STD, [0.229, 0.224, 0.225]}
        - class_path: eva.vision.datasets.PANDASmall
          init_args:
            <<: *PREDICT_DATASET_ARGS
            split: val
        - class_path: eva.vision.datasets.PANDASmall
          init_args:
            <<: *PREDICT_DATASET_ARGS
            split: test
    dataloaders:
      train:
        batch_size: &BATCH_SIZE ${oc.env:BATCH_SIZE, 32}
        shuffle: true
      val:
        batch_size: *BATCH_SIZE
      test:
        batch_size: *BATCH_SIZE
      predict:
        batch_size: &PREDICT_BATCH_SIZE ${oc.env:PREDICT_BATCH_SIZE, 64}
4 changes: 2 additions & 2 deletions docs/user-guide/advanced/replicate_evaluations.md
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,8 @@ were released on [HuggingFace](https://huggingface.co/bioptimus/H-optimus-0).

```
MODEL_NAME=pathology/bioptimus_h_optimus_0 \
NORMALIZE_MEAN=[0.707223, 0.578729, 0.703617] \
NORMALIZE_STD=[0.211883, 0.230117, 0.177517] \
NORMALIZE_MEAN=[0.707223,0.578729,0.703617] \
NORMALIZE_STD=[0.211883,0.230117,0.177517] \
IN_FEATURES=1024 \
eva predict_fit --config configs/vision/pathology/offline/<task>.yaml
```
Expand Down
9 changes: 5 additions & 4 deletions src/eva/core/data/splitting/random.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,13 @@ def random_split(
Returns:
The indices of the train, validation, and test sets as lists.
"""
if train_ratio + val_ratio + (test_ratio or 0) != 1:
raise ValueError("The sum of the ratios must be equal to 1.")
total_ratio = train_ratio + val_ratio + test_ratio
if total_ratio > 1.0:
raise ValueError("The sum of the ratios must be lower or equal to 1.")

random_generator = np.random.default_rng(seed)
n_samples = len(samples)
indices = random_generator.permutation(n_samples)
n_samples = int(total_ratio * len(samples))
indices = random_generator.permutation(len(samples))[:n_samples]

n_train = int(np.floor(train_ratio * n_samples))
n_val = n_samples - n_train if test_ratio == 0.0 else int(np.floor(val_ratio * n_samples)) or 1
Expand Down
18 changes: 12 additions & 6 deletions src/eva/core/data/splitting/stratified.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,29 +28,35 @@ def stratified_split(
"""
if len(samples) != len(targets):
raise ValueError("The number of samples and targets must be equal.")
if train_ratio + val_ratio + (test_ratio or 0) != 1:
raise ValueError("The sum of the ratios must be equal to 1.")
if train_ratio + val_ratio + (test_ratio or 0) > 1.0:
raise ValueError("The sum of the ratios must be lower or equal to 1.")

np.random.seed(seed)
use_all_samples = train_ratio + val_ratio + test_ratio == 1
random_generator = np.random.default_rng(seed)
unique_classes, y_indices = np.unique(targets, return_inverse=True)
n_classes = unique_classes.shape[0]

train_indices, val_indices, test_indices = [], [], []

for c in range(n_classes):
class_indices = np.where(y_indices == c)[0]
np.random.shuffle(class_indices)
random_generator.shuffle(class_indices)

n_train = int(np.floor(train_ratio * len(class_indices))) or 1
n_val = (
len(class_indices) - n_train
if test_ratio == 0.0
if test_ratio == 0.0 and use_all_samples
else int(np.floor(val_ratio * len(class_indices))) or 1
)

train_indices.extend(class_indices[:n_train])
val_indices.extend(class_indices[n_train : n_train + n_val])
if test_ratio > 0.0:
test_indices.extend(class_indices[n_train + n_val :])
n_test = (
len(class_indices) - n_train - n_val
if use_all_samples
else int(np.floor(test_ratio * len(class_indices))) or 1
)
test_indices.extend(class_indices[n_train + n_val : n_train + n_val + n_test])

return train_indices, val_indices, test_indices or None
2 changes: 2 additions & 0 deletions src/eva/vision/data/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
MHIST,
PANDA,
Camelyon16,
PANDASmall,
PatchCamelyon,
WsiClassificationDataset,
)
Expand All @@ -28,6 +29,7 @@
"CRC",
"MHIST",
"PANDA",
"PANDASmall",
"Camelyon16",
"PatchCamelyon",
"WsiClassificationDataset",
Expand Down
3 changes: 2 additions & 1 deletion src/eva/vision/data/datasets/classification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from eva.vision.data.datasets.classification.camelyon16 import Camelyon16
from eva.vision.data.datasets.classification.crc import CRC
from eva.vision.data.datasets.classification.mhist import MHIST
from eva.vision.data.datasets.classification.panda import PANDA
from eva.vision.data.datasets.classification.panda import PANDA, PANDASmall
from eva.vision.data.datasets.classification.patch_camelyon import PatchCamelyon
from eva.vision.data.datasets.classification.wsi import WsiClassificationDataset

Expand All @@ -15,5 +15,6 @@
"PatchCamelyon",
"WsiClassificationDataset",
"PANDA",
"PANDASmall",
"Camelyon16",
]
13 changes: 13 additions & 0 deletions src/eva/vision/data/datasets/classification/panda.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,3 +182,16 @@ def _get_target_from_path(self, file_path: str) -> int:

def _get_id_from_path(self, file_path: str) -> str:
return os.path.basename(file_path).replace(".tiff", "")


class PANDASmall(PANDA):
    """Reduced PANDA variant intended for quicker benchmarking.

    Only the split-ratio attributes are overridden; the three ratios add up
    to 0.2 rather than 1.0.
    """

    # Ratio of samples used for the train split.
    _train_split_ratio: float = 0.1

    # Ratio of samples used for the validation split.
    _val_split_ratio: float = 0.05

    # Ratio of samples used for the test split.
    _test_split_ratio: float = 0.05
10 changes: 2 additions & 8 deletions src/eva/vision/data/wsi/patching/samplers/_utils.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,8 @@
import random
from typing import Tuple

import numpy as np


def set_seed(seed: int) -> None:
random.seed(seed)
np.random.seed(seed)


def get_grid_coords_and_indices(
layer_shape: Tuple[int, int],
width: int,
Expand All @@ -33,8 +27,8 @@ def get_grid_coords_and_indices(

indices = list(range(len(x_y)))
if shuffle:
set_seed(seed)
np.random.shuffle(indices)
random_generator = np.random.default_rng(seed)
random_generator.shuffle(indices)
return x_y, indices


Expand Down
6 changes: 4 additions & 2 deletions src/eva/vision/data/wsi/patching/samplers/random.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def __init__(self, n_samples: int = 1, seed: int = 42):
"""Initializes the sampler."""
self.seed = seed
self.n_samples = n_samples
self.random_generator = random.Random(seed) # nosec

def sample(
self,
Expand All @@ -33,9 +34,10 @@ def sample(
layer_shape: The shape of the layer.
"""
_utils.validate_dimensions(width, height, layer_shape)
_utils.set_seed(self.seed)

x_max, y_max = layer_shape[0], layer_shape[1]
for _ in range(self.n_samples):
x, y = random.randint(0, x_max - width), random.randint(0, y_max - height) # nosec
x, y = self.random_generator.randint(0, x_max - width), self.random_generator.randint(
0, y_max - height
)
yield x, y
27 changes: 23 additions & 4 deletions tests/eva/core/data/splitting/test_random.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Tests for the random split function."""

from typing import List

import pytest

from eva.core.data import splitting
Expand Down Expand Up @@ -32,11 +34,11 @@ def test_split_ratios(n_samples: int, train_ratio: float, val_ratio: float, test
assert len(train_indices) + len(val_indices) + len(test_indices or []) == n_samples


@pytest.mark.parametrize("train_ratio, val_ratio, test_ratio", [(0.6, 0.3, 0.0), (0.6, 0.4, 0.3)])
@pytest.mark.parametrize("train_ratio, val_ratio, test_ratio", [(0.6, 0.7, 0.0), (0.6, 0.4, 0.3)])
def test_invalid_ratio_sums(train_ratio: float, val_ratio: float, test_ratio: float):
"""Tests if the function raises an error when the ratios sum to more than 1."""
samples = list(range(100))
expected_error = "The sum of the ratios must be equal to 1."
expected_error = "The sum of the ratios must be lower or equal to 1"
with pytest.raises(ValueError, match=expected_error):
splitting.random_split(samples, train_ratio, val_ratio, test_ratio)

Expand All @@ -53,8 +55,20 @@ def test_different_seeds_produce_different_outputs(seed1, seed2):
assert test1 != test2, "Different seeds should produce different test indices"


@pytest.mark.parametrize("seed", [42, 123, 999])
def test_same_seed_produces_same_outputs(seed):
@pytest.mark.parametrize(
"seed, train_expected_indices, val_expected_indices, test_expected_indices",
[
(42, [59, 21, 56, 18], [69, 15, 48, 55], [49, 6, 90, 11]),
(123, [21, 71, 92, 23], [89, 14, 64, 4], [45, 75, 62, 6]),
(999, [47, 42, 57, 50], [41, 3, 81, 61], [45, 6, 56, 67]),
],
)
def test_same_seed_produces_same_outputs(
seed: int,
train_expected_indices: List[int],
val_expected_indices: List[int],
test_expected_indices: List[int],
):
"""Tests if the same seed produces the same train, validation, and test indices."""
samples = list(range(100))
train1, val1, test1 = splitting.random_split(samples, 0.6, 0.2, 0.2, seed=seed)
Expand All @@ -63,6 +77,11 @@ def test_same_seed_produces_same_outputs(seed):
assert train1 == train2, "Same seed should produce the same train indices"
assert val1 == val2, "Same seed should produce the same validation indices"
assert test1 == test2, "Same seed should produce the same test indices"
assert isinstance(test1, list)

assert train1[: len(train_expected_indices)] == train_expected_indices, "Unexpected indices"
assert val1[: len(val_expected_indices)] == val_expected_indices, "Unexpected indices"
assert test1[: len(test_expected_indices)] == test_expected_indices, "Unexpected indices"


def test_no_test_set():
Expand Down
Loading

0 comments on commit be6dc72

Please sign in to comment.