Skip to content

Commit

Permalink
Rename CRC-HE to CRC (#189)
Browse files Browse the repository at this point in the history
  • Loading branch information
ioangatop authored Mar 1, 2024
1 parent e3e1d6a commit cddf829
Show file tree
Hide file tree
Showing 67 changed files with 74 additions and 61 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ The following datasets are supported

### Vision
- [BACH](./docs/datasets/bach.md)
- [CRC-HE](./docs/datasets/crc_he.md)
- [CRC](./docs/datasets/crc.md)
- [PatchCamelyon](./docs/datasets/patch_camelyon.md)
- [TotalSegmentator](./docs/datasets/total_segmentator.md)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
trainer:
class_path: eva.Trainer
init_args:
default_root_dir: &LIGHTNING_ROOT ${oc.env:LIGHTNING_ROOT, logs/offline/crc_he_nonorm}
default_root_dir: &LIGHTNING_ROOT ${oc.env:LIGHTNING_ROOT, logs/dino_vits16/offline/crc}
max_steps: &MAX_STEPS 12500
callbacks:
- class_path: pytorch_lightning.callbacks.LearningRateMonitor
Expand All @@ -23,7 +23,7 @@ trainer:
mode: *MONITOR_METRIC_MODE
- class_path: eva.callbacks.EmbeddingsWriter
init_args:
output_dir: &EMBEDDINGS_DIR ${oc.env:EMBEDDINGS_ROOT, ./data/embeddings}/crc_he_nonorm
output_dir: &EMBEDDINGS_DIR ${oc.env:EMBEDDINGS_ROOT, ./data/embeddings}/crc
dataloader_idx_map:
0: train
1: val
Expand Down Expand Up @@ -83,17 +83,17 @@ data:
<<: *DATASET_ARGS
split: val
predict:
- class_path: eva.vision.datasets.CRC_HE_NONORM
- class_path: eva.vision.datasets.CRC
init_args: &PREDICT_DATASET_ARGS
root: ${oc.env:DATA_ROOT, ./data}/crc_he
root: ${oc.env:DATA_ROOT, ./data}/crc
split: train
download: ${oc.env:DOWNLOAD_DATA, true}
image_transforms:
class_path: eva.vision.data.transforms.common.ResizeAndCrop
init_args:
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
- class_path: eva.vision.datasets.CRC_HE_NONORM
- class_path: eva.vision.datasets.CRC
init_args:
<<: *PREDICT_DATASET_ARGS
split: val
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
trainer:
class_path: eva.Trainer
init_args:
default_root_dir: &LIGHTNING_ROOT ${oc.env:LIGHTNING_ROOT, logs/offline/crc_he}
default_root_dir: &LIGHTNING_ROOT ${oc.env:LIGHTNING_ROOT, logs/dino_vits16/offline/crc_nonorm}
max_steps: &MAX_STEPS 12500
callbacks:
- class_path: pytorch_lightning.callbacks.LearningRateMonitor
Expand All @@ -23,7 +23,7 @@ trainer:
mode: *MONITOR_METRIC_MODE
- class_path: eva.callbacks.EmbeddingsWriter
init_args:
output_dir: &EMBEDDINGS_DIR ${oc.env:EMBEDDINGS_ROOT, ./data/embeddings}/crc_he
output_dir: &EMBEDDINGS_DIR ${oc.env:EMBEDDINGS_ROOT, ./data/embeddings}/crc_nonorm
dataloader_idx_map:
0: train
1: val
Expand Down Expand Up @@ -83,7 +83,7 @@ data:
<<: *DATASET_ARGS
split: val
predict:
- class_path: eva.vision.datasets.CRC_HE
- class_path: eva.vision.datasets.CRC_NONORM
init_args: &PREDICT_DATASET_ARGS
root: ${oc.env:DATA_ROOT, ./data}/crc_he
split: train
Expand All @@ -93,7 +93,7 @@ data:
init_args:
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
- class_path: eva.vision.datasets.CRC_HE
- class_path: eva.vision.datasets.CRC_NONORM
init_args:
<<: *PREDICT_DATASET_ARGS
split: val
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
trainer:
class_path: eva.Trainer
init_args:
default_root_dir: &LIGHTNING_ROOT ${oc.env:LIGHTNING_ROOT, logs/online/crc_he}
default_root_dir: &LIGHTNING_ROOT ${oc.env:LIGHTNING_ROOT, logs/dino_vits16/online/crc}
max_steps: &MAX_STEPS 12500
callbacks:
- class_path: pytorch_lightning.callbacks.LearningRateMonitor
Expand Down Expand Up @@ -65,9 +65,9 @@ data:
init_args:
datasets:
train:
class_path: eva.vision.datasets.CRC_HE
class_path: eva.vision.datasets.CRC
init_args: &DATASET_ARGS
root: ${oc.env:DATA_ROOT, ./data}/crc_he
root: ${oc.env:DATA_ROOT, ./data}/crc
split: train
download: ${oc.env:DOWNLOAD_DATA, true}
image_transforms:
Expand All @@ -76,7 +76,7 @@ data:
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
val:
class_path: eva.vision.datasets.CRC_HE
class_path: eva.vision.datasets.CRC
init_args:
<<: *DATASET_ARGS
split: val
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
trainer:
class_path: eva.Trainer
init_args:
default_root_dir: &LIGHTNING_ROOT ${oc.env:LIGHTNING_ROOT, logs/offline/crc_he}
default_root_dir: &LIGHTNING_ROOT ${oc.env:LIGHTNING_ROOT, logs/dino_vits16/online/crc_nonorm}
max_steps: &MAX_STEPS 12500
callbacks:
- class_path: pytorch_lightning.callbacks.LearningRateMonitor
Expand Down Expand Up @@ -65,9 +65,9 @@ data:
init_args:
datasets:
train:
class_path: eva.vision.datasets.CRC_HE_NONORM
class_path: eva.vision.datasets.CRC_NONORM
init_args: &DATASET_ARGS
root: ${oc.env:DATA_ROOT, ./data}/crc_he
root: ${oc.env:DATA_ROOT, ./data}/crc
split: train
download: ${oc.env:DOWNLOAD_DATA, true}
image_transforms:
Expand All @@ -76,7 +76,7 @@ data:
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
val:
class_path: eva.vision.datasets.CRC_HE_NONORM
class_path: eva.vision.datasets.CRC_NONORM
init_args:
<<: *DATASET_ARGS
split: val
Expand Down
27 changes: 18 additions & 9 deletions docs/datasets/crc_he.md → docs/datasets/crc.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# CRC-HE
# CRC

The CRC-HE dataset consists of labelled patches (9 classes) from colorectal cancer (CRC) and normal tissue. We use the `NCT-CRC-HE-100K` dataset for training and validation and the `CRC-VAL-HE-7K` dataset for testing.

Expand All @@ -19,19 +19,19 @@ The tissue classes are: Adipose (ADI), background (BACK), debris (DEB), lymphocy
| **Image dimension** | 224 x 224 x 3 |
| **FoV (μm/px)** | 20x (0.5) |
| **Files format** | `.tif` images |
| **Number of images** | 107,180 (100k train, 7.2k val) |
| **Splits in use** | NCT-CRC-HE-100K-NONORM (train), CRC-VAL-HE-7K (val) |
| **Number of images** | 107,180 (100k train, 7.2k val) |
| **Splits in use** | NCT-CRC-HE-100K (train), CRC-VAL-HE-7K (val) |


### Splits

We use the splits according to the data sources:

- Train split: `NCT-CRC-HE-100K-NONORM`
- Train split: `NCT-CRC-HE-100K`
- Validation split: `CRC-VAL-HE-7K`

| Splits | Train | Validation |
|---|-----------------|--------------|
| Splits | Train | Validation |
|----------|-----------------|--------------|
| #Samples | 100,000 (93.3%) | 7,180 (6.7%) |

A test split is not provided. Because the patient information for the training data is not available, dividing the
Expand All @@ -40,9 +40,18 @@ __eva__ therefore reports evaluation results for CRC HE on the validation split.

### Organization

The data `NCT-CRC-HE-100K-NONORM.zip` and `CRC-VAL-HE-7K.zip` from [zenodo](https://zenodo.org/records/1214456) are organized as follows:
The data `NCT-CRC-HE-100K.zip`, `NCT-CRC-HE-100K-NONORM.zip` and `CRC-VAL-HE-7K.zip`
from [zenodo](https://zenodo.org/records/1214456) are organized as follows:

```
NCT-CRC-HE-100K # All images used for training
├── ADI # All labelled patches belonging to the 1st class
│ ├── ADI-AAAFLCLY.tif
│ ├── ...
├── BACK # All labelled patches belonging to the 2nd class
│ ├── ...
└── ...
NCT-CRC-HE-100K-NONORM # All images used for training
├── ADI # All labelled patches belonging to the 1st class
│ ├── ADI-AAAFLCLY.tif
Expand All @@ -58,12 +67,12 @@ CRC-VAL-HE-7K # All images used for validation

## Download and preprocessing

The `CRC_HE` dataset class supports download the data no runtime with the initialized argument
The `CRC` dataset class supports downloading the data at runtime with the initialization argument
`download: bool = True`.

## Relevant links

* [CRC-HE datasets on zenodo](https://zenodo.org/records/1214456)
* [CRC datasets on zenodo](https://zenodo.org/records/1214456)
* [Reference API Vision dataset classes](../reference/vision/data/datasets.md)


Expand Down
12 changes: 6 additions & 6 deletions docs/datasets/index.md
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
# Datasets

**eva** provides out-of-the-box support for several public datasets. When possible, the corresponding dataset classes facilitate automatic download to disk, if not possible, this documentation provides download instructions.
**eva** provides native support for several public datasets. When possible, the corresponding dataset classes facilitate automatic download to disk; when that is not possible, this documentation provides download instructions.

## Vision Datasets Overview

### Whole Slide (WSI) and microscopy image datasets

| Dataset | #Patches | Patch Size | FoV (μm/px) | Task | Cancer Type | Download provided
|------------------------------------|----------|------------|---|----------------------------|-------------| ---|
| [BACH](bach.md) | 400 | 2048x1536 | 20x (0.5) | Classification (4 classes) | Breast | Yes |
| [CRC HE](crc_he.md) | 107,180 | 224x224 | 20x (0.5) | Classification (9 classes) | Colorectal | Yes |
| [PatchCamelyon](patch_camelyon.md) | 327,680 | 96x96 | 10x (1.0) \* | Classification (2 classes) | Breast | Yes |
| Dataset | #Patches | Patch Size | FoV (μm/px) | Task | Cancer Type |
|------------------------------------|----------|------------|--------------|----------------------------|-------------|
| [BACH](bach.md) | 400 | 2048x1536 | 20x (0.5) | Classification (4 classes) | Breast |
| [CRC](crc.md) | 107,180 | 224x224 | 20x (0.5) | Classification (9 classes) | Colorectal |
| [PatchCamelyon](patch_camelyon.md) | 327,680 | 96x96 | 10x (1.0) \* | Classification (2 classes) | Breast |

\* The slides were acquired and digitized at 2 different centres using a 40x objective but under-sampled to 10x to increase the field of view. Some papers do categorize it as 10x.

Expand Down
8 changes: 4 additions & 4 deletions src/eva/vision/data/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

from eva.vision.data.datasets.classification import (
BACH,
CRC_HE,
CRC_HE_NONORM,
CRC,
CRC_NONORM,
PatchCamelyon,
TotalSegmentatorClassification,
)
Expand All @@ -13,8 +13,8 @@

__all__ = [
"BACH",
"CRC_HE",
"CRC_HE_NONORM",
"CRC",
"CRC_NONORM",
"PatchEmbeddingDataset",
"ImageSegmentation",
"SlideEmbeddingDataset",
Expand Down
12 changes: 9 additions & 3 deletions src/eva/vision/data/datasets/classification/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
"""Image classification datasets API."""

from eva.vision.data.datasets.classification.bach import BACH
from eva.vision.data.datasets.classification.crc_he import CRC_HE
from eva.vision.data.datasets.classification.crc_he_nonorm import CRC_HE_NONORM
from eva.vision.data.datasets.classification.crc import CRC
from eva.vision.data.datasets.classification.crc_nonorm import CRC_NONORM
from eva.vision.data.datasets.classification.patch_camelyon import PatchCamelyon
from eva.vision.data.datasets.classification.total_segmentator import TotalSegmentatorClassification

__all__ = ["BACH", "CRC_HE", "CRC_HE_NONORM", "PatchCamelyon", "TotalSegmentatorClassification"]
__all__ = [
"BACH",
"CRC",
"CRC_NONORM",
"PatchCamelyon",
"TotalSegmentatorClassification",
]
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""CRC-HE dataset class."""
"""CRC dataset class."""

import os
from typing import Callable, Dict, List, Literal, Tuple
Expand All @@ -12,8 +12,8 @@
from eva.vision.utils import io


class CRC_HE(base.ImageClassification):
"""Dataset class for CRC-HE images and corresponding targets."""
class CRC(base.ImageClassification):
"""Dataset class for CRC images and corresponding targets."""

_train_resource: structs.DownloadResource = structs.DownloadResource(
filename="NCT-CRC-HE-100K.zip",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""CRC-HE-NONORM dataset class."""
"""CRC-NONORM dataset class."""

import os
from typing import Callable, Dict, List, Literal, Tuple
Expand All @@ -12,8 +12,8 @@
from eva.vision.utils import io


class CRC_HE_NONORM(base.ImageClassification):
"""Dataset class for CRC-HE-NONORM images and corresponding targets."""
class CRC_NONORM(base.ImageClassification):
"""Dataset class for CRC-NONORM images and corresponding targets."""

_train_resource: structs.DownloadResource = structs.DownloadResource(
filename="NCT-CRC-HE-100K-NONORM.zip",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""CRC_HE dataset tests."""
"""CRC dataset tests."""

import os
from typing import Literal
Expand All @@ -18,10 +18,10 @@
("val", 2),
],
)
def test_sample(crc_he_dataset: datasets.CRC_HE, index: int) -> None:
def test_sample(crc_dataset: datasets.CRC, index: int) -> None:
"""Tests the format of a dataset sample."""
# assert data sample is a tuple
sample = crc_he_dataset[index]
sample = crc_dataset[index]
assert isinstance(sample, tuple)
assert len(sample) == 2
# assert the format of the `image` and `target`
Expand All @@ -33,10 +33,10 @@ def test_sample(crc_he_dataset: datasets.CRC_HE, index: int) -> None:


@pytest.fixture(scope="function")
def crc_he_dataset(split: Literal["train", "val"], assets_path: str) -> datasets.CRC_HE:
"""CRC_HE dataset fixture."""
dataset = datasets.CRC_HE(
root=os.path.join(assets_path, "vision", "datasets", "crc_he"),
def crc_dataset(split: Literal["train", "val"], assets_path: str) -> datasets.CRC:
"""CRC dataset fixture."""
dataset = datasets.CRC(
root=os.path.join(assets_path, "vision", "datasets", "crc"),
split=split,
)
dataset.prepare_data()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""CRC_HE_NONORM dataset tests."""
"""CRC_NONORM dataset tests."""

import os
from typing import Literal
Expand All @@ -18,10 +18,10 @@
("val", 2),
],
)
def test_sample(crc_he_nonorm_dataset: datasets.CRC_HE_NONORM, index: int) -> None:
def test_sample(crc_nonorm_dataset: datasets.CRC_NONORM, index: int) -> None:
"""Tests the format of a dataset sample."""
# assert data sample is a tuple
sample = crc_he_nonorm_dataset[index]
sample = crc_nonorm_dataset[index]
assert isinstance(sample, tuple)
assert len(sample) == 2
# assert the format of the `image` and `target`
Expand All @@ -33,12 +33,10 @@ def test_sample(crc_he_nonorm_dataset: datasets.CRC_HE_NONORM, index: int) -> No


@pytest.fixture(scope="function")
def crc_he_nonorm_dataset(
split: Literal["train", "val"], assets_path: str
) -> datasets.CRC_HE_NONORM:
def crc_nonorm_dataset(split: Literal["train", "val"], assets_path: str) -> datasets.CRC_NONORM:
"""CRC_NONORM dataset fixture."""
dataset = datasets.CRC_HE_NONORM(
root=os.path.join(assets_path, "vision", "datasets", "crc_he"),
dataset = datasets.CRC_NONORM(
root=os.path.join(assets_path, "vision", "datasets", "crc"),
split=split,
)
dataset.prepare_data()
Expand Down

0 comments on commit cddf829

Please sign in to comment.