From 5e88b19aa0fea2dffbb3ca620f4c669ef3a67c4e Mon Sep 17 00:00:00 2001 From: LukasMut Date: Mon, 1 Apr 2024 13:37:39 +0200 Subject: [PATCH 01/17] refactored batchwise feature extraction and added explanation to README and the docs --- README.md | 4 +- docs/Alignment.md | 8 +- docs/AvailableModels.md | 151 +++++++++--------- docs/GettingStarted.md | 74 ++++++--- .../extraction/test_torch_vs_tensorflow.py | 17 +- thingsvision/_version.py | 2 +- thingsvision/core/extraction/base.py | 28 +++- thingsvision/core/extraction/extractors.py | 5 +- thingsvision/core/extraction/helpers.py | 5 +- thingsvision/core/extraction/tensorflow.py | 2 +- thingsvision/core/extraction/torch.py | 14 +- 11 files changed, 181 insertions(+), 129 deletions(-) diff --git a/README.md b/README.md index b7e98373..7a049926 100644 --- a/README.md +++ b/README.md @@ -119,7 +119,7 @@ If you want to extract features for [DreamSim](https://dreamsim-nights.github.io $ pip install dreamsim==0.1.2 ``` -See the [docs](https://vicco-group.github.io/thingsvision/AvailableModels.html) for which `DreamSim` models are available in `thingsvision`. +See the [docs](https://vicco-group.github.io/thingsvision/AvailableModels.html#dreamsim) for which `DreamSim` models are available in `thingsvision`. #### Google Colab. Alternatively, you can use Google Colab to play around with `thingsvision` by uploading your image data to Google Drive (via directory mounting). @@ -175,7 +175,7 @@ extractor = get_extractor( As a next step, create both dataset and dataloader for your images. We assume that all of your images are in a single `root` directory which can contain subfolders (e.g., for individual classes). Therefore, we leverage the `ImageDataset` class. ```python -root='path/to/root/image/directory' # (e.g., './images/) +root='path/to/your/image/directory' # (e.g., './images/) batch_size = 32 dataset = ImageDataset( diff --git a/docs/Alignment.md b/docs/Alignment.md index 68f8cf07..68059d4a 100644 --- a/docs/Alignment.md +++ b/docs/Alignment.md @@ -6,11 +6,11 @@ nav_order: 7 # Aligning neural network representations with human similarity judgments -Recent research in the space of representation learning has demonstrated the usefulness of aligning neural network representations with human similarity judgments for both machine learning (ML) downstream tasks and the Cognitive Sciences (see [here]((https://proceedings.neurips.cc/paper_files/paper/2023/hash/9febda1c8344cc5f2d51713964864e93-Abstract-Conference.html)) and [here](https://arxiv.org/pdf/2310.13018.pdf) for references). +Recent research in the space of representation learning has demonstrated the usefulness of aligning neural network representations with human similarity judgments for both machine learning (ML) downstream tasks and the Cognitive Sciences (see [here](https://openreview.net/pdf?id=ReDQ1OUQR0X), [here]((https://proceedings.neurips.cc/paper_files/paper/2023/hash/9febda1c8344cc5f2d51713964864e93-Abstract-Conference.html)), and [here](https://arxiv.org/pdf/2310.13018.pdf) for references). 
 While [harmonized models](https://vicco-group.github.io/thingsvision/AvailableModels.html#harmonization) or models fine-tuned using the [DreamSim](https://vicco-group.github.io/thingsvision/AvailableModels.html#dreamsim) objective are models whose weights were trained or fine-tuned to be human-aligned (and as such count as aligned models), there are ways to separate alignment from (pre-)training and post-align the features of a base model (such as CLIP) while preserving the representation structure of the base model.
 
 ## [gLocal](https://proceedings.neurips.cc/paper_files/paper/2023/hash/9febda1c8344cc5f2d51713964864e93-Abstract-Conference.html)
-If you want to align the extracted representations with human object similarity according to the approach introduced in *[Improving neural network representations using human similiarty judgments](https://proceedings.neurips.cc/paper_files/paper/2023/hash/9febda1c8344cc5f2d51713964864e93-Abstract-Conference.html)* you can optionally `align` the extracted features using the following method:
+If you want to post-align the extracted representations with human object similarity according to the approach introduced in *[Improving neural network representations using human similarity judgments](https://proceedings.neurips.cc/paper_files/paper/2023/hash/9febda1c8344cc5f2d51713964864e93-Abstract-Conference.html)*, you can optionally `align` the extracted features using the following method:
 
 ```python
 aligned_features = extractor.align(
@@ -20,7 +20,7 @@ aligned_features = extractor.align(
 )
 ```
 
-For now, representational alignment is only implemented for `gLocal` and for the following list of models: `clip_RN50`, `clip_ViT-L/14`, `OpenCLIP_ViT-L-14_laion400m_e32`, `OpenCLIP_ViT-L-14_laion2b_s32b_b82k` `dinov2-vit-base-p14`, `dinov2-vit-large-p14`, `dino-vit-base-p16`, `dino-vit-base-p8`, `resnet18`, `resnet50`, `vgg16`, `alexnet`. However, we plan to extend both the type of representational alignment and the range of models in future versions of `thingsvision`.
+Since this kind of alignment simply applies an affine transformation to a model's representation space, it is computationally very cheap. For now, representational alignment is only implemented for `gLocal` and for the following list of models: `clip_RN50`, `clip_ViT-L/14`, `OpenCLIP_ViT-L-14_laion400m_e32`, `OpenCLIP_ViT-L-14_laion2b_s32b_b82k`, `dinov2-vit-base-p14`, `dinov2-vit-large-p14`, `dino-vit-base-p16`, `dino-vit-base-p8`, `resnet18`, `resnet50`, `vgg16`, `alexnet`. However, we intend to extend both the type of representational alignment and the range of models in future versions of `thingsvision`.
 
-Caution: For `resnet18`, `resnet50`, `vgg16`, and `alexnet` gLocal does not achieve a *best-of-both-worlds-representation* for ML downstream tasks and human alignment. While gLocal significantly improves alignment with human similarity judgments for these models, it deteriorates their ML downstream task performance (such as few-shot learning and out-of-distribution detection). Hence, it does not transform the features into a *best-of-both-worlds-represenation* space as it does for CLIP-like models. If you are not interested in ML downstream task performance, you can safely ignore this.
+Caution: For the ImageNet-trained models `resnet18`, `resnet50`, `vgg16`, and `alexnet`, gLocal does not achieve a *best-of-both-worlds-representation* for ML downstream tasks and human alignment.
While gLocal significantly improves alignment with human similarity judgments for these models, it deteriorates their ML downstream task performance (such as few-shot learning and out-of-distribution detection). Hence, it does not transform the features into a *best-of-both-worlds-representation* space as it does for CLIP-like models. If you are not interested in ML downstream task performance, you can safely ignore this.
diff --git a/docs/AvailableModels.md b/docs/AvailableModels.md
index c2c2a156..86c06aaf 100644
--- a/docs/AvailableModels.md
+++ b/docs/AvailableModels.md
@@ -3,12 +3,12 @@ title: Available models and sources (+ examples)
 nav_order: 4
 ---
 
-# Available models and sources
+# Available models and their sources
 
-`thingsvision` currently supports many models from several different sources, which represent different places or other libraries from which the model architectures or weights can come from. You can find more information about which models are available in which source and notes on their usage on this page.
+`thingsvision` currently supports many models from several different sources, which represent different places or other libraries from which the model architectures or weights may come. You can find more information about which models are available in which source on this page. Additionally, we provide several notes on their usage.
 
 ## `torchvision`
-`thingsvision` supports all models from the `torchvision.models` module. You can find a list of all available models [here](https://pytorch.org/vision/stable/models.html).
+`thingsvision` supports all models from the `torchvision.models` module. You can find a list of all available `torchvision` models [here](https://pytorch.org/vision/stable/models.html).
 
 Example:
 ```python
@@ -31,7 +31,7 @@ extractor = get_extractor(
 
 Model names are case-sensitive and must be spelled exactly as they are in the `torchvision` documentation (e.g., `alexnet`, `resnet18`, `vgg16`, ...).
 
-If you use `pretrained=True`, the model will by default be pretrained on ImageNet, otherwise it is initialized randomly. For some models, `torchvision` provides multiple weight initializations, in which case you can pass the name of the weights in the `model_parameters` argument, e.g. if you want to get the extractor for a `RegNet Y 32GF` model, pretrained using SWAG and finetuned on ImageNet, you would do the following:
+If you use `pretrained=True`, the model weights will by default be pretrained on ImageNet; otherwise they are initialized randomly. For some models, `torchvision` provides multiple weight initializations, in which case you can pass the name of the weights in the `model_parameters` argument, e.g. if you want to get the extractor for a `RegNet Y 32GF` model, pretrained using SWAG and finetuned on ImageNet, you can do the following:
 
 ```python
 import torch
@@ -54,7 +54,7 @@ extractor = get_extractor(
 For a list of all available weights, please refer to the [torchvision documentation](https://pytorch.org/vision/stable/models.html).
 
 ## `timm`
-`thingsvision` supports all models from the `timm` module. You can find a list of all available models [here](https://rwightman.github.io/pytorch-image-models/models/).
+`thingsvision` supports all models from the `timm` module. You can find a list of all available `timm` models [here](https://rwightman.github.io/pytorch-image-models/models/).
 
Example: ```python @@ -79,6 +79,7 @@ If you use `pretrained=True`, the model will be pretrained according to the mode ## `ssl` `thingsvision` provides various Self-supervised learning models that are loaded from the [VISSL](https://vissl.readthedocs.io/en/v0.1.5/) library or the Torch Hub. + * SimCLR (`simclr-rn50`) * MoCov V2 (`mocov2-rn50`), * Jigsaw (`jigsaw-rn50`), @@ -89,12 +90,10 @@ If you use `pretrained=True`, the model will be pretrained according to the mode * VicReg (`vicreg-rn50`) * DINO (`dino-rn50`) -All models have the ResNet50 architecture and are pretrained on ImageNet-1K. -Here, the model name describes the pre-training method, instead of the model architecture. +All models have the ResNet50 architecture and are pretrained on ImageNet-1K. Here, the model name describes the pre-training objective rather than the model architecture. DINO models are available in ViT (Vision Transformer) and XCiT (Cross-Covariance Image Transformer) variants. For ViT models trained using DINO, the following models are available: `dino-vit-small-p8`, `dino-vit-small-p16`, `dino-vit-base-p8`, `dino-vit-base-p16`, where the trailing number describes the image patch resolution in the ViT (i.e. either 8x8 or 16x16). For the XCiT models, we have `dino-xcit-small-12-p16`, `dino-xcit-small-12-p8`, `dino-xcit-medium-24-p16`, `dino-xcit-medium-24-p8`, where the penultimate number represents model depth (12 = small, 24 = medium). - Example SimCLR: ```python @@ -122,7 +121,7 @@ from thingsvision import get_extractor model_name = 'dino-vit-base-p16' source = 'ssl' device = 'cuda' if torch.cuda.is_available() else 'cpu' -model_paramters = {"extract_cls_token": True} # extract features only for the [cls] token of DINO +model_paramters = {"extract_cls_token": True} # extract features exclusively for the [cls] token of DINO extractor = get_extractor( model_name=model_name, @@ -162,76 +161,49 @@ If you use `pretrained=True`, the model will be pretrained on ImageNet, otherwis In addition, we provide several custom models - that are not available in other sources -, in the `custom` source. These models are: -### CORnet -We provide all CORnet models from [this paper](https://proceedings.neurips.cc/paper/2019/file/7813d1590d28a7dd372ad54b5d29d033-Paper.pdf). Available model names are: - -- `cornet-s` -- `cornet-r` -- `cornet-rt` -- `cornet-z` - -Example: -```python -import torch -from thingsvision import get_extractor - -model_name = 'cornet-s' -source = 'custom' -device = 'cuda' if torch.cuda.is_available() else 'cpu' - -extractor = get_extractor( - model_name=model_name, - source=source, - device=device, - pretrained=True -) -``` - -### Models trained on Ecoset +### Official CLIP and OpenCLIP -We provide models trained on the [Ecoset](https://www.kietzmannlab.org/ecoset/) dataset, which contains 1.5m images from 565 categories selected to be both frequent in linguistic use and rated as concrete by human observers. Available `model_name`s are: +We provide [CLIP](https://arxiv.org/abs/2103.00020) models from the official CLIP repo and from [OpenCLIP](https://github.com/mlfoundations/open_clip). Available `model_name`'s are: -- `Alexnet_ecoset` -- `Resnet50_ecoset` -- `VGG16_ecoset` -- `Inception_ecoset` +- `clip` +- `OpenClip` -Example: +Both provide multiple model architectures and, in the case of OpenCLIP also different training datasets, which can both be specified using the `model_parameters` argument. 
For example, if you want to get a `ViT-B/32` model from the official CLIP repo (trained on WIT), you would do the following: ```python import torch from thingsvision import get_extractor -model_name = 'Alexnet_ecoset' +model_name = 'clip' source = 'custom' device = 'cuda' if torch.cuda.is_available() else 'cpu' +model_parameters = { + 'variant': 'ViT-B/32' +} extractor = get_extractor( model_name=model_name, source=source, device=device, - pretrained=True + pretrained=True, + model_parameters=model_parameters ) ``` -### Official CLIP and OpenCLIP - -We provide [CLIP](https://arxiv.org/abs/2103.00020) models from the official CLIP repo and from [OpenCLIP](https://github.com/mlfoundations/open_clip). Available `model_name`'s are: - -- `clip` -- `OpenClip` +`ViT-B/32` is the default model architecture, so you can also leave out the `model_parameters` argument. For a list of all available architectures and datasets, please refer to the [CLIP repo](https://github.com/openai/CLIP/blob/main/clip/clip.py). -Both provide multiple model architectures and, in the case of OpenCLIP also different training datasets, which can both be specified using the `model_parameters` argument. For example, if you want to get a `ViT-B/32` model from the official CLIP repo (trained on WIT), you would do the following: +In the case of `OpenCLIP`, you can also specify the dataset used for training for most models, e.g. if you want to get a `ViT-B/32` model trained on the `LAION-400M` dataset, you would do the following: ```python import torch from thingsvision import get_extractor -model_name = 'clip' +model_name = 'OpenCLIP' source = 'custom' device = 'cuda' if torch.cuda.is_available() else 'cpu' model_parameters = { - 'variant': 'ViT-B/32' + 'variant': 'ViT-B/32', + 'dataset': 'laion400m_e32' } extractor = get_extractor( @@ -243,20 +215,30 @@ extractor = get_extractor( ) ``` -`ViT-B/32` is the default model architecture, so you can also leave out the `model_parameters` argument. For a list of all available architectures and datasets, please refer to the [CLIP repo](https://github.com/openai/CLIP/blob/main/clip/clip.py). +For a list of all available architectures and datasets, please refer to the [OpenCLIP repo](https://github.com/mlfoundations/open_clip/blob/main/src/open_clip/pretrained.py). -In the case of `OpenCLIP`, you can also specify the dataset used for training for most models, e.g. if you want to get a `ViT-B/32` model trained on the `LAION-400M` dataset, you would do the following: +### [DreamSim](https://dreamsim-nights.github.io/) +In `thingsvision` you can extract representations from [DreamSim](https://dreamsim-nights.github.io/). See the official [DreamSim repo](https://github.com/ssundaram21/dreamsim) for more information. To extract features, install the `dreamsim` package with the following `pip` command (ideally, into your `thingsvision` environment): + +```bash + $ pip install dreamsim==0.1.2 +``` + +The base model name is: +- `DreamSim` + +We provide four `DreamSim` models: `clip_vitb32`, `open_clip_vitb32`, `dino_vitb16`, and a DreamSim `ensemble`. Specify this using the `model_parameters` argument. 
For instance, to get the OpenCLIP variant of DreamSim you want to do the following: ```python import torch from thingsvision import get_extractor -model_name = 'OpenCLIP' +model_name = 'DreamSim' +module_name = 'model.mlp' source = 'custom' device = 'cuda' if torch.cuda.is_available() else 'cpu' model_parameters = { - 'variant': 'ViT-B/32', - 'dataset': 'laion400m_e32' + 'variant': 'open_clip_vitb32' } extractor = get_extractor( @@ -268,9 +250,9 @@ extractor = get_extractor( ) ``` -For a list of all available architectures and datasets, please refer to the [OpenCLIP repo](https://github.com/mlfoundations/open_clip/blob/main/src/open_clip/pretrained.py). +To load the CLIP ViT-B/32 version of DreamSim, pass `'clip_vitb32'` to the `variant` parameter instead. Caution (!): for the DreamSim `dino_vitb16` and `ensemble` features can only be extracted from the `model.mlp` module and not for the `model` block. We are currently working on a version that allows feature extraction from the `model` block. Please be patient until then. -### Harmonization +### [Harmonization](https://github.com/serre-lab/harmonization) If you want to extract features for [harmonized models](https://vicco-group.github.io/thingsvision/AvailableModels.html#harmonization) from the [Harmonization repo](https://github.com/serre-lab/harmonization), you have to run the following `pip` command in your `thingsvision` environment (FYI: as of now, this seems to be working smoothly only on Ubuntu but not on macOS), @@ -312,38 +294,55 @@ extractor = get_extractor( ) ``` +### CORnet +We provide all CORnet models from [this paper](https://proceedings.neurips.cc/paper/2019/file/7813d1590d28a7dd372ad54b5d29d033-Paper.pdf). Available model names are: -### DreamSim -In `thingsvision` you can extract representations from [DreamSim](https://dreamsim-nights.github.io/). See the official [DreamSim repo](https://github.com/ssundaram21/dreamsim) for more information. To extract features, install the `dreamsim` package with the following `pip` command (ideally, into your `thingsvision` environment): +- `cornet-s` +- `cornet-r` +- `cornet-rt` +- `cornet-z` -```bash - $ pip install dreamsim==0.1.2 +Example: + +```python +import torch +from thingsvision import get_extractor + +model_name = 'cornet-s' +source = 'custom' +device = 'cuda' if torch.cuda.is_available() else 'cpu' + +extractor = get_extractor( + model_name=model_name, + source=source, + device=device, + pretrained=True +) ``` -The base model name is: -- `DreamSim` +### Models trained on Ecoset -We provide four `DreamSim` models: `clip_vitb32`, `open_clip_vitb32`, `dino_vitb16`, and a DreamSim `ensemble`. Specify this using the `model_parameters` argument. For instance, to get the OpenCLIP variant of DreamSim you want to do the following: +We also provide models trained on the [Ecoset](https://www.kietzmannlab.org/ecoset/) dataset, which contains 1.5m images from 565 categories selected to be both frequent in linguistic use and rated as concrete by human observers. 
Available `model_name`s are: + +- `Alexnet_ecoset` +- `Resnet50_ecoset` +- `VGG16_ecoset` +- `Inception_ecoset` + +Example: ```python import torch from thingsvision import get_extractor -model_name = 'DreamSim' -module_name = 'model.mlp' +model_name = 'Alexnet_ecoset' source = 'custom' device = 'cuda' if torch.cuda.is_available() else 'cpu' -model_parameters = { - 'variant': 'open_clip_vitb32' -} extractor = get_extractor( model_name=model_name, source=source, device=device, - pretrained=True, - model_parameters=model_parameters + pretrained=True ) -``` - -To load the CLIP ViT-B/32 version of DreamSim, pass `'clip_vitb32'` to the `variant` parameter instead. Caution (!): for the DreamSim `dino_vitb16` and `ensemble` features can only be extracted from the `model.mlp` module and not for the `model` block. We are currently working on a version that allows feature extraction from the `model` block. Please be patient until then. +``` \ No newline at end of file diff --git a/docs/GettingStarted.md b/docs/GettingStarted.md index 1e06e3ab..e4bbd4e9 100644 --- a/docs/GettingStarted.md +++ b/docs/GettingStarted.md @@ -4,8 +4,8 @@ nav_order: 2 --- # Getting started -### Setting up your environment -#### Working locally. +## Setting up your environment +### Working locally. First, create a new `conda environment` with Python version 3.8, 3.9, or 3.10 e.g. by using `conda`: ```bash @@ -21,7 +21,7 @@ $ pip install git+https://github.com/openai/CLIP.git $ pip install git+https://github.com/serre-lab/Harmonization.git ``` -### Google Colab. +### Google Colab Alternatively, you can use Google Colab to play around with `thingsvision` by uploading your image data to Google Drive (via directory mounting). You can find the jupyter notebook using `PyTorch` [here](https://colab.research.google.com/github/ViCCo-Group/thingsvision/blob/master/notebooks/pytorch.ipynb) and the `TensorFlow` example [here](https://colab.research.google.com/github/ViCCo-Group/thingsvision/blob/master/notebooks/tensorflow.ipynb).


@@ -30,7 +30,7 @@ You can find the jupyter notebook using `PyTorch` [here](https://colab.research. ## Basic usage -## Command Line Interface (CLI) +### Command Line Interface (CLI) `thingsvision` was designed to simplify feature extraction. If you have some folder of images (e.g., `./images`) and want to extract features for each of these images without opening a Jupyter Notebook instance or writing a Python script, it's probably easiest to use our CLI. The interface includes two options, @@ -44,10 +44,11 @@ thingsvision show-model --model-name "alexnet" --source "torchvision" thingsvision extract_features --image-root "./data" --model-name "alexnet" --module-name "features.10" --batch-size 32 --device "cuda" --source "torchvision" --file-format "npy" --out-path "./features" ``` -See `thingsvision show-model -h` and `thingsvision extract-features -h` for a list of all possible arguments. Note that the CLI provides just the basic extraction functionalities but is probably enough for most users that don't want to dive too deep into various models and modules. If you need more fine-grained control over the extraction itself, we recommend to use the python package directly and write your own Python script. +See `thingsvision show-model -h` and `thingsvision extract-features -h` for a list of all possible arguments. Note that the CLI provides just the basic extraction functionalities but is probably enough for most users that don't want to dive too deep into various models and modules. +### Python commands for custom script or notebook -To do this start by importing all the necessary components and instantiating a `thingsvision` extractor. Here we're using `AlexNet` from the `torchvision` library as the model to extract features from and also load the model to GPU for faster inference, +If you need more fine-grained control over the extraction itself, we recommend to use the python package directly and write your own Python script. To do this start by importing all the necessary components and instantiating a `thingsvision` extractor. Here we're using a `CLIP` model as the model to extract features from. In addition, we move the model to GPU for faster inference, ```python import torch @@ -55,53 +56,80 @@ from thingsvision import get_extractor from thingsvision.utils.storing import save_features from thingsvision.utils.data import ImageDataset, DataLoader -model_name = 'alexnet' -source = 'torchvision' +model_name = 'clip' +source = 'custom' device = 'cuda' if torch.cuda.is_available() else 'cpu' +model_parameters = { + 'variant': 'ViT-B/32' +} extractor = get_extractor( model_name=model_name, source=source, device=device, - pretrained=True + pretrained=True, + model_parameters=model_parameters, ) ``` -As a next step, create both dataset and dataloader for your images. We assume that all of your images are in a single `root` directory which can contain subfolders (e.g., for individual classes). Therefore, we leverage the `ImageDataset` class. +As a next step, create both a dataset and a dataloader for your images. Here, we assume that all of your images are stored in a single `root` directory which can contain subfolders (e.g., for individual classes as in ImageNet). Therefore, we leverage the thingsvision `ImageDataset` class. 
```python -root='path/to/root/img/directory' # (e.g., './images/) +root='path/to/your/image/directory' # (e.g., './images/) batch_size = 32 dataset = ImageDataset( - root=root, - out_path='path/to/features', - backend=extractor.get_backend(), - transforms=extractor.get_transformations() + root=root, + out_path='path/to/features', + backend=extractor.get_backend(), # backend framework of model + transforms=extractor.get_transformations(resize_dim=256, crop_dim=224) # set the input dimensionality to whichever values are required for your pretrained model ) batches = DataLoader( - dataset=dataset, - batch_size=batch_size, - backend=extractor.get_backend() + dataset=dataset, + batch_size=batch_size, + backend=extractor.get_backend() # backend framework of model ) ``` -Now all that is left is to extract the image features and store them to disk! We're extracting features from the last convolutional layer of AlexNet (`features.10`). +Now all that is left is to extract the image features and store them to disk! We're extracting features from the image encoder of CLIP (`visual`). ```python -module_name = 'features.10' +module_name = 'visual' features = extractor.extract_features( batches=batches, module_name=module_name, - flatten_acts=True # flatten 2D feature maps from convolutional layer + flatten_acts=True, # flatten 2D feature maps from an early convolutional or attention layer + output_type="ndarray", # or "tensor" (only applicable to PyTorch models of which CLIP is one!) ) -save_features(features, out_path='path/to/features', file_format='npy') +save_features(features, out_path='path/to/features', file_format='npy') # file_format can be set to "npy", "txt", "mat", "pt", or "hdf5" ``` -### Showing available modules +### Extraction with custom data pipeline and training loop + + +```python +module_name = 'visual' + +# your custom dataset and dataloader classes come here (for example, a PyTorch data loader) +my_dataset = ... +my_dataloader = ... + +# your custom training loop comes here +for batch in my_dataloader: + ... # whatever preprocessing you want to add to the batch + feature_batch = extractor.extract_batch( + batch=batch, + module_name=module_name, + flatten_acts=True, # flatten 2D feature maps from an early convolutional or attention layer + output_type="tensor", # optionally set the output type of feature matrix + ) + ... # whatever post-processing you want to add the features +``` + +### Showing available modules of a model If you don't know which modules exist in your model, you can use the `show_model` method to print a summary of the model architecture. 
For example, if you want to see which modules exist in AlexNet (using the extractor from above), you can run the following: ```python diff --git a/tests/extractor/extraction/test_torch_vs_tensorflow.py b/tests/extractor/extraction/test_torch_vs_tensorflow.py index 5373f379..79d25fec 100644 --- a/tests/extractor/extraction/test_torch_vs_tensorflow.py +++ b/tests/extractor/extraction/test_torch_vs_tensorflow.py @@ -1,4 +1,5 @@ import unittest +import torch import numpy as np import tests.helper as helper @@ -53,10 +54,12 @@ def test_custom_torch_vs_tf_extraction(self): batches=pt_dl, module_name=layer_name, flatten_acts=False, + output_type="tensor", ) - expected_features = np.array([[2, 2], [0, 0]]) - np.testing.assert_allclose(pt_features, expected_features) - np.testing.assert_allclose(tf_features, expected_features) + expected_features_pt = torch.tensor([[2, 2], [0, 0]]) + expected_features_tf = np.array([[2, 2], [0, 0]]) + np.testing.assert_allclose(pt_features, expected_features_pt) + np.testing.assert_allclose(tf_features, expected_features_tf) layer_name = "relu2" tf_features = tf_model.extract_features( @@ -68,7 +71,9 @@ def test_custom_torch_vs_tf_extraction(self): batches=pt_dl, module_name=layer_name, flatten_acts=False, + output_type="tensor", ) - expected_features = np.array([[4, 4], [0, 0]]) - np.testing.assert_allclose(pt_features, expected_features) - np.testing.assert_allclose(tf_features, expected_features) + expected_features_pt = torch.tensor([[4, 4], [0, 0]]) + expected_features_tf = np.array([[4, 4], [0, 0]]) + np.testing.assert_allclose(pt_features, expected_features_pt) + np.testing.assert_allclose(tf_features, expected_features_tf) diff --git a/thingsvision/_version.py b/thingsvision/_version.py index 50062f87..7a2056f5 100644 --- a/thingsvision/_version.py +++ b/thingsvision/_version.py @@ -1 +1 @@ -__version__ = "2.5.0" +__version__ = "2.5.1" diff --git a/thingsvision/core/extraction/base.py b/thingsvision/core/extraction/base.py index e88154de..1421991c 100644 --- a/thingsvision/core/extraction/base.py +++ b/thingsvision/core/extraction/base.py @@ -5,11 +5,10 @@ from typing import Callable, Iterator, List, Optional, Union import numpy as np +import torch from torchtyping import TensorType from tqdm.auto import tqdm -import torch - Array = np.ndarray @@ -72,11 +71,12 @@ def load_model(self) -> None: raise NotImplementedError @abc.abstractmethod - def _extract_batch( + def extract_batch( self, batch: Union[TensorType["b", "c", "h", "w"], Array], module_name: str, flatten_acts: bool, + output_type: Optional[str] = None, ) -> Union[ Union[ TensorType["b", "num_maps", "h_prime", "w_prime"], @@ -86,6 +86,24 @@ def _extract_batch( ], Array, ]: + """Extract hidden unit activations (at specified layer) for every image in the database. + + Parameters + ---------- + batch : np.ndarray or torch.Tensor + mini-batch of three-dimensional image tensors. + module_name : str + Name of the module for which features should be extraced. + flatten_acts : bool + Whether the activation of a tensor should be flattened to a vector. + output_type : str {"ndarray", "tensor"} + Whether to return output features as torch.Tensor or np.ndarray. + Available options are "ndarray" or "tensor". + Returns + ------- + output : np.ndarray or torch.Tensor + Returns the feature matrix (e.g., $X \in \mathbb{R}^{B \times d}$ if penultimate or logits layer or flatten_acts = True). 
+ """ raise NotImplementedError def get_output_types(self) -> List[str]: @@ -96,7 +114,7 @@ def extract_features( batches: Iterator[Union[TensorType["b", "c", "h", "w"], Array]], module_name: str, flatten_acts: bool, - output_type: str = "ndarray", + output_type: Optional[str] = "ndarray", output_dir: Optional[str] = None, step_size: Optional[int] = None, ) -> Union[ @@ -167,7 +185,7 @@ def extract_features( enumerate(batches, start=1), desc="Batch", total=len(batches) ): features.append( - self._extract_batch( + self.extract_batch( batch=batch, module_name=module_name, flatten_acts=flatten_acts ) ) diff --git a/thingsvision/core/extraction/extractors.py b/thingsvision/core/extraction/extractors.py index 183892de..a724de0a 100644 --- a/thingsvision/core/extraction/extractors.py +++ b/thingsvision/core/extraction/extractors.py @@ -2,12 +2,11 @@ from typing import Any, Callable, Dict, List, Optional, Union import numpy as np -import timm -import torchvision - import tensorflow as tf import tensorflow.keras.applications as tensorflow_models +import timm import torch +import torchvision try: from torch.hub import load_state_dict_from_url diff --git a/thingsvision/core/extraction/helpers.py b/thingsvision/core/extraction/helpers.py index ac547ca7..40821f1f 100644 --- a/thingsvision/core/extraction/helpers.py +++ b/thingsvision/core/extraction/helpers.py @@ -2,9 +2,8 @@ from typing import Any, Callable, Dict, Union import numpy as np -from torchtyping import TensorType - import torch +from torchtyping import TensorType from .extractors import ( KerasExtractor, @@ -67,7 +66,7 @@ class CustomExtractor(Extractor): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) - #TODO(lukasmut): this should probably be defined in the custom model itself + # TODO(lukasmut): this should probably be defined in the custom model itself if model_name.lower().startswith("clip"): def show_model(self): diff --git a/thingsvision/core/extraction/tensorflow.py b/thingsvision/core/extraction/tensorflow.py index 6015d616..bf736af5 100644 --- a/thingsvision/core/extraction/tensorflow.py +++ b/thingsvision/core/extraction/tensorflow.py @@ -35,7 +35,7 @@ def __init__( self.load_model() self.prepare_inference() - def _extract_batch( + def extract_batch( self, batch: Array, module_name: str, flatten_acts: bool ) -> Array: layer_out = [self.model.get_layer(module_name).output] diff --git a/thingsvision/core/extraction/torch.py b/thingsvision/core/extraction/torch.py index 5b9a4195..f57e1545 100644 --- a/thingsvision/core/extraction/torch.py +++ b/thingsvision/core/extraction/torch.py @@ -1,11 +1,11 @@ from typing import Any, Callable, Dict, Iterator, List, Optional, Union import numpy as np -from thingsvision.utils.alignment import gLocal +import torch from torchtyping import TensorType from torchvision import transforms as T -import torch +from thingsvision.utils.alignment import gLocal from .base import BaseExtractor @@ -92,11 +92,12 @@ def _unregister_hook(self) -> None: self.hook_handle.remove() @torch.no_grad() - def _extract_batch( + def extract_batch( self, batch: TensorType["b", "c", "h", "w"], module_name: str, flatten_acts: bool, + output_type: str = "tensor", ) -> Union[ TensorType["b", "num_maps", "h_prime", "w_prime"], TensorType["b", "t", "d"], @@ -108,8 +109,9 @@ def _extract_batch( _ = self.forward(batch) act = self.activations[module_name] if hasattr(self, "extract_cls_token"): - # we are only interested in the representations of the first token, i.e., [cls] token - act = act[:, 0, 
:].clone() + if self.extract_cls_token: + # we are only interested in the representations of the first token, i.e., [cls] token + act = act[:, 0, :].clone() if flatten_acts: if self.model_name.lower().startswith("clip"): act = self.flatten_acts(act, batch, module_name) @@ -118,6 +120,8 @@ def _extract_batch( if act.is_cuda or act.get_device() >= 0: torch.cuda.empty_cache() act = act.cpu() + if output_type == "ndarray": + act = self._to_numpy(act) return act def forward( From 5a5ecc567179c77d54908adc59d4377152f873c6 Mon Sep 17 00:00:00 2001 From: LukasMut Date: Mon, 1 Apr 2024 13:55:44 +0200 Subject: [PATCH 02/17] small refactor --- thingsvision/core/extraction/base.py | 20 ++++++++++++-------- thingsvision/core/extraction/tensorflow.py | 7 ++++++- thingsvision/core/extraction/torch.py | 1 + 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/thingsvision/core/extraction/base.py b/thingsvision/core/extraction/base.py index 1421991c..7b09e15f 100644 --- a/thingsvision/core/extraction/base.py +++ b/thingsvision/core/extraction/base.py @@ -109,6 +109,17 @@ def extract_batch( def get_output_types(self) -> List[str]: return ["ndarray", "tensor"] + def _module_and_output_check(self, module_name: str, output_type: str) -> None: + """Checks whether the provided module name and output type are valid.""" + valid_names = self.get_module_names() + if not module_name in valid_names: + raise ValueError( + f"\n{module_name} is not a valid module name. Please choose a name from the following set of modules: {valid_names}\n" + ) + assert ( + output_type in self.get_output_types() + ), f"\nData type of output feature matrix must be set to one of the following available data types: {self.get_output_types()}\n" + def extract_features( self, batches: Iterator[Union[TensorType["b", "c", "h", "w"], Array]], @@ -164,14 +175,7 @@ def extract_features( output : np.ndarray or torch.Tensor Returns the feature matrix (e.g., $X \in \mathbb{R}^{n \times d}$ if penultimate or logits layer or flatten_acts = True). """ - valid_names = self.get_module_names() - if not module_name in valid_names: - raise ValueError( - f"\n{module_name} is not a valid module name. 
Please choose a name from the following set of modules: {valid_names}\n" - ) - assert ( - output_type in self.get_output_types() - ), f"\nData type of output feature matrix must be set to one of the following available data types: {self.get_output_types()}\n" + self._module_and_output_check(module_name, output_type) if output_dir: os.makedirs(output_dir, exist_ok=True) diff --git a/thingsvision/core/extraction/tensorflow.py b/thingsvision/core/extraction/tensorflow.py index bf736af5..ffda2b2e 100644 --- a/thingsvision/core/extraction/tensorflow.py +++ b/thingsvision/core/extraction/tensorflow.py @@ -36,8 +36,13 @@ def __init__( self.prepare_inference() def extract_batch( - self, batch: Array, module_name: str, flatten_acts: bool + self, + batch: Array, + module_name: str, + flatten_acts: bool, + output_type="ndarray", ) -> Array: + self._module_and_output_check(module_name, output_type) layer_out = [self.model.get_layer(module_name).output] activation_model = keras.models.Model( inputs=self.model.input, diff --git a/thingsvision/core/extraction/torch.py b/thingsvision/core/extraction/torch.py index f57e1545..fa4ca549 100644 --- a/thingsvision/core/extraction/torch.py +++ b/thingsvision/core/extraction/torch.py @@ -104,6 +104,7 @@ def extract_batch( TensorType["b", "p"], TensorType["b", "d"], ]: + self._module_and_output_check(module_name, output_type) # move current batch to torch device batch = batch.to(self.device) _ = self.forward(batch) From c74c489ac20412608a87497409a6b6b7fa57eaa3 Mon Sep 17 00:00:00 2001 From: LukasMut Date: Mon, 1 Apr 2024 13:59:47 +0200 Subject: [PATCH 03/17] small refactor --- thingsvision/core/extraction/torch.py | 48 +++++++++++++-------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/thingsvision/core/extraction/torch.py b/thingsvision/core/extraction/torch.py index fa4ca549..f2437548 100644 --- a/thingsvision/core/extraction/torch.py +++ b/thingsvision/core/extraction/torch.py @@ -41,30 +41,6 @@ def __init__( self.load_model() self.prepare_inference() - def extract_features( - self, - batches: Iterator, - module_name: str, - flatten_acts: bool, - output_type: str = "ndarray", - output_dir: Optional[str] = None, - step_size: Optional[int] = None, - ): - self.model = self.model.to(self.device) - self.activations = {} - self.register_hook(module_name=module_name) - features = super().extract_features( - batches=batches, - module_name=module_name, - flatten_acts=flatten_acts, - output_type=output_type, - output_dir=output_dir, - step_size=step_size, - ) - if self.hook_handle: - self._unregister_hook() - return features - def get_activation(self, name: str) -> Callable: """Store copy of activations for a specific layer of the model.""" @@ -125,6 +101,30 @@ def extract_batch( act = self._to_numpy(act) return act + def extract_features( + self, + batches: Iterator, + module_name: str, + flatten_acts: bool, + output_type: str = "ndarray", + output_dir: Optional[str] = None, + step_size: Optional[int] = None, + ): + self.model = self.model.to(self.device) + self.activations = {} + self.register_hook(module_name=module_name) + features = super().extract_features( + batches=batches, + module_name=module_name, + flatten_acts=flatten_acts, + output_type=output_type, + output_dir=output_dir, + step_size=step_size, + ) + if self.hook_handle: + self._unregister_hook() + return features + def forward( self, batch: TensorType["b", "c", "h", "w"] ) -> TensorType["b", "num_cls"]: From 522d918134fadc38d09782b3741010b827001775 Mon Sep 17 00:00:00 2001 From: 
LukasMut Date: Mon, 1 Apr 2024 14:02:52 +0200 Subject: [PATCH 04/17] small fix in docs --- docs/GettingStarted.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/GettingStarted.md b/docs/GettingStarted.md index e4bbd4e9..393e813e 100644 --- a/docs/GettingStarted.md +++ b/docs/GettingStarted.md @@ -124,9 +124,9 @@ for batch in my_dataloader: batch=batch, module_name=module_name, flatten_acts=True, # flatten 2D feature maps from an early convolutional or attention layer - output_type="tensor", # optionally set the output type of feature matrix + output_type="tensor", # optionally set the output type of the feature matrix ) - ... # whatever post-processing you want to add the features + ... # whatever post-processing you want to add to the extracted features ``` ### Showing available modules of a model From 254059b332985518b2e4b5f8a6cb710418d5b017 Mon Sep 17 00:00:00 2001 From: LukasMut Date: Mon, 1 Apr 2024 14:47:47 +0200 Subject: [PATCH 05/17] modified tests --- .../extraction/test_torch_vs_tensorflow.py | 33 ++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/tests/extractor/extraction/test_torch_vs_tensorflow.py b/tests/extractor/extraction/test_torch_vs_tensorflow.py index 79d25fec..dd4f5d87 100644 --- a/tests/extractor/extraction/test_torch_vs_tensorflow.py +++ b/tests/extractor/extraction/test_torch_vs_tensorflow.py @@ -62,18 +62,21 @@ def test_custom_torch_vs_tf_extraction(self): np.testing.assert_allclose(tf_features, expected_features_tf) layer_name = "relu2" - tf_features = tf_model.extract_features( - batches=tf_dl, - module_name=layer_name, - flatten_acts=False, - ) - pt_features = pt_model.extract_features( - batches=pt_dl, - module_name=layer_name, - flatten_acts=False, - output_type="tensor", - ) - expected_features_pt = torch.tensor([[4, 4], [0, 0]]) - expected_features_tf = np.array([[4, 4], [0, 0]]) - np.testing.assert_allclose(pt_features, expected_features_pt) - np.testing.assert_allclose(tf_features, expected_features_tf) + expected_features = np.array([[4, 4], [0, 0]]) + for i, batch in enumerate(tf_dl): + tf_features = tf_model.extract_batch( + batches=batch, + module_name=layer_name, + flatten_acts=False, + ) + np.testing.assert_allclose(tf_features, expected_features[i]) + + for i, batch in enumerate(pt_dl): + pt_features = pt_model.extract_batch( + batches=batch, + module_name=layer_name, + flatten_acts=False, + output_type="ndarray", + ) + np.testing.assert_allclose(pt_features, expected_features[i]) + From f0390a918cc93319e164b6d8783399065478f90b Mon Sep 17 00:00:00 2001 From: Lukas Muttenthaler Date: Mon, 1 Apr 2024 15:04:37 +0200 Subject: [PATCH 06/17] Update test_torch_vs_tensorflow.py --- tests/extractor/extraction/test_torch_vs_tensorflow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/extractor/extraction/test_torch_vs_tensorflow.py b/tests/extractor/extraction/test_torch_vs_tensorflow.py index dd4f5d87..943aa498 100644 --- a/tests/extractor/extraction/test_torch_vs_tensorflow.py +++ b/tests/extractor/extraction/test_torch_vs_tensorflow.py @@ -65,7 +65,7 @@ def test_custom_torch_vs_tf_extraction(self): expected_features = np.array([[4, 4], [0, 0]]) for i, batch in enumerate(tf_dl): tf_features = tf_model.extract_batch( - batches=batch, + batch=batch, module_name=layer_name, flatten_acts=False, ) @@ -73,7 +73,7 @@ def test_custom_torch_vs_tf_extraction(self): for i, batch in enumerate(pt_dl): pt_features = pt_model.extract_batch( - batches=batch, + batch=batch, 
module_name=layer_name, flatten_acts=False, output_type="ndarray", From 9fd144bb58f62cd7cf9000bac258efc8a7a6bdd2 Mon Sep 17 00:00:00 2001 From: Lukas Muttenthaler Date: Mon, 1 Apr 2024 15:34:33 +0200 Subject: [PATCH 07/17] Update test_torch_vs_tensorflow.py --- tests/extractor/extraction/test_torch_vs_tensorflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/extractor/extraction/test_torch_vs_tensorflow.py b/tests/extractor/extraction/test_torch_vs_tensorflow.py index 943aa498..3d267e77 100644 --- a/tests/extractor/extraction/test_torch_vs_tensorflow.py +++ b/tests/extractor/extraction/test_torch_vs_tensorflow.py @@ -62,7 +62,7 @@ def test_custom_torch_vs_tf_extraction(self): np.testing.assert_allclose(tf_features, expected_features_tf) layer_name = "relu2" - expected_features = np.array([[4, 4], [0, 0]]) + expected_features = np.array([[4., 4.], [0., 0.]]) for i, batch in enumerate(tf_dl): tf_features = tf_model.extract_batch( batch=batch, From fa4be381f1426a398406aa591f7158f96ca52818 Mon Sep 17 00:00:00 2001 From: Lukas Muttenthaler Date: Mon, 1 Apr 2024 16:08:02 +0200 Subject: [PATCH 08/17] Update test_torch_vs_tensorflow.py --- tests/extractor/extraction/test_torch_vs_tensorflow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/extractor/extraction/test_torch_vs_tensorflow.py b/tests/extractor/extraction/test_torch_vs_tensorflow.py index 3d267e77..d494ee9f 100644 --- a/tests/extractor/extraction/test_torch_vs_tensorflow.py +++ b/tests/extractor/extraction/test_torch_vs_tensorflow.py @@ -69,7 +69,7 @@ def test_custom_torch_vs_tf_extraction(self): module_name=layer_name, flatten_acts=False, ) - np.testing.assert_allclose(tf_features, expected_features[i]) + np.testing.assert_allclose(tf_features, expected_features[i][None,:]) for i, batch in enumerate(pt_dl): pt_features = pt_model.extract_batch( @@ -78,5 +78,5 @@ def test_custom_torch_vs_tf_extraction(self): flatten_acts=False, output_type="ndarray", ) - np.testing.assert_allclose(pt_features, expected_features[i]) + np.testing.assert_allclose(pt_features, expected_features[i][None,:]) From 04f7c4b1eb3df80ba0808fc6554cf7d2139b4c52 Mon Sep 17 00:00:00 2001 From: Lukas Muttenthaler Date: Mon, 1 Apr 2024 16:45:25 +0200 Subject: [PATCH 09/17] Update torch.py --- thingsvision/core/extraction/torch.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/thingsvision/core/extraction/torch.py b/thingsvision/core/extraction/torch.py index f2437548..87e135ed 100644 --- a/thingsvision/core/extraction/torch.py +++ b/thingsvision/core/extraction/torch.py @@ -83,6 +83,7 @@ def extract_batch( self._module_and_output_check(module_name, output_type) # move current batch to torch device batch = batch.to(self.device) + self.register_hook(module_name=module_name) _ = self.forward(batch) act = self.activations[module_name] if hasattr(self, "extract_cls_token"): @@ -99,6 +100,7 @@ def extract_batch( act = act.cpu() if output_type == "ndarray": act = self._to_numpy(act) + self._unregister_hook() return act def extract_features( @@ -112,7 +114,6 @@ def extract_features( ): self.model = self.model.to(self.device) self.activations = {} - self.register_hook(module_name=module_name) features = super().extract_features( batches=batches, module_name=module_name, From 2c34e43427eea30bb74abdb8e981983b29e8e23f Mon Sep 17 00:00:00 2001 From: Lukas Muttenthaler Date: Mon, 1 Apr 2024 17:33:06 +0200 Subject: [PATCH 10/17] Update torch.py --- thingsvision/core/extraction/torch.py | 31 
+++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/thingsvision/core/extraction/torch.py b/thingsvision/core/extraction/torch.py index 87e135ed..929a9087 100644 --- a/thingsvision/core/extraction/torch.py +++ b/thingsvision/core/extraction/torch.py @@ -102,6 +102,36 @@ def extract_batch( act = self._to_numpy(act) self._unregister_hook() return act + + @torch.no_grad() + def _extract_batch( + self, + batch: TensorType["b", "c", "h", "w"], + module_name: str, + flatten_acts: bool, + ) -> Union[ + TensorType["b", "num_maps", "h_prime", "w_prime"], + TensorType["b", "t", "d"], + TensorType["b", "p"], + TensorType["b", "d"], + ]: + # move current batch to torch device + batch = batch.to(self.device) + _ = self.forward(batch) + act = self.activations[module_name] + if hasattr(self, "extract_cls_token"): + if self.extract_cls_token: + # we are only interested in the representations of the first token, i.e., [cls] token + act = act[:, 0, :].clone() + if flatten_acts: + if self.model_name.lower().startswith("clip"): + act = self.flatten_acts(act, batch, module_name) + else: + act = self.flatten_acts(act) + if act.is_cuda or act.get_device() >= 0: + torch.cuda.empty_cache() + act = act.cpu() + return act def extract_features( self, @@ -114,6 +144,7 @@ def extract_features( ): self.model = self.model.to(self.device) self.activations = {} + self.register_hook(module_name=module_name) features = super().extract_features( batches=batches, module_name=module_name, From 1d0c679f660b571a33d1c444ceba96d9434e3807 Mon Sep 17 00:00:00 2001 From: Lukas Muttenthaler Date: Mon, 1 Apr 2024 17:35:33 +0200 Subject: [PATCH 11/17] Update base.py --- thingsvision/core/extraction/base.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/thingsvision/core/extraction/base.py b/thingsvision/core/extraction/base.py index 7b09e15f..f15a0a5d 100644 --- a/thingsvision/core/extraction/base.py +++ b/thingsvision/core/extraction/base.py @@ -86,7 +86,7 @@ def extract_batch( ], Array, ]: - """Extract hidden unit activations (at specified layer) for every image in the database. + """Extract hidden unit activations (at specified layer) for every image in a mini-batch. Parameters ---------- @@ -105,6 +105,24 @@ def extract_batch( Returns the feature matrix (e.g., $X \in \mathbb{R}^{B \times d}$ if penultimate or logits layer or flatten_acts = True). 
""" raise NotImplementedError + + + @abc.abstractmethod + def _extract_batch( + self, + batch: Union[TensorType["b", "c", "h", "w"], Array], + module_name: str, + flatten_acts: bool, + ) -> Union[ + Union[ + TensorType["b", "num_maps", "h_prime", "w_prime"], + TensorType["b", "t", "d"], + TensorType["b", "p"], + TensorType["b", "d"], + ], + Array, + ]: + raise NotImplementedError def get_output_types(self) -> List[str]: return ["ndarray", "tensor"] @@ -189,7 +207,7 @@ def extract_features( enumerate(batches, start=1), desc="Batch", total=len(batches) ): features.append( - self.extract_batch( + self._extract_batch( batch=batch, module_name=module_name, flatten_acts=flatten_acts ) ) From 552708c399267f92424151eff7847f6010baa932 Mon Sep 17 00:00:00 2001 From: Lukas Muttenthaler Date: Mon, 1 Apr 2024 17:38:20 +0200 Subject: [PATCH 12/17] Update tensorflow.py --- thingsvision/core/extraction/tensorflow.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/thingsvision/core/extraction/tensorflow.py b/thingsvision/core/extraction/tensorflow.py index ffda2b2e..1efb761c 100644 --- a/thingsvision/core/extraction/tensorflow.py +++ b/thingsvision/core/extraction/tensorflow.py @@ -34,15 +34,13 @@ def __init__( if not self.model: self.load_model() self.prepare_inference() - - def extract_batch( + + def _extract_batch( self, batch: Array, module_name: str, flatten_acts: bool, - output_type="ndarray", ) -> Array: - self._module_and_output_check(module_name, output_type) layer_out = [self.model.get_layer(module_name).output] activation_model = keras.models.Model( inputs=self.model.input, @@ -53,6 +51,17 @@ def extract_batch( activations = activations.reshape(activations.shape[0], -1) return activations + def extract_batch( + self, + batch: Array, + module_name: str, + flatten_acts: bool, + output_type="ndarray", + ) -> Array: + self._module_and_output_check(module_name, output_type) + activations = self._extract_batch(batch, module_name, flatten_acts) + return activations + def show_model(self) -> str: return self.model.summary() From 8c9a7bb554fce8aaf1ea13ea8c85a72210d0a60c Mon Sep 17 00:00:00 2001 From: Lukas Muttenthaler Date: Mon, 1 Apr 2024 17:52:30 +0200 Subject: [PATCH 13/17] Update torch.py --- thingsvision/core/extraction/torch.py | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/thingsvision/core/extraction/torch.py b/thingsvision/core/extraction/torch.py index 929a9087..c12ef6ec 100644 --- a/thingsvision/core/extraction/torch.py +++ b/thingsvision/core/extraction/torch.py @@ -81,23 +81,8 @@ def extract_batch( TensorType["b", "d"], ]: self._module_and_output_check(module_name, output_type) - # move current batch to torch device - batch = batch.to(self.device) self.register_hook(module_name=module_name) - _ = self.forward(batch) - act = self.activations[module_name] - if hasattr(self, "extract_cls_token"): - if self.extract_cls_token: - # we are only interested in the representations of the first token, i.e., [cls] token - act = act[:, 0, :].clone() - if flatten_acts: - if self.model_name.lower().startswith("clip"): - act = self.flatten_acts(act, batch, module_name) - else: - act = self.flatten_acts(act) - if act.is_cuda or act.get_device() >= 0: - torch.cuda.empty_cache() - act = act.cpu() + act = self._extract_batch(batch, module_name, flatten_acts) if output_type == "ndarray": act = self._to_numpy(act) self._unregister_hook() From 7c87ac7e9ed5168623b02e9918d049e1fea182fd Mon Sep 17 00:00:00 2001 From: Lukas Muttenthaler Date: 
Mon, 1 Apr 2024 18:25:19 +0200 Subject: [PATCH 14/17] Update torch.py --- thingsvision/core/extraction/torch.py | 1 - 1 file changed, 1 deletion(-) diff --git a/thingsvision/core/extraction/torch.py b/thingsvision/core/extraction/torch.py index c12ef6ec..6315c799 100644 --- a/thingsvision/core/extraction/torch.py +++ b/thingsvision/core/extraction/torch.py @@ -67,7 +67,6 @@ def register_hook(self, module_name: str) -> None: def _unregister_hook(self) -> None: self.hook_handle.remove() - @torch.no_grad() def extract_batch( self, batch: TensorType["b", "c", "h", "w"], From cdac17aa7f795da7ae74c5d63b54d0f66ca98c9a Mon Sep 17 00:00:00 2001 From: Lukas Muttenthaler Date: Mon, 1 Apr 2024 18:41:40 +0200 Subject: [PATCH 15/17] Update tensorflow.py --- thingsvision/core/extraction/tensorflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thingsvision/core/extraction/tensorflow.py b/thingsvision/core/extraction/tensorflow.py index 1efb761c..ce63f5f1 100644 --- a/thingsvision/core/extraction/tensorflow.py +++ b/thingsvision/core/extraction/tensorflow.py @@ -56,7 +56,7 @@ def extract_batch( batch: Array, module_name: str, flatten_acts: bool, - output_type="ndarray", + output_type: str = "ndarray", ) -> Array: self._module_and_output_check(module_name, output_type) activations = self._extract_batch(batch, module_name, flatten_acts) From 044c5c568baea49ccadf3852e793fc09f2180c4a Mon Sep 17 00:00:00 2001 From: Lukas Muttenthaler Date: Mon, 1 Apr 2024 18:42:56 +0200 Subject: [PATCH 16/17] Update base.py --- thingsvision/core/extraction/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thingsvision/core/extraction/base.py b/thingsvision/core/extraction/base.py index f15a0a5d..80d520a5 100644 --- a/thingsvision/core/extraction/base.py +++ b/thingsvision/core/extraction/base.py @@ -76,7 +76,7 @@ def extract_batch( batch: Union[TensorType["b", "c", "h", "w"], Array], module_name: str, flatten_acts: bool, - output_type: Optional[str] = None, + output_type: str, ) -> Union[ Union[ TensorType["b", "num_maps", "h_prime", "w_prime"], From e13455226017a692d268d32f0f2ca6a52b16548a Mon Sep 17 00:00:00 2001 From: LukasMut Date: Mon, 1 Apr 2024 19:49:18 +0200 Subject: [PATCH 17/17] updated tests --- .../extraction/test_torch_vs_tensorflow.py | 34 +++++++++++-------- tests/helper.py | 6 ++++ thingsvision/core/extraction/base.py | 3 +- thingsvision/core/extraction/tensorflow.py | 2 +- thingsvision/core/extraction/torch.py | 2 +- 5 files changed, 28 insertions(+), 19 deletions(-) diff --git a/tests/extractor/extraction/test_torch_vs_tensorflow.py b/tests/extractor/extraction/test_torch_vs_tensorflow.py index d494ee9f..41bc732a 100644 --- a/tests/extractor/extraction/test_torch_vs_tensorflow.py +++ b/tests/extractor/extraction/test_torch_vs_tensorflow.py @@ -45,21 +45,25 @@ def test_custom_torch_vs_tf_extraction(self): pt_model.backend = pt_backend layer_name = "relu" - tf_features = tf_model.extract_features( - batches=tf_dl, - module_name=layer_name, - flatten_acts=False, - ) - pt_features = pt_model.extract_features( - batches=pt_dl, - module_name=layer_name, - flatten_acts=False, - output_type="tensor", - ) - expected_features_pt = torch.tensor([[2, 2], [0, 0]]) - expected_features_tf = np.array([[2, 2], [0, 0]]) - np.testing.assert_allclose(pt_features, expected_features_pt) - np.testing.assert_allclose(tf_features, expected_features_tf) + expected_features_pt = torch.tensor([[2., 2.], [0., 0.]]) + expected_features_tf = np.array([[2., 2.], [0, 0.]]) + + for i, batch in 
enumerate(tf_dl): + tf_features = tf_model.extract_batch( + batch=batch, + module_name=layer_name, + flatten_acts=False, + ) + np.testing.assert_allclose(tf_features, expected_features_tf[i][None,:]) + + for i, batch in enumerate(pt_dl): + pt_features = pt_model.extract_batch( + batch=batch, + module_name=layer_name, + flatten_acts=False, + output_type="tensor", + ) + np.testing.assert_allclose(pt_features, expected_features_pt[i][None,:]) layer_name = "relu2" expected_features = np.array([[4., 4.], [0., 0.]]) diff --git a/tests/helper.py b/tests/helper.py index 723c3830..59b688b4 100644 --- a/tests/helper.py +++ b/tests/helper.py @@ -100,6 +100,12 @@ "pretrained": True, "source": "keras", }, + "VGG19_keras": { + "model_name": "VGG19", + "modules": ["block1_conv1", "flatten"], + "pretrained": False, + "source": "keras", + }, # Vissl models "simclr-rn50": { "model_name": "simclr-rn50", diff --git a/thingsvision/core/extraction/base.py b/thingsvision/core/extraction/base.py index 80d520a5..c789d119 100644 --- a/thingsvision/core/extraction/base.py +++ b/thingsvision/core/extraction/base.py @@ -105,8 +105,7 @@ def extract_batch( Returns the feature matrix (e.g., $X \in \mathbb{R}^{B \times d}$ if penultimate or logits layer or flatten_acts = True). """ raise NotImplementedError - - + @abc.abstractmethod def _extract_batch( self, diff --git a/thingsvision/core/extraction/tensorflow.py b/thingsvision/core/extraction/tensorflow.py index ce63f5f1..e6f76606 100644 --- a/thingsvision/core/extraction/tensorflow.py +++ b/thingsvision/core/extraction/tensorflow.py @@ -34,7 +34,7 @@ def __init__( if not self.model: self.load_model() self.prepare_inference() - + def _extract_batch( self, batch: Array, diff --git a/thingsvision/core/extraction/torch.py b/thingsvision/core/extraction/torch.py index 6315c799..2f4e1833 100644 --- a/thingsvision/core/extraction/torch.py +++ b/thingsvision/core/extraction/torch.py @@ -86,7 +86,7 @@ def extract_batch( act = self._to_numpy(act) self._unregister_hook() return act - + @torch.no_grad() def _extract_batch( self,
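
For reference, the batch-wise extraction workflow exposed by these changes can be sketched end to end as follows. This is a minimal sketch rather than part of the patch: it assumes a pretrained `alexnet` extractor from the `torchvision` source, the `features.10` module used elsewhere in the docs, and placeholder paths for the image directory and the output folder.

```python
import numpy as np
import torch

from thingsvision import get_extractor
from thingsvision.utils.data import DataLoader, ImageDataset
from thingsvision.utils.storing import save_features

# assumptions: 'alexnet' from torchvision and its last conv layer 'features.10' (as in the docs);
# 'path/to/your/image/directory' and 'path/to/features' are placeholders
extractor = get_extractor(
    model_name='alexnet',
    source='torchvision',
    device='cuda' if torch.cuda.is_available() else 'cpu',
    pretrained=True,
)

dataset = ImageDataset(
    root='path/to/your/image/directory',
    out_path='path/to/features',
    backend=extractor.get_backend(),  # backend framework of the model
    transforms=extractor.get_transformations(),
)
batches = DataLoader(
    dataset=dataset,
    batch_size=32,
    backend=extractor.get_backend(),
)

# extract features one mini-batch at a time with the now-public extract_batch method
feature_batches = []
for batch in batches:
    feature_batches.append(
        extractor.extract_batch(
            batch=batch,
            module_name='features.10',
            flatten_acts=True,      # flatten the 2D feature maps of the convolutional layer
            output_type="ndarray",  # or "tensor" for PyTorch models
        )
    )
features = np.concatenate(feature_batches, axis=0)
save_features(features, out_path='path/to/features', file_format='npy')
```

Compared to `extract_features`, which consumes a whole dataloader at once, `extract_batch` gives per-batch control, which is what the custom data pipeline and training loop example in the updated `GettingStarted` docs relies on.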