From 34d482fb0893164bd5c3aa543e57ecd5ec9b4382 Mon Sep 17 00:00:00 2001
From: HonzaCuhel
Date: Fri, 17 Jan 2025 09:37:27 +0100
Subject: [PATCH 1/5] Add AIMv2

---
 README.md                                     |   3 +-
 datadreamer/dataset_annotation/__init__.py    |   2 +
 .../dataset_annotation/aimv2_annotator.py     | 158 ++++++++++++++++++
 .../dataset_annotation/clip_annotator.py      |   2 +-
 .../generate_dataset_from_scratch.py          |   5 +-
 .../generate_dataset_and_train_yolo.ipynb     |   2 +-
 ..._segmentation_dataset_and_train_yolo.ipynb |   2 +-
 tests/core_tests/unittests/test_annotators.py |  30 ++++
 8 files changed, 198 insertions(+), 6 deletions(-)
 create mode 100644 datadreamer/dataset_annotation/aimv2_annotator.py

diff --git a/README.md b/README.md
index 45ef010..97de01a 100644
--- a/README.md
+++ b/README.md
@@ -181,7 +181,7 @@ datadreamer --config
 - `--num_objects_range`: Range of objects in a prompt. Default is 1 to 3.
 - `--prompt_generator`: Choose between `simple`, `lm` (Mistral-7B), `tiny` (tiny LM), and `qwen2` (Qwen2.5 LM). Default is `qwen2`.
 - `--image_generator`: Choose image generator, e.g., `sdxl`, `sdxl-turbo` or `sdxl-lightning`. Default is `sdxl-turbo`.
-- `--image_annotator`: Specify the image annotator, like `owlv2` for object detection or `clip` for image classification or `owlv2-slimsam` for instance segmentation. Default is `owlv2`.
+- `--image_annotator`: Specify the image annotator, like `owlv2` for object detection, `aimv2` or `clip` for image classification, or `owlv2-slimsam` for instance segmentation. Default is `owlv2`.
 - `--conf_threshold`: Confidence threshold for annotation. Default is `0.15`.
 - `--annotation_iou_threshold`: Intersection over Union (IoU) threshold for annotation. Default is `0.2`.
 - `--prompt_prefix`: Prefix to add to every image generation prompt. Default is `""`.
@@ -218,6 +218,7 @@ datadreamer --config
 | | [SDXL-Lightning](https://huggingface.co/ByteDance/SDXL-Lightning) | Fast and accurate (1024x1024 images) |
 | Image Annotation | [OWLv2](https://huggingface.co/google/owlv2-base-patch16-ensemble) | Open-Vocabulary object detector |
 | | [CLIP](https://huggingface.co/openai/clip-vit-base-patch32) | Zero-shot-image-classification |
+| | [AIMv2](https://huggingface.co/apple/aimv2-large-patch14-224-lit) | Zero-shot-image-classification |
 | | [SlimSAM](https://huggingface.co/Zigeng/SlimSAM-uniform-50) | Zero-shot-instance-segmentation |
diff --git a/datadreamer/dataset_annotation/__init__.py b/datadreamer/dataset_annotation/__init__.py
index cfdf51a..82bd7ba 100644
--- a/datadreamer/dataset_annotation/__init__.py
+++ b/datadreamer/dataset_annotation/__init__.py
@@ -1,11 +1,13 @@
 from __future__ import annotations
 
+from .aimv2_annotator import AIMv2Annotator
 from .clip_annotator import CLIPAnnotator
 from .image_annotator import BaseAnnotator, TaskList
 from .owlv2_annotator import OWLv2Annotator
 from .slimsam_annotator import SlimSAMAnnotator
 
 __all__ = [
+    "AIMv2Annotator",
     "BaseAnnotator",
     "TaskList",
     "OWLv2Annotator",
diff --git a/datadreamer/dataset_annotation/aimv2_annotator.py b/datadreamer/dataset_annotation/aimv2_annotator.py
new file mode 100644
index 0000000..6d4adc2
--- /dev/null
+++ b/datadreamer/dataset_annotation/aimv2_annotator.py
@@ -0,0 +1,158 @@
+from __future__ import annotations
+
+import logging
+from typing import Dict, List
+
+import numpy as np
+import PIL
+import torch
+from PIL import Image
+from transformers import AutoModel, AutoProcessor
+
+from datadreamer.dataset_annotation.image_annotator import BaseAnnotator, TaskList
+
+logger = logging.getLogger(__name__)
+
+
+class AIMv2Annotator(BaseAnnotator):
+    """A class for image annotation using the AIMv2 model, specializing in image
+    classification.
+
+    Attributes:
+        model (AutoModel): The AIMv2 model for image-text similarity evaluation.
+        processor (AutoProcessor): The processor for preparing inputs to the AIMv2 model.
+        device (str): The device on which the model will run ('cuda' for GPU, 'cpu' for CPU).
+        size (str): The size of the AIMv2 model to use ('base' or 'large').
+
+    Methods:
+        _init_processor(): Initializes the AIMv2 processor.
+        _init_model(): Initializes the AIMv2 model.
+        annotate_batch(images, objects, conf_threshold, synonym_dict): Annotates the given images with classification labels.
+        release(empty_cuda_cache): Releases resources and optionally empties the CUDA cache.
+    """
+
+    def __init__(
+        self,
+        seed: float = 42,
+        device: str = "cuda",
+        size: str = "base",
+    ) -> None:
+        """Initializes the AIMv2Annotator with a specific seed and device.
+
+        Args:
+            seed (float): Seed for reproducibility. Defaults to 42.
+            device (str): The device to run the model on. Defaults to 'cuda'.
+        """
+        super().__init__(seed, task_definition=TaskList.CLASSIFICATION)
+        self.size = size
+        self.model = self._init_model()
+        self.processor = self._init_processor()
+        self.device = device
+        self.model.to(self.device)
+
+    def _init_processor(self) -> AutoProcessor:
+        """Initializes the AIMv2 processor.
+
+        Returns:
+            AutoProcessor: The initialized AIMv2 processor.
+        """
+        return AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")
+
+    def _init_model(self) -> AutoModel:
+        """Initializes the AIMv2 model.
+
+        Returns:
+            AutoModel: The initialized AIMv2 model.
+        """
+        logger.info(f"Initializing AIMv2 {self.size} model...")
+        return AutoModel.from_pretrained(
+            "apple/aimv2-large-patch14-224-lit", trust_remote_code=True
+        )
+
+    def annotate_batch(
+        self,
+        images: List[PIL.Image.Image],
+        objects: List[str],
+        conf_threshold: float = 0.1,
+        synonym_dict: Dict[str, List[str]] | None = None,
+    ) -> List[np.ndarray]:
+        """Annotates images using the AIMv2 model.
+
+        Args:
+            images: The images to be annotated.
+            objects: A list of objects (text) to test against the images.
+            conf_threshold (float, optional): Confidence threshold for the annotations. Defaults to 0.1.
+            synonym_dict (dict, optional): Dictionary for handling synonyms in labels. Defaults to None.
+
+        Returns:
+            List[np.ndarray]: A list of the annotations for each image.
+        """
+        if synonym_dict is not None:
+            objs_syn = set()
+            for obj in objects:
+                objs_syn.add(obj)
+                for syn in synonym_dict[obj]:
+                    objs_syn.add(syn)
+            objs_syn = list(objs_syn)
+            # Make a dict to transform synonym ids to original ids
+            synonym_dict_rev = {}
+            for key, value in synonym_dict.items():
+                if key in objects:
+                    synonym_dict_rev[objs_syn.index(key)] = objects.index(key)
+                    for v in value:
+                        synonym_dict_rev[objs_syn.index(v)] = objects.index(key)
+            objects = objs_syn
+
+        inputs = self.processor(
+            text=objects, images=images, return_tensors="pt", padding=True
+        ).to(self.device)
+
+        outputs = self.model(**inputs)
+
+        logits_per_image = outputs.logits_per_image  # image-text similarity score
+        probs = logits_per_image.softmax(dim=1).cpu()  # label probabilities
+
+        labels = []
+        # Get the labels for each image
+        if synonym_dict is not None:
+            for prob in probs:
+                labels.append(
+                    np.unique(
+                        np.array(
+                            [
+                                synonym_dict_rev[label.item()]
+                                for label in torch.where(prob > conf_threshold)[
+                                    0
+                                ].numpy()
+                            ]
+                        )
+                    )
+                )
+        else:
+            for prob in probs:
+                labels.append(torch.where(prob > conf_threshold)[0].numpy())
+
+        return labels
+
+    def release(self, empty_cuda_cache: bool = False) -> None:
+        """Releases the model and optionally empties the CUDA cache.
+
+        Args:
+            empty_cuda_cache (bool, optional): Whether to empty the CUDA cache. Defaults to False.
+        """
+        self.model = self.model.to("cpu")
+        if empty_cuda_cache:
+            with torch.no_grad():
+                torch.cuda.empty_cache()
+
+
+if __name__ == "__main__":
+    import requests
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    url = "https://ultralytics.com/images/bus.jpg"
+    im = Image.open(requests.get(url, stream=True).raw)
+    annotator = AIMv2Annotator(device=device)
+    labels = annotator.annotate_batch([im], ["bus", "people"])
+    print(labels)
+    annotator.release()
diff --git a/datadreamer/dataset_annotation/clip_annotator.py b/datadreamer/dataset_annotation/clip_annotator.py
index a39d1c6..28bde63 100644
--- a/datadreamer/dataset_annotation/clip_annotator.py
+++ b/datadreamer/dataset_annotation/clip_annotator.py
@@ -78,7 +78,7 @@ def annotate_batch(
         conf_threshold: float = 0.1,
         synonym_dict: Dict[str, List[str]] | None = None,
     ) -> List[np.ndarray]:
-        """Annotates images using the OWLv2 model.
+        """Annotates images using the CLIP model.
 
         Args:
             images: The images to be annotated.
diff --git a/datadreamer/pipelines/generate_dataset_from_scratch.py b/datadreamer/pipelines/generate_dataset_from_scratch.py
index 4d52acb..353fd35 100644
--- a/datadreamer/pipelines/generate_dataset_from_scratch.py
+++ b/datadreamer/pipelines/generate_dataset_from_scratch.py
@@ -17,6 +17,7 @@
 from tqdm import tqdm
 
 from datadreamer.dataset_annotation import (
+    AIMv2Annotator,
     CLIPAnnotator,
     OWLv2Annotator,
     SlimSAMAnnotator,
@@ -57,7 +58,7 @@
 }
 
 det_annotators = {"owlv2": OWLv2Annotator}
-clf_annotators = {"clip": CLIPAnnotator}
+clf_annotators = {"clip": CLIPAnnotator, "aimv2": AIMv2Annotator}
 inst_seg_annotators = {"owlv2-slimsam": SlimSAMAnnotator}
 inst_seg_detectors = {"owlv2-slimsam": OWLv2Annotator}
@@ -122,7 +123,7 @@ def parse_args():
     parser.add_argument(
         "--image_annotator",
         type=str,
-        choices=["owlv2", "clip", "owlv2-slimsam"],
+        choices=["owlv2", "clip", "owlv2-slimsam", "aimv2"],
         help="Image annotator to use",
     )
diff --git a/examples/generate_dataset_and_train_yolo.ipynb b/examples/generate_dataset_and_train_yolo.ipynb
index 988942d..c530eb7 100644
--- a/examples/generate_dataset_and_train_yolo.ipynb
+++ b/examples/generate_dataset_and_train_yolo.ipynb
@@ -85,7 +85,7 @@
     "- `--num_objects_range`: Range of objects in a prompt. Default is 1 to 3.\n",
     "- `--prompt_generator`: Choose between `simple`, `lm` (Mistral-7B), `tiny` (tiny LM), and `qwen2` (Qwen2.5 LM). Default is `qwen2`.\n",
     "- `--image_generator`: Choose image generator, e.g., `sdxl`, `sdxl-turbo` or `sdxl-lightning`. Default is `sdxl-turbo`.\n",
-    "- `--image_annotator`: Specify the image annotator, like `owlv2` for object detection or `clip` for image classification or `owlv2-slimsam` for instance segmentation. Default is `owlv2`.\n",
+    "- `--image_annotator`: Specify the image annotator, like `owlv2` for object detection, `aimv2` or `clip` for image classification, or `owlv2-slimsam` for instance segmentation. Default is `owlv2`.\n",
     "- `--conf_threshold`: Confidence threshold for annotation. Default is `0.15`.\n",
     "- `--annotation_iou_threshold`: Intersection over Union (IoU) threshold for annotation. Default is `0.2`.\n",
     "- `--prompt_prefix`: Prefix to add to every image generation prompt. Default is `\"\"`.\n",
diff --git a/examples/generate_instance_segmentation_dataset_and_train_yolo.ipynb b/examples/generate_instance_segmentation_dataset_and_train_yolo.ipynb
index 1588001..a70e737 100644
--- a/examples/generate_instance_segmentation_dataset_and_train_yolo.ipynb
+++ b/examples/generate_instance_segmentation_dataset_and_train_yolo.ipynb
@@ -99,7 +99,7 @@
     "- `--num_objects_range`: Range of objects in a prompt. Default is 1 to 3.\n",
     "- `--prompt_generator`: Choose between `simple`, `lm` (Mistral-7B), `tiny` (tiny LM), and `qwen2` (Qwen2.5 LM). Default is `qwen2`.\n",
     "- `--image_generator`: Choose image generator, e.g., `sdxl`, `sdxl-turbo` or `sdxl-lightning`. Default is `sdxl-turbo`.\n",
-    "- `--image_annotator`: Specify the image annotator, like `owlv2` for object detection or `clip` for image classification or `owlv2-slimsam` for instance segmentation. Default is `owlv2`.\n",
+    "- `--image_annotator`: Specify the image annotator, like `owlv2` for object detection, `aimv2` or `clip` for image classification, or `owlv2-slimsam` for instance segmentation. Default is `owlv2`.\n",
     "- `--conf_threshold`: Confidence threshold for annotation. Default is `0.15`.\n",
     "- `--annotation_iou_threshold`: Intersection over Union (IoU) threshold for annotation. Default is `0.2`.\n",
     "- `--prompt_prefix`: Prefix to add to every image generation prompt. Default is `\"\"`.\n",
diff --git a/tests/core_tests/unittests/test_annotators.py b/tests/core_tests/unittests/test_annotators.py
index 4e78df2..eb5c986 100644
--- a/tests/core_tests/unittests/test_annotators.py
+++ b/tests/core_tests/unittests/test_annotators.py
@@ -7,6 +7,7 @@
 import torch
 from PIL import Image
 
+from datadreamer.dataset_annotation.aimv2_annotator import AIMv2Annotator
 from datadreamer.dataset_annotation.clip_annotator import CLIPAnnotator
 from datadreamer.dataset_annotation.owlv2_annotator import OWLv2Annotator
 from datadreamer.dataset_annotation.slimsam_annotator import SlimSAMAnnotator
@@ -56,6 +57,35 @@ def test_cpu_owlv2_annotator():
     _check_owlv2_annotator("cpu")
 
 
+def _check_aimv2_annotator(device: str):
+    url = "https://ultralytics.com/images/bus.jpg"
+    im = Image.open(requests.get(url, stream=True).raw)
+    annotator = AIMv2Annotator(device=device)
+    labels = annotator.annotate_batch([im], ["bus", "people"])
+    # Check that the labels are lists
+    assert isinstance(labels, list) and len(labels) == 1
+    # Check that the labels are ndarray of integers
+    assert isinstance(labels[0], np.ndarray) and labels[0].dtype == np.int64
+
+    annotator.release(empty_cuda_cache=True if device != "cpu" else False)
+
+
+@pytest.mark.skipif(
+    not torch.cuda.is_available() or total_disk_space < 16,
+    reason="Test requires GPU and 16GB of HDD",
+)
+def test_cuda_aimv2_annotator():
+    _check_aimv2_annotator("cuda")
+
+
+@pytest.mark.skipif(
+    total_disk_space < 16,
+    reason="Test requires at least 16GB of HDD",
+)
+def test_cpu_aimv2_annotator():
+    _check_aimv2_annotator("cpu")
+
+
 def _check_clip_annotator(device: str, size: str = "base"):
     url = "https://ultralytics.com/images/bus.jpg"
     im = Image.open(requests.get(url, stream=True).raw)

From d418b8379821f128b43548a56b495318f6b5e7c7 Mon Sep 17 00:00:00 2001
From: HonzaCuhel
Date: Fri, 17 Jan 2025 11:24:59 +0100
Subject: [PATCH 2/5] Add AIMv2 to the config

---
 datadreamer/utils/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datadreamer/utils/config.py b/datadreamer/utils/config.py
index 6227b61..ca5d926 100644
--- a/datadreamer/utils/config.py
+++ b/datadreamer/utils/config.py
@@ -39,7 +39,7 @@ class Config(LuxonisConfig):
     # Profanity filter arguments
     disable_lm_filter: bool = False
     # Annotation arguments
-    image_annotator: Literal["owlv2", "clip", "owlv2-slimsam"] = "owlv2"
+    image_annotator: Literal["owlv2", "aimv2", "clip", "owlv2-slimsam"] = "owlv2"
     conf_threshold: float = 0.15
     annotation_iou_threshold: float = 0.2
     use_tta: bool = False

From 5833c627f33c28945032c334c06a62a5da5e840f Mon Sep 17 00:00:00 2001
From: HonzaCuhel
Date: Fri, 17 Jan 2025 12:29:42 +0100
Subject: [PATCH 3/5] Apple License mention

---
 datadreamer/dataset_annotation/aimv2_annotator.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/datadreamer/dataset_annotation/aimv2_annotator.py b/datadreamer/dataset_annotation/aimv2_annotator.py
index 6d4adc2..71f8a3f 100644
--- a/datadreamer/dataset_annotation/aimv2_annotator.py
+++ b/datadreamer/dataset_annotation/aimv2_annotator.py
@@ -1,3 +1,11 @@
+"""This file uses a pre-trained model derived from Apple's software, provided under the
+Apple Sample Code License. The license is available at:
+
+https://developer.apple.com/support/downloads/terms/apple-sample-code/Apple-Sample-Code-License.pdf
+
+In addition, this file and other parts of the repository are licensed under the Apache 2.0
+License. By using this file, you agree to comply with the terms of both licenses.
+"""
 from __future__ import annotations

From 9e4990b3821f128b43548a56b495318f6b5e7c7 Mon Sep 17 00:00:00 2001
From: HonzaCuhel
Date: Fri, 17 Jan 2025 18:39:59 +0100
Subject: [PATCH 4/5] Add Img Cls Annotator

---
 datadreamer/dataset_annotation/__init__.py |   2 +
 .../dataset_annotation/aimv2_annotator.py  | 102 +-------------
 .../dataset_annotation/clip_annotator.py   | 102 +-------------
 .../dataset_annotation/cls_annotator.py    | 130 ++++++++++++++++++
 4 files changed, 136 insertions(+), 200 deletions(-)
 create mode 100644 datadreamer/dataset_annotation/cls_annotator.py

diff --git a/datadreamer/dataset_annotation/__init__.py b/datadreamer/dataset_annotation/__init__.py
index 82bd7ba..3fe9f5d 100644
--- a/datadreamer/dataset_annotation/__init__.py
+++ b/datadreamer/dataset_annotation/__init__.py
@@ -2,6 +2,7 @@
 
 from .aimv2_annotator import AIMv2Annotator
 from .clip_annotator import CLIPAnnotator
+from .cls_annotator import ImgClassificationAnnotator
 from .image_annotator import BaseAnnotator, TaskList
 from .owlv2_annotator import OWLv2Annotator
 from .slimsam_annotator import SlimSAMAnnotator
@@ -11,6 +12,7 @@
     "BaseAnnotator",
     "TaskList",
     "OWLv2Annotator",
+    "ImgClassificationAnnotator",
     "CLIPAnnotator",
     "SlimSAMAnnotator",
 ]
diff --git a/datadreamer/dataset_annotation/aimv2_annotator.py b/datadreamer/dataset_annotation/aimv2_annotator.py
index 71f8a3f..f213af0 100644
--- a/datadreamer/dataset_annotation/aimv2_annotator.py
+++ b/datadreamer/dataset_annotation/aimv2_annotator.py
@@ -9,20 +9,17 @@
 from __future__ import annotations
 
 import logging
-from typing import Dict, List
 
-import numpy as np
-import PIL
 import torch
 from PIL import Image
 from transformers import AutoModel, AutoProcessor
 
-from datadreamer.dataset_annotation.image_annotator import BaseAnnotator, TaskList
+from datadreamer.dataset_annotation.cls_annotator import ImgClassificationAnnotator
 
 logger = logging.getLogger(__name__)
 
 
-class AIMv2Annotator(BaseAnnotator):
+class AIMv2Annotator(ImgClassificationAnnotator):
     """A class for image annotation using the AIMv2 model, specializing in image
     classification.
 
@@ -39,25 +36,6 @@ class AIMv2Annotator(BaseAnnotator):
         release(empty_cuda_cache): Releases resources and optionally empties the CUDA cache.
     """
 
-    def __init__(
-        self,
-        seed: float = 42,
-        device: str = "cuda",
-        size: str = "base",
-    ) -> None:
-        """Initializes the AIMv2Annotator with a specific seed and device.
-
-        Args:
-            seed (float): Seed for reproducibility. Defaults to 42.
-            device (str): The device to run the model on. Defaults to 'cuda'.
-        """
-        super().__init__(seed, task_definition=TaskList.CLASSIFICATION)
-        self.size = size
-        self.model = self._init_model()
-        self.processor = self._init_processor()
-        self.device = device
-        self.model.to(self.device)
-
     def _init_processor(self) -> AutoProcessor:
         """Initializes the AIMv2 processor.
 
@@ -77,82 +55,6 @@ def _init_model(self) -> AutoModel:
             "apple/aimv2-large-patch14-224-lit", trust_remote_code=True
         )
 
-    def annotate_batch(
-        self,
-        images: List[PIL.Image.Image],
-        objects: List[str],
-        conf_threshold: float = 0.1,
-        synonym_dict: Dict[str, List[str]] | None = None,
-    ) -> List[np.ndarray]:
-        """Annotates images using the AIMv2 model.
-
-        Args:
-            images: The images to be annotated.
-            objects: A list of objects (text) to test against the images.
-            conf_threshold (float, optional): Confidence threshold for the annotations. Defaults to 0.1.
-            synonym_dict (dict, optional): Dictionary for handling synonyms in labels. Defaults to None.
-
-        Returns:
-            List[np.ndarray]: A list of the annotations for each image.
-        """
-        if synonym_dict is not None:
-            objs_syn = set()
-            for obj in objects:
-                objs_syn.add(obj)
-                for syn in synonym_dict[obj]:
-                    objs_syn.add(syn)
-            objs_syn = list(objs_syn)
-            # Make a dict to transform synonym ids to original ids
-            synonym_dict_rev = {}
-            for key, value in synonym_dict.items():
-                if key in objects:
-                    synonym_dict_rev[objs_syn.index(key)] = objects.index(key)
-                    for v in value:
-                        synonym_dict_rev[objs_syn.index(v)] = objects.index(key)
-            objects = objs_syn
-
-        inputs = self.processor(
-            text=objects, images=images, return_tensors="pt", padding=True
-        ).to(self.device)
-
-        outputs = self.model(**inputs)
-
-        logits_per_image = outputs.logits_per_image  # image-text similarity score
-        probs = logits_per_image.softmax(dim=1).cpu()  # label probabilities
-
-        labels = []
-        # Get the labels for each image
-        if synonym_dict is not None:
-            for prob in probs:
-                labels.append(
-                    np.unique(
-                        np.array(
-                            [
-                                synonym_dict_rev[label.item()]
-                                for label in torch.where(prob > conf_threshold)[
-                                    0
-                                ].numpy()
-                            ]
-                        )
-                    )
-                )
-        else:
-            for prob in probs:
-                labels.append(torch.where(prob > conf_threshold)[0].numpy())
-
-        return labels
-
-    def release(self, empty_cuda_cache: bool = False) -> None:
-        """Releases the model and optionally empties the CUDA cache.
-
-        Args:
-            empty_cuda_cache (bool, optional): Whether to empty the CUDA cache. Defaults to False.
-        """
-        self.model = self.model.to("cpu")
-        if empty_cuda_cache:
-            with torch.no_grad():
-                torch.cuda.empty_cache()
-
 
 if __name__ == "__main__":
     import requests
diff --git a/datadreamer/dataset_annotation/clip_annotator.py b/datadreamer/dataset_annotation/clip_annotator.py
index 28bde63..76787bd 100644
--- a/datadreamer/dataset_annotation/clip_annotator.py
+++ b/datadreamer/dataset_annotation/clip_annotator.py
@@ -1,20 +1,17 @@
 from __future__ import annotations
 
 import logging
-from typing import Dict, List
 
-import numpy as np
-import PIL
 import torch
 from PIL import Image
 from transformers import CLIPModel, CLIPProcessor
 
-from datadreamer.dataset_annotation.image_annotator import BaseAnnotator, TaskList
+from datadreamer.dataset_annotation.cls_annotator import ImgClassificationAnnotator
 
 logger = logging.getLogger(__name__)
 
 
-class CLIPAnnotator(BaseAnnotator):
+class CLIPAnnotator(ImgClassificationAnnotator):
     """A class for image annotation using the CLIP model, specializing in image
     classification.
 
@@ -31,25 +28,6 @@ class CLIPAnnotator(BaseAnnotator):
         release(empty_cuda_cache): Releases resources and optionally empties the CUDA cache.
     """
 
-    def __init__(
-        self,
-        seed: float = 42,
-        device: str = "cuda",
-        size: str = "base",
-    ) -> None:
-        """Initializes the CLIPAnnotator with a specific seed and device.
-
-        Args:
-            seed (float): Seed for reproducibility. Defaults to 42.
-            device (str): The device to run the model on. Defaults to 'cuda'.
-        """
-        super().__init__(seed, task_definition=TaskList.CLASSIFICATION)
-        self.size = size
-        self.model = self._init_model()
-        self.processor = self._init_processor()
-        self.device = device
-        self.model.to(self.device)
-
     def _init_processor(self) -> CLIPProcessor:
         """Initializes the CLIP processor.
@@ -71,82 +49,6 @@ def _init_model(self) -> CLIPModel:
             return CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
         return CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
 
-    def annotate_batch(
-        self,
-        images: List[PIL.Image.Image],
-        objects: List[str],
-        conf_threshold: float = 0.1,
-        synonym_dict: Dict[str, List[str]] | None = None,
-    ) -> List[np.ndarray]:
-        """Annotates images using the CLIP model.
-
-        Args:
-            images: The images to be annotated.
-            objects: A list of objects (text) to test against the images.
-            conf_threshold (float, optional): Confidence threshold for the annotations. Defaults to 0.1.
-            synonym_dict (dict, optional): Dictionary for handling synonyms in labels. Defaults to None.
-
-        Returns:
-            List[np.ndarray]: A list of the annotations for each image.
-        """
-        if synonym_dict is not None:
-            objs_syn = set()
-            for obj in objects:
-                objs_syn.add(obj)
-                for syn in synonym_dict[obj]:
-                    objs_syn.add(syn)
-            objs_syn = list(objs_syn)
-            # Make a dict to transform synonym ids to original ids
-            synonym_dict_rev = {}
-            for key, value in synonym_dict.items():
-                if key in objects:
-                    synonym_dict_rev[objs_syn.index(key)] = objects.index(key)
-                    for v in value:
-                        synonym_dict_rev[objs_syn.index(v)] = objects.index(key)
-            objects = objs_syn
-
-        inputs = self.processor(
-            text=objects, images=images, return_tensors="pt", padding=True
-        ).to(self.device)
-
-        outputs = self.model(**inputs)
-
-        logits_per_image = outputs.logits_per_image  # image-text similarity score
-        probs = logits_per_image.softmax(dim=1).cpu()  # label probabilities
-
-        labels = []
-        # Get the labels for each image
-        if synonym_dict is not None:
-            for prob in probs:
-                labels.append(
-                    np.unique(
-                        np.array(
-                            [
-                                synonym_dict_rev[label.item()]
-                                for label in torch.where(prob > conf_threshold)[
-                                    0
-                                ].numpy()
-                            ]
-                        )
-                    )
-                )
-        else:
-            for prob in probs:
-                labels.append(torch.where(prob > conf_threshold)[0].numpy())
-
-        return labels
-
-    def release(self, empty_cuda_cache: bool = False) -> None:
-        """Releases the model and optionally empties the CUDA cache.
-
-        Args:
-            empty_cuda_cache (bool, optional): Whether to empty the CUDA cache. Defaults to False.
-        """
-        self.model = self.model.to("cpu")
-        if empty_cuda_cache:
-            with torch.no_grad():
-                torch.cuda.empty_cache()
-
 
 if __name__ == "__main__":
     import requests
diff --git a/datadreamer/dataset_annotation/cls_annotator.py b/datadreamer/dataset_annotation/cls_annotator.py
new file mode 100644
index 0000000..39b665f
--- /dev/null
+++ b/datadreamer/dataset_annotation/cls_annotator.py
@@ -0,0 +1,130 @@
+from __future__ import annotations
+
+import logging
+from typing import Dict, List
+
+import numpy as np
+import PIL
+import torch
+
+from datadreamer.dataset_annotation.image_annotator import BaseAnnotator, TaskList
+
+logger = logging.getLogger(__name__)
+
+
+class ImgClassificationAnnotator(BaseAnnotator):
+    """Base class for image classification annotators using transformers models.
+
+    Attributes:
+        model: The model for image-text similarity evaluation.
+        processor: The processor for preparing inputs to the model.
+        device (str): The device on which the model will run ('cuda' for GPU, 'cpu' for CPU).
+        size (str): The size of the model to use ('base' or 'large').
+
+    Methods:
+        _init_processor(): Initializes the processor.
+        _init_model(): Initializes the model.
+        annotate_batch(images, objects, conf_threshold, synonym_dict): Annotates the given images with classification labels.
+        release(empty_cuda_cache): Releases resources and optionally empties the CUDA cache.
+    """
+
+    def __init__(
+        self, seed: float = 42, device: str = "cuda", size: str = "base"
+    ) -> None:
+        """Initializes the image classification annotator.
+
+        Args:
+            seed (float): Seed for reproducibility. Defaults to 42.
+            device (str): The device to run the model on. Defaults to 'cuda'.
+            size (str): The model size to use.
+        """
+        super().__init__(seed, task_definition=TaskList.CLASSIFICATION)
+        self.size = size
+        self.device = device
+        self.model = self._init_model()
+        self.processor = self._init_processor()
+        self.model.to(self.device)
+
+    def _init_processor(self):
+        """Initializes the processor."""
+        raise NotImplementedError
+
+    def _init_model(self):
+        """Initializes the model."""
+        raise NotImplementedError
+
+    def annotate_batch(
+        self,
+        images: List[PIL.Image.Image],
+        objects: List[str],
+        conf_threshold: float = 0.1,
+        synonym_dict: Dict[str, List[str]] | None = None,
+    ) -> List[np.ndarray]:
+        """Annotates images using the classification model.
+
+        Args:
+            images: The images to be annotated.
+            objects: A list of objects (text) to test against the images.
+            conf_threshold (float, optional): Confidence threshold for the annotations. Defaults to 0.1.
+            synonym_dict (dict, optional): Dictionary for handling synonyms in labels. Defaults to None.
+
+        Returns:
+            List[np.ndarray]: A list of the annotations for each image.
+        """
+        if synonym_dict is not None:
+            objs_syn = set()
+            for obj in objects:
+                objs_syn.add(obj)
+                for syn in synonym_dict[obj]:
+                    objs_syn.add(syn)
+            objs_syn = list(objs_syn)
+            # Make a dict to transform synonym ids to original ids
+            synonym_dict_rev = {}
+            for key, value in synonym_dict.items():
+                if key in objects:
+                    synonym_dict_rev[objs_syn.index(key)] = objects.index(key)
+                    for v in value:
+                        synonym_dict_rev[objs_syn.index(v)] = objects.index(key)
+            objects = objs_syn
+
+        inputs = self.processor(
+            text=objects, images=images, return_tensors="pt", padding=True
+        ).to(self.device)
+
+        outputs = self.model(**inputs)
+
+        logits_per_image = outputs.logits_per_image  # image-text similarity score
+        probs = logits_per_image.softmax(dim=1).cpu()  # label probabilities
+
+        labels = []
+        # Get the labels for each image
+        if synonym_dict is not None:
+            for prob in probs:
+                labels.append(
+                    np.unique(
+                        np.array(
+                            [
+                                synonym_dict_rev[label.item()]
+                                for label in torch.where(prob > conf_threshold)[
+                                    0
+                                ].numpy()
+                            ]
+                        )
+                    )
+                )
+        else:
+            for prob in probs:
+                labels.append(torch.where(prob > conf_threshold)[0].numpy())
+
+        return labels
+
+    def release(self, empty_cuda_cache: bool = False) -> None:
+        """Releases the model and optionally empties the CUDA cache.
+
+        Args:
+            empty_cuda_cache (bool, optional): Whether to empty the CUDA cache. Defaults to False.
+        """
+        self.model = self.model.to("cpu")
+        if empty_cuda_cache:
+            with torch.no_grad():
+                torch.cuda.empty_cache()

From aa5c8618f9829984102fc19ae798372d35cb1a6d Mon Sep 17 00:00:00 2001
From: GitHub Actions
Date: Fri, 17 Jan 2025 18:40:05 +0000
Subject: [PATCH 5/5] [Automated] Updated coverage badge

---
 media/coverage_badge.svg | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/media/coverage_badge.svg b/media/coverage_badge.svg
index 6c15cac..179c6a1 100644
--- a/media/coverage_badge.svg
+++ b/media/coverage_badge.svg
@@ -9,13 +9,13 @@
-
+
     coverage
     coverage
-    75%
-    75%
+    63%
+    63%
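Note on the refactor in PATCH 4/5: after it, `AIMv2Annotator` and `CLIPAnnotator` only override `_init_model()` and `_init_processor()`, while batch annotation, synonym handling and `release()` live in the shared `ImgClassificationAnnotator` base class. A minimal sketch of how a further zero-shot classifier could plug into that base class is shown below; the subclass name is hypothetical, and the checkpoint is simply the CLIP one already used in this series, so any image-text model whose outputs expose `logits_per_image` would work the same way.

from transformers import CLIPModel, CLIPProcessor

from datadreamer.dataset_annotation import ImgClassificationAnnotator


class CustomClsAnnotator(ImgClassificationAnnotator):
    """Hypothetical subclass: only the model/processor loaders are defined."""

    def _init_processor(self) -> CLIPProcessor:
        # Reuses the CLIP checkpoint referenced elsewhere in this patch series.
        return CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    def _init_model(self) -> CLIPModel:
        return CLIPModel.from_pretrained("openai/clip-vit-base-patch32")


# annotate_batch() and release() are inherited unchanged, e.g.:
# annotator = CustomClsAnnotator(device="cpu")
# labels = annotator.annotate_batch([image], ["bus", "person"])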