From 34d482fb0893164bd5c3aa543e57ecd5ec9b4382 Mon Sep 17 00:00:00 2001
From: HonzaCuhel
Date: Fri, 17 Jan 2025 09:37:27 +0100
Subject: [PATCH 1/5] Add AIMv2

---
 README.md                                     |   3 +-
 datadreamer/dataset_annotation/__init__.py    |   2 +
 .../dataset_annotation/aimv2_annotator.py     | 158 ++++++++++++++++++
 .../dataset_annotation/clip_annotator.py      |   2 +-
 .../generate_dataset_from_scratch.py          |   5 +-
 .../generate_dataset_and_train_yolo.ipynb     |   2 +-
 ..._segmentation_dataset_and_train_yolo.ipynb |   2 +-
 tests/core_tests/unittests/test_annotators.py |  30 ++++
 8 files changed, 198 insertions(+), 6 deletions(-)
 create mode 100644 datadreamer/dataset_annotation/aimv2_annotator.py

diff --git a/README.md b/README.md
index 45ef010..97de01a 100644
--- a/README.md
+++ b/README.md
@@ -181,7 +181,7 @@ datadreamer --config
 - `--num_objects_range`: Range of objects in a prompt. Default is 1 to 3.
 - `--prompt_generator`: Choose between `simple`, `lm` (Mistral-7B), `tiny` (tiny LM), and `qwen2` (Qwen2.5 LM). Default is `qwen2`.
 - `--image_generator`: Choose image generator, e.g., `sdxl`, `sdxl-turbo` or `sdxl-lightning`. Default is `sdxl-turbo`.
-- `--image_annotator`: Specify the image annotator, like `owlv2` for object detection or `clip` for image classification or `owlv2-slimsam` for instance segmentation. Default is `owlv2`.
+- `--image_annotator`: Specify the image annotator, like `owlv2` for object detection, `aimv2` or `clip` for image classification, or `owlv2-slimsam` for instance segmentation. Default is `owlv2`.
 - `--conf_threshold`: Confidence threshold for annotation. Default is `0.15`.
 - `--annotation_iou_threshold`: Intersection over Union (IoU) threshold for annotation. Default is `0.2`.
 - `--prompt_prefix`: Prefix to add to every image generation prompt. Default is `""`.
@@ -218,6 +218,7 @@ datadreamer --config
 | | [SDXL-Lightning](https://huggingface.co/ByteDance/SDXL-Lightning) | Fast and accurate (1024x1024 images) |
 | Image Annotation | [OWLv2](https://huggingface.co/google/owlv2-base-patch16-ensemble) | Open-Vocabulary object detector |
 | | [CLIP](https://huggingface.co/openai/clip-vit-base-patch32) | Zero-shot-image-classification |
+| | [AIMv2](https://huggingface.co/apple/aimv2-large-patch14-224-lit) | Zero-shot-image-classification |
 | | [SlimSAM](https://huggingface.co/Zigeng/SlimSAM-uniform-50) | Zero-shot-instance-segmentation |
diff --git a/datadreamer/dataset_annotation/__init__.py b/datadreamer/dataset_annotation/__init__.py
index cfdf51a..82bd7ba 100644
--- a/datadreamer/dataset_annotation/__init__.py
+++ b/datadreamer/dataset_annotation/__init__.py
@@ -1,11 +1,13 @@
 from __future__ import annotations
 
+from .aimv2_annotator import AIMv2Annotator
 from .clip_annotator import CLIPAnnotator
 from .image_annotator import BaseAnnotator, TaskList
 from .owlv2_annotator import OWLv2Annotator
 from .slimsam_annotator import SlimSAMAnnotator
 
 __all__ = [
+    "AIMv2Annotator",
     "BaseAnnotator",
     "TaskList",
     "OWLv2Annotator",
diff --git a/datadreamer/dataset_annotation/aimv2_annotator.py b/datadreamer/dataset_annotation/aimv2_annotator.py
new file mode 100644
index 0000000..6d4adc2
--- /dev/null
+++ b/datadreamer/dataset_annotation/aimv2_annotator.py
@@ -0,0 +1,158 @@
+from __future__ import annotations
+
+import logging
+from typing import Dict, List
+
+import numpy as np
+import PIL
+import torch
+from PIL import Image
+from transformers import AutoModel, AutoProcessor
+
+from datadreamer.dataset_annotation.image_annotator import BaseAnnotator, TaskList
+
+logger = logging.getLogger(__name__)
+
+
+class AIMv2Annotator(BaseAnnotator):
+    """A class for image annotation using the AIMv2 model, specializing in image
+    classification.
+
+    Attributes:
+        model (AutoModel): The AIMv2 model for image-text similarity evaluation.
+        processor (AutoProcessor): The processor for preparing inputs to the AIMv2 model.
+        device (str): The device on which the model will run ('cuda' for GPU, 'cpu' for CPU).
+        size (str): The size of the AIMv2 model to use ('base' or 'large').
+
+    Methods:
+        _init_processor(): Initializes the AIMv2 processor.
+        _init_model(): Initializes the AIMv2 model.
+        annotate_batch(images, objects, conf_threshold, synonym_dict): Annotates the given images with classification labels.
+        release(empty_cuda_cache): Releases resources and optionally empties the CUDA cache.
+    """
+
+    def __init__(
+        self,
+        seed: float = 42,
+        device: str = "cuda",
+        size: str = "base",
+    ) -> None:
+        """Initializes the AIMv2Annotator with a specific seed and device.
+
+        Args:
+            seed (float): Seed for reproducibility. Defaults to 42.
+            device (str): The device to run the model on. Defaults to 'cuda'.
+        """
+        super().__init__(seed, task_definition=TaskList.CLASSIFICATION)
+        self.size = size
+        self.model = self._init_model()
+        self.processor = self._init_processor()
+        self.device = device
+        self.model.to(self.device)
+
+    def _init_processor(self) -> AutoProcessor:
+        """Initializes the AIMv2 processor.
+
+        Returns:
+            AutoProcessor: The initialized AIMv2 processor.
+        """
+        return AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")
+
+    def _init_model(self) -> AutoModel:
+        """Initializes the AIMv2 model.
+
+        Returns:
+            AutoModel: The initialized AIMv2 model.
+        """
+        logger.info(f"Initializing AIMv2 {self.size} model...")
+        return AutoModel.from_pretrained(
+            "apple/aimv2-large-patch14-224-lit", trust_remote_code=True
+        )
+
+    def annotate_batch(
+        self,
+        images: List[PIL.Image.Image],
+        objects: List[str],
+        conf_threshold: float = 0.1,
+        synonym_dict: Dict[str, List[str]] | None = None,
+    ) -> List[np.ndarray]:
+        """Annotates images using the AIMv2 model.
+
+        Args:
+            images: The images to be annotated.
+            objects: A list of objects (text) to test against the images.
+            conf_threshold (float, optional): Confidence threshold for the annotations. Defaults to 0.1.
+            synonym_dict (dict, optional): Dictionary for handling synonyms in labels. Defaults to None.
+
+        Returns:
+            List[np.ndarray]: A list of the annotations for each image.
+        """
+        if synonym_dict is not None:
+            objs_syn = set()
+            for obj in objects:
+                objs_syn.add(obj)
+                for syn in synonym_dict[obj]:
+                    objs_syn.add(syn)
+            objs_syn = list(objs_syn)
+            # Make a dict to transform synonym ids to original ids
+            synonym_dict_rev = {}
+            for key, value in synonym_dict.items():
+                if key in objects:
+                    synonym_dict_rev[objs_syn.index(key)] = objects.index(key)
+                    for v in value:
+                        synonym_dict_rev[objs_syn.index(v)] = objects.index(key)
+            objects = objs_syn
+
+        inputs = self.processor(
+            text=objects, images=images, return_tensors="pt", padding=True
+        ).to(self.device)
+
+        outputs = self.model(**inputs)
+
+        logits_per_image = outputs.logits_per_image  # image-text similarity score
+        probs = logits_per_image.softmax(dim=1).cpu()  # label probabilities
+
+        labels = []
+        # Get the labels for each image
+        if synonym_dict is not None:
+            for prob in probs:
+                labels.append(
+                    np.unique(
+                        np.array(
+                            [
+                                synonym_dict_rev[label.item()]
+                                for label in torch.where(prob > conf_threshold)[
+                                    0
+                                ].numpy()
+                            ]
+                        )
+                    )
+                )
+        else:
+            for prob in probs:
+                labels.append(torch.where(prob > conf_threshold)[0].numpy())
+
+        return labels
+
+    def release(self, empty_cuda_cache: bool = False) -> None:
+        """Releases the model and optionally empties the CUDA cache.
+
+        Args:
+            empty_cuda_cache (bool, optional): Whether to empty the CUDA cache. Defaults to False.
+        """
+        self.model = self.model.to("cpu")
+        if empty_cuda_cache:
+            with torch.no_grad():
+                torch.cuda.empty_cache()
+
+
+if __name__ == "__main__":
+    import requests
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    url = "https://ultralytics.com/images/bus.jpg"
+    im = Image.open(requests.get(url, stream=True).raw)
+    annotator = AIMv2Annotator(device=device)
+    labels = annotator.annotate_batch([im], ["bus", "people"])
+    print(labels)
+    annotator.release()
diff --git a/datadreamer/dataset_annotation/clip_annotator.py b/datadreamer/dataset_annotation/clip_annotator.py
index a39d1c6..28bde63 100644
--- a/datadreamer/dataset_annotation/clip_annotator.py
+++ b/datadreamer/dataset_annotation/clip_annotator.py
@@ -78,7 +78,7 @@ def annotate_batch(
         conf_threshold: float = 0.1,
         synonym_dict: Dict[str, List[str]] | None = None,
     ) -> List[np.ndarray]:
-        """Annotates images using the OWLv2 model.
+        """Annotates images using the CLIP model.
 
         Args:
             images: The images to be annotated.
diff --git a/datadreamer/pipelines/generate_dataset_from_scratch.py b/datadreamer/pipelines/generate_dataset_from_scratch.py
index 4d52acb..353fd35 100644
--- a/datadreamer/pipelines/generate_dataset_from_scratch.py
+++ b/datadreamer/pipelines/generate_dataset_from_scratch.py
@@ -17,6 +17,7 @@
 from tqdm import tqdm
 
 from datadreamer.dataset_annotation import (
+    AIMv2Annotator,
     CLIPAnnotator,
     OWLv2Annotator,
     SlimSAMAnnotator,
@@ -57,7 +58,7 @@
 }
 
 det_annotators = {"owlv2": OWLv2Annotator}
-clf_annotators = {"clip": CLIPAnnotator}
+clf_annotators = {"clip": CLIPAnnotator, "aimv2": AIMv2Annotator}
 inst_seg_annotators = {"owlv2-slimsam": SlimSAMAnnotator}
 inst_seg_detectors = {"owlv2-slimsam": OWLv2Annotator}
@@ -122,7 +123,7 @@ def parse_args():
     parser.add_argument(
         "--image_annotator",
         type=str,
-        choices=["owlv2", "clip", "owlv2-slimsam"],
+        choices=["owlv2", "clip", "owlv2-slimsam", "aimv2"],
         help="Image annotator to use",
     )
diff --git a/examples/generate_dataset_and_train_yolo.ipynb b/examples/generate_dataset_and_train_yolo.ipynb
index 988942d..c530eb7 100644
--- a/examples/generate_dataset_and_train_yolo.ipynb
+++ b/examples/generate_dataset_and_train_yolo.ipynb
@@ -85,7 +85,7 @@
     "- `--num_objects_range`: Range of objects in a prompt. Default is 1 to 3.\n",
     "- `--prompt_generator`: Choose between `simple`, `lm` (Mistral-7B), `tiny` (tiny LM), and `qwen2` (Qwen2.5 LM). Default is `qwen2`.\n",
     "- `--image_generator`: Choose image generator, e.g., `sdxl`, `sdxl-turbo` or `sdxl-lightning`. Default is `sdxl-turbo`.\n",
-    "- `--image_annotator`: Specify the image annotator, like `owlv2` for object detection or `clip` for image classification or `owlv2-slimsam` for instance segmentation. Default is `owlv2`.\n",
+    "- `--image_annotator`: Specify the image annotator, like `owlv2` for object detection, `aimv2` or `clip` for image classification, or `owlv2-slimsam` for instance segmentation. Default is `owlv2`.\n",
     "- `--conf_threshold`: Confidence threshold for annotation. Default is `0.15`.\n",
     "- `--annotation_iou_threshold`: Intersection over Union (IoU) threshold for annotation. Default is `0.2`.\n",
     "- `--prompt_prefix`: Prefix to add to every image generation prompt. Default is `\"\"`.\n",
diff --git a/examples/generate_instance_segmentation_dataset_and_train_yolo.ipynb b/examples/generate_instance_segmentation_dataset_and_train_yolo.ipynb
index 1588001..a70e737 100644
--- a/examples/generate_instance_segmentation_dataset_and_train_yolo.ipynb
+++ b/examples/generate_instance_segmentation_dataset_and_train_yolo.ipynb
@@ -99,7 +99,7 @@
     "- `--num_objects_range`: Range of objects in a prompt. Default is 1 to 3.\n",
     "- `--prompt_generator`: Choose between `simple`, `lm` (Mistral-7B), `tiny` (tiny LM), and `qwen2` (Qwen2.5 LM). Default is `qwen2`.\n",
     "- `--image_generator`: Choose image generator, e.g., `sdxl`, `sdxl-turbo` or `sdxl-lightning`. Default is `sdxl-turbo`.\n",
-    "- `--image_annotator`: Specify the image annotator, like `owlv2` for object detection or `clip` for image classification or `owlv2-slimsam` for instance segmentation. Default is `owlv2`.\n",
+    "- `--image_annotator`: Specify the image annotator, like `owlv2` for object detection, `aimv2` or `clip` for image classification, or `owlv2-slimsam` for instance segmentation. Default is `owlv2`.\n",
     "- `--conf_threshold`: Confidence threshold for annotation. Default is `0.15`.\n",
     "- `--annotation_iou_threshold`: Intersection over Union (IoU) threshold for annotation. Default is `0.2`.\n",
     "- `--prompt_prefix`: Prefix to add to every image generation prompt. Default is `\"\"`.\n",
diff --git a/tests/core_tests/unittests/test_annotators.py b/tests/core_tests/unittests/test_annotators.py
index 4e78df2..eb5c986 100644
--- a/tests/core_tests/unittests/test_annotators.py
+++ b/tests/core_tests/unittests/test_annotators.py
@@ -7,6 +7,7 @@
 import torch
 from PIL import Image
 
+from datadreamer.dataset_annotation.aimv2_annotator import AIMv2Annotator
 from datadreamer.dataset_annotation.clip_annotator import CLIPAnnotator
 from datadreamer.dataset_annotation.owlv2_annotator import OWLv2Annotator
 from datadreamer.dataset_annotation.slimsam_annotator import SlimSAMAnnotator
@@ -56,6 +57,35 @@ def test_cpu_owlv2_annotator():
     _check_owlv2_annotator("cpu")
 
 
+def _check_aimv2_annotator(device: str):
+    url = "https://ultralytics.com/images/bus.jpg"
+    im = Image.open(requests.get(url, stream=True).raw)
+    annotator = AIMv2Annotator(device=device)
+    labels = annotator.annotate_batch([im], ["bus", "people"])
+    # Check that the labels are lists
+    assert isinstance(labels, list) and len(labels) == 1
+    # Check that the labels are ndarray of integers
+    assert isinstance(labels[0], np.ndarray) and labels[0].dtype == np.int64
+
+    annotator.release(empty_cuda_cache=True if device != "cpu" else False)
+
+
+@pytest.mark.skipif(
+    not torch.cuda.is_available() or total_disk_space < 16,
+    reason="Test requires GPU and 16GB of HDD",
+)
+def test_cuda_aimv2_annotator():
+    _check_aimv2_annotator("cuda")
+
+
+@pytest.mark.skipif(
+    total_disk_space < 16,
+    reason="Test requires at least 16GB of HDD",
+)
+def test_cpu_aimv2_annotator():
+    _check_aimv2_annotator("cpu")
+
+
 def _check_clip_annotator(device: str, size: str = "base"):
     url = "https://ultralytics.com/images/bus.jpg"
     im = Image.open(requests.get(url, stream=True).raw)

From d418b8379821f128b43548a56b495318f6b5e7c7 Mon Sep 17 00:00:00 2001
From: HonzaCuhel
Date: Fri, 17 Jan 2025 11:24:59 +0100
Subject: [PATCH 2/5] Add AIMv2 to the config

---
 datadreamer/utils/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datadreamer/utils/config.py b/datadreamer/utils/config.py
index 6227b61..ca5d926 100644
--- a/datadreamer/utils/config.py
+++ b/datadreamer/utils/config.py
@@ -39,7 +39,7 @@ class Config(LuxonisConfig):
     # Profanity filter arguments
     disable_lm_filter: bool = False
     # Annotation arguments
-    image_annotator: Literal["owlv2", "clip", "owlv2-slimsam"] = "owlv2"
+    image_annotator: Literal["owlv2", "aimv2", "clip", "owlv2-slimsam"] = "owlv2"
     conf_threshold: float = 0.15
     annotation_iou_threshold: float = 0.2
     use_tta: bool = False

From 5833c627f33c28945032c334c06a62a5da5e840f Mon Sep 17 00:00:00 2001
From: HonzaCuhel
Date: Fri, 17 Jan 2025 12:29:42 +0100
Subject: [PATCH 3/5] Apple License mention

---
 datadreamer/dataset_annotation/aimv2_annotator.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/datadreamer/dataset_annotation/aimv2_annotator.py b/datadreamer/dataset_annotation/aimv2_annotator.py
index 6d4adc2..71f8a3f 100644
--- a/datadreamer/dataset_annotation/aimv2_annotator.py
+++ b/datadreamer/dataset_annotation/aimv2_annotator.py
@@ -1,3 +1,11 @@
+"""This file uses a pre-trained model derived from Apple's software, provided under the
+Apple Sample Code License. The license is available at:
+
+https://developer.apple.com/support/downloads/terms/apple-sample-code/Apple-Sample-Code-License.pdf
+
+In addition, this file and other parts of the repository are licensed under the Apache 2.0
+License. By using this file, you agree to comply with the terms of both licenses.
+"""
 from __future__ import annotations

From 9e4990b3821f128b43548a56b495318f6b5e7c7 Mon Sep 17 00:00:00 2001
From: HonzaCuhel
Date: Fri, 17 Jan 2025 18:39:59 +0100
Subject: [PATCH 4/5] Add Img Cls Annotator

---
 datadreamer/dataset_annotation/__init__.py |   2 +
 .../dataset_annotation/aimv2_annotator.py  | 102 +-------------
 .../dataset_annotation/clip_annotator.py   | 102 +-------------
 .../dataset_annotation/cls_annotator.py    | 130 ++++++++++++++++++
 4 files changed, 136 insertions(+), 200 deletions(-)
 create mode 100644 datadreamer/dataset_annotation/cls_annotator.py

diff --git a/datadreamer/dataset_annotation/__init__.py b/datadreamer/dataset_annotation/__init__.py
index 82bd7ba..3fe9f5d 100644
--- a/datadreamer/dataset_annotation/__init__.py
+++ b/datadreamer/dataset_annotation/__init__.py
@@ -2,6 +2,7 @@
 
 from .aimv2_annotator import AIMv2Annotator
 from .clip_annotator import CLIPAnnotator
+from .cls_annotator import ImgClassificationAnnotator
 from .image_annotator import BaseAnnotator, TaskList
 from .owlv2_annotator import OWLv2Annotator
 from .slimsam_annotator import SlimSAMAnnotator
@@ -11,6 +12,7 @@
     "BaseAnnotator",
     "TaskList",
     "OWLv2Annotator",
+    "ImgClassificationAnnotator",
     "CLIPAnnotator",
     "SlimSAMAnnotator",
 ]
diff --git a/datadreamer/dataset_annotation/aimv2_annotator.py b/datadreamer/dataset_annotation/aimv2_annotator.py
index 71f8a3f..f213af0 100644
--- a/datadreamer/dataset_annotation/aimv2_annotator.py
+++ b/datadreamer/dataset_annotation/aimv2_annotator.py
@@ -9,20 +9,17 @@
 from __future__ import annotations
 
 import logging
-from typing import Dict, List
 
-import numpy as np
-import PIL
 import torch
 from PIL import Image
 from transformers import AutoModel, AutoProcessor
 
-from datadreamer.dataset_annotation.image_annotator import BaseAnnotator, TaskList
+from datadreamer.dataset_annotation.cls_annotator import ImgClassificationAnnotator
 
 logger = logging.getLogger(__name__)
 
 
-class AIMv2Annotator(BaseAnnotator):
+class AIMv2Annotator(ImgClassificationAnnotator):
     """A class for image annotation using the AIMv2 model, specializing in image
     classification.
 
@@ -39,25 +36,6 @@ class AIMv2Annotator(BaseAnnotator):
         release(empty_cuda_cache): Releases resources and optionally empties the CUDA cache.
     """
 
-    def __init__(
-        self,
-        seed: float = 42,
-        device: str = "cuda",
-        size: str = "base",
-    ) -> None:
-        """Initializes the AIMv2Annotator with a specific seed and device.
-
-        Args:
-            seed (float): Seed for reproducibility. Defaults to 42.
-            device (str): The device to run the model on. Defaults to 'cuda'.
-        """
-        super().__init__(seed, task_definition=TaskList.CLASSIFICATION)
-        self.size = size
-        self.model = self._init_model()
-        self.processor = self._init_processor()
-        self.device = device
-        self.model.to(self.device)
-
     def _init_processor(self) -> AutoProcessor:
         """Initializes the AIMv2 processor.
 
@@ -77,82 +55,6 @@ def _init_model(self) -> AutoModel:
             "apple/aimv2-large-patch14-224-lit", trust_remote_code=True
         )
 
-    def annotate_batch(
-        self,
-        images: List[PIL.Image.Image],
-        objects: List[str],
-        conf_threshold: float = 0.1,
-        synonym_dict: Dict[str, List[str]] | None = None,
-    ) -> List[np.ndarray]:
-        """Annotates images using the AIMv2 model.
-
-        Args:
-            images: The images to be annotated.
-            objects: A list of objects (text) to test against the images.
-            conf_threshold (float, optional): Confidence threshold for the annotations. Defaults to 0.1.
-            synonym_dict (dict, optional): Dictionary for handling synonyms in labels. Defaults to None.
-
-        Returns:
-            List[np.ndarray]: A list of the annotations for each image.
-        """
-        if synonym_dict is not None:
-            objs_syn = set()
-            for obj in objects:
-                objs_syn.add(obj)
-                for syn in synonym_dict[obj]:
-                    objs_syn.add(syn)
-            objs_syn = list(objs_syn)
-            # Make a dict to transform synonym ids to original ids
-            synonym_dict_rev = {}
-            for key, value in synonym_dict.items():
-                if key in objects:
-                    synonym_dict_rev[objs_syn.index(key)] = objects.index(key)
-                    for v in value:
-                        synonym_dict_rev[objs_syn.index(v)] = objects.index(key)
-            objects = objs_syn
-
-        inputs = self.processor(
-            text=objects, images=images, return_tensors="pt", padding=True
-        ).to(self.device)
-
-        outputs = self.model(**inputs)
-
-        logits_per_image = outputs.logits_per_image  # image-text similarity score
-        probs = logits_per_image.softmax(dim=1).cpu()  # label probabilities
-
-        labels = []
-        # Get the labels for each image
-        if synonym_dict is not None:
-            for prob in probs:
-                labels.append(
-                    np.unique(
-                        np.array(
-                            [
-                                synonym_dict_rev[label.item()]
-                                for label in torch.where(prob > conf_threshold)[
-                                    0
-                                ].numpy()
-                            ]
-                        )
-                    )
-                )
-        else:
-            for prob in probs:
-                labels.append(torch.where(prob > conf_threshold)[0].numpy())
-
-        return labels
-
-    def release(self, empty_cuda_cache: bool = False) -> None:
-        """Releases the model and optionally empties the CUDA cache.
-
-        Args:
-            empty_cuda_cache (bool, optional): Whether to empty the CUDA cache. Defaults to False.
-        """
-        self.model = self.model.to("cpu")
-        if empty_cuda_cache:
-            with torch.no_grad():
-                torch.cuda.empty_cache()
-
 
 if __name__ == "__main__":
     import requests
diff --git a/datadreamer/dataset_annotation/clip_annotator.py b/datadreamer/dataset_annotation/clip_annotator.py
index 28bde63..76787bd 100644
--- a/datadreamer/dataset_annotation/clip_annotator.py
+++ b/datadreamer/dataset_annotation/clip_annotator.py
@@ -1,20 +1,17 @@
 from __future__ import annotations
 
 import logging
-from typing import Dict, List
 
-import numpy as np
-import PIL
 import torch
 from PIL import Image
 from transformers import CLIPModel, CLIPProcessor
 
-from datadreamer.dataset_annotation.image_annotator import BaseAnnotator, TaskList
+from datadreamer.dataset_annotation.cls_annotator import ImgClassificationAnnotator
 
 logger = logging.getLogger(__name__)
 
 
-class CLIPAnnotator(BaseAnnotator):
+class CLIPAnnotator(ImgClassificationAnnotator):
     """A class for image annotation using the CLIP model, specializing in image
     classification.
 
@@ -31,25 +28,6 @@ class CLIPAnnotator(BaseAnnotator):
         release(empty_cuda_cache): Releases resources and optionally empties the CUDA cache.
     """
 
-    def __init__(
-        self,
-        seed: float = 42,
-        device: str = "cuda",
-        size: str = "base",
-    ) -> None:
-        """Initializes the CLIPAnnotator with a specific seed and device.
-
-        Args:
-            seed (float): Seed for reproducibility. Defaults to 42.
-            device (str): The device to run the model on. Defaults to 'cuda'.
-        """
-        super().__init__(seed, task_definition=TaskList.CLASSIFICATION)
-        self.size = size
-        self.model = self._init_model()
-        self.processor = self._init_processor()
-        self.device = device
-        self.model.to(self.device)
-
     def _init_processor(self) -> CLIPProcessor:
         """Initializes the CLIP processor.
@@ -71,82 +49,6 @@ def _init_model(self) -> CLIPModel:
             return CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
         return CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
 
-    def annotate_batch(
-        self,
-        images: List[PIL.Image.Image],
-        objects: List[str],
-        conf_threshold: float = 0.1,
-        synonym_dict: Dict[str, List[str]] | None = None,
-    ) -> List[np.ndarray]:
-        """Annotates images using the CLIP model.
-
-        Args:
-            images: The images to be annotated.
-            objects: A list of objects (text) to test against the images.
-            conf_threshold (float, optional): Confidence threshold for the annotations. Defaults to 0.1.
-            synonym_dict (dict, optional): Dictionary for handling synonyms in labels. Defaults to None.
-
-        Returns:
-            List[np.ndarray]: A list of the annotations for each image.
-        """
-        if synonym_dict is not None:
-            objs_syn = set()
-            for obj in objects:
-                objs_syn.add(obj)
-                for syn in synonym_dict[obj]:
-                    objs_syn.add(syn)
-            objs_syn = list(objs_syn)
-            # Make a dict to transform synonym ids to original ids
-            synonym_dict_rev = {}
-            for key, value in synonym_dict.items():
-                if key in objects:
-                    synonym_dict_rev[objs_syn.index(key)] = objects.index(key)
-                    for v in value:
-                        synonym_dict_rev[objs_syn.index(v)] = objects.index(key)
-            objects = objs_syn
-
-        inputs = self.processor(
-            text=objects, images=images, return_tensors="pt", padding=True
-        ).to(self.device)
-
-        outputs = self.model(**inputs)
-
-        logits_per_image = outputs.logits_per_image  # image-text similarity score
-        probs = logits_per_image.softmax(dim=1).cpu()  # label probabilities
-
-        labels = []
-        # Get the labels for each image
-        if synonym_dict is not None:
-            for prob in probs:
-                labels.append(
-                    np.unique(
-                        np.array(
-                            [
-                                synonym_dict_rev[label.item()]
-                                for label in torch.where(prob > conf_threshold)[
-                                    0
-                                ].numpy()
-                            ]
-                        )
-                    )
-                )
-        else:
-            for prob in probs:
-                labels.append(torch.where(prob > conf_threshold)[0].numpy())
-
-        return labels
-
-    def release(self, empty_cuda_cache: bool = False) -> None:
-        """Releases the model and optionally empties the CUDA cache.
-
-        Args:
-            empty_cuda_cache (bool, optional): Whether to empty the CUDA cache. Defaults to False.
-        """
-        self.model = self.model.to("cpu")
-        if empty_cuda_cache:
-            with torch.no_grad():
-                torch.cuda.empty_cache()
-
 
 if __name__ == "__main__":
     import requests
diff --git a/datadreamer/dataset_annotation/cls_annotator.py b/datadreamer/dataset_annotation/cls_annotator.py
new file mode 100644
index 0000000..39b665f
--- /dev/null
+++ b/datadreamer/dataset_annotation/cls_annotator.py
@@ -0,0 +1,130 @@
+from __future__ import annotations
+
+import logging
+from typing import Dict, List
+
+import numpy as np
+import PIL
+import torch
+
+from datadreamer.dataset_annotation.image_annotator import BaseAnnotator, TaskList
+
+logger = logging.getLogger(__name__)
+
+
+class ImgClassificationAnnotator(BaseAnnotator):
+    """Base class for image classification annotators using transformers models.
+
+    Attributes:
+        model: The model for image-text similarity evaluation.
+        processor: The processor for preparing inputs to the model.
+        device (str): The device on which the model will run ('cuda' for GPU, 'cpu' for CPU).
+        size (str): The size of the model to use ('base' or 'large').
+
+    Methods:
+        _init_processor(): Initializes the processor.
+        _init_model(): Initializes the model.
+        annotate_batch(images, objects, conf_threshold, synonym_dict): Annotates the given images with classification labels.
+        release(empty_cuda_cache): Releases resources and optionally empties the CUDA cache.
+    """
+
+    def __init__(
+        self, seed: float = 42, device: str = "cuda", size: str = "base"
+    ) -> None:
+        """Initializes the image classification annotator.
+
+        Args:
+            seed (float): Seed for reproducibility. Defaults to 42.
+            device (str): The device to run the model on. Defaults to 'cuda'.
+            size (str): The model size to use.
+        """
+        super().__init__(seed, task_definition=TaskList.CLASSIFICATION)
+        self.size = size
+        self.device = device
+        self.model = self._init_model()
+        self.processor = self._init_processor()
+        self.model.to(self.device)
+
+    def _init_processor(self):
+        """Initializes the processor."""
+        raise NotImplementedError
+
+    def _init_model(self):
+        """Initializes the model."""
+        raise NotImplementedError
+
+    def annotate_batch(
+        self,
+        images: List[PIL.Image.Image],
+        objects: List[str],
+        conf_threshold: float = 0.1,
+        synonym_dict: Dict[str, List[str]] | None = None,
+    ) -> List[np.ndarray]:
+        """Annotates images using the classification model.
+
+        Args:
+            images: The images to be annotated.
+            objects: A list of objects (text) to test against the images.
+            conf_threshold (float, optional): Confidence threshold for the annotations. Defaults to 0.1.
+            synonym_dict (dict, optional): Dictionary for handling synonyms in labels. Defaults to None.
+
+        Returns:
+            List[np.ndarray]: A list of the annotations for each image.
+        """
+        if synonym_dict is not None:
+            objs_syn = set()
+            for obj in objects:
+                objs_syn.add(obj)
+                for syn in synonym_dict[obj]:
+                    objs_syn.add(syn)
+            objs_syn = list(objs_syn)
+            # Make a dict to transform synonym ids to original ids
+            synonym_dict_rev = {}
+            for key, value in synonym_dict.items():
+                if key in objects:
+                    synonym_dict_rev[objs_syn.index(key)] = objects.index(key)
+                    for v in value:
+                        synonym_dict_rev[objs_syn.index(v)] = objects.index(key)
+            objects = objs_syn
+
+        inputs = self.processor(
+            text=objects, images=images, return_tensors="pt", padding=True
+        ).to(self.device)
+
+        outputs = self.model(**inputs)
+
+        logits_per_image = outputs.logits_per_image  # image-text similarity score
+        probs = logits_per_image.softmax(dim=1).cpu()  # label probabilities
+
+        labels = []
+        # Get the labels for each image
+        if synonym_dict is not None:
+            for prob in probs:
+                labels.append(
+                    np.unique(
+                        np.array(
+                            [
+                                synonym_dict_rev[label.item()]
+                                for label in torch.where(prob > conf_threshold)[
+                                    0
+                                ].numpy()
+                            ]
+                        )
+                    )
+                )
+        else:
+            for prob in probs:
+                labels.append(torch.where(prob > conf_threshold)[0].numpy())
+
+        return labels
+
+    def release(self, empty_cuda_cache: bool = False) -> None:
+        """Releases the model and optionally empties the CUDA cache.
+
+        Args:
+            empty_cuda_cache (bool, optional): Whether to empty the CUDA cache. Defaults to False.
+        """
+        self.model = self.model.to("cpu")
+        if empty_cuda_cache:
+            with torch.no_grad():
+                torch.cuda.empty_cache()

From aa5c8618f9829984102fc19ae798372d35cb1a6d Mon Sep 17 00:00:00 2001
From: GitHub Actions
Date: Fri, 17 Jan 2025 18:40:05 +0000
Subject: [PATCH 5/5] [Automated] Updated coverage badge

---
 media/coverage_badge.svg | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/media/coverage_badge.svg b/media/coverage_badge.svg
index 6c15cac..179c6a1 100644
--- a/media/coverage_badge.svg
+++ b/media/coverage_badge.svg
@@ -9,13 +9,13 @@
-
+
     coverage
     coverage
-    75%
-    75%
+    63%
+    63%
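Note on the refactor in PATCH 4/5: after it, `AIMv2Annotator` and `CLIPAnnotator` only override `_init_model()` and `_init_processor()`, while batch annotation, synonym handling and `release()` live in the shared `ImgClassificationAnnotator` base class. A minimal sketch of how a further zero-shot classifier could plug into that base class is shown below; the subclass name is hypothetical, and the checkpoint is simply the CLIP one already used in this series, so any image-text model whose outputs expose `logits_per_image` would work the same way.

from transformers import CLIPModel, CLIPProcessor

from datadreamer.dataset_annotation import ImgClassificationAnnotator


class CustomClsAnnotator(ImgClassificationAnnotator):
    """Hypothetical subclass: only the model/processor loaders are defined."""

    def _init_processor(self) -> CLIPProcessor:
        # Reuses the CLIP checkpoint referenced elsewhere in this patch series.
        return CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    def _init_model(self) -> CLIPModel:
        return CLIPModel.from_pretrained("openai/clip-vit-base-patch32")


# annotate_batch() and release() are inherited unchanged, e.g.:
# annotator = CustomClsAnnotator(device="cpu")
# labels = annotator.annotate_batch([image], ["bus", "person"])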