luxonis · HonzaCuhel · Nov 12, 2024 · Nov 12, 2024 · Nov 12, 2024 · Nov 12, 2024
diff --git a/README.md b/README.md
@@ -176,6 +176,7 @@ datadreamer --config <path-to-config>
 - `--lm_quantization`: Quantization to use for Mistral language model. Choose between `none` and `4bit`. Default is `none`.
 - `--annotator_size`: Size of the annotator model to use. Choose between `base` and `large`. Default is `base`.
 - `--disable_lm_filter`: Use only a bad word list for profanity filtering. Default is `False`.
+- `--keep_unlabeled_images`: Whether to keep images without any annotations. Default is `False`.
 - `--batch_size_prompt`: Batch size for prompt generation. Default is 64.
 - `--batch_size_annotation`: Batch size for annotation. Default is `1`.
 - `--batch_size_image`: Batch size for image generation. Default is `1`.

diff --git a/datadreamer/dataset_annotation/slimsam_annotator.py b/datadreamer/dataset_annotation/slimsam_annotator.py
@@ -56,7 +56,7 @@ def _init_model(self) -> SamModel:
         Returns:
             SamModel: The initialized SAM model.
         """
-        logger.info(f"Initializing `SlimSAM {self.size} model...")
+        logger.info(f"Initializing SlimSAM {self.size} model...")
         if self.size == "large":
             return SamModel.from_pretrained("Zigeng/SlimSAM-uniform-50")
         return SamModel.from_pretrained("Zigeng/SlimSAM-uniform-77")

diff --git a/datadreamer/pipelines/generate_dataset_from_scratch.py b/datadreamer/pipelines/generate_dataset_from_scratch.py
@@ -217,6 +217,13 @@ def parse_args():
         help="Whether to use only bad words in profanity filter",
     )
 
+    parser.add_argument(
+        "--keep_unlabeled_images",
+        default=None,
+        action="store_true",
+        help="Whether to keep images without any annotations",
+    )
+
     parser.add_argument(
         "--batch_size_prompt",
         type=int,
@@ -718,6 +725,7 @@ def read_image_batch(image_batch, batch_num, batch_size):
                 args.split_ratios,
                 copy_files=False,
                 is_instance_segmentation=args.task == "instance-segmentation",
+                keep_unlabeled_images=args.keep_unlabeled_images,
                 seed=args.seed,
             )
         # Convert annotations to COCO format
@@ -728,6 +736,7 @@ def read_image_batch(image_batch, batch_num, batch_size):
                 "coco",
                 args.split_ratios,
                 is_instance_segmentation=args.task == "instance-segmentation",
+                keep_unlabeled_images=args.keep_unlabeled_images,
                 copy_files=False,
                 seed=args.seed,
             )
@@ -742,6 +751,7 @@ def read_image_batch(image_batch, batch_num, batch_size):
             dataset_plugin=args.dataset_plugin,
             dataset_name=args.dataset_name,
             is_instance_segmentation=args.task == "instance-segmentation",
+            keep_unlabeled_images=args.keep_unlabeled_images,
             copy_files=False,
             seed=args.seed,
         )

diff --git a/datadreamer/utils/base_converter.py b/datadreamer/utils/base_converter.py
@@ -14,21 +14,29 @@ def __init__(self, seed=42):
         np.random.seed(seed)
 
     @abstractmethod
-    def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True) -> None:
+    def convert(
+        self,
+        dataset_dir: str,
+        output_dir: str,
+        split_ratios: List[float],
+        keep_unlabeled_images: bool = False,
+        copy_files: bool = True,
+    ) -> None:
         """Converts a dataset into another format.
 
         Args:
             dataset_dir (str): The directory where the source dataset is located.
             output_dir (str): The directory where the processed dataset should be saved.
             split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
+            keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
             copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
 
         No return value.
         """
         pass
 
     @staticmethod
-    def read_annotations(annotation_path) -> Dict:
+    def read_annotations(annotation_path: str) -> Dict:
         """Reads annotations from a JSON file located at the specified path.
 
         Args:
@@ -42,7 +50,9 @@ def read_annotations(annotation_path) -> Dict:
         return data
 
     @staticmethod
-    def make_splits(images, split_ratios, shuffle=True) -> Tuple[List, List, List]:
+    def make_splits(
+        images: List[str], split_ratios: List[float], shuffle: bool = True
+    ) -> Tuple[List, List, List]:
         """Splits the list of images into training, validation, and test sets.
 
         Args:

diff --git a/datadreamer/utils/coco_converter.py b/datadreamer/utils/coco_converter.py
@@ -1,14 +1,18 @@
 from __future__ import annotations
 
 import json
+import logging
 import os
 import shutil
+from typing import Dict, List
 
 import numpy as np
 from PIL import Image
 
 from datadreamer.utils.base_converter import BaseConverter
 
+logger = logging.getLogger(__name__)
+
 
 class COCOConverter(BaseConverter):
     """Class for converting a dataset to COCO format.
@@ -33,23 +37,44 @@ def __init__(self, seed=42, is_instance_segmentation: bool = False):
         super().__init__(seed)
         self.is_instance_segmentation = is_instance_segmentation
 
-    def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True) -> None:
+    def convert(
+        self,
+        dataset_dir: str,
+        output_dir: str,
+        split_ratios: List[float],
+        keep_unlabeled_images: bool = False,
+        copy_files: bool = True,
+    ) -> None:
         """Converts a dataset into a COCO format.
 
         Args:
             dataset_dir (str): The directory where the source dataset is located.
             output_dir (str): The directory where the processed dataset should be saved.
             split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
+            keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
             copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
 
         No return value.
         """
         annotation_path = os.path.join(dataset_dir, "annotations.json")
         data = BaseConverter.read_annotations(annotation_path)
-        self.process_data(data, dataset_dir, output_dir, split_ratios, copy_files)
+        self.process_data(
+            data,
+            dataset_dir,
+            output_dir,
+            split_ratios,
+            keep_unlabeled_images,
+            copy_files,
+        )
 
     def process_data(
-        self, data, image_dir, output_dir, split_ratios, copy_files=True
+        self,
+        data: Dict,
+        image_dir: str,
+        output_dir: str,
+        split_ratios: List[float],
+        keep_unlabeled_images: bool = False,
+        copy_files: bool = True,
     ) -> None:
         """Processes the data by dividing it into training and validation sets, and
         saves the images and labels in COCO format.
@@ -58,14 +83,27 @@ def process_data(
             data (dict): The dictionary containing image annotations.
             image_dir (str): The directory where the source images are located.
             output_dir (str): The base directory where the processed data will be saved.
-            split_ratios (float): The ratio to split the data into training, validation, and test sets.
+            split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
+            keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
             copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
 
         No return value.
         """
         images = list(data.keys())
         images.remove("class_names")
 
+        empty_images = list(filter(lambda x: len(data[x]["labels"]) == 0, images))
+        if keep_unlabeled_images and len(empty_images) > 0:
+            logger.warning(
+                f"{len(empty_images)} images with no annotations will be included in the dataset."
+            )
+        elif not keep_unlabeled_images and len(empty_images) > 0:
+            logger.info(
+                f"{len(empty_images)} images with no annotations will be excluded from the dataset."
+            )
+            for image in empty_images:
+                images.remove(image)
+
         train_images, val_images, test_images = BaseConverter.make_splits(
             images, split_ratios
         )
@@ -147,7 +185,11 @@ def process_data(
             )
 
     def save_labels(
-        self, dataset_output_dir, images_info, annotations, class_names
+        self,
+        dataset_output_dir: str,
+        images_info: List[Dict],
+        annotations: List[Dict],
+        class_names: List[str],
     ) -> None:
         """Saves the labels to a JSON file.
 

diff --git a/datadreamer/utils/config.py b/datadreamer/utils/config.py
@@ -49,3 +49,5 @@ class Config(LuxonisConfig):
     loader_plugin: str = ""
     dataset_name: str = ""
     dataset_id: str = ""
+    # Dataset arguments
+    keep_unlabeled_images: bool = False
diff --git a/datadreamer/utils/convert_dataset.py b/datadreamer/utils/convert_dataset.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import argparse
+from typing import List, Optional
 
 from datadreamer.utils import (
     COCOConverter,
@@ -11,25 +12,28 @@
 
 
 def convert_dataset(
-    input_dir,
-    output_dir,
-    dataset_format,
-    split_ratios,
-    dataset_plugin=None,
-    dataset_name=None,
-    is_instance_segmentation=False,
-    copy_files=True,
-    seed=42,
+    input_dir: str,
+    output_dir: str,
+    dataset_format: str,
+    split_ratios: List[float],
+    dataset_plugin: Optional[str] = None,
+    dataset_name: Optional[str] = None,
+    is_instance_segmentation: bool = False,
+    keep_unlabeled_images: bool = False,
+    copy_files: bool = True,
+    seed: int = 42,
 ) -> None:
     """Converts a dataset from one format to another.
 
     Args:
         input_dir (str): Directory containing the images and annotations.
         output_dir (str): Directory where the processed dataset will be saved.
         dataset_format (str): Format of the dataset. Can be 'yolo', 'coco', 'luxonis-dataset', or 'cls-single'.
-        split_ratios (list): List of ratios for train, val, and test splits.
+        split_ratios (list of float): List of ratios for train, val, and test splits.
         dataset_plugin (str, optional): Plugin for Luxonis dataset. Defaults to None.
         dataset_name (str, optional): Name of the Luxonis dataset. Defaults to None.
+        is_instance_segmentation (bool, optional): Whether the dataset is for instance segmentation. Defaults to False.
+        keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
         copy_files (bool, optional): Whether to copy the files to the output directory. Defaults to True.
         seed (int, optional): Random seed. Defaults to 42.
 
@@ -56,7 +60,9 @@ def convert_dataset(
     else:
         raise ValueError(f"Invalid dataset format: {dataset_format}")
 
-    converter.convert(input_dir, output_dir, split_ratios, copy_files)
+    converter.convert(
+        input_dir, output_dir, split_ratios, keep_unlabeled_images, copy_files
+    )
 
 
 def main():
@@ -95,6 +101,18 @@ def main():
         type=str,
         help="Name of the dataset to create if dataset_plugin is used",
     )
+    parser.add_argument(
+        "--is_instance_segmentation",
+        default=None,
+        action="store_true",
+        help="Whether the dataset is for instance segmentation.",
+    )
+    parser.add_argument(
+        "--keep_unlabeled_images",
+        default=None,
+        action="store_true",
+        help="Whether to keep images without any annotations",
+    )
     parser.add_argument(
         "--copy_files",
         type=bool,
@@ -111,14 +129,16 @@ def main():
     args = parser.parse_args()
 
     convert_dataset(
-        args.input_dir,
-        args.output_dir,
-        args.dataset_format,
-        args.split_ratios,
-        args.dataset_plugin,
-        args.dataset_name,
-        args.copy_files,
-        args.seed,
+        input_dir=args.input_dir,
+        output_dir=args.output_dir,
+        dataset_format=args.dataset_format,
+        split_ratios=args.split_ratios,
+        dataset_plugin=args.dataset_plugin,
+        dataset_name=args.dataset_name,
+        is_instance_segmentation=args.is_instance_segmentation,
+        keep_unlabeled_images=args.keep_unlabeled_images,
+        copy_files=args.copy_files,
+        seed=args.seed,
     )
 
 

diff --git a/datadreamer/utils/luxonis_dataset_converter.py b/datadreamer/utils/luxonis_dataset_converter.py
@@ -38,6 +38,7 @@ def convert(
         dataset_dir: str,
         output_dir: str,
         split_ratios: List[float],
+        keep_unlabeled_images: bool = False,
         copy_files: bool = True,
     ) -> None:
         """Converts a dataset into a LuxonisDataset format.
@@ -46,16 +47,24 @@ def convert(
             dataset_dir (str): The directory where the source dataset is located.
             output_dir (str): The directory where the processed dataset should be saved.
             split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
+            keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
             copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
 
         No return value.
         """
         annotation_path = os.path.join(dataset_dir, "annotations.json")
         data = BaseConverter.read_annotations(annotation_path)
-        self.process_data(data, dataset_dir, output_dir, split_ratios)
+        self.process_data(
+            data, dataset_dir, output_dir, split_ratios, keep_unlabeled_images
+        )
 
     def process_data(
-        self, data: Dict, dataset_dir: str, output_dir: str, split_ratios: List[float]
+        self,
+        data: Dict,
+        dataset_dir: str,
+        output_dir: str,
+        split_ratios: List[float],
+        keep_unlabeled_images: bool = False,
     ) -> None:
         """Processes the data into LuxonisDataset format.
 
@@ -81,7 +90,10 @@ def dataset_generator():
                 width, height = Image.open(image_full_path).size
                 labels = data[image_path]["labels"]
 
-                if len(labels) == 0:
+                if len(labels) == 0 and keep_unlabeled_images:
+                    logger.warning(
+                        f"Image {image_path} has no annotations. Training on empty images with `luxonis-train` will result in an error."
+                    )
                     yield {
                         "file": image_full_path,
                     }
@@ -161,4 +173,13 @@ def dataset_generator():
             dataset = LuxonisDataset(dataset_name)
 
         dataset.add(dataset_generator())
+
+        if not keep_unlabeled_images:
+            n_empty_images = len(
+                list(filter(lambda x: len(data[x]["labels"]) == 0, image_paths))
+            )
+            if n_empty_images > 0:
+                logger.info(
+                    f"Removed {n_empty_images} empty images with no annotations from the dataset."
+                )
         dataset.make_splits(tuple(split_ratios))
diff --git a/datadreamer/utils/single_label_cls_converter.py b/datadreamer/utils/single_label_cls_converter.py
@@ -41,6 +41,7 @@ def convert(
         dataset_dir: str,
         output_dir: str,
         split_ratios: List[float],
+        keep_unlabeled_images: bool = False,
         copy_files: bool = True,
     ) -> None:
         """Converts a dataset into a format suitable for single-label classification.
@@ -49,6 +50,7 @@ def convert(
             dataset_dir (str): The directory where the source dataset is located.
             output_dir (str): The directory where the processed dataset should be saved.
             split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
+            keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
             copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
 
         No return value.