Skip to content

Commit

Permalink
Add option to keep images with no annotation, by default removing & r…
Browse files Browse the repository at this point in the history
…efactor
  • Loading branch information
HonzaCuhel committed Nov 12, 2024
1 parent e9bde26 commit 13a4cbf
Show file tree
Hide file tree
Showing 13 changed files with 555 additions and 430 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ datadreamer --config <path-to-config>
- `--lm_quantization`: Quantization to use for Mistral language model. Choose between `none` and `4bit`. Default is `none`.
- `--annotator_size`: Size of the annotator model to use. Choose between `base` and `large`. Default is `base`.
- `--disable_lm_filter`: Use only a bad word list for profanity filtering. Default is `False`.
- `--keep_empty_images`: Whether to keep images without any annotations. Default is `False`.
- `--batch_size_prompt`: Batch size for prompt generation. Default is 64.
- `--batch_size_annotation`: Batch size for annotation. Default is `1`.
- `--batch_size_image`: Batch size for image generation. Default is `1`.
Expand Down
2 changes: 1 addition & 1 deletion datadreamer/dataset_annotation/slimsam_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def _init_model(self) -> SamModel:
Returns:
SamModel: The initialized SAM model.
"""
logger.info(f"Initializing `SlimSAM {self.size} model...")
logger.info(f"Initializing SlimSAM {self.size} model...")
if self.size == "large":
return SamModel.from_pretrained("Zigeng/SlimSAM-uniform-50")
return SamModel.from_pretrained("Zigeng/SlimSAM-uniform-77")
Expand Down
10 changes: 10 additions & 0 deletions datadreamer/pipelines/generate_dataset_from_scratch.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,13 @@ def parse_args():
help="Whether to use only bad words in profanity filter",
)

parser.add_argument(
"--keep_empty_images",
default=None,
action="store_true",
help="Whether to keep images without any annotations",
)

parser.add_argument(
"--batch_size_prompt",
type=int,
Expand Down Expand Up @@ -718,6 +725,7 @@ def read_image_batch(image_batch, batch_num, batch_size):
args.split_ratios,
copy_files=False,
is_instance_segmentation=args.task == "instance-segmentation",
keep_empty_images=args.keep_empty_images,
seed=args.seed,
)
# Convert annotations to COCO format
Expand All @@ -728,6 +736,7 @@ def read_image_batch(image_batch, batch_num, batch_size):
"coco",
args.split_ratios,
is_instance_segmentation=args.task == "instance-segmentation",
keep_empty_images=args.keep_empty_images,
copy_files=False,
seed=args.seed,
)
Expand All @@ -742,6 +751,7 @@ def read_image_batch(image_batch, batch_num, batch_size):
dataset_plugin=args.dataset_plugin,
dataset_name=args.dataset_name,
is_instance_segmentation=args.task == "instance-segmentation",
keep_empty_images=args.keep_empty_images,
copy_files=False,
seed=args.seed,
)
Expand Down
16 changes: 13 additions & 3 deletions datadreamer/utils/base_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,21 +14,29 @@ def __init__(self, seed=42):
np.random.seed(seed)

@abstractmethod
def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True) -> None:
def convert(
self,
dataset_dir: str,
output_dir: str,
split_ratios: List[float],
keep_empty_images: bool = False,
copy_files: bool = True,
) -> None:
"""Converts a dataset into another format.
Args:
dataset_dir (str): The directory where the source dataset is located.
output_dir (str): The directory where the processed dataset should be saved.
split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
No return value.
"""
pass

@staticmethod
def read_annotations(annotation_path) -> Dict:
def read_annotations(annotation_path: str) -> Dict:
"""Reads annotations from a JSON file located at the specified path.
Args:
Expand All @@ -42,7 +50,9 @@ def read_annotations(annotation_path) -> Dict:
return data

@staticmethod
def make_splits(images, split_ratios, shuffle=True) -> Tuple[List, List, List]:
def make_splits(
images: List[str], split_ratios: List[float], shuffle: bool = True
) -> Tuple[List, List, List]:
"""Splits the list of images into training, validation, and test sets.
Args:
Expand Down
47 changes: 42 additions & 5 deletions datadreamer/utils/coco_converter.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
from __future__ import annotations

import json
import logging
import os
import shutil
from typing import Dict, List

import numpy as np
from PIL import Image

from datadreamer.utils.base_converter import BaseConverter

logger = logging.getLogger(__name__)


class COCOConverter(BaseConverter):
"""Class for converting a dataset to COCO format.
Expand All @@ -33,23 +37,39 @@ def __init__(self, seed=42, is_instance_segmentation: bool = False):
super().__init__(seed)
self.is_instance_segmentation = is_instance_segmentation

def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True) -> None:
def convert(
self,
dataset_dir: str,
output_dir: str,
split_ratios: List[float],
keep_empty_images: bool = False,
copy_files: bool = True,
) -> None:
"""Converts a dataset into a COCO format.
Args:
dataset_dir (str): The directory where the source dataset is located.
output_dir (str): The directory where the processed dataset should be saved.
split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
No return value.
"""
annotation_path = os.path.join(dataset_dir, "annotations.json")
data = BaseConverter.read_annotations(annotation_path)
self.process_data(data, dataset_dir, output_dir, split_ratios, copy_files)
self.process_data(
data, dataset_dir, output_dir, split_ratios, keep_empty_images, copy_files
)

def process_data(
self, data, image_dir, output_dir, split_ratios, copy_files=True
self,
data: Dict,
image_dir: str,
output_dir: str,
split_ratios: List[float],
keep_empty_images: bool = False,
copy_files: bool = True,
) -> None:
"""Processes the data by dividing it into training and validation sets, and
saves the images and labels in COCO format.
Expand All @@ -58,14 +78,27 @@ def process_data(
data (dict): The dictionary containing image annotations.
image_dir (str): The directory where the source images are located.
output_dir (str): The base directory where the processed data will be saved.
split_ratios (float): The ratio to split the data into training, validation, and test sets.
split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
No return value.
"""
images = list(data.keys())
images.remove("class_names")

empty_images = list(filter(lambda x: len(data[x]["labels"]) == 0, images))
if keep_empty_images and len(empty_images) > 0:
logger.warning(
f"{len(empty_images)} images with no annotations will be included in the dataset."
)
elif not keep_empty_images and len(empty_images) > 0:
logger.info(
f"{len(empty_images)} images with no annotations will be excluded from the dataset."
)
for image in empty_images:
images.remove(image)

train_images, val_images, test_images = BaseConverter.make_splits(
images, split_ratios
)
Expand Down Expand Up @@ -147,7 +180,11 @@ def process_data(
)

def save_labels(
self, dataset_output_dir, images_info, annotations, class_names
self,
dataset_output_dir: str,
images_info: List[Dict],
annotations: List[Dict],
class_names: List[str],
) -> None:
"""Saves the labels to a JSON file.
Expand Down
2 changes: 2 additions & 0 deletions datadreamer/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,5 @@ class Config(LuxonisConfig):
loader_plugin: str = ""
dataset_name: str = ""
dataset_id: str = ""
# Dataset arguments
keep_empty_images: bool = False
58 changes: 39 additions & 19 deletions datadreamer/utils/convert_dataset.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import argparse
from typing import List, Optional

from datadreamer.utils import (
COCOConverter,
Expand All @@ -11,25 +12,28 @@


def convert_dataset(
input_dir,
output_dir,
dataset_format,
split_ratios,
dataset_plugin=None,
dataset_name=None,
is_instance_segmentation=False,
copy_files=True,
seed=42,
input_dir: str,
output_dir: str,
dataset_format: str,
split_ratios: List[float],
dataset_plugin: Optional[str] = None,
dataset_name: Optional[str] = None,
is_instance_segmentation: bool = False,
keep_empty_images: bool = False,
copy_files: bool = True,
seed: int = 42,
) -> None:
"""Converts a dataset from one format to another.
Args:
input_dir (str): Directory containing the images and annotations.
output_dir (str): Directory where the processed dataset will be saved.
dataset_format (str): Format of the dataset. Can be 'yolo', 'coco', 'luxonis-dataset', or 'cls-single'.
split_ratios (list): List of ratios for train, val, and test splits.
split_ratios (list of float): List of ratios for train, val, and test splits.
dataset_plugin (str, optional): Plugin for Luxonis dataset. Defaults to None.
dataset_name (str, optional): Name of the Luxonis dataset. Defaults to None.
is_instance_segmentation (bool, optional): Whether the dataset is for instance segmentation. Defaults to False.
keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
copy_files (bool, optional): Whether to copy the files to the output directory. Defaults to True.
seed (int, optional): Random seed. Defaults to 42.
Expand All @@ -56,7 +60,9 @@ def convert_dataset(
else:
raise ValueError(f"Invalid dataset format: {dataset_format}")

converter.convert(input_dir, output_dir, split_ratios, copy_files)
converter.convert(
input_dir, output_dir, split_ratios, keep_empty_images, copy_files
)


def main():
Expand Down Expand Up @@ -95,6 +101,18 @@ def main():
type=str,
help="Name of the dataset to create if dataset_plugin is used",
)
parser.add_argument(
"--is_instance_segmentation",
default=None,
action="store_true",
help="Whether the dataset is for instance segmentation.",
)
parser.add_argument(
"--keep_empty_images",
default=None,
action="store_true",
help="Whether to keep images without any annotations",
)
parser.add_argument(
"--copy_files",
type=bool,
Expand All @@ -111,14 +129,16 @@ def main():
args = parser.parse_args()

convert_dataset(
args.input_dir,
args.output_dir,
args.dataset_format,
args.split_ratios,
args.dataset_plugin,
args.dataset_name,
args.copy_files,
args.seed,
input_dir=args.input_dir,
output_dir=args.output_dir,
dataset_format=args.dataset_format,
split_ratios=args.split_ratios,
dataset_plugin=args.dataset_plugin,
dataset_name=args.dataset_name,
is_instance_segmentation=args.is_instance_segmentation,
keep_empty_images=args.keep_empty_images,
copy_files=args.copy_files,
seed=args.seed,
)


Expand Down
27 changes: 24 additions & 3 deletions datadreamer/utils/luxonis_dataset_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def convert(
dataset_dir: str,
output_dir: str,
split_ratios: List[float],
keep_empty_images: bool = False,
copy_files: bool = True,
) -> None:
"""Converts a dataset into a LuxonisDataset format.
Expand All @@ -46,16 +47,24 @@ def convert(
dataset_dir (str): The directory where the source dataset is located.
output_dir (str): The directory where the processed dataset should be saved.
split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
No return value.
"""
annotation_path = os.path.join(dataset_dir, "annotations.json")
data = BaseConverter.read_annotations(annotation_path)
self.process_data(data, dataset_dir, output_dir, split_ratios)
self.process_data(
data, dataset_dir, output_dir, split_ratios, keep_empty_images
)

def process_data(
self, data: Dict, dataset_dir: str, output_dir: str, split_ratios: List[float]
self,
data: Dict,
dataset_dir: str,
output_dir: str,
split_ratios: List[float],
keep_empty_images: bool = False,
) -> None:
"""Processes the data into LuxonisDataset format.
Expand All @@ -81,7 +90,10 @@ def dataset_generator():
width, height = Image.open(image_full_path).size
labels = data[image_path]["labels"]

if len(labels) == 0:
if len(labels) == 0 and keep_empty_images:
logger.warning(
f"Image {image_path} has no annotations. Training on empty images with `luxonis-train` will result in an error."
)
yield {
"file": image_full_path,
}
Expand Down Expand Up @@ -161,4 +173,13 @@ def dataset_generator():
dataset = LuxonisDataset(dataset_name)

dataset.add(dataset_generator())

if not keep_empty_images:
n_empty_images = len(
list(filter(lambda x: len(data[x]["labels"]) == 0, image_paths))
)
if n_empty_images > 0:
logger.info(
f"Removed {n_empty_images} empty images with no annotations from the dataset."
)
dataset.make_splits(tuple(split_ratios))
2 changes: 2 additions & 0 deletions datadreamer/utils/single_label_cls_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def convert(
dataset_dir: str,
output_dir: str,
split_ratios: List[float],
keep_empty_images: bool = False,
copy_files: bool = True,
) -> None:
"""Converts a dataset into a format suitable for single-label classification.
Expand All @@ -49,6 +50,7 @@ def convert(
dataset_dir (str): The directory where the source dataset is located.
output_dir (str): The directory where the processed dataset should be saved.
split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
No return value.
Expand Down
Loading

0 comments on commit 13a4cbf

Please sign in to comment.