Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add an option to keep images with no annotation #72

Merged
merged 3 commits into from
Nov 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ datadreamer --config <path-to-config>
- `--lm_quantization`: Quantization to use for Mistral language model. Choose between `none` and `4bit`. Default is `none`.
- `--annotator_size`: Size of the annotator model to use. Choose between `base` and `large`. Default is `base`.
- `--disable_lm_filter`: Use only a bad word list for profanity filtering. Default is `False`.
- `--keep_unlabeled_images`: Whether to keep images without any annotations. Default is `False`.
- `--batch_size_prompt`: Batch size for prompt generation. Default is 64.
- `--batch_size_annotation`: Batch size for annotation. Default is `1`.
- `--batch_size_image`: Batch size for image generation. Default is `1`.
Expand Down
2 changes: 1 addition & 1 deletion datadreamer/dataset_annotation/slimsam_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def _init_model(self) -> SamModel:
Returns:
SamModel: The initialized SAM model.
"""
logger.info(f"Initializing `SlimSAM {self.size} model...")
logger.info(f"Initializing SlimSAM {self.size} model...")
if self.size == "large":
return SamModel.from_pretrained("Zigeng/SlimSAM-uniform-50")
return SamModel.from_pretrained("Zigeng/SlimSAM-uniform-77")
Expand Down
10 changes: 10 additions & 0 deletions datadreamer/pipelines/generate_dataset_from_scratch.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,13 @@ def parse_args():
help="Whether to use only bad words in profanity filter",
)

parser.add_argument(
"--keep_unlabeled_images",
default=None,
action="store_true",
help="Whether to keep images without any annotations",
)

parser.add_argument(
"--batch_size_prompt",
type=int,
Expand Down Expand Up @@ -718,6 +725,7 @@ def read_image_batch(image_batch, batch_num, batch_size):
args.split_ratios,
copy_files=False,
is_instance_segmentation=args.task == "instance-segmentation",
keep_unlabeled_images=args.keep_unlabeled_images,
seed=args.seed,
)
# Convert annotations to COCO format
Expand All @@ -728,6 +736,7 @@ def read_image_batch(image_batch, batch_num, batch_size):
"coco",
args.split_ratios,
is_instance_segmentation=args.task == "instance-segmentation",
keep_unlabeled_images=args.keep_unlabeled_images,
copy_files=False,
seed=args.seed,
)
Expand All @@ -742,6 +751,7 @@ def read_image_batch(image_batch, batch_num, batch_size):
dataset_plugin=args.dataset_plugin,
dataset_name=args.dataset_name,
is_instance_segmentation=args.task == "instance-segmentation",
keep_unlabeled_images=args.keep_unlabeled_images,
copy_files=False,
seed=args.seed,
)
Expand Down
16 changes: 13 additions & 3 deletions datadreamer/utils/base_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,21 +14,29 @@ def __init__(self, seed=42):
np.random.seed(seed)

@abstractmethod
def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True) -> None:
def convert(
self,
dataset_dir: str,
output_dir: str,
split_ratios: List[float],
keep_unlabeled_images: bool = False,
copy_files: bool = True,
) -> None:
"""Converts a dataset into another format.

Args:
dataset_dir (str): The directory where the source dataset is located.
output_dir (str): The directory where the processed dataset should be saved.
split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.

No return value.
"""
pass

@staticmethod
def read_annotations(annotation_path) -> Dict:
def read_annotations(annotation_path: str) -> Dict:
"""Reads annotations from a JSON file located at the specified path.

Args:
Expand All @@ -42,7 +50,9 @@ def read_annotations(annotation_path) -> Dict:
return data

@staticmethod
def make_splits(images, split_ratios, shuffle=True) -> Tuple[List, List, List]:
def make_splits(
images: List[str], split_ratios: List[float], shuffle: bool = True
) -> Tuple[List, List, List]:
"""Splits the list of images into training, validation, and test sets.

Args:
Expand Down
52 changes: 47 additions & 5 deletions datadreamer/utils/coco_converter.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
from __future__ import annotations

import json
import logging
import os
import shutil
from typing import Dict, List

import numpy as np
from PIL import Image

from datadreamer.utils.base_converter import BaseConverter

logger = logging.getLogger(__name__)


class COCOConverter(BaseConverter):
"""Class for converting a dataset to COCO format.
Expand All @@ -33,23 +37,44 @@ def __init__(self, seed=42, is_instance_segmentation: bool = False):
super().__init__(seed)
self.is_instance_segmentation = is_instance_segmentation

def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True) -> None:
def convert(
self,
dataset_dir: str,
output_dir: str,
split_ratios: List[float],
keep_unlabeled_images: bool = False,
copy_files: bool = True,
) -> None:
"""Converts a dataset into a COCO format.

Args:
dataset_dir (str): The directory where the source dataset is located.
output_dir (str): The directory where the processed dataset should be saved.
split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.

No return value.
"""
annotation_path = os.path.join(dataset_dir, "annotations.json")
data = BaseConverter.read_annotations(annotation_path)
self.process_data(data, dataset_dir, output_dir, split_ratios, copy_files)
self.process_data(
data,
dataset_dir,
output_dir,
split_ratios,
keep_unlabeled_images,
copy_files,
)

def process_data(
self, data, image_dir, output_dir, split_ratios, copy_files=True
self,
data: Dict,
image_dir: str,
output_dir: str,
split_ratios: List[float],
keep_unlabeled_images: bool = False,
copy_files: bool = True,
) -> None:
"""Processes the data by dividing it into training and validation sets, and
saves the images and labels in COCO format.
Expand All @@ -58,14 +83,27 @@ def process_data(
data (dict): The dictionary containing image annotations.
image_dir (str): The directory where the source images are located.
output_dir (str): The base directory where the processed data will be saved.
split_ratios (float): The ratio to split the data into training, validation, and test sets.
split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.

No return value.
"""
images = list(data.keys())
images.remove("class_names")

empty_images = list(filter(lambda x: len(data[x]["labels"]) == 0, images))
if keep_unlabeled_images and len(empty_images) > 0:
logger.warning(
f"{len(empty_images)} images with no annotations will be included in the dataset."
)
elif not keep_unlabeled_images and len(empty_images) > 0:
logger.info(
f"{len(empty_images)} images with no annotations will be excluded from the dataset."
)
for image in empty_images:
images.remove(image)

train_images, val_images, test_images = BaseConverter.make_splits(
images, split_ratios
)
Expand Down Expand Up @@ -147,7 +185,11 @@ def process_data(
)

def save_labels(
self, dataset_output_dir, images_info, annotations, class_names
self,
dataset_output_dir: str,
images_info: List[Dict],
annotations: List[Dict],
class_names: List[str],
) -> None:
"""Saves the labels to a JSON file.

Expand Down
2 changes: 2 additions & 0 deletions datadreamer/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,5 @@ class Config(LuxonisConfig):
loader_plugin: str = ""
dataset_name: str = ""
dataset_id: str = ""
# Dataset arguments
keep_unlabeled_images: bool = False
58 changes: 39 additions & 19 deletions datadreamer/utils/convert_dataset.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import argparse
from typing import List, Optional

from datadreamer.utils import (
COCOConverter,
Expand All @@ -11,25 +12,28 @@


def convert_dataset(
input_dir,
output_dir,
dataset_format,
split_ratios,
dataset_plugin=None,
dataset_name=None,
is_instance_segmentation=False,
copy_files=True,
seed=42,
input_dir: str,
output_dir: str,
dataset_format: str,
split_ratios: List[float],
dataset_plugin: Optional[str] = None,
dataset_name: Optional[str] = None,
is_instance_segmentation: bool = False,
keep_unlabeled_images: bool = False,
copy_files: bool = True,
seed: int = 42,
) -> None:
"""Converts a dataset from one format to another.

Args:
input_dir (str): Directory containing the images and annotations.
output_dir (str): Directory where the processed dataset will be saved.
dataset_format (str): Format of the dataset. Can be 'yolo', 'coco', 'luxonis-dataset', or 'cls-single'.
split_ratios (list): List of ratios for train, val, and test splits.
split_ratios (list of float): List of ratios for train, val, and test splits.
dataset_plugin (str, optional): Plugin for Luxonis dataset. Defaults to None.
dataset_name (str, optional): Name of the Luxonis dataset. Defaults to None.
is_instance_segmentation (bool, optional): Whether the dataset is for instance segmentation. Defaults to False.
keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
copy_files (bool, optional): Whether to copy the files to the output directory. Defaults to True.
seed (int, optional): Random seed. Defaults to 42.

Expand All @@ -56,7 +60,9 @@ def convert_dataset(
else:
raise ValueError(f"Invalid dataset format: {dataset_format}")

converter.convert(input_dir, output_dir, split_ratios, copy_files)
converter.convert(
input_dir, output_dir, split_ratios, keep_unlabeled_images, copy_files
)


def main():
Expand Down Expand Up @@ -95,6 +101,18 @@ def main():
type=str,
help="Name of the dataset to create if dataset_plugin is used",
)
parser.add_argument(
"--is_instance_segmentation",
default=None,
action="store_true",
help="Whether the dataset is for instance segmentation.",
)
parser.add_argument(
"--keep_unlabeled_images",
default=None,
action="store_true",
help="Whether to keep images without any annotations",
)
parser.add_argument(
"--copy_files",
type=bool,
Expand All @@ -111,14 +129,16 @@ def main():
args = parser.parse_args()

convert_dataset(
args.input_dir,
args.output_dir,
args.dataset_format,
args.split_ratios,
args.dataset_plugin,
args.dataset_name,
args.copy_files,
args.seed,
input_dir=args.input_dir,
output_dir=args.output_dir,
dataset_format=args.dataset_format,
split_ratios=args.split_ratios,
dataset_plugin=args.dataset_plugin,
dataset_name=args.dataset_name,
is_instance_segmentation=args.is_instance_segmentation,
keep_unlabeled_images=args.keep_unlabeled_images,
copy_files=args.copy_files,
seed=args.seed,
)


Expand Down
27 changes: 24 additions & 3 deletions datadreamer/utils/luxonis_dataset_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def convert(
dataset_dir: str,
output_dir: str,
split_ratios: List[float],
keep_unlabeled_images: bool = False,
copy_files: bool = True,
) -> None:
"""Converts a dataset into a LuxonisDataset format.
Expand All @@ -46,16 +47,24 @@ def convert(
dataset_dir (str): The directory where the source dataset is located.
output_dir (str): The directory where the processed dataset should be saved.
split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.

No return value.
"""
annotation_path = os.path.join(dataset_dir, "annotations.json")
data = BaseConverter.read_annotations(annotation_path)
self.process_data(data, dataset_dir, output_dir, split_ratios)
self.process_data(
data, dataset_dir, output_dir, split_ratios, keep_unlabeled_images
)

def process_data(
self, data: Dict, dataset_dir: str, output_dir: str, split_ratios: List[float]
self,
data: Dict,
dataset_dir: str,
output_dir: str,
split_ratios: List[float],
keep_unlabeled_images: bool = False,
) -> None:
"""Processes the data into LuxonisDataset format.

Expand All @@ -81,7 +90,10 @@ def dataset_generator():
width, height = Image.open(image_full_path).size
labels = data[image_path]["labels"]

if len(labels) == 0:
if len(labels) == 0 and keep_unlabeled_images:
logger.warning(
f"Image {image_path} has no annotations. Training on empty images with `luxonis-train` will result in an error."
)
yield {
"file": image_full_path,
}
Expand Down Expand Up @@ -161,4 +173,13 @@ def dataset_generator():
dataset = LuxonisDataset(dataset_name)

dataset.add(dataset_generator())

if not keep_unlabeled_images:
n_empty_images = len(
list(filter(lambda x: len(data[x]["labels"]) == 0, image_paths))
)
if n_empty_images > 0:
logger.info(
f"Removed {n_empty_images} empty images with no annotations from the dataset."
)
dataset.make_splits(tuple(split_ratios))
2 changes: 2 additions & 0 deletions datadreamer/utils/single_label_cls_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def convert(
dataset_dir: str,
output_dir: str,
split_ratios: List[float],
keep_unlabeled_images: bool = False,
copy_files: bool = True,
) -> None:
"""Converts a dataset into a format suitable for single-label classification.
Expand All @@ -49,6 +50,7 @@ def convert(
dataset_dir (str): The directory where the source dataset is located.
output_dir (str): The directory where the processed dataset should be saved.
split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.

No return value.
Expand Down
Loading
Loading