From f78aae47d519e31bc461d2942bd6b9b84d6e8023 Mon Sep 17 00:00:00 2001 From: imhuwq <imhuwq@gmail.com> Date: Tue, 30 Jan 2024 11:25:55 +0800 Subject: [PATCH 1/6] feature(write concern): set write concern to 0 for batch saving --- deepdataspace/model/_base.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/deepdataspace/model/_base.py b/deepdataspace/model/_base.py index d45b713..1b30558 100644 --- a/deepdataspace/model/_base.py +++ b/deepdataspace/model/_base.py @@ -13,6 +13,7 @@ from typing import Tuple from pydantic import BaseModel as _Base +from pymongo import WriteConcern from pymongo.collection import Collection from pymongo.operations import UpdateOne from pymongo.typings import _DocumentType @@ -232,6 +233,8 @@ def batch_update(cls, filters: dict, set_data: dict = None, unset_data: dict = N co = cls.get_collection() if co is None: return None + wc = WriteConcern(w=0) + co = co.with_options(write_concern=wc) op = UpdateOne(filters, {"$set": set_data, "$unset": unset_data}) @@ -257,6 +260,8 @@ def finish_batch_update(cls): op_lock = cls._get_batch_op_lock() with op_lock: co = cls.get_collection() + wc = WriteConcern(w=0) + co = co.with_options(write_concern=wc) queue = _batch_update_queue.setdefault(cls_id, []) if queue: co.bulk_write(queue) @@ -310,6 +315,8 @@ def batch_save(self, batch_size: int = 20, set_on_insert: Dict = None): co = cls.get_collection() if co is None: return None + wc = WriteConcern(w=0) + co = co.with_options(write_concern=wc) _id = self.__dict__.get("id", None) if _id is None: @@ -348,6 +355,8 @@ def finish_batch_save(cls): op_lock = _batch_lock[cls_id] with op_lock: co = cls.get_collection() + wc = WriteConcern(w=0) + co = co.with_options(write_concern=wc) queue = _batch_save_queue.setdefault(cls_id, []) if queue: co.bulk_write(queue) From 6fe9bae0a75cd566cb9b129ce9f1b1714e81cd8c Mon Sep 17 00:00:00 2001 From: imhuwq <imhuwq@gmail.com> Date: Wed, 21 Feb 2024 10:50:26 +0800 Subject: [PATCH 2/6] feature(batch insert): support batch insert for dataset importing --- deepdataspace/io/importer.py | 19 +++++------ deepdataspace/model/_base.py | 59 ++++++++++++++++++++++++++++++++++ deepdataspace/model/dataset.py | 33 +++++++++++++------ 3 files changed, 93 insertions(+), 18 deletions(-) diff --git a/deepdataspace/io/importer.py b/deepdataspace/io/importer.py index b3f01c8..6da2225 100644 --- a/deepdataspace/io/importer.py +++ b/deepdataspace/io/importer.py @@ -108,7 +108,7 @@ def __init__(self, name: str, id_: str = None): """ self.dataset_name = name - self.dataset = DataSet.get_importing_dataset(name, id_=id_) + self.dataset = DataSet.get_importing_dataset(name, id_=id_, batch_upsert=False) self._image_queue = {} self._label_queue = {} @@ -123,6 +123,7 @@ def pre_run(self): self.load_existing_user_data() self.dataset.status = constants.DatasetStatus.Importing self.dataset.save() + Image(self.dataset.id).get_collection().drop() def post_run(self): """ @@ -152,14 +153,14 @@ def load_existing_user_data(self): """ pipeline = [ - {"$project": {"flag" : 1, - "flag_ts" : 1, + {"$project": {"flag": 1, + "flag_ts": 1, "label_confirm": 1, - "objects" : { + "objects": { "$filter": { "input": "$objects", - "as" : "object", - "cond" : { + "as": "object", + "cond": { "$eq": ["$$object.label_type", LabelType.User] } } @@ -181,9 +182,9 @@ def load_existing_user_data(self): label_confirm = image.get("label_confirm", {}) self._user_data[image_id] = { - "objects" : user_objects, - "flag" : flag, - "flag_ts" : flag_ts, + "objects": user_objects, + "flag": flag, + "flag_ts": flag_ts, "label_confirm": label_confirm, } diff --git a/deepdataspace/model/_base.py b/deepdataspace/model/_base.py index 1b30558..7ecdf77 100644 --- a/deepdataspace/model/_base.py +++ b/deepdataspace/model/_base.py @@ -15,6 +15,7 @@ from pydantic import BaseModel as _Base from pymongo import WriteConcern from pymongo.collection import Collection +from pymongo.operations import InsertOne from pymongo.operations import UpdateOne from pymongo.typings import _DocumentType @@ -24,6 +25,7 @@ _batch_lock = {} # a dict of batch operation lock for every collection, {'collection_name': batch_op_lock, } _batch_save_queue = {} # a dict of batch save queue for every collection, {'collection_name': batch_save_queue, } _batch_update_queue = {} # a dict of batch update queue for every collection, {'collection_name': batch_update_queue, } +_batch_insert_queue = {} # a dict of batch insert queue for every collection, {'collection_name': batch_insert_queue, } def current_ts(): @@ -362,6 +364,63 @@ def finish_batch_save(cls): co.bulk_write(queue) _batch_save_queue[cls_id] = [] + @classmethod + def batch_create(cls, model_obj: "BaseModel", batch_size: int = 20): + """ + The same as self.batch_save function, + but the performance is better because we save by InsertOne instead of UpdateOne with upsert. + + :param model_obj: the saving model object. + :param batch_size: the batch size. We will only write to mongodb when the batch is full. + """ + + model_obj.post_init() + + co = cls.get_collection() + if co is None: + return None + wc = WriteConcern(w=0) + co = co.with_options(write_concern=wc) + + _id = model_obj.__dict__.get("id", None) + if _id is None: + return None + + data = model_obj.to_dict() + data["_id"] = data.pop("id", None) + + op = InsertOne(data) + + cls_id = cls.get_cls_id() + op_lock = cls._get_batch_op_lock() + with op_lock: + queue = _batch_insert_queue.setdefault(cls_id, []) + queue.append(op) + if len(queue) >= batch_size: + co.bulk_write(queue) + _batch_insert_queue[cls_id] = [] + + @classmethod + def finish_batch_create(cls): + """ + This must be called after all the batch_create calls. + """ + + cls_id = cls.get_cls_id() + if _batch_lock.get(cls_id, None) is None: + with _lock_lock: + _batch_lock[cls_id] = Lock() + + op_lock = _batch_lock[cls_id] + with op_lock: + co = cls.get_collection() + wc = WriteConcern(w=0) + co = co.with_options(write_concern=wc) + queue = _batch_insert_queue.setdefault(cls_id, []) + if queue: + co.bulk_write(queue) + _batch_insert_queue[cls_id] = [] + def delete(self): """ Delete current object from mongodb. diff --git a/deepdataspace/model/dataset.py b/deepdataspace/model/dataset.py index b99f690..370f20d 100644 --- a/deepdataspace/model/dataset.py +++ b/deepdataspace/model/dataset.py @@ -104,6 +104,9 @@ def get_collection(cls, *args, **kwargs) -> Collection[_DocumentType]: _batch_queue: Dict[int, ImageModel] = {} _batch_size: int = 100 + # If True, images from batch_add_image will be saved by UpdateOne with upsert=True. + # If False, images from batch_add_image will be saved by InsertOne, which is faster. + _batch_upsert: bool = True @classmethod def create_dataset(cls, @@ -114,6 +117,7 @@ def create_dataset(cls, files: dict = None, description: str = None, description_func: str = None, + batch_upsert: bool = True, ) -> "DataSet": """ Create a dataset. @@ -129,6 +133,8 @@ def create_dataset(cls, :param description_func: an import path of a function to generate description. The function takes the dataset instance as the only argument and returns a string. If this is provided, it proceeds the description str. + :param batch_upsert: If True, images from batch_add_image will be saved by UpdateOne with upsert=True. + otherwise they will be saved by faster InsertOne operation. :return: the dataset object. """ @@ -148,6 +154,7 @@ def create_dataset(cls, dataset = cls(name=name, id=id_, type=type, path=path, files=files, status=DatasetStatus.Ready, description=description, description_func=description_func) + dataset._batch_upsert = batch_upsert dataset.post_init() dataset.save() return dataset @@ -159,6 +166,7 @@ def get_importing_dataset(cls, type: str = None, path: str = None, files: dict = None, + batch_upsert: bool = True, ) -> "DataSet": """ This is the same as create_dataset. @@ -179,6 +187,7 @@ def get_importing_dataset(cls, files = files or {} dataset = cls(name=name, id=id_, type=type, path=path, files=files, status=DatasetStatus.Waiting) + dataset._batch_upsert = batch_upsert dataset.post_init() dataset.save() return dataset @@ -384,14 +393,20 @@ def _batch_save_image_batch(self): # setup image image.idx = idx image.id = idx if image.id < 0 else image.id - image.batch_save(batch_size=self._batch_size, set_on_insert={"idx": image.idx}) + if self._batch_upsert is True: + image.batch_save(batch_size=self._batch_size, set_on_insert={"idx": image.idx}) + else: + IModel.batch_create(image, batch_size=self._batch_size) idx += 1 self._add_local_file_url_to_whitelist(image.url, whitelist_dirs) self._add_local_file_url_to_whitelist(image.url_full_res, whitelist_dirs) # finish batch saves - IModel.finish_batch_save() + if self._batch_upsert is True: + IModel.finish_batch_save() + else: + IModel.finish_batch_create() Label.finish_batch_save() Category.finish_batch_save() @@ -406,9 +421,9 @@ def _batch_save_image_batch(self): self._batch_queue.clear() - def batch_save_image(self, enforce: bool = False): + def batch_save_image(self): batch_is_full = len(self._batch_queue) >= self._batch_size - if batch_is_full or enforce: + if batch_is_full: self._batch_save_image_batch() return True return False @@ -449,16 +464,16 @@ def cascade_delete(dataset: "DataSet"): return dataset_id = dataset.id - print(f"dataset [{dataset_id}] is found, deleting...") + logger.info(f"dataset [{dataset_id}] is found, deleting...") - print(f"dataset [{dataset_id}] is found, deleting categories...") + logger.info(f"dataset [{dataset_id}] is found, deleting categories...") Category.delete_many({"dataset_id": dataset_id}) - print(f"dataset [{dataset_id}] is found, deleting labels...") + logger.info(f"dataset [{dataset_id}] is found, deleting labels...") Label.delete_many({"dataset_id": dataset_id}) - print(f"dataset [{dataset_id}] is found, deleting images...") + logger.info(f"dataset [{dataset_id}] is found, deleting images...") Image(dataset_id).get_collection().drop() DataSet.delete_many({"id": dataset_id}) - print(f"dataset [{dataset_id}] is deleted.") + logger.info(f"dataset [{dataset_id}] is deleted.") From 794e71fded52f04329263aa014b844e39b5a2f8c Mon Sep 17 00:00:00 2001 From: imhuwq <imhuwq@gmail.com> Date: Fri, 23 Feb 2024 18:15:55 +0800 Subject: [PATCH 3/6] refactor(import dataset): import images with raw python dict instead of pydantic model --- deepdataspace/globals.py | 11 +- deepdataspace/io/importer.py | 253 +++++++++++++++++--- deepdataspace/model/_base.py | 73 +----- deepdataspace/model/dataset.py | 73 ++---- deepdataspace/model/image.py | 58 ++--- deepdataspace/model/label_task.py | 3 - deepdataspace/model/object.py | 19 -- deepdataspace/plugins/coco2017/importer.py | 13 +- deepdataspace/scripts/migrate/2023061401.py | 2 +- deepdataspace/services/mongodb.py | 2 +- 10 files changed, 277 insertions(+), 230 deletions(-) diff --git a/deepdataspace/globals.py b/deepdataspace/globals.py index c03edd6..23f118a 100644 --- a/deepdataspace/globals.py +++ b/deepdataspace/globals.py @@ -41,17 +41,8 @@ _mongo_user = urllib.parse.quote_plus(MONGODB_USER) _mongo_pass = urllib.parse.quote_plus(MONGODB_PASS) _mongo_url = f"mongodb://{_mongo_user}:{_mongo_pass}@{MONGODB_HOST}:{MONGODB_PORT}/{MONGODB_DBNAME}" -_mongo_client = MongoClient(_mongo_url, authMechanism="SCRAM-SHA-256") +_mongo_client = MongoClient(_mongo_url, authMechanism="SCRAM-SHA-256", maxPoolSize=None) MongoDB = _mongo_client[MONGODB_DBNAME] # init redis client Redis = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DBNAME, password=REDIS_PASS) - -# init sentry client -# TODO: sentry is not necessary for dds tool, remove it as soon as possible -if SENTRY_DSN is not None: - sample_rate = 0.1 if ENV == RunningEnv.Prod else 1.0 - sentry_sdk.init(dsn=SENTRY_DSN, - traces_sample_rate=sample_rate, - environment=ENV, ) - sentry_sdk.set_tag("os.user", get_os_username()) diff --git a/deepdataspace/io/importer.py b/deepdataspace/io/importer.py index 6da2225..e176906 100644 --- a/deepdataspace/io/importer.py +++ b/deepdataspace/io/importer.py @@ -6,25 +6,35 @@ import abc import copy +import json import logging import os -import time +import uuid from typing import Dict from typing import List +from typing import Literal from typing import Tuple from typing import Type from typing import Union from tqdm import tqdm +from pymongo import WriteConcern from deepdataspace import constants +from deepdataspace.constants import AnnotationType from deepdataspace.constants import DatasetFileType +from deepdataspace.constants import DatasetStatus +from deepdataspace.constants import FileReadMode from deepdataspace.constants import LabelName from deepdataspace.constants import LabelType -from deepdataspace.model import Category +from deepdataspace.constants import RedisKey +from deepdataspace.globals import Redis from deepdataspace.model import DataSet -from deepdataspace.model import Label +from deepdataspace.model.category import Category from deepdataspace.model.image import Image +from deepdataspace.model.image import ImageModel +from deepdataspace.model.label import Label +from deepdataspace.utils.file import create_file_url from deepdataspace.utils.function import count_block_time from deepdataspace.utils.string import get_str_md5 @@ -106,14 +116,206 @@ def __init__(self, name: str, id_: str = None): :param id_: the dataset id. If provided, the importer will try to update an existing dataset instead of creating a new one. """ - self.dataset_name = name - self.dataset = DataSet.get_importing_dataset(name, id_=id_, batch_upsert=False) + self.dataset = self.get_importing_dataset(name, id=id_) self._image_queue = {} self._label_queue = {} self._category_queue = {} self._user_data = {} # {image_id: {}} + self._import_queue = {} + self._batch_size = 200 + self.IModel = Image(self.dataset.id) + + @staticmethod + def get_importing_dataset(name: str, + id: str = None, + type: str = None, + path: str = None, + files: dict = None, + ) -> "DataSet": + if id: + dataset = DataSet.find_one({"id": id}) + if dataset is not None: + dataset.type = type or dataset.type + dataset.path = path or dataset.path + dataset.files = files or dataset.files + dataset.name = name + dataset.save() + return dataset + else: + id = uuid.uuid4().hex + + files = files or {} + dataset = DataSet(name=name, id=id, type=type, path=path, files=files, status=DatasetStatus.Waiting) + dataset.save() + return dataset + + def dataset_import_image(self, + dataset: DataSet, + uri: str, + thumb_uri: str = None, + width: int = None, + height: int = None, + id_: int = None, + metadata: dict = None, + flag: int = 0, + flag_ts: int = 0, ) -> dict: + full_uri = uri + thumb_uri = full_uri if thumb_uri is None else thumb_uri + if full_uri.startswith("file://"): + full_uri = create_file_url(full_uri[7:], read_mode=FileReadMode.Binary) + if thumb_uri.startswith("file://"): + thumb_uri = create_file_url(thumb_uri[7:], read_mode=FileReadMode.Binary) + + metadata = metadata or {} + metadata = json.dumps(metadata) + + # if id is not set, + # we use a negative value to indicate we are adding a new image instead of updating an existing one + idx = dataset.num_images + id_ = id_ if id_ is not None else dataset.num_images + image = dict(id=id_, idx=idx, + type=dataset.type, dataset_id=dataset.id, + url=thumb_uri, url_full_res=full_uri, + width=width, height=height, + flag=flag, flag_ts=flag_ts, + objects=[], metadata=metadata, ) + + self._import_queue[id_] = image + dataset.num_images += 1 + return image + + @staticmethod + def image_import_annotation(image: dict, + category: str, + label: str = LabelName.GroundTruth, + label_type: Literal["GT", "Pred", "User"] = "GT", + conf: float = 1.0, + is_group: bool = False, + bbox: Tuple[int, int, int, int] = None, + segmentation: List[List[int]] = None, + alpha_uri: str = None, + keypoints: List[Union[float, int]] = None, + keypoint_colors: List[int] = None, + keypoint_skeleton: List[int] = None, + keypoint_names: List[str] = None, + caption: str = None, + confirm_type: int = 0, ): + + bbox = ImageModel.format_bbox(image["width"], image["height"], bbox) + segmentation = ImageModel.format_segmentation(segmentation) + points, colors, lines, names = ImageModel.format_keypoints(keypoints, + keypoint_colors, + keypoint_skeleton, + keypoint_names) + if alpha_uri and alpha_uri.startswith("file://"): + alpha_path = alpha_uri[7:] + alpha_uri = create_file_url(file_path=alpha_path, + read_mode=FileReadMode.Binary) + + anno_obj = dict(label_name=label, label_type=label_type, + category_name=category, caption=caption, + bounding_box=bbox, segmentation=segmentation, alpha=alpha_uri, + points=points, lines=lines, point_colors=colors, point_names=names, + conf=conf, is_group=is_group, confirm_type=confirm_type) + image["objects"].append(anno_obj) + + def bulk_write_images(self, image_queue: list): + co = self.IModel.get_collection() + wc = WriteConcern(w=0) + co = co.with_options(write_concern=wc) + co.insert_many(image_queue) + + def _dataset_flush_importing(self): + if not self._import_queue: + return + + with count_block_time("prepare batch setup", logger.debug): + dataset_id = self.dataset.id + waiting_labels = dict() + waiting_categories = dict() + object_types = set() + whitelist_dirs = set() + + with count_block_time("prepare batch data", logger.debug): + image_insert_queue = [] + + for image_id, image in self._import_queue.items(): + for obj in image["objects"]: + # setup label + label_name = obj["label_name"] + label_id = waiting_labels.get(label_name, None) + if label_id is None: + label_id = get_str_md5(f"{dataset_id}_{label_name}") + label = Label(name=label_name, id=label_id, type=obj["label_type"], dataset_id=dataset_id) + label.batch_save(batch_size=self._batch_size) + waiting_labels[label_name] = label_id + obj["label_id"] = label_id + + # setup category + cat_name = obj["category_name"] + category_id = waiting_categories.get(cat_name, None) + if category_id is None: + category_id = get_str_md5(f"{dataset_id}_{cat_name}") + category = Category(name=cat_name, id=category_id, dataset_id=dataset_id) + category.batch_save(batch_size=self._batch_size) + waiting_categories[cat_name] = category_id + obj["category_id"] = category_id + + # setup object types + if AnnotationType.Classification not in object_types: + object_types.add(AnnotationType.Classification) + if obj["bounding_box"] and AnnotationType.Detection not in object_types: + object_types.add(AnnotationType.Detection) + if obj["segmentation"] and AnnotationType.Segmentation not in object_types: + object_types.add(AnnotationType.Segmentation) + if obj["alpha"] and AnnotationType.Matting not in object_types: + object_types.add(AnnotationType.Matting) + DataSet.add_local_file_url_to_whitelist(obj["alpha"], whitelist_dirs) + if obj["points"] and AnnotationType.KeyPoints not in object_types: + object_types.add(AnnotationType.KeyPoints) + + # setup image + image["_id"] = image.pop("id") + image_insert_queue.append(image) + + DataSet.add_local_file_url_to_whitelist(image["url"], whitelist_dirs) + DataSet.add_local_file_url_to_whitelist(image["url_full_res"], whitelist_dirs) + + # finish batch saves + with count_block_time("finish batch save", logger.debug): + Label.finish_batch_save() + Category.finish_batch_save() + self.bulk_write_images(image_insert_queue) + + # setup dataset + with count_block_time("setup dataset", logger.debug): + self.dataset.object_types = list(sorted(list(object_types))) + self.dataset.save() + + # save whitelist to redis + with count_block_time("save whitelist", logger.debug): + if whitelist_dirs: + Redis.sadd(RedisKey.DatasetImageDirs, *whitelist_dirs) + + self._import_queue.clear() + + def dataset_flush_importing(self): + batch_is_full = len(self._import_queue) >= self._batch_size + if batch_is_full: + with count_block_time("_dataset_flush_importing", logger.debug): + self._dataset_flush_importing() + return True + return False + + def dataset_finish_importing(self): + """ + This method should be called after all batch_add_image calls are finished. + This saves all images in the buffer queue to database. + """ + self._dataset_flush_importing() + self.dataset.add_cover() def pre_run(self): """ @@ -121,6 +323,7 @@ def pre_run(self): """ self.load_existing_user_data() + self.dataset.num_images = 0 self.dataset.status = constants.DatasetStatus.Importing self.dataset.save() Image(self.dataset.id).get_collection().drop() @@ -129,9 +332,9 @@ def post_run(self): """ A post-run hook for subclass importers to clean up data. """ - - self.dataset.status = constants.DatasetStatus.Ready - self.dataset.save() + self.dataset.add_cover() + DataSet.update_one({"id": self.dataset.id}, {"status": DatasetStatus.Ready}) + self.dataset = DataSet.find_one({"id": self.dataset.id}) def on_error(self, err: Exception): """ @@ -139,11 +342,8 @@ def on_error(self, err: Exception): """ try: - dataset_id = self.dataset.id - Label.delete_many({"dataset_id": dataset_id}) - Category.delete_many({"dataset_id": dataset_id}) - Image(dataset_id).get_collection().drop() - self.dataset.delete() + DataSet.cascade_delete(self.dataset) + self.dataset = None finally: raise err @@ -188,20 +388,20 @@ def load_existing_user_data(self): "label_confirm": label_confirm, } - def add_user_data(self, image): + def image_add_user_data(self, image: dict): """ Save manually added user data back. """ - image_id = image.id + image_id = image["id"] user_data = self._user_data.pop(image_id, {}) if not user_data: return - image.objects.extend(user_data["objects"]) - image.flag = user_data["flag"] - image.flag_ts = user_data["flag_ts"] - image.label_confirm = user_data["label_confirm"] + image.setdefault("objects").extend(user_data["objects"]) + image["flag"] = user_data["flag"] + image["flag_ts"] = user_data["flag_ts"] + image["label_confirm"] = user_data["label_confirm"] def run_import(self): """ @@ -211,15 +411,14 @@ def run_import(self): desc = f"dataset[{self.dataset.name}@{self.dataset.id}] import progress" for (image, anno_list) in tqdm(self, desc=desc, unit=" images"): - beg = int(time.time() * 1000) - image = self.dataset.batch_add_image(**image) - self.add_user_data(image) + # for (image, anno_list) in self: + image = self.dataset_import_image(self.dataset, **image) + self.image_add_user_data(image) for anno in anno_list: - image.batch_add_annotation(**anno) - image.finish_batch_add_annotation() - logger.debug(f"time cost of import one image: {int(time.time() * 1000) - beg}ms") - logger.debug(f"imported image, id={image.id}, url={image.url}") - self.dataset.finish_batch_add_image() + self.image_import_annotation(image, **anno) + self.dataset_flush_importing() + + self.dataset_finish_importing() def run(self): """ diff --git a/deepdataspace/model/_base.py b/deepdataspace/model/_base.py index 7ecdf77..977228d 100644 --- a/deepdataspace/model/_base.py +++ b/deepdataspace/model/_base.py @@ -5,6 +5,7 @@ """ import abc +import logging import time from threading import Lock from typing import ClassVar @@ -15,7 +16,6 @@ from pydantic import BaseModel as _Base from pymongo import WriteConcern from pymongo.collection import Collection -from pymongo.operations import InsertOne from pymongo.operations import UpdateOne from pymongo.typings import _DocumentType @@ -27,6 +27,8 @@ _batch_update_queue = {} # a dict of batch update queue for every collection, {'collection_name': batch_update_queue, } _batch_insert_queue = {} # a dict of batch insert queue for every collection, {'collection_name': batch_insert_queue, } +logger = logging.getLogger("model.base") + def current_ts(): """ @@ -56,12 +58,6 @@ def get_collection(cls, *args, **kwargs) -> Collection[_DocumentType]: raise NotImplementedError - def post_init(self): - """ - Post init hook for initializing a model object. - """ - pass - @classmethod def from_dict(cls, data: dict): """ @@ -69,7 +65,6 @@ def from_dict(cls, data: dict): """ obj = cls.parse_obj(data) - obj.post_init() return obj def to_dict(self, include: list = None, exclude: list = None): @@ -282,7 +277,6 @@ def save(self, refresh=False): If refresh is True, the object will be re-fetched from mongodb after saving. """ - self.post_init() co = self.get_collection() if co is None: return None @@ -300,7 +294,6 @@ def save(self, refresh=False): new_self = co.find_one({"_id": _id}) new_self.pop("_id", None) self.__dict__.update(new_self) - self.post_init() return self def batch_save(self, batch_size: int = 20, set_on_insert: Dict = None): @@ -310,9 +303,6 @@ def batch_save(self, batch_size: int = 20, set_on_insert: Dict = None): :param batch_size: the batch size. We will only write to mongodb when the batch is full. :param set_on_insert: the fields only need to be set when we are inserting a new object. """ - - self.post_init() - cls = self.__class__ co = cls.get_collection() if co is None: @@ -364,63 +354,6 @@ def finish_batch_save(cls): co.bulk_write(queue) _batch_save_queue[cls_id] = [] - @classmethod - def batch_create(cls, model_obj: "BaseModel", batch_size: int = 20): - """ - The same as self.batch_save function, - but the performance is better because we save by InsertOne instead of UpdateOne with upsert. - - :param model_obj: the saving model object. - :param batch_size: the batch size. We will only write to mongodb when the batch is full. - """ - - model_obj.post_init() - - co = cls.get_collection() - if co is None: - return None - wc = WriteConcern(w=0) - co = co.with_options(write_concern=wc) - - _id = model_obj.__dict__.get("id", None) - if _id is None: - return None - - data = model_obj.to_dict() - data["_id"] = data.pop("id", None) - - op = InsertOne(data) - - cls_id = cls.get_cls_id() - op_lock = cls._get_batch_op_lock() - with op_lock: - queue = _batch_insert_queue.setdefault(cls_id, []) - queue.append(op) - if len(queue) >= batch_size: - co.bulk_write(queue) - _batch_insert_queue[cls_id] = [] - - @classmethod - def finish_batch_create(cls): - """ - This must be called after all the batch_create calls. - """ - - cls_id = cls.get_cls_id() - if _batch_lock.get(cls_id, None) is None: - with _lock_lock: - _batch_lock[cls_id] = Lock() - - op_lock = _batch_lock[cls_id] - with op_lock: - co = cls.get_collection() - wc = WriteConcern(w=0) - co = co.with_options(write_concern=wc) - queue = _batch_insert_queue.setdefault(cls_id, []) - if queue: - co.bulk_write(queue) - _batch_insert_queue[cls_id] = [] - def delete(self): """ Delete current object from mongodb. diff --git a/deepdataspace/model/dataset.py b/deepdataspace/model/dataset.py index 370f20d..31c16ce 100644 --- a/deepdataspace/model/dataset.py +++ b/deepdataspace/model/dataset.py @@ -27,6 +27,7 @@ from deepdataspace.model.image import ImageModel from deepdataspace.model.label import Label from deepdataspace.utils.file import create_file_url +from deepdataspace.utils.function import count_block_time from deepdataspace.utils.string import get_str_md5 logger = logging.getLogger("io.model.dataset") @@ -103,10 +104,7 @@ def get_collection(cls, *args, **kwargs) -> Collection[_DocumentType]: group_name: str = None _batch_queue: Dict[int, ImageModel] = {} - _batch_size: int = 100 - # If True, images from batch_add_image will be saved by UpdateOne with upsert=True. - # If False, images from batch_add_image will be saved by InsertOne, which is faster. - _batch_upsert: bool = True + _batch_size: int = 200 @classmethod def create_dataset(cls, @@ -117,7 +115,6 @@ def create_dataset(cls, files: dict = None, description: str = None, description_func: str = None, - batch_upsert: bool = True, ) -> "DataSet": """ Create a dataset. @@ -145,6 +142,7 @@ def create_dataset(cls, dataset.path = path or dataset.path dataset.files = files or dataset.files dataset.name = name + dataset.num_images = Image(dataset.id).count_num({}) dataset.save() return dataset else: @@ -154,45 +152,11 @@ def create_dataset(cls, dataset = cls(name=name, id=id_, type=type, path=path, files=files, status=DatasetStatus.Ready, description=description, description_func=description_func) - dataset._batch_upsert = batch_upsert - dataset.post_init() + dataset.num_images = Image(dataset.id).count_num({}) dataset.save() return dataset - @classmethod - def get_importing_dataset(cls, - name: str, - id_: str = None, - type: str = None, - path: str = None, - files: dict = None, - batch_upsert: bool = True, - ) -> "DataSet": - """ - This is the same as create_dataset. - But if the dataset is new, it's status will be set to "waiting" instead of "ready". - """ - - if id_: - dataset = DataSet.find_one({"id": id_}) - if dataset is not None: - dataset.type = type or dataset.type - dataset.path = path or dataset.path - dataset.files = files or dataset.files - dataset.name = name - dataset.save() - return dataset - else: - id_ = uuid.uuid4().hex - - files = files or {} - dataset = cls(name=name, id=id_, type=type, path=path, files=files, status=DatasetStatus.Waiting) - dataset._batch_upsert = batch_upsert - dataset.post_init() - dataset.save() - return dataset - - def _add_cover(self, force_update: bool = False): + def add_cover(self, force_update: bool = False): has_cover = bool(self.cover_url) if has_cover and not force_update: return @@ -266,17 +230,16 @@ def add_image(self, image.flag = flag or image.flag image.flag_ts = flag_ts or image.flag_ts image.metadata = metadata or image.metadata - image.post_init() image._dataset = self # this saves a db query image.save() self.num_images = Model.count_num({}) - self._add_cover() + self.add_cover() # save whitelist to redis whitelist_dirs = set() - self._add_local_file_url_to_whitelist(image.url, whitelist_dirs) - self._add_local_file_url_to_whitelist(image.url_full_res, whitelist_dirs) + self.add_local_file_url_to_whitelist(image.url, whitelist_dirs) + self.add_local_file_url_to_whitelist(image.url_full_res, whitelist_dirs) if whitelist_dirs: Redis.sadd(RedisKey.DatasetImageDirs, *whitelist_dirs) @@ -335,7 +298,7 @@ def batch_add_image(self, return image @staticmethod - def _add_local_file_url_to_whitelist(url: str, whitelist: set): + def add_local_file_url_to_whitelist(url: str, whitelist: set): if not url or not url.startswith("/files/local_files"): return @@ -386,27 +349,21 @@ def _batch_save_image_batch(self): object_types.add(AnnotationType.Segmentation) if obj.alpha and AnnotationType.Matting not in object_types: object_types.add(AnnotationType.Matting) - self._add_local_file_url_to_whitelist(obj.alpha, whitelist_dirs) + self.add_local_file_url_to_whitelist(obj.alpha, whitelist_dirs) if obj.points and AnnotationType.KeyPoints not in object_types: object_types.add(AnnotationType.KeyPoints) # setup image image.idx = idx image.id = idx if image.id < 0 else image.id - if self._batch_upsert is True: - image.batch_save(batch_size=self._batch_size, set_on_insert={"idx": image.idx}) - else: - IModel.batch_create(image, batch_size=self._batch_size) + image.batch_save(batch_size=self._batch_size, set_on_insert={"idx": image.idx}) idx += 1 - self._add_local_file_url_to_whitelist(image.url, whitelist_dirs) - self._add_local_file_url_to_whitelist(image.url_full_res, whitelist_dirs) + self.add_local_file_url_to_whitelist(image.url, whitelist_dirs) + self.add_local_file_url_to_whitelist(image.url_full_res, whitelist_dirs) # finish batch saves - if self._batch_upsert is True: - IModel.finish_batch_save() - else: - IModel.finish_batch_create() + IModel.finish_batch_save() Label.finish_batch_save() Category.finish_batch_save() @@ -434,7 +391,7 @@ def finish_batch_add_image(self): This saves all images in the buffer queue to database. """ self._batch_save_image_batch() - self._add_cover() + self.add_cover() def eval_description(self): """ diff --git a/deepdataspace/model/image.py b/deepdataspace/model/image.py index 2840232..39fd579 100644 --- a/deepdataspace/model/image.py +++ b/deepdataspace/model/image.py @@ -163,7 +163,6 @@ def from_dict(cls, data: dict): data.setdefault("idx", data["id"]) obj = cls.parse_obj(data) - obj.post_init() return obj @staticmethod @@ -173,17 +172,6 @@ def _convert_local_to_url(file_uri: str): read_mode=constants.FileReadMode.Binary) return file_url - def post_init(self): - """ - Ensure the url are visible for local file uri. - """ - - if self.url.startswith("file://"): - self.url = self._convert_local_to_url(self.url) - - if self.url_full_res.startswith("file://"): - self.url_full_res = self._convert_local_to_url(self.url_full_res) - def _add_label(self, label: str, label_type: str): """ Add a label to the dataset the image belongs to. @@ -200,7 +188,6 @@ def _add_label(self, label: str, label_type: str): if label_obj is None: label_obj = Label(name=label, id=label_id, type=label_type, dataset_id=self.dataset_id) - label_obj.post_init() label_obj.save() self._labels[label_id] = label_obj return label_obj @@ -217,13 +204,12 @@ def _add_category(self, category: str): if category_obj is None: category_obj = Category(name=category, id=category_id, dataset_id=self.dataset_id) - category_obj.post_init() category_obj.save() self._categories[category_id] = category_obj return category_obj @staticmethod - def _format_bbox(width, height, bbox: Tuple[int, int, int, int]): + def format_bbox(width, height, bbox: Tuple[int, int, int, int]): """ Convert the bbox data to the internal format. """ @@ -239,7 +225,7 @@ def _format_bbox(width, height, bbox: Tuple[int, int, int, int]): return bounding_box @staticmethod - def _format_segmentation(segmentation: List[List[int]]): + def format_segmentation(segmentation: List[List[int]]): """ Convert the segmentation data to the internal format. """ @@ -249,10 +235,10 @@ def _format_segmentation(segmentation: List[List[int]]): return "/".join([",".join([str(x) for x in seg]) for seg in segmentation]) @staticmethod - def _format_keypoints(keypoints: List[Union[float, int]], - colors: List[int] = None, - skeleton: List[int] = None, - names: List[str] = None): + def format_keypoints(keypoints: List[Union[float, int]], + colors: List[int] = None, + skeleton: List[int] = None, + names: List[str] = None): """ Convert the coco_keypoints data to the internal format. """ @@ -333,20 +319,24 @@ def _add_annotation(self, if not self.width or not self.height: raise ValueError("image width and height must be set before setting bbox") + if alpha_uri and alpha_uri.startswith("file://"): + alpha_path = alpha_uri[7:] + alpha_uri = create_file_url(file_path=alpha_path, + read_mode=FileReadMode.Binary) + label_obj = self._add_label(label, label_type) category_obj = self._add_category(category) - bounding_box = self._format_bbox(self.width, self.height, bbox) - segmentation = self._format_segmentation(segmentation) - points, colors, lines, names = self._format_keypoints(keypoints, - keypoint_colors, - keypoint_skeleton, - keypoint_names) + bounding_box = self.format_bbox(self.width, self.height, bbox) + segmentation = self.format_segmentation(segmentation) + points, colors, lines, names = self.format_keypoints(keypoints, + keypoint_colors, + keypoint_skeleton, + keypoint_names) anno_obj = Object(label_name=label, label_type=label_type, label_id=label_obj.id, category_name=category, category_id=category_obj.id, caption=caption, bounding_box=bounding_box, segmentation=segmentation, alpha=alpha_uri, points=points, lines=lines, point_colors=colors, point_names=names, conf=conf, is_group=is_group, confirm_type=confirm_type) - anno_obj.post_init() self.objects.append(anno_obj) def add_annotation(self, @@ -421,7 +411,7 @@ def batch_add_annotation(self, for annotation_data in annotations: image.batch_add_annotation(**annotation_data) - dataset.finish_batch_add+image() + dataset.finish_batch_add_image() :param category: the category name. :param label: the label name. @@ -442,12 +432,12 @@ def batch_add_annotation(self, :return: None """ - bbox = self._format_bbox(self.width, self.height, bbox) - segmentation = self._format_segmentation(segmentation) - points, colors, lines, names = self._format_keypoints(keypoints, - keypoint_colors, - keypoint_skeleton, - keypoint_names) + bbox = self.format_bbox(self.width, self.height, bbox) + segmentation = self.format_segmentation(segmentation) + points, colors, lines, names = self.format_keypoints(keypoints, + keypoint_colors, + keypoint_skeleton, + keypoint_names) if alpha_uri and alpha_uri.startswith("file://"): alpha_path = alpha_uri[7:] alpha_uri = create_file_url(file_path=alpha_path, diff --git a/deepdataspace/model/label_task.py b/deepdataspace/model/label_task.py index 9f00a29..42736cf 100644 --- a/deepdataspace/model/label_task.py +++ b/deepdataspace/model/label_task.py @@ -462,7 +462,6 @@ def _get_label(dataset_id: str, label_set_name: str): """ label_id = get_str_md5(f"{dataset_id}_{label_set_name}") label_obj = Label(name=label_set_name, id=label_id, type=LabelType.GroundTruth, dataset_id=dataset_id) - label_obj.post_init() label_obj.save() return label_obj @@ -475,7 +474,6 @@ def _get_category(dataset_id: str, category_name: str, categories: dict): if cat_obj is None: cat_id = get_str_md5(f"{dataset_id}_{category_name}") cat_obj = Category(id=cat_id, name=category_name, dataset_id=dataset_id) - cat_obj.post_init() cat_obj.save() categories[category_name] = cat_obj return cat_obj @@ -529,7 +527,6 @@ def _export_dataset(self, dataset: DataSet, label_set_name: str): anno_obj = Object(label_name=label_obj.name, label_type=label_obj.type, label_id=label_obj.id, category_name=cat_obj.name, category_id=cat_obj.id, bounding_box=anno["bounding_box"]) - anno_obj.post_init() image.objects.append(anno_obj) image.batch_save() diff --git a/deepdataspace/model/object.py b/deepdataspace/model/object.py index 55133a9..5855969 100644 --- a/deepdataspace/model/object.py +++ b/deepdataspace/model/object.py @@ -89,22 +89,3 @@ def get_collection(cls, *args, **kwargs): confirm_type: Optional[int] = 0 # the image confirm type, 0 no confirm required, 1 gt may be fn, 2 pred may be fp compare_result: Optional[Dict[str, str]] = {} # {"90": "FP", ..., "10": "OK"} matched_det_idx: Optional[int] = None # The matched ground truth index, for prediction objects only. - - @staticmethod - def _convert_file_path_to_url(file_uri: str): - """ - Convert a local file path to a visible file url. - """ - - file_path = file_uri[7:] - file_url = create_file_url(file_path=file_path, - read_mode=constants.FileReadMode.Binary) - return file_url - - def post_init(self): - """ - Override the post_init method to convert the alpha file path to url. - """ - - if self.alpha and self.alpha.startswith("file://"): - self.alpha = self._convert_file_path_to_url(self.alpha) diff --git a/deepdataspace/plugins/coco2017/importer.py b/deepdataspace/plugins/coco2017/importer.py index 1d4ba15..4016118 100644 --- a/deepdataspace/plugins/coco2017/importer.py +++ b/deepdataspace/plugins/coco2017/importer.py @@ -1,14 +1,13 @@ """ Import the coco2017 dataset and save metadata into mongodb. """ - import json import logging import os +import traceback from typing import Dict from typing import List from typing import Tuple -import traceback from deepdataspace.constants import DatasetFileType from deepdataspace.constants import DatasetType @@ -83,11 +82,11 @@ def _parse_meta(meta_path: str): assert os.path.isdir(image_root) and os.path.exists(image_root) info = { - "dataset_name" : dataset_name, - "ground_truth" : ground_truth, - "predictions" : predictions, - "image_root" : image_root, - "dynamic_caption" : getattr(module, "dynamic_caption", False), + "dataset_name": dataset_name, + "ground_truth": ground_truth, + "predictions": predictions, + "image_root": image_root, + "dynamic_caption": getattr(module, "dynamic_caption", False), "caption_generator": getattr(module, "caption_generator", None), } diff --git a/deepdataspace/scripts/migrate/2023061401.py b/deepdataspace/scripts/migrate/2023061401.py index 71f4aae..5105feb 100644 --- a/deepdataspace/scripts/migrate/2023061401.py +++ b/deepdataspace/scripts/migrate/2023061401.py @@ -16,7 +16,7 @@ def add_covers(): datasets = DataSet.find_many({}) for idx, dataset in enumerate(datasets): - dataset._add_cover(force_update=True) + dataset.add_cover(force_update=True) logger.info(f"[{idx + 1}/{num}]Added cover to dataset[{dataset.id}], cover_url={dataset.cover_url}") logger.info("Finished adding covers") diff --git a/deepdataspace/services/mongodb.py b/deepdataspace/services/mongodb.py index 83c9d2b..ffe533f 100644 --- a/deepdataspace/services/mongodb.py +++ b/deepdataspace/services/mongodb.py @@ -82,7 +82,7 @@ def _start_mongodb(self, auth: bool = False): self.port = find_free_port(*port_range) while True: run_cmd = cmd[:] - run_cmd.extend(["--port", str(self.port)]) + run_cmd.extend(["--nojournal", "--port", str(self.port)]) try: self.start_process(run_cmd) except Exception as err: From 187334c2e6e399bc09773215527dee1fba8a0a18 Mon Sep 17 00:00:00 2001 From: imhuwq <imhuwq@gmail.com> Date: Mon, 26 Feb 2024 10:17:37 +0800 Subject: [PATCH 4/6] feature(mongodb): enable journaling --- deepdataspace/services/mongodb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepdataspace/services/mongodb.py b/deepdataspace/services/mongodb.py index ffe533f..83c9d2b 100644 --- a/deepdataspace/services/mongodb.py +++ b/deepdataspace/services/mongodb.py @@ -82,7 +82,7 @@ def _start_mongodb(self, auth: bool = False): self.port = find_free_port(*port_range) while True: run_cmd = cmd[:] - run_cmd.extend(["--nojournal", "--port", str(self.port)]) + run_cmd.extend(["--port", str(self.port)]) try: self.start_process(run_cmd) except Exception as err: From 4c03620141a9069f14d8ab342c780df9380fae2b Mon Sep 17 00:00:00 2001 From: imhuwq <imhuwq@gmail.com> Date: Mon, 26 Feb 2024 11:40:24 +0800 Subject: [PATCH 5/6] delete(label confirm): delete label confirm attribute for objects --- deepdataspace/io/importer.py | 17 +--- deepdataspace/model/image.py | 22 +---- deepdataspace/model/object.py | 6 -- deepdataspace/plugins/tsv/importer.py | 4 - .../server/resources/api_v1/__init__.py | 2 - .../server/resources/api_v1/annotations.py | 1 - .../server/resources/api_v1/images.py | 11 +-- .../server/resources/api_v1/label_clone.py | 5 +- .../server/resources/api_v1/object_confirm.py | 84 ------------------- 9 files changed, 11 insertions(+), 141 deletions(-) delete mode 100644 deepdataspace/server/resources/api_v1/object_confirm.py diff --git a/deepdataspace/io/importer.py b/deepdataspace/io/importer.py index e176906..66d70a0 100644 --- a/deepdataspace/io/importer.py +++ b/deepdataspace/io/importer.py @@ -77,8 +77,7 @@ def format_annotation(category: str, keypoint_colors: List[int] = None, keypoint_skeleton: List[int] = None, keypoint_names: List[str] = None, - caption: str = None, - confirm_type: int = 0, ): + caption: str = None): """ A helper function to format annotation data. """ @@ -95,8 +94,7 @@ def format_annotation(category: str, keypoint_colors=keypoint_colors, keypoint_skeleton=keypoint_skeleton, keypoint_names=keypoint_names, - caption=caption, - confirm_type=confirm_type, ) + caption=caption) class Importer(ImportHelper, abc.ABC): @@ -200,8 +198,7 @@ def image_import_annotation(image: dict, keypoint_colors: List[int] = None, keypoint_skeleton: List[int] = None, keypoint_names: List[str] = None, - caption: str = None, - confirm_type: int = 0, ): + caption: str = None): bbox = ImageModel.format_bbox(image["width"], image["height"], bbox) segmentation = ImageModel.format_segmentation(segmentation) @@ -218,7 +215,7 @@ def image_import_annotation(image: dict, category_name=category, caption=caption, bounding_box=bbox, segmentation=segmentation, alpha=alpha_uri, points=points, lines=lines, point_colors=colors, point_names=names, - conf=conf, is_group=is_group, confirm_type=confirm_type) + conf=conf, is_group=is_group) image["objects"].append(anno_obj) def bulk_write_images(self, image_queue: list): @@ -355,7 +352,6 @@ def load_existing_user_data(self): pipeline = [ {"$project": {"flag": 1, "flag_ts": 1, - "label_confirm": 1, "objects": { "$filter": { "input": "$objects", @@ -378,14 +374,10 @@ def load_existing_user_data(self): flag = image.get("flag", 0) flag_ts = image.get("flag_ts", 0) - # manually added confirm flag - label_confirm = image.get("label_confirm", {}) - self._user_data[image_id] = { "objects": user_objects, "flag": flag, "flag_ts": flag_ts, - "label_confirm": label_confirm, } def image_add_user_data(self, image: dict): @@ -401,7 +393,6 @@ def image_add_user_data(self, image: dict): image.setdefault("objects").extend(user_data["objects"]) image["flag"] = user_data["flag"] image["flag_ts"] = user_data["flag_ts"] - image["label_confirm"] = user_data["label_confirm"] def run_import(self): """ diff --git a/deepdataspace/model/image.py b/deepdataspace/model/image.py index 39fd579..50c864e 100644 --- a/deepdataspace/model/image.py +++ b/deepdataspace/model/image.py @@ -90,12 +90,6 @@ class ImageModel(BaseModel): fp counter of image in the format {"label_id": {90:x, 80: y, ..., 10: z}}. Default is an empty dict. num_fp_cat: dict fp counter of image categorized, in the format {"label_id": {"category_id: {90:x, 80: y, ..., 10: z}}}. Default is an empty dict. - label_confirm: dict - Confirm status of every label sets, where confirm can be: - 0 = not confirmed, - 1 = confirmed, - 2 = refine required. - Format is {"label_id": {"confirm": int, "confirm_ts": int}}. Default is an empty dict. """ @classmethod @@ -128,10 +122,6 @@ def get_collection(cls): num_fp: dict = {} # {"label_id": {90:x, 80: y, ..., 10: z}} num_fp_cat: dict = {} # {"label_id": {"category_id: {90:x, 80: y, ..., 10: z}}} - # confirm status of every label sets, confirm: 0 = not confirmed, 1 = confirmed, 2 = refine required - # {"label_id": {"confirm": int, "confirm_ts": int}} - label_confirm: dict = {} - _dataset = None _labels: dict = {} @@ -313,7 +303,6 @@ def _add_annotation(self, keypoint_skeleton: List[int] = None, keypoint_names: List[str] = None, caption: str = None, - confirm_type: int = 0, ): if bbox: if not self.width or not self.height: @@ -336,7 +325,7 @@ def _add_annotation(self, category_name=category, category_id=category_obj.id, caption=caption, bounding_box=bounding_box, segmentation=segmentation, alpha=alpha_uri, points=points, lines=lines, point_colors=colors, point_names=names, - conf=conf, is_group=is_group, confirm_type=confirm_type) + conf=conf, is_group=is_group) self.objects.append(anno_obj) def add_annotation(self, @@ -353,7 +342,6 @@ def add_annotation(self, keypoint_skeleton: List[int] = None, keypoint_names: List[str] = None, caption: str = None, - confirm_type: int = 0, ): """ Add an annotation to the image. @@ -373,13 +361,12 @@ def add_annotation(self, :param keypoint_colors: the key point colors, [255, 0, 0, ...]. :param keypoint_skeleton: the key point skeleton, [0, 1, 2, ...]. :param caption: the caption of the annotation. - :param confirm_type: the confirm_type of the annotation, 0 = not confirmed, 1 = gt may be fn, 2 = pred may be fp """ self._add_annotation(category, label, label_type, conf, is_group, bbox, segmentation, alpha_uri, keypoints, keypoint_colors, keypoint_skeleton, keypoint_names, - caption, confirm_type) + caption) self.save() self._update_dataset(bbox, segmentation, alpha_uri, keypoints) @@ -398,7 +385,7 @@ def batch_add_annotation(self, keypoint_skeleton: List[int] = None, keypoint_names: List[str] = None, caption: str = None, - confirm_type: int = 0, ): + ): """ The batch version of add_annotation. The performance is better if we are saving a lot of annotations. @@ -428,7 +415,6 @@ def batch_add_annotation(self, :param keypoint_colors: the key point colors, [255, 0, 0, ...]. :param keypoint_skeleton: the key point skeleton, [0, 1, 2, ...]. :param caption: the caption of the annotation. - :param confirm_type: the confirm_type of the annotation, 0 = not confirmed, 1 = gt may be fn, 2 = pred may be fp :return: None """ @@ -447,7 +433,7 @@ def batch_add_annotation(self, category_name=category, caption=caption, bounding_box=bbox, segmentation=segmentation, alpha=alpha_uri, points=points, lines=lines, point_colors=colors, point_names=names, - conf=conf, is_group=is_group, confirm_type=confirm_type) + conf=conf, is_group=is_group) self.objects.append(anno_obj) def finish_batch_add_annotation(self): diff --git a/deepdataspace/model/object.py b/deepdataspace/model/object.py index 5855969..a0c7a14 100644 --- a/deepdataspace/model/object.py +++ b/deepdataspace/model/object.py @@ -6,13 +6,10 @@ from typing import Dict from typing import List -from typing import Literal from typing import Optional from typing import Union -from deepdataspace import constants from deepdataspace.model._base import BaseModel -from deepdataspace.utils.file import create_file_url class Object(BaseModel): @@ -52,8 +49,6 @@ class Object(BaseModel): The point names of the object. caption: str The caption of the object. - confirm_type: int - The image confirm type, 0 for unconfirmed, 1 for confirmed, 2 for rejected. compare_result: dict The compare result of the object, {"90": "FP", ..., "10": "OK"}. matched_det_idx: int @@ -86,6 +81,5 @@ def get_collection(cls, *args, **kwargs): point_colors: Optional[List[int]] = [] point_names: Optional[List[str]] = [] caption: Optional[str] = "" - confirm_type: Optional[int] = 0 # the image confirm type, 0 no confirm required, 1 gt may be fn, 2 pred may be fp compare_result: Optional[Dict[str, str]] = {} # {"90": "FP", ..., "10": "OK"} matched_det_idx: Optional[int] = None # The matched ground truth index, for prediction objects only. diff --git a/deepdataspace/plugins/tsv/importer.py b/deepdataspace/plugins/tsv/importer.py index e243ecb..490516b 100644 --- a/deepdataspace/plugins/tsv/importer.py +++ b/deepdataspace/plugins/tsv/importer.py @@ -156,9 +156,6 @@ def load_objects(self, # prepare is_group is_group = bool(obj.get("iscrowd", False)) - # prepare confirm_type - confirm_type = obj.get("confirm_type", None) - # prepare confidence confidence = obj.get("conf", 1.0) @@ -167,7 +164,6 @@ def load_objects(self, label=label_name, label_type=label_type, conf=confidence, is_group=is_group, bbox=bbox, segmentation=segmentation, alpha_uri=alpha, - confirm_type=confirm_type, ) obj_list.append(obj) diff --git a/deepdataspace/server/resources/api_v1/__init__.py b/deepdataspace/server/resources/api_v1/__init__.py index 3681a8d..445f99b 100644 --- a/deepdataspace/server/resources/api_v1/__init__.py +++ b/deepdataspace/server/resources/api_v1/__init__.py @@ -17,7 +17,6 @@ from deepdataspace.server.resources.api_v1.image_flags import ImageFlagsView from deepdataspace.server.resources.api_v1.images import ImagesView from deepdataspace.server.resources.api_v1.label_clone import LabelCloneView -from deepdataspace.server.resources.api_v1.object_confirm import ObjectConfirmView urls = [ path("ping", ping.PingView.as_view()), @@ -29,7 +28,6 @@ path("datasets/<dataset_id>", DatasetView.as_view()), path("image_flags", ImageFlagsView.as_view()), path("label_clone", LabelCloneView.as_view()), - path("object_confirm", ObjectConfirmView.as_view()), path("annotations", AnnotationsView.as_view()), path("comparisons", ComparisonsView.as_view()), path("label_projects", label_tasks.ProjectsView.as_view()), diff --git a/deepdataspace/server/resources/api_v1/annotations.py b/deepdataspace/server/resources/api_v1/annotations.py index 49b97a4..3580e22 100644 --- a/deepdataspace/server/resources/api_v1/annotations.py +++ b/deepdataspace/server/resources/api_v1/annotations.py @@ -118,7 +118,6 @@ def _save_annotations(dataset: DataSet, image, annotations): cur_objs.append(obj) image.objects = cur_objs - image.label_confirm[label_id] = {"confirm": 1, "confirm_ts": int(time.time())} image.save() return saving_categories, saving_labels diff --git a/deepdataspace/server/resources/api_v1/images.py b/deepdataspace/server/resources/api_v1/images.py index da93987..acd303c 100644 --- a/deepdataspace/server/resources/api_v1/images.py +++ b/deepdataspace/server/resources/api_v1/images.py @@ -68,7 +68,6 @@ class ImagesView(BaseAPIView): Argument("dataset_id", str, Argument.QUERY, required=True), Argument("category_id", str, Argument.QUERY, required=False), Argument("flag", int, Argument.QUERY, required=False), - Argument("confirm", int, Argument.QUERY, required=False), Argument("label_id", str, Argument.QUERY, required=False), Argument("page_num", Argument.PositiveInt, Argument.QUERY, default=1), Argument("page_size", Argument.PositiveInt, Argument.QUERY, default=100) @@ -80,7 +79,7 @@ def get(self, request): - GET /api/v1/images """ - dataset_id, category_id, flag, confirm, label_id, page_num, page_size = parse_arguments(request, self.get_args) + dataset_id, category_id, flag, label_id, page_num, page_size = parse_arguments(request, self.get_args) dataset = DataSet.find_one({"_id": dataset_id}) if dataset is None: @@ -101,15 +100,13 @@ def get(self, request): if flag is not None: filters["flag"] = flag - if confirm is not None and label_id is not None: - filters[f"label_confirm.{label_id}.confirm"] = confirm total = Image(dataset_id).count_num(filters) image_list = [] offset = max(0, page_size * (page_num - 1)) - includes = {"id", "idx", "flag", "label_confirm", "objects", "metadata", "type", "width", "height", "url", + includes = {"id", "idx", "flag", "objects", "metadata", "type", "width", "height", "url", "url_full_res"} includes = {i: 1 for i in includes} @@ -123,10 +120,6 @@ def get(self, request): skip=offset, size=page_size, to_dict=True): - - # TODO keep for compatibility, delete this after run op/migrates/add_confirm_fields.py - image.setdefault("label_confirm", {}) - for obj in image["objects"]: obj["source"] = obj["label_type"] # TODO keep for compatibility, delete this in the future diff --git a/deepdataspace/server/resources/api_v1/label_clone.py b/deepdataspace/server/resources/api_v1/label_clone.py index 8dec90b..0542a7e 100644 --- a/deepdataspace/server/resources/api_v1/label_clone.py +++ b/deepdataspace/server/resources/api_v1/label_clone.py @@ -64,12 +64,9 @@ def gen_unique_clone_name(dataset_id, src_label_name, dst_label_name): def clone_images_collection(dataset_id, target_dataset_id, src_label_id, dst_label_id, dst_label_name): for image in Image(dataset_id).find_many({}): cloned = [] - label_confirm = image.label_confirm - label_confirm.setdefault(dst_label_id, {"confirm": 0, "confirm_ts": 0}) for obj in image.objects: label_id = obj.label_id - label_confirm.setdefault(label_id, {"confirm": 0, "confirm_ts": 0}) if obj.label_id != src_label_id: continue @@ -79,7 +76,7 @@ def clone_images_collection(dataset_id, target_dataset_id, src_label_id, dst_lab category_id=obj.category_id, category_name=obj.category_name, conf=obj.conf, is_group=obj.is_group, bounding_box=obj.bounding_box, segmentation=obj.segmentation, alpha=obj.alpha, points=obj.points, lines=obj.lines, point_colors=obj.point_colors, - point_names=obj.point_names, confirm_type=obj.confirm_type, + point_names=obj.point_names ) cloned.append(obj) image.objects.extend(cloned) diff --git a/deepdataspace/server/resources/api_v1/object_confirm.py b/deepdataspace/server/resources/api_v1/object_confirm.py deleted file mode 100644 index aaa736e..0000000 --- a/deepdataspace/server/resources/api_v1/object_confirm.py +++ /dev/null @@ -1,84 +0,0 @@ -""" -deepdataspace.server.resources.api_v1.object_confirm - -RESTful API for object confirm. -""" - -import logging -import time - -from deepdataspace.constants import ErrCode -from deepdataspace.constants import DatasetStatus -from deepdataspace.model import DataSet -from deepdataspace.model import Label -from deepdataspace.model import Object -from deepdataspace.model.image import Image -from deepdataspace.utils.http import Argument -from deepdataspace.utils.http import BaseAPIView -from deepdataspace.utils.http import format_response -from deepdataspace.utils.http import parse_arguments -from deepdataspace.utils.http import raise_exception - -logger = logging.getLogger("django") - - -class ObjectConfirmView(BaseAPIView): - """ - - POST /api/v1/object_confirm - """ - - post_args = [ - Argument("dataset_id", str, Argument.JSON, required=True), - Argument("label_id", str, Argument.JSON, required=True), - Argument("image_id", int, Argument.JSON, required=True), - Argument("confirm", Argument.Choice([1, 2]), Argument.JSON, required=True), - Argument("objects", list, Argument.JSON, required=False), - ] - - def post(self, request): - """ - Confirm a label set of an image. - - - POST /api/v1/object_confirm - """ - - dataset_id, label_id, image_id, confirm, objects = parse_arguments(request, self.post_args) - - dataset = DataSet.find_one({"id": dataset_id}) - if dataset is None: - raise_exception(ErrCode.DatasetNotFound, f"dataset_id[{dataset_id}] not found") - if dataset.status in DatasetStatus.DontRead_: - raise_exception(404, f"dataset_id[{dataset_id}] is in status [{dataset.status}] now, try again later") - - label = Label.find_one({"id": label_id, "dataset_id": dataset_id}) - if label is None: - raise_exception(ErrCode.DatasetLabelNotFound, f"label_id[{label_id}] not found") - - image = Image(dataset_id).find_one({"id": image_id}) - if image is None: - raise_exception(ErrCode.DatasetImageNotFound, f"image_id[{image_id}] not found") - - new_objects = [] - for idx, obj in enumerate(objects): - if obj["label_id"] != label_id: - continue - - try: - obj = Object.from_dict(obj) - obj.compare_result = {} - obj.matched_det_idx = None - except Exception as err: - logger.warning(err) - raise_exception(ErrCode.AnnotationFormatError, f"objects[{idx}] data structure mismatch") - new_objects.append(obj) - - saving_objects = [o for o in image.objects if o.label_id != label_id] - saving_objects.extend(new_objects) - image.objects = saving_objects - - label_confirm = image.label_confirm - confirm_ts = int(time.time()) - label_confirm[label_id] = {"confirm": confirm, "confirm_ts": confirm_ts} - image.save() - - return format_response({"confirm": confirm, "confirm_ts": confirm_ts}) From c46cda039b7d3fbdfa9f8997e8773eec6f1963b9 Mon Sep 17 00:00:00 2001 From: imhuwq <imhuwq@gmail.com> Date: Mon, 26 Feb 2024 15:02:49 +0800 Subject: [PATCH 6/6] delete(comments): delete outdated comments --- deepdataspace/io/importer.py | 2 -- deepdataspace/model/_base.py | 1 - deepdataspace/model/dataset.py | 2 -- 3 files changed, 5 deletions(-) diff --git a/deepdataspace/io/importer.py b/deepdataspace/io/importer.py index 66d70a0..d30c5e5 100644 --- a/deepdataspace/io/importer.py +++ b/deepdataspace/io/importer.py @@ -169,8 +169,6 @@ def dataset_import_image(self, metadata = metadata or {} metadata = json.dumps(metadata) - # if id is not set, - # we use a negative value to indicate we are adding a new image instead of updating an existing one idx = dataset.num_images id_ = id_ if id_ is not None else dataset.num_images image = dict(id=id_, idx=idx, diff --git a/deepdataspace/model/_base.py b/deepdataspace/model/_base.py index 977228d..3c9830b 100644 --- a/deepdataspace/model/_base.py +++ b/deepdataspace/model/_base.py @@ -25,7 +25,6 @@ _batch_lock = {} # a dict of batch operation lock for every collection, {'collection_name': batch_op_lock, } _batch_save_queue = {} # a dict of batch save queue for every collection, {'collection_name': batch_save_queue, } _batch_update_queue = {} # a dict of batch update queue for every collection, {'collection_name': batch_update_queue, } -_batch_insert_queue = {} # a dict of batch insert queue for every collection, {'collection_name': batch_insert_queue, } logger = logging.getLogger("model.base") diff --git a/deepdataspace/model/dataset.py b/deepdataspace/model/dataset.py index 31c16ce..28d8311 100644 --- a/deepdataspace/model/dataset.py +++ b/deepdataspace/model/dataset.py @@ -130,8 +130,6 @@ def create_dataset(cls, :param description_func: an import path of a function to generate description. The function takes the dataset instance as the only argument and returns a string. If this is provided, it proceeds the description str. - :param batch_upsert: If True, images from batch_add_image will be saved by UpdateOne with upsert=True. - otherwise they will be saved by faster InsertOne operation. :return: the dataset object. """