From f78aae47d519e31bc461d2942bd6b9b84d6e8023 Mon Sep 17 00:00:00 2001
From: imhuwq <imhuwq@gmail.com>
Date: Tue, 30 Jan 2024 11:25:55 +0800
Subject: [PATCH 1/6] feature(write concern): set write concern to 0 for batch
 saving

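Write concern w=0 makes these bulk writes fire-and-forget: the driver
returns without waiting for server acknowledgment, which speeds up batch
saving at the cost of not seeing per-write errors. A minimal sketch of
the pattern, with hypothetical client and collection names:

    from pymongo import MongoClient, WriteConcern

    co = MongoClient()["dds"]["images"]
    # unacknowledged writes: calls return before the server confirms them
    co = co.with_options(write_concern=WriteConcern(w=0))
    co.insert_one({"idx": 0})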
---
 deepdataspace/model/_base.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/deepdataspace/model/_base.py b/deepdataspace/model/_base.py
index d45b713..1b30558 100644
--- a/deepdataspace/model/_base.py
+++ b/deepdataspace/model/_base.py
@@ -13,6 +13,7 @@
 from typing import Tuple
 
 from pydantic import BaseModel as _Base
+from pymongo import WriteConcern
 from pymongo.collection import Collection
 from pymongo.operations import UpdateOne
 from pymongo.typings import _DocumentType
@@ -232,6 +233,8 @@ def batch_update(cls, filters: dict, set_data: dict = None, unset_data: dict = N
         co = cls.get_collection()
         if co is None:
             return None
+        wc = WriteConcern(w=0)
+        co = co.with_options(write_concern=wc)
 
         op = UpdateOne(filters, {"$set": set_data, "$unset": unset_data})
 
@@ -257,6 +260,8 @@ def finish_batch_update(cls):
         op_lock = cls._get_batch_op_lock()
         with op_lock:
             co = cls.get_collection()
+            wc = WriteConcern(w=0)
+            co = co.with_options(write_concern=wc)
             queue = _batch_update_queue.setdefault(cls_id, [])
             if queue:
                 co.bulk_write(queue)
@@ -310,6 +315,8 @@ def batch_save(self, batch_size: int = 20, set_on_insert: Dict = None):
         co = cls.get_collection()
         if co is None:
             return None
+        wc = WriteConcern(w=0)
+        co = co.with_options(write_concern=wc)
 
         _id = self.__dict__.get("id", None)
         if _id is None:
@@ -348,6 +355,8 @@ def finish_batch_save(cls):
         op_lock = _batch_lock[cls_id]
         with op_lock:
             co = cls.get_collection()
+            wc = WriteConcern(w=0)
+            co = co.with_options(write_concern=wc)
             queue = _batch_save_queue.setdefault(cls_id, [])
             if queue:
                 co.bulk_write(queue)

From 6fe9bae0a75cd566cb9b129ce9f1b1714e81cd8c Mon Sep 17 00:00:00 2001
From: imhuwq <imhuwq@gmail.com>
Date: Wed, 21 Feb 2024 10:50:26 +0800
Subject: [PATCH 2/6] feature(batch insert): support batch insert for dataset
 importing

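batch_create buffers InsertOne operations per collection and flushes them
with bulk_write once the batch is full; finish_batch_create flushes the
remainder. It assumes every document is new, so it skips the upsert
filter that batch_save pays for. A usage sketch, where Item stands for a
hypothetical BaseModel subclass and rows for its input dicts:

    for data in rows:
        item = Item(**data)
        Item.batch_create(item, batch_size=200)
    Item.finish_batch_create()  # flush whatever is left in the queue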
---
 deepdataspace/io/importer.py   | 19 +++++------
 deepdataspace/model/_base.py   | 59 ++++++++++++++++++++++++++++++++++
 deepdataspace/model/dataset.py | 33 +++++++++++++------
 3 files changed, 93 insertions(+), 18 deletions(-)

diff --git a/deepdataspace/io/importer.py b/deepdataspace/io/importer.py
index b3f01c8..6da2225 100644
--- a/deepdataspace/io/importer.py
+++ b/deepdataspace/io/importer.py
@@ -108,7 +108,7 @@ def __init__(self, name: str, id_: str = None):
         """
 
         self.dataset_name = name
-        self.dataset = DataSet.get_importing_dataset(name, id_=id_)
+        self.dataset = DataSet.get_importing_dataset(name, id_=id_, batch_upsert=False)
 
         self._image_queue = {}
         self._label_queue = {}
@@ -123,6 +123,7 @@ def pre_run(self):
         self.load_existing_user_data()
         self.dataset.status = constants.DatasetStatus.Importing
         self.dataset.save()
+        Image(self.dataset.id).get_collection().drop()
 
     def post_run(self):
         """
@@ -152,14 +153,14 @@ def load_existing_user_data(self):
         """
 
         pipeline = [
-            {"$project": {"flag"         : 1,
-                          "flag_ts"      : 1,
+            {"$project": {"flag": 1,
+                          "flag_ts": 1,
                           "label_confirm": 1,
-                          "objects"      : {
+                          "objects": {
                               "$filter": {
                                   "input": "$objects",
-                                  "as"   : "object",
-                                  "cond" : {
+                                  "as": "object",
+                                  "cond": {
                                       "$eq": ["$$object.label_type", LabelType.User]
                                   }
                               }
@@ -181,9 +182,9 @@ def load_existing_user_data(self):
             label_confirm = image.get("label_confirm", {})
 
             self._user_data[image_id] = {
-                "objects"      : user_objects,
-                "flag"         : flag,
-                "flag_ts"      : flag_ts,
+                "objects": user_objects,
+                "flag": flag,
+                "flag_ts": flag_ts,
                 "label_confirm": label_confirm,
             }
 
diff --git a/deepdataspace/model/_base.py b/deepdataspace/model/_base.py
index 1b30558..7ecdf77 100644
--- a/deepdataspace/model/_base.py
+++ b/deepdataspace/model/_base.py
@@ -15,6 +15,7 @@
 from pydantic import BaseModel as _Base
 from pymongo import WriteConcern
 from pymongo.collection import Collection
+from pymongo.operations import InsertOne
 from pymongo.operations import UpdateOne
 from pymongo.typings import _DocumentType
 
@@ -24,6 +25,7 @@
 _batch_lock = {}  # a dict of batch operation lock for every collection, {'collection_name': batch_op_lock, }
 _batch_save_queue = {}  # a dict of batch save queue for every collection, {'collection_name': batch_save_queue, }
 _batch_update_queue = {}  # a dict of batch update queue for every collection, {'collection_name': batch_update_queue, }
+_batch_insert_queue = {}  # a dict of batch insert queue for every collection, {'collection_name': batch_insert_queue, }
 
 
 def current_ts():
@@ -362,6 +364,63 @@ def finish_batch_save(cls):
                 co.bulk_write(queue)
                 _batch_save_queue[cls_id] = []
 
+    @classmethod
+    def batch_create(cls, model_obj: "BaseModel", batch_size: int = 20):
+        """
+        The same as the batch_save method,
+        but faster, because it saves with InsertOne instead of UpdateOne with upsert.
+
+        :param model_obj: the model object to save.
+        :param batch_size: the batch size. We will only write to mongodb when the batch is full.
+        """
+
+        model_obj.post_init()
+
+        co = cls.get_collection()
+        if co is None:
+            return None
+        wc = WriteConcern(w=0)
+        co = co.with_options(write_concern=wc)
+
+        _id = model_obj.__dict__.get("id", None)
+        if _id is None:
+            return None
+
+        data = model_obj.to_dict()
+        data["_id"] = data.pop("id", None)
+
+        op = InsertOne(data)
+
+        cls_id = cls.get_cls_id()
+        op_lock = cls._get_batch_op_lock()
+        with op_lock:
+            queue = _batch_insert_queue.setdefault(cls_id, [])
+            queue.append(op)
+            if len(queue) >= batch_size:
+                co.bulk_write(queue)
+                _batch_insert_queue[cls_id] = []
+
+    @classmethod
+    def finish_batch_create(cls):
+        """
+        This must be called after all the batch_create calls.
+        """
+
+        cls_id = cls.get_cls_id()
+        if _batch_lock.get(cls_id, None) is None:
+            with _lock_lock:
+                _batch_lock[cls_id] = Lock()
+
+        op_lock = _batch_lock[cls_id]
+        with op_lock:
+            co = cls.get_collection()
+            wc = WriteConcern(w=0)
+            co = co.with_options(write_concern=wc)
+            queue = _batch_insert_queue.setdefault(cls_id, [])
+            if queue:
+                co.bulk_write(queue)
+                _batch_insert_queue[cls_id] = []
+
     def delete(self):
         """
         Delete current object from mongodb.
diff --git a/deepdataspace/model/dataset.py b/deepdataspace/model/dataset.py
index b99f690..370f20d 100644
--- a/deepdataspace/model/dataset.py
+++ b/deepdataspace/model/dataset.py
@@ -104,6 +104,9 @@ def get_collection(cls, *args, **kwargs) -> Collection[_DocumentType]:
 
     _batch_queue: Dict[int, ImageModel] = {}
     _batch_size: int = 100
+    # If True, images from batch_add_image will be saved by UpdateOne with upsert=True.
+    # If False, images from batch_add_image will be saved by InsertOne, which is faster.
+    _batch_upsert: bool = True
 
     @classmethod
     def create_dataset(cls,
@@ -114,6 +117,7 @@ def create_dataset(cls,
                        files: dict = None,
                        description: str = None,
                        description_func: str = None,
+                       batch_upsert: bool = True,
                        ) -> "DataSet":
         """
         Create a dataset.
@@ -129,6 +133,8 @@ def create_dataset(cls,
         :param description_func: an import path of a function to generate description.
             The function takes the dataset instance as the only argument and returns a string.
             If this is provided, it proceeds the description str.
+        :param batch_upsert: If True, images from batch_add_image will be saved by UpdateOne with upsert=True;
+                             otherwise they will be saved by the faster InsertOne operation.
         :return: the dataset object.
         """
 
@@ -148,6 +154,7 @@ def create_dataset(cls,
         dataset = cls(name=name, id=id_, type=type, path=path,
                       files=files, status=DatasetStatus.Ready,
                       description=description, description_func=description_func)
+        dataset._batch_upsert = batch_upsert
         dataset.post_init()
         dataset.save()
         return dataset
@@ -159,6 +166,7 @@ def get_importing_dataset(cls,
                               type: str = None,
                               path: str = None,
                               files: dict = None,
+                              batch_upsert: bool = True,
                               ) -> "DataSet":
         """
         This is the same as create_dataset.
@@ -179,6 +187,7 @@ def get_importing_dataset(cls,
 
         files = files or {}
         dataset = cls(name=name, id=id_, type=type, path=path, files=files, status=DatasetStatus.Waiting)
+        dataset._batch_upsert = batch_upsert
         dataset.post_init()
         dataset.save()
         return dataset
@@ -384,14 +393,20 @@ def _batch_save_image_batch(self):
             # setup image
             image.idx = idx
             image.id = idx if image.id < 0 else image.id
-            image.batch_save(batch_size=self._batch_size, set_on_insert={"idx": image.idx})
+            if self._batch_upsert is True:
+                image.batch_save(batch_size=self._batch_size, set_on_insert={"idx": image.idx})
+            else:
+                IModel.batch_create(image, batch_size=self._batch_size)
             idx += 1
 
             self._add_local_file_url_to_whitelist(image.url, whitelist_dirs)
             self._add_local_file_url_to_whitelist(image.url_full_res, whitelist_dirs)
 
         # finish batch saves
-        IModel.finish_batch_save()
+        if self._batch_upsert is True:
+            IModel.finish_batch_save()
+        else:
+            IModel.finish_batch_create()
         Label.finish_batch_save()
         Category.finish_batch_save()
 
@@ -406,9 +421,9 @@ def _batch_save_image_batch(self):
 
         self._batch_queue.clear()
 
-    def batch_save_image(self, enforce: bool = False):
+    def batch_save_image(self):
         batch_is_full = len(self._batch_queue) >= self._batch_size
-        if batch_is_full or enforce:
+        if batch_is_full:
             self._batch_save_image_batch()
             return True
         return False
@@ -449,16 +464,16 @@ def cascade_delete(dataset: "DataSet"):
             return
 
         dataset_id = dataset.id
-        print(f"dataset [{dataset_id}] is found, deleting...")
+        logger.info(f"dataset [{dataset_id}] is found, deleting...")
 
-        print(f"dataset [{dataset_id}] is found, deleting categories...")
+        logger.info(f"dataset [{dataset_id}] is found, deleting categories...")
         Category.delete_many({"dataset_id": dataset_id})
 
-        print(f"dataset [{dataset_id}] is found, deleting labels...")
+        logger.info(f"dataset [{dataset_id}] is found, deleting labels...")
         Label.delete_many({"dataset_id": dataset_id})
 
-        print(f"dataset [{dataset_id}] is found, deleting images...")
+        logger.info(f"dataset [{dataset_id}] is found, deleting images...")
         Image(dataset_id).get_collection().drop()
 
         DataSet.delete_many({"id": dataset_id})
-        print(f"dataset [{dataset_id}] is deleted.")
+        logger.info(f"dataset [{dataset_id}] is deleted.")

From 794e71fded52f04329263aa014b844e39b5a2f8c Mon Sep 17 00:00:00 2001
From: imhuwq <imhuwq@gmail.com>
Date: Fri, 23 Feb 2024 18:15:55 +0800
Subject: [PATCH 3/6] refactor(import dataset): import images as raw Python
 dicts instead of pydantic models

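Image documents are now built as plain dicts, queued on the importer and
bulk-inserted with insert_many, which skips pydantic validation and the
per-image model round trips on the hot path. A sketch of the new flow,
where MyImporter stands for a hypothetical concrete Importer subclass
and the uri is made up:

    importer = MyImporter("my-dataset")
    image = importer.dataset_import_image(importer.dataset,
                                          uri="file:///data/0001.jpg",
                                          width=640, height=480)
    importer.image_import_annotation(image, category="person",
                                     bbox=(10, 20, 100, 200))
    importer.dataset_flush_importing()   # writes only when the batch is full
    importer.dataset_finish_importing()  # flush the rest, then add a cover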
---
 deepdataspace/globals.py                    |  11 +-
 deepdataspace/io/importer.py                | 253 +++++++++++++++++---
 deepdataspace/model/_base.py                |  73 +-----
 deepdataspace/model/dataset.py              |  73 ++----
 deepdataspace/model/image.py                |  58 ++---
 deepdataspace/model/label_task.py           |   3 -
 deepdataspace/model/object.py               |  19 --
 deepdataspace/plugins/coco2017/importer.py  |  13 +-
 deepdataspace/scripts/migrate/2023061401.py |   2 +-
 deepdataspace/services/mongodb.py           |   2 +-
 10 files changed, 277 insertions(+), 230 deletions(-)

diff --git a/deepdataspace/globals.py b/deepdataspace/globals.py
index c03edd6..23f118a 100644
--- a/deepdataspace/globals.py
+++ b/deepdataspace/globals.py
@@ -41,17 +41,8 @@
 _mongo_user = urllib.parse.quote_plus(MONGODB_USER)
 _mongo_pass = urllib.parse.quote_plus(MONGODB_PASS)
 _mongo_url = f"mongodb://{_mongo_user}:{_mongo_pass}@{MONGODB_HOST}:{MONGODB_PORT}/{MONGODB_DBNAME}"
-_mongo_client = MongoClient(_mongo_url, authMechanism="SCRAM-SHA-256")
+_mongo_client = MongoClient(_mongo_url, authMechanism="SCRAM-SHA-256", maxPoolSize=None)
 MongoDB = _mongo_client[MONGODB_DBNAME]
 
 # init redis client
 Redis = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DBNAME, password=REDIS_PASS)
-
-# init sentry client
-# TODO: sentry is not necessary for dds tool, remove it as soon as possible
-if SENTRY_DSN is not None:
-    sample_rate = 0.1 if ENV == RunningEnv.Prod else 1.0
-    sentry_sdk.init(dsn=SENTRY_DSN,
-                    traces_sample_rate=sample_rate,
-                    environment=ENV, )
-    sentry_sdk.set_tag("os.user", get_os_username())
diff --git a/deepdataspace/io/importer.py b/deepdataspace/io/importer.py
index 6da2225..e176906 100644
--- a/deepdataspace/io/importer.py
+++ b/deepdataspace/io/importer.py
@@ -6,25 +6,35 @@
 
 import abc
 import copy
+import json
 import logging
 import os
-import time
+import uuid
 from typing import Dict
 from typing import List
+from typing import Literal
 from typing import Tuple
 from typing import Type
 from typing import Union
 
 from tqdm import tqdm
+from pymongo import WriteConcern
 
 from deepdataspace import constants
+from deepdataspace.constants import AnnotationType
 from deepdataspace.constants import DatasetFileType
+from deepdataspace.constants import DatasetStatus
+from deepdataspace.constants import FileReadMode
 from deepdataspace.constants import LabelName
 from deepdataspace.constants import LabelType
-from deepdataspace.model import Category
+from deepdataspace.constants import RedisKey
+from deepdataspace.globals import Redis
 from deepdataspace.model import DataSet
-from deepdataspace.model import Label
+from deepdataspace.model.category import Category
 from deepdataspace.model.image import Image
+from deepdataspace.model.image import ImageModel
+from deepdataspace.model.label import Label
+from deepdataspace.utils.file import create_file_url
 from deepdataspace.utils.function import count_block_time
 from deepdataspace.utils.string import get_str_md5
 
@@ -106,14 +116,206 @@ def __init__(self, name: str, id_: str = None):
         :param id_: the dataset id.
             If provided, the importer will try to update an existing dataset instead of creating a new one.
         """
-
         self.dataset_name = name
-        self.dataset = DataSet.get_importing_dataset(name, id_=id_, batch_upsert=False)
+        self.dataset = self.get_importing_dataset(name, id=id_)
 
         self._image_queue = {}
         self._label_queue = {}
         self._category_queue = {}
         self._user_data = {}  # {image_id: {}}
+        self._import_queue = {}
+        self._batch_size = 200
+        self.IModel = Image(self.dataset.id)
+
+    @staticmethod
+    def get_importing_dataset(name: str,
+                              id: str = None,
+                              type: str = None,
+                              path: str = None,
+                              files: dict = None,
+                              ) -> "DataSet":
+        if id:
+            dataset = DataSet.find_one({"id": id})
+            if dataset is not None:
+                dataset.type = type or dataset.type
+                dataset.path = path or dataset.path
+                dataset.files = files or dataset.files
+                dataset.name = name
+                dataset.save()
+                return dataset
+        else:
+            id = uuid.uuid4().hex
+
+        files = files or {}
+        dataset = DataSet(name=name, id=id, type=type, path=path, files=files, status=DatasetStatus.Waiting)
+        dataset.save()
+        return dataset
+
+    def dataset_import_image(self,
+                             dataset: DataSet,
+                             uri: str,
+                             thumb_uri: str = None,
+                             width: int = None,
+                             height: int = None,
+                             id_: int = None,
+                             metadata: dict = None,
+                             flag: int = 0,
+                             flag_ts: int = 0, ) -> dict:
+        full_uri = uri
+        thumb_uri = full_uri if thumb_uri is None else thumb_uri
+        if full_uri.startswith("file://"):
+            full_uri = create_file_url(full_uri[7:], read_mode=FileReadMode.Binary)
+        if thumb_uri.startswith("file://"):
+            thumb_uri = create_file_url(thumb_uri[7:], read_mode=FileReadMode.Binary)
+
+        metadata = metadata or {}
+        metadata = json.dumps(metadata)
+
+        # if id_ is not set, default it to the current image count,
+        # so new images get sequential ids in insertion order
+        idx = dataset.num_images
+        id_ = id_ if id_ is not None else dataset.num_images
+        image = dict(id=id_, idx=idx,
+                     type=dataset.type, dataset_id=dataset.id,
+                     url=thumb_uri, url_full_res=full_uri,
+                     width=width, height=height,
+                     flag=flag, flag_ts=flag_ts,
+                     objects=[], metadata=metadata, )
+
+        self._import_queue[id_] = image
+        dataset.num_images += 1
+        return image
+
+    @staticmethod
+    def image_import_annotation(image: dict,
+                                category: str,
+                                label: str = LabelName.GroundTruth,
+                                label_type: Literal["GT", "Pred", "User"] = "GT",
+                                conf: float = 1.0,
+                                is_group: bool = False,
+                                bbox: Tuple[int, int, int, int] = None,
+                                segmentation: List[List[int]] = None,
+                                alpha_uri: str = None,
+                                keypoints: List[Union[float, int]] = None,
+                                keypoint_colors: List[int] = None,
+                                keypoint_skeleton: List[int] = None,
+                                keypoint_names: List[str] = None,
+                                caption: str = None,
+                                confirm_type: int = 0, ):
+
+        bbox = ImageModel.format_bbox(image["width"], image["height"], bbox)
+        segmentation = ImageModel.format_segmentation(segmentation)
+        points, colors, lines, names = ImageModel.format_keypoints(keypoints,
+                                                                   keypoint_colors,
+                                                                   keypoint_skeleton,
+                                                                   keypoint_names)
+        if alpha_uri and alpha_uri.startswith("file://"):
+            alpha_path = alpha_uri[7:]
+            alpha_uri = create_file_url(file_path=alpha_path,
+                                        read_mode=FileReadMode.Binary)
+
+        anno_obj = dict(label_name=label, label_type=label_type,
+                        category_name=category, caption=caption,
+                        bounding_box=bbox, segmentation=segmentation, alpha=alpha_uri,
+                        points=points, lines=lines, point_colors=colors, point_names=names,
+                        conf=conf, is_group=is_group, confirm_type=confirm_type)
+        image["objects"].append(anno_obj)
+
+    def bulk_write_images(self, image_queue: list):
+        co = self.IModel.get_collection()
+        wc = WriteConcern(w=0)
+        co = co.with_options(write_concern=wc)
+        co.insert_many(image_queue)
+
+    def _dataset_flush_importing(self):
+        if not self._import_queue:
+            return
+
+        with count_block_time("prepare batch setup", logger.debug):
+            dataset_id = self.dataset.id
+            waiting_labels = dict()
+            waiting_categories = dict()
+            object_types = set()
+            whitelist_dirs = set()
+
+        with count_block_time("prepare batch data", logger.debug):
+            image_insert_queue = []
+
+            for image_id, image in self._import_queue.items():
+                for obj in image["objects"]:
+                    # setup label
+                    label_name = obj["label_name"]
+                    label_id = waiting_labels.get(label_name, None)
+                    if label_id is None:
+                        label_id = get_str_md5(f"{dataset_id}_{label_name}")
+                        label = Label(name=label_name, id=label_id, type=obj["label_type"], dataset_id=dataset_id)
+                        label.batch_save(batch_size=self._batch_size)
+                        waiting_labels[label_name] = label_id
+                    obj["label_id"] = label_id
+
+                    # setup category
+                    cat_name = obj["category_name"]
+                    category_id = waiting_categories.get(cat_name, None)
+                    if category_id is None:
+                        category_id = get_str_md5(f"{dataset_id}_{cat_name}")
+                        category = Category(name=cat_name, id=category_id, dataset_id=dataset_id)
+                        category.batch_save(batch_size=self._batch_size)
+                        waiting_categories[cat_name] = category_id
+                    obj["category_id"] = category_id
+
+                    # setup object types
+                    if AnnotationType.Classification not in object_types:
+                        object_types.add(AnnotationType.Classification)
+                    if obj["bounding_box"] and AnnotationType.Detection not in object_types:
+                        object_types.add(AnnotationType.Detection)
+                    if obj["segmentation"] and AnnotationType.Segmentation not in object_types:
+                        object_types.add(AnnotationType.Segmentation)
+                    if obj["alpha"] and AnnotationType.Matting not in object_types:
+                        object_types.add(AnnotationType.Matting)
+                        DataSet.add_local_file_url_to_whitelist(obj["alpha"], whitelist_dirs)
+                    if obj["points"] and AnnotationType.KeyPoints not in object_types:
+                        object_types.add(AnnotationType.KeyPoints)
+
+                # setup image
+                image["_id"] = image.pop("id")
+                image_insert_queue.append(image)
+
+                DataSet.add_local_file_url_to_whitelist(image["url"], whitelist_dirs)
+                DataSet.add_local_file_url_to_whitelist(image["url_full_res"], whitelist_dirs)
+
+        # finish batch saves
+        with count_block_time("finish batch save", logger.debug):
+            Label.finish_batch_save()
+            Category.finish_batch_save()
+            self.bulk_write_images(image_insert_queue)
+
+        # setup dataset
+        with count_block_time("setup dataset", logger.debug):
+            self.dataset.object_types = sorted(object_types)
+            self.dataset.save()
+
+        # save whitelist to redis
+        with count_block_time("save whitelist", logger.debug):
+            if whitelist_dirs:
+                Redis.sadd(RedisKey.DatasetImageDirs, *whitelist_dirs)
+
+            self._import_queue.clear()
+
+    def dataset_flush_importing(self):
+        batch_is_full = len(self._import_queue) >= self._batch_size
+        if batch_is_full:
+            with count_block_time("_dataset_flush_importing", logger.debug):
+                self._dataset_flush_importing()
+            return True
+        return False
+
+    def dataset_finish_importing(self):
+        """
+        This must be called after all dataset_import_image calls are finished.
+        It saves any images remaining in the buffer queue to the database.
+        """
+        self._dataset_flush_importing()
+        self.dataset.add_cover()
 
     def pre_run(self):
         """
@@ -121,6 +323,7 @@ def pre_run(self):
         """
 
         self.load_existing_user_data()
+        self.dataset.num_images = 0
         self.dataset.status = constants.DatasetStatus.Importing
         self.dataset.save()
         Image(self.dataset.id).get_collection().drop()
@@ -129,9 +332,9 @@ def post_run(self):
         """
         A post-run hook for subclass importers to clean up data.
         """
-
-        self.dataset.status = constants.DatasetStatus.Ready
-        self.dataset.save()
+        self.dataset.add_cover()
+        DataSet.update_one({"id": self.dataset.id}, {"status": DatasetStatus.Ready})
+        self.dataset = DataSet.find_one({"id": self.dataset.id})
 
     def on_error(self, err: Exception):
         """
@@ -139,11 +342,8 @@ def on_error(self, err: Exception):
         """
 
         try:
-            dataset_id = self.dataset.id
-            Label.delete_many({"dataset_id": dataset_id})
-            Category.delete_many({"dataset_id": dataset_id})
-            Image(dataset_id).get_collection().drop()
-            self.dataset.delete()
+            DataSet.cascade_delete(self.dataset)
+            self.dataset = None
         finally:
             raise err
 
@@ -188,20 +388,20 @@ def load_existing_user_data(self):
                 "label_confirm": label_confirm,
             }
 
-    def add_user_data(self, image):
+    def image_add_user_data(self, image: dict):
         """
         Save manually added user data back.
         """
 
-        image_id = image.id
+        image_id = image["id"]
         user_data = self._user_data.pop(image_id, {})
         if not user_data:
             return
 
-        image.objects.extend(user_data["objects"])
-        image.flag = user_data["flag"]
-        image.flag_ts = user_data["flag_ts"]
-        image.label_confirm = user_data["label_confirm"]
+        image.setdefault("objects").extend(user_data["objects"])
+        image["flag"] = user_data["flag"]
+        image["flag_ts"] = user_data["flag_ts"]
+        image["label_confirm"] = user_data["label_confirm"]
 
     def run_import(self):
         """
@@ -211,15 +411,13 @@ def run_import(self):
 
         desc = f"dataset[{self.dataset.name}@{self.dataset.id}] import progress"
         for (image, anno_list) in tqdm(self, desc=desc, unit=" images"):
-            beg = int(time.time() * 1000)
-            image = self.dataset.batch_add_image(**image)
-            self.add_user_data(image)
+            image = self.dataset_import_image(self.dataset, **image)
+            self.image_add_user_data(image)
             for anno in anno_list:
-                image.batch_add_annotation(**anno)
-            image.finish_batch_add_annotation()
-            logger.debug(f"time cost of import one image: {int(time.time() * 1000) - beg}ms")
-            logger.debug(f"imported image, id={image.id}, url={image.url}")
-        self.dataset.finish_batch_add_image()
+                self.image_import_annotation(image, **anno)
+            self.dataset_flush_importing()
+
+        self.dataset_finish_importing()
 
     def run(self):
         """
diff --git a/deepdataspace/model/_base.py b/deepdataspace/model/_base.py
index 7ecdf77..977228d 100644
--- a/deepdataspace/model/_base.py
+++ b/deepdataspace/model/_base.py
@@ -5,6 +5,7 @@
 """
 
 import abc
+import logging
 import time
 from threading import Lock
 from typing import ClassVar
@@ -15,7 +16,6 @@
 from pydantic import BaseModel as _Base
 from pymongo import WriteConcern
 from pymongo.collection import Collection
-from pymongo.operations import InsertOne
 from pymongo.operations import UpdateOne
 from pymongo.typings import _DocumentType
 
@@ -27,6 +27,8 @@
 _batch_update_queue = {}  # a dict of batch update queue for every collection, {'collection_name': batch_update_queue, }
 _batch_insert_queue = {}  # a dict of batch insert queue for every collection, {'collection_name': batch_insert_queue, }
 
+logger = logging.getLogger("model.base")
+
 
 def current_ts():
     """
@@ -56,12 +58,6 @@ def get_collection(cls, *args, **kwargs) -> Collection[_DocumentType]:
 
         raise NotImplementedError
 
-    def post_init(self):
-        """
-        Post init hook for initializing a model object.
-        """
-        pass
-
     @classmethod
     def from_dict(cls, data: dict):
         """
@@ -69,7 +65,6 @@ def from_dict(cls, data: dict):
         """
 
         obj = cls.parse_obj(data)
-        obj.post_init()
         return obj
 
     def to_dict(self, include: list = None, exclude: list = None):
@@ -282,7 +277,6 @@ def save(self, refresh=False):
         If refresh is True, the object will be re-fetched from mongodb after saving.
         """
 
-        self.post_init()
         co = self.get_collection()
         if co is None:
             return None
@@ -300,7 +294,6 @@ def save(self, refresh=False):
             new_self = co.find_one({"_id": _id})
             new_self.pop("_id", None)
             self.__dict__.update(new_self)
-            self.post_init()
         return self
 
     def batch_save(self, batch_size: int = 20, set_on_insert: Dict = None):
@@ -310,9 +303,6 @@ def batch_save(self, batch_size: int = 20, set_on_insert: Dict = None):
         :param batch_size: the batch size. We will only write to mongodb when the batch is full.
         :param set_on_insert: the fields only need to be set when we are inserting a new object.
         """
-
-        self.post_init()
-
         cls = self.__class__
         co = cls.get_collection()
         if co is None:
@@ -364,63 +354,6 @@ def finish_batch_save(cls):
                 co.bulk_write(queue)
                 _batch_save_queue[cls_id] = []
 
-    @classmethod
-    def batch_create(cls, model_obj: "BaseModel", batch_size: int = 20):
-        """
-        The same as the batch_save method,
-        but faster, because it saves with InsertOne instead of UpdateOne with upsert.
-
-        :param model_obj: the model object to save.
-        :param batch_size: the batch size. We will only write to mongodb when the batch is full.
-        """
-
-        model_obj.post_init()
-
-        co = cls.get_collection()
-        if co is None:
-            return None
-        wc = WriteConcern(w=0)
-        co = co.with_options(write_concern=wc)
-
-        _id = model_obj.__dict__.get("id", None)
-        if _id is None:
-            return None
-
-        data = model_obj.to_dict()
-        data["_id"] = data.pop("id", None)
-
-        op = InsertOne(data)
-
-        cls_id = cls.get_cls_id()
-        op_lock = cls._get_batch_op_lock()
-        with op_lock:
-            queue = _batch_insert_queue.setdefault(cls_id, [])
-            queue.append(op)
-            if len(queue) >= batch_size:
-                co.bulk_write(queue)
-                _batch_insert_queue[cls_id] = []
-
-    @classmethod
-    def finish_batch_create(cls):
-        """
-        This must be called after all the batch_create calls.
-        """
-
-        cls_id = cls.get_cls_id()
-        if _batch_lock.get(cls_id, None) is None:
-            with _lock_lock:
-                _batch_lock[cls_id] = Lock()
-
-        op_lock = _batch_lock[cls_id]
-        with op_lock:
-            co = cls.get_collection()
-            wc = WriteConcern(w=0)
-            co = co.with_options(write_concern=wc)
-            queue = _batch_insert_queue.setdefault(cls_id, [])
-            if queue:
-                co.bulk_write(queue)
-                _batch_insert_queue[cls_id] = []
-
     def delete(self):
         """
         Delete current object from mongodb.
diff --git a/deepdataspace/model/dataset.py b/deepdataspace/model/dataset.py
index 370f20d..31c16ce 100644
--- a/deepdataspace/model/dataset.py
+++ b/deepdataspace/model/dataset.py
@@ -27,6 +27,7 @@
 from deepdataspace.model.image import ImageModel
 from deepdataspace.model.label import Label
 from deepdataspace.utils.file import create_file_url
+from deepdataspace.utils.function import count_block_time
 from deepdataspace.utils.string import get_str_md5
 
 logger = logging.getLogger("io.model.dataset")
@@ -103,10 +104,7 @@ def get_collection(cls, *args, **kwargs) -> Collection[_DocumentType]:
     group_name: str = None
 
     _batch_queue: Dict[int, ImageModel] = {}
-    _batch_size: int = 100
-    # If True, images from batch_add_image will be saved by UpdateOne with upsert=True.
-    # If False, images from batch_add_image will be saved by InsertOne, which is faster.
-    _batch_upsert: bool = True
+    _batch_size: int = 200
 
     @classmethod
     def create_dataset(cls,
@@ -117,7 +115,6 @@ def create_dataset(cls,
                        files: dict = None,
                        description: str = None,
                        description_func: str = None,
-                       batch_upsert: bool = True,
                        ) -> "DataSet":
         """
         Create a dataset.
@@ -145,6 +142,7 @@ def create_dataset(cls,
                 dataset.path = path or dataset.path
                 dataset.files = files or dataset.files
                 dataset.name = name
+                dataset.num_images = Image(dataset.id).count_num({})
                 dataset.save()
                 return dataset
         else:
@@ -154,45 +152,11 @@ def create_dataset(cls,
         dataset = cls(name=name, id=id_, type=type, path=path,
                       files=files, status=DatasetStatus.Ready,
                       description=description, description_func=description_func)
-        dataset._batch_upsert = batch_upsert
-        dataset.post_init()
+        dataset.num_images = Image(dataset.id).count_num({})
         dataset.save()
         return dataset
 
-    @classmethod
-    def get_importing_dataset(cls,
-                              name: str,
-                              id_: str = None,
-                              type: str = None,
-                              path: str = None,
-                              files: dict = None,
-                              batch_upsert: bool = True,
-                              ) -> "DataSet":
-        """
-        This is the same as create_dataset.
-        But if the dataset is new, it's status will be set to "waiting" instead of "ready".
-        """
-
-        if id_:
-            dataset = DataSet.find_one({"id": id_})
-            if dataset is not None:
-                dataset.type = type or dataset.type
-                dataset.path = path or dataset.path
-                dataset.files = files or dataset.files
-                dataset.name = name
-                dataset.save()
-                return dataset
-        else:
-            id_ = uuid.uuid4().hex
-
-        files = files or {}
-        dataset = cls(name=name, id=id_, type=type, path=path, files=files, status=DatasetStatus.Waiting)
-        dataset._batch_upsert = batch_upsert
-        dataset.post_init()
-        dataset.save()
-        return dataset
-
-    def _add_cover(self, force_update: bool = False):
+    def add_cover(self, force_update: bool = False):
         has_cover = bool(self.cover_url)
         if has_cover and not force_update:
             return
@@ -266,17 +230,16 @@ def add_image(self,
             image.flag = flag or image.flag
             image.flag_ts = flag_ts or image.flag_ts
             image.metadata = metadata or image.metadata
-        image.post_init()
         image._dataset = self  # this saves a db query
 
         image.save()
         self.num_images = Model.count_num({})
-        self._add_cover()
+        self.add_cover()
 
         # save whitelist to redis
         whitelist_dirs = set()
-        self._add_local_file_url_to_whitelist(image.url, whitelist_dirs)
-        self._add_local_file_url_to_whitelist(image.url_full_res, whitelist_dirs)
+        self.add_local_file_url_to_whitelist(image.url, whitelist_dirs)
+        self.add_local_file_url_to_whitelist(image.url_full_res, whitelist_dirs)
         if whitelist_dirs:
             Redis.sadd(RedisKey.DatasetImageDirs, *whitelist_dirs)
 
@@ -335,7 +298,7 @@ def batch_add_image(self,
         return image
 
     @staticmethod
-    def _add_local_file_url_to_whitelist(url: str, whitelist: set):
+    def add_local_file_url_to_whitelist(url: str, whitelist: set):
         if not url or not url.startswith("/files/local_files"):
             return
 
@@ -386,27 +349,21 @@ def _batch_save_image_batch(self):
                     object_types.add(AnnotationType.Segmentation)
                 if obj.alpha and AnnotationType.Matting not in object_types:
                     object_types.add(AnnotationType.Matting)
-                    self._add_local_file_url_to_whitelist(obj.alpha, whitelist_dirs)
+                    self.add_local_file_url_to_whitelist(obj.alpha, whitelist_dirs)
                 if obj.points and AnnotationType.KeyPoints not in object_types:
                     object_types.add(AnnotationType.KeyPoints)
 
             # setup image
             image.idx = idx
             image.id = idx if image.id < 0 else image.id
-            if self._batch_upsert is True:
-                image.batch_save(batch_size=self._batch_size, set_on_insert={"idx": image.idx})
-            else:
-                IModel.batch_create(image, batch_size=self._batch_size)
+            image.batch_save(batch_size=self._batch_size, set_on_insert={"idx": image.idx})
             idx += 1
 
-            self._add_local_file_url_to_whitelist(image.url, whitelist_dirs)
-            self._add_local_file_url_to_whitelist(image.url_full_res, whitelist_dirs)
+            self.add_local_file_url_to_whitelist(image.url, whitelist_dirs)
+            self.add_local_file_url_to_whitelist(image.url_full_res, whitelist_dirs)
 
         # finish batch saves
-        if self._batch_upsert is True:
-            IModel.finish_batch_save()
-        else:
-            IModel.finish_batch_create()
+        IModel.finish_batch_save()
         Label.finish_batch_save()
         Category.finish_batch_save()
 
@@ -434,7 +391,7 @@ def finish_batch_add_image(self):
         This saves all images in the buffer queue to database.
         """
         self._batch_save_image_batch()
-        self._add_cover()
+        self.add_cover()
 
     def eval_description(self):
         """
diff --git a/deepdataspace/model/image.py b/deepdataspace/model/image.py
index 2840232..39fd579 100644
--- a/deepdataspace/model/image.py
+++ b/deepdataspace/model/image.py
@@ -163,7 +163,6 @@ def from_dict(cls, data: dict):
 
         data.setdefault("idx", data["id"])
         obj = cls.parse_obj(data)
-        obj.post_init()
         return obj
 
     @staticmethod
@@ -173,17 +172,6 @@ def _convert_local_to_url(file_uri: str):
                                    read_mode=constants.FileReadMode.Binary)
         return file_url
 
-    def post_init(self):
-        """
-        Ensure the url are visible for local file uri.
-        """
-
-        if self.url.startswith("file://"):
-            self.url = self._convert_local_to_url(self.url)
-
-        if self.url_full_res.startswith("file://"):
-            self.url_full_res = self._convert_local_to_url(self.url_full_res)
-
     def _add_label(self, label: str, label_type: str):
         """
         Add a label to the dataset the image belongs to.
@@ -200,7 +188,6 @@ def _add_label(self, label: str, label_type: str):
 
         if label_obj is None:
             label_obj = Label(name=label, id=label_id, type=label_type, dataset_id=self.dataset_id)
-            label_obj.post_init()
             label_obj.save()
             self._labels[label_id] = label_obj
         return label_obj
@@ -217,13 +204,12 @@ def _add_category(self, category: str):
 
         if category_obj is None:
             category_obj = Category(name=category, id=category_id, dataset_id=self.dataset_id)
-            category_obj.post_init()
             category_obj.save()
             self._categories[category_id] = category_obj
         return category_obj
 
     @staticmethod
-    def _format_bbox(width, height, bbox: Tuple[int, int, int, int]):
+    def format_bbox(width, height, bbox: Tuple[int, int, int, int]):
         """
         Convert the bbox data to the internal format.
         """
@@ -239,7 +225,7 @@ def _format_bbox(width, height, bbox: Tuple[int, int, int, int]):
         return bounding_box
 
     @staticmethod
-    def _format_segmentation(segmentation: List[List[int]]):
+    def format_segmentation(segmentation: List[List[int]]):
         """
         Convert the segmentation data to the internal format.
         """
@@ -249,10 +235,10 @@ def _format_segmentation(segmentation: List[List[int]]):
         return "/".join([",".join([str(x) for x in seg]) for seg in segmentation])
 
     @staticmethod
-    def _format_keypoints(keypoints: List[Union[float, int]],
-                          colors: List[int] = None,
-                          skeleton: List[int] = None,
-                          names: List[str] = None):
+    def format_keypoints(keypoints: List[Union[float, int]],
+                         colors: List[int] = None,
+                         skeleton: List[int] = None,
+                         names: List[str] = None):
         """
         Convert the coco_keypoints data to the internal format.
         """
@@ -333,20 +319,24 @@ def _add_annotation(self,
             if not self.width or not self.height:
                 raise ValueError("image width and height must be set before setting bbox")
 
+        if alpha_uri and alpha_uri.startswith("file://"):
+            alpha_path = alpha_uri[7:]
+            alpha_uri = create_file_url(file_path=alpha_path,
+                                        read_mode=FileReadMode.Binary)
+
         label_obj = self._add_label(label, label_type)
         category_obj = self._add_category(category)
-        bounding_box = self._format_bbox(self.width, self.height, bbox)
-        segmentation = self._format_segmentation(segmentation)
-        points, colors, lines, names = self._format_keypoints(keypoints,
-                                                              keypoint_colors,
-                                                              keypoint_skeleton,
-                                                              keypoint_names)
+        bounding_box = self.format_bbox(self.width, self.height, bbox)
+        segmentation = self.format_segmentation(segmentation)
+        points, colors, lines, names = self.format_keypoints(keypoints,
+                                                             keypoint_colors,
+                                                             keypoint_skeleton,
+                                                             keypoint_names)
         anno_obj = Object(label_name=label, label_type=label_type, label_id=label_obj.id,
                           category_name=category, category_id=category_obj.id, caption=caption,
                           bounding_box=bounding_box, segmentation=segmentation, alpha=alpha_uri,
                           points=points, lines=lines, point_colors=colors, point_names=names,
                           conf=conf, is_group=is_group, confirm_type=confirm_type)
-        anno_obj.post_init()
         self.objects.append(anno_obj)
 
     def add_annotation(self,
@@ -421,7 +411,7 @@ def batch_add_annotation(self,
                 for annotation_data in annotations:
                     image.batch_add_annotation(**annotation_data)
 
-            dataset.finish_batch_add+image()
+            dataset.finish_batch_add_image()
 
         :param category: the category name.
         :param label: the label name.
@@ -442,12 +432,12 @@ def batch_add_annotation(self,
         :return: None
         """
 
-        bbox = self._format_bbox(self.width, self.height, bbox)
-        segmentation = self._format_segmentation(segmentation)
-        points, colors, lines, names = self._format_keypoints(keypoints,
-                                                              keypoint_colors,
-                                                              keypoint_skeleton,
-                                                              keypoint_names)
+        bbox = self.format_bbox(self.width, self.height, bbox)
+        segmentation = self.format_segmentation(segmentation)
+        points, colors, lines, names = self.format_keypoints(keypoints,
+                                                             keypoint_colors,
+                                                             keypoint_skeleton,
+                                                             keypoint_names)
         if alpha_uri and alpha_uri.startswith("file://"):
             alpha_path = alpha_uri[7:]
             alpha_uri = create_file_url(file_path=alpha_path,
diff --git a/deepdataspace/model/label_task.py b/deepdataspace/model/label_task.py
index 9f00a29..42736cf 100644
--- a/deepdataspace/model/label_task.py
+++ b/deepdataspace/model/label_task.py
@@ -462,7 +462,6 @@ def _get_label(dataset_id: str, label_set_name: str):
         """
         label_id = get_str_md5(f"{dataset_id}_{label_set_name}")
         label_obj = Label(name=label_set_name, id=label_id, type=LabelType.GroundTruth, dataset_id=dataset_id)
-        label_obj.post_init()
         label_obj.save()
         return label_obj
 
@@ -475,7 +474,6 @@ def _get_category(dataset_id: str, category_name: str, categories: dict):
         if cat_obj is None:
             cat_id = get_str_md5(f"{dataset_id}_{category_name}")
             cat_obj = Category(id=cat_id, name=category_name, dataset_id=dataset_id)
-            cat_obj.post_init()
             cat_obj.save()
             categories[category_name] = cat_obj
         return cat_obj
@@ -529,7 +527,6 @@ def _export_dataset(self, dataset: DataSet, label_set_name: str):
                     anno_obj = Object(label_name=label_obj.name, label_type=label_obj.type, label_id=label_obj.id,
                                       category_name=cat_obj.name, category_id=cat_obj.id,
                                       bounding_box=anno["bounding_box"])
-                    anno_obj.post_init()
                     image.objects.append(anno_obj)
                     image.batch_save()
 
diff --git a/deepdataspace/model/object.py b/deepdataspace/model/object.py
index 55133a9..5855969 100644
--- a/deepdataspace/model/object.py
+++ b/deepdataspace/model/object.py
@@ -89,22 +89,3 @@ def get_collection(cls, *args, **kwargs):
     confirm_type: Optional[int] = 0  # the image confirm type, 0 no confirm required, 1 gt may be fn, 2 pred may be fp
     compare_result: Optional[Dict[str, str]] = {}  # {"90": "FP", ..., "10": "OK"}
     matched_det_idx: Optional[int] = None  # The matched ground truth index, for prediction objects only.
-
-    @staticmethod
-    def _convert_file_path_to_url(file_uri: str):
-        """
-        Convert a local file path to a visible file url.
-        """
-
-        file_path = file_uri[7:]
-        file_url = create_file_url(file_path=file_path,
-                                   read_mode=constants.FileReadMode.Binary)
-        return file_url
-
-    def post_init(self):
-        """
-        Override the post_init method to convert the alpha file path to url.
-        """
-
-        if self.alpha and self.alpha.startswith("file://"):
-            self.alpha = self._convert_file_path_to_url(self.alpha)
diff --git a/deepdataspace/plugins/coco2017/importer.py b/deepdataspace/plugins/coco2017/importer.py
index 1d4ba15..4016118 100644
--- a/deepdataspace/plugins/coco2017/importer.py
+++ b/deepdataspace/plugins/coco2017/importer.py
@@ -1,14 +1,13 @@
 """
 Import the coco2017 dataset and save metadata into mongodb.
 """
-
 import json
 import logging
 import os
+import traceback
 from typing import Dict
 from typing import List
 from typing import Tuple
-import traceback
 
 from deepdataspace.constants import DatasetFileType
 from deepdataspace.constants import DatasetType
@@ -83,11 +82,11 @@ def _parse_meta(meta_path: str):
             assert os.path.isdir(image_root) and os.path.exists(image_root)
 
         info = {
-            "dataset_name"     : dataset_name,
-            "ground_truth"     : ground_truth,
-            "predictions"      : predictions,
-            "image_root"       : image_root,
-            "dynamic_caption"  : getattr(module, "dynamic_caption", False),
+            "dataset_name": dataset_name,
+            "ground_truth": ground_truth,
+            "predictions": predictions,
+            "image_root": image_root,
+            "dynamic_caption": getattr(module, "dynamic_caption", False),
             "caption_generator": getattr(module, "caption_generator", None),
         }
 
diff --git a/deepdataspace/scripts/migrate/2023061401.py b/deepdataspace/scripts/migrate/2023061401.py
index 71f4aae..5105feb 100644
--- a/deepdataspace/scripts/migrate/2023061401.py
+++ b/deepdataspace/scripts/migrate/2023061401.py
@@ -16,7 +16,7 @@ def add_covers():
 
     datasets = DataSet.find_many({})
     for idx, dataset in enumerate(datasets):
-        dataset._add_cover(force_update=True)
+        dataset.add_cover(force_update=True)
         logger.info(f"[{idx + 1}/{num}]Added cover to dataset[{dataset.id}], cover_url={dataset.cover_url}")
 
     logger.info("Finished adding covers")
diff --git a/deepdataspace/services/mongodb.py b/deepdataspace/services/mongodb.py
index 83c9d2b..ffe533f 100644
--- a/deepdataspace/services/mongodb.py
+++ b/deepdataspace/services/mongodb.py
@@ -82,7 +82,7 @@ def _start_mongodb(self, auth: bool = False):
         self.port = find_free_port(*port_range)
         while True:
             run_cmd = cmd[:]
-            run_cmd.extend(["--port", str(self.port)])
+            run_cmd.extend(["--nojournal", "--port", str(self.port)])
             try:
                 self.start_process(run_cmd)
             except Exception as err:

From 187334c2e6e399bc09773215527dee1fba8a0a18 Mon Sep 17 00:00:00 2001
From: imhuwq <imhuwq@gmail.com>
Date: Mon, 26 Feb 2024 10:17:37 +0800
Subject: [PATCH 4/6] feature(mongodb): enable journaling

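This reverts the --nojournal flag introduced in the previous patch.
Running mongod without a journal speeds up bulk imports, but an unclean
shutdown can then lose or corrupt recent writes, which is a poor default
for a local service.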
---
 deepdataspace/services/mongodb.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepdataspace/services/mongodb.py b/deepdataspace/services/mongodb.py
index ffe533f..83c9d2b 100644
--- a/deepdataspace/services/mongodb.py
+++ b/deepdataspace/services/mongodb.py
@@ -82,7 +82,7 @@ def _start_mongodb(self, auth: bool = False):
         self.port = find_free_port(*port_range)
         while True:
             run_cmd = cmd[:]
-            run_cmd.extend(["--nojournal", "--port", str(self.port)])
+            run_cmd.extend(["--port", str(self.port)])
             try:
                 self.start_process(run_cmd)
             except Exception as err:

From 4c03620141a9069f14d8ab342c780df9380fae2b Mon Sep 17 00:00:00 2001
From: imhuwq <imhuwq@gmail.com>
Date: Mon, 26 Feb 2024 11:40:24 +0800
Subject: [PATCH 5/6] delete(label confirm): delete the label confirm attribute
 from objects

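Removes the per-label confirm status (the label_confirm field on images
and the confirm_type parameter of annotations) together with the
object_confirm API resource.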
---
 deepdataspace/io/importer.py                  | 17 +---
 deepdataspace/model/image.py                  | 22 +----
 deepdataspace/model/object.py                 |  6 --
 deepdataspace/plugins/tsv/importer.py         |  4 -
 .../server/resources/api_v1/__init__.py       |  2 -
 .../server/resources/api_v1/annotations.py    |  1 -
 .../server/resources/api_v1/images.py         | 11 +--
 .../server/resources/api_v1/label_clone.py    |  5 +-
 .../server/resources/api_v1/object_confirm.py | 84 -------------------
 9 files changed, 11 insertions(+), 141 deletions(-)
 delete mode 100644 deepdataspace/server/resources/api_v1/object_confirm.py

diff --git a/deepdataspace/io/importer.py b/deepdataspace/io/importer.py
index e176906..66d70a0 100644
--- a/deepdataspace/io/importer.py
+++ b/deepdataspace/io/importer.py
@@ -77,8 +77,7 @@ def format_annotation(category: str,
                           keypoint_colors: List[int] = None,
                           keypoint_skeleton: List[int] = None,
                           keypoint_names: List[str] = None,
-                          caption: str = None,
-                          confirm_type: int = 0, ):
+                          caption: str = None):
         """
         A helper function to format annotation data.
         """
@@ -95,8 +94,7 @@ def format_annotation(category: str,
                     keypoint_colors=keypoint_colors,
                     keypoint_skeleton=keypoint_skeleton,
                     keypoint_names=keypoint_names,
-                    caption=caption,
-                    confirm_type=confirm_type, )
+                    caption=caption)
 
 
 class Importer(ImportHelper, abc.ABC):
@@ -200,8 +198,7 @@ def image_import_annotation(image: dict,
                                 keypoint_colors: List[int] = None,
                                 keypoint_skeleton: List[int] = None,
                                 keypoint_names: List[str] = None,
-                                caption: str = None,
-                                confirm_type: int = 0, ):
+                                caption: str = None):
 
         bbox = ImageModel.format_bbox(image["width"], image["height"], bbox)
         segmentation = ImageModel.format_segmentation(segmentation)
@@ -218,7 +215,7 @@ def image_import_annotation(image: dict,
                         category_name=category, caption=caption,
                         bounding_box=bbox, segmentation=segmentation, alpha=alpha_uri,
                         points=points, lines=lines, point_colors=colors, point_names=names,
-                        conf=conf, is_group=is_group, confirm_type=confirm_type)
+                        conf=conf, is_group=is_group)
         image["objects"].append(anno_obj)
 
     def bulk_write_images(self, image_queue: list):
@@ -355,7 +352,6 @@ def load_existing_user_data(self):
         pipeline = [
             {"$project": {"flag": 1,
                           "flag_ts": 1,
-                          "label_confirm": 1,
                           "objects": {
                               "$filter": {
                                   "input": "$objects",
@@ -378,14 +374,10 @@ def load_existing_user_data(self):
             flag = image.get("flag", 0)
             flag_ts = image.get("flag_ts", 0)
 
-            # manually added confirm flag
-            label_confirm = image.get("label_confirm", {})
-
             self._user_data[image_id] = {
                 "objects": user_objects,
                 "flag": flag,
                 "flag_ts": flag_ts,
-                "label_confirm": label_confirm,
             }
 
     def image_add_user_data(self, image: dict):
@@ -401,7 +393,6 @@ def image_add_user_data(self, image: dict):
         image.setdefault("objects").extend(user_data["objects"])
         image["flag"] = user_data["flag"]
         image["flag_ts"] = user_data["flag_ts"]
-        image["label_confirm"] = user_data["label_confirm"]
 
     def run_import(self):
         """
diff --git a/deepdataspace/model/image.py b/deepdataspace/model/image.py
index 39fd579..50c864e 100644
--- a/deepdataspace/model/image.py
+++ b/deepdataspace/model/image.py
@@ -90,12 +90,6 @@ class ImageModel(BaseModel):
         fp counter of image in the format {"label_id": {90: x, 80: y, ..., 10: z}}. Default is an empty dict.
     num_fp_cat: dict
         fp counter of image categorized, in the format {"label_id": {"category_id": {90: x, 80: y, ..., 10: z}}}. Default is an empty dict.
-    label_confirm: dict
-        Confirm status of every label sets, where confirm can be:
-        0 = not confirmed,
-        1 = confirmed,
-        2 = refine required.
-        Format is {"label_id": {"confirm": int, "confirm_ts": int}}. Default is an empty dict.
     """
 
     @classmethod
@@ -128,10 +122,6 @@ def get_collection(cls):
     num_fp: dict = {}  # {"label_id": {90: x, 80: y, ..., 10: z}}
     num_fp_cat: dict = {}  # {"label_id": {"category_id": {90: x, 80: y, ..., 10: z}}}
 
-    # confirm status of every label sets, confirm: 0 = not confirmed, 1 = confirmed, 2 = refine required
-    # {"label_id": {"confirm": int, "confirm_ts": int}}
-    label_confirm: dict = {}
-
     _dataset = None
 
     _labels: dict = {}
@@ -313,7 +303,6 @@ def _add_annotation(self,
                         keypoint_skeleton: List[int] = None,
                         keypoint_names: List[str] = None,
                         caption: str = None,
-                        confirm_type: int = 0,
                         ):
         if bbox:
             if not self.width or not self.height:
@@ -336,7 +325,7 @@ def _add_annotation(self,
                           category_name=category, category_id=category_obj.id, caption=caption,
                           bounding_box=bounding_box, segmentation=segmentation, alpha=alpha_uri,
                           points=points, lines=lines, point_colors=colors, point_names=names,
-                          conf=conf, is_group=is_group, confirm_type=confirm_type)
+                          conf=conf, is_group=is_group)
         self.objects.append(anno_obj)
 
     def add_annotation(self,
@@ -353,7 +342,6 @@ def add_annotation(self,
                        keypoint_skeleton: List[int] = None,
                        keypoint_names: List[str] = None,
                        caption: str = None,
-                       confirm_type: int = 0,
                        ):
         """
         Add an annotation to the image.
@@ -373,13 +361,12 @@ def add_annotation(self,
         :param keypoint_colors: the key point colors, [255, 0, 0, ...].
         :param keypoint_skeleton: the key point skeleton, [0, 1, 2, ...].
         :param caption: the caption of the annotation.
-        :param confirm_type: the confirm_type of the annotation, 0 = not confirmed, 1 = gt may be fn, 2 = pred may be fp
         """
 
         self._add_annotation(category, label, label_type, conf,
                              is_group, bbox, segmentation, alpha_uri,
                              keypoints, keypoint_colors, keypoint_skeleton, keypoint_names,
-                             caption, confirm_type)
+                             caption)
 
         self.save()
         self._update_dataset(bbox, segmentation, alpha_uri, keypoints)
@@ -398,7 +385,7 @@ def batch_add_annotation(self,
                              keypoint_skeleton: List[int] = None,
                              keypoint_names: List[str] = None,
                              caption: str = None,
-                             confirm_type: int = 0, ):
+                             ):
         """
         The batch version of add_annotation.
         It performs better when saving a large number of annotations.
@@ -428,7 +415,6 @@ def batch_add_annotation(self,
         :param keypoint_colors: the key point colors, [255, 0, 0, ...].
         :param keypoint_skeleton: the key point skeleton, [0, 1, 2, ...].
         :param caption: the caption of the annotation.
-        :param confirm_type: the confirm_type of the annotation, 0 = not confirmed, 1 = gt may be fn, 2 = pred may be fp
         :return: None
         """
 
@@ -447,7 +433,7 @@ def batch_add_annotation(self,
                           category_name=category, caption=caption,
                           bounding_box=bbox, segmentation=segmentation, alpha=alpha_uri,
                           points=points, lines=lines, point_colors=colors, point_names=names,
-                          conf=conf, is_group=is_group, confirm_type=confirm_type)
+                          conf=conf, is_group=is_group)
         self.objects.append(anno_obj)
 
     def finish_batch_add_annotation(self):
diff --git a/deepdataspace/model/object.py b/deepdataspace/model/object.py
index 5855969..a0c7a14 100644
--- a/deepdataspace/model/object.py
+++ b/deepdataspace/model/object.py
@@ -6,13 +6,10 @@
 
 from typing import Dict
 from typing import List
-from typing import Literal
 from typing import Optional
 from typing import Union
 
-from deepdataspace import constants
 from deepdataspace.model._base import BaseModel
-from deepdataspace.utils.file import create_file_url
 
 
 class Object(BaseModel):
@@ -52,8 +49,6 @@ class Object(BaseModel):
         The point names of the object.
     caption: str
         The caption of the object.
-    confirm_type: int
-        The image confirm type, 0 for unconfirmed, 1 for confirmed, 2 for rejected.
     compare_result: dict
         The compare result of the object, {"90": "FP", ..., "10": "OK"}.
     matched_det_idx: int
@@ -86,6 +81,5 @@ def get_collection(cls, *args, **kwargs):
     point_colors: Optional[List[int]] = []
     point_names: Optional[List[str]] = []
     caption: Optional[str] = ""
-    confirm_type: Optional[int] = 0  # the image confirm type, 0 no confirm required, 1 gt may be fn, 2 pred may be fp
     compare_result: Optional[Dict[str, str]] = {}  # {"90": "FP", ..., "10": "OK"}
     matched_det_idx: Optional[int] = None  # The matched detection index, for ground truth objects only.
diff --git a/deepdataspace/plugins/tsv/importer.py b/deepdataspace/plugins/tsv/importer.py
index e243ecb..490516b 100644
--- a/deepdataspace/plugins/tsv/importer.py
+++ b/deepdataspace/plugins/tsv/importer.py
@@ -156,9 +156,6 @@ def load_objects(self,
             # prepare is_group
             is_group = bool(obj.get("iscrowd", False))
 
-            # prepare confirm_type
-            confirm_type = obj.get("confirm_type", None)
-
             # prepare confidence
             confidence = obj.get("conf", 1.0)
 
@@ -167,7 +164,6 @@ def load_objects(self,
                                          label=label_name, label_type=label_type,
                                          conf=confidence, is_group=is_group,
                                          bbox=bbox, segmentation=segmentation, alpha_uri=alpha,
-                                         confirm_type=confirm_type,
                                          )
             obj_list.append(obj)
 
diff --git a/deepdataspace/server/resources/api_v1/__init__.py b/deepdataspace/server/resources/api_v1/__init__.py
index 3681a8d..445f99b 100644
--- a/deepdataspace/server/resources/api_v1/__init__.py
+++ b/deepdataspace/server/resources/api_v1/__init__.py
@@ -17,7 +17,6 @@
 from deepdataspace.server.resources.api_v1.image_flags import ImageFlagsView
 from deepdataspace.server.resources.api_v1.images import ImagesView
 from deepdataspace.server.resources.api_v1.label_clone import LabelCloneView
-from deepdataspace.server.resources.api_v1.object_confirm import ObjectConfirmView
 
 urls = [
     path("ping", ping.PingView.as_view()),
@@ -29,7 +28,6 @@
     path("datasets/<dataset_id>", DatasetView.as_view()),
     path("image_flags", ImageFlagsView.as_view()),
     path("label_clone", LabelCloneView.as_view()),
-    path("object_confirm", ObjectConfirmView.as_view()),
     path("annotations", AnnotationsView.as_view()),
     path("comparisons", ComparisonsView.as_view()),
     path("label_projects", label_tasks.ProjectsView.as_view()),
diff --git a/deepdataspace/server/resources/api_v1/annotations.py b/deepdataspace/server/resources/api_v1/annotations.py
index 49b97a4..3580e22 100644
--- a/deepdataspace/server/resources/api_v1/annotations.py
+++ b/deepdataspace/server/resources/api_v1/annotations.py
@@ -118,7 +118,6 @@ def _save_annotations(dataset: DataSet, image, annotations):
                 cur_objs.append(obj)
 
         image.objects = cur_objs
-        image.label_confirm[label_id] = {"confirm": 1, "confirm_ts": int(time.time())}
         image.save()
 
         return saving_categories, saving_labels
diff --git a/deepdataspace/server/resources/api_v1/images.py b/deepdataspace/server/resources/api_v1/images.py
index da93987..acd303c 100644
--- a/deepdataspace/server/resources/api_v1/images.py
+++ b/deepdataspace/server/resources/api_v1/images.py
@@ -68,7 +68,6 @@ class ImagesView(BaseAPIView):
         Argument("dataset_id", str, Argument.QUERY, required=True),
         Argument("category_id", str, Argument.QUERY, required=False),
         Argument("flag", int, Argument.QUERY, required=False),
-        Argument("confirm", int, Argument.QUERY, required=False),
         Argument("label_id", str, Argument.QUERY, required=False),
         Argument("page_num", Argument.PositiveInt, Argument.QUERY, default=1),
         Argument("page_size", Argument.PositiveInt, Argument.QUERY, default=100)
@@ -80,7 +79,7 @@ def get(self, request):
         - GET /api/v1/images
         """
 
-        dataset_id, category_id, flag, confirm, label_id, page_num, page_size = parse_arguments(request, self.get_args)
+        dataset_id, category_id, flag, label_id, page_num, page_size = parse_arguments(request, self.get_args)
 
         dataset = DataSet.find_one({"_id": dataset_id})
         if dataset is None:
@@ -101,15 +100,13 @@ def get(self, request):
 
         if flag is not None:
             filters["flag"] = flag
-        if confirm is not None and label_id is not None:
-            filters[f"label_confirm.{label_id}.confirm"] = confirm
 
         total = Image(dataset_id).count_num(filters)
 
         image_list = []
         offset = max(0, page_size * (page_num - 1))
 
-        includes = {"id", "idx", "flag", "label_confirm", "objects", "metadata", "type", "width", "height", "url",
+        includes = {"id", "idx", "flag", "objects", "metadata", "type", "width", "height", "url",
                     "url_full_res"}
         includes = {i: 1 for i in includes}
 
@@ -123,10 +120,6 @@ def get(self, request):
                                                      skip=offset,
                                                      size=page_size,
                                                      to_dict=True):
-
-                # TODO keep for compatibility, delete this after run op/migrates/add_confirm_fields.py
-                image.setdefault("label_confirm", {})
-
                 for obj in image["objects"]:
                     obj["source"] = obj["label_type"]  # TODO keep for compatibility, delete this in the future
 
diff --git a/deepdataspace/server/resources/api_v1/label_clone.py b/deepdataspace/server/resources/api_v1/label_clone.py
index 8dec90b..0542a7e 100644
--- a/deepdataspace/server/resources/api_v1/label_clone.py
+++ b/deepdataspace/server/resources/api_v1/label_clone.py
@@ -64,12 +64,9 @@ def gen_unique_clone_name(dataset_id, src_label_name, dst_label_name):
     def clone_images_collection(dataset_id, target_dataset_id, src_label_id, dst_label_id, dst_label_name):
         for image in Image(dataset_id).find_many({}):
             cloned = []
-            label_confirm = image.label_confirm
-            label_confirm.setdefault(dst_label_id, {"confirm": 0, "confirm_ts": 0})
 
             for obj in image.objects:
                 label_id = obj.label_id
-                label_confirm.setdefault(label_id, {"confirm": 0, "confirm_ts": 0})
 
                 if obj.label_id != src_label_id:
                     continue
@@ -79,7 +76,7 @@ def clone_images_collection(dataset_id, target_dataset_id, src_label_id, dst_lab
                         category_id=obj.category_id, category_name=obj.category_name, conf=obj.conf,
                         is_group=obj.is_group, bounding_box=obj.bounding_box, segmentation=obj.segmentation,
                         alpha=obj.alpha, points=obj.points, lines=obj.lines, point_colors=obj.point_colors,
-                        point_names=obj.point_names, confirm_type=obj.confirm_type,
+                        point_names=obj.point_names
                 )
                 cloned.append(obj)
             image.objects.extend(cloned)
diff --git a/deepdataspace/server/resources/api_v1/object_confirm.py b/deepdataspace/server/resources/api_v1/object_confirm.py
deleted file mode 100644
index aaa736e..0000000
--- a/deepdataspace/server/resources/api_v1/object_confirm.py
+++ /dev/null
@@ -1,84 +0,0 @@
-"""
-deepdataspace.server.resources.api_v1.object_confirm
-
-RESTful API for object confirm.
-"""
-
-import logging
-import time
-
-from deepdataspace.constants import ErrCode
-from deepdataspace.constants import DatasetStatus
-from deepdataspace.model import DataSet
-from deepdataspace.model import Label
-from deepdataspace.model import Object
-from deepdataspace.model.image import Image
-from deepdataspace.utils.http import Argument
-from deepdataspace.utils.http import BaseAPIView
-from deepdataspace.utils.http import format_response
-from deepdataspace.utils.http import parse_arguments
-from deepdataspace.utils.http import raise_exception
-
-logger = logging.getLogger("django")
-
-
-class ObjectConfirmView(BaseAPIView):
-    """
-    - POST /api/v1/object_confirm
-    """
-
-    post_args = [
-        Argument("dataset_id", str, Argument.JSON, required=True),
-        Argument("label_id", str, Argument.JSON, required=True),
-        Argument("image_id", int, Argument.JSON, required=True),
-        Argument("confirm", Argument.Choice([1, 2]), Argument.JSON, required=True),
-        Argument("objects", list, Argument.JSON, required=False),
-    ]
-
-    def post(self, request):
-        """
-        Confirm a label set of an image.
-
-        - POST /api/v1/object_confirm
-        """
-
-        dataset_id, label_id, image_id, confirm, objects = parse_arguments(request, self.post_args)
-
-        dataset = DataSet.find_one({"id": dataset_id})
-        if dataset is None:
-            raise_exception(ErrCode.DatasetNotFound, f"dataset_id[{dataset_id}] not found")
-        if dataset.status in DatasetStatus.DontRead_:
-            raise_exception(404, f"dataset_id[{dataset_id}] is in status [{dataset.status}] now, try again later")
-
-        label = Label.find_one({"id": label_id, "dataset_id": dataset_id})
-        if label is None:
-            raise_exception(ErrCode.DatasetLabelNotFound, f"label_id[{label_id}] not found")
-
-        image = Image(dataset_id).find_one({"id": image_id})
-        if image is None:
-            raise_exception(ErrCode.DatasetImageNotFound, f"image_id[{image_id}] not found")
-
-        new_objects = []
-        for idx, obj in enumerate(objects):
-            if obj["label_id"] != label_id:
-                continue
-
-            try:
-                obj = Object.from_dict(obj)
-                obj.compare_result = {}
-                obj.matched_det_idx = None
-            except Exception as err:
-                logger.warning(err)
-                raise_exception(ErrCode.AnnotationFormatError, f"objects[{idx}] data structure mismatch")
-            new_objects.append(obj)
-
-        saving_objects = [o for o in image.objects if o.label_id != label_id]
-        saving_objects.extend(new_objects)
-        image.objects = saving_objects
-
-        label_confirm = image.label_confirm
-        confirm_ts = int(time.time())
-        label_confirm[label_id] = {"confirm": confirm, "confirm_ts": confirm_ts}
-        image.save()
-
-        return format_response({"confirm": confirm, "confirm_ts": confirm_ts})

From c46cda039b7d3fbdfa9f8997e8773eec6f1963b9 Mon Sep 17 00:00:00 2001
From: imhuwq <imhuwq@gmail.com>
Date: Mon, 26 Feb 2024 15:02:49 +0800
Subject: [PATCH 6/6] delete(comments): delete outdated comments

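The negative-id comment in dataset_import_image, the leftover
_batch_insert_queue dict in _base.py, and the batch_upsert parameter
docs in create_dataset no longer match the current implementation,
so drop them.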
---
 deepdataspace/io/importer.py   | 2 --
 deepdataspace/model/_base.py   | 1 -
 deepdataspace/model/dataset.py | 2 --
 3 files changed, 5 deletions(-)

diff --git a/deepdataspace/io/importer.py b/deepdataspace/io/importer.py
index 66d70a0..d30c5e5 100644
--- a/deepdataspace/io/importer.py
+++ b/deepdataspace/io/importer.py
@@ -169,8 +169,6 @@ def dataset_import_image(self,
         metadata = metadata or {}
         metadata = json.dumps(metadata)
 
-        # if id is not set,
-        # we use a negative value to indicate we are adding a new image instead of updating an existing one
         idx = dataset.num_images
         id_ = id_ if id_ is not None else dataset.num_images
         image = dict(id=id_, idx=idx,
diff --git a/deepdataspace/model/_base.py b/deepdataspace/model/_base.py
index 977228d..3c9830b 100644
--- a/deepdataspace/model/_base.py
+++ b/deepdataspace/model/_base.py
@@ -25,7 +25,6 @@
 _batch_lock = {}  # a dict of batch operation locks for every collection, {'collection_name': batch_op_lock, }
 _batch_save_queue = {}  # a dict of batch save queues for every collection, {'collection_name': batch_save_queue, }
 _batch_update_queue = {}  # a dict of batch update queues for every collection, {'collection_name': batch_update_queue, }
-_batch_insert_queue = {}  # a dict of batch insert queue for every collection, {'collection_name': batch_insert_queue, }
 
 logger = logging.getLogger("model.base")
 
diff --git a/deepdataspace/model/dataset.py b/deepdataspace/model/dataset.py
index 31c16ce..28d8311 100644
--- a/deepdataspace/model/dataset.py
+++ b/deepdataspace/model/dataset.py
@@ -130,8 +130,6 @@ def create_dataset(cls,
         :param description_func: an import path of a function to generate description.
             The function takes the dataset instance as the only argument and returns a string.
             If this is provided, it takes precedence over the description str.
-        :param batch_upsert: If True, images from batch_add_image will be saved by UpdateOne with upsert=True.
-                             otherwise they will be saved by faster InsertOne operation.
         :return: the dataset object.
         """