From a4baa131d878d86e6b507fb88db6e3a85c835f66 Mon Sep 17 00:00:00 2001 From: Robert Bartel Date: Mon, 10 Jul 2023 10:42:40 -0400 Subject: [PATCH 1/6] Add InitialDataAdder abstract type and usage. Creating abstract InitialDataAdder type for use with creating datasets, and modifying Dataset.create abstract function to use this type for the 'initial_data' parameter, rather than a string that represents the location of some kind of initial data. --- python/lib/core/dmod/core/dataset.py | 57 +++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/python/lib/core/dmod/core/dataset.py b/python/lib/core/dmod/core/dataset.py index 24f595168..ce4ffc57b 100644 --- a/python/lib/core/dmod/core/dataset.py +++ b/python/lib/core/dmod/core/dataset.py @@ -54,6 +54,48 @@ def is_file_based(self) -> bool: return self._is_file_based +class InitialDataAdder(ABC): + """ + Abstract type for adding initial data from some other source(s) to a dataset in the process of being created. + + Abstract type for adding initial data to a new dataset, typically from some other source(s) of data, and frequently + when some logic has to be applied to the data before it can be added. This logic may include things like + transforming formats, extracting subsets, and/or combining separate data together. + + Note that it is important (and this is where "initial data" really applies) that subtypes are implemented so that + an instance **does not** expect its applicable ::class:`Dataset` to exist at the time the instance is created. Only + when ::method:`add_initial_data` is invoked must the dataset exist. + """ + + def __init__(self, dataset_name: str, dataset_manager: 'DatasetManager', *args, **kwargs): + """ + Initialize. + + Parameters + ---------- + dataset_name : str + The name of the new dataset to which this instance will add initial data. + dataset_manager : DatasetManager + The manager of the new dataset to which this instance will add initial data. 
+ args + kwargs + """ + self._dataset_name: str = dataset_name + self._dataset_manager: DatasetManager = dataset_manager + + @abstractmethod + def add_initial_data(self): + """ + Assemble and add the initial data. + + Raises + ------- + DmodRuntimeError + Raised when initial data could not be assembled and/or added successfully to the dataset. + """ + pass + + class Dataset(Serializable): """ Rrepresentation of the descriptive metadata for a grouped collection of data. @@ -463,14 +505,18 @@ def combine_partials_into_composite(self, dataset_name: str, item_name: str, com @abstractmethod def create(self, name: str, category: DataCategory, domain: DataDomain, is_read_only: bool, - initial_data: Optional[str] = None) -> Dataset: + initial_data: Optional[InitialDataAdder] = None, expires_on: Optional[datetime] = None) -> Dataset: """ - Create a new dataset instance. + Create a new dataset instance, optionally inserting data. Implementations should ensure that a returned dataset is ready for use. That is, existing data at the time of creation is accessible according to the dataset's metadata, and (when appropriate) the dataset is ready for receiving output written to it. + Note that ``initial_data`` allows for optionally adding data to the dataset as it is created. Implementations + should ensure that, if ``initial_data`` is not ``None``, the expected dataset survives this method only if the + addition was successful (i.e., it is either never created or removed if not). + Parameters ---------- name : str @@ -481,9 +527,10 @@ def create(self, name: str, category: DataCategory, domain: DataDomain, is_read_ The data domain for the new dataset, which includes the format, fields, and restrictions on values. is_read_only : bool Whether the new dataset is read-only. - initial_data : Optional[str] - Optional string representation of a location in which there is initial data that should be added to the - dataset. 
+ initial_data : Optional[InitialDataAdder] + Optional means for initially adding data as the dataset is created. + expires_on : Optional[datetime] + Optional point when the dataset (initially) expires, if it should be temporary. Returns ------- From 057807f206542b362a27c9b6eb75f0205cd95d16 Mon Sep 17 00:00:00 2001 From: Robert Bartel Date: Mon, 10 Jul 2023 10:44:05 -0400 Subject: [PATCH 2/6] Update ObjectStoreManager for InitialDataAdder. Updating DatasetManager implementation's create function to support usage of the InitialDataAdder abstraction for the 'initial_data' param. --- .../modeldata/data/object_store_manager.py | 42 ++++++++++++------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/python/lib/modeldata/dmod/modeldata/data/object_store_manager.py b/python/lib/modeldata/dmod/modeldata/data/object_store_manager.py index 33045f461..fbcfec2c5 100644 --- a/python/lib/modeldata/dmod/modeldata/data/object_store_manager.py +++ b/python/lib/modeldata/dmod/modeldata/data/object_store_manager.py @@ -4,7 +4,7 @@ import minio.retention from dmod.core.meta_data import DataCategory, DataDomain -from dmod.core.dataset import Dataset, DatasetManager, DatasetType +from dmod.core.dataset import Dataset, DatasetManager, DatasetType, InitialDataAdder from datetime import datetime, timedelta from minio import Minio from minio.api import ObjectWriteResult @@ -282,11 +282,15 @@ def combine_partials_into_composite(self, dataset_name: str, item_name: str, com except Exception as e: return False + # TODO: adjust signature to give "is_read_only" a default of False def create(self, name: str, category: DataCategory, domain: DataDomain, is_read_only: bool, - initial_data: Optional[str] = None) -> Dataset: + initial_data: Optional[InitialDataAdder] = None, expires_on: Optional[datetime] = None) -> Dataset: """ Create a new ::class:`Dataset` instance and, if needed, backing object store bucket of the same name. 
+ Note that, if ``initial_data`` is not ``None``, the expected dataset will survive this method only if the + addition of the initial data was successful (i.e., the dataset is either never created or removed if not). + Parameters ---------- name : str @@ -297,8 +301,10 @@ def create(self, name: str, category: DataCategory, domain: DataDomain, is_read_ The data domain for the new dataset, which includes the format, fields, and restrictions on values. is_read_only : bool Whether the new dataset is read-only. - initial_data : Optional[str] - Optional string form of a path to a directory containing initial data that should be added to the dataset. + initial_data : Optional[InitialDataAdder] + Optional means for initially adding data as the dataset is created. + expires_on : Optional[datetime] + Optional point when the dataset (initially) expires, if it should be temporary. Returns ------- @@ -309,13 +315,7 @@ def create(self, name: str, category: DataCategory, domain: DataDomain, is_read_ raise RuntimeError("Cannot create new dataset with name {}: name already in use".format(name)) if self._client.bucket_exists(name): raise RuntimeError("Unexpected existing bucket when creating dataset {}".format(name)) - - files_dir = None - if initial_data is not None: - files_dir = Path(initial_data) - if not files_dir.is_dir(): - raise RuntimeError("Invalid param for initial dataset data: {} not a directory".format(files_dir)) - elif is_read_only: + if is_read_only and initial_data is None: msg = "Attempting to create read-only dataset {} without supplying it with any initial data" raise RuntimeError(msg.format(name)) @@ -330,12 +330,22 @@ def create(self, name: str, category: DataCategory, domain: DataDomain, is_read_ access_loc = "{}://{}/{}".format('https' if self._secure_connection else 'http', self._obj_store_host_str, name) dataset = Dataset(name=name, category=category, data_domain=domain, dataset_type=DatasetType.OBJECT_STORE, manager=self, access_location=access_loc, 
is_read_only=is_read_only, created_on=created_on, - last_updated=created_on) + last_updated=created_on, expires_on=expires_on) self.datasets[name] = dataset - if files_dir is not None: - self._push_files(bucket_name=name, dir_path=files_dir, recursive=True) - self.persist_serialized(name) - return dataset + # Put in a try block to make sure the dataset only remains if adding data worked as needed (if applicable) + try: + if initial_data is not None: + initial_data.add_initial_data() + + # Then updated the persisted state file and return + self.persist_serialized(name) + return dataset + # If we ran into any trouble writing initial data to the dataset, then bail, cleaning up the dataset from the + # manager and the object store itself + except Exception as e: + self.datasets.pop(name) + self._client.remove_bucket(name) + raise e @property def data_chunking_params(self) -> Optional[Tuple[str, str]]: From c93220f94c1b0b24320eb056b7a156df3a05fdd7 Mon Sep 17 00:00:00 2001 From: Robert Bartel Date: Mon, 10 Jul 2023 10:59:31 -0400 Subject: [PATCH 3/6] Bump dmod.core version to 0.9.0. --- python/lib/core/dmod/core/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/lib/core/dmod/core/_version.py b/python/lib/core/dmod/core/_version.py index ccf9e6286..1658609d0 100644 --- a/python/lib/core/dmod/core/_version.py +++ b/python/lib/core/dmod/core/_version.py @@ -1 +1 @@ -__version__ = '0.8.0' \ No newline at end of file +__version__ = '0.9.0' \ No newline at end of file From 5ea4f92c688581a3a08f6980d36b65361558c52e Mon Sep 17 00:00:00 2001 From: Robert Bartel Date: Mon, 10 Jul 2023 11:00:39 -0400 Subject: [PATCH 4/6] Update dmod.modeldata dep for dmod.core. Updating to depend on core 0.9.0. 
--- python/lib/modeldata/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/lib/modeldata/setup.py b/python/lib/modeldata/setup.py index 5fc41e3b7..869c77080 100644 --- a/python/lib/modeldata/setup.py +++ b/python/lib/modeldata/setup.py @@ -26,7 +26,7 @@ "fiona", "geopandas", "dmod-communication>=0.4.2", - "dmod-core>=0.3.0", + "dmod-core>=0.9.0", "minio", "aiohttp<=3.7.4", "shapely>=2.0.0", From 7eef8e99e939beb4094d53aafe11e6646b0abc81 Mon Sep 17 00:00:00 2001 From: Robert Bartel Date: Mon, 10 Jul 2023 11:01:05 -0400 Subject: [PATCH 5/6] Bump dmod.modeldata version to 0.9.2. --- python/lib/modeldata/dmod/modeldata/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/lib/modeldata/dmod/modeldata/_version.py b/python/lib/modeldata/dmod/modeldata/_version.py index 102b47c9c..61816e5af 100644 --- a/python/lib/modeldata/dmod/modeldata/_version.py +++ b/python/lib/modeldata/dmod/modeldata/_version.py @@ -1 +1 @@ -__version__ = '0.9.1' \ No newline at end of file +__version__ = '0.9.2' \ No newline at end of file From b2847f081d0f181c94505a1e5b0b66ec5afedd6c Mon Sep 17 00:00:00 2001 From: Robert Bartel Date: Mon, 10 Jul 2023 16:28:44 -0400 Subject: [PATCH 6/6] Use delete w/ obj store mgr create() if add fails. Fixing ObjectStoreDatasetManager create() function so that it calls the delete() function if initial data fails to be added (and thus the dataset does not need to end up created); previous logic that just removed the bucket would fail if the bucket had anything in it. 
--- .../dmod/modeldata/data/object_store_manager.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/python/lib/modeldata/dmod/modeldata/data/object_store_manager.py b/python/lib/modeldata/dmod/modeldata/data/object_store_manager.py index fbcfec2c5..a11eab19f 100644 --- a/python/lib/modeldata/dmod/modeldata/data/object_store_manager.py +++ b/python/lib/modeldata/dmod/modeldata/data/object_store_manager.py @@ -331,20 +331,24 @@ def create(self, name: str, category: DataCategory, domain: DataDomain, is_read_ dataset = Dataset(name=name, category=category, data_domain=domain, dataset_type=DatasetType.OBJECT_STORE, manager=self, access_location=access_loc, is_read_only=is_read_only, created_on=created_on, last_updated=created_on, expires_on=expires_on) + + # Once dataset is added to ``datasets``, it's "managed," so calls to add_data, delete, etc., should work self.datasets[name] = dataset + # Put in a try block to make sure the dataset only remains if adding data worked as needed (if applicable) try: if initial_data is not None: initial_data.add_initial_data() + # TODO: (later) consider whether dataset should not be deleted if everything else worked until this # Then updated the persisted state file and return self.persist_serialized(name) + return dataset - # If we ran into any trouble writing initial data to the dataset, then bail, cleaning up the dataset from the - # manager and the object store itself + # If we ran into any trouble adding initial data, then bail, cleaning up the dataset and backing storage except Exception as e: - self.datasets.pop(name) - self._client.remove_bucket(name) + # Since the dataset is "managed," we can call delete() + self.delete(dataset=dataset) raise e @property