Addition of the InitialDataAdder abstraction #383

Merged · 6 commits · Jul 11, 2023
2 changes: 1 addition & 1 deletion python/lib/core/dmod/core/_version.py
@@ -1 +1 @@
-__version__ = '0.8.0'
+__version__ = '0.9.0'
57 changes: 52 additions & 5 deletions python/lib/core/dmod/core/dataset.py
@@ -54,6 +54,48 @@ def is_file_based(self) -> bool:
return self._is_file_based


class InitialDataAdder(ABC):
"""
Abstract type for adding initial data from some other source(s) to a dataset in the process of being created.

Abstract type for adding initial data to a new dataset, typically from some other source(s) of data, and frequently
when some logic has to be applied to the data before it can be added. This logic may include things like
transforming formats, extracting subsets, and/or combining separate data together.

Note that it is important (and this is where "initial data" really applies) that subtypes are implemented so that
an instance **does not** expect its applicable ::class:`Dataset` to exist at the time the instance is created. Only
when ::method:`add_initial_data` is invoked must the dataset exist.
"""

def __init__(self, dataset_name: str, dataset_manager: 'DatasetManager', *args, **kwargs):
"""
Initialize.

Parameters
----------
dataset_name : str
The name of the new dataset to which this instance will add initial data.
dataset_manager : DatasetManager
The manager of the new dataset to which this instance will add initial data.
args
kwargs
"""
self._dataset_name: str = dataset_name
self._dataset_manager: DatasetManager = dataset_manager

@abstractmethod
def add_initial_data(self):
"""
Assemble and add the initial data.

Raises
-------
DmodRuntimeError
Raised when initial data could not be assembled and/or added successfully to the dataset.
"""
pass
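
For illustration, a minimal sketch of a concrete subtype: a hypothetical `DirectoryDataAdder` (not part of this PR) that pushes every file under a local directory into the new dataset. It assumes the `DatasetManager.add_data(dataset_name, dest, data) -> bool` interface method and that `DmodRuntimeError` is importable from `dmod.core.exception`:

```python
from pathlib import Path

from dmod.core.dataset import DatasetManager, InitialDataAdder
from dmod.core.exception import DmodRuntimeError  # assumed import path for this exception


class DirectoryDataAdder(InitialDataAdder):
    """Hypothetical adder that copies every file under a local directory into the new dataset."""

    def __init__(self, dataset_name: str, dataset_manager: DatasetManager, source_dir: Path, *args, **kwargs):
        super().__init__(dataset_name, dataset_manager, *args, **kwargs)
        self._source_dir = source_dir

    def add_initial_data(self):
        # Per the contract above, the dataset must exist by the time this runs, even
        # though it did not need to exist when this adder instance was constructed
        for path in (p for p in self._source_dir.rglob('*') if p.is_file()):
            item_name = str(path.relative_to(self._source_dir))
            # add_data returning a bool is assumed here from the DatasetManager interface
            if not self._dataset_manager.add_data(dataset_name=self._dataset_name, dest=item_name,
                                                  data=path.read_bytes()):
                raise DmodRuntimeError("Could not add {} to dataset {}".format(item_name, self._dataset_name))
```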


class Dataset(Serializable):
"""
Representation of the descriptive metadata for a grouped collection of data.
@@ -463,14 +505,18 @@ def combine_partials_into_composite(self, dataset_name: str, item_name: str, com

@abstractmethod
def create(self, name: str, category: DataCategory, domain: DataDomain, is_read_only: bool,
-           initial_data: Optional[str] = None) -> Dataset:
+           initial_data: Optional[InitialDataAdder] = None, expires_on: Optional[datetime] = None) -> Dataset:
"""
-Create a new dataset instance.
+Create a new dataset instance, optionally inserting data.

Implementations should ensure that a returned dataset is ready for use. That is, existing data at the time of
creation is accessible according to the dataset's metadata, and (when appropriate) the dataset is ready for
receiving output written to it.

Note that ``initial_data`` allows for optionally adding data to the dataset as it is created. Implementations
should ensure that, if ``initial_data`` is not ``None``, the expected dataset survives this method only if the
addition was successful (i.e., it is either never created or removed if not).

Parameters
----------
name : str
@@ -481,9 +527,10 @@ def create(self, name: str, category: DataCategory, domain: DataDomain, is_read_
The data domain for the new dataset, which includes the format, fields, and restrictions on values.
is_read_only : bool
Whether the new dataset is read-only.
-initial_data : Optional[str]
-    Optional string representation of a location in which there is initial data that should be added to the
-    dataset.
+initial_data : Optional[InitialDataAdder]
+    Optional means for initially adding data as the dataset is created.
+expires_on : Optional[datetime]
+    Optional point when the dataset (initially) expires, if it should be temporary.

Returns
-------
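
As a hedged illustration of this contract from the caller's side (`manager`, `domain`, and `DirectoryDataAdder` are the hypothetical names from the sketch above, not part of the PR):

```python
from pathlib import Path

from dmod.core.meta_data import DataCategory

# The adder is constructed before the dataset exists; per the InitialDataAdder
# contract, the dataset only has to exist once add_initial_data() is invoked
adder = DirectoryDataAdder(dataset_name='forcings-v1', dataset_manager=manager,
                           source_dir=Path('/data/staged_forcings'))

# create() builds the dataset, invokes adder.add_initial_data(), and, per the
# documented contract, removes the dataset again if that addition fails
dataset = manager.create(name='forcings-v1', category=DataCategory.FORCING,
                         domain=domain, is_read_only=True, initial_data=adder)
```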
2 changes: 1 addition & 1 deletion python/lib/modeldata/dmod/modeldata/_version.py
@@ -1 +1 @@
-__version__ = '0.9.1'
+__version__ = '0.9.2'
46 changes: 30 additions & 16 deletions python/lib/modeldata/dmod/modeldata/data/object_store_manager.py
@@ -4,7 +4,7 @@
import minio.retention

from dmod.core.meta_data import DataCategory, DataDomain
-from dmod.core.dataset import Dataset, DatasetManager, DatasetType
+from dmod.core.dataset import Dataset, DatasetManager, DatasetType, InitialDataAdder
from datetime import datetime, timedelta
from minio import Minio
from minio.api import ObjectWriteResult
@@ -282,11 +282,15 @@ def combine_partials_into_composite(self, dataset_name: str, item_name: str, com
except Exception as e:
return False

# TODO: adjust signature to give "is_read_only" a default of False
def create(self, name: str, category: DataCategory, domain: DataDomain, is_read_only: bool,
-           initial_data: Optional[str] = None) -> Dataset:
+           initial_data: Optional[InitialDataAdder] = None, expires_on: Optional[datetime] = None) -> Dataset:
"""
Create a new ::class:`Dataset` instance and, if needed, a backing object store bucket of the same name.

Note that, if ``initial_data`` is not ``None``, the expected dataset will survive this method only if the
addition of the initial data was successful (i.e., the dataset is either never created or removed if not).

Parameters
----------
name : str
@@ -297,8 +301,10 @@ def create(self, name: str, category: DataCategory, domain: DataDomain, is_read_
The data domain for the new dataset, which includes the format, fields, and restrictions on values.
is_read_only : bool
Whether the new dataset is read-only.
-initial_data : Optional[str]
-    Optional string form of a path to a directory containing initial data that should be added to the dataset.
+initial_data : Optional[InitialDataAdder]
+    Optional means for initially adding data as the dataset is created.
+expires_on : Optional[datetime]
+    Optional point when the dataset (initially) expires, if it should be temporary.

Returns
-------
@@ -309,13 +315,7 @@ def create(self, name: str, category: DataCategory, domain: DataDomain, is_read_
raise RuntimeError("Cannot create new dataset with name {}: name already in use".format(name))
if self._client.bucket_exists(name):
raise RuntimeError("Unexpected existing bucket when creating dataset {}".format(name))

-        files_dir = None
-        if initial_data is not None:
-            files_dir = Path(initial_data)
-            if not files_dir.is_dir():
-                raise RuntimeError("Invalid param for initial dataset data: {} not a directory".format(files_dir))
-        elif is_read_only:
+        if is_read_only and initial_data is None:
msg = "Attempting to create read-only dataset {} without supplying it with any initial data"
raise RuntimeError(msg.format(name))

@@ -330,12 +330,26 @@ def create(self, name: str, category: DataCategory, domain: DataDomain, is_read_
access_loc = "{}://{}/{}".format('https' if self._secure_connection else 'http', self._obj_store_host_str, name)
dataset = Dataset(name=name, category=category, data_domain=domain, dataset_type=DatasetType.OBJECT_STORE,
manager=self, access_location=access_loc, is_read_only=is_read_only, created_on=created_on,
-                          last_updated=created_on)
+                          last_updated=created_on, expires_on=expires_on)

# Once dataset is added to ``datasets``, it's "managed," so calls to add_data, delete, etc., should work
self.datasets[name] = dataset
-        if files_dir is not None:
-            self._push_files(bucket_name=name, dir_path=files_dir, recursive=True)
-        self.persist_serialized(name)
-        return dataset

+        # Put in a try block to make sure the dataset only remains if adding data worked as needed (if applicable)
+        try:
+            if initial_data is not None:
+                initial_data.add_initial_data()
+
+            # TODO: (later) consider whether dataset should not be deleted if everything else worked until this point
+            # Then update the persisted state file and return
+            self.persist_serialized(name)
+
+            return dataset
+        # If we ran into any trouble adding initial data, then bail, cleaning up the dataset and backing storage
+        except Exception as e:
+            # Since the dataset is "managed," we can call delete()
+            self.delete(dataset=dataset)
+            raise e
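
To make the rollback path concrete, a sketch of what a failing adder looks like from the caller's side (`FailingAdder`, `manager`, and `domain` are illustrative stand-ins, not part of the PR):

```python
class FailingAdder(InitialDataAdder):
    """Made-up adder that always fails, to demonstrate the cleanup behavior."""

    def add_initial_data(self):
        raise DmodRuntimeError("simulated failure while assembling initial data")


try:
    manager.create(name='doomed-dataset', category=DataCategory.FORCING, domain=domain,
                   is_read_only=False, initial_data=FailingAdder('doomed-dataset', manager))
except DmodRuntimeError:
    # create() deletes the dataset (and its backing bucket) before re-raising, so
    # no half-initialized dataset should remain registered with the manager
    assert 'doomed-dataset' not in manager.datasets
```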

@property
def data_chunking_params(self) -> Optional[Tuple[str, str]]:
2 changes: 1 addition & 1 deletion python/lib/modeldata/setup.py
@@ -26,7 +26,7 @@
"fiona",
"geopandas",
"dmod-communication>=0.4.2",
"dmod-core>=0.3.0",
"dmod-core>=0.9.0",
"minio",
"aiohttp<=3.7.4",
"shapely>=2.0.0",