From a4baa131d878d86e6b507fb88db6e3a85c835f66 Mon Sep 17 00:00:00 2001 From: Robert Bartel Date: Mon, 10 Jul 2023 10:42:40 -0400 Subject: [PATCH 1/6] Add InitialDataAdder abstract type and usage. Creating abstract InitialDataAdder type for use with creating datasets, and modifying Dataset.create abstract function to use this type for the 'initial_data' parameter, rather than a string that represents the location of some kind of initial data. --- python/lib/core/dmod/core/dataset.py | 57 +++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/python/lib/core/dmod/core/dataset.py b/python/lib/core/dmod/core/dataset.py index 24f595168..ce4ffc57b 100644 --- a/python/lib/core/dmod/core/dataset.py +++ b/python/lib/core/dmod/core/dataset.py @@ -54,6 +54,48 @@ def is_file_based(self) -> bool: return self._is_file_based +class InitialDataAdder(ABC): + """ + Abstract type for adding initial data from some other source(s) to a dataset in the process of being created. + + Abstract type for adding initial data to a new dataset, typically from some other source(s) of data, and frequently + when some logic has to be applied to the data before it can be added. This logic may include things like + transforming formats, extracting subsets, and/or combining separate data together. + + Note that it is important (and this is where "initial data" really applies) that subtypes are implemented so that + an instance **does not** expect its applicable ::class:`Dataset` to exist at the time the instance is created. Only + when ::method:`add_initial_data` is invoked must the dataset exist. + """ + + def __init__(self, dataset_name: str, dataset_manager: 'DatasetManager', *args, **kwargs): + """ + Initialize. + + Parameters + ---------- + dataset_name : str + The name of the new dataset to which this instance will add initial data. + dataset_manager : DatasetManager + The manager of the new dataset to which this instance will add initial data. 
+ args + kwargs + """ + self._dataset_name: str = dataset_name + self._dataset_manager: DatasetManager = dataset_manager + + @abstractmethod + def add_initial_data(self): + """ + Assemble and add the initial data. + + Raises + ------- + DmodRuntimeError + Raised when initial data could not be assembled and/or added successfully to the dataset. + """ + pass + + class Dataset(Serializable): """ Rrepresentation of the descriptive metadata for a grouped collection of data. @@ -463,14 +505,18 @@ def combine_partials_into_composite(self, dataset_name: str, item_name: str, com @abstractmethod def create(self, name: str, category: DataCategory, domain: DataDomain, is_read_only: bool, - initial_data: Optional[str] = None) -> Dataset: + initial_data: Optional[InitialDataAdder] = None, expires_on: Optional[datetime] = None) -> Dataset: """ - Create a new dataset instance. + Create a new dataset instance, optionally inserting data. Implementations should ensure that a returned dataset is ready for use. That is, existing data at the time of creation is accessible according to the dataset's metadata, and (when appropriate) the dataset is ready for receiving output written to it. + Note that ``initial_data`` allows for optionally adding data to the dataset as it is created. Implementations + should ensure that, if ``initial_data`` is not ``None``, the expected dataset survives this method only if the + addition was successful (i.e., it is either never created or removed if not). + Parameters ---------- name : str @@ -481,9 +527,10 @@ def create(self, name: str, category: DataCategory, domain: DataDomain, is_read_ The data domain for the new dataset, which includes the format, fields, and restrictions on values. is_read_only : bool Whether the new dataset is read-only. - initial_data : Optional[str] - Optional string representation of a location in which there is initial data that should be added to the - dataset. 
+ initial_data : Optional[InitialDataAdder] + Optional means for initially adding data as the dataset is created. + expires_on : Optional[datetime] + Optional point when the dataset (initially) expires, if it should be temporary. Returns ------- From 057807f206542b362a27c9b6eb75f0205cd95d16 Mon Sep 17 00:00:00 2001 From: Robert Bartel Date: Mon, 10 Jul 2023 10:44:05 -0400 Subject: [PATCH 2/6] Update ObjectStoreManager for InitialDataAdder. Updating DatasetManager implementation's create function to support usage of the InitialDataAdder abstraction for the 'initial_data' param. --- .../modeldata/data/object_store_manager.py | 42 ++++++++++++------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/python/lib/modeldata/dmod/modeldata/data/object_store_manager.py b/python/lib/modeldata/dmod/modeldata/data/object_store_manager.py index 33045f461..fbcfec2c5 100644 --- a/python/lib/modeldata/dmod/modeldata/data/object_store_manager.py +++ b/python/lib/modeldata/dmod/modeldata/data/object_store_manager.py @@ -4,7 +4,7 @@ import minio.retention from dmod.core.meta_data import DataCategory, DataDomain -from dmod.core.dataset import Dataset, DatasetManager, DatasetType +from dmod.core.dataset import Dataset, DatasetManager, DatasetType, InitialDataAdder from datetime import datetime, timedelta from minio import Minio from minio.api import ObjectWriteResult @@ -282,11 +282,15 @@ def combine_partials_into_composite(self, dataset_name: str, item_name: str, com except Exception as e: return False + # TODO: adjust signature to give "is_read_only" a default of False def create(self, name: str, category: DataCategory, domain: DataDomain, is_read_only: bool, - initial_data: Optional[str] = None) -> Dataset: + initial_data: Optional[InitialDataAdder] = None, expires_on: Optional[datetime] = None) -> Dataset: """ Create a new ::class:`Dataset` instance and, if needed, backing object store bucket of the same name. 
+ Note that, if ``initial_data`` is not ``None``, the expected dataset will survive this method only if the + addition of the initial data was successful (i.e., the dataset is either never created or removed if not). + Parameters ---------- name : str @@ -297,8 +301,10 @@ def create(self, name: str, category: DataCategory, domain: DataDomain, is_read_ The data domain for the new dataset, which includes the format, fields, and restrictions on values. is_read_only : bool Whether the new dataset is read-only. - initial_data : Optional[str] - Optional string form of a path to a directory containing initial data that should be added to the dataset. + initial_data : Optional[InitialDataAdder] + Optional means for initially adding data as the dataset is created. + expires_on : Optional[datetime] + Optional point when the dataset (initially) expires, if it should be temporary. Returns ------- @@ -309,13 +315,7 @@ def create(self, name: str, category: DataCategory, domain: DataDomain, is_read_ raise RuntimeError("Cannot create new dataset with name {}: name already in use".format(name)) if self._client.bucket_exists(name): raise RuntimeError("Unexpected existing bucket when creating dataset {}".format(name)) - - files_dir = None - if initial_data is not None: - files_dir = Path(initial_data) - if not files_dir.is_dir(): - raise RuntimeError("Invalid param for initial dataset data: {} not a directory".format(files_dir)) - elif is_read_only: + if is_read_only and initial_data is None: msg = "Attempting to create read-only dataset {} without supplying it with any initial data" raise RuntimeError(msg.format(name)) @@ -330,12 +330,22 @@ def create(self, name: str, category: DataCategory, domain: DataDomain, is_read_ access_loc = "{}://{}/{}".format('https' if self._secure_connection else 'http', self._obj_store_host_str, name) dataset = Dataset(name=name, category=category, data_domain=domain, dataset_type=DatasetType.OBJECT_STORE, manager=self, access_location=access_loc, 
is_read_only=is_read_only, created_on=created_on, - last_updated=created_on) + last_updated=created_on, expires_on=expires_on) self.datasets[name] = dataset - if files_dir is not None: - self._push_files(bucket_name=name, dir_path=files_dir, recursive=True) - self.persist_serialized(name) - return dataset + # Put in a try block to make sure the dataset only remains if adding data worked as needed (if applicable) + try: + if initial_data is not None: + initial_data.add_initial_data() + + # Then updated the persisted state file and return + self.persist_serialized(name) + return dataset + # If we ran into any trouble writing initial data to the dataset, then bail, cleaning up the dataset from the + # manager and the object store itself + except Exception as e: + self.datasets.pop(name) + self._client.remove_bucket(name) + raise e @property def data_chunking_params(self) -> Optional[Tuple[str, str]]: From c93220f94c1b0b24320eb056b7a156df3a05fdd7 Mon Sep 17 00:00:00 2001 From: Robert Bartel Date: Mon, 10 Jul 2023 10:59:31 -0400 Subject: [PATCH 3/6] Bump dmod.core version to 0.9.0. --- python/lib/core/dmod/core/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/lib/core/dmod/core/_version.py b/python/lib/core/dmod/core/_version.py index ccf9e6286..1658609d0 100644 --- a/python/lib/core/dmod/core/_version.py +++ b/python/lib/core/dmod/core/_version.py @@ -1 +1 @@ -__version__ = '0.8.0' \ No newline at end of file +__version__ = '0.9.0' \ No newline at end of file From 5ea4f92c688581a3a08f6980d36b65361558c52e Mon Sep 17 00:00:00 2001 From: Robert Bartel Date: Mon, 10 Jul 2023 11:00:39 -0400 Subject: [PATCH 4/6] Update dmod.modeldata dep for dmod.core. Updating to depend on core 0.9.0. 
--- python/lib/modeldata/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/lib/modeldata/setup.py b/python/lib/modeldata/setup.py index 5fc41e3b7..869c77080 100644 --- a/python/lib/modeldata/setup.py +++ b/python/lib/modeldata/setup.py @@ -26,7 +26,7 @@ "fiona", "geopandas", "dmod-communication>=0.4.2", - "dmod-core>=0.3.0", + "dmod-core>=0.9.0", "minio", "aiohttp<=3.7.4", "shapely>=2.0.0", From 7eef8e99e939beb4094d53aafe11e6646b0abc81 Mon Sep 17 00:00:00 2001 From: Robert Bartel Date: Mon, 10 Jul 2023 11:01:05 -0400 Subject: [PATCH 5/6] Bump dmod.modeldata version to 0.9.2. --- python/lib/modeldata/dmod/modeldata/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/lib/modeldata/dmod/modeldata/_version.py b/python/lib/modeldata/dmod/modeldata/_version.py index 102b47c9c..61816e5af 100644 --- a/python/lib/modeldata/dmod/modeldata/_version.py +++ b/python/lib/modeldata/dmod/modeldata/_version.py @@ -1 +1 @@ -__version__ = '0.9.1' \ No newline at end of file +__version__ = '0.9.2' \ No newline at end of file From b2847f081d0f181c94505a1e5b0b66ec5afedd6c Mon Sep 17 00:00:00 2001 From: Robert Bartel Date: Mon, 10 Jul 2023 16:28:44 -0400 Subject: [PATCH 6/6] Use delete w/ obj store mgr create() if add fails. Fixing ObjectStoreDatasetManager create() function so that it calls the delete() function if initial data fails to be added (and thus the dataset does not need to end up created); previous logic that just removed the bucket would fail if the bucket had anything in it. 
--- .../dmod/modeldata/data/object_store_manager.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/python/lib/modeldata/dmod/modeldata/data/object_store_manager.py b/python/lib/modeldata/dmod/modeldata/data/object_store_manager.py index fbcfec2c5..a11eab19f 100644 --- a/python/lib/modeldata/dmod/modeldata/data/object_store_manager.py +++ b/python/lib/modeldata/dmod/modeldata/data/object_store_manager.py @@ -331,20 +331,24 @@ def create(self, name: str, category: DataCategory, domain: DataDomain, is_read_ dataset = Dataset(name=name, category=category, data_domain=domain, dataset_type=DatasetType.OBJECT_STORE, manager=self, access_location=access_loc, is_read_only=is_read_only, created_on=created_on, last_updated=created_on, expires_on=expires_on) + + # Once dataset is added to ``datasets``, it's "managed," so calls to add_data, delete, etc., should work self.datasets[name] = dataset + # Put in a try block to make sure the dataset only remains if adding data worked as needed (if applicable) try: if initial_data is not None: initial_data.add_initial_data() + # TODO: (later) consider whether dataset should not be deleted if everything else worked until this # Then updated the persisted state file and return self.persist_serialized(name) + return dataset - # If we ran into any trouble writing initial data to the dataset, then bail, cleaning up the dataset from the - # manager and the object store itself + # If we ran into any trouble adding initial data, then bail, cleaning up the dataset and backing storage except Exception as e: - self.datasets.pop(name) - self._client.remove_bucket(name) + # Since the dataset is "managed," we can call delete() + self.delete(dataset=dataset) raise e @property