Addition of the InitialDataAdder abstraction #383

Merged · 6 commits · Jul 11, 2023
2 changes: 1 addition & 1 deletion python/lib/core/dmod/core/_version.py
@@ -1 +1 @@
-__version__ = '0.8.0'
+__version__ = '0.9.0'
57 changes: 52 additions & 5 deletions python/lib/core/dmod/core/dataset.py
@@ -54,6 +54,48 @@ def is_file_based(self) -> bool:
return self._is_file_based


class InitialDataAdder(ABC):
"""
Abstract type for adding initial data from some other source(s) to a dataset in the process of being created.

Abstract type for adding initial data to a new dataset, typically from some other source(s) of data, and frequently
when some logic has to be applied to the data before it can be added. This logic may include things like
transforming formats, extracting subsets, and/or combining separate data together.

Note that it is important (and this is where "initial data" really applies) that subtypes are implemented so that
an instance **does not** expect its applicable ::class:`Dataset` to exist at the time the instance is created. Only
when ::method:`add_initial_data` is invoked must the dataset exist.
"""

def __init__(self, dataset_name: str, dataset_manager: 'DatasetManager', *args, **kwargs):
"""
Initialize.

Parameters
----------
dataset_name : str
The name of the new dataset to which this instance will add initial data.
dataset_manager : DatasetManager
The manager of the new dataset to which this instance will add initial data.
args
kwargs
"""
self._dataset_name: str = dataset_name
self._dataset_manager: DatasetManager = dataset_manager

@abstractmethod
def add_initial_data(self):
"""
Assemble and add the initial data.

Raises
-------
DmodRuntimeError
Raised when initial data could not be assembled and/or added successfully to the dataset.
"""
pass
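
For illustration, a minimal sketch of a concrete subtype: a hypothetical `DirectoryDataAdder` (not part of this PR) that pushes every file under a local directory into the new dataset. It assumes the `DatasetManager.add_data(dataset_name, dest, data) -> bool` interface method and that `DmodRuntimeError` is importable from `dmod.core.exception`:

```python
from pathlib import Path

from dmod.core.dataset import DatasetManager, InitialDataAdder
from dmod.core.exception import DmodRuntimeError  # assumed import path for this exception


class DirectoryDataAdder(InitialDataAdder):
    """Hypothetical adder that copies every file under a local directory into the new dataset."""

    def __init__(self, dataset_name: str, dataset_manager: DatasetManager, source_dir: Path, *args, **kwargs):
        super().__init__(dataset_name, dataset_manager, *args, **kwargs)
        self._source_dir = source_dir

    def add_initial_data(self):
        # Per the contract above, the dataset must exist by the time this runs, even
        # though it did not need to exist when this adder instance was constructed
        for path in (p for p in self._source_dir.rglob('*') if p.is_file()):
            item_name = str(path.relative_to(self._source_dir))
            # add_data returning a bool is assumed here from the DatasetManager interface
            if not self._dataset_manager.add_data(dataset_name=self._dataset_name, dest=item_name,
                                                  data=path.read_bytes()):
                raise DmodRuntimeError("Could not add {} to dataset {}".format(item_name, self._dataset_name))
```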


class Dataset(Serializable):
"""
Representation of the descriptive metadata for a grouped collection of data.
@@ -463,14 +505,18 @@ def combine_partials_into_composite(self, dataset_name: str, item_name: str, com

@abstractmethod
def create(self, name: str, category: DataCategory, domain: DataDomain, is_read_only: bool,
-           initial_data: Optional[str] = None) -> Dataset:
+           initial_data: Optional[InitialDataAdder] = None, expires_on: Optional[datetime] = None) -> Dataset:
"""
-Create a new dataset instance.
+Create a new dataset instance, optionally inserting data.

Implementations should ensure that a returned dataset is ready for use. That is, existing data at the time of
creation is accessible according to the dataset's metadata, and (when appropriate) the dataset is ready for
receiving output written to it.

Note that ``initial_data`` allows for optionally adding data to the dataset as it is created. Implementations
should ensure that, if ``initial_data`` is not ``None``, the expected dataset survives this method only if the
addition was successful (i.e., it is either never created or removed if not).

Parameters
----------
name : str
@@ -481,9 +527,10 @@ def create(self, name: str, category: DataCategory, domain: DataDomain, is_read_
The data domain for the new dataset, which includes the format, fields, and restrictions on values.
is_read_only : bool
Whether the new dataset is read-only.
-initial_data : Optional[str]
-    Optional string representation of a location in which there is initial data that should be added to the
-    dataset.
+initial_data : Optional[InitialDataAdder]
+    Optional means for initially adding data as the dataset is created.
+expires_on : Optional[datetime]
+    Optional point when the dataset (initially) expires, if it should be temporary.

Returns
-------
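
As a hedged illustration of this contract from the caller's side (`manager`, `domain`, and `DirectoryDataAdder` are the hypothetical names from the sketch above, not part of the PR):

```python
from pathlib import Path

from dmod.core.meta_data import DataCategory

# The adder is constructed before the dataset exists; per the InitialDataAdder
# contract, the dataset only has to exist once add_initial_data() is invoked
adder = DirectoryDataAdder(dataset_name='forcings-v1', dataset_manager=manager,
                           source_dir=Path('/data/staged_forcings'))

# create() builds the dataset, invokes adder.add_initial_data(), and, per the
# documented contract, removes the dataset again if that addition fails
dataset = manager.create(name='forcings-v1', category=DataCategory.FORCING,
                         domain=domain, is_read_only=True, initial_data=adder)
```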
2 changes: 1 addition & 1 deletion python/lib/modeldata/dmod/modeldata/_version.py
@@ -1 +1 @@
-__version__ = '0.9.1'
+__version__ = '0.9.2'
46 changes: 30 additions & 16 deletions python/lib/modeldata/dmod/modeldata/data/object_store_manager.py
@@ -4,7 +4,7 @@
import minio.retention

from dmod.core.meta_data import DataCategory, DataDomain
-from dmod.core.dataset import Dataset, DatasetManager, DatasetType
+from dmod.core.dataset import Dataset, DatasetManager, DatasetType, InitialDataAdder
from datetime import datetime, timedelta
from minio import Minio
from minio.api import ObjectWriteResult
@@ -282,11 +282,15 @@ def combine_partials_into_composite(self, dataset_name: str, item_name: str, com
except Exception as e:
return False

# TODO: adjust signature to give "is_read_only" a default of False
def create(self, name: str, category: DataCategory, domain: DataDomain, is_read_only: bool,
-           initial_data: Optional[str] = None) -> Dataset:
+           initial_data: Optional[InitialDataAdder] = None, expires_on: Optional[datetime] = None) -> Dataset:
"""
Create a new ::class:`Dataset` instance and, if needed, a backing object store bucket of the same name.

Note that, if ``initial_data`` is not ``None``, the expected dataset will survive this method only if the
addition of the initial data was successful (i.e., the dataset is either never created or removed if not).

Parameters
----------
name : str
@@ -297,8 +301,10 @@ def create(self, name: str, category: DataCategory, domain: DataDomain, is_read_
The data domain for the new dataset, which includes the format, fields, and restrictions on values.
is_read_only : bool
Whether the new dataset is read-only.
-initial_data : Optional[str]
-    Optional string form of a path to a directory containing initial data that should be added to the dataset.
+initial_data : Optional[InitialDataAdder]
+    Optional means for initially adding data as the dataset is created.
+expires_on : Optional[datetime]
+    Optional point when the dataset (initially) expires, if it should be temporary.

Returns
-------
@@ -309,13 +315,7 @@ def create(self, name: str, category: DataCategory, domain: DataDomain, is_read_
raise RuntimeError("Cannot create new dataset with name {}: name already in use".format(name))
if self._client.bucket_exists(name):
raise RuntimeError("Unexpected existing bucket when creating dataset {}".format(name))

-        files_dir = None
-        if initial_data is not None:
-            files_dir = Path(initial_data)
-            if not files_dir.is_dir():
-                raise RuntimeError("Invalid param for initial dataset data: {} not a directory".format(files_dir))
-        elif is_read_only:
+        if is_read_only and initial_data is None:
msg = "Attempting to create read-only dataset {} without supplying it with any initial data"
raise RuntimeError(msg.format(name))

@@ -330,12 +330,26 @@ def create(self, name: str, category: DataCategory, domain: DataDomain, is_read_
access_loc = "{}://{}/{}".format('https' if self._secure_connection else 'http', self._obj_store_host_str, name)
dataset = Dataset(name=name, category=category, data_domain=domain, dataset_type=DatasetType.OBJECT_STORE,
manager=self, access_location=access_loc, is_read_only=is_read_only, created_on=created_on,
-                          last_updated=created_on)
+                          last_updated=created_on, expires_on=expires_on)

# Once dataset is added to ``datasets``, it's "managed," so calls to add_data, delete, etc., should work
self.datasets[name] = dataset
-        if files_dir is not None:
-            self._push_files(bucket_name=name, dir_path=files_dir, recursive=True)
-        self.persist_serialized(name)
-        return dataset

+        # Put in a try block to make sure the dataset only remains if adding data worked as needed (if applicable)
+        try:
+            if initial_data is not None:
+                initial_data.add_initial_data()
+
+            # TODO: (later) consider whether dataset should not be deleted if everything else worked until this point
+            # Then update the persisted state file and return
+            self.persist_serialized(name)
+
+            return dataset
+        # If we ran into any trouble adding initial data, then bail, cleaning up the dataset and backing storage
+        except Exception as e:
+            # Since the dataset is "managed," we can call delete()
+            self.delete(dataset=dataset)
+            raise e
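
To make the rollback path concrete, a sketch of what a failing adder looks like from the caller's side (`FailingAdder`, `manager`, and `domain` are illustrative stand-ins, not part of the PR):

```python
class FailingAdder(InitialDataAdder):
    """Made-up adder that always fails, to demonstrate the cleanup behavior."""

    def add_initial_data(self):
        raise DmodRuntimeError("simulated failure while assembling initial data")


try:
    manager.create(name='doomed-dataset', category=DataCategory.FORCING, domain=domain,
                   is_read_only=False, initial_data=FailingAdder('doomed-dataset', manager))
except DmodRuntimeError:
    # create() deletes the dataset (and its backing bucket) before re-raising, so
    # no half-initialized dataset should remain registered with the manager
    assert 'doomed-dataset' not in manager.datasets
```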

@property
def data_chunking_params(self) -> Optional[Tuple[str, str]]:
2 changes: 1 addition & 1 deletion python/lib/modeldata/setup.py
@@ -26,7 +26,7 @@
"fiona",
"geopandas",
"dmod-communication>=0.4.2",
"dmod-core>=0.3.0",
"dmod-core>=0.9.0",
"minio",
"aiohttp<=3.7.4",
"shapely>=2.0.0",