diff --git a/ingestify/application/dataset_store.py b/ingestify/application/dataset_store.py index e2032aa..f2c1934 100644 --- a/ingestify/application/dataset_store.py +++ b/ingestify/application/dataset_store.py @@ -17,6 +17,7 @@ Dataset, DatasetCollection, DatasetRepository, + DatasetResource, DraftFile, File, LoadedFile, @@ -205,14 +206,14 @@ def add_revision( def update_dataset( self, dataset: Dataset, - dataset_identifier: Identifier, + dataset_resource: DatasetResource, files: Dict[str, DraftFile], ): """The add_revision will also save the dataset.""" metadata_changed = False - # if dataset.update_from_identifier(dataset_identifier): - # self.dataset_repository.save(bucket=self.bucket, dataset=dataset) - # metadata_changed = True + if dataset.update_from_resource(dataset_resource): + self.dataset_repository.save(bucket=self.bucket, dataset=dataset) + metadata_changed = True self.add_revision(dataset, files) diff --git a/ingestify/application/loader.py b/ingestify/application/loader.py index 4131f3e..4b69b14 100644 --- a/ingestify/application/loader.py +++ b/ingestify/application/loader.py @@ -25,7 +25,7 @@ logger = logging.getLogger(__name__) -DEFAULT_CHUNK_SIZE = 100 +DEFAULT_CHUNK_SIZE = 1000 def to_batches(input_): @@ -52,7 +52,8 @@ def load_file( file_resource.file_id ) - if file_resource.json_content: + if file_resource.json_content is not None: + # Empty dictionary is allowed return DraftFile.from_input( file_=json.dumps(file_resource.json_content, indent=4), data_serialization_format="json", @@ -93,7 +94,7 @@ def __init__( def run(self): self.store.update_dataset( dataset=self.dataset, - dataset_identifier=Identifier(self.dataset_resource.dataset_resource_id), + dataset_resource=self.dataset_resource, files={ file_id: load_file(file_resource, dataset=self.dataset) for file_id, file_resource in self.dataset_resource.files.items() diff --git a/ingestify/domain/models/dataset/dataset.py b/ingestify/domain/models/dataset/dataset.py index d39bd05..a1309b3 100644 --- a/ingestify/domain/models/dataset/dataset.py +++ b/ingestify/domain/models/dataset/dataset.py @@ -53,18 +53,18 @@ def add_revision(self, revision: Revision): self.revisions.append(revision) self.updated_at = utcnow() - def update_from_identifier(self, dataset_identifier: Identifier) -> bool: + def update_from_resource(self, dataset_resource) -> bool: changed = False - if self.name != dataset_identifier.name: - self.name = dataset_identifier.name + if self.name != dataset_resource.name: + self.name = dataset_resource.name changed = True - if self.metadata != dataset_identifier.metadata: - self.metadata = dataset_identifier.metadata + if self.metadata != dataset_resource.metadata: + self.metadata = dataset_resource.metadata changed = True - if self.state != dataset_identifier.state: - self.state = dataset_identifier.state + if self.state != dataset_resource.state: + self.state = dataset_resource.state changed = True if changed: diff --git a/ingestify/domain/models/resources/dataset_resource.py b/ingestify/domain/models/resources/dataset_resource.py index b447b6e..67e8d4d 100644 --- a/ingestify/domain/models/resources/dataset_resource.py +++ b/ingestify/domain/models/resources/dataset_resource.py @@ -2,6 +2,8 @@ from datetime import datetime from typing import Optional, Callable, TYPE_CHECKING +from ingestify.exceptions import DuplicateFile + if TYPE_CHECKING: from ingestify.domain import DraftFile, File from ingestify.domain.models.dataset.dataset import DatasetState @@ -28,6 +30,12 @@ class FileResource: Callable[["FileResource", Optional["File"]], Optional["DraftFile"]] ] = None + def __post_init__(self): + if self.json_content is None and not self.url and not self.file_loader: + raise TypeError( + "You need to specify `json_content`, `url` or a custom `file_loader`" + ) + class DatasetResource: def __init__( @@ -55,7 +63,8 @@ def add_file( self, last_modified: datetime, data_feed_key: str, - data_spec_version: str, + # Some sources might not have a DataSpecVersion. Set a default + data_spec_version: str = "v1", json_content: Optional[dict] = None, url: Optional[str] = None, http_options: Optional[dict] = None,