Skip to content

Commit

Permalink
feat: Add metadata attribute to datasets (#189)
Browse files Browse the repository at this point in the history
* Add metadata attribute to all datasets

Signed-off-by: Ahdra Merali <[email protected]>
  • Loading branch information
AhdraMeraliQB authored May 22, 2023
1 parent a9b274e commit de8b833
Show file tree
Hide file tree
Showing 38 changed files with 254 additions and 21 deletions.
2 changes: 1 addition & 1 deletion kedro-datasets/RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
* Added pandas 2.0 support.
* Added SQLAlchemy 2.0 support (and dropped support for versions below 1.4).
* Added a `save` method to `APIDataSet`.

* Reduced constructor arguments for `APIDataSet` by replacing most arguments with a single constructor argument `load_args`. This makes it more consistent with other Kedro DataSets and the underlying `requests` API, and automatically enables the full configuration domain: stream, certificates, proxies, and more.
* Relaxed Kedro version pin to `>=0.16`
* Added `metadata` attribute to all existing datasets. This is ignored by Kedro, but may be consumed by users or external plugins.

## Bug fixes and other changes
* Relaxed `delta-spark` upper bound to allow compatibility with Spark 3.1.x and 3.2.x.
Expand Down
18 changes: 10 additions & 8 deletions kedro-datasets/kedro_datasets/api/api_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,6 @@
from requests import Session, sessions
from requests.auth import AuthBase

# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0.
# Any contribution to datasets should be made in kedro-datasets
# in kedro-plugins (https://github.com/kedro-org/kedro-plugins)


class APIDataSet(AbstractDataSet[None, requests.Response]):
"""``APIDataSet`` loads/saves data from/to HTTP(S) APIs.
Expand All @@ -38,7 +34,7 @@ class APIDataSet(AbstractDataSet[None, requests.Response]):
Example usage for the `Python API <https://kedro.readthedocs.io/en/stable/data/\
data_catalog.html#use-the-data-catalog-with-the-code-api>`_: ::
>>> from kedro.extras.datasets.api import APIDataSet
>>> from kedro_datasets.api import APIDataSet
>>>
>>>
>>> data_set = APIDataSet(
Expand Down Expand Up @@ -99,6 +95,7 @@ def __init__(
load_args: Dict[str, Any] = None,
save_args: Dict[str, Any] = None,
credentials: Union[Tuple[str, str], List[str], AuthBase] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``APIDataSet`` to fetch data from an API endpoint.
Expand All @@ -108,12 +105,15 @@ def __init__(
methods
load_args: Additional parameters to be fed to requests.request.
https://requests.readthedocs.io/en/latest/api/#requests.request
credentials: Allows specifying secrets in credentials.yml.
Expected format is ``('login', 'password')`` if given as a tuple or
list. An ``AuthBase`` instance can be provided for more complex cases.
save_args: Options for saving data on server. Includes all parameters used
during load method. Adds an optional parameter, ``chunk_size`` which
determines the size of the package sent at each request.
credentials: Allows specifying secrets in credentials.yml.
Expected format is ``('login', 'password')`` if given as a tuple or list.
An ``AuthBase`` instance can be provided for more complex cases.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
Raises:
ValueError: if both ``auth`` in ``load_args`` and ``credentials`` are
specified.
Expand Down Expand Up @@ -153,6 +153,8 @@ def __init__(
**self._params,
}

self.metadata = metadata

@staticmethod
def _convert_type(value: Any):
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
from kedro.io.core import AbstractDataSet, get_filepath_str, get_protocol_and_path


class BioSequenceDataSet(AbstractDataSet[List, List]):
class BioSequenceDataSet(
AbstractDataSet[List, List]
): # pylint:disable=too-many-instance-attributes
r"""``BioSequenceDataSet`` loads and saves data to a sequence file.
Example:
Expand Down Expand Up @@ -47,6 +49,7 @@ def __init__(
save_args: Dict[str, Any] = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""
Creates a new instance of ``BioSequenceDataSet`` pointing
Expand All @@ -69,6 +72,8 @@ def __init__(
https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
All defaults are preserved, except `mode`, which is set to `r` when loading
and to `w` when saving.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
Note: Here you can find all supported file formats: https://biopython.org/wiki/SeqIO
"""
Expand Down Expand Up @@ -100,6 +105,8 @@ def __init__(
self._fs_open_args_load = _fs_open_args_load
self._fs_open_args_save = _fs_open_args_save

self.metadata = metadata

def _describe(self) -> Dict[str, Any]:
return {
"filepath": self._filepath,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/dask/parquet_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ def __init__(
save_args: Dict[str, Any] = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``ParquetDataSet`` pointing to concrete
parquet files.
Expand All @@ -109,11 +110,15 @@ def __init__(
E.g. for ``GCSFileSystem`` it should look like `{"token": None}`.
fs_args: Optional parameters to the backend file system driver:
https://docs.dask.org/en/latest/how-to/connect-to-remote-data.html#optional-parameters
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
self._filepath = filepath
self._fs_args = deepcopy(fs_args) or {}
self._credentials = deepcopy(credentials) or {}

self.metadata = metadata

# Handle default load and save arguments
self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS)
if load_args is not None:
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/email/message_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def __init__(
version: Version = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``EmailMessageDataSet`` pointing to a concrete text file
on a specific filesystem.
Expand Down Expand Up @@ -103,6 +104,8 @@ def __init__(
https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
All defaults are preserved, except `mode`, which is set to `r` when loading
and to `w` when saving.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_fs_args = deepcopy(fs_args) or {}
_fs_open_args_load = _fs_args.pop("open_args_load", {})
Expand All @@ -116,6 +119,8 @@ def __init__(
_fs_args.setdefault("auto_mkdir", True)
self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)

self.metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def __init__(
version: Version = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``GeoJSONDataSet`` pointing to a concrete GeoJSON file
on a specific filesystem fsspec.
Expand Down Expand Up @@ -85,6 +86,8 @@ def __init__(
Here you can find all available arguments for `open`:
https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
All defaults are preserved, except `mode`, which is set to `wb` when saving.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_fs_args = copy.deepcopy(fs_args) or {}
_fs_open_args_load = _fs_args.pop("open_args_load", {})
Expand All @@ -97,6 +100,8 @@ def __init__(

self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)

self.metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def __init__(
credentials: Dict[str, Any] = None,
save_args: Dict[str, Any] = None,
version: Version = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``HoloviewsWriter``.
Expand All @@ -70,6 +71,8 @@ def __init__(
``kedro.io.core.Version``. If its ``load`` attribute is
None, the latest version will be loaded. If its ``save``
attribute is None, save version will be autogenerated.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_credentials = deepcopy(credentials) or {}
_fs_args = deepcopy(fs_args) or {}
Expand All @@ -83,6 +86,8 @@ def __init__(
self._protocol = protocol
self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)

self.metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/json/json_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def __init__(
version: Version = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``JSONDataSet`` pointing to a concrete JSON file
on a specific filesystem.
Expand Down Expand Up @@ -86,6 +87,8 @@ def __init__(
https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
All defaults are preserved, except `mode`, which is set to `r` when loading
and to `w` when saving.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_fs_args = deepcopy(fs_args) or {}
_fs_open_args_load = _fs_args.pop("open_args_load", {})
Expand All @@ -99,6 +102,8 @@ def __init__(
_fs_args.setdefault("auto_mkdir", True)
self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)

self.metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ def __init__(
save_args: Dict[str, Any] = None,
version: Version = None,
overwrite: bool = False,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``MatplotlibWriter``.
Expand All @@ -140,6 +141,8 @@ def __init__(
overwrite: If True, any existing image files will be removed.
Only relevant when saving multiple Matplotlib objects at
once.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_credentials = deepcopy(credentials) or {}
_fs_args = deepcopy(fs_args) or {}
Expand All @@ -153,6 +156,8 @@ def __init__(
self._protocol = protocol
self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)

self.metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/networkx/gml_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def __init__(
version: Version = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``GMLDataSet``.
Expand All @@ -73,6 +74,8 @@ def __init__(
https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
All defaults are preserved, except `mode`, which is set to `r` when loading
and to `w` when saving.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_fs_args = deepcopy(fs_args) or {}
_fs_open_args_load = _fs_args.pop("open_args_load", {})
Expand All @@ -86,6 +89,8 @@ def __init__(
self._protocol = protocol
self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)

self.metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/networkx/graphml_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def __init__(
version: Version = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``GraphMLDataSet``.
Expand All @@ -72,6 +73,8 @@ def __init__(
https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
All defaults are preserved, except `mode`, which is set to `r` when loading
and to `w` when saving.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_fs_args = deepcopy(fs_args) or {}
_fs_open_args_load = _fs_args.pop("open_args_load", {})
Expand All @@ -85,6 +88,8 @@ def __init__(
self._protocol = protocol
self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)

self.metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/networkx/json_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def __init__(
version: Version = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``JSONDataSet``.
Expand All @@ -73,6 +74,8 @@ def __init__(
https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
All defaults are preserved, except `mode`, which is set to `r` when loading
and to `w` when saving.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_fs_args = deepcopy(fs_args) or {}
_fs_open_args_load = _fs_args.pop("open_args_load", {})
Expand All @@ -86,6 +89,8 @@ def __init__(
self._protocol = protocol
self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)

self.metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/pandas/csv_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def __init__(
version: Version = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``CSVDataSet`` pointing to a concrete CSV file
on a specific filesystem.
Expand All @@ -102,6 +103,8 @@ def __init__(
E.g. for ``GCSFileSystem`` it should look like `{"token": None}`.
fs_args: Extra arguments to pass into underlying filesystem class constructor
(e.g. `{"project": "my-project"}` for ``GCSFileSystem``).
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_fs_args = deepcopy(fs_args) or {}
_credentials = deepcopy(credentials) or {}
Expand All @@ -114,6 +117,8 @@ def __init__(
self._storage_options = {**_credentials, **_fs_args}
self._fs = fsspec.filesystem(self._protocol, **self._storage_options)

self.metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/pandas/excel_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ def __init__(
version: Version = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``ExcelDataSet`` pointing to a concrete Excel file
on a specific filesystem.
Expand Down Expand Up @@ -150,6 +151,8 @@ def __init__(
E.g. for ``GCSFileSystem`` it should look like `{"token": None}`.
fs_args: Extra arguments to pass into underlying filesystem class constructor
(e.g. `{"project": "my-project"}` for ``GCSFileSystem``).
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
Raises:
DataSetError: If versioning is enabled while in append mode.
Expand All @@ -165,6 +168,8 @@ def __init__(
self._storage_options = {**_credentials, **_fs_args}
self._fs = fsspec.filesystem(self._protocol, **self._storage_options)

self.metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
Loading

0 comments on commit de8b833

Please sign in to comment.