From 78913d8d9fd78f80a58bc6dc27427831c4bab95d Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Tue, 10 Oct 2023 02:51:34 -0600 Subject: [PATCH 1/2] docs(datasets): blacken code in rst literal blocks (#362) Signed-off-by: Deepyaman Datta --- .pre-commit-config.yaml | 9 +++ .../kedro_datasets/api/api_dataset.py | 14 ++--- .../biosequence/biosequence_dataset.py | 12 ++-- .../kedro_datasets/dask/parquet_dataset.py | 14 ++--- .../databricks/managed_table_dataset.py | 11 ++-- .../kedro_datasets/email/message_dataset.py | 3 +- .../geopandas/geojson_dataset.py | 9 ++- .../holoviews/holoviews_writer.py | 3 +- .../kedro_datasets/json/json_dataset.py | 5 +- .../matplotlib/matplotlib_writer.py | 24 ++++--- .../kedro_datasets/networkx/gml_dataset.py | 3 +- .../networkx/graphml_dataset.py | 3 +- .../kedro_datasets/networkx/json_dataset.py | 3 +- .../kedro_datasets/pandas/csv_dataset.py | 6 +- .../pandas/deltatable_dataset.py | 7 ++- .../kedro_datasets/pandas/excel_dataset.py | 14 ++--- .../kedro_datasets/pandas/feather_dataset.py | 6 +- .../kedro_datasets/pandas/gbq_dataset.py | 15 +++-- .../kedro_datasets/pandas/generic_dataset.py | 8 +-- .../kedro_datasets/pandas/hdf_dataset.py | 8 +-- .../kedro_datasets/pandas/json_dataset.py | 6 +- .../kedro_datasets/pandas/parquet_dataset.py | 6 +- .../kedro_datasets/pandas/sql_dataset.py | 63 ++++++++++--------- .../kedro_datasets/pandas/xml_dataset.py | 6 +- .../kedro_datasets/pickle/pickle_dataset.py | 16 ++--- .../kedro_datasets/pillow/image_dataset.py | 3 +- .../kedro_datasets/plotly/json_dataset.py | 3 +- .../kedro_datasets/plotly/plotly_dataset.py | 15 ++--- .../kedro_datasets/polars/csv_dataset.py | 8 +-- .../kedro_datasets/polars/generic_dataset.py | 8 +-- .../kedro_datasets/redis/redis_dataset.py | 6 +- kedro-datasets/kedro_datasets/spark/README.md | 1 + .../spark/deltatable_dataset.py | 13 ++-- .../kedro_datasets/spark/spark_dataset.py | 16 ++--- .../spark/spark_hive_dataset.py | 18 +++--- .../spark/spark_jdbc_dataset.py | 25 ++++---- .../svmlight/svmlight_dataset.py | 3 +- .../tensorflow/tensorflow_model_dataset.py | 3 +- .../kedro_datasets/text/text_dataset.py | 3 +- .../kedro_datasets/tracking/json_dataset.py | 5 +- .../tracking/metrics_dataset.py | 5 +- .../kedro_datasets/video/video_dataset.py | 15 +++-- .../kedro_datasets/yaml/yaml_dataset.py | 5 +- 43 files changed, 236 insertions(+), 193 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e8804f2cb..38f6e6bfe 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,6 +15,15 @@ repos: - id: check-merge-conflict # Check for files that contain merge conflict strings. - id: debug-statements # Check for debugger imports and py37+ `breakpoint()` calls in python source. + - repo: https://github.com/adamchainz/blacken-docs + rev: 1.16.0 + hooks: + - id: blacken-docs + args: + - "--rst-literal-blocks" + additional_dependencies: + - black==22.12.0 + - repo: local hooks: - id: ruff-kedro-datasets diff --git a/kedro-datasets/kedro_datasets/api/api_dataset.py b/kedro-datasets/kedro_datasets/api/api_dataset.py index b40ab1640..438f3b976 100644 --- a/kedro-datasets/kedro_datasets/api/api_dataset.py +++ b/kedro-datasets/kedro_datasets/api/api_dataset.py @@ -37,7 +37,8 @@ class APIDataset(AbstractDataset[None, requests.Response]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.api import APIDataset >>> @@ -51,23 +52,22 @@ class APIDataset(AbstractDataset[None, requests.Response]): ... "commodity_desc": "CORN", ... "statisticcat_des": "YIELD", ... "agg_level_desc": "STATE", - ... "year": 2000 + ... "year": 2000, ... } ... }, - ... credentials=("username", "password") + ... credentials=("username", "password"), ... ) >>> data = dataset.load() ``APIDataset`` can also be used to save output on a remote server using HTTP(S) methods. - :: + + .. code-block:: pycon >>> example_table = '{"col1":["val1", "val2"], "col2":["val3", "val4"]}' >>> >>> dataset = APIDataset( - ... method = "POST", - ... url = "url_of_remote_server", - ... save_args = {"chunk_size":1} + ... method="POST", url="url_of_remote_server", save_args={"chunk_size": 1} ... ) >>> dataset.save(example_table) diff --git a/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py b/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py index ebd0722f5..89ea37fce 100644 --- a/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py +++ b/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py @@ -18,7 +18,8 @@ class BioSequenceDataset(AbstractDataset[List, List]): r"""``BioSequenceDataset`` loads and saves data to a sequence file. Example: - :: + + .. code-block:: pycon >>> from kedro_datasets.biosequence import BioSequenceDataset >>> from io import StringIO @@ -28,10 +29,13 @@ class BioSequenceDataset(AbstractDataset[List, List]): >>> raw_data = [] >>> for record in SeqIO.parse(StringIO(data), "fasta"): ... raw_data.append(record) + ... >>> - >>> dataset = BioSequenceDataset(filepath="ls_orchid.fasta", - ... load_args={"format": "fasta"}, - ... save_args={"format": "fasta"}) + >>> dataset = BioSequenceDataset( + ... filepath="ls_orchid.fasta", + ... load_args={"format": "fasta"}, + ... save_args={"format": "fasta"}, + ... ) >>> dataset.save(raw_data) >>> sequence_list = dataset.load() >>> diff --git a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py index 5ec39fed5..3d6626d3d 100644 --- a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py @@ -37,25 +37,25 @@ class ParquetDataset(AbstractDataset[dd.DataFrame, dd.DataFrame]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro.extras.datasets.dask import ParquetDataset >>> import pandas as pd >>> import dask.dataframe as dd >>> - >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 'col3': [[5, 6], [7, 8]]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [[5, 6], [7, 8]]}) >>> ddf = dd.from_pandas(data, npartitions=2) >>> >>> dataset = ParquetDataset( ... filepath="s3://bucket_name/path/to/folder", ... credentials={ - ... 'client_kwargs':{ - ... 'aws_access_key_id': 'YOUR_KEY', - ... 'aws_secret_access_key': 'YOUR SECRET', + ... "client_kwargs": { + ... "aws_access_key_id": "YOUR_KEY", + ... "aws_secret_access_key": "YOUR SECRET", ... } ... }, - ... save_args={"compression": "GZIP"} + ... save_args={"compression": "GZIP"}, ... ) >>> dataset.save(ddf) >>> reloaded = dataset.load() diff --git a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py index 33c7ef1d1..dd119559d 100644 --- a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py +++ b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py @@ -176,12 +176,13 @@ class ManagedTableDataset(AbstractVersionedDataset): .. code-block:: python from pyspark.sql import SparkSession - from pyspark.sql.types import (StructField, StringType, - IntegerType, StructType) + from pyspark.sql.types import StructField, StringType, IntegerType, StructType from kedro_datasets.databricks import ManagedTableDataset - schema = StructType([StructField("name", StringType(), True), - StructField("age", IntegerType(), True)]) - data = [('Alex', 31), ('Bob', 12), ('Clarke', 65), ('Dave', 29)] + + schema = StructType( + [StructField("name", StringType(), True), StructField("age", IntegerType(), True)] + ) + data = [("Alex", 31), ("Bob", 12), ("Clarke", 65), ("Dave", 29)] spark_df = SparkSession.builder.getOrCreate().createDataFrame(data, schema) dataset = ManagedTableDataset(table="names_and_ages") dataset.save(spark_df) diff --git a/kedro-datasets/kedro_datasets/email/message_dataset.py b/kedro-datasets/kedro_datasets/email/message_dataset.py index 1f45042fd..076bfd492 100644 --- a/kedro-datasets/kedro_datasets/email/message_dataset.py +++ b/kedro-datasets/kedro_datasets/email/message_dataset.py @@ -26,7 +26,8 @@ class EmailMessageDataset(AbstractVersionedDataset[Message, Message]): Note that ``EmailMessageDataset`` doesn't handle sending email messages. Example: - :: + + .. code-block:: pycon >>> from email.message import EmailMessage >>> diff --git a/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py b/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py index 56a8890a7..ab1e0e620 100644 --- a/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py +++ b/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py @@ -26,14 +26,17 @@ class GeoJSONDataset( allowed geopandas (pandas) options for loading and saving GeoJSON files. Example: - :: + + .. code-block:: pycon >>> import geopandas as gpd >>> from shapely.geometry import Point >>> from kedro_datasets.geopandas import GeoJSONDataset >>> - >>> data = gpd.GeoDataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 'col3': [5, 6]}, geometry=[Point(1,1), Point(2,4)]) + >>> data = gpd.GeoDataFrame( + ... {"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}, + ... geometry=[Point(1, 1), Point(2, 4)], + ... ) >>> dataset = GeoJSONDataset(filepath="test.geojson", save_args=None) >>> dataset.save(data) >>> reloaded = dataset.load() diff --git a/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py b/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py index 7d64b8bf6..18e817c9b 100644 --- a/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py +++ b/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py @@ -21,7 +21,8 @@ class HoloviewsWriter(AbstractVersionedDataset[HoloViews, NoReturn]): filesystem (e.g. local, S3, GCS). Example: - :: + + .. code-block:: pycon >>> import holoviews as hv >>> from kedro_datasets.holoviews import HoloviewsWriter diff --git a/kedro-datasets/kedro_datasets/json/json_dataset.py b/kedro-datasets/kedro_datasets/json/json_dataset.py index 341e13933..418355ea9 100644 --- a/kedro-datasets/kedro_datasets/json/json_dataset.py +++ b/kedro-datasets/kedro_datasets/json/json_dataset.py @@ -34,11 +34,12 @@ class JSONDataset(AbstractVersionedDataset[Any, Any]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.json import JSONDataset >>> - >>> data = {'col1': [1, 2], 'col2': [4, 5], 'col3': [5, 6]} + >>> data = {"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]} >>> >>> dataset = JSONDataset(filepath="test.json") >>> dataset.save(data) diff --git a/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py b/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py index 568928caf..bea1cde1c 100644 --- a/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py +++ b/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py @@ -37,21 +37,21 @@ class MatplotlibWriter( Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> import matplotlib.pyplot as plt >>> from kedro_datasets.matplotlib import MatplotlibWriter >>> >>> fig = plt.figure() >>> plt.plot([1, 2, 3]) - >>> plot_writer = MatplotlibWriter( - ... filepath="data/08_reporting/output_plot.png" - ... ) + >>> plot_writer = MatplotlibWriter(filepath="data/08_reporting/output_plot.png") >>> plt.close() >>> plot_writer.save(fig) Example saving a plot as a PDF file: - :: + + .. code-block:: pycon >>> import matplotlib.pyplot as plt >>> from kedro_datasets.matplotlib import MatplotlibWriter @@ -66,7 +66,8 @@ class MatplotlibWriter( >>> pdf_plot_writer.save(fig) Example saving multiple plots in a folder, using a dictionary: - :: + + .. code-block:: pycon >>> import matplotlib.pyplot as plt >>> from kedro_datasets.matplotlib import MatplotlibWriter @@ -77,13 +78,12 @@ class MatplotlibWriter( ... plt.plot([1, 2, 3], color=colour) ... >>> plt.close("all") - >>> dict_plot_writer = MatplotlibWriter( - ... filepath="data/08_reporting/plots" - ... ) + >>> dict_plot_writer = MatplotlibWriter(filepath="data/08_reporting/plots") >>> dict_plot_writer.save(plots_dict) Example saving multiple plots in a folder, using a list: - :: + + .. code-block:: pycon >>> import matplotlib.pyplot as plt >>> from kedro_datasets.matplotlib import MatplotlibWriter @@ -94,9 +94,7 @@ class MatplotlibWriter( ... plt.plot([i, i + 1, i + 2]) ... >>> plt.close("all") - >>> list_plot_writer = MatplotlibWriter( - ... filepath="data/08_reporting/plots" - ... ) + >>> list_plot_writer = MatplotlibWriter(filepath="data/08_reporting/plots") >>> list_plot_writer.save(plots_list) """ diff --git a/kedro-datasets/kedro_datasets/networkx/gml_dataset.py b/kedro-datasets/kedro_datasets/networkx/gml_dataset.py index f4d63e87e..1fd26a7d3 100644 --- a/kedro-datasets/kedro_datasets/networkx/gml_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/gml_dataset.py @@ -22,7 +22,8 @@ class GMLDataset(AbstractVersionedDataset[networkx.Graph, networkx.Graph]): See https://networkx.org/documentation/stable/tutorial.html for details. Example: - :: + + .. code-block:: pycon >>> from kedro_datasets.networkx import GMLDataset >>> import networkx as nx diff --git a/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py b/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py index 0a368f505..a797b948d 100644 --- a/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py @@ -21,7 +21,8 @@ class GraphMLDataset(AbstractVersionedDataset[networkx.Graph, networkx.Graph]): See https://networkx.org/documentation/stable/tutorial.html for details. Example: - :: + + .. code-block:: pycon >>> from kedro_datasets.networkx import GraphMLDataset >>> import networkx as nx diff --git a/kedro-datasets/kedro_datasets/networkx/json_dataset.py b/kedro-datasets/kedro_datasets/networkx/json_dataset.py index 4a41f9a67..55301faca 100644 --- a/kedro-datasets/kedro_datasets/networkx/json_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/json_dataset.py @@ -22,7 +22,8 @@ class JSONDataset(AbstractVersionedDataset[networkx.Graph, networkx.Graph]): See https://networkx.org/documentation/stable/tutorial.html for details. Example: - :: + + .. code-block:: pycon >>> from kedro_datasets.networkx import JSONDataset >>> import networkx as nx diff --git a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py index 543035238..557beaf4f 100644 --- a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py @@ -52,13 +52,13 @@ class CSVDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.pandas import CSVDataset >>> import pandas as pd >>> - >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 'col3': [5, 6]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> >>> dataset = CSVDataset(filepath="test.csv") >>> dataset.save(data) diff --git a/kedro-datasets/kedro_datasets/pandas/deltatable_dataset.py b/kedro-datasets/kedro_datasets/pandas/deltatable_dataset.py index 4581312c5..c009fe92d 100644 --- a/kedro-datasets/kedro_datasets/pandas/deltatable_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/deltatable_dataset.py @@ -61,19 +61,20 @@ class DeltaTableDataset(AbstractDataset): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.pandas import DeltaTableDataset >>> import pandas as pd >>> - >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], 'col3': [5, 6]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> dataset = DeltaTableDataset(filepath="test") >>> >>> dataset.save(data) >>> reloaded = dataset.load() >>> assert data.equals(reloaded) >>> - >>> new_data = pd.DataFrame({'col1': [7, 8], 'col2': [9, 10], 'col3': [11, 12]}) + >>> new_data = pd.DataFrame({"col1": [7, 8], "col2": [9, 10], "col3": [11, 12]}) >>> dataset.save(new_data) >>> dataset.get_loaded_version() diff --git a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py index 6f4b0ff27..048130464 100644 --- a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py @@ -56,13 +56,13 @@ class ExcelDataset( Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.pandas import ExcelDataset >>> import pandas as pd >>> - >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 'col3': [5, 6]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> >>> dataset = ExcelDataset(filepath="test.xlsx") >>> dataset.save(data) @@ -90,16 +90,16 @@ class ExcelDataset( `Python API `_ for a multi-sheet Excel file: - :: + + .. code-block:: pycon >>> from kedro_datasets.pandas import ExcelDataset >>> import pandas as pd >>> - >>> dataframe = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 'col3': [5, 6]}) + >>> dataframe = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> another_dataframe = pd.DataFrame({"x": [10, 20], "y": ["hello", "world"]}) >>> multiframe = {"Sheet1": dataframe, "Sheet2": another_dataframe} - >>> dataset = ExcelDataset(filepath="test.xlsx", load_args = {"sheet_name": None}) + >>> dataset = ExcelDataset(filepath="test.xlsx", load_args={"sheet_name": None}) >>> dataset.save(multiframe) >>> reloaded = dataset.load() >>> assert multiframe["Sheet1"].equals(reloaded["Sheet1"]) diff --git a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py index 41995dda4..cfca7ce59 100644 --- a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py @@ -51,13 +51,13 @@ class FeatherDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.pandas import FeatherDataset >>> import pandas as pd >>> - >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 'col3': [5, 6]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> >>> dataset = FeatherDataset(filepath="test.feather") >>> diff --git a/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py b/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py index d672cae0c..11ace04ee 100644 --- a/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py @@ -45,17 +45,15 @@ class GBQTableDataset(AbstractDataset[None, pd.DataFrame]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.pandas import GBQTableDataset >>> import pandas as pd >>> - >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - >>> 'col3': [5, 6]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> - >>> dataset = GBQTableDataset('dataset', - >>> 'table_name', - >>> project='my-project') + >>> dataset = GBQTableDataset("dataset", "table_name", project="my-project") >>> dataset.save(data) >>> reloaded = dataset.load() >>> @@ -196,13 +194,14 @@ class GBQQueryDataset(AbstractDataset[None, pd.DataFrame]): Example using Python API: - :: + + .. code-block:: pycon >>> from kedro_datasets.pandas import GBQQueryDataset >>> >>> sql = "SELECT * FROM dataset_1.table_a" >>> - >>> dataset = GBQQueryDataset(sql, project='my-project') + >>> dataset = GBQQueryDataset(sql, project="my-project") >>> >>> sql_data = dataset.load() >>> diff --git a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py index 987d79be7..f8e813a74 100644 --- a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py @@ -64,15 +64,15 @@ class GenericDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.pandas import GenericDataset >>> import pandas as pd >>> - >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 'col3': [5, 6]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> - >>> dataset = GenericDataset(filepath="test.csv", file_format='csv') + >>> dataset = GenericDataset(filepath="test.csv", file_format="csv") >>> dataset.save(data) >>> reloaded = dataset.load() >>> assert data.equals(reloaded) diff --git a/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py b/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py index 73870e56a..5d9a6bc16 100644 --- a/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py @@ -36,15 +36,15 @@ class HDFDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.pandas import HDFDataset >>> import pandas as pd >>> - >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 'col3': [5, 6]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> - >>> dataset = HDFDataset(filepath="test.h5", key='data') + >>> dataset = HDFDataset(filepath="test.h5", key="data") >>> dataset.save(data) >>> reloaded = dataset.load() >>> assert data.equals(reloaded) diff --git a/kedro-datasets/kedro_datasets/pandas/json_dataset.py b/kedro-datasets/kedro_datasets/pandas/json_dataset.py index f480f0754..cfc53d627 100644 --- a/kedro-datasets/kedro_datasets/pandas/json_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/json_dataset.py @@ -47,13 +47,13 @@ class JSONDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.pandas import JSONDataset >>> import pandas as pd >>> - >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 'col3': [5, 6]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> >>> dataset = JSONDataset(filepath="test.json") >>> dataset.save(data) diff --git a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py index b132d69b3..6cd862379 100644 --- a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py @@ -58,13 +58,13 @@ class ParquetDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.pandas import ParquetDataset >>> import pandas as pd >>> - >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 'col3': [5, 6]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> >>> dataset = ParquetDataset(filepath="test.parquet") >>> dataset.save(data) diff --git a/kedro-datasets/kedro_datasets/pandas/sql_dataset.py b/kedro-datasets/kedro_datasets/pandas/sql_dataset.py index beb25fb3f..5bad6e98b 100644 --- a/kedro-datasets/kedro_datasets/pandas/sql_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/sql_dataset.py @@ -127,19 +127,16 @@ class SQLTableDataset(AbstractDataset[pd.DataFrame, pd.DataFrame]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.pandas import SQLTableDataset >>> import pandas as pd >>> - >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], - ... "col3": [5, 6]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> table_name = "table_a" - >>> credentials = { - ... "con": "postgresql://scott:tiger@localhost/test" - ... } - >>> data_set = SQLTableDataset(table_name=table_name, - ... credentials=credentials) + >>> credentials = {"con": "postgresql://scott:tiger@localhost/test"} + >>> data_set = SQLTableDataset(table_name=table_name, credentials=credentials) >>> >>> data_set.save(data) >>> reloaded = data_set.load() @@ -311,44 +308,48 @@ class SQLQueryDataset(AbstractDataset[None, pd.DataFrame]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.pandas import SQLQueryDataset >>> import pandas as pd >>> - >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], - ... "col3": [5, 6]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> sql = "SELECT * FROM table_a" - >>> credentials = { - ... "con": "postgresql://scott:tiger@localhost/test" - ... } - >>> data_set = SQLQueryDataset(sql=sql, - ... credentials=credentials) + >>> credentials = {"con": "postgresql://scott:tiger@localhost/test"} + >>> data_set = SQLQueryDataset(sql=sql, credentials=credentials) >>> >>> sql_data = data_set.load() Example of usage for mssql: - :: + + .. code-block:: pycon - >>> credentials = {"server": "localhost", "port": "1433", - ... "database": "TestDB", "user": "SA", - ... "password": "StrongPassword"} + >>> credentials = { + ... "server": "localhost", + ... "port": "1433", + ... "database": "TestDB", + ... "user": "SA", + ... "password": "StrongPassword", + ... } >>> def _make_mssql_connection_str( - ... server: str, port: str, database: str, user: str, password: str + ... server: str, port: str, database: str, user: str, password: str ... ) -> str: - ... import pyodbc # noqa - ... from sqlalchemy.engine import URL # noqa - ... - ... driver = pyodbc.drivers()[-1] - ... connection_str = (f"DRIVER={driver};SERVER={server},{port};DATABASE={database};" - ... f"ENCRYPT=yes;UID={user};PWD={password};" - ... f"TrustServerCertificate=yes;") - ... return URL.create("mssql+pyodbc", query={"odbc_connect": connection_str}) + ... import pyodbc # noqa + ... from sqlalchemy.engine import URL # noqa + ... driver = pyodbc.drivers()[-1] + ... connection_str = ( + ... f"DRIVER={driver};SERVER={server},{port};DATABASE={database};" + ... f"ENCRYPT=yes;UID={user};PWD={password};" + ... f"TrustServerCertificate=yes;" + ... ) + ... return URL.create("mssql+pyodbc", query={"odbc_connect": connection_str}) ... >>> connection_str = _make_mssql_connection_str(**credentials) - >>> data_set = SQLQueryDataset(credentials={"con": connection_str}, - ... sql="SELECT TOP 5 * FROM TestTable;") + >>> data_set = SQLQueryDataset( + ... credentials={"con": connection_str}, sql="SELECT TOP 5 * FROM TestTable;" + ... ) >>> df = data_set.load() In addition, here is an example of a catalog with dates parsing: diff --git a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py index fa3fe1de4..70196bd63 100644 --- a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py @@ -30,13 +30,13 @@ class XMLDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.pandas import XMLDataset >>> import pandas as pd >>> - >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 'col3': [5, 6]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> >>> dataset = XMLDataset(filepath="test.xml") >>> dataset.save(data) diff --git a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py index b28103e7e..21d3b8c71 100644 --- a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py +++ b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py @@ -44,23 +44,25 @@ class PickleDataset(AbstractVersionedDataset[Any, Any]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.pickle import PickleDataset >>> import pandas as pd >>> - >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 'col3': [5, 6]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> >>> dataset = PickleDataset(filepath="test.pkl", backend="pickle") >>> dataset.save(data) >>> reloaded = dataset.load() >>> assert data.equals(reloaded) >>> - >>> dataset = PickleDataset(filepath="test.pickle.lz4", - ... backend="compress_pickle", - ... load_args={"compression":"lz4"}, - ... save_args={"compression":"lz4"}) + >>> dataset = PickleDataset( + ... filepath="test.pickle.lz4", + ... backend="compress_pickle", + ... load_args={"compression": "lz4"}, + ... save_args={"compression": "lz4"}, + ... ) >>> dataset.save(data) >>> reloaded = dataset.load() >>> assert data.equals(reloaded) diff --git a/kedro-datasets/kedro_datasets/pillow/image_dataset.py b/kedro-datasets/kedro_datasets/pillow/image_dataset.py index 161ff9dc5..c7f7fbeaa 100644 --- a/kedro-datasets/kedro_datasets/pillow/image_dataset.py +++ b/kedro-datasets/kedro_datasets/pillow/image_dataset.py @@ -21,7 +21,8 @@ class ImageDataset(AbstractVersionedDataset[Image.Image, Image.Image]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.pillow import ImageDataset >>> diff --git a/kedro-datasets/kedro_datasets/plotly/json_dataset.py b/kedro-datasets/kedro_datasets/plotly/json_dataset.py index 68e5ad9a5..cb7f2d1e7 100644 --- a/kedro-datasets/kedro_datasets/plotly/json_dataset.py +++ b/kedro-datasets/kedro_datasets/plotly/json_dataset.py @@ -36,7 +36,8 @@ class JSONDataset( Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.plotly import JSONDataset >>> import plotly.express as px diff --git a/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py b/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py index a30e62f0d..2983233fe 100644 --- a/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py +++ b/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py @@ -46,24 +46,25 @@ class PlotlyDataset(JSONDataset): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.plotly import PlotlyDataset >>> import plotly.express as px >>> import pandas as pd >>> - >>> df_data = pd.DataFrame([[0, 1], [1, 0]], columns=('x1', 'x2')) + >>> df_data = pd.DataFrame([[0, 1], [1, 0]], columns=("x1", "x2")) >>> >>> dataset = PlotlyDataset( - ... filepath='scatter_plot.json', + ... filepath="scatter_plot.json", ... plotly_args={ - ... 'type': 'scatter', - ... 'fig': {'x': 'x1', 'y': 'x2'}, - ... } + ... "type": "scatter", + ... "fig": {"x": "x1", "y": "x2"}, + ... }, ... ) >>> dataset.save(df_data) >>> reloaded = dataset.load() - >>> assert px.scatter(df_data, x='x1', y='x2') == reloaded + >>> assert px.scatter(df_data, x="x1", y="x2") == reloaded """ diff --git a/kedro-datasets/kedro_datasets/polars/csv_dataset.py b/kedro-datasets/kedro_datasets/polars/csv_dataset.py index e2638107f..8a33e09a2 100644 --- a/kedro-datasets/kedro_datasets/polars/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/polars/csv_dataset.py @@ -50,15 +50,15 @@ class CSVDataset(AbstractVersionedDataset[pl.DataFrame, pl.DataFrame]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.polars import CSVDataset >>> import polars as pl >>> - >>> data = pl.DataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 'col3': [5, 6]}) + >>> data = pl.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> - >>> dataset = CSVDataset(filepath='test.csv') + >>> dataset = CSVDataset(filepath="test.csv") >>> dataset.save(data) >>> reloaded = dataset.load() >>> assert data.frame_equal(reloaded) diff --git a/kedro-datasets/kedro_datasets/polars/generic_dataset.py b/kedro-datasets/kedro_datasets/polars/generic_dataset.py index 5deceff44..aa6eedd48 100644 --- a/kedro-datasets/kedro_datasets/polars/generic_dataset.py +++ b/kedro-datasets/kedro_datasets/polars/generic_dataset.py @@ -36,15 +36,15 @@ class GenericDataset(AbstractVersionedDataset[pl.DataFrame, pl.DataFrame]): compression: "snappy" Example using Python API: - :: + + .. code-block:: pycon >>> from kedro_datasets.polars import GenericDataset >>> import polars as pl >>> - >>> data = pl.DataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 'col3': [5, 6]}) + >>> data = pl.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> - >>> dataset = GenericDataset(filepath='test.parquet', file_format='parquet') + >>> dataset = GenericDataset(filepath="test.parquet", file_format="parquet") >>> dataset.save(data) >>> reloaded = dataset.load() >>> assert data.frame_equal(reloaded) diff --git a/kedro-datasets/kedro_datasets/redis/redis_dataset.py b/kedro-datasets/kedro_datasets/redis/redis_dataset.py index 770ee98af..9979cf386 100644 --- a/kedro-datasets/kedro_datasets/redis/redis_dataset.py +++ b/kedro-datasets/kedro_datasets/redis/redis_dataset.py @@ -43,13 +43,13 @@ class PickleDataset(AbstractDataset[Any, Any]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.redis import PickleDataset >>> import pandas as pd >>> - >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 'col3': [5, 6]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> >>> my_data = PickleDataset(key="my_data") >>> my_data.save(data) diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md index 7400c3c47..a0bcef8e1 100644 --- a/kedro-datasets/kedro_datasets/spark/README.md +++ b/kedro-datasets/kedro_datasets/spark/README.md @@ -23,6 +23,7 @@ Supported file formats are: from kedro.framework.hooks import hook_impl from pyspark.sql import SparkSession + class SparkStreamsHook: @hook_impl def after_pipeline_run(self) -> None: diff --git a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py index f1b6a74b5..e5e40a9fe 100644 --- a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py @@ -35,18 +35,19 @@ class DeltaTableDataset(AbstractDataset[None, DeltaTable]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from pyspark.sql import SparkSession - >>> from pyspark.sql.types import (StructField, StringType, - ... IntegerType, StructType) + >>> from pyspark.sql.types import StructField, StringType, IntegerType, StructType >>> >>> from kedro.extras.datasets.spark import DeltaTableDataset, SparkDataset >>> - >>> schema = StructType([StructField("name", StringType(), True), - ... StructField("age", IntegerType(), True)]) + >>> schema = StructType( + ... [StructField("name", StringType(), True), StructField("age", IntegerType(), True)] + ... ) >>> - >>> data = [('Alex', 31), ('Bob', 12), ('Clarke', 65), ('Dave', 29)] + >>> data = [("Alex", 31), ("Bob", 12), ("Clarke", 65), ("Dave", 29)] >>> >>> spark_df = SparkSession.builder.getOrCreate().createDataFrame(data, schema) >>> diff --git a/kedro-datasets/kedro_datasets/spark/spark_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_dataset.py index 221e4e562..d83e3227a 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_dataset.py @@ -203,21 +203,21 @@ class SparkDataset(AbstractVersionedDataset[DataFrame, DataFrame]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from pyspark.sql import SparkSession - >>> from pyspark.sql.types import (StructField, StringType, - ... IntegerType, StructType) + >>> from pyspark.sql.types import StructField, StringType, IntegerType, StructType >>> >>> from kedro_datasets.spark import SparkDataset >>> - >>> schema = StructType([StructField("name", StringType(), True), - ... StructField("age", IntegerType(), True)]) + >>> schema = StructType( + ... [StructField("name", StringType(), True), StructField("age", IntegerType(), True)] + ... ) >>> - >>> data = [('Alex', 31), ('Bob', 12), ('Clarke', 65), ('Dave', 29)] + >>> data = [("Alex", 31), ("Bob", 12), ("Clarke", 65), ("Dave", 29)] >>> - >>> spark_df = SparkSession.builder.getOrCreate()\ - ... .createDataFrame(data, schema) + >>> spark_df = SparkSession.builder.getOrCreate().createDataFrame(data, schema) >>> >>> dataset = SparkDataset(filepath="test_data") >>> dataset.save(spark_df) diff --git a/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py index 33cc31f02..b7bd3363c 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py @@ -42,23 +42,25 @@ class SparkHiveDataset(AbstractDataset[DataFrame, DataFrame]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from pyspark.sql import SparkSession - >>> from pyspark.sql.types import (StructField, StringType, - ... IntegerType, StructType) + >>> from pyspark.sql.types import StructField, StringType, IntegerType, StructType >>> >>> from kedro_datasets.spark import SparkHiveDataset >>> - >>> schema = StructType([StructField("name", StringType(), True), - ... StructField("age", IntegerType(), True)]) + >>> schema = StructType( + ... [StructField("name", StringType(), True), StructField("age", IntegerType(), True)] + ... ) >>> - >>> data = [('Alex', 31), ('Bob', 12), ('Clarke', 65), ('Dave', 29)] + >>> data = [("Alex", 31), ("Bob", 12), ("Clarke", 65), ("Dave", 29)] >>> >>> spark_df = SparkSession.builder.getOrCreate().createDataFrame(data, schema) >>> - >>> dataset = SparkHiveDataset(database="test_database", table="test_table", - ... write_mode="overwrite") + >>> dataset = SparkHiveDataset( + ... database="test_database", table="test_table", write_mode="overwrite" + ... ) >>> dataset.save(spark_df) >>> reloaded = dataset.load() >>> diff --git a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py index 18af44546..029cf15b5 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py @@ -37,24 +37,27 @@ class SparkJDBCDataset(AbstractDataset[DataFrame, DataFrame]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> import pandas as pd >>> from kedro_datasets import SparkJBDCDataset >>> from pyspark.sql import SparkSession >>> >>> spark = SparkSession.builder.getOrCreate() - >>> data = spark.createDataFrame(pd.DataFrame({'col1': [1, 2], - ... 'col2': [4, 5], - ... 'col3': [5, 6]})) - >>> url = 'jdbc:postgresql://localhost/test' - >>> table = 'table_a' - >>> connection_properties = {'driver': 'org.postgresql.Driver'} + >>> data = spark.createDataFrame( + ... pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) + ... ) + >>> url = "jdbc:postgresql://localhost/test" + >>> table = "table_a" + >>> connection_properties = {"driver": "org.postgresql.Driver"} >>> dataset = SparkJDBCDataset( - ... url=url, table=table, credentials={'user': 'scott', - ... 'password': 'tiger'}, - ... load_args={'properties': connection_properties}, - ... save_args={'properties': connection_properties}) + ... url=url, + ... table=table, + ... credentials={"user": "scott", "password": "tiger"}, + ... load_args={"properties": connection_properties}, + ... save_args={"properties": connection_properties}, + ... ) >>> >>> dataset.save(data) >>> reloaded = dataset.load() diff --git a/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py b/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py index 05edae8a6..a24f40947 100644 --- a/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py +++ b/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py @@ -68,7 +68,8 @@ class SVMLightDataset(AbstractVersionedDataset[_DI, _DO]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.svmlight import SVMLightDataset >>> import numpy as np diff --git a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py index a95b1bfa2..e2ca6f12e 100644 --- a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py +++ b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py @@ -41,7 +41,8 @@ class TensorFlowModelDataset(AbstractVersionedDataset[tf.keras.Model, tf.keras.M Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.tensorflow import TensorFlowModelDataset >>> import tensorflow as tf diff --git a/kedro-datasets/kedro_datasets/text/text_dataset.py b/kedro-datasets/kedro_datasets/text/text_dataset.py index a6d9be17e..3d31dd3dd 100644 --- a/kedro-datasets/kedro_datasets/text/text_dataset.py +++ b/kedro-datasets/kedro_datasets/text/text_dataset.py @@ -30,7 +30,8 @@ class TextDataset(AbstractVersionedDataset[str, str]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.text import TextDataset >>> diff --git a/kedro-datasets/kedro_datasets/tracking/json_dataset.py b/kedro-datasets/kedro_datasets/tracking/json_dataset.py index 943e686fd..2dbe0c9ca 100644 --- a/kedro-datasets/kedro_datasets/tracking/json_dataset.py +++ b/kedro-datasets/kedro_datasets/tracking/json_dataset.py @@ -30,11 +30,12 @@ class JSONDataset(json_dataset.JSONDataset): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.tracking import JSONDataset >>> - >>> data = {'col1': 1, 'col2': 0.23, 'col3': 0.002} + >>> data = {"col1": 1, "col2": 0.23, "col3": 0.002} >>> >>> dataset = JSONDataset(filepath="test.json") >>> dataset.save(data) diff --git a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py b/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py index cfd30d1a4..d4336cf69 100644 --- a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py +++ b/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py @@ -32,11 +32,12 @@ class MetricsDataset(json_dataset.JSONDataset): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.tracking import MetricsDataset >>> - >>> data = {'col1': 1, 'col2': 0.23, 'col3': 0.002} + >>> data = {"col1": 1, "col2": 0.23, "col3": 0.002} >>> >>> dataset = MetricsDataset(filepath="test.json") >>> dataset.save(data) diff --git a/kedro-datasets/kedro_datasets/video/video_dataset.py b/kedro-datasets/kedro_datasets/video/video_dataset.py index de97d7b8e..1e601fb8a 100644 --- a/kedro-datasets/kedro_datasets/video/video_dataset.py +++ b/kedro-datasets/kedro_datasets/video/video_dataset.py @@ -220,24 +220,26 @@ class VideoDataset(AbstractDataset[AbstractVideo, AbstractVideo]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.video import VideoDataset >>> import numpy as np >>> - >>> video = VideoDataset(filepath='/video/file/path.mp4').load() + >>> video = VideoDataset(filepath="/video/file/path.mp4").load() >>> frame = video[0] >>> np.sum(np.asarray(frame)) Example creating a video from numpy frames using Python API: - :: + + .. code-block:: pycon >>> from kedro_datasets.video.video_dataset import VideoDataset, SequenceVideo >>> import numpy as np >>> from PIL import Image >>> - >>> frame = np.ones((640,480,3), dtype=np.uint8) * 255 + >>> frame = np.ones((640, 480, 3), dtype=np.uint8) * 255 >>> imgs = [] >>> for i in range(255): ... imgs.append(Image.fromarray(frame)) @@ -248,14 +250,15 @@ class VideoDataset(AbstractDataset[AbstractVideo, AbstractVideo]): Example creating a video from numpy frames using a generator and the Python API: - :: + + .. code-block:: pycon >>> from kedro_datasets.video.video_dataset import VideoDataset, GeneratorVideo >>> import numpy as np >>> from PIL import Image >>> >>> def gen(): - ... frame = np.ones((640,480,3), dtype=np.uint8) * 255 + ... frame = np.ones((640, 480, 3), dtype=np.uint8) * 255 ... for i in range(255): ... yield Image.fromarray(frame) ... frame -= 1 diff --git a/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py b/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py index d9aa536fb..77d3dcf96 100644 --- a/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py +++ b/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py @@ -31,11 +31,12 @@ class YAMLDataset(AbstractVersionedDataset[Dict, Dict]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.yaml import YAMLDataset >>> - >>> data = {'col1': [1, 2], 'col2': [4, 5], 'col3': [5, 6]} + >>> data = {"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]} >>> >>> dataset = YAMLDataset(filepath="test.yaml") >>> dataset.save(data) From 527706d3c512e3f75ae2183afa17757213379e80 Mon Sep 17 00:00:00 2001 From: Felix Wittmann Date: Tue, 10 Oct 2023 12:51:22 +0200 Subject: [PATCH 2/2] docs: cloudpickle is an interesting extension of the pickle functionality (#361) Signed-off-by: H. Felix Wittmann --- kedro-datasets/RELEASE.md | 7 +++++-- kedro-datasets/kedro_datasets/pickle/pickle_dataset.py | 4 ++++ kedro-datasets/kedro_datasets/redis/redis_dataset.py | 5 +++++ kedro-datasets/setup.py | 1 + kedro-datasets/tests/pickle/test_pickle_dataset.py | 1 + kedro-datasets/tests/redis/test_redis_dataset.py | 1 + 6 files changed, 17 insertions(+), 2 deletions(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 0b7ac02cc..63f6ae91a 100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -1,8 +1,13 @@ # Upcoming Release ## Major features and improvements ## Bug fixes and other changes +* Updated `PickleDataset` to explicitly mention `cloudpickle` support. ## Upcoming deprecations for Kedro-Datasets 2.0.0 +## Community contributions +Many thanks to the following Kedroids for contributing PRs to this release: +* [Felix Wittmann](https://github.com/hfwittmann) + # Release 1.7.1 ## Bug fixes and other changes * Pin `tables` version on `kedro-datasets` for Python < 3.8. @@ -10,8 +15,6 @@ ## Upcoming deprecations for Kedro-Datasets 2.0.0 * Renamed dataset and error classes, in accordance with the [Kedro lexicon](https://github.com/kedro-org/kedro/wiki/Kedro-documentation-style-guide#kedro-lexicon). Dataset classes ending with "DataSet" are deprecated and will be removed in 2.0.0. -## Community contributions - # Release 1.7.0: ## Major features and improvements * Added `polars.GenericDataSet`, a `GenericDataSet` backed by [polars](https://www.pola.rs/), a lightning fast dataframe package built entirely using Rust. diff --git a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py index 21d3b8c71..21f97b713 100644 --- a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py +++ b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py @@ -110,6 +110,8 @@ def __init__( # noqa: PLR0913 dill.load: https://dill.readthedocs.io/en/latest/index.html#dill.load compress_pickle.load: https://lucianopaz.github.io/compress_pickle/html/api/compress_pickle.html#compress_pickle.compress_pickle.load + cloudpickle.load: + https://github.com/cloudpipe/cloudpickle/blob/0f330b6afe55313fc1efc090a7d350f5ad5c9317/tests/cloudpickle_test.py All defaults are preserved. save_args: Pickle options for saving pickle files. You can pass in arguments that the backend dump function specified accepts, e.g: @@ -118,6 +120,8 @@ def __init__( # noqa: PLR0913 dill.dump: https://dill.readthedocs.io/en/latest/index.html#dill.dump compress_pickle.dump: https://lucianopaz.github.io/compress_pickle/html/api/compress_pickle.html#compress_pickle.compress_pickle.dump + cloudpickle.dump: + https://github.com/cloudpipe/cloudpickle/blob/0f330b6afe55313fc1efc090a7d350f5ad5c9317/tests/cloudpickle_test.py All defaults are preserved. version: If specified, should be an instance of ``kedro.io.core.Version``. If its ``load`` attribute is diff --git a/kedro-datasets/kedro_datasets/redis/redis_dataset.py b/kedro-datasets/kedro_datasets/redis/redis_dataset.py index 9979cf386..1e782059b 100644 --- a/kedro-datasets/kedro_datasets/redis/redis_dataset.py +++ b/kedro-datasets/kedro_datasets/redis/redis_dataset.py @@ -79,6 +79,7 @@ def __init__( # noqa: PLR0913 * `pickle` * `dill` * `compress_pickle` + * `cloudpickle` Example backends that are incompatible: * `torch` @@ -94,6 +95,8 @@ def __init__( # noqa: PLR0913 dill.loads: https://dill.readthedocs.io/en/latest/index.html#dill.loads compress_pickle.loads: https://lucianopaz.github.io/compress_pickle/html/api/compress_pickle.html#compress_pickle.compress_pickle.loads + cloudpickle.loads: + https://github.com/cloudpipe/cloudpickle/blob/0f330b6afe55313fc1efc090a7d350f5ad5c9317/tests/cloudpickle_test.py All defaults are preserved. save_args: Pickle options for saving pickle files. You can pass in arguments that the backend dump function specified accepts, e.g: @@ -101,6 +104,8 @@ def __init__( # noqa: PLR0913 dill.dumps: https://dill.readthedocs.io/en/latest/index.html#dill.dumps compress_pickle.dumps: https://lucianopaz.github.io/compress_pickle/html/api/compress_pickle.html#compress_pickle.compress_pickle.dumps + cloudpickle.dumps: + https://github.com/cloudpipe/cloudpickle/blob/0f330b6afe55313fc1efc090a7d350f5ad5c9317/tests/cloudpickle_test.py All defaults are preserved. credentials: Credentials required to get access to the redis server. E.g. `{"password": None}`. diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py index 340ad5e67..a22e83f81 100644 --- a/kedro-datasets/setup.py +++ b/kedro-datasets/setup.py @@ -153,6 +153,7 @@ def _collect_requirements(requires): "biopython~=1.73", "blacken-docs==1.9.2", "black~=22.0", + "cloudpickle<=2.0.0", "compress-pickle[lz4]~=2.1.0", "coverage[toml]", "dask[complete]~=2021.10", # pinned by Snyk to avoid a vulnerability diff --git a/kedro-datasets/tests/pickle/test_pickle_dataset.py b/kedro-datasets/tests/pickle/test_pickle_dataset.py index be09d6291..e53a8b675 100644 --- a/kedro-datasets/tests/pickle/test_pickle_dataset.py +++ b/kedro-datasets/tests/pickle/test_pickle_dataset.py @@ -68,6 +68,7 @@ class TestPickleDataset: ("pickle", None, None), ("joblib", None, None), ("dill", None, None), + ("cloudpickle", None, None), ("compress_pickle", {"compression": "lz4"}, {"compression": "lz4"}), ], indirect=True, diff --git a/kedro-datasets/tests/redis/test_redis_dataset.py b/kedro-datasets/tests/redis/test_redis_dataset.py index a2ec3bf83..f569d7d22 100644 --- a/kedro-datasets/tests/redis/test_redis_dataset.py +++ b/kedro-datasets/tests/redis/test_redis_dataset.py @@ -76,6 +76,7 @@ class TestPickleDataset: [ ("a", "pickle", None, None), (1, "dill", None, None), + (2, "cloudpickle", None, None), ("key", "compress_pickle", {"compression": "lz4"}, {"compression": "lz4"}), ], indirect=True,