From e74762606de3f5e7b7a12649e4bd7683050f149e Mon Sep 17 00:00:00 2001 From: Marijn Valk Date: Sat, 11 Nov 2023 15:00:27 +0100 Subject: [PATCH] add doctest config --- python/docs/source/conf.py | 4 +- python/docs/source/usage.rst | 175 ++++++++++++++++++++--------------- python/pyproject.toml | 3 +- 3 files changed, 107 insertions(+), 75 deletions(-) diff --git a/python/docs/source/conf.py b/python/docs/source/conf.py index e0c8e424b6..939dbddb4f 100644 --- a/python/docs/source/conf.py +++ b/python/docs/source/conf.py @@ -12,6 +12,7 @@ # import os import sys +import pathlib import toml @@ -24,7 +25,8 @@ def get_release_version() -> str: :return: """ - cargo_content = toml.load("../../Cargo.toml") + cargo_path = pathlib.Path(__file__).parent.parent.parent / "Cargo.toml" + cargo_content = toml.load(str(cargo_path)) return cargo_content["package"]["version"] diff --git a/python/docs/source/usage.rst b/python/docs/source/usage.rst index ed0556a176..ff595af512 100644 --- a/python/docs/source/usage.rst +++ b/python/docs/source/usage.rst @@ -10,13 +10,11 @@ of the table, and other metadata such as creation time. .. code-block:: python >>> from deltalake import DeltaTable - >>> dt = DeltaTable("../rust/tests/data/delta-0.2.0") + >>> dt = DeltaTable("../crates/deltalake-core/tests/data/delta-0.2.0") >>> dt.version() 3 >>> dt.files() - ['part-00000-cb6b150b-30b8-4662-ad28-ff32ddab96d2-c000.snappy.parquet', - 'part-00000-7c2deba3-1994-4fb8-bc07-d46c948aa415-c000.snappy.parquet', - 'part-00001-c373a5bd-85f0-4758-815e-7eb62007a15c-c000.snappy.parquet'] + ['part-00000-cb6b150b-30b8-4662-ad28-ff32ddab96d2-c000.snappy.parquet', 'part-00000-7c2deba3-1994-4fb8-bc07-d46c948aa415-c000.snappy.parquet', 'part-00001-c373a5bd-85f0-4758-815e-7eb62007a15c-c000.snappy.parquet'] Loading a Delta Table @@ -26,7 +24,7 @@ To load the current version, use the constructor: .. 
code-block:: python - >>> dt = DeltaTable("../rust/tests/data/delta-0.2.0") + >>> dt = DeltaTable("../crates/deltalake-core/tests/data/delta-0.2.0") Depending on your storage backend, you could use the ``storage_options`` parameter to provide some configuration. Configuration is defined for specific backends - `s3 options`_, `azure options`_, `gcs options`_. @@ -34,7 +32,7 @@ Configuration is defined for specific backends - `s3 options`_, `azure options`_ .. code-block:: python >>> storage_options = {"AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY":"THE_AWS_SECRET_ACCESS_KEY"} - >>> dt = DeltaTable("../rust/tests/data/delta-0.2.0", storage_options=storage_options) + >>> dt = DeltaTable("../crates/deltalake-core/tests/data/delta-0.2.0", storage_options=storage_options) The configuration can also be provided via the environment, and the basic service provider is derived from the URL being used. We try to support many of the well-known formats to identify basic service properties. @@ -66,8 +64,8 @@ For AWS Glue catalog, use AWS environment variables to authenticate. 
>>> database_name = "simple_database" >>> table_name = "simple_table" >>> data_catalog = DataCatalog.AWS - >>> dt = DeltaTable.from_data_catalog(data_catalog=data_catalog, database_name=database_name, table_name=table_name) - >>> dt.to_pyarrow_table().to_pydict() + >>> dt = DeltaTable.from_data_catalog(data_catalog=data_catalog, database_name=database_name, table_name=table_name) # doctest: +SKIP + >>> dt.to_pyarrow_table().to_pydict() # doctest: +SKIP {'id': [5, 7, 9, 5, 6, 7, 8, 9]} For Databricks Unity Catalog authentication, use environment variables: @@ -84,7 +82,7 @@ For Databricks Unity Catalog authentication, use environment variables: >>> schema_name = 'db_schema' >>> table_name = 'db_table' >>> data_catalog = DataCatalog.UNITY - >>> dt = DeltaTable.from_data_catalog(data_catalog=data_catalog, data_catalog_id=catalog_name, database_name=schema_name, table_name=table_name) + >>> dt = DeltaTable.from_data_catalog(data_catalog=data_catalog, data_catalog_id=catalog_name, database_name=schema_name, table_name=table_name) # doctest: +SKIP .. _`s3 options`: https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html#variants .. _`azure options`: https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html#variants @@ -135,7 +133,7 @@ load: .. code-block:: python - >>> dt = DeltaTable("../rust/tests/data/simple_table", version=2) + >>> dt = DeltaTable("../crates/deltalake-core/tests/data/simple_table", version=2) Once you've loaded a table, you can also change versions using either a version number or datetime string: @@ -162,7 +160,7 @@ The delta log maintains basic metadata about a table, including: * A ``name``, if provided * A ``description``, if provided * The list of ``partition_columns``. -* The ``created_time`` of the table +* The ``created_time`` of the table. * A map of table ``configuration``. 
This includes fields such as ``delta.appendOnly``, which if ``true`` indicates the table is not meant to have data deleted from it. @@ -171,9 +169,9 @@ Get metadata from a table with the :meth:`DeltaTable.metadata` method: .. code-block:: python >>> from deltalake import DeltaTable - >>> dt = DeltaTable("../rust/tests/data/simple_table") - >>> dt.metadata() - Metadata(id: 5fba94ed-9794-4965-ba6e-6ee3c0d22af9, name: None, description: None, partitionColumns: [], created_time: 1587968585495, configuration={}) + >>> dt = DeltaTable("../crates/deltalake-core/tests/data/simple_table") + >>> print(dt.metadata()) + Metadata(id: 5fba94ed-9794-4965-ba6e-6ee3c0d22af9, name: None, description: None, partition_columns: [], created_time: 1587968585495, configuration: {}) Schema ~~~~~~ @@ -188,7 +186,7 @@ Use :meth:`DeltaTable.schema` to retrieve the delta lake schema: .. code-block:: python >>> from deltalake import DeltaTable - >>> dt = DeltaTable("../rust/tests/data/simple_table") + >>> dt = DeltaTable("../crates/deltalake-core/tests/data/simple_table") >>> dt.schema() Schema([Field(id, PrimitiveType("long"), nullable=True)]) @@ -198,7 +196,7 @@ from json, use `schema.Schema.from_json()`. .. code-block:: python >>> dt.schema().json() - '{"type":"struct","fields":[{"name":"id","type":"long","nullable":true,"metadata":{}}]}' + {'type': 'struct', 'fields': [{'name': 'id', 'type': 'long', 'nullable': True, 'metadata': {}}]} Use `deltalake.schema.Schema.to_pyarrow()` to retrieve the PyArrow schema: @@ -225,16 +223,42 @@ specified by the table configuration ``delta.logRetentionDuration``. To view the available history, use :meth:`DeltaTable.history`: .. 
code-block:: python - + + >>> from pprint import pprint >>> from deltalake import DeltaTable - >>> dt = DeltaTable("../rust/tests/data/simple_table") - >>> dt.history() - [{'timestamp': 1587968626537, 'operation': 'DELETE', 'operationParameters': {'predicate': '["((`id` % CAST(2 AS BIGINT)) = CAST(0 AS BIGINT))"]'}, 'readVersion': 3, 'isBlindAppend': False}, - {'timestamp': 1587968614187, 'operation': 'UPDATE', 'operationParameters': {'predicate': '((id#697L % cast(2 as bigint)) = cast(0 as bigint))'}, 'readVersion': 2, 'isBlindAppend': False}, - {'timestamp': 1587968604143, 'operation': 'WRITE', 'operationParameters': {'mode': 'Overwrite', 'partitionBy': '[]'}, 'readVersion': 1, 'isBlindAppend': False}, - {'timestamp': 1587968596254, 'operation': 'MERGE', 'operationParameters': {'predicate': '(oldData.`id` = newData.`id`)'}, 'readVersion': 0, 'isBlindAppend': False}, - {'timestamp': 1587968586154, 'operation': 'WRITE', 'operationParameters': {'mode': 'ErrorIfExists', 'partitionBy': '[]'}, 'isBlindAppend': True}] - + >>> dt = DeltaTable("../crates/deltalake-core/tests/data/simple_table") + >>> pprint(dt.history()) + [{'isBlindAppend': False, + 'operation': 'DELETE', + 'operationParameters': {'predicate': '["((`id` % CAST(2 AS BIGINT)) = CAST(0 ' + 'AS BIGINT))"]'}, + 'readVersion': 3, + 'timestamp': 1587968626537, + 'version': 4}, + {'isBlindAppend': False, + 'operation': 'UPDATE', + 'operationParameters': {'predicate': '((id#697L % cast(2 as bigint)) = ' + 'cast(0 as bigint))'}, + 'readVersion': 2, + 'timestamp': 1587968614187, + 'version': 3}, + {'isBlindAppend': False, + 'operation': 'WRITE', + 'operationParameters': {'mode': 'Overwrite', 'partitionBy': '[]'}, + 'readVersion': 1, + 'timestamp': 1587968604143, + 'version': 2}, + {'isBlindAppend': False, + 'operation': 'MERGE', + 'operationParameters': {'predicate': '(oldData.`id` = newData.`id`)'}, + 'readVersion': 0, + 'timestamp': 1587968596254, + 'version': 1}, + {'isBlindAppend': True, + 'operation': 'WRITE', 
+ 'operationParameters': {'mode': 'ErrorIfExists', 'partitionBy': '[]'}, + 'timestamp': 1587968586154, + 'version': 0}] Current Add Actions ~~~~~~~~~~~~~~~~~~~ @@ -247,9 +271,9 @@ the add actions data using :meth:`DeltaTable.get_add_actions`: .. code-block:: python >>> from deltalake import DeltaTable - >>> dt = DeltaTable("../rust/tests/data/delta-0.8.0") + >>> dt = DeltaTable("../crates/deltalake-core/tests/data/delta-0.8.0") >>> dt.get_add_actions(flatten=True).to_pandas() - path size_bytes modification_time data_change num_records null_count.value min.value max.value + path size_bytes modification_time data_change num_records null_count.value min.value max.value 0 part-00000-c9b90f86-73e6-46c8-93ba-ff6bfaf892a... 440 2021-03-06 15:16:07 True 2 0 0 2 1 part-00000-04ec9591-0b73-459e-8d18-ba5711d6cbe... 440 2021-03-06 15:16:16 True 2 0 2 4 @@ -257,7 +281,7 @@ This works even with past versions of the table: .. code-block:: python - >>> dt = DeltaTable("../rust/tests/data/delta-0.8.0", version=0) + >>> dt = DeltaTable("../crates/deltalake-core/tests/data/delta-0.8.0", version=0) >>> dt.get_add_actions(flatten=True).to_pandas() path size_bytes modification_time data_change num_records null_count.value min.value max.value 0 part-00000-c9b90f86-73e6-46c8-93ba-ff6bfaf892a... 440 2021-03-06 15:16:07 True 2 0 0 2 @@ -285,14 +309,14 @@ support filtering partitions and selecting particular columns. .. code-block:: python >>> from deltalake import DeltaTable - >>> dt = DeltaTable("../rust/tests/data/delta-0.8.0-partitioned") + >>> dt = DeltaTable("../crates/deltalake-core/tests/data/delta-0.8.0-partitioned") >>> dt.schema().to_pyarrow() value: string year: string month: string day: string >>> dt.to_pandas(partitions=[("year", "=", "2021")], columns=["value"]) - value + value 0 6 1 7 2 5 @@ -300,6 +324,8 @@ support filtering partitions and selecting particular columns. 
>>> dt.to_pyarrow_table(partitions=[("year", "=", "2021")], columns=["value"]) pyarrow.Table value: string + ---- + value: [["6","7"],["5"],["4"]] Converting to a PyArrow Dataset allows you to filter on columns other than partition columns and load the result as a stream of batches rather than a single @@ -324,6 +350,9 @@ Delta transaction log and push down any other filters to the scanning operation. 1 7 value 0 5 + Empty DataFrame + Columns: [value] + Index: [] PyArrow datasets may also be passed to compatible query engines, such as DuckDB_. @@ -331,9 +360,9 @@ PyArrow datasets may also be passed to compatible query engines, such as DuckDB_ .. code-block:: python - >>> import duckdb - >>> ex_data = duckdb.arrow(dataset) - >>> ex_data.filter("year = 2021 and value > 4").project("value") + >>> import duckdb # doctest: +SKIP + >>> ex_data = duckdb.arrow(dataset) # doctest: +SKIP + >>> ex_data.filter("year = 2021 and value > 4").project("value") # doctest: +SKIP --------------------- -- Expression Tree -- --------------------- @@ -361,9 +390,9 @@ you can pass them to ``dask.dataframe.read_parquet``: .. code-block:: python - >>> import dask.dataframe as dd - >>> df = dd.read_parquet(dt.file_uris()) - >>> df + >>> import dask.dataframe as dd # doctest: +SKIP + >>> df = dd.read_parquet(dt.file_uris()) # doctest: +SKIP + >>> df # doctest: +SKIP Dask DataFrame Structure: value year month day npartitions=6 @@ -373,7 +402,7 @@ you can pass them to ``dask.dataframe.read_parquet``: ... ... ... ... ... ... ... ... Dask Name: read-parquet, 6 tasks - >>> df.compute() + >>> df.compute() # doctest: +SKIP value year month day 0 1 2020 1 1 0 2 2020 2 3 @@ -404,13 +433,10 @@ only list the files to be deleted. Pass ``dry_run=False`` to actually delete fil .. 
code-block:: python - >>> dt = DeltaTable("../rust/tests/data/simple_table") - >>> dt.vacuum() - ['../rust/tests/data/simple_table/part-00006-46f2ff20-eb5d-4dda-8498-7bfb2940713b-c000.snappy.parquet', - '../rust/tests/data/simple_table/part-00190-8ac0ae67-fb1d-461d-a3d3-8dc112766ff5-c000.snappy.parquet', - '../rust/tests/data/simple_table/part-00164-bf40481c-4afd-4c02-befa-90f056c2d77a-c000.snappy.parquet', - ...] - >>> dt.vacuum(dry_run=False) # Don't run this unless you are sure! + >>> dt = DeltaTable("../crates/deltalake-core/tests/data/simple_table") + >>> dt.vacuum()[:3] + ['part-00006-46f2ff20-eb5d-4dda-8498-7bfb2940713b-c000.snappy.parquet', 'part-00190-8ac0ae67-fb1d-461d-a3d3-8dc112766ff5-c000.snappy.parquet', 'part-00164-bf40481c-4afd-4c02-befa-90f056c2d77a-c000.snappy.parquet'] + >>> dt.vacuum(dry_run=False) # Don't run this unless you are sure! # doctest: +SKIP Optimizing tables ~~~~~~~~~~~~~~~~~ @@ -428,8 +454,8 @@ For just file compaction, use the :meth:`TableOptimizer.compact` method: .. code-block:: python - >>> dt = DeltaTable("../rust/tests/data/simple_table") - >>> dt.optimize.compact() + >>> dt = DeltaTable("../crates/deltalake-core/tests/data/simple_table") + >>> dt.optimize.compact() # doctest: +SKIP {'numFilesAdded': 1, 'numFilesRemoved': 5, 'filesAdded': {'min': 555, 'max': 555, 'avg': 555.0, 'totalFiles': 1, 'totalSize': 555}, 'filesRemoved': {'min': 262, 'max': 429, 'avg': 362.2, 'totalFiles': 5, 'totalSize': 1811}, @@ -442,8 +468,8 @@ filter on multiple columns at once. .. 
code-block:: python - >>> dt = DeltaTable("../rust/tests/data/COVID-19_NYT") - >>> dt.optimize.z_order(["date", "county"]) + >>> dt = DeltaTable("../crates/deltalake-core/tests/data/COVID-19_NYT") + >>> dt.optimize.z_order(["date", "county"]) # doctest: +SKIP {'numFilesAdded': 1, 'numFilesRemoved': 8, 'filesAdded': {'min': 2473439, 'max': 2473439, 'avg': 2473439.0, 'totalFiles': 1, 'totalSize': 2473439}, 'filesRemoved': {'min': 325440, 'max': 895702, 'avg': 773810.625, 'totalFiles': 8, 'totalSize': 6190485}, @@ -461,9 +487,10 @@ DataFrame, a PyArrow Table, or an iterator of PyArrow Record Batches. .. code-block:: python + >>> import pandas as pd >>> from deltalake import write_deltalake >>> df = pd.DataFrame({'x': [1, 2, 3]}) - >>> write_deltalake('path/to/table', df) + >>> write_deltalake('path/to/table', df) # doctest: +SKIP .. note:: :py:func:`write_deltalake` accepts a Pandas DataFrame, but will convert it to @@ -476,8 +503,8 @@ to append pass in ``mode='append'``: .. code-block:: python - >>> write_deltalake('path/to/table', df, mode='overwrite') - >>> write_deltalake('path/to/table', df, mode='append') + >>> write_deltalake('path/to/table', df, mode='overwrite') # doctest: +SKIP + >>> write_deltalake('path/to/table', df, mode='append') # doctest: +SKIP :py:meth:`write_deltalake` will raise :py:exc:`ValueError` if the schema of the data passed to it differs from the existing table's schema. If you wish to @@ -524,7 +551,7 @@ Here is an example writing to s3 using this mechanism: >>> from deltalake import write_deltalake >>> df = pd.DataFrame({'x': [1, 2, 3]}) >>> storage_options = {'AWS_S3_LOCKING_PROVIDER': 'dynamodb', 'DYNAMO_LOCK_TABLE_NAME': 'custom_table_name'} - >>> write_deltalake('s3://path/to/table', df, 'storage_options'= storage_options) + >>> write_deltalake('s3://path/to/table', df, storage_options=storage_options) # doctest: +SKIP .. 
note:: if for some reason you don't want to use dynamodb as your locking mechanism you can @@ -547,16 +574,18 @@ Update all the rows for the column "processed" to the value True. .. code-block:: python + >>> import pandas as pd >>> from deltalake import write_deltalake, DeltaTable >>> df = pd.DataFrame({'x': [1, 2, 3], 'deleted': [False, False, False]}) - >>> write_deltalake('path/to/table', df) - >>> dt = DeltaTable('path/to/table') - >>> dt.update({"processed": "True"}) - >>> dt.to_pandas() - >>> x processed + >>> write_deltalake('path/to/table', df) # doctest: +SKIP + >>> dt = DeltaTable('path/to/table') # doctest: +SKIP + >>> dt.update({"processed": "True"}) # doctest: +SKIP + >>> dt.to_pandas() # doctest: +SKIP + x processed 0 1 True 1 2 True 2 3 True + .. note:: :meth:`DeltaTable.update` predicates and updates are all in string format. The predicates and expressions, are parsed into Apache Datafusion expressions. @@ -568,14 +597,14 @@ True where x = 3 >>> from deltalake import write_deltalake, DeltaTable >>> df = pd.DataFrame({'x': [1, 2, 3], 'deleted': [False, False, False]}) - >>> write_deltalake('path/to/table', df) - >>> dt = DeltaTable('path/to/table') - >>> dt.update( + >>> write_deltalake('path/to/table', df) # doctest: +SKIP + >>> dt = DeltaTable('path/to/table') # doctest: +SKIP + >>> dt.update( ... updates={"deleted": "True"}, ... predicate= 'x = 3', - ... ) - >>> dt.to_pandas() - >>> x deleted + ... ) # doctest: +SKIP + >>> dt.to_pandas() # doctest: +SKIP + x deleted 0 1 False 1 2 False 2 3 True @@ -594,13 +623,13 @@ the method will raise an error. 
>>> from deltalake import write_deltalake >>> df = pd.DataFrame({'x': [1, 2, 3], 'y': ['a', 'a', 'b']}) - >>> write_deltalake('path/to/table', df, partition_by=['y']) + >>> write_deltalake('path/to/table', df, partition_by=['y']) # doctest: +SKIP - >>> table = DeltaTable('path/to/table') - >>> df2 = pd.DataFrame({'x': [100], 'y': ['b']}) - >>> write_deltalake(table, df2, partition_filters=[('y', '=', 'b')], mode="overwrite") + >>> table = DeltaTable('path/to/table') # doctest: +SKIP + >>> df2 = pd.DataFrame({'x': [100], 'y': ['b']}) + >>> write_deltalake(table, df2, partition_filters=[('y', '=', 'b')], mode="overwrite") # doctest: +SKIP - >>> table.to_pandas() + >>> table.to_pandas() # doctest: +SKIP x y 0 1 a 1 2 a @@ -625,13 +654,13 @@ the clause will remove all files from the table. >>> from deltalake import DeltaTable, write_deltalake >>> df = pd.DataFrame({'a': [1, 2, 3], 'to_delete': [False, False, True]}) - >>> write_deltalake('path/to/table', df) + >>> write_deltalake('path/to/table', df) # doctest: +SKIP - >>> table = DeltaTable('path/to/table') - >>> table.delete(predicate="to_delete = true") + >>> table = DeltaTable('path/to/table') # doctest: +SKIP + >>> table.delete(predicate="to_delete = true") # doctest: +SKIP {'num_added_files': 1, 'num_removed_files': 1, 'num_deleted_rows': 1, 'num_copied_rows': 2, 'execution_time_ms': 11081, 'scan_time_ms': 3721, 'rewrite_time_ms': 7} - >>> table.to_pandas() + >>> table.to_pandas() # doctest: +SKIP a to_delete 0 1 False 1 2 False @@ -659,6 +688,6 @@ concurrent operation was performed on the table, restore will fail. .. 
code-block:: python - >>> dt = DeltaTable("../rust/tests/data/simple_table") - >>> dt.restore(1) + >>> dt = DeltaTable("../crates/deltalake-core/tests/data/simple_table") + >>> dt.restore(1) # doctest: +SKIP {'numRemovedFile': 5, 'numRestoredFile': 22} diff --git a/python/pyproject.toml b/python/pyproject.toml index 438a49cc56..c97203d25a 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -88,9 +88,10 @@ ignore = ["E501"] known-first-party = ["deltalake"] [tool.pytest.ini_options] -addopts = "--cov=deltalake -v -m 'not integration and not benchmark'" +addopts = "--cov=deltalake -v -m 'not integration and not benchmark' --doctest-modules --doctest-glob='*.rst'" testpaths = [ "tests", + "docs", ] markers = [ "integration: marks tests as integration tests (deselect with '-m \"not integration\"')",