From 5b7dbe6827ca3b8114766378f1da0e601cbc08a9 Mon Sep 17 00:00:00 2001 From: Avril Aysha <68642378+avriiil@users.noreply.github.com> Date: Tue, 24 Sep 2024 12:18:18 +0100 Subject: [PATCH 1/9] fix typo --- docs/integrations/object-storage/s3.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/integrations/object-storage/s3.md b/docs/integrations/object-storage/s3.md index 5b2034827f..1989494978 100644 --- a/docs/integrations/object-storage/s3.md +++ b/docs/integrations/object-storage/s3.md @@ -2,7 +2,7 @@ `delta-rs` offers native support for using AWS S3 as an objet storage backend. -You don’t need to install any extra dependencies to red/write Delta tables to S3 with engines that use `delta-rs`. You do need to configure your AWS access credentials correctly. +You don’t need to install any extra dependencies to read/write Delta tables to S3 with engines that use `delta-rs`. You do need to configure your AWS access credentials correctly. ## Note for boto3 users From 7783f66558c518d5e7b3434fa5de2d607591773c Mon Sep 17 00:00:00 2001 From: Avril Aysha <68642378+avriiil@users.noreply.github.com> Date: Tue, 24 Sep 2024 12:21:07 +0100 Subject: [PATCH 2/9] typo fix --- docs/integrations/object-storage/s3.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/integrations/object-storage/s3.md b/docs/integrations/object-storage/s3.md index 1989494978..a7965cb2a5 100644 --- a/docs/integrations/object-storage/s3.md +++ b/docs/integrations/object-storage/s3.md @@ -1,6 +1,6 @@ # AWS S3 Storage Backend -`delta-rs` offers native support for using AWS S3 as an objet storage backend. +`delta-rs` offers native support for using AWS S3 as an object storage backend. You don’t need to install any extra dependencies to read/write Delta tables to S3 with engines that use `delta-rs`. You do need to configure your AWS access credentials correctly. From 7f7e3cddac750197ab0d7ded5d9de926046011f5 Mon Sep 17 00:00:00 2001 From: Avril Aysha <68642378+avriiil@users.noreply.github.com> Date: Tue, 24 Sep 2024 12:20:47 +0100 Subject: [PATCH 3/9] create gcs docs --- docs/integrations/object-storage/gcs.md | 87 +++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 docs/integrations/object-storage/gcs.md diff --git a/docs/integrations/object-storage/gcs.md b/docs/integrations/object-storage/gcs.md new file mode 100644 index 0000000000..c5592ccc5c --- /dev/null +++ b/docs/integrations/object-storage/gcs.md @@ -0,0 +1,87 @@ +# GCS Storage Backend + +`delta-rs` offers native support for using Google Cloud Storage (GCS) as an object storage backend. + +You don’t need to install any extra dependencies to red/write Delta tables to S3 with engines that use `delta-rs`. You do need to configure your AWS access credentials correctly. + +## Note for boto3 users + +Many Python engines use [boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) to connect to AWS. This library supports reading credentials automatically from your local `.aws/config` or `.aws/creds` file. + +For example, if you’re running locally with the proper credentials in your local `.aws/config` or `.aws/creds` file then you can write a Parquet file to S3 like this with pandas: + +```python + import pandas as pd + df = pd.DataFrame({'x': [1, 2, 3]}) + df.to_parquet("s3://avriiil/parquet-test-pandas") +``` + +The `delta-rs` writer does not use `boto3` and therefore does not support taking credentials from your `.aws/config` or `.aws/creds` file. 
If you’re used to working with writers from Python engines like Polars, pandas or Dask, this may mean a small change to your workflow. + +## Passing AWS Credentials + +You can pass your AWS credentials explicitly by using: + +- the `storage_options `kwarg +- Environment variables +- EC2 metadata if using EC2 instances +- AWS Profiles + +## Example + +Let's work through an example with Polars. The same logic applies to other Python engines like Pandas, Daft, Dask, etc. + +Follow the steps below to use Delta Lake on S3 with Polars: + +1. Install Polars and deltalake. For example, using: + + `pip install polars deltalake` + +2. Create a dataframe with some toy data. + + `df = pl.DataFrame({'x': [1, 2, 3]})` + +3. Set your `storage_options` correctly. + +```python +storage_options = { + "AWS_REGION":, + 'AWS_ACCESS_KEY_ID': , + 'AWS_SECRET_ACCESS_KEY': , + 'AWS_S3_LOCKING_PROVIDER': 'dynamodb', + 'DELTA_DYNAMO_TABLE_NAME': 'delta_log', +} +``` + +4. Write data to Delta table using the `storage_options` kwarg. + + ```python + df.write_delta( + "s3://bucket/delta_table", + storage_options=storage_options, + ) + ``` + +## Delta Lake on AWS S3: Safe Concurrent Writes + +You need a locking provider to ensure safe concurrent writes when writing Delta tables to AWS S3. This is because AWS S3 does not guarantee mutual exclusion. + +A locking provider guarantees that only one writer is able to create the same file. This prevents corrupted or conflicting data. + +`delta-rs` uses DynamoDB to guarantee safe concurrent writes. + +Run the code below in your terminal to create a DynamoDB table that will act as your locking provider. + +``` + aws dynamodb create-table \ + --table-name delta_log \ + --attribute-definitions AttributeName=tablePath,AttributeType=S AttributeName=fileName,AttributeType=S \ + --key-schema AttributeName=tablePath,KeyType=HASH AttributeName=fileName,KeyType=RANGE \ + --provisioned-throughput ReadCapacityUnits=5,WriteCapacityUnits=5 +``` + +If for some reason you don't want to use DynamoDB as your locking mechanism you can choose to set the `AWS_S3_ALLOW_UNSAFE_RENAME` variable to `true` in order to enable S3 unsafe writes. + +Read more in the [Usage](../../usage/writing/writing-to-s3-with-locking-provider.md) section. + +## Delta Lake on GCS: Required permissions From 96dc0a6682e88852f6d14cbad1a085f37787cd33 Mon Sep 17 00:00:00 2001 From: Avril Aysha <68642378+avriiil@users.noreply.github.com> Date: Tue, 24 Sep 2024 15:33:53 +0100 Subject: [PATCH 4/9] update docs --- docs/integrations/object-storage/gcs.md | 91 +++++++------------------ 1 file changed, 24 insertions(+), 67 deletions(-) diff --git a/docs/integrations/object-storage/gcs.md b/docs/integrations/object-storage/gcs.md index c5592ccc5c..aa8682d3cc 100644 --- a/docs/integrations/object-storage/gcs.md +++ b/docs/integrations/object-storage/gcs.md @@ -2,86 +2,43 @@ `delta-rs` offers native support for using Google Cloud Storage (GCS) as an object storage backend. -You don’t need to install any extra dependencies to red/write Delta tables to S3 with engines that use `delta-rs`. You do need to configure your AWS access credentials correctly. +You don’t need to install any extra dependencies to read/write Delta tables to GCS with engines that use `delta-rs`. You do need to configure your GCS access credentials correctly. -## Note for boto3 users +## Using Application Default Credentials -Many Python engines use [boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) to connect to AWS. 
This library supports reading credentials automatically from your local `.aws/config` or `.aws/creds` file. +Application Default Credentials (ADC) is a strategy used by GCS to automatically find credentials based on the application environment. -For example, if you’re running locally with the proper credentials in your local `.aws/config` or `.aws/creds` file then you can write a Parquet file to S3 like this with pandas: +If you are working from your local machine and have ADC set up then you can read/write Delta tables from GCS directly, without having to pass your credentials explicitly. -```python - import pandas as pd - df = pd.DataFrame({'x': [1, 2, 3]}) - df.to_parquet("s3://avriiil/parquet-test-pandas") -``` - -The `delta-rs` writer does not use `boto3` and therefore does not support taking credentials from your `.aws/config` or `.aws/creds` file. If you’re used to working with writers from Python engines like Polars, pandas or Dask, this may mean a small change to your workflow. - -## Passing AWS Credentials - -You can pass your AWS credentials explicitly by using: - -- the `storage_options `kwarg -- Environment variables -- EC2 metadata if using EC2 instances -- AWS Profiles - -## Example - -Let's work through an example with Polars. The same logic applies to other Python engines like Pandas, Daft, Dask, etc. - -Follow the steps below to use Delta Lake on S3 with Polars: - -1. Install Polars and deltalake. For example, using: - - `pip install polars deltalake` +## Example: Write Delta tables to GCS with Polars -2. Create a dataframe with some toy data. - - `df = pl.DataFrame({'x': [1, 2, 3]})` - -3. Set your `storage_options` correctly. +Using Polars, you can write a Delta table to GCS like this: ```python -storage_options = { - "AWS_REGION":, - 'AWS_ACCESS_KEY_ID': , - 'AWS_SECRET_ACCESS_KEY': , - 'AWS_S3_LOCKING_PROVIDER': 'dynamodb', - 'DELTA_DYNAMO_TABLE_NAME': 'delta_log', -} -``` - -4. Write data to Delta table using the `storage_options` kwarg. - - ```python - df.write_delta( - "s3://bucket/delta_table", - storage_options=storage_options, - ) - ``` +# create a toy dataframe +import polars as pl +df = pl.DataFrame({"foo": [1, 2, 3, 4, 5]}) -## Delta Lake on AWS S3: Safe Concurrent Writes +# define path +table_path = "gs://bucket/delta-table" -You need a locking provider to ensure safe concurrent writes when writing Delta tables to AWS S3. This is because AWS S3 does not guarantee mutual exclusion. +# write Delta to GCS +df.write_delta(table_path) +``` -A locking provider guarantees that only one writer is able to create the same file. This prevents corrupted or conflicting data. +## Passing GCS Credentials explicitly -`delta-rs` uses DynamoDB to guarantee safe concurrent writes. +Alternatively, you can pass GCS credentials to your query engine explicitly. -Run the code below in your terminal to create a DynamoDB table that will act as your locking provider. +For Polars, you would do this using the `storage_options` keyword. This will forward your credentials to the `object store` library that Polars uses under the hood. Read the [Polars documentation](https://docs.pola.rs/api/python/stable/reference/api/polars.DataFrame.write_delta.html) and the [`object store` documentation](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html#variants) for more information. 
-``` - aws dynamodb create-table \ - --table-name delta_log \ - --attribute-definitions AttributeName=tablePath,AttributeType=S AttributeName=fileName,AttributeType=S \ - --key-schema AttributeName=tablePath,KeyType=HASH AttributeName=fileName,KeyType=RANGE \ - --provisioned-throughput ReadCapacityUnits=5,WriteCapacityUnits=5 -``` +## Delta Lake on GCS: Required permissions -If for some reason you don't want to use DynamoDB as your locking mechanism you can choose to set the `AWS_S3_ALLOW_UNSAFE_RENAME` variable to `true` in order to enable S3 unsafe writes. +You will need the following permissions in your GCS account: -Read more in the [Usage](../../usage/writing/writing-to-s3-with-locking-provider.md) section. +- `storage.objects.create` +- `storage.objects.delete` (only required for uploads that overwrite an existing object) +- `storage.objects.get` (only required if you plan on using the Google Cloud CLI) +- `storage.objects.list` (only required if you plan on using the Google Cloud CLI) -## Delta Lake on GCS: Required permissions +For more information, see the [GCP documentation](https://cloud.google.com/storage/docs/uploading-objects) From 5d10cbe786211bfb18fb31f6a49a3838b179e3b4 Mon Sep 17 00:00:00 2001 From: Avril Aysha <68642378+avriiil@users.noreply.github.com> Date: Tue, 24 Sep 2024 15:39:44 +0100 Subject: [PATCH 5/9] fix typos --- docs/integrations/object-storage/s3-like.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/integrations/object-storage/s3-like.md b/docs/integrations/object-storage/s3-like.md index 4d32f7c41b..40b2f6e076 100644 --- a/docs/integrations/object-storage/s3-like.md +++ b/docs/integrations/object-storage/s3-like.md @@ -1,8 +1,8 @@ # CloudFlare R2 & Minio -`delta-rs` offers native support for using Cloudflare R2 and Minio's as storage backend. R2 and Minio support conditional puts, however we have to pass this flag into the storage options. See the example blow +`delta-rs` offers native support for using Cloudflare R2 and Minio's as storage backend. R2 and Minio support conditional puts, however we have to pass this flag into the storage options. See the example below -You don’t need to install any extra dependencies to red/write Delta tables to S3 with engines that use `delta-rs`. You do need to configure your AWS access credentials correctly. +You don’t need to install any extra dependencies to read/write Delta tables to S3 with engines that use `delta-rs`. You do need to configure your AWS access credentials correctly. ## Passing S3 Credentials From 22ff7a1c6a839f0c9ef28c1f205ecd25bf8e41b9 Mon Sep 17 00:00:00 2001 From: Avril Aysha <68642378+avriiil@users.noreply.github.com> Date: Wed, 25 Sep 2024 12:14:41 +0100 Subject: [PATCH 6/9] add adls docs --- docs/integrations/object-storage/adls.md | 57 ++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 docs/integrations/object-storage/adls.md diff --git a/docs/integrations/object-storage/adls.md b/docs/integrations/object-storage/adls.md new file mode 100644 index 0000000000..2867c07da3 --- /dev/null +++ b/docs/integrations/object-storage/adls.md @@ -0,0 +1,57 @@ +# Azure ADLS Storage Backend + +`delta-rs` offers native support for using Microsoft Azure Data Lake Storage (ADSL) as an object storage backend. + +You don’t need to install any extra dependencies to read/write Delta tables to S3 with engines that use `delta-rs`. You do need to configure your ADLS access credentials correctly. 
+ +## Passing Credentials Explicitly + +You can also pass ADLS credentials to your query engine explicitly. + +For Polars, you would do this using the `storage_options` keyword as demonstrated above. This will forward your credentials to the `object store` library that Polars uses for cloud storage access under the hood. Read the [`object store` documentation](https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html#variants) for more information defining specific credentials. + +## Example: Write Delta table to ADLS with Polars + +Using Polars, you can write a Delta table to ADLS directly like this: + +```python +import polars as pl + +df = pl.DataFrame({"foo": [1, 2, 3, 4, 5]}) + +# define container name +container = + +# define credentials +storage_options = { + "ACCOUNT_NAME": , + "ACCESS_KEY": , +} + +# write Delta to ADLS +df_pl.write_delta( + f"abfs://{container}/delta_table", + storage_options = storage_options +) +``` + +## Example with pandas + +For libraries without direct `write_delta` methods (like Pandas), you can use the `write_deltalake` function from the `deltalake` library: + +```python +import pandas as pd +from deltalake import write_deltalake + +df = pd.DataFrame({"foo": [1, 2, 3, 4, 5]}) + +write_deltalake( + f"abfs://{container}/delta_table_pandas", + df, + storage_options=storage_options +) +``` + +## Using Local Authentication + +If your local session is authenticated using the Azure CLI then you can write Delta tables directly to ADLS. Read more about this in the [Azure CLI documentation](https://learn.microsoft.com/en-us/cli/azure/). From b3b2b9e856a71c3c78d014de104d2699c09c5790 Mon Sep 17 00:00:00 2001 From: Avril Aysha <68642378+avriiil@users.noreply.github.com> Date: Wed, 25 Sep 2024 12:15:53 +0100 Subject: [PATCH 7/9] add adls docs to nav --- mkdocs.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/mkdocs.yml b/mkdocs.yml index b0c8d3a0ac..baf28ff3fc 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -82,6 +82,7 @@ nav: - api/exceptions.md - Integrations: - Object Storage: + - integrations/object-storage/adls.md - integrations/object-storage/hdfs.md - integrations/object-storage/s3.md - integrations/object-storage/s3-like.md From 2498837ff6a2c3525058f1a9fd1301ba50fecbba Mon Sep 17 00:00:00 2001 From: Filip Dziuba Date: Wed, 25 Sep 2024 15:14:39 +0200 Subject: [PATCH 8/9] refactor: exposing CommitConflictError enum --- crates/core/src/operations/transaction/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/core/src/operations/transaction/mod.rs b/crates/core/src/operations/transaction/mod.rs index 6c4e81dc63..69027cc4b7 100644 --- a/crates/core/src/operations/transaction/mod.rs +++ b/crates/core/src/operations/transaction/mod.rs @@ -83,7 +83,7 @@ use object_store::path::Path; use object_store::Error as ObjectStoreError; use serde_json::Value; -use self::conflict_checker::{CommitConflictError, TransactionInfo, WinningCommitSummary}; +use self::conflict_checker::{TransactionInfo, WinningCommitSummary}; use crate::checkpoints::{cleanup_expired_logs_for, create_checkpoint_for}; use crate::errors::DeltaTableError; use crate::kernel::{ @@ -97,6 +97,7 @@ use crate::table::config::TableConfig; use crate::table::state::DeltaTableState; use crate::{crate_version, DeltaResult}; +pub use self::conflict_checker::CommitConflictError; pub use self::protocol::INSTANCE as PROTOCOL; #[cfg(test)] From 6b53ac79451b3203506f281fafaa3e9f876f03a9 Mon Sep 17 00:00:00 2001 From: Avril Aysha <68642378+avriiil@users.noreply.github.com> 
Date: Fri, 27 Sep 2024 11:38:19 +0100 Subject: [PATCH 9/9] squash adding adls docs --- .github/CODEOWNERS | 2 +- .github/actions/setup-env/action.yml | 34 + .github/codecov.yml | 17 + .github/dependabot.yml | 3 +- .github/workflows/build.yml | 37 +- .github/workflows/codecov.yml | 36 + .github/workflows/dev_pr.yml | 1 + .github/workflows/docs.yml | 25 +- .github/workflows/python_benchmark.yml | 54 + .github/workflows/python_build.yml | 136 +-- .github/workflows/python_release.yml | 3 +- .gitignore | 4 +- CHANGELOG.md | 296 +++++ CONTRIBUTING.md | 38 +- Cargo.toml | 49 +- README.md | 6 +- crates/aws/Cargo.toml | 19 +- crates/aws/src/constants.rs | 141 +++ crates/aws/src/credentials.rs | 331 ++++-- crates/aws/src/lib.rs | 427 ++++--- crates/aws/src/logstore/default_logstore.rs | 113 ++ .../dynamodb_logstore.rs} | 27 +- crates/aws/src/logstore/mod.rs | 11 + crates/aws/src/storage.rs | 604 +++++----- crates/aws/tests/common.rs | 6 +- crates/aws/tests/integration_s3_dynamodb.rs | 76 +- crates/aws/tests/repair_s3_rename_test.rs | 22 +- crates/azure/Cargo.toml | 4 +- crates/azure/tests/integration.rs | 5 +- crates/benchmarks/src/bin/merge.rs | 3 +- crates/catalog-glue/Cargo.toml | 6 +- crates/core/Cargo.toml | 22 +- crates/core/src/data_catalog/storage/mod.rs | 5 +- .../core/src/data_catalog/unity/datafusion.rs | 2 +- crates/core/src/data_catalog/unity/models.rs | 2 +- crates/core/src/delta_datafusion/cdf/mod.rs | 66 +- crates/core/src/delta_datafusion/cdf/scan.rs | 8 +- .../src/delta_datafusion/cdf/scan_utils.rs | 17 +- crates/core/src/delta_datafusion/expr.rs | 126 +- .../delta_datafusion/find_files/logical.rs | 13 +- .../src/delta_datafusion/find_files/mod.rs | 30 +- .../delta_datafusion/find_files/physical.rs | 14 +- crates/core/src/delta_datafusion/logical.rs | 15 +- crates/core/src/delta_datafusion/mod.rs | 905 ++++++++++----- crates/core/src/delta_datafusion/physical.rs | 21 +- crates/core/src/delta_datafusion/planner.rs | 58 + .../src/delta_datafusion/schema_adapter.rs | 82 ++ crates/core/src/errors.rs | 11 + crates/core/src/kernel/arrow/mod.rs | 472 +------- crates/core/src/kernel/error.rs | 10 - crates/core/src/kernel/expressions/eval.rs | 384 ------- crates/core/src/kernel/expressions/mod.rs | 478 -------- crates/core/src/kernel/expressions/scalars.rs | 559 --------- crates/core/src/kernel/mod.rs | 8 +- crates/core/src/kernel/models/actions.rs | 361 +++++- crates/core/src/kernel/models/fields.rs | 10 +- crates/core/src/kernel/models/schema.rs | 838 +------------- crates/core/src/kernel/scalars.rs | 286 +++++ crates/core/src/kernel/snapshot/log_data.rs | 225 +++- .../core/src/kernel/snapshot/log_segment.rs | 168 ++- crates/core/src/kernel/snapshot/mod.rs | 357 +++++- crates/core/src/kernel/snapshot/parse.rs | 29 +- crates/core/src/kernel/snapshot/replay.rs | 447 +++++++- crates/core/src/kernel/snapshot/serde.rs | 3 +- crates/core/src/lib.rs | 9 +- crates/core/src/logstore/default_logstore.rs | 53 +- crates/core/src/logstore/mod.rs | 111 +- crates/core/src/operations/add_column.rs | 113 ++ crates/core/src/operations/add_feature.rs | 196 ++++ crates/core/src/operations/cast.rs | 354 ------ .../core/src/operations/cast/merge_schema.rs | 352 ++++++ crates/core/src/operations/cast/mod.rs | 650 +++++++++++ crates/core/src/operations/cdc.rs | 415 +++++++ crates/core/src/operations/constraints.rs | 8 +- .../core/src/operations/convert_to_delta.rs | 106 +- crates/core/src/operations/create.rs | 115 +- crates/core/src/operations/delete.rs | 445 ++++++-- .../core/src/operations/filesystem_check.rs 
| 6 +- crates/core/src/operations/load.rs | 3 + crates/core/src/operations/load_cdf.rs | 267 ++++- crates/core/src/operations/merge/barrier.rs | 45 +- crates/core/src/operations/merge/filter.rs | 943 +++++++++++++++ crates/core/src/operations/merge/mod.rs | 963 ++++++++-------- crates/core/src/operations/mod.rs | 65 +- crates/core/src/operations/optimize.rs | 160 ++- crates/core/src/operations/restore.rs | 21 +- .../core/src/operations/set_tbl_properties.rs | 215 +--- .../transaction/conflict_checker.rs | 104 +- crates/core/src/operations/transaction/mod.rs | 120 +- .../src/operations/transaction/protocol.rs | 156 ++- .../core/src/operations/transaction/state.rs | 175 +-- .../src/operations/transaction/test_utils.rs | 171 --- crates/core/src/operations/update.rs | 554 ++++++--- crates/core/src/operations/vacuum.rs | 5 +- crates/core/src/operations/write.rs | 892 +++++++++++++-- crates/core/src/operations/writer.rs | 11 +- crates/core/src/protocol/checkpoints.rs | 28 +- crates/core/src/protocol/mod.rs | 75 +- crates/core/src/schema/partitions.rs | 205 +++- crates/core/src/storage/file.rs | 42 +- crates/core/src/storage/mod.rs | 324 +++++- crates/core/src/storage/retry_ext.rs | 5 +- crates/core/src/table/builder.rs | 198 ++-- crates/core/src/table/config.rs | 89 +- crates/core/src/table/mod.rs | 33 +- crates/core/src/table/state.rs | 5 + crates/core/src/table/state_arrow.rs | 23 +- .../core/src/test_utils/factories/actions.rs | 153 +++ crates/core/src/test_utils/factories/data.rs | 247 ++++ crates/core/src/test_utils/factories/mod.rs | 66 ++ crates/core/src/test_utils/mod.rs | 5 + crates/core/src/writer/json.rs | 9 +- crates/core/src/writer/record_batch.rs | 36 +- crates/core/src/writer/stats.rs | 25 +- crates/core/src/writer/test_utils.rs | 10 +- crates/core/src/writer/utils.rs | 6 +- crates/core/tests/checkpoint_writer.rs | 14 +- crates/core/tests/command_merge.rs | 31 +- crates/core/tests/command_optimize.rs | 2 +- crates/core/tests/command_restore.rs | 8 +- crates/core/tests/fs_common/mod.rs | 23 +- crates/core/tests/integration_checkpoint.rs | 10 +- crates/core/tests/integration_datafusion.rs | 87 +- .../core/tests/read_delta_partitions_test.rs | 116 -- crates/deltalake/Cargo.toml | 16 +- crates/deltalake/src/lib.rs | 2 + crates/gcp/Cargo.toml | 4 +- crates/gcp/src/storage.rs | 21 +- crates/gcp/tests/context.rs | 2 +- crates/hdfs/Cargo.toml | 29 + crates/hdfs/src/lib.rs | 48 + crates/hdfs/tests/context.rs | 60 + crates/hdfs/tests/integration.rs | 16 + crates/mount/Cargo.toml | 4 +- crates/mount/src/file.rs | 27 +- crates/sql/src/logical_plan.rs | 44 +- crates/sql/src/planner.rs | 21 +- crates/test/Cargo.toml | 4 +- crates/test/src/concurrent.rs | 2 +- crates/test/src/datafusion.rs | 8 +- crates/test/src/lib.rs | 14 +- .../_delta_log/00000000000000000000.json | 3 + .../_delta_log/00000000000000000001.json | 3 + .../00000000000000000002.checkpoint.parquet | Bin 0 -> 41898 bytes .../_delta_log/00000000000000000002.json | 2 + .../_delta_log/_last_checkpoint | 1 + ...411e-bca9-b067444cbcb0-c000.snappy.parquet | Bin 0 -> 5489 bytes ...4453-9202-51d75dee59af-c000.snappy.parquet | Bin 0 -> 5489 bytes dev/publish.sh | 11 + dev/release/update_change_log.sh | 4 +- docs/Makefile | 20 + docs/api/delta_writer.md | 4 + .../architecture-of-delta-table.md | 18 +- .../delta-lake-acid-transactions.md | 90 +- docs/integrations/delta-lake-daft.md | 10 + docs/integrations/delta-lake-dagster.md | 4 +- docs/integrations/object-storage/adls.md | 57 + docs/integrations/object-storage/gcs.md | 44 + 
docs/integrations/object-storage/hdfs.md | 48 + docs/integrations/object-storage/s3-like.md | 83 ++ docs/integrations/object-storage/s3.md | 102 ++ docs/requirements.txt | 4 +- docs/usage/loading-table.md | 55 +- docs/usage/managing-tables.md | 9 +- .../small-file-compaction-with-optimize.md | 2 +- docs/usage/writing/index.md | 35 +- .../writing-to-s3-with-locking-provider.md | 53 +- mkdocs.yml | 5 + python/.gitignore | 1 + python/Cargo.toml | 11 +- python/Makefile | 7 +- python/deltalake/__init__.py | 13 +- python/deltalake/_internal.pyi | 166 ++- python/deltalake/schema.py | 111 +- python/deltalake/table.py | 1016 ++++++++++------- python/deltalake/writer.py | 307 +++-- python/docs/source/usage.rst | 26 + python/pyproject.toml | 31 +- python/src/features.rs | 56 + python/src/filesystem.rs | 170 +-- python/src/lib.rs | 966 +++++++++------- python/src/merge.rs | 214 ++++ python/src/schema.rs | 215 ++-- python/src/utils.rs | 36 + python/stubs/pyarrow/__init__.pyi | 7 + python/stubs/pyarrow/parquet.pyi | 8 + python/tests/conftest.py | 89 +- .../test_write_to_pyspark.py | 4 +- python/tests/test_alter.py | 158 ++- python/tests/test_benchmark.py | 2 +- python/tests/test_cdf.py | 267 ++++- python/tests/test_checkpoint.py | 113 ++ python/tests/test_delete.py | 5 +- python/tests/test_file_system_handler.py | 2 +- python/tests/test_fs.py | 25 +- python/tests/test_merge.py | 100 +- python/tests/test_optimize.py | 44 +- python/tests/test_repair.py | 4 +- python/tests/test_restore.py | 4 +- python/tests/test_schema.py | 130 ++- python/tests/test_table_read.py | 188 ++- python/tests/test_update.py | 10 +- python/tests/test_vacuum.py | 4 +- python/tests/test_writer.py | 349 ++++-- python/tests/test_writerproperties.py | 56 +- 205 files changed, 15934 insertions(+), 8216 deletions(-) create mode 100644 .github/actions/setup-env/action.yml create mode 100644 .github/codecov.yml create mode 100644 .github/workflows/codecov.yml create mode 100644 .github/workflows/python_benchmark.yml create mode 100644 crates/aws/src/constants.rs create mode 100644 crates/aws/src/logstore/default_logstore.rs rename crates/aws/src/{logstore.rs => logstore/dynamodb_logstore.rs} (94%) create mode 100644 crates/aws/src/logstore/mod.rs create mode 100644 crates/core/src/delta_datafusion/planner.rs create mode 100644 crates/core/src/delta_datafusion/schema_adapter.rs delete mode 100644 crates/core/src/kernel/expressions/eval.rs delete mode 100644 crates/core/src/kernel/expressions/mod.rs delete mode 100644 crates/core/src/kernel/expressions/scalars.rs create mode 100644 crates/core/src/kernel/scalars.rs create mode 100644 crates/core/src/operations/add_column.rs create mode 100644 crates/core/src/operations/add_feature.rs delete mode 100644 crates/core/src/operations/cast.rs create mode 100644 crates/core/src/operations/cast/merge_schema.rs create mode 100644 crates/core/src/operations/cast/mod.rs create mode 100644 crates/core/src/operations/cdc.rs create mode 100644 crates/core/src/operations/merge/filter.rs delete mode 100644 crates/core/src/operations/transaction/test_utils.rs create mode 100644 crates/core/src/test_utils/factories/actions.rs create mode 100644 crates/core/src/test_utils/factories/data.rs create mode 100644 crates/core/src/test_utils/factories/mod.rs create mode 100644 crates/core/src/test_utils/mod.rs create mode 100644 crates/hdfs/Cargo.toml create mode 100644 crates/hdfs/src/lib.rs create mode 100644 crates/hdfs/tests/context.rs create mode 100644 crates/hdfs/tests/integration.rs create mode 100644 
crates/test/tests/data/delta-checkpoint-stats-optional/_delta_log/00000000000000000000.json create mode 100644 crates/test/tests/data/delta-checkpoint-stats-optional/_delta_log/00000000000000000001.json create mode 100644 crates/test/tests/data/delta-checkpoint-stats-optional/_delta_log/00000000000000000002.checkpoint.parquet create mode 100644 crates/test/tests/data/delta-checkpoint-stats-optional/_delta_log/00000000000000000002.json create mode 100644 crates/test/tests/data/delta-checkpoint-stats-optional/_delta_log/_last_checkpoint create mode 100644 crates/test/tests/data/delta-checkpoint-stats-optional/part-00000-28925d3a-bdf2-411e-bca9-b067444cbcb0-c000.snappy.parquet create mode 100644 crates/test/tests/data/delta-checkpoint-stats-optional/part-00000-7a509247-4f58-4453-9202-51d75dee59af-c000.snappy.parquet create mode 100755 dev/publish.sh create mode 100644 docs/Makefile create mode 100644 docs/integrations/object-storage/adls.md create mode 100644 docs/integrations/object-storage/gcs.md create mode 100644 docs/integrations/object-storage/hdfs.md create mode 100644 docs/integrations/object-storage/s3-like.md create mode 100644 docs/integrations/object-storage/s3.md create mode 100644 python/src/features.rs create mode 100644 python/src/merge.rs create mode 100644 python/stubs/pyarrow/parquet.pyi diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index b99809d1f6..736703c551 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,4 +1,4 @@ -crates/ @wjones127 @roeap @rtyler +crates/ @wjones127 @roeap @rtyler @hntd187 @ion-elgreco delta-inspect/ @wjones127 @rtyler proofs/ @houqp python/ @wjones127 @fvaleye @roeap @ion-elgreco diff --git a/.github/actions/setup-env/action.yml b/.github/actions/setup-env/action.yml new file mode 100644 index 0000000000..7875107ddd --- /dev/null +++ b/.github/actions/setup-env/action.yml @@ -0,0 +1,34 @@ +name: "Setup Python and Rust Environment" +description: "Set up Python, virtual environment, and Rust toolchain" + +inputs: + + python-version: + description: "The Python version to set up" + required: true + default: "3.10" + + rust-toolchain: + description: "The Rust toolchain to set up" + required: true + default: "stable" + +runs: + using: "composite" + + steps: + + - name: Set up Python ${{ inputs.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ inputs.python-version }} + + - name: Install Rust toolchain + uses: actions-rs/toolchain@v1 + with: + profile: default + toolchain: ${{ inputs.rust-toolchain }} + override: true + components: rustfmt, clippy + + - uses: Swatinem/rust-cache@v2 \ No newline at end of file diff --git a/.github/codecov.yml b/.github/codecov.yml new file mode 100644 index 0000000000..dd93c3b7cf --- /dev/null +++ b/.github/codecov.yml @@ -0,0 +1,17 @@ + +coverage: + status: + project: + default: + # allow some leniency on the deviation of pull requests + threshold: '1%' + informational: true + patch: + default: + informational: true + + +ignore: + - "delta-inspect/" + - "proofs/" + - "**/*.toml" diff --git a/.github/dependabot.yml b/.github/dependabot.yml index bdacb4c00c..1e5b6b27a4 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -10,6 +10,5 @@ updates: ignore: # arrow and datafusion are bumped manually - dependency-name: "arrow*" - update-types: ["version-update:semver-major"] - dependency-name: "datafusion*" - update-types: ["version-update:semver-major"] + - dependency-name: "parquet" diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 
8b80dc0a9f..a807184c47 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -5,6 +5,10 @@ on: branches: [main, "rust-v*"] pull_request: branches: [main, "rust-v*"] + merge_group: + +env: + DEFAULT_FEATURES: "azure,datafusion,s3,gcs,glue,hdfs " jobs: format: @@ -16,7 +20,7 @@ jobs: uses: actions-rs/toolchain@v1 with: profile: default - toolchain: stable + toolchain: '1.80' override: true - name: Format @@ -28,7 +32,6 @@ jobs: matrix: os: - ubuntu-latest - - macos-11 - windows-latest runs-on: ${{ matrix.os }} @@ -39,17 +42,17 @@ jobs: uses: actions-rs/toolchain@v1 with: profile: default - toolchain: stable + toolchain: '1.80' override: true - name: build and lint with clippy - run: cargo clippy --features azure,datafusion,s3,gcs,glue --tests + run: cargo clippy --features ${{ env.DEFAULT_FEATURES }} --tests - name: Spot-check build for native-tls features run: cargo clippy --no-default-features --features azure,datafusion,s3-native-tls,gcs,glue --tests - name: Check docs - run: cargo doc --features azure,datafusion,s3,gcs,glue + run: cargo doc --features ${{ env.DEFAULT_FEATURES }} - name: Check no default features (except rustls) run: cargo check --no-default-features --features rustls @@ -60,7 +63,6 @@ jobs: matrix: os: - ubuntu-latest - - macos-11 - windows-latest runs-on: ${{ matrix.os }} env: @@ -77,11 +79,11 @@ jobs: uses: actions-rs/toolchain@v1 with: profile: default - toolchain: "stable" + toolchain: '1.80' override: true - name: Run tests - run: cargo test --verbose --features datafusion,azure + run: cargo test --verbose --features ${{ env.DEFAULT_FEATURES }} integration_test: name: Integration Tests @@ -94,6 +96,7 @@ jobs: # https://github.com/rust-lang/cargo/issues/10280 CARGO_NET_GIT_FETCH_WITH_CLI: "true" RUST_BACKTRACE: "1" + RUST_LOG: debug AWS_DEFAULT_REGION: "us-east-1" AWS_ACCESS_KEY_ID: deltalake AWS_SECRET_ACCESS_KEY: weloverust @@ -111,15 +114,27 @@ jobs: uses: actions-rs/toolchain@v1 with: profile: default - toolchain: stable + toolchain: '1.80' override: true + # Install Java and Hadoop for HDFS integration tests + - uses: actions/setup-java@v4 + with: + distribution: "temurin" + java-version: "17" + + - name: Download Hadoop + run: | + wget -q https://dlcdn.apache.org/hadoop/common/hadoop-3.4.0/hadoop-3.4.0.tar.gz + tar -xf hadoop-3.4.0.tar.gz -C $GITHUB_WORKSPACE + echo "$GITHUB_WORKSPACE/hadoop-3.4.0/bin" >> $GITHUB_PATH + - name: Start emulated services - run: docker-compose up -d + run: docker compose up -d - name: Run tests with rustls (default) run: | - cargo test --features integration_test,azure,s3,gcs,datafusion + cargo test --features integration_test,${{ env.DEFAULT_FEATURES }} - name: Run tests with native-tls run: | diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml new file mode 100644 index 0000000000..a8d9beabcd --- /dev/null +++ b/.github/workflows/codecov.yml @@ -0,0 +1,36 @@ +name: coverage + +on: + push: + branches: [main, "rust-v*"] + pull_request: + branches: [main, "rust-v*"] + +env: + DEFAULT_FEATURES: "azure,datafusion,s3,gcs,glue,hdfs " + +jobs: + coverage: + runs-on: ubuntu-latest + env: + CARGO_TERM_COLOR: always + steps: + - uses: actions/checkout@v4 + - name: Install rust + uses: actions-rs/toolchain@v1 + with: + profile: default + toolchain: '1.80' + override: true + - name: Install cargo-llvm-cov + uses: taiki-e/install-action@cargo-llvm-cov + - uses: Swatinem/rust-cache@v2 + - name: Generate code coverage + run: cargo llvm-cov --features ${DEFAULT_FEATURES} --workspace --codecov 
--output-path codecov.json -- --skip read_table_version_hdfs + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v4 + with: + files: codecov.json + fail_ci_if_error: true + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} diff --git a/.github/workflows/dev_pr.yml b/.github/workflows/dev_pr.yml index 6b3d5a7ddb..121e0b8882 100644 --- a/.github/workflows/dev_pr.yml +++ b/.github/workflows/dev_pr.yml @@ -2,6 +2,7 @@ name: dev_pr # Trigger whenever a PR is changed (title as well as new / changed commits) on: + merge_group: pull_request_target: types: - opened diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 079cd66fcc..5729b87624 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -1,6 +1,7 @@ name: Build (and maybe release) the documentation on: + merge_group: pull_request: paths: - python/** @@ -31,9 +32,9 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - - uses: psf/black@stable - with: - src: docs/src/python + - run: | + cd docs + make check build-deploy: needs: @@ -47,25 +48,13 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Install Rust - uses: actions-rs/toolchain@v1 - with: - toolchain: stable - override: true - components: rustfmt, clippy - - - uses: Swatinem/rust-cache@v2 - - - name: Set up Python - uses: actions/setup-python@v3 - with: - python-version: '3.10' + - name: Setup Environment + uses: ./.github/actions/setup-env - name: Build and install deltalake run: | cd python - pip install virtualenv - virtualenv venv + python -m venv venv source venv/bin/activate make ${{ env.BUILD_ARGS }} diff --git a/.github/workflows/python_benchmark.yml b/.github/workflows/python_benchmark.yml new file mode 100644 index 0000000000..896c5cc412 --- /dev/null +++ b/.github/workflows/python_benchmark.yml @@ -0,0 +1,54 @@ +name: python_benchmark + + +# This is separate from the python_build so that it doesn't need to run on the merge group +on: + push: + branches: [main] + pull_request: + branches: [main] + +defaults: + run: + working-directory: ./python + +jobs: + benchmark: + name: Python Benchmark + runs-on: ubuntu-latest + env: + RUSTFLAGS: "-C debuginfo=line-tables-only" + CARGO_INCREMENTAL: 0 + + steps: + - uses: actions/checkout@v2 + + - name: Setup Environment + uses: ./.github/actions/setup-env + + - name: Build deltalake in release mode + run: | + python -m venv venv + source venv/bin/activate + MATURIN_EXTRA_ARGS=--release make develop + + # Download previous benchmark result from cache (if exists) + - name: Download previous benchmark data + uses: actions/cache@v2 + with: + path: ./cache + key: ${{ runner.os }}-benchmark + + - name: Run benchmark + run: | + source venv/bin/activate + pytest tests/test_benchmark.py -m benchmark --benchmark-json output.json + + - name: Store benchmark result + uses: benchmark-action/github-action-benchmark@v1 + with: + tool: "pytest" + output-file-path: python/output.json + external-data-json-path: ./cache/benchmark-data.json + fail-on-alert: true + diff --git a/.github/workflows/python_build.yml b/.github/workflows/python_build.yml index bc2f20cc9a..dc5483e091 100644 --- a/.github/workflows/python_build.yml +++ b/.github/workflows/python_build.yml @@ -1,6 +1,7 @@ name: python_build on: + merge_group: push: branches: [main] pull_request: @@ -15,28 +16,22 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - - name: Setup Python - uses: actions/setup-python@v2 - with: - python-version: 3.8 + + - name: Setup Environment + uses: 
./.github/actions/setup-env - name: Check Python run: | - pip install ruff black mypy types-dataclasses typing-extensions + python -m venv venv + source venv/bin/activate + pip install ruff==0.5.2 mypy==1.10.1 types-dataclasses typing-extensions make check-python - - name: Install minimal stable with clippy and rustfmt - uses: actions-rs/toolchain@v1 - with: - profile: default - toolchain: stable - override: true - - name: Check Rust run: make check-rust test-minimal: - name: Python Build (Python 3.8 PyArrow 8.0.0) + name: Python Build (Python 3.8 PyArrow 16.0.0) runs-on: ubuntu-latest env: RUSTFLAGS: "-C debuginfo=line-tables-only" @@ -45,28 +40,18 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Setup Python - uses: actions/setup-python@v2 + - name: Setup Environment + uses: ./.github/actions/setup-env with: python-version: 3.8 - - name: Install latest nightly - uses: actions-rs/toolchain@v1 - with: - toolchain: stable - override: true - components: rustfmt, clippy - - - uses: Swatinem/rust-cache@v2 - - name: Build and install deltalake run: | - pip install virtualenv - virtualenv venv + python -m venv venv source venv/bin/activate make setup # Install minimum PyArrow version - pip install -e .[pandas,devel] pyarrow==8.0.0 + pip install -e .[pandas,devel] pyarrow==16.0.0 env: RUSTFLAGS: "-C debuginfo=line-tables-only" @@ -75,10 +60,6 @@ jobs: source venv/bin/activate make unit-test - # - name: Run Integration tests - # run: | - # py.test --cov tests -m integration - test: name: Python Build (Python 3.10 PyArrow latest) runs-on: ubuntu-latest @@ -89,26 +70,15 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Install latest nightly - uses: actions-rs/toolchain@v1 - with: - toolchain: stable - override: true - components: rustfmt, clippy - - - uses: Swatinem/rust-cache@v2 - - - uses: actions/setup-python@v3 - with: - python-version: "3.10" + - name: Setup Environment + uses: ./.github/actions/setup-env - name: Start emulated services - run: docker-compose up -d + run: docker compose up -d - name: Build and install deltalake run: | - pip install virtualenv - virtualenv venv + python -m venv venv source venv/bin/activate make develop @@ -127,56 +97,6 @@ jobs: python -m pytest -m "not pandas and not integration and not benchmark" pip install pandas - benchmark: - name: Python Benchmark - runs-on: ubuntu-latest - env: - RUSTFLAGS: "-C debuginfo=line-tables-only" - CARGO_INCREMENTAL: 0 - - steps: - - uses: actions/checkout@v2 - - - name: Install latest nightly - uses: actions-rs/toolchain@v1 - with: - toolchain: stable - override: true - components: rustfmt, clippy - - - uses: Swatinem/rust-cache@v2 - - - uses: actions/setup-python@v4 - with: - python-version: "3.10" - - - name: Build deltalake in release mode - run: | - pip install virtualenv - virtualenv venv - source venv/bin/activate - MATURIN_EXTRA_ARGS=--release make develop - - # Download previous benchmark result from cache (if exists) - - name: Download previous benchmark data - uses: actions/cache@v2 - with: - path: ./cache - key: ${{ runner.os }}-benchmark - - - name: Run benchmark - run: | - source venv/bin/activate - pytest tests/test_benchmark.py -m benchmark --benchmark-json output.json - - - name: Store benchmark result - uses: benchmark-action/github-action-benchmark@v1 - with: - tool: "pytest" - output-file-path: python/output.json - external-data-json-path: ./cache/benchmark-data.json - fail-on-alert: true - test-pyspark: name: PySpark Integration Tests runs-on: ubuntu-latest @@ -187,18 +107,8 @@ jobs: steps: - uses: 
actions/checkout@v3 - - name: Install latest nightly - uses: actions-rs/toolchain@v1 - with: - toolchain: stable - override: true - components: rustfmt, clippy - - - uses: Swatinem/rust-cache@v2 - - - uses: actions/setup-python@v3 - with: - python-version: "3.10" + - name: Setup Environment + uses: ./.github/actions/setup-env - uses: actions/setup-java@v2 with: @@ -207,8 +117,7 @@ jobs: - name: Build and install deltalake run: | - pip install virtualenv - virtualenv venv + python -m venv venv source venv/bin/activate make develop-pyspark @@ -231,15 +140,14 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + - name: Setup Environment + uses: ./.github/actions/setup-env with: python-version: ${{ matrix.python-version }} - name: Build and install deltalake run: | - pip install virtualenv - virtualenv venv + python -m venv venv source venv/bin/activate make setup maturin develop diff --git a/.github/workflows/python_release.yml b/.github/workflows/python_release.yml index 48611bacb4..cf462f2070 100644 --- a/.github/workflows/python_release.yml +++ b/.github/workflows/python_release.yml @@ -35,7 +35,7 @@ jobs: fail-fast: false matrix: target: [x86_64-apple-darwin, aarch64-apple-darwin] - runs-on: macos-12 + runs-on: macos-14 steps: - uses: actions/checkout@v3 @@ -76,6 +76,7 @@ jobs: env: MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} with: + maturin-version: v1.6.0 # https://github.com/PyO3/maturin/issues/2154 target: x86_64-unknown-linux-gnu command: publish args: --skip-existing -m python/Cargo.toml ${{ env.FEATURES_FLAG }} diff --git a/.gitignore b/.gitignore index 84fc17c5f2..18dcc39f69 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ tlaplus/*.toolbox/*/[0-9]*-[0-9]*-[0-9]*-[0-9]*-[0-9]*-[0-9]*/ .vscode .env .venv +venv **/.DS_Store **/.python-version .coverage @@ -21,6 +22,7 @@ __blobstorage__ .githubchangeloggenerator.cache.log .githubchangeloggenerator.cache/ .githubchangeloggenerator* +data # Add all Cargo.lock files except for those in binary crates Cargo.lock @@ -30,4 +32,4 @@ Cargo.lock justfile site -__pycache__ +__pycache__ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index e68641da21..7c0c5099c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,301 @@ # Changelog +## [rust-v0.19.0](https://github.com/delta-io/delta-rs/tree/rust-v0.19.0) (2024-08-14) + +[Full Changelog](https://github.com/delta-io/delta-rs/compare/rust-v0.18.2...rust-v0.19.0) + +**Implemented enhancements:** + +- Only allow squash merge [\#2542](https://github.com/delta-io/delta-rs/issues/2542) + +**Fixed bugs:** + +- Write also insert change types in writer CDC [\#2750](https://github.com/delta-io/delta-rs/issues/2750) +- Regression in Python multiprocessing support [\#2744](https://github.com/delta-io/delta-rs/issues/2744) +- SchemaError occurs during table optimisation after upgrade to v0.18.1 [\#2731](https://github.com/delta-io/delta-rs/issues/2731) +- AWS WebIdentityToken exposure in log files [\#2719](https://github.com/delta-io/delta-rs/issues/2719) +- Write performance degrades with multiple writers [\#2683](https://github.com/delta-io/delta-rs/issues/2683) +- Write monotonic sequence, but read is non monotonic [\#2659](https://github.com/delta-io/delta-rs/issues/2659) +- Python `write_deltalake` with `schema_mode="merge"` casts types [\#2642](https://github.com/delta-io/delta-rs/issues/2642) +- Newest docs \(potentially\) not released 
[\#2587](https://github.com/delta-io/delta-rs/issues/2587) +- CDC is not generated for Structs and Lists [\#2568](https://github.com/delta-io/delta-rs/issues/2568) + +**Closed issues:** + +- delete\_dir bug [\#2713](https://github.com/delta-io/delta-rs/issues/2713) + +**Merged pull requests:** + +- chore: fix a bunch of clippy lints and re-enable tests [\#2773](https://github.com/delta-io/delta-rs/pull/2773) ([rtyler](https://github.com/rtyler)) +- feat: more economic data skipping with datafusion [\#2772](https://github.com/delta-io/delta-rs/pull/2772) ([roeap](https://github.com/roeap)) +- chore: prepare the next notable release of 0.19.0 [\#2768](https://github.com/delta-io/delta-rs/pull/2768) ([rtyler](https://github.com/rtyler)) +- feat: restore the TryFrom for DeltaTablePartition [\#2767](https://github.com/delta-io/delta-rs/pull/2767) ([rtyler](https://github.com/rtyler)) +- feat: fail fast on forked process [\#2765](https://github.com/delta-io/delta-rs/pull/2765) ([Tom-Newton](https://github.com/Tom-Newton)) +- perf: early stop if all values in arr are null [\#2764](https://github.com/delta-io/delta-rs/pull/2764) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(python, rust\): don't flatten fields during cdf read [\#2763](https://github.com/delta-io/delta-rs/pull/2763) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore: upgrade to datafusion 41 [\#2761](https://github.com/delta-io/delta-rs/pull/2761) ([rtyler](https://github.com/rtyler)) +- fix\(python, rust\): cdc in writer not creating inserts [\#2751](https://github.com/delta-io/delta-rs/pull/2751) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat: improved test fixtures [\#2749](https://github.com/delta-io/delta-rs/pull/2749) ([roeap](https://github.com/roeap)) +- feat: introduce CDC generation for merge operations [\#2747](https://github.com/delta-io/delta-rs/pull/2747) ([rtyler](https://github.com/rtyler)) +- docs: fix broken link in docs [\#2746](https://github.com/delta-io/delta-rs/pull/2746) ([astrojuanlu](https://github.com/astrojuanlu)) +- chore: update delta\_kernel to 0.3.0 [\#2742](https://github.com/delta-io/delta-rs/pull/2742) ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) +- chore: add to code\_owner crates [\#2741](https://github.com/delta-io/delta-rs/pull/2741) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore: update changelog and versions for next release [\#2740](https://github.com/delta-io/delta-rs/pull/2740) ([rtyler](https://github.com/rtyler)) +- feat\(python, rust\): arrow large/view types passthrough, rust default engine [\#2738](https://github.com/delta-io/delta-rs/pull/2738) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: column parsing to include nested columns and enclosing char [\#2737](https://github.com/delta-io/delta-rs/pull/2737) ([gtrawinski](https://github.com/gtrawinski)) + +## [rust-v0.18.2](https://github.com/delta-io/delta-rs/tree/rust-v0.18.2) (2024-08-07) + +[Full Changelog](https://github.com/delta-io/delta-rs/compare/rust-v0.18.1...rust-v0.18.2) + +**Implemented enhancements:** + +- Choose which columns to store min/max values for [\#2709](https://github.com/delta-io/delta-rs/issues/2709) +- Projection pushdown for load\_cdf [\#2681](https://github.com/delta-io/delta-rs/issues/2681) +- Way to check if Delta table exists at specified path [\#2662](https://github.com/delta-io/delta-rs/issues/2662) +- Support HDFS via hdfs-native package [\#2611](https://github.com/delta-io/delta-rs/issues/2611) +- Deletion `_change_type` does not appear in 
change data feed [\#2579](https://github.com/delta-io/delta-rs/issues/2579) +- Could you please explain in the README what "Deltalake" is for the uninitiated? [\#2523](https://github.com/delta-io/delta-rs/issues/2523) +- Discuss: Allow protocol change during write actions [\#2444](https://github.com/delta-io/delta-rs/issues/2444) +- Support for Arrow PyCapsule interface [\#2376](https://github.com/delta-io/delta-rs/issues/2376) + +**Fixed bugs:** + +- Slow add\_actions.to\_pydict for tables with large number of columns, impacting read performance [\#2733](https://github.com/delta-io/delta-rs/issues/2733) +- append is deleting records [\#2716](https://github.com/delta-io/delta-rs/issues/2716) +- segmentation fault - Python 3.10 on Mac M3 [\#2706](https://github.com/delta-io/delta-rs/issues/2706) +- Failure to delete dir and files [\#2703](https://github.com/delta-io/delta-rs/issues/2703) +- DeltaTable.from\_data\_catalog not working [\#2699](https://github.com/delta-io/delta-rs/issues/2699) +- Project should use the same version of `ruff` in the `lint` stage of `python_build.yml` as in `pyproject.toml` [\#2678](https://github.com/delta-io/delta-rs/issues/2678) +- un-tracked columns are giving json error when pyarrow schema have feild with nullable=False and create\_checkpoint is trigged [\#2675](https://github.com/delta-io/delta-rs/issues/2675) +- \[BUG\]write\_delta\({'custom\_metadata':str}\) cannot be converted. str to pyDict error \(0.18.2\_DeltaPython/Windows10\) [\#2697](https://github.com/delta-io/delta-rs/issues/2697) +- Pyarrow engine not supporting schema overwrite with Append mode [\#2654](https://github.com/delta-io/delta-rs/issues/2654) +- `deltalake-core` version re-exported by `deltalake` different than versions used by `deltalake-azure` and `deltalake-gcp` [\#2647](https://github.com/delta-io/delta-rs/issues/2647) +- i32 limit in JSON stats [\#2646](https://github.com/delta-io/delta-rs/issues/2646) +- Rust writer not encoding correct URL for partitions in delta table [\#2634](https://github.com/delta-io/delta-rs/issues/2634) +- Large Types breaks merge predicate pruning [\#2632](https://github.com/delta-io/delta-rs/issues/2632) +- Getting error when converting a partitioned parquet table to delta table [\#2626](https://github.com/delta-io/delta-rs/issues/2626) +- Arrow: Parquet does not support writing empty structs when creating checkpoint [\#2622](https://github.com/delta-io/delta-rs/issues/2622) +- InvalidTableLocation\("Unknown scheme: gs"\) on 0.18.0 [\#2610](https://github.com/delta-io/delta-rs/issues/2610) +- Unable to read delta table created using Uniform [\#2578](https://github.com/delta-io/delta-rs/issues/2578) +- schema merging doesn't work when overwriting with a predicate [\#2567](https://github.com/delta-io/delta-rs/issues/2567) +- Not working in AWS Lambda \(0.16.2 - 0.17.4\) OSError: Generic S3 error [\#2511](https://github.com/delta-io/delta-rs/issues/2511) +- DataFusion filter on partition column doesn't work. \(when the phsical schema ordering is different to logical one\) [\#2494](https://github.com/delta-io/delta-rs/issues/2494) +- Creating checkpoints for tables with missing column stats results in Err [\#2493](https://github.com/delta-io/delta-rs/issues/2493) +- Cannot merge to a table with a timestamp column after upgrading delta-rs [\#2478](https://github.com/delta-io/delta-rs/issues/2478) +- Azure AD Auth fails on ARM64 [\#2475](https://github.com/delta-io/delta-rs/issues/2475) +- Generic S3 error: Error after 0 retries ... 
Broken pipe \(os error 32\) [\#2403](https://github.com/delta-io/delta-rs/issues/2403) +- write\_deltalake identifies large\_string as datatype even though string is set in schema [\#2374](https://github.com/delta-io/delta-rs/issues/2374) +- Inconsistent arrow timestamp type breaks datafusion query [\#2341](https://github.com/delta-io/delta-rs/issues/2341) + +**Closed issues:** + +- Unable to write new partitions with type timestamp on tables created with delta-rs 0.10.0 [\#2631](https://github.com/delta-io/delta-rs/issues/2631) + +**Merged pull requests:** + +- fix: schema adapter doesn't map partial batches correctly [\#2735](https://github.com/delta-io/delta-rs/pull/2735) ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) +- perf: grab file size in rust [\#2734](https://github.com/delta-io/delta-rs/pull/2734) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat: use logical plan in update, refactor/simplify CDCTracker [\#2727](https://github.com/delta-io/delta-rs/pull/2727) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat: use logical plan in delete, delta planner refactoring [\#2725](https://github.com/delta-io/delta-rs/pull/2725) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore: try an alternative docke compose invocation syntax [\#2724](https://github.com/delta-io/delta-rs/pull/2724) ([rtyler](https://github.com/rtyler)) +- fix\(python, rust\): use input schema to get correct schema in cdf reads [\#2723](https://github.com/delta-io/delta-rs/pull/2723) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat\(python, rust\): cdc write-support for `overwrite` and `replacewhere` writes [\#2722](https://github.com/delta-io/delta-rs/pull/2722) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat\(python, rust\): cdc write-support for `delete` operation [\#2721](https://github.com/delta-io/delta-rs/pull/2721) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore: enabling actions for merge groups [\#2718](https://github.com/delta-io/delta-rs/pull/2718) ([rtyler](https://github.com/rtyler)) +- perf: apply projection when reading checkpoint parquet [\#2717](https://github.com/delta-io/delta-rs/pull/2717) ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) +- feat\(python\): add DeltaTable.is\_deltatable static method \(\#2662\) [\#2715](https://github.com/delta-io/delta-rs/pull/2715) ([omkar-foss](https://github.com/omkar-foss)) +- chore: prepare python release 0.18.3 [\#2707](https://github.com/delta-io/delta-rs/pull/2707) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(python, rust\): use url encoder when encoding partition values [\#2705](https://github.com/delta-io/delta-rs/pull/2705) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat\(python, rust\): add projection in CDF reads [\#2704](https://github.com/delta-io/delta-rs/pull/2704) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: ensure DataFusion SessionState Parquet options are applied to DeltaScan [\#2702](https://github.com/delta-io/delta-rs/pull/2702) ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) +- chore: refactor `write_deltalake` in `writer.py` [\#2695](https://github.com/delta-io/delta-rs/pull/2695) ([fpgmaas](https://github.com/fpgmaas)) +- fix\(python\): empty dataset fix for "pyarrow" engine [\#2689](https://github.com/delta-io/delta-rs/pull/2689) ([sherlockbeard](https://github.com/sherlockbeard)) +- chore: add test coverage command to `Makefile` [\#2688](https://github.com/delta-io/delta-rs/pull/2688) ([fpgmaas](https://github.com/fpgmaas)) +- 
chore: create separate action to setup python and rust in the cicd pipeline [\#2687](https://github.com/delta-io/delta-rs/pull/2687) ([fpgmaas](https://github.com/fpgmaas)) +- fix: update delta kernel version [\#2685](https://github.com/delta-io/delta-rs/pull/2685) ([jeppe742](https://github.com/jeppe742)) +- chore: update README.md [\#2684](https://github.com/delta-io/delta-rs/pull/2684) ([veronewra](https://github.com/veronewra)) +- fix\(rust,python\): checkpoint with column nullable false [\#2680](https://github.com/delta-io/delta-rs/pull/2680) ([sherlockbeard](https://github.com/sherlockbeard)) +- chore: pin `ruff` and `mypy` versions in the `lint` stage in the CI pipeline [\#2679](https://github.com/delta-io/delta-rs/pull/2679) ([fpgmaas](https://github.com/fpgmaas)) +- chore: enable `RUF` ruleset for `ruff` [\#2677](https://github.com/delta-io/delta-rs/pull/2677) ([fpgmaas](https://github.com/fpgmaas)) +- chore: remove stale code for conditional import of `Literal` [\#2676](https://github.com/delta-io/delta-rs/pull/2676) ([fpgmaas](https://github.com/fpgmaas)) +- chore: remove references to black from the project [\#2674](https://github.com/delta-io/delta-rs/pull/2674) ([fpgmaas](https://github.com/fpgmaas)) +- chore: bump ruff to 0.5.2 [\#2673](https://github.com/delta-io/delta-rs/pull/2673) ([fpgmaas](https://github.com/fpgmaas)) +- chore: improve contributing.md [\#2672](https://github.com/delta-io/delta-rs/pull/2672) ([fpgmaas](https://github.com/fpgmaas)) +- feat: support userMetadata in CommitInfo [\#2670](https://github.com/delta-io/delta-rs/pull/2670) ([jkylling](https://github.com/jkylling)) +- chore: upgrade to datafusion 40 [\#2661](https://github.com/delta-io/delta-rs/pull/2661) ([rtyler](https://github.com/rtyler)) +- docs: improve navigation fixes [\#2660](https://github.com/delta-io/delta-rs/pull/2660) ([avriiil](https://github.com/avriiil)) +- docs: add integration docs for s3 backend [\#2658](https://github.com/delta-io/delta-rs/pull/2658) ([avriiil](https://github.com/avriiil)) +- docs: fix bullets on hdfs docs [\#2653](https://github.com/delta-io/delta-rs/pull/2653) ([Kimahriman](https://github.com/Kimahriman)) +- ci: update CODEOWNERS [\#2650](https://github.com/delta-io/delta-rs/pull/2650) ([hntd187](https://github.com/hntd187)) +- feat\(rust\): fix size\_in\_bytes in last\_checkpoint\_ to i64 [\#2649](https://github.com/delta-io/delta-rs/pull/2649) ([sherlockbeard](https://github.com/sherlockbeard)) +- chore: increase subcrate versions [\#2648](https://github.com/delta-io/delta-rs/pull/2648) ([rtyler](https://github.com/rtyler)) +- chore: missed one macos runner reference in actions [\#2645](https://github.com/delta-io/delta-rs/pull/2645) ([rtyler](https://github.com/rtyler)) +- chore: add a reproduction case for merge failures with struct\ [\#2644](https://github.com/delta-io/delta-rs/pull/2644) ([rtyler](https://github.com/rtyler)) +- chore: remove macos builders from pull request flow [\#2638](https://github.com/delta-io/delta-rs/pull/2638) ([rtyler](https://github.com/rtyler)) +- fix: enable parquet pushdown for DeltaScan via TableProvider impl for DeltaTable \(rebase\) [\#2637](https://github.com/delta-io/delta-rs/pull/2637) ([rtyler](https://github.com/rtyler)) +- chore: fix documentation generation with a pin of griffe [\#2636](https://github.com/delta-io/delta-rs/pull/2636) ([rtyler](https://github.com/rtyler)) +- fix\(python\): fixed large\_dtype to schema convert [\#2635](https://github.com/delta-io/delta-rs/pull/2635) 
([sherlockbeard](https://github.com/sherlockbeard)) +- fix\(rust, python\): fix writing empty structs when creating checkpoint [\#2627](https://github.com/delta-io/delta-rs/pull/2627) ([sherlockbeard](https://github.com/sherlockbeard)) +- fix\(rust, python\): fix merge schema with overwrite [\#2623](https://github.com/delta-io/delta-rs/pull/2623) ([sherlockbeard](https://github.com/sherlockbeard)) +- chore: bump python 0.18.2 [\#2621](https://github.com/delta-io/delta-rs/pull/2621) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat: report DataFusion metrics for DeltaScan [\#2617](https://github.com/delta-io/delta-rs/pull/2617) ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) +- feat\(rust,python\): cast each parquet file to delta schema [\#2615](https://github.com/delta-io/delta-rs/pull/2615) ([HawaiianSpork](https://github.com/HawaiianSpork)) +- fix\(rust\): inconsistent order of partitioning columns \(\#2494\) [\#2614](https://github.com/delta-io/delta-rs/pull/2614) ([aditanase](https://github.com/aditanase)) +- docs: add Daft writer [\#2594](https://github.com/delta-io/delta-rs/pull/2594) ([avriiil](https://github.com/avriiil)) +- feat\(python, rust\): `add column` operation [\#2562](https://github.com/delta-io/delta-rs/pull/2562) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: change arrow map root name to follow with parquet root name [\#2538](https://github.com/delta-io/delta-rs/pull/2538) ([sclmn](https://github.com/sclmn)) +- feat\(python\): handle PyCapsule interface objects in write\_deltalake [\#2534](https://github.com/delta-io/delta-rs/pull/2534) ([kylebarron](https://github.com/kylebarron)) +- feat: improve merge performance by using predicate non-partition columns min/max for prefiltering [\#2513](https://github.com/delta-io/delta-rs/pull/2513) ([JonasDev1](https://github.com/JonasDev1)) +- feat\(python, rust\): cleanup expired logs post-commit hook [\#2459](https://github.com/delta-io/delta-rs/pull/2459) ([ion-elgreco](https://github.com/ion-elgreco)) + +## [rust-v0.18.0](https://github.com/delta-io/delta-rs/tree/rust-v0.18.0) (2024-06-12) + +[Full Changelog](https://github.com/delta-io/delta-rs/compare/rust-v0.17.3...rust-v0.18.0) + +**Implemented enhancements:** + +- documentation: concurrent writes for non-S3 backends [\#2556](https://github.com/delta-io/delta-rs/issues/2556) +- pyarrow options for `write_delta` [\#2515](https://github.com/delta-io/delta-rs/issues/2515) +- \[deltalake\_aws\] Allow configuring separate endpoints for S3 and DynamoDB clients. 
[\#2498](https://github.com/delta-io/delta-rs/issues/2498) +- Include file stats when converting a parquet directory to a Delta table [\#2490](https://github.com/delta-io/delta-rs/issues/2490) +- Adopt the delta kernel types [\#2489](https://github.com/delta-io/delta-rs/issues/2489) + +**Fixed bugs:** + +- `raise_if_not_exists` for properties not configurable on CreateBuilder [\#2564](https://github.com/delta-io/delta-rs/issues/2564) +- write\_deltalake with rust engine fails when mode is append and overwrite schema is enabled [\#2553](https://github.com/delta-io/delta-rs/issues/2553) +- Running the basic\_operations examples fails with `Error: Transaction { source: WriterFeaturesRequired(TimestampWithoutTimezone) `} [\#2552](https://github.com/delta-io/delta-rs/issues/2552) +- invalid peer certificate: BadSignature when connecting to s3 from arm64/aarch64 [\#2551](https://github.com/delta-io/delta-rs/issues/2551) +- load\_cdf\(\) issue : Generic S3 error: request or response body error: operation timed out [\#2549](https://github.com/delta-io/delta-rs/issues/2549) +- write\_deltalake fails on Databricks volume [\#2540](https://github.com/delta-io/delta-rs/issues/2540) +- Getting "Microsoft Azure Error: Operation timed out" when trying to retrieve big files [\#2537](https://github.com/delta-io/delta-rs/issues/2537) +- Impossible to append to a DeltaTable with float data type on RHEL [\#2520](https://github.com/delta-io/delta-rs/issues/2520) +- Creating DeltaTable object slow [\#2518](https://github.com/delta-io/delta-rs/issues/2518) +- `write_deltalake` throws parser error when using `rust` engine and big decimals [\#2510](https://github.com/delta-io/delta-rs/issues/2510) +- TypeError: Object of type int64 is not JSON serializable when writing using a Pandas dataframe [\#2501](https://github.com/delta-io/delta-rs/issues/2501) +- unable to read delta table when table contains both null and non-null add stats [\#2477](https://github.com/delta-io/delta-rs/issues/2477) +- Commits on WriteMode::MergeSchema cause table metadata corruption [\#2468](https://github.com/delta-io/delta-rs/issues/2468) +- S3 object store always returns IMDS warnings [\#2460](https://github.com/delta-io/delta-rs/issues/2460) +- File skipping according to documentation [\#2427](https://github.com/delta-io/delta-rs/issues/2427) +- LockClientError [\#2379](https://github.com/delta-io/delta-rs/issues/2379) +- get\_app\_transaction\_version\(\) returns wrong result [\#2340](https://github.com/delta-io/delta-rs/issues/2340) +- Property setting in `create` is not handled correctly [\#2247](https://github.com/delta-io/delta-rs/issues/2247) +- Handling of decimals in scientific notation [\#2221](https://github.com/delta-io/delta-rs/issues/2221) +- Unable to append to delta table without datafusion feature [\#2204](https://github.com/delta-io/delta-rs/issues/2204) +- Decimal Column with Value 0 Causes Failure in Python Binding [\#2193](https://github.com/delta-io/delta-rs/issues/2193) + +**Merged pull requests:** + +- docs: improve S3 access docs [\#2589](https://github.com/delta-io/delta-rs/pull/2589) ([avriiil](https://github.com/avriiil)) +- chore: bump macOS runners, maybe resolve import error [\#2588](https://github.com/delta-io/delta-rs/pull/2588) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore: bump to datafusion 39, arrow 52, pyo3 0.21 [\#2581](https://github.com/delta-io/delta-rs/pull/2581) ([abhiaagarwal](https://github.com/abhiaagarwal)) +- feat: add custom dynamodb endpoint configuration 
[\#2575](https://github.com/delta-io/delta-rs/pull/2575) ([hnaoto](https://github.com/hnaoto)) +- fix: consistently use raise\_if\_key\_not\_exists in CreateBuilder [\#2569](https://github.com/delta-io/delta-rs/pull/2569) ([vegarsti](https://github.com/vegarsti)) +- fix: add raise\_if\_key\_not\_exists to CreateBuilder [\#2565](https://github.com/delta-io/delta-rs/pull/2565) ([vegarsti](https://github.com/vegarsti)) +- docs: dt.delete add context + api docs link [\#2560](https://github.com/delta-io/delta-rs/pull/2560) ([avriiil](https://github.com/avriiil)) +- fix: update deltalake crate examples for crate layout and TimestampNtz [\#2559](https://github.com/delta-io/delta-rs/pull/2559) ([jhoekx](https://github.com/jhoekx)) +- docs: clarify locking mechanism requirement for S3 [\#2558](https://github.com/delta-io/delta-rs/pull/2558) ([inigohidalgo](https://github.com/inigohidalgo)) +- fix: remove deprecated overwrite\_schema configuration which has incorrect behavior [\#2554](https://github.com/delta-io/delta-rs/pull/2554) ([rtyler](https://github.com/rtyler)) +- fix: clippy warnings [\#2548](https://github.com/delta-io/delta-rs/pull/2548) ([imor](https://github.com/imor)) +- docs: dask write syntax fix [\#2543](https://github.com/delta-io/delta-rs/pull/2543) ([avriiil](https://github.com/avriiil)) +- fix: cast support fields nested in lists and maps [\#2541](https://github.com/delta-io/delta-rs/pull/2541) ([HawaiianSpork](https://github.com/HawaiianSpork)) +- feat: implement transaction identifiers - continued [\#2539](https://github.com/delta-io/delta-rs/pull/2539) ([roeap](https://github.com/roeap)) +- docs: pull delta from conda not pip [\#2535](https://github.com/delta-io/delta-rs/pull/2535) ([avriiil](https://github.com/avriiil)) +- chore: expose `files_by_partition` to public api [\#2533](https://github.com/delta-io/delta-rs/pull/2533) ([edmondop](https://github.com/edmondop)) +- chore: bump python 0.17.5 [\#2531](https://github.com/delta-io/delta-rs/pull/2531) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat\(rust\): make PartitionWriter public [\#2525](https://github.com/delta-io/delta-rs/pull/2525) ([adriangb](https://github.com/adriangb)) +- fix: msrv in workspace [\#2524](https://github.com/delta-io/delta-rs/pull/2524) ([roeap](https://github.com/roeap)) +- chore: fixing some clips [\#2521](https://github.com/delta-io/delta-rs/pull/2521) ([rtyler](https://github.com/rtyler)) +- fix: enable field\_with\_name to support nested fields with '.' 
delimiter [\#2519](https://github.com/delta-io/delta-rs/pull/2519) ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) +- chore: tidying up builds without datafusion feature and clippy [\#2516](https://github.com/delta-io/delta-rs/pull/2516) ([rtyler](https://github.com/rtyler)) +- fix\(python\): release GIL on most operations [\#2512](https://github.com/delta-io/delta-rs/pull/2512) ([adriangb](https://github.com/adriangb)) +- docs: fix typo [\#2508](https://github.com/delta-io/delta-rs/pull/2508) ([avriiil](https://github.com/avriiil)) +- fix\(rust, python\): fixed differences in storage options between log and object stores [\#2500](https://github.com/delta-io/delta-rs/pull/2500) ([mightyshazam](https://github.com/mightyshazam)) +- docs: improve daft integration docs [\#2496](https://github.com/delta-io/delta-rs/pull/2496) ([avriiil](https://github.com/avriiil)) +- feat: adopt kernel schema types [\#2495](https://github.com/delta-io/delta-rs/pull/2495) ([roeap](https://github.com/roeap)) +- feat: add stats to convert-to-delta operation [\#2491](https://github.com/delta-io/delta-rs/pull/2491) ([gruuya](https://github.com/gruuya)) +- fix\(python, rust\): region lookup wasn't working correctly for dynamo [\#2488](https://github.com/delta-io/delta-rs/pull/2488) ([mightyshazam](https://github.com/mightyshazam)) +- feat: introduce CDC write-side support for the Update operations [\#2486](https://github.com/delta-io/delta-rs/pull/2486) ([rtyler](https://github.com/rtyler)) +- fix\(python\): reuse state in `to_pyarrow_dataset` [\#2485](https://github.com/delta-io/delta-rs/pull/2485) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: check to see if the file exists before attempting to rename [\#2482](https://github.com/delta-io/delta-rs/pull/2482) ([rtyler](https://github.com/rtyler)) +- fix\(python, rust\): use new schema for stats parsing instead of old [\#2480](https://github.com/delta-io/delta-rs/pull/2480) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(rust\): unable to read delta table when table contains both null and non-null add stats [\#2476](https://github.com/delta-io/delta-rs/pull/2476) ([yjshen](https://github.com/yjshen)) +- chore: update the changelog to include rust-v0.17.3 [\#2473](https://github.com/delta-io/delta-rs/pull/2473) ([rtyler](https://github.com/rtyler)) +- chore: a bunch of tweaks to get releases out the door [\#2472](https://github.com/delta-io/delta-rs/pull/2472) ([rtyler](https://github.com/rtyler)) +- chore: bump the core crate for its next release [\#2470](https://github.com/delta-io/delta-rs/pull/2470) ([rtyler](https://github.com/rtyler)) +- fix: return unsupported error for merging schemas in the presence of partition columns [\#2469](https://github.com/delta-io/delta-rs/pull/2469) ([emcake](https://github.com/emcake)) +- feat\(python\): add parameter to DeltaTable.to\_pyarrow\_dataset\(\) [\#2465](https://github.com/delta-io/delta-rs/pull/2465) ([adriangb](https://github.com/adriangb)) +- feat\(python, rust\): add OBJECT\_STORE\_CONCURRENCY\_LIMIT setting for ObjectStoreFactory [\#2458](https://github.com/delta-io/delta-rs/pull/2458) ([vigimite](https://github.com/vigimite)) +- fix\(rust\): handle 429 from GCS [\#2454](https://github.com/delta-io/delta-rs/pull/2454) ([adriangb](https://github.com/adriangb)) +- fix\(python\): reuse table state in write engine [\#2453](https://github.com/delta-io/delta-rs/pull/2453) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(rust\): implement abort commit for S3DynamoDBLogStore 
[\#2452](https://github.com/delta-io/delta-rs/pull/2452) ([PeterKeDer](https://github.com/PeterKeDer)) +- fix\(python, rust\): check timestamp\_ntz in nested fields, add check\_can\_write in pyarrow writer [\#2443](https://github.com/delta-io/delta-rs/pull/2443) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(python, rust\): remove imds calls from profile auth and region [\#2442](https://github.com/delta-io/delta-rs/pull/2442) ([mightyshazam](https://github.com/mightyshazam)) +- fix\(python, rust\): use from\_name during column projection creation [\#2441](https://github.com/delta-io/delta-rs/pull/2441) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore: bump python for 0.17 release [\#2439](https://github.com/delta-io/delta-rs/pull/2439) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(python,rust\): missing remove actions during `create_or_replace` [\#2437](https://github.com/delta-io/delta-rs/pull/2437) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore: introduce the Operation trait to enforce consistency between operations [\#2435](https://github.com/delta-io/delta-rs/pull/2435) ([rtyler](https://github.com/rtyler)) +- fix\(python\): load\_as\_version with datetime object with no timezone specified [\#2429](https://github.com/delta-io/delta-rs/pull/2429) ([t1g0rz](https://github.com/t1g0rz)) +- feat\(python, rust\): respect column stats collection configurations [\#2428](https://github.com/delta-io/delta-rs/pull/2428) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat: lazy static runtime in python [\#2424](https://github.com/delta-io/delta-rs/pull/2424) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat: implement repartitioned for DeltaScan [\#2421](https://github.com/delta-io/delta-rs/pull/2421) ([jkylling](https://github.com/jkylling)) +- fix: return error when checkpoints and metadata get out of sync [\#2406](https://github.com/delta-io/delta-rs/pull/2406) ([esarili](https://github.com/esarili)) +- fix\(rust\): stats\_parsed has different number of records with stats [\#2405](https://github.com/delta-io/delta-rs/pull/2405) ([yjshen](https://github.com/yjshen)) +- docs: add Daft integration [\#2402](https://github.com/delta-io/delta-rs/pull/2402) ([avriiil](https://github.com/avriiil)) +- feat\(rust\): advance state in post commit [\#2396](https://github.com/delta-io/delta-rs/pull/2396) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore\(rust\): bump arrow v51 and datafusion v37.1 [\#2395](https://github.com/delta-io/delta-rs/pull/2395) ([lasantosr](https://github.com/lasantosr)) +- docs: document required aws permissions [\#2393](https://github.com/delta-io/delta-rs/pull/2393) ([ale-rinaldi](https://github.com/ale-rinaldi)) +- feat\(rust\): post commit hook \(v2\), create checkpoint hook [\#2391](https://github.com/delta-io/delta-rs/pull/2391) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: time travel when checkpointed and logs removed [\#2389](https://github.com/delta-io/delta-rs/pull/2389) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(rust\): remove flush after writing every batch [\#2387](https://github.com/delta-io/delta-rs/pull/2387) ([PeterKeDer](https://github.com/PeterKeDer)) +- feat: added configuration variables to handle EC2 metadata service [\#2385](https://github.com/delta-io/delta-rs/pull/2385) ([mightyshazam](https://github.com/mightyshazam)) +- fix\(rust\): timestamp deserialization format, missing type [\#2383](https://github.com/delta-io/delta-rs/pull/2383) 
([ion-elgreco](https://github.com/ion-elgreco)) +- chore: bump chrono [\#2372](https://github.com/delta-io/delta-rs/pull/2372) ([universalmind303](https://github.com/universalmind303)) +- chore: bump python 0.16.4 [\#2371](https://github.com/delta-io/delta-rs/pull/2371) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: add snappy compression on checkpoint files [\#2365](https://github.com/delta-io/delta-rs/pull/2365) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: add config for parquet pushdown on delta scan [\#2364](https://github.com/delta-io/delta-rs/pull/2364) ([Blajda](https://github.com/Blajda)) +- fix\(python,rust\): optimize compact on schema evolved table [\#2358](https://github.com/delta-io/delta-rs/pull/2358) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(python, rust\): expr parsing date/timestamp [\#2357](https://github.com/delta-io/delta-rs/pull/2357) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: remove tmp files in cleanup\_metadata [\#2356](https://github.com/delta-io/delta-rs/pull/2356) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: make struct fields nullable in stats schema [\#2346](https://github.com/delta-io/delta-rs/pull/2346) ([qinix](https://github.com/qinix)) +- fix\(rust\): adhere to protocol for Decimal [\#2332](https://github.com/delta-io/delta-rs/pull/2332) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(rust\): raise schema mismatch when decimal is not subset [\#2330](https://github.com/delta-io/delta-rs/pull/2330) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat\(rust\): derive Copy on some public enums [\#2329](https://github.com/delta-io/delta-rs/pull/2329) ([lasantosr](https://github.com/lasantosr)) +- fix: merge pushdown handling [\#2326](https://github.com/delta-io/delta-rs/pull/2326) ([Blajda](https://github.com/Blajda)) +- fix: merge concurrency control [\#2324](https://github.com/delta-io/delta-rs/pull/2324) ([ion-elgreco](https://github.com/ion-elgreco)) +- Revert 2291 merge predicate fix [\#2323](https://github.com/delta-io/delta-rs/pull/2323) ([Blajda](https://github.com/Blajda)) +- fix: try to fix timeouts [\#2318](https://github.com/delta-io/delta-rs/pull/2318) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(rust\): serialize MetricDetails from compaction runs to a string [\#2317](https://github.com/delta-io/delta-rs/pull/2317) ([liamphmurphy](https://github.com/liamphmurphy)) +- docs: add example in to\_pyarrow\_dataset [\#2315](https://github.com/delta-io/delta-rs/pull/2315) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(python\): wrong batch size [\#2314](https://github.com/delta-io/delta-rs/pull/2314) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore: object store 0.9.1 [\#2311](https://github.com/delta-io/delta-rs/pull/2311) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: checkpoint features format below v3,7 [\#2307](https://github.com/delta-io/delta-rs/pull/2307) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: schema evolution not coercing with large arrow types [\#2305](https://github.com/delta-io/delta-rs/pull/2305) ([aersam](https://github.com/aersam)) +- fix: clean up some non-datafusion builds [\#2303](https://github.com/delta-io/delta-rs/pull/2303) ([rtyler](https://github.com/rtyler)) +- docs: fix typo [\#2300](https://github.com/delta-io/delta-rs/pull/2300) ([LauH1987](https://github.com/LauH1987)) +- docs: make replaceWhere example compile [\#2299](https://github.com/delta-io/delta-rs/pull/2299) 
([LauH1987](https://github.com/LauH1987)) +- fix\(rust\): add missing chrono-tz feature [\#2295](https://github.com/delta-io/delta-rs/pull/2295) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore\(python\): bump to v0.16.1 [\#2294](https://github.com/delta-io/delta-rs/pull/2294) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(rust\): features not maintained in protocol after checkpoint [\#2293](https://github.com/delta-io/delta-rs/pull/2293) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: merge predicate for concurrent writes [\#2291](https://github.com/delta-io/delta-rs/pull/2291) ([JonasDev1](https://github.com/JonasDev1)) +- fix: replace assert and AssertionError with appropriate exceptions [\#2286](https://github.com/delta-io/delta-rs/pull/2286) ([joe-sharman](https://github.com/joe-sharman)) +- docs: fix typo in delta-lake-polars.md [\#2285](https://github.com/delta-io/delta-rs/pull/2285) ([vladdoster](https://github.com/vladdoster)) +- fix\(python, rust\): prevent table scan returning large arrow dtypes [\#2274](https://github.com/delta-io/delta-rs/pull/2274) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(python\): always encapsulate column names in backticks in \_all functions [\#2271](https://github.com/delta-io/delta-rs/pull/2271) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(rust\): read only checkpoints that match \_last\_checkpoint version [\#2270](https://github.com/delta-io/delta-rs/pull/2270) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: add .venv to .gitignore [\#2268](https://github.com/delta-io/delta-rs/pull/2268) ([gacharya](https://github.com/gacharya)) +- feat\(python, rust\): add `set table properties` operation [\#2264](https://github.com/delta-io/delta-rs/pull/2264) ([ion-elgreco](https://github.com/ion-elgreco)) +- docs: use dagster deltalake polars library [\#2263](https://github.com/delta-io/delta-rs/pull/2263) ([avriiil](https://github.com/avriiil)) +- docs: update comment about r2 requiring locks [\#2261](https://github.com/delta-io/delta-rs/pull/2261) ([cmackenzie1](https://github.com/cmackenzie1)) +- fix\(\#2256\): use consistent units of time [\#2260](https://github.com/delta-io/delta-rs/pull/2260) ([cmackenzie1](https://github.com/cmackenzie1)) +- chore: update the changelog for rust-v0.17.1 [\#2259](https://github.com/delta-io/delta-rs/pull/2259) ([rtyler](https://github.com/rtyler)) +- feat\(python\): release GIL in the write\_deltalake function [\#2257](https://github.com/delta-io/delta-rs/pull/2257) ([franz101](https://github.com/franz101)) +- chore\(rust\): bump datafusion to 36 [\#2249](https://github.com/delta-io/delta-rs/pull/2249) ([universalmind303](https://github.com/universalmind303)) +- chore!: replace rusoto with AWS SDK [\#2243](https://github.com/delta-io/delta-rs/pull/2243) ([mightyshazam](https://github.com/mightyshazam)) +- fix: handle conflict checking in optimize correctly [\#2208](https://github.com/delta-io/delta-rs/pull/2208) ([emcake](https://github.com/emcake)) +- feat: logical Node for find files [\#2194](https://github.com/delta-io/delta-rs/pull/2194) ([hntd187](https://github.com/hntd187)) + ## [rust-v0.17.3](https://github.com/delta-io/delta-rs/tree/rust-v0.17.3) (2024-05-01) [Full Changelog](https://github.com/delta-io/delta-rs/compare/rust-v0.17.1...rust-v0.17.3) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4472a3640a..f681aa3948 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -17,34 +17,40 @@ If you want to claim an issue to work on, you can write the word 
`take` as a com - Install Rust, e.g. as described [here](https://doc.rust-lang.org/cargo/getting-started/installation.html) - Have a compatible Python version installed (check `python/pyproject.toml` for current requirement) - Create a Python virtual environment (required for development builds), e.g. as described [here](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/) + ```sh + python -m venv .venv + ``` + - Build the project for development (this requires an active virtual environment and will also install `deltalake` in that virtual environment) -``` -cd python -make develop -``` + ```sh + cd python + make develop + ``` - Run some Python code, e.g. to run a specific test -``` -python -m pytest tests/test_writer.py -s -k "test_with_deltalake_schema" -``` + ```sh + python -m pytest tests/test_writer.py -s -k "test_with_deltalake_schema" + ``` - Run some Rust code, e.g. run an example -``` -cd crates/deltalake -cargo run --example basic_operations --features="datafusion" -``` + ```sh + cd crates/deltalake + cargo run --example basic_operations --features="datafusion" + ``` ## Run the docs locally -*This serves your local contens of docs via a web browser, handy for checking what they look like if you are making changes to docs or docstings* -``` +*This serves your local contents of docs via a web browser, handy for checking what they look like if you are making changes to docs or docstings* + +```sh (cd python; make develop) pip install -r docs/requirements.txt mkdocs serve ``` ## To make a pull request (PR) -- Make sure all the following steps run/pass locally before submitting a PR -``` +Make sure all the following steps run/pass locally before submitting a PR + +```sh cargo fmt -- --check cd python make check-rust @@ -62,7 +68,7 @@ make build-docs - For debugging Rust code, install [CodeLLDB](https://marketplace.visualstudio.com/items?itemName=vadimcn.vscode-lldb). The extension should even create Debug launch configurations for the project if you allow it, an easy way to get started. Just set a breakpoint and run the relevant configuration. - For debugging from Python into Rust, follow this procedure: 1. 
Add this to `.vscode/launch.json` -``` +```json { "type": "lldb", "request": "attach", diff --git a/Cargo.toml b/Cargo.toml index 6168a500fd..ccbb766e0f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,7 +5,7 @@ resolver = "2" [workspace.package] authors = ["Qingping Hou "] -rust-version = "1.75" +rust-version = "1.80" keywords = ["deltalake", "delta", "datalake"] readme = "README.md" edition = "2021" @@ -26,30 +26,34 @@ debug = true debug = "line-tables-only" [workspace.dependencies] +delta_kernel = { version = "=0.3.0" } +# delta_kernel = { path = "../delta-kernel-rs/kernel", version = "0.3.0" } + # arrow -arrow = { version = "51" } -arrow-arith = { version = "51" } -arrow-array = { version = "51", features = ["chrono-tz"] } -arrow-buffer = { version = "51" } -arrow-cast = { version = "51" } -arrow-ipc = { version = "51" } -arrow-json = { version = "51" } -arrow-ord = { version = "51" } -arrow-row = { version = "51" } -arrow-schema = { version = "51" } -arrow-select = { version = "51" } -object_store = { version = "0.9" } -parquet = { version = "51" } +arrow = { version = "52" } +arrow-arith = { version = "52" } +arrow-array = { version = "52", features = ["chrono-tz"] } +arrow-buffer = { version = "52" } +arrow-cast = { version = "52" } +arrow-ipc = { version = "52" } +arrow-json = { version = "52" } +arrow-ord = { version = "52" } +arrow-row = { version = "52" } +arrow-schema = { version = "52" } +arrow-select = { version = "52" } +object_store = { version = "0.10.1" } +parquet = { version = "52" } # datafusion -datafusion = { version = "37.1" } -datafusion-expr = { version = "37.1" } -datafusion-common = { version = "37.1" } -datafusion-proto = { version = "37.1" } -datafusion-sql = { version = "37.1" } -datafusion-physical-expr = { version = "37.1" } -datafusion-functions = { version = "37.1" } -datafusion-functions-array = { version = "37.1" } +datafusion = { version = "41" } +datafusion-expr = { version = "41" } +datafusion-common = { version = "41" } +datafusion-proto = { version = "41" } +datafusion-sql = { version = "41" } +datafusion-physical-expr = { version = "41" } +datafusion-physical-plan = { version = "41" } +datafusion-functions = { version = "41" } +datafusion-functions-aggregate = { version = "41" } # serde serde = { version = "1.0.194", features = ["derive"] } @@ -62,6 +66,7 @@ tracing = { version = "0.1", features = ["log"] } regex = { version = "1" } thiserror = { version = "1" } url = { version = "2" } +urlencoding = "2.1.3" uuid = { version = "1" } # runtime / async diff --git a/README.md b/README.md index ec9a7d2d59..b7a26b8a42 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ #delta-rs in the Delta Lake Slack workspace

+Delta Lake is an open-source storage format that runs on top of existing data lakes. Delta Lake is compatible with processing engines like Apache Spark and provides benefits such as ACID transaction guarantees, schema enforcement, and scalable data handling. The Delta Lake project aims to unlock the power of the Deltalake for as many users and projects as possible by providing native low-level APIs aimed at developers and integrators, as well as a high-level operations @@ -135,12 +136,13 @@ of features outlined in the Delta [protocol][protocol] is also [tracked](#protoc | -------------------- | :-----: | :-----: | ---------------------------------------------------------------- | | Local | ![done] | ![done] | | | S3 - AWS | ![done] | ![done] | requires lock for concurrent writes | -| S3 - MinIO | ![done] | ![done] | requires lock for concurrent writes | -| S3 - R2 | ![done] | ![done] | No lock required when using `AmazonS3ConfigKey::CopyIfNotExists` | +| S3 - MinIO | ![done] | ![done] | No lock required when using `AmazonS3ConfigKey::ConditionalPut` with `storage_options = {"conditional_put":"etag"}` | +| S3 - R2 | ![done] | ![done] | No lock required when using `AmazonS3ConfigKey::ConditionalPut` with `storage_options = {"conditional_put":"etag"}` | | Azure Blob | ![done] | ![done] | | | Azure ADLS Gen2 | ![done] | ![done] | | | Microsoft OneLake | ![done] | ![done] | | | Google Cloud Storage | ![done] | ![done] | | +| HDFS | ![done] | ![done] | | ### Supported Operations diff --git a/crates/aws/Cargo.toml b/crates/aws/Cargo.toml index e6913a2162..992a32c93e 100644 --- a/crates/aws/Cargo.toml +++ b/crates/aws/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "deltalake-aws" -version = "0.1.2" +version = "0.3.0" authors.workspace = true keywords.workspace = true readme.workspace = true @@ -12,19 +12,20 @@ repository.workspace = true rust-version.workspace = true [dependencies] -deltalake-core = { version = ">=0.17.0, <0.19.0", path = "../core" } -aws-smithy-runtime-api = { version="1.1.7" } -aws-smithy-runtime = { version="1.1.7", optional = true} -aws-credential-types = { version="1.1.7", features = ["hardcoded-credentials"]} -aws-config = { version = "1.1.6", default-features = false, features = ["behavior-version-latest","rt-tokio", "credentials-process", "sso"] } -aws-sdk-dynamodb = {version = "1.15.0", default-features = false, features = ["behavior-version-latest", "rt-tokio"] } -aws-sdk-sts = {version = "1.1.6", default-features = false, features = ["behavior-version-latest", "rt-tokio"] } +deltalake-core = { version = "0.20.0", path = "../core" } +aws-smithy-runtime-api = { version="1.7" } +aws-smithy-runtime = { version="1.7", optional = true} +aws-credential-types = { version="1.2", features = ["hardcoded-credentials"]} +aws-config = { version = "1.5", default-features = false, features = ["behavior-version-latest","rt-tokio", "credentials-process", "sso"] } +aws-sdk-dynamodb = {version = "1.45", default-features = false, features = ["behavior-version-latest", "rt-tokio"] } +aws-sdk-sts = {version = "1.42", default-features = false, features = ["behavior-version-latest", "rt-tokio"] } lazy_static = "1" maplit = "1" # workspace dependencies async-trait = { workspace = true } bytes = { workspace = true } +chrono = { workspace = true } futures = { workspace = true } tracing = { workspace = true } object_store = { workspace = true, features = ["aws"]} @@ -33,7 +34,7 @@ tokio = { workspace = true } regex = { workspace = true } uuid = { workspace = true, features = ["serde", "v4"] } url = { 
workspace = true } -backoff = { version = "0.4", features = [ "tokio" ] } +backon = { version = "1",default-features = false, features = [ "tokio-sleep" ] } hyper-tls = { version = "0.5", optional = true } [dev-dependencies] diff --git a/crates/aws/src/constants.rs b/crates/aws/src/constants.rs new file mode 100644 index 0000000000..90c23ff572 --- /dev/null +++ b/crates/aws/src/constants.rs @@ -0,0 +1,141 @@ +//! Constants used for modifying and configuring various AWS S3 (or similar) connections with +//! delta-rs +//! + +use lazy_static::lazy_static; +use std::time::Duration; + +/// Custom S3 endpoint. +pub const AWS_ENDPOINT_URL: &str = "AWS_ENDPOINT_URL"; +/// Custom DynamoDB endpoint. +/// If DynamoDB endpoint is not supplied, will use S3 endpoint (AWS_ENDPOINT_URL) +/// If it is supplied, this endpoint takes precedence over the global endpoint set in AWS_ENDPOINT_URL for DynamoDB +pub const AWS_ENDPOINT_URL_DYNAMODB: &str = "AWS_ENDPOINT_URL_DYNAMODB"; +/// The AWS region. +pub const AWS_REGION: &str = "AWS_REGION"; +/// The AWS profile. +pub const AWS_PROFILE: &str = "AWS_PROFILE"; +/// The AWS_ACCESS_KEY_ID to use for S3. +pub const AWS_ACCESS_KEY_ID: &str = "AWS_ACCESS_KEY_ID"; +/// The AWS_SECRET_ACCESS_KEY to use for S3. +pub const AWS_SECRET_ACCESS_KEY: &str = "AWS_SECRET_ACCESS_KEY"; +/// The AWS_SESSION_TOKEN to use for S3. +pub const AWS_SESSION_TOKEN: &str = "AWS_SESSION_TOKEN"; +/// Uses either "path" (the default) or "virtual", which turns on +/// [virtual host addressing](http://docs.aws.amazon.com/AmazonS3/latest/dev/VirtualHosting.html). +pub const AWS_S3_ADDRESSING_STYLE: &str = "AWS_S3_ADDRESSING_STYLE"; +/// Locking provider to use for safe atomic rename. +/// `dynamodb` is currently the only supported locking provider. +/// If not set, safe atomic rename is not available. +pub const AWS_S3_LOCKING_PROVIDER: &str = "AWS_S3_LOCKING_PROVIDER"; +/// The role to assume for S3 writes. +pub const AWS_IAM_ROLE_ARN: &str = "AWS_IAM_ROLE_ARN"; +/// The role to assume. Please use [AWS_IAM_ROLE_ARN] instead +#[deprecated(since = "0.20.0", note = "Please use AWS_IAM_ROLE_ARN instead")] +pub const AWS_S3_ASSUME_ROLE_ARN: &str = "AWS_S3_ASSUME_ROLE_ARN"; +/// The role session name to use when a role is assumed. If not provided a random session name is generated. +pub const AWS_IAM_ROLE_SESSION_NAME: &str = "AWS_IAM_ROLE_SESSION_NAME"; +/// The role session name to use when a role is assumed. If not provided a random session name is generated. +#[deprecated( + since = "0.20.0", + note = "Please use AWS_IAM_ROLE_SESSION_NAME instead" +)] +pub const AWS_S3_ROLE_SESSION_NAME: &str = "AWS_S3_ROLE_SESSION_NAME"; +/// The `pool_idle_timeout` option of aws http client. Has to be lower than 20 seconds, which is +/// default S3 server timeout . +/// However, since rusoto uses hyper as a client, its default timeout is 90 seconds +/// . +/// Hence, the `connection closed before message completed` could occur. +/// To avoid that, the default value of this setting is 15 seconds if it's not set otherwise. +pub const AWS_S3_POOL_IDLE_TIMEOUT_SECONDS: &str = "AWS_S3_POOL_IDLE_TIMEOUT_SECONDS"; +/// The `pool_idle_timeout` for the as3_constants sts client. See +/// the reasoning in `AWS_S3_POOL_IDLE_TIMEOUT_SECONDS`. +pub const AWS_STS_POOL_IDLE_TIMEOUT_SECONDS: &str = "AWS_STS_POOL_IDLE_TIMEOUT_SECONDS"; +/// The number of retries for S3 GET requests failed with 500 Internal Server Error. 
+pub const AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES: &str = + "AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES"; +/// The web identity token file to use when using a web identity provider. +/// NOTE: web identity related options are set in the environment when +/// creating an instance of [crate::storage::s3::S3StorageOptions]. +/// See also . +pub const AWS_WEB_IDENTITY_TOKEN_FILE: &str = "AWS_WEB_IDENTITY_TOKEN_FILE"; +/// The role name to use for web identity. +/// NOTE: web identity related options are set in the environment when +/// creating an instance of [crate::storage::s3::S3StorageOptions]. +/// See also . +pub const AWS_ROLE_ARN: &str = "AWS_ROLE_ARN"; +/// The role session name to use for web identity. +/// NOTE: web identity related options are set in the environment when +/// creating an instance of [crate::storage::s3::S3StorageOptions]. +/// See also . +pub const AWS_ROLE_SESSION_NAME: &str = "AWS_ROLE_SESSION_NAME"; +/// Allow http connections - mainly useful for integration tests +pub const AWS_ALLOW_HTTP: &str = "AWS_ALLOW_HTTP"; + +/// If set to "true", allows creating commits without concurrent writer protection. +/// Only safe if there is one writer to a given table. +pub const AWS_S3_ALLOW_UNSAFE_RENAME: &str = "AWS_S3_ALLOW_UNSAFE_RENAME"; + +/// If set to "true", disables the imds client +/// Defaults to "true" +pub const AWS_EC2_METADATA_DISABLED: &str = "AWS_EC2_METADATA_DISABLED"; + +/// The timeout in milliseconds for the EC2 metadata endpoint +/// Defaults to 100 +pub const AWS_EC2_METADATA_TIMEOUT: &str = "AWS_EC2_METADATA_TIMEOUT"; + +/// Force the delta-rs to attempt to load AWS credentials +pub const AWS_FORCE_CREDENTIAL_LOAD: &str = "AWS_FORCE_CREDENTIAL_LOAD"; + +/// The list of option keys owned by the S3 module. +/// Option keys not contained in this list will be added to the `extra_opts` +/// field of [crate::storage::s3::S3StorageOptions]. +pub const S3_OPTS: &[&str] = &[ + AWS_ENDPOINT_URL, + AWS_ENDPOINT_URL_DYNAMODB, + AWS_REGION, + AWS_PROFILE, + AWS_ACCESS_KEY_ID, + AWS_SECRET_ACCESS_KEY, + AWS_SESSION_TOKEN, + AWS_S3_LOCKING_PROVIDER, + AWS_S3_ASSUME_ROLE_ARN, + AWS_S3_ROLE_SESSION_NAME, + AWS_WEB_IDENTITY_TOKEN_FILE, + AWS_ROLE_ARN, + AWS_ROLE_SESSION_NAME, + AWS_S3_POOL_IDLE_TIMEOUT_SECONDS, + AWS_STS_POOL_IDLE_TIMEOUT_SECONDS, + AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES, + AWS_EC2_METADATA_DISABLED, + AWS_EC2_METADATA_TIMEOUT, +]; + +pub const DEFAULT_LOCK_TABLE_NAME: &str = "delta_log"; +pub const LOCK_TABLE_KEY_NAME: &str = "DELTA_DYNAMO_TABLE_NAME"; +pub const BILLING_MODE_KEY_NAME: &str = "DELTA_DYNAMO_BILLING_MODE"; +pub const MAX_ELAPSED_REQUEST_TIME_KEY_NAME: &str = "DELTA_DYNAMO_MAX_ELAPSED_REQUEST_TIME"; + +pub const ATTR_TABLE_PATH: &str = "tablePath"; +pub const ATTR_FILE_NAME: &str = "fileName"; +pub const ATTR_TEMP_PATH: &str = "tempPath"; +pub const ATTR_COMPLETE: &str = "complete"; +pub const ATTR_EXPIRE_TIME: &str = "expireTime"; + +pub const STRING_TYPE: &str = "S"; + +pub const KEY_TYPE_HASH: &str = "HASH"; +pub const KEY_TYPE_RANGE: &str = "RANGE"; + +lazy_static! 
{ + pub static ref CONDITION_EXPR_CREATE: String = format!( + "attribute_not_exists({ATTR_TABLE_PATH}) and attribute_not_exists({ATTR_FILE_NAME})" + ); + + pub static ref CONDITION_DELETE_INCOMPLETE: String = format!( + "(complete = :f) or (attribute_not_exists({ATTR_TABLE_PATH}) and attribute_not_exists({ATTR_FILE_NAME}))" + ); +} + +pub const CONDITION_UPDATE_INCOMPLETE: &str = "complete = :f"; +pub const DEFAULT_COMMIT_ENTRY_EXPIRATION_DELAY: Duration = Duration::from_secs(86_400); diff --git a/crates/aws/src/credentials.rs b/crates/aws/src/credentials.rs index 9ddf19b74c..71441bf05e 100644 --- a/crates/aws/src/credentials.rs +++ b/crates/aws/src/credentials.rs @@ -1,118 +1,259 @@ -use std::{sync::Arc, time::Duration}; - -use aws_config::{ - ecs::EcsCredentialsProvider, - environment::{EnvironmentVariableCredentialsProvider, EnvironmentVariableRegionProvider}, - imds::credentials::ImdsCredentialsProvider, - meta::{credentials::CredentialsProviderChain, region::RegionProviderChain}, - profile::ProfileFileCredentialsProvider, - provider_config::ProviderConfig, - web_identity_token::WebIdentityTokenCredentialsProvider, -}; -use aws_credential_types::provider::{self, ProvideCredentials}; -use tracing::Instrument; +//! Custom AWS credential providers used by delta-rs +//! -const IMDS_PROVIDER_NAME: &str = "Ec2InstanceMetadata"; +use std::sync::Arc; -#[derive(Debug)] -pub struct ConfiguredCredentialChain { - provider_chain: CredentialsProviderChain, -} +use aws_config::default_provider::credentials::DefaultCredentialsChain; +use aws_config::meta::credentials::CredentialsProviderChain; +use aws_config::sts::AssumeRoleProvider; +use aws_config::SdkConfig; +use aws_credential_types::provider::error::CredentialsError; +use aws_credential_types::provider::{future, ProvideCredentials}; +use aws_credential_types::Credentials; -#[derive(Debug)] -pub struct NoOpCredentials {} +use deltalake_core::storage::object_store::aws::{AmazonS3ConfigKey, AwsCredential}; +use deltalake_core::storage::object_store::{ + CredentialProvider, Error as ObjectStoreError, Result as ObjectStoreResult, +}; +use deltalake_core::storage::StorageOptions; +use deltalake_core::DeltaResult; +use tracing::log::*; -pub fn new_region_provider(disable_imds: bool, imds_timeout: u64) -> RegionProviderChain { - let env_provider = EnvironmentVariableRegionProvider::new(); - let profile_file = aws_config::profile::region::ProfileFileRegionProvider::default(); - if disable_imds { - return RegionProviderChain::first_try(env_provider).or_else(profile_file); - } +use crate::constants::{self, AWS_ENDPOINT_URL}; - RegionProviderChain::first_try(env_provider) - .or_else(profile_file) - .or_else( - aws_config::imds::region::Builder::default() - .imds_client( - aws_config::imds::Client::builder() - .connect_timeout(Duration::from_millis(imds_timeout)) - .read_timeout(Duration::from_millis(imds_timeout)) - .build(), - ) - .build(), - ) +/// An [object_store::CredentialProvider] which handles converting a populated [SdkConfig] +/// into a necessary [AwsCredential] type for configuring [object_store::aws::AmazonS3] +#[derive(Clone, Debug)] +pub(crate) struct AWSForObjectStore { + sdk_config: SdkConfig, } -impl ConfiguredCredentialChain { - pub fn new(disable_imds: bool, imds_timeout: u64, conf: &ProviderConfig) -> Self { - let imds_provider = Self::build_imds_provider(conf, disable_imds, imds_timeout); - let env_provider = EnvironmentVariableCredentialsProvider::default(); - let profile_provider = ProfileFileCredentialsProvider::builder() - 
.configure(conf) - .with_custom_provider(IMDS_PROVIDER_NAME, imds_provider.clone()) - .build(); - let web_identity_token_provider = WebIdentityTokenCredentialsProvider::builder() - .configure(conf) - .build(); - - let ecs_provider = EcsCredentialsProvider::builder().configure(conf).build(); - - let provider_chain = CredentialsProviderChain::first_try("Environment", env_provider) - .or_else("Profile", profile_provider) - .or_else("WebIdentityToken", web_identity_token_provider) - .or_else("EcsContainer", ecs_provider) - .or_else(IMDS_PROVIDER_NAME, imds_provider); - - Self { provider_chain } +impl AWSForObjectStore { + pub(crate) fn new(sdk_config: SdkConfig) -> Self { + Self { sdk_config } } +} - async fn credentials(&self) -> provider::Result { - self.provider_chain - .provide_credentials() - .instrument(tracing::debug_span!("provide_credentials", provider = %"default_chain")) - .await +#[async_trait::async_trait] +impl CredentialProvider for AWSForObjectStore { + type Credential = AwsCredential; + + /// Provide the necessary configured credentials from the AWS SDK for use by + /// [object_store::aws::AmazonS3] + async fn get_credential(&self) -> ObjectStoreResult> { + let provider = self + .sdk_config + .credentials_provider() + .ok_or(ObjectStoreError::NotImplemented)?; + let credentials = + provider + .provide_credentials() + .await + .map_err(|e| ObjectStoreError::NotSupported { + source: Box::new(e), + })?; + + debug!( + "CredentialProvider for Object Store using access key: {}", + credentials.access_key_id() + ); + + Ok(Arc::new(Self::Credential { + key_id: credentials.access_key_id().into(), + secret_key: credentials.secret_access_key().into(), + token: credentials.session_token().map(|o| o.to_string()), + })) } +} - fn build_imds_provider( - conf: &ProviderConfig, - disable_imds: bool, - imds_timeout: u64, - ) -> Arc { - if disable_imds { - return Arc::new(NoOpCredentials {}); - } +/// Name of the [OptionsCredentialsProvider] for AWS SDK use +const OPTS_PROVIDER: &str = "DeltaStorageOptionsProvider"; - let imds_provider = ImdsCredentialsProvider::builder() - .configure(conf) - .imds_client( - aws_config::imds::Client::builder() - .connect_timeout(Duration::from_millis(imds_timeout)) - .read_timeout(Duration::from_millis(imds_timeout)) - .build(), - ) - .build(); - Arc::new(imds_provider) +/// The [OptionsCredentialsProvider] helps users plug specific AWS credentials into their +/// [StorageOptions] in such a way that the AWS SDK code will be properly +/// loaded with those credentials before following the +/// [aws_config::default_provider::credentials::DefaultCredentialsChain] +#[derive(Clone, Debug)] +pub(crate) struct OptionsCredentialsProvider { + options: StorageOptions, +} + +impl OptionsCredentialsProvider { + /// Look at the options configured on the provider and return an appropriate + /// [Credentials] instance for AWS SDK credential resolution + fn credentials(&self) -> aws_credential_types::provider::Result { + debug!("Attempting to pull credentials from `StorageOptions`"); + let access_key = self.options.0.get(constants::AWS_ACCESS_KEY_ID).ok_or( + CredentialsError::not_loaded("access key not in StorageOptions"), + )?; + let secret_key = self.options.0.get(constants::AWS_SECRET_ACCESS_KEY).ok_or( + CredentialsError::not_loaded("secret key not in StorageOptions"), + )?; + let session_token = self.options.0.get(constants::AWS_SESSION_TOKEN).cloned(); + + Ok(Credentials::new( + access_key, + secret_key, + session_token, + None, + OPTS_PROVIDER, + )) } } -impl 
ProvideCredentials for ConfiguredCredentialChain { - fn provide_credentials<'a>( - &'a self, - ) -> aws_credential_types::provider::future::ProvideCredentials<'a> +impl ProvideCredentials for OptionsCredentialsProvider { + fn provide_credentials<'a>(&'a self) -> future::ProvideCredentials<'a> where Self: 'a, { - aws_credential_types::provider::future::ProvideCredentials::new(self.credentials()) + future::ProvideCredentials::ready(self.credentials()) } } -impl ProvideCredentials for NoOpCredentials { - fn provide_credentials<'a>(&'a self) -> provider::future::ProvideCredentials<'a> - where - Self: 'a, - { - aws_credential_types::provider::future::ProvideCredentials::new(std::future::ready(Err( - provider::error::CredentialsError::not_loaded_no_source(), - ))) +/// Generate a random session name for assuming IAM roles +fn assume_role_sessio_name() -> String { + let now = chrono::Utc::now(); + + format!("delta-rs_{}", now.timestamp_millis()) +} + +/// Return the configured IAM role ARN or whatever is defined in the environment +fn assume_role_arn(options: &StorageOptions) -> Option { + options + .0 + .get(constants::AWS_IAM_ROLE_ARN) + .or(options.0.get(constants::AWS_S3_ASSUME_ROLE_ARN)) + .or(std::env::var_os(constants::AWS_IAM_ROLE_ARN) + .map(|o| { + o.into_string() + .expect("Failed to unwrap AWS_IAM_ROLE_ARN which may have invalid data") + }) + .as_ref()) + .or(std::env::var_os(constants::AWS_S3_ASSUME_ROLE_ARN) + .map(|o| { + o.into_string() + .expect("Failed to unwrap AWS_S3_ASSUME_ROLE_ARN which may have invalid data") + }) + .as_ref()) + .cloned() +} + +/// Return the configured IAM assume role session name or provide a unique one +fn assume_session_name(options: &StorageOptions) -> String { + let assume_session = options + .0 + .get(constants::AWS_IAM_ROLE_SESSION_NAME) + .or(options.0.get(constants::AWS_S3_ROLE_SESSION_NAME)) + .cloned(); + + match assume_session { + Some(s) => s, + None => assume_role_sessio_name(), + } +} + +/// Take a set of [StorageOptions] and produce an appropriate AWS SDK [SdkConfig] +/// for use with various AWS SDK APIs, such as in our [crate::logstore::S3DynamoDbLogStore] +pub async fn resolve_credentials(options: StorageOptions) -> DeltaResult { + let default_provider = DefaultCredentialsChain::builder().build().await; + + let credentials_provider = match assume_role_arn(&options) { + Some(arn) => { + debug!("Configuring AssumeRoleProvider with role arn: {arn}"); + CredentialsProviderChain::first_try( + "AssumeRoleProvider", + AssumeRoleProvider::builder(arn) + .session_name(assume_session_name(&options)) + .build() + .await, + ) + .or_else( + "StorageOptions", + OptionsCredentialsProvider { + options: options.clone(), + }, + ) + .or_else("DefaultChain", default_provider) + } + None => CredentialsProviderChain::first_try( + "StorageOptions", + OptionsCredentialsProvider { + options: options.clone(), + }, + ) + .or_else("DefaultChain", default_provider), + }; + + Ok(aws_config::from_env() + .credentials_provider(credentials_provider) + .load() + .await) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::constants; + use maplit::hashmap; + use serial_test::serial; + + #[tokio::test] + #[serial] + async fn test_options_credentials_provider() { + let options = StorageOptions(hashmap! 
{ + constants::AWS_ACCESS_KEY_ID.to_string() => "test_id".to_string(), + constants::AWS_SECRET_ACCESS_KEY.to_string() => "test_secret".to_string(), + }); + + let config = resolve_credentials(options).await; + assert!(config.is_ok(), "{config:?}"); + let config = config.unwrap(); + + if let Some(provider) = &config.credentials_provider() { + let credentials = provider + .provide_credentials() + .await + .expect("Failed to provide credentials"); + assert_eq!( + "test_id", + credentials.access_key_id(), + "The access key should come from our options! {credentials:?}" + ); + assert_eq!( + "test_secret", + credentials.secret_access_key(), + "The secret should come from our options! {credentials:?}" + ); + } else { + panic!("Could not retrieve credentials from the SdkConfig: {config:?}"); + } + } + + #[tokio::test] + #[serial] + async fn test_options_credentials_provider_session_token() { + let options = StorageOptions(hashmap! { + constants::AWS_ACCESS_KEY_ID.to_string() => "test_id".to_string(), + constants::AWS_SECRET_ACCESS_KEY.to_string() => "test_secret".to_string(), + constants::AWS_SESSION_TOKEN.to_string() => "test_token".to_string(), + }); + + let config = resolve_credentials(options) + .await + .expect("Failed to resolve_credentials"); + + if let Some(provider) = &config.credentials_provider() { + let credentials = provider + .provide_credentials() + .await + .expect("Failed to provide credentials"); + assert_eq!( + Some("test_token"), + credentials.session_token(), + "The session token should come from our options! {credentials:?}" + ); + } else { + panic!("Could not retrieve credentials from the SdkConfig: {config:?}"); + } } } diff --git a/crates/aws/src/lib.rs b/crates/aws/src/lib.rs index d179c37e68..ddb768bdd9 100644 --- a/crates/aws/src/lib.rs +++ b/crates/aws/src/lib.rs @@ -1,5 +1,9 @@ -//! Lock client implementation based on DynamoDb. +//! AWS S3 and similar tooling for delta-rs +//! +//! This module also contains the [S3DynamoDbLogStore] implemtnation for concurrent writer support +//! with AWS S3 specifically. 
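+//!
+//! Illustrative sketch only: a rough idea of how a client crate might point this
+//! backend at a table, either with the DynamoDB locking provider or with S3
+//! conditional puts (both configurations are described elsewhere in this patch).
+//! The bucket and table path are placeholders, and the `DeltaTableBuilder` entry
+//! point is assumed to come from `deltalake_core`.
+//!
+//! ```rust,no_run
+//! use std::collections::HashMap;
+//! use deltalake_core::DeltaTableBuilder;
+//!
+//! # async fn example() -> deltalake_core::DeltaResult<()> {
+//! // Make the s3:// URL handlers of this crate known to deltalake_core.
+//! deltalake_aws::register_handlers(None);
+//!
+//! // Option 1: use DynamoDB as the locking provider for safe concurrent writes.
+//! let mut storage_options = HashMap::new();
+//! storage_options.insert("AWS_S3_LOCKING_PROVIDER".to_string(), "dynamodb".to_string());
+//! storage_options.insert("DELTA_DYNAMO_TABLE_NAME".to_string(), "delta_log".to_string());
+//!
+//! // Option 2: skip the lock table entirely by relying on S3 conditional puts.
+//! // storage_options.insert("conditional_put".to_string(), "etag".to_string());
+//!
+//! let table = DeltaTableBuilder::from_uri("s3://my-bucket/my-table")
+//!     .with_storage_options(storage_options)
+//!     .load()
+//!     .await?;
+//! # Ok(())
+//! # }
+//! ```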
+pub mod constants; mod credentials; pub mod errors; pub mod logstore; @@ -7,6 +11,7 @@ pub mod logstore; mod native; pub mod storage; use aws_config::SdkConfig; +use aws_sdk_dynamodb::error::SdkError; use aws_sdk_dynamodb::{ operation::{ create_table::CreateTableError, delete_item::DeleteItemError, get_item::GetItemError, @@ -29,7 +34,7 @@ use std::{ }; use tracing::debug; -use deltalake_core::logstore::{logstores, LogStore, LogStoreFactory}; +use deltalake_core::logstore::{default_logstore, logstores, LogStore, LogStoreFactory}; use deltalake_core::storage::{factories, url_prefix_handler, ObjectStoreRef, StorageOptions}; use deltalake_core::{DeltaResult, Path}; use url::Url; @@ -49,23 +54,36 @@ impl LogStoreFactory for S3LogStoreFactory { ) -> DeltaResult> { let store = url_prefix_handler(store, Path::parse(location.path())?); - if options - .0 - .contains_key(AmazonS3ConfigKey::CopyIfNotExists.as_ref()) - { + // With conditional put in S3-like API we can use the deltalake default logstore which use PutIfAbsent + if options.0.keys().any(|key| { + let key = key.to_ascii_lowercase(); + vec![ + AmazonS3ConfigKey::ConditionalPut.as_ref(), + "conditional_put", + ] + .contains(&key.as_str()) + }) { + debug!("S3LogStoreFactory has been asked to create a default LogStore where the underlying store has Conditonal Put enabled - no locking provider required"); + return Ok(default_logstore(store, location, options)); + } + + if options.0.keys().any(|key| { + let key = key.to_ascii_lowercase(); + vec![ + AmazonS3ConfigKey::CopyIfNotExists.as_ref(), + "copy_if_not_exists", + ] + .contains(&key.as_str()) + }) { debug!("S3LogStoreFactory has been asked to create a LogStore where the underlying store has copy-if-not-exists enabled - no locking provider required"); - return Ok(deltalake_core::logstore::default_logstore( - store, location, options, - )); + return Ok(logstore::default_s3_logstore(store, location, options)); } let s3_options = S3StorageOptions::from_map(&options.0)?; if s3_options.locking_provider.as_deref() != Some("dynamodb") { debug!("S3LogStoreFactory has been asked to create a LogStore without the dynamodb locking provider"); - return Ok(deltalake_core::logstore::default_logstore( - store, location, options, - )); + return Ok(logstore::default_s3_logstore(store, location, options)); } Ok(Arc::new(logstore::S3DynamoDbLogStore::try_new( @@ -141,8 +159,12 @@ impl DynamoDbLockClient { lock_table_name: Option, billing_mode: Option, max_elapsed_request_time: Option, + dynamodb_override_endpoint: Option, ) -> Result { - let dynamodb_client = aws_sdk_dynamodb::Client::new(sdk_config); + let dynamodb_sdk_config = + Self::create_dynamodb_sdk_config(sdk_config, dynamodb_override_endpoint); + + let dynamodb_client = aws_sdk_dynamodb::Client::new(&dynamodb_sdk_config); let lock_table_name = lock_table_name .or_else(|| std::env::var(constants::LOCK_TABLE_KEY_NAME).ok()) @@ -177,6 +199,24 @@ impl DynamoDbLockClient { config, }) } + fn create_dynamodb_sdk_config( + sdk_config: &SdkConfig, + dynamodb_override_endpoint: Option, + ) -> SdkConfig { + /* + if dynamodb_override_endpoint exists/AWS_ENDPOINT_URL_DYNAMODB is specified by user + use dynamodb_override_endpoint to create dynamodb client + */ + + match dynamodb_override_endpoint { + Some(dynamodb_endpoint_url) => sdk_config + .to_owned() + .to_builder() + .endpoint_url(dynamodb_endpoint_url) + .build(), + None => sdk_config.to_owned(), + } + } /// Create the lock table where DynamoDb stores the commit information for all delta tables. 
/// @@ -256,28 +296,28 @@ impl DynamoDbLockClient { version: i64, ) -> Result, LockClientError> { let item = self - .retry(|| async { - match self - .dynamodb_client - .get_item() - .consistent_read(true) - .table_name(&self.config.lock_table_name) - .set_key(Some(self.get_primary_key(version, table_path))) - .send() - .await - { - Ok(x) => Ok(x), - Err(sdk_err) => match sdk_err.as_service_error() { - Some(GetItemError::ProvisionedThroughputExceededException(_)) => { - Err(backoff::Error::transient( - LockClientError::ProvisionedThroughputExceeded, - )) - } - _ => Err(backoff::Error::permanent(sdk_err.into())), - }, + .retry( + || async { + self.dynamodb_client + .get_item() + .consistent_read(true) + .table_name(&self.config.lock_table_name) + .set_key(Some(self.get_primary_key(version, table_path))) + .send() + .await + }, + |err| match err.as_service_error() { + Some(GetItemError::ProvisionedThroughputExceededException(_)) => true, + _ => false, + }, + ) + .await + .map_err(|err| match err.as_service_error() { + Some(GetItemError::ProvisionedThroughputExceededException(_)) => { + LockClientError::ProvisionedThroughputExceeded } - }) - .await?; + _ => err.into(), + })?; item.item.as_ref().map(CommitEntry::try_from).transpose() } @@ -287,36 +327,38 @@ impl DynamoDbLockClient { table_path: &str, entry: &CommitEntry, ) -> Result<(), LockClientError> { - self.retry(|| async { - let item = create_value_map(entry, table_path); - match self - .dynamodb_client - .put_item() - .condition_expression(constants::CONDITION_EXPR_CREATE.as_str()) - .table_name(self.get_lock_table_name()) - .set_item(Some(item)) - .send() - .await - { - Ok(_) => Ok(()), - Err(err) => match err.as_service_error() { - Some(PutItemError::ProvisionedThroughputExceededException(_)) => Err( - backoff::Error::transient(LockClientError::ProvisionedThroughputExceeded), - ), - Some(PutItemError::ConditionalCheckFailedException(_)) => Err( - backoff::Error::permanent(LockClientError::VersionAlreadyExists { - table_path: table_path.to_owned(), - version: entry.version, - }), - ), - Some(PutItemError::ResourceNotFoundException(_)) => Err( - backoff::Error::permanent(LockClientError::LockTableNotFound), - ), - _ => Err(backoff::Error::permanent(err.into())), - }, + self.retry( + || async { + let item = create_value_map(entry, table_path); + let _ = self + .dynamodb_client + .put_item() + .condition_expression(constants::CONDITION_EXPR_CREATE.as_str()) + .table_name(self.get_lock_table_name()) + .set_item(Some(item)) + .send() + .await?; + Ok(()) + }, + |err: &SdkError<_, _>| match err.as_service_error() { + Some(PutItemError::ProvisionedThroughputExceededException(_)) => true, + _ => false, + }, + ) + .await + .map_err(|err| match err.as_service_error() { + Some(PutItemError::ProvisionedThroughputExceededException(_)) => { + LockClientError::ProvisionedThroughputExceeded } + Some(PutItemError::ConditionalCheckFailedException(_)) => { + LockClientError::VersionAlreadyExists { + table_path: table_path.to_owned(), + version: entry.version, + } + } + Some(PutItemError::ResourceNotFoundException(_)) => LockClientError::LockTableNotFound, + _ => err.into(), }) - .await } /// Get the latest entry (entry with highest version). 
@@ -338,33 +380,33 @@ impl DynamoDbLockClient { limit: i64, ) -> Result, LockClientError> { let query_result = self - .retry(|| async { - match self - .dynamodb_client - .query() - .table_name(self.get_lock_table_name()) - .consistent_read(true) - .limit(limit.try_into().unwrap_or(i32::MAX)) - .scan_index_forward(false) - .key_condition_expression(format!("{} = :tn", constants::ATTR_TABLE_PATH)) - .set_expression_attribute_values(Some( - maplit::hashmap!(":tn".into() => string_attr(table_path)), - )) - .send() - .await - { - Ok(result) => Ok(result), - Err(sdk_err) => match sdk_err.as_service_error() { - Some(QueryError::ProvisionedThroughputExceededException(_)) => { - Err(backoff::Error::transient( - LockClientError::ProvisionedThroughputExceeded, - )) - } - _ => Err(backoff::Error::permanent(sdk_err.into())), - }, + .retry( + || async { + self.dynamodb_client + .query() + .table_name(self.get_lock_table_name()) + .consistent_read(true) + .limit(limit.try_into().unwrap_or(i32::MAX)) + .scan_index_forward(false) + .key_condition_expression(format!("{} = :tn", constants::ATTR_TABLE_PATH)) + .set_expression_attribute_values(Some( + maplit::hashmap!(":tn".into() => string_attr(table_path)), + )) + .send() + .await + }, + |err: &SdkError<_, _>| match err.as_service_error() { + Some(QueryError::ProvisionedThroughputExceededException(_)) => true, + _ => false, + }, + ) + .await + .map_err(|err| match err.as_service_error() { + Some(QueryError::ProvisionedThroughputExceededException(_)) => { + LockClientError::ProvisionedThroughputExceeded } - }) - .await?; + _ => err.into(), + })?; query_result .items @@ -385,35 +427,44 @@ impl DynamoDbLockClient { .duration_since(SystemTime::UNIX_EPOCH) .unwrap() .as_secs(); - self.retry(|| async { - match self - .dynamodb_client - .update_item() - .table_name(self.get_lock_table_name()) - .set_key(Some(self.get_primary_key(version, table_path))) - .update_expression("SET complete = :c, expireTime = :e".to_owned()) - .set_expression_attribute_values(Some(maplit::hashmap! { - ":c".to_owned() => string_attr("true"), - ":e".to_owned() => num_attr(seconds_since_epoch), - ":f".into() => string_attr("false"), - })) - .condition_expression(constants::CONDITION_UPDATE_INCOMPLETE) - .send() - .await - { - Ok(_) => Ok(UpdateLogEntryResult::UpdatePerformed), - Err(err) => match err.as_service_error() { - Some(UpdateItemError::ProvisionedThroughputExceededException(_)) => Err( - backoff::Error::transient(LockClientError::ProvisionedThroughputExceeded), - ), - Some(UpdateItemError::ConditionalCheckFailedException(_)) => { - Ok(UpdateLogEntryResult::AlreadyCompleted) - } - _ => Err(backoff::Error::permanent(err.into())), + let res = self + .retry( + || async { + let _ = self + .dynamodb_client + .update_item() + .table_name(self.get_lock_table_name()) + .set_key(Some(self.get_primary_key(version, table_path))) + .update_expression("SET complete = :c, expireTime = :e".to_owned()) + .set_expression_attribute_values(Some(maplit::hashmap! 
{ + ":c".to_owned() => string_attr("true"), + ":e".to_owned() => num_attr(seconds_since_epoch), + ":f".into() => string_attr("false"), + })) + .condition_expression(constants::CONDITION_UPDATE_INCOMPLETE) + .send() + .await?; + Ok(()) }, - } - }) - .await + |err: &SdkError<_, _>| match err.as_service_error() { + Some(UpdateItemError::ProvisionedThroughputExceededException(_)) => true, + _ => false, + }, + ) + .await; + + match res { + Ok(()) => Ok(UpdateLogEntryResult::UpdatePerformed), + Err(err) => match err.as_service_error() { + Some(UpdateItemError::ProvisionedThroughputExceededException(_)) => { + Err(LockClientError::ProvisionedThroughputExceeded) + } + Some(UpdateItemError::ConditionalCheckFailedException(_)) => { + Ok(UpdateLogEntryResult::AlreadyCompleted) + } + _ => Err(err.into()), + }, + } } /// Delete existing log entry if it is not already complete @@ -422,48 +473,52 @@ impl DynamoDbLockClient { version: i64, table_path: &str, ) -> Result<(), LockClientError> { - self.retry(|| async { - match self - .dynamodb_client - .delete_item() - .table_name(self.get_lock_table_name()) - .set_key(Some(self.get_primary_key(version, table_path))) - .set_expression_attribute_values(Some(maplit::hashmap! { - ":f".into() => string_attr("false"), - })) - .condition_expression(constants::CONDITION_DELETE_INCOMPLETE.as_str()) - .send() - .await - { - Ok(_) => Ok(()), - Err(err) => match err.as_service_error() { - Some(DeleteItemError::ProvisionedThroughputExceededException(_)) => Err( - backoff::Error::transient(LockClientError::ProvisionedThroughputExceeded), - ), - Some(DeleteItemError::ConditionalCheckFailedException(_)) => Err( - backoff::Error::permanent(LockClientError::VersionAlreadyCompleted { - table_path: table_path.to_owned(), - version, - }), - ), - _ => Err(backoff::Error::permanent(err.into())), - }, + self.retry( + || async { + let _ = self + .dynamodb_client + .delete_item() + .table_name(self.get_lock_table_name()) + .set_key(Some(self.get_primary_key(version, table_path))) + .set_expression_attribute_values(Some(maplit::hashmap! { + ":f".into() => string_attr("false"), + })) + .condition_expression(constants::CONDITION_DELETE_INCOMPLETE.as_str()) + .send() + .await?; + Ok(()) + }, + |err: &SdkError<_, _>| match err.as_service_error() { + Some(DeleteItemError::ProvisionedThroughputExceededException(_)) => true, + _ => false, + }, + ) + .await + .map_err(|err| match err.as_service_error() { + Some(DeleteItemError::ProvisionedThroughputExceededException(_)) => { + LockClientError::ProvisionedThroughputExceeded } + Some(DeleteItemError::ConditionalCheckFailedException(_)) => { + LockClientError::VersionAlreadyCompleted { + table_path: table_path.to_owned(), + version, + } + } + _ => err.into(), }) - .await } - async fn retry(&self, operation: Fn) -> Result + async fn retry(&self, operation: F, when: Wn) -> Result where - Fn: FnMut() -> Fut, - Fut: std::future::Future>>, + F: FnMut() -> Fut, + Fut: std::future::Future>, + Wn: Fn(&E) -> bool, { - let backoff = backoff::ExponentialBackoffBuilder::new() - .with_multiplier(2.) - .with_max_interval(Duration::from_secs(15)) - .with_max_elapsed_time(Some(self.config.max_elapsed_request_time)) - .build(); - backoff::future::retry(backoff, operation).await + use backon::Retryable; + let backoff = backon::ExponentialBuilder::default() + .with_factor(2.) 
+ .with_max_delay(self.config.max_elapsed_request_time); + operation.retry(backoff).when(when).await } } @@ -565,42 +620,6 @@ pub enum CreateLockTableResult { TableAlreadyExists, } -pub mod constants { - use std::time::Duration; - - use lazy_static::lazy_static; - - pub const DEFAULT_LOCK_TABLE_NAME: &str = "delta_log"; - pub const LOCK_TABLE_KEY_NAME: &str = "DELTA_DYNAMO_TABLE_NAME"; - pub const BILLING_MODE_KEY_NAME: &str = "DELTA_DYNAMO_BILLING_MODE"; - pub const MAX_ELAPSED_REQUEST_TIME_KEY_NAME: &str = "DELTA_DYNAMO_MAX_ELAPSED_REQUEST_TIME"; - - pub const ATTR_TABLE_PATH: &str = "tablePath"; - pub const ATTR_FILE_NAME: &str = "fileName"; - pub const ATTR_TEMP_PATH: &str = "tempPath"; - pub const ATTR_COMPLETE: &str = "complete"; - pub const ATTR_EXPIRE_TIME: &str = "expireTime"; - - pub const STRING_TYPE: &str = "S"; - - pub const KEY_TYPE_HASH: &str = "HASH"; - pub const KEY_TYPE_RANGE: &str = "RANGE"; - - lazy_static! { - pub static ref CONDITION_EXPR_CREATE: String = format!( - "attribute_not_exists({ATTR_TABLE_PATH}) and attribute_not_exists({ATTR_FILE_NAME})" - ); - - pub static ref CONDITION_DELETE_INCOMPLETE: String = format!( - "(complete = :f) or (attribute_not_exists({ATTR_TABLE_PATH}) and attribute_not_exists({ATTR_FILE_NAME}))" - ); - } - - pub const CONDITION_UPDATE_INCOMPLETE: &str = "complete = :f"; - - pub const DEFAULT_COMMIT_ENTRY_EXPIRATION_DELAY: Duration = Duration::from_secs(86_400); -} - /// Extract a field from an item's attribute value map, producing a descriptive error /// of the various failure cases. fn extract_required_string_field<'a>( @@ -663,6 +682,7 @@ fn extract_version_from_filename(name: &str) -> Option { #[cfg(test)] mod tests { use super::*; + use aws_config::Region; use object_store::memory::InMemory; use serial_test::serial; @@ -705,10 +725,37 @@ mod tests { let factory = S3LogStoreFactory::default(); let store = InMemory::new(); let url = Url::parse("s3://test-bucket").unwrap(); - std::env::remove_var(storage::s3_constants::AWS_S3_LOCKING_PROVIDER); + std::env::remove_var(crate::constants::AWS_S3_LOCKING_PROVIDER); let logstore = factory .with_options(Arc::new(store), &url, &StorageOptions::from(HashMap::new())) .unwrap(); - assert_eq!(logstore.name(), "DefaultLogStore"); + assert_eq!(logstore.name(), "S3LogStore"); + } + + #[test] + #[serial] + fn test_create_dynamodb_sdk_config() { + let sdk_config = SdkConfig::builder() + .region(Region::from_static("eu-west-1")) + .endpoint_url("http://localhost:1234") + .build(); + let dynamodb_sdk_config = DynamoDbLockClient::create_dynamodb_sdk_config( + &sdk_config, + Some("http://localhost:2345".to_string()), + ); + assert_eq!( + dynamodb_sdk_config.endpoint_url(), + Some("http://localhost:2345"), + ); + assert_eq!( + dynamodb_sdk_config.region().unwrap().to_string(), + "eu-west-1".to_string(), + ); + let dynamodb_sdk_no_override_config = + DynamoDbLockClient::create_dynamodb_sdk_config(&sdk_config, None); + assert_eq!( + dynamodb_sdk_no_override_config.endpoint_url(), + Some("http://localhost:1234"), + ); } } diff --git a/crates/aws/src/logstore/default_logstore.rs b/crates/aws/src/logstore/default_logstore.rs new file mode 100644 index 0000000000..a5688141c2 --- /dev/null +++ b/crates/aws/src/logstore/default_logstore.rs @@ -0,0 +1,113 @@ +//! 
Default implementation of [`LogStore`] for S3 storage backends + +use std::sync::Arc; + +use bytes::Bytes; +use deltalake_core::{ + logstore::{ + abort_commit_entry, get_latest_version, read_commit_entry, write_commit_entry, + CommitOrBytes, LogStore, LogStoreConfig, + }, + operations::transaction::TransactionError, + storage::{ObjectStoreRef, StorageOptions}, + DeltaResult, +}; +use object_store::{Error as ObjectStoreError, ObjectStore}; +use url::Url; + +/// Return the [S3LogStore] implementation with the provided configuration options +pub fn default_s3_logstore( + store: ObjectStoreRef, + location: &Url, + options: &StorageOptions, +) -> Arc { + Arc::new(S3LogStore::new( + store, + LogStoreConfig { + location: location.clone(), + options: options.clone(), + }, + )) +} + +/// Default [`LogStore`] implementation +#[derive(Debug, Clone)] +pub struct S3LogStore { + pub(crate) storage: Arc, + config: LogStoreConfig, +} + +impl S3LogStore { + /// Create a new instance of [`S3LogStore`] + /// + /// # Arguments + /// + /// * `storage` - A shared reference to an [`object_store::ObjectStore`] with "/" pointing at delta table root (i.e. where `_delta_log` is located). + /// * `location` - A url corresponding to the storage location of `storage`. + pub fn new(storage: ObjectStoreRef, config: LogStoreConfig) -> Self { + Self { storage, config } + } +} + +#[async_trait::async_trait] +impl LogStore for S3LogStore { + fn name(&self) -> String { + "S3LogStore".into() + } + + async fn read_commit_entry(&self, version: i64) -> DeltaResult> { + read_commit_entry(self.storage.as_ref(), version).await + } + + /// Tries to commit a prepared commit file. Returns [`TransactionError`] + /// if the given `version` already exists. The caller should handle the retry logic itself. + /// This is low-level transaction API. If user does not want to maintain the commit loop then + /// the `DeltaTransaction.commit` is desired to be used as it handles `try_commit_transaction` + /// with retry logic. + async fn write_commit_entry( + &self, + version: i64, + commit_or_bytes: CommitOrBytes, + ) -> Result<(), TransactionError> { + match commit_or_bytes { + CommitOrBytes::TmpCommit(tmp_commit) => { + Ok(write_commit_entry(&self.object_store(), version, &tmp_commit).await?) + } + _ => unreachable!(), // S3 Log Store should never receive bytes + } + .map_err(|err| -> TransactionError { + match err { + ObjectStoreError::AlreadyExists { .. 
} => { + TransactionError::VersionAlreadyExists(version) + } + _ => TransactionError::from(err), + } + })?; + Ok(()) + } + + async fn abort_commit_entry( + &self, + version: i64, + commit_or_bytes: CommitOrBytes, + ) -> Result<(), TransactionError> { + match &commit_or_bytes { + CommitOrBytes::TmpCommit(tmp_commit) => { + abort_commit_entry(self.storage.as_ref(), version, tmp_commit).await + } + _ => unreachable!(), // S3 Log Store should never receive bytes + } + } + + async fn get_latest_version(&self, current_version: i64) -> DeltaResult { + get_latest_version(self, current_version).await + } + + fn object_store(&self) -> Arc { + self.storage.clone() + } + + fn config(&self) -> &LogStoreConfig { + &self.config + } +} diff --git a/crates/aws/src/logstore.rs b/crates/aws/src/logstore/dynamodb_logstore.rs similarity index 94% rename from crates/aws/src/logstore.rs rename to crates/aws/src/logstore/dynamodb_logstore.rs index 9eba66cb93..202df1709e 100644 --- a/crates/aws/src/logstore.rs +++ b/crates/aws/src/logstore/dynamodb_logstore.rs @@ -45,7 +45,7 @@ impl S3DynamoDbLogStore { object_store: ObjectStoreRef, ) -> DeltaResult { let lock_client = DynamoDbLockClient::try_new( - &s3_options.sdk_config, + &s3_options.sdk_config.clone().unwrap(), s3_options .extra_opts .get(constants::LOCK_TABLE_KEY_NAME) @@ -58,6 +58,7 @@ impl S3DynamoDbLogStore { .extra_opts .get(constants::MAX_ELAPSED_REQUEST_TIME_KEY_NAME) .cloned(), + s3_options.dynamodb_endpoint.clone(), ) .map_err(|err| DeltaTableError::ObjectStore { source: ObjectStoreError::Generic { @@ -198,8 +199,12 @@ impl LogStore for S3DynamoDbLogStore { async fn write_commit_entry( &self, version: i64, - tmp_commit: &Path, + commit_or_bytes: CommitOrBytes, ) -> Result<(), TransactionError> { + let tmp_commit = match commit_or_bytes { + CommitOrBytes::TmpCommit(tmp_commit) => tmp_commit, + _ => unreachable!(), // S3DynamoDBLogstore should never get Bytes + }; let entry = CommitEntry::new(version, tmp_commit.clone()); debug!("Writing commit entry for {self:?}: {entry:?}"); // create log entry in dynamo db: complete = false, no expireTime @@ -243,8 +248,12 @@ impl LogStore for S3DynamoDbLogStore { async fn abort_commit_entry( &self, version: i64, - tmp_commit: &Path, + commit_or_bytes: CommitOrBytes, ) -> Result<(), TransactionError> { + let tmp_commit = match commit_or_bytes { + CommitOrBytes::TmpCommit(tmp_commit) => tmp_commit, + _ => unreachable!(), // S3DynamoDBLogstore should never get Bytes + }; self.lock_client .delete_commit_entry(version, &self.table_path) .await @@ -265,7 +274,7 @@ impl LogStore for S3DynamoDbLogStore { }, })?; - abort_commit_entry(&self.storage, version, tmp_commit).await?; + abort_commit_entry(&self.storage, version, &tmp_commit).await?; Ok(()) } @@ -308,13 +317,3 @@ pub enum RepairLogEntryResult { /// Both parts of the repair process where already carried. AlreadyCompleted, } - -/// Represents the possible, positive outcomes of calling `DynamoDbClient::try_create_lock_table()` -#[derive(Debug, PartialEq)] -pub enum CreateLockTableResult { - /// Table created successfully. - TableCreated, - /// Table was not created because it already exists. - /// Does not imply that the table has the correct schema. - TableAlreadyExists, -} diff --git a/crates/aws/src/logstore/mod.rs b/crates/aws/src/logstore/mod.rs new file mode 100644 index 0000000000..e5d7f87aec --- /dev/null +++ b/crates/aws/src/logstore/mod.rs @@ -0,0 +1,11 @@ +//! Contains the different logstore implementations for S3. +//! 
- S3LogStore (used when copy-if-not-exists or unsafe_rename is passed) +//! - S3DynamoDBLogStore (used when DynamoDB is the locking client) + +mod default_logstore; +mod dynamodb_logstore; + +pub use default_logstore::default_s3_logstore; +pub use default_logstore::S3LogStore; +pub use dynamodb_logstore::RepairLogEntryResult; +pub use dynamodb_logstore::S3DynamoDbLogStore; diff --git a/crates/aws/src/storage.rs b/crates/aws/src/storage.rs index e7b4d71109..a6735b1c0f 100644 --- a/crates/aws/src/storage.rs +++ b/crates/aws/src/storage.rs @@ -1,28 +1,30 @@ //! AWS S3 storage backend. -use aws_config::meta::region::ProvideRegion; -use aws_config::provider_config::ProviderConfig; use aws_config::{Region, SdkConfig}; use bytes::Bytes; +use deltalake_core::storage::object_store::aws::{AmazonS3Builder, AmazonS3ConfigKey}; use deltalake_core::storage::object_store::{ - aws::AmazonS3ConfigKey, parse_url_opts, GetOptions, GetResult, ListResult, MultipartId, - ObjectMeta, ObjectStore, PutOptions, PutResult, Result as ObjectStoreResult, + parse_url_opts, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, ObjectStore, + ObjectStoreScheme, PutMultipartOpts, PutOptions, PutPayload, PutResult, + Result as ObjectStoreResult, }; use deltalake_core::storage::{ limit_store_handler, str_is_truthy, ObjectStoreFactory, ObjectStoreRef, StorageOptions, }; -use deltalake_core::{DeltaResult, ObjectStoreError, Path}; +use deltalake_core::{DeltaResult, DeltaTableError, ObjectStoreError, Path}; use futures::stream::BoxStream; use futures::Future; +use object_store::aws::S3CopyIfNotExists; use std::collections::HashMap; use std::fmt::Debug; use std::ops::Range; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use tokio::io::AsyncWrite; +use tracing::log::*; use url::Url; +use crate::constants; use crate::errors::DynamoDbConfigError; #[cfg(feature = "native-tls")] use crate::native; @@ -71,36 +73,86 @@ impl ObjectStoreFactory for S3ObjectStoreFactory { storage_options: &StorageOptions, ) -> DeltaResult<(ObjectStoreRef, Path)> { let options = self.with_env_s3(storage_options); - let (inner, prefix) = parse_url_opts( - url, - options.0.iter().filter_map(|(key, value)| { - let s3_key = AmazonS3ConfigKey::from_str(&key.to_ascii_lowercase()).ok()?; - Some((s3_key, value.clone())) - }), - )?; - - let store = limit_store_handler(inner, &options); - // If the copy-if-not-exists env var is set, we don't need to instantiate a locking client or check for allow-unsafe-rename. 
- if options - .0 - .contains_key(AmazonS3ConfigKey::CopyIfNotExists.as_ref()) - { - Ok((store, prefix)) - } else { - let s3_options = S3StorageOptions::from_map(&storage_options.0)?; + // All S3-likes should start their builder the same way + let mut builder = AmazonS3Builder::new().with_url(url.to_string()); - let store = S3StorageBackend::try_new( - store, - Some("dynamodb") == s3_options.locking_provider.as_deref() - || s3_options.allow_unsafe_rename, - )?; + for (key, value) in options.0.iter() { + if let Ok(key) = AmazonS3ConfigKey::from_str(&key.to_ascii_lowercase()) { + builder = builder.with_config(key, value.clone()); + } + } - Ok((Arc::new(store), prefix)) + let (_scheme, path) = + ObjectStoreScheme::parse(url).map_err(|e| DeltaTableError::GenericError { + source: Box::new(e), + })?; + let prefix = Path::parse(path)?; + + if is_aws(storage_options) { + debug!("Detected AWS S3, resolving credentials"); + let sdk_config = execute_sdk_future(crate::credentials::resolve_credentials( + storage_options.clone(), + ))??; + builder = builder.with_credentials(Arc::new( + crate::credentials::AWSForObjectStore::new(sdk_config), + )); } + + let inner = builder.build()?; + + let store = aws_storage_handler(limit_store_handler(inner, &options), &options)?; + debug!("Initialized the object store: {store:?}"); + + Ok((store, prefix)) } } +fn aws_storage_handler( + store: ObjectStoreRef, + options: &StorageOptions, +) -> DeltaResult { + // If the copy-if-not-exists env var is set or ConditionalPut is set, we don't need to instantiate a locking client or check for allow-unsafe-rename. + if options + .0 + .contains_key(AmazonS3ConfigKey::CopyIfNotExists.as_ref()) + || options + .0 + .contains_key(AmazonS3ConfigKey::ConditionalPut.as_ref()) + { + Ok(store) + } else { + let s3_options = S3StorageOptions::from_map(&options.0)?; + + let store = S3StorageBackend::try_new( + store, + Some("dynamodb") == s3_options.locking_provider.as_deref() + || s3_options.allow_unsafe_rename, + )?; + Ok(Arc::new(store)) + } +} + +// Determine whether this crate is being configured for use with native AWS S3 or an S3-alike +// +// This function will rteturn true in the default case since it's most likely that the absence of +// options will mean default/S3 configuration +fn is_aws(options: &StorageOptions) -> bool { + if options + .0 + .contains_key(crate::constants::AWS_FORCE_CREDENTIAL_LOAD) + { + return true; + } + if options + .0 + .contains_key(crate::constants::AWS_S3_LOCKING_PROVIDER) + { + return true; + } + !options.0.contains_key(crate::constants::AWS_ENDPOINT_URL) +} + /// Options used to configure the [S3StorageBackend]. /// /// Available options are described in [s3_constants]. 
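Editor's note: the new `aws_storage_handler` and `is_aws` helpers above decide whether a table URL is backed by AWS S3 proper or an S3-compatible endpoint, and therefore whether SDK credential resolution and the locking-provider check are needed: forcing credential load or configuring a locking provider always means native AWS, while any custom `AWS_ENDPOINT_URL` (MinIO, LocalStack, and similar) means an S3-alike. The sketch below reimplements just that decision order for illustration; the string keys are assumptions standing in for `deltalake_aws::constants::*`, and the function name is illustrative rather than part of the crate's API.

```rust
use std::collections::HashMap;

/// Illustrative reimplementation of the detection order used by `is_aws`.
/// The literal keys stand in for `deltalake_aws::constants::*`.
fn targets_native_aws(options: &HashMap<String, String>) -> bool {
    // Explicitly forcing credential resolution always means native AWS.
    if options.contains_key("AWS_FORCE_CREDENTIAL_LOAD") {
        return true;
    }
    // A locking provider (DynamoDB) is only meaningful on native AWS.
    if options.contains_key("AWS_S3_LOCKING_PROVIDER") {
        return true;
    }
    // Otherwise a custom endpoint means an S3-compatible store; no endpoint
    // falls back to the default assumption of real AWS S3.
    !options.contains_key("AWS_ENDPOINT_URL")
}

fn main() {
    let minio: HashMap<String, String> = HashMap::from([(
        "AWS_ENDPOINT_URL".to_string(),
        "http://minio:9000".to_string(),
    )]);
    assert!(!targets_native_aws(&minio));

    let mut localstack = minio.clone();
    localstack.insert("AWS_FORCE_CREDENTIAL_LOAD".to_string(), "true".to_string());
    assert!(targets_native_aws(&localstack));

    // With no options at all, the default is to assume native AWS S3.
    assert!(targets_native_aws(&HashMap::new()));
}
```

When the real `is_aws` returns false, `create_with_options` skips AWS SDK credential resolution entirely and builds the store from the passed options alone, which is the behaviour the `test_is_aws` test later in this patch exercises.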
@@ -109,12 +161,13 @@ impl ObjectStoreFactory for S3ObjectStoreFactory { pub struct S3StorageOptions { pub virtual_hosted_style_request: bool, pub locking_provider: Option, + pub dynamodb_endpoint: Option, pub s3_pool_idle_timeout: Duration, pub sts_pool_idle_timeout: Duration, pub s3_get_internal_server_error_retries: usize, pub allow_unsafe_rename: bool, pub extra_opts: HashMap, - pub sdk_config: SdkConfig, + pub sdk_config: Option, } impl Eq for S3StorageOptions {} @@ -122,43 +175,42 @@ impl PartialEq for S3StorageOptions { fn eq(&self, other: &Self) -> bool { self.virtual_hosted_style_request == other.virtual_hosted_style_request && self.locking_provider == other.locking_provider + && self.dynamodb_endpoint == other.dynamodb_endpoint && self.s3_pool_idle_timeout == other.s3_pool_idle_timeout && self.sts_pool_idle_timeout == other.sts_pool_idle_timeout && self.s3_get_internal_server_error_retries == other.s3_get_internal_server_error_retries && self.allow_unsafe_rename == other.allow_unsafe_rename && self.extra_opts == other.extra_opts - && self.sdk_config.endpoint_url() == other.sdk_config.endpoint_url() - && self.sdk_config.region() == other.sdk_config.region() } } impl S3StorageOptions { /// Creates an instance of S3StorageOptions from the given HashMap. pub fn from_map(options: &HashMap) -> DeltaResult { - let extra_opts = options + let extra_opts: HashMap = options .iter() .filter(|(k, _)| !s3_constants::S3_OPTS.contains(&k.as_str())) .map(|(k, v)| (k.to_owned(), v.to_owned())) .collect(); // Copy web identity values provided in options but not the environment into the environment // to get picked up by the `from_k8s_env` call in `get_web_identity_provider`. - Self::ensure_env_var(options, s3_constants::AWS_REGION); - Self::ensure_env_var(options, s3_constants::AWS_PROFILE); - Self::ensure_env_var(options, s3_constants::AWS_ACCESS_KEY_ID); - Self::ensure_env_var(options, s3_constants::AWS_SECRET_ACCESS_KEY); - Self::ensure_env_var(options, s3_constants::AWS_SESSION_TOKEN); - Self::ensure_env_var(options, s3_constants::AWS_WEB_IDENTITY_TOKEN_FILE); - Self::ensure_env_var(options, s3_constants::AWS_ROLE_ARN); - Self::ensure_env_var(options, s3_constants::AWS_ROLE_SESSION_NAME); + Self::ensure_env_var(options, constants::AWS_REGION); + Self::ensure_env_var(options, constants::AWS_PROFILE); + Self::ensure_env_var(options, constants::AWS_ACCESS_KEY_ID); + Self::ensure_env_var(options, constants::AWS_SECRET_ACCESS_KEY); + Self::ensure_env_var(options, constants::AWS_SESSION_TOKEN); + Self::ensure_env_var(options, constants::AWS_WEB_IDENTITY_TOKEN_FILE); + Self::ensure_env_var(options, constants::AWS_ROLE_ARN); + Self::ensure_env_var(options, constants::AWS_ROLE_SESSION_NAME); let s3_pool_idle_timeout = - Self::u64_or_default(options, s3_constants::AWS_S3_POOL_IDLE_TIMEOUT_SECONDS, 15); + Self::u64_or_default(options, constants::AWS_S3_POOL_IDLE_TIMEOUT_SECONDS, 15); let sts_pool_idle_timeout = - Self::u64_or_default(options, s3_constants::AWS_STS_POOL_IDLE_TIMEOUT_SECONDS, 10); + Self::u64_or_default(options, constants::AWS_STS_POOL_IDLE_TIMEOUT_SECONDS, 10); let s3_get_internal_server_error_retries = Self::u64_or_default( options, - s3_constants::AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES, + constants::AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES, 10, ) as usize; @@ -167,58 +219,26 @@ impl S3StorageOptions { .map(|addressing_style| addressing_style == "virtual") .unwrap_or(false); - let allow_unsafe_rename = str_option(options, s3_constants::AWS_S3_ALLOW_UNSAFE_RENAME) + let 
allow_unsafe_rename = str_option(options, constants::AWS_S3_ALLOW_UNSAFE_RENAME) .map(|val| str_is_truthy(&val)) .unwrap_or(false); - let disable_imds = str_option(options, s3_constants::AWS_EC2_METADATA_DISABLED) - .map(|val| str_is_truthy(&val)) - .unwrap_or(true); - let imds_timeout = - Self::u64_or_default(options, s3_constants::AWS_EC2_METADATA_TIMEOUT, 100); - let (loader, provider_config) = - if let Some(endpoint_url) = str_option(options, s3_constants::AWS_ENDPOINT_URL) { - let (region_provider, provider_config) = Self::create_provider_config( - str_option(options, s3_constants::AWS_REGION) - .or_else(|| std::env::var("AWS_DEFAULT_REGION").ok()) - .map_or(Region::from_static("custom"), Region::new), - )?; - let loader = aws_config::from_env() - .endpoint_url(endpoint_url) - .region(region_provider); - (loader, provider_config) - } else { - let (region_provider, provider_config) = Self::create_provider_config( - crate::credentials::new_region_provider(disable_imds, imds_timeout), - )?; - ( - aws_config::from_env().region(region_provider), - provider_config, - ) - }; - let credentials_provider = crate::credentials::ConfiguredCredentialChain::new( - disable_imds, - imds_timeout, - &provider_config, - ); - #[cfg(feature = "native-tls")] - let sdk_config = execute_sdk_future( - loader - .http_client(native::use_native_tls_client( - str_option(options, s3_constants::AWS_ALLOW_HTTP) - .map(|val| str_is_truthy(&val)) - .unwrap_or(false), - )) - .credentials_provider(credentials_provider) - .load(), - )?; - #[cfg(feature = "rustls")] - let sdk_config = - execute_sdk_future(loader.credentials_provider(credentials_provider).load())?; + let storage_options = StorageOptions(options.clone()); + + let sdk_config = match is_aws(&storage_options) { + false => None, + true => { + debug!("Detected AWS S3, resolving credentials"); + Some(execute_sdk_future( + crate::credentials::resolve_credentials(storage_options.clone()), + )??) 
+ } + }; Ok(Self { virtual_hosted_style_request, - locking_provider: str_option(options, s3_constants::AWS_S3_LOCKING_PROVIDER), + locking_provider: str_option(options, constants::AWS_S3_LOCKING_PROVIDER), + dynamodb_endpoint: str_option(options, constants::AWS_ENDPOINT_URL_DYNAMODB), s3_pool_idle_timeout: Duration::from_secs(s3_pool_idle_timeout), sts_pool_idle_timeout: Duration::from_secs(sts_pool_idle_timeout), s3_get_internal_server_error_retries, @@ -228,22 +248,14 @@ impl S3StorageOptions { }) } + /// Return the configured endpoint URL for S3 operations pub fn endpoint_url(&self) -> Option<&str> { - self.sdk_config.endpoint_url() + self.sdk_config.as_ref().map(|v| v.endpoint_url()).flatten() } + /// Return the configured region used for S3 operations pub fn region(&self) -> Option<&Region> { - self.sdk_config.region() - } - - fn create_provider_config( - region_provider: T, - ) -> DeltaResult<(T, ProviderConfig)> { - let region = execute_sdk_future(region_provider.region())?; - Ok(( - region_provider, - ProviderConfig::default().with_region(region), - )) + self.sdk_config.as_ref().map(|v| v.region()).flatten() } fn u64_or_default(map: &HashMap, key: &str, default: u64) -> u64 { @@ -254,7 +266,9 @@ impl S3StorageOptions { fn ensure_env_var(map: &HashMap, key: &str) { if let Some(val) = str_option(map, key) { - std::env::set_var(key, val); + unsafe { + std::env::set_var(key, val); + } } } @@ -280,7 +294,7 @@ where cfg = Some(handle.block_on(future)); }); }); - cfg.ok_or(deltalake_core::DeltaTableError::ObjectStore { + cfg.ok_or(DeltaTableError::ObjectStore { source: ObjectStoreError::Generic { store: STORE_NAME, source: Box::new(DynamoDbConfigError::InitializationError), @@ -307,7 +321,11 @@ pub struct S3StorageBackend { impl std::fmt::Display for S3StorageBackend { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "S3StorageBackend") + write!( + f, + "S3StorageBackend {{ allow_unsafe_rename: {}, inner: {} }}", + self.allow_unsafe_rename, self.inner + ) } } @@ -325,20 +343,24 @@ impl S3StorageBackend { impl std::fmt::Debug for S3StorageBackend { fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { - write!(fmt, "S3StorageBackend") + write!( + fmt, + "S3StorageBackend {{ allow_unsafe_rename: {}, inner: {:?} }}", + self.allow_unsafe_rename, self.inner + ) } } #[async_trait::async_trait] impl ObjectStore for S3StorageBackend { - async fn put(&self, location: &Path, bytes: Bytes) -> ObjectStoreResult { + async fn put(&self, location: &Path, bytes: PutPayload) -> ObjectStoreResult { self.inner.put(location, bytes).await } async fn put_opts( &self, location: &Path, - bytes: Bytes, + bytes: PutPayload, options: PutOptions, ) -> ObjectStoreResult { self.inner.put_opts(location, bytes, options).await @@ -399,114 +421,28 @@ impl ObjectStore for S3StorageBackend { } } - async fn put_multipart( - &self, - location: &Path, - ) -> ObjectStoreResult<(MultipartId, Box)> { + async fn put_multipart(&self, location: &Path) -> ObjectStoreResult> { self.inner.put_multipart(location).await } - async fn abort_multipart( + async fn put_multipart_opts( &self, location: &Path, - multipart_id: &MultipartId, - ) -> ObjectStoreResult<()> { - self.inner.abort_multipart(location, multipart_id).await + options: PutMultipartOpts, + ) -> ObjectStoreResult> { + self.inner.put_multipart_opts(location, options).await } } /// Storage option keys to use when creating [crate::storage::s3::S3StorageOptions]. 
/// The same key should be used whether passing a key in the hashmap or setting it as an environment variable. /// Provided keys may include configuration for the S3 backend and also the optional DynamoDb lock used for atomic rename. +#[deprecated( + since = "0.20.0", + note = "s3_constants has moved up to deltalake_aws::constants::*" +)] pub mod s3_constants { - /// Custom S3 endpoint. - pub const AWS_ENDPOINT_URL: &str = "AWS_ENDPOINT_URL"; - /// The AWS region. - pub const AWS_REGION: &str = "AWS_REGION"; - /// The AWS profile. - pub const AWS_PROFILE: &str = "AWS_PROFILE"; - /// The AWS_ACCESS_KEY_ID to use for S3. - pub const AWS_ACCESS_KEY_ID: &str = "AWS_ACCESS_KEY_ID"; - /// The AWS_SECRET_ACCESS_KEY to use for S3. - pub const AWS_SECRET_ACCESS_KEY: &str = "AWS_SECRET_ACCESS_KEY"; - /// The AWS_SESSION_TOKEN to use for S3. - pub const AWS_SESSION_TOKEN: &str = "AWS_SESSION_TOKEN"; - /// Uses either "path" (the default) or "virtual", which turns on - /// [virtual host addressing](http://docs.aws.amazon.com/AmazonS3/latest/dev/VirtualHosting.html). - pub const AWS_S3_ADDRESSING_STYLE: &str = "AWS_S3_ADDRESSING_STYLE"; - /// Locking provider to use for safe atomic rename. - /// `dynamodb` is currently the only supported locking provider. - /// If not set, safe atomic rename is not available. - pub const AWS_S3_LOCKING_PROVIDER: &str = "AWS_S3_LOCKING_PROVIDER"; - /// The role to assume for S3 writes. - pub const AWS_S3_ASSUME_ROLE_ARN: &str = "AWS_S3_ASSUME_ROLE_ARN"; - /// The role session name to use when a role is assumed. If not provided a random session name is generated. - pub const AWS_S3_ROLE_SESSION_NAME: &str = "AWS_S3_ROLE_SESSION_NAME"; - /// The `pool_idle_timeout` option of aws http client. Has to be lower than 20 seconds, which is - /// default S3 server timeout . - /// However, since rusoto uses hyper as a client, its default timeout is 90 seconds - /// . - /// Hence, the `connection closed before message completed` could occur. - /// To avoid that, the default value of this setting is 15 seconds if it's not set otherwise. - pub const AWS_S3_POOL_IDLE_TIMEOUT_SECONDS: &str = "AWS_S3_POOL_IDLE_TIMEOUT_SECONDS"; - /// The `pool_idle_timeout` for the as3_constants sts client. See - /// the reasoning in `AWS_S3_POOL_IDLE_TIMEOUT_SECONDS`. - pub const AWS_STS_POOL_IDLE_TIMEOUT_SECONDS: &str = "AWS_STS_POOL_IDLE_TIMEOUT_SECONDS"; - /// The number of retries for S3 GET requests failed with 500 Internal Server Error. - pub const AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES: &str = - "AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES"; - /// The web identity token file to use when using a web identity provider. - /// NOTE: web identity related options are set in the environment when - /// creating an instance of [crate::storage::s3::S3StorageOptions]. - /// See also . - pub const AWS_WEB_IDENTITY_TOKEN_FILE: &str = "AWS_WEB_IDENTITY_TOKEN_FILE"; - /// The role name to use for web identity. - /// NOTE: web identity related options are set in the environment when - /// creating an instance of [crate::storage::s3::S3StorageOptions]. - /// See also . - pub const AWS_ROLE_ARN: &str = "AWS_ROLE_ARN"; - /// The role session name to use for web identity. - /// NOTE: web identity related options are set in the environment when - /// creating an instance of [crate::storage::s3::S3StorageOptions]. - /// See also . 
- pub const AWS_ROLE_SESSION_NAME: &str = "AWS_ROLE_SESSION_NAME"; - /// Allow http connections - mainly useful for integration tests - pub const AWS_ALLOW_HTTP: &str = "AWS_ALLOW_HTTP"; - - /// If set to "true", allows creating commits without concurrent writer protection. - /// Only safe if there is one writer to a given table. - pub const AWS_S3_ALLOW_UNSAFE_RENAME: &str = "AWS_S3_ALLOW_UNSAFE_RENAME"; - - /// If set to "true", disables the imds client - /// Defaults to "true" - pub const AWS_EC2_METADATA_DISABLED: &str = "AWS_EC2_METADATA_DISABLED"; - - /// The timeout in milliseconds for the EC2 metadata endpoint - /// Defaults to 100 - pub const AWS_EC2_METADATA_TIMEOUT: &str = "AWS_EC2_METADATA_TIMEOUT"; - - /// The list of option keys owned by the S3 module. - /// Option keys not contained in this list will be added to the `extra_opts` - /// field of [crate::storage::s3::S3StorageOptions]. - pub const S3_OPTS: &[&str] = &[ - AWS_ENDPOINT_URL, - AWS_REGION, - AWS_PROFILE, - AWS_ACCESS_KEY_ID, - AWS_SECRET_ACCESS_KEY, - AWS_SESSION_TOKEN, - AWS_S3_LOCKING_PROVIDER, - AWS_S3_ASSUME_ROLE_ARN, - AWS_S3_ROLE_SESSION_NAME, - AWS_WEB_IDENTITY_TOKEN_FILE, - AWS_ROLE_ARN, - AWS_ROLE_SESSION_NAME, - AWS_S3_POOL_IDLE_TIMEOUT_SECONDS, - AWS_STS_POOL_IDLE_TIMEOUT_SECONDS, - AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES, - AWS_EC2_METADATA_DISABLED, - AWS_EC2_METADATA_TIMEOUT, - ]; + pub use crate::constants::*; } pub(crate) fn str_option(map: &HashMap, key: &str) -> Option { @@ -523,11 +459,9 @@ pub(crate) fn str_option(map: &HashMap, key: &str) -> Option "http://localhost:1234".to_string(), - s3_constants::AWS_REGION.to_string() => "us-west-2".to_string(), - s3_constants::AWS_PROFILE.to_string() => "default".to_string(), - s3_constants::AWS_S3_ADDRESSING_STYLE.to_string() => "virtual".to_string(), - s3_constants::AWS_S3_LOCKING_PROVIDER.to_string() => "another_locking_provider".to_string(), - s3_constants::AWS_S3_ASSUME_ROLE_ARN.to_string() => "arn:aws:iam::123456789012:role/another_role".to_string(), - s3_constants::AWS_S3_ROLE_SESSION_NAME.to_string() => "another_session_name".to_string(), - s3_constants::AWS_WEB_IDENTITY_TOKEN_FILE.to_string() => "another_token_file".to_string(), - s3_constants::AWS_S3_POOL_IDLE_TIMEOUT_SECONDS.to_string() => "1".to_string(), - s3_constants::AWS_STS_POOL_IDLE_TIMEOUT_SECONDS.to_string() => "2".to_string(), - s3_constants::AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES.to_string() => "3".to_string(), - s3_constants::AWS_ACCESS_KEY_ID.to_string() => "test_id".to_string(), - s3_constants::AWS_SECRET_ACCESS_KEY.to_string() => "test_secret".to_string(), + constants::AWS_ENDPOINT_URL.to_string() => "http://localhost:1234".to_string(), + constants::AWS_REGION.to_string() => "us-west-2".to_string(), + constants::AWS_PROFILE.to_string() => "default".to_string(), + constants::AWS_S3_ADDRESSING_STYLE.to_string() => "virtual".to_string(), + constants::AWS_S3_LOCKING_PROVIDER.to_string() => "another_locking_provider".to_string(), + constants::AWS_S3_ASSUME_ROLE_ARN.to_string() => "arn:aws:iam::123456789012:role/another_role".to_string(), + constants::AWS_S3_ROLE_SESSION_NAME.to_string() => "another_session_name".to_string(), + constants::AWS_WEB_IDENTITY_TOKEN_FILE.to_string() => "another_token_file".to_string(), + constants::AWS_S3_POOL_IDLE_TIMEOUT_SECONDS.to_string() => "1".to_string(), + constants::AWS_STS_POOL_IDLE_TIMEOUT_SECONDS.to_string() => "2".to_string(), + constants::AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES.to_string() => "3".to_string(), + 
constants::AWS_ACCESS_KEY_ID.to_string() => "test_id".to_string(), + constants::AWS_SECRET_ACCESS_KEY.to_string() => "test_secret".to_string(), }).unwrap(); assert_eq!( - S3StorageOptions { - sdk_config: SdkConfig::builder() - .endpoint_url("http://localhost:1234".to_string()) - .region(Region::from_static("us-west-2")) - .build(), - virtual_hosted_style_request: true, - locking_provider: Some("another_locking_provider".to_string()), - s3_pool_idle_timeout: Duration::from_secs(1), - sts_pool_idle_timeout: Duration::from_secs(2), - s3_get_internal_server_error_retries: 3, - extra_opts: hashmap! { - s3_constants::AWS_S3_ADDRESSING_STYLE.to_string() => "virtual".to_string() - }, - allow_unsafe_rename: false, + Some("another_locking_provider"), + options.locking_provider.as_deref() + ); + assert_eq!(Duration::from_secs(1), options.s3_pool_idle_timeout); + assert_eq!(Duration::from_secs(2), options.sts_pool_idle_timeout); + assert_eq!(3, options.s3_get_internal_server_error_retries); + assert!(options.virtual_hosted_style_request); + assert!(!options.allow_unsafe_rename); + assert_eq!( + hashmap! { + constants::AWS_S3_ADDRESSING_STYLE.to_string() => "virtual".to_string() }, - options + options.extra_opts + ); + }); + } + + #[test] + #[serial] + fn storage_options_from_map_with_dynamodb_endpoint_test() { + ScopedEnv::run(|| { + clear_env_of_aws_keys(); + let options = S3StorageOptions::from_map(&hashmap! { + constants::AWS_ENDPOINT_URL.to_string() => "http://localhost:1234".to_string(), + constants::AWS_ENDPOINT_URL_DYNAMODB.to_string() => "http://localhost:2345".to_string(), + constants::AWS_REGION.to_string() => "us-west-2".to_string(), + constants::AWS_PROFILE.to_string() => "default".to_string(), + constants::AWS_S3_ADDRESSING_STYLE.to_string() => "virtual".to_string(), + constants::AWS_S3_LOCKING_PROVIDER.to_string() => "another_locking_provider".to_string(), + constants::AWS_S3_ASSUME_ROLE_ARN.to_string() => "arn:aws:iam::123456789012:role/another_role".to_string(), + constants::AWS_S3_ROLE_SESSION_NAME.to_string() => "another_session_name".to_string(), + constants::AWS_WEB_IDENTITY_TOKEN_FILE.to_string() => "another_token_file".to_string(), + constants::AWS_S3_POOL_IDLE_TIMEOUT_SECONDS.to_string() => "1".to_string(), + constants::AWS_STS_POOL_IDLE_TIMEOUT_SECONDS.to_string() => "2".to_string(), + constants::AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES.to_string() => "3".to_string(), + constants::AWS_ACCESS_KEY_ID.to_string() => "test_id".to_string(), + constants::AWS_SECRET_ACCESS_KEY.to_string() => "test_secret".to_string(), + }).unwrap(); + + assert_eq!( + Some("http://localhost:2345"), + options.dynamodb_endpoint.as_deref() ); }); } @@ -692,38 +663,45 @@ mod tests { fn storage_options_mixed_test() { ScopedEnv::run(|| { clear_env_of_aws_keys(); - std::env::set_var(s3_constants::AWS_ENDPOINT_URL, "http://localhost"); - std::env::set_var(s3_constants::AWS_REGION, "us-west-1"); - std::env::set_var(s3_constants::AWS_PROFILE, "default"); - std::env::set_var(s3_constants::AWS_ACCESS_KEY_ID, "wrong_key_id"); - std::env::set_var(s3_constants::AWS_SECRET_ACCESS_KEY, "wrong_secret_key"); - std::env::set_var(s3_constants::AWS_S3_LOCKING_PROVIDER, "dynamodb"); + std::env::set_var(constants::AWS_ENDPOINT_URL, "http://localhost"); std::env::set_var( - s3_constants::AWS_S3_ASSUME_ROLE_ARN, + constants::AWS_ENDPOINT_URL_DYNAMODB, + "http://localhost:dynamodb", + ); + std::env::set_var(constants::AWS_REGION, "us-west-1"); + std::env::set_var(constants::AWS_PROFILE, "default"); + 
std::env::set_var(constants::AWS_ACCESS_KEY_ID, "wrong_key_id"); + std::env::set_var(constants::AWS_SECRET_ACCESS_KEY, "wrong_secret_key"); + std::env::set_var(constants::AWS_S3_LOCKING_PROVIDER, "dynamodb"); + std::env::set_var( + constants::AWS_S3_ASSUME_ROLE_ARN, "arn:aws:iam::123456789012:role/some_role", ); - std::env::set_var(s3_constants::AWS_S3_ROLE_SESSION_NAME, "session_name"); - std::env::set_var(s3_constants::AWS_WEB_IDENTITY_TOKEN_FILE, "token_file"); + std::env::set_var(constants::AWS_S3_ROLE_SESSION_NAME, "session_name"); + std::env::set_var(constants::AWS_WEB_IDENTITY_TOKEN_FILE, "token_file"); - std::env::set_var(s3_constants::AWS_S3_POOL_IDLE_TIMEOUT_SECONDS, "1"); - std::env::set_var(s3_constants::AWS_STS_POOL_IDLE_TIMEOUT_SECONDS, "2"); - std::env::set_var(s3_constants::AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES, "3"); + std::env::set_var(constants::AWS_S3_POOL_IDLE_TIMEOUT_SECONDS, "1"); + std::env::set_var(constants::AWS_STS_POOL_IDLE_TIMEOUT_SECONDS, "2"); + std::env::set_var(constants::AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES, "3"); let options = S3StorageOptions::from_map(&hashmap! { - s3_constants::AWS_ACCESS_KEY_ID.to_string() => "test_id_mixed".to_string(), - s3_constants::AWS_SECRET_ACCESS_KEY.to_string() => "test_secret_mixed".to_string(), - s3_constants::AWS_REGION.to_string() => "us-west-2".to_string(), + constants::AWS_ACCESS_KEY_ID.to_string() => "test_id_mixed".to_string(), + constants::AWS_SECRET_ACCESS_KEY.to_string() => "test_secret_mixed".to_string(), + constants::AWS_REGION.to_string() => "us-west-2".to_string(), "AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES".to_string() => "3".to_string(), }) .unwrap(); assert_eq!( S3StorageOptions { - sdk_config: SdkConfig::builder() - .endpoint_url("http://localhost".to_string()) - .region(Region::from_static("us-west-2")) - .build(), + sdk_config: Some( + SdkConfig::builder() + .endpoint_url("http://localhost".to_string()) + .region(Region::from_static("us-west-2")) + .build() + ), virtual_hosted_style_request: false, locking_provider: Some("dynamodb".to_string()), + dynamodb_endpoint: Some("http://localhost:dynamodb".to_string()), s3_pool_idle_timeout: Duration::from_secs(1), sts_pool_idle_timeout: Duration::from_secs(2), s3_get_internal_server_error_retries: 3, @@ -741,30 +719,27 @@ mod tests { ScopedEnv::run(|| { clear_env_of_aws_keys(); let _options = S3StorageOptions::from_map(&hashmap! 
{ - s3_constants::AWS_REGION.to_string() => "eu-west-1".to_string(), - s3_constants::AWS_WEB_IDENTITY_TOKEN_FILE.to_string() => "web_identity_token_file".to_string(), - s3_constants::AWS_ROLE_ARN.to_string() => "arn:aws:iam::123456789012:role/web_identity_role".to_string(), - s3_constants::AWS_ROLE_SESSION_NAME.to_string() => "web_identity_session_name".to_string(), + constants::AWS_REGION.to_string() => "eu-west-1".to_string(), + constants::AWS_WEB_IDENTITY_TOKEN_FILE.to_string() => "web_identity_token_file".to_string(), + constants::AWS_ROLE_ARN.to_string() => "arn:aws:iam::123456789012:role/web_identity_role".to_string(), + constants::AWS_ROLE_SESSION_NAME.to_string() => "web_identity_session_name".to_string(), }).unwrap(); - assert_eq!( - "eu-west-1", - std::env::var(s3_constants::AWS_REGION).unwrap() - ); + assert_eq!("eu-west-1", std::env::var(constants::AWS_REGION).unwrap()); assert_eq!( "web_identity_token_file", - std::env::var(s3_constants::AWS_WEB_IDENTITY_TOKEN_FILE).unwrap() + std::env::var(constants::AWS_WEB_IDENTITY_TOKEN_FILE).unwrap() ); assert_eq!( "arn:aws:iam::123456789012:role/web_identity_role", - std::env::var(s3_constants::AWS_ROLE_ARN).unwrap() + std::env::var(constants::AWS_ROLE_ARN).unwrap() ); assert_eq!( "web_identity_session_name", - std::env::var(s3_constants::AWS_ROLE_SESSION_NAME).unwrap() + std::env::var(constants::AWS_ROLE_SESSION_NAME).unwrap() ); }); } @@ -776,10 +751,10 @@ mod tests { clear_env_of_aws_keys(); let raw_options = hashmap! {}; - std::env::set_var(s3_constants::AWS_ACCESS_KEY_ID, "env_key"); - std::env::set_var(s3_constants::AWS_ENDPOINT_URL, "env_key"); - std::env::set_var(s3_constants::AWS_SECRET_ACCESS_KEY, "env_key"); - std::env::set_var(s3_constants::AWS_REGION, "env_key"); + std::env::set_var(constants::AWS_ACCESS_KEY_ID, "env_key"); + std::env::set_var(constants::AWS_ENDPOINT_URL, "env_key"); + std::env::set_var(constants::AWS_SECRET_ACCESS_KEY, "env_key"); + std::env::set_var(constants::AWS_REGION, "env_key"); let combined_options = S3ObjectStoreFactory {}.with_env_s3(&StorageOptions(raw_options)); @@ -818,47 +793,22 @@ mod tests { }); } - #[tokio::test] - #[serial] - async fn storage_options_toggle_imds() { - ScopedEnv::run_async(async { - clear_env_of_aws_keys(); - let disabled_time = storage_options_configure_imds(Some("true")).await; - let enabled_time = storage_options_configure_imds(Some("false")).await; - let default_time = storage_options_configure_imds(None).await; - println!( - "enabled_time: {}, disabled_time: {}, default_time: {}", - enabled_time.as_micros(), - disabled_time.as_micros(), - default_time.as_micros(), - ); - assert!(disabled_time < enabled_time); - assert!(default_time < enabled_time); - }) - .await; - } + #[test] + fn test_is_aws() { + let options = StorageOptions::default(); + assert!(is_aws(&options)); - async fn storage_options_configure_imds(value: Option<&str>) -> Duration { - let _options = match value { - Some(value) => S3StorageOptions::from_map(&hashmap! { - s3_constants::AWS_REGION.to_string() => "eu-west-1".to_string(), - s3_constants::AWS_EC2_METADATA_DISABLED.to_string() => value.to_string(), - }) - .unwrap(), - None => S3StorageOptions::from_map(&hashmap! { - s3_constants::AWS_REGION.to_string() => "eu-west-1".to_string(), - }) - .unwrap(), + let minio: HashMap = hashmap! 
{ + crate::constants::AWS_ENDPOINT_URL.to_string() => "http://minio:8080".to_string(), }; + let options = StorageOptions::from(minio); + assert!(!is_aws(&options)); - assert_eq!( - "eu-west-1", - std::env::var(s3_constants::AWS_REGION).unwrap() - ); - - let provider = _options.sdk_config.credentials_provider().unwrap(); - let now = SystemTime::now(); - _ = provider.provide_credentials().await; - now.elapsed().unwrap() + let localstack: HashMap = hashmap! { + crate::constants::AWS_FORCE_CREDENTIAL_LOAD.to_string() => "true".to_string(), + crate::constants::AWS_ENDPOINT_URL.to_string() => "http://minio:8080".to_string(), + }; + let options = StorageOptions::from(localstack); + assert!(is_aws(&options)); } } diff --git a/crates/aws/tests/common.rs b/crates/aws/tests/common.rs index 01aa505b1b..dfa2a9cd51 100644 --- a/crates/aws/tests/common.rs +++ b/crates/aws/tests/common.rs @@ -87,7 +87,7 @@ impl S3Integration { "dynamodb", "create-table", "--table-name", - &table_name, + table_name, "--provisioned-throughput", "ReadCapacityUnits=1,WriteCapacityUnits=1", "--attribute-definitions", @@ -112,7 +112,7 @@ impl S3Integration { } fn wait_for_table(table_name: &str) -> std::io::Result<()> { - let args = ["dynamodb", "describe-table", "--table-name", &table_name]; + let args = ["dynamodb", "describe-table", "--table-name", table_name]; loop { let output = Command::new("aws") .args(args) @@ -145,7 +145,7 @@ impl S3Integration { fn delete_dynamodb_table(table_name: &str) -> std::io::Result { let mut child = Command::new("aws") - .args(["dynamodb", "delete-table", "--table-name", &table_name]) + .args(["dynamodb", "delete-table", "--table-name", table_name]) .stdout(Stdio::null()) .spawn() .expect("aws command is installed"); diff --git a/crates/aws/tests/integration_s3_dynamodb.rs b/crates/aws/tests/integration_s3_dynamodb.rs index eb674c4235..da0b0e06c8 100644 --- a/crates/aws/tests/integration_s3_dynamodb.rs +++ b/crates/aws/tests/integration_s3_dynamodb.rs @@ -10,7 +10,8 @@ use deltalake_aws::logstore::{RepairLogEntryResult, S3DynamoDbLogStore}; use deltalake_aws::storage::S3StorageOptions; use deltalake_aws::{CommitEntry, DynamoDbConfig, DynamoDbLockClient}; use deltalake_core::kernel::{Action, Add, DataType, PrimitiveType, StructField, StructType}; -use deltalake_core::logstore::LogStore; +use deltalake_core::logstore::{logstore_for, CommitOrBytes, LogStore}; +use deltalake_core::operations::create::CreateBuilder; use deltalake_core::operations::transaction::CommitBuilder; use deltalake_core::protocol::{DeltaOperation, SaveMode}; use deltalake_core::storage::commit_uri_from_version; @@ -22,6 +23,11 @@ use lazy_static::lazy_static; use object_store::path::Path; use serde_json::Value; use serial_test::serial; +use tracing::log::*; + +use maplit::hashmap; +use object_store::{PutOptions, PutPayload}; +use url::Url; mod common; use common::*; @@ -38,7 +44,8 @@ lazy_static! 
{ fn make_client() -> TestResult { let options: S3StorageOptions = S3StorageOptions::try_default().unwrap(); Ok(DynamoDbLockClient::try_new( - &options.sdk_config, + &options.sdk_config.unwrap(), + None, None, None, None, @@ -68,7 +75,7 @@ fn client_configs_via_env_variables() -> TestResult<()> { billing_mode: BillingMode::PayPerRequest, lock_table_name: "some_table".to_owned(), max_elapsed_request_time: Duration::from_secs(64), - sdk_config: options.sdk_config, + sdk_config: options.sdk_config.unwrap(), }, *config, ); @@ -78,6 +85,48 @@ fn client_configs_via_env_variables() -> TestResult<()> { Ok(()) } +#[tokio::test] +#[serial] +async fn test_create_s3_table() -> TestResult<()> { + let _ = pretty_env_logger::try_init(); + let context = IntegrationContext::new(Box::new(S3Integration::default()))?; + let _client = make_client()?; + let table_name = format!("{}_{}", "create_test", uuid::Uuid::new_v4()); + let table_uri = context.uri_for_table(TestTables::Custom(table_name.to_owned())); + + let schema = StructType::new(vec![StructField::new( + "id".to_string(), + DataType::Primitive(PrimitiveType::Integer), + true, + )]); + let storage_options: HashMap = hashmap! { + deltalake_aws::constants::AWS_ALLOW_HTTP.into() => "true".into(), + // Despite not being in AWS, we should force credential resolution + deltalake_aws::constants::AWS_FORCE_CREDENTIAL_LOAD.into() => "true".into(), + deltalake_aws::constants::AWS_ENDPOINT_URL.into() => "http://localhost:4566".into(), + }; + let log_store = logstore_for(Url::parse(&table_uri)?, storage_options, None)?; + + let payload = PutPayload::from_static(b"test-drivin"); + let _put = log_store + .object_store() + .put_opts( + &Path::from("_delta_log/_commit_failed.tmp"), + payload, + PutOptions::default(), + ) + .await?; + + debug!("creating a CreateBuilder"); + let _created = CreateBuilder::new() + .with_log_store(log_store) + .with_partition_columns(vec!["id"]) + .with_columns(schema.fields().cloned()) + .with_save_mode(SaveMode::Ignore) + .await?; + Ok(()) +} + #[tokio::test] #[serial] async fn get_missing_item() -> TestResult<()> { @@ -197,7 +246,10 @@ async fn test_abort_commit_entry() -> TestResult<()> { let entry = create_incomplete_commit_entry(&table, 1, "unfinished_commit").await?; log_store - .abort_commit_entry(entry.version, &entry.temp_path) + .abort_commit_entry( + entry.version, + CommitOrBytes::TmpCommit(entry.temp_path.clone()), + ) .await?; // The entry should have been aborted - the latest entry should be one version lower @@ -212,7 +264,7 @@ async fn test_abort_commit_entry() -> TestResult<()> { // Test abort commit is idempotent - still works if already aborted log_store - .abort_commit_entry(entry.version, &entry.temp_path) + .abort_commit_entry(entry.version, CommitOrBytes::TmpCommit(entry.temp_path)) .await?; Ok(()) @@ -243,7 +295,10 @@ async fn test_abort_commit_entry_fail_to_delete_entry() -> TestResult<()> { // Abort will fail since we marked the entry as complete assert!(matches!( log_store - .abort_commit_entry(entry.version, &entry.temp_path) + .abort_commit_entry( + entry.version, + CommitOrBytes::TmpCommit(entry.temp_path.clone()) + ) .await, Err(_), )); @@ -345,7 +400,12 @@ async fn create_incomplete_commit_entry( .into_prepared_commit_future() .await?; - let commit_entry = CommitEntry::new(version, prepared.path().to_owned()); + let tmp_commit = match prepared.commit_or_bytes() { + CommitOrBytes::TmpCommit(tmp_commit) => tmp_commit, + _ => unreachable!(), + }; + + let commit_entry = CommitEntry::new(version, 
tmp_commit.to_owned()); make_client()? .put_commit_entry(&table.table_uri(), &commit_entry) .await?; @@ -390,7 +450,7 @@ async fn prepare_table(context: &IntegrationContext, table_name: &str) -> TestRe // create delta table let table = DeltaOps(table) .create() - .with_columns(schema.fields().clone()) + .with_columns(schema.fields().cloned()) .await?; println!("table created: {table:?}"); Ok(table) diff --git a/crates/aws/tests/repair_s3_rename_test.rs b/crates/aws/tests/repair_s3_rename_test.rs index 68d8727ebe..d9e19de7b7 100644 --- a/crates/aws/tests/repair_s3_rename_test.rs +++ b/crates/aws/tests/repair_s3_rename_test.rs @@ -9,6 +9,7 @@ use deltalake_core::storage::object_store::{ use deltalake_core::{DeltaTableBuilder, ObjectStore, Path}; use deltalake_test::utils::IntegrationContext; use futures::stream::BoxStream; +use object_store::{MultipartUpload, PutMultipartOpts, PutPayload}; use serial_test::serial; use std::ops::Range; use std::sync::{Arc, Mutex}; @@ -60,8 +61,8 @@ async fn run_repair_test_case(path: &str, pause_copy: bool) -> Result<(), Object }; let (s3_2, _) = create_s3_backend(&context, "w2", None, None); - s3_1.put(&src1, Bytes::from("test1")).await.unwrap(); - s3_2.put(&src2, Bytes::from("test2")).await.unwrap(); + s3_1.put(&src1, Bytes::from("test1").into()).await.unwrap(); + s3_2.put(&src2, Bytes::from("test2").into()).await.unwrap(); let rename1 = rename(s3_1, &src1, &dst1); // to ensure that first one is started actually first @@ -166,14 +167,14 @@ impl ObjectStore for DelayedObjectStore { self.delete(from).await } - async fn put(&self, location: &Path, bytes: Bytes) -> ObjectStoreResult { + async fn put(&self, location: &Path, bytes: PutPayload) -> ObjectStoreResult { self.inner.put(location, bytes).await } async fn put_opts( &self, location: &Path, - bytes: Bytes, + bytes: PutPayload, options: PutOptions, ) -> ObjectStoreResult { self.inner.put_opts(location, bytes, options).await @@ -227,19 +228,16 @@ impl ObjectStore for DelayedObjectStore { self.inner.rename_if_not_exists(from, to).await } - async fn put_multipart( - &self, - location: &Path, - ) -> ObjectStoreResult<(MultipartId, Box)> { + async fn put_multipart(&self, location: &Path) -> ObjectStoreResult> { self.inner.put_multipart(location).await } - async fn abort_multipart( + async fn put_multipart_opts( &self, location: &Path, - multipart_id: &MultipartId, - ) -> ObjectStoreResult<()> { - self.inner.abort_multipart(location, multipart_id).await + options: PutMultipartOpts, + ) -> ObjectStoreResult> { + self.inner.put_multipart_opts(location, options).await } } diff --git a/crates/azure/Cargo.toml b/crates/azure/Cargo.toml index cbe55a1b83..87a744d608 100644 --- a/crates/azure/Cargo.toml +++ b/crates/azure/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "deltalake-azure" -version = "0.1.2" +version = "0.3.0" authors.workspace = true keywords.workspace = true readme.workspace = true @@ -12,7 +12,7 @@ repository.workspace = true rust-version.workspace = true [dependencies] -deltalake-core = { version = ">=0.17.0, <0.19.0", path = "../core" } +deltalake-core = { version = "0.20.0", path = "../core" } lazy_static = "1" # workspace depenndecies diff --git a/crates/azure/tests/integration.rs b/crates/azure/tests/integration.rs index 5230462c92..3ffaa00cc5 100644 --- a/crates/azure/tests/integration.rs +++ b/crates/azure/tests/integration.rs @@ -75,7 +75,10 @@ async fn read_write_test_onelake(context: &IntegrationContext, path: &Path) -> T let expected = Bytes::from_static(b"test world from delta-rs on friday"); - 
delta_store.put(path, expected.clone()).await.unwrap(); + delta_store + .put(path, expected.clone().into()) + .await + .unwrap(); let fetched = delta_store.get(path).await.unwrap().bytes().await.unwrap(); assert_eq!(expected, fetched); diff --git a/crates/benchmarks/src/bin/merge.rs b/crates/benchmarks/src/bin/merge.rs index bb178a192d..2465e23d94 100644 --- a/crates/benchmarks/src/bin/merge.rs +++ b/crates/benchmarks/src/bin/merge.rs @@ -7,9 +7,10 @@ use arrow::datatypes::Schema as ArrowSchema; use arrow_array::{RecordBatch, StringArray, UInt32Array}; use chrono::Duration; use clap::{command, Args, Parser, Subcommand}; +use datafusion::functions::expr_fn::random; use datafusion::{datasource::MemTable, prelude::DataFrame}; use datafusion_common::DataFusionError; -use datafusion_expr::{cast, col, lit, random}; +use datafusion_expr::{cast, col, lit}; use deltalake_core::protocol::SaveMode; use deltalake_core::{ arrow::{ diff --git a/crates/catalog-glue/Cargo.toml b/crates/catalog-glue/Cargo.toml index c757563c1b..549b3a11c8 100644 --- a/crates/catalog-glue/Cargo.toml +++ b/crates/catalog-glue/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "deltalake-catalog-glue" -version = "0.1.0" +version = "0.4.0" authors.workspace = true keywords.workspace = true readme.workspace = true @@ -15,9 +15,7 @@ rust-version.workspace = true async-trait = { workspace = true } aws-config = "1" aws-sdk-glue = "1" -deltalake-core = { version = ">=0.17.0, <0.19.0", path = "../core" } -# This can depend on a lowest common denominator of core once that's released -# deltalake_core = { version = "0.17.0" } +deltalake-core = { version = "0.20.0", path = "../core" } thiserror = { workspace = true } [dev-dependencies] diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml index 8056c85f29..52df035c71 100644 --- a/crates/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "deltalake-core" -version = "0.18.0" +version = "0.20.0" authors.workspace = true keywords.workspace = true readme.workspace = true @@ -15,6 +15,8 @@ rust-version.workspace = true features = ["datafusion", "json", "unity-experimental"] [dependencies] +delta_kernel.workspace = true + # arrow arrow = { workspace = true } arrow-arith = { workspace = true } @@ -40,8 +42,9 @@ datafusion-common = { workspace = true, optional = true } datafusion-proto = { workspace = true, optional = true } datafusion-sql = { workspace = true, optional = true } datafusion-physical-expr = { workspace = true, optional = true } +datafusion-physical-plan = { workspace = true, optional = true } datafusion-functions = { workspace = true, optional = true } -datafusion-functions-array = { workspace = true, optional = true } +datafusion-functions-aggregate = { workspace = true, optional = true } # serde serde = { workspace = true, features = ["derive"] } @@ -55,6 +58,7 @@ regex = { workspace = true } thiserror = { workspace = true } uuid = { workspace = true, features = ["serde", "v4"] } url = { workspace = true } +urlencoding = { workspace = true} # runtime async-trait = { workspace = true } @@ -62,8 +66,10 @@ futures = { workspace = true } num_cpus = { workspace = true } tokio = { workspace = true, features = [ "macros", + "process", "rt", "rt-multi-thread", + "signal", "sync", "fs", "parking_lot", @@ -71,7 +77,7 @@ tokio = { workspace = true, features = [ # other deps (these should be organized and pulled into workspace.dependencies as necessary) cfg-if = "1" -dashmap = "5" +dashmap = "6" errno = "0.3" either = "1.8" fix-hidden-lifetime-bug = "0.2" @@ 
-91,19 +97,20 @@ tracing = { workspace = true } rand = "0.8" z85 = "3.0.5" maplit = "1" +sqlparser = { version = "0.51" } # Unity reqwest = { version = "0.11.18", default-features = false, features = [ "rustls-tls", "json", ], optional = true } -sqlparser = { version = "0.46", optional = true } [dev-dependencies] criterion = "0.5" ctor = "0" deltalake-test = { path = "../test", features = ["datafusion"] } dotenvy = "0" +fs_extra = "1.2.0" hyper = { version = "0.14", features = ["server"] } maplit = "1" pretty_assertions = "1.2.1" @@ -115,17 +122,18 @@ tokio = { version = "1", features = ["macros", "rt-multi-thread"] } utime = "0.3" [features] -default = [] +cdf = [] +default = ["cdf"] datafusion = [ "dep:datafusion", "datafusion-expr", "datafusion-common", "datafusion-proto", "datafusion-physical-expr", + "datafusion-physical-plan", "datafusion-sql", "datafusion-functions", - "datafusion-functions-array", - "sqlparser", + "datafusion-functions-aggregate", ] datafusion-ext = ["datafusion"] json = ["parquet/json"] diff --git a/crates/core/src/data_catalog/storage/mod.rs b/crates/core/src/data_catalog/storage/mod.rs index fc30f32144..7b0b779069 100644 --- a/crates/core/src/data_catalog/storage/mod.rs +++ b/crates/core/src/data_catalog/storage/mod.rs @@ -6,7 +6,7 @@ use std::sync::Arc; use async_trait::async_trait; use dashmap::DashMap; -use datafusion::catalog::schema::SchemaProvider; +use datafusion::catalog::SchemaProvider; use datafusion::datasource::TableProvider; use datafusion_common::DataFusionError; use futures::TryStreamExt; @@ -147,7 +147,8 @@ impl SchemaProvider for ListingSchemaProvider { mod tests { use super::*; use datafusion::assert_batches_sorted_eq; - use datafusion::catalog::{CatalogProvider, MemoryCatalogProvider}; + use datafusion::catalog::CatalogProvider; + use datafusion::catalog_common::MemoryCatalogProvider; use datafusion::execution::context::SessionContext; #[test] diff --git a/crates/core/src/data_catalog/unity/datafusion.rs b/crates/core/src/data_catalog/unity/datafusion.rs index 6b6a4b4a63..44e7c9ca33 100644 --- a/crates/core/src/data_catalog/unity/datafusion.rs +++ b/crates/core/src/data_catalog/unity/datafusion.rs @@ -5,7 +5,7 @@ use std::collections::HashMap; use std::sync::Arc; use dashmap::DashMap; -use datafusion::catalog::schema::SchemaProvider; +use datafusion::catalog::SchemaProvider; use datafusion::catalog::{CatalogProvider, CatalogProviderList}; use datafusion::datasource::TableProvider; use datafusion_common::DataFusionError; diff --git a/crates/core/src/data_catalog/unity/models.rs b/crates/core/src/data_catalog/unity/models.rs index 265149b969..2066a4ee86 100644 --- a/crates/core/src/data_catalog/unity/models.rs +++ b/crates/core/src/data_catalog/unity/models.rs @@ -252,8 +252,8 @@ pub enum TableType { StreamingTable, } -/// #[derive(Deserialize)] +/// Summary of the table pub struct TableSummary { /// The full name of the table. pub full_name: String, diff --git a/crates/core/src/delta_datafusion/cdf/mod.rs b/crates/core/src/delta_datafusion/cdf/mod.rs index 02382aa725..e561fc2152 100644 --- a/crates/core/src/delta_datafusion/cdf/mod.rs +++ b/crates/core/src/delta_datafusion/cdf/mod.rs @@ -1,13 +1,13 @@ //! 
Logical operators and physical executions for CDF +use std::collections::HashMap; use arrow_schema::{DataType, Field, TimeUnit}; use lazy_static::lazy_static; -use std::collections::HashMap; -pub(crate) use scan::*; -pub(crate) use scan_utils::*; - -use crate::kernel::{Add, AddCDCFile}; +pub(crate) use self::scan::*; +pub(crate) use self::scan_utils::*; +use crate::kernel::{Add, AddCDCFile, Remove}; +use crate::DeltaResult; mod scan; mod scan_utils; @@ -59,37 +59,73 @@ impl CdcDataSpec { /// This trait defines a generic set of operations used by CDF Reader pub trait FileAction { /// Adds partition values - fn partition_values(&self) -> &HashMap>; + fn partition_values(&self) -> DeltaResult<&HashMap>>; /// Physical Path to the data fn path(&self) -> String; /// Byte size of the physical file - fn size(&self) -> usize; + fn size(&self) -> DeltaResult; } impl FileAction for Add { - fn partition_values(&self) -> &HashMap> { - &self.partition_values + fn partition_values(&self) -> DeltaResult<&HashMap>> { + Ok(&self.partition_values) } fn path(&self) -> String { self.path.clone() } - fn size(&self) -> usize { - self.size as usize + fn size(&self) -> DeltaResult { + Ok(self.size as usize) } } impl FileAction for AddCDCFile { - fn partition_values(&self) -> &HashMap> { - &self.partition_values + fn partition_values(&self) -> DeltaResult<&HashMap>> { + Ok(&self.partition_values) + } + + fn path(&self) -> String { + self.path.clone() + } + + fn size(&self) -> DeltaResult { + Ok(self.size as usize) + } +} + +impl FileAction for Remove { + fn partition_values(&self) -> DeltaResult<&HashMap>> { + // If extended_file_metadata is true, it should be required to have this filled in + if self.extended_file_metadata.unwrap_or_default() { + Ok(self.partition_values.as_ref().unwrap()) + } else { + match self.partition_values { + Some(ref part_map) => Ok(part_map), + _ => Err(crate::DeltaTableError::Protocol { + source: crate::protocol::ProtocolError::InvalidField( + "partition_values".to_string(), + ), + }), + } + } } fn path(&self) -> String { self.path.clone() } - fn size(&self) -> usize { - self.size as usize + fn size(&self) -> DeltaResult { + // If extended_file_metadata is true, it should be required to have this filled in + if self.extended_file_metadata.unwrap_or_default() { + Ok(self.size.unwrap() as usize) + } else { + match self.size { + Some(size) => Ok(size as usize), + _ => Err(crate::DeltaTableError::Protocol { + source: crate::protocol::ProtocolError::InvalidField("size".to_string()), + }), + } + } } } diff --git a/crates/core/src/delta_datafusion/cdf/scan.rs b/crates/core/src/delta_datafusion/cdf/scan.rs index 1f9c9f52b3..e5098bca72 100644 --- a/crates/core/src/delta_datafusion/cdf/scan.rs +++ b/crates/core/src/delta_datafusion/cdf/scan.rs @@ -4,7 +4,7 @@ use std::sync::Arc; use arrow_schema::SchemaRef; use datafusion::execution::{SendableRecordBatchStream, TaskContext}; -use datafusion::physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan}; +use datafusion_physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan}; /// Physical execution of a scan #[derive(Debug, Clone)] @@ -26,6 +26,10 @@ impl DisplayAs for DeltaCdfScan { } impl ExecutionPlan for DeltaCdfScan { + fn name(&self) -> &str { + Self::static_name() + } + fn as_any(&self) -> &dyn Any { self } @@ -38,7 +42,7 @@ impl ExecutionPlan for DeltaCdfScan { self.plan.properties() } - fn children(&self) -> Vec> { + fn children(&self) -> Vec<&Arc> { vec![] } diff --git a/crates/core/src/delta_datafusion/cdf/scan_utils.rs 
b/crates/core/src/delta_datafusion/cdf/scan_utils.rs index 434afa4f74..27285179f6 100644 --- a/crates/core/src/delta_datafusion/cdf/scan_utils.rs +++ b/crates/core/src/delta_datafusion/cdf/scan_utils.rs @@ -18,9 +18,9 @@ pub fn map_action_to_scalar( action: &F, part: &str, schema: SchemaRef, -) -> ScalarValue { - action - .partition_values() +) -> DeltaResult { + Ok(action + .partition_values()? .get(part) .map(|val| { schema @@ -36,7 +36,7 @@ pub fn map_action_to_scalar( }) .unwrap_or(ScalarValue::Null) }) - .unwrap_or(ScalarValue::Null) + .unwrap_or(ScalarValue::Null)) } pub fn create_spec_partition_values( @@ -67,7 +67,7 @@ pub fn create_partition_values( let partition_values = table_partition_cols .iter() .map(|part| map_action_to_scalar(&action, part, schema.clone())) - .collect::>(); + .collect::>>()?; let mut new_part_values = spec_partition_values.clone(); new_part_values.extend(partition_values); @@ -75,7 +75,7 @@ pub fn create_partition_values( let part = PartitionedFile { object_meta: ObjectMeta { location: Path::parse(action.path().as_str())?, - size: action.size(), + size: action.size()?, e_tag: None, last_modified: chrono::Utc.timestamp_nanos(0), version: None, @@ -83,6 +83,7 @@ pub fn create_partition_values( partition_values: new_part_values.clone(), extensions: None, range: None, + statistics: None, }; file_groups.entry(new_part_values).or_default().push(part); @@ -91,9 +92,9 @@ pub fn create_partition_values( Ok(file_groups) } -pub fn create_cdc_schema(mut schema_fields: Vec, include_type: bool) -> SchemaRef { +pub fn create_cdc_schema(mut schema_fields: Vec>, include_type: bool) -> SchemaRef { if include_type { - schema_fields.push(Field::new(CHANGE_TYPE_COL, DataType::Utf8, true)); + schema_fields.push(Field::new(CHANGE_TYPE_COL, DataType::Utf8, true).into()); } Arc::new(Schema::new(schema_fields)) } diff --git a/crates/core/src/delta_datafusion/expr.rs b/crates/core/src/delta_datafusion/expr.rs index 868969c571..eb542d98dd 100644 --- a/crates/core/src/delta_datafusion/expr.rs +++ b/crates/core/src/delta_datafusion/expr.rs @@ -20,39 +20,67 @@ // Display functions and required macros were pulled from https://github.com/apache/arrow-datafusion/blob/ddb95497e2792015d5a5998eec79aac8d37df1eb/datafusion/expr/src/expr.rs //! 
Utility functions for Datafusion's Expressions - -use std::{ - fmt::{self, Display, Error, Formatter, Write}, - sync::Arc, -}; +use std::fmt::{self, Display, Error, Formatter, Write}; +use std::sync::Arc; use arrow_schema::DataType; use chrono::{DateTime, NaiveDate}; use datafusion::execution::context::SessionState; +use datafusion::execution::session_state::SessionStateBuilder; +use datafusion::execution::FunctionRegistry; use datafusion_common::Result as DFResult; use datafusion_common::{config::ConfigOptions, DFSchema, Result, ScalarValue, TableReference}; -use datafusion_expr::{ - expr::InList, AggregateUDF, Between, BinaryExpr, Cast, Expr, GetIndexedField, Like, TableSource, -}; +use datafusion_expr::expr::InList; +use datafusion_expr::planner::ExprPlanner; +use datafusion_expr::{AggregateUDF, Between, BinaryExpr, Cast, Expr, Like, TableSource}; use datafusion_sql::planner::{ContextProvider, SqlToRel}; use datafusion_sql::sqlparser::ast::escape_quoted_string; use datafusion_sql::sqlparser::dialect::GenericDialect; use datafusion_sql::sqlparser::parser::Parser; use datafusion_sql::sqlparser::tokenizer::Tokenizer; -use crate::{DeltaResult, DeltaTableError}; - use super::DeltaParserOptions; +use crate::{DeltaResult, DeltaTableError}; pub(crate) struct DeltaContextProvider<'a> { - state: &'a SessionState, + state: SessionState, + /// Keeping this around just to make use of the 'a lifetime + _original: &'a SessionState, + planners: Vec>, +} + +impl<'a> DeltaContextProvider<'a> { + fn new(state: &'a SessionState) -> Self { + let planners = state.expr_planners(); + DeltaContextProvider { + planners, + // Creating a new session state with overridden scalar_functions since + // the get_field() UDF was dropped from the default scalar functions upstream in + // `36660fe10d9c0cdff62e0da0b94bee28422d3419` + state: SessionStateBuilder::new_from_existing(state.clone()) + .with_scalar_functions( + state + .scalar_functions() + .values() + .cloned() + .chain(std::iter::once(datafusion::functions::core::get_field())) + .collect(), + ) + .build(), + _original: state, + } + } } impl<'a> ContextProvider for DeltaContextProvider<'a> { - fn get_table_provider(&self, _name: TableReference) -> DFResult> { + fn get_table_source(&self, _name: TableReference) -> DFResult> { unimplemented!() } + fn get_expr_planners(&self) -> &[Arc] { + self.planners.as_slice() + } + fn get_function_meta(&self, name: &str) -> Option> { self.state.scalar_functions().get(name).cloned() } @@ -73,20 +101,16 @@ impl<'a> ContextProvider for DeltaContextProvider<'a> { self.state.window_functions().get(name).cloned() } - fn get_table_source(&self, _name: TableReference) -> DFResult> { - unimplemented!() - } - - fn udfs_names(&self) -> Vec { - unimplemented!() + fn udf_names(&self) -> Vec { + self.state.scalar_functions().keys().cloned().collect() } - fn udafs_names(&self) -> Vec { - unimplemented!() + fn udaf_names(&self) -> Vec { + self.state.aggregate_functions().keys().cloned().collect() } - fn udwfs_names(&self) -> Vec { - unimplemented!() + fn udwf_names(&self) -> Vec { + self.state.window_functions().keys().cloned().collect() } } @@ -110,7 +134,7 @@ pub(crate) fn parse_predicate_expression( source: Box::new(err), })?; - let context_provider = DeltaContextProvider { state: df_state }; + let context_provider = DeltaContextProvider::new(df_state); let sql_to_rel = SqlToRel::new_with_options(&context_provider, DeltaParserOptions::default().into()); @@ -198,7 +222,7 @@ impl<'a> Display for SqlFormat<'a> { Expr::IsNotFalse(expr) => 
write!(f, "{} IS NOT FALSE", SqlFormat { expr }), Expr::IsNotUnknown(expr) => write!(f, "{} IS NOT UNKNOWN", SqlFormat { expr }), Expr::BinaryExpr(expr) => write!(f, "{}", BinaryExprFormat { expr }), - Expr::ScalarFunction(func) => fmt_function(f, func.func_def.name(), false, &func.args), + Expr::ScalarFunction(func) => fmt_function(f, func.func.name(), false, &func.args), Expr::Cast(Cast { expr, data_type }) => { write!(f, "arrow_cast({}, '{}')", SqlFormat { expr }, data_type) } @@ -276,33 +300,6 @@ impl<'a> Display for SqlFormat<'a> { write!(f, "{expr} IN ({})", expr_vec_fmt!(list)) } } - Expr::GetIndexedField(GetIndexedField { expr, field }) => match field { - datafusion_expr::GetFieldAccess::NamedStructField { name } => { - write!( - f, - "{}[{}]", - SqlFormat { expr }, - ScalarValueFormat { scalar: name } - ) - } - datafusion_expr::GetFieldAccess::ListIndex { key } => { - write!(f, "{}[{}]", SqlFormat { expr }, SqlFormat { expr: key }) - } - datafusion_expr::GetFieldAccess::ListRange { - start, - stop, - stride, - } => { - write!( - f, - "{expr}[{start}:{stop}:{stride}]", - expr = SqlFormat { expr }, - start = SqlFormat { expr: start }, - stop = SqlFormat { expr: stop }, - stride = SqlFormat { expr: stride } - ) - } - }, _ => Err(fmt::Error), } } @@ -425,15 +422,16 @@ impl<'a> fmt::Display for ScalarValueFormat<'a> { #[cfg(test)] mod test { use arrow_schema::DataType as ArrowDataType; + use datafusion::functions_array::expr_fn::cardinality; + use datafusion::functions_nested::expr_ext::{IndexAccessor, SliceAccessor}; use datafusion::prelude::SessionContext; use datafusion_common::{Column, ScalarValue, ToDFSchema}; use datafusion_expr::expr::ScalarFunction; - use datafusion_expr::{ - col, lit, substring, BinaryExpr, Cast, Expr, ExprSchemable, ScalarFunctionDefinition, - }; + use datafusion_expr::{col, lit, BinaryExpr, Cast, Expr, ExprSchemable}; use datafusion_functions::core::arrow_cast; + use datafusion_functions::core::expr_ext::FieldAccessor; use datafusion_functions::encoding::expr_fn::decode; - use datafusion_functions_array::expr_fn::cardinality; + use datafusion_functions::expr_fn::substring; use crate::delta_datafusion::{DataFusionMixins, DeltaSessionContext}; use crate::kernel::{ArrayType, DataType, PrimitiveType, StructField, StructType}; @@ -542,7 +540,7 @@ mod test { let table = DeltaOps::new_in_memory() .create() - .with_columns(schema.fields().clone()) + .with_columns(schema.fields().cloned()) .await .unwrap(); assert_eq!(table.version(), 0); @@ -564,7 +562,7 @@ mod test { override_expected_expr: Some( datafusion_expr::Expr::ScalarFunction( ScalarFunction { - func_def: ScalarFunctionDefinition::UDF(arrow_cast()), + func: arrow_cast(), args: vec![ lit(ScalarValue::Int64(Some(1))), lit(ScalarValue::Utf8(Some("Int32".into()))) @@ -671,7 +669,7 @@ mod test { datafusion_expr::Expr::BinaryExpr(BinaryExpr { left: Box::new(datafusion_expr::Expr::ScalarFunction( ScalarFunction { - func_def: ScalarFunctionDefinition::UDF(arrow_cast()), + func: arrow_cast(), args: vec![ col("value"), lit(ScalarValue::Utf8(Some("Utf8".into()))) @@ -685,19 +683,19 @@ mod test { }, simple!( col("_struct").field("a").eq(lit(20_i64)), - "_struct['a'] = 20".to_string() + "get_field(_struct, 'a') = 20".to_string() ), simple!( col("_struct").field("nested").field("b").eq(lit(20_i64)), - "_struct['nested']['b'] = 20".to_string() + "get_field(get_field(_struct, 'nested'), 'b') = 20".to_string() ), simple!( col("_list").index(lit(1_i64)).eq(lit(20_i64)), - "_list[1] = 20".to_string() + "array_element(_list, 
1) = 20".to_string() ), simple!( cardinality(col("_list").range(col("value"), lit(10_i64))), - "cardinality(_list[value:10:1])".to_string() + "cardinality(array_slice(_list, value, 10))".to_string() ), ParseTest { expr: col("_timestamp_ntz").gt(lit(ScalarValue::TimestampMicrosecond(Some(1262304000000000), None))), @@ -705,7 +703,7 @@ mod test { override_expected_expr: Some(col("_timestamp_ntz").gt( datafusion_expr::Expr::ScalarFunction( ScalarFunction { - func_def: ScalarFunctionDefinition::UDF(arrow_cast()), + func: arrow_cast(), args: vec![ lit(ScalarValue::Utf8(Some("2010-01-01T00:00:00.000000".into()))), lit(ScalarValue::Utf8(Some("Timestamp(Microsecond, None)".into()))) @@ -723,7 +721,7 @@ mod test { override_expected_expr: Some(col("_timestamp").gt( datafusion_expr::Expr::ScalarFunction( ScalarFunction { - func_def: ScalarFunctionDefinition::UDF(arrow_cast()), + func: arrow_cast(), args: vec![ lit(ScalarValue::Utf8(Some("2010-01-01T00:00:00.000000".into()))), lit(ScalarValue::Utf8(Some("Timestamp(Microsecond, Some(\"UTC\"))".into()))) diff --git a/crates/core/src/delta_datafusion/find_files/logical.rs b/crates/core/src/delta_datafusion/find_files/logical.rs index 6234cbe5c2..4dd4a3b5da 100644 --- a/crates/core/src/delta_datafusion/find_files/logical.rs +++ b/crates/core/src/delta_datafusion/find_files/logical.rs @@ -92,7 +92,16 @@ impl UserDefinedLogicalNodeCore for FindFilesNode { ) } - fn from_template(&self, _exprs: &[Expr], _inputs: &[LogicalPlan]) -> Self { - self.clone() + fn from_template(&self, exprs: &[Expr], inputs: &[LogicalPlan]) -> Self { + self.with_exprs_and_inputs(exprs.to_vec(), inputs.to_vec()) + .unwrap() + } + + fn with_exprs_and_inputs( + &self, + _exprs: Vec, + _inputs: Vec, + ) -> datafusion_common::Result { + Ok(self.clone()) } } diff --git a/crates/core/src/delta_datafusion/find_files/mod.rs b/crates/core/src/delta_datafusion/find_files/mod.rs index 2e8d26dee3..0c235242c2 100644 --- a/crates/core/src/delta_datafusion/find_files/mod.rs +++ b/crates/core/src/delta_datafusion/find_files/mod.rs @@ -1,6 +1,6 @@ -use arrow_array::cast::AsArray; use std::sync::Arc; +use arrow_array::cast::AsArray; use arrow_array::types::UInt16Type; use arrow_array::RecordBatch; use arrow_schema::SchemaBuilder; @@ -10,13 +10,13 @@ use async_trait::async_trait; use datafusion::datasource::MemTable; use datafusion::execution::context::{QueryPlanner, SessionState}; use datafusion::execution::TaskContext; -use datafusion::physical_plan::filter::FilterExec; -use datafusion::physical_plan::limit::LocalLimitExec; -use datafusion::physical_plan::ExecutionPlan; use datafusion::physical_planner::{DefaultPhysicalPlanner, ExtensionPlanner, PhysicalPlanner}; use datafusion::prelude::SessionContext; use datafusion_common::{DFSchemaRef, Result, ToDFSchema}; use datafusion_expr::{col, Expr, LogicalPlan, UserDefinedLogicalNode}; +use datafusion_physical_plan::filter::FilterExec; +use datafusion_physical_plan::limit::LocalLimitExec; +use datafusion_physical_plan::ExecutionPlan; use lazy_static::lazy_static; use crate::delta_datafusion::find_files::logical::FindFilesNode; @@ -28,8 +28,6 @@ use crate::logstore::LogStoreRef; use crate::table::state::DeltaTableState; use crate::DeltaTableError; -use super::create_physical_expr_fix; - pub mod logical; pub mod physical; @@ -43,8 +41,10 @@ lazy_static! 
{ ONLY_FILES_SCHEMA.clone().to_dfschema_ref().unwrap(); } +#[derive(Default)] struct FindFilesPlannerExtension {} +#[derive(Default)] struct FindFilesPlanner {} #[async_trait] @@ -139,11 +139,11 @@ async fn scan_table_by_files( .with_file_column(true) .build(&snapshot)?; - let logical_schema = df_logical_schema(&snapshot, &scan_config)?; + let logical_schema = df_logical_schema(&snapshot, &scan_config.file_column_name, None)?; // Identify which columns we need to project let mut used_columns = expression - .to_columns()? + .column_refs() .into_iter() .map(|column| logical_schema.index_of(&column.name)) .collect::, ArrowError>>()?; @@ -161,11 +161,8 @@ async fn scan_table_by_files( let input_schema = scan.logical_schema.as_ref().to_owned(); let input_dfschema = input_schema.clone().try_into()?; - let predicate_expr = create_physical_expr_fix( - Expr::IsTrue(Box::new(expression.clone())), - &input_dfschema, - state.execution_props(), - )?; + let predicate_expr = + state.create_physical_expr(Expr::IsTrue(Box::new(expression.clone())), &input_dfschema)?; let filter: Arc = Arc::new(FilterExec::try_new(predicate_expr, scan.clone())?); @@ -193,6 +190,7 @@ async fn scan_table_by_files( pub mod tests { use std::sync::Arc; + use datafusion::execution::session_state::SessionStateBuilder; use datafusion::prelude::{DataFrame, SessionContext}; use datafusion_common::{assert_batches_eq, assert_batches_sorted_eq}; use datafusion_expr::{col, lit, Expr, Extension, LogicalPlan}; @@ -207,9 +205,9 @@ pub mod tests { expr: Expr, ) -> Result, DeltaTableError> { let ctx = SessionContext::new(); - let state = ctx - .state() - .with_query_planner(Arc::new(FindFilesPlanner {})); + let state = SessionStateBuilder::new_from_existing(ctx.state()) + .with_query_planner(Arc::new(FindFilesPlanner::default())) + .build(); let find_files_node = LogicalPlan::Extension(Extension { node: Arc::new(FindFilesNode::new( "my_cool_plan".into(), diff --git a/crates/core/src/delta_datafusion/find_files/physical.rs b/crates/core/src/delta_datafusion/find_files/physical.rs index eb09d2d94b..508d1f672e 100644 --- a/crates/core/src/delta_datafusion/find_files/physical.rs +++ b/crates/core/src/delta_datafusion/find_files/physical.rs @@ -8,14 +8,14 @@ use arrow_array::RecordBatch; use arrow_schema::SchemaRef; use datafusion::error::Result; use datafusion::execution::{RecordBatchStream, SendableRecordBatchStream, TaskContext}; -use datafusion::physical_plan::memory::MemoryStream; -use datafusion::physical_plan::{ - DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan, PlanProperties, -}; use datafusion::prelude::SessionContext; use datafusion_common::tree_node::TreeNode; use datafusion_expr::Expr; use datafusion_physical_expr::{EquivalenceProperties, Partitioning}; +use datafusion_physical_plan::memory::MemoryStream; +use datafusion_physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan, PlanProperties, +}; use futures::stream::BoxStream; use futures::{FutureExt, Stream, StreamExt, TryStreamExt}; @@ -85,6 +85,10 @@ impl DisplayAs for FindFilesExec { } impl ExecutionPlan for FindFilesExec { + fn name(&self) -> &str { + Self::static_name() + } + fn as_any(&self) -> &dyn Any { self } @@ -97,7 +101,7 @@ impl ExecutionPlan for FindFilesExec { &self.plan_properties } - fn children(&self) -> Vec> { + fn children(&self) -> Vec<&Arc> { vec![] } diff --git a/crates/core/src/delta_datafusion/logical.rs b/crates/core/src/delta_datafusion/logical.rs index 52ee1194f4..2ce435b5b6 100644 --- 
a/crates/core/src/delta_datafusion/logical.rs +++ b/crates/core/src/delta_datafusion/logical.rs @@ -52,13 +52,22 @@ impl UserDefinedLogicalNodeCore for MetricObserver { fn from_template( &self, - _exprs: &[datafusion_expr::Expr], + exprs: &[datafusion_expr::Expr], inputs: &[datafusion_expr::LogicalPlan], ) -> Self { - MetricObserver { + self.with_exprs_and_inputs(exprs.to_vec(), inputs.to_vec()) + .unwrap() + } + + fn with_exprs_and_inputs( + &self, + _exprs: Vec, + inputs: Vec, + ) -> datafusion_common::Result { + Ok(MetricObserver { id: self.id.clone(), input: inputs[0].clone(), enable_pushdown: self.enable_pushdown, - } + }) } } diff --git a/crates/core/src/delta_datafusion/mod.rs b/crates/core/src/delta_datafusion/mod.rs index c1b6208cff..8d64f85fb2 100644 --- a/crates/core/src/delta_datafusion/mod.rs +++ b/crates/core/src/delta_datafusion/mod.rs @@ -25,82 +25,74 @@ use std::collections::{HashMap, HashSet}; use std::fmt::{self, Debug}; use std::sync::Arc; -use arrow::compute::{cast_with_options, CastOptions}; -use arrow::datatypes::DataType; -use arrow::datatypes::{ - DataType as ArrowDataType, Schema as ArrowSchema, SchemaRef, SchemaRef as ArrowSchemaRef, - TimeUnit, -}; -use arrow::error::ArrowError; -use arrow::record_batch::RecordBatch; use arrow_array::types::UInt16Type; -use arrow_array::{Array, DictionaryArray, StringArray, TypedDictionaryArray}; +use arrow_array::{Array, DictionaryArray, RecordBatch, StringArray, TypedDictionaryArray}; use arrow_cast::display::array_value_to_string; - -use arrow_schema::Field; +use arrow_cast::{cast_with_options, CastOptions}; +use arrow_schema::{ + ArrowError, DataType as ArrowDataType, Field, Schema as ArrowSchema, SchemaRef, + SchemaRef as ArrowSchemaRef, TimeUnit, +}; +use arrow_select::concat::concat_batches; use async_trait::async_trait; use chrono::{DateTime, TimeZone, Utc}; -use datafusion::datasource::file_format::{parquet::ParquetFormat, FileFormat}; +use datafusion::catalog::{Session, TableProviderFactory}; +use datafusion::config::TableParquetOptions; +use datafusion::datasource::physical_plan::parquet::ParquetExecBuilder; use datafusion::datasource::physical_plan::{ wrap_partition_type_in_dict, wrap_partition_value_in_dict, FileScanConfig, }; -use datafusion::datasource::provider::TableProviderFactory; use datafusion::datasource::{listing::PartitionedFile, MemTable, TableProvider, TableType}; use datafusion::execution::context::{SessionConfig, SessionContext, SessionState, TaskContext}; use datafusion::execution::runtime_env::RuntimeEnv; use datafusion::execution::FunctionRegistry; use datafusion::physical_optimizer::pruning::PruningPredicate; -use datafusion::physical_plan::filter::FilterExec; -use datafusion::physical_plan::limit::LocalLimitExec; -use datafusion::physical_plan::{ - DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, SendableRecordBatchStream, - Statistics, -}; use datafusion_common::scalar::ScalarValue; -use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRecursion, TreeNodeVisitor}; +use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor}; use datafusion_common::{ config::ConfigOptions, Column, DFSchema, DataFusionError, Result as DataFusionResult, - ToDFSchema, + TableReference, ToDFSchema, }; -use datafusion_expr::expr::ScalarFunction; use datafusion_expr::logical_plan::CreateExternalTable; use datafusion_expr::utils::conjunction; -use datafusion_expr::{ - col, Expr, Extension, GetFieldAccess, GetIndexedField, LogicalPlan, - TableProviderFilterPushDown, 
Volatility, +use datafusion_expr::{col, Expr, Extension, LogicalPlan, TableProviderFilterPushDown, Volatility}; +use datafusion_physical_plan::filter::FilterExec; +use datafusion_physical_plan::limit::LocalLimitExec; +use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricBuilder, MetricsSet}; +use datafusion_physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, SendableRecordBatchStream, + Statistics, }; -use datafusion_functions::expr_fn::get_field; -use datafusion_functions_array::extract::{array_element, array_slice}; -use datafusion_physical_expr::execution_props::ExecutionProps; -use datafusion_physical_expr::PhysicalExpr; use datafusion_proto::logical_plan::LogicalExtensionCodec; use datafusion_proto::physical_plan::PhysicalExtensionCodec; use datafusion_sql::planner::ParserOptions; use either::Either; use futures::TryStreamExt; - use itertools::Itertools; use object_store::ObjectMeta; use serde::{Deserialize, Serialize}; use url::Url; use crate::delta_datafusion::expr::parse_predicate_expression; +use crate::delta_datafusion::schema_adapter::DeltaSchemaAdapterFactory; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::kernel::{Add, DataCheck, EagerSnapshot, Invariant, Snapshot}; +use crate::kernel::{Add, DataCheck, EagerSnapshot, Invariant, Snapshot, StructTypeExt}; use crate::logstore::LogStoreRef; use crate::table::builder::ensure_table_uri; use crate::table::state::DeltaTableState; use crate::table::Constraint; use crate::{open_table, open_table_with_storage_options, DeltaTable}; -const PATH_COLUMN: &str = "__delta_rs_path"; +pub(crate) const PATH_COLUMN: &str = "__delta_rs_path"; pub mod cdf; pub mod expr; pub mod logical; pub mod physical; +pub mod planner; mod find_files; +mod schema_adapter; impl From for DataFusionError { fn from(err: DeltaTableError) -> Self { @@ -199,53 +191,41 @@ impl DataFusionMixins for DeltaTableState { fn _arrow_schema(snapshot: &Snapshot, wrap_partitions: bool) -> DeltaResult { let meta = snapshot.metadata(); - let fields = meta - .schema()? + + let schema = meta.schema()?; + let fields = schema .fields() - .iter() .filter(|f| !meta.partition_columns.contains(&f.name().to_string())) .map(|f| f.try_into()) .chain( - meta.schema()? 
- .fields() - .iter() - .filter(|f| meta.partition_columns.contains(&f.name().to_string())) - .map(|f| { - let field = Field::try_from(f)?; - let corrected = if wrap_partitions { - match field.data_type() { - // Only dictionary-encode types that may be large - // // https://github.com/apache/arrow-datafusion/pull/5545 - DataType::Utf8 - | DataType::LargeUtf8 - | DataType::Binary - | DataType::LargeBinary => { - wrap_partition_type_in_dict(field.data_type().clone()) - } - _ => field.data_type().clone(), + // We need stable order between logical and physical schemas, but the order of + // partitioning columns is not always the same in the json schema and the array + meta.partition_columns.iter().map(|partition_col| { + let f = schema.field(partition_col).unwrap(); + let field = Field::try_from(f)?; + let corrected = if wrap_partitions { + match field.data_type() { + // Only dictionary-encode types that may be large + // // https://github.com/apache/arrow-datafusion/pull/5545 + ArrowDataType::Utf8 + | ArrowDataType::LargeUtf8 + | ArrowDataType::Binary + | ArrowDataType::LargeBinary => { + wrap_partition_type_in_dict(field.data_type().clone()) } - } else { - field.data_type().clone() - }; - Ok(field.with_data_type(corrected)) - }), + _ => field.data_type().clone(), + } + } else { + field.data_type().clone() + }; + Ok(field.with_data_type(corrected)) + }), ) .collect::, _>>()?; Ok(Arc::new(ArrowSchema::new(fields))) } -pub(crate) trait DataFusionFileMixins { - /// Iterate over all files in the log matching a predicate - fn files_matching_predicate(&self, filters: &[Expr]) -> DeltaResult>; -} - -impl DataFusionFileMixins for EagerSnapshot { - fn files_matching_predicate(&self, filters: &[Expr]) -> DeltaResult> { - files_matching_predicate(self, filters) - } -} - pub(crate) fn files_matching_predicate<'a>( snapshot: &'a EagerSnapshot, filters: &[Expr], @@ -253,7 +233,8 @@ pub(crate) fn files_matching_predicate<'a>( if let Some(Some(predicate)) = (!filters.is_empty()).then_some(conjunction(filters.iter().cloned())) { - let expr = logical_expr_to_physical_expr(predicate, snapshot.arrow_schema()?.as_ref()); + let expr = SessionContext::new() + .create_physical_expr(predicate, &snapshot.arrow_schema()?.to_dfschema()?)?; let pruning_predicate = PruningPredicate::try_new(expr, snapshot.arrow_schema()?)?; Ok(Either::Left( snapshot @@ -309,9 +290,13 @@ pub(crate) fn register_store(store: LogStoreRef, env: Arc) { /// at the physical level pub(crate) fn df_logical_schema( snapshot: &DeltaTableState, - scan_config: &DeltaScanConfig, + file_column_name: &Option, + schema: Option, ) -> DeltaResult { - let input_schema = snapshot.arrow_schema()?; + let input_schema = match schema { + Some(schema) => schema, + None => snapshot.input_schema()?, + }; let table_partition_cols = &snapshot.metadata().partition_columns; let mut fields: Vec> = input_schema @@ -330,8 +315,12 @@ pub(crate) fn df_logical_schema( )); } - if let Some(file_column_name) = &scan_config.file_column_name { - fields.push(Arc::new(Field::new(file_column_name, DataType::Utf8, true))); + if let Some(file_column_name) = file_column_name { + fields.push(Arc::new(Field::new( + file_column_name, + ArrowDataType::Utf8, + true, + ))); } Ok(Arc::new(ArrowSchema::new(fields))) @@ -349,7 +338,10 @@ pub struct DeltaScanConfigBuilder { file_column_name: Option, /// Whether to wrap partition values in a dictionary encoding to potentially save space wrap_partition_values: Option, + /// Whether to push down filter in end result or just prune the files 
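The `_arrow_schema` rewrite above keeps partition columns in the order given by the table metadata and dictionary-encodes string/binary partition types to avoid repeating the same partition value per row. A minimal sketch of that wrapping using only `arrow-schema`; the `UInt16` key type matches the `UInt16Type` dictionaries used elsewhere in this diff.

```rust
use std::sync::Arc;
use arrow_schema::{DataType, Field, Schema};

// Re-type large/variable-width partition columns as dictionaries so repeated
// partition values are stored once per batch.
fn wrap_partition_type(dt: DataType) -> DataType {
    match dt {
        DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary | DataType::LargeBinary => {
            DataType::Dictionary(Box::new(DataType::UInt16), Box::new(dt))
        }
        other => other,
    }
}

fn main() {
    // Data columns first, then partition columns in metadata order.
    let schema = Arc::new(Schema::new(vec![
        Field::new("value", DataType::Int32, true),
        Field::new("year", wrap_partition_type(DataType::Utf8), true),
        Field::new("month", wrap_partition_type(DataType::Utf8), true),
    ]));
    assert!(matches!(
        schema.field_with_name("year").unwrap().data_type(),
        DataType::Dictionary(_, _)
    ));
}
```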
enable_parquet_pushdown: bool, + /// Schema to scan table with + schema: Option, } impl Default for DeltaScanConfigBuilder { @@ -359,6 +351,7 @@ impl Default for DeltaScanConfigBuilder { file_column_name: None, wrap_partition_values: None, enable_parquet_pushdown: true, + schema: None, } } } @@ -397,16 +390,21 @@ impl DeltaScanConfigBuilder { self } + /// Use the provided [SchemaRef] for the [DeltaScan] + pub fn with_schema(mut self, schema: SchemaRef) -> Self { + self.schema = Some(schema); + self + } + /// Build a DeltaScanConfig and ensure no column name conflicts occur during downstream processing pub fn build(&self, snapshot: &DeltaTableState) -> DeltaResult { - let input_schema = snapshot.input_schema()?; - let mut file_column_name = None; - let mut column_names: HashSet<&String> = HashSet::new(); - for field in input_schema.fields.iter() { - column_names.insert(field.name()); - } + let file_column_name = if self.include_file_column { + let input_schema = snapshot.input_schema()?; + let mut column_names: HashSet<&String> = HashSet::new(); + for field in input_schema.fields.iter() { + column_names.insert(field.name()); + } - if self.include_file_column { match &self.file_column_name { Some(name) => { if column_names.contains(name) { @@ -416,7 +414,7 @@ impl DeltaScanConfigBuilder { ))); } - file_column_name = Some(name.to_owned()) + Some(name.to_owned()) } None => { let prefix = PATH_COLUMN; @@ -428,15 +426,18 @@ impl DeltaScanConfigBuilder { name = format!("{}_{}", prefix, idx); } - file_column_name = Some(name); + Some(name) } } - } + } else { + None + }; Ok(DeltaScanConfig { file_column_name, wrap_partition_values: self.wrap_partition_values.unwrap_or(true), enable_parquet_pushdown: self.enable_parquet_pushdown, + schema: self.schema.clone(), }) } } @@ -450,37 +451,36 @@ pub struct DeltaScanConfig { pub wrap_partition_values: bool, /// Allow pushdown of the scan filter pub enable_parquet_pushdown: bool, + /// Schema to read as + pub schema: Option, } -#[derive(Debug)] pub(crate) struct DeltaScanBuilder<'a> { snapshot: &'a DeltaTableState, log_store: LogStoreRef, filter: Option, - state: &'a SessionState, + session: &'a dyn Session, projection: Option<&'a Vec>, limit: Option, files: Option<&'a [Add]>, - config: DeltaScanConfig, - schema: Option, + config: Option, } impl<'a> DeltaScanBuilder<'a> { pub fn new( snapshot: &'a DeltaTableState, log_store: LogStoreRef, - state: &'a SessionState, + session: &'a dyn Session, ) -> Self { DeltaScanBuilder { snapshot, log_store, filter: None, - state, - files: None, + session, projection: None, limit: None, - config: DeltaScanConfig::default(), - schema: None, + files: None, + config: None, } } @@ -505,21 +505,26 @@ impl<'a> DeltaScanBuilder<'a> { } pub fn with_scan_config(mut self, config: DeltaScanConfig) -> Self { - self.config = config; + self.config = Some(config); self } pub async fn build(self) -> DeltaResult { - let config = self.config; - let schema = match self.schema { - Some(schema) => schema, - None => { - self.snapshot - .physical_arrow_schema(self.log_store.object_store()) - .await? 
- } + let config = match self.config { + Some(config) => config, + None => DeltaScanConfigBuilder::new().build(self.snapshot)?, }; - let logical_schema = df_logical_schema(self.snapshot, &config)?; + + let schema = match config.schema.clone() { + Some(value) => Ok(value), + None => self.snapshot.arrow_schema(), + }?; + + let logical_schema = df_logical_schema( + self.snapshot, + &config.file_column_name, + Some(schema.clone()), + )?; let logical_schema = if let Some(used_columns) = self.projection { let mut fields = vec![]; @@ -531,33 +536,45 @@ impl<'a> DeltaScanBuilder<'a> { logical_schema }; + let context = SessionContext::new(); + let df_schema = logical_schema.clone().to_dfschema()?; let logical_filter = self .filter - .map(|expr| logical_expr_to_physical_expr(expr, &logical_schema)); + .map(|expr| context.create_physical_expr(expr, &df_schema).unwrap()); // Perform Pruning of files to scan - let files = match self.files { - Some(files) => files.to_owned(), + let (files, files_scanned, files_pruned) = match self.files { + Some(files) => { + let files = files.to_owned(); + let files_scanned = files.len(); + (files, files_scanned, 0) + } None => { if let Some(predicate) = &logical_filter { let pruning_predicate = PruningPredicate::try_new(predicate.clone(), logical_schema.clone())?; let files_to_prune = pruning_predicate.prune(self.snapshot)?; - self.snapshot + let mut files_pruned = 0usize; + let files = self + .snapshot .file_actions_iter()? .zip(files_to_prune.into_iter()) - .filter_map( - |(action, keep)| { - if keep { - Some(action.to_owned()) - } else { - None - } - }, - ) - .collect() + .filter_map(|(action, keep)| { + if keep { + Some(action.to_owned()) + } else { + files_pruned += 1; + None + } + }) + .collect::>(); + + let files_scanned = files.len(); + (files, files_scanned, files_pruned) } else { - self.snapshot.file_actions()? 
+ let files = self.snapshot.file_actions()?; + let files_scanned = files.len(); + (files, files_scanned, 0) } } }; @@ -603,9 +620,9 @@ impl<'a> DeltaScanBuilder<'a> { if let Some(file_column_name) = &config.file_column_name { let field_name_datatype = if config.wrap_partition_values { - wrap_partition_type_in_dict(DataType::Utf8) + wrap_partition_type_in_dict(ArrowDataType::Utf8) } else { - DataType::Utf8 + ArrowDataType::Utf8 }; table_partition_cols.push(Field::new( file_column_name.clone(), @@ -619,37 +636,47 @@ impl<'a> DeltaScanBuilder<'a> { .datafusion_table_statistics() .unwrap_or(Statistics::new_unknown(&schema)); + let parquet_options = TableParquetOptions { + global: self.session.config().options().execution.parquet.clone(), + ..Default::default() + }; + + let mut exec_plan_builder = ParquetExecBuilder::new(FileScanConfig { + object_store_url: self.log_store.object_store_url(), + file_schema, + file_groups: file_groups.into_values().collect(), + statistics: stats, + projection: self.projection.cloned(), + limit: self.limit, + table_partition_cols, + output_ordering: vec![], + }) + .with_schema_adapter_factory(Arc::new(DeltaSchemaAdapterFactory {})) + .with_table_parquet_options(parquet_options); + // Sometimes (i.e Merge) we want to prune files that don't make the // filter and read the entire contents for files that do match the // filter - let parquet_pushdown = if config.enable_parquet_pushdown { - logical_filter.clone() - } else { - None + if let Some(predicate) = logical_filter { + if config.enable_parquet_pushdown { + exec_plan_builder = exec_plan_builder.with_predicate(predicate); + } }; - let scan = ParquetFormat::new() - .create_physical_plan( - self.state, - FileScanConfig { - object_store_url: self.log_store.object_store_url(), - file_schema, - file_groups: file_groups.into_values().collect(), - statistics: stats, - projection: self.projection.cloned(), - limit: self.limit, - table_partition_cols, - output_ordering: vec![], - }, - parquet_pushdown.as_ref(), - ) - .await?; + let metrics = ExecutionPlanMetricsSet::new(); + MetricBuilder::new(&metrics) + .global_counter("files_scanned") + .add(files_scanned); + MetricBuilder::new(&metrics) + .global_counter("files_pruned") + .add(files_pruned); Ok(DeltaScan { table_uri: ensure_table_uri(self.log_store.root_uri())?.as_str().into(), - parquet_scan: scan, + parquet_scan: exec_plan_builder.build_arc(), config, logical_schema, + metrics, }) } } @@ -679,7 +706,7 @@ impl TableProvider for DeltaTable { async fn scan( &self, - session: &SessionState, + session: &dyn Session, projection: Option<&Vec>, filters: &[Expr], limit: Option, @@ -697,11 +724,14 @@ impl TableProvider for DeltaTable { Ok(Arc::new(scan)) } - fn supports_filter_pushdown( + fn supports_filters_pushdown( &self, - _filter: &Expr, - ) -> DataFusionResult { - Ok(TableProviderFilterPushDown::Inexact) + filter: &[&Expr], + ) -> DataFusionResult> { + Ok(filter + .iter() + .map(|_| TableProviderFilterPushDown::Inexact) + .collect()) } fn statistics(&self) -> Option { @@ -715,6 +745,7 @@ pub struct DeltaTableProvider { log_store: LogStoreRef, config: DeltaScanConfig, schema: Arc, + files: Option>, } impl DeltaTableProvider { @@ -725,12 +756,19 @@ impl DeltaTableProvider { config: DeltaScanConfig, ) -> DeltaResult { Ok(DeltaTableProvider { - schema: df_logical_schema(&snapshot, &config)?, + schema: df_logical_schema(&snapshot, &config.file_column_name, config.schema.clone())?, snapshot, log_store, config, + files: None, }) } + + /// Define which files to consider while 
building a scan, for advanced usecases + pub fn with_files(mut self, files: Vec) -> DeltaTableProvider { + self.files = Some(files); + self + } } #[async_trait] @@ -757,7 +795,7 @@ impl TableProvider for DeltaTableProvider { async fn scan( &self, - session: &SessionState, + session: &dyn Session, projection: Option<&Vec>, filters: &[Expr], limit: Option, @@ -765,22 +803,23 @@ impl TableProvider for DeltaTableProvider { register_store(self.log_store.clone(), session.runtime_env().clone()); let filter_expr = conjunction(filters.iter().cloned()); - let scan = DeltaScanBuilder::new(&self.snapshot, self.log_store.clone(), session) + let mut scan = DeltaScanBuilder::new(&self.snapshot, self.log_store.clone(), session) .with_projection(projection) .with_limit(limit) .with_filter(filter_expr) - .with_scan_config(self.config.clone()) - .build() - .await?; + .with_scan_config(self.config.clone()); - Ok(Arc::new(scan)) + if let Some(files) = &self.files { + scan = scan.with_files(files); + } + Ok(Arc::new(scan.build().await?)) } - fn supports_filter_pushdown( + fn supports_filters_pushdown( &self, - _filter: &Expr, - ) -> DataFusionResult { - Ok(TableProviderFilterPushDown::Inexact) + _filter: &[&Expr], + ) -> DataFusionResult> { + Ok(vec![TableProviderFilterPushDown::Inexact]) } fn statistics(&self) -> Option { @@ -800,6 +839,8 @@ pub struct DeltaScan { pub parquet_scan: Arc, /// The schema of the table to be used when evaluating expressions pub logical_schema: Arc, + /// Metrics for scan reported via DataFusion + metrics: ExecutionPlanMetricsSet, } #[derive(Debug, Serialize, Deserialize)] @@ -816,6 +857,10 @@ impl DisplayAs for DeltaScan { } impl ExecutionPlan for DeltaScan { + fn name(&self) -> &str { + Self::static_name() + } + fn as_any(&self) -> &dyn Any { self } @@ -828,8 +873,8 @@ impl ExecutionPlan for DeltaScan { self.parquet_scan.properties() } - fn children(&self) -> Vec> { - vec![self.parquet_scan.clone()] + fn children(&self) -> Vec<&Arc> { + vec![&self.parquet_scan] } fn with_new_children( @@ -847,6 +892,7 @@ impl ExecutionPlan for DeltaScan { config: self.config.clone(), parquet_scan: children[0].clone(), logical_schema: self.logical_schema.clone(), + metrics: self.metrics.clone(), })) } @@ -858,6 +904,10 @@ impl ExecutionPlan for DeltaScan { self.parquet_scan.execute(partition, context) } + fn metrics(&self) -> Option { + Some(self.metrics.clone_inner()) + } + fn statistics(&self) -> DataFusionResult { self.parquet_scan.statistics() } @@ -873,6 +923,7 @@ impl ExecutionPlan for DeltaScan { config: self.config.clone(), parquet_scan, logical_schema: self.logical_schema.clone(), + metrics: self.metrics.clone(), }))) } else { Ok(None) @@ -945,7 +996,7 @@ pub(crate) fn get_null_of_arrow_type(t: &ArrowDataType) -> DeltaResult Arc { - let df_schema = schema.clone().to_dfschema().unwrap(); - let execution_props = ExecutionProps::new(); - create_physical_expr_fix(expr, &df_schema, &execution_props).unwrap() -} - -// TODO This should be removed after datafusion v38 -pub(crate) fn create_physical_expr_fix( - expr: Expr, - input_dfschema: &DFSchema, - execution_props: &ExecutionProps, -) -> Result, DataFusionError> { - // Support Expr::struct by rewriting expressions. 
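`DeltaScan` above now carries an `ExecutionPlanMetricsSet` so the number of files scanned and pruned is reported through DataFusion's normal metrics channel. A small sketch of the same counter registration, assuming `datafusion-physical-plan` as a dependency:

```rust
use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricBuilder};

// Record how many files survived pruning and how many were skipped; a plan can
// later expose this from ExecutionPlan::metrics() via clone_inner().
fn record_pruning(files_scanned: usize, files_pruned: usize) -> ExecutionPlanMetricsSet {
    let metrics = ExecutionPlanMetricsSet::new();
    MetricBuilder::new(&metrics)
        .global_counter("files_scanned")
        .add(files_scanned);
    MetricBuilder::new(&metrics)
        .global_counter("files_pruned")
        .add(files_pruned);
    metrics
}

fn main() {
    let metrics = record_pruning(8, 2);
    println!("{}", metrics.clone_inner());
}
```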
- let expr = expr - .transform_up(&|expr| { - // see https://github.com/apache/datafusion/issues/10181 - // This is part of the function rewriter code in DataFusion inlined here temporarily - Ok(match expr { - Expr::GetIndexedField(GetIndexedField { - expr, - field: GetFieldAccess::NamedStructField { name }, - }) => { - let name = Expr::Literal(name); - Transformed::yes(get_field(*expr, name)) - } - // expr[idx] ==> array_element(expr, idx) - Expr::GetIndexedField(GetIndexedField { - expr, - field: GetFieldAccess::ListIndex { key }, - }) => Transformed::yes(array_element(*expr, *key)), - - // expr[start, stop, stride] ==> array_slice(expr, start, stop, stride) - Expr::GetIndexedField(GetIndexedField { - expr, - field: - GetFieldAccess::ListRange { - start, - stop, - stride, - }, - }) => Transformed::yes(array_slice(*expr, *start, *stop, *stride)), - - _ => Transformed::no(expr), - }) - })? - .data; - - datafusion_physical_expr::create_physical_expr(&expr, input_dfschema, execution_props) -} - pub(crate) async fn execute_plan_to_batch( state: &SessionState, plan: Arc, @@ -1133,15 +1132,13 @@ pub(crate) async fn execute_plan_to_batch( let batches = batch_stream.try_collect::>().await?; - DataFusionResult::<_>::Ok(arrow::compute::concat_batches(&schema, batches.iter())?) + DataFusionResult::<_>::Ok(concat_batches(&schema, batches.iter())?) } }), ) .await?; - let batch = arrow::compute::concat_batches(&plan.schema(), data.iter())?; - - Ok(batch) + Ok(concat_batches(&plan.schema(), data.iter())?) } /// Responsible for checking batches of data conform to table's invariants. @@ -1286,6 +1283,7 @@ impl PhysicalExtensionCodec for DeltaPhysicalCodec { parquet_scan: (*inputs)[0].clone(), config: wire.config, logical_schema: wire.logical_schema, + metrics: ExecutionPlanMetricsSet::new(), }; Ok(Arc::new(delta_scan)) } @@ -1332,6 +1330,7 @@ impl LogicalExtensionCodec for DeltaLogicalCodec { fn try_decode_table_provider( &self, buf: &[u8], + _table_ref: &TableReference, _schema: SchemaRef, _ctx: &SessionContext, ) -> Result, DataFusionError> { @@ -1342,6 +1341,7 @@ impl LogicalExtensionCodec for DeltaLogicalCodec { fn try_encode_table_provider( &self, + _table_ref: &TableReference, node: Arc, buf: &mut Vec, ) -> Result<(), DataFusionError> { @@ -1364,7 +1364,7 @@ pub struct DeltaTableFactory {} impl TableProviderFactory for DeltaTableFactory { async fn create( &self, - _ctx: &SessionState, + _ctx: &dyn Session, cmd: &CreateExternalTable, ) -> datafusion::error::Result> { let provider = if cmd.options.is_empty() { @@ -1386,7 +1386,7 @@ pub(crate) struct FindFilesExprProperties { /// Ensure only expressions that make sense are accepted, check for /// non-deterministic functions, and determine if the expression only contains /// partition columns -impl TreeNodeVisitor for FindFilesExprProperties { +impl TreeNodeVisitor<'_> for FindFilesExprProperties { type Node = Expr; fn f_down(&mut self, expr: &Self::Node) -> datafusion_common::Result { @@ -1417,28 +1417,20 @@ impl TreeNodeVisitor for FindFilesExprProperties { | Expr::IsNotUnknown(_) | Expr::Negative(_) | Expr::InList { .. } - | Expr::GetIndexedField(_) | Expr::Between(_) | Expr::Case(_) | Expr::Cast(_) | Expr::TryCast(_) => (), - Expr::ScalarFunction(ScalarFunction { func_def, .. 
}) => { - let v = match func_def { - datafusion_expr::ScalarFunctionDefinition::BuiltIn(f) => f.volatility(), - datafusion_expr::ScalarFunctionDefinition::UDF(u) => u.signature().volatility, - datafusion_expr::ScalarFunctionDefinition::Name(n) => { + Expr::ScalarFunction(scalar_function) => { + match scalar_function.func.signature().volatility { + Volatility::Immutable => (), + _ => { self.result = Err(DeltaTableError::Generic(format!( - "Cannot determine volatility of find files predicate function {n}", + "Find files predicate contains nondeterministic function {}", + scalar_function.func.name() ))); return Ok(TreeNodeRecursion::Stop); } - }; - if v > Volatility::Immutable { - self.result = Err(DeltaTableError::Generic(format!( - "Find files predicate contains nondeterministic function {}", - func_def.name() - ))); - return Ok(TreeNodeRecursion::Stop); } } _ => { @@ -1526,11 +1518,11 @@ pub(crate) async fn find_files_scan<'a>( } .build(snapshot)?; - let logical_schema = df_logical_schema(snapshot, &scan_config)?; + let logical_schema = df_logical_schema(snapshot, &scan_config.file_column_name, None)?; // Identify which columns we need to project let mut used_columns = expression - .to_columns()? + .column_refs() .into_iter() .map(|column| logical_schema.index_of(&column.name)) .collect::, ArrowError>>()?; @@ -1549,11 +1541,8 @@ pub(crate) async fn find_files_scan<'a>( let input_schema = scan.logical_schema.as_ref().to_owned(); let input_dfschema = input_schema.clone().try_into()?; - let predicate_expr = create_physical_expr_fix( - Expr::IsTrue(Box::new(expression.clone())), - &input_dfschema, - state.execution_props(), - )?; + let predicate_expr = + state.create_physical_expr(Expr::IsTrue(Box::new(expression.clone())), &input_dfschema)?; let filter: Arc = Arc::new(FilterExec::try_new(predicate_expr, scan.clone())?); @@ -1590,7 +1579,7 @@ pub(crate) async fn scan_memory_table( ))? .to_owned(), ); - fields.push(Field::new(PATH_COLUMN, DataType::Utf8, false)); + fields.push(Field::new(PATH_COLUMN, ArrowDataType::Utf8, false)); for field in schema.fields() { if field.name().starts_with("partition.") { @@ -1775,12 +1764,14 @@ impl From for DeltaColumn { #[cfg(test)] mod tests { - use crate::writer::test_utils::get_delta_schema; - use arrow::array::StructArray; - use arrow::datatypes::{DataType, Field, Schema}; + use arrow_array::StructArray; + use arrow_schema::Schema; use chrono::{TimeZone, Utc}; use datafusion::assert_batches_sorted_eq; + use datafusion::datasource::physical_plan::ParquetExec; use datafusion::physical_plan::empty::EmptyExec; + use datafusion::physical_plan::{visit_execution_plan, ExecutionPlanVisitor, PhysicalExpr}; + use datafusion_expr::lit; use datafusion_proto::physical_plan::AsExecutionPlan; use datafusion_proto::protobuf; use object_store::path::Path; @@ -1788,6 +1779,8 @@ mod tests { use std::ops::Deref; use super::*; + use crate::operations::write::SchemaMode; + use crate::writer::test_utils::get_delta_schema; // test deserialization of serialized partition values. 
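The rewritten find-files check above reads the volatility straight from the UDF's signature and rejects anything that is not immutable. A standalone sketch of that gate, with `String` standing in for the crate's error type:

```rust
use datafusion_expr::Volatility;

// Only immutable (deterministic, input-only) functions may appear in a
// find-files predicate; stable and volatile functions are rejected.
fn check_volatility(name: &str, volatility: Volatility) -> Result<(), String> {
    match volatility {
        Volatility::Immutable => Ok(()),
        _ => Err(format!(
            "Find files predicate contains nondeterministic function {name}"
        )),
    }
}

fn main() {
    assert!(check_volatility("concat", Volatility::Immutable).is_ok());
    assert!(check_volatility("random", Volatility::Volatile).is_err());
}
```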
// https://github.com/delta-io/delta/blob/master/PROTOCOL.md#partition-value-serialization @@ -1891,7 +1884,7 @@ mod tests { let file = partitioned_file_from_action(&action, &part_columns, &schema); let ref_file = PartitionedFile { object_meta: object_store::ObjectMeta { - location: Path::from("year=2015/month=1/part-00000-4dcb50d3-d017-450c-9df7-a7257dbd3c5d-c000.snappy.parquet".to_string()), + location: Path::from("year=2015/month=1/part-00000-4dcb50d3-d017-450c-9df7-a7257dbd3c5d-c000.snappy.parquet".to_string()), last_modified: Utc.timestamp_millis_opt(1660497727833).unwrap(), size: 10644, e_tag: None, @@ -1900,6 +1893,7 @@ mod tests { partition_values: [ScalarValue::Int64(Some(2015)), ScalarValue::Int64(Some(1))].to_vec(), range: None, extensions: None, + statistics: None, }; assert_eq!(file.partition_values, ref_file.partition_values) } @@ -1907,8 +1901,8 @@ mod tests { #[tokio::test] async fn test_enforce_invariants() { let schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Utf8, false), - Field::new("b", DataType::Int32, false), + Field::new("a", ArrowDataType::Utf8, false), + Field::new("b", ArrowDataType::Int32, false), ])); let batch = RecordBatch::try_new( Arc::clone(&schema), @@ -1960,7 +1954,7 @@ mod tests { let struct_fields = schema.fields().clone(); let schema = Arc::new(Schema::new(vec![Field::new( "x", - DataType::Struct(struct_fields), + ArrowDataType::Struct(struct_fields), false, )])); let inner = Arc::new(StructArray::from(batch)); @@ -1980,14 +1974,15 @@ mod tests { let codec = DeltaPhysicalCodec {}; let schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Utf8, false), - Field::new("b", DataType::Int32, false), + Field::new("a", ArrowDataType::Utf8, false), + Field::new("b", ArrowDataType::Int32, false), ])); let exec_plan = Arc::from(DeltaScan { table_uri: "s3://my_bucket/this/is/some/path".to_string(), parquet_scan: Arc::from(EmptyExec::new(schema.clone())), config: DeltaScanConfig::default(), logical_schema: schema.clone(), + metrics: ExecutionPlanMetricsSet::new(), }); let proto: protobuf::PhysicalPlanNode = protobuf::PhysicalPlanNode::try_from_physical_plan(exec_plan.clone(), &codec) @@ -2036,14 +2031,14 @@ mod tests { // Tests issue (1787) where partition columns were incorrect when they // have a different order in the metadata and table schema let schema = Arc::new(ArrowSchema::new(vec![ - Field::new("modified", DataType::Utf8, true), - Field::new("id", DataType::Utf8, true), - Field::new("value", DataType::Int32, true), + Field::new("modified", ArrowDataType::Utf8, true), + Field::new("id", ArrowDataType::Utf8, true), + Field::new("value", ArrowDataType::Int32, true), ])); let table = crate::DeltaOps::new_in_memory() .create() - .with_columns(get_delta_schema().fields().clone()) + .with_columns(get_delta_schema().fields().cloned()) .with_partition_columns(["modified", "id"]) .await .unwrap(); @@ -2108,9 +2103,9 @@ mod tests { #[tokio::test] async fn delta_scan_case_sensitive() { let schema = Arc::new(ArrowSchema::new(vec![ - Field::new("moDified", DataType::Utf8, true), - Field::new("ID", DataType::Utf8, true), - Field::new("vaLue", DataType::Int32, true), + Field::new("moDified", ArrowDataType::Utf8, true), + Field::new("ID", ArrowDataType::Utf8, true), + Field::new("vaLue", ArrowDataType::Int32, true), ])); let batch = RecordBatch::try_new( @@ -2173,4 +2168,402 @@ mod tests { assert_batches_sorted_eq!(&expected, &actual); */ } + + #[tokio::test] + async fn delta_scan_supports_missing_columns() { + let schema1 = 
Arc::new(ArrowSchema::new(vec![Field::new( + "col_1", + ArrowDataType::Utf8, + true, + )])); + + let batch1 = RecordBatch::try_new( + schema1.clone(), + vec![Arc::new(arrow::array::StringArray::from(vec![ + Some("A"), + Some("B"), + ]))], + ) + .unwrap(); + + let schema2 = Arc::new(ArrowSchema::new(vec![ + Field::new("col_1", ArrowDataType::Utf8, true), + Field::new("col_2", ArrowDataType::Utf8, true), + ])); + + let batch2 = RecordBatch::try_new( + schema2.clone(), + vec![ + Arc::new(arrow::array::StringArray::from(vec![ + Some("E"), + Some("F"), + Some("G"), + ])), + Arc::new(arrow::array::StringArray::from(vec![ + Some("E2"), + Some("F2"), + Some("G2"), + ])), + ], + ) + .unwrap(); + + let table = crate::DeltaOps::new_in_memory() + .write(vec![batch2]) + .with_save_mode(crate::protocol::SaveMode::Append) + .await + .unwrap(); + + let table = crate::DeltaOps(table) + .write(vec![batch1]) + .with_schema_mode(SchemaMode::Merge) + .with_save_mode(crate::protocol::SaveMode::Append) + .await + .unwrap(); + + let config = DeltaScanConfigBuilder::new() + .build(table.snapshot().unwrap()) + .unwrap(); + let log = table.log_store(); + + let provider = + DeltaTableProvider::try_new(table.snapshot().unwrap().clone(), log, config).unwrap(); + let ctx: SessionContext = DeltaSessionContext::default().into(); + ctx.register_table("test", Arc::new(provider)).unwrap(); + + let df = ctx.sql("select col_1, col_2 from test").await.unwrap(); + let actual = df.collect().await.unwrap(); + let expected = vec![ + "+-------+-------+", + "| col_1 | col_2 |", + "+-------+-------+", + "| A | |", + "| B | |", + "| E | E2 |", + "| F | F2 |", + "| G | G2 |", + "+-------+-------+", + ]; + assert_batches_sorted_eq!(&expected, &actual); + } + + #[tokio::test] + async fn delta_scan_supports_pushdown() { + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("col_1", ArrowDataType::Utf8, false), + Field::new("col_2", ArrowDataType::Utf8, false), + ])); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(arrow::array::StringArray::from(vec![ + Some("A"), + Some("B"), + Some("C"), + ])), + Arc::new(arrow::array::StringArray::from(vec![ + Some("A2"), + Some("B2"), + Some("C2"), + ])), + ], + ) + .unwrap(); + + let table = crate::DeltaOps::new_in_memory() + .write(vec![batch]) + .with_save_mode(crate::protocol::SaveMode::Append) + .await + .unwrap(); + + let config = DeltaScanConfigBuilder::new() + .build(table.snapshot().unwrap()) + .unwrap(); + let log = table.log_store(); + + let provider = + DeltaTableProvider::try_new(table.snapshot().unwrap().clone(), log, config).unwrap(); + + let mut cfg = SessionConfig::default(); + cfg.options_mut().execution.parquet.pushdown_filters = true; + let ctx = SessionContext::new_with_config(cfg); + ctx.register_table("test", Arc::new(provider)).unwrap(); + + let df = ctx + .sql("select col_1, col_2 from test WHERE col_1 = 'A'") + .await + .unwrap(); + let actual = df.collect().await.unwrap(); + let expected = vec![ + "+-------+-------+", + "| col_1 | col_2 |", + "+-------+-------+", + "| A | A2 |", + "+-------+-------+", + ]; + assert_batches_sorted_eq!(&expected, &actual); + } + + #[tokio::test] + async fn delta_scan_supports_nested_missing_columns() { + let column1_schema1: arrow::datatypes::Fields = + vec![Field::new("col_1a", ArrowDataType::Utf8, true)].into(); + let schema1 = Arc::new(ArrowSchema::new(vec![Field::new( + "col_1", + ArrowDataType::Struct(column1_schema1.clone()), + true, + )])); + + let batch1 = RecordBatch::try_new( + schema1.clone(), + 
vec![Arc::new(StructArray::new( + column1_schema1, + vec![Arc::new(arrow::array::StringArray::from(vec![ + Some("A"), + Some("B"), + ]))], + None, + ))], + ) + .unwrap(); + + let column1_schema2: arrow_schema::Fields = vec![ + Field::new("col_1a", ArrowDataType::Utf8, true), + Field::new("col_1b", ArrowDataType::Utf8, true), + ] + .into(); + let schema2 = Arc::new(ArrowSchema::new(vec![Field::new( + "col_1", + ArrowDataType::Struct(column1_schema2.clone()), + true, + )])); + + let batch2 = RecordBatch::try_new( + schema2.clone(), + vec![Arc::new(StructArray::new( + column1_schema2, + vec![ + Arc::new(arrow::array::StringArray::from(vec![ + Some("E"), + Some("F"), + Some("G"), + ])), + Arc::new(arrow::array::StringArray::from(vec![ + Some("E2"), + Some("F2"), + Some("G2"), + ])), + ], + None, + ))], + ) + .unwrap(); + + let table = crate::DeltaOps::new_in_memory() + .write(vec![batch1]) + .with_save_mode(crate::protocol::SaveMode::Append) + .await + .unwrap(); + + let table = crate::DeltaOps(table) + .write(vec![batch2]) + .with_schema_mode(SchemaMode::Merge) + .with_save_mode(crate::protocol::SaveMode::Append) + .await + .unwrap(); + + let config = DeltaScanConfigBuilder::new() + .build(table.snapshot().unwrap()) + .unwrap(); + let log = table.log_store(); + + let provider = + DeltaTableProvider::try_new(table.snapshot().unwrap().clone(), log, config).unwrap(); + let ctx: SessionContext = DeltaSessionContext::default().into(); + ctx.register_table("test", Arc::new(provider)).unwrap(); + + let df = ctx + .sql("select col_1.col_1a, col_1.col_1b from test") + .await + .unwrap(); + let actual = df.collect().await.unwrap(); + let expected = vec![ + "+--------------------+--------------------+", + "| test.col_1[col_1a] | test.col_1[col_1b] |", + "+--------------------+--------------------+", + "| A | |", + "| B | |", + "| E | E2 |", + "| F | F2 |", + "| G | G2 |", + "+--------------------+--------------------+", + ]; + assert_batches_sorted_eq!(&expected, &actual); + } + + #[tokio::test] + async fn test_multiple_predicate_pushdown() { + use crate::datafusion::prelude::SessionContext; + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("moDified", ArrowDataType::Utf8, true), + Field::new("id", ArrowDataType::Utf8, true), + Field::new("vaLue", ArrowDataType::Int32, true), + ])); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(arrow::array::StringArray::from(vec![ + "2021-02-01", + "2021-02-01", + "2021-02-02", + "2021-02-02", + ])), + Arc::new(arrow::array::StringArray::from(vec!["A", "B", "C", "D"])), + Arc::new(arrow::array::Int32Array::from(vec![1, 10, 20, 100])), + ], + ) + .unwrap(); + // write some data + let table = crate::DeltaOps::new_in_memory() + .write(vec![batch.clone()]) + .with_save_mode(crate::protocol::SaveMode::Append) + .await + .unwrap(); + + let datafusion = SessionContext::new(); + let table = Arc::new(table); + + datafusion.register_table("snapshot", table).unwrap(); + + let df = datafusion + .sql("select * from snapshot where id > 10000 and id < 20000") + .await + .unwrap(); + + df.collect().await.unwrap(); + } + + #[tokio::test] + async fn test_delta_scan_builder_no_scan_config() { + let arr: Arc = Arc::new(arrow::array::StringArray::from(vec!["s"])); + let batch = RecordBatch::try_from_iter_with_nullable(vec![("a", arr, false)]).unwrap(); + let table = crate::DeltaOps::new_in_memory() + .write(vec![batch]) + .with_save_mode(crate::protocol::SaveMode::Append) + .await + .unwrap(); + + let ctx = SessionContext::new(); + let state = 
ctx.state(); + let scan = DeltaScanBuilder::new(table.snapshot().unwrap(), table.log_store(), &state) + .with_filter(Some(col("a").eq(lit("s")))) + .build() + .await + .unwrap(); + + let mut visitor = ParquetPredicateVisitor::default(); + visit_execution_plan(&scan, &mut visitor).unwrap(); + + assert_eq!(visitor.predicate.unwrap().to_string(), "a@0 = s"); + assert_eq!( + visitor.pruning_predicate.unwrap().orig_expr().to_string(), + "a@0 = s" + ); + } + + #[tokio::test] + async fn test_delta_scan_builder_scan_config_disable_pushdown() { + let arr: Arc = Arc::new(arrow::array::StringArray::from(vec!["s"])); + let batch = RecordBatch::try_from_iter_with_nullable(vec![("a", arr, false)]).unwrap(); + let table = crate::DeltaOps::new_in_memory() + .write(vec![batch]) + .with_save_mode(crate::protocol::SaveMode::Append) + .await + .unwrap(); + + let snapshot = table.snapshot().unwrap(); + let ctx = SessionContext::new(); + let state = ctx.state(); + let scan = DeltaScanBuilder::new(snapshot, table.log_store(), &state) + .with_filter(Some(col("a").eq(lit("s")))) + .with_scan_config( + DeltaScanConfigBuilder::new() + .with_parquet_pushdown(false) + .build(snapshot) + .unwrap(), + ) + .build() + .await + .unwrap(); + + let mut visitor = ParquetPredicateVisitor::default(); + visit_execution_plan(&scan, &mut visitor).unwrap(); + + assert!(visitor.predicate.is_none()); + assert!(visitor.pruning_predicate.is_none()); + } + + #[tokio::test] + async fn test_delta_scan_applies_parquet_options() { + let arr: Arc = Arc::new(arrow::array::StringArray::from(vec!["s"])); + let batch = RecordBatch::try_from_iter_with_nullable(vec![("a", arr, false)]).unwrap(); + let table = crate::DeltaOps::new_in_memory() + .write(vec![batch]) + .with_save_mode(crate::protocol::SaveMode::Append) + .await + .unwrap(); + + let snapshot = table.snapshot().unwrap(); + + let mut config = SessionConfig::default(); + config.options_mut().execution.parquet.pushdown_filters = true; + let ctx = SessionContext::new_with_config(config); + let state = ctx.state(); + + let scan = DeltaScanBuilder::new(snapshot, table.log_store(), &state) + .build() + .await + .unwrap(); + + let mut visitor = ParquetOptionsVisitor::default(); + visit_execution_plan(&scan, &mut visitor).unwrap(); + + assert_eq!(ctx.copied_table_options().parquet, visitor.options.unwrap()); + } + + #[derive(Default)] + struct ParquetPredicateVisitor { + predicate: Option>, + pruning_predicate: Option>, + } + + impl ExecutionPlanVisitor for ParquetPredicateVisitor { + type Error = DataFusionError; + + fn pre_visit(&mut self, plan: &dyn ExecutionPlan) -> Result { + if let Some(parquet_exec) = plan.as_any().downcast_ref::() { + self.predicate = parquet_exec.predicate().cloned(); + self.pruning_predicate = parquet_exec.pruning_predicate().cloned(); + } + Ok(true) + } + } + + #[derive(Default)] + struct ParquetOptionsVisitor { + options: Option, + } + + impl ExecutionPlanVisitor for ParquetOptionsVisitor { + type Error = DataFusionError; + + fn pre_visit(&mut self, plan: &dyn ExecutionPlan) -> Result { + if let Some(parquet_exec) = plan.as_any().downcast_ref::() { + self.options = Some(parquet_exec.table_parquet_options().clone()) + } + Ok(true) + } + } } diff --git a/crates/core/src/delta_datafusion/physical.rs b/crates/core/src/delta_datafusion/physical.rs index 0251836fa8..dd28e0d93b 100644 --- a/crates/core/src/delta_datafusion/physical.rs +++ b/crates/core/src/delta_datafusion/physical.rs @@ -1,13 +1,12 @@ //! 
Physical Operations for DataFusion use std::sync::Arc; +use arrow_array::RecordBatch; use arrow_schema::SchemaRef; -use datafusion::arrow::record_batch::RecordBatch; use datafusion::error::Result as DataFusionResult; -use datafusion::physical_plan::DisplayAs; -use datafusion::physical_plan::{ - metrics::{ExecutionPlanMetricsSet, MetricsSet}, - ExecutionPlan, RecordBatchStream, SendableRecordBatchStream, +use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet}; +use datafusion_physical_plan::{ + DisplayAs, ExecutionPlan, RecordBatchStream, SendableRecordBatchStream, }; use futures::{Stream, StreamExt}; @@ -74,6 +73,10 @@ impl DisplayAs for MetricObserverExec { } impl ExecutionPlan for MetricObserverExec { + fn name(&self) -> &str { + Self::static_name() + } + fn as_any(&self) -> &dyn std::any::Any { self } @@ -86,8 +89,8 @@ impl ExecutionPlan for MetricObserverExec { self.parent.properties() } - fn children(&self) -> Vec> { - vec![self.parent.clone()] + fn children(&self) -> Vec<&Arc> { + vec![&self.parent] } fn execute( @@ -174,3 +177,7 @@ pub(crate) fn find_metric_node( None } + +pub(crate) fn get_metric(metrics: &MetricsSet, name: &str) -> usize { + metrics.sum_by_name(name).map(|m| m.as_usize()).unwrap_or(0) +} diff --git a/crates/core/src/delta_datafusion/planner.rs b/crates/core/src/delta_datafusion/planner.rs new file mode 100644 index 0000000000..6119b78ce6 --- /dev/null +++ b/crates/core/src/delta_datafusion/planner.rs @@ -0,0 +1,58 @@ +//! Custom planners for datafusion so that you can convert custom nodes, can be used +//! to trace custom metrics in an operation +//! +//! # Example +//! +//! #[derive(Clone)] +//! struct MergeMetricExtensionPlanner {} +//! +//! #[async_trait] +//! impl ExtensionPlanner for MergeMetricExtensionPlanner { +//! async fn plan_extension( +//! &self, +//! planner: &dyn PhysicalPlanner, +//! node: &dyn UserDefinedLogicalNode, +//! _logical_inputs: &[&LogicalPlan], +//! physical_inputs: &[Arc], +//! session_state: &SessionState, +//! ) -> DataFusionResult>> {} +//! +//! let merge_planner = DeltaPlanner:: { +//! extension_planner: MergeMetricExtensionPlanner {} +//! }; +//! +//! 
let state = state.with_query_planner(Arc::new(merge_planner)); +use std::sync::Arc; + +use async_trait::async_trait; +use datafusion::physical_planner::PhysicalPlanner; +use datafusion::{ + execution::{context::QueryPlanner, session_state::SessionState}, + physical_plan::ExecutionPlan, + physical_planner::{DefaultPhysicalPlanner, ExtensionPlanner}, +}; +use datafusion_expr::LogicalPlan; + +use crate::delta_datafusion::DataFusionResult; + +/// Deltaplanner +pub struct DeltaPlanner { + /// custom extension planner + pub extension_planner: T, +} + +#[async_trait] +impl QueryPlanner for DeltaPlanner { + async fn create_physical_plan( + &self, + logical_plan: &LogicalPlan, + session_state: &SessionState, + ) -> DataFusionResult> { + let planner = Arc::new(Box::new(DefaultPhysicalPlanner::with_extension_planners( + vec![Arc::new(self.extension_planner.clone())], + ))); + planner + .create_physical_plan(logical_plan, session_state) + .await + } +} diff --git a/crates/core/src/delta_datafusion/schema_adapter.rs b/crates/core/src/delta_datafusion/schema_adapter.rs new file mode 100644 index 0000000000..99a97e2130 --- /dev/null +++ b/crates/core/src/delta_datafusion/schema_adapter.rs @@ -0,0 +1,82 @@ +use std::fmt::Debug; +use std::sync::Arc; + +use arrow_array::RecordBatch; +use arrow_schema::{Schema, SchemaRef}; +use datafusion::datasource::schema_adapter::{SchemaAdapter, SchemaAdapterFactory, SchemaMapper}; + +use crate::operations::cast::cast_record_batch; + +/// A Schema Adapter Factory which provides casting record batches from parquet to meet +/// delta lake conventions. +#[derive(Debug)] +pub(crate) struct DeltaSchemaAdapterFactory {} + +impl SchemaAdapterFactory for DeltaSchemaAdapterFactory { + fn create(&self, schema: SchemaRef) -> Box { + Box::new(DeltaSchemaAdapter { + table_schema: schema, + }) + } +} + +pub(crate) struct DeltaSchemaAdapter { + /// Schema for the table + table_schema: SchemaRef, +} + +impl SchemaAdapter for DeltaSchemaAdapter { + fn map_column_index(&self, index: usize, file_schema: &Schema) -> Option { + let field = self.table_schema.field(index); + Some(file_schema.fields.find(field.name())?.0) + } + + fn map_schema( + &self, + file_schema: &Schema, + ) -> datafusion_common::Result<(Arc, Vec)> { + let mut projection = Vec::with_capacity(file_schema.fields().len()); + + for (file_idx, file_field) in file_schema.fields.iter().enumerate() { + if self.table_schema.fields().find(file_field.name()).is_some() { + projection.push(file_idx); + } + } + + Ok(( + Arc::new(SchemaMapping { + table_schema: self.table_schema.clone(), + }), + projection, + )) + } +} + +#[derive(Debug)] +pub(crate) struct SchemaMapping { + table_schema: SchemaRef, +} + +impl SchemaMapper for SchemaMapping { + fn map_batch(&self, batch: RecordBatch) -> datafusion_common::Result { + let record_batch = cast_record_batch(&batch, self.table_schema.clone(), false, true)?; + Ok(record_batch) + } + + fn map_partial_batch(&self, batch: RecordBatch) -> datafusion_common::Result { + let partial_table_schema = Arc::new(Schema::new( + batch + .schema() + .fields() + .iter() + .filter_map(|batch_field| { + self.table_schema.field_with_name(batch_field.name()).ok() + }) + .cloned() + .collect::>(), + )); + + let record_batch = cast_record_batch(&batch, partial_table_schema, false, true)?; + Ok(record_batch) + } +} diff --git a/crates/core/src/errors.rs b/crates/core/src/errors.rs index 9c3b04aac3..609bc16656 100644 --- a/crates/core/src/errors.rs +++ b/crates/core/src/errors.rs @@ -11,6 +11,9 @@ pub type DeltaResult 
= Result; #[allow(missing_docs)] #[derive(thiserror::Error, Debug)] pub enum DeltaTableError { + #[error("Kernel error: {0}")] + KernelError(#[from] delta_kernel::error::Error), + #[error("Delta protocol violation: {source}")] Protocol { source: ProtocolError }, @@ -218,6 +221,9 @@ pub enum DeltaTableError { #[error("Table has not yet been initialized")] NotInitialized, + #[error("Table has not yet been initialized with files, therefore {0} is not supported")] + NotInitializedWithFiles(String), + #[error("Change Data not enabled for version: {version}, Start: {start}, End: {end}")] ChangeDataNotRecorded { version: i64, start: i64, end: i64 }, @@ -263,4 +269,9 @@ impl DeltaTableError { ); Self::NotATable(msg) } + + /// Create a [Generic](DeltaTableError::Generic) error with the given message. + pub fn generic(msg: impl ToString) -> Self { + Self::Generic(msg.to_string()) + } } diff --git a/crates/core/src/kernel/arrow/mod.rs b/crates/core/src/kernel/arrow/mod.rs index 648ad16bbc..3ddd35560c 100644 --- a/crates/core/src/kernel/arrow/mod.rs +++ b/crates/core/src/kernel/arrow/mod.rs @@ -3,275 +3,17 @@ use std::sync::Arc; use arrow_schema::{ - ArrowError, DataType as ArrowDataType, Field as ArrowField, FieldRef as ArrowFieldRef, - Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, TimeUnit, + DataType as ArrowDataType, Field as ArrowField, FieldRef as ArrowFieldRef, + Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, }; use lazy_static::lazy_static; -use super::{ - ActionType, ArrayType, DataType, MapType, PrimitiveType, StructField, StructType, - DECIMAL_MAX_PRECISION, DECIMAL_MAX_SCALE, -}; - pub(crate) mod extract; pub(crate) mod json; -const MAP_ROOT_DEFAULT: &str = "entries"; +const MAP_ROOT_DEFAULT: &str = "key_value"; const MAP_KEY_DEFAULT: &str = "key"; const MAP_VALUE_DEFAULT: &str = "value"; -const LIST_ROOT_DEFAULT: &str = "item"; - -impl TryFrom for ArrowField { - type Error = ArrowError; - - fn try_from(value: ActionType) -> Result { - value.schema_field().try_into() - } -} - -impl TryFrom<&StructType> for ArrowSchema { - type Error = ArrowError; - - fn try_from(s: &StructType) -> Result { - let fields = s - .fields() - .iter() - .map(TryInto::try_into) - .collect::, ArrowError>>()?; - - Ok(ArrowSchema::new(fields)) - } -} - -impl TryFrom<&StructField> for ArrowField { - type Error = ArrowError; - - fn try_from(f: &StructField) -> Result { - let metadata = f - .metadata() - .iter() - .map(|(key, val)| Ok((key.clone(), serde_json::to_string(val)?))) - .collect::>() - .map_err(|err| ArrowError::JsonError(err.to_string()))?; - - let field = ArrowField::new( - f.name(), - ArrowDataType::try_from(f.data_type())?, - f.is_nullable(), - ) - .with_metadata(metadata); - - Ok(field) - } -} - -impl TryFrom<&ArrayType> for ArrowField { - type Error = ArrowError; - fn try_from(a: &ArrayType) -> Result { - Ok(ArrowField::new( - LIST_ROOT_DEFAULT, - ArrowDataType::try_from(a.element_type())?, - // TODO check how to handle nullability - a.contains_null(), - )) - } -} - -impl TryFrom<&MapType> for ArrowField { - type Error = ArrowError; - - fn try_from(a: &MapType) -> Result { - Ok(ArrowField::new( - MAP_ROOT_DEFAULT, - ArrowDataType::Struct( - vec![ - ArrowField::new( - MAP_KEY_DEFAULT, - ArrowDataType::try_from(a.key_type())?, - false, - ), - ArrowField::new( - MAP_VALUE_DEFAULT, - ArrowDataType::try_from(a.value_type())?, - a.value_contains_null(), - ), - ] - .into(), - ), - // always non-null - false, - )) - } -} - -impl TryFrom<&DataType> for ArrowDataType { - type Error = ArrowError; - - fn 
try_from(t: &DataType) -> Result { - match t { - DataType::Primitive(p) => { - match p { - PrimitiveType::String => Ok(ArrowDataType::Utf8), - PrimitiveType::Long => Ok(ArrowDataType::Int64), // undocumented type - PrimitiveType::Integer => Ok(ArrowDataType::Int32), - PrimitiveType::Short => Ok(ArrowDataType::Int16), - PrimitiveType::Byte => Ok(ArrowDataType::Int8), - PrimitiveType::Float => Ok(ArrowDataType::Float32), - PrimitiveType::Double => Ok(ArrowDataType::Float64), - PrimitiveType::Boolean => Ok(ArrowDataType::Boolean), - PrimitiveType::Binary => Ok(ArrowDataType::Binary), - PrimitiveType::Decimal(precision, scale) => { - if precision <= &DECIMAL_MAX_PRECISION && scale <= &DECIMAL_MAX_SCALE { - Ok(ArrowDataType::Decimal128(*precision, *scale)) - } else { - Err(ArrowError::CastError(format!( - "Precision/scale can not be larger than 38 ({},{})", - precision, scale - ))) - } - } - PrimitiveType::Date => { - // A calendar date, represented as a year-month-day triple without a - // timezone. Stored as 4 bytes integer representing days since 1970-01-01 - Ok(ArrowDataType::Date32) - } - PrimitiveType::Timestamp => Ok(ArrowDataType::Timestamp( - TimeUnit::Microsecond, - Some("UTC".into()), - )), - PrimitiveType::TimestampNtz => { - Ok(ArrowDataType::Timestamp(TimeUnit::Microsecond, None)) - } - } - } - DataType::Struct(s) => Ok(ArrowDataType::Struct( - s.fields() - .iter() - .map(TryInto::try_into) - .collect::, ArrowError>>()? - .into(), - )), - DataType::Array(a) => Ok(ArrowDataType::List(Arc::new(a.as_ref().try_into()?))), - DataType::Map(m) => Ok(ArrowDataType::Map(Arc::new(m.as_ref().try_into()?), false)), - } - } -} - -impl TryFrom<&ArrowSchema> for StructType { - type Error = ArrowError; - - fn try_from(arrow_schema: &ArrowSchema) -> Result { - let new_fields: Result, _> = arrow_schema - .fields() - .iter() - .map(|field| field.as_ref().try_into()) - .collect(); - Ok(StructType::new(new_fields?)) - } -} - -impl TryFrom for StructType { - type Error = ArrowError; - - fn try_from(arrow_schema: ArrowSchemaRef) -> Result { - arrow_schema.as_ref().try_into() - } -} - -impl TryFrom<&ArrowField> for StructField { - type Error = ArrowError; - - fn try_from(arrow_field: &ArrowField) -> Result { - Ok(StructField::new( - arrow_field.name().clone(), - DataType::try_from(arrow_field.data_type())?, - arrow_field.is_nullable(), - ) - .with_metadata(arrow_field.metadata().iter().map(|(k, v)| (k.clone(), v)))) - } -} - -impl TryFrom<&ArrowDataType> for DataType { - type Error = ArrowError; - - fn try_from(arrow_datatype: &ArrowDataType) -> Result { - match arrow_datatype { - ArrowDataType::Utf8 => Ok(DataType::Primitive(PrimitiveType::String)), - ArrowDataType::LargeUtf8 => Ok(DataType::Primitive(PrimitiveType::String)), - ArrowDataType::Int64 => Ok(DataType::Primitive(PrimitiveType::Long)), // undocumented type - ArrowDataType::Int32 => Ok(DataType::Primitive(PrimitiveType::Integer)), - ArrowDataType::Int16 => Ok(DataType::Primitive(PrimitiveType::Short)), - ArrowDataType::Int8 => Ok(DataType::Primitive(PrimitiveType::Byte)), - ArrowDataType::UInt64 => Ok(DataType::Primitive(PrimitiveType::Long)), // undocumented type - ArrowDataType::UInt32 => Ok(DataType::Primitive(PrimitiveType::Integer)), - ArrowDataType::UInt16 => Ok(DataType::Primitive(PrimitiveType::Short)), - ArrowDataType::UInt8 => Ok(DataType::Primitive(PrimitiveType::Byte)), - ArrowDataType::Float32 => Ok(DataType::Primitive(PrimitiveType::Float)), - ArrowDataType::Float64 => Ok(DataType::Primitive(PrimitiveType::Double)), - 
ArrowDataType::Boolean => Ok(DataType::Primitive(PrimitiveType::Boolean)), - ArrowDataType::Binary => Ok(DataType::Primitive(PrimitiveType::Binary)), - ArrowDataType::FixedSizeBinary(_) => Ok(DataType::Primitive(PrimitiveType::Binary)), - ArrowDataType::LargeBinary => Ok(DataType::Primitive(PrimitiveType::Binary)), - ArrowDataType::Decimal128(p, s) => { - Ok(DataType::Primitive(PrimitiveType::Decimal(*p, *s))) - } - ArrowDataType::Decimal256(p, s) => DataType::decimal(*p, *s).map_err(|_| { - ArrowError::SchemaError(format!( - "Invalid data type for Delta Lake: decimal({},{})", - p, s - )) - }), - ArrowDataType::Date32 => Ok(DataType::Primitive(PrimitiveType::Date)), - ArrowDataType::Date64 => Ok(DataType::Primitive(PrimitiveType::Date)), - ArrowDataType::Timestamp(TimeUnit::Microsecond, None) => { - Ok(DataType::Primitive(PrimitiveType::TimestampNtz)) - } - ArrowDataType::Timestamp(TimeUnit::Microsecond, Some(tz)) - if tz.eq_ignore_ascii_case("utc") => - { - Ok(DataType::Primitive(PrimitiveType::Timestamp)) - } - ArrowDataType::Struct(fields) => { - let converted_fields: Result, _> = fields - .iter() - .map(|field| field.as_ref().try_into()) - .collect(); - Ok(DataType::Struct(Box::new(StructType::new( - converted_fields?, - )))) - } - ArrowDataType::List(field) => Ok(DataType::Array(Box::new(ArrayType::new( - (*field).data_type().try_into()?, - (*field).is_nullable(), - )))), - ArrowDataType::LargeList(field) => Ok(DataType::Array(Box::new(ArrayType::new( - (*field).data_type().try_into()?, - (*field).is_nullable(), - )))), - ArrowDataType::FixedSizeList(field, _) => Ok(DataType::Array(Box::new( - ArrayType::new((*field).data_type().try_into()?, (*field).is_nullable()), - ))), - ArrowDataType::Map(field, _) => { - if let ArrowDataType::Struct(struct_fields) = field.data_type() { - let key_type = struct_fields[0].data_type().try_into()?; - let value_type = struct_fields[1].data_type().try_into()?; - let value_type_nullable = struct_fields[1].is_nullable(); - Ok(DataType::Map(Box::new(MapType::new( - key_type, - value_type, - value_type_nullable, - )))) - } else { - panic!("DataType::Map should contain a struct field child"); - } - } - ArrowDataType::Dictionary(_, value_type) => Ok(value_type.as_ref().try_into()?), - s => Err(ArrowError::SchemaError(format!( - "Invalid data type for Delta Lake: {s}" - ))), - } - } -} macro_rules! 
arrow_map { ($fieldname: ident, null) => { @@ -507,13 +249,15 @@ pub(crate) fn delta_log_schema_for_table( .iter() .for_each(|f| max_min_schema_for_fields(&mut max_min_vec, f)); - stats_parsed_fields.extend(["minValues", "maxValues"].into_iter().map(|name| { - ArrowField::new( - name, - ArrowDataType::Struct(max_min_vec.clone().into()), - true, - ) - })); + if !max_min_vec.is_empty() { + stats_parsed_fields.extend(["minValues", "maxValues"].into_iter().map(|name| { + ArrowField::new( + name, + ArrowDataType::Struct(max_min_vec.clone().into()), + true, + ) + })); + } let mut null_count_vec = Vec::new(); non_partition_fields @@ -585,8 +329,7 @@ fn max_min_schema_for_fields(dest: &mut Vec, f: &ArrowField) { // don't compute min or max for list, map or binary types ArrowDataType::List(_) | ArrowDataType::Map(_, _) | ArrowDataType::Binary => { /* noop */ } _ => { - let f = f.clone(); - dest.push(f); + dest.push(ArrowField::new(f.name(), f.data_type().clone(), true)); } } } @@ -615,15 +358,15 @@ fn null_count_schema_for_fields(dest: &mut Vec, f: &ArrowField) { #[cfg(test)] mod tests { + use std::collections::HashMap; + use std::sync::Arc; + use arrow::array::ArrayData; - use arrow_array::Array; - use arrow_array::{make_array, ArrayRef, MapArray, StringArray, StructArray}; + use arrow_array::{Array, BinaryArray, MapArray, RecordBatch, StringArray, StructArray}; use arrow_buffer::{Buffer, ToByteSlice}; - use arrow_schema::Field; + use delta_kernel::schema::{DataType, MapType, PrimitiveType, StructField, StructType}; use super::*; - use std::collections::HashMap; - use std::sync::Arc; #[test] fn delta_log_schema_for_table_test() { @@ -766,108 +509,6 @@ mod tests { } } - #[test] - fn test_arrow_from_delta_decimal_type() { - let precision = 20; - let scale = 2; - let decimal_field = DataType::Primitive(PrimitiveType::Decimal(precision, scale)); - assert_eq!( - >::try_from(&decimal_field).unwrap(), - ArrowDataType::Decimal128(precision, scale) - ); - } - - #[test] - fn test_arrow_from_delta_decimal_type_invalid_precision() { - let precision = 39; - let scale = 2; - assert!(matches!( - >::try_from(&ArrowDataType::Decimal256( - precision, scale - )) - .unwrap_err(), - _ - )); - } - - #[test] - fn test_arrow_from_delta_decimal_type_invalid_scale() { - let precision = 2; - let scale = 39; - assert!(matches!( - >::try_from(&ArrowDataType::Decimal256( - precision, scale - )) - .unwrap_err(), - _ - )); - } - - #[test] - fn test_arrow_from_delta_timestamp_type() { - let timestamp_field = DataType::Primitive(PrimitiveType::Timestamp); - assert_eq!( - >::try_from(×tamp_field).unwrap(), - ArrowDataType::Timestamp(TimeUnit::Microsecond, Some("UTC".to_string().into())) - ); - } - - #[test] - fn test_arrow_from_delta_timestampntz_type() { - let timestamp_field = DataType::Primitive(PrimitiveType::TimestampNtz); - assert_eq!( - >::try_from(×tamp_field).unwrap(), - ArrowDataType::Timestamp(TimeUnit::Microsecond, None) - ); - } - - #[test] - fn test_delta_from_arrow_timestamp_type_no_tz() { - let timestamp_field = ArrowDataType::Timestamp(TimeUnit::Microsecond, None); - assert_eq!( - >::try_from(×tamp_field).unwrap(), - DataType::Primitive(PrimitiveType::TimestampNtz) - ); - } - - #[test] - fn test_delta_from_arrow_timestamp_type_with_tz() { - let timestamp_field = - ArrowDataType::Timestamp(TimeUnit::Microsecond, Some("UTC".to_string().into())); - assert_eq!( - >::try_from(×tamp_field).unwrap(), - DataType::Primitive(PrimitiveType::Timestamp) - ); - } - - #[test] - fn test_delta_from_arrow_map_type() { - let 
arrow_map = ArrowDataType::Map( - Arc::new(ArrowField::new( - "entries", - ArrowDataType::Struct( - vec![ - ArrowField::new("key", ArrowDataType::Int8, false), - ArrowField::new("value", ArrowDataType::Binary, true), - ] - .into(), - ), - false, - )), - false, - ); - let converted_map: DataType = (&arrow_map).try_into().unwrap(); - - assert_eq!( - converted_map, - DataType::Map(Box::new(MapType::new( - DataType::Primitive(PrimitiveType::Byte), - DataType::Primitive(PrimitiveType::Binary), - true, - ))) - ); - } - #[test] fn test_record_batch_from_map_type() { let keys = vec!["0", "1", "5", "6", "7"]; @@ -881,52 +522,36 @@ mod tests { let entry_offsets = vec![0u32, 1, 1, 4, 5, 5]; let num_rows = keys.len(); - // Copied the function `new_from_string` with the patched code from https://github.com/apache/arrow-rs/pull/4808 - // This should be reverted back [`MapArray::new_from_strings`] once arrow is upgraded in this project. - fn new_from_strings<'a>( - keys: impl Iterator, - values: &dyn Array, - entry_offsets: &[u32], - ) -> Result { - let entry_offsets_buffer = Buffer::from(entry_offsets.to_byte_slice()); - let keys_data = StringArray::from_iter_values(keys); - - let keys_field = Arc::new(Field::new("key", ArrowDataType::Utf8, false)); - let values_field = Arc::new(Field::new( - "value", - values.data_type().clone(), - values.null_count() > 0, - )); - - let entry_struct = StructArray::from(vec![ - (keys_field, Arc::new(keys_data) as ArrayRef), - (values_field, make_array(values.to_data())), - ]); - - let map_data_type = ArrowDataType::Map( - Arc::new(Field::new( - "entries", - entry_struct.data_type().clone(), - false, - )), - false, - ); - - let map_data = ArrayData::builder(map_data_type) - .len(entry_offsets.len() - 1) - .add_buffer(entry_offsets_buffer) - .add_child_data(entry_struct.into_data()) - .build()?; + let key_field = Arc::new(ArrowField::new(MAP_KEY_DEFAULT, ArrowDataType::Utf8, false)); + let value_field = Arc::new(ArrowField::new( + MAP_VALUE_DEFAULT, + ArrowDataType::Binary, + false, + )); + let key_value_field = ArrowField::new_struct( + MAP_ROOT_DEFAULT, + vec![key_field.clone(), value_field.clone()], + false, + ); + let key_value_array = StructArray::new( + vec![key_field, value_field].into(), + vec![ + Arc::new(StringArray::from(keys)), + Arc::new(BinaryArray::from(values)), + ], + None, + ); + let entry_offsets_buffer = Buffer::from(entry_offsets.as_slice().to_byte_slice()); - Ok(MapArray::from(map_data)) - } + let map_data_type = ArrowDataType::Map(Arc::new(key_value_field), false); + let map_data = ArrayData::builder(map_data_type) + .len(entry_offsets.len() - 1) + .add_buffer(entry_offsets_buffer) + .add_child_data(key_value_array.into_data()) + .build() + .unwrap(); - let map_array = new_from_strings( - keys.into_iter(), - &arrow::array::BinaryArray::from(values), - entry_offsets.as_slice(), - ) - .expect("Could not create a map array"); + let map_array = MapArray::from(map_data); let schema = >::try_from(&StructType::new(vec![ @@ -942,9 +567,8 @@ mod tests { ])) .expect("Could not get schema"); - let record_batch = - arrow::record_batch::RecordBatch::try_new(Arc::new(schema), vec![Arc::new(map_array)]) - .expect("Failed to create RecordBatch"); + let record_batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(map_array)]) + .expect("Failed to create RecordBatch"); assert_eq!(record_batch.num_columns(), 1); assert_eq!(record_batch.num_rows(), num_rows); diff --git a/crates/core/src/kernel/error.rs b/crates/core/src/kernel/error.rs index 
853b10e411..cefe81bf9d 100644 --- a/crates/core/src/kernel/error.rs +++ b/crates/core/src/kernel/error.rs @@ -71,13 +71,3 @@ pub enum Error { #[error("Failed to parse value '{0}' as '{1}'")] Parse(String, DataType), } - -#[cfg(feature = "object_store")] -impl From for Error { - fn from(value: object_store::Error) -> Self { - match value { - object_store::Error::NotFound { path, .. } => Self::FileNotFound(path), - err => Self::ObjectStore(err), - } - } -} diff --git a/crates/core/src/kernel/expressions/eval.rs b/crates/core/src/kernel/expressions/eval.rs deleted file mode 100644 index cb6beea3ad..0000000000 --- a/crates/core/src/kernel/expressions/eval.rs +++ /dev/null @@ -1,384 +0,0 @@ -//! Default Expression handler. -//! -//! Expression handling based on arrow-rs compute kernels. - -use std::sync::Arc; - -use arrow_arith::boolean::{and, is_null, not, or}; -use arrow_arith::numeric::{add, div, mul, sub}; -use arrow_array::{ - Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Datum, Decimal128Array, Float32Array, - Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, RecordBatch, StringArray, - StructArray, TimestampMicrosecondArray, -}; -use arrow_ord::cmp::{eq, gt, gt_eq, lt, lt_eq, neq}; -use arrow_schema::{ArrowError, Field as ArrowField, Schema as ArrowSchema}; -use arrow_select::nullif::nullif; - -use crate::kernel::arrow::extract::extract_column; -use crate::kernel::error::{DeltaResult, Error}; -use crate::kernel::expressions::{scalars::Scalar, Expression}; -use crate::kernel::expressions::{BinaryOperator, UnaryOperator}; -use crate::kernel::{DataType, PrimitiveType, VariadicOperator}; - -fn downcast_to_bool(arr: &dyn Array) -> DeltaResult<&BooleanArray> { - arr.as_any() - .downcast_ref::() - .ok_or(Error::Generic("expected boolean array".to_string())) -} - -fn wrap_comparison_result(arr: BooleanArray) -> ArrayRef { - Arc::new(arr) as Arc -} - -// TODO leverage scalars / Datum - -impl Scalar { - /// Convert scalar to arrow array. 
- pub fn to_array(&self, num_rows: usize) -> DeltaResult { - use Scalar::*; - let arr: ArrayRef = match self { - Integer(val) => Arc::new(Int32Array::from_value(*val, num_rows)), - Long(val) => Arc::new(Int64Array::from_value(*val, num_rows)), - Short(val) => Arc::new(Int16Array::from_value(*val, num_rows)), - Byte(val) => Arc::new(Int8Array::from_value(*val, num_rows)), - Float(val) => Arc::new(Float32Array::from_value(*val, num_rows)), - Double(val) => Arc::new(Float64Array::from_value(*val, num_rows)), - String(val) => Arc::new(StringArray::from(vec![val.clone(); num_rows])), - Boolean(val) => Arc::new(BooleanArray::from(vec![*val; num_rows])), - Timestamp(val) => { - Arc::new(TimestampMicrosecondArray::from_value(*val, num_rows).with_timezone("UTC")) - } - TimestampNtz(val) => Arc::new(TimestampMicrosecondArray::from_value(*val, num_rows)), - Date(val) => Arc::new(Date32Array::from_value(*val, num_rows)), - Binary(val) => Arc::new(BinaryArray::from(vec![val.as_slice(); num_rows])), - Decimal(val, precision, scale) => Arc::new( - Decimal128Array::from_value(*val, num_rows) - .with_precision_and_scale(*precision, *scale)?, - ), - Null(data_type) => match data_type { - DataType::Primitive(primitive) => match primitive { - PrimitiveType::Byte => Arc::new(Int8Array::new_null(num_rows)), - PrimitiveType::Short => Arc::new(Int16Array::new_null(num_rows)), - PrimitiveType::Integer => Arc::new(Int32Array::new_null(num_rows)), - PrimitiveType::Long => Arc::new(Int64Array::new_null(num_rows)), - PrimitiveType::Float => Arc::new(Float32Array::new_null(num_rows)), - PrimitiveType::Double => Arc::new(Float64Array::new_null(num_rows)), - PrimitiveType::String => Arc::new(StringArray::new_null(num_rows)), - PrimitiveType::Boolean => Arc::new(BooleanArray::new_null(num_rows)), - PrimitiveType::Timestamp => { - Arc::new(TimestampMicrosecondArray::new_null(num_rows).with_timezone("UTC")) - } - PrimitiveType::TimestampNtz => { - Arc::new(TimestampMicrosecondArray::new_null(num_rows)) - } - PrimitiveType::Date => Arc::new(Date32Array::new_null(num_rows)), - PrimitiveType::Binary => Arc::new(BinaryArray::new_null(num_rows)), - PrimitiveType::Decimal(precision, scale) => Arc::new( - Decimal128Array::new_null(num_rows) - .with_precision_and_scale(*precision, *scale) - .unwrap(), - ), - }, - DataType::Array(_) => unimplemented!(), - DataType::Map { .. } => unimplemented!(), - DataType::Struct { .. } => unimplemented!(), - }, - Struct(values, fields) => { - let mut columns = Vec::with_capacity(values.len()); - for val in values { - columns.push(val.to_array(num_rows)?); - } - Arc::new(StructArray::try_new( - fields - .iter() - .map(TryInto::::try_into) - .collect::, _>>()? - .into(), - columns, - None, - )?) - } - }; - Ok(arr) - } -} - -/// evaluate expression -pub(crate) fn evaluate_expression( - expression: &Expression, - batch: &RecordBatch, - result_type: Option<&DataType>, -) -> DeltaResult { - use BinaryOperator::*; - use Expression::*; - - match (expression, result_type) { - (Literal(scalar), _) => Ok(scalar.to_array(batch.num_rows())?), - (Column(name), _) => { - if name.contains('.') { - let mut path = name.split('.'); - // Safety: we know that the first path step exists, because we checked for '.' 
- let arr = extract_column(batch, path.next().unwrap(), &mut path).cloned()?; - // NOTE: need to assign first so that rust can figure out lifetimes - Ok(arr) - } else { - batch - .column_by_name(name) - .ok_or(Error::MissingColumn(name.clone())) - .cloned() - } - } - (Struct(fields), Some(DataType::Struct(schema))) => { - let output_schema: ArrowSchema = schema.as_ref().try_into()?; - let mut columns = Vec::with_capacity(fields.len()); - for (expr, field) in fields.iter().zip(schema.fields()) { - columns.push(evaluate_expression(expr, batch, Some(field.data_type()))?); - } - Ok(Arc::new(StructArray::try_new( - output_schema.fields().clone(), - columns, - None, - )?)) - } - (Struct(_), _) => Err(Error::Generic( - "Data type is required to evaluate struct expressions".to_string(), - )), - (UnaryOperation { op, expr }, _) => { - let arr = evaluate_expression(expr.as_ref(), batch, None)?; - Ok(match op { - UnaryOperator::Not => Arc::new(not(downcast_to_bool(&arr)?)?), - UnaryOperator::IsNull => Arc::new(is_null(&arr)?), - }) - } - (BinaryOperation { op, left, right }, _) => { - let left_arr = evaluate_expression(left.as_ref(), batch, None)?; - let right_arr = evaluate_expression(right.as_ref(), batch, None)?; - - type Operation = fn(&dyn Datum, &dyn Datum) -> Result, ArrowError>; - let eval: Operation = match op { - Plus => add, - Minus => sub, - Multiply => mul, - Divide => div, - LessThan => |l, r| lt(l, r).map(wrap_comparison_result), - LessThanOrEqual => |l, r| lt_eq(l, r).map(wrap_comparison_result), - GreaterThan => |l, r| gt(l, r).map(wrap_comparison_result), - GreaterThanOrEqual => |l, r| gt_eq(l, r).map(wrap_comparison_result), - Equal => |l, r| eq(l, r).map(wrap_comparison_result), - NotEqual => |l, r| neq(l, r).map(wrap_comparison_result), - }; - - eval(&left_arr, &right_arr).map_err(|err| Error::GenericError { - source: Box::new(err), - }) - } - (VariadicOperation { op, exprs }, _) => { - let reducer = match op { - VariadicOperator::And => and, - VariadicOperator::Or => or, - }; - exprs - .iter() - .map(|expr| evaluate_expression(expr, batch, Some(&DataType::BOOLEAN))) - .reduce(|l, r| { - Ok(reducer(downcast_to_bool(&l?)?, downcast_to_bool(&r?)?) - .map(wrap_comparison_result)?) - }) - .transpose()? - .ok_or(Error::Generic("empty expression".to_string())) - } - (NullIf { expr, if_expr }, _) => { - let expr_arr = evaluate_expression(expr.as_ref(), batch, None)?; - let if_expr_arr = - evaluate_expression(if_expr.as_ref(), batch, Some(&DataType::BOOLEAN))?; - let if_expr_arr = downcast_to_bool(&if_expr_arr)?; - Ok(nullif(&expr_arr, if_expr_arr)?) 
- } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use arrow_array::Int32Array; - use arrow_schema::{DataType, Field, Fields, Schema}; - use std::ops::{Add, Div, Mul, Sub}; - - #[test] - fn test_extract_column() { - let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); - let values = Int32Array::from(vec![1, 2, 3]); - let batch = - RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(values.clone())]).unwrap(); - let column = Expression::Column("a".to_string()); - - let results = evaluate_expression(&column, &batch, None).unwrap(); - assert_eq!(results.as_ref(), &values); - - let schema = Schema::new(vec![Field::new( - "b", - DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, false)])), - false, - )]); - - let struct_values: ArrayRef = Arc::new(values.clone()); - let struct_array = StructArray::from(vec![( - Arc::new(Field::new("a", DataType::Int32, false)), - struct_values, - )]); - let batch = RecordBatch::try_new( - Arc::new(schema.clone()), - vec![Arc::new(struct_array.clone())], - ) - .unwrap(); - let column = Expression::Column("b.a".to_string()); - let results = evaluate_expression(&column, &batch, None).unwrap(); - assert_eq!(results.as_ref(), &values); - } - - #[test] - fn test_binary_op_scalar() { - let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); - let values = Int32Array::from(vec![1, 2, 3]); - let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(values)]).unwrap(); - let column = Expression::Column("a".to_string()); - - let expression = Box::new(column.clone().add(Expression::Literal(Scalar::Integer(1)))); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(Int32Array::from(vec![2, 3, 4])); - assert_eq!(results.as_ref(), expected.as_ref()); - - let expression = Box::new(column.clone().sub(Expression::Literal(Scalar::Integer(1)))); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(Int32Array::from(vec![0, 1, 2])); - assert_eq!(results.as_ref(), expected.as_ref()); - - let expression = Box::new(column.clone().mul(Expression::Literal(Scalar::Integer(2)))); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(Int32Array::from(vec![2, 4, 6])); - assert_eq!(results.as_ref(), expected.as_ref()); - - // TODO handle type casting - let expression = Box::new(column.div(Expression::Literal(Scalar::Integer(1)))); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(Int32Array::from(vec![1, 2, 3])); - assert_eq!(results.as_ref(), expected.as_ref()) - } - - #[test] - fn test_binary_op() { - let schema = Schema::new(vec![ - Field::new("a", DataType::Int32, false), - Field::new("b", DataType::Int32, false), - ]); - let values = Int32Array::from(vec![1, 2, 3]); - let batch = RecordBatch::try_new( - Arc::new(schema.clone()), - vec![Arc::new(values.clone()), Arc::new(values)], - ) - .unwrap(); - let column_a = Expression::Column("a".to_string()); - let column_b = Expression::Column("b".to_string()); - - let expression = Box::new(column_a.clone().add(column_b.clone())); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(Int32Array::from(vec![2, 4, 6])); - assert_eq!(results.as_ref(), expected.as_ref()); - - let expression = Box::new(column_a.clone().sub(column_b.clone())); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = 
Arc::new(Int32Array::from(vec![0, 0, 0])); - assert_eq!(results.as_ref(), expected.as_ref()); - - let expression = Box::new(column_a.clone().mul(column_b)); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(Int32Array::from(vec![1, 4, 9])); - assert_eq!(results.as_ref(), expected.as_ref()); - } - - #[test] - fn test_binary_cmp() { - let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); - let values = Int32Array::from(vec![1, 2, 3]); - let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(values)]).unwrap(); - let column = Expression::Column("a".to_string()); - let lit = Expression::Literal(Scalar::Integer(2)); - - let expression = Box::new(column.clone().lt(lit.clone())); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(BooleanArray::from(vec![true, false, false])); - assert_eq!(results.as_ref(), expected.as_ref()); - - let expression = Box::new(column.clone().lt_eq(lit.clone())); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(BooleanArray::from(vec![true, true, false])); - assert_eq!(results.as_ref(), expected.as_ref()); - - let expression = Box::new(column.clone().gt(lit.clone())); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(BooleanArray::from(vec![false, false, true])); - assert_eq!(results.as_ref(), expected.as_ref()); - - let expression = Box::new(column.clone().gt_eq(lit.clone())); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(BooleanArray::from(vec![false, true, true])); - assert_eq!(results.as_ref(), expected.as_ref()); - - let expression = Box::new(column.clone().eq(lit.clone())); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(BooleanArray::from(vec![false, true, false])); - assert_eq!(results.as_ref(), expected.as_ref()); - - let expression = Box::new(column.clone().ne(lit.clone())); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(BooleanArray::from(vec![true, false, true])); - assert_eq!(results.as_ref(), expected.as_ref()); - } - - #[test] - fn test_logical() { - let schema = Schema::new(vec![ - Field::new("a", DataType::Boolean, false), - Field::new("b", DataType::Boolean, false), - ]); - let batch = RecordBatch::try_new( - Arc::new(schema.clone()), - vec![ - Arc::new(BooleanArray::from(vec![true, false])), - Arc::new(BooleanArray::from(vec![false, true])), - ], - ) - .unwrap(); - let column_a = Expression::Column("a".to_string()); - let column_b = Expression::Column("b".to_string()); - - let expression = Box::new(column_a.clone().and(column_b.clone())); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(BooleanArray::from(vec![false, false])); - assert_eq!(results.as_ref(), expected.as_ref()); - - let expression = Box::new( - column_a - .clone() - .and(Expression::literal(Scalar::Boolean(true))), - ); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(BooleanArray::from(vec![true, false])); - assert_eq!(results.as_ref(), expected.as_ref()); - - let expression = Box::new(column_a.clone().or(column_b)); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(BooleanArray::from(vec![true, true])); - assert_eq!(results.as_ref(), expected.as_ref()); - 
- let expression = Box::new( - column_a - .clone() - .or(Expression::literal(Scalar::Boolean(false))), - ); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(BooleanArray::from(vec![true, false])); - assert_eq!(results.as_ref(), expected.as_ref()); - } -} diff --git a/crates/core/src/kernel/expressions/mod.rs b/crates/core/src/kernel/expressions/mod.rs deleted file mode 100644 index dd8aab51de..0000000000 --- a/crates/core/src/kernel/expressions/mod.rs +++ /dev/null @@ -1,478 +0,0 @@ -//! expressions. - -use std::collections::HashSet; -use std::fmt::{Display, Formatter}; -use std::sync::Arc; - -use arrow_array::{ArrayRef, RecordBatch}; -use arrow_schema::Schema as ArrowSchema; -use itertools::Itertools; - -use self::eval::evaluate_expression; -use super::{DataType, DeltaResult, SchemaRef}; - -pub use self::scalars::*; - -mod eval; -mod scalars; - -/// Interface for implementing an Expression evaluator. -/// -/// It contains one Expression which can be evaluated on multiple ColumnarBatches. -/// Connectors can implement this interface to optimize the evaluation using the -/// connector specific capabilities. -pub trait ExpressionEvaluator { - /// Evaluate the expression on given ColumnarBatch data. - /// - /// Contains one value for each row of the input. - /// The data type of the output is same as the type output of the expression this evaluator is using. - fn evaluate(&self, batch: &RecordBatch) -> DeltaResult; -} - -/// Provides expression evaluation capability to Delta Kernel. -/// -/// Delta Kernel can use this client to evaluate predicate on partition filters, -/// fill up partition column values and any computation on data using Expressions. -pub trait ExpressionHandler { - /// Create an [`ExpressionEvaluator`] that can evaluate the given [`Expression`] - /// on columnar batches with the given [`Schema`] to produce data of [`DataType`]. - /// - /// # Parameters - /// - /// - `schema`: Schema of the input data. - /// - `expression`: Expression to evaluate. - /// - `output_type`: Expected result data type. 
- /// - /// [`Schema`]: crate::schema::StructType - /// [`DataType`]: crate::schema::DataType - fn get_evaluator( - &self, - schema: SchemaRef, - expression: Expression, - output_type: DataType, - ) -> Arc; -} - -/// Default implementation of [`ExpressionHandler`] that uses [`evaluate_expression`] -#[derive(Debug)] -pub struct ArrowExpressionHandler {} - -impl ExpressionHandler for ArrowExpressionHandler { - fn get_evaluator( - &self, - schema: SchemaRef, - expression: Expression, - output_type: DataType, - ) -> Arc { - Arc::new(DefaultExpressionEvaluator { - input_schema: schema, - expression: Box::new(expression), - output_type, - }) - } -} - -/// Default implementation of [`ExpressionEvaluator`] that uses [`evaluate_expression`] -#[derive(Debug)] -pub struct DefaultExpressionEvaluator { - input_schema: SchemaRef, - expression: Box, - output_type: DataType, -} - -impl ExpressionEvaluator for DefaultExpressionEvaluator { - fn evaluate(&self, batch: &RecordBatch) -> DeltaResult { - let _input_schema: ArrowSchema = self.input_schema.as_ref().try_into()?; - // TODO: make sure we have matching schemas for validation - // if batch.schema().as_ref() != &input_schema { - // return Err(Error::Generic(format!( - // "input schema does not match batch schema: {:?} != {:?}", - // input_schema, - // batch.schema() - // ))); - // }; - evaluate_expression(&self.expression, batch, Some(&self.output_type)) - } -} - -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -/// A binary operator. -pub enum BinaryOperator { - /// Arithmetic Plus - Plus, - /// Arithmetic Minus - Minus, - /// Arithmetic Multiply - Multiply, - /// Arithmetic Divide - Divide, - /// Comparison Less Than - LessThan, - /// Comparison Less Than Or Equal - LessThanOrEqual, - /// Comparison Greater Than - GreaterThan, - /// Comparison Greater Than Or Equal - GreaterThanOrEqual, - /// Comparison Equal - Equal, - /// Comparison Not Equal - NotEqual, -} - -/// Variadic operators -#[derive(Debug, Copy, Clone, PartialEq)] -pub enum VariadicOperator { - /// AND - And, - /// OR - Or, -} - -impl Display for BinaryOperator { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match self { - // Self::And => write!(f, "AND"), - // Self::Or => write!(f, "OR"), - Self::Plus => write!(f, "+"), - Self::Minus => write!(f, "-"), - Self::Multiply => write!(f, "*"), - Self::Divide => write!(f, "/"), - Self::LessThan => write!(f, "<"), - Self::LessThanOrEqual => write!(f, "<="), - Self::GreaterThan => write!(f, ">"), - Self::GreaterThanOrEqual => write!(f, ">="), - Self::Equal => write!(f, "="), - Self::NotEqual => write!(f, "!="), - } - } -} - -#[derive(Debug, Copy, Clone, PartialEq)] -/// A unary operator. -pub enum UnaryOperator { - /// Unary Not - Not, - /// Unary Is Null - IsNull, -} - -/// A SQL expression. -/// -/// These expressions do not track or validate data types, other than the type -/// of literals. It is up to the expression evaluator to validate the -/// expression against a schema and add appropriate casts as required. -#[derive(Debug, Clone, PartialEq)] -pub enum Expression { - /// A literal value. - Literal(Scalar), - /// A column reference by name. - Column(String), - /// - Struct(Vec), - /// A binary operation. - BinaryOperation { - /// The operator. - op: BinaryOperator, - /// The left-hand side of the operation. - left: Box, - /// The right-hand side of the operation. - right: Box, - }, - /// A unary operation. - UnaryOperation { - /// The operator. - op: UnaryOperator, - /// The expression. 
- expr: Box, - }, - /// A variadic operation. - VariadicOperation { - /// The operator. - op: VariadicOperator, - /// The expressions. - exprs: Vec, - }, - /// A NULLIF expression. - NullIf { - /// The expression to evaluate. - expr: Box, - /// The expression to compare against. - if_expr: Box, - }, - // TODO: support more expressions, such as IS IN, LIKE, etc. -} - -impl> From for Expression { - fn from(value: T) -> Self { - Self::literal(value) - } -} - -impl Display for Expression { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match self { - Self::Literal(l) => write!(f, "{}", l), - Self::Column(name) => write!(f, "Column({})", name), - Self::Struct(exprs) => write!( - f, - "Struct({})", - &exprs.iter().map(|e| format!("{e}")).join(", ") - ), - Self::BinaryOperation { op, left, right } => write!(f, "{} {} {}", left, op, right), - Self::UnaryOperation { op, expr } => match op { - UnaryOperator::Not => write!(f, "NOT {}", expr), - UnaryOperator::IsNull => write!(f, "{} IS NULL", expr), - }, - Self::VariadicOperation { op, exprs } => match op { - VariadicOperator::And => { - write!( - f, - "AND({})", - &exprs.iter().map(|e| format!("{e}")).join(", ") - ) - } - VariadicOperator::Or => { - write!( - f, - "OR({})", - &exprs.iter().map(|e| format!("{e}")).join(", ") - ) - } - }, - Self::NullIf { expr, if_expr } => write!(f, "NULLIF({}, {})", expr, if_expr), - } - } -} - -impl Expression { - /// Returns a set of columns referenced by this expression. - pub fn references(&self) -> HashSet<&str> { - let mut set = HashSet::new(); - - for expr in self.walk() { - if let Self::Column(name) = expr { - set.insert(name.as_str()); - } - } - - set - } - - /// Create an new expression for a column reference - pub fn column(name: impl Into) -> Self { - Self::Column(name.into()) - } - - /// Create a new expression for a literal value - pub fn literal(value: impl Into) -> Self { - Self::Literal(value.into()) - } - - /// Create a new expression for a struct - pub fn struct_expr(exprs: impl IntoIterator) -> Self { - Self::Struct(exprs.into_iter().collect()) - } - - /// Create a new expression for a unary operation - pub fn unary(op: UnaryOperator, expr: impl Into) -> Self { - Self::UnaryOperation { - op, - expr: Box::new(expr.into()), - } - } - - /// Create a new expression for a binary operation - pub fn binary( - op: BinaryOperator, - lhs: impl Into, - rhs: impl Into, - ) -> Self { - Self::BinaryOperation { - op, - left: Box::new(lhs.into()), - right: Box::new(rhs.into()), - } - } - - /// Create a new expression for a variadic operation - pub fn variadic(op: VariadicOperator, other: impl IntoIterator) -> Self { - let mut exprs = other.into_iter().collect::>(); - if exprs.is_empty() { - // TODO this might break if we introduce new variadic operators? 
- return Self::literal(matches!(op, VariadicOperator::And)); - } - if exprs.len() == 1 { - return exprs.pop().unwrap(); - } - Self::VariadicOperation { op, exprs } - } - - /// Create a new expression `self == other` - pub fn eq(self, other: Self) -> Self { - Self::binary(BinaryOperator::Equal, self, other) - } - - /// Create a new expression `self != other` - pub fn ne(self, other: Self) -> Self { - Self::binary(BinaryOperator::NotEqual, self, other) - } - - /// Create a new expression `self < other` - pub fn lt(self, other: Self) -> Self { - Self::binary(BinaryOperator::LessThan, self, other) - } - - /// Create a new expression `self > other` - pub fn gt(self, other: Self) -> Self { - Self::binary(BinaryOperator::GreaterThan, self, other) - } - - /// Create a new expression `self >= other` - pub fn gt_eq(self, other: Self) -> Self { - Self::binary(BinaryOperator::GreaterThanOrEqual, self, other) - } - - /// Create a new expression `self <= other` - pub fn lt_eq(self, other: Self) -> Self { - Self::binary(BinaryOperator::LessThanOrEqual, self, other) - } - - /// Create a new expression `self AND other` - pub fn and(self, other: Self) -> Self { - self.and_many([other]) - } - - /// Create a new expression `self AND others` - pub fn and_many(self, other: impl IntoIterator) -> Self { - Self::variadic(VariadicOperator::And, std::iter::once(self).chain(other)) - } - - /// Create a new expression `self AND other` - pub fn or(self, other: Self) -> Self { - self.or_many([other]) - } - - /// Create a new expression `self OR other` - pub fn or_many(self, other: impl IntoIterator) -> Self { - Self::variadic(VariadicOperator::Or, std::iter::once(self).chain(other)) - } - - /// Create a new expression `self IS NULL` - pub fn is_null(self) -> Self { - Self::unary(UnaryOperator::IsNull, self) - } - - /// Create a new expression `NULLIF(self, other)` - pub fn null_if(self, other: Self) -> Self { - Self::NullIf { - expr: Box::new(self), - if_expr: Box::new(other), - } - } - - fn walk(&self) -> impl Iterator + '_ { - let mut stack = vec![self]; - std::iter::from_fn(move || { - let expr = stack.pop()?; - match expr { - Self::Literal(_) => {} - Self::Column { .. } => {} - Self::Struct(exprs) => { - stack.extend(exprs.iter()); - } - Self::BinaryOperation { left, right, .. } => { - stack.push(left); - stack.push(right); - } - Self::UnaryOperation { expr, .. 
} => { - stack.push(expr); - } - Self::VariadicOperation { op, exprs } => match op { - VariadicOperator::And | VariadicOperator::Or => { - stack.extend(exprs.iter()); - } - }, - Self::NullIf { expr, if_expr } => { - stack.push(expr); - stack.push(if_expr); - } - } - Some(expr) - }) - } -} - -impl std::ops::Add for Expression { - type Output = Self; - - fn add(self, rhs: Expression) -> Self::Output { - Self::binary(BinaryOperator::Plus, self, rhs) - } -} - -impl std::ops::Sub for Expression { - type Output = Self; - - fn sub(self, rhs: Expression) -> Self::Output { - Self::binary(BinaryOperator::Minus, self, rhs) - } -} - -impl std::ops::Mul for Expression { - type Output = Self; - - fn mul(self, rhs: Expression) -> Self::Output { - Self::binary(BinaryOperator::Multiply, self, rhs) - } -} - -impl std::ops::Div for Expression { - type Output = Self; - - fn div(self, rhs: Expression) -> Self::Output { - Self::binary(BinaryOperator::Divide, self, rhs) - } -} - -#[cfg(test)] -mod tests { - use super::Expression as Expr; - - #[test] - fn test_expression_format() { - let col_ref = Expr::column("x"); - let cases = [ - (col_ref.clone(), "Column(x)"), - (col_ref.clone().eq(Expr::literal(2)), "Column(x) = 2"), - ( - col_ref - .clone() - .gt_eq(Expr::literal(2)) - .and(col_ref.clone().lt_eq(Expr::literal(10))), - "AND(Column(x) >= 2, Column(x) <= 10)", - ), - ( - col_ref - .clone() - .gt(Expr::literal(2)) - .or(col_ref.clone().lt(Expr::literal(10))), - "OR(Column(x) > 2, Column(x) < 10)", - ), - ( - (col_ref.clone() - Expr::literal(4)).lt(Expr::literal(10)), - "Column(x) - 4 < 10", - ), - ( - (col_ref.clone() + Expr::literal(4)) / Expr::literal(10) * Expr::literal(42), - "Column(x) + 4 / 10 * 42", - ), - (col_ref.eq(Expr::literal("foo")), "Column(x) = 'foo'"), - ]; - - for (expr, expected) in cases { - let result = format!("{}", expr); - assert_eq!(result, expected); - } - } -} diff --git a/crates/core/src/kernel/expressions/scalars.rs b/crates/core/src/kernel/expressions/scalars.rs deleted file mode 100644 index 571c2abf92..0000000000 --- a/crates/core/src/kernel/expressions/scalars.rs +++ /dev/null @@ -1,559 +0,0 @@ -//! Scalar values for use in expressions. - -use std::cmp::Ordering; -use std::fmt::{Display, Formatter}; - -use arrow_array::Array; -use arrow_schema::TimeUnit; -use chrono::{DateTime, NaiveDate, NaiveDateTime, TimeZone, Utc}; -use object_store::path::Path; - -use crate::kernel::{DataType, Error, PrimitiveType, StructField}; -use crate::NULL_PARTITION_VALUE_DATA_PATH; - -/// A single value, which can be null. Used for representing literal values -/// in [Expressions][crate::expressions::Expression]. -#[derive(Debug, Clone, PartialEq)] -pub enum Scalar { - /// 32bit integer - Integer(i32), - /// 64bit integer - Long(i64), - /// 16bit integer - Short(i16), - /// 8bit integer - Byte(i8), - /// 32bit floating point - Float(f32), - /// 64bit floating point - Double(f64), - /// utf-8 encoded string. - String(String), - /// true or false value - Boolean(bool), - /// Microsecond precision timestamp, adjusted to UTC. - Timestamp(i64), - /// Microsecond precision timestamp, with no timezone. - TimestampNtz(i64), - /// Date stored as a signed 32bit int days since UNIX epoch 1970-01-01 - Date(i32), - /// Binary data - Binary(Vec), - /// Decimal value - Decimal(i128, u8, i8), - /// Null value with a given data type. - Null(DataType), - /// Struct value - Struct(Vec, Vec), -} - -impl Scalar { - /// Returns the data type of this scalar. 
- pub fn data_type(&self) -> DataType { - match self { - Self::Integer(_) => DataType::Primitive(PrimitiveType::Integer), - Self::Long(_) => DataType::Primitive(PrimitiveType::Long), - Self::Short(_) => DataType::Primitive(PrimitiveType::Short), - Self::Byte(_) => DataType::Primitive(PrimitiveType::Byte), - Self::Float(_) => DataType::Primitive(PrimitiveType::Float), - Self::Double(_) => DataType::Primitive(PrimitiveType::Double), - Self::String(_) => DataType::Primitive(PrimitiveType::String), - Self::Boolean(_) => DataType::Primitive(PrimitiveType::Boolean), - Self::Timestamp(_) => DataType::Primitive(PrimitiveType::Timestamp), - Self::TimestampNtz(_) => DataType::Primitive(PrimitiveType::TimestampNtz), - Self::Date(_) => DataType::Primitive(PrimitiveType::Date), - Self::Binary(_) => DataType::Primitive(PrimitiveType::Binary), - // Unwrapping should be safe, since the scalar should never have values that are unsupported - Self::Decimal(_, precision, scale) => DataType::decimal(*precision, *scale).unwrap(), - Self::Null(data_type) => data_type.clone(), - Self::Struct(_, fields) => DataType::struct_type(fields.clone()), - } - } - - /// Returns true if this scalar is null. - pub fn is_null(&self) -> bool { - matches!(self, Self::Null(_)) - } - - /// Serializes this scalar as a string. - pub fn serialize(&self) -> String { - match self { - Self::String(s) => s.to_owned(), - Self::Byte(b) => b.to_string(), - Self::Short(s) => s.to_string(), - Self::Integer(i) => i.to_string(), - Self::Long(l) => l.to_string(), - Self::Float(f) => f.to_string(), - Self::Double(d) => d.to_string(), - Self::Boolean(b) => { - if *b { - "true".to_string() - } else { - "false".to_string() - } - } - Self::TimestampNtz(ts) | Self::Timestamp(ts) => { - let ts = Utc.timestamp_micros(*ts).single().unwrap(); - ts.format("%Y-%m-%d %H:%M:%S%.6f").to_string() - } - Self::Date(days) => { - let date = DateTime::from_timestamp(*days as i64 * 24 * 3600, 0).unwrap(); - date.format("%Y-%m-%d").to_string() - } - Self::Decimal(value, _, scale) => match scale.cmp(&0) { - Ordering::Equal => value.to_string(), - Ordering::Greater => { - let scalar_multiple = 10_i128.pow(*scale as u32); - let mut s = String::new(); - s.push_str((value / scalar_multiple).to_string().as_str()); - s.push('.'); - s.push_str(&format!( - "{:0>scale$}", - value % scalar_multiple, - scale = *scale as usize - )); - s - } - Ordering::Less => { - let mut s = value.to_string(); - for _ in 0..(scale.abs()) { - s.push('0'); - } - s - } - }, - Self::Binary(val) => create_escaped_binary_string(val.as_slice()), - Self::Null(_) => "null".to_string(), - Self::Struct(_, _) => todo!("serializing struct values is not yet supported"), - } - } - - /// Serializes this scalar as a string for use in hive partition file names. - pub fn serialize_encoded(&self) -> String { - if self.is_null() { - return NULL_PARTITION_VALUE_DATA_PATH.to_string(); - } - Path::from(self.serialize()).to_string() - } - - /// Create a [`Scalar`] form a row in an arrow array. 
- pub fn from_array(arr: &dyn Array, index: usize) -> Option { - use arrow_array::*; - use arrow_schema::DataType::*; - - if arr.len() <= index { - return None; - } - if arr.is_null(index) { - return Some(Self::Null(arr.data_type().try_into().ok()?)); - } - - match arr.data_type() { - Utf8 => arr - .as_any() - .downcast_ref::() - .map(|v| Self::String(v.value(index).to_string())), - LargeUtf8 => arr - .as_any() - .downcast_ref::() - .map(|v| Self::String(v.value(index).to_string())), - Boolean => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Boolean(v.value(index))), - Binary => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Binary(v.value(index).to_vec())), - LargeBinary => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Binary(v.value(index).to_vec())), - FixedSizeBinary(_) => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Binary(v.value(index).to_vec())), - Int8 => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Byte(v.value(index))), - Int16 => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Short(v.value(index))), - Int32 => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Integer(v.value(index))), - Int64 => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Long(v.value(index))), - UInt8 => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Byte(v.value(index) as i8)), - UInt16 => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Short(v.value(index) as i16)), - UInt32 => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Integer(v.value(index) as i32)), - UInt64 => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Long(v.value(index) as i64)), - Float32 => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Float(v.value(index))), - Float64 => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Double(v.value(index))), - Decimal128(precision, scale) => { - arr.as_any().downcast_ref::().map(|v| { - let value = v.value(index); - Self::Decimal(value, *precision, *scale) - }) - } - Date32 => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Date(v.value(index))), - // TODO handle timezones when implementing timestamp ntz feature. 
- Timestamp(TimeUnit::Microsecond, tz) => match tz { - None => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Timestamp(v.value(index))), - Some(tz_str) if tz_str.as_ref() == "UTC" => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Timestamp(v.clone().with_timezone("UTC").value(index))), - _ => None, - }, - Struct(fields) => { - let struct_fields = fields - .iter() - .flat_map(|f| TryFrom::try_from(f.as_ref())) - .collect::>(); - let values = arr - .as_any() - .downcast_ref::() - .and_then(|struct_arr| { - struct_fields - .iter() - .map(|f: &StructField| { - struct_arr - .column_by_name(f.name()) - .and_then(|c| Self::from_array(c.as_ref(), index)) - }) - .collect::>>() - })?; - if struct_fields.len() != values.len() { - return None; - } - Some(Self::Struct(values, struct_fields)) - } - Float16 - | Decimal256(_, _) - | List(_) - | LargeList(_) - | FixedSizeList(_, _) - | Map(_, _) - | Date64 - | Timestamp(_, _) - | Time32(_) - | Time64(_) - | Duration(_) - | Interval(_) - | Dictionary(_, _) - | RunEndEncoded(_, _) - | Union(_, _) - | Utf8View - | BinaryView - | ListView(_) - | LargeListView(_) - | Null => None, - } - } -} - -impl PartialOrd for Scalar { - fn partial_cmp(&self, other: &Self) -> Option { - use Scalar::*; - match (self, other) { - (Null(_), Null(_)) => Some(Ordering::Equal), - (Integer(a), Integer(b)) => a.partial_cmp(b), - (Long(a), Long(b)) => a.partial_cmp(b), - (Short(a), Short(b)) => a.partial_cmp(b), - (Byte(a), Byte(b)) => a.partial_cmp(b), - (Float(a), Float(b)) => a.partial_cmp(b), - (Double(a), Double(b)) => a.partial_cmp(b), - (String(a), String(b)) => a.partial_cmp(b), - (Boolean(a), Boolean(b)) => a.partial_cmp(b), - (Timestamp(a), Timestamp(b)) => a.partial_cmp(b), - (TimestampNtz(a), TimestampNtz(b)) => a.partial_cmp(b), - (Date(a), Date(b)) => a.partial_cmp(b), - (Binary(a), Binary(b)) => a.partial_cmp(b), - (Decimal(a, _, _), Decimal(b, _, _)) => a.partial_cmp(b), - (Struct(a, _), Struct(b, _)) => a.partial_cmp(b), - // TODO should we make an assumption about the ordering of nulls? - // rigth now this is only used for internal purposes. 
- (Null(_), _) => Some(Ordering::Less), - (_, Null(_)) => Some(Ordering::Greater), - _ => None, - } - } -} - -impl Display for Scalar { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match self { - Self::Integer(i) => write!(f, "{}", i), - Self::Long(i) => write!(f, "{}", i), - Self::Short(i) => write!(f, "{}", i), - Self::Byte(i) => write!(f, "{}", i), - Self::Float(fl) => write!(f, "{}", fl), - Self::Double(fl) => write!(f, "{}", fl), - Self::String(s) => write!(f, "'{}'", s), - Self::Boolean(b) => write!(f, "{}", b), - Self::Timestamp(ts) => write!(f, "{}", ts), - Self::TimestampNtz(ts) => write!(f, "{}", ts), - Self::Date(d) => write!(f, "{}", d), - Self::Binary(b) => write!(f, "{:?}", b), - Self::Decimal(value, _, scale) => match scale.cmp(&0) { - Ordering::Equal => { - write!(f, "{}", value) - } - Ordering::Greater => { - let scalar_multiple = 10_i128.pow(*scale as u32); - write!(f, "{}", value / scalar_multiple)?; - write!(f, ".")?; - write!( - f, - "{:0>scale$}", - value % scalar_multiple, - scale = *scale as usize - ) - } - Ordering::Less => { - write!(f, "{}", value)?; - for _ in 0..(scale.abs()) { - write!(f, "0")?; - } - Ok(()) - } - }, - Self::Null(_) => write!(f, "null"), - Self::Struct(values, fields) => { - write!(f, "{{")?; - for (i, (value, field)) in values.iter().zip(fields.iter()).enumerate() { - if i > 0 { - write!(f, ", ")?; - } - write!(f, "{}: {}", field.name, value)?; - } - write!(f, "}}") - } - } - } -} - -impl From for Scalar { - fn from(i: i32) -> Self { - Self::Integer(i) - } -} - -impl From for Scalar { - fn from(i: i64) -> Self { - Self::Long(i) - } -} - -impl From for Scalar { - fn from(b: bool) -> Self { - Self::Boolean(b) - } -} - -impl From<&str> for Scalar { - fn from(s: &str) -> Self { - Self::String(s.into()) - } -} - -impl From for Scalar { - fn from(value: String) -> Self { - Self::String(value) - } -} - -// TODO: add more From impls - -impl PrimitiveType { - fn data_type(&self) -> DataType { - DataType::Primitive(*self) - } - - /// Parses a string into a scalar value. - pub fn parse_scalar(&self, raw: &str) -> Result { - use PrimitiveType::*; - - lazy_static::lazy_static! { - static ref UNIX_EPOCH: DateTime = DateTime::from_timestamp(0, 0).unwrap(); - } - - if raw.is_empty() || raw == NULL_PARTITION_VALUE_DATA_PATH { - return Ok(Scalar::Null(self.data_type())); - } - - match self { - String => Ok(Scalar::String(raw.to_string())), - Byte => self.str_parse_scalar(raw, Scalar::Byte), - Short => self.str_parse_scalar(raw, Scalar::Short), - Integer => self.str_parse_scalar(raw, Scalar::Integer), - Long => self.str_parse_scalar(raw, Scalar::Long), - Float => self.str_parse_scalar(raw, Scalar::Float), - Double => self.str_parse_scalar(raw, Scalar::Double), - Boolean => { - if raw.eq_ignore_ascii_case("true") { - Ok(Scalar::Boolean(true)) - } else if raw.eq_ignore_ascii_case("false") { - Ok(Scalar::Boolean(false)) - } else { - Err(self.parse_error(raw)) - } - } - Date => { - let date = NaiveDate::parse_from_str(raw, "%Y-%m-%d") - .map_err(|_| self.parse_error(raw))? 
- .and_hms_opt(0, 0, 0) - .ok_or(self.parse_error(raw))?; - let date = Utc.from_utc_datetime(&date); - let days = date.signed_duration_since(*UNIX_EPOCH).num_days() as i32; - Ok(Scalar::Date(days)) - } - Timestamp => { - let timestamp = NaiveDateTime::parse_from_str(raw, "%Y-%m-%d %H:%M:%S%.f") - .map_err(|_| self.parse_error(raw))?; - let timestamp = Utc.from_utc_datetime(×tamp); - let micros = timestamp - .signed_duration_since(*UNIX_EPOCH) - .num_microseconds() - .ok_or(self.parse_error(raw))?; - Ok(Scalar::Timestamp(micros)) - } - TimestampNtz => { - let timestamp = NaiveDateTime::parse_from_str(raw, "%Y-%m-%d %H:%M:%S%.f") - .map_err(|_| self.parse_error(raw))?; - let timestamp = Utc.from_utc_datetime(×tamp); - let micros = timestamp - .signed_duration_since(*UNIX_EPOCH) - .num_microseconds() - .ok_or(self.parse_error(raw))?; - Ok(Scalar::TimestampNtz(micros)) - } - Binary => { - let bytes = parse_escaped_binary_string(raw).map_err(|_| self.parse_error(raw))?; - Ok(Scalar::Binary(bytes)) - } - _ => todo!("parsing {:?} is not yet supported", self), - } - } - - fn parse_error(&self, raw: &str) -> Error { - Error::Parse(raw.to_string(), self.data_type()) - } - - fn str_parse_scalar( - &self, - raw: &str, - f: impl FnOnce(T) -> Scalar, - ) -> Result { - match raw.parse() { - Ok(val) => Ok(f(val)), - Err(..) => Err(self.parse_error(raw)), - } - } -} - -fn create_escaped_binary_string(data: &[u8]) -> String { - let mut escaped_string = String::new(); - for &byte in data { - // Convert each byte to its two-digit hexadecimal representation - let hex_representation = format!("{:04X}", byte); - // Append the hexadecimal representation with an escape sequence - escaped_string.push_str("\\u"); - escaped_string.push_str(&hex_representation); - } - escaped_string -} - -fn parse_escaped_binary_string(escaped_string: &str) -> Result, &'static str> { - let mut parsed_bytes = Vec::new(); - let mut chars = escaped_string.chars(); - - while let Some(ch) = chars.next() { - if ch == '\\' { - // Check for the escape sequence "\\u" indicating a hexadecimal value - if chars.next() == Some('u') { - // Read two hexadecimal digits and convert to u8 - if let (Some(digit1), Some(digit2), Some(digit3), Some(digit4)) = - (chars.next(), chars.next(), chars.next(), chars.next()) - { - if let Ok(byte) = - u8::from_str_radix(&format!("{}{}{}{}", digit1, digit2, digit3, digit4), 16) - { - parsed_bytes.push(byte); - } else { - return Err("Error parsing hexadecimal value"); - } - } else { - return Err("Incomplete escape sequence"); - } - } else { - // Unrecognized escape sequence - return Err("Unrecognized escape sequence"); - } - } else { - // Regular character, convert to u8 and push into the result vector - parsed_bytes.push(ch as u8); - } - } - - Ok(parsed_bytes) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_binary_roundtrip() { - let scalar = Scalar::Binary(vec![0, 1, 2, 3, 4, 5]); - let parsed = PrimitiveType::Binary - .parse_scalar(&scalar.serialize()) - .unwrap(); - assert_eq!(scalar, parsed); - } - - #[test] - fn test_decimal_display() { - let s = Scalar::Decimal(123456789, 9, 2); - assert_eq!(s.to_string(), "1234567.89"); - - let s = Scalar::Decimal(123456789, 9, 0); - assert_eq!(s.to_string(), "123456789"); - - let s = Scalar::Decimal(123456789, 9, 9); - assert_eq!(s.to_string(), "0.123456789"); - - let s = Scalar::Decimal(123, 9, -3); - assert_eq!(s.to_string(), "123000"); - } -} diff --git a/crates/core/src/kernel/mod.rs b/crates/core/src/kernel/mod.rs index 876a09a33c..bcd9abbd15 
100644 --- a/crates/core/src/kernel/mod.rs +++ b/crates/core/src/kernel/mod.rs @@ -1,15 +1,15 @@ //! Delta Kernel module //! //! The Kernel module contains all the logic for reading and processing the Delta Lake transaction log. +use delta_kernel::engine::arrow_expression::ArrowExpressionHandler; pub mod arrow; pub mod error; -pub mod expressions; pub mod models; +pub mod scalars; mod snapshot; pub use error::*; -pub use expressions::*; pub use models::*; pub use snapshot::*; @@ -20,3 +20,7 @@ pub trait DataCheck { /// The SQL expression to use for the check fn get_expression(&self) -> &str; } + +lazy_static::lazy_static! { + static ref ARROW_HANDLER: ArrowExpressionHandler = ArrowExpressionHandler {}; +} diff --git a/crates/core/src/kernel/models/actions.rs b/crates/core/src/kernel/models/actions.rs index f44ff4ac00..6ec8fc11fb 100644 --- a/crates/core/src/kernel/models/actions.rs +++ b/crates/core/src/kernel/models/actions.rs @@ -1,19 +1,18 @@ use std::collections::{HashMap, HashSet}; use std::fmt; use std::str::FromStr; -// use std::io::{Cursor, Read}; -// use std::sync::Arc; -// use roaring::RoaringTreemap; +use maplit::hashset; use serde::{Deserialize, Serialize}; use tracing::warn; use url::Url; use super::schema::StructType; use crate::kernel::{error::Error, DeltaResult}; +use crate::TableProperty; -#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] /// Defines a file format used in table +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] pub struct Format { /// Name of the encoding for files in this table pub provider: String, @@ -137,34 +136,338 @@ pub struct Protocol { impl Protocol { /// Create a new protocol action - pub fn new(min_reader_version: i32, min_wrriter_version: i32) -> Self { + pub fn new(min_reader_version: i32, min_writer_version: i32) -> Self { Self { min_reader_version, - min_writer_version: min_wrriter_version, + min_writer_version, reader_features: None, writer_features: None, } } - /// set the reader features in the protocol action + /// set the reader features in the protocol action, automatically bumps min_reader_version pub fn with_reader_features( mut self, reader_features: impl IntoIterator>, ) -> Self { - self.reader_features = Some(reader_features.into_iter().map(|c| c.into()).collect()); + let all_reader_features = reader_features + .into_iter() + .map(Into::into) + .collect::>(); + if !all_reader_features.is_empty() { + self.min_reader_version = 3 + } + self.reader_features = Some(all_reader_features); self } - /// set the writer features in the protocol action + /// set the writer features in the protocol action, automatically bumps min_writer_version pub fn with_writer_features( mut self, writer_features: impl IntoIterator>, ) -> Self { - self.writer_features = Some(writer_features.into_iter().map(|c| c.into()).collect()); + let all_writer_feautures = writer_features + .into_iter() + .map(|c| c.into()) + .collect::>(); + if !all_writer_feautures.is_empty() { + self.min_writer_version = 7 + } + self.writer_features = Some(all_writer_feautures); + self + } + + /// Converts existing properties into features if the reader_version is >=3 or writer_version >=3 + /// only converts features that are "true" + pub fn move_table_properties_into_features( + mut self, + configuration: &HashMap>, + ) -> Protocol { + if self.min_writer_version >= 7 { + let mut converted_writer_features = configuration + .iter() + .filter(|(_, value)| { + value.as_ref().map_or(false, |v| { + v.to_ascii_lowercase().parse::().is_ok_and(|v| v) + }) + }) + 
.collect::>>() + .keys() + .map(|key| (*key).clone().into()) + .filter(|v| !matches!(v, WriterFeatures::Other(_))) + .collect::>(); + + if configuration + .keys() + .any(|v| v.starts_with("delta.constraints.")) + { + converted_writer_features.insert(WriterFeatures::CheckConstraints); + } + + match self.writer_features { + Some(mut features) => { + features.extend(converted_writer_features); + self.writer_features = Some(features); + } + None => self.writer_features = Some(converted_writer_features), + } + } + if self.min_reader_version > 3 { + let converted_reader_features = configuration + .iter() + .filter(|(_, value)| { + value.as_ref().map_or(false, |v| { + v.to_ascii_lowercase().parse::().is_ok_and(|v| v) + }) + }) + .map(|(key, _)| (*key).clone().into()) + .filter(|v| !matches!(v, ReaderFeatures::Other(_))) + .collect::>(); + match self.reader_features { + Some(mut features) => { + features.extend(converted_reader_features); + self.reader_features = Some(features); + } + None => self.reader_features = Some(converted_reader_features), + } + } + self + } + /// Will apply the properties to the protocol by either bumping the version or setting + /// features + pub fn apply_properties_to_protocol( + mut self, + new_properties: &HashMap, + raise_if_not_exists: bool, + ) -> DeltaResult { + let mut parsed_properties: HashMap = HashMap::new(); + + for (key, value) in new_properties { + if let Ok(parsed_key) = key.parse::() { + parsed_properties.insert(parsed_key, value.to_string()); + } else if raise_if_not_exists { + return Err(Error::Generic(format!( + "Error parsing property '{}':'{}'", + key, value + ))); + } + } + + // Check and update delta.minReaderVersion + if let Some(min_reader_version) = parsed_properties.get(&TableProperty::MinReaderVersion) { + let new_min_reader_version = min_reader_version.parse::(); + match new_min_reader_version { + Ok(version) => match version { + 1..=3 => { + if version > self.min_reader_version { + self.min_reader_version = version + } + } + _ => { + return Err(Error::Generic(format!( + "delta.minReaderVersion = '{}' is invalid, valid values are ['1','2','3']", + min_reader_version + ))) + } + }, + Err(_) => { + return Err(Error::Generic(format!( + "delta.minReaderVersion = '{}' is invalid, valid values are ['1','2','3']", + min_reader_version + ))) + } + } + } + + // Check and update delta.minWriterVersion + if let Some(min_writer_version) = parsed_properties.get(&TableProperty::MinWriterVersion) { + let new_min_writer_version = min_writer_version.parse::(); + match new_min_writer_version { + Ok(version) => match version { + 2..=7 => { + if version > self.min_writer_version { + self.min_writer_version = version + } + } + _ => { + return Err(Error::Generic(format!( + "delta.minWriterVersion = '{}' is invalid, valid values are ['2','3','4','5','6','7']", + min_writer_version + ))) + } + }, + Err(_) => { + return Err(Error::Generic(format!( + "delta.minWriterVersion = '{}' is invalid, valid values are ['2','3','4','5','6','7']", + min_writer_version + ))) + } + } + } + + // Check enableChangeDataFeed and bump protocol or add writerFeature if writer versions is >=7 + if let Some(enable_cdf) = parsed_properties.get(&TableProperty::EnableChangeDataFeed) { + let if_enable_cdf = enable_cdf.to_ascii_lowercase().parse::(); + match if_enable_cdf { + Ok(true) => { + if self.min_writer_version >= 7 { + match self.writer_features { + Some(mut features) => { + features.insert(WriterFeatures::ChangeDataFeed); + self.writer_features = Some(features); + } + None => { + 
self.writer_features = + Some(hashset! {WriterFeatures::ChangeDataFeed}) + } + } + } else if self.min_writer_version <= 3 { + self.min_writer_version = 4 + } + } + Ok(false) => {} + _ => { + return Err(Error::Generic(format!( + "delta.enableChangeDataFeed = '{}' is invalid, valid values are ['true']", + enable_cdf + ))) + } + } + } + + if let Some(enable_dv) = parsed_properties.get(&TableProperty::EnableDeletionVectors) { + let if_enable_dv = enable_dv.to_ascii_lowercase().parse::(); + match if_enable_dv { + Ok(true) => { + let writer_features = match self.writer_features { + Some(mut features) => { + features.insert(WriterFeatures::DeletionVectors); + features + } + None => hashset! {WriterFeatures::DeletionVectors}, + }; + let reader_features = match self.reader_features { + Some(mut features) => { + features.insert(ReaderFeatures::DeletionVectors); + features + } + None => hashset! {ReaderFeatures::DeletionVectors}, + }; + self.min_reader_version = 3; + self.min_writer_version = 7; + self.writer_features = Some(writer_features); + self.reader_features = Some(reader_features); + } + Ok(false) => {} + _ => { + return Err(Error::Generic(format!( + "delta.enableDeletionVectors = '{}' is invalid, valid values are ['true']", + enable_dv + ))) + } + } + } + Ok(self) + } + /// Enable timestamp_ntz in the protocol + pub fn enable_timestamp_ntz(mut self) -> Protocol { + self = self.with_reader_features(vec![ReaderFeatures::TimestampWithoutTimezone]); + self = self.with_writer_features(vec![WriterFeatures::TimestampWithoutTimezone]); self } } +/// High level table features +#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Hash)] +#[serde(rename_all = "camelCase")] +pub enum TableFeatures { + /// Mapping of one column to another + ColumnMapping, + /// Deletion vectors for merge, update, delete + DeletionVectors, + /// timestamps without timezone support + #[serde(rename = "timestampNtz")] + TimestampWithoutTimezone, + /// version 2 of checkpointing + V2Checkpoint, + /// Append Only Tables + AppendOnly, + /// Table invariants + Invariants, + /// Check constraints on columns + CheckConstraints, + /// CDF on a table + ChangeDataFeed, + /// Columns with generated values + GeneratedColumns, + /// ID Columns + IdentityColumns, + /// Row tracking on tables + RowTracking, + /// domain specific metadata + DomainMetadata, + /// Iceberg compatibility support + IcebergCompatV1, +} + +impl FromStr for TableFeatures { + type Err = (); + + fn from_str(value: &str) -> Result { + match value { + "columnMapping" => Ok(TableFeatures::ColumnMapping), + "deletionVectors" => Ok(TableFeatures::DeletionVectors), + "timestampNtz" => Ok(TableFeatures::TimestampWithoutTimezone), + "v2Checkpoint" => Ok(TableFeatures::V2Checkpoint), + "appendOnly" => Ok(TableFeatures::AppendOnly), + "invariants" => Ok(TableFeatures::Invariants), + "checkConstraints" => Ok(TableFeatures::CheckConstraints), + "changeDataFeed" => Ok(TableFeatures::ChangeDataFeed), + "generatedColumns" => Ok(TableFeatures::GeneratedColumns), + "identityColumns" => Ok(TableFeatures::IdentityColumns), + "rowTracking" => Ok(TableFeatures::RowTracking), + "domainMetadata" => Ok(TableFeatures::DomainMetadata), + "icebergCompatV1" => Ok(TableFeatures::IcebergCompatV1), + _ => Err(()), + } + } +} + +impl AsRef for TableFeatures { + fn as_ref(&self) -> &str { + match self { + TableFeatures::ColumnMapping => "columnMapping", + TableFeatures::DeletionVectors => "deletionVectors", + TableFeatures::TimestampWithoutTimezone => "timestampNtz", + 
TableFeatures::V2Checkpoint => "v2Checkpoint", + TableFeatures::AppendOnly => "appendOnly", + TableFeatures::Invariants => "invariants", + TableFeatures::CheckConstraints => "checkConstraints", + TableFeatures::ChangeDataFeed => "changeDataFeed", + TableFeatures::GeneratedColumns => "generatedColumns", + TableFeatures::IdentityColumns => "identityColumns", + TableFeatures::RowTracking => "rowTracking", + TableFeatures::DomainMetadata => "domainMetadata", + TableFeatures::IcebergCompatV1 => "icebergCompatV1", + } + } +} + +impl fmt::Display for TableFeatures { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.as_ref()) + } +} + +impl TableFeatures { + /// Convert table feature to respective reader or/and write feature + pub fn to_reader_writer_features(&self) -> (Option, Option) { + let reader_feature = ReaderFeatures::try_from(self).ok(); + let writer_feature = WriterFeatures::try_from(self).ok(); + (reader_feature, writer_feature) + } +} + /// Features table readers can support as well as let users know /// what is supported #[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Hash)] @@ -237,6 +540,19 @@ impl fmt::Display for ReaderFeatures { } } +impl TryFrom<&TableFeatures> for ReaderFeatures { + type Error = String; + + fn try_from(value: &TableFeatures) -> Result { + match ReaderFeatures::from(value.as_ref()) { + ReaderFeatures::Other(_) => { + Err(format!("Table feature {} is not a reader feature", value)) + } + value => Ok(value), + } + } +} + /// Features table writers can support as well as let users know /// what is supported #[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Hash)] @@ -328,6 +644,19 @@ impl fmt::Display for WriterFeatures { } } +impl TryFrom<&TableFeatures> for WriterFeatures { + type Error = String; + + fn try_from(value: &TableFeatures) -> Result { + match WriterFeatures::from(value.as_ref()) { + WriterFeatures::Other(_) => { + Err(format!("Table feature {} is not a writer feature", value)) + } + value => Ok(value), + } + } +} + impl From<&parquet::record::Field> for WriterFeatures { fn from(value: &parquet::record::Field) -> Self { match value { @@ -736,6 +1065,10 @@ pub struct CommitInfo { /// Additional provenance information for the commit #[serde(flatten, default)] pub info: HashMap, + + /// User defined metadata + #[serde(skip_serializing_if = "Option::is_none")] + pub user_metadata: Option, } /// The domain metadata action contains a configuration (string) for a named metadata domain @@ -907,15 +1240,9 @@ pub(crate) mod serde_path { #[cfg(test)] mod tests { use std::path::PathBuf; - // use std::sync::Arc; - - // use object_store::local::LocalFileSystem; - - use crate::kernel::PrimitiveType; use super::*; - // use crate::client::filesystem::ObjectStoreFileSystemClient; - // use crate::executor::tokio::TokioBackgroundExecutor; + use crate::kernel::PrimitiveType; fn dv_relateive() -> DeletionVectorDescriptor { DeletionVectorDescriptor { diff --git a/crates/core/src/kernel/models/fields.rs b/crates/core/src/kernel/models/fields.rs index fa672aaefc..a5a6585060 100644 --- a/crates/core/src/kernel/models/fields.rs +++ b/crates/core/src/kernel/models/fields.rs @@ -1,8 +1,9 @@ //! 
Schema definitions for action types +use std::sync::Arc; +use delta_kernel::schema::{ArrayType, DataType, MapType, StructField, StructType}; use lazy_static::lazy_static; -use super::schema::{ArrayType, DataType, MapType, StructField, StructType}; use super::ActionType; impl ActionType { @@ -271,3 +272,10 @@ fn deletion_vector_field() -> StructField { pub(crate) fn log_schema() -> &'static StructType { &LOG_SCHEMA } + +pub(crate) fn log_schema_ref() -> &'static Arc { + lazy_static! { + static ref LOG_SCHEMA_REF: Arc = Arc::new(LOG_SCHEMA.clone()); + } + &LOG_SCHEMA_REF +} diff --git a/crates/core/src/kernel/models/schema.rs b/crates/core/src/kernel/models/schema.rs index 161de0352a..3a88564f1d 100644 --- a/crates/core/src/kernel/models/schema.rs +++ b/crates/core/src/kernel/models/schema.rs @@ -1,93 +1,21 @@ //! Delta table schema -use std::borrow::Borrow; -use std::fmt::Formatter; -use std::hash::{Hash, Hasher}; use std::sync::Arc; -use std::{collections::HashMap, fmt::Display}; -use serde::{Deserialize, Serialize}; +pub use delta_kernel::schema::{ + ArrayType, ColumnMetadataKey, DataType, MapType, MetadataValue, PrimitiveType, StructField, + StructType, +}; use serde_json::Value; use crate::kernel::error::Error; use crate::kernel::DataCheck; -use crate::protocol::ProtocolError; /// Type alias for a top level schema pub type Schema = StructType; /// Schema reference type pub type SchemaRef = Arc; -/// A value that can be stored in the metadata of a Delta table schema entity. -#[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] -#[serde(untagged)] -pub enum MetadataValue { - /// A number value - Number(i32), - /// A string value - String(String), - /// A Boolean value - Boolean(bool), -} - -impl From for MetadataValue { - fn from(value: String) -> Self { - Self::String(value) - } -} - -impl From<&String> for MetadataValue { - fn from(value: &String) -> Self { - Self::String(value.clone()) - } -} - -impl From for MetadataValue { - fn from(value: i32) -> Self { - Self::Number(value) - } -} - -impl From for MetadataValue { - fn from(value: bool) -> Self { - Self::Boolean(value) - } -} - -impl From for MetadataValue { - fn from(value: Value) -> Self { - Self::String(value.to_string()) - } -} - -#[derive(Debug)] -#[allow(missing_docs)] -pub enum ColumnMetadataKey { - ColumnMappingId, - ColumnMappingPhysicalName, - GenerationExpression, - IdentityStart, - IdentityStep, - IdentityHighWaterMark, - IdentityAllowExplicitInsert, - Invariants, -} - -impl AsRef for ColumnMetadataKey { - fn as_ref(&self) -> &str { - match self { - Self::ColumnMappingId => "delta.columnMapping.id", - Self::ColumnMappingPhysicalName => "delta.columnMapping.physicalName", - Self::GenerationExpression => "delta.generationExpression", - Self::IdentityAllowExplicitInsert => "delta.identity.allowExplicitInsert", - Self::IdentityHighWaterMark => "delta.identity.highWaterMark", - Self::IdentityStart => "delta.identity.start", - Self::IdentityStep => "delta.identity.step", - Self::Invariants => "delta.invariants", - } - } -} - /// An invariant for a column that is enforced on all writes to a Delta table. #[derive(Eq, PartialEq, Debug, Default, Clone)] pub struct Invariant { @@ -117,168 +45,17 @@ impl DataCheck for Invariant { } } -/// Represents a struct field defined in the Delta table schema. 
-// https://github.com/delta-io/delta/blob/master/PROTOCOL.md#Schema-Serialization-Format -#[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] -pub struct StructField { - /// Name of this (possibly nested) column - pub name: String, - /// The data type of this field - #[serde(rename = "type")] - pub data_type: DataType, - /// Denotes whether this Field can be null - pub nullable: bool, - /// A JSON map containing information about this column - pub metadata: HashMap, -} - -impl Hash for StructField { - fn hash(&self, state: &mut H) { - self.name.hash(state); - self.data_type.hash(state); - self.nullable.hash(state); - } -} - -impl Borrow for StructField { - fn borrow(&self) -> &str { - self.name.as_ref() - } -} - -impl Eq for StructField {} - -impl StructField { - /// Creates a new field - pub fn new(name: impl Into, data_type: impl Into, nullable: bool) -> Self { - Self { - name: name.into(), - data_type: data_type.into(), - nullable, - metadata: HashMap::default(), - } - } - - /// Creates a new field with metadata - pub fn with_metadata( - mut self, - metadata: impl IntoIterator, impl Into)>, - ) -> Self { - self.metadata = metadata - .into_iter() - .map(|(k, v)| (k.into(), v.into())) - .collect(); - self - } - - /// Get the value of a specific metadata key - pub fn get_config_value(&self, key: &ColumnMetadataKey) -> Option<&MetadataValue> { - self.metadata.get(key.as_ref()) - } - - #[inline] - /// Returns the name of the column - pub fn name(&self) -> &String { - &self.name - } - - #[inline] - /// Returns whether the column is nullable - pub fn is_nullable(&self) -> bool { - self.nullable - } - - /// Returns the physical name of the column - /// Equals the name if column mapping is not enabled on table - pub fn physical_name(&self) -> Result<&str, Error> { - // Even on mapping type id the physical name should be there for partitions - let phys_name = self.get_config_value(&ColumnMetadataKey::ColumnMappingPhysicalName); - match phys_name { - None => Ok(&self.name), - Some(MetadataValue::Boolean(_)) => Ok(&self.name), - Some(MetadataValue::String(s)) => Ok(s), - Some(MetadataValue::Number(_)) => Err(Error::MetadataError( - "Unexpected type for physical name".to_string(), - )), - } - } - - #[inline] - /// Returns the data type of the column - pub const fn data_type(&self) -> &DataType { - &self.data_type - } - - #[inline] - /// Returns the metadata of the column - pub const fn metadata(&self) -> &HashMap { - &self.metadata - } -} - -/// A struct is used to represent both the top-level schema of the table -/// as well as struct columns that contain nested columns. -#[derive(Debug, Serialize, Deserialize, PartialEq, Clone, Eq, Hash)] -pub struct StructType { - #[serde(rename = "type")] - /// The type of this struct - pub type_name: String, - /// The type of element stored in this array - pub fields: Vec, +/// Trait to add convenince functions to struct type +pub trait StructTypeExt { + /// Get all invariants in the schemas + fn get_invariants(&self) -> Result, Error>; } -impl StructType { - /// Creates a new struct type - pub fn new(fields: Vec) -> Self { - Self { - type_name: "struct".into(), - fields, - } - } - - /// Returns an immutable reference of the fields in the struct - pub fn fields(&self) -> &Vec { - &self.fields - } - - /// Find the index of the column with the given name. 
- pub fn index_of(&self, name: &str) -> Result { - let (idx, _) = self - .fields() - .iter() - .enumerate() - .find(|(_, b)| b.name() == name) - .ok_or_else(|| { - let valid_fields: Vec<_> = self.fields.iter().map(|f| f.name()).collect(); - Error::Schema(format!( - "Unable to get field named \"{name}\". Valid fields: {valid_fields:?}" - )) - })?; - Ok(idx) - } - - /// Returns a reference of a specific [`StructField`] instance selected by name. - pub fn field_with_name(&self, name: &str) -> Result<&StructField, Error> { - match name.split_once('.') { - Some((parent, children)) => { - let parent_field = &self.fields[self.index_of(parent)?]; - match parent_field.data_type { - DataType::Struct(ref inner) => Ok(inner.field_with_name(children)?), - _ => Err(Error::Schema(format!( - "Field {} is not a struct type", - parent_field.name() - ))), - } - } - None => Ok(&self.fields[self.index_of(name)?]), - } - } - +impl StructTypeExt for StructType { /// Get all invariants in the schemas - pub fn get_invariants(&self) -> Result, Error> { + fn get_invariants(&self) -> Result, Error> { let mut remaining_fields: Vec<(String, StructField)> = self .fields() - .iter() .map(|field| (field.name.clone(), field.clone())) .collect(); let mut invariants: Vec = Vec::new(); @@ -297,7 +74,6 @@ impl StructType { remaining_fields.extend( inner .fields() - .iter() .map(|field| { let new_prefix = add_segment(&field_path, &field.name); (new_prefix, field.clone()) @@ -349,521 +125,11 @@ impl StructType { } } -impl FromIterator for StructType { - fn from_iter>(iter: T) -> Self { - Self { - type_name: "struct".into(), - fields: iter.into_iter().collect(), - } - } -} - -impl<'a> FromIterator<&'a StructField> for StructType { - fn from_iter>(iter: T) -> Self { - Self { - type_name: "struct".into(), - fields: iter.into_iter().cloned().collect(), - } - } -} - -impl From<[StructField; N]> for StructType { - fn from(value: [StructField; N]) -> Self { - Self { - type_name: "struct".into(), - fields: value.to_vec(), - } - } -} - -impl<'a, const N: usize> From<[&'a StructField; N]> for StructType { - fn from(value: [&'a StructField; N]) -> Self { - Self { - type_name: "struct".into(), - fields: value.into_iter().cloned().collect(), - } - } -} - -impl<'a> IntoIterator for &'a StructType { - type Item = &'a StructField; - type IntoIter = std::slice::Iter<'a, StructField>; - - fn into_iter(self) -> Self::IntoIter { - self.fields.iter() - } -} - -#[derive(Debug, Serialize, Deserialize, PartialEq, Clone, Eq, Hash)] -#[serde(rename_all = "camelCase")] -/// An array stores a variable length collection of items of some type. 
-pub struct ArrayType { - #[serde(rename = "type")] - /// The type of this struct - pub type_name: String, - /// The type of element stored in this array - pub element_type: DataType, - /// Denoting whether this array can contain one or more null values - pub contains_null: bool, -} - -impl ArrayType { - /// Creates a new array type - pub fn new(element_type: DataType, contains_null: bool) -> Self { - Self { - type_name: "array".into(), - element_type, - contains_null, - } - } - - #[inline] - /// Returns the element type of the array - pub const fn element_type(&self) -> &DataType { - &self.element_type - } - - #[inline] - /// Returns whether the array can contain null values - pub const fn contains_null(&self) -> bool { - self.contains_null - } -} - -#[derive(Debug, Serialize, Deserialize, PartialEq, Clone, Eq, Hash)] -#[serde(rename_all = "camelCase")] -/// A map stores an arbitrary length collection of key-value pairs -pub struct MapType { - #[serde(rename = "type")] - /// The type of this struct - pub type_name: String, - /// The type of element used for the key of this map - pub key_type: DataType, - /// The type of element used for the value of this map - pub value_type: DataType, - /// Denoting whether this array can contain one or more null values - #[serde(default = "default_true")] - pub value_contains_null: bool, -} - -impl MapType { - /// Creates a new map type - pub fn new(key_type: DataType, value_type: DataType, value_contains_null: bool) -> Self { - Self { - type_name: "map".into(), - key_type, - value_type, - value_contains_null, - } - } - - #[inline] - /// Returns the key type of the map - pub const fn key_type(&self) -> &DataType { - &self.key_type - } - - #[inline] - /// Returns the value type of the map - pub const fn value_type(&self) -> &DataType { - &self.value_type - } - - #[inline] - /// Returns whether the map can contain null values - pub const fn value_contains_null(&self) -> bool { - self.value_contains_null - } -} - -fn default_true() -> bool { - true -} - -/// The maximum precision for [PrimitiveType::Decimal] values -pub const DECIMAL_MAX_PRECISION: u8 = 38; - -/// The maximum scale for [PrimitiveType::Decimal] values -pub const DECIMAL_MAX_SCALE: i8 = 38; - -#[derive(Debug, Serialize, Deserialize, PartialEq, Copy, Clone, Eq, Hash)] -#[serde(rename_all = "snake_case")] -/// Primitive types supported by Delta -pub enum PrimitiveType { - /// UTF-8 encoded string of characters - String, - /// i64: 8-byte signed integer. Range: -9223372036854775808 to 9223372036854775807 - Long, - /// i32: 4-byte signed integer. Range: -2147483648 to 2147483647 - Integer, - /// i16: 2-byte signed integer numbers. Range: -32768 to 32767 - Short, - /// i8: 1-byte signed integer number. Range: -128 to 127 - Byte, - /// f32: 4-byte single-precision floating-point numbers - Float, - /// f64: 8-byte double-precision floating-point numbers - Double, - /// bool: boolean values - Boolean, - /// Binary: uninterpreted binary data - Binary, - /// Date: Calendar date (year, month, day) - Date, - /// Microsecond precision timestamp, adjusted to UTC. 
- Timestamp, - /// Micrsoecond precision timestamp with no timezone - #[serde(alias = "timestampNtz")] - TimestampNtz, - #[serde( - serialize_with = "serialize_decimal", - deserialize_with = "deserialize_decimal", - untagged - )] - /// Decimal: arbitrary precision decimal numbers - Decimal(u8, i8), -} - -fn serialize_decimal( - precision: &u8, - scale: &i8, - serializer: S, -) -> Result { - serializer.serialize_str(&format!("decimal({},{})", precision, scale)) -} - -fn deserialize_decimal<'de, D>(deserializer: D) -> Result<(u8, i8), D::Error> -where - D: serde::Deserializer<'de>, -{ - let str_value = String::deserialize(deserializer)?; - if !str_value.starts_with("decimal(") || !str_value.ends_with(')') { - return Err(serde::de::Error::custom(format!( - "Invalid decimal: {}", - str_value - ))); - } - - let mut parts = str_value[8..str_value.len() - 1].split(','); - let precision = parts - .next() - .and_then(|part| part.trim().parse::().ok()) - .ok_or_else(|| { - serde::de::Error::custom(format!("Invalid precision in decimal: {}", str_value)) - })?; - let scale = parts - .next() - .and_then(|part| part.trim().parse::().ok()) - .ok_or_else(|| { - serde::de::Error::custom(format!("Invalid scale in decimal: {}", str_value)) - })?; - if precision > DECIMAL_MAX_PRECISION || scale > DECIMAL_MAX_SCALE { - return Err(serde::de::Error::custom(format!( - "Precision or scale is larger than 38: {}, {}", - precision, scale - ))); - } - Ok((precision, scale)) -} - -impl Display for PrimitiveType { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match self { - PrimitiveType::String => write!(f, "string"), - PrimitiveType::Long => write!(f, "long"), - PrimitiveType::Integer => write!(f, "integer"), - PrimitiveType::Short => write!(f, "short"), - PrimitiveType::Byte => write!(f, "byte"), - PrimitiveType::Float => write!(f, "float"), - PrimitiveType::Double => write!(f, "double"), - PrimitiveType::Boolean => write!(f, "boolean"), - PrimitiveType::Binary => write!(f, "binary"), - PrimitiveType::Date => write!(f, "date"), - PrimitiveType::Timestamp => write!(f, "timestamp"), - PrimitiveType::TimestampNtz => write!(f, "timestampNtz"), - PrimitiveType::Decimal(precision, scale) => { - write!(f, "decimal({},{})", precision, scale) - } - } - } -} - -#[derive(Debug, Serialize, Deserialize, PartialEq, Clone, Eq, Hash)] -#[serde(untagged, rename_all = "camelCase")] -/// Top level delta tdatatypes -pub enum DataType { - /// UTF-8 encoded string of characters - Primitive(PrimitiveType), - /// An array stores a variable length collection of items of some type. - Array(Box), - /// A struct is used to represent both the top-level schema of the table as well - /// as struct columns that contain nested columns. 
- Struct(Box), - /// A map stores an arbitrary length collection of key-value pairs - /// with a single keyType and a single valueType - Map(Box), -} - -impl From for DataType { - fn from(map_type: MapType) -> Self { - DataType::Map(Box::new(map_type)) - } -} - -impl From for DataType { - fn from(struct_type: StructType) -> Self { - DataType::Struct(Box::new(struct_type)) - } -} - -impl From for DataType { - fn from(array_type: ArrayType) -> Self { - DataType::Array(Box::new(array_type)) - } -} - -#[allow(missing_docs)] -impl DataType { - pub const STRING: Self = DataType::Primitive(PrimitiveType::String); - pub const LONG: Self = DataType::Primitive(PrimitiveType::Long); - pub const INTEGER: Self = DataType::Primitive(PrimitiveType::Integer); - pub const SHORT: Self = DataType::Primitive(PrimitiveType::Short); - pub const BYTE: Self = DataType::Primitive(PrimitiveType::Byte); - pub const FLOAT: Self = DataType::Primitive(PrimitiveType::Float); - pub const DOUBLE: Self = DataType::Primitive(PrimitiveType::Double); - pub const BOOLEAN: Self = DataType::Primitive(PrimitiveType::Boolean); - pub const BINARY: Self = DataType::Primitive(PrimitiveType::Binary); - pub const DATE: Self = DataType::Primitive(PrimitiveType::Date); - pub const TIMESTAMP: Self = DataType::Primitive(PrimitiveType::Timestamp); - pub const TIMESTAMPNTZ: Self = DataType::Primitive(PrimitiveType::TimestampNtz); - - pub fn decimal(precision: u8, scale: i8) -> Result { - if precision > DECIMAL_MAX_PRECISION || scale > DECIMAL_MAX_SCALE { - return Err(ProtocolError::InvalidField(format!( - "decimal({},{})", - precision, scale - ))); - } - Ok(DataType::Primitive(PrimitiveType::Decimal( - precision, scale, - ))) - } - - pub fn struct_type(fields: Vec) -> Self { - DataType::Struct(Box::new(StructType::new(fields))) - } -} - -impl Display for DataType { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match self { - DataType::Primitive(p) => write!(f, "{}", p), - DataType::Array(a) => write!(f, "array<{}>", a.element_type), - DataType::Struct(s) => { - write!(f, "struct<")?; - for (i, field) in s.fields.iter().enumerate() { - if i > 0 { - write!(f, ", ")?; - } - write!(f, "{}: {}", field.name, field.data_type)?; - } - write!(f, ">") - } - DataType::Map(m) => write!(f, "map<{}, {}>", m.key_type, m.value_type), - } - } -} - #[cfg(test)] mod tests { use super::*; use serde_json; use serde_json::json; - use std::collections::hash_map::DefaultHasher; - - #[test] - fn test_serde_data_types() { - let data = r#" - { - "name": "a", - "type": "integer", - "nullable": false, - "metadata": {} - } - "#; - let field: StructField = serde_json::from_str(data).unwrap(); - assert!(matches!( - field.data_type, - DataType::Primitive(PrimitiveType::Integer) - )); - - let data = r#" - { - "name": "c", - "type": { - "type": "array", - "elementType": "integer", - "containsNull": false - }, - "nullable": true, - "metadata": {} - } - "#; - let field: StructField = serde_json::from_str(data).unwrap(); - assert!(matches!(field.data_type, DataType::Array(_))); - - let data = r#" - { - "name": "e", - "type": { - "type": "array", - "elementType": { - "type": "struct", - "fields": [ - { - "name": "d", - "type": "integer", - "nullable": false, - "metadata": {} - } - ] - }, - "containsNull": true - }, - "nullable": true, - "metadata": {} - } - "#; - let field: StructField = serde_json::from_str(data).unwrap(); - assert!(matches!(field.data_type, DataType::Array(_))); - match field.data_type { - DataType::Array(array) => 
assert!(matches!(array.element_type, DataType::Struct(_))), - _ => unreachable!(), - } - - let data = r#" - { - "name": "f", - "type": { - "type": "map", - "keyType": "string", - "valueType": "string", - "valueContainsNull": true - }, - "nullable": true, - "metadata": {} - } - "#; - let field: StructField = serde_json::from_str(data).unwrap(); - assert!(matches!(field.data_type, DataType::Map(_))); - } - - #[test] - fn test_roundtrip_decimal() { - let data = r#" - { - "name": "a", - "type": "decimal(10, 2)", - "nullable": false, - "metadata": {} - } - "#; - let field: StructField = serde_json::from_str(data).unwrap(); - assert!(matches!( - field.data_type, - DataType::Primitive(PrimitiveType::Decimal(10, 2)) - )); - - let json_str = serde_json::to_string(&field).unwrap(); - assert_eq!( - json_str, - r#"{"name":"a","type":"decimal(10,2)","nullable":false,"metadata":{}}"# - ); - } - - #[test] - fn test_invalid_decimal() { - let data = r#" - { - "name": "a", - "type": "decimal(39, 10)", - "nullable": false, - "metadata": {} - } - "#; - assert!(matches!( - serde_json::from_str::(data).unwrap_err(), - _ - )); - - let data = r#" - { - "name": "a", - "type": "decimal(10, 39)", - "nullable": false, - "metadata": {} - } - "#; - assert!(matches!( - serde_json::from_str::(data).unwrap_err(), - _ - )); - } - - #[test] - fn test_field_metadata() { - let data = r#" - { - "name": "e", - "type": { - "type": "array", - "elementType": { - "type": "struct", - "fields": [ - { - "name": "d", - "type": "integer", - "nullable": false, - "metadata": { - "delta.columnMapping.id": 5, - "delta.columnMapping.physicalName": "col-a7f4159c-53be-4cb0-b81a-f7e5240cfc49" - } - } - ] - }, - "containsNull": true - }, - "nullable": true, - "metadata": { - "delta.columnMapping.id": 4, - "delta.columnMapping.physicalName": "col-5f422f40-de70-45b2-88ab-1d5c90e94db1" - } - } - "#; - let field: StructField = serde_json::from_str(data).unwrap(); - - let col_id = field - .get_config_value(&ColumnMetadataKey::ColumnMappingId) - .unwrap(); - assert!(matches!(col_id, MetadataValue::Number(num) if *num == 4)); - let physical_name = field - .get_config_value(&ColumnMetadataKey::ColumnMappingPhysicalName) - .unwrap(); - assert!( - matches!(physical_name, MetadataValue::String(name) if *name == "col-5f422f40-de70-45b2-88ab-1d5c90e94db1") - ); - } - - #[test] - fn test_read_schemas() { - let file = std::fs::File::open("./tests/serde/schema.json").unwrap(); - let schema: Result = serde_json::from_reader(file); - assert!(schema.is_ok()); - - let file = std::fs::File::open("./tests/serde/checkpoint_schema.json").unwrap(); - let schema: Result = serde_json::from_reader(file); - assert!(schema.is_ok()) - } #[test] fn test_get_invariants() { @@ -934,88 +200,4 @@ mod tests { let buf = r#"{"type":"struct","fields":[{"name":"ID_D_DATE","type":"long","nullable":true,"metadata":{"delta.identity.start":1,"delta.identity.step":1,"delta.identity.allowExplicitInsert":false}},{"name":"TXT_DateKey","type":"string","nullable":true,"metadata":{}}]}"#; let _schema: StructType = serde_json::from_str(buf).expect("Failed to load"); } - - fn get_hash(field: &StructField) -> u64 { - let mut hasher = DefaultHasher::new(); - field.hash(&mut hasher); - hasher.finish() - } - - #[test] - fn test_hash_struct_field() { - // different names should result in different hashes - let field_1 = StructField::new( - "field_name_1", - DataType::Primitive(PrimitiveType::Decimal(4, 4)), - true, - ); - let field_2 = StructField::new( - "field_name_2", - 
DataType::Primitive(PrimitiveType::Decimal(4, 4)), - true, - ); - assert_ne!(get_hash(&field_1), get_hash(&field_2)); - - // different types should result in different hashes - let field_int = StructField::new( - "field_name", - DataType::Primitive(PrimitiveType::Integer), - true, - ); - let field_string = StructField::new( - "field_name", - DataType::Primitive(PrimitiveType::String), - true, - ); - assert_ne!(get_hash(&field_int), get_hash(&field_string)); - - // different nullability should result in different hashes - let field_true = StructField::new( - "field_name", - DataType::Primitive(PrimitiveType::Binary), - true, - ); - let field_false = StructField::new( - "field_name", - DataType::Primitive(PrimitiveType::Binary), - false, - ); - assert_ne!(get_hash(&field_true), get_hash(&field_false)); - - // case where hashes are the same - let field_1 = StructField::new( - "field_name", - DataType::Primitive(PrimitiveType::Timestamp), - true, - ); - let field_2 = StructField::new( - "field_name", - DataType::Primitive(PrimitiveType::Timestamp), - true, - ); - assert_eq!(get_hash(&field_1), get_hash(&field_2)); - } - - #[test] - fn test_field_with_name() { - let schema = StructType::new(vec![ - StructField::new("a", DataType::STRING, true), - StructField::new("b", DataType::INTEGER, true), - ]); - let field = schema.field_with_name("b").unwrap(); - assert_eq!(*field, StructField::new("b", DataType::INTEGER, true)); - } - - #[test] - fn test_field_with_name_nested() { - let nested = StructType::new(vec![StructField::new("a", DataType::BOOLEAN, true)]); - let schema = StructType::new(vec![ - StructField::new("a", DataType::STRING, true), - StructField::new("b", DataType::Struct(Box::new(nested)), true), - ]); - - let field = schema.field_with_name("b.a").unwrap(); - - assert_eq!(*field, StructField::new("a", DataType::BOOLEAN, true)); - } } diff --git a/crates/core/src/kernel/scalars.rs b/crates/core/src/kernel/scalars.rs new file mode 100644 index 0000000000..bc1bd6eed9 --- /dev/null +++ b/crates/core/src/kernel/scalars.rs @@ -0,0 +1,286 @@ +//! Auxiliary methods for dealing with kernel scalars +use std::cmp::Ordering; + +use arrow_array::Array; +use arrow_schema::TimeUnit; +use chrono::{DateTime, TimeZone, Utc}; +use delta_kernel::{ + expressions::{Scalar, StructData}, + schema::StructField, +}; +use object_store::path::Path; +#[cfg(test)] +use serde_json::Value; +use urlencoding::encode; + +use crate::NULL_PARTITION_VALUE_DATA_PATH; + +/// Auxiliary methods for dealing with kernel scalars +pub trait ScalarExt: Sized { + /// Serialize to string + fn serialize(&self) -> String; + /// Serialize to string for use in hive partition file names + fn serialize_encoded(&self) -> String; + /// Create a [`Scalar`] from an arrow array row + fn from_array(arr: &dyn Array, index: usize) -> Option; + /// Serialize as serde_json::Value + #[cfg(test)] + fn to_json(&self) -> serde_json::Value; +} + +impl ScalarExt for Scalar { + /// Serializes this scalar as a string. 
+ fn serialize(&self) -> String { + match self { + Self::String(s) => s.to_owned(), + Self::Byte(b) => b.to_string(), + Self::Short(s) => s.to_string(), + Self::Integer(i) => i.to_string(), + Self::Long(l) => l.to_string(), + Self::Float(f) => f.to_string(), + Self::Double(d) => d.to_string(), + Self::Boolean(b) => if *b { "true" } else { "false" }.to_string(), + Self::TimestampNtz(ts) | Self::Timestamp(ts) => { + let ts = Utc.timestamp_micros(*ts).single().unwrap(); + ts.format("%Y-%m-%d %H:%M:%S%.6f").to_string() + } + Self::Date(days) => { + let date = DateTime::from_timestamp(*days as i64 * 24 * 3600, 0).unwrap(); + date.format("%Y-%m-%d").to_string() + } + Self::Decimal(value, _, scale) => match scale.cmp(&0) { + Ordering::Equal => value.to_string(), + Ordering::Greater => { + let scalar_multiple = 10_i128.pow(*scale as u32); + let mut s = String::new(); + s.push_str((value / scalar_multiple).to_string().as_str()); + s.push('.'); + s.push_str(&format!( + "{:0>scale$}", + value % scalar_multiple, + scale = *scale as usize + )); + s + } + Ordering::Less => { + let mut s = value.to_string(); + for _ in 0..*scale { + s.push('0'); + } + s + } + }, + Self::Binary(val) => create_escaped_binary_string(val.as_slice()), + Self::Null(_) => "null".to_string(), + Self::Struct(_) => unimplemented!(), + } + } + + /// Serializes this scalar as a string for use in hive partition file names. + fn serialize_encoded(&self) -> String { + if self.is_null() { + return NULL_PARTITION_VALUE_DATA_PATH.to_string(); + } + encode(Path::from(self.serialize()).as_ref()).to_string() + } + + /// Create a [`Scalar`] form a row in an arrow array. + fn from_array(arr: &dyn Array, index: usize) -> Option { + use arrow_array::*; + use arrow_schema::DataType::*; + + if arr.len() <= index { + return None; + } + if arr.is_null(index) { + return Some(Self::Null(arr.data_type().try_into().ok()?)); + } + + match arr.data_type() { + Utf8 => arr + .as_any() + .downcast_ref::() + .map(|v| Self::String(v.value(index).to_string())), + LargeUtf8 => arr + .as_any() + .downcast_ref::() + .map(|v| Self::String(v.value(index).to_string())), + Boolean => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Boolean(v.value(index))), + Binary => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Binary(v.value(index).to_vec())), + LargeBinary => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Binary(v.value(index).to_vec())), + FixedSizeBinary(_) => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Binary(v.value(index).to_vec())), + Int8 => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Byte(v.value(index))), + Int16 => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Short(v.value(index))), + Int32 => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Integer(v.value(index))), + Int64 => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Long(v.value(index))), + UInt8 => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Byte(v.value(index) as i8)), + UInt16 => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Short(v.value(index) as i16)), + UInt32 => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Integer(v.value(index) as i32)), + UInt64 => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Long(v.value(index) as i64)), + Float32 => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Float(v.value(index))), + Float64 => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Double(v.value(index))), + Decimal128(precision, scale) => { + arr.as_any().downcast_ref::().map(|v| { + let value = 
v.value(index); + Self::Decimal(value, *precision, *scale as u8) + }) + } + Date32 => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Date(v.value(index))), + Timestamp(TimeUnit::Microsecond, None) => arr + .as_any() + .downcast_ref::() + .map(|v| Self::TimestampNtz(v.value(index))), + Timestamp(TimeUnit::Microsecond, Some(tz)) if tz.eq_ignore_ascii_case("utc") => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Timestamp(v.clone().value(index))), + Struct(fields) => { + let struct_fields = fields + .iter() + .flat_map(|f| TryFrom::try_from(f.as_ref())) + .collect::>(); + let values = arr + .as_any() + .downcast_ref::() + .and_then(|struct_arr| { + struct_fields + .iter() + .map(|f: &StructField| { + struct_arr + .column_by_name(f.name()) + .and_then(|c| Self::from_array(c.as_ref(), index)) + }) + .collect::>>() + })?; + Some(Self::Struct( + StructData::try_new(struct_fields, values).ok()?, + )) + } + Float16 + | Decimal256(_, _) + | List(_) + | LargeList(_) + | FixedSizeList(_, _) + | Map(_, _) + | Date64 + | Timestamp(_, _) + | Time32(_) + | Time64(_) + | Duration(_) + | Interval(_) + | Dictionary(_, _) + | RunEndEncoded(_, _) + | Union(_, _) + | Utf8View + | BinaryView + | ListView(_) + | LargeListView(_) + | Null => None, + } + } + + /// Serializes this scalar as a serde_json::Value. + #[cfg(test)] + fn to_json(&self) -> serde_json::Value { + match self { + Self::String(s) => Value::String(s.to_owned()), + Self::Byte(b) => Value::Number(serde_json::Number::from(*b)), + Self::Short(s) => Value::Number(serde_json::Number::from(*s)), + Self::Integer(i) => Value::Number(serde_json::Number::from(*i)), + Self::Long(l) => Value::Number(serde_json::Number::from(*l)), + Self::Float(f) => Value::Number(serde_json::Number::from_f64(*f as f64).unwrap()), + Self::Double(d) => Value::Number(serde_json::Number::from_f64(*d).unwrap()), + Self::Boolean(b) => Value::Bool(*b), + Self::TimestampNtz(ts) | Self::Timestamp(ts) => { + let ts = Utc.timestamp_micros(*ts).single().unwrap(); + Value::String(ts.format("%Y-%m-%d %H:%M:%S%.6f").to_string()) + } + Self::Date(days) => { + let date = DateTime::from_timestamp(*days as i64 * 24 * 3600, 0).unwrap(); + Value::String(date.format("%Y-%m-%d").to_string()) + } + Self::Decimal(value, _, scale) => match scale.cmp(&0) { + Ordering::Equal => Value::String(value.to_string()), + Ordering::Greater => { + let scalar_multiple = 10_i128.pow(*scale as u32); + let mut s = String::new(); + s.push_str((value / scalar_multiple).to_string().as_str()); + s.push('.'); + s.push_str(&format!( + "{:0>scale$}", + value % scalar_multiple, + scale = *scale as usize + )); + Value::String(s) + } + Ordering::Less => { + let mut s = value.to_string(); + for _ in 0..*scale { + s.push('0'); + } + Value::String(s) + } + }, + Self::Binary(val) => Value::String(create_escaped_binary_string(val.as_slice())), + Self::Null(_) => Value::Null, + Self::Struct(_) => unimplemented!(), + } + } +} + +fn create_escaped_binary_string(data: &[u8]) -> String { + let mut escaped_string = String::new(); + for &byte in data { + // Convert each byte to its two-digit hexadecimal representation + let hex_representation = format!("{:04X}", byte); + // Append the hexadecimal representation with an escape sequence + escaped_string.push_str("\\u"); + escaped_string.push_str(&hex_representation); + } + escaped_string +} diff --git a/crates/core/src/kernel/snapshot/log_data.rs b/crates/core/src/kernel/snapshot/log_data.rs index 24fae0ad75..1a30ec7c46 100644 --- a/crates/core/src/kernel/snapshot/log_data.rs 
+++ b/crates/core/src/kernel/snapshot/log_data.rs @@ -2,16 +2,20 @@ use std::borrow::Cow; use std::collections::HashMap; use std::sync::Arc; -use arrow_array::{Array, Int32Array, Int64Array, MapArray, RecordBatch, StringArray, StructArray}; +use arrow_array::{ + Array, Int32Array, Int64Array, MapArray, RecordBatch, StringArray, StructArray, UInt64Array, +}; use chrono::{DateTime, Utc}; +use delta_kernel::expressions::Scalar; use indexmap::IndexMap; use object_store::path::Path; use object_store::ObjectMeta; use percent_encoding::percent_decode_str; +use super::super::scalars::ScalarExt; use crate::kernel::arrow::extract::{extract_and_cast, extract_and_cast_opt}; use crate::kernel::{ - DataType, DeletionVectorDescriptor, Metadata, Remove, Scalar, StructField, StructType, + DataType, DeletionVectorDescriptor, Metadata, Remove, StructField, StructType, }; use crate::{DeltaResult, DeltaTableError}; @@ -196,12 +200,16 @@ impl LogicalFile<'_> { .column(0) .as_any() .downcast_ref::() - .ok_or(DeltaTableError::Generic("()".into()))?; + .ok_or(DeltaTableError::generic( + "expected partition values key field to be of type string", + ))?; let values = map_value .column(1) .as_any() .downcast_ref::() - .ok_or(DeltaTableError::Generic("()".into()))?; + .ok_or(DeltaTableError::generic( + "expected partition values value field to be of type string", + ))?; let values = keys .iter() @@ -210,8 +218,8 @@ impl LogicalFile<'_> { let (key, field) = self.partition_fields.get_key_value(k.unwrap()).unwrap(); let field_type = match field.data_type() { DataType::Primitive(p) => Ok(p), - _ => Err(DeltaTableError::Generic( - "nested partitioning values are not supported".to_string(), + _ => Err(DeltaTableError::generic( + "nested partitioning values are not supported", )), }?; Ok(( @@ -223,7 +231,7 @@ impl LogicalFile<'_> { }) .collect::>>()?; - // NOTE: we recreate the map as a BTreeMap to ensure the order of the keys is consistently + // NOTE: we recreate the map as an IndexMap to ensure the order of the keys is consistently
self.partition_fields .iter() @@ -351,7 +359,16 @@ impl<'a> FileStatsAccessor<'a> { metadata .partition_columns .iter() - .map(|c| Ok((c.as_str(), schema.field_with_name(c.as_str())?))) + .map(|c| { + Ok(( + c.as_str(), + schema + .field(c.as_str()) + .ok_or(DeltaTableError::PartitionError { + partition: c.clone(), + })?, + )) + }) .collect::>>()?, ); let deletion_vector = extract_and_cast_opt::(data, "add.deletionVector"); @@ -459,20 +476,35 @@ impl<'a> IntoIterator for LogDataHandler<'a> { #[cfg(feature = "datafusion")] mod datafusion { + use std::collections::HashSet; use std::sync::Arc; + use ::datafusion::functions_aggregate::min_max::{MaxAccumulator, MinAccumulator}; + use ::datafusion::physical_optimizer::pruning::PruningStatistics; + use ::datafusion::physical_plan::Accumulator; + use arrow::compute::concat_batches; use arrow_arith::aggregate::sum; - use arrow_array::Int64Array; + use arrow_array::{ArrayRef, BooleanArray, Int64Array}; use arrow_schema::DataType as ArrowDataType; use datafusion_common::scalar::ScalarValue; use datafusion_common::stats::{ColumnStatistics, Precision, Statistics}; - use datafusion_expr::AggregateFunction; - use datafusion_physical_expr::aggregate::AggregateExpr; - use datafusion_physical_expr::expressions::{Column, Max, Min}; + use datafusion_common::Column; + use delta_kernel::engine::arrow_data::ArrowEngineData; + use delta_kernel::expressions::Expression; + use delta_kernel::schema::{DataType, PrimitiveType}; + use delta_kernel::{ExpressionEvaluator, ExpressionHandler}; use super::*; use crate::kernel::arrow::extract::{extract_and_cast_opt, extract_column}; + use crate::kernel::ARROW_HANDLER; + #[derive(Debug, Default, Clone)] + enum AccumulatorType { + Min, + Max, + #[default] + Unused, + } // TODO validate this works with "wide and narrow" builds / stats impl FileStatsAccessor<'_> { @@ -501,7 +533,7 @@ mod datafusion { &self, path_step: &str, name: &str, - fun: &AggregateFunction, + fun_type: AccumulatorType, ) -> Precision { let mut path = name.split('.'); let array = if let Ok(array) = extract_column(self.stats, path_step, &mut path) { @@ -511,28 +543,24 @@ mod datafusion { }; if array.data_type().is_primitive() { - let agg: Box = match fun { - AggregateFunction::Min => Box::new(Min::new( - // NOTE: this is just a placeholder, we never evalutae this expression - Arc::new(Column::new(name, 0)), - name, - array.data_type().clone(), - )), - AggregateFunction::Max => Box::new(Max::new( - // NOTE: this is just a placeholder, we never evalutae this expression - Arc::new(Column::new(name, 0)), - name, - array.data_type().clone(), - )), - _ => return Precision::Absent, + let accumulator: Option> = match fun_type { + AccumulatorType::Min => MinAccumulator::try_new(array.data_type()) + .map_or(None, |a| Some(Box::new(a))), + AccumulatorType::Max => MaxAccumulator::try_new(array.data_type()) + .map_or(None, |a| Some(Box::new(a))), + _ => None, }; - let mut accum = agg.create_accumulator().ok().unwrap(); - return accum - .update_batch(&[array.clone()]) - .ok() - .and_then(|_| accum.evaluate().ok()) - .map(Precision::Exact) - .unwrap_or(Precision::Absent); + + if let Some(mut accumulator) = accumulator { + return accumulator + .update_batch(&[array.clone()]) + .ok() + .and_then(|_| accumulator.evaluate().ok()) + .map(Precision::Exact) + .unwrap_or(Precision::Absent); + } + + return Precision::Absent; } match array.data_type() { @@ -540,7 +568,11 @@ mod datafusion { return fields .iter() .map(|f| { - self.column_bounds(path_step, &format!("{name}.{}", 
f.name()), fun) + self.column_bounds( + path_step, + &format!("{name}.{}", f.name()), + fun_type.clone(), + ) }) .map(|s| match s { Precision::Exact(s) => Some(s), @@ -579,8 +611,7 @@ mod datafusion { let null_count_col = format!("{COL_NULL_COUNT}.{}", name.as_ref()); let null_count = self.collect_count(&null_count_col); - let min_value = - self.column_bounds(COL_MIN_VALUES, name.as_ref(), &AggregateFunction::Min); + let min_value = self.column_bounds(COL_MIN_VALUES, name.as_ref(), AccumulatorType::Min); let min_value = match &min_value { Precision::Exact(value) if value.is_null() => Precision::Absent, // TODO this is a hack, we should not be casting here but rather when we read the checkpoint data. @@ -591,8 +622,7 @@ mod datafusion { _ => min_value, }; - let max_value = - self.column_bounds(COL_MAX_VALUES, name.as_ref(), &AggregateFunction::Max); + let max_value = self.column_bounds(COL_MAX_VALUES, name.as_ref(), AccumulatorType::Max); let max_value = match &max_value { Precision::Exact(value) if value.is_null() => Precision::Absent, Precision::Exact(ScalarValue::TimestampNanosecond(a, b)) => Precision::Exact( @@ -670,7 +700,6 @@ mod datafusion { let column_statistics = self .schema .fields() - .iter() .map(|f| self.column_stats(f.name())) .collect::>>()?; Some(Statistics { @@ -679,6 +708,122 @@ mod datafusion { column_statistics, }) } + + fn pick_stats(&self, column: &Column, stats_field: &'static str) -> Option { + let field = self.schema.field(&column.name)?; + // See issue #1214. Binary type does not support natural order which is required for Datafusion to prune + if field.data_type() == &DataType::Primitive(PrimitiveType::Binary) { + return None; + } + let expression = if self.metadata.partition_columns.contains(&column.name) { + Expression::Column(format!("add.partitionValues_parsed.{}", column.name)) + } else { + Expression::Column(format!("add.stats_parsed.{}.{}", stats_field, column.name)) + }; + let evaluator = ARROW_HANDLER.get_evaluator( + crate::kernel::models::fields::log_schema_ref().clone(), + expression, + field.data_type().clone(), + ); + let mut results = Vec::with_capacity(self.data.len()); + for batch in self.data.iter() { + let engine = ArrowEngineData::new(batch.clone()); + let result = evaluator.evaluate(&engine).ok()?; + let result = result + .as_any() + .downcast_ref::() + .ok_or(DeltaTableError::generic( + "failed to downcast evaluator result to ArrowEngineData.", + )) + .ok()?; + results.push(result.record_batch().clone()); + } + let batch = concat_batches(results[0].schema_ref(), &results).ok()?; + batch.column_by_name("output").map(|c| c.clone()) + } + } + + impl<'a> PruningStatistics for LogDataHandler<'a> { + /// return the minimum values for the named column, if known. + /// Note: the returned array must contain `num_containers()` rows + fn min_values(&self, column: &Column) -> Option { + self.pick_stats(column, "minValues") + } + + /// return the maximum values for the named column, if known. + /// Note: the returned array must contain `num_containers()` rows. + fn max_values(&self, column: &Column) -> Option { + self.pick_stats(column, "maxValues") + } + + /// return the number of containers (e.g. row groups) being + /// pruned with these statistics + fn num_containers(&self) -> usize { + self.data.iter().map(|f| f.num_rows()).sum() + } + + /// return the number of null values for the named column as an + /// `Option`. + /// + /// Note: the returned array must contain `num_containers()` rows. 
+ fn null_counts(&self, column: &Column) -> Option { + if !self.metadata.partition_columns.contains(&column.name) { + let counts = self.pick_stats(column, "nullCount")?; + return arrow_cast::cast(counts.as_ref(), &ArrowDataType::UInt64).ok(); + } + let partition_values = self.pick_stats(column, "__dummy__")?; + let row_counts = self.row_counts(column)?; + let row_counts = row_counts.as_any().downcast_ref::()?; + let mut null_counts = Vec::with_capacity(partition_values.len()); + for i in 0..partition_values.len() { + let null_count = if partition_values.is_null(i) { + row_counts.value(i) + } else { + 0 + }; + null_counts.push(null_count); + } + Some(Arc::new(UInt64Array::from(null_counts))) + } + + /// return the number of rows for the named column in each container + /// as an `Option`. + /// + /// Note: the returned array must contain `num_containers()` rows + fn row_counts(&self, _column: &Column) -> Option { + lazy_static::lazy_static! { + static ref ROW_COUNTS_EVAL: Arc = ARROW_HANDLER.get_evaluator( + crate::kernel::models::fields::log_schema_ref().clone(), + Expression::column("add.stats_parsed.numRecords"), + DataType::Primitive(PrimitiveType::Long), + ); + } + let mut results = Vec::with_capacity(self.data.len()); + for batch in self.data.iter() { + let engine = ArrowEngineData::new(batch.clone()); + let result = ROW_COUNTS_EVAL.evaluate(&engine).ok()?; + let result = result + .as_any() + .downcast_ref::() + .ok_or(DeltaTableError::generic( + "failed to downcast evaluator result to ArrowEngineData.", + )) + .ok()?; + results.push(result.record_batch().clone()); + } + let batch = concat_batches(results[0].schema_ref(), &results).ok()?; + arrow_cast::cast(batch.column_by_name("output")?, &ArrowDataType::UInt64).ok() + } + + // This function is required since DataFusion 35.0, but is implemented as a no-op + // https://github.com/apache/arrow-datafusion/blob/ec6abece2dcfa68007b87c69eefa6b0d7333f628/datafusion/core/src/datasource/physical_plan/parquet/page_filter.rs#L550 + fn contained( + &self, + _column: &Column, + _value: &HashSet, + ) -> Option { + None + } } } diff --git a/crates/core/src/kernel/snapshot/log_segment.rs b/crates/core/src/kernel/snapshot/log_segment.rs index 2f76ac18d4..596304e003 100644 --- a/crates/core/src/kernel/snapshot/log_segment.rs +++ b/crates/core/src/kernel/snapshot/log_segment.rs @@ -9,8 +9,9 @@ use itertools::Itertools; use lazy_static::lazy_static; use object_store::path::Path; use object_store::{Error as ObjectStoreError, ObjectMeta, ObjectStore}; -use parquet::arrow::arrow_reader::ArrowReaderOptions; +use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions}; use parquet::arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStreamBuilder}; +use parquet::arrow::ProjectionMask; use regex::Regex; use serde::{Deserialize, Serialize}; use tracing::debug; @@ -36,7 +37,7 @@ lazy_static! 
{ /// specifically, this trait adds the ability to recognize valid log files and /// parse the version number from a log file path // TODO handle compaction files -pub(super) trait PathExt { +pub(crate) trait PathExt { fn child(&self, path: impl AsRef) -> DeltaResult; /// Returns the last path segment if not terminated with a "/" fn filename(&self) -> Option<&str>; @@ -250,19 +251,45 @@ impl LogSegment { pub(super) fn checkpoint_stream( &self, store: Arc, - _read_schema: &Schema, + read_schema: &Schema, config: &DeltaTableConfig, ) -> BoxStream<'_, DeltaResult> { let batch_size = config.log_batch_size; + let read_schema = Arc::new(read_schema.clone()); futures::stream::iter(self.checkpoint_files.clone()) .map(move |meta| { let store = store.clone(); + let read_schema = read_schema.clone(); async move { - let reader = ParquetObjectReader::new(store, meta); - let options = ArrowReaderOptions::new(); //.with_page_index(enable_page_index); - let builder = - ParquetRecordBatchStreamBuilder::new_with_options(reader, options).await?; - builder.with_batch_size(batch_size).build() + let mut reader = ParquetObjectReader::new(store, meta); + let options = ArrowReaderOptions::new(); + let reader_meta = ArrowReaderMetadata::load_async(&mut reader, options).await?; + + // Create projection selecting read_schema fields from parquet file's arrow schema + let projection = reader_meta + .schema() + .fields + .iter() + .enumerate() + .filter_map(|(i, f)| { + if read_schema.fields.contains_key(f.name()) { + Some(i) + } else { + None + } + }) + .collect::>(); + let projection = + ProjectionMask::roots(reader_meta.parquet_schema(), projection); + + // Note: the output batch stream batches have all null value rows for action types not + // present in the projection. When a RowFilter was used to remove null rows, the performance + // got worse when projecting all fields, and was no better when projecting a subset. + // The all null rows are filtered out anyway when the batch stream is consumed. + ParquetRecordBatchStreamBuilder::new_with_metadata(reader, reader_meta) + .with_projection(projection.clone()) + .with_batch_size(batch_size) + .build() } }) .buffered(config.log_buffer_size) @@ -373,13 +400,13 @@ struct CheckpointMetadata { #[allow(unreachable_pub)] // used by acceptance tests (TODO make an fn accessor?) pub version: i64, /// The number of actions that are stored in the checkpoint. - pub(crate) size: i32, + pub(crate) size: i64, /// The number of fragments if the last checkpoint was written in multiple parts. pub(crate) parts: Option, /// The number of bytes of the checkpoint. - pub(crate) size_in_bytes: Option, + pub(crate) size_in_bytes: Option, /// The number of AddFile actions in the checkpoint. - pub(crate) num_of_add_files: Option, + pub(crate) num_of_add_files: Option, /// The schema of the checkpoint file. pub(crate) checkpoint_schema: Option, /// The checksum of the last checkpoint JSON. 
@@ -514,7 +541,13 @@ pub(super) mod tests { use deltalake_test::utils::*; use tokio::task::JoinHandle; - use crate::checkpoints::create_checkpoint_from_table_uri_and_cleanup; + use crate::{ + checkpoints::{create_checkpoint_for, create_checkpoint_from_table_uri_and_cleanup}, + kernel::{Action, Add, Format, Remove}, + operations::transaction::{CommitBuilder, TableReference}, + protocol::{DeltaOperation, SaveMode}, + DeltaTableBuilder, + }; use super::*; @@ -655,13 +688,11 @@ pub(super) mod tests { mod slow_store { use std::sync::Arc; - use bytes::Bytes; use futures::stream::BoxStream; use object_store::{ - path::Path, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, - PutOptions, PutResult, Result, + path::Path, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, + ObjectStore, PutMultipartOpts, PutOptions, PutPayload, PutResult, Result, }; - use tokio::io::AsyncWrite; #[derive(Debug)] pub(super) struct SlowListStore { @@ -679,24 +710,21 @@ pub(super) mod tests { async fn put_opts( &self, location: &Path, - bytes: Bytes, + bytes: PutPayload, opts: PutOptions, ) -> Result { self.store.put_opts(location, bytes, opts).await } - async fn put_multipart( - &self, - location: &Path, - ) -> Result<(MultipartId, Box)> { + async fn put_multipart(&self, location: &Path) -> Result> { self.store.put_multipart(location).await } - async fn abort_multipart( + async fn put_multipart_opts( &self, location: &Path, - multipart_id: &MultipartId, - ) -> Result<()> { - self.store.abort_multipart(location, multipart_id).await + opts: PutMultipartOpts, + ) -> Result> { + self.store.put_multipart_opts(location, opts).await } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { @@ -742,4 +770,94 @@ pub(super) mod tests { assert!(!path.is_commit_file()); } } + + #[tokio::test] + async fn test_checkpoint_stream_parquet_read() { + let metadata = Metadata { + id: "test".to_string(), + format: Format::new("parquet".to_string(), None), + schema_string: r#"{"type":"struct", "fields": []}"#.to_string(), + ..Default::default() + }; + let protocol = Protocol::default(); + + let mut actions = vec![Action::Metadata(metadata), Action::Protocol(protocol)]; + for i in 0..10 { + actions.push(Action::Add(Add { + path: format!("part-{}.parquet", i), + modification_time: chrono::Utc::now().timestamp_millis(), + ..Default::default() + })); + } + + let log_store = DeltaTableBuilder::from_uri("memory:///") + .build_storage() + .unwrap(); + let op = DeltaOperation::Write { + mode: SaveMode::Overwrite, + partition_by: None, + predicate: None, + }; + let commit = CommitBuilder::default() + .with_actions(actions) + .build(None, log_store.clone(), op) + .await + .unwrap(); + + let mut actions = Vec::new(); + // remove all but one file + for i in 0..9 { + actions.push(Action::Remove(Remove { + path: format!("part-{}.parquet", i), + deletion_timestamp: Some(chrono::Utc::now().timestamp_millis()), + ..Default::default() + })) + } + + let op = DeltaOperation::Delete { predicate: None }; + let table_data = &commit.snapshot as &dyn TableReference; + let commit = CommitBuilder::default() + .with_actions(actions) + .build(Some(table_data), log_store.clone(), op) + .await + .unwrap(); + + create_checkpoint_for(commit.version, &commit.snapshot, log_store.as_ref()) + .await + .unwrap(); + + let batches = LogSegment::try_new( + &Path::default(), + Some(commit.version), + log_store.object_store().as_ref(), + ) + .await + .unwrap() + .checkpoint_stream( + log_store.object_store(), + 
&StructType::new(vec![ + ActionType::Metadata.schema_field().clone(), + ActionType::Protocol.schema_field().clone(), + ActionType::Add.schema_field().clone(), + ]), + &Default::default(), + ) + .try_collect::>() + .await + .unwrap(); + + let batch = arrow::compute::concat_batches(&batches[0].schema(), batches.iter()).unwrap(); + + // there are 9 remove action rows but all columns are null + // because the removes are not projected in the schema + // these get filtered out upstream and there was no perf + // benefit when applying a row filter + // in addition there is 1 add, 1 metadata, and 1 protocol row + assert_eq!(batch.num_rows(), 12); + + assert_eq!(batch.schema().fields().len(), 3); + assert!(batch.schema().field_with_name("metaData").is_ok()); + assert!(batch.schema().field_with_name("protocol").is_ok()); + assert!(batch.schema().field_with_name("add").is_ok()); + } } diff --git a/crates/core/src/kernel/snapshot/mod.rs b/crates/core/src/kernel/snapshot/mod.rs index cd6cf8bb5f..0df62c867b 100644 --- a/crates/core/src/kernel/snapshot/mod.rs +++ b/crates/core/src/kernel/snapshot/mod.rs @@ -44,7 +44,7 @@ use crate::{DeltaResult, DeltaTableConfig, DeltaTableError}; pub use self::log_data::*; mod log_data; -mod log_segment; +pub(crate) mod log_segment; pub(crate) mod parse; mod replay; mod serde; @@ -193,6 +193,11 @@ impl Snapshot { &self.protocol } + /// Get the table config which is loaded with of the snapshot + pub fn load_config(&self) -> &DeltaTableConfig { + &self.config + } + /// Get the table root of the snapshot pub fn table_root(&self) -> Path { Path::from(self.table_url.clone()) @@ -311,50 +316,19 @@ impl Snapshot { /// Get the statistics schema of the snapshot pub fn stats_schema(&self, table_schema: Option<&StructType>) -> DeltaResult { let schema = table_schema.unwrap_or_else(|| self.schema()); + stats_schema(schema, self.table_config()) + } - let stats_fields = if let Some(stats_cols) = self.table_config().stats_columns() { - stats_cols - .iter() - .map(|col| match schema.field_with_name(col) { - Ok(field) => match field.data_type() { - DataType::Map(_) | DataType::Array(_) | &DataType::BINARY => { - Err(DeltaTableError::Generic(format!( - "Stats column {} has unsupported type {}", - col, - field.data_type() - ))) - } - _ => Ok(StructField::new( - field.name(), - field.data_type().clone(), - true, - )), - }, - _ => Err(DeltaTableError::Generic(format!( - "Stats column {} not found in schema", - col - ))), - }) - .collect::, _>>()? 
- } else { - let num_indexed_cols = self.table_config().num_indexed_cols(); - schema - .fields - .iter() - .enumerate() - .filter_map(|(idx, f)| stats_field(idx, num_indexed_cols, f)) - .collect() - }; - Ok(StructType::new(vec![ - StructField::new("numRecords", DataType::LONG, true), - StructField::new("minValues", StructType::new(stats_fields.clone()), true), - StructField::new("maxValues", StructType::new(stats_fields.clone()), true), - StructField::new( - "nullCount", - StructType::new(stats_fields.iter().filter_map(to_count_field).collect()), - true, - ), - ])) + /// Get the partition values schema of the snapshot + pub fn partitions_schema( + &self, + table_schema: Option<&StructType>, + ) -> DeltaResult> { + if self.metadata().partition_columns.is_empty() { + return Ok(None); + } + let schema = table_schema.unwrap_or_else(|| self.schema()); + partitions_schema(schema, &self.metadata().partition_columns) + } } @@ -369,7 +343,7 @@ pub struct EagerSnapshot { // NOTE: this is a Vec of RecordBatch instead of a single RecordBatch because // we do not yet enforce a consistent schema across all batches we read from the log. - files: Vec, + pub(crate) files: Vec, } impl EagerSnapshot { @@ -395,8 +369,13 @@ impl EagerSnapshot { .iter() .flat_map(get_visitor) .collect::>(); - let snapshot = Snapshot::try_new(table_root, store.clone(), config, version).await?; - let files = snapshot.files(store, &mut visitors)?.try_collect().await?; + let snapshot = + Snapshot::try_new(table_root, store.clone(), config.clone(), version).await?; + + let files = match config.require_files { + true => snapshot.files(store, &mut visitors)?.try_collect().await?, + false => vec![], + }; let mut sn = Self { snapshot, @@ -561,6 +540,11 @@ impl EagerSnapshot { self.snapshot.table_root() } + /// Get the table config with which the snapshot was loaded + pub fn load_config(&self) -> &DeltaTableConfig { + &self.snapshot.load_config() + } + /// Well known table configuration pub fn table_config(&self) -> TableConfig<'_> { self.snapshot.table_config() @@ -688,6 +672,74 @@ impl EagerSnapshot { } } +fn stats_schema(schema: &StructType, config: TableConfig<'_>) -> DeltaResult { + let stats_fields = if let Some(stats_cols) = config.stats_columns() { + stats_cols + .iter() + .map(|col| match get_stats_field(schema, col) { + Some(field) => match field.data_type() { + DataType::Map(_) | DataType::Array(_) | &DataType::BINARY => { + Err(DeltaTableError::Generic(format!( + "Stats column {} has unsupported type {}", + col, + field.data_type() + ))) + } + _ => Ok(StructField::new( + field.name(), + field.data_type().clone(), + true, + )), + }, + _ => Err(DeltaTableError::Generic(format!( + "Stats column {} not found in schema", + col + ))), + }) + .collect::, _>>()?
+ } else { + let num_indexed_cols = config.num_indexed_cols(); + schema + .fields + .values() + .enumerate() + .filter_map(|(idx, f)| stats_field(idx, num_indexed_cols, f)) + .collect() + }; + Ok(StructType::new(vec![ + StructField::new("numRecords", DataType::LONG, true), + StructField::new("minValues", StructType::new(stats_fields.clone()), true), + StructField::new("maxValues", StructType::new(stats_fields.clone()), true), + StructField::new( + "nullCount", + StructType::new(stats_fields.iter().filter_map(to_count_field).collect()), + true, + ), + ])) +} + +pub(crate) fn partitions_schema( + schema: &StructType, + partition_columns: &Vec, +) -> DeltaResult> { + if partition_columns.is_empty() { + return Ok(None); + } + Ok(Some(StructType::new( + partition_columns + .iter() + .map(|col| { + schema.field(col).map(|field| field.clone()).ok_or_else(|| { + DeltaTableError::Generic(format!( + "Partition column {} not found in schema", + col + )) + }) + }) + .collect::, _>>()?, + ))) +} + fn stats_field(idx: usize, num_indexed_cols: i32, field: &StructField) -> Option { if !(num_indexed_cols < 0 || (idx as i32) < num_indexed_cols) { return None; @@ -699,7 +751,6 @@ fn stats_field(idx: usize, num_indexed_cols: i32, field: &StructField) -> Option StructType::new( dt_struct .fields() - .iter() .flat_map(|f| stats_field(idx, num_indexed_cols, f)) .collect(), ), @@ -718,12 +769,7 @@ fn to_count_field(field: &StructField) -> Option { DataType::Map(_) | DataType::Array(_) | &DataType::BINARY => None, DataType::Struct(s) => Some(StructField::new( field.name(), - StructType::new( - s.fields() - .iter() - .filter_map(to_count_field) - .collect::>(), - ), + StructType::new(s.fields().filter_map(to_count_field).collect::>()), true, )), _ => Some(StructField::new(field.name(), DataType::LONG, true)), @@ -744,6 +790,45 @@ mod datafusion { } } +/// Retrieves a specific field from the schema based on the provided field name. +/// It handles cases where the field name is nested or enclosed in backticks. 
+fn get_stats_field<'a>(schema: &'a StructType, stats_field_name: &str) -> Option<&'a StructField> { + let dialect = sqlparser::dialect::GenericDialect {}; + match sqlparser::parser::Parser::new(&dialect).try_with_sql(stats_field_name) { + Ok(mut parser) => match parser.parse_multipart_identifier() { + Ok(parts) => find_nested_field(schema, &parts), + Err(_) => schema.field(stats_field_name), + }, + Err(_) => schema.field(stats_field_name), + } +} + +fn find_nested_field<'a>( + schema: &'a StructType, + parts: &[sqlparser::ast::Ident], +) -> Option<&'a StructField> { + if parts.is_empty() { + return None; + } + let part_name = &parts[0].value; + match schema.field(part_name) { + Some(field) => { + if parts.len() == 1 { + Some(field) + } else { + match field.data_type() { + DataType::Struct(struct_schema) => { + find_nested_field(struct_schema, &parts[1..]) + } + // Any part before the end must be a struct + _ => None, + } + } + } + None => None, + } +} + #[cfg(test)] mod tests { use std::collections::HashMap; @@ -758,6 +843,7 @@ mod tests { use super::*; use crate::kernel::Remove; use crate::protocol::{DeltaOperation, SaveMode}; + use crate::test_utils::ActionFactory; #[tokio::test] async fn test_snapshots() -> TestResult { @@ -962,4 +1048,167 @@ mod tests { Ok(()) } + + #[test] + fn test_partition_schema() { + let schema = StructType::new(vec![ + StructField::new("id", DataType::LONG, true), + StructField::new("name", DataType::STRING, true), + StructField::new("date", DataType::DATE, true), + ]); + + let partition_columns = vec!["date".to_string()]; + let metadata = ActionFactory::metadata(&schema, Some(&partition_columns), None); + let protocol = ActionFactory::protocol(None, None, None::>, None::>); + + let commit_data = CommitData::new( + vec![ + Action::Protocol(protocol.clone()), + Action::Metadata(metadata.clone()), + ], + DeltaOperation::Write { + mode: SaveMode::Append, + partition_by: Some(partition_columns), + predicate: None, + }, + HashMap::new(), + vec![], + ); + let (log_segment, _) = LogSegment::new_test(vec![&commit_data]).unwrap(); + + let snapshot = Snapshot { + log_segment: log_segment.clone(), + protocol: protocol.clone(), + metadata, + schema: schema.clone(), + table_url: "table".to_string(), + config: Default::default(), + }; + + let expected = StructType::new(vec![StructField::new("date", DataType::DATE, true)]); + assert_eq!(snapshot.partitions_schema(None).unwrap(), Some(expected)); + + let metadata = ActionFactory::metadata(&schema, None::>, None); + let commit_data = CommitData::new( + vec![ + Action::Protocol(protocol.clone()), + Action::Metadata(metadata.clone()), + ], + DeltaOperation::Write { + mode: SaveMode::Append, + partition_by: None, + predicate: None, + }, + HashMap::new(), + vec![], + ); + let (log_segment, _) = LogSegment::new_test(vec![&commit_data]).unwrap(); + + let snapshot = Snapshot { + log_segment, + config: Default::default(), + protocol: protocol.clone(), + metadata, + schema: schema.clone(), + table_url: "table".to_string(), + }; + + assert_eq!(snapshot.partitions_schema(None).unwrap(), None); + } + + #[test] + fn test_field_with_name() { + let schema = StructType::new(vec![ + StructField::new("a", DataType::STRING, true), + StructField::new("b", DataType::INTEGER, true), + ]); + let field = get_stats_field(&schema, "b").unwrap(); + assert_eq!(*field, StructField::new("b", DataType::INTEGER, true)); + } + + #[test] + fn test_field_with_name_escaped() { + let schema = StructType::new(vec![ + StructField::new("a", DataType::STRING, true), + 
StructField::new("b.b", DataType::INTEGER, true), + ]); + let field = get_stats_field(&schema, "`b.b`").unwrap(); + assert_eq!(*field, StructField::new("b.b", DataType::INTEGER, true)); + } + + #[test] + fn test_field_does_not_exist() { + let schema = StructType::new(vec![ + StructField::new("a", DataType::STRING, true), + StructField::new("b", DataType::INTEGER, true), + ]); + let field = get_stats_field(&schema, "c"); + assert!(field.is_none()); + } + + #[test] + fn test_field_part_is_not_struct() { + let schema = StructType::new(vec![ + StructField::new("a", DataType::STRING, true), + StructField::new("b", DataType::INTEGER, true), + ]); + let field = get_stats_field(&schema, "b.c"); + assert!(field.is_none()); + } + + #[test] + fn test_field_name_does_not_parse() { + let schema = StructType::new(vec![ + StructField::new("a", DataType::STRING, true), + StructField::new("b", DataType::INTEGER, true), + ]); + let field = get_stats_field(&schema, "b."); + assert!(field.is_none()); + } + + #[test] + fn test_field_with_name_nested() { + let nested = StructType::new(vec![StructField::new( + "nested_struct", + DataType::BOOLEAN, + true, + )]); + let schema = StructType::new(vec![ + StructField::new("a", DataType::STRING, true), + StructField::new("b", DataType::Struct(Box::new(nested)), true), + ]); + + let field = get_stats_field(&schema, "b.nested_struct").unwrap(); + + assert_eq!( + *field, + StructField::new("nested_struct", DataType::BOOLEAN, true) + ); + } + + #[test] + fn test_field_with_last_name_nested_backticks() { + let nested = StructType::new(vec![StructField::new("pr!me", DataType::BOOLEAN, true)]); + let schema = StructType::new(vec![ + StructField::new("a", DataType::STRING, true), + StructField::new("b", DataType::Struct(Box::new(nested)), true), + ]); + + let field = get_stats_field(&schema, "b.`pr!me`").unwrap(); + + assert_eq!(*field, StructField::new("pr!me", DataType::BOOLEAN, true)); + } + + #[test] + fn test_field_with_name_nested_backticks() { + let nested = StructType::new(vec![StructField::new("pr", DataType::BOOLEAN, true)]); + let schema = StructType::new(vec![ + StructField::new("a", DataType::STRING, true), + StructField::new("b&b", DataType::Struct(Box::new(nested)), true), + ]); + + let field = get_stats_field(&schema, "`b&b`.pr").unwrap(); + + assert_eq!(*field, StructField::new("pr", DataType::BOOLEAN, true)); + } } diff --git a/crates/core/src/kernel/snapshot/parse.rs b/crates/core/src/kernel/snapshot/parse.rs index a3ccef1902..f75744691e 100644 --- a/crates/core/src/kernel/snapshot/parse.rs +++ b/crates/core/src/kernel/snapshot/parse.rs @@ -11,6 +11,11 @@ use crate::{DeltaResult, DeltaTableError}; pub(super) fn read_metadata(batch: &dyn ProvidesColumnByName) -> DeltaResult> { if let Some(arr) = ex::extract_and_cast_opt::(batch, "metaData") { + // Stop early if all values are null + if arr.null_count() == arr.len() { + return Ok(None); + } + let id = ex::extract_and_cast::(arr, "id")?; let name = ex::extract_and_cast::(arr, "name")?; let description = ex::extract_and_cast::(arr, "description")?; @@ -43,6 +48,11 @@ pub(super) fn read_metadata(batch: &dyn ProvidesColumnByName) -> DeltaResult DeltaResult> { if let Some(arr) = ex::extract_and_cast_opt::(batch, "protocol") { + // Stop early if all values are null + if arr.null_count() == arr.len() { + return Ok(None); + } + let min_reader_version = ex::extract_and_cast::(arr, "minReaderVersion")?; let min_writer_version = ex::extract_and_cast::(arr, "minWriterVersion")?; let maybe_reader_features = 
ex::extract_and_cast_opt::(arr, "readerFeatures"); @@ -73,7 +83,7 @@ pub(super) fn read_adds(array: &dyn ProvidesColumnByName) -> DeltaResult(arr, "size")?; let modification_time = ex::extract_and_cast::(arr, "modificationTime")?; let data_change = ex::extract_and_cast::(arr, "dataChange")?; - let stats = ex::extract_and_cast::(arr, "stats")?; + let stats = ex::extract_and_cast_opt::(arr, "stats"); let tags = ex::extract_and_cast_opt::(arr, "tags"); let dv = ex::extract_and_cast_opt::(arr, "deletionVector"); @@ -116,7 +126,8 @@ pub(super) fn read_adds(array: &dyn ProvidesColumnByName) -> DeltaResult DeltaResult(array, "cdc") { + // Stop early if all values are null + if arr.null_count() == arr.len() { + return Ok(result); + } + let path = ex::extract_and_cast::(arr, "path")?; let pvs = ex::extract_and_cast_opt::(arr, "partitionValues"); let size = ex::extract_and_cast::(arr, "size")?; @@ -171,6 +187,11 @@ pub(super) fn read_removes(array: &dyn ProvidesColumnByName) -> DeltaResult(array, "remove") { + // Stop early if all values are null + if arr.null_count() == arr.len() { + return Ok(result); + } + let path = ex::extract_and_cast::(arr, "path")?; let data_change = ex::extract_and_cast::(arr, "dataChange")?; let deletion_timestamp = ex::extract_and_cast::(arr, "deletionTimestamp")?; @@ -237,7 +258,9 @@ pub(super) fn read_removes(array: &dyn ProvidesColumnByName) -> DeltaResult Option)> + '_> { +pub(super) fn collect_map( + val: &StructArray, +) -> Option)> + '_> { let keys = val .column(0) .as_ref() diff --git a/crates/core/src/kernel/snapshot/replay.rs b/crates/core/src/kernel/snapshot/replay.rs index 3efd9584e2..1b18b61bc7 100644 --- a/crates/core/src/kernel/snapshot/replay.rs +++ b/crates/core/src/kernel/snapshot/replay.rs @@ -1,17 +1,20 @@ +use std::collections::HashMap; use std::pin::Pin; use std::sync::Arc; use std::task::Context; use std::task::Poll; use arrow_arith::boolean::{is_not_null, or}; -use arrow_array::{ - Array, ArrayRef, BooleanArray, Int32Array, RecordBatch, StringArray, StructArray, -}; +use arrow_array::MapArray; +use arrow_array::*; use arrow_schema::{ - DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema, + DataType as ArrowDataType, Field as ArrowField, Fields, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, }; use arrow_select::filter::filter_record_batch; +use delta_kernel::expressions::Scalar; +use delta_kernel::schema::DataType; +use delta_kernel::schema::PrimitiveType; use futures::Stream; use hashbrown::HashSet; use itertools::Itertools; @@ -19,14 +22,14 @@ use percent_encoding::percent_decode_str; use pin_project_lite::pin_project; use tracing::debug; +use super::parse::collect_map; +use super::ReplayVisitor; +use super::Snapshot; use crate::kernel::arrow::extract::{self as ex, ProvidesColumnByName}; use crate::kernel::arrow::json; use crate::kernel::StructType; use crate::{DeltaResult, DeltaTableConfig, DeltaTableError}; -use super::ReplayVisitor; -use super::Snapshot; - pin_project! 
{ pub struct ReplayStream<'a, S> { scanner: LogReplayScanner, @@ -51,8 +54,10 @@ impl<'a, S> ReplayStream<'a, S> { visitors: &'a mut Vec>, ) -> DeltaResult { let stats_schema = Arc::new((&snapshot.stats_schema(None)?).try_into()?); + let partitions_schema = snapshot.partitions_schema(None)?.map(|s| Arc::new(s)); let mapper = Arc::new(LogMapper { stats_schema, + partitions_schema, config: snapshot.config.clone(), }); Ok(Self { @@ -67,6 +72,7 @@ impl<'a, S> ReplayStream<'a, S> { pub(super) struct LogMapper { stats_schema: ArrowSchemaRef, + partitions_schema: Option>, config: DeltaTableConfig, } @@ -77,67 +83,289 @@ impl LogMapper { ) -> DeltaResult { Ok(Self { stats_schema: Arc::new((&snapshot.stats_schema(table_schema)?).try_into()?), + partitions_schema: snapshot + .partitions_schema(table_schema)? + .map(|s| Arc::new(s)), config: snapshot.config.clone(), }) } pub fn map_batch(&self, batch: RecordBatch) -> DeltaResult { - map_batch(batch, self.stats_schema.clone(), &self.config) + map_batch( + batch, + self.stats_schema.clone(), + self.partitions_schema.clone(), + &self.config, + ) } } fn map_batch( batch: RecordBatch, stats_schema: ArrowSchemaRef, + partition_schema: Option>, config: &DeltaTableConfig, ) -> DeltaResult { - let stats_col = ex::extract_and_cast_opt::(&batch, "add.stats"); + let mut new_batch = batch.clone(); + + let stats = ex::extract_and_cast_opt::(&batch, "add.stats"); let stats_parsed_col = ex::extract_and_cast_opt::(&batch, "add.stats_parsed"); - if stats_parsed_col.is_some() { - return Ok(batch); + if stats_parsed_col.is_none() && stats.is_some() { + new_batch = parse_stats(new_batch, stats_schema, config)?; } - if let Some(stats) = stats_col { - let stats: Arc = - Arc::new(json::parse_json(stats, stats_schema.clone(), config)?.into()); - let schema = batch.schema(); - let add_col = ex::extract_and_cast::(&batch, "add")?; - let (add_idx, _) = schema.column_with_name("add").unwrap(); - let add_type = add_col - .fields() - .iter() - .cloned() - .chain(std::iter::once(Arc::new(ArrowField::new( - "stats_parsed", - ArrowDataType::Struct(stats_schema.fields().clone()), - true, - )))) - .collect_vec(); - let new_add = Arc::new(StructArray::try_new( - add_type.clone().into(), - add_col - .columns() - .iter() + + if let Some(partitions_schema) = partition_schema { + let partitions_parsed_col = + ex::extract_and_cast_opt::(&batch, "add.partitionValues_parsed"); + if partitions_parsed_col.is_none() { + new_batch = parse_partitions(new_batch, partitions_schema.as_ref())?; + } + } + + Ok(new_batch) +} + +/// parse the serialized stats in the `add.stats` column in the files batch +/// and add a new column `stats_parsed` containing the the parsed stats. +fn parse_stats( + batch: RecordBatch, + stats_schema: ArrowSchemaRef, + config: &DeltaTableConfig, +) -> DeltaResult { + let stats = ex::extract_and_cast_opt::(&batch, "add.stats").ok_or( + DeltaTableError::generic("No stats column found in files batch. This is unexpected."), + )?; + let stats: StructArray = json::parse_json(stats, stats_schema.clone(), config)?.into(); + insert_field(batch, stats, "stats_parsed") +} + +fn parse_partitions(batch: RecordBatch, partition_schema: &StructType) -> DeltaResult { + let partitions = ex::extract_and_cast_opt::(&batch, "add.partitionValues").ok_or( + DeltaTableError::generic( + "No partitionValues column found in files batch. 
This is unexpected.", + ), + )?; + + let mut values = partition_schema + .fields() + .map(|f| { + ( + f.name().to_string(), + Vec::::with_capacity(partitions.len()), + ) + }) + .collect::>(); + + for i in 0..partitions.len() { + if partitions.is_null(i) { + return Err(DeltaTableError::generic( + "Expected potentially empty partition values map, but found a null value.", + )); + } + let data: HashMap<_, _> = collect_map(&partitions.value(i)) + .ok_or(DeltaTableError::generic( + "Failed to collect partition values from map array.", + ))? + .map(|(k, v)| { + let field = partition_schema + .field(k.as_str()) + .ok_or(DeltaTableError::generic(format!( + "Partition column {} not found in schema.", + k + )))?; + let field_type = match field.data_type() { + DataType::Primitive(p) => Ok(p), + _ => Err(DeltaTableError::generic( + "nested partitioning values are not supported", + )), + }?; + Ok::<_, DeltaTableError>(( + k, + v.map(|vv| field_type.parse_scalar(vv.as_str())) + .transpose()? + .unwrap_or(Scalar::Null(field.data_type().clone())), + )) + }) + .collect::>()?; + + partition_schema.fields().for_each(|f| { + let value = data + .get(f.name()) .cloned() - .chain(std::iter::once(stats as ArrayRef)) - .collect(), - add_col.nulls().cloned(), - )?); - let new_add_field = Arc::new(ArrowField::new( - "add", - ArrowDataType::Struct(add_type.into()), - true, - )); - let mut fields = schema.fields().to_vec(); - let _ = std::mem::replace(&mut fields[add_idx], new_add_field); - let mut columns = batch.columns().to_vec(); - let _ = std::mem::replace(&mut columns[add_idx], new_add); - return Ok(RecordBatch::try_new( - Arc::new(ArrowSchema::new(fields)), - columns, - )?); + .unwrap_or(Scalar::Null(f.data_type().clone())); + values.get_mut(f.name()).unwrap().push(value); + }); } - Ok(batch) + let columns = partition_schema + .fields() + .map(|f| { + let values = values.get(f.name()).unwrap(); + match f.data_type() { + DataType::Primitive(p) => { + // Safety: we created the Scalars above using the parsing function of the same PrimitiveType + // should this fail, it's a bug in our code, and we should panic + let arr = match p { + PrimitiveType::String => { + Arc::new(StringArray::from_iter(values.iter().map(|v| match v { + Scalar::String(s) => Some(s.clone()), + Scalar::Null(_) => None, + _ => panic!("unexpected scalar type"), + }))) as ArrayRef + } + PrimitiveType::Long => { + Arc::new(Int64Array::from_iter(values.iter().map(|v| match v { + Scalar::Long(i) => Some(*i), + Scalar::Null(_) => None, + _ => panic!("unexpected scalar type"), + }))) as ArrayRef + } + PrimitiveType::Integer => { + Arc::new(Int32Array::from_iter(values.iter().map(|v| match v { + Scalar::Integer(i) => Some(*i), + Scalar::Null(_) => None, + _ => panic!("unexpected scalar type"), + }))) as ArrayRef + } + PrimitiveType::Short => { + Arc::new(Int16Array::from_iter(values.iter().map(|v| match v { + Scalar::Short(i) => Some(*i), + Scalar::Null(_) => None, + _ => panic!("unexpected scalar type"), + }))) as ArrayRef + } + PrimitiveType::Byte => { + Arc::new(Int8Array::from_iter(values.iter().map(|v| match v { + Scalar::Byte(i) => Some(*i), + Scalar::Null(_) => None, + _ => panic!("unexpected scalar type"), + }))) as ArrayRef + } + PrimitiveType::Float => { + Arc::new(Float32Array::from_iter(values.iter().map(|v| match v { + Scalar::Float(f) => Some(*f), + Scalar::Null(_) => None, + _ => panic!("unexpected scalar type"), + }))) as ArrayRef + } + PrimitiveType::Double => { + Arc::new(Float64Array::from_iter(values.iter().map(|v| match v { + 
Scalar::Double(f) => Some(*f), + Scalar::Null(_) => None, + _ => panic!("unexpected scalar type"), + }))) as ArrayRef + } + PrimitiveType::Boolean => { + Arc::new(BooleanArray::from_iter(values.iter().map(|v| match v { + Scalar::Boolean(b) => Some(*b), + Scalar::Null(_) => None, + _ => panic!("unexpected scalar type"), + }))) as ArrayRef + } + PrimitiveType::Binary => { + Arc::new(BinaryArray::from_iter(values.iter().map(|v| match v { + Scalar::Binary(b) => Some(b.clone()), + Scalar::Null(_) => None, + _ => panic!("unexpected scalar type"), + }))) as ArrayRef + } + PrimitiveType::Date => { + Arc::new(Date32Array::from_iter(values.iter().map(|v| match v { + Scalar::Date(d) => Some(*d), + Scalar::Null(_) => None, + _ => panic!("unexpected scalar type"), + }))) as ArrayRef + } + + PrimitiveType::Timestamp => Arc::new( + TimestampMicrosecondArray::from_iter(values.iter().map(|v| match v { + Scalar::Timestamp(t) => Some(*t), + Scalar::Null(_) => None, + _ => panic!("unexpected scalar type"), + })) + .with_timezone("UTC"), + ) as ArrayRef, + PrimitiveType::TimestampNtz => Arc::new( + TimestampMicrosecondArray::from_iter(values.iter().map(|v| match v { + Scalar::TimestampNtz(t) => Some(*t), + Scalar::Null(_) => None, + _ => panic!("unexpected scalar type"), + })), + ) as ArrayRef, + PrimitiveType::Decimal(p, s) => Arc::new( + Decimal128Array::from_iter(values.iter().map(|v| match v { + Scalar::Decimal(d, _, _) => Some(*d), + Scalar::Null(_) => None, + _ => panic!("unexpected scalar type"), + })) + .with_precision_and_scale(*p, *s as i8)?, + ) as ArrayRef, + }; + Ok(arr) + } + _ => Err(DeltaTableError::generic( + "complex partitioning values are not supported", + )), + } + }) + .collect::, _>>()?; + + insert_field( + batch, + StructArray::try_new( + Fields::from( + partition_schema + .fields() + .map(|f| f.try_into()) + .collect::, _>>()?, + ), + columns, + None, + )?, + "partitionValues_parsed", + ) +} + +fn insert_field(batch: RecordBatch, array: StructArray, name: &str) -> DeltaResult { + let schema = batch.schema(); + let add_col = ex::extract_and_cast::(&batch, "add")?; + let (add_idx, _) = schema.column_with_name("add").unwrap(); + + let add_type = add_col + .fields() + .iter() + .cloned() + .chain(std::iter::once(Arc::new(ArrowField::new( + name, + array.data_type().clone(), + true, + )))) + .collect_vec(); + let new_add = Arc::new(StructArray::try_new( + add_type.clone().into(), + add_col + .columns() + .iter() + .cloned() + .chain(std::iter::once(Arc::new(array) as ArrayRef)) + .collect(), + add_col.nulls().cloned(), + )?); + let new_add_field = Arc::new(ArrowField::new( + "add", + ArrowDataType::Struct(add_type.into()), + true, + )); + + let mut fields = schema.fields().to_vec(); + let _ = std::mem::replace(&mut fields[add_idx], new_add_field); + let mut columns = batch.columns().to_vec(); + let _ = std::mem::replace(&mut columns[add_idx], new_add); + + Ok(RecordBatch::try_new( + Arc::new(ArrowSchema::new(fields)), + columns, + )?) 
} impl<'a, S> Stream for ReplayStream<'a, S> @@ -356,16 +584,22 @@ fn read_file_info<'a>(arr: &'a dyn ProvidesColumnByName) -> DeltaResult TestResult { let log_schema = Arc::new(StructType::new(vec![ @@ -420,4 +654,115 @@ pub(super) mod tests { Ok(()) } + + #[test] + fn test_parse_stats() -> TestResult { + let schema = TestSchemas::simple(); + let config_map = HashMap::new(); + let table_config = TableConfig(&config_map); + let config = DeltaTableConfig::default(); + + let commit_data = CommitData { + actions: vec![ActionFactory::add(schema, HashMap::new(), Vec::new(), true).into()], + operation: DeltaOperation::Write { + mode: crate::protocol::SaveMode::Append, + partition_by: None, + predicate: None, + }, + app_metadata: Default::default(), + app_transactions: Default::default(), + }; + let (_, maybe_batches) = LogSegment::new_test(&[commit_data])?; + + let batches = maybe_batches.into_iter().collect::, _>>()?; + let batch = concat_batches(&batches[0].schema(), &batches)?; + + assert!(ex::extract_and_cast_opt::(&batch, "add.stats").is_some()); + assert!(ex::extract_and_cast_opt::(&batch, "add.stats_parsed").is_none()); + + let stats_schema = stats_schema(&schema, table_config)?; + let new_batch = parse_stats(batch, Arc::new((&stats_schema).try_into()?), &config)?; + + assert!(ex::extract_and_cast_opt::(&new_batch, "add.stats_parsed").is_some()); + let parsed_col = ex::extract_and_cast::(&new_batch, "add.stats_parsed")?; + let delta_type: DataType = parsed_col.data_type().try_into()?; + + match delta_type { + DataType::Struct(fields) => { + assert_eq!(fields.as_ref(), &stats_schema); + } + _ => panic!("unexpected data type"), + } + + // let expression = Expression::column("add.stats"); + // let evaluator = ARROW_HANDLER.get_evaluator( + // Arc::new(batch.schema_ref().as_ref().try_into()?), + // expression, + // DataType::Primitive(PrimitiveType::String), + // ); + // let engine_data = ArrowEngineData::new(batch); + // let result = evaluator + // .evaluate(&engine_data)? + // .as_any() + // .downcast_ref::() + // .ok_or(DeltaTableError::generic( + // "failed to downcast evaluator result to ArrowEngineData.", + // ))? 
+ // .record_batch() + // .clone(); + + Ok(()) + } + + #[test] + fn test_parse_partition_values() -> TestResult { + let schema = TestSchemas::simple(); + let partition_columns = vec![schema.field("modified").unwrap().name().to_string()]; + + let commit_data = CommitData { + actions: vec![ActionFactory::add( + schema, + HashMap::new(), + partition_columns.clone(), + true, + ) + .into()], + operation: DeltaOperation::Write { + mode: crate::protocol::SaveMode::Append, + partition_by: Some(partition_columns.clone()), + predicate: None, + }, + app_metadata: Default::default(), + app_transactions: Default::default(), + }; + let (_, maybe_batches) = LogSegment::new_test(&[commit_data])?; + + let batches = maybe_batches.into_iter().collect::, _>>()?; + let batch = concat_batches(&batches[0].schema(), &batches)?; + + assert!(ex::extract_and_cast_opt::(&batch, "add.partitionValues").is_some()); + assert!( + ex::extract_and_cast_opt::(&batch, "add.partitionValues_parsed").is_none() + ); + + let partitions_schema = partitions_schema(&schema, &partition_columns)?.unwrap(); + let new_batch = parse_partitions(batch, &partitions_schema)?; + + assert!( + ex::extract_and_cast_opt::(&new_batch, "add.partitionValues_parsed") + .is_some() + ); + let parsed_col = + ex::extract_and_cast::(&new_batch, "add.partitionValues_parsed")?; + let delta_type: DataType = parsed_col.data_type().try_into()?; + + match delta_type { + DataType::Struct(fields) => { + assert_eq!(fields.as_ref(), &partitions_schema); + } + _ => panic!("unexpected data type"), + } + + Ok(()) + } } diff --git a/crates/core/src/kernel/snapshot/serde.rs b/crates/core/src/kernel/snapshot/serde.rs index dd7403bc28..45c1206c82 100644 --- a/crates/core/src/kernel/snapshot/serde.rs +++ b/crates/core/src/kernel/snapshot/serde.rs @@ -1,10 +1,11 @@ +use std::fmt; + use arrow_ipc::reader::FileReader; use arrow_ipc::writer::FileWriter; use chrono::{DateTime, TimeZone, Utc}; use object_store::ObjectMeta; use serde::de::{self, Deserializer, SeqAccess, Visitor}; use serde::{ser::SerializeSeq, Deserialize, Serialize}; -use std::fmt; use super::log_segment::LogSegment; use super::EagerSnapshot; diff --git a/crates/core/src/lib.rs b/crates/core/src/lib.rs index 4ef9fc06fd..0498e4493c 100644 --- a/crates/core/src/lib.rs +++ b/crates/core/src/lib.rs @@ -79,6 +79,9 @@ pub mod schema; pub mod storage; pub mod table; +#[cfg(test)] +pub mod test_utils; + #[cfg(feature = "datafusion")] pub mod delta_datafusion; pub mod writer; @@ -89,10 +92,8 @@ pub use self::data_catalog::{DataCatalog, DataCatalogError}; pub use self::errors::*; pub use self::schema::partitions::*; pub use self::schema::*; -pub use self::table::builder::{ - DeltaTableBuilder, DeltaTableConfig, DeltaTableLoadOptions, DeltaVersion, -}; -pub use self::table::config::DeltaConfigKey; +pub use self::table::builder::{DeltaTableBuilder, DeltaTableConfig, DeltaVersion}; +pub use self::table::config::TableProperty; pub use self::table::DeltaTable; pub use object_store::{path::Path, Error as ObjectStoreError, ObjectMeta, ObjectStore}; pub use operations::DeltaOps; diff --git a/crates/core/src/logstore/default_logstore.rs b/crates/core/src/logstore/default_logstore.rs index 8fd4f52beb..79a1c76653 100644 --- a/crates/core/src/logstore/default_logstore.rs +++ b/crates/core/src/logstore/default_logstore.rs @@ -1,12 +1,25 @@ //! 
Default implementation of [`LogStore`] for storage backends with atomic put-if-absent operation -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use bytes::Bytes; -use object_store::{path::Path, ObjectStore}; +use object_store::{Attributes, Error as ObjectStoreError, ObjectStore, PutOptions, TagSet}; -use super::{LogStore, LogStoreConfig}; -use crate::{operations::transaction::TransactionError, storage::ObjectStoreRef, DeltaResult}; +use super::{CommitOrBytes, LogStore, LogStoreConfig}; +use crate::{ + operations::transaction::TransactionError, + storage::{commit_uri_from_version, ObjectStoreRef}, + DeltaResult, +}; + +fn put_options() -> &'static PutOptions { + static PUT_OPTS: OnceLock = OnceLock::new(); + PUT_OPTS.get_or_init(|| PutOptions { + mode: object_store::PutMode::Create, // Creates if file doesn't exists yet + tags: TagSet::default(), + attributes: Attributes::default(), + }) +} /// Default [`LogStore`] implementation #[derive(Debug, Clone)] @@ -45,17 +58,39 @@ impl LogStore for DefaultLogStore { async fn write_commit_entry( &self, version: i64, - tmp_commit: &Path, + commit_or_bytes: CommitOrBytes, ) -> Result<(), TransactionError> { - super::write_commit_entry(self.storage.as_ref(), version, tmp_commit).await + match commit_or_bytes { + CommitOrBytes::LogBytes(log_bytes) => self + .object_store() + .put_opts( + &commit_uri_from_version(version), + log_bytes.into(), + put_options().clone(), + ) + .await + .map_err(|err| -> TransactionError { + match err { + ObjectStoreError::AlreadyExists { .. } => { + TransactionError::VersionAlreadyExists(version) + } + _ => TransactionError::from(err), + } + })?, + _ => unreachable!(), // Default log store should never get a tmp_commit, since this is for conditional put stores + }; + Ok(()) } async fn abort_commit_entry( &self, - version: i64, - tmp_commit: &Path, + _version: i64, + commit_or_bytes: CommitOrBytes, ) -> Result<(), TransactionError> { - super::abort_commit_entry(self.storage.as_ref(), version, tmp_commit).await + match &commit_or_bytes { + CommitOrBytes::LogBytes(_) => Ok(()), + _ => unreachable!(), // Default log store should never get a tmp_commit, since this is for conditional put stores + } } async fn get_latest_version(&self, current_version: i64) -> DeltaResult { diff --git a/crates/core/src/logstore/mod.rs b/crates/core/src/logstore/mod.rs index b8646cdb65..dd82274157 100644 --- a/crates/core/src/logstore/mod.rs +++ b/crates/core/src/logstore/mod.rs @@ -1,31 +1,31 @@ //! Delta log store. 
-use dashmap::DashMap; -use futures::StreamExt; -use lazy_static::lazy_static; -use regex::Regex; -use serde::{ - de::{Error, SeqAccess, Visitor}, - ser::SerializeSeq, - Deserialize, Serialize, -}; use std::io::{BufRead, BufReader, Cursor}; use std::sync::OnceLock; use std::{cmp::max, collections::HashMap, sync::Arc}; -use url::Url; -use crate::{ - errors::DeltaResult, - kernel::Action, - operations::transaction::TransactionError, - protocol::{get_last_checkpoint, ProtocolError}, - storage::{ - commit_uri_from_version, retry_ext::ObjectStoreRetryExt, ObjectStoreRef, StorageOptions, - }, - DeltaTableError, -}; use bytes::Bytes; +use dashmap::DashMap; +use futures::StreamExt; +use lazy_static::lazy_static; use object_store::{path::Path, Error as ObjectStoreError, ObjectStore}; +use regex::Regex; +use serde::de::{Error, SeqAccess, Visitor}; +use serde::ser::SerializeSeq; +use serde::{Deserialize, Serialize}; use tracing::{debug, warn}; +use url::Url; + +use crate::kernel::log_segment::PathExt; +use crate::kernel::Action; +use crate::operations::transaction::TransactionError; +use crate::protocol::{get_last_checkpoint, ProtocolError}; +use crate::storage::DeltaIOStorageBackend; +use crate::storage::{ + commit_uri_from_version, retry_ext::ObjectStoreRetryExt, IORuntime, ObjectStoreRef, + StorageOptions, +}; + +use crate::{DeltaResult, DeltaTableError}; #[cfg(feature = "datafusion")] use datafusion::datasource::object_store::ObjectStoreUrl; @@ -102,11 +102,12 @@ lazy_static! { /// # use std::collections::HashMap; /// # use url::Url; /// let location = Url::parse("memory:///").expect("Failed to make location"); -/// let logstore = logstore_for(location, HashMap::new()).expect("Failed to get a logstore"); +/// let logstore = logstore_for(location, HashMap::new(), None).expect("Failed to get a logstore"); /// ``` pub fn logstore_for( location: Url, options: impl Into + Clone, + io_runtime: Option, ) -> DeltaResult { // turn location into scheme let scheme = Url::parse(&format!("{}://", location.scheme())) @@ -114,10 +115,11 @@ pub fn logstore_for( if let Some(entry) = crate::storage::factories().get(&scheme) { debug!("Found a storage provider for {scheme} ({location})"); + let (store, _prefix) = entry .value() .parse_url_opts(&location, &options.clone().into())?; - return logstore_with(store, location, options); + return logstore_with(store, location, options, io_runtime); } Err(DeltaTableError::InvalidTableLocation(location.into())) } @@ -127,10 +129,17 @@ pub fn logstore_with( store: ObjectStoreRef, location: Url, options: impl Into + Clone, + io_runtime: Option, ) -> DeltaResult { let scheme = Url::parse(&format!("{}://", location.scheme())) .map_err(|_| DeltaTableError::InvalidTableLocation(location.clone().into()))?; + let store = if let Some(io_runtime) = io_runtime { + Arc::new(DeltaIOStorageBackend::new(store, io_runtime.get_handle())) as ObjectStoreRef + } else { + store + }; + if let Some(factory) = logstores().get(&scheme) { debug!("Found a logstore provider for {scheme}"); return factory.with_options(store, &location, &options.into()); @@ -143,6 +152,15 @@ pub fn logstore_with( )) } +/// Holds either the path of a tmp_commit or the raw commit bytes +#[derive(Clone)] +pub enum CommitOrBytes { + /// Path of the tmp commit, to be used by logstores which use CopyIfNotExists + TmpCommit(Path), + /// Bytes of the log, to be used by logstores which use Conditional Put + LogBytes(Bytes), +} + /// Configuration parameters for a log store #[derive(Debug, Clone)] pub struct LogStoreConfig { @@ -182,14 
+200,14 @@ pub trait LogStore: Sync + Send { async fn write_commit_entry( &self, version: i64, - tmp_commit: &Path, + commit_or_bytes: CommitOrBytes, ) -> Result<(), TransactionError>; /// Abort the commit entry for the given version. async fn abort_commit_entry( &self, version: i64, - tmp_commit: &Path, + commit_or_bytes: CommitOrBytes, ) -> Result<(), TransactionError>; /// Find latest version currently stored in the delta log. @@ -221,7 +239,7 @@ pub trait LogStore: Sync + Send { let mut stream = object_store.list(Some(self.log_path())); if let Some(res) = stream.next().await { match res { - Ok(_) => Ok(true), + Ok(meta) => Ok(meta.location.is_commit_file()), Err(ObjectStoreError::NotFound { .. }) => Ok(false), Err(err) => Err(err)?, } @@ -311,7 +329,7 @@ pub async fn get_actions( // TODO: maybe a bit of a hack, required to `#[derive(Debug)]` for the operation builders impl std::fmt::Debug for dyn LogStore + '_ { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "LogStore({})", self.root_uri()) + write!(f, "{}({})", self.name(), self.root_uri()) } } @@ -475,16 +493,53 @@ mod tests { #[test] fn logstore_with_invalid_url() { let location = Url::parse("nonexistent://table").unwrap(); - let store = logstore_for(location, HashMap::default()); + let store = logstore_for(location, HashMap::default(), None); assert!(store.is_err()); } #[test] fn logstore_with_memory() { let location = Url::parse("memory://table").unwrap(); - let store = logstore_for(location, HashMap::default()); + let store = logstore_for(location, HashMap::default(), None); assert!(store.is_ok()); } + + #[test] + fn logstore_with_memory_and_rt() { + let location = Url::parse("memory://table").unwrap(); + let store = logstore_for(location, HashMap::default(), Some(IORuntime::default())); + assert!(store.is_ok()); + } + + #[tokio::test] + async fn test_is_location_a_table() { + use object_store::path::Path; + use object_store::{PutOptions, PutPayload}; + let location = Url::parse("memory://table").unwrap(); + let store = + logstore_for(location, HashMap::default(), None).expect("Failed to get logstore"); + assert!(!store + .is_delta_table_location() + .await + .expect("Failed to look at table")); + + // Let's put a failed commit into the directory and then see if it's still considered a + // delta table (it shouldn't be). + let payload = PutPayload::from_static(b"test-drivin"); + let _put = store + .object_store() + .put_opts( + &Path::from("_delta_log/_commit_failed.tmp"), + payload, + PutOptions::default(), + ) + .await + .expect("Failed to put"); + assert!(!store + .is_delta_table_location() + .await + .expect("Failed to look at table")); + } } #[cfg(feature = "datafusion")] diff --git a/crates/core/src/operations/add_column.rs b/crates/core/src/operations/add_column.rs new file mode 100644 index 0000000000..8fff1677b8 --- /dev/null +++ b/crates/core/src/operations/add_column.rs @@ -0,0 +1,113 @@ +//! 
Add a new column to a table + +use delta_kernel::schema::StructType; +use futures::future::BoxFuture; +use itertools::Itertools; + +use super::transaction::{CommitBuilder, CommitProperties, PROTOCOL}; +use crate::kernel::StructField; +use crate::logstore::LogStoreRef; +use crate::operations::cast::merge_schema::merge_delta_struct; +use crate::protocol::DeltaOperation; +use crate::table::state::DeltaTableState; +use crate::{DeltaResult, DeltaTable, DeltaTableError}; + +/// Add new columns and/or nested fields to a table +pub struct AddColumnBuilder { + /// A snapshot of the table's state + snapshot: DeltaTableState, + /// Fields to add/merge into schema + fields: Option>, + /// Delta object store for handling data files + log_store: LogStoreRef, + /// Additional information to add to the commit + commit_properties: CommitProperties, +} + +impl super::Operation<()> for AddColumnBuilder {} + +impl AddColumnBuilder { + /// Create a new builder + pub fn new(log_store: LogStoreRef, snapshot: DeltaTableState) -> Self { + Self { + snapshot, + log_store, + fields: None, + commit_properties: CommitProperties::default(), + } + } + + /// Specify the fields to be added + pub fn with_fields(mut self, fields: impl IntoIterator + Clone) -> Self { + self.fields = Some(fields.into_iter().collect()); + self + } + /// Additional metadata to be added to commit info + pub fn with_commit_properties(mut self, commit_properties: CommitProperties) -> Self { + self.commit_properties = commit_properties; + self + } +} + +impl std::future::IntoFuture for AddColumnBuilder { + type Output = DeltaResult; + + type IntoFuture = BoxFuture<'static, Self::Output>; + + fn into_future(self) -> Self::IntoFuture { + let this = self; + + Box::pin(async move { + let mut metadata = this.snapshot.metadata().clone(); + let fields = match this.fields { + Some(v) => v, + None => return Err(DeltaTableError::Generic("No fields provided".to_string())), + }; + + let fields_right = &StructType::new(fields.clone()); + let table_schema = this.snapshot.schema(); + let new_table_schema = merge_delta_struct(table_schema, fields_right)?; + + // TODO(ion): Think of a way how we can simply this checking through the API or centralize some checks. 
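// [Editor's sketch, not part of this patch] The protocol check that follows encodes a
// fixed protocol rule: TIMESTAMP_NTZ is a table feature, so adding such a column requires
// at least reader version 3 and writer version 7. Modelled standalone below; the function
// name and tuple shape are illustrative, not the crate's API:
fn versions_after_add(contains_timestamp_ntz: bool, current: (i32, i32)) -> (i32, i32) {
    let (reader, writer) = current;
    if contains_timestamp_ntz {
        // Upgrading to v3/v7 also moves legacy table properties into explicit features.
        (reader.max(3), writer.max(7))
    } else {
        current
    }
}
// versions_after_add(true, (1, 2)) == (3, 7); versions_after_add(false, (1, 2)) == (1, 2)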
+ let contains_timestampntz = PROTOCOL.contains_timestampntz(fields.iter()); + let protocol = this.snapshot.protocol(); + + let maybe_new_protocol = if contains_timestampntz { + let updated_protocol = protocol.clone().enable_timestamp_ntz(); + if !(protocol.min_reader_version == 3 && protocol.min_writer_version == 7) { + // Convert existing properties to features since we advanced the protocol to v3,7 + Some( + updated_protocol + .move_table_properties_into_features(&metadata.configuration), + ) + } else { + Some(updated_protocol) + } + } else { + None + }; + + let operation = DeltaOperation::AddColumn { + fields: fields.into_iter().collect_vec(), + }; + + metadata.schema_string = serde_json::to_string(&new_table_schema)?; + + let mut actions = vec![metadata.into()]; + + if let Some(new_protocol) = maybe_new_protocol { + actions.push(new_protocol.into()) + } + + let commit = CommitBuilder::from(this.commit_properties) + .with_actions(actions) + .build(Some(&this.snapshot), this.log_store.clone(), operation) + .await?; + + Ok(DeltaTable::new_with_state( + this.log_store, + commit.snapshot(), + )) + }) + } +} diff --git a/crates/core/src/operations/add_feature.rs b/crates/core/src/operations/add_feature.rs new file mode 100644 index 0000000000..7200c37d03 --- /dev/null +++ b/crates/core/src/operations/add_feature.rs @@ -0,0 +1,196 @@ +//! Enable table features + +use futures::future::BoxFuture; +use itertools::Itertools; + +use super::transaction::{CommitBuilder, CommitProperties}; +use crate::kernel::{ReaderFeatures, TableFeatures, WriterFeatures}; +use crate::logstore::LogStoreRef; +use crate::protocol::DeltaOperation; +use crate::table::state::DeltaTableState; +use crate::DeltaTable; +use crate::{DeltaResult, DeltaTableError}; + +/// Enable table features for a table +pub struct AddTableFeatureBuilder { + /// A snapshot of the table's state + snapshot: DeltaTableState, + /// Name of the feature + name: Vec, + /// Allow protocol versions to be increased by setting features + allow_protocol_versions_increase: bool, + /// Delta object store for handling data files + log_store: LogStoreRef, + /// Additional information to add to the commit + commit_properties: CommitProperties, +} + +impl super::Operation<()> for AddTableFeatureBuilder {} + +impl AddTableFeatureBuilder { + /// Create a new builder + pub fn new(log_store: LogStoreRef, snapshot: DeltaTableState) -> Self { + Self { + name: vec![], + allow_protocol_versions_increase: false, + snapshot, + log_store, + commit_properties: CommitProperties::default(), + } + } + + /// Specify the features to be added + pub fn with_feature>(mut self, name: S) -> Self { + self.name.push(name.into()); + self + } + + /// Specify the features to be added + pub fn with_features>(mut self, name: Vec) -> Self { + self.name + .extend(name.into_iter().map(Into::into).collect_vec()); + self + } + + /// Specify if you want to allow protocol version to be increased + pub fn with_allow_protocol_versions_increase(mut self, allow: bool) -> Self { + self.allow_protocol_versions_increase = allow; + self + } + + /// Additional metadata to be added to commit info + pub fn with_commit_properties(mut self, commit_properties: CommitProperties) -> Self { + self.commit_properties = commit_properties; + self + } +} + +impl std::future::IntoFuture for AddTableFeatureBuilder { + type Output = DeltaResult; + + type IntoFuture = BoxFuture<'static, Self::Output>; + + fn into_future(self) -> Self::IntoFuture { + let this = self; + + Box::pin(async move { + let name = if 
this.name.is_empty() { + return Err(DeltaTableError::Generic("No features provided".to_string())); + } else { + this.name + }; + let (reader_features, writer_features): ( + Vec>, + Vec>, + ) = name.iter().map(|v| v.to_reader_writer_features()).unzip(); + let reader_features = reader_features.into_iter().flatten().collect_vec(); + let writer_features = writer_features.into_iter().flatten().collect_vec(); + + let mut protocol = this.snapshot.protocol().clone(); + + if !this.allow_protocol_versions_increase { + if !reader_features.is_empty() + && !writer_features.is_empty() + && !(protocol.min_reader_version == 3 && protocol.min_writer_version == 7) + { + return Err(DeltaTableError::Generic("Table feature enables reader and writer feature, but reader is not v3, and writer not v7. Set allow_protocol_versions_increase or increase versions explicitly through set_tbl_properties".to_string())); + } else if !reader_features.is_empty() && protocol.min_reader_version < 3 { + return Err(DeltaTableError::Generic("Table feature enables reader feature, but min_reader is not v3. Set allow_protocol_versions_increase or increase version explicitly through set_tbl_properties".to_string())); + } else if !writer_features.is_empty() && protocol.min_writer_version < 7 { + return Err(DeltaTableError::Generic("Table feature enables writer feature, but min_writer is not v7. Set allow_protocol_versions_increase or increase version explicitly through set_tbl_properties".to_string())); + } + } + + protocol = protocol.with_reader_features(reader_features); + protocol = protocol.with_writer_features(writer_features); + + let operation = DeltaOperation::AddFeature { name }; + + let actions = vec![protocol.into()]; + + let commit = CommitBuilder::from(this.commit_properties) + .with_actions(actions) + .build(Some(&this.snapshot), this.log_store.clone(), operation) + .await?; + + Ok(DeltaTable::new_with_state( + this.log_store, + commit.snapshot(), + )) + }) + } +} + +#[cfg(feature = "datafusion")] +#[cfg(test)] +mod tests { + use delta_kernel::DeltaResult; + + use crate::{ + kernel::TableFeatures, + writer::test_utils::{create_bare_table, get_record_batch}, + DeltaOps, + }; + + #[tokio::test] + async fn add_feature() -> DeltaResult<()> { + let batch = get_record_batch(None, false); + let write = DeltaOps(create_bare_table()) + .write(vec![batch.clone()]) + .await + .unwrap(); + let table = DeltaOps(write); + let result = table + .add_feature() + .with_feature(TableFeatures::ChangeDataFeed) + .with_allow_protocol_versions_increase(true) + .await + .unwrap(); + + assert!(&result + .protocol() + .cloned() + .unwrap() + .writer_features + .unwrap_or_default() + .contains(&crate::kernel::WriterFeatures::ChangeDataFeed)); + + let result = DeltaOps(result) + .add_feature() + .with_feature(TableFeatures::DeletionVectors) + .with_allow_protocol_versions_increase(true) + .await + .unwrap(); + + let current_protocol = &result.protocol().cloned().unwrap(); + assert!(¤t_protocol + .writer_features + .clone() + .unwrap_or_default() + .contains(&crate::kernel::WriterFeatures::DeletionVectors)); + assert!(¤t_protocol + .reader_features + .clone() + .unwrap_or_default() + .contains(&crate::kernel::ReaderFeatures::DeletionVectors)); + assert_eq!(result.version(), 2); + Ok(()) + } + + #[tokio::test] + async fn add_feature_disallowed_increase() -> DeltaResult<()> { + let batch = get_record_batch(None, false); + let write = DeltaOps(create_bare_table()) + .write(vec![batch.clone()]) + .await + .unwrap(); + let table = DeltaOps(write); + let 
result = table + .add_feature() + .with_feature(TableFeatures::ChangeDataFeed) + .await; + + assert!(result.is_err()); + Ok(()) + } +} diff --git a/crates/core/src/operations/cast.rs b/crates/core/src/operations/cast.rs deleted file mode 100644 index b231346266..0000000000 --- a/crates/core/src/operations/cast.rs +++ /dev/null @@ -1,354 +0,0 @@ -//! Provide common cast functionality for callers -//! -use crate::kernel::{ - ArrayType, DataType as DeltaDataType, MapType, MetadataValue, StructField, StructType, -}; -use arrow_array::{new_null_array, Array, ArrayRef, RecordBatch, StructArray}; -use arrow_cast::{cast_with_options, CastOptions}; -use arrow_schema::{ArrowError, DataType, Fields, SchemaRef as ArrowSchemaRef}; -use std::collections::HashMap; -use std::sync::Arc; - -use crate::DeltaResult; - -fn try_merge_metadata( - left: &mut HashMap, - right: &HashMap, -) -> Result<(), ArrowError> { - for (k, v) in right { - if let Some(vl) = left.get(k) { - if vl != v { - return Err(ArrowError::SchemaError(format!( - "Cannot merge metadata with different values for key {}", - k - ))); - } - } else { - left.insert(k.clone(), v.clone()); - } - } - Ok(()) -} - -pub(crate) fn merge_struct( - left: &StructType, - right: &StructType, -) -> Result { - let mut errors = Vec::with_capacity(left.fields().len()); - let merged_fields: Result, ArrowError> = left - .fields() - .iter() - .map(|field| { - let right_field = right.field_with_name(field.name()); - if let Ok(right_field) = right_field { - let type_or_not = merge_type(field.data_type(), right_field.data_type()); - match type_or_not { - Err(e) => { - errors.push(e.to_string()); - Err(e) - } - Ok(f) => { - let mut new_field = StructField::new( - field.name(), - f, - field.is_nullable() || right_field.is_nullable(), - ); - - new_field.metadata = field.metadata.clone(); - try_merge_metadata(&mut new_field.metadata, &right_field.metadata)?; - Ok(new_field) - } - } - } else { - Ok(field.clone()) - } - }) - .collect(); - match merged_fields { - Ok(mut fields) => { - for field in right.fields() { - if !left.field_with_name(field.name()).is_ok() { - fields.push(field.clone()); - } - } - - Ok(StructType::new(fields)) - } - Err(e) => { - errors.push(e.to_string()); - Err(ArrowError::SchemaError(errors.join("\n"))) - } - } -} - -pub(crate) fn merge_type( - left: &DeltaDataType, - right: &DeltaDataType, -) -> Result { - if left == right { - return Ok(left.clone()); - } - match (left, right) { - (DeltaDataType::Array(a), DeltaDataType::Array(b)) => { - let merged = merge_type(&a.element_type, &b.element_type)?; - Ok(DeltaDataType::Array(Box::new(ArrayType::new( - merged, - a.contains_null() || b.contains_null(), - )))) - } - (DeltaDataType::Map(a), DeltaDataType::Map(b)) => { - let merged_key = merge_type(&a.key_type, &b.key_type)?; - let merged_value = merge_type(&a.value_type, &b.value_type)?; - Ok(DeltaDataType::Map(Box::new(MapType::new( - merged_key, - merged_value, - a.value_contains_null() || b.value_contains_null(), - )))) - } - (DeltaDataType::Struct(a), DeltaDataType::Struct(b)) => { - let merged = merge_struct(a, b)?; - Ok(DeltaDataType::Struct(Box::new(merged))) - } - (a, b) => Err(ArrowError::SchemaError(format!( - "Cannot merge types {} and {}", - a, b - ))), - } -} - -pub(crate) fn merge_schema( - left: ArrowSchemaRef, - right: ArrowSchemaRef, -) -> Result { - let left_delta: StructType = left.try_into()?; - let right_delta: StructType = right.try_into()?; - let merged: StructType = merge_struct(&left_delta, &right_delta)?; - 
Ok(Arc::new((&merged).try_into()?)) -} - -fn cast_struct( - struct_array: &StructArray, - fields: &Fields, - cast_options: &CastOptions, - add_missing: bool, -) -> Result>, arrow_schema::ArrowError> { - fields - .iter() - .map(|field| { - let col_or_not = struct_array.column_by_name(field.name()); - match col_or_not { - None => match add_missing { - true => Ok(new_null_array(field.data_type(), struct_array.len())), - false => Err(arrow_schema::ArrowError::SchemaError(format!( - "Could not find column {0}", - field.name() - ))), - }, - Some(col) => { - if let (DataType::Struct(_), DataType::Struct(child_fields)) = - (col.data_type(), field.data_type()) - { - let child_struct = StructArray::from(col.into_data()); - let s = - cast_struct(&child_struct, child_fields, cast_options, add_missing)?; - Ok(Arc::new(StructArray::new( - child_fields.clone(), - s, - child_struct.nulls().map(ToOwned::to_owned), - )) as ArrayRef) - } else if is_cast_required(col.data_type(), field.data_type()) { - cast_with_options(col, field.data_type(), cast_options) - } else { - Ok(col.clone()) - } - } - } - }) - .collect::, _>>() -} - -fn is_cast_required(a: &DataType, b: &DataType) -> bool { - match (a, b) { - (DataType::List(a_item), DataType::List(b_item)) => { - // If list item name is not the default('item') the list must be casted - !a.equals_datatype(b) || a_item.name() != b_item.name() - } - (_, _) => !a.equals_datatype(b), - } -} - -/// Cast recordbatch to a new target_schema, by casting each column array -pub fn cast_record_batch( - batch: &RecordBatch, - target_schema: ArrowSchemaRef, - safe: bool, - add_missing: bool, -) -> DeltaResult { - let cast_options = CastOptions { - safe, - ..Default::default() - }; - - let s = StructArray::new( - batch.schema().as_ref().to_owned().fields, - batch.columns().to_owned(), - None, - ); - let columns = cast_struct(&s, target_schema.fields(), &cast_options, add_missing)?; - Ok(RecordBatch::try_new(target_schema, columns)?) 
-} - -#[cfg(test)] -mod tests { - use crate::kernel::{ - ArrayType as DeltaArrayType, DataType as DeltaDataType, StructField as DeltaStructField, - StructType as DeltaStructType, - }; - use crate::operations::cast::MetadataValue; - use crate::operations::cast::{cast_record_batch, is_cast_required}; - use arrow::array::ArrayData; - use arrow_array::{Array, ArrayRef, ListArray, RecordBatch}; - use arrow_buffer::Buffer; - use arrow_schema::{DataType, Field, FieldRef, Fields, Schema, SchemaRef}; - use std::collections::HashMap; - use std::sync::Arc; - - #[test] - fn test_merge_schema_with_dict() { - let left_schema = Arc::new(Schema::new(vec![Field::new( - "f", - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - false, - )])); - let right_schema = Arc::new(Schema::new(vec![Field::new( - "f", - DataType::LargeUtf8, - true, - )])); - - let result = super::merge_schema(left_schema, right_schema).unwrap(); - assert_eq!(result.fields().len(), 1); - let delta_type: DeltaDataType = result.fields()[0].data_type().try_into().unwrap(); - assert_eq!(delta_type, DeltaDataType::STRING); - assert!(result.fields()[0].is_nullable()); - } - - #[test] - fn test_merge_schema_with_meta() { - let mut left_meta = HashMap::new(); - left_meta.insert("a".to_string(), "a1".to_string()); - let left_schema = DeltaStructType::new(vec![DeltaStructField::new( - "f", - DeltaDataType::STRING, - false, - ) - .with_metadata(left_meta)]); - let mut right_meta = HashMap::new(); - right_meta.insert("b".to_string(), "b2".to_string()); - let right_schema = DeltaStructType::new(vec![DeltaStructField::new( - "f", - DeltaDataType::STRING, - true, - ) - .with_metadata(right_meta)]); - - let result = super::merge_struct(&left_schema, &right_schema).unwrap(); - assert_eq!(result.fields().len(), 1); - let delta_type = result.fields()[0].data_type(); - assert_eq!(delta_type, &DeltaDataType::STRING); - let mut expected_meta = HashMap::new(); - expected_meta.insert("a".to_string(), MetadataValue::String("a1".to_string())); - expected_meta.insert("b".to_string(), MetadataValue::String("b2".to_string())); - assert_eq!(result.fields()[0].metadata(), &expected_meta); - } - - #[test] - fn test_merge_schema_with_nested() { - let left_schema = Arc::new(Schema::new(vec![Field::new( - "f", - DataType::LargeList(Arc::new(Field::new("item", DataType::Utf8, false))), - false, - )])); - let right_schema = Arc::new(Schema::new(vec![Field::new( - "f", - DataType::List(Arc::new(Field::new("item", DataType::LargeUtf8, false))), - true, - )])); - - let result = super::merge_schema(left_schema, right_schema).unwrap(); - assert_eq!(result.fields().len(), 1); - let delta_type: DeltaDataType = result.fields()[0].data_type().try_into().unwrap(); - assert_eq!( - delta_type, - DeltaDataType::Array(Box::new(DeltaArrayType::new(DeltaDataType::STRING, false))) - ); - assert!(result.fields()[0].is_nullable()); - } - - #[test] - fn test_cast_record_batch_with_list_non_default_item() { - let array = Arc::new(make_list_array()) as ArrayRef; - let source_schema = Schema::new(vec![Field::new( - "list_column", - array.data_type().clone(), - false, - )]); - let record_batch = RecordBatch::try_new(Arc::new(source_schema), vec![array]).unwrap(); - - let fields = Fields::from(vec![Field::new_list( - "list_column", - Field::new("item", DataType::Int8, false), - false, - )]); - let target_schema = Arc::new(Schema::new(fields)) as SchemaRef; - - let result = cast_record_batch(&record_batch, target_schema, false, false); - - let schema = 
result.unwrap().schema(); - let field = schema.column_with_name("list_column").unwrap().1; - if let DataType::List(list_item) = field.data_type() { - assert_eq!(list_item.name(), "item"); - } else { - panic!("Not a list"); - } - } - - fn make_list_array() -> ListArray { - let value_data = ArrayData::builder(DataType::Int32) - .len(8) - .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) - .build() - .unwrap(); - - let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); - - let list_data_type = DataType::List(Arc::new(Field::new("element", DataType::Int32, true))); - let list_data = ArrayData::builder(list_data_type) - .len(3) - .add_buffer(value_offsets) - .add_child_data(value_data) - .build() - .unwrap(); - ListArray::from(list_data) - } - - #[test] - fn test_is_cast_required_with_list() { - let field1 = DataType::List(FieldRef::from(Field::new("item", DataType::Int32, false))); - let field2 = DataType::List(FieldRef::from(Field::new("item", DataType::Int32, false))); - - assert!(!is_cast_required(&field1, &field2)); - } - - #[test] - fn test_is_cast_required_with_list_non_default_item() { - let field1 = DataType::List(FieldRef::from(Field::new("item", DataType::Int32, false))); - let field2 = DataType::List(FieldRef::from(Field::new( - "element", - DataType::Int32, - false, - ))); - - assert!(is_cast_required(&field1, &field2)); - } -} diff --git a/crates/core/src/operations/cast/merge_schema.rs b/crates/core/src/operations/cast/merge_schema.rs new file mode 100644 index 0000000000..64fe2b7ed6 --- /dev/null +++ b/crates/core/src/operations/cast/merge_schema.rs @@ -0,0 +1,352 @@ +//! Provide schema merging for delta schemas +//! +use std::collections::HashMap; + +use arrow::datatypes::DataType::Dictionary; +use arrow_schema::{ + ArrowError, DataType, Field as ArrowField, Fields, Schema as ArrowSchema, + SchemaRef as ArrowSchemaRef, +}; + +use crate::kernel::{ArrayType, DataType as DeltaDataType, MapType, StructField, StructType}; + +fn try_merge_metadata( + left: &mut HashMap, + right: &HashMap, +) -> Result<(), ArrowError> { + for (k, v) in right { + if let Some(vl) = left.get(k) { + if vl != v { + return Err(ArrowError::SchemaError(format!( + "Cannot merge metadata with different values for key {}", + k + ))); + } + } else { + left.insert(k.clone(), v.clone()); + } + } + Ok(()) +} + +pub(crate) fn merge_delta_type( + left: &DeltaDataType, + right: &DeltaDataType, +) -> Result { + if left == right { + return Ok(left.clone()); + } + match (left, right) { + (DeltaDataType::Array(a), DeltaDataType::Array(b)) => { + let merged = merge_delta_type(&a.element_type, &b.element_type)?; + Ok(DeltaDataType::Array(Box::new(ArrayType::new( + merged, + a.contains_null() || b.contains_null(), + )))) + } + (DeltaDataType::Map(a), DeltaDataType::Map(b)) => { + let merged_key = merge_delta_type(&a.key_type, &b.key_type)?; + let merged_value = merge_delta_type(&a.value_type, &b.value_type)?; + Ok(DeltaDataType::Map(Box::new(MapType::new( + merged_key, + merged_value, + a.value_contains_null() || b.value_contains_null(), + )))) + } + (DeltaDataType::Struct(a), DeltaDataType::Struct(b)) => { + let merged = merge_delta_struct(a, b)?; + Ok(DeltaDataType::Struct(Box::new(merged))) + } + (a, b) => Err(ArrowError::SchemaError(format!( + "Cannot merge types {} and {}", + a, b + ))), + } +} + +pub(crate) fn merge_delta_struct( + left: &StructType, + right: &StructType, +) -> Result { + let mut errors = Vec::new(); + let merged_fields: Result, ArrowError> = left + .fields() + .map(|field| { + let 
right_field = right.field(field.name()); + if let Some(right_field) = right_field { + let type_or_not = merge_delta_type(field.data_type(), right_field.data_type()); + match type_or_not { + Err(e) => { + errors.push(e.to_string()); + Err(e) + } + Ok(f) => { + let mut new_field = StructField::new( + field.name(), + f, + field.is_nullable() || right_field.is_nullable(), + ); + + new_field.metadata.clone_from(&field.metadata); + try_merge_metadata(&mut new_field.metadata, &right_field.metadata)?; + Ok(new_field) + } + } + } else { + Ok(field.clone()) + } + }) + .collect(); + match merged_fields { + Ok(mut fields) => { + for field in right.fields() { + if !left.field(field.name()).is_some() { + fields.push(field.clone()); + } + } + + Ok(StructType::new(fields)) + } + Err(e) => { + errors.push(e.to_string()); + Err(ArrowError::SchemaError(errors.join("\n"))) + } + } +} + +pub(crate) fn merge_arrow_field( + left: &ArrowField, + right: &ArrowField, + preserve_new_fields: bool, +) -> Result { + if left == right { + return Ok(left.clone()); + } + + let (table_type, batch_type) = (left.data_type(), right.data_type()); + + match (table_type, batch_type) { + (Dictionary(key_type, value_type), _) + if matches!( + value_type.as_ref(), + DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8 + ) && matches!( + batch_type, + DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View + ) => + { + Ok(ArrowField::new( + right.name(), + Dictionary(key_type.clone(), Box::new(batch_type.clone())), + left.is_nullable() || right.is_nullable(), + )) + } + (Dictionary(key_type, value_type), _) + if matches!( + value_type.as_ref(), + DataType::Binary | DataType::BinaryView | DataType::LargeBinary + ) && matches!( + batch_type, + DataType::Binary | DataType::LargeBinary | DataType::BinaryView + ) => + { + Ok(ArrowField::new( + right.name(), + Dictionary(key_type.clone(), Box::new(batch_type.clone())), + left.is_nullable() || right.is_nullable(), + )) + } + (Dictionary(_, value_type), _) if value_type.equals_datatype(batch_type) => Ok(left + .clone() + .with_nullable(left.is_nullable() || right.is_nullable())), + + (_, Dictionary(_, value_type)) + if matches!( + table_type, + DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8 + ) && matches!( + value_type.as_ref(), + DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View + ) => + { + Ok(right + .clone() + .with_nullable(left.is_nullable() || right.is_nullable())) + } + (_, Dictionary(_, value_type)) + if matches!( + table_type, + DataType::Binary | DataType::BinaryView | DataType::LargeBinary + ) && matches!( + value_type.as_ref(), + DataType::Binary | DataType::LargeBinary | DataType::BinaryView + ) => + { + Ok(right + .clone() + .with_nullable(left.is_nullable() || right.is_nullable())) + } + (_, Dictionary(_, value_type)) if value_type.equals_datatype(table_type) => Ok(right + .clone() + .with_nullable(left.is_nullable() || right.is_nullable())), + // With Utf8/binary we always take the right type since that is coming from the incoming data + // by doing that we allow passthrough of any string flavor + ( + DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8, + DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View, + ) + | ( + DataType::Binary | DataType::BinaryView | DataType::LargeBinary, + DataType::Binary | DataType::LargeBinary | DataType::BinaryView, + ) => Ok(ArrowField::new( + left.name(), + batch_type.clone(), + right.is_nullable() || left.is_nullable(), + )), + ( + DataType::List(left_child_fields) | 
DataType::LargeList(left_child_fields), + DataType::LargeList(right_child_fields), + ) => { + let merged = + merge_arrow_field(left_child_fields, right_child_fields, preserve_new_fields)?; + Ok(ArrowField::new( + left.name(), + DataType::LargeList(merged.into()), + right.is_nullable() || left.is_nullable(), + )) + } + ( + DataType::List(left_child_fields) | DataType::LargeList(left_child_fields), + DataType::List(right_child_fields), + ) => { + let merged = + merge_arrow_field(left_child_fields, right_child_fields, preserve_new_fields)?; + Ok(ArrowField::new( + left.name(), + DataType::List(merged.into()), + right.is_nullable() || left.is_nullable(), + )) + } + (DataType::Struct(left_child_fields), DataType::Struct(right_child_fields)) => { + let merged = + merge_arrow_vec_fields(left_child_fields, right_child_fields, preserve_new_fields)?; + Ok(ArrowField::new( + left.name(), + DataType::Struct(merged.into()), + right.is_nullable() || left.is_nullable(), + )) + } + (DataType::Map(left_field, left_sorted), DataType::Map(right_field, right_sorted)) + if left_sorted == right_sorted => + { + let merged = merge_arrow_field(left_field, right_field, preserve_new_fields)?; + Ok(ArrowField::new( + left.name(), + DataType::Map(merged.into(), *right_sorted), + right.is_nullable() || left.is_nullable(), + )) + } + _ => { + let mut new_field = left.clone(); + match new_field.try_merge(right) { + Ok(()) => (), + Err(_err) => { + // We cannot keep the table field here, there is some weird behavior where + // Decimal(5,1) can be safely casted into Decimal(4,1) with out loss of data + // Then our stats parser fails to parse this decimal(1000.1) into Decimal(4,1) + // even though datafusion was able to write it into parquet + // We manually have to check if the decimal in the recordbatch is a subset of the table decimal + if let ( + DataType::Decimal128(left_precision, left_scale) + | DataType::Decimal256(left_precision, left_scale), + DataType::Decimal128(right_precision, right_scale), + ) = (right.data_type(), new_field.data_type()) + { + if !(left_precision <= right_precision && left_scale <= right_scale) { + return Err(ArrowError::SchemaError(format!( + "Cannot merge field {} from {} to {}", + right.name(), + right.data_type(), + new_field.data_type() + ))); + } + }; + // If it's not Decimal datatype, the new_field remains the left table field. + } + }; + Ok(new_field) + } + } +} + +/// Merges Arrow Table schema and Arrow Batch Schema, by allowing Large/View Types to passthrough. +// Sometimes fields can't be merged because they are not the same types. So table has int32, +// but batch int64. We want the preserve the table type. At later stage we will call cast_record_batch +// which will cast the batch int64->int32. This is desired behaviour so we can have flexibility +// in the batch data types. But preserve the correct table and parquet types. +// +// Preserve_new_fields can also be disabled if you just want to only use the passthrough functionality +pub(crate) fn merge_arrow_schema( + table_schema: ArrowSchemaRef, + batch_schema: ArrowSchemaRef, + preserve_new_fields: bool, +) -> Result { + let table_fields = table_schema.fields(); + let batch_fields = batch_schema.fields(); + + let merged_schema = ArrowSchema::new(merge_arrow_vec_fields( + table_fields, + batch_fields, + preserve_new_fields, + )?) 
+ .into(); + Ok(merged_schema) +} + +fn merge_arrow_vec_fields( + table_fields: &Fields, + batch_fields: &Fields, + preserve_new_fields: bool, +) -> Result, ArrowError> { + let mut errors = Vec::with_capacity(table_fields.len()); + let merged_fields: Result, ArrowError> = table_fields + .iter() + .map(|field| { + let right_field = batch_fields.find(field.name()); + if let Some((_, right_field)) = right_field { + let field_or_not = + merge_arrow_field(field.as_ref(), right_field, preserve_new_fields); + match field_or_not { + Err(e) => { + errors.push(e.to_string()); + Err(e) + } + Ok(mut f) => { + let mut field_matadata = f.metadata().clone(); + try_merge_metadata(&mut field_matadata, right_field.metadata())?; + f.set_metadata(field_matadata); + Ok(f) + } + } + } else { + Ok(field.as_ref().clone()) + } + }) + .collect(); + match merged_fields { + Ok(mut fields) => { + if preserve_new_fields { + for field in batch_fields.into_iter() { + if table_fields.find(field.name()).is_none() { + fields.push(field.as_ref().clone()); + } + } + } + Ok(fields) + } + Err(e) => { + errors.push(e.to_string()); + Err(ArrowError::SchemaError(errors.join("\n"))) + } + } +} diff --git a/crates/core/src/operations/cast/mod.rs b/crates/core/src/operations/cast/mod.rs new file mode 100644 index 0000000000..554373e623 --- /dev/null +++ b/crates/core/src/operations/cast/mod.rs @@ -0,0 +1,650 @@ +//! Provide common cast functionality for callers +//! +use arrow_array::cast::AsArray; +use arrow_array::{ + new_null_array, Array, ArrayRef, GenericListArray, MapArray, OffsetSizeTrait, RecordBatch, + RecordBatchOptions, StructArray, +}; +use arrow_cast::{cast_with_options, CastOptions}; +use arrow_schema::{ArrowError, DataType, FieldRef, Fields, SchemaRef as ArrowSchemaRef}; +use std::sync::Arc; + +pub(crate) mod merge_schema; + +use crate::DeltaResult; + +fn cast_struct( + struct_array: &StructArray, + fields: &Fields, + cast_options: &CastOptions, + add_missing: bool, +) -> Result { + StructArray::try_new( + fields.to_owned(), + fields + .iter() + .map(|field| { + let col_or_not = struct_array.column_by_name(field.name()); + match col_or_not { + None => { + if add_missing && field.is_nullable() { + Ok(new_null_array(field.data_type(), struct_array.len())) + } else { + Err(ArrowError::SchemaError(format!( + "Could not find column {}", + field.name() + ))) + } + } + Some(col) => cast_field(col, field, cast_options, add_missing), + } + }) + .collect::, _>>()?, + struct_array.nulls().map(ToOwned::to_owned), + ) +} + +fn cast_list( + array: &GenericListArray, + field: &FieldRef, + cast_options: &CastOptions, + add_missing: bool, +) -> Result, ArrowError> { + let values = cast_field(array.values(), field, cast_options, add_missing)?; + GenericListArray::::try_new( + field.clone(), + array.offsets().clone(), + values, + array.nulls().cloned(), + ) +} + +fn cast_map( + array: &MapArray, + entries_field: &FieldRef, + sorted: bool, + cast_options: &CastOptions, + add_missing: bool, +) -> Result { + match entries_field.data_type() { + DataType::Struct(entry_fields) => { + let entries = cast_struct(array.entries(), entry_fields, cast_options, add_missing)?; + MapArray::try_new( + entries_field.clone(), + array.offsets().to_owned(), + entries, + array.nulls().cloned(), + sorted, + ) + } + _ => Err(ArrowError::CastError( + "Map entries must be a struct".to_string(), + )), + } +} + +fn cast_field( + col: &ArrayRef, + field: &FieldRef, + cast_options: &CastOptions, + add_missing: bool, +) -> Result { + let (col_type, field_type) = 
(col.data_type(), field.data_type()); + + match (col_type, field_type) { + (DataType::Struct(_), DataType::Struct(child_fields)) => { + let child_struct = StructArray::from(col.into_data()); + Ok(Arc::new(cast_struct( + &child_struct, + child_fields, + cast_options, + add_missing, + )?) as ArrayRef) + } + (DataType::List(_), DataType::List(child_fields)) => Ok(Arc::new(cast_list( + col.as_any() + .downcast_ref::>() + .ok_or_else(|| { + ArrowError::CastError(format!( + "Expected a list for {} but got {}", + field.name(), + col_type + )) + })?, + child_fields, + cast_options, + add_missing, + )?) as ArrayRef), + (DataType::LargeList(_), DataType::LargeList(child_fields)) => Ok(Arc::new(cast_list( + col.as_any() + .downcast_ref::>() + .ok_or_else(|| { + ArrowError::CastError(format!( + "Expected a list for {} but got {}", + field.name(), + col_type + )) + })?, + child_fields, + cast_options, + add_missing, + )?) as ArrayRef), + // TODO: add list view cast + (DataType::Map(_, _), DataType::Map(child_fields, sorted)) => Ok(Arc::new(cast_map( + col.as_map_opt().ok_or_else(|| { + ArrowError::CastError(format!( + "Expected a map for {} but got {}", + field.name(), + col_type + )) + })?, + child_fields, + *sorted, + cast_options, + add_missing, + )?) as ArrayRef), + _ if is_cast_required(col_type, field_type) => { + cast_with_options(col, field_type, cast_options) + } + _ => Ok(col.clone()), + } +} + +fn is_cast_required(a: &DataType, b: &DataType) -> bool { + match (a, b) { + (DataType::List(a_item), DataType::List(b_item)) => { + // If list item name is not the default('item') the list must be casted + !a.equals_datatype(b) || a_item.name() != b_item.name() + } + (_, _) => !a.equals_datatype(b), + } +} + +/// Cast recordbatch to a new target_schema, by casting each column array +pub fn cast_record_batch( + batch: &RecordBatch, + target_schema: ArrowSchemaRef, + safe: bool, + add_missing: bool, +) -> DeltaResult { + let cast_options = CastOptions { + safe, + ..Default::default() + }; + + let s = StructArray::new( + batch.schema().as_ref().to_owned().fields, + batch.columns().to_owned(), + None, + ); + let struct_array = cast_struct(&s, target_schema.fields(), &cast_options, add_missing)?; + + Ok(RecordBatch::try_new_with_options( + target_schema, + struct_array.columns().to_vec(), + &RecordBatchOptions::new().with_row_count(Some(batch.num_rows())), + )?) 
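// [Editor's sketch, not part of this patch] Intended call shape for `cast_record_batch`,
// with hypothetical column names. `add_missing = true` back-fills absent *nullable* target
// columns with nulls (a missing non-nullable column is an error), and `safe = false` turns
// invalid casts into errors instead of nulls:
//
//   let target = Arc::new(Schema::new(vec![
//       Field::new("id", DataType::Int64, false), // widened from Int32 in the batch
//       Field::new("note", DataType::Utf8, true), // absent in the batch, filled with nulls
//   ]));
//   let casted = cast_record_batch(&batch, target, false, true)?;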
+} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::ops::Deref; + use std::sync::Arc; + + use super::merge_schema::{merge_arrow_schema, merge_delta_struct}; + use arrow::array::types::Int32Type; + use arrow::array::{ + new_empty_array, new_null_array, Array, ArrayData, ArrayRef, AsArray, Int32Array, + ListArray, PrimitiveArray, RecordBatch, StringArray, StructArray, + }; + use arrow::buffer::{Buffer, NullBuffer}; + use arrow_schema::{DataType, Field, FieldRef, Fields, Schema, SchemaRef}; + use delta_kernel::schema::MetadataValue; + use itertools::Itertools; + + use crate::kernel::{ + ArrayType as DeltaArrayType, DataType as DeltaDataType, StructField as DeltaStructField, + StructType as DeltaStructType, + }; + use crate::operations::cast::{cast_record_batch, is_cast_required}; + + #[test] + fn test_merge_arrow_schema_with_dict() { + let left_schema = Arc::new(Schema::new(vec![Field::new( + "f", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + false, + )])); + let right_schema = Arc::new(Schema::new(vec![Field::new( + "f", + DataType::LargeUtf8, + true, + )])); + + let result = merge_arrow_schema(left_schema, right_schema, true).unwrap(); + assert_eq!(result.fields().len(), 1); + let delta_type: DeltaDataType = result.fields()[0].data_type().try_into().unwrap(); + assert_eq!(delta_type, DeltaDataType::STRING); + assert!(result.fields()[0].is_nullable()); + } + + #[test] + fn test_merge_delta_schema_with_meta() { + let mut left_meta = HashMap::new(); + left_meta.insert("a".to_string(), "a1".to_string()); + let left_schema = DeltaStructType::new(vec![DeltaStructField::new( + "f", + DeltaDataType::STRING, + false, + ) + .with_metadata(left_meta)]); + let mut right_meta = HashMap::new(); + right_meta.insert("b".to_string(), "b2".to_string()); + let right_schema = DeltaStructType::new(vec![DeltaStructField::new( + "f", + DeltaDataType::STRING, + true, + ) + .with_metadata(right_meta)]); + + let result = merge_delta_struct(&left_schema, &right_schema).unwrap(); + let fields = result.fields().collect_vec(); + assert_eq!(fields.len(), 1); + let delta_type = fields[0].data_type(); + assert_eq!(delta_type, &DeltaDataType::STRING); + let mut expected_meta = HashMap::new(); + expected_meta.insert("a".to_string(), MetadataValue::String("a1".to_string())); + expected_meta.insert("b".to_string(), MetadataValue::String("b2".to_string())); + assert_eq!(fields[0].metadata(), &expected_meta); + } + + #[test] + fn test_merge_arrow_schema_with_nested() { + let left_schema = Arc::new(Schema::new(vec![Field::new( + "f", + DataType::LargeList(Arc::new(Field::new("item", DataType::Utf8, false))), + false, + )])); + let right_schema = Arc::new(Schema::new(vec![Field::new( + "f", + DataType::List(Arc::new(Field::new("item", DataType::LargeUtf8, false))), + true, + )])); + + let result = merge_arrow_schema(left_schema, right_schema, true).unwrap(); + assert_eq!(result.fields().len(), 1); + let delta_type: DeltaDataType = result.fields()[0].data_type().try_into().unwrap(); + assert_eq!( + delta_type, + DeltaDataType::Array(Box::new(DeltaArrayType::new(DeltaDataType::STRING, false))) + ); + assert!(result.fields()[0].is_nullable()); + } + + #[test] + fn test_cast_record_batch_with_list_non_default_item() { + let array = Arc::new(make_list_array()) as ArrayRef; + let source_schema = Schema::new(vec![Field::new( + "list_column", + array.data_type().clone(), + false, + )]); + let record_batch = RecordBatch::try_new(Arc::new(source_schema), vec![array]).unwrap(); + + let 
fields = Fields::from(vec![Field::new_list( + "list_column", + Field::new("item", DataType::Int8, false), + false, + )]); + let target_schema = Arc::new(Schema::new(fields)) as SchemaRef; + + let result = cast_record_batch(&record_batch, target_schema, false, false); + + let schema = result.unwrap().schema(); + let field = schema.column_with_name("list_column").unwrap().1; + if let DataType::List(list_item) = field.data_type() { + assert_eq!(list_item.name(), "item"); + } else { + panic!("Not a list"); + } + } + + fn make_list_array() -> ListArray { + let value_data = ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) + .build() + .unwrap(); + + let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); + + let list_data_type = DataType::List(Arc::new(Field::new("element", DataType::Int32, true))); + let list_data = ArrayData::builder(list_data_type) + .len(3) + .add_buffer(value_offsets) + .add_child_data(value_data) + .build() + .unwrap(); + ListArray::from(list_data) + } + + #[test] + fn test_is_cast_required_with_list() { + let field1 = DataType::List(FieldRef::from(Field::new("item", DataType::Int32, false))); + let field2 = DataType::List(FieldRef::from(Field::new("item", DataType::Int32, false))); + + assert!(!is_cast_required(&field1, &field2)); + } + + #[test] + fn test_is_cast_required_with_list_non_default_item() { + let field1 = DataType::List(FieldRef::from(Field::new("item", DataType::Int32, false))); + let field2 = DataType::List(FieldRef::from(Field::new( + "element", + DataType::Int32, + false, + ))); + + assert!(is_cast_required(&field1, &field2)); + } + + #[test] + fn test_add_missing_null_fields_with_no_missing_fields() { + let schema = Arc::new(Schema::new(vec![ + Field::new("field1", DataType::Int32, false), + Field::new("field2", DataType::Utf8, true), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1, 2, 3])), + Arc::new(StringArray::from(vec![Some("a"), None, Some("c")])), + ], + ) + .unwrap(); + let result = cast_record_batch(&batch, schema.clone(), false, true).unwrap(); + assert_eq!(result.schema(), schema); + assert_eq!(result.num_columns(), 2); + assert_eq!( + result.column(0).deref().as_primitive::(), + &PrimitiveArray::::from_iter([1, 2, 3]) + ); + assert_eq!( + result.column(1).deref().as_string(), + &StringArray::from(vec![Some("a"), None, Some("c")]) + ); + } + + #[test] + fn test_add_missing_null_fields_with_missing_beginning() { + let schema = Arc::new(Schema::new(vec![Field::new( + "field2", + DataType::Utf8, + true, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(StringArray::from(vec![ + Some("a"), + None, + Some("c"), + ]))], + ) + .unwrap(); + + let new_schema = Arc::new(Schema::new(vec![ + Field::new("field1", DataType::Int32, true), + Field::new("field2", DataType::Utf8, true), + ])); + let result = cast_record_batch(&batch, new_schema.clone(), false, true).unwrap(); + assert_eq!(result.schema(), new_schema); + assert_eq!(result.num_columns(), 2); + assert_eq!( + result.column(0).deref().as_primitive::(), + new_null_array(&DataType::Int32, 3) + .deref() + .as_primitive::() + ); + assert_eq!( + result.column(1).deref().as_string(), + &StringArray::from(vec![Some("a"), None, Some("c")]) + ); + } + + #[test] + fn test_add_missing_null_fields_with_missing_end() { + let schema = Arc::new(Schema::new(vec![Field::new( + "field1", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + 
schema.clone(), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + ) + .unwrap(); + + let new_schema = Arc::new(Schema::new(vec![ + Field::new("field1", DataType::Int32, false), + Field::new("field2", DataType::Utf8, true), + ])); + let result = cast_record_batch(&batch, new_schema.clone(), false, true).unwrap(); + assert_eq!(result.schema(), new_schema); + assert_eq!(result.num_columns(), 2); + assert_eq!( + result.column(0).deref().as_primitive::(), + &PrimitiveArray::::from(vec![Some(1), Some(2), Some(3)]) + ); + assert_eq!( + result.column(1).deref().as_string::(), + new_null_array(&DataType::Utf8, 3).deref().as_string() + ); + } + + #[test] + fn test_add_missing_null_fields_error_on_missing_non_null() { + let schema = Arc::new(Schema::new(vec![Field::new( + "field1", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + ) + .unwrap(); + + let new_schema = Arc::new(Schema::new(vec![ + Field::new("field1", DataType::Int32, false), + Field::new("field2", DataType::Utf8, false), + ])); + let result = cast_record_batch(&batch, new_schema.clone(), false, true); + assert!(result.is_err()); + } + + #[test] + fn test_add_missing_null_fields_nested_struct_missing() { + let nested_fields = Fields::from(vec![Field::new("nested1", DataType::Utf8, true)]); + let schema = Arc::new(Schema::new(vec![ + Field::new("field1", DataType::Int32, false), + Field::new("field2", DataType::Struct(nested_fields.clone()), true), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1, 2, 3])), + Arc::new(StructArray::new( + nested_fields, + vec![Arc::new(StringArray::from(vec![Some("a"), None, Some("c")])) as ArrayRef], + None, + )), + ], + ) + .unwrap(); + let new_schema = Arc::new(Schema::new(vec![ + Field::new("field1", DataType::Int32, false), + Field::new( + "field2", + DataType::Struct(Fields::from(vec![ + Field::new("nested1", DataType::Utf8, true), + Field::new("nested2", DataType::Utf8, true), + ])), + true, + ), + ])); + let result = cast_record_batch(&batch, new_schema.clone(), false, true).unwrap(); + assert_eq!(result.schema(), new_schema); + assert_eq!(result.num_columns(), 2); + assert_eq!( + result.column(0).deref().as_primitive::(), + &PrimitiveArray::::from_iter([1, 2, 3]) + ); + let struct_column = result.column(1).deref().as_struct(); + assert_eq!(struct_column.num_columns(), 2); + assert_eq!( + struct_column.column(0).deref().as_string(), + &StringArray::from(vec![Some("a"), None, Some("c")]) + ); + assert_eq!( + struct_column.column(1).deref().as_string::(), + new_null_array(&DataType::Utf8, 3).deref().as_string() + ); + } + + #[test] + fn test_add_missing_null_fields_nested_struct_missing_non_nullable() { + let nested_fields = Fields::from(vec![Field::new("nested1", DataType::Utf8, false)]); + let schema = Arc::new(Schema::new(vec![ + Field::new("field1", DataType::Int32, false), + Field::new("field2", DataType::Struct(nested_fields.clone()), true), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1, 2, 3])), + Arc::new(StructArray::new( + nested_fields, + vec![new_null_array(&DataType::Utf8, 3)], + Some(NullBuffer::new_null(3)), + )), + ], + ) + .unwrap(); + let new_schema = Arc::new(Schema::new(vec![ + Field::new("field1", DataType::Int32, false), + Field::new( + "field2", + DataType::Struct(Fields::from(vec![ + Field::new("nested1", DataType::Utf8, false), + Field::new("nested2", DataType::Utf8, 
true), + ])), + true, + ), + ])); + let result = cast_record_batch(&batch, new_schema.clone(), false, true).unwrap(); + assert_eq!(result.schema(), new_schema); + assert_eq!(result.num_columns(), 2); + assert_eq!( + result.column(0).deref().as_primitive::(), + &PrimitiveArray::::from_iter([1, 2, 3]) + ); + let struct_column = result.column(1).deref().as_struct(); + assert_eq!(struct_column.num_columns(), 2); + let expected: [Option<&str>; 3] = Default::default(); + assert_eq!( + struct_column.column(0).deref().as_string(), + &StringArray::from(Vec::from(expected)) + ); + assert_eq!( + struct_column.column(1).deref().as_string::(), + new_null_array(&DataType::Utf8, 3).deref().as_string(), + ); + } + + #[test] + fn test_add_missing_null_fields_list_missing() { + let schema = Arc::new(Schema::new(vec![Field::new( + "field1", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + ) + .unwrap(); + let new_schema = Arc::new(Schema::new(vec![ + Field::new("field1", DataType::Int32, false), + Field::new( + "field2", + DataType::List(Arc::new(Field::new("nested1", DataType::Utf8, true))), + true, + ), + ])); + let result = cast_record_batch(&batch, new_schema.clone(), false, true).unwrap(); + assert_eq!(result.schema(), new_schema); + assert_eq!(result.num_columns(), 2); + assert_eq!( + result.column(0).deref().as_primitive::(), + &PrimitiveArray::::from_iter([1, 2, 3]) + ); + let list_column = result.column(1).deref().as_list::(); + assert_eq!(list_column.len(), 3); + assert_eq!(list_column.value_offsets(), &[0, 0, 0, 0]); + assert_eq!( + list_column.values().deref().as_string::(), + new_empty_array(&DataType::Utf8).deref().as_string() + ) + } + + #[test] + fn test_add_missing_null_fields_map_missing() { + let schema = Arc::new(Schema::new(vec![Field::new( + "field1", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + ) + .unwrap(); + let new_schema = Arc::new(Schema::new(vec![ + Field::new("field1", DataType::Int32, false), + Field::new( + "field2", + DataType::Map( + Arc::new(Field::new( + "entries", + DataType::Struct(Fields::from(vec![ + Field::new("key", DataType::Utf8, true), + Field::new("value", DataType::Utf8, true), + ])), + true, + )), + false, + ), + true, + ), + ])); + let result = cast_record_batch(&batch, new_schema.clone(), false, true).unwrap(); + assert_eq!(result.schema(), new_schema); + assert_eq!(result.num_columns(), 2); + assert_eq!( + result.column(0).deref().as_primitive::(), + &PrimitiveArray::::from_iter([1, 2, 3]) + ); + let map_column = result.column(1).deref().as_map(); + assert_eq!(map_column.len(), 3); + assert_eq!(map_column.offsets().as_ref(), &[0; 4]); + assert_eq!( + map_column.keys().deref().as_string::(), + new_empty_array(&DataType::Utf8).deref().as_string() + ); + assert_eq!( + map_column.values().deref().as_string::(), + new_empty_array(&DataType::Utf8).deref().as_string() + ); + } +} diff --git a/crates/core/src/operations/cdc.rs b/crates/core/src/operations/cdc.rs new file mode 100644 index 0000000000..b04c794c61 --- /dev/null +++ b/crates/core/src/operations/cdc.rs @@ -0,0 +1,415 @@ +//! +//! The CDC module contains private tools for managing CDC files +//! 
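// [Editor's sketch, not part of this patch] The CDCTracker below derives change rows by
// set difference between the pre- and post-images (via DataFusion `except`). The same idea
// with plain collections, mirroring the 1,2,3 -> 1,12,3 test case further down:
use std::collections::BTreeSet;

fn main() {
    let pre: BTreeSet<i32> = [1, 2, 3].into();
    let post: BTreeSet<i32> = [1, 12, 3].into();
    // Rows only in the pre-image are tagged `update_preimage`,
    // rows only in the post-image are tagged `update_postimage`.
    let preimage: Vec<&i32> = pre.difference(&post).collect();
    let postimage: Vec<&i32> = post.difference(&pre).collect();
    assert_eq!(preimage, vec![&2]);
    assert_eq!(postimage, vec![&12]);
}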
+ +use crate::table::state::DeltaTableState; +use crate::DeltaResult; + +use datafusion::prelude::*; +use datafusion_common::ScalarValue; + +pub const CDC_COLUMN_NAME: &str = "_change_type"; + +/// The CDCTracker is useful for hooking reads/writes in a manner nececessary to create CDC files +/// associated with commits +pub(crate) struct CDCTracker { + pre_dataframe: DataFrame, + post_dataframe: DataFrame, +} + +impl CDCTracker { + /// construct + pub(crate) fn new(pre_dataframe: DataFrame, post_dataframe: DataFrame) -> Self { + Self { + pre_dataframe, + post_dataframe, + } + } + + pub(crate) fn collect(self) -> DeltaResult { + // Collect _all_ the batches for consideration + let pre_df = self.pre_dataframe; + let post_df = self.post_dataframe; + + // There is certainly a better way to do this other than stupidly cloning data for diffing + // purposes, but this is the quickest and easiest way to "diff" the two sets of batches + let preimage = pre_df.clone().except(post_df.clone())?; + let postimage = post_df.except(pre_df)?; + + let preimage = preimage.with_column( + "_change_type", + lit(ScalarValue::Utf8(Some("update_preimage".to_string()))), + )?; + + let postimage = postimage.with_column( + "_change_type", + lit(ScalarValue::Utf8(Some("update_postimage".to_string()))), + )?; + + let final_df = preimage.union(postimage)?; + Ok(final_df) + } +} + +/// +/// Return true if the specified table is capable of writing Change Data files +/// +/// From the Protocol: +/// +/// > For Writer Versions 4 up to 6, all writers must respect the delta.enableChangeDataFeed +/// > configuration flag in the metadata of the table. When delta.enableChangeDataFeed is true, +/// > writers must produce the relevant AddCDCFile's for any operation that changes data, as +/// > specified in Change Data Files. +/// > +/// > For Writer Version 7, all writers must respect the delta.enableChangeDataFeed configuration flag in +/// > the metadata of the table only if the feature changeDataFeed exists in the table protocol's +/// > writerFeatures. +pub(crate) fn should_write_cdc(snapshot: &DeltaTableState) -> DeltaResult { + if let Some(features) = &snapshot.protocol().writer_features { + // Features should only exist at writer version 7 but to avoid cases where + // the Option> can get filled with an empty set, checking for the value + // explicitly + if snapshot.protocol().min_writer_version == 7 + && !features.contains(&crate::kernel::WriterFeatures::ChangeDataFeed) + { + // If the writer feature has not been set, then the table should not have CDC written + // to it. 
Otherwise fallback to the configured table configuration + return Ok(false); + } + } + Ok(snapshot.table_config().enable_change_data_feed()) +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use super::*; + use crate::kernel::DataType as DeltaDataType; + use crate::kernel::{Action, PrimitiveType, Protocol}; + use crate::operations::DeltaOps; + use crate::{DeltaTable, TableProperty}; + use arrow::array::{ArrayRef, Int32Array, StructArray}; + use arrow::datatypes::{DataType, Field}; + use arrow_array::RecordBatch; + use arrow_schema::Schema; + use datafusion::assert_batches_sorted_eq; + use datafusion::datasource::{MemTable, TableProvider}; + + /// A simple test which validates primitive writer version 1 tables should + /// not write Change Data Files + #[tokio::test] + async fn test_should_write_cdc_basic_table() { + let mut table = DeltaOps::new_in_memory() + .create() + .with_column( + "value", + DeltaDataType::Primitive(PrimitiveType::Integer), + true, + None, + ) + .await + .expect("Failed to make a table"); + table.load().await.expect("Failed to reload table"); + let result = should_write_cdc(table.snapshot().unwrap()).expect("Failed to use table"); + assert!(!result, "A default table should not create CDC files"); + } + + /// + /// This test manually creates a table with writer version 4 that has the configuration sets + /// + #[tokio::test] + async fn test_should_write_cdc_table_with_configuration() { + let actions = vec![Action::Protocol(Protocol::new(1, 4))]; + let mut table: DeltaTable = DeltaOps::new_in_memory() + .create() + .with_column( + "value", + DeltaDataType::Primitive(PrimitiveType::Integer), + true, + None, + ) + .with_actions(actions) + .with_configuration_property(TableProperty::EnableChangeDataFeed, Some("true")) + .await + .expect("failed to make a version 4 table with EnableChangeDataFeed"); + table.load().await.expect("Failed to reload table"); + + let result = should_write_cdc(table.snapshot().unwrap()).expect("Failed to use table"); + assert!( + result, + "A table with the EnableChangeDataFeed should create CDC files" + ); + } + + /// + /// This test creates a writer version 7 table which has a slightly different way of + /// determining whether CDC files should be written or not. 
+ #[tokio::test] + async fn test_should_write_cdc_v7_table_no_writer_feature() { + let actions = vec![Action::Protocol(Protocol::new(1, 7))]; + let mut table: DeltaTable = DeltaOps::new_in_memory() + .create() + .with_column( + "value", + DeltaDataType::Primitive(PrimitiveType::Integer), + true, + None, + ) + .with_actions(actions) + .await + .expect("failed to make a version 4 table with EnableChangeDataFeed"); + table.load().await.expect("Failed to reload table"); + + let result = should_write_cdc(table.snapshot().unwrap()).expect("Failed to use table"); + assert!( + !result, + "A v7 table must not write CDC files unless the writer feature is set" + ); + } + + /// + /// This test creates a writer version 7 table with a writer table feature enabled for CDC and + /// therefore should write CDC files + #[tokio::test] + async fn test_should_write_cdc_v7_table_with_writer_feature() { + let protocol = Protocol::new(1, 7) + .with_writer_features(vec![crate::kernel::WriterFeatures::ChangeDataFeed]); + let actions = vec![Action::Protocol(protocol)]; + let mut table: DeltaTable = DeltaOps::new_in_memory() + .create() + .with_column( + "value", + DeltaDataType::Primitive(PrimitiveType::Integer), + true, + None, + ) + .with_actions(actions) + .with_configuration_property(TableProperty::EnableChangeDataFeed, Some("true")) + .await + .expect("failed to make a version 4 table with EnableChangeDataFeed"); + table.load().await.expect("Failed to reload table"); + + let result = should_write_cdc(table.snapshot().unwrap()).expect("Failed to use table"); + assert!( + result, + "A v7 table must not write CDC files unless the writer feature is set" + ); + } + + #[tokio::test] + async fn test_sanity_check() { + let ctx = SessionContext::new(); + let schema = Arc::new(Schema::new(vec![Field::new( + "value", + DataType::Int32, + true, + )])); + + let batch = RecordBatch::try_new( + Arc::clone(&schema.clone()), + vec![Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)]))], + ) + .unwrap(); + let table_provider: Arc = + Arc::new(MemTable::try_new(schema.clone(), vec![vec![batch]]).unwrap()); + let source_df = ctx.read_table(table_provider).unwrap(); + + let updated_batch = RecordBatch::try_new( + Arc::clone(&schema.clone()), + vec![Arc::new(Int32Array::from(vec![Some(1), Some(12), Some(3)]))], + ) + .unwrap(); + let table_provider_updated: Arc = + Arc::new(MemTable::try_new(schema.clone(), vec![vec![updated_batch]]).unwrap()); + let updated_df = ctx.read_table(table_provider_updated).unwrap(); + + let tracker = CDCTracker::new(source_df, updated_df); + + match tracker.collect() { + Ok(df) => { + let batches = &df.collect().await.unwrap(); + let _ = arrow::util::pretty::print_batches(batches); + assert_eq!(batches.len(), 2); + assert_batches_sorted_eq! 
{[ + "+-------+------------------+", + "| value | _change_type |", + "+-------+------------------+", + "| 2 | update_preimage |", + "| 12 | update_postimage |", + "+-------+------------------+", + ], &batches } + } + Err(err) => { + println!("err: {err:#?}"); + panic!("Should have never reached this assertion"); + } + } + } + + #[tokio::test] + async fn test_sanity_check_with_pure_df() { + let nested_schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, true), + Field::new("lat", DataType::Int32, true), + Field::new("long", DataType::Int32, true), + ])); + let schema = Arc::new(Schema::new(vec![ + Field::new("value", DataType::Int32, true), + Field::new( + "nested", + DataType::Struct(nested_schema.fields.clone()), + true, + ), + ])); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)])), + Arc::new(StructArray::from(vec![ + ( + Arc::new(Field::new("id", DataType::Int32, true)), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + ), + ( + Arc::new(Field::new("lat", DataType::Int32, true)), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + ), + ( + Arc::new(Field::new("long", DataType::Int32, true)), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + ), + ])), + ], + ) + .unwrap(); + + let updated_batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(Int32Array::from(vec![Some(1), Some(12), Some(3)])), + Arc::new(StructArray::from(vec![ + ( + Arc::new(Field::new("id", DataType::Int32, true)), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + ), + ( + Arc::new(Field::new("lat", DataType::Int32, true)), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + ), + ( + Arc::new(Field::new("long", DataType::Int32, true)), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + ), + ])), + ], + ) + .unwrap(); + let _ = arrow::util::pretty::print_batches(&[batch.clone()]); + let _ = arrow::util::pretty::print_batches(&[updated_batch.clone()]); + + let ctx = SessionContext::new(); + let before = ctx.read_batch(batch).expect("Failed to make DataFrame"); + let after = ctx + .read_batch(updated_batch) + .expect("Failed to make DataFrame"); + + let diff = before + .except(after) + .expect("Failed to except") + .collect() + .await + .expect("Failed to diff"); + assert_eq!(diff.len(), 1); + } + + #[tokio::test] + async fn test_sanity_check_with_struct() { + let ctx = SessionContext::new(); + let nested_schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, true), + Field::new("lat", DataType::Int32, true), + Field::new("long", DataType::Int32, true), + ])); + let schema = Arc::new(Schema::new(vec![ + Field::new("value", DataType::Int32, true), + Field::new( + "nested", + DataType::Struct(nested_schema.fields.clone()), + true, + ), + ])); + + let batch = RecordBatch::try_new( + Arc::clone(&schema.clone()), + vec![ + Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)])), + Arc::new(StructArray::from(vec![ + ( + Arc::new(Field::new("id", DataType::Int32, true)), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + ), + ( + Arc::new(Field::new("lat", DataType::Int32, true)), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + ), + ( + Arc::new(Field::new("long", DataType::Int32, true)), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + ), + ])), + ], + ) + .unwrap(); + let table_provider: Arc = + Arc::new(MemTable::try_new(schema.clone(), vec![vec![batch]]).unwrap()); + let source_df = 
ctx.read_table(table_provider).unwrap(); + + let updated_batch = RecordBatch::try_new( + Arc::clone(&schema.clone()), + vec![ + Arc::new(Int32Array::from(vec![Some(1), Some(12), Some(3)])), + Arc::new(StructArray::from(vec![ + ( + Arc::new(Field::new("id", DataType::Int32, true)), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + ), + ( + Arc::new(Field::new("lat", DataType::Int32, true)), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + ), + ( + Arc::new(Field::new("long", DataType::Int32, true)), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + ), + ])), + ], + ) + .unwrap(); + let table_provider_updated: Arc = + Arc::new(MemTable::try_new(schema.clone(), vec![vec![updated_batch]]).unwrap()); + let updated_df = ctx.read_table(table_provider_updated).unwrap(); + + let tracker = CDCTracker::new(source_df, updated_df); + + match tracker.collect() { + Ok(df) => { + let batches = &df.collect().await.unwrap(); + let _ = arrow::util::pretty::print_batches(batches); + assert_eq!(batches.len(), 2); + assert_batches_sorted_eq! {[ + "+-------+--------------------------+------------------+", + "| value | nested | _change_type |", + "+-------+--------------------------+------------------+", + "| 12 | {id: 2, lat: 2, long: 2} | update_postimage |", + "| 2 | {id: 2, lat: 2, long: 2} | update_preimage |", + "+-------+--------------------------+------------------+", + ], &batches } + } + Err(err) => { + println!("err: {err:#?}"); + panic!("Should have never reached this assertion"); + } + } + } +} diff --git a/crates/core/src/operations/constraints.rs b/crates/core/src/operations/constraints.rs index e5d356f81c..2acf57a03d 100644 --- a/crates/core/src/operations/constraints.rs +++ b/crates/core/src/operations/constraints.rs @@ -4,9 +4,9 @@ use std::sync::Arc; use datafusion::execution::context::SessionState; use datafusion::execution::{SendableRecordBatchStream, TaskContext}; -use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::SessionContext; use datafusion_common::ToDFSchema; +use datafusion_physical_plan::ExecutionPlan; use futures::future::BoxFuture; use futures::StreamExt; @@ -89,6 +89,12 @@ impl std::future::IntoFuture for ConstraintBuilder { let this = self; Box::pin(async move { + if !this.snapshot.load_config().require_files { + return Err(DeltaTableError::NotInitializedWithFiles( + "ADD CONSTRAINTS".into(), + )); + } + let name = match this.name { Some(v) => v, None => return Err(DeltaTableError::Generic("No name provided".to_string())), diff --git a/crates/core/src/operations/convert_to_delta.rs b/crates/core/src/operations/convert_to_delta.rs index 2e157c38c0..148b581d8b 100644 --- a/crates/core/src/operations/convert_to_delta.rs +++ b/crates/core/src/operations/convert_to_delta.rs @@ -1,36 +1,32 @@ //! 
Command for converting a Parquet table to a Delta table in place // https://github.com/delta-io/delta/blob/1d5dd774111395b0c4dc1a69c94abc169b1c83b6/spark/src/main/scala/org/apache/spark/sql/delta/commands/ConvertToDeltaCommand.scala +use std::collections::{HashMap, HashSet}; +use std::num::TryFromIntError; +use std::str::{FromStr, Utf8Error}; +use std::sync::Arc; + +use arrow_schema::{ArrowError, Schema as ArrowSchema}; +use futures::future::{self, BoxFuture}; +use futures::TryStreamExt; +use indexmap::IndexMap; +use itertools::Itertools; +use parquet::arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStreamBuilder}; +use parquet::errors::ParquetError; +use percent_encoding::percent_decode_str; +use serde_json::{Map, Value}; +use tracing::debug; use crate::operations::get_num_idx_cols_and_stats_columns; use crate::{ - kernel::{Add, DataType, Schema, StructField}, + kernel::{scalars::ScalarExt, Add, DataType, Schema, StructField}, logstore::{LogStore, LogStoreRef}, operations::create::CreateBuilder, protocol::SaveMode, table::builder::ensure_table_uri, - table::config::DeltaConfigKey, + table::config::TableProperty, writer::stats::stats_from_parquet_metadata, DeltaResult, DeltaTable, DeltaTableError, ObjectStoreError, NULL_PARTITION_VALUE_DATA_PATH, }; -use arrow::{datatypes::Schema as ArrowSchema, error::ArrowError}; -use futures::{ - future::{self, BoxFuture}, - TryStreamExt, -}; -use indexmap::IndexMap; -use parquet::{ - arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStreamBuilder}, - errors::ParquetError, -}; -use percent_encoding::percent_decode_str; -use serde_json::{Map, Value}; -use std::{ - collections::{HashMap, HashSet}, - num::TryFromIntError, - str::{FromStr, Utf8Error}, - sync::Arc, -}; -use tracing::debug; /// Error converting a Parquet table to a Delta table #[derive(Debug, thiserror::Error)] @@ -52,7 +48,7 @@ enum Error { #[error("The schema of partition columns must be provided to convert a Parquet table to a Delta table")] MissingPartitionSchema, #[error("Partition column provided by the user does not exist in the parquet files")] - PartitionColumnNotExist(HashSet), + PartitionColumnNotExist, #[error("The given location is already a delta table location")] DeltaTableAlready, #[error("Location must be provided to convert a Parquet table to a Delta table")] @@ -104,7 +100,7 @@ pub struct ConvertToDeltaBuilder { log_store: Option, location: Option, storage_options: Option>, - partition_schema: HashSet, + partition_schema: HashMap, partition_strategy: PartitionStrategy, mode: SaveMode, name: Option, @@ -169,7 +165,10 @@ impl ConvertToDeltaBuilder { mut self, partition_schema: impl IntoIterator, ) -> Self { - self.partition_schema = HashSet::from_iter(partition_schema); + self.partition_schema = partition_schema + .into_iter() + .map(|f| (f.name.clone(), f)) + .collect(); self } @@ -213,7 +212,7 @@ impl ConvertToDeltaBuilder { /// Specify a table property in the table configuration pub fn with_configuration_property( mut self, - key: DeltaConfigKey, + key: TableProperty, value: Option>, ) -> Self { self.configuration @@ -240,6 +239,7 @@ impl ConvertToDeltaBuilder { crate::logstore::logstore_for( ensure_table_uri(location)?, self.storage_options.unwrap_or_default(), + None, // TODO: allow runtime to be passed into builder )? 
} else { return Err(Error::MissingLocation); @@ -276,12 +276,7 @@ impl ConvertToDeltaBuilder { let mut arrow_schemas = Vec::new(); let mut actions = Vec::new(); // partition columns that were defined by caller and are expected to apply on this table - let mut expected_partitions: HashMap = self - .partition_schema - .clone() - .into_iter() - .map(|field| (field.name.clone(), field)) - .collect(); + let mut expected_partitions: HashMap = self.partition_schema.clone(); // A HashSet of all unique partition columns in a Parquet table let mut partition_columns = HashSet::new(); // A vector of StructField of all unique partition columns in a Parquet table @@ -317,12 +312,14 @@ impl ConvertToDeltaBuilder { // Safety: we just checked that the key is present in the map let field = partition_schema_fields.get(key).unwrap(); let scalar = if value == NULL_PARTITION_VALUE_DATA_PATH { - Ok(crate::kernel::Scalar::Null(field.data_type().clone())) + Ok(delta_kernel::expressions::Scalar::Null( + field.data_type().clone(), + )) } else { let decoded = percent_decode_str(value).decode_utf8()?; match field.data_type() { DataType::Primitive(p) => p.parse_scalar(decoded.as_ref()), - _ => Err(crate::kernel::Error::Generic(format!( + _ => Err(delta_kernel::Error::Generic(format!( "Exprected primitive type, found: {:?}", field.data_type() ))), @@ -390,25 +387,19 @@ impl ConvertToDeltaBuilder { if !expected_partitions.is_empty() { // Partition column provided by the user does not exist in the parquet files - return Err(Error::PartitionColumnNotExist(self.partition_schema)); + return Err(Error::PartitionColumnNotExist); } // Merge parquet file schemas // This step is needed because timestamp will not be preserved when copying files in S3. We can't use the schema of the latest parqeut file as Delta table's schema - let mut schema_fields = Schema::try_from(&ArrowSchema::try_merge(arrow_schemas)?)? 
- .fields() - .clone(); - schema_fields.append( - &mut partition_schema_fields - .values() - .cloned() - .collect::>(), - ); + let schema = Schema::try_from(&ArrowSchema::try_merge(arrow_schemas)?)?; + let mut schema_fields = schema.fields().collect_vec(); + schema_fields.append(&mut partition_schema_fields.values().collect::>()); // Generate CreateBuilder with corresponding add actions, schemas and operation meta let mut builder = CreateBuilder::new() .with_log_store(log_store) - .with_columns(schema_fields) + .with_columns(schema_fields.into_iter().cloned()) .with_partition_columns(partition_columns.into_iter()) .with_actions(actions) .with_save_mode(self.mode) @@ -447,17 +438,20 @@ impl std::future::IntoFuture for ConvertToDeltaBuilder { #[cfg(test)] mod tests { + use std::fs; + + use delta_kernel::expressions::Scalar; + use itertools::Itertools; + use pretty_assertions::assert_eq; + use tempfile::tempdir; + use super::*; use crate::{ - kernel::{DataType, PrimitiveType, Scalar}, + kernel::{DataType, PrimitiveType}, open_table, storage::StorageOptions, Path, }; - use itertools::Itertools; - use pretty_assertions::assert_eq; - use std::fs; - use tempfile::tempdir; fn schema_field(key: &str, primitive: PrimitiveType, nullable: bool) -> StructField { StructField::new(key.to_string(), DataType::Primitive(primitive), nullable) @@ -484,7 +478,7 @@ mod tests { fn log_store(path: impl Into) -> LogStoreRef { let path: String = path.into(); let location = ensure_table_uri(path).expect("Failed to get the URI from the path"); - crate::logstore::logstore_for(location, StorageOptions::default()) + crate::logstore::logstore_for(location, StorageOptions::default(), None) .expect("Failed to create an object store") } @@ -563,7 +557,8 @@ mod tests { .get_schema() .expect("Failed to get schema") .fields() - .clone(); + .cloned() + .collect_vec(); schema_fields.sort_by(|a, b| a.name().cmp(b.name())); assert_eq!( schema_fields, expected_schema, @@ -603,14 +598,15 @@ mod tests { "part-00000-d22c627d-9655-4153-9527-f8995620fa42-c000.snappy.parquet" ); - let Some(Scalar::Struct(min_values, _)) = action.min_values() else { + let Some(Scalar::Struct(data)) = action.min_values() else { panic!("Missing min values"); }; - assert_eq!(min_values, vec![Scalar::Date(18628), Scalar::Integer(1)]); - let Some(Scalar::Struct(max_values, _)) = action.max_values() else { + assert_eq!(data.values(), vec![Scalar::Date(18628), Scalar::Integer(1)]); + + let Some(Scalar::Struct(data)) = action.max_values() else { panic!("Missing max values"); }; - assert_eq!(max_values, vec![Scalar::Date(18632), Scalar::Integer(5)]); + assert_eq!(data.values(), vec![Scalar::Date(18632), Scalar::Integer(5)]); assert_delta_table( table, diff --git a/crates/core/src/operations/create.rs b/crates/core/src/operations/create.rs index 53cab30c81..ad0413722e 100644 --- a/crates/core/src/operations/create.rs +++ b/crates/core/src/operations/create.rs @@ -4,9 +4,11 @@ use std::collections::HashMap; use std::sync::Arc; +use delta_kernel::schema::MetadataValue; use futures::future::BoxFuture; use maplit::hashset; use serde_json::Value; +use tracing::log::*; use super::transaction::{CommitBuilder, TableReference, PROTOCOL}; use crate::errors::{DeltaResult, DeltaTableError}; @@ -14,12 +16,9 @@ use crate::kernel::{ Action, DataType, Metadata, Protocol, ReaderFeatures, StructField, StructType, WriterFeatures, }; use crate::logstore::{LogStore, LogStoreRef}; -use crate::operations::set_tbl_properties::{ - apply_properties_to_protocol, 
convert_properties_to_features, -}; use crate::protocol::{DeltaOperation, SaveMode}; use crate::table::builder::ensure_table_uri; -use crate::table::config::DeltaConfigKey; +use crate::table::config::TableProperty; use crate::{DeltaTable, DeltaTableBuilder}; #[derive(thiserror::Error, Debug)] @@ -62,6 +61,7 @@ pub struct CreateBuilder { log_store: Option, configuration: HashMap>, metadata: Option>, + raise_if_key_not_exists: bool, } impl super::Operation<()> for CreateBuilder {} @@ -87,6 +87,7 @@ impl CreateBuilder { log_store: None, configuration: Default::default(), metadata: Default::default(), + raise_if_key_not_exists: true, } } @@ -126,7 +127,24 @@ impl CreateBuilder { ) -> Self { let mut field = StructField::new(name.into(), data_type, nullable); if let Some(meta) = metadata { - field = field.with_metadata(meta); + field = field.with_metadata(meta.iter().map(|(k, v)| { + ( + k, + if let Value::Number(n) = v { + n.as_i64().map_or_else( + || MetadataValue::String(v.to_string()), + |i| { + i32::try_from(i) + .ok() + .map(MetadataValue::Number) + .unwrap_or_else(|| MetadataValue::String(v.to_string())) + }, + ) + } else { + MetadataValue::String(v.to_string()) + }, + ) + })); }; self.columns.push(field); self @@ -176,7 +194,7 @@ impl CreateBuilder { /// Specify a table property in the table configuration pub fn with_configuration_property( mut self, - key: DeltaConfigKey, + key: TableProperty, value: Option>, ) -> Self { self.configuration @@ -196,6 +214,12 @@ impl CreateBuilder { self } + /// Specify whether to raise an error if the table properties in the configuration are not TablePropertys + pub fn with_raise_if_key_not_exists(mut self, raise_if_key_not_exists: bool) -> Self { + self.raise_if_key_not_exists = raise_if_key_not_exists; + self + } + /// Specify additional actions to be added to the commit. /// /// This method is mainly meant for internal use. Manually adding inconsistent @@ -242,8 +266,7 @@ impl CreateBuilder { }; let configuration = self.configuration; - let contains_timestampntz = PROTOCOL.contains_timestampntz(&self.columns); - + let contains_timestampntz = PROTOCOL.contains_timestampntz(self.columns.iter()); // TODO configure more permissive versions based on configuration. Also how should this ideally be handled? // We set the lowest protocol we can, and if subsequent writes use newer features we update metadata? 
@@ -273,16 +296,15 @@ impl CreateBuilder { }) .unwrap_or_else(|| current_protocol); - let protocol = apply_properties_to_protocol( - &protocol, + let protocol = protocol.apply_properties_to_protocol( &configuration .iter() .map(|(k, v)| (k.clone(), v.clone().unwrap())) .collect::>(), - true, + self.raise_if_key_not_exists, )?; - let protocol = convert_properties_to_features(protocol, &configuration); + let protocol = protocol.move_table_properties_into_features(&configuration); let mut metadata = Metadata::try_new( StructType::new(self.columns), @@ -372,7 +394,7 @@ impl std::future::IntoFuture for CreateBuilder { mod tests { use super::*; use crate::operations::DeltaOps; - use crate::table::config::DeltaConfigKey; + use crate::table::config::TableProperty; use crate::writer::test_utils::{get_delta_schema, get_record_batch}; use tempfile::TempDir; @@ -382,7 +404,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() - .with_columns(table_schema.fields().clone()) + .with_columns(table_schema.fields().cloned()) .with_save_mode(SaveMode::Ignore) .await .unwrap(); @@ -402,7 +424,7 @@ mod tests { .await .unwrap() .create() - .with_columns(table_schema.fields().clone()) + .with_columns(table_schema.fields().cloned()) .with_save_mode(SaveMode::Ignore) .await .unwrap(); @@ -420,7 +442,7 @@ mod tests { ); let table = CreateBuilder::new() .with_location(format!("./{relative_path}")) - .with_columns(schema.fields().clone()) + .with_columns(schema.fields().cloned()) .await .unwrap(); assert_eq!(table.version(), 0); @@ -431,7 +453,7 @@ mod tests { let schema = get_delta_schema(); let table = CreateBuilder::new() .with_location("memory://") - .with_columns(schema.fields().clone()) + .with_columns(schema.fields().cloned()) .await .unwrap(); assert_eq!(table.version(), 0); @@ -454,7 +476,7 @@ mod tests { }; let table = CreateBuilder::new() .with_location("memory://") - .with_columns(schema.fields().clone()) + .with_columns(schema.fields().cloned()) .with_actions(vec![Action::Protocol(protocol)]) .await .unwrap(); @@ -463,15 +485,15 @@ mod tests { let table = CreateBuilder::new() .with_location("memory://") - .with_columns(schema.fields().clone()) - .with_configuration_property(DeltaConfigKey::AppendOnly, Some("true")) + .with_columns(schema.fields().cloned()) + .with_configuration_property(TableProperty::AppendOnly, Some("true")) .await .unwrap(); let append = table .metadata() .unwrap() .configuration - .get(DeltaConfigKey::AppendOnly.as_ref()) + .get(TableProperty::AppendOnly.as_ref()) .unwrap() .as_ref() .unwrap() @@ -486,7 +508,7 @@ mod tests { let schema = get_delta_schema(); let table = CreateBuilder::new() .with_location(tmp_dir.path().to_str().unwrap()) - .with_columns(schema.fields().clone()) + .with_columns(schema.fields().cloned()) .await .unwrap(); assert_eq!(table.version(), 0); @@ -497,7 +519,7 @@ mod tests { // Check an error is raised when a table exists at location let table = CreateBuilder::new() .with_log_store(log_store.clone()) - .with_columns(schema.fields().clone()) + .with_columns(schema.fields().cloned()) .with_save_mode(SaveMode::ErrorIfExists) .await; assert!(table.is_err()); @@ -505,7 +527,7 @@ mod tests { // Check current table is returned when ignore option is chosen. 
let table = CreateBuilder::new() .with_log_store(log_store.clone()) - .with_columns(schema.fields().clone()) + .with_columns(schema.fields().cloned()) .with_save_mode(SaveMode::Ignore) .await .unwrap(); @@ -514,7 +536,7 @@ mod tests { // Check table is overwritten let table = CreateBuilder::new() .with_log_store(log_store) - .with_columns(schema.fields().iter().cloned()) + .with_columns(schema.fields().cloned()) .with_save_mode(SaveMode::Overwrite) .await .unwrap(); @@ -535,7 +557,7 @@ mod tests { let mut table = DeltaOps(table) .create() - .with_columns(schema.fields().iter().cloned()) + .with_columns(schema.fields().cloned()) .with_save_mode(SaveMode::Overwrite) .await .unwrap(); @@ -559,7 +581,7 @@ mod tests { let mut table = DeltaOps(table) .create() - .with_columns(schema.fields().iter().cloned()) + .with_columns(schema.fields().cloned()) .with_save_mode(SaveMode::Overwrite) .with_partition_columns(vec!["id"]) .await @@ -569,4 +591,43 @@ mod tests { // Checks if files got removed after overwrite assert_eq!(table.get_files_count(), 0); } + + #[tokio::test] + async fn test_create_table_metadata_raise_if_key_not_exists() { + let schema = get_delta_schema(); + let config: HashMap> = + vec![("key".to_string(), Some("value".to_string()))] + .into_iter() + .collect(); + + // Fail to create table with unknown Delta key + let table = CreateBuilder::new() + .with_location("memory://") + .with_columns(schema.fields().cloned()) + .with_configuration(config.clone()) + .await; + assert!(table.is_err()); + + // Succeed in creating table with unknown Delta key since we set raise_if_key_not_exists to false + let table = CreateBuilder::new() + .with_location("memory://") + .with_columns(schema.fields().cloned()) + .with_raise_if_key_not_exists(false) + .with_configuration(config) + .await; + assert!(table.is_ok()); + + // Ensure the non-Delta key was set correctly + let value = table + .unwrap() + .metadata() + .unwrap() + .configuration + .get("key") + .unwrap() + .as_ref() + .unwrap() + .clone(); + assert_eq!(String::from("value"), value); + } } diff --git a/crates/core/src/operations/delete.rs b/crates/core/src/operations/delete.rs index bf17ed6085..7dc58b5929 100644 --- a/crates/core/src/operations/delete.rs +++ b/crates/core/src/operations/delete.rs @@ -17,35 +17,47 @@ //! .await?; //! 
```` -use core::panic; -use std::sync::Arc; -use std::time::{Instant, SystemTime, UNIX_EPOCH}; - -use crate::logstore::LogStoreRef; +use async_trait::async_trait; +use datafusion::dataframe::DataFrame; +use datafusion::datasource::provider_as_source; +use datafusion::error::Result as DataFusionResult; use datafusion::execution::context::{SessionContext, SessionState}; -use datafusion::physical_plan::filter::FilterExec; -use datafusion::physical_plan::ExecutionPlan; +use datafusion::execution::session_state::SessionStateBuilder; +use datafusion::physical_planner::{ExtensionPlanner, PhysicalPlanner}; use datafusion::prelude::Expr; -use datafusion_common::scalar::ScalarValue; -use datafusion_common::DFSchema; +use datafusion_common::ScalarValue; +use datafusion_expr::{lit, Extension, LogicalPlan, LogicalPlanBuilder, UserDefinedLogicalNode}; +use datafusion_physical_plan::metrics::MetricBuilder; +use datafusion_physical_plan::ExecutionPlan; + use futures::future::BoxFuture; +use std::sync::Arc; +use std::time::{Instant, SystemTime, UNIX_EPOCH}; + use parquet::file::properties::WriterProperties; use serde::Serialize; +use super::cdc::should_write_cdc; use super::datafusion_utils::Expression; use super::transaction::{CommitBuilder, CommitProperties, PROTOCOL}; -use super::write::WriterStatsConfig; use crate::delta_datafusion::expr::fmt_expr_to_sql; +use crate::delta_datafusion::logical::MetricObserver; +use crate::delta_datafusion::physical::{find_metric_node, get_metric, MetricObserverExec}; +use crate::delta_datafusion::planner::DeltaPlanner; use crate::delta_datafusion::{ - create_physical_expr_fix, find_files, register_store, DataFusionMixins, DeltaScanBuilder, - DeltaSessionContext, + find_files, register_store, DataFusionMixins, DeltaScanConfigBuilder, DeltaSessionContext, + DeltaTableProvider, }; use crate::errors::DeltaResult; use crate::kernel::{Action, Add, Remove}; -use crate::operations::write::write_execution_plan; +use crate::logstore::LogStoreRef; +use crate::operations::write::{write_execution_plan, write_execution_plan_cdc, WriterStatsConfig}; use crate::protocol::DeltaOperation; use crate::table::state::DeltaTableState; -use crate::DeltaTable; +use crate::{DeltaTable, DeltaTableError}; + +const SOURCE_COUNT_ID: &str = "delete_source_count"; +const SOURCE_COUNT_METRIC: &str = "num_source_rows"; /// Delete Records from the Delta Table. 
/// See this module's documentation for more information @@ -72,15 +84,15 @@ pub struct DeleteMetrics { /// Number of files removed pub num_removed_files: usize, /// Number of rows removed - pub num_deleted_rows: Option, + pub num_deleted_rows: usize, /// Number of rows copied in the process of deleting files - pub num_copied_rows: Option, + pub num_copied_rows: usize, /// Time taken to execute the entire operation - pub execution_time_ms: u128, + pub execution_time_ms: u64, /// Time taken to scan the file for matches - pub scan_time_ms: u128, + pub scan_time_ms: u64, /// Time taken to rewrite the matched files - pub rewrite_time_ms: u128, + pub rewrite_time_ms: u64, } impl super::Operation<()> for DeleteBuilder {} @@ -123,36 +135,81 @@ impl DeleteBuilder { } } +#[derive(Clone)] +struct DeleteMetricExtensionPlanner {} + +#[async_trait] +impl ExtensionPlanner for DeleteMetricExtensionPlanner { + async fn plan_extension( + &self, + _planner: &dyn PhysicalPlanner, + node: &dyn UserDefinedLogicalNode, + _logical_inputs: &[&LogicalPlan], + physical_inputs: &[Arc], + _session_state: &SessionState, + ) -> DataFusionResult>> { + if let Some(metric_observer) = node.as_any().downcast_ref::() { + if metric_observer.id.eq(SOURCE_COUNT_ID) { + return Ok(Some(MetricObserverExec::try_new( + SOURCE_COUNT_ID.into(), + physical_inputs, + |batch, metrics| { + MetricBuilder::new(metrics) + .global_counter(SOURCE_COUNT_METRIC) + .add(batch.num_rows()); + }, + )?)); + } + } + Ok(None) + } +} + +#[allow(clippy::too_many_arguments)] async fn excute_non_empty_expr( snapshot: &DeltaTableState, log_store: LogStoreRef, state: &SessionState, expression: &Expr, - metrics: &mut DeleteMetrics, rewrite: &[Add], + metrics: &mut DeleteMetrics, writer_properties: Option, -) -> DeltaResult> { + partition_scan: bool, +) -> DeltaResult> { // For each identified file perform a parquet scan + filter + limit (1) + count. // If returned count is not zero then append the file to be rewritten and removed from the log. Otherwise do nothing to the file. + let mut actions: Vec = Vec::new(); + let table_partition_cols = snapshot.metadata().partition_columns.clone(); - let input_schema = snapshot.input_schema()?; - let input_dfschema: DFSchema = input_schema.clone().as_ref().clone().try_into()?; + let delete_planner = DeltaPlanner:: { + extension_planner: DeleteMetricExtensionPlanner {}, + }; - let table_partition_cols = snapshot.metadata().partition_columns.clone(); + let state = SessionStateBuilder::new_from_existing(state.clone()) + .with_query_planner(Arc::new(delete_planner)) + .build(); - let scan = DeltaScanBuilder::new(snapshot, log_store.clone(), state) - .with_files(rewrite) - .build() - .await?; - let scan = Arc::new(scan); + let scan_config = DeltaScanConfigBuilder::default() + .with_file_column(false) + .with_schema(snapshot.input_schema()?) + .build(snapshot)?; + + let target_provider = Arc::new( + DeltaTableProvider::try_new(snapshot.clone(), log_store.clone(), scan_config.clone())? 
+ .with_files(rewrite.to_vec()), + ); + let target_provider = provider_as_source(target_provider); + let source = LogicalPlanBuilder::scan("target", target_provider.clone(), None)?.build()?; - // Apply the negation of the filter and rewrite files - let negated_expression = Expr::Not(Box::new(Expr::IsTrue(Box::new(expression.clone())))); + let source = LogicalPlan::Extension(Extension { + node: Arc::new(MetricObserver { + id: "delete_source_count".into(), + input: source, + enable_pushdown: false, + }), + }); - let predicate_expr = - create_physical_expr_fix(negated_expression, &input_dfschema, state.execution_props())?; - let filter: Arc = - Arc::new(FilterExec::try_new(predicate_expr, scan.clone())?); + let df = DataFrame::new(state.clone(), source); let writer_stats_config = WriterStatsConfig::new( snapshot.table_config().num_indexed_cols(), @@ -162,35 +219,70 @@ async fn excute_non_empty_expr( .map(|v| v.iter().map(|v| v.to_string()).collect::>()), ); - let add_actions = write_execution_plan( - Some(snapshot), - state.clone(), - filter.clone(), - table_partition_cols.clone(), - log_store.object_store(), - Some(snapshot.table_config().target_file_size() as usize), - None, - writer_properties, - false, - None, - writer_stats_config, - ) - .await? - .into_iter() - .map(|a| match a { - Action::Add(a) => a, - _ => panic!("Expected Add action"), - }) - .collect::>(); - - let read_records = scan.parquet_scan.metrics().and_then(|m| m.output_rows()); - let filter_records = filter.metrics().and_then(|m| m.output_rows()); - metrics.num_copied_rows = filter_records; - metrics.num_deleted_rows = read_records - .zip(filter_records) - .map(|(read, filter)| read - filter); - - Ok(add_actions) + if !partition_scan { + // Apply the negation of the filter and rewrite files + let negated_expression = Expr::Not(Box::new(Expr::IsTrue(Box::new(expression.clone())))); + + let filter = df + .clone() + .filter(negated_expression)? + .create_physical_plan() + .await?; + + let add_actions: Vec = write_execution_plan( + Some(snapshot), + state.clone(), + filter.clone(), + table_partition_cols.clone(), + log_store.object_store(), + Some(snapshot.table_config().target_file_size() as usize), + None, + writer_properties.clone(), + writer_stats_config.clone(), + None, + ) + .await?; + + actions.extend(add_actions); + + let source_count = find_metric_node(SOURCE_COUNT_ID, &filter).ok_or_else(|| { + DeltaTableError::Generic("Unable to locate expected metric node".into()) + })?; + let source_count_metrics = source_count.metrics().unwrap(); + let read_records = get_metric(&source_count_metrics, SOURCE_COUNT_METRIC); + let filter_records = filter.metrics().and_then(|m| m.output_rows()).unwrap_or(0); + + metrics.num_copied_rows = filter_records; + metrics.num_deleted_rows = read_records - filter_records; + } + + // CDC logic, simply filters data with predicate and adds the _change_type="delete" as literal column + if let Ok(true) = should_write_cdc(snapshot) { + // Create CDC scan + let change_type_lit = lit(ScalarValue::Utf8(Some("delete".to_string()))); + let cdc_filter = df + .filter(expression.clone())? + .with_column("_change_type", change_type_lit)? 
+ .create_physical_plan() + .await?; + + let cdc_actions = write_execution_plan_cdc( + Some(snapshot), + state.clone(), + cdc_filter, + table_partition_cols.clone(), + log_store.object_store(), + Some(snapshot.table_config().target_file_size() as usize), + None, + writer_properties, + writer_stats_config, + None, + ) + .await?; + actions.extend(cdc_actions) + } + + Ok(actions) } async fn execute( @@ -201,30 +293,33 @@ async fn execute( writer_properties: Option, mut commit_properties: CommitProperties, ) -> DeltaResult<(DeltaTableState, DeleteMetrics)> { + if !&snapshot.load_config().require_files { + return Err(DeltaTableError::NotInitializedWithFiles("DELETE".into())); + } + let exec_start = Instant::now(); let mut metrics = DeleteMetrics::default(); let scan_start = Instant::now(); let candidates = find_files(&snapshot, log_store.clone(), &state, predicate.clone()).await?; - metrics.scan_time_ms = Instant::now().duration_since(scan_start).as_millis(); + metrics.scan_time_ms = Instant::now().duration_since(scan_start).as_millis() as u64; let predicate = predicate.unwrap_or(Expr::Literal(ScalarValue::Boolean(Some(true)))); - let add = if candidates.partition_scan { - Vec::new() - } else { + let mut actions = { let write_start = Instant::now(); let add = excute_non_empty_expr( &snapshot, log_store.clone(), &state, &predicate, - &mut metrics, &candidates.candidates, + &mut metrics, writer_properties, + candidates.partition_scan, ) .await?; - metrics.rewrite_time_ms = Instant::now().duration_since(write_start).as_millis(); + metrics.rewrite_time_ms = Instant::now().duration_since(write_start).as_millis() as u64; add }; let remove = candidates.candidates; @@ -234,7 +329,6 @@ async fn execute( .unwrap() .as_millis() as i64; - let mut actions: Vec = add.into_iter().map(Action::Add).collect(); metrics.num_removed_files = remove.len(); metrics.num_added_files = actions.len(); @@ -253,7 +347,7 @@ async fn execute( })) } - metrics.execution_time_ms = Instant::now().duration_since(exec_start).as_millis(); + metrics.execution_time_ms = Instant::now().duration_since(exec_start).as_millis() as u64; commit_properties .app_metadata @@ -328,6 +422,9 @@ impl std::future::IntoFuture for DeleteBuilder { #[cfg(test)] mod tests { + use crate::delta_datafusion::cdf::DeltaCdfScan; + use crate::kernel::DataType as DeltaDataType; + use crate::operations::collect_sendable_stream; use crate::operations::DeltaOps; use crate::protocol::*; use crate::writer::test_utils::datafusion::get_data; @@ -335,17 +432,21 @@ mod tests { use crate::writer::test_utils::{ get_arrow_schema, get_delta_schema, get_record_batch, setup_table_with_configuration, }; - use crate::DeltaConfigKey; use crate::DeltaTable; + use crate::TableProperty; use arrow::array::Int32Array; use arrow::datatypes::{Field, Schema}; use arrow::record_batch::RecordBatch; use arrow_array::ArrayRef; + use arrow_array::StringArray; use arrow_array::StructArray; use arrow_buffer::NullBuffer; + use arrow_schema::DataType; use arrow_schema::Fields; use datafusion::assert_batches_sorted_eq; + use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::*; + use delta_kernel::schema::PrimitiveType; use serde_json::json; use std::sync::Arc; @@ -354,7 +455,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() - .with_columns(table_schema.fields().clone()) + .with_columns(table_schema.fields().cloned()) .with_partition_columns(partitions.unwrap_or_default()) .await .unwrap(); @@ -364,7 +465,7 @@ mod tests { #[tokio::test] async fn 
test_delete_when_delta_table_is_append_only() { - let table = setup_table_with_configuration(DeltaConfigKey::AppendOnly, Some("true")).await; + let table = setup_table_with_configuration(TableProperty::AppendOnly, Some("true")).await; let batch = get_record_batch(None, false); // append some data let table = write_batch(table, batch).await; @@ -409,8 +510,8 @@ mod tests { assert_eq!(table.get_files_count(), 0); assert_eq!(metrics.num_added_files, 0); assert_eq!(metrics.num_removed_files, 1); - assert_eq!(metrics.num_deleted_rows, None); - assert_eq!(metrics.num_copied_rows, None); + assert_eq!(metrics.num_deleted_rows, 0); + assert_eq!(metrics.num_copied_rows, 0); let commit_info = table.history(None).await.unwrap(); let last_commit = &commit_info[0]; @@ -420,16 +521,13 @@ mod tests { // serde_json::to_value(&metrics).unwrap() // ); - // rewrite is not required - assert_eq!(metrics.rewrite_time_ms, 0); - // Deletes with no changes to state must not commit let (table, metrics) = DeltaOps(table).delete().await.unwrap(); assert_eq!(table.version(), 2); assert_eq!(metrics.num_added_files, 0); assert_eq!(metrics.num_removed_files, 0); - assert_eq!(metrics.num_deleted_rows, None); - assert_eq!(metrics.num_copied_rows, None); + assert_eq!(metrics.num_deleted_rows, 0); + assert_eq!(metrics.num_copied_rows, 0); } #[tokio::test] @@ -500,8 +598,8 @@ mod tests { assert_eq!(metrics.num_added_files, 1); assert_eq!(metrics.num_removed_files, 1); assert!(metrics.scan_time_ms > 0); - assert_eq!(metrics.num_deleted_rows, Some(1)); - assert_eq!(metrics.num_copied_rows, Some(3)); + assert_eq!(metrics.num_deleted_rows, 1); + assert_eq!(metrics.num_copied_rows, 3); let commit_info = table.history(None).await.unwrap(); let last_commit = &commit_info[0]; @@ -655,10 +753,9 @@ mod tests { assert_eq!(metrics.num_added_files, 0); assert_eq!(metrics.num_removed_files, 1); - assert_eq!(metrics.num_deleted_rows, None); - assert_eq!(metrics.num_copied_rows, None); + assert_eq!(metrics.num_deleted_rows, 0); + assert_eq!(metrics.num_copied_rows, 0); assert!(metrics.scan_time_ms > 0); - assert_eq!(metrics.rewrite_time_ms, 0); let expected = vec![ "+----+-------+------------+", @@ -717,8 +814,8 @@ mod tests { assert_eq!(metrics.num_added_files, 0); assert_eq!(metrics.num_removed_files, 1); - assert_eq!(metrics.num_deleted_rows, Some(1)); - assert_eq!(metrics.num_copied_rows, Some(0)); + assert_eq!(metrics.num_deleted_rows, 1); + assert_eq!(metrics.num_copied_rows, 0); assert!(metrics.scan_time_ms > 0); let expected = [ @@ -801,4 +898,174 @@ mod tests { .await; assert!(res.is_err()); } + + #[tokio::test] + async fn test_delete_cdc_enabled() { + let table: DeltaTable = DeltaOps::new_in_memory() + .create() + .with_column( + "value", + DeltaDataType::Primitive(PrimitiveType::Integer), + true, + None, + ) + .with_configuration_property(TableProperty::EnableChangeDataFeed, Some("true")) + .await + .unwrap(); + assert_eq!(table.version(), 0); + + let schema = Arc::new(Schema::new(vec![Field::new( + "value", + arrow::datatypes::DataType::Int32, + true, + )])); + + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)]))], + ) + .unwrap(); + let table = DeltaOps(table) + .write(vec![batch]) + .await + .expect("Failed to write first batch"); + assert_eq!(table.version(), 1); + + let (table, _metrics) = DeltaOps(table) + .delete() + .with_predicate(col("value").eq(lit(2))) + .await + .unwrap(); + assert_eq!(table.version(), 2); + + let ctx = SessionContext::new(); + 
let table = DeltaOps(table) + .load_cdf() + .with_session_ctx(ctx.clone()) + .with_starting_version(0) + .build() + .await + .expect("Failed to load CDF"); + + let mut batches = collect_batches( + table.properties().output_partitioning().partition_count(), + table, + ctx, + ) + .await + .expect("Failed to collect batches"); + + // The batches will contain a current _commit_timestamp which shouldn't be check_append_only + let _: Vec<_> = batches.iter_mut().map(|b| b.remove_column(3)).collect(); + + assert_batches_sorted_eq! {[ + "+-------+--------------+-----------------+", + "| value | _change_type | _commit_version |", + "+-------+--------------+-----------------+", + "| 1 | insert | 1 |", + "| 2 | delete | 2 |", + "| 2 | insert | 1 |", + "| 3 | insert | 1 |", + "+-------+--------------+-----------------+", + ], &batches } + } + + #[tokio::test] + async fn test_delete_cdc_enabled_partitioned() { + let table: DeltaTable = DeltaOps::new_in_memory() + .create() + .with_column( + "year", + DeltaDataType::Primitive(PrimitiveType::String), + true, + None, + ) + .with_column( + "value", + DeltaDataType::Primitive(PrimitiveType::Integer), + true, + None, + ) + .with_partition_columns(vec!["year"]) + .with_configuration_property(TableProperty::EnableChangeDataFeed, Some("true")) + .await + .unwrap(); + assert_eq!(table.version(), 0); + + let schema = Arc::new(Schema::new(vec![ + Field::new("year", DataType::Utf8, true), + Field::new("value", DataType::Int32, true), + ])); + + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(StringArray::from(vec![ + Some("2020"), + Some("2020"), + Some("2024"), + ])), + Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)])), + ], + ) + .unwrap(); + + let table = DeltaOps(table) + .write(vec![batch]) + .await + .expect("Failed to write first batch"); + assert_eq!(table.version(), 1); + + let (table, _metrics) = DeltaOps(table) + .delete() + .with_predicate(col("value").eq(lit(2))) + .await + .unwrap(); + assert_eq!(table.version(), 2); + + let ctx = SessionContext::new(); + let table = DeltaOps(table) + .load_cdf() + .with_session_ctx(ctx.clone()) + .with_starting_version(0) + .build() + .await + .expect("Failed to load CDF"); + + let mut batches = collect_batches( + table.properties().output_partitioning().partition_count(), + table, + ctx, + ) + .await + .expect("Failed to collect batches"); + + // The batches will contain a current _commit_timestamp which shouldn't be check_append_only + let _: Vec<_> = batches.iter_mut().map(|b| b.remove_column(3)).collect(); + + assert_batches_sorted_eq! 
{[ + "+-------+--------------+-----------------+------+", + "| value | _change_type | _commit_version | year |", + "+-------+--------------+-----------------+------+", + "| 1 | insert | 1 | 2020 |", + "| 2 | delete | 2 | 2020 |", + "| 2 | insert | 1 | 2020 |", + "| 3 | insert | 1 | 2024 |", + "+-------+--------------+-----------------+------+", + ], &batches } + } + + async fn collect_batches( + num_partitions: usize, + stream: DeltaCdfScan, + ctx: SessionContext, + ) -> Result, Box> { + let mut batches = vec![]; + for p in 0..num_partitions { + let data: Vec = + collect_sendable_stream(stream.execute(p, ctx.task_ctx())?).await?; + batches.extend_from_slice(&data); + } + Ok(batches) + } } diff --git a/crates/core/src/operations/filesystem_check.rs b/crates/core/src/operations/filesystem_check.rs index 44fa84d29a..6129c1cde3 100644 --- a/crates/core/src/operations/filesystem_check.rs +++ b/crates/core/src/operations/filesystem_check.rs @@ -24,6 +24,7 @@ use object_store::ObjectStore; use serde::Serialize; use url::{ParseError, Url}; +use super::transaction::{CommitBuilder, CommitProperties}; use crate::errors::{DeltaResult, DeltaTableError}; use crate::kernel::{Action, Add, Remove}; use crate::logstore::LogStoreRef; @@ -31,9 +32,6 @@ use crate::protocol::DeltaOperation; use crate::table::state::DeltaTableState; use crate::DeltaTable; -use super::transaction::CommitBuilder; -use super::transaction::CommitProperties; - /// Audit the Delta Table's active files with the underlying file system. /// See this module's documentation for more information #[derive(Debug)] @@ -102,7 +100,7 @@ impl FileSystemCheckBuilder { async fn create_fsck_plan(&self) -> DeltaResult { let mut files_relative: HashMap = - HashMap::with_capacity(self.snapshot.file_actions()?.len()); + HashMap::with_capacity(self.snapshot.files_count()); let log_store = self.log_store.clone(); for active in self.snapshot.file_actions_iter()? 
{ diff --git a/crates/core/src/operations/load.rs b/crates/core/src/operations/load.rs index 4bf439cd0d..930b5d48ec 100644 --- a/crates/core/src/operations/load.rs +++ b/crates/core/src/operations/load.rs @@ -51,6 +51,9 @@ impl std::future::IntoFuture for LoadBuilder { Box::pin(async move { PROTOCOL.can_read_from(&this.snapshot.snapshot)?; + if !this.snapshot.load_config().require_files { + return Err(DeltaTableError::NotInitializedWithFiles("reading".into())); + } let table = DeltaTable::new_with_state(this.log_store, this.snapshot); let schema = table.snapshot()?.arrow_schema()?; diff --git a/crates/core/src/operations/load_cdf.rs b/crates/core/src/operations/load_cdf.rs index 4f3a4bdbd6..ad2986de80 100644 --- a/crates/core/src/operations/load_cdf.rs +++ b/crates/core/src/operations/load_cdf.rs @@ -3,24 +3,28 @@ use std::sync::Arc; use std::time::SystemTime; +use arrow_array::RecordBatch; use arrow_schema::{ArrowError, Field}; use chrono::{DateTime, Utc}; use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::file_format::FileFormat; use datafusion::datasource::physical_plan::FileScanConfig; -use datafusion::physical_plan::union::UnionExec; -use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::SessionContext; use datafusion_common::{ScalarValue, Statistics}; +use datafusion_physical_expr::expressions; +use datafusion_physical_expr::PhysicalExpr; +use datafusion_physical_plan::projection::ProjectionExec; +use datafusion_physical_plan::union::UnionExec; +use datafusion_physical_plan::ExecutionPlan; use tracing::log; -use crate::delta_datafusion::cdf::*; use crate::delta_datafusion::{register_store, DataFusionMixins}; use crate::errors::DeltaResult; use crate::kernel::{Action, Add, AddCDCFile, CommitInfo}; use crate::logstore::{get_actions, LogStoreRef}; use crate::table::state::DeltaTableState; use crate::DeltaTableError; +use crate::{delta_datafusion::cdf::*, kernel::Remove}; /// Builder for create a read of change data feeds for delta tables #[derive(Clone)] @@ -29,6 +33,8 @@ pub struct CdfLoadBuilder { snapshot: DeltaTableState, /// Delta object store for handling data files log_store: LogStoreRef, + /// Columns to project + columns: Option>, /// Version to read from starting_version: i64, /// Version to stop reading at @@ -47,6 +53,7 @@ impl CdfLoadBuilder { Self { snapshot, log_store, + columns: None, starting_version: 0, ending_version: None, starting_timestamp: None, @@ -85,13 +92,23 @@ impl CdfLoadBuilder { self } + /// Columns to select + pub fn with_columns(mut self, columns: Vec) -> Self { + self.columns = Some(columns); + self + } + /// This is a rust version of https://github.com/delta-io/delta/blob/master/spark/src/main/scala/org/apache/spark/sql/delta/commands/cdc/CDCReader.scala#L418 /// Which iterates through versions of the delta table collects the relevant actions / commit info and returns those /// groupings for later use. The scala implementation has a lot more edge case handling and read schema checking (and just error checking in general) /// than I have right now. I plan to extend the checks once we have a stable state of the initial implementation. 
async fn determine_files_to_read( &self, - ) -> DeltaResult<(Vec>, Vec>)> { + ) -> DeltaResult<( + Vec>, + Vec>, + Vec>, + )> { let start = self.starting_version; let end = self .ending_version @@ -113,8 +130,9 @@ impl CdfLoadBuilder { ); log::debug!("starting version = {}, ending version = {:?}", start, end); - let mut change_files = vec![]; - let mut add_files = vec![]; + let mut change_files: Vec> = vec![]; + let mut add_files: Vec> = vec![]; + let mut remove_files: Vec> = vec![]; for version in start..=end { let snapshot_bytes = self @@ -128,6 +146,8 @@ impl CdfLoadBuilder { let mut cdc_actions = vec![]; if self.starting_timestamp.is_some() || self.ending_timestamp.is_some() { + // TODO: fallback on other actions for timestamps because CommitInfo action is optional + // theoretically. let version_commit = version_actions .iter() .find(|a| matches!(a, Action::CommitInfo(_))); @@ -188,6 +208,14 @@ impl CdfLoadBuilder { }) .collect::>(); + let remove_actions = version_actions + .iter() + .filter_map(|r| match r { + Action::Remove(r) if r.data_change => Some(r.clone()), + _ => None, + }) + .collect::>(); + if !add_actions.is_empty() { log::debug!( "Located {} cdf actions for version: {}", @@ -196,10 +224,19 @@ impl CdfLoadBuilder { ); add_files.push(CdcDataSpec::new(version, ts, add_actions)); } + + if !remove_actions.is_empty() { + log::debug!( + "Located {} cdf actions for version: {}", + remove_actions.len(), + version + ); + remove_files.push(CdcDataSpec::new(version, ts, remove_actions)); + } } } - Ok((change_files, add_files)) + Ok((change_files, add_files, remove_files)) } #[inline] @@ -207,20 +244,24 @@ impl CdfLoadBuilder { Some(ScalarValue::Utf8(Some(String::from("insert")))) } + fn get_remove_action_type() -> Option { + Some(ScalarValue::Utf8(Some(String::from("delete")))) + } + /// Executes the scan pub async fn build(&self) -> DeltaResult { - let (cdc, add) = self.determine_files_to_read().await?; + let (cdc, add, remove) = self.determine_files_to_read().await?; register_store( self.log_store.clone(), self.ctx.state().runtime_env().clone(), ); let partition_values = self.snapshot.metadata().partition_columns.clone(); - let schema = self.snapshot.arrow_schema()?; - let schema_fields: Vec = self + let schema = self.snapshot.input_schema()?; + let schema_fields: Vec> = self .snapshot - .arrow_schema()? - .all_fields() + .input_schema()? + .fields() .into_iter() .filter(|f| !partition_values.contains(f.name())) .cloned() @@ -234,16 +275,16 @@ impl CdfLoadBuilder { // Setup for the Read Schemas of each kind of file, CDC files include commit action type so they need a slightly // different schema than standard add file reads let cdc_file_schema = create_cdc_schema(schema_fields.clone(), true); - let add_file_schema = create_cdc_schema(schema_fields, false); + let add_remove_file_schema = create_cdc_schema(schema_fields, false); // Set up the mapping of partition columns to be projected into the final output batch // cdc for example has timestamp, version, and any table partitions mapped here. 
// add on the other hand has action type, timestamp, version and any additional table partitions because adds do // not include their actions let mut cdc_partition_cols = CDC_PARTITION_SCHEMA.clone(); - let mut add_partition_cols = ADD_PARTITION_SCHEMA.clone(); + let mut add_remove_partition_cols = ADD_PARTITION_SCHEMA.clone(); cdc_partition_cols.extend_from_slice(&this_partition_values); - add_partition_cols.extend_from_slice(&this_partition_values); + add_remove_partition_cols.extend_from_slice(&this_partition_values); // Set up the partition to physical file mapping, this is a mostly unmodified version of what is done in load let cdc_file_groups = @@ -254,9 +295,14 @@ impl CdfLoadBuilder { &partition_values, Self::get_add_action_type(), )?; + let remove_file_groups = create_partition_values( + schema.clone(), + remove, + &partition_values, + Self::get_remove_action_type(), + )?; - // Create the parquet scans for each associated type of file. I am not sure when we would use removes yet, but - // they would be here if / when they are necessary + // Create the parquet scans for each associated type of file. let cdc_scan = ParquetFormat::new() .create_physical_plan( &self.ctx.state(), @@ -279,12 +325,29 @@ impl CdfLoadBuilder { &self.ctx.state(), FileScanConfig { object_store_url: self.log_store.object_store_url(), - file_schema: add_file_schema.clone(), + file_schema: add_remove_file_schema.clone(), file_groups: add_file_groups.into_values().collect(), - statistics: Statistics::new_unknown(&add_file_schema), + statistics: Statistics::new_unknown(&add_remove_file_schema.clone()), + projection: None, + limit: None, + table_partition_cols: add_remove_partition_cols.clone(), + output_ordering: vec![], + }, + None, + ) + .await?; + + let remove_scan = ParquetFormat::new() + .create_physical_plan( + &self.ctx.state(), + FileScanConfig { + object_store_url: self.log_store.object_store_url(), + file_schema: add_remove_file_schema.clone(), + file_groups: remove_file_groups.into_values().collect(), + statistics: Statistics::new_unknown(&add_remove_file_schema), projection: None, limit: None, - table_partition_cols: add_partition_cols, + table_partition_cols: add_remove_partition_cols, output_ordering: vec![], }, None, @@ -293,41 +356,60 @@ impl CdfLoadBuilder { // The output batches are then unioned to create a single output. Coalesce partitions is only here for the time // being for development. I plan to parallelize the reads once the base idea is correct. 
- let union_scan: Arc = Arc::new(UnionExec::new(vec![cdc_scan, add_scan])); + let mut union_scan: Arc = + Arc::new(UnionExec::new(vec![cdc_scan, add_scan, remove_scan])); + + if let Some(columns) = &self.columns { + let expressions: Vec<(Arc, String)> = union_scan + .schema() + .fields() + .into_iter() + .enumerate() + .map(|(idx, field)| -> (Arc, String) { + let field_name = field.name(); + let expr = Arc::new(expressions::Column::new(field_name, idx)); + (expr, field_name.to_owned()) + }) + .filter(|(_, field_name)| columns.contains(field_name)) + .collect(); + union_scan = Arc::new(ProjectionExec::try_new(expressions, union_scan)?); + } Ok(DeltaCdfScan::new(union_scan)) } } +#[allow(unused)] +/// Helper function to collect batches associated with reading CDF data +pub(crate) async fn collect_batches( + num_partitions: usize, + stream: DeltaCdfScan, + ctx: SessionContext, +) -> Result, Box> { + let mut batches = vec![]; + for p in 0..num_partitions { + let data: Vec = + crate::operations::collect_sendable_stream(stream.execute(p, ctx.task_ctx())?).await?; + batches.extend_from_slice(&data); + } + Ok(batches) +} + #[cfg(test)] -mod tests { +pub(crate) mod tests { use super::*; - use std::error::Error; use std::str::FromStr; - use arrow_array::RecordBatch; + use arrow_array::{Int32Array, RecordBatch, StringArray}; + use arrow_schema::Schema; use chrono::NaiveDateTime; use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::SessionContext; use datafusion_common::assert_batches_sorted_eq; + use itertools::Itertools; - use crate::delta_datafusion::cdf::DeltaCdfScan; - use crate::operations::collect_sendable_stream; + use crate::test_utils::TestSchemas; use crate::writer::test_utils::TestResult; - use crate::DeltaOps; - - async fn collect_batches( - num_partitions: usize, - stream: DeltaCdfScan, - ctx: SessionContext, - ) -> Result, Box> { - let mut batches = vec![]; - for p in 0..num_partitions { - let data: Vec = - collect_sendable_stream(stream.execute(p, ctx.task_ctx())?).await?; - batches.extend_from_slice(&data); - } - Ok(batches) - } + use crate::{DeltaOps, DeltaTable, TableProperty}; #[tokio::test] async fn test_load_local() -> TestResult { @@ -513,4 +595,107 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn test_use_remove_actions_for_deletions() -> TestResult { + let delta_schema = TestSchemas::simple(); + let table: DeltaTable = DeltaOps::new_in_memory() + .create() + .with_columns(delta_schema.fields().cloned()) + .with_partition_columns(["id"]) + .with_configuration_property(TableProperty::EnableChangeDataFeed, Some("true")) + .await + .unwrap(); + assert_eq!(table.version(), 0); + + let schema = Arc::new(Schema::try_from(delta_schema)?); + + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(StringArray::from(vec![Some("1"), Some("2"), Some("3")])), + Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)])), + Arc::new(StringArray::from(vec![ + Some("yes"), + Some("yes"), + Some("no"), + ])), + ], + ) + .unwrap(); + + let second_batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(StringArray::from(vec![Some("3")])), + Arc::new(Int32Array::from(vec![Some(10)])), + Arc::new(StringArray::from(vec![Some("yes")])), + ], + ) + .unwrap(); + + let table = DeltaOps(table) + .write(vec![batch]) + .await + .expect("Failed to write first batch"); + assert_eq!(table.version(), 1); + + let table = DeltaOps(table) + .write([second_batch]) + .with_save_mode(crate::protocol::SaveMode::Overwrite) + .await + .unwrap(); + 
assert_eq!(table.version(), 2); + + let ctx = SessionContext::new(); + let cdf_scan = DeltaOps(table.clone()) + .load_cdf() + .with_session_ctx(ctx.clone()) + .with_starting_version(0) + .build() + .await + .expect("Failed to load CDF"); + + let mut batches = collect_batches( + cdf_scan + .properties() + .output_partitioning() + .partition_count(), + cdf_scan, + ctx, + ) + .await + .expect("Failed to collect batches"); + + // The batches will contain a current _commit_timestamp which shouldn't be check_append_only + let _: Vec<_> = batches.iter_mut().map(|b| b.remove_column(4)).collect(); + + assert_batches_sorted_eq! {[ + "+-------+----------+--------------+-----------------+----+", + "| value | modified | _change_type | _commit_version | id |", + "+-------+----------+--------------+-----------------+----+", + "| 1 | yes | delete | 2 | 1 |", + "| 1 | yes | insert | 1 | 1 |", + "| 10 | yes | insert | 2 | 3 |", + "| 2 | yes | delete | 2 | 2 |", + "| 2 | yes | insert | 1 | 2 |", + "| 3 | no | delete | 2 | 3 |", + "| 3 | no | insert | 1 | 3 |", + "+-------+----------+--------------+-----------------+----+", + ], &batches } + + let snapshot_bytes = table + .log_store + .read_commit_entry(2) + .await? + .expect("failed to get snapshot bytes"); + let version_actions = get_actions(2, snapshot_bytes).await?; + + let cdc_actions = version_actions + .iter() + .filter(|action| matches!(action, &&Action::Cdc(_))) + .collect_vec(); + assert!(cdc_actions.is_empty()); + Ok(()) + } } diff --git a/crates/core/src/operations/merge/barrier.rs b/crates/core/src/operations/merge/barrier.rs index 7d18843af7..9084d721b7 100644 --- a/crates/core/src/operations/merge/barrier.rs +++ b/crates/core/src/operations/merge/barrier.rs @@ -18,12 +18,12 @@ use std::{ use arrow_array::{builder::UInt64Builder, ArrayRef, RecordBatch}; use arrow_schema::SchemaRef; -use datafusion::physical_plan::{ - DisplayAs, DisplayFormatType, ExecutionPlan, RecordBatchStream, SendableRecordBatchStream, -}; use datafusion_common::{DataFusionError, Result as DataFusionResult}; use datafusion_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; use datafusion_physical_expr::{Distribution, PhysicalExpr}; +use datafusion_physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionPlan, RecordBatchStream, SendableRecordBatchStream, +}; use futures::{Stream, StreamExt}; use crate::{ @@ -67,6 +67,10 @@ impl MergeBarrierExec { } impl ExecutionPlan for MergeBarrierExec { + fn name(&self) -> &str { + Self::static_name() + } + fn as_any(&self) -> &dyn std::any::Any { self } @@ -83,14 +87,14 @@ impl ExecutionPlan for MergeBarrierExec { vec![Distribution::HashPartitioned(vec![self.expr.clone()]); 1] } - fn children(&self) -> Vec> { - vec![self.input.clone()] + fn children(&self) -> Vec<&Arc> { + vec![&self.input] } fn with_new_children( - self: std::sync::Arc, - children: Vec>, - ) -> datafusion_common::Result> { + self: Arc, + children: Vec>, + ) -> datafusion_common::Result> { if children.len() != 1 { return Err(DataFusionError::Plan( "MergeBarrierExec wrong number of children".to_string(), @@ -106,7 +110,7 @@ impl ExecutionPlan for MergeBarrierExec { fn execute( &self, partition: usize, - context: std::sync::Arc, + context: Arc, ) -> datafusion_common::Result { let input = self.input.execute(partition, context)?; Ok(Box::pin(MergeBarrierStream::new( @@ -422,22 +426,33 @@ impl UserDefinedLogicalNodeCore for MergeBarrier { exprs: &[datafusion_expr::Expr], inputs: &[datafusion_expr::LogicalPlan], ) -> Self { - MergeBarrier { + 
self.with_exprs_and_inputs(exprs.to_vec(), inputs.to_vec()) + .unwrap() + } + + fn with_exprs_and_inputs( + &self, + exprs: Vec, + inputs: Vec, + ) -> DataFusionResult { + Ok(MergeBarrier { input: inputs[0].clone(), file_column: self.file_column.clone(), expr: exprs[0].clone(), - } + }) } } -pub(crate) fn find_barrier_node(parent: &Arc) -> Option> { - //! Used to locate the physical Barrier Node after the planner converts the logical node - if parent.as_any().downcast_ref::().is_some() { +pub(crate) fn find_node( + parent: &Arc, +) -> Option> { + //! Used to locate a Node:: after the planner converts the logical node + if parent.as_any().downcast_ref::().is_some() { return Some(parent.to_owned()); } for child in &parent.children() { - let res = find_barrier_node(child); + let res = find_node::(child); if res.is_some() { return res; } diff --git a/crates/core/src/operations/merge/filter.rs b/crates/core/src/operations/merge/filter.rs new file mode 100644 index 0000000000..0745c55830 --- /dev/null +++ b/crates/core/src/operations/merge/filter.rs @@ -0,0 +1,943 @@ +//! Utility functions to determine early filters for file/partition pruning +use datafusion::functions_aggregate::expr_fn::{max, min}; +use std::collections::HashMap; + +use datafusion::execution::context::SessionState; +use datafusion_common::tree_node::{Transformed, TreeNode}; +use datafusion_common::{ScalarValue, TableReference}; +use datafusion_expr::expr::{InList, Placeholder}; +use datafusion_expr::{Aggregate, BinaryExpr, LogicalPlan, Operator}; +use datafusion_expr::{Between, Expr}; + +use either::{Left, Right}; + +use itertools::Itertools; + +use crate::delta_datafusion::execute_plan_to_batch; +use crate::table::state::DeltaTableState; +use crate::DeltaResult; + +#[derive(Debug)] +enum ReferenceTableCheck { + HasReference(String), + NoReference, + Unknown, +} +impl ReferenceTableCheck { + fn has_reference(&self) -> bool { + matches!(self, ReferenceTableCheck::HasReference(_)) + } +} + +fn references_table(expr: &Expr, table: &TableReference) -> ReferenceTableCheck { + let res = match expr { + Expr::Alias(alias) => references_table(&alias.expr, table), + Expr::Column(col) => col + .relation + .as_ref() + .map(|rel| { + if rel == table { + ReferenceTableCheck::HasReference(col.name.to_owned()) + } else { + ReferenceTableCheck::NoReference + } + }) + .unwrap_or(ReferenceTableCheck::NoReference), + Expr::Negative(neg) => references_table(neg, table), + Expr::Cast(cast) => references_table(&cast.expr, table), + Expr::TryCast(try_cast) => references_table(&try_cast.expr, table), + Expr::ScalarFunction(func) => { + if func.args.len() == 1 { + references_table(&func.args[0], table) + } else { + ReferenceTableCheck::Unknown + } + } + Expr::IsNull(inner) => references_table(inner, table), + Expr::Literal(_) => ReferenceTableCheck::NoReference, + _ => ReferenceTableCheck::Unknown, + }; + res +} + +fn construct_placeholder( + binary: BinaryExpr, + source_left: bool, + is_partition_column: bool, + column_name: String, + placeholders: &mut Vec, +) -> Option { + if is_partition_column { + let placeholder_name = format!("{column_name}_{}", placeholders.len()); + let placeholder = Expr::Placeholder(Placeholder { + id: placeholder_name.clone(), + data_type: None, + }); + + let (left, right, source_expr): (Box, Box, Expr) = if source_left { + (placeholder.into(), binary.clone().right, *binary.left) + } else { + (binary.clone().left, placeholder.into(), *binary.right) + }; + + let replaced = Expr::BinaryExpr(BinaryExpr { + left, + op: 
binary.op, + right, + }); + + placeholders.push(PredicatePlaceholder { + expr: source_expr, + alias: placeholder_name, + is_aggregate: false, + }); + + Some(replaced) + } else { + match binary.op { + Operator::Eq => { + let name_min = format!("{column_name}_{}_min", placeholders.len()); + let placeholder_min = Expr::Placeholder(Placeholder { + id: name_min.clone(), + data_type: None, + }); + let name_max = format!("{column_name}_{}_max", placeholders.len()); + let placeholder_max = Expr::Placeholder(Placeholder { + id: name_max.clone(), + data_type: None, + }); + let (source_expr, target_expr) = if source_left { + (*binary.left, *binary.right) + } else { + (*binary.right, *binary.left) + }; + let replaced = Expr::Between(Between { + expr: target_expr.into(), + negated: false, + low: placeholder_min.into(), + high: placeholder_max.into(), + }); + + placeholders.push(PredicatePlaceholder { + expr: min(source_expr.clone()), + alias: name_min, + is_aggregate: true, + }); + placeholders.push(PredicatePlaceholder { + expr: max(source_expr), + alias: name_max, + is_aggregate: true, + }); + Some(replaced) + } + _ => None, + } + } +} + +fn replace_placeholders(expr: Expr, placeholders: &HashMap) -> Expr { + expr.transform(&|expr| match expr { + Expr::Placeholder(Placeholder { id, .. }) => { + let value = placeholders[&id].clone(); + // Replace the placeholder with the value + Ok(Transformed::yes(Expr::Literal(value))) + } + _ => Ok(Transformed::no(expr)), + }) + .unwrap() + .data +} + +pub(crate) struct PredicatePlaceholder { + pub expr: Expr, + pub alias: String, + pub is_aggregate: bool, +} + +/// Takes the predicate provided and does three things: +/// +/// 1. for any relations between a source column and a partition target column, +/// replace source with a placeholder matching the name of the partition +/// columns +/// +/// 2. for any is equal relations between a source column and a non-partition target column, +/// replace source with is between expression with min(source_column) and max(source_column) placeholders +/// +/// 3. for any other relation with a source column, remove them. +/// +/// For example, for the predicate: +/// +/// `source.date = target.date and source.id = target.id and frob > 42` +/// +/// where `date` is a partition column, would result in the expr: +/// +/// `$date_0 = target.date and target.id between $id_1_min and $id_1_max and frob > 42` +/// +/// This leaves us with a predicate that we can push into delta scan after expanding it out to +/// a conjunction between the distinct partitions in the source input. 
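+///
+/// A doc-only sketch of the non-partition case, mirroring the unit tests below
+/// (the concrete table and column names are illustrative, not an API contract):
+///
+/// ```ignore
+/// let source = TableReference::parse_str("source");
+/// let target = TableReference::parse_str("target");
+/// let mut placeholders = Vec::new();
+///
+/// // source.id = target.id, where `id` is not a partition column
+/// let predicate = col(Column::new(source.clone().into(), "id"))
+///     .eq(col(Column::new(target.clone().into(), "id")));
+///
+/// let generalized =
+///     generalize_filter(predicate, &vec![], &source, &target, &mut placeholders);
+/// // `generalized` is `Some(target.id BETWEEN $id_0_min AND $id_0_max)`, while
+/// // `placeholders` now carries `min(source.id)` / `max(source.id)`, which
+/// // `try_construct_early_filter` evaluates against the source data before pushing
+/// // the expanded filter into the delta scan.
+/// ```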
+/// +pub(crate) fn generalize_filter( + predicate: Expr, + partition_columns: &Vec, + source_name: &TableReference, + target_name: &TableReference, + placeholders: &mut Vec, +) -> Option { + match predicate { + Expr::BinaryExpr(binary) => { + if references_table(&binary.right, source_name).has_reference() { + if let ReferenceTableCheck::HasReference(left_target) = + references_table(&binary.left, target_name) + { + return construct_placeholder( + binary, + false, + partition_columns.contains(&left_target), + left_target, + placeholders, + ); + } + return None; + } + if references_table(&binary.left, source_name).has_reference() { + if let ReferenceTableCheck::HasReference(right_target) = + references_table(&binary.right, target_name) + { + return construct_placeholder( + binary, + true, + partition_columns.contains(&right_target), + right_target, + placeholders, + ); + } + return None; + } + + let left = generalize_filter( + *binary.left, + partition_columns, + source_name, + target_name, + placeholders, + ); + let right = generalize_filter( + *binary.right, + partition_columns, + source_name, + target_name, + placeholders, + ); + + match (left, right) { + (None, None) => None, + (None, Some(one_side)) | (Some(one_side), None) => { + // in the case of an AND clause, it's safe to generalize the filter down to just one side of the AND. + // this is because this filter will be more permissive than the actual predicate, so we know that + // we will catch all data that could be matched by the predicate. For OR this is not the case - we + // could potentially eliminate one side of the predicate and the filter would only match half the + // cases that would have satisfied the match predicate. + match binary.op { + Operator::And => Some(one_side), + Operator::Or => None, + _ => None, + } + } + (Some(l), Some(r)) => Expr::BinaryExpr(BinaryExpr { + left: l.into(), + op: binary.op, + right: r.into(), + }) + .into(), + } + } + Expr::InList(in_list) => { + let compare_expr = match generalize_filter( + *in_list.expr, + partition_columns, + source_name, + target_name, + placeholders, + ) { + Some(expr) => expr, + None => return None, // Return early + }; + + let mut list_expr = Vec::new(); + for item in in_list.list.into_iter() { + match item { + // If it's a literal just immediately push it in list_expr so we can avoid the unnecessary generalizing + Expr::Literal(_) => list_expr.push(item), + _ => { + if let Some(item) = generalize_filter( + item.clone(), + partition_columns, + source_name, + target_name, + placeholders, + ) { + list_expr.push(item) + } + } + } + } + if !list_expr.is_empty() { + Expr::InList(InList { + expr: compare_expr.into(), + list: list_expr, + negated: in_list.negated, + }) + .into() + } else { + None + } + } + other => match references_table(&other, source_name) { + ReferenceTableCheck::HasReference(col) => { + let placeholder_name = format!("{col}_{}", placeholders.len()); + + let placeholder = Expr::Placeholder(Placeholder { + id: placeholder_name.clone(), + data_type: None, + }); + + placeholders.push(PredicatePlaceholder { + expr: other, + alias: placeholder_name, + is_aggregate: true, + }); + Some(placeholder) + } + ReferenceTableCheck::NoReference => Some(other), + ReferenceTableCheck::Unknown => None, + }, + } +} + +pub(crate) async fn try_construct_early_filter( + join_predicate: Expr, + table_snapshot: &DeltaTableState, + session_state: &SessionState, + source: &LogicalPlan, + source_name: &TableReference, + target_name: &TableReference, +) -> DeltaResult> { + let 
table_metadata = table_snapshot.metadata(); + let partition_columns = &table_metadata.partition_columns; + + let mut placeholders = Vec::default(); + + match generalize_filter( + join_predicate, + partition_columns, + source_name, + target_name, + &mut placeholders, + ) { + None => Ok(None), + Some(filter) => { + if placeholders.is_empty() { + // if we haven't recognised any source predicates in the join predicate, return our filter with static only predicates + Ok(Some(filter)) + } else { + // if we have some filters, which depend on the source df, then collect the placeholders values from the source data + // We aggregate the distinct values for partitions with the group_columns and stats(min, max) for dynamic filter as agg_columns + // Can be translated into `SELECT partition1 as part1_0, min(id) as id_1_min, max(id) as id_1_max FROM source GROUP BY partition1` + let (agg_columns, group_columns) = placeholders.into_iter().partition_map(|p| { + if p.is_aggregate { + Left(p.expr.alias(p.alias)) + } else { + Right(p.expr.alias(p.alias)) + } + }); + let distinct_partitions = LogicalPlan::Aggregate(Aggregate::try_new( + source.clone().into(), + group_columns, + agg_columns, + )?); + let execution_plan = session_state + .create_physical_plan(&distinct_partitions) + .await?; + let items = execute_plan_to_batch(session_state, execution_plan).await?; + let placeholder_names = items + .schema() + .fields() + .iter() + .map(|f| f.name().to_owned()) + .collect_vec(); + let expr = (0..items.num_rows()) + .map(|i| { + let replacements = placeholder_names + .iter() + .map(|placeholder| { + let col = items.column_by_name(placeholder).unwrap(); + let value = ScalarValue::try_from_array(col, i)?; + DeltaResult::Ok((placeholder.to_owned(), value)) + }) + .try_collect()?; + Ok(replace_placeholders(filter.clone(), &replacements)) + }) + .collect::>>()? 
+ .into_iter() + .reduce(Expr::or); + Ok(expr) + } + } + } +} + +#[cfg(test)] +mod tests { + use crate::operations::merge::tests::setup_table; + use crate::operations::merge::try_construct_early_filter; + use crate::writer::test_utils::get_arrow_schema; + + use arrow::record_batch::RecordBatch; + + use datafusion::datasource::provider_as_source; + + use datafusion::prelude::*; + use datafusion_common::Column; + use datafusion_common::ScalarValue; + use datafusion_common::TableReference; + use datafusion_expr::col; + + use datafusion_expr::Expr; + use datafusion_expr::LogicalPlanBuilder; + use datafusion_expr::Operator; + + use std::sync::Arc; + + #[tokio::test] + async fn test_try_construct_early_filter_with_partitions_expands() { + let schema = get_arrow_schema(&None); + let table = setup_table(Some(vec!["id"])).await; + + assert_eq!(table.version(), 0); + assert_eq!(table.get_files_count(), 0); + + let ctx = SessionContext::new(); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(arrow::array::StringArray::from(vec!["B", "C", "X"])), + Arc::new(arrow::array::Int32Array::from(vec![10, 20, 30])), + Arc::new(arrow::array::StringArray::from(vec![ + "2021-02-02", + "2023-07-04", + "2023-07-04", + ])), + ], + ) + .unwrap(); + let source = ctx.read_batch(batch).unwrap(); + + let source_name = TableReference::parse_str("source"); + let target_name = TableReference::parse_str("target"); + + let source = LogicalPlanBuilder::scan( + source_name.clone(), + provider_as_source(source.into_view()), + None, + ) + .unwrap() + .build() + .unwrap(); + + let join_predicate = col(Column { + relation: Some(source_name.clone()), + name: "id".to_owned(), + }) + .eq(col(Column { + relation: Some(target_name.clone()), + name: "id".to_owned(), + })); + + let pred = try_construct_early_filter( + join_predicate, + table.snapshot().unwrap(), + &ctx.state(), + &source, + &source_name, + &target_name, + ) + .await + .unwrap(); + + assert!(pred.is_some()); + + let split_pred = { + fn split(expr: Expr, parts: &mut Vec<(String, String)>) { + match expr { + Expr::BinaryExpr(ex) if ex.op == Operator::Or => { + split(*ex.left, parts); + split(*ex.right, parts); + } + Expr::BinaryExpr(ex) if ex.op == Operator::Eq => { + let col = match *ex.right { + Expr::Column(col) => col.name, + ex => panic!("expected column in pred, got {ex}!"), + }; + + let value = match *ex.left { + Expr::Literal(ScalarValue::Utf8(Some(value))) => value, + ex => panic!("expected value in predicate, got {ex}!"), + }; + + parts.push((col, value)) + } + + expr => panic!("expected either = or OR, got {expr}"), + } + } + + let mut parts = vec![]; + split(pred.unwrap(), &mut parts); + parts.sort(); + parts + }; + + let expected_pred_parts = [ + ("id".to_owned(), "B".to_owned()), + ("id".to_owned(), "C".to_owned()), + ("id".to_owned(), "X".to_owned()), + ]; + + assert_eq!(split_pred, expected_pred_parts); + } + + #[tokio::test] + async fn test_try_construct_early_filter_with_range() { + let schema = get_arrow_schema(&None); + let table = setup_table(Some(vec!["modified"])).await; + + assert_eq!(table.version(), 0); + assert_eq!(table.get_files_count(), 0); + + let ctx = SessionContext::new(); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(arrow::array::StringArray::from(vec!["B", "C"])), + Arc::new(arrow::array::Int32Array::from(vec![10, 20])), + Arc::new(arrow::array::StringArray::from(vec![ + "2023-07-04", + "2023-07-04", + ])), + ], + ) + .unwrap(); + let source = ctx.read_batch(batch).unwrap(); + + 
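+        // The table is partitioned by `modified`, so the equality join on the
+        // non-partition `id` column is expected to generalize into a BETWEEN over
+        // the source's min/max `id` values rather than a partition placeholder.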
let source_name = TableReference::parse_str("source"); + let target_name = TableReference::parse_str("target"); + + let source = LogicalPlanBuilder::scan( + source_name.clone(), + provider_as_source(source.into_view()), + None, + ) + .unwrap() + .build() + .unwrap(); + + let join_predicate = col(Column { + relation: Some(source_name.clone()), + name: "id".to_owned(), + }) + .eq(col(Column { + relation: Some(target_name.clone()), + name: "id".to_owned(), + })); + + let pred = try_construct_early_filter( + join_predicate, + table.snapshot().unwrap(), + &ctx.state(), + &source, + &source_name, + &target_name, + ) + .await + .unwrap(); + + assert!(pred.is_some()); + + let filter = col(Column { + relation: Some(target_name.clone()), + name: "id".to_owned(), + }) + .between( + Expr::Literal(ScalarValue::Utf8(Some("B".to_string()))), + Expr::Literal(ScalarValue::Utf8(Some("C".to_string()))), + ); + assert_eq!(pred.unwrap(), filter); + } + + #[tokio::test] + async fn test_try_construct_early_filter_with_partition_and_range() { + let schema = get_arrow_schema(&None); + let table = setup_table(Some(vec!["modified"])).await; + + assert_eq!(table.version(), 0); + assert_eq!(table.get_files_count(), 0); + + let ctx = SessionContext::new(); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(arrow::array::StringArray::from(vec!["B", "C"])), + Arc::new(arrow::array::Int32Array::from(vec![10, 20])), + Arc::new(arrow::array::StringArray::from(vec![ + "2023-07-04", + "2023-07-04", + ])), + ], + ) + .unwrap(); + let source = ctx.read_batch(batch).unwrap(); + + let source_name = TableReference::parse_str("source"); + let target_name = TableReference::parse_str("target"); + + let source = LogicalPlanBuilder::scan( + source_name.clone(), + provider_as_source(source.into_view()), + None, + ) + .unwrap() + .build() + .unwrap(); + + let join_predicate = col(Column { + relation: Some(source_name.clone()), + name: "id".to_owned(), + }) + .eq(col(Column { + relation: Some(target_name.clone()), + name: "id".to_owned(), + })) + .and( + col(Column { + relation: Some(source_name.clone()), + name: "modified".to_owned(), + }) + .eq(col(Column { + relation: Some(target_name.clone()), + name: "modified".to_owned(), + })), + ); + + let pred = try_construct_early_filter( + join_predicate, + table.snapshot().unwrap(), + &ctx.state(), + &source, + &source_name, + &target_name, + ) + .await + .unwrap(); + + assert!(pred.is_some()); + + let filter = col(Column { + relation: Some(target_name.clone()), + name: "id".to_owned(), + }) + .between( + Expr::Literal(ScalarValue::Utf8(Some("B".to_string()))), + Expr::Literal(ScalarValue::Utf8(Some("C".to_string()))), + ) + .and( + Expr::Literal(ScalarValue::Utf8(Some("2023-07-04".to_string()))).eq(col(Column { + relation: Some(target_name.clone()), + name: "modified".to_owned(), + })), + ); + assert_eq!(pred.unwrap(), filter); + } + + #[tokio::test] + async fn test_try_construct_early_filter_with_is_in_literals() { + let schema = get_arrow_schema(&None); + let table = setup_table(Some(vec!["modified"])).await; + + assert_eq!(table.version(), 0); + assert_eq!(table.get_files_count(), 0); + + let ctx = SessionContext::new(); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(arrow::array::StringArray::from(vec!["A", "B", "C"])), + Arc::new(arrow::array::Int32Array::from(vec![10, 20, 30])), + Arc::new(arrow::array::StringArray::from(vec![ + "2023-07-04", + "2023-07-05", + "2023-07-05", + ])), + ], + ) + .unwrap(); + let source_df = 
ctx.read_batch(batch).unwrap(); + + let source_name = TableReference::parse_str("source"); + let target_name = TableReference::parse_str("target"); + + let source_plan = LogicalPlanBuilder::scan( + source_name.clone(), + provider_as_source(source_df.into_view()), + None, + ) + .unwrap() + .build() + .unwrap(); + + let join_predicate = col(Column { + relation: Some(source_name.clone()), + name: "id".to_owned(), + }) + .eq(col(Column { + relation: Some(target_name.clone()), + name: "id".to_owned(), + })) + .and(col("modified".to_owned()).in_list( + vec![lit("2023-07-05"), lit("2023-07-06"), lit("2023-07-07")], + false, + )); + + let pred = try_construct_early_filter( + join_predicate, + table.snapshot().unwrap(), + &ctx.state(), + &source_plan, + &source_name, + &target_name, + ) + .await + .unwrap(); + + assert!(pred.is_some()); + + let filter = col(Column { + relation: Some(target_name.clone()), + name: "id".to_owned(), + }) + .between( + Expr::Literal(ScalarValue::Utf8(Some("A".to_string()))), + Expr::Literal(ScalarValue::Utf8(Some("C".to_string()))), + ) + .and( + col(Column { + relation: None, + name: "modified".to_owned(), + }) + .in_list( + vec![ + Expr::Literal(ScalarValue::Utf8(Some("2023-07-05".to_string()))), + Expr::Literal(ScalarValue::Utf8(Some("2023-07-06".to_string()))), + Expr::Literal(ScalarValue::Utf8(Some("2023-07-07".to_string()))), + ], + false, + ), + ); + assert_eq!(pred.unwrap(), filter); + } + + #[tokio::test] + async fn test_try_construct_early_filter_with_is_in_columns() { + let schema = get_arrow_schema(&None); + let table = setup_table(Some(vec!["modified"])).await; + + assert_eq!(table.version(), 0); + assert_eq!(table.get_files_count(), 0); + + let ctx = SessionContext::new(); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(arrow::array::StringArray::from(vec!["A", "B", "C"])), + Arc::new(arrow::array::Int32Array::from(vec![10, 20, 30])), + Arc::new(arrow::array::StringArray::from(vec![ + "2023-07-04", + "2023-07-05", + "2023-07-05", + ])), + ], + ) + .unwrap(); + let source_df = ctx.read_batch(batch).unwrap(); + + let source_name = TableReference::parse_str("source"); + let target_name = TableReference::parse_str("target"); + + let source_plan = LogicalPlanBuilder::scan( + source_name.clone(), + provider_as_source(source_df.into_view()), + None, + ) + .unwrap() + .build() + .unwrap(); + + let join_predicate = col(Column { + relation: Some(source_name.clone()), + name: "id".to_owned(), + }) + .eq(col(Column { + relation: Some(target_name.clone()), + name: "id".to_owned(), + })) + .and(col("modified".to_owned()).in_list( + vec![ + col(Column { + relation: Some(target_name.clone()), + name: "id".to_owned(), + }), + col(Column { + relation: Some(target_name.clone()), + name: "modified".to_owned(), + }), + ], + false, + )); + + let pred = try_construct_early_filter( + join_predicate, + table.snapshot().unwrap(), + &ctx.state(), + &source_plan, + &source_name, + &target_name, + ) + .await + .unwrap(); + + assert!(pred.is_some()); + + let filter = col(Column { + relation: Some(target_name.clone()), + name: "id".to_owned(), + }) + .between( + Expr::Literal(ScalarValue::Utf8(Some("A".to_string()))), + Expr::Literal(ScalarValue::Utf8(Some("C".to_string()))), + ) + .and( + col(Column { + relation: None, + name: "modified".to_owned(), + }) + .in_list( + vec![ + col(Column { + relation: Some(target_name.clone()), + name: "id".to_owned(), + }), + col(Column { + relation: Some(target_name.clone()), + name: "modified".to_owned(), + }), + ], + 
false, + ), + ); + assert_eq!(pred.unwrap(), filter); + } + + #[tokio::test] + async fn test_try_construct_early_filter_with_is_in_ident_and_cols() { + let schema = get_arrow_schema(&None); + let table = setup_table(Some(vec!["modified"])).await; + + assert_eq!(table.version(), 0); + assert_eq!(table.get_files_count(), 0); + + let ctx = SessionContext::new(); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(arrow::array::StringArray::from(vec!["A", "B", "C"])), + Arc::new(arrow::array::Int32Array::from(vec![10, 20, 30])), + Arc::new(arrow::array::StringArray::from(vec![ + "2023-07-04", + "2023-07-05", + "2023-07-05", + ])), + ], + ) + .unwrap(); + let source_df = ctx.read_batch(batch).unwrap(); + + let source_name = TableReference::parse_str("source"); + let target_name = TableReference::parse_str("target"); + + let source_plan = LogicalPlanBuilder::scan( + source_name.clone(), + provider_as_source(source_df.into_view()), + None, + ) + .unwrap() + .build() + .unwrap(); + + let join_predicate = col(Column { + relation: Some(source_name.clone()), + name: "id".to_owned(), + }) + .eq(col(Column { + relation: Some(target_name.clone()), + name: "id".to_owned(), + })) + .and(ident("source.id").in_list( + vec![ + col(Column { + relation: Some(target_name.clone()), + name: "id".to_owned(), + }), + col(Column { + relation: Some(target_name.clone()), + name: "modified".to_owned(), + }), + ], + false, + )); + + let pred = try_construct_early_filter( + join_predicate, + table.snapshot().unwrap(), + &ctx.state(), + &source_plan, + &source_name, + &target_name, + ) + .await + .unwrap(); + + assert!(pred.is_some()); + + let filter = col(Column { + relation: Some(target_name.clone()), + name: "id".to_owned(), + }) + .between( + Expr::Literal(ScalarValue::Utf8(Some("A".to_string()))), + Expr::Literal(ScalarValue::Utf8(Some("C".to_string()))), + ) + .and(ident("source.id").in_list( + vec![ + col(Column { + relation: Some(target_name.clone()), + name: "id".to_owned(), + }), + col(Column { + relation: Some(target_name.clone()), + name: "modified".to_owned(), + }), + ], + false, + )); + assert_eq!(pred.unwrap(), filter); + } +} diff --git a/crates/core/src/operations/merge/mod.rs b/crates/core/src/operations/merge/mod.rs index ddbe113d16..7f87d30d35 100644 --- a/crates/core/src/operations/merge/mod.rs +++ b/crates/core/src/operations/merge/mod.rs @@ -29,35 +29,36 @@ //! 
```` use std::collections::HashMap; +use std::fmt::Debug; use std::sync::Arc; use std::time::Instant; use async_trait::async_trait; use datafusion::datasource::provider_as_source; use datafusion::error::Result as DataFusionResult; -use datafusion::execution::context::{QueryPlanner, SessionConfig}; +use datafusion::execution::context::SessionConfig; +use datafusion::execution::session_state::SessionStateBuilder; use datafusion::logical_expr::build_join_schema; -use datafusion::physical_planner::{DefaultPhysicalPlanner, ExtensionPlanner, PhysicalPlanner}; +use datafusion::physical_plan::metrics::MetricBuilder; +use datafusion::physical_planner::{ExtensionPlanner, PhysicalPlanner}; use datafusion::{ execution::context::SessionState, - physical_plan::{ - metrics::{MetricBuilder, MetricsSet}, - ExecutionPlan, - }, + physical_plan::ExecutionPlan, prelude::{DataFrame, SessionContext}, }; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::{Column, DFSchema, ScalarValue, TableReference}; -use datafusion_expr::expr::Placeholder; use datafusion_expr::{col, conditional_expressions::CaseBuilder, lit, when, Expr, JoinType}; use datafusion_expr::{ - BinaryExpr, Distinct, Extension, LogicalPlan, LogicalPlanBuilder, Operator, Projection, - UserDefinedLogicalNode, UNNAMED_TABLE, + Extension, LogicalPlan, LogicalPlanBuilder, UserDefinedLogicalNode, UNNAMED_TABLE, }; + +use filter::try_construct_early_filter; use futures::future::BoxFuture; use itertools::Itertools; use parquet::file::properties::WriterProperties; use serde::Serialize; +use tracing::log::*; use self::barrier::{MergeBarrier, MergeBarrierExec}; @@ -65,21 +66,24 @@ use super::datafusion_utils::{into_expr, maybe_into_expr, Expression}; use super::transaction::{CommitProperties, PROTOCOL}; use crate::delta_datafusion::expr::{fmt_expr_to_sql, parse_predicate_expression}; use crate::delta_datafusion::logical::MetricObserver; -use crate::delta_datafusion::physical::{find_metric_node, MetricObserverExec}; +use crate::delta_datafusion::physical::{find_metric_node, get_metric, MetricObserverExec}; +use crate::delta_datafusion::planner::DeltaPlanner; use crate::delta_datafusion::{ - execute_plan_to_batch, register_store, DeltaColumn, DeltaScanConfigBuilder, DeltaSessionConfig, - DeltaTableProvider, + register_store, DataFusionMixins, DeltaColumn, DeltaScan, DeltaScanConfigBuilder, + DeltaSessionConfig, DeltaTableProvider, }; use crate::kernel::Action; use crate::logstore::LogStoreRef; -use crate::operations::merge::barrier::find_barrier_node; +use crate::operations::cdc::*; +use crate::operations::merge::barrier::find_node; use crate::operations::transaction::CommitBuilder; -use crate::operations::write::{write_execution_plan, WriterStatsConfig}; +use crate::operations::write::{write_execution_plan, write_execution_plan_cdc, WriterStatsConfig}; use crate::protocol::{DeltaOperation, MergePredicate}; use crate::table::state::DeltaTableState; use crate::{DeltaResult, DeltaTable, DeltaTableError}; mod barrier; +mod filter; const SOURCE_COLUMN: &str = "__delta_rs_source"; const TARGET_COLUMN: &str = "__delta_rs_target"; @@ -502,7 +506,7 @@ impl MergeOperation { relation: Some(TableReference::Bare { table }), name, } => { - if table.eq(alias) { + if table.as_ref() == alias { Column { relation: Some(r), name, @@ -562,6 +566,10 @@ pub struct MergeMetrics { pub num_target_rows_copied: usize, /// Total number of rows written out pub num_output_rows: usize, + /// Amount of files considered during table scan + pub 
num_target_files_scanned: usize, + /// Amount of files not considered (pruned) during table scan + pub num_target_files_skipped_during_scan: usize, /// Number of files added to the sink(target) pub num_target_files_added: usize, /// Number of files removed from the sink(target) @@ -573,7 +581,7 @@ pub struct MergeMetrics { /// Time taken to rewrite the matched files pub rewrite_time_ms: u64, } - +#[derive(Clone)] struct MergeMetricExtensionPlanner {} #[async_trait] @@ -666,288 +674,47 @@ impl ExtensionPlanner for MergeMetricExtensionPlanner { } } -/// Takes the predicate provided and does two things: -/// -/// 1. for any relations between a source column and a target column, if the target column is a -/// partition column, then replace source with a placeholder matching the name of the partition -/// columns -/// -/// 2. for any other relation with a source column, remove them. -/// -/// For example, for the predicate: -/// -/// `source.date = target.date and source.id = target.id and frob > 42` -/// -/// where `date` is a partition column, would result in the expr: -/// -/// `$date = target.date and frob > 42` -/// -/// This leaves us with a predicate that we can push into delta scan after expanding it out to -/// a conjunction between the distinct partitions in the source input. -/// -/// TODO: A further improvement here might be for non-partition columns to be replaced with min/max -/// checks, so the above example could become: -/// -/// `$date = target.date and target.id between 12345 and 99999 and frob > 42` -fn generalize_filter( - predicate: Expr, - partition_columns: &Vec, - source_name: &TableReference, - target_name: &TableReference, - placeholders: &mut HashMap, -) -> Option { - #[derive(Debug)] - enum ReferenceTableCheck { - HasReference(String), - NoReference, - Unknown, - } - impl ReferenceTableCheck { - fn has_reference(&self) -> bool { - matches!(self, ReferenceTableCheck::HasReference(_)) - } - } - fn references_table(expr: &Expr, table: &TableReference) -> ReferenceTableCheck { - let res = match expr { - Expr::Alias(alias) => references_table(&alias.expr, table), - Expr::Column(col) => col - .relation - .as_ref() - .map(|rel| { - if rel == table { - ReferenceTableCheck::HasReference(col.name.to_owned()) - } else { - ReferenceTableCheck::NoReference - } - }) - .unwrap_or(ReferenceTableCheck::NoReference), - Expr::Negative(neg) => references_table(neg, table), - Expr::Cast(cast) => references_table(&cast.expr, table), - Expr::TryCast(try_cast) => references_table(&try_cast.expr, table), - Expr::ScalarFunction(func) => { - if func.args.len() == 1 { - references_table(&func.args[0], table) - } else { - ReferenceTableCheck::Unknown - } - } - Expr::IsNull(inner) => references_table(inner, table), - Expr::Literal(_) => ReferenceTableCheck::NoReference, - _ => ReferenceTableCheck::Unknown, - }; - res - } - - match predicate { - Expr::BinaryExpr(binary) => { - if references_table(&binary.right, source_name).has_reference() { - if let ReferenceTableCheck::HasReference(left_target) = - references_table(&binary.left, target_name) - { - if partition_columns.contains(&left_target) { - let placeholder_name = format!("{left_target}_{}", placeholders.len()); - - let placeholder = Expr::Placeholder(datafusion_expr::expr::Placeholder { - id: placeholder_name.clone(), - data_type: None, - }); - let replaced = Expr::BinaryExpr(BinaryExpr { - left: binary.left, - op: binary.op, - right: placeholder.into(), - }); - - placeholders.insert(placeholder_name, *binary.right); - - return 
Some(replaced); - } - } - return None; - } - if references_table(&binary.left, source_name).has_reference() { - if let ReferenceTableCheck::HasReference(right_target) = - references_table(&binary.right, target_name) - { - if partition_columns.contains(&right_target) { - let placeholder_name = format!("{right_target}_{}", placeholders.len()); - - let placeholder = Expr::Placeholder(datafusion_expr::expr::Placeholder { - id: placeholder_name.clone(), - data_type: None, - }); - let replaced = Expr::BinaryExpr(BinaryExpr { - right: binary.right, - op: binary.op, - left: placeholder.into(), - }); - - placeholders.insert(placeholder_name, *binary.left); - - return Some(replaced); - } - } - return None; - } - - let left = generalize_filter( - *binary.left, - partition_columns, - source_name, - target_name, - placeholders, - ); - let right = generalize_filter( - *binary.right, - partition_columns, - source_name, - target_name, - placeholders, - ); - - match (left, right) { - (None, None) => None, - (None, Some(one_side)) | (Some(one_side), None) => { - // in the case of an AND clause, it's safe to generalize the filter down to just one side of the AND. - // this is because this filter will be more permissive than the actual predicate, so we know that - // we will catch all data that could be matched by the predicate. For OR this is not the case - we - // could potentially eliminate one side of the predicate and the filter would only match half the - // cases that would have satisfied the match predicate. - match binary.op { - Operator::And => Some(one_side), - Operator::Or => None, - _ => None, - } - } - (Some(l), Some(r)) => Expr::BinaryExpr(BinaryExpr { - left: l.into(), - op: binary.op, - right: r.into(), - }) - .into(), - } - } - other => match references_table(&other, source_name) { - ReferenceTableCheck::HasReference(col) => { - let placeholder_name = format!("{col}_{}", placeholders.len()); - - let placeholder = Expr::Placeholder(datafusion_expr::expr::Placeholder { - id: placeholder_name.clone(), - data_type: None, - }); - - placeholders.insert(placeholder_name, other); - - Some(placeholder) - } - ReferenceTableCheck::NoReference => Some(other), - ReferenceTableCheck::Unknown => None, - }, - } -} - -fn replace_placeholders(expr: Expr, placeholders: &HashMap) -> Expr { - expr.transform(&|expr| match expr { - Expr::Placeholder(Placeholder { id, .. 
}) => { - let value = placeholders[&id].clone(); - // Replace the placeholder with the value - Ok(Transformed::yes(Expr::Literal(value))) - } - _ => Ok(Transformed::no(expr)), - }) - .unwrap() - .data -} - -async fn try_construct_early_filter( - join_predicate: Expr, - table_snapshot: &DeltaTableState, - session_state: &SessionState, - source: &LogicalPlan, - source_name: &TableReference<'_>, - target_name: &TableReference<'_>, -) -> DeltaResult> { - let table_metadata = table_snapshot.metadata(); - let partition_columns = &table_metadata.partition_columns; - - let mut placeholders = HashMap::default(); - - match generalize_filter( - join_predicate, - partition_columns, - source_name, - target_name, - &mut placeholders, - ) { - None => Ok(None), - Some(filter) => { - if placeholders.is_empty() { - // if we haven't recognised any partition-based predicates in the join predicate, return our reduced filter - Ok(Some(filter)) - } else { - // if we have some recognised partitions, then discover the distinct set of partitions in the source data and - // make a new filter, which expands out the placeholders for each distinct partition (and then OR these together) - let distinct_partitions = LogicalPlan::Distinct(Distinct::All( - LogicalPlan::Projection(Projection::try_new( - placeholders - .into_iter() - .map(|(alias, expr)| expr.alias(alias)) - .collect_vec(), - source.clone().into(), - )?) - .into(), - )); - let execution_plan = session_state - .create_physical_plan(&distinct_partitions) - .await?; - let items = execute_plan_to_batch(session_state, execution_plan).await?; - let placeholder_names = items - .schema() - .fields() - .iter() - .map(|f| f.name().to_owned()) - .collect_vec(); - let expr = (0..items.num_rows()) - .map(|i| { - let replacements = placeholder_names - .iter() - .map(|placeholder| { - let col = items.column_by_name(placeholder).unwrap(); - let value = ScalarValue::try_from_array(col, i)?; - DeltaResult::Ok((placeholder.to_owned(), value)) - }) - .try_collect()?; - Ok(replace_placeholders(filter.clone(), &replacements)) - }) - .collect::>>()? 
- .into_iter() - .reduce(Expr::or); - Ok(expr) - } - } - } -} - #[allow(clippy::too_many_arguments)] async fn execute( predicate: Expression, source: DataFrame, log_store: LogStoreRef, snapshot: DeltaTableState, - state: SessionState, + _state: SessionState, writer_properties: Option, mut commit_properties: CommitProperties, - safe_cast: bool, + _safe_cast: bool, source_alias: Option, target_alias: Option, match_operations: Vec, not_match_target_operations: Vec, not_match_source_operations: Vec, ) -> DeltaResult<(DeltaTableState, MergeMetrics)> { + if !snapshot.load_config().require_files { + return Err(DeltaTableError::NotInitializedWithFiles("MERGE".into())); + } + let mut metrics = MergeMetrics::default(); let exec_start = Instant::now(); + // Determining whether we should write change data once so that computation of change data can + // be disabled in the common case(s) + let should_cdc = should_write_cdc(&snapshot)?; + // Change data may be collected and then written out at the completion of the merge + let mut change_data = vec![]; + + if should_cdc { + debug!("Executing a merge and I should write CDC!"); + } let current_metadata = snapshot.metadata(); - let state = state.with_query_planner(Arc::new(MergePlanner {})); + let merge_planner = DeltaPlanner:: { + extension_planner: MergeMetricExtensionPlanner {}, + }; + + let state = SessionStateBuilder::new() + .with_default_features() + .with_query_planner(Arc::new(merge_planner)) + .build(); // TODO: Given the join predicate, remove any expression that involve the // source table and keep expressions that only involve the target table. @@ -987,6 +754,7 @@ async fn execute( let scan_config = DeltaScanConfigBuilder::default() .with_file_column(true) .with_parquet_pushdown(false) + .with_schema(snapshot.input_schema()?) .build(&snapshot)?; let target_provider = Arc::new(DeltaTableProvider::try_new( @@ -1002,6 +770,7 @@ async fn execute( let source_schema = source.schema(); let target_schema = target.schema(); let join_schema_df = build_join_schema(source_schema, target_schema, &JoinType::Full)?; + let predicate = match predicate { Expression::DataFusion(expr) => expr, Expression::String(s) => parse_predicate_expression(&join_schema_df, s, &state)?, @@ -1045,7 +814,7 @@ async fn execute( None => LogicalPlanBuilder::scan(target_name.clone(), target_provider, None)?.build()?, }; - let source = DataFrame::new(state.clone(), source); + let source = DataFrame::new(state.clone(), source.clone()); let source = source.with_column(SOURCE_COLUMN, lit(true))?; // Not match operations imply a full scan of the target table is required @@ -1324,9 +1093,9 @@ async fn execute( let plan = projection.into_unoptimized_plan(); let mut fields: Vec = plan .schema() - .fields() + .columns() .iter() - .map(|f| col(f.qualified_column())) + .map(|f| col(f.clone())) .collect(); fields.extend(new_columns.into_iter().map(|(name, ex)| ex.alias(name))); @@ -1338,7 +1107,7 @@ async fn execute( let merge_barrier = LogicalPlan::Extension(Extension { node: Arc::new(MergeBarrier { - input: new_columns, + input: new_columns.clone(), expr: distrbute_expr, file_column, }), @@ -1353,19 +1122,70 @@ async fn execute( }); let operation_count = DataFrame::new(state.clone(), operation_count); + + if should_cdc { + // Create a dataframe containing the CDC deletes which are present at this point + change_data.push( + operation_count + .clone() + .filter(col(DELETE_COLUMN))? + .select(write_projection.clone())? 
+ .with_column(crate::operations::cdc::CDC_COLUMN_NAME, lit("delete"))?, + ); + } + let filtered = operation_count.filter(col(DELETE_COLUMN).is_false())?; - let project = filtered.select(write_projection)?; - let merge_final = &project.into_unoptimized_plan(); + if should_cdc { + debug!("The merge should triggere a CDC tracking, computing pre/insert/postimage datasets"); + let cdc_projection = filtered.clone().filter(col(OPERATION_COLUMN).not_eq( + // This is a copy operation, but I'm not sure how to turn that enum into an int + lit(5), + ))?; + + change_data.push( + cdc_projection + .clone() + .filter( + col(SOURCE_COLUMN) + .is_true() + .and(col(TARGET_COLUMN).is_null()), + )? + .select(write_projection.clone())? + .with_column(CDC_COLUMN_NAME, lit("insert"))?, + ); + let before = cdc_projection + .clone() + .filter(col(crate::delta_datafusion::PATH_COLUMN).is_not_null())? + .select( + target_schema + .columns() + .iter() + .filter(|c| c.name != crate::delta_datafusion::PATH_COLUMN) + .map(|c| Expr::Column(c.clone())) + .collect_vec(), + )?; + + let after = cdc_projection + .clone() + .filter(col(TARGET_COLUMN).is_true())? + .select(write_projection.clone())?; + + let tracker = CDCTracker::new(before, after); + change_data.push(tracker.collect()?); + } + + let project = filtered.clone().select(write_projection)?; + let merge_final = &project.into_unoptimized_plan(); let write = state.create_physical_plan(merge_final).await?; let err = || DeltaTableError::Generic("Unable to locate expected metric node".into()); let source_count = find_metric_node(SOURCE_COUNT_ID, &write).ok_or_else(err)?; let op_count = find_metric_node(OUTPUT_COUNT_ID, &write).ok_or_else(err)?; - let barrier = find_barrier_node(&write).ok_or_else(err)?; + let barrier = find_node::(&write).ok_or_else(err)?; + let scan_count = find_node::(&write).ok_or_else(err)?; - // write projected records let table_partition_cols = current_metadata.partition_columns.clone(); let writer_stats_config = WriterStatsConfig::new( @@ -1377,7 +1197,7 @@ async fn execute( ); let rewrite_start = Instant::now(); - let add_actions = write_execution_plan( + let mut add_actions = write_execution_plan( Some(&snapshot), state.clone(), write, @@ -1385,13 +1205,38 @@ async fn execute( log_store.object_store(), Some(snapshot.table_config().target_file_size() as usize), None, - writer_properties, - safe_cast, + writer_properties.clone(), + writer_stats_config.clone(), None, - writer_stats_config, ) .await?; + if should_cdc && !change_data.is_empty() { + let mut df = change_data + .pop() + .expect("change_data should never be empty"); + // Accumulate all the changes together into a single data frame to produce the necessary + // change data files + for change in change_data { + df = df.union(change)?; + } + add_actions.extend( + write_execution_plan_cdc( + Some(&snapshot), + state.clone(), + df.create_physical_plan().await?, + table_partition_cols.clone(), + log_store.object_store(), + Some(snapshot.table_config().target_file_size() as usize), + None, + writer_properties, + writer_stats_config, + None, + ) + .await?, + ); + } + metrics.rewrite_time_ms = Instant::now().duration_since(rewrite_start).as_millis() as u64; let mut actions: Vec = add_actions.clone(); @@ -1415,9 +1260,7 @@ async fn execute( let source_count_metrics = source_count.metrics().unwrap(); let target_count_metrics = op_count.metrics().unwrap(); - fn get_metric(metrics: &MetricsSet, name: &str) -> usize { - metrics.sum_by_name(name).map(|m| m.as_usize()).unwrap_or(0) - } + let 
scan_count_metrics = scan_count.metrics().unwrap(); metrics.num_source_rows = get_metric(&source_count_metrics, SOURCE_COUNT_METRIC); metrics.num_target_rows_inserted = get_metric(&target_count_metrics, TARGET_INSERTED_METRIC); @@ -1427,7 +1270,8 @@ async fn execute( metrics.num_output_rows = metrics.num_target_rows_inserted + metrics.num_target_rows_updated + metrics.num_target_rows_copied; - + metrics.num_target_files_scanned = get_metric(&scan_count_metrics, "files_scanned"); + metrics.num_target_files_skipped_during_scan = get_metric(&scan_count_metrics, "files_pruned"); metrics.execution_time_ms = Instant::now().duration_since(exec_start).as_millis() as u64; let app_metadata = &mut commit_properties.app_metadata; @@ -1484,25 +1328,6 @@ fn remove_table_alias(expr: Expr, table_alias: &str) -> Expr { .data } -// TODO: Abstract MergePlanner into DeltaPlanner to support other delta operations in the future. -struct MergePlanner {} - -#[async_trait] -impl QueryPlanner for MergePlanner { - async fn create_physical_plan( - &self, - logical_plan: &LogicalPlan, - session_state: &SessionState, - ) -> DataFusionResult> { - let planner = Arc::new(Box::new(DefaultPhysicalPlanner::with_extension_planners( - vec![Arc::new(MergeMetricExtensionPlanner {})], - ))); - planner - .create_physical_plan(logical_plan, session_state) - .await - } -} - impl std::future::IntoFuture for MergeBuilder { type Output = DeltaResult<(DeltaTable, MergeMetrics)>; type IntoFuture = BoxFuture<'static, Self::Output>; @@ -1553,48 +1378,43 @@ mod tests { use crate::kernel::DataType; use crate::kernel::PrimitiveType; use crate::kernel::StructField; - use crate::operations::merge::generalize_filter; - use crate::operations::merge::try_construct_early_filter; + use crate::operations::load_cdf::collect_batches; + use crate::operations::merge::filter::generalize_filter; use crate::operations::DeltaOps; use crate::protocol::*; use crate::writer::test_utils::datafusion::get_data; use crate::writer::test_utils::get_arrow_schema; use crate::writer::test_utils::get_delta_schema; use crate::writer::test_utils::setup_table_with_configuration; - use crate::DeltaConfigKey; use crate::DeltaTable; + use crate::TableProperty; use arrow::datatypes::Schema as ArrowSchema; use arrow::record_batch::RecordBatch; use arrow_schema::DataType as ArrowDataType; use arrow_schema::Field; use datafusion::assert_batches_sorted_eq; - use datafusion::datasource::provider_as_source; - use datafusion::prelude::DataFrame; - use datafusion::prelude::SessionContext; + use datafusion::physical_plan::ExecutionPlan; + use datafusion::prelude::*; use datafusion_common::Column; - use datafusion_common::ScalarValue; use datafusion_common::TableReference; use datafusion_expr::col; use datafusion_expr::expr::Placeholder; use datafusion_expr::lit; use datafusion_expr::Expr; - use datafusion_expr::LogicalPlanBuilder; - use datafusion_expr::Operator; use itertools::Itertools; use regex::Regex; use serde_json::json; - use std::collections::HashMap; use std::ops::Neg; use std::sync::Arc; use super::MergeMetrics; - async fn setup_table(partitions: Option>) -> DeltaTable { + pub(crate) async fn setup_table(partitions: Option>) -> DeltaTable { let table_schema = get_delta_schema(); let table = DeltaOps::new_in_memory() .create() - .with_columns(table_schema.fields().clone()) + .with_columns(table_schema.fields().cloned()) .with_partition_columns(partitions.unwrap_or_default()) .await .unwrap(); @@ -1606,7 +1426,7 @@ mod tests { #[tokio::test] async fn 
test_merge_when_delta_table_is_append_only() { let schema = get_arrow_schema(&None); - let table = setup_table_with_configuration(DeltaConfigKey::AppendOnly, Some("true")).await; + let table = setup_table_with_configuration(TableProperty::AppendOnly, Some("true")).await; // append some data let table = write_data(table, &schema).await; // merge @@ -2063,7 +1883,10 @@ mod tests { let commit_info = table.history(None).await.unwrap(); let last_commit = &commit_info[0]; let parameters = last_commit.operation_parameters.clone().unwrap(); - assert_eq!(parameters["predicate"], "modified = '2021-02-02'"); + assert_eq!( + parameters["predicate"], + "id BETWEEN 'B' AND 'C' AND modified = '2021-02-02'" + ); assert_eq!( parameters["mergePredicate"], "target.id = source.id AND target.modified = '2021-02-02'" @@ -2149,6 +1972,115 @@ mod tests { assert_batches_sorted_eq!(&expected, &actual); } + #[tokio::test] + async fn test_merge_partitions_with_in() { + /* Validate the join predicate works with table partitions */ + let schema = get_arrow_schema(&None); + let table = setup_table(Some(vec!["modified"])).await; + + let table = write_data(table, &schema).await; + assert_eq!(table.version(), 1); + assert_eq!(table.get_files_count(), 2); + + let ctx = SessionContext::new(); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(arrow::array::StringArray::from(vec!["B", "C", "X"])), + Arc::new(arrow::array::Int32Array::from(vec![10, 20, 30])), + Arc::new(arrow::array::StringArray::from(vec![ + "2021-02-02", + "2023-07-04", + "2023-07-04", + ])), + ], + ) + .unwrap(); + let source = ctx.read_batch(batch).unwrap(); + + let (table, metrics) = DeltaOps(table) + .merge( + source, + col("target.id") + .eq(col("source.id")) + .and(col("target.id").in_list( + vec![ + col("source.id"), + col("source.modified"), + col("source.value"), + ], + false, + )) + .and(col("target.modified").in_list(vec![lit("2021-02-02")], false)), + ) + .with_source_alias("source") + .with_target_alias("target") + .when_matched_update(|update| { + update + .update("value", col("source.value")) + .update("modified", col("source.modified")) + }) + .unwrap() + .when_not_matched_by_source_update(|update| { + update + .predicate(col("target.value").eq(lit(1))) + .update("value", col("target.value") + lit(1)) + }) + .unwrap() + .when_not_matched_by_source_update(|update| { + update + .predicate(col("target.modified").eq(lit("2021-02-01"))) + .update("value", col("target.value") - lit(1)) + }) + .unwrap() + .when_not_matched_insert(|insert| { + insert + .set("id", col("source.id")) + .set("value", col("source.value")) + .set("modified", col("source.modified")) + }) + .unwrap() + .await + .unwrap(); + + assert_eq!(table.version(), 2); + assert!(table.get_files_count() >= 3); + assert!(metrics.num_target_files_added >= 3); + assert_eq!(metrics.num_target_files_removed, 2); + assert_eq!(metrics.num_target_rows_copied, 1); + assert_eq!(metrics.num_target_rows_updated, 3); + assert_eq!(metrics.num_target_rows_inserted, 2); + assert_eq!(metrics.num_target_rows_deleted, 0); + assert_eq!(metrics.num_output_rows, 6); + assert_eq!(metrics.num_source_rows, 3); + + let commit_info = table.history(None).await.unwrap(); + let last_commit = &commit_info[0]; + let parameters = last_commit.operation_parameters.clone().unwrap(); + assert!(!parameters.contains_key("predicate")); + assert_eq!( + parameters["mergePredicate"], + "target.id = source.id AND \ + target.id IN (source.id, source.modified, source.value) AND \ + target.modified IN 
('2021-02-02')" + ); + + let expected = vec![ + "+----+-------+------------+", + "| id | value | modified |", + "+----+-------+------------+", + "| A | 2 | 2021-02-01 |", + "| B | 9 | 2021-02-01 |", + "| B | 10 | 2021-02-02 |", + "| C | 20 | 2023-07-04 |", + "| D | 100 | 2021-02-02 |", + "| X | 30 | 2023-07-04 |", + "+----+-------+------------+", + ]; + let actual = get_data(&table).await; + assert_batches_sorted_eq!(&expected, &actual); + } + #[tokio::test] async fn test_merge_delete_matched() { // Validate behaviours of match delete @@ -2204,7 +2136,7 @@ mod tests { extra_info["operationMetrics"], serde_json::to_value(&metrics).unwrap() ); - assert!(!parameters.contains_key("predicate")); + assert_eq!(parameters["predicate"], "id BETWEEN 'B' AND 'X'"); assert_eq!(parameters["mergePredicate"], json!("target.id = source.id")); assert_eq!( parameters["matchedPredicates"], @@ -2486,7 +2418,10 @@ mod tests { let last_commit = &commit_info[0]; let parameters = last_commit.operation_parameters.clone().unwrap(); - assert_eq!(parameters["predicate"], json!("modified = '2021-02-02'")); + assert_eq!( + parameters["predicate"], + json!("id BETWEEN 'B' AND 'X' AND modified = '2021-02-02'") + ); let expected = vec![ "+----+-------+------------+", @@ -2590,7 +2525,7 @@ mod tests { let parsed_filter = col(Column::new(source.clone().into(), "id")) .eq(col(Column::new(target.clone().into(), "id"))); - let mut placeholders = HashMap::default(); + let mut placeholders = Vec::default(); let generalized = generalize_filter( parsed_filter, @@ -2622,7 +2557,7 @@ mod tests { let parsed_filter = (source_id.clone().eq(target_id.clone())) .or(source_id.clone().is_null().and(target_id.clone().is_null())); - let mut placeholders = HashMap::default(); + let mut placeholders = Vec::default(); let generalized = generalize_filter( parsed_filter, @@ -2645,12 +2580,12 @@ mod tests { }) .and(target_id.clone().is_null())); - assert!(placeholders.len() == 2); + assert_eq!(placeholders.len(), 2); - let captured_expressions = placeholders.values().collect_vec(); + let captured_expressions = placeholders.into_iter().map(|p| p.expr).collect_vec(); - assert!(captured_expressions.contains(&&source_id)); - assert!(captured_expressions.contains(&&source_id.is_null())); + assert!(captured_expressions.contains(&source_id)); + assert!(captured_expressions.contains(&source_id.is_null())); assert_eq!(generalized, expected_filter); } @@ -2666,7 +2601,7 @@ mod tests { .neg() .eq(col(Column::new(target.clone().into(), "id"))); - let mut placeholders = HashMap::default(); + let mut placeholders = Vec::default(); let generalized = generalize_filter( parsed_filter, @@ -2686,12 +2621,13 @@ mod tests { assert_eq!(generalized, expected_filter); assert_eq!(placeholders.len(), 1); - - let placeholder_expr = &placeholders["id_0"]; + let placeholder_expr = placeholders.first().unwrap(); let expected_placeholder = col(Column::new(source.clone().into(), "id")).neg(); - assert_eq!(placeholder_expr, &expected_placeholder); + assert_eq!(placeholder_expr.expr, expected_placeholder); + assert_eq!(placeholder_expr.alias, "id_0"); + assert!(!placeholder_expr.is_aggregate); } #[tokio::test] @@ -2704,7 +2640,7 @@ mod tests { .eq(col(Column::new(target.clone().into(), "id"))) .and(col(Column::new(target.clone().into(), "id")).eq(lit("C"))); - let mut placeholders = HashMap::default(); + let mut placeholders = Vec::default(); let generalized = generalize_filter( parsed_filter, @@ -2727,15 +2663,14 @@ mod tests { } #[tokio::test] - async fn 
test_generalize_filter_keeps_only_static_target_references() { + async fn test_generalize_filter_with_dynamic_target_range_references() { let source = TableReference::parse_str("source"); let target = TableReference::parse_str("target"); let parsed_filter = col(Column::new(source.clone().into(), "id")) - .eq(col(Column::new(target.clone().into(), "id"))) - .and(col(Column::new(target.clone().into(), "id")).eq(lit("C"))); + .eq(col(Column::new(target.clone().into(), "id"))); - let mut placeholders = HashMap::default(); + let mut placeholders = Vec::default(); let generalized = generalize_filter( parsed_filter, @@ -2745,8 +2680,16 @@ mod tests { &mut placeholders, ) .unwrap(); - - let expected_filter = col(Column::new(target.clone().into(), "id")).eq(lit("C")); + let expected_filter_l = Expr::Placeholder(Placeholder { + id: "id_0_min".to_owned(), + data_type: None, + }); + let expected_filter_h = Expr::Placeholder(Placeholder { + id: "id_0_max".to_owned(), + data_type: None, + }); + let expected_filter = col(Column::new(target.clone().into(), "id")) + .between(expected_filter_l, expected_filter_h); assert_eq!(generalized, expected_filter); } @@ -2760,7 +2703,7 @@ mod tests { .eq(col(Column::new(target.clone().into(), "id"))) .and(col(Column::new(source.clone().into(), "id")).eq(lit("C"))); - let mut placeholders = HashMap::default(); + let mut placeholders = Vec::default(); let generalized = generalize_filter( parsed_filter, @@ -2780,104 +2723,6 @@ mod tests { assert_eq!(generalized, expected_filter); } - #[tokio::test] - async fn test_try_construct_early_filter_with_partitions_expands() { - let schema = get_arrow_schema(&None); - let table = setup_table(Some(vec!["id"])).await; - - assert_eq!(table.version(), 0); - assert_eq!(table.get_files_count(), 0); - - let ctx = SessionContext::new(); - let batch = RecordBatch::try_new( - Arc::clone(&schema), - vec![ - Arc::new(arrow::array::StringArray::from(vec!["B", "C", "X"])), - Arc::new(arrow::array::Int32Array::from(vec![10, 20, 30])), - Arc::new(arrow::array::StringArray::from(vec![ - "2021-02-02", - "2023-07-04", - "2023-07-04", - ])), - ], - ) - .unwrap(); - let source = ctx.read_batch(batch).unwrap(); - - let source_name = TableReference::parse_str("source"); - let target_name = TableReference::parse_str("target"); - - let source = LogicalPlanBuilder::scan( - source_name.clone(), - provider_as_source(source.into_view()), - None, - ) - .unwrap() - .build() - .unwrap(); - - let join_predicate = col(Column { - relation: Some(source_name.clone()), - name: "id".to_owned(), - }) - .eq(col(Column { - relation: Some(target_name.clone()), - name: "id".to_owned(), - })); - - let pred = try_construct_early_filter( - join_predicate, - table.snapshot().unwrap(), - &ctx.state(), - &source, - &source_name, - &target_name, - ) - .await - .unwrap(); - - assert!(pred.is_some()); - - let split_pred = { - fn split(expr: Expr, parts: &mut Vec<(String, String)>) { - match expr { - Expr::BinaryExpr(ex) if ex.op == Operator::Or => { - split(*ex.left, parts); - split(*ex.right, parts); - } - Expr::BinaryExpr(ex) if ex.op == Operator::Eq => { - let col = match *ex.right { - Expr::Column(col) => col.name, - ex => panic!("expected column in pred, got {ex}!"), - }; - - let value = match *ex.left { - Expr::Literal(ScalarValue::Utf8(Some(value))) => value, - ex => panic!("expected value in predicate, got {ex}!"), - }; - - parts.push((col, value)) - } - - expr => panic!("expected either = or OR, got {expr}"), - } - } - - let mut parts = vec![]; - split(pred.unwrap(), &mut 
parts); - parts.sort(); - parts - }; - - let expected_pred_parts = [ - ("id".to_owned(), "B".to_owned()), - ("id".to_owned(), "C".to_owned()), - ("id".to_owned(), "X".to_owned()), - ]; - - assert_eq!(split_pred, expected_pred_parts); - } - #[tokio::test] async fn test_merge_pushdowns() { //See https://github.com/delta-io/delta-rs/issues/2158 @@ -3200,4 +3045,228 @@ mod tests { let actual = get_data(&table).await; assert_batches_sorted_eq!(&expected, &actual); } + + #[tokio::test] + async fn test_merge_cdc_disabled() { + let (table, source) = setup().await; + + let (table, metrics) = DeltaOps(table) + .merge(source, col("target.id").eq(col("source.id"))) + .with_source_alias("source") + .with_target_alias("target") + .when_matched_update(|update| { + update + .update("value", col("source.value")) + .update("modified", col("source.modified")) + }) + .unwrap() + .when_not_matched_by_source_update(|update| { + update + .predicate(col("target.value").eq(lit(1))) + .update("value", col("target.value") + lit(1)) + }) + .unwrap() + .when_not_matched_insert(|insert| { + insert + .set("id", col("source.id")) + .set("value", col("source.value")) + .set("modified", col("source.modified")) + }) + .unwrap() + .await + .unwrap(); + + assert_merge(table.clone(), metrics).await; + + // Just checking that the data wasn't actually written instead! + if let Ok(files) = crate::storage::utils::flatten_list_stream( + &table.object_store(), + Some(&object_store::path::Path::from("_change_data")), + ) + .await + { + assert_eq!( + 0, + files.len(), + "This test should not find any written CDC files! {files:#?}" + ); + } + } + + #[tokio::test] + async fn test_merge_cdc_enabled_simple() { + // Manually creating the desired table with the right minimum CDC features + use crate::kernel::Protocol; + use crate::operations::merge::Action; + + let schema = get_delta_schema(); + + let actions = vec![Action::Protocol(Protocol::new(1, 4))]; + let table: DeltaTable = DeltaOps::new_in_memory() + .create() + .with_columns(schema.fields().cloned()) + .with_actions(actions) + .with_configuration_property(TableProperty::EnableChangeDataFeed, Some("true")) + .await + .unwrap(); + assert_eq!(table.version(), 0); + + let schema = get_arrow_schema(&None); + let table = write_data(table, &schema).await; + + assert_eq!(table.version(), 1); + assert_eq!(table.get_files_count(), 1); + let source = merge_source(schema); + + let (table, metrics) = DeltaOps(table) + .merge(source, col("target.id").eq(col("source.id"))) + .with_source_alias("source") + .with_target_alias("target") + .when_matched_update(|update| { + update + .update("value", col("source.value")) + .update("modified", col("source.modified")) + }) + .unwrap() + .when_not_matched_by_source_update(|update| { + update + .predicate(col("target.value").eq(lit(1))) + .update("value", col("target.value") + lit(1)) + }) + .unwrap() + .when_not_matched_insert(|insert| { + insert + .set("id", col("source.id")) + .set("value", col("source.value")) + .set("modified", col("source.modified")) + }) + .unwrap() + .await + .unwrap(); + + assert_merge(table.clone(), metrics).await; + + let ctx = SessionContext::new(); + let table = DeltaOps(table) + .load_cdf() + .with_session_ctx(ctx.clone()) + .with_starting_version(0) + .build() + .await + .expect("Failed to load CDF"); + + let mut batches = collect_batches( + table.properties().output_partitioning().partition_count(), + table, + ctx, + ) + .await + .expect("Failed to collect batches"); + + let _ = arrow::util::pretty::print_batches(&batches); 
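+        // `_commit_timestamp` (column index 5 of the CDF read: id, value, modified,
+        // _change_type, _commit_version, _commit_timestamp) is wall-clock time assigned at
+        // commit, so it is non-deterministic across runs; it is dropped below before the
+        // batches are compared.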
+ + // The batches will contain a current _commit_timestamp which shouldn't be check_append_only + let _: Vec<_> = batches.iter_mut().map(|b| b.remove_column(5)).collect(); + + assert_batches_sorted_eq! {[ + "+----+-------+------------+------------------+-----------------+", + "| id | value | modified | _change_type | _commit_version |", + "+----+-------+------------+------------------+-----------------+", + "| A | 1 | 2021-02-01 | update_preimage | 2 |", + "| A | 2 | 2021-02-01 | update_postimage | 2 |", + "| B | 10 | 2021-02-01 | update_preimage | 2 |", + "| B | 10 | 2021-02-02 | update_postimage | 2 |", + "| C | 10 | 2021-02-02 | update_preimage | 2 |", + "| C | 20 | 2023-07-04 | update_postimage | 2 |", + "| X | 30 | 2023-07-04 | insert | 2 |", + "| A | 1 | 2021-02-01 | insert | 1 |", + "| B | 10 | 2021-02-01 | insert | 1 |", + "| C | 10 | 2021-02-02 | insert | 1 |", + "| D | 100 | 2021-02-02 | insert | 1 |", + "+----+-------+------------+------------------+-----------------+", + ], &batches } + } + + #[tokio::test] + async fn test_merge_cdc_enabled_delete() { + // Manually creating the desired table with the right minimum CDC features + use crate::kernel::Protocol; + use crate::operations::merge::Action; + + let schema = get_delta_schema(); + + let actions = vec![Action::Protocol(Protocol::new(1, 4))]; + let table: DeltaTable = DeltaOps::new_in_memory() + .create() + .with_columns(schema.fields().cloned()) + .with_actions(actions) + .with_configuration_property(TableProperty::EnableChangeDataFeed, Some("true")) + .await + .unwrap(); + assert_eq!(table.version(), 0); + + let schema = get_arrow_schema(&None); + let table = write_data(table, &schema).await; + + assert_eq!(table.version(), 1); + assert_eq!(table.get_files_count(), 1); + let source = merge_source(schema); + + let (table, _metrics) = DeltaOps(table) + .merge(source, col("target.id").eq(col("source.id"))) + .with_source_alias("source") + .with_target_alias("target") + .when_not_matched_by_source_delete(|delete| { + delete.predicate(col("target.modified").gt(lit("2021-02-01"))) + }) + .unwrap() + .await + .unwrap(); + + let expected = vec![ + "+----+-------+------------+", + "| id | value | modified |", + "+----+-------+------------+", + "| A | 1 | 2021-02-01 |", + "| B | 10 | 2021-02-01 |", + "| C | 10 | 2021-02-02 |", + "+----+-------+------------+", + ]; + let actual = get_data(&table).await; + assert_batches_sorted_eq!(&expected, &actual); + + let ctx = SessionContext::new(); + let table = DeltaOps(table) + .load_cdf() + .with_session_ctx(ctx.clone()) + .with_starting_version(0) + .build() + .await + .expect("Failed to load CDF"); + + let mut batches = collect_batches( + table.properties().output_partitioning().partition_count(), + table, + ctx, + ) + .await + .expect("Failed to collect batches"); + + let _ = arrow::util::pretty::print_batches(&batches); + + // The batches will contain a current _commit_timestamp which shouldn't be check_append_only + let _: Vec<_> = batches.iter_mut().map(|b| b.remove_column(5)).collect(); + + assert_batches_sorted_eq! 
{[ + "+----+-------+------------+--------------+-----------------+", + "| id | value | modified | _change_type | _commit_version |", + "+----+-------+------------+--------------+-----------------+", + "| D | 100 | 2021-02-02 | delete | 2 |", + "| A | 1 | 2021-02-01 | insert | 1 |", + "| B | 10 | 2021-02-01 | insert | 1 |", + "| C | 10 | 2021-02-02 | insert | 1 |", + "| D | 100 | 2021-02-02 | insert | 1 |", + "+----+-------+------------+--------------+-----------------+", + ], &batches } + } } diff --git a/crates/core/src/operations/mod.rs b/crates/core/src/operations/mod.rs index 7923431d45..c71141d277 100644 --- a/crates/core/src/operations/mod.rs +++ b/crates/core/src/operations/mod.rs @@ -6,15 +6,33 @@ //! the operations' behaviors and will return an updated table potentially in conjunction //! with a [data stream][datafusion::physical_plan::SendableRecordBatchStream], //! if the operation returns data as well. +use std::collections::HashMap; + +use add_feature::AddTableFeatureBuilder; +#[cfg(feature = "datafusion")] +use arrow_array::RecordBatch; +#[cfg(feature = "datafusion")] +pub use datafusion_physical_plan::common::collect as collect_sendable_stream; +use self::add_column::AddColumnBuilder; use self::create::CreateBuilder; use self::filesystem_check::FileSystemCheckBuilder; +use self::optimize::OptimizeBuilder; +use self::restore::RestoreBuilder; +use self::set_tbl_properties::SetTablePropertiesBuilder; use self::vacuum::VacuumBuilder; +#[cfg(feature = "datafusion")] +use self::{ + constraints::ConstraintBuilder, datafusion_utils::Expression, delete::DeleteBuilder, + drop_constraints::DropConstraintBuilder, load::LoadBuilder, load_cdf::CdfLoadBuilder, + merge::MergeBuilder, update::UpdateBuilder, write::WriteBuilder, +}; use crate::errors::{DeltaResult, DeltaTableError}; use crate::table::builder::DeltaTableBuilder; use crate::DeltaTable; -use std::collections::HashMap; +pub mod add_column; +pub mod add_feature; pub mod cast; pub mod convert_to_delta; pub mod create; @@ -25,20 +43,8 @@ pub mod restore; pub mod transaction; pub mod vacuum; -#[cfg(feature = "datafusion")] -use self::{ - constraints::ConstraintBuilder, datafusion_utils::Expression, delete::DeleteBuilder, - drop_constraints::DropConstraintBuilder, load::LoadBuilder, load_cdf::CdfLoadBuilder, - merge::MergeBuilder, update::UpdateBuilder, write::WriteBuilder, -}; -#[cfg(feature = "datafusion")] -pub use ::datafusion::physical_plan::common::collect as collect_sendable_stream; -#[cfg(feature = "datafusion")] -use arrow::record_batch::RecordBatch; -use optimize::OptimizeBuilder; -use restore::RestoreBuilder; -use set_tbl_properties::SetTablePropertiesBuilder; - +#[cfg(all(feature = "cdf", feature = "datafusion"))] +mod cdc; #[cfg(feature = "datafusion")] pub mod constraints; #[cfg(feature = "datafusion")] @@ -56,6 +62,7 @@ pub mod update; pub mod write; pub mod writer; +#[allow(unused)] /// The [Operation] trait defines common behaviors that all operations builders /// should have consistent pub(crate) trait Operation: std::future::IntoFuture {} @@ -215,6 +222,12 @@ impl DeltaOps { ConstraintBuilder::new(self.0.log_store, self.0.state.unwrap()) } + /// Enable a table feature for a table + #[must_use] + pub fn add_feature(self) -> AddTableFeatureBuilder { + AddTableFeatureBuilder::new(self.0.log_store, self.0.state.unwrap()) + } + /// Drops constraints from a table #[cfg(feature = "datafusion")] #[must_use] @@ -226,6 +239,11 @@ impl DeltaOps { pub fn set_tbl_properties(self) -> SetTablePropertiesBuilder { 
SetTablePropertiesBuilder::new(self.0.log_store, self.0.state.unwrap()) } + + /// Add new columns + pub fn add_columns(self) -> AddColumnBuilder { + AddColumnBuilder::new(self.0.log_store, self.0.state.unwrap()) + } } impl From for DeltaOps { @@ -273,6 +291,22 @@ pub fn get_num_idx_cols_and_stats_columns( ) } +/// Get the target_file_size from the table configuration in the sates +/// If table_config does not exist (only can occur in the first write action) it takes +/// the configuration that was passed to the writerBuilder. +pub(crate) fn get_target_file_size( + config: &Option>, + configuration: &HashMap>, +) -> i64 { + match &config { + Some(conf) => conf.target_file_size(), + _ => configuration + .get("delta.targetFileSize") + .and_then(|v| v.clone().map(|v| v.parse::().unwrap())) + .unwrap_or(crate::table::config::DEFAULT_TARGET_FILE_SIZE), + } +} + #[cfg(feature = "datafusion")] mod datafusion_utils { use datafusion::execution::context::SessionState; @@ -282,6 +316,7 @@ mod datafusion_utils { use crate::{delta_datafusion::expr::parse_predicate_expression, DeltaResult}; /// Used to represent user input of either a Datafusion expression or string expression + #[derive(Debug)] pub enum Expression { /// Datafusion Expression DataFusion(Expr), diff --git a/crates/core/src/operations/optimize.rs b/crates/core/src/operations/optimize.rs index 10cbb6a22a..cf096d56d1 100644 --- a/crates/core/src/operations/optimize.rs +++ b/crates/core/src/operations/optimize.rs @@ -25,8 +25,9 @@ use std::fmt; use std::sync::Arc; use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; -use arrow::datatypes::SchemaRef as ArrowSchemaRef; use arrow_array::RecordBatch; +use arrow_schema::SchemaRef as ArrowSchemaRef; +use delta_kernel::expressions::Scalar; use futures::future::BoxFuture; use futures::stream::BoxStream; use futures::{Future, StreamExt, TryStreamExt}; @@ -38,12 +39,13 @@ use parquet::basic::{Compression, ZstdLevel}; use parquet::errors::ParquetError; use parquet::file::properties::WriterProperties; use serde::{de::Error as DeError, Deserialize, Deserializer, Serialize, Serializer}; -use tracing::debug; +use tracing::*; +use url::Url; use super::transaction::PROTOCOL; use super::writer::{PartitionWriter, PartitionWriterConfig}; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::kernel::{Action, PartitionsExt, Remove, Scalar}; +use crate::kernel::{scalars::ScalarExt, Action, PartitionsExt, Remove}; use crate::logstore::LogStoreRef; use crate::operations::transaction::{CommitBuilder, CommitProperties, DEFAULT_RETRIES}; use crate::protocol::DeltaOperation; @@ -136,6 +138,7 @@ impl fmt::Display for MetricDetails { } } +#[derive(Debug)] /// Metrics for a single partition pub struct PartialMetrics { /// Number of optimized files added @@ -202,9 +205,9 @@ pub struct OptimizeBuilder<'a> { commit_properties: CommitProperties, /// Whether to preserve insertion order within files (default false) preserve_insertion_order: bool, - /// Max number of concurrent tasks (default is number of cpus) + /// Maximum number of concurrent tasks (default is number of cpus) max_concurrent_tasks: usize, - /// Maximum number of bytes that are allowed to spill to disk + /// Maximum number of bytes allowed in memory before spilling to disk max_spill_size: usize, /// Optimize type optimize_type: OptimizeType, @@ -225,7 +228,7 @@ impl<'a> OptimizeBuilder<'a> { commit_properties: CommitProperties::default(), preserve_insertion_order: false, max_concurrent_tasks: num_cpus::get(), - max_spill_size: 20 * 1024 * 1024 * 
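The fallback described in the `get_target_file_size` doc comment above can be restated as a small standalone sketch (this is not the delta-rs signature, and the default value is assumed for illustration only): the table's own configuration wins, otherwise a `delta.targetFileSize` entry passed to the writer is used, otherwise a built-in default.

```rust
use std::collections::HashMap;

// Assumed default for the sketch; the real crate defines its own constant.
const DEFAULT_TARGET_FILE_SIZE: i64 = 104_857_600;

fn resolve_target_file_size(
    table_config_value: Option<i64>,
    writer_configuration: &HashMap<String, Option<String>>,
) -> i64 {
    table_config_value
        .or_else(|| {
            writer_configuration
                .get("delta.targetFileSize")
                .and_then(|v| v.as_ref().and_then(|v| v.parse::<i64>().ok()))
        })
        .unwrap_or(DEFAULT_TARGET_FILE_SIZE)
}

fn main() {
    let conf = HashMap::from([(
        "delta.targetFileSize".to_string(),
        Some("134217728".to_string()),
    )]);
    // No table config yet (first write): fall back to the writer configuration.
    assert_eq!(resolve_target_file_size(None, &conf), 134_217_728);
    // Table config exists: it takes precedence.
    assert_eq!(resolve_target_file_size(Some(32_000_000), &conf), 32_000_000);
}
```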
2014, // 20 GB. + max_spill_size: 20 * 1024 * 1024 * 1024, // 20 GB. optimize_type: OptimizeType::Compact, min_commit_interval: None, } @@ -295,6 +298,9 @@ impl<'a> std::future::IntoFuture for OptimizeBuilder<'a> { Box::pin(async move { PROTOCOL.can_write_to(&this.snapshot.snapshot)?; + if !&this.snapshot.load_config().require_files { + return Err(DeltaTableError::NotInitializedWithFiles("OPTIMIZE".into())); + } let writer_properties = this.writer_properties.unwrap_or_else(|| { WriterProperties::builder() @@ -341,6 +347,7 @@ impl From for DeltaOperation { } } +/// Generate an appropriate remove action for the optimization task fn create_remove( path: &str, partitions: &IndexMap, @@ -602,12 +609,26 @@ impl MergePlan { use datafusion_expr::expr::ScalarFunction; use datafusion_expr::{Expr, ScalarUDF}; - let locations = files + // This code is ... not ideal. Essentially `read_parquet` expects Strings that it will then + // parse as URLs and then pass back to the object store (x_x). This can cause problems when + // paths in object storage have special characters like spaces, etc. + // + // This [str::replace] i kind of a hack to address + // + let locations: Vec = files .iter() - .map(|file| format!("delta-rs:///{}", file.location)) - .collect_vec(); + .map(|om| { + format!( + "delta-rs:///{}", + str::replace(om.location.as_ref(), "%", "%25") + ) + }) + .collect(); + debug!("Reading z-order with locations are: {locations:?}"); + let df = context .ctx + // TODO: should read options have the partition columns .read_parquet(locations, ParquetReadOptions::default()) .await?; @@ -708,6 +729,7 @@ impl MergePlan { bins.len() <= num_cpus::get(), )); + debug!("Starting zorder with the columns: {zorder_columns:?} {bins:?}"); #[cfg(feature = "datafusion")] let exec_context = Arc::new(zorder::ZOrderExecContext::new( zorder_columns, @@ -715,6 +737,7 @@ impl MergePlan { max_spill_size, )?); let task_parameters = self.task_parameters.clone(); + let log_store = log_store.clone(); futures::stream::iter(bins) .map(move |(_, (partition, files))| { @@ -887,9 +910,7 @@ impl MergeBin { self.size_bytes += meta.size as i64; self.files.push(meta); } -} -impl MergeBin { fn iter(&self) -> impl Iterator { self.files.iter() } @@ -1001,7 +1022,6 @@ fn build_zorder_plan( let field_names = snapshot .schema() .fields() - .iter() .map(|field| field.name().to_string()) .collect_vec(); let unknown_columns = zorder_columns @@ -1033,6 +1053,7 @@ fn build_zorder_plan( .or_insert_with(|| (partition_values, MergeBin::new())) .1 .add(object_meta); + error!("partition_files inside the zorder plan: {partition_files:?}"); } let operation = OptimizeOperations::ZOrder(zorder_columns, partition_files); @@ -1226,7 +1247,6 @@ pub(super) mod zorder { let runtime = Arc::new(RuntimeEnv::new(config)?); runtime.register_object_store(&Url::parse("delta-rs://").unwrap(), object_store); - use url::Url; let ctx = SessionContext::new_with_config_rt(SessionConfig::default(), runtime); ctx.register_udf(ScalarUDF::from(datafusion::ZOrderUDF)); Ok(Self { columns, ctx }) @@ -1266,6 +1286,7 @@ pub(super) mod zorder { fn zorder_key_datafusion( columns: &[ColumnarValue], ) -> Result { + debug!("zorder_key_datafusion: {columns:#?}"); let length = columns .iter() .map(|col| match col { @@ -1420,6 +1441,94 @@ pub(super) mod zorder { .await; assert!(res.is_ok()); } + + /// Issue + #[tokio::test] + async fn test_zorder_space_in_partition_value() { + use arrow_schema::Schema as ArrowSchema; + let _ = pretty_env_logger::try_init(); + let schema = 
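The comment above the `read_parquet` call explains that DataFusion parses the passed locations as URLs, so a literal `%` in an object-store path has to be re-encoded before hand-off or it would be decoded a second time. A self-contained sketch of that idea (the helper name is made up for illustration and is not part of delta-rs):

```rust
// Re-encode '%' so that a path that already contains percent-encoded characters
// (e.g. a partition value with a space) survives DataFusion's URL parsing intact.
fn to_read_parquet_location(location: &str) -> String {
    format!("delta-rs:///{}", location.replace('%', "%25"))
}

fn main() {
    let loc = "country=Dominican%20Republic/part-00000.parquet";
    assert_eq!(
        to_read_parquet_location(loc),
        "delta-rs:///country=Dominican%2520Republic/part-00000.parquet"
    );
    println!("{}", to_read_parquet_location(loc));
}
```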
Arc::new(ArrowSchema::new(vec![ + Field::new("modified", DataType::Utf8, true), + Field::new("country", DataType::Utf8, true), + Field::new("value", DataType::Int32, true), + ])); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(arrow::array::StringArray::from(vec![ + "2021-02-01", + "2021-02-01", + "2021-02-02", + "2021-02-02", + ])), + Arc::new(arrow::array::StringArray::from(vec![ + "Germany", + "China", + "Canada", + "Dominican Republic", + ])), + Arc::new(arrow::array::Int32Array::from(vec![1, 10, 20, 100])), + //Arc::new(arrow::array::StringArray::from(vec!["Dominican Republic"])), + //Arc::new(arrow::array::Int32Array::from(vec![100])), + ], + ) + .unwrap(); + // write some data + let table = crate::DeltaOps::new_in_memory() + .write(vec![batch.clone()]) + .with_partition_columns(vec!["country"]) + .with_save_mode(crate::protocol::SaveMode::Overwrite) + .await + .unwrap(); + + let res = crate::DeltaOps(table) + .optimize() + .with_type(OptimizeType::ZOrder(vec!["modified".into()])) + .await; + assert!(res.is_ok(), "Failed to optimize: {res:#?}"); + } + + #[tokio::test] + async fn test_zorder_space_in_partition_value_garbage() { + use arrow_schema::Schema as ArrowSchema; + let _ = pretty_env_logger::try_init(); + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("modified", DataType::Utf8, true), + Field::new("country", DataType::Utf8, true), + Field::new("value", DataType::Int32, true), + ])); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(arrow::array::StringArray::from(vec![ + "2021-02-01", + "2021-02-01", + "2021-02-02", + "2021-02-02", + ])), + Arc::new(arrow::array::StringArray::from(vec![ + "Germany", "China", "Canada", "USA$$!", + ])), + Arc::new(arrow::array::Int32Array::from(vec![1, 10, 20, 100])), + ], + ) + .unwrap(); + // write some data + let table = crate::DeltaOps::new_in_memory() + .write(vec![batch.clone()]) + .with_partition_columns(vec!["country"]) + .with_save_mode(crate::protocol::SaveMode::Overwrite) + .await + .unwrap(); + + let res = crate::DeltaOps(table) + .optimize() + .with_type(OptimizeType::ZOrder(vec!["modified".into()])) + .await; + assert!(res.is_ok(), "Failed to optimize: {res:#?}"); + } } } @@ -1575,5 +1684,30 @@ pub(super) mod zorder { assert_eq!(data.value_data().len(), 3 * 16 * 3); assert!(data.iter().all(|x| x.unwrap().len() == 3 * 16)); } + + #[tokio::test] + async fn works_on_spark_table() { + use crate::DeltaOps; + use tempfile::TempDir; + // Create a temporary directory + let tmp_dir = TempDir::new().expect("Failed to make temp dir"); + let table_name = "delta-1.2.1-only-struct-stats"; + + // Copy recursively from the test data directory to the temporary directory + let source_path = format!("../test/tests/data/{table_name}"); + fs_extra::dir::copy(source_path, tmp_dir.path(), &Default::default()).unwrap(); + + // Run optimize + let (_, metrics) = + DeltaOps::try_from_uri(tmp_dir.path().join(table_name).to_str().unwrap()) + .await + .unwrap() + .optimize() + .await + .unwrap(); + + // Verify it worked + assert_eq!(metrics.num_files_added, 1); + } } } diff --git a/crates/core/src/operations/restore.rs b/crates/core/src/operations/restore.rs index e2ab9741bc..498edc67c0 100644 --- a/crates/core/src/operations/restore.rs +++ b/crates/core/src/operations/restore.rs @@ -4,14 +4,14 @@ //! 1) Read the latest state snapshot of the table. //! 2) Read table state for version or datetime to restore //! 
3) Compute files available in state for restoring (files were removed by some commit) -//! but missed in the latest. Add these files into commit as AddFile action. +//! but missed in the latest. Add these files into commit as AddFile action. //! 4) Compute files available in the latest state snapshot (files were added after version to restore) -//! but missed in the state to restore. Add these files into commit as RemoveFile action. +//! but missed in the state to restore. Add these files into commit as RemoveFile action. //! 5) If ignore_missing_files option is false (default value) check availability of AddFile -//! in file system. +//! in file system. //! 6) Commit Protocol, all RemoveFile and AddFile actions -//! into delta log using `LogStore::write_commit_entry` (commit will be failed in case of parallel transaction) -//! TODO: comment is outdated +//! into delta log using `LogStore::write_commit_entry` (commit will be failed in case of parallel transaction) +//! TODO: comment is outdated //! 7) If table was modified in parallel then ignore restore and raise exception. //! //! # Example @@ -272,14 +272,19 @@ async fn execute( .await?; let commit_version = snapshot.version() + 1; - let commit = prepared_commit.path(); - match log_store.write_commit_entry(commit_version, commit).await { + let commit_bytes = prepared_commit.commit_or_bytes(); + match log_store + .write_commit_entry(commit_version, commit_bytes.clone()) + .await + { Ok(_) => {} Err(err @ TransactionError::VersionAlreadyExists(_)) => { return Err(err.into()); } Err(err) => { - log_store.abort_commit_entry(commit_version, commit).await?; + log_store + .abort_commit_entry(commit_version, commit_bytes.clone()) + .await?; return Err(err.into()); } } diff --git a/crates/core/src/operations/set_tbl_properties.rs b/crates/core/src/operations/set_tbl_properties.rs index e0c4ea2e9a..b3ca7607ac 100644 --- a/crates/core/src/operations/set_tbl_properties.rs +++ b/crates/core/src/operations/set_tbl_properties.rs @@ -1,18 +1,16 @@ //! 
Set table properties on a table -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use futures::future::BoxFuture; -use maplit::hashset; use super::transaction::{CommitBuilder, CommitProperties}; -use crate::kernel::{Action, Protocol, ReaderFeatures, WriterFeatures}; +use crate::kernel::Action; use crate::logstore::LogStoreRef; use crate::protocol::DeltaOperation; use crate::table::state::DeltaTableState; -use crate::DeltaConfigKey; +use crate::DeltaResult; use crate::DeltaTable; -use crate::{DeltaResult, DeltaTableError}; /// Remove constraints from the table pub struct SetTablePropertiesBuilder { @@ -59,203 +57,6 @@ impl SetTablePropertiesBuilder { } } -/// Will apply the properties to the protocol by either bumping the version or setting -/// features -pub fn apply_properties_to_protocol( - current_protocol: &Protocol, - new_properties: &HashMap, - raise_if_not_exists: bool, -) -> DeltaResult { - let mut parsed_properties: HashMap = HashMap::new(); - - for (key, value) in new_properties { - if let Ok(parsed_key) = key.parse::() { - parsed_properties.insert(parsed_key, value.to_string()); - } else if raise_if_not_exists { - return Err(DeltaTableError::Generic(format!( - "Error parsing property '{}':'{}'", - key, value - ))); - } - } - - let mut new_protocol = current_protocol.clone(); - - // Check and update delta.minReaderVersion - if let Some(min_reader_version) = parsed_properties.get(&DeltaConfigKey::MinReaderVersion) { - let new_min_reader_version = min_reader_version.parse::(); - match new_min_reader_version { - Ok(version) => match version { - 1..=3 => { - if version > new_protocol.min_reader_version { - new_protocol.min_reader_version = version - } - } - _ => { - return Err(DeltaTableError::Generic(format!( - "delta.minReaderVersion = '{}' is invalid, valid values are ['1','2','3']", - min_reader_version - ))) - } - }, - Err(_) => { - return Err(DeltaTableError::Generic(format!( - "delta.minReaderVersion = '{}' is invalid, valid values are ['1','2','3']", - min_reader_version - ))) - } - } - } - - // Check and update delta.minWriterVersion - if let Some(min_writer_version) = parsed_properties.get(&DeltaConfigKey::MinWriterVersion) { - let new_min_writer_version = min_writer_version.parse::(); - match new_min_writer_version { - Ok(version) => match version { - 2..=7 => { - if version > new_protocol.min_writer_version { - new_protocol.min_writer_version = version - } - } - _ => { - return Err(DeltaTableError::Generic(format!( - "delta.minWriterVersion = '{}' is invalid, valid values are ['2','3','4','5','6','7']", - min_writer_version - ))) - } - }, - Err(_) => { - return Err(DeltaTableError::Generic(format!( - "delta.minWriterVersion = '{}' is invalid, valid values are ['2','3','4','5','6','7']", - min_writer_version - ))) - } - } - } - - // Check enableChangeDataFeed and bump protocol or add writerFeature if writer versions is >=7 - if let Some(enable_cdf) = parsed_properties.get(&DeltaConfigKey::EnableChangeDataFeed) { - let if_enable_cdf = enable_cdf.to_ascii_lowercase().parse::(); - match if_enable_cdf { - Ok(true) => { - if new_protocol.min_writer_version >= 7 { - match new_protocol.writer_features { - Some(mut features) => { - features.insert(WriterFeatures::ChangeDataFeed); - new_protocol.writer_features = Some(features); - } - None => { - new_protocol.writer_features = - Some(hashset! 
{WriterFeatures::ChangeDataFeed}) - } - } - } else if new_protocol.min_writer_version <= 3 { - new_protocol.min_writer_version = 4 - } - } - Ok(false) => {} - _ => { - return Err(DeltaTableError::Generic(format!( - "delta.enableChangeDataFeed = '{}' is invalid, valid values are ['true']", - enable_cdf - ))) - } - } - } - - if let Some(enable_dv) = parsed_properties.get(&DeltaConfigKey::EnableDeletionVectors) { - let if_enable_dv = enable_dv.to_ascii_lowercase().parse::(); - match if_enable_dv { - Ok(true) => { - let writer_features = match new_protocol.writer_features { - Some(mut features) => { - features.insert(WriterFeatures::DeletionVectors); - features - } - None => hashset! {WriterFeatures::DeletionVectors}, - }; - let reader_features = match new_protocol.reader_features { - Some(mut features) => { - features.insert(ReaderFeatures::DeletionVectors); - features - } - None => hashset! {ReaderFeatures::DeletionVectors}, - }; - new_protocol.min_reader_version = 3; - new_protocol.min_writer_version = 7; - new_protocol.writer_features = Some(writer_features); - new_protocol.reader_features = Some(reader_features); - } - Ok(false) => {} - _ => { - return Err(DeltaTableError::Generic(format!( - "delta.enableDeletionVectors = '{}' is invalid, valid values are ['true']", - enable_dv - ))) - } - } - } - - Ok(new_protocol) -} - -/// Converts existing properties into features if the reader_version is >=3 or writer_version >=3 -/// only converts features that are "true" -pub fn convert_properties_to_features( - mut new_protocol: Protocol, - configuration: &HashMap>, -) -> Protocol { - if new_protocol.min_writer_version >= 7 { - let mut converted_writer_features = configuration - .iter() - .filter(|(_, value)| { - value.as_ref().map_or(false, |v| { - v.to_ascii_lowercase().parse::().is_ok_and(|v| v) - }) - }) - .collect::>>() - .keys() - .map(|key| (*key).clone().into()) - .filter(|v| !matches!(v, WriterFeatures::Other(_))) - .collect::>(); - - if configuration - .keys() - .any(|v| v.contains("delta.constraints.")) - { - converted_writer_features.insert(WriterFeatures::CheckConstraints); - } - - match new_protocol.writer_features { - Some(mut features) => { - features.extend(converted_writer_features); - new_protocol.writer_features = Some(features); - } - None => new_protocol.writer_features = Some(converted_writer_features), - } - } - if new_protocol.min_reader_version >= 3 { - let converted_reader_features = configuration - .iter() - .filter(|(_, value)| { - value.as_ref().map_or(false, |v| { - v.to_ascii_lowercase().parse::().is_ok_and(|v| v) - }) - }) - .map(|(key, _)| (*key).clone().into()) - .filter(|v| !matches!(v, ReaderFeatures::Other(_))) - .collect::>(); - match new_protocol.reader_features { - Some(mut features) => { - features.extend(converted_reader_features); - new_protocol.reader_features = Some(features); - } - None => new_protocol.reader_features = Some(converted_reader_features), - } - } - new_protocol -} - impl std::future::IntoFuture for SetTablePropertiesBuilder { type Output = DeltaResult; @@ -270,11 +71,9 @@ impl std::future::IntoFuture for SetTablePropertiesBuilder { let current_protocol = this.snapshot.protocol(); let properties = this.properties; - let new_protocol = apply_properties_to_protocol( - current_protocol, - &properties, - this.raise_if_not_exists, - )?; + let new_protocol = current_protocol + .clone() + .apply_properties_to_protocol(&properties, this.raise_if_not_exists)?; metadata.configuration.extend( properties @@ -285,7 +84,7 @@ impl 
std::future::IntoFuture for SetTablePropertiesBuilder { ); let final_protocol = - convert_properties_to_features(new_protocol, &metadata.configuration); + new_protocol.move_table_properties_into_features(&metadata.configuration); let operation = DeltaOperation::SetTableProperties { properties }; diff --git a/crates/core/src/operations/transaction/conflict_checker.rs b/crates/core/src/operations/transaction/conflict_checker.rs index d44c704b53..d163ba2f9b 100644 --- a/crates/core/src/operations/transaction/conflict_checker.rs +++ b/crates/core/src/operations/transaction/conflict_checker.rs @@ -645,28 +645,30 @@ pub(super) fn can_downgrade_to_snapshot_isolation<'a>( #[cfg(test)] #[allow(unused)] mod tests { - use super::super::test_utils as tu; - use super::super::test_utils::init_table_actions; - use super::*; - use crate::kernel::Action; + use std::collections::HashMap; + #[cfg(feature = "datafusion")] use datafusion_expr::{col, lit}; use serde_json::json; - fn get_stats(min: i64, max: i64) -> Option { - let data = json!({ - "numRecords": 18, - "minValues": { - "value": min - }, - "maxValues": { - "value": max - }, - "nullCount": { - "value": 0 - } - }); - Some(data.to_string()) + use super::*; + use crate::kernel::Action; + use crate::test_utils::{ActionFactory, TestSchemas}; + + fn simple_add(data_change: bool, min: &str, max: &str) -> Add { + ActionFactory::add( + TestSchemas::simple(), + HashMap::from_iter([("value", (min, max))]), + Default::default(), + true, + ) + } + + fn init_table_actions() -> Vec { + vec![ + ActionFactory::protocol(None, None, None::>, None::>).into(), + ActionFactory::metadata(TestSchemas::simple(), None::>, None).into(), + ] } #[test] @@ -676,7 +678,8 @@ mod tests { predicate: None, target_size: 0, }; - let add = tu::create_add_action("p", false, None); + let add = + ActionFactory::add(TestSchemas::simple(), HashMap::new(), Vec::new(), true).into(); let res = can_downgrade_to_snapshot_isolation(&[add], &operation, &isolation); assert!(!res) } @@ -697,7 +700,7 @@ mod tests { ) -> Result<(), CommitConflictError> { use crate::table::state::DeltaTableState; - let setup_actions = setup.unwrap_or_else(|| init_table_actions(None)); + let setup_actions = setup.unwrap_or_else(init_table_actions); let state = DeltaTableState::from_actions(setup_actions).unwrap(); let snapshot = state.snapshot(); let transaction_info = TransactionInfo::new(snapshot, reads, &actions, read_whole_table); @@ -715,22 +718,23 @@ mod tests { async fn test_allowed_concurrent_actions() { // append - append // append file to table while a concurrent writer also appends a file - let file1 = tu::create_add_action("file1", true, get_stats(1, 10)); - let file2 = tu::create_add_action("file2", true, get_stats(1, 10)); + let file1 = simple_add(true, "1", "10").into(); + let file2 = simple_add(true, "1", "10").into(); + let result = execute_test(None, None, vec![file1], vec![file2], false); assert!(result.is_ok()); // disjoint delete - read // the concurrent transaction deletes a file that the current transaction did NOT read - let file_not_read = tu::create_add_action("file_not_read", true, get_stats(1, 10)); - let file_read = tu::create_add_action("file_read", true, get_stats(100, 10000)); - let mut setup_actions = init_table_actions(None); - setup_actions.push(file_not_read); + let file_not_read = simple_add(true, "1", "10"); + let file_read = simple_add(true, "100", "10000").into(); + let mut setup_actions = init_table_actions(); + setup_actions.push(file_not_read.clone().into()); 
setup_actions.push(file_read); let result = execute_test( Some(setup_actions), Some(col("value").gt(lit::(10))), - vec![tu::create_remove_action("file_not_read", true)], + vec![ActionFactory::remove(&file_not_read, true).into()], vec![], false, ); @@ -738,9 +742,9 @@ mod tests { // disjoint add - read // concurrently add file, that the current transaction would not have read - let file_added = tu::create_add_action("file_added", true, get_stats(1, 10)); - let file_read = tu::create_add_action("file_read", true, get_stats(100, 10000)); - let mut setup_actions = init_table_actions(None); + let file_added = simple_add(true, "1", "10").into(); + let file_read = simple_add(true, "100", "10000").into(); + let mut setup_actions = init_table_actions(); setup_actions.push(file_read); let result = execute_test( Some(setup_actions), @@ -774,7 +778,8 @@ mod tests { async fn test_disallowed_concurrent_actions() { // delete - delete // remove file from table that has previously been removed - let removed_file = tu::create_remove_action("removed_file", true); + let removed_file = simple_add(true, "1", "10"); + let removed_file: Action = ActionFactory::remove(&removed_file, true).into(); let result = execute_test( None, None, @@ -789,9 +794,8 @@ mod tests { // add / read + write // a file is concurrently added that should have been read by the current transaction - let file_added = tu::create_add_action("file_added", true, get_stats(1, 10)); - let file_should_have_read = - tu::create_add_action("file_should_have_read", true, get_stats(1, 10)); + let file_added = simple_add(true, "1", "10").into(); + let file_should_have_read = simple_add(true, "1", "10").into(); let result = execute_test( None, Some(col("value").lt_eq(lit::(10))), @@ -803,13 +807,13 @@ mod tests { // delete / read // transaction reads a file that is removed by concurrent transaction - let file_read = tu::create_add_action("file_read", true, get_stats(1, 10)); - let mut setup_actions = init_table_actions(None); - setup_actions.push(file_read); + let file_read = simple_add(true, "1", "10"); + let mut setup_actions = init_table_actions(); + setup_actions.push(file_read.clone().into()); let result = execute_test( Some(setup_actions), Some(col("value").lt_eq(lit::(10))), - vec![tu::create_remove_action("file_read", true)], + vec![ActionFactory::remove(&file_read, true).into()], vec![], false, ); @@ -823,7 +827,7 @@ mod tests { let result = execute_test( None, None, - vec![tu::create_metadata_action(None, None)], + vec![ActionFactory::metadata(TestSchemas::simple(), None::>, None).into()], vec![], false, ); @@ -834,8 +838,8 @@ mod tests { let result = execute_test( None, None, - vec![tu::create_protocol_action(None, None)], - vec![tu::create_protocol_action(None, None)], + vec![ActionFactory::protocol(None, None, None::>, None::>).into()], + vec![ActionFactory::protocol(None, None, None::>, None::>).into()], false, ); assert!(matches!( @@ -846,10 +850,10 @@ mod tests { // taint whole table // `read_whole_table` should disallow any concurrent change, even if the change // is disjoint with the earlier filter - let file_part1 = tu::create_add_action("file_part1", true, get_stats(1, 10)); - let file_part2 = tu::create_add_action("file_part2", true, get_stats(11, 100)); - let file_part3 = tu::create_add_action("file_part3", true, get_stats(101, 1000)); - let mut setup_actions = init_table_actions(None); + let file_part1 = simple_add(true, "1", "10").into(); + let file_part2 = simple_add(true, "11", "100").into(); + let file_part3 = simple_add(true, 
"101", "1000").into(); + let mut setup_actions = init_table_actions(); setup_actions.push(file_part1); let result = execute_test( Some(setup_actions), @@ -863,14 +867,14 @@ mod tests { // taint whole table + concurrent remove // `read_whole_table` should disallow any concurrent remove actions - let file_part1 = tu::create_add_action("file_part1", true, get_stats(1, 10)); - let file_part2 = tu::create_add_action("file_part2", true, get_stats(11, 100)); - let mut setup_actions = init_table_actions(None); - setup_actions.push(file_part1); + let file_part1 = simple_add(true, "1", "10"); + let file_part2 = simple_add(true, "11", "100").into(); + let mut setup_actions = init_table_actions(); + setup_actions.push(file_part1.clone().into()); let result = execute_test( Some(setup_actions), None, - vec![tu::create_remove_action("file_part1", true)], + vec![ActionFactory::remove(&file_part1, true).into()], vec![file_part2], true, ); diff --git a/crates/core/src/operations/transaction/mod.rs b/crates/core/src/operations/transaction/mod.rs index 31cbc3a33b..69027cc4b7 100644 --- a/crates/core/src/operations/transaction/mod.rs +++ b/crates/core/src/operations/transaction/mod.rs @@ -73,28 +73,31 @@ //! │ │ //! └───────────────────────────────┘ //! +use std::collections::HashMap; +use bytes::Bytes; use chrono::Utc; use conflict_checker::ConflictChecker; use futures::future::BoxFuture; use object_store::path::Path; -use object_store::{Error as ObjectStoreError, ObjectStore}; +use object_store::Error as ObjectStoreError; use serde_json::Value; -use std::collections::HashMap; -use self::conflict_checker::{CommitConflictError, TransactionInfo, WinningCommitSummary}; -use crate::checkpoints::create_checkpoint_for; +use self::conflict_checker::{TransactionInfo, WinningCommitSummary}; +use crate::checkpoints::{cleanup_expired_logs_for, create_checkpoint_for}; use crate::errors::DeltaTableError; use crate::kernel::{ Action, CommitInfo, EagerSnapshot, Metadata, Protocol, ReaderFeatures, Transaction, WriterFeatures, }; -use crate::logstore::LogStoreRef; +use crate::logstore::{CommitOrBytes, LogStoreRef}; use crate::protocol::DeltaOperation; +use crate::storage::ObjectStoreRef; use crate::table::config::TableConfig; use crate::table::state::DeltaTableState; use crate::{crate_version, DeltaResult}; +pub use self::conflict_checker::CommitConflictError; pub use self::protocol::INSTANCE as PROTOCOL; #[cfg(test)] @@ -103,8 +106,6 @@ mod conflict_checker; mod protocol; #[cfg(feature = "datafusion")] mod state; -#[cfg(test)] -pub(crate) mod test_utils; const DELTA_LOG_FOLDER: &str = "_delta_log"; pub(crate) const DEFAULT_RETRIES: usize = 15; @@ -309,6 +310,8 @@ impl CommitData { /// Properties for post commit hook. 
pub struct PostCommitHookProperties { create_checkpoint: bool, + /// Override the EnableExpiredLogCleanUp setting, if None config setting is used + cleanup_expired_logs: Option, } #[derive(Clone, Debug)] @@ -319,6 +322,7 @@ pub struct CommitProperties { pub(crate) app_transaction: Vec, max_retries: usize, create_checkpoint: bool, + cleanup_expired_logs: Option, } impl Default for CommitProperties { @@ -328,6 +332,7 @@ impl Default for CommitProperties { app_transaction: Vec::new(), max_retries: DEFAULT_RETRIES, create_checkpoint: true, + cleanup_expired_logs: None, } } } @@ -342,6 +347,12 @@ impl CommitProperties { self } + /// Specify maximum number of times to retry the transaction before failing to commit + pub fn with_max_retries(mut self, max_retries: usize) -> Self { + self.max_retries = max_retries; + self + } + /// Specify if it should create a checkpoint when the commit interval condition is met pub fn with_create_checkpoint(mut self, create_checkpoint: bool) -> Self { self.create_checkpoint = create_checkpoint; @@ -359,6 +370,12 @@ impl CommitProperties { self.app_transaction = txn; self } + + /// Specify if it should clean up the logs when the logRetentionDuration interval is met + pub fn with_cleanup_expired_logs(mut self, cleanup_expired_logs: Option) -> Self { + self.cleanup_expired_logs = cleanup_expired_logs; + self + } } impl From for CommitBuilder { @@ -368,6 +385,7 @@ impl From for CommitBuilder { app_metadata: value.app_metadata, post_commit_hook: Some(PostCommitHookProperties { create_checkpoint: value.create_checkpoint, + cleanup_expired_logs: value.cleanup_expired_logs, }), app_transaction: value.app_transaction, ..Default::default() @@ -467,20 +485,34 @@ impl<'a> PreCommit<'a> { pub fn into_prepared_commit_future(self) -> BoxFuture<'a, DeltaResult>> { let this = self; + // Write delta log entry as temporary file to storage. For the actual commit, + // the temporary file is moved (atomic rename) to the delta log folder within `commit` function. + async fn write_tmp_commit( + log_entry: Bytes, + store: ObjectStoreRef, + ) -> DeltaResult { + let token = uuid::Uuid::new_v4().to_string(); + let path = Path::from_iter([DELTA_LOG_FOLDER, &format!("_commit_{token}.json.tmp")]); + store.put(&path, log_entry.into()).await?; + Ok(CommitOrBytes::TmpCommit(path)) + } + Box::pin(async move { if let Some(table_reference) = this.table_data { PROTOCOL.can_commit(table_reference, &this.data.actions, &this.data.operation)?; } - - // Write delta log entry as temporary file to storage. For the actual commit, - // the temporary file is moved (atomic rename) to the delta log folder within `commit` function. let log_entry = this.data.get_bytes()?; - let token = uuid::Uuid::new_v4().to_string(); - let path = Path::from_iter([DELTA_LOG_FOLDER, &format!("_commit_{token}.json.tmp")]); - this.log_store.object_store().put(&path, log_entry).await?; + + // With the DefaultLogStore, we just pass the bytes around, since we use conditionalPuts + // Other stores will use tmp_commits + let commit_or_bytes = if this.log_store.name() == "DefaultLogStore" { + CommitOrBytes::LogBytes(log_entry) + } else { + write_tmp_commit(log_entry, this.log_store.object_store()).await? 
+ }; Ok(PreparedCommit { - path, + commit_or_bytes, log_store: this.log_store, table_data: this.table_data, max_retries: this.max_retries, @@ -491,9 +523,9 @@ impl<'a> PreCommit<'a> { } } -/// Represents a inflight commit with a temporary commit marker on the log store +/// Represents a inflight commit pub struct PreparedCommit<'a> { - path: Path, + commit_or_bytes: CommitOrBytes, log_store: LogStoreRef, data: CommitData, table_data: Option<&'a dyn TableReference>, @@ -503,8 +535,8 @@ pub struct PreparedCommit<'a> { impl<'a> PreparedCommit<'a> { /// The temporary commit file created - pub fn path(&self) -> &Path { - &self.path + pub fn commit_or_bytes(&self) -> &CommitOrBytes { + &self.commit_or_bytes } } @@ -516,14 +548,17 @@ impl<'a> std::future::IntoFuture for PreparedCommit<'a> { let this = self; Box::pin(async move { - let tmp_commit = &this.path; + let commit_or_bytes = this.commit_or_bytes; if this.table_data.is_none() { - this.log_store.write_commit_entry(0, tmp_commit).await?; + this.log_store + .write_commit_entry(0, commit_or_bytes.clone()) + .await?; return Ok(PostCommit { version: 0, data: this.data, create_checkpoint: false, + cleanup_expired_logs: None, log_store: this.log_store, table_data: this.table_data, }); @@ -536,7 +571,11 @@ impl<'a> std::future::IntoFuture for PreparedCommit<'a> { let mut attempt_number = 1; while attempt_number <= this.max_retries { let version = read_snapshot.version() + attempt_number as i64; - match this.log_store.write_commit_entry(version, tmp_commit).await { + match this + .log_store + .write_commit_entry(version, commit_or_bytes.clone()) + .await + { Ok(()) => { return Ok(PostCommit { version, @@ -545,6 +584,10 @@ impl<'a> std::future::IntoFuture for PreparedCommit<'a> { .post_commit .map(|v| v.create_checkpoint) .unwrap_or_default(), + cleanup_expired_logs: this + .post_commit + .map(|v| v.cleanup_expired_logs) + .unwrap_or_default(), log_store: this.log_store, table_data: this.table_data, }); @@ -573,7 +616,7 @@ impl<'a> std::future::IntoFuture for PreparedCommit<'a> { } Err(err) => { this.log_store - .abort_commit_entry(version, tmp_commit) + .abort_commit_entry(version, commit_or_bytes) .await?; return Err(TransactionError::CommitConflict(err).into()); } @@ -581,7 +624,7 @@ impl<'a> std::future::IntoFuture for PreparedCommit<'a> { } Err(err) => { this.log_store - .abort_commit_entry(version, tmp_commit) + .abort_commit_entry(version, commit_or_bytes) .await?; return Err(err.into()); } @@ -600,6 +643,7 @@ pub struct PostCommit<'a> { /// The data that was comitted to the log store pub data: CommitData, create_checkpoint: bool, + cleanup_expired_logs: Option, log_store: LogStoreRef, table_data: Option<&'a dyn TableReference>, } @@ -625,6 +669,21 @@ impl<'a> PostCommit<'a> { self.create_checkpoint(&state, &self.log_store, self.version) .await?; } + let cleanup_logs = if let Some(cleanup_logs) = self.cleanup_expired_logs { + cleanup_logs + } else { + state.table_config().enable_expired_log_cleanup() + }; + + if cleanup_logs { + cleanup_expired_logs_for( + self.version, + self.log_store.as_ref(), + Utc::now().timestamp_millis() + - state.table_config().log_retention_duration().as_millis() as i64, + ) + .await?; + } Ok(state) } else { let state = DeltaTableState::try_new( @@ -699,7 +758,7 @@ mod tests { logstore::{default_logstore::DefaultLogStore, LogStore}, storage::commit_uri_from_version, }; - use object_store::memory::InMemory; + use object_store::{memory::InMemory, ObjectStore, PutPayload}; use url::Url; #[test] @@ -721,16 +780,19 @@ mod 
tests { options: HashMap::new().into(), }, ); - let tmp_path = Path::from("_delta_log/tmp"); let version_path = Path::from("_delta_log/00000000000000000000.json"); - store.put(&tmp_path, bytes::Bytes::new()).await.unwrap(); - store.put(&version_path, bytes::Bytes::new()).await.unwrap(); + store.put(&version_path, PutPayload::new()).await.unwrap(); - let res = log_store.write_commit_entry(0, &tmp_path).await; + let res = log_store + .write_commit_entry(0, CommitOrBytes::LogBytes(PutPayload::new().into())) + .await; // fails if file version already exists assert!(res.is_err()); // succeeds for next version - log_store.write_commit_entry(1, &tmp_path).await.unwrap(); + log_store + .write_commit_entry(1, CommitOrBytes::LogBytes(PutPayload::new().into())) + .await + .unwrap(); } } diff --git a/crates/core/src/operations/transaction/protocol.rs b/crates/core/src/operations/transaction/protocol.rs index c5d9cdf650..b9ea7d65aa 100644 --- a/crates/core/src/operations/transaction/protocol.rs +++ b/crates/core/src/operations/transaction/protocol.rs @@ -2,6 +2,7 @@ use std::collections::HashSet; use lazy_static::lazy_static; use once_cell::sync::Lazy; +use tracing::log::*; use super::{TableReference, TransactionError}; use crate::kernel::{ @@ -80,20 +81,19 @@ impl ProtocolChecker { } /// checks if table contains timestamp_ntz in any field including nested fields. - pub fn contains_timestampntz(&self, fields: &[StructField]) -> bool { - fn check_vec_fields(fields: &[StructField]) -> bool { - fields.iter().any(|f| _check_type(f.data_type())) - } - + pub fn contains_timestampntz<'a>( + &self, + mut fields: impl Iterator, + ) -> bool { fn _check_type(dtype: &DataType) -> bool { match dtype { - &DataType::TIMESTAMPNTZ => true, + &DataType::TIMESTAMP_NTZ => true, DataType::Array(inner) => _check_type(inner.element_type()), - DataType::Struct(inner) => check_vec_fields(inner.fields()), + DataType::Struct(inner) => inner.fields().any(|f| _check_type(f.data_type())), _ => false, } } - check_vec_fields(fields) + fields.any(|f| _check_type(f.data_type())) } /// Check can write_timestamp_ntz @@ -148,17 +148,33 @@ impl ProtocolChecker { pub fn can_write_to(&self, snapshot: &dyn TableReference) -> Result<(), TransactionError> { // NOTE: writers must always support all required reader features self.can_read_from(snapshot)?; + let min_writer_version = snapshot.protocol().min_writer_version; + + let required_features: Option<&HashSet> = match min_writer_version { + 0 | 1 => None, + 2 => Some(&WRITER_V2), + 3 => Some(&WRITER_V3), + 4 => Some(&WRITER_V4), + 5 => Some(&WRITER_V5), + 6 => Some(&WRITER_V6), + _ => snapshot.protocol().writer_features.as_ref(), + }; - let required_features: Option<&HashSet> = - match snapshot.protocol().min_writer_version { - 0 | 1 => None, - 2 => Some(&WRITER_V2), - 3 => Some(&WRITER_V3), - 4 => Some(&WRITER_V4), - 5 => Some(&WRITER_V5), - 6 => Some(&WRITER_V6), - _ => snapshot.protocol().writer_features.as_ref(), - }; + if (4..7).contains(&min_writer_version) { + debug!("min_writer_version is less 4-6, checking for unsupported table features"); + if let Ok(schema) = snapshot.metadata().schema() { + for field in schema.fields() { + if field.metadata.contains_key( + crate::kernel::ColumnMetadataKey::GenerationExpression.as_ref(), + ) { + error!("The table contains `delta.generationExpression` settings on columns which mean this table cannot be currently written to by delta-rs"); + return Err(TransactionError::UnsupportedWriterFeatures(vec![ + WriterFeatures::GeneratedColumns, + ])); + } + } 
+ } + } if let Some(features) = required_features { let mut diff = features.difference(&self.writer_features).peekable(); @@ -228,6 +244,11 @@ pub static INSTANCE: Lazy = Lazy::new(|| { let mut writer_features = HashSet::new(); writer_features.insert(WriterFeatures::AppendOnly); writer_features.insert(WriterFeatures::TimestampWithoutTimezone); + #[cfg(feature = "cdf")] + { + writer_features.insert(WriterFeatures::ChangeDataFeed); + writer_features.insert(WriterFeatures::GeneratedColumns); + } #[cfg(feature = "datafusion")] { writer_features.insert(WriterFeatures::Invariants); @@ -243,13 +264,19 @@ pub static INSTANCE: Lazy = Lazy::new(|| { #[cfg(test)] mod tests { - use super::super::test_utils::create_metadata_action; + use std::collections::HashMap; + use super::*; - use crate::kernel::{Action, Add, Protocol, Remove}; + use crate::kernel::DataType as DeltaDataType; + use crate::kernel::{Action, Add, Metadata, PrimitiveType, Protocol, Remove}; use crate::protocol::SaveMode; use crate::table::state::DeltaTableState; - use crate::DeltaConfigKey; - use std::collections::HashMap; + use crate::test_utils::{ActionFactory, TestSchemas}; + use crate::TableProperty; + + fn metadata_action(configuration: Option>>) -> Metadata { + ActionFactory::metadata(TestSchemas::simple(), None::>, configuration) + } #[test] fn test_can_commit_append_only() { @@ -300,13 +327,11 @@ mod tests { writer_features: Some(feat.into_iter().collect()), ..Default::default() }), - create_metadata_action( - None, - Some(HashMap::from([( - DeltaConfigKey::AppendOnly.as_ref().to_string(), - Some(append.to_string()), - )])), - ), + metadata_action(Some(HashMap::from([( + TableProperty::AppendOnly.as_ref().to_string(), + Some(append.to_string()), + )]))) + .into(), ] }; @@ -400,7 +425,7 @@ mod tests { min_writer_version: 1, ..Default::default() }), - create_metadata_action(None, Some(HashMap::new())), + metadata_action(None).into(), ]; let snapshot_1 = DeltaTableState::from_actions(actions).unwrap(); let eager_1 = snapshot_1.snapshot(); @@ -414,7 +439,7 @@ mod tests { min_writer_version: 1, ..Default::default() }), - create_metadata_action(None, Some(HashMap::new())), + metadata_action(None).into(), ]; let snapshot_2 = DeltaTableState::from_actions(actions).unwrap(); let eager_2 = snapshot_2.snapshot(); @@ -431,7 +456,7 @@ mod tests { min_writer_version: 2, ..Default::default() }), - create_metadata_action(None, Some(HashMap::new())), + metadata_action(None).into(), ]; let snapshot_3 = DeltaTableState::from_actions(actions).unwrap(); let eager_3 = snapshot_3.snapshot(); @@ -451,7 +476,7 @@ mod tests { min_writer_version: 3, ..Default::default() }), - create_metadata_action(None, Some(HashMap::new())), + metadata_action(None).into(), ]; let snapshot_4 = DeltaTableState::from_actions(actions).unwrap(); let eager_4 = snapshot_4.snapshot(); @@ -474,7 +499,7 @@ mod tests { min_writer_version: 4, ..Default::default() }), - create_metadata_action(None, Some(HashMap::new())), + metadata_action(None).into(), ]; let snapshot_5 = DeltaTableState::from_actions(actions).unwrap(); let eager_5 = snapshot_5.snapshot(); @@ -500,7 +525,7 @@ mod tests { min_writer_version: 5, ..Default::default() }), - create_metadata_action(None, Some(HashMap::new())), + metadata_action(None).into(), ]; let snapshot_6 = DeltaTableState::from_actions(actions).unwrap(); let eager_6 = snapshot_6.snapshot(); @@ -529,7 +554,7 @@ mod tests { min_writer_version: 6, ..Default::default() }), - create_metadata_action(None, Some(HashMap::new())), + 
metadata_action(None).into(), ]; let snapshot_7 = DeltaTableState::from_actions(actions).unwrap(); let eager_7 = snapshot_7.snapshot(); @@ -554,4 +579,63 @@ mod tests { assert!(checker_7.can_read_from(eager_7).is_ok()); assert!(checker_7.can_write_to(eager_7).is_ok()); } + + #[tokio::test] + async fn test_minwriter_v4_with_cdf() { + let checker_5 = ProtocolChecker::new(READER_V2.clone(), WRITER_V4.clone()); + let actions = vec![ + Action::Protocol( + Protocol::new(2, 4) + .with_writer_features(vec![crate::kernel::WriterFeatures::ChangeDataFeed]), + ), + metadata_action(None).into(), + ]; + let snapshot_5 = DeltaTableState::from_actions(actions).unwrap(); + let eager_5 = snapshot_5.snapshot(); + assert!(checker_5.can_write_to(eager_5).is_ok()); + } + + /// Technically we do not yet support generated columns, but it is okay to "accept" writing to + /// a column with minWriterVersion=4 and the generated columns feature as long as the + /// `delta.generationExpression` isn't actually defined the write is still allowed + #[tokio::test] + async fn test_minwriter_v4_with_generated_columns() { + let checker_5 = ProtocolChecker::new(READER_V2.clone(), WRITER_V4.clone()); + let actions = vec![ + Action::Protocol( + Protocol::new(2, 4) + .with_writer_features(vec![crate::kernel::WriterFeatures::GeneratedColumns]), + ), + metadata_action(None).into(), + ]; + let snapshot_5 = DeltaTableState::from_actions(actions).unwrap(); + let eager_5 = snapshot_5.snapshot(); + assert!(checker_5.can_write_to(eager_5).is_ok()); + } + + #[tokio::test] + async fn test_minwriter_v4_with_generated_columns_and_expressions() { + let checker_5 = ProtocolChecker::new(READER_V2.clone(), WRITER_V4.clone()); + let actions = vec![Action::Protocol(Protocol::new(2, 4))]; + + let table: crate::DeltaTable = crate::DeltaOps::new_in_memory() + .create() + .with_column( + "value", + DeltaDataType::Primitive(PrimitiveType::Integer), + true, + Some(HashMap::from([( + "delta.generationExpression".into(), + "x IS TRUE".into(), + )])), + ) + .with_actions(actions) + .with_configuration_property(TableProperty::EnableChangeDataFeed, Some("true")) + .await + .expect("failed to make a version 4 table with EnableChangeDataFeed"); + let eager_5 = table + .snapshot() + .expect("Failed to get snapshot from test table"); + assert!(checker_5.can_write_to(eager_5).is_err()); + } } diff --git a/crates/core/src/operations/transaction/state.rs b/crates/core/src/operations/transaction/state.rs index e979cda363..56769c8c62 100644 --- a/crates/core/src/operations/transaction/state.rs +++ b/crates/core/src/operations/transaction/state.rs @@ -1,85 +1,18 @@ use std::collections::HashSet; -use std::sync::Arc; -use arrow::array::{ArrayRef, BooleanArray}; -use arrow::datatypes::{ - DataType, Field as ArrowField, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, -}; +use arrow_array::{ArrayRef, BooleanArray}; +use arrow_schema::{DataType as ArrowDataType, SchemaRef as ArrowSchemaRef}; +use datafusion::execution::context::SessionContext; use datafusion::physical_optimizer::pruning::{PruningPredicate, PruningStatistics}; use datafusion_common::scalar::ScalarValue; -use datafusion_common::Column; +use datafusion_common::{Column, ToDFSchema}; use datafusion_expr::Expr; -use itertools::Itertools; -use object_store::ObjectStore; -use parquet::arrow::arrow_reader::ArrowReaderOptions; -use parquet::arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStreamBuilder}; - -use crate::delta_datafusion::{ - get_null_of_arrow_type, logical_expr_to_physical_expr, 
to_correct_scalar_value, - DataFusionMixins, -}; + +use crate::delta_datafusion::{get_null_of_arrow_type, to_correct_scalar_value}; use crate::errors::DeltaResult; use crate::kernel::{Add, EagerSnapshot}; use crate::table::state::DeltaTableState; -impl DeltaTableState { - /// Get the physical table schema. - /// - /// This will construct a schema derived from the parquet schema of the latest data file, - /// and fields for partition columns from the schema defined in table meta data. - pub async fn physical_arrow_schema( - &self, - object_store: Arc, - ) -> DeltaResult { - self.snapshot.physical_arrow_schema(object_store).await - } -} - -impl EagerSnapshot { - /// Get the physical table schema. - /// - /// This will construct a schema derived from the parquet schema of the latest data file, - /// and fields for partition columns from the schema defined in table meta data. - pub async fn physical_arrow_schema( - &self, - object_store: Arc, - ) -> DeltaResult { - if let Some(add) = self.file_actions()?.max_by_key(|obj| obj.modification_time) { - let file_meta = add.try_into()?; - let file_reader = ParquetObjectReader::new(object_store, file_meta); - let file_schema = ParquetRecordBatchStreamBuilder::new_with_options( - file_reader, - ArrowReaderOptions::new().with_skip_arrow_metadata(true), - ) - .await? - .build()? - .schema() - .clone(); - - let table_schema = Arc::new(ArrowSchema::new( - self.arrow_schema()? - .fields - .clone() - .into_iter() - .map(|field| { - // field is an &Arc - let owned_field: ArrowField = field.as_ref().clone(); - file_schema - .field_with_name(field.name()) - // yielded with &Field - .cloned() - .unwrap_or(owned_field) - }) - .collect::>(), - )); - - Ok(table_schema) - } else { - self.arrow_schema() - } - } -} - pub struct AddContainer<'a> { inner: &'a Vec, partition_columns: &'a Vec, @@ -104,7 +37,7 @@ impl<'a> AddContainer<'a> { let (_, field) = self.schema.column_with_name(&column.name)?; // See issue 1214. Binary type does not support natural order which is required for Datafusion to prune - if field.data_type() == &DataType::Binary { + if field.data_type() == &ArrowDataType::Binary { return None; } @@ -153,7 +86,9 @@ impl<'a> AddContainer<'a> { /// so evaluating expressions is inexact. However, excluded files are guaranteed (for a correct log) /// to not contain matches by the predicate expression. pub fn predicate_matches(&self, predicate: Expr) -> DeltaResult> { - let expr = logical_expr_to_physical_expr(predicate, &self.schema); + //let expr = logical_expr_to_physical_expr(predicate, &self.schema); + let expr = SessionContext::new() + .create_physical_expr(predicate, &self.schema.clone().to_dfschema()?)?; let pruning_predicate = PruningPredicate::try_new(expr, self.schema.clone())?; Ok(self .inner @@ -249,25 +184,19 @@ impl PruningStatistics for EagerSnapshot { /// return the minimum values for the named column, if known. /// Note: the returned array must contain `num_containers()` rows fn min_values(&self, column: &Column) -> Option { - let files = self.file_actions().ok()?.collect_vec(); - let partition_columns = &self.metadata().partition_columns; - let container = AddContainer::new(&files, partition_columns, self.arrow_schema().ok()?); - container.min_values(column) + self.log_data().min_values(column) } /// return the maximum values for the named column, if known. /// Note: the returned array must contain `num_containers()` rows. 
fn max_values(&self, column: &Column) -> Option { - let files = self.file_actions().ok()?.collect_vec(); - let partition_columns = &self.metadata().partition_columns; - let container = AddContainer::new(&files, partition_columns, self.arrow_schema().ok()?); - container.max_values(column) + self.log_data().max_values(column) } /// return the number of containers (e.g. row groups) being /// pruned with these statistics fn num_containers(&self) -> usize { - self.files_count() + self.log_data().num_containers() } /// return the number of null values for the named column as an @@ -275,10 +204,7 @@ impl PruningStatistics for EagerSnapshot { /// /// Note: the returned array must contain `num_containers()` rows. fn null_counts(&self, column: &Column) -> Option { - let files = self.file_actions().ok()?.collect_vec(); - let partition_columns = &self.metadata().partition_columns; - let container = AddContainer::new(&files, partition_columns, self.arrow_schema().ok()?); - container.null_counts(column) + self.log_data().null_counts(column) } /// return the number of rows for the named column in each container @@ -286,56 +212,64 @@ impl PruningStatistics for EagerSnapshot { /// /// Note: the returned array must contain `num_containers()` rows fn row_counts(&self, column: &Column) -> Option { - let files = self.file_actions().ok()?.collect_vec(); - let partition_columns = &self.metadata().partition_columns; - let container = AddContainer::new(&files, partition_columns, self.arrow_schema().ok()?); - container.row_counts(column) + self.log_data().row_counts(column) } // This function is required since DataFusion 35.0, but is implemented as a no-op // https://github.com/apache/arrow-datafusion/blob/ec6abece2dcfa68007b87c69eefa6b0d7333f628/datafusion/core/src/datasource/physical_plan/parquet/page_filter.rs#L550 - fn contained(&self, _column: &Column, _value: &HashSet) -> Option { - None + fn contained(&self, column: &Column, value: &HashSet) -> Option { + self.log_data().contained(column, value) } } impl PruningStatistics for DeltaTableState { fn min_values(&self, column: &Column) -> Option { - self.snapshot.min_values(column) + self.snapshot.log_data().min_values(column) } fn max_values(&self, column: &Column) -> Option { - self.snapshot.max_values(column) + self.snapshot.log_data().max_values(column) } fn num_containers(&self) -> usize { - self.snapshot.num_containers() + self.snapshot.log_data().num_containers() } fn null_counts(&self, column: &Column) -> Option { - self.snapshot.null_counts(column) + self.snapshot.log_data().null_counts(column) } fn row_counts(&self, column: &Column) -> Option { - self.snapshot.row_counts(column) + self.snapshot.log_data().row_counts(column) } fn contained(&self, column: &Column, values: &HashSet) -> Option { - self.snapshot.contained(column, values) + self.snapshot.log_data().contained(column, values) } } #[cfg(test)] mod tests { - use super::*; - use crate::delta_datafusion::DataFusionFileMixins; - use crate::operations::transaction::test_utils::{create_add_action, init_table_actions}; + use std::collections::HashMap; + use datafusion::prelude::SessionContext; use datafusion_expr::{col, lit}; + use super::*; + use crate::delta_datafusion::{files_matching_predicate, DataFusionMixins}; + use crate::kernel::Action; + use crate::test_utils::{ActionFactory, TestSchemas}; + + fn init_table_actions() -> Vec { + vec![ + ActionFactory::protocol(None, None, None::>, None::>).into(), + ActionFactory::metadata(TestSchemas::simple(), None::>, None).into(), + ] + } + #[test] fn 
test_parse_predicate_expression() { - let snapshot = DeltaTableState::from_actions(init_table_actions(None)).unwrap(); + let snapshot = DeltaTableState::from_actions(init_table_actions()).unwrap(); let session = SessionContext::new(); let state = session.state(); @@ -362,15 +296,29 @@ mod tests { #[test] fn test_files_matching_predicate() { - let mut actions = init_table_actions(None); - actions.push(create_add_action("excluded", true, Some("{\"numRecords\":10,\"minValues\":{\"value\":1},\"maxValues\":{\"value\":10},\"nullCount\":{\"value\":0}}".into()))); - actions.push(create_add_action("included-1", true, Some("{\"numRecords\":10,\"minValues\":{\"value\":1},\"maxValues\":{\"value\":100},\"nullCount\":{\"value\":0}}".into()))); - actions.push(create_add_action("included-2", true, Some("{\"numRecords\":10,\"minValues\":{\"value\":-10},\"maxValues\":{\"value\":3},\"nullCount\":{\"value\":0}}".into()))); + let mut actions = init_table_actions(); + + actions.push(Action::Add(ActionFactory::add( + TestSchemas::simple(), + HashMap::from_iter([("value", ("1", "10"))]), + Default::default(), + true, + ))); + actions.push(Action::Add(ActionFactory::add( + TestSchemas::simple(), + HashMap::from_iter([("value", ("1", "100"))]), + Default::default(), + true, + ))); + actions.push(Action::Add(ActionFactory::add( + TestSchemas::simple(), + HashMap::from_iter([("value", ("-10", "3"))]), + Default::default(), + true, + ))); let state = DeltaTableState::from_actions(actions).unwrap(); - let files = state - .snapshot - .files_matching_predicate(&[]) + let files = files_matching_predicate(&state.snapshot, &[]) .unwrap() .collect::>(); assert_eq!(files.len(), 3); @@ -379,12 +327,9 @@ mod tests { .gt(lit::(10)) .or(col("value").lt_eq(lit::(0))); - let files = state - .snapshot - .files_matching_predicate(&[predictate]) + let files = files_matching_predicate(&state.snapshot, &[predictate]) .unwrap() .collect::>(); assert_eq!(files.len(), 2); - assert!(files.iter().all(|add| add.path.contains("included"))); } } diff --git a/crates/core/src/operations/transaction/test_utils.rs b/crates/core/src/operations/transaction/test_utils.rs deleted file mode 100644 index ada5ded056..0000000000 --- a/crates/core/src/operations/transaction/test_utils.rs +++ /dev/null @@ -1,171 +0,0 @@ -#![allow(unused)] -use std::collections::HashMap; - -use super::CommitBuilder; -use crate::kernel::{ - Action, Add, CommitInfo, DataType, Metadata, PrimitiveType, Protocol, Remove, StructField, - StructType, -}; -use crate::operations::transaction::PROTOCOL; -use crate::protocol::{DeltaOperation, SaveMode}; -use crate::table::state::DeltaTableState; -use crate::{DeltaTable, DeltaTableBuilder}; - -pub fn create_add_action( - path: impl Into, - data_change: bool, - stats: Option, -) -> Action { - Action::Add(Add { - path: path.into(), - size: 100, - data_change, - stats, - modification_time: -1, - partition_values: Default::default(), - stats_parsed: None, - base_row_id: None, - default_row_commit_version: None, - tags: None, - deletion_vector: None, - clustering_provider: None, - }) -} - -pub fn create_remove_action(path: impl Into, data_change: bool) -> Action { - Action::Remove(Remove { - path: path.into(), - data_change, - size: None, - deletion_timestamp: None, - deletion_vector: None, - partition_values: Default::default(), - extended_file_metadata: None, - base_row_id: None, - default_row_commit_version: None, - tags: None, - }) -} - -pub fn create_protocol_action(max_reader: Option, max_writer: Option) -> Action { - let protocol = 
Protocol { - min_reader_version: max_reader.unwrap_or(PROTOCOL.default_reader_version()), - min_writer_version: max_writer.unwrap_or(PROTOCOL.default_writer_version()), - writer_features: None, - reader_features: None, - }; - Action::Protocol(protocol) -} - -pub fn create_metadata_action( - parttiton_columns: Option>, - configuration: Option>>, -) -> Action { - let table_schema = StructType::new(vec![ - StructField::new( - "id".to_string(), - DataType::Primitive(PrimitiveType::String), - true, - ), - StructField::new( - "value".to_string(), - DataType::Primitive(PrimitiveType::Integer), - true, - ), - StructField::new( - "modified".to_string(), - DataType::Primitive(PrimitiveType::String), - true, - ), - ]); - Action::Metadata( - Metadata::try_new( - table_schema, - parttiton_columns.unwrap_or_default(), - configuration.unwrap_or_default(), - ) - .unwrap(), - ) -} - -pub fn init_table_actions(configuration: Option>>) -> Vec { - let raw = r#" - { - "timestamp": 1670892998177, - "operation": "WRITE", - "operationParameters": { - "mode": "Append", - "partitionBy": "[\"c1\",\"c2\"]" - }, - "isolationLevel": "Serializable", - "isBlindAppend": true, - "operationMetrics": { - "numFiles": "3", - "numOutputRows": "3", - "numOutputBytes": "1356" - }, - "engineInfo": "Apache-Spark/3.3.1 Delta-Lake/2.2.0", - "txnId": "046a258f-45e3-4657-b0bf-abfb0f76681c" - }"#; - - let commit_info = serde_json::from_str::(raw).unwrap(); - vec![ - Action::CommitInfo(commit_info), - create_protocol_action(None, None), - create_metadata_action(None, configuration), - ] -} - -pub async fn create_initialized_table( - partition_cols: &[String], - configuration: Option>>, -) -> DeltaTable { - let log_store = DeltaTableBuilder::from_uri("memory://") - .build_storage() - .unwrap(); - let table_schema = StructType::new(vec![ - StructField::new( - "id".to_string(), - DataType::Primitive(PrimitiveType::String), - true, - ), - StructField::new( - "value".to_string(), - DataType::Primitive(PrimitiveType::Integer), - true, - ), - StructField::new( - "modified".to_string(), - DataType::Primitive(PrimitiveType::String), - true, - ), - ]); - let state = DeltaTableState::from_actions(init_table_actions(None)).unwrap(); - let operation = DeltaOperation::Create { - mode: SaveMode::ErrorIfExists, - location: "location".into(), - protocol: Protocol { - min_reader_version: 1, - min_writer_version: 1, - writer_features: None, - reader_features: None, - }, - metadata: Metadata { - id: uuid::Uuid::new_v4().to_string(), - name: None, - description: None, - format: Default::default(), - schema_string: serde_json::to_string(&table_schema).unwrap(), - partition_columns: partition_cols.to_vec(), - configuration: configuration.unwrap_or_default(), - created_time: Some(chrono::Utc::now().timestamp_millis()), - }, - }; - let actions = init_table_actions(None); - CommitBuilder::default() - .with_actions(actions) - .build(None, log_store.clone(), operation) - .await - .unwrap(); - DeltaTable::new_with_state(log_store, state) -} diff --git a/crates/core/src/operations/update.rs b/crates/core/src/operations/update.rs index 9a088c6ae9..61dc4b2f46 100644 --- a/crates/core/src/operations/update.rs +++ b/crates/core/src/operations/update.rs @@ -19,45 +19,62 @@ //! 
```` use std::{ - collections::{HashMap, HashSet}, + collections::HashMap, sync::Arc, time::{Instant, SystemTime, UNIX_EPOCH}, }; -use arrow::datatypes::Schema as ArrowSchema; -use arrow_schema::Field; +use async_trait::async_trait; +use datafusion::error::Result as DataFusionResult; use datafusion::{ + dataframe::DataFrame, + datasource::provider_as_source, execution::context::SessionState, - physical_plan::{metrics::MetricBuilder, projection::ProjectionExec, ExecutionPlan}, + execution::session_state::SessionStateBuilder, + physical_plan::{metrics::MetricBuilder, ExecutionPlan}, + physical_planner::{ExtensionPlanner, PhysicalPlanner}, prelude::SessionContext, }; -use datafusion_common::{Column, DFSchema, ScalarValue}; -use datafusion_expr::{case, col, lit, when, Expr}; -use datafusion_physical_expr::{ - expressions::{self}, - PhysicalExpr, +use datafusion_common::{Column, ScalarValue}; +use datafusion_expr::{ + case, col, lit, when, Expr, Extension, LogicalPlan, LogicalPlanBuilder, UserDefinedLogicalNode, }; use futures::future::BoxFuture; use parquet::file::properties::WriterProperties; use serde::Serialize; +use tracing::log::*; -use super::write::write_execution_plan; +use super::write::{write_execution_plan, write_execution_plan_cdc}; use super::{ datafusion_utils::Expression, transaction::{CommitBuilder, CommitProperties}, }; use super::{transaction::PROTOCOL, write::WriterStatsConfig}; -use crate::delta_datafusion::{ - create_physical_expr_fix, expr::fmt_expr_to_sql, physical::MetricObserverExec, - DataFusionMixins, DeltaColumn, DeltaSessionContext, -}; -use crate::delta_datafusion::{find_files, register_store, DeltaScanBuilder}; +use crate::delta_datafusion::{find_files, planner::DeltaPlanner, register_store}; use crate::kernel::{Action, Remove}; use crate::logstore::LogStoreRef; +use crate::operations::cdc::*; use crate::protocol::DeltaOperation; use crate::table::state::DeltaTableState; +use crate::{ + delta_datafusion::{ + expr::fmt_expr_to_sql, + logical::MetricObserver, + physical::{find_metric_node, get_metric, MetricObserverExec}, + DataFusionMixins, DeltaColumn, DeltaScanConfigBuilder, DeltaSessionContext, + DeltaTableProvider, + }, + DeltaTableError, +}; use crate::{DeltaResult, DeltaTable}; +/// Custom column name used for marking internal [RecordBatch] rows as updated +pub(crate) const UPDATE_PREDICATE_COLNAME: &str = "__delta_rs_update_predicate"; + +const UPDATE_COUNT_ID: &str = "update_source_count"; +const UPDATE_ROW_COUNT: &str = "num_updated_rows"; +const COPIED_ROW_COUNT: &str = "num_copied_rows"; + /// Updates records in the Delta Table. 
/// See this module's documentation for more information pub struct UpdateBuilder { @@ -163,6 +180,44 @@ impl UpdateBuilder { } } +#[derive(Clone)] +struct UpdateMetricExtensionPlanner {} + +#[async_trait] +impl ExtensionPlanner for UpdateMetricExtensionPlanner { + async fn plan_extension( + &self, + _planner: &dyn PhysicalPlanner, + node: &dyn UserDefinedLogicalNode, + _logical_inputs: &[&LogicalPlan], + physical_inputs: &[Arc], + _session_state: &SessionState, + ) -> DataFusionResult>> { + if let Some(metric_observer) = node.as_any().downcast_ref::() { + if metric_observer.id.eq(UPDATE_COUNT_ID) { + return Ok(Some(MetricObserverExec::try_new( + UPDATE_COUNT_ID.into(), + physical_inputs, + |batch, metrics| { + let array = batch.column_by_name(UPDATE_PREDICATE_COLNAME).unwrap(); + let copied_rows = array.null_count(); + let num_updated = array.len() - copied_rows; + + MetricBuilder::new(metrics) + .global_counter(UPDATE_ROW_COUNT) + .add(num_updated); + + MetricBuilder::new(metrics) + .global_counter(COPIED_ROW_COUNT) + .add(copied_rows); + }, + )?)); + } + } + Ok(None) + } +} + #[allow(clippy::too_many_arguments)] async fn execute( predicate: Option, @@ -172,7 +227,7 @@ async fn execute( state: SessionState, writer_properties: Option, mut commit_properties: CommitProperties, - safe_cast: bool, + _safe_cast: bool, ) -> DeltaResult<(DeltaTableState, UpdateMetrics)> { // Validate the predicate and update expressions. // @@ -183,6 +238,17 @@ async fn execute( // For files that were identified, scan for records that match the predicate, // perform update operations, and then commit add and remove actions to // the log. + if !&snapshot.load_config().require_files { + return Err(DeltaTableError::NotInitializedWithFiles("UPDATE".into())); + } + + let update_planner = DeltaPlanner:: { + extension_planner: UpdateMetricExtensionPlanner {}, + }; + + let state = SessionStateBuilder::new_from_existing(state) + .with_query_planner(Arc::new(update_planner)) + .build(); let exec_start = Instant::now(); let mut metrics = UpdateMetrics::default(); @@ -199,15 +265,15 @@ async fn execute( None => None, }; - let updates: HashMap = updates + let updates = updates .into_iter() .map(|(key, expr)| match expr { - Expression::DataFusion(e) => Ok((key, e)), + Expression::DataFusion(e) => Ok((key.name, e)), Expression::String(s) => snapshot .parse_predicate_expression(s, &state) - .map(|e| (key, e)), + .map(|e| (key.name, e)), }) - .collect::, _>>()?; + .collect::, _>>()?; let current_metadata = snapshot.metadata(); let table_partition_cols = current_metadata.partition_columns.clone(); @@ -222,132 +288,63 @@ async fn execute( let predicate = predicate.unwrap_or(Expr::Literal(ScalarValue::Boolean(Some(true)))); - let execution_props = state.execution_props(); + let scan_config = DeltaScanConfigBuilder::default() + .with_file_column(false) + .with_schema(snapshot.input_schema()?) + .build(&snapshot)?; + // For each rewrite evaluate the predicate and then modify each expression // to either compute the new value or obtain the old one then write these batches - let scan = DeltaScanBuilder::new(&snapshot, log_store.clone(), &state) - .with_files(&candidates.candidates) - .build() - .await?; - let scan = Arc::new(scan); + let target_provider = Arc::new( + DeltaTableProvider::try_new(snapshot.clone(), log_store.clone(), scan_config.clone())? 
+ .with_files(candidates.candidates.clone()), + ); - // Create a projection for a new column with the predicate evaluated - let input_schema = snapshot.input_schema()?; + let target_provider = provider_as_source(target_provider); + let plan = LogicalPlanBuilder::scan("target", target_provider.clone(), None)?.build()?; - let mut fields = Vec::new(); - for field in input_schema.fields.iter() { - fields.push(field.to_owned()); - } - fields.push(Arc::new(Field::new( - "__delta_rs_update_predicate", - arrow_schema::DataType::Boolean, - true, - ))); - // Recreate the schemas with the new column included - let input_schema = Arc::new(ArrowSchema::new(fields)); - let input_dfschema: DFSchema = input_schema.as_ref().clone().try_into()?; - - let mut expressions: Vec<(Arc, String)> = Vec::new(); - let scan_schema = scan.schema(); - for (i, field) in scan_schema.fields().into_iter().enumerate() { - expressions.push(( - Arc::new(expressions::Column::new(field.name(), i)), - field.name().to_owned(), - )); - } + let df = DataFrame::new(state.clone(), plan); // Take advantage of how null counts are tracked in arrow arrays use the // null count to track how many records do NOT statisfy the predicate. The // count is then exposed through the metrics through the `UpdateCountExec` // execution plan - let predicate_null = when(predicate.clone(), lit(true)).otherwise(lit(ScalarValue::Boolean(None)))?; - let predicate_expr = - create_physical_expr_fix(predicate_null, &input_dfschema, execution_props)?; - expressions.push((predicate_expr, "__delta_rs_update_predicate".to_string())); - - let projection_predicate: Arc = - Arc::new(ProjectionExec::try_new(expressions, scan)?); - - let count_plan = Arc::new(MetricObserverExec::new( - "update_count".into(), - projection_predicate.clone(), - |batch, metrics| { - let array = batch.column_by_name("__delta_rs_update_predicate").unwrap(); - let copied_rows = array.null_count(); - let num_updated = array.len() - copied_rows; - - MetricBuilder::new(metrics) - .global_counter("num_updated_rows") - .add(num_updated); - - MetricBuilder::new(metrics) - .global_counter("num_copied_rows") - .add(copied_rows); - }, - )); - // Perform another projection but instead calculate updated values based on - // the predicate value. 
If the predicate is true then evalute the user - // provided expression otherwise return the original column value - // - // For each update column a new column with a name of __delta_rs_ + `original name` is created - let mut expressions: Vec<(Arc, String)> = Vec::new(); - let scan_schema = count_plan.schema(); - for (i, field) in scan_schema.fields().into_iter().enumerate() { - expressions.push(( - Arc::new(expressions::Column::new(field.name(), i)), - field.name().to_owned(), - )); - } + let df_with_update_col = df + .clone() + .with_column(UPDATE_PREDICATE_COLNAME, predicate_null)?; - // Maintain a map from the original column name to its temporary column index - let mut map = HashMap::::new(); - let mut control_columns = HashSet::::new(); - control_columns.insert("__delta_rs_update_predicate".to_owned()); - - for (column, expr) in updates { - let expr = case(col("__delta_rs_update_predicate")) - .when(lit(true), expr.to_owned()) - .otherwise(col(column.to_owned()))?; - let predicate_expr = create_physical_expr_fix(expr, &input_dfschema, execution_props)?; - map.insert(column.name.clone(), expressions.len()); - let c = "__delta_rs_".to_string() + &column.name; - expressions.push((predicate_expr, c.clone())); - control_columns.insert(c); - } + let plan_with_metrics = LogicalPlan::Extension(Extension { + node: Arc::new(MetricObserver { + id: UPDATE_COUNT_ID.into(), + input: df_with_update_col.into_unoptimized_plan(), + enable_pushdown: false, + }), + }); - let projection_update: Arc = - Arc::new(ProjectionExec::try_new(expressions, count_plan.clone())?); - - // Project again to remove __delta_rs columns and rename update columns to their original name - let mut expressions: Vec<(Arc, String)> = Vec::new(); - let scan_schema = projection_update.schema(); - for (i, field) in scan_schema.fields().into_iter().enumerate() { - if !control_columns.contains(field.name()) { - match map.get(field.name()) { - Some(value) => { - expressions.push(( - Arc::new(expressions::Column::new(field.name(), *value)), - field.name().to_owned(), - )); - } - None => { - expressions.push(( - Arc::new(expressions::Column::new(field.name(), i)), - field.name().to_owned(), - )); - } - } - } - } + let df_with_predicate_and_metrics = DataFrame::new(state.clone(), plan_with_metrics); - let projection: Arc = Arc::new(ProjectionExec::try_new( - expressions, - projection_update.clone(), - )?); + let expressions: Vec = df_with_predicate_and_metrics + .schema() + .fields() + .into_iter() + .map(|field| { + let field_name = field.name(); + let expr = match updates.get(field_name) { + Some(expr) => case(col(UPDATE_PREDICATE_COLNAME)) + .when(lit(true), expr.to_owned()) + .otherwise(col(Column::from_name(field_name)))? 
+ .alias(field_name), + None => col(Column::from_name(field_name)), + }; + Ok(expr) + }) + .collect::>>()?; + let updated_df = df_with_predicate_and_metrics.select(expressions.clone())?; + let physical_plan = updated_df.clone().create_physical_plan().await?; let writer_stats_config = WriterStatsConfig::new( snapshot.table_config().num_indexed_cols(), snapshot @@ -356,32 +353,28 @@ async fn execute( .map(|v| v.iter().map(|v| v.to_string()).collect::>()), ); + let tracker = CDCTracker::new(df, updated_df.drop_columns(&[UPDATE_PREDICATE_COLNAME])?); + let add_actions = write_execution_plan( Some(&snapshot), state.clone(), - projection.clone(), + physical_plan.clone(), table_partition_cols.clone(), log_store.object_store().clone(), Some(snapshot.table_config().target_file_size() as usize), None, - writer_properties, - safe_cast, + writer_properties.clone(), + writer_stats_config.clone(), None, - writer_stats_config, ) .await?; - let count_metrics = count_plan.metrics().unwrap(); + let err = || DeltaTableError::Generic("Unable to locate expected metric node".into()); + let update_count = find_metric_node(UPDATE_COUNT_ID, &physical_plan).ok_or_else(err)?; + let update_count_metrics = update_count.metrics().unwrap(); - metrics.num_updated_rows = count_metrics - .sum_by_name("num_updated_rows") - .map(|m| m.as_usize()) - .unwrap_or(0); - - metrics.num_copied_rows = count_metrics - .sum_by_name("num_copied_rows") - .map(|m| m.as_usize()) - .unwrap_or(0); + metrics.num_updated_rows = get_metric(&update_count_metrics, UPDATE_ROW_COUNT); + metrics.num_copied_rows = get_metric(&update_count_metrics, COPIED_ROW_COUNT); let deletion_timestamp = SystemTime::now() .duration_since(UNIX_EPOCH) @@ -422,6 +415,30 @@ async fn execute( serde_json::to_value(&metrics)?, ); + if let Ok(true) = should_write_cdc(&snapshot) { + match tracker.collect() { + Ok(df) => { + let cdc_actions = write_execution_plan_cdc( + Some(&snapshot), + state, + df.create_physical_plan().await?, + table_partition_cols, + log_store.object_store(), + Some(snapshot.table_config().target_file_size() as usize), + None, + writer_properties, + writer_stats_config, + None, + ) + .await?; + actions.extend(cdc_actions); + } + Err(err) => { + error!("Failed to collect CDC batches: {err:#?}"); + } + }; + } + let commit = CommitBuilder::from(commit_properties) .with_actions(actions) .build(Some(&snapshot), log_store, operation) @@ -472,24 +489,25 @@ impl std::future::IntoFuture for UpdateBuilder { #[cfg(test)] mod tests { + use super::*; + use crate::kernel::DataType as DeltaDataType; - use crate::kernel::PrimitiveType; - use crate::kernel::StructField; - use crate::kernel::StructType; + use crate::kernel::{Action, PrimitiveType, Protocol, StructField, StructType}; + use crate::operations::load_cdf::*; use crate::operations::DeltaOps; use crate::writer::test_utils::datafusion::get_data; use crate::writer::test_utils::datafusion::write_batch; use crate::writer::test_utils::{ get_arrow_schema, get_delta_schema, get_record_batch, setup_table_with_configuration, }; - use crate::DeltaConfigKey; - use crate::DeltaTable; + use crate::{DeltaTable, TableProperty}; + use arrow::array::{Int32Array, StringArray}; use arrow::datatypes::Schema as ArrowSchema; use arrow::datatypes::{Field, Schema}; use arrow::record_batch::RecordBatch; - use arrow_array::Int32Array; use arrow_schema::DataType; use datafusion::assert_batches_sorted_eq; + use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::*; use serde_json::json; use std::sync::Arc; @@ -499,7 
+517,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() - .with_columns(table_schema.fields().clone()) + .with_columns(table_schema.fields().cloned()) .with_partition_columns(partitions.unwrap_or_default()) .await .unwrap(); @@ -531,7 +549,7 @@ mod tests { #[tokio::test] async fn test_update_when_delta_table_is_append_only() { - let table = setup_table_with_configuration(DeltaConfigKey::AppendOnly, Some("true")).await; + let table = setup_table_with_configuration(TableProperty::AppendOnly, Some("true")).await; let batch = get_record_batch(None, false); // Append let table = write_batch(table, batch).await; @@ -789,7 +807,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() - .with_columns(schema.fields().clone()) + .with_columns(schema.fields().cloned()) .await .unwrap(); let table = write_batch(table, batch).await; @@ -969,4 +987,234 @@ mod tests { .await; assert!(res.is_err()); } + + #[tokio::test] + async fn test_no_cdc_on_older_tables() { + let table = prepare_values_table().await; + assert_eq!(table.version(), 0); + assert_eq!(table.get_files_count(), 1); + + let schema = Arc::new(Schema::new(vec![Field::new( + "value", + arrow::datatypes::DataType::Int32, + true, + )])); + let batch = RecordBatch::try_new( + schema, + vec![Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)]))], + ) + .unwrap(); + let table = DeltaOps(table) + .write(vec![batch]) + .await + .expect("Failed to write first batch"); + assert_eq!(table.version(), 1); + + let (table, _metrics) = DeltaOps(table) + .update() + .with_predicate(col("value").eq(lit(2))) + .with_update("value", lit(12)) + .await + .unwrap(); + assert_eq!(table.version(), 2); + + // NOTE: This currently doesn't really assert anything because cdc_files() is not reading + // actions correct + if let Some(state) = table.state.clone() { + let cdc_files = state.cdc_files(); + assert!(cdc_files.is_ok()); + if let Ok(cdc_files) = cdc_files { + let cdc_files: Vec<_> = cdc_files.collect(); + assert_eq!(cdc_files.len(), 0); + } + } else { + panic!("I shouldn't exist!"); + } + + // Too close for missiles, switching to guns. Just checking that the data wasn't actually + // written instead! + if let Ok(files) = crate::storage::utils::flatten_list_stream( + &table.object_store(), + Some(&object_store::path::Path::from("_change_data")), + ) + .await + { + assert_eq!( + 0, + files.len(), + "This test should not find any written CDC files! 
{files:#?}" + ); + } + } + + #[tokio::test] + async fn test_update_cdc_enabled() { + // Currently you cannot pass EnableChangeDataFeed through `with_configuration_property` + // so the only way to create a truly CDC enabled table is by shoving the Protocol + // directly into the actions list + let actions = vec![Action::Protocol(Protocol::new(1, 4))]; + let table: DeltaTable = DeltaOps::new_in_memory() + .create() + .with_column( + "value", + DeltaDataType::Primitive(PrimitiveType::Integer), + true, + None, + ) + .with_actions(actions) + .with_configuration_property(TableProperty::EnableChangeDataFeed, Some("true")) + .await + .unwrap(); + assert_eq!(table.version(), 0); + + let schema = Arc::new(Schema::new(vec![Field::new( + "value", + arrow::datatypes::DataType::Int32, + true, + )])); + + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)]))], + ) + .unwrap(); + let table = DeltaOps(table) + .write(vec![batch]) + .await + .expect("Failed to write first batch"); + assert_eq!(table.version(), 1); + + let (table, _metrics) = DeltaOps(table) + .update() + .with_predicate(col("value").eq(lit(2))) + .with_update("value", lit(12)) + .await + .unwrap(); + assert_eq!(table.version(), 2); + + let ctx = SessionContext::new(); + let table = DeltaOps(table) + .load_cdf() + .with_session_ctx(ctx.clone()) + .with_starting_version(0) + .build() + .await + .expect("Failed to load CDF"); + + let mut batches = collect_batches( + table.properties().output_partitioning().partition_count(), + table, + ctx, + ) + .await + .expect("Failed to collect batches"); + + // The batches will contain a current _commit_timestamp which shouldn't be check_append_only + let _: Vec<_> = batches.iter_mut().map(|b| b.remove_column(3)).collect(); + + assert_batches_sorted_eq! 
{[ + "+-------+------------------+-----------------+", + "| value | _change_type | _commit_version |", + "+-------+------------------+-----------------+", + "| 1 | insert | 1 |", + "| 2 | insert | 1 |", + "| 2 | update_preimage | 2 |", + "| 12 | update_postimage | 2 |", + "| 3 | insert | 1 |", + "+-------+------------------+-----------------+", + ], &batches } + } + + #[tokio::test] + async fn test_update_cdc_enabled_partitions() { + // Currently you cannot pass EnableChangeDataFeed through `with_configuration_property` + // so the only way to create a truly CDC enabled table is by shoving the Protocol + // directly into the actions list + let actions = vec![Action::Protocol(Protocol::new(1, 4))]; + let table: DeltaTable = DeltaOps::new_in_memory() + .create() + .with_column( + "year", + DeltaDataType::Primitive(PrimitiveType::String), + true, + None, + ) + .with_column( + "value", + DeltaDataType::Primitive(PrimitiveType::Integer), + true, + None, + ) + .with_partition_columns(vec!["year"]) + .with_actions(actions) + .with_configuration_property(TableProperty::EnableChangeDataFeed, Some("true")) + .await + .unwrap(); + assert_eq!(table.version(), 0); + + let schema = Arc::new(Schema::new(vec![ + Field::new("year", DataType::Utf8, true), + Field::new("value", DataType::Int32, true), + ])); + + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(StringArray::from(vec![ + Some("2020"), + Some("2020"), + Some("2024"), + ])), + Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)])), + ], + ) + .unwrap(); + let table = DeltaOps(table) + .write(vec![batch]) + .await + .expect("Failed to write first batch"); + assert_eq!(table.version(), 1); + + let (table, _metrics) = DeltaOps(table) + .update() + .with_predicate(col("value").eq(lit(2))) + .with_update("year", "2024") + .await + .unwrap(); + assert_eq!(table.version(), 2); + + let ctx = SessionContext::new(); + let table = DeltaOps(table) + .load_cdf() + .with_session_ctx(ctx.clone()) + .with_starting_version(0) + .build() + .await + .expect("Failed to load CDF"); + + let mut batches = collect_batches( + table.properties().output_partitioning().partition_count(), + table, + ctx, + ) + .await + .expect("Failed to collect batches"); + + let _ = arrow::util::pretty::print_batches(&batches); + + // The batches will contain a current _commit_timestamp which shouldn't be checked + let _: Vec<_> = batches.iter_mut().map(|b| b.remove_column(3)).collect(); + + assert_batches_sorted_eq!
{[ + "+-------+------------------+-----------------+------+", + "| value | _change_type | _commit_version | year |", + "+-------+------------------+-----------------+------+", + "| 1 | insert | 1 | 2020 |", + "| 2 | insert | 1 | 2020 |", + "| 2 | update_preimage | 2 | 2020 |", + "| 2 | update_postimage | 2 | 2024 |", + "| 3 | insert | 1 | 2024 |", + "+-------+------------------+-----------------+------+", + ], &batches } + } } diff --git a/crates/core/src/operations/vacuum.rs b/crates/core/src/operations/vacuum.rs index 0e4bd2b467..4452526258 100644 --- a/crates/core/src/operations/vacuum.rs +++ b/crates/core/src/operations/vacuum.rs @@ -240,8 +240,11 @@ impl std::future::IntoFuture for VacuumBuilder { fn into_future(self) -> Self::IntoFuture { let this = self; - Box::pin(async move { + if !&this.snapshot.load_config().require_files { + return Err(DeltaTableError::NotInitializedWithFiles("VACUUM".into())); + } + let plan = this.create_vacuum_plan().await?; if this.dry_run { return Ok(( diff --git a/crates/core/src/operations/write.rs b/crates/core/src/operations/write.rs index f3b87d4f66..36dcec5b70 100644 --- a/crates/core/src/operations/write.rs +++ b/crates/core/src/operations/write.rs @@ -27,21 +27,29 @@ use std::collections::HashMap; use std::str::FromStr; use std::sync::Arc; -use std::time::{SystemTime, UNIX_EPOCH}; +use std::time::{Instant, SystemTime, UNIX_EPOCH}; use std::vec; use arrow_array::RecordBatch; use arrow_cast::can_cast_types; use arrow_schema::{ArrowError, DataType, Fields, SchemaRef as ArrowSchemaRef}; use datafusion::execution::context::{SessionContext, SessionState, TaskContext}; -use datafusion::physical_plan::filter::FilterExec; -use datafusion::physical_plan::{memory::MemoryExec, ExecutionPlan}; use datafusion_common::DFSchema; -use datafusion_expr::Expr; +use datafusion_expr::{lit, Expr}; +use datafusion_physical_expr::expressions::{self}; +use datafusion_physical_expr::PhysicalExpr; +use datafusion_physical_plan::filter::FilterExec; +use datafusion_physical_plan::projection::ProjectionExec; +use datafusion_physical_plan::union::UnionExec; +use datafusion_physical_plan::{memory::MemoryExec, ExecutionPlan}; use futures::future::BoxFuture; use futures::StreamExt; +use object_store::prefix::PrefixStore; use parquet::file::properties::WriterProperties; +use serde::{Deserialize, Serialize}; +use tracing::log::*; +use super::cdc::should_write_cdc; use super::datafusion_utils::Expression; use super::transaction::{CommitBuilder, CommitProperties, TableReference, PROTOCOL}; use super::writer::{DeltaWriter, WriterConfig}; @@ -49,13 +57,15 @@ use super::CreateBuilder; use crate::delta_datafusion::expr::fmt_expr_to_sql; use crate::delta_datafusion::expr::parse_predicate_expression; use crate::delta_datafusion::{ - create_physical_expr_fix, find_files, register_store, DeltaScanBuilder, + find_files, register_store, DeltaScanBuilder, DeltaScanConfigBuilder, }; use crate::delta_datafusion::{DataFusionMixins, DeltaDataChecker}; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::kernel::{Action, Add, Metadata, PartitionsExt, Remove, StructType}; +use crate::kernel::{ + Action, ActionType, Add, AddCDCFile, Metadata, PartitionsExt, Remove, StructType, +}; use crate::logstore::LogStoreRef; -use crate::operations::cast::{cast_record_batch, merge_schema}; +use crate::operations::cast::{cast_record_batch, merge_schema::merge_arrow_schema}; use crate::protocol::{DeltaOperation, SaveMode}; use crate::storage::ObjectStoreRef; use crate::table::state::DeltaTableState; @@ -63,6 
+73,8 @@ use crate::table::Constraint as DeltaConstraint; use crate::writer::record_batch::divide_by_partition_values; use crate::DeltaTable; +use tokio::sync::mpsc::Sender; + #[derive(thiserror::Error, Debug)] enum WriteError { #[error("No data source supplied to write command.")] @@ -153,6 +165,21 @@ pub struct WriteBuilder { configuration: HashMap>, } +#[derive(Default, Debug, Serialize, Deserialize)] +/// Metrics for the Write Operation +pub struct WriteMetrics { + /// Number of files added + pub num_added_files: usize, + /// Number of files removed + pub num_removed_files: usize, + /// Number of partitions + pub num_partitions: usize, + /// Number of rows added + pub num_added_rows: usize, + /// Time taken to execute the entire operation + pub execution_time_ms: u64, +} + impl super::Operation<()> for WriteBuilder {} impl WriteBuilder { @@ -286,17 +313,20 @@ impl WriteBuilder { Some(snapshot) => { PROTOCOL.can_write_to(snapshot)?; - if let Some(plan) = &self.input { - let schema: StructType = (plan.schema()).try_into()?; - PROTOCOL.check_can_write_timestamp_ntz(snapshot, &schema)?; + let schema: StructType = if let Some(plan) = &self.input { + (plan.schema()).try_into()? } else if let Some(batches) = &self.batches { if batches.is_empty() { return Err(WriteError::MissingData.into()); } - let schema: StructType = (batches[0].schema()).try_into()?; + (batches[0].schema()).try_into()? + } else { + return Err(WriteError::MissingData.into()); + }; + + if self.schema_mode.is_none() { PROTOCOL.check_can_write_timestamp_ntz(snapshot, &schema)?; } - match self.mode { SaveMode::ErrorIfExists => { Err(WriteError::AlreadyExists(self.log_store.root_uri()).into()) @@ -317,7 +347,7 @@ impl WriteBuilder { }?; let mut builder = CreateBuilder::new() .with_log_store(self.log_store.clone()) - .with_columns(schema.fields().clone()) + .with_columns(schema.fields().cloned()) .with_configuration(self.configuration.clone()); if let Some(partition_columns) = self.partition_columns.as_ref() { builder = builder.with_partition_columns(partition_columns.clone()) @@ -367,18 +397,12 @@ async fn write_execution_plan_with_predicate( target_file_size: Option, write_batch_size: Option, writer_properties: Option, - safe_cast: bool, - schema_mode: Option, writer_stats_config: WriterStatsConfig, + sender: Option>, ) -> DeltaResult> { - let schema: ArrowSchemaRef = if schema_mode.is_some() { - plan.schema() - } else { - snapshot - .and_then(|s| s.input_schema().ok()) - .unwrap_or(plan.schema()) - }; - + // We always take the plan Schema since the data may contain Large/View arrow types, + // the schema and batches were prior constructed with this in mind. 
+ let schema: ArrowSchemaRef = plan.schema(); let checker = if let Some(snapshot) = snapshot { DeltaDataChecker::new(snapshot) } else { @@ -392,7 +416,6 @@ async fn write_execution_plan_with_predicate( } _ => checker, }; - // Write data to disk let mut tasks = vec![]; for i in 0..plan.properties().output_partitioning().partition_count() { @@ -410,26 +433,33 @@ async fn write_execution_plan_with_predicate( ); let mut writer = DeltaWriter::new(object_store.clone(), config); let checker_stream = checker.clone(); + let sender_stream = sender.clone(); let mut stream = inner_plan.execute(i, task_ctx)?; - let handle: tokio::task::JoinHandle>> = - tokio::task::spawn(async move { + + let handle: tokio::task::JoinHandle>> = tokio::task::spawn( + async move { + let sendable = sender_stream.clone(); while let Some(maybe_batch) = stream.next().await { let batch = maybe_batch?; + checker_stream.check_batch(&batch).await?; - let arr = super::cast::cast_record_batch( - &batch, - inner_schema.clone(), - safe_cast, - schema_mode == Some(SchemaMode::Merge), - )?; - writer.write(&arr).await?; + + if let Some(s) = sendable.as_ref() { + if let Err(e) = s.send(batch.clone()).await { + error!("Failed to send data to observer: {e:#?}"); + } + } else { + debug!("write_execution_plan_with_predicate did not send any batches, no sender."); + } + writer.write(&batch).await?; } let add_actions = writer.close().await; match add_actions { Ok(actions) => Ok(actions.into_iter().map(Action::Add).collect::>()), Err(err) => Err(err), } - }); + }, + ); tasks.push(handle); } @@ -447,6 +477,55 @@ async fn write_execution_plan_with_predicate( Ok(actions) } +#[allow(clippy::too_many_arguments)] +pub(crate) async fn write_execution_plan_cdc( + snapshot: Option<&DeltaTableState>, + state: SessionState, + plan: Arc, + partition_columns: Vec, + object_store: ObjectStoreRef, + target_file_size: Option, + write_batch_size: Option, + writer_properties: Option, + writer_stats_config: WriterStatsConfig, + sender: Option>, +) -> DeltaResult> { + let cdc_store = Arc::new(PrefixStore::new(object_store, "_change_data")); + + Ok(write_execution_plan( + snapshot, + state, + plan, + partition_columns, + cdc_store, + target_file_size, + write_batch_size, + writer_properties, + writer_stats_config, + sender, + ) + .await? 
+ .into_iter() + .map(|add| { + // Modify add actions into CDC actions + match add { + Action::Add(add) => { + Action::Cdc(AddCDCFile { + // This is a gnarly hack, but the action needs the nested path, not the + // path isnide the prefixed store + path: format!("_change_data/{}", add.path), + size: add.size, + partition_values: add.partition_values, + data_change: false, + tags: add.tags, + }) + } + _ => panic!("Expected Add action"), + } + }) + .collect::>()) +} + #[allow(clippy::too_many_arguments)] pub(crate) async fn write_execution_plan( snapshot: Option<&DeltaTableState>, @@ -457,9 +536,8 @@ pub(crate) async fn write_execution_plan( target_file_size: Option, write_batch_size: Option, writer_properties: Option, - safe_cast: bool, - schema_mode: Option, writer_stats_config: WriterStatsConfig, + sender: Option>, ) -> DeltaResult> { write_execution_plan_with_predicate( None, @@ -471,9 +549,8 @@ pub(crate) async fn write_execution_plan( target_file_size, write_batch_size, writer_properties, - safe_cast, - schema_mode, writer_stats_config, + sender, ) .await } @@ -488,44 +565,165 @@ async fn execute_non_empty_expr( rewrite: &[Add], writer_properties: Option, writer_stats_config: WriterStatsConfig, + partition_scan: bool, + insert_plan: Arc, ) -> DeltaResult> { // For each identified file perform a parquet scan + filter + limit (1) + count. // If returned count is not zero then append the file to be rewritten and removed from the log. Otherwise do nothing to the file. + let mut actions: Vec = Vec::new(); - let input_schema = snapshot.input_schema()?; - let input_dfschema: DFSchema = input_schema.clone().as_ref().clone().try_into()?; + // Take the insert plan schema since it might have been schema evolved, if its not + // it is simply the table schema + let df_schema = insert_plan.schema(); + let input_dfschema: DFSchema = df_schema.as_ref().clone().try_into()?; + + let scan_config = DeltaScanConfigBuilder::new() + .with_schema(snapshot.input_schema()?) 
+ .build(snapshot)?; let scan = DeltaScanBuilder::new(snapshot, log_store.clone(), &state) .with_files(rewrite) + // Use input schema which doesn't wrap partition values, otherwise divide_by_partition_value won't work on UTF8 partitions + // Since it can't fetch a scalar from a dictionary type + .with_scan_config(scan_config) .build() .await?; let scan = Arc::new(scan); - // Apply the negation of the filter and rewrite files - let negated_expression = Expr::Not(Box::new(Expr::IsTrue(Box::new(expression.clone())))); + // We don't want to verify the predicate against existing data + if !partition_scan { + // Apply the negation of the filter and rewrite files + let negated_expression = Expr::Not(Box::new(Expr::IsTrue(Box::new(expression.clone())))); + + let predicate_expr = state.create_physical_expr(negated_expression, &input_dfschema)?; + let filter: Arc = + Arc::new(FilterExec::try_new(predicate_expr, scan.clone())?); + + let add_actions: Vec = write_execution_plan( + Some(snapshot), + state.clone(), + filter, + partition_columns.clone(), + log_store.object_store(), + Some(snapshot.table_config().target_file_size() as usize), + None, + writer_properties.clone(), + writer_stats_config.clone(), + None, + ) + .await?; - let predicate_expr = - create_physical_expr_fix(negated_expression, &input_dfschema, state.execution_props())?; - let filter: Arc = - Arc::new(FilterExec::try_new(predicate_expr, scan.clone())?); + actions.extend(add_actions); + } - // We don't want to verify the predicate against existing data - let add_actions = write_execution_plan( - Some(snapshot), - state, - filter, - partition_columns, - log_store.object_store(), - Some(snapshot.table_config().target_file_size() as usize), - None, - writer_properties, - false, - None, - writer_stats_config, - ) - .await?; + // CDC logic: simply filters data with the predicate and adds the _change_type="delete" as a literal column + // Only write CDC actions when it was not a partition scan, load_cdf can deduce the deletes in that case + // based on the remove actions if a partition got deleted + if !partition_scan { + // We only write deletions when it was not a partition scan + if let Some(cdc_actions) = execute_non_empty_expr_cdc( + snapshot, + log_store, + state.clone(), + scan, + input_dfschema, + expression, + partition_columns, + writer_properties, + writer_stats_config, + insert_plan, + ) + .await?
+ { + actions.extend(cdc_actions) + } + } + Ok(actions) +} - Ok(add_actions) +/// If CDC is enabled it writes all the deletions based on predicate into _change_data directory +#[allow(clippy::too_many_arguments)] +pub(crate) async fn execute_non_empty_expr_cdc( + snapshot: &DeltaTableState, + log_store: LogStoreRef, + state: SessionState, + scan: Arc, + input_dfschema: DFSchema, + expression: &Expr, + table_partition_cols: Vec, + writer_properties: Option, + writer_stats_config: WriterStatsConfig, + insert_plan: Arc, +) -> DeltaResult>> { + match should_write_cdc(snapshot) { + // Create CDC scan + Ok(true) => { + let cdc_predicate_expr = + state.create_physical_expr(expression.clone(), &input_dfschema)?; + let cdc_scan: Arc = + Arc::new(FilterExec::try_new(cdc_predicate_expr, scan.clone())?); + + // Add literal column "_change_type" + let delete_change_type_expr = + state.create_physical_expr(lit("delete"), &input_dfschema)?; + + let insert_change_type_expr = + state.create_physical_expr(lit("insert"), &input_dfschema)?; + + // Project columns and lit + let mut delete_project_expressions: Vec<(Arc, String)> = scan + .schema() + .fields() + .into_iter() + .enumerate() + .map(|(idx, field)| -> (Arc, String) { + ( + Arc::new(expressions::Column::new(field.name(), idx)), + field.name().to_owned(), + ) + }) + .collect(); + + let mut insert_project_expressions = delete_project_expressions.clone(); + delete_project_expressions.insert( + delete_project_expressions.len(), + (delete_change_type_expr, "_change_type".to_owned()), + ); + insert_project_expressions.insert( + insert_project_expressions.len(), + (insert_change_type_expr, "_change_type".to_owned()), + ); + + let delete_plan: Arc = Arc::new(ProjectionExec::try_new( + delete_project_expressions, + cdc_scan.clone(), + )?); + + let insert_plan: Arc = Arc::new(ProjectionExec::try_new( + insert_project_expressions, + insert_plan.clone(), + )?); + + let cdc_plan: Arc = + Arc::new(UnionExec::new(vec![delete_plan, insert_plan])); + + let cdc_actions = write_execution_plan_cdc( + Some(snapshot), + state.clone(), + cdc_plan.clone(), + table_partition_cols.clone(), + log_store.object_store(), + Some(snapshot.table_config().target_file_size() as usize), + None, + writer_properties, + writer_stats_config, + None, + ) + .await?; + Ok(Some(cdc_actions)) + } + _ => Ok(None), + } } // This should only be called wth a valid predicate @@ -539,28 +737,26 @@ async fn prepare_predicate_actions( writer_properties: Option, deletion_timestamp: i64, writer_stats_config: WriterStatsConfig, + insert_plan: Arc, ) -> DeltaResult> { let candidates = find_files(snapshot, log_store.clone(), &state, Some(predicate.clone())).await?; - let add = if candidates.partition_scan { - Vec::new() - } else { - execute_non_empty_expr( - snapshot, - log_store, - state, - partition_columns, - &predicate, - &candidates.candidates, - writer_properties, - writer_stats_config, - ) - .await? 
- }; - let remove = candidates.candidates; + let mut actions = execute_non_empty_expr( + snapshot, + log_store, + state, + partition_columns, + &predicate, + &candidates.candidates, + writer_properties, + writer_stats_config, + candidates.partition_scan, + insert_plan, + ) + .await?; - let mut actions: Vec = add.into_iter().collect(); + let remove = candidates.candidates; for action in remove { actions.push(Action::Remove(Remove { @@ -587,9 +783,15 @@ impl std::future::IntoFuture for WriteBuilder { let this = self; Box::pin(async move { + let mut metrics = WriteMetrics::default(); + let exec_start = Instant::now(); + if this.mode == SaveMode::Overwrite { if let Some(snapshot) = &this.snapshot { PROTOCOL.check_append_only(&snapshot.snapshot)?; + if !snapshot.load_config().require_files { + return Err(DeltaTableError::NotInitializedWithFiles("WRITE".into())); + } } } if this.schema_mode == Some(SchemaMode::Overwrite) && this.mode != SaveMode::Overwrite { @@ -639,35 +841,52 @@ impl std::future::IntoFuture for WriteBuilder { let mut new_schema = None; if let Some(snapshot) = &this.snapshot { - let table_schema = snapshot - .physical_arrow_schema(this.log_store.object_store().clone()) - .await - .or_else(|_| snapshot.arrow_schema()) - .unwrap_or(schema.clone()); - + let table_schema = snapshot.input_schema()?; if let Err(schema_err) = try_cast_batch(schema.fields(), table_schema.fields()) { schema_drift = true; - if this.mode == SaveMode::Overwrite && this.schema_mode.is_some() { + if this.mode == SaveMode::Overwrite + && this.schema_mode == Some(SchemaMode::Overwrite) + { new_schema = None // we overwrite anyway, so no need to cast } else if this.schema_mode == Some(SchemaMode::Merge) { - new_schema = - Some(merge_schema(table_schema.clone(), schema.clone())?); + new_schema = Some(merge_arrow_schema( + table_schema.clone(), + schema.clone(), + schema_drift, + )?); } else { return Err(schema_err.into()); } + } else if this.mode == SaveMode::Overwrite + && this.schema_mode == Some(SchemaMode::Overwrite) + { + new_schema = None // we overwrite anyway, so no need to cast + } else { + // Schema needs to be merged so that utf8/binary/list types are preserved from the batch side if both table + // and batch contains such type. Other types are preserved from the table side. + // At this stage it will never introduce more fields since try_cast_batch passed correctly. + new_schema = Some(merge_arrow_schema( + table_schema.clone(), + schema.clone(), + schema_drift, + )?); } } - let data = if !partition_columns.is_empty() { // TODO partitioning should probably happen in its own plan ... let mut partitions: HashMap> = HashMap::new(); + let mut num_partitions = 0; + let mut num_added_rows = 0; for batch in batches { let real_batch = match new_schema.clone() { - Some(new_schema) => { - cast_record_batch(&batch, new_schema, false, true)? - } + Some(new_schema) => cast_record_batch( + &batch, + new_schema, + this.safe_cast, + schema_drift, // Schema drifted so we have to add the missing columns/structfields. 
+ )?, None => batch, }; @@ -676,7 +895,9 @@ impl std::future::IntoFuture for WriteBuilder { partition_columns.clone(), &real_batch, )?; + num_partitions += divided.len(); for part in divided { + num_added_rows += part.record_batch.num_rows(); let key = part.partition_values.hive_partition_path(); match partitions.get_mut(&key) { Some(part_batches) => { @@ -688,22 +909,30 @@ impl std::future::IntoFuture for WriteBuilder { } } } + metrics.num_partitions = num_partitions; + metrics.num_added_rows = num_added_rows; partitions.into_values().collect::>() } else { match new_schema { Some(ref new_schema) => { let mut new_batches = vec![]; + let mut num_added_rows = 0; for batch in batches { new_batches.push(cast_record_batch( &batch, new_schema.clone(), - false, - true, + this.safe_cast, + schema_drift, // Schema drifted so we have to add the missing columns/structfields. )?); + num_added_rows += batch.num_rows(); } + metrics.num_added_rows = num_added_rows; vec![new_batches] } - None => vec![batches], + None => { + metrics.num_added_rows = batches.iter().map(|b| b.num_rows()).sum(); + vec![batches] + } } }; @@ -720,12 +949,38 @@ impl std::future::IntoFuture for WriteBuilder { if this.schema_mode == Some(SchemaMode::Merge) && schema_drift { if let Some(snapshot) = &this.snapshot { let schema_struct: StructType = schema.clone().try_into()?; + let current_protocol = snapshot.protocol(); + let configuration = snapshot.metadata().configuration.clone(); + let maybe_new_protocol = if PROTOCOL + .contains_timestampntz(schema_struct.fields()) + && !current_protocol + .reader_features + .clone() + .unwrap_or_default() + .contains(&crate::kernel::ReaderFeatures::TimestampWithoutTimezone) + // We can check only reader features, as reader and writer timestampNtz + // should be always enabled together + { + let new_protocol = current_protocol.clone().enable_timestamp_ntz(); + if !(current_protocol.min_reader_version == 3 + && current_protocol.min_writer_version == 7) + { + Some(new_protocol.move_table_properties_into_features(&configuration)) + } else { + Some(new_protocol) + } + } else { + None + }; let schema_action = Action::Metadata(Metadata::try_new( schema_struct, partition_columns.clone(), - snapshot.metadata().configuration.clone(), + configuration, )?); actions.push(schema_action); + if let Some(new_protocol) = maybe_new_protocol { + actions.push(new_protocol.into()) + } } } let state = match this.state { @@ -757,6 +1012,9 @@ impl std::future::IntoFuture for WriteBuilder { .as_ref() .map(|snapshot| snapshot.table_config()); + let target_file_size = this.target_file_size.or_else(|| { + Some(super::get_target_file_size(&config, &this.configuration) as usize) + }); let (num_indexed_cols, stats_columns) = super::get_num_idx_cols_and_stats_columns(config, this.configuration); @@ -764,33 +1022,58 @@ impl std::future::IntoFuture for WriteBuilder { num_indexed_cols, stats_columns, }; + // Here we need to validate if the new data conforms to a predicate if one is provided let add_actions = write_execution_plan_with_predicate( predicate.clone(), this.snapshot.as_ref(), state.clone(), - plan, + plan.clone(), partition_columns.clone(), this.log_store.object_store().clone(), - this.target_file_size, + target_file_size, this.write_batch_size, this.writer_properties.clone(), - this.safe_cast, - this.schema_mode, writer_stats_config.clone(), + None, ) .await?; + metrics.num_added_files = add_actions.len(); actions.extend(add_actions); // Collect remove actions if we are overwriting the table if let Some(snapshot) = 
&this.snapshot { if matches!(this.mode, SaveMode::Overwrite) { // Update metadata with new schema - let table_schema = snapshot - .physical_arrow_schema(this.log_store.object_store().clone()) - .await - .or_else(|_| snapshot.arrow_schema()) - .unwrap_or(schema.clone()); + let table_schema = snapshot.input_schema()?; + + let configuration = snapshot.metadata().configuration.clone(); + let current_protocol = snapshot.protocol(); + let maybe_new_protocol = if PROTOCOL.contains_timestampntz( + TryInto::::try_into(schema.clone())?.fields(), + ) && !current_protocol + .reader_features + .clone() + .unwrap_or_default() + .contains(&crate::kernel::ReaderFeatures::TimestampWithoutTimezone) + // We can check only reader features, as reader and writer timestampNtz + // should be always enabled together + { + let new_protocol = current_protocol.clone().enable_timestamp_ntz(); + if !(current_protocol.min_reader_version == 3 + && current_protocol.min_writer_version == 7) + { + Some(new_protocol.move_table_properties_into_features(&configuration)) + } else { + Some(new_protocol) + } + } else { + None + }; + + if let Some(protocol) = maybe_new_protocol { + actions.push(protocol.into()) + } if schema != table_schema { let mut metadata = snapshot.metadata().clone(); @@ -815,6 +1098,7 @@ impl std::future::IntoFuture for WriteBuilder { this.writer_properties, deletion_timestamp, writer_stats_config, + plan, ) .await?; if !predicate_actions.is_empty() { @@ -830,8 +1114,15 @@ impl std::future::IntoFuture for WriteBuilder { } }; } + metrics.num_removed_files = actions + .iter() + .filter(|a| a.action_type() == ActionType::Remove) + .count(); } + metrics.execution_time_ms = + Instant::now().duration_since(exec_start).as_millis() as u64; + let operation = DeltaOperation::Write { mode: this.mode, partition_by: if !partition_columns.is_empty() { @@ -842,7 +1133,13 @@ impl std::future::IntoFuture for WriteBuilder { predicate: predicate_str, }; - let commit = CommitBuilder::from(this.commit_properties) + let mut commit_properties = this.commit_properties.clone(); + commit_properties.app_metadata.insert( + "operationMetrics".to_owned(), + serde_json::to_value(&metrics)?, + ); + + let commit = CommitBuilder::from(commit_properties) .with_actions(actions) .build( this.snapshot.as_ref().map(|f| f as &dyn TableReference), @@ -924,26 +1221,51 @@ fn try_cast_batch(from_fields: &Fields, to_fields: &Fields) -> Result<(), ArrowE #[cfg(test)] mod tests { use super::*; + use crate::logstore::get_actions; + use crate::operations::load_cdf::collect_batches; use crate::operations::{collect_sendable_stream, DeltaOps}; use crate::protocol::SaveMode; + use crate::test_utils::{TestResult, TestSchemas}; use crate::writer::test_utils::datafusion::{get_data, get_data_sorted, write_batch}; use crate::writer::test_utils::{ get_arrow_schema, get_delta_schema, get_delta_schema_with_nested_struct, get_record_batch, get_record_batch_with_nested_struct, setup_table_with_configuration, }; - use crate::DeltaConfigKey; + use crate::TableProperty; use arrow_array::{Int32Array, StringArray, TimestampMicrosecondArray}; use arrow_schema::{DataType, Field, Schema as ArrowSchema, TimeUnit}; use datafusion::prelude::*; use datafusion::{assert_batches_eq, assert_batches_sorted_eq}; + use itertools::Itertools; use serde_json::{json, Value}; + async fn get_write_metrics(table: DeltaTable) -> WriteMetrics { + let mut commit_info = table.history(Some(1)).await.unwrap(); + let metrics = commit_info + .first_mut() + .unwrap() + .info + .remove("operationMetrics") 
+ .unwrap(); + serde_json::from_value(metrics).unwrap() + } + + fn assert_common_write_metrics(write_metrics: WriteMetrics) { + assert!(write_metrics.execution_time_ms > 0); + assert!(write_metrics.num_added_files > 0); + } + #[tokio::test] async fn test_write_when_delta_table_is_append_only() { - let table = setup_table_with_configuration(DeltaConfigKey::AppendOnly, Some("true")).await; + let table = setup_table_with_configuration(TableProperty::AppendOnly, Some("true")).await; let batch = get_record_batch(None, false); // Append let table = write_batch(table, batch.clone()).await; + let write_metrics: WriteMetrics = get_write_metrics(table.clone()).await; + assert_eq!(write_metrics.num_added_rows, batch.num_rows()); + assert_eq!(write_metrics.num_removed_files, 0); + assert_common_write_metrics(write_metrics); + // Overwrite let _err = DeltaOps(table) .write(vec![batch]) @@ -959,7 +1281,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() - .with_columns(table_schema.fields().clone()) + .with_columns(table_schema.fields().cloned()) .await .unwrap(); assert_eq!(table.version(), 0); @@ -975,6 +1297,12 @@ mod tests { .unwrap(); assert_eq!(table.version(), 1); assert_eq!(table.get_files_count(), 1); + + let write_metrics: WriteMetrics = get_write_metrics(table.clone()).await; + assert_eq!(write_metrics.num_added_rows, batch.num_rows()); + assert_eq!(write_metrics.num_added_files, table.get_files_count()); + assert_common_write_metrics(write_metrics); + table.load().await.unwrap(); assert_eq!(table.history(None).await.unwrap().len(), 2); assert_eq!( @@ -982,7 +1310,7 @@ mod tests { .info .clone() .into_iter() - .filter(|(k, _)| k != "clientVersion") + .filter(|(k, _)| k == "k1") .collect::>(), metadata ); @@ -998,6 +1326,11 @@ mod tests { .unwrap(); assert_eq!(table.version(), 2); assert_eq!(table.get_files_count(), 2); + let write_metrics: WriteMetrics = get_write_metrics(table.clone()).await; + assert_eq!(write_metrics.num_added_rows, batch.num_rows()); + assert_eq!(write_metrics.num_added_files, 1); + assert_common_write_metrics(write_metrics); + table.load().await.unwrap(); assert_eq!(table.history(None).await.unwrap().len(), 3); assert_eq!( @@ -1005,7 +1338,7 @@ mod tests { .info .clone() .into_iter() - .filter(|(k, _)| k != "clientVersion") + .filter(|(k, _)| k == "k1") .collect::>(), metadata ); @@ -1014,13 +1347,18 @@ mod tests { let metadata: HashMap = HashMap::from_iter(vec![("k2".to_string(), json!("v2.1"))]); let mut table = DeltaOps(table) - .write(vec![batch]) + .write(vec![batch.clone()]) .with_save_mode(SaveMode::Overwrite) .with_commit_properties(CommitProperties::default().with_metadata(metadata.clone())) .await .unwrap(); assert_eq!(table.version(), 3); assert_eq!(table.get_files_count(), 1); + let write_metrics: WriteMetrics = get_write_metrics(table.clone()).await; + assert_eq!(write_metrics.num_added_rows, batch.num_rows()); + assert!(write_metrics.num_removed_files > 0); + assert_common_write_metrics(write_metrics); + table.load().await.unwrap(); assert_eq!(table.history(None).await.unwrap().len(), 4); assert_eq!( @@ -1028,7 +1366,7 @@ mod tests { .info .clone() .into_iter() - .filter(|(k, _)| k != "clientVersion") + .filter(|(k, _)| k == "k2") .collect::>(), metadata ); @@ -1051,6 +1389,9 @@ mod tests { ) .unwrap(); let table = DeltaOps::new_in_memory().write(vec![batch]).await.unwrap(); + let write_metrics: WriteMetrics = get_write_metrics(table.clone()).await; + assert_eq!(write_metrics.num_added_rows, 2); + assert_common_write_metrics(write_metrics); let 
schema = Arc::new(ArrowSchema::new(vec![Field::new( "value", @@ -1075,6 +1416,10 @@ mod tests { .await .unwrap(); + let write_metrics: WriteMetrics = get_write_metrics(table.clone()).await; + assert_eq!(write_metrics.num_added_rows, 3); + assert_common_write_metrics(write_metrics); + let expected = [ "+-------+", "| value |", @@ -1108,6 +1453,10 @@ mod tests { .unwrap(); let table = DeltaOps::new_in_memory().write(vec![batch]).await.unwrap(); + let write_metrics: WriteMetrics = get_write_metrics(table.clone()).await; + assert_eq!(write_metrics.num_added_rows, 1); + assert_common_write_metrics(write_metrics); + let schema = Arc::new(ArrowSchema::new(vec![Field::new( "value", DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".to_string().into())), @@ -1143,7 +1492,9 @@ mod tests { .await .unwrap(); assert_eq!(table.version(), 0); - assert_eq!(table.get_files_count(), 1) + assert_eq!(table.get_files_count(), 1); + let write_metrics: WriteMetrics = get_write_metrics(table.clone()).await; + assert_common_write_metrics(write_metrics); } #[tokio::test] @@ -1157,6 +1508,10 @@ mod tests { .unwrap(); assert_eq!(table.version(), 0); assert_eq!(table.get_files_count(), 2); + let write_metrics: WriteMetrics = get_write_metrics(table.clone()).await; + assert!(write_metrics.num_partitions > 0); + assert_eq!(write_metrics.num_added_files, 2); + assert_common_write_metrics(write_metrics); let table = DeltaOps::new_in_memory() .write(vec![batch]) @@ -1165,7 +1520,12 @@ mod tests { .await .unwrap(); assert_eq!(table.version(), 0); - assert_eq!(table.get_files_count(), 4) + assert_eq!(table.get_files_count(), 4); + + let write_metrics: WriteMetrics = get_write_metrics(table.clone()).await; + assert!(write_metrics.num_partitions > 0); + assert_eq!(write_metrics.num_added_files, 4); + assert_common_write_metrics(write_metrics); } #[tokio::test] @@ -1178,6 +1538,9 @@ mod tests { .unwrap(); assert_eq!(table.version(), 0); + let write_metrics: WriteMetrics = get_write_metrics(table.clone()).await; + assert_common_write_metrics(write_metrics); + let mut new_schema_builder = arrow_schema::SchemaBuilder::new(); for field in batch.schema().fields() { if field.name() != "modified" { @@ -1222,8 +1585,11 @@ mod tests { assert_eq!(table.version(), 1); let new_schema = table.metadata().unwrap().schema().unwrap(); let fields = new_schema.fields(); - let names = fields.iter().map(|f| f.name()).collect::>(); + let names = fields.map(|f| f.name()).collect::>(); assert_eq!(names, vec!["id", "value", "modified", "inserted_by"]); + + let write_metrics: WriteMetrics = get_write_metrics(table.clone()).await; + assert_common_write_metrics(write_metrics); } #[tokio::test] @@ -1237,6 +1603,10 @@ mod tests { .unwrap(); assert_eq!(table.version(), 0); + let write_metrics: WriteMetrics = get_write_metrics(table.clone()).await; + assert!(write_metrics.num_partitions > 0); + assert_common_write_metrics(write_metrics); + let mut new_schema_builder = arrow_schema::SchemaBuilder::new(); for field in batch.schema().fields() { if field.name() != "modified" { @@ -1270,7 +1640,6 @@ mod tests { ], ) .unwrap(); - println!("new_batch: {:?}", new_batch.schema()); let table = DeltaOps(table) .write(vec![new_batch]) .with_save_mode(SaveMode::Append) @@ -1281,11 +1650,15 @@ mod tests { assert_eq!(table.version(), 1); let new_schema = table.metadata().unwrap().schema().unwrap(); let fields = new_schema.fields(); - let mut names = fields.iter().map(|f| f.name()).collect::>(); + let mut names = fields.map(|f| f.name()).collect::>(); names.sort(); 
assert_eq!(names, vec!["id", "inserted_by", "modified", "value"]); let part_cols = table.metadata().unwrap().partition_columns.clone(); assert_eq!(part_cols, vec!["id", "value"]); // we want to preserve partitions + + let write_metrics: WriteMetrics = get_write_metrics(table.clone()).await; + assert!(write_metrics.num_partitions > 0); + assert_common_write_metrics(write_metrics); } #[tokio::test] @@ -1297,7 +1670,8 @@ mod tests { .await .unwrap(); assert_eq!(table.version(), 0); - + let write_metrics: WriteMetrics = get_write_metrics(table.clone()).await; + assert_common_write_metrics(write_metrics); let mut new_schema_builder = arrow_schema::SchemaBuilder::new(); for field in batch.schema().fields() { if field.name() != "modified" { @@ -1350,6 +1724,8 @@ mod tests { .await .unwrap(); assert_eq!(table.version(), 0); + let write_metrics: WriteMetrics = get_write_metrics(table.clone()).await; + assert_common_write_metrics(write_metrics); let mut new_schema_builder = arrow_schema::SchemaBuilder::new(); @@ -1398,13 +1774,15 @@ mod tests { let table = DeltaOps::new_in_memory() .create() .with_save_mode(SaveMode::ErrorIfExists) - .with_columns(schema.fields().clone()) + .with_columns(schema.fields().cloned()) .await .unwrap(); assert_eq!(table.version(), 0); let table = DeltaOps(table).write(vec![batch.clone()]).await.unwrap(); assert_eq!(table.version(), 1); + let write_metrics: WriteMetrics = get_write_metrics(table.clone()).await; + assert_common_write_metrics(write_metrics); let schema: StructType = serde_json::from_value(json!({ "type": "struct", @@ -1420,13 +1798,13 @@ mod tests { let table = DeltaOps::new_in_memory() .create() .with_save_mode(SaveMode::ErrorIfExists) - .with_columns(schema.fields().clone()) + .with_columns(schema.fields().cloned()) .await .unwrap(); assert_eq!(table.version(), 0); let table = DeltaOps(table).write(vec![batch.clone()]).await; - assert!(table.is_err()) + assert!(table.is_err()); } #[tokio::test] @@ -1436,7 +1814,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() - .with_columns(table_schema.fields().clone()) + .with_columns(table_schema.fields().cloned()) .await .unwrap(); assert_eq!(table.version(), 0); @@ -1447,6 +1825,8 @@ mod tests { .await .unwrap(); assert_eq!(table.version(), 1); + let write_metrics: WriteMetrics = get_write_metrics(table.clone()).await; + assert_common_write_metrics(write_metrics); let actual = get_data(&table).await; let expected = DataType::Struct(Fields::from(vec![Field::new( @@ -1485,6 +1865,8 @@ mod tests { .with_partition_columns(["string"]) .await .unwrap(); + let write_metrics: WriteMetrics = get_write_metrics(_table.clone()).await; + assert_common_write_metrics(write_metrics); let table = crate::open_table(tmp_path.as_os_str().to_str().unwrap()) .await @@ -1528,6 +1910,9 @@ mod tests { .await .unwrap(); assert_eq!(table.version(), 0); + let write_metrics: WriteMetrics = get_write_metrics(table.clone()).await; + assert_eq!(write_metrics.num_added_rows, 4); + assert_common_write_metrics(write_metrics); let batch_add = RecordBatch::try_new( Arc::clone(&schema), @@ -1546,6 +1931,9 @@ mod tests { .await .unwrap(); assert_eq!(table.version(), 1); + let write_metrics: WriteMetrics = get_write_metrics(table.clone()).await; + assert_eq!(write_metrics.num_added_rows, 1); + assert_common_write_metrics(write_metrics); let expected = [ "+----+-------+------------+", @@ -1584,6 +1972,8 @@ mod tests { .await .unwrap(); assert_eq!(table.version(), 0); + let write_metrics: WriteMetrics = get_write_metrics(table.clone()).await; 
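
The tests above keep reading the writer's metrics back out of the commit log: the write path serializes a `WriteMetrics` value under the `operationMetrics` key of the commit's `app_metadata`, and the `get_write_metrics` helper deserializes it from `table.history(...)`. A minimal standalone sketch of that round trip is below; the struct definition is an assumption (only the field names asserted in these tests are taken from the diff), not the crate's actual type.

```rust
use std::collections::HashMap;

use serde::{Deserialize, Serialize};
use serde_json::Value;

/// Assumed shape of the metrics attached to each write commit; field names are
/// taken from the test assertions above, everything else is a guess.
#[derive(Debug, Default, Serialize, Deserialize)]
struct WriteMetrics {
    num_added_files: usize,
    num_removed_files: usize,
    num_added_rows: usize,
    num_partitions: usize,
    execution_time_ms: u64,
}

fn main() -> Result<(), serde_json::Error> {
    let metrics = WriteMetrics {
        num_added_files: 1,
        num_added_rows: 3,
        execution_time_ms: 42,
        ..Default::default()
    };

    // The writer inserts the metrics into the commit's application metadata...
    let mut app_metadata: HashMap<String, Value> = HashMap::new();
    app_metadata.insert("operationMetrics".to_owned(), serde_json::to_value(&metrics)?);

    // ...and a reader (like the `get_write_metrics` test helper) pulls them back out.
    let round_tripped: WriteMetrics =
        serde_json::from_value(app_metadata.remove("operationMetrics").unwrap())?;
    assert_eq!(round_tripped.num_added_rows, 3);
    Ok(())
}
```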
+ assert_common_write_metrics(write_metrics); // Take clones of these before an operation resulting in error, otherwise it will // be impossible to refer to an in-memory table @@ -1626,6 +2016,8 @@ mod tests { .await .unwrap(); assert_eq!(table.version(), 0); + let write_metrics: WriteMetrics = get_write_metrics(table.clone()).await; + assert_common_write_metrics(write_metrics); let batch_add = RecordBatch::try_new( Arc::clone(&schema), @@ -1648,6 +2040,9 @@ mod tests { .await .unwrap(); assert_eq!(table.version(), 1); + let write_metrics: WriteMetrics = get_write_metrics(table.clone()).await; + assert_eq!(write_metrics.num_added_rows, 3); + assert_common_write_metrics(write_metrics); let expected = [ "+----+-------+------------+", @@ -1665,4 +2060,255 @@ mod tests { let actual = get_data_sorted(&table, "id,value,modified").await; assert_batches_sorted_eq!(&expected, &actual); } + + #[tokio::test] + async fn test_dont_write_cdc_with_overwrite() -> TestResult { + let delta_schema = TestSchemas::simple(); + let table: DeltaTable = DeltaOps::new_in_memory() + .create() + .with_columns(delta_schema.fields().cloned()) + .with_partition_columns(["id"]) + .with_configuration_property(TableProperty::EnableChangeDataFeed, Some("true")) + .await + .unwrap(); + assert_eq!(table.version(), 0); + + let schema = Arc::new(ArrowSchema::try_from(delta_schema)?); + + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(StringArray::from(vec![Some("1"), Some("2"), Some("3")])), + Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)])), + Arc::new(StringArray::from(vec![ + Some("yes"), + Some("yes"), + Some("no"), + ])), + ], + ) + .unwrap(); + + let second_batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(StringArray::from(vec![Some("3")])), + Arc::new(Int32Array::from(vec![Some(10)])), + Arc::new(StringArray::from(vec![Some("yes")])), + ], + ) + .unwrap(); + + let table = DeltaOps(table) + .write(vec![batch]) + .await + .expect("Failed to write first batch"); + assert_eq!(table.version(), 1); + let write_metrics: WriteMetrics = get_write_metrics(table.clone()).await; + assert_eq!(write_metrics.num_added_rows, 3); + assert_common_write_metrics(write_metrics); + + let table = DeltaOps(table) + .write([second_batch]) + .with_save_mode(crate::protocol::SaveMode::Overwrite) + .await + .unwrap(); + assert_eq!(table.version(), 2); + let write_metrics: WriteMetrics = get_write_metrics(table.clone()).await; + assert_eq!(write_metrics.num_added_rows, 1); + assert!(write_metrics.num_removed_files > 0); + assert_common_write_metrics(write_metrics); + + let snapshot_bytes = table + .log_store + .read_commit_entry(2) + .await? 
+ .expect("failed to get snapshot bytes"); + let version_actions = get_actions(2, snapshot_bytes).await?; + + let cdc_actions = version_actions + .iter() + .filter(|action| matches!(action, &&Action::Cdc(_))) + .collect_vec(); + assert!(cdc_actions.is_empty()); + Ok(()) + } + + #[tokio::test] + async fn test_dont_write_cdc_with_overwrite_predicate_partitioned() -> TestResult { + let delta_schema = TestSchemas::simple(); + let table: DeltaTable = DeltaOps::new_in_memory() + .create() + .with_columns(delta_schema.fields().cloned()) + .with_partition_columns(["id"]) + .with_configuration_property(TableProperty::EnableChangeDataFeed, Some("true")) + .await + .unwrap(); + assert_eq!(table.version(), 0); + + let schema = Arc::new(ArrowSchema::try_from(delta_schema)?); + + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(StringArray::from(vec![Some("1"), Some("2"), Some("3")])), + Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)])), + Arc::new(StringArray::from(vec![ + Some("yes"), + Some("yes"), + Some("no"), + ])), + ], + ) + .unwrap(); + + let second_batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(StringArray::from(vec![Some("3")])), + Arc::new(Int32Array::from(vec![Some(10)])), + Arc::new(StringArray::from(vec![Some("yes")])), + ], + ) + .unwrap(); + + let table = DeltaOps(table) + .write(vec![batch]) + .await + .expect("Failed to write first batch"); + assert_eq!(table.version(), 1); + let write_metrics: WriteMetrics = get_write_metrics(table.clone()).await; + assert_eq!(write_metrics.num_added_rows, 3); + assert!(write_metrics.num_partitions > 0); + assert_common_write_metrics(write_metrics); + + let table = DeltaOps(table) + .write([second_batch]) + .with_save_mode(crate::protocol::SaveMode::Overwrite) + .with_replace_where("id='3'") + .await + .unwrap(); + assert_eq!(table.version(), 2); + let write_metrics: WriteMetrics = get_write_metrics(table.clone()).await; + assert_eq!(write_metrics.num_added_rows, 1); + assert!(write_metrics.num_partitions > 0); + assert!(write_metrics.num_removed_files > 0); + assert_common_write_metrics(write_metrics); + + let snapshot_bytes = table + .log_store + .read_commit_entry(2) + .await? 
+ .expect("failed to get snapshot bytes"); + let version_actions = get_actions(2, snapshot_bytes).await?; + + let cdc_actions = version_actions + .iter() + .filter(|action| matches!(action, &&Action::Cdc(_))) + .collect_vec(); + assert!(cdc_actions.is_empty()); + Ok(()) + } + + #[tokio::test] + async fn test_dont_write_cdc_with_overwrite_predicate_unpartitioned() -> TestResult { + let delta_schema = TestSchemas::simple(); + let table: DeltaTable = DeltaOps::new_in_memory() + .create() + .with_columns(delta_schema.fields().cloned()) + .with_partition_columns(["id"]) + .with_configuration_property(TableProperty::EnableChangeDataFeed, Some("true")) + .await + .unwrap(); + assert_eq!(table.version(), 0); + + let schema = Arc::new(ArrowSchema::try_from(delta_schema)?); + + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(StringArray::from(vec![Some("1"), Some("2"), Some("3")])), + Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)])), + Arc::new(StringArray::from(vec![ + Some("yes"), + Some("yes"), + Some("no"), + ])), + ], + ) + .unwrap(); + + let second_batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(StringArray::from(vec![Some("3")])), + Arc::new(Int32Array::from(vec![Some(3)])), + Arc::new(StringArray::from(vec![Some("yes")])), + ], + ) + .unwrap(); + + let table = DeltaOps(table) + .write(vec![batch]) + .await + .expect("Failed to write first batch"); + assert_eq!(table.version(), 1); + + let table = DeltaOps(table) + .write([second_batch]) + .with_save_mode(crate::protocol::SaveMode::Overwrite) + .with_replace_where("value=3") + .await + .unwrap(); + assert_eq!(table.version(), 2); + + let ctx = SessionContext::new(); + let cdf_scan = DeltaOps(table.clone()) + .load_cdf() + .with_session_ctx(ctx.clone()) + .with_starting_version(0) + .build() + .await + .expect("Failed to load CDF"); + + let mut batches = collect_batches( + cdf_scan + .properties() + .output_partitioning() + .partition_count(), + cdf_scan, + ctx, + ) + .await + .expect("Failed to collect batches"); + + // The batches will contain a current _commit_timestamp which shouldn't be check_append_only + let _: Vec<_> = batches.iter_mut().map(|b| b.remove_column(4)).collect(); + + assert_batches_sorted_eq! {[ + "+-------+----------+--------------+-----------------+----+", + "| value | modified | _change_type | _commit_version | id |", + "+-------+----------+--------------+-----------------+----+", + "| 1 | yes | insert | 1 | 1 |", + "| 2 | yes | insert | 1 | 2 |", + "| 3 | no | delete | 2 | 3 |", + "| 3 | no | insert | 1 | 3 |", + "| 3 | yes | insert | 2 | 3 |", + "+-------+----------+--------------+-----------------+----+", + ], &batches } + + let snapshot_bytes = table + .log_store + .read_commit_entry(2) + .await? 
+ .expect("failed to get snapshot bytes"); + let version_actions = get_actions(2, snapshot_bytes).await?; + + let cdc_actions = version_actions + .iter() + .filter(|action| matches!(action, &&Action::Cdc(_))) + .collect_vec(); + assert!(!cdc_actions.is_empty()); + Ok(()) + } } diff --git a/crates/core/src/operations/writer.rs b/crates/core/src/operations/writer.rs index f04d68e412..3c9d3bda97 100644 --- a/crates/core/src/operations/writer.rs +++ b/crates/core/src/operations/writer.rs @@ -2,10 +2,10 @@ use std::collections::HashMap; -use arrow::datatypes::SchemaRef as ArrowSchemaRef; -use arrow::error::ArrowError; -use arrow::record_batch::RecordBatch; +use arrow_array::RecordBatch; +use arrow_schema::{ArrowError, SchemaRef as ArrowSchemaRef}; use bytes::Bytes; +use delta_kernel::expressions::Scalar; use indexmap::IndexMap; use object_store::{path::Path, ObjectStore}; use parquet::arrow::ArrowWriter; @@ -15,7 +15,7 @@ use tracing::debug; use crate::crate_version; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::kernel::{Add, PartitionsExt, Scalar}; +use crate::kernel::{Add, PartitionsExt}; use crate::storage::ObjectStoreRef; use crate::writer::record_batch::{divide_by_partition_values, PartitionResult}; use crate::writer::stats::create_add; @@ -368,7 +368,8 @@ impl PartitionWriter { let file_size = buffer.len() as i64; // write file to object store - self.object_store.put(&path, buffer).await?; + self.object_store.put(&path, buffer.into()).await?; + self.files_written.push( create_add( &self.config.partition_values, diff --git a/crates/core/src/protocol/checkpoints.rs b/crates/core/src/protocol/checkpoints.rs index 67994c5e49..fc2238d03b 100644 --- a/crates/core/src/protocol/checkpoints.rs +++ b/crates/core/src/protocol/checkpoints.rs @@ -8,6 +8,7 @@ use arrow_schema::ArrowError; use chrono::{Datelike, NaiveDate, NaiveDateTime, Utc}; use futures::{StreamExt, TryStreamExt}; +use itertools::Itertools; use lazy_static::lazy_static; use object_store::{Error, ObjectStore}; use parquet::arrow::ArrowWriter; @@ -27,6 +28,7 @@ use crate::logstore::LogStore; use crate::table::state::DeltaTableState; use crate::table::{get_partition_col_data_types, CheckPoint, CheckPointBuilder}; use crate::{open_table_with_version, DeltaTable}; + type SchemaPath = Vec; /// Error returned when there is an error during creating a checkpoint. @@ -57,7 +59,7 @@ enum CheckpointError { source: ArrowError, }, - #[error("missing rewquired action type in snapshot: {0}")] + #[error("missing required action type in snapshot: {0}")] MissingActionType(String), } @@ -169,14 +171,16 @@ pub async fn create_checkpoint_for( let object_store = log_store.object_store(); debug!("Writing checkpoint to {:?}.", checkpoint_path); - object_store.put(&checkpoint_path, parquet_bytes).await?; + object_store + .put(&checkpoint_path, parquet_bytes.into()) + .await?; let last_checkpoint_content: Value = serde_json::to_value(checkpoint)?; let last_checkpoint_content = bytes::Bytes::from(serde_json::to_vec(&last_checkpoint_content)?); debug!("Writing _last_checkpoint to {:?}.", last_checkpoint_path); object_store - .put(&last_checkpoint_path, last_checkpoint_content) + .put(&last_checkpoint_path, last_checkpoint_content.into()) .await?; Ok(()) @@ -259,7 +263,8 @@ fn parquet_bytes_from_state( // Collect a map of paths that require special stats conversion. 
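
Several hunks above (in `writer.rs` and `checkpoints.rs`) change `ObjectStore::put` call sites from passing `Bytes` to passing a `PutPayload` via `buffer.into()`, following the newer `object_store` API. A small hedged sketch of that conversion against the in-memory store, independent of the crate's own wrappers:

```rust
use bytes::Bytes;
use object_store::{memory::InMemory, path::Path, ObjectStore};

#[tokio::main]
async fn main() -> object_store::Result<()> {
    let store = InMemory::new();
    let path = Path::from("_delta_log/00000000000000000000.checkpoint.parquet");

    // Older object_store versions accepted Bytes directly; newer ones take a
    // PutPayload, and `Bytes: Into<PutPayload>` makes a plain `.into()` sufficient.
    let buffer = Bytes::from_static(b"checkpoint bytes");
    store.put(&path, buffer.into()).await?;

    // Reading it back confirms the payload landed as expected.
    let fetched = store.get(&path).await?.bytes().await?;
    assert_eq!(fetched, Bytes::from_static(b"checkpoint bytes"));
    Ok(())
}
```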
let mut stats_conversions: Vec<(SchemaPath, DataType)> = Vec::new(); - collect_stats_conversions(&mut stats_conversions, schema.fields().as_slice()); + let fields = schema.fields().collect_vec(); + collect_stats_conversions(&mut stats_conversions, fields.as_slice()); // if any, tombstones do not include extended file metadata, we must omit the extended metadata fields from the remove schema // See https://github.com/delta-io/delta/blob/master/PROTOCOL.md#add-file-and-remove-file @@ -477,7 +482,7 @@ fn typed_partition_value_from_option_string( } } -fn collect_stats_conversions(paths: &mut Vec<(SchemaPath, DataType)>, fields: &[StructField]) { +fn collect_stats_conversions(paths: &mut Vec<(SchemaPath, DataType)>, fields: &[&StructField]) { let mut _path = SchemaPath::new(); fields .iter() @@ -498,9 +503,7 @@ fn collect_field_conversion( DataType::Struct(struct_field) => { let struct_fields = struct_field.fields(); current_path.push(field.name().to_owned()); - struct_fields - .iter() - .for_each(|f| collect_field_conversion(current_path, all_paths, f)); + struct_fields.for_each(|f| collect_field_conversion(current_path, all_paths, f)); current_path.pop(); } _ => { /* noop */ } @@ -560,7 +563,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() - .with_columns(table_schema.fields().clone()) + .with_columns(table_schema.fields().cloned()) .with_save_mode(crate::protocol::SaveMode::Ignore) .await .unwrap(); @@ -592,7 +595,7 @@ mod tests { let mut table = DeltaOps::new_in_memory() .create() - .with_columns(table_schema.fields().clone()) + .with_columns(table_schema.fields().cloned()) .with_save_mode(crate::protocol::SaveMode::Ignore) .await .unwrap(); @@ -668,7 +671,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() - .with_columns(table_schema.fields().clone()) + .with_columns(table_schema.fields().cloned()) .with_save_mode(crate::protocol::SaveMode::Ignore) .await .unwrap(); @@ -802,9 +805,8 @@ mod tests { #[test] fn collect_stats_conversions_test() { let delta_schema: StructType = serde_json::from_value(SCHEMA.clone()).unwrap(); - let fields = delta_schema.fields(); + let fields = delta_schema.fields().collect_vec(); let mut paths = Vec::new(); - collect_stats_conversions(&mut paths, fields.as_slice()); assert_eq!(2, paths.len()); diff --git a/crates/core/src/protocol/mod.rs b/crates/core/src/protocol/mod.rs index 9cfa429fde..f82f48411a 100644 --- a/crates/core/src/protocol/mod.rs +++ b/crates/core/src/protocol/mod.rs @@ -2,9 +2,11 @@ #![allow(non_camel_case_types)] -pub mod checkpoints; -mod parquet_read; -mod time_utils; +use std::borrow::Borrow; +use std::collections::HashMap; +use std::hash::{Hash, Hasher}; +use std::mem::take; +use std::str::FromStr; use arrow_schema::ArrowError; use futures::StreamExt; @@ -13,18 +15,17 @@ use object_store::{path::Path, Error as ObjectStoreError, ObjectStore}; use regex::Regex; use serde::{Deserialize, Serialize}; use serde_json::Value; -use std::borrow::Borrow; -use std::collections::HashMap; -use std::hash::{Hash, Hasher}; -use std::mem::take; -use std::str::FromStr; use tracing::{debug, error}; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::kernel::{Add, CommitInfo, Metadata, Protocol, Remove}; +use crate::kernel::{Add, CommitInfo, Metadata, Protocol, Remove, StructField, TableFeatures}; use crate::logstore::LogStore; use crate::table::CheckPoint; +pub mod checkpoints; +mod parquet_read; +mod time_utils; + /// Error returned when an invalid Delta log action is encountered. 
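
A recurring change in these hunks is that the schema's `fields()` now yields an iterator of field references instead of a `Vec`, so call sites switch to `fields().cloned()` when owned fields are needed (e.g. `.with_columns(...)`) or materialize a `Vec<&StructField>` with itertools' `collect_vec` before passing a slice of references. A standalone sketch of the pattern; the `StructField` here is a toy stand-in, not the kernel type:

```rust
use itertools::Itertools;

// Toy stand-in for the kernel's StructField; only the shape matters here.
#[derive(Clone)]
struct StructField {
    name: String,
}

// Mirrors the new signature style: a slice of references rather than owned fields.
fn collect_names(fields: &[&StructField]) -> Vec<String> {
    fields.iter().map(|f| f.name.clone()).collect()
}

fn main() {
    let schema = vec![
        StructField { name: "id".to_owned() },
        StructField { name: "value".to_owned() },
    ];

    // An iterator of references, materialized once so it can be passed as a slice.
    let fields = schema.iter().collect_vec();
    assert_eq!(collect_names(fields.as_slice()), vec!["id", "value"]);

    // When owned fields are needed, clone the iterator's items instead.
    let owned: Vec<StructField> = schema.iter().cloned().collect();
    assert_eq!(owned.len(), 2);
}
```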
#[allow(missing_docs)] #[derive(thiserror::Error, Debug)] @@ -196,18 +197,9 @@ impl PartialStats { let null_count = take(&mut self.null_count); Stats { num_records: self.num_records, - min_values: match min_values { - Some(minv) => minv, - None => HashMap::default(), - }, - max_values: match max_values { - Some(maxv) => maxv, - None => HashMap::default(), - }, - null_count: match null_count { - Some(nc) => nc, - None => HashMap::default(), - }, + min_values: min_values.unwrap_or_default(), + max_values: max_values.unwrap_or_default(), + null_count: null_count.unwrap_or_default(), } } } @@ -267,17 +259,11 @@ impl Add { /// Returns the serde_json representation of stats contained in the action if present. /// Since stats are defined as optional in the protocol, this may be None. - fn get_json_stats(&self) -> Result, serde_json::error::Error> { - let ps: Result, serde_json::error::Error> = self - .stats + pub fn get_json_stats(&self) -> Result, serde_json::error::Error> { + self.stats .as_ref() - .map_or(Ok(None), |s| serde_json::from_str(s)); - - match ps { - Ok(Some(mut partial)) => Ok(Some(partial.as_stats())), - Ok(None) => Ok(None), - Err(e) => Err(e), - } + .map(|stats| serde_json::from_str(stats).map(|mut ps: PartialStats| ps.as_stats())) + .transpose() } } @@ -326,6 +312,13 @@ pub struct MergePredicate { #[derive(Serialize, Deserialize, Debug, Clone)] #[serde(rename_all = "camelCase")] pub enum DeltaOperation { + /// Represents a Delta `Add Column` operation. + /// Used to add new columns or field in a struct + AddColumn { + /// Fields added to existing schema + fields: Vec, + }, + /// Represents a Delta `Create` operation. /// Would usually only create the table, if also data is written, /// a `Write` operations is more appropriate @@ -371,6 +364,12 @@ pub enum DeltaOperation { expr: String, }, + /// Add table features to a table + AddFeature { + /// Name of the feature + name: Vec, + }, + /// Drops constraints from a table DropConstraint { /// Constraints name @@ -458,6 +457,7 @@ impl DeltaOperation { pub fn name(&self) -> &str { // operation names taken from https://learn.microsoft.com/en-us/azure/databricks/delta/history#--operation-metrics-keys match &self { + DeltaOperation::AddColumn { .. } => "ADD COLUMN", DeltaOperation::Create { mode: SaveMode::Overwrite, .. @@ -476,6 +476,7 @@ impl DeltaOperation { DeltaOperation::VacuumEnd { .. } => "VACUUM END", DeltaOperation::AddConstraint { .. } => "ADD CONSTRAINT", DeltaOperation::DropConstraint { .. } => "DROP CONSTRAINT", + DeltaOperation::AddFeature { .. } => "ADD FEATURE", } } @@ -513,6 +514,8 @@ impl DeltaOperation { match self { Self::Optimize { .. } | Self::SetTableProperties { .. } + | Self::AddColumn { .. } + | Self::AddFeature { .. } | Self::VacuumStart { .. } | Self::VacuumEnd { .. } | Self::AddConstraint { .. 
} @@ -1082,6 +1085,7 @@ mod tests { } #[tokio::test] + #[ignore = "column mapping not yet supported."] async fn test_with_column_mapping() { // test table with column mapping and partitions let path = "../test/tests/data/table_with_column_mapping"; @@ -1225,6 +1229,15 @@ mod tests { assert_eq!(&expected_null_count, null_count_column); } + #[tokio::test] + async fn test_table_checkpoint_not_always_with_stats() { + let path = "../test/tests/data/delta-checkpoint-stats-optional"; + let mut table = crate::open_table(path).await.unwrap(); + table.load().await.unwrap(); + + assert_eq!(2, table.snapshot().unwrap().file_actions().unwrap().len()); + } + #[tokio::test] async fn test_only_struct_stats() { // test table with no json stats diff --git a/crates/core/src/schema/partitions.rs b/crates/core/src/schema/partitions.rs index c766c1d630..23abb3896e 100644 --- a/crates/core/src/schema/partitions.rs +++ b/crates/core/src/schema/partitions.rs @@ -1,12 +1,13 @@ //! Delta Table partition handling logic. -//! -use serde::{Serialize, Serializer}; use std::cmp::Ordering; use std::collections::HashMap; use std::convert::TryFrom; +use delta_kernel::expressions::Scalar; +use serde::{Serialize, Serializer}; + use crate::errors::DeltaTableError; -use crate::kernel::{DataType, PrimitiveType, Scalar}; +use crate::kernel::{scalars::ScalarExt, DataType, PrimitiveType}; /// A special value used in Hive to represent the null partition in partitioned tables pub const NULL_PARTITION_VALUE_DATA_PATH: &str = "__HIVE_DEFAULT_PARTITION__"; @@ -32,6 +33,42 @@ pub enum PartitionValue { NotIn(Vec), } +#[derive(Clone, Debug, PartialEq)] +struct ScalarHelper<'a>(&'a Scalar); + +impl PartialOrd for ScalarHelper<'_> { + fn partial_cmp(&self, other: &Self) -> Option { + use Scalar::*; + match (self.0, other.0) { + (Null(_), Null(_)) => Some(Ordering::Equal), + (Integer(a), Integer(b)) => a.partial_cmp(b), + (Long(a), Long(b)) => a.partial_cmp(b), + (Short(a), Short(b)) => a.partial_cmp(b), + (Byte(a), Byte(b)) => a.partial_cmp(b), + (Float(a), Float(b)) => a.partial_cmp(b), + (Double(a), Double(b)) => a.partial_cmp(b), + (String(a), String(b)) => a.partial_cmp(b), + (Boolean(a), Boolean(b)) => a.partial_cmp(b), + (Timestamp(a), Timestamp(b)) => a.partial_cmp(b), + (TimestampNtz(a), TimestampNtz(b)) => a.partial_cmp(b), + (Date(a), Date(b)) => a.partial_cmp(b), + (Binary(a), Binary(b)) => a.partial_cmp(b), + (Decimal(a, p1, s1), Decimal(b, p2, s2)) => { + // TODO implement proper decimal comparison + if p1 != p2 || s1 != s2 { + return None; + }; + a.partial_cmp(b) + } + // TODO should we make an assumption about the ordering of nulls? + // rigth now this is only used for internal purposes. + (Null(_), _) => Some(Ordering::Less), + (_, Null(_)) => Some(Ordering::Greater), + _ => None, + } + } +} + /// A Struct used for filtering a DeltaTable partition by key and value. #[derive(Clone, Debug, PartialEq, Eq)] pub struct PartitionFilter { @@ -49,7 +86,7 @@ fn compare_typed_value( match data_type { DataType::Primitive(primitive_type) => { let other = primitive_type.parse_scalar(filter_value).ok()?; - partition_value.partial_cmp(&other) + ScalarHelper(partition_value).partial_cmp(&ScalarHelper(&other)) } // NOTE: complex types are not supported as partition columns _ => None, @@ -239,6 +276,37 @@ impl DeltaTablePartition { } } +/// +/// A HivePartition string is represented by a "key=value" format. 
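
The `ScalarHelper` newtype introduced above appears to exist because `Scalar` now comes from `delta_kernel`, so this crate can no longer implement `PartialOrd` on it directly (orphan rule); wrapping a reference in a local type sidesteps that. A minimal standalone sketch of the same pattern, using a toy enum rather than the kernel `Scalar`:

```rust
use std::cmp::Ordering;

// Toy stand-in for a foreign scalar type we cannot implement std traits for directly.
#[derive(PartialEq)]
enum Scalar {
    Integer(i32),
    String(String),
    Null,
}

// Local newtype over a reference: the orphan rule allows trait impls on this.
#[derive(PartialEq)]
struct ScalarHelper<'a>(&'a Scalar);

impl PartialOrd for ScalarHelper<'_> {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        use Scalar::*;
        match (self.0, other.0) {
            (Integer(a), Integer(b)) => a.partial_cmp(b),
            (String(a), String(b)) => a.partial_cmp(b),
            (Null, Null) => Some(Ordering::Equal),
            // Like the hunk above, nulls sort below everything else...
            (Null, _) => Some(Ordering::Less),
            (_, Null) => Some(Ordering::Greater),
            // ...and values of different non-null types are simply not ordered.
            _ => None,
        }
    }
}

fn main() {
    let a = Scalar::Integer(1);
    let b = Scalar::Integer(2);
    assert_eq!(
        ScalarHelper(&a).partial_cmp(&ScalarHelper(&b)),
        Some(Ordering::Less)
    );
    assert_eq!(
        ScalarHelper(&a).partial_cmp(&ScalarHelper(&Scalar::String("x".into()))),
        None
    );
}
```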
+/// +/// ```rust +/// # use delta_kernel::expressions::Scalar; +/// use deltalake_core::DeltaTablePartition; +/// +/// let hive_part = "ds=2023-01-01"; +/// let partition = DeltaTablePartition::try_from(hive_part).unwrap(); +/// assert_eq!("ds", partition.key); +/// assert_eq!(Scalar::String("2023-01-01".into()), partition.value); +/// ``` +impl TryFrom<&str> for DeltaTablePartition { + type Error = DeltaTableError; + + /// Try to create a DeltaTable partition from a HivePartition string. + /// Returns a DeltaTableError if the string is not in the form of a HivePartition. + fn try_from(partition: &str) -> Result { + let partition_splitted: Vec<&str> = partition.split('=').collect(); + match partition_splitted { + partition_splitted if partition_splitted.len() == 2 => Ok(DeltaTablePartition { + key: partition_splitted[0].to_owned(), + value: Scalar::String(partition_splitted[1].to_owned()), + }), + _ => Err(DeltaTableError::PartitionError { + partition: partition.to_string(), + }), + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -289,4 +357,133 @@ mod tests { "date NOT IN ('2023-11-04', '2023-06-07')", ); } + + #[test] + fn tryfrom_invalid() { + let buf = "this-is-not-a-partition"; + let partition = DeltaTablePartition::try_from(buf); + assert!(partition.is_err()); + } + + #[test] + fn tryfrom_valid() { + let buf = "ds=2024-04-01"; + let partition = DeltaTablePartition::try_from(buf); + assert!(partition.is_ok()); + let partition = partition.unwrap(); + assert_eq!(partition.key, "ds"); + assert_eq!(partition.value, Scalar::String("2024-04-01".into())); + } + + #[test] + fn test_create_delta_table_partition() { + let year = "2021".to_string(); + let path = format!("year={year}"); + assert_eq!( + DeltaTablePartition::try_from(path.as_ref()).unwrap(), + DeltaTablePartition { + key: "year".into(), + value: Scalar::String(year.into()), + } + ); + + let _wrong_path = "year=2021/month="; + assert!(matches!( + DeltaTablePartition::try_from(_wrong_path).unwrap_err(), + DeltaTableError::PartitionError { + partition: _wrong_path + }, + )) + } + + #[test] + fn test_match_partition() { + let partition_2021 = DeltaTablePartition { + key: "year".into(), + value: Scalar::String("2021".into()), + }; + let partition_2020 = DeltaTablePartition { + key: "year".into(), + value: Scalar::String("2020".into()), + }; + let partition_2019 = DeltaTablePartition { + key: "year".into(), + value: Scalar::String("2019".into()), + }; + + let partition_year_2020_filter = PartitionFilter { + key: "year".to_string(), + value: PartitionValue::Equal("2020".to_string()), + }; + let partition_month_12_filter = PartitionFilter { + key: "month".to_string(), + value: PartitionValue::Equal("12".to_string()), + }; + let string_type = DataType::Primitive(PrimitiveType::String); + + assert!(!partition_year_2020_filter.match_partition(&partition_2021, &string_type)); + assert!(partition_year_2020_filter.match_partition(&partition_2020, &string_type)); + assert!(!partition_year_2020_filter.match_partition(&partition_2019, &string_type)); + assert!(!partition_month_12_filter.match_partition(&partition_2019, &string_type)); + + /* TODO: To be re-enabled at a future date, needs some type futzing + let partition_2020_12_31_23_59_59 = DeltaTablePartition { + key: "time".into(), + value: PrimitiveType::TimestampNtz.parse_scalar("2020-12-31 23:59:59").expect("Failed to parse timestamp"), + }; + + let partition_time_2020_12_31_23_59_59_filter = PartitionFilter { + key: "time".to_string(), + value: PartitionValue::Equal("2020-12-31 
23:59:59.000000".into()), + }; + + assert!(partition_time_2020_12_31_23_59_59_filter.match_partition( + &partition_2020_12_31_23_59_59, + &DataType::Primitive(PrimitiveType::TimestampNtz) + )); + assert!(!partition_time_2020_12_31_23_59_59_filter + .match_partition(&partition_2020_12_31_23_59_59, &string_type)); + */ + } + + #[test] + fn test_match_filters() { + let partitions = vec![ + DeltaTablePartition { + key: "year".into(), + value: Scalar::String("2021".into()), + }, + DeltaTablePartition { + key: "month".into(), + value: Scalar::String("12".into()), + }, + ]; + + let string_type = DataType::Primitive(PrimitiveType::String); + let partition_data_types: HashMap<&String, &DataType> = vec![ + (&partitions[0].key, &string_type), + (&partitions[1].key, &string_type), + ] + .into_iter() + .collect(); + + let valid_filters = PartitionFilter { + key: "year".to_string(), + value: PartitionValue::Equal("2021".to_string()), + }; + + let valid_filter_month = PartitionFilter { + key: "month".to_string(), + value: PartitionValue::Equal("12".to_string()), + }; + + let invalid_filter = PartitionFilter { + key: "year".to_string(), + value: PartitionValue::Equal("2020".to_string()), + }; + + assert!(valid_filters.match_partitions(&partitions, &partition_data_types),); + assert!(valid_filter_month.match_partitions(&partitions, &partition_data_types),); + assert!(!invalid_filter.match_partitions(&partitions, &partition_data_types),); + } } diff --git a/crates/core/src/storage/file.rs b/crates/core/src/storage/file.rs index c63a00dae6..100faafcc5 100644 --- a/crates/core/src/storage/file.rs +++ b/crates/core/src/storage/file.rs @@ -1,17 +1,17 @@ //! Local file storage backend. This backend read and write objects from local filesystem. //! //! The local file storage backend is multi-writer safe. +use std::ops::Range; +use std::sync::Arc; use bytes::Bytes; use futures::stream::BoxStream; use object_store::{ local::LocalFileSystem, path::Path as ObjectStorePath, Error as ObjectStoreError, GetOptions, - GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutOptions, PutResult, + GetResult, ListResult, ObjectMeta, ObjectStore, PutOptions, PutResult, Result as ObjectStoreResult, }; -use std::ops::Range; -use std::sync::Arc; -use tokio::io::AsyncWrite; +use object_store::{MultipartUpload, PutMultipartOpts, PutPayload}; use url::Url; const STORE_NAME: &str = "DeltaLocalObjectStore"; @@ -106,14 +106,14 @@ impl From for ObjectStoreError { /// Multi-writer support for different platforms: /// /// * Modern Linux kernels are well supported. However because Linux implementation leverages -/// `RENAME_NOREPLACE`, older versions of the kernel might not work depending on what filesystem is -/// being used: +/// `RENAME_NOREPLACE`, older versions of the kernel might not work depending on what filesystem is +/// being used: /// * ext4 requires >= Linux 3.15 /// * btrfs, shmem, and cif requires >= Linux 3.17 /// * xfs requires >= Linux 4.0 /// * ext2, minix, reiserfs, jfs, vfat, and bpf requires >= Linux 4.9 /// * Darwin is supported but not fully tested. -/// Patches welcome. +/// Patches welcome. /// * Support for other platforms are not implemented at the moment. 
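
The multi-writer guarantees described above hinge on a rename that refuses to overwrite an existing destination (`RENAME_NOREPLACE` on newer Linux kernels, with a generic fallback elsewhere). Below is a hedged sketch of the two-system-call fallback idea only, not the crate's exact implementation or error mapping:

```rust
use std::{fs, io, path::Path};

/// Emulate "rename, but fail if the destination already exists" without kernel
/// support: hard-link the source to the destination (which errors with
/// `AlreadyExists` if the destination is taken), then remove the source.
fn rename_noreplace(from: &Path, to: &Path) -> io::Result<()> {
    fs::hard_link(from, to)?;
    fs::remove_file(from)
}

fn main() -> io::Result<()> {
    let dir = std::env::temp_dir();
    let from = dir.join("delta-rename-src.tmp");
    let to = dir.join("delta-rename-dst.tmp");

    fs::write(&from, b"commit")?;
    let _ = fs::remove_file(&to); // make sure the destination is free for the demo

    rename_noreplace(&from, &to)?;
    assert_eq!(fs::read(&to)?, b"commit".to_vec());
    fs::remove_file(&to)
}
```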
#[derive(Debug)] pub struct FileStorageBackend { @@ -166,14 +166,18 @@ impl std::fmt::Display for FileStorageBackend { #[async_trait::async_trait] impl ObjectStore for FileStorageBackend { - async fn put(&self, location: &ObjectStorePath, bytes: Bytes) -> ObjectStoreResult { + async fn put( + &self, + location: &ObjectStorePath, + bytes: PutPayload, + ) -> ObjectStoreResult { self.inner.put(location, bytes).await } async fn put_opts( &self, location: &ObjectStorePath, - bytes: Bytes, + bytes: PutPayload, options: PutOptions, ) -> ObjectStoreResult { self.inner.put_opts(location, bytes, options).await @@ -254,16 +258,16 @@ impl ObjectStore for FileStorageBackend { async fn put_multipart( &self, location: &ObjectStorePath, - ) -> ObjectStoreResult<(MultipartId, Box)> { + ) -> ObjectStoreResult> { self.inner.put_multipart(location).await } - async fn abort_multipart( + async fn put_multipart_opts( &self, location: &ObjectStorePath, - multipart_id: &MultipartId, - ) -> ObjectStoreResult<()> { - self.inner.abort_multipart(location, multipart_id).await + options: PutMultipartOpts, + ) -> ObjectStoreResult> { + self.inner.put_multipart_opts(location, options).await } } @@ -275,10 +279,7 @@ async fn rename_noreplace(from: &str, to: &str) -> Result<(), LocalFileSystemErr } // Generic implementation (Requires 2 system calls) -#[cfg(not(any( - all(target_os = "linux", target_env = "gnu", glibc_renameat2), - target_os = "macos" -)))] +#[cfg(not(any(all(target_os = "linux", target_env = "gnu"), target_os = "macos")))] mod imp { use super::*; @@ -319,10 +320,7 @@ mod imp { } // Optimized implementations (Only 1 system call) -#[cfg(any( - all(target_os = "linux", target_env = "gnu", glibc_renameat2), - target_os = "macos" -))] +#[cfg(any(all(target_os = "linux", target_env = "gnu"), target_os = "macos"))] mod imp { use super::*; use std::ffi::CString; diff --git a/crates/core/src/storage/mod.rs b/crates/core/src/storage/mod.rs index 3c38a337af..0ad1435d1c 100644 --- a/crates/core/src/storage/mod.rs +++ b/crates/core/src/storage/mod.rs @@ -1,36 +1,336 @@ //! Object storage backend abstraction layer for Delta Table transaction logs and data - -use dashmap::DashMap; -use object_store::limit::LimitStore; use std::collections::HashMap; use std::sync::{Arc, OnceLock}; +use crate::{DeltaResult, DeltaTableError}; +use dashmap::DashMap; +use futures::future::BoxFuture; +use futures::FutureExt; +use futures::TryFutureExt; use lazy_static::lazy_static; +use object_store::limit::LimitStore; +use object_store::local::LocalFileSystem; +use object_store::memory::InMemory; +use object_store::prefix::PrefixStore; +use object_store::{GetOptions, PutOptions, PutPayload, PutResult}; use serde::{Deserialize, Serialize}; +use tokio::runtime::{Builder as RuntimeBuilder, Handle, Runtime}; use url::Url; -pub mod file; -pub mod retry_ext; -pub mod utils; - -use crate::{DeltaResult, DeltaTableError}; - +use bytes::Bytes; +use futures::stream::BoxStream; pub use object_store; -use object_store::local::LocalFileSystem; -use object_store::memory::InMemory; pub use object_store::path::{Path, DELIMITER}; -use object_store::prefix::PrefixStore; pub use object_store::{ DynObjectStore, Error as ObjectStoreError, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result as ObjectStoreResult, }; +use object_store::{MultipartUpload, PutMultipartOpts}; pub use retry_ext::ObjectStoreRetryExt; +use std::ops::Range; pub use utils::*; +pub mod file; +pub mod retry_ext; +pub mod utils; + lazy_static! 
{ static ref DELTA_LOG_PATH: Path = Path::from("_delta_log"); } +/// Creates static IO Runtime with optional configuration +fn io_rt(config: Option<&RuntimeConfig>) -> &Runtime { + static IO_RT: OnceLock = OnceLock::new(); + IO_RT.get_or_init(|| { + let rt = match config { + Some(config) => { + let mut builder = if config.multi_threaded { + RuntimeBuilder::new_multi_thread() + } else { + RuntimeBuilder::new_current_thread() + }; + let builder = builder.worker_threads(config.worker_threads); + let mut builder = if config.enable_io && config.enable_time { + builder.enable_all() + } else if !config.enable_io && config.enable_time { + builder.enable_time() + } else { + builder + }; + #[cfg(unix)] + { + if config.enable_io && !config.enable_time { + builder = builder.enable_io(); + } + } + builder + .thread_name( + config + .thread_name + .clone() + .unwrap_or("IO-runtime".to_string()), + ) + .build() + } + _ => Runtime::new(), + }; + rt.expect("Failed to create a tokio runtime for IO.") + }) +} + +/// Configuration for Tokio runtime +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RuntimeConfig { + multi_threaded: bool, + worker_threads: usize, + thread_name: Option, + enable_io: bool, + enable_time: bool, +} + +/// Provide custom Tokio RT or a runtime config +#[derive(Debug, Clone)] +pub enum IORuntime { + /// Tokio RT handle + RT(Handle), + /// Configuration for tokio runtime + Config(RuntimeConfig), +} + +impl Default for IORuntime { + fn default() -> Self { + IORuntime::RT(io_rt(None).handle().clone()) + } +} + +impl IORuntime { + /// Retrieves the Tokio runtime for IO bound operations + pub fn get_handle(&self) -> Handle { + match self { + IORuntime::RT(handle) => handle, + IORuntime::Config(config) => io_rt(Some(config)).handle(), + } + .clone() + } +} + +/// Wraps any object store and runs IO in it's own runtime [EXPERIMENTAL] +pub struct DeltaIOStorageBackend { + inner: ObjectStoreRef, + rt_handle: Handle, +} + +impl DeltaIOStorageBackend { + /// create wrapped object store which spawns tasks in own runtime + pub fn new(storage: ObjectStoreRef, rt_handle: Handle) -> Self { + Self { + inner: storage, + rt_handle, + } + } + + /// spawn taks on IO runtime + pub fn spawn_io_rt( + &self, + f: F, + store: &Arc, + path: Path, + ) -> BoxFuture<'_, ObjectStoreResult> + where + F: for<'a> FnOnce( + &'a Arc, + &'a Path, + ) -> BoxFuture<'a, ObjectStoreResult> + + Send + + 'static, + O: Send + 'static, + { + let store = Arc::clone(store); + let fut = self.rt_handle.spawn(async move { f(&store, &path).await }); + fut.unwrap_or_else(|e| match e.try_into_panic() { + Ok(p) => std::panic::resume_unwind(p), + Err(e) => Err(ObjectStoreError::JoinError { source: e }), + }) + .boxed() + } + + /// spawn taks on IO runtime + pub fn spawn_io_rt_from_to( + &self, + f: F, + store: &Arc, + from: Path, + to: Path, + ) -> BoxFuture<'_, ObjectStoreResult> + where + F: for<'a> FnOnce( + &'a Arc, + &'a Path, + &'a Path, + ) -> BoxFuture<'a, ObjectStoreResult> + + Send + + 'static, + O: Send + 'static, + { + let store = Arc::clone(store); + let fut = self + .rt_handle + .spawn(async move { f(&store, &from, &to).await }); + fut.unwrap_or_else(|e| match e.try_into_panic() { + Ok(p) => std::panic::resume_unwind(p), + Err(e) => Err(ObjectStoreError::JoinError { source: e }), + }) + .boxed() + } +} + +impl std::fmt::Debug for DeltaIOStorageBackend { + fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + write!(fmt, "DeltaIOStorageBackend") + } +} + +impl std::fmt::Display for 
DeltaIOStorageBackend { + fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + write!(fmt, "DeltaIOStorageBackend") + } +} + +#[async_trait::async_trait] +impl ObjectStore for DeltaIOStorageBackend { + async fn put(&self, location: &Path, bytes: PutPayload) -> ObjectStoreResult { + self.spawn_io_rt( + |store, path| store.put(path, bytes), + &self.inner, + location.clone(), + ) + .await + } + + async fn put_opts( + &self, + location: &Path, + bytes: PutPayload, + options: PutOptions, + ) -> ObjectStoreResult { + self.spawn_io_rt( + |store, path| store.put_opts(path, bytes, options), + &self.inner, + location.clone(), + ) + .await + } + + async fn get(&self, location: &Path) -> ObjectStoreResult { + self.spawn_io_rt(|store, path| store.get(path), &self.inner, location.clone()) + .await + } + + async fn get_opts(&self, location: &Path, options: GetOptions) -> ObjectStoreResult { + self.spawn_io_rt( + |store, path| store.get_opts(path, options), + &self.inner, + location.clone(), + ) + .await + } + + async fn get_range(&self, location: &Path, range: Range) -> ObjectStoreResult { + self.spawn_io_rt( + |store, path| store.get_range(path, range), + &self.inner, + location.clone(), + ) + .await + } + + async fn head(&self, location: &Path) -> ObjectStoreResult { + self.spawn_io_rt( + |store, path| store.head(path), + &self.inner, + location.clone(), + ) + .await + } + + async fn delete(&self, location: &Path) -> ObjectStoreResult<()> { + self.spawn_io_rt( + |store, path| store.delete(path), + &self.inner, + location.clone(), + ) + .await + } + + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, ObjectStoreResult> { + self.inner.list(prefix) + } + + fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> BoxStream<'_, ObjectStoreResult> { + self.inner.list_with_offset(prefix, offset) + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> ObjectStoreResult { + self.inner.list_with_delimiter(prefix).await + } + + async fn copy(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> { + self.spawn_io_rt_from_to( + |store, from_path, to_path| store.copy(from_path, to_path), + &self.inner, + from.clone(), + to.clone(), + ) + .await + } + + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> { + self.spawn_io_rt_from_to( + |store, from_path, to_path| store.copy_if_not_exists(from_path, to_path), + &self.inner, + from.clone(), + to.clone(), + ) + .await + } + + async fn rename_if_not_exists(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> { + self.spawn_io_rt_from_to( + |store, from_path, to_path| store.rename_if_not_exists(from_path, to_path), + &self.inner, + from.clone(), + to.clone(), + ) + .await + } + + async fn put_multipart(&self, location: &Path) -> ObjectStoreResult> { + self.spawn_io_rt( + |store, path| store.put_multipart(path), + &self.inner, + location.clone(), + ) + .await + } + + async fn put_multipart_opts( + &self, + location: &Path, + options: PutMultipartOpts, + ) -> ObjectStoreResult> { + self.spawn_io_rt( + |store, path| store.put_multipart_opts(path, options), + &self.inner, + location.clone(), + ) + .await + } +} + /// Sharable reference to [`ObjectStore`] pub type ObjectStoreRef = Arc; diff --git a/crates/core/src/storage/retry_ext.rs b/crates/core/src/storage/retry_ext.rs index 81a52f3ba3..b63c29a8ae 100644 --- a/crates/core/src/storage/retry_ext.rs +++ b/crates/core/src/storage/retry_ext.rs @@ -1,7 +1,6 @@ //! 
Retry extension for [`ObjectStore`] -use bytes::Bytes; -use object_store::{path::Path, Error, ObjectStore, PutResult, Result}; +use object_store::{path::Path, Error, ObjectStore, PutPayload, PutResult, Result}; use tracing::log::*; /// Retry extension for [`ObjectStore`] @@ -29,7 +28,7 @@ pub trait ObjectStoreRetryExt: ObjectStore { async fn put_with_retries( &self, location: &Path, - bytes: Bytes, + bytes: PutPayload, max_retries: usize, ) -> Result { let mut attempt_number = 1; diff --git a/crates/core/src/table/builder.rs b/crates/core/src/table/builder.rs index b421a6199b..5631079269 100644 --- a/crates/core/src/table/builder.rs +++ b/crates/core/src/table/builder.rs @@ -13,7 +13,7 @@ use url::Url; use super::DeltaTable; use crate::errors::{DeltaResult, DeltaTableError}; use crate::logstore::LogStoreRef; -use crate::storage::{factories, StorageOptions}; +use crate::storage::{factories, IORuntime, StorageOptions}; #[allow(dead_code)] #[derive(Debug, thiserror::Error)] @@ -51,7 +51,7 @@ pub enum DeltaVersion { } /// Configuration options for delta table -#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] +#[derive(Debug, Serialize, Deserialize, Clone)] #[serde(rename_all = "camelCase")] pub struct DeltaTableConfig { /// Indicates whether our use case requires tracking tombstones. @@ -79,6 +79,9 @@ pub struct DeltaTableConfig { /// Control the number of records to read / process from the commit / checkpoint files /// when processing record batches. pub log_batch_size: usize, + #[serde(skip_serializing, skip_deserializing)] + /// When a runtime handler is provided, all IO tasks are spawn in that handle + pub io_runtime: Option, } impl Default for DeltaTableConfig { @@ -88,68 +91,34 @@ impl Default for DeltaTableConfig { require_files: true, log_buffer_size: num_cpus::get() * 4, log_batch_size: 1024, + io_runtime: None, } } } -/// Load-time delta table configuration options -#[derive(Debug)] -pub struct DeltaTableLoadOptions { - /// table root uri - pub table_uri: String, - /// backend to access storage system - pub storage_backend: Option<(Arc, Url)>, - /// specify the version we are going to load: a time stamp, a version, or just the newest - /// available version - pub version: DeltaVersion, - /// Indicates whether our use case requires tracking tombstones. - /// This defaults to `true` - /// - /// Read-only applications never require tombstones. Tombstones - /// are only required when writing checkpoints, so even many writers - /// may want to skip them. - pub require_tombstones: bool, - /// Indicates whether DeltaTable should track files. - /// This defaults to `true` - /// - /// Some append-only applications might have no need of tracking any files. - /// Hence, DeltaTable will be loaded with significant memory reduction. - pub require_files: bool, - /// Controls how many files to buffer from the commit log when updating the table. - /// This defaults to 4 * number of cpus - /// - /// Setting a value greater than 1 results in concurrent calls to the storage api. - /// This can be helpful to decrease latency if there are many files in the log since the - /// last checkpoint, but will also increase memory usage. Possible rate limits of the storage backend should - /// also be considered for optimal performance. - pub log_buffer_size: usize, - /// Control the number of records to read / process from the commit / checkpoint files - /// when processing record batches. 
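
The new `io_runtime` field on `DeltaTableConfig`, together with the `IORuntime` enum and `DeltaIOStorageBackend` added in `storage/mod.rs` above, lets object-store IO be spawned on a dedicated Tokio runtime. A hedged sketch of how a caller might opt in through the builder; the import paths are assumptions based on where the diff places these types:

```rust
use deltalake_core::storage::IORuntime;
use deltalake_core::{DeltaResult, DeltaTableBuilder};

fn main() -> DeltaResult<()> {
    // A runtime dedicated to storage IO; only its handle is handed to the builder.
    let io = tokio::runtime::Builder::new_multi_thread()
        .worker_threads(2)
        .enable_all()
        .thread_name("delta-io")
        .build()
        .expect("failed to build IO runtime");

    // `with_io_runtime` stores the handle in `DeltaTableConfig::io_runtime`, which
    // `build_storage` passes along so the log store can spawn its IO there.
    let table = DeltaTableBuilder::from_uri("memory:///io-runtime-example")
        .with_io_runtime(IORuntime::RT(io.handle().clone()))
        .build()?;

    println!("table configured at {}", table.table_uri());
    Ok(())
}
```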
- pub log_batch_size: usize, -} - -impl DeltaTableLoadOptions { - /// create default table load options for a table uri - pub fn new(table_uri: impl Into) -> Self { - Self { - table_uri: table_uri.into(), - storage_backend: None, - require_tombstones: true, - require_files: true, - log_buffer_size: num_cpus::get() * 4, - version: DeltaVersion::default(), - log_batch_size: 1024, - } +impl PartialEq for DeltaTableConfig { + fn eq(&self, other: &Self) -> bool { + self.require_tombstones == other.require_tombstones + && self.require_files == other.require_files + && self.log_buffer_size == other.log_buffer_size + && self.log_batch_size == other.log_batch_size } } /// builder for configuring a delta table load. #[derive(Debug)] pub struct DeltaTableBuilder { - options: DeltaTableLoadOptions, + /// table root uri + table_uri: String, + /// backend to access storage system + storage_backend: Option<(Arc, Url)>, + /// specify the version we are going to load: a time stamp, a version, or just the newest + /// available version + version: DeltaVersion, storage_options: Option>, #[allow(unused_variables)] allow_http: Option, + table_config: DeltaTableConfig, } impl DeltaTableBuilder { @@ -190,27 +159,30 @@ impl DeltaTableBuilder { debug!("creating table builder with {url}"); Ok(Self { - options: DeltaTableLoadOptions::new(url), + table_uri: url.into(), + storage_backend: None, + version: DeltaVersion::default(), storage_options: None, allow_http: None, + table_config: DeltaTableConfig::default(), }) } /// Sets `require_tombstones=false` to the builder pub fn without_tombstones(mut self) -> Self { - self.options.require_tombstones = false; + self.table_config.require_tombstones = false; self } /// Sets `require_files=false` to the builder pub fn without_files(mut self) -> Self { - self.options.require_files = false; + self.table_config.require_files = false; self } /// Sets `version` to the builder pub fn with_version(mut self, version: i64) -> Self { - self.options.version = DeltaVersion::Version(version); + self.version = DeltaVersion::Version(version); self } @@ -221,7 +193,7 @@ impl DeltaTableBuilder { "Log buffer size should be positive", ))); } - self.options.log_buffer_size = log_buffer_size; + self.table_config.log_buffer_size = log_buffer_size; Ok(self) } @@ -235,7 +207,7 @@ impl DeltaTableBuilder { /// specify a timestamp pub fn with_timestamp(mut self, timestamp: DateTime) -> Self { - self.options.version = DeltaVersion::Timestamp(timestamp); + self.version = DeltaVersion::Timestamp(timestamp); self } @@ -248,20 +220,39 @@ impl DeltaTableBuilder { /// * `storage` - A shared reference to an [`ObjectStore`](object_store::ObjectStore) with "/" pointing at delta table root (i.e. where `_delta_log` is located). /// * `location` - A url corresponding to the storagle location of `storage`. pub fn with_storage_backend(mut self, storage: Arc, location: Url) -> Self { - self.options.storage_backend = Some((storage, location)); + self.storage_backend = Some((storage, location)); self } /// Set options used to initialize storage backend /// /// Options may be passed in the HashMap or set as environment variables. See documentation of - /// underlying object store implementation for details. + /// underlying object store implementation for details. Trailing slash will be trimmed in + /// the option's value to avoid failures. Trimming will only be done if one or more of below + /// conditions are met: + /// - key ends with `_URL` (e.g., `ENDPOINT_URL`, `S3_URL`, `JDBC_URL`, etc.) 
+ /// - value starts with `http://`` or `https://` (e.g., `http://localhost:8000/`) /// /// - [Azure options](https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html#variants) /// - [S3 options](https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html#variants) /// - [Google options](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html#variants) pub fn with_storage_options(mut self, storage_options: HashMap) -> Self { - self.storage_options = Some(storage_options); + self.storage_options = Some( + storage_options + .clone() + .into_iter() + .map(|(k, v)| { + let needs_trim = v.starts_with("http://") + || v.starts_with("https://") + || k.to_lowercase().ends_with("_url"); + if needs_trim { + (k.to_owned(), v.trim_end_matches('/').to_owned()) + } else { + (k, v) + } + }) + .collect(), + ); self } @@ -273,6 +264,12 @@ impl DeltaTableBuilder { self } + /// Provide a custom runtime handle or runtime config + pub fn with_io_runtime(mut self, io_runtime: IORuntime) -> Self { + self.table_config.io_runtime = Some(io_runtime); + self + } + /// Storage options for configuring backend object store pub fn storage_options(&self) -> StorageOptions { let mut storage_options = self.storage_options.clone().unwrap_or_default(); @@ -286,22 +283,28 @@ impl DeltaTableBuilder { } /// Build a delta storage backend for the given config - pub fn build_storage(self) -> DeltaResult { - debug!("build_storage() with {}", &self.options.table_uri); - let location = Url::parse(&self.options.table_uri).map_err(|_| { - DeltaTableError::NotATable(format!( - "Could not turn {} into a URL", - self.options.table_uri - )) + pub fn build_storage(&self) -> DeltaResult { + debug!("build_storage() with {}", self.table_uri); + let location = Url::parse(&self.table_uri).map_err(|_| { + DeltaTableError::NotATable(format!("Could not turn {} into a URL", self.table_uri)) })?; - if let Some((store, _url)) = self.options.storage_backend.as_ref() { + if let Some((store, _url)) = self.storage_backend.as_ref() { debug!("Loading a logstore with a custom store: {store:?}"); - crate::logstore::logstore_with(store.clone(), location, self.storage_options()) + crate::logstore::logstore_with( + store.clone(), + location, + self.storage_options(), + self.table_config.io_runtime.clone(), + ) } else { // If there has been no backend defined just default to the normal logstore look up debug!("Loading a logstore based off the location: {location:?}"); - crate::logstore::logstore_for(location, self.storage_options()) + crate::logstore::logstore_for( + location, + self.storage_options(), + self.table_config.io_runtime.clone(), + ) } } @@ -310,18 +313,12 @@ impl DeltaTableBuilder { /// This will not load the log, i.e. the table is not initialized. 
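The `with_storage_options` hunk above only trims a trailing slash when the key looks like a URL key or the value is an http(s) URL, so other values keep their trailing `/`. A standalone sketch of that rule, with a hypothetical helper name:

```rust
use std::collections::HashMap;

// Hypothetical helper mirroring the trimming rule above: drop a trailing '/'
// only for keys ending in `_URL` or values starting with http:// or https://.
fn trim_storage_options(opts: HashMap<String, String>) -> HashMap<String, String> {
    opts.into_iter()
        .map(|(k, v)| {
            let needs_trim = v.starts_with("http://")
                || v.starts_with("https://")
                || k.to_lowercase().ends_with("_url");
            if needs_trim {
                (k, v.trim_end_matches('/').to_owned())
            } else {
                (k, v)
            }
        })
        .collect()
}
```

Under this rule `AWS_ENDPOINT_URL=http://localhost:9000/` becomes `http://localhost:9000`, while values such as `s3a://bucket-name/` are left untouched, matching the test cases added later in this patch.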
To get an initialized /// table use the `load` function pub fn build(self) -> DeltaResult { - let config = DeltaTableConfig { - require_tombstones: self.options.require_tombstones, - require_files: self.options.require_files, - log_buffer_size: self.options.log_buffer_size, - log_batch_size: self.options.log_batch_size, - }; - Ok(DeltaTable::new(self.build_storage()?, config)) + Ok(DeltaTable::new(self.build_storage()?, self.table_config)) } /// Build the [`DeltaTable`] and load its state pub async fn load(self) -> DeltaResult { - let version = self.options.version; + let version = self.version; let mut table = self.build()?; match version { DeltaVersion::Newest => table.load().await?, @@ -561,4 +558,49 @@ mod tests { DeltaTableBuilder::from_valid_uri("this://is.nonsense") .expect_err("this should be an error"); } + + #[test] + fn test_writer_storage_opts_url_trim() { + let cases = [ + // Trim Case 1 - Key indicating a url + ("SOMETHING_URL", "something://else/", "something://else"), + // Trim Case 2 - Value https url ending with slash + ( + "SOMETHING", + "http://something:port/", + "http://something:port", + ), + // Trim Case 3 - Value https url ending with slash + ( + "SOMETHING", + "https://something:port/", + "https://something:port", + ), + // No Trim Case 4 - JDBC MySQL url with slash + ( + "SOME_JDBC_PREFIX", + "jdbc:mysql://mysql.db.server:3306/", + "jdbc:mysql://mysql.db.server:3306/", + ), + // No Trim Case 5 - S3A file system link + ("SOME_S3_LINK", "s3a://bucket-name/", "s3a://bucket-name/"), + // No Trim Case 6 - Not a url but ending with slash + ("SOME_RANDOM_STRING", "a1b2c3d4e5f#/", "a1b2c3d4e5f#/"), + // No Trim Case 7 - Some value not a url + ( + "SOME_VALUE", + "/ This is some value 123 /", + "/ This is some value 123 /", + ), + ]; + for (key, val, expected) in cases { + let table_uri = Url::parse("memory:///test/tests/data/delta-0.8.0").unwrap(); + let mut storage_opts = HashMap::::new(); + storage_opts.insert(key.to_owned(), val.to_owned()); + + let table = DeltaTableBuilder::from_uri(table_uri).with_storage_options(storage_opts); + let found_opts = table.storage_options(); + assert_eq!(expected, found_opts.0.get(key).unwrap()); + } + } } diff --git a/crates/core/src/table/config.rs b/crates/core/src/table/config.rs index 05fb0c53ca..bc04ec6e91 100644 --- a/crates/core/src/table/config.rs +++ b/crates/core/src/table/config.rs @@ -2,19 +2,19 @@ use std::time::Duration; use std::{collections::HashMap, str::FromStr}; +use delta_kernel::features::ColumnMappingMode; use lazy_static::lazy_static; use serde::{Deserialize, Serialize}; -use crate::errors::DeltaTableError; - use super::Constraint; +use crate::errors::DeltaTableError; /// Typed property keys that can be defined on a delta table /// /// #[derive(PartialEq, Eq, Hash)] #[non_exhaustive] -pub enum DeltaConfigKey { +pub enum TableProperty { /// true for this Delta table to be append-only. If append-only, /// existing records cannot be deleted, and existing values cannot be updated. 
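A hedged usage sketch of the refactored builder (the URI and options below are hypothetical): the setters now write straight into `DeltaTableConfig`, which `build`/`load` hand to the log store.

```rust
use std::collections::HashMap;

use deltalake_core::errors::DeltaResult;
use deltalake_core::{DeltaTable, DeltaTableBuilder};

// Sketch only: open a specific version of a table with explicit storage options.
// Assumes the matching object-store backend (e.g. the AWS handler) is registered.
async fn open_versioned_table(
    storage_options: HashMap<String, String>,
) -> DeltaResult<DeltaTable> {
    DeltaTableBuilder::from_uri("s3://my-bucket/my-table")
        .with_storage_options(storage_options)
        .with_version(3)
        .load()
        .await
}
```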
AppendOnly, @@ -116,7 +116,7 @@ pub enum DeltaConfigKey { CheckpointPolicy, } -impl AsRef for DeltaConfigKey { +impl AsRef for TableProperty { fn as_ref(&self) -> &str { match self { Self::AppendOnly => "delta.appendOnly", @@ -146,7 +146,7 @@ impl AsRef for DeltaConfigKey { } } -impl FromStr for DeltaConfigKey { +impl FromStr for TableProperty { type Err = DeltaTableError; fn from_str(s: &str) -> Result { @@ -210,33 +210,35 @@ pub struct TableConfig<'a>(pub(crate) &'a HashMap>); /// Default num index cols pub const DEFAULT_NUM_INDEX_COLS: i32 = 32; +/// Default target file size +pub const DEFAULT_TARGET_FILE_SIZE: i64 = 104857600; impl<'a> TableConfig<'a> { table_config!( ( "true for this Delta table to be append-only", - DeltaConfigKey::AppendOnly, + TableProperty::AppendOnly, append_only, bool, false ), ( "true for Delta Lake to write file statistics in checkpoints in JSON format for the stats column.", - DeltaConfigKey::CheckpointWriteStatsAsJson, + TableProperty::CheckpointWriteStatsAsJson, write_stats_as_json, bool, true ), ( "true for Delta Lake to write file statistics to checkpoints in struct format", - DeltaConfigKey::CheckpointWriteStatsAsStruct, + TableProperty::CheckpointWriteStatsAsStruct, write_stats_as_struct, bool, false ), ( "The target file size in bytes or higher units for file tuning", - DeltaConfigKey::TargetFileSize, + TableProperty::TargetFileSize, target_file_size, i64, // Databricks / spark defaults to 104857600 (bytes) or 100mb @@ -244,14 +246,14 @@ impl<'a> TableConfig<'a> { ), ( "true to enable change data feed.", - DeltaConfigKey::EnableChangeDataFeed, + TableProperty::EnableChangeDataFeed, enable_change_data_feed, bool, false ), ( "true to enable deletion vectors and predictive I/O for updates.", - DeltaConfigKey::EnableDeletionVectors, + TableProperty::EnableDeletionVectors, enable_deletion_vectors, bool, // in databricks the default is dependent on the workspace settings and runtime version @@ -260,21 +262,21 @@ impl<'a> TableConfig<'a> { ), ( "The number of columns for Delta Lake to collect statistics about for data skipping.", - DeltaConfigKey::DataSkippingNumIndexedCols, + TableProperty::DataSkippingNumIndexedCols, num_indexed_cols, i32, 32 ), ( "whether to cleanup expired logs", - DeltaConfigKey::EnableExpiredLogCleanup, + TableProperty::EnableExpiredLogCleanup, enable_expired_log_cleanup, bool, true ), ( "Interval (number of commits) after which a new checkpoint should be created", - DeltaConfigKey::CheckpointInterval, + TableProperty::CheckpointInterval, checkpoint_interval, i32, 100 @@ -295,7 +297,7 @@ impl<'a> TableConfig<'a> { static ref DEFAULT_DURATION: Duration = parse_interval("interval 1 weeks").unwrap(); } self.0 - .get(DeltaConfigKey::DeletedFileRetentionDuration.as_ref()) + .get(TableProperty::DeletedFileRetentionDuration.as_ref()) .and_then(|o| o.as_ref().and_then(|v| parse_interval(v).ok())) .unwrap_or_else(|| DEFAULT_DURATION.to_owned()) } @@ -311,7 +313,7 @@ impl<'a> TableConfig<'a> { static ref DEFAULT_DURATION: Duration = parse_interval("interval 30 days").unwrap(); } self.0 - .get(DeltaConfigKey::LogRetentionDuration.as_ref()) + .get(TableProperty::LogRetentionDuration.as_ref()) .and_then(|o| o.as_ref().and_then(|v| parse_interval(v).ok())) .unwrap_or_else(|| DEFAULT_DURATION.to_owned()) } @@ -321,7 +323,7 @@ impl<'a> TableConfig<'a> { /// Valid values are `Serializable` and `WriteSerializable`. 
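With the rename from `DeltaConfigKey` to `TableProperty`, typed property keys are still passed to the create builder as before. A sketch against an in-memory table (the schema argument and retention value are assumptions for illustration):

```rust
use deltalake_core::errors::DeltaResult;
use deltalake_core::kernel::StructType;
use deltalake_core::table::config::TableProperty;
use deltalake_core::{DeltaOps, DeltaTable};

// Sketch only: set a typed table property at create time via the renamed key enum.
async fn create_with_retention(schema: &StructType) -> DeltaResult<DeltaTable> {
    DeltaOps::new_in_memory()
        .create()
        .with_columns(schema.fields().cloned())
        .with_configuration_property(
            TableProperty::LogRetentionDuration,
            Some("interval 30 days".to_string()),
        )
        .await
}
```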
pub fn isolation_level(&self) -> IsolationLevel { self.0 - .get(DeltaConfigKey::IsolationLevel.as_ref()) + .get(TableProperty::IsolationLevel.as_ref()) .and_then(|o| o.as_ref().and_then(|v| v.parse().ok())) .unwrap_or_default() } @@ -329,7 +331,7 @@ impl<'a> TableConfig<'a> { /// Policy applied during chepoint creation pub fn checkpoint_policy(&self) -> CheckpointPolicy { self.0 - .get(DeltaConfigKey::CheckpointPolicy.as_ref()) + .get(TableProperty::CheckpointPolicy.as_ref()) .and_then(|o| o.as_ref().and_then(|v| v.parse().ok())) .unwrap_or_default() } @@ -337,7 +339,7 @@ impl<'a> TableConfig<'a> { /// Return the column mapping mode according to delta.columnMapping.mode pub fn column_mapping_mode(&self) -> ColumnMappingMode { self.0 - .get(DeltaConfigKey::ColumnMappingMode.as_ref()) + .get(TableProperty::ColumnMappingMode.as_ref()) .and_then(|o| o.as_ref().and_then(|v| v.parse().ok())) .unwrap_or_default() } @@ -360,7 +362,7 @@ impl<'a> TableConfig<'a> { /// This property takes precedence over [num_indexed_cols](Self::num_indexed_cols). pub fn stats_columns(&self) -> Option> { self.0 - .get(DeltaConfigKey::DataSkippingStatsColumns.as_ref()) + .get(TableProperty::DataSkippingStatsColumns.as_ref()) .and_then(|o| o.as_ref().map(|v| v.split(',').collect())) } } @@ -463,49 +465,6 @@ impl FromStr for CheckpointPolicy { } } -#[derive(Serialize, Deserialize, Debug, Copy, Clone, PartialEq)] -/// The Column Mapping modes used for reading and writing data -#[serde(rename_all = "camelCase")] -pub enum ColumnMappingMode { - /// No column mapping is applied - None, - /// Columns are mapped by their field_id in parquet - Id, - /// Columns are mapped to a physical name - Name, -} - -impl Default for ColumnMappingMode { - fn default() -> Self { - Self::None - } -} - -impl AsRef for ColumnMappingMode { - fn as_ref(&self) -> &str { - match self { - Self::None => "none", - Self::Id => "id", - Self::Name => "name", - } - } -} - -impl FromStr for ColumnMappingMode { - type Err = DeltaTableError; - - fn from_str(s: &str) -> Result { - match s.to_ascii_lowercase().as_str() { - "none" => Ok(Self::None), - "id" => Ok(Self::Id), - "name" => Ok(Self::Name), - _ => Err(DeltaTableError::Generic( - "Invalid string for ColumnMappingMode".into(), - )), - } - } -} - const SECONDS_PER_MINUTE: u64 = 60; const SECONDS_PER_HOUR: u64 = 60 * SECONDS_PER_MINUTE; const SECONDS_PER_DAY: u64 = 24 * SECONDS_PER_HOUR; @@ -577,7 +536,7 @@ mod tests { // change to 2 day let mut md = dummy_metadata(); md.configuration.insert( - DeltaConfigKey::DeletedFileRetentionDuration + TableProperty::DeletedFileRetentionDuration .as_ref() .to_string(), Some("interval 2 day".to_string()), @@ -608,7 +567,7 @@ mod tests { // change to false let mut md = dummy_metadata(); md.configuration.insert( - DeltaConfigKey::EnableExpiredLogCleanup.as_ref().into(), + TableProperty::EnableExpiredLogCleanup.as_ref().into(), Some("false".to_string()), ); let config = TableConfig(&md.configuration); diff --git a/crates/core/src/table/mod.rs b/crates/core/src/table/mod.rs index 4b818513b0..65d84985c7 100644 --- a/crates/core/src/table/mod.rs +++ b/crates/core/src/table/mod.rs @@ -30,6 +30,7 @@ pub mod state_arrow; /// Metadata for a checkpoint file #[derive(Serialize, Deserialize, Debug, Default, Clone, Copy)] +#[serde(rename_all = "camelCase")] pub struct CheckPoint { /// Delta table version pub(crate) version: i64, // 20 digits decimals @@ -163,7 +164,6 @@ pub(crate) fn get_partition_col_data_types<'a>( // When loading `partitionValues_parsed` we have to convert the 
stringified partition values back to the correct data type. schema .fields() - .iter() .filter_map(|f| { if metadata .partition_columns @@ -240,9 +240,12 @@ impl<'de> Deserialize<'de> for DeltaTable { let storage_config: LogStoreConfig = seq .next_element()? .ok_or_else(|| A::Error::invalid_length(0, &self))?; - let log_store = - crate::logstore::logstore_for(storage_config.location, storage_config.options) - .map_err(|_| A::Error::custom("Failed deserializing LogStore"))?; + let log_store = crate::logstore::logstore_for( + storage_config.location, + storage_config.options, + None, + ) + .map_err(|_| A::Error::custom("Failed deserializing LogStore"))?; let table = DeltaTable { state, @@ -288,6 +291,11 @@ impl DeltaTable { self.log_store.object_store() } + /// Check if the [`DeltaTable`] exists + pub async fn verify_deltatable_existence(&self) -> DeltaResult { + self.log_store.is_delta_table_location().await + } + /// The URI of the underlying data pub fn table_uri(&self) -> String { self.log_store.root_uri() @@ -619,4 +627,21 @@ mod tests { .unwrap(); (dt, tmp_dir) } + + #[test] + fn checkpoint_should_serialize_in_camel_case() { + let checkpoint = CheckPoint { + version: 1, + size: 1, + parts: None, + size_in_bytes: Some(1), + num_of_add_files: Some(1), + }; + + let checkpoint_json_serialized = + serde_json::to_string(&checkpoint).expect("could not serialize to json"); + + assert!(checkpoint_json_serialized.contains("sizeInBytes")); + assert!(checkpoint_json_serialized.contains("numOfAddFiles")); + } } diff --git a/crates/core/src/table/state.rs b/crates/core/src/table/state.rs index 9544198581..0876dc9e79 100644 --- a/crates/core/src/table/state.rs +++ b/crates/core/src/table/state.rs @@ -181,6 +181,11 @@ impl DeltaTableState { self.snapshot.schema() } + /// Get the table config which is loaded with of the snapshot + pub fn load_config(&self) -> &DeltaTableConfig { + &self.snapshot.load_config() + } + /// Well known table configuration pub fn table_config(&self) -> TableConfig<'_> { self.snapshot.table_config() diff --git a/crates/core/src/table/state_arrow.rs b/crates/core/src/table/state_arrow.rs index fe35787cb4..e4a374b763 100644 --- a/crates/core/src/table/state_arrow.rs +++ b/crates/core/src/table/state_arrow.rs @@ -6,17 +6,17 @@ use std::borrow::Cow; use std::collections::{HashMap, HashSet, VecDeque}; use std::sync::Arc; -use arrow::compute::cast; -use arrow::compute::kernels::cast_utils::Parser; use arrow_array::types::{Date32Type, TimestampMicrosecondType}; use arrow_array::{ Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Float64Array, Int64Array, NullArray, StringArray, StructArray, TimestampMicrosecondArray, TimestampMillisecondArray, }; +use arrow_cast::cast; +use arrow_cast::parse::Parser; use arrow_schema::{DataType, Field, Fields, TimeUnit}; +use delta_kernel::features::ColumnMappingMode; use itertools::Itertools; -use super::config::ColumnMappingMode; use super::state::DeltaTableState; use crate::errors::DeltaTableError; use crate::kernel::{Add, DataType as DeltaDataType, StructType}; @@ -149,7 +149,13 @@ impl DeltaTableState { .map( |name| -> Result { let schema = metadata.schema()?; - let field = schema.field_with_name(name)?; + let field = + schema + .field(name) + .ok_or(DeltaTableError::MetadataError(format!( + "Invalid partition column {0}", + name + )))?; Ok(field.data_type().try_into()?) 
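The new `verify_deltatable_existence` helper above delegates to `LogStore::is_delta_table_location`. A sketch of probing for a table without loading its log (the URI is hypothetical):

```rust
use deltalake_core::errors::DeltaResult;
use deltalake_core::DeltaTableBuilder;

// Sketch only: `build()` does not read the log, so the handle can be used to
// check for a `_delta_log` before attempting a full `load()`.
async fn table_exists(uri: &str) -> DeltaResult<bool> {
    let table = DeltaTableBuilder::from_uri(uri).build()?;
    table.verify_deltatable_existence().await
}
```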
}, ) @@ -173,12 +179,12 @@ impl DeltaTableState { .map(|name| -> Result<_, DeltaTableError> { let physical_name = self .schema() - .field_with_name(name) - .or(Err(DeltaTableError::MetadataError(format!( + .field(name) + .ok_or(DeltaTableError::MetadataError(format!( "Invalid partition column {0}", name - ))))? - .physical_name()? + )))? + .physical_name(column_mapping_mode)? .to_string(); Ok((physical_name, name.as_str())) }) @@ -674,7 +680,6 @@ impl<'a> SchemaLeafIterator<'a> { SchemaLeafIterator { fields_remaining: schema .fields() - .iter() .map(|field| (vec![field.name().as_ref()], field.data_type())) .collect(), } diff --git a/crates/core/src/test_utils/factories/actions.rs b/crates/core/src/test_utils/factories/actions.rs new file mode 100644 index 0000000000..1f1e13a793 --- /dev/null +++ b/crates/core/src/test_utils/factories/actions.rs @@ -0,0 +1,153 @@ +use std::collections::HashMap; + +use arrow_array::*; +use chrono::Utc; +use delta_kernel::schema::{DataType, PrimitiveType}; +use object_store::path::Path; +use object_store::ObjectMeta; + +use super::{get_parquet_bytes, DataFactory, FileStats}; +use crate::kernel::arrow::extract::{self as ex}; +use crate::kernel::partitions_schema; +use crate::kernel::{Add, Metadata, Protocol, ReaderFeatures, Remove, StructType, WriterFeatures}; +use crate::operations::transaction::PROTOCOL; + +pub struct ActionFactory; + +impl ActionFactory { + pub fn add_raw( + meta: ObjectMeta, + stats: FileStats, + partition_values: HashMap>, + data_change: bool, + ) -> Add { + Add { + path: meta.location.to_string(), + size: meta.size as i64, + partition_values, + data_change, + modification_time: meta.last_modified.timestamp_millis(), + stats: serde_json::to_string(&stats).ok(), + tags: Some(HashMap::new()), + default_row_commit_version: None, + deletion_vector: None, + base_row_id: None, + clustering_provider: None, + stats_parsed: None, + } + } + + pub fn add( + schema: &StructType, + bounds: HashMap<&str, (&str, &str)>, + partition_columns: Vec, + data_change: bool, + ) -> Add { + let partitions_schema = partitions_schema(&schema, &partition_columns).unwrap(); + let partition_values = if let Some(p_schema) = partitions_schema { + let batch = DataFactory::record_batch(&p_schema, 1, &bounds).unwrap(); + p_schema + .fields() + .map(|f| { + let value = match f.data_type() { + DataType::Primitive(PrimitiveType::String) => { + let arr = + ex::extract_and_cast::(&batch, f.name()).unwrap(); + Some(arr.value(0).to_string()) + } + DataType::Primitive(PrimitiveType::Integer) => { + let arr = ex::extract_and_cast::(&batch, f.name()).unwrap(); + Some(arr.value(0).to_string()) + } + DataType::Primitive(PrimitiveType::Long) => { + let arr = ex::extract_and_cast::(&batch, f.name()).unwrap(); + Some(arr.value(0).to_string()) + } + _ => unimplemented!(), + }; + (f.name().to_owned(), value) + }) + .collect() + } else { + HashMap::new() + }; + + let data_schema = StructType::new( + schema + .fields() + .filter(|f| !partition_columns.contains(f.name())) + .cloned() + .collect(), + ); + + let batch = DataFactory::record_batch(&data_schema, 10, &bounds).unwrap(); + let stats = DataFactory::file_stats(&batch).unwrap(); + let path = Path::from(generate_file_name()); + let data = get_parquet_bytes(&batch).unwrap(); + let meta = ObjectMeta { + location: path.clone(), + size: data.len(), + last_modified: Utc::now(), + e_tag: None, + version: None, + }; + ActionFactory::add_raw(meta, stats, partition_values, data_change) + } + + pub fn remove(add: &Add, data_change: bool) -> Remove 
{ + add_as_remove(add, data_change) + } + + pub fn protocol( + max_reader: Option, + max_writer: Option, + reader_features: Option>, + writer_features: Option>, + ) -> Protocol { + Protocol { + min_reader_version: max_reader.unwrap_or(PROTOCOL.default_reader_version()), + min_writer_version: max_writer.unwrap_or(PROTOCOL.default_writer_version()), + writer_features: writer_features.map(|i| i.into_iter().collect()), + reader_features: reader_features.map(|i| i.into_iter().collect()), + } + } + + pub fn metadata( + schema: &StructType, + partition_columns: Option>, + configuration: Option>>, + ) -> Metadata { + Metadata { + id: uuid::Uuid::new_v4().hyphenated().to_string(), + format: Default::default(), + schema_string: serde_json::to_string(schema).unwrap(), + partition_columns: partition_columns + .map(|i| i.into_iter().map(|c| c.to_string()).collect()) + .unwrap_or_default(), + configuration: configuration.unwrap_or_default(), + name: None, + description: None, + created_time: Some(Utc::now().timestamp_millis()), + } + } +} + +pub fn add_as_remove(add: &Add, data_change: bool) -> Remove { + Remove { + path: add.path.clone(), + data_change, + deletion_timestamp: Some(Utc::now().timestamp_millis()), + size: Some(add.size), + extended_file_metadata: Some(true), + partition_values: Some(add.partition_values.clone()), + tags: add.tags.clone(), + deletion_vector: add.deletion_vector.clone(), + base_row_id: add.base_row_id, + default_row_commit_version: add.default_row_commit_version, + } +} + +fn generate_file_name() -> String { + let file_name = uuid::Uuid::new_v4().hyphenated().to_string(); + format!("part-0001-{}.parquet", file_name) +} diff --git a/crates/core/src/test_utils/factories/data.rs b/crates/core/src/test_utils/factories/data.rs new file mode 100644 index 0000000000..d69869ae92 --- /dev/null +++ b/crates/core/src/test_utils/factories/data.rs @@ -0,0 +1,247 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use arrow_arith::aggregate::{max as arrow_max, max_string, min as arrow_min, min_string}; +use arrow_array::*; +use arrow_schema::DataType as ArrowDataType; +use bytes::Bytes; +use delta_kernel::expressions::Scalar; +use parquet::arrow::arrow_writer::ArrowWriter; +use parquet::file::properties::WriterProperties; +use rand::distributions::{Alphanumeric, DistString, Distribution, Uniform}; + +use super::super::TestResult; +use super::FileStats; +use crate::kernel::scalars::ScalarExt; +use crate::kernel::{DataType, PrimitiveType, StructType}; + +pub struct DataFactory; + +impl DataFactory { + pub fn record_batch( + schema: &StructType, + length: usize, + bounds: &HashMap<&str, (&str, &str)>, + ) -> TestResult { + generate_random_batch(schema, length, bounds) + } + + pub fn file_stats(batch: &RecordBatch) -> TestResult { + get_stats(batch) + } + + pub fn array( + data_type: DataType, + length: usize, + min_val: Option, + max_val: Option, + ) -> TestResult { + generate_random_array(data_type, length, min_val, max_val) + } +} + +fn generate_random_batch( + schema: &StructType, + length: usize, + bounds: &HashMap<&str, (&str, &str)>, +) -> TestResult { + schema + .fields() + .map(|field| { + let (min_val, max_val) = + if let Some((min_val, max_val)) = bounds.get(field.name().as_str()) { + (*min_val, *max_val) + } else { + // NOTE providing illegal strings will resolve to default bounds, + // an empty string will resolve to null. 
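The data factory draws random column values with `rand`'s `Uniform` distribution. A standalone sketch of that sampling pattern under the rand 0.8 API (the bounds are arbitrary):

```rust
use rand::distributions::{Distribution, Uniform};

// Standalone sketch of the sampling used by the data factory: draw `length`
// i32 values uniformly from an inclusive range.
fn random_i32s(length: usize, min: i32, max: i32) -> Vec<i32> {
    let mut rng = rand::thread_rng();
    let between = Uniform::from(min..=max);
    (0..length).map(|_| between.sample(&mut rng)).collect()
}
```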
+ ("$%&", "$%&") + }; + generate_random_array( + field.data_type().clone(), + length, + Some(min_val.to_string()), + Some(max_val.to_string()), + ) + }) + .collect::>>() + .map(|columns| RecordBatch::try_new(Arc::new(schema.try_into().unwrap()), columns).unwrap()) +} + +pub fn generate_random_array( + data_type: DataType, + length: usize, + min_val: Option, + max_val: Option, +) -> TestResult { + use DataType::*; + use PrimitiveType::*; + let mut rng = rand::thread_rng(); + + match data_type { + Primitive(Integer) => { + let min_val = min_val + .and_then(|min| Integer.parse_scalar(&min).ok()) + .unwrap_or(Scalar::Integer(-10)); + let max_val = max_val + .and_then(|max| Integer.parse_scalar(&max).ok()) + .unwrap_or(Scalar::Integer(10)); + let between = match (min_val, max_val) { + (Scalar::Integer(min), Scalar::Integer(max)) => Uniform::from(min..=max), + _ => unreachable!(), + }; + let arr = Int32Array::from( + (0..length) + .map(|_| between.sample(&mut rng)) + .collect::>(), + ); + Ok(Arc::new(arr)) + } + Primitive(Long) => { + let min_val = min_val + .and_then(|min| Long.parse_scalar(&min).ok()) + .unwrap_or(Scalar::Long(-10)); + let max_val = max_val + .and_then(|max| Long.parse_scalar(&max).ok()) + .unwrap_or(Scalar::Long(10)); + let between = match (min_val, max_val) { + (Scalar::Long(min), Scalar::Long(max)) => Uniform::from(min..=max), + _ => unreachable!(), + }; + let arr = Int64Array::from( + (0..length) + .map(|_| between.sample(&mut rng)) + .collect::>(), + ); + Ok(Arc::new(arr)) + } + Primitive(Float) => { + let min_val = min_val + .and_then(|min| Float.parse_scalar(&min).ok()) + .unwrap_or(Scalar::Float(-10.1)); + let max_val = max_val + .and_then(|max| Float.parse_scalar(&max).ok()) + .unwrap_or(Scalar::Float(10.1)); + let between = match (min_val, max_val) { + (Scalar::Float(min), Scalar::Float(max)) => Uniform::from(min..=max), + _ => unreachable!(), + }; + let arr = Float32Array::from( + (0..length) + .map(|_| between.sample(&mut rng)) + .collect::>(), + ); + Ok(Arc::new(arr)) + } + Primitive(Double) => { + let min_val = min_val + .and_then(|min| Double.parse_scalar(&min).ok()) + .unwrap_or(Scalar::Double(-10.1)); + let max_val = max_val + .and_then(|max| Double.parse_scalar(&max).ok()) + .unwrap_or(Scalar::Double(10.1)); + let between = match (min_val, max_val) { + (Scalar::Double(min), Scalar::Double(max)) => Uniform::from(min..=max), + _ => unreachable!(), + }; + let arr = Float64Array::from( + (0..length) + .map(|_| between.sample(&mut rng)) + .collect::>(), + ); + Ok(Arc::new(arr)) + } + Primitive(String) => { + let arr = StringArray::from( + (0..length) + .map(|_| Alphanumeric.sample_string(&mut rng, 3)) + .collect::>(), + ); + Ok(Arc::new(arr)) + } + _ => todo!(), + } +} + +fn get_stats(batch: &RecordBatch) -> TestResult { + use ArrowDataType::*; + + let mut file_stats = FileStats::new(batch.num_rows() as i64); + for (i, field) in batch.schema().fields().iter().enumerate() { + let array = batch.column(i); + let stats = match array.data_type() { + Int8 => { + let array = array.as_any().downcast_ref::().unwrap(); + let min = Scalar::Byte(arrow_min(array).unwrap()); + let max = Scalar::Byte(arrow_max(array).unwrap()); + let null_count = Scalar::Long(array.null_count() as i64); + Some((null_count, min, max)) + } + Int16 => { + let array = array.as_any().downcast_ref::().unwrap(); + let min = Scalar::Short(arrow_min(array).unwrap()); + let max = Scalar::Short(arrow_max(array).unwrap()); + let null_count = Scalar::Long(array.null_count() as i64); + Some((null_count, min, 
max)) + } + Int32 => { + let array = array.as_any().downcast_ref::().unwrap(); + let min = Scalar::Integer(arrow_min(array).unwrap()); + let max = Scalar::Integer(arrow_max(array).unwrap()); + let null_count = Scalar::Long(array.null_count() as i64); + Some((null_count, min, max)) + } + Int64 => { + let array = array.as_any().downcast_ref::().unwrap(); + let min = Scalar::Long(arrow_min(array).unwrap()); + let max = Scalar::Long(arrow_max(array).unwrap()); + let null_count = Scalar::Long(array.null_count() as i64); + Some((null_count, min, max)) + } + Float32 => { + let array = array.as_any().downcast_ref::().unwrap(); + let min = Scalar::Float(arrow_min(array).unwrap()); + let max = Scalar::Float(arrow_max(array).unwrap()); + let null_count = Scalar::Long(array.null_count() as i64); + Some((null_count, min, max)) + } + Float64 => { + let array = array.as_any().downcast_ref::().unwrap(); + let min = Scalar::Double(arrow_min(array).unwrap()); + let max = Scalar::Double(arrow_max(array).unwrap()); + let null_count = Scalar::Long(array.null_count() as i64); + Some((null_count, min, max)) + } + Utf8 => { + let array = array.as_any().downcast_ref::().unwrap(); + let min = Scalar::String(min_string(array).unwrap().into()); + let max = Scalar::String(max_string(array).unwrap().into()); + let null_count = Scalar::Long(array.null_count() as i64); + Some((null_count, min, max)) + } + Struct(_) => None, + _ => todo!(), + }; + if let Some((null_count, min, max)) = stats { + file_stats + .null_count + .insert(field.name().to_string(), null_count.to_json()); + file_stats + .min_values + .insert(field.name().to_string(), min.to_json()); + file_stats + .max_values + .insert(field.name().to_string(), max.to_json()); + } + } + Ok(file_stats) +} + +pub(crate) fn get_parquet_bytes(batch: &RecordBatch) -> TestResult { + let mut data: Vec = Vec::new(); + let props = WriterProperties::builder().build(); + let mut writer = ArrowWriter::try_new(&mut data, batch.schema(), Some(props))?; + writer.write(batch)?; + // writer must be closed to write footer + writer.close()?; + Ok(data.into()) +} diff --git a/crates/core/src/test_utils/factories/mod.rs b/crates/core/src/test_utils/factories/mod.rs new file mode 100644 index 0000000000..551749a89d --- /dev/null +++ b/crates/core/src/test_utils/factories/mod.rs @@ -0,0 +1,66 @@ +use std::collections::HashMap; + +use lazy_static::lazy_static; +use serde::{Deserialize, Serialize}; +use serde_json::Value; + +use crate::kernel::{DataType, PrimitiveType, StructField, StructType}; + +mod actions; +mod data; + +pub use actions::*; +pub use data::*; + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct FileStats { + pub num_records: i64, + pub null_count: HashMap, + pub min_values: HashMap, + pub max_values: HashMap, +} + +impl FileStats { + pub fn new(num_records: i64) -> Self { + Self { + num_records, + null_count: HashMap::new(), + min_values: HashMap::new(), + max_values: HashMap::new(), + } + } +} + +pub struct TestSchemas; + +impl TestSchemas { + /// A simple flat schema with string and integer columns. + /// + /// ### Columns + /// - id: string + /// - value: integer + /// - modified: string + pub fn simple() -> &'static StructType { + lazy_static! 
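The `get_stats` helper above gathers per-column statistics with arrow's aggregate kernels. A standalone sketch of that pattern for a single `Int32` column:

```rust
use arrow_arith::aggregate::{max as arrow_max, min as arrow_min};
use arrow_array::{Array, Int32Array};

// Standalone sketch: the aggregate kernels return `None` for empty or
// all-null columns; `null_count` comes from the `Array` trait.
fn int32_stats(col: &Int32Array) -> (Option<i32>, Option<i32>, usize) {
    (arrow_min(col), arrow_max(col), col.null_count())
}
```

For example, `int32_stats(&Int32Array::from(vec![Some(3), None, Some(7)]))` yields `(Some(3), Some(7), 1)`.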
{ + static ref _simple: StructType = StructType::new(vec![ + StructField::new( + "id".to_string(), + DataType::Primitive(PrimitiveType::String), + true + ), + StructField::new( + "value".to_string(), + DataType::Primitive(PrimitiveType::Integer), + true + ), + StructField::new( + "modified".to_string(), + DataType::Primitive(PrimitiveType::String), + true + ), + ]); + } + &_simple + } +} diff --git a/crates/core/src/test_utils/mod.rs b/crates/core/src/test_utils/mod.rs new file mode 100644 index 0000000000..0d3ff9ed65 --- /dev/null +++ b/crates/core/src/test_utils/mod.rs @@ -0,0 +1,5 @@ +mod factories; + +pub use factories::*; + +pub type TestResult = Result>; diff --git a/crates/core/src/writer/json.rs b/crates/core/src/writer/json.rs index d97d3ef16c..2cf7f6a950 100644 --- a/crates/core/src/writer/json.rs +++ b/crates/core/src/writer/json.rs @@ -6,6 +6,7 @@ use std::sync::Arc; use arrow::datatypes::{Schema as ArrowSchema, SchemaRef as ArrowSchemaRef}; use arrow::record_batch::*; use bytes::Bytes; +use delta_kernel::expressions::Scalar; use indexmap::IndexMap; use object_store::path::Path; use object_store::ObjectStore; @@ -24,7 +25,7 @@ use super::utils::{ }; use super::{DeltaWriter, DeltaWriterError, WriteMode}; use crate::errors::DeltaTableError; -use crate::kernel::{Add, PartitionsExt, Scalar, StructType}; +use crate::kernel::{scalars::ScalarExt, Add, PartitionsExt, StructType}; use crate::storage::ObjectStoreRetryExt; use crate::table::builder::DeltaTableBuilder; use crate::table::config::DEFAULT_NUM_INDEX_COLS; @@ -362,7 +363,9 @@ impl DeltaWriter> for JsonWriter { let path = next_data_path(&prefix, 0, &uuid, &writer.writer_properties); let obj_bytes = Bytes::from(writer.buffer.to_vec()); let file_size = obj_bytes.len() as i64; - self.storage.put_with_retries(&path, obj_bytes, 15).await?; + self.storage + .put_with_retries(&path, obj_bytes.into(), 15) + .await?; actions.push(create_add( &writer.partition_values, @@ -616,7 +619,7 @@ mod tests { .with_location(&path) .with_table_name("test-table") .with_comment("A table for running tests") - .with_columns(schema.fields().clone()) + .with_columns(schema.fields().cloned()) .await .unwrap(); table.load().await.expect("Failed to load table"); diff --git a/crates/core/src/writer/record_batch.rs b/crates/core/src/writer/record_batch.rs index c21435dd14..10ba52ae62 100644 --- a/crates/core/src/writer/record_batch.rs +++ b/crates/core/src/writer/record_batch.rs @@ -7,13 +7,13 @@ use std::{collections::HashMap, sync::Arc}; -use arrow::array::{new_null_array, Array, UInt32Array}; -use arrow::compute::{partition, take}; -use arrow::record_batch::RecordBatch; -use arrow_array::ArrayRef; +use arrow_array::{new_null_array, Array, ArrayRef, RecordBatch, UInt32Array}; +use arrow_ord::partition::partition; use arrow_row::{RowConverter, SortField}; use arrow_schema::{ArrowError, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef}; +use arrow_select::take::take; use bytes::Bytes; +use delta_kernel::expressions::Scalar; use indexmap::IndexMap; use object_store::{path::Path, ObjectStore}; use parquet::{arrow::ArrowWriter, errors::ParquetError}; @@ -28,8 +28,8 @@ use super::utils::{ }; use super::{DeltaWriter, DeltaWriterError, WriteMode}; use crate::errors::DeltaTableError; -use crate::kernel::{Action, Add, PartitionsExt, Scalar, StructType}; -use crate::operations::cast::merge_schema; +use crate::kernel::{scalars::ScalarExt, Action, Add, PartitionsExt, StructType}; +use crate::operations::cast::merge_schema::merge_arrow_schema; use 
crate::storage::ObjectStoreRetryExt; use crate::table::builder::DeltaTableBuilder; use crate::table::config::DEFAULT_NUM_INDEX_COLS; @@ -224,7 +224,9 @@ impl DeltaWriter for RecordBatchWriter { let path = next_data_path(&prefix, 0, &uuid, &writer.writer_properties); let obj_bytes = Bytes::from(writer.buffer.to_vec()); let file_size = obj_bytes.len() as i64; - self.storage.put_with_retries(&path, obj_bytes, 15).await?; + self.storage + .put_with_retries(&path, obj_bytes.into(), 15) + .await?; actions.push(create_add( &writer.partition_values, @@ -319,8 +321,11 @@ impl PartitionWriter { WriteMode::MergeSchema => { debug!("The writer and record batch schemas do not match, merging"); - let merged = - merge_schema(self.arrow_schema.clone(), record_batch.schema().clone())?; + let merged = merge_arrow_schema( + self.arrow_schema.clone(), + record_batch.schema().clone(), + true, + )?; self.arrow_schema = merged; let mut cols = vec![]; @@ -539,7 +544,7 @@ mod tests { let table = DeltaOps(table) .create() .with_partition_columns(partition_cols.to_vec()) - .with_columns(delta_schema.fields().clone()) + .with_columns(delta_schema.fields().cloned()) .await .unwrap(); @@ -659,7 +664,7 @@ mod tests { .with_location(table_path.to_str().unwrap()) .with_table_name("test-table") .with_comment("A table for running tests") - .with_columns(table_schema.fields().clone()) + .with_columns(table_schema.fields().cloned()) .with_partition_columns(partition_cols) .await .unwrap(); @@ -735,7 +740,7 @@ mod tests { .with_location(table_path.to_str().unwrap()) .with_table_name("test-table") .with_comment("A table for running tests") - .with_columns(table_schema.fields().clone()) + .with_columns(table_schema.fields().cloned()) .await .unwrap(); table.load().await.expect("Failed to load table"); @@ -779,8 +784,7 @@ mod tests { let new_schema = table.metadata().unwrap().schema().unwrap(); let expected_columns = vec!["id", "value", "modified", "vid", "name"]; - let found_columns: Vec<&String> = - new_schema.fields().iter().map(|f| f.name()).collect(); + let found_columns: Vec<&String> = new_schema.fields().map(|f| f.name()).collect(); assert_eq!( expected_columns, found_columns, "The new table schema does not contain all evolved columns as expected" @@ -797,7 +801,7 @@ mod tests { .with_location(table_path.to_str().unwrap()) .with_table_name("test-table") .with_comment("A table for running tests") - .with_columns(table_schema.fields().clone()) + .with_columns(table_schema.fields().cloned()) .with_partition_columns(["id"]) .await .unwrap(); @@ -928,7 +932,7 @@ mod tests { .with_location(table_path.to_str().unwrap()) .with_table_name("test-table") .with_comment("A table for running tests") - .with_columns(table_schema.fields().clone()) + .with_columns(table_schema.fields().cloned()) .await .unwrap(); table.load().await.expect("Failed to load table"); diff --git a/crates/core/src/writer/stats.rs b/crates/core/src/writer/stats.rs index 0cea01ee6a..e4b93a54f5 100644 --- a/crates/core/src/writer/stats.rs +++ b/crates/core/src/writer/stats.rs @@ -3,7 +3,9 @@ use std::sync::Arc; use std::time::{SystemTime, UNIX_EPOCH}; use std::{collections::HashMap, ops::AddAssign}; +use delta_kernel::expressions::Scalar; use indexmap::IndexMap; +use itertools::Itertools; use parquet::file::metadata::ParquetMetaData; use parquet::format::FileMetaData; use parquet::schema::types::{ColumnDescriptor, SchemaDescriptor}; @@ -14,7 +16,7 @@ use parquet::{ }; use super::*; -use crate::kernel::{Add, Scalar}; +use crate::kernel::{scalars::ScalarExt, Add}; use 
crate::protocol::{ColumnValueStat, Stats}; /// Creates an [`Add`] log action struct. @@ -129,8 +131,29 @@ fn stats_from_metadata( let mut min_values: HashMap = HashMap::new(); let mut max_values: HashMap = HashMap::new(); let mut null_count: HashMap = HashMap::new(); + let dialect = sqlparser::dialect::GenericDialect {}; let idx_to_iterate = if let Some(stats_cols) = stats_columns { + let stats_cols = stats_cols + .into_iter() + .map(|v| { + match sqlparser::parser::Parser::new(&dialect) + .try_with_sql(v) + .map_err(|e| DeltaTableError::generic(e.to_string()))? + .parse_multipart_identifier() + { + Ok(parts) => Ok(parts.into_iter().map(|v| v.value).join(".")), + Err(e) => { + return Err(DeltaWriterError::DeltaTable( + DeltaTableError::GenericError { + source: Box::new(e), + }, + )) + } + } + }) + .collect::, DeltaWriterError>>()?; + schema_descriptor .columns() .iter() diff --git a/crates/core/src/writer/test_utils.rs b/crates/core/src/writer/test_utils.rs index 093ad7cbd0..be0dfebb66 100644 --- a/crates/core/src/writer/test_utils.rs +++ b/crates/core/src/writer/test_utils.rs @@ -3,14 +3,14 @@ use std::collections::HashMap; use std::sync::Arc; -use arrow::compute::take; use arrow_array::{Int32Array, Int64Array, RecordBatch, StringArray, StructArray, UInt32Array}; use arrow_schema::{DataType, Field, Schema as ArrowSchema}; +use arrow_select::take::take; use crate::kernel::{DataType as DeltaDataType, Metadata, PrimitiveType, StructField, StructType}; use crate::operations::create::CreateBuilder; use crate::operations::DeltaOps; -use crate::{DeltaConfigKey, DeltaTable, DeltaTableBuilder}; +use crate::{DeltaTable, DeltaTableBuilder, TableProperty}; pub type TestResult = Result<(), Box>; @@ -270,13 +270,13 @@ pub fn get_delta_schema_with_nested_struct() -> StructType { } pub async fn setup_table_with_configuration( - key: DeltaConfigKey, + key: TableProperty, value: Option>, ) -> DeltaTable { let table_schema = get_delta_schema(); DeltaOps::new_in_memory() .create() - .with_columns(table_schema.fields().clone()) + .with_columns(table_schema.fields().cloned()) .with_configuration_property(key, value) .await .expect("Failed to create table") @@ -299,7 +299,7 @@ pub async fn create_initialized_table(partition_cols: &[String]) -> DeltaTable { .with_location(table_path.to_str().unwrap()) .with_table_name("test-table") .with_comment("A table for running tests") - .with_columns(table_schema.fields().clone()) + .with_columns(table_schema.fields().cloned()) .with_partition_columns(partition_cols) .await .unwrap() diff --git a/crates/core/src/writer/utils.rs b/crates/core/src/writer/utils.rs index 3c95942993..864476684a 100644 --- a/crates/core/src/writer/utils.rs +++ b/crates/core/src/writer/utils.rs @@ -4,9 +4,9 @@ use std::io::Write; use std::sync::Arc; -use arrow::datatypes::{Schema as ArrowSchema, SchemaRef as ArrowSchemaRef}; -use arrow::json::ReaderBuilder; -use arrow::record_batch::*; +use arrow_array::RecordBatch; +use arrow_json::ReaderBuilder; +use arrow_schema::{Schema as ArrowSchema, SchemaRef as ArrowSchemaRef}; use object_store::path::Path; use parking_lot::RwLock; use parquet::basic::Compression; diff --git a/crates/core/tests/checkpoint_writer.rs b/crates/core/tests/checkpoint_writer.rs index 696e379569..1be439f9e5 100644 --- a/crates/core/tests/checkpoint_writer.rs +++ b/crates/core/tests/checkpoint_writer.rs @@ -87,7 +87,7 @@ mod delete_expired_delta_log_in_checkpoint { use ::object_store::path::Path as ObjectStorePath; use chrono::Utc; - use 
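The stats hunk above normalizes `dataSkippingStatsColumns` entries by parsing them as multipart SQL identifiers, so quoted dotted names are handled consistently. A standalone sketch of that normalization (the helper name is hypothetical):

```rust
use itertools::Itertools;
use sqlparser::dialect::GenericDialect;
use sqlparser::parser::{Parser, ParserError};

// Standalone sketch: split a possibly quoted, dotted identifier into parts
// and re-join them with '.' as the stats writer does above.
fn normalize_stats_column(col: &str) -> Result<String, ParserError> {
    let dialect = GenericDialect {};
    let parts = Parser::new(&dialect)
        .try_with_sql(col)?
        .parse_multipart_identifier()?;
    Ok(parts.into_iter().map(|part| part.value).join("."))
}
```

For instance, `normalize_stats_column(r#""some.struct".field"#)` resolves to `some.struct.field`.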
deltalake_core::table::config::DeltaConfigKey; + use deltalake_core::table::config::TableProperty; use deltalake_core::*; use maplit::hashmap; @@ -96,8 +96,8 @@ mod delete_expired_delta_log_in_checkpoint { let mut table = fs_common::create_table( "../test/tests/data/checkpoints_with_expired_logs/expired", Some(hashmap! { - DeltaConfigKey::LogRetentionDuration.as_ref().into() => Some("interval 10 minute".to_string()), - DeltaConfigKey::EnableExpiredLogCleanup.as_ref().into() => Some("true".to_string()) + TableProperty::LogRetentionDuration.as_ref().into() => Some("interval 10 minute".to_string()), + TableProperty::EnableExpiredLogCleanup.as_ref().into() => Some("true".to_string()) }), ) .await; @@ -160,8 +160,8 @@ mod delete_expired_delta_log_in_checkpoint { let mut table = fs_common::create_table( "../test/tests/data/checkpoints_with_expired_logs/not_delete_expired", Some(hashmap! { - DeltaConfigKey::LogRetentionDuration.as_ref().into() => Some("interval 1 second".to_string()), - DeltaConfigKey::EnableExpiredLogCleanup.as_ref().into() => Some("false".to_string()) + TableProperty::LogRetentionDuration.as_ref().into() => Some("interval 1 second".to_string()), + TableProperty::EnableExpiredLogCleanup.as_ref().into() => Some("false".to_string()) }), ) .await; @@ -208,7 +208,7 @@ mod checkpoints_with_tombstones { use ::object_store::path::Path as ObjectStorePath; use chrono::Utc; use deltalake_core::kernel::*; - use deltalake_core::table::config::DeltaConfigKey; + use deltalake_core::table::config::TableProperty; use deltalake_core::*; use maplit::hashmap; use parquet::file::reader::{FileReader, SerializedFileReader}; @@ -235,7 +235,7 @@ mod checkpoints_with_tombstones { #[ignore] async fn test_expired_tombstones() { let mut table = fs_common::create_table("../test/tests/data/checkpoints_tombstones/expired", Some(hashmap! 
{ - DeltaConfigKey::DeletedFileRetentionDuration.as_ref().into() => Some("interval 1 minute".to_string()) + TableProperty::DeletedFileRetentionDuration.as_ref().into() => Some("interval 1 minute".to_string()) })).await; let a1 = fs_common::add(3 * 60 * 1000); // 3 mins ago, diff --git a/crates/core/tests/command_merge.rs b/crates/core/tests/command_merge.rs index 59a941a24f..783c858750 100644 --- a/crates/core/tests/command_merge.rs +++ b/crates/core/tests/command_merge.rs @@ -19,7 +19,7 @@ async fn create_table(table_uri: &str, partition: Option>) -> DeltaTab let ops = DeltaOps::try_from_uri(table_uri).await.unwrap(); let table = ops .create() - .with_columns(table_schema.fields().clone()) + .with_columns(table_schema.fields().cloned()) .with_partition_columns(partition.unwrap_or_default()) .await .expect("Failed to create table"); @@ -138,17 +138,17 @@ async fn merge( #[tokio::test] async fn test_merge_concurrent_conflict() { - // No partition key or filter predicate -> Commit conflict + // Overlapping id ranges -> Commit conflict let tmp_dir = tempfile::tempdir().unwrap(); let table_uri = tmp_dir.path().to_str().to_owned().unwrap(); let table_ref1 = create_table(table_uri, Some(vec!["event_date"])).await; let table_ref2 = open_table(table_uri).await.unwrap(); - let (df1, df2) = create_test_data(); + let (df1, _df2) = create_test_data(); let expr = col("target.id").eq(col("source.id")); - let (_table_ref1, _metrics) = merge(table_ref1, df1, expr.clone()).await.unwrap(); - let result = merge(table_ref2, df2, expr).await; + let (_table_ref1, _metrics) = merge(table_ref1, df1.clone(), expr.clone()).await.unwrap(); + let result = merge(table_ref2, df1, expr).await; assert!(matches!( result.as_ref().unwrap_err(), @@ -159,6 +159,23 @@ async fn test_merge_concurrent_conflict() { } } +#[tokio::test] +async fn test_merge_different_range() { + // No overlapping id ranges -> No conflict + let tmp_dir = tempfile::tempdir().unwrap(); + let table_uri = tmp_dir.path().to_str().to_owned().unwrap(); + + let table_ref1 = create_table(table_uri, Some(vec!["event_date"])).await; + let table_ref2 = open_table(table_uri).await.unwrap(); + let (df1, df2) = create_test_data(); + + let expr = col("target.id").eq(col("source.id")); + let (_table_ref1, _metrics) = merge(table_ref1, df1, expr.clone()).await.unwrap(); + let result = merge(table_ref2, df2, expr).await; + + assert!(result.is_ok()); +} + #[tokio::test] async fn test_merge_concurrent_different_partition() { // partition key in predicate -> Successful merge @@ -175,9 +192,7 @@ async fn test_merge_concurrent_different_partition() { let (_table_ref1, _metrics) = merge(table_ref1, df1, expr.clone()).await.unwrap(); let result = merge(table_ref2, df2, expr).await; - // TODO: Currently it throws a Version mismatch error, but the merge commit was successfully - // This bug needs to be fixed, see pull request #2280 - assert!(result.as_ref().is_ok()); + assert!(result.is_ok()); } #[tokio::test] diff --git a/crates/core/tests/command_optimize.rs b/crates/core/tests/command_optimize.rs index 4f26c55fd4..13cbd168e4 100644 --- a/crates/core/tests/command_optimize.rs +++ b/crates/core/tests/command_optimize.rs @@ -249,7 +249,7 @@ async fn test_optimize_with_partitions() -> Result<(), Box> { let partition_values = partition_adds[0].partition_values()?; assert_eq!( partition_values.get("date"), - Some(&deltalake_core::kernel::Scalar::String( + Some(&delta_kernel::expressions::Scalar::String( "2022-05-22".to_string() )) ); diff --git 
a/crates/core/tests/command_restore.rs b/crates/core/tests/command_restore.rs index aa5b598347..5013556ab8 100644 --- a/crates/core/tests/command_restore.rs +++ b/crates/core/tests/command_restore.rs @@ -6,6 +6,7 @@ use deltalake_core::kernel::{DataType, PrimitiveType, StructField}; use deltalake_core::protocol::SaveMode; use deltalake_core::storage::commit_uri_from_version; use deltalake_core::{DeltaOps, DeltaTable}; +use itertools::Itertools; use rand::Rng; use std::error::Error; use std::fs; @@ -103,10 +104,9 @@ async fn test_restore_by_version() -> Result<(), Box> { let table_uri = context.tmp_dir.path().to_str().to_owned().unwrap(); let mut table = DeltaOps::try_from_uri(table_uri).await?; table.0.load_version(1).await?; - assert_eq!( - table.0.snapshot()?.file_actions()?, - result.0.snapshot()?.file_actions()? - ); + let curr_files = table.0.snapshot()?.file_paths_iter().collect_vec(); + let result_files = result.0.snapshot()?.file_paths_iter().collect_vec(); + assert_eq!(curr_files, result_files); let result = DeltaOps(result.0) .restore() diff --git a/crates/core/tests/fs_common/mod.rs b/crates/core/tests/fs_common/mod.rs index 3ef7c82edf..13683b408a 100644 --- a/crates/core/tests/fs_common/mod.rs +++ b/crates/core/tests/fs_common/mod.rs @@ -8,7 +8,9 @@ use deltalake_core::protocol::{DeltaOperation, SaveMode}; use deltalake_core::storage::{GetResult, ObjectStoreResult}; use deltalake_core::DeltaTable; use object_store::path::Path as StorePath; -use object_store::{ObjectStore, PutOptions, PutResult}; +use object_store::{ + MultipartUpload, ObjectStore, PutMultipartOpts, PutOptions, PutPayload, PutResult, +}; use serde_json::Value; use std::collections::HashMap; use std::fs; @@ -55,7 +57,7 @@ pub async fn create_test_table( .with_location(path) .with_table_name("test-table") .with_comment("A table for running tests") - .with_columns(schema.fields().clone()) + .with_columns(schema.fields().cloned()) .with_partition_columns(partition_columns) .with_configuration(config) .await @@ -158,14 +160,14 @@ impl SlowStore { #[async_trait::async_trait] impl ObjectStore for SlowStore { /// Save the provided bytes to the specified location. 
- async fn put(&self, location: &StorePath, bytes: bytes::Bytes) -> ObjectStoreResult { + async fn put(&self, location: &StorePath, bytes: PutPayload) -> ObjectStoreResult { self.inner.put(location, bytes).await } async fn put_opts( &self, location: &StorePath, - bytes: bytes::Bytes, + bytes: PutPayload, options: PutOptions, ) -> ObjectStoreResult { self.inner.put_opts(location, bytes, options).await @@ -272,18 +274,15 @@ impl ObjectStore for SlowStore { async fn put_multipart( &self, location: &StorePath, - ) -> ObjectStoreResult<( - object_store::MultipartId, - Box, - )> { + ) -> ObjectStoreResult> { self.inner.put_multipart(location).await } - async fn abort_multipart( + async fn put_multipart_opts( &self, location: &StorePath, - multipart_id: &object_store::MultipartId, - ) -> ObjectStoreResult<()> { - self.inner.abort_multipart(location, multipart_id).await + options: PutMultipartOpts, + ) -> ObjectStoreResult> { + self.inner.put_multipart_opts(location, options).await } } diff --git a/crates/core/tests/integration_checkpoint.rs b/crates/core/tests/integration_checkpoint.rs index ce4525ba83..e90d4ec0cc 100644 --- a/crates/core/tests/integration_checkpoint.rs +++ b/crates/core/tests/integration_checkpoint.rs @@ -1,5 +1,3 @@ -#![cfg(feature = "integration_test")] - use chrono::Utc; use deltalake_core::checkpoints::{cleanup_expired_logs_for, create_checkpoint}; use deltalake_core::kernel::{DataType, PrimitiveType}; @@ -14,6 +12,8 @@ use tokio::time::sleep; #[tokio::test] #[serial] +// This test requires refactoring and a revisit +#[ignore] async fn cleanup_metadata_fs_test() -> TestResult { let storage = Box::new(LocalStorageIntegration::default()); let context = IntegrationContext::new(storage)?; @@ -34,19 +34,19 @@ async fn cleanup_metadata_test(context: &IntegrationContext) -> TestResult { // we don't need to actually populate files with content as cleanup works only with file's metadata object_store - .put(&log_path(0), bytes::Bytes::from("foo")) + .put(&log_path(0), bytes::Bytes::from("foo").into()) .await?; // since we cannot alter s3 object metadata, we mimic it with pauses // also we forced to use 2 seconds since Last-Modified is stored in seconds std::thread::sleep(Duration::from_secs(2)); object_store - .put(&log_path(1), bytes::Bytes::from("foo")) + .put(&log_path(1), bytes::Bytes::from("foo").into()) .await?; std::thread::sleep(Duration::from_secs(3)); object_store - .put(&log_path(2), bytes::Bytes::from("foo")) + .put(&log_path(2), bytes::Bytes::from("foo").into()) .await?; let v0time = object_store.head(&log_path(0)).await?.last_modified; diff --git a/crates/core/tests/integration_datafusion.rs b/crates/core/tests/integration_datafusion.rs index 64d80e3bce..3a55c63bb5 100644 --- a/crates/core/tests/integration_datafusion.rs +++ b/crates/core/tests/integration_datafusion.rs @@ -1,14 +1,10 @@ #![cfg(feature = "datafusion")] - -use arrow::array::Int64Array; -use deltalake_test::datafusion::*; -use deltalake_test::utils::*; -use serial_test::serial; - use std::collections::{HashMap, HashSet}; +use std::error::Error; use std::path::PathBuf; use std::sync::Arc; +use arrow::array::Int64Array; use arrow::array::*; use arrow::record_batch::RecordBatch; use arrow_schema::{ @@ -28,8 +24,6 @@ use datafusion_expr::Expr; use datafusion_proto::bytes::{ physical_plan_from_bytes_with_extension_codec, physical_plan_to_bytes_with_extension_codec, }; -use url::Url; - use deltalake_core::delta_datafusion::{DeltaPhysicalCodec, DeltaScan}; use deltalake_core::kernel::{DataType, MapType, 
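With the `object_store` upgrade above, multipart writes go through a `MultipartUpload` handle rather than an `AsyncWrite` plus a separate abort call. A hedged sketch of the new flow against an in-memory store (buffer contents and path are arbitrary):

```rust
use bytes::Bytes;
use object_store::{memory::InMemory, path::Path, MultipartUpload, ObjectStore};

// Sketch only: start a multipart upload, stream parts, then complete it.
async fn multipart_example() -> object_store::Result<()> {
    let store = InMemory::new();
    let mut upload = store.put_multipart(&Path::from("big/file.bin")).await?;
    upload.put_part(Bytes::from_static(b"part-1").into()).await?;
    upload.put_part(Bytes::from_static(b"part-2").into()).await?;
    upload.complete().await?;
    Ok(())
}
```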
PrimitiveType, StructField, StructType}; use deltalake_core::logstore::logstore_for; @@ -41,7 +35,10 @@ use deltalake_core::{ operations::{write::WriteBuilder, DeltaOps}, DeltaTable, DeltaTableError, }; -use std::error::Error; +use deltalake_test::datafusion::*; +use deltalake_test::utils::*; +use serial_test::serial; +use url::Url; mod local { use datafusion::common::stats::Precision; @@ -68,6 +65,8 @@ mod local { #[derive(Debug, Default)] pub struct ExecutionMetricsCollector { scanned_files: HashSet