From 5b7dbe6827ca3b8114766378f1da0e601cbc08a9 Mon Sep 17 00:00:00 2001
From: Avril Aysha <68642378+avriiil@users.noreply.github.com>
Date: Tue, 24 Sep 2024 12:18:18 +0100
Subject: [PATCH 1/9] fix typo
---
docs/integrations/object-storage/s3.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/integrations/object-storage/s3.md b/docs/integrations/object-storage/s3.md
index 5b2034827f..1989494978 100644
--- a/docs/integrations/object-storage/s3.md
+++ b/docs/integrations/object-storage/s3.md
@@ -2,7 +2,7 @@
`delta-rs` offers native support for using AWS S3 as an objet storage backend.
-You don’t need to install any extra dependencies to red/write Delta tables to S3 with engines that use `delta-rs`. You do need to configure your AWS access credentials correctly.
+You don’t need to install any extra dependencies to read/write Delta tables to S3 with engines that use `delta-rs`. You do need to configure your AWS access credentials correctly.
## Note for boto3 users
From 7783f66558c518d5e7b3434fa5de2d607591773c Mon Sep 17 00:00:00 2001
From: Avril Aysha <68642378+avriiil@users.noreply.github.com>
Date: Tue, 24 Sep 2024 12:21:07 +0100
Subject: [PATCH 2/9] typo fix
---
docs/integrations/object-storage/s3.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/integrations/object-storage/s3.md b/docs/integrations/object-storage/s3.md
index 1989494978..a7965cb2a5 100644
--- a/docs/integrations/object-storage/s3.md
+++ b/docs/integrations/object-storage/s3.md
@@ -1,6 +1,6 @@
# AWS S3 Storage Backend
-`delta-rs` offers native support for using AWS S3 as an objet storage backend.
+`delta-rs` offers native support for using AWS S3 as an object storage backend.
You don’t need to install any extra dependencies to read/write Delta tables to S3 with engines that use `delta-rs`. You do need to configure your AWS access credentials correctly.
From 7f7e3cddac750197ab0d7ded5d9de926046011f5 Mon Sep 17 00:00:00 2001
From: Avril Aysha <68642378+avriiil@users.noreply.github.com>
Date: Tue, 24 Sep 2024 12:20:47 +0100
Subject: [PATCH 3/9] create gcs docs
---
docs/integrations/object-storage/gcs.md | 87 +++++++++++++++++++++++++
1 file changed, 87 insertions(+)
create mode 100644 docs/integrations/object-storage/gcs.md
diff --git a/docs/integrations/object-storage/gcs.md b/docs/integrations/object-storage/gcs.md
new file mode 100644
index 0000000000..c5592ccc5c
--- /dev/null
+++ b/docs/integrations/object-storage/gcs.md
@@ -0,0 +1,87 @@
+# GCS Storage Backend
+
+`delta-rs` offers native support for using Google Cloud Storage (GCS) as an object storage backend.
+
+You don’t need to install any extra dependencies to red/write Delta tables to S3 with engines that use `delta-rs`. You do need to configure your AWS access credentials correctly.
+
+## Note for boto3 users
+
+Many Python engines use [boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) to connect to AWS. This library supports reading credentials automatically from your local `.aws/config` or `.aws/creds` file.
+
+For example, if you’re running locally with the proper credentials in your local `.aws/config` or `.aws/creds` file then you can write a Parquet file to S3 like this with pandas:
+
+```python
+ import pandas as pd
+ df = pd.DataFrame({'x': [1, 2, 3]})
+ df.to_parquet("s3://avriiil/parquet-test-pandas")
+```
+
+The `delta-rs` writer does not use `boto3` and therefore does not support taking credentials from your `.aws/config` or `.aws/creds` file. If you’re used to working with writers from Python engines like Polars, pandas or Dask, this may mean a small change to your workflow.
+
+## Passing AWS Credentials
+
+You can pass your AWS credentials explicitly by using:
+
+- the `storage_options `kwarg
+- Environment variables
+- EC2 metadata if using EC2 instances
+- AWS Profiles
+
+## Example
+
+Let's work through an example with Polars. The same logic applies to other Python engines like Pandas, Daft, Dask, etc.
+
+Follow the steps below to use Delta Lake on S3 with Polars:
+
+1. Install Polars and deltalake. For example, using:
+
+ `pip install polars deltalake`
+
+2. Create a dataframe with some toy data.
+
+ `df = pl.DataFrame({'x': [1, 2, 3]})`
+
+3. Set your `storage_options` correctly.
+
+```python
+storage_options = {
+ "AWS_REGION":,
+ 'AWS_ACCESS_KEY_ID': ,
+ 'AWS_SECRET_ACCESS_KEY': ,
+ 'AWS_S3_LOCKING_PROVIDER': 'dynamodb',
+ 'DELTA_DYNAMO_TABLE_NAME': 'delta_log',
+}
+```
+
+4. Write data to Delta table using the `storage_options` kwarg.
+
+ ```python
+ df.write_delta(
+ "s3://bucket/delta_table",
+ storage_options=storage_options,
+ )
+ ```
+
+## Delta Lake on AWS S3: Safe Concurrent Writes
+
+You need a locking provider to ensure safe concurrent writes when writing Delta tables to AWS S3. This is because AWS S3 does not guarantee mutual exclusion.
+
+A locking provider guarantees that only one writer is able to create the same file. This prevents corrupted or conflicting data.
+
+`delta-rs` uses DynamoDB to guarantee safe concurrent writes.
+
+Run the code below in your terminal to create a DynamoDB table that will act as your locking provider.
+
+```
+ aws dynamodb create-table \
+ --table-name delta_log \
+ --attribute-definitions AttributeName=tablePath,AttributeType=S AttributeName=fileName,AttributeType=S \
+ --key-schema AttributeName=tablePath,KeyType=HASH AttributeName=fileName,KeyType=RANGE \
+ --provisioned-throughput ReadCapacityUnits=5,WriteCapacityUnits=5
+```
+
+If for some reason you don't want to use DynamoDB as your locking mechanism you can choose to set the `AWS_S3_ALLOW_UNSAFE_RENAME` variable to `true` in order to enable S3 unsafe writes.
+
+Read more in the [Usage](../../usage/writing/writing-to-s3-with-locking-provider.md) section.
+
+## Delta Lake on GCS: Required permissions
From 96dc0a6682e88852f6d14cbad1a085f37787cd33 Mon Sep 17 00:00:00 2001
From: Avril Aysha <68642378+avriiil@users.noreply.github.com>
Date: Tue, 24 Sep 2024 15:33:53 +0100
Subject: [PATCH 4/9] update docs
---
docs/integrations/object-storage/gcs.md | 91 +++++++------------------
1 file changed, 24 insertions(+), 67 deletions(-)
diff --git a/docs/integrations/object-storage/gcs.md b/docs/integrations/object-storage/gcs.md
index c5592ccc5c..aa8682d3cc 100644
--- a/docs/integrations/object-storage/gcs.md
+++ b/docs/integrations/object-storage/gcs.md
@@ -2,86 +2,43 @@
`delta-rs` offers native support for using Google Cloud Storage (GCS) as an object storage backend.
-You don’t need to install any extra dependencies to red/write Delta tables to S3 with engines that use `delta-rs`. You do need to configure your AWS access credentials correctly.
+You don’t need to install any extra dependencies to read/write Delta tables to GCS with engines that use `delta-rs`. You do need to configure your GCS access credentials correctly.
-## Note for boto3 users
+## Using Application Default Credentials
-Many Python engines use [boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) to connect to AWS. This library supports reading credentials automatically from your local `.aws/config` or `.aws/creds` file.
+Application Default Credentials (ADC) is a strategy used by GCS to automatically find credentials based on the application environment.
-For example, if you’re running locally with the proper credentials in your local `.aws/config` or `.aws/creds` file then you can write a Parquet file to S3 like this with pandas:
+If you are working from your local machine and have ADC set up, you can read/write Delta tables from GCS directly, without having to pass your credentials explicitly.
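+
+For instance, once ADC is configured (e.g. with `gcloud auth application-default login`), a minimal sketch of reading a Delta table from a hypothetical bucket could look like this:
+
+```python
+from deltalake import DeltaTable
+
+# ADC supplies the credentials; no storage_options needed
+dt = DeltaTable("gs://bucket/delta-table")
+df = dt.to_pandas()
+```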
-```python
- import pandas as pd
- df = pd.DataFrame({'x': [1, 2, 3]})
- df.to_parquet("s3://avriiil/parquet-test-pandas")
-```
-
-The `delta-rs` writer does not use `boto3` and therefore does not support taking credentials from your `.aws/config` or `.aws/creds` file. If you’re used to working with writers from Python engines like Polars, pandas or Dask, this may mean a small change to your workflow.
-
-## Passing AWS Credentials
-
-You can pass your AWS credentials explicitly by using:
-
-- the `storage_options `kwarg
-- Environment variables
-- EC2 metadata if using EC2 instances
-- AWS Profiles
-
-## Example
-
-Let's work through an example with Polars. The same logic applies to other Python engines like Pandas, Daft, Dask, etc.
-
-Follow the steps below to use Delta Lake on S3 with Polars:
-
-1. Install Polars and deltalake. For example, using:
-
- `pip install polars deltalake`
+## Example: Write Delta tables to GCS with Polars
-2. Create a dataframe with some toy data.
-
- `df = pl.DataFrame({'x': [1, 2, 3]})`
-
-3. Set your `storage_options` correctly.
+Using Polars, you can write a Delta table to GCS like this:
```python
-storage_options = {
- "AWS_REGION":,
- 'AWS_ACCESS_KEY_ID': ,
- 'AWS_SECRET_ACCESS_KEY': ,
- 'AWS_S3_LOCKING_PROVIDER': 'dynamodb',
- 'DELTA_DYNAMO_TABLE_NAME': 'delta_log',
-}
-```
-
-4. Write data to Delta table using the `storage_options` kwarg.
-
- ```python
- df.write_delta(
- "s3://bucket/delta_table",
- storage_options=storage_options,
- )
- ```
+# create a toy dataframe
+import polars as pl
+df = pl.DataFrame({"foo": [1, 2, 3, 4, 5]})
-## Delta Lake on AWS S3: Safe Concurrent Writes
+# define path
+table_path = "gs://bucket/delta-table"
-You need a locking provider to ensure safe concurrent writes when writing Delta tables to AWS S3. This is because AWS S3 does not guarantee mutual exclusion.
+# write Delta to GCS
+df.write_delta(table_path)
+```
-A locking provider guarantees that only one writer is able to create the same file. This prevents corrupted or conflicting data.
+## Passing GCS Credentials Explicitly
-`delta-rs` uses DynamoDB to guarantee safe concurrent writes.
+Alternatively, you can pass GCS credentials to your query engine explicitly.
-Run the code below in your terminal to create a DynamoDB table that will act as your locking provider.
+For Polars, you would do this using the `storage_options` keyword. This will forward your credentials to the `object_store` library that Polars uses under the hood. Read the [Polars documentation](https://docs.pola.rs/api/python/stable/reference/api/polars.DataFrame.write_delta.html) and the [`object_store` documentation](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html#variants) for more information.
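+
+As a minimal sketch, assuming you authenticate with a service account key file (the key names follow the `object_store` `GoogleConfigKey` variants; the path is a placeholder), this could look like:
+
+```python
+import polars as pl
+
+df = pl.DataFrame({"foo": [1, 2, 3, 4, 5]})
+
+storage_options = {
+    # hypothetical path to a service account key file
+    "google_service_account_path": "/path/to/service-account.json",
+}
+
+df.write_delta(
+    "gs://bucket/delta-table",
+    storage_options=storage_options,
+)
+```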
-```
- aws dynamodb create-table \
- --table-name delta_log \
- --attribute-definitions AttributeName=tablePath,AttributeType=S AttributeName=fileName,AttributeType=S \
- --key-schema AttributeName=tablePath,KeyType=HASH AttributeName=fileName,KeyType=RANGE \
- --provisioned-throughput ReadCapacityUnits=5,WriteCapacityUnits=5
-```
+## Delta Lake on GCS: Required permissions
-If for some reason you don't want to use DynamoDB as your locking mechanism you can choose to set the `AWS_S3_ALLOW_UNSAFE_RENAME` variable to `true` in order to enable S3 unsafe writes.
+You will need the following permissions in your GCS account:
-Read more in the [Usage](../../usage/writing/writing-to-s3-with-locking-provider.md) section.
+- `storage.objects.create`
+- `storage.objects.delete` (only required for uploads that overwrite an existing object)
+- `storage.objects.get` (only required if you plan on using the Google Cloud CLI)
+- `storage.objects.list` (only required if you plan on using the Google Cloud CLI)
-## Delta Lake on GCS: Required permissions
+For more information, see the [GCP documentation](https://cloud.google.com/storage/docs/uploading-objects).
From 5d10cbe786211bfb18fb31f6a49a3838b179e3b4 Mon Sep 17 00:00:00 2001
From: Avril Aysha <68642378+avriiil@users.noreply.github.com>
Date: Tue, 24 Sep 2024 15:39:44 +0100
Subject: [PATCH 5/9] fix typos
---
docs/integrations/object-storage/s3-like.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/docs/integrations/object-storage/s3-like.md b/docs/integrations/object-storage/s3-like.md
index 4d32f7c41b..40b2f6e076 100644
--- a/docs/integrations/object-storage/s3-like.md
+++ b/docs/integrations/object-storage/s3-like.md
@@ -1,8 +1,8 @@
# CloudFlare R2 & Minio
-`delta-rs` offers native support for using Cloudflare R2 and Minio's as storage backend. R2 and Minio support conditional puts, however we have to pass this flag into the storage options. See the example blow
+`delta-rs` offers native support for using Cloudflare R2 and Minio as storage backends. R2 and Minio support conditional puts, but we have to pass this flag via the storage options. See the example below.
-You don’t need to install any extra dependencies to red/write Delta tables to S3 with engines that use `delta-rs`. You do need to configure your AWS access credentials correctly.
+You don’t need to install any extra dependencies to read/write Delta tables to S3 with engines that use `delta-rs`. You do need to configure your AWS access credentials correctly.
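+
+As a rough sketch (endpoint, bucket, and credentials are placeholders), passing the conditional-put flag through the storage options when writing with Polars could look like this:
+
+```python
+import polars as pl
+
+df = pl.DataFrame({"foo": [1, 2, 3]})
+
+storage_options = {
+    "AWS_ACCESS_KEY_ID": "<access-key>",
+    "AWS_SECRET_ACCESS_KEY": "<secret-key>",
+    "AWS_ENDPOINT_URL": "http://localhost:9000",  # hypothetical MinIO endpoint
+    "AWS_ALLOW_HTTP": "true",
+    # enable conditional puts for safe concurrent writes
+    "aws_conditional_put": "etag",
+}
+
+df.write_delta("s3://bucket/delta-table", storage_options=storage_options)
+```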
## Passing S3 Credentials
From 22ff7a1c6a839f0c9ef28c1f205ecd25bf8e41b9 Mon Sep 17 00:00:00 2001
From: Avril Aysha <68642378+avriiil@users.noreply.github.com>
Date: Wed, 25 Sep 2024 12:14:41 +0100
Subject: [PATCH 6/9] add adls docs
---
docs/integrations/object-storage/adls.md | 57 ++++++++++++++++++++++++
1 file changed, 57 insertions(+)
create mode 100644 docs/integrations/object-storage/adls.md
diff --git a/docs/integrations/object-storage/adls.md b/docs/integrations/object-storage/adls.md
new file mode 100644
index 0000000000..2867c07da3
--- /dev/null
+++ b/docs/integrations/object-storage/adls.md
@@ -0,0 +1,57 @@
+# Azure ADLS Storage Backend
+
+`delta-rs` offers native support for using Microsoft Azure Data Lake Storage (ADLS) as an object storage backend.
+
+You don’t need to install any extra dependencies to read/write Delta tables to ADLS with engines that use `delta-rs`. You do need to configure your ADLS access credentials correctly.
+
+## Passing Credentials Explicitly
+
+You can pass ADLS credentials to your query engine explicitly.
+
+For Polars, you would do this using the `storage_options` keyword, as demonstrated in the example below. This will forward your credentials to the `object_store` library that Polars uses for cloud storage access under the hood. Read the [`object_store` documentation](https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html#variants) for more information on defining specific credentials.
+
+## Example: Write Delta table to ADLS with Polars
+
+Using Polars, you can write a Delta table to ADLS directly like this:
+
+```python
+import polars as pl
+
+df = pl.DataFrame({"foo": [1, 2, 3, 4, 5]})
+
+# define container name (placeholder value)
+container = "<container-name>"
+
+# define credentials (placeholder values)
+storage_options = {
+    "ACCOUNT_NAME": "<storage-account-name>",
+    "ACCESS_KEY": "<access-key>",
+}
+
+# write Delta to ADLS
+df.write_delta(
+    f"abfs://{container}/delta_table",
+    storage_options=storage_options,
+)
+```
+
+## Example with pandas
+
+For libraries without direct `write_delta` methods (like Pandas), you can use the `write_deltalake` function from the `deltalake` library:
+
+```python
+import pandas as pd
+from deltalake import write_deltalake
+
+df = pd.DataFrame({"foo": [1, 2, 3, 4, 5]})
+
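+# reuse the `container` and `storage_options` defined in the Polars example above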
+write_deltalake(
+ f"abfs://{container}/delta_table_pandas",
+ df,
+ storage_options=storage_options
+)
+```
+
+## Using Local Authentication
+
+If your local session is authenticated using the Azure CLI, you can write Delta tables directly to ADLS. Read more about this in the [Azure CLI documentation](https://learn.microsoft.com/en-us/cli/azure/).
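+
+As a minimal sketch, assuming you have already run `az login` (the container and account names are placeholders), this could look like:
+
+```python
+import polars as pl
+
+df = pl.DataFrame({"foo": [1, 2, 3, 4, 5]})
+
+df.write_delta(
+    "abfss://<container>@<account>.dfs.core.windows.net/delta_table",
+    # tell object_store to source credentials from the Azure CLI session
+    storage_options={"use_azure_cli": "true"},
+)
+```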
From b3b2b9e856a71c3c78d014de104d2699c09c5790 Mon Sep 17 00:00:00 2001
From: Avril Aysha <68642378+avriiil@users.noreply.github.com>
Date: Wed, 25 Sep 2024 12:15:53 +0100
Subject: [PATCH 7/9] add adls docs to nav
---
mkdocs.yml | 1 +
1 file changed, 1 insertion(+)
diff --git a/mkdocs.yml b/mkdocs.yml
index b0c8d3a0ac..baf28ff3fc 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -82,6 +82,7 @@ nav:
- api/exceptions.md
- Integrations:
- Object Storage:
+ - integrations/object-storage/adls.md
- integrations/object-storage/hdfs.md
- integrations/object-storage/s3.md
- integrations/object-storage/s3-like.md
From 2498837ff6a2c3525058f1a9fd1301ba50fecbba Mon Sep 17 00:00:00 2001
From: Filip Dziuba
Date: Wed, 25 Sep 2024 15:14:39 +0200
Subject: [PATCH 8/9] refactor: exposing CommitConflictError enum
---
crates/core/src/operations/transaction/mod.rs | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/crates/core/src/operations/transaction/mod.rs b/crates/core/src/operations/transaction/mod.rs
index 6c4e81dc63..69027cc4b7 100644
--- a/crates/core/src/operations/transaction/mod.rs
+++ b/crates/core/src/operations/transaction/mod.rs
@@ -83,7 +83,7 @@ use object_store::path::Path;
use object_store::Error as ObjectStoreError;
use serde_json::Value;
-use self::conflict_checker::{CommitConflictError, TransactionInfo, WinningCommitSummary};
+use self::conflict_checker::{TransactionInfo, WinningCommitSummary};
use crate::checkpoints::{cleanup_expired_logs_for, create_checkpoint_for};
use crate::errors::DeltaTableError;
use crate::kernel::{
@@ -97,6 +97,7 @@ use crate::table::config::TableConfig;
use crate::table::state::DeltaTableState;
use crate::{crate_version, DeltaResult};
+pub use self::conflict_checker::CommitConflictError;
pub use self::protocol::INSTANCE as PROTOCOL;
#[cfg(test)]
From 6b53ac79451b3203506f281fafaa3e9f876f03a9 Mon Sep 17 00:00:00 2001
From: Avril Aysha <68642378+avriiil@users.noreply.github.com>
Date: Fri, 27 Sep 2024 11:38:19 +0100
Subject: [PATCH 9/9] squash adding adls docs
---
.github/CODEOWNERS | 2 +-
.github/actions/setup-env/action.yml | 34 +
.github/codecov.yml | 17 +
.github/dependabot.yml | 3 +-
.github/workflows/build.yml | 37 +-
.github/workflows/codecov.yml | 36 +
.github/workflows/dev_pr.yml | 1 +
.github/workflows/docs.yml | 25 +-
.github/workflows/python_benchmark.yml | 54 +
.github/workflows/python_build.yml | 136 +--
.github/workflows/python_release.yml | 3 +-
.gitignore | 4 +-
CHANGELOG.md | 296 +++++
CONTRIBUTING.md | 38 +-
Cargo.toml | 49 +-
README.md | 6 +-
crates/aws/Cargo.toml | 19 +-
crates/aws/src/constants.rs | 141 +++
crates/aws/src/credentials.rs | 331 ++++--
crates/aws/src/lib.rs | 427 ++++---
crates/aws/src/logstore/default_logstore.rs | 113 ++
.../dynamodb_logstore.rs} | 27 +-
crates/aws/src/logstore/mod.rs | 11 +
crates/aws/src/storage.rs | 604 +++++-----
crates/aws/tests/common.rs | 6 +-
crates/aws/tests/integration_s3_dynamodb.rs | 76 +-
crates/aws/tests/repair_s3_rename_test.rs | 22 +-
crates/azure/Cargo.toml | 4 +-
crates/azure/tests/integration.rs | 5 +-
crates/benchmarks/src/bin/merge.rs | 3 +-
crates/catalog-glue/Cargo.toml | 6 +-
crates/core/Cargo.toml | 22 +-
crates/core/src/data_catalog/storage/mod.rs | 5 +-
.../core/src/data_catalog/unity/datafusion.rs | 2 +-
crates/core/src/data_catalog/unity/models.rs | 2 +-
crates/core/src/delta_datafusion/cdf/mod.rs | 66 +-
crates/core/src/delta_datafusion/cdf/scan.rs | 8 +-
.../src/delta_datafusion/cdf/scan_utils.rs | 17 +-
crates/core/src/delta_datafusion/expr.rs | 126 +-
.../delta_datafusion/find_files/logical.rs | 13 +-
.../src/delta_datafusion/find_files/mod.rs | 30 +-
.../delta_datafusion/find_files/physical.rs | 14 +-
crates/core/src/delta_datafusion/logical.rs | 15 +-
crates/core/src/delta_datafusion/mod.rs | 905 ++++++++++-----
crates/core/src/delta_datafusion/physical.rs | 21 +-
crates/core/src/delta_datafusion/planner.rs | 58 +
.../src/delta_datafusion/schema_adapter.rs | 82 ++
crates/core/src/errors.rs | 11 +
crates/core/src/kernel/arrow/mod.rs | 472 +-------
crates/core/src/kernel/error.rs | 10 -
crates/core/src/kernel/expressions/eval.rs | 384 -------
crates/core/src/kernel/expressions/mod.rs | 478 --------
crates/core/src/kernel/expressions/scalars.rs | 559 ---------
crates/core/src/kernel/mod.rs | 8 +-
crates/core/src/kernel/models/actions.rs | 361 +++++-
crates/core/src/kernel/models/fields.rs | 10 +-
crates/core/src/kernel/models/schema.rs | 838 +-------------
crates/core/src/kernel/scalars.rs | 286 +++++
crates/core/src/kernel/snapshot/log_data.rs | 225 +++-
.../core/src/kernel/snapshot/log_segment.rs | 168 ++-
crates/core/src/kernel/snapshot/mod.rs | 357 +++++-
crates/core/src/kernel/snapshot/parse.rs | 29 +-
crates/core/src/kernel/snapshot/replay.rs | 447 +++++++-
crates/core/src/kernel/snapshot/serde.rs | 3 +-
crates/core/src/lib.rs | 9 +-
crates/core/src/logstore/default_logstore.rs | 53 +-
crates/core/src/logstore/mod.rs | 111 +-
crates/core/src/operations/add_column.rs | 113 ++
crates/core/src/operations/add_feature.rs | 196 ++++
crates/core/src/operations/cast.rs | 354 ------
.../core/src/operations/cast/merge_schema.rs | 352 ++++++
crates/core/src/operations/cast/mod.rs | 650 +++++++++++
crates/core/src/operations/cdc.rs | 415 +++++++
crates/core/src/operations/constraints.rs | 8 +-
.../core/src/operations/convert_to_delta.rs | 106 +-
crates/core/src/operations/create.rs | 115 +-
crates/core/src/operations/delete.rs | 445 ++++++--
.../core/src/operations/filesystem_check.rs | 6 +-
crates/core/src/operations/load.rs | 3 +
crates/core/src/operations/load_cdf.rs | 267 ++++-
crates/core/src/operations/merge/barrier.rs | 45 +-
crates/core/src/operations/merge/filter.rs | 943 +++++++++++++++
crates/core/src/operations/merge/mod.rs | 963 ++++++++--------
crates/core/src/operations/mod.rs | 65 +-
crates/core/src/operations/optimize.rs | 160 ++-
crates/core/src/operations/restore.rs | 21 +-
.../core/src/operations/set_tbl_properties.rs | 215 +---
.../transaction/conflict_checker.rs | 104 +-
crates/core/src/operations/transaction/mod.rs | 120 +-
.../src/operations/transaction/protocol.rs | 156 ++-
.../core/src/operations/transaction/state.rs | 175 +--
.../src/operations/transaction/test_utils.rs | 171 ---
crates/core/src/operations/update.rs | 554 ++++++---
crates/core/src/operations/vacuum.rs | 5 +-
crates/core/src/operations/write.rs | 892 +++++++++++++--
crates/core/src/operations/writer.rs | 11 +-
crates/core/src/protocol/checkpoints.rs | 28 +-
crates/core/src/protocol/mod.rs | 75 +-
crates/core/src/schema/partitions.rs | 205 +++-
crates/core/src/storage/file.rs | 42 +-
crates/core/src/storage/mod.rs | 324 +++++-
crates/core/src/storage/retry_ext.rs | 5 +-
crates/core/src/table/builder.rs | 198 ++--
crates/core/src/table/config.rs | 89 +-
crates/core/src/table/mod.rs | 33 +-
crates/core/src/table/state.rs | 5 +
crates/core/src/table/state_arrow.rs | 23 +-
.../core/src/test_utils/factories/actions.rs | 153 +++
crates/core/src/test_utils/factories/data.rs | 247 ++++
crates/core/src/test_utils/factories/mod.rs | 66 ++
crates/core/src/test_utils/mod.rs | 5 +
crates/core/src/writer/json.rs | 9 +-
crates/core/src/writer/record_batch.rs | 36 +-
crates/core/src/writer/stats.rs | 25 +-
crates/core/src/writer/test_utils.rs | 10 +-
crates/core/src/writer/utils.rs | 6 +-
crates/core/tests/checkpoint_writer.rs | 14 +-
crates/core/tests/command_merge.rs | 31 +-
crates/core/tests/command_optimize.rs | 2 +-
crates/core/tests/command_restore.rs | 8 +-
crates/core/tests/fs_common/mod.rs | 23 +-
crates/core/tests/integration_checkpoint.rs | 10 +-
crates/core/tests/integration_datafusion.rs | 87 +-
.../core/tests/read_delta_partitions_test.rs | 116 --
crates/deltalake/Cargo.toml | 16 +-
crates/deltalake/src/lib.rs | 2 +
crates/gcp/Cargo.toml | 4 +-
crates/gcp/src/storage.rs | 21 +-
crates/gcp/tests/context.rs | 2 +-
crates/hdfs/Cargo.toml | 29 +
crates/hdfs/src/lib.rs | 48 +
crates/hdfs/tests/context.rs | 60 +
crates/hdfs/tests/integration.rs | 16 +
crates/mount/Cargo.toml | 4 +-
crates/mount/src/file.rs | 27 +-
crates/sql/src/logical_plan.rs | 44 +-
crates/sql/src/planner.rs | 21 +-
crates/test/Cargo.toml | 4 +-
crates/test/src/concurrent.rs | 2 +-
crates/test/src/datafusion.rs | 8 +-
crates/test/src/lib.rs | 14 +-
.../_delta_log/00000000000000000000.json | 3 +
.../_delta_log/00000000000000000001.json | 3 +
.../00000000000000000002.checkpoint.parquet | Bin 0 -> 41898 bytes
.../_delta_log/00000000000000000002.json | 2 +
.../_delta_log/_last_checkpoint | 1 +
...411e-bca9-b067444cbcb0-c000.snappy.parquet | Bin 0 -> 5489 bytes
...4453-9202-51d75dee59af-c000.snappy.parquet | Bin 0 -> 5489 bytes
dev/publish.sh | 11 +
dev/release/update_change_log.sh | 4 +-
docs/Makefile | 20 +
docs/api/delta_writer.md | 4 +
.../architecture-of-delta-table.md | 18 +-
.../delta-lake-acid-transactions.md | 90 +-
docs/integrations/delta-lake-daft.md | 10 +
docs/integrations/delta-lake-dagster.md | 4 +-
docs/integrations/object-storage/adls.md | 57 +
docs/integrations/object-storage/gcs.md | 44 +
docs/integrations/object-storage/hdfs.md | 48 +
docs/integrations/object-storage/s3-like.md | 83 ++
docs/integrations/object-storage/s3.md | 102 ++
docs/requirements.txt | 4 +-
docs/usage/loading-table.md | 55 +-
docs/usage/managing-tables.md | 9 +-
.../small-file-compaction-with-optimize.md | 2 +-
docs/usage/writing/index.md | 35 +-
.../writing-to-s3-with-locking-provider.md | 53 +-
mkdocs.yml | 5 +
python/.gitignore | 1 +
python/Cargo.toml | 11 +-
python/Makefile | 7 +-
python/deltalake/__init__.py | 13 +-
python/deltalake/_internal.pyi | 166 ++-
python/deltalake/schema.py | 111 +-
python/deltalake/table.py | 1016 ++++++++++-------
python/deltalake/writer.py | 307 +++--
python/docs/source/usage.rst | 26 +
python/pyproject.toml | 31 +-
python/src/features.rs | 56 +
python/src/filesystem.rs | 170 +--
python/src/lib.rs | 966 +++++++++-------
python/src/merge.rs | 214 ++++
python/src/schema.rs | 215 ++--
python/src/utils.rs | 36 +
python/stubs/pyarrow/__init__.pyi | 7 +
python/stubs/pyarrow/parquet.pyi | 8 +
python/tests/conftest.py | 89 +-
.../test_write_to_pyspark.py | 4 +-
python/tests/test_alter.py | 158 ++-
python/tests/test_benchmark.py | 2 +-
python/tests/test_cdf.py | 267 ++++-
python/tests/test_checkpoint.py | 113 ++
python/tests/test_delete.py | 5 +-
python/tests/test_file_system_handler.py | 2 +-
python/tests/test_fs.py | 25 +-
python/tests/test_merge.py | 100 +-
python/tests/test_optimize.py | 44 +-
python/tests/test_repair.py | 4 +-
python/tests/test_restore.py | 4 +-
python/tests/test_schema.py | 130 ++-
python/tests/test_table_read.py | 188 ++-
python/tests/test_update.py | 10 +-
python/tests/test_vacuum.py | 4 +-
python/tests/test_writer.py | 349 ++++--
python/tests/test_writerproperties.py | 56 +-
205 files changed, 15934 insertions(+), 8216 deletions(-)
create mode 100644 .github/actions/setup-env/action.yml
create mode 100644 .github/codecov.yml
create mode 100644 .github/workflows/codecov.yml
create mode 100644 .github/workflows/python_benchmark.yml
create mode 100644 crates/aws/src/constants.rs
create mode 100644 crates/aws/src/logstore/default_logstore.rs
rename crates/aws/src/{logstore.rs => logstore/dynamodb_logstore.rs} (94%)
create mode 100644 crates/aws/src/logstore/mod.rs
create mode 100644 crates/core/src/delta_datafusion/planner.rs
create mode 100644 crates/core/src/delta_datafusion/schema_adapter.rs
delete mode 100644 crates/core/src/kernel/expressions/eval.rs
delete mode 100644 crates/core/src/kernel/expressions/mod.rs
delete mode 100644 crates/core/src/kernel/expressions/scalars.rs
create mode 100644 crates/core/src/kernel/scalars.rs
create mode 100644 crates/core/src/operations/add_column.rs
create mode 100644 crates/core/src/operations/add_feature.rs
delete mode 100644 crates/core/src/operations/cast.rs
create mode 100644 crates/core/src/operations/cast/merge_schema.rs
create mode 100644 crates/core/src/operations/cast/mod.rs
create mode 100644 crates/core/src/operations/cdc.rs
create mode 100644 crates/core/src/operations/merge/filter.rs
delete mode 100644 crates/core/src/operations/transaction/test_utils.rs
create mode 100644 crates/core/src/test_utils/factories/actions.rs
create mode 100644 crates/core/src/test_utils/factories/data.rs
create mode 100644 crates/core/src/test_utils/factories/mod.rs
create mode 100644 crates/core/src/test_utils/mod.rs
create mode 100644 crates/hdfs/Cargo.toml
create mode 100644 crates/hdfs/src/lib.rs
create mode 100644 crates/hdfs/tests/context.rs
create mode 100644 crates/hdfs/tests/integration.rs
create mode 100644 crates/test/tests/data/delta-checkpoint-stats-optional/_delta_log/00000000000000000000.json
create mode 100644 crates/test/tests/data/delta-checkpoint-stats-optional/_delta_log/00000000000000000001.json
create mode 100644 crates/test/tests/data/delta-checkpoint-stats-optional/_delta_log/00000000000000000002.checkpoint.parquet
create mode 100644 crates/test/tests/data/delta-checkpoint-stats-optional/_delta_log/00000000000000000002.json
create mode 100644 crates/test/tests/data/delta-checkpoint-stats-optional/_delta_log/_last_checkpoint
create mode 100644 crates/test/tests/data/delta-checkpoint-stats-optional/part-00000-28925d3a-bdf2-411e-bca9-b067444cbcb0-c000.snappy.parquet
create mode 100644 crates/test/tests/data/delta-checkpoint-stats-optional/part-00000-7a509247-4f58-4453-9202-51d75dee59af-c000.snappy.parquet
create mode 100755 dev/publish.sh
create mode 100644 docs/Makefile
create mode 100644 docs/integrations/object-storage/adls.md
create mode 100644 docs/integrations/object-storage/gcs.md
create mode 100644 docs/integrations/object-storage/hdfs.md
create mode 100644 docs/integrations/object-storage/s3-like.md
create mode 100644 docs/integrations/object-storage/s3.md
create mode 100644 python/src/features.rs
create mode 100644 python/src/merge.rs
create mode 100644 python/stubs/pyarrow/parquet.pyi
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index b99809d1f6..736703c551 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,4 +1,4 @@
-crates/ @wjones127 @roeap @rtyler
+crates/ @wjones127 @roeap @rtyler @hntd187 @ion-elgreco
delta-inspect/ @wjones127 @rtyler
proofs/ @houqp
python/ @wjones127 @fvaleye @roeap @ion-elgreco
diff --git a/.github/actions/setup-env/action.yml b/.github/actions/setup-env/action.yml
new file mode 100644
index 0000000000..7875107ddd
--- /dev/null
+++ b/.github/actions/setup-env/action.yml
@@ -0,0 +1,34 @@
+name: "Setup Python and Rust Environment"
+description: "Set up Python, virtual environment, and Rust toolchain"
+
+inputs:
+
+ python-version:
+ description: "The Python version to set up"
+ required: true
+ default: "3.10"
+
+ rust-toolchain:
+ description: "The Rust toolchain to set up"
+ required: true
+ default: "stable"
+
+runs:
+ using: "composite"
+
+ steps:
+
+ - name: Set up Python ${{ inputs.python-version }}
+ uses: actions/setup-python@v4
+ with:
+ python-version: ${{ inputs.python-version }}
+
+ - name: Install Rust toolchain
+ uses: actions-rs/toolchain@v1
+ with:
+ profile: default
+ toolchain: ${{ inputs.rust-toolchain }}
+ override: true
+ components: rustfmt, clippy
+
+ - uses: Swatinem/rust-cache@v2
\ No newline at end of file
diff --git a/.github/codecov.yml b/.github/codecov.yml
new file mode 100644
index 0000000000..dd93c3b7cf
--- /dev/null
+++ b/.github/codecov.yml
@@ -0,0 +1,17 @@
+
+coverage:
+ status:
+ project:
+ default:
+ # allow some leniency on the deviation of pull requests
+ threshold: '1%'
+ informational: true
+ patch:
+ default:
+ informational: true
+
+
+ignore:
+ - "delta-inspect/"
+ - "proofs/"
+ - "**/*.toml"
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index bdacb4c00c..1e5b6b27a4 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -10,6 +10,5 @@ updates:
ignore:
# arrow and datafusion are bumped manually
- dependency-name: "arrow*"
- update-types: ["version-update:semver-major"]
- dependency-name: "datafusion*"
- update-types: ["version-update:semver-major"]
+ - dependency-name: "parquet"
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 8b80dc0a9f..a807184c47 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -5,6 +5,10 @@ on:
branches: [main, "rust-v*"]
pull_request:
branches: [main, "rust-v*"]
+ merge_group:
+
+env:
+  DEFAULT_FEATURES: "azure,datafusion,s3,gcs,glue,hdfs"
jobs:
format:
@@ -16,7 +20,7 @@ jobs:
uses: actions-rs/toolchain@v1
with:
profile: default
- toolchain: stable
+ toolchain: '1.80'
override: true
- name: Format
@@ -28,7 +32,6 @@ jobs:
matrix:
os:
- ubuntu-latest
- - macos-11
- windows-latest
runs-on: ${{ matrix.os }}
@@ -39,17 +42,17 @@ jobs:
uses: actions-rs/toolchain@v1
with:
profile: default
- toolchain: stable
+ toolchain: '1.80'
override: true
- name: build and lint with clippy
- run: cargo clippy --features azure,datafusion,s3,gcs,glue --tests
+ run: cargo clippy --features ${{ env.DEFAULT_FEATURES }} --tests
- name: Spot-check build for native-tls features
run: cargo clippy --no-default-features --features azure,datafusion,s3-native-tls,gcs,glue --tests
- name: Check docs
- run: cargo doc --features azure,datafusion,s3,gcs,glue
+ run: cargo doc --features ${{ env.DEFAULT_FEATURES }}
- name: Check no default features (except rustls)
run: cargo check --no-default-features --features rustls
@@ -60,7 +63,6 @@ jobs:
matrix:
os:
- ubuntu-latest
- - macos-11
- windows-latest
runs-on: ${{ matrix.os }}
env:
@@ -77,11 +79,11 @@ jobs:
uses: actions-rs/toolchain@v1
with:
profile: default
- toolchain: "stable"
+ toolchain: '1.80'
override: true
- name: Run tests
- run: cargo test --verbose --features datafusion,azure
+ run: cargo test --verbose --features ${{ env.DEFAULT_FEATURES }}
integration_test:
name: Integration Tests
@@ -94,6 +96,7 @@ jobs:
# https://github.com/rust-lang/cargo/issues/10280
CARGO_NET_GIT_FETCH_WITH_CLI: "true"
RUST_BACKTRACE: "1"
+ RUST_LOG: debug
AWS_DEFAULT_REGION: "us-east-1"
AWS_ACCESS_KEY_ID: deltalake
AWS_SECRET_ACCESS_KEY: weloverust
@@ -111,15 +114,27 @@ jobs:
uses: actions-rs/toolchain@v1
with:
profile: default
- toolchain: stable
+ toolchain: '1.80'
override: true
+ # Install Java and Hadoop for HDFS integration tests
+ - uses: actions/setup-java@v4
+ with:
+ distribution: "temurin"
+ java-version: "17"
+
+ - name: Download Hadoop
+ run: |
+ wget -q https://dlcdn.apache.org/hadoop/common/hadoop-3.4.0/hadoop-3.4.0.tar.gz
+ tar -xf hadoop-3.4.0.tar.gz -C $GITHUB_WORKSPACE
+ echo "$GITHUB_WORKSPACE/hadoop-3.4.0/bin" >> $GITHUB_PATH
+
- name: Start emulated services
- run: docker-compose up -d
+ run: docker compose up -d
- name: Run tests with rustls (default)
run: |
- cargo test --features integration_test,azure,s3,gcs,datafusion
+ cargo test --features integration_test,${{ env.DEFAULT_FEATURES }}
- name: Run tests with native-tls
run: |
diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml
new file mode 100644
index 0000000000..a8d9beabcd
--- /dev/null
+++ b/.github/workflows/codecov.yml
@@ -0,0 +1,36 @@
+name: coverage
+
+on:
+ push:
+ branches: [main, "rust-v*"]
+ pull_request:
+ branches: [main, "rust-v*"]
+
+env:
+  DEFAULT_FEATURES: "azure,datafusion,s3,gcs,glue,hdfs"
+
+jobs:
+ coverage:
+ runs-on: ubuntu-latest
+ env:
+ CARGO_TERM_COLOR: always
+ steps:
+ - uses: actions/checkout@v4
+ - name: Install rust
+ uses: actions-rs/toolchain@v1
+ with:
+ profile: default
+ toolchain: '1.80'
+ override: true
+ - name: Install cargo-llvm-cov
+ uses: taiki-e/install-action@cargo-llvm-cov
+ - uses: Swatinem/rust-cache@v2
+ - name: Generate code coverage
+ run: cargo llvm-cov --features ${DEFAULT_FEATURES} --workspace --codecov --output-path codecov.json -- --skip read_table_version_hdfs
+ - name: Upload coverage to Codecov
+ uses: codecov/codecov-action@v4
+ with:
+ files: codecov.json
+ fail_ci_if_error: true
+ env:
+ CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
diff --git a/.github/workflows/dev_pr.yml b/.github/workflows/dev_pr.yml
index 6b3d5a7ddb..121e0b8882 100644
--- a/.github/workflows/dev_pr.yml
+++ b/.github/workflows/dev_pr.yml
@@ -2,6 +2,7 @@ name: dev_pr
# Trigger whenever a PR is changed (title as well as new / changed commits)
on:
+ merge_group:
pull_request_target:
types:
- opened
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 079cd66fcc..5729b87624 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -1,6 +1,7 @@
name: Build (and maybe release) the documentation
on:
+ merge_group:
pull_request:
paths:
- python/**
@@ -31,9 +32,9 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- - uses: psf/black@stable
- with:
- src: docs/src/python
+ - run: |
+ cd docs
+ make check
build-deploy:
needs:
@@ -47,25 +48,13 @@ jobs:
steps:
- uses: actions/checkout@v3
- - name: Install Rust
- uses: actions-rs/toolchain@v1
- with:
- toolchain: stable
- override: true
- components: rustfmt, clippy
-
- - uses: Swatinem/rust-cache@v2
-
- - name: Set up Python
- uses: actions/setup-python@v3
- with:
- python-version: '3.10'
+ - name: Setup Environment
+ uses: ./.github/actions/setup-env
- name: Build and install deltalake
run: |
cd python
- pip install virtualenv
- virtualenv venv
+ python -m venv venv
source venv/bin/activate
make ${{ env.BUILD_ARGS }}
diff --git a/.github/workflows/python_benchmark.yml b/.github/workflows/python_benchmark.yml
new file mode 100644
index 0000000000..896c5cc412
--- /dev/null
+++ b/.github/workflows/python_benchmark.yml
@@ -0,0 +1,54 @@
+name: python_benchmark
+
+
+# This is separate from the python_build so that it doesn't need to run on the merge group
+on:
+ push:
+ branches: [main]
+ pull_request:
+ branches: [main]
+
+defaults:
+ run:
+ working-directory: ./python
+
+jobs:
+ benchmark:
+ name: Python Benchmark
+ runs-on: ubuntu-latest
+ env:
+ RUSTFLAGS: "-C debuginfo=line-tables-only"
+ CARGO_INCREMENTAL: 0
+
+ steps:
+ - uses: actions/checkout@v2
+
+ - name: Setup Environment
+ uses: ./.github/actions/setup-env
+
+ - name: Build deltalake in release mode
+ run: |
+ python -m venv venv
+ source venv/bin/activate
+ MATURIN_EXTRA_ARGS=--release make develop
+
+ # Download previous benchmark result from cache (if exists)
+ - name: Download previous benchmark data
+ uses: actions/cache@v2
+ with:
+ path: ./cache
+ key: ${{ runner.os }}-benchmark
+
+ - name: Run benchmark
+ run: |
+ source venv/bin/activate
+ pytest tests/test_benchmark.py -m benchmark --benchmark-json output.json
+
+ - name: Store benchmark result
+ uses: benchmark-action/github-action-benchmark@v1
+ with:
+ tool: "pytest"
+ output-file-path: python/output.json
+ external-data-json-path: ./cache/benchmark-data.json
+ fail-on-alert: true
+
diff --git a/.github/workflows/python_build.yml b/.github/workflows/python_build.yml
index bc2f20cc9a..dc5483e091 100644
--- a/.github/workflows/python_build.yml
+++ b/.github/workflows/python_build.yml
@@ -1,6 +1,7 @@
name: python_build
on:
+ merge_group:
push:
branches: [main]
pull_request:
@@ -15,28 +16,22 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- - name: Setup Python
- uses: actions/setup-python@v2
- with:
- python-version: 3.8
+
+ - name: Setup Environment
+ uses: ./.github/actions/setup-env
- name: Check Python
run: |
- pip install ruff black mypy types-dataclasses typing-extensions
+ python -m venv venv
+ source venv/bin/activate
+ pip install ruff==0.5.2 mypy==1.10.1 types-dataclasses typing-extensions
make check-python
- - name: Install minimal stable with clippy and rustfmt
- uses: actions-rs/toolchain@v1
- with:
- profile: default
- toolchain: stable
- override: true
-
- name: Check Rust
run: make check-rust
test-minimal:
- name: Python Build (Python 3.8 PyArrow 8.0.0)
+ name: Python Build (Python 3.8 PyArrow 16.0.0)
runs-on: ubuntu-latest
env:
RUSTFLAGS: "-C debuginfo=line-tables-only"
@@ -45,28 +40,18 @@ jobs:
steps:
- uses: actions/checkout@v3
- - name: Setup Python
- uses: actions/setup-python@v2
+ - name: Setup Environment
+ uses: ./.github/actions/setup-env
with:
python-version: 3.8
- - name: Install latest nightly
- uses: actions-rs/toolchain@v1
- with:
- toolchain: stable
- override: true
- components: rustfmt, clippy
-
- - uses: Swatinem/rust-cache@v2
-
- name: Build and install deltalake
run: |
- pip install virtualenv
- virtualenv venv
+ python -m venv venv
source venv/bin/activate
make setup
# Install minimum PyArrow version
- pip install -e .[pandas,devel] pyarrow==8.0.0
+ pip install -e .[pandas,devel] pyarrow==16.0.0
env:
RUSTFLAGS: "-C debuginfo=line-tables-only"
@@ -75,10 +60,6 @@ jobs:
source venv/bin/activate
make unit-test
- # - name: Run Integration tests
- # run: |
- # py.test --cov tests -m integration
-
test:
name: Python Build (Python 3.10 PyArrow latest)
runs-on: ubuntu-latest
@@ -89,26 +70,15 @@ jobs:
steps:
- uses: actions/checkout@v3
- - name: Install latest nightly
- uses: actions-rs/toolchain@v1
- with:
- toolchain: stable
- override: true
- components: rustfmt, clippy
-
- - uses: Swatinem/rust-cache@v2
-
- - uses: actions/setup-python@v3
- with:
- python-version: "3.10"
+ - name: Setup Environment
+ uses: ./.github/actions/setup-env
- name: Start emulated services
- run: docker-compose up -d
+ run: docker compose up -d
- name: Build and install deltalake
run: |
- pip install virtualenv
- virtualenv venv
+ python -m venv venv
source venv/bin/activate
make develop
@@ -127,56 +97,6 @@ jobs:
python -m pytest -m "not pandas and not integration and not benchmark"
pip install pandas
- benchmark:
- name: Python Benchmark
- runs-on: ubuntu-latest
- env:
- RUSTFLAGS: "-C debuginfo=line-tables-only"
- CARGO_INCREMENTAL: 0
-
- steps:
- - uses: actions/checkout@v2
-
- - name: Install latest nightly
- uses: actions-rs/toolchain@v1
- with:
- toolchain: stable
- override: true
- components: rustfmt, clippy
-
- - uses: Swatinem/rust-cache@v2
-
- - uses: actions/setup-python@v4
- with:
- python-version: "3.10"
-
- - name: Build deltalake in release mode
- run: |
- pip install virtualenv
- virtualenv venv
- source venv/bin/activate
- MATURIN_EXTRA_ARGS=--release make develop
-
- # Download previous benchmark result from cache (if exists)
- - name: Download previous benchmark data
- uses: actions/cache@v2
- with:
- path: ./cache
- key: ${{ runner.os }}-benchmark
-
- - name: Run benchmark
- run: |
- source venv/bin/activate
- pytest tests/test_benchmark.py -m benchmark --benchmark-json output.json
-
- - name: Store benchmark result
- uses: benchmark-action/github-action-benchmark@v1
- with:
- tool: "pytest"
- output-file-path: python/output.json
- external-data-json-path: ./cache/benchmark-data.json
- fail-on-alert: true
-
test-pyspark:
name: PySpark Integration Tests
runs-on: ubuntu-latest
@@ -187,18 +107,8 @@ jobs:
steps:
- uses: actions/checkout@v3
- - name: Install latest nightly
- uses: actions-rs/toolchain@v1
- with:
- toolchain: stable
- override: true
- components: rustfmt, clippy
-
- - uses: Swatinem/rust-cache@v2
-
- - uses: actions/setup-python@v3
- with:
- python-version: "3.10"
+ - name: Setup Environment
+ uses: ./.github/actions/setup-env
- uses: actions/setup-java@v2
with:
@@ -207,8 +117,7 @@ jobs:
- name: Build and install deltalake
run: |
- pip install virtualenv
- virtualenv venv
+ python -m venv venv
source venv/bin/activate
make develop-pyspark
@@ -231,15 +140,14 @@ jobs:
steps:
- uses: actions/checkout@v3
- - name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v4
+ - name: Setup Environment
+ uses: ./.github/actions/setup-env
with:
python-version: ${{ matrix.python-version }}
- name: Build and install deltalake
run: |
- pip install virtualenv
- virtualenv venv
+ python -m venv venv
source venv/bin/activate
make setup
maturin develop
diff --git a/.github/workflows/python_release.yml b/.github/workflows/python_release.yml
index 48611bacb4..cf462f2070 100644
--- a/.github/workflows/python_release.yml
+++ b/.github/workflows/python_release.yml
@@ -35,7 +35,7 @@ jobs:
fail-fast: false
matrix:
target: [x86_64-apple-darwin, aarch64-apple-darwin]
- runs-on: macos-12
+ runs-on: macos-14
steps:
- uses: actions/checkout@v3
@@ -76,6 +76,7 @@ jobs:
env:
MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
with:
+ maturin-version: v1.6.0 # https://github.com/PyO3/maturin/issues/2154
target: x86_64-unknown-linux-gnu
command: publish
args: --skip-existing -m python/Cargo.toml ${{ env.FEATURES_FLAG }}
diff --git a/.gitignore b/.gitignore
index 84fc17c5f2..18dcc39f69 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,7 @@ tlaplus/*.toolbox/*/[0-9]*-[0-9]*-[0-9]*-[0-9]*-[0-9]*-[0-9]*/
.vscode
.env
.venv
+venv
**/.DS_Store
**/.python-version
.coverage
@@ -21,6 +22,7 @@ __blobstorage__
.githubchangeloggenerator.cache.log
.githubchangeloggenerator.cache/
.githubchangeloggenerator*
+data
# Add all Cargo.lock files except for those in binary crates
Cargo.lock
@@ -30,4 +32,4 @@ Cargo.lock
justfile
site
-__pycache__
+__pycache__
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e68641da21..7c0c5099c8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,301 @@
# Changelog
+## [rust-v0.19.0](https://github.com/delta-io/delta-rs/tree/rust-v0.19.0) (2024-08-14)
+
+[Full Changelog](https://github.com/delta-io/delta-rs/compare/rust-v0.18.2...rust-v0.19.0)
+
+**Implemented enhancements:**
+
+- Only allow squash merge [\#2542](https://github.com/delta-io/delta-rs/issues/2542)
+
+**Fixed bugs:**
+
+- Write also insert change types in writer CDC [\#2750](https://github.com/delta-io/delta-rs/issues/2750)
+- Regression in Python multiprocessing support [\#2744](https://github.com/delta-io/delta-rs/issues/2744)
+- SchemaError occurs during table optimisation after upgrade to v0.18.1 [\#2731](https://github.com/delta-io/delta-rs/issues/2731)
+- AWS WebIdentityToken exposure in log files [\#2719](https://github.com/delta-io/delta-rs/issues/2719)
+- Write performance degrades with multiple writers [\#2683](https://github.com/delta-io/delta-rs/issues/2683)
+- Write monotonic sequence, but read is non monotonic [\#2659](https://github.com/delta-io/delta-rs/issues/2659)
+- Python `write_deltalake` with `schema_mode="merge"` casts types [\#2642](https://github.com/delta-io/delta-rs/issues/2642)
+- Newest docs \(potentially\) not released [\#2587](https://github.com/delta-io/delta-rs/issues/2587)
+- CDC is not generated for Structs and Lists [\#2568](https://github.com/delta-io/delta-rs/issues/2568)
+
+**Closed issues:**
+
+- delete\_dir bug [\#2713](https://github.com/delta-io/delta-rs/issues/2713)
+
+**Merged pull requests:**
+
+- chore: fix a bunch of clippy lints and re-enable tests [\#2773](https://github.com/delta-io/delta-rs/pull/2773) ([rtyler](https://github.com/rtyler))
+- feat: more economic data skipping with datafusion [\#2772](https://github.com/delta-io/delta-rs/pull/2772) ([roeap](https://github.com/roeap))
+- chore: prepare the next notable release of 0.19.0 [\#2768](https://github.com/delta-io/delta-rs/pull/2768) ([rtyler](https://github.com/rtyler))
+- feat: restore the TryFrom for DeltaTablePartition [\#2767](https://github.com/delta-io/delta-rs/pull/2767) ([rtyler](https://github.com/rtyler))
+- feat: fail fast on forked process [\#2765](https://github.com/delta-io/delta-rs/pull/2765) ([Tom-Newton](https://github.com/Tom-Newton))
+- perf: early stop if all values in arr are null [\#2764](https://github.com/delta-io/delta-rs/pull/2764) ([ion-elgreco](https://github.com/ion-elgreco))
+- fix\(python, rust\): don't flatten fields during cdf read [\#2763](https://github.com/delta-io/delta-rs/pull/2763) ([ion-elgreco](https://github.com/ion-elgreco))
+- chore: upgrade to datafusion 41 [\#2761](https://github.com/delta-io/delta-rs/pull/2761) ([rtyler](https://github.com/rtyler))
+- fix\(python, rust\): cdc in writer not creating inserts [\#2751](https://github.com/delta-io/delta-rs/pull/2751) ([ion-elgreco](https://github.com/ion-elgreco))
+- feat: improved test fixtures [\#2749](https://github.com/delta-io/delta-rs/pull/2749) ([roeap](https://github.com/roeap))
+- feat: introduce CDC generation for merge operations [\#2747](https://github.com/delta-io/delta-rs/pull/2747) ([rtyler](https://github.com/rtyler))
+- docs: fix broken link in docs [\#2746](https://github.com/delta-io/delta-rs/pull/2746) ([astrojuanlu](https://github.com/astrojuanlu))
+- chore: update delta\_kernel to 0.3.0 [\#2742](https://github.com/delta-io/delta-rs/pull/2742) ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel))
+- chore: add to code\_owner crates [\#2741](https://github.com/delta-io/delta-rs/pull/2741) ([ion-elgreco](https://github.com/ion-elgreco))
+- chore: update changelog and versions for next release [\#2740](https://github.com/delta-io/delta-rs/pull/2740) ([rtyler](https://github.com/rtyler))
+- feat\(python, rust\): arrow large/view types passthrough, rust default engine [\#2738](https://github.com/delta-io/delta-rs/pull/2738) ([ion-elgreco](https://github.com/ion-elgreco))
+- fix: column parsing to include nested columns and enclosing char [\#2737](https://github.com/delta-io/delta-rs/pull/2737) ([gtrawinski](https://github.com/gtrawinski))
+
+## [rust-v0.18.2](https://github.com/delta-io/delta-rs/tree/rust-v0.18.2) (2024-08-07)
+
+[Full Changelog](https://github.com/delta-io/delta-rs/compare/rust-v0.18.1...rust-v0.18.2)
+
+**Implemented enhancements:**
+
+- Choose which columns to store min/max values for [\#2709](https://github.com/delta-io/delta-rs/issues/2709)
+- Projection pushdown for load\_cdf [\#2681](https://github.com/delta-io/delta-rs/issues/2681)
+- Way to check if Delta table exists at specified path [\#2662](https://github.com/delta-io/delta-rs/issues/2662)
+- Support HDFS via hdfs-native package [\#2611](https://github.com/delta-io/delta-rs/issues/2611)
+- Deletion `_change_type` does not appear in change data feed [\#2579](https://github.com/delta-io/delta-rs/issues/2579)
+- Could you please explain in the README what "Deltalake" is for the uninitiated? [\#2523](https://github.com/delta-io/delta-rs/issues/2523)
+- Discuss: Allow protocol change during write actions [\#2444](https://github.com/delta-io/delta-rs/issues/2444)
+- Support for Arrow PyCapsule interface [\#2376](https://github.com/delta-io/delta-rs/issues/2376)
+
+**Fixed bugs:**
+
+- Slow add\_actions.to\_pydict for tables with large number of columns, impacting read performance [\#2733](https://github.com/delta-io/delta-rs/issues/2733)
+- append is deleting records [\#2716](https://github.com/delta-io/delta-rs/issues/2716)
+- segmentation fault - Python 3.10 on Mac M3 [\#2706](https://github.com/delta-io/delta-rs/issues/2706)
+- Failure to delete dir and files [\#2703](https://github.com/delta-io/delta-rs/issues/2703)
+- DeltaTable.from\_data\_catalog not working [\#2699](https://github.com/delta-io/delta-rs/issues/2699)
+- Project should use the same version of `ruff` in the `lint` stage of `python_build.yml` as in `pyproject.toml` [\#2678](https://github.com/delta-io/delta-rs/issues/2678)
+- un-tracked columns are giving json error when pyarrow schema have feild with nullable=False and create\_checkpoint is trigged [\#2675](https://github.com/delta-io/delta-rs/issues/2675)
+- \[BUG\]write\_delta\({'custom\_metadata':str}\) cannot be converted. str to pyDict error \(0.18.2\_DeltaPython/Windows10\) [\#2697](https://github.com/delta-io/delta-rs/issues/2697)
+- Pyarrow engine not supporting schema overwrite with Append mode [\#2654](https://github.com/delta-io/delta-rs/issues/2654)
+- `deltalake-core` version re-exported by `deltalake` different than versions used by `deltalake-azure` and `deltalake-gcp` [\#2647](https://github.com/delta-io/delta-rs/issues/2647)
+- i32 limit in JSON stats [\#2646](https://github.com/delta-io/delta-rs/issues/2646)
+- Rust writer not encoding correct URL for partitions in delta table [\#2634](https://github.com/delta-io/delta-rs/issues/2634)
+- Large Types breaks merge predicate pruning [\#2632](https://github.com/delta-io/delta-rs/issues/2632)
+- Getting error when converting a partitioned parquet table to delta table [\#2626](https://github.com/delta-io/delta-rs/issues/2626)
+- Arrow: Parquet does not support writing empty structs when creating checkpoint [\#2622](https://github.com/delta-io/delta-rs/issues/2622)
+- InvalidTableLocation\("Unknown scheme: gs"\) on 0.18.0 [\#2610](https://github.com/delta-io/delta-rs/issues/2610)
+- Unable to read delta table created using Uniform [\#2578](https://github.com/delta-io/delta-rs/issues/2578)
+- schema merging doesn't work when overwriting with a predicate [\#2567](https://github.com/delta-io/delta-rs/issues/2567)
+- Not working in AWS Lambda \(0.16.2 - 0.17.4\) OSError: Generic S3 error [\#2511](https://github.com/delta-io/delta-rs/issues/2511)
+- DataFusion filter on partition column doesn't work. \(when the phsical schema ordering is different to logical one\) [\#2494](https://github.com/delta-io/delta-rs/issues/2494)
+- Creating checkpoints for tables with missing column stats results in Err [\#2493](https://github.com/delta-io/delta-rs/issues/2493)
+- Cannot merge to a table with a timestamp column after upgrading delta-rs [\#2478](https://github.com/delta-io/delta-rs/issues/2478)
+- Azure AD Auth fails on ARM64 [\#2475](https://github.com/delta-io/delta-rs/issues/2475)
+- Generic S3 error: Error after 0 retries ... Broken pipe \(os error 32\) [\#2403](https://github.com/delta-io/delta-rs/issues/2403)
+- write\_deltalake identifies large\_string as datatype even though string is set in schema [\#2374](https://github.com/delta-io/delta-rs/issues/2374)
+- Inconsistent arrow timestamp type breaks datafusion query [\#2341](https://github.com/delta-io/delta-rs/issues/2341)
+
+**Closed issues:**
+
+- Unable to write new partitions with type timestamp on tables created with delta-rs 0.10.0 [\#2631](https://github.com/delta-io/delta-rs/issues/2631)
+
+**Merged pull requests:**
+
+- fix: schema adapter doesn't map partial batches correctly [\#2735](https://github.com/delta-io/delta-rs/pull/2735) ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel))
+- perf: grab file size in rust [\#2734](https://github.com/delta-io/delta-rs/pull/2734) ([ion-elgreco](https://github.com/ion-elgreco))
+- feat: use logical plan in update, refactor/simplify CDCTracker [\#2727](https://github.com/delta-io/delta-rs/pull/2727) ([ion-elgreco](https://github.com/ion-elgreco))
+- feat: use logical plan in delete, delta planner refactoring [\#2725](https://github.com/delta-io/delta-rs/pull/2725) ([ion-elgreco](https://github.com/ion-elgreco))
+- chore: try an alternative docke compose invocation syntax [\#2724](https://github.com/delta-io/delta-rs/pull/2724) ([rtyler](https://github.com/rtyler))
+- fix\(python, rust\): use input schema to get correct schema in cdf reads [\#2723](https://github.com/delta-io/delta-rs/pull/2723) ([ion-elgreco](https://github.com/ion-elgreco))
+- feat\(python, rust\): cdc write-support for `overwrite` and `replacewhere` writes [\#2722](https://github.com/delta-io/delta-rs/pull/2722) ([ion-elgreco](https://github.com/ion-elgreco))
+- feat\(python, rust\): cdc write-support for `delete` operation [\#2721](https://github.com/delta-io/delta-rs/pull/2721) ([ion-elgreco](https://github.com/ion-elgreco))
+- chore: enabling actions for merge groups [\#2718](https://github.com/delta-io/delta-rs/pull/2718) ([rtyler](https://github.com/rtyler))
+- perf: apply projection when reading checkpoint parquet [\#2717](https://github.com/delta-io/delta-rs/pull/2717) ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel))
+- feat\(python\): add DeltaTable.is\_deltatable static method \(\#2662\) [\#2715](https://github.com/delta-io/delta-rs/pull/2715) ([omkar-foss](https://github.com/omkar-foss))
+- chore: prepare python release 0.18.3 [\#2707](https://github.com/delta-io/delta-rs/pull/2707) ([ion-elgreco](https://github.com/ion-elgreco))
+- fix\(python, rust\): use url encoder when encoding partition values [\#2705](https://github.com/delta-io/delta-rs/pull/2705) ([ion-elgreco](https://github.com/ion-elgreco))
+- feat\(python, rust\): add projection in CDF reads [\#2704](https://github.com/delta-io/delta-rs/pull/2704) ([ion-elgreco](https://github.com/ion-elgreco))
+- fix: ensure DataFusion SessionState Parquet options are applied to DeltaScan [\#2702](https://github.com/delta-io/delta-rs/pull/2702) ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel))
+- chore: refactor `write_deltalake` in `writer.py` [\#2695](https://github.com/delta-io/delta-rs/pull/2695) ([fpgmaas](https://github.com/fpgmaas))
+- fix\(python\): empty dataset fix for "pyarrow" engine [\#2689](https://github.com/delta-io/delta-rs/pull/2689) ([sherlockbeard](https://github.com/sherlockbeard))
+- chore: add test coverage command to `Makefile` [\#2688](https://github.com/delta-io/delta-rs/pull/2688) ([fpgmaas](https://github.com/fpgmaas))
+- chore: create separate action to setup python and rust in the cicd pipeline [\#2687](https://github.com/delta-io/delta-rs/pull/2687) ([fpgmaas](https://github.com/fpgmaas))
+- fix: update delta kernel version [\#2685](https://github.com/delta-io/delta-rs/pull/2685) ([jeppe742](https://github.com/jeppe742))
+- chore: update README.md [\#2684](https://github.com/delta-io/delta-rs/pull/2684) ([veronewra](https://github.com/veronewra))
+- fix\(rust,python\): checkpoint with column nullable false [\#2680](https://github.com/delta-io/delta-rs/pull/2680) ([sherlockbeard](https://github.com/sherlockbeard))
+- chore: pin `ruff` and `mypy` versions in the `lint` stage in the CI pipeline [\#2679](https://github.com/delta-io/delta-rs/pull/2679) ([fpgmaas](https://github.com/fpgmaas))
+- chore: enable `RUF` ruleset for `ruff` [\#2677](https://github.com/delta-io/delta-rs/pull/2677) ([fpgmaas](https://github.com/fpgmaas))
+- chore: remove stale code for conditional import of `Literal` [\#2676](https://github.com/delta-io/delta-rs/pull/2676) ([fpgmaas](https://github.com/fpgmaas))
+- chore: remove references to black from the project [\#2674](https://github.com/delta-io/delta-rs/pull/2674) ([fpgmaas](https://github.com/fpgmaas))
+- chore: bump ruff to 0.5.2 [\#2673](https://github.com/delta-io/delta-rs/pull/2673) ([fpgmaas](https://github.com/fpgmaas))
+- chore: improve contributing.md [\#2672](https://github.com/delta-io/delta-rs/pull/2672) ([fpgmaas](https://github.com/fpgmaas))
+- feat: support userMetadata in CommitInfo [\#2670](https://github.com/delta-io/delta-rs/pull/2670) ([jkylling](https://github.com/jkylling))
+- chore: upgrade to datafusion 40 [\#2661](https://github.com/delta-io/delta-rs/pull/2661) ([rtyler](https://github.com/rtyler))
+- docs: improve navigation fixes [\#2660](https://github.com/delta-io/delta-rs/pull/2660) ([avriiil](https://github.com/avriiil))
+- docs: add integration docs for s3 backend [\#2658](https://github.com/delta-io/delta-rs/pull/2658) ([avriiil](https://github.com/avriiil))
+- docs: fix bullets on hdfs docs [\#2653](https://github.com/delta-io/delta-rs/pull/2653) ([Kimahriman](https://github.com/Kimahriman))
+- ci: update CODEOWNERS [\#2650](https://github.com/delta-io/delta-rs/pull/2650) ([hntd187](https://github.com/hntd187))
+- feat\(rust\): fix size\_in\_bytes in last\_checkpoint\_ to i64 [\#2649](https://github.com/delta-io/delta-rs/pull/2649) ([sherlockbeard](https://github.com/sherlockbeard))
+- chore: increase subcrate versions [\#2648](https://github.com/delta-io/delta-rs/pull/2648) ([rtyler](https://github.com/rtyler))
+- chore: missed one macos runner reference in actions [\#2645](https://github.com/delta-io/delta-rs/pull/2645) ([rtyler](https://github.com/rtyler))
+- chore: add a reproduction case for merge failures with struct\ [\#2644](https://github.com/delta-io/delta-rs/pull/2644) ([rtyler](https://github.com/rtyler))
+- chore: remove macos builders from pull request flow [\#2638](https://github.com/delta-io/delta-rs/pull/2638) ([rtyler](https://github.com/rtyler))
+- fix: enable parquet pushdown for DeltaScan via TableProvider impl for DeltaTable \(rebase\) [\#2637](https://github.com/delta-io/delta-rs/pull/2637) ([rtyler](https://github.com/rtyler))
+- chore: fix documentation generation with a pin of griffe [\#2636](https://github.com/delta-io/delta-rs/pull/2636) ([rtyler](https://github.com/rtyler))
+- fix\(python\): fixed large\_dtype to schema convert [\#2635](https://github.com/delta-io/delta-rs/pull/2635) ([sherlockbeard](https://github.com/sherlockbeard))
+- fix\(rust, python\): fix writing empty structs when creating checkpoint [\#2627](https://github.com/delta-io/delta-rs/pull/2627) ([sherlockbeard](https://github.com/sherlockbeard))
+- fix\(rust, python\): fix merge schema with overwrite [\#2623](https://github.com/delta-io/delta-rs/pull/2623) ([sherlockbeard](https://github.com/sherlockbeard))
+- chore: bump python 0.18.2 [\#2621](https://github.com/delta-io/delta-rs/pull/2621) ([ion-elgreco](https://github.com/ion-elgreco))
+- feat: report DataFusion metrics for DeltaScan [\#2617](https://github.com/delta-io/delta-rs/pull/2617) ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel))
+- feat\(rust,python\): cast each parquet file to delta schema [\#2615](https://github.com/delta-io/delta-rs/pull/2615) ([HawaiianSpork](https://github.com/HawaiianSpork))
+- fix\(rust\): inconsistent order of partitioning columns \(\#2494\) [\#2614](https://github.com/delta-io/delta-rs/pull/2614) ([aditanase](https://github.com/aditanase))
+- docs: add Daft writer [\#2594](https://github.com/delta-io/delta-rs/pull/2594) ([avriiil](https://github.com/avriiil))
+- feat\(python, rust\): `add column` operation [\#2562](https://github.com/delta-io/delta-rs/pull/2562) ([ion-elgreco](https://github.com/ion-elgreco))
+- fix: change arrow map root name to follow with parquet root name [\#2538](https://github.com/delta-io/delta-rs/pull/2538) ([sclmn](https://github.com/sclmn))
+- feat\(python\): handle PyCapsule interface objects in write\_deltalake [\#2534](https://github.com/delta-io/delta-rs/pull/2534) ([kylebarron](https://github.com/kylebarron))
+- feat: improve merge performance by using predicate non-partition columns min/max for prefiltering [\#2513](https://github.com/delta-io/delta-rs/pull/2513) ([JonasDev1](https://github.com/JonasDev1))
+- feat\(python, rust\): cleanup expired logs post-commit hook [\#2459](https://github.com/delta-io/delta-rs/pull/2459) ([ion-elgreco](https://github.com/ion-elgreco))
+
+## [rust-v0.18.0](https://github.com/delta-io/delta-rs/tree/rust-v0.18.0) (2024-06-12)
+
+[Full Changelog](https://github.com/delta-io/delta-rs/compare/rust-v0.17.3...rust-v0.18.0)
+
+**Implemented enhancements:**
+
+- documentation: concurrent writes for non-S3 backends [\#2556](https://github.com/delta-io/delta-rs/issues/2556)
+- pyarrow options for `write_delta` [\#2515](https://github.com/delta-io/delta-rs/issues/2515)
+- \[deltalake\_aws\] Allow configuring separate endpoints for S3 and DynamoDB clients. [\#2498](https://github.com/delta-io/delta-rs/issues/2498)
+- Include file stats when converting a parquet directory to a Delta table [\#2490](https://github.com/delta-io/delta-rs/issues/2490)
+- Adopt the delta kernel types [\#2489](https://github.com/delta-io/delta-rs/issues/2489)
+
+**Fixed bugs:**
+
+- `raise_if_not_exists` for properties not configurable on CreateBuilder [\#2564](https://github.com/delta-io/delta-rs/issues/2564)
+- write\_deltalake with rust engine fails when mode is append and overwrite schema is enabled [\#2553](https://github.com/delta-io/delta-rs/issues/2553)
+- Running the basic\_operations examples fails with `Error: Transaction { source: WriterFeaturesRequired(TimestampWithoutTimezone) `} [\#2552](https://github.com/delta-io/delta-rs/issues/2552)
+- invalid peer certificate: BadSignature when connecting to s3 from arm64/aarch64 [\#2551](https://github.com/delta-io/delta-rs/issues/2551)
+- load\_cdf\(\) issue : Generic S3 error: request or response body error: operation timed out [\#2549](https://github.com/delta-io/delta-rs/issues/2549)
+- write\_deltalake fails on Databricks volume [\#2540](https://github.com/delta-io/delta-rs/issues/2540)
+- Getting "Microsoft Azure Error: Operation timed out" when trying to retrieve big files [\#2537](https://github.com/delta-io/delta-rs/issues/2537)
+- Impossible to append to a DeltaTable with float data type on RHEL [\#2520](https://github.com/delta-io/delta-rs/issues/2520)
+- Creating DeltaTable object slow [\#2518](https://github.com/delta-io/delta-rs/issues/2518)
+- `write_deltalake` throws parser error when using `rust` engine and big decimals [\#2510](https://github.com/delta-io/delta-rs/issues/2510)
+- TypeError: Object of type int64 is not JSON serializable when writing using a Pandas dataframe [\#2501](https://github.com/delta-io/delta-rs/issues/2501)
+- unable to read delta table when table contains both null and non-null add stats [\#2477](https://github.com/delta-io/delta-rs/issues/2477)
+- Commits on WriteMode::MergeSchema cause table metadata corruption [\#2468](https://github.com/delta-io/delta-rs/issues/2468)
+- S3 object store always returns IMDS warnings [\#2460](https://github.com/delta-io/delta-rs/issues/2460)
+- File skipping according to documentation [\#2427](https://github.com/delta-io/delta-rs/issues/2427)
+- LockClientError [\#2379](https://github.com/delta-io/delta-rs/issues/2379)
+- get\_app\_transaction\_version\(\) returns wrong result [\#2340](https://github.com/delta-io/delta-rs/issues/2340)
+- Property setting in `create` is not handled correctly [\#2247](https://github.com/delta-io/delta-rs/issues/2247)
+- Handling of decimals in scientific notation [\#2221](https://github.com/delta-io/delta-rs/issues/2221)
+- Unable to append to delta table without datafusion feature [\#2204](https://github.com/delta-io/delta-rs/issues/2204)
+- Decimal Column with Value 0 Causes Failure in Python Binding [\#2193](https://github.com/delta-io/delta-rs/issues/2193)
+
+**Merged pull requests:**
+
+- docs: improve S3 access docs [\#2589](https://github.com/delta-io/delta-rs/pull/2589) ([avriiil](https://github.com/avriiil))
+- chore: bump macOS runners, maybe resolve import error [\#2588](https://github.com/delta-io/delta-rs/pull/2588) ([ion-elgreco](https://github.com/ion-elgreco))
+- chore: bump to datafusion 39, arrow 52, pyo3 0.21 [\#2581](https://github.com/delta-io/delta-rs/pull/2581) ([abhiaagarwal](https://github.com/abhiaagarwal))
+- feat: add custom dynamodb endpoint configuration [\#2575](https://github.com/delta-io/delta-rs/pull/2575) ([hnaoto](https://github.com/hnaoto))
+- fix: consistently use raise\_if\_key\_not\_exists in CreateBuilder [\#2569](https://github.com/delta-io/delta-rs/pull/2569) ([vegarsti](https://github.com/vegarsti))
+- fix: add raise\_if\_key\_not\_exists to CreateBuilder [\#2565](https://github.com/delta-io/delta-rs/pull/2565) ([vegarsti](https://github.com/vegarsti))
+- docs: dt.delete add context + api docs link [\#2560](https://github.com/delta-io/delta-rs/pull/2560) ([avriiil](https://github.com/avriiil))
+- fix: update deltalake crate examples for crate layout and TimestampNtz [\#2559](https://github.com/delta-io/delta-rs/pull/2559) ([jhoekx](https://github.com/jhoekx))
+- docs: clarify locking mechanism requirement for S3 [\#2558](https://github.com/delta-io/delta-rs/pull/2558) ([inigohidalgo](https://github.com/inigohidalgo))
+- fix: remove deprecated overwrite\_schema configuration which has incorrect behavior [\#2554](https://github.com/delta-io/delta-rs/pull/2554) ([rtyler](https://github.com/rtyler))
+- fix: clippy warnings [\#2548](https://github.com/delta-io/delta-rs/pull/2548) ([imor](https://github.com/imor))
+- docs: dask write syntax fix [\#2543](https://github.com/delta-io/delta-rs/pull/2543) ([avriiil](https://github.com/avriiil))
+- fix: cast support fields nested in lists and maps [\#2541](https://github.com/delta-io/delta-rs/pull/2541) ([HawaiianSpork](https://github.com/HawaiianSpork))
+- feat: implement transaction identifiers - continued [\#2539](https://github.com/delta-io/delta-rs/pull/2539) ([roeap](https://github.com/roeap))
+- docs: pull delta from conda not pip [\#2535](https://github.com/delta-io/delta-rs/pull/2535) ([avriiil](https://github.com/avriiil))
+- chore: expose `files_by_partition` to public api [\#2533](https://github.com/delta-io/delta-rs/pull/2533) ([edmondop](https://github.com/edmondop))
+- chore: bump python 0.17.5 [\#2531](https://github.com/delta-io/delta-rs/pull/2531) ([ion-elgreco](https://github.com/ion-elgreco))
+- feat\(rust\): make PartitionWriter public [\#2525](https://github.com/delta-io/delta-rs/pull/2525) ([adriangb](https://github.com/adriangb))
+- fix: msrv in workspace [\#2524](https://github.com/delta-io/delta-rs/pull/2524) ([roeap](https://github.com/roeap))
+- chore: fixing some clips [\#2521](https://github.com/delta-io/delta-rs/pull/2521) ([rtyler](https://github.com/rtyler))
+- fix: enable field\_with\_name to support nested fields with '.' delimiter [\#2519](https://github.com/delta-io/delta-rs/pull/2519) ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel))
+- chore: tidying up builds without datafusion feature and clippy [\#2516](https://github.com/delta-io/delta-rs/pull/2516) ([rtyler](https://github.com/rtyler))
+- fix\(python\): release GIL on most operations [\#2512](https://github.com/delta-io/delta-rs/pull/2512) ([adriangb](https://github.com/adriangb))
+- docs: fix typo [\#2508](https://github.com/delta-io/delta-rs/pull/2508) ([avriiil](https://github.com/avriiil))
+- fix\(rust, python\): fixed differences in storage options between log and object stores [\#2500](https://github.com/delta-io/delta-rs/pull/2500) ([mightyshazam](https://github.com/mightyshazam))
+- docs: improve daft integration docs [\#2496](https://github.com/delta-io/delta-rs/pull/2496) ([avriiil](https://github.com/avriiil))
+- feat: adopt kernel schema types [\#2495](https://github.com/delta-io/delta-rs/pull/2495) ([roeap](https://github.com/roeap))
+- feat: add stats to convert-to-delta operation [\#2491](https://github.com/delta-io/delta-rs/pull/2491) ([gruuya](https://github.com/gruuya))
+- fix\(python, rust\): region lookup wasn't working correctly for dynamo [\#2488](https://github.com/delta-io/delta-rs/pull/2488) ([mightyshazam](https://github.com/mightyshazam))
+- feat: introduce CDC write-side support for the Update operations [\#2486](https://github.com/delta-io/delta-rs/pull/2486) ([rtyler](https://github.com/rtyler))
+- fix\(python\): reuse state in `to_pyarrow_dataset` [\#2485](https://github.com/delta-io/delta-rs/pull/2485) ([ion-elgreco](https://github.com/ion-elgreco))
+- fix: check to see if the file exists before attempting to rename [\#2482](https://github.com/delta-io/delta-rs/pull/2482) ([rtyler](https://github.com/rtyler))
+- fix\(python, rust\): use new schema for stats parsing instead of old [\#2480](https://github.com/delta-io/delta-rs/pull/2480) ([ion-elgreco](https://github.com/ion-elgreco))
+- fix\(rust\): unable to read delta table when table contains both null and non-null add stats [\#2476](https://github.com/delta-io/delta-rs/pull/2476) ([yjshen](https://github.com/yjshen))
+- chore: update the changelog to include rust-v0.17.3 [\#2473](https://github.com/delta-io/delta-rs/pull/2473) ([rtyler](https://github.com/rtyler))
+- chore: a bunch of tweaks to get releases out the door [\#2472](https://github.com/delta-io/delta-rs/pull/2472) ([rtyler](https://github.com/rtyler))
+- chore: bump the core crate for its next release [\#2470](https://github.com/delta-io/delta-rs/pull/2470) ([rtyler](https://github.com/rtyler))
+- fix: return unsupported error for merging schemas in the presence of partition columns [\#2469](https://github.com/delta-io/delta-rs/pull/2469) ([emcake](https://github.com/emcake))
+- feat\(python\): add parameter to DeltaTable.to\_pyarrow\_dataset\(\) [\#2465](https://github.com/delta-io/delta-rs/pull/2465) ([adriangb](https://github.com/adriangb))
+- feat\(python, rust\): add OBJECT\_STORE\_CONCURRENCY\_LIMIT setting for ObjectStoreFactory [\#2458](https://github.com/delta-io/delta-rs/pull/2458) ([vigimite](https://github.com/vigimite))
+- fix\(rust\): handle 429 from GCS [\#2454](https://github.com/delta-io/delta-rs/pull/2454) ([adriangb](https://github.com/adriangb))
+- fix\(python\): reuse table state in write engine [\#2453](https://github.com/delta-io/delta-rs/pull/2453) ([ion-elgreco](https://github.com/ion-elgreco))
+- fix\(rust\): implement abort commit for S3DynamoDBLogStore [\#2452](https://github.com/delta-io/delta-rs/pull/2452) ([PeterKeDer](https://github.com/PeterKeDer))
+- fix\(python, rust\): check timestamp\_ntz in nested fields, add check\_can\_write in pyarrow writer [\#2443](https://github.com/delta-io/delta-rs/pull/2443) ([ion-elgreco](https://github.com/ion-elgreco))
+- fix\(python, rust\): remove imds calls from profile auth and region [\#2442](https://github.com/delta-io/delta-rs/pull/2442) ([mightyshazam](https://github.com/mightyshazam))
+- fix\(python, rust\): use from\_name during column projection creation [\#2441](https://github.com/delta-io/delta-rs/pull/2441) ([ion-elgreco](https://github.com/ion-elgreco))
+- chore: bump python for 0.17 release [\#2439](https://github.com/delta-io/delta-rs/pull/2439) ([ion-elgreco](https://github.com/ion-elgreco))
+- fix\(python,rust\): missing remove actions during `create_or_replace` [\#2437](https://github.com/delta-io/delta-rs/pull/2437) ([ion-elgreco](https://github.com/ion-elgreco))
+- chore: introduce the Operation trait to enforce consistency between operations [\#2435](https://github.com/delta-io/delta-rs/pull/2435) ([rtyler](https://github.com/rtyler))
+- fix\(python\): load\_as\_version with datetime object with no timezone specified [\#2429](https://github.com/delta-io/delta-rs/pull/2429) ([t1g0rz](https://github.com/t1g0rz))
+- feat\(python, rust\): respect column stats collection configurations [\#2428](https://github.com/delta-io/delta-rs/pull/2428) ([ion-elgreco](https://github.com/ion-elgreco))
+- feat: lazy static runtime in python [\#2424](https://github.com/delta-io/delta-rs/pull/2424) ([ion-elgreco](https://github.com/ion-elgreco))
+- feat: implement repartitioned for DeltaScan [\#2421](https://github.com/delta-io/delta-rs/pull/2421) ([jkylling](https://github.com/jkylling))
+- fix: return error when checkpoints and metadata get out of sync [\#2406](https://github.com/delta-io/delta-rs/pull/2406) ([esarili](https://github.com/esarili))
+- fix\(rust\): stats\_parsed has different number of records with stats [\#2405](https://github.com/delta-io/delta-rs/pull/2405) ([yjshen](https://github.com/yjshen))
+- docs: add Daft integration [\#2402](https://github.com/delta-io/delta-rs/pull/2402) ([avriiil](https://github.com/avriiil))
+- feat\(rust\): advance state in post commit [\#2396](https://github.com/delta-io/delta-rs/pull/2396) ([ion-elgreco](https://github.com/ion-elgreco))
+- chore\(rust\): bump arrow v51 and datafusion v37.1 [\#2395](https://github.com/delta-io/delta-rs/pull/2395) ([lasantosr](https://github.com/lasantosr))
+- docs: document required aws permissions [\#2393](https://github.com/delta-io/delta-rs/pull/2393) ([ale-rinaldi](https://github.com/ale-rinaldi))
+- feat\(rust\): post commit hook \(v2\), create checkpoint hook [\#2391](https://github.com/delta-io/delta-rs/pull/2391) ([ion-elgreco](https://github.com/ion-elgreco))
+- fix: time travel when checkpointed and logs removed [\#2389](https://github.com/delta-io/delta-rs/pull/2389) ([ion-elgreco](https://github.com/ion-elgreco))
+- fix\(rust\): remove flush after writing every batch [\#2387](https://github.com/delta-io/delta-rs/pull/2387) ([PeterKeDer](https://github.com/PeterKeDer))
+- feat: added configuration variables to handle EC2 metadata service [\#2385](https://github.com/delta-io/delta-rs/pull/2385) ([mightyshazam](https://github.com/mightyshazam))
+- fix\(rust\): timestamp deserialization format, missing type [\#2383](https://github.com/delta-io/delta-rs/pull/2383) ([ion-elgreco](https://github.com/ion-elgreco))
+- chore: bump chrono [\#2372](https://github.com/delta-io/delta-rs/pull/2372) ([universalmind303](https://github.com/universalmind303))
+- chore: bump python 0.16.4 [\#2371](https://github.com/delta-io/delta-rs/pull/2371) ([ion-elgreco](https://github.com/ion-elgreco))
+- fix: add snappy compression on checkpoint files [\#2365](https://github.com/delta-io/delta-rs/pull/2365) ([ion-elgreco](https://github.com/ion-elgreco))
+- fix: add config for parquet pushdown on delta scan [\#2364](https://github.com/delta-io/delta-rs/pull/2364) ([Blajda](https://github.com/Blajda))
+- fix\(python,rust\): optimize compact on schema evolved table [\#2358](https://github.com/delta-io/delta-rs/pull/2358) ([ion-elgreco](https://github.com/ion-elgreco))
+- fix\(python, rust\): expr parsing date/timestamp [\#2357](https://github.com/delta-io/delta-rs/pull/2357) ([ion-elgreco](https://github.com/ion-elgreco))
+- fix: remove tmp files in cleanup\_metadata [\#2356](https://github.com/delta-io/delta-rs/pull/2356) ([ion-elgreco](https://github.com/ion-elgreco))
+- fix: make struct fields nullable in stats schema [\#2346](https://github.com/delta-io/delta-rs/pull/2346) ([qinix](https://github.com/qinix))
+- fix\(rust\): adhere to protocol for Decimal [\#2332](https://github.com/delta-io/delta-rs/pull/2332) ([ion-elgreco](https://github.com/ion-elgreco))
+- fix\(rust\): raise schema mismatch when decimal is not subset [\#2330](https://github.com/delta-io/delta-rs/pull/2330) ([ion-elgreco](https://github.com/ion-elgreco))
+- feat\(rust\): derive Copy on some public enums [\#2329](https://github.com/delta-io/delta-rs/pull/2329) ([lasantosr](https://github.com/lasantosr))
+- fix: merge pushdown handling [\#2326](https://github.com/delta-io/delta-rs/pull/2326) ([Blajda](https://github.com/Blajda))
+- fix: merge concurrency control [\#2324](https://github.com/delta-io/delta-rs/pull/2324) ([ion-elgreco](https://github.com/ion-elgreco))
+- Revert 2291 merge predicate fix [\#2323](https://github.com/delta-io/delta-rs/pull/2323) ([Blajda](https://github.com/Blajda))
+- fix: try to fix timeouts [\#2318](https://github.com/delta-io/delta-rs/pull/2318) ([ion-elgreco](https://github.com/ion-elgreco))
+- fix\(rust\): serialize MetricDetails from compaction runs to a string [\#2317](https://github.com/delta-io/delta-rs/pull/2317) ([liamphmurphy](https://github.com/liamphmurphy))
+- docs: add example in to\_pyarrow\_dataset [\#2315](https://github.com/delta-io/delta-rs/pull/2315) ([ion-elgreco](https://github.com/ion-elgreco))
+- fix\(python\): wrong batch size [\#2314](https://github.com/delta-io/delta-rs/pull/2314) ([ion-elgreco](https://github.com/ion-elgreco))
+- chore: object store 0.9.1 [\#2311](https://github.com/delta-io/delta-rs/pull/2311) ([ion-elgreco](https://github.com/ion-elgreco))
+- fix: checkpoint features format below v3,7 [\#2307](https://github.com/delta-io/delta-rs/pull/2307) ([ion-elgreco](https://github.com/ion-elgreco))
+- fix: schema evolution not coercing with large arrow types [\#2305](https://github.com/delta-io/delta-rs/pull/2305) ([aersam](https://github.com/aersam))
+- fix: clean up some non-datafusion builds [\#2303](https://github.com/delta-io/delta-rs/pull/2303) ([rtyler](https://github.com/rtyler))
+- docs: fix typo [\#2300](https://github.com/delta-io/delta-rs/pull/2300) ([LauH1987](https://github.com/LauH1987))
+- docs: make replaceWhere example compile [\#2299](https://github.com/delta-io/delta-rs/pull/2299) ([LauH1987](https://github.com/LauH1987))
+- fix\(rust\): add missing chrono-tz feature [\#2295](https://github.com/delta-io/delta-rs/pull/2295) ([ion-elgreco](https://github.com/ion-elgreco))
+- chore\(python\): bump to v0.16.1 [\#2294](https://github.com/delta-io/delta-rs/pull/2294) ([ion-elgreco](https://github.com/ion-elgreco))
+- fix\(rust\): features not maintained in protocol after checkpoint [\#2293](https://github.com/delta-io/delta-rs/pull/2293) ([ion-elgreco](https://github.com/ion-elgreco))
+- fix: merge predicate for concurrent writes [\#2291](https://github.com/delta-io/delta-rs/pull/2291) ([JonasDev1](https://github.com/JonasDev1))
+- fix: replace assert and AssertionError with appropriate exceptions [\#2286](https://github.com/delta-io/delta-rs/pull/2286) ([joe-sharman](https://github.com/joe-sharman))
+- docs: fix typo in delta-lake-polars.md [\#2285](https://github.com/delta-io/delta-rs/pull/2285) ([vladdoster](https://github.com/vladdoster))
+- fix\(python, rust\): prevent table scan returning large arrow dtypes [\#2274](https://github.com/delta-io/delta-rs/pull/2274) ([ion-elgreco](https://github.com/ion-elgreco))
+- fix\(python\): always encapsulate column names in backticks in \_all functions [\#2271](https://github.com/delta-io/delta-rs/pull/2271) ([ion-elgreco](https://github.com/ion-elgreco))
+- fix\(rust\): read only checkpoints that match \_last\_checkpoint version [\#2270](https://github.com/delta-io/delta-rs/pull/2270) ([ion-elgreco](https://github.com/ion-elgreco))
+- fix: add .venv to .gitignore [\#2268](https://github.com/delta-io/delta-rs/pull/2268) ([gacharya](https://github.com/gacharya))
+- feat\(python, rust\): add `set table properties` operation [\#2264](https://github.com/delta-io/delta-rs/pull/2264) ([ion-elgreco](https://github.com/ion-elgreco))
+- docs: use dagster deltalake polars library [\#2263](https://github.com/delta-io/delta-rs/pull/2263) ([avriiil](https://github.com/avriiil))
+- docs: update comment about r2 requiring locks [\#2261](https://github.com/delta-io/delta-rs/pull/2261) ([cmackenzie1](https://github.com/cmackenzie1))
+- fix\(\#2256\): use consistent units of time [\#2260](https://github.com/delta-io/delta-rs/pull/2260) ([cmackenzie1](https://github.com/cmackenzie1))
+- chore: update the changelog for rust-v0.17.1 [\#2259](https://github.com/delta-io/delta-rs/pull/2259) ([rtyler](https://github.com/rtyler))
+- feat\(python\): release GIL in the write\_deltalake function [\#2257](https://github.com/delta-io/delta-rs/pull/2257) ([franz101](https://github.com/franz101))
+- chore\(rust\): bump datafusion to 36 [\#2249](https://github.com/delta-io/delta-rs/pull/2249) ([universalmind303](https://github.com/universalmind303))
+- chore!: replace rusoto with AWS SDK [\#2243](https://github.com/delta-io/delta-rs/pull/2243) ([mightyshazam](https://github.com/mightyshazam))
+- fix: handle conflict checking in optimize correctly [\#2208](https://github.com/delta-io/delta-rs/pull/2208) ([emcake](https://github.com/emcake))
+- feat: logical Node for find files [\#2194](https://github.com/delta-io/delta-rs/pull/2194) ([hntd187](https://github.com/hntd187))
+
## [rust-v0.17.3](https://github.com/delta-io/delta-rs/tree/rust-v0.17.3) (2024-05-01)
[Full Changelog](https://github.com/delta-io/delta-rs/compare/rust-v0.17.1...rust-v0.17.3)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 4472a3640a..f681aa3948 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -17,34 +17,40 @@ If you want to claim an issue to work on, you can write the word `take` as a com
- Install Rust, e.g. as described [here](https://doc.rust-lang.org/cargo/getting-started/installation.html)
- Have a compatible Python version installed (check `python/pyproject.toml` for current requirement)
- Create a Python virtual environment (required for development builds), e.g. as described [here](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/)
+ ```sh
+ python -m venv .venv
+ ```
+
- Build the project for development (this requires an active virtual environment and will also install `deltalake` in that virtual environment)
-```
-cd python
-make develop
-```
+ ```sh
+ cd python
+ make develop
+ ```
- Run some Python code, e.g. to run a specific test
-```
-python -m pytest tests/test_writer.py -s -k "test_with_deltalake_schema"
-```
+ ```sh
+ python -m pytest tests/test_writer.py -s -k "test_with_deltalake_schema"
+ ```
- Run some Rust code, e.g. run an example
-```
-cd crates/deltalake
-cargo run --example basic_operations --features="datafusion"
-```
+ ```sh
+ cd crates/deltalake
+ cargo run --example basic_operations --features="datafusion"
+ ```
## Run the docs locally
-*This serves your local contens of docs via a web browser, handy for checking what they look like if you are making changes to docs or docstings*
-```
+*This serves your local contents of docs via a web browser, handy for checking what they look like if you are making changes to docs or docstrings*
+
+```sh
(cd python; make develop)
pip install -r docs/requirements.txt
mkdocs serve
```
## To make a pull request (PR)
-- Make sure all the following steps run/pass locally before submitting a PR
-```
+Make sure all the following steps run/pass locally before submitting a PR
+
+```sh
cargo fmt -- --check
cd python
make check-rust
@@ -62,7 +68,7 @@ make build-docs
- For debugging Rust code, install [CodeLLDB](https://marketplace.visualstudio.com/items?itemName=vadimcn.vscode-lldb). The extension should even create Debug launch configurations for the project if you allow it, an easy way to get started. Just set a breakpoint and run the relevant configuration.
- For debugging from Python into Rust, follow this procedure:
1. Add this to `.vscode/launch.json`
-```
+```json
{
"type": "lldb",
"request": "attach",
diff --git a/Cargo.toml b/Cargo.toml
index 6168a500fd..ccbb766e0f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,7 +5,7 @@ resolver = "2"
[workspace.package]
authors = ["Qingping Hou "]
-rust-version = "1.75"
+rust-version = "1.80"
keywords = ["deltalake", "delta", "datalake"]
readme = "README.md"
edition = "2021"
@@ -26,30 +26,34 @@ debug = true
debug = "line-tables-only"
[workspace.dependencies]
+delta_kernel = { version = "=0.3.0" }
+# delta_kernel = { path = "../delta-kernel-rs/kernel", version = "0.3.0" }
+
# arrow
-arrow = { version = "51" }
-arrow-arith = { version = "51" }
-arrow-array = { version = "51", features = ["chrono-tz"] }
-arrow-buffer = { version = "51" }
-arrow-cast = { version = "51" }
-arrow-ipc = { version = "51" }
-arrow-json = { version = "51" }
-arrow-ord = { version = "51" }
-arrow-row = { version = "51" }
-arrow-schema = { version = "51" }
-arrow-select = { version = "51" }
-object_store = { version = "0.9" }
-parquet = { version = "51" }
+arrow = { version = "52" }
+arrow-arith = { version = "52" }
+arrow-array = { version = "52", features = ["chrono-tz"] }
+arrow-buffer = { version = "52" }
+arrow-cast = { version = "52" }
+arrow-ipc = { version = "52" }
+arrow-json = { version = "52" }
+arrow-ord = { version = "52" }
+arrow-row = { version = "52" }
+arrow-schema = { version = "52" }
+arrow-select = { version = "52" }
+object_store = { version = "0.10.1" }
+parquet = { version = "52" }
# datafusion
-datafusion = { version = "37.1" }
-datafusion-expr = { version = "37.1" }
-datafusion-common = { version = "37.1" }
-datafusion-proto = { version = "37.1" }
-datafusion-sql = { version = "37.1" }
-datafusion-physical-expr = { version = "37.1" }
-datafusion-functions = { version = "37.1" }
-datafusion-functions-array = { version = "37.1" }
+datafusion = { version = "41" }
+datafusion-expr = { version = "41" }
+datafusion-common = { version = "41" }
+datafusion-proto = { version = "41" }
+datafusion-sql = { version = "41" }
+datafusion-physical-expr = { version = "41" }
+datafusion-physical-plan = { version = "41" }
+datafusion-functions = { version = "41" }
+datafusion-functions-aggregate = { version = "41" }
# serde
serde = { version = "1.0.194", features = ["derive"] }
@@ -62,6 +66,7 @@ tracing = { version = "0.1", features = ["log"] }
regex = { version = "1" }
thiserror = { version = "1" }
url = { version = "2" }
+urlencoding = "2.1.3"
uuid = { version = "1" }
# runtime / async
diff --git a/README.md b/README.md
index ec9a7d2d59..b7a26b8a42 100644
--- a/README.md
+++ b/README.md
@@ -36,6 +36,7 @@
+Delta Lake is an open-source storage format that runs on top of existing data lakes. Delta Lake is compatible with processing engines like Apache Spark and provides benefits such as ACID transaction guarantees, schema enforcement, and scalable data handling.
The Delta Lake project aims to unlock the power of the Deltalake for as many users and projects as possible
by providing native low-level APIs aimed at developers and integrators, as well as a high-level operations
@@ -135,12 +136,13 @@ of features outlined in the Delta [protocol][protocol] is also [tracked](#protoc
| -------------------- | :-----: | :-----: | ---------------------------------------------------------------- |
| Local | ![done] | ![done] | |
| S3 - AWS | ![done] | ![done] | requires lock for concurrent writes |
-| S3 - MinIO | ![done] | ![done] | requires lock for concurrent writes |
-| S3 - R2 | ![done] | ![done] | No lock required when using `AmazonS3ConfigKey::CopyIfNotExists` |
+| S3 - MinIO | ![done] | ![done] | No lock required when using `AmazonS3ConfigKey::ConditionalPut` with `storage_options = {"conditional_put":"etag"}` |
+| S3 - R2 | ![done] | ![done] | No lock required when using `AmazonS3ConfigKey::ConditionalPut` with `storage_options = {"conditional_put":"etag"}` |
| Azure Blob | ![done] | ![done] | |
| Azure ADLS Gen2 | ![done] | ![done] | |
| Microsoft OneLake | ![done] | ![done] | |
| Google Cloud Storage | ![done] | ![done] | |
+| HDFS | ![done] | ![done] | |
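As a concrete illustration of the new MinIO/R2 rows above, here is a minimal sketch using the `deltalake` Python package (the bucket and table path are hypothetical; the `storage_options` value mirrors what the table documents):

```python
import pandas as pd
from deltalake import write_deltalake

df = pd.DataFrame({"x": [1, 2, 3]})

# "conditional_put": "etag" enables atomic put-if-absent commits on
# MinIO and R2, so no external DynamoDB lock is required for
# concurrent writers.
write_deltalake(
    "s3://my-bucket/delta-table",  # hypothetical table URI
    df,
    storage_options={"conditional_put": "etag"},
)
```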
### Supported Operations
diff --git a/crates/aws/Cargo.toml b/crates/aws/Cargo.toml
index e6913a2162..992a32c93e 100644
--- a/crates/aws/Cargo.toml
+++ b/crates/aws/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "deltalake-aws"
-version = "0.1.2"
+version = "0.3.0"
authors.workspace = true
keywords.workspace = true
readme.workspace = true
@@ -12,19 +12,20 @@ repository.workspace = true
rust-version.workspace = true
[dependencies]
-deltalake-core = { version = ">=0.17.0, <0.19.0", path = "../core" }
-aws-smithy-runtime-api = { version="1.1.7" }
-aws-smithy-runtime = { version="1.1.7", optional = true}
-aws-credential-types = { version="1.1.7", features = ["hardcoded-credentials"]}
-aws-config = { version = "1.1.6", default-features = false, features = ["behavior-version-latest","rt-tokio", "credentials-process", "sso"] }
-aws-sdk-dynamodb = {version = "1.15.0", default-features = false, features = ["behavior-version-latest", "rt-tokio"] }
-aws-sdk-sts = {version = "1.1.6", default-features = false, features = ["behavior-version-latest", "rt-tokio"] }
+deltalake-core = { version = "0.20.0", path = "../core" }
+aws-smithy-runtime-api = { version="1.7" }
+aws-smithy-runtime = { version="1.7", optional = true}
+aws-credential-types = { version="1.2", features = ["hardcoded-credentials"]}
+aws-config = { version = "1.5", default-features = false, features = ["behavior-version-latest","rt-tokio", "credentials-process", "sso"] }
+aws-sdk-dynamodb = {version = "1.45", default-features = false, features = ["behavior-version-latest", "rt-tokio"] }
+aws-sdk-sts = {version = "1.42", default-features = false, features = ["behavior-version-latest", "rt-tokio"] }
lazy_static = "1"
maplit = "1"
# workspace dependencies
async-trait = { workspace = true }
bytes = { workspace = true }
+chrono = { workspace = true }
futures = { workspace = true }
tracing = { workspace = true }
object_store = { workspace = true, features = ["aws"]}
@@ -33,7 +34,7 @@ tokio = { workspace = true }
regex = { workspace = true }
uuid = { workspace = true, features = ["serde", "v4"] }
url = { workspace = true }
-backoff = { version = "0.4", features = [ "tokio" ] }
+backon = { version = "1",default-features = false, features = [ "tokio-sleep" ] }
hyper-tls = { version = "0.5", optional = true }
[dev-dependencies]
diff --git a/crates/aws/src/constants.rs b/crates/aws/src/constants.rs
new file mode 100644
index 0000000000..90c23ff572
--- /dev/null
+++ b/crates/aws/src/constants.rs
@@ -0,0 +1,141 @@
+//! Constants used for modifying and configuring various AWS S3 (or similar) connections with
+//! delta-rs
+//!
+
+use lazy_static::lazy_static;
+use std::time::Duration;
+
+/// Custom S3 endpoint.
+pub const AWS_ENDPOINT_URL: &str = "AWS_ENDPOINT_URL";
+/// Custom DynamoDB endpoint.
+/// If a DynamoDB endpoint is not supplied, the S3 endpoint (AWS_ENDPOINT_URL) is used.
+/// If it is supplied, it takes precedence over the global endpoint set in AWS_ENDPOINT_URL for DynamoDB.
+pub const AWS_ENDPOINT_URL_DYNAMODB: &str = "AWS_ENDPOINT_URL_DYNAMODB";
+/// The AWS region.
+pub const AWS_REGION: &str = "AWS_REGION";
+/// The AWS profile.
+pub const AWS_PROFILE: &str = "AWS_PROFILE";
+/// The AWS_ACCESS_KEY_ID to use for S3.
+pub const AWS_ACCESS_KEY_ID: &str = "AWS_ACCESS_KEY_ID";
+/// The AWS_SECRET_ACCESS_KEY to use for S3.
+pub const AWS_SECRET_ACCESS_KEY: &str = "AWS_SECRET_ACCESS_KEY";
+/// The AWS_SESSION_TOKEN to use for S3.
+pub const AWS_SESSION_TOKEN: &str = "AWS_SESSION_TOKEN";
+/// Uses either "path" (the default) or "virtual", which turns on
+/// [virtual host addressing](http://docs.aws.amazon.com/AmazonS3/latest/dev/VirtualHosting.html).
+pub const AWS_S3_ADDRESSING_STYLE: &str = "AWS_S3_ADDRESSING_STYLE";
+/// Locking provider to use for safe atomic rename.
+/// `dynamodb` is currently the only supported locking provider.
+/// If not set, safe atomic rename is not available.
+pub const AWS_S3_LOCKING_PROVIDER: &str = "AWS_S3_LOCKING_PROVIDER";
+/// The role to assume for S3 writes.
+pub const AWS_IAM_ROLE_ARN: &str = "AWS_IAM_ROLE_ARN";
+/// The role to assume. Please use [AWS_IAM_ROLE_ARN] instead
+#[deprecated(since = "0.20.0", note = "Please use AWS_IAM_ROLE_ARN instead")]
+pub const AWS_S3_ASSUME_ROLE_ARN: &str = "AWS_S3_ASSUME_ROLE_ARN";
+/// The role session name to use when a role is assumed. If not provided, a random session name is generated.
+pub const AWS_IAM_ROLE_SESSION_NAME: &str = "AWS_IAM_ROLE_SESSION_NAME";
+/// The role session name to use when a role is assumed. If not provided, a random session name is generated.
+#[deprecated(
+ since = "0.20.0",
+ note = "Please use AWS_IAM_ROLE_SESSION_NAME instead"
+)]
+pub const AWS_S3_ROLE_SESSION_NAME: &str = "AWS_S3_ROLE_SESSION_NAME";
+/// The `pool_idle_timeout` option of the AWS HTTP client. It has to be lower than 20 seconds, the
+/// default S3 server-side timeout.
+/// However, since rusoto uses hyper as a client, whose default idle timeout is 90 seconds,
+/// a `connection closed before message completed` error could otherwise occur.
+/// To avoid that, the default value of this setting is 15 seconds if it's not set otherwise.
+pub const AWS_S3_POOL_IDLE_TIMEOUT_SECONDS: &str = "AWS_S3_POOL_IDLE_TIMEOUT_SECONDS";
+/// The `pool_idle_timeout` for the STS client. See
+/// the reasoning in `AWS_S3_POOL_IDLE_TIMEOUT_SECONDS`.
+pub const AWS_STS_POOL_IDLE_TIMEOUT_SECONDS: &str = "AWS_STS_POOL_IDLE_TIMEOUT_SECONDS";
+/// The number of retries for S3 GET requests failed with 500 Internal Server Error.
+pub const AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES: &str =
+ "AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES";
+/// The web identity token file to use when using a web identity provider.
+/// NOTE: web identity related options are set in the environment when
+/// creating an instance of [crate::storage::s3::S3StorageOptions].
+pub const AWS_WEB_IDENTITY_TOKEN_FILE: &str = "AWS_WEB_IDENTITY_TOKEN_FILE";
+/// The role name to use for web identity.
+/// NOTE: web identity related options are set in the environment when
+/// creating an instance of [crate::storage::s3::S3StorageOptions].
+pub const AWS_ROLE_ARN: &str = "AWS_ROLE_ARN";
+/// The role session name to use for web identity.
+/// NOTE: web identity related options are set in the environment when
+/// creating an instance of [crate::storage::s3::S3StorageOptions].
+pub const AWS_ROLE_SESSION_NAME: &str = "AWS_ROLE_SESSION_NAME";
+/// Allow http connections - mainly useful for integration tests
+pub const AWS_ALLOW_HTTP: &str = "AWS_ALLOW_HTTP";
+
+/// If set to "true", allows creating commits without concurrent writer protection.
+/// Only safe if there is one writer to a given table.
+pub const AWS_S3_ALLOW_UNSAFE_RENAME: &str = "AWS_S3_ALLOW_UNSAFE_RENAME";
+
+/// If set to "true", disables the imds client
+/// Defaults to "true"
+pub const AWS_EC2_METADATA_DISABLED: &str = "AWS_EC2_METADATA_DISABLED";
+
+/// The timeout in milliseconds for the EC2 metadata endpoint.
+/// Defaults to 100.
+pub const AWS_EC2_METADATA_TIMEOUT: &str = "AWS_EC2_METADATA_TIMEOUT";
+
+/// Force delta-rs to attempt to load AWS credentials.
+pub const AWS_FORCE_CREDENTIAL_LOAD: &str = "AWS_FORCE_CREDENTIAL_LOAD";
+
+/// The list of option keys owned by the S3 module.
+/// Option keys not contained in this list will be added to the `extra_opts`
+/// field of [crate::storage::s3::S3StorageOptions].
+pub const S3_OPTS: &[&str] = &[
+ AWS_ENDPOINT_URL,
+ AWS_ENDPOINT_URL_DYNAMODB,
+ AWS_REGION,
+ AWS_PROFILE,
+ AWS_ACCESS_KEY_ID,
+ AWS_SECRET_ACCESS_KEY,
+ AWS_SESSION_TOKEN,
+ AWS_S3_LOCKING_PROVIDER,
+ AWS_S3_ASSUME_ROLE_ARN,
+ AWS_S3_ROLE_SESSION_NAME,
+ AWS_WEB_IDENTITY_TOKEN_FILE,
+ AWS_ROLE_ARN,
+ AWS_ROLE_SESSION_NAME,
+ AWS_S3_POOL_IDLE_TIMEOUT_SECONDS,
+ AWS_STS_POOL_IDLE_TIMEOUT_SECONDS,
+ AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES,
+ AWS_EC2_METADATA_DISABLED,
+ AWS_EC2_METADATA_TIMEOUT,
+];
+
+pub const DEFAULT_LOCK_TABLE_NAME: &str = "delta_log";
+pub const LOCK_TABLE_KEY_NAME: &str = "DELTA_DYNAMO_TABLE_NAME";
+pub const BILLING_MODE_KEY_NAME: &str = "DELTA_DYNAMO_BILLING_MODE";
+pub const MAX_ELAPSED_REQUEST_TIME_KEY_NAME: &str = "DELTA_DYNAMO_MAX_ELAPSED_REQUEST_TIME";
+
+pub const ATTR_TABLE_PATH: &str = "tablePath";
+pub const ATTR_FILE_NAME: &str = "fileName";
+pub const ATTR_TEMP_PATH: &str = "tempPath";
+pub const ATTR_COMPLETE: &str = "complete";
+pub const ATTR_EXPIRE_TIME: &str = "expireTime";
+
+pub const STRING_TYPE: &str = "S";
+
+pub const KEY_TYPE_HASH: &str = "HASH";
+pub const KEY_TYPE_RANGE: &str = "RANGE";
+
+lazy_static! {
+ pub static ref CONDITION_EXPR_CREATE: String = format!(
+ "attribute_not_exists({ATTR_TABLE_PATH}) and attribute_not_exists({ATTR_FILE_NAME})"
+ );
+
+ pub static ref CONDITION_DELETE_INCOMPLETE: String = format!(
+ "(complete = :f) or (attribute_not_exists({ATTR_TABLE_PATH}) and attribute_not_exists({ATTR_FILE_NAME}))"
+ );
+}
+
+pub const CONDITION_UPDATE_INCOMPLETE: &str = "complete = :f";
+pub const DEFAULT_COMMIT_ENTRY_EXPIRATION_DELAY: Duration = Duration::from_secs(86_400);
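These constants ultimately surface to users as environment variables or per-table storage options. A minimal sketch of wiring up the DynamoDB locking provider and the new DynamoDB endpoint override from Python (the table URI, lock table name, and local endpoint are hypothetical; this assumes the `deltalake` package forwards these keys via `storage_options`, as the AWS storage backend documents):

```python
import pandas as pd
from deltalake import write_deltalake

storage_options = {
    # enable the DynamoDB locking provider for safe concurrent commits
    "AWS_S3_LOCKING_PROVIDER": "dynamodb",
    # lock table name, read via DELTA_DYNAMO_TABLE_NAME (defaults to "delta_log")
    "DELTA_DYNAMO_TABLE_NAME": "my_delta_log",
    # optional: point the DynamoDB client somewhere other than AWS_ENDPOINT_URL,
    # e.g. a local DynamoDB instance
    "AWS_ENDPOINT_URL_DYNAMODB": "http://localhost:8000",
}

write_deltalake(
    "s3://my-bucket/delta-table",  # hypothetical table URI
    pd.DataFrame({"x": [1, 2, 3]}),
    storage_options=storage_options,
)
```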
diff --git a/crates/aws/src/credentials.rs b/crates/aws/src/credentials.rs
index 9ddf19b74c..71441bf05e 100644
--- a/crates/aws/src/credentials.rs
+++ b/crates/aws/src/credentials.rs
@@ -1,118 +1,259 @@
-use std::{sync::Arc, time::Duration};
-
-use aws_config::{
- ecs::EcsCredentialsProvider,
- environment::{EnvironmentVariableCredentialsProvider, EnvironmentVariableRegionProvider},
- imds::credentials::ImdsCredentialsProvider,
- meta::{credentials::CredentialsProviderChain, region::RegionProviderChain},
- profile::ProfileFileCredentialsProvider,
- provider_config::ProviderConfig,
- web_identity_token::WebIdentityTokenCredentialsProvider,
-};
-use aws_credential_types::provider::{self, ProvideCredentials};
-use tracing::Instrument;
+//! Custom AWS credential providers used by delta-rs
+//!
-const IMDS_PROVIDER_NAME: &str = "Ec2InstanceMetadata";
+use std::sync::Arc;
-#[derive(Debug)]
-pub struct ConfiguredCredentialChain {
- provider_chain: CredentialsProviderChain,
-}
+use aws_config::default_provider::credentials::DefaultCredentialsChain;
+use aws_config::meta::credentials::CredentialsProviderChain;
+use aws_config::sts::AssumeRoleProvider;
+use aws_config::SdkConfig;
+use aws_credential_types::provider::error::CredentialsError;
+use aws_credential_types::provider::{future, ProvideCredentials};
+use aws_credential_types::Credentials;
-#[derive(Debug)]
-pub struct NoOpCredentials {}
+use deltalake_core::storage::object_store::aws::{AmazonS3ConfigKey, AwsCredential};
+use deltalake_core::storage::object_store::{
+ CredentialProvider, Error as ObjectStoreError, Result as ObjectStoreResult,
+};
+use deltalake_core::storage::StorageOptions;
+use deltalake_core::DeltaResult;
+use tracing::log::*;
-pub fn new_region_provider(disable_imds: bool, imds_timeout: u64) -> RegionProviderChain {
- let env_provider = EnvironmentVariableRegionProvider::new();
- let profile_file = aws_config::profile::region::ProfileFileRegionProvider::default();
- if disable_imds {
- return RegionProviderChain::first_try(env_provider).or_else(profile_file);
- }
+use crate::constants::{self, AWS_ENDPOINT_URL};
- RegionProviderChain::first_try(env_provider)
- .or_else(profile_file)
- .or_else(
- aws_config::imds::region::Builder::default()
- .imds_client(
- aws_config::imds::Client::builder()
- .connect_timeout(Duration::from_millis(imds_timeout))
- .read_timeout(Duration::from_millis(imds_timeout))
- .build(),
- )
- .build(),
- )
+/// An [object_store::CredentialProvider] which handles converting a populated [SdkConfig]
+/// into the [AwsCredential] type needed for configuring [object_store::aws::AmazonS3]
+#[derive(Clone, Debug)]
+pub(crate) struct AWSForObjectStore {
+ sdk_config: SdkConfig,
}
-impl ConfiguredCredentialChain {
- pub fn new(disable_imds: bool, imds_timeout: u64, conf: &ProviderConfig) -> Self {
- let imds_provider = Self::build_imds_provider(conf, disable_imds, imds_timeout);
- let env_provider = EnvironmentVariableCredentialsProvider::default();
- let profile_provider = ProfileFileCredentialsProvider::builder()
- .configure(conf)
- .with_custom_provider(IMDS_PROVIDER_NAME, imds_provider.clone())
- .build();
- let web_identity_token_provider = WebIdentityTokenCredentialsProvider::builder()
- .configure(conf)
- .build();
-
- let ecs_provider = EcsCredentialsProvider::builder().configure(conf).build();
-
- let provider_chain = CredentialsProviderChain::first_try("Environment", env_provider)
- .or_else("Profile", profile_provider)
- .or_else("WebIdentityToken", web_identity_token_provider)
- .or_else("EcsContainer", ecs_provider)
- .or_else(IMDS_PROVIDER_NAME, imds_provider);
-
- Self { provider_chain }
+impl AWSForObjectStore {
+ pub(crate) fn new(sdk_config: SdkConfig) -> Self {
+ Self { sdk_config }
}
+}
- async fn credentials(&self) -> provider::Result {
- self.provider_chain
- .provide_credentials()
- .instrument(tracing::debug_span!("provide_credentials", provider = %"default_chain"))
- .await
+#[async_trait::async_trait]
+impl CredentialProvider for AWSForObjectStore {
+ type Credential = AwsCredential;
+
+ /// Provide the necessary configured credentials from the AWS SDK for use by
+ /// [object_store::aws::AmazonS3]
+ async fn get_credential(&self) -> ObjectStoreResult<Arc<Self::Credential>> {
+ let provider = self
+ .sdk_config
+ .credentials_provider()
+ .ok_or(ObjectStoreError::NotImplemented)?;
+ let credentials =
+ provider
+ .provide_credentials()
+ .await
+ .map_err(|e| ObjectStoreError::NotSupported {
+ source: Box::new(e),
+ })?;
+
+ debug!(
+ "CredentialProvider for Object Store using access key: {}",
+ credentials.access_key_id()
+ );
+
+ Ok(Arc::new(Self::Credential {
+ key_id: credentials.access_key_id().into(),
+ secret_key: credentials.secret_access_key().into(),
+ token: credentials.session_token().map(|o| o.to_string()),
+ }))
}
+}
- fn build_imds_provider(
- conf: &ProviderConfig,
- disable_imds: bool,
- imds_timeout: u64,
- ) -> Arc<dyn ProvideCredentials> {
- if disable_imds {
- return Arc::new(NoOpCredentials {});
- }
+/// Name of the [OptionsCredentialsProvider] for AWS SDK use
+const OPTS_PROVIDER: &str = "DeltaStorageOptionsProvider";
- let imds_provider = ImdsCredentialsProvider::builder()
- .configure(conf)
- .imds_client(
- aws_config::imds::Client::builder()
- .connect_timeout(Duration::from_millis(imds_timeout))
- .read_timeout(Duration::from_millis(imds_timeout))
- .build(),
- )
- .build();
- Arc::new(imds_provider)
+/// The [OptionsCredentialsProvider] helps users plug specific AWS credentials into their
+/// [StorageOptions] in such a way that the AWS SDK code will be properly
+/// loaded with those credentials before following the
+/// [aws_config::default_provider::credentials::DefaultCredentialsChain]
+#[derive(Clone, Debug)]
+pub(crate) struct OptionsCredentialsProvider {
+ options: StorageOptions,
+}
+
+impl OptionsCredentialsProvider {
+ /// Look at the options configured on the provider and return an appropriate
+ /// [Credentials] instance for AWS SDK credential resolution
+ fn credentials(&self) -> aws_credential_types::provider::Result {
+ debug!("Attempting to pull credentials from `StorageOptions`");
+ let access_key = self.options.0.get(constants::AWS_ACCESS_KEY_ID).ok_or(
+ CredentialsError::not_loaded("access key not in StorageOptions"),
+ )?;
+ let secret_key = self.options.0.get(constants::AWS_SECRET_ACCESS_KEY).ok_or(
+ CredentialsError::not_loaded("secret key not in StorageOptions"),
+ )?;
+ let session_token = self.options.0.get(constants::AWS_SESSION_TOKEN).cloned();
+
+ Ok(Credentials::new(
+ access_key,
+ secret_key,
+ session_token,
+ None,
+ OPTS_PROVIDER,
+ ))
}
}
-impl ProvideCredentials for ConfiguredCredentialChain {
- fn provide_credentials<'a>(
- &'a self,
- ) -> aws_credential_types::provider::future::ProvideCredentials<'a>
+impl ProvideCredentials for OptionsCredentialsProvider {
+ fn provide_credentials<'a>(&'a self) -> future::ProvideCredentials<'a>
where
Self: 'a,
{
- aws_credential_types::provider::future::ProvideCredentials::new(self.credentials())
+ future::ProvideCredentials::ready(self.credentials())
}
}
-impl ProvideCredentials for NoOpCredentials {
- fn provide_credentials<'a>(&'a self) -> provider::future::ProvideCredentials<'a>
- where
- Self: 'a,
- {
- aws_credential_types::provider::future::ProvideCredentials::new(std::future::ready(Err(
- provider::error::CredentialsError::not_loaded_no_source(),
- )))
+/// Generate a random session name for assuming IAM roles
+fn assume_role_session_name() -> String {
+ let now = chrono::Utc::now();
+
+ format!("delta-rs_{}", now.timestamp_millis())
+}
+
+/// Return the configured IAM role ARN or whatever is defined in the environment
+fn assume_role_arn(options: &StorageOptions) -> Option<String> {
+ options
+ .0
+ .get(constants::AWS_IAM_ROLE_ARN)
+ .or(options.0.get(constants::AWS_S3_ASSUME_ROLE_ARN))
+ .or(std::env::var_os(constants::AWS_IAM_ROLE_ARN)
+ .map(|o| {
+ o.into_string()
+ .expect("Failed to unwrap AWS_IAM_ROLE_ARN which may have invalid data")
+ })
+ .as_ref())
+ .or(std::env::var_os(constants::AWS_S3_ASSUME_ROLE_ARN)
+ .map(|o| {
+ o.into_string()
+ .expect("Failed to unwrap AWS_S3_ASSUME_ROLE_ARN which may have invalid data")
+ })
+ .as_ref())
+ .cloned()
+}
+
+/// Return the configured IAM assume role session name or provide a unique one
+fn assume_session_name(options: &StorageOptions) -> String {
+ let assume_session = options
+ .0
+ .get(constants::AWS_IAM_ROLE_SESSION_NAME)
+ .or(options.0.get(constants::AWS_S3_ROLE_SESSION_NAME))
+ .cloned();
+
+ match assume_session {
+ Some(s) => s,
+ None => assume_role_session_name(),
+ }
+}
+
+/// Take a set of [StorageOptions] and produce an appropriate AWS SDK [SdkConfig]
+/// for use with various AWS SDK APIs, such as in our [crate::logstore::S3DynamoDbLogStore]
+pub async fn resolve_credentials(options: StorageOptions) -> DeltaResult {
+ let default_provider = DefaultCredentialsChain::builder().build().await;
+
+ let credentials_provider = match assume_role_arn(&options) {
+ Some(arn) => {
+ debug!("Configuring AssumeRoleProvider with role arn: {arn}");
+ CredentialsProviderChain::first_try(
+ "AssumeRoleProvider",
+ AssumeRoleProvider::builder(arn)
+ .session_name(assume_session_name(&options))
+ .build()
+ .await,
+ )
+ .or_else(
+ "StorageOptions",
+ OptionsCredentialsProvider {
+ options: options.clone(),
+ },
+ )
+ .or_else("DefaultChain", default_provider)
+ }
+ None => CredentialsProviderChain::first_try(
+ "StorageOptions",
+ OptionsCredentialsProvider {
+ options: options.clone(),
+ },
+ )
+ .or_else("DefaultChain", default_provider),
+ };
+
+ Ok(aws_config::from_env()
+ .credentials_provider(credentials_provider)
+ .load()
+ .await)
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use crate::constants;
+ use maplit::hashmap;
+ use serial_test::serial;
+
+ #[tokio::test]
+ #[serial]
+ async fn test_options_credentials_provider() {
+ let options = StorageOptions(hashmap! {
+ constants::AWS_ACCESS_KEY_ID.to_string() => "test_id".to_string(),
+ constants::AWS_SECRET_ACCESS_KEY.to_string() => "test_secret".to_string(),
+ });
+
+ let config = resolve_credentials(options).await;
+ assert!(config.is_ok(), "{config:?}");
+ let config = config.unwrap();
+
+ if let Some(provider) = &config.credentials_provider() {
+ let credentials = provider
+ .provide_credentials()
+ .await
+ .expect("Failed to provide credentials");
+ assert_eq!(
+ "test_id",
+ credentials.access_key_id(),
+ "The access key should come from our options! {credentials:?}"
+ );
+ assert_eq!(
+ "test_secret",
+ credentials.secret_access_key(),
+ "The secret should come from our options! {credentials:?}"
+ );
+ } else {
+ panic!("Could not retrieve credentials from the SdkConfig: {config:?}");
+ }
+ }
+
+ #[tokio::test]
+ #[serial]
+ async fn test_options_credentials_provider_session_token() {
+ let options = StorageOptions(hashmap! {
+ constants::AWS_ACCESS_KEY_ID.to_string() => "test_id".to_string(),
+ constants::AWS_SECRET_ACCESS_KEY.to_string() => "test_secret".to_string(),
+ constants::AWS_SESSION_TOKEN.to_string() => "test_token".to_string(),
+ });
+
+ let config = resolve_credentials(options)
+ .await
+ .expect("Failed to resolve_credentials");
+
+ if let Some(provider) = &config.credentials_provider() {
+ let credentials = provider
+ .provide_credentials()
+ .await
+ .expect("Failed to provide credentials");
+ assert_eq!(
+ Some("test_token"),
+ credentials.session_token(),
+ "The session token should come from our options! {credentials:?}"
+ );
+ } else {
+ panic!("Could not retrieve credentials from the SdkConfig: {config:?}");
+ }
}
}
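The chain built by `resolve_credentials` tries explicitly supplied storage options before falling back to the SDK's default provider chain (with an assume-role hop in front when a role ARN is configured). From the Python bindings, that ordering means keys passed in `storage_options` win over profiles or instance metadata; a hedged sketch with placeholder values:

```python
from deltalake import DeltaTable

# Explicit keys take the "StorageOptions" branch of the credential chain;
# omit them to fall through to the AWS default chain (env, profile, IMDS, ...).
dt = DeltaTable(
    "s3://my-bucket/delta-table",  # hypothetical table URI
    storage_options={
        "AWS_ACCESS_KEY_ID": "...",      # placeholder
        "AWS_SECRET_ACCESS_KEY": "...",  # placeholder
        "AWS_SESSION_TOKEN": "...",      # optional placeholder
    },
)
print(dt.version())
```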
diff --git a/crates/aws/src/lib.rs b/crates/aws/src/lib.rs
index d179c37e68..ddb768bdd9 100644
--- a/crates/aws/src/lib.rs
+++ b/crates/aws/src/lib.rs
@@ -1,5 +1,9 @@
-//! Lock client implementation based on DynamoDb.
+//! AWS S3 and similar tooling for delta-rs
+//!
+//! This module also contains the [S3DynamoDbLogStore] implementation for concurrent writer support
+//! with AWS S3 specifically.
+pub mod constants;
mod credentials;
pub mod errors;
pub mod logstore;
@@ -7,6 +11,7 @@ pub mod logstore;
mod native;
pub mod storage;
use aws_config::SdkConfig;
+use aws_sdk_dynamodb::error::SdkError;
use aws_sdk_dynamodb::{
operation::{
create_table::CreateTableError, delete_item::DeleteItemError, get_item::GetItemError,
@@ -29,7 +34,7 @@ use std::{
};
use tracing::debug;
-use deltalake_core::logstore::{logstores, LogStore, LogStoreFactory};
+use deltalake_core::logstore::{default_logstore, logstores, LogStore, LogStoreFactory};
use deltalake_core::storage::{factories, url_prefix_handler, ObjectStoreRef, StorageOptions};
use deltalake_core::{DeltaResult, Path};
use url::Url;
@@ -49,23 +54,36 @@ impl LogStoreFactory for S3LogStoreFactory {
) -> DeltaResult> {
let store = url_prefix_handler(store, Path::parse(location.path())?);
- if options
- .0
- .contains_key(AmazonS3ConfigKey::CopyIfNotExists.as_ref())
- {
+ // With conditional put in S3-like APIs we can use the deltalake default logstore, which uses PutIfAbsent
+ if options.0.keys().any(|key| {
+ let key = key.to_ascii_lowercase();
+ vec![
+ AmazonS3ConfigKey::ConditionalPut.as_ref(),
+ "conditional_put",
+ ]
+ .contains(&key.as_str())
+ }) {
+ debug!("S3LogStoreFactory has been asked to create a default LogStore where the underlying store has Conditonal Put enabled - no locking provider required");
+ return Ok(default_logstore(store, location, options));
+ }
+
+ if options.0.keys().any(|key| {
+ let key = key.to_ascii_lowercase();
+ vec![
+ AmazonS3ConfigKey::CopyIfNotExists.as_ref(),
+ "copy_if_not_exists",
+ ]
+ .contains(&key.as_str())
+ }) {
debug!("S3LogStoreFactory has been asked to create a LogStore where the underlying store has copy-if-not-exists enabled - no locking provider required");
- return Ok(deltalake_core::logstore::default_logstore(
- store, location, options,
- ));
+ return Ok(logstore::default_s3_logstore(store, location, options));
}
let s3_options = S3StorageOptions::from_map(&options.0)?;
if s3_options.locking_provider.as_deref() != Some("dynamodb") {
debug!("S3LogStoreFactory has been asked to create a LogStore without the dynamodb locking provider");
- return Ok(deltalake_core::logstore::default_logstore(
- store, location, options,
- ));
+ return Ok(logstore::default_s3_logstore(store, location, options));
}
Ok(Arc::new(logstore::S3DynamoDbLogStore::try_new(
@@ -141,8 +159,12 @@ impl DynamoDbLockClient {
lock_table_name: Option<String>,
billing_mode: Option<String>,
max_elapsed_request_time: Option<String>,
+ dynamodb_override_endpoint: Option<String>,
) -> Result<Self, DynamoDbConfigError> {
- let dynamodb_client = aws_sdk_dynamodb::Client::new(sdk_config);
+ let dynamodb_sdk_config =
+ Self::create_dynamodb_sdk_config(sdk_config, dynamodb_override_endpoint);
+
+ let dynamodb_client = aws_sdk_dynamodb::Client::new(&dynamodb_sdk_config);
let lock_table_name = lock_table_name
.or_else(|| std::env::var(constants::LOCK_TABLE_KEY_NAME).ok())
@@ -177,6 +199,24 @@ impl DynamoDbLockClient {
config,
})
}
+ fn create_dynamodb_sdk_config(
+ sdk_config: &SdkConfig,
+ dynamodb_override_endpoint: Option<String>,
+ ) -> SdkConfig {
+ /*
+ If dynamodb_override_endpoint is set (i.e. AWS_ENDPOINT_URL_DYNAMODB was specified by the user),
+ use it as the endpoint when creating the DynamoDB client.
+ */
+
+ match dynamodb_override_endpoint {
+ Some(dynamodb_endpoint_url) => sdk_config
+ .to_owned()
+ .to_builder()
+ .endpoint_url(dynamodb_endpoint_url)
+ .build(),
+ None => sdk_config.to_owned(),
+ }
+ }
/// Create the lock table where DynamoDb stores the commit information for all delta tables.
///
@@ -256,28 +296,28 @@ impl DynamoDbLockClient {
version: i64,
) -> Result