diff --git a/.cargo/config b/.cargo/config.toml similarity index 100% rename from .cargo/config rename to .cargo/config.toml diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index b99809d1f6..736703c551 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,4 +1,4 @@ -crates/ @wjones127 @roeap @rtyler +crates/ @wjones127 @roeap @rtyler @hntd187 @ion-elgreco delta-inspect/ @wjones127 @rtyler proofs/ @houqp python/ @wjones127 @fvaleye @roeap @ion-elgreco diff --git a/.github/actions/setup-env/action.yml b/.github/actions/setup-env/action.yml new file mode 100644 index 0000000000..7875107ddd --- /dev/null +++ b/.github/actions/setup-env/action.yml @@ -0,0 +1,34 @@ +name: "Setup Python and Rust Environment" +description: "Set up Python, virtual environment, and Rust toolchain" + +inputs: + + python-version: + description: "The Python version to set up" + required: true + default: "3.10" + + rust-toolchain: + description: "The Rust toolchain to set up" + required: true + default: "stable" + +runs: + using: "composite" + + steps: + + - name: Set up Python ${{ inputs.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ inputs.python-version }} + + - name: Install Rust toolchain + uses: actions-rs/toolchain@v1 + with: + profile: default + toolchain: ${{ inputs.rust-toolchain }} + override: true + components: rustfmt, clippy + + - uses: Swatinem/rust-cache@v2 \ No newline at end of file diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 220c5b21d9..93b3cbdc3e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -5,6 +5,7 @@ on: branches: [main, "rust-v*"] pull_request: branches: [main, "rust-v*"] + merge_group: jobs: format: @@ -28,7 +29,6 @@ jobs: matrix: os: - ubuntu-latest - - macos-11 - windows-latest runs-on: ${{ matrix.os }} @@ -42,16 +42,14 @@ jobs: toolchain: stable override: true - - uses: Swatinem/rust-cache@v2 - - name: build and lint with clippy - run: cargo clippy --features azure,datafusion,s3,gcs,glue --tests + run: cargo clippy --features azure,datafusion,s3,gcs,glue,hdfs --tests - name: Spot-check build for native-tls features run: cargo clippy --no-default-features --features azure,datafusion,s3-native-tls,gcs,glue --tests - name: Check docs - run: cargo doc --features azure,datafusion,s3,gcs,glue + run: cargo doc --features azure,datafusion,s3,gcs,glue,hdfs - name: Check no default features (except rustls) run: cargo check --no-default-features --features rustls @@ -62,7 +60,6 @@ jobs: matrix: os: - ubuntu-latest - - macos-11 - windows-latest runs-on: ${{ matrix.os }} env: @@ -82,8 +79,6 @@ jobs: toolchain: "stable" override: true - - uses: Swatinem/rust-cache@v2 - - name: Run tests run: cargo test --verbose --features datafusion,azure @@ -118,28 +113,24 @@ jobs: toolchain: stable override: true - # - uses: actions/setup-java@v3 - # with: - # distribution: "zulu" - # java-version: "17" - - # - uses: beyondstorage/setup-hdfs@master - # with: - # hdfs-version: "3.3.2" - - # - name: Set Hadoop env - # run: | - # echo "CLASSPATH=$CLASSPATH:`hadoop classpath --glob`" >> $GITHUB_ENV - # echo "LD_LIBRARY_PATH=$JAVA_HOME/lib/server" >> $GITHUB_ENV + # Install Java and Hadoop for HDFS integration tests + - uses: actions/setup-java@v4 + with: + distribution: "temurin" + java-version: "17" - - uses: Swatinem/rust-cache@v2 + - name: Download Hadoop + run: | + wget -q https://dlcdn.apache.org/hadoop/common/hadoop-3.4.0/hadoop-3.4.0.tar.gz + tar -xf hadoop-3.4.0.tar.gz -C $GITHUB_WORKSPACE + echo 
"$GITHUB_WORKSPACE/hadoop-3.4.0/bin" >> $GITHUB_PATH - name: Start emulated services - run: docker-compose up -d + run: docker compose up -d - name: Run tests with rustls (default) run: | - cargo test --features integration_test,azure,s3,gcs,datafusion + cargo test --features integration_test,azure,s3,gcs,datafusion,hdfs - name: Run tests with native-tls run: | diff --git a/.github/workflows/dev_pr.yml b/.github/workflows/dev_pr.yml index 6b3d5a7ddb..121e0b8882 100644 --- a/.github/workflows/dev_pr.yml +++ b/.github/workflows/dev_pr.yml @@ -2,6 +2,7 @@ name: dev_pr # Trigger whenever a PR is changed (title as well as new / changed commits) on: + merge_group: pull_request_target: types: - opened diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 079cd66fcc..5729b87624 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -1,6 +1,7 @@ name: Build (and maybe release) the documentation on: + merge_group: pull_request: paths: - python/** @@ -31,9 +32,9 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - - uses: psf/black@stable - with: - src: docs/src/python + - run: | + cd docs + make check build-deploy: needs: @@ -47,25 +48,13 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Install Rust - uses: actions-rs/toolchain@v1 - with: - toolchain: stable - override: true - components: rustfmt, clippy - - - uses: Swatinem/rust-cache@v2 - - - name: Set up Python - uses: actions/setup-python@v3 - with: - python-version: '3.10' + - name: Setup Environment + uses: ./.github/actions/setup-env - name: Build and install deltalake run: | cd python - pip install virtualenv - virtualenv venv + python -m venv venv source venv/bin/activate make ${{ env.BUILD_ARGS }} diff --git a/.github/workflows/python_build.yml b/.github/workflows/python_build.yml index bc2f20cc9a..ce2a7e0bfd 100644 --- a/.github/workflows/python_build.yml +++ b/.github/workflows/python_build.yml @@ -1,6 +1,7 @@ name: python_build on: + merge_group: push: branches: [main] pull_request: @@ -15,23 +16,17 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - - name: Setup Python - uses: actions/setup-python@v2 - with: - python-version: 3.8 + + - name: Setup Environment + uses: ./.github/actions/setup-env - name: Check Python run: | - pip install ruff black mypy types-dataclasses typing-extensions + python -m venv venv + source venv/bin/activate + pip install ruff==0.5.2 mypy==1.10.1 types-dataclasses typing-extensions make check-python - - name: Install minimal stable with clippy and rustfmt - uses: actions-rs/toolchain@v1 - with: - profile: default - toolchain: stable - override: true - - name: Check Rust run: make check-rust @@ -45,24 +40,14 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Setup Python - uses: actions/setup-python@v2 + - name: Setup Environment + uses: ./.github/actions/setup-env with: python-version: 3.8 - - name: Install latest nightly - uses: actions-rs/toolchain@v1 - with: - toolchain: stable - override: true - components: rustfmt, clippy - - - uses: Swatinem/rust-cache@v2 - - name: Build and install deltalake run: | - pip install virtualenv - virtualenv venv + python -m venv venv source venv/bin/activate make setup # Install minimum PyArrow version @@ -89,26 +74,15 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Install latest nightly - uses: actions-rs/toolchain@v1 - with: - toolchain: stable - override: true - components: rustfmt, clippy - - - uses: Swatinem/rust-cache@v2 - - - uses: actions/setup-python@v3 - with: - 
python-version: "3.10" + - name: Setup Environment + uses: ./.github/actions/setup-env - name: Start emulated services - run: docker-compose up -d + run: docker compose up -d - name: Build and install deltalake run: | - pip install virtualenv - virtualenv venv + python -m venv venv source venv/bin/activate make develop @@ -137,23 +111,12 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Install latest nightly - uses: actions-rs/toolchain@v1 - with: - toolchain: stable - override: true - components: rustfmt, clippy - - - uses: Swatinem/rust-cache@v2 - - - uses: actions/setup-python@v4 - with: - python-version: "3.10" + - name: Setup Environment + uses: ./.github/actions/setup-env - name: Build deltalake in release mode run: | - pip install virtualenv - virtualenv venv + python -m venv venv source venv/bin/activate MATURIN_EXTRA_ARGS=--release make develop @@ -187,18 +150,8 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Install latest nightly - uses: actions-rs/toolchain@v1 - with: - toolchain: stable - override: true - components: rustfmt, clippy - - - uses: Swatinem/rust-cache@v2 - - - uses: actions/setup-python@v3 - with: - python-version: "3.10" + - name: Setup Environment + uses: ./.github/actions/setup-env - uses: actions/setup-java@v2 with: @@ -207,8 +160,7 @@ jobs: - name: Build and install deltalake run: | - pip install virtualenv - virtualenv venv + python -m venv venv source venv/bin/activate make develop-pyspark @@ -231,15 +183,14 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + - name: Setup Environment + uses: ./.github/actions/setup-env with: python-version: ${{ matrix.python-version }} - name: Build and install deltalake run: | - pip install virtualenv - virtualenv venv + python -m venv venv source venv/bin/activate make setup maturin develop diff --git a/.github/workflows/python_release.yml b/.github/workflows/python_release.yml index 48611bacb4..46b4230af1 100644 --- a/.github/workflows/python_release.yml +++ b/.github/workflows/python_release.yml @@ -35,7 +35,7 @@ jobs: fail-fast: false matrix: target: [x86_64-apple-darwin, aarch64-apple-darwin] - runs-on: macos-12 + runs-on: macos-14 steps: - uses: actions/checkout@v3 diff --git a/.gitignore b/.gitignore index ca0576b47c..18dcc39f69 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,8 @@ tlaplus/*.toolbox/*/[0-9]*-[0-9]*-[0-9]*-[0-9]*-[0-9]*-[0-9]*/ /.idea .vscode .env +.venv +venv **/.DS_Store **/.python-version .coverage @@ -20,6 +22,7 @@ __blobstorage__ .githubchangeloggenerator.cache.log .githubchangeloggenerator.cache/ .githubchangeloggenerator* +data # Add all Cargo.lock files except for those in binary crates Cargo.lock diff --git a/CHANGELOG.md b/CHANGELOG.md index 922a49f47e..9161a320c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,689 @@ # Changelog +## [rust-v0.18.2](https://github.com/delta-io/delta-rs/tree/rust-v0.18.2) (2024-08-07) + +[Full Changelog](https://github.com/delta-io/delta-rs/compare/rust-v0.18.1...rust-v0.18.2) + +**Implemented enhancements:** + +- Choose which columns to store min/max values for [\#2709](https://github.com/delta-io/delta-rs/issues/2709) +- Projection pushdown for load\_cdf [\#2681](https://github.com/delta-io/delta-rs/issues/2681) +- Way to check if Delta table exists at specified path [\#2662](https://github.com/delta-io/delta-rs/issues/2662) +- Support HDFS via hdfs-native package [\#2611](https://github.com/delta-io/delta-rs/issues/2611) +- Deletion `_change_type` does 
not appear in change data feed [\#2579](https://github.com/delta-io/delta-rs/issues/2579) +- Could you please explain in the README what "Deltalake" is for the uninitiated? [\#2523](https://github.com/delta-io/delta-rs/issues/2523) +- Discuss: Allow protocol change during write actions [\#2444](https://github.com/delta-io/delta-rs/issues/2444) +- Support for Arrow PyCapsule interface [\#2376](https://github.com/delta-io/delta-rs/issues/2376) + +**Fixed bugs:** + +- Slow add\_actions.to\_pydict for tables with large number of columns, impacting read performance [\#2733](https://github.com/delta-io/delta-rs/issues/2733) +- append is deleting records [\#2716](https://github.com/delta-io/delta-rs/issues/2716) +- segmentation fault - Python 3.10 on Mac M3 [\#2706](https://github.com/delta-io/delta-rs/issues/2706) +- Failure to delete dir and files [\#2703](https://github.com/delta-io/delta-rs/issues/2703) +- DeltaTable.from\_data\_catalog not working [\#2699](https://github.com/delta-io/delta-rs/issues/2699) +- Project should use the same version of `ruff` in the `lint` stage of `python_build.yml` as in `pyproject.toml` [\#2678](https://github.com/delta-io/delta-rs/issues/2678) +- un-tracked columns are giving json error when pyarrow schema have field with nullable=False and create\_checkpoint is triggered [\#2675](https://github.com/delta-io/delta-rs/issues/2675) +- \[BUG\]write\_delta\({'custom\_metadata':str}\) cannot be converted. str to pyDict error \(0.18.2\_DeltaPython/Windows10\) [\#2697](https://github.com/delta-io/delta-rs/issues/2697) +- Pyarrow engine not supporting schema overwrite with Append mode [\#2654](https://github.com/delta-io/delta-rs/issues/2654) +- `deltalake-core` version re-exported by `deltalake` different than versions used by `deltalake-azure` and `deltalake-gcp` [\#2647](https://github.com/delta-io/delta-rs/issues/2647) +- i32 limit in JSON stats [\#2646](https://github.com/delta-io/delta-rs/issues/2646) +- Rust writer not encoding correct URL for partitions in delta table [\#2634](https://github.com/delta-io/delta-rs/issues/2634) +- Large Types breaks merge predicate pruning [\#2632](https://github.com/delta-io/delta-rs/issues/2632) +- Getting error when converting a partitioned parquet table to delta table [\#2626](https://github.com/delta-io/delta-rs/issues/2626) +- Arrow: Parquet does not support writing empty structs when creating checkpoint [\#2622](https://github.com/delta-io/delta-rs/issues/2622) +- InvalidTableLocation\("Unknown scheme: gs"\) on 0.18.0 [\#2610](https://github.com/delta-io/delta-rs/issues/2610) +- Unable to read delta table created using Uniform [\#2578](https://github.com/delta-io/delta-rs/issues/2578) +- schema merging doesn't work when overwriting with a predicate [\#2567](https://github.com/delta-io/delta-rs/issues/2567) +- Not working in AWS Lambda \(0.16.2 - 0.17.4\) OSError: Generic S3 error [\#2511](https://github.com/delta-io/delta-rs/issues/2511) +- DataFusion filter on partition column doesn't work. \(when the physical schema ordering is different to logical one\) [\#2494](https://github.com/delta-io/delta-rs/issues/2494) +- Creating checkpoints for tables with missing column stats results in Err [\#2493](https://github.com/delta-io/delta-rs/issues/2493) +- Cannot merge to a table with a timestamp column after upgrading delta-rs [\#2478](https://github.com/delta-io/delta-rs/issues/2478) +- Azure AD Auth fails on ARM64 [\#2475](https://github.com/delta-io/delta-rs/issues/2475) +- Generic S3 error: Error after 0 retries ... 
Broken pipe \(os error 32\) [\#2403](https://github.com/delta-io/delta-rs/issues/2403) +- write\_deltalake identifies large\_string as datatype even though string is set in schema [\#2374](https://github.com/delta-io/delta-rs/issues/2374) +- Inconsistent arrow timestamp type breaks datafusion query [\#2341](https://github.com/delta-io/delta-rs/issues/2341) + +**Closed issues:** + +- Unable to write new partitions with type timestamp on tables created with delta-rs 0.10.0 [\#2631](https://github.com/delta-io/delta-rs/issues/2631) + +**Merged pull requests:** + +- fix: schema adapter doesn't map partial batches correctly [\#2735](https://github.com/delta-io/delta-rs/pull/2735) ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) +- perf: grab file size in rust [\#2734](https://github.com/delta-io/delta-rs/pull/2734) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat: use logical plan in update, refactor/simplify CDCTracker [\#2727](https://github.com/delta-io/delta-rs/pull/2727) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat: use logical plan in delete, delta planner refactoring [\#2725](https://github.com/delta-io/delta-rs/pull/2725) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore: try an alternative docke compose invocation syntax [\#2724](https://github.com/delta-io/delta-rs/pull/2724) ([rtyler](https://github.com/rtyler)) +- fix\(python, rust\): use input schema to get correct schema in cdf reads [\#2723](https://github.com/delta-io/delta-rs/pull/2723) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat\(python, rust\): cdc write-support for `overwrite` and `replacewhere` writes [\#2722](https://github.com/delta-io/delta-rs/pull/2722) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat\(python, rust\): cdc write-support for `delete` operation [\#2721](https://github.com/delta-io/delta-rs/pull/2721) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore: enabling actions for merge groups [\#2718](https://github.com/delta-io/delta-rs/pull/2718) ([rtyler](https://github.com/rtyler)) +- perf: apply projection when reading checkpoint parquet [\#2717](https://github.com/delta-io/delta-rs/pull/2717) ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) +- feat\(python\): add DeltaTable.is\_deltatable static method \(\#2662\) [\#2715](https://github.com/delta-io/delta-rs/pull/2715) ([omkar-foss](https://github.com/omkar-foss)) +- chore: prepare python release 0.18.3 [\#2707](https://github.com/delta-io/delta-rs/pull/2707) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(python, rust\): use url encoder when encoding partition values [\#2705](https://github.com/delta-io/delta-rs/pull/2705) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat\(python, rust\): add projection in CDF reads [\#2704](https://github.com/delta-io/delta-rs/pull/2704) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: ensure DataFusion SessionState Parquet options are applied to DeltaScan [\#2702](https://github.com/delta-io/delta-rs/pull/2702) ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) +- chore: refactor `write_deltalake` in `writer.py` [\#2695](https://github.com/delta-io/delta-rs/pull/2695) ([fpgmaas](https://github.com/fpgmaas)) +- fix\(python\): empty dataset fix for "pyarrow" engine [\#2689](https://github.com/delta-io/delta-rs/pull/2689) ([sherlockbeard](https://github.com/sherlockbeard)) +- chore: add test coverage command to `Makefile` [\#2688](https://github.com/delta-io/delta-rs/pull/2688) ([fpgmaas](https://github.com/fpgmaas)) +- 
chore: create separate action to setup python and rust in the cicd pipeline [\#2687](https://github.com/delta-io/delta-rs/pull/2687) ([fpgmaas](https://github.com/fpgmaas)) +- fix: update delta kernel version [\#2685](https://github.com/delta-io/delta-rs/pull/2685) ([jeppe742](https://github.com/jeppe742)) +- chore: update README.md [\#2684](https://github.com/delta-io/delta-rs/pull/2684) ([veronewra](https://github.com/veronewra)) +- fix\(rust,python\): checkpoint with column nullable false [\#2680](https://github.com/delta-io/delta-rs/pull/2680) ([sherlockbeard](https://github.com/sherlockbeard)) +- chore: pin `ruff` and `mypy` versions in the `lint` stage in the CI pipeline [\#2679](https://github.com/delta-io/delta-rs/pull/2679) ([fpgmaas](https://github.com/fpgmaas)) +- chore: enable `RUF` ruleset for `ruff` [\#2677](https://github.com/delta-io/delta-rs/pull/2677) ([fpgmaas](https://github.com/fpgmaas)) +- chore: remove stale code for conditional import of `Literal` [\#2676](https://github.com/delta-io/delta-rs/pull/2676) ([fpgmaas](https://github.com/fpgmaas)) +- chore: remove references to black from the project [\#2674](https://github.com/delta-io/delta-rs/pull/2674) ([fpgmaas](https://github.com/fpgmaas)) +- chore: bump ruff to 0.5.2 [\#2673](https://github.com/delta-io/delta-rs/pull/2673) ([fpgmaas](https://github.com/fpgmaas)) +- chore: improve contributing.md [\#2672](https://github.com/delta-io/delta-rs/pull/2672) ([fpgmaas](https://github.com/fpgmaas)) +- feat: support userMetadata in CommitInfo [\#2670](https://github.com/delta-io/delta-rs/pull/2670) ([jkylling](https://github.com/jkylling)) +- chore: upgrade to datafusion 40 [\#2661](https://github.com/delta-io/delta-rs/pull/2661) ([rtyler](https://github.com/rtyler)) +- docs: improve navigation fixes [\#2660](https://github.com/delta-io/delta-rs/pull/2660) ([avriiil](https://github.com/avriiil)) +- docs: add integration docs for s3 backend [\#2658](https://github.com/delta-io/delta-rs/pull/2658) ([avriiil](https://github.com/avriiil)) +- docs: fix bullets on hdfs docs [\#2653](https://github.com/delta-io/delta-rs/pull/2653) ([Kimahriman](https://github.com/Kimahriman)) +- ci: update CODEOWNERS [\#2650](https://github.com/delta-io/delta-rs/pull/2650) ([hntd187](https://github.com/hntd187)) +- feat\(rust\): fix size\_in\_bytes in last\_checkpoint\_ to i64 [\#2649](https://github.com/delta-io/delta-rs/pull/2649) ([sherlockbeard](https://github.com/sherlockbeard)) +- chore: increase subcrate versions [\#2648](https://github.com/delta-io/delta-rs/pull/2648) ([rtyler](https://github.com/rtyler)) +- chore: missed one macos runner reference in actions [\#2645](https://github.com/delta-io/delta-rs/pull/2645) ([rtyler](https://github.com/rtyler)) +- chore: add a reproduction case for merge failures with struct\ [\#2644](https://github.com/delta-io/delta-rs/pull/2644) ([rtyler](https://github.com/rtyler)) +- chore: remove macos builders from pull request flow [\#2638](https://github.com/delta-io/delta-rs/pull/2638) ([rtyler](https://github.com/rtyler)) +- fix: enable parquet pushdown for DeltaScan via TableProvider impl for DeltaTable \(rebase\) [\#2637](https://github.com/delta-io/delta-rs/pull/2637) ([rtyler](https://github.com/rtyler)) +- chore: fix documentation generation with a pin of griffe [\#2636](https://github.com/delta-io/delta-rs/pull/2636) ([rtyler](https://github.com/rtyler)) +- fix\(python\): fixed large\_dtype to schema convert [\#2635](https://github.com/delta-io/delta-rs/pull/2635) 
([sherlockbeard](https://github.com/sherlockbeard)) +- fix\(rust, python\): fix writing empty structs when creating checkpoint [\#2627](https://github.com/delta-io/delta-rs/pull/2627) ([sherlockbeard](https://github.com/sherlockbeard)) +- fix\(rust, python\): fix merge schema with overwrite [\#2623](https://github.com/delta-io/delta-rs/pull/2623) ([sherlockbeard](https://github.com/sherlockbeard)) +- chore: bump python 0.18.2 [\#2621](https://github.com/delta-io/delta-rs/pull/2621) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat: report DataFusion metrics for DeltaScan [\#2617](https://github.com/delta-io/delta-rs/pull/2617) ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) +- feat\(rust,python\): cast each parquet file to delta schema [\#2615](https://github.com/delta-io/delta-rs/pull/2615) ([HawaiianSpork](https://github.com/HawaiianSpork)) +- fix\(rust\): inconsistent order of partitioning columns \(\#2494\) [\#2614](https://github.com/delta-io/delta-rs/pull/2614) ([aditanase](https://github.com/aditanase)) +- docs: add Daft writer [\#2594](https://github.com/delta-io/delta-rs/pull/2594) ([avriiil](https://github.com/avriiil)) +- feat\(python, rust\): `add column` operation [\#2562](https://github.com/delta-io/delta-rs/pull/2562) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: change arrow map root name to follow with parquet root name [\#2538](https://github.com/delta-io/delta-rs/pull/2538) ([sclmn](https://github.com/sclmn)) +- feat\(python\): handle PyCapsule interface objects in write\_deltalake [\#2534](https://github.com/delta-io/delta-rs/pull/2534) ([kylebarron](https://github.com/kylebarron)) +- feat: improve merge performance by using predicate non-partition columns min/max for prefiltering [\#2513](https://github.com/delta-io/delta-rs/pull/2513) ([JonasDev1](https://github.com/JonasDev1)) +- feat\(python, rust\): cleanup expired logs post-commit hook [\#2459](https://github.com/delta-io/delta-rs/pull/2459) ([ion-elgreco](https://github.com/ion-elgreco)) + +## [rust-v0.18.0](https://github.com/delta-io/delta-rs/tree/rust-v0.18.0) (2024-06-12) + +[Full Changelog](https://github.com/delta-io/delta-rs/compare/rust-v0.17.3...rust-v0.18.0) + +**Implemented enhancements:** + +- documentation: concurrent writes for non-S3 backends [\#2556](https://github.com/delta-io/delta-rs/issues/2556) +- pyarrow options for `write_delta` [\#2515](https://github.com/delta-io/delta-rs/issues/2515) +- \[deltalake\_aws\] Allow configuring separate endpoints for S3 and DynamoDB clients. 
[\#2498](https://github.com/delta-io/delta-rs/issues/2498) +- Include file stats when converting a parquet directory to a Delta table [\#2490](https://github.com/delta-io/delta-rs/issues/2490) +- Adopt the delta kernel types [\#2489](https://github.com/delta-io/delta-rs/issues/2489) + +**Fixed bugs:** + +- `raise_if_not_exists` for properties not configurable on CreateBuilder [\#2564](https://github.com/delta-io/delta-rs/issues/2564) +- write\_deltalake with rust engine fails when mode is append and overwrite schema is enabled [\#2553](https://github.com/delta-io/delta-rs/issues/2553) +- Running the basic\_operations examples fails with `Error: Transaction { source: WriterFeaturesRequired(TimestampWithoutTimezone) `} [\#2552](https://github.com/delta-io/delta-rs/issues/2552) +- invalid peer certificate: BadSignature when connecting to s3 from arm64/aarch64 [\#2551](https://github.com/delta-io/delta-rs/issues/2551) +- load\_cdf\(\) issue : Generic S3 error: request or response body error: operation timed out [\#2549](https://github.com/delta-io/delta-rs/issues/2549) +- write\_deltalake fails on Databricks volume [\#2540](https://github.com/delta-io/delta-rs/issues/2540) +- Getting "Microsoft Azure Error: Operation timed out" when trying to retrieve big files [\#2537](https://github.com/delta-io/delta-rs/issues/2537) +- Impossible to append to a DeltaTable with float data type on RHEL [\#2520](https://github.com/delta-io/delta-rs/issues/2520) +- Creating DeltaTable object slow [\#2518](https://github.com/delta-io/delta-rs/issues/2518) +- `write_deltalake` throws parser error when using `rust` engine and big decimals [\#2510](https://github.com/delta-io/delta-rs/issues/2510) +- TypeError: Object of type int64 is not JSON serializable when writing using a Pandas dataframe [\#2501](https://github.com/delta-io/delta-rs/issues/2501) +- unable to read delta table when table contains both null and non-null add stats [\#2477](https://github.com/delta-io/delta-rs/issues/2477) +- Commits on WriteMode::MergeSchema cause table metadata corruption [\#2468](https://github.com/delta-io/delta-rs/issues/2468) +- S3 object store always returns IMDS warnings [\#2460](https://github.com/delta-io/delta-rs/issues/2460) +- File skipping according to documentation [\#2427](https://github.com/delta-io/delta-rs/issues/2427) +- LockClientError [\#2379](https://github.com/delta-io/delta-rs/issues/2379) +- get\_app\_transaction\_version\(\) returns wrong result [\#2340](https://github.com/delta-io/delta-rs/issues/2340) +- Property setting in `create` is not handled correctly [\#2247](https://github.com/delta-io/delta-rs/issues/2247) +- Handling of decimals in scientific notation [\#2221](https://github.com/delta-io/delta-rs/issues/2221) +- Unable to append to delta table without datafusion feature [\#2204](https://github.com/delta-io/delta-rs/issues/2204) +- Decimal Column with Value 0 Causes Failure in Python Binding [\#2193](https://github.com/delta-io/delta-rs/issues/2193) + +**Merged pull requests:** + +- docs: improve S3 access docs [\#2589](https://github.com/delta-io/delta-rs/pull/2589) ([avriiil](https://github.com/avriiil)) +- chore: bump macOS runners, maybe resolve import error [\#2588](https://github.com/delta-io/delta-rs/pull/2588) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore: bump to datafusion 39, arrow 52, pyo3 0.21 [\#2581](https://github.com/delta-io/delta-rs/pull/2581) ([abhiaagarwal](https://github.com/abhiaagarwal)) +- feat: add custom dynamodb endpoint configuration 
[\#2575](https://github.com/delta-io/delta-rs/pull/2575) ([hnaoto](https://github.com/hnaoto)) +- fix: consistently use raise\_if\_key\_not\_exists in CreateBuilder [\#2569](https://github.com/delta-io/delta-rs/pull/2569) ([vegarsti](https://github.com/vegarsti)) +- fix: add raise\_if\_key\_not\_exists to CreateBuilder [\#2565](https://github.com/delta-io/delta-rs/pull/2565) ([vegarsti](https://github.com/vegarsti)) +- docs: dt.delete add context + api docs link [\#2560](https://github.com/delta-io/delta-rs/pull/2560) ([avriiil](https://github.com/avriiil)) +- fix: update deltalake crate examples for crate layout and TimestampNtz [\#2559](https://github.com/delta-io/delta-rs/pull/2559) ([jhoekx](https://github.com/jhoekx)) +- docs: clarify locking mechanism requirement for S3 [\#2558](https://github.com/delta-io/delta-rs/pull/2558) ([inigohidalgo](https://github.com/inigohidalgo)) +- fix: remove deprecated overwrite\_schema configuration which has incorrect behavior [\#2554](https://github.com/delta-io/delta-rs/pull/2554) ([rtyler](https://github.com/rtyler)) +- fix: clippy warnings [\#2548](https://github.com/delta-io/delta-rs/pull/2548) ([imor](https://github.com/imor)) +- docs: dask write syntax fix [\#2543](https://github.com/delta-io/delta-rs/pull/2543) ([avriiil](https://github.com/avriiil)) +- fix: cast support fields nested in lists and maps [\#2541](https://github.com/delta-io/delta-rs/pull/2541) ([HawaiianSpork](https://github.com/HawaiianSpork)) +- feat: implement transaction identifiers - continued [\#2539](https://github.com/delta-io/delta-rs/pull/2539) ([roeap](https://github.com/roeap)) +- docs: pull delta from conda not pip [\#2535](https://github.com/delta-io/delta-rs/pull/2535) ([avriiil](https://github.com/avriiil)) +- chore: expose `files_by_partition` to public api [\#2533](https://github.com/delta-io/delta-rs/pull/2533) ([edmondop](https://github.com/edmondop)) +- chore: bump python 0.17.5 [\#2531](https://github.com/delta-io/delta-rs/pull/2531) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat\(rust\): make PartitionWriter public [\#2525](https://github.com/delta-io/delta-rs/pull/2525) ([adriangb](https://github.com/adriangb)) +- fix: msrv in workspace [\#2524](https://github.com/delta-io/delta-rs/pull/2524) ([roeap](https://github.com/roeap)) +- chore: fixing some clips [\#2521](https://github.com/delta-io/delta-rs/pull/2521) ([rtyler](https://github.com/rtyler)) +- fix: enable field\_with\_name to support nested fields with '.' 
delimiter [\#2519](https://github.com/delta-io/delta-rs/pull/2519) ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) +- chore: tidying up builds without datafusion feature and clippy [\#2516](https://github.com/delta-io/delta-rs/pull/2516) ([rtyler](https://github.com/rtyler)) +- fix\(python\): release GIL on most operations [\#2512](https://github.com/delta-io/delta-rs/pull/2512) ([adriangb](https://github.com/adriangb)) +- docs: fix typo [\#2508](https://github.com/delta-io/delta-rs/pull/2508) ([avriiil](https://github.com/avriiil)) +- fix\(rust, python\): fixed differences in storage options between log and object stores [\#2500](https://github.com/delta-io/delta-rs/pull/2500) ([mightyshazam](https://github.com/mightyshazam)) +- docs: improve daft integration docs [\#2496](https://github.com/delta-io/delta-rs/pull/2496) ([avriiil](https://github.com/avriiil)) +- feat: adopt kernel schema types [\#2495](https://github.com/delta-io/delta-rs/pull/2495) ([roeap](https://github.com/roeap)) +- feat: add stats to convert-to-delta operation [\#2491](https://github.com/delta-io/delta-rs/pull/2491) ([gruuya](https://github.com/gruuya)) +- fix\(python, rust\): region lookup wasn't working correctly for dynamo [\#2488](https://github.com/delta-io/delta-rs/pull/2488) ([mightyshazam](https://github.com/mightyshazam)) +- feat: introduce CDC write-side support for the Update operations [\#2486](https://github.com/delta-io/delta-rs/pull/2486) ([rtyler](https://github.com/rtyler)) +- fix\(python\): reuse state in `to_pyarrow_dataset` [\#2485](https://github.com/delta-io/delta-rs/pull/2485) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: check to see if the file exists before attempting to rename [\#2482](https://github.com/delta-io/delta-rs/pull/2482) ([rtyler](https://github.com/rtyler)) +- fix\(python, rust\): use new schema for stats parsing instead of old [\#2480](https://github.com/delta-io/delta-rs/pull/2480) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(rust\): unable to read delta table when table contains both null and non-null add stats [\#2476](https://github.com/delta-io/delta-rs/pull/2476) ([yjshen](https://github.com/yjshen)) +- chore: update the changelog to include rust-v0.17.3 [\#2473](https://github.com/delta-io/delta-rs/pull/2473) ([rtyler](https://github.com/rtyler)) +- chore: a bunch of tweaks to get releases out the door [\#2472](https://github.com/delta-io/delta-rs/pull/2472) ([rtyler](https://github.com/rtyler)) +- chore: bump the core crate for its next release [\#2470](https://github.com/delta-io/delta-rs/pull/2470) ([rtyler](https://github.com/rtyler)) +- fix: return unsupported error for merging schemas in the presence of partition columns [\#2469](https://github.com/delta-io/delta-rs/pull/2469) ([emcake](https://github.com/emcake)) +- feat\(python\): add parameter to DeltaTable.to\_pyarrow\_dataset\(\) [\#2465](https://github.com/delta-io/delta-rs/pull/2465) ([adriangb](https://github.com/adriangb)) +- feat\(python, rust\): add OBJECT\_STORE\_CONCURRENCY\_LIMIT setting for ObjectStoreFactory [\#2458](https://github.com/delta-io/delta-rs/pull/2458) ([vigimite](https://github.com/vigimite)) +- fix\(rust\): handle 429 from GCS [\#2454](https://github.com/delta-io/delta-rs/pull/2454) ([adriangb](https://github.com/adriangb)) +- fix\(python\): reuse table state in write engine [\#2453](https://github.com/delta-io/delta-rs/pull/2453) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(rust\): implement abort commit for S3DynamoDBLogStore 
[\#2452](https://github.com/delta-io/delta-rs/pull/2452) ([PeterKeDer](https://github.com/PeterKeDer)) +- fix\(python, rust\): check timestamp\_ntz in nested fields, add check\_can\_write in pyarrow writer [\#2443](https://github.com/delta-io/delta-rs/pull/2443) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(python, rust\): remove imds calls from profile auth and region [\#2442](https://github.com/delta-io/delta-rs/pull/2442) ([mightyshazam](https://github.com/mightyshazam)) +- fix\(python, rust\): use from\_name during column projection creation [\#2441](https://github.com/delta-io/delta-rs/pull/2441) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore: bump python for 0.17 release [\#2439](https://github.com/delta-io/delta-rs/pull/2439) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(python,rust\): missing remove actions during `create_or_replace` [\#2437](https://github.com/delta-io/delta-rs/pull/2437) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore: introduce the Operation trait to enforce consistency between operations [\#2435](https://github.com/delta-io/delta-rs/pull/2435) ([rtyler](https://github.com/rtyler)) +- fix\(python\): load\_as\_version with datetime object with no timezone specified [\#2429](https://github.com/delta-io/delta-rs/pull/2429) ([t1g0rz](https://github.com/t1g0rz)) +- feat\(python, rust\): respect column stats collection configurations [\#2428](https://github.com/delta-io/delta-rs/pull/2428) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat: lazy static runtime in python [\#2424](https://github.com/delta-io/delta-rs/pull/2424) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat: implement repartitioned for DeltaScan [\#2421](https://github.com/delta-io/delta-rs/pull/2421) ([jkylling](https://github.com/jkylling)) +- fix: return error when checkpoints and metadata get out of sync [\#2406](https://github.com/delta-io/delta-rs/pull/2406) ([esarili](https://github.com/esarili)) +- fix\(rust\): stats\_parsed has different number of records with stats [\#2405](https://github.com/delta-io/delta-rs/pull/2405) ([yjshen](https://github.com/yjshen)) +- docs: add Daft integration [\#2402](https://github.com/delta-io/delta-rs/pull/2402) ([avriiil](https://github.com/avriiil)) +- feat\(rust\): advance state in post commit [\#2396](https://github.com/delta-io/delta-rs/pull/2396) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore\(rust\): bump arrow v51 and datafusion v37.1 [\#2395](https://github.com/delta-io/delta-rs/pull/2395) ([lasantosr](https://github.com/lasantosr)) +- docs: document required aws permissions [\#2393](https://github.com/delta-io/delta-rs/pull/2393) ([ale-rinaldi](https://github.com/ale-rinaldi)) +- feat\(rust\): post commit hook \(v2\), create checkpoint hook [\#2391](https://github.com/delta-io/delta-rs/pull/2391) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: time travel when checkpointed and logs removed [\#2389](https://github.com/delta-io/delta-rs/pull/2389) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(rust\): remove flush after writing every batch [\#2387](https://github.com/delta-io/delta-rs/pull/2387) ([PeterKeDer](https://github.com/PeterKeDer)) +- feat: added configuration variables to handle EC2 metadata service [\#2385](https://github.com/delta-io/delta-rs/pull/2385) ([mightyshazam](https://github.com/mightyshazam)) +- fix\(rust\): timestamp deserialization format, missing type [\#2383](https://github.com/delta-io/delta-rs/pull/2383) 
([ion-elgreco](https://github.com/ion-elgreco)) +- chore: bump chrono [\#2372](https://github.com/delta-io/delta-rs/pull/2372) ([universalmind303](https://github.com/universalmind303)) +- chore: bump python 0.16.4 [\#2371](https://github.com/delta-io/delta-rs/pull/2371) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: add snappy compression on checkpoint files [\#2365](https://github.com/delta-io/delta-rs/pull/2365) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: add config for parquet pushdown on delta scan [\#2364](https://github.com/delta-io/delta-rs/pull/2364) ([Blajda](https://github.com/Blajda)) +- fix\(python,rust\): optimize compact on schema evolved table [\#2358](https://github.com/delta-io/delta-rs/pull/2358) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(python, rust\): expr parsing date/timestamp [\#2357](https://github.com/delta-io/delta-rs/pull/2357) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: remove tmp files in cleanup\_metadata [\#2356](https://github.com/delta-io/delta-rs/pull/2356) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: make struct fields nullable in stats schema [\#2346](https://github.com/delta-io/delta-rs/pull/2346) ([qinix](https://github.com/qinix)) +- fix\(rust\): adhere to protocol for Decimal [\#2332](https://github.com/delta-io/delta-rs/pull/2332) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(rust\): raise schema mismatch when decimal is not subset [\#2330](https://github.com/delta-io/delta-rs/pull/2330) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat\(rust\): derive Copy on some public enums [\#2329](https://github.com/delta-io/delta-rs/pull/2329) ([lasantosr](https://github.com/lasantosr)) +- fix: merge pushdown handling [\#2326](https://github.com/delta-io/delta-rs/pull/2326) ([Blajda](https://github.com/Blajda)) +- fix: merge concurrency control [\#2324](https://github.com/delta-io/delta-rs/pull/2324) ([ion-elgreco](https://github.com/ion-elgreco)) +- Revert 2291 merge predicate fix [\#2323](https://github.com/delta-io/delta-rs/pull/2323) ([Blajda](https://github.com/Blajda)) +- fix: try to fix timeouts [\#2318](https://github.com/delta-io/delta-rs/pull/2318) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(rust\): serialize MetricDetails from compaction runs to a string [\#2317](https://github.com/delta-io/delta-rs/pull/2317) ([liamphmurphy](https://github.com/liamphmurphy)) +- docs: add example in to\_pyarrow\_dataset [\#2315](https://github.com/delta-io/delta-rs/pull/2315) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(python\): wrong batch size [\#2314](https://github.com/delta-io/delta-rs/pull/2314) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore: object store 0.9.1 [\#2311](https://github.com/delta-io/delta-rs/pull/2311) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: checkpoint features format below v3,7 [\#2307](https://github.com/delta-io/delta-rs/pull/2307) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: schema evolution not coercing with large arrow types [\#2305](https://github.com/delta-io/delta-rs/pull/2305) ([aersam](https://github.com/aersam)) +- fix: clean up some non-datafusion builds [\#2303](https://github.com/delta-io/delta-rs/pull/2303) ([rtyler](https://github.com/rtyler)) +- docs: fix typo [\#2300](https://github.com/delta-io/delta-rs/pull/2300) ([LauH1987](https://github.com/LauH1987)) +- docs: make replaceWhere example compile [\#2299](https://github.com/delta-io/delta-rs/pull/2299) 
([LauH1987](https://github.com/LauH1987)) +- fix\(rust\): add missing chrono-tz feature [\#2295](https://github.com/delta-io/delta-rs/pull/2295) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore\(python\): bump to v0.16.1 [\#2294](https://github.com/delta-io/delta-rs/pull/2294) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(rust\): features not maintained in protocol after checkpoint [\#2293](https://github.com/delta-io/delta-rs/pull/2293) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: merge predicate for concurrent writes [\#2291](https://github.com/delta-io/delta-rs/pull/2291) ([JonasDev1](https://github.com/JonasDev1)) +- fix: replace assert and AssertionError with appropriate exceptions [\#2286](https://github.com/delta-io/delta-rs/pull/2286) ([joe-sharman](https://github.com/joe-sharman)) +- docs: fix typo in delta-lake-polars.md [\#2285](https://github.com/delta-io/delta-rs/pull/2285) ([vladdoster](https://github.com/vladdoster)) +- fix\(python, rust\): prevent table scan returning large arrow dtypes [\#2274](https://github.com/delta-io/delta-rs/pull/2274) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(python\): always encapsulate column names in backticks in \_all functions [\#2271](https://github.com/delta-io/delta-rs/pull/2271) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(rust\): read only checkpoints that match \_last\_checkpoint version [\#2270](https://github.com/delta-io/delta-rs/pull/2270) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: add .venv to .gitignore [\#2268](https://github.com/delta-io/delta-rs/pull/2268) ([gacharya](https://github.com/gacharya)) +- feat\(python, rust\): add `set table properties` operation [\#2264](https://github.com/delta-io/delta-rs/pull/2264) ([ion-elgreco](https://github.com/ion-elgreco)) +- docs: use dagster deltalake polars library [\#2263](https://github.com/delta-io/delta-rs/pull/2263) ([avriiil](https://github.com/avriiil)) +- docs: update comment about r2 requiring locks [\#2261](https://github.com/delta-io/delta-rs/pull/2261) ([cmackenzie1](https://github.com/cmackenzie1)) +- fix\(\#2256\): use consistent units of time [\#2260](https://github.com/delta-io/delta-rs/pull/2260) ([cmackenzie1](https://github.com/cmackenzie1)) +- chore: update the changelog for rust-v0.17.1 [\#2259](https://github.com/delta-io/delta-rs/pull/2259) ([rtyler](https://github.com/rtyler)) +- feat\(python\): release GIL in the write\_deltalake function [\#2257](https://github.com/delta-io/delta-rs/pull/2257) ([franz101](https://github.com/franz101)) +- chore\(rust\): bump datafusion to 36 [\#2249](https://github.com/delta-io/delta-rs/pull/2249) ([universalmind303](https://github.com/universalmind303)) +- chore!: replace rusoto with AWS SDK [\#2243](https://github.com/delta-io/delta-rs/pull/2243) ([mightyshazam](https://github.com/mightyshazam)) +- fix: handle conflict checking in optimize correctly [\#2208](https://github.com/delta-io/delta-rs/pull/2208) ([emcake](https://github.com/emcake)) +- feat: logical Node for find files [\#2194](https://github.com/delta-io/delta-rs/pull/2194) ([hntd187](https://github.com/hntd187)) + +## [rust-v0.17.3](https://github.com/delta-io/delta-rs/tree/rust-v0.17.3) (2024-05-01) + +[Full Changelog](https://github.com/delta-io/delta-rs/compare/rust-v0.17.1...rust-v0.17.3) + +**Implemented enhancements:** + +- Limit concurrent ObjectStore access to avoid resource limitations in constrained environments [\#2457](https://github.com/delta-io/delta-rs/issues/2457) +- How to get a 
DataFrame in Rust? [\#2404](https://github.com/delta-io/delta-rs/issues/2404) +- Allow checkpoint creation when partition column is "timestampNtz " [\#2381](https://github.com/delta-io/delta-rs/issues/2381) +- is there a way to make writing timestamp\_ntz optional [\#2339](https://github.com/delta-io/delta-rs/issues/2339) +- Update arrow dependency [\#2328](https://github.com/delta-io/delta-rs/issues/2328) +- Release GIL in deltalake.write\_deltalake [\#2234](https://github.com/delta-io/delta-rs/issues/2234) +- Unable to retrieve custom metadata from tables in rust [\#2153](https://github.com/delta-io/delta-rs/issues/2153) +- Refactor commit interface to be a Builder [\#2131](https://github.com/delta-io/delta-rs/issues/2131) + +**Fixed bugs:** + +- Handle rate limiting during write contention [\#2451](https://github.com/delta-io/delta-rs/issues/2451) +- regression : delta.logRetentionDuration don't seems to be respected [\#2447](https://github.com/delta-io/delta-rs/issues/2447) +- Issue writing to mounted storage in AKS using delta-rs library [\#2445](https://github.com/delta-io/delta-rs/issues/2445) +- TableMerger - when\_matched\_delete\(\) fails when Column names contain special characters [\#2438](https://github.com/delta-io/delta-rs/issues/2438) +- Generic DeltaTable error: External error: Arrow error: Invalid argument error: arguments need to have the same data type - while merge data into delta table [\#2423](https://github.com/delta-io/delta-rs/issues/2423) +- Merge on predicate throw error on date column: Unable to convert expression to string [\#2420](https://github.com/delta-io/delta-rs/issues/2420) +- Writing Tables with Append mode errors if the schema metadata is different [\#2419](https://github.com/delta-io/delta-rs/issues/2419) +- Logstore issues on AWS Lambda [\#2410](https://github.com/delta-io/delta-rs/issues/2410) +- Datafusion timestamp type doesn't respect delta lake schema [\#2408](https://github.com/delta-io/delta-rs/issues/2408) +- Compacting produces smaller row groups than expected [\#2386](https://github.com/delta-io/delta-rs/issues/2386) +- ValueError: Partition value cannot be parsed from string. 
[\#2380](https://github.com/delta-io/delta-rs/issues/2380) +- Very slow s3 connection after 0.16.1 [\#2377](https://github.com/delta-io/delta-rs/issues/2377) +- Merge update+insert truncates a delta table if the table is big enough [\#2362](https://github.com/delta-io/delta-rs/issues/2362) +- Do not add readerFeatures or writerFeatures keys under checkpoint files if minReaderVersion or minWriterVersion do not satisfy the requirements [\#2360](https://github.com/delta-io/delta-rs/issues/2360) +- Create empty table failed on rust engine [\#2354](https://github.com/delta-io/delta-rs/issues/2354) +- Getting error message when running in lambda: message: "Too many open files" [\#2353](https://github.com/delta-io/delta-rs/issues/2353) +- Temporary files filling up \_delta\_log folder - increasing table load time [\#2351](https://github.com/delta-io/delta-rs/issues/2351) +- compact fails with merged schemas [\#2347](https://github.com/delta-io/delta-rs/issues/2347) +- Cannot merge into table partitioned by date type column on 0.16.3 [\#2344](https://github.com/delta-io/delta-rs/issues/2344) +- Merge breaks using logical datatype decimal128 [\#2343](https://github.com/delta-io/delta-rs/issues/2343) +- Decimal types are not checked against max precision/scale at table creation [\#2331](https://github.com/delta-io/delta-rs/issues/2331) +- Merge update+insert truncates a delta table [\#2320](https://github.com/delta-io/delta-rs/issues/2320) +- Extract `add.stats_parsed` with wrong type [\#2312](https://github.com/delta-io/delta-rs/issues/2312) +- Process fails without error message when executing merge [\#2310](https://github.com/delta-io/delta-rs/issues/2310) +- delta\_rs don't seems to respect the row group size [\#2309](https://github.com/delta-io/delta-rs/issues/2309) +- Auth error when running inside VS Code [\#2306](https://github.com/delta-io/delta-rs/issues/2306) +- Unable to read deltatables with binary columns: Binary is not supported by JSON [\#2302](https://github.com/delta-io/delta-rs/issues/2302) +- Schema evolution not coercing with Large arrow types [\#2298](https://github.com/delta-io/delta-rs/issues/2298) +- Panic in `deltalake_core::kernel::snapshot::log_segment::list_log_files_with_checkpoint::{{closure}}` [\#2290](https://github.com/delta-io/delta-rs/issues/2290) +- Checkpoint does not preserve reader and writer features for the table protocol. 
[\#2288](https://github.com/delta-io/delta-rs/issues/2288) +- Z-Order with larger dataset resulting in memory error [\#2284](https://github.com/delta-io/delta-rs/issues/2284) +- Successful writes return error when using concurrent writers [\#2279](https://github.com/delta-io/delta-rs/issues/2279) +- Rust writer should raise when decimal types are incompatible \(currently writers and puts table in invalid state\) [\#2275](https://github.com/delta-io/delta-rs/issues/2275) +- Generic DeltaTable error: Version mismatch with new schema merge functionality in AWS S3 [\#2262](https://github.com/delta-io/delta-rs/issues/2262) +- DeltaTable is not resilient to corrupted checkpoint state [\#2258](https://github.com/delta-io/delta-rs/issues/2258) +- Inconsistent units of time [\#2256](https://github.com/delta-io/delta-rs/issues/2256) +- Partition column comparison is an assertion rather than if block with raise exception [\#2242](https://github.com/delta-io/delta-rs/issues/2242) +- Unable to merge column names starting from numbers [\#2230](https://github.com/delta-io/delta-rs/issues/2230) +- Merging to a table with multiple distinct partitions in parallel fails [\#2227](https://github.com/delta-io/delta-rs/issues/2227) +- cleanup\_metadata not respecting custom `logRetentionDuration` [\#2180](https://github.com/delta-io/delta-rs/issues/2180) +- Merge predicate fails with a field with a space [\#2167](https://github.com/delta-io/delta-rs/issues/2167) +- When\_matched\_update causes records to be lost with explicit predicate [\#2158](https://github.com/delta-io/delta-rs/issues/2158) +- Merge execution time grows exponentially with the number of columns [\#2107](https://github.com/delta-io/delta-rs/issues/2107) +- \_internal.DeltaError when merging [\#2084](https://github.com/delta-io/delta-rs/issues/2084) + +## [rust-v0.17.1](https://github.com/delta-io/delta-rs/tree/rust-v0.17.1) (2024-03-06) + +[Full Changelog](https://github.com/delta-io/delta-rs/compare/rust-v0.17.0...rust-v0.17.1) + +**Implemented enhancements:** + +- Get statistics metadata [\#2233](https://github.com/delta-io/delta-rs/issues/2233) +- add option to append only a subset of columns [\#2212](https://github.com/delta-io/delta-rs/issues/2212) +- add documentation how to configure delta.logRetentionDuration [\#2072](https://github.com/delta-io/delta-rs/issues/2072) +- Add `drop constraint` [\#2070](https://github.com/delta-io/delta-rs/issues/2070) +- Add 0.16 deprecation warnings for DynamoDB lock [\#2049](https://github.com/delta-io/delta-rs/issues/2049) + +**Fixed bugs:** + +- cleanup\_metadata not respecting custom `logRetentionDuration` [\#2180](https://github.com/delta-io/delta-rs/issues/2180) +- Rust writer panics on empty record batches [\#2253](https://github.com/delta-io/delta-rs/issues/2253) +- DeltaLake executed Rust: write method not found in `DeltaOps` [\#2244](https://github.com/delta-io/delta-rs/issues/2244) +- DELTA\_FILE\_PATTERN regex is incorrectly matching tmp commit files [\#2201](https://github.com/delta-io/delta-rs/issues/2201) +- Failed to create checkpoint with "Parquet does not support writing empty structs" [\#2189](https://github.com/delta-io/delta-rs/issues/2189) +- Error when parsing delete expressions [\#2187](https://github.com/delta-io/delta-rs/issues/2187) +- terminate called without an active exception [\#2184](https://github.com/delta-io/delta-rs/issues/2184) +- Now conda-installable on M1 [\#2178](https://github.com/delta-io/delta-rs/issues/2178) +- Add error message for partition\_by check 
[\#2177](https://github.com/delta-io/delta-rs/issues/2177) +- deltalake 0.15.2 prints partitions\_values and paths which is not desired [\#2176](https://github.com/delta-io/delta-rs/issues/2176) +- cleanup\_metadata can potentially delete most recent checkpoint, corrupting table [\#2174](https://github.com/delta-io/delta-rs/issues/2174) +- Broken filter for newly created delta table [\#2169](https://github.com/delta-io/delta-rs/issues/2169) +- Hash for StructField should consider more than the name [\#2045](https://github.com/delta-io/delta-rs/issues/2045) +- Schema comparaison in writer [\#1853](https://github.com/delta-io/delta-rs/issues/1853) +- fix\(python\): sort before schema comparison [\#2209](https://github.com/delta-io/delta-rs/pull/2209) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: prevent writing checkpoints with a version that does not exist in table state [\#1863](https://github.com/delta-io/delta-rs/pull/1863) ([rtyler](https://github.com/rtyler)) + +**Closed issues:** + +- Bug/Question: arrow's`FixedSizeList` is not roundtrippable [\#2162](https://github.com/delta-io/delta-rs/issues/2162) + +**Merged pull requests:** + +- fix: fixes panic on empty write [\#2254](https://github.com/delta-io/delta-rs/pull/2254) ([aersam](https://github.com/aersam)) +- fix\(rust\): typo deletionvectors [\#2251](https://github.com/delta-io/delta-rs/pull/2251) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(rust\): make interval parsing compatible with plural form [\#2250](https://github.com/delta-io/delta-rs/pull/2250) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore: bump to 0.16 [\#2248](https://github.com/delta-io/delta-rs/pull/2248) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat: merge schema support for the write operation and Python [\#2246](https://github.com/delta-io/delta-rs/pull/2246) ([rtyler](https://github.com/rtyler)) +- fix: object\_store 0.9.0 since 0.9.1 causes CI failure [\#2245](https://github.com/delta-io/delta-rs/pull/2245) ([aersam](https://github.com/aersam)) +- chore\(python\): bump version [\#2241](https://github.com/delta-io/delta-rs/pull/2241) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: fix ruff and mypy version and do formatting [\#2240](https://github.com/delta-io/delta-rs/pull/2240) ([aersam](https://github.com/aersam)) +- feat\(python, rust\): timestampNtz support [\#2236](https://github.com/delta-io/delta-rs/pull/2236) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore: clean up some compilation failures and un-ignore some tests [\#2231](https://github.com/delta-io/delta-rs/pull/2231) ([rtyler](https://github.com/rtyler)) +- docs: fixing example in CONTRIBUTING.md [\#2224](https://github.com/delta-io/delta-rs/pull/2224) ([gacharya](https://github.com/gacharya)) +- perf: directly create projection instead of using DataFrame::with\_column [\#2222](https://github.com/delta-io/delta-rs/pull/2222) ([emcake](https://github.com/emcake)) +- chore: remove caches from github actions [\#2215](https://github.com/delta-io/delta-rs/pull/2215) ([rtyler](https://github.com/rtyler)) +- fix: `is_commit_file` should only catch commit jsons [\#2213](https://github.com/delta-io/delta-rs/pull/2213) ([emcake](https://github.com/emcake)) +- chore: fix the Cargo.tomls to publish information properly on docs.rs [\#2211](https://github.com/delta-io/delta-rs/pull/2211) ([rtyler](https://github.com/rtyler)) +- fix\(writer\): retry storage.put on temporary network errors [\#2207](https://github.com/delta-io/delta-rs/pull/2207) 
([qinix](https://github.com/qinix)) +- fix: canonicalize config keys [\#2206](https://github.com/delta-io/delta-rs/pull/2206) ([emcake](https://github.com/emcake)) +- docs: update README code samples for newer versions [\#2202](https://github.com/delta-io/delta-rs/pull/2202) ([jhoekx](https://github.com/jhoekx)) +- docs: dask integration fix formatting typo [\#2196](https://github.com/delta-io/delta-rs/pull/2196) ([avriiil](https://github.com/avriiil)) +- fix: add data\_type and nullable to StructField hash \(\#2045\) [\#2190](https://github.com/delta-io/delta-rs/pull/2190) ([sonhmai](https://github.com/sonhmai)) +- fix: removed panic in method [\#2185](https://github.com/delta-io/delta-rs/pull/2185) ([mightyshazam](https://github.com/mightyshazam)) +- feat: implement string representation for PartitionFilter [\#2183](https://github.com/delta-io/delta-rs/pull/2183) ([sonhmai](https://github.com/sonhmai)) +- fix: correct map field names [\#2182](https://github.com/delta-io/delta-rs/pull/2182) ([emcake](https://github.com/emcake)) +- feat: add comment to explain why assert has failed and show state [\#2179](https://github.com/delta-io/delta-rs/pull/2179) ([braaannigan](https://github.com/braaannigan)) +- docs: include the 0.17.0 changelog [\#2173](https://github.com/delta-io/delta-rs/pull/2173) ([rtyler](https://github.com/rtyler)) +- fix\(python\): skip empty row groups during stats gathering [\#2172](https://github.com/delta-io/delta-rs/pull/2172) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore: 0.17.0 publish changes [\#2171](https://github.com/delta-io/delta-rs/pull/2171) ([rtyler](https://github.com/rtyler)) +- chore\(python\): bump version [\#2170](https://github.com/delta-io/delta-rs/pull/2170) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore: update all the package metadata for publication to crates.io [\#2168](https://github.com/delta-io/delta-rs/pull/2168) ([rtyler](https://github.com/rtyler)) +- fix: rm println in python lib [\#2166](https://github.com/delta-io/delta-rs/pull/2166) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore: cleanup minor clippies and other warns [\#2161](https://github.com/delta-io/delta-rs/pull/2161) ([rtyler](https://github.com/rtyler)) +- feat: implement clone for DeltaTable struct [\#2160](https://github.com/delta-io/delta-rs/pull/2160) ([mightyshazam](https://github.com/mightyshazam)) +- fix: allow loading of tables with identity columns [\#2155](https://github.com/delta-io/delta-rs/pull/2155) ([rtyler](https://github.com/rtyler)) +- fix: replace BTreeMap with IndexMap to preserve insertion order [\#2150](https://github.com/delta-io/delta-rs/pull/2150) ([roeap](https://github.com/roeap)) +- fix: made generalize\_filter less permissive, also added more cases [\#2149](https://github.com/delta-io/delta-rs/pull/2149) ([emcake](https://github.com/emcake)) +- docs: add delta lake best practices [\#2147](https://github.com/delta-io/delta-rs/pull/2147) ([MrPowers](https://github.com/MrPowers)) +- chore: shorten up the crate folder names in the tree [\#2145](https://github.com/delta-io/delta-rs/pull/2145) ([rtyler](https://github.com/rtyler)) +- fix\(\#2143\): keep specific error type when writing fails [\#2144](https://github.com/delta-io/delta-rs/pull/2144) ([abaerptc](https://github.com/abaerptc)) +- refactor\(python\): drop custom filesystem in write\_deltalake [\#2137](https://github.com/delta-io/delta-rs/pull/2137) ([ion-elgreco](https://github.com/ion-elgreco)) +- docs: use transparent logo in README 
[\#2132](https://github.com/delta-io/delta-rs/pull/2132) ([roeap](https://github.com/roeap)) +- fix: order logical schema to match physical schema [\#2129](https://github.com/delta-io/delta-rs/pull/2129) ([Blajda](https://github.com/Blajda)) +- feat: expose stats schema on Snapshot [\#2128](https://github.com/delta-io/delta-rs/pull/2128) ([roeap](https://github.com/roeap)) +- feat: update table config to contain new config keys [\#2127](https://github.com/delta-io/delta-rs/pull/2127) ([roeap](https://github.com/roeap)) +- fix: clean-up paths created during tests [\#2126](https://github.com/delta-io/delta-rs/pull/2126) ([roeap](https://github.com/roeap)) +- fix: prevent empty stats struct during parquet write [\#2125](https://github.com/delta-io/delta-rs/pull/2125) ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) +- fix: temporarily skip s3 roundtrip test [\#2124](https://github.com/delta-io/delta-rs/pull/2124) ([roeap](https://github.com/roeap)) +- fix: do not write empty parquet file/add on writer close; accurately … [\#2123](https://github.com/delta-io/delta-rs/pull/2123) ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) +- docs: add dask page to integration docs [\#2122](https://github.com/delta-io/delta-rs/pull/2122) ([avriiil](https://github.com/avriiil)) +- chore: upgrade to DataFusion 35.0 [\#2121](https://github.com/delta-io/delta-rs/pull/2121) ([philippemnoel](https://github.com/philippemnoel)) +- fix\(s3\): restore working test for DynamoDb log store repair log on read [\#2120](https://github.com/delta-io/delta-rs/pull/2120) ([dispanser](https://github.com/dispanser)) +- fix: set partition values for added files when building compaction plan [\#2119](https://github.com/delta-io/delta-rs/pull/2119) ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) +- fix: add missing pandas import [\#2116](https://github.com/delta-io/delta-rs/pull/2116) ([Tim-Haarman](https://github.com/Tim-Haarman)) +- chore: temporarily ignore the repair on update test [\#2114](https://github.com/delta-io/delta-rs/pull/2114) ([rtyler](https://github.com/rtyler)) +- docs: delta lake is great for small data [\#2113](https://github.com/delta-io/delta-rs/pull/2113) ([MrPowers](https://github.com/MrPowers)) +- chore: removed unnecessary print statement from update method [\#2111](https://github.com/delta-io/delta-rs/pull/2111) ([LilMonk](https://github.com/LilMonk)) +- fix: schema issue within writebuilder [\#2106](https://github.com/delta-io/delta-rs/pull/2106) ([universalmind303](https://github.com/universalmind303)) +- docs: fix arg indent [\#2103](https://github.com/delta-io/delta-rs/pull/2103) ([wchatx](https://github.com/wchatx)) +- docs: delta lake file skipping [\#2096](https://github.com/delta-io/delta-rs/pull/2096) ([MrPowers](https://github.com/MrPowers)) +- docs: move dynamo docs into new docs page [\#2093](https://github.com/delta-io/delta-rs/pull/2093) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore: bump python [\#2092](https://github.com/delta-io/delta-rs/pull/2092) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat: allow merge\_execute to release the GIL [\#2091](https://github.com/delta-io/delta-rs/pull/2091) ([emcake](https://github.com/emcake)) +- docs: how delta lake transactions work [\#2089](https://github.com/delta-io/delta-rs/pull/2089) ([MrPowers](https://github.com/MrPowers)) +- fix: reinstate copy-if-not-exists passthrough [\#2083](https://github.com/delta-io/delta-rs/pull/2083) ([emcake](https://github.com/emcake)) +- docs: make an overview 
tab visible in docs [\#2080](https://github.com/delta-io/delta-rs/pull/2080) ([r3stl355](https://github.com/r3stl355)) +- docs: add usage guide for check constraints [\#2079](https://github.com/delta-io/delta-rs/pull/2079) ([hntd187](https://github.com/hntd187)) +- docs: update docs for rust print statement [\#2077](https://github.com/delta-io/delta-rs/pull/2077) ([skariyania](https://github.com/skariyania)) +- docs: add page on why to use delta lake [\#2076](https://github.com/delta-io/delta-rs/pull/2076) ([MrPowers](https://github.com/MrPowers)) +- feat\(rust, python\): add `drop constraint` operation [\#2071](https://github.com/delta-io/delta-rs/pull/2071) ([ion-elgreco](https://github.com/ion-elgreco)) +- refactor: add deltalake-gcp crate [\#2061](https://github.com/delta-io/delta-rs/pull/2061) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: allow checkpoints to contain metadata actions without a createdTime value [\#2059](https://github.com/delta-io/delta-rs/pull/2059) ([rtyler](https://github.com/rtyler)) +- chore: bump version python [\#2047](https://github.com/delta-io/delta-rs/pull/2047) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: ensure metadata cleanup do not corrupt tables without checkpoints [\#2044](https://github.com/delta-io/delta-rs/pull/2044) ([Blajda](https://github.com/Blajda)) +- docs: update docs for merge [\#2042](https://github.com/delta-io/delta-rs/pull/2042) ([Blajda](https://github.com/Blajda)) +- chore: update documentation for S3 / DynamoDb log store configuration [\#2041](https://github.com/delta-io/delta-rs/pull/2041) ([dispanser](https://github.com/dispanser)) +- feat: arrow backed log replay and table state [\#2037](https://github.com/delta-io/delta-rs/pull/2037) ([roeap](https://github.com/roeap)) +- fix: properly deserialize percent-encoded file paths of Remove actions, to make sure tombstone and file paths match [\#2035](https://github.com/delta-io/delta-rs/pull/2035) ([sigorbor](https://github.com/sigorbor)) +- fix: remove casts of structs to record batch [\#2033](https://github.com/delta-io/delta-rs/pull/2033) ([Blajda](https://github.com/Blajda)) +- feat\(python, rust\): expose custom\_metadata for all operations [\#2032](https://github.com/delta-io/delta-rs/pull/2032) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat: refactor WriterProperties class [\#2030](https://github.com/delta-io/delta-rs/pull/2030) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore: update datafusion [\#2029](https://github.com/delta-io/delta-rs/pull/2029) ([roeap](https://github.com/roeap)) +- refactor: increase metadata action usage [\#2027](https://github.com/delta-io/delta-rs/pull/2027) ([roeap](https://github.com/roeap)) +- fix: github actions for releasing docs [\#2026](https://github.com/delta-io/delta-rs/pull/2026) ([r3stl355](https://github.com/r3stl355)) +- feat: introduce schema evolution on RecordBatchWriter [\#2024](https://github.com/delta-io/delta-rs/pull/2024) ([rtyler](https://github.com/rtyler)) +- refactor: move azure integration to dedicated crate [\#2023](https://github.com/delta-io/delta-rs/pull/2023) ([roeap](https://github.com/roeap)) +- fix: use temporary table names during the constraint checks [\#2017](https://github.com/delta-io/delta-rs/pull/2017) ([r3stl355](https://github.com/r3stl355)) +- docs: add alterer [\#2014](https://github.com/delta-io/delta-rs/pull/2014) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore: version bump python release [\#2011](https://github.com/delta-io/delta-rs/pull/2011) 
([ion-elgreco](https://github.com/ion-elgreco)) +- fix: fix the test\_restore\_by\_datetime test [\#2010](https://github.com/delta-io/delta-rs/pull/2010) ([r3stl355](https://github.com/r3stl355)) +- feat\(rust\): add more commit info to most operations [\#2009](https://github.com/delta-io/delta-rs/pull/2009) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat\(python\): add schema conversion of FixedSizeBinaryArray and FixedSizeListType [\#2005](https://github.com/delta-io/delta-rs/pull/2005) ([balbok0](https://github.com/balbok0)) +- feat\(python\): expose large\_dtype param in `merge` [\#2003](https://github.com/delta-io/delta-rs/pull/2003) ([ion-elgreco](https://github.com/ion-elgreco)) +- docs: add writer properties to docs [\#2002](https://github.com/delta-io/delta-rs/pull/2002) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore: fix CI breaking lint issues [\#1999](https://github.com/delta-io/delta-rs/pull/1999) ([r3stl355](https://github.com/r3stl355)) +- feat: implementation for replaceWhere [\#1996](https://github.com/delta-io/delta-rs/pull/1996) ([r3stl355](https://github.com/r3stl355)) +- chore: refactoring AWS code out of the core crate [\#1995](https://github.com/delta-io/delta-rs/pull/1995) ([rtyler](https://github.com/rtyler)) +- feat\(python\): expose custom metadata to writers [\#1994](https://github.com/delta-io/delta-rs/pull/1994) ([ion-elgreco](https://github.com/ion-elgreco)) +- docs: datafusion integration [\#1993](https://github.com/delta-io/delta-rs/pull/1993) ([MrPowers](https://github.com/MrPowers)) +- fix: flakey gcs test [\#1987](https://github.com/delta-io/delta-rs/pull/1987) ([roeap](https://github.com/roeap)) +- fix: implement consistent formatting for constraint expressions [\#1985](https://github.com/delta-io/delta-rs/pull/1985) ([Blajda](https://github.com/Blajda)) +- fix: case sensitivity for z-order [\#1982](https://github.com/delta-io/delta-rs/pull/1982) ([Blajda](https://github.com/Blajda)) +- feat\(python\): add writer\_properties to all operations [\#1980](https://github.com/delta-io/delta-rs/pull/1980) ([ion-elgreco](https://github.com/ion-elgreco)) +- refactor: trigger metadata retrieval only during `DeltaTable.metadata` [\#1979](https://github.com/delta-io/delta-rs/pull/1979) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat: retry with exponential backoff for DynamoDb interaction [\#1975](https://github.com/delta-io/delta-rs/pull/1975) ([dispanser](https://github.com/dispanser)) +- feat\(python\): expose `add constraint` operation [\#1973](https://github.com/delta-io/delta-rs/pull/1973) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: properly decode percent-encoded file paths coming from parquet checkpoints [\#1970](https://github.com/delta-io/delta-rs/pull/1970) ([sigorbor](https://github.com/sigorbor)) +- feat: omit unmodified files during merge write [\#1969](https://github.com/delta-io/delta-rs/pull/1969) ([Blajda](https://github.com/Blajda)) +- feat\(python\): combine load\_version/load\_with\_datetime into `load_as_version` [\#1968](https://github.com/delta-io/delta-rs/pull/1968) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: enable S3 integration tests to be configured via environment vars [\#1966](https://github.com/delta-io/delta-rs/pull/1966) ([dispanser](https://github.com/dispanser)) +- fix: handle empty table response in unity api [\#1963](https://github.com/delta-io/delta-rs/pull/1963) ([JonasDev1](https://github.com/JonasDev1)) +- docs: add auto-release when docs are merged to main 
[\#1962](https://github.com/delta-io/delta-rs/pull/1962) ([r3stl355](https://github.com/r3stl355)) +- feat: cast list items to default before write with different item names [\#1959](https://github.com/delta-io/delta-rs/pull/1959) ([JonasDev1](https://github.com/JonasDev1)) +- feat: merge using partition filters [\#1958](https://github.com/delta-io/delta-rs/pull/1958) ([emcake](https://github.com/emcake)) +- chore: relocate cast\_record\_batch into its own module to shed the datafusion dependency [\#1955](https://github.com/delta-io/delta-rs/pull/1955) ([rtyler](https://github.com/rtyler)) +- fix: respect case sensitivity on operations [\#1954](https://github.com/delta-io/delta-rs/pull/1954) ([Blajda](https://github.com/Blajda)) +- docs: add better installation instructions [\#1951](https://github.com/delta-io/delta-rs/pull/1951) ([MrPowers](https://github.com/MrPowers)) +- docs: add polars integration [\#1949](https://github.com/delta-io/delta-rs/pull/1949) ([MrPowers](https://github.com/MrPowers)) +- fix: add arrow page back [\#1944](https://github.com/delta-io/delta-rs/pull/1944) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: remove the get\_data\_catalog\(\) function [\#1941](https://github.com/delta-io/delta-rs/pull/1941) ([rtyler](https://github.com/rtyler)) +- chore: update runs-on value in python\_release.yml [\#1940](https://github.com/delta-io/delta-rs/pull/1940) ([wjones127](https://github.com/wjones127)) +- docs: start how delta lake works [\#1938](https://github.com/delta-io/delta-rs/pull/1938) ([MrPowers](https://github.com/MrPowers)) +- docs: add logo, dark mode, boost search [\#1936](https://github.com/delta-io/delta-rs/pull/1936) ([ion-elgreco](https://github.com/ion-elgreco)) +- refactor: prefer usage of metadata and protocol fields [\#1935](https://github.com/delta-io/delta-rs/pull/1935) ([roeap](https://github.com/roeap)) +- chore: update python version [\#1934](https://github.com/delta-io/delta-rs/pull/1934) ([wjones127](https://github.com/wjones127)) +- feat\(python\): expose create to DeltaTable class [\#1932](https://github.com/delta-io/delta-rs/pull/1932) ([ion-elgreco](https://github.com/ion-elgreco)) +- docs: fix all examples and change overall structure [\#1931](https://github.com/delta-io/delta-rs/pull/1931) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat: update to include pyarrow-hotfix [\#1930](https://github.com/delta-io/delta-rs/pull/1930) ([dennyglee](https://github.com/dennyglee)) +- fix: get rid of panic in during table [\#1928](https://github.com/delta-io/delta-rs/pull/1928) ([dimonchik-suvorov](https://github.com/dimonchik-suvorov)) +- fix\(rust/python\): `optimize.compact` not working with tables with mixed large/normal arrow [\#1926](https://github.com/delta-io/delta-rs/pull/1926) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat: extend write\_deltalake to accept Deltalake schema [\#1922](https://github.com/delta-io/delta-rs/pull/1922) ([r3stl355](https://github.com/r3stl355)) +- fix: fail fast for opening non-existent path [\#1917](https://github.com/delta-io/delta-rs/pull/1917) ([dimonchik-suvorov](https://github.com/dimonchik-suvorov)) +- feat: check constraints [\#1915](https://github.com/delta-io/delta-rs/pull/1915) ([hntd187](https://github.com/hntd187)) +- docs: delta lake arrow integration page [\#1914](https://github.com/delta-io/delta-rs/pull/1914) ([MrPowers](https://github.com/MrPowers)) +- feat: add more info for contributors [\#1913](https://github.com/delta-io/delta-rs/pull/1913) 
([r3stl355](https://github.com/r3stl355)) +- fix: add buffer flushing to filesystem writes [\#1911](https://github.com/delta-io/delta-rs/pull/1911) ([r3stl355](https://github.com/r3stl355)) +- docs: update docs home page and add pandas integration [\#1905](https://github.com/delta-io/delta-rs/pull/1905) ([MrPowers](https://github.com/MrPowers)) +- feat: implement S3 log store with transactions backed by DynamoDb [\#1904](https://github.com/delta-io/delta-rs/pull/1904) ([dispanser](https://github.com/dispanser)) +- fix: prune each merge bin with only 1 file [\#1902](https://github.com/delta-io/delta-rs/pull/1902) ([haruband](https://github.com/haruband)) +- docs: update python docs link in readme.md [\#1899](https://github.com/delta-io/delta-rs/pull/1899) ([thomasfrederikhoeck](https://github.com/thomasfrederikhoeck)) +- docs: on append, overwrite, delete and z-ordering [\#1897](https://github.com/delta-io/delta-rs/pull/1897) ([MrPowers](https://github.com/MrPowers)) +- feat: compare timestamp partition values as timestamps instead of strings [\#1895](https://github.com/delta-io/delta-rs/pull/1895) ([sigorbor](https://github.com/sigorbor)) +- feat\(python\): expose rust writer as additional engine v2 [\#1891](https://github.com/delta-io/delta-rs/pull/1891) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat: add high-level checking for append-only tables [\#1887](https://github.com/delta-io/delta-rs/pull/1887) ([junjunjd](https://github.com/junjunjd)) +- test: loading version 0 Delta table [\#1885](https://github.com/delta-io/delta-rs/pull/1885) ([dimonchik-suvorov](https://github.com/dimonchik-suvorov)) +- fix: improve catalog failure error message, add missing Glue native-tls feature dependency [\#1883](https://github.com/delta-io/delta-rs/pull/1883) ([r3stl355](https://github.com/r3stl355)) +- refactor: simplify `DeltaTableState` [\#1877](https://github.com/delta-io/delta-rs/pull/1877) ([roeap](https://github.com/roeap)) +- refactor: express log schema in delta types [\#1876](https://github.com/delta-io/delta-rs/pull/1876) ([roeap](https://github.com/roeap)) +- docs: add Rust installation instructions [\#1875](https://github.com/delta-io/delta-rs/pull/1875) ([MrPowers](https://github.com/MrPowers)) +- chore: clippy [\#1871](https://github.com/delta-io/delta-rs/pull/1871) ([roeap](https://github.com/roeap)) +- fix: docs deployment action [\#1869](https://github.com/delta-io/delta-rs/pull/1869) ([r3stl355](https://github.com/r3stl355)) +- docs: tell how to claim an issue [\#1866](https://github.com/delta-io/delta-rs/pull/1866) ([wjones127](https://github.com/wjones127)) +- feat: drop python 3.7 and adopt 3.12 [\#1859](https://github.com/delta-io/delta-rs/pull/1859) ([roeap](https://github.com/roeap)) +- feat: create benchmarks for merge [\#1857](https://github.com/delta-io/delta-rs/pull/1857) ([Blajda](https://github.com/Blajda)) +- chore: add @ion-elgreco to python/ [\#1855](https://github.com/delta-io/delta-rs/pull/1855) ([rtyler](https://github.com/rtyler)) +- fix: compile error with lifetime issues on optimize \(\#1843\) [\#1852](https://github.com/delta-io/delta-rs/pull/1852) ([dispanser](https://github.com/dispanser)) +- feat: implement issue auto-assign on `take` comment [\#1851](https://github.com/delta-io/delta-rs/pull/1851) ([r3stl355](https://github.com/r3stl355)) +- docs: add docs on small file compaction with optimize [\#1850](https://github.com/delta-io/delta-rs/pull/1850) ([MrPowers](https://github.com/MrPowers)) +- fix: checkpoint error with Azure Synapse 
[\#1848](https://github.com/delta-io/delta-rs/pull/1848) ([PierreDubrulle](https://github.com/PierreDubrulle)) +- feat\(python\): expose `convert_to_deltalake` [\#1842](https://github.com/delta-io/delta-rs/pull/1842) ([ion-elgreco](https://github.com/ion-elgreco)) +- ci: adopt `ruff format` for formatting [\#1841](https://github.com/delta-io/delta-rs/pull/1841) ([roeap](https://github.com/roeap)) + +## [rust-v0.17.0](https://github.com/delta-io/delta-rs/tree/rust-v0.17.0) (2024-02-06) + +:warning: The release of 0.17.0 **removes** the legacy dynamodb lock functionality; AWS users must read these release notes! :warning: + +### File handlers + +The 0.17.0 release moves storage implementations into their own crates, such as +`deltalake-aws`. A consequence of that refactoring is that custom storage and +file scheme handlers must be registered/initialized at runtime. Storage +subcrates conventionally define a `register_handlers` function which performs +that task. Users may see errors such as: +``` +thread 'main' panicked at /home/ubuntu/.cargo/registry/src/index.crates.io-6f17d22bba15001f/deltalake-core-0.17.0/src/table/builder.rs:189:48: +The specified table_uri is not valid: InvalidTableLocation("Unknown scheme: s3") +``` + +* Users of the meta-crate (`deltalake`) can call the storage crate via: `deltalake::aws::register_handlers(None);` at the entrypoint for their code. +* Users who adopt `core` and storage crates independently (e.g. `deltalake-aws`) can register via `deltalake_aws::register_handlers(None);`. + +The AWS, Azure, and GCP crates must all have their custom file schemes registered in this fashion; a minimal registration and locking-configuration sketch is included after the upgrade steps below. + + +### dynamodblock to S3DynamoDbLogStore + +The locking mechanism is fundamentally different between `deltalake` v0.16.x and v0.17.0. Starting with this release, the `deltalake` and `deltalake-aws` crates rely on the same [protocol for concurrent writes on AWS](https://docs.delta.io/latest/delta-storage.html#setup-configuration-s3-multi-cluster) as the Delta Lake/Spark implementation. + +Fundamentally, the DynamoDB table structure changes, [which is documented here](https://docs.delta.io/latest/delta-storage.html#setup-configuration-s3-multi-cluster). The configuration of a Rust process should continue to use the `AWS_S3_LOCKING_PROVIDER` environment value of `dynamodb`. The new table must be specified with the `DELTA_DYNAMO_TABLE_NAME` environment or configuration variable, and that should name the _new_ `S3DynamoDbLogStore`-compatible DynamoDB table. + +Because locking is required to ensure safe, consistent writes, **there is no incremental migration**; 0.16 and 0.17 writers **cannot** safely coexist. The following steps should be taken when upgrading: + +1. Stop all 0.16.x writers +2. Ensure all writes are completed and the lock table is empty. +3. Deploy 0.17.0 writers
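+
+A rough illustration of the registration call and locking configuration described above (a sketch only, not taken from the codebase): it assumes the `deltalake` meta-crate built with the `s3` feature, a `tokio` runtime, and a placeholder bucket URI.
+
+```rust
+use deltalake::{open_table, DeltaTableError};
+
+#[tokio::main]
+async fn main() -> Result<(), DeltaTableError> {
+    // Register the AWS handlers once at process start; without this call,
+    // opening an `s3://` URI fails with InvalidTableLocation("Unknown scheme: s3").
+    deltalake::aws::register_handlers(None);
+
+    // Safe concurrent writes additionally need the DynamoDb-backed log store,
+    // configured through the environment as described above, e.g.
+    //   AWS_S3_LOCKING_PROVIDER=dynamodb
+    //   DELTA_DYNAMO_TABLE_NAME=<name of the new S3DynamoDbLogStore table>
+    let table = open_table("s3://my-bucket/my-table").await?; // placeholder URI
+    println!("loaded table at version {}", table.version());
+    Ok(())
+}
+```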
+ + + +[Full Changelog](https://github.com/delta-io/delta-rs/compare/rust-v0.16.5...rust-v0.17.0) + +**Implemented enhancements:** + +- Expose the ability to compile DataFusion with SIMD [\#2118](https://github.com/delta-io/delta-rs/issues/2118) +- Updating Table log retention configuration with `write_deltalake` silently changes nothing [\#2108](https://github.com/delta-io/delta-rs/issues/2108) +- ALTER table, ALTER Column, Add/Modify Comment, Add/remove/rename partitions, Set Tags, Set location, Set TBLProperties [\#2088](https://github.com/delta-io/delta-rs/issues/2088) +- Docs: Update docs for check constraints [\#2063](https://github.com/delta-io/delta-rs/issues/2063) +- Don't `ensure_table_uri` when creating a table `with_log_store` [\#2036](https://github.com/delta-io/delta-rs/issues/2036) +- Exposing custom\_metadata in merge operation [\#2031](https://github.com/delta-io/delta-rs/issues/2031) +- Support custom table properties via TableAlterer and write/merge [\#2022](https://github.com/delta-io/delta-rs/issues/2022) +- Remove parquet2 crate support [\#2004](https://github.com/delta-io/delta-rs/issues/2004) +- Merge operation that only touches necessary partitions [\#1991](https://github.com/delta-io/delta-rs/issues/1991) +- store userMetadata on write operations [\#1990](https://github.com/delta-io/delta-rs/issues/1990) +- Create Dask integration page [\#1956](https://github.com/delta-io/delta-rs/issues/1956) +- Merge: Filtering on partitions [\#1918](https://github.com/delta-io/delta-rs/issues/1918) +- Rethink the load\_version and load\_with\_datetime interfaces [\#1910](https://github.com/delta-io/delta-rs/issues/1910) +- docs: Delta Lake + Arrow Integration [\#1908](https://github.com/delta-io/delta-rs/issues/1908) +- docs: Delta Lake + Polars integration [\#1906](https://github.com/delta-io/delta-rs/issues/1906) +- Rethink decision to expose the public interface in namespaces [\#1900](https://github.com/delta-io/delta-rs/issues/1900) +- Add documentation on how to build and run documentation locally [\#1893](https://github.com/delta-io/delta-rs/issues/1893) +- Add API to create an empty Delta Lake table [\#1892](https://github.com/delta-io/delta-rs/issues/1892) +- Implementing CHECK constraints [\#1881](https://github.com/delta-io/delta-rs/issues/1881) +- Check Invariants are respecting table features for write paths [\#1880](https://github.com/delta-io/delta-rs/issues/1880) +- Organize docs with single lefthand sidebar [\#1873](https://github.com/delta-io/delta-rs/issues/1873) +- Make sure invariants are handled properly throughout the codebase [\#1870](https://github.com/delta-io/delta-rs/issues/1870) +- Unable to use deltalake `Schema` in `write_deltalake` [\#1862](https://github.com/delta-io/delta-rs/issues/1862) +- Add a Rust-backed engine for write\_deltalake [\#1861](https://github.com/delta-io/delta-rs/issues/1861) +- Run doctest in CI for Python API examples [\#1783](https://github.com/delta-io/delta-rs/issues/1783) +- \[RFC\] Use arrow for checkpoint reading and state handling [\#1776](https://github.com/delta-io/delta-rs/issues/1776) +- Expose Python exceptions in public module [\#1771](https://github.com/delta-io/delta-rs/issues/1771) +- Expose cleanup\_metadata or create\_checkpoint\_from\_table\_uri\_and\_cleanup to the Python API [\#1768](https://github.com/delta-io/delta-rs/issues/1768) +- Expose convert\_to\_delta to Python API [\#1767](https://github.com/delta-io/delta-rs/issues/1767) +- Add high-level checking for append-only tables
[\#1759](https://github.com/delta-io/delta-rs/issues/1759) + +**Fixed bugs:** + +- Row order no longer preserved after merge operation [\#2165](https://github.com/delta-io/delta-rs/issues/2165) +- Error when reading delta table with IDENTITY column [\#2152](https://github.com/delta-io/delta-rs/issues/2152) +- Merge on IS NULL condition doesn't work for empty table [\#2148](https://github.com/delta-io/delta-rs/issues/2148) +- JsonWriter converts structured parsing error into plain string [\#2143](https://github.com/delta-io/delta-rs/issues/2143) +- Pandas import error when merging tables [\#2112](https://github.com/delta-io/delta-rs/issues/2112) +- test\_repair\_on\_update broken in main [\#2109](https://github.com/delta-io/delta-rs/issues/2109) +- `WriteBuilder::with_input_execution_plan` does not apply the schema to the log's metadata fields [\#2105](https://github.com/delta-io/delta-rs/issues/2105) +- MERGE logical plan vs execution plan schema mismatch [\#2104](https://github.com/delta-io/delta-rs/issues/2104) +- Partitions not pushed down [\#2090](https://github.com/delta-io/delta-rs/issues/2090) +- Cant create empty table with write\_deltalake [\#2086](https://github.com/delta-io/delta-rs/issues/2086) +- Unexpected high costs on Google Cloud Storage [\#2085](https://github.com/delta-io/delta-rs/issues/2085) +- Unable to read s3 table: `Unknown scheme: s3` [\#2065](https://github.com/delta-io/delta-rs/issues/2065) +- write\_deltalake not respecting writer\_properties [\#2064](https://github.com/delta-io/delta-rs/issues/2064) +- Unable to read/write tables with the "gs" schema in the table\_uri in 0.15.1 [\#2060](https://github.com/delta-io/delta-rs/issues/2060) +- LockClient requiered error for S3 backend in 0.15.1 python [\#2057](https://github.com/delta-io/delta-rs/issues/2057) +- Error while writing Pandas DataFrame to Delta Lake \(S3\) [\#2051](https://github.com/delta-io/delta-rs/issues/2051) +- Error with dynamo locking provider on 0.15 [\#2034](https://github.com/delta-io/delta-rs/issues/2034) +- Conda version 0.15.0 is missing files [\#2021](https://github.com/delta-io/delta-rs/issues/2021) +- Rust panicking through Python library when a delete predicate uses a nullable field [\#2019](https://github.com/delta-io/delta-rs/issues/2019) +- No snapshot or version 0 found, perhaps /Users/watsy0007/resources/test\_table/ is an empty dir? 
[\#2016](https://github.com/delta-io/delta-rs/issues/2016) +- Generic DeltaTable error: type\_coercion in Struct column in merge operation [\#1998](https://github.com/delta-io/delta-rs/issues/1998) +- Constraint expr not formatted during commit action [\#1971](https://github.com/delta-io/delta-rs/issues/1971) +- .load\_with\_datetime\(\) is incorrectly rounding to nearest second [\#1967](https://github.com/delta-io/delta-rs/issues/1967) +- vacuuming log files [\#1965](https://github.com/delta-io/delta-rs/issues/1965) +- Unable to merge uppercase column names [\#1960](https://github.com/delta-io/delta-rs/issues/1960) +- Schema error: Invalid data type for Delta Lake: Null [\#1946](https://github.com/delta-io/delta-rs/issues/1946) +- Python v0.14 wheel files not up to date [\#1945](https://github.com/delta-io/delta-rs/issues/1945) +- python Release 0.14 is missing Windows wheels [\#1942](https://github.com/delta-io/delta-rs/issues/1942) +- CI integration test fails randomly: test\_restore\_by\_datetime [\#1925](https://github.com/delta-io/delta-rs/issues/1925) +- Merge data freezes indefenetely [\#1920](https://github.com/delta-io/delta-rs/issues/1920) +- Load DeltaTable from non-existing folder causing empty folder creation [\#1916](https://github.com/delta-io/delta-rs/issues/1916) +- Reoptimizes merge bins with only 1 file, even though they have no effect. [\#1901](https://github.com/delta-io/delta-rs/issues/1901) +- The Python Docs link in README.MD points to old docs [\#1898](https://github.com/delta-io/delta-rs/issues/1898) +- optimize.compact\(\) fails with bad schema after updating to pyarrow 8.0 [\#1889](https://github.com/delta-io/delta-rs/issues/1889) +- Python build is broken on main [\#1856](https://github.com/delta-io/delta-rs/issues/1856) +- Checkpoint error with Azure Synapse [\#1847](https://github.com/delta-io/delta-rs/issues/1847) +- merge very slow compared to delete + append on larger dataset [\#1846](https://github.com/delta-io/delta-rs/issues/1846) +- get\_add\_actions fails with deltalake 0.13 [\#1835](https://github.com/delta-io/delta-rs/issues/1835) +- Handle PyArrow CVE-2023-47248 [\#1834](https://github.com/delta-io/delta-rs/issues/1834) +- Delta-rs writer hangs with to many file handles open \(Azure\) [\#1832](https://github.com/delta-io/delta-rs/issues/1832) +- Encountering NotATable\("No snapshot or version 0 found, perhaps xxx is an empty dir?"\) [\#1831](https://github.com/delta-io/delta-rs/issues/1831) +- write\_deltalake is not creating checkpoints [\#1815](https://github.com/delta-io/delta-rs/issues/1815) +- Problem writing tables in directory named with char `~` [\#1806](https://github.com/delta-io/delta-rs/issues/1806) +- DeltaTable Merge throws in merging if there are uppercase in Schema. [\#1797](https://github.com/delta-io/delta-rs/issues/1797) +- rust merge error - datafusion panics [\#1790](https://github.com/delta-io/delta-rs/issues/1790) +- expose use\_dictionary=False when writing Delta Table and running optimize [\#1772](https://github.com/delta-io/delta-rs/issues/1772) + +**Closed issues:** + +- Is this print necessary? Can we remove this. 
[\#2110](https://github.com/delta-io/delta-rs/issues/2110) +- Azure concurrent writes [\#2069](https://github.com/delta-io/delta-rs/issues/2069) +- Fix docs deployment [\#1867](https://github.com/delta-io/delta-rs/issues/1867) +- Add a header in old docs and direct users to new docs [\#1865](https://github.com/delta-io/delta-rs/issues/1865) + +## [rust-v0.16.5](https://github.com/delta-io/delta-rs/tree/rust-v0.16.5) (2023-11-15) + +[Full Changelog](https://github.com/delta-io/delta-rs/compare/rust-v0.16.4...rust-v0.16.5) + +**Implemented enhancements:** + +- When will upgrade object\_store to 0.8? [\#1858](https://github.com/delta-io/delta-rs/issues/1858) +- No Official Help [\#1849](https://github.com/delta-io/delta-rs/issues/1849) +- Auto assign GitHub issues with a "take" message [\#1791](https://github.com/delta-io/delta-rs/issues/1791) + +**Fixed bugs:** + +- cargo clippy fails on core in main [\#1843](https://github.com/delta-io/delta-rs/issues/1843) + +## [rust-v0.16.4](https://github.com/delta-io/delta-rs/tree/rust-v0.16.4) (2023-11-12) + +[Full Changelog](https://github.com/delta-io/delta-rs/compare/rust-v0.16.3...rust-v0.16.4) + +**Implemented enhancements:** + +- Unable to add deltalake git dependency to cargo.toml [\#1821](https://github.com/delta-io/delta-rs/issues/1821) + +## [rust-v0.16.3](https://github.com/delta-io/delta-rs/tree/rust-v0.16.3) (2023-11-08) + +[Full Changelog](https://github.com/delta-io/delta-rs/compare/rust-v0.16.2...rust-v0.16.3) + +**Implemented enhancements:** + +- Docs: add release GitHub action [\#1799](https://github.com/delta-io/delta-rs/issues/1799) +- Use bulk deletes where possible [\#1761](https://github.com/delta-io/delta-rs/issues/1761) + +**Fixed bugs:** + +- Code Owners no longer valid [\#1794](https://github.com/delta-io/delta-rs/issues/1794) +- `MERGE` works incorrectly with partitioned table if the data column order is not same as table column order [\#1787](https://github.com/delta-io/delta-rs/issues/1787) +- errors when using pyarrow dataset as a source [\#1779](https://github.com/delta-io/delta-rs/issues/1779) +- Write to Microsoft OneLake failed. [\#1764](https://github.com/delta-io/delta-rs/issues/1764) + +## [rust-v0.16.2](https://github.com/delta-io/delta-rs/tree/rust-v0.16.2) (2023-10-21) + +[Full Changelog](https://github.com/delta-io/delta-rs/compare/rust-v0.16.1...rust-v0.16.2) + +## [rust-v0.16.1](https://github.com/delta-io/delta-rs/tree/rust-v0.16.1) (2023-10-21) + +[Full Changelog](https://github.com/delta-io/delta-rs/compare/rust-v0.16.0...rust-v0.16.1) + ## [rust-v0.16.0](https://github.com/delta-io/delta-rs/tree/rust-v0.16.0) (2023-09-27) [Full Changelog](https://github.com/delta-io/delta-rs/compare/rust-v0.15.0...rust-v0.16.0) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ee258a3ce8..f681aa3948 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,6 +1,6 @@ # Contributing to delta-rs -Development on this project is mostly driven by volunteer contributors. We welcome new contributors, including not only those who develop new features, but also those who are able to help with documentation and provide detailed bug reports. +Development on this project is mostly driven by volunteer contributors. We welcome new contributors, including not only those who develop new features, but also those who are able to help with documentation and provide detailed bug reports. Please take note of our [code of conduct](CODE_OF_CONDUCT.md). 
@@ -17,34 +17,40 @@ If you want to claim an issue to work on, you can write the word `take` as a com - Install Rust, e.g. as described [here](https://doc.rust-lang.org/cargo/getting-started/installation.html) - Have a compatible Python version installed (check `python/pyproject.toml` for current requirement) - Create a Python virtual environment (required for development builds), e.g. as described [here](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/) + ```sh + python -m venv .venv + ``` + - Build the project for development (this requires an active virtual environment and will also install `deltalake` in that virtual environment) -``` -cd python -make develop -``` + ```sh + cd python + make develop + ``` - Run some Python code, e.g. to run a specific test -``` -python -m pytest tests/test_writer.py -s -k "test_with_deltalake_schema" -``` + ```sh + python -m pytest tests/test_writer.py -s -k "test_with_deltalake_schema" + ``` - Run some Rust code, e.g. run an example -``` -cd crates/deltalake -cargo run --examples basic_operations -``` + ```sh + cd crates/deltalake + cargo run --example basic_operations --features="datafusion" + ``` ## Run the docs locally -*This serves your local contens of docs via a web browser, handy for checking what they look like if you are making changes to docs or docstings* +*This serves your local contents of docs via a web browser, handy for checking what they look like if you are making changes to docs or docstrings* + +```sh (cd python; make develop) pip install -r docs/requirements.txt mkdocs serve ``` ## To make a pull request (PR) -- Make sure all the following steps run/pass locally before submitting a PR -``` +Make sure all the following steps run/pass locally before submitting a PR + +```sh cargo fmt -- --check cd python make check-rust @@ -62,7 +68,7 @@ make build-docs - For debugging Rust code, install [CodeLLDB](https://marketplace.visualstudio.com/items?itemName=vadimcn.vscode-lldb). The extension should even create Debug launch configurations for the project if you allow it, an easy way to get started. Just set a breakpoint and run the relevant configuration. - For debugging from Python into Rust, follow this procedure: 1.
Add this to `.vscode/launch.json` -``` +```json { "type": "lldb", "request": "attach", diff --git a/Cargo.toml b/Cargo.toml index cfcb4eaf3c..0892b0f12b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,12 +1,20 @@ [workspace] -members = [ - "crates/*", - "delta-inspect", - "python", -] +members = ["crates/*", "delta-inspect", "python"] exclude = ["proofs"] resolver = "2" +[workspace.package] +authors = ["Qingping Hou "] +rust-version = "1.75" +keywords = ["deltalake", "delta", "datalake"] +readme = "README.md" +edition = "2021" +description = "Native Delta Lake implementation in Rust" +homepage = "https://github.com/delta-io/delta.rs" +license = "Apache-2.0" +documentation = "https://docs.rs/deltalake" +repository = "https://github.com/delta-io/delta.rs" + [profile.release-with-debug] inherits = "release" debug = true @@ -18,28 +26,33 @@ debug = true debug = "line-tables-only" [workspace.dependencies] +delta_kernel = { version = "0.3.0" } +# delta_kernel = { path = "../delta-kernel-rs/kernel" } + # arrow -arrow = { version = "50" } -arrow-arith = { version = "50" } -arrow-array = { version = "50" } -arrow-buffer = { version = "50" } -arrow-cast = { version = "50" } -arrow-ipc = { version = "50" } -arrow-json = { version = "50" } -arrow-ord = { version = "50" } -arrow-row = { version = "50" } -arrow-schema = { version = "50" } -arrow-select = { version = "50" } -object_store = { version = "0.9" } -parquet = { version = "50" } +arrow = { version = "52" } +arrow-arith = { version = "52" } +arrow-array = { version = "52", features = ["chrono-tz"] } +arrow-buffer = { version = "52" } +arrow-cast = { version = "52" } +arrow-ipc = { version = "52" } +arrow-json = { version = "52" } +arrow-ord = { version = "52" } +arrow-row = { version = "52" } +arrow-schema = { version = "52" } +arrow-select = { version = "52" } +object_store = { version = "0.10.1" } +parquet = { version = "52" } # datafusion -datafusion = { version = "35" } -datafusion-expr = { version = "35" } -datafusion-common = { version = "35" } -datafusion-proto = { version = "35" } -datafusion-sql = { version = "35" } -datafusion-physical-expr = { version = "35" } +datafusion = { version = "40" } +datafusion-expr = { version = "40" } +datafusion-common = { version = "40" } +datafusion-proto = { version = "40" } +datafusion-sql = { version = "40" } +datafusion-physical-expr = { version = "40" } +datafusion-functions = { version = "40" } +datafusion-functions-array = { version = "40" } # serde serde = { version = "1.0.194", features = ["derive"] } @@ -47,11 +60,12 @@ serde_json = "1" # "stdlib" bytes = { version = "1" } -chrono = { version = "0.4.31", default-features = false, features = ["clock"] } +chrono = { version = ">0.4.34", default-features = false, features = ["clock"] } tracing = { version = "0.1", features = ["log"] } regex = { version = "1" } thiserror = { version = "1" } url = { version = "2" } +urlencoding = "2.1.3" uuid = { version = "1" } # runtime / async diff --git a/README.md b/README.md index 927b68ee63..b00026b8d8 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ #delta-rs in the Delta Lake Slack workspace

+Delta Lake is an open-source storage format that runs on top of existing data lakes. Delta Lake is compatible with processing engines like Apache Spark and provides benefits such as ACID transaction guarantees, schema enforcement, and scalable data handling. The Delta Lake project aims to unlock the power of the Deltalake for as many users and projects as possible by providing native low-level APIs aimed at developers and integrators, as well as a high-level operations @@ -77,7 +78,7 @@ write_deltalake("./data/delta", df) dt = DeltaTable("./data/delta") df2 = dt.to_pandas() -assert df == df2 +assert df.equals(df2) ``` The same table can also be loaded using the core Rust crate: @@ -91,7 +92,7 @@ async fn main() -> Result<(), DeltaTableError> { let table = open_table("./data/delta").await?; // show all active files in the table - let files = table.get_files(); + let files: Vec<_> = table.get_file_uris()?.collect(); println!("{:?}", files); Ok(()) @@ -116,6 +117,7 @@ Libraries and frameworks that interoperate with delta-rs - in alphabetical order - [AWS SDK for Pandas](https://github.com/aws/aws-sdk-pandas) - [ballista][ballista] - [datafusion][datafusion] +- [Daft](https://www.getdaft.io/) - [Dask](https://github.com/dask-contrib/dask-deltatable) - [datahub](https://datahubproject.io/) - [DuckDB](https://duckdb.org/) @@ -130,45 +132,46 @@ of features outlined in the Delta [protocol][protocol] is also [tracked](#protoc ### Cloud Integrations -| Storage | Rust | Python | Comment | -| -------------------- | :-----: | :-----: | ----------------------------------- | -| Local | ![done] | ![done] | | -| S3 - AWS | ![done] | ![done] | requires lock for concurrent writes | -| S3 - MinIO | ![done] | ![done] | requires lock for concurrent writes | -| S3 - R2 | ![done] | ![done] | requires lock for concurrent writes | -| Azure Blob | ![done] | ![done] | | -| Azure ADLS Gen2 | ![done] | ![done] | | -| Microsoft OneLake | ![done] | ![done] | | -| Google Cloud Storage | ![done] | ![done] | | +| Storage | Rust | Python | Comment | +| -------------------- | :-----: | :-----: | ---------------------------------------------------------------- | +| Local | ![done] | ![done] | | +| S3 - AWS | ![done] | ![done] | requires lock for concurrent writes | +| S3 - MinIO | ![done] | ![done] | requires lock for concurrent writes | +| S3 - R2 | ![done] | ![done] | No lock required when using `AmazonS3ConfigKey::CopyIfNotExists` | +| Azure Blob | ![done] | ![done] | | +| Azure ADLS Gen2 | ![done] | ![done] | | +| Microsoft OneLake | ![done] | ![done] | | +| Google Cloud Storage | ![done] | ![done] | | +| HDFS | ![done] | ![done] | | ### Supported Operations -| Operation | Rust | Python | Description | -| --------------------- | :----------------------: | :----------------------: | ------------------------------------------- | -| Create | ![done] | ![done] | Create a new table | -| Read | ![done] | ![done] | Read data from a table | -| Vacuum | ![done] | ![done] | Remove unused files and log entries | -| Delete - partitions | | ![done] | Delete a table partition | -| Delete - predicates | ![done] | ![done] | Delete data based on a predicate | -| Optimize - compaction | ![done] | ![done] | Harmonize the size of data file | -| Optimize - Z-order | ![done] | ![done] | Place similar data into the same file | -| Merge | ![done] | ![done] | Merge a target Delta table with source data | -| FS check | ![done] | ![done] | Remove corrupted files from table | +| Operation | Rust | Python | Description | +| --------------------- | 
:-----: | :-----: | ------------------------------------------- | +| Create | ![done] | ![done] | Create a new table | +| Read | ![done] | ![done] | Read data from a table | +| Vacuum | ![done] | ![done] | Remove unused files and log entries | +| Delete - partitions | | ![done] | Delete a table partition | +| Delete - predicates | ![done] | ![done] | Delete data based on a predicate | +| Optimize - compaction | ![done] | ![done] | Harmonize the size of data file | +| Optimize - Z-order | ![done] | ![done] | Place similar data into the same file | +| Merge | ![done] | ![done] | Merge a target Delta table with source data | +| FS check | ![done] | ![done] | Remove corrupted files from table | ### Protocol Support Level -| Writer Version | Requirement | Status | -| -------------- | --------------------------------------------- | :------------------: | -| Version 2 | Append Only Tables | ![done] | -| Version 2 | Column Invariants | ![done] | -| Version 3 | Enforce `delta.checkpoint.writeStatsAsJson` | [![open]][writer-rs] | -| Version 3 | Enforce `delta.checkpoint.writeStatsAsStruct` | [![open]][writer-rs] | +| Writer Version | Requirement | Status | +| -------------- | --------------------------------------------- | :-------------------------------: | +| Version 2 | Append Only Tables | ![done] | +| Version 2 | Column Invariants | ![done] | +| Version 3 | Enforce `delta.checkpoint.writeStatsAsJson` | [![open]][writer-rs] | +| Version 3 | Enforce `delta.checkpoint.writeStatsAsStruct` | [![open]][writer-rs] | | Version 3 | CHECK constraints | [![semi-done]][check-constraints] | -| Version 4 | Change Data Feed | | -| Version 4 | Generated Columns | | -| Version 5 | Column Mapping | | -| Version 6 | Identity Columns | | -| Version 7 | Table Features | | +| Version 4 | Change Data Feed | | +| Version 4 | Generated Columns | | +| Version 5 | Column Mapping | | +| Version 6 | Identity Columns | | +| Version 7 | Table Features | | | Reader Version | Requirement | Status | | -------------- | ----------------------------------- | ------ | diff --git a/crates/aws/Cargo.toml b/crates/aws/Cargo.toml index b18729e262..e6913a2162 100644 --- a/crates/aws/Cargo.toml +++ b/crates/aws/Cargo.toml @@ -1,14 +1,24 @@ [package] name = "deltalake-aws" -version = "0.1.0" -edition = "2021" +version = "0.1.2" +authors.workspace = true +keywords.workspace = true +readme.workspace = true +edition.workspace = true +homepage.workspace = true +description.workspace = true +license.workspace = true +repository.workspace = true +rust-version.workspace = true [dependencies] -deltalake-core = { version = "0.17.0", path = "../core" } -rusoto_core = { version = "0.47", default-features = false, optional = true } -rusoto_credential = { version = "0.47" } -rusoto_sts = { version = "0.47", default-features = false, optional = true } -rusoto_dynamodb = { version = "0.47", default-features = false, optional = true } +deltalake-core = { version = ">=0.17.0, <0.19.0", path = "../core" } +aws-smithy-runtime-api = { version="1.1.7" } +aws-smithy-runtime = { version="1.1.7", optional = true} +aws-credential-types = { version="1.1.7", features = ["hardcoded-credentials"]} +aws-config = { version = "1.1.6", default-features = false, features = ["behavior-version-latest","rt-tokio", "credentials-process", "sso"] } +aws-sdk-dynamodb = {version = "1.15.0", default-features = false, features = ["behavior-version-latest", "rt-tokio"] } +aws-sdk-sts = {version = "1.1.6", default-features = false, features = ["behavior-version-latest", "rt-tokio"] } 
lazy_static = "1" maplit = "1" @@ -24,13 +34,14 @@ regex = { workspace = true } uuid = { workspace = true, features = ["serde", "v4"] } url = { workspace = true } backoff = { version = "0.4", features = [ "tokio" ] } +hyper-tls = { version = "0.5", optional = true } [dev-dependencies] deltalake-core = { path = "../core", features = ["datafusion"] } chrono = { workspace = true } serial_test = "3" deltalake-test = { path = "../test" } -pretty_env_logger = "*" +pretty_env_logger = "0.5.0" rand = "0.8" serde_json = { workspace = true } @@ -38,12 +49,13 @@ serde_json = { workspace = true } default = ["rustls"] integration_test = [] native-tls = [ - "rusoto_core/native-tls", - "rusoto_sts/native-tls", - "rusoto_dynamodb/native-tls", + "aws-config/client-hyper", + "aws-smithy-runtime/connector-hyper-0-14-x", + "hyper-tls" ] rustls = [ - "rusoto_core/rustls", - "rusoto_sts/rustls", - "rusoto_dynamodb/rustls", + "aws-config/client-hyper", + "aws-config/rustls", + "aws-sdk-dynamodb/rustls", + "aws-sdk-sts/rustls", ] diff --git a/crates/aws/src/credentials.rs b/crates/aws/src/credentials.rs new file mode 100644 index 0000000000..9ddf19b74c --- /dev/null +++ b/crates/aws/src/credentials.rs @@ -0,0 +1,118 @@ +use std::{sync::Arc, time::Duration}; + +use aws_config::{ + ecs::EcsCredentialsProvider, + environment::{EnvironmentVariableCredentialsProvider, EnvironmentVariableRegionProvider}, + imds::credentials::ImdsCredentialsProvider, + meta::{credentials::CredentialsProviderChain, region::RegionProviderChain}, + profile::ProfileFileCredentialsProvider, + provider_config::ProviderConfig, + web_identity_token::WebIdentityTokenCredentialsProvider, +}; +use aws_credential_types::provider::{self, ProvideCredentials}; +use tracing::Instrument; + +const IMDS_PROVIDER_NAME: &str = "Ec2InstanceMetadata"; + +#[derive(Debug)] +pub struct ConfiguredCredentialChain { + provider_chain: CredentialsProviderChain, +} + +#[derive(Debug)] +pub struct NoOpCredentials {} + +pub fn new_region_provider(disable_imds: bool, imds_timeout: u64) -> RegionProviderChain { + let env_provider = EnvironmentVariableRegionProvider::new(); + let profile_file = aws_config::profile::region::ProfileFileRegionProvider::default(); + if disable_imds { + return RegionProviderChain::first_try(env_provider).or_else(profile_file); + } + + RegionProviderChain::first_try(env_provider) + .or_else(profile_file) + .or_else( + aws_config::imds::region::Builder::default() + .imds_client( + aws_config::imds::Client::builder() + .connect_timeout(Duration::from_millis(imds_timeout)) + .read_timeout(Duration::from_millis(imds_timeout)) + .build(), + ) + .build(), + ) +} + +impl ConfiguredCredentialChain { + pub fn new(disable_imds: bool, imds_timeout: u64, conf: &ProviderConfig) -> Self { + let imds_provider = Self::build_imds_provider(conf, disable_imds, imds_timeout); + let env_provider = EnvironmentVariableCredentialsProvider::default(); + let profile_provider = ProfileFileCredentialsProvider::builder() + .configure(conf) + .with_custom_provider(IMDS_PROVIDER_NAME, imds_provider.clone()) + .build(); + let web_identity_token_provider = WebIdentityTokenCredentialsProvider::builder() + .configure(conf) + .build(); + + let ecs_provider = EcsCredentialsProvider::builder().configure(conf).build(); + + let provider_chain = CredentialsProviderChain::first_try("Environment", env_provider) + .or_else("Profile", profile_provider) + .or_else("WebIdentityToken", web_identity_token_provider) + .or_else("EcsContainer", ecs_provider) + .or_else(IMDS_PROVIDER_NAME, 
imds_provider); + + Self { provider_chain } + } + + async fn credentials(&self) -> provider::Result { + self.provider_chain + .provide_credentials() + .instrument(tracing::debug_span!("provide_credentials", provider = %"default_chain")) + .await + } + + fn build_imds_provider( + conf: &ProviderConfig, + disable_imds: bool, + imds_timeout: u64, + ) -> Arc { + if disable_imds { + return Arc::new(NoOpCredentials {}); + } + + let imds_provider = ImdsCredentialsProvider::builder() + .configure(conf) + .imds_client( + aws_config::imds::Client::builder() + .connect_timeout(Duration::from_millis(imds_timeout)) + .read_timeout(Duration::from_millis(imds_timeout)) + .build(), + ) + .build(); + Arc::new(imds_provider) + } +} + +impl ProvideCredentials for ConfiguredCredentialChain { + fn provide_credentials<'a>( + &'a self, + ) -> aws_credential_types::provider::future::ProvideCredentials<'a> + where + Self: 'a, + { + aws_credential_types::provider::future::ProvideCredentials::new(self.credentials()) + } +} + +impl ProvideCredentials for NoOpCredentials { + fn provide_credentials<'a>(&'a self) -> provider::future::ProvideCredentials<'a> + where + Self: 'a, + { + aws_credential_types::provider::future::ProvideCredentials::new(std::future::ready(Err( + provider::error::CredentialsError::not_loaded_no_source(), + ))) + } +} diff --git a/crates/aws/src/errors.rs b/crates/aws/src/errors.rs index bbce9dc426..55f2a2d013 100644 --- a/crates/aws/src/errors.rs +++ b/crates/aws/src/errors.rs @@ -2,27 +2,45 @@ use std::num::ParseIntError; -use rusoto_core::RusotoError; -use rusoto_dynamodb::{CreateTableError, GetItemError, PutItemError, QueryError, UpdateItemError}; - -#[derive(thiserror::Error, Debug, PartialEq)] -pub enum DynamoDbConfigError { - /// Error raised creating http client - #[error("Failed to create request dispatcher: {source}")] - HttpClient { - /// The underlying Rusoto TlsError - #[from] - source: rusoto_core::request::TlsError, +use aws_credential_types::provider::error::CredentialsError; +use aws_sdk_dynamodb::{ + error::SdkError, + operation::{ + create_table::CreateTableError, delete_item::DeleteItemError, get_item::GetItemError, + put_item::PutItemError, query::QueryError, update_item::UpdateItemError, }, +}; +use aws_smithy_runtime_api::client::result::ServiceError; + +macro_rules! 
impl_from_service_error { + ($error_type:ty) => { + impl From> for LockClientError + where + R: Send + Sync + std::fmt::Debug + 'static, + { + fn from(err: SdkError<$error_type, R>) -> Self { + match err { + SdkError::ServiceError(e) => e.into(), + _ => LockClientError::GenericDynamoDb { + source: Box::new(err), + }, + } + } + } - /// Error raised getting credentials - #[error("Failed to retrieve AWS credentials: {source}")] - Credentials { - /// The underlying Rusoto CredentialsError - #[from] - source: rusoto_credential::CredentialsError, - }, + impl From> for LockClientError + where + R: Send + Sync + std::fmt::Debug + 'static, + { + fn from(value: ServiceError<$error_type, R>) -> Self { + value.into_err().into() + } + } + }; +} +#[derive(thiserror::Error, Debug)] +pub enum DynamoDbConfigError { /// Billing mode string invalid #[error("Invalid billing mode : {0}, supported values : ['provided', 'pay_per_request']")] InvalidBillingMode(String), @@ -33,6 +51,9 @@ pub enum DynamoDbConfigError { // config_value: String, source: ParseIntError, }, + /// Cannot initialize DynamoDbConfiguration due to some sort of threading issue + #[error("Cannot initialize dynamodb lock configuration")] + InitializationError, } /// Errors produced by `DynamoDbLockClient` @@ -44,7 +65,7 @@ pub enum LockClientError { #[error("Lock table '{name}': creation failed: {source}")] LockTableCreateFailure { name: String, - source: RusotoError, + source: Box, }, #[error("Log entry for table '{table_path}' and version '{version}' already exists")] @@ -60,29 +81,30 @@ pub enum LockClientError { GenericDynamoDb { source: Box, }, - #[error("configuration error: {source}")] - Credentials { - source: rusoto_credential::CredentialsError, - }, - + Credentials { source: CredentialsError }, #[error( "Atomic rename requires a LockClient for S3 backends. \ Either configure the LockClient, or set AWS_S3_ALLOW_UNSAFE_RENAME=true \ to opt out of support for concurrent writers." 
)] LockClientRequired, + + #[error("Log entry for table '{table_path}' and version '{version}' is already complete")] + VersionAlreadyCompleted { table_path: String, version: i64 }, } impl From for LockClientError { fn from(err: GetItemError) -> Self { match err { - GetItemError::InternalServerError(_) => err.into(), - GetItemError::ProvisionedThroughputExceeded(_) => { + GetItemError::ProvisionedThroughputExceededException(_) => { LockClientError::ProvisionedThroughputExceeded } GetItemError::RequestLimitExceeded(_) => LockClientError::ProvisionedThroughputExceeded, - GetItemError::ResourceNotFound(_) => LockClientError::LockTableNotFound, + GetItemError::ResourceNotFoundException(_) => LockClientError::LockTableNotFound, + _ => LockClientError::GenericDynamoDb { + source: Box::new(err), + }, } } } @@ -90,12 +112,14 @@ impl From for LockClientError { impl From for LockClientError { fn from(err: QueryError) -> Self { match err { - QueryError::InternalServerError(_) => err.into(), - QueryError::ProvisionedThroughputExceeded(_) => { + QueryError::ProvisionedThroughputExceededException(_) => { LockClientError::ProvisionedThroughputExceeded } QueryError::RequestLimitExceeded(_) => LockClientError::ProvisionedThroughputExceeded, - QueryError::ResourceNotFound(_) => LockClientError::LockTableNotFound, + QueryError::ResourceNotFoundException(_) => LockClientError::LockTableNotFound, + _ => LockClientError::GenericDynamoDb { + source: Box::new(err), + }, } } } @@ -103,17 +127,19 @@ impl From for LockClientError { impl From for LockClientError { fn from(err: PutItemError) -> Self { match err { - PutItemError::ConditionalCheckFailed(_) => { + PutItemError::ConditionalCheckFailedException(_) => { unreachable!("error must be handled explicitely") } - PutItemError::InternalServerError(_) => err.into(), - PutItemError::ProvisionedThroughputExceeded(_) => { + PutItemError::ProvisionedThroughputExceededException(_) => { LockClientError::ProvisionedThroughputExceeded } PutItemError::RequestLimitExceeded(_) => LockClientError::ProvisionedThroughputExceeded, - PutItemError::ResourceNotFound(_) => LockClientError::LockTableNotFound, - PutItemError::ItemCollectionSizeLimitExceeded(_) => err.into(), - PutItemError::TransactionConflict(_) => err.into(), + PutItemError::ResourceNotFoundException(_) => LockClientError::LockTableNotFound, + PutItemError::ItemCollectionSizeLimitExceededException(_) => err.into(), + PutItemError::TransactionConflictException(_) => err.into(), + _ => LockClientError::GenericDynamoDb { + source: Box::new(err), + }, } } } @@ -121,34 +147,51 @@ impl From for LockClientError { impl From for LockClientError { fn from(err: UpdateItemError) -> Self { match err { - UpdateItemError::ConditionalCheckFailed(_) => { + UpdateItemError::ConditionalCheckFailedException(_) => { unreachable!("condition check failure in update is not an error") } UpdateItemError::InternalServerError(_) => err.into(), - UpdateItemError::ProvisionedThroughputExceeded(_) => { + UpdateItemError::ProvisionedThroughputExceededException(_) => { LockClientError::ProvisionedThroughputExceeded } UpdateItemError::RequestLimitExceeded(_) => { LockClientError::ProvisionedThroughputExceeded } - UpdateItemError::ResourceNotFound(_) => LockClientError::LockTableNotFound, - UpdateItemError::ItemCollectionSizeLimitExceeded(_) => err.into(), - UpdateItemError::TransactionConflict(_) => err.into(), + UpdateItemError::ResourceNotFoundException(_) => LockClientError::LockTableNotFound, + 
UpdateItemError::ItemCollectionSizeLimitExceededException(_) => err.into(), + UpdateItemError::TransactionConflictException(_) => err.into(), + _ => LockClientError::GenericDynamoDb { + source: Box::new(err), + }, } } } -impl From> for LockClientError -where - E: Into + std::error::Error + Send + Sync + 'static, -{ - fn from(err: RusotoError) -> Self { +impl From for LockClientError { + fn from(err: DeleteItemError) -> Self { match err { - RusotoError::Service(e) => e.into(), - RusotoError::Credentials(e) => LockClientError::Credentials { source: e }, + DeleteItemError::ConditionalCheckFailedException(_) => { + unreachable!("error must be handled explicitly") + } + DeleteItemError::InternalServerError(_) => err.into(), + DeleteItemError::ProvisionedThroughputExceededException(_) => { + LockClientError::ProvisionedThroughputExceeded + } + DeleteItemError::RequestLimitExceeded(_) => { + LockClientError::ProvisionedThroughputExceeded + } + DeleteItemError::ResourceNotFoundException(_) => LockClientError::LockTableNotFound, + DeleteItemError::ItemCollectionSizeLimitExceededException(_) => err.into(), + DeleteItemError::TransactionConflictException(_) => err.into(), _ => LockClientError::GenericDynamoDb { source: Box::new(err), }, } } } + +impl_from_service_error!(GetItemError); +impl_from_service_error!(PutItemError); +impl_from_service_error!(QueryError); +impl_from_service_error!(UpdateItemError); +impl_from_service_error!(DeleteItemError); diff --git a/crates/aws/src/lib.rs b/crates/aws/src/lib.rs index 2630f80512..a0a99c01f0 100644 --- a/crates/aws/src/lib.rs +++ b/crates/aws/src/lib.rs @@ -1,9 +1,23 @@ //! Lock client implementation based on DynamoDb. +mod credentials; pub mod errors; pub mod logstore; +#[cfg(feature = "native-tls")] +mod native; pub mod storage; - +use aws_config::SdkConfig; +use aws_sdk_dynamodb::{ + operation::{ + create_table::CreateTableError, delete_item::DeleteItemError, get_item::GetItemError, + put_item::PutItemError, query::QueryError, update_item::UpdateItemError, + }, + types::{ + AttributeDefinition, AttributeValue, BillingMode, KeySchemaElement, KeyType, + ScalarAttributeType, + }, + Client, +}; use lazy_static::lazy_static; use object_store::aws::AmazonS3ConfigKey; use regex::Regex; @@ -18,21 +32,13 @@ use tracing::debug; use deltalake_core::logstore::{logstores, LogStore, LogStoreFactory}; use deltalake_core::storage::{factories, url_prefix_handler, ObjectStoreRef, StorageOptions}; use deltalake_core::{DeltaResult, Path}; -use rusoto_core::{HttpClient, Region, RusotoError}; -use rusoto_credential::AutoRefreshingProvider; -use rusoto_dynamodb::{ - AttributeDefinition, AttributeValue, CreateTableError, CreateTableInput, DynamoDb, - DynamoDbClient, GetItemError, GetItemInput, KeySchemaElement, PutItemError, PutItemInput, - QueryError, QueryInput, UpdateItemError, UpdateItemInput, -}; -use rusoto_sts::WebIdentityProvider; use url::Url; use errors::{DynamoDbConfigError, LockClientError}; use storage::{S3ObjectStoreFactory, S3StorageOptions}; #[derive(Clone, Debug, Default)] -struct S3LogStoreFactory {} +pub struct S3LogStoreFactory {} impl LogStoreFactory for S3LogStoreFactory { fn with_options( @@ -41,7 +47,7 @@ impl LogStoreFactory for S3LogStoreFactory { location: &Url, options: &StorageOptions, ) -> DeltaResult> { - let store = url_prefix_handler(store, Path::parse(location.path())?)?; + let store = url_prefix_handler(store, Path::parse(location.path())?); if options .0 @@ -53,7 +59,7 @@ impl LogStoreFactory for S3LogStoreFactory { )); } - let s3_options = 
S3StorageOptions::from_map(&options.0); + let s3_options = S3StorageOptions::from_map(&options.0)?; if s3_options.locking_provider.as_deref() != Some("dynamodb") { debug!("S3LogStoreFactory has been asked to create a LogStore without the dynamodb locking provider"); @@ -117,7 +123,7 @@ impl CommitEntry { /// Lock client backed by DynamoDb. pub struct DynamoDbLockClient { /// DynamoDb client - dynamodb_client: DynamoDbClient, + dynamodb_client: Client, /// configuration of the config: DynamoDbConfig, } @@ -131,24 +137,30 @@ impl std::fmt::Debug for DynamoDbLockClient { impl DynamoDbLockClient { /// Creates a new DynamoDbLockClient from the supplied storage options. pub fn try_new( + sdk_config: &SdkConfig, lock_table_name: Option, billing_mode: Option, max_elapsed_request_time: Option, - region: Region, - use_web_identity: bool, + dynamodb_override_endpoint: Option, ) -> Result { - let dynamodb_client = create_dynamodb_client(region.clone(), use_web_identity)?; + let dynamodb_sdk_config = + Self::create_dynamodb_sdk_config(sdk_config, dynamodb_override_endpoint); + + let dynamodb_client = aws_sdk_dynamodb::Client::new(&dynamodb_sdk_config); let lock_table_name = lock_table_name .or_else(|| std::env::var(constants::LOCK_TABLE_KEY_NAME).ok()) .unwrap_or(constants::DEFAULT_LOCK_TABLE_NAME.to_owned()); - let billing_mode = billing_mode + let billing_mode = if let Some(bm) = billing_mode .or_else(|| std::env::var(constants::BILLING_MODE_KEY_NAME).ok()) - .map_or_else( - || Ok(BillingMode::PayPerRequest), - |bm| BillingMode::from_str(&bm), - )?; + .as_ref() + { + BillingMode::try_parse(bm.to_ascii_uppercase().as_str()) + .map_err(|_| DynamoDbConfigError::InvalidBillingMode(String::default()))? + } else { + BillingMode::PayPerRequest + }; let max_elapsed_request_time = max_elapsed_request_time .or_else(|| std::env::var(constants::MAX_ELAPSED_REQUEST_TIME_KEY_NAME).ok()) @@ -162,14 +174,31 @@ impl DynamoDbLockClient { billing_mode, lock_table_name, max_elapsed_request_time, - use_web_identity, - region, + sdk_config: sdk_config.clone(), }; Ok(Self { dynamodb_client, config, }) } + fn create_dynamodb_sdk_config( + sdk_config: &SdkConfig, + dynamodb_override_endpoint: Option, + ) -> SdkConfig { + /* + if dynamodb_override_endpoint exists/AWS_ENDPOINT_URL_DYNAMODB is specified by user + use dynamodb_override_endpoint to create dynamodb client + */ + + match dynamodb_override_endpoint { + Some(dynamodb_endpoint_url) => sdk_config + .to_owned() + .to_builder() + .endpoint_url(dynamodb_endpoint_url) + .build(), + None => sdk_config.to_owned(), + } + } /// Create the lock table where DynamoDb stores the commit information for all delta tables. /// @@ -179,40 +208,50 @@ impl DynamoDbLockClient { /// `active`, so transient failures might occurr when immediately using the lock client. 
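// Editor's illustrative sketch, not part of this patch: how a caller might construct the
// lock client against the new SDK-based `try_new` shown above and then ensure the lock
// table exists. The table name, billing mode string, and local DynamoDB endpoint below
// are placeholder values, not defaults taken from this crate.
async fn example_create_lock_client() -> Result<(), Box<dyn std::error::Error>> {
    let sdk_config = aws_config::load_from_env().await;
    let client = DynamoDbLockClient::try_new(
        &sdk_config,
        Some("delta_rs_lock_table".to_owned()),    // lock_table_name (placeholder)
        Some("pay_per_request".to_owned()),        // billing_mode
        None,                                      // max_elapsed_request_time
        Some("http://localhost:8000".to_owned()),  // dynamodb_override_endpoint (placeholder)
    )?;
    // Safe to call repeatedly: an existing table is reported as TableAlreadyExists.
    client.try_create_lock_table().await?;
    Ok(())
}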
pub async fn try_create_lock_table(&self) -> Result { let attribute_definitions = vec![ - AttributeDefinition { - attribute_name: constants::ATTR_TABLE_PATH.to_owned(), - attribute_type: constants::STRING_TYPE.to_owned(), - }, - AttributeDefinition { - attribute_name: constants::ATTR_FILE_NAME.to_owned(), - attribute_type: constants::STRING_TYPE.to_owned(), - }, + AttributeDefinition::builder() + .attribute_name(constants::ATTR_TABLE_PATH) + .attribute_type(ScalarAttributeType::S) + .build() + .unwrap(), + AttributeDefinition::builder() + .attribute_name(constants::ATTR_FILE_NAME) + .attribute_type(ScalarAttributeType::S) + .build() + .unwrap(), ]; - let input = CreateTableInput { - attribute_definitions, - key_schema: vec![ - KeySchemaElement { - attribute_name: constants::ATTR_TABLE_PATH.to_owned(), - key_type: constants::KEY_TYPE_HASH.to_owned(), - }, - KeySchemaElement { - attribute_name: constants::ATTR_FILE_NAME.to_owned(), - key_type: constants::KEY_TYPE_RANGE.to_owned(), - }, - ], - billing_mode: Some(self.config.billing_mode.to_str()), - table_name: self.config.lock_table_name.clone(), - ..Default::default() - }; - match self.dynamodb_client.create_table(input).await { + let request = self + .dynamodb_client + .create_table() + .set_attribute_definitions(Some(attribute_definitions)) + .set_key_schema(Some(vec![ + KeySchemaElement::builder() + .attribute_name(constants::ATTR_TABLE_PATH.to_owned()) + .key_type(KeyType::Hash) + .build() + .unwrap(), + KeySchemaElement::builder() + .attribute_name(constants::ATTR_FILE_NAME.to_owned()) + .key_type(KeyType::Range) + .build() + .unwrap(), + ])) + .billing_mode(self.config.billing_mode.clone()) + .table_name(&self.config.lock_table_name) + .send(); + match request.await { Ok(_) => Ok(CreateLockTableResult::TableCreated), - Err(RusotoError::Service(CreateTableError::ResourceInUse(_))) => { - Ok(CreateLockTableResult::TableAlreadyExists) - } - Err(reason) => Err(LockClientError::LockTableCreateFailure { - name: self.config.lock_table_name.clone(), - source: reason, - }), + Err(sdk_err) => match sdk_err.as_service_error() { + Some(CreateTableError::ResourceInUseException(_)) => { + Ok(CreateLockTableResult::TableAlreadyExists) + } + Some(_) => Err(LockClientError::LockTableCreateFailure { + name: self.config.lock_table_name.clone(), + source: Box::new(sdk_err.into_service_error()), + }), + _ => Err(LockClientError::GenericDynamoDb { + source: Box::new(sdk_err), + }), + }, } } @@ -238,22 +277,26 @@ impl DynamoDbLockClient { table_path: &str, version: i64, ) -> Result, LockClientError> { - let input = GetItemInput { - consistent_read: Some(true), - table_name: self.config.lock_table_name.clone(), - key: self.get_primary_key(version, table_path), - ..Default::default() - }; let item = self .retry(|| async { - match self.dynamodb_client.get_item(input.clone()).await { + match self + .dynamodb_client + .get_item() + .consistent_read(true) + .table_name(&self.config.lock_table_name) + .set_key(Some(self.get_primary_key(version, table_path))) + .send() + .await + { Ok(x) => Ok(x), - Err(RusotoError::Service(GetItemError::ProvisionedThroughputExceeded(_))) => { - Err(backoff::Error::transient( - LockClientError::ProvisionedThroughputExceeded, - )) - } - Err(err) => Err(backoff::Error::permanent(err.into())), + Err(sdk_err) => match sdk_err.as_service_error() { + Some(GetItemError::ProvisionedThroughputExceededException(_)) => { + Err(backoff::Error::transient( + LockClientError::ProvisionedThroughputExceeded, + )) + } + _ => 
Err(backoff::Error::permanent(sdk_err.into())), + }, } }) .await?; @@ -266,29 +309,33 @@ impl DynamoDbLockClient { table_path: &str, entry: &CommitEntry, ) -> Result<(), LockClientError> { - let item = create_value_map(entry, table_path); - let input = PutItemInput { - condition_expression: Some(constants::CONDITION_EXPR_CREATE.to_owned()), - table_name: self.get_lock_table_name(), - item, - ..Default::default() - }; self.retry(|| async { - match self.dynamodb_client.put_item(input.clone()).await { + let item = create_value_map(entry, table_path); + match self + .dynamodb_client + .put_item() + .condition_expression(constants::CONDITION_EXPR_CREATE.as_str()) + .table_name(self.get_lock_table_name()) + .set_item(Some(item)) + .send() + .await + { Ok(_) => Ok(()), - Err(RusotoError::Service(PutItemError::ProvisionedThroughputExceeded(_))) => Err( - backoff::Error::transient(LockClientError::ProvisionedThroughputExceeded), - ), - Err(RusotoError::Service(PutItemError::ConditionalCheckFailed(_))) => Err( - backoff::Error::permanent(LockClientError::VersionAlreadyExists { - table_path: table_path.to_owned(), - version: entry.version, - }), - ), - Err(RusotoError::Service(PutItemError::ResourceNotFound(_))) => Err( - backoff::Error::permanent(LockClientError::LockTableNotFound), - ), - Err(err) => Err(backoff::Error::permanent(err.into())), + Err(err) => match err.as_service_error() { + Some(PutItemError::ProvisionedThroughputExceededException(_)) => Err( + backoff::Error::transient(LockClientError::ProvisionedThroughputExceeded), + ), + Some(PutItemError::ConditionalCheckFailedException(_)) => Err( + backoff::Error::permanent(LockClientError::VersionAlreadyExists { + table_path: table_path.to_owned(), + version: entry.version, + }), + ), + Some(PutItemError::ResourceNotFoundException(_)) => Err( + backoff::Error::permanent(LockClientError::LockTableNotFound), + ), + _ => Err(backoff::Error::permanent(err.into())), + }, } }) .await @@ -312,25 +359,31 @@ impl DynamoDbLockClient { table_path: &str, limit: i64, ) -> Result, LockClientError> { - let input = QueryInput { - table_name: self.get_lock_table_name(), - consistent_read: Some(true), - limit: Some(limit), - scan_index_forward: Some(false), - key_condition_expression: Some(format!("{} = :tn", constants::ATTR_TABLE_PATH)), - expression_attribute_values: Some( - maplit::hashmap!(":tn".into() => string_attr(table_path)), - ), - ..Default::default() - }; let query_result = self .retry(|| async { - match self.dynamodb_client.query(input.clone()).await { + match self + .dynamodb_client + .query() + .table_name(self.get_lock_table_name()) + .consistent_read(true) + .limit(limit.try_into().unwrap_or(i32::MAX)) + .scan_index_forward(false) + .key_condition_expression(format!("{} = :tn", constants::ATTR_TABLE_PATH)) + .set_expression_attribute_values(Some( + maplit::hashmap!(":tn".into() => string_attr(table_path)), + )) + .send() + .await + { Ok(result) => Ok(result), - Err(RusotoError::Service(QueryError::ProvisionedThroughputExceeded(_))) => Err( - backoff::Error::transient(LockClientError::ProvisionedThroughputExceeded), - ), - Err(err) => Err(backoff::Error::permanent(err.into())), + Err(sdk_err) => match sdk_err.as_service_error() { + Some(QueryError::ProvisionedThroughputExceededException(_)) => { + Err(backoff::Error::transient( + LockClientError::ProvisionedThroughputExceeded, + )) + } + _ => Err(backoff::Error::permanent(sdk_err.into())), + }, } }) .await?; @@ -354,31 +407,69 @@ impl DynamoDbLockClient { 
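// Editor's sketch of the retry policy used by the DynamoDB calls in this impl block
// (not part of the patch): provisioned-throughput throttling is classified as
// `backoff::Error::transient` and retried with exponential backoff, while any other
// failure is `permanent` and returned immediately. The helper name is illustrative.
async fn retry_with_backoff<T, E, F, Fut>(
    max_elapsed: std::time::Duration,
    operation: F,
) -> Result<T, E>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<T, backoff::Error<E>>>,
{
    // Give up once the configured max elapsed request time has passed.
    let policy = backoff::ExponentialBackoffBuilder::new()
        .with_max_elapsed_time(Some(max_elapsed))
        .build();
    backoff::future::retry(policy, operation).await
}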
.duration_since(SystemTime::UNIX_EPOCH) .unwrap() .as_secs(); - let input = UpdateItemInput { - table_name: self.get_lock_table_name(), - key: self.get_primary_key(version, table_path), - update_expression: Some("SET complete = :c, expireTime = :e".to_owned()), - expression_attribute_values: Some(maplit::hashmap! { - ":c".to_owned() => string_attr("true"), - ":e".to_owned() => num_attr(seconds_since_epoch), - ":f".into() => string_attr("false"), - }), - condition_expression: Some(constants::CONDITION_UPDATE_INCOMPLETE.to_owned()), - ..Default::default() - }; - self.retry(|| async { - match self.dynamodb_client.update_item(input.clone()).await { + match self + .dynamodb_client + .update_item() + .table_name(self.get_lock_table_name()) + .set_key(Some(self.get_primary_key(version, table_path))) + .update_expression("SET complete = :c, expireTime = :e".to_owned()) + .set_expression_attribute_values(Some(maplit::hashmap! { + ":c".to_owned() => string_attr("true"), + ":e".to_owned() => num_attr(seconds_since_epoch), + ":f".into() => string_attr("false"), + })) + .condition_expression(constants::CONDITION_UPDATE_INCOMPLETE) + .send() + .await + { Ok(_) => Ok(UpdateLogEntryResult::UpdatePerformed), - Err(RusotoError::Service(UpdateItemError::ConditionalCheckFailed(_))) => { - Ok(UpdateLogEntryResult::AlreadyCompleted) - } - Err(RusotoError::Service(UpdateItemError::ProvisionedThroughputExceeded(_))) => { - Err(backoff::Error::transient( - LockClientError::ProvisionedThroughputExceeded, - )) - } - Err(err) => Err(backoff::Error::permanent(err.into())), + Err(err) => match err.as_service_error() { + Some(UpdateItemError::ProvisionedThroughputExceededException(_)) => Err( + backoff::Error::transient(LockClientError::ProvisionedThroughputExceeded), + ), + Some(UpdateItemError::ConditionalCheckFailedException(_)) => { + Ok(UpdateLogEntryResult::AlreadyCompleted) + } + _ => Err(backoff::Error::permanent(err.into())), + }, + } + }) + .await + } + + /// Delete existing log entry if it is not already complete + pub async fn delete_commit_entry( + &self, + version: i64, + table_path: &str, + ) -> Result<(), LockClientError> { + self.retry(|| async { + match self + .dynamodb_client + .delete_item() + .table_name(self.get_lock_table_name()) + .set_key(Some(self.get_primary_key(version, table_path))) + .set_expression_attribute_values(Some(maplit::hashmap! 
{ + ":f".into() => string_attr("false"), + })) + .condition_expression(constants::CONDITION_DELETE_INCOMPLETE.as_str()) + .send() + .await + { + Ok(_) => Ok(()), + Err(err) => match err.as_service_error() { + Some(DeleteItemError::ProvisionedThroughputExceededException(_)) => Err( + backoff::Error::transient(LockClientError::ProvisionedThroughputExceeded), + ), + Some(DeleteItemError::ConditionalCheckFailedException(_)) => Err( + backoff::Error::permanent(LockClientError::VersionAlreadyCompleted { + table_path: table_path.to_owned(), + version, + }), + ), + _ => Err(backoff::Error::permanent(err.into())), + }, } }) .await @@ -467,40 +558,23 @@ fn create_value_map( value_map } -#[derive(Debug, PartialEq)] -pub enum BillingMode { - PayPerRequest, - Provisioned, -} - -impl BillingMode { - fn to_str(&self) -> String { - match self { - Self::PayPerRequest => "PAY_PER_REQUEST".to_owned(), - Self::Provisioned => "PROVISIONED".to_owned(), - } - } -} - -impl FromStr for BillingMode { - type Err = DynamoDbConfigError; - - fn from_str(s: &str) -> Result { - match s.to_ascii_lowercase().as_str() { - "provisioned" => Ok(BillingMode::Provisioned), - "pay_per_request" => Ok(BillingMode::PayPerRequest), - _ => Err(DynamoDbConfigError::InvalidBillingMode(s.to_owned())), - } - } -} - -#[derive(Debug, PartialEq)] +#[derive(Debug)] pub struct DynamoDbConfig { pub billing_mode: BillingMode, pub lock_table_name: String, pub max_elapsed_request_time: Duration, - pub use_web_identity: bool, - pub region: Region, + pub sdk_config: SdkConfig, +} + +impl Eq for DynamoDbConfig {} +impl PartialEq for DynamoDbConfig { + fn eq(&self, other: &Self) -> bool { + self.billing_mode == other.billing_mode + && self.lock_table_name == other.lock_table_name + && self.max_elapsed_request_time == other.max_elapsed_request_time + && self.sdk_config.endpoint_url() == other.sdk_config.endpoint_url() + && self.sdk_config.region() == other.sdk_config.region() + } } /// Represents the possible, positive outcomes of calling `DynamoDbClient::try_create_lock_table()` @@ -538,6 +612,10 @@ pub mod constants { pub static ref CONDITION_EXPR_CREATE: String = format!( "attribute_not_exists({ATTR_TABLE_PATH}) and attribute_not_exists({ATTR_FILE_NAME})" ); + + pub static ref CONDITION_DELETE_INCOMPLETE: String = format!( + "(complete = :f) or (attribute_not_exists({ATTR_TABLE_PATH}) and attribute_not_exists({ATTR_FILE_NAME}))" + ); } pub const CONDITION_UPDATE_INCOMPLETE: &str = "complete = :f"; @@ -545,23 +623,6 @@ pub mod constants { pub const DEFAULT_COMMIT_ENTRY_EXPIRATION_DELAY: Duration = Duration::from_secs(86_400); } -fn create_dynamodb_client( - region: Region, - use_web_identity: bool, -) -> Result { - Ok(match use_web_identity { - true => { - let dispatcher = HttpClient::new()?; - rusoto_dynamodb::DynamoDbClient::new_with( - dispatcher, - get_web_identity_provider()?, - region, - ) - } - false => rusoto_dynamodb::DynamoDbClient::new(region), - }) -} - /// Extract a field from an item's attribute value map, producing a descriptive error /// of the various failure cases. fn extract_required_string_field<'a>( @@ -573,12 +634,11 @@ fn extract_required_string_field<'a>( .ok_or_else(|| LockClientError::InconsistentData { description: format!("mandatory string field '{field_name}' missing"), })? 
- .s - .as_ref() - .ok_or_else(|| LockClientError::InconsistentData { + .as_s() + .map_err(|v| LockClientError::InconsistentData { description: format!( "mandatory string field '{field_name}' exists, but is not a string: {:#?}", - fields.get(field_name) + v, ), }) .map(|s| s.as_str()) @@ -593,35 +653,21 @@ fn extract_optional_number_field<'a>( fields .get(field_name) .map(|attr| { - attr.n - .as_ref() - .ok_or_else(|| LockClientError::InconsistentData { - description: format!( - "field with name '{field_name}' exists, but is not of type number" - ), - }) + attr.as_n().map_err(|_| LockClientError::InconsistentData { + description: format!( + "field with name '{field_name}' exists, but is not of type number" + ), + }) }) .transpose() } fn string_attr(s: T) -> AttributeValue { - AttributeValue { - s: Some(s.to_string()), - ..Default::default() - } + AttributeValue::S(s.to_string()) } fn num_attr(n: T) -> AttributeValue { - AttributeValue { - n: Some(n.to_string()), - ..Default::default() - } -} - -fn get_web_identity_provider( -) -> Result, DynamoDbConfigError> { - let provider = WebIdentityProvider::from_k8s_env(); - Ok(AutoRefreshingProvider::new(provider)?) + AttributeValue::N(n.to_string()) } lazy_static! { @@ -639,6 +685,7 @@ fn extract_version_from_filename(name: &str) -> Option { #[cfg(test)] mod tests { use super::*; + use aws_config::Region; use object_store::memory::InMemory; use serial_test::serial; @@ -687,4 +734,31 @@ mod tests { .unwrap(); assert_eq!(logstore.name(), "DefaultLogStore"); } + + #[test] + #[serial] + fn test_create_dynamodb_sdk_config() { + let sdk_config = SdkConfig::builder() + .region(Region::from_static("eu-west-1")) + .endpoint_url("http://localhost:1234") + .build(); + let dynamodb_sdk_config = DynamoDbLockClient::create_dynamodb_sdk_config( + &sdk_config, + Some("http://localhost:2345".to_string()), + ); + assert_eq!( + dynamodb_sdk_config.endpoint_url(), + Some("http://localhost:2345"), + ); + assert_eq!( + dynamodb_sdk_config.region().unwrap().to_string(), + "eu-west-1".to_string(), + ); + let dynamodb_sdk_no_override_config = + DynamoDbLockClient::create_dynamodb_sdk_config(&sdk_config, None); + assert_eq!( + dynamodb_sdk_no_override_config.endpoint_url(), + Some("http://localhost:1234"), + ); + } } diff --git a/crates/aws/src/logstore.rs b/crates/aws/src/logstore.rs index 123aadd2d1..fe569256ee 100644 --- a/crates/aws/src/logstore.rs +++ b/crates/aws/src/logstore.rs @@ -45,6 +45,7 @@ impl S3DynamoDbLogStore { object_store: ObjectStoreRef, ) -> DeltaResult { let lock_client = DynamoDbLockClient::try_new( + &s3_options.sdk_config, s3_options .extra_opts .get(constants::LOCK_TABLE_KEY_NAME) @@ -57,13 +58,12 @@ impl S3DynamoDbLogStore { .extra_opts .get(constants::MAX_ELAPSED_REQUEST_TIME_KEY_NAME) .cloned(), - s3_options.region.clone(), - s3_options.use_web_identity, + s3_options.dynamodb_endpoint.clone(), ) .map_err(|err| DeltaTableError::ObjectStore { source: ObjectStoreError::Generic { store: STORE_NAME, - source: err.into(), + source: Box::new(err), }, })?; let table_path = to_uri(&location, &Path::from("")); @@ -240,6 +240,36 @@ impl LogStore for S3DynamoDbLogStore { Ok(()) } + /// Tries to abort an entry by first deleting the commit log entry, then deleting the temp commit file + async fn abort_commit_entry( + &self, + version: i64, + tmp_commit: &Path, + ) -> Result<(), TransactionError> { + self.lock_client + .delete_commit_entry(version, &self.table_path) + .await + .map_err(|err| match err { + LockClientError::ProvisionedThroughputExceeded => 
todo!( + "deltalake-aws does not yet handle DynamoDB provisioned throughput errors" + ), + LockClientError::VersionAlreadyCompleted { version, .. } => { + error!("Trying to abort a completed commit"); + TransactionError::LogStoreError { + msg: format!("trying to abort a completed log entry: {}", version), + source: Box::new(err), + } + } + err => TransactionError::LogStoreError { + msg: "dynamodb client failed to delete log entry".to_owned(), + source: Box::new(err), + }, + })?; + + abort_commit_entry(&self.storage, version, tmp_commit).await?; + Ok(()) + } + async fn get_latest_version(&self, current_version: i64) -> DeltaResult { debug!("Retrieving latest version of {self:?} at v{current_version}"); let entry = self diff --git a/crates/aws/src/native.rs b/crates/aws/src/native.rs new file mode 100644 index 0000000000..c647194eb7 --- /dev/null +++ b/crates/aws/src/native.rs @@ -0,0 +1,12 @@ +use aws_sdk_sts::config::SharedHttpClient; +use aws_smithy_runtime::client::http::hyper_014::HyperClientBuilder; + +pub fn use_native_tls_client(allow_http: bool) -> SharedHttpClient { + let mut tls_connector = hyper_tls::HttpsConnector::new(); + if allow_http { + tls_connector.https_only(false); + } + + let client = HyperClientBuilder::new().build(tls_connector); + client +} diff --git a/crates/aws/src/storage.rs b/crates/aws/src/storage.rs index 87d488b54f..4625bb6be9 100644 --- a/crates/aws/src/storage.rs +++ b/crates/aws/src/storage.rs @@ -1,23 +1,32 @@ //! AWS S3 storage backend. +use aws_config::meta::region::ProvideRegion; +use aws_config::provider_config::ProviderConfig; +use aws_config::{Region, SdkConfig}; use bytes::Bytes; use deltalake_core::storage::object_store::{ - aws::AmazonS3ConfigKey, parse_url_opts, GetOptions, GetResult, ListResult, MultipartId, - ObjectMeta, ObjectStore, PutOptions, PutResult, Result as ObjectStoreResult, + aws::AmazonS3ConfigKey, parse_url_opts, GetOptions, GetResult, ListResult, ObjectMeta, + ObjectStore, PutOptions, PutResult, Result as ObjectStoreResult, +}; +use deltalake_core::storage::{ + limit_store_handler, str_is_truthy, ObjectStoreFactory, ObjectStoreRef, StorageOptions, }; -use deltalake_core::storage::{str_is_truthy, ObjectStoreFactory, ObjectStoreRef, StorageOptions}; use deltalake_core::{DeltaResult, ObjectStoreError, Path}; use futures::stream::BoxStream; -use rusoto_core::Region; +use futures::Future; +use object_store::{MultipartUpload, PutMultipartOpts, PutPayload}; use std::collections::HashMap; use std::fmt::Debug; use std::ops::Range; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use tokio::io::AsyncWrite; use url::Url; +use crate::errors::DynamoDbConfigError; +#[cfg(feature = "native-tls")] +use crate::native; + const STORE_NAME: &str = "DeltaS3ObjectStore"; #[derive(Clone, Default, Debug)] @@ -25,7 +34,21 @@ pub struct S3ObjectStoreFactory {} impl S3ObjectStoreFactory { fn with_env_s3(&self, options: &StorageOptions) -> StorageOptions { - let mut options = options.clone(); + let mut options = StorageOptions( + options + .0 + .clone() + .into_iter() + .map(|(k, v)| { + if let Ok(config_key) = AmazonS3ConfigKey::from_str(&k.to_ascii_lowercase()) { + (config_key.as_ref().to_string(), v) + } else { + (k, v) + } + }) + .collect(), + ); + for (os_key, os_value) in std::env::vars_os() { if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { if let Ok(config_key) = AmazonS3ConfigKey::from_str(&key.to_ascii_lowercase()) { @@ -45,10 +68,10 @@ impl ObjectStoreFactory for S3ObjectStoreFactory { fn 
parse_url_opts( &self, url: &Url, - options: &StorageOptions, + storage_options: &StorageOptions, ) -> DeltaResult<(ObjectStoreRef, Path)> { - let options = self.with_env_s3(options); - let (store, prefix) = parse_url_opts( + let options = self.with_env_s3(storage_options); + let (inner, prefix) = parse_url_opts( url, options.0.iter().filter_map(|(key, value)| { let s3_key = AmazonS3ConfigKey::from_str(&key.to_ascii_lowercase()).ok()?; @@ -56,58 +79,70 @@ impl ObjectStoreFactory for S3ObjectStoreFactory { }), )?; + let store = limit_store_handler(inner, &options); + + // If the copy-if-not-exists env var is set, we don't need to instantiate a locking client or check for allow-unsafe-rename. if options .0 .contains_key(AmazonS3ConfigKey::CopyIfNotExists.as_ref()) { - // If the copy-if-not-exists env var is set, we don't need to instantiate a locking client or check for allow-unsafe-rename. - return Ok((Arc::from(store), prefix)); - } - - let options = S3StorageOptions::from_map(&options.0); + Ok((store, prefix)) + } else { + let s3_options = S3StorageOptions::from_map(&storage_options.0)?; - let store = S3StorageBackend::try_new( - store.into(), - Some("dynamodb") == options.locking_provider.as_deref() || options.allow_unsafe_rename, - )?; + let store = S3StorageBackend::try_new( + store, + Some("dynamodb") == s3_options.locking_provider.as_deref() + || s3_options.allow_unsafe_rename, + )?; - Ok((Arc::new(store), prefix)) + Ok((Arc::new(store), prefix)) + } } } /// Options used to configure the [S3StorageBackend]. /// /// Available options are described in [s3_constants]. -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(Clone, Debug)] #[allow(missing_docs)] pub struct S3StorageOptions { - pub endpoint_url: Option, - pub region: Region, - pub profile: Option, - pub aws_access_key_id: Option, - pub aws_secret_access_key: Option, - pub aws_session_token: Option, pub virtual_hosted_style_request: bool, pub locking_provider: Option, - pub assume_role_arn: Option, - pub assume_role_session_name: Option, - pub use_web_identity: bool, + pub dynamodb_endpoint: Option, pub s3_pool_idle_timeout: Duration, pub sts_pool_idle_timeout: Duration, pub s3_get_internal_server_error_retries: usize, pub allow_unsafe_rename: bool, pub extra_opts: HashMap, + pub sdk_config: SdkConfig, +} + +impl Eq for S3StorageOptions {} +impl PartialEq for S3StorageOptions { + fn eq(&self, other: &Self) -> bool { + self.virtual_hosted_style_request == other.virtual_hosted_style_request + && self.locking_provider == other.locking_provider + && self.dynamodb_endpoint == other.dynamodb_endpoint + && self.s3_pool_idle_timeout == other.s3_pool_idle_timeout + && self.sts_pool_idle_timeout == other.sts_pool_idle_timeout + && self.s3_get_internal_server_error_retries + == other.s3_get_internal_server_error_retries + && self.allow_unsafe_rename == other.allow_unsafe_rename + && self.extra_opts == other.extra_opts + && self.sdk_config.endpoint_url() == other.sdk_config.endpoint_url() + && self.sdk_config.region() == other.sdk_config.region() + } } impl S3StorageOptions { /// Creates an instance of S3StorageOptions from the given HashMap. 
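// Editor's illustrative call site, not part of this patch: `from_map` is now fallible
// because it resolves an `SdkConfig` eagerly, so callers receive a `DeltaResult`. The
// endpoint, region, and locking provider below are placeholders for a local setup.
fn example_storage_options() -> deltalake_core::DeltaResult<S3StorageOptions> {
    let mut raw = std::collections::HashMap::new();
    raw.insert("AWS_ENDPOINT_URL".to_owned(), "http://localhost:9000".to_owned());
    raw.insert("AWS_REGION".to_owned(), "us-east-1".to_owned());
    raw.insert("AWS_S3_LOCKING_PROVIDER".to_owned(), "dynamodb".to_owned());
    S3StorageOptions::from_map(&raw)
}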
- pub fn from_map(options: &HashMap) -> S3StorageOptions { + pub fn from_map(options: &HashMap) -> DeltaResult { let extra_opts = options .iter() .filter(|(k, _)| !s3_constants::S3_OPTS.contains(&k.as_str())) .map(|(k, v)| (k.to_owned(), v.to_owned())) .collect(); - // Copy web identity values provided in options but not the environment into the environment // to get picked up by the `from_k8s_env` call in `get_web_identity_provider`. Self::ensure_env_var(options, s3_constants::AWS_REGION); @@ -118,18 +153,6 @@ impl S3StorageOptions { Self::ensure_env_var(options, s3_constants::AWS_WEB_IDENTITY_TOKEN_FILE); Self::ensure_env_var(options, s3_constants::AWS_ROLE_ARN); Self::ensure_env_var(options, s3_constants::AWS_ROLE_SESSION_NAME); - - let endpoint_url = str_option(options, s3_constants::AWS_ENDPOINT_URL); - let region = if let Some(endpoint_url) = endpoint_url.as_ref() { - Region::Custom { - name: Self::str_or_default(options, s3_constants::AWS_REGION, "custom".to_string()), - endpoint: endpoint_url.to_owned(), - } - } else { - Region::default() - }; - let profile = str_option(options, s3_constants::AWS_PROFILE); - let s3_pool_idle_timeout = Self::u64_or_default(options, s3_constants::AWS_S3_POOL_IDLE_TIMEOUT_SECONDS, 15); let sts_pool_idle_timeout = @@ -149,31 +172,81 @@ impl S3StorageOptions { let allow_unsafe_rename = str_option(options, s3_constants::AWS_S3_ALLOW_UNSAFE_RENAME) .map(|val| str_is_truthy(&val)) .unwrap_or(false); + let disable_imds = str_option(options, s3_constants::AWS_EC2_METADATA_DISABLED) + .map(|val| str_is_truthy(&val)) + .unwrap_or(true); + let imds_timeout = + Self::u64_or_default(options, s3_constants::AWS_EC2_METADATA_TIMEOUT, 100); + let (loader, provider_config) = + if let Some(endpoint_url) = str_option(options, s3_constants::AWS_ENDPOINT_URL) { + let (region_provider, provider_config) = Self::create_provider_config( + str_option(options, s3_constants::AWS_REGION) + .or_else(|| std::env::var("AWS_DEFAULT_REGION").ok()) + .map_or(Region::from_static("custom"), Region::new), + )?; + let loader = aws_config::from_env() + .endpoint_url(endpoint_url) + .region(region_provider); + (loader, provider_config) + } else { + let (region_provider, provider_config) = Self::create_provider_config( + crate::credentials::new_region_provider(disable_imds, imds_timeout), + )?; + ( + aws_config::from_env().region(region_provider), + provider_config, + ) + }; + + let credentials_provider = crate::credentials::ConfiguredCredentialChain::new( + disable_imds, + imds_timeout, + &provider_config, + ); + #[cfg(feature = "native-tls")] + let sdk_config = execute_sdk_future( + loader + .http_client(native::use_native_tls_client( + str_option(options, s3_constants::AWS_ALLOW_HTTP) + .map(|val| str_is_truthy(&val)) + .unwrap_or(false), + )) + .credentials_provider(credentials_provider) + .load(), + )?; + #[cfg(feature = "rustls")] + let sdk_config = + execute_sdk_future(loader.credentials_provider(credentials_provider).load())?; - Self { - endpoint_url, - region, - profile, - aws_access_key_id: str_option(options, s3_constants::AWS_ACCESS_KEY_ID), - aws_secret_access_key: str_option(options, s3_constants::AWS_SECRET_ACCESS_KEY), - aws_session_token: str_option(options, s3_constants::AWS_SESSION_TOKEN), + Ok(Self { virtual_hosted_style_request, locking_provider: str_option(options, s3_constants::AWS_S3_LOCKING_PROVIDER), - assume_role_arn: str_option(options, s3_constants::AWS_S3_ASSUME_ROLE_ARN), - assume_role_session_name: str_option(options, s3_constants::AWS_S3_ROLE_SESSION_NAME), - 
use_web_identity: std::env::var(s3_constants::AWS_WEB_IDENTITY_TOKEN_FILE).is_ok(), + dynamodb_endpoint: str_option(options, s3_constants::AWS_ENDPOINT_URL_DYNAMODB), s3_pool_idle_timeout: Duration::from_secs(s3_pool_idle_timeout), sts_pool_idle_timeout: Duration::from_secs(sts_pool_idle_timeout), s3_get_internal_server_error_retries, allow_unsafe_rename, extra_opts, - } + sdk_config, + }) + } + + pub fn endpoint_url(&self) -> Option<&str> { + self.sdk_config.endpoint_url() } - fn str_or_default(map: &HashMap, key: &str, default: String) -> String { - map.get(key) - .map(|v| v.to_owned()) - .unwrap_or_else(|| std::env::var(key).unwrap_or(default)) + pub fn region(&self) -> Option<&Region> { + self.sdk_config.region() + } + + fn create_provider_config( + region_provider: T, + ) -> DeltaResult<(T, ProviderConfig)> { + let region = execute_sdk_future(region_provider.region())?; + Ok(( + region_provider, + ProviderConfig::default().with_region(region), + )) } fn u64_or_default(map: &HashMap, key: &str, default: u64) -> u64 { @@ -187,15 +260,47 @@ impl S3StorageOptions { std::env::set_var(key, val); } } -} -impl Default for S3StorageOptions { - /// Creates an instance of S3StorageOptions from environment variables. - fn default() -> S3StorageOptions { + pub fn try_default() -> DeltaResult { Self::from_map(&HashMap::new()) } } +fn execute_sdk_future(future: F) -> DeltaResult +where + T: Send, + F: Future + Send, +{ + match tokio::runtime::Handle::try_current() { + Ok(handle) => match handle.runtime_flavor() { + tokio::runtime::RuntimeFlavor::MultiThread => { + Ok(tokio::task::block_in_place(move || handle.block_on(future))) + } + _ => { + let mut cfg: Option = None; + std::thread::scope(|scope| { + scope.spawn(|| { + cfg = Some(handle.block_on(future)); + }); + }); + cfg.ok_or(deltalake_core::DeltaTableError::ObjectStore { + source: ObjectStoreError::Generic { + store: STORE_NAME, + source: Box::new(DynamoDbConfigError::InitializationError), + }, + }) + } + }, + Err(_) => { + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("a tokio runtime is required by the AWS sdk"); + Ok(runtime.block_on(future)) + } + } +} + /// An S3 implementation of the [ObjectStore] trait pub struct S3StorageBackend { inner: ObjectStoreRef, @@ -229,14 +334,14 @@ impl std::fmt::Debug for S3StorageBackend { #[async_trait::async_trait] impl ObjectStore for S3StorageBackend { - async fn put(&self, location: &Path, bytes: Bytes) -> ObjectStoreResult { + async fn put(&self, location: &Path, bytes: PutPayload) -> ObjectStoreResult { self.inner.put(location, bytes).await } async fn put_opts( &self, location: &Path, - bytes: Bytes, + bytes: PutPayload, options: PutOptions, ) -> ObjectStoreResult { self.inner.put_opts(location, bytes, options).await @@ -297,19 +402,16 @@ impl ObjectStore for S3StorageBackend { } } - async fn put_multipart( - &self, - location: &Path, - ) -> ObjectStoreResult<(MultipartId, Box)> { + async fn put_multipart(&self, location: &Path) -> ObjectStoreResult> { self.inner.put_multipart(location).await } - async fn abort_multipart( + async fn put_multipart_opts( &self, location: &Path, - multipart_id: &MultipartId, - ) -> ObjectStoreResult<()> { - self.inner.abort_multipart(location, multipart_id).await + options: PutMultipartOpts, + ) -> ObjectStoreResult> { + self.inner.put_multipart_opts(location, options).await } } @@ -319,6 +421,10 @@ impl ObjectStore for S3StorageBackend { pub mod s3_constants { /// Custom S3 endpoint. 
pub const AWS_ENDPOINT_URL: &str = "AWS_ENDPOINT_URL"; + /// Custom DynamoDB endpoint. + /// If DynamoDB endpoint is not supplied, will use S3 endpoint (AWS_ENDPOINT_URL) + /// If it is supplied, this endpoint takes precedence over the global endpoint set in AWS_ENDPOINT_URL for DynamoDB + pub const AWS_ENDPOINT_URL_DYNAMODB: &str = "AWS_ENDPOINT_URL_DYNAMODB"; /// The AWS region. pub const AWS_REGION: &str = "AWS_REGION"; /// The AWS profile. @@ -375,11 +481,20 @@ pub mod s3_constants { /// Only safe if there is one writer to a given table. pub const AWS_S3_ALLOW_UNSAFE_RENAME: &str = "AWS_S3_ALLOW_UNSAFE_RENAME"; + /// If set to "true", disables the imds client + /// Defaults to "true" + pub const AWS_EC2_METADATA_DISABLED: &str = "AWS_EC2_METADATA_DISABLED"; + + /// The timeout in milliseconds for the EC2 metadata endpoint + /// Defaults to 100 + pub const AWS_EC2_METADATA_TIMEOUT: &str = "AWS_EC2_METADATA_TIMEOUT"; + /// The list of option keys owned by the S3 module. /// Option keys not contained in this list will be added to the `extra_opts` /// field of [crate::storage::s3::S3StorageOptions]. pub const S3_OPTS: &[&str] = &[ AWS_ENDPOINT_URL, + AWS_ENDPOINT_URL_DYNAMODB, AWS_REGION, AWS_PROFILE, AWS_ACCESS_KEY_ID, @@ -394,216 +509,412 @@ pub mod s3_constants { AWS_S3_POOL_IDLE_TIMEOUT_SECONDS, AWS_STS_POOL_IDLE_TIMEOUT_SECONDS, AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES, + AWS_EC2_METADATA_DISABLED, + AWS_EC2_METADATA_TIMEOUT, ]; } pub(crate) fn str_option(map: &HashMap, key: &str) -> Option { - map.get(key) - .map_or_else(|| std::env::var(key).ok(), |v| Some(v.to_owned())) + if let Some(s) = map.get(key) { + return Some(s.to_owned()); + } + + if let Some(s) = map.get(&key.to_ascii_lowercase()) { + return Some(s.to_owned()); + } + + std::env::var(key).ok() } #[cfg(test)] mod tests { + use std::time::SystemTime; + use super::*; + use aws_sdk_sts::config::ProvideCredentials; use maplit::hashmap; use serial_test::serial; + struct ScopedEnv { + vars: HashMap, + } + + impl ScopedEnv { + pub fn new() -> Self { + let vars = std::env::vars_os().collect(); + Self { vars } + } + + pub fn run(mut f: impl FnMut() -> T) -> T { + let _env_scope = Self::new(); + f() + } + + pub async fn run_async(future: F) -> F::Output + where + F: Future + Send + 'static, + F::Output: Send + 'static, + { + let _env_scope = Self::new(); + future.await + } + } + + impl Drop for ScopedEnv { + fn drop(&mut self) { + let to_remove: Vec<_> = std::env::vars_os() + .map(|kv| kv.0) + .filter(|k| !self.vars.contains_key(k)) + .collect(); + for k in to_remove { + std::env::remove_var(k); + } + for (key, value) in self.vars.drain() { + std::env::set_var(key, value); + } + } + } + + fn clear_env_of_aws_keys() { + let keys_to_clear = std::env::vars().filter_map(|(k, _v)| { + if AmazonS3ConfigKey::from_str(&k.to_ascii_lowercase()).is_ok() { + Some(k) + } else { + None + } + }); + + for k in keys_to_clear { + std::env::remove_var(k); + } + } + #[test] #[serial] fn storage_options_default_test() { - std::env::set_var(s3_constants::AWS_ENDPOINT_URL, "http://localhost"); - std::env::set_var(s3_constants::AWS_REGION, "us-west-1"); - std::env::set_var(s3_constants::AWS_PROFILE, "default"); - std::env::set_var(s3_constants::AWS_ACCESS_KEY_ID, "default_key_id"); - std::env::set_var(s3_constants::AWS_SECRET_ACCESS_KEY, "default_secret_key"); - std::env::set_var(s3_constants::AWS_S3_LOCKING_PROVIDER, "dynamodb"); - std::env::set_var( - s3_constants::AWS_S3_ASSUME_ROLE_ARN, - "arn:aws:iam::123456789012:role/some_role", - ); - 
std::env::set_var(s3_constants::AWS_S3_ROLE_SESSION_NAME, "session_name"); - std::env::set_var(s3_constants::AWS_WEB_IDENTITY_TOKEN_FILE, "token_file"); - std::env::remove_var(s3_constants::AWS_S3_POOL_IDLE_TIMEOUT_SECONDS); - std::env::remove_var(s3_constants::AWS_STS_POOL_IDLE_TIMEOUT_SECONDS); - std::env::remove_var(s3_constants::AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES); - - let options = S3StorageOptions::default(); - - assert_eq!( - S3StorageOptions { - endpoint_url: Some("http://localhost".to_string()), - region: Region::Custom { - name: "us-west-1".to_string(), - endpoint: "http://localhost".to_string() + ScopedEnv::run(|| { + clear_env_of_aws_keys(); + + std::env::set_var(s3_constants::AWS_ENDPOINT_URL, "http://localhost"); + std::env::set_var(s3_constants::AWS_REGION, "us-west-1"); + std::env::set_var(s3_constants::AWS_PROFILE, "default"); + std::env::set_var(s3_constants::AWS_ACCESS_KEY_ID, "default_key_id"); + std::env::set_var(s3_constants::AWS_SECRET_ACCESS_KEY, "default_secret_key"); + std::env::set_var(s3_constants::AWS_S3_LOCKING_PROVIDER, "dynamodb"); + std::env::set_var( + s3_constants::AWS_S3_ASSUME_ROLE_ARN, + "arn:aws:iam::123456789012:role/some_role", + ); + std::env::set_var(s3_constants::AWS_S3_ROLE_SESSION_NAME, "session_name"); + std::env::set_var(s3_constants::AWS_WEB_IDENTITY_TOKEN_FILE, "token_file"); + + let options = S3StorageOptions::try_default().unwrap(); + assert_eq!( + S3StorageOptions { + sdk_config: SdkConfig::builder() + .endpoint_url("http://localhost".to_string()) + .region(Region::from_static("us-west-1")) + .build(), + virtual_hosted_style_request: false, + locking_provider: Some("dynamodb".to_string()), + dynamodb_endpoint: None, + s3_pool_idle_timeout: Duration::from_secs(15), + sts_pool_idle_timeout: Duration::from_secs(10), + s3_get_internal_server_error_retries: 10, + extra_opts: HashMap::new(), + allow_unsafe_rename: false, }, - profile: Some("default".to_string()), - aws_access_key_id: Some("default_key_id".to_string()), - aws_secret_access_key: Some("default_secret_key".to_string()), - aws_session_token: None, - virtual_hosted_style_request: false, - assume_role_arn: Some("arn:aws:iam::123456789012:role/some_role".to_string()), - assume_role_session_name: Some("session_name".to_string()), - use_web_identity: true, - locking_provider: Some("dynamodb".to_string()), - s3_pool_idle_timeout: Duration::from_secs(15), - sts_pool_idle_timeout: Duration::from_secs(10), - s3_get_internal_server_error_retries: 10, - extra_opts: HashMap::new(), - allow_unsafe_rename: false, - }, - options - ); + options + ); + }); } #[test] #[serial] fn storage_options_with_only_region_and_credentials() { - std::env::remove_var(s3_constants::AWS_ENDPOINT_URL); - let options = S3StorageOptions::from_map(&hashmap! { - s3_constants::AWS_REGION.to_string() => "eu-west-1".to_string(), - s3_constants::AWS_ACCESS_KEY_ID.to_string() => "test".to_string(), - s3_constants::AWS_SECRET_ACCESS_KEY.to_string() => "test_secret".to_string(), - }); + ScopedEnv::run(|| { + clear_env_of_aws_keys(); + std::env::remove_var(s3_constants::AWS_ENDPOINT_URL); + let options = S3StorageOptions::from_map(&hashmap! 
{ + s3_constants::AWS_REGION.to_string() => "eu-west-1".to_string(), + s3_constants::AWS_ACCESS_KEY_ID.to_string() => "test".to_string(), + s3_constants::AWS_SECRET_ACCESS_KEY.to_string() => "test_secret".to_string(), + }) + .unwrap(); - assert_eq!( - S3StorageOptions { - endpoint_url: None, - region: Region::default(), - aws_access_key_id: Some("test".to_string()), - aws_secret_access_key: Some("test_secret".to_string()), - ..Default::default() - }, - options - ); + let mut expected = S3StorageOptions::try_default().unwrap(); + expected.sdk_config = SdkConfig::builder() + .region(Region::from_static("eu-west-1")) + .build(); + assert_eq!(expected, options); + }); } #[test] #[serial] fn storage_options_from_map_test() { - let options = S3StorageOptions::from_map(&hashmap! { - s3_constants::AWS_ENDPOINT_URL.to_string() => "http://localhost:1234".to_string(), - s3_constants::AWS_REGION.to_string() => "us-west-2".to_string(), - s3_constants::AWS_PROFILE.to_string() => "default".to_string(), - s3_constants::AWS_S3_ADDRESSING_STYLE.to_string() => "virtual".to_string(), - s3_constants::AWS_S3_LOCKING_PROVIDER.to_string() => "another_locking_provider".to_string(), - s3_constants::AWS_S3_ASSUME_ROLE_ARN.to_string() => "arn:aws:iam::123456789012:role/another_role".to_string(), - s3_constants::AWS_S3_ROLE_SESSION_NAME.to_string() => "another_session_name".to_string(), - s3_constants::AWS_WEB_IDENTITY_TOKEN_FILE.to_string() => "another_token_file".to_string(), - s3_constants::AWS_S3_POOL_IDLE_TIMEOUT_SECONDS.to_string() => "1".to_string(), - s3_constants::AWS_STS_POOL_IDLE_TIMEOUT_SECONDS.to_string() => "2".to_string(), - s3_constants::AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES.to_string() => "3".to_string(), - s3_constants::AWS_ACCESS_KEY_ID.to_string() => "test_id".to_string(), - s3_constants::AWS_SECRET_ACCESS_KEY.to_string() => "test_secret".to_string(), + ScopedEnv::run(|| { + clear_env_of_aws_keys(); + let options = S3StorageOptions::from_map(&hashmap! 
{ + s3_constants::AWS_ENDPOINT_URL.to_string() => "http://localhost:1234".to_string(), + s3_constants::AWS_REGION.to_string() => "us-west-2".to_string(), + s3_constants::AWS_PROFILE.to_string() => "default".to_string(), + s3_constants::AWS_S3_ADDRESSING_STYLE.to_string() => "virtual".to_string(), + s3_constants::AWS_S3_LOCKING_PROVIDER.to_string() => "another_locking_provider".to_string(), + s3_constants::AWS_S3_ASSUME_ROLE_ARN.to_string() => "arn:aws:iam::123456789012:role/another_role".to_string(), + s3_constants::AWS_S3_ROLE_SESSION_NAME.to_string() => "another_session_name".to_string(), + s3_constants::AWS_WEB_IDENTITY_TOKEN_FILE.to_string() => "another_token_file".to_string(), + s3_constants::AWS_S3_POOL_IDLE_TIMEOUT_SECONDS.to_string() => "1".to_string(), + s3_constants::AWS_STS_POOL_IDLE_TIMEOUT_SECONDS.to_string() => "2".to_string(), + s3_constants::AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES.to_string() => "3".to_string(), + s3_constants::AWS_ACCESS_KEY_ID.to_string() => "test_id".to_string(), + s3_constants::AWS_SECRET_ACCESS_KEY.to_string() => "test_secret".to_string(), + }).unwrap(); + + assert_eq!( + S3StorageOptions { + sdk_config: SdkConfig::builder() + .endpoint_url("http://localhost:1234".to_string()) + .region(Region::from_static("us-west-2")) + .build(), + virtual_hosted_style_request: true, + locking_provider: Some("another_locking_provider".to_string()), + dynamodb_endpoint: None, + s3_pool_idle_timeout: Duration::from_secs(1), + sts_pool_idle_timeout: Duration::from_secs(2), + s3_get_internal_server_error_retries: 3, + extra_opts: hashmap! { + s3_constants::AWS_S3_ADDRESSING_STYLE.to_string() => "virtual".to_string() + }, + allow_unsafe_rename: false, + }, + options + ); }); + } - assert_eq!( - S3StorageOptions { - endpoint_url: Some("http://localhost:1234".to_string()), - region: Region::Custom { - name: "us-west-2".to_string(), - endpoint: "http://localhost:1234".to_string() - }, - profile: Some("default".to_string()), - aws_access_key_id: Some("test_id".to_string()), - aws_secret_access_key: Some("test_secret".to_string()), - aws_session_token: None, - virtual_hosted_style_request: true, - assume_role_arn: Some("arn:aws:iam::123456789012:role/another_role".to_string()), - assume_role_session_name: Some("another_session_name".to_string()), - use_web_identity: true, - locking_provider: Some("another_locking_provider".to_string()), - s3_pool_idle_timeout: Duration::from_secs(1), - sts_pool_idle_timeout: Duration::from_secs(2), - s3_get_internal_server_error_retries: 3, - extra_opts: hashmap! { - s3_constants::AWS_S3_ADDRESSING_STYLE.to_string() => "virtual".to_string() + #[test] + #[serial] + fn storage_options_from_map_with_dynamodb_endpoint_test() { + ScopedEnv::run(|| { + clear_env_of_aws_keys(); + let options = S3StorageOptions::from_map(&hashmap! 
{ + s3_constants::AWS_ENDPOINT_URL.to_string() => "http://localhost:1234".to_string(), + s3_constants::AWS_ENDPOINT_URL_DYNAMODB.to_string() => "http://localhost:2345".to_string(), + s3_constants::AWS_REGION.to_string() => "us-west-2".to_string(), + s3_constants::AWS_PROFILE.to_string() => "default".to_string(), + s3_constants::AWS_S3_ADDRESSING_STYLE.to_string() => "virtual".to_string(), + s3_constants::AWS_S3_LOCKING_PROVIDER.to_string() => "another_locking_provider".to_string(), + s3_constants::AWS_S3_ASSUME_ROLE_ARN.to_string() => "arn:aws:iam::123456789012:role/another_role".to_string(), + s3_constants::AWS_S3_ROLE_SESSION_NAME.to_string() => "another_session_name".to_string(), + s3_constants::AWS_WEB_IDENTITY_TOKEN_FILE.to_string() => "another_token_file".to_string(), + s3_constants::AWS_S3_POOL_IDLE_TIMEOUT_SECONDS.to_string() => "1".to_string(), + s3_constants::AWS_STS_POOL_IDLE_TIMEOUT_SECONDS.to_string() => "2".to_string(), + s3_constants::AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES.to_string() => "3".to_string(), + s3_constants::AWS_ACCESS_KEY_ID.to_string() => "test_id".to_string(), + s3_constants::AWS_SECRET_ACCESS_KEY.to_string() => "test_secret".to_string(), + }).unwrap(); + + assert_eq!( + S3StorageOptions { + sdk_config: SdkConfig::builder() + .endpoint_url("http://localhost:1234".to_string()) + .region(Region::from_static("us-west-2")) + .build(), + virtual_hosted_style_request: true, + locking_provider: Some("another_locking_provider".to_string()), + dynamodb_endpoint: Some("http://localhost:2345".to_string()), + s3_pool_idle_timeout: Duration::from_secs(1), + sts_pool_idle_timeout: Duration::from_secs(2), + s3_get_internal_server_error_retries: 3, + extra_opts: hashmap! { + s3_constants::AWS_S3_ADDRESSING_STYLE.to_string() => "virtual".to_string() + }, + allow_unsafe_rename: false, }, - allow_unsafe_rename: false, - }, - options - ); + options + ); + }); } #[test] #[serial] fn storage_options_mixed_test() { - std::env::set_var(s3_constants::AWS_ENDPOINT_URL, "http://localhost"); - std::env::set_var(s3_constants::AWS_REGION, "us-west-1"); - std::env::set_var(s3_constants::AWS_PROFILE, "default"); - std::env::set_var(s3_constants::AWS_ACCESS_KEY_ID, "wrong_key_id"); - std::env::set_var(s3_constants::AWS_SECRET_ACCESS_KEY, "wrong_secret_key"); - std::env::set_var(s3_constants::AWS_S3_LOCKING_PROVIDER, "dynamodb"); - std::env::set_var( - s3_constants::AWS_S3_ASSUME_ROLE_ARN, - "arn:aws:iam::123456789012:role/some_role", - ); - std::env::set_var(s3_constants::AWS_S3_ROLE_SESSION_NAME, "session_name"); - std::env::set_var(s3_constants::AWS_WEB_IDENTITY_TOKEN_FILE, "token_file"); - - std::env::set_var(s3_constants::AWS_S3_POOL_IDLE_TIMEOUT_SECONDS, "1"); - std::env::set_var(s3_constants::AWS_STS_POOL_IDLE_TIMEOUT_SECONDS, "2"); - std::env::set_var(s3_constants::AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES, "3"); - let options = S3StorageOptions::from_map(&hashmap! 
{ - s3_constants::AWS_ACCESS_KEY_ID.to_string() => "test_id_mixed".to_string(), - s3_constants::AWS_SECRET_ACCESS_KEY.to_string() => "test_secret_mixed".to_string(), - s3_constants::AWS_REGION.to_string() => "us-west-2".to_string(), - "AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES".to_string() => "3".to_string(), - }); - - assert_eq!( - S3StorageOptions { - endpoint_url: Some("http://localhost".to_string()), - region: Region::Custom { - name: "us-west-2".to_string(), - endpoint: "http://localhost".to_string() + ScopedEnv::run(|| { + clear_env_of_aws_keys(); + std::env::set_var(s3_constants::AWS_ENDPOINT_URL, "http://localhost"); + std::env::set_var( + s3_constants::AWS_ENDPOINT_URL_DYNAMODB, + "http://localhost:dynamodb", + ); + std::env::set_var(s3_constants::AWS_REGION, "us-west-1"); + std::env::set_var(s3_constants::AWS_PROFILE, "default"); + std::env::set_var(s3_constants::AWS_ACCESS_KEY_ID, "wrong_key_id"); + std::env::set_var(s3_constants::AWS_SECRET_ACCESS_KEY, "wrong_secret_key"); + std::env::set_var(s3_constants::AWS_S3_LOCKING_PROVIDER, "dynamodb"); + std::env::set_var( + s3_constants::AWS_S3_ASSUME_ROLE_ARN, + "arn:aws:iam::123456789012:role/some_role", + ); + std::env::set_var(s3_constants::AWS_S3_ROLE_SESSION_NAME, "session_name"); + std::env::set_var(s3_constants::AWS_WEB_IDENTITY_TOKEN_FILE, "token_file"); + + std::env::set_var(s3_constants::AWS_S3_POOL_IDLE_TIMEOUT_SECONDS, "1"); + std::env::set_var(s3_constants::AWS_STS_POOL_IDLE_TIMEOUT_SECONDS, "2"); + std::env::set_var(s3_constants::AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES, "3"); + let options = S3StorageOptions::from_map(&hashmap! { + s3_constants::AWS_ACCESS_KEY_ID.to_string() => "test_id_mixed".to_string(), + s3_constants::AWS_SECRET_ACCESS_KEY.to_string() => "test_secret_mixed".to_string(), + s3_constants::AWS_REGION.to_string() => "us-west-2".to_string(), + "AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES".to_string() => "3".to_string(), + }) + .unwrap(); + + assert_eq!( + S3StorageOptions { + sdk_config: SdkConfig::builder() + .endpoint_url("http://localhost".to_string()) + .region(Region::from_static("us-west-2")) + .build(), + virtual_hosted_style_request: false, + locking_provider: Some("dynamodb".to_string()), + dynamodb_endpoint: Some("http://localhost:dynamodb".to_string()), + s3_pool_idle_timeout: Duration::from_secs(1), + sts_pool_idle_timeout: Duration::from_secs(2), + s3_get_internal_server_error_retries: 3, + extra_opts: hashmap! {}, + allow_unsafe_rename: false, }, - profile: Some("default".to_string()), - aws_access_key_id: Some("test_id_mixed".to_string()), - aws_secret_access_key: Some("test_secret_mixed".to_string()), - aws_session_token: None, - virtual_hosted_style_request: false, - assume_role_arn: Some("arn:aws:iam::123456789012:role/some_role".to_string()), - assume_role_session_name: Some("session_name".to_string()), - use_web_identity: true, - locking_provider: Some("dynamodb".to_string()), - s3_pool_idle_timeout: Duration::from_secs(1), - sts_pool_idle_timeout: Duration::from_secs(2), - s3_get_internal_server_error_retries: 3, - extra_opts: hashmap! {}, - allow_unsafe_rename: false, - }, - options - ); + options + ); + }); } + #[test] #[serial] fn storage_options_web_identity_test() { - let _options = S3StorageOptions::from_map(&hashmap! 
{ - s3_constants::AWS_REGION.to_string() => "eu-west-1".to_string(), - s3_constants::AWS_WEB_IDENTITY_TOKEN_FILE.to_string() => "web_identity_token_file".to_string(), - s3_constants::AWS_ROLE_ARN.to_string() => "arn:aws:iam::123456789012:role/web_identity_role".to_string(), - s3_constants::AWS_ROLE_SESSION_NAME.to_string() => "web_identity_session_name".to_string(), + ScopedEnv::run(|| { + clear_env_of_aws_keys(); + let _options = S3StorageOptions::from_map(&hashmap! { + s3_constants::AWS_REGION.to_string() => "eu-west-1".to_string(), + s3_constants::AWS_WEB_IDENTITY_TOKEN_FILE.to_string() => "web_identity_token_file".to_string(), + s3_constants::AWS_ROLE_ARN.to_string() => "arn:aws:iam::123456789012:role/web_identity_role".to_string(), + s3_constants::AWS_ROLE_SESSION_NAME.to_string() => "web_identity_session_name".to_string(), + }).unwrap(); + + assert_eq!( + "eu-west-1", + std::env::var(s3_constants::AWS_REGION).unwrap() + ); + + assert_eq!( + "web_identity_token_file", + std::env::var(s3_constants::AWS_WEB_IDENTITY_TOKEN_FILE).unwrap() + ); + + assert_eq!( + "arn:aws:iam::123456789012:role/web_identity_role", + std::env::var(s3_constants::AWS_ROLE_ARN).unwrap() + ); + + assert_eq!( + "web_identity_session_name", + std::env::var(s3_constants::AWS_ROLE_SESSION_NAME).unwrap() + ); }); + } - assert_eq!( - "eu-west-1", - std::env::var(s3_constants::AWS_REGION).unwrap() - ); + #[test] + #[serial] + fn when_merging_with_env_unsupplied_options_are_added() { + ScopedEnv::run(|| { + clear_env_of_aws_keys(); + let raw_options = hashmap! {}; - assert_eq!( - "web_identity_token_file", - std::env::var(s3_constants::AWS_WEB_IDENTITY_TOKEN_FILE).unwrap() - ); + std::env::set_var(s3_constants::AWS_ACCESS_KEY_ID, "env_key"); + std::env::set_var(s3_constants::AWS_ENDPOINT_URL, "env_key"); + std::env::set_var(s3_constants::AWS_SECRET_ACCESS_KEY, "env_key"); + std::env::set_var(s3_constants::AWS_REGION, "env_key"); - assert_eq!( - "arn:aws:iam::123456789012:role/web_identity_role", - std::env::var(s3_constants::AWS_ROLE_ARN).unwrap() - ); + let combined_options = + S3ObjectStoreFactory {}.with_env_s3(&StorageOptions(raw_options)); + + assert_eq!(combined_options.0.len(), 4); + + for v in combined_options.0.values() { + assert_eq!(v, "env_key"); + } + }); + } + + #[tokio::test] + #[serial] + async fn when_merging_with_env_supplied_options_take_precedence() { + ScopedEnv::run(|| { + clear_env_of_aws_keys(); + let raw_options = hashmap! 
{ + "AWS_ACCESS_KEY_ID".to_string() => "options_key".to_string(), + "AWS_ENDPOINT_URL".to_string() => "options_key".to_string(), + "AWS_SECRET_ACCESS_KEY".to_string() => "options_key".to_string(), + "AWS_REGION".to_string() => "options_key".to_string() + }; + + std::env::set_var("aws_access_key_id", "env_key"); + std::env::set_var("aws_endpoint", "env_key"); + std::env::set_var("aws_secret_access_key", "env_key"); + std::env::set_var("aws_region", "env_key"); + + let combined_options = + S3ObjectStoreFactory {}.with_env_s3(&StorageOptions(raw_options)); + + for v in combined_options.0.values() { + assert_eq!(v, "options_key"); + } + }); + } + + #[tokio::test] + #[serial] + async fn storage_options_toggle_imds() { + ScopedEnv::run_async(async { + clear_env_of_aws_keys(); + let disabled_time = storage_options_configure_imds(Some("true")).await; + let enabled_time = storage_options_configure_imds(Some("false")).await; + let default_time = storage_options_configure_imds(None).await; + println!( + "enabled_time: {}, disabled_time: {}, default_time: {}", + enabled_time.as_micros(), + disabled_time.as_micros(), + default_time.as_micros(), + ); + assert!(disabled_time < enabled_time); + assert!(default_time < enabled_time); + }) + .await; + } + + async fn storage_options_configure_imds(value: Option<&str>) -> Duration { + let _options = match value { + Some(value) => S3StorageOptions::from_map(&hashmap! { + s3_constants::AWS_REGION.to_string() => "eu-west-1".to_string(), + s3_constants::AWS_EC2_METADATA_DISABLED.to_string() => value.to_string(), + }) + .unwrap(), + None => S3StorageOptions::from_map(&hashmap! { + s3_constants::AWS_REGION.to_string() => "eu-west-1".to_string(), + }) + .unwrap(), + }; assert_eq!( - "web_identity_session_name", - std::env::var(s3_constants::AWS_ROLE_SESSION_NAME).unwrap() + "eu-west-1", + std::env::var(s3_constants::AWS_REGION).unwrap() ); + + let provider = _options.sdk_config.credentials_provider().unwrap(); + let now = SystemTime::now(); + _ = provider.provide_credentials().await; + now.elapsed().unwrap() } } diff --git a/crates/aws/tests/common.rs b/crates/aws/tests/common.rs index 01aa505b1b..dfa2a9cd51 100644 --- a/crates/aws/tests/common.rs +++ b/crates/aws/tests/common.rs @@ -87,7 +87,7 @@ impl S3Integration { "dynamodb", "create-table", "--table-name", - &table_name, + table_name, "--provisioned-throughput", "ReadCapacityUnits=1,WriteCapacityUnits=1", "--attribute-definitions", @@ -112,7 +112,7 @@ impl S3Integration { } fn wait_for_table(table_name: &str) -> std::io::Result<()> { - let args = ["dynamodb", "describe-table", "--table-name", &table_name]; + let args = ["dynamodb", "describe-table", "--table-name", table_name]; loop { let output = Command::new("aws") .args(args) @@ -145,7 +145,7 @@ impl S3Integration { fn delete_dynamodb_table(table_name: &str) -> std::io::Result { let mut child = Command::new("aws") - .args(["dynamodb", "delete-table", "--table-name", &table_name]) + .args(["dynamodb", "delete-table", "--table-name", table_name]) .stdout(Stdio::null()) .spawn() .expect("aws command is installed"); diff --git a/crates/aws/tests/integration_s3_dynamodb.rs b/crates/aws/tests/integration_s3_dynamodb.rs index 179c46fc5a..57eb44ea24 100644 --- a/crates/aws/tests/integration_s3_dynamodb.rs +++ b/crates/aws/tests/integration_s3_dynamodb.rs @@ -5,17 +5,18 @@ use std::collections::HashMap; use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use aws_sdk_dynamodb::types::BillingMode; use deltalake_aws::logstore::{RepairLogEntryResult, S3DynamoDbLogStore}; 
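// Editor's sketch, not part of this patch, of the lookup order exercised by the
// `when_merging_with_env_*` tests earlier in this file: an explicitly supplied option
// wins (exact key first, then the lower-cased spelling), and the process environment
// only fills in keys that are missing. The helper name `resolve_option` is illustrative
// and mirrors the `str_option` helper in storage.rs.
fn resolve_option(
    options: &std::collections::HashMap<String, String>,
    key: &str,
) -> Option<String> {
    options
        .get(key)
        .or_else(|| options.get(&key.to_ascii_lowercase()))
        .cloned()
        .or_else(|| std::env::var(key).ok())
}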
-use deltalake_aws::storage::S3StorageOptions; +use deltalake_aws::storage::{s3_constants, S3StorageOptions}; use deltalake_aws::{CommitEntry, DynamoDbConfig, DynamoDbLockClient}; use deltalake_core::kernel::{Action, Add, DataType, PrimitiveType, StructField, StructType}; use deltalake_core::logstore::LogStore; -use deltalake_core::operations::transaction::{commit, prepare_commit}; +use deltalake_core::operations::transaction::CommitBuilder; use deltalake_core::protocol::{DeltaOperation, SaveMode}; use deltalake_core::storage::commit_uri_from_version; use deltalake_core::storage::StorageOptions; use deltalake_core::table::builder::ensure_table_uri; -use deltalake_core::{DeltaOps, DeltaTable, DeltaTableBuilder}; +use deltalake_core::{DeltaOps, DeltaTable, DeltaTableBuilder, ObjectStoreError}; use deltalake_test::utils::*; use lazy_static::lazy_static; use object_store::path::Path; @@ -31,17 +32,17 @@ lazy_static! { static ref OPTIONS: HashMap = maplit::hashmap! { "allow_http".to_owned() => "true".to_owned(), }; - static ref S3_OPTIONS: S3StorageOptions = S3StorageOptions::from_map(&OPTIONS); + static ref S3_OPTIONS: S3StorageOptions = S3StorageOptions::from_map(&OPTIONS).unwrap(); } fn make_client() -> TestResult { - let options: S3StorageOptions = S3StorageOptions::default(); + let options: S3StorageOptions = S3StorageOptions::try_default().unwrap(); Ok(DynamoDbLockClient::try_new( + &options.sdk_config, + None, None, None, None, - options.region.clone(), - false, )?) } @@ -62,13 +63,13 @@ fn client_configs_via_env_variables() -> TestResult<()> { ); let client = make_client()?; let config = client.get_dynamodb_config(); + let options: S3StorageOptions = S3StorageOptions::try_default().unwrap(); assert_eq!( DynamoDbConfig { - billing_mode: deltalake_aws::BillingMode::PayPerRequest, + billing_mode: BillingMode::PayPerRequest, lock_table_name: "some_table".to_owned(), max_elapsed_request_time: Duration::from_secs(64), - use_web_identity: false, - region: config.region.clone(), + sdk_config: options.sdk_config, }, *config, ); @@ -180,6 +181,80 @@ async fn test_repair_on_load() -> TestResult<()> { Ok(()) } +#[tokio::test] +#[serial] +async fn test_abort_commit_entry() -> TestResult<()> { + let context = IntegrationContext::new(Box::new(S3Integration::default()))?; + let client = make_client()?; + let table = prepare_table(&context, "abort_entry").await?; + let options: StorageOptions = OPTIONS.clone().into(); + let log_store: S3DynamoDbLogStore = S3DynamoDbLogStore::try_new( + ensure_table_uri(table.table_uri())?, + options.clone(), + &S3_OPTIONS, + std::sync::Arc::new(table.object_store()), + )?; + + let entry = create_incomplete_commit_entry(&table, 1, "unfinished_commit").await?; + + log_store + .abort_commit_entry(entry.version, &entry.temp_path) + .await?; + + // The entry should have been aborted - the latest entry should be one version lower + if let Some(new_entry) = client.get_latest_entry(&table.table_uri()).await? { + assert_eq!(entry.version - 1, new_entry.version); + } + // Temp commit file should have been deleted + assert!(matches!( + log_store.object_store().get(&entry.temp_path).await, + Err(ObjectStoreError::NotFound { .. 
}) + )); + + // Test abort commit is idempotent - still works if already aborted + log_store + .abort_commit_entry(entry.version, &entry.temp_path) + .await?; + + Ok(()) +} + +#[tokio::test] +#[serial] +async fn test_abort_commit_entry_fail_to_delete_entry() -> TestResult<()> { + // Test abort commit does not delete the temp commit if the DynamoDB entry is not deleted + let context = IntegrationContext::new(Box::new(S3Integration::default()))?; + let client = make_client()?; + let table = prepare_table(&context, "abort_entry_fail").await?; + let options: StorageOptions = OPTIONS.clone().into(); + let log_store: S3DynamoDbLogStore = S3DynamoDbLogStore::try_new( + ensure_table_uri(table.table_uri())?, + options.clone(), + &S3_OPTIONS, + std::sync::Arc::new(table.object_store()), + )?; + + let entry = create_incomplete_commit_entry(&table, 1, "finished_commit").await?; + + // Mark entry as complete + client + .update_commit_entry(entry.version, &table.table_uri()) + .await?; + + // Abort will fail since we marked the entry as complete + assert!(matches!( + log_store + .abort_commit_entry(entry.version, &entry.temp_path) + .await, + Err(_), + )); + + // Check temp commit file still exists + assert!(log_store.object_store().get(&entry.temp_path).await.is_ok()); + + Ok(()) +} + const WORKERS: i64 = 3; const COMMITS: i64 = 15; @@ -208,7 +283,9 @@ async fn test_concurrent_writers() -> TestResult<()> { for f in futures { map.extend(f.await?); } + validate_lock_table_state(&table, WORKERS * COMMITS).await?; + Ok(()) } @@ -258,18 +335,18 @@ async fn create_incomplete_commit_entry( tag: &str, ) -> TestResult { let actions = vec![add_action(tag)]; - let temp_path = prepare_commit( - table.object_store().as_ref(), - &DeltaOperation::Write { - mode: SaveMode::Append, - partition_by: None, - predicate: None, - }, - &actions, - None, - ) - .await?; - let commit_entry = CommitEntry::new(version, temp_path); + let operation = DeltaOperation::Write { + mode: SaveMode::Append, + partition_by: None, + predicate: None, + }; + let prepared = CommitBuilder::default() + .with_actions(actions) + .build(Some(table.snapshot()?), table.log_store(), operation) + .into_prepared_commit_future() + .await?; + + let commit_entry = CommitEntry::new(version, prepared.path().to_owned()); make_client()? .put_commit_entry(&table.table_uri(), &commit_entry) .await?; @@ -314,7 +391,7 @@ async fn prepare_table(context: &IntegrationContext, table_name: &str) -> TestRe // create delta table let table = DeltaOps(table) .create() - .with_columns(schema.fields().clone()) + .with_columns(schema.fields().cloned()) .await?; println!("table created: {table:?}"); Ok(table) @@ -331,15 +408,12 @@ async fn append_to_table( predicate: None, }; let actions = vec![add_action(name)]; - let version = commit( - table.log_store().as_ref(), - &actions, - operation, - Some(table.snapshot()?), - metadata, - ) - .await - .unwrap(); + let version = CommitBuilder::default() + .with_actions(actions) + .with_app_metadata(metadata.unwrap_or_default()) + .build(Some(table.snapshot()?), table.log_store(), operation) + .await? 
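// The call sites in this file show the replacement for the removed
// `commit`/`prepare_commit` free functions. Condensed, and assuming `table`,
// `actions` and `operation` as in the surrounding helpers:
//
//   // one-shot commit, returning the new table version
//   let version = CommitBuilder::default()
//       .with_actions(actions)
//       .build(Some(table.snapshot()?), table.log_store(), operation)
//       .await?
//       .version();
//
//   // two-phase variant: stop after the temporary commit file is written
//   let prepared = CommitBuilder::default()
//       .with_actions(actions)
//       .build(Some(table.snapshot()?), table.log_store(), operation)
//       .into_prepared_commit_future()
//       .await?;
//   let temp_path = prepared.path().to_owned();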
+ .version(); Ok(version) } diff --git a/crates/aws/tests/repair_s3_rename_test.rs b/crates/aws/tests/repair_s3_rename_test.rs index 68d8727ebe..d9e19de7b7 100644 --- a/crates/aws/tests/repair_s3_rename_test.rs +++ b/crates/aws/tests/repair_s3_rename_test.rs @@ -9,6 +9,7 @@ use deltalake_core::storage::object_store::{ use deltalake_core::{DeltaTableBuilder, ObjectStore, Path}; use deltalake_test::utils::IntegrationContext; use futures::stream::BoxStream; +use object_store::{MultipartUpload, PutMultipartOpts, PutPayload}; use serial_test::serial; use std::ops::Range; use std::sync::{Arc, Mutex}; @@ -60,8 +61,8 @@ async fn run_repair_test_case(path: &str, pause_copy: bool) -> Result<(), Object }; let (s3_2, _) = create_s3_backend(&context, "w2", None, None); - s3_1.put(&src1, Bytes::from("test1")).await.unwrap(); - s3_2.put(&src2, Bytes::from("test2")).await.unwrap(); + s3_1.put(&src1, Bytes::from("test1").into()).await.unwrap(); + s3_2.put(&src2, Bytes::from("test2").into()).await.unwrap(); let rename1 = rename(s3_1, &src1, &dst1); // to ensure that first one is started actually first @@ -166,14 +167,14 @@ impl ObjectStore for DelayedObjectStore { self.delete(from).await } - async fn put(&self, location: &Path, bytes: Bytes) -> ObjectStoreResult { + async fn put(&self, location: &Path, bytes: PutPayload) -> ObjectStoreResult { self.inner.put(location, bytes).await } async fn put_opts( &self, location: &Path, - bytes: Bytes, + bytes: PutPayload, options: PutOptions, ) -> ObjectStoreResult { self.inner.put_opts(location, bytes, options).await @@ -227,19 +228,16 @@ impl ObjectStore for DelayedObjectStore { self.inner.rename_if_not_exists(from, to).await } - async fn put_multipart( - &self, - location: &Path, - ) -> ObjectStoreResult<(MultipartId, Box)> { + async fn put_multipart(&self, location: &Path) -> ObjectStoreResult> { self.inner.put_multipart(location).await } - async fn abort_multipart( + async fn put_multipart_opts( &self, location: &Path, - multipart_id: &MultipartId, - ) -> ObjectStoreResult<()> { - self.inner.abort_multipart(location, multipart_id).await + options: PutMultipartOpts, + ) -> ObjectStoreResult> { + self.inner.put_multipart_opts(location, options).await } } diff --git a/crates/azure/Cargo.toml b/crates/azure/Cargo.toml index 7ed67f74c9..574684627f 100644 --- a/crates/azure/Cargo.toml +++ b/crates/azure/Cargo.toml @@ -1,10 +1,18 @@ [package] name = "deltalake-azure" -version = "0.1.0" -edition = "2021" +version = "0.1.3" +authors.workspace = true +keywords.workspace = true +readme.workspace = true +edition.workspace = true +homepage.workspace = true +description.workspace = true +license.workspace = true +repository.workspace = true +rust-version.workspace = true [dependencies] -deltalake-core = { version = "0.17.0", path = "../core" } +deltalake-core = { version = ">=0.17.0, <0.19.0", path = "../core" } lazy_static = "1" # workspace depenndecies @@ -22,7 +30,7 @@ url = { workspace = true } chrono = { workspace = true } serial_test = "3" deltalake-test = { path = "../test" } -pretty_env_logger = "*" +pretty_env_logger = "0.5.0" rand = "0.8" serde_json = { workspace = true } diff --git a/crates/azure/src/lib.rs b/crates/azure/src/lib.rs index 9b957c7b5e..7782f69f43 100644 --- a/crates/azure/src/lib.rs +++ b/crates/azure/src/lib.rs @@ -4,7 +4,8 @@ use std::sync::Arc; use deltalake_core::logstore::{default_logstore, logstores, LogStore, LogStoreFactory}; use deltalake_core::storage::{ - factories, url_prefix_handler, ObjectStoreFactory, ObjectStoreRef, StorageOptions, + 
factories, limit_store_handler, url_prefix_handler, ObjectStoreFactory, ObjectStoreRef, + StorageOptions, }; use deltalake_core::{DeltaResult, Path}; use object_store::azure::AzureConfigKey; @@ -42,8 +43,9 @@ impl ObjectStoreFactory for AzureFactory { options: &StorageOptions, ) -> DeltaResult<(ObjectStoreRef, Path)> { let config = config::AzureConfigHelper::try_new(options.as_azure_options())?.build()?; - let (store, prefix) = parse_url_opts(url, config)?; - Ok((url_prefix_handler(store, prefix.clone())?, prefix)) + let (inner, prefix) = parse_url_opts(url, config)?; + let store = limit_store_handler(url_prefix_handler(inner, prefix.clone()), options); + Ok((store, prefix)) } } diff --git a/crates/azure/tests/integration.rs b/crates/azure/tests/integration.rs index 5230462c92..3ffaa00cc5 100644 --- a/crates/azure/tests/integration.rs +++ b/crates/azure/tests/integration.rs @@ -75,7 +75,10 @@ async fn read_write_test_onelake(context: &IntegrationContext, path: &Path) -> T let expected = Bytes::from_static(b"test world from delta-rs on friday"); - delta_store.put(path, expected.clone()).await.unwrap(); + delta_store + .put(path, expected.clone().into()) + .await + .unwrap(); let fetched = delta_store.get(path).await.unwrap().bytes().await.unwrap(); assert_eq!(expected, fetched); diff --git a/crates/benchmarks/src/bin/merge.rs b/crates/benchmarks/src/bin/merge.rs index ea43171052..2465e23d94 100644 --- a/crates/benchmarks/src/bin/merge.rs +++ b/crates/benchmarks/src/bin/merge.rs @@ -7,9 +7,10 @@ use arrow::datatypes::Schema as ArrowSchema; use arrow_array::{RecordBatch, StringArray, UInt32Array}; use chrono::Duration; use clap::{command, Args, Parser, Subcommand}; +use datafusion::functions::expr_fn::random; use datafusion::{datasource::MemTable, prelude::DataFrame}; use datafusion_common::DataFusionError; -use datafusion_expr::{cast, col, lit, random}; +use datafusion_expr::{cast, col, lit}; use deltalake_core::protocol::SaveMode; use deltalake_core::{ arrow::{ @@ -200,6 +201,7 @@ async fn benchmark_merge_tpcds( table.log_store(), DeltaScanConfig { file_column_name: Some("file_path".to_string()), + ..Default::default() }, ) .unwrap(); diff --git a/crates/catalog-glue/Cargo.toml b/crates/catalog-glue/Cargo.toml index a535cb8984..c757563c1b 100644 --- a/crates/catalog-glue/Cargo.toml +++ b/crates/catalog-glue/Cargo.toml @@ -1,13 +1,21 @@ [package] name = "deltalake-catalog-glue" version = "0.1.0" -edition = "2021" +authors.workspace = true +keywords.workspace = true +readme.workspace = true +edition.workspace = true +homepage.workspace = true +description.workspace = true +license.workspace = true +repository.workspace = true +rust-version.workspace = true [dependencies] async-trait = { workspace = true } aws-config = "1" aws-sdk-glue = "1" -deltalake-core = { version = "0.17.0", path = "../core" } +deltalake-core = { version = ">=0.17.0, <0.19.0", path = "../core" } # This can depend on a lowest common denominator of core once that's released # deltalake_core = { version = "0.17.0" } thiserror = { workspace = true } diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml index 9773f82c46..296abf2fef 100644 --- a/crates/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -1,27 +1,26 @@ [package] name = "deltalake-core" -version = "0.17.0" -rust-version = "1.64" -authors = ["Qingping Hou "] -homepage = "https://github.com/delta-io/delta.rs" -license = "Apache-2.0" -keywords = ["deltalake", "delta", "datalake"] -description = "Native Delta Lake implementation in Rust" -documentation = 
"https://docs.rs/deltalake" -repository = "https://github.com/delta-io/delta.rs" -readme = "README.md" -edition = "2021" +version = "0.18.3" +authors.workspace = true +keywords.workspace = true +readme.workspace = true +edition.workspace = true +homepage.workspace = true +description.workspace = true +license.workspace = true +repository.workspace = true +rust-version.workspace = true [package.metadata.docs.rs] -# We cannot use all_features because TLS features are mutually exclusive. -# We cannot use hdfs feature because it requires Java to be installed. -features = ["azure", "datafusion", "gcs", "hdfs", "json", "python", "s3", "unity-experimental"] +features = ["datafusion", "json", "unity-experimental"] [dependencies] +delta_kernel.workspace = true + # arrow arrow = { workspace = true } arrow-arith = { workspace = true } -arrow-array = { workspace = true } +arrow-array = { workspace = true , features = ["chrono-tz"]} arrow-buffer = { workspace = true } arrow-cast = { workspace = true } arrow-ipc = { workspace = true } @@ -43,6 +42,8 @@ datafusion-common = { workspace = true, optional = true } datafusion-proto = { workspace = true, optional = true } datafusion-sql = { workspace = true, optional = true } datafusion-physical-expr = { workspace = true, optional = true } +datafusion-functions = { workspace = true, optional = true } +datafusion-functions-array = { workspace = true, optional = true } # serde serde = { workspace = true, features = ["derive"] } @@ -51,11 +52,12 @@ serde_json = { workspace = true } # "stdlib" bytes = { workspace = true } chrono = { workspace = true, default-features = false, features = ["clock"] } -hashbrown = "*" +hashbrown = "0.14.3" regex = { workspace = true } thiserror = { workspace = true } uuid = { workspace = true, features = ["serde", "v4"] } url = { workspace = true } +urlencoding = { workspace = true} # runtime async-trait = { workspace = true } @@ -72,12 +74,13 @@ tokio = { workspace = true, features = [ # other deps (these should be organized and pulled into workspace.dependencies as necessary) cfg-if = "1" -dashmap = "5" +dashmap = "6" errno = "0.3" either = "1.8" fix-hidden-lifetime-bug = "0.2" hyper = { version = "0.14", optional = true } -itertools = "0.12" +indexmap = "2.2.1" +itertools = "0.13" lazy_static = "1" libc = ">=0.2.90, <1" num-bigint = "0.4" @@ -97,7 +100,7 @@ reqwest = { version = "0.11.18", default-features = false, features = [ "rustls-tls", "json", ], optional = true } -sqlparser = { version = "0.41", optional = true } +sqlparser = { version = "0.49", optional = true } [dev-dependencies] criterion = "0.5" @@ -107,7 +110,7 @@ dotenvy = "0" hyper = { version = "0.14", features = ["server"] } maplit = "1" pretty_assertions = "1.2.1" -pretty_env_logger = "*" +pretty_env_logger = "0.5.0" rand = "0.8" serial_test = "3" tempfile = "3" @@ -115,7 +118,8 @@ tokio = { version = "1", features = ["macros", "rt-multi-thread"] } utime = "0.3" [features] -default = [] +cdf = [] +default = ["cdf"] datafusion = [ "dep:datafusion", "datafusion-expr", @@ -123,6 +127,8 @@ datafusion = [ "datafusion-proto", "datafusion-physical-expr", "datafusion-sql", + "datafusion-functions", + "datafusion-functions-array", "sqlparser", ] datafusion-ext = ["datafusion"] diff --git a/crates/core/src/data_catalog/storage/mod.rs b/crates/core/src/data_catalog/storage/mod.rs index 5a25054316..fc30f32144 100644 --- a/crates/core/src/data_catalog/storage/mod.rs +++ b/crates/core/src/data_catalog/storage/mod.rs @@ -110,12 +110,13 @@ impl SchemaProvider for 
ListingSchemaProvider { self.tables.iter().map(|t| t.key().clone()).collect() } - async fn table(&self, name: &str) -> Option> { - let location = self.tables.get(name).map(|t| t.clone())?; - let provider = open_table_with_storage_options(location, self.storage_options.0.clone()) - .await - .ok()?; - Some(Arc::new(provider) as Arc) + async fn table(&self, name: &str) -> datafusion_common::Result>> { + let Some(location) = self.tables.get(name).map(|t| t.clone()) else { + return Ok(None); + }; + let provider = + open_table_with_storage_options(location, self.storage_options.0.clone()).await?; + Ok(Some(Arc::new(provider) as Arc)) } fn register_table( diff --git a/crates/core/src/data_catalog/unity/datafusion.rs b/crates/core/src/data_catalog/unity/datafusion.rs index 21246c865a..6b6a4b4a63 100644 --- a/crates/core/src/data_catalog/unity/datafusion.rs +++ b/crates/core/src/data_catalog/unity/datafusion.rs @@ -6,8 +6,9 @@ use std::sync::Arc; use dashmap::DashMap; use datafusion::catalog::schema::SchemaProvider; -use datafusion::catalog::{CatalogList, CatalogProvider}; +use datafusion::catalog::{CatalogProvider, CatalogProviderList}; use datafusion::datasource::TableProvider; +use datafusion_common::DataFusionError; use tracing::error; use super::models::{GetTableResponse, ListCatalogsResponse, ListTableSummariesResponse}; @@ -49,7 +50,7 @@ impl UnityCatalogList { } } -impl CatalogList for UnityCatalogList { +impl CatalogProviderList for UnityCatalogList { fn as_any(&self) -> &dyn Any { self } @@ -180,25 +181,24 @@ impl SchemaProvider for UnitySchemaProvider { self.table_names.clone() } - async fn table(&self, name: &str) -> Option> { + async fn table(&self, name: &str) -> datafusion_common::Result>> { let maybe_table = self .client .get_table(&self.catalog_name, &self.schema_name, name) .await - .ok()?; + .map_err(|err| DataFusionError::External(Box::new(err)))?; match maybe_table { GetTableResponse::Success(table) => { let table = DeltaTableBuilder::from_uri(table.storage_location) .with_storage_options(self.storage_options.clone()) .load() - .await - .ok()?; - Some(Arc::new(table)) + .await?; + Ok(Some(Arc::new(table))) } GetTableResponse::Error(err) => { error!("failed to fetch table from unity catalog: {}", err.message); - None + Err(DataFusionError::External(Box::new(err))) } } } diff --git a/crates/core/src/data_catalog/unity/models.rs b/crates/core/src/data_catalog/unity/models.rs index e1c8b7d1b7..265149b969 100644 --- a/crates/core/src/data_catalog/unity/models.rs +++ b/crates/core/src/data_catalog/unity/models.rs @@ -1,17 +1,24 @@ //! Api models for databricks unity catalog APIs +use core::fmt; use std::collections::HashMap; use serde::Deserialize; /// Error response from unity API -#[derive(Deserialize)] +#[derive(Debug, Deserialize)] pub struct ErrorResponse { /// The error code pub error_code: String, /// The error message pub message: String, } +impl fmt::Display for ErrorResponse { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + writeln!(f, "[{}] {}", self.error_code, self.message) + } +} +impl std::error::Error for ErrorResponse {} /// List catalogs response #[derive(Deserialize)] diff --git a/crates/core/src/delta_datafusion/cdf/mod.rs b/crates/core/src/delta_datafusion/cdf/mod.rs new file mode 100644 index 0000000000..02382aa725 --- /dev/null +++ b/crates/core/src/delta_datafusion/cdf/mod.rs @@ -0,0 +1,95 @@ +//! 
Logical operators and physical executions for CDF + +use arrow_schema::{DataType, Field, TimeUnit}; +use lazy_static::lazy_static; +use std::collections::HashMap; + +pub(crate) use scan::*; +pub(crate) use scan_utils::*; + +use crate::kernel::{Add, AddCDCFile}; + +mod scan; +mod scan_utils; + +/// Change type column name +pub const CHANGE_TYPE_COL: &str = "_change_type"; +/// Commit version column name +pub const COMMIT_VERSION_COL: &str = "_commit_version"; +/// Commit Timestamp column name +pub const COMMIT_TIMESTAMP_COL: &str = "_commit_timestamp"; + +lazy_static! { + pub(crate) static ref CDC_PARTITION_SCHEMA: Vec = vec![ + Field::new(COMMIT_VERSION_COL, DataType::Int64, true), + Field::new( + COMMIT_TIMESTAMP_COL, + DataType::Timestamp(TimeUnit::Millisecond, None), + true + ) + ]; + pub(crate) static ref ADD_PARTITION_SCHEMA: Vec = vec![ + Field::new(CHANGE_TYPE_COL, DataType::Utf8, true), + Field::new(COMMIT_VERSION_COL, DataType::Int64, true), + Field::new( + COMMIT_TIMESTAMP_COL, + DataType::Timestamp(TimeUnit::Millisecond, None), + true + ), + ]; +} + +#[derive(Debug)] +pub(crate) struct CdcDataSpec { + version: i64, + timestamp: i64, + actions: Vec, +} + +impl CdcDataSpec { + pub fn new(version: i64, timestamp: i64, actions: Vec) -> Self { + Self { + version, + timestamp, + actions, + } + } +} + +/// This trait defines a generic set of operations used by CDF Reader +pub trait FileAction { + /// Adds partition values + fn partition_values(&self) -> &HashMap>; + /// Physical Path to the data + fn path(&self) -> String; + /// Byte size of the physical file + fn size(&self) -> usize; +} + +impl FileAction for Add { + fn partition_values(&self) -> &HashMap> { + &self.partition_values + } + + fn path(&self) -> String { + self.path.clone() + } + + fn size(&self) -> usize { + self.size as usize + } +} + +impl FileAction for AddCDCFile { + fn partition_values(&self) -> &HashMap> { + &self.partition_values + } + + fn path(&self) -> String { + self.path.clone() + } + + fn size(&self) -> usize { + self.size as usize + } +} diff --git a/crates/core/src/delta_datafusion/cdf/scan.rs b/crates/core/src/delta_datafusion/cdf/scan.rs new file mode 100644 index 0000000000..bd7488899f --- /dev/null +++ b/crates/core/src/delta_datafusion/cdf/scan.rs @@ -0,0 +1,63 @@ +use std::any::Any; +use std::fmt::Formatter; +use std::sync::Arc; + +use arrow_schema::SchemaRef; +use datafusion::execution::{SendableRecordBatchStream, TaskContext}; +use datafusion::physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan}; + +/// Physical execution of a scan +#[derive(Debug, Clone)] +pub struct DeltaCdfScan { + plan: Arc, +} + +impl DeltaCdfScan { + /// Creates a new scan + pub fn new(plan: Arc) -> Self { + Self { plan } + } +} + +impl DisplayAs for DeltaCdfScan { + fn fmt_as(&self, _t: DisplayFormatType, f: &mut Formatter) -> std::fmt::Result { + write!(f, "{:?}", self) + } +} + +impl ExecutionPlan for DeltaCdfScan { + fn name(&self) -> &str { + Self::static_name() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.plan.schema().clone() + } + + fn properties(&self) -> &datafusion::physical_plan::PlanProperties { + self.plan.properties() + } + + fn children(&self) -> Vec<&Arc> { + vec![] + } + + fn with_new_children( + self: Arc, + _children: Vec>, + ) -> datafusion_common::Result> { + self.plan.clone().with_new_children(_children) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> datafusion_common::Result { + self.plan.execute(partition, context) + } 
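// `DeltaCdfScan` above simply forwards schema, plan properties and execution to the
// wrapped inner plan. The `FileAction` trait from cdf/mod.rs is what lets the CDF
// reader treat base data files (`Add`) and change files (`AddCDCFile`) uniformly; a
// hedged sketch of the contract, with `describe` being an illustrative helper:
//
//   fn describe<F: FileAction>(action: &F) -> (String, usize) {
//       // physical path and byte size, independent of the concrete action type
//       (action.path(), action.size())
//   }
//
//   // partition values are exposed the same way on both action types:
//   // action.partition_values() -> &HashMap<String, Option<String>>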
+} diff --git a/crates/core/src/delta_datafusion/cdf/scan_utils.rs b/crates/core/src/delta_datafusion/cdf/scan_utils.rs new file mode 100644 index 0000000000..79d7a2359e --- /dev/null +++ b/crates/core/src/delta_datafusion/cdf/scan_utils.rs @@ -0,0 +1,100 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use arrow_schema::{DataType, Field, Schema, SchemaRef}; +use chrono::TimeZone; +use datafusion::datasource::listing::PartitionedFile; +use datafusion_common::ScalarValue; +use object_store::path::Path; +use object_store::ObjectMeta; +use serde_json::Value; + +use crate::delta_datafusion::cdf::CHANGE_TYPE_COL; +use crate::delta_datafusion::cdf::{CdcDataSpec, FileAction}; +use crate::delta_datafusion::{get_null_of_arrow_type, to_correct_scalar_value}; +use crate::DeltaResult; + +pub fn map_action_to_scalar( + action: &F, + part: &str, + schema: SchemaRef, +) -> ScalarValue { + action + .partition_values() + .get(part) + .map(|val| { + schema + .field_with_name(part) + .map(|field| match val { + Some(value) => to_correct_scalar_value( + &Value::String(value.to_string()), + field.data_type(), + ) + .unwrap_or(Some(ScalarValue::Null)) + .unwrap_or(ScalarValue::Null), + None => get_null_of_arrow_type(field.data_type()).unwrap_or(ScalarValue::Null), + }) + .unwrap_or(ScalarValue::Null) + }) + .unwrap_or(ScalarValue::Null) +} + +pub fn create_spec_partition_values( + spec: &CdcDataSpec, + action_type: Option<&ScalarValue>, +) -> Vec { + let mut spec_partition_values = action_type.cloned().map(|at| vec![at]).unwrap_or_default(); + spec_partition_values.push(ScalarValue::Int64(Some(spec.version))); + spec_partition_values.push(ScalarValue::TimestampMillisecond( + Some(spec.timestamp), + None, + )); + spec_partition_values +} + +pub fn create_partition_values( + schema: SchemaRef, + specs: Vec>, + table_partition_cols: &[String], + action_type: Option, +) -> DeltaResult, Vec>> { + let mut file_groups: HashMap, Vec> = HashMap::new(); + + for spec in specs { + let spec_partition_values = create_spec_partition_values(&spec, action_type.as_ref()); + + for action in spec.actions { + let partition_values = table_partition_cols + .iter() + .map(|part| map_action_to_scalar(&action, part, schema.clone())) + .collect::>(); + + let mut new_part_values = spec_partition_values.clone(); + new_part_values.extend(partition_values); + + let part = PartitionedFile { + object_meta: ObjectMeta { + location: Path::parse(action.path().as_str())?, + size: action.size(), + e_tag: None, + last_modified: chrono::Utc.timestamp_nanos(0), + version: None, + }, + partition_values: new_part_values.clone(), + extensions: None, + range: None, + statistics: None, + }; + + file_groups.entry(new_part_values).or_default().push(part); + } + } + Ok(file_groups) +} + +pub fn create_cdc_schema(mut schema_fields: Vec, include_type: bool) -> SchemaRef { + if include_type { + schema_fields.push(Field::new(CHANGE_TYPE_COL, DataType::Utf8, true)); + } + Arc::new(Schema::new(schema_fields)) +} diff --git a/crates/core/src/delta_datafusion/expr.rs b/crates/core/src/delta_datafusion/expr.rs index 03849f4df9..2577d1a1db 100644 --- a/crates/core/src/delta_datafusion/expr.rs +++ b/crates/core/src/delta_datafusion/expr.rs @@ -22,22 +22,24 @@ //! 
Utility functions for Datafusion's Expressions use std::{ - fmt::{self, Display, Formatter, Write}, + fmt::{self, Display, Error, Formatter, Write}, sync::Arc, }; use arrow_schema::DataType; +use chrono::{DateTime, NaiveDate}; use datafusion::execution::context::SessionState; +use datafusion::execution::FunctionRegistry; use datafusion_common::Result as DFResult; use datafusion_common::{config::ConfigOptions, DFSchema, Result, ScalarValue, TableReference}; use datafusion_expr::{ - expr::InList, AggregateUDF, Between, BinaryExpr, Cast, Expr, GetIndexedField, Like, TableSource, + expr::InList, AggregateUDF, Between, BinaryExpr, Cast, Expr, Like, TableSource, }; use datafusion_sql::planner::{ContextProvider, SqlToRel}; -use sqlparser::ast::escape_quoted_string; -use sqlparser::dialect::GenericDialect; -use sqlparser::parser::Parser; -use sqlparser::tokenizer::Tokenizer; +use datafusion_sql::sqlparser::ast::escape_quoted_string; +use datafusion_sql::sqlparser::dialect::GenericDialect; +use datafusion_sql::sqlparser::parser::Parser; +use datafusion_sql::sqlparser::tokenizer::Tokenizer; use crate::{DeltaResult, DeltaTableError}; @@ -48,7 +50,7 @@ pub(crate) struct DeltaContextProvider<'a> { } impl<'a> ContextProvider for DeltaContextProvider<'a> { - fn get_table_provider(&self, _name: TableReference) -> DFResult> { + fn get_table_source(&self, _name: TableReference) -> DFResult> { unimplemented!() } @@ -72,7 +74,15 @@ impl<'a> ContextProvider for DeltaContextProvider<'a> { self.state.window_functions().get(name).cloned() } - fn get_table_source(&self, _name: TableReference) -> DFResult> { + fn udf_names(&self) -> Vec { + unimplemented!() + } + + fn udaf_names(&self) -> Vec { + unimplemented!() + } + + fn udwf_names(&self) -> Vec { unimplemented!() } } @@ -98,9 +108,15 @@ pub(crate) fn parse_predicate_expression( })?; let context_provider = DeltaContextProvider { state: df_state }; - let sql_to_rel = + let mut sql_to_rel = SqlToRel::new_with_options(&context_provider, DeltaParserOptions::default().into()); + // NOTE: This can be probably removed with Datafusion 41 once + // is released + for planner in context_provider.state.expr_planners() { + sql_to_rel = sql_to_rel.with_user_defined_planner(planner.clone()); + } + Ok(sql_to_rel.sql_to_expr(sql, schema, &mut Default::default())?) 
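// A hedged usage sketch for the predicate parser above, via the public
// `DataFusionMixins` trait introduced later in this diff. Assumes an open
// `table: DeltaTable` and a DataFusion `SessionContext` named `ctx`; the predicate
// string is only an example.
//
//   use deltalake_core::delta_datafusion::DataFusionMixins;
//
//   let predicate = table
//       .snapshot()?
//       .parse_predicate_expression("value = '1' AND modified > '2021-02-01'", &ctx.state())?;
//   // `predicate` is a datafusion_expr::Expr that round-trips through the
//   // `SqlFormat` display implementation below.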
} @@ -185,7 +201,7 @@ impl<'a> Display for SqlFormat<'a> { Expr::IsNotFalse(expr) => write!(f, "{} IS NOT FALSE", SqlFormat { expr }), Expr::IsNotUnknown(expr) => write!(f, "{} IS NOT UNKNOWN", SqlFormat { expr }), Expr::BinaryExpr(expr) => write!(f, "{}", BinaryExprFormat { expr }), - Expr::ScalarFunction(func) => fmt_function(f, func.func_def.name(), false, &func.args), + Expr::ScalarFunction(func) => fmt_function(f, func.func.name(), false, &func.args), Expr::Cast(Cast { expr, data_type }) => { write!(f, "arrow_cast({}, '{}')", SqlFormat { expr }, data_type) } @@ -263,28 +279,6 @@ impl<'a> Display for SqlFormat<'a> { write!(f, "{expr} IN ({})", expr_vec_fmt!(list)) } } - Expr::GetIndexedField(GetIndexedField { expr, field }) => match field { - datafusion_expr::GetFieldAccess::NamedStructField { name } => { - write!( - f, - "{}[{}]", - SqlFormat { expr }, - ScalarValueFormat { scalar: name } - ) - } - datafusion_expr::GetFieldAccess::ListIndex { key } => { - write!(f, "{}[{}]", SqlFormat { expr }, SqlFormat { expr: key }) - } - datafusion_expr::GetFieldAccess::ListRange { start, stop } => { - write!( - f, - "{}[{}:{}]", - SqlFormat { expr }, - SqlFormat { expr: start }, - SqlFormat { expr: stop } - ) - } - }, _ => Err(fmt::Error), } } @@ -321,6 +315,9 @@ macro_rules! format_option { }}; } +/// Epoch days from ce calander until 1970-01-01 +pub const EPOCH_DAYS_FROM_CE: i32 = 719_163; + struct ScalarValueFormat<'a> { scalar: &'a ScalarValue, } @@ -339,6 +336,44 @@ impl<'a> fmt::Display for ScalarValueFormat<'a> { ScalarValue::UInt16(e) => format_option!(f, e)?, ScalarValue::UInt32(e) => format_option!(f, e)?, ScalarValue::UInt64(e) => format_option!(f, e)?, + ScalarValue::Date32(e) => match e { + Some(e) => write!( + f, + "{}", + NaiveDate::from_num_days_from_ce_opt(EPOCH_DAYS_FROM_CE + (*e)).ok_or(Error)? + )?, + None => write!(f, "NULL")?, + }, + ScalarValue::Date64(e) => match e { + Some(e) => write!( + f, + "'{}'::date", + DateTime::from_timestamp_millis(*e) + .ok_or(Error)? + .date_naive() + .format("%Y-%m-%d") + )?, + None => write!(f, "NULL")?, + }, + ScalarValue::TimestampMicrosecond(e, tz) => match e { + Some(e) => match tz { + Some(_tz) => write!( + f, + "arrow_cast('{}', 'Timestamp(Microsecond, Some(\"UTC\"))')", + DateTime::from_timestamp_micros(*e) + .ok_or(Error)? + .format("%Y-%m-%dT%H:%M:%S%.6f") + )?, + None => write!( + f, + "arrow_cast('{}', 'Timestamp(Microsecond, None)')", + DateTime::from_timestamp_micros(*e) + .ok_or(Error)? 
+ .format("%Y-%m-%dT%H:%M:%S%.6f") + )?, + }, + None => write!(f, "NULL")?, + }, ScalarValue::Utf8(e) | ScalarValue::LargeUtf8(e) => match e { Some(e) => write!(f, "'{}'", escape_quoted_string(e, '\''))?, None => write!(f, "NULL")?, @@ -357,7 +392,7 @@ impl<'a> fmt::Display for ScalarValueFormat<'a> { None => write!(f, "NULL")?, }, ScalarValue::Null => write!(f, "NULL")?, - _ => return Err(fmt::Error), + _ => return Err(Error), }; Ok(()) } @@ -367,10 +402,17 @@ impl<'a> fmt::Display for ScalarValueFormat<'a> { mod test { use arrow_schema::DataType as ArrowDataType; use datafusion::prelude::SessionContext; - use datafusion_common::{Column, DFSchema, ScalarValue}; - use datafusion_expr::{cardinality, col, decode, lit, substring, Cast, Expr, ExprSchemable}; - - use crate::delta_datafusion::DeltaSessionContext; + use datafusion_common::{Column, ScalarValue, ToDFSchema}; + use datafusion_expr::expr::ScalarFunction; + use datafusion_expr::{col, lit, BinaryExpr, Cast, Expr, ExprSchemable}; + use datafusion_functions::core::arrow_cast; + use datafusion_functions::core::expr_ext::FieldAccessor; + use datafusion_functions::encoding::expr_fn::decode; + use datafusion_functions::expr_fn::substring; + use datafusion_functions_array::expr_ext::{IndexAccessor, SliceAccessor}; + use datafusion_functions_array::expr_fn::cardinality; + + use crate::delta_datafusion::{DataFusionMixins, DeltaSessionContext}; use crate::kernel::{ArrayType, DataType, PrimitiveType, StructField, StructType}; use crate::{DeltaOps, DeltaTable}; @@ -439,6 +481,11 @@ mod test { DataType::Primitive(PrimitiveType::Timestamp), true, ), + StructField::new( + "_timestamp_ntz".to_string(), + DataType::Primitive(PrimitiveType::TimestampNtz), + true, + ), StructField::new( "_binary".to_string(), DataType::Primitive(PrimitiveType::Binary), @@ -472,7 +519,7 @@ mod test { let table = DeltaOps::new_in_memory() .create() - .with_columns(schema.fields().clone()) + .with_columns(schema.fields().cloned()) .await .unwrap(); assert_eq!(table.version(), 0); @@ -485,13 +532,24 @@ mod test { // String expression that we output must be parsable for conflict resolution. 
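// A note on the date handling added above: Arrow's Date32 counts days from
// 1970-01-01, while chrono's `from_num_days_from_ce_opt` counts days from
// 0001-01-01, so the formatter adds `EPOCH_DAYS_FROM_CE` (719_163) to convert.
// A quick sanity check of that constant (chrono only, values illustrative):
//
//   use chrono::NaiveDate;
//   assert_eq!(
//       NaiveDate::from_num_days_from_ce_opt(719_163),
//       NaiveDate::from_ymd_opt(1970, 1, 1)
//   );
//   // e.g. a Date32 value of 18_628 maps to 719_163 + 18_628 = 737_791, i.e. 2021-01-01.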
let tests = vec![ - simple!( - Expr::Cast(Cast { + ParseTest { + expr: Expr::Cast(Cast { expr: Box::new(lit(1_i64)), data_type: ArrowDataType::Int32 }), - "arrow_cast(1, 'Int32')".to_string() - ), + expected: "arrow_cast(1, 'Int32')".to_string(), + override_expected_expr: Some( + datafusion_expr::Expr::ScalarFunction( + ScalarFunction { + func: arrow_cast(), + args: vec![ + lit(ScalarValue::Int64(Some(1))), + lit(ScalarValue::Utf8(Some("Int32".into()))) + ] + } + ) + ), + }, simple!( Expr::Column(Column::from_qualified_name_ignore_case("Value3")).eq(lit(3_i64)), "Value3 = 3".to_string() @@ -570,9 +628,8 @@ mod test { substring(col("modified"), lit(0_i64), lit(4_i64)).eq(lit("2021")), "substr(modified, 0, 4) = '2021'".to_string() ), - simple!( - col("value") - .cast_to::( + ParseTest { + expr: col("value").cast_to( &arrow_schema::DataType::Utf8, &table .snapshot() @@ -581,29 +638,77 @@ mod test { .unwrap() .as_ref() .to_owned() - .try_into() + .to_dfschema() .unwrap() ) .unwrap() .eq(lit("1")), - "arrow_cast(value, 'Utf8') = '1'".to_string() - ), + expected: "arrow_cast(value, 'Utf8') = '1'".to_string(), + override_expected_expr: Some( + datafusion_expr::Expr::BinaryExpr(BinaryExpr { + left: Box::new(datafusion_expr::Expr::ScalarFunction( + ScalarFunction { + func: arrow_cast(), + args: vec![ + col("value"), + lit(ScalarValue::Utf8(Some("Utf8".into()))) + ] + } + )), + op: datafusion_expr::Operator::Eq, + right: Box::new(lit(ScalarValue::Utf8(Some("1".into())))) + }) + ), + }, simple!( col("_struct").field("a").eq(lit(20_i64)), - "_struct['a'] = 20".to_string() + "get_field(_struct, 'a') = 20".to_string() ), simple!( col("_struct").field("nested").field("b").eq(lit(20_i64)), - "_struct['nested']['b'] = 20".to_string() + "get_field(get_field(_struct, 'nested'), 'b') = 20".to_string() ), simple!( col("_list").index(lit(1_i64)).eq(lit(20_i64)), - "_list[1] = 20".to_string() + "array_element(_list, 1) = 20".to_string() ), simple!( cardinality(col("_list").range(col("value"), lit(10_i64))), - "cardinality(_list[value:10])".to_string() + "cardinality(array_slice(_list, value, 10))".to_string() ), + ParseTest { + expr: col("_timestamp_ntz").gt(lit(ScalarValue::TimestampMicrosecond(Some(1262304000000000), None))), + expected: "_timestamp_ntz > arrow_cast('2010-01-01T00:00:00.000000', 'Timestamp(Microsecond, None)')".to_string(), + override_expected_expr: Some(col("_timestamp_ntz").gt( + datafusion_expr::Expr::ScalarFunction( + ScalarFunction { + func: arrow_cast(), + args: vec![ + lit(ScalarValue::Utf8(Some("2010-01-01T00:00:00.000000".into()))), + lit(ScalarValue::Utf8(Some("Timestamp(Microsecond, None)".into()))) + ] + } + ) + )), + }, + ParseTest { + expr: col("_timestamp").gt(lit(ScalarValue::TimestampMicrosecond( + Some(1262304000000000), + Some("UTC".into()) + ))), + expected: "_timestamp > arrow_cast('2010-01-01T00:00:00.000000', 'Timestamp(Microsecond, Some(\"UTC\"))')".to_string(), + override_expected_expr: Some(col("_timestamp").gt( + datafusion_expr::Expr::ScalarFunction( + ScalarFunction { + func: arrow_cast(), + args: vec![ + lit(ScalarValue::Utf8(Some("2010-01-01T00:00:00.000000".into()))), + lit(ScalarValue::Utf8(Some("Timestamp(Microsecond, Some(\"UTC\"))".into()))) + ] + } + ) + )), + }, ]; let session: SessionContext = DeltaSessionContext::default().into(); diff --git a/crates/core/src/delta_datafusion/find_files/logical.rs b/crates/core/src/delta_datafusion/find_files/logical.rs new file mode 100644 index 0000000000..4dd4a3b5da --- /dev/null +++ 
b/crates/core/src/delta_datafusion/find_files/logical.rs @@ -0,0 +1,107 @@ +use std::collections::HashSet; +use std::hash::{Hash, Hasher}; + +use datafusion_common::DFSchemaRef; +use datafusion_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; + +use crate::delta_datafusion::find_files::ONLY_FILES_DF_SCHEMA; +use crate::logstore::LogStoreRef; +use crate::table::state::DeltaTableState; + +#[derive(Debug, Clone)] +pub struct FindFilesNode { + id: String, + predicate: Expr, + table_state: DeltaTableState, + log_store: LogStoreRef, + version: i64, +} + +impl FindFilesNode { + pub fn new( + id: String, + table_state: DeltaTableState, + log_store: LogStoreRef, + predicate: Expr, + ) -> datafusion_common::Result { + let version = table_state.version(); + Ok(Self { + id, + predicate, + log_store, + table_state, + + version, + }) + } + + pub fn predicate(&self) -> Expr { + self.predicate.clone() + } + + pub fn state(&self) -> DeltaTableState { + self.table_state.clone() + } + + pub fn log_store(&self) -> LogStoreRef { + self.log_store.clone() + } +} + +impl Eq for FindFilesNode {} + +impl PartialEq for FindFilesNode { + fn eq(&self, other: &Self) -> bool { + self.id == other.id + } +} + +impl Hash for FindFilesNode { + fn hash(&self, state: &mut H) { + state.write(self.id.as_bytes()); + state.finish(); + } +} + +impl UserDefinedLogicalNodeCore for FindFilesNode { + fn name(&self) -> &str { + "FindFiles" + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![] + } + + fn schema(&self) -> &DFSchemaRef { + &ONLY_FILES_DF_SCHEMA + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn prevent_predicate_push_down_columns(&self) -> HashSet { + HashSet::new() + } + + fn fmt_for_explain(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!( + f, + "FindFiles[id={}, predicate=\"{}\", version={}]", + &self.id, self.predicate, self.version, + ) + } + + fn from_template(&self, exprs: &[Expr], inputs: &[LogicalPlan]) -> Self { + self.with_exprs_and_inputs(exprs.to_vec(), inputs.to_vec()) + .unwrap() + } + + fn with_exprs_and_inputs( + &self, + _exprs: Vec, + _inputs: Vec, + ) -> datafusion_common::Result { + Ok(self.clone()) + } +} diff --git a/crates/core/src/delta_datafusion/find_files/mod.rs b/crates/core/src/delta_datafusion/find_files/mod.rs new file mode 100644 index 0000000000..d25d0765ee --- /dev/null +++ b/crates/core/src/delta_datafusion/find_files/mod.rs @@ -0,0 +1,282 @@ +use arrow_array::cast::AsArray; +use std::sync::Arc; + +use arrow_array::types::UInt16Type; +use arrow_array::RecordBatch; +use arrow_schema::SchemaBuilder; +use arrow_schema::{ArrowError, DataType, Field, Schema, SchemaRef}; +use arrow_select::concat::concat_batches; +use async_trait::async_trait; +use datafusion::datasource::MemTable; +use datafusion::execution::context::{QueryPlanner, SessionState}; +use datafusion::execution::TaskContext; +use datafusion::physical_plan::filter::FilterExec; +use datafusion::physical_plan::limit::LocalLimitExec; +use datafusion::physical_plan::ExecutionPlan; +use datafusion::physical_planner::{DefaultPhysicalPlanner, ExtensionPlanner, PhysicalPlanner}; +use datafusion::prelude::SessionContext; +use datafusion_common::{DFSchemaRef, Result, ToDFSchema}; +use datafusion_expr::{col, Expr, LogicalPlan, UserDefinedLogicalNode}; +use lazy_static::lazy_static; + +use crate::delta_datafusion::find_files::logical::FindFilesNode; +use crate::delta_datafusion::find_files::physical::FindFilesExec; +use crate::delta_datafusion::{ + df_logical_schema, register_store, DeltaScanBuilder, 
DeltaScanConfigBuilder, PATH_COLUMN, +}; +use crate::logstore::LogStoreRef; +use crate::table::state::DeltaTableState; +use crate::DeltaTableError; + +pub mod logical; +pub mod physical; + +lazy_static! { + static ref ONLY_FILES_SCHEMA: Arc = { + let mut builder = SchemaBuilder::new(); + builder.push(Field::new(PATH_COLUMN, DataType::Utf8, false)); + Arc::new(builder.finish()) + }; + static ref ONLY_FILES_DF_SCHEMA: DFSchemaRef = + ONLY_FILES_SCHEMA.clone().to_dfschema_ref().unwrap(); +} + +struct FindFilesPlannerExtension {} + +struct FindFilesPlanner {} + +#[async_trait] +impl ExtensionPlanner for FindFilesPlannerExtension { + async fn plan_extension( + &self, + _planner: &dyn PhysicalPlanner, + node: &dyn UserDefinedLogicalNode, + _logical_inputs: &[&LogicalPlan], + _physical_inputs: &[Arc], + _session_state: &SessionState, + ) -> Result>> { + if let Some(find_files_node) = node.as_any().downcast_ref::() { + return Ok(Some(Arc::new(FindFilesExec::new( + find_files_node.state(), + find_files_node.log_store(), + find_files_node.predicate(), + )?))); + } + Ok(None) + } +} + +#[async_trait] +impl QueryPlanner for FindFilesPlanner { + async fn create_physical_plan( + &self, + logical_plan: &LogicalPlan, + session_state: &SessionState, + ) -> Result> { + let planner = Arc::new(Box::new(DefaultPhysicalPlanner::with_extension_planners( + vec![Arc::new(FindFilesPlannerExtension {})], + ))); + planner + .create_physical_plan(logical_plan, session_state) + .await + } +} + +async fn scan_table_by_partitions(batch: RecordBatch, predicate: Expr) -> Result { + let mut arrays = Vec::new(); + let mut fields = Vec::new(); + + let schema = batch.schema(); + + arrays.push( + batch + .column_by_name("path") + .ok_or(DeltaTableError::Generic( + "Column with name `path` does not exist".to_owned(), + ))? + .to_owned(), + ); + fields.push(Field::new(PATH_COLUMN, DataType::Utf8, false)); + + for field in schema.fields() { + if field.name().starts_with("partition.") { + let name = field.name().strip_prefix("partition.").unwrap(); + + arrays.push(batch.column_by_name(field.name()).unwrap().to_owned()); + fields.push(Field::new( + name, + field.data_type().to_owned(), + field.is_nullable(), + )); + } + } + + let schema = Arc::new(Schema::new(fields)); + let batch = RecordBatch::try_new(schema, arrays)?; + let mem_table = MemTable::try_new(batch.schema(), vec![vec![batch]])?; + + let ctx = SessionContext::new(); + let mut df = ctx.read_table(Arc::new(mem_table))?; + df = df + .filter(predicate.to_owned())? + .select(vec![col(PATH_COLUMN)])?; + let df_schema = df.schema().clone(); + let batches = df.collect().await?; + Ok(concat_batches(&SchemaRef::from(df_schema), &batches)?) 
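// A condensed sketch of how the FindFiles pieces fit together, mirroring the tests
// further down in this file: `FindFilesPlanner` is registered as the query planner,
// and `FindFilesNode` enters the plan as a logical extension node that
// `FindFilesPlannerExtension` lowers to `FindFilesExec`. Assumes a `DeltaTable`
// named `table` and a predicate `expr`; the plan id string is illustrative.
//
//   let ctx = SessionContext::new();
//   let state = ctx.state().with_query_planner(Arc::new(FindFilesPlanner {}));
//   let plan = LogicalPlan::Extension(Extension {
//       node: Arc::new(FindFilesNode::new(
//           "find_files_example".into(),
//           table.snapshot()?.clone(),
//           table.log_store().clone(),
//           expr,
//       )?),
//   });
//   let stream = state
//       .create_physical_plan(&plan)
//       .await?
//       .execute(0, state.task_ctx())?;
//   let batches = collect_sendable_stream(stream).await?;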
+} + +async fn scan_table_by_files( + snapshot: DeltaTableState, + log_store: LogStoreRef, + state: SessionState, + expression: Expr, +) -> Result { + register_store(log_store.clone(), state.runtime_env().clone()); + let scan_config = DeltaScanConfigBuilder::new() + .wrap_partition_values(true) + .with_file_column(true) + .build(&snapshot)?; + + let logical_schema = df_logical_schema(&snapshot, &scan_config)?; + + // Identify which columns we need to project + let mut used_columns = expression + .column_refs() + .into_iter() + .map(|column| logical_schema.index_of(&column.name)) + .collect::, ArrowError>>()?; + // Add path column + used_columns.push(logical_schema.index_of(scan_config.file_column_name.as_ref().unwrap())?); + + let scan = DeltaScanBuilder::new(&snapshot, log_store, &state) + .with_filter(Some(expression.clone())) + .with_projection(Some(&used_columns)) + .with_scan_config(scan_config) + .build() + .await?; + + let scan = Arc::new(scan); + let input_schema = scan.logical_schema.as_ref().to_owned(); + let input_dfschema = input_schema.clone().try_into()?; + + let predicate_expr = + state.create_physical_expr(Expr::IsTrue(Box::new(expression.clone())), &input_dfschema)?; + + let filter: Arc = + Arc::new(FilterExec::try_new(predicate_expr, scan.clone())?); + let limit: Arc = Arc::new(LocalLimitExec::new(filter, 1)); + let field_idx = input_schema.index_of(PATH_COLUMN)?; + let task_ctx = Arc::new(TaskContext::from(&state)); + let path_batches: Vec = datafusion::physical_plan::collect(limit, task_ctx) + .await? + .into_iter() + .map(|batch| { + let col = batch + .column(field_idx) + .as_dictionary::() + .values(); + RecordBatch::try_from_iter(vec![(PATH_COLUMN, col.clone())]).unwrap() + }) + .collect(); + + let result_batches = concat_batches(&ONLY_FILES_SCHEMA.clone(), &path_batches)?; + + Ok(result_batches) +} + +#[cfg(test)] +pub mod tests { + use std::sync::Arc; + + use datafusion::prelude::{DataFrame, SessionContext}; + use datafusion_common::{assert_batches_eq, assert_batches_sorted_eq}; + use datafusion_expr::{col, lit, Expr, Extension, LogicalPlan}; + + use crate::delta_datafusion::find_files::logical::FindFilesNode; + use crate::delta_datafusion::find_files::FindFilesPlanner; + use crate::operations::collect_sendable_stream; + use crate::{DeltaResult, DeltaTable, DeltaTableError}; + + pub async fn test_plan<'a>( + table: DeltaTable, + expr: Expr, + ) -> Result, DeltaTableError> { + let ctx = SessionContext::new(); + let state = ctx + .state() + .with_query_planner(Arc::new(FindFilesPlanner {})); + let find_files_node = LogicalPlan::Extension(Extension { + node: Arc::new(FindFilesNode::new( + "my_cool_plan".into(), + table.snapshot()?.clone(), + table.log_store().clone(), + expr, + )?), + }); + let df = DataFrame::new(state.clone(), find_files_node); + + let p = state + .clone() + .create_physical_plan(df.logical_plan()) + .await?; + + let e = p.execute(0, state.task_ctx())?; + collect_sendable_stream(e).await.map_err(Into::into) + } + + #[tokio::test] + pub async fn test_find_files_partitioned() -> DeltaResult<()> { + let table = crate::open_table("../test/tests/data/delta-0.8.0-partitioned").await?; + let expr: Expr = col("year").eq(lit(2020)); + let s = test_plan(table, expr).await?; + + assert_batches_eq! 
{ + ["+---------------------------------------------------------------------------------------------+", + "| __delta_rs_path |", + "+---------------------------------------------------------------------------------------------+", + "| year=2020/month=1/day=1/part-00000-8eafa330-3be9-4a39-ad78-fd13c2027c7e.c000.snappy.parquet |", + "| year=2020/month=2/day=3/part-00000-94d16827-f2fd-42cd-a060-f67ccc63ced9.c000.snappy.parquet |", + "| year=2020/month=2/day=5/part-00000-89cdd4c8-2af7-4add-8ea3-3990b2f027b5.c000.snappy.parquet |", + "+---------------------------------------------------------------------------------------------+"], + &s + } + Ok(()) + } + + #[tokio::test] + pub async fn test_find_files_unpartitioned() -> DeltaResult<()> { + let table = crate::open_table("../test/tests/data/simple_table").await?; + let expr: Expr = col("id").in_list(vec![lit(9i64), lit(7i64)], false); + let s = test_plan(table, expr).await?; + + assert_batches_sorted_eq! { + ["+---------------------------------------------------------------------+", + "| __delta_rs_path |", + "+---------------------------------------------------------------------+", + "| part-00007-3a0e4727-de0d-41b6-81ef-5223cf40f025-c000.snappy.parquet |", + "| part-00004-315835fe-fb44-4562-98f6-5e6cfa3ae45d-c000.snappy.parquet |", + "+---------------------------------------------------------------------+"], + &s + } + Ok(()) + } + + #[tokio::test] + pub async fn test_find_files_unpartitioned2() -> DeltaResult<()> { + let table = crate::open_table("../test/tests/data/simple_table").await?; + let expr: Expr = col("id").is_not_null(); + let s = test_plan(table, expr).await?; + + assert_batches_sorted_eq! { + ["+---------------------------------------------------------------------+", + "| __delta_rs_path |", + "+---------------------------------------------------------------------+", + "| part-00001-7891c33d-cedc-47c3-88a6-abcfb049d3b4-c000.snappy.parquet |", + "| part-00004-315835fe-fb44-4562-98f6-5e6cfa3ae45d-c000.snappy.parquet |", + "| part-00007-3a0e4727-de0d-41b6-81ef-5223cf40f025-c000.snappy.parquet |", + "+---------------------------------------------------------------------+"], + &s + } + Ok(()) + } +} diff --git a/crates/core/src/delta_datafusion/find_files/physical.rs b/crates/core/src/delta_datafusion/find_files/physical.rs new file mode 100644 index 0000000000..e23a561e5b --- /dev/null +++ b/crates/core/src/delta_datafusion/find_files/physical.rs @@ -0,0 +1,158 @@ +use std::any::Any; +use std::fmt::{Debug, Formatter}; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use arrow_array::RecordBatch; +use arrow_schema::SchemaRef; +use datafusion::error::Result; +use datafusion::execution::{RecordBatchStream, SendableRecordBatchStream, TaskContext}; +use datafusion::physical_plan::memory::MemoryStream; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan, PlanProperties, +}; +use datafusion::prelude::SessionContext; +use datafusion_common::tree_node::TreeNode; +use datafusion_expr::Expr; +use datafusion_physical_expr::{EquivalenceProperties, Partitioning}; +use futures::stream::BoxStream; +use futures::{FutureExt, Stream, StreamExt, TryStreamExt}; + +use crate::delta_datafusion::find_files::{ + scan_table_by_files, scan_table_by_partitions, ONLY_FILES_SCHEMA, +}; +use crate::delta_datafusion::FindFilesExprProperties; +use crate::logstore::LogStoreRef; +use crate::table::state::DeltaTableState; + +pub struct FindFilesExec { + predicate: Expr, + state: DeltaTableState, + 
log_store: LogStoreRef, + plan_properties: PlanProperties, +} + +impl FindFilesExec { + pub fn new(state: DeltaTableState, log_store: LogStoreRef, predicate: Expr) -> Result { + Ok(Self { + predicate, + log_store, + state, + plan_properties: PlanProperties::new( + EquivalenceProperties::new(ONLY_FILES_SCHEMA.clone()), + Partitioning::RoundRobinBatch(num_cpus::get()), + ExecutionMode::Bounded, + ), + }) + } +} + +struct FindFilesStream<'a> { + mem_stream: BoxStream<'a, Result>, +} + +impl<'a> FindFilesStream<'a> { + pub fn new(mem_stream: BoxStream<'a, Result>) -> Result { + Ok(Self { mem_stream }) + } +} + +impl<'a> RecordBatchStream for FindFilesStream<'a> { + fn schema(&self) -> SchemaRef { + ONLY_FILES_SCHEMA.clone() + } +} + +impl<'a> Stream for FindFilesStream<'a> { + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + self.as_mut().mem_stream.poll_next_unpin(cx) + } +} + +impl Debug for FindFilesExec { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "FindFilesExec[predicate=\"{}\"]", self.predicate) + } +} + +impl DisplayAs for FindFilesExec { + fn fmt_as(&self, _t: DisplayFormatType, f: &mut Formatter) -> std::fmt::Result { + write!(f, "FindFilesExec[predicate=\"{}\"]", self.predicate) + } +} + +impl ExecutionPlan for FindFilesExec { + fn name(&self) -> &str { + Self::static_name() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + ONLY_FILES_SCHEMA.clone() + } + + fn properties(&self) -> &PlanProperties { + &self.plan_properties + } + + fn children(&self) -> Vec<&Arc> { + vec![] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + if !children.is_empty() { + return Err(datafusion::error::DataFusionError::Plan( + "Children cannot be replaced in FindFilesExec".to_string(), + )); + } + + Ok(self) + } + + fn execute( + &self, + _partition: usize, + _context: Arc, + ) -> Result { + let current_metadata = self.state.metadata(); + let mut expr_properties = FindFilesExprProperties { + partition_only: true, + partition_columns: current_metadata.partition_columns.clone(), + result: Ok(()), + }; + + TreeNode::visit(&self.predicate, &mut expr_properties)?; + expr_properties.result?; + + if expr_properties.partition_only { + let actions_table = self.state.add_actions_table(true)?; + let predicate = self.predicate.clone(); + let schema = actions_table.schema(); + let mem_stream = + MemoryStream::try_new(vec![actions_table.clone()], schema.clone(), None)? 
+ .and_then(move |batch| scan_table_by_partitions(batch, predicate.clone())) + .boxed(); + + Ok(Box::pin(FindFilesStream::new(mem_stream)?)) + } else { + let ctx = SessionContext::new(); + let state = ctx.state(); + let table_state = self.state.clone(); + let predicate = self.predicate.clone(); + let output_files = + scan_table_by_files(table_state, self.log_store.clone(), state, predicate); + + let mem_stream = output_files.into_stream().boxed(); + Ok(Box::pin(FindFilesStream::new(mem_stream)?)) + } + } +} diff --git a/crates/core/src/delta_datafusion/logical.rs b/crates/core/src/delta_datafusion/logical.rs index 75ed53d1b1..2ce435b5b6 100644 --- a/crates/core/src/delta_datafusion/logical.rs +++ b/crates/core/src/delta_datafusion/logical.rs @@ -34,10 +34,6 @@ impl UserDefinedLogicalNodeCore for MetricObserver { vec![] } - fn fmt_for_explain(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "MetricObserver id={}", &self.id) - } - fn prevent_predicate_push_down_columns(&self) -> HashSet { if self.enable_pushdown { HashSet::new() @@ -50,15 +46,28 @@ impl UserDefinedLogicalNodeCore for MetricObserver { } } + fn fmt_for_explain(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "MetricObserver id={}", &self.id) + } + fn from_template( &self, - _exprs: &[datafusion_expr::Expr], + exprs: &[datafusion_expr::Expr], inputs: &[datafusion_expr::LogicalPlan], ) -> Self { - MetricObserver { + self.with_exprs_and_inputs(exprs.to_vec(), inputs.to_vec()) + .unwrap() + } + + fn with_exprs_and_inputs( + &self, + _exprs: Vec, + inputs: Vec, + ) -> datafusion_common::Result { + Ok(MetricObserver { id: self.id.clone(), input: inputs[0].clone(), enable_pushdown: self.enable_pushdown, - } + }) } } diff --git a/crates/core/src/delta_datafusion/mod.rs b/crates/core/src/delta_datafusion/mod.rs index 6ea60a0bda..c2b410cb74 100644 --- a/crates/core/src/delta_datafusion/mod.rs +++ b/crates/core/src/delta_datafusion/mod.rs @@ -27,17 +27,20 @@ use std::sync::Arc; use arrow::compute::{cast_with_options, CastOptions}; use arrow::datatypes::DataType; -use arrow::datatypes::{DataType as ArrowDataType, Schema as ArrowSchema, SchemaRef, TimeUnit}; +use arrow::datatypes::{ + DataType as ArrowDataType, Schema as ArrowSchema, SchemaRef, SchemaRef as ArrowSchemaRef, + TimeUnit, +}; use arrow::error::ArrowError; use arrow::record_batch::RecordBatch; use arrow_array::types::UInt16Type; use arrow_array::{Array, DictionaryArray, StringArray, TypedDictionaryArray}; use arrow_cast::display::array_value_to_string; - use arrow_schema::Field; use async_trait::async_trait; -use chrono::{NaiveDateTime, TimeZone, Utc}; -use datafusion::datasource::file_format::{parquet::ParquetFormat, FileFormat}; +use chrono::{DateTime, TimeZone, Utc}; +use datafusion::config::TableParquetOptions; +use datafusion::datasource::physical_plan::parquet::ParquetExecBuilder; use datafusion::datasource::physical_plan::{ wrap_partition_type_in_dict, wrap_partition_value_in_dict, FileScanConfig, }; @@ -46,35 +49,37 @@ use datafusion::datasource::{listing::PartitionedFile, MemTable, TableProvider, use datafusion::execution::context::{SessionConfig, SessionContext, SessionState, TaskContext}; use datafusion::execution::runtime_env::RuntimeEnv; use datafusion::execution::FunctionRegistry; -use datafusion::physical_expr::PhysicalSortExpr; use datafusion::physical_optimizer::pruning::PruningPredicate; use datafusion::physical_plan::filter::FilterExec; use datafusion::physical_plan::limit::LocalLimitExec; +use 
datafusion::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricBuilder, MetricsSet}; use datafusion::physical_plan::{ - DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, SendableRecordBatchStream, + DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, SendableRecordBatchStream, Statistics, }; use datafusion_common::scalar::ScalarValue; -use datafusion_common::tree_node::{TreeNode, TreeNodeVisitor, VisitRecursion}; -use datafusion_common::{Column, DataFusionError, Result as DataFusionResult, ToDFSchema}; -use datafusion_expr::expr::ScalarFunction; +use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor}; +use datafusion_common::{ + config::ConfigOptions, Column, DFSchema, DataFusionError, Result as DataFusionResult, + TableReference, ToDFSchema, +}; use datafusion_expr::logical_plan::CreateExternalTable; use datafusion_expr::utils::conjunction; use datafusion_expr::{col, Expr, Extension, LogicalPlan, TableProviderFilterPushDown, Volatility}; -use datafusion_physical_expr::execution_props::ExecutionProps; -use datafusion_physical_expr::{create_physical_expr, PhysicalExpr}; use datafusion_proto::logical_plan::LogicalExtensionCodec; use datafusion_proto::physical_plan::PhysicalExtensionCodec; use datafusion_sql::planner::ParserOptions; +use either::Either; use futures::TryStreamExt; - use itertools::Itertools; use object_store::ObjectMeta; use serde::{Deserialize, Serialize}; use url::Url; +use crate::delta_datafusion::expr::parse_predicate_expression; +use crate::delta_datafusion::schema_adapter::DeltaSchemaAdapterFactory; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::kernel::{Add, DataCheck, Invariant}; +use crate::kernel::{Add, DataCheck, EagerSnapshot, Invariant, Snapshot, StructTypeExt}; use crate::logstore::LogStoreRef; use crate::table::builder::ensure_table_uri; use crate::table::state::DeltaTableState; @@ -83,9 +88,14 @@ use crate::{open_table, open_table_with_storage_options, DeltaTable}; const PATH_COLUMN: &str = "__delta_rs_path"; +pub mod cdf; pub mod expr; pub mod logical; pub mod physical; +pub mod planner; + +mod find_files; +mod schema_adapter; impl From for DataFusionError { fn from(err: DeltaTableError) -> Self { @@ -111,6 +121,155 @@ impl From for DeltaTableError { } } +/// Convience trait for calling common methods on snapshot heirarchies +pub trait DataFusionMixins { + /// The physical datafusion schema of a table + fn arrow_schema(&self) -> DeltaResult; + + /// Get the table schema as an [`ArrowSchemaRef`] + fn input_schema(&self) -> DeltaResult; + + /// Parse an expression string into a datafusion [`Expr`] + fn parse_predicate_expression( + &self, + expr: impl AsRef, + df_state: &SessionState, + ) -> DeltaResult; +} + +impl DataFusionMixins for Snapshot { + fn arrow_schema(&self) -> DeltaResult { + _arrow_schema(self, true) + } + + fn input_schema(&self) -> DeltaResult { + _arrow_schema(self, false) + } + + fn parse_predicate_expression( + &self, + expr: impl AsRef, + df_state: &SessionState, + ) -> DeltaResult { + let schema = DFSchema::try_from(self.arrow_schema()?.as_ref().to_owned())?; + parse_predicate_expression(&schema, expr, df_state) + } +} + +impl DataFusionMixins for EagerSnapshot { + fn arrow_schema(&self) -> DeltaResult { + self.snapshot().arrow_schema() + } + + fn input_schema(&self) -> DeltaResult { + self.snapshot().input_schema() + } + + fn parse_predicate_expression( + &self, + expr: impl AsRef, + df_state: &SessionState, + ) -> DeltaResult { + 
self.snapshot().parse_predicate_expression(expr, df_state) + } +} + +impl DataFusionMixins for DeltaTableState { + fn arrow_schema(&self) -> DeltaResult { + self.snapshot.arrow_schema() + } + + fn input_schema(&self) -> DeltaResult { + self.snapshot.input_schema() + } + + fn parse_predicate_expression( + &self, + expr: impl AsRef, + df_state: &SessionState, + ) -> DeltaResult { + self.snapshot.parse_predicate_expression(expr, df_state) + } +} + +fn _arrow_schema(snapshot: &Snapshot, wrap_partitions: bool) -> DeltaResult { + let meta = snapshot.metadata(); + + let schema = meta.schema()?; + let fields = schema + .fields() + .filter(|f| !meta.partition_columns.contains(&f.name().to_string())) + .map(|f| f.try_into()) + .chain( + // We need stable order between logical and physical schemas, but the order of + // partitioning columns is not always the same in the json schema and the array + meta.partition_columns.iter().map(|partition_col| { + let f = schema.field(partition_col).unwrap(); + let field = Field::try_from(f)?; + let corrected = if wrap_partitions { + match field.data_type() { + // Only dictionary-encode types that may be large + // // https://github.com/apache/arrow-datafusion/pull/5545 + DataType::Utf8 + | DataType::LargeUtf8 + | DataType::Binary + | DataType::LargeBinary => { + wrap_partition_type_in_dict(field.data_type().clone()) + } + _ => field.data_type().clone(), + } + } else { + field.data_type().clone() + }; + Ok(field.with_data_type(corrected)) + }), + ) + .collect::, _>>()?; + + Ok(Arc::new(ArrowSchema::new(fields))) +} + +pub(crate) trait DataFusionFileMixins { + /// Iterate over all files in the log matching a predicate + fn files_matching_predicate(&self, filters: &[Expr]) -> DeltaResult>; +} + +impl DataFusionFileMixins for EagerSnapshot { + fn files_matching_predicate(&self, filters: &[Expr]) -> DeltaResult> { + files_matching_predicate(self, filters) + } +} + +pub(crate) fn files_matching_predicate<'a>( + snapshot: &'a EagerSnapshot, + filters: &[Expr], +) -> DeltaResult + 'a> { + if let Some(Some(predicate)) = + (!filters.is_empty()).then_some(conjunction(filters.iter().cloned())) + { + //let expr = logical_expr_to_physical_expr(predicate, snapshot.arrow_schema()?.as_ref()); + let expr = SessionContext::new() + .create_physical_expr(predicate, &snapshot.arrow_schema()?.to_dfschema()?)?; + let pruning_predicate = PruningPredicate::try_new(expr, snapshot.arrow_schema()?)?; + Ok(Either::Left( + snapshot + .file_actions()? + .zip(pruning_predicate.prune(snapshot)?) + .filter_map( + |(action, keep_file)| { + if keep_file { + Some(action) + } else { + None + } + }, + ), + )) + } else { + Ok(Either::Right(snapshot.file_actions()?)) + } +} + pub(crate) fn get_path_column<'a>( batch: &'a RecordBatch, path_column: &str, @@ -141,8 +300,9 @@ pub(crate) fn register_store(store: LogStoreRef, env: Arc) { env.register_object_store(url, store.object_store()); } -/// The logical schema for a Deltatable is different then protocol level schema since partiton columns must appear at the end of the schema. -/// This is to align with how partition are handled at the physical level +/// The logical schema for a Deltatable is different from the protocol level schema since partition +/// columns must appear at the end of the schema. 
This is to align with how partition are handled +/// at the physical level pub(crate) fn df_logical_schema( snapshot: &DeltaTableState, scan_config: &DeltaScanConfig, @@ -167,26 +327,40 @@ pub(crate) fn df_logical_schema( } if let Some(file_column_name) = &scan_config.file_column_name { - fields.push(Arc::new(Field::new( - file_column_name, - arrow_schema::DataType::Utf8, - true, - ))); + fields.push(Arc::new(Field::new(file_column_name, DataType::Utf8, true))); } Ok(Arc::new(ArrowSchema::new(fields))) } -#[derive(Debug, Clone, Default)] +#[derive(Debug, Clone)] /// Used to specify if additional metadata columns are exposed to the user pub struct DeltaScanConfigBuilder { - /// Include the source path for each record. The name of this column is determine by `file_column_name` + /// Include the source path for each record. The name of this column is determined by `file_column_name` include_file_column: bool, /// Column name that contains the source path. /// /// If include_file_column is true and the name is None then it will be auto-generated /// Otherwise the user provided name will be used file_column_name: Option, + /// Whether to wrap partition values in a dictionary encoding to potentially save space + wrap_partition_values: Option, + /// Whether to push down filter in end result or just prune the files + enable_parquet_pushdown: bool, + /// Schema to scan table with + schema: Option, +} + +impl Default for DeltaScanConfigBuilder { + fn default() -> Self { + DeltaScanConfigBuilder { + include_file_column: false, + file_column_name: None, + wrap_partition_values: None, + enable_parquet_pushdown: true, + schema: None, + } + } } impl DeltaScanConfigBuilder { @@ -210,16 +384,34 @@ impl DeltaScanConfigBuilder { self } + /// Whether to wrap partition values in a dictionary encoding + pub fn wrap_partition_values(mut self, wrap: bool) -> Self { + self.wrap_partition_values = Some(wrap); + self + } + + /// Allow pushdown of the scan filter + /// When disabled the filter will only be used for pruning files + pub fn with_parquet_pushdown(mut self, pushdown: bool) -> Self { + self.enable_parquet_pushdown = pushdown; + self + } + + /// Use the provided [SchemaRef] for the [DeltaScan] + pub fn with_schema(mut self, schema: SchemaRef) -> Self { + self.schema = Some(schema); + self + } + /// Build a DeltaScanConfig and ensure no column name conflicts occur during downstream processing pub fn build(&self, snapshot: &DeltaTableState) -> DeltaResult { - let input_schema = snapshot.input_schema()?; - let mut file_column_name = None; - let mut column_names: HashSet<&String> = HashSet::new(); - for field in input_schema.fields.iter() { - column_names.insert(field.name()); - } + let file_column_name = if self.include_file_column { + let input_schema = snapshot.input_schema()?; + let mut column_names: HashSet<&String> = HashSet::new(); + for field in input_schema.fields.iter() { + column_names.insert(field.name()); + } - if self.include_file_column { match &self.file_column_name { Some(name) => { if column_names.contains(name) { @@ -229,7 +421,7 @@ impl DeltaScanConfigBuilder { ))); } - file_column_name = Some(name.to_owned()) + Some(name.to_owned()) } None => { let prefix = PATH_COLUMN; @@ -241,12 +433,19 @@ impl DeltaScanConfigBuilder { name = format!("{}_{}", prefix, idx); } - file_column_name = Some(name); + Some(name) } } - } + } else { + None + }; - Ok(DeltaScanConfig { file_column_name }) + Ok(DeltaScanConfig { + file_column_name, + wrap_partition_values: self.wrap_partition_values.unwrap_or(true), + 
enable_parquet_pushdown: self.enable_parquet_pushdown, + schema: self.schema.clone(), + }) } } @@ -255,6 +454,12 @@ impl DeltaScanConfigBuilder { pub struct DeltaScanConfig { /// Include the source path for each record pub file_column_name: Option, + /// Wrap partition values in a dictionary encoding + pub wrap_partition_values: bool, + /// Allow pushdown of the scan filter + pub enable_parquet_pushdown: bool, + /// Schema to read as + pub schema: Option, } #[derive(Debug)] @@ -266,8 +471,7 @@ pub(crate) struct DeltaScanBuilder<'a> { projection: Option<&'a Vec>, limit: Option, files: Option<&'a [Add]>, - config: DeltaScanConfig, - schema: Option, + config: Option, } impl<'a> DeltaScanBuilder<'a> { @@ -281,11 +485,10 @@ impl<'a> DeltaScanBuilder<'a> { log_store, filter: None, state, - files: None, projection: None, limit: None, - config: DeltaScanConfig::default(), - schema: None, + files: None, + config: None, } } @@ -310,20 +513,21 @@ impl<'a> DeltaScanBuilder<'a> { } pub fn with_scan_config(mut self, config: DeltaScanConfig) -> Self { - self.config = config; + self.config = Some(config); self } pub async fn build(self) -> DeltaResult { - let config = self.config; - let schema = match self.schema { - Some(schema) => schema, - None => { - self.snapshot - .physical_arrow_schema(self.log_store.object_store()) - .await? - } + let config = match self.config { + Some(config) => config, + None => DeltaScanConfigBuilder::new().build(self.snapshot)?, }; + + let schema = match config.schema.clone() { + Some(value) => Ok(value), + None => self.snapshot.arrow_schema(), + }?; + let logical_schema = df_logical_schema(self.snapshot, &config)?; let logical_schema = if let Some(used_columns) = self.projection { @@ -336,34 +540,45 @@ impl<'a> DeltaScanBuilder<'a> { logical_schema }; + let context = SessionContext::new(); + let df_schema = logical_schema.clone().to_dfschema()?; let logical_filter = self .filter - .map(|expr| logical_expr_to_physical_expr(&expr, &logical_schema)); + .map(|expr| context.create_physical_expr(expr, &df_schema).unwrap()); // Perform Pruning of files to scan - let files = match self.files { - Some(files) => files.to_owned(), + let (files, files_scanned, files_pruned) = match self.files { + Some(files) => { + let files = files.to_owned(); + let files_scanned = files.len(); + (files, files_scanned, 0) + } None => { if let Some(predicate) = &logical_filter { let pruning_predicate = PruningPredicate::try_new(predicate.clone(), logical_schema.clone())?; let files_to_prune = pruning_predicate.prune(self.snapshot)?; - self.snapshot - .file_actions()? - .iter() + let mut files_pruned = 0usize; + let files = self + .snapshot + .file_actions_iter()? .zip(files_to_prune.into_iter()) - .filter_map( - |(action, keep)| { - if keep { - Some(action.to_owned()) - } else { - None - } - }, - ) - .collect() + .filter_map(|(action, keep)| { + if keep { + Some(action.to_owned()) + } else { + files_pruned += 1; + None + } + }) + .collect::>(); + + let files_scanned = files.len(); + (files, files_scanned, files_pruned) } else { - self.snapshot.file_actions()? 
+ let files = self.snapshot.file_actions()?; + let files_scanned = files.len(); + (files, files_scanned, 0) } } }; @@ -379,10 +594,12 @@ impl<'a> DeltaScanBuilder<'a> { let mut part = partitioned_file_from_action(action, table_partition_cols, &schema); if config.file_column_name.is_some() { - part.partition_values - .push(wrap_partition_value_in_dict(ScalarValue::Utf8(Some( - action.path.clone(), - )))); + let partition_value = if config.wrap_partition_values { + wrap_partition_value_in_dict(ScalarValue::Utf8(Some(action.path.clone()))) + } else { + ScalarValue::Utf8(Some(action.path.clone())) + }; + part.partition_values.push(partition_value); } file_groups @@ -406,9 +623,14 @@ impl<'a> DeltaScanBuilder<'a> { .collect::, ArrowError>>()?; if let Some(file_column_name) = &config.file_column_name { + let field_name_datatype = if config.wrap_partition_values { + wrap_partition_type_in_dict(DataType::Utf8) + } else { + DataType::Utf8 + }; table_partition_cols.push(Field::new( file_column_name.clone(), - wrap_partition_type_in_dict(DataType::Utf8), + field_name_datatype, false, )); } @@ -418,28 +640,47 @@ impl<'a> DeltaScanBuilder<'a> { .datafusion_table_statistics() .unwrap_or(Statistics::new_unknown(&schema)); - let scan = ParquetFormat::new() - .create_physical_plan( - self.state, - FileScanConfig { - object_store_url: self.log_store.object_store_url(), - file_schema, - file_groups: file_groups.into_values().collect(), - statistics: stats, - projection: self.projection.cloned(), - limit: self.limit, - table_partition_cols, - output_ordering: vec![], - }, - logical_filter.as_ref(), - ) - .await?; + let parquet_options = TableParquetOptions { + global: self.state.config().options().execution.parquet.clone(), + ..Default::default() + }; + + let mut exec_plan_builder = ParquetExecBuilder::new(FileScanConfig { + object_store_url: self.log_store.object_store_url(), + file_schema, + file_groups: file_groups.into_values().collect(), + statistics: stats, + projection: self.projection.cloned(), + limit: self.limit, + table_partition_cols, + output_ordering: vec![], + }) + .with_schema_adapter_factory(Arc::new(DeltaSchemaAdapterFactory {})) + .with_table_parquet_options(parquet_options); + + // Sometimes (i.e Merge) we want to prune files that don't make the + // filter and read the entire contents for files that do match the + // filter + if let Some(predicate) = logical_filter { + if config.enable_parquet_pushdown { + exec_plan_builder = exec_plan_builder.with_predicate(predicate); + } + }; + + let metrics = ExecutionPlanMetricsSet::new(); + MetricBuilder::new(&metrics) + .global_counter("files_scanned") + .add(files_scanned); + MetricBuilder::new(&metrics) + .global_counter("files_pruned") + .add(files_pruned); Ok(DeltaScan { table_uri: ensure_table_uri(self.log_store.root_uri())?.as_str().into(), - parquet_scan: scan, + parquet_scan: exec_plan_builder.build_arc(), config, logical_schema, + metrics, }) } } @@ -487,11 +728,14 @@ impl TableProvider for DeltaTable { Ok(Arc::new(scan)) } - fn supports_filter_pushdown( + fn supports_filters_pushdown( &self, - _filter: &Expr, - ) -> DataFusionResult { - Ok(TableProviderFilterPushDown::Inexact) + filter: &[&Expr], + ) -> DataFusionResult> { + Ok(filter + .into_iter() + .map(|_| TableProviderFilterPushDown::Inexact) + .collect()) } fn statistics(&self) -> Option { @@ -505,6 +749,7 @@ pub struct DeltaTableProvider { log_store: LogStoreRef, config: DeltaScanConfig, schema: Arc, + files: Option>, } impl DeltaTableProvider { @@ -519,8 +764,15 @@ impl 
DeltaTableProvider { snapshot, log_store, config, + files: None, }) } + + /// Define which files to consider while building a scan, for advanced usecases + pub fn with_files(mut self, files: Vec) -> DeltaTableProvider { + self.files = Some(files); + self + } } #[async_trait] @@ -555,22 +807,23 @@ impl TableProvider for DeltaTableProvider { register_store(self.log_store.clone(), session.runtime_env().clone()); let filter_expr = conjunction(filters.iter().cloned()); - let scan = DeltaScanBuilder::new(&self.snapshot, self.log_store.clone(), session) + let mut scan = DeltaScanBuilder::new(&self.snapshot, self.log_store.clone(), session) .with_projection(projection) .with_limit(limit) .with_filter(filter_expr) - .with_scan_config(self.config.clone()) - .build() - .await?; + .with_scan_config(self.config.clone()); - Ok(Arc::new(scan)) + if let Some(files) = &self.files { + scan = scan.with_files(files); + } + Ok(Arc::new(scan.build().await?)) } - fn supports_filter_pushdown( + fn supports_filters_pushdown( &self, - _filter: &Expr, - ) -> DataFusionResult { - Ok(TableProviderFilterPushDown::Inexact) + _filter: &[&Expr], + ) -> DataFusionResult> { + Ok(vec![TableProviderFilterPushDown::Inexact]) } fn statistics(&self) -> Option { @@ -590,6 +843,8 @@ pub struct DeltaScan { pub parquet_scan: Arc, /// The schema of the table to be used when evaluating expressions pub logical_schema: Arc, + /// Metrics for scan reported via DataFusion + metrics: ExecutionPlanMetricsSet, } #[derive(Debug, Serialize, Deserialize)] @@ -606,6 +861,10 @@ impl DisplayAs for DeltaScan { } impl ExecutionPlan for DeltaScan { + fn name(&self) -> &str { + Self::static_name() + } + fn as_any(&self) -> &dyn Any { self } @@ -614,23 +873,31 @@ impl ExecutionPlan for DeltaScan { self.parquet_scan.schema() } - fn output_partitioning(&self) -> Partitioning { - self.parquet_scan.output_partitioning() - } - - fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { - self.parquet_scan.output_ordering() + fn properties(&self) -> &PlanProperties { + self.parquet_scan.properties() } - fn children(&self) -> Vec> { - vec![self.parquet_scan.clone()] + fn children(&self) -> Vec<&Arc> { + vec![&self.parquet_scan] } fn with_new_children( self: Arc, children: Vec>, ) -> DataFusionResult> { - ExecutionPlan::with_new_children(self.parquet_scan.clone(), children) + if children.len() != 1 { + return Err(DataFusionError::Plan(format!( + "DeltaScan wrong number of children {}", + children.len() + ))); + } + Ok(Arc::new(DeltaScan { + table_uri: self.table_uri.clone(), + config: self.config.clone(), + parquet_scan: children[0].clone(), + logical_schema: self.logical_schema.clone(), + metrics: self.metrics.clone(), + })) } fn execute( @@ -641,9 +908,31 @@ impl ExecutionPlan for DeltaScan { self.parquet_scan.execute(partition, context) } + fn metrics(&self) -> Option { + Some(self.metrics.clone_inner()) + } + fn statistics(&self) -> DataFusionResult { self.parquet_scan.statistics() } + + fn repartitioned( + &self, + target_partitions: usize, + config: &ConfigOptions, + ) -> DataFusionResult>> { + if let Some(parquet_scan) = self.parquet_scan.repartitioned(target_partitions, config)? 
{ + Ok(Some(Arc::new(DeltaScan { + table_uri: self.table_uri.clone(), + config: self.config.clone(), + parquet_scan, + logical_schema: self.logical_schema.clone(), + metrics: self.metrics.clone(), + }))) + } else { + Ok(None) + } + } } pub(crate) fn get_null_of_arrow_type(t: &ArrowDataType) -> DeltaResult { @@ -700,6 +989,10 @@ pub(crate) fn get_null_of_arrow_type(t: &ArrowDataType) -> DeltaResult Err(DeltaTableError::Generic(format!( "Unsupported data type for Delta Lake {}", t @@ -739,8 +1032,11 @@ pub(crate) fn partitioned_file_from_action( let ts_secs = action.modification_time / 1000; let ts_ns = (action.modification_time % 1000) * 1_000_000; - let last_modified = - Utc.from_utc_datetime(&NaiveDateTime::from_timestamp_opt(ts_secs, ts_ns as u32).unwrap()); + let last_modified = Utc.from_utc_datetime( + &DateTime::from_timestamp(ts_secs, ts_ns as u32) + .unwrap() + .naive_utc(), + ); PartitionedFile { object_meta: ObjectMeta { last_modified, @@ -749,9 +1045,31 @@ pub(crate) fn partitioned_file_from_action( partition_values, range: None, extensions: None, + statistics: None, } } +fn parse_date( + stat_val: &serde_json::Value, + field_dt: &ArrowDataType, +) -> DataFusionResult { + let string = match stat_val { + serde_json::Value::String(s) => s.to_owned(), + _ => stat_val.to_string(), + }; + + let time_micro = ScalarValue::try_from_string(string, &ArrowDataType::Date32)?; + let cast_arr = cast_with_options( + &time_micro.to_array()?, + field_dt, + &CastOptions { + safe: false, + ..Default::default() + }, + )?; + ScalarValue::try_from_array(&cast_arr, 0) +} + fn parse_timestamp( stat_val: &serde_json::Value, field_dt: &ArrowDataType, @@ -786,6 +1104,7 @@ pub(crate) fn to_correct_scalar_value( serde_json::Value::Null => Ok(Some(get_null_of_arrow_type(field_dt)?)), serde_json::Value::String(string_val) => match field_dt { ArrowDataType::Timestamp(_, _) => Ok(Some(parse_timestamp(stat_val, field_dt)?)), + ArrowDataType::Date32 => Ok(Some(parse_date(stat_val, field_dt)?)), _ => Ok(Some(ScalarValue::try_from_string( string_val.to_owned(), field_dt, @@ -793,6 +1112,7 @@ pub(crate) fn to_correct_scalar_value( }, other => match field_dt { ArrowDataType::Timestamp(_, _) => Ok(Some(parse_timestamp(stat_val, field_dt)?)), + ArrowDataType::Date32 => Ok(Some(parse_date(stat_val, field_dt)?)), _ => Ok(Some(ScalarValue::try_from_string( other.to_string(), field_dt, @@ -801,21 +1121,12 @@ pub(crate) fn to_correct_scalar_value( } } -pub(crate) fn logical_expr_to_physical_expr( - expr: &Expr, - schema: &ArrowSchema, -) -> Arc { - let df_schema = schema.clone().to_dfschema().unwrap(); - let execution_props = ExecutionProps::new(); - create_physical_expr(expr, &df_schema, &execution_props).unwrap() -} - pub(crate) async fn execute_plan_to_batch( state: &SessionState, plan: Arc, ) -> DeltaResult { - let data = - futures::future::try_join_all((0..plan.output_partitioning().partition_count()).map(|p| { + let data = futures::future::try_join_all( + (0..plan.properties().output_partitioning().partition_count()).map(|p| { let plan_copy = plan.clone(); let task_context = state.task_ctx().clone(); async move { @@ -827,8 +1138,9 @@ pub(crate) async fn execute_plan_to_batch( DataFusionResult::<_>::Ok(arrow::compute::concat_batches(&schema, batches.iter())?) 
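                // The partition count driving the map above now comes from the
                // consolidated `ExecutionPlan::properties()` accessor, i.e.
                //
                //     plan.properties().output_partitioning().partition_count()
                //
                // Each partition is executed and its stream collected here; the
                // per-partition results are concatenated into one batch below.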
} - })) - .await?; + }), + ) + .await?; let batch = arrow::compute::concat_batches(&plan.schema(), data.iter())?; @@ -977,6 +1289,7 @@ impl PhysicalExtensionCodec for DeltaPhysicalCodec { parquet_scan: (*inputs)[0].clone(), config: wire.config, logical_schema: wire.logical_schema, + metrics: ExecutionPlanMetricsSet::new(), }; Ok(Arc::new(delta_scan)) } @@ -1023,6 +1336,7 @@ impl LogicalExtensionCodec for DeltaLogicalCodec { fn try_decode_table_provider( &self, buf: &[u8], + _table_ref: &TableReference, _schema: SchemaRef, _ctx: &SessionContext, ) -> Result, DataFusionError> { @@ -1033,6 +1347,7 @@ impl LogicalExtensionCodec for DeltaLogicalCodec { fn try_encode_table_provider( &self, + _table_ref: &TableReference, node: Arc, buf: &mut Vec, ) -> Result<(), DataFusionError> { @@ -1077,10 +1392,10 @@ pub(crate) struct FindFilesExprProperties { /// Ensure only expressions that make sense are accepted, check for /// non-deterministic functions, and determine if the expression only contains /// partition columns -impl TreeNodeVisitor for FindFilesExprProperties { - type N = Expr; +impl TreeNodeVisitor<'_> for FindFilesExprProperties { + type Node = Expr; - fn pre_visit(&mut self, expr: &Self::N) -> datafusion_common::Result { + fn f_down(&mut self, expr: &Self::Node) -> datafusion_common::Result { // TODO: We can likely relax the volatility to STABLE. Would require further // research to confirm the same value is generated during the scan and // rewrite phases. @@ -1108,28 +1423,20 @@ impl TreeNodeVisitor for FindFilesExprProperties { | Expr::IsNotUnknown(_) | Expr::Negative(_) | Expr::InList { .. } - | Expr::GetIndexedField(_) | Expr::Between(_) | Expr::Case(_) | Expr::Cast(_) | Expr::TryCast(_) => (), - Expr::ScalarFunction(ScalarFunction { func_def, .. }) => { - let v = match func_def { - datafusion_expr::ScalarFunctionDefinition::BuiltIn(f) => f.volatility(), - datafusion_expr::ScalarFunctionDefinition::UDF(u) => u.signature().volatility, - datafusion_expr::ScalarFunctionDefinition::Name(n) => { + Expr::ScalarFunction(scalar_function) => { + match scalar_function.func.signature().volatility { + Volatility::Immutable => (), + _ => { self.result = Err(DeltaTableError::Generic(format!( - "Cannot determine volatility of find files predicate function {n}", + "Find files predicate contains nondeterministic function {}", + scalar_function.func.name() ))); - return Ok(VisitRecursion::Stop); + return Ok(TreeNodeRecursion::Stop); } - }; - if v > Volatility::Immutable { - self.result = Err(DeltaTableError::Generic(format!( - "Find files predicate contains nondeterministic function {}", - func_def.name() - ))); - return Ok(VisitRecursion::Stop); } } _ => { @@ -1137,14 +1444,15 @@ impl TreeNodeVisitor for FindFilesExprProperties { "Find files predicate contains unsupported expression {}", expr ))); - return Ok(VisitRecursion::Stop); + return Ok(TreeNodeRecursion::Stop); } } - Ok(VisitRecursion::Continue) + Ok(TreeNodeRecursion::Continue) } } +#[derive(Debug, Hash, Eq, PartialEq)] /// Representing the result of the [find_files] function. 
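// Illustrative predicates (column and function names are assumed for the example):
//
//     col("value").gt(lit(10))      // accepted: immutable expression
//     col("value").gt(random())     // rejected: volatile scalar function
//
// The same expression must produce identical results during the scan and rewrite
// phases, which is why only `Volatility::Immutable` functions are accepted above.
// Predicates that pass the check feed `find_files_scan` below, which returns the
// `Add` actions of files containing at least one matching record.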
pub struct FindFiles { /// A list of `Add` objects that match the given predicate @@ -1198,7 +1506,7 @@ fn join_batches_with_add_actions( Ok(files) } -/// Determine which files contain a record that statisfies the predicate +/// Determine which files contain a record that satisfies the predicate pub(crate) async fn find_files_scan<'a>( snapshot: &DeltaTableState, log_store: LogStoreRef, @@ -1206,8 +1514,7 @@ pub(crate) async fn find_files_scan<'a>( expression: Expr, ) -> DeltaResult> { let candidate_map: HashMap = snapshot - .file_actions()? - .iter() + .file_actions_iter()? .map(|add| (add.path.clone(), add.to_owned())) .collect(); @@ -1221,7 +1528,7 @@ pub(crate) async fn find_files_scan<'a>( // Identify which columns we need to project let mut used_columns = expression - .to_columns()? + .column_refs() .into_iter() .map(|column| logical_schema.index_of(&column.name)) .collect::, ArrowError>>()?; @@ -1240,11 +1547,8 @@ pub(crate) async fn find_files_scan<'a>( let input_schema = scan.logical_schema.as_ref().to_owned(); let input_dfschema = input_schema.clone().try_into()?; - let predicate_expr = create_physical_expr( - &Expr::IsTrue(Box::new(expression.clone())), - &input_dfschema, - state.execution_props(), - )?; + let predicate_expr = + state.create_physical_expr(Expr::IsTrue(Box::new(expression.clone())), &input_dfschema)?; let filter: Arc = Arc::new(FilterExec::try_new(predicate_expr, scan.clone())?); @@ -1466,12 +1770,16 @@ impl From for DeltaColumn { #[cfg(test)] mod tests { + use crate::operations::write::SchemaMode; use crate::writer::test_utils::get_delta_schema; use arrow::array::StructArray; use arrow::datatypes::{DataType, Field, Schema}; use chrono::{TimeZone, Utc}; use datafusion::assert_batches_sorted_eq; + use datafusion::datasource::physical_plan::ParquetExec; use datafusion::physical_plan::empty::EmptyExec; + use datafusion::physical_plan::{visit_execution_plan, ExecutionPlanVisitor, PhysicalExpr}; + use datafusion_expr::lit; use datafusion_proto::physical_plan::AsExecutionPlan; use datafusion_proto::protobuf; use object_store::path::Path; @@ -1591,6 +1899,7 @@ mod tests { partition_values: [ScalarValue::Int64(Some(2015)), ScalarValue::Int64(Some(1))].to_vec(), range: None, extensions: None, + statistics: None, }; assert_eq!(file.partition_values, ref_file.partition_values) } @@ -1679,6 +1988,7 @@ mod tests { parquet_scan: Arc::from(EmptyExec::new(schema.clone())), config: DeltaScanConfig::default(), logical_schema: schema.clone(), + metrics: ExecutionPlanMetricsSet::new(), }); let proto: protobuf::PhysicalPlanNode = protobuf::PhysicalPlanNode::try_from_physical_plan(exec_plan.clone(), &codec) @@ -1734,7 +2044,7 @@ mod tests { let table = crate::DeltaOps::new_in_memory() .create() - .with_columns(get_delta_schema().fields().clone()) + .with_columns(get_delta_schema().fields().cloned()) .with_partition_columns(["modified", "id"]) .await .unwrap(); @@ -1864,4 +2174,402 @@ mod tests { assert_batches_sorted_eq!(&expected, &actual); */ } + + #[tokio::test] + async fn delta_scan_supports_missing_columns() { + let schema1 = Arc::new(ArrowSchema::new(vec![Field::new( + "col_1", + DataType::Utf8, + true, + )])); + + let batch1 = RecordBatch::try_new( + schema1.clone(), + vec![Arc::new(arrow::array::StringArray::from(vec![ + Some("A"), + Some("B"), + ]))], + ) + .unwrap(); + + let schema2 = Arc::new(ArrowSchema::new(vec![ + Field::new("col_1", DataType::Utf8, true), + Field::new("col_2", DataType::Utf8, true), + ])); + + let batch2 = RecordBatch::try_new( + schema2.clone(), + 
vec![ + Arc::new(arrow::array::StringArray::from(vec![ + Some("E"), + Some("F"), + Some("G"), + ])), + Arc::new(arrow::array::StringArray::from(vec![ + Some("E2"), + Some("F2"), + Some("G2"), + ])), + ], + ) + .unwrap(); + + let table = crate::DeltaOps::new_in_memory() + .write(vec![batch2]) + .with_save_mode(crate::protocol::SaveMode::Append) + .await + .unwrap(); + + let table = crate::DeltaOps(table) + .write(vec![batch1]) + .with_schema_mode(SchemaMode::Merge) + .with_save_mode(crate::protocol::SaveMode::Append) + .await + .unwrap(); + + let config = DeltaScanConfigBuilder::new() + .build(table.snapshot().unwrap()) + .unwrap(); + let log = table.log_store(); + + let provider = + DeltaTableProvider::try_new(table.snapshot().unwrap().clone(), log, config).unwrap(); + let ctx: SessionContext = DeltaSessionContext::default().into(); + ctx.register_table("test", Arc::new(provider)).unwrap(); + + let df = ctx.sql("select col_1, col_2 from test").await.unwrap(); + let actual = df.collect().await.unwrap(); + let expected = vec![ + "+-------+-------+", + "| col_1 | col_2 |", + "+-------+-------+", + "| A | |", + "| B | |", + "| E | E2 |", + "| F | F2 |", + "| G | G2 |", + "+-------+-------+", + ]; + assert_batches_sorted_eq!(&expected, &actual); + } + + #[tokio::test] + async fn delta_scan_supports_pushdown() { + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("col_1", DataType::Utf8, false), + Field::new("col_2", DataType::Utf8, false), + ])); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(arrow::array::StringArray::from(vec![ + Some("A"), + Some("B"), + Some("C"), + ])), + Arc::new(arrow::array::StringArray::from(vec![ + Some("A2"), + Some("B2"), + Some("C2"), + ])), + ], + ) + .unwrap(); + + let table = crate::DeltaOps::new_in_memory() + .write(vec![batch]) + .with_save_mode(crate::protocol::SaveMode::Append) + .await + .unwrap(); + + let config = DeltaScanConfigBuilder::new() + .build(table.snapshot().unwrap()) + .unwrap(); + let log = table.log_store(); + + let provider = + DeltaTableProvider::try_new(table.snapshot().unwrap().clone(), log, config).unwrap(); + + let mut cfg = SessionConfig::default(); + cfg.options_mut().execution.parquet.pushdown_filters = true; + let ctx = SessionContext::new_with_config(cfg); + ctx.register_table("test", Arc::new(provider)).unwrap(); + + let df = ctx + .sql("select col_1, col_2 from test WHERE col_1 = 'A'") + .await + .unwrap(); + let actual = df.collect().await.unwrap(); + let expected = vec![ + "+-------+-------+", + "| col_1 | col_2 |", + "+-------+-------+", + "| A | A2 |", + "+-------+-------+", + ]; + assert_batches_sorted_eq!(&expected, &actual); + } + + #[tokio::test] + async fn delta_scan_supports_nested_missing_columns() { + let column1_schema1: arrow::datatypes::Fields = + vec![Field::new("col_1a", DataType::Utf8, true)].into(); + let schema1 = Arc::new(ArrowSchema::new(vec![Field::new( + "col_1", + DataType::Struct(column1_schema1.clone()), + true, + )])); + + let batch1 = RecordBatch::try_new( + schema1.clone(), + vec![Arc::new(StructArray::new( + column1_schema1, + vec![Arc::new(arrow::array::StringArray::from(vec![ + Some("A"), + Some("B"), + ]))], + None, + ))], + ) + .unwrap(); + + let column1_schema2: arrow::datatypes::Fields = vec![ + Field::new("col_1a", DataType::Utf8, true), + Field::new("col_1b", DataType::Utf8, true), + ] + .into(); + let schema2 = Arc::new(ArrowSchema::new(vec![Field::new( + "col_1", + DataType::Struct(column1_schema2.clone()), + true, + )])); + + let batch2 = 
RecordBatch::try_new( + schema2.clone(), + vec![Arc::new(StructArray::new( + column1_schema2, + vec![ + Arc::new(arrow::array::StringArray::from(vec![ + Some("E"), + Some("F"), + Some("G"), + ])), + Arc::new(arrow::array::StringArray::from(vec![ + Some("E2"), + Some("F2"), + Some("G2"), + ])), + ], + None, + ))], + ) + .unwrap(); + + let table = crate::DeltaOps::new_in_memory() + .write(vec![batch1]) + .with_save_mode(crate::protocol::SaveMode::Append) + .await + .unwrap(); + + let table = crate::DeltaOps(table) + .write(vec![batch2]) + .with_schema_mode(SchemaMode::Merge) + .with_save_mode(crate::protocol::SaveMode::Append) + .await + .unwrap(); + + let config = DeltaScanConfigBuilder::new() + .build(table.snapshot().unwrap()) + .unwrap(); + let log = table.log_store(); + + let provider = + DeltaTableProvider::try_new(table.snapshot().unwrap().clone(), log, config).unwrap(); + let ctx: SessionContext = DeltaSessionContext::default().into(); + ctx.register_table("test", Arc::new(provider)).unwrap(); + + let df = ctx + .sql("select col_1.col_1a, col_1.col_1b from test") + .await + .unwrap(); + let actual = df.collect().await.unwrap(); + let expected = vec![ + "+--------------------+--------------------+", + "| test.col_1[col_1a] | test.col_1[col_1b] |", + "+--------------------+--------------------+", + "| A | |", + "| B | |", + "| E | E2 |", + "| F | F2 |", + "| G | G2 |", + "+--------------------+--------------------+", + ]; + assert_batches_sorted_eq!(&expected, &actual); + } + + #[tokio::test] + async fn test_multiple_predicate_pushdown() { + use crate::datafusion::prelude::SessionContext; + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("moDified", DataType::Utf8, true), + Field::new("id", DataType::Utf8, true), + Field::new("vaLue", DataType::Int32, true), + ])); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(arrow::array::StringArray::from(vec![ + "2021-02-01", + "2021-02-01", + "2021-02-02", + "2021-02-02", + ])), + Arc::new(arrow::array::StringArray::from(vec!["A", "B", "C", "D"])), + Arc::new(arrow::array::Int32Array::from(vec![1, 10, 20, 100])), + ], + ) + .unwrap(); + // write some data + let table = crate::DeltaOps::new_in_memory() + .write(vec![batch.clone()]) + .with_save_mode(crate::protocol::SaveMode::Append) + .await + .unwrap(); + + let datafusion = SessionContext::new(); + let table = Arc::new(table); + + datafusion.register_table("snapshot", table).unwrap(); + + let df = datafusion + .sql("select * from snapshot where id > 10000 and id < 20000") + .await + .unwrap(); + + df.collect().await.unwrap(); + } + + #[tokio::test] + async fn test_delta_scan_builder_no_scan_config() { + let arr: Arc = Arc::new(arrow::array::StringArray::from(vec!["s"])); + let batch = RecordBatch::try_from_iter_with_nullable(vec![("a", arr, false)]).unwrap(); + let table = crate::DeltaOps::new_in_memory() + .write(vec![batch]) + .with_save_mode(crate::protocol::SaveMode::Append) + .await + .unwrap(); + + let ctx = SessionContext::new(); + let state = ctx.state(); + let scan = DeltaScanBuilder::new(table.snapshot().unwrap(), table.log_store(), &state) + .with_filter(Some(col("a").eq(lit("s")))) + .build() + .await + .unwrap(); + + let mut visitor = ParquetPredicateVisitor::default(); + visit_execution_plan(&scan, &mut visitor).unwrap(); + + assert_eq!(visitor.predicate.unwrap().to_string(), "a@0 = s"); + assert_eq!( + visitor.pruning_predicate.unwrap().orig_expr().to_string(), + "a@0 = s" + ); + } + + #[tokio::test] + async fn 
test_delta_scan_builder_scan_config_disable_pushdown() { + let arr: Arc = Arc::new(arrow::array::StringArray::from(vec!["s"])); + let batch = RecordBatch::try_from_iter_with_nullable(vec![("a", arr, false)]).unwrap(); + let table = crate::DeltaOps::new_in_memory() + .write(vec![batch]) + .with_save_mode(crate::protocol::SaveMode::Append) + .await + .unwrap(); + + let snapshot = table.snapshot().unwrap(); + let ctx = SessionContext::new(); + let state = ctx.state(); + let scan = DeltaScanBuilder::new(snapshot, table.log_store(), &state) + .with_filter(Some(col("a").eq(lit("s")))) + .with_scan_config( + DeltaScanConfigBuilder::new() + .with_parquet_pushdown(false) + .build(snapshot) + .unwrap(), + ) + .build() + .await + .unwrap(); + + let mut visitor = ParquetPredicateVisitor::default(); + visit_execution_plan(&scan, &mut visitor).unwrap(); + + assert!(visitor.predicate.is_none()); + assert!(visitor.pruning_predicate.is_none()); + } + + #[tokio::test] + async fn test_delta_scan_applies_parquet_options() { + let arr: Arc = Arc::new(arrow::array::StringArray::from(vec!["s"])); + let batch = RecordBatch::try_from_iter_with_nullable(vec![("a", arr, false)]).unwrap(); + let table = crate::DeltaOps::new_in_memory() + .write(vec![batch]) + .with_save_mode(crate::protocol::SaveMode::Append) + .await + .unwrap(); + + let snapshot = table.snapshot().unwrap(); + + let mut config = SessionConfig::default(); + config.options_mut().execution.parquet.pushdown_filters = true; + let ctx = SessionContext::new_with_config(config); + let state = ctx.state(); + + let scan = DeltaScanBuilder::new(snapshot, table.log_store(), &state) + .build() + .await + .unwrap(); + + let mut visitor = ParquetOptionsVisitor::default(); + visit_execution_plan(&scan, &mut visitor).unwrap(); + + assert_eq!(ctx.copied_table_options().parquet, visitor.options.unwrap()); + } + + #[derive(Default)] + struct ParquetPredicateVisitor { + predicate: Option>, + pruning_predicate: Option>, + } + + impl ExecutionPlanVisitor for ParquetPredicateVisitor { + type Error = DataFusionError; + + fn pre_visit(&mut self, plan: &dyn ExecutionPlan) -> Result { + if let Some(parquet_exec) = plan.as_any().downcast_ref::() { + self.predicate = parquet_exec.predicate().cloned(); + self.pruning_predicate = parquet_exec.pruning_predicate().cloned(); + } + Ok(true) + } + } + + #[derive(Default)] + struct ParquetOptionsVisitor { + options: Option, + } + + impl ExecutionPlanVisitor for ParquetOptionsVisitor { + type Error = DataFusionError; + + fn pre_visit(&mut self, plan: &dyn ExecutionPlan) -> Result { + if let Some(parquet_exec) = plan.as_any().downcast_ref::() { + self.options = Some(parquet_exec.table_parquet_options().clone()) + } + Ok(true) + } + } } diff --git a/crates/core/src/delta_datafusion/physical.rs b/crates/core/src/delta_datafusion/physical.rs index 954df0b046..c37b85101e 100644 --- a/crates/core/src/delta_datafusion/physical.rs +++ b/crates/core/src/delta_datafusion/physical.rs @@ -74,6 +74,10 @@ impl DisplayAs for MetricObserverExec { } impl ExecutionPlan for MetricObserverExec { + fn name(&self) -> &str { + Self::static_name() + } + fn as_any(&self) -> &dyn std::any::Any { self } @@ -82,16 +86,12 @@ impl ExecutionPlan for MetricObserverExec { self.parent.schema() } - fn output_partitioning(&self) -> datafusion::physical_plan::Partitioning { - self.parent.output_partitioning() - } - - fn output_ordering(&self) -> Option<&[datafusion_physical_expr::PhysicalSortExpr]> { - self.parent.output_ordering() + fn properties(&self) -> 
&datafusion::physical_plan::PlanProperties { + self.parent.properties() } - fn children(&self) -> Vec> { - vec![self.parent.clone()] + fn children(&self) -> Vec<&Arc> { + vec![&self.parent] } fn execute( @@ -178,3 +178,7 @@ pub(crate) fn find_metric_node( None } + +pub(crate) fn get_metric(metrics: &MetricsSet, name: &str) -> usize { + metrics.sum_by_name(name).map(|m| m.as_usize()).unwrap_or(0) +} diff --git a/crates/core/src/delta_datafusion/planner.rs b/crates/core/src/delta_datafusion/planner.rs new file mode 100644 index 0000000000..f0af1092ca --- /dev/null +++ b/crates/core/src/delta_datafusion/planner.rs @@ -0,0 +1,57 @@ +//! Custom planners for datafusion so that you can convert custom nodes, can be used +//! to trace custom metrics in an operation +//! +//! # Example +//! +//! #[derive(Clone)] +//! struct MergeMetricExtensionPlanner {} +//! +//! #[async_trait] +//! impl ExtensionPlanner for MergeMetricExtensionPlanner { +//! async fn plan_extension( +//! &self, +//! planner: &dyn PhysicalPlanner, +//! node: &dyn UserDefinedLogicalNode, +//! _logical_inputs: &[&LogicalPlan], +//! physical_inputs: &[Arc], +//! session_state: &SessionState, +//! ) -> DataFusionResult>> {} +//! +//! let merge_planner = DeltaPlanner:: { +//! extension_planner: MergeMetricExtensionPlanner {} +//! }; +//! +//! let state = state.with_query_planner(Arc::new(merge_planner)); +use std::sync::Arc; + +use crate::delta_datafusion::DataFusionResult; +use async_trait::async_trait; +use datafusion::physical_planner::PhysicalPlanner; +use datafusion::{ + execution::{context::QueryPlanner, session_state::SessionState}, + physical_plan::ExecutionPlan, + physical_planner::{DefaultPhysicalPlanner, ExtensionPlanner}, +}; +use datafusion_expr::LogicalPlan; + +/// Deltaplanner +pub struct DeltaPlanner { + /// custom extension planner + pub extension_planner: T, +} + +#[async_trait] +impl QueryPlanner for DeltaPlanner { + async fn create_physical_plan( + &self, + logical_plan: &LogicalPlan, + session_state: &SessionState, + ) -> DataFusionResult> { + let planner = Arc::new(Box::new(DefaultPhysicalPlanner::with_extension_planners( + vec![Arc::new(self.extension_planner.clone())], + ))); + planner + .create_physical_plan(logical_plan, session_state) + .await + } +} diff --git a/crates/core/src/delta_datafusion/schema_adapter.rs b/crates/core/src/delta_datafusion/schema_adapter.rs new file mode 100644 index 0000000000..5fb0724f50 --- /dev/null +++ b/crates/core/src/delta_datafusion/schema_adapter.rs @@ -0,0 +1,80 @@ +use crate::operations::cast::cast_record_batch; +use arrow_array::RecordBatch; +use arrow_schema::{Schema, SchemaRef}; +use datafusion::datasource::schema_adapter::{SchemaAdapter, SchemaAdapterFactory, SchemaMapper}; +use std::fmt::Debug; +use std::sync::Arc; + +/// A Schema Adapter Factory which provides casting record batches from parquet to meet +/// delta lake conventions. 
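// Sketch of how the factory is wired into the scan (see `DeltaScanBuilder::build`):
//
//     ParquetExecBuilder::new(file_scan_config)
//         .with_schema_adapter_factory(Arc::new(DeltaSchemaAdapterFactory {}))
//         .with_table_parquet_options(parquet_options);
//
// Record batches read from individual parquet files are then cast by
// `SchemaMapping::map_batch` to the table's arrow schema, so files written with
// older, narrower schemas still produce batches with the full set of columns
// (exercised by the `delta_scan_supports_missing_columns` test).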
+#[derive(Debug)] +pub(crate) struct DeltaSchemaAdapterFactory {} + +impl SchemaAdapterFactory for DeltaSchemaAdapterFactory { + fn create(&self, schema: SchemaRef) -> Box { + Box::new(DeltaSchemaAdapter { + table_schema: schema, + }) + } +} + +pub(crate) struct DeltaSchemaAdapter { + /// Schema for the table + table_schema: SchemaRef, +} + +impl SchemaAdapter for DeltaSchemaAdapter { + fn map_column_index(&self, index: usize, file_schema: &Schema) -> Option { + let field = self.table_schema.field(index); + Some(file_schema.fields.find(field.name())?.0) + } + + fn map_schema( + &self, + file_schema: &Schema, + ) -> datafusion_common::Result<(Arc, Vec)> { + let mut projection = Vec::with_capacity(file_schema.fields().len()); + + for (file_idx, file_field) in file_schema.fields.iter().enumerate() { + if self.table_schema.fields().find(file_field.name()).is_some() { + projection.push(file_idx); + } + } + + Ok(( + Arc::new(SchemaMapping { + table_schema: self.table_schema.clone(), + }), + projection, + )) + } +} + +#[derive(Debug)] +pub(crate) struct SchemaMapping { + table_schema: SchemaRef, +} + +impl SchemaMapper for SchemaMapping { + fn map_batch(&self, batch: RecordBatch) -> datafusion_common::Result { + let record_batch = cast_record_batch(&batch, self.table_schema.clone(), false, true)?; + Ok(record_batch) + } + + fn map_partial_batch(&self, batch: RecordBatch) -> datafusion_common::Result { + let partial_table_schema = Arc::new(Schema::new( + batch + .schema() + .fields() + .iter() + .filter_map(|batch_field| { + self.table_schema.field_with_name(batch_field.name()).ok() + }) + .cloned() + .collect::>(), + )); + + let record_batch = cast_record_batch(&batch, partial_table_schema, false, true)?; + Ok(record_batch) + } +} diff --git a/crates/core/src/errors.rs b/crates/core/src/errors.rs index 63524fd227..0fa589286b 100644 --- a/crates/core/src/errors.rs +++ b/crates/core/src/errors.rs @@ -1,7 +1,7 @@ //! 
Exceptions for the deltalake crate use object_store::Error as ObjectStoreError; -use crate::operations::transaction::TransactionError; +use crate::operations::transaction::{CommitBuilderError, TransactionError}; use crate::protocol::ProtocolError; /// A result returned by delta-rs @@ -11,6 +11,9 @@ pub type DeltaResult = Result; #[allow(missing_docs)] #[derive(thiserror::Error, Debug)] pub enum DeltaTableError { + #[error("Kernel error: {0}")] + KernelError(#[from] delta_kernel::error::Error), + #[error("Delta protocol violation: {source}")] Protocol { source: ProtocolError }, @@ -146,6 +149,13 @@ pub enum DeltaTableError { source: std::io::Error, }, + /// Error raised while preparing a commit + #[error("Commit actions are unsound: {source}")] + CommitValidation { + /// The source error + source: CommitBuilderError, + }, + /// Error raised while commititng transaction #[error("Transaction failed: {source}")] Transaction { @@ -210,6 +220,15 @@ pub enum DeltaTableError { #[error("Table has not yet been initialized")] NotInitialized, + + #[error("Change Data not enabled for version: {version}, Start: {start}, End: {end}")] + ChangeDataNotRecorded { version: i64, start: i64, end: i64 }, + + #[error("Reading a table version: {version} that does not have change data enabled")] + ChangeDataNotEnabled { version: i64 }, + + #[error("Invalid version start version {start} is greater than version {end}")] + ChangeDataInvalidVersionRange { start: i64, end: i64 }, } impl From for DeltaTableError { diff --git a/crates/core/src/kernel/arrow/json.rs b/crates/core/src/kernel/arrow/json.rs index dcb56d308a..ed31a7b64e 100644 --- a/crates/core/src/kernel/arrow/json.rs +++ b/crates/core/src/kernel/arrow/json.rs @@ -62,9 +62,10 @@ pub(crate) fn parse_json( for it in 0..json_strings.len() { if json_strings.is_null(it) { if value_count > 0 { - let slice = json_strings.slice(value_start, value_count); - let batch = decode_reader(&mut decoder, get_reader(slice.value_data())) - .collect::, _>>()?; + let slice_data = get_nonnull_slice_data(json_strings, value_start, value_count); + let batch = + decode_reader(&mut decoder, get_reader(&slice_data)) + .collect::, _>>()?; batches.extend(batch); value_count = 0; } @@ -86,15 +87,28 @@ pub(crate) fn parse_json( } if value_count > 0 { - let slice = json_strings.slice(value_start, value_count); - let batch = decode_reader(&mut decoder, get_reader(slice.value_data())) - .collect::, _>>()?; + let slice_data = get_nonnull_slice_data(json_strings, value_start, value_count); + let batch = + decode_reader(&mut decoder, get_reader(&slice_data)).collect::, _>>()?; batches.extend(batch); } Ok(concat_batches(&output_schema, &batches)?) } +/// Get the data of a slice of non-null JSON strings. +fn get_nonnull_slice_data( + json_strings: &StringArray, + value_start: usize, + value_count: usize, +) -> Vec { + let slice = json_strings.slice(value_start, value_count); + slice.iter().fold(Vec::new(), |mut acc, s| { + acc.extend_from_slice(s.unwrap().as_bytes()); + acc + }) +} + /// Decode a stream of bytes into a stream of record batches. 
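// A minimal sketch of the arrow-rs behaviour that motivates the helper above
// (array contents assumed): `StringArray::slice(..).value_data()` returns the
// array's entire values buffer and ignores the slice offsets, e.g.
//
//     let arr = StringArray::from(vec![Some("a"), Some("bb"), Some("ccc")]);
//     let sliced = arr.slice(1, 1);
//     // sliced.value_data() still spans "abbccc"; iterating `sliced` yields only "bb"
//
// so the slice is iterated and only the referenced string bytes are concatenated
// before being handed to the JSON decoder.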
pub(crate) fn decode_stream> + Unpin>( mut decoder: Decoder, @@ -148,3 +162,42 @@ pub(crate) fn decode_reader<'a, R: BufRead + 'a>( }; std::iter::from_fn(move || next().map_err(DeltaTableError::from).transpose()) } + +#[cfg(test)] +mod tests { + use crate::kernel::arrow::json::parse_json; + use crate::DeltaTableConfig; + use arrow_array::{Int32Array, RecordBatch, StringArray}; + use arrow_schema::{DataType, Field, Schema}; + use std::sync::Arc; + + #[test] + fn json_to_struct() { + let json_strings = StringArray::from(vec![ + Some(r#"{"a": 1, "b": "foo"}"#), + Some(r#"{"a": 2, "b": "bar"}"#), + None, + Some(r#"{"a": 3, "b": "baz"}"#), + ]); + let struct_schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int32, true), + Field::new("b", DataType::Utf8, true), + ])); + let config = DeltaTableConfig::default(); + let result = parse_json(&json_strings, struct_schema.clone(), &config).unwrap(); + let expected = RecordBatch::try_new( + struct_schema, + vec![ + Arc::new(Int32Array::from(vec![Some(1), Some(2), None, Some(3)])), + Arc::new(StringArray::from(vec![ + Some("foo"), + Some("bar"), + None, + Some("baz"), + ])), + ], + ) + .unwrap(); + assert_eq!(result, expected); + } +} diff --git a/crates/core/src/kernel/arrow/mod.rs b/crates/core/src/kernel/arrow/mod.rs index ab121ee8a6..0fb41379dd 100644 --- a/crates/core/src/kernel/arrow/mod.rs +++ b/crates/core/src/kernel/arrow/mod.rs @@ -3,268 +3,19 @@ use std::sync::Arc; use arrow_schema::{ - ArrowError, DataType as ArrowDataType, Field as ArrowField, FieldRef as ArrowFieldRef, - Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, TimeUnit, + DataType as ArrowDataType, Field as ArrowField, FieldRef as ArrowFieldRef, + Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, }; use lazy_static::lazy_static; -use super::{ActionType, ArrayType, DataType, MapType, PrimitiveType, StructField, StructType}; - pub(crate) mod extract; pub(crate) mod json; -const MAP_ROOT_DEFAULT: &str = "entries"; -const MAP_KEY_DEFAULT: &str = "keys"; -const MAP_VALUE_DEFAULT: &str = "values"; +const MAP_ROOT_DEFAULT: &str = "key_value"; +const MAP_KEY_DEFAULT: &str = "key"; +const MAP_VALUE_DEFAULT: &str = "value"; const LIST_ROOT_DEFAULT: &str = "item"; -impl TryFrom for ArrowField { - type Error = ArrowError; - - fn try_from(value: ActionType) -> Result { - value.schema_field().try_into() - } -} - -impl TryFrom<&StructType> for ArrowSchema { - type Error = ArrowError; - - fn try_from(s: &StructType) -> Result { - let fields = s - .fields() - .iter() - .map(TryInto::try_into) - .collect::, ArrowError>>()?; - - Ok(ArrowSchema::new(fields)) - } -} - -impl TryFrom<&StructField> for ArrowField { - type Error = ArrowError; - - fn try_from(f: &StructField) -> Result { - let metadata = f - .metadata() - .iter() - .map(|(key, val)| Ok((key.clone(), serde_json::to_string(val)?))) - .collect::>() - .map_err(|err| ArrowError::JsonError(err.to_string()))?; - - let field = ArrowField::new( - f.name(), - ArrowDataType::try_from(f.data_type())?, - f.is_nullable(), - ) - .with_metadata(metadata); - - Ok(field) - } -} - -impl TryFrom<&ArrayType> for ArrowField { - type Error = ArrowError; - fn try_from(a: &ArrayType) -> Result { - Ok(ArrowField::new( - LIST_ROOT_DEFAULT, - ArrowDataType::try_from(a.element_type())?, - // TODO check how to handle nullability - a.contains_null(), - )) - } -} - -impl TryFrom<&MapType> for ArrowField { - type Error = ArrowError; - - fn try_from(a: &MapType) -> Result { - Ok(ArrowField::new( - MAP_ROOT_DEFAULT, - ArrowDataType::Struct( - vec![ - 
ArrowField::new( - MAP_KEY_DEFAULT, - ArrowDataType::try_from(a.key_type())?, - false, - ), - ArrowField::new( - MAP_VALUE_DEFAULT, - ArrowDataType::try_from(a.value_type())?, - a.value_contains_null(), - ), - ] - .into(), - ), - // always non-null - false, - )) - } -} - -impl TryFrom<&DataType> for ArrowDataType { - type Error = ArrowError; - - fn try_from(t: &DataType) -> Result { - match t { - DataType::Primitive(p) => { - match p { - PrimitiveType::String => Ok(ArrowDataType::Utf8), - PrimitiveType::Long => Ok(ArrowDataType::Int64), // undocumented type - PrimitiveType::Integer => Ok(ArrowDataType::Int32), - PrimitiveType::Short => Ok(ArrowDataType::Int16), - PrimitiveType::Byte => Ok(ArrowDataType::Int8), - PrimitiveType::Float => Ok(ArrowDataType::Float32), - PrimitiveType::Double => Ok(ArrowDataType::Float64), - PrimitiveType::Boolean => Ok(ArrowDataType::Boolean), - PrimitiveType::Binary => Ok(ArrowDataType::Binary), - PrimitiveType::Decimal(precision, scale) => { - if precision <= &38 { - Ok(ArrowDataType::Decimal128(*precision, *scale)) - } else if precision <= &76 { - Ok(ArrowDataType::Decimal256(*precision, *scale)) - } else { - Err(ArrowError::SchemaError(format!( - "Precision too large to be represented in Arrow: {}", - precision - ))) - } - } - PrimitiveType::Date => { - // A calendar date, represented as a year-month-day triple without a - // timezone. Stored as 4 bytes integer representing days since 1970-01-01 - Ok(ArrowDataType::Date32) - } - PrimitiveType::Timestamp => { - // Issue: https://github.com/delta-io/delta/issues/643 - Ok(ArrowDataType::Timestamp(TimeUnit::Microsecond, None)) - } - } - } - DataType::Struct(s) => Ok(ArrowDataType::Struct( - s.fields() - .iter() - .map(TryInto::try_into) - .collect::, ArrowError>>()? - .into(), - )), - DataType::Array(a) => Ok(ArrowDataType::List(Arc::new(a.as_ref().try_into()?))), - DataType::Map(m) => Ok(ArrowDataType::Map(Arc::new(m.as_ref().try_into()?), false)), - } - } -} - -impl TryFrom<&ArrowSchema> for StructType { - type Error = ArrowError; - - fn try_from(arrow_schema: &ArrowSchema) -> Result { - let new_fields: Result, _> = arrow_schema - .fields() - .iter() - .map(|field| field.as_ref().try_into()) - .collect(); - Ok(StructType::new(new_fields?)) - } -} - -impl TryFrom for StructType { - type Error = ArrowError; - - fn try_from(arrow_schema: ArrowSchemaRef) -> Result { - arrow_schema.as_ref().try_into() - } -} - -impl TryFrom<&ArrowField> for StructField { - type Error = ArrowError; - - fn try_from(arrow_field: &ArrowField) -> Result { - Ok(StructField::new( - arrow_field.name().clone(), - DataType::try_from(arrow_field.data_type())?, - arrow_field.is_nullable(), - ) - .with_metadata(arrow_field.metadata().iter().map(|(k, v)| (k.clone(), v)))) - } -} - -impl TryFrom<&ArrowDataType> for DataType { - type Error = ArrowError; - - fn try_from(arrow_datatype: &ArrowDataType) -> Result { - match arrow_datatype { - ArrowDataType::Utf8 => Ok(DataType::Primitive(PrimitiveType::String)), - ArrowDataType::LargeUtf8 => Ok(DataType::Primitive(PrimitiveType::String)), - ArrowDataType::Int64 => Ok(DataType::Primitive(PrimitiveType::Long)), // undocumented type - ArrowDataType::Int32 => Ok(DataType::Primitive(PrimitiveType::Integer)), - ArrowDataType::Int16 => Ok(DataType::Primitive(PrimitiveType::Short)), - ArrowDataType::Int8 => Ok(DataType::Primitive(PrimitiveType::Byte)), - ArrowDataType::UInt64 => Ok(DataType::Primitive(PrimitiveType::Long)), // undocumented type - ArrowDataType::UInt32 => 
Ok(DataType::Primitive(PrimitiveType::Integer)), - ArrowDataType::UInt16 => Ok(DataType::Primitive(PrimitiveType::Short)), - ArrowDataType::UInt8 => Ok(DataType::Primitive(PrimitiveType::Byte)), - ArrowDataType::Float32 => Ok(DataType::Primitive(PrimitiveType::Float)), - ArrowDataType::Float64 => Ok(DataType::Primitive(PrimitiveType::Double)), - ArrowDataType::Boolean => Ok(DataType::Primitive(PrimitiveType::Boolean)), - ArrowDataType::Binary => Ok(DataType::Primitive(PrimitiveType::Binary)), - ArrowDataType::FixedSizeBinary(_) => Ok(DataType::Primitive(PrimitiveType::Binary)), - ArrowDataType::LargeBinary => Ok(DataType::Primitive(PrimitiveType::Binary)), - ArrowDataType::Decimal128(p, s) => { - Ok(DataType::Primitive(PrimitiveType::Decimal(*p, *s))) - } - ArrowDataType::Decimal256(p, s) => { - Ok(DataType::Primitive(PrimitiveType::Decimal(*p, *s))) - } - ArrowDataType::Date32 => Ok(DataType::Primitive(PrimitiveType::Date)), - ArrowDataType::Date64 => Ok(DataType::Primitive(PrimitiveType::Date)), - ArrowDataType::Timestamp(TimeUnit::Microsecond, None) => { - Ok(DataType::Primitive(PrimitiveType::Timestamp)) - } - ArrowDataType::Timestamp(TimeUnit::Microsecond, Some(tz)) - if tz.eq_ignore_ascii_case("utc") => - { - Ok(DataType::Primitive(PrimitiveType::Timestamp)) - } - ArrowDataType::Struct(fields) => { - let converted_fields: Result, _> = fields - .iter() - .map(|field| field.as_ref().try_into()) - .collect(); - Ok(DataType::Struct(Box::new(StructType::new( - converted_fields?, - )))) - } - ArrowDataType::List(field) => Ok(DataType::Array(Box::new(ArrayType::new( - (*field).data_type().try_into()?, - (*field).is_nullable(), - )))), - ArrowDataType::LargeList(field) => Ok(DataType::Array(Box::new(ArrayType::new( - (*field).data_type().try_into()?, - (*field).is_nullable(), - )))), - ArrowDataType::FixedSizeList(field, _) => Ok(DataType::Array(Box::new( - ArrayType::new((*field).data_type().try_into()?, (*field).is_nullable()), - ))), - ArrowDataType::Map(field, _) => { - if let ArrowDataType::Struct(struct_fields) = field.data_type() { - let key_type = struct_fields[0].data_type().try_into()?; - let value_type = struct_fields[1].data_type().try_into()?; - let value_type_nullable = struct_fields[1].is_nullable(); - Ok(DataType::Map(Box::new(MapType::new( - key_type, - value_type, - value_type_nullable, - )))) - } else { - panic!("DataType::Map should contain a struct field child"); - } - } - s => Err(ArrowError::SchemaError(format!( - "Invalid data type for Delta Lake: {s}" - ))), - } - } -} - macro_rules! 
arrow_map { ($fieldname: ident, null) => { ArrowField::new( @@ -448,7 +199,9 @@ pub(crate) fn delta_log_schema_for_table( ], protocol[ minReaderVersion:Int32, - minWriterVersion:Int32 + minWriterVersion:Int32, + writerFeatures[element]{Utf8}, + readerFeatures[element]{Utf8} ], txn[ appId:Utf8, @@ -497,13 +250,15 @@ pub(crate) fn delta_log_schema_for_table( .iter() .for_each(|f| max_min_schema_for_fields(&mut max_min_vec, f)); - stats_parsed_fields.extend(["minValues", "maxValues"].into_iter().map(|name| { - ArrowField::new( - name, - ArrowDataType::Struct(max_min_vec.clone().into()), - true, - ) - })); + if max_min_vec.len() > 0 { + stats_parsed_fields.extend(["minValues", "maxValues"].into_iter().map(|name| { + ArrowField::new( + name, + ArrowDataType::Struct(max_min_vec.clone().into()), + true, + ) + })); + } let mut null_count_vec = Vec::new(); non_partition_fields @@ -575,8 +330,7 @@ fn max_min_schema_for_fields(dest: &mut Vec, f: &ArrowField) { // don't compute min or max for list, map or binary types ArrowDataType::List(_) | ArrowDataType::Map(_, _) | ArrowDataType::Binary => { /* noop */ } _ => { - let f = f.clone(); - dest.push(f); + dest.push(ArrowField::new(f.name(), f.data_type().clone(), true)); } } } @@ -605,15 +359,15 @@ fn null_count_schema_for_fields(dest: &mut Vec, f: &ArrowField) { #[cfg(test)] mod tests { + use std::collections::HashMap; + use std::sync::Arc; + use arrow::array::ArrayData; - use arrow_array::Array; - use arrow_array::{make_array, ArrayRef, MapArray, StringArray, StructArray}; + use arrow_array::{Array, BinaryArray, MapArray, RecordBatch, StringArray, StructArray}; use arrow_buffer::{Buffer, ToByteSlice}; - use arrow_schema::Field; + use delta_kernel::schema::{DataType, MapType, PrimitiveType, StructField, StructType}; use super::*; - use std::collections::HashMap; - use std::sync::Arc; #[test] fn delta_log_schema_for_table_test() { @@ -756,73 +510,6 @@ mod tests { } } - #[test] - fn test_arrow_from_delta_decimal_type() { - let precision = 20; - let scale = 2; - let decimal_field = DataType::Primitive(PrimitiveType::Decimal(precision, scale)); - assert_eq!( - >::try_from(&decimal_field).unwrap(), - ArrowDataType::Decimal128(precision, scale) - ); - } - - #[test] - fn test_arrow_from_delta_timestamp_type() { - let timestamp_field = DataType::Primitive(PrimitiveType::Timestamp); - assert_eq!( - >::try_from(×tamp_field).unwrap(), - ArrowDataType::Timestamp(TimeUnit::Microsecond, None) - ); - } - - #[test] - fn test_delta_from_arrow_timestamp_type() { - let timestamp_field = ArrowDataType::Timestamp(TimeUnit::Microsecond, None); - assert_eq!( - >::try_from(×tamp_field).unwrap(), - DataType::Primitive(PrimitiveType::Timestamp) - ); - } - - #[test] - fn test_delta_from_arrow_timestamp_type_with_tz() { - let timestamp_field = - ArrowDataType::Timestamp(TimeUnit::Microsecond, Some("UTC".to_string().into())); - assert_eq!( - >::try_from(×tamp_field).unwrap(), - DataType::Primitive(PrimitiveType::Timestamp) - ); - } - - #[test] - fn test_delta_from_arrow_map_type() { - let arrow_map = ArrowDataType::Map( - Arc::new(ArrowField::new( - "entries", - ArrowDataType::Struct( - vec![ - ArrowField::new("key", ArrowDataType::Int8, false), - ArrowField::new("value", ArrowDataType::Binary, true), - ] - .into(), - ), - false, - )), - false, - ); - let converted_map: DataType = (&arrow_map).try_into().unwrap(); - - assert_eq!( - converted_map, - DataType::Map(Box::new(MapType::new( - DataType::Primitive(PrimitiveType::Byte), - DataType::Primitive(PrimitiveType::Binary), - true, 
- ))) - ); - } - #[test] fn test_record_batch_from_map_type() { let keys = vec!["0", "1", "5", "6", "7"]; @@ -836,52 +523,36 @@ mod tests { let entry_offsets = vec![0u32, 1, 1, 4, 5, 5]; let num_rows = keys.len(); - // Copied the function `new_from_string` with the patched code from https://github.com/apache/arrow-rs/pull/4808 - // This should be reverted back [`MapArray::new_from_strings`] once arrow is upgraded in this project. - fn new_from_strings<'a>( - keys: impl Iterator, - values: &dyn Array, - entry_offsets: &[u32], - ) -> Result { - let entry_offsets_buffer = Buffer::from(entry_offsets.to_byte_slice()); - let keys_data = StringArray::from_iter_values(keys); - - let keys_field = Arc::new(Field::new("keys", ArrowDataType::Utf8, false)); - let values_field = Arc::new(Field::new( - "values", - values.data_type().clone(), - values.null_count() > 0, - )); - - let entry_struct = StructArray::from(vec![ - (keys_field, Arc::new(keys_data) as ArrayRef), - (values_field, make_array(values.to_data())), - ]); - - let map_data_type = ArrowDataType::Map( - Arc::new(Field::new( - "entries", - entry_struct.data_type().clone(), - false, - )), - false, - ); - - let map_data = ArrayData::builder(map_data_type) - .len(entry_offsets.len() - 1) - .add_buffer(entry_offsets_buffer) - .add_child_data(entry_struct.into_data()) - .build()?; + let key_field = Arc::new(ArrowField::new(MAP_KEY_DEFAULT, ArrowDataType::Utf8, false)); + let value_field = Arc::new(ArrowField::new( + MAP_VALUE_DEFAULT, + ArrowDataType::Binary, + false, + )); + let key_value_field = ArrowField::new_struct( + MAP_ROOT_DEFAULT, + vec![key_field.clone(), value_field.clone()], + false, + ); + let key_value_array = StructArray::new( + vec![key_field, value_field].into(), + vec![ + Arc::new(StringArray::from(keys)), + Arc::new(BinaryArray::from(values)), + ], + None, + ); + let entry_offsets_buffer = Buffer::from(entry_offsets.as_slice().to_byte_slice()); - Ok(MapArray::from(map_data)) - } + let map_data_type = ArrowDataType::Map(Arc::new(key_value_field), false); + let map_data = ArrayData::builder(map_data_type) + .len(entry_offsets.len() - 1) + .add_buffer(entry_offsets_buffer) + .add_child_data(key_value_array.into_data()) + .build() + .unwrap(); - let map_array = new_from_strings( - keys.into_iter(), - &arrow::array::BinaryArray::from(values), - entry_offsets.as_slice(), - ) - .expect("Could not create a map array"); + let map_array = MapArray::from(map_data); let schema = >::try_from(&StructType::new(vec![ @@ -897,9 +568,8 @@ mod tests { ])) .expect("Could not get schema"); - let record_batch = - arrow::record_batch::RecordBatch::try_new(Arc::new(schema), vec![Arc::new(map_array)]) - .expect("Failed to create RecordBatch"); + let record_batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(map_array)]) + .expect("Failed to create RecordBatch"); assert_eq!(record_batch.num_columns(), 1); assert_eq!(record_batch.num_rows(), num_rows); diff --git a/crates/core/src/kernel/expressions/eval.rs b/crates/core/src/kernel/expressions/eval.rs deleted file mode 100644 index 3796542ffc..0000000000 --- a/crates/core/src/kernel/expressions/eval.rs +++ /dev/null @@ -1,378 +0,0 @@ -//! Default Expression handler. -//! -//! Expression handling based on arrow-rs compute kernels. 
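For readers less familiar with the MapArray plumbing in the rewritten test: arrow-array also ships MapArray::new_from_strings (the constructor the deleted comment pointed at), which builds the same shape when the default entries/keys/values field names are acceptable; the test assembles ArrayData by hand because it needs the kernel's key_value/key/value names. A rough, self-contained sketch, assuming new_from_strings keeps its documented signature:

use arrow_array::{BinaryArray, MapArray};
use arrow_schema::ArrowError;

// Five map rows; two of them are empty (equal consecutive offsets).
fn build_map_example() -> Result<MapArray, ArrowError> {
    let keys = ["0", "1", "5", "6", "7"];
    let values = BinaryArray::from_iter_values(["a", "bc", "def", "g", "hi"]);
    let entry_offsets = [0u32, 1, 1, 4, 5, 5];
    MapArray::new_from_strings(keys.into_iter(), &values, &entry_offsets)
}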
- -use std::sync::Arc; - -use arrow_arith::boolean::{and, is_null, not, or}; -use arrow_arith::numeric::{add, div, mul, sub}; -use arrow_array::{ - Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Datum, Decimal128Array, Float32Array, - Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, RecordBatch, StringArray, - StructArray, TimestampMicrosecondArray, -}; -use arrow_ord::cmp::{eq, gt, gt_eq, lt, lt_eq, neq}; -use arrow_schema::{ArrowError, Field as ArrowField, Schema as ArrowSchema}; -use arrow_select::nullif::nullif; - -use crate::kernel::arrow::extract::extract_column; -use crate::kernel::error::{DeltaResult, Error}; -use crate::kernel::expressions::{scalars::Scalar, Expression}; -use crate::kernel::expressions::{BinaryOperator, UnaryOperator}; -use crate::kernel::{DataType, PrimitiveType, VariadicOperator}; - -fn downcast_to_bool(arr: &dyn Array) -> DeltaResult<&BooleanArray> { - arr.as_any() - .downcast_ref::() - .ok_or(Error::Generic("expected boolean array".to_string())) -} - -fn wrap_comparison_result(arr: BooleanArray) -> ArrayRef { - Arc::new(arr) as Arc -} - -// TODO leverage scalars / Datum - -impl Scalar { - /// Convert scalar to arrow array. - pub fn to_array(&self, num_rows: usize) -> DeltaResult { - use Scalar::*; - let arr: ArrayRef = match self { - Integer(val) => Arc::new(Int32Array::from_value(*val, num_rows)), - Long(val) => Arc::new(Int64Array::from_value(*val, num_rows)), - Short(val) => Arc::new(Int16Array::from_value(*val, num_rows)), - Byte(val) => Arc::new(Int8Array::from_value(*val, num_rows)), - Float(val) => Arc::new(Float32Array::from_value(*val, num_rows)), - Double(val) => Arc::new(Float64Array::from_value(*val, num_rows)), - String(val) => Arc::new(StringArray::from(vec![val.clone(); num_rows])), - Boolean(val) => Arc::new(BooleanArray::from(vec![*val; num_rows])), - Timestamp(val) => Arc::new(TimestampMicrosecondArray::from_value(*val, num_rows)), - Date(val) => Arc::new(Date32Array::from_value(*val, num_rows)), - Binary(val) => Arc::new(BinaryArray::from(vec![val.as_slice(); num_rows])), - Decimal(val, precision, scale) => Arc::new( - Decimal128Array::from_value(*val, num_rows) - .with_precision_and_scale(*precision, *scale)?, - ), - Null(data_type) => match data_type { - DataType::Primitive(primitive) => match primitive { - PrimitiveType::Byte => Arc::new(Int8Array::new_null(num_rows)), - PrimitiveType::Short => Arc::new(Int16Array::new_null(num_rows)), - PrimitiveType::Integer => Arc::new(Int32Array::new_null(num_rows)), - PrimitiveType::Long => Arc::new(Int64Array::new_null(num_rows)), - PrimitiveType::Float => Arc::new(Float32Array::new_null(num_rows)), - PrimitiveType::Double => Arc::new(Float64Array::new_null(num_rows)), - PrimitiveType::String => Arc::new(StringArray::new_null(num_rows)), - PrimitiveType::Boolean => Arc::new(BooleanArray::new_null(num_rows)), - PrimitiveType::Timestamp => { - Arc::new(TimestampMicrosecondArray::new_null(num_rows)) - } - PrimitiveType::Date => Arc::new(Date32Array::new_null(num_rows)), - PrimitiveType::Binary => Arc::new(BinaryArray::new_null(num_rows)), - PrimitiveType::Decimal(precision, scale) => Arc::new( - Decimal128Array::new_null(num_rows) - .with_precision_and_scale(*precision, *scale) - .unwrap(), - ), - }, - DataType::Array(_) => unimplemented!(), - DataType::Map { .. } => unimplemented!(), - DataType::Struct { .. 
} => unimplemented!(), - }, - Struct(values, fields) => { - let mut columns = Vec::with_capacity(values.len()); - for val in values { - columns.push(val.to_array(num_rows)?); - } - Arc::new(StructArray::try_new( - fields - .iter() - .map(TryInto::::try_into) - .collect::, _>>()? - .into(), - columns, - None, - )?) - } - }; - Ok(arr) - } -} - -/// evaluate expression -pub(crate) fn evaluate_expression( - expression: &Expression, - batch: &RecordBatch, - result_type: Option<&DataType>, -) -> DeltaResult { - use BinaryOperator::*; - use Expression::*; - - match (expression, result_type) { - (Literal(scalar), _) => Ok(scalar.to_array(batch.num_rows())?), - (Column(name), _) => { - if name.contains('.') { - let mut path = name.split('.'); - // Safety: we know that the first path step exists, because we checked for '.' - let arr = extract_column(batch, path.next().unwrap(), &mut path).cloned()?; - // NOTE: need to assign first so that rust can figure out lifetimes - Ok(arr) - } else { - batch - .column_by_name(name) - .ok_or(Error::MissingColumn(name.clone())) - .cloned() - } - } - (Struct(fields), Some(DataType::Struct(schema))) => { - let output_schema: ArrowSchema = schema.as_ref().try_into()?; - let mut columns = Vec::with_capacity(fields.len()); - for (expr, field) in fields.iter().zip(schema.fields()) { - columns.push(evaluate_expression(expr, batch, Some(field.data_type()))?); - } - Ok(Arc::new(StructArray::try_new( - output_schema.fields().clone(), - columns, - None, - )?)) - } - (Struct(_), _) => Err(Error::Generic( - "Data type is required to evaluate struct expressions".to_string(), - )), - (UnaryOperation { op, expr }, _) => { - let arr = evaluate_expression(expr.as_ref(), batch, None)?; - Ok(match op { - UnaryOperator::Not => Arc::new(not(downcast_to_bool(&arr)?)?), - UnaryOperator::IsNull => Arc::new(is_null(&arr)?), - }) - } - (BinaryOperation { op, left, right }, _) => { - let left_arr = evaluate_expression(left.as_ref(), batch, None)?; - let right_arr = evaluate_expression(right.as_ref(), batch, None)?; - - type Operation = fn(&dyn Datum, &dyn Datum) -> Result, ArrowError>; - let eval: Operation = match op { - Plus => add, - Minus => sub, - Multiply => mul, - Divide => div, - LessThan => |l, r| lt(l, r).map(wrap_comparison_result), - LessThanOrEqual => |l, r| lt_eq(l, r).map(wrap_comparison_result), - GreaterThan => |l, r| gt(l, r).map(wrap_comparison_result), - GreaterThanOrEqual => |l, r| gt_eq(l, r).map(wrap_comparison_result), - Equal => |l, r| eq(l, r).map(wrap_comparison_result), - NotEqual => |l, r| neq(l, r).map(wrap_comparison_result), - }; - - eval(&left_arr, &right_arr).map_err(|err| Error::GenericError { - source: Box::new(err), - }) - } - (VariadicOperation { op, exprs }, _) => { - let reducer = match op { - VariadicOperator::And => and, - VariadicOperator::Or => or, - }; - exprs - .iter() - .map(|expr| evaluate_expression(expr, batch, Some(&DataType::BOOLEAN))) - .reduce(|l, r| { - Ok(reducer(downcast_to_bool(&l?)?, downcast_to_bool(&r?)?) - .map(wrap_comparison_result)?) - }) - .transpose()? - .ok_or(Error::Generic("empty expression".to_string())) - } - (NullIf { expr, if_expr }, _) => { - let expr_arr = evaluate_expression(expr.as_ref(), batch, None)?; - let if_expr_arr = - evaluate_expression(if_expr.as_ref(), batch, Some(&DataType::BOOLEAN))?; - let if_expr_arr = downcast_to_bool(&if_expr_arr)?; - Ok(nullif(&expr_arr, if_expr_arr)?) 
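For orientation while reading the removed evaluator: Scalar::to_array materialized a literal as a constant column of the requested length, which is what evaluate_expression relied on in its Literal arm. A small usage sketch against the deleted API (Scalar and DeltaResult come from the module being removed here):

use arrow_array::{Array, Int32Array};

fn literal_column_example() -> DeltaResult<()> {
    // A literal 7 expanded to a three-row Int32 column.
    let arr = Scalar::Integer(7).to_array(3)?;
    let ints = arr
        .as_any()
        .downcast_ref::<Int32Array>()
        .expect("expected an Int32Array");
    assert_eq!(ints.len(), 3);
    assert_eq!(ints.value(0), 7);
    Ok(())
}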
- } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use arrow_array::Int32Array; - use arrow_schema::{DataType, Field, Fields, Schema}; - use std::ops::{Add, Div, Mul, Sub}; - - #[test] - fn test_extract_column() { - let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); - let values = Int32Array::from(vec![1, 2, 3]); - let batch = - RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(values.clone())]).unwrap(); - let column = Expression::Column("a".to_string()); - - let results = evaluate_expression(&column, &batch, None).unwrap(); - assert_eq!(results.as_ref(), &values); - - let schema = Schema::new(vec![Field::new( - "b", - DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, false)])), - false, - )]); - - let struct_values: ArrayRef = Arc::new(values.clone()); - let struct_array = StructArray::from(vec![( - Arc::new(Field::new("a", DataType::Int32, false)), - struct_values, - )]); - let batch = RecordBatch::try_new( - Arc::new(schema.clone()), - vec![Arc::new(struct_array.clone())], - ) - .unwrap(); - let column = Expression::Column("b.a".to_string()); - let results = evaluate_expression(&column, &batch, None).unwrap(); - assert_eq!(results.as_ref(), &values); - } - - #[test] - fn test_binary_op_scalar() { - let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); - let values = Int32Array::from(vec![1, 2, 3]); - let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(values)]).unwrap(); - let column = Expression::Column("a".to_string()); - - let expression = Box::new(column.clone().add(Expression::Literal(Scalar::Integer(1)))); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(Int32Array::from(vec![2, 3, 4])); - assert_eq!(results.as_ref(), expected.as_ref()); - - let expression = Box::new(column.clone().sub(Expression::Literal(Scalar::Integer(1)))); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(Int32Array::from(vec![0, 1, 2])); - assert_eq!(results.as_ref(), expected.as_ref()); - - let expression = Box::new(column.clone().mul(Expression::Literal(Scalar::Integer(2)))); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(Int32Array::from(vec![2, 4, 6])); - assert_eq!(results.as_ref(), expected.as_ref()); - - // TODO handle type casting - let expression = Box::new(column.div(Expression::Literal(Scalar::Integer(1)))); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(Int32Array::from(vec![1, 2, 3])); - assert_eq!(results.as_ref(), expected.as_ref()) - } - - #[test] - fn test_binary_op() { - let schema = Schema::new(vec![ - Field::new("a", DataType::Int32, false), - Field::new("b", DataType::Int32, false), - ]); - let values = Int32Array::from(vec![1, 2, 3]); - let batch = RecordBatch::try_new( - Arc::new(schema.clone()), - vec![Arc::new(values.clone()), Arc::new(values)], - ) - .unwrap(); - let column_a = Expression::Column("a".to_string()); - let column_b = Expression::Column("b".to_string()); - - let expression = Box::new(column_a.clone().add(column_b.clone())); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(Int32Array::from(vec![2, 4, 6])); - assert_eq!(results.as_ref(), expected.as_ref()); - - let expression = Box::new(column_a.clone().sub(column_b.clone())); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = 
Arc::new(Int32Array::from(vec![0, 0, 0])); - assert_eq!(results.as_ref(), expected.as_ref()); - - let expression = Box::new(column_a.clone().mul(column_b)); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(Int32Array::from(vec![1, 4, 9])); - assert_eq!(results.as_ref(), expected.as_ref()); - } - - #[test] - fn test_binary_cmp() { - let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); - let values = Int32Array::from(vec![1, 2, 3]); - let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(values)]).unwrap(); - let column = Expression::Column("a".to_string()); - let lit = Expression::Literal(Scalar::Integer(2)); - - let expression = Box::new(column.clone().lt(lit.clone())); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(BooleanArray::from(vec![true, false, false])); - assert_eq!(results.as_ref(), expected.as_ref()); - - let expression = Box::new(column.clone().lt_eq(lit.clone())); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(BooleanArray::from(vec![true, true, false])); - assert_eq!(results.as_ref(), expected.as_ref()); - - let expression = Box::new(column.clone().gt(lit.clone())); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(BooleanArray::from(vec![false, false, true])); - assert_eq!(results.as_ref(), expected.as_ref()); - - let expression = Box::new(column.clone().gt_eq(lit.clone())); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(BooleanArray::from(vec![false, true, true])); - assert_eq!(results.as_ref(), expected.as_ref()); - - let expression = Box::new(column.clone().eq(lit.clone())); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(BooleanArray::from(vec![false, true, false])); - assert_eq!(results.as_ref(), expected.as_ref()); - - let expression = Box::new(column.clone().ne(lit.clone())); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(BooleanArray::from(vec![true, false, true])); - assert_eq!(results.as_ref(), expected.as_ref()); - } - - #[test] - fn test_logical() { - let schema = Schema::new(vec![ - Field::new("a", DataType::Boolean, false), - Field::new("b", DataType::Boolean, false), - ]); - let batch = RecordBatch::try_new( - Arc::new(schema.clone()), - vec![ - Arc::new(BooleanArray::from(vec![true, false])), - Arc::new(BooleanArray::from(vec![false, true])), - ], - ) - .unwrap(); - let column_a = Expression::Column("a".to_string()); - let column_b = Expression::Column("b".to_string()); - - let expression = Box::new(column_a.clone().and(column_b.clone())); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(BooleanArray::from(vec![false, false])); - assert_eq!(results.as_ref(), expected.as_ref()); - - let expression = Box::new( - column_a - .clone() - .and(Expression::literal(Scalar::Boolean(true))), - ); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(BooleanArray::from(vec![true, false])); - assert_eq!(results.as_ref(), expected.as_ref()); - - let expression = Box::new(column_a.clone().or(column_b)); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(BooleanArray::from(vec![true, true])); - assert_eq!(results.as_ref(), expected.as_ref()); - 
- let expression = Box::new( - column_a - .clone() - .or(Expression::literal(Scalar::Boolean(false))), - ); - let results = evaluate_expression(&expression, &batch, None).unwrap(); - let expected = Arc::new(BooleanArray::from(vec![true, false])); - assert_eq!(results.as_ref(), expected.as_ref()); - } -} diff --git a/crates/core/src/kernel/expressions/mod.rs b/crates/core/src/kernel/expressions/mod.rs deleted file mode 100644 index b7912681ec..0000000000 --- a/crates/core/src/kernel/expressions/mod.rs +++ /dev/null @@ -1,478 +0,0 @@ -//! expressions. - -use std::collections::HashSet; -use std::fmt::{Display, Formatter}; -use std::sync::Arc; - -use arrow_array::{ArrayRef, RecordBatch}; -use arrow_schema::Schema as ArrowSchema; -use itertools::Itertools; - -use self::eval::evaluate_expression; -use super::{DataType, DeltaResult, SchemaRef}; - -pub use self::scalars::*; - -mod eval; -mod scalars; - -/// Interface for implementing an Expression evaluator. -/// -/// It contains one Expression which can be evaluated on multiple ColumnarBatches. -/// Connectors can implement this interface to optimize the evaluation using the -/// connector specific capabilities. -pub trait ExpressionEvaluator { - /// Evaluate the expression on given ColumnarBatch data. - /// - /// Contains one value for each row of the input. - /// The data type of the output is same as the type output of the expression this evaluator is using. - fn evaluate(&self, batch: &RecordBatch) -> DeltaResult; -} - -/// Provides expression evaluation capability to Delta Kernel. -/// -/// Delta Kernel can use this client to evaluate predicate on partition filters, -/// fill up partition column values and any computation on data using Expressions. -pub trait ExpressionHandler { - /// Create an [`ExpressionEvaluator`] that can evaluate the given [`Expression`] - /// on columnar batches with the given [`Schema`] to produce data of [`DataType`]. - /// - /// # Parameters - /// - /// - `schema`: Schema of the input data. - /// - `expression`: Expression to evaluate. - /// - `output_type`: Expected result data type. 
- /// - /// [`Schema`]: crate::schema::StructType - /// [`DataType`]: crate::schema::DataType - fn get_evaluator( - &self, - schema: SchemaRef, - expression: Expression, - output_type: DataType, - ) -> Arc; -} - -/// Default implementation of [`ExpressionHandler`] that uses [`evaluate_expression`] -#[derive(Debug)] -pub struct ArrowExpressionHandler {} - -impl ExpressionHandler for ArrowExpressionHandler { - fn get_evaluator( - &self, - schema: SchemaRef, - expression: Expression, - output_type: DataType, - ) -> Arc { - Arc::new(DefaultExpressionEvaluator { - input_schema: schema, - expression: Box::new(expression), - output_type, - }) - } -} - -/// Default implementation of [`ExpressionEvaluator`] that uses [`evaluate_expression`] -#[derive(Debug)] -pub struct DefaultExpressionEvaluator { - input_schema: SchemaRef, - expression: Box, - output_type: DataType, -} - -impl ExpressionEvaluator for DefaultExpressionEvaluator { - fn evaluate(&self, batch: &RecordBatch) -> DeltaResult { - let _input_schema: ArrowSchema = self.input_schema.as_ref().try_into()?; - // TODO: make sure we have matching schemas for validation - // if batch.schema().as_ref() != &input_schema { - // return Err(Error::Generic(format!( - // "input schema does not match batch schema: {:?} != {:?}", - // input_schema, - // batch.schema() - // ))); - // }; - evaluate_expression(&self.expression, batch, Some(&self.output_type)) - } -} - -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -/// A binary operator. -pub enum BinaryOperator { - /// Arithmetic Plus - Plus, - /// Arithmetic Minus - Minus, - /// Arithmetic Multiply - Multiply, - /// Arithmetic Divide - Divide, - /// Comparison Less Than - LessThan, - /// Comparison Less Than Or Equal - LessThanOrEqual, - /// Comparison Greater Than - GreaterThan, - /// Comparison Greater Than Or Equal - GreaterThanOrEqual, - /// Comparison Equal - Equal, - /// Comparison Not Equal - NotEqual, -} - -/// Variadic operators -#[derive(Debug, Clone, PartialEq)] -pub enum VariadicOperator { - /// AND - And, - /// OR - Or, -} - -impl Display for BinaryOperator { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match self { - // Self::And => write!(f, "AND"), - // Self::Or => write!(f, "OR"), - Self::Plus => write!(f, "+"), - Self::Minus => write!(f, "-"), - Self::Multiply => write!(f, "*"), - Self::Divide => write!(f, "/"), - Self::LessThan => write!(f, "<"), - Self::LessThanOrEqual => write!(f, "<="), - Self::GreaterThan => write!(f, ">"), - Self::GreaterThanOrEqual => write!(f, ">="), - Self::Equal => write!(f, "="), - Self::NotEqual => write!(f, "!="), - } - } -} - -#[derive(Debug, Clone, PartialEq)] -/// A unary operator. -pub enum UnaryOperator { - /// Unary Not - Not, - /// Unary Is Null - IsNull, -} - -/// A SQL expression. -/// -/// These expressions do not track or validate data types, other than the type -/// of literals. It is up to the expression evaluator to validate the -/// expression against a schema and add appropriate casts as required. -#[derive(Debug, Clone, PartialEq)] -pub enum Expression { - /// A literal value. - Literal(Scalar), - /// A column reference by name. - Column(String), - /// - Struct(Vec), - /// A binary operation. - BinaryOperation { - /// The operator. - op: BinaryOperator, - /// The left-hand side of the operation. - left: Box, - /// The right-hand side of the operation. - right: Box, - }, - /// A unary operation. - UnaryOperation { - /// The operator. - op: UnaryOperator, - /// The expression. - expr: Box, - }, - /// A variadic operation. 
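To make the handler/evaluator split above concrete, the removed pair was driven roughly as below. This is a sketch against the deleted API, not its delta-kernel replacement; the projected expression is illustrative, and the referenced types (SchemaRef, DataType, PrimitiveType, Expression, DeltaResult, ArrowExpressionHandler) come from the module being deleted in this hunk:

use arrow_array::{ArrayRef, RecordBatch};

fn project_a_plus_one(schema: SchemaRef, batch: &RecordBatch) -> DeltaResult<ArrayRef> {
    let handler = ArrowExpressionHandler {};
    // Evaluate `a + 1` against every row of the batch.
    let expr = Expression::column("a") + Expression::literal(1);
    let evaluator = handler.get_evaluator(
        schema,
        expr,
        DataType::Primitive(PrimitiveType::Integer),
    );
    evaluator.evaluate(batch)
}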
- VariadicOperation { - /// The operator. - op: VariadicOperator, - /// The expressions. - exprs: Vec, - }, - /// A NULLIF expression. - NullIf { - /// The expression to evaluate. - expr: Box, - /// The expression to compare against. - if_expr: Box, - }, - // TODO: support more expressions, such as IS IN, LIKE, etc. -} - -impl> From for Expression { - fn from(value: T) -> Self { - Self::literal(value) - } -} - -impl Display for Expression { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match self { - Self::Literal(l) => write!(f, "{}", l), - Self::Column(name) => write!(f, "Column({})", name), - Self::Struct(exprs) => write!( - f, - "Struct({})", - &exprs.iter().map(|e| format!("{e}")).join(", ") - ), - Self::BinaryOperation { op, left, right } => write!(f, "{} {} {}", left, op, right), - Self::UnaryOperation { op, expr } => match op { - UnaryOperator::Not => write!(f, "NOT {}", expr), - UnaryOperator::IsNull => write!(f, "{} IS NULL", expr), - }, - Self::VariadicOperation { op, exprs } => match op { - VariadicOperator::And => { - write!( - f, - "AND({})", - &exprs.iter().map(|e| format!("{e}")).join(", ") - ) - } - VariadicOperator::Or => { - write!( - f, - "OR({})", - &exprs.iter().map(|e| format!("{e}")).join(", ") - ) - } - }, - Self::NullIf { expr, if_expr } => write!(f, "NULLIF({}, {})", expr, if_expr), - } - } -} - -impl Expression { - /// Returns a set of columns referenced by this expression. - pub fn references(&self) -> HashSet<&str> { - let mut set = HashSet::new(); - - for expr in self.walk() { - if let Self::Column(name) = expr { - set.insert(name.as_str()); - } - } - - set - } - - /// Create an new expression for a column reference - pub fn column(name: impl Into) -> Self { - Self::Column(name.into()) - } - - /// Create a new expression for a literal value - pub fn literal(value: impl Into) -> Self { - Self::Literal(value.into()) - } - - /// Create a new expression for a struct - pub fn struct_expr(exprs: impl IntoIterator) -> Self { - Self::Struct(exprs.into_iter().collect()) - } - - /// Create a new expression for a unary operation - pub fn unary(op: UnaryOperator, expr: impl Into) -> Self { - Self::UnaryOperation { - op, - expr: Box::new(expr.into()), - } - } - - /// Create a new expression for a binary operation - pub fn binary( - op: BinaryOperator, - lhs: impl Into, - rhs: impl Into, - ) -> Self { - Self::BinaryOperation { - op, - left: Box::new(lhs.into()), - right: Box::new(rhs.into()), - } - } - - /// Create a new expression for a variadic operation - pub fn variadic(op: VariadicOperator, other: impl IntoIterator) -> Self { - let mut exprs = other.into_iter().collect::>(); - if exprs.is_empty() { - // TODO this might break if we introduce new variadic operators? 
- return Self::literal(matches!(op, VariadicOperator::And)); - } - if exprs.len() == 1 { - return exprs.pop().unwrap(); - } - Self::VariadicOperation { op, exprs } - } - - /// Create a new expression `self == other` - pub fn eq(self, other: Self) -> Self { - Self::binary(BinaryOperator::Equal, self, other) - } - - /// Create a new expression `self != other` - pub fn ne(self, other: Self) -> Self { - Self::binary(BinaryOperator::NotEqual, self, other) - } - - /// Create a new expression `self < other` - pub fn lt(self, other: Self) -> Self { - Self::binary(BinaryOperator::LessThan, self, other) - } - - /// Create a new expression `self > other` - pub fn gt(self, other: Self) -> Self { - Self::binary(BinaryOperator::GreaterThan, self, other) - } - - /// Create a new expression `self >= other` - pub fn gt_eq(self, other: Self) -> Self { - Self::binary(BinaryOperator::GreaterThanOrEqual, self, other) - } - - /// Create a new expression `self <= other` - pub fn lt_eq(self, other: Self) -> Self { - Self::binary(BinaryOperator::LessThanOrEqual, self, other) - } - - /// Create a new expression `self AND other` - pub fn and(self, other: Self) -> Self { - self.and_many([other]) - } - - /// Create a new expression `self AND others` - pub fn and_many(self, other: impl IntoIterator) -> Self { - Self::variadic(VariadicOperator::And, std::iter::once(self).chain(other)) - } - - /// Create a new expression `self AND other` - pub fn or(self, other: Self) -> Self { - self.or_many([other]) - } - - /// Create a new expression `self OR other` - pub fn or_many(self, other: impl IntoIterator) -> Self { - Self::variadic(VariadicOperator::Or, std::iter::once(self).chain(other)) - } - - /// Create a new expression `self IS NULL` - pub fn is_null(self) -> Self { - Self::unary(UnaryOperator::IsNull, self) - } - - /// Create a new expression `NULLIF(self, other)` - pub fn null_if(self, other: Self) -> Self { - Self::NullIf { - expr: Box::new(self), - if_expr: Box::new(other), - } - } - - fn walk(&self) -> impl Iterator + '_ { - let mut stack = vec![self]; - std::iter::from_fn(move || { - let expr = stack.pop()?; - match expr { - Self::Literal(_) => {} - Self::Column { .. } => {} - Self::Struct(exprs) => { - stack.extend(exprs.iter()); - } - Self::BinaryOperation { left, right, .. } => { - stack.push(left); - stack.push(right); - } - Self::UnaryOperation { expr, .. 
} => { - stack.push(expr); - } - Self::VariadicOperation { op, exprs } => match op { - VariadicOperator::And | VariadicOperator::Or => { - stack.extend(exprs.iter()); - } - }, - Self::NullIf { expr, if_expr } => { - stack.push(expr); - stack.push(if_expr); - } - } - Some(expr) - }) - } -} - -impl std::ops::Add for Expression { - type Output = Self; - - fn add(self, rhs: Expression) -> Self::Output { - Self::binary(BinaryOperator::Plus, self, rhs) - } -} - -impl std::ops::Sub for Expression { - type Output = Self; - - fn sub(self, rhs: Expression) -> Self::Output { - Self::binary(BinaryOperator::Minus, self, rhs) - } -} - -impl std::ops::Mul for Expression { - type Output = Self; - - fn mul(self, rhs: Expression) -> Self::Output { - Self::binary(BinaryOperator::Multiply, self, rhs) - } -} - -impl std::ops::Div for Expression { - type Output = Self; - - fn div(self, rhs: Expression) -> Self::Output { - Self::binary(BinaryOperator::Divide, self, rhs) - } -} - -#[cfg(test)] -mod tests { - use super::Expression as Expr; - - #[test] - fn test_expression_format() { - let col_ref = Expr::column("x"); - let cases = [ - (col_ref.clone(), "Column(x)"), - (col_ref.clone().eq(Expr::literal(2)), "Column(x) = 2"), - ( - col_ref - .clone() - .gt_eq(Expr::literal(2)) - .and(col_ref.clone().lt_eq(Expr::literal(10))), - "AND(Column(x) >= 2, Column(x) <= 10)", - ), - ( - col_ref - .clone() - .gt(Expr::literal(2)) - .or(col_ref.clone().lt(Expr::literal(10))), - "OR(Column(x) > 2, Column(x) < 10)", - ), - ( - (col_ref.clone() - Expr::literal(4)).lt(Expr::literal(10)), - "Column(x) - 4 < 10", - ), - ( - (col_ref.clone() + Expr::literal(4)) / Expr::literal(10) * Expr::literal(42), - "Column(x) + 4 / 10 * 42", - ), - (col_ref.eq(Expr::literal("foo")), "Column(x) = 'foo'"), - ]; - - for (expr, expected) in cases { - let result = format!("{}", expr); - assert_eq!(result, expected); - } - } -} diff --git a/crates/core/src/kernel/expressions/scalars.rs b/crates/core/src/kernel/expressions/scalars.rs deleted file mode 100644 index 147c9d7633..0000000000 --- a/crates/core/src/kernel/expressions/scalars.rs +++ /dev/null @@ -1,534 +0,0 @@ -//! Scalar values for use in expressions. - -use std::cmp::Ordering; -use std::fmt::{Display, Formatter}; - -use arrow_array::Array; -use arrow_schema::TimeUnit; -use chrono::{DateTime, NaiveDate, NaiveDateTime, TimeZone, Utc}; -use object_store::path::Path; - -use crate::kernel::{DataType, Error, PrimitiveType, StructField}; -use crate::NULL_PARTITION_VALUE_DATA_PATH; - -/// A single value, which can be null. Used for representing literal values -/// in [Expressions][crate::expressions::Expression]. -#[derive(Debug, Clone, PartialEq)] -pub enum Scalar { - /// 32bit integer - Integer(i32), - /// 64bit integer - Long(i64), - /// 16bit integer - Short(i16), - /// 8bit integer - Byte(i8), - /// 32bit floating point - Float(f32), - /// 64bit floating point - Double(f64), - /// utf-8 encoded string. - String(String), - /// true or false value - Boolean(bool), - /// Microsecond precision timestamp, adjusted to UTC. - Timestamp(i64), - /// Date stored as a signed 32bit int days since UNIX epoch 1970-01-01 - Date(i32), - /// Binary data - Binary(Vec), - /// Decimal value - Decimal(i128, u8, i8), - /// Null value with a given data type. - Null(DataType), - /// Struct value - Struct(Vec, Vec), -} - -impl Scalar { - /// Returns the data type of this scalar. 
- pub fn data_type(&self) -> DataType { - match self { - Self::Integer(_) => DataType::Primitive(PrimitiveType::Integer), - Self::Long(_) => DataType::Primitive(PrimitiveType::Long), - Self::Short(_) => DataType::Primitive(PrimitiveType::Short), - Self::Byte(_) => DataType::Primitive(PrimitiveType::Byte), - Self::Float(_) => DataType::Primitive(PrimitiveType::Float), - Self::Double(_) => DataType::Primitive(PrimitiveType::Double), - Self::String(_) => DataType::Primitive(PrimitiveType::String), - Self::Boolean(_) => DataType::Primitive(PrimitiveType::Boolean), - Self::Timestamp(_) => DataType::Primitive(PrimitiveType::Timestamp), - Self::Date(_) => DataType::Primitive(PrimitiveType::Date), - Self::Binary(_) => DataType::Primitive(PrimitiveType::Binary), - Self::Decimal(_, precision, scale) => DataType::decimal(*precision, *scale), - Self::Null(data_type) => data_type.clone(), - Self::Struct(_, fields) => DataType::struct_type(fields.clone()), - } - } - - /// Returns true if this scalar is null. - pub fn is_null(&self) -> bool { - matches!(self, Self::Null(_)) - } - - /// Serializes this scalar as a string. - pub fn serialize(&self) -> String { - match self { - Self::String(s) => s.to_owned(), - Self::Byte(b) => b.to_string(), - Self::Short(s) => s.to_string(), - Self::Integer(i) => i.to_string(), - Self::Long(l) => l.to_string(), - Self::Float(f) => f.to_string(), - Self::Double(d) => d.to_string(), - Self::Boolean(b) => { - if *b { - "true".to_string() - } else { - "false".to_string() - } - } - Self::Timestamp(ts) => { - let ts = Utc.timestamp_micros(*ts).single().unwrap(); - ts.format("%Y-%m-%d %H:%M:%S%.6f").to_string() - } - Self::Date(days) => { - let date = Utc.from_utc_datetime( - &NaiveDateTime::from_timestamp_opt(*days as i64 * 24 * 3600, 0).unwrap(), - ); - date.format("%Y-%m-%d").to_string() - } - Self::Decimal(value, _, scale) => match scale.cmp(&0) { - Ordering::Equal => value.to_string(), - Ordering::Greater => { - let scalar_multiple = 10_i128.pow(*scale as u32); - let mut s = String::new(); - s.push_str((value / scalar_multiple).to_string().as_str()); - s.push('.'); - s.push_str(&format!( - "{:0>scale$}", - value % scalar_multiple, - scale = *scale as usize - )); - s - } - Ordering::Less => { - let mut s = value.to_string(); - for _ in 0..(scale.abs()) { - s.push('0'); - } - s - } - }, - Self::Binary(val) => create_escaped_binary_string(val.as_slice()), - Self::Null(_) => "null".to_string(), - Self::Struct(_, _) => todo!("serializing struct values is not yet supported"), - } - } - - /// Serializes this scalar as a string for use in hive partition file names. - pub fn serialize_encoded(&self) -> String { - if self.is_null() { - return NULL_PARTITION_VALUE_DATA_PATH.to_string(); - } - Path::from(self.serialize()).to_string() - } - - /// Create a [`Scalar`] form a row in an arrow array. 
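As a reading aid for the removed serializers: serialize produced the raw partition-value string, while serialize_encoded additionally path-encoded it and mapped nulls to the crate's NULL_PARTITION_VALUE_DATA_PATH marker. Expected behaviour, sketched with illustrative values (Scalar, DataType, PrimitiveType are the types defined in the file being removed):

fn partition_value_examples() {
    assert_eq!(Scalar::Integer(42).serialize(), "42");
    assert_eq!(Scalar::Boolean(true).serialize(), "true");
    // Dates are stored as days since 1970-01-01 and rendered as %Y-%m-%d.
    assert_eq!(Scalar::Date(1).serialize(), "1970-01-02");
    // Null partition values encode to the crate's null-partition marker.
    let null = Scalar::Null(DataType::Primitive(PrimitiveType::Integer));
    assert!(null.is_null());
    assert!(!null.serialize_encoded().is_empty());
}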
- pub fn from_array(arr: &dyn Array, index: usize) -> Option { - use arrow_array::*; - use arrow_schema::DataType::*; - - if arr.len() <= index { - return None; - } - if arr.is_null(index) { - return Some(Self::Null(arr.data_type().try_into().ok()?)); - } - - match arr.data_type() { - Utf8 => arr - .as_any() - .downcast_ref::() - .map(|v| Self::String(v.value(index).to_string())), - LargeUtf8 => arr - .as_any() - .downcast_ref::() - .map(|v| Self::String(v.value(index).to_string())), - Boolean => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Boolean(v.value(index))), - Binary => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Binary(v.value(index).to_vec())), - LargeBinary => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Binary(v.value(index).to_vec())), - FixedSizeBinary(_) => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Binary(v.value(index).to_vec())), - Int8 => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Byte(v.value(index))), - Int16 => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Short(v.value(index))), - Int32 => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Integer(v.value(index))), - Int64 => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Long(v.value(index))), - UInt8 => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Byte(v.value(index) as i8)), - UInt16 => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Short(v.value(index) as i16)), - UInt32 => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Integer(v.value(index) as i32)), - UInt64 => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Long(v.value(index) as i64)), - Float32 => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Float(v.value(index))), - Float64 => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Double(v.value(index))), - Decimal128(precision, scale) => { - arr.as_any().downcast_ref::().map(|v| { - let value = v.value(index); - Self::Decimal(value, *precision, *scale) - }) - } - Date32 => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Date(v.value(index))), - // TODO handle timezones when implementing timestamp ntz feature. 
- Timestamp(TimeUnit::Microsecond, None) => arr - .as_any() - .downcast_ref::() - .map(|v| Self::Timestamp(v.value(index))), - Struct(fields) => { - let struct_fields = fields - .iter() - .flat_map(|f| TryFrom::try_from(f.as_ref())) - .collect::>(); - let values = arr - .as_any() - .downcast_ref::() - .and_then(|struct_arr| { - struct_fields - .iter() - .map(|f: &StructField| { - struct_arr - .column_by_name(f.name()) - .and_then(|c| Self::from_array(c.as_ref(), index)) - }) - .collect::>>() - })?; - if struct_fields.len() != values.len() { - return None; - } - Some(Self::Struct(values, struct_fields)) - } - Float16 - | Decimal256(_, _) - | List(_) - | LargeList(_) - | FixedSizeList(_, _) - | Map(_, _) - | Date64 - | Timestamp(_, _) - | Time32(_) - | Time64(_) - | Duration(_) - | Interval(_) - | Dictionary(_, _) - | RunEndEncoded(_, _) - | Union(_, _) - | Null => None, - } - } -} - -impl PartialOrd for Scalar { - fn partial_cmp(&self, other: &Self) -> Option { - use Scalar::*; - match (self, other) { - (Null(_), Null(_)) => Some(Ordering::Equal), - (Integer(a), Integer(b)) => a.partial_cmp(b), - (Long(a), Long(b)) => a.partial_cmp(b), - (Short(a), Short(b)) => a.partial_cmp(b), - (Byte(a), Byte(b)) => a.partial_cmp(b), - (Float(a), Float(b)) => a.partial_cmp(b), - (Double(a), Double(b)) => a.partial_cmp(b), - (String(a), String(b)) => a.partial_cmp(b), - (Boolean(a), Boolean(b)) => a.partial_cmp(b), - (Timestamp(a), Timestamp(b)) => a.partial_cmp(b), - (Date(a), Date(b)) => a.partial_cmp(b), - (Binary(a), Binary(b)) => a.partial_cmp(b), - (Decimal(a, _, _), Decimal(b, _, _)) => a.partial_cmp(b), - (Struct(a, _), Struct(b, _)) => a.partial_cmp(b), - // TODO should we make an assumption about the ordering of nulls? - // rigth now this is only used for internal purposes. 
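Going the other way, the removed Scalar::from_array picked a single row out of an arrow column, mapping null slots to typed nulls and out-of-range indexes to None; roughly (Scalar is the enum defined in the file being removed):

use arrow_array::StringArray;

fn scalar_from_column_example() {
    let col = StringArray::from(vec![Some("a"), None]);
    assert_eq!(
        Scalar::from_array(&col, 0),
        Some(Scalar::String("a".to_string()))
    );
    // Null slot -> typed null; index past the end -> None.
    assert!(matches!(Scalar::from_array(&col, 1), Some(Scalar::Null(_))));
    assert_eq!(Scalar::from_array(&col, 2), None);
}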
- (Null(_), _) => Some(Ordering::Less), - (_, Null(_)) => Some(Ordering::Greater), - _ => None, - } - } -} - -impl Display for Scalar { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match self { - Self::Integer(i) => write!(f, "{}", i), - Self::Long(i) => write!(f, "{}", i), - Self::Short(i) => write!(f, "{}", i), - Self::Byte(i) => write!(f, "{}", i), - Self::Float(fl) => write!(f, "{}", fl), - Self::Double(fl) => write!(f, "{}", fl), - Self::String(s) => write!(f, "'{}'", s), - Self::Boolean(b) => write!(f, "{}", b), - Self::Timestamp(ts) => write!(f, "{}", ts), - Self::Date(d) => write!(f, "{}", d), - Self::Binary(b) => write!(f, "{:?}", b), - Self::Decimal(value, _, scale) => match scale.cmp(&0) { - Ordering::Equal => { - write!(f, "{}", value) - } - Ordering::Greater => { - let scalar_multiple = 10_i128.pow(*scale as u32); - write!(f, "{}", value / scalar_multiple)?; - write!(f, ".")?; - write!( - f, - "{:0>scale$}", - value % scalar_multiple, - scale = *scale as usize - ) - } - Ordering::Less => { - write!(f, "{}", value)?; - for _ in 0..(scale.abs()) { - write!(f, "0")?; - } - Ok(()) - } - }, - Self::Null(_) => write!(f, "null"), - Self::Struct(values, fields) => { - write!(f, "{{")?; - for (i, (value, field)) in values.iter().zip(fields.iter()).enumerate() { - if i > 0 { - write!(f, ", ")?; - } - write!(f, "{}: {}", field.name, value)?; - } - write!(f, "}}") - } - } - } -} - -impl From for Scalar { - fn from(i: i32) -> Self { - Self::Integer(i) - } -} - -impl From for Scalar { - fn from(i: i64) -> Self { - Self::Long(i) - } -} - -impl From for Scalar { - fn from(b: bool) -> Self { - Self::Boolean(b) - } -} - -impl From<&str> for Scalar { - fn from(s: &str) -> Self { - Self::String(s.into()) - } -} - -impl From for Scalar { - fn from(value: String) -> Self { - Self::String(value) - } -} - -// TODO: add more From impls - -impl PrimitiveType { - fn data_type(&self) -> DataType { - DataType::Primitive(self.clone()) - } - - /// Parses a string into a scalar value. - pub fn parse_scalar(&self, raw: &str) -> Result { - use PrimitiveType::*; - - lazy_static::lazy_static! { - static ref UNIX_EPOCH: DateTime = DateTime::from_timestamp(0, 0).unwrap(); - } - - if raw.is_empty() || raw == NULL_PARTITION_VALUE_DATA_PATH { - return Ok(Scalar::Null(self.data_type())); - } - - match self { - String => Ok(Scalar::String(raw.to_string())), - Byte => self.str_parse_scalar(raw, Scalar::Byte), - Short => self.str_parse_scalar(raw, Scalar::Short), - Integer => self.str_parse_scalar(raw, Scalar::Integer), - Long => self.str_parse_scalar(raw, Scalar::Long), - Float => self.str_parse_scalar(raw, Scalar::Float), - Double => self.str_parse_scalar(raw, Scalar::Double), - Boolean => { - if raw.eq_ignore_ascii_case("true") { - Ok(Scalar::Boolean(true)) - } else if raw.eq_ignore_ascii_case("false") { - Ok(Scalar::Boolean(false)) - } else { - Err(self.parse_error(raw)) - } - } - Date => { - let date = NaiveDate::parse_from_str(raw, "%Y-%m-%d") - .map_err(|_| self.parse_error(raw))? 
- .and_hms_opt(0, 0, 0) - .ok_or(self.parse_error(raw))?; - let date = Utc.from_utc_datetime(&date); - let days = date.signed_duration_since(*UNIX_EPOCH).num_days() as i32; - Ok(Scalar::Date(days)) - } - Timestamp => { - let timestamp = NaiveDateTime::parse_from_str(raw, "%Y-%m-%d %H:%M:%S%.f") - .map_err(|_| self.parse_error(raw))?; - let timestamp = Utc.from_utc_datetime(×tamp); - let micros = timestamp - .signed_duration_since(*UNIX_EPOCH) - .num_microseconds() - .ok_or(self.parse_error(raw))?; - Ok(Scalar::Timestamp(micros)) - } - Binary => { - let bytes = parse_escaped_binary_string(raw).map_err(|_| self.parse_error(raw))?; - Ok(Scalar::Binary(bytes)) - } - _ => todo!("parsing {:?} is not yet supported", self), - } - } - - fn parse_error(&self, raw: &str) -> Error { - Error::Parse(raw.to_string(), self.data_type()) - } - - fn str_parse_scalar( - &self, - raw: &str, - f: impl FnOnce(T) -> Scalar, - ) -> Result { - match raw.parse() { - Ok(val) => Ok(f(val)), - Err(..) => Err(self.parse_error(raw)), - } - } -} - -fn create_escaped_binary_string(data: &[u8]) -> String { - let mut escaped_string = String::new(); - for &byte in data { - // Convert each byte to its two-digit hexadecimal representation - let hex_representation = format!("{:04X}", byte); - // Append the hexadecimal representation with an escape sequence - escaped_string.push_str("\\u"); - escaped_string.push_str(&hex_representation); - } - escaped_string -} - -fn parse_escaped_binary_string(escaped_string: &str) -> Result, &'static str> { - let mut parsed_bytes = Vec::new(); - let mut chars = escaped_string.chars(); - - while let Some(ch) = chars.next() { - if ch == '\\' { - // Check for the escape sequence "\\u" indicating a hexadecimal value - if chars.next() == Some('u') { - // Read two hexadecimal digits and convert to u8 - if let (Some(digit1), Some(digit2), Some(digit3), Some(digit4)) = - (chars.next(), chars.next(), chars.next(), chars.next()) - { - if let Ok(byte) = - u8::from_str_radix(&format!("{}{}{}{}", digit1, digit2, digit3, digit4), 16) - { - parsed_bytes.push(byte); - } else { - return Err("Error parsing hexadecimal value"); - } - } else { - return Err("Incomplete escape sequence"); - } - } else { - // Unrecognized escape sequence - return Err("Unrecognized escape sequence"); - } - } else { - // Regular character, convert to u8 and push into the result vector - parsed_bytes.push(ch as u8); - } - } - - Ok(parsed_bytes) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_binary_roundtrip() { - let scalar = Scalar::Binary(vec![0, 1, 2, 3, 4, 5]); - let parsed = PrimitiveType::Binary - .parse_scalar(&scalar.serialize()) - .unwrap(); - assert_eq!(scalar, parsed); - } - - #[test] - fn test_decimal_display() { - let s = Scalar::Decimal(123456789, 9, 2); - assert_eq!(s.to_string(), "1234567.89"); - - let s = Scalar::Decimal(123456789, 9, 0); - assert_eq!(s.to_string(), "123456789"); - - let s = Scalar::Decimal(123456789, 9, 9); - assert_eq!(s.to_string(), "0.123456789"); - - let s = Scalar::Decimal(123, 9, -3); - assert_eq!(s.to_string(), "123000"); - } -} diff --git a/crates/core/src/kernel/mod.rs b/crates/core/src/kernel/mod.rs index 876a09a33c..ce788d6c4d 100644 --- a/crates/core/src/kernel/mod.rs +++ b/crates/core/src/kernel/mod.rs @@ -4,12 +4,11 @@ pub mod arrow; pub mod error; -pub mod expressions; pub mod models; +pub mod scalars; mod snapshot; pub use error::*; -pub use expressions::*; pub use models::*; pub use snapshot::*; diff --git a/crates/core/src/kernel/models/actions.rs 
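One more illustration of the removed parser, which is what turned partition-path strings back into typed values and what the binary round-trip test above exercises: empty strings (and the null-partition marker) become typed nulls, dates use %Y-%m-%d, timestamps a microsecond-precision %Y-%m-%d %H:%M:%S%.f. A sketch (PrimitiveType, Scalar, Error are from the removed module):

fn parse_partition_values() -> Result<(), Error> {
    // One day after the UNIX epoch -> Date(1).
    assert_eq!(
        PrimitiveType::Date.parse_scalar("1970-01-02")?,
        Scalar::Date(1)
    );
    // Empty strings parse to a typed null.
    assert!(PrimitiveType::Integer.parse_scalar("")?.is_null());
    // Numeric strings parse into the matching scalar variant.
    assert_eq!(PrimitiveType::Long.parse_scalar("42")?, Scalar::Long(42));
    Ok(())
}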
b/crates/core/src/kernel/models/actions.rs index 28eaa89cc4..962b71b21b 100644 --- a/crates/core/src/kernel/models/actions.rs +++ b/crates/core/src/kernel/models/actions.rs @@ -5,6 +5,8 @@ use std::str::FromStr; // use std::sync::Arc; // use roaring::RoaringTreemap; +use crate::DeltaConfigKey; +use maplit::hashset; use serde::{Deserialize, Serialize}; use tracing::warn; use url::Url; @@ -137,30 +139,243 @@ pub struct Protocol { impl Protocol { /// Create a new protocol action - pub fn new(min_reader_version: i32, min_wrriter_version: i32) -> Self { + pub fn new(min_reader_version: i32, min_writer_version: i32) -> Self { Self { min_reader_version, - min_writer_version: min_wrriter_version, + min_writer_version, reader_features: None, writer_features: None, } } - /// set the reader features in the protocol action + /// set the reader features in the protocol action, automatically bumps min_reader_version pub fn with_reader_features( mut self, reader_features: impl IntoIterator>, ) -> Self { - self.reader_features = Some(reader_features.into_iter().map(|c| c.into()).collect()); + let all_reader_features = reader_features + .into_iter() + .map(Into::into) + .collect::>(); + if !all_reader_features.is_empty() { + self.min_reader_version = 3 + } + self.reader_features = Some(all_reader_features); self } - /// set the writer features in the protocol action + /// set the writer features in the protocol action, automatically bumps min_writer_version pub fn with_writer_features( mut self, writer_features: impl IntoIterator>, ) -> Self { - self.writer_features = Some(writer_features.into_iter().map(|c| c.into()).collect()); + let all_writer_feautures = writer_features + .into_iter() + .map(|c| c.into()) + .collect::>(); + if !all_writer_feautures.is_empty() { + self.min_writer_version = 7 + } + self.writer_features = Some(all_writer_feautures); + self + } + + /// Converts existing properties into features if the reader_version is >=3 or writer_version >=3 + /// only converts features that are "true" + pub fn move_table_properties_into_features( + mut self, + configuration: &HashMap>, + ) -> Protocol { + if self.min_writer_version >= 7 { + let mut converted_writer_features = configuration + .iter() + .filter(|(_, value)| { + value.as_ref().map_or(false, |v| { + v.to_ascii_lowercase().parse::().is_ok_and(|v| v) + }) + }) + .collect::>>() + .keys() + .map(|key| (*key).clone().into()) + .filter(|v| !matches!(v, WriterFeatures::Other(_))) + .collect::>(); + + if configuration + .keys() + .any(|v| v.starts_with("delta.constraints.")) + { + converted_writer_features.insert(WriterFeatures::CheckConstraints); + } + + match self.writer_features { + Some(mut features) => { + features.extend(converted_writer_features); + self.writer_features = Some(features); + } + None => self.writer_features = Some(converted_writer_features), + } + } + if self.min_reader_version > 3 { + let converted_reader_features = configuration + .iter() + .filter(|(_, value)| { + value.as_ref().map_or(false, |v| { + v.to_ascii_lowercase().parse::().is_ok_and(|v| v) + }) + }) + .map(|(key, _)| (*key).clone().into()) + .filter(|v| !matches!(v, ReaderFeatures::Other(_))) + .collect::>(); + match self.reader_features { + Some(mut features) => { + features.extend(converted_reader_features); + self.reader_features = Some(features); + } + None => self.reader_features = Some(converted_reader_features), + } + } + self + } + /// Will apply the properties to the protocol by either bumping the version or setting + /// features + pub fn 
apply_properties_to_protocol( + mut self, + new_properties: &HashMap, + raise_if_not_exists: bool, + ) -> DeltaResult { + let mut parsed_properties: HashMap = HashMap::new(); + + for (key, value) in new_properties { + if let Ok(parsed_key) = key.parse::() { + parsed_properties.insert(parsed_key, value.to_string()); + } else if raise_if_not_exists { + return Err(Error::Generic(format!( + "Error parsing property '{}':'{}'", + key, value + ))); + } + } + + // Check and update delta.minReaderVersion + if let Some(min_reader_version) = parsed_properties.get(&DeltaConfigKey::MinReaderVersion) { + let new_min_reader_version = min_reader_version.parse::(); + match new_min_reader_version { + Ok(version) => match version { + 1..=3 => { + if version > self.min_reader_version { + self.min_reader_version = version + } + } + _ => { + return Err(Error::Generic(format!( + "delta.minReaderVersion = '{}' is invalid, valid values are ['1','2','3']", + min_reader_version + ))) + } + }, + Err(_) => { + return Err(Error::Generic(format!( + "delta.minReaderVersion = '{}' is invalid, valid values are ['1','2','3']", + min_reader_version + ))) + } + } + } + + // Check and update delta.minWriterVersion + if let Some(min_writer_version) = parsed_properties.get(&DeltaConfigKey::MinWriterVersion) { + let new_min_writer_version = min_writer_version.parse::(); + match new_min_writer_version { + Ok(version) => match version { + 2..=7 => { + if version > self.min_writer_version { + self.min_writer_version = version + } + } + _ => { + return Err(Error::Generic(format!( + "delta.minWriterVersion = '{}' is invalid, valid values are ['2','3','4','5','6','7']", + min_writer_version + ))) + } + }, + Err(_) => { + return Err(Error::Generic(format!( + "delta.minWriterVersion = '{}' is invalid, valid values are ['2','3','4','5','6','7']", + min_writer_version + ))) + } + } + } + + // Check enableChangeDataFeed and bump protocol or add writerFeature if writer versions is >=7 + if let Some(enable_cdf) = parsed_properties.get(&DeltaConfigKey::EnableChangeDataFeed) { + let if_enable_cdf = enable_cdf.to_ascii_lowercase().parse::(); + match if_enable_cdf { + Ok(true) => { + if self.min_writer_version >= 7 { + match self.writer_features { + Some(mut features) => { + features.insert(WriterFeatures::ChangeDataFeed); + self.writer_features = Some(features); + } + None => { + self.writer_features = + Some(hashset! {WriterFeatures::ChangeDataFeed}) + } + } + } else if self.min_writer_version <= 3 { + self.min_writer_version = 4 + } + } + Ok(false) => {} + _ => { + return Err(Error::Generic(format!( + "delta.enableChangeDataFeed = '{}' is invalid, valid values are ['true']", + enable_cdf + ))) + } + } + } + + if let Some(enable_dv) = parsed_properties.get(&DeltaConfigKey::EnableDeletionVectors) { + let if_enable_dv = enable_dv.to_ascii_lowercase().parse::(); + match if_enable_dv { + Ok(true) => { + let writer_features = match self.writer_features { + Some(mut features) => { + features.insert(WriterFeatures::DeletionVectors); + features + } + None => hashset! {WriterFeatures::DeletionVectors}, + }; + let reader_features = match self.reader_features { + Some(mut features) => { + features.insert(ReaderFeatures::DeletionVectors); + features + } + None => hashset! 
{ReaderFeatures::DeletionVectors}, + }; + self.min_reader_version = 3; + self.min_writer_version = 7; + self.writer_features = Some(writer_features); + self.reader_features = Some(reader_features); + } + Ok(false) => {} + _ => { + return Err(Error::Generic(format!( + "delta.enableDeletionVectors = '{}' is invalid, valid values are ['true']", + enable_dv + ))) + } + } + } + Ok(self) + } + /// Enable timestamp_ntz in the protocol + pub fn enable_timestamp_ntz(mut self) -> Protocol { + self = self.with_reader_features(vec![ReaderFeatures::TimestampWithoutTimezone]); + self = self.with_writer_features(vec![WriterFeatures::TimestampWithoutTimezone]); self } } @@ -175,7 +390,7 @@ pub enum ReaderFeatures { /// Deletion vectors for merge, update, delete DeletionVectors, /// timestamps without timezone support - #[serde(alias = "timestampNtz")] + #[serde(rename = "timestampNtz")] TimestampWithoutTimezone, /// version 2 of checkpointing V2Checkpoint, @@ -189,7 +404,9 @@ impl From<&parquet::record::Field> for ReaderFeatures { match value { parquet::record::Field::Str(feature) => match feature.as_str() { "columnMapping" => ReaderFeatures::ColumnMapping, - "deletionVectors" => ReaderFeatures::DeletionVectors, + "deletionVectors" | "delta.enableDeletionVectors" => { + ReaderFeatures::DeletionVectors + } "timestampNtz" => ReaderFeatures::TimestampWithoutTimezone, "v2Checkpoint" => ReaderFeatures::V2Checkpoint, f => ReaderFeatures::Other(f.to_string()), @@ -259,7 +476,7 @@ pub enum WriterFeatures { /// Row tracking on tables RowTracking, /// timestamps without timezone support - #[serde(alias = "timestampNtz")] + #[serde(rename = "timestampNtz")] TimestampWithoutTimezone, /// domain specific metadata DomainMetadata, @@ -281,15 +498,15 @@ impl From for WriterFeatures { impl From<&str> for WriterFeatures { fn from(value: &str) -> Self { match value { - "appendOnly" => WriterFeatures::AppendOnly, + "appendOnly" | "delta.appendOnly" => WriterFeatures::AppendOnly, "invariants" => WriterFeatures::Invariants, "checkConstraints" => WriterFeatures::CheckConstraints, - "changeDataFeed" => WriterFeatures::ChangeDataFeed, + "changeDataFeed" | "delta.enableChangeDataFeed" => WriterFeatures::ChangeDataFeed, "generatedColumns" => WriterFeatures::GeneratedColumns, "columnMapping" => WriterFeatures::ColumnMapping, "identityColumns" => WriterFeatures::IdentityColumns, - "deletionVectors" => WriterFeatures::DeletionVectors, - "rowTracking" => WriterFeatures::RowTracking, + "deletionVectors" | "delta.enableDeletionVectors" => WriterFeatures::DeletionVectors, + "rowTracking" | "delta.enableRowTracking" => WriterFeatures::RowTracking, "timestampNtz" => WriterFeatures::TimestampWithoutTimezone, "domainMetadata" => WriterFeatures::DomainMetadata, "v2Checkpoint" => WriterFeatures::V2Checkpoint, @@ -351,7 +568,7 @@ impl From<&parquet::record::Field> for WriterFeatures { } ///Storage type of deletion vector -#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] +#[derive(Serialize, Deserialize, Copy, Clone, Debug, PartialEq, Eq)] pub enum StorageType { /// Stored at relative path derived from a UUID. #[serde(rename = "u")] @@ -657,7 +874,7 @@ pub struct AddCDCFile { /// enable idempotency. #[derive(Serialize, Deserialize, Debug, Default, Clone, PartialEq, Eq)] #[serde(rename_all = "camelCase")] -pub struct Txn { +pub struct Transaction { /// A unique identifier for the application performing the transaction. 
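Putting the new builders together: with_reader_features/with_writer_features now move the table onto the (3, 7) table-features protocol as soon as any feature is supplied, while apply_properties_to_protocol lets table properties drive the same state. A small sketch of the intended behaviour, assuming the delta.enableChangeDataFeed key spelling from DeltaConfigKey and that Protocol's version fields remain public as elsewhere in the crate:

use std::collections::HashMap;

fn protocol_examples() -> DeltaResult<()> {
    // Supplying any feature bumps the protocol to reader 3 / writer 7.
    let protocol = Protocol::new(1, 2)
        .with_reader_features([ReaderFeatures::DeletionVectors])
        .with_writer_features([WriterFeatures::DeletionVectors]);
    assert_eq!(protocol.min_reader_version, 3);
    assert_eq!(protocol.min_writer_version, 7);

    // On a legacy (1, 2) protocol, enabling CDF only bumps the writer
    // version to 4 rather than switching to table features.
    let mut props = HashMap::new();
    props.insert(
        "delta.enableChangeDataFeed".to_string(),
        "true".to_string(),
    );
    let protocol = Protocol::new(1, 2).apply_properties_to_protocol(&props, true)?;
    assert_eq!(protocol.min_writer_version, 4);
    Ok(())
}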
pub app_id: String, @@ -669,6 +886,26 @@ pub struct Txn { pub last_updated: Option, } +impl Transaction { + /// Create a new application transactions. See [`Txn`] for details. + pub fn new(app_id: impl ToString, version: i64) -> Self { + Self::new_with_last_update(app_id, version, None) + } + + /// Create a new application transactions. See [`Txn`] for details. + pub fn new_with_last_update( + app_id: impl ToString, + version: i64, + last_updated: Option, + ) -> Self { + Transaction { + app_id: app_id.to_string(), + version, + last_updated, + } + } +} + /// The commitInfo is a fairly flexible action within the delta specification, where arbitrary data can be stored. /// However the reference implementation as well as delta-rs store useful information that may for instance /// allow us to be more permissive in commit conflict resolution. @@ -714,6 +951,10 @@ pub struct CommitInfo { /// Additional provenance information for the commit #[serde(flatten, default)] pub info: HashMap, + + /// User defined metadata + #[serde(skip_serializing_if = "Option::is_none")] + pub user_metadata: Option, } /// The domain metadata action contains a configuration (string) for a named metadata domain @@ -766,7 +1007,7 @@ pub struct Sidecar { pub tags: Option>>, } -#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +#[derive(Serialize, Deserialize, Debug, Copy, Clone, PartialEq, Eq)] /// The isolation level applied during transaction pub enum IsolationLevel { /// The strongest isolation level. It ensures that committed write operations diff --git a/crates/core/src/kernel/models/fields.rs b/crates/core/src/kernel/models/fields.rs index fa672aaefc..6c699f0e88 100644 --- a/crates/core/src/kernel/models/fields.rs +++ b/crates/core/src/kernel/models/fields.rs @@ -1,8 +1,8 @@ //! Schema definitions for action types +use delta_kernel::schema::{ArrayType, DataType, MapType, StructField, StructType}; use lazy_static::lazy_static; -use super::schema::{ArrayType, DataType, MapType, StructField, StructType}; use super::ActionType; impl ActionType { diff --git a/crates/core/src/kernel/models/mod.rs b/crates/core/src/kernel/models/mod.rs index eda7e6fb60..a8ee2f8d31 100644 --- a/crates/core/src/kernel/models/mod.rs +++ b/crates/core/src/kernel/models/mod.rs @@ -14,7 +14,7 @@ mod schema; pub use actions::*; pub use schema::*; -#[derive(Debug)] +#[derive(Debug, Hash, PartialEq, Eq, Clone, Serialize, Deserialize)] /// The type of action that was performed on the table pub enum ActionType { /// modify the data in a table by adding individual logical files @@ -49,7 +49,7 @@ pub enum Action { Add(Add), Remove(Remove), Cdc(AddCDCFile), - Txn(Txn), + Txn(Transaction), CommitInfo(CommitInfo), DomainMetadata(DomainMetadata), } @@ -94,8 +94,8 @@ impl From for Action { } } -impl From for Action { - fn from(a: Txn) -> Self { +impl From for Action { + fn from(a: Transaction) -> Self { Self::Txn(a) } } diff --git a/crates/core/src/kernel/models/schema.rs b/crates/core/src/kernel/models/schema.rs index 874bade71d..3a88564f1d 100644 --- a/crates/core/src/kernel/models/schema.rs +++ b/crates/core/src/kernel/models/schema.rs @@ -1,12 +1,11 @@ //! 
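The renamed Transaction action keeps the old Txn shape, and the new constructors make the idempotent-write bookkeeping explicit; for instance (a sketch using the Transaction and Action types defined in this module):

fn txn_action_example() -> Action {
    // Record that application "my-app" has committed its version 5.
    let txn = Transaction::new("my-app", 5);
    assert_eq!(txn.app_id, "my-app");
    assert_eq!(txn.version, 5);
    assert!(txn.last_updated.is_none());
    Action::Txn(txn)
}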
Delta table schema -use std::borrow::Borrow; -use std::fmt::Formatter; -use std::hash::{Hash, Hasher}; use std::sync::Arc; -use std::{collections::HashMap, fmt::Display}; -use serde::{Deserialize, Serialize}; +pub use delta_kernel::schema::{ + ArrayType, ColumnMetadataKey, DataType, MapType, MetadataValue, PrimitiveType, StructField, + StructType, +}; use serde_json::Value; use crate::kernel::error::Error; @@ -17,76 +16,6 @@ pub type Schema = StructType; /// Schema reference type pub type SchemaRef = Arc; -/// A value that can be stored in the metadata of a Delta table schema entity. -#[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] -#[serde(untagged)] -pub enum MetadataValue { - /// A number value - Number(i32), - /// A string value - String(String), - /// A Boolean value - Boolean(bool), -} - -impl From for MetadataValue { - fn from(value: String) -> Self { - Self::String(value) - } -} - -impl From<&String> for MetadataValue { - fn from(value: &String) -> Self { - Self::String(value.clone()) - } -} - -impl From for MetadataValue { - fn from(value: i32) -> Self { - Self::Number(value) - } -} - -impl From for MetadataValue { - fn from(value: bool) -> Self { - Self::Boolean(value) - } -} - -impl From for MetadataValue { - fn from(value: Value) -> Self { - Self::String(value.to_string()) - } -} - -#[derive(Debug)] -#[allow(missing_docs)] -pub enum ColumnMetadataKey { - ColumnMappingId, - ColumnMappingPhysicalName, - GenerationExpression, - IdentityStart, - IdentityStep, - IdentityHighWaterMark, - IdentityAllowExplicitInsert, - Invariants, -} - -impl AsRef for ColumnMetadataKey { - fn as_ref(&self) -> &str { - match self { - Self::ColumnMappingId => "delta.columnMapping.id", - Self::ColumnMappingPhysicalName => "delta.columnMapping.physicalName", - Self::GenerationExpression => "delta.generationExpression", - Self::IdentityAllowExplicitInsert => "delta.identity.allowExplicitInsert", - Self::IdentityHighWaterMark => "delta.identity.highWaterMark", - Self::IdentityStart => "delta.identity.start", - Self::IdentityStep => "delta.identity.step", - Self::Invariants => "delta.invariants", - } - } -} - /// An invariant for a column that is enforced on all writes to a Delta table. #[derive(Eq, PartialEq, Debug, Default, Clone)] pub struct Invariant { @@ -116,154 +45,17 @@ impl DataCheck for Invariant { } } -/// Represents a struct field defined in the Delta table schema. 
-// https://github.com/delta-io/delta/blob/master/PROTOCOL.md#Schema-Serialization-Format -#[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] -pub struct StructField { - /// Name of this (possibly nested) column - pub name: String, - /// The data type of this field - #[serde(rename = "type")] - pub data_type: DataType, - /// Denotes whether this Field can be null - pub nullable: bool, - /// A JSON map containing information about this column - pub metadata: HashMap, -} - -impl Hash for StructField { - fn hash(&self, state: &mut H) { - self.name.hash(state); - } -} - -impl Borrow for StructField { - fn borrow(&self) -> &str { - self.name.as_ref() - } -} - -impl Eq for StructField {} - -impl StructField { - /// Creates a new field - pub fn new(name: impl Into, data_type: impl Into, nullable: bool) -> Self { - Self { - name: name.into(), - data_type: data_type.into(), - nullable, - metadata: HashMap::default(), - } - } - - /// Creates a new field with metadata - pub fn with_metadata( - mut self, - metadata: impl IntoIterator, impl Into)>, - ) -> Self { - self.metadata = metadata - .into_iter() - .map(|(k, v)| (k.into(), v.into())) - .collect(); - self - } - - /// Get the value of a specific metadata key - pub fn get_config_value(&self, key: &ColumnMetadataKey) -> Option<&MetadataValue> { - self.metadata.get(key.as_ref()) - } - - #[inline] - /// Returns the name of the column - pub fn name(&self) -> &String { - &self.name - } - - #[inline] - /// Returns whether the column is nullable - pub fn is_nullable(&self) -> bool { - self.nullable - } - - /// Returns the physical name of the column - /// Equals the name if column mapping is not enabled on table - pub fn physical_name(&self) -> Result<&str, Error> { - // Even on mapping type id the physical name should be there for partitions - let phys_name = self.get_config_value(&ColumnMetadataKey::ColumnMappingPhysicalName); - match phys_name { - None => Ok(&self.name), - Some(MetadataValue::Boolean(_)) => Ok(&self.name), - Some(MetadataValue::String(s)) => Ok(s), - Some(MetadataValue::Number(_)) => Err(Error::MetadataError( - "Unexpected type for physical name".to_string(), - )), - } - } - - #[inline] - /// Returns the data type of the column - pub const fn data_type(&self) -> &DataType { - &self.data_type - } - - #[inline] - /// Returns the metadata of the column - pub const fn metadata(&self) -> &HashMap { - &self.metadata - } -} - -/// A struct is used to represent both the top-level schema of the table -/// as well as struct columns that contain nested columns. -#[derive(Debug, Serialize, Deserialize, PartialEq, Clone, Eq)] -pub struct StructType { - #[serde(rename = "type")] - /// The type of this struct - pub type_name: String, - /// The type of element stored in this array - pub fields: Vec, +/// Trait to add convenince functions to struct type +pub trait StructTypeExt { + /// Get all invariants in the schemas + fn get_invariants(&self) -> Result, Error>; } -impl StructType { - /// Creates a new struct type - pub fn new(fields: Vec) -> Self { - Self { - type_name: "struct".into(), - fields, - } - } - - /// Returns an immutable reference of the fields in the struct - pub fn fields(&self) -> &Vec { - &self.fields - } - - /// Find the index of the column with the given name. 
- pub fn index_of(&self, name: &str) -> Result { - let (idx, _) = self - .fields() - .iter() - .enumerate() - .find(|(_, b)| b.name() == name) - .ok_or_else(|| { - let valid_fields: Vec<_> = self.fields.iter().map(|f| f.name()).collect(); - Error::Schema(format!( - "Unable to get field named \"{name}\". Valid fields: {valid_fields:?}" - )) - })?; - Ok(idx) - } - - /// Returns a reference of a specific [`StructField`] instance selected by name. - pub fn field_with_name(&self, name: &str) -> Result<&StructField, Error> { - Ok(&self.fields[self.index_of(name)?]) - } - +impl StructTypeExt for StructType { /// Get all invariants in the schemas - pub fn get_invariants(&self) -> Result, Error> { + fn get_invariants(&self) -> Result, Error> { let mut remaining_fields: Vec<(String, StructField)> = self .fields() - .iter() .map(|field| (field.name.clone(), field.clone())) .collect(); let mut invariants: Vec = Vec::new(); @@ -282,7 +74,6 @@ impl StructType { remaining_fields.extend( inner .fields() - .iter() .map(|field| { let new_prefix = add_segment(&field_path, &field.name); (new_prefix, field.clone()) @@ -334,469 +125,12 @@ impl StructType { } } -impl FromIterator for StructType { - fn from_iter>(iter: T) -> Self { - Self { - type_name: "struct".into(), - fields: iter.into_iter().collect(), - } - } -} - -impl<'a> FromIterator<&'a StructField> for StructType { - fn from_iter>(iter: T) -> Self { - Self { - type_name: "struct".into(), - fields: iter.into_iter().cloned().collect(), - } - } -} - -impl From<[StructField; N]> for StructType { - fn from(value: [StructField; N]) -> Self { - Self { - type_name: "struct".into(), - fields: value.to_vec(), - } - } -} - -impl<'a, const N: usize> From<[&'a StructField; N]> for StructType { - fn from(value: [&'a StructField; N]) -> Self { - Self { - type_name: "struct".into(), - fields: value.into_iter().cloned().collect(), - } - } -} - -impl<'a> IntoIterator for &'a StructType { - type Item = &'a StructField; - type IntoIter = std::slice::Iter<'a, StructField>; - - fn into_iter(self) -> Self::IntoIter { - self.fields.iter() - } -} - -#[derive(Debug, Serialize, Deserialize, PartialEq, Clone, Eq)] -#[serde(rename_all = "camelCase")] -/// An array stores a variable length collection of items of some type. 
-pub struct ArrayType { - #[serde(rename = "type")] - /// The type of this struct - pub type_name: String, - /// The type of element stored in this array - pub element_type: DataType, - /// Denoting whether this array can contain one or more null values - pub contains_null: bool, -} - -impl ArrayType { - /// Creates a new array type - pub fn new(element_type: DataType, contains_null: bool) -> Self { - Self { - type_name: "array".into(), - element_type, - contains_null, - } - } - - #[inline] - /// Returns the element type of the array - pub const fn element_type(&self) -> &DataType { - &self.element_type - } - - #[inline] - /// Returns whether the array can contain null values - pub const fn contains_null(&self) -> bool { - self.contains_null - } -} - -#[derive(Debug, Serialize, Deserialize, PartialEq, Clone, Eq)] -#[serde(rename_all = "camelCase")] -/// A map stores an arbitrary length collection of key-value pairs -pub struct MapType { - #[serde(rename = "type")] - /// The type of this struct - pub type_name: String, - /// The type of element used for the key of this map - pub key_type: DataType, - /// The type of element used for the value of this map - pub value_type: DataType, - /// Denoting whether this array can contain one or more null values - #[serde(default = "default_true")] - pub value_contains_null: bool, -} - -impl MapType { - /// Creates a new map type - pub fn new(key_type: DataType, value_type: DataType, value_contains_null: bool) -> Self { - Self { - type_name: "map".into(), - key_type, - value_type, - value_contains_null, - } - } - - #[inline] - /// Returns the key type of the map - pub const fn key_type(&self) -> &DataType { - &self.key_type - } - - #[inline] - /// Returns the value type of the map - pub const fn value_type(&self) -> &DataType { - &self.value_type - } - - #[inline] - /// Returns whether the map can contain null values - pub const fn value_contains_null(&self) -> bool { - self.value_contains_null - } -} - -fn default_true() -> bool { - true -} - -#[derive(Debug, Serialize, Deserialize, PartialEq, Clone, Eq)] -#[serde(rename_all = "camelCase")] -/// Primitive types supported by Delta -pub enum PrimitiveType { - /// UTF-8 encoded string of characters - String, - /// i64: 8-byte signed integer. Range: -9223372036854775808 to 9223372036854775807 - Long, - /// i32: 4-byte signed integer. Range: -2147483648 to 2147483647 - Integer, - /// i16: 2-byte signed integer numbers. Range: -32768 to 32767 - Short, - /// i8: 1-byte signed integer number. Range: -128 to 127 - Byte, - /// f32: 4-byte single-precision floating-point numbers - Float, - /// f64: 8-byte double-precision floating-point numbers - Double, - /// bool: boolean values - Boolean, - /// Binary: uninterpreted binary data - Binary, - /// Date: Calendar date (year, month, day) - Date, - /// Microsecond precision timestamp, adjusted to UTC. 
- Timestamp, - // TODO: timestamp without timezone - #[serde( - serialize_with = "serialize_decimal", - deserialize_with = "deserialize_decimal", - untagged - )] - /// Decimal: arbitrary precision decimal numbers - Decimal(u8, i8), -} - -fn serialize_decimal( - precision: &u8, - scale: &i8, - serializer: S, -) -> Result { - serializer.serialize_str(&format!("decimal({},{})", precision, scale)) -} - -fn deserialize_decimal<'de, D>(deserializer: D) -> Result<(u8, i8), D::Error> -where - D: serde::Deserializer<'de>, -{ - let str_value = String::deserialize(deserializer)?; - if !str_value.starts_with("decimal(") || !str_value.ends_with(')') { - return Err(serde::de::Error::custom(format!( - "Invalid decimal: {}", - str_value - ))); - } - - let mut parts = str_value[8..str_value.len() - 1].split(','); - let precision = parts - .next() - .and_then(|part| part.trim().parse::().ok()) - .ok_or_else(|| { - serde::de::Error::custom(format!("Invalid precision in decimal: {}", str_value)) - })?; - let scale = parts - .next() - .and_then(|part| part.trim().parse::().ok()) - .ok_or_else(|| { - serde::de::Error::custom(format!("Invalid scale in decimal: {}", str_value)) - })?; - - Ok((precision, scale)) -} - -impl Display for PrimitiveType { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match self { - PrimitiveType::String => write!(f, "string"), - PrimitiveType::Long => write!(f, "long"), - PrimitiveType::Integer => write!(f, "integer"), - PrimitiveType::Short => write!(f, "short"), - PrimitiveType::Byte => write!(f, "byte"), - PrimitiveType::Float => write!(f, "float"), - PrimitiveType::Double => write!(f, "double"), - PrimitiveType::Boolean => write!(f, "boolean"), - PrimitiveType::Binary => write!(f, "binary"), - PrimitiveType::Date => write!(f, "date"), - PrimitiveType::Timestamp => write!(f, "timestamp"), - PrimitiveType::Decimal(precision, scale) => { - write!(f, "decimal({},{})", precision, scale) - } - } - } -} - -#[derive(Debug, Serialize, Deserialize, PartialEq, Clone, Eq)] -#[serde(untagged, rename_all = "camelCase")] -/// Top level delta tdatatypes -pub enum DataType { - /// UTF-8 encoded string of characters - Primitive(PrimitiveType), - /// An array stores a variable length collection of items of some type. - Array(Box), - /// A struct is used to represent both the top-level schema of the table as well - /// as struct columns that contain nested columns. 
- Struct(Box), - /// A map stores an arbitrary length collection of key-value pairs - /// with a single keyType and a single valueType - Map(Box), -} - -impl From for DataType { - fn from(map_type: MapType) -> Self { - DataType::Map(Box::new(map_type)) - } -} - -impl From for DataType { - fn from(struct_type: StructType) -> Self { - DataType::Struct(Box::new(struct_type)) - } -} - -impl From for DataType { - fn from(array_type: ArrayType) -> Self { - DataType::Array(Box::new(array_type)) - } -} - -#[allow(missing_docs)] -impl DataType { - pub const STRING: Self = DataType::Primitive(PrimitiveType::String); - pub const LONG: Self = DataType::Primitive(PrimitiveType::Long); - pub const INTEGER: Self = DataType::Primitive(PrimitiveType::Integer); - pub const SHORT: Self = DataType::Primitive(PrimitiveType::Short); - pub const BYTE: Self = DataType::Primitive(PrimitiveType::Byte); - pub const FLOAT: Self = DataType::Primitive(PrimitiveType::Float); - pub const DOUBLE: Self = DataType::Primitive(PrimitiveType::Double); - pub const BOOLEAN: Self = DataType::Primitive(PrimitiveType::Boolean); - pub const BINARY: Self = DataType::Primitive(PrimitiveType::Binary); - pub const DATE: Self = DataType::Primitive(PrimitiveType::Date); - pub const TIMESTAMP: Self = DataType::Primitive(PrimitiveType::Timestamp); - - pub fn decimal(precision: u8, scale: i8) -> Self { - DataType::Primitive(PrimitiveType::Decimal(precision, scale)) - } - - pub fn struct_type(fields: Vec) -> Self { - DataType::Struct(Box::new(StructType::new(fields))) - } -} - -impl Display for DataType { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match self { - DataType::Primitive(p) => write!(f, "{}", p), - DataType::Array(a) => write!(f, "array<{}>", a.element_type), - DataType::Struct(s) => { - write!(f, "struct<")?; - for (i, field) in s.fields.iter().enumerate() { - if i > 0 { - write!(f, ", ")?; - } - write!(f, "{}: {}", field.name, field.data_type)?; - } - write!(f, ">") - } - DataType::Map(m) => write!(f, "map<{}, {}>", m.key_type, m.value_type), - } - } -} - #[cfg(test)] mod tests { use super::*; use serde_json; use serde_json::json; - #[test] - fn test_serde_data_types() { - let data = r#" - { - "name": "a", - "type": "integer", - "nullable": false, - "metadata": {} - } - "#; - let field: StructField = serde_json::from_str(data).unwrap(); - assert!(matches!( - field.data_type, - DataType::Primitive(PrimitiveType::Integer) - )); - - let data = r#" - { - "name": "c", - "type": { - "type": "array", - "elementType": "integer", - "containsNull": false - }, - "nullable": true, - "metadata": {} - } - "#; - let field: StructField = serde_json::from_str(data).unwrap(); - assert!(matches!(field.data_type, DataType::Array(_))); - - let data = r#" - { - "name": "e", - "type": { - "type": "array", - "elementType": { - "type": "struct", - "fields": [ - { - "name": "d", - "type": "integer", - "nullable": false, - "metadata": {} - } - ] - }, - "containsNull": true - }, - "nullable": true, - "metadata": {} - } - "#; - let field: StructField = serde_json::from_str(data).unwrap(); - assert!(matches!(field.data_type, DataType::Array(_))); - match field.data_type { - DataType::Array(array) => assert!(matches!(array.element_type, DataType::Struct(_))), - _ => unreachable!(), - } - - let data = r#" - { - "name": "f", - "type": { - "type": "map", - "keyType": "string", - "valueType": "string", - "valueContainsNull": true - }, - "nullable": true, - "metadata": {} - } - "#; - let field: StructField = serde_json::from_str(data).unwrap(); 
- assert!(matches!(field.data_type, DataType::Map(_))); - } - - #[test] - fn test_roundtrip_decimal() { - let data = r#" - { - "name": "a", - "type": "decimal(10, 2)", - "nullable": false, - "metadata": {} - } - "#; - let field: StructField = serde_json::from_str(data).unwrap(); - assert!(matches!( - field.data_type, - DataType::Primitive(PrimitiveType::Decimal(10, 2)) - )); - - let json_str = serde_json::to_string(&field).unwrap(); - assert_eq!( - json_str, - r#"{"name":"a","type":"decimal(10,2)","nullable":false,"metadata":{}}"# - ); - } - - #[test] - fn test_field_metadata() { - let data = r#" - { - "name": "e", - "type": { - "type": "array", - "elementType": { - "type": "struct", - "fields": [ - { - "name": "d", - "type": "integer", - "nullable": false, - "metadata": { - "delta.columnMapping.id": 5, - "delta.columnMapping.physicalName": "col-a7f4159c-53be-4cb0-b81a-f7e5240cfc49" - } - } - ] - }, - "containsNull": true - }, - "nullable": true, - "metadata": { - "delta.columnMapping.id": 4, - "delta.columnMapping.physicalName": "col-5f422f40-de70-45b2-88ab-1d5c90e94db1" - } - } - "#; - let field: StructField = serde_json::from_str(data).unwrap(); - - let col_id = field - .get_config_value(&ColumnMetadataKey::ColumnMappingId) - .unwrap(); - assert!(matches!(col_id, MetadataValue::Number(num) if *num == 4)); - let physical_name = field - .get_config_value(&ColumnMetadataKey::ColumnMappingPhysicalName) - .unwrap(); - assert!( - matches!(physical_name, MetadataValue::String(name) if *name == "col-5f422f40-de70-45b2-88ab-1d5c90e94db1") - ); - } - - #[test] - fn test_read_schemas() { - let file = std::fs::File::open("./tests/serde/schema.json").unwrap(); - let schema: Result = serde_json::from_reader(file); - assert!(schema.is_ok()); - - let file = std::fs::File::open("./tests/serde/checkpoint_schema.json").unwrap(); - let schema: Result = serde_json::from_reader(file); - assert!(schema.is_ok()) - } - #[test] fn test_get_invariants() { let schema: StructType = serde_json::from_value(json!({ @@ -864,6 +198,6 @@ mod tests { #[test] fn test_identity_columns() { let buf = r#"{"type":"struct","fields":[{"name":"ID_D_DATE","type":"long","nullable":true,"metadata":{"delta.identity.start":1,"delta.identity.step":1,"delta.identity.allowExplicitInsert":false}},{"name":"TXT_DateKey","type":"string","nullable":true,"metadata":{}}]}"#; - let schema: StructType = serde_json::from_str(buf).expect("Failed to load"); + let _schema: StructType = serde_json::from_str(buf).expect("Failed to load"); } } diff --git a/crates/core/src/kernel/scalars.rs b/crates/core/src/kernel/scalars.rs new file mode 100644 index 0000000000..92c6838234 --- /dev/null +++ b/crates/core/src/kernel/scalars.rs @@ -0,0 +1,233 @@ +//! Auxiliary methods for dealing with kernel scalars +//! +use arrow_array::Array; +use arrow_schema::TimeUnit; +use chrono::{DateTime, TimeZone, Utc}; +use delta_kernel::{ + expressions::{Scalar, StructData}, + schema::StructField, +}; +use object_store::path::Path; +use std::cmp::Ordering; +use urlencoding::encode; + +use crate::NULL_PARTITION_VALUE_DATA_PATH; + +/// Auxiliary methods for dealing with kernel scalars +pub trait ScalarExt: Sized { + /// Serialize to string + fn serialize(&self) -> String; + /// Serialize to string for use in hive partition file names + fn serialize_encoded(&self) -> String; + /// Create a [`Scalar`] from an arrow array row + fn from_array(arr: &dyn Array, index: usize) -> Option; +} + +impl ScalarExt for Scalar { + /// Serializes this scalar as a string. 
+ fn serialize(&self) -> String { + match self { + Self::String(s) => s.to_owned(), + Self::Byte(b) => b.to_string(), + Self::Short(s) => s.to_string(), + Self::Integer(i) => i.to_string(), + Self::Long(l) => l.to_string(), + Self::Float(f) => f.to_string(), + Self::Double(d) => d.to_string(), + Self::Boolean(b) => if *b { "true" } else { "false" }.to_string(), + Self::TimestampNtz(ts) | Self::Timestamp(ts) => { + let ts = Utc.timestamp_micros(*ts).single().unwrap(); + ts.format("%Y-%m-%d %H:%M:%S%.6f").to_string() + } + Self::Date(days) => { + let date = DateTime::from_timestamp(*days as i64 * 24 * 3600, 0).unwrap(); + date.format("%Y-%m-%d").to_string() + } + Self::Decimal(value, _, scale) => match scale.cmp(&0) { + Ordering::Equal => value.to_string(), + Ordering::Greater => { + let scalar_multiple = 10_i128.pow(*scale as u32); + let mut s = String::new(); + s.push_str((value / scalar_multiple).to_string().as_str()); + s.push('.'); + s.push_str(&format!( + "{:0>scale$}", + value % scalar_multiple, + scale = *scale as usize + )); + s + } + Ordering::Less => { + let mut s = value.to_string(); + for _ in 0..*scale { + s.push('0'); + } + s + } + }, + Self::Binary(val) => create_escaped_binary_string(val.as_slice()), + Self::Null(_) => "null".to_string(), + Self::Struct(_) => unimplemented!(), + } + } + + /// Serializes this scalar as a string for use in hive partition file names. + fn serialize_encoded(&self) -> String { + if self.is_null() { + return NULL_PARTITION_VALUE_DATA_PATH.to_string(); + } + encode(Path::from(self.serialize()).as_ref()).to_string() + } + + /// Create a [`Scalar`] form a row in an arrow array. + fn from_array(arr: &dyn Array, index: usize) -> Option { + use arrow_array::*; + use arrow_schema::DataType::*; + + if arr.len() <= index { + return None; + } + if arr.is_null(index) { + return Some(Self::Null(arr.data_type().try_into().ok()?)); + } + + match arr.data_type() { + Utf8 => arr + .as_any() + .downcast_ref::() + .map(|v| Self::String(v.value(index).to_string())), + LargeUtf8 => arr + .as_any() + .downcast_ref::() + .map(|v| Self::String(v.value(index).to_string())), + Boolean => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Boolean(v.value(index))), + Binary => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Binary(v.value(index).to_vec())), + LargeBinary => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Binary(v.value(index).to_vec())), + FixedSizeBinary(_) => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Binary(v.value(index).to_vec())), + Int8 => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Byte(v.value(index))), + Int16 => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Short(v.value(index))), + Int32 => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Integer(v.value(index))), + Int64 => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Long(v.value(index))), + UInt8 => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Byte(v.value(index) as i8)), + UInt16 => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Short(v.value(index) as i16)), + UInt32 => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Integer(v.value(index) as i32)), + UInt64 => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Long(v.value(index) as i64)), + Float32 => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Float(v.value(index))), + Float64 => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Double(v.value(index))), + Decimal128(precision, scale) => { + arr.as_any().downcast_ref::().map(|v| { + let value = 
v.value(index); + Self::Decimal(value, *precision, *scale as u8) + }) + } + Date32 => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Date(v.value(index))), + Timestamp(TimeUnit::Microsecond, None) => arr + .as_any() + .downcast_ref::() + .map(|v| Self::TimestampNtz(v.value(index))), + Timestamp(TimeUnit::Microsecond, Some(tz)) if tz.eq_ignore_ascii_case("utc") => arr + .as_any() + .downcast_ref::() + .map(|v| Self::Timestamp(v.clone().value(index))), + Struct(fields) => { + let struct_fields = fields + .iter() + .flat_map(|f| TryFrom::try_from(f.as_ref())) + .collect::>(); + let values = arr + .as_any() + .downcast_ref::() + .and_then(|struct_arr| { + struct_fields + .iter() + .map(|f: &StructField| { + struct_arr + .column_by_name(f.name()) + .and_then(|c| Self::from_array(c.as_ref(), index)) + }) + .collect::>>() + })?; + Some(Self::Struct( + StructData::try_new(struct_fields, values).ok()?, + )) + } + Float16 + | Decimal256(_, _) + | List(_) + | LargeList(_) + | FixedSizeList(_, _) + | Map(_, _) + | Date64 + | Timestamp(_, _) + | Time32(_) + | Time64(_) + | Duration(_) + | Interval(_) + | Dictionary(_, _) + | RunEndEncoded(_, _) + | Union(_, _) + | Utf8View + | BinaryView + | ListView(_) + | LargeListView(_) + | Null => None, + } + } +} + +fn create_escaped_binary_string(data: &[u8]) -> String { + let mut escaped_string = String::new(); + for &byte in data { + // Convert each byte to its two-digit hexadecimal representation + let hex_representation = format!("{:04X}", byte); + // Append the hexadecimal representation with an escape sequence + escaped_string.push_str("\\u"); + escaped_string.push_str(&hex_representation); + } + escaped_string +} diff --git a/crates/core/src/kernel/snapshot/log_data.rs b/crates/core/src/kernel/snapshot/log_data.rs index b874b53421..254616691c 100644 --- a/crates/core/src/kernel/snapshot/log_data.rs +++ b/crates/core/src/kernel/snapshot/log_data.rs @@ -1,16 +1,19 @@ use std::borrow::Cow; -use std::collections::{BTreeMap, HashMap}; +use std::collections::HashMap; use std::sync::Arc; use arrow_array::{Array, Int32Array, Int64Array, MapArray, RecordBatch, StringArray, StructArray}; -use chrono::{NaiveDateTime, TimeZone, Utc}; +use chrono::{DateTime, Utc}; +use delta_kernel::expressions::Scalar; +use indexmap::IndexMap; use object_store::path::Path; use object_store::ObjectMeta; use percent_encoding::percent_decode_str; +use super::super::scalars::ScalarExt; use crate::kernel::arrow::extract::{extract_and_cast, extract_and_cast_opt}; use crate::kernel::{ - DataType, DeletionVectorDescriptor, Metadata, Remove, Scalar, StructField, StructType, + DataType, DeletionVectorDescriptor, Metadata, Remove, StructField, StructType, }; use crate::{DeltaResult, DeltaTableError}; @@ -19,37 +22,35 @@ const COL_MIN_VALUES: &str = "minValues"; const COL_MAX_VALUES: &str = "maxValues"; const COL_NULL_COUNT: &str = "nullCount"; -pub(crate) type PartitionFields<'a> = Arc>; -pub(crate) type PartitionValues<'a> = BTreeMap<&'a str, Scalar>; +pub(crate) type PartitionFields<'a> = Arc>; +pub(crate) type PartitionValues<'a> = IndexMap<&'a str, Scalar>; pub(crate) trait PartitionsExt { fn hive_partition_path(&self) -> String; } -impl PartitionsExt for BTreeMap<&str, Scalar> { +impl PartitionsExt for IndexMap<&str, Scalar> { fn hive_partition_path(&self) -> String { - let mut fields = self + let fields = self .iter() .map(|(k, v)| { let encoded = v.serialize_encoded(); format!("{k}={encoded}") }) .collect::>(); - fields.reverse(); fields.join("/") } } -impl PartitionsExt for 
BTreeMap { +impl PartitionsExt for IndexMap { fn hive_partition_path(&self) -> String { - let mut fields = self + let fields = self .iter() .map(|(k, v)| { let encoded = v.serialize_encoded(); format!("{k}={encoded}") }) .collect::>(); - fields.reverse(); fields.join("/") } } @@ -121,9 +122,9 @@ impl<'a> DeletionVectorView<'a> { } } -/// A view into the log data representiang a single logical file. +/// A view into the log data representing a single logical file. /// -/// This stuct holds a pointer to a specific row in the log data and provides access to the +/// This struct holds a pointer to a specific row in the log data and provides access to the /// information stored in that row by tracking references to the underlying arrays. /// /// Additionally, references to some table metadata is tracked to provide higher level @@ -179,20 +180,18 @@ impl LogicalFile<'_> { /// Datetime of the last modification time of the file. pub fn modification_datetime(&self) -> DeltaResult> { - Ok(Utc.from_utc_datetime( - &NaiveDateTime::from_timestamp_millis(self.modification_time()).ok_or( - DeltaTableError::from(crate::protocol::ProtocolError::InvalidField(format!( - "invalid modification_time: {:?}", - self.modification_time() - ))), - )?, + DateTime::from_timestamp_millis(self.modification_time()).ok_or(DeltaTableError::from( + crate::protocol::ProtocolError::InvalidField(format!( + "invalid modification_time: {:?}", + self.modification_time() + )), )) } /// The partition values for this logical file. pub fn partition_values(&self) -> DeltaResult> { if self.partition_fields.is_empty() { - return Ok(BTreeMap::new()); + return Ok(IndexMap::new()); } let map_value = self.partition_values.value(self.index); let keys = map_value @@ -237,7 +236,7 @@ impl LogicalFile<'_> { .unwrap_or(Scalar::Null(f.data_type.clone())); Ok((*k, val)) }) - .collect::>>() + .collect::>>() } /// Defines a deletion vector @@ -354,8 +353,17 @@ impl<'a> FileStatsAccessor<'a> { metadata .partition_columns .iter() - .map(|c| Ok((c.as_str(), schema.field_with_name(c.as_str())?))) - .collect::>>()?, + .map(|c| { + Ok(( + c.as_str(), + schema + .field(c.as_str()) + .ok_or(DeltaTableError::PartitionError { + partition: c.clone(), + })?, + )) + }) + .collect::>>()?, ); let deletion_vector = extract_and_cast_opt::(data, "add.deletionVector"); let deletion_vector = deletion_vector.and_then(|dv| { @@ -476,7 +484,7 @@ mod datafusion { use super::*; use crate::kernel::arrow::extract::{extract_and_cast_opt, extract_column}; - // TODO validate this works with "wide and narrow" boulds / stats + // TODO validate this works with "wide and narrow" builds / stats impl FileStatsAccessor<'_> { fn collect_count(&self, name: &str) -> Precision { @@ -550,7 +558,15 @@ mod datafusion { _ => None, }) .collect::>>() - .map(|o| Precision::Exact(ScalarValue::Struct(Some(o), fields.clone()))) + .map(|o| { + let arrays = o + .into_iter() + .map(|sv| sv.to_array()) + .collect::, datafusion_common::DataFusionError>>() + .unwrap(); + let sa = StructArray::new(fields.clone(), arrays, None); + Precision::Exact(ScalarValue::Struct(Arc::new(sa))) + }) .unwrap_or(Precision::Absent); } _ => Precision::Absent, @@ -665,7 +681,6 @@ mod datafusion { let column_statistics = self .schema .fields() - .iter() .map(|f| self.column_stats(f.name())) .collect::>>()?; Some(Statistics { diff --git a/crates/core/src/kernel/snapshot/log_segment.rs b/crates/core/src/kernel/snapshot/log_segment.rs index 6ad1690db1..69076bd066 100644 --- a/crates/core/src/kernel/snapshot/log_segment.rs +++ 
b/crates/core/src/kernel/snapshot/log_segment.rs @@ -1,5 +1,5 @@ use std::cmp::Ordering; -use std::collections::{HashMap, VecDeque}; +use std::collections::VecDeque; use std::sync::Arc; use arrow_array::RecordBatch; @@ -9,34 +9,25 @@ use itertools::Itertools; use lazy_static::lazy_static; use object_store::path::Path; use object_store::{Error as ObjectStoreError, ObjectMeta, ObjectStore}; -use parquet::arrow::arrow_reader::ArrowReaderOptions; +use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions}; use parquet::arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStreamBuilder}; +use parquet::arrow::ProjectionMask; use regex::Regex; use serde::{Deserialize, Serialize}; -use serde_json::Value; use tracing::debug; use super::parse; -use crate::kernel::{arrow::json, Action, ActionType, Metadata, Protocol, Schema, StructType}; +use crate::kernel::{arrow::json, ActionType, Metadata, Protocol, Schema, StructType}; use crate::logstore::LogStore; -use crate::operations::transaction::get_commit_bytes; -use crate::protocol::DeltaOperation; +use crate::operations::transaction::CommitData; use crate::{DeltaResult, DeltaTableConfig, DeltaTableError}; const LAST_CHECKPOINT_FILE_NAME: &str = "_last_checkpoint"; -pub type CommitData = (Vec, DeltaOperation, Option>); - lazy_static! { static ref CHECKPOINT_FILE_PATTERN: Regex = Regex::new(r"\d+\.checkpoint(\.\d+\.\d+)?\.parquet").unwrap(); - static ref DELTA_FILE_PATTERN: Regex = Regex::new(r"\d+\.json").unwrap(); - pub(super) static ref COMMIT_SCHEMA: StructType = StructType::new(vec![ - ActionType::Add.schema_field().clone(), - ActionType::Remove.schema_field().clone(), - ]); - pub(super) static ref CHECKPOINT_SCHEMA: StructType = - StructType::new(vec![ActionType::Add.schema_field().clone(),]); + static ref DELTA_FILE_PATTERN: Regex = Regex::new(r"^\d+\.json$").unwrap(); pub(super) static ref TOMBSTONE_SCHEMA: StructType = StructType::new(vec![ActionType::Remove.schema_field().clone(),]); } @@ -260,19 +251,45 @@ impl LogSegment { pub(super) fn checkpoint_stream( &self, store: Arc, - _read_schema: &Schema, + read_schema: &Schema, config: &DeltaTableConfig, ) -> BoxStream<'_, DeltaResult> { let batch_size = config.log_batch_size; + let read_schema = Arc::new(read_schema.clone()); futures::stream::iter(self.checkpoint_files.clone()) .map(move |meta| { let store = store.clone(); + let read_schema = read_schema.clone(); async move { - let reader = ParquetObjectReader::new(store, meta); - let options = ArrowReaderOptions::new(); //.with_page_index(enable_page_index); - let builder = - ParquetRecordBatchStreamBuilder::new_with_options(reader, options).await?; - builder.with_batch_size(batch_size).build() + let mut reader = ParquetObjectReader::new(store, meta); + let options = ArrowReaderOptions::new(); + let reader_meta = ArrowReaderMetadata::load_async(&mut reader, options).await?; + + // Create projection selecting read_schema fields from parquet file's arrow schema + let projection = reader_meta + .schema() + .fields + .iter() + .enumerate() + .filter_map(|(i, f)| { + if read_schema.fields.contains_key(f.name()) { + Some(i) + } else { + None + } + }) + .collect::>(); + let projection = + ProjectionMask::roots(reader_meta.parquet_schema(), projection); + + // Note: the output batch stream batches have all null value rows for action types not + // present in the projection. When a RowFilter was used to remove null rows, the performance + // got worse when projecting all fields, and was no better when projecting a subset. 
+ // The all null rows are filtered out anyway when the batch stream is consumed. + ParquetRecordBatchStreamBuilder::new_with_metadata(reader, reader_meta) + .with_projection(projection.clone()) + .with_batch_size(batch_size) + .build() } }) .buffered(config.log_buffer_size) @@ -351,10 +368,10 @@ impl LogSegment { let mut decoder = json::get_decoder(Arc::new(read_schema.try_into()?), config)?; let mut commit_data = Vec::new(); - for (actions, operation, app_metadata) in commits { + for commit in commits { self.version += 1; let path = log_path.child(format!("{:020}.json", self.version)); - let bytes = get_commit_bytes(operation, actions, app_metadata.clone())?; + let bytes = commit.get_bytes()?; let meta = ObjectMeta { location: path, size: bytes.len(), @@ -383,13 +400,13 @@ struct CheckpointMetadata { #[allow(unreachable_pub)] // used by acceptance tests (TODO make an fn accessor?) pub version: i64, /// The number of actions that are stored in the checkpoint. - pub(crate) size: i32, + pub(crate) size: i64, /// The number of fragments if the last checkpoint was written in multiple parts. pub(crate) parts: Option, /// The number of bytes of the checkpoint. - pub(crate) size_in_bytes: Option, + pub(crate) size_in_bytes: Option, /// The number of AddFile actions in the checkpoint. - pub(crate) num_of_add_files: Option, + pub(crate) num_of_add_files: Option, /// The schema of the checkpoint file. pub(crate) checkpoint_schema: Option, /// The checksum of the last checkpoint JSON. @@ -449,7 +466,7 @@ async fn list_log_files_with_checkpoint( let checkpoint_files = files .iter() .filter_map(|f| { - if f.location.is_checkpoint_file() { + if f.location.is_checkpoint_file() && f.location.commit_version() == Some(cp.version) { Some(f.clone()) } else { None @@ -457,10 +474,16 @@ async fn list_log_files_with_checkpoint( }) .collect_vec(); - // TODO raise a proper error - assert_eq!(checkpoint_files.len(), cp.parts.unwrap_or(1) as usize); - - Ok((commit_files, checkpoint_files)) + if checkpoint_files.len() != cp.parts.unwrap_or(1) as usize { + let msg = format!( + "Number of checkpoint files '{}' is not equal to number of checkpoint metadata parts '{:?}'", + checkpoint_files.len(), + cp.parts + ); + Err(DeltaTableError::MetadataError(msg)) + } else { + Ok((commit_files, checkpoint_files)) + } } /// List relevant log files. @@ -516,6 +539,15 @@ pub(super) async fn list_log_files( #[cfg(test)] pub(super) mod tests { use deltalake_test::utils::*; + use tokio::task::JoinHandle; + + use crate::{ + checkpoints::{create_checkpoint_for, create_checkpoint_from_table_uri_and_cleanup}, + kernel::{Action, Add, Format, Remove}, + operations::transaction::{CommitBuilder, TableReference}, + protocol::{DeltaOperation, SaveMode}, + DeltaTableBuilder, + }; use super::*; @@ -617,4 +649,215 @@ pub(super) mod tests { Ok(()) } + + pub(crate) async fn concurrent_checkpoint(context: &IntegrationContext) -> TestResult { + context + .load_table(TestTables::LatestNotCheckpointed) + .await?; + let table_to_checkpoint = context + .table_builder(TestTables::LatestNotCheckpointed) + .load() + .await?; + let store = context + .table_builder(TestTables::LatestNotCheckpointed) + .build_storage()? 
+ .object_store(); + let slow_list_store = Arc::new(slow_store::SlowListStore { store }); + + let version = table_to_checkpoint.version(); + let load_task: JoinHandle> = tokio::spawn(async move { + let segment = + LogSegment::try_new(&Path::default(), Some(version), slow_list_store.as_ref()) + .await?; + Ok(segment) + }); + + create_checkpoint_from_table_uri_and_cleanup( + &table_to_checkpoint.table_uri(), + version, + Some(false), + ) + .await?; + + let segment = load_task.await??; + assert_eq!(segment.version, version); + + Ok(()) + } + + mod slow_store { + use std::sync::Arc; + + use futures::stream::BoxStream; + use object_store::{ + path::Path, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, + ObjectStore, PutMultipartOpts, PutOptions, PutPayload, PutResult, Result, + }; + + #[derive(Debug)] + pub(super) struct SlowListStore { + pub store: Arc, + } + + impl std::fmt::Display for SlowListStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "SlowListStore {{ store: {} }}", self.store) + } + } + + #[async_trait::async_trait] + impl object_store::ObjectStore for SlowListStore { + async fn put_opts( + &self, + location: &Path, + bytes: PutPayload, + opts: PutOptions, + ) -> Result { + self.store.put_opts(location, bytes, opts).await + } + async fn put_multipart(&self, location: &Path) -> Result> { + self.store.put_multipart(location).await + } + + async fn put_multipart_opts( + &self, + location: &Path, + opts: PutMultipartOpts, + ) -> Result> { + self.store.put_multipart_opts(location, opts).await + } + + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + self.store.get_opts(location, options).await + } + + async fn delete(&self, location: &Path) -> Result<()> { + self.store.delete(location).await + } + + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + std::thread::sleep(std::time::Duration::from_secs(1)); + self.store.list(prefix) + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + self.store.list_with_delimiter(prefix).await + } + + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { + self.store.copy(from, to).await + } + + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + self.store.copy_if_not_exists(from, to).await + } + } + } + + #[test] + pub fn is_commit_file_only_matches_commits() { + for path in [0, 1, 5, 10, 100, i64::MAX] + .into_iter() + .map(crate::storage::commit_uri_from_version) + { + assert!(path.is_commit_file()); + } + + let not_commits = ["_delta_log/_commit_2132c4fe-4077-476c-b8f5-e77fea04f170.json.tmp"]; + + for not_commit in not_commits { + let path = Path::from(not_commit); + assert!(!path.is_commit_file()); + } + } + + #[tokio::test] + async fn test_checkpoint_stream_parquet_read() { + let metadata = Metadata { + id: "test".to_string(), + format: Format::new("parquet".to_string(), None), + schema_string: r#"{"type":"struct", "fields": []}"#.to_string(), + ..Default::default() + }; + let protocol = Protocol::default(); + + let mut actions = vec![Action::Metadata(metadata), Action::Protocol(protocol)]; + for i in 0..10 { + actions.push(Action::Add(Add { + path: format!("part-{}.parquet", i), + modification_time: chrono::Utc::now().timestamp_millis() as i64, + ..Default::default() + })); + } + + let log_store = DeltaTableBuilder::from_uri("memory:///".to_string()) + .build_storage() + .unwrap(); + let op = DeltaOperation::Write { + mode: SaveMode::Overwrite, + partition_by: None, + predicate: None, + }; 
+ let commit = CommitBuilder::default() + .with_actions(actions) + .build(None, log_store.clone(), op) + .await + .unwrap(); + + let mut actions = Vec::new(); + // remove all but one file + for i in 0..9 { + actions.push(Action::Remove(Remove { + path: format!("part-{}.parquet", i), + deletion_timestamp: Some(chrono::Utc::now().timestamp_millis() as i64), + ..Default::default() + })) + } + + let op = DeltaOperation::Delete { predicate: None }; + let table_data = &commit.snapshot as &dyn TableReference; + let commit = CommitBuilder::default() + .with_actions(actions) + .build(Some(table_data), log_store.clone(), op) + .await + .unwrap(); + + create_checkpoint_for(commit.version, &commit.snapshot, log_store.as_ref()) + .await + .unwrap(); + + let batches = LogSegment::try_new( + &Path::default(), + Some(commit.version), + log_store.object_store().as_ref(), + ) + .await + .unwrap() + .checkpoint_stream( + log_store.object_store(), + &StructType::new(vec![ + ActionType::Metadata.schema_field().clone(), + ActionType::Protocol.schema_field().clone(), + ActionType::Add.schema_field().clone(), + ]), + &Default::default(), + ) + .try_collect::>() + .await + .unwrap(); + + let batch = arrow::compute::concat_batches(&batches[0].schema(), batches.iter()).unwrap(); + + // there are 9 remove action rows but all columns are null + // because the removes are not projected in the schema + // these get filtered out upstream and there was no perf + // benefit when applying a row filter + // in addition there is 1 add, 1 metadata, and 1 protocol row + assert_eq!(batch.num_rows(), 12); + + assert_eq!(batch.schema().fields().len(), 3); + assert!(batch.schema().field_with_name("metaData").is_ok()); + assert!(batch.schema().field_with_name("protocol").is_ok()); + assert!(batch.schema().field_with_name("add").is_ok()); + } } diff --git a/crates/core/src/kernel/snapshot/mod.rs b/crates/core/src/kernel/snapshot/mod.rs index d12018c245..d34b78fbed 100644 --- a/crates/core/src/kernel/snapshot/mod.rs +++ b/crates/core/src/kernel/snapshot/mod.rs @@ -15,6 +15,7 @@ //! //! 
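A note on the `CommitData` type that the log-segment code above now consumes in place of the old `(actions, operation, app_metadata)` tuple alias: it travels through `advance` as `commit.actions` and is serialized with `commit.get_bytes()`. The following is a minimal sketch of assembling one, based on the four-argument constructor used in the tests in this diff (actions, operation, app metadata, app transactions); the file path, app id and version are hypothetical placeholders, not values from this PR.

use std::collections::HashMap;

use crate::kernel::{Action, Add, Transaction};
use crate::operations::transaction::CommitData;
use crate::protocol::{DeltaOperation, SaveMode};

fn example_commit_payload() -> CommitData {
    // One new data file for this commit (path is a placeholder).
    let add = Action::Add(Add {
        path: "part-00000.parquet".to_string(),
        modification_time: chrono::Utc::now().timestamp_millis(),
        ..Default::default()
    });
    let operation = DeltaOperation::Write {
        mode: SaveMode::Append,
        partition_by: None,
        predicate: None,
    };
    // App-level commit metadata and idempotent app transactions now travel
    // with the commit payload itself rather than as a separate Option argument.
    CommitData::new(
        vec![add],
        operation,
        HashMap::new(),
        vec![Transaction::new("my-etl-app", 42)],
    )
}

When the snapshot's `advance` path further below replays such commits, it reads `commit.actions` directly and writes the JSON produced by `commit.get_bytes()` into the log.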
+use std::collections::{HashMap, HashSet}; use std::io::{BufRead, BufReader, Cursor}; use std::sync::Arc; @@ -25,22 +26,29 @@ use futures::{StreamExt, TryStreamExt}; use object_store::path::Path; use object_store::ObjectStore; -use self::log_segment::{CommitData, LogSegment, PathExt}; +use self::log_segment::{LogSegment, PathExt}; use self::parse::{read_adds, read_removes}; use self::replay::{LogMapper, LogReplayScanner, ReplayStream}; -use super::{Action, Add, CommitInfo, DataType, Metadata, Protocol, Remove, StructField}; -use crate::kernel::StructType; +use self::visitors::*; +use super::{ + Action, Add, AddCDCFile, CommitInfo, DataType, Metadata, Protocol, Remove, StructField, + Transaction, +}; +use crate::kernel::parse::read_cdf_adds; +use crate::kernel::{ActionType, StructType}; use crate::logstore::LogStore; +use crate::operations::transaction::CommitData; use crate::table::config::TableConfig; use crate::{DeltaResult, DeltaTableConfig, DeltaTableError}; +pub use self::log_data::*; + mod log_data; mod log_segment; pub(crate) mod parse; mod replay; mod serde; - -pub use log_data::*; +mod visitors; /// A snapshot of a Delta table #[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] @@ -196,21 +204,39 @@ impl Snapshot { } /// Get the files in the snapshot - pub fn files( + pub fn files<'a>( &self, store: Arc, - ) -> DeltaResult>>> { + visitors: &'a mut Vec>, + ) -> DeltaResult>>> { + let mut schema_actions: HashSet<_> = + visitors.iter().flat_map(|v| v.required_actions()).collect(); + + schema_actions.insert(ActionType::Add); + let checkpoint_stream = self.log_segment.checkpoint_stream( + store.clone(), + &StructType::new( + schema_actions + .iter() + .map(|a| a.schema_field().clone()) + .collect(), + ), + &self.config, + ); + + schema_actions.insert(ActionType::Remove); let log_stream = self.log_segment.commit_stream( store.clone(), - &log_segment::COMMIT_SCHEMA, + &StructType::new( + schema_actions + .iter() + .map(|a| a.schema_field().clone()) + .collect(), + ), &self.config, )?; - let checkpoint_stream = self.log_segment.checkpoint_stream( - store, - &log_segment::CHECKPOINT_SCHEMA, - &self.config, - ); - ReplayStream::try_new(log_stream, checkpoint_stream, &self) + + ReplayStream::try_new(log_stream, checkpoint_stream, self, visitors) } /// Get the commit infos in the snapshot @@ -283,12 +309,14 @@ impl Snapshot { } /// Get the statistics schema of the snapshot - pub fn stats_schema(&self) -> DeltaResult { + pub fn stats_schema(&self, table_schema: Option<&StructType>) -> DeltaResult { + let schema = table_schema.unwrap_or_else(|| self.schema()); + let stats_fields = if let Some(stats_cols) = self.table_config().stats_columns() { stats_cols .iter() - .map(|col| match self.schema().field_with_name(col) { - Ok(field) => match field.data_type() { + .map(|col| match schema.field(col) { + Some(field) => match field.data_type() { DataType::Map(_) | DataType::Array(_) | &DataType::BINARY => { Err(DeltaTableError::Generic(format!( "Stats column {} has unsupported type {}", @@ -310,17 +338,11 @@ impl Snapshot { .collect::, _>>()? 
} else { let num_indexed_cols = self.table_config().num_indexed_cols(); - self.schema() + schema .fields - .iter() + .values() .enumerate() - .filter_map(|(idx, f)| match f.data_type() { - DataType::Map(_) | DataType::Array(_) | &DataType::BINARY => None, - _ if num_indexed_cols < 0 || (idx as i32) < num_indexed_cols => { - Some(StructField::new(f.name(), f.data_type().clone(), true)) - } - _ => None, - }) + .filter_map(|(idx, f)| stats_field(idx, num_indexed_cols, f)) .collect() }; Ok(StructType::new(vec![ @@ -340,6 +362,11 @@ impl Snapshot { #[derive(Debug, Clone, PartialEq)] pub struct EagerSnapshot { snapshot: Snapshot, + // additional actions that should be tracked during log replay. + tracked_actions: HashSet, + + transactions: Option>, + // NOTE: this is a Vec of RecordBatch instead of a single RecordBatch because // we do not yet enforce a consistent schema across all batches we read from the log. files: Vec, @@ -353,9 +380,51 @@ impl EagerSnapshot { config: DeltaTableConfig, version: Option, ) -> DeltaResult { + Self::try_new_with_visitor(table_root, store, config, version, Default::default()).await + } + + /// Create a new [`EagerSnapshot`] instance + pub async fn try_new_with_visitor( + table_root: &Path, + store: Arc, + config: DeltaTableConfig, + version: Option, + tracked_actions: HashSet, + ) -> DeltaResult { + let mut visitors = tracked_actions + .iter() + .flat_map(get_visitor) + .collect::>(); let snapshot = Snapshot::try_new(table_root, store.clone(), config, version).await?; - let files = snapshot.files(store)?.try_collect().await?; - Ok(Self { snapshot, files }) + let files = snapshot.files(store, &mut visitors)?.try_collect().await?; + + let mut sn = Self { + snapshot, + files, + tracked_actions, + transactions: None, + }; + + sn.process_visitors(visitors)?; + + Ok(sn) + } + + fn process_visitors(&mut self, visitors: Vec>) -> DeltaResult<()> { + for visitor in visitors { + if let Some(tv) = visitor + .as_ref() + .as_any() + .downcast_ref::() + { + if self.transactions.is_none() { + self.transactions = Some(tv.app_transaction_version.clone()); + } else { + self.transactions = Some(tv.merge(self.transactions.as_ref().unwrap())); + } + } + } + Ok(()) } #[cfg(test)] @@ -364,16 +433,21 @@ impl EagerSnapshot { let mut files = Vec::new(); let mut scanner = LogReplayScanner::new(); files.push(scanner.process_files_batch(&batch, true)?); - let mapper = LogMapper::try_new(&snapshot)?; + let mapper = LogMapper::try_new(&snapshot, None)?; files = files .into_iter() .map(|b| mapper.map_batch(b)) .collect::>>()?; - Ok(Self { snapshot, files }) + Ok(Self { + snapshot, + files, + tracked_actions: Default::default(), + transactions: None, + }) } /// Update the snapshot to the given version - pub async fn update( + pub async fn update<'a>( &mut self, log_store: Arc, target_version: Option, @@ -381,36 +455,71 @@ impl EagerSnapshot { if Some(self.version()) == target_version { return Ok(()); } + let new_slice = self .snapshot .update_inner(log_store.clone(), target_version) .await?; - if let Some(new_slice) = new_slice { - let files = std::mem::take(&mut self.files); - let log_stream = new_slice.commit_stream( - log_store.object_store().clone(), - &log_segment::COMMIT_SCHEMA, - &self.snapshot.config, - )?; - let checkpoint_stream = if new_slice.checkpoint_files.is_empty() { - futures::stream::iter(files.into_iter().map(Ok)).boxed() - } else { - new_slice - .checkpoint_stream( - log_store.object_store(), - &log_segment::CHECKPOINT_SCHEMA, - &self.snapshot.config, - ) - .boxed() - }; - let 
mapper = LogMapper::try_new(&self.snapshot)?; - let files = ReplayStream::try_new(log_stream, checkpoint_stream, &self.snapshot)? + + if new_slice.is_none() { + return Ok(()); + } + let new_slice = new_slice.unwrap(); + + let mut visitors = self + .tracked_actions + .iter() + .flat_map(get_visitor) + .collect::<Vec<_>>(); + + let mut schema_actions: HashSet<_> = + visitors.iter().flat_map(|v| v.required_actions()).collect(); + let files = std::mem::take(&mut self.files); + + schema_actions.insert(ActionType::Add); + let checkpoint_stream = if new_slice.checkpoint_files.is_empty() { + // NOTE: we don't need to add the visitor-relevant data here, as it is represented in the state already + futures::stream::iter(files.into_iter().map(Ok)).boxed() + } else { + let read_schema = StructType::new( + schema_actions + .iter() + .map(|a| a.schema_field().clone()) + .collect(), + ); + new_slice + .checkpoint_stream( + log_store.object_store(), + &read_schema, + &self.snapshot.config, + ) + .boxed() + }; + + schema_actions.insert(ActionType::Remove); + let read_schema = StructType::new( + schema_actions + .iter() + .map(|a| a.schema_field().clone()) + .collect(), + ); + let log_stream = new_slice.commit_stream( + log_store.object_store().clone(), + &read_schema, + &self.snapshot.config, + )?; + + let mapper = LogMapper::try_new(&self.snapshot, None)?; + + let files = + ReplayStream::try_new(log_stream, checkpoint_stream, &self.snapshot, &mut visitors)? .map(|batch| batch.and_then(|b| mapper.map_batch(b))) .try_collect() .await?; - self.files = files; - } + self.files = files; + self.process_visitors(visitors)?; + Ok(()) } @@ -477,6 +586,22 @@ impl EagerSnapshot { self.log_data().into_iter() } + /// Get an iterator for the CDC files added in this version + pub fn cdc_files(&self) -> DeltaResult<impl Iterator<Item = AddCDCFile> + '_> { + Ok(self.files.iter().flat_map(|b| read_cdf_adds(b)).flatten()) + } + + /// Iterate over all latest app transactions + pub fn transactions(&self) -> DeltaResult<impl Iterator<Item = Transaction> + '_> { + self.transactions + .as_ref() + .map(|t| t.values().cloned()) + .ok_or(DeltaTableError::Generic( + "Transactions are not available. Please enable tracking of transactions."
+ .to_string(), + )) + } + /// Advance the snapshot based on the given commit actions pub fn advance<'a>( &mut self, @@ -487,23 +612,38 @@ impl EagerSnapshot { let mut send = Vec::new(); for commit in commits { if metadata.is_none() { - metadata = commit.0.iter().find_map(|a| match a { + metadata = commit.actions.iter().find_map(|a| match a { Action::Metadata(metadata) => Some(metadata.clone()), _ => None, }); } if protocol.is_none() { - protocol = commit.0.iter().find_map(|a| match a { + protocol = commit.actions.iter().find_map(|a| match a { Action::Protocol(protocol) => Some(protocol.clone()), _ => None, }); } send.push(commit); } + + let mut visitors = self + .tracked_actions + .iter() + .flat_map(get_visitor) + .collect::>(); + let mut schema_actions: HashSet<_> = + visitors.iter().flat_map(|v| v.required_actions()).collect(); + schema_actions.extend([ActionType::Add, ActionType::Remove]); + let read_schema = StructType::new( + schema_actions + .iter() + .map(|a| a.schema_field().clone()) + .collect(), + ); let actions = self.snapshot.log_segment.advance( send, &self.table_root(), - &log_segment::COMMIT_SCHEMA, + &read_schema, &self.snapshot.config, )?; @@ -511,10 +651,20 @@ impl EagerSnapshot { let mut scanner = LogReplayScanner::new(); for batch in actions { - files.push(scanner.process_files_batch(&batch?, true)?); + let batch = batch?; + files.push(scanner.process_files_batch(&batch, true)?); + for visitor in &mut visitors { + visitor.visit_batch(&batch)?; + } } - let mapper = LogMapper::try_new(&self.snapshot)?; + let mapper = if let Some(metadata) = &metadata { + let new_schema: StructType = serde_json::from_str(&metadata.schema_string)?; + LogMapper::try_new(&self.snapshot, Some(&new_schema))? + } else { + LogMapper::try_new(&self.snapshot, None)? 
+ }; + self.files = files .into_iter() .chain( @@ -532,24 +682,44 @@ impl EagerSnapshot { if let Some(protocol) = protocol { self.snapshot.protocol = protocol; } + self.process_visitors(visitors)?; Ok(self.snapshot.version()) } } -fn to_count_field(field: &StructField) -> Option { +fn stats_field(idx: usize, num_indexed_cols: i32, field: &StructField) -> Option { + if !(num_indexed_cols < 0 || (idx as i32) < num_indexed_cols) { + return None; + } match field.data_type() { DataType::Map(_) | DataType::Array(_) | &DataType::BINARY => None, - DataType::Struct(s) => Some(StructField::new( + DataType::Struct(dt_struct) => Some(StructField::new( field.name(), StructType::new( - s.fields() - .iter() - .filter_map(to_count_field) - .collect::>(), + dt_struct + .fields() + .flat_map(|f| stats_field(idx, num_indexed_cols, f)) + .collect(), ), true, )), + DataType::Primitive(_) => Some(StructField::new( + field.name(), + field.data_type.clone(), + true, + )), + } +} + +fn to_count_field(field: &StructField) -> Option { + match field.data_type() { + DataType::Map(_) | DataType::Array(_) | &DataType::BINARY => None, + DataType::Struct(s) => Some(StructField::new( + field.name(), + StructType::new(s.fields().filter_map(to_count_field).collect::>()), + true, + )), _ => Some(StructField::new(field.name(), DataType::LONG, true)), } } @@ -570,12 +740,14 @@ mod datafusion { #[cfg(test)] mod tests { + use std::collections::HashMap; + use chrono::Utc; use deltalake_test::utils::*; use futures::TryStreamExt; use itertools::Itertools; - use super::log_segment::tests::test_log_segment; + use super::log_segment::tests::{concurrent_checkpoint, test_log_segment}; use super::replay::tests::test_log_replay; use super::*; use crate::kernel::Remove; @@ -597,6 +769,13 @@ mod tests { Ok(()) } + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn test_concurrent_checkpoint() -> TestResult { + let context = IntegrationContext::new(Box::::default())?; + concurrent_checkpoint(&context).await?; + Ok(()) + } + async fn test_snapshot(context: &IntegrationContext) -> TestResult { let store = context .table_builder(TestTables::Simple) @@ -630,7 +809,7 @@ mod tests { assert_eq!(tombstones.len(), 31); let batches = snapshot - .files(store.clone())? + .files(store.clone(), &mut vec![])? .try_collect::>() .await?; let expected = [ @@ -660,7 +839,7 @@ mod tests { ) .await?; let batches = snapshot - .files(store.clone())? + .files(store.clone(), &mut vec![])? 
.try_collect::>() .await?; let num_files = batches.iter().map(|b| b.num_rows() as i64).sum::(); @@ -750,14 +929,17 @@ mod tests { }) .collect_vec(); - let actions = vec![( + let operation = DeltaOperation::Write { + mode: SaveMode::Append, + partition_by: None, + predicate: None, + }; + + let actions = vec![CommitData::new( removes, - DeltaOperation::Write { - mode: SaveMode::Append, - partition_by: None, - predicate: None, - }, - None, + operation, + HashMap::new(), + Vec::new(), )]; let new_version = snapshot.advance(&actions)?; diff --git a/crates/core/src/kernel/snapshot/parse.rs b/crates/core/src/kernel/snapshot/parse.rs index 0070880c9b..a3ccef1902 100644 --- a/crates/core/src/kernel/snapshot/parse.rs +++ b/crates/core/src/kernel/snapshot/parse.rs @@ -6,7 +6,7 @@ use arrow_array::{ use percent_encoding::percent_decode_str; use crate::kernel::arrow::extract::{self as ex, ProvidesColumnByName}; -use crate::kernel::{Add, DeletionVectorDescriptor, Metadata, Protocol, Remove}; +use crate::kernel::{Add, AddCDCFile, DeletionVectorDescriptor, Metadata, Protocol, Remove}; use crate::{DeltaResult, DeltaTableError}; pub(super) fn read_metadata(batch: &dyn ProvidesColumnByName) -> DeltaResult> { @@ -134,6 +134,39 @@ pub(super) fn read_adds(array: &dyn ProvidesColumnByName) -> DeltaResult DeltaResult> { + let mut result = Vec::new(); + + if let Some(arr) = ex::extract_and_cast_opt::(array, "cdc") { + let path = ex::extract_and_cast::(arr, "path")?; + let pvs = ex::extract_and_cast_opt::(arr, "partitionValues"); + let size = ex::extract_and_cast::(arr, "size")?; + let data_change = ex::extract_and_cast::(arr, "dataChange")?; + let tags = ex::extract_and_cast_opt::(arr, "tags"); + + for i in 0..arr.len() { + if arr.is_valid(i) { + let path_ = ex::read_str(path, i)?; + let path_ = percent_decode_str(path_) + .decode_utf8() + .map_err(|_| DeltaTableError::Generic("illegal path encoding".into()))? + .to_string(); + result.push(AddCDCFile { + path: path_, + size: ex::read_primitive(size, i)?, + data_change: ex::read_bool(data_change, i)?, + partition_values: pvs + .and_then(|pv| collect_map(&pv.value(i)).map(|m| m.collect())) + .unwrap_or_default(), + tags: tags.and_then(|t| collect_map(&t.value(i)).map(|m| m.collect())), + }); + } + } + } + + Ok(result) +} + pub(super) fn read_removes(array: &dyn ProvidesColumnByName) -> DeltaResult> { let mut result = Vec::new(); diff --git a/crates/core/src/kernel/snapshot/replay.rs b/crates/core/src/kernel/snapshot/replay.rs index 71408b27d5..3efd9584e2 100644 --- a/crates/core/src/kernel/snapshot/replay.rs +++ b/crates/core/src/kernel/snapshot/replay.rs @@ -21,16 +21,20 @@ use tracing::debug; use crate::kernel::arrow::extract::{self as ex, ProvidesColumnByName}; use crate::kernel::arrow::json; +use crate::kernel::StructType; use crate::{DeltaResult, DeltaTableConfig, DeltaTableError}; +use super::ReplayVisitor; use super::Snapshot; pin_project! { - pub struct ReplayStream { + pub struct ReplayStream<'a, S> { scanner: LogReplayScanner, mapper: Arc, + visitors: &'a mut Vec>, + #[pin] commits: S, @@ -39,9 +43,14 @@ pin_project! 
{ } } -impl ReplayStream { - pub(super) fn try_new(commits: S, checkpoint: S, snapshot: &Snapshot) -> DeltaResult { - let stats_schema = Arc::new((&snapshot.stats_schema()?).try_into()?); +impl<'a, S> ReplayStream<'a, S> { + pub(super) fn try_new( + commits: S, + checkpoint: S, + snapshot: &Snapshot, + visitors: &'a mut Vec>, + ) -> DeltaResult { + let stats_schema = Arc::new((&snapshot.stats_schema(None)?).try_into()?); let mapper = Arc::new(LogMapper { stats_schema, config: snapshot.config.clone(), @@ -50,6 +59,7 @@ impl ReplayStream { commits, checkpoint, mapper, + visitors, scanner: LogReplayScanner::new(), }) } @@ -61,9 +71,12 @@ pub(super) struct LogMapper { } impl LogMapper { - pub(super) fn try_new(snapshot: &Snapshot) -> DeltaResult { + pub(super) fn try_new( + snapshot: &Snapshot, + table_schema: Option<&StructType>, + ) -> DeltaResult { Ok(Self { - stats_schema: Arc::new((&snapshot.stats_schema()?).try_into()?), + stats_schema: Arc::new((&snapshot.stats_schema(table_schema)?).try_into()?), config: snapshot.config.clone(), }) } @@ -79,7 +92,7 @@ fn map_batch( config: &DeltaTableConfig, ) -> DeltaResult { let stats_col = ex::extract_and_cast_opt::(&batch, "add.stats"); - let stats_parsed_col = ex::extract_and_cast_opt::(&batch, "add.stats_parsed"); + let stats_parsed_col = ex::extract_and_cast_opt::(&batch, "add.stats_parsed"); if stats_parsed_col.is_some() { return Ok(batch); } @@ -127,7 +140,7 @@ fn map_batch( Ok(batch) } -impl Stream for ReplayStream +impl<'a, S> Stream for ReplayStream<'a, S> where S: Stream>, { @@ -136,20 +149,34 @@ where fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { let this = self.project(); let res = this.commits.poll_next(cx).map(|b| match b { - Some(Ok(batch)) => match this.scanner.process_files_batch(&batch, true) { - Ok(filtered) => Some(this.mapper.map_batch(filtered)), - Err(e) => Some(Err(e)), - }, - Some(Err(e)) => Some(Err(e)), + Some(Ok(batch)) => { + for visitor in this.visitors.iter_mut() { + if let Err(e) = visitor.visit_batch(&batch) { + return Some(Err(e)); + } + } + match this.scanner.process_files_batch(&batch, true) { + Ok(filtered) => Some(this.mapper.map_batch(filtered)), + err => Some(err), + } + } + Some(e) => Some(e), None => None, }); if matches!(res, Poll::Ready(None)) { this.checkpoint.poll_next(cx).map(|b| match b { - Some(Ok(batch)) => match this.scanner.process_files_batch(&batch, false) { - Ok(filtered) => Some(this.mapper.map_batch(filtered)), - Err(e) => Some(Err(e)), - }, - Some(Err(e)) => Some(Err(e)), + Some(Ok(batch)) => { + for visitor in this.visitors.iter_mut() { + if let Err(e) = visitor.visit_batch(&batch) { + return Some(Err(e)); + } + } + match this.scanner.process_files_batch(&batch, false) { + Ok(filtered) => Some(this.mapper.map_batch(filtered)), + err => Some(err), + } + } + Some(e) => Some(e), None => None, }) } else { diff --git a/crates/core/src/kernel/snapshot/serde.rs b/crates/core/src/kernel/snapshot/serde.rs index 5162c4a1fe..dd7403bc28 100644 --- a/crates/core/src/kernel/snapshot/serde.rs +++ b/crates/core/src/kernel/snapshot/serde.rs @@ -1,6 +1,6 @@ use arrow_ipc::reader::FileReader; use arrow_ipc::writer::FileWriter; -use chrono::{TimeZone, Utc}; +use chrono::{DateTime, TimeZone, Utc}; use object_store::ObjectMeta; use serde::de::{self, Deserializer, SeqAccess, Visitor}; use serde::{ser::SerializeSeq, Deserialize, Serialize}; @@ -99,9 +99,8 @@ impl<'de> Visitor<'de> for LogSegmentVisitor { .map(|f| ObjectMeta { location: f.path.into(), size: f.size, - last_modified: 
Utc.from_utc_datetime( - &chrono::NaiveDateTime::from_timestamp_millis(f.last_modified).unwrap(), - ), + last_modified: DateTime::from_timestamp_millis(f.last_modified).unwrap(), + version: None, e_tag: None, }) @@ -126,6 +125,8 @@ impl Serialize for EagerSnapshot { { let mut seq = serializer.serialize_seq(None)?; seq.serialize_element(&self.snapshot)?; + seq.serialize_element(&self.tracked_actions)?; + seq.serialize_element(&self.transactions)?; for batch in self.files.iter() { let mut buffer = vec![]; let mut writer = FileWriter::try_new(&mut buffer, batch.schema().as_ref()) @@ -153,10 +154,15 @@ impl<'de> Visitor<'de> for EagerSnapshotVisitor { where V: SeqAccess<'de>, { - println!("eager: {:?}", "start"); let snapshot = seq .next_element()? .ok_or_else(|| de::Error::invalid_length(0, &self))?; + let tracked_actions = seq + .next_element()? + .ok_or_else(|| de::Error::invalid_length(1, &self))?; + let transactions = seq + .next_element()? + .ok_or_else(|| de::Error::invalid_length(2, &self))?; let mut files = Vec::new(); while let Some(elem) = seq.next_element::>()? { let mut reader = @@ -171,7 +177,12 @@ impl<'de> Visitor<'de> for EagerSnapshotVisitor { })?; files.push(rb); } - Ok(EagerSnapshot { snapshot, files }) + Ok(EagerSnapshot { + snapshot, + files, + tracked_actions, + transactions, + }) } } diff --git a/crates/core/src/kernel/snapshot/visitors.rs b/crates/core/src/kernel/snapshot/visitors.rs new file mode 100644 index 0000000000..1b68026a5b --- /dev/null +++ b/crates/core/src/kernel/snapshot/visitors.rs @@ -0,0 +1,192 @@ +//! Log replay visitors. +//! +//! Log replay visitors allow to extract additional actions during log replay. + +use std::collections::HashMap; + +use arrow::compute::{filter_record_batch, is_not_null}; +use arrow_array::{Array, Int64Array, RecordBatch, StringArray, StructArray}; + +use super::ActionType; +use crate::errors::DeltaResult; +use crate::kernel::arrow::extract as ex; +use crate::kernel::Transaction; + +/// Allows hooking into the reading of commit files and checkpoints whenever a table is loaded or updated. 
+pub trait ReplayVisitor: std::fmt::Debug + Send + Sync { + fn as_any(&self) -> &dyn std::any::Any; + + /// Process a batch + fn visit_batch(&mut self, batch: &RecordBatch) -> DeltaResult<()>; + + /// return all relevant actions for the visitor + fn required_actions(&self) -> Vec; +} + +/// Get the relevant visitor for the given action type +pub fn get_visitor(action: &ActionType) -> Option> { + match action { + ActionType::Txn => Some(Box::new(AppTransactionVisitor::new())), + _ => None, + } +} + +#[derive(Debug, Default)] +pub(crate) struct AppTransactionVisitor { + pub(crate) app_transaction_version: HashMap, +} + +impl AppTransactionVisitor { + pub(crate) fn new() -> Self { + Self { + app_transaction_version: HashMap::new(), + } + } +} + +impl AppTransactionVisitor { + pub fn merge(&self, map: &HashMap) -> HashMap { + let mut clone = map.clone(); + for (key, value) in &self.app_transaction_version { + clone.insert(key.clone(), value.clone()); + } + clone + } +} + +impl ReplayVisitor for AppTransactionVisitor { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn visit_batch(&mut self, batch: &arrow_array::RecordBatch) -> DeltaResult<()> { + if batch.column_by_name("txn").is_none() { + return Ok(()); + } + + let txn_col = ex::extract_and_cast::(batch, "txn")?; + let filtered = filter_record_batch(batch, &is_not_null(txn_col)?)?; + let arr = ex::extract_and_cast::(&filtered, "txn")?; + + let id = ex::extract_and_cast::(arr, "appId")?; + let version = ex::extract_and_cast::(arr, "version")?; + let last_updated = ex::extract_and_cast_opt::(arr, "lastUpdated"); + + for idx in 0..id.len() { + if id.is_valid(idx) { + let app_id = ex::read_str(id, idx)?; + if self.app_transaction_version.contains_key(app_id) { + continue; + } + self.app_transaction_version.insert( + app_id.to_owned(), + Transaction { + app_id: app_id.into(), + version: ex::read_primitive(version, idx)?, + last_updated: last_updated.and_then(|arr| ex::read_primitive_opt(arr, idx)), + }, + ); + } + } + + Ok(()) + } + + fn required_actions(&self) -> Vec { + vec![ActionType::Txn] + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::{Int64Array, RecordBatch, StringArray}; + use arrow_schema::{DataType, Field, Fields, Schema}; + use std::sync::Arc; + + #[test] + fn test_app_txn_visitor() { + let fields: Fields = vec![ + Field::new("appId", DataType::Utf8, true), + Field::new("version", DataType::Int64, true), + Field::new("lastUpdated", DataType::Int64, true), + ] + .into(); + let schema = Arc::new(Schema::new(vec![Field::new( + "txn", + DataType::Struct(fields.clone()), + false, + )])); + + let mut data_app = vec![None, Some("my-app"), None]; + let mut data_version = vec![None, Some(1), None]; + let mut data_last_updated = vec![None, Some(123), None]; + let arr = Arc::new(StructArray::new( + fields.clone(), + vec![ + Arc::new(StringArray::from(data_app.clone())), + Arc::new(Int64Array::from(data_version.clone())), + Arc::new(Int64Array::from(data_last_updated.clone())), + ], + None, + )); + + let batch = RecordBatch::try_new(schema.clone(), vec![arr]).unwrap(); + let mut visitor = AppTransactionVisitor::new(); + visitor.visit_batch(&batch).unwrap(); + + let app_txns = visitor.app_transaction_version; + assert_eq!(app_txns.len(), 1); + assert_eq!(app_txns.get("my-app").map(|t| t.version), Some(1)); + assert_eq!( + app_txns.get("my-app").map(|t| t.last_updated), + Some(Some(123)) + ); + + // test that only the first encountered txn ist tacked for every app id. 
+ data_app.extend([None, Some("my-app")]); + data_version.extend([None, Some(2)]); + data_last_updated.extend([None, Some(124)]); + let arr = Arc::new(StructArray::new( + fields.clone(), + vec![ + Arc::new(StringArray::from(data_app.clone())), + Arc::new(Int64Array::from(data_version.clone())), + Arc::new(Int64Array::from(data_last_updated.clone())), + ], + None, + )); + let batch = RecordBatch::try_new(schema.clone(), vec![arr]).unwrap(); + let mut visitor = AppTransactionVisitor::new(); + visitor.visit_batch(&batch).unwrap(); + + let app_txns = visitor.app_transaction_version; + assert_eq!(app_txns.len(), 1); + assert_eq!(app_txns.get("my-app").map(|t| t.version), Some(1)); + assert_eq!( + app_txns.get("my-app").map(|t| t.last_updated), + Some(Some(123)) + ); + + // test that multiple app ids are tracked + data_app.extend([Some("my-other-app")]); + data_version.extend([Some(10)]); + data_last_updated.extend([Some(123)]); + let arr = Arc::new(StructArray::new( + fields.clone(), + vec![ + Arc::new(StringArray::from(data_app.clone())), + Arc::new(Int64Array::from(data_version.clone())), + Arc::new(Int64Array::from(data_last_updated.clone())), + ], + None, + )); + let batch = RecordBatch::try_new(schema.clone(), vec![arr]).unwrap(); + let mut visitor = AppTransactionVisitor::new(); + visitor.visit_batch(&batch).unwrap(); + + let app_txns = visitor.app_transaction_version; + assert_eq!(app_txns.len(), 2); + assert_eq!(app_txns.get("my-other-app").map(|t| t.version), Some(10)); + } +} diff --git a/crates/core/src/lib.rs b/crates/core/src/lib.rs index b24faf248e..4ef9fc06fd 100644 --- a/crates/core/src/lib.rs +++ b/crates/core/src/lib.rs @@ -47,7 +47,7 @@ //! # Querying Delta Tables with Datafusion //! //! Querying from local filesystem: -//! ```ignore +//! ``` //! use std::sync::Arc; //! use datafusion::prelude::SessionContext; //! diff --git a/crates/core/src/logstore/default_logstore.rs b/crates/core/src/logstore/default_logstore.rs index ed463e9947..8fd4f52beb 100644 --- a/crates/core/src/logstore/default_logstore.rs +++ b/crates/core/src/logstore/default_logstore.rs @@ -50,6 +50,14 @@ impl LogStore for DefaultLogStore { super::write_commit_entry(self.storage.as_ref(), version, tmp_commit).await } + async fn abort_commit_entry( + &self, + version: i64, + tmp_commit: &Path, + ) -> Result<(), TransactionError> { + super::abort_commit_entry(self.storage.as_ref(), version, tmp_commit).await + } + async fn get_latest_version(&self, current_version: i64) -> DeltaResult { super::get_latest_version(self, current_version).await } diff --git a/crates/core/src/logstore/mod.rs b/crates/core/src/logstore/mod.rs index e6b4c6e2d4..b8646cdb65 100644 --- a/crates/core/src/logstore/mod.rs +++ b/crates/core/src/logstore/mod.rs @@ -18,7 +18,9 @@ use crate::{ kernel::Action, operations::transaction::TransactionError, protocol::{get_last_checkpoint, ProtocolError}, - storage::{commit_uri_from_version, ObjectStoreRef, StorageOptions}, + storage::{ + commit_uri_from_version, retry_ext::ObjectStoreRetryExt, ObjectStoreRef, StorageOptions, + }, DeltaTableError, }; use bytes::Bytes; @@ -183,6 +185,13 @@ pub trait LogStore: Sync + Send { tmp_commit: &Path, ) -> Result<(), TransactionError>; + /// Abort the commit entry for the given version. + async fn abort_commit_entry( + &self, + version: i64, + tmp_commit: &Path, + ) -> Result<(), TransactionError>; + /// Find latest version currently stored in the delta log. 
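Editor's note on the `ReplayVisitor` trait and `AppTransactionVisitor` shown above: a crate-internal sketch of another visitor may make the hook clearer. The trait bounds and method signatures are copied from the diff; the exact `use` paths depend on how `visitors` is re-exported from the snapshot module and may need adjusting.

```rust
use arrow_array::RecordBatch;

use crate::errors::DeltaResult;
use crate::kernel::snapshot::ReplayVisitor;
use crate::kernel::ActionType;

/// Hypothetical visitor that counts how many replayed batches carried a `remove` column.
#[derive(Debug, Default)]
struct RemoveCountingVisitor {
    batches_with_removes: usize,
}

impl ReplayVisitor for RemoveCountingVisitor {
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }

    fn visit_batch(&mut self, batch: &RecordBatch) -> DeltaResult<()> {
        // Log replay hands every commit and checkpoint batch to each registered
        // visitor, so we only inspect the column we care about.
        if batch.column_by_name("remove").is_some() {
            self.batches_with_removes += 1;
        }
        Ok(())
    }

    fn required_actions(&self) -> Vec<ActionType> {
        vec![ActionType::Remove]
    }
}
```

Because the visitors are threaded through `ReplayStream` by mutable reference, any state they accumulate can be read back after the `files(...)` stream has been drained.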
async fn get_latest_version(&self, start_version: i64) -> DeltaResult; @@ -449,6 +458,16 @@ pub async fn write_commit_entry( Ok(()) } +/// Default implementation for aborting a commit entry +pub async fn abort_commit_entry( + storage: &dyn ObjectStore, + _version: i64, + tmp_commit: &Path, +) -> Result<(), TransactionError> { + storage.delete_with_retries(tmp_commit, 15).await?; + Ok(()) +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/core/src/operations/add_column.rs b/crates/core/src/operations/add_column.rs new file mode 100644 index 0000000000..028a6e5b2e --- /dev/null +++ b/crates/core/src/operations/add_column.rs @@ -0,0 +1,114 @@ +//! Add a new column to a table + +use delta_kernel::schema::StructType; +use futures::future::BoxFuture; +use itertools::Itertools; + +use super::cast::merge_struct; +use super::transaction::{CommitBuilder, CommitProperties, PROTOCOL}; + +use crate::kernel::StructField; +use crate::logstore::LogStoreRef; +use crate::protocol::DeltaOperation; +use crate::table::state::DeltaTableState; +use crate::{DeltaResult, DeltaTable, DeltaTableError}; + +/// Add new columns and/or nested fields to a table +pub struct AddColumnBuilder { + /// A snapshot of the table's state + snapshot: DeltaTableState, + /// Fields to add/merge into schema + fields: Option>, + /// Delta object store for handling data files + log_store: LogStoreRef, + /// Additional information to add to the commit + commit_properties: CommitProperties, +} + +impl super::Operation<()> for AddColumnBuilder {} + +impl AddColumnBuilder { + /// Create a new builder + pub fn new(log_store: LogStoreRef, snapshot: DeltaTableState) -> Self { + Self { + snapshot, + log_store, + fields: None, + commit_properties: CommitProperties::default(), + } + } + + /// Specify the fields to be added + pub fn with_fields(mut self, fields: impl IntoIterator + Clone) -> Self { + self.fields = Some(fields.into_iter().collect()); + self + } + /// Additional metadata to be added to commit info + pub fn with_commit_properties(mut self, commit_properties: CommitProperties) -> Self { + self.commit_properties = commit_properties; + self + } +} + +impl std::future::IntoFuture for AddColumnBuilder { + type Output = DeltaResult; + + type IntoFuture = BoxFuture<'static, Self::Output>; + + fn into_future(self) -> Self::IntoFuture { + let this = self; + + Box::pin(async move { + let mut metadata = this.snapshot.metadata().clone(); + let fields = match this.fields { + Some(v) => v, + None => return Err(DeltaTableError::Generic("No fields provided".to_string())), + }; + + let fields_right = &StructType::new(fields.clone()); + let table_schema = this.snapshot.schema(); + let new_table_schema = merge_struct(table_schema, fields_right)?; + + // TODO(ion): Think of a way how we can simply this checking through the API or centralize some checks. 
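Editor's note on the new `abort_commit_entry` hook above: custom `LogStore` implementations can usually delegate to the module-level default, exactly as `DefaultLogStore` does. A hedged, crate-internal sketch (the surrounding struct and its `storage` field are hypothetical):

```rust
use object_store::path::Path;

use crate::logstore;
use crate::operations::transaction::TransactionError;
use crate::storage::ObjectStoreRef;

/// Hypothetical custom log store that keeps its object store in `storage`.
struct MyLogStore {
    storage: ObjectStoreRef,
}

impl MyLogStore {
    /// Clean up the temporary commit file after a failed commit attempt,
    /// reusing the retry-aware helper added in this change.
    async fn abort_commit_entry(
        &self,
        version: i64,
        tmp_commit: &Path,
    ) -> Result<(), TransactionError> {
        logstore::abort_commit_entry(self.storage.as_ref(), version, tmp_commit).await
    }
}
```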
+ let contains_timestampntz = PROTOCOL.contains_timestampntz(fields.iter()); + let protocol = this.snapshot.protocol(); + + let maybe_new_protocol = if contains_timestampntz { + let updated_protocol = protocol.clone().enable_timestamp_ntz(); + if !(protocol.min_reader_version == 3 && protocol.min_writer_version == 7) { + // Convert existing properties to features since we advanced the protocol to v3,7 + Some( + updated_protocol + .move_table_properties_into_features(&metadata.configuration), + ) + } else { + Some(updated_protocol) + } + } else { + None + }; + + let operation = DeltaOperation::AddColumn { + fields: fields.into_iter().collect_vec(), + }; + + metadata.schema_string = serde_json::to_string(&new_table_schema)?; + + let mut actions = vec![metadata.into()]; + + if let Some(new_protocol) = maybe_new_protocol { + actions.push(new_protocol.into()) + } + + let commit = CommitBuilder::from(this.commit_properties) + .with_actions(actions) + .build(Some(&this.snapshot), this.log_store.clone(), operation) + .await?; + + Ok(DeltaTable::new_with_state( + this.log_store, + commit.snapshot(), + )) + }) + } +} diff --git a/crates/core/src/operations/cast.rs b/crates/core/src/operations/cast.rs index 6e77552286..68f630239d 100644 --- a/crates/core/src/operations/cast.rs +++ b/crates/core/src/operations/cast.rs @@ -1,39 +1,268 @@ //! Provide common cast functionality for callers //! -use arrow_array::{Array, ArrayRef, RecordBatch, StructArray}; +use crate::kernel::{ + ArrayType, DataType as DeltaDataType, MapType, MetadataValue, StructField, StructType, +}; +use arrow_array::cast::AsArray; +use arrow_array::{ + new_null_array, Array, ArrayRef, GenericListArray, MapArray, OffsetSizeTrait, RecordBatch, + RecordBatchOptions, StructArray, +}; use arrow_cast::{cast_with_options, CastOptions}; -use arrow_schema::{DataType, Fields, SchemaRef as ArrowSchemaRef}; - +use arrow_schema::{ArrowError, DataType, FieldRef, Fields, SchemaRef as ArrowSchemaRef}; +use std::collections::HashMap; use std::sync::Arc; use crate::DeltaResult; -fn cast_struct( - struct_array: &StructArray, - fields: &Fields, - cast_options: &CastOptions, -) -> Result>, arrow_schema::ArrowError> { - fields - .iter() +fn try_merge_metadata( + left: &mut HashMap, + right: &HashMap, +) -> Result<(), ArrowError> { + for (k, v) in right { + if let Some(vl) = left.get(k) { + if vl != v { + return Err(ArrowError::SchemaError(format!( + "Cannot merge metadata with different values for key {}", + k + ))); + } + } else { + left.insert(k.clone(), v.clone()); + } + } + Ok(()) +} + +pub(crate) fn merge_struct( + left: &StructType, + right: &StructType, +) -> Result { + let mut errors = Vec::new(); + let merged_fields: Result, ArrowError> = left + .fields() .map(|field| { - let col = struct_array.column_by_name(field.name()).unwrap(); - if let (DataType::Struct(_), DataType::Struct(child_fields)) = - (col.data_type(), field.data_type()) - { - let child_struct = StructArray::from(col.into_data()); - let s = cast_struct(&child_struct, child_fields, cast_options)?; - Ok(Arc::new(StructArray::new( - child_fields.clone(), - s, - child_struct.nulls().map(ToOwned::to_owned), - )) as ArrayRef) - } else if is_cast_required(col.data_type(), field.data_type()) { - cast_with_options(col, field.data_type(), cast_options) + let right_field = right.field(field.name()); + if let Some(right_field) = right_field { + let type_or_not = merge_type(field.data_type(), right_field.data_type()); + match type_or_not { + Err(e) => { + errors.push(e.to_string()); + Err(e) + } + 
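Editor's note, as a usage-level illustration of the `AddColumnBuilder` introduced above: assuming the `operations::add_column` module is public and the usual `deltalake_core` re-exports apply (those paths are assumptions, not confirmed by this diff), adding a nullable column might look like the following sketch.

```rust
use deltalake_core::kernel::{DataType, PrimitiveType, StructField};
use deltalake_core::operations::add_column::AddColumnBuilder;
use deltalake_core::{DeltaResult, DeltaTable};

/// Add a nullable string column to an existing table using the new builder.
async fn add_comment_column(table: DeltaTable) -> DeltaResult<DeltaTable> {
    let snapshot = table.snapshot()?.clone();
    let field = StructField::new(
        "comment",
        DataType::Primitive(PrimitiveType::String),
        true, // nullable, so existing data files remain valid without a rewrite
    );
    AddColumnBuilder::new(table.log_store(), snapshot)
        .with_fields(vec![field])
        .await
}
```

Only nullable (or nested, mergeable) additions make sense here; the builder merges the new fields into the current schema via `merge_struct` rather than replacing it.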
Ok(f) => { + let mut new_field = StructField::new( + field.name(), + f, + field.is_nullable() || right_field.is_nullable(), + ); + + new_field.metadata.clone_from(&field.metadata); + try_merge_metadata(&mut new_field.metadata, &right_field.metadata)?; + Ok(new_field) + } + } } else { - Ok(col.clone()) + Ok(field.clone()) } }) - .collect::, _>>() + .collect(); + match merged_fields { + Ok(mut fields) => { + for field in right.fields() { + if !left.field(field.name()).is_some() { + fields.push(field.clone()); + } + } + + Ok(StructType::new(fields)) + } + Err(e) => { + errors.push(e.to_string()); + Err(ArrowError::SchemaError(errors.join("\n"))) + } + } +} + +pub(crate) fn merge_type( + left: &DeltaDataType, + right: &DeltaDataType, +) -> Result { + if left == right { + return Ok(left.clone()); + } + match (left, right) { + (DeltaDataType::Array(a), DeltaDataType::Array(b)) => { + let merged = merge_type(&a.element_type, &b.element_type)?; + Ok(DeltaDataType::Array(Box::new(ArrayType::new( + merged, + a.contains_null() || b.contains_null(), + )))) + } + (DeltaDataType::Map(a), DeltaDataType::Map(b)) => { + let merged_key = merge_type(&a.key_type, &b.key_type)?; + let merged_value = merge_type(&a.value_type, &b.value_type)?; + Ok(DeltaDataType::Map(Box::new(MapType::new( + merged_key, + merged_value, + a.value_contains_null() || b.value_contains_null(), + )))) + } + (DeltaDataType::Struct(a), DeltaDataType::Struct(b)) => { + let merged = merge_struct(a, b)?; + Ok(DeltaDataType::Struct(Box::new(merged))) + } + (a, b) => Err(ArrowError::SchemaError(format!( + "Cannot merge types {} and {}", + a, b + ))), + } +} + +pub(crate) fn merge_schema( + left: ArrowSchemaRef, + right: ArrowSchemaRef, +) -> Result { + let left_delta: StructType = left.try_into()?; + let right_delta: StructType = right.try_into()?; + let merged: StructType = merge_struct(&left_delta, &right_delta)?; + Ok(Arc::new((&merged).try_into()?)) +} + +fn cast_struct( + struct_array: &StructArray, + fields: &Fields, + cast_options: &CastOptions, + add_missing: bool, +) -> Result { + StructArray::try_new( + fields.to_owned(), + fields + .iter() + .map(|field| { + let col_or_not = struct_array.column_by_name(field.name()); + match col_or_not { + None => match add_missing { + true if field.is_nullable() => { + Ok(new_null_array(field.data_type(), struct_array.len())) + } + _ => Err(ArrowError::SchemaError(format!( + "Could not find column {0}", + field.name() + ))), + }, + Some(col) => cast_field(col, field, cast_options, add_missing), + } + }) + .collect::, _>>()?, + struct_array.nulls().map(ToOwned::to_owned), + ) +} + +fn cast_list( + array: &GenericListArray, + field: &FieldRef, + cast_options: &CastOptions, + add_missing: bool, +) -> Result, ArrowError> { + let values = cast_field(array.values(), field, cast_options, add_missing)?; + GenericListArray::::try_new( + field.clone(), + array.offsets().clone(), + values, + array.nulls().cloned(), + ) +} + +fn cast_map( + array: &MapArray, + entries_field: &FieldRef, + sorted: bool, + cast_options: &CastOptions, + add_missing: bool, +) -> Result { + match entries_field.data_type() { + DataType::Struct(entry_fields) => { + let entries = cast_struct(array.entries(), entry_fields, cast_options, add_missing)?; + MapArray::try_new( + entries_field.clone(), + array.offsets().to_owned(), + entries, + array.nulls().cloned(), + sorted, + ) + } + _ => Err(ArrowError::CastError( + "Map entries must be a struct".to_string(), + )), + } +} + +fn cast_field( + col: &ArrayRef, + field: &FieldRef, + 
cast_options: &CastOptions, + add_missing: bool, +) -> Result { + if let (DataType::Struct(_), DataType::Struct(child_fields)) = + (col.data_type(), field.data_type()) + { + let child_struct = StructArray::from(col.into_data()); + Ok(Arc::new(cast_struct( + &child_struct, + child_fields, + cast_options, + add_missing, + )?) as ArrayRef) + } else if let (DataType::List(_), DataType::List(child_fields)) = + (col.data_type(), field.data_type()) + { + Ok(Arc::new(cast_list( + col.as_any() + .downcast_ref::>() + .ok_or(ArrowError::CastError(format!( + "Expected a list for {} but got {}", + field.name(), + col.data_type() + )))?, + child_fields, + cast_options, + add_missing, + )?) as ArrayRef) + } else if let (DataType::LargeList(_), DataType::LargeList(child_fields)) = + (col.data_type(), field.data_type()) + { + Ok(Arc::new(cast_list( + col.as_any() + .downcast_ref::>() + .ok_or(ArrowError::CastError(format!( + "Expected a list for {} but got {}", + field.name(), + col.data_type() + )))?, + child_fields, + cast_options, + add_missing, + )?) as ArrayRef) + } else if let (DataType::Map(_, _), DataType::Map(child_fields, sorted)) = + (col.data_type(), field.data_type()) + { + Ok(Arc::new(cast_map( + col.as_map_opt().ok_or(ArrowError::CastError(format!( + "Expected a map for {} but got {}", + field.name(), + col.data_type() + )))?, + child_fields, + *sorted, + cast_options, + add_missing, + )?) as ArrayRef) + } else if is_cast_required(col.data_type(), field.data_type()) { + cast_with_options(col, field.data_type(), cast_options) + } else { + Ok(col.clone()) + } } fn is_cast_required(a: &DataType, b: &DataType) -> bool { @@ -51,6 +280,7 @@ pub fn cast_record_batch( batch: &RecordBatch, target_schema: ArrowSchemaRef, safe: bool, + add_missing: bool, ) -> DeltaResult { let cast_options = CastOptions { safe, @@ -62,20 +292,109 @@ pub fn cast_record_batch( batch.columns().to_owned(), None, ); - - let columns = cast_struct(&s, target_schema.fields(), &cast_options)?; - Ok(RecordBatch::try_new(target_schema, columns)?) + let struct_array = cast_struct(&s, target_schema.fields(), &cast_options, add_missing)?; + Ok(RecordBatch::try_new_with_options( + target_schema, + struct_array.columns().to_vec(), + &RecordBatchOptions::new().with_row_count(Some(batch.num_rows())), + )?) 
} #[cfg(test)] mod tests { - use crate::operations::cast::{cast_record_batch, is_cast_required}; - use arrow::array::ArrayData; - use arrow_array::{Array, ArrayRef, ListArray, RecordBatch}; - use arrow_buffer::Buffer; - use arrow_schema::{DataType, Field, FieldRef, Fields, Schema, SchemaRef}; + use std::collections::HashMap; + use std::ops::Deref; use std::sync::Arc; + use arrow::array::types::Int32Type; + use arrow::array::{ + new_empty_array, new_null_array, Array, ArrayData, ArrayRef, AsArray, Int32Array, + ListArray, PrimitiveArray, RecordBatch, StringArray, StructArray, + }; + use arrow::buffer::{Buffer, NullBuffer}; + use arrow_schema::{DataType, Field, FieldRef, Fields, Schema, SchemaRef}; + use itertools::Itertools; + + use crate::kernel::{ + ArrayType as DeltaArrayType, DataType as DeltaDataType, StructField as DeltaStructField, + StructType as DeltaStructType, + }; + use crate::operations::cast::MetadataValue; + use crate::operations::cast::{cast_record_batch, is_cast_required}; + + #[test] + fn test_merge_schema_with_dict() { + let left_schema = Arc::new(Schema::new(vec![Field::new( + "f", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + false, + )])); + let right_schema = Arc::new(Schema::new(vec![Field::new( + "f", + DataType::LargeUtf8, + true, + )])); + + let result = super::merge_schema(left_schema, right_schema).unwrap(); + assert_eq!(result.fields().len(), 1); + let delta_type: DeltaDataType = result.fields()[0].data_type().try_into().unwrap(); + assert_eq!(delta_type, DeltaDataType::STRING); + assert!(result.fields()[0].is_nullable()); + } + + #[test] + fn test_merge_schema_with_meta() { + let mut left_meta = HashMap::new(); + left_meta.insert("a".to_string(), "a1".to_string()); + let left_schema = DeltaStructType::new(vec![DeltaStructField::new( + "f", + DeltaDataType::STRING, + false, + ) + .with_metadata(left_meta)]); + let mut right_meta = HashMap::new(); + right_meta.insert("b".to_string(), "b2".to_string()); + let right_schema = DeltaStructType::new(vec![DeltaStructField::new( + "f", + DeltaDataType::STRING, + true, + ) + .with_metadata(right_meta)]); + + let result = super::merge_struct(&left_schema, &right_schema).unwrap(); + let fields = result.fields().collect_vec(); + assert_eq!(fields.len(), 1); + let delta_type = fields[0].data_type(); + assert_eq!(delta_type, &DeltaDataType::STRING); + let mut expected_meta = HashMap::new(); + expected_meta.insert("a".to_string(), MetadataValue::String("a1".to_string())); + expected_meta.insert("b".to_string(), MetadataValue::String("b2".to_string())); + assert_eq!(fields[0].metadata(), &expected_meta); + } + + #[test] + fn test_merge_schema_with_nested() { + let left_schema = Arc::new(Schema::new(vec![Field::new( + "f", + DataType::LargeList(Arc::new(Field::new("item", DataType::Utf8, false))), + false, + )])); + let right_schema = Arc::new(Schema::new(vec![Field::new( + "f", + DataType::List(Arc::new(Field::new("item", DataType::LargeUtf8, false))), + true, + )])); + + let result = super::merge_schema(left_schema, right_schema).unwrap(); + assert_eq!(result.fields().len(), 1); + let delta_type: DeltaDataType = result.fields()[0].data_type().try_into().unwrap(); + assert_eq!( + delta_type, + DeltaDataType::Array(Box::new(DeltaArrayType::new(DeltaDataType::STRING, false))) + ); + assert!(result.fields()[0].is_nullable()); + } + #[test] fn test_cast_record_batch_with_list_non_default_item() { let array = Arc::new(make_list_array()) as ArrayRef; @@ -93,7 +412,7 @@ mod tests { )]); let 
target_schema = Arc::new(Schema::new(fields)) as SchemaRef; - let result = cast_record_batch(&record_batch, target_schema, false); + let result = cast_record_batch(&record_batch, target_schema, false, false); let schema = result.unwrap().schema(); let field = schema.column_with_name("list_column").unwrap().1; @@ -142,4 +461,303 @@ mod tests { assert!(is_cast_required(&field1, &field2)); } + + #[test] + fn test_add_missing_null_fields_with_no_missing_fields() { + let schema = Arc::new(Schema::new(vec![ + Field::new("field1", DataType::Int32, false), + Field::new("field2", DataType::Utf8, true), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1, 2, 3])), + Arc::new(StringArray::from(vec![Some("a"), None, Some("c")])), + ], + ) + .unwrap(); + let result = cast_record_batch(&batch, schema.clone(), false, true).unwrap(); + assert_eq!(result.schema(), schema); + assert_eq!(result.num_columns(), 2); + assert_eq!( + result.column(0).deref().as_primitive::(), + &PrimitiveArray::::from_iter([1, 2, 3]) + ); + assert_eq!( + result.column(1).deref().as_string(), + &StringArray::from(vec![Some("a"), None, Some("c")]) + ); + } + + #[test] + fn test_add_missing_null_fields_with_missing_beginning() { + let schema = Arc::new(Schema::new(vec![Field::new( + "field2", + DataType::Utf8, + true, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(StringArray::from(vec![ + Some("a"), + None, + Some("c"), + ]))], + ) + .unwrap(); + + let new_schema = Arc::new(Schema::new(vec![ + Field::new("field1", DataType::Int32, true), + Field::new("field2", DataType::Utf8, true), + ])); + let result = cast_record_batch(&batch, new_schema.clone(), false, true).unwrap(); + assert_eq!(result.schema(), new_schema); + assert_eq!(result.num_columns(), 2); + assert_eq!( + result.column(0).deref().as_primitive::(), + new_null_array(&DataType::Int32, 3) + .deref() + .as_primitive::() + ); + assert_eq!( + result.column(1).deref().as_string(), + &StringArray::from(vec![Some("a"), None, Some("c")]) + ); + } + + #[test] + fn test_add_missing_null_fields_with_missing_end() { + let schema = Arc::new(Schema::new(vec![Field::new( + "field1", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + ) + .unwrap(); + + let new_schema = Arc::new(Schema::new(vec![ + Field::new("field1", DataType::Int32, false), + Field::new("field2", DataType::Utf8, true), + ])); + let result = cast_record_batch(&batch, new_schema.clone(), false, true).unwrap(); + assert_eq!(result.schema(), new_schema); + assert_eq!(result.num_columns(), 2); + assert_eq!( + result.column(0).deref().as_primitive::(), + &PrimitiveArray::::from(vec![Some(1), Some(2), Some(3)]) + ); + assert_eq!( + result.column(1).deref().as_string::(), + new_null_array(&DataType::Utf8, 3).deref().as_string() + ); + } + + #[test] + fn test_add_missing_null_fields_error_on_missing_non_null() { + let schema = Arc::new(Schema::new(vec![Field::new( + "field1", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + ) + .unwrap(); + + let new_schema = Arc::new(Schema::new(vec![ + Field::new("field1", DataType::Int32, false), + Field::new("field2", DataType::Utf8, false), + ])); + let result = cast_record_batch(&batch, new_schema.clone(), false, true); + assert!(result.is_err()); + } + + #[test] + fn test_add_missing_null_fields_nested_struct_missing() { + 
let nested_fields = Fields::from(vec![Field::new("nested1", DataType::Utf8, true)]); + let schema = Arc::new(Schema::new(vec![ + Field::new("field1", DataType::Int32, false), + Field::new("field2", DataType::Struct(nested_fields.clone()), true), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1, 2, 3])), + Arc::new(StructArray::new( + nested_fields, + vec![Arc::new(StringArray::from(vec![Some("a"), None, Some("c")])) as ArrayRef], + None, + )), + ], + ) + .unwrap(); + let new_schema = Arc::new(Schema::new(vec![ + Field::new("field1", DataType::Int32, false), + Field::new( + "field2", + DataType::Struct(Fields::from(vec![ + Field::new("nested1", DataType::Utf8, true), + Field::new("nested2", DataType::Utf8, true), + ])), + true, + ), + ])); + let result = cast_record_batch(&batch, new_schema.clone(), false, true).unwrap(); + assert_eq!(result.schema(), new_schema); + assert_eq!(result.num_columns(), 2); + assert_eq!( + result.column(0).deref().as_primitive::(), + &PrimitiveArray::::from_iter([1, 2, 3]) + ); + let struct_column = result.column(1).deref().as_struct(); + assert_eq!(struct_column.num_columns(), 2); + assert_eq!( + struct_column.column(0).deref().as_string(), + &StringArray::from(vec![Some("a"), None, Some("c")]) + ); + assert_eq!( + struct_column.column(1).deref().as_string::(), + new_null_array(&DataType::Utf8, 3).deref().as_string() + ); + } + + #[test] + fn test_add_missing_null_fields_nested_struct_missing_non_nullable() { + let nested_fields = Fields::from(vec![Field::new("nested1", DataType::Utf8, false)]); + let schema = Arc::new(Schema::new(vec![ + Field::new("field1", DataType::Int32, false), + Field::new("field2", DataType::Struct(nested_fields.clone()), true), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1, 2, 3])), + Arc::new(StructArray::new( + nested_fields, + vec![new_null_array(&DataType::Utf8, 3)], + Some(NullBuffer::new_null(3)), + )), + ], + ) + .unwrap(); + let new_schema = Arc::new(Schema::new(vec![ + Field::new("field1", DataType::Int32, false), + Field::new( + "field2", + DataType::Struct(Fields::from(vec![ + Field::new("nested1", DataType::Utf8, false), + Field::new("nested2", DataType::Utf8, true), + ])), + true, + ), + ])); + let result = cast_record_batch(&batch, new_schema.clone(), false, true).unwrap(); + assert_eq!(result.schema(), new_schema); + assert_eq!(result.num_columns(), 2); + assert_eq!( + result.column(0).deref().as_primitive::(), + &PrimitiveArray::::from_iter([1, 2, 3]) + ); + let struct_column = result.column(1).deref().as_struct(); + assert_eq!(struct_column.num_columns(), 2); + let expected: [Option<&str>; 3] = Default::default(); + assert_eq!( + struct_column.column(0).deref().as_string(), + &StringArray::from(Vec::from(expected)) + ); + assert_eq!( + struct_column.column(1).deref().as_string::(), + new_null_array(&DataType::Utf8, 3).deref().as_string(), + ); + } + + #[test] + fn test_add_missing_null_fields_list_missing() { + let schema = Arc::new(Schema::new(vec![Field::new( + "field1", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + ) + .unwrap(); + let new_schema = Arc::new(Schema::new(vec![ + Field::new("field1", DataType::Int32, false), + Field::new( + "field2", + DataType::List(Arc::new(Field::new("nested1", DataType::Utf8, true))), + true, + ), + ])); + let result = cast_record_batch(&batch, new_schema.clone(), false, 
true).unwrap(); + assert_eq!(result.schema(), new_schema); + assert_eq!(result.num_columns(), 2); + assert_eq!( + result.column(0).deref().as_primitive::(), + &PrimitiveArray::::from_iter([1, 2, 3]) + ); + let list_column = result.column(1).deref().as_list::(); + assert_eq!(list_column.len(), 3); + assert_eq!(list_column.value_offsets(), &[0, 0, 0, 0]); + assert_eq!( + list_column.values().deref().as_string::(), + new_empty_array(&DataType::Utf8).deref().as_string() + ) + } + + #[test] + fn test_add_missing_null_fields_map_missing() { + let schema = Arc::new(Schema::new(vec![Field::new( + "field1", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + ) + .unwrap(); + let new_schema = Arc::new(Schema::new(vec![ + Field::new("field1", DataType::Int32, false), + Field::new( + "field2", + DataType::Map( + Arc::new(Field::new( + "entries", + DataType::Struct(Fields::from(vec![ + Field::new("key", DataType::Utf8, true), + Field::new("value", DataType::Utf8, true), + ])), + true, + )), + false, + ), + true, + ), + ])); + let result = cast_record_batch(&batch, new_schema.clone(), false, true).unwrap(); + assert_eq!(result.schema(), new_schema); + assert_eq!(result.num_columns(), 2); + assert_eq!( + result.column(0).deref().as_primitive::(), + &PrimitiveArray::::from_iter([1, 2, 3]) + ); + let map_column = result.column(1).deref().as_map(); + assert_eq!(map_column.len(), 3); + assert_eq!(map_column.offsets().as_ref(), &[0; 4]); + assert_eq!( + map_column.keys().deref().as_string::(), + new_empty_array(&DataType::Utf8).deref().as_string() + ); + assert_eq!( + map_column.values().deref().as_string::(), + new_empty_array(&DataType::Utf8).deref().as_string() + ); + } } diff --git a/crates/core/src/operations/cdc.rs b/crates/core/src/operations/cdc.rs new file mode 100644 index 0000000000..42a33cbcab --- /dev/null +++ b/crates/core/src/operations/cdc.rs @@ -0,0 +1,417 @@ +//! +//! The CDC module contains private tools for managing CDC files +//! 
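Editor's note, summarizing the `cast.rs` changes above before the CDC module continues: `cast_record_batch` now takes an `add_missing` flag that back-fills absent nullable columns with nulls, and `merge_schema`/`merge_struct` provide the schema-evolution counterpart. A hedged usage sketch, assuming the `operations::cast` module is reachable from outside the crate:

```rust
use std::sync::Arc;

use arrow_array::{Int32Array, RecordBatch};
use arrow_schema::{DataType, Field, Schema};
use deltalake_core::operations::cast::cast_record_batch;
use deltalake_core::DeltaResult;

/// Widen a batch to a target schema, letting `add_missing = true` fill the
/// absent nullable column with nulls (mirrors the tests above).
fn widen_batch() -> DeltaResult<RecordBatch> {
    let source_schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
    let batch = RecordBatch::try_new(
        source_schema,
        vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
    )?;

    // The target schema gained a nullable "comment" column; without
    // add_missing = true this cast would fail with "Could not find column comment".
    let target_schema = Arc::new(Schema::new(vec![
        Field::new("id", DataType::Int32, false),
        Field::new("comment", DataType::Utf8, true),
    ]));

    cast_record_batch(&batch, target_schema, /* safe */ false, /* add_missing */ true)
}
```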
+ +use crate::table::state::DeltaTableState; +use crate::DeltaResult; + +use arrow::datatypes::{DataType, Field, SchemaRef}; + +use datafusion::prelude::*; +use datafusion_common::ScalarValue; +use std::sync::Arc; +use tracing::log::*; + +/// The CDCTracker is useful for hooking reads/writes in a manner nececessary to create CDC files +/// associated with commits +pub(crate) struct CDCTracker { + pre_dataframe: DataFrame, + post_dataframe: DataFrame, +} + +impl CDCTracker { + /// construct + pub(crate) fn new(pre_dataframe: DataFrame, post_dataframe: DataFrame) -> Self { + Self { + pre_dataframe, + post_dataframe, + } + } + + pub(crate) fn collect(self) -> DeltaResult { + // Collect _all_ the batches for consideration + let pre_df = self.pre_dataframe; + let post_df = self.post_dataframe; + + // There is certainly a better way to do this other than stupidly cloning data for diffing + // purposes, but this is the quickest and easiest way to "diff" the two sets of batches + let preimage = pre_df.clone().except(post_df.clone())?; + let postimage = post_df.except(pre_df)?; + + let preimage = preimage.with_column( + "_change_type", + lit(ScalarValue::Utf8(Some("update_preimage".to_string()))), + )?; + + let postimage = postimage.with_column( + "_change_type", + lit(ScalarValue::Utf8(Some("update_postimage".to_string()))), + )?; + + let final_df = preimage.union(postimage)?; + Ok(final_df) + } +} + +/// +/// Return true if the specified table is capable of writing Change Data files +/// +/// From the Protocol: +/// +/// > For Writer Versions 4 up to 6, all writers must respect the delta.enableChangeDataFeed +/// > configuration flag in the metadata of the table. When delta.enableChangeDataFeed is true, +/// > writers must produce the relevant AddCDCFile's for any operation that changes data, as +/// > specified in Change Data Files. +/// > +/// > For Writer Version 7, all writers must respect the delta.enableChangeDataFeed configuration flag in +/// > the metadata of the table only if the feature changeDataFeed exists in the table protocol's +/// > writerFeatures. +pub(crate) fn should_write_cdc(snapshot: &DeltaTableState) -> DeltaResult { + if let Some(features) = &snapshot.protocol().writer_features { + // Features should only exist at writer version 7 but to avoid cases where + // the Option> can get filled with an empty set, checking for the value + // explicitly + if snapshot.protocol().min_writer_version == 7 + && !features.contains(&crate::kernel::WriterFeatures::ChangeDataFeed) + { + // If the writer feature has not been set, then the table should not have CDC written + // to it. 
Otherwise fallback to the configured table configuration + return Ok(false); + } + } + Ok(snapshot.table_config().enable_change_data_feed()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::kernel::DataType as DeltaDataType; + use crate::kernel::{Action, PrimitiveType, Protocol}; + use crate::operations::DeltaOps; + use crate::{DeltaConfigKey, DeltaTable}; + use arrow::array::{ArrayRef, Int32Array, StructArray}; + use arrow_array::RecordBatch; + use arrow_schema::Schema; + use datafusion::assert_batches_sorted_eq; + use datafusion::datasource::{MemTable, TableProvider}; + + /// A simple test which validates primitive writer version 1 tables should + /// not write Change Data Files + #[tokio::test] + async fn test_should_write_cdc_basic_table() { + let mut table = DeltaOps::new_in_memory() + .create() + .with_column( + "value", + DeltaDataType::Primitive(PrimitiveType::Integer), + true, + None, + ) + .await + .expect("Failed to make a table"); + table.load().await.expect("Failed to reload table"); + let result = should_write_cdc(table.snapshot().unwrap()).expect("Failed to use table"); + assert!( + result == false, + "A default table should not create CDC files" + ); + } + + /// + /// This test manually creates a table with writer version 4 that has the configuration sets + /// + #[tokio::test] + async fn test_should_write_cdc_table_with_configuration() { + let actions = vec![Action::Protocol(Protocol::new(1, 4))]; + let mut table: DeltaTable = DeltaOps::new_in_memory() + .create() + .with_column( + "value", + DeltaDataType::Primitive(PrimitiveType::Integer), + true, + None, + ) + .with_actions(actions) + .with_configuration_property(DeltaConfigKey::EnableChangeDataFeed, Some("true")) + .await + .expect("failed to make a version 4 table with EnableChangeDataFeed"); + table.load().await.expect("Failed to reload table"); + + let result = should_write_cdc(table.snapshot().unwrap()).expect("Failed to use table"); + assert!( + result == true, + "A table with the EnableChangeDataFeed should create CDC files" + ); + } + + /// + /// This test creates a writer version 7 table which has a slightly different way of + /// determining whether CDC files should be written or not. 
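Editor's note on `should_write_cdc` above: the protocol rules reduce to a small decision. The following standalone sketch restates that decision with the snapshot fields turned into plain inputs (the real function reads them from `DeltaTableState`):

```rust
/// Standalone restatement of the `should_write_cdc` decision.
fn cdc_enabled(
    min_writer_version: i32,
    writer_features_contain_cdf: Option<bool>, // None = protocol has no writerFeatures
    enable_change_data_feed: bool,             // delta.enableChangeDataFeed
) -> bool {
    if let Some(has_cdf) = writer_features_contain_cdf {
        if min_writer_version == 7 && !has_cdf {
            // Writer version 7 only honours the table property when the
            // changeDataFeed writer feature is present.
            return false;
        }
    }
    // Writer versions 4..=6 (and v7 with the feature) fall back to the table property.
    enable_change_data_feed
}

fn main() {
    assert!(!cdc_enabled(1, None, false));       // plain table
    assert!(cdc_enabled(4, None, true));         // v4 + delta.enableChangeDataFeed
    assert!(!cdc_enabled(7, Some(false), true)); // v7 without the writer feature
    assert!(cdc_enabled(7, Some(true), true));   // v7 with the feature enabled
}
```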
+ #[tokio::test] + async fn test_should_write_cdc_v7_table_no_writer_feature() { + let actions = vec![Action::Protocol(Protocol::new(1, 7))]; + let mut table: DeltaTable = DeltaOps::new_in_memory() + .create() + .with_column( + "value", + DeltaDataType::Primitive(PrimitiveType::Integer), + true, + None, + ) + .with_actions(actions) + .await + .expect("failed to make a version 4 table with EnableChangeDataFeed"); + table.load().await.expect("Failed to reload table"); + + let result = should_write_cdc(table.snapshot().unwrap()).expect("Failed to use table"); + assert!( + result == false, + "A v7 table must not write CDC files unless the writer feature is set" + ); + } + + /// + /// This test creates a writer version 7 table with a writer table feature enabled for CDC and + /// therefore should write CDC files + #[tokio::test] + async fn test_should_write_cdc_v7_table_with_writer_feature() { + let protocol = Protocol::new(1, 7) + .with_writer_features(vec![crate::kernel::WriterFeatures::ChangeDataFeed]); + let actions = vec![Action::Protocol(protocol)]; + let mut table: DeltaTable = DeltaOps::new_in_memory() + .create() + .with_column( + "value", + DeltaDataType::Primitive(PrimitiveType::Integer), + true, + None, + ) + .with_actions(actions) + .with_configuration_property(DeltaConfigKey::EnableChangeDataFeed, Some("true")) + .await + .expect("failed to make a version 4 table with EnableChangeDataFeed"); + table.load().await.expect("Failed to reload table"); + + let result = should_write_cdc(table.snapshot().unwrap()).expect("Failed to use table"); + assert!( + result, + "A v7 table must not write CDC files unless the writer feature is set" + ); + } + + #[tokio::test] + async fn test_sanity_check() { + let ctx = SessionContext::new(); + let schema = Arc::new(Schema::new(vec![Field::new( + "value", + DataType::Int32, + true, + )])); + + let batch = RecordBatch::try_new( + Arc::clone(&schema.clone()), + vec![Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)]))], + ) + .unwrap(); + let table_provider: Arc = + Arc::new(MemTable::try_new(schema.clone(), vec![vec![batch]]).unwrap()); + let source_df = ctx.read_table(table_provider).unwrap(); + + let updated_batch = RecordBatch::try_new( + Arc::clone(&schema.clone()), + vec![Arc::new(Int32Array::from(vec![Some(1), Some(12), Some(3)]))], + ) + .unwrap(); + let table_provider_updated: Arc = + Arc::new(MemTable::try_new(schema.clone(), vec![vec![updated_batch]]).unwrap()); + let updated_df = ctx.read_table(table_provider_updated).unwrap(); + + let tracker = CDCTracker::new(source_df, updated_df); + + match tracker.collect() { + Ok(df) => { + let batches = &df.collect().await.unwrap(); + let _ = arrow::util::pretty::print_batches(batches); + assert_eq!(batches.len(), 2); + assert_batches_sorted_eq! 
{[ + "+-------+------------------+", + "| value | _change_type |", + "+-------+------------------+", + "| 2 | update_preimage |", + "| 12 | update_postimage |", + "+-------+------------------+", + ], &batches } + } + Err(err) => { + println!("err: {err:#?}"); + panic!("Should have never reached this assertion"); + } + } + } + + #[tokio::test] + async fn test_sanity_check_with_pure_df() { + let nested_schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, true), + Field::new("lat", DataType::Int32, true), + Field::new("long", DataType::Int32, true), + ])); + let schema = Arc::new(Schema::new(vec![ + Field::new("value", DataType::Int32, true), + Field::new( + "nested", + DataType::Struct(nested_schema.fields.clone()), + true, + ), + ])); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)])), + Arc::new(StructArray::from(vec![ + ( + Arc::new(Field::new("id", DataType::Int32, true)), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + ), + ( + Arc::new(Field::new("lat", DataType::Int32, true)), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + ), + ( + Arc::new(Field::new("long", DataType::Int32, true)), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + ), + ])), + ], + ) + .unwrap(); + + let updated_batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(Int32Array::from(vec![Some(1), Some(12), Some(3)])), + Arc::new(StructArray::from(vec![ + ( + Arc::new(Field::new("id", DataType::Int32, true)), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + ), + ( + Arc::new(Field::new("lat", DataType::Int32, true)), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + ), + ( + Arc::new(Field::new("long", DataType::Int32, true)), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + ), + ])), + ], + ) + .unwrap(); + let _ = arrow::util::pretty::print_batches(&vec![batch.clone()]); + let _ = arrow::util::pretty::print_batches(&vec![updated_batch.clone()]); + + let ctx = SessionContext::new(); + let before = ctx.read_batch(batch).expect("Failed to make DataFrame"); + let after = ctx + .read_batch(updated_batch) + .expect("Failed to make DataFrame"); + + let diff = before + .except(after) + .expect("Failed to except") + .collect() + .await + .expect("Failed to diff"); + assert_eq!(diff.len(), 1); + } + + #[tokio::test] + async fn test_sanity_check_with_struct() { + let ctx = SessionContext::new(); + let nested_schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, true), + Field::new("lat", DataType::Int32, true), + Field::new("long", DataType::Int32, true), + ])); + let schema = Arc::new(Schema::new(vec![ + Field::new("value", DataType::Int32, true), + Field::new( + "nested", + DataType::Struct(nested_schema.fields.clone()), + true, + ), + ])); + + let batch = RecordBatch::try_new( + Arc::clone(&schema.clone()), + vec![ + Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)])), + Arc::new(StructArray::from(vec![ + ( + Arc::new(Field::new("id", DataType::Int32, true)), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + ), + ( + Arc::new(Field::new("lat", DataType::Int32, true)), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + ), + ( + Arc::new(Field::new("long", DataType::Int32, true)), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + ), + ])), + ], + ) + .unwrap(); + let table_provider: Arc = + Arc::new(MemTable::try_new(schema.clone(), vec![vec![batch]]).unwrap()); + let source_df = 
ctx.read_table(table_provider).unwrap(); + + let updated_batch = RecordBatch::try_new( + Arc::clone(&schema.clone()), + vec![ + Arc::new(Int32Array::from(vec![Some(1), Some(12), Some(3)])), + Arc::new(StructArray::from(vec![ + ( + Arc::new(Field::new("id", DataType::Int32, true)), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + ), + ( + Arc::new(Field::new("lat", DataType::Int32, true)), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + ), + ( + Arc::new(Field::new("long", DataType::Int32, true)), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + ), + ])), + ], + ) + .unwrap(); + let table_provider_updated: Arc = + Arc::new(MemTable::try_new(schema.clone(), vec![vec![updated_batch]]).unwrap()); + let updated_df = ctx.read_table(table_provider_updated).unwrap(); + + let tracker = CDCTracker::new(source_df, updated_df); + + match tracker.collect() { + Ok(df) => { + let batches = &df.collect().await.unwrap(); + let _ = arrow::util::pretty::print_batches(&batches); + assert_eq!(batches.len(), 2); + assert_batches_sorted_eq! {[ + "+-------+--------------------------+------------------+", + "| value | nested | _change_type |", + "+-------+--------------------------+------------------+", + "| 12 | {id: 2, lat: 2, long: 2} | update_postimage |", + "| 2 | {id: 2, lat: 2, long: 2} | update_preimage |", + "+-------+--------------------------+------------------+", + ], &batches } + } + Err(err) => { + println!("err: {err:#?}"); + panic!("Should have never reached this assertion"); + } + } + } +} diff --git a/crates/core/src/operations/constraints.rs b/crates/core/src/operations/constraints.rs index 9bf5f2d22c..e5d356f81c 100644 --- a/crates/core/src/operations/constraints.rs +++ b/crates/core/src/operations/constraints.rs @@ -1,9 +1,7 @@ //! 
Add a check constraint to a table -use std::collections::HashMap; use std::sync::Arc; -use chrono::Utc; use datafusion::execution::context::SessionState; use datafusion::execution::{SendableRecordBatchStream, TaskContext}; use datafusion::physical_plan::ExecutionPlan; @@ -11,22 +9,21 @@ use datafusion::prelude::SessionContext; use datafusion_common::ToDFSchema; use futures::future::BoxFuture; use futures::StreamExt; -use serde_json::json; use crate::delta_datafusion::expr::fmt_expr_to_sql; use crate::delta_datafusion::{ register_store, DeltaDataChecker, DeltaScanBuilder, DeltaSessionContext, }; -use crate::kernel::{CommitInfo, IsolationLevel, Protocol}; +use crate::kernel::{Protocol, WriterFeatures}; use crate::logstore::LogStoreRef; use crate::operations::datafusion_utils::Expression; -use crate::operations::transaction::commit; use crate::protocol::DeltaOperation; use crate::table::state::DeltaTableState; use crate::table::Constraint; use crate::{DeltaResult, DeltaTable, DeltaTableError}; use super::datafusion_utils::into_expr; +use super::transaction::{CommitBuilder, CommitProperties}; /// Build a constraint to add to a table pub struct ConstraintBuilder { @@ -40,10 +37,12 @@ pub struct ConstraintBuilder { log_store: LogStoreRef, /// Datafusion session state relevant for executing the input plan state: Option, - /// Additional metadata to be added to commit - app_metadata: Option>, + /// Additional information to add to the commit + commit_properties: CommitProperties, } +impl super::Operation<()> for ConstraintBuilder {} + impl ConstraintBuilder { /// Create a new builder pub fn new(log_store: LogStoreRef, snapshot: DeltaTableState) -> Self { @@ -53,7 +52,7 @@ impl ConstraintBuilder { snapshot, log_store, state: None, - app_metadata: None, + commit_properties: CommitProperties::default(), } } @@ -75,11 +74,8 @@ impl ConstraintBuilder { } /// Additional metadata to be added to commit info - pub fn with_metadata( - mut self, - metadata: impl IntoIterator, - ) -> Self { - self.app_metadata = Some(HashMap::from_iter(metadata)); + pub fn with_commit_properties(mut self, commit_properties: CommitProperties) -> Self { + self.commit_properties = commit_properties; self } } @@ -90,7 +86,7 @@ impl std::future::IntoFuture for ConstraintBuilder { type IntoFuture = BoxFuture<'static, Self::Output>; fn into_future(self) -> Self::IntoFuture { - let mut this = self; + let this = self; Box::pin(async move { let name = match this.name { @@ -132,7 +128,7 @@ impl std::future::IntoFuture for ConstraintBuilder { let plan: Arc = Arc::new(scan); let mut tasks = vec![]; - for p in 0..plan.output_partitioning().partition_count() { + for p in 0..plan.properties().output_partitioning().partition_count() { let inner_plan = plan.clone(); let inner_checker = checker.clone(); let task_ctx = Arc::new(TaskContext::from(&state)); @@ -177,48 +173,35 @@ impl std::future::IntoFuture for ConstraintBuilder { 3 }, reader_features: old_protocol.reader_features.clone(), - writer_features: old_protocol.writer_features.clone(), + writer_features: if old_protocol.min_writer_version < 7 { + old_protocol.writer_features.clone() + } else { + let current_features = old_protocol.writer_features.clone(); + if let Some(mut features) = current_features { + features.insert(WriterFeatures::CheckConstraints); + Some(features) + } else { + current_features + } + }, }; - let operational_parameters = HashMap::from_iter([ - ("name".to_string(), json!(&name)), - ("expr".to_string(), json!(&expr_str)), - ]); - - let operations = 
DeltaOperation::AddConstraint { + let operation = DeltaOperation::AddConstraint { name: name.clone(), expr: expr_str.clone(), }; - let app_metadata = match this.app_metadata { - Some(metadata) => metadata, - None => HashMap::default(), - }; - - let commit_info = CommitInfo { - timestamp: Some(Utc::now().timestamp_millis()), - operation: Some(operations.name().to_string()), - operation_parameters: Some(operational_parameters), - read_version: Some(this.snapshot.version()), - isolation_level: Some(IsolationLevel::Serializable), - is_blind_append: Some(false), - info: app_metadata, - ..Default::default() - }; + let actions = vec![metadata.into(), protocol.into()]; - let actions = vec![commit_info.into(), metadata.into(), protocol.into()]; - - let version = commit( - this.log_store.as_ref(), - &actions, - operations.clone(), - Some(&this.snapshot), - None, - ) - .await?; + let commit = CommitBuilder::from(this.commit_properties) + .with_actions(actions) + .build(Some(&this.snapshot), this.log_store.clone(), operation) + .await?; - this.snapshot.merge(actions, &operations, version)?; - Ok(DeltaTable::new_with_state(this.log_store, this.snapshot)) + Ok(DeltaTable::new_with_state( + this.log_store, + commit.snapshot(), + )) }) } } diff --git a/crates/core/src/operations/convert_to_delta.rs b/crates/core/src/operations/convert_to_delta.rs index 1ed4e1cee6..a51d353b20 100644 --- a/crates/core/src/operations/convert_to_delta.rs +++ b/crates/core/src/operations/convert_to_delta.rs @@ -1,33 +1,32 @@ //! Command for converting a Parquet table to a Delta table in place // https://github.com/delta-io/delta/blob/1d5dd774111395b0c4dc1a69c94abc169b1c83b6/spark/src/main/scala/org/apache/spark/sql/delta/commands/ConvertToDeltaCommand.scala +use std::collections::{HashMap, HashSet}; +use std::num::TryFromIntError; +use std::str::{FromStr, Utf8Error}; +use std::sync::Arc; +use arrow::{datatypes::Schema as ArrowSchema, error::ArrowError}; +use futures::future::{self, BoxFuture}; +use futures::TryStreamExt; +use indexmap::IndexMap; +use itertools::Itertools; +use parquet::arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStreamBuilder}; +use parquet::errors::ParquetError; +use percent_encoding::percent_decode_str; +use serde_json::{Map, Value}; +use tracing::debug; + +use crate::operations::get_num_idx_cols_and_stats_columns; use crate::{ - kernel::{Add, DataType, Schema, StructField}, + kernel::{scalars::ScalarExt, Add, DataType, Schema, StructField}, logstore::{LogStore, LogStoreRef}, operations::create::CreateBuilder, protocol::SaveMode, table::builder::ensure_table_uri, table::config::DeltaConfigKey, + writer::stats::stats_from_parquet_metadata, DeltaResult, DeltaTable, DeltaTableError, ObjectStoreError, NULL_PARTITION_VALUE_DATA_PATH, }; -use arrow::{datatypes::Schema as ArrowSchema, error::ArrowError}; -use futures::{ - future::{self, BoxFuture}, - TryStreamExt, -}; -use parquet::{ - arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStreamBuilder}, - errors::ParquetError, -}; -use percent_encoding::percent_decode_str; -use serde_json::{Map, Value}; -use std::{ - collections::{HashMap, HashSet}, - num::TryFromIntError, - str::{FromStr, Utf8Error}, - sync::Arc, -}; -use tracing::debug; /// Error converting a Parquet table to a Delta table #[derive(Debug, thiserror::Error)] @@ -49,7 +48,7 @@ enum Error { #[error("The schema of partition columns must be provided to convert a Parquet table to a Delta table")] MissingPartitionSchema, #[error("Partition column provided by the user does not 
exist in the parquet files")] - PartitionColumnNotExist(HashSet), + PartitionColumnNotExist, #[error("The given location is already a delta table location")] DeltaTableAlready, #[error("Location must be provided to convert a Parquet table to a Delta table")] @@ -101,7 +100,7 @@ pub struct ConvertToDeltaBuilder { log_store: Option, location: Option, storage_options: Option>, - partition_schema: HashSet, + partition_schema: HashMap, partition_strategy: PartitionStrategy, mode: SaveMode, name: Option, @@ -116,6 +115,8 @@ impl Default for ConvertToDeltaBuilder { } } +impl super::Operation<()> for ConvertToDeltaBuilder {} + impl ConvertToDeltaBuilder { /// Create a new [`ConvertToDeltaBuilder`] pub fn new() -> Self { @@ -164,7 +165,10 @@ impl ConvertToDeltaBuilder { mut self, partition_schema: impl IntoIterator, ) -> Self { - self.partition_schema = HashSet::from_iter(partition_schema); + self.partition_schema = partition_schema + .into_iter() + .map(|f| (f.name.clone(), f)) + .collect(); self } @@ -226,7 +230,7 @@ impl ConvertToDeltaBuilder { } /// Consume self into CreateBuilder with corresponding add actions, schemas and operation meta - async fn into_create_builder(mut self) -> Result { + async fn into_create_builder(self) -> Result { // Use the specified log store. If a log store is not provided, create a new store from the specified path. // Return an error if neither log store nor path is provided let log_store = if let Some(log_store) = self.log_store { @@ -270,11 +274,17 @@ impl ConvertToDeltaBuilder { // Iterate over the parquet files. Parse partition columns, generate add actions and collect parquet file schemas let mut arrow_schemas = Vec::new(); let mut actions = Vec::new(); + // partition columns that were defined by caller and are expected to apply on this table + let mut expected_partitions: HashMap = self.partition_schema.clone(); // A HashSet of all unique partition columns in a Parquet table let mut partition_columns = HashSet::new(); // A vector of StructField of all unique partition columns in a Parquet table let mut partition_schema_fields = HashMap::new(); + // Obtain settings on which columns to skip collecting stats on if any + let (num_indexed_cols, stats_columns) = + get_num_idx_cols_and_stats_columns(None, self.configuration.clone()); + for file in files { // A HashMap from partition column to value for this parquet file only let mut partition_values = HashMap::new(); @@ -290,7 +300,7 @@ impl ConvertToDeltaBuilder { .ok_or(Error::MissingPartitionSchema)?; if partition_columns.insert(key.to_string()) { - if let Some(schema) = self.partition_schema.take(key) { + if let Some(schema) = expected_partitions.remove(key) { partition_schema_fields.insert(key.to_string(), schema); } else { // Return an error if the schema of a partition column is not provided by user @@ -301,12 +311,14 @@ impl ConvertToDeltaBuilder { // Safety: we just checked that the key is present in the map let field = partition_schema_fields.get(key).unwrap(); let scalar = if value == NULL_PARTITION_VALUE_DATA_PATH { - Ok(crate::kernel::Scalar::Null(field.data_type().clone())) + Ok(delta_kernel::expressions::Scalar::Null( + field.data_type().clone(), + )) } else { let decoded = percent_decode_str(value).decode_utf8()?; match field.data_type() { DataType::Primitive(p) => p.parse_scalar(decoded.as_ref()), - _ => Err(crate::kernel::Error::Generic(format!( + _ => Err(delta_kernel::Error::Generic(format!( "Exprected primitive type, found: {:?}", field.data_type() ))), @@ -319,6 +331,24 @@ impl 
ConvertToDeltaBuilder { subpath = iter.next(); } + let batch_builder = ParquetRecordBatchStreamBuilder::new(ParquetObjectReader::new( + object_store.clone(), + file.clone(), + )) + .await?; + + // Fetch the stats + let parquet_metadata = batch_builder.metadata(); + let stats = stats_from_parquet_metadata( + &IndexMap::from_iter(partition_values.clone().into_iter()), + parquet_metadata.as_ref(), + num_indexed_cols, + &stats_columns, + ) + .map_err(|e| Error::DeltaTable(e.into()))?; + let stats_string = + serde_json::to_string(&stats).map_err(|e| Error::DeltaTable(e.into()))?; + actions.push( Add { path: percent_decode_str(file.location.as_ref()) @@ -340,19 +370,13 @@ impl ConvertToDeltaBuilder { .collect(), modification_time: file.last_modified.timestamp_millis(), data_change: true, + stats: Some(stats_string), ..Default::default() } .into(), ); - let mut arrow_schema = ParquetRecordBatchStreamBuilder::new(ParquetObjectReader::new( - object_store.clone(), - file, - )) - .await? - .schema() - .as_ref() - .clone(); + let mut arrow_schema = batch_builder.schema().as_ref().clone(); // Arrow schema of Parquet files may have conflicting metatdata // Since Arrow schema metadata is not used to generate Delta table schema, we set the metadata field to an empty HashMap @@ -360,27 +384,21 @@ impl ConvertToDeltaBuilder { arrow_schemas.push(arrow_schema); } - if !self.partition_schema.is_empty() { + if !expected_partitions.is_empty() { // Partition column provided by the user does not exist in the parquet files - return Err(Error::PartitionColumnNotExist(self.partition_schema)); + return Err(Error::PartitionColumnNotExist); } // Merge parquet file schemas // This step is needed because timestamp will not be preserved when copying files in S3. We can't use the schema of the latest parqeut file as Delta table's schema - let mut schema_fields = Schema::try_from(&ArrowSchema::try_merge(arrow_schemas)?)? 
- .fields() - .clone(); - schema_fields.append( - &mut partition_schema_fields - .values() - .cloned() - .collect::>(), - ); + let schema = Schema::try_from(&ArrowSchema::try_merge(arrow_schemas)?)?; + let mut schema_fields = schema.fields().collect_vec(); + schema_fields.append(&mut partition_schema_fields.values().collect::>()); // Generate CreateBuilder with corresponding add actions, schemas and operation meta let mut builder = CreateBuilder::new() .with_log_store(log_store) - .with_columns(schema_fields) + .with_columns(schema_fields.into_iter().cloned()) .with_partition_columns(partition_columns.into_iter()) .with_actions(actions) .with_save_mode(self.mode) @@ -419,17 +437,20 @@ impl std::future::IntoFuture for ConvertToDeltaBuilder { #[cfg(test)] mod tests { + use std::fs; + + use delta_kernel::expressions::Scalar; + use itertools::Itertools; + use pretty_assertions::assert_eq; + use tempfile::tempdir; + use super::*; use crate::{ - kernel::{DataType, PrimitiveType, Scalar}, + kernel::{DataType, PrimitiveType}, open_table, storage::StorageOptions, Path, }; - use itertools::Itertools; - use pretty_assertions::assert_eq; - use std::fs; - use tempfile::tempdir; fn schema_field(key: &str, primitive: PrimitiveType, nullable: bool) -> StructField { StructField::new(key.to_string(), DataType::Primitive(primitive), nullable) @@ -535,7 +556,8 @@ mod tests { .get_schema() .expect("Failed to get schema") .fields() - .clone(); + .cloned() + .collect_vec(); schema_fields.sort_by(|a, b| a.name().cmp(b.name())); assert_eq!( schema_fields, expected_schema, @@ -575,6 +597,16 @@ mod tests { "part-00000-d22c627d-9655-4153-9527-f8995620fa42-c000.snappy.parquet" ); + let Some(Scalar::Struct(data)) = action.min_values() else { + panic!("Missing min values"); + }; + assert_eq!(data.values(), vec![Scalar::Date(18628), Scalar::Integer(1)]); + + let Some(Scalar::Struct(data)) = action.max_values() else { + panic!("Missing max values"); + }; + assert_eq!(data.values(), vec![Scalar::Date(18632), Scalar::Integer(5)]); + assert_delta_table( table, path, diff --git a/crates/core/src/operations/create.rs b/crates/core/src/operations/create.rs index bbf11e3705..63b6995f9b 100644 --- a/crates/core/src/operations/create.rs +++ b/crates/core/src/operations/create.rs @@ -4,12 +4,16 @@ use std::collections::HashMap; use std::sync::Arc; +use delta_kernel::schema::MetadataValue; use futures::future::BoxFuture; +use maplit::hashset; use serde_json::Value; -use super::transaction::{commit, PROTOCOL}; +use super::transaction::{CommitBuilder, TableReference, PROTOCOL}; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::kernel::{Action, DataType, Metadata, Protocol, StructField, StructType}; +use crate::kernel::{ + Action, DataType, Metadata, Protocol, ReaderFeatures, StructField, StructType, WriterFeatures, +}; use crate::logstore::{LogStore, LogStoreRef}; use crate::protocol::{DeltaOperation, SaveMode}; use crate::table::builder::ensure_table_uri; @@ -56,8 +60,11 @@ pub struct CreateBuilder { log_store: Option, configuration: HashMap>, metadata: Option>, + raise_if_key_not_exists: bool, } +impl super::Operation<()> for CreateBuilder {} + impl Default for CreateBuilder { fn default() -> Self { Self::new() @@ -79,6 +86,7 @@ impl CreateBuilder { log_store: None, configuration: Default::default(), metadata: Default::default(), + raise_if_key_not_exists: true, } } @@ -118,7 +126,24 @@ impl CreateBuilder { ) -> Self { let mut field = StructField::new(name.into(), data_type, nullable); if let Some(meta) = metadata { - 
field = field.with_metadata(meta); + field = field.with_metadata(meta.iter().map(|(k, v)| { + ( + k, + if let Value::Number(n) = v { + n.as_i64().map_or_else( + || MetadataValue::String(v.to_string()), + |i| { + i32::try_from(i) + .ok() + .map(MetadataValue::Number) + .unwrap_or_else(|| MetadataValue::String(v.to_string())) + }, + ) + } else { + MetadataValue::String(v.to_string()) + }, + ) + })); }; self.columns.push(field); self @@ -188,6 +213,12 @@ impl CreateBuilder { self } + /// Specify whether to raise an error if the table properties in the configuration are not DeltaConfigKeys + pub fn with_raise_if_key_not_exists(mut self, raise_if_key_not_exists: bool) -> Self { + self.raise_if_key_not_exists = raise_if_key_not_exists; + self + } + /// Specify additional actions to be added to the commit. /// /// This method is mainly meant for internal use. Manually adding inconsistent @@ -233,8 +264,27 @@ impl CreateBuilder { ) }; + let configuration = self.configuration; + let contains_timestampntz = PROTOCOL.contains_timestampntz(self.columns.iter()); // TODO configure more permissive versions based on configuration. Also how should this ideally be handled? // We set the lowest protocol we can, and if subsequent writes use newer features we update metadata? + + let current_protocol = if contains_timestampntz { + Protocol { + min_reader_version: 3, + min_writer_version: 7, + writer_features: Some(hashset! {WriterFeatures::TimestampWithoutTimezone}), + reader_features: Some(hashset! {ReaderFeatures::TimestampWithoutTimezone}), + } + } else { + Protocol { + min_reader_version: PROTOCOL.default_reader_version(), + min_writer_version: PROTOCOL.default_writer_version(), + reader_features: None, + writer_features: None, + } + }; + let protocol = self .actions .iter() @@ -243,17 +293,22 @@ impl CreateBuilder { Action::Protocol(p) => p.clone(), _ => unreachable!(), }) - .unwrap_or_else(|| Protocol { - min_reader_version: PROTOCOL.default_reader_version(), - min_writer_version: PROTOCOL.default_writer_version(), - writer_features: None, - reader_features: None, - }); + .unwrap_or_else(|| current_protocol); + + let protocol = protocol.apply_properties_to_protocol( + &configuration + .iter() + .map(|(k, v)| (k.clone(), v.clone().unwrap())) + .collect::>(), + self.raise_if_key_not_exists, + )?; + + let protocol = protocol.move_table_properties_into_features(&configuration); let mut metadata = Metadata::try_new( StructType::new(self.columns), self.partition_columns.unwrap_or_default(), - self.configuration, + configuration, )? .with_created_time(chrono::Utc::now().timestamp_millis()); if let Some(name) = self.name { @@ -264,13 +319,14 @@ impl CreateBuilder { } let operation = DeltaOperation::Create { - mode: self.mode.clone(), + mode: self.mode, metadata: metadata.clone(), location: storage_url, protocol: protocol.clone(), }; let mut actions = vec![Action::Protocol(protocol), Action::Metadata(metadata)]; + actions.extend( self.actions .into_iter() @@ -288,9 +344,9 @@ impl std::future::IntoFuture for CreateBuilder { fn into_future(self) -> Self::IntoFuture { let this = self; Box::pin(async move { - let mode = this.mode.clone(); - let app_metadata = this.metadata.clone(); - let (mut table, actions, operation) = this.into_table_and_actions()?; + let mode = this.mode; + let app_metadata = this.metadata.clone().unwrap_or_default(); + let (mut table, mut actions, operation) = this.into_table_and_actions()?; let log_store = table.log_store(); let table_state = if log_store.is_delta_table_location().await? 
{ @@ -303,6 +359,12 @@ impl std::future::IntoFuture for CreateBuilder { } SaveMode::Overwrite => { table.load().await?; + let remove_actions = table + .snapshot()? + .log_data() + .into_iter() + .map(|p| p.remove_action(true).into()); + actions.extend(remove_actions); Some(table.snapshot()?) } } @@ -310,15 +372,16 @@ impl std::future::IntoFuture for CreateBuilder { None }; - let version = commit( - table.log_store.as_ref(), - &actions, - operation, - table_state, - app_metadata, - ) - .await?; - + let version = CommitBuilder::default() + .with_actions(actions) + .with_app_metadata(app_metadata) + .build( + table_state.map(|f| f as &dyn TableReference), + table.log_store.clone(), + operation, + ) + .await? + .version(); table.load_version(version).await?; Ok(table) @@ -331,7 +394,7 @@ mod tests { use super::*; use crate::operations::DeltaOps; use crate::table::config::DeltaConfigKey; - use crate::writer::test_utils::get_delta_schema; + use crate::writer::test_utils::{get_delta_schema, get_record_batch}; use tempfile::TempDir; #[tokio::test] @@ -340,7 +403,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() - .with_columns(table_schema.fields().clone()) + .with_columns(table_schema.fields().cloned()) .with_save_mode(SaveMode::Ignore) .await .unwrap(); @@ -360,7 +423,7 @@ mod tests { .await .unwrap() .create() - .with_columns(table_schema.fields().clone()) + .with_columns(table_schema.fields().cloned()) .with_save_mode(SaveMode::Ignore) .await .unwrap(); @@ -378,7 +441,7 @@ mod tests { ); let table = CreateBuilder::new() .with_location(format!("./{relative_path}")) - .with_columns(schema.fields().clone()) + .with_columns(schema.fields().cloned()) .await .unwrap(); assert_eq!(table.version(), 0); @@ -389,7 +452,7 @@ mod tests { let schema = get_delta_schema(); let table = CreateBuilder::new() .with_location("memory://") - .with_columns(schema.fields().clone()) + .with_columns(schema.fields().cloned()) .await .unwrap(); assert_eq!(table.version(), 0); @@ -412,7 +475,7 @@ mod tests { }; let table = CreateBuilder::new() .with_location("memory://") - .with_columns(schema.fields().clone()) + .with_columns(schema.fields().cloned()) .with_actions(vec![Action::Protocol(protocol)]) .await .unwrap(); @@ -421,7 +484,7 @@ mod tests { let table = CreateBuilder::new() .with_location("memory://") - .with_columns(schema.fields().clone()) + .with_columns(schema.fields().cloned()) .with_configuration_property(DeltaConfigKey::AppendOnly, Some("true")) .await .unwrap(); @@ -444,7 +507,7 @@ mod tests { let schema = get_delta_schema(); let table = CreateBuilder::new() .with_location(tmp_dir.path().to_str().unwrap()) - .with_columns(schema.fields().clone()) + .with_columns(schema.fields().cloned()) .await .unwrap(); assert_eq!(table.version(), 0); @@ -455,7 +518,7 @@ mod tests { // Check an error is raised when a table exists at location let table = CreateBuilder::new() .with_log_store(log_store.clone()) - .with_columns(schema.fields().clone()) + .with_columns(schema.fields().cloned()) .with_save_mode(SaveMode::ErrorIfExists) .await; assert!(table.is_err()); @@ -463,7 +526,7 @@ mod tests { // Check current table is returned when ignore option is chosen. 
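// (Save-mode behaviour exercised below: `Ignore` hands back the existing table untouched,
// while `Overwrite` now also emits `remove` actions for the files of the previous table,
// as covered by `test_create_or_replace_existing_table`.)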
let table = CreateBuilder::new() .with_log_store(log_store.clone()) - .with_columns(schema.fields().clone()) + .with_columns(schema.fields().cloned()) .with_save_mode(SaveMode::Ignore) .await .unwrap(); @@ -472,10 +535,98 @@ mod tests { // Check table is overwritten let table = CreateBuilder::new() .with_log_store(log_store) - .with_columns(schema.fields().iter().cloned()) + .with_columns(schema.fields().cloned()) .with_save_mode(SaveMode::Overwrite) .await .unwrap(); assert_ne!(table.metadata().unwrap().id, first_id) } + + #[tokio::test] + async fn test_create_or_replace_existing_table() { + let batch = get_record_batch(None, false); + let schema = get_delta_schema(); + let table = DeltaOps::new_in_memory() + .write(vec![batch.clone()]) + .with_save_mode(SaveMode::ErrorIfExists) + .await + .unwrap(); + assert_eq!(table.version(), 0); + assert_eq!(table.get_files_count(), 1); + + let mut table = DeltaOps(table) + .create() + .with_columns(schema.fields().cloned()) + .with_save_mode(SaveMode::Overwrite) + .await + .unwrap(); + table.load().await.unwrap(); + assert_eq!(table.version(), 1); + // Checks if files got removed after overwrite + assert_eq!(table.get_files_count(), 0); + } + + #[tokio::test] + async fn test_create_or_replace_existing_table_partitioned() { + let batch = get_record_batch(None, false); + let schema = get_delta_schema(); + let table = DeltaOps::new_in_memory() + .write(vec![batch.clone()]) + .with_save_mode(SaveMode::ErrorIfExists) + .await + .unwrap(); + assert_eq!(table.version(), 0); + assert_eq!(table.get_files_count(), 1); + + let mut table = DeltaOps(table) + .create() + .with_columns(schema.fields().cloned()) + .with_save_mode(SaveMode::Overwrite) + .with_partition_columns(vec!["id"]) + .await + .unwrap(); + table.load().await.unwrap(); + assert_eq!(table.version(), 1); + // Checks if files got removed after overwrite + assert_eq!(table.get_files_count(), 0); + } + + #[tokio::test] + async fn test_create_table_metadata_raise_if_key_not_exists() { + let schema = get_delta_schema(); + let config: HashMap> = + vec![("key".to_string(), Some("value".to_string()))] + .into_iter() + .collect(); + + // Fail to create table with unknown Delta key + let table = CreateBuilder::new() + .with_location("memory://") + .with_columns(schema.fields().cloned()) + .with_configuration(config.clone()) + .await; + assert!(table.is_err()); + + // Succeed in creating table with unknown Delta key since we set raise_if_key_not_exists to false + let table = CreateBuilder::new() + .with_location("memory://") + .with_columns(schema.fields().cloned()) + .with_raise_if_key_not_exists(false) + .with_configuration(config) + .await; + assert!(table.is_ok()); + + // Ensure the non-Delta key was set correctly + let value = table + .unwrap() + .metadata() + .unwrap() + .configuration + .get("key") + .unwrap() + .as_ref() + .unwrap() + .clone(); + assert_eq!(String::from("value"), value); + } } diff --git a/crates/core/src/operations/delete.rs b/crates/core/src/operations/delete.rs index 2e3e99bde2..692c1b303b 100644 --- a/crates/core/src/operations/delete.rs +++ b/crates/core/src/operations/delete.rs @@ -17,34 +17,47 @@ //! .await?; //! 
```` -use std::collections::HashMap; -use std::sync::Arc; -use std::time::{Instant, SystemTime, UNIX_EPOCH}; - +use crate::delta_datafusion::logical::MetricObserver; +use crate::delta_datafusion::physical::{find_metric_node, get_metric, MetricObserverExec}; +use crate::delta_datafusion::planner::DeltaPlanner; use crate::logstore::LogStoreRef; +use async_trait::async_trait; +use datafusion::dataframe::DataFrame; +use datafusion::datasource::provider_as_source; +use datafusion::error::Result as DataFusionResult; use datafusion::execution::context::{SessionContext, SessionState}; -use datafusion::physical_expr::create_physical_expr; -use datafusion::physical_plan::filter::FilterExec; +use datafusion::physical_plan::metrics::MetricBuilder; use datafusion::physical_plan::ExecutionPlan; +use datafusion::physical_planner::{ExtensionPlanner, PhysicalPlanner}; use datafusion::prelude::Expr; -use datafusion_common::scalar::ScalarValue; -use datafusion_common::DFSchema; +use datafusion_common::ScalarValue; +use datafusion_expr::{lit, Extension, LogicalPlan, LogicalPlanBuilder, UserDefinedLogicalNode}; + use futures::future::BoxFuture; +use std::sync::Arc; +use std::time::{Instant, SystemTime, UNIX_EPOCH}; + use parquet::file::properties::WriterProperties; use serde::Serialize; -use serde_json::Value; +use super::cdc::should_write_cdc; use super::datafusion_utils::Expression; -use super::transaction::PROTOCOL; +use super::transaction::{CommitBuilder, CommitProperties, PROTOCOL}; + use crate::delta_datafusion::expr::fmt_expr_to_sql; -use crate::delta_datafusion::{find_files, register_store, DeltaScanBuilder, DeltaSessionContext}; +use crate::delta_datafusion::{ + find_files, register_store, DataFusionMixins, DeltaScanConfigBuilder, DeltaSessionContext, + DeltaTableProvider, +}; use crate::errors::DeltaResult; use crate::kernel::{Action, Add, Remove}; -use crate::operations::transaction::commit; -use crate::operations::write::write_execution_plan; +use crate::operations::write::{write_execution_plan, write_execution_plan_cdc, WriterStatsConfig}; use crate::protocol::DeltaOperation; use crate::table::state::DeltaTableState; -use crate::DeltaTable; +use crate::{DeltaTable, DeltaTableError}; + +const SOURCE_COUNT_ID: &str = "delete_source_count"; +const SOURCE_COUNT_METRIC: &str = "num_source_rows"; /// Delete Records from the Delta Table. 
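/// Additional commit metadata is attached through [`CommitProperties`] (for example
/// `DeleteBuilder::new(log_store, snapshot).with_commit_properties(CommitProperties::default())`)
/// rather than the removed `with_metadata` helper.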
/// See this module's documentation for more information @@ -59,8 +72,8 @@ pub struct DeleteBuilder { state: Option, /// Properties passed to underlying parquet writer for when files are rewritten writer_properties: Option, - /// Additional metadata to be added to commit - app_metadata: Option>, + /// Commit properties and configuration + commit_properties: CommitProperties, } #[derive(Default, Debug, Serialize)] @@ -71,17 +84,19 @@ pub struct DeleteMetrics { /// Number of files removed pub num_removed_files: usize, /// Number of rows removed - pub num_deleted_rows: Option, + pub num_deleted_rows: usize, /// Number of rows copied in the process of deleting files - pub num_copied_rows: Option, + pub num_copied_rows: usize, /// Time taken to execute the entire operation - pub execution_time_ms: u128, + pub execution_time_ms: u64, /// Time taken to scan the file for matches - pub scan_time_ms: u128, + pub scan_time_ms: u64, /// Time taken to rewrite the matched files - pub rewrite_time_ms: u128, + pub rewrite_time_ms: u64, } +impl super::Operation<()> for DeleteBuilder {} + impl DeleteBuilder { /// Create a new [`DeleteBuilder`] pub fn new(log_store: LogStoreRef, snapshot: DeltaTableState) -> Self { @@ -90,7 +105,7 @@ impl DeleteBuilder { snapshot, log_store, state: None, - app_metadata: None, + commit_properties: CommitProperties::default(), writer_properties: None, } } @@ -107,12 +122,9 @@ impl DeleteBuilder { self } - /// Additional metadata to be added to commit info - pub fn with_metadata( - mut self, - metadata: impl IntoIterator, - ) -> Self { - self.app_metadata = Some(HashMap::from_iter(metadata)); + /// Additonal information to write to the commit + pub fn with_commit_properties(mut self, commit_properties: CommitProperties) -> Self { + self.commit_properties = commit_properties; self } @@ -123,96 +135,189 @@ impl DeleteBuilder { } } +#[derive(Clone)] +struct DeleteMetricExtensionPlanner {} + +#[async_trait] +impl ExtensionPlanner for DeleteMetricExtensionPlanner { + async fn plan_extension( + &self, + _planner: &dyn PhysicalPlanner, + node: &dyn UserDefinedLogicalNode, + _logical_inputs: &[&LogicalPlan], + physical_inputs: &[Arc], + _session_state: &SessionState, + ) -> DataFusionResult>> { + if let Some(metric_observer) = node.as_any().downcast_ref::() { + if metric_observer.id.eq(SOURCE_COUNT_ID) { + return Ok(Some(MetricObserverExec::try_new( + SOURCE_COUNT_ID.into(), + physical_inputs, + |batch, metrics| { + MetricBuilder::new(metrics) + .global_counter(SOURCE_COUNT_METRIC) + .add(batch.num_rows()); + }, + )?)); + } + } + Ok(None) + } +} + +#[allow(clippy::too_many_arguments)] async fn excute_non_empty_expr( snapshot: &DeltaTableState, log_store: LogStoreRef, state: &SessionState, expression: &Expr, - metrics: &mut DeleteMetrics, rewrite: &[Add], + metrics: &mut DeleteMetrics, writer_properties: Option, -) -> DeltaResult> { + partition_scan: bool, +) -> DeltaResult> { // For each identified file perform a parquet scan + filter + limit (1) + count. // If returned count is not zero then append the file to be rewritten and removed from the log. Otherwise do nothing to the file. 
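// (In this revision the candidate files are no longer probed with a separate scan + count:
// they are exposed through a `DeltaTableProvider`, wrapped in a `MetricObserver` that counts
// the source rows, rows NOT matching the predicate are rewritten via `write_execution_plan`,
// and, when change data feed is enabled, the matching rows are additionally written with
// `_change_type = "delete"` through `write_execution_plan_cdc`.)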
+ let mut actions: Vec = Vec::new(); + let table_partition_cols = snapshot.metadata().partition_columns.clone(); - let input_schema = snapshot.input_schema()?; - let input_dfschema: DFSchema = input_schema.clone().as_ref().clone().try_into()?; + let delete_planner = DeltaPlanner:: { + extension_planner: DeleteMetricExtensionPlanner {}, + }; - let table_partition_cols = snapshot.metadata().partition_columns.clone(); + let state = state.clone().with_query_planner(Arc::new(delete_planner)); + + let scan_config = DeltaScanConfigBuilder::default() + .with_file_column(false) + .with_schema(snapshot.input_schema()?) + .build(snapshot)?; + + let target_provider = Arc::new( + DeltaTableProvider::try_new(snapshot.clone(), log_store.clone(), scan_config.clone())? + .with_files(rewrite.to_vec()), + ); + let target_provider = provider_as_source(target_provider); + let source = LogicalPlanBuilder::scan("target", target_provider.clone(), None)?.build()?; + + let source = LogicalPlan::Extension(Extension { + node: Arc::new(MetricObserver { + id: "delete_source_count".into(), + input: source, + enable_pushdown: false, + }), + }); + + let df = DataFrame::new(state.clone(), source); + + let writer_stats_config = WriterStatsConfig::new( + snapshot.table_config().num_indexed_cols(), + snapshot + .table_config() + .stats_columns() + .map(|v| v.iter().map(|v| v.to_string()).collect::>()), + ); + + if !partition_scan { + // Apply the negation of the filter and rewrite files + let negated_expression = Expr::Not(Box::new(Expr::IsTrue(Box::new(expression.clone())))); + + let filter = df + .clone() + .filter(negated_expression)? + .create_physical_plan() + .await?; - let scan = DeltaScanBuilder::new(snapshot, log_store.clone(), state) - .with_files(rewrite) - .build() + let add_actions: Vec = write_execution_plan( + Some(snapshot), + state.clone(), + filter.clone(), + table_partition_cols.clone(), + log_store.object_store(), + Some(snapshot.table_config().target_file_size() as usize), + None, + writer_properties.clone(), + false, + None, + writer_stats_config.clone(), + None, + ) .await?; - let scan = Arc::new(scan); - - // Apply the negation of the filter and rewrite files - let negated_expression = Expr::Not(Box::new(Expr::IsTrue(Box::new(expression.clone())))); - - let predicate_expr = create_physical_expr( - &negated_expression, - &input_dfschema, - state.execution_props(), - )?; - let filter: Arc = - Arc::new(FilterExec::try_new(predicate_expr, scan.clone())?); - - let add_actions = write_execution_plan( - Some(snapshot), - state.clone(), - filter.clone(), - table_partition_cols.clone(), - log_store.object_store(), - Some(snapshot.table_config().target_file_size() as usize), - None, - writer_properties, - false, - false, - ) - .await?; - - let read_records = scan.parquet_scan.metrics().and_then(|m| m.output_rows()); - let filter_records = filter.metrics().and_then(|m| m.output_rows()); - metrics.num_copied_rows = filter_records; - metrics.num_deleted_rows = read_records - .zip(filter_records) - .map(|(read, filter)| read - filter); - - Ok(add_actions) + + actions.extend(add_actions); + + let source_count = find_metric_node(SOURCE_COUNT_ID, &filter).ok_or_else(|| { + DeltaTableError::Generic("Unable to locate expected metric node".into()) + })?; + let source_count_metrics = source_count.metrics().unwrap(); + let read_records = get_metric(&source_count_metrics, SOURCE_COUNT_METRIC); + let filter_records = filter.metrics().and_then(|m| m.output_rows()).unwrap_or(0); + + metrics.num_copied_rows = filter_records; + 
metrics.num_deleted_rows = read_records - filter_records; + } + + // CDC logic, simply filters data with predicate and adds the _change_type="delete" as literal column + if let Ok(true) = should_write_cdc(snapshot) { + // Create CDC scan + let change_type_lit = lit(ScalarValue::Utf8(Some("delete".to_string()))); + let cdc_filter = df + .filter(expression.clone())? + .with_column("_change_type", change_type_lit)? + .create_physical_plan() + .await?; + + use crate::operations::write::write_execution_plan_cdc; + let cdc_actions = write_execution_plan_cdc( + Some(snapshot), + state.clone(), + cdc_filter, + table_partition_cols.clone(), + log_store.object_store(), + Some(snapshot.table_config().target_file_size() as usize), + None, + writer_properties, + false, + writer_stats_config, + None, + ) + .await?; + actions.extend(cdc_actions) + } + + Ok(actions) } async fn execute( predicate: Option, log_store: LogStoreRef, - snapshot: &DeltaTableState, + snapshot: DeltaTableState, state: SessionState, writer_properties: Option, - app_metadata: Option>, -) -> DeltaResult<((Vec, i64, Option), DeleteMetrics)> { + mut commit_properties: CommitProperties, +) -> DeltaResult<(DeltaTableState, DeleteMetrics)> { let exec_start = Instant::now(); let mut metrics = DeleteMetrics::default(); let scan_start = Instant::now(); - let candidates = find_files(snapshot, log_store.clone(), &state, predicate.clone()).await?; - metrics.scan_time_ms = Instant::now().duration_since(scan_start).as_micros(); + let candidates = find_files(&snapshot, log_store.clone(), &state, predicate.clone()).await?; + metrics.scan_time_ms = Instant::now().duration_since(scan_start).as_millis() as u64; let predicate = predicate.unwrap_or(Expr::Literal(ScalarValue::Boolean(Some(true)))); - let add = if candidates.partition_scan { - Vec::new() - } else { + let mut actions = { let write_start = Instant::now(); let add = excute_non_empty_expr( - snapshot, + &snapshot, log_store.clone(), &state, &predicate, - &mut metrics, &candidates.candidates, + &mut metrics, writer_properties, + candidates.partition_scan, ) .await?; - metrics.rewrite_time_ms = Instant::now().duration_since(write_start).as_millis(); + metrics.rewrite_time_ms = Instant::now().duration_since(write_start).as_millis() as u64; add }; let remove = candidates.candidates; @@ -222,8 +327,6 @@ async fn execute( .unwrap() .as_millis() as i64; - let mut actions: Vec = add.into_iter().map(Action::Add).collect(); - let mut version = snapshot.version(); metrics.num_removed_files = remove.len(); metrics.num_added_files = actions.len(); @@ -242,35 +345,29 @@ async fn execute( })) } - metrics.execution_time_ms = Instant::now().duration_since(exec_start).as_micros(); + metrics.execution_time_ms = Instant::now().duration_since(exec_start).as_millis() as u64; - let mut app_metadata = match app_metadata { - Some(meta) => meta, - None => HashMap::new(), - }; - - app_metadata.insert("readVersion".to_owned(), snapshot.version().into()); - - if let Ok(map) = serde_json::to_value(&metrics) { - app_metadata.insert("operationMetrics".to_owned(), map); - } + commit_properties + .app_metadata + .insert("readVersion".to_owned(), snapshot.version().into()); + commit_properties.app_metadata.insert( + "operationMetrics".to_owned(), + serde_json::to_value(&metrics)?, + ); // Do not make a commit when there are zero updates to the state let operation = DeltaOperation::Delete { predicate: Some(fmt_expr_to_sql(&predicate)?), }; - if !actions.is_empty() { - version = commit( - log_store.as_ref(), - &actions, - 
operation.clone(), - Some(snapshot), - Some(app_metadata), - ) - .await?; + if actions.is_empty() { + return Ok((snapshot.clone(), metrics)); } - let op = (!actions.is_empty()).then_some(operation); - Ok(((actions, version, op), metrics)) + + let commit = CommitBuilder::from(commit_properties) + .with_actions(actions) + .build(Some(&snapshot), log_store, operation) + .await?; + Ok((commit.snapshot(), metrics)) } impl std::future::IntoFuture for DeleteBuilder { @@ -278,12 +375,11 @@ impl std::future::IntoFuture for DeleteBuilder { type IntoFuture = BoxFuture<'static, Self::Output>; fn into_future(self) -> Self::IntoFuture { - let mut this = self; + let this = self; Box::pin(async move { - PROTOCOL.check_append_only(&this.snapshot)?; - - PROTOCOL.can_write_to(&this.snapshot)?; + PROTOCOL.check_append_only(&this.snapshot.snapshot)?; + PROTOCOL.can_write_to(&this.snapshot.snapshot)?; let state = this.state.unwrap_or_else(|| { let session: SessionContext = DeltaSessionContext::default().into(); @@ -304,28 +400,29 @@ impl std::future::IntoFuture for DeleteBuilder { None => None, }; - let ((actions, version, operation), metrics) = execute( + let (new_snapshot, metrics) = execute( predicate, this.log_store.clone(), - &this.snapshot, + this.snapshot, state, this.writer_properties, - this.app_metadata, + this.commit_properties, ) .await?; - if let Some(op) = &operation { - this.snapshot.merge(actions, op, version)?; - } - - let table = DeltaTable::new_with_state(this.log_store, this.snapshot); - Ok((table, metrics)) + Ok(( + DeltaTable::new_with_state(this.log_store, new_snapshot), + metrics, + )) }) } } #[cfg(test)] mod tests { + use crate::delta_datafusion::cdf::DeltaCdfScan; + use crate::kernel::DataType as DeltaDataType; + use crate::operations::collect_sendable_stream; use crate::operations::DeltaOps; use crate::protocol::*; use crate::writer::test_utils::datafusion::get_data; @@ -339,11 +436,15 @@ mod tests { use arrow::datatypes::{Field, Schema}; use arrow::record_batch::RecordBatch; use arrow_array::ArrayRef; + use arrow_array::StringArray; use arrow_array::StructArray; use arrow_buffer::NullBuffer; + use arrow_schema::DataType; use arrow_schema::Fields; use datafusion::assert_batches_sorted_eq; + use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::*; + use delta_kernel::schema::PrimitiveType; use serde_json::json; use std::sync::Arc; @@ -352,7 +453,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() - .with_columns(table_schema.fields().clone()) + .with_columns(table_schema.fields().cloned()) .with_partition_columns(partitions.unwrap_or_default()) .await .unwrap(); @@ -407,8 +508,8 @@ mod tests { assert_eq!(table.get_files_count(), 0); assert_eq!(metrics.num_added_files, 0); assert_eq!(metrics.num_removed_files, 1); - assert_eq!(metrics.num_deleted_rows, None); - assert_eq!(metrics.num_copied_rows, None); + assert_eq!(metrics.num_deleted_rows, 0); + assert_eq!(metrics.num_copied_rows, 0); let commit_info = table.history(None).await.unwrap(); let last_commit = &commit_info[0]; @@ -418,16 +519,13 @@ mod tests { // serde_json::to_value(&metrics).unwrap() // ); - // rewrite is not required - assert_eq!(metrics.rewrite_time_ms, 0); - // Deletes with no changes to state must not commit let (table, metrics) = DeltaOps(table).delete().await.unwrap(); assert_eq!(table.version(), 2); assert_eq!(metrics.num_added_files, 0); assert_eq!(metrics.num_removed_files, 0); - assert_eq!(metrics.num_deleted_rows, None); - assert_eq!(metrics.num_copied_rows, None); + 
assert_eq!(metrics.num_deleted_rows, 0); + assert_eq!(metrics.num_copied_rows, 0); } #[tokio::test] @@ -498,8 +596,8 @@ mod tests { assert_eq!(metrics.num_added_files, 1); assert_eq!(metrics.num_removed_files, 1); assert!(metrics.scan_time_ms > 0); - assert_eq!(metrics.num_deleted_rows, Some(1)); - assert_eq!(metrics.num_copied_rows, Some(3)); + assert_eq!(metrics.num_deleted_rows, 1); + assert_eq!(metrics.num_copied_rows, 3); let commit_info = table.history(None).await.unwrap(); let last_commit = &commit_info[0]; @@ -653,10 +751,9 @@ mod tests { assert_eq!(metrics.num_added_files, 0); assert_eq!(metrics.num_removed_files, 1); - assert_eq!(metrics.num_deleted_rows, None); - assert_eq!(metrics.num_copied_rows, None); + assert_eq!(metrics.num_deleted_rows, 0); + assert_eq!(metrics.num_copied_rows, 0); assert!(metrics.scan_time_ms > 0); - assert_eq!(metrics.rewrite_time_ms, 0); let expected = vec![ "+----+-------+------------+", @@ -715,8 +812,8 @@ mod tests { assert_eq!(metrics.num_added_files, 0); assert_eq!(metrics.num_removed_files, 1); - assert_eq!(metrics.num_deleted_rows, Some(1)); - assert_eq!(metrics.num_copied_rows, Some(0)); + assert_eq!(metrics.num_deleted_rows, 1); + assert_eq!(metrics.num_copied_rows, 0); assert!(metrics.scan_time_ms > 0); let expected = [ @@ -799,4 +896,174 @@ mod tests { .await; assert!(res.is_err()); } + + #[tokio::test] + async fn test_delete_cdc_enabled() { + let table: DeltaTable = DeltaOps::new_in_memory() + .create() + .with_column( + "value", + DeltaDataType::Primitive(PrimitiveType::Integer), + true, + None, + ) + .with_configuration_property(DeltaConfigKey::EnableChangeDataFeed, Some("true")) + .await + .unwrap(); + assert_eq!(table.version(), 0); + + let schema = Arc::new(Schema::new(vec![Field::new( + "value", + arrow::datatypes::DataType::Int32, + true, + )])); + + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)]))], + ) + .unwrap(); + let table = DeltaOps(table) + .write(vec![batch]) + .await + .expect("Failed to write first batch"); + assert_eq!(table.version(), 1); + + let (table, _metrics) = DeltaOps(table) + .delete() + .with_predicate(col("value").eq(lit(2))) + .await + .unwrap(); + assert_eq!(table.version(), 2); + + let ctx = SessionContext::new(); + let table = DeltaOps(table) + .load_cdf() + .with_session_ctx(ctx.clone()) + .with_starting_version(0) + .build() + .await + .expect("Failed to load CDF"); + + let mut batches = collect_batches( + table.properties().output_partitioning().partition_count(), + table, + ctx, + ) + .await + .expect("Failed to collect batches"); + + // The batches will contain a current _commit_timestamp which shouldn't be check_append_only + let _: Vec<_> = batches.iter_mut().map(|b| b.remove_column(3)).collect(); + + assert_batches_sorted_eq! 
{[ + "+-------+--------------+-----------------+", + "| value | _change_type | _commit_version |", + "+-------+--------------+-----------------+", + "| 1 | insert | 1 |", + "| 2 | delete | 2 |", + "| 2 | insert | 1 |", + "| 3 | insert | 1 |", + "+-------+--------------+-----------------+", + ], &batches } + } + + #[tokio::test] + async fn test_delete_cdc_enabled_partitioned() { + let table: DeltaTable = DeltaOps::new_in_memory() + .create() + .with_column( + "year", + DeltaDataType::Primitive(PrimitiveType::String), + true, + None, + ) + .with_column( + "value", + DeltaDataType::Primitive(PrimitiveType::Integer), + true, + None, + ) + .with_partition_columns(vec!["year"]) + .with_configuration_property(DeltaConfigKey::EnableChangeDataFeed, Some("true")) + .await + .unwrap(); + assert_eq!(table.version(), 0); + + let schema = Arc::new(Schema::new(vec![ + Field::new("year", DataType::Utf8, true), + Field::new("value", DataType::Int32, true), + ])); + + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(StringArray::from(vec![ + Some("2020"), + Some("2020"), + Some("2024"), + ])), + Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)])), + ], + ) + .unwrap(); + + let table = DeltaOps(table) + .write(vec![batch]) + .await + .expect("Failed to write first batch"); + assert_eq!(table.version(), 1); + + let (table, _metrics) = DeltaOps(table) + .delete() + .with_predicate(col("value").eq(lit(2))) + .await + .unwrap(); + assert_eq!(table.version(), 2); + + let ctx = SessionContext::new(); + let table = DeltaOps(table) + .load_cdf() + .with_session_ctx(ctx.clone()) + .with_starting_version(0) + .build() + .await + .expect("Failed to load CDF"); + + let mut batches = collect_batches( + table.properties().output_partitioning().partition_count(), + table, + ctx, + ) + .await + .expect("Failed to collect batches"); + + // The batches will contain a current _commit_timestamp which shouldn't be check_append_only + let _: Vec<_> = batches.iter_mut().map(|b| b.remove_column(3)).collect(); + + assert_batches_sorted_eq! {[ + "+-------+--------------+-----------------+------+", + "| value | _change_type | _commit_version | year |", + "+-------+--------------+-----------------+------+", + "| 1 | insert | 1 | 2020 |", + "| 2 | delete | 2 | 2020 |", + "| 2 | insert | 1 | 2020 |", + "| 3 | insert | 1 | 2024 |", + "+-------+--------------+-----------------+------+", + ], &batches } + } + + async fn collect_batches( + num_partitions: usize, + stream: DeltaCdfScan, + ctx: SessionContext, + ) -> Result, Box> { + let mut batches = vec![]; + for p in 0..num_partitions { + let data: Vec = + collect_sendable_stream(stream.execute(p, ctx.task_ctx())?).await?; + batches.extend_from_slice(&data); + } + Ok(batches) + } } diff --git a/crates/core/src/operations/drop_constraints.rs b/crates/core/src/operations/drop_constraints.rs new file mode 100644 index 0000000000..0941b99552 --- /dev/null +++ b/crates/core/src/operations/drop_constraints.rs @@ -0,0 +1,183 @@ +//! 
Drop a constraint from a table + +use futures::future::BoxFuture; + +use super::transaction::{CommitBuilder, CommitProperties}; +use crate::kernel::Action; +use crate::logstore::LogStoreRef; +use crate::protocol::DeltaOperation; +use crate::table::state::DeltaTableState; +use crate::DeltaTable; +use crate::{DeltaResult, DeltaTableError}; + +/// Remove constraints from the table +pub struct DropConstraintBuilder { + /// A snapshot of the table's state + snapshot: DeltaTableState, + /// Name of the constraint + name: Option, + /// Raise if constraint doesn't exist + raise_if_not_exists: bool, + /// Delta object store for handling data files + log_store: LogStoreRef, + /// Additional information to add to the commit + commit_properties: CommitProperties, +} + +impl super::Operation<()> for DropConstraintBuilder {} + +impl DropConstraintBuilder { + /// Create a new builder + pub fn new(log_store: LogStoreRef, snapshot: DeltaTableState) -> Self { + Self { + name: None, + raise_if_not_exists: true, + snapshot, + log_store, + commit_properties: CommitProperties::default(), + } + } + + /// Specify the constraint to be removed + pub fn with_constraint>(mut self, name: S) -> Self { + self.name = Some(name.into()); + self + } + + /// Specify if you want to raise if the constraint does not exist + pub fn with_raise_if_not_exists(mut self, raise: bool) -> Self { + self.raise_if_not_exists = raise; + self + } + + /// Additional metadata to be added to commit info + pub fn with_commit_properties(mut self, commit_properties: CommitProperties) -> Self { + self.commit_properties = commit_properties; + self + } +} + +impl std::future::IntoFuture for DropConstraintBuilder { + type Output = DeltaResult; + + type IntoFuture = BoxFuture<'static, Self::Output>; + + fn into_future(self) -> Self::IntoFuture { + let this = self; + + Box::pin(async move { + let name = this + .name + .ok_or(DeltaTableError::Generic("No name provided".to_string()))?; + + let mut metadata = this.snapshot.metadata().clone(); + let configuration_key = format!("delta.constraints.{}", name); + + if metadata.configuration.remove(&configuration_key).is_none() { + if this.raise_if_not_exists { + return Err(DeltaTableError::Generic(format!( + "Constraint with name: {} doesn't exists", + name + ))); + } + return Ok(DeltaTable::new_with_state(this.log_store, this.snapshot)); + } + let operation = DeltaOperation::DropConstraint { name: name.clone() }; + + let actions = vec![Action::Metadata(metadata)]; + + let commit = CommitBuilder::from(this.commit_properties) + .with_actions(actions) + .build(Some(&this.snapshot), this.log_store.clone(), operation) + .await?; + + Ok(DeltaTable::new_with_state( + this.log_store, + commit.snapshot(), + )) + }) + } +} + +#[cfg(feature = "datafusion")] +#[cfg(test)] +mod tests { + use crate::writer::test_utils::{create_bare_table, get_record_batch}; + use crate::{DeltaOps, DeltaResult, DeltaTable}; + + async fn get_constraint_op_params(table: &mut DeltaTable) -> String { + let commit_info = table.history(None).await.unwrap(); + let last_commit = &commit_info[0]; + + last_commit + .operation_parameters + .as_ref() + .unwrap() + .get("name") + .unwrap() + .as_str() + .unwrap() + .to_owned() + } + + #[tokio::test] + async fn drop_valid_constraint() -> DeltaResult<()> { + let batch = get_record_batch(None, false); + let write = DeltaOps(create_bare_table()) + .write(vec![batch.clone()]) + .await?; + let table = DeltaOps(write); + + let table = table + .add_constraint() + .with_constraint("id", "value < 1000") + 
.await?; + + let mut table = DeltaOps(table) + .drop_constraints() + .with_constraint("id") + .await?; + + let expected_name = "id"; + assert_eq!(get_constraint_op_params(&mut table).await, expected_name); + assert_eq!(table.metadata().unwrap().configuration.get("id"), None); + Ok(()) + } + + #[tokio::test] + async fn drop_invalid_constraint_not_existing() -> DeltaResult<()> { + let batch = get_record_batch(None, false); + let write = DeltaOps(create_bare_table()) + .write(vec![batch.clone()]) + .await?; + + let table = DeltaOps(write) + .drop_constraints() + .with_constraint("not_existing") + .await; + assert!(table.is_err()); + + Ok(()) + } + + #[tokio::test] + async fn drop_invalid_constraint_ignore() -> DeltaResult<()> { + let batch = get_record_batch(None, false); + let write = DeltaOps(create_bare_table()) + .write(vec![batch.clone()]) + .await?; + + let version = write.version(); + + let table = DeltaOps(write) + .drop_constraints() + .with_constraint("not_existing") + .with_raise_if_not_exists(false) + .await?; + + let version_after = table.version(); + + assert_eq!(version, version_after); + Ok(()) + } +} diff --git a/crates/core/src/operations/filesystem_check.rs b/crates/core/src/operations/filesystem_check.rs index 923f0aea54..44fa84d29a 100644 --- a/crates/core/src/operations/filesystem_check.rs +++ b/crates/core/src/operations/filesystem_check.rs @@ -27,11 +27,13 @@ use url::{ParseError, Url}; use crate::errors::{DeltaResult, DeltaTableError}; use crate::kernel::{Action, Add, Remove}; use crate::logstore::LogStoreRef; -use crate::operations::transaction::commit; use crate::protocol::DeltaOperation; use crate::table::state::DeltaTableState; use crate::DeltaTable; +use super::transaction::CommitBuilder; +use super::transaction::CommitProperties; + /// Audit the Delta Table's active files with the underlying file system. /// See this module's documentation for more information #[derive(Debug)] @@ -42,8 +44,8 @@ pub struct FileSystemCheckBuilder { log_store: LogStoreRef, /// Don't remove actions to the table log. Just determine which files can be removed dry_run: bool, - /// Additional metadata to be added to commit - app_metadata: Option>, + /// Commit properties and configuration + commit_properties: CommitProperties, } /// Details of the FSCK operation including which files were removed from the log @@ -73,6 +75,8 @@ fn is_absolute_path(path: &str) -> DeltaResult { } } +impl super::Operation<()> for FileSystemCheckBuilder {} + impl FileSystemCheckBuilder { /// Create a new [`FileSystemCheckBuilder`] pub fn new(log_store: LogStoreRef, state: DeltaTableState) -> Self { @@ -80,7 +84,7 @@ impl FileSystemCheckBuilder { snapshot: state, log_store, dry_run: false, - app_metadata: None, + commit_properties: CommitProperties::default(), } } @@ -90,12 +94,9 @@ impl FileSystemCheckBuilder { self } - /// Additional metadata to be added to commit info - pub fn with_metadata( - mut self, - metadata: impl IntoIterator, - ) -> Self { - self.app_metadata = Some(HashMap::from_iter(metadata)); + /// Additonal information to write to the commit + pub fn with_commit_properties(mut self, commit_properties: CommitProperties) -> Self { + self.commit_properties = commit_properties; self } @@ -104,7 +105,7 @@ impl FileSystemCheckBuilder { HashMap::with_capacity(self.snapshot.file_actions()?.len()); let log_store = self.log_store.clone(); - for active in self.snapshot.file_actions()? { + for active in self.snapshot.file_actions_iter()? { if is_absolute_path(&active.path)? 
{ return Err(DeltaTableError::Generic( "Filesystem check does not support absolute paths".to_string(), @@ -141,7 +142,7 @@ impl FileSystemCheckPlan { pub async fn execute( self, snapshot: &DeltaTableState, - app_metadata: Option>, + mut commit_properties: CommitProperties, ) -> DeltaResult { if self.files_to_remove.is_empty() { return Ok(FileSystemCheckMetrics { @@ -175,25 +176,22 @@ impl FileSystemCheckPlan { files_removed: removed_file_paths, }; - let mut app_metadata = match app_metadata { - Some(meta) => meta, - None => HashMap::new(), - }; - - app_metadata.insert("readVersion".to_owned(), snapshot.version().into()); - if let Ok(map) = serde_json::to_value(&metrics) { - app_metadata.insert("operationMetrics".to_owned(), map); - } - - commit( - self.log_store.as_ref(), - &actions, - DeltaOperation::FileSystemCheck {}, - Some(snapshot), - // TODO pass through metadata - Some(app_metadata), - ) - .await?; + commit_properties + .app_metadata + .insert("readVersion".to_owned(), snapshot.version().into()); + commit_properties.app_metadata.insert( + "operationMetrics".to_owned(), + serde_json::to_value(&metrics)?, + ); + + CommitBuilder::from(commit_properties) + .with_actions(actions) + .build( + Some(snapshot), + self.log_store.clone(), + DeltaOperation::FileSystemCheck {}, + ) + .await?; Ok(metrics) } @@ -218,7 +216,7 @@ impl std::future::IntoFuture for FileSystemCheckBuilder { )); } - let metrics = plan.execute(&this.snapshot, this.app_metadata).await?; + let metrics = plan.execute(&this.snapshot, this.commit_properties).await?; let mut table = DeltaTable::new_with_state(this.log_store, this.snapshot); table.update().await?; Ok((table, metrics)) diff --git a/crates/core/src/operations/load.rs b/crates/core/src/operations/load.rs index 2eac151052..4bf439cd0d 100644 --- a/crates/core/src/operations/load.rs +++ b/crates/core/src/operations/load.rs @@ -7,6 +7,7 @@ use datafusion::physical_plan::{ExecutionPlan, SendableRecordBatchStream}; use futures::future::BoxFuture; use super::transaction::PROTOCOL; +use crate::delta_datafusion::DataFusionMixins; use crate::errors::{DeltaResult, DeltaTableError}; use crate::logstore::LogStoreRef; use crate::table::state::DeltaTableState; @@ -22,6 +23,8 @@ pub struct LoadBuilder { columns: Option>, } +impl super::Operation<()> for LoadBuilder {} + impl LoadBuilder { /// Create a new [`LoadBuilder`] pub fn new(log_store: LogStoreRef, snapshot: DeltaTableState) -> Self { @@ -47,7 +50,7 @@ impl std::future::IntoFuture for LoadBuilder { let this = self; Box::pin(async move { - PROTOCOL.can_read_from(&this.snapshot)?; + PROTOCOL.can_read_from(&this.snapshot.snapshot)?; let table = DeltaTable::new_with_state(this.log_store, this.snapshot); let schema = table.snapshot()?.arrow_schema()?; diff --git a/crates/core/src/operations/load_cdf.rs b/crates/core/src/operations/load_cdf.rs new file mode 100644 index 0000000000..57542ab668 --- /dev/null +++ b/crates/core/src/operations/load_cdf.rs @@ -0,0 +1,547 @@ +//! 
Module for reading the change datafeed of delta tables + +use datafusion_physical_expr::{ + expressions::{self}, + PhysicalExpr, +}; +use std::sync::Arc; +use std::time::SystemTime; + +use arrow_schema::{ArrowError, Field}; +use chrono::{DateTime, Utc}; +use datafusion::datasource::file_format::parquet::ParquetFormat; +use datafusion::datasource::file_format::FileFormat; +use datafusion::datasource::physical_plan::FileScanConfig; +use datafusion::physical_plan::projection::ProjectionExec; +use datafusion::physical_plan::union::UnionExec; +use datafusion::physical_plan::ExecutionPlan; +use datafusion::prelude::SessionContext; +use datafusion_common::{ScalarValue, Statistics}; +use tracing::log; + +use crate::delta_datafusion::cdf::*; +use crate::delta_datafusion::{register_store, DataFusionMixins}; +use crate::errors::DeltaResult; +use crate::kernel::{Action, Add, AddCDCFile, CommitInfo}; +use crate::logstore::{get_actions, LogStoreRef}; +use crate::table::state::DeltaTableState; +use crate::DeltaTableError; + +/// Builder for create a read of change data feeds for delta tables +#[derive(Clone)] +pub struct CdfLoadBuilder { + /// A snapshot of the to-be-loaded table's state + snapshot: DeltaTableState, + /// Delta object store for handling data files + log_store: LogStoreRef, + /// Columns to project + columns: Option>, + /// Version to read from + starting_version: i64, + /// Version to stop reading at + ending_version: Option, + /// Starting timestamp of commits to accept + starting_timestamp: Option>, + /// Ending timestamp of commits to accept + ending_timestamp: Option>, + /// Provided Datafusion context + ctx: SessionContext, +} + +impl CdfLoadBuilder { + /// Create a new [`LoadBuilder`] + pub fn new(log_store: LogStoreRef, snapshot: DeltaTableState) -> Self { + Self { + snapshot, + log_store, + columns: None, + starting_version: 0, + ending_version: None, + starting_timestamp: None, + ending_timestamp: None, + ctx: SessionContext::new(), + } + } + + /// Version to start at (version 0 if not provided) + pub fn with_starting_version(mut self, starting_version: i64) -> Self { + self.starting_version = starting_version; + self + } + + /// Version (inclusive) to end at + pub fn with_ending_version(mut self, ending_version: i64) -> Self { + self.ending_version = Some(ending_version); + self + } + + /// Provide a datafusion session context + pub fn with_session_ctx(mut self, ctx: SessionContext) -> Self { + self.ctx = ctx; + self + } + + /// Timestamp (inclusive) to end at + pub fn with_ending_timestamp(mut self, timestamp: DateTime) -> Self { + self.ending_timestamp = Some(timestamp); + self + } + + /// Timestamp to start from + pub fn with_starting_timestamp(mut self, timestamp: DateTime) -> Self { + self.starting_timestamp = Some(timestamp); + self + } + + /// Columns to select + pub fn with_columns(mut self, columns: Vec) -> Self { + self.columns = Some(columns); + self + } + + /// This is a rust version of https://github.com/delta-io/delta/blob/master/spark/src/main/scala/org/apache/spark/sql/delta/commands/cdc/CDCReader.scala#L418 + /// Which iterates through versions of the delta table collects the relevant actions / commit info and returns those + /// groupings for later use. The scala implementation has a lot more edge case handling and read schema checking (and just error checking in general) + /// than I have right now. I plan to extend the checks once we have a stable state of the initial implementation. 
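/// In short: for every commit version in the requested range this returns two groupings,
/// versions carrying `cdc` actions (whose change files are read directly) and versions
/// without them (whose data-changing `add` actions are replayed and later labelled as
/// `insert`), while commits outside the optional timestamp window are skipped.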
+ async fn determine_files_to_read( + &self, + ) -> DeltaResult<(Vec>, Vec>)> { + let start = self.starting_version; + let end = self + .ending_version + .unwrap_or(self.log_store.get_latest_version(start).await?); + + if end < start { + return Err(DeltaTableError::ChangeDataInvalidVersionRange { start, end }); + } + + let starting_timestamp = self.starting_timestamp.unwrap_or(DateTime::UNIX_EPOCH); + let ending_timestamp = self + .ending_timestamp + .unwrap_or(DateTime::from(SystemTime::now())); + + log::debug!( + "starting timestamp = {:?}, ending timestamp = {:?}", + &starting_timestamp, + &ending_timestamp + ); + log::debug!("starting version = {}, ending version = {:?}", start, end); + + let mut change_files = vec![]; + let mut add_files = vec![]; + + for version in start..=end { + let snapshot_bytes = self + .log_store + .read_commit_entry(version) + .await? + .ok_or(DeltaTableError::InvalidVersion(version))?; + let version_actions = get_actions(version, snapshot_bytes).await?; + + let mut ts = 0; + let mut cdc_actions = vec![]; + + if self.starting_timestamp.is_some() || self.ending_timestamp.is_some() { + let version_commit = version_actions + .iter() + .find(|a| matches!(a, Action::CommitInfo(_))); + if let Some(Action::CommitInfo(CommitInfo { + timestamp: Some(t), .. + })) = version_commit + { + if starting_timestamp.timestamp_millis() > *t + || *t > ending_timestamp.timestamp_millis() + { + log::debug!("Version: {} skipped, due to commit timestamp", version); + continue; + } + } + } + + for action in &version_actions { + match action { + Action::Cdc(f) => cdc_actions.push(f.clone()), + Action::Metadata(md) => { + log::info!("Metadata: {:?}", &md); + if let Some(Some(key)) = &md.configuration.get("delta.enableChangeDataFeed") + { + let key = key.to_lowercase(); + // Check here to ensure the CDC function is enabled for the first version of the read + // and check in subsequent versions only that it was not disabled. + if (version == start && key != "true") || key == "false" { + return Err(DeltaTableError::ChangeDataNotRecorded { + version, + start, + end, + }); + } + } else if version == start { + return Err(DeltaTableError::ChangeDataNotEnabled { version }); + }; + } + Action::CommitInfo(ci) => { + ts = ci.timestamp.unwrap_or(0); + } + _ => {} + } + } + + if !cdc_actions.is_empty() { + log::debug!( + "Located {} cdf actions for version: {}", + cdc_actions.len(), + version + ); + change_files.push(CdcDataSpec::new(version, ts, cdc_actions)) + } else { + let add_actions = version_actions + .iter() + .filter_map(|a| match a { + Action::Add(a) if a.data_change => Some(a.clone()), + _ => None, + }) + .collect::>(); + + if !add_actions.is_empty() { + log::debug!( + "Located {} cdf actions for version: {}", + add_actions.len(), + version + ); + add_files.push(CdcDataSpec::new(version, ts, add_actions)); + } + } + } + + Ok((change_files, add_files)) + } + + #[inline] + fn get_add_action_type() -> Option { + Some(ScalarValue::Utf8(Some(String::from("insert")))) + } + + /// Executes the scan + pub async fn build(&self) -> DeltaResult { + let (cdc, add) = self.determine_files_to_read().await?; + register_store( + self.log_store.clone(), + self.ctx.state().runtime_env().clone(), + ); + + let partition_values = self.snapshot.metadata().partition_columns.clone(); + let schema = self.snapshot.input_schema()?; + let schema_fields: Vec = self + .snapshot + .input_schema()? 
+            .flattened_fields()
+            .into_iter()
+            .filter(|f| !partition_values.contains(f.name()))
+            .cloned()
+            .collect();
+
+        let this_partition_values = partition_values
+            .iter()
+            .map(|name| schema.field_with_name(name).map(|f| f.to_owned()))
+            .collect::<Result<Vec<Field>, ArrowError>>()?;
+
+        // Setup for the Read Schemas of each kind of file, CDC files include commit action type so they need a slightly
+        // different schema than standard add file reads
+        let cdc_file_schema = create_cdc_schema(schema_fields.clone(), true);
+        let add_file_schema = create_cdc_schema(schema_fields, false);
+
+        // Set up the mapping of partition columns to be projected into the final output batch
+        // cdc for example has timestamp, version, and any table partitions mapped here.
+        // add on the other hand has action type, timestamp, version and any additional table partitions because adds do
+        // not include their actions
+        let mut cdc_partition_cols = CDC_PARTITION_SCHEMA.clone();
+        let mut add_partition_cols = ADD_PARTITION_SCHEMA.clone();
+        cdc_partition_cols.extend_from_slice(&this_partition_values);
+        add_partition_cols.extend_from_slice(&this_partition_values);
+
+        // Set up the partition to physical file mapping, this is a mostly unmodified version of what is done in load
+        let cdc_file_groups =
+            create_partition_values(schema.clone(), cdc, &partition_values, None)?;
+        let add_file_groups = create_partition_values(
+            schema.clone(),
+            add,
+            &partition_values,
+            Self::get_add_action_type(),
+        )?;
+
+        // Create the parquet scans for each associated type of file. I am not sure when we would use removes yet, but
+        // they would be here if / when they are necessary
+        let cdc_scan = ParquetFormat::new()
+            .create_physical_plan(
+                &self.ctx.state(),
+                FileScanConfig {
+                    object_store_url: self.log_store.object_store_url(),
+                    file_schema: cdc_file_schema.clone(),
+                    file_groups: cdc_file_groups.into_values().collect(),
+                    statistics: Statistics::new_unknown(&cdc_file_schema),
+                    projection: None,
+                    limit: None,
+                    table_partition_cols: cdc_partition_cols,
+                    output_ordering: vec![],
+                },
+                None,
+            )
+            .await?;
+
+        let add_scan = ParquetFormat::new()
+            .create_physical_plan(
+                &self.ctx.state(),
+                FileScanConfig {
+                    object_store_url: self.log_store.object_store_url(),
+                    file_schema: add_file_schema.clone(),
+                    file_groups: add_file_groups.into_values().collect(),
+                    statistics: Statistics::new_unknown(&add_file_schema),
+                    projection: None,
+                    limit: None,
+                    table_partition_cols: add_partition_cols,
+                    output_ordering: vec![],
+                },
+                None,
+            )
+            .await?;
+
+        // The output batches are then unioned to create a single output. Coalesce partitions is only here for the time
+        // being for development. I plan to parallelize the reads once the base idea is correct.
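Editor's note: as a rough illustration of the schema bookkeeping above, the CDF output ends up as the table's data fields followed by the CDF metadata columns and any partition columns. The field types below are an approximation inferred from the expected test output later in this file, not taken from the CDC_PARTITION_SCHEMA / ADD_PARTITION_SCHEMA constants themselves.

```rust
// Illustrative only: approximate shape of the combined CDF read schema.
use arrow_schema::{DataType, Field, Schema, TimeUnit};

fn approx_cdf_output_schema(data_fields: Vec<Field>, partition_fields: Vec<Field>) -> Schema {
    let mut fields = data_fields;
    // `_change_type` is read from CDC files directly; for plain add files it is
    // injected as the constant partition value "insert" (see get_add_action_type).
    fields.push(Field::new("_change_type", DataType::Utf8, true));
    fields.push(Field::new("_commit_version", DataType::Int64, true));
    fields.push(Field::new(
        "_commit_timestamp",
        DataType::Timestamp(TimeUnit::Millisecond, None),
        true,
    ));
    fields.extend(partition_fields);
    Schema::new(fields)
}
```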
+        let mut union_scan: Arc<dyn ExecutionPlan> =
+            Arc::new(UnionExec::new(vec![cdc_scan, add_scan]));
+
+        if let Some(columns) = &self.columns {
+            let expressions: Vec<(Arc<dyn PhysicalExpr>, String)> = union_scan
+                .schema()
+                .fields()
+                .into_iter()
+                .enumerate()
+                .map(|(idx, field)| -> (Arc<dyn PhysicalExpr>, String) {
+                    let field_name = field.name();
+                    let expr = Arc::new(expressions::Column::new(field_name, idx));
+                    (expr, field_name.to_owned())
+                })
+                .filter(|(_, field_name)| columns.contains(field_name))
+                .collect();
+            union_scan = Arc::new(ProjectionExec::try_new(expressions, union_scan)?);
+        }
+        Ok(DeltaCdfScan::new(union_scan))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::error::Error;
+    use std::str::FromStr;
+
+    use arrow_array::RecordBatch;
+    use chrono::NaiveDateTime;
+    use datafusion::physical_plan::ExecutionPlan;
+    use datafusion::prelude::SessionContext;
+    use datafusion_common::assert_batches_sorted_eq;
+
+    use crate::delta_datafusion::cdf::DeltaCdfScan;
+    use crate::operations::collect_sendable_stream;
+    use crate::writer::test_utils::TestResult;
+    use crate::DeltaOps;
+
+    async fn collect_batches(
+        num_partitions: usize,
+        stream: DeltaCdfScan,
+        ctx: SessionContext,
+    ) -> Result<Vec<RecordBatch>, Box<dyn Error>> {
+        let mut batches = vec![];
+        for p in 0..num_partitions {
+            let data: Vec<RecordBatch> =
+                collect_sendable_stream(stream.execute(p, ctx.task_ctx())?).await?;
+            batches.extend_from_slice(&data);
+        }
+        Ok(batches)
+    }
+
+    #[tokio::test]
+    async fn test_load_local() -> TestResult {
+        let ctx = SessionContext::new();
+        let table = DeltaOps::try_from_uri("../test/tests/data/cdf-table")
+            .await?
+            .load_cdf()
+            .with_session_ctx(ctx.clone())
+            .with_starting_version(0)
+            .build()
+            .await?;
+
+        let batches = collect_batches(
+            table.properties().output_partitioning().partition_count(),
+            table,
+            ctx,
+        )
+        .await?;
+        assert_batches_sorted_eq!
{ + ["+----+--------+------------------+-----------------+-------------------------+------------+", + "| id | name | _change_type | _commit_version | _commit_timestamp | birthday |", + "+----+--------+------------------+-----------------+-------------------------+------------+", + "| 7 | Dennis | delete | 3 | 2024-01-06T16:44:59.570 | 2023-12-29 |", + "| 3 | Dave | update_preimage | 1 | 2023-12-22T17:10:21.675 | 2023-12-23 |", + "| 4 | Kate | update_preimage | 1 | 2023-12-22T17:10:21.675 | 2023-12-23 |", + "| 2 | Bob | update_preimage | 1 | 2023-12-22T17:10:21.675 | 2023-12-23 |", + "| 7 | Dennis | update_preimage | 2 | 2023-12-29T21:41:33.785 | 2023-12-24 |", + "| 5 | Emily | update_preimage | 2 | 2023-12-29T21:41:33.785 | 2023-12-24 |", + "| 6 | Carl | update_preimage | 2 | 2023-12-29T21:41:33.785 | 2023-12-24 |", + "| 7 | Dennis | update_postimage | 2 | 2023-12-29T21:41:33.785 | 2023-12-29 |", + "| 5 | Emily | update_postimage | 2 | 2023-12-29T21:41:33.785 | 2023-12-29 |", + "| 6 | Carl | update_postimage | 2 | 2023-12-29T21:41:33.785 | 2023-12-29 |", + "| 3 | Dave | update_postimage | 1 | 2023-12-22T17:10:21.675 | 2023-12-22 |", + "| 4 | Kate | update_postimage | 1 | 2023-12-22T17:10:21.675 | 2023-12-22 |", + "| 2 | Bob | update_postimage | 1 | 2023-12-22T17:10:21.675 | 2023-12-22 |", + "| 2 | Bob | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-23 |", + "| 3 | Dave | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-23 |", + "| 4 | Kate | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-23 |", + "| 5 | Emily | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-24 |", + "| 6 | Carl | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-24 |", + "| 7 | Dennis | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-24 |", + "| 1 | Steve | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-22 |", + "| 8 | Claire | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-25 |", + "| 9 | Ada | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-25 |", + "| 10 | Borb | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-25 |", + "+----+--------+------------------+-----------------+-------------------------+------------+" + ], &batches } + Ok(()) + } + + #[tokio::test] + async fn test_load_local_datetime() -> TestResult { + let ctx = SessionContext::new(); + let starting_timestamp = NaiveDateTime::from_str("2023-12-22T17:10:21.675").unwrap(); + let table = DeltaOps::try_from_uri("../test/tests/data/cdf-table") + .await? + .load_cdf() + .with_session_ctx(ctx.clone()) + .with_ending_timestamp(starting_timestamp.and_utc()) + .build() + .await?; + + let batches = collect_batches( + table.properties().output_partitioning().partition_count(), + table, + ctx, + ) + .await?; + + assert_batches_sorted_eq! 
{ + ["+----+--------+------------------+-----------------+-------------------------+------------+", + "| id | name | _change_type | _commit_version | _commit_timestamp | birthday |", + "+----+--------+------------------+-----------------+-------------------------+------------+", + "| 3 | Dave | update_preimage | 1 | 2023-12-22T17:10:21.675 | 2023-12-23 |", + "| 4 | Kate | update_preimage | 1 | 2023-12-22T17:10:21.675 | 2023-12-23 |", + "| 2 | Bob | update_preimage | 1 | 2023-12-22T17:10:21.675 | 2023-12-23 |", + "| 3 | Dave | update_postimage | 1 | 2023-12-22T17:10:21.675 | 2023-12-22 |", + "| 4 | Kate | update_postimage | 1 | 2023-12-22T17:10:21.675 | 2023-12-22 |", + "| 2 | Bob | update_postimage | 1 | 2023-12-22T17:10:21.675 | 2023-12-22 |", + "| 2 | Bob | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-23 |", + "| 3 | Dave | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-23 |", + "| 4 | Kate | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-23 |", + "| 8 | Claire | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-25 |", + "| 9 | Ada | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-25 |", + "| 10 | Borb | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-25 |", + "| 1 | Steve | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-22 |", + "| 5 | Emily | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-24 |", + "| 6 | Carl | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-24 |", + "| 7 | Dennis | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-24 |", + "+----+--------+------------------+-----------------+-------------------------+------------+" + ], + &batches + } + Ok(()) + } + + #[tokio::test] + async fn test_load_local_non_partitioned() -> TestResult { + let ctx = SessionContext::new(); + let table = DeltaOps::try_from_uri("../test/tests/data/cdf-table-non-partitioned") + .await? + .load_cdf() + .with_session_ctx(ctx.clone()) + .with_starting_version(0) + .build() + .await?; + + let batches = collect_batches( + table.properties().output_partitioning().partition_count(), + table, + ctx, + ) + .await?; + + assert_batches_sorted_eq! 
{ + ["+----+--------+------------+-------------------+---------------+--------------+----------------+------------------+-----------------+-------------------------+", + "| id | name | birthday | long_field | boolean_field | double_field | smallint_field | _change_type | _commit_version | _commit_timestamp |", + "+----+--------+------------+-------------------+---------------+--------------+----------------+------------------+-----------------+-------------------------+", + "| 7 | Dennis | 2024-04-14 | 6 | true | 3.14 | 1 | delete | 3 | 2024-04-14T15:58:32.495 |", + "| 3 | Dave | 2024-04-15 | 2 | true | 3.14 | 1 | update_preimage | 1 | 2024-04-14T15:58:29.393 |", + "| 3 | Dave | 2024-04-14 | 2 | true | 3.14 | 1 | update_postimage | 1 | 2024-04-14T15:58:29.393 |", + "| 4 | Kate | 2024-04-15 | 3 | true | 3.14 | 1 | update_preimage | 1 | 2024-04-14T15:58:29.393 |", + "| 4 | Kate | 2024-04-14 | 3 | true | 3.14 | 1 | update_postimage | 1 | 2024-04-14T15:58:29.393 |", + "| 2 | Bob | 2024-04-15 | 1 | true | 3.14 | 1 | update_preimage | 1 | 2024-04-14T15:58:29.393 |", + "| 2 | Bob | 2024-04-14 | 1 | true | 3.14 | 1 | update_postimage | 1 | 2024-04-14T15:58:29.393 |", + "| 7 | Dennis | 2024-04-16 | 6 | true | 3.14 | 1 | update_preimage | 2 | 2024-04-14T15:58:31.257 |", + "| 7 | Dennis | 2024-04-14 | 6 | true | 3.14 | 1 | update_postimage | 2 | 2024-04-14T15:58:31.257 |", + "| 5 | Emily | 2024-04-16 | 4 | true | 3.14 | 1 | update_preimage | 2 | 2024-04-14T15:58:31.257 |", + "| 5 | Emily | 2024-04-14 | 4 | true | 3.14 | 1 | update_postimage | 2 | 2024-04-14T15:58:31.257 |", + "| 6 | Carl | 2024-04-16 | 5 | true | 3.14 | 1 | update_preimage | 2 | 2024-04-14T15:58:31.257 |", + "| 6 | Carl | 2024-04-14 | 5 | true | 3.14 | 1 | update_postimage | 2 | 2024-04-14T15:58:31.257 |", + "| 1 | Alex | 2024-04-14 | 1 | true | 3.14 | 1 | insert | 4 | 2024-04-14T15:58:33.444 |", + "| 2 | Alan | 2024-04-15 | 1 | true | 3.14 | 1 | insert | 4 | 2024-04-14T15:58:33.444 |", + "| 1 | Steve | 2024-04-14 | 1 | true | 3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", + "| 2 | Bob | 2024-04-15 | 1 | true | 3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", + "| 3 | Dave | 2024-04-15 | 2 | true | 3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", + "| 4 | Kate | 2024-04-15 | 3 | true | 3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", + "| 5 | Emily | 2024-04-16 | 4 | true | 3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", + "| 6 | Carl | 2024-04-16 | 5 | true | 3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", + "| 7 | Dennis | 2024-04-16 | 6 | true | 3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", + "| 8 | Claire | 2024-04-17 | 7 | true | 3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", + "| 9 | Ada | 2024-04-17 | 8 | true | 3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", + "| 10 | Borb | 2024-04-17 | 99999999999999999 | true | 3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", + "+----+--------+------------+-------------------+---------------+--------------+----------------+------------------+-----------------+-------------------------+"], + &batches + } + Ok(()) + } + + #[tokio::test] + async fn test_load_bad_version_range() -> TestResult { + let table = DeltaOps::try_from_uri("../test/tests/data/cdf-table-non-partitioned") + .await? + .load_cdf() + .with_starting_version(4) + .with_ending_version(1) + .build() + .await; + + assert!(table.is_err()); + assert!(matches!( + table.unwrap_err(), + DeltaTableError::ChangeDataInvalidVersionRange { .. 
} + )); + + Ok(()) + } + + #[tokio::test] + async fn test_load_non_cdf() -> TestResult { + let table = DeltaOps::try_from_uri("../test/tests/data/simple_table") + .await? + .load_cdf() + .with_starting_version(0) + .build() + .await; + + assert!(table.is_err()); + assert!(matches!( + table.unwrap_err(), + DeltaTableError::ChangeDataNotEnabled { .. } + )); + + Ok(()) + } +} diff --git a/crates/core/src/operations/merge/barrier.rs b/crates/core/src/operations/merge/barrier.rs index f1df28c4a4..e9b2f8fd00 100644 --- a/crates/core/src/operations/merge/barrier.rs +++ b/crates/core/src/operations/merge/barrier.rs @@ -6,7 +6,7 @@ //! To determine if a file contains zero changes, the input stream is //! exhausted. Afterwards, records are then dropped. //! -//! Bookkeeping is maintained to determine which files have modifications so +//! Bookkeeping is maintained to determine which files have modifications, so //! they can be removed from the delta log. use std::{ @@ -67,6 +67,10 @@ impl MergeBarrierExec { } impl ExecutionPlan for MergeBarrierExec { + fn name(&self) -> &str { + Self::static_name() + } + fn as_any(&self) -> &dyn std::any::Any { self } @@ -75,26 +79,27 @@ impl ExecutionPlan for MergeBarrierExec { self.input.schema() } - fn output_partitioning(&self) -> datafusion_physical_expr::Partitioning { - self.input.output_partitioning() + fn properties(&self) -> &datafusion::physical_plan::PlanProperties { + self.input.properties() } fn required_input_distribution(&self) -> Vec { vec![Distribution::HashPartitioned(vec![self.expr.clone()]); 1] } - fn output_ordering(&self) -> Option<&[datafusion_physical_expr::PhysicalSortExpr]> { - None - } - - fn children(&self) -> Vec> { - vec![self.input.clone()] + fn children(&self) -> Vec<&Arc> { + vec![&self.input] } fn with_new_children( - self: std::sync::Arc, - children: Vec>, - ) -> datafusion_common::Result> { + self: Arc, + children: Vec>, + ) -> datafusion_common::Result> { + if children.len() != 1 { + return Err(DataFusionError::Plan( + "MergeBarrierExec wrong number of children".to_string(), + )); + } Ok(Arc::new(MergeBarrierExec::new( children[0].clone(), self.file_column.clone(), @@ -105,7 +110,7 @@ impl ExecutionPlan for MergeBarrierExec { fn execute( &self, partition: usize, - context: std::sync::Arc, + context: Arc, ) -> datafusion_common::Result { let input = self.input.execute(partition, context)?; Ok(Box::pin(MergeBarrierStream::new( @@ -421,11 +426,20 @@ impl UserDefinedLogicalNodeCore for MergeBarrier { exprs: &[datafusion_expr::Expr], inputs: &[datafusion_expr::LogicalPlan], ) -> Self { - MergeBarrier { + self.with_exprs_and_inputs(exprs.to_vec(), inputs.to_vec()) + .unwrap() + } + + fn with_exprs_and_inputs( + &self, + exprs: Vec, + inputs: Vec, + ) -> DataFusionResult { + Ok(MergeBarrier { input: inputs[0].clone(), file_column: self.file_column.clone(), expr: exprs[0].clone(), - } + }) } } diff --git a/crates/core/src/operations/merge/mod.rs b/crates/core/src/operations/merge/mod.rs index b1f89c4c12..ea54e4e211 100644 --- a/crates/core/src/operations/merge/mod.rs +++ b/crates/core/src/operations/merge/mod.rs @@ -35,38 +35,39 @@ use std::time::Instant; use async_trait::async_trait; use datafusion::datasource::provider_as_source; use datafusion::error::Result as DataFusionResult; -use datafusion::execution::context::{QueryPlanner, SessionConfig}; +use datafusion::execution::context::SessionConfig; use datafusion::logical_expr::build_join_schema; -use datafusion::physical_planner::{DefaultPhysicalPlanner, ExtensionPlanner, 
PhysicalPlanner}; +use datafusion::physical_plan::metrics::MetricBuilder; +use datafusion::physical_planner::{ExtensionPlanner, PhysicalPlanner}; use datafusion::{ execution::context::SessionState, - physical_plan::{ - metrics::{MetricBuilder, MetricsSet}, - ExecutionPlan, - }, + physical_plan::ExecutionPlan, prelude::{DataFrame, SessionContext}, }; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::{Column, DFSchema, ScalarValue, TableReference}; use datafusion_expr::expr::Placeholder; -use datafusion_expr::{col, conditional_expressions::CaseBuilder, lit, when, Expr, JoinType}; use datafusion_expr::{ - BinaryExpr, Distinct, Extension, Filter, LogicalPlan, LogicalPlanBuilder, Operator, Projection, + col, conditional_expressions::CaseBuilder, lit, max, min, when, Between, Expr, JoinType, +}; +use datafusion_expr::{ + Aggregate, BinaryExpr, Extension, LogicalPlan, LogicalPlanBuilder, Operator, UserDefinedLogicalNode, UNNAMED_TABLE, }; +use either::{Left, Right}; use futures::future::BoxFuture; use itertools::Itertools; use parquet::file::properties::WriterProperties; use serde::Serialize; -use serde_json::Value; use self::barrier::{MergeBarrier, MergeBarrierExec}; use super::datafusion_utils::{into_expr, maybe_into_expr, Expression}; -use super::transaction::{commit, PROTOCOL}; +use super::transaction::{CommitProperties, PROTOCOL}; use crate::delta_datafusion::expr::{fmt_expr_to_sql, parse_predicate_expression}; use crate::delta_datafusion::logical::MetricObserver; -use crate::delta_datafusion::physical::{find_metric_node, MetricObserverExec}; +use crate::delta_datafusion::physical::{find_metric_node, get_metric, MetricObserverExec}; +use crate::delta_datafusion::planner::DeltaPlanner; use crate::delta_datafusion::{ execute_plan_to_batch, register_store, DeltaColumn, DeltaScanConfigBuilder, DeltaSessionConfig, DeltaTableProvider, @@ -74,7 +75,8 @@ use crate::delta_datafusion::{ use crate::kernel::Action; use crate::logstore::LogStoreRef; use crate::operations::merge::barrier::find_barrier_node; -use crate::operations::write::write_execution_plan; +use crate::operations::transaction::CommitBuilder; +use crate::operations::write::{write_execution_plan, WriterStatsConfig}; use crate::protocol::{DeltaOperation, MergePredicate}; use crate::table::state::DeltaTableState; use crate::{DeltaResult, DeltaTable, DeltaTableError}; @@ -126,13 +128,15 @@ pub struct MergeBuilder { state: Option, /// Properties passed to underlying parquet writer for when files are rewritten writer_properties: Option, - /// Additional metadata to be added to commit - app_metadata: Option>, + /// Additional information to add to the commit + commit_properties: CommitProperties, /// safe_cast determines how data types that do not match the underlying table are handled /// By default an error is returned safe_cast: bool, } +impl super::Operation<()> for MergeBuilder {} + impl MergeBuilder { /// Create a new [`MergeBuilder`] pub fn new>( @@ -150,7 +154,7 @@ impl MergeBuilder { source_alias: None, target_alias: None, state: None, - app_metadata: None, + commit_properties: CommitProperties::default(), writer_properties: None, match_operations: Vec::new(), not_match_operations: Vec::new(), @@ -163,10 +167,10 @@ impl MergeBuilder { /// /// The update expressions can specify both source and target columns. 
/// - /// Multiple match clasues can be specified and their predicates are + /// Multiple match clauses can be specified and their predicates are /// evaluated to determine if the corresponding operation are performed. - /// Only the first clause that results in an satisfy predicate is executed. - /// Ther order of match clauses matter. + /// Only the first clause that results in a satisfy predicate is executed. + /// The order of match clauses matter. /// /// #Example /// ```rust ignore @@ -201,10 +205,10 @@ impl MergeBuilder { /// Delete a target record when it matches with a source record /// - /// Multiple match clasues can be specified and their predicates are + /// Multiple match clauses can be specified and their predicates are /// evaluated to determine if the corresponding operation are performed. - /// Only the first clause that results in an satisfy predicate is executed. - /// Ther order of match clauses matter. + /// Only the first clause that results in a satisfy predicate is executed. + /// The order of match clauses matter. /// /// #Example /// ```rust ignore @@ -234,10 +238,10 @@ impl MergeBuilder { /// Insert a source record when it does not match with a target record /// - /// Multiple not match clasues can be specified and their predicates are + /// Multiple not match clauses can be specified and their predicates are /// evaluated to determine if the corresponding operation are performed. - /// Only the first clause that results in an satisfy predicate is executed. - /// Ther order of not match clauses matter. + /// Only the first clause that results in a satisfy predicate is executed. + /// The order of not match clauses matter. /// /// #Example /// ```rust ignore @@ -269,10 +273,10 @@ impl MergeBuilder { /// /// The update expressions can specify only target columns. /// - /// Multiple source not match clasues can be specified and their predicates + /// Multiple source not match clauses can be specified and their predicates /// are evaluated to determine if the corresponding operation are performed. - /// Only the first clause that results in an satisfy predicate is executed. - /// Ther order of source not match clauses matter. + /// Only the first clause that results in a satisfy predicate is executed. + /// The order of source not match clauses matter. /// /// #Example /// ```rust ignore @@ -301,10 +305,10 @@ impl MergeBuilder { /// Delete a target record when it does not match with a source record /// - /// Multiple source not match clasues can be specified and their predicates - /// are evaluated to determine if the corresponding operation are performed. - /// Only the first clause that results in an satisfy predicate is executed. - /// Ther order of source not match clauses matter. + /// Multiple source "not match" clauses can be specified and their predicates + /// are evaluated to determine if the corresponding operations are performed. + /// Only the first clause that results in a satisfy predicate is executed. + /// The order of source "not match" clauses matter. 
/// /// #Example /// ```rust ignore @@ -351,11 +355,8 @@ impl MergeBuilder { } /// Additional metadata to be added to commit info - pub fn with_metadata( - mut self, - metadata: impl IntoIterator, - ) -> Self { - self.app_metadata = Some(HashMap::from_iter(metadata)); + pub fn with_commit_properties(mut self, commit_properties: CommitProperties) -> Self { + self.commit_properties = commit_properties; self } @@ -503,7 +504,7 @@ impl MergeOperation { relation: Some(TableReference::Bare { table }), name, } => { - if table.eq(alias) { + if table.as_ref() == alias { Column { relation: Some(r), name, @@ -574,7 +575,7 @@ pub struct MergeMetrics { /// Time taken to rewrite the matched files pub rewrite_time_ms: u64, } - +#[derive(Clone)] struct MergeMetricExtensionPlanner {} #[async_trait] @@ -667,13 +668,22 @@ impl ExtensionPlanner for MergeMetricExtensionPlanner { } } -/// Takes the predicate provided and does two things: +struct PredicatePlaceholder { + expr: Expr, + alias: String, + is_aggregate: bool, +} + +/// Takes the predicate provided and does three things: /// -/// 1. for any relations between a source column and a target column, if the target column is a -/// partition column, then replace source with a placeholder matching the name of the partition +/// 1. for any relations between a source column and a partition target column, +/// replace source with a placeholder matching the name of the partition /// columns /// -/// 2. for any other relation with a source column, remove them. +/// 2. for any is equal relations between a source column and a non-partition target column, +/// replace source with is between expression with min(source_column) and max(source_column) placeholders +/// +/// 3. for any other relation with a source column, remove them. /// /// For example, for the predicate: /// @@ -681,21 +691,17 @@ impl ExtensionPlanner for MergeMetricExtensionPlanner { /// /// where `date` is a partition column, would result in the expr: /// -/// `$date = target.date and frob > 42` +/// `$date_0 = target.date and target.id between $id_1_min and $id_1_max and frob > 42` /// /// This leaves us with a predicate that we can push into delta scan after expanding it out to -/// a conjunction between the disinct partitions in the source input. +/// a conjunction between the distinct partitions in the source input. 
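Editor's note: to make the transformation described in the doc comment above concrete, here is a small sketch of the expression shape `generalize_filter` produces for a non-partition equality such as `source.id = target.id`. It is patterned on the `test_generalize_filter_with_dynamic_target_range_references` test added further down in this patch; the function name below is only for illustration.

```rust
// Sketch: the generalized form of `source.id = target.id` when `id` is not a
// partition column. The placeholders are later filled from MIN(source.id) /
// MAX(source.id) aggregated over the source DataFrame.
use datafusion_common::Column;
use datafusion_expr::expr::Placeholder;
use datafusion_expr::{col, Expr};

fn expected_generalized_filter() -> Expr {
    let low = Expr::Placeholder(Placeholder {
        id: "id_0_min".to_owned(),
        data_type: None,
    });
    let high = Expr::Placeholder(Placeholder {
        id: "id_0_max".to_owned(),
        data_type: None,
    });
    // target.id BETWEEN $id_0_min AND $id_0_max
    col(Column::new(Some("target"), "id")).between(low, high)
}
```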
/// -/// TODO: A futher improvement here might be for non-partition columns to be replaced with min/max -/// checks, so the above example could become: -/// -/// `$date = target.date and target.id between 12345 and 99999 and frob > 42` fn generalize_filter( predicate: Expr, partition_columns: &Vec, source_name: &TableReference, target_name: &TableReference, - placeholders: &mut HashMap, + placeholders: &mut Vec, ) -> Option { #[derive(Debug)] enum ReferenceTableCheck { @@ -705,10 +711,7 @@ fn generalize_filter( } impl ReferenceTableCheck { fn has_reference(&self) -> bool { - match self { - ReferenceTableCheck::HasReference(_) => true, - _ => false, - } + matches!(self, ReferenceTableCheck::HasReference(_)) } } fn references_table(expr: &Expr, table: &TableReference) -> ReferenceTableCheck { @@ -735,36 +738,101 @@ fn generalize_filter( ReferenceTableCheck::Unknown } } - Expr::IsNull(inner) => references_table(&inner, table), + Expr::IsNull(inner) => references_table(inner, table), Expr::Literal(_) => ReferenceTableCheck::NoReference, _ => ReferenceTableCheck::Unknown, }; res } + fn construct_placeholder( + binary: BinaryExpr, + source_left: bool, + is_partition_column: bool, + column_name: String, + placeholders: &mut Vec, + ) -> Option { + if is_partition_column { + let placeholder_name = format!("{column_name}_{}", placeholders.len()); + let placeholder = Expr::Placeholder(Placeholder { + id: placeholder_name.clone(), + data_type: None, + }); + + let (left, right, source_expr): (Box, Box, Expr) = if source_left { + (placeholder.into(), binary.clone().right, *binary.left) + } else { + (binary.clone().left, placeholder.into(), *binary.right) + }; + + let replaced = Expr::BinaryExpr(BinaryExpr { + left, + op: binary.op, + right, + }); + + placeholders.push(PredicatePlaceholder { + expr: source_expr, + alias: placeholder_name, + is_aggregate: false, + }); + + Some(replaced) + } else { + match binary.op { + Operator::Eq => { + let name_min = format!("{column_name}_{}_min", placeholders.len()); + let placeholder_min = Expr::Placeholder(Placeholder { + id: name_min.clone(), + data_type: None, + }); + let name_max = format!("{column_name}_{}_max", placeholders.len()); + let placeholder_max = Expr::Placeholder(Placeholder { + id: name_max.clone(), + data_type: None, + }); + let (source_expr, target_expr) = if source_left { + (*binary.left, *binary.right) + } else { + (*binary.right, *binary.left) + }; + let replaced = Expr::Between(Between { + expr: target_expr.into(), + negated: false, + low: placeholder_min.into(), + high: placeholder_max.into(), + }); + + placeholders.push(PredicatePlaceholder { + expr: min(source_expr.clone()), + alias: name_min, + is_aggregate: true, + }); + placeholders.push(PredicatePlaceholder { + expr: max(source_expr), + alias: name_max, + is_aggregate: true, + }); + Some(replaced) + } + _ => None, + } + } + } + match predicate { Expr::BinaryExpr(binary) => { if references_table(&binary.right, source_name).has_reference() { if let ReferenceTableCheck::HasReference(left_target) = references_table(&binary.left, target_name) { - if partition_columns.contains(&left_target) { - let placeholder_name = format!("{left_target}_{}", placeholders.len()); - - let placeholder = Expr::Placeholder(datafusion_expr::expr::Placeholder { - id: placeholder_name.clone(), - data_type: None, - }); - let replaced = Expr::BinaryExpr(BinaryExpr { - left: binary.left, - op: binary.op, - right: placeholder.into(), - }); - - placeholders.insert(placeholder_name, *binary.right); - - return 
Some(replaced); - } + return construct_placeholder( + binary, + false, + partition_columns.contains(&left_target), + left_target, + placeholders, + ); } return None; } @@ -772,23 +840,13 @@ fn generalize_filter( if let ReferenceTableCheck::HasReference(right_target) = references_table(&binary.right, target_name) { - if partition_columns.contains(&right_target) { - let placeholder_name = format!("{right_target}_{}", placeholders.len()); - - let placeholder = Expr::Placeholder(datafusion_expr::expr::Placeholder { - id: placeholder_name.clone(), - data_type: None, - }); - let replaced = Expr::BinaryExpr(BinaryExpr { - right: binary.right, - op: binary.op, - left: placeholder.into(), - }); - - placeholders.insert(placeholder_name, *binary.left); - - return Some(replaced); - } + return construct_placeholder( + binary, + true, + partition_columns.contains(&right_target), + right_target, + placeholders, + ); } return None; } @@ -808,7 +866,7 @@ fn generalize_filter( placeholders, ); - let res = match (left, right) { + match (left, right) { (None, None) => None, (None, Some(one_side)) | (Some(one_side), None) => { // in the case of an AND clause, it's safe to generalize the filter down to just one side of the AND. @@ -828,19 +886,22 @@ fn generalize_filter( right: r.into(), }) .into(), - }; - res + } } other => match references_table(&other, source_name) { ReferenceTableCheck::HasReference(col) => { let placeholder_name = format!("{col}_{}", placeholders.len()); - let placeholder = Expr::Placeholder(datafusion_expr::expr::Placeholder { + let placeholder = Expr::Placeholder(Placeholder { id: placeholder_name.clone(), data_type: None, }); - placeholders.insert(placeholder_name, other); + placeholders.push(PredicatePlaceholder { + expr: other, + alias: placeholder_name, + is_aggregate: true, + }); Some(placeholder) } @@ -855,11 +916,12 @@ fn replace_placeholders(expr: Expr, placeholders: &HashMap) Expr::Placeholder(Placeholder { id, .. 
}) => { let value = placeholders[&id].clone(); // Replace the placeholder with the value - Ok(Transformed::Yes(Expr::Literal(value))) + Ok(Transformed::yes(Expr::Literal(value))) } - _ => Ok(Transformed::No(expr)), + _ => Ok(Transformed::no(expr)), }) .unwrap() + .data } async fn try_construct_early_filter( @@ -867,17 +929,13 @@ async fn try_construct_early_filter( table_snapshot: &DeltaTableState, session_state: &SessionState, source: &LogicalPlan, - source_name: &TableReference<'_>, - target_name: &TableReference<'_>, + source_name: &TableReference, + target_name: &TableReference, ) -> DeltaResult> { let table_metadata = table_snapshot.metadata(); let partition_columns = &table_metadata.partition_columns; - if partition_columns.is_empty() { - return Ok(None); - } - - let mut placeholders = HashMap::default(); + let mut placeholders = Vec::default(); match generalize_filter( join_predicate, @@ -889,35 +947,34 @@ async fn try_construct_early_filter( None => Ok(None), Some(filter) => { if placeholders.is_empty() { - // if we haven't recognised any partition-based predicates in the join predicate, return our reduced filter + // if we haven't recognised any source predicates in the join predicate, return our filter with static only predicates Ok(Some(filter)) } else { - // if we have some recognised partitions, then discover the distinct set of partitions in the source data and - // make a new filter, which expands out the placeholders for each distinct partition (and then OR these together) - let distinct_partitions = LogicalPlan::Distinct(Distinct::All( - LogicalPlan::Projection(Projection::try_new( - placeholders - .into_iter() - .map(|(alias, expr)| expr.alias(alias)) - .collect_vec(), - source.clone().into(), - )?) - .into(), - )); - + // if we have some filters, which depend on the source df, then collect the placeholders values from the source data + // We aggregate the distinct values for partitions with the group_columns and stats(min, max) for dynamic filter as agg_columns + // Can be translated into `SELECT partition1 as part1_0, min(id) as id_1_min, max(id) as id_1_max FROM source GROUP BY partition1` + let (agg_columns, group_columns) = placeholders.into_iter().partition_map(|p| { + if p.is_aggregate { + Left(p.expr.alias(p.alias)) + } else { + Right(p.expr.alias(p.alias)) + } + }); + let distinct_partitions = LogicalPlan::Aggregate(Aggregate::try_new( + source.clone().into(), + group_columns, + agg_columns, + )?); let execution_plan = session_state .create_physical_plan(&distinct_partitions) .await?; - let items = execute_plan_to_batch(session_state, execution_plan).await?; - let placeholder_names = items .schema() .fields() .iter() .map(|f| f.name().to_owned()) .collect_vec(); - let expr = (0..items.num_rows()) .map(|i| { let replacements = placeholder_names @@ -933,7 +990,6 @@ async fn try_construct_early_filter( .collect::>>()? 
.into_iter() .reduce(Expr::or); - Ok(expr) } } @@ -945,21 +1001,26 @@ async fn execute( predicate: Expression, source: DataFrame, log_store: LogStoreRef, - snapshot: &DeltaTableState, + snapshot: DeltaTableState, state: SessionState, writer_properties: Option, - app_metadata: Option>, + mut commit_properties: CommitProperties, safe_cast: bool, source_alias: Option, target_alias: Option, match_operations: Vec, not_match_target_operations: Vec, not_match_source_operations: Vec, -) -> DeltaResult<((Vec, i64, Option), MergeMetrics)> { +) -> DeltaResult<(DeltaTableState, MergeMetrics)> { let mut metrics = MergeMetrics::default(); let exec_start = Instant::now(); let current_metadata = snapshot.metadata(); + let merge_planner = DeltaPlanner:: { + extension_planner: MergeMetricExtensionPlanner {}, + }; + + let state = state.with_query_planner(Arc::new(merge_planner)); // TODO: Given the join predicate, remove any expression that involve the // source table and keep expressions that only involve the target table. @@ -998,19 +1059,18 @@ async fn execute( let scan_config = DeltaScanConfigBuilder::default() .with_file_column(true) - .build(snapshot)?; - - let file_column = Arc::new(scan_config.file_column_name.clone().unwrap()); + .with_parquet_pushdown(false) + .build(&snapshot)?; let target_provider = Arc::new(DeltaTableProvider::try_new( snapshot.clone(), log_store.clone(), - scan_config, + scan_config.clone(), )?); let target_provider = provider_as_source(target_provider); - - let target = LogicalPlanBuilder::scan(target_name.clone(), target_provider, None)?.build()?; + let target = + LogicalPlanBuilder::scan(target_name.clone(), target_provider.clone(), None)?.build()?; let source_schema = source.schema(); let target_schema = target.schema(); @@ -1020,31 +1080,42 @@ async fn execute( Expression::String(s) => parse_predicate_expression(&join_schema_df, s, &state)?, }; - let state = state.with_query_planner(Arc::new(MergePlanner {})); - - let target = { - // Attempt to construct an early filter that we can apply to the Add action list and the delta scan. - // In the case where there are partition columns in the join predicate, we can scan the source table - // to get the distinct list of partitions affected and constrain the search to those. - - if !not_match_source_operations.is_empty() { - // It's only worth trying to create an early filter where there are no `when_not_matched_source` operators, since - // that implies a full scan - target - } else if let Some(filter) = try_construct_early_filter( + // Attempt to construct an early filter that we can apply to the Add action list and the delta scan. + // In the case where there are partition columns in the join predicate, we can scan the source table + // to get the distinct list of partitions affected and constrain the search to those. + let target_subset_filter = if !not_match_source_operations.is_empty() { + // It's only worth trying to create an early filter where there are no `when_not_matched_source` operators, since + // that implies a full scan + None + } else { + try_construct_early_filter( predicate.clone(), - snapshot, + &snapshot, &state, &source, &source_name, &target_name, ) .await? - { - LogicalPlan::Filter(Filter::try_new(filter, target.into())?) - } else { - target + }; + + let file_column = Arc::new(scan_config.file_column_name.clone().unwrap()); + // Need to manually push this filter into the scan... 
We want to PRUNE files not FILTER RECORDS + let target = match target_subset_filter.clone() { + Some(filter) => { + let filter = match &target_alias { + Some(alias) => remove_table_alias(filter, alias), + None => filter, + }; + LogicalPlanBuilder::scan_with_filters( + target_name.clone(), + target_provider, + None, + vec![filter], + )? + .build()? } + None => LogicalPlanBuilder::scan(target_name.clone(), target_provider, None)?.build()?, }; let source = DataFrame::new(state.clone(), source); @@ -1188,7 +1259,7 @@ async fn execute( let projection = join.with_column(OPERATION_COLUMN, case)?; - let mut new_columns = projection; + let mut new_columns = vec![]; let mut write_projection = Vec::new(); for delta_field in snapshot.schema().fields() { @@ -1223,11 +1294,9 @@ async fn execute( .end()?; let name = "__delta_rs_c_".to_owned() + delta_field.name(); - write_projection.push( - Expr::Column(Column::from_qualified_name_ignore_case(name.clone())) - .alias(delta_field.name()), - ); - new_columns = new_columns.with_column(&name, case)?; + write_projection + .push(Expr::Column(Column::from_name(name.clone())).alias(delta_field.name())); + new_columns.push((name, case)); } let mut insert_when = Vec::with_capacity(ops.len()); @@ -1303,18 +1372,40 @@ async fn execute( .end() } - new_columns = new_columns.with_column(DELETE_COLUMN, build_case(delete_when, delete_then)?)?; - new_columns = - new_columns.with_column(TARGET_INSERT_COLUMN, build_case(insert_when, insert_then)?)?; - new_columns = - new_columns.with_column(TARGET_UPDATE_COLUMN, build_case(update_when, update_then)?)?; - new_columns = new_columns.with_column( - TARGET_DELETE_COLUMN, + new_columns.push(( + DELETE_COLUMN.to_owned(), + build_case(delete_when, delete_then)?, + )); + new_columns.push(( + TARGET_INSERT_COLUMN.to_owned(), + build_case(insert_when, insert_then)?, + )); + new_columns.push(( + TARGET_UPDATE_COLUMN.to_owned(), + build_case(update_when, update_then)?, + )); + new_columns.push(( + TARGET_DELETE_COLUMN.to_owned(), build_case(target_delete_when, target_delete_then)?, - )?; - new_columns = new_columns.with_column(TARGET_COPY_COLUMN, build_case(copy_when, copy_then)?)?; - - let new_columns = new_columns.into_unoptimized_plan(); + )); + new_columns.push(( + TARGET_COPY_COLUMN.to_owned(), + build_case(copy_when, copy_then)?, + )); + + let new_columns = { + let plan = projection.into_unoptimized_plan(); + let mut fields: Vec = plan + .schema() + .columns() + .iter() + .map(|f| col(f.clone())) + .collect(); + + fields.extend(new_columns.into_iter().map(|(name, ex)| ex.alias(name))); + + LogicalPlanBuilder::from(plan).project(fields)?.build()? 
+ }; let distrbute_expr = col(file_column.as_str()); @@ -1350,9 +1441,17 @@ async fn execute( // write projected records let table_partition_cols = current_metadata.partition_columns.clone(); + let writer_stats_config = WriterStatsConfig::new( + snapshot.table_config().num_indexed_cols(), + snapshot + .table_config() + .stats_columns() + .map(|v| v.iter().map(|v| v.to_string()).collect::>()), + ); + let rewrite_start = Instant::now(); let add_actions = write_execution_plan( - Some(snapshot), + Some(&snapshot), state.clone(), write, table_partition_cols.clone(), @@ -1361,13 +1460,15 @@ async fn execute( None, writer_properties, safe_cast, - false, + None, + writer_stats_config, + None, ) .await?; metrics.rewrite_time_ms = Instant::now().duration_since(rewrite_start).as_millis() as u64; - let mut actions: Vec = add_actions.into_iter().map(Action::Add).collect(); + let mut actions: Vec = add_actions.clone(); metrics.num_target_files_added = actions.len(); let survivors = barrier @@ -1386,13 +1487,8 @@ async fn execute( } } - let mut version = snapshot.version(); - let source_count_metrics = source_count.metrics().unwrap(); let target_count_metrics = op_count.metrics().unwrap(); - fn get_metric(metrics: &MetricsSet, name: &str) -> usize { - metrics.sum_by_name(name).map(|m| m.as_usize()).unwrap_or(0) - } metrics.num_source_rows = get_metric(&source_count_metrics, SOURCE_COUNT_METRIC); metrics.num_target_rows_inserted = get_metric(&target_count_metrics, TARGET_INSERTED_METRIC); @@ -1405,55 +1501,58 @@ async fn execute( metrics.execution_time_ms = Instant::now().duration_since(exec_start).as_millis() as u64; - let mut app_metadata = match app_metadata { - Some(meta) => meta, - None => HashMap::new(), - }; - + let app_metadata = &mut commit_properties.app_metadata; app_metadata.insert("readVersion".to_owned(), snapshot.version().into()); - if let Ok(map) = serde_json::to_value(&metrics) { app_metadata.insert("operationMetrics".to_owned(), map); } + // Predicate will be used for conflict detection + let commit_predicate = match target_subset_filter { + None => None, // No predicate means it's a full table merge + Some(some_filter) => { + let predict_expr = match &target_alias { + None => some_filter, + Some(alias) => remove_table_alias(some_filter, alias), + }; + Some(fmt_expr_to_sql(&predict_expr)?) + } + }; + // Do not make a commit when there are zero updates to the state let operation = DeltaOperation::Merge { - predicate: Some(fmt_expr_to_sql(&predicate)?), + predicate: commit_predicate, + merge_predicate: Some(fmt_expr_to_sql(&predicate)?), matched_predicates: match_operations, not_matched_predicates: not_match_target_operations, not_matched_by_source_predicates: not_match_source_operations, }; - if !actions.is_empty() { - version = commit( - log_store.as_ref(), - &actions, - operation.clone(), - Some(snapshot), - Some(app_metadata), - ) - .await?; + + if actions.is_empty() { + return Ok((snapshot, metrics)); } - let op = (!actions.is_empty()).then_some(operation); - Ok(((actions, version, op), metrics)) -} -// TODO: Abstract MergePlanner into DeltaPlanner to support other delta operations in the future. 
-struct MergePlanner {} + let commit = CommitBuilder::from(commit_properties) + .with_actions(actions) + .build(Some(&snapshot), log_store.clone(), operation) + .await?; + Ok((commit.snapshot(), metrics)) +} -#[async_trait] -impl QueryPlanner for MergePlanner { - async fn create_physical_plan( - &self, - logical_plan: &LogicalPlan, - session_state: &SessionState, - ) -> DataFusionResult> { - let planner = Arc::new(Box::new(DefaultPhysicalPlanner::with_extension_planners( - vec![Arc::new(MergeMetricExtensionPlanner {})], - ))); - planner - .create_physical_plan(logical_plan, session_state) - .await - } +fn remove_table_alias(expr: Expr, table_alias: &str) -> Expr { + expr.transform(&|expr| match expr { + Expr::Column(c) => match c.relation { + Some(rel) if rel.table() == table_alias => Ok(Transformed::yes(Expr::Column( + Column::new_unqualified(c.name), + ))), + _ => Ok(Transformed::no(Expr::Column(Column::new( + c.relation, c.name, + )))), + }, + _ => Ok(Transformed::no(expr)), + }) + .unwrap() + .data } impl std::future::IntoFuture for MergeBuilder { @@ -1461,10 +1560,10 @@ impl std::future::IntoFuture for MergeBuilder { type IntoFuture = BoxFuture<'static, Self::Output>; fn into_future(self) -> Self::IntoFuture { - let mut this = self; + let this = self; Box::pin(async move { - PROTOCOL.can_write_to(&this.snapshot)?; + PROTOCOL.can_write_to(&this.snapshot.snapshot)?; let state = this.state.unwrap_or_else(|| { let config: SessionConfig = DeltaSessionConfig::default().into(); @@ -1476,14 +1575,14 @@ impl std::future::IntoFuture for MergeBuilder { session.state() }); - let ((actions, version, operation), metrics) = execute( + let (snapshot, metrics) = execute( this.predicate, this.source, this.log_store.clone(), - &this.snapshot, + this.snapshot, state, this.writer_properties, - this.app_metadata, + this.commit_properties, this.safe_cast, this.source_alias, this.target_alias, @@ -1493,12 +1592,10 @@ impl std::future::IntoFuture for MergeBuilder { ) .await?; - if let Some(op) = &operation { - this.snapshot.merge(actions, op, version)?; - } - let table = DeltaTable::new_with_state(this.log_store, this.snapshot); - - Ok((table, metrics)) + Ok(( + DeltaTable::new_with_state(this.log_store, snapshot), + metrics, + )) }) } } @@ -1536,8 +1633,8 @@ mod tests { use datafusion_expr::LogicalPlanBuilder; use datafusion_expr::Operator; use itertools::Itertools; + use regex::Regex; use serde_json::json; - use std::collections::HashMap; use std::ops::Neg; use std::sync::Arc; @@ -1548,7 +1645,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() - .with_columns(table_schema.fields().clone()) + .with_columns(table_schema.fields().cloned()) .with_partition_columns(partitions.unwrap_or_default()) .await .unwrap(); @@ -1556,6 +1653,7 @@ mod tests { table } + // TODO(ion): property keys are not passed through or translated as table features.. 
fix this as well #[tokio::test] async fn test_merge_when_delta_table_is_append_only() { let schema = get_arrow_schema(&None); @@ -1685,7 +1783,8 @@ mod tests { let commit_info = table.history(None).await.unwrap(); let last_commit = &commit_info[0]; let parameters = last_commit.operation_parameters.clone().unwrap(); - assert_eq!(parameters["predicate"], json!("target.id = source.id")); + assert!(!parameters.contains_key("predicate")); + assert_eq!(parameters["mergePredicate"], json!("target.id = source.id")); assert_eq!( parameters["matchedPredicates"], json!(r#"[{"actionType":"update"}]"#) @@ -1737,7 +1836,8 @@ mod tests { let commit_info = table.history(None).await.unwrap(); let last_commit = &commit_info[0]; let parameters = last_commit.operation_parameters.clone().unwrap(); - assert_eq!(parameters["predicate"], json!("target.id = source.id")); + assert!(!parameters.contains_key("predicate")); + assert_eq!(parameters["mergePredicate"], json!("target.id = source.id")); assert_eq!( parameters["matchedPredicates"], json!(r#"[{"actionType":"update"}]"#) @@ -1941,6 +2041,15 @@ mod tests { assert_eq!(metrics.num_output_rows, 6); assert_eq!(metrics.num_source_rows, 3); + let commit_info = table.history(None).await.unwrap(); + let last_commit = &commit_info[0]; + let parameters = last_commit.operation_parameters.clone().unwrap(); + assert!(!parameters.contains_key("predicate")); + assert_eq!( + parameters["mergePredicate"], + "target.id = source.id AND target.modified = '2021-02-02'" + ); + let expected = vec![ "+----+-------+------------+", "| id | value | modified |", @@ -1957,6 +2066,64 @@ mod tests { assert_batches_sorted_eq!(&expected, &actual); } + #[tokio::test] + async fn test_merge_partition_filtered() { + let schema = get_arrow_schema(&None); + let table = setup_table(Some(vec!["modified"])).await; + let table = write_data(table, &schema).await; + assert_eq!(table.version(), 1); + let ctx = SessionContext::new(); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(arrow::array::StringArray::from(vec!["B", "C"])), + Arc::new(arrow::array::Int32Array::from(vec![10, 20])), + Arc::new(arrow::array::StringArray::from(vec![ + "2021-02-02", + "2021-02-02", + ])), + ], + ) + .unwrap(); + let source = ctx.read_batch(batch).unwrap(); + let (table, _metrics) = DeltaOps(table) + .merge( + source, + col("target.id") + .eq(col("source.id")) + .and(col("target.modified").eq(lit("2021-02-02"))), + ) + .with_source_alias("source") + .with_target_alias("target") + .when_matched_update(|update| { + update + .update("value", col("source.value")) + .update("modified", col("source.modified")) + }) + .unwrap() + .when_not_matched_insert(|insert| { + insert + .set("id", col("source.id")) + .set("value", col("source.value")) + .set("modified", col("source.modified")) + }) + .unwrap() + .await + .unwrap(); + assert_eq!(table.version(), 2); + let commit_info = table.history(None).await.unwrap(); + let last_commit = &commit_info[0]; + let parameters = last_commit.operation_parameters.clone().unwrap(); + assert_eq!( + parameters["predicate"], + "id BETWEEN 'B' AND 'C' AND modified = '2021-02-02'" + ); + assert_eq!( + parameters["mergePredicate"], + "target.id = source.id AND target.modified = '2021-02-02'" + ); + } + #[tokio::test] async fn test_merge_partitions_skipping() { /* Validate the join predicate can be used for skipping partitions */ @@ -2014,6 +2181,13 @@ mod tests { assert_eq!(metrics.num_output_rows, 3); assert_eq!(metrics.num_source_rows, 3); + let commit_info = 
table.history(None).await.unwrap(); + let last_commit = &commit_info[0]; + let parameters = last_commit.operation_parameters.clone().unwrap(); + let predicate = parameters["predicate"].as_str().unwrap(); + let re = Regex::new(r"^id = '(C|X|B)' OR id = '(C|X|B)' OR id = '(C|X|B)'$").unwrap(); + assert!(re.is_match(predicate)); + let expected = vec![ "+-------+------------+----+", "| value | modified | id |", @@ -2084,7 +2258,8 @@ mod tests { extra_info["operationMetrics"], serde_json::to_value(&metrics).unwrap() ); - assert_eq!(parameters["predicate"], json!("target.id = source.id")); + assert_eq!(parameters["predicate"], "id BETWEEN 'B' AND 'X'"); + assert_eq!(parameters["mergePredicate"], json!("target.id = source.id")); assert_eq!( parameters["matchedPredicates"], json!(r#"[{"actionType":"delete"}]"#) @@ -2148,7 +2323,7 @@ mod tests { let commit_info = table.history(None).await.unwrap(); let last_commit = &commit_info[0]; let parameters = last_commit.operation_parameters.clone().unwrap(); - assert_eq!(parameters["predicate"], json!("target.id = source.id")); + assert_eq!(parameters["mergePredicate"], json!("target.id = source.id")); assert_eq!( parameters["matchedPredicates"], json!(r#"[{"actionType":"delete","predicate":"source.value <= 10"}]"#) @@ -2217,7 +2392,8 @@ mod tests { let commit_info = table.history(None).await.unwrap(); let last_commit = &commit_info[0]; let parameters = last_commit.operation_parameters.clone().unwrap(); - assert_eq!(parameters["predicate"], json!("target.id = source.id")); + assert!(!parameters.contains_key("predicate")); + assert_eq!(parameters["mergePredicate"], json!("target.id = source.id")); assert_eq!( parameters["notMatchedBySourcePredicates"], json!(r#"[{"actionType":"delete"}]"#) @@ -2281,7 +2457,7 @@ mod tests { let commit_info = table.history(None).await.unwrap(); let last_commit = &commit_info[0]; let parameters = last_commit.operation_parameters.clone().unwrap(); - assert_eq!(parameters["predicate"], json!("target.id = source.id")); + assert_eq!(parameters["mergePredicate"], json!("target.id = source.id")); assert_eq!( parameters["notMatchedBySourcePredicates"], json!(r#"[{"actionType":"delete","predicate":"target.modified > '2021-02-01'"}]"#) @@ -2360,6 +2536,15 @@ mod tests { assert_eq!(metrics.num_output_rows, 3); assert_eq!(metrics.num_source_rows, 3); + let commit_info = table.history(None).await.unwrap(); + let last_commit = &commit_info[0]; + let parameters = last_commit.operation_parameters.clone().unwrap(); + + assert_eq!( + parameters["predicate"], + json!("id BETWEEN 'B' AND 'X' AND modified = '2021-02-02'") + ); + let expected = vec![ "+----+-------+------------+", "| id | value | modified |", @@ -2462,7 +2647,7 @@ mod tests { let parsed_filter = col(Column::new(source.clone().into(), "id")) .eq(col(Column::new(target.clone().into(), "id"))); - let mut placeholders = HashMap::default(); + let mut placeholders = Vec::default(); let generalized = generalize_filter( parsed_filter, @@ -2494,7 +2679,7 @@ mod tests { let parsed_filter = (source_id.clone().eq(target_id.clone())) .or(source_id.clone().is_null().and(target_id.clone().is_null())); - let mut placeholders = HashMap::default(); + let mut placeholders = Vec::default(); let generalized = generalize_filter( parsed_filter, @@ -2517,9 +2702,9 @@ mod tests { }) .and(target_id.clone().is_null())); - assert!(placeholders.len() == 2); + assert_eq!(placeholders.len(), 2); - let captured_expressions = placeholders.values().collect_vec(); + let captured_expressions = 
placeholders.into_iter().map(|p| p.expr).collect_vec(); assert!(captured_expressions.contains(&&source_id)); assert!(captured_expressions.contains(&&source_id.is_null())); @@ -2538,7 +2723,7 @@ mod tests { .neg() .eq(col(Column::new(target.clone().into(), "id"))); - let mut placeholders = HashMap::default(); + let mut placeholders = Vec::default(); let generalized = generalize_filter( parsed_filter, @@ -2558,12 +2743,13 @@ mod tests { assert_eq!(generalized, expected_filter); assert_eq!(placeholders.len(), 1); - - let placeholder_expr = &placeholders["id_0"]; + let placeholder_expr = placeholders.get(0).unwrap(); let expected_placeholder = col(Column::new(source.clone().into(), "id")).neg(); - assert_eq!(placeholder_expr, &expected_placeholder); + assert_eq!(placeholder_expr.expr, expected_placeholder); + assert_eq!(placeholder_expr.alias, "id_0"); + assert_eq!(placeholder_expr.is_aggregate, false); } #[tokio::test] @@ -2576,7 +2762,7 @@ mod tests { .eq(col(Column::new(target.clone().into(), "id"))) .and(col(Column::new(target.clone().into(), "id")).eq(lit("C"))); - let mut placeholders = HashMap::default(); + let mut placeholders = Vec::default(); let generalized = generalize_filter( parsed_filter, @@ -2598,6 +2784,38 @@ mod tests { assert_eq!(generalized, expected_filter); } + #[tokio::test] + async fn test_generalize_filter_with_dynamic_target_range_references() { + let source = TableReference::parse_str("source"); + let target = TableReference::parse_str("target"); + + let parsed_filter = col(Column::new(source.clone().into(), "id")) + .eq(col(Column::new(target.clone().into(), "id"))); + + let mut placeholders = Vec::default(); + + let generalized = generalize_filter( + parsed_filter, + &vec!["other".to_owned()], + &source, + &target, + &mut placeholders, + ) + .unwrap(); + let expected_filter_l = Expr::Placeholder(Placeholder { + id: "id_0_min".to_owned(), + data_type: None, + }); + let expected_filter_h = Expr::Placeholder(Placeholder { + id: "id_0_max".to_owned(), + data_type: None, + }); + let expected_filter = col(Column::new(target.clone().into(), "id")) + .between(expected_filter_l, expected_filter_h); + + assert_eq!(generalized, expected_filter); + } + #[tokio::test] async fn test_generalize_filter_removes_source_references() { let source = TableReference::parse_str("source"); @@ -2607,7 +2825,7 @@ mod tests { .eq(col(Column::new(target.clone().into(), "id"))) .and(col(Column::new(source.clone().into(), "id")).eq(lit("C"))); - let mut placeholders = HashMap::default(); + let mut placeholders = Vec::default(); let generalized = generalize_filter( parsed_filter, @@ -2724,4 +2942,479 @@ mod tests { assert_eq!(split_pred, expected_pred_parts); } + + #[tokio::test] + async fn test_try_construct_early_filter_with_range() { + let schema = get_arrow_schema(&None); + let table = setup_table(Some(vec!["modified"])).await; + + assert_eq!(table.version(), 0); + assert_eq!(table.get_files_count(), 0); + + let ctx = SessionContext::new(); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(arrow::array::StringArray::from(vec!["B", "C"])), + Arc::new(arrow::array::Int32Array::from(vec![10, 20])), + Arc::new(arrow::array::StringArray::from(vec![ + "2023-07-04", + "2023-07-04", + ])), + ], + ) + .unwrap(); + let source = ctx.read_batch(batch).unwrap(); + + let source_name = TableReference::parse_str("source"); + let target_name = TableReference::parse_str("target"); + + let source = LogicalPlanBuilder::scan( + source_name.clone(), + 
provider_as_source(source.into_view()), + None, + ) + .unwrap() + .build() + .unwrap(); + + let join_predicate = col(Column { + relation: Some(source_name.clone()), + name: "id".to_owned(), + }) + .eq(col(Column { + relation: Some(target_name.clone()), + name: "id".to_owned(), + })); + + let pred = try_construct_early_filter( + join_predicate, + table.snapshot().unwrap(), + &ctx.state(), + &source, + &source_name, + &target_name, + ) + .await + .unwrap(); + + assert!(pred.is_some()); + + let filter = col(Column { + relation: Some(target_name.clone()), + name: "id".to_owned(), + }) + .between( + Expr::Literal(ScalarValue::Utf8(Some("B".to_string()))), + Expr::Literal(ScalarValue::Utf8(Some("C".to_string()))), + ); + assert_eq!(pred.unwrap(), filter); + } + + #[tokio::test] + async fn test_try_construct_early_filter_with_partition_and_range() { + let schema = get_arrow_schema(&None); + let table = setup_table(Some(vec!["modified"])).await; + + assert_eq!(table.version(), 0); + assert_eq!(table.get_files_count(), 0); + + let ctx = SessionContext::new(); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(arrow::array::StringArray::from(vec!["B", "C"])), + Arc::new(arrow::array::Int32Array::from(vec![10, 20])), + Arc::new(arrow::array::StringArray::from(vec![ + "2023-07-04", + "2023-07-04", + ])), + ], + ) + .unwrap(); + let source = ctx.read_batch(batch).unwrap(); + + let source_name = TableReference::parse_str("source"); + let target_name = TableReference::parse_str("target"); + + let source = LogicalPlanBuilder::scan( + source_name.clone(), + provider_as_source(source.into_view()), + None, + ) + .unwrap() + .build() + .unwrap(); + + let join_predicate = col(Column { + relation: Some(source_name.clone()), + name: "id".to_owned(), + }) + .eq(col(Column { + relation: Some(target_name.clone()), + name: "id".to_owned(), + })) + .and( + col(Column { + relation: Some(source_name.clone()), + name: "modified".to_owned(), + }) + .eq(col(Column { + relation: Some(target_name.clone()), + name: "modified".to_owned(), + })), + ); + + let pred = try_construct_early_filter( + join_predicate, + table.snapshot().unwrap(), + &ctx.state(), + &source, + &source_name, + &target_name, + ) + .await + .unwrap(); + + assert!(pred.is_some()); + + let filter = col(Column { + relation: Some(target_name.clone()), + name: "id".to_owned(), + }) + .between( + Expr::Literal(ScalarValue::Utf8(Some("B".to_string()))), + Expr::Literal(ScalarValue::Utf8(Some("C".to_string()))), + ) + .and( + Expr::Literal(ScalarValue::Utf8(Some("2023-07-04".to_string()))).eq(col(Column { + relation: Some(target_name.clone()), + name: "modified".to_owned(), + })), + ); + assert_eq!(pred.unwrap(), filter); + } + + #[tokio::test] + async fn test_merge_pushdowns() { + //See https://github.com/delta-io/delta-rs/issues/2158 + let schema = vec![ + StructField::new( + "id".to_string(), + DataType::Primitive(PrimitiveType::String), + true, + ), + StructField::new( + "cost".to_string(), + DataType::Primitive(PrimitiveType::Float), + true, + ), + StructField::new( + "month".to_string(), + DataType::Primitive(PrimitiveType::String), + true, + ), + ]; + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", ArrowDataType::Utf8, true), + Field::new("cost", ArrowDataType::Float32, true), + Field::new("month", ArrowDataType::Utf8, true), + ])); + + let table = DeltaOps::new_in_memory() + .create() + .with_columns(schema) + .await + .unwrap(); + + let ctx = SessionContext::new(); + let batch = RecordBatch::try_new( + 
Arc::clone(&arrow_schema.clone()), + vec![ + Arc::new(arrow::array::StringArray::from(vec!["A", "B"])), + Arc::new(arrow::array::Float32Array::from(vec![Some(10.15), None])), + Arc::new(arrow::array::StringArray::from(vec![ + "2023-07-04", + "2023-07-04", + ])), + ], + ) + .unwrap(); + + let table = DeltaOps(table) + .write(vec![batch.clone()]) + .with_save_mode(SaveMode::Append) + .await + .unwrap(); + assert_eq!(table.version(), 1); + assert_eq!(table.get_files_count(), 1); + + let batch = RecordBatch::try_new( + Arc::clone(&arrow_schema.clone()), + vec![ + Arc::new(arrow::array::StringArray::from(vec!["A", "B"])), + Arc::new(arrow::array::Float32Array::from(vec![ + Some(12.15), + Some(11.15), + ])), + Arc::new(arrow::array::StringArray::from(vec![ + "2023-07-04", + "2023-07-04", + ])), + ], + ) + .unwrap(); + let source = ctx.read_batch(batch).unwrap(); + + let (table, _metrics) = DeltaOps(table) + .merge(source, "target.id = source.id and target.cost is null") + .with_source_alias("source") + .with_target_alias("target") + .when_matched_update(|insert| { + insert + .update("id", "target.id") + .update("cost", "source.cost") + .update("month", "target.month") + }) + .unwrap() + .await + .unwrap(); + + let expected = vec![ + "+----+-------+------------+", + "| id | cost | month |", + "+----+-------+------------+", + "| A | 10.15 | 2023-07-04 |", + "| B | 11.15 | 2023-07-04 |", + "+----+-------+------------+", + ]; + let actual = get_data(&table).await; + assert_batches_sorted_eq!(&expected, &actual); + } + + #[tokio::test] + async fn test_merge_row_groups_parquet_pushdown() { + //See https://github.com/delta-io/delta-rs/issues/2362 + let schema = vec![ + StructField::new( + "id".to_string(), + DataType::Primitive(PrimitiveType::String), + true, + ), + StructField::new( + "cost".to_string(), + DataType::Primitive(PrimitiveType::Float), + true, + ), + StructField::new( + "month".to_string(), + DataType::Primitive(PrimitiveType::String), + true, + ), + ]; + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", ArrowDataType::Utf8, true), + Field::new("cost", ArrowDataType::Float32, true), + Field::new("month", ArrowDataType::Utf8, true), + ])); + + let table = DeltaOps::new_in_memory() + .create() + .with_columns(schema) + .await + .unwrap(); + + let ctx = SessionContext::new(); + let batch1 = RecordBatch::try_new( + Arc::clone(&arrow_schema.clone()), + vec![ + Arc::new(arrow::array::StringArray::from(vec!["A", "B"])), + Arc::new(arrow::array::Float32Array::from(vec![Some(10.15), None])), + Arc::new(arrow::array::StringArray::from(vec![ + "2023-07-04", + "2023-07-04", + ])), + ], + ) + .unwrap(); + + let batch2 = RecordBatch::try_new( + Arc::clone(&arrow_schema.clone()), + vec![ + Arc::new(arrow::array::StringArray::from(vec!["C", "D"])), + Arc::new(arrow::array::Float32Array::from(vec![ + Some(11.0), + Some(12.0), + ])), + Arc::new(arrow::array::StringArray::from(vec![ + "2023-07-04", + "2023-07-04", + ])), + ], + ) + .unwrap(); + + let table = DeltaOps(table) + .write(vec![batch1, batch2]) + .with_write_batch_size(2) + .with_save_mode(SaveMode::Append) + .await + .unwrap(); + assert_eq!(table.version(), 1); + assert_eq!(table.get_files_count(), 1); + + let batch = RecordBatch::try_new( + Arc::clone(&arrow_schema.clone()), + vec![ + Arc::new(arrow::array::StringArray::from(vec!["C", "E"])), + Arc::new(arrow::array::Float32Array::from(vec![ + Some(12.15), + Some(11.15), + ])), + Arc::new(arrow::array::StringArray::from(vec![ + "2023-07-04", + "2023-07-04", + ])), + ], + ) 
+ .unwrap(); + let source = ctx.read_batch(batch).unwrap(); + + let (table, _metrics) = DeltaOps(table) + .merge(source, "target.id = source.id and target.id >= 'C'") + .with_source_alias("source") + .with_target_alias("target") + .when_matched_update(|insert| { + insert + .update("id", "target.id") + .update("cost", "source.cost") + .update("month", "target.month") + }) + .unwrap() + .when_not_matched_insert(|insert| { + insert + .set("id", "source.id") + .set("cost", "source.cost") + .set("month", "source.month") + }) + .unwrap() + .await + .unwrap(); + + let expected = vec![ + "+----+-------+------------+", + "| id | cost | month |", + "+----+-------+------------+", + "| A | 10.15 | 2023-07-04 |", + "| B | | 2023-07-04 |", + "| C | 12.15 | 2023-07-04 |", + "| D | 12.0 | 2023-07-04 |", + "| E | 11.15 | 2023-07-04 |", + "+----+-------+------------+", + ]; + let actual = get_data(&table).await; + assert_batches_sorted_eq!(&expected, &actual); + } + + #[tokio::test] + async fn test_merge_pushdowns_partitioned() { + //See #2158 + let schema = vec![ + StructField::new( + "id".to_string(), + DataType::Primitive(PrimitiveType::String), + true, + ), + StructField::new( + "cost".to_string(), + DataType::Primitive(PrimitiveType::Float), + true, + ), + StructField::new( + "month".to_string(), + DataType::Primitive(PrimitiveType::String), + true, + ), + ]; + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", ArrowDataType::Utf8, true), + Field::new("cost", ArrowDataType::Float32, true), + Field::new("month", ArrowDataType::Utf8, true), + ])); + + let part_cols = vec!["month"]; + let table = DeltaOps::new_in_memory() + .create() + .with_columns(schema) + .with_partition_columns(part_cols) + .await + .unwrap(); + + let ctx = SessionContext::new(); + let batch = RecordBatch::try_new( + Arc::clone(&arrow_schema.clone()), + vec![ + Arc::new(arrow::array::StringArray::from(vec!["A", "B"])), + Arc::new(arrow::array::Float32Array::from(vec![Some(10.15), None])), + Arc::new(arrow::array::StringArray::from(vec![ + "2023-07-04", + "2023-07-04", + ])), + ], + ) + .unwrap(); + + let table = DeltaOps(table) + .write(vec![batch.clone()]) + .with_save_mode(SaveMode::Append) + .await + .unwrap(); + assert_eq!(table.version(), 1); + assert_eq!(table.get_files_count(), 1); + + let batch = RecordBatch::try_new( + Arc::clone(&arrow_schema.clone()), + vec![ + Arc::new(arrow::array::StringArray::from(vec!["A", "B"])), + Arc::new(arrow::array::Float32Array::from(vec![ + Some(12.15), + Some(11.15), + ])), + Arc::new(arrow::array::StringArray::from(vec![ + "2023-07-04", + "2023-07-04", + ])), + ], + ) + .unwrap(); + let source = ctx.read_batch(batch).unwrap(); + + let (table, _metrics) = DeltaOps(table) + .merge(source, "target.id = source.id and target.cost is null") + .with_source_alias("source") + .with_target_alias("target") + .when_matched_update(|insert| { + insert + .update("id", "target.id") + .update("cost", "source.cost") + .update("month", "target.month") + }) + .unwrap() + .await + .unwrap(); + + let expected = vec![ + "+----+-------+------------+", + "| id | cost | month |", + "+----+-------+------------+", + "| A | 10.15 | 2023-07-04 |", + "| B | 11.15 | 2023-07-04 |", + "+----+-------+------------+", + ]; + let actual = get_data(&table).await; + assert_batches_sorted_eq!(&expected, &actual); + } } diff --git a/crates/core/src/operations/mod.rs b/crates/core/src/operations/mod.rs index 2271f36641..608bdb1549 100644 --- a/crates/core/src/operations/mod.rs +++ 
b/crates/core/src/operations/mod.rs @@ -7,6 +7,7 @@ //! with a [data stream][datafusion::physical_plan::SendableRecordBatchStream], //! if the operation returns data as well. +use self::add_column::AddColumnBuilder; use self::create::CreateBuilder; use self::filesystem_check::FileSystemCheckBuilder; use self::vacuum::VacuumBuilder; @@ -15,9 +16,11 @@ use crate::table::builder::DeltaTableBuilder; use crate::DeltaTable; use std::collections::HashMap; +pub mod add_column; pub mod cast; pub mod convert_to_delta; pub mod create; +pub mod drop_constraints; pub mod filesystem_check; pub mod optimize; pub mod restore; @@ -27,7 +30,8 @@ pub mod vacuum; #[cfg(feature = "datafusion")] use self::{ constraints::ConstraintBuilder, datafusion_utils::Expression, delete::DeleteBuilder, - load::LoadBuilder, merge::MergeBuilder, update::UpdateBuilder, write::WriteBuilder, + drop_constraints::DropConstraintBuilder, load::LoadBuilder, load_cdf::CdfLoadBuilder, + merge::MergeBuilder, update::UpdateBuilder, write::WriteBuilder, }; #[cfg(feature = "datafusion")] pub use ::datafusion::physical_plan::common::collect as collect_sendable_stream; @@ -35,7 +39,10 @@ pub use ::datafusion::physical_plan::common::collect as collect_sendable_stream; use arrow::record_batch::RecordBatch; use optimize::OptimizeBuilder; use restore::RestoreBuilder; +use set_tbl_properties::SetTablePropertiesBuilder; +#[cfg(all(feature = "cdf", feature = "datafusion"))] +mod cdc; #[cfg(feature = "datafusion")] pub mod constraints; #[cfg(feature = "datafusion")] @@ -43,14 +50,19 @@ pub mod delete; #[cfg(feature = "datafusion")] mod load; #[cfg(feature = "datafusion")] +pub mod load_cdf; +#[cfg(feature = "datafusion")] pub mod merge; +pub mod set_tbl_properties; #[cfg(feature = "datafusion")] pub mod update; #[cfg(feature = "datafusion")] pub mod write; pub mod writer; -// TODO make ops consume a snapshot ... +/// The [Operation] trait defines common behaviors that all operations builders +/// should have consistent +pub(crate) trait Operation: std::future::IntoFuture {} /// High level interface for executing commands against a DeltaTable pub struct DeltaOps(pub DeltaTable); @@ -132,6 +144,13 @@ impl DeltaOps { LoadBuilder::new(self.0.log_store, self.0.state.unwrap()) } + /// Load a table with CDF Enabled + #[cfg(feature = "datafusion")] + #[must_use] + pub fn load_cdf(self) -> CdfLoadBuilder { + CdfLoadBuilder::new(self.0.log_store, self.0.state.unwrap()) + } + /// Write data to Delta table #[cfg(feature = "datafusion")] #[must_use] @@ -199,6 +218,23 @@ impl DeltaOps { pub fn add_constraint(self) -> ConstraintBuilder { ConstraintBuilder::new(self.0.log_store, self.0.state.unwrap()) } + + /// Drops constraints from a table + #[cfg(feature = "datafusion")] + #[must_use] + pub fn drop_constraints(self) -> DropConstraintBuilder { + DropConstraintBuilder::new(self.0.log_store, self.0.state.unwrap()) + } + + /// Set table properties + pub fn set_tbl_properties(self) -> SetTablePropertiesBuilder { + SetTablePropertiesBuilder::new(self.0.log_store, self.0.state.unwrap()) + } + + /// Add new columns + pub fn add_columns(self) -> AddColumnBuilder { + AddColumnBuilder::new(self.0.log_store, self.0.state.unwrap()) + } } impl From for DeltaOps { @@ -219,6 +255,33 @@ impl AsRef for DeltaOps { } } +/// Get the num_idx_columns and stats_columns from the table configuration in the state +/// If table_config does not exist (only can occur in the first write action) it takes +/// the configuration that was passed to the writerBuilder. 
+pub fn get_num_idx_cols_and_stats_columns( + config: Option>, + configuration: HashMap>, +) -> (i32, Option>) { + let (num_index_cols, stats_columns) = match &config { + Some(conf) => (conf.num_indexed_cols(), conf.stats_columns()), + _ => ( + configuration + .get("delta.dataSkippingNumIndexedCols") + .and_then(|v| v.clone().map(|v| v.parse::().unwrap())) + .unwrap_or(crate::table::config::DEFAULT_NUM_INDEX_COLS), + configuration + .get("delta.dataSkippingStatsColumns") + .and_then(|v| v.as_ref().map(|v| v.split(',').collect::>())), + ), + }; + ( + num_index_cols, + stats_columns + .clone() + .map(|v| v.iter().map(|v| v.to_string()).collect::>()), + ) +} + #[cfg(feature = "datafusion")] mod datafusion_utils { use datafusion::execution::context::SessionState; @@ -228,6 +291,7 @@ mod datafusion_utils { use crate::{delta_datafusion::expr::parse_predicate_expression, DeltaResult}; /// Used to represent user input of either a Datafusion expression or string expression + #[derive(Debug)] pub enum Expression { /// Datafusion Expression DataFusion(Expr), diff --git a/crates/core/src/operations/optimize.rs b/crates/core/src/operations/optimize.rs index c67b31a71b..9e1641fc7f 100644 --- a/crates/core/src/operations/optimize.rs +++ b/crates/core/src/operations/optimize.rs @@ -20,29 +20,33 @@ //! let (table, metrics) = OptimizeBuilder::new(table.object_store(), table.state).await?; //! ```` -use std::collections::{BTreeMap, HashMap}; +use std::collections::HashMap; +use std::fmt; use std::sync::Arc; use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; use arrow::datatypes::SchemaRef as ArrowSchemaRef; use arrow_array::RecordBatch; +use delta_kernel::expressions::Scalar; use futures::future::BoxFuture; use futures::stream::BoxStream; use futures::{Future, StreamExt, TryStreamExt}; +use indexmap::IndexMap; use itertools::Itertools; use num_cpus; use parquet::arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStreamBuilder}; use parquet::basic::{Compression, ZstdLevel}; use parquet::errors::ParquetError; use parquet::file::properties::WriterProperties; -use serde::{Deserialize, Serialize}; +use serde::{de::Error as DeError, Deserialize, Deserializer, Serialize, Serializer}; use tracing::debug; -use super::transaction::{commit, PROTOCOL}; +use super::transaction::PROTOCOL; use super::writer::{PartitionWriter, PartitionWriterConfig}; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::kernel::{Action, PartitionsExt, Remove, Scalar}; +use crate::kernel::{scalars::ScalarExt, Action, PartitionsExt, Remove}; use crate::logstore::LogStoreRef; +use crate::operations::transaction::{CommitBuilder, CommitProperties, DEFAULT_RETRIES}; use crate::protocol::DeltaOperation; use crate::storage::ObjectStoreRef; use crate::table::state::DeltaTableState; @@ -58,8 +62,16 @@ pub struct Metrics { /// Number of unoptimized files removed pub num_files_removed: u64, /// Detailed metrics for the add operation + #[serde( + serialize_with = "serialize_metric_details", + deserialize_with = "deserialize_metric_details" + )] pub files_added: MetricDetails, /// Detailed metrics for the remove operation + #[serde( + serialize_with = "serialize_metric_details", + deserialize_with = "deserialize_metric_details" + )] pub files_removed: MetricDetails, /// Number of partitions that had at least one file optimized pub partitions_optimized: u64, @@ -73,17 +85,34 @@ pub struct Metrics { pub preserve_insertion_order: bool, } +// Custom serialization function that serializes metric details as a string +fn 
serialize_metric_details(value: &MetricDetails, serializer: S) -> Result +where + S: Serializer, +{ + serializer.serialize_str(&value.to_string()) +} + +// Custom deserialization that parses a JSON string into MetricDetails +fn deserialize_metric_details<'de, D>(deserializer: D) -> Result +where + D: Deserializer<'de>, +{ + let s: String = Deserialize::deserialize(deserializer)?; + serde_json::from_str(&s).map_err(DeError::custom) +} + /// Statistics on files for a particular operation /// Operation can be remove or add #[derive(Debug, PartialEq, Clone, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] pub struct MetricDetails { - /// Minimum file size of a operation - pub min: i64, - /// Maximum file size of a operation - pub max: i64, /// Average file size of a operation pub avg: f64, + /// Maximum file size of a operation + pub max: i64, + /// Minimum file size of a operation + pub min: i64, /// Number of files encountered during operation pub total_files: usize, /// Sum of file sizes of a operation @@ -101,6 +130,13 @@ impl MetricDetails { } } +impl fmt::Display for MetricDetails { + /// Display the metric details using serde serialization + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + serde_json::to_string(self).map_err(|_| fmt::Error)?.fmt(f) + } +} + /// Metrics for a single partition pub struct PartialMetrics { /// Number of optimized files added @@ -163,8 +199,8 @@ pub struct OptimizeBuilder<'a> { target_size: Option, /// Properties passed to underlying parquet writer writer_properties: Option, - /// Additional metadata to be added to commit - app_metadata: Option>, + /// Commit properties and configuration + commit_properties: CommitProperties, /// Whether to preserve insertion order within files (default false) preserve_insertion_order: bool, /// Max number of concurrent tasks (default is number of cpus) @@ -176,6 +212,8 @@ pub struct OptimizeBuilder<'a> { min_commit_interval: Option, } +impl super::Operation<()> for OptimizeBuilder<'_> {} + impl<'a> OptimizeBuilder<'a> { /// Create a new [`OptimizeBuilder`] pub fn new(log_store: LogStoreRef, snapshot: DeltaTableState) -> Self { @@ -185,7 +223,7 @@ impl<'a> OptimizeBuilder<'a> { filters: &[], target_size: None, writer_properties: None, - app_metadata: None, + commit_properties: CommitProperties::default(), preserve_insertion_order: false, max_concurrent_tasks: num_cpus::get(), max_spill_size: 20 * 1024 * 1024 * 2014, // 20 GB. 
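The optimize metrics above now serialize each `MetricDetails` as an embedded JSON string (and expose the same representation through `Display`). The following is a minimal standalone sketch of that serde round trip; the `Details`/`Report` types and the field values are illustrative only, not the crate's API.

```rust
use serde::{de::Error as DeError, Deserialize, Deserializer, Serialize, Serializer};

#[derive(Debug, PartialEq, Serialize, Deserialize)]
struct Details {
    avg: f64,
    max: i64,
    min: i64,
}

// Serialize the nested struct as a JSON string instead of a nested object.
fn as_json_string<S>(value: &Details, serializer: S) -> Result<S::Ok, S::Error>
where
    S: Serializer,
{
    let json = serde_json::to_string(value)
        .map_err(<S::Error as serde::ser::Error>::custom)?;
    serializer.serialize_str(&json)
}

// Parse the JSON string back into the nested struct.
fn from_json_string<'de, D>(deserializer: D) -> Result<Details, D::Error>
where
    D: Deserializer<'de>,
{
    let s: String = Deserialize::deserialize(deserializer)?;
    serde_json::from_str(&s).map_err(DeError::custom)
}

#[derive(Debug, Serialize, Deserialize)]
struct Report {
    #[serde(
        serialize_with = "as_json_string",
        deserialize_with = "from_json_string"
    )]
    files_added: Details,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let report = Report {
        files_added: Details { avg: 1024.0, max: 2048, min: 512 },
    };
    // The nested metrics end up as an escaped JSON string, e.g.
    // {"files_added":"{\"avg\":1024.0,\"max\":2048,\"min\":512}"}
    let json = serde_json::to_string(&report)?;
    let round_trip: Report = serde_json::from_str(&json)?;
    assert_eq!(round_trip.files_added, report.files_added);
    Ok(())
}
```

Round-tripping through `serde_json::from_str` is what lets `deserialize_metric_details` recover the struct from that string form.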
@@ -218,12 +256,9 @@ impl<'a> OptimizeBuilder<'a> { self } - /// Additional metadata to be added to commit info - pub fn with_metadata( - mut self, - metadata: impl IntoIterator, - ) -> Self { - self.app_metadata = Some(HashMap::from_iter(metadata)); + /// Additonal information to write to the commit + pub fn with_commit_properties(mut self, commit_properties: CommitProperties) -> Self { + self.commit_properties = commit_properties; self } @@ -260,7 +295,7 @@ impl<'a> std::future::IntoFuture for OptimizeBuilder<'a> { let this = self; Box::pin(async move { - PROTOCOL.can_write_to(&this.snapshot)?; + PROTOCOL.can_write_to(&this.snapshot.snapshot)?; let writer_properties = this.writer_properties.unwrap_or_else(|| { WriterProperties::builder() @@ -282,7 +317,7 @@ impl<'a> std::future::IntoFuture for OptimizeBuilder<'a> { this.max_concurrent_tasks, this.max_spill_size, this.min_commit_interval, - this.app_metadata, + this.commit_properties, ) .await?; let mut table = DeltaTable::new_with_state(this.log_store, this.snapshot); @@ -295,20 +330,21 @@ impl<'a> std::future::IntoFuture for OptimizeBuilder<'a> { #[derive(Debug, Clone)] struct OptimizeInput { target_size: i64, + predicate: Option, } impl From for DeltaOperation { fn from(opt_input: OptimizeInput) -> Self { DeltaOperation::Optimize { target_size: opt_input.target_size, - predicate: None, + predicate: opt_input.predicate, } } } fn create_remove( path: &str, - partitions: &BTreeMap, + partitions: &IndexMap, size: i64, ) -> Result { // NOTE unwrap is safe since UNIX_EPOCH will always be earlier then now. @@ -353,11 +389,11 @@ enum OptimizeOperations { /// /// Bins are determined by the bin-packing algorithm to reach an optimal size. /// Files that are large enough already are skipped. Bins of size 1 are dropped. - Compact(HashMap, Vec)>), + Compact(HashMap, Vec)>), /// Plan to Z-order each partition ZOrder( Vec, - HashMap, MergeBin)>, + HashMap, MergeBin)>, ), // TODO: Sort } @@ -389,6 +425,10 @@ pub struct MergeTaskParameters { file_schema: ArrowSchemaRef, /// Properties passed to parquet writer writer_properties: WriterProperties, + /// Num index cols to collect stats for + num_indexed_cols: i32, + /// Stats columns, specific columns to collect stats from, takes precedence over num_indexed_cols + stats_columns: Option>, } /// A stream of record batches, with a ParquetError on failure. @@ -401,7 +441,7 @@ impl MergePlan { /// collected during the operation. 
async fn rewrite_files( task_parameters: Arc, - partition_values: BTreeMap, + partition_values: IndexMap, files: MergeBin, object_store: ObjectStoreRef, read_stream: F, @@ -448,15 +488,24 @@ impl MergePlan { Some(task_parameters.input_parameters.target_size as usize), None, )?; - let mut writer = PartitionWriter::try_with_config(object_store, writer_config)?; + let mut writer = PartitionWriter::try_with_config( + object_store, + writer_config, + task_parameters.num_indexed_cols, + task_parameters.stats_columns.clone(), + )?; let mut read_stream = read_stream.await?; while let Some(maybe_batch) = read_stream.next().await { let mut batch = maybe_batch?; - batch = - super::cast::cast_record_batch(&batch, task_parameters.file_schema.clone(), false)?; + batch = super::cast::cast_record_batch( + &batch, + task_parameters.file_schema.clone(), + false, + true, + )?; partial_metrics.num_batches += 1; writer.write(&batch).await.map_err(DeltaTableError::from)?; } @@ -606,7 +655,7 @@ impl MergePlan { #[allow(unused_variables)] // used behind a feature flag max_spill_size: usize, min_commit_interval: Option, - app_metadata: Option>, + commit_properties: CommitProperties, ) -> Result { let operations = std::mem::take(&mut self.operations); @@ -698,6 +747,7 @@ impl MergePlan { let mut total_metrics = orig_metrics.clone(); let mut last_commit = Instant::now(); + let mut commits_made = 0; loop { let next = stream.next().await.transpose()?; @@ -720,31 +770,34 @@ impl MergePlan { last_commit = now; buffered_metrics.preserve_insertion_order = true; - let mut app_metadata = match app_metadata.clone() { - Some(meta) => meta, - None => HashMap::new(), - }; - app_metadata.insert("readVersion".to_owned(), self.read_table_version.into()); + let mut properties = CommitProperties::default(); + properties.app_metadata = commit_properties.app_metadata.clone(); + properties + .app_metadata + .insert("readVersion".to_owned(), self.read_table_version.into()); let maybe_map_metrics = serde_json::to_value(std::mem::replace( &mut buffered_metrics, orig_metrics.clone(), )); if let Ok(map) = maybe_map_metrics { - app_metadata.insert("operationMetrics".to_owned(), map); + properties + .app_metadata + .insert("operationMetrics".to_owned(), map); } - table.update().await?; debug!("committing {} actions", actions.len()); - //// TODO: Check for remove actions on optimized partitions. If a - //// optimized partition was updated then abort the commit. Requires (#593). 
- commit( - table.log_store.as_ref(), - &actions, - self.task_parameters.input_parameters.clone().into(), - Some(table.snapshot()?), - Some(app_metadata.clone()), - ) - .await?; + + CommitBuilder::from(properties) + .with_actions(actions) + .with_max_retries(DEFAULT_RETRIES + commits_made) + .build( + Some(snapshot), + log_store.clone(), + self.task_parameters.input_parameters.clone().into(), + ) + .await?; + + commits_made += 1; } if end { @@ -760,6 +813,8 @@ impl MergePlan { total_metrics.files_removed.min = 0; } + table.update().await?; + Ok(total_metrics) } } @@ -782,7 +837,10 @@ pub fn create_merge_plan( } }; - let input_parameters = OptimizeInput { target_size }; + let input_parameters = OptimizeInput { + target_size, + predicate: serde_json::to_string(filters).ok(), + }; let file_schema = arrow_schema_without_partitions(&Arc::new(snapshot.schema().try_into()?), partitions_keys); @@ -793,6 +851,11 @@ pub fn create_merge_plan( input_parameters, file_schema, writer_properties, + num_indexed_cols: snapshot.table_config().num_indexed_cols(), + stats_columns: snapshot + .table_config() + .stats_columns() + .map(|v| v.iter().map(|v| v.to_string()).collect::>()), }), read_table_version: snapshot.version(), }) @@ -849,7 +912,7 @@ fn build_compaction_plan( ) -> Result<(OptimizeOperations, Metrics), DeltaTableError> { let mut metrics = Metrics::default(); - let mut partition_files: HashMap, Vec)> = + let mut partition_files: HashMap, Vec)> = HashMap::new(); for add in snapshot.get_active_add_actions_by_partitions(filters)? { let add = add?; @@ -863,7 +926,7 @@ fn build_compaction_plan( .partition_values()? .into_iter() .map(|(k, v)| (k.to_string(), v)) - .collect::>(); + .collect::>(); partition_files .entry(add.partition_values()?.hive_partition_path()) @@ -877,7 +940,7 @@ fn build_compaction_plan( file.sort_by(|a, b| b.size.cmp(&a.size)); } - let mut operations: HashMap, Vec)> = HashMap::new(); + let mut operations: HashMap, Vec)> = HashMap::new(); for (part, (partition, files)) in partition_files { let mut merge_bins = vec![MergeBin::new()]; @@ -939,7 +1002,6 @@ fn build_zorder_plan( let field_names = snapshot .schema() .fields() - .iter() .map(|field| field.name().to_string()) .collect_vec(); let unknown_columns = zorder_columns @@ -955,14 +1017,14 @@ fn build_zorder_plan( // For now, just be naive and optimize all files in each selected partition. let mut metrics = Metrics::default(); - let mut partition_files: HashMap, MergeBin)> = HashMap::new(); + let mut partition_files: HashMap, MergeBin)> = HashMap::new(); for add in snapshot.get_active_add_actions_by_partitions(filters)? { let add = add?; let partition_values = add .partition_values()? .into_iter() .map(|(k, v)| (k.to_string(), v)) - .collect::>(); + .collect::>(); metrics.total_considered_files += 1; let object_meta = ObjectMeta::try_from(&add)?; @@ -1283,7 +1345,7 @@ pub(super) mod zorder { "+-----+-----+-----------+", ]; - let expected = vec![expected_1, expected_2, expected_3]; + let expected = [expected_1, expected_2, expected_3]; let indices = Int32Array::from(shuffled_indices().to_vec()); let shuffled_columns = batch diff --git a/crates/core/src/operations/restore.rs b/crates/core/src/operations/restore.rs index 2718ee34fb..e2ab9741bc 100644 --- a/crates/core/src/operations/restore.rs +++ b/crates/core/src/operations/restore.rs @@ -21,7 +21,7 @@ //! 
```` use std::cmp::max; -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; use std::ops::BitXor; use std::time::{SystemTime, UNIX_EPOCH}; @@ -33,11 +33,12 @@ use serde::Serialize; use crate::kernel::{Action, Add, Protocol, Remove}; use crate::logstore::LogStoreRef; -use crate::operations::transaction::{prepare_commit, TransactionError}; use crate::protocol::DeltaOperation; use crate::table::state::DeltaTableState; use crate::{DeltaResult, DeltaTable, DeltaTableConfig, DeltaTableError, ObjectStoreError}; +use super::transaction::{CommitBuilder, CommitProperties, TransactionError}; + /// Errors that can occur during restore #[derive(thiserror::Error, Debug)] enum RestoreError { @@ -84,10 +85,12 @@ pub struct RestoreBuilder { ignore_missing_files: bool, /// Protocol downgrade allowed protocol_downgrade_allowed: bool, - /// Additional metadata to be added to commit - app_metadata: Option>, + /// Additional information to add to the commit + commit_properties: CommitProperties, } +impl super::Operation<()> for RestoreBuilder {} + impl RestoreBuilder { /// Create a new [`RestoreBuilder`] pub fn new(log_store: LogStoreRef, snapshot: DeltaTableState) -> Self { @@ -98,7 +101,7 @@ impl RestoreBuilder { datetime_to_restore: None, ignore_missing_files: false, protocol_downgrade_allowed: false, - app_metadata: None, + commit_properties: CommitProperties::default(), } } @@ -128,11 +131,8 @@ impl RestoreBuilder { } /// Additional metadata to be added to commit info - pub fn with_metadata( - mut self, - metadata: impl IntoIterator, - ) -> Self { - self.app_metadata = Some(HashMap::from_iter(metadata)); + pub fn with_commit_properties(mut self, commit_properties: CommitProperties) -> Self { + self.commit_properties = commit_properties; self } } @@ -144,7 +144,7 @@ async fn execute( datetime_to_restore: Option>, ignore_missing_files: bool, protocol_downgrade_allowed: bool, - app_metadata: Option>, + mut commit_properties: CommitProperties, ) -> DeltaResult { if !(version_to_restore .is_none() @@ -248,43 +248,41 @@ async fn execute( reader_features: snapshot.protocol().reader_features.clone(), } }; - let mut app_metadata = match app_metadata { - Some(meta) => meta, - None => HashMap::new(), - }; - - app_metadata.insert("readVersion".to_owned(), snapshot.version().into()); - - if let Ok(map) = serde_json::to_value(&metrics) { - app_metadata.insert("operationMetrics".to_owned(), map); - } + commit_properties + .app_metadata + .insert("readVersion".to_owned(), snapshot.version().into()); + commit_properties.app_metadata.insert( + "operationMetrics".to_owned(), + serde_json::to_value(&metrics)?, + ); actions.push(Action::Protocol(protocol)); actions.extend(files_to_add.into_iter().map(Action::Add)); actions.extend(files_to_remove.into_iter().map(Action::Remove)); - let commit = prepare_commit( - log_store.object_store().as_ref(), - &DeltaOperation::Restore { - version: version_to_restore, - datetime: datetime_to_restore.map(|time| -> i64 { time.timestamp_millis() }), - }, - &actions, - Some(app_metadata), - ) - .await?; + let operation = DeltaOperation::Restore { + version: version_to_restore, + datetime: datetime_to_restore.map(|time| -> i64 { time.timestamp_millis() }), + }; + + let prepared_commit = CommitBuilder::from(commit_properties) + .with_actions(actions) + .build(Some(&snapshot), log_store.clone(), operation) + .into_prepared_commit_future() + .await?; + let commit_version = snapshot.version() + 1; - match log_store.write_commit_entry(commit_version, &commit).await { + let 
commit = prepared_commit.path(); + match log_store.write_commit_entry(commit_version, commit).await { Ok(_) => {} Err(err @ TransactionError::VersionAlreadyExists(_)) => { return Err(err.into()); } Err(err) => { - log_store.object_store().delete(&commit).await?; + log_store.abort_commit_entry(commit_version, commit).await?; return Err(err.into()); } } - Ok(metrics) } @@ -322,7 +320,7 @@ impl std::future::IntoFuture for RestoreBuilder { this.datetime_to_restore, this.ignore_missing_files, this.protocol_downgrade_allowed, - this.app_metadata, + this.commit_properties, ) .await?; let mut table = DeltaTable::new_with_state(this.log_store, this.snapshot); diff --git a/crates/core/src/operations/set_tbl_properties.rs b/crates/core/src/operations/set_tbl_properties.rs new file mode 100644 index 0000000000..b3ca7607ac --- /dev/null +++ b/crates/core/src/operations/set_tbl_properties.rs @@ -0,0 +1,111 @@ +//! Set table properties on a table + +use std::collections::HashMap; + +use futures::future::BoxFuture; + +use super::transaction::{CommitBuilder, CommitProperties}; +use crate::kernel::Action; +use crate::logstore::LogStoreRef; +use crate::protocol::DeltaOperation; +use crate::table::state::DeltaTableState; +use crate::DeltaResult; +use crate::DeltaTable; + +/// Remove constraints from the table +pub struct SetTablePropertiesBuilder { + /// A snapshot of the table's state + snapshot: DeltaTableState, + /// Name of the property + properties: HashMap, + /// Raise if property doesn't exist + raise_if_not_exists: bool, + /// Delta object store for handling data files + log_store: LogStoreRef, + /// Additional information to add to the commit + commit_properties: CommitProperties, +} + +impl SetTablePropertiesBuilder { + /// Create a new builder + pub fn new(log_store: LogStoreRef, snapshot: DeltaTableState) -> Self { + Self { + properties: HashMap::new(), + raise_if_not_exists: true, + snapshot, + log_store, + commit_properties: CommitProperties::default(), + } + } + + /// Specify the properties to be removed + pub fn with_properties(mut self, table_properties: HashMap) -> Self { + self.properties = table_properties; + self + } + + /// Specify if you want to raise if the property does not exist + pub fn with_raise_if_not_exists(mut self, raise: bool) -> Self { + self.raise_if_not_exists = raise; + self + } + + /// Additional metadata to be added to commit info + pub fn with_commit_properties(mut self, commit_properties: CommitProperties) -> Self { + self.commit_properties = commit_properties; + self + } +} + +impl std::future::IntoFuture for SetTablePropertiesBuilder { + type Output = DeltaResult; + + type IntoFuture = BoxFuture<'static, Self::Output>; + + fn into_future(self) -> Self::IntoFuture { + let this = self; + + Box::pin(async move { + let mut metadata = this.snapshot.metadata().clone(); + + let current_protocol = this.snapshot.protocol(); + let properties = this.properties; + + let new_protocol = current_protocol + .clone() + .apply_properties_to_protocol(&properties, this.raise_if_not_exists)?; + + metadata.configuration.extend( + properties + .clone() + .into_iter() + .map(|(k, v)| (k, Some(v))) + .collect::>>(), + ); + + let final_protocol = + new_protocol.move_table_properties_into_features(&metadata.configuration); + + let operation = DeltaOperation::SetTableProperties { properties }; + + let mut actions = vec![Action::Metadata(metadata)]; + + if current_protocol.ne(&final_protocol) { + actions.push(Action::Protocol(final_protocol)); + } + + let commit = 
CommitBuilder::from(this.commit_properties) + .with_actions(actions.clone()) + .build( + Some(&this.snapshot), + this.log_store.clone(), + operation.clone(), + ) + .await?; + Ok(DeltaTable::new_with_state( + this.log_store, + commit.snapshot(), + )) + }) + } +} diff --git a/crates/core/src/operations/transaction/application.rs b/crates/core/src/operations/transaction/application.rs new file mode 100644 index 0000000000..5a636bcecf --- /dev/null +++ b/crates/core/src/operations/transaction/application.rs @@ -0,0 +1,136 @@ +#[cfg(test)] +mod tests { + use crate::{ + checkpoints, kernel::Transaction, operations::transaction::CommitProperties, + protocol::SaveMode, writer::test_utils::get_record_batch, DeltaOps, DeltaTableBuilder, + }; + + #[tokio::test] + async fn test_app_txn_workload() { + // Test that the transaction ids can be read from different scenarios + // 1. Write new table to storage + // 2. Read new table + // 3. Write to table a new txn id and then update a different table state that uses the same underlying table + // 4. Write a checkpoint and read that checkpoint. + + let tmp_dir = tempfile::tempdir().unwrap(); + let tmp_path = std::fs::canonicalize(tmp_dir.path()).unwrap(); + + let batch = get_record_batch(None, false); + let table = DeltaOps::try_from_uri(tmp_path.to_str().unwrap()) + .await + .unwrap() + .write(vec![batch.clone()]) + .with_save_mode(SaveMode::ErrorIfExists) + .with_partition_columns(["modified"]) + .with_commit_properties( + CommitProperties::default() + .with_application_transaction(Transaction::new("my-app", 1)), + ) + .await + .unwrap(); + assert_eq!(table.version(), 0); + assert_eq!(table.get_files_count(), 2); + + let app_txns = table.get_app_transaction_version(); + assert_eq!(app_txns.len(), 1); + assert_eq!(app_txns.get("my-app").map(|t| t.version), Some(1)); + + // Test Txn Id can be read from existing table + + let mut table2 = DeltaTableBuilder::from_uri(tmp_path.to_str().unwrap()) + .load() + .await + .unwrap(); + let app_txns2 = table2.get_app_transaction_version(); + + assert_eq!(app_txns2.len(), 1); + assert_eq!(app_txns2.get("my-app").map(|t| t.version), Some(1)); + + // Write new data to the table and check that `update` functions work + + let table = DeltaOps::from(table) + .write(vec![get_record_batch(None, false)]) + .with_commit_properties( + CommitProperties::default() + .with_application_transaction(Transaction::new("my-app", 3)), + ) + .await + .unwrap(); + + assert_eq!(table.version(), 1); + let app_txns = table.get_app_transaction_version(); + assert_eq!(app_txns.len(), 1); + assert_eq!(app_txns.get("my-app").map(|t| t.version), Some(3)); + + table2.update_incremental(None).await.unwrap(); + assert_eq!(table2.version(), 1); + let app_txns2 = table2.get_app_transaction_version(); + assert_eq!(app_txns2.len(), 1); + assert_eq!(app_txns2.get("my-app").map(|t| t.version), Some(3)); + + // Create a checkpoint and then load + checkpoints::create_checkpoint(&table).await.unwrap(); + let table3 = DeltaTableBuilder::from_uri(tmp_path.to_str().unwrap()) + .load() + .await + .unwrap(); + let app_txns3 = table2.get_app_transaction_version(); + assert_eq!(app_txns3.len(), 1); + assert_eq!(app_txns3.get("my-app").map(|t| t.version), Some(3)); + assert_eq!(table3.version(), 1); + } + + #[tokio::test] + async fn test_app_txn_conflict() { + // A conflict must be raised whenever the same application id is used for two concurrent transactions + + let tmp_dir = tempfile::tempdir().unwrap(); + let tmp_path = 
std::fs::canonicalize(tmp_dir.path()).unwrap(); + + let batch = get_record_batch(None, false); + let table = DeltaOps::try_from_uri(tmp_path.to_str().unwrap()) + .await + .unwrap() + .write(vec![batch.clone()]) + .with_save_mode(SaveMode::ErrorIfExists) + .with_partition_columns(["modified"]) + .with_commit_properties( + CommitProperties::default() + .with_application_transaction(Transaction::new("my-app", 1)), + ) + .await + .unwrap(); + assert_eq!(table.version(), 0); + + let table2 = DeltaTableBuilder::from_uri(tmp_path.to_str().unwrap()) + .load() + .await + .unwrap(); + assert_eq!(table2.version(), 0); + + let table = DeltaOps::from(table) + .write(vec![get_record_batch(None, false)]) + .with_commit_properties( + CommitProperties::default() + .with_application_transaction(Transaction::new("my-app", 2)), + ) + .await + .unwrap(); + assert_eq!(table.version(), 1); + + let res = DeltaOps::from(table2) + .write(vec![get_record_batch(None, false)]) + .with_commit_properties( + CommitProperties::default() + .with_application_transaction(Transaction::new("my-app", 3)), + ) + .await; + + let err = res.err().unwrap(); + assert_eq!( + err.to_string(), + "Transaction failed: Failed to commit transaction: Concurrent transaction failed." + ); + } +} diff --git a/crates/core/src/operations/transaction/conflict_checker.rs b/crates/core/src/operations/transaction/conflict_checker.rs index abd5351ef9..d44c704b53 100644 --- a/crates/core/src/operations/transaction/conflict_checker.rs +++ b/crates/core/src/operations/transaction/conflict_checker.rs @@ -2,12 +2,15 @@ use std::collections::HashSet; use super::CommitInfo; +#[cfg(feature = "datafusion")] +use crate::delta_datafusion::DataFusionMixins; use crate::errors::DeltaResult; +use crate::kernel::EagerSnapshot; +use crate::kernel::Transaction; use crate::kernel::{Action, Add, Metadata, Protocol, Remove}; use crate::logstore::{get_actions, LogStore}; use crate::protocol::DeltaOperation; use crate::table::config::IsolationLevel; -use crate::table::state::DeltaTableState; use crate::DeltaTableError; #[cfg(feature = "datafusion")] @@ -98,9 +101,9 @@ pub(crate) struct TransactionInfo<'a> { /// appIds that have been seen by the transaction pub(crate) read_app_ids: HashSet, /// delta log actions that the transaction wants to commit - actions: &'a Vec, + actions: &'a [Action], /// read [`DeltaTableState`] used for the transaction - pub(crate) read_snapshot: &'a DeltaTableState, + pub(crate) read_snapshot: &'a EagerSnapshot, /// Whether the transaction tainted the whole table read_whole_table: bool, } @@ -108,9 +111,9 @@ pub(crate) struct TransactionInfo<'a> { impl<'a> TransactionInfo<'a> { #[cfg(feature = "datafusion")] pub fn try_new( - read_snapshot: &'a DeltaTableState, + read_snapshot: &'a EagerSnapshot, read_predicates: Option, - actions: &'a Vec, + actions: &'a [Action], read_whole_table: bool, ) -> DeltaResult { use datafusion::prelude::SessionContext; @@ -119,10 +122,18 @@ impl<'a> TransactionInfo<'a> { let read_predicates = read_predicates .map(|pred| read_snapshot.parse_predicate_expression(pred, &session.state())) .transpose()?; + + let mut read_app_ids = HashSet::::new(); + for action in actions.iter() { + if let Action::Txn(Transaction { app_id, .. 
}) = action { + read_app_ids.insert(app_id.clone()); + } + } + Ok(Self { txn_id: "".into(), read_predicates, - read_app_ids: Default::default(), + read_app_ids, actions, read_snapshot, read_whole_table, @@ -132,15 +143,21 @@ impl<'a> TransactionInfo<'a> { #[cfg(feature = "datafusion")] #[allow(unused)] pub fn new( - read_snapshot: &'a DeltaTableState, + read_snapshot: &'a EagerSnapshot, read_predicates: Option, actions: &'a Vec, read_whole_table: bool, ) -> Self { + let mut read_app_ids = HashSet::::new(); + for action in actions.iter() { + if let Action::Txn(Transaction { app_id, .. }) = action { + read_app_ids.insert(app_id.clone()); + } + } Self { txn_id: "".into(), read_predicates, - read_app_ids: Default::default(), + read_app_ids, actions, read_snapshot, read_whole_table, @@ -149,15 +166,21 @@ impl<'a> TransactionInfo<'a> { #[cfg(not(feature = "datafusion"))] pub fn try_new( - read_snapshot: &'a DeltaTableState, + read_snapshot: &'a EagerSnapshot, read_predicates: Option, actions: &'a Vec, read_whole_table: bool, ) -> DeltaResult { + let mut read_app_ids = HashSet::::new(); + for action in actions.iter() { + if let Action::Txn(Transaction { app_id, .. }) = action { + read_app_ids.insert(app_id.clone()); + } + } Ok(Self { txn_id: "".into(), read_predicates, - read_app_ids: Default::default(), + read_app_ids, actions, read_snapshot, read_whole_table, @@ -173,14 +196,16 @@ impl<'a> TransactionInfo<'a> { #[cfg(feature = "datafusion")] /// Files read by the transaction - pub fn read_files(&self) -> Result, CommitConflictError> { + pub fn read_files(&self) -> Result + '_, CommitConflictError> { + use crate::delta_datafusion::files_matching_predicate; + if let Some(predicate) = &self.read_predicates { Ok(Either::Left( - self.read_snapshot - .files_matching_predicate(&[predicate.clone()]) - .map_err(|err| CommitConflictError::Predicate { + files_matching_predicate(self.read_snapshot, &[predicate.clone()]).map_err( + |err| CommitConflictError::Predicate { source: Box::new(err), - })?, + }, + )?, )) } else { Ok(Either::Right(std::iter::empty())) @@ -189,8 +214,8 @@ impl<'a> TransactionInfo<'a> { #[cfg(not(feature = "datafusion"))] /// Files read by the transaction - pub fn read_files(&self) -> Result, CommitConflictError> { - Ok(self.read_snapshot.file_actions().unwrap().into_iter()) + pub fn read_files(&self) -> Result + '_, CommitConflictError> { + Ok(self.read_snapshot.file_actions().unwrap()) } /// Whether the whole table was read during the transaction @@ -307,13 +332,6 @@ impl WinningCommitSummary { } } - // pub fn only_add_files(&self) -> bool { - // !self - // .actions - // .iter() - // .any(|action| matches!(action, Action::remove(_))) - // } - pub fn is_blind_append(&self) -> Option { self.commit_info .as_ref() @@ -677,9 +695,12 @@ mod tests { actions: Vec, read_whole_table: bool, ) -> Result<(), CommitConflictError> { + use crate::table::state::DeltaTableState; + let setup_actions = setup.unwrap_or_else(|| init_table_actions(None)); let state = DeltaTableState::from_actions(setup_actions).unwrap(); - let transaction_info = TransactionInfo::new(&state, reads, &actions, read_whole_table); + let snapshot = state.snapshot(); + let transaction_info = TransactionInfo::new(snapshot, reads, &actions, read_whole_table); let summary = WinningCommitSummary { actions: concurrent, commit_info: None, diff --git a/crates/core/src/operations/transaction/mod.rs b/crates/core/src/operations/transaction/mod.rs index 63d1789e0a..babff18439 100644 --- a/crates/core/src/operations/transaction/mod.rs +++ 
b/crates/core/src/operations/transaction/mod.rs @@ -1,22 +1,104 @@ -//! Delta transactions -use std::collections::HashMap; +//! Add a commit entry to the Delta Table. +//! This module provides a unified interface for modifying commit behavior and attributes. +//! +//! [`CommitProperties`] provides a unified client interface for all Delta operations. +//! Internally this is used to initialize a [`CommitBuilder`]. +//! +//! For advanced use cases [`CommitBuilder`] can be used which allows +//! finer control over the commit process. The builder can be converted +//! into a future that yields either a [`PreparedCommit`] or a [`FinalizedCommit`]. +//! +//! A [`PreparedCommit`] represents a temporary commit marker written to storage. +//! To convert to a [`FinalizedCommit`] an atomic rename is attempted. If the rename fails +//! then conflict resolution is performed and the atomic rename is tried for the latest version. +//! +//!
+//!                                          Client Interface
+//!        ┌─────────────────────────────┐                    
+//!        │      Commit Properties      │                    
+//!        │                             │                    
+//!        │ Public commit interface for │                    
+//!        │     all Delta Operations    │                    
+//!        │                             │                    
+//!        └─────────────┬───────────────┘                    
+//!                      │                                    
+//! ─────────────────────┼────────────────────────────────────
+//!                      │                                    
+//!                      ▼                  Advanced Interface
+//!        ┌─────────────────────────────┐                    
+//!        │       Commit Builder        │                    
+//!        │                             │                    
+//!        │   Advanced entry point for  │                    
+//!        │     creating a commit       │                    
+//!        └─────────────┬───────────────┘                    
+//!                      │                                    
+//!                      ▼                                    
+//!     ┌───────────────────────────────────┐                 
+//!     │                                   │                 
+//!     │ ┌───────────────────────────────┐ │                 
+//!     │ │        Prepared Commit        │ │                 
+//!     │ │                               │ │                 
+//!     │ │     Represents a temporary    │ │                 
+//!     │ │   commit marker written to    │ │                 
+//!     │ │           storage             │ │                 
+//!     │ └──────────────┬────────────────┘ │                 
+//!     │                │                  │                 
+//!     │                ▼                  │                 
+//!     │ ┌───────────────────────────────┐ │                 
+//!     │ │       Finalize Commit         │ │                 
+//!     │ │                               │ │                 
+//!     │ │   Convert the commit marker   │ │                 
+//!     │ │   to a commit using atomic    │ │                 
+//!     │ │         operations            │ │                 
+//!     │ │                               │ │                 
+//!     │ └───────────────────────────────┘ │                 
+//!     │                                   │                 
+//!     └────────────────┬──────────────────┘                 
+//!                      │                                    
+//!                      ▼                                    
+//!       ┌───────────────────────────────┐                   
+//!       │          Post Commit          │                   
+//!       │                               │                   
+//!       │ Commit that was materialized  │                   
+//!       │ to storage with post commit   │                   
+//!       │      hooks to be executed     │                   
+//!       └──────────────┬────────────────┘                 
+//!                      │                                    
+//!                      ▼    
+//!       ┌───────────────────────────────┐                   
+//!       │        Finalized Commit       │                   
+//!       │                               │                   
+//!       │ Commit that was materialized  │                   
+//!       │         to storage            │                   
+//!       │                               │                   
+//!       └───────────────────────────────┘           
+//!
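To make the flow in the diagram concrete, here is a hedged sketch of how an operation drives this pipeline, modeled on the `SetTablePropertiesBuilder` and optimize changes elsewhere in this diff; the `use crate::...` paths and the free-standing helper are illustrative assumptions, not part of the patch.

```rust
use crate::kernel::Action;
use crate::logstore::LogStoreRef;
use crate::operations::transaction::{CommitBuilder, CommitProperties};
use crate::protocol::DeltaOperation;
use crate::table::state::DeltaTableState;
use crate::{DeltaResult, DeltaTable};

async fn commit_actions(
    snapshot: DeltaTableState,
    log_store: LogStoreRef,
    actions: Vec<Action>,
    operation: DeltaOperation,
    commit_properties: CommitProperties,
) -> DeltaResult<DeltaTable> {
    // The client-facing CommitProperties seeds the advanced CommitBuilder...
    let commit = CommitBuilder::from(commit_properties)
        .with_actions(actions)
        // ...and awaiting the built future walks PreparedCommit -> PostCommit,
        // retrying with conflict resolution if the atomic rename fails.
        .build(Some(&snapshot), log_store.clone(), operation)
        .await?;
    // The finalized commit carries the post-commit snapshot for the new version.
    Ok(DeltaTable::new_with_state(log_store, commit.snapshot()))
}
```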
use chrono::Utc; use conflict_checker::ConflictChecker; +use futures::future::BoxFuture; use object_store::path::Path; use object_store::{Error as ObjectStoreError, ObjectStore}; use serde_json::Value; +use std::collections::HashMap; use self::conflict_checker::{CommitConflictError, TransactionInfo, WinningCommitSummary}; -use crate::crate_version; -use crate::errors::{DeltaResult, DeltaTableError}; -use crate::kernel::{Action, CommitInfo, ReaderFeatures, WriterFeatures}; -use crate::logstore::LogStore; +use crate::checkpoints::{cleanup_expired_logs_for, create_checkpoint_for}; +use crate::errors::DeltaTableError; +use crate::kernel::{ + Action, CommitInfo, EagerSnapshot, Metadata, Protocol, ReaderFeatures, Transaction, + WriterFeatures, +}; +use crate::logstore::LogStoreRef; use crate::protocol::DeltaOperation; +use crate::table::config::TableConfig; use crate::table::state::DeltaTableState; +use crate::{crate_version, DeltaResult}; pub use self::protocol::INSTANCE as PROTOCOL; +#[cfg(test)] +pub(crate) mod application; mod conflict_checker; mod protocol; #[cfg(feature = "datafusion")] @@ -25,6 +107,7 @@ mod state; pub(crate) mod test_utils; const DELTA_LOG_FOLDER: &str = "_delta_log"; +pub(crate) const DEFAULT_RETRIES: usize = 15; /// Error raised while commititng transaction #[derive(thiserror::Error, Debug)] @@ -71,12 +154,12 @@ pub enum TransactionError { UnsupportedWriterFeatures(Vec), /// Error returned when writer features are required but not specified - #[error("Writer features must be specified for writerversion >= 7")] - WriterFeaturesRequired, + #[error("Writer features must be specified for writerversion >= 7, please specify: {0:?}")] + WriterFeaturesRequired(WriterFeatures), /// Error returned when reader features are required but not specified - #[error("Reader features must be specified for reader version >= 3")] - ReaderFeaturesRequired, + #[error("Reader features must be specified for reader version >= 3, please specify: {0:?}")] + ReaderFeaturesRequired(ReaderFeatures), /// The transaction failed to commit due to an error in an implementation-specific layer. /// Currently used by DynamoDb-backed S3 log store when database operations fail. @@ -104,167 +187,554 @@ impl From for DeltaTableError { } } -// Convert actions to their json representation -fn log_entry_from_actions<'a>( - actions: impl IntoIterator, -) -> Result { - let mut jsons = Vec::::new(); - for action in actions { - let json = serde_json::to_string(action) - .map_err(|e| TransactionError::SerializeLogJson { json_err: e })?; - jsons.push(json); - } - Ok(jsons.join("\n")) +/// Error raised while commititng transaction +#[derive(thiserror::Error, Debug)] +pub enum CommitBuilderError {} + +impl From for DeltaTableError { + fn from(err: CommitBuilderError) -> Self { + DeltaTableError::CommitValidation { source: err } + } +} + +/// Reference to some structure that contains mandatory attributes for performing a commit. 
+pub trait TableReference: Send + Sync { + /// Well known table configuration + fn config(&self) -> TableConfig; + + /// Get the table protocol of the snapshot + fn protocol(&self) -> &Protocol; + + /// Get the table metadata of the snapshot + fn metadata(&self) -> &Metadata; + + /// Try to cast this table reference to a `EagerSnapshot` + fn eager_snapshot(&self) -> &EagerSnapshot; +} + +impl TableReference for EagerSnapshot { + fn protocol(&self) -> &Protocol { + EagerSnapshot::protocol(self) + } + + fn metadata(&self) -> &Metadata { + EagerSnapshot::metadata(self) + } + + fn config(&self) -> TableConfig { + self.table_config() + } + + fn eager_snapshot(&self) -> &EagerSnapshot { + self + } +} + +impl TableReference for DeltaTableState { + fn config(&self) -> TableConfig { + self.snapshot.config() + } + + fn protocol(&self) -> &Protocol { + self.snapshot.protocol() + } + + fn metadata(&self) -> &Metadata { + self.snapshot.metadata() + } + + fn eager_snapshot(&self) -> &EagerSnapshot { + &self.snapshot + } +} + +/// Data that was actually written to the log store. +#[derive(Debug)] +pub struct CommitData { + /// The actions + pub actions: Vec, + /// The Operation + pub operation: DeltaOperation, + /// The Metadata + pub app_metadata: HashMap, + /// Application specific transaction + pub app_transactions: Vec, +} + +impl CommitData { + /// Create new data to be comitted + pub fn new( + mut actions: Vec, + operation: DeltaOperation, + mut app_metadata: HashMap, + app_transactions: Vec, + ) -> Self { + if !actions.iter().any(|a| matches!(a, Action::CommitInfo(..))) { + let mut commit_info = operation.get_commit_info(); + commit_info.timestamp = Some(Utc::now().timestamp_millis()); + app_metadata.insert( + "clientVersion".to_string(), + Value::String(format!("delta-rs.{}", crate_version())), + ); + app_metadata.extend(commit_info.info); + commit_info.info = app_metadata.clone(); + actions.push(Action::CommitInfo(commit_info)) + } + + for txn in &app_transactions { + actions.push(Action::Txn(txn.clone())) + } + + CommitData { + actions, + operation, + app_metadata, + app_transactions, + } + } + + /// Obtain the byte representation of the commit. + pub fn get_bytes(&self) -> Result { + let mut jsons = Vec::::new(); + for action in &self.actions { + let json = serde_json::to_string(action) + .map_err(|e| TransactionError::SerializeLogJson { json_err: e })?; + jsons.push(json); + } + Ok(bytes::Bytes::from(jsons.join("\n"))) + } +} + +#[derive(Clone, Debug, Copy)] +/// Properties for post commit hook. +pub struct PostCommitHookProperties { + create_checkpoint: bool, + /// Override the EnableExpiredLogCleanUp setting, if None config setting is used + cleanup_expired_logs: Option, +} + +#[derive(Clone, Debug)] +/// End user facing interface to be used by operations on the table. +/// Enable controling commit behaviour and modifying metadata that is written during a commit. 
+pub struct CommitProperties { + pub(crate) app_metadata: HashMap, + pub(crate) app_transaction: Vec, + max_retries: usize, + create_checkpoint: bool, + cleanup_expired_logs: Option, +} + +impl Default for CommitProperties { + fn default() -> Self { + Self { + app_metadata: Default::default(), + app_transaction: Vec::new(), + max_retries: DEFAULT_RETRIES, + create_checkpoint: true, + cleanup_expired_logs: None, + } + } +} + +impl CommitProperties { + /// Specify metadata the be comitted + pub fn with_metadata( + mut self, + metadata: impl IntoIterator, + ) -> Self { + self.app_metadata = HashMap::from_iter(metadata); + self + } + + /// Specify if it should create a checkpoint when the commit interval condition is met + pub fn with_create_checkpoint(mut self, create_checkpoint: bool) -> Self { + self.create_checkpoint = create_checkpoint; + self + } + + /// Add an additonal application transaction to the commit + pub fn with_application_transaction(mut self, txn: Transaction) -> Self { + self.app_transaction.push(txn); + self + } + + /// Override application transactions for the commit + pub fn with_application_transactions(mut self, txn: Vec) -> Self { + self.app_transaction = txn; + self + } + + /// Specify if it should clean up the logs when the logRetentionDuration interval is met + pub fn with_cleanup_expired_logs(mut self, cleanup_expired_logs: Option) -> Self { + self.cleanup_expired_logs = cleanup_expired_logs; + self + } +} + +impl From for CommitBuilder { + fn from(value: CommitProperties) -> Self { + CommitBuilder { + max_retries: value.max_retries, + app_metadata: value.app_metadata, + post_commit_hook: Some(PostCommitHookProperties { + create_checkpoint: value.create_checkpoint, + cleanup_expired_logs: value.cleanup_expired_logs, + }), + app_transaction: value.app_transaction, + ..Default::default() + } + } +} + +/// Prepare data to be committed to the Delta log and control how the commit is performed +pub struct CommitBuilder { + actions: Vec, + app_metadata: HashMap, + app_transaction: Vec, + max_retries: usize, + post_commit_hook: Option, +} + +impl Default for CommitBuilder { + fn default() -> Self { + CommitBuilder { + actions: Vec::new(), + app_metadata: HashMap::new(), + app_transaction: Vec::new(), + max_retries: DEFAULT_RETRIES, + post_commit_hook: None, + } + } } -pub(crate) fn get_commit_bytes( - operation: &DeltaOperation, - actions: &Vec, - app_metadata: Option>, -) -> Result { - if !actions.iter().any(|a| matches!(a, Action::CommitInfo(..))) { - let mut extra_info = HashMap::::new(); - let mut commit_info = operation.get_commit_info(); - commit_info.timestamp = Some(Utc::now().timestamp_millis()); - extra_info.insert( - "clientVersion".to_string(), - Value::String(format!("delta-rs.{}", crate_version())), +impl<'a> CommitBuilder { + /// Actions to be included in the commit + pub fn with_actions(mut self, actions: Vec) -> Self { + self.actions = actions; + self + } + + /// Metadata for the operation performed like metrics, user, and notebook + pub fn with_app_metadata(mut self, app_metadata: HashMap) -> Self { + self.app_metadata = app_metadata; + self + } + + /// Maximum number of times to retry the transaction before failing to commit + pub fn with_max_retries(mut self, max_retries: usize) -> Self { + self.max_retries = max_retries; + self + } + + /// Specify all the post commit hook properties + pub fn with_post_commit_hook(mut self, post_commit_hook: PostCommitHookProperties) -> Self { + self.post_commit_hook = Some(post_commit_hook); + self + } + + /// Prepare 
a Commit operation using the configured builder + pub fn build( + self, + table_data: Option<&'a dyn TableReference>, + log_store: LogStoreRef, + operation: DeltaOperation, + ) -> PreCommit<'a> { + let data = CommitData::new( + self.actions, + operation, + self.app_metadata, + self.app_transaction, ); - if let Some(meta) = app_metadata { - extra_info.extend(meta) + PreCommit { + log_store, + table_data, + max_retries: self.max_retries, + data, + post_commit_hook: self.post_commit_hook, } - commit_info.info = extra_info; - Ok(bytes::Bytes::from(log_entry_from_actions( - actions - .iter() - .chain(std::iter::once(&Action::CommitInfo(commit_info))), - )?)) - } else { - Ok(bytes::Bytes::from(log_entry_from_actions(actions)?)) } } -/// Low-level transaction API. Creates a temporary commit file. Once created, -/// the transaction object could be dropped and the actual commit could be executed -/// with `DeltaTable.try_commit_transaction`. -/// TODO: comment is outdated now -pub async fn prepare_commit<'a>( - storage: &dyn ObjectStore, - operation: &DeltaOperation, - actions: &Vec, - app_metadata: Option>, -) -> Result { - // Serialize all actions that are part of this log entry. - let log_entry = get_commit_bytes(operation, actions, app_metadata)?; - - // Write delta log entry as temporary file to storage. For the actual commit, - // the temporary file is moved (atomic rename) to the delta log folder within `commit` function. - let token = uuid::Uuid::new_v4().to_string(); - let file_name = format!("_commit_{token}.json.tmp"); - let path = Path::from_iter([DELTA_LOG_FOLDER, &file_name]); - storage.put(&path, log_entry).await?; - - Ok(path) +/// Represents a commit that has not yet started but all details are finalized +pub struct PreCommit<'a> { + log_store: LogStoreRef, + table_data: Option<&'a dyn TableReference>, + data: CommitData, + max_retries: usize, + post_commit_hook: Option, } -/// Commit a transaction, with up to 15 retries. This is higher-level transaction API. -/// -/// Will error early if the a concurrent transaction has already been committed -/// and conflicts with this transaction. -pub async fn commit( - log_store: &dyn LogStore, - actions: &Vec, - operation: DeltaOperation, - read_snapshot: Option<&DeltaTableState>, - app_metadata: Option>, -) -> DeltaResult { - commit_with_retries( - log_store, - actions, - operation, - read_snapshot, - app_metadata, - 15, - ) - .await +impl<'a> std::future::IntoFuture for PreCommit<'a> { + type Output = DeltaResult; + type IntoFuture = BoxFuture<'a, Self::Output>; + + fn into_future(self) -> Self::IntoFuture { + Box::pin(async move { self.into_prepared_commit_future().await?.await?.await }) + } } -/// Commit a transaction, with up configurable number of retries. This is higher-level transaction API. -/// -/// The function will error early if the a concurrent transaction has already been committed -/// and conflicts with this transaction. -pub async fn commit_with_retries( - log_store: &dyn LogStore, - actions: &Vec, - operation: DeltaOperation, - read_snapshot: Option<&DeltaTableState>, - app_metadata: Option>, +impl<'a> PreCommit<'a> { + /// Prepare the commit but do not finalize it + pub fn into_prepared_commit_future(self) -> BoxFuture<'a, DeltaResult>> { + let this = self; + + Box::pin(async move { + if let Some(table_reference) = this.table_data { + PROTOCOL.can_commit(table_reference, &this.data.actions, &this.data.operation)?; + } + + // Write delta log entry as temporary file to storage. 
For the actual commit, + // the temporary file is moved (atomic rename) to the delta log folder within `commit` function. + let log_entry = this.data.get_bytes()?; + let token = uuid::Uuid::new_v4().to_string(); + let path = Path::from_iter([DELTA_LOG_FOLDER, &format!("_commit_{token}.json.tmp")]); + this.log_store + .object_store() + .put(&path, log_entry.into()) + .await?; + + Ok(PreparedCommit { + path, + log_store: this.log_store, + table_data: this.table_data, + max_retries: this.max_retries, + data: this.data, + post_commit: this.post_commit_hook, + }) + }) + } +} + +/// Represents a inflight commit with a temporary commit marker on the log store +pub struct PreparedCommit<'a> { + path: Path, + log_store: LogStoreRef, + data: CommitData, + table_data: Option<&'a dyn TableReference>, max_retries: usize, -) -> DeltaResult { - if let Some(read_snapshot) = read_snapshot { - PROTOCOL.can_commit(read_snapshot, actions)?; - } - - let tmp_commit = prepare_commit( - log_store.object_store().as_ref(), - &operation, - actions, - app_metadata, - ) - .await?; - - if read_snapshot.is_none() { - log_store.write_commit_entry(0, &tmp_commit).await?; - return Ok(0); - } - - let read_snapshot = read_snapshot.unwrap(); - - let mut attempt_number = 1; - while attempt_number <= max_retries { - let version = read_snapshot.version() + attempt_number as i64; - match log_store.write_commit_entry(version, &tmp_commit).await { - Ok(()) => return Ok(version), - Err(TransactionError::VersionAlreadyExists(version)) => { - let summary = - WinningCommitSummary::try_new(log_store, version - 1, version).await?; - let transaction_info = TransactionInfo::try_new( - read_snapshot, - operation.read_predicate(), - actions, - // TODO allow tainting whole table - false, - )?; - let conflict_checker = - ConflictChecker::new(transaction_info, summary, Some(&operation)); - match conflict_checker.check_conflicts() { - Ok(_) => { - attempt_number += 1; + post_commit: Option, +} + +impl<'a> PreparedCommit<'a> { + /// The temporary commit file created + pub fn path(&self) -> &Path { + &self.path + } +} + +impl<'a> std::future::IntoFuture for PreparedCommit<'a> { + type Output = DeltaResult>; + type IntoFuture = BoxFuture<'a, Self::Output>; + + fn into_future(self) -> Self::IntoFuture { + let this = self; + + Box::pin(async move { + let tmp_commit = &this.path; + + if this.table_data.is_none() { + this.log_store.write_commit_entry(0, tmp_commit).await?; + return Ok(PostCommit { + version: 0, + data: this.data, + create_checkpoint: false, + cleanup_expired_logs: None, + log_store: this.log_store, + table_data: this.table_data, + }); + } + + // unwrap() is safe here due to the above check + // TODO: refactor to only depend on TableReference Trait + let read_snapshot = this.table_data.unwrap().eager_snapshot(); + + let mut attempt_number = 1; + while attempt_number <= this.max_retries { + let version = read_snapshot.version() + attempt_number as i64; + match this.log_store.write_commit_entry(version, tmp_commit).await { + Ok(()) => { + return Ok(PostCommit { + version, + data: this.data, + create_checkpoint: this + .post_commit + .map(|v| v.create_checkpoint) + .unwrap_or_default(), + cleanup_expired_logs: this + .post_commit + .map(|v| v.cleanup_expired_logs) + .unwrap_or_default(), + log_store: this.log_store, + table_data: this.table_data, + }); + } + Err(TransactionError::VersionAlreadyExists(version)) => { + let summary = WinningCommitSummary::try_new( + this.log_store.as_ref(), + version - 1, + version, + ) + .await?; + let 
transaction_info = TransactionInfo::try_new( + read_snapshot, + this.data.operation.read_predicate(), + &this.data.actions, + this.data.operation.read_whole_table(), + )?; + let conflict_checker = ConflictChecker::new( + transaction_info, + summary, + Some(&this.data.operation), + ); + match conflict_checker.check_conflicts() { + Ok(_) => { + attempt_number += 1; + } + Err(err) => { + this.log_store + .abort_commit_entry(version, tmp_commit) + .await?; + return Err(TransactionError::CommitConflict(err).into()); + } + }; } Err(err) => { - log_store.object_store().delete(&tmp_commit).await?; - return Err(TransactionError::CommitConflict(err).into()); + this.log_store + .abort_commit_entry(version, tmp_commit) + .await?; + return Err(err.into()); } - }; + } + } + + Err(TransactionError::MaxCommitAttempts(this.max_retries as i32).into()) + }) + } +} + +/// Represents items for the post commit hook +pub struct PostCommit<'a> { + /// The winning version number of the commit + pub version: i64, + /// The data that was comitted to the log store + pub data: CommitData, + create_checkpoint: bool, + cleanup_expired_logs: Option, + log_store: LogStoreRef, + table_data: Option<&'a dyn TableReference>, +} + +impl<'a> PostCommit<'a> { + /// Runs the post commit activities + async fn run_post_commit_hook(&self) -> DeltaResult { + if let Some(table) = self.table_data { + let mut snapshot = table.eager_snapshot().clone(); + if self.version - snapshot.version() > 1 { + // This may only occur during concurrent write actions. We need to update the state first to - 1 + // then we can advance. + snapshot + .update(self.log_store.clone(), Some(self.version - 1)) + .await?; + snapshot.advance(vec![&self.data])?; + } else { + snapshot.advance(vec![&self.data])?; + } + let state = DeltaTableState { snapshot }; + // Execute each hook + if self.create_checkpoint { + self.create_checkpoint(&state, &self.log_store, self.version) + .await?; } - Err(err) => { - log_store.object_store().delete(&tmp_commit).await?; - return Err(err.into()); + let cleanup_logs = if let Some(cleanup_logs) = self.cleanup_expired_logs { + cleanup_logs + } else { + state.table_config().enable_expired_log_cleanup() + }; + + if cleanup_logs { + cleanup_expired_logs_for( + self.version, + self.log_store.as_ref(), + Utc::now().timestamp_millis() + - state.table_config().log_retention_duration().as_millis() as i64, + ) + .await?; } + Ok(state) + } else { + let state = DeltaTableState::try_new( + &Path::default(), + self.log_store.object_store(), + Default::default(), + Some(self.version), + ) + .await?; + Ok(state) + } + } + async fn create_checkpoint( + &self, + table_state: &DeltaTableState, + log_store: &LogStoreRef, + version: i64, + ) -> DeltaResult<()> { + let checkpoint_interval = table_state.config().checkpoint_interval() as i64; + if ((version + 1) % checkpoint_interval) == 0 { + create_checkpoint_for(version, table_state, log_store.as_ref()).await? 
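For orientation, a minimal sketch of how an operation can drive the builder-based commit flow above, which replaces the removed `commit()` and `prepare_commit()` free functions; the `log_store`, `snapshot`, and `actions` bindings are assumed to come from the calling operation, and the `DeltaOperation::Update` variant is only an illustrative choice:

    // Configure optional post-commit behaviour up front.
    let properties = CommitProperties::default()
        .with_create_checkpoint(true)           // run the checkpoint hook when the interval is met
        .with_cleanup_expired_logs(Some(true)); // force expired-log cleanup regardless of table config

    // Awaiting the PreCommit walks the whole chain:
    // PreCommit -> PreparedCommit -> PostCommit -> FinalizedCommit.
    let finalized = CommitBuilder::from(properties)
        .with_actions(actions) // Vec<Action> produced by the operation
        .build(Some(&snapshot), log_store.clone(), DeltaOperation::Update { predicate: None })
        .await?;

    // The finalized commit exposes the winning version and the refreshed table state.
    let _version = finalized.version();
    let _state = finalized.snapshot();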
} + Ok(()) } +} + +/// A commit that successfully completed +pub struct FinalizedCommit { + /// The new table state after a commmit + pub snapshot: DeltaTableState, + + /// Version of the finalized commit + pub version: i64, +} - Err(TransactionError::MaxCommitAttempts(max_retries as i32).into()) +impl FinalizedCommit { + /// The new table state after a commmit + pub fn snapshot(&self) -> DeltaTableState { + self.snapshot.clone() + } + /// Version of the finalized commit + pub fn version(&self) -> i64 { + self.version + } +} + +impl<'a> std::future::IntoFuture for PostCommit<'a> { + type Output = DeltaResult; + type IntoFuture = BoxFuture<'a, Self::Output>; + + fn into_future(self) -> Self::IntoFuture { + let this = self; + + Box::pin(async move { + match this.run_post_commit_hook().await { + Ok(snapshot) => Ok(FinalizedCommit { + snapshot, + version: this.version, + }), + Err(err) => Err(err), + } + }) + } } #[cfg(test)] mod tests { use std::{collections::HashMap, sync::Arc}; - use self::test_utils::init_table_actions; use super::*; - use crate::{logstore::default_logstore::DefaultLogStore, storage::commit_uri_from_version}; - use object_store::memory::InMemory; + use crate::{ + logstore::{default_logstore::DefaultLogStore, LogStore}, + storage::commit_uri_from_version, + }; + use object_store::{memory::InMemory, PutPayload}; use url::Url; #[test] @@ -275,15 +745,6 @@ mod tests { assert_eq!(version, Path::from("_delta_log/00000000000000000123.json")) } - #[test] - fn test_log_entry_from_actions() { - let actions = init_table_actions(None); - let entry = log_entry_from_actions(&actions).unwrap(); - let lines: Vec<_> = entry.lines().collect(); - // writes every action to a line - assert_eq!(actions.len(), lines.len()) - } - #[tokio::test] async fn test_try_commit_transaction() { let store = Arc::new(InMemory::new()); @@ -297,8 +758,8 @@ mod tests { ); let tmp_path = Path::from("_delta_log/tmp"); let version_path = Path::from("_delta_log/00000000000000000000.json"); - store.put(&tmp_path, bytes::Bytes::new()).await.unwrap(); - store.put(&version_path, bytes::Bytes::new()).await.unwrap(); + store.put(&tmp_path, PutPayload::new()).await.unwrap(); + store.put(&version_path, PutPayload::new()).await.unwrap(); let res = log_store.write_commit_entry(0, &tmp_path).await; // fails if file version already exists diff --git a/crates/core/src/operations/transaction/protocol.rs b/crates/core/src/operations/transaction/protocol.rs index 07a7b75405..f3bb87098a 100644 --- a/crates/core/src/operations/transaction/protocol.rs +++ b/crates/core/src/operations/transaction/protocol.rs @@ -2,9 +2,13 @@ use std::collections::HashSet; use lazy_static::lazy_static; use once_cell::sync::Lazy; +use tracing::log::*; -use super::TransactionError; -use crate::kernel::{Action, ReaderFeatures, WriterFeatures}; +use super::{TableReference, TransactionError}; +use crate::kernel::{ + Action, DataType, EagerSnapshot, ReaderFeatures, Schema, StructField, WriterFeatures, +}; +use crate::protocol::DeltaOperation; use crate::table::state::DeltaTableState; lazy_static! 
{ @@ -69,15 +73,60 @@ impl ProtocolChecker { } /// Check append-only at the high level (operation level) - pub fn check_append_only(&self, snapshot: &DeltaTableState) -> Result<(), TransactionError> { + pub fn check_append_only(&self, snapshot: &EagerSnapshot) -> Result<(), TransactionError> { if snapshot.table_config().append_only() { return Err(TransactionError::DeltaTableAppendOnly); } Ok(()) } + /// checks if table contains timestamp_ntz in any field including nested fields. + pub fn contains_timestampntz<'a>( + &self, + mut fields: impl Iterator, + ) -> bool { + fn _check_type(dtype: &DataType) -> bool { + match dtype { + &DataType::TIMESTAMP_NTZ => true, + DataType::Array(inner) => _check_type(inner.element_type()), + DataType::Struct(inner) => inner.fields().any(|f| _check_type(f.data_type())), + _ => false, + } + } + fields.any(|f| _check_type(f.data_type())) + } + + /// Check can write_timestamp_ntz + pub fn check_can_write_timestamp_ntz( + &self, + snapshot: &DeltaTableState, + schema: &Schema, + ) -> Result<(), TransactionError> { + let contains_timestampntz = self.contains_timestampntz(schema.fields()); + let required_features: Option<&HashSet> = + match snapshot.protocol().min_writer_version { + 0..=6 => None, + _ => snapshot.protocol().writer_features.as_ref(), + }; + + if let Some(table_features) = required_features { + if !table_features.contains(&WriterFeatures::TimestampWithoutTimezone) + && contains_timestampntz + { + return Err(TransactionError::WriterFeaturesRequired( + WriterFeatures::TimestampWithoutTimezone, + )); + } + } else if contains_timestampntz { + return Err(TransactionError::WriterFeaturesRequired( + WriterFeatures::TimestampWithoutTimezone, + )); + } + Ok(()) + } + /// Check if delta-rs can read form the given delta table. - pub fn can_read_from(&self, snapshot: &DeltaTableState) -> Result<(), TransactionError> { + pub fn can_read_from(&self, snapshot: &dyn TableReference) -> Result<(), TransactionError> { let required_features: Option<&HashSet> = match snapshot.protocol().min_reader_version { 0 | 1 => None, @@ -96,20 +145,36 @@ impl ProtocolChecker { } /// Check if delta-rs can write to the given delta table. 
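A quick illustration of the nested `timestamp_ntz` detection above; the schema constructors are assumed to follow the kernel's `StructType::new` / `StructField::new` shapes, and `checker` stands for any `ProtocolChecker`:

    // Hypothetical schema: `event` is a struct column containing a timestamp without timezone.
    let schema = StructType::new(vec![
        StructField::new("id", DataType::LONG, false),
        StructField::new(
            "event",
            DataType::Struct(Box::new(StructType::new(vec![StructField::new(
                "ts_local",
                DataType::TIMESTAMP_NTZ,
                true,
            )]))),
            true,
        ),
    ]);
    // contains_timestampntz recurses through Struct and Array types, so the nested field is found.
    assert!(checker.contains_timestampntz(schema.fields()));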
- pub fn can_write_to(&self, snapshot: &DeltaTableState) -> Result<(), TransactionError> { + pub fn can_write_to(&self, snapshot: &dyn TableReference) -> Result<(), TransactionError> { // NOTE: writers must always support all required reader features self.can_read_from(snapshot)?; + let min_writer_version = snapshot.protocol().min_writer_version; + + let required_features: Option<&HashSet> = match min_writer_version { + 0 | 1 => None, + 2 => Some(&WRITER_V2), + 3 => Some(&WRITER_V3), + 4 => Some(&WRITER_V4), + 5 => Some(&WRITER_V5), + 6 => Some(&WRITER_V6), + _ => snapshot.protocol().writer_features.as_ref(), + }; - let required_features: Option<&HashSet> = - match snapshot.protocol().min_writer_version { - 0 | 1 => None, - 2 => Some(&WRITER_V2), - 3 => Some(&WRITER_V3), - 4 => Some(&WRITER_V4), - 5 => Some(&WRITER_V5), - 6 => Some(&WRITER_V6), - _ => snapshot.protocol().writer_features.as_ref(), - }; + if (4..7).contains(&min_writer_version) { + debug!("min_writer_version is less 4-6, checking for unsupported table features"); + if let Ok(schema) = snapshot.metadata().schema() { + for field in schema.fields() { + if field.metadata.contains_key( + crate::kernel::ColumnMetadataKey::GenerationExpression.as_ref(), + ) { + error!("The table contains `delta.generationExpression` settings on columns which mean this table cannot be currently written to by delta-rs"); + return Err(TransactionError::UnsupportedWriterFeatures(vec![ + WriterFeatures::GeneratedColumns, + ])); + } + } + } + } if let Some(features) = required_features { let mut diff = features.difference(&self.writer_features).peekable(); @@ -124,8 +189,9 @@ impl ProtocolChecker { pub fn can_commit( &self, - snapshot: &DeltaTableState, + snapshot: &dyn TableReference, actions: &[Action], + operation: &DeltaOperation, ) -> Result<(), TransactionError> { self.can_write_to(snapshot)?; @@ -133,23 +199,30 @@ impl ProtocolChecker { let append_only_enabled = if snapshot.protocol().min_writer_version < 2 { false } else if snapshot.protocol().min_writer_version < 7 { - snapshot.table_config().append_only() + snapshot.config().append_only() } else { snapshot .protocol() .writer_features .as_ref() - .ok_or(TransactionError::WriterFeaturesRequired)? + .ok_or(TransactionError::WriterFeaturesRequired( + WriterFeatures::AppendOnly, + ))? .contains(&WriterFeatures::AppendOnly) - && snapshot.table_config().append_only() + && snapshot.config().append_only() }; if append_only_enabled { - actions.iter().try_for_each(|action| match action { - Action::Remove(remove) if remove.data_change => { - Err(TransactionError::DeltaTableAppendOnly) + match operation { + DeltaOperation::Restore { .. } | DeltaOperation::FileSystemCheck { .. } => {} + _ => { + actions.iter().try_for_each(|action| match action { + Action::Remove(remove) if remove.data_change => { + Err(TransactionError::DeltaTableAppendOnly) + } + _ => Ok(()), + })?; } - _ => Ok(()), - })?; + } } Ok(()) @@ -164,11 +237,18 @@ impl ProtocolChecker { /// As we implement new features, we need to update this instance accordingly. /// resulting version support is determined by the supported table feature set. 
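To make the new exemption in `can_commit` concrete, a hedged sketch of its effect on an append-only table; `checker` and `snapshot` are assumed fixtures like those used in the tests below:

    // A data-changing remove, as produced by operations that rewrite files.
    let remove = Action::Remove(Remove {
        path: "part-00001.parquet".to_string(),
        data_change: true,
        ..Default::default()
    });
    // Still rejected: an ordinary update may not remove data from an append-only table.
    assert!(checker
        .can_commit(snapshot, &[remove.clone()], &DeltaOperation::Update { predicate: None })
        .is_err());
    // Now accepted: FileSystemCheck (and Restore) are exempt even though they carry removes.
    assert!(checker
        .can_commit(snapshot, &[remove], &DeltaOperation::FileSystemCheck {})
        .is_ok());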
pub static INSTANCE: Lazy = Lazy::new(|| { - let reader_features = HashSet::new(); + let mut reader_features = HashSet::new(); + reader_features.insert(ReaderFeatures::TimestampWithoutTimezone); // reader_features.insert(ReaderFeatures::ColumnMapping); let mut writer_features = HashSet::new(); writer_features.insert(WriterFeatures::AppendOnly); + writer_features.insert(WriterFeatures::TimestampWithoutTimezone); + #[cfg(feature = "cdf")] + { + writer_features.insert(WriterFeatures::ChangeDataFeed); + writer_features.insert(WriterFeatures::GeneratedColumns); + } #[cfg(feature = "datafusion")] { writer_features.insert(WriterFeatures::Invariants); @@ -186,7 +266,10 @@ pub static INSTANCE: Lazy = Lazy::new(|| { mod tests { use super::super::test_utils::create_metadata_action; use super::*; - use crate::kernel::{Action, Add, Protocol, Remove}; + use crate::kernel::DataType as DeltaDataType; + use crate::kernel::{Action, Add, PrimitiveType, Protocol, Remove}; + use crate::protocol::SaveMode; + use crate::table::state::DeltaTableState; use crate::DeltaConfigKey; use std::collections::HashMap; @@ -197,6 +280,12 @@ mod tests { data_change: true, ..Default::default() })]; + let append_op = DeltaOperation::Write { + mode: SaveMode::Append, + partition_by: None, + predicate: None, + }; + let change_actions = vec![ Action::Add(Add { path: "test".to_string(), @@ -209,6 +298,8 @@ mod tests { ..Default::default() }), ]; + let change_op = DeltaOperation::Update { predicate: None }; + let neutral_actions = vec![ Action::Add(Add { path: "test".to_string(), @@ -221,6 +312,7 @@ mod tests { ..Default::default() }), ]; + let neutral_op = DeltaOperation::Update { predicate: None }; let create_actions = |writer: i32, append: &str, feat: Vec| { vec![ @@ -244,39 +336,81 @@ mod tests { let actions = create_actions(1, "true", vec![]); let snapshot = DeltaTableState::from_actions(actions).unwrap(); - assert!(checker.can_commit(&snapshot, &append_actions).is_ok()); - assert!(checker.can_commit(&snapshot, &change_actions).is_ok()); - assert!(checker.can_commit(&snapshot, &neutral_actions).is_ok()); + let eager = snapshot.snapshot(); + assert!(checker + .can_commit(eager, &append_actions, &append_op) + .is_ok()); + assert!(checker + .can_commit(eager, &change_actions, &change_op) + .is_ok()); + assert!(checker + .can_commit(eager, &neutral_actions, &neutral_op) + .is_ok()); let actions = create_actions(2, "true", vec![]); let snapshot = DeltaTableState::from_actions(actions).unwrap(); - assert!(checker.can_commit(&snapshot, &append_actions).is_ok()); - assert!(checker.can_commit(&snapshot, &change_actions).is_err()); - assert!(checker.can_commit(&snapshot, &neutral_actions).is_ok()); + let eager = snapshot.snapshot(); + assert!(checker + .can_commit(eager, &append_actions, &append_op) + .is_ok()); + assert!(checker + .can_commit(eager, &change_actions, &change_op) + .is_err()); + assert!(checker + .can_commit(eager, &neutral_actions, &neutral_op) + .is_ok()); let actions = create_actions(2, "false", vec![]); let snapshot = DeltaTableState::from_actions(actions).unwrap(); - assert!(checker.can_commit(&snapshot, &append_actions).is_ok()); - assert!(checker.can_commit(&snapshot, &change_actions).is_ok()); - assert!(checker.can_commit(&snapshot, &neutral_actions).is_ok()); + let eager = snapshot.snapshot(); + assert!(checker + .can_commit(eager, &append_actions, &append_op) + .is_ok()); + assert!(checker + .can_commit(eager, &change_actions, &change_op) + .is_ok()); + assert!(checker + .can_commit(eager, &neutral_actions, 
&neutral_op) + .is_ok()); let actions = create_actions(7, "true", vec![WriterFeatures::AppendOnly]); let snapshot = DeltaTableState::from_actions(actions).unwrap(); - assert!(checker.can_commit(&snapshot, &append_actions).is_ok()); - assert!(checker.can_commit(&snapshot, &change_actions).is_err()); - assert!(checker.can_commit(&snapshot, &neutral_actions).is_ok()); + let eager = snapshot.snapshot(); + assert!(checker + .can_commit(eager, &append_actions, &append_op) + .is_ok()); + assert!(checker + .can_commit(eager, &change_actions, &change_op) + .is_err()); + assert!(checker + .can_commit(eager, &neutral_actions, &neutral_op) + .is_ok()); let actions = create_actions(7, "false", vec![WriterFeatures::AppendOnly]); let snapshot = DeltaTableState::from_actions(actions).unwrap(); - assert!(checker.can_commit(&snapshot, &append_actions).is_ok()); - assert!(checker.can_commit(&snapshot, &change_actions).is_ok()); - assert!(checker.can_commit(&snapshot, &neutral_actions).is_ok()); + let eager = snapshot.snapshot(); + assert!(checker + .can_commit(eager, &append_actions, &append_op) + .is_ok()); + assert!(checker + .can_commit(eager, &change_actions, &change_op) + .is_ok()); + assert!(checker + .can_commit(eager, &neutral_actions, &neutral_op) + .is_ok()); let actions = create_actions(7, "true", vec![]); let snapshot = DeltaTableState::from_actions(actions).unwrap(); - assert!(checker.can_commit(&snapshot, &append_actions).is_ok()); - assert!(checker.can_commit(&snapshot, &change_actions).is_ok()); - assert!(checker.can_commit(&snapshot, &neutral_actions).is_ok()); + let eager = snapshot.snapshot(); + assert!(checker + .can_commit(eager, &append_actions, &append_op) + .is_ok()); + assert!(checker + .can_commit(eager, &change_actions, &change_op) + .is_ok()); + assert!(checker + .can_commit(eager, &neutral_actions, &neutral_op) + .is_ok()); } #[test] @@ -291,8 +425,9 @@ mod tests { create_metadata_action(None, Some(HashMap::new())), ]; let snapshot_1 = DeltaTableState::from_actions(actions).unwrap(); - assert!(checker_1.can_read_from(&snapshot_1).is_ok()); - assert!(checker_1.can_write_to(&snapshot_1).is_ok()); + let eager_1 = snapshot_1.snapshot(); + assert!(checker_1.can_read_from(eager_1).is_ok()); + assert!(checker_1.can_write_to(eager_1).is_ok()); let checker_2 = ProtocolChecker::new(READER_V2.clone(), HashSet::new()); let actions = vec![ @@ -304,11 +439,12 @@ mod tests { create_metadata_action(None, Some(HashMap::new())), ]; let snapshot_2 = DeltaTableState::from_actions(actions).unwrap(); - assert!(checker_1.can_read_from(&snapshot_2).is_err()); - assert!(checker_1.can_write_to(&snapshot_2).is_err()); - assert!(checker_2.can_read_from(&snapshot_1).is_ok()); - assert!(checker_2.can_read_from(&snapshot_2).is_ok()); - assert!(checker_2.can_write_to(&snapshot_2).is_ok()); + let eager_2 = snapshot_2.snapshot(); + assert!(checker_1.can_read_from(eager_2).is_err()); + assert!(checker_1.can_write_to(eager_2).is_err()); + assert!(checker_2.can_read_from(eager_1).is_ok()); + assert!(checker_2.can_read_from(eager_2).is_ok()); + assert!(checker_2.can_write_to(eager_2).is_ok()); let checker_3 = ProtocolChecker::new(READER_V2.clone(), WRITER_V2.clone()); let actions = vec![ @@ -320,14 +456,15 @@ mod tests { create_metadata_action(None, Some(HashMap::new())), ]; let snapshot_3 = DeltaTableState::from_actions(actions).unwrap(); - assert!(checker_1.can_read_from(&snapshot_3).is_err()); - assert!(checker_1.can_write_to(&snapshot_3).is_err()); - assert!(checker_2.can_read_from(&snapshot_3).is_ok()); - 
assert!(checker_2.can_write_to(&snapshot_3).is_err()); - assert!(checker_3.can_read_from(&snapshot_1).is_ok()); - assert!(checker_3.can_read_from(&snapshot_2).is_ok()); - assert!(checker_3.can_read_from(&snapshot_3).is_ok()); - assert!(checker_3.can_write_to(&snapshot_3).is_ok()); + let eager_3 = snapshot_3.snapshot(); + assert!(checker_1.can_read_from(eager_3).is_err()); + assert!(checker_1.can_write_to(eager_3).is_err()); + assert!(checker_2.can_read_from(eager_3).is_ok()); + assert!(checker_2.can_write_to(eager_3).is_err()); + assert!(checker_3.can_read_from(eager_1).is_ok()); + assert!(checker_3.can_read_from(eager_2).is_ok()); + assert!(checker_3.can_read_from(eager_3).is_ok()); + assert!(checker_3.can_write_to(eager_3).is_ok()); let checker_4 = ProtocolChecker::new(READER_V2.clone(), WRITER_V3.clone()); let actions = vec![ @@ -339,17 +476,18 @@ mod tests { create_metadata_action(None, Some(HashMap::new())), ]; let snapshot_4 = DeltaTableState::from_actions(actions).unwrap(); - assert!(checker_1.can_read_from(&snapshot_4).is_err()); - assert!(checker_1.can_write_to(&snapshot_4).is_err()); - assert!(checker_2.can_read_from(&snapshot_4).is_ok()); - assert!(checker_2.can_write_to(&snapshot_4).is_err()); - assert!(checker_3.can_read_from(&snapshot_4).is_ok()); - assert!(checker_3.can_write_to(&snapshot_4).is_err()); - assert!(checker_4.can_read_from(&snapshot_1).is_ok()); - assert!(checker_4.can_read_from(&snapshot_2).is_ok()); - assert!(checker_4.can_read_from(&snapshot_3).is_ok()); - assert!(checker_4.can_read_from(&snapshot_4).is_ok()); - assert!(checker_4.can_write_to(&snapshot_4).is_ok()); + let eager_4 = snapshot_4.snapshot(); + assert!(checker_1.can_read_from(eager_4).is_err()); + assert!(checker_1.can_write_to(eager_4).is_err()); + assert!(checker_2.can_read_from(eager_4).is_ok()); + assert!(checker_2.can_write_to(eager_4).is_err()); + assert!(checker_3.can_read_from(eager_4).is_ok()); + assert!(checker_3.can_write_to(eager_4).is_err()); + assert!(checker_4.can_read_from(eager_1).is_ok()); + assert!(checker_4.can_read_from(eager_2).is_ok()); + assert!(checker_4.can_read_from(eager_3).is_ok()); + assert!(checker_4.can_read_from(eager_4).is_ok()); + assert!(checker_4.can_write_to(eager_4).is_ok()); let checker_5 = ProtocolChecker::new(READER_V2.clone(), WRITER_V4.clone()); let actions = vec![ @@ -361,20 +499,21 @@ mod tests { create_metadata_action(None, Some(HashMap::new())), ]; let snapshot_5 = DeltaTableState::from_actions(actions).unwrap(); - assert!(checker_1.can_read_from(&snapshot_5).is_err()); - assert!(checker_1.can_write_to(&snapshot_5).is_err()); - assert!(checker_2.can_read_from(&snapshot_5).is_ok()); - assert!(checker_2.can_write_to(&snapshot_5).is_err()); - assert!(checker_3.can_read_from(&snapshot_5).is_ok()); - assert!(checker_3.can_write_to(&snapshot_5).is_err()); - assert!(checker_4.can_read_from(&snapshot_5).is_ok()); - assert!(checker_4.can_write_to(&snapshot_5).is_err()); - assert!(checker_5.can_read_from(&snapshot_1).is_ok()); - assert!(checker_5.can_read_from(&snapshot_2).is_ok()); - assert!(checker_5.can_read_from(&snapshot_3).is_ok()); - assert!(checker_5.can_read_from(&snapshot_4).is_ok()); - assert!(checker_5.can_read_from(&snapshot_5).is_ok()); - assert!(checker_5.can_write_to(&snapshot_5).is_ok()); + let eager_5 = snapshot_5.snapshot(); + assert!(checker_1.can_read_from(eager_5).is_err()); + assert!(checker_1.can_write_to(eager_5).is_err()); + assert!(checker_2.can_read_from(eager_5).is_ok()); + assert!(checker_2.can_write_to(eager_5).is_err()); + 
assert!(checker_3.can_read_from(eager_5).is_ok()); + assert!(checker_3.can_write_to(eager_5).is_err()); + assert!(checker_4.can_read_from(eager_5).is_ok()); + assert!(checker_4.can_write_to(eager_5).is_err()); + assert!(checker_5.can_read_from(eager_1).is_ok()); + assert!(checker_5.can_read_from(eager_2).is_ok()); + assert!(checker_5.can_read_from(eager_3).is_ok()); + assert!(checker_5.can_read_from(eager_4).is_ok()); + assert!(checker_5.can_read_from(eager_5).is_ok()); + assert!(checker_5.can_write_to(eager_5).is_ok()); let checker_6 = ProtocolChecker::new(READER_V2.clone(), WRITER_V5.clone()); let actions = vec![ @@ -386,23 +525,24 @@ mod tests { create_metadata_action(None, Some(HashMap::new())), ]; let snapshot_6 = DeltaTableState::from_actions(actions).unwrap(); - assert!(checker_1.can_read_from(&snapshot_6).is_err()); - assert!(checker_1.can_write_to(&snapshot_6).is_err()); - assert!(checker_2.can_read_from(&snapshot_6).is_ok()); - assert!(checker_2.can_write_to(&snapshot_6).is_err()); - assert!(checker_3.can_read_from(&snapshot_6).is_ok()); - assert!(checker_3.can_write_to(&snapshot_6).is_err()); - assert!(checker_4.can_read_from(&snapshot_6).is_ok()); - assert!(checker_4.can_write_to(&snapshot_6).is_err()); - assert!(checker_5.can_read_from(&snapshot_6).is_ok()); - assert!(checker_5.can_write_to(&snapshot_6).is_err()); - assert!(checker_6.can_read_from(&snapshot_1).is_ok()); - assert!(checker_6.can_read_from(&snapshot_2).is_ok()); - assert!(checker_6.can_read_from(&snapshot_3).is_ok()); - assert!(checker_6.can_read_from(&snapshot_4).is_ok()); - assert!(checker_6.can_read_from(&snapshot_5).is_ok()); - assert!(checker_6.can_read_from(&snapshot_6).is_ok()); - assert!(checker_6.can_write_to(&snapshot_6).is_ok()); + let eager_6 = snapshot_6.snapshot(); + assert!(checker_1.can_read_from(eager_6).is_err()); + assert!(checker_1.can_write_to(eager_6).is_err()); + assert!(checker_2.can_read_from(eager_6).is_ok()); + assert!(checker_2.can_write_to(eager_6).is_err()); + assert!(checker_3.can_read_from(eager_6).is_ok()); + assert!(checker_3.can_write_to(eager_6).is_err()); + assert!(checker_4.can_read_from(eager_6).is_ok()); + assert!(checker_4.can_write_to(eager_6).is_err()); + assert!(checker_5.can_read_from(eager_6).is_ok()); + assert!(checker_5.can_write_to(eager_6).is_err()); + assert!(checker_6.can_read_from(eager_1).is_ok()); + assert!(checker_6.can_read_from(eager_2).is_ok()); + assert!(checker_6.can_read_from(eager_3).is_ok()); + assert!(checker_6.can_read_from(eager_4).is_ok()); + assert!(checker_6.can_read_from(eager_5).is_ok()); + assert!(checker_6.can_read_from(eager_6).is_ok()); + assert!(checker_6.can_write_to(eager_6).is_ok()); let checker_7 = ProtocolChecker::new(READER_V2.clone(), WRITER_V6.clone()); let actions = vec![ @@ -414,25 +554,85 @@ mod tests { create_metadata_action(None, Some(HashMap::new())), ]; let snapshot_7 = DeltaTableState::from_actions(actions).unwrap(); - assert!(checker_1.can_read_from(&snapshot_7).is_err()); - assert!(checker_1.can_write_to(&snapshot_7).is_err()); - assert!(checker_2.can_read_from(&snapshot_7).is_ok()); - assert!(checker_2.can_write_to(&snapshot_7).is_err()); - assert!(checker_3.can_read_from(&snapshot_7).is_ok()); - assert!(checker_3.can_write_to(&snapshot_7).is_err()); - assert!(checker_4.can_read_from(&snapshot_7).is_ok()); - assert!(checker_4.can_write_to(&snapshot_7).is_err()); - assert!(checker_5.can_read_from(&snapshot_7).is_ok()); - assert!(checker_5.can_write_to(&snapshot_7).is_err()); - 
assert!(checker_6.can_read_from(&snapshot_7).is_ok()); - assert!(checker_6.can_write_to(&snapshot_7).is_err()); - assert!(checker_7.can_read_from(&snapshot_1).is_ok()); - assert!(checker_7.can_read_from(&snapshot_2).is_ok()); - assert!(checker_7.can_read_from(&snapshot_3).is_ok()); - assert!(checker_7.can_read_from(&snapshot_4).is_ok()); - assert!(checker_7.can_read_from(&snapshot_5).is_ok()); - assert!(checker_7.can_read_from(&snapshot_6).is_ok()); - assert!(checker_7.can_read_from(&snapshot_7).is_ok()); - assert!(checker_7.can_write_to(&snapshot_7).is_ok()); + let eager_7 = snapshot_7.snapshot(); + assert!(checker_1.can_read_from(eager_7).is_err()); + assert!(checker_1.can_write_to(eager_7).is_err()); + assert!(checker_2.can_read_from(eager_7).is_ok()); + assert!(checker_2.can_write_to(eager_7).is_err()); + assert!(checker_3.can_read_from(eager_7).is_ok()); + assert!(checker_3.can_write_to(eager_7).is_err()); + assert!(checker_4.can_read_from(eager_7).is_ok()); + assert!(checker_4.can_write_to(eager_7).is_err()); + assert!(checker_5.can_read_from(eager_7).is_ok()); + assert!(checker_5.can_write_to(eager_7).is_err()); + assert!(checker_6.can_read_from(eager_7).is_ok()); + assert!(checker_6.can_write_to(eager_7).is_err()); + assert!(checker_7.can_read_from(eager_1).is_ok()); + assert!(checker_7.can_read_from(eager_2).is_ok()); + assert!(checker_7.can_read_from(eager_3).is_ok()); + assert!(checker_7.can_read_from(eager_4).is_ok()); + assert!(checker_7.can_read_from(eager_5).is_ok()); + assert!(checker_7.can_read_from(eager_6).is_ok()); + assert!(checker_7.can_read_from(eager_7).is_ok()); + assert!(checker_7.can_write_to(eager_7).is_ok()); + } + + #[tokio::test] + async fn test_minwriter_v4_with_cdf() { + let checker_5 = ProtocolChecker::new(READER_V2.clone(), WRITER_V4.clone()); + let actions = vec![ + Action::Protocol( + Protocol::new(2, 4) + .with_writer_features(vec![crate::kernel::WriterFeatures::ChangeDataFeed]), + ), + create_metadata_action(None, Some(HashMap::new())), + ]; + let snapshot_5 = DeltaTableState::from_actions(actions).unwrap(); + let eager_5 = snapshot_5.snapshot(); + assert!(checker_5.can_write_to(eager_5).is_ok()); + } + + /// Technically we do not yet support generated columns, but it is okay to "accept" writing to + /// a column with minWriterVersion=4 and the generated columns feature as long as the + /// `delta.generationExpression` isn't actually defined the write is still allowed + #[tokio::test] + async fn test_minwriter_v4_with_generated_columns() { + let checker_5 = ProtocolChecker::new(READER_V2.clone(), WRITER_V4.clone()); + let actions = vec![ + Action::Protocol( + Protocol::new(2, 4) + .with_writer_features(vec![crate::kernel::WriterFeatures::GeneratedColumns]), + ), + create_metadata_action(None, Some(HashMap::new())), + ]; + let snapshot_5 = DeltaTableState::from_actions(actions).unwrap(); + let eager_5 = snapshot_5.snapshot(); + assert!(checker_5.can_write_to(eager_5).is_ok()); + } + + #[tokio::test] + async fn test_minwriter_v4_with_generated_columns_and_expressions() { + let checker_5 = ProtocolChecker::new(READER_V2.clone(), WRITER_V4.clone()); + let actions = vec![Action::Protocol(Protocol::new(2, 4))]; + + let table: crate::DeltaTable = crate::DeltaOps::new_in_memory() + .create() + .with_column( + "value", + DeltaDataType::Primitive(PrimitiveType::Integer), + true, + Some(HashMap::from([( + "delta.generationExpression".into(), + "x IS TRUE".into(), + )])), + ) + .with_actions(actions) + 
.with_configuration_property(DeltaConfigKey::EnableChangeDataFeed, Some("true")) + .await + .expect("failed to make a version 4 table with EnableChangeDataFeed"); + let eager_5 = table + .snapshot() + .expect("Failed to get snapshot from test table"); + assert!(checker_5.can_write_to(eager_5).is_err()); } } diff --git a/crates/core/src/operations/transaction/state.rs b/crates/core/src/operations/transaction/state.rs index ab778f2cb6..d705a616b1 100644 --- a/crates/core/src/operations/transaction/state.rs +++ b/crates/core/src/operations/transaction/state.rs @@ -5,111 +5,35 @@ use arrow::array::{ArrayRef, BooleanArray}; use arrow::datatypes::{ DataType, Field as ArrowField, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, }; -use datafusion::datasource::physical_plan::wrap_partition_type_in_dict; -use datafusion::execution::context::SessionState; +use datafusion::execution::context::SessionContext; use datafusion::physical_optimizer::pruning::{PruningPredicate, PruningStatistics}; use datafusion_common::scalar::ScalarValue; -use datafusion_common::{Column, DFSchema}; -use datafusion_expr::{utils::conjunction, Expr}; -use itertools::Either; +use datafusion_common::{Column, ToDFSchema}; +use datafusion_expr::Expr; +use itertools::Itertools; use object_store::ObjectStore; +use parquet::arrow::arrow_reader::ArrowReaderOptions; use parquet::arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStreamBuilder}; -use crate::delta_datafusion::expr::parse_predicate_expression; -use crate::delta_datafusion::{ - get_null_of_arrow_type, logical_expr_to_physical_expr, to_correct_scalar_value, -}; +use crate::delta_datafusion::{get_null_of_arrow_type, to_correct_scalar_value, DataFusionMixins}; use crate::errors::DeltaResult; -use crate::kernel::Add; +use crate::kernel::{Add, EagerSnapshot}; use crate::table::state::DeltaTableState; impl DeltaTableState { - /// Get the table schema as an [`ArrowSchemaRef`] - pub fn arrow_schema(&self) -> DeltaResult { - self._arrow_schema(true) - } - - fn _arrow_schema(&self, wrap_partitions: bool) -> DeltaResult { - let meta = self.metadata(); - let fields = meta - .schema()? - .fields() - .iter() - .filter(|f| !meta.partition_columns.contains(&f.name().to_string())) - .map(|f| f.try_into()) - .chain( - meta.schema()? - .fields() - .iter() - .filter(|f| meta.partition_columns.contains(&f.name().to_string())) - .map(|f| { - let field = ArrowField::try_from(f)?; - let corrected = if wrap_partitions { - match field.data_type() { - // Only dictionary-encode types that may be large - // // https://github.com/apache/arrow-datafusion/pull/5545 - DataType::Utf8 - | DataType::LargeUtf8 - | DataType::Binary - | DataType::LargeBinary => { - wrap_partition_type_in_dict(field.data_type().clone()) - } - _ => field.data_type().clone(), - } - } else { - field.data_type().clone() - }; - Ok(field.with_data_type(corrected)) - }), - ) - .collect::, _>>()?; - - Ok(Arc::new(ArrowSchema::new(fields))) - } - - pub(crate) fn input_schema(&self) -> DeltaResult { - self._arrow_schema(false) - } - - /// Iterate over all files in the log matching a predicate - pub fn files_matching_predicate( - &self, - filters: &[Expr], - ) -> DeltaResult> { - if let Some(Some(predicate)) = - (!filters.is_empty()).then_some(conjunction(filters.iter().cloned())) - { - let expr = logical_expr_to_physical_expr(&predicate, self.arrow_schema()?.as_ref()); - let pruning_predicate = PruningPredicate::try_new(expr, self.arrow_schema()?)?; - Ok(Either::Left( - self.file_actions()? 
- .into_iter() - .zip(pruning_predicate.prune(self)?) - .filter_map( - |(action, keep_file)| { - if keep_file { - Some(action) - } else { - None - } - }, - ), - )) - } else { - Ok(Either::Right(self.file_actions()?.into_iter())) - } - } - - /// Parse an expression string into a datafusion [`Expr`] - pub fn parse_predicate_expression( + /// Get the physical table schema. + /// + /// This will construct a schema derived from the parquet schema of the latest data file, + /// and fields for partition columns from the schema defined in table meta data. + pub async fn physical_arrow_schema( &self, - expr: impl AsRef, - df_state: &SessionState, - ) -> DeltaResult { - let schema = DFSchema::try_from(self.arrow_schema()?.as_ref().to_owned())?; - parse_predicate_expression(&schema, expr, df_state) + object_store: Arc, + ) -> DeltaResult { + self.snapshot.physical_arrow_schema(object_store).await } +} +impl EagerSnapshot { /// Get the physical table schema. /// /// This will construct a schema derived from the parquet schema of the latest data file, @@ -118,18 +42,17 @@ impl DeltaTableState { &self, object_store: Arc, ) -> DeltaResult { - if let Some(add) = self - .file_actions()? - .iter() - .max_by_key(|obj| obj.modification_time) - { + if let Some(add) = self.file_actions()?.max_by_key(|obj| obj.modification_time) { let file_meta = add.try_into()?; let file_reader = ParquetObjectReader::new(object_store, file_meta); - let file_schema = ParquetRecordBatchStreamBuilder::new(file_reader) - .await? - .build()? - .schema() - .clone(); + let file_schema = ParquetRecordBatchStreamBuilder::new_with_options( + file_reader, + ArrowReaderOptions::new().with_skip_arrow_metadata(true), + ) + .await? + .build()? + .schema() + .clone(); let table_schema = Arc::new(ArrowSchema::new( self.arrow_schema()? @@ -228,7 +151,9 @@ impl<'a> AddContainer<'a> { /// so evaluating expressions is inexact. However, excluded files are guaranteed (for a correct log) /// to not contain matches by the predicate expression. pub fn predicate_matches(&self, predicate: Expr) -> DeltaResult> { - let expr = logical_expr_to_physical_expr(&predicate, &self.schema); + //let expr = logical_expr_to_physical_expr(predicate, &self.schema); + let expr = SessionContext::new() + .create_physical_expr(predicate, &self.schema.clone().to_dfschema()?)?; let pruning_predicate = PruningPredicate::try_new(expr, self.schema.clone())?; Ok(self .inner @@ -298,6 +223,21 @@ impl<'a> PruningStatistics for AddContainer<'a> { ScalarValue::iter_to_array(values).ok() } + /// return the number of rows for the named column in each container + /// as an `Option`. + /// + /// Note: the returned array must contain `num_containers()` rows + fn row_counts(&self, _column: &Column) -> Option { + let values = self.inner.iter().map(|add| { + if let Ok(Some(statistics)) = add.get_stats() { + ScalarValue::UInt64(Some(statistics.num_records as u64)) + } else { + ScalarValue::UInt64(None) + } + }); + ScalarValue::iter_to_array(values).ok() + } + // This function is required since DataFusion 35.0, but is implemented as a no-op // https://github.com/apache/arrow-datafusion/blob/ec6abece2dcfa68007b87c69eefa6b0d7333f628/datafusion/core/src/datasource/physical_plan/parquet/page_filter.rs#L550 fn contained(&self, _column: &Column, _value: &HashSet) -> Option { @@ -305,11 +245,11 @@ impl<'a> PruningStatistics for AddContainer<'a> { } } -impl PruningStatistics for DeltaTableState { +impl PruningStatistics for EagerSnapshot { /// return the minimum values for the named column, if known. 
/// Note: the returned array must contain `num_containers()` rows fn min_values(&self, column: &Column) -> Option { - let files = self.file_actions().ok()?; + let files = self.file_actions().ok()?.collect_vec(); let partition_columns = &self.metadata().partition_columns; let container = AddContainer::new(&files, partition_columns, self.arrow_schema().ok()?); container.min_values(column) @@ -318,7 +258,7 @@ impl PruningStatistics for DeltaTableState { /// return the maximum values for the named column, if known. /// Note: the returned array must contain `num_containers()` rows. fn max_values(&self, column: &Column) -> Option { - let files = self.file_actions().ok()?; + let files = self.file_actions().ok()?.collect_vec(); let partition_columns = &self.metadata().partition_columns; let container = AddContainer::new(&files, partition_columns, self.arrow_schema().ok()?); container.max_values(column) @@ -335,12 +275,23 @@ impl PruningStatistics for DeltaTableState { /// /// Note: the returned array must contain `num_containers()` rows. fn null_counts(&self, column: &Column) -> Option { - let files = self.file_actions().ok()?; + let files = self.file_actions().ok()?.collect_vec(); let partition_columns = &self.metadata().partition_columns; let container = AddContainer::new(&files, partition_columns, self.arrow_schema().ok()?); container.null_counts(column) } + /// return the number of rows for the named column in each container + /// as an `Option`. + /// + /// Note: the returned array must contain `num_containers()` rows + fn row_counts(&self, column: &Column) -> Option { + let files = self.file_actions().ok()?.collect_vec(); + let partition_columns = &self.metadata().partition_columns; + let container = AddContainer::new(&files, partition_columns, self.arrow_schema().ok()?); + container.row_counts(column) + } + // This function is required since DataFusion 35.0, but is implemented as a no-op // https://github.com/apache/arrow-datafusion/blob/ec6abece2dcfa68007b87c69eefa6b0d7333f628/datafusion/core/src/datasource/physical_plan/parquet/page_filter.rs#L550 fn contained(&self, _column: &Column, _value: &HashSet) -> Option { @@ -348,9 +299,36 @@ impl PruningStatistics for DeltaTableState { } } +impl PruningStatistics for DeltaTableState { + fn min_values(&self, column: &Column) -> Option { + self.snapshot.min_values(column) + } + + fn max_values(&self, column: &Column) -> Option { + self.snapshot.max_values(column) + } + + fn num_containers(&self) -> usize { + self.snapshot.num_containers() + } + + fn null_counts(&self, column: &Column) -> Option { + self.snapshot.null_counts(column) + } + + fn row_counts(&self, column: &Column) -> Option { + self.snapshot.row_counts(column) + } + + fn contained(&self, column: &Column, values: &HashSet) -> Option { + self.snapshot.contained(column, values) + } +} + #[cfg(test)] mod tests { use super::*; + use crate::delta_datafusion::DataFusionFileMixins; use crate::operations::transaction::test_utils::{create_add_action, init_table_actions}; use datafusion::prelude::SessionContext; use datafusion_expr::{col, lit}; @@ -391,6 +369,7 @@ mod tests { let state = DeltaTableState::from_actions(actions).unwrap(); let files = state + .snapshot .files_matching_predicate(&[]) .unwrap() .collect::>(); @@ -401,6 +380,7 @@ mod tests { .or(col("value").lt_eq(lit::(0))); let files = state + .snapshot .files_matching_predicate(&[predictate]) .unwrap() .collect::>(); diff --git a/crates/core/src/operations/transaction/test_utils.rs 
b/crates/core/src/operations/transaction/test_utils.rs index 484f69909a..ada5ded056 100644 --- a/crates/core/src/operations/transaction/test_utils.rs +++ b/crates/core/src/operations/transaction/test_utils.rs @@ -1,7 +1,7 @@ #![allow(unused)] use std::collections::HashMap; -use super::prepare_commit; +use super::CommitBuilder; use crate::kernel::{ Action, Add, CommitInfo, DataType, Metadata, PrimitiveType, Protocol, Remove, StructField, StructType, @@ -162,17 +162,9 @@ pub async fn create_initialized_table( }, }; let actions = init_table_actions(None); - let prepared_commit = prepare_commit( - log_store.object_store().as_ref(), - &operation, - &actions, - None, - ) - .await - .unwrap(); - - log_store - .write_commit_entry(0, &prepared_commit) + CommitBuilder::default() + .with_actions(actions) + .build(None, log_store.clone(), operation) .await .unwrap(); DeltaTable::new_with_state(log_store, state) diff --git a/crates/core/src/operations/update.rs b/crates/core/src/operations/update.rs index d07f3f9fc0..2a947f486f 100644 --- a/crates/core/src/operations/update.rs +++ b/crates/core/src/operations/update.rs @@ -19,42 +19,59 @@ //! ```` use std::{ - collections::{HashMap, HashSet}, + collections::HashMap, sync::Arc, time::{Instant, SystemTime, UNIX_EPOCH}, }; -use arrow::datatypes::Schema as ArrowSchema; -use arrow_schema::Field; +use super::write::{write_execution_plan, write_execution_plan_cdc}; +use super::{ + datafusion_utils::Expression, + transaction::{CommitBuilder, CommitProperties}, +}; +use super::{transaction::PROTOCOL, write::WriterStatsConfig}; +use crate::delta_datafusion::{find_files, planner::DeltaPlanner, register_store}; +use crate::kernel::{Action, Remove}; +use crate::logstore::LogStoreRef; +use crate::operations::cdc::*; +use crate::protocol::DeltaOperation; +use crate::table::state::DeltaTableState; +use crate::{ + delta_datafusion::{ + expr::fmt_expr_to_sql, + logical::MetricObserver, + physical::{find_metric_node, get_metric, MetricObserverExec}, + DataFusionMixins, DeltaColumn, DeltaScanConfigBuilder, DeltaSessionContext, + DeltaTableProvider, + }, + DeltaTableError, +}; +use crate::{DeltaResult, DeltaTable}; +use async_trait::async_trait; +use datafusion::error::Result as DataFusionResult; use datafusion::{ + dataframe::DataFrame, + datasource::provider_as_source, execution::context::SessionState, - physical_plan::{metrics::MetricBuilder, projection::ProjectionExec, ExecutionPlan}, + physical_plan::{metrics::MetricBuilder, ExecutionPlan}, + physical_planner::{ExtensionPlanner, PhysicalPlanner}, prelude::SessionContext, }; -use datafusion_common::{Column, DFSchema, ScalarValue}; -use datafusion_expr::{case, col, lit, when, Expr}; -use datafusion_physical_expr::{ - create_physical_expr, - expressions::{self}, - PhysicalExpr, +use datafusion_common::{Column, ScalarValue}; +use datafusion_expr::{ + case, col, lit, when, Expr, Extension, LogicalPlan, LogicalPlanBuilder, UserDefinedLogicalNode, }; use futures::future::BoxFuture; use parquet::file::properties::WriterProperties; use serde::Serialize; -use serde_json::Value; +use tracing::log::*; -use super::datafusion_utils::Expression; -use super::transaction::{commit, PROTOCOL}; -use super::write::write_execution_plan; -use crate::delta_datafusion::{ - expr::fmt_expr_to_sql, physical::MetricObserverExec, DeltaColumn, DeltaSessionContext, -}; -use crate::delta_datafusion::{find_files, register_store, DeltaScanBuilder}; -use crate::kernel::{Action, Remove}; -use crate::logstore::LogStoreRef; -use 
crate::protocol::DeltaOperation; -use crate::table::state::DeltaTableState; -use crate::{DeltaResult, DeltaTable}; +/// Custom column name used for marking internal [RecordBatch] rows as updated +pub(crate) const UPDATE_PREDICATE_COLNAME: &str = "__delta_rs_update_predicate"; + +const UPDATE_COUNT_ID: &str = "update_source_count"; +const UPDATE_ROW_COUNT: &str = "num_updated_rows"; +const COPIED_ROW_COUNT: &str = "num_copied_rows"; /// Updates records in the Delta Table. /// See this module's documentation for more information @@ -71,8 +88,8 @@ pub struct UpdateBuilder { state: Option, /// Properties passed to underlying parquet writer for when files are rewritten writer_properties: Option, - /// Additional metadata to be added to commit - app_metadata: Option>, + /// Additional information to add to the commit + commit_properties: CommitProperties, /// safe_cast determines how data types that do not match the underlying table are handled /// By default an error is returned safe_cast: bool, @@ -95,6 +112,8 @@ pub struct UpdateMetrics { pub scan_time_ms: u64, } +impl super::Operation<()> for UpdateBuilder {} + impl UpdateBuilder { /// Create a new ['UpdateBuilder'] pub fn new(log_store: LogStoreRef, snapshot: DeltaTableState) -> Self { @@ -105,7 +124,7 @@ impl UpdateBuilder { log_store, state: None, writer_properties: None, - app_metadata: None, + commit_properties: CommitProperties::default(), safe_cast: false, } } @@ -133,11 +152,8 @@ impl UpdateBuilder { } /// Additional metadata to be added to commit info - pub fn with_metadata( - mut self, - metadata: impl IntoIterator, - ) -> Self { - self.app_metadata = Some(HashMap::from_iter(metadata)); + pub fn with_commit_properties(mut self, commit_properties: CommitProperties) -> Self { + self.commit_properties = commit_properties; self } @@ -162,17 +178,55 @@ impl UpdateBuilder { } } +#[derive(Clone)] +struct UpdateMetricExtensionPlanner {} + +#[async_trait] +impl ExtensionPlanner for UpdateMetricExtensionPlanner { + async fn plan_extension( + &self, + _planner: &dyn PhysicalPlanner, + node: &dyn UserDefinedLogicalNode, + _logical_inputs: &[&LogicalPlan], + physical_inputs: &[Arc], + _session_state: &SessionState, + ) -> DataFusionResult>> { + if let Some(metric_observer) = node.as_any().downcast_ref::() { + if metric_observer.id.eq(UPDATE_COUNT_ID) { + return Ok(Some(MetricObserverExec::try_new( + UPDATE_COUNT_ID.into(), + physical_inputs, + |batch, metrics| { + let array = batch.column_by_name(UPDATE_PREDICATE_COLNAME).unwrap(); + let copied_rows = array.null_count(); + let num_updated = array.len() - copied_rows; + + MetricBuilder::new(metrics) + .global_counter(UPDATE_ROW_COUNT) + .add(num_updated); + + MetricBuilder::new(metrics) + .global_counter(COPIED_ROW_COUNT) + .add(copied_rows); + }, + )?)); + } + } + Ok(None) + } +} + #[allow(clippy::too_many_arguments)] async fn execute( predicate: Option, updates: HashMap, log_store: LogStoreRef, - snapshot: &DeltaTableState, + snapshot: DeltaTableState, state: SessionState, writer_properties: Option, - app_metadata: Option>, + mut commit_properties: CommitProperties, safe_cast: bool, -) -> DeltaResult<((Vec, i64, Option), UpdateMetrics)> { +) -> DeltaResult<(DeltaTableState, UpdateMetrics)> { // Validate the predicate and update expressions. // // If the predicate is not set, then all files need to be updated. @@ -183,12 +237,17 @@ async fn execute( // perform update operations, and then commit add and remove actions to // the log. 
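From the caller's side, the switch from `with_metadata` to `with_commit_properties` looks roughly like the following sketch, assuming a `table: DeltaTable`, the DataFusion `col`/`lit` helpers, and illustrative column names and metadata:

    let (table, metrics) = DeltaOps(table)
        .update()
        .with_predicate(col("id").eq(lit(10)))
        .with_update("value", lit(100))
        .with_commit_properties(
            CommitProperties::default()
                .with_metadata([("operation_origin".to_string(), serde_json::json!("backfill"))]),
        )
        .await?;
    println!("updated {} rows", metrics.num_updated_rows);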
+ let update_planner = DeltaPlanner:: { + extension_planner: UpdateMetricExtensionPlanner {}, + }; + + let state = state.clone().with_query_planner(Arc::new(update_planner)); + let exec_start = Instant::now(); let mut metrics = UpdateMetrics::default(); - let mut version = snapshot.version(); if updates.is_empty() { - return Ok(((Vec::new(), version, None), metrics)); + return Ok((snapshot, metrics)); } let predicate = match predicate { @@ -199,185 +258,126 @@ async fn execute( None => None, }; - let updates: HashMap = updates + let updates = updates .into_iter() .map(|(key, expr)| match expr { - Expression::DataFusion(e) => Ok((key, e)), + Expression::DataFusion(e) => Ok((key.name, e)), Expression::String(s) => snapshot .parse_predicate_expression(s, &state) - .map(|e| (key, e)), + .map(|e| (key.name, e)), }) - .collect::, _>>()?; + .collect::, _>>()?; let current_metadata = snapshot.metadata(); let table_partition_cols = current_metadata.partition_columns.clone(); let scan_start = Instant::now(); - let candidates = find_files(snapshot, log_store.clone(), &state, predicate.clone()).await?; + let candidates = find_files(&snapshot, log_store.clone(), &state, predicate.clone()).await?; metrics.scan_time_ms = Instant::now().duration_since(scan_start).as_millis() as u64; if candidates.candidates.is_empty() { - return Ok(((Vec::new(), version, None), metrics)); + return Ok((snapshot, metrics)); } let predicate = predicate.unwrap_or(Expr::Literal(ScalarValue::Boolean(Some(true)))); - let execution_props = state.execution_props(); + let scan_config = DeltaScanConfigBuilder::default() + .with_file_column(false) + .build(&snapshot)?; + // For each rewrite evaluate the predicate and then modify each expression // to either compute the new value or obtain the old one then write these batches - let scan = DeltaScanBuilder::new(snapshot, log_store.clone(), &state) - .with_files(&candidates.candidates) - .build() - .await?; - let scan = Arc::new(scan); + let target_provider = Arc::new( + DeltaTableProvider::try_new(snapshot.clone(), log_store.clone(), scan_config.clone())? + .with_files(candidates.candidates.clone()), + ); - // Create a projection for a new column with the predicate evaluated - let input_schema = snapshot.input_schema()?; + let target_provider = provider_as_source(target_provider); + let plan = LogicalPlanBuilder::scan("target", target_provider.clone(), None)?.build()?; - let mut fields = Vec::new(); - for field in input_schema.fields.iter() { - fields.push(field.to_owned()); - } - fields.push(Arc::new(Field::new( - "__delta_rs_update_predicate", - arrow_schema::DataType::Boolean, - true, - ))); - // Recreate the schemas with the new column included - let input_schema = Arc::new(ArrowSchema::new(fields)); - let input_dfschema: DFSchema = input_schema.as_ref().clone().try_into()?; - - let mut expressions: Vec<(Arc, String)> = Vec::new(); - let scan_schema = scan.schema(); - for (i, field) in scan_schema.fields().into_iter().enumerate() { - expressions.push(( - Arc::new(expressions::Column::new(field.name(), i)), - field.name().to_owned(), - )); - } + let df = DataFrame::new(state.clone(), plan); // Take advantage of how null counts are tracked in arrow arrays use the // null count to track how many records do NOT statisfy the predicate. 
The // count is then exposed through the metrics through the `UpdateCountExec` // execution plan - let predicate_null = when(predicate.clone(), lit(true)).otherwise(lit(ScalarValue::Boolean(None)))?; - let predicate_expr = create_physical_expr(&predicate_null, &input_dfschema, execution_props)?; - expressions.push((predicate_expr, "__delta_rs_update_predicate".to_string())); - - let projection_predicate: Arc = - Arc::new(ProjectionExec::try_new(expressions, scan)?); - - let count_plan = Arc::new(MetricObserverExec::new( - "update_count".into(), - projection_predicate.clone(), - |batch, metrics| { - let array = batch.column_by_name("__delta_rs_update_predicate").unwrap(); - let copied_rows = array.null_count(); - let num_updated = array.len() - copied_rows; - - MetricBuilder::new(metrics) - .global_counter("num_updated_rows") - .add(num_updated); - - MetricBuilder::new(metrics) - .global_counter("num_copied_rows") - .add(copied_rows); - }, - )); - // Perform another projection but instead calculate updated values based on - // the predicate value. If the predicate is true then evalute the user - // provided expression otherwise return the original column value - // - // For each update column a new column with a name of __delta_rs_ + `original name` is created - let mut expressions: Vec<(Arc, String)> = Vec::new(); - let scan_schema = count_plan.schema(); - for (i, field) in scan_schema.fields().into_iter().enumerate() { - expressions.push(( - Arc::new(expressions::Column::new(field.name(), i)), - field.name().to_owned(), - )); - } + let df_with_update_col = df + .clone() + .with_column(UPDATE_PREDICATE_COLNAME, predicate_null)?; - // Maintain a map from the original column name to its temporary column index - let mut map = HashMap::::new(); - let mut control_columns = HashSet::::new(); - control_columns.insert("__delta_rs_update_predicate".to_owned()); - - for (column, expr) in updates { - let expr = case(col("__delta_rs_update_predicate")) - .when(lit(true), expr.to_owned()) - .otherwise(col(column.to_owned()))?; - let predicate_expr = create_physical_expr(&expr, &input_dfschema, execution_props)?; - map.insert(column.name.clone(), expressions.len()); - let c = "__delta_rs_".to_string() + &column.name; - expressions.push((predicate_expr, c.clone())); - control_columns.insert(c); - } + let plan_with_metrics = LogicalPlan::Extension(Extension { + node: Arc::new(MetricObserver { + id: UPDATE_COUNT_ID.into(), + input: df_with_update_col.into_unoptimized_plan(), + enable_pushdown: false, + }), + }); - let projection_update: Arc = - Arc::new(ProjectionExec::try_new(expressions, count_plan.clone())?); - - // Project again to remove __delta_rs columns and rename update columns to their original name - let mut expressions: Vec<(Arc, String)> = Vec::new(); - let scan_schema = projection_update.schema(); - for (i, field) in scan_schema.fields().into_iter().enumerate() { - if !control_columns.contains(field.name()) { - match map.get(field.name()) { - Some(value) => { - expressions.push(( - Arc::new(expressions::Column::new(field.name(), *value)), - field.name().to_owned(), - )); - } - None => { - expressions.push(( - Arc::new(expressions::Column::new(field.name(), i)), - field.name().to_owned(), - )); - } - } - } - } + let df_with_predicate_and_metrics = DataFrame::new(state.clone(), plan_with_metrics); - let projection: Arc = Arc::new(ProjectionExec::try_new( - expressions, - projection_update.clone(), - )?); + let expressions: Vec = df_with_predicate_and_metrics + .schema() + .fields() + 
.into_iter() + .map(|field| { + let field_name = field.name(); + let expr = match updates.get(field_name) { + Some(expr) => case(col(UPDATE_PREDICATE_COLNAME)) + .when(lit(true), expr.to_owned()) + .otherwise(col(Column::from_name(field_name)))? + .alias(field_name), + None => col(Column::from_name(field_name)), + }; + Ok(expr) + }) + .collect::>>()?; + + let updated_df = df_with_predicate_and_metrics.select(expressions.clone())?; + let physical_plan = updated_df.clone().create_physical_plan().await?; + let writer_stats_config = WriterStatsConfig::new( + snapshot.table_config().num_indexed_cols(), + snapshot + .table_config() + .stats_columns() + .map(|v| v.iter().map(|v| v.to_string()).collect::>()), + ); + + let tracker = CDCTracker::new( + df, + updated_df.drop_columns(&vec![UPDATE_PREDICATE_COLNAME])?, + ); let add_actions = write_execution_plan( - Some(snapshot), + Some(&snapshot), state.clone(), - projection.clone(), + physical_plan.clone(), table_partition_cols.clone(), log_store.object_store().clone(), Some(snapshot.table_config().target_file_size() as usize), None, - writer_properties, + writer_properties.clone(), safe_cast, - false, + None, + writer_stats_config.clone(), + None, ) .await?; - let count_metrics = count_plan.metrics().unwrap(); - - metrics.num_updated_rows = count_metrics - .sum_by_name("num_updated_rows") - .map(|m| m.as_usize()) - .unwrap_or(0); + let err = || DeltaTableError::Generic("Unable to locate expected metric node".into()); + let update_count = find_metric_node(UPDATE_COUNT_ID, &physical_plan).ok_or_else(err)?; + let update_count_metrics = update_count.metrics().unwrap(); - metrics.num_copied_rows = count_metrics - .sum_by_name("num_copied_rows") - .map(|m| m.as_usize()) - .unwrap_or(0); + metrics.num_updated_rows = get_metric(&update_count_metrics, UPDATE_ROW_COUNT); + metrics.num_copied_rows = get_metric(&update_count_metrics, COPIED_ROW_COUNT); let deletion_timestamp = SystemTime::now() .duration_since(UNIX_EPOCH) .unwrap() .as_millis() as i64; - let mut actions: Vec = add_actions.into_iter().map(Action::Add).collect(); + let mut actions: Vec = add_actions.clone(); metrics.num_added_files = actions.len(); metrics.num_removed_files = candidates.candidates.len(); @@ -403,27 +403,46 @@ async fn execute( predicate: Some(fmt_expr_to_sql(&predicate)?), }; - let mut app_metadata = match app_metadata { - Some(meta) => meta, - None => HashMap::new(), - }; - - app_metadata.insert("readVersion".to_owned(), snapshot.version().into()); - - if let Ok(map) = serde_json::to_value(&metrics) { - app_metadata.insert("operationMetrics".to_owned(), map); + commit_properties + .app_metadata + .insert("readVersion".to_owned(), snapshot.version().into()); + + commit_properties.app_metadata.insert( + "operationMetrics".to_owned(), + serde_json::to_value(&metrics)?, + ); + + if let Ok(true) = should_write_cdc(&snapshot) { + match tracker.collect() { + Ok(df) => { + let cdc_actions = write_execution_plan_cdc( + Some(&snapshot), + state, + df.create_physical_plan().await?, + table_partition_cols, + log_store.object_store(), + Some(snapshot.table_config().target_file_size() as usize), + None, + writer_properties, + safe_cast, + writer_stats_config, + None, + ) + .await?; + actions.extend(cdc_actions); + } + Err(err) => { + error!("Failed to collect CDC batches: {err:#?}"); + } + }; } - version = commit( - log_store.as_ref(), - &actions, - operation.clone(), - Some(snapshot), - Some(app_metadata), - ) - .await?; + let commit = CommitBuilder::from(commit_properties) + 
.with_actions(actions) + .build(Some(&snapshot), log_store, operation) + .await?; - Ok(((actions, version, Some(operation)), metrics)) + Ok((commit.snapshot(), metrics)) } impl std::future::IntoFuture for UpdateBuilder { @@ -431,12 +450,11 @@ impl std::future::IntoFuture for UpdateBuilder { type IntoFuture = BoxFuture<'static, Self::Output>; fn into_future(self) -> Self::IntoFuture { - let mut this = self; + let this = self; Box::pin(async move { - PROTOCOL.check_append_only(&this.snapshot)?; - - PROTOCOL.can_write_to(&this.snapshot)?; + PROTOCOL.check_append_only(&this.snapshot.snapshot)?; + PROTOCOL.can_write_to(&this.snapshot.snapshot)?; let state = this.state.unwrap_or_else(|| { let session: SessionContext = DeltaSessionContext::default().into(); @@ -447,34 +465,34 @@ impl std::future::IntoFuture for UpdateBuilder { session.state() }); - let ((actions, version, operation), metrics) = execute( + let (snapshot, metrics) = execute( this.predicate, this.updates, this.log_store.clone(), - &this.snapshot, + this.snapshot, state, this.writer_properties, - this.app_metadata, + this.commit_properties, this.safe_cast, ) .await?; - if let Some(op) = &operation { - this.snapshot.merge(actions, op, version)?; - } - - let table = DeltaTable::new_with_state(this.log_store, this.snapshot); - Ok((table, metrics)) + Ok(( + DeltaTable::new_with_state(this.log_store, snapshot), + metrics, + )) }) } } #[cfg(test)] mod tests { + use super::*; + + use crate::delta_datafusion::cdf::DeltaCdfScan; use crate::kernel::DataType as DeltaDataType; - use crate::kernel::PrimitiveType; - use crate::kernel::StructField; - use crate::kernel::StructType; + use crate::kernel::{Action, PrimitiveType, Protocol, StructField, StructType}; + use crate::operations::collect_sendable_stream; use crate::operations::DeltaOps; use crate::writer::test_utils::datafusion::get_data; use crate::writer::test_utils::datafusion::write_batch; @@ -483,12 +501,13 @@ mod tests { }; use crate::DeltaConfigKey; use crate::DeltaTable; + use arrow::array::{Int32Array, StringArray}; use arrow::datatypes::Schema as ArrowSchema; use arrow::datatypes::{Field, Schema}; use arrow::record_batch::RecordBatch; - use arrow_array::Int32Array; use arrow_schema::DataType; use datafusion::assert_batches_sorted_eq; + use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::*; use serde_json::json; use std::sync::Arc; @@ -498,7 +517,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() - .with_columns(table_schema.fields().clone()) + .with_columns(table_schema.fields().cloned()) .with_partition_columns(partitions.unwrap_or_default()) .await .unwrap(); @@ -788,7 +807,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() - .with_columns(schema.fields().clone()) + .with_columns(schema.fields().cloned()) .await .unwrap(); let table = write_batch(table, batch).await; @@ -968,4 +987,248 @@ mod tests { .await; assert!(res.is_err()); } + + #[tokio::test] + async fn test_no_cdc_on_older_tables() { + let table = prepare_values_table().await; + assert_eq!(table.version(), 0); + assert_eq!(table.get_files_count(), 1); + + let schema = Arc::new(Schema::new(vec![Field::new( + "value", + arrow::datatypes::DataType::Int32, + true, + )])); + let batch = RecordBatch::try_new( + schema, + vec![Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)]))], + ) + .unwrap(); + let table = DeltaOps(table) + .write(vec![batch]) + .await + .expect("Failed to write first batch"); + assert_eq!(table.version(), 1); + + let (table, _metrics) = DeltaOps(table) 
+ .update() + .with_predicate(col("value").eq(lit(2))) + .with_update("value", lit(12)) + .await + .unwrap(); + assert_eq!(table.version(), 2); + + // NOTE: This currently doesn't really assert anything because cdc_files() is not reading + // actions correct + if let Some(state) = table.state.clone() { + let cdc_files = state.cdc_files(); + assert!(cdc_files.is_ok()); + if let Ok(cdc_files) = cdc_files { + let cdc_files: Vec<_> = cdc_files.collect(); + assert_eq!(cdc_files.len(), 0); + } + } else { + panic!("I shouldn't exist!"); + } + + // Too close for missiles, switching to guns. Just checking that the data wasn't actually + // written instead! + if let Ok(files) = crate::storage::utils::flatten_list_stream( + &table.object_store(), + Some(&object_store::path::Path::from("_change_data")), + ) + .await + { + assert_eq!( + 0, + files.len(), + "This test should not find any written CDC files! {files:#?}" + ); + } + } + + #[tokio::test] + async fn test_update_cdc_enabled() { + // Currently you cannot pass EnableChangeDataFeed through `with_configuration_property` + // so the only way to create a truly CDC enabled table is by shoving the Protocol + // directly into the actions list + let actions = vec![Action::Protocol(Protocol::new(1, 4))]; + let table: DeltaTable = DeltaOps::new_in_memory() + .create() + .with_column( + "value", + DeltaDataType::Primitive(PrimitiveType::Integer), + true, + None, + ) + .with_actions(actions) + .with_configuration_property(DeltaConfigKey::EnableChangeDataFeed, Some("true")) + .await + .unwrap(); + assert_eq!(table.version(), 0); + + let schema = Arc::new(Schema::new(vec![Field::new( + "value", + arrow::datatypes::DataType::Int32, + true, + )])); + + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)]))], + ) + .unwrap(); + let table = DeltaOps(table) + .write(vec![batch]) + .await + .expect("Failed to write first batch"); + assert_eq!(table.version(), 1); + + let (table, _metrics) = DeltaOps(table) + .update() + .with_predicate(col("value").eq(lit(2))) + .with_update("value", lit(12)) + .await + .unwrap(); + assert_eq!(table.version(), 2); + + let ctx = SessionContext::new(); + let table = DeltaOps(table) + .load_cdf() + .with_session_ctx(ctx.clone()) + .with_starting_version(0) + .build() + .await + .expect("Failed to load CDF"); + + let mut batches = collect_batches( + table.properties().output_partitioning().partition_count(), + table, + ctx, + ) + .await + .expect("Failed to collect batches"); + + // The batches will contain a current _commit_timestamp which shouldn't be check_append_only + let _: Vec<_> = batches.iter_mut().map(|b| b.remove_column(3)).collect(); + + assert_batches_sorted_eq! 
{[ + "+-------+------------------+-----------------+", + "| value | _change_type | _commit_version |", + "+-------+------------------+-----------------+", + "| 1 | insert | 1 |", + "| 2 | insert | 1 |", + "| 2 | update_preimage | 2 |", + "| 12 | update_postimage | 2 |", + "| 3 | insert | 1 |", + "+-------+------------------+-----------------+", + ], &batches } + } + + #[tokio::test] + async fn test_update_cdc_enabled_partitions() { + // Currently you cannot pass EnableChangeDataFeed through `with_configuration_property` + // so the only way to create a truly CDC enabled table is by shoving the Protocol + // directly into the actions list + let actions = vec![Action::Protocol(Protocol::new(1, 4))]; + let table: DeltaTable = DeltaOps::new_in_memory() + .create() + .with_column( + "year", + DeltaDataType::Primitive(PrimitiveType::String), + true, + None, + ) + .with_column( + "value", + DeltaDataType::Primitive(PrimitiveType::Integer), + true, + None, + ) + .with_partition_columns(vec!["year"]) + .with_actions(actions) + .with_configuration_property(DeltaConfigKey::EnableChangeDataFeed, Some("true")) + .await + .unwrap(); + assert_eq!(table.version(), 0); + + let schema = Arc::new(Schema::new(vec![ + Field::new("year", DataType::Utf8, true), + Field::new("value", DataType::Int32, true), + ])); + + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(StringArray::from(vec![ + Some("2020"), + Some("2020"), + Some("2024"), + ])), + Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)])), + ], + ) + .unwrap(); + let table = DeltaOps(table) + .write(vec![batch]) + .await + .expect("Failed to write first batch"); + assert_eq!(table.version(), 1); + + let (table, _metrics) = DeltaOps(table) + .update() + .with_predicate(col("value").eq(lit(2))) + .with_update("year", "2024") + .await + .unwrap(); + assert_eq!(table.version(), 2); + + let ctx = SessionContext::new(); + let table = DeltaOps(table) + .load_cdf() + .with_session_ctx(ctx.clone()) + .with_starting_version(0) + .build() + .await + .expect("Failed to load CDF"); + + let mut batches = collect_batches( + table.properties().output_partitioning().partition_count(), + table, + ctx, + ) + .await + .expect("Failed to collect batches"); + + let _ = arrow::util::pretty::print_batches(&batches); + + // The batches will contain a current _commit_timestamp which shouldn't be check_append_only + let _: Vec<_> = batches.iter_mut().map(|b| b.remove_column(3)).collect(); + + assert_batches_sorted_eq! {[ + "+-------+------------------+-----------------+------+", + "| value | _change_type | _commit_version | year |", + "+-------+------------------+-----------------+------+", + "| 1 | insert | 1 | 2020 |", + "| 2 | insert | 1 | 2020 |", + "| 2 | update_preimage | 2 | 2020 |", + "| 2 | update_postimage | 2 | 2024 |", + "| 3 | insert | 1 | 2024 |", + "+-------+------------------+-----------------+------+", + ], &batches } + } + + async fn collect_batches( + num_partitions: usize, + stream: DeltaCdfScan, + ctx: SessionContext, + ) -> Result, Box> { + let mut batches = vec![]; + for p in 0..num_partitions { + let data: Vec = + collect_sendable_stream(stream.execute(p, ctx.task_ctx())?).await?; + batches.extend_from_slice(&data); + } + Ok(batches) + } } diff --git a/crates/core/src/operations/vacuum.rs b/crates/core/src/operations/vacuum.rs index f539b0e22d..0e4bd2b467 100644 --- a/crates/core/src/operations/vacuum.rs +++ b/crates/core/src/operations/vacuum.rs @@ -21,7 +21,7 @@ //! 
let (table, metrics) = VacuumBuilder::new(table.object_store(). table.state).await?; //! ```` -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; use std::fmt::Debug; use std::sync::Arc; @@ -31,13 +31,10 @@ use futures::{StreamExt, TryStreamExt}; use object_store::Error; use object_store::{path::Path, ObjectStore}; use serde::Serialize; -use serde_json::Value; -use super::transaction::commit; -use crate::crate_version; +use super::transaction::{CommitBuilder, CommitProperties}; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::kernel::Action; -use crate::logstore::{LogStore, LogStoreRef}; +use crate::logstore::LogStoreRef; use crate::protocol::DeltaOperation; use crate::table::state::DeltaTableState; use crate::DeltaTable; @@ -94,10 +91,12 @@ pub struct VacuumBuilder { dry_run: bool, /// Override the source of time clock: Option>, - /// Additional metadata to be added to commit - app_metadata: Option>, + /// Additional information to add to the commit + commit_properties: CommitProperties, } +impl super::Operation<()> for VacuumBuilder {} + /// Details for the Vacuum operation including which files were #[derive(Debug)] pub struct VacuumMetrics { @@ -138,7 +137,7 @@ impl VacuumBuilder { enforce_retention_duration: true, dry_run: false, clock: None, - app_metadata: None, + commit_properties: CommitProperties::default(), } } @@ -168,11 +167,8 @@ impl VacuumBuilder { } /// Additional metadata to be added to commit info - pub fn with_metadata( - mut self, - metadata: impl IntoIterator, - ) -> Self { - self.app_metadata = Some(HashMap::from_iter(metadata)); + pub fn with_commit_properties(mut self, commit_properties: CommitProperties) -> Self { + self.commit_properties = commit_properties; self } @@ -258,7 +254,11 @@ impl std::future::IntoFuture for VacuumBuilder { } let metrics = plan - .execute(this.log_store.as_ref(), &this.snapshot, this.app_metadata) + .execute( + this.log_store.clone(), + &this.snapshot, + this.commit_properties, + ) .await?; Ok(( DeltaTable::new_with_state(this.log_store, this.snapshot), @@ -286,9 +286,9 @@ impl VacuumPlan { /// Execute the vacuum plan and delete files from underlying storage pub async fn execute( self, - store: &dyn LogStore, + store: LogStoreRef, snapshot: &DeltaTableState, - app_metadata: Option>, + mut commit_properties: CommitProperties, ) -> Result { if self.files_to_delete.is_empty() { return Ok(VacuumMetrics { @@ -307,30 +307,22 @@ impl VacuumPlan { status: String::from("COMPLETED"), // Maybe this should be FAILED when vacuum has error during the files, not sure how to check for this }; - let start_metrics = serde_json::to_value(VacuumStartOperationMetrics { + let start_metrics = VacuumStartOperationMetrics { num_files_to_delete: self.files_to_delete.len() as i64, size_of_data_to_delete: self.file_sizes.iter().sum(), - }); + }; // Begin VACUUM START COMMIT - let mut commit_info = start_operation.get_commit_info(); - let mut extra_info = match app_metadata.clone() { - Some(meta) => meta, - None => HashMap::new(), - }; - commit_info.timestamp = Some(Utc::now().timestamp_millis()); - extra_info.insert( - "clientVersion".to_string(), - Value::String(format!("delta-rs.{}", crate_version())), + let mut start_props = CommitProperties::default(); + start_props.app_metadata = commit_properties.app_metadata.clone(); + start_props.app_metadata.insert( + "operationMetrics".to_owned(), + serde_json::to_value(start_metrics)?, ); - if let Ok(map) = start_metrics { - extra_info.insert("operationMetrics".to_owned(), map); - } - 
commit_info.info = extra_info; - let start_actions = vec![Action::CommitInfo(commit_info)]; - - commit(store, &start_actions, start_operation, Some(snapshot), None).await?; + CommitBuilder::from(start_props) + .build(Some(snapshot), store.clone(), start_operation) + .await?; // Finish VACUUM START COMMIT let locations = futures::stream::iter(self.files_to_delete) @@ -349,32 +341,19 @@ impl VacuumPlan { .await?; // Create end metadata - let end_metrics = serde_json::to_value(VacuumEndOperationMetrics { + let end_metrics = VacuumEndOperationMetrics { num_deleted_files: files_deleted.len() as i64, num_vacuumed_directories: 0, // Set to zero since we only remove files not dirs - }); - - // Begin VACUUM END COMMIT - let mut commit_info = end_operation.get_commit_info(); - - let mut extra_info = match app_metadata.clone() { - Some(meta) => meta, - None => HashMap::new(), }; - commit_info.timestamp = Some(Utc::now().timestamp_millis()); - extra_info.insert( - "clientVersion".to_string(), - Value::String(format!("delta-rs.{}", crate_version())), + // Begin VACUUM END COMMIT + commit_properties.app_metadata.insert( + "operationMetrics".to_owned(), + serde_json::to_value(end_metrics)?, ); - if let Ok(map) = end_metrics { - extra_info.insert("operationMetrics".to_owned(), map); - } - commit_info.info = extra_info; - - let end_actions = vec![Action::CommitInfo(commit_info)]; - - commit(store, &end_actions, end_operation, Some(snapshot), None).await?; + CommitBuilder::from(commit_properties) + .build(Some(snapshot), store.clone(), end_operation) + .await?; // Finish VACUUM END COMMIT Ok(VacuumMetrics { diff --git a/crates/core/src/operations/write.rs b/crates/core/src/operations/write.rs index bb976b5fb9..923eadeeaf 100644 --- a/crates/core/src/operations/write.rs +++ b/crates/core/src/operations/write.rs @@ -1,4 +1,3 @@ -//! Used to write [RecordBatch]es into a delta table. //! //! New Table Semantics //! - The schema of the [RecordBatch] is used to initialize the table. @@ -26,33 +25,43 @@ //! 
```` use std::collections::HashMap; +use std::str::FromStr; use std::sync::Arc; use std::time::{SystemTime, UNIX_EPOCH}; +use std::{iter, vec}; use arrow_array::RecordBatch; use arrow_cast::can_cast_types; -use arrow_schema::{DataType, Fields, SchemaRef as ArrowSchemaRef}; +use arrow_schema::{ArrowError, DataType, Fields, SchemaRef as ArrowSchemaRef}; use datafusion::execution::context::{SessionContext, SessionState, TaskContext}; -use datafusion::physical_expr::create_physical_expr; use datafusion::physical_plan::filter::FilterExec; +use datafusion::physical_plan::projection::ProjectionExec; use datafusion::physical_plan::{memory::MemoryExec, ExecutionPlan}; -use datafusion_common::DFSchema; -use datafusion_expr::Expr; +use datafusion_common::{DFSchema, ScalarValue}; +use datafusion_expr::{lit, Expr}; +use datafusion_physical_expr::expressions::{self}; +use datafusion_physical_expr::PhysicalExpr; use futures::future::BoxFuture; use futures::StreamExt; +use object_store::prefix::PrefixStore; use parquet::file::properties::WriterProperties; +use tracing::log::*; +use super::cdc::should_write_cdc; use super::datafusion_utils::Expression; -use super::transaction::PROTOCOL; +use super::transaction::{CommitBuilder, CommitProperties, TableReference, PROTOCOL}; use super::writer::{DeltaWriter, WriterConfig}; -use super::{transaction::commit, CreateBuilder}; +use super::CreateBuilder; use crate::delta_datafusion::expr::fmt_expr_to_sql; use crate::delta_datafusion::expr::parse_predicate_expression; -use crate::delta_datafusion::DeltaDataChecker; -use crate::delta_datafusion::{find_files, register_store, DeltaScanBuilder}; +use crate::delta_datafusion::{ + find_files, register_store, DeltaScanBuilder, DeltaScanConfigBuilder, +}; +use crate::delta_datafusion::{DataFusionMixins, DeltaDataChecker}; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::kernel::{Action, Add, PartitionsExt, Remove, StructType}; +use crate::kernel::{Action, Add, AddCDCFile, Metadata, PartitionsExt, Remove, StructType}; use crate::logstore::LogStoreRef; +use crate::operations::cast::{cast_record_batch, merge_schema}; use crate::protocol::{DeltaOperation, SaveMode}; use crate::storage::ObjectStoreRef; use crate::table::state::DeltaTableState; @@ -60,6 +69,8 @@ use crate::table::Constraint as DeltaConstraint; use crate::writer::record_batch::divide_by_partition_values; use crate::DeltaTable; +use tokio::sync::mpsc::Sender; + #[derive(thiserror::Error, Debug)] enum WriteError { #[error("No data source supplied to write command.")] @@ -88,6 +99,30 @@ impl From for DeltaTableError { } } +///Specifies how to handle schema drifts +#[derive(PartialEq, Clone, Copy)] +pub enum SchemaMode { + /// Overwrite the schema with the new schema + Overwrite, + /// Append the new schema to the existing schema + Merge, +} + +impl FromStr for SchemaMode { + type Err = DeltaTableError; + + fn from_str(s: &str) -> DeltaResult { + match s.to_ascii_lowercase().as_str() { + "overwrite" => Ok(SchemaMode::Overwrite), + "merge" => Ok(SchemaMode::Merge), + _ => Err(DeltaTableError::Generic(format!( + "Invalid schema write mode provided: {}, only these are supported: ['overwrite', 'merge']", + s + ))), + } + } +} + /// Write data into a DeltaTable pub struct WriteBuilder { /// A snapshot of the to-be-loaded table's state @@ -110,14 +145,14 @@ pub struct WriteBuilder { write_batch_size: Option, /// RecordBatches to be written into the table batches: Option>, - /// whether to overwrite the schema - overwrite_schema: bool, + /// whether to overwrite 
the schema or to merge it. None means to fail on schmema drift + schema_mode: Option, /// how to handle cast failures, either return NULL (safe=true) or return ERR (safe=false) safe_cast: bool, /// Parquet writer properties writer_properties: Option, - /// Additional metadata to be added to commit - app_metadata: Option>, + /// Additional information to add to the commit + commit_properties: CommitProperties, /// Name of the table, only used when table doesn't exist yet name: Option, /// Description of the table, only used when table doesn't exist yet @@ -126,6 +161,8 @@ pub struct WriteBuilder { configuration: HashMap>, } +impl super::Operation<()> for WriteBuilder {} + impl WriteBuilder { /// Create a new [`WriteBuilder`] pub fn new(log_store: LogStoreRef, snapshot: Option) -> Self { @@ -141,9 +178,9 @@ impl WriteBuilder { write_batch_size: None, batches: None, safe_cast: false, - overwrite_schema: false, + schema_mode: None, writer_properties: None, - app_metadata: None, + commit_properties: CommitProperties::default(), name: None, description: None, configuration: Default::default(), @@ -156,9 +193,9 @@ impl WriteBuilder { self } - /// Add overwrite_schema - pub fn with_overwrite_schema(mut self, overwrite_schema: bool) -> Self { - self.overwrite_schema = overwrite_schema; + /// Add Schema Write Mode + pub fn with_schema_mode(mut self, schema_mode: SchemaMode) -> Self { + self.schema_mode = Some(schema_mode); self } @@ -222,11 +259,8 @@ impl WriteBuilder { } /// Additional metadata to be added to commit info - pub fn with_metadata( - mut self, - metadata: impl IntoIterator, - ) -> Self { - self.app_metadata = Some(HashMap::from_iter(metadata)); + pub fn with_commit_properties(mut self, commit_properties: CommitProperties) -> Self { + self.commit_properties = commit_properties; self } @@ -259,6 +293,21 @@ impl WriteBuilder { match &self.snapshot { Some(snapshot) => { PROTOCOL.can_write_to(snapshot)?; + + let schema: StructType = if let Some(plan) = &self.input { + (plan.schema()).try_into()? + } else if let Some(batches) = &self.batches { + if batches.is_empty() { + return Err(WriteError::MissingData.into()); + } + (batches[0].schema()).try_into()? 
+ } else { + return Err(WriteError::MissingData.into()); + }; + + if self.schema_mode.is_none() { + PROTOCOL.check_can_write_timestamp_ntz(snapshot, &schema)?; + } match self.mode { SaveMode::ErrorIfExists => { Err(WriteError::AlreadyExists(self.log_store.root_uri()).into()) @@ -279,7 +328,7 @@ impl WriteBuilder { }?; let mut builder = CreateBuilder::new() .with_log_store(self.log_store.clone()) - .with_columns(schema.fields().clone()) + .with_columns(schema.fields().cloned()) .with_configuration(self.configuration.clone()); if let Some(partition_columns) = self.partition_columns.as_ref() { builder = builder.with_partition_columns(partition_columns.clone()) @@ -299,6 +348,24 @@ impl WriteBuilder { } } } +/// Configuration for the writer on how to collect stats +#[derive(Clone)] +pub struct WriterStatsConfig { + /// Number of columns to collect stats for, idx based + num_indexed_cols: i32, + /// Optional list of columns which to collect stats for, takes precedende over num_index_cols + stats_columns: Option>, +} + +impl WriterStatsConfig { + /// Create new writer stats config + pub fn new(num_indexed_cols: i32, stats_columns: Option>) -> Self { + Self { + num_indexed_cols, + stats_columns, + } + } +} #[allow(clippy::too_many_arguments)] async fn write_execution_plan_with_predicate( @@ -312,17 +379,17 @@ async fn write_execution_plan_with_predicate( write_batch_size: Option, writer_properties: Option, safe_cast: bool, - overwrite_schema: bool, -) -> DeltaResult> { - // Use input schema to prevent wrapping partitions columns into a dictionary. - let schema: ArrowSchemaRef = if overwrite_schema { + schema_mode: Option, + writer_stats_config: WriterStatsConfig, + sender: Option>, +) -> DeltaResult> { + let schema: ArrowSchemaRef = if schema_mode.is_some() { plan.schema() } else { snapshot .and_then(|s| s.input_schema().ok()) .unwrap_or(plan.schema()) }; - let checker = if let Some(snapshot) = snapshot { DeltaDataChecker::new(snapshot) } else { @@ -339,7 +406,7 @@ async fn write_execution_plan_with_predicate( // Write data to disk let mut tasks = vec![]; - for i in 0..plan.output_partitioning().partition_count() { + for i in 0..plan.properties().output_partitioning().partition_count() { let inner_plan = plan.clone(); let inner_schema = schema.clone(); let task_ctx = Arc::new(TaskContext::from(&state)); @@ -349,27 +416,48 @@ async fn write_execution_plan_with_predicate( writer_properties.clone(), target_file_size, write_batch_size, + writer_stats_config.num_indexed_cols, + writer_stats_config.stats_columns.clone(), ); let mut writer = DeltaWriter::new(object_store.clone(), config); let checker_stream = checker.clone(); + let sender_stream = sender.clone(); let mut stream = inner_plan.execute(i, task_ctx)?; - let handle: tokio::task::JoinHandle>> = - tokio::task::spawn(async move { + + let handle: tokio::task::JoinHandle>> = tokio::task::spawn( + async move { + let sendable = sender_stream.clone(); while let Some(maybe_batch) = stream.next().await { let batch = maybe_batch?; + checker_stream.check_batch(&batch).await?; - let arr = - super::cast::cast_record_batch(&batch, inner_schema.clone(), safe_cast)?; + let arr = super::cast::cast_record_batch( + &batch, + inner_schema.clone(), + safe_cast, + schema_mode == Some(SchemaMode::Merge), + )?; + + if let Some(s) = sendable.as_ref() { + if let Err(e) = s.send(arr.clone()).await { + error!("Failed to send data to observer: {e:#?}"); + } + } else { + debug!("write_execution_plan_with_predicate did not send any batches, no sender."); + } 
writer.write(&arr).await?; } - writer.close().await - }); + let add_actions = writer.close().await; + match add_actions { + Ok(actions) => Ok(actions.into_iter().map(Action::Add).collect::>()), + Err(err) => Err(err), + } + }, + ); tasks.push(handle); } - - // Collect add actions to add to commit - Ok(futures::future::join_all(tasks) + let actions = futures::future::join_all(tasks) .await .into_iter() .collect::, _>>() @@ -378,7 +466,64 @@ async fn write_execution_plan_with_predicate( .collect::, _>>()? .concat() .into_iter() - .collect::>()) + .collect::>(); + // Collect add actions to add to commit + Ok(actions) +} + +#[allow(clippy::too_many_arguments)] +pub(crate) async fn write_execution_plan_cdc( + snapshot: Option<&DeltaTableState>, + state: SessionState, + plan: Arc, + partition_columns: Vec, + object_store: ObjectStoreRef, + target_file_size: Option, + write_batch_size: Option, + writer_properties: Option, + safe_cast: bool, + writer_stats_config: WriterStatsConfig, + sender: Option>, +) -> DeltaResult> { + let cdc_store = Arc::new(PrefixStore::new(object_store, "_change_data")); + + // If not overwrite, the plan schema is not taken but table schema, + // however we need the plan schema since it has the _change_type_col + let schema_mode = Some(SchemaMode::Overwrite); + Ok(write_execution_plan( + snapshot, + state, + plan, + partition_columns, + cdc_store, + target_file_size, + write_batch_size, + writer_properties, + safe_cast, + schema_mode, + writer_stats_config, + sender, + ) + .await? + .into_iter() + .map(|add| { + // Modify add actions into CDC actions + match add { + Action::Add(add) => { + Action::Cdc(AddCDCFile { + // This is a gnarly hack, but the action needs the nested path, not the + // path isnide the prefixed store + path: format!("_change_data/{}", add.path), + size: add.size, + partition_values: add.partition_values, + data_change: false, + tags: add.tags, + }) + } + _ => panic!("Expected Add action"), + } + }) + .collect::>()) } #[allow(clippy::too_many_arguments)] @@ -392,8 +537,10 @@ pub(crate) async fn write_execution_plan( write_batch_size: Option, writer_properties: Option, safe_cast: bool, - overwrite_schema: bool, -) -> DeltaResult> { + schema_mode: Option, + writer_stats_config: WriterStatsConfig, + sender: Option>, +) -> DeltaResult> { write_execution_plan_with_predicate( None, snapshot, @@ -405,11 +552,14 @@ pub(crate) async fn write_execution_plan( write_batch_size, writer_properties, safe_cast, - overwrite_schema, + schema_mode, + writer_stats_config, + sender, ) .await } +#[allow(clippy::too_many_arguments)] async fn execute_non_empty_expr( snapshot: &DeltaTableState, log_store: LogStoreRef, @@ -418,49 +568,143 @@ async fn execute_non_empty_expr( expression: &Expr, rewrite: &[Add], writer_properties: Option, -) -> DeltaResult> { + writer_stats_config: WriterStatsConfig, + partition_scan: bool, +) -> DeltaResult> { // For each identified file perform a parquet scan + filter + limit (1) + count. // If returned count is not zero then append the file to be rewritten and removed from the log. Otherwise do nothing to the file. + let mut actions: Vec = Vec::new(); let input_schema = snapshot.input_schema()?; let input_dfschema: DFSchema = input_schema.clone().as_ref().clone().try_into()?; + let scan_config = DeltaScanConfigBuilder::new() + .with_schema(snapshot.input_schema()?) 
+ .build(snapshot)?; + let scan = DeltaScanBuilder::new(snapshot, log_store.clone(), &state) .with_files(rewrite) + // Use input schema which doesn't wrap partition values, otherwise divide_by_partition_value won't work on UTF8 partitions + // Since it can't fetch a scalar from a dictionary type + .with_scan_config(scan_config) .build() .await?; let scan = Arc::new(scan); - // Apply the negation of the filter and rewrite files - let negated_expression = Expr::Not(Box::new(Expr::IsTrue(Box::new(expression.clone())))); + // We don't want to verify the predicate against existing data + if !partition_scan { + // Apply the negation of the filter and rewrite files + let negated_expression = Expr::Not(Box::new(Expr::IsTrue(Box::new(expression.clone())))); + + let predicate_expr = state.create_physical_expr(negated_expression, &input_dfschema)?; + let filter: Arc = + Arc::new(FilterExec::try_new(predicate_expr, scan.clone())?); + + let add_actions: Vec = write_execution_plan( + Some(snapshot), + state.clone(), + filter, + partition_columns.clone(), + log_store.object_store(), + Some(snapshot.table_config().target_file_size() as usize), + None, + writer_properties.clone(), + false, + None, + writer_stats_config.clone(), + None, + ) + .await?; - let predicate_expr = create_physical_expr( - &negated_expression, - &input_dfschema, - state.execution_props(), - )?; - let filter: Arc = - Arc::new(FilterExec::try_new(predicate_expr, scan.clone())?); + actions.extend(add_actions); + } - // We don't want to verify the predicate against existing data - let add_actions = write_execution_plan( - Some(snapshot), - state, - filter, + // CDC logic, simply filters data with predicate and adds the _change_type="delete" as literal column + if let Some(cdc_actions) = execute_non_empty_expr_cdc( + snapshot, + log_store, + state.clone(), + scan, + input_dfschema, + expression, partition_columns, - log_store.object_store(), - Some(snapshot.table_config().target_file_size() as usize), - None, writer_properties, - false, - false, + writer_stats_config, ) - .await?; + .await? 
+ { + actions.extend(cdc_actions) + } + Ok(actions) +} - Ok(add_actions) +/// If CDC is enabled it writes all the deletions based on predicate into _change_data directory +#[allow(clippy::too_many_arguments)] +pub(crate) async fn execute_non_empty_expr_cdc( + snapshot: &DeltaTableState, + log_store: LogStoreRef, + state: SessionState, + scan: Arc, + input_dfschema: DFSchema, + expression: &Expr, + table_partition_cols: Vec, + writer_properties: Option, + writer_stats_config: WriterStatsConfig, +) -> DeltaResult>> { + match should_write_cdc(snapshot) { + // Create CDC scan + Ok(true) => { + let cdc_predicate_expr = + state.create_physical_expr(expression.clone(), &input_dfschema)?; + let cdc_scan: Arc = + Arc::new(FilterExec::try_new(cdc_predicate_expr, scan.clone())?); + + // Add literal column "_change_type" + let change_type_lit = lit(ScalarValue::Utf8(Some("delete".to_string()))); + let change_type_expr = state.create_physical_expr(change_type_lit, &input_dfschema)?; + + // Project columns and lit + let project_expressions: Vec<(Arc, String)> = scan + .schema() + .fields() + .into_iter() + .enumerate() + .map(|(idx, field)| -> (Arc, String) { + ( + Arc::new(expressions::Column::new(field.name(), idx)), + field.name().to_owned(), + ) + }) + .chain(iter::once((change_type_expr, "_change_type".to_owned()))) + .collect(); + + let projected_scan: Arc = Arc::new(ProjectionExec::try_new( + project_expressions, + cdc_scan.clone(), + )?); + + let cdc_actions = write_execution_plan_cdc( + Some(snapshot), + state.clone(), + projected_scan.clone(), + table_partition_cols.clone(), + log_store.object_store(), + Some(snapshot.table_config().target_file_size() as usize), + None, + writer_properties, + false, + writer_stats_config, + None, + ) + .await?; + Ok(Some(cdc_actions)) + } + _ => Ok(None), + } } // This should only be called wth a valid predicate +#[allow(clippy::too_many_arguments)] async fn prepare_predicate_actions( predicate: Expr, log_store: LogStoreRef, @@ -469,27 +713,25 @@ async fn prepare_predicate_actions( partition_columns: Vec, writer_properties: Option, deletion_timestamp: i64, + writer_stats_config: WriterStatsConfig, ) -> DeltaResult> { let candidates = find_files(snapshot, log_store.clone(), &state, Some(predicate.clone())).await?; - let add = if candidates.partition_scan { - Vec::new() - } else { - execute_non_empty_expr( - snapshot, - log_store, - state, - partition_columns, - &predicate, - &candidates.candidates, - writer_properties, - ) - .await? - }; - let remove = candidates.candidates; + let mut actions = execute_non_empty_expr( + snapshot, + log_store, + state, + partition_columns, + &predicate, + &candidates.candidates, + writer_properties, + writer_stats_config, + candidates.partition_scan, + ) + .await?; - let mut actions: Vec = add.into_iter().map(Action::Add).collect(); + let remove = candidates.candidates; for action in remove { actions.push(Action::Remove(Remove { @@ -508,6 +750,47 @@ async fn prepare_predicate_actions( Ok(actions) } +/// If CDC is enabled it writes all add add actions data as deletions into _change_data directory +async fn execute_non_empty_expr_cdc_all_actions( + snapshot: &DeltaTableState, + log_store: LogStoreRef, + state: SessionState, + table_partition_cols: Vec, + writer_properties: Option, + writer_stats_config: WriterStatsConfig, +) -> DeltaResult>> { + let current_state_add_actions = &snapshot.file_actions()?; + + let scan_config = DeltaScanConfigBuilder::new() + .with_schema(snapshot.input_schema()?) 
+ .build(snapshot)?; + + // Since all files get removed, check to write CDC + let scan = DeltaScanBuilder::new(snapshot, log_store.clone(), &state) + .with_files(current_state_add_actions) + // Use input schema which doesn't wrap partition values, otherwise divide_by_partition_value won't work on UTF8 partitions + // Since it can't fetch a scalar from a dictionary type + .with_scan_config(scan_config) + .build() + .await?; + + let input_schema = snapshot.input_schema()?; + let input_dfschema: DFSchema = input_schema.clone().as_ref().clone().try_into()?; + + execute_non_empty_expr_cdc( + snapshot, + log_store, + state, + scan.into(), + input_dfschema, + &Expr::Literal(ScalarValue::Boolean(Some(true))), // Keep all data + table_partition_cols, + writer_properties, + writer_stats_config, + ) + .await +} + impl std::future::IntoFuture for WriteBuilder { type Output = DeltaResult; type IntoFuture = BoxFuture<'static, Self::Output>; @@ -518,9 +801,14 @@ impl std::future::IntoFuture for WriteBuilder { Box::pin(async move { if this.mode == SaveMode::Overwrite { if let Some(snapshot) = &this.snapshot { - PROTOCOL.check_append_only(snapshot)?; + PROTOCOL.check_append_only(&snapshot.snapshot)?; } } + if this.schema_mode == Some(SchemaMode::Overwrite) && this.mode != SaveMode::Overwrite { + return Err(DeltaTableError::Generic( + "Schema overwrite not supported for Append".to_string(), + )); + } // Create table actions to initialize table in case it does not yet exist and should be created let mut actions = this.check_preconditions().await?; @@ -547,8 +835,13 @@ impl std::future::IntoFuture for WriteBuilder { } else { Ok(this.partition_columns.unwrap_or_default()) }?; - + let mut schema_drift = false; let plan = if let Some(plan) = this.input { + if this.schema_mode == Some(SchemaMode::Merge) { + return Err(DeltaTableError::Generic( + "Schema merge not supported yet for Datafusion".to_string(), + )); + } Ok(plan) } else if let Some(batches) = this.batches { if batches.is_empty() { @@ -556,6 +849,7 @@ impl std::future::IntoFuture for WriteBuilder { } else { let schema = batches[0].schema(); + let mut new_schema = None; if let Some(snapshot) = &this.snapshot { let table_schema = snapshot .physical_arrow_schema(this.log_store.object_store().clone()) @@ -563,23 +857,42 @@ impl std::future::IntoFuture for WriteBuilder { .or_else(|_| snapshot.arrow_schema()) .unwrap_or(schema.clone()); - if !can_cast_batch(schema.fields(), table_schema.fields()) - && !(this.overwrite_schema && matches!(this.mode, SaveMode::Overwrite)) + if let Err(schema_err) = + try_cast_batch(schema.fields(), table_schema.fields()) { - return Err(DeltaTableError::Generic( - "Schema of data does not match table schema".to_string(), - )); - }; + schema_drift = true; + if this.mode == SaveMode::Overwrite + && this.schema_mode == Some(SchemaMode::Merge) + { + new_schema = + Some(merge_schema(table_schema.clone(), schema.clone())?); + } else if this.mode == SaveMode::Overwrite && this.schema_mode.is_some() + { + new_schema = None // we overwrite anyway, so no need to cast + } else if this.schema_mode == Some(SchemaMode::Merge) { + new_schema = + Some(merge_schema(table_schema.clone(), schema.clone())?); + } else { + return Err(schema_err.into()); + } + } } let data = if !partition_columns.is_empty() { // TODO partitioning should probably happen in its own plan ... 
let mut partitions: HashMap> = HashMap::new(); for batch in batches { + let real_batch = match new_schema.clone() { + Some(new_schema) => { + cast_record_batch(&batch, new_schema, false, true)? + } + None => batch, + }; + let divided = divide_by_partition_values( - schema.clone(), + new_schema.clone().unwrap_or(schema.clone()), partition_columns.clone(), - &batch, + &real_batch, )?; for part in divided { let key = part.partition_values.hive_partition_path(); @@ -595,17 +908,70 @@ impl std::future::IntoFuture for WriteBuilder { } partitions.into_values().collect::>() } else { - vec![batches] + match new_schema { + Some(ref new_schema) => { + let mut new_batches = vec![]; + for batch in batches { + new_batches.push(cast_record_batch( + &batch, + new_schema.clone(), + false, + true, + )?); + } + vec![new_batches] + } + None => vec![batches], + } }; - Ok(Arc::new(MemoryExec::try_new(&data, schema.clone(), None)?) - as Arc) + Ok(Arc::new(MemoryExec::try_new( + &data, + new_schema.unwrap_or(schema).clone(), + None, + )?) as Arc) } } else { Err(WriteError::MissingData) }?; let schema = plan.schema(); - + if this.schema_mode == Some(SchemaMode::Merge) && schema_drift { + if let Some(snapshot) = &this.snapshot { + let schema_struct: StructType = schema.clone().try_into()?; + let current_protocol = snapshot.protocol(); + let configuration = snapshot.metadata().configuration.clone(); + let maybe_new_protocol = if PROTOCOL + .contains_timestampntz(schema_struct.fields()) + && !current_protocol + .reader_features + .clone() + .unwrap_or_default() + .contains(&crate::kernel::ReaderFeatures::TimestampWithoutTimezone) + // We can check only reader features, as reader and writer timestampNtz + // should be always enabled together + { + let new_protocol = current_protocol.clone().enable_timestamp_ntz(); + if !(current_protocol.min_reader_version == 3 + && current_protocol.min_writer_version == 7) + { + Some(new_protocol.move_table_properties_into_features(&configuration)) + } else { + Some(new_protocol) + } + } else { + None + }; + let schema_action = Action::Metadata(Metadata::try_new( + schema_struct, + partition_columns.clone(), + configuration, + )?); + actions.push(schema_action); + if let Some(new_protocol) = maybe_new_protocol { + actions.push(new_protocol.into()) + } + } + } let state = match this.state { Some(state) => state, None => { @@ -630,6 +996,18 @@ impl std::future::IntoFuture for WriteBuilder { _ => (None, None), }; + let config: Option> = this + .snapshot + .as_ref() + .map(|snapshot| snapshot.table_config()); + + let (num_indexed_cols, stats_columns) = + super::get_num_idx_cols_and_stats_columns(config, this.configuration); + + let writer_stats_config = WriterStatsConfig { + num_indexed_cols, + stats_columns, + }; // Here we need to validate if the new data conforms to a predicate if one is provided let add_actions = write_execution_plan_with_predicate( predicate.clone(), @@ -642,10 +1020,12 @@ impl std::future::IntoFuture for WriteBuilder { this.write_batch_size, this.writer_properties.clone(), this.safe_cast, - this.overwrite_schema, + this.schema_mode, + writer_stats_config.clone(), + None, ) .await?; - actions.extend(add_actions.into_iter().map(Action::Add)); + actions.extend(add_actions); // Collect remove actions if we are overwriting the table if let Some(snapshot) = &this.snapshot { @@ -657,6 +1037,34 @@ impl std::future::IntoFuture for WriteBuilder { .or_else(|_| snapshot.arrow_schema()) .unwrap_or(schema.clone()); + let configuration = snapshot.metadata().configuration.clone(); 
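+                    // If the incoming data introduces timestamp-without-timezone columns and the
+                    // current protocol does not yet advertise the TimestampWithoutTimezone reader
+                    // feature, upgrade the table protocol before building the overwrite commit.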
+ let current_protocol = snapshot.protocol(); + let maybe_new_protocol = if PROTOCOL.contains_timestampntz( + TryInto::::try_into(schema.clone())?.fields(), + ) && !current_protocol + .reader_features + .clone() + .unwrap_or_default() + .contains(&crate::kernel::ReaderFeatures::TimestampWithoutTimezone) + // We can check only reader features, as reader and writer timestampNtz + // should be always enabled together + { + let new_protocol = current_protocol.clone().enable_timestamp_ntz(); + if !(current_protocol.min_reader_version == 3 + && current_protocol.min_writer_version == 7) + { + Some(new_protocol.move_table_properties_into_features(&configuration)) + } else { + Some(new_protocol) + } + } else { + None + }; + + if let Some(protocol) = maybe_new_protocol { + actions.push(protocol.into()) + } + if schema != table_schema { let mut metadata = snapshot.metadata().clone(); let delta_schema: StructType = schema.as_ref().try_into()?; @@ -679,6 +1087,7 @@ impl std::future::IntoFuture for WriteBuilder { partition_columns.clone(), this.writer_properties, deletion_timestamp, + writer_stats_config, ) .await?; if !predicate_actions.is_empty() { @@ -691,6 +1100,21 @@ impl std::future::IntoFuture for WriteBuilder { .into_iter() .map(|p| p.remove_action(true).into()); actions.extend(remove_actions); + + let cdc_actions = execute_non_empty_expr_cdc_all_actions( + snapshot, + this.log_store.clone(), + state, + partition_columns.clone(), + this.writer_properties, + writer_stats_config, + ) + .await?; + + // ADD CDC ACTIONS HERE + if let Some(cdc_actions) = cdc_actions { + actions.extend(cdc_actions); + } } }; } @@ -706,48 +1130,83 @@ impl std::future::IntoFuture for WriteBuilder { predicate: predicate_str, }; - let version = commit( - this.log_store.as_ref(), - &actions, - operation.clone(), - this.snapshot.as_ref(), - this.app_metadata, - ) - .await?; + let commit = CommitBuilder::from(this.commit_properties) + .with_actions(actions) + .build( + this.snapshot.as_ref().map(|f| f as &dyn TableReference), + this.log_store.clone(), + operation.clone(), + ) + .await?; - // TODO we do not have the table config available, but since we are merging only our newly - // created actions, it may be safe to assume, that we want to include all actions. - // then again, having only some tombstones may be misleading. 
- if let Some(mut snapshot) = this.snapshot { - snapshot.merge(actions, &operation, version)?; - Ok(DeltaTable::new_with_state(this.log_store, snapshot)) - } else { - let mut table = DeltaTable::new(this.log_store, Default::default()); - table.update().await?; - Ok(table) - } + Ok(DeltaTable::new_with_state(this.log_store, commit.snapshot)) }) } } -fn can_cast_batch(from_fields: &Fields, to_fields: &Fields) -> bool { +fn try_cast_batch(from_fields: &Fields, to_fields: &Fields) -> Result<(), ArrowError> { if from_fields.len() != to_fields.len() { - return false; + return Err(ArrowError::SchemaError(format!( + "Cannot cast schema, number of fields does not match: {} vs {}", + from_fields.len(), + to_fields.len() + ))); } - from_fields.iter().all(|f| { - if let Some((_, target_field)) = to_fields.find(f.name()) { - if let (DataType::Struct(fields0), DataType::Struct(fields1)) = - (f.data_type(), target_field.data_type()) - { - can_cast_batch(fields0, fields1) + from_fields + .iter() + .map(|f| { + if let Some((_, target_field)) = to_fields.find(f.name()) { + if let (DataType::Struct(fields0), DataType::Struct(fields1)) = + (f.data_type(), target_field.data_type()) + { + try_cast_batch(fields0, fields1) + } else { + match (f.data_type(), target_field.data_type()) { + ( + DataType::Decimal128(left_precision, left_scale) | DataType::Decimal256(left_precision, left_scale), + DataType::Decimal128(right_precision, right_scale) + ) => { + if left_precision <= right_precision && left_scale <= right_scale { + Ok(()) + } else { + Err(ArrowError::SchemaError(format!( + "Cannot cast field {} from {} to {}", + f.name(), + f.data_type(), + target_field.data_type() + ))) + } + }, + ( + _, + DataType::Decimal256(_, _), + ) => { + unreachable!("Target field can never be Decimal 256. 
According to the protocol: 'The precision and scale can be up to 38.'") + }, + (left, right) => { + if !can_cast_types(left, right) { + Err(ArrowError::SchemaError(format!( + "Cannot cast field {} from {} to {}", + f.name(), + f.data_type(), + target_field.data_type() + ))) + } else { + Ok(()) + } + } + } + } } else { - can_cast_types(f.data_type(), target_field.data_type()) + Err(ArrowError::SchemaError(format!( + "Field {} not found in schema", + f.name() + ))) } - } else { - false - } - }) + }) + .collect::, _>>()?; + Ok(()) } #[cfg(test)] @@ -755,17 +1214,14 @@ mod tests { use super::*; use crate::operations::{collect_sendable_stream, DeltaOps}; use crate::protocol::SaveMode; - use crate::writer::test_utils::datafusion::write_batch; - use crate::writer::test_utils::datafusion::{get_data, get_data_sorted}; + use crate::writer::test_utils::datafusion::{get_data, get_data_sorted, write_batch}; use crate::writer::test_utils::{ get_arrow_schema, get_delta_schema, get_delta_schema_with_nested_struct, get_record_batch, get_record_batch_with_nested_struct, setup_table_with_configuration, }; use crate::DeltaConfigKey; - use arrow::datatypes::Field; - use arrow::datatypes::Schema as ArrowSchema; use arrow_array::{Int32Array, StringArray, TimestampMicrosecondArray}; - use arrow_schema::{DataType, TimeUnit}; + use arrow_schema::{DataType, Field, Schema as ArrowSchema, TimeUnit}; use datafusion::prelude::*; use datafusion::{assert_batches_eq, assert_batches_sorted_eq}; use serde_json::{json, Value}; @@ -791,7 +1247,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() - .with_columns(table_schema.fields().clone()) + .with_columns(table_schema.fields().cloned()) .await .unwrap(); assert_eq!(table.version(), 0); @@ -802,7 +1258,7 @@ mod tests { let mut table = DeltaOps(table) .write(vec![batch.clone()]) .with_save_mode(SaveMode::Append) - .with_metadata(metadata.clone()) + .with_commit_properties(CommitProperties::default().with_metadata(metadata.clone())) .await .unwrap(); assert_eq!(table.version(), 1); @@ -825,7 +1281,7 @@ mod tests { let mut table = DeltaOps(table) .write(vec![batch.clone()]) .with_save_mode(SaveMode::Append) - .with_metadata(metadata.clone()) + .with_commit_properties(CommitProperties::default().with_metadata(metadata.clone())) .await .unwrap(); assert_eq!(table.version(), 2); @@ -848,7 +1304,7 @@ mod tests { let mut table = DeltaOps(table) .write(vec![batch]) .with_save_mode(SaveMode::Overwrite) - .with_metadata(metadata.clone()) + .with_commit_properties(CommitProperties::default().with_metadata(metadata.clone())) .await .unwrap(); assert_eq!(table.version(), 3); @@ -942,23 +1398,25 @@ mod tests { let schema = Arc::new(ArrowSchema::new(vec![Field::new( "value", - DataType::Timestamp(TimeUnit::Microsecond, None), + DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".to_string().into())), true, )])); let batch = RecordBatch::try_new( Arc::clone(&schema), - vec![Arc::new(TimestampMicrosecondArray::from(vec![Some(10000)]))], + vec![Arc::new( + TimestampMicrosecondArray::from(vec![Some(10000)]).with_timezone("UTC"), + )], ) .unwrap(); let _res = DeltaOps::from(table).write(vec![batch]).await.unwrap(); let expected = [ - "+-------------------------+", - "| value |", - "+-------------------------+", - "| 1970-01-01T00:00:00.010 |", - "| 2023-06-03 15:35:00 |", - "+-------------------------+", + "+--------------------------+", + "| value |", + "+--------------------------+", + "| 1970-01-01T00:00:00.010Z |", + "| 2023-06-03 15:35:00 |", + "+--------------------------+", 
]; let actual = get_data(&_res).await; assert_batches_sorted_eq!(&expected, &actual); @@ -998,6 +1456,218 @@ mod tests { assert_eq!(table.get_files_count(), 4) } + #[tokio::test] + async fn test_merge_schema() { + let batch = get_record_batch(None, false); + let table = DeltaOps::new_in_memory() + .write(vec![batch.clone()]) + .with_save_mode(SaveMode::ErrorIfExists) + .await + .unwrap(); + assert_eq!(table.version(), 0); + + let mut new_schema_builder = arrow_schema::SchemaBuilder::new(); + for field in batch.schema().fields() { + if field.name() != "modified" { + new_schema_builder.push(field.clone()); + } + } + new_schema_builder.push(Field::new("inserted_by", DataType::Utf8, true)); + let new_schema = new_schema_builder.finish(); + let new_fields = new_schema.fields(); + let new_names = new_fields.iter().map(|f| f.name()).collect::>(); + assert_eq!(new_names, vec!["id", "value", "inserted_by"]); + let inserted_by = StringArray::from(vec![ + Some("A1"), + Some("B1"), + None, + Some("B2"), + Some("A3"), + Some("A4"), + None, + None, + Some("B4"), + Some("A5"), + Some("A7"), + ]); + let new_batch = RecordBatch::try_new( + Arc::new(new_schema), + vec![ + Arc::new(batch.column_by_name("id").unwrap().clone()), + Arc::new(batch.column_by_name("value").unwrap().clone()), + Arc::new(inserted_by), + ], + ) + .unwrap(); + + let mut table = DeltaOps(table) + .write(vec![new_batch]) + .with_save_mode(SaveMode::Append) + .with_schema_mode(SchemaMode::Merge) + .await + .unwrap(); + table.load().await.unwrap(); + assert_eq!(table.version(), 1); + let new_schema = table.metadata().unwrap().schema().unwrap(); + let fields = new_schema.fields(); + let names = fields.map(|f| f.name()).collect::>(); + assert_eq!(names, vec!["id", "value", "modified", "inserted_by"]); + } + + #[tokio::test] + async fn test_merge_schema_with_partitions() { + let batch = get_record_batch(None, false); + let table = DeltaOps::new_in_memory() + .write(vec![batch.clone()]) + .with_partition_columns(vec!["id", "value"]) + .with_save_mode(SaveMode::ErrorIfExists) + .await + .unwrap(); + assert_eq!(table.version(), 0); + + let mut new_schema_builder = arrow_schema::SchemaBuilder::new(); + for field in batch.schema().fields() { + if field.name() != "modified" { + new_schema_builder.push(field.clone()); + } + } + new_schema_builder.push(Field::new("inserted_by", DataType::Utf8, true)); + let new_schema = new_schema_builder.finish(); + let new_fields = new_schema.fields(); + let new_names = new_fields.iter().map(|f| f.name()).collect::>(); + assert_eq!(new_names, vec!["id", "value", "inserted_by"]); + let inserted_by = StringArray::from(vec![ + Some("A1"), + Some("B1"), + None, + Some("B2"), + Some("A3"), + Some("A4"), + None, + None, + Some("B4"), + Some("A5"), + Some("A7"), + ]); + let new_batch = RecordBatch::try_new( + Arc::new(new_schema), + vec![ + Arc::new(batch.column_by_name("id").unwrap().clone()), + Arc::new(batch.column_by_name("value").unwrap().clone()), + Arc::new(inserted_by), + ], + ) + .unwrap(); + let table = DeltaOps(table) + .write(vec![new_batch]) + .with_save_mode(SaveMode::Append) + .with_schema_mode(SchemaMode::Merge) + .await + .unwrap(); + + assert_eq!(table.version(), 1); + let new_schema = table.metadata().unwrap().schema().unwrap(); + let fields = new_schema.fields(); + let mut names = fields.map(|f| f.name()).collect::>(); + names.sort(); + assert_eq!(names, vec!["id", "inserted_by", "modified", "value"]); + let part_cols = table.metadata().unwrap().partition_columns.clone(); + assert_eq!(part_cols, 
vec!["id", "value"]); // we want to preserve partitions + } + + #[tokio::test] + async fn test_overwrite_schema() { + let batch = get_record_batch(None, false); + let table = DeltaOps::new_in_memory() + .write(vec![batch.clone()]) + .with_save_mode(SaveMode::ErrorIfExists) + .await + .unwrap(); + assert_eq!(table.version(), 0); + + let mut new_schema_builder = arrow_schema::SchemaBuilder::new(); + for field in batch.schema().fields() { + if field.name() != "modified" { + new_schema_builder.push(field.clone()); + } + } + new_schema_builder.push(Field::new("inserted_by", DataType::Utf8, true)); + let new_schema = new_schema_builder.finish(); + let new_fields = new_schema.fields(); + let new_names = new_fields.iter().map(|f| f.name()).collect::>(); + assert_eq!(new_names, vec!["id", "value", "inserted_by"]); + let inserted_by = StringArray::from(vec![ + Some("A1"), + Some("B1"), + None, + Some("B2"), + Some("A3"), + Some("A4"), + None, + None, + Some("B4"), + Some("A5"), + Some("A7"), + ]); + let new_batch = RecordBatch::try_new( + Arc::new(new_schema), + vec![ + Arc::new(batch.column_by_name("id").unwrap().clone()), + Arc::new(batch.column_by_name("value").unwrap().clone()), + Arc::new(inserted_by), + ], + ) + .unwrap(); + + let table = DeltaOps(table) + .write(vec![new_batch]) + .with_save_mode(SaveMode::Append) + .with_schema_mode(SchemaMode::Overwrite) + .await; + assert!(table.is_err()); + } + + #[tokio::test] + async fn test_overwrite_check() { + // If you do not pass a schema mode, we want to check the schema + let batch = get_record_batch(None, false); + let table = DeltaOps::new_in_memory() + .write(vec![batch.clone()]) + .with_save_mode(SaveMode::ErrorIfExists) + .await + .unwrap(); + assert_eq!(table.version(), 0); + + let mut new_schema_builder = arrow_schema::SchemaBuilder::new(); + + new_schema_builder.push(Field::new("inserted_by", DataType::Utf8, true)); + let new_schema = new_schema_builder.finish(); + let new_fields = new_schema.fields(); + let new_names = new_fields.iter().map(|f| f.name()).collect::>(); + assert_eq!(new_names, vec!["inserted_by"]); + let inserted_by = StringArray::from(vec![ + Some("A1"), + Some("B1"), + None, + Some("B2"), + Some("A3"), + Some("A4"), + None, + None, + Some("B4"), + Some("A5"), + Some("A7"), + ]); + let new_batch = + RecordBatch::try_new(Arc::new(new_schema), vec![Arc::new(inserted_by)]).unwrap(); + + let table = DeltaOps(table) + .write(vec![new_batch]) + .with_save_mode(SaveMode::Append) + .await; + assert!(table.is_err()); + } + #[tokio::test] async fn test_check_invariants() { let batch = get_record_batch(None, false); @@ -1015,7 +1685,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() .with_save_mode(SaveMode::ErrorIfExists) - .with_columns(schema.fields().clone()) + .with_columns(schema.fields().cloned()) .await .unwrap(); assert_eq!(table.version(), 0); @@ -1037,7 +1707,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() .with_save_mode(SaveMode::ErrorIfExists) - .with_columns(schema.fields().clone()) + .with_columns(schema.fields().cloned()) .await .unwrap(); assert_eq!(table.version(), 0); @@ -1053,7 +1723,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() - .with_columns(table_schema.fields().clone()) + .with_columns(table_schema.fields().cloned()) .await .unwrap(); assert_eq!(table.version(), 0); diff --git a/crates/core/src/operations/writer.rs b/crates/core/src/operations/writer.rs index 5d8808fa3c..5128611ffd 100644 --- a/crates/core/src/operations/writer.rs +++ 
b/crates/core/src/operations/writer.rs @@ -1,11 +1,13 @@ //! Abstractions and implementations for writing data to delta tables -use std::collections::{BTreeMap, HashMap}; +use std::collections::HashMap; use arrow::datatypes::SchemaRef as ArrowSchemaRef; use arrow::error::ArrowError; use arrow::record_batch::RecordBatch; use bytes::Bytes; +use delta_kernel::expressions::Scalar; +use indexmap::IndexMap; use object_store::{path::Path, ObjectStore}; use parquet::arrow::ArrowWriter; use parquet::basic::Compression; @@ -14,7 +16,7 @@ use tracing::debug; use crate::crate_version; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::kernel::{Add, PartitionsExt, Scalar}; +use crate::kernel::{Add, PartitionsExt}; use crate::storage::ObjectStoreRef; use crate::writer::record_batch::{divide_by_partition_values, PartitionResult}; use crate::writer::stats::create_add; @@ -65,6 +67,7 @@ impl From for DeltaTableError { } /// Configuration to write data into Delta tables +#[derive(Debug)] pub struct WriterConfig { /// Schema of the delta table table_schema: ArrowSchemaRef, @@ -77,6 +80,10 @@ pub struct WriterConfig { /// Row chunks passed to parquet writer. This and the internal parquet writer settings /// determine how fine granular we can track / control the size of resulting files. write_batch_size: usize, + /// Num index cols to collect stats for + num_indexed_cols: i32, + /// Stats columns, specific columns to collect stats from, takes precedence over num_indexed_cols + stats_columns: Option>, } impl WriterConfig { @@ -87,6 +94,8 @@ impl WriterConfig { writer_properties: Option, target_file_size: Option, write_batch_size: Option, + num_indexed_cols: i32, + stats_columns: Option>, ) -> Self { let writer_properties = writer_properties.unwrap_or_else(|| { WriterProperties::builder() @@ -102,6 +111,8 @@ impl WriterConfig { writer_properties, target_file_size, write_batch_size, + num_indexed_cols, + stats_columns, } } @@ -111,6 +122,7 @@ impl WriterConfig { } } +#[derive(Debug)] /// A parquet writer implementation tailored to the needs of writing data to a delta table. pub struct DeltaWriter { /// An object store pointing at Delta table root @@ -155,7 +167,7 @@ impl DeltaWriter { pub async fn write_partition( &mut self, record_batch: RecordBatch, - partition_values: &BTreeMap, + partition_values: &IndexMap, ) -> DeltaResult<()> { let partition_key = Path::parse(partition_values.hive_partition_path())?; @@ -174,8 +186,12 @@ impl DeltaWriter { Some(self.config.target_file_size), Some(self.config.write_batch_size), )?; - let mut writer = - PartitionWriter::try_with_config(self.object_store.clone(), config)?; + let mut writer = PartitionWriter::try_with_config( + self.object_store.clone(), + config, + self.config.num_indexed_cols, + self.config.stats_columns.clone(), + )?; writer.write(&record_batch).await?; let _ = self.partition_writers.insert(partition_key, writer); } @@ -211,13 +227,15 @@ impl DeltaWriter { } } -pub(crate) struct PartitionWriterConfig { +/// Write configuration for partition writers +#[derive(Debug)] +pub struct PartitionWriterConfig { /// Schema of the data written to disk file_schema: ArrowSchemaRef, /// Prefix applied to all paths prefix: Path, /// Values for all partition columns - partition_values: BTreeMap, + partition_values: IndexMap, /// Properties passed to underlying parquet writer writer_properties: WriterProperties, /// Size above which we will write a buffered parquet file to disk. 
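Reviewer note: `WriterConfig` now carries the stats knobs end to end: `num_indexed_cols` caps how many leading columns have statistics collected, and `stats_columns`, when set, pins stats collection to an explicit column list and takes precedence. A sketch of the extended constructor call, mirroring the argument order used by the `get_delta_writer` test helper later in this file; the module paths and the `Option<Vec<String>>` spelling of `stats_columns` are assumptions inferred from context:

```rust
use arrow_schema::SchemaRef;
use deltalake_core::operations::writer::{DeltaWriter, WriterConfig};
use deltalake_core::storage::ObjectStoreRef;
use deltalake_core::table::config::DEFAULT_NUM_INDEX_COLS;

/// Build a writer that only collects statistics for the listed columns.
fn build_writer(
    schema: SchemaRef,
    partition_columns: Vec<String>,
    object_store: ObjectStoreRef,
) -> DeltaWriter {
    let config = WriterConfig::new(
        schema,
        partition_columns,
        None,                   // default parquet WriterProperties
        None,                   // default target file size
        None,                   // default write batch size
        DEFAULT_NUM_INDEX_COLS, // fall back to the first N columns...
        Some(vec!["id".to_string(), "value".to_string()]), // ...unless columns are pinned
    );
    DeltaWriter::new(object_store, config)
}
```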
@@ -228,9 +246,10 @@ pub(crate) struct PartitionWriterConfig { } impl PartitionWriterConfig { + /// Create a new instance of [PartitionWriterConfig] pub fn try_new( file_schema: ArrowSchemaRef, - partition_values: BTreeMap, + partition_values: IndexMap, writer_properties: Option, target_file_size: Option, write_batch_size: Option, @@ -256,7 +275,12 @@ impl PartitionWriterConfig { } } -pub(crate) struct PartitionWriter { +/// Partition writer implementation +/// This writer takes in table data as RecordBatches and writes it out to partitioned parquet files. +/// It buffers data in memory until it reaches a certain size, then writes it out to optimize file sizes. +/// When you complete writing you get back a list of Add actions that can be used to update the Delta table commit log. +#[derive(Debug)] +pub struct PartitionWriter { object_store: ObjectStoreRef, writer_id: uuid::Uuid, config: PartitionWriterConfig, @@ -264,6 +288,10 @@ pub(crate) struct PartitionWriter { arrow_writer: ArrowWriter, part_counter: usize, files_written: Vec, + /// Num index cols to collect stats for + num_indexed_cols: i32, + /// Stats columns, specific columns to collect stats from, takes precedence over num_indexed_cols + stats_columns: Option>, } impl PartitionWriter { @@ -271,6 +299,8 @@ impl PartitionWriter { pub fn try_with_config( object_store: ObjectStoreRef, config: PartitionWriterConfig, + num_indexed_cols: i32, + stats_columns: Option>, ) -> DeltaResult { let buffer = ShareableBuffer::default(); let arrow_writer = ArrowWriter::try_new( @@ -287,6 +317,8 @@ impl PartitionWriter { arrow_writer, part_counter: 0, files_written: Vec::new(), + num_indexed_cols, + stats_columns, }) } @@ -337,13 +369,15 @@ impl PartitionWriter { let file_size = buffer.len() as i64; // write file to object store - self.object_store.put(&path, buffer).await?; + self.object_store.put(&path, buffer.into()).await?; self.files_written.push( create_add( &self.config.partition_values, path.to_string(), file_size, &metadata, + self.num_indexed_cols, + &self.stats_columns, ) .map_err(|err| WriteError::CreateAdd { source: Box::new(err), @@ -385,6 +419,7 @@ impl PartitionWriter { Ok(()) } + /// Close the writer and get the new [Add] actions. 
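Reviewer note: with `PartitionWriterConfig` and `PartitionWriter` now public, the intended life cycle is config → writer → `write` → `close`, with `close` handing back the `Add` actions for the commit. A sketch of that flow for an unpartitioned batch, modeled on the `get_partition_writer` helper added to the tests below; the empty `IndexMap` of partition values follows that helper, everything else is illustrative:

```rust
use arrow_array::RecordBatch;
use deltalake_core::kernel::Add;
use deltalake_core::operations::writer::{PartitionWriter, PartitionWriterConfig};
use deltalake_core::storage::ObjectStoreRef;
use deltalake_core::table::config::DEFAULT_NUM_INDEX_COLS;
use deltalake_core::DeltaResult;
use indexmap::IndexMap;

/// Write one unpartitioned batch and return the Add actions to commit.
async fn write_one_batch(
    object_store: ObjectStoreRef,
    batch: &RecordBatch,
) -> DeltaResult<Vec<Add>> {
    let config = PartitionWriterConfig::try_new(
        batch.schema(),
        IndexMap::new(), // no partition values for an unpartitioned write
        None,            // default parquet writer properties
        None,            // default target file size
        None,            // default write batch size
    )?;
    let mut writer =
        PartitionWriter::try_with_config(object_store, config, DEFAULT_NUM_INDEX_COLS, None)?;
    writer.write(batch).await?;
    // Flush any buffered data and collect the Add actions for the commit log.
    writer.close().await
}
```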
pub async fn close(mut self) -> DeltaResult> { self.flush_arrow_writer().await?; Ok(self.files_written) @@ -395,12 +430,51 @@ impl PartitionWriter { mod tests { use super::*; use crate::storage::utils::flatten_list_stream as list; - use crate::writer::test_utils::get_record_batch; + use crate::table::config::DEFAULT_NUM_INDEX_COLS; + use crate::writer::test_utils::*; use crate::DeltaTableBuilder; use arrow::array::{Int32Array, StringArray}; use arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; use std::sync::Arc; + fn get_delta_writer( + object_store: ObjectStoreRef, + batch: &RecordBatch, + writer_properties: Option, + target_file_size: Option, + write_batch_size: Option, + ) -> DeltaWriter { + let config = WriterConfig::new( + batch.schema(), + vec![], + writer_properties, + target_file_size, + write_batch_size, + DEFAULT_NUM_INDEX_COLS, + None, + ); + DeltaWriter::new(object_store, config) + } + + fn get_partition_writer( + object_store: ObjectStoreRef, + batch: &RecordBatch, + writer_properties: Option, + target_file_size: Option, + write_batch_size: Option, + ) -> PartitionWriter { + let config = PartitionWriterConfig::try_new( + batch.schema(), + IndexMap::new(), + writer_properties, + target_file_size, + write_batch_size, + ) + .unwrap(); + PartitionWriter::try_with_config(object_store, config, DEFAULT_NUM_INDEX_COLS, None) + .unwrap() + } + #[tokio::test] async fn test_write_partition() { let log_store = DeltaTableBuilder::from_uri("memory://") @@ -410,7 +484,7 @@ mod tests { let batch = get_record_batch(None, false); // write single un-partitioned batch - let mut writer = get_writer(object_store.clone(), &batch, None, None, None); + let mut writer = get_partition_writer(object_store.clone(), &batch, None, None, None); writer.write(&batch).await.unwrap(); let files = list(object_store.as_ref(), None).await.unwrap(); assert_eq!(files.len(), 0); @@ -442,8 +516,9 @@ mod tests { let properties = WriterProperties::builder() .set_max_row_group_size(1024) .build(); - // configure small target file size and row group size so we can observe multiple files written - let mut writer = get_writer(object_store, &batch, Some(properties), Some(10_000), None); + // configure small target file size and and row group size so we can observe multiple files written + let mut writer = + get_partition_writer(object_store, &batch, Some(properties), Some(10_000), None); writer.write(&batch).await.unwrap(); // check that we have written more then once file, and no more then 1 is below target size @@ -470,7 +545,7 @@ mod tests { .unwrap() .object_store(); // configure small target file size so we can observe multiple files written - let mut writer = get_writer(object_store, &batch, None, Some(10_000), None); + let mut writer = get_partition_writer(object_store, &batch, None, Some(10_000), None); writer.write(&batch).await.unwrap(); // check that we have written more then once file, and no more then 1 is below target size @@ -484,7 +559,7 @@ mod tests { #[tokio::test] async fn test_do_not_write_empty_file_on_close() { - let base_int = Arc::new(Int32Array::from((0..10000 as i32).collect::>())); + let base_int = Arc::new(Int32Array::from((0..10000_i32).collect::>())); let base_str = Arc::new(StringArray::from(vec!["A"; 10000])); let schema = Arc::new(ArrowSchema::new(vec![ Field::new("id", DataType::Utf8, true), @@ -498,28 +573,59 @@ mod tests { .object_store(); // configure high batch size and low file size to observe one file written and flushed immediately // upon writing batch, then ensures the 
buffer is empty upon closing writer - let mut writer = get_writer(object_store, &batch, None, Some(9000), Some(10000)); + let mut writer = get_partition_writer(object_store, &batch, None, Some(9000), Some(10000)); writer.write(&batch).await.unwrap(); let adds = writer.close().await.unwrap(); assert!(adds.len() == 1); } - fn get_writer( - object_store: ObjectStoreRef, - batch: &RecordBatch, - writer_properties: Option, - target_file_size: Option, - write_batch_size: Option, - ) -> PartitionWriter { - let config = PartitionWriterConfig::try_new( - batch.schema(), - BTreeMap::new(), - writer_properties, - target_file_size, - write_batch_size, + #[tokio::test] + async fn test_write_mismatched_schema() { + let log_store = DeltaTableBuilder::from_uri("memory://") + .build_storage() + .unwrap(); + let object_store = log_store.object_store(); + let batch = get_record_batch(None, false); + + // write single un-partitioned batch + let mut writer = get_delta_writer(object_store.clone(), &batch, None, None, None); + writer.write(&batch).await.unwrap(); + // Ensure the write hasn't been flushed + let files = list(object_store.as_ref(), None).await.unwrap(); + assert_eq!(files.len(), 0); + + // Create a second batch with a different schema + let second_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, true), + Field::new("name", DataType::Utf8, true), + ])); + let second_batch = RecordBatch::try_new( + second_schema, + vec![ + Arc::new(Int32Array::from(vec![Some(1), Some(2)])), + Arc::new(StringArray::from(vec![Some("will"), Some("robert")])), + ], ) .unwrap(); - PartitionWriter::try_with_config(object_store, config).unwrap() + + let result = writer.write(&second_batch).await; + assert!(result.is_err()); + + match result { + Ok(_) => { + panic!("Should not have successfully written"); + } + Err(e) => { + match e { + DeltaTableError::SchemaMismatch { .. } => { + // this is expected + } + others => { + panic!("Got the wrong error: {others:?}"); + } + } + } + }; } } diff --git a/crates/core/src/protocol/checkpoints.rs b/crates/core/src/protocol/checkpoints.rs index b6787b9b31..f2625e49cf 100644 --- a/crates/core/src/protocol/checkpoints.rs +++ b/crates/core/src/protocol/checkpoints.rs @@ -6,12 +6,15 @@ use std::iter::Iterator; use arrow_json::ReaderBuilder; use arrow_schema::ArrowError; -use chrono::{Datelike, Utc}; +use chrono::{Datelike, NaiveDate, NaiveDateTime, Utc}; use futures::{StreamExt, TryStreamExt}; +use itertools::Itertools; use lazy_static::lazy_static; use object_store::{Error, ObjectStore}; use parquet::arrow::ArrowWriter; +use parquet::basic::Compression; use parquet::errors::ParquetError; +use parquet::file::properties::WriterProperties; use regex::Regex; use serde_json::Value; use tracing::{debug, error}; @@ -19,13 +22,12 @@ use tracing::{debug, error}; use super::{time_utils, ProtocolError}; use crate::kernel::arrow::delta_log_schema_for_table; use crate::kernel::{ - Action, Add as AddAction, DataType, PrimitiveType, Protocol, Remove, StructField, Txn, + Action, Add as AddAction, DataType, PrimitiveType, Protocol, Remove, StructField, }; use crate::logstore::LogStore; use crate::table::state::DeltaTableState; use crate::table::{get_partition_col_data_types, CheckPoint, CheckPointBuilder}; use crate::{open_table_with_version, DeltaTable}; - type SchemaPath = Vec; /// Error returned when there is an error during creating a checkpoint. 
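Reviewer note: the checkpoint changes below (Snappy-compressed parquet, protocol feature passthrough, txn actions sourced from the snapshot) are all exercised through `create_checkpoint_for` in the tests. A sketch of calling it against a loaded table, following the call shape used in those tests; the exact public path of `create_checkpoint_for` and the error plumbing via `Box<dyn Error>` are assumptions:

```rust
use deltalake_core::open_table;
use deltalake_core::protocol::checkpoints::create_checkpoint_for;

/// Write a checkpoint for whatever version of the table is currently loaded.
async fn checkpoint_current_version(uri: &str) -> Result<(), Box<dyn std::error::Error>> {
    let table = open_table(uri).await?;
    create_checkpoint_for(
        table.version(),
        table.snapshot()?,          // &DeltaTableState for the loaded version
        table.log_store().as_ref(), // &dyn LogStore backing the _delta_log
    )
    .await?;
    Ok(())
}
```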
@@ -55,6 +57,9 @@ enum CheckpointError { #[from] source: ArrowError, }, + + #[error("missing rewquired action type in snapshot: {0}")] + MissingActionType(String), } impl From for ProtocolError { @@ -64,6 +69,7 @@ impl From for ProtocolError { CheckpointError::Arrow { source } => Self::Arrow { source }, CheckpointError::StaleTableVersion(..) => Self::Generic(value.to_string()), CheckpointError::Parquet { source } => Self::ParquetParseError { source }, + CheckpointError::MissingActionType(_) => Self::Generic(value.to_string()), } } } @@ -164,14 +170,16 @@ pub async fn create_checkpoint_for( let object_store = log_store.object_store(); debug!("Writing checkpoint to {:?}.", checkpoint_path); - object_store.put(&checkpoint_path, parquet_bytes).await?; + object_store + .put(&checkpoint_path, parquet_bytes.into()) + .await?; let last_checkpoint_content: Value = serde_json::to_value(checkpoint)?; let last_checkpoint_content = bytes::Bytes::from(serde_json::to_vec(&last_checkpoint_content)?); debug!("Writing _last_checkpoint to {:?}.", last_checkpoint_path); object_store - .put(&last_checkpoint_path, last_checkpoint_content) + .put(&last_checkpoint_path, last_checkpoint_content.into()) .await?; Ok(()) @@ -186,7 +194,7 @@ pub async fn cleanup_expired_logs_for( ) -> Result { lazy_static! { static ref DELTA_LOG_REGEX: Regex = - Regex::new(r"_delta_log/(\d{20})\.(json|checkpoint).*$").unwrap(); + Regex::new(r"_delta_log/(\d{20})\.(json|checkpoint|json.tmp).*$").unwrap(); } let object_store = log_store.object_store(); @@ -254,7 +262,8 @@ fn parquet_bytes_from_state( // Collect a map of paths that require special stats conversion. let mut stats_conversions: Vec<(SchemaPath, DataType)> = Vec::new(); - collect_stats_conversions(&mut stats_conversions, schema.fields().as_slice()); + let fields = schema.fields().collect_vec(); + collect_stats_conversions(&mut stats_conversions, fields.as_slice()); // if any, tombstones do not include extended file metadata, we must omit the extended metadata fields from the remove schema // See https://github.com/delta-io/delta/blob/master/PROTOCOL.md#add-file-and-remove-file @@ -279,8 +288,16 @@ fn parquet_bytes_from_state( let jsons = std::iter::once(Action::Protocol(Protocol { min_reader_version: state.protocol().min_reader_version, min_writer_version: state.protocol().min_writer_version, - writer_features: None, - reader_features: None, + writer_features: if state.protocol().min_writer_version >= 7 { + Some(state.protocol().writer_features.clone().unwrap_or_default()) + } else { + None + }, + reader_features: if state.protocol().min_reader_version >= 3 { + Some(state.protocol().reader_features.clone().unwrap_or_default()) + } else { + None + }, })) // metaData .chain(std::iter::once(Action::Metadata(current_metadata.clone()))) @@ -288,14 +305,8 @@ fn parquet_bytes_from_state( .chain( state .app_transaction_version() - .iter() - .map(|(app_id, version)| { - Action::Txn(Txn { - app_id: app_id.clone(), - version: *version, - last_updated: None, - }) - }), + .map_err(|_| CheckpointError::MissingActionType("txn".to_string()))? + .map(Action::Txn), ) // removes .chain(tombstones.iter().map(|r| { @@ -325,7 +336,15 @@ fn parquet_bytes_from_state( debug!("Writing to checkpoint parquet buffer..."); // Write the Checkpoint parquet file. 
let mut bytes = vec![]; - let mut writer = ArrowWriter::try_new(&mut bytes, arrow_schema.clone(), None)?; + let mut writer = ArrowWriter::try_new( + &mut bytes, + arrow_schema.clone(), + Some( + WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(), + ), + )?; let mut decoder = ReaderBuilder::new(arrow_schema) .with_batch_size(CHECKPOINT_RECORD_BATCH_SIZE) .build_decoder()?; @@ -417,20 +436,22 @@ fn typed_partition_value_from_string( .map_err(|_| CheckpointError::PartitionValueNotParseable(string_value.to_owned()))? .into()), PrimitiveType::Date => { - let d = chrono::naive::NaiveDate::parse_from_str(string_value, "%Y-%m-%d") - .map_err(|_| { - CheckpointError::PartitionValueNotParseable(string_value.to_owned()) - })?; + let d = NaiveDate::parse_from_str(string_value, "%Y-%m-%d").map_err(|_| { + CheckpointError::PartitionValueNotParseable(string_value.to_owned()) + })?; // day 0 is 1970-01-01 (719163 days from ce) Ok((d.num_days_from_ce() - 719_163).into()) } - PrimitiveType::Timestamp => { - let ts = - chrono::naive::NaiveDateTime::parse_from_str(string_value, "%Y-%m-%d %H:%M:%S") - .map_err(|_| { - CheckpointError::PartitionValueNotParseable(string_value.to_owned()) - })?; - Ok((ts.timestamp_millis() * 1000).into()) + PrimitiveType::Timestamp | PrimitiveType::TimestampNtz => { + let ts = NaiveDateTime::parse_from_str(string_value, "%Y-%m-%d %H:%M:%S.%6f"); + let ts: NaiveDateTime = match ts { + Ok(_) => ts, + Err(_) => NaiveDateTime::parse_from_str(string_value, "%Y-%m-%d %H:%M:%S"), + } + .map_err(|_| { + CheckpointError::PartitionValueNotParseable(string_value.to_owned()) + })?; + Ok((ts.and_utc().timestamp_millis() * 1000).into()) } s => unimplemented!( "Primitive type {} is not supported for partition column values.", @@ -460,7 +481,7 @@ fn typed_partition_value_from_option_string( } } -fn collect_stats_conversions(paths: &mut Vec<(SchemaPath, DataType)>, fields: &[StructField]) { +fn collect_stats_conversions(paths: &mut Vec<(SchemaPath, DataType)>, fields: &[&StructField]) { let mut _path = SchemaPath::new(); fields .iter() @@ -481,9 +502,7 @@ fn collect_field_conversion( DataType::Struct(struct_field) => { let struct_fields = struct_field.fields(); current_path.push(field.name().to_owned()); - struct_fields - .iter() - .for_each(|f| collect_field_conversion(current_path, all_paths, f)); + struct_fields.for_each(|f| collect_field_conversion(current_path, all_paths, f)); current_path.pop(); } _ => { /* noop */ } @@ -532,7 +551,9 @@ mod tests { use super::*; use crate::kernel::StructType; + use crate::operations::transaction::{CommitBuilder, TableReference}; use crate::operations::DeltaOps; + use crate::protocol::Metadata; use crate::writer::test_utils::get_delta_schema; #[tokio::test] @@ -541,7 +562,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() - .with_columns(table_schema.fields().clone()) + .with_columns(table_schema.fields().cloned()) .with_save_mode(crate::protocol::SaveMode::Ignore) .await .unwrap(); @@ -565,13 +586,91 @@ mod tests { assert_eq!(last_checkpoint.version, 0); } + /// This test validates that a checkpoint can be written and re-read with the minimum viable + /// Metadata. There was a bug which didn't handle the optionality of createdTime. 
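Reviewer note: the new `Timestamp | TimestampNtz` arm above first tries the microsecond-precision form and only then falls back to whole seconds, so both spellings of a partition value land on the same microsecond epoch value. A standalone chrono sketch of that fallback (the function name is hypothetical; the format strings and the `millis * 1000` conversion mirror the code above):

```rust
use chrono::NaiveDateTime;

/// Parse a stringified timestamp partition value into microseconds since the epoch,
/// accepting values with or without a fractional-second suffix.
fn partition_timestamp_micros(raw: &str) -> Option<i64> {
    let ts = NaiveDateTime::parse_from_str(raw, "%Y-%m-%d %H:%M:%S.%6f")
        .or_else(|_| NaiveDateTime::parse_from_str(raw, "%Y-%m-%d %H:%M:%S"))
        .ok()?;
    // Millisecond precision scaled to microseconds, as in the checkpoint code.
    Some(ts.and_utc().timestamp_millis() * 1000)
}

fn main() {
    assert_eq!(
        partition_timestamp_micros("2021-08-08 01:00:01.000000"),
        partition_timestamp_micros("2021-08-08 01:00:01"),
    );
    assert_eq!(
        partition_timestamp_micros("2021-08-08 01:00:01"),
        Some(1_628_384_401_000_000)
    );
}
```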
+ #[tokio::test] + async fn test_create_checkpoint_with_metadata() { + let table_schema = get_delta_schema(); + + let mut table = DeltaOps::new_in_memory() + .create() + .with_columns(table_schema.fields().cloned()) + .with_save_mode(crate::protocol::SaveMode::Ignore) + .await + .unwrap(); + assert_eq!(table.version(), 0); + assert_eq!(table.get_schema().unwrap(), &table_schema); + + let part_cols: Vec = vec![]; + let metadata = Metadata::try_new(table_schema, part_cols, HashMap::new()).unwrap(); + let actions = vec![Action::Metadata(metadata)]; + + let epoch_id = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .expect("Time went backwards") + .as_millis() as i64; + + let operation = crate::protocol::DeltaOperation::StreamingUpdate { + output_mode: crate::protocol::OutputMode::Append, + query_id: "test".into(), + epoch_id, + }; + let v = CommitBuilder::default() + .with_actions(actions) + .build( + table.state.as_ref().map(|f| f as &dyn TableReference), + table.log_store(), + operation, + ) + .await + .unwrap() + .version(); + + assert_eq!(1, v, "Expected the commit to create table version 1"); + table.load().await.expect("Failed to reload table"); + assert_eq!( + table.version(), + 1, + "The loaded version of the table is not up to date" + ); + + let res = create_checkpoint_for( + table.version(), + table.state.as_ref().unwrap(), + table.log_store.as_ref(), + ) + .await; + assert!(res.is_ok()); + + // Look at the "files" and verify that the _last_checkpoint has the right version + let path = Path::from("_delta_log/_last_checkpoint"); + let last_checkpoint = table + .object_store() + .get(&path) + .await + .expect("Failed to get the _last_checkpoint") + .bytes() + .await + .expect("Failed to get bytes for _last_checkpoint"); + let last_checkpoint: CheckPoint = serde_json::from_slice(&last_checkpoint).expect("Fail"); + assert_eq!(last_checkpoint.version, 1); + + // If the regression exists, this will fail + table.load().await.expect("Failed to reload the table, this likely means that the optional createdTime was not actually optional"); + assert_eq!( + 1, + table.version(), + "The reloaded table doesn't have the right version" + ); + } + #[tokio::test] async fn test_create_checkpoint_for_invalid_version() { let table_schema = get_delta_schema(); let table = DeltaOps::new_in_memory() .create() - .with_columns(table_schema.fields().clone()) + .with_columns(table_schema.fields().cloned()) .with_save_mode(crate::protocol::SaveMode::Ignore) .await .unwrap(); @@ -649,8 +748,11 @@ mod tests { } for (s, v) in [ + ("2021-08-08 01:00:01.000000", 1628384401000000i64), ("2021-08-08 01:00:01", 1628384401000000i64), + ("1970-01-02 12:59:59.000000", 133199000000i64), ("1970-01-02 12:59:59", 133199000000i64), + ("1970-01-01 13:00:01.000000", 46801000000i64), ("1970-01-01 13:00:01", 46801000000i64), ("1969-12-31 00:00:00", -86400000000i64), ("1677-09-21 00:12:44", -9223372036000000i64), @@ -702,9 +804,8 @@ mod tests { #[test] fn collect_stats_conversions_test() { let delta_schema: StructType = serde_json::from_value(SCHEMA.clone()).unwrap(); - let fields = delta_schema.fields(); + let fields = delta_schema.fields().collect_vec(); let mut paths = Vec::new(); - collect_stats_conversions(&mut paths, fields.as_slice()); assert_eq!(2, paths.len()); diff --git a/crates/core/src/protocol/mod.rs b/crates/core/src/protocol/mod.rs index 3be8a734fa..ce6ef0e8b0 100644 --- a/crates/core/src/protocol/mod.rs +++ b/crates/core/src/protocol/mod.rs @@ -21,7 +21,7 @@ use std::str::FromStr; use 
tracing::{debug, error}; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::kernel::{Add, CommitInfo, Metadata, Protocol, Remove}; +use crate::kernel::{Add, CommitInfo, Metadata, Protocol, Remove, StructField}; use crate::logstore::LogStore; use crate::table::CheckPoint; @@ -326,6 +326,13 @@ pub struct MergePredicate { #[derive(Serialize, Deserialize, Debug, Clone)] #[serde(rename_all = "camelCase")] pub enum DeltaOperation { + /// Represents a Delta `Add Column` operation. + /// Used to add new columns or field in a struct + AddColumn { + /// Fields added to existing schema + fields: Vec, + }, + /// Represents a Delta `Create` operation. /// Would usually only create the table, if also data is written, /// a `Write` operations is more appropriate @@ -371,12 +378,21 @@ pub enum DeltaOperation { expr: String, }, + /// Drops constraints from a table + DropConstraint { + /// Constraints name + name: String, + }, + /// Merge data with a source data with the following predicate #[serde(rename_all = "camelCase")] Merge { - /// The merge predicate + /// Cleaned merge predicate for conflict checks predicate: Option, + /// The original merge predicate + merge_predicate: Option, + /// Match operations performed matched_predicates: Vec, @@ -398,6 +414,13 @@ pub enum DeltaOperation { epoch_id: i64, }, + /// Set table properties operations + #[serde(rename_all = "camelCase")] + SetTableProperties { + /// Table properties that were added + properties: HashMap, + }, + #[serde(rename_all = "camelCase")] /// Represents a `Optimize` operation Optimize { @@ -442,6 +465,7 @@ impl DeltaOperation { pub fn name(&self) -> &str { // operation names taken from https://learn.microsoft.com/en-us/azure/databricks/delta/history#--operation-metrics-keys match &self { + DeltaOperation::AddColumn { .. } => "ADD COLUMN", DeltaOperation::Create { mode: SaveMode::Overwrite, .. @@ -452,12 +476,14 @@ impl DeltaOperation { DeltaOperation::Update { .. } => "UPDATE", DeltaOperation::Merge { .. } => "MERGE", DeltaOperation::StreamingUpdate { .. } => "STREAMING UPDATE", + DeltaOperation::SetTableProperties { .. } => "SET TBLPROPERTIES", DeltaOperation::Optimize { .. } => "OPTIMIZE", DeltaOperation::FileSystemCheck { .. } => "FSCK", DeltaOperation::Restore { .. } => "RESTORE", DeltaOperation::VacuumStart { .. } => "VACUUM START", DeltaOperation::VacuumEnd { .. } => "VACUUM END", DeltaOperation::AddConstraint { .. } => "ADD CONSTRAINT", + DeltaOperation::DropConstraint { .. } => "DROP CONSTRAINT", } } @@ -494,9 +520,12 @@ impl DeltaOperation { pub fn changes_data(&self) -> bool { match self { Self::Optimize { .. } + | Self::SetTableProperties { .. } + | Self::AddColumn { .. } | Self::VacuumStart { .. } | Self::VacuumEnd { .. } - | Self::AddConstraint { .. } => false, + | Self::AddConstraint { .. } + | Self::DropConstraint { .. } => false, Self::Create { .. } | Self::FileSystemCheck {} | Self::StreamingUpdate { .. } @@ -533,16 +562,15 @@ impl DeltaOperation { /// Denotes if the operation reads the entire table pub fn read_whole_table(&self) -> bool { match self { - // TODO just adding one operation example, as currently none of the - // implemented operations scan the entire table. - Self::Write { predicate, .. } if predicate.is_none() => false, + // Predicate is none -> Merge operation had to join full source and target + Self::Merge { predicate, .. 
} if predicate.is_none() => true, _ => false, } } } /// The SaveMode used when performing a DeltaOperation -#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +#[derive(Serialize, Deserialize, Debug, Copy, Clone, PartialEq, Eq)] pub enum SaveMode { /// Files will be appended to the target location. Append, @@ -572,7 +600,7 @@ impl FromStr for SaveMode { } /// The OutputMode used in streaming operations. -#[derive(Serialize, Deserialize, Debug, Clone)] +#[derive(Serialize, Deserialize, Debug, Copy, Clone)] pub enum OutputMode { /// Only new rows will be written when new data is available. Append, @@ -1180,6 +1208,32 @@ mod tests { assert_eq!(expected, actions); } + #[tokio::test] + async fn test_table_not_always_with_stats() { + let path = "../test/tests/data/delta-stats-optional"; + let mut table = crate::open_table(path).await.unwrap(); + table.load().await.unwrap(); + let actions = table.snapshot().unwrap().add_actions_table(true).unwrap(); + let actions = sort_batch_by(&actions, "path").unwrap(); + // get column-0 path, and column-4 num_records, and column_5 null_count.integer + let expected_path: ArrayRef = Arc::new(array::StringArray::from(vec![ + "part-00000-28925d3a-bdf2-411e-bca9-b067444cbcb0-c000.snappy.parquet", + "part-00000-7a509247-4f58-4453-9202-51d75dee59af-c000.snappy.parquet", + ])); + let expected_num_records: ArrayRef = + Arc::new(array::Int64Array::from(vec![None, Some(1)])); + let expected_null_count: ArrayRef = + Arc::new(array::Int64Array::from(vec![None, Some(0)])); + + let path_column = actions.column(0); + let num_records_column = actions.column(4); + let null_count_column = actions.column(5); + + assert_eq!(&expected_path, path_column); + assert_eq!(&expected_num_records, num_records_column); + assert_eq!(&expected_null_count, null_count_column); + } + #[tokio::test] async fn test_only_struct_stats() { // test table with no json stats @@ -1298,15 +1352,21 @@ mod tests { ), ( "min.timestamp", - Arc::new(array::TimestampMicrosecondArray::from(vec![ - TimestampMicrosecondType::parse("2022-10-24T22:59:32.846Z"), - ])), + Arc::new( + array::TimestampMicrosecondArray::from(vec![ + TimestampMicrosecondType::parse("2022-10-24T22:59:32.846Z"), + ]) + .with_timezone("UTC"), + ), ), ( "max.timestamp", - Arc::new(array::TimestampMicrosecondArray::from(vec![ - TimestampMicrosecondType::parse("2022-10-24T22:59:32.846Z"), - ])), + Arc::new( + array::TimestampMicrosecondArray::from(vec![ + TimestampMicrosecondType::parse("2022-10-24T22:59:32.846Z"), + ]) + .with_timezone("UTC"), + ), ), ( "null_count.struct.struct_element", diff --git a/crates/core/src/protocol/parquet_read/mod.rs b/crates/core/src/protocol/parquet_read/mod.rs index f838bbdaeb..655dcb05f3 100644 --- a/crates/core/src/protocol/parquet_read/mod.rs +++ b/crates/core/src/protocol/parquet_read/mod.rs @@ -9,7 +9,8 @@ use tracing::{debug, error, warn}; use crate::kernel::models::actions::serde_path::decode_path; use crate::kernel::{ - Action, Add, AddCDCFile, DeletionVectorDescriptor, Metadata, Protocol, Remove, StorageType, Txn, + Action, Add, AddCDCFile, DeletionVectorDescriptor, Metadata, Protocol, Remove, StorageType, + Transaction, }; use crate::protocol::{ColumnCountStat, ColumnValueStat, ProtocolError, Stats}; @@ -433,12 +434,10 @@ impl Metadata { .map_err(|_| gen_action_type_error("metaData", "schemaString", "string"))? 
.clone(); } - "createdTime" => { - re.created_time = - Some(record.get_long(i).map_err(|_| { - gen_action_type_error("metaData", "createdTime", "long") - })?); - } + "createdTime" => match record.get_long(i) { + Ok(s) => re.created_time = Some(s), + _ => re.created_time = None, + }, "configuration" => { let configuration_map = record .get_map(i) @@ -586,7 +585,7 @@ impl Remove { } } -impl Txn { +impl Transaction { fn from_parquet_record(record: &parquet::record::Row) -> Result { let mut re = Self { ..Default::default() @@ -707,7 +706,7 @@ impl Action { "add" => Action::Add(Add::from_parquet_record(col_data)?), "metaData" => Action::Metadata(Metadata::from_parquet_record(col_data)?), "remove" => Action::Remove(Remove::from_parquet_record(col_data)?), - "txn" => Action::Txn(Txn::from_parquet_record(col_data)?), + "txn" => Action::Txn(Transaction::from_parquet_record(col_data)?), "protocol" => Action::Protocol(Protocol::from_parquet_record(col_data)?), "cdc" => Action::Cdc(AddCDCFile::from_parquet_record(col_data)?), name => { diff --git a/crates/core/src/schema/partitions.rs b/crates/core/src/schema/partitions.rs index a52b82bd9d..d2b2e84979 100644 --- a/crates/core/src/schema/partitions.rs +++ b/crates/core/src/schema/partitions.rs @@ -1,11 +1,13 @@ //! Delta Table partition handling logic. -//! + +use delta_kernel::expressions::Scalar; +use serde::{Serialize, Serializer}; use std::cmp::Ordering; use std::collections::HashMap; use std::convert::TryFrom; use crate::errors::DeltaTableError; -use crate::kernel::{DataType, PrimitiveType, Scalar}; +use crate::kernel::{scalars::ScalarExt, DataType, PrimitiveType}; /// A special value used in Hive to represent the null partition in partitioned tables pub const NULL_PARTITION_VALUE_DATA_PATH: &str = "__HIVE_DEFAULT_PARTITION__"; @@ -31,6 +33,42 @@ pub enum PartitionValue { NotIn(Vec), } +#[derive(Clone, Debug, PartialEq)] +struct ScalarHelper<'a>(&'a Scalar); + +impl PartialOrd for ScalarHelper<'_> { + fn partial_cmp(&self, other: &Self) -> Option { + use Scalar::*; + match (self.0, other.0) { + (Null(_), Null(_)) => Some(Ordering::Equal), + (Integer(a), Integer(b)) => a.partial_cmp(b), + (Long(a), Long(b)) => a.partial_cmp(b), + (Short(a), Short(b)) => a.partial_cmp(b), + (Byte(a), Byte(b)) => a.partial_cmp(b), + (Float(a), Float(b)) => a.partial_cmp(b), + (Double(a), Double(b)) => a.partial_cmp(b), + (String(a), String(b)) => a.partial_cmp(b), + (Boolean(a), Boolean(b)) => a.partial_cmp(b), + (Timestamp(a), Timestamp(b)) => a.partial_cmp(b), + (TimestampNtz(a), TimestampNtz(b)) => a.partial_cmp(b), + (Date(a), Date(b)) => a.partial_cmp(b), + (Binary(a), Binary(b)) => a.partial_cmp(b), + (Decimal(a, p1, s1), Decimal(b, p2, s2)) => { + // TODO implement proper decimal comparison + if p1 != p2 || s1 != s2 { + return None; + }; + a.partial_cmp(b) + } + // TODO should we make an assumption about the ordering of nulls? + // rigth now this is only used for internal purposes. + (Null(_), _) => Some(Ordering::Less), + (_, Null(_)) => Some(Ordering::Greater), + _ => None, + } + } +} + /// A Struct used for filtering a DeltaTable partition by key and value. 
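Reviewer note: besides the `Scalar`-based comparison helper above, `PartitionFilter` below gains a `Serialize` impl that renders filters as SQL-ish predicate strings for `operationParameters`. A sketch of what that looks like from the caller side, matching the expected strings in the new serialization tests; the `deltalake_core::partitions` path and the `serde_json` round trip are assumptions:

```rust
use deltalake_core::partitions::PartitionFilter;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Filters are still built from (key, op, value) tuples...
    let eq = PartitionFilter::try_from(("date", "=", "2022-05-22"))?;
    let set = PartitionFilter::try_from(("date", "in", ["2023-11-04", "2023-06-07"].as_slice()))?;

    // ...and now serialize to the predicate strings recorded in the commit info.
    assert_eq!(serde_json::to_string(&eq)?, r#""date = '2022-05-22'""#);
    assert_eq!(
        serde_json::to_string(&set)?,
        r#""date IN ('2023-11-04', '2023-06-07')""#
    );
    Ok(())
}
```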
#[derive(Clone, Debug, PartialEq, Eq)] pub struct PartitionFilter { @@ -48,7 +86,7 @@ fn compare_typed_value( match data_type { DataType::Primitive(primitive_type) => { let other = primitive_type.parse_scalar(filter_value).ok()?; - partition_value.partial_cmp(&other) + ScalarHelper(partition_value).partial_cmp(&ScalarHelper(&other)) } // NOTE: complex types are not supported as partition columns _ => None, @@ -124,6 +162,36 @@ impl PartitionFilter { } } +/// Create desired string representation for PartitionFilter. +/// Used in places like predicate in operationParameters, etc. +impl Serialize for PartitionFilter { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let s = match &self.value { + PartitionValue::Equal(value) => format!("{} = '{}'", self.key, value), + PartitionValue::NotEqual(value) => format!("{} != '{}'", self.key, value), + PartitionValue::GreaterThan(value) => format!("{} > '{}'", self.key, value), + PartitionValue::GreaterThanOrEqual(value) => format!("{} >= '{}'", self.key, value), + PartitionValue::LessThan(value) => format!("{} < '{}'", self.key, value), + PartitionValue::LessThanOrEqual(value) => format!("{} <= '{}'", self.key, value), + // used upper case for IN and NOT similar to SQL + PartitionValue::In(values) => { + let quoted_values: Vec = + values.iter().map(|v| format!("'{}'", v)).collect(); + format!("{} IN ({})", self.key, quoted_values.join(", ")) + } + PartitionValue::NotIn(values) => { + let quoted_values: Vec = + values.iter().map(|v| format!("'{}'", v)).collect(); + format!("{} NOT IN ({})", self.key, quoted_values.join(", ")) + } + }; + serializer.serialize_str(&s) + } +} + /// Create a PartitionFilter from a filter Tuple with the structure (key, operation, value). impl TryFrom<(&str, &str, &str)> for PartitionFilter { type Error = DeltaTableError; @@ -207,3 +275,55 @@ impl DeltaTablePartition { } } } + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + fn check_json_serialize(filter: PartitionFilter, expected_json: &str) { + assert_eq!(serde_json::to_value(filter).unwrap(), json!(expected_json)) + } + + #[test] + fn test_serialize_partition_filter() { + check_json_serialize( + PartitionFilter::try_from(("date", "=", "2022-05-22")).unwrap(), + "date = '2022-05-22'", + ); + check_json_serialize( + PartitionFilter::try_from(("date", "!=", "2022-05-22")).unwrap(), + "date != '2022-05-22'", + ); + check_json_serialize( + PartitionFilter::try_from(("date", ">", "2022-05-22")).unwrap(), + "date > '2022-05-22'", + ); + check_json_serialize( + PartitionFilter::try_from(("date", ">=", "2022-05-22")).unwrap(), + "date >= '2022-05-22'", + ); + check_json_serialize( + PartitionFilter::try_from(("date", "<", "2022-05-22")).unwrap(), + "date < '2022-05-22'", + ); + check_json_serialize( + PartitionFilter::try_from(("date", "<=", "2022-05-22")).unwrap(), + "date <= '2022-05-22'", + ); + check_json_serialize( + PartitionFilter::try_from(("date", "in", vec!["2023-11-04", "2023-06-07"].as_slice())) + .unwrap(), + "date IN ('2023-11-04', '2023-06-07')", + ); + check_json_serialize( + PartitionFilter::try_from(( + "date", + "not in", + vec!["2023-11-04", "2023-06-07"].as_slice(), + )) + .unwrap(), + "date NOT IN ('2023-11-04', '2023-06-07')", + ); + } +} diff --git a/crates/core/src/storage/file.rs b/crates/core/src/storage/file.rs index c63a00dae6..f7fa168127 100644 --- a/crates/core/src/storage/file.rs +++ b/crates/core/src/storage/file.rs @@ -6,12 +6,12 @@ use bytes::Bytes; use futures::stream::BoxStream; use 
object_store::{ local::LocalFileSystem, path::Path as ObjectStorePath, Error as ObjectStoreError, GetOptions, - GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutOptions, PutResult, + GetResult, ListResult, ObjectMeta, ObjectStore, PutOptions, PutResult, Result as ObjectStoreResult, }; +use object_store::{MultipartUpload, PutMultipartOpts, PutPayload}; use std::ops::Range; use std::sync::Arc; -use tokio::io::AsyncWrite; use url::Url; const STORE_NAME: &str = "DeltaLocalObjectStore"; @@ -166,14 +166,18 @@ impl std::fmt::Display for FileStorageBackend { #[async_trait::async_trait] impl ObjectStore for FileStorageBackend { - async fn put(&self, location: &ObjectStorePath, bytes: Bytes) -> ObjectStoreResult { + async fn put( + &self, + location: &ObjectStorePath, + bytes: PutPayload, + ) -> ObjectStoreResult { self.inner.put(location, bytes).await } async fn put_opts( &self, location: &ObjectStorePath, - bytes: Bytes, + bytes: PutPayload, options: PutOptions, ) -> ObjectStoreResult { self.inner.put_opts(location, bytes, options).await @@ -254,16 +258,16 @@ impl ObjectStore for FileStorageBackend { async fn put_multipart( &self, location: &ObjectStorePath, - ) -> ObjectStoreResult<(MultipartId, Box)> { + ) -> ObjectStoreResult> { self.inner.put_multipart(location).await } - async fn abort_multipart( + async fn put_multipart_opts( &self, location: &ObjectStorePath, - multipart_id: &MultipartId, - ) -> ObjectStoreResult<()> { - self.inner.abort_multipart(location, multipart_id).await + options: PutMultipartOpts, + ) -> ObjectStoreResult> { + self.inner.put_multipart_opts(location, options).await } } diff --git a/crates/core/src/storage/mod.rs b/crates/core/src/storage/mod.rs index 02a307d51f..3c38a337af 100644 --- a/crates/core/src/storage/mod.rs +++ b/crates/core/src/storage/mod.rs @@ -1,6 +1,7 @@ //! Object storage backend abstraction layer for Delta Table transaction logs and data use dashmap::DashMap; +use object_store::limit::LimitStore; use std::collections::HashMap; use std::sync::{Arc, OnceLock}; @@ -9,6 +10,7 @@ use serde::{Deserialize, Serialize}; use url::Url; pub mod file; +pub mod retry_ext; pub mod utils; use crate::{DeltaResult, DeltaTableError}; @@ -22,6 +24,7 @@ pub use object_store::{ DynObjectStore, Error as ObjectStoreError, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result as ObjectStoreResult, }; +pub use retry_ext::ObjectStoreRetryExt; pub use utils::*; lazy_static! { @@ -48,18 +51,20 @@ impl ObjectStoreFactory for DefaultObjectStoreFactory { fn parse_url_opts( &self, url: &Url, - _options: &StorageOptions, + options: &StorageOptions, ) -> DeltaResult<(ObjectStoreRef, Path)> { match url.scheme() { "memory" => { let path = Path::from_url_path(url.path())?; - let store: Arc = Arc::new(InMemory::new()) as ObjectStoreRef; - Ok((url_prefix_handler(store, path.clone())?, path)) + let inner = Arc::new(InMemory::new()) as ObjectStoreRef; + let store = limit_store_handler(url_prefix_handler(inner, path.clone()), options); + Ok((store, path)) } "file" => { - let store = Arc::new(LocalFileSystem::new_with_prefix( + let inner = Arc::new(LocalFileSystem::new_with_prefix( url.to_file_path().unwrap(), )?) 
as ObjectStoreRef; + let store = limit_store_handler(inner, options); Ok((store, Path::from("/"))) } _ => Err(DeltaTableError::InvalidTableLocation(url.clone().into())), @@ -146,17 +151,44 @@ pub fn str_is_truthy(val: &str) -> bool { /// Simple function to wrap the given [ObjectStore] in a [PrefixStore] if necessary /// -/// This simplifies the use of t he storage since it ensures that list/get/etc operations +/// This simplifies the use of the storage since it ensures that list/get/etc operations /// start from the prefix in the object storage rather than from the root configured URI of the /// [ObjectStore] -pub fn url_prefix_handler(store: T, prefix: Path) -> DeltaResult { +pub fn url_prefix_handler(store: T, prefix: Path) -> ObjectStoreRef { if prefix != Path::from("/") { - Ok(Arc::new(PrefixStore::new(store, prefix))) + Arc::new(PrefixStore::new(store, prefix)) } else { - Ok(Arc::new(store)) + Arc::new(store) } } +/// Simple function to wrap the given [ObjectStore] in a [LimitStore] if configured +/// +/// Limits the number of concurrent connections the underlying object store +/// Reference [LimitStore](https://docs.rs/object_store/latest/object_store/limit/struct.LimitStore.html) for more information +pub fn limit_store_handler(store: T, options: &StorageOptions) -> ObjectStoreRef { + let concurrency_limit = options + .0 + .get(storage_constants::OBJECT_STORE_CONCURRENCY_LIMIT) + .and_then(|v| v.parse().ok()); + + if let Some(limit) = concurrency_limit { + Arc::new(LimitStore::new(store, limit)) + } else { + Arc::new(store) + } +} + +/// Storage option keys to use when creating [ObjectStore]. +/// The same key should be used whether passing a key in the hashmap or setting it as an environment variable. +/// Must be implemented for a given storage provider +pub mod storage_constants { + + /// The number of concurrent connections the underlying object store can create + /// Reference [LimitStore](https://docs.rs/object_store/latest/object_store/limit/struct.LimitStore.html) for more information + pub const OBJECT_STORE_CONCURRENCY_LIMIT: &str = "OBJECT_STORE_CONCURRENCY_LIMIT"; +} + #[cfg(test)] mod tests { use super::*; @@ -166,7 +198,28 @@ mod tests { let store = InMemory::new(); let path = Path::parse("/databases/foo/bar").expect("Failed to parse path"); - let prefixed = url_prefix_handler(store, path); - assert!(prefixed.is_ok()); + let prefixed = url_prefix_handler(store, path.clone()); + + assert_eq!( + String::from("PrefixObjectStore(databases/foo/bar)"), + format!("{prefixed}") + ); + } + + #[test] + fn test_limit_store_handler() { + let store = InMemory::new(); + + let options = StorageOptions(HashMap::from_iter(vec![( + "OBJECT_STORE_CONCURRENCY_LIMIT".into(), + "500".into(), + )])); + + let limited = limit_store_handler(store, &options); + + assert_eq!( + String::from("LimitStore(500, InMemory)"), + format!("{limited}") + ); } } diff --git a/crates/core/src/storage/retry_ext.rs b/crates/core/src/storage/retry_ext.rs new file mode 100644 index 0000000000..b63c29a8ae --- /dev/null +++ b/crates/core/src/storage/retry_ext.rs @@ -0,0 +1,81 @@ +//! Retry extension for [`ObjectStore`] + +use object_store::{path::Path, Error, ObjectStore, PutPayload, PutResult, Result}; +use tracing::log::*; + +/// Retry extension for [`ObjectStore`] +/// +/// Read-only operations are retried by [`ObjectStore`] internally. However, PUT/DELETE operations +/// are not retried even thought they are technically idempotent. 
[`ObjectStore`] does not retry +/// those operations because having preconditions may produce different results for the same +/// request. PUT/DELETE operations without preconditions are idempotent and can be retried. +/// Unfortunately, [`ObjectStore`]'s retry mechanism only works on HTTP request level, thus there +/// is no way to distinguish whether a request has preconditions or not. +/// +/// This trait provides additional methods for working with [`ObjectStore`] that automatically retry +/// unconditional operations when they fail. +/// +/// See also: +/// - https://github.com/apache/arrow-rs/pull/5278 +#[async_trait::async_trait] +pub trait ObjectStoreRetryExt: ObjectStore { + /// Save the provided bytes to the specified location + /// + /// The operation is guaranteed to be atomic, it will either successfully write the entirety of + /// bytes to location, or fail. No clients should be able to observe a partially written object + /// + /// Note that `put_with_opts` may have precondition semantics, and thus may not be retriable. + async fn put_with_retries( + &self, + location: &Path, + bytes: PutPayload, + max_retries: usize, + ) -> Result { + let mut attempt_number = 1; + while attempt_number <= max_retries { + match self.put(location, bytes.clone()).await { + Ok(result) => return Ok(result), + Err(err) if attempt_number == max_retries => { + return Err(err); + } + Err(Error::Generic { store, source }) => { + debug!( + "put_with_retries attempt {} failed: {} {}", + attempt_number, store, source + ); + attempt_number += 1; + } + Err(err) => { + return Err(err); + } + } + } + unreachable!("loop yields Ok or Err in body when attempt_number = max_retries") + } + + /// Delete the object at the specified location + async fn delete_with_retries(&self, location: &Path, max_retries: usize) -> Result<()> { + let mut attempt_number = 1; + while attempt_number <= max_retries { + match self.delete(location).await { + Ok(()) | Err(Error::NotFound { .. }) => return Ok(()), + Err(err) if attempt_number == max_retries => { + return Err(err); + } + Err(Error::Generic { store, source }) => { + debug!( + "delete_with_retries attempt {} failed: {} {}", + attempt_number, store, source + ); + attempt_number += 1; + } + Err(err) => { + return Err(err); + } + } + } + unreachable!("loop yields Ok or Err in body when attempt_number = max_retries") + } +} + +impl ObjectStoreRetryExt for T {} diff --git a/crates/core/src/storage/utils.rs b/crates/core/src/storage/utils.rs index e4dde08387..7ea5464b31 100644 --- a/crates/core/src/storage/utils.rs +++ b/crates/core/src/storage/utils.rs @@ -1,6 +1,6 @@ //! 
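Reviewer note: the extension trait is a blanket impl, so any `ObjectStore` picks up `put_with_retries`/`delete_with_retries` once the trait is in scope. A small usage sketch against an in-memory store; the payload contents and retry count are arbitrary, and the `bytes`/`tokio` scaffolding is assumed:

```rust
use bytes::Bytes;
use deltalake_core::storage::ObjectStoreRetryExt;
use object_store::{memory::InMemory, path::Path};

#[tokio::main]
async fn main() -> object_store::Result<()> {
    let store = InMemory::new();
    let path = Path::from("_delta_log/tmp/commit.json.tmp");

    // Unconditional put/delete calls are retried up to 3 times on generic store errors.
    store
        .put_with_retries(&path, Bytes::from_static(b"{}").into(), 3)
        .await?;
    store.delete_with_retries(&path, 3).await?;
    Ok(())
}
```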
Utility functions for working across Delta tables -use chrono::{NaiveDateTime, TimeZone, Utc}; +use chrono::DateTime; use futures::TryStreamExt; use object_store::path::Path; use object_store::{DynObjectStore, ObjectMeta, Result as ObjectStoreResult}; @@ -32,14 +32,13 @@ impl TryFrom<&Add> for ObjectMeta { type Error = DeltaTableError; fn try_from(value: &Add) -> DeltaResult { - let last_modified = Utc.from_utc_datetime( - &NaiveDateTime::from_timestamp_millis(value.modification_time).ok_or( - DeltaTableError::from(crate::protocol::ProtocolError::InvalidField(format!( - "invalid modification_time: {:?}", - value.modification_time - ))), - )?, - ); + let last_modified = DateTime::from_timestamp_millis(value.modification_time).ok_or( + DeltaTableError::from(crate::protocol::ProtocolError::InvalidField(format!( + "invalid modification_time: {:?}", + value.modification_time + ))), + )?; + Ok(Self { // TODO this won't work for absolute paths, since Paths are always relative to store. location: Path::parse(value.path.as_str())?, diff --git a/crates/core/src/table/builder.rs b/crates/core/src/table/builder.rs index e9bf74d1e5..b421a6199b 100644 --- a/crates/core/src/table/builder.rs +++ b/crates/core/src/table/builder.rs @@ -39,7 +39,7 @@ impl From for DeltaTableError { } /// possible version specifications for loading a delta table -#[derive(Debug, Clone, PartialEq, Eq, Default)] +#[derive(Debug, Copy, Clone, PartialEq, Eq, Default)] pub enum DeltaVersion { /// load the newest version #[default] @@ -186,7 +186,7 @@ impl DeltaTableBuilder { ensure_file_location_exists(PathBuf::from(table_uri.as_ref()))?; } - let url = ensure_table_uri(&table_uri).expect("The specified table_uri is not valid"); + let url = ensure_table_uri(&table_uri)?; debug!("creating table builder with {url}"); Ok(Self { @@ -321,7 +321,7 @@ impl DeltaTableBuilder { /// Build the [`DeltaTable`] and load its state pub async fn load(self) -> DeltaResult { - let version = self.options.version.clone(); + let version = self.options.version; let mut table = self.build()?; match version { DeltaVersion::Newest => table.load().await?, @@ -554,4 +554,11 @@ mod tests { let url = ensure_table_uri(&expected).unwrap(); assert_eq!(expected.as_str().trim_end_matches('/'), url.as_str()); } + + #[test] + fn test_invalid_uri() { + // Urls should round trips as-is + DeltaTableBuilder::from_valid_uri("this://is.nonsense") + .expect_err("this should be an error"); + } } diff --git a/crates/core/src/table/config.rs b/crates/core/src/table/config.rs index 24b11a01a4..47307cfecd 100644 --- a/crates/core/src/table/config.rs +++ b/crates/core/src/table/config.rs @@ -2,12 +2,12 @@ use std::time::Duration; use std::{collections::HashMap, str::FromStr}; +use delta_kernel::features::ColumnMappingMode; use lazy_static::lazy_static; use serde::{Deserialize, Serialize}; -use crate::errors::DeltaTableError; - use super::Constraint; +use crate::errors::DeltaTableError; /// Typed property keys that can be defined on a delta table /// @@ -208,6 +208,9 @@ macro_rules! 
table_config { /// Well known delta table configuration pub struct TableConfig<'a>(pub(crate) &'a HashMap>); +/// Default num index cols +pub const DEFAULT_NUM_INDEX_COLS: i32 = 32; + impl<'a> TableConfig<'a> { table_config!( ( @@ -249,7 +252,7 @@ impl<'a> TableConfig<'a> { ( "true to enable deletion vectors and predictive I/O for updates.", DeltaConfigKey::EnableDeletionVectors, - enable_deletio0n_vectors, + enable_deletion_vectors, bool, // in databricks the default is dependent on the workspace settings and runtime version // https://learn.microsoft.com/en-us/azure/databricks/administration-guide/workspace-settings/deletion-vectors @@ -274,7 +277,7 @@ impl<'a> TableConfig<'a> { DeltaConfigKey::CheckpointInterval, checkpoint_interval, i32, - 10 + 100 ), ); @@ -289,7 +292,7 @@ impl<'a> TableConfig<'a> { /// than this value. Otherwise, the query may not be able to restart, as it must still read old files. pub fn deleted_file_retention_duration(&self) -> Duration { lazy_static! { - static ref DEFAULT_DURATION: Duration = parse_interval("interval 1 week").unwrap(); + static ref DEFAULT_DURATION: Duration = parse_interval("interval 1 weeks").unwrap(); } self.0 .get(DeltaConfigKey::DeletedFileRetentionDuration.as_ref()) @@ -305,7 +308,7 @@ impl<'a> TableConfig<'a> { /// constant time. Operations on history are parallel but will become more expensive as the log size increases. pub fn log_retention_duration(&self) -> Duration { lazy_static! { - static ref DEFAULT_DURATION: Duration = parse_interval("interval 30 day").unwrap(); + static ref DEFAULT_DURATION: Duration = parse_interval("interval 30 days").unwrap(); } self.0 .get(DeltaConfigKey::LogRetentionDuration.as_ref()) @@ -362,7 +365,7 @@ impl<'a> TableConfig<'a> { } } -#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] +#[derive(Serialize, Deserialize, Debug, Copy, Clone, PartialEq)] /// The isolation level applied during transaction pub enum IsolationLevel { /// The strongest isolation level. It ensures that committed write operations @@ -460,49 +463,6 @@ impl FromStr for CheckpointPolicy { } } -#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] -/// The Column Mapping modes used for reading and writing data -#[serde(rename_all = "camelCase")] -pub enum ColumnMappingMode { - /// No column mapping is applied - None, - /// Columns are mapped by their field_id in parquet - Id, - /// Columns are mapped to a physical name - Name, -} - -impl Default for ColumnMappingMode { - fn default() -> Self { - Self::None - } -} - -impl AsRef for ColumnMappingMode { - fn as_ref(&self) -> &str { - match self { - Self::None => "none", - Self::Id => "id", - Self::Name => "name", - } - } -} - -impl FromStr for ColumnMappingMode { - type Err = DeltaTableError; - - fn from_str(s: &str) -> Result { - match s.to_ascii_lowercase().as_str() { - "none" => Ok(Self::None), - "id" => Ok(Self::Id), - "name" => Ok(Self::Name), - _ => Err(DeltaTableError::Generic( - "Invalid string for ColumnMappingMode".into(), - )), - } - } -} - const SECONDS_PER_MINUTE: u64 = 60; const SECONDS_PER_HOUR: u64 = 60 * SECONDS_PER_MINUTE; const SECONDS_PER_DAY: u64 = 24 * SECONDS_PER_HOUR; @@ -525,14 +485,14 @@ fn parse_interval(value: &str) -> Result { let number = number as u64; let duration = match it.next().ok_or_else(not_an_interval)? 
{ - "nanosecond" => Duration::from_nanos(number), - "microsecond" => Duration::from_micros(number), - "millisecond" => Duration::from_millis(number), - "second" => Duration::from_secs(number), - "minute" => Duration::from_secs(number * SECONDS_PER_MINUTE), - "hour" => Duration::from_secs(number * SECONDS_PER_HOUR), - "day" => Duration::from_secs(number * SECONDS_PER_DAY), - "week" => Duration::from_secs(number * SECONDS_PER_WEEK), + "nanosecond" | "nanoseconds" => Duration::from_nanos(number), + "microsecond" | "microseconds" => Duration::from_micros(number), + "millisecond" | "milliseconds" => Duration::from_millis(number), + "second" | "seconds" => Duration::from_secs(number), + "minute" | "minutes" => Duration::from_secs(number * SECONDS_PER_MINUTE), + "hour" | "hours" => Duration::from_secs(number * SECONDS_PER_HOUR), + "day" | "days" => Duration::from_secs(number * SECONDS_PER_DAY), + "week" | "weeks" => Duration::from_secs(number * SECONDS_PER_WEEK), unit => { return Err(DeltaConfigError::Validation(format!( "Unknown unit '{unit}'" @@ -591,7 +551,7 @@ mod tests { fn get_long_from_metadata_test() { let md = dummy_metadata(); let config = TableConfig(&md.configuration); - assert_eq!(config.checkpoint_interval(), 10,) + assert_eq!(config.checkpoint_interval(), 100,) } #[test] @@ -620,36 +580,76 @@ mod tests { Duration::from_nanos(123) ); + assert_eq!( + parse_interval("interval 123 nanoseconds").unwrap(), + Duration::from_nanos(123) + ); + assert_eq!( parse_interval("interval 123 microsecond").unwrap(), Duration::from_micros(123) ); + assert_eq!( + parse_interval("interval 123 microseconds").unwrap(), + Duration::from_micros(123) + ); + assert_eq!( parse_interval("interval 123 millisecond").unwrap(), Duration::from_millis(123) ); + assert_eq!( + parse_interval("interval 123 milliseconds").unwrap(), + Duration::from_millis(123) + ); + assert_eq!( parse_interval("interval 123 second").unwrap(), Duration::from_secs(123) ); + assert_eq!( + parse_interval("interval 123 seconds").unwrap(), + Duration::from_secs(123) + ); + assert_eq!( parse_interval("interval 123 minute").unwrap(), Duration::from_secs(123 * 60) ); + assert_eq!( + parse_interval("interval 123 minutes").unwrap(), + Duration::from_secs(123 * 60) + ); + assert_eq!( parse_interval("interval 123 hour").unwrap(), Duration::from_secs(123 * 3600) ); + assert_eq!( + parse_interval("interval 123 hours").unwrap(), + Duration::from_secs(123 * 3600) + ); + assert_eq!( parse_interval("interval 123 day").unwrap(), Duration::from_secs(123 * 86400) ); + assert_eq!( + parse_interval("interval 123 days").unwrap(), + Duration::from_secs(123 * 86400) + ); + + assert_eq!( + parse_interval("interval 123 week").unwrap(), + Duration::from_secs(123 * 604800) + ); + assert_eq!( parse_interval("interval 123 week").unwrap(), Duration::from_secs(123 * 604800) diff --git a/crates/core/src/table/mod.rs b/crates/core/src/table/mod.rs index 7615c72dc3..10ca7bd770 100644 --- a/crates/core/src/table/mod.rs +++ b/crates/core/src/table/mod.rs @@ -1,24 +1,24 @@ //! 
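Reviewer note: `parse_interval` now accepts both singular and plural unit spellings (matching strings such as `interval 1 weeks` and `interval 30 days` used for the retention defaults above). Since the function is private to `table/config.rs`, the sketch below is written as one more unit test alongside the existing ones rather than as external API usage; the test name is made up:

```rust
#[test]
fn plural_and_singular_interval_units_agree() {
    use std::time::Duration;

    // Both spellings should map to the same Duration.
    assert_eq!(
        parse_interval("interval 1 week").unwrap(),
        parse_interval("interval 1 weeks").unwrap(),
    );
    assert_eq!(
        parse_interval("interval 30 days").unwrap(),
        Duration::from_secs(30 * 86_400),
    );
}
```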
Delta Table read and write implementation -use std::cmp::Ordering; +use std::cmp::{min, Ordering}; use std::collections::HashMap; use std::fmt; use std::fmt::Formatter; use chrono::{DateTime, Utc}; -use futures::TryStreamExt; +use futures::{StreamExt, TryStreamExt}; use object_store::{path::Path, ObjectStore}; use serde::de::{Error, SeqAccess, Visitor}; use serde::ser::SerializeSeq; use serde::{Deserialize, Deserializer, Serialize, Serializer}; -use tracing::debug; use self::builder::DeltaTableConfig; use self::state::DeltaTableState; use crate::kernel::{ Action, CommitInfo, DataCheck, DataType, LogicalFile, Metadata, Protocol, StructType, + Transaction, }; -use crate::logstore::{self, LogStoreConfig, LogStoreRef}; +use crate::logstore::{self, extract_version_from_filename, LogStoreConfig, LogStoreRef}; use crate::partitions::PartitionFilter; use crate::storage::{commit_uri_from_version, ObjectStoreRef}; use crate::{DeltaResult, DeltaTableError}; @@ -163,7 +163,6 @@ pub(crate) fn get_partition_col_data_types<'a>( // When loading `partitionValues_parsed` we have to convert the stringified partition values back to the correct data type. schema .fields() - .iter() .filter_map(|f| { if metadata .partition_columns @@ -190,6 +189,7 @@ pub enum PeekCommit { } /// In memory representation of a Delta Table +#[derive(Clone)] pub struct DeltaTable { /// The state of the table as of the most recent loaded Delta log entry. pub state: Option, @@ -287,6 +287,11 @@ impl DeltaTable { self.log_store.object_store() } + /// Check if the [`DeltaTable`] exists + pub async fn verify_deltatable_existence(&self) -> DeltaResult { + self.log_store.is_delta_table_location().await + } + /// The URI of the underlying data pub fn table_uri(&self) -> String { self.log_store.root_uri() @@ -340,10 +345,6 @@ impl DeltaTable { &mut self, max_version: Option, ) -> Result<(), DeltaTableError> { - debug!( - "incremental update with version({}) and max_version({max_version:?})", - self.version(), - ); match self.state.as_mut() { Some(state) => state.update(self.log_store.clone(), max_version).await, _ => { @@ -461,7 +462,7 @@ impl DeltaTable { .map(|path| self.log_store.to_uri(&path))) } - /// Get the number of files in the table - retrn 0 if no metadata is loaded + /// Get the number of files in the table - returns 0 if no metadata is loaded pub fn get_files_count(&self) -> usize { self.state.as_ref().map(|s| s.files_count()).unwrap_or(0) } @@ -486,10 +487,11 @@ impl DeltaTable { } /// Returns the current version of the DeltaTable based on the loaded metadata. 
- pub fn get_app_transaction_version(&self) -> HashMap { + pub fn get_app_transaction_version(&self) -> HashMap { self.state .as_ref() - .map(|s| s.app_transaction_version().clone()) + .and_then(|s| s.app_transaction_version().ok()) + .map(|it| it.map(|t| (t.app_id.clone(), t)).collect()) .unwrap_or_default() } @@ -513,9 +515,29 @@ impl DeltaTable { &mut self, datetime: DateTime, ) -> Result<(), DeltaTableError> { - let mut min_version = 0; + let mut min_version: i64 = -1; + let log_store = self.log_store(); + let prefix = Some(log_store.log_path()); + let offset_path = commit_uri_from_version(min_version); + let object_store = log_store.object_store(); + let mut files = object_store.list_with_offset(prefix, &offset_path); + + while let Some(obj_meta) = files.next().await { + let obj_meta = obj_meta?; + if let Some(log_version) = extract_version_from_filename(obj_meta.location.as_ref()) { + if min_version == -1 { + min_version = log_version + } else { + min_version = min(min_version, log_version); + } + } + if min_version == 0 { + break; + } + } let mut max_version = self.get_latest_version().await?; let mut version = min_version; + let lowest_table_version = min_version; let target_ts = datetime.timestamp_millis(); // binary search @@ -537,8 +559,8 @@ impl DeltaTable { } } - if version < 0 { - version = 0; + if version < lowest_table_version { + version = lowest_table_version; } self.load_version(version).await diff --git a/crates/core/src/table/state.rs b/crates/core/src/table/state.rs index ab5c229c49..9544198581 100644 --- a/crates/core/src/table/state.rs +++ b/crates/core/src/table/state.rs @@ -1,6 +1,6 @@ //! The module for delta table state. -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::sync::Arc; use chrono::Utc; @@ -8,22 +8,21 @@ use futures::TryStreamExt; use object_store::{path::Path, ObjectStore}; use serde::{Deserialize, Serialize}; -use super::config::TableConfig; -use super::{get_partition_col_data_types, DeltaTableConfig}; +use super::{config::TableConfig, get_partition_col_data_types, DeltaTableConfig}; +#[cfg(test)] +use crate::kernel::Action; use crate::kernel::{ - Action, Add, DataType, EagerSnapshot, LogDataHandler, LogicalFile, Metadata, Protocol, Remove, - StructType, + ActionType, Add, AddCDCFile, DataType, EagerSnapshot, LogDataHandler, LogicalFile, Metadata, + Protocol, Remove, StructType, Transaction, }; use crate::logstore::LogStore; use crate::partitions::{DeltaTablePartition, PartitionFilter}; -use crate::protocol::DeltaOperation; use crate::{DeltaResult, DeltaTableError}; /// State snapshot currently held by the Delta Table instance. 
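The timestamp-based load above no longer assumes commit 0 still exists; it lists `_delta_log` with an offset to find the oldest surviving commit and clamps the binary search to that floor. A minimal time-travel sketch using the public API (assuming `open_table` and `chrono`, as used in the tests elsewhere in this patch):

```rust
use chrono::{DateTime, Utc};
use deltalake_core::{open_table, DeltaResult, DeltaTable};

/// Load the table as it existed at (or just before) `point_in_time`.
/// If older commits have been cleaned up, the search now bottoms out at the
/// oldest commit still present in `_delta_log` rather than at version 0.
async fn table_as_of(uri: &str, point_in_time: DateTime<Utc>) -> DeltaResult<DeltaTable> {
    let mut table = open_table(uri).await?;
    table.load_with_datetime(point_in_time).await?;
    Ok(table)
}
```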
#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] pub struct DeltaTableState { - app_transaction_version: HashMap, pub(crate) snapshot: EagerSnapshot, } @@ -35,11 +34,15 @@ impl DeltaTableState { config: DeltaTableConfig, version: Option, ) -> DeltaResult { - let snapshot = EagerSnapshot::try_new(table_root, store.clone(), config, version).await?; - Ok(Self { - snapshot, - app_transaction_version: HashMap::new(), - }) + let snapshot = EagerSnapshot::try_new_with_visitor( + table_root, + store.clone(), + config, + version, + HashSet::from([ActionType::Txn]), + ) + .await?; + Ok(Self { snapshot }) } /// Return table version @@ -57,7 +60,9 @@ impl DeltaTableState { /// Construct a delta table state object from a list of actions #[cfg(test)] pub fn from_actions(actions: Vec) -> DeltaResult { - use crate::protocol::SaveMode; + use crate::operations::transaction::CommitData; + use crate::protocol::{DeltaOperation, SaveMode}; + let metadata = actions .iter() .find_map(|a| match a { @@ -73,7 +78,7 @@ impl DeltaTableState { }) .ok_or(DeltaTableError::NotInitialized)?; - let commit_data = [( + let commit_data = [CommitData::new( actions, DeltaOperation::Create { mode: SaveMode::Append, @@ -81,13 +86,12 @@ impl DeltaTableState { protocol: protocol.clone(), metadata: metadata.clone(), }, - None, + HashMap::new(), + Vec::new(), )]; + let snapshot = EagerSnapshot::new_test(&commit_data).unwrap(); - Ok(Self { - app_transaction_version: Default::default(), - snapshot, - }) + Ok(Self { snapshot }) } /// Returns a semantic accessor to the currently loaded log data. @@ -133,11 +137,22 @@ impl DeltaTableState { Ok(self.snapshot.file_actions()?.collect()) } + /// Full list of add actions representing all parquet files that are part of the current + /// delta table state. + pub fn file_actions_iter(&self) -> DeltaResult + '_> { + self.snapshot.file_actions() + } + /// Get the number of files in the current table state pub fn files_count(&self) -> usize { self.snapshot.files_count() } + /// Full list of all of the CDC files added as part of the changeDataFeed feature + pub fn cdc_files(&self) -> DeltaResult + '_> { + self.snapshot.cdc_files() + } + /// Returns an iterator of file names present in the loaded state #[inline] pub fn file_paths_iter(&self) -> impl Iterator + '_ { @@ -146,10 +161,9 @@ impl DeltaTableState { .map(|add| add.object_store_path()) } - /// HashMap containing the last txn version stored for every app id writing txn - /// actions. - pub fn app_transaction_version(&self) -> &HashMap { - &self.app_transaction_version + /// HashMap containing the last transaction stored for every application. + pub fn app_transaction_version(&self) -> DeltaResult + '_> { + self.snapshot.transactions() } /// The most recent protocol of the table. @@ -172,26 +186,9 @@ impl DeltaTableState { self.snapshot.table_config() } - /// Merges new state information into our state - /// - /// The DeltaTableState also carries the version information for the given state, - /// as there is a one-to-one match between a table state and a version. In merge/update - /// scenarios we cannot infer the intended / correct version number. By default this - /// function will update the tracked version if the version on `new_state` is larger then the - /// currently set version however it is up to the caller to update the `version` field according - /// to the version the merged state represents. 
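With the snapshot now replaying `txn` actions through the visitor, per-application transaction versions come back as kernel `Transaction` values rather than a cached `HashMap<String, i64>`. A sketch of the usual idempotent-writer check on top of the matching `DeltaTable::get_app_transaction_version` change above; the `version` field name is an assumption about the `Transaction` action shape:

```rust
use deltalake_core::DeltaTable;

/// Returns the last transaction version committed by `app_id`, if any.
fn last_committed_version(table: &DeltaTable, app_id: &str) -> Option<i64> {
    table
        .get_app_transaction_version()
        .get(app_id)
        // `version` is assumed to be the monotonically increasing value
        // recorded by the writer in its txn action.
        .map(|txn| txn.version)
}

/// Writers compare their own version against the recorded one to skip
/// commits that were already applied.
fn should_commit(table: &DeltaTable, app_id: &str, my_version: i64) -> bool {
    match last_committed_version(table, app_id) {
        Some(committed) => my_version > committed,
        None => true,
    }
}
```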
- pub(crate) fn merge( - &mut self, - actions: Vec, - operation: &DeltaOperation, - version: i64, - ) -> Result<(), DeltaTableError> { - let commit_infos = vec![(actions, operation.clone(), None)]; - let new_version = self.snapshot.advance(&commit_infos)?; - if new_version != version { - return Err(DeltaTableError::Generic("Version mismatch".to_string())); - } - Ok(()) + /// Obtain the Eager snapshot of the state + pub fn snapshot(&self) -> &EagerSnapshot { + &self.snapshot } /// Update the state of the table to the given version. diff --git a/crates/core/src/table/state_arrow.rs b/crates/core/src/table/state_arrow.rs index 143ab23d1c..197e8d7fd3 100644 --- a/crates/core/src/table/state_arrow.rs +++ b/crates/core/src/table/state_arrow.rs @@ -14,9 +14,9 @@ use arrow_array::{ StringArray, StructArray, TimestampMicrosecondArray, TimestampMillisecondArray, }; use arrow_schema::{DataType, Field, Fields, TimeUnit}; +use delta_kernel::features::ColumnMappingMode; use itertools::Itertools; -use super::config::ColumnMappingMode; use super::state::DeltaTableState; use crate::errors::DeltaTableError; use crate::kernel::{Add, DataType as DeltaDataType, StructType}; @@ -91,7 +91,7 @@ impl DeltaTableState { .fields .iter() .map(|field| Cow::Owned(field.name().clone())) - .zip(partition_cols_batch.columns().iter().map(Arc::clone)), + .zip(partition_cols_batch.columns().iter().cloned()), ) } @@ -103,7 +103,7 @@ impl DeltaTableState { .fields .iter() .map(|field| Cow::Owned(field.name().clone())) - .zip(stats.columns().iter().map(Arc::clone)), + .zip(stats.columns().iter().cloned()), ); } if files.iter().any(|add| add.deletion_vector.is_some()) { @@ -114,7 +114,7 @@ impl DeltaTableState { .fields .iter() .map(|field| Cow::Owned(field.name().clone())) - .zip(delvs.columns().iter().map(Arc::clone)), + .zip(delvs.columns().iter().cloned()), ); } if files.iter().any(|add| { @@ -129,7 +129,7 @@ impl DeltaTableState { .fields .iter() .map(|field| Cow::Owned(field.name().clone())) - .zip(tags.columns().iter().map(Arc::clone)), + .zip(tags.columns().iter().cloned()), ); } @@ -149,7 +149,13 @@ impl DeltaTableState { .map( |name| -> Result { let schema = metadata.schema()?; - let field = schema.field_with_name(name)?; + let field = + schema + .field(name) + .ok_or(DeltaTableError::MetadataError(format!( + "Invalid partition column {0}", + name + )))?; Ok(field.data_type().try_into()?) }, ) @@ -173,12 +179,12 @@ impl DeltaTableState { .map(|name| -> Result<_, DeltaTableError> { let physical_name = self .schema() - .field_with_name(name) - .or(Err(DeltaTableError::MetadataError(format!( + .field(name) + .ok_or(DeltaTableError::MetadataError(format!( "Invalid partition column {0}", name - ))))? - .physical_name()? + )))? + .physical_name(column_mapping_mode)? .to_string(); Ok((physical_name, name.as_str())) }) @@ -328,7 +334,7 @@ impl DeltaTableState { for add in files { if let Some(value) = &add.deletion_vector { - storage_type.append_value(&value.storage_type); + storage_type.append_value(value.storage_type); path_or_inline_div.append_value(value.path_or_inline_dv.clone()); if let Some(ofs) = value.offset { offset.append_value(ofs); @@ -397,8 +403,7 @@ impl DeltaTableState { flatten: bool, ) -> Result { let stats: Vec> = self - .file_actions()? - .iter() + .file_actions_iter()? 
.map(|f| { f.get_stats() .map_err(|err| DeltaTableError::InvalidStatsJson { json_err: err }) @@ -447,11 +452,12 @@ impl DeltaTableState { .map(|(path, datatype)| -> Result { let null_count = stats .iter() - .flat_map(|maybe_stat| { + .map(|maybe_stat| { maybe_stat .as_ref() .map(|stat| resolve_column_count_stat(&stat.null_count, &path)) }) + .map(|null_count| null_count.flatten()) .collect::>>(); let null_count = Some(value_vec_to_array(null_count, |values| { Ok(Arc::new(arrow::array::Int64Array::from(values))) @@ -463,11 +469,12 @@ impl DeltaTableState { let min_values = if matches!(datatype, DeltaDataType::Primitive(_)) { let min_values = stats .iter() - .flat_map(|maybe_stat| { + .map(|maybe_stat| { maybe_stat .as_ref() .map(|stat| resolve_column_value_stat(&stat.min_values, &path)) }) + .map(|min_value| min_value.flatten()) .collect::>>(); Some(value_vec_to_array(min_values, |values| { @@ -480,11 +487,12 @@ impl DeltaTableState { let max_values = if matches!(datatype, DeltaDataType::Primitive(_)) { let max_values = stats .iter() - .flat_map(|maybe_stat| { + .map(|maybe_stat| { maybe_stat .as_ref() .map(|stat| resolve_column_value_stat(&stat.max_values, &path)) }) + .map(|max_value| max_value.flatten()) .collect::>>(); Some(value_vec_to_array(max_values, |values| { json_value_to_array_general(&arrow_type, values.into_iter()) @@ -570,7 +578,7 @@ impl DeltaTableState { // into StructArrays, until it is consolidated into a single array. columnar_stats = columnar_stats .into_iter() - .group_by(|col_stat| { + .chunk_by(|col_stat| { if col_stat.path.len() < level { col_stat.path.clone() } else { @@ -672,7 +680,6 @@ impl<'a> SchemaLeafIterator<'a> { SchemaLeafIterator { fields_remaining: schema .fields() - .iter() .map(|field| (vec![field.name().as_ref()], field.data_type())) .collect(), } @@ -737,8 +744,8 @@ fn json_value_to_array_general<'a>( .map(|value| value.and_then(|value| value.as_str().map(|value| value.as_bytes()))) .collect_vec(), ))), - DataType::Timestamp(TimeUnit::Microsecond, None) => { - Ok(Arc::new(TimestampMicrosecondArray::from( + DataType::Timestamp(TimeUnit::Microsecond, tz) => match tz { + None => Ok(Arc::new(TimestampMicrosecondArray::from( values .map(|value| { value.and_then(|value| { @@ -746,13 +753,32 @@ fn json_value_to_array_general<'a>( }) }) .collect_vec(), - ))) - } + ))), + Some(tz_str) if tz_str.as_ref() == "UTC" => Ok(Arc::new( + TimestampMicrosecondArray::from( + values + .map(|value| { + value.and_then(|value| { + value.as_str().and_then(TimestampMicrosecondType::parse) + }) + }) + .collect_vec(), + ) + .with_timezone("UTC"), + )), + _ => Err(DeltaTableError::Generic(format!( + "Invalid datatype {}", + datatype + ))), + }, DataType::Date32 => Ok(Arc::new(Date32Array::from( values .map(|value| value.and_then(|value| value.as_str().and_then(Date32Type::parse))) .collect_vec(), ))), - _ => Err(DeltaTableError::Generic("Invalid datatype".to_string())), + _ => Err(DeltaTableError::Generic(format!( + "Invalid datatype {}", + datatype + ))), } } diff --git a/crates/core/src/writer/json.rs b/crates/core/src/writer/json.rs index a51dd86b58..2cf7f6a950 100644 --- a/crates/core/src/writer/json.rs +++ b/crates/core/src/writer/json.rs @@ -1,11 +1,13 @@ //! 
Main writer API to write json messages to delta table -use std::collections::{BTreeMap, HashMap}; +use std::collections::HashMap; use std::convert::TryFrom; use std::sync::Arc; use arrow::datatypes::{Schema as ArrowSchema, SchemaRef as ArrowSchemaRef}; use arrow::record_batch::*; use bytes::Bytes; +use delta_kernel::expressions::Scalar; +use indexmap::IndexMap; use object_store::path::Path; use object_store::ObjectStore; use parquet::{ @@ -21,10 +23,12 @@ use super::utils::{ arrow_schema_without_partitions, next_data_path, record_batch_from_message, record_batch_without_partitions, }; -use super::{DeltaWriter, DeltaWriterError}; +use super::{DeltaWriter, DeltaWriterError, WriteMode}; use crate::errors::DeltaTableError; -use crate::kernel::{Add, PartitionsExt, Scalar, StructType}; +use crate::kernel::{scalars::ScalarExt, Add, PartitionsExt, StructType}; +use crate::storage::ObjectStoreRetryExt; use crate::table::builder::DeltaTableBuilder; +use crate::table::config::DEFAULT_NUM_INDEX_COLS; use crate::writer::utils::ShareableBuffer; use crate::DeltaTable; @@ -45,7 +49,7 @@ pub(crate) struct DataArrowWriter { writer_properties: WriterProperties, buffer: ShareableBuffer, arrow_writer: ArrowWriter, - partition_values: BTreeMap, + partition_values: IndexMap, buffered_record_batch_count: usize, } @@ -153,7 +157,7 @@ impl DataArrowWriter { writer_properties.clone(), )?; - let partition_values = BTreeMap::new(); + let partition_values = IndexMap::new(); let buffered_record_batch_count = 0; Ok(Self { @@ -285,8 +289,20 @@ impl JsonWriter { #[async_trait::async_trait] impl DeltaWriter> for JsonWriter { - /// Writes the given values to internal parquet buffers for each represented partition. + /// Write a chunk of values into the internal write buffers with the default write mode async fn write(&mut self, values: Vec) -> Result<(), DeltaTableError> { + self.write_with_mode(values, WriteMode::Default).await + } + + /// Writes the given values to internal parquet buffers for each represented partition. 
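For reference, a minimal end-to-end sketch of the `JsonWriter` path touched above, mirroring the constructor arguments used in the tests added later in this patch (table URI, partition column, and row contents are illustrative):

```rust
use std::sync::Arc;

use arrow_schema::{DataType, Field, Schema as ArrowSchema};
use deltalake_core::writer::{DeltaWriter, JsonWriter};
use deltalake_core::{open_table, DeltaResult};
use serde_json::json;

async fn append_json_rows(table_uri: &str) -> DeltaResult<()> {
    let mut table = open_table(table_uri).await?;

    // Schema and partition column are illustrative and should match the table.
    let arrow_schema = Arc::new(ArrowSchema::new(vec![
        Field::new("id", DataType::Utf8, true),
        Field::new("value", DataType::Int32, true),
        Field::new("modified", DataType::Utf8, true),
    ]));
    let mut writer = JsonWriter::try_new(
        table_uri.to_string(),
        arrow_schema,
        Some(vec!["modified".to_string()]),
        None,
    )?;

    // write() now forwards to write_with_mode(values, WriteMode::Default).
    writer
        .write(vec![json!({"id": "A", "value": 42, "modified": "2021-02-01"})])
        .await?;

    // Flush buffered parquet files and commit the resulting Add actions.
    writer.flush_and_commit(&mut table).await?;
    Ok(())
}
```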
+ async fn write_with_mode( + &mut self, + values: Vec, + mode: WriteMode, + ) -> Result<(), DeltaTableError> { + if mode != WriteMode::Default { + warn!("The JsonWriter does not currently support non-default write modes, falling back to default mode"); + } let mut partial_writes: Vec<(Value, ParquetError)> = Vec::new(); let arrow_schema = self.arrow_schema(); let divided = self.divide_by_partition_values(values)?; @@ -347,13 +363,17 @@ impl DeltaWriter> for JsonWriter { let path = next_data_path(&prefix, 0, &uuid, &writer.writer_properties); let obj_bytes = Bytes::from(writer.buffer.to_vec()); let file_size = obj_bytes.len() as i64; - self.storage.put(&path, obj_bytes).await?; + self.storage + .put_with_retries(&path, obj_bytes.into(), 15) + .await?; actions.push(create_add( &writer.partition_values, path.to_string(), file_size, &metadata, + DEFAULT_NUM_INDEX_COLS, + &None, )?); } Ok(actions) @@ -397,8 +417,8 @@ fn quarantine_failed_parquet_rows( fn extract_partition_values( partition_cols: &[String], record_batch: &RecordBatch, -) -> Result, DeltaWriterError> { - let mut partition_values = BTreeMap::new(); +) -> Result, DeltaWriterError> { + let mut partition_values = IndexMap::new(); for col_name in partition_cols.iter() { let arrow_schema = record_batch.schema(); @@ -499,7 +519,7 @@ mod tests { &record_batch ) .unwrap(), - BTreeMap::from([ + IndexMap::from([ (String::from("col1"), Scalar::Integer(1)), (String::from("col2"), Scalar::Integer(2)), (String::from("col3"), Scalar::Null(DataType::INTEGER)), @@ -507,7 +527,7 @@ mod tests { ); assert_eq!( extract_partition_values(&[String::from("col1")], &record_batch).unwrap(), - BTreeMap::from([(String::from("col1"), Scalar::Integer(1)),]) + IndexMap::from([(String::from("col1"), Scalar::Integer(1)),]) ); assert!(extract_partition_values(&[String::from("col4")], &record_batch).is_err()) } @@ -543,4 +563,100 @@ mod tests { }) )); } + + // The following sets of tests are related to #1386 and mergeSchema support + // + mod schema_evolution { + use super::*; + + #[tokio::test] + async fn test_json_write_mismatched_values() { + let table_dir = tempfile::tempdir().unwrap(); + let schema = get_delta_schema(); + let path = table_dir.path().to_str().unwrap().to_string(); + + let arrow_schema = >::try_from(&schema).unwrap(); + let mut writer = JsonWriter::try_new( + path.clone(), + Arc::new(arrow_schema), + Some(vec!["modified".to_string()]), + None, + ) + .unwrap(); + + let data = serde_json::json!( + { + "id" : "A", + "value": 42, + "modified": "2021-02-01" + } + ); + + writer.write(vec![data]).await.unwrap(); + let add_actions = writer.flush().await.unwrap(); + assert_eq!(add_actions.len(), 1); + + let second_data = serde_json::json!( + { + "id" : 1, + "name" : "Ion" + } + ); + + if writer.write(vec![second_data]).await.is_ok() { + panic!("Should not have successfully written"); + } + } + + #[tokio::test] + async fn test_json_write_mismatched_schema() { + use crate::operations::create::CreateBuilder; + let table_dir = tempfile::tempdir().unwrap(); + let schema = get_delta_schema(); + let path = table_dir.path().to_str().unwrap().to_string(); + + let mut table = CreateBuilder::new() + .with_location(&path) + .with_table_name("test-table") + .with_comment("A table for running tests") + .with_columns(schema.fields().cloned()) + .await + .unwrap(); + table.load().await.expect("Failed to load table"); + assert_eq!(table.version(), 0); + + let arrow_schema = >::try_from(&schema).unwrap(); + let mut writer = JsonWriter::try_new( + path.clone(), + 
Arc::new(arrow_schema), + Some(vec!["modified".to_string()]), + None, + ) + .unwrap(); + + let data = serde_json::json!( + { + "id" : "A", + "value": 42, + "modified": "2021-02-01" + } + ); + + writer.write(vec![data]).await.unwrap(); + let add_actions = writer.flush().await.unwrap(); + assert_eq!(add_actions.len(), 1); + + let second_data = serde_json::json!( + { + "postcode" : 1, + "name" : "Ion" + } + ); + + // TODO This should fail because we haven't asked to evolve the schema + writer.write(vec![second_data]).await.unwrap(); + writer.flush_and_commit(&mut table).await.unwrap(); + assert_eq!(table.version(), 1); + } + } } diff --git a/crates/core/src/writer/mod.rs b/crates/core/src/writer/mod.rs index b39c8264cb..d3fe529a89 100644 --- a/crates/core/src/writer/mod.rs +++ b/crates/core/src/writer/mod.rs @@ -8,7 +8,7 @@ use serde_json::Value; use crate::errors::DeltaTableError; use crate::kernel::{Action, Add}; -use crate::operations::transaction::commit; +use crate::operations::transaction::CommitBuilder; use crate::protocol::{ColumnCountStat, DeltaOperation, SaveMode}; use crate::DeltaTable; @@ -116,17 +116,34 @@ impl From for DeltaTableError { DeltaWriterError::ObjectStore { source } => DeltaTableError::ObjectStore { source }, DeltaWriterError::Parquet { source } => DeltaTableError::Parquet { source }, DeltaWriterError::DeltaTable(e) => e, + DeltaWriterError::SchemaMismatch { .. } => DeltaTableError::SchemaMismatch { + msg: err.to_string(), + }, _ => DeltaTableError::Generic(err.to_string()), } } } +/// Write mode for the [DeltaWriter] +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum WriteMode { + /// Default write mode which will return an error if schemas do not match correctly + Default, + /// Merge the schema of the table with the newly written data + /// + /// [Read more here](https://delta.io/blog/2023-02-08-delta-lake-schema-evolution/) + MergeSchema, +} + #[async_trait] /// Trait for writing data to Delta tables pub trait DeltaWriter { - /// write a chunk of values into the internal write buffers. + /// Write a chunk of values into the internal write buffers with the default write mode async fn write(&mut self, values: T) -> Result<(), DeltaTableError>; + /// Wreite a chunk of values into the internal write buffers with the specified [WriteMode] + async fn write_with_mode(&mut self, values: T, mode: WriteMode) -> Result<(), DeltaTableError>; + /// Flush the internal write buffers to files in the delta table folder structure. /// The corresponding delta [`Add`] actions are returned and should be committed via a transaction. async fn flush(&mut self) -> Result, DeltaTableError>; @@ -135,27 +152,33 @@ pub trait DeltaWriter { /// and commit the changes to the Delta log, creating a new table version. 
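A usage sketch for the new `WriteMode::MergeSchema`, following the `RecordBatchWriter` test added further down in this patch (column names and values are illustrative; the target table is assumed to be unpartitioned, since merging with partition columns present is rejected):

```rust
use std::sync::Arc;

use arrow_array::{Int32Array, RecordBatch, StringArray};
use arrow_schema::{DataType, Field, Schema as ArrowSchema};
use deltalake_core::writer::{DeltaWriter, RecordBatchWriter, WriteMode};
use deltalake_core::{DeltaResult, DeltaTable};

/// Append a batch whose schema carries an extra column, letting the writer
/// evolve the table schema instead of rejecting the mismatch.
async fn append_with_evolution(table: &mut DeltaTable) -> DeltaResult<i64> {
    let schema = Arc::new(ArrowSchema::new(vec![
        Field::new("id", DataType::Utf8, true),
        Field::new("value", DataType::Int32, true),
        // A column the table does not know about yet.
        Field::new("vid", DataType::Int32, true),
    ]));
    let batch = RecordBatch::try_new(
        schema,
        vec![
            Arc::new(StringArray::from(vec![Some("A"), Some("B")])),
            Arc::new(Int32Array::from(vec![Some(1), Some(2)])),
            Arc::new(Int32Array::from(vec![Some(10), Some(20)])),
        ],
    )?;

    let mut writer = RecordBatchWriter::for_table(table)?;
    writer.write_with_mode(batch, WriteMode::MergeSchema).await?;
    // flush_and_commit appends a Metadata action carrying the evolved schema
    // alongside the Add actions when evolution actually happened.
    writer.flush_and_commit(table).await
}
```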
async fn flush_and_commit(&mut self, table: &mut DeltaTable) -> Result { let adds: Vec<_> = self.flush().await?.drain(..).map(Action::Add).collect(); - let snapshot = table.snapshot()?; - let partition_cols = snapshot.metadata().partition_columns.clone(); - let partition_by = if !partition_cols.is_empty() { - Some(partition_cols) - } else { - None - }; - let operation = DeltaOperation::Write { - mode: SaveMode::Append, - partition_by, - predicate: None, - }; - let version = commit( - table.log_store.as_ref(), - &adds, - operation, - Some(snapshot), - None, - ) - .await?; - table.update().await?; - Ok(version) + flush_and_commit(adds, table).await } } + +/// Method for flushing to be used by writers +pub(crate) async fn flush_and_commit( + adds: Vec, + table: &mut DeltaTable, +) -> Result { + let snapshot = table.snapshot()?; + let partition_cols = snapshot.metadata().partition_columns.clone(); + let partition_by = if !partition_cols.is_empty() { + Some(partition_cols) + } else { + None + }; + let operation = DeltaOperation::Write { + mode: SaveMode::Append, + partition_by, + predicate: None, + }; + + let version = CommitBuilder::default() + .with_actions(adds) + .build(Some(snapshot), table.log_store.clone(), operation) + .await? + .version(); + table.update().await?; + Ok(version) +} diff --git a/crates/core/src/writer/record_batch.rs b/crates/core/src/writer/record_batch.rs index 48525a3335..d99673c8cb 100644 --- a/crates/core/src/writer/record_batch.rs +++ b/crates/core/src/writer/record_batch.rs @@ -5,19 +5,21 @@ //! the writer. Once written, add actions are returned by the writer. It's the users responsibility //! to create the transaction using those actions. -use std::collections::BTreeMap; use std::{collections::HashMap, sync::Arc}; -use arrow::array::{Array, UInt32Array}; +use arrow::array::{new_null_array, Array, UInt32Array}; use arrow::compute::{partition, take}; use arrow::record_batch::RecordBatch; use arrow_array::ArrayRef; use arrow_row::{RowConverter, SortField}; use arrow_schema::{ArrowError, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef}; use bytes::Bytes; +use delta_kernel::expressions::Scalar; +use indexmap::IndexMap; use object_store::{path::Path, ObjectStore}; use parquet::{arrow::ArrowWriter, errors::ParquetError}; use parquet::{basic::Compression, file::properties::WriterProperties}; +use tracing::log::*; use uuid::Uuid; use super::stats::create_add; @@ -25,17 +27,22 @@ use super::utils::{ arrow_schema_without_partitions, next_data_path, record_batch_without_partitions, ShareableBuffer, }; -use super::{DeltaWriter, DeltaWriterError}; +use super::{DeltaWriter, DeltaWriterError, WriteMode}; use crate::errors::DeltaTableError; -use crate::kernel::{Add, PartitionsExt, Scalar, StructType}; +use crate::kernel::{scalars::ScalarExt, Action, Add, PartitionsExt, StructType}; +use crate::operations::cast::merge_schema; +use crate::storage::ObjectStoreRetryExt; use crate::table::builder::DeltaTableBuilder; +use crate::table::config::DEFAULT_NUM_INDEX_COLS; use crate::DeltaTable; /// Writes messages to a delta lake table. 
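The shared helper above routes commits through `CommitBuilder` instead of the removed free `commit` function. A sketch of committing pre-built actions directly, mirroring the builder usage in the optimize and commit-info tests later in this patch (the `Write`/`Append` operation shown is illustrative):

```rust
use deltalake_core::kernel::Action;
use deltalake_core::operations::transaction::CommitBuilder;
use deltalake_core::protocol::{DeltaOperation, SaveMode};
use deltalake_core::{DeltaResult, DeltaTable};

/// Commit already-prepared actions (e.g. Add/Remove) as a single new version.
async fn commit_actions(table: &mut DeltaTable, actions: Vec<Action>) -> DeltaResult<i64> {
    let operation = DeltaOperation::Write {
        mode: SaveMode::Append,
        partition_by: None,
        predicate: None,
    };
    let version = CommitBuilder::default()
        .with_actions(actions)
        .build(Some(table.snapshot()?), table.log_store(), operation)
        .await?
        .version();
    // Refresh the in-memory state so table.version() reflects the new commit.
    table.update().await?;
    Ok(version)
}
```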
pub struct RecordBatchWriter { storage: Arc, - arrow_schema_ref: Arc, + arrow_schema_ref: ArrowSchemaRef, + original_schema_ref: ArrowSchemaRef, writer_properties: WriterProperties, + should_evolve: bool, partition_columns: Vec, arrow_writers: HashMap, } @@ -67,9 +74,11 @@ impl RecordBatchWriter { Ok(Self { storage, - arrow_schema_ref: schema, + arrow_schema_ref: schema.clone(), + original_schema_ref: schema, writer_properties, partition_columns: partition_columns.unwrap_or_default(), + should_evolve: false, arrow_writers: HashMap::new(), }) } @@ -91,9 +100,11 @@ impl RecordBatchWriter { Ok(Self { storage: table.object_store(), - arrow_schema_ref, + arrow_schema_ref: arrow_schema_ref.clone(), + original_schema_ref: arrow_schema_ref.clone(), writer_properties, partition_columns, + should_evolve: false, arrow_writers: HashMap::new(), }) } @@ -127,30 +138,29 @@ impl RecordBatchWriter { pub async fn write_partition( &mut self, record_batch: RecordBatch, - partition_values: &BTreeMap, - ) -> Result<(), DeltaTableError> { + partition_values: &IndexMap, + mode: WriteMode, + ) -> Result { let arrow_schema = arrow_schema_without_partitions(&self.arrow_schema_ref, &self.partition_columns); let partition_key = partition_values.hive_partition_path(); let record_batch = record_batch_without_partitions(&record_batch, &self.partition_columns)?; - match self.arrow_writers.get_mut(&partition_key) { - Some(writer) => { - writer.write(&record_batch)?; - } + let written_schema = match self.arrow_writers.get_mut(&partition_key) { + Some(writer) => writer.write(&record_batch, mode)?, None => { let mut writer = PartitionWriter::new( arrow_schema, partition_values.clone(), self.writer_properties.clone(), )?; - writer.write(&record_batch)?; + let schema = writer.write(&record_batch, mode)?; let _ = self.arrow_writers.insert(partition_key, writer); + schema } - } - - Ok(()) + }; + Ok(written_schema) } /// Sets the writer properties for the underlying arrow writer. @@ -173,12 +183,32 @@ impl RecordBatchWriter { #[async_trait::async_trait] impl DeltaWriter for RecordBatchWriter { + /// Write a chunk of values into the internal write buffers with the default write mode + async fn write(&mut self, values: RecordBatch) -> Result<(), DeltaTableError> { + self.write_with_mode(values, WriteMode::Default).await + } /// Divides a single record batch into into multiple according to table partitioning. /// Values are written to arrow buffers, to collect data until it should be written to disk. - async fn write(&mut self, values: RecordBatch) -> Result<(), DeltaTableError> { + async fn write_with_mode( + &mut self, + values: RecordBatch, + mode: WriteMode, + ) -> Result<(), DeltaTableError> { + if mode == WriteMode::MergeSchema && !self.partition_columns.is_empty() { + return Err(DeltaTableError::Generic( + "Merging Schemas with partition columns present is currently unsupported" + .to_owned(), + )); + } + // Set the should_evolve flag for later in case the writer should perform schema evolution + // on its flush_and_commit + self.should_evolve = mode == WriteMode::MergeSchema; + for result in self.divide_by_partition_values(&values)? 
{ - self.write_partition(result.record_batch, &result.partition_values) + let schema = self + .write_partition(result.record_batch, &result.partition_values, mode) .await?; + self.arrow_schema_ref = schema; } Ok(()) } @@ -195,41 +225,66 @@ impl DeltaWriter for RecordBatchWriter { let path = next_data_path(&prefix, 0, &uuid, &writer.writer_properties); let obj_bytes = Bytes::from(writer.buffer.to_vec()); let file_size = obj_bytes.len() as i64; - self.storage.put(&path, obj_bytes).await?; + self.storage + .put_with_retries(&path, obj_bytes.into(), 15) + .await?; actions.push(create_add( &writer.partition_values, path.to_string(), file_size, &metadata, + DEFAULT_NUM_INDEX_COLS, + &None, )?); } Ok(actions) } + + /// Flush the internal write buffers to files in the delta table folder structure. + /// and commit the changes to the Delta log, creating a new table version. + async fn flush_and_commit(&mut self, table: &mut DeltaTable) -> Result { + use crate::kernel::{Metadata, StructType}; + let mut adds: Vec = self.flush().await?.drain(..).map(Action::Add).collect(); + + if self.arrow_schema_ref != self.original_schema_ref && self.should_evolve { + let schema: StructType = self.arrow_schema_ref.clone().try_into()?; + if !self.partition_columns.is_empty() { + return Err(DeltaTableError::Generic( + "Merging Schemas with partition columns present is currently unsupported" + .to_owned(), + )); + } + let part_cols: Vec = vec![]; + let metadata = Metadata::try_new(schema, part_cols, HashMap::new())?; + adds.push(Action::Metadata(metadata)); + } + super::flush_and_commit(adds, table).await + } } /// Helper container for partitioned record batches #[derive(Clone, Debug)] pub struct PartitionResult { /// values found in partition columns - pub partition_values: BTreeMap, + pub partition_values: IndexMap, /// remaining dataset with partition column values removed pub record_batch: RecordBatch, } struct PartitionWriter { - arrow_schema: Arc, + arrow_schema: ArrowSchemaRef, writer_properties: WriterProperties, pub(super) buffer: ShareableBuffer, pub(super) arrow_writer: ArrowWriter, - pub(super) partition_values: BTreeMap, + pub(super) partition_values: IndexMap, pub(super) buffered_record_batch_count: usize, } impl PartitionWriter { pub fn new( - arrow_schema: Arc, - partition_values: BTreeMap, + arrow_schema: ArrowSchemaRef, + partition_values: IndexMap, writer_properties: WriterProperties, ) -> Result { let buffer = ShareableBuffer::default(); @@ -254,21 +309,55 @@ impl PartitionWriter { /// Writes the record batch in-memory and updates internal state accordingly. /// This method buffers the write stream internally so it can be invoked for many /// record batches and flushed after the appropriate number of bytes has been written. 
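When `WriteMode::MergeSchema` is in effect, the rewritten `PartitionWriter::write` below merges the incoming batch schema into its own and back-fills columns the batch does not carry with nulls. A standalone sketch of that back-fill step using only the arrow APIs involved (the helper name is hypothetical, not the crate-internal `merge_schema`):

```rust
use std::sync::Arc;

use arrow_array::{new_null_array, Array, ArrayRef, Int32Array, RecordBatch};
use arrow_schema::{DataType, Field, Schema as ArrowSchema, SchemaRef};

/// Project `batch` onto `target_schema`, filling columns the batch lacks with nulls.
/// This is the shape of the back-fill performed for WriteMode::MergeSchema.
fn backfill_missing_columns(
    target_schema: SchemaRef,
    batch: &RecordBatch,
) -> Result<RecordBatch, arrow_schema::ArrowError> {
    let cols: Vec<ArrayRef> = target_schema
        .fields()
        .iter()
        .map(|field| match batch.column_by_name(field.name()) {
            Some(column) => column.clone(),
            None => new_null_array(field.data_type(), batch.num_rows()),
        })
        .collect();
    RecordBatch::try_new(target_schema, cols)
}

fn main() -> Result<(), arrow_schema::ArrowError> {
    // Evolved schema: the writer already knows about `name`, the batch does not.
    let evolved = Arc::new(ArrowSchema::new(vec![
        Field::new("id", DataType::Int32, true),
        Field::new("name", DataType::Utf8, true),
    ]));
    let batch = RecordBatch::try_new(
        Arc::new(ArrowSchema::new(vec![Field::new("id", DataType::Int32, true)])),
        vec![Arc::new(Int32Array::from(vec![Some(1), Some(2)])) as ArrayRef],
    )?;

    let filled = backfill_missing_columns(evolved, &batch)?;
    assert_eq!(filled.num_columns(), 2);
    assert!(filled.column(1).is_null(0)); // `name` was back-filled with nulls
    Ok(())
}
```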
- pub fn write(&mut self, record_batch: &RecordBatch) -> Result<(), DeltaWriterError> { - if record_batch.schema() != self.arrow_schema { - return Err(DeltaWriterError::SchemaMismatch { - record_batch_schema: record_batch.schema(), - expected_schema: self.arrow_schema.clone(), - }); - } + /// + /// Returns the schema which was written by the write which can be used to understand if a + /// schema evolution has happened + pub fn write( + &mut self, + record_batch: &RecordBatch, + mode: WriteMode, + ) -> Result { + let merged_batch = if record_batch.schema() != self.arrow_schema { + match mode { + WriteMode::MergeSchema => { + debug!("The writer and record batch schemas do not match, merging"); + + let merged = + merge_schema(self.arrow_schema.clone(), record_batch.schema().clone())?; + self.arrow_schema = merged; + + let mut cols = vec![]; + for field in self.arrow_schema.fields() { + if let Some(column) = record_batch.column_by_name(field.name()) { + cols.push(column.clone()); + } else { + let null_column = + new_null_array(field.data_type(), record_batch.num_rows()); + cols.push(null_column); + } + } + Some(RecordBatch::try_new(self.arrow_schema.clone(), cols)?) + } + WriteMode::Default => { + // If the schemas didn't match then an error should be pushed up + Err(DeltaWriterError::SchemaMismatch { + record_batch_schema: record_batch.schema(), + expected_schema: self.arrow_schema.clone(), + })? + } + } + } else { + None + }; // Copy current cursor bytes so we can recover from failures let buffer_bytes = self.buffer.to_vec(); + let record_batch = merged_batch.as_ref().unwrap_or(record_batch); match self.arrow_writer.write(record_batch) { Ok(_) => { self.buffered_record_batch_count += 1; - Ok(()) + Ok(self.arrow_schema.clone()) } // If a write fails we need to reset the state of the PartitionWriter Err(e) => { @@ -302,7 +391,7 @@ pub(crate) fn divide_by_partition_values( if partition_columns.is_empty() { partitions.push(PartitionResult { - partition_values: BTreeMap::new(), + partition_values: IndexMap::new(), record_batch: values.clone(), }); return Ok(partitions); @@ -375,8 +464,11 @@ fn lexsort_to_indices(arrays: &[ArrayRef]) -> UInt32Array { #[cfg(test)] mod tests { use super::*; - use crate::writer::test_utils::{create_initialized_table, get_record_batch}; + use crate::operations::create::CreateBuilder; + use crate::writer::test_utils::*; use arrow::json::ReaderBuilder; + use arrow_array::{Int32Array, RecordBatch, StringArray}; + use arrow_schema::{DataType, Field, Schema as ArrowSchema}; use std::path::Path; #[tokio::test] @@ -431,7 +523,7 @@ mod tests { use crate::DeltaOps; let table = crate::writer::test_utils::create_bare_table(); - let partition_cols = vec!["modified".to_string()]; + let partition_cols = ["modified".to_string()]; let delta_schema = r#" {"type" : "struct", "fields" : [ @@ -450,7 +542,7 @@ mod tests { let table = DeltaOps(table) .create() .with_partition_columns(partition_cols.to_vec()) - .with_columns(delta_schema.fields().clone()) + .with_columns(delta_schema.fields().cloned()) .await .unwrap(); @@ -477,7 +569,7 @@ mod tests { let mut writer = RecordBatchWriter::for_table(&table).unwrap(); let partitions = writer.divide_by_partition_values(&batch).unwrap(); - let expected_keys = vec![ + let expected_keys = [ String::from("modified=2021-02-01"), String::from("modified=2021-02-02"), ]; @@ -570,7 +662,7 @@ mod tests { .with_location(table_path.to_str().unwrap()) .with_table_name("test-table") .with_comment("A table for running tests") - 
.with_columns(table_schema.fields().clone()) + .with_columns(table_schema.fields().cloned()) .with_partition_columns(partition_cols) .await .unwrap(); @@ -582,4 +674,313 @@ mod tests { let adds = writer.flush().await.unwrap(); assert_eq!(adds.len(), 4); } + + // The following sets of tests are related to #1386 and mergeSchema support + // + mod schema_evolution { + use itertools::Itertools; + + use super::*; + + #[tokio::test] + async fn test_write_mismatched_schema() { + let batch = get_record_batch(None, false); + let partition_cols = vec![]; + let table = create_initialized_table(&partition_cols).await; + let mut writer = RecordBatchWriter::for_table(&table).unwrap(); + + // Write the first batch with the first schema to the table + writer.write(batch).await.unwrap(); + let adds = writer.flush().await.unwrap(); + assert_eq!(adds.len(), 1); + + // Create a second batch with a different schema + let second_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, true), + Field::new("name", DataType::Utf8, true), + ])); + let second_batch = RecordBatch::try_new( + second_schema, + vec![ + Arc::new(Int32Array::from(vec![Some(1), Some(2)])), + Arc::new(StringArray::from(vec![Some("will"), Some("robert")])), + ], + ) + .unwrap(); + + let result = writer.write(second_batch).await; + assert!(result.is_err()); + + match result { + Ok(_) => { + panic!("Should not have successfully written"); + } + Err(e) => { + match e { + DeltaTableError::SchemaMismatch { .. } => { + // this is expected + } + others => { + panic!("Got the wrong error: {others:?}"); + } + } + } + }; + } + + #[tokio::test] + async fn test_write_schema_evolution() { + let table_schema = get_delta_schema(); + let table_dir = tempfile::tempdir().unwrap(); + let table_path = table_dir.path(); + + let mut table = CreateBuilder::new() + .with_location(table_path.to_str().unwrap()) + .with_table_name("test-table") + .with_comment("A table for running tests") + .with_columns(table_schema.fields().cloned()) + .await + .unwrap(); + table.load().await.expect("Failed to load table"); + assert_eq!(table.version(), 0); + + let batch = get_record_batch(None, false); + let mut writer = RecordBatchWriter::for_table(&table).unwrap(); + + writer.write(batch).await.unwrap(); + let version = writer.flush_and_commit(&mut table).await.unwrap(); + assert_eq!(version, 1); + table.load().await.expect("Failed to load table"); + assert_eq!(table.version(), 1); + + // Create a second batch with a different schema + let second_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("vid", DataType::Int32, true), + Field::new("name", DataType::Utf8, true), + ])); + let second_batch = RecordBatch::try_new( + second_schema, + vec![ + Arc::new(Int32Array::from(vec![Some(1), Some(2)])), // vid + Arc::new(StringArray::from(vec![Some("will"), Some("robert")])), // name + ], + ) + .unwrap(); + + let result = writer + .write_with_mode(second_batch, WriteMode::MergeSchema) + .await; + assert!( + result.is_ok(), + "Failed to write with WriteMode::MergeSchema, {:?}", + result + ); + let version = writer.flush_and_commit(&mut table).await.unwrap(); + assert_eq!(version, 2); + table.load().await.expect("Failed to load table"); + assert_eq!(table.version(), 2); + + let new_schema = table.metadata().unwrap().schema().unwrap(); + let expected_columns = vec!["id", "value", "modified", "vid", "name"]; + let found_columns: Vec<&String> = new_schema.fields().map(|f| f.name()).collect(); + assert_eq!( + expected_columns, found_columns, + "The new table schema does 
not contain all evolved columns as expected" + ); + } + + #[tokio::test] + async fn test_write_schema_evolution_with_partition_columns_should_fail_as_unsupported() { + let table_schema = get_delta_schema(); + let table_dir = tempfile::tempdir().unwrap(); + let table_path = table_dir.path(); + + let mut table = CreateBuilder::new() + .with_location(table_path.to_str().unwrap()) + .with_table_name("test-table") + .with_comment("A table for running tests") + .with_columns(table_schema.fields().cloned()) + .with_partition_columns(["id"]) + .await + .unwrap(); + table.load().await.expect("Failed to load table"); + assert_eq!(table.version(), 0); + + let batch = get_record_batch(None, false); + let mut writer = RecordBatchWriter::for_table(&table).unwrap(); + + writer.write(batch).await.unwrap(); + let version = writer.flush_and_commit(&mut table).await.unwrap(); + assert_eq!(version, 1); + table.load().await.expect("Failed to load table"); + assert_eq!(table.version(), 1); + + // Create a second batch with appended columns + let second_batch = { + let second = get_record_batch(None, false); + let second_schema = ArrowSchema::new( + second + .schema() + .fields + .iter() + .cloned() + .chain([ + Field::new("vid", DataType::Int32, true).into(), + Field::new("name", DataType::Utf8, true).into(), + ]) + .collect_vec(), + ); + + let len = second.num_rows(); + + let second_arrays = second + .columns() + .iter() + .cloned() + .chain([ + Arc::new(Int32Array::from(vec![Some(1); len])) as _, // vid + Arc::new(StringArray::from(vec![Some("will"); len])) as _, // name + ]) + .collect_vec(); + + RecordBatch::try_new(second_schema.into(), second_arrays).unwrap() + }; + + let result = writer + .write_with_mode(second_batch, WriteMode::MergeSchema) + .await; + + assert!(result.is_err()); + + match result.unwrap_err() { + DeltaTableError::Generic(s) => { + assert_eq!( + s, + "Merging Schemas with partition columns present is currently unsupported" + ) + } + e => panic!("unexpected error: {e:?}"), + } + } + + #[tokio::test] + async fn test_schema_evolution_column_type_mismatch() { + let batch = get_record_batch(None, false); + let partition_cols = vec![]; + let mut table = create_initialized_table(&partition_cols).await; + + let mut writer = RecordBatchWriter::for_table(&table).unwrap(); + + // Write the first batch with the first schema to the table + writer.write(batch).await.unwrap(); + let version = writer.flush_and_commit(&mut table).await.unwrap(); + assert_eq!(version, 1); + + // Create a second batch with a different schema + let second_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, true), + Field::new("name", DataType::Utf8, true), + ])); + let second_batch = RecordBatch::try_new( + second_schema, + vec![ + Arc::new(Int32Array::from(vec![Some(1), Some(2)])), // vid + Arc::new(StringArray::from(vec![Some("will"), Some("robert")])), // name + ], + ) + .unwrap(); + + let result = writer + .write_with_mode(second_batch, WriteMode::MergeSchema) + .await; + assert!( + result.is_err(), + "Did not expect to successfully add new writes with different column types: {:?}", + result + ); + } + + #[tokio::test] + async fn test_schema_evolution_with_nonnullable_col() { + use crate::kernel::{ + DataType as DeltaDataType, PrimitiveType, StructField, StructType, + }; + + let table_schema = StructType::new(vec![ + StructField::new( + "id".to_string(), + DeltaDataType::Primitive(PrimitiveType::String), + false, + ), + StructField::new( + "value".to_string(), + 
DeltaDataType::Primitive(PrimitiveType::Integer), + true, + ), + StructField::new( + "modified".to_string(), + DeltaDataType::Primitive(PrimitiveType::String), + true, + ), + ]); + let table_dir = tempfile::tempdir().unwrap(); + let table_path = table_dir.path(); + + let mut table = CreateBuilder::new() + .with_location(table_path.to_str().unwrap()) + .with_table_name("test-table") + .with_comment("A table for running tests") + .with_columns(table_schema.fields().cloned()) + .await + .unwrap(); + table.load().await.expect("Failed to load table"); + assert_eq!(table.version(), 0); + + // Hand-crafting the first RecordBatch to ensure that a write with non-nullable columns + // works properly before attepting the second write + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Utf8, false), + Field::new("value", DataType::Int32, true), + Field::new("modified", DataType::Utf8, true), + ])); + let batch = RecordBatch::try_new( + arrow_schema, + vec![ + Arc::new(StringArray::from(vec![Some("1"), Some("2")])), // id + Arc::new(new_null_array(&DataType::Int32, 2)), // value + Arc::new(new_null_array(&DataType::Utf8, 2)), // modified + ], + ) + .unwrap(); + + // Write the first batch with the first schema to the table + let mut writer = RecordBatchWriter::for_table(&table).unwrap(); + writer.write(batch).await.unwrap(); + let version = writer.flush_and_commit(&mut table).await.unwrap(); + assert_eq!(version, 1); + + // Create a second batch with a different schema + let second_schema = Arc::new(ArrowSchema::new(vec![Field::new( + "name", + DataType::Utf8, + true, + )])); + let second_batch = RecordBatch::try_new( + second_schema, + vec![ + Arc::new(StringArray::from(vec![Some("will"), Some("robert")])), // name + ], + ) + .unwrap(); + + let result = writer + .write_with_mode(second_batch, WriteMode::MergeSchema) + .await; + assert!( + result.is_err(), + "Should not have been able to write with a missing non-nullable column: {:?}", + result + ); + } + } } diff --git a/crates/core/src/writer/stats.rs b/crates/core/src/writer/stats.rs index 4ba217cc1e..28a089ae1c 100644 --- a/crates/core/src/writer/stats.rs +++ b/crates/core/src/writer/stats.rs @@ -1,8 +1,11 @@ -use std::collections::BTreeMap; +use std::cmp::min; use std::sync::Arc; use std::time::{SystemTime, UNIX_EPOCH}; use std::{collections::HashMap, ops::AddAssign}; +use delta_kernel::expressions::Scalar; +use indexmap::IndexMap; +use parquet::file::metadata::ParquetMetaData; use parquet::format::FileMetaData; use parquet::schema::types::{ColumnDescriptor, SchemaDescriptor}; use parquet::{basic::LogicalType, errors::ParquetError}; @@ -12,17 +15,24 @@ use parquet::{ }; use super::*; -use crate::kernel::{Add, Scalar}; +use crate::kernel::{scalars::ScalarExt, Add}; use crate::protocol::{ColumnValueStat, Stats}; /// Creates an [`Add`] log action struct. 
pub fn create_add( - partition_values: &BTreeMap, + partition_values: &IndexMap, path: String, size: i64, file_metadata: &FileMetaData, + num_indexed_cols: i32, + stats_columns: &Option>, ) -> Result { - let stats = stats_from_file_metadata(partition_values, file_metadata)?; + let stats = stats_from_file_metadata( + partition_values, + file_metadata, + num_indexed_cols, + stats_columns, + )?; let stats_string = serde_json::to_string(&stats)?; // Determine the modification timestamp to include in the add action - milliseconds since epoch @@ -58,26 +68,94 @@ pub fn create_add( }) } +// As opposed to `stats_from_file_metadata` which operates on `parquet::format::FileMetaData`, +// this function produces the stats by reading the metadata from already written out files. +// +// Note that the file metadata used here is actually `parquet::file::metadata::FileMetaData` +// which is a thrift decoding of the `parquet::format::FileMetaData` which is typically obtained +// when flushing the write. +pub(crate) fn stats_from_parquet_metadata( + partition_values: &IndexMap, + parquet_metadata: &ParquetMetaData, + num_indexed_cols: i32, + stats_columns: &Option>, +) -> Result { + let num_rows = parquet_metadata.file_metadata().num_rows(); + let schema_descriptor = parquet_metadata.file_metadata().schema_descr_ptr(); + let row_group_metadata = parquet_metadata.row_groups().to_vec(); + + stats_from_metadata( + partition_values, + schema_descriptor, + row_group_metadata, + num_rows, + num_indexed_cols, + stats_columns, + ) +} + fn stats_from_file_metadata( - partition_values: &BTreeMap, + partition_values: &IndexMap, file_metadata: &FileMetaData, + num_indexed_cols: i32, + stats_columns: &Option>, ) -> Result { let type_ptr = parquet::schema::types::from_thrift(file_metadata.schema.as_slice()); let schema_descriptor = type_ptr.map(|type_| Arc::new(SchemaDescriptor::new(type_)))?; + let row_group_metadata: Vec = file_metadata + .row_groups + .iter() + .map(|rg| RowGroupMetaData::from_thrift(schema_descriptor.clone(), rg.clone())) + .collect::, ParquetError>>()?; + + stats_from_metadata( + partition_values, + schema_descriptor, + row_group_metadata, + file_metadata.num_rows, + num_indexed_cols, + stats_columns, + ) +} + +fn stats_from_metadata( + partition_values: &IndexMap, + schema_descriptor: Arc, + row_group_metadata: Vec, + num_rows: i64, + num_indexed_cols: i32, + stats_columns: &Option>, +) -> Result { let mut min_values: HashMap = HashMap::new(); let mut max_values: HashMap = HashMap::new(); let mut null_count: HashMap = HashMap::new(); - let row_group_metadata: Result, ParquetError> = file_metadata - .row_groups - .iter() - .map(|rg| RowGroupMetaData::from_thrift(schema_descriptor.clone(), rg.clone())) - .collect(); - let row_group_metadata = row_group_metadata?; + let idx_to_iterate = if let Some(stats_cols) = stats_columns { + schema_descriptor + .columns() + .iter() + .enumerate() + .filter_map(|(index, col)| { + if stats_cols.contains(&col.name().to_string()) { + Some(index) + } else { + None + } + }) + .collect() + } else if num_indexed_cols == -1 { + (0..schema_descriptor.num_columns()).collect::>() + } else if num_indexed_cols >= 0 { + (0..min(num_indexed_cols as usize, schema_descriptor.num_columns())).collect::>() + } else { + return Err(DeltaWriterError::DeltaTable(DeltaTableError::Generic( + "delta.dataSkippingNumIndexedCols valid values are >=-1".to_string(), + ))); + }; - for i in 0..schema_descriptor.num_columns() { - let column_descr = schema_descriptor.column(i); + for idx in 
idx_to_iterate { + let column_descr = schema_descriptor.column(idx); let column_path = column_descr.path(); let column_path_parts = column_path.parts(); @@ -90,7 +168,7 @@ fn stats_from_file_metadata( let maybe_stats: Option = row_group_metadata .iter() .map(|g| { - g.column(i) + g.column(idx) .statistics() .map(|s| AggregatedStats::from((s, &column_descr.logical_type()))) }) @@ -118,7 +196,7 @@ fn stats_from_file_metadata( Ok(Stats { min_values, max_values, - num_records: file_metadata.num_rows, + num_records: num_rows, null_count, }) } @@ -180,19 +258,19 @@ impl StatsScalar { // https://github.com/delta-io/delta/blob/master/PROTOCOL.md#timestamp-without-timezone-timestampntz let v = get_stat!(v); let timestamp = match unit { - TimeUnit::MILLIS(_) => chrono::NaiveDateTime::from_timestamp_millis(v), - TimeUnit::MICROS(_) => chrono::NaiveDateTime::from_timestamp_micros(v), + TimeUnit::MILLIS(_) => chrono::DateTime::from_timestamp_millis(v), + TimeUnit::MICROS(_) => chrono::DateTime::from_timestamp_micros(v), TimeUnit::NANOS(_) => { let secs = v / 1_000_000_000; let nanosecs = (v % 1_000_000_000) as u32; - chrono::NaiveDateTime::from_timestamp_opt(secs, nanosecs) + chrono::DateTime::from_timestamp(secs, nanosecs) } }; let timestamp = timestamp.ok_or(DeltaWriterError::StatsParsingFailed { debug_value: v.to_string(), logical_type: logical_type.clone(), })?; - Ok(Self::Timestamp(timestamp)) + Ok(Self::Timestamp(timestamp.naive_utc())) } (Statistics::Int64(v), Some(LogicalType::Decimal { scale, .. })) => { let val = get_stat!(v) as f64 / 10.0_f64.powi(*scale); @@ -231,18 +309,8 @@ impl StatsScalar { v.max_bytes() }; - let val = if val.len() <= 4 { - let mut bytes = [0; 4]; - bytes[..val.len()].copy_from_slice(val); - i32::from_be_bytes(bytes) as f64 - } else if val.len() <= 8 { - let mut bytes = [0; 8]; - bytes[..val.len()].copy_from_slice(val); - i64::from_be_bytes(bytes) as f64 - } else if val.len() <= 16 { - let mut bytes = [0; 16]; - bytes[..val.len()].copy_from_slice(val); - i128::from_be_bytes(bytes) as f64 + let val = if val.len() <= 16 { + i128::from_be_bytes(sign_extend_be(val)) as f64 } else { return Err(DeltaWriterError::StatsParsingFailed { debug_value: format!("{val:?}"), @@ -284,6 +352,19 @@ impl StatsScalar { } } +/// Performs big endian sign extension +/// Copied from arrow-rs repo/parquet crate: +/// https://github.com/apache/arrow-rs/blob/b25c441745602c9967b1e3cc4a28bc469cfb1311/parquet/src/arrow/buffer/bit_util.rs#L54 +pub fn sign_extend_be(b: &[u8]) -> [u8; N] { + assert!(b.len() <= N, "Array too large, expected less than {N}"); + let is_negative = (b[0] & 128u8) == 128u8; + let mut result = if is_negative { [255u8; N] } else { [0u8; N] }; + for (d, s) in result.iter_mut().skip(N - b.len()).zip(b) { + *d = *s; + } + result +} + impl From for serde_json::Value { fn from(scalar: StatsScalar) -> Self { match scalar { @@ -622,6 +703,17 @@ mod tests { }), Value::from(1243124142314.423), ), + ( + simple_parquet_stat!( + Statistics::FixedLenByteArray, + FixedLenByteArray::from(vec![0, 39, 16]) + ), + Some(LogicalType::Decimal { + scale: 3, + precision: 5, + }), + Value::from(10.0), + ), ( simple_parquet_stat!( Statistics::FixedLenByteArray, @@ -645,7 +737,6 @@ mod tests { } } - #[ignore] #[tokio::test] async fn test_delta_stats() { let temp_dir = tempfile::tempdir().unwrap(); diff --git a/crates/core/src/writer/test_utils.rs b/crates/core/src/writer/test_utils.rs index 093ad7cbd0..ff860ed1cf 100644 --- a/crates/core/src/writer/test_utils.rs +++ b/crates/core/src/writer/test_utils.rs 
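The stats collection above now honors `delta.dataSkippingStatsColumns` (an explicit allow-list) and `delta.dataSkippingNumIndexedCols` (-1 means all columns, otherwise the first N leaf columns). A self-contained sketch of that selection rule, with plain column names standing in for the parquet `SchemaDescriptor` (the function name is hypothetical):

```rust
use std::cmp::min;

/// Decide which leaf columns get min/max/null-count statistics.
/// Mirrors the selection rule in `stats_from_metadata`: an explicit column
/// list wins, otherwise -1 means "all columns" and N >= 0 means "the first N".
fn stats_column_indices(
    columns: &[&str],
    num_indexed_cols: i32,
    stats_columns: Option<&[&str]>,
) -> Result<Vec<usize>, String> {
    if let Some(allow_list) = stats_columns {
        Ok(columns
            .iter()
            .enumerate()
            .filter(|(_, name)| allow_list.contains(name))
            .map(|(idx, _)| idx)
            .collect())
    } else if num_indexed_cols == -1 {
        Ok((0..columns.len()).collect())
    } else if num_indexed_cols >= 0 {
        Ok((0..min(num_indexed_cols as usize, columns.len())).collect())
    } else {
        Err("delta.dataSkippingNumIndexedCols valid values are >=-1".to_string())
    }
}

fn main() {
    let cols = ["id", "value", "modified"];
    // The default config indexes the first 32 columns, so all three qualify.
    assert_eq!(stats_column_indices(&cols, 32, None).unwrap(), vec![0, 1, 2]);
    // An explicit stats column list overrides the numeric limit.
    assert_eq!(
        stats_column_indices(&cols, 32, Some(["value"].as_slice())).unwrap(),
        vec![1]
    );
    // Only the first column is indexed.
    assert_eq!(stats_column_indices(&cols, 1, None).unwrap(), vec![0]);
}
```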
@@ -276,7 +276,7 @@ pub async fn setup_table_with_configuration( let table_schema = get_delta_schema(); DeltaOps::new_in_memory() .create() - .with_columns(table_schema.fields().clone()) + .with_columns(table_schema.fields().cloned()) .with_configuration_property(key, value) .await .expect("Failed to create table") @@ -299,7 +299,7 @@ pub async fn create_initialized_table(partition_cols: &[String]) -> DeltaTable { .with_location(table_path.to_str().unwrap()) .with_table_name("test-table") .with_comment("A table for running tests") - .with_columns(table_schema.fields().clone()) + .with_columns(table_schema.fields().cloned()) .with_partition_columns(partition_cols) .await .unwrap() diff --git a/crates/core/tests/command_merge.rs b/crates/core/tests/command_merge.rs new file mode 100644 index 0000000000..76b511254b --- /dev/null +++ b/crates/core/tests/command_merge.rs @@ -0,0 +1,231 @@ +#![allow(dead_code)] +mod fs_common; + +use arrow_array::RecordBatch; +use arrow_schema::{DataType, Field, Schema as ArrowSchema}; +use datafusion::dataframe::DataFrame; +use datafusion::prelude::SessionContext; +use datafusion_common::Column; +use datafusion_expr::{col, lit, Expr}; +use deltalake_core::kernel::{DataType as DeltaDataType, PrimitiveType, StructField, StructType}; +use deltalake_core::operations::merge::MergeMetrics; +use deltalake_core::operations::transaction::TransactionError; +use deltalake_core::protocol::SaveMode; +use deltalake_core::{open_table, DeltaOps, DeltaResult, DeltaTable, DeltaTableError}; +use std::sync::Arc; + +async fn create_table(table_uri: &str, partition: Option>) -> DeltaTable { + let table_schema = get_delta_schema(); + let ops = DeltaOps::try_from_uri(table_uri).await.unwrap(); + let table = ops + .create() + .with_columns(table_schema.fields().cloned()) + .with_partition_columns(partition.unwrap_or_default()) + .await + .expect("Failed to create table"); + + let schema = get_arrow_schema(); + write_data(table, &schema).await +} + +fn get_delta_schema() -> StructType { + StructType::new(vec![ + StructField::new( + "id".to_string(), + DeltaDataType::Primitive(PrimitiveType::String), + true, + ), + StructField::new( + "value".to_string(), + DeltaDataType::Primitive(PrimitiveType::Integer), + true, + ), + StructField::new( + "event_date".to_string(), + DeltaDataType::Primitive(PrimitiveType::String), + true, + ), + ]) +} + +fn get_arrow_schema() -> Arc { + Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Utf8, true), + Field::new("value", DataType::Int32, true), + Field::new("event_date", DataType::Utf8, true), + ])) +} + +async fn write_data(table: DeltaTable, schema: &Arc) -> DeltaTable { + let batch = RecordBatch::try_new( + Arc::clone(schema), + vec![ + Arc::new(arrow::array::StringArray::from(vec!["A", "B", "C", "D"])), + Arc::new(arrow::array::Int32Array::from(vec![1, 10, 10, 100])), + Arc::new(arrow::array::StringArray::from(vec![ + "2021-02-01", + "2021-02-01", + "2021-02-02", + "2021-02-02", + ])), + ], + ) + .unwrap(); + // write some data + DeltaOps(table) + .write(vec![batch.clone()]) + .with_save_mode(SaveMode::Append) + .await + .unwrap() +} + +fn create_test_data() -> (DataFrame, DataFrame) { + let schema = get_arrow_schema(); + let ctx = SessionContext::new(); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(arrow::array::StringArray::from(vec!["C", "D"])), + Arc::new(arrow::array::Int32Array::from(vec![10, 20])), + Arc::new(arrow::array::StringArray::from(vec![ + "2021-02-02", + "2021-02-02", + ])), + ], + ) + 
.unwrap(); + let df1 = ctx.read_batch(batch).unwrap(); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(arrow::array::StringArray::from(vec!["E", "F"])), + Arc::new(arrow::array::Int32Array::from(vec![10, 20])), + Arc::new(arrow::array::StringArray::from(vec![ + "2021-02-03", + "2021-02-03", + ])), + ], + ) + .unwrap(); + let df2 = ctx.read_batch(batch).unwrap(); + (df1, df2) +} + +async fn merge( + table: DeltaTable, + df: DataFrame, + predicate: Expr, +) -> DeltaResult<(DeltaTable, MergeMetrics)> { + DeltaOps(table) + .merge(df, predicate) + .with_source_alias("source") + .with_target_alias("target") + .when_matched_update(|update| { + update + .update("value", col("source.value")) + .update("event_date", col("source.event_date")) + }) + .unwrap() + .when_not_matched_insert(|insert| { + insert + .set("id", col("source.id")) + .set("value", col("source.value")) + .set("event_date", col("source.event_date")) + }) + .unwrap() + .await +} + +#[tokio::test] +async fn test_merge_concurrent_conflict() { + // Overlapping id ranges -> Commit conflict + let tmp_dir = tempfile::tempdir().unwrap(); + let table_uri = tmp_dir.path().to_str().to_owned().unwrap(); + + let table_ref1 = create_table(&table_uri.to_string(), Some(vec!["event_date"])).await; + let table_ref2 = open_table(table_uri).await.unwrap(); + let (df1, _df2) = create_test_data(); + + let expr = col("target.id").eq(col("source.id")); + let (_table_ref1, _metrics) = merge(table_ref1, df1.clone(), expr.clone()).await.unwrap(); + let result = merge(table_ref2, df1, expr).await; + + assert!(matches!( + result.as_ref().unwrap_err(), + DeltaTableError::Transaction { .. } + )); + if let DeltaTableError::Transaction { source } = result.unwrap_err() { + assert!(matches!(source, TransactionError::CommitConflict(_))); + } +} + +#[tokio::test] +async fn test_merge_different_range() { + // No overlapping id ranges -> No conflict + let tmp_dir = tempfile::tempdir().unwrap(); + let table_uri = tmp_dir.path().to_str().to_owned().unwrap(); + + let table_ref1 = create_table(table_uri, Some(vec!["event_date"])).await; + let table_ref2 = open_table(table_uri).await.unwrap(); + let (df1, df2) = create_test_data(); + + let expr = col("target.id").eq(col("source.id")); + let (_table_ref1, _metrics) = merge(table_ref1, df1, expr.clone()).await.unwrap(); + let result = merge(table_ref2, df2, expr).await; + + assert!(result.is_ok()); +} + +#[tokio::test] +async fn test_merge_concurrent_different_partition() { + // partition key in predicate -> Successful merge + let tmp_dir = tempfile::tempdir().unwrap(); + let table_uri = tmp_dir.path().to_str().to_owned().unwrap(); + + let table_ref1 = create_table(table_uri, Some(vec!["event_date"])).await; + let table_ref2 = open_table(table_uri).await.unwrap(); + let (df1, df2) = create_test_data(); + + let expr = col("target.id") + .eq(col("source.id")) + .and(col("target.event_date").eq(col("source.event_date"))); + let (_table_ref1, _metrics) = merge(table_ref1, df1, expr.clone()).await.unwrap(); + let result = merge(table_ref2, df2, expr).await; + + assert!(result.is_ok()); +} + +#[tokio::test] +async fn test_merge_concurrent_with_overlapping_files() { + // predicate contains filter and files are overlapping -> Commit conflict + let tmp_dir = tempfile::tempdir().unwrap(); + let table_uri = tmp_dir.path().to_str().to_owned().unwrap(); + + let table_ref1 = create_table(table_uri, None).await; + let table_ref2 = open_table(table_uri).await.unwrap(); + let (df1, _df2) = create_test_data(); + + 
let expr = col("target.id").eq(col("source.id")); + let (_table_ref1, _metrics) = merge( + table_ref1, + df1.clone(), + expr.clone() + .and(col(Column::from_qualified_name("target.event_date")).lt_eq(lit("2021-02-02"))), + ) + .await + .unwrap(); + let result = merge( + table_ref2, + df1, + expr.and(col(Column::from_qualified_name("target.event_date")).eq(lit("2021-02-02"))), + ) + .await; + + assert!(matches!( + result.as_ref().unwrap_err(), + DeltaTableError::Transaction { .. } + )); + if let DeltaTableError::Transaction { source } = result.unwrap_err() { + assert!(matches!(source, TransactionError::CommitConflict(_))); + } +} diff --git a/crates/core/tests/command_optimize.rs b/crates/core/tests/command_optimize.rs index 5c3875eb92..13cbd168e4 100644 --- a/crates/core/tests/command_optimize.rs +++ b/crates/core/tests/command_optimize.rs @@ -9,7 +9,7 @@ use deltalake_core::kernel::{Action, DataType, PrimitiveType, StructField}; use deltalake_core::operations::optimize::{ create_merge_plan, MetricDetails, Metrics, OptimizeType, }; -use deltalake_core::operations::transaction::commit; +use deltalake_core::operations::transaction::{CommitBuilder, CommitProperties}; use deltalake_core::operations::DeltaOps; use deltalake_core::protocol::DeltaOperation; use deltalake_core::storage::ObjectStoreRef; @@ -180,6 +180,12 @@ async fn test_optimize_non_partitioned_table() -> Result<(), Box> { assert_eq!(metrics.partitions_optimized, 1); assert_eq!(dt.get_files_count(), 2); + let commit_info = dt.history(None).await?; + let last_commit = &commit_info[0]; + let parameters = last_commit.operation_parameters.clone().unwrap(); + assert_eq!(parameters["targetSize"], json!("2000000")); + assert_eq!(parameters["predicate"], "[]"); + Ok(()) } @@ -243,7 +249,7 @@ async fn test_optimize_with_partitions() -> Result<(), Box> { let partition_values = partition_adds[0].partition_values()?; assert_eq!( partition_values.get("date"), - Some(&deltalake_core::kernel::Scalar::String( + Some(&delta_kernel::expressions::Scalar::String( "2022-05-22".to_string() )) ); @@ -252,7 +258,6 @@ async fn test_optimize_with_partitions() -> Result<(), Box> { } #[tokio::test] -#[ignore] /// Validate that optimize fails when a remove action occurs async fn test_conflict_for_remove_actions() -> Result<(), Box> { let context = setup_test(true).await?; @@ -291,20 +296,24 @@ async fn test_conflict_for_remove_actions() -> Result<(), Box> { let remove = add.remove_action(true); let operation = DeltaOperation::Delete { predicate: None }; - commit( - other_dt.log_store().as_ref(), - &vec![Action::Remove(remove)], - operation, - Some(other_dt.snapshot()?), - None, - ) - .await?; + CommitBuilder::default() + .with_actions(vec![Action::Remove(remove)]) + .build(Some(other_dt.snapshot()?), other_dt.log_store(), operation) + .await?; let maybe_metrics = plan - .execute(dt.log_store(), dt.snapshot()?, 1, 20, None, None) + .execute( + dt.log_store(), + dt.snapshot()?, + 1, + 20, + None, + CommitProperties::default(), + ) .await; assert!(maybe_metrics.is_err()); + dt.update().await?; assert_eq!(dt.version(), version + 1); Ok(()) } @@ -352,7 +361,14 @@ async fn test_no_conflict_for_append_actions() -> Result<(), Box> { .await?; let metrics = plan - .execute(dt.log_store(), dt.snapshot()?, 1, 20, None, None) + .execute( + dt.log_store(), + dt.snapshot()?, + 1, + 20, + None, + CommitProperties::default(), + ) .await?; assert_eq!(metrics.num_files_added, 1); assert_eq!(metrics.num_files_removed, 2); @@ -398,7 +414,7 @@ async fn test_commit_interval() -> 
diff --git a/crates/core/tests/command_restore.rs b/crates/core/tests/command_restore.rs
index 1e49132d23..aa5b598347 100644
--- a/crates/core/tests/command_restore.rs
+++ b/crates/core/tests/command_restore.rs
@@ -1,7 +1,7 @@
 use arrow::datatypes::Schema as ArrowSchema;
 use arrow_array::{Int32Array, RecordBatch};
 use arrow_schema::{DataType as ArrowDataType, Field};
-use chrono::{DateTime, NaiveDateTime, TimeZone, Utc};
+use chrono::DateTime;
 use deltalake_core::kernel::{DataType, PrimitiveType, StructField};
 use deltalake_core::protocol::SaveMode;
 use deltalake_core::storage::commit_uri_from_version;
@@ -128,8 +128,7 @@ async fn test_restore_by_datetime() -> Result<(), Box<dyn Error>> {
         .head(&commit_uri_from_version(version))
         .await?;
     let timestamp = meta.last_modified.timestamp_millis();
-    let naive = NaiveDateTime::from_timestamp_millis(timestamp).unwrap();
-    let datetime: DateTime<Utc> = Utc.from_utc_datetime(&naive);
+    let datetime = DateTime::from_timestamp_millis(timestamp).unwrap();

     let result = DeltaOps(table)
         .restore()
@@ -147,8 +146,7 @@ async fn test_restore_with_error_params() -> Result<(), Box<dyn Error>> {
     let table = context.table;
     let history = table.history(Some(10)).await?;
     let timestamp = history.get(1).unwrap().timestamp.unwrap();
-    let naive = NaiveDateTime::from_timestamp_millis(timestamp).unwrap();
-    let datetime: DateTime<Utc> = Utc.from_utc_datetime(&naive);
+    let datetime = DateTime::from_timestamp_millis(timestamp).unwrap();

     // datetime and version both set
     let result = DeltaOps(table)
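The `command_restore.rs` change above relies on chrono building a UTC `DateTime` directly from epoch milliseconds, removing the deprecated `NaiveDateTime` round-trip. A small self-contained sketch of the replacement call; the helper name and sample value are illustrative.

```rust
use chrono::{DateTime, Utc};

/// Illustrative: DateTime::from_timestamp_millis returns None when the value is
/// outside chrono's representable range, so callers decide how to handle that case.
fn datetime_from_millis(timestamp_millis: i64) -> Option<DateTime<Utc>> {
    DateTime::from_timestamp_millis(timestamp_millis)
}

fn main() {
    let dt = datetime_from_millis(1_700_000_000_000).expect("timestamp in range");
    println!("{}", dt.to_rfc3339());
}
```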
diff --git a/crates/core/tests/commit_info_format.rs b/crates/core/tests/commit_info_format.rs
index b47850ae30..df817365b3 100644
--- a/crates/core/tests/commit_info_format.rs
+++ b/crates/core/tests/commit_info_format.rs
@@ -2,7 +2,7 @@
 mod fs_common;

 use deltalake_core::kernel::Action;
-use deltalake_core::operations::transaction::commit;
+use deltalake_core::operations::transaction::CommitBuilder;
 use deltalake_core::protocol::{DeltaOperation, SaveMode};
 use serde_json::json;
 use std::error::Error;
@@ -20,14 +20,10 @@ async fn test_operational_parameters() -> Result<(), Box<dyn Error>> {
         predicate: None,
     };

-    commit(
-        table.log_store().as_ref(),
-        &actions,
-        operation,
-        Some(table.snapshot()?),
-        None,
-    )
-    .await?;
+    CommitBuilder::default()
+        .with_actions(actions)
+        .build(Some(table.snapshot()?), table.log_store(), operation)
+        .await?;

     table.update().await?;
     let commit_info = table.history(None).await?;
diff --git a/crates/core/tests/fs_common/mod.rs b/crates/core/tests/fs_common/mod.rs
index 088d22a630..13683b408a 100644
--- a/crates/core/tests/fs_common/mod.rs
+++ b/crates/core/tests/fs_common/mod.rs
@@ -3,12 +3,14 @@ use deltalake_core::kernel::{
     Action, Add, DataType, PrimitiveType, Remove, StructField, StructType,
 };
 use deltalake_core::operations::create::CreateBuilder;
-use deltalake_core::operations::transaction::commit;
+use deltalake_core::operations::transaction::CommitBuilder;
 use deltalake_core::protocol::{DeltaOperation, SaveMode};
 use deltalake_core::storage::{GetResult, ObjectStoreResult};
 use deltalake_core::DeltaTable;
 use object_store::path::Path as StorePath;
-use object_store::{ObjectStore, PutOptions, PutResult};
+use object_store::{
+    MultipartUpload, ObjectStore, PutMultipartOpts, PutOptions, PutPayload, PutResult,
+};
 use serde_json::Value;
 use std::collections::HashMap;
 use std::fs;
@@ -55,7 +57,7 @@ pub async fn create_test_table(
         .with_location(path)
         .with_table_name("test-table")
         .with_comment("A table for running tests")
-        .with_columns(schema.fields().clone())
+        .with_columns(schema.fields().cloned())
         .with_partition_columns(partition_columns)
         .with_configuration(config)
         .await
@@ -119,15 +121,16 @@ pub async fn commit_actions(
     actions: Vec<Action>,
     operation: DeltaOperation,
 ) -> i64 {
-    let version = commit(
-        table.log_store().as_ref(),
-        &actions,
-        operation,
-        Some(table.snapshot().unwrap()),
-        None,
-    )
-    .await
-    .unwrap();
+    let version = CommitBuilder::default()
+        .with_actions(actions)
+        .build(
+            Some(table.snapshot().unwrap()),
+            table.log_store().clone(),
+            operation,
+        )
+        .await
+        .unwrap()
+        .version();
     table.update().await.unwrap();
     version
 }
@@ -143,6 +146,7 @@ impl std::fmt::Display for SlowStore {
 }

 impl SlowStore {
+    #[allow(dead_code)]
     pub fn new(
         location: Url,
         _options: impl Into + Clone,
@@ -156,14 +160,14 @@ impl SlowStore {
 #[async_trait::async_trait]
 impl ObjectStore for SlowStore {
     /// Save the provided bytes to the specified location.
-    async fn put(&self, location: &StorePath, bytes: bytes::Bytes) -> ObjectStoreResult<PutResult> {
+    async fn put(&self, location: &StorePath, bytes: PutPayload) -> ObjectStoreResult<PutResult> {
         self.inner.put(location, bytes).await
     }

     async fn put_opts(
         &self,
         location: &StorePath,
-        bytes: bytes::Bytes,
+        bytes: PutPayload,
         options: PutOptions,
     ) -> ObjectStoreResult<PutResult> {
         self.inner.put_opts(location, bytes, options).await
@@ -270,18 +274,15 @@ impl ObjectStore for SlowStore {
     async fn put_multipart(
         &self,
         location: &StorePath,
-    ) -> ObjectStoreResult<(
-        object_store::MultipartId,
-        Box<dyn tokio::io::AsyncWrite + Unpin + Send>,
-    )> {
+    ) -> ObjectStoreResult<Box<dyn MultipartUpload>> {
         self.inner.put_multipart(location).await
     }

-    async fn abort_multipart(
+    async fn put_multipart_opts(
        &self,
        location: &StorePath,
-        multipart_id: &object_store::MultipartId,
-    ) -> ObjectStoreResult<()> {
-        self.inner.abort_multipart(location, multipart_id).await
+        options: PutMultipartOpts,
+    ) -> ObjectStoreResult<Box<dyn MultipartUpload>> {
+        self.inner.put_multipart_opts(location, options).await
     }
 }
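The `SlowStore` wrapper above now delegates to the newer `object_store` multipart API, where `put_multipart` hands back a `MultipartUpload` that accepts parts and is finalized with `complete()`, replacing the old `(MultipartId, AsyncWrite)` pair and `abort_multipart`. A hedged sketch of that flow against the in-memory store follows; the function name and paths are illustrative, and real object stores may enforce minimum part sizes that the in-memory store does not.

```rust
use object_store::memory::InMemory;
use object_store::path::Path;
use object_store::{MultipartUpload, ObjectStore, PutPayload};

/// Illustrative: upload two parts and read the object back; drive with any async runtime.
async fn multipart_roundtrip() -> object_store::Result<()> {
    let store = InMemory::new();
    let location = Path::from("example/data.bin");

    // put_multipart returns a Box<dyn MultipartUpload>; parts are uploaded in order.
    let mut upload = store.put_multipart(&location).await?;
    upload.put_part(PutPayload::from(vec![0u8; 1024])).await?;
    upload.put_part(PutPayload::from(vec![1u8; 1024])).await?;
    upload.complete().await?;

    let bytes = store.get(&location).await?.bytes().await?;
    assert_eq!(bytes.len(), 2048);
    Ok(())
}
```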
diff --git a/crates/core/tests/integration_datafusion.rs b/crates/core/tests/integration_datafusion.rs
index 90fc3ea9fa..ea83bce29e 100644
--- a/crates/core/tests/integration_datafusion.rs
+++ b/crates/core/tests/integration_datafusion.rs
@@ -1,14 +1,10 @@
 #![cfg(feature = "datafusion")]
-
-use arrow::array::Int64Array;
-use deltalake_test::datafusion::*;
-use deltalake_test::utils::*;
-use serial_test::serial;
-
 use std::collections::{HashMap, HashSet};
+use std::error::Error;
 use std::path::PathBuf;
 use std::sync::Arc;

+use arrow::array::Int64Array;
 use arrow::array::*;
 use arrow::record_batch::RecordBatch;
 use arrow_schema::{
@@ -28,8 +24,6 @@ use datafusion_expr::Expr;
 use datafusion_proto::bytes::{
     physical_plan_from_bytes_with_extension_codec, physical_plan_to_bytes_with_extension_codec,
 };
-use url::Url;
-
 use deltalake_core::delta_datafusion::{DeltaPhysicalCodec, DeltaScan};
 use deltalake_core::kernel::{DataType, MapType, PrimitiveType, StructField, StructType};
 use deltalake_core::logstore::logstore_for;
@@ -41,7 +35,10 @@ use deltalake_core::{
     operations::{write::WriteBuilder, DeltaOps},
     DeltaTable, DeltaTableError,
 };
-use std::error::Error;
+use deltalake_test::datafusion::*;
+use deltalake_test::utils::*;
+use serial_test::serial;
+use url::Url;

 mod local {
     use datafusion::common::stats::Precision;
@@ -68,6 +65,8 @@ mod local {
     #[derive(Debug, Default)]
     pub struct ExecutionMetricsCollector {
         scanned_files: HashSet