diff --git a/CHANGELOG.md b/CHANGELOG.md index 7c0c5099c8..8226868f8b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,261 @@ # Changelog +## [rust-v0.20.1](https://github.com/delta-io/delta-rs/tree/rust-v0.20.1) (2024-09-27) + +[Full Changelog](https://github.com/delta-io/delta-rs/compare/rust-v0.20.0...rust-v0.20.1) + +**Implemented enhancements:** + +- Allow to specify Azurite hostname and service port as backend [\#2900](https://github.com/delta-io/delta-rs/issues/2900) +- docs section usage/Managing a table is out of date w.r.t. optimizing tables [\#2891](https://github.com/delta-io/delta-rs/issues/2891) +- generate more sensible row group size [\#2545](https://github.com/delta-io/delta-rs/issues/2545) + +**Fixed bugs:** + +- Cannot write to Minio with deltalake.write\_deltalake or Polars [\#2894](https://github.com/delta-io/delta-rs/issues/2894) +- Schema Mismatch Error When appending Parquet Files with Metadata using Rust Engine [\#2888](https://github.com/delta-io/delta-rs/issues/2888) +- Assume role support has been broken since 2022 :rofl: [\#2879](https://github.com/delta-io/delta-rs/issues/2879) +- z-order fails on table that is partitioned by value with space [\#2834](https://github.com/delta-io/delta-rs/issues/2834) +- "builder error for url" when creating an instance of a DeltaTable which is located in an azurite blob storage [\#2815](https://github.com/delta-io/delta-rs/issues/2815) + +**Closed issues:** + +- delta-rs can't write to a table if datafusion is not enabled [\#2910](https://github.com/delta-io/delta-rs/issues/2910) + +## [rust-v0.20.0](https://github.com/delta-io/delta-rs/tree/rust-v0.20.0) (2024-09-18) + +[Full Changelog](https://github.com/delta-io/delta-rs/compare/rust-v0.19.1...rust-v0.20.0) + +**Fixed bugs:** + +- DeltaTableBuilder flags ignored [\#2808](https://github.com/delta-io/delta-rs/issues/2808) +- Require files in config is not anymore used to skip reading add actions 
[\#2796](https://github.com/delta-io/delta-rs/issues/2796) + +**Merged pull requests:** + +- feat: improve AWS credential loading between S3 and DynamoDb code paths [\#2887](https://github.com/delta-io/delta-rs/pull/2887) ([rtyler](https://github.com/rtyler)) +- feat: add support for `pyarrow.ExtensionType` [\#2885](https://github.com/delta-io/delta-rs/pull/2885) ([fecet](https://github.com/fecet)) +- fix: conditionally disable enable\_io non-unix based systems [\#2884](https://github.com/delta-io/delta-rs/pull/2884) ([hntd187](https://github.com/hntd187)) +- docs: fix typo in delta-lake-dagster [\#2883](https://github.com/delta-io/delta-rs/pull/2883) ([jessy1092](https://github.com/jessy1092)) +- fix: pin broken dependencies and changes in 0.19.1 [\#2878](https://github.com/delta-io/delta-rs/pull/2878) ([rtyler](https://github.com/rtyler)) +- chore: cleanup codecov defaults [\#2876](https://github.com/delta-io/delta-rs/pull/2876) ([rtyler](https://github.com/rtyler)) +- fix: prepare the next :crab: release with fixed version ranges [\#2875](https://github.com/delta-io/delta-rs/pull/2875) ([rtyler](https://github.com/rtyler)) +- chore: exclude parquet from dependabot as well [\#2874](https://github.com/delta-io/delta-rs/pull/2874) ([rtyler](https://github.com/rtyler)) +- chore: attempt to ignore all dependabot checks for arrow and datafusion [\#2870](https://github.com/delta-io/delta-rs/pull/2870) ([rtyler](https://github.com/rtyler)) +- fix\(rust\): scan schema fix for predicate [\#2869](https://github.com/delta-io/delta-rs/pull/2869) ([sherlockbeard](https://github.com/sherlockbeard)) +- chore: rearrange github actions a bit [\#2868](https://github.com/delta-io/delta-rs/pull/2868) ([rtyler](https://github.com/rtyler)) +- fix: set put mode to overwrite in mount backend [\#2861](https://github.com/delta-io/delta-rs/pull/2861) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: pin the build-dependencies for Python to a slightly older vendored openssl 
[\#2856](https://github.com/delta-io/delta-rs/pull/2856) ([rtyler](https://github.com/rtyler)) +- fix: escaped columns in dataskippingstatscolumns [\#2855](https://github.com/delta-io/delta-rs/pull/2855) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: re-enable optional old casting behavior in merge [\#2853](https://github.com/delta-io/delta-rs/pull/2853) ([ion-elgreco](https://github.com/ion-elgreco)) +- docs: concurrent writes permission missing [\#2846](https://github.com/delta-io/delta-rs/pull/2846) ([poguez](https://github.com/poguez)) +- chore: update python [\#2845](https://github.com/delta-io/delta-rs/pull/2845) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore: pin the Rust baseline version to 1.80 [\#2842](https://github.com/delta-io/delta-rs/pull/2842) ([rtyler](https://github.com/rtyler)) +- fix: stats is optional in add action [\#2841](https://github.com/delta-io/delta-rs/pull/2841) ([jkylling](https://github.com/jkylling)) +- chore\(aws\): use backon to replace backoff [\#2840](https://github.com/delta-io/delta-rs/pull/2840) ([Xuanwo](https://github.com/Xuanwo)) +- feat\(rust\): add operationMetrics to WRITE [\#2838](https://github.com/delta-io/delta-rs/pull/2838) ([gavinmead](https://github.com/gavinmead)) +- chore: enable codecov reporting [\#2836](https://github.com/delta-io/delta-rs/pull/2836) ([rtyler](https://github.com/rtyler)) +- docs: fix documentation about max\_spill\_size [\#2835](https://github.com/delta-io/delta-rs/pull/2835) ([junhl](https://github.com/junhl)) +- chore: set max\_retries in CommitProperties [\#2826](https://github.com/delta-io/delta-rs/pull/2826) ([helanto](https://github.com/helanto)) +- refactor\(python\): post\_commit\_hook\_properties derive [\#2824](https://github.com/delta-io/delta-rs/pull/2824) ([ion-elgreco](https://github.com/ion-elgreco)) +- refactor\(python\): add pymergebuilder [\#2823](https://github.com/delta-io/delta-rs/pull/2823) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat: make 
`Add::get_json_stats` public [\#2822](https://github.com/delta-io/delta-rs/pull/2822) ([gruuya](https://github.com/gruuya)) +- docs: fix docstring of set\_table\_properties [\#2820](https://github.com/delta-io/delta-rs/pull/2820) ([astrojuanlu](https://github.com/astrojuanlu)) +- fix\(rust\): set token provider explicitly [\#2817](https://github.com/delta-io/delta-rs/pull/2817) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat: public method to get partitions for DeltaTable \(\#2671\) [\#2816](https://github.com/delta-io/delta-rs/pull/2816) ([omkar-foss](https://github.com/omkar-foss)) +- perf: conditional put for default log store \(e.g. azure, gcs, minio, cloudflare\) [\#2813](https://github.com/delta-io/delta-rs/pull/2813) ([ion-elgreco](https://github.com/ion-elgreco)) +- test\(python\): fix optimize call in benchmark [\#2812](https://github.com/delta-io/delta-rs/pull/2812) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: use table config target file size, expose target\_file\_size in python [\#2811](https://github.com/delta-io/delta-rs/pull/2811) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(python, rust\): use require files [\#2809](https://github.com/delta-io/delta-rs/pull/2809) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(python, rust\): allow `in` pushdowns in early\_filter [\#2807](https://github.com/delta-io/delta-rs/pull/2807) ([ion-elgreco](https://github.com/ion-elgreco)) +- docs: added WriterProperties documentation [\#2804](https://github.com/delta-io/delta-rs/pull/2804) ([sherlockbeard](https://github.com/sherlockbeard)) +- fix: enable feature flags to which deltalake-core build tokio with enable\_io [\#2803](https://github.com/delta-io/delta-rs/pull/2803) ([rtyler](https://github.com/rtyler)) +- chore\(python\): remove deprecated or duplicated functions [\#2801](https://github.com/delta-io/delta-rs/pull/2801) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore\(python\): raise not implemented in 
from\_data\_catalog [\#2799](https://github.com/delta-io/delta-rs/pull/2799) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(rust\): `max_spill_size` default value [\#2795](https://github.com/delta-io/delta-rs/pull/2795) ([mrjsj](https://github.com/mrjsj)) +- feat\(python, rust\): add ColumnProperties And rework in python WriterProperties [\#2793](https://github.com/delta-io/delta-rs/pull/2793) ([sherlockbeard](https://github.com/sherlockbeard)) +- feat: configurable IO runtime [\#2789](https://github.com/delta-io/delta-rs/pull/2789) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore: remove some `file_actions` call sites [\#2787](https://github.com/delta-io/delta-rs/pull/2787) ([roeap](https://github.com/roeap)) +- style: more consistent imports [\#2786](https://github.com/delta-io/delta-rs/pull/2786) ([roeap](https://github.com/roeap)) +- feat\(python, rust\): added statistics\_truncate\_length in WriterProperties [\#2784](https://github.com/delta-io/delta-rs/pull/2784) ([sherlockbeard](https://github.com/sherlockbeard)) +- fix: pin maturin verison [\#2778](https://github.com/delta-io/delta-rs/pull/2778) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: trim trailing slash in url storage options \(\#2656\) [\#2775](https://github.com/delta-io/delta-rs/pull/2775) ([omkar-foss](https://github.com/omkar-foss)) +- chore: update the changelog with the 0.19.0 release [\#2774](https://github.com/delta-io/delta-rs/pull/2774) ([rtyler](https://github.com/rtyler)) +- feat\(python, rust\): `add feature` operation [\#2712](https://github.com/delta-io/delta-rs/pull/2712) ([ion-elgreco](https://github.com/ion-elgreco)) + +## [rust-v0.19.1](https://github.com/delta-io/delta-rs/tree/rust-v0.19.1) (2024-09-11) + +[Full Changelog](https://github.com/delta-io/delta-rs/compare/rust-v0.19.0...rust-v0.19.1) + +**Implemented enhancements:** + +- question: deletionVectors support [\#2829](https://github.com/delta-io/delta-rs/issues/2829) +- \[Minor\] Make 
`Add::get_json_stats` public [\#2821](https://github.com/delta-io/delta-rs/issues/2821) +- expose target\_file\_size in python side for WriterProperties [\#2810](https://github.com/delta-io/delta-rs/issues/2810) +- expose default\_column\_properties, column\_properties of parquet WriterProperties in python [\#2785](https://github.com/delta-io/delta-rs/issues/2785) +- CDC support in deltalog when writing delta table [\#2720](https://github.com/delta-io/delta-rs/issues/2720) +- Function behaving similarly to `SHOW PARTITIONS` in the Python API [\#2671](https://github.com/delta-io/delta-rs/issues/2671) +- Expose set\_statistics\_truncate\_length via Python WriterProperties [\#2630](https://github.com/delta-io/delta-rs/issues/2630) + +**Fixed bugs:** + +- `write_deltalake` with predicate throw index out of bounds [\#2867](https://github.com/delta-io/delta-rs/issues/2867) +- writing to blobfuse has stopped working in 0.19.2 [\#2860](https://github.com/delta-io/delta-rs/issues/2860) +- cannot read from public GCS bucket if non logged in [\#2859](https://github.com/delta-io/delta-rs/issues/2859) +- Stats missing for `dataSkippingStatsColumns` when escaping column name [\#2849](https://github.com/delta-io/delta-rs/issues/2849) +- 0.19.2 install error when using poetry, pdm on Ubuntu [\#2848](https://github.com/delta-io/delta-rs/issues/2848) +- `deltalake-*` crates use different version than specified in `Cargo.toml`, leading to unexpected behavior [\#2847](https://github.com/delta-io/delta-rs/issues/2847) +- Databricks fails integrity check after compacting with delta-rs [\#2839](https://github.com/delta-io/delta-rs/issues/2839) +- "failed to load region from IMDS" back in 0.19 despite `AWS_EC2_METADATA_DISABLED=true` [\#2819](https://github.com/delta-io/delta-rs/issues/2819) +- min/max\_row\_groups not respected [\#2814](https://github.com/delta-io/delta-rs/issues/2814) +- Large Memory Spike on Merge [\#2802](https://github.com/delta-io/delta-rs/issues/2802) +- Deleting 
large number of records fails with no error message [\#2798](https://github.com/delta-io/delta-rs/issues/2798) +- `max_spill_size` incorrect default value [\#2794](https://github.com/delta-io/delta-rs/issues/2794) +- Delta-RS Saved Delta Table not properly ingested into Databricks [\#2779](https://github.com/delta-io/delta-rs/issues/2779) +- Missing Linux binary releases and source tarball for Python release v0.19.0 [\#2777](https://github.com/delta-io/delta-rs/issues/2777) +- Transaction log parsing performance regression [\#2760](https://github.com/delta-io/delta-rs/issues/2760) +- `RecordBatchWriter` only creates stats for the first 32 columns; this prevents calling `create_checkpoint`. [\#2745](https://github.com/delta-io/delta-rs/issues/2745) +- `DeltaScanBuilder` does not respect datafusion context's `datafusion.execution.parquet.pushdown_filters` [\#2739](https://github.com/delta-io/delta-rs/issues/2739) +- `IN (...)` clauses appear to be ignored in merge commands with S3 - extra partitions scanned [\#2726](https://github.com/delta-io/delta-rs/issues/2726) +- Trailing slash on AWS\_ENDPOINT raises S3 Error [\#2656](https://github.com/delta-io/delta-rs/issues/2656) +- AsyncChunkReader::get\_bytes error: Generic MicrosoftAzure error: error decoding response body [\#2592](https://github.com/delta-io/delta-rs/issues/2592) + +## [rust-v0.19.0](https://github.com/delta-io/delta-rs/tree/rust-v0.19.0) (2024-08-14) + +[Full Changelog](https://github.com/delta-io/delta-rs/compare/rust-v0.18.2...rust-v0.19.0) + +**Implemented enhancements:** + +- Only allow squash merge [\#2542](https://github.com/delta-io/delta-rs/issues/2542) + +**Fixed bugs:** + +- Write also insert change types in writer CDC [\#2750](https://github.com/delta-io/delta-rs/issues/2750) +- Regression in Python multiprocessing support [\#2744](https://github.com/delta-io/delta-rs/issues/2744) +- SchemaError occurs during table optimisation after upgrade to v0.18.1 
[\#2731](https://github.com/delta-io/delta-rs/issues/2731) +- AWS WebIdentityToken exposure in log files [\#2719](https://github.com/delta-io/delta-rs/issues/2719) +- Write performance degrades with multiple writers [\#2683](https://github.com/delta-io/delta-rs/issues/2683) +- Write monotonic sequence, but read is non monotonic [\#2659](https://github.com/delta-io/delta-rs/issues/2659) +- Python `write_deltalake` with `schema_mode="merge"` casts types [\#2642](https://github.com/delta-io/delta-rs/issues/2642) +- Newest docs \(potentially\) not released [\#2587](https://github.com/delta-io/delta-rs/issues/2587) +- CDC is not generated for Structs and Lists [\#2568](https://github.com/delta-io/delta-rs/issues/2568) + +**Closed issues:** + +- delete\_dir bug [\#2713](https://github.com/delta-io/delta-rs/issues/2713) + +**Merged pull requests:** + +- chore: fix a bunch of clippy lints and re-enable tests [\#2773](https://github.com/delta-io/delta-rs/pull/2773) ([rtyler](https://github.com/rtyler)) +- feat: more economic data skipping with datafusion [\#2772](https://github.com/delta-io/delta-rs/pull/2772) ([roeap](https://github.com/roeap)) +- chore: prepare the next notable release of 0.19.0 [\#2768](https://github.com/delta-io/delta-rs/pull/2768) ([rtyler](https://github.com/rtyler)) +- feat: restore the TryFrom for DeltaTablePartition [\#2767](https://github.com/delta-io/delta-rs/pull/2767) ([rtyler](https://github.com/rtyler)) +- feat: fail fast on forked process [\#2765](https://github.com/delta-io/delta-rs/pull/2765) ([Tom-Newton](https://github.com/Tom-Newton)) +- perf: early stop if all values in arr are null [\#2764](https://github.com/delta-io/delta-rs/pull/2764) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(python, rust\): don't flatten fields during cdf read [\#2763](https://github.com/delta-io/delta-rs/pull/2763) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore: upgrade to datafusion 41 
[\#2761](https://github.com/delta-io/delta-rs/pull/2761) ([rtyler](https://github.com/rtyler)) +- fix\(python, rust\): cdc in writer not creating inserts [\#2751](https://github.com/delta-io/delta-rs/pull/2751) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat: improved test fixtures [\#2749](https://github.com/delta-io/delta-rs/pull/2749) ([roeap](https://github.com/roeap)) +- feat: introduce CDC generation for merge operations [\#2747](https://github.com/delta-io/delta-rs/pull/2747) ([rtyler](https://github.com/rtyler)) +- docs: fix broken link in docs [\#2746](https://github.com/delta-io/delta-rs/pull/2746) ([astrojuanlu](https://github.com/astrojuanlu)) +- chore: update delta\_kernel to 0.3.0 [\#2742](https://github.com/delta-io/delta-rs/pull/2742) ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) +- chore: add to code\_owner crates [\#2741](https://github.com/delta-io/delta-rs/pull/2741) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore: update changelog and versions for next release [\#2740](https://github.com/delta-io/delta-rs/pull/2740) ([rtyler](https://github.com/rtyler)) +- feat\(python, rust\): arrow large/view types passthrough, rust default engine [\#2738](https://github.com/delta-io/delta-rs/pull/2738) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: column parsing to include nested columns and enclosing char [\#2737](https://github.com/delta-io/delta-rs/pull/2737) ([gtrawinski](https://github.com/gtrawinski)) + +## [rust-v0.18.2](https://github.com/delta-io/delta-rs/tree/rust-v0.18.2) (2024-08-07) + +[Full Changelog](https://github.com/delta-io/delta-rs/compare/rust-v0.18.1...rust-v0.18.2) + +**Implemented enhancements:** + +- Choose which columns to store min/max values for [\#2709](https://github.com/delta-io/delta-rs/issues/2709) +- Projection pushdown for load\_cdf [\#2681](https://github.com/delta-io/delta-rs/issues/2681) +- Way to check if Delta table exists at specified path 
[\#2662](https://github.com/delta-io/delta-rs/issues/2662) +- Support HDFS via hdfs-native package [\#2611](https://github.com/delta-io/delta-rs/issues/2611) +- Deletion `_change_type` does not appear in change data feed [\#2579](https://github.com/delta-io/delta-rs/issues/2579) + +**Fixed bugs:** + +- Slow add\_actions.to\_pydict for tables with large number of columns, impacting read performance [\#2733](https://github.com/delta-io/delta-rs/issues/2733) +- append is deleting records [\#2716](https://github.com/delta-io/delta-rs/issues/2716) +- segmentation fault - Python 3.10 on Mac M3 [\#2706](https://github.com/delta-io/delta-rs/issues/2706) +- Failure to delete dir and files [\#2703](https://github.com/delta-io/delta-rs/issues/2703) +- DeltaTable.from\_data\_catalog not working [\#2699](https://github.com/delta-io/delta-rs/issues/2699) +- Project should use the same version of `ruff` in the `lint` stage of `python_build.yml` as in `pyproject.toml` [\#2678](https://github.com/delta-io/delta-rs/issues/2678) +- un-tracked columns are giving json error when pyarrow schema have feild with nullable=False and create\_checkpoint is trigged [\#2675](https://github.com/delta-io/delta-rs/issues/2675) +- \[BUG\]write\_delta\({'custom\_metadata':str}\) cannot be converted. 
str to pyDict error \(0.18.2\_DeltaPython/Windows10\) [\#2697](https://github.com/delta-io/delta-rs/issues/2697) +- Pyarrow engine not supporting schema overwrite with Append mode [\#2654](https://github.com/delta-io/delta-rs/issues/2654) +- `deltalake-core` version re-exported by `deltalake` different than versions used by `deltalake-azure` and `deltalake-gcp` [\#2647](https://github.com/delta-io/delta-rs/issues/2647) +- i32 limit in JSON stats [\#2646](https://github.com/delta-io/delta-rs/issues/2646) +- Rust writer not encoding correct URL for partitions in delta table [\#2634](https://github.com/delta-io/delta-rs/issues/2634) +- Large Types breaks merge predicate pruning [\#2632](https://github.com/delta-io/delta-rs/issues/2632) +- Getting error when converting a partitioned parquet table to delta table [\#2626](https://github.com/delta-io/delta-rs/issues/2626) +- Arrow: Parquet does not support writing empty structs when creating checkpoint [\#2622](https://github.com/delta-io/delta-rs/issues/2622) +- InvalidTableLocation\("Unknown scheme: gs"\) on 0.18.0 [\#2610](https://github.com/delta-io/delta-rs/issues/2610) +- Unable to read delta table created using Uniform [\#2578](https://github.com/delta-io/delta-rs/issues/2578) +- schema merging doesn't work when overwriting with a predicate [\#2567](https://github.com/delta-io/delta-rs/issues/2567) + +**Closed issues:** + +- Unable to write new partitions with type timestamp on tables created with delta-rs 0.10.0 [\#2631](https://github.com/delta-io/delta-rs/issues/2631) + +**Merged pull requests:** + +- fix: schema adapter doesn't map partial batches correctly [\#2735](https://github.com/delta-io/delta-rs/pull/2735) ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) +- perf: grab file size in rust [\#2734](https://github.com/delta-io/delta-rs/pull/2734) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat: use logical plan in update, refactor/simplify CDCTracker 
[\#2727](https://github.com/delta-io/delta-rs/pull/2727) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat: use logical plan in delete, delta planner refactoring [\#2725](https://github.com/delta-io/delta-rs/pull/2725) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore: try an alternative docke compose invocation syntax [\#2724](https://github.com/delta-io/delta-rs/pull/2724) ([rtyler](https://github.com/rtyler)) +- fix\(python, rust\): use input schema to get correct schema in cdf reads [\#2723](https://github.com/delta-io/delta-rs/pull/2723) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat\(python, rust\): cdc write-support for `overwrite` and `replacewhere` writes [\#2722](https://github.com/delta-io/delta-rs/pull/2722) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat\(python, rust\): cdc write-support for `delete` operation [\#2721](https://github.com/delta-io/delta-rs/pull/2721) ([ion-elgreco](https://github.com/ion-elgreco)) +- chore: enabling actions for merge groups [\#2718](https://github.com/delta-io/delta-rs/pull/2718) ([rtyler](https://github.com/rtyler)) +- perf: apply projection when reading checkpoint parquet [\#2717](https://github.com/delta-io/delta-rs/pull/2717) ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) +- feat\(python\): add DeltaTable.is\_deltatable static method \(\#2662\) [\#2715](https://github.com/delta-io/delta-rs/pull/2715) ([omkar-foss](https://github.com/omkar-foss)) +- chore: prepare python release 0.18.3 [\#2707](https://github.com/delta-io/delta-rs/pull/2707) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix\(python, rust\): use url encoder when encoding partition values [\#2705](https://github.com/delta-io/delta-rs/pull/2705) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat\(python, rust\): add projection in CDF reads [\#2704](https://github.com/delta-io/delta-rs/pull/2704) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: ensure DataFusion SessionState Parquet options are 
applied to DeltaScan [\#2702](https://github.com/delta-io/delta-rs/pull/2702) ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) +- chore: refactor `write_deltalake` in `writer.py` [\#2695](https://github.com/delta-io/delta-rs/pull/2695) ([fpgmaas](https://github.com/fpgmaas)) +- fix\(python\): empty dataset fix for "pyarrow" engine [\#2689](https://github.com/delta-io/delta-rs/pull/2689) ([sherlockbeard](https://github.com/sherlockbeard)) +- chore: add test coverage command to `Makefile` [\#2688](https://github.com/delta-io/delta-rs/pull/2688) ([fpgmaas](https://github.com/fpgmaas)) +- chore: create separate action to setup python and rust in the cicd pipeline [\#2687](https://github.com/delta-io/delta-rs/pull/2687) ([fpgmaas](https://github.com/fpgmaas)) +- fix: update delta kernel version [\#2685](https://github.com/delta-io/delta-rs/pull/2685) ([jeppe742](https://github.com/jeppe742)) +- chore: update README.md [\#2684](https://github.com/delta-io/delta-rs/pull/2684) ([veronewra](https://github.com/veronewra)) +- fix\(rust,python\): checkpoint with column nullable false [\#2680](https://github.com/delta-io/delta-rs/pull/2680) ([sherlockbeard](https://github.com/sherlockbeard)) +- chore: pin `ruff` and `mypy` versions in the `lint` stage in the CI pipeline [\#2679](https://github.com/delta-io/delta-rs/pull/2679) ([fpgmaas](https://github.com/fpgmaas)) +- chore: enable `RUF` ruleset for `ruff` [\#2677](https://github.com/delta-io/delta-rs/pull/2677) ([fpgmaas](https://github.com/fpgmaas)) +- chore: remove stale code for conditional import of `Literal` [\#2676](https://github.com/delta-io/delta-rs/pull/2676) ([fpgmaas](https://github.com/fpgmaas)) +- chore: remove references to black from the project [\#2674](https://github.com/delta-io/delta-rs/pull/2674) ([fpgmaas](https://github.com/fpgmaas)) +- chore: bump ruff to 0.5.2 [\#2673](https://github.com/delta-io/delta-rs/pull/2673) ([fpgmaas](https://github.com/fpgmaas)) +- chore: improve contributing.md 
[\#2672](https://github.com/delta-io/delta-rs/pull/2672) ([fpgmaas](https://github.com/fpgmaas)) +- feat: support userMetadata in CommitInfo [\#2670](https://github.com/delta-io/delta-rs/pull/2670) ([jkylling](https://github.com/jkylling)) +- chore: upgrade to datafusion 40 [\#2661](https://github.com/delta-io/delta-rs/pull/2661) ([rtyler](https://github.com/rtyler)) +- docs: improve navigation fixes [\#2660](https://github.com/delta-io/delta-rs/pull/2660) ([avriiil](https://github.com/avriiil)) +- docs: add integration docs for s3 backend [\#2658](https://github.com/delta-io/delta-rs/pull/2658) ([avriiil](https://github.com/avriiil)) +- docs: fix bullets on hdfs docs [\#2653](https://github.com/delta-io/delta-rs/pull/2653) ([Kimahriman](https://github.com/Kimahriman)) +- ci: update CODEOWNERS [\#2650](https://github.com/delta-io/delta-rs/pull/2650) ([hntd187](https://github.com/hntd187)) +- feat\(rust\): fix size\_in\_bytes in last\_checkpoint\_ to i64 [\#2649](https://github.com/delta-io/delta-rs/pull/2649) ([sherlockbeard](https://github.com/sherlockbeard)) +- chore: increase subcrate versions [\#2648](https://github.com/delta-io/delta-rs/pull/2648) ([rtyler](https://github.com/rtyler)) +- chore: missed one macos runner reference in actions [\#2645](https://github.com/delta-io/delta-rs/pull/2645) ([rtyler](https://github.com/rtyler)) +- chore: add a reproduction case for merge failures with struct\ [\#2644](https://github.com/delta-io/delta-rs/pull/2644) ([rtyler](https://github.com/rtyler)) +- chore: remove macos builders from pull request flow [\#2638](https://github.com/delta-io/delta-rs/pull/2638) ([rtyler](https://github.com/rtyler)) +- fix: enable parquet pushdown for DeltaScan via TableProvider impl for DeltaTable \(rebase\) [\#2637](https://github.com/delta-io/delta-rs/pull/2637) ([rtyler](https://github.com/rtyler)) +- chore: fix documentation generation with a pin of griffe [\#2636](https://github.com/delta-io/delta-rs/pull/2636) 
([rtyler](https://github.com/rtyler)) +- fix\(python\): fixed large\_dtype to schema convert [\#2635](https://github.com/delta-io/delta-rs/pull/2635) ([sherlockbeard](https://github.com/sherlockbeard)) +- fix\(rust, python\): fix writing empty structs when creating checkpoint [\#2627](https://github.com/delta-io/delta-rs/pull/2627) ([sherlockbeard](https://github.com/sherlockbeard)) +- fix\(rust, python\): fix merge schema with overwrite [\#2623](https://github.com/delta-io/delta-rs/pull/2623) ([sherlockbeard](https://github.com/sherlockbeard)) +- chore: bump python 0.18.2 [\#2621](https://github.com/delta-io/delta-rs/pull/2621) ([ion-elgreco](https://github.com/ion-elgreco)) +- feat: report DataFusion metrics for DeltaScan [\#2617](https://github.com/delta-io/delta-rs/pull/2617) ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) +- feat\(rust,python\): cast each parquet file to delta schema [\#2615](https://github.com/delta-io/delta-rs/pull/2615) ([HawaiianSpork](https://github.com/HawaiianSpork)) +- fix\(rust\): inconsistent order of partitioning columns \(\#2494\) [\#2614](https://github.com/delta-io/delta-rs/pull/2614) ([aditanase](https://github.com/aditanase)) +- docs: add Daft writer [\#2594](https://github.com/delta-io/delta-rs/pull/2594) ([avriiil](https://github.com/avriiil)) +- feat\(python, rust\): `add column` operation [\#2562](https://github.com/delta-io/delta-rs/pull/2562) ([ion-elgreco](https://github.com/ion-elgreco)) +- fix: change arrow map root name to follow with parquet root name [\#2538](https://github.com/delta-io/delta-rs/pull/2538) ([sclmn](https://github.com/sclmn)) +- feat\(python\): handle PyCapsule interface objects in write\_deltalake [\#2534](https://github.com/delta-io/delta-rs/pull/2534) ([kylebarron](https://github.com/kylebarron)) + ## [rust-v0.19.0](https://github.com/delta-io/delta-rs/tree/rust-v0.19.0) (2024-08-14) [Full Changelog](https://github.com/delta-io/delta-rs/compare/rust-v0.18.2...rust-v0.19.0) diff --git 
a/crates/aws/Cargo.toml b/crates/aws/Cargo.toml index 992a32c93e..e653d2b242 100644 --- a/crates/aws/Cargo.toml +++ b/crates/aws/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "deltalake-aws" -version = "0.3.0" +version = "0.4.0" authors.workspace = true keywords.workspace = true readme.workspace = true @@ -12,7 +12,7 @@ repository.workspace = true rust-version.workspace = true [dependencies] -deltalake-core = { version = "0.20.0", path = "../core" } +deltalake-core = { version = "0.21.0", path = "../core" } aws-smithy-runtime-api = { version="1.7" } aws-smithy-runtime = { version="1.7", optional = true} aws-credential-types = { version="1.2", features = ["hardcoded-credentials"]} diff --git a/crates/aws/src/logstore/default_logstore.rs b/crates/aws/src/logstore/default_logstore.rs index a5688141c2..f343b88f5a 100644 --- a/crates/aws/src/logstore/default_logstore.rs +++ b/crates/aws/src/logstore/default_logstore.rs @@ -3,11 +3,8 @@ use std::sync::Arc; use bytes::Bytes; +use deltalake_core::logstore::*; use deltalake_core::{ - logstore::{ - abort_commit_entry, get_latest_version, read_commit_entry, write_commit_entry, - CommitOrBytes, LogStore, LogStoreConfig, - }, operations::transaction::TransactionError, storage::{ObjectStoreRef, StorageOptions}, DeltaResult, @@ -103,6 +100,10 @@ impl LogStore for S3LogStore { get_latest_version(self, current_version).await } + async fn get_earliest_version(&self, current_version: i64) -> DeltaResult { + get_earliest_version(self, current_version).await + } + fn object_store(&self) -> Arc { self.storage.clone() } diff --git a/crates/aws/src/logstore/dynamodb_logstore.rs b/crates/aws/src/logstore/dynamodb_logstore.rs index 202df1709e..fccb8c9060 100644 --- a/crates/aws/src/logstore/dynamodb_logstore.rs +++ b/crates/aws/src/logstore/dynamodb_logstore.rs @@ -296,6 +296,10 @@ impl LogStore for S3DynamoDbLogStore { } } + async fn get_earliest_version(&self, current_version: i64) -> DeltaResult { + get_earliest_version(self, 
current_version).await + } + fn object_store(&self) -> ObjectStoreRef { self.storage.clone() } diff --git a/crates/azure/Cargo.toml b/crates/azure/Cargo.toml index 87a744d608..c9976fae9e 100644 --- a/crates/azure/Cargo.toml +++ b/crates/azure/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "deltalake-azure" -version = "0.3.0" +version = "0.4.0" authors.workspace = true keywords.workspace = true readme.workspace = true @@ -12,7 +12,7 @@ repository.workspace = true rust-version.workspace = true [dependencies] -deltalake-core = { version = "0.20.0", path = "../core" } +deltalake-core = { version = "0.21.0", path = "../core" } lazy_static = "1" # workspace depenndecies diff --git a/crates/catalog-glue/Cargo.toml b/crates/catalog-glue/Cargo.toml index 549b3a11c8..c80ec9ce0b 100644 --- a/crates/catalog-glue/Cargo.toml +++ b/crates/catalog-glue/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "deltalake-catalog-glue" -version = "0.4.0" +version = "0.5.0" authors.workspace = true keywords.workspace = true readme.workspace = true @@ -15,7 +15,7 @@ rust-version.workspace = true async-trait = { workspace = true } aws-config = "1" aws-sdk-glue = "1" -deltalake-core = { version = "0.20.0", path = "../core" } +deltalake-core = { version = "0.21.0", path = "../core" } thiserror = { workspace = true } [dev-dependencies] diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml index 52df035c71..d150b5b0d6 100644 --- a/crates/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "deltalake-core" -version = "0.20.0" +version = "0.21.0" authors.workspace = true keywords.workspace = true readme.workspace = true diff --git a/crates/core/src/delta_datafusion/mod.rs b/crates/core/src/delta_datafusion/mod.rs index 8d64f85fb2..50983d8400 100644 --- a/crates/core/src/delta_datafusion/mod.rs +++ b/crates/core/src/delta_datafusion/mod.rs @@ -644,7 +644,15 @@ impl<'a> DeltaScanBuilder<'a> { let mut exec_plan_builder = ParquetExecBuilder::new(FileScanConfig { object_store_url: 
self.log_store.object_store_url(), file_schema, - file_groups: file_groups.into_values().collect(), + // If all files were filtered out, we still need to emit at least one partition to + // pass datafusion sanity checks. + // + // See https://github.com/apache/datafusion/issues/11322 + file_groups: if file_groups.is_empty() { + vec![vec![]] + } else { + file_groups.into_values().collect() + }, statistics: stats, projection: self.projection.cloned(), limit: self.limit, @@ -1764,8 +1772,11 @@ impl From for DeltaColumn { #[cfg(test)] mod tests { - use arrow_array::StructArray; - use arrow_schema::Schema; + use crate::operations::create::CreateBuilder; + use crate::operations::write::SchemaMode; + use crate::writer::test_utils::get_delta_schema; + use arrow::array::StructArray; + use arrow::datatypes::{Field, Schema}; use chrono::{TimeZone, Utc}; use datafusion::assert_batches_sorted_eq; use datafusion::datasource::physical_plan::ParquetExec; @@ -1774,13 +1785,12 @@ mod tests { use datafusion_expr::lit; use datafusion_proto::physical_plan::AsExecutionPlan; use datafusion_proto::protobuf; + use delta_kernel::schema::StructField; use object_store::path::Path; use serde_json::json; use std::ops::Deref; use super::*; - use crate::operations::write::SchemaMode; - use crate::writer::test_utils::get_delta_schema; // test deserialization of serialized partition values. // https://github.com/delta-io/delta/blob/master/PROTOCOL.md#partition-value-serialization @@ -2566,4 +2576,26 @@ mod tests { Ok(true) } } + + #[tokio::test] + async fn passes_sanity_checker_when_all_files_filtered() { + // Run a query that filters out all files and sorts. + // Verify that it returns an empty set of rows without panicing. + // + // Historically, we had a bug that caused us to emit a query plan with 0 partitions, which + // datafusion rejected. 
+ let table = crate::open_table("../test/tests/data/delta-2.2.0-partitioned-types") + .await + .unwrap(); + let ctx = SessionContext::new(); + ctx.register_table("test", Arc::new(table)).unwrap(); + + let df = ctx + .sql("select * from test where c3 = 100 ORDER BY c1 ASC") + .await + .unwrap(); + let actual = df.collect().await.unwrap(); + + assert_eq!(actual.len(), 0); + } } diff --git a/crates/core/src/logstore/default_logstore.rs b/crates/core/src/logstore/default_logstore.rs index 79a1c76653..c31280ef5d 100644 --- a/crates/core/src/logstore/default_logstore.rs +++ b/crates/core/src/logstore/default_logstore.rs @@ -97,6 +97,10 @@ impl LogStore for DefaultLogStore { super::get_latest_version(self, current_version).await } + async fn get_earliest_version(&self, current_version: i64) -> DeltaResult<i64> { + super::get_earliest_version(self, current_version).await + } + fn object_store(&self) -> Arc<dyn ObjectStore> { self.storage.clone() } diff --git a/crates/core/src/logstore/mod.rs b/crates/core/src/logstore/mod.rs index dd82274157..a81811faeb 100644 --- a/crates/core/src/logstore/mod.rs +++ b/crates/core/src/logstore/mod.rs @@ -1,11 +1,12 @@ //! Delta log store. +use std::cmp::min; use std::io::{BufRead, BufReader, Cursor}; use std::sync::OnceLock; use std::{cmp::max, collections::HashMap, sync::Arc}; use bytes::Bytes; use dashmap::DashMap; -use futures::StreamExt; +use futures::{StreamExt, TryStreamExt}; use lazy_static::lazy_static; use object_store::{path::Path, Error as ObjectStoreError, ObjectStore}; use regex::Regex; @@ -213,6 +214,9 @@ pub trait LogStore: Sync + Send { /// Find latest version currently stored in the delta log. async fn get_latest_version(&self, start_version: i64) -> DeltaResult<i64>; + /// Find earliest version currently stored in the delta log. + async fn get_earliest_version(&self, start_version: i64) -> DeltaResult<i64>; + /// Get underlying object store. 
fn object_store(&self) -> Arc<dyn ObjectStore>; @@ -441,6 +445,52 @@ pub async fn get_latest_version( Ok(version) } +/// Default implementation for retrieving the earliest version +pub async fn get_earliest_version( + log_store: &dyn LogStore, + current_version: i64, +) -> DeltaResult<i64> { + let version_start = match get_last_checkpoint(log_store).await { + Ok(last_check_point) => last_check_point.version, + Err(ProtocolError::CheckpointNotFound) => { + // no checkpoint so start from current_version + current_version + } + Err(e) => { + return Err(DeltaTableError::from(e)); + } + }; + + // list files to find min version + let version = async { + let mut min_version: i64 = version_start; + let prefix = Some(log_store.log_path()); + let offset_path = commit_uri_from_version(version_start); + let object_store = log_store.object_store(); + + // Manually filter until we can provide direction in https://github.com/apache/arrow-rs/issues/6274 + let mut files = object_store + .list(prefix) + .try_filter(move |f| futures::future::ready(f.location < offset_path)) + .boxed(); + + while let Some(obj_meta) = files.next().await { + let obj_meta = obj_meta?; + if let Some(log_version) = extract_version_from_filename(obj_meta.location.as_ref()) { + min_version = min(min_version, log_version); + } + } + + if min_version < 0 { + return Err(DeltaTableError::not_a_table(log_store.root_uri())); + } + + Ok::<i64, DeltaTableError>(min_version) + } + .await?; + Ok(version) +} + /// Read delta log for a specific version pub async fn read_commit_entry( storage: &dyn ObjectStore, diff --git a/crates/core/src/operations/merge/mod.rs b/crates/core/src/operations/merge/mod.rs index 7f87d30d35..fe8d030464 100644 --- a/crates/core/src/operations/merge/mod.rs +++ b/crates/core/src/operations/merge/mod.rs @@ -1154,6 +1154,14 @@ async fn execute( .select(write_projection.clone())? .with_column(CDC_COLUMN_NAME, lit("insert"))?, ); + + let after = cdc_projection + .clone() + .filter(col(TARGET_COLUMN).is_true())? 
+ .select(write_projection.clone())?; + + // Extra select_columns is required so that before and after have same schema order + // DataFusion doesn't have UnionByName yet, see https://github.com/apache/datafusion/issues/12650 let before = cdc_projection .clone() .filter(col(crate::delta_datafusion::PATH_COLUMN).is_not_null())? @@ -1164,13 +1172,16 @@ async fn execute( .filter(|c| c.name != crate::delta_datafusion::PATH_COLUMN) .map(|c| Expr::Column(c.clone())) .collect_vec(), + )? + .select_columns( + &after + .schema() + .columns() + .iter() + .map(|v| v.name()) + .collect::>(), )?; - let after = cdc_projection - .clone() - .filter(col(TARGET_COLUMN).is_true())? - .select(write_projection.clone())?; - let tracker = CDCTracker::new(before, after); change_data.push(tracker.collect()?); } diff --git a/crates/core/src/operations/optimize.rs b/crates/core/src/operations/optimize.rs index cf096d56d1..c4048c9fb7 100644 --- a/crates/core/src/operations/optimize.rs +++ b/crates/core/src/operations/optimize.rs @@ -718,6 +718,8 @@ impl MergePlan { }) .boxed(), OptimizeOperations::ZOrder(zorder_columns, bins) => { + debug!("Starting zorder with the columns: {zorder_columns:?} {bins:?}"); + #[cfg(not(feature = "datafusion"))] let exec_context = Arc::new(zorder::ZOrderExecContext::new( zorder_columns, @@ -729,7 +731,6 @@ impl MergePlan { bins.len() <= num_cpus::get(), )); - debug!("Starting zorder with the columns: {zorder_columns:?} {bins:?}"); #[cfg(feature = "datafusion")] let exec_context = Arc::new(zorder::ZOrderExecContext::new( zorder_columns, diff --git a/crates/core/src/operations/transaction/mod.rs b/crates/core/src/operations/transaction/mod.rs index 6c4e81dc63..69027cc4b7 100644 --- a/crates/core/src/operations/transaction/mod.rs +++ b/crates/core/src/operations/transaction/mod.rs @@ -83,7 +83,7 @@ use object_store::path::Path; use object_store::Error as ObjectStoreError; use serde_json::Value; -use self::conflict_checker::{CommitConflictError, TransactionInfo, 
WinningCommitSummary}; +use self::conflict_checker::{TransactionInfo, WinningCommitSummary}; use crate::checkpoints::{cleanup_expired_logs_for, create_checkpoint_for}; use crate::errors::DeltaTableError; use crate::kernel::{ @@ -97,6 +97,7 @@ use crate::table::config::TableConfig; use crate::table::state::DeltaTableState; use crate::{crate_version, DeltaResult}; +pub use self::conflict_checker::CommitConflictError; pub use self::protocol::INSTANCE as PROTOCOL; #[cfg(test)] diff --git a/crates/core/src/table/mod.rs b/crates/core/src/table/mod.rs index 65d84985c7..9b73745a20 100644 --- a/crates/core/src/table/mod.rs +++ b/crates/core/src/table/mod.rs @@ -311,6 +311,11 @@ impl DeltaTable { self.log_store.get_latest_version(self.version()).await } + /// returns the earliest available version of the table + pub async fn get_earliest_version(&self) -> Result { + self.log_store.get_earliest_version(self.version()).await + } + /// Currently loaded version of the table pub fn version(&self) -> i64 { self.state.as_ref().map(|s| s.version()).unwrap_or(-1) diff --git a/crates/deltalake/Cargo.toml b/crates/deltalake/Cargo.toml index 4d4df632c5..1477d90c29 100644 --- a/crates/deltalake/Cargo.toml +++ b/crates/deltalake/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "deltalake" -version = "0.20.0" +version = "0.21.0" authors.workspace = true keywords.workspace = true readme.workspace = true @@ -16,12 +16,12 @@ rust-version.workspace = true features = ["azure", "datafusion", "gcs", "hdfs", "json", "python", "s3", "unity-experimental"] [dependencies] -deltalake-core = { version = "0.20.0", path = "../core" } -deltalake-aws = { version = "0.3.0", path = "../aws", default-features = false, optional = true } -deltalake-azure = { version = "0.3.0", path = "../azure", optional = true } -deltalake-gcp = { version = "0.4.0", path = "../gcp", optional = true } -deltalake-hdfs = { version = "0.4.0", path = "../hdfs", optional = true } -deltalake-catalog-glue = { version = "0.4.0", path = 
"../catalog-glue", optional = true } +deltalake-core = { version = "0.21.0", path = "../core" } +deltalake-aws = { version = "0.4.0", path = "../aws", default-features = false, optional = true } +deltalake-azure = { version = "0.4.0", path = "../azure", optional = true } +deltalake-gcp = { version = "0.5.0", path = "../gcp", optional = true } +deltalake-hdfs = { version = "0.5.0", path = "../hdfs", optional = true } +deltalake-catalog-glue = { version = "0.5.0", path = "../catalog-glue", optional = true } [features] # All of these features are just reflected into the core crate until that diff --git a/crates/gcp/Cargo.toml b/crates/gcp/Cargo.toml index 380aa84852..51020fb630 100644 --- a/crates/gcp/Cargo.toml +++ b/crates/gcp/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "deltalake-gcp" -version = "0.4.0" +version = "0.5.0" authors.workspace = true keywords.workspace = true readme.workspace = true @@ -12,7 +12,7 @@ repository.workspace = true rust-version.workspace = true [dependencies] -deltalake-core = { version = "0.20.0", path = "../core" } +deltalake-core = { version = "0.21.0", path = "../core" } lazy_static = "1" # workspace depenndecies diff --git a/crates/hdfs/Cargo.toml b/crates/hdfs/Cargo.toml index f601f55c6d..2d654d0bc1 100644 --- a/crates/hdfs/Cargo.toml +++ b/crates/hdfs/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "deltalake-hdfs" -version = "0.4.0" +version = "0.5.0" authors.workspace = true keywords.workspace = true readme.workspace = true @@ -12,7 +12,7 @@ repository.workspace = true rust-version.workspace = true [dependencies] -deltalake-core = { version = "0.20.0", path = "../core" } +deltalake-core = { version = "0.21.0", path = "../core" } hdfs-native-object-store = "0.11" # workspace dependecies diff --git a/crates/mount/Cargo.toml b/crates/mount/Cargo.toml index a770200b98..97372895ab 100644 --- a/crates/mount/Cargo.toml +++ b/crates/mount/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "deltalake-mount" -version = "0.4.0" +version = "0.5.0" 
authors.workspace = true keywords.workspace = true readme.workspace = true @@ -12,7 +12,7 @@ repository.workspace = true rust-version.workspace = true [dependencies] -deltalake-core = { version = "0.20.0", path = "../core", features = [ +deltalake-core = { version = "0.21.0", path = "../core", features = [ "datafusion", ] } lazy_static = "1" diff --git a/crates/test/Cargo.toml b/crates/test/Cargo.toml index 3638e6fefa..9087755fb1 100644 --- a/crates/test/Cargo.toml +++ b/crates/test/Cargo.toml @@ -1,13 +1,13 @@ [package] name = "deltalake-test" -version = "0.3.0" +version = "0.4.0" edition = "2021" publish = false [dependencies] bytes = { workspace = true } chrono = { workspace = true, default-features = false, features = ["clock"] } -deltalake-core = { version = "0.20.0", path = "../core" } +deltalake-core = { version = "0.21.0", path = "../core" } dotenvy = "0" fs_extra = "1.3.0" futures = { version = "0.3" } diff --git a/docs/integrations/object-storage/adls.md b/docs/integrations/object-storage/adls.md new file mode 100644 index 0000000000..2867c07da3 --- /dev/null +++ b/docs/integrations/object-storage/adls.md @@ -0,0 +1,57 @@ +# Azure ADLS Storage Backend + +`delta-rs` offers native support for using Microsoft Azure Data Lake Storage (ADLS) as an object storage backend. + +You don’t need to install any extra dependencies to read/write Delta tables to ADLS with engines that use `delta-rs`. You do need to configure your ADLS access credentials correctly. + +## Passing Credentials Explicitly + +You can also pass ADLS credentials to your query engine explicitly. + +For Polars, you would do this using the `storage_options` keyword as demonstrated below. This will forward your credentials to the `object store` library that Polars uses for cloud storage access under the hood. Read the [`object store` documentation](https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html#variants) for more information on defining specific credentials. 
+ +## Example: Write Delta table to ADLS with Polars + +Using Polars, you can write a Delta table to ADLS directly like this: + +```python +import polars as pl + +df = pl.DataFrame({"foo": [1, 2, 3, 4, 5]}) + +# define container name +container = <container_name> + +# define credentials +storage_options = { + "ACCOUNT_NAME": <account_name>, + "ACCESS_KEY": <access_key>, +} + +# write Delta to ADLS +df.write_delta( + f"abfs://{container}/delta_table", + storage_options = storage_options +) +``` + +## Example with pandas + +For libraries without direct `write_delta` methods (like Pandas), you can use the `write_deltalake` function from the `deltalake` library: + +```python +import pandas as pd +from deltalake import write_deltalake + +df = pd.DataFrame({"foo": [1, 2, 3, 4, 5]}) + +write_deltalake( + f"abfs://{container}/delta_table_pandas", + df, + storage_options=storage_options +) +``` + +## Using Local Authentication + +If your local session is authenticated using the Azure CLI then you can write Delta tables directly to ADLS. Read more about this in the [Azure CLI documentation](https://learn.microsoft.com/en-us/cli/azure/). diff --git a/docs/integrations/object-storage/s3-like.md b/docs/integrations/object-storage/s3-like.md index 40b2f6e076..b548b4e929 100644 --- a/docs/integrations/object-storage/s3-like.md +++ b/docs/integrations/object-storage/s3-like.md @@ -1,6 +1,6 @@ # CloudFlare R2 & Minio -`delta-rs` offers native support for using Cloudflare R2 and Minio's as storage backend. R2 and Minio support conditional puts, however we have to pass this flag into the storage options. See the example below +`delta-rs` offers native support for using Cloudflare R2 or Minio as an S3-compatible storage backend. R2 and Minio support conditional puts, which removes the need for DynamoDB for safe concurrent writes. However, we have to pass the `aws_conditional_put` flag into `storage_options`. See the example below. 
You don’t need to install any extra dependencies to read/write Delta tables to S3 with engines that use `delta-rs`. You do need to configure your AWS access credentials correctly. @@ -30,7 +30,7 @@ Follow the steps below to use Delta Lake on S3 (R2/Minio) with Polars: ```python storage_options = { 'AWS_SECRET_ACCESS_KEY': , - 'conditional_put': 'etag', # Here we say to use conditional put, this provides safe concurrency. + 'aws_conditional_put': 'etag', # Here we say to use conditional put, this provides safe concurrency. } ``` @@ -43,41 +43,34 @@ storage_options = { ) ``` -## Delta Lake on S3: Safe Concurrent Writes -You need a locking provider to ensure safe concurrent writes when writing Delta tables to S3. This is because S3 does not guarantee mutual exclusion. +## Minio and Docker -A locking provider guarantees that only one writer is able to create the same file. This prevents corrupted or conflicting data. +Minio is straightforward to host locally with Docker and docker-compose, via the following `docker-compose.yml` file - just run `docker-compose up`: -`delta-rs` uses DynamoDB to guarantee safe concurrent writes. +```yaml +version: '3.8' -Run the code below in your terminal to create a DynamoDB table that will act as your locking provider. - -``` - aws dynamodb create-table \ - --table-name delta_log \ - --attribute-definitions AttributeName=tablePath,AttributeType=S AttributeName=fileName,AttributeType=S \ - --key-schema AttributeName=tablePath,KeyType=HASH AttributeName=fileName,KeyType=RANGE \ - --provisioned-throughput ReadCapacityUnits=5,WriteCapacityUnits=5 +services: + minio: + image: minio/minio + ports: + - "9000:9000" + - "9001:9001" + environment: + MINIO_ROOT_USER: ... + MINIO_ROOT_PASSWORD: ... + command: server /data --console-address ":9001" ``` -If for some reason you don't want to use DynamoDB as your locking mechanism you can choose to set the `AWS_S3_ALLOW_UNSAFE_RENAME` variable to `true` in order to enable S3 unsafe writes. 
- -Read more in the [Usage](../../usage/writing/writing-to-s3-with-locking-provider.md) section. - -## Delta Lake on S3: Required permissions - -You need to have permissions to get, put and delete objects in the S3 bucket you're storing your data in. Please note that you must be allowed to delete objects even if you're just appending to the Delta Lake, because there are temporary files into the log folder that are deleted after usage. - -In AWS S3, you will need the following permissions: +With this configuration, Minio will host its S3-compatible API over HTTP, not HTTPS, on port 9000. This requires an additional flag in `storage_options`, `AWS_ALLOW_HTTP`, to be set to `true`: -- s3:GetObject -- s3:PutObject -- s3:DeleteObject - -In DynamoDB, you will need the following permissions: - -- dynamodb:GetItem -- dynamodb:Query -- dynamodb:PutItem -- dynamodb:UpdateItem +```python +storage_options = { + 'AWS_ACCESS_KEY_ID': ..., + 'AWS_SECRET_ACCESS_KEY': ..., + 'AWS_ENDPOINT_URL': 'http://localhost:9000', + 'AWS_ALLOW_HTTP': 'true', + 'aws_conditional_put': 'etag' +} +``` diff --git a/mkdocs.yml b/mkdocs.yml index b0c8d3a0ac..dae40ee871 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -82,6 +82,8 @@ nav: - api/exceptions.md - Integrations: - Object Storage: + - integrations/object-storage/adls.md + - integrations/object-storage/gcs.md - integrations/object-storage/hdfs.md - integrations/object-storage/s3.md - integrations/object-storage/s3-like.md diff --git a/python/Cargo.toml b/python/Cargo.toml index 0c3c62232a..fadc260d0d 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "deltalake-python" -version = "0.20.0" +version = "0.20.2" authors = ["Qingping Hou ", "Will Jones "] homepage = "https://github.com/delta-io/delta-rs" license = "Apache-2.0" diff --git a/python/deltalake/_internal.pyi b/python/deltalake/_internal.pyi index 02a3765e02..8329dddad9 100644 --- a/python/deltalake/_internal.pyi +++ b/python/deltalake/_internal.pyi @@ 
-77,6 +77,7 @@ class RawDeltaTable: def has_files(self) -> bool: ... def get_add_file_sizes(self) -> Dict[str, int]: ... def get_latest_version(self) -> int: ... + def get_earliest_version(self) -> int: ... def get_num_index_cols(self) -> int: ... def get_stats_columns(self) -> Optional[List[str]]: ... def metadata(self) -> RawDeltaTableMetaData: ... diff --git a/python/src/lib.rs b/python/src/lib.rs index 77db334283..473f5ceea9 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -245,6 +245,14 @@ impl RawDeltaTable { }) } + pub fn get_earliest_version(&mut self, py: Python) -> PyResult { + py.allow_threads(|| { + Ok(rt() + .block_on(self._table.get_earliest_version()) + .map_err(PythonError::from)?) + }) + } + pub fn get_num_index_cols(&mut self) -> PyResult { Ok(self ._table diff --git a/python/tests/test_merge.py b/python/tests/test_merge.py index 54c2726fd3..2306e1668a 100644 --- a/python/tests/test_merge.py +++ b/python/tests/test_merge.py @@ -1,3 +1,5 @@ +import datetime +import os import pathlib import pyarrow as pa @@ -1038,3 +1040,40 @@ def test_merge_isin_partition_pruning( assert result == expected assert metrics["num_target_files_scanned"] == 2 assert metrics["num_target_files_skipped_during_scan"] == 3 + + +def test_cdc_merge_planning_union_2908(tmp_path): + """https://github.com/delta-io/delta-rs/issues/2908""" + cdc_path = f"{tmp_path}/_change_data" + + data = { + "id": pa.array([1, 2], pa.int64()), + "date": pa.array( + [datetime.date(1970, 1, 1), datetime.date(1970, 1, 2)], pa.date32() + ), + } + + table = pa.Table.from_pydict(data) + + dt = DeltaTable.create( + table_uri=tmp_path, + schema=table.schema, + mode="overwrite", + partition_by=["id"], + configuration={ + "delta.enableChangeDataFeed": "true", + }, + ) + + dt.merge( + source=table, + predicate="s.id = t.id", + source_alias="s", + target_alias="t", + ).when_not_matched_insert_all().execute() + + last_action = dt.history(1)[0] + + assert last_action["operation"] == "MERGE" + assert 
dt.version() == 1 + assert os.path.exists(cdc_path), "_change_data doesn't exist"