diff --git a/.github/dependabot.yml b/.github/dependabot.yml index bdacb4c00c..1e5b6b27a4 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -10,6 +10,5 @@ updates: ignore: # arrow and datafusion are bumped manually - dependency-name: "arrow*" - update-types: ["version-update:semver-major"] - dependency-name: "datafusion*" - update-types: ["version-update:semver-major"] + - dependency-name: "parquet" diff --git a/Cargo.toml b/Cargo.toml index 74a4c73597..ccbb766e0f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,7 +26,7 @@ debug = true debug = "line-tables-only" [workspace.dependencies] -delta_kernel = { version = "0.3.0" } +delta_kernel = { version = "=0.3.0" } # delta_kernel = { path = "../delta-kernel-rs/kernel", version = "0.3.0" } # arrow diff --git a/crates/aws/Cargo.toml b/crates/aws/Cargo.toml index e79d92a3d2..9fe0c05934 100644 --- a/crates/aws/Cargo.toml +++ b/crates/aws/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "deltalake-aws" -version = "0.1.4" +version = "0.2.0" authors.workspace = true keywords.workspace = true readme.workspace = true @@ -12,7 +12,7 @@ repository.workspace = true rust-version.workspace = true [dependencies] -deltalake-core = { version = ">=0.17.0, <0.20.0", path = "../core" } +deltalake-core = { version = "0.19.1", path = "../core" } aws-smithy-runtime-api = { version="1.1.7" } aws-smithy-runtime = { version="1.1.7", optional = true} aws-credential-types = { version="1.1.7", features = ["hardcoded-credentials"]} diff --git a/crates/azure/Cargo.toml b/crates/azure/Cargo.toml index 955c3f827b..56181a9f0c 100644 --- a/crates/azure/Cargo.toml +++ b/crates/azure/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "deltalake-azure" -version = "0.1.4" +version = "0.2.0" authors.workspace = true keywords.workspace = true readme.workspace = true @@ -12,7 +12,7 @@ repository.workspace = true rust-version.workspace = true [dependencies] -deltalake-core = { version = ">=0.17.0, <0.20.0", path = "../core" } +deltalake-core = { version = "0.19.1", path = "../core" } lazy_static = "1" # workspace depenndecies diff --git a/crates/catalog-glue/Cargo.toml b/crates/catalog-glue/Cargo.toml index 74e927bfc4..b107d1955a 100644 --- a/crates/catalog-glue/Cargo.toml +++ b/crates/catalog-glue/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "deltalake-catalog-glue" -version = "0.2.0" +version = "0.3.0" authors.workspace = true keywords.workspace = true readme.workspace = true @@ -15,9 +15,7 @@ rust-version.workspace = true async-trait = { workspace = true } aws-config = "1" aws-sdk-glue = "1" -deltalake-core = { version = ">=0.17.0, <0.20.0", path = "../core" } -# This can depend on a lowest common denominator of core once that's released -# deltalake_core = { version = "0.17.0" } +deltalake-core = { version = "0.19.1", path = "../core" } thiserror = { workspace = true } [dev-dependencies] diff --git a/crates/core/src/kernel/snapshot/parse.rs b/crates/core/src/kernel/snapshot/parse.rs index fc61187c00..f75744691e 100644 --- a/crates/core/src/kernel/snapshot/parse.rs +++ b/crates/core/src/kernel/snapshot/parse.rs @@ -83,7 +83,7 @@ pub(super) fn read_adds(array: &dyn ProvidesColumnByName) -> DeltaResult(arr, "size")?; let modification_time = ex::extract_and_cast::(arr, "modificationTime")?; let data_change = ex::extract_and_cast::(arr, "dataChange")?; - let stats = ex::extract_and_cast::(arr, "stats")?; + let stats = ex::extract_and_cast_opt::(arr, "stats"); let tags = ex::extract_and_cast_opt::(arr, "tags"); let dv = ex::extract_and_cast_opt::(arr, "deletionVector"); @@ -126,7 +126,8 @@ pub(super) fn read_adds(array: &dyn ProvidesColumnByName) -> DeltaResult ObjectStoreResult { + // In mounted storage we do an unsafe rename/overwrite + // We don't conditionally check whether the file already exists + options.mode = PutMode::Overwrite; self.inner.put_opts(location, bytes, options).await } diff --git a/crates/test/tests/data/delta-checkpoint-stats-optional/_delta_log/00000000000000000000.json b/crates/test/tests/data/delta-checkpoint-stats-optional/_delta_log/00000000000000000000.json new file mode 100644 index 0000000000..a67c417df7 --- /dev/null +++ b/crates/test/tests/data/delta-checkpoint-stats-optional/_delta_log/00000000000000000000.json @@ -0,0 +1,3 @@ +{"commitInfo":{"timestamp":1666652369577,"userId":"6114986638742036","userName":"dummy_username","operation":"CREATE OR REPLACE TABLE","operationParameters":{"isManaged":"false","description":null,"partitionBy":"[]","properties":"{\"delta.checkpoint.writeStatsAsJson\":\"false\",\"delta.checkpoint.writeStatsAsStruct\":\"true\"}"},"notebook":{"notebookId":"1829280694121074"},"clusterId":"1007-161845-fa2h8e50","isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{},"engineInfo":"Databricks-Runtime/10.4.x-scala2.12","txnId":"a8510a45-92dc-4e9f-9f7a-42bbcc9b752d"}} +{"protocol":{"minReaderVersion":1,"minWriterVersion":2}} +{"metaData":{"id":"8d3d2b8a-f091-4d7d-8a37-432a9beaf17b","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"integer\",\"type\":\"integer\",\"nullable\":false,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpoint.writeStatsAsJson":"false","delta.checkpoint.writeStatsAsStruct":"true"},"createdTime":1666652369483}} diff --git a/crates/test/tests/data/delta-checkpoint-stats-optional/_delta_log/00000000000000000001.json b/crates/test/tests/data/delta-checkpoint-stats-optional/_delta_log/00000000000000000001.json new file mode 100644 index 0000000000..9ed804569e --- /dev/null +++ b/crates/test/tests/data/delta-checkpoint-stats-optional/_delta_log/00000000000000000001.json @@ -0,0 +1,3 @@ +{"commitInfo":{"timestamp":1666652373383,"userId":"6114986638742036","userName":"dummy_username","operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"notebook":{"notebookId":"1829280694121074"},"clusterId":"1007-161845-fa2h8e50","readVersion":0,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"5489"},"engineInfo":"Databricks-Runtime/10.4.x-scala2.12","txnId":"35e88c76-9cfb-4e0e-bce8-2317f3c49c75"}} +{"metaData":{"id":"8d3d2b8a-f091-4d7d-8a37-432a9beaf17b","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"integer\",\"type\":\"integer\",\"nullable\":false,\"metadata\":{}},{\"name\":\"null\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"boolean\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"double\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal\",\"type\":\"decimal(8,5)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"string\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"binary\",\"type\":\"binary\",\"nullable\":true,\"metadata\":{}},{\"name\":\"date\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}},{\"name\":\"timestamp\",\"type\":\"timestamp\",\"nullable\":true,\"metadata\":{}},{\"name\":\"struct\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"struct_element\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"map\",\"type\":{\"type\":\"map\",\"keyType\":\"string\",\"valueType\":\"string\",\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"array\",\"type\":{\"type\":\"array\",\"elementType\":\"string\",\"containsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"nested_struct\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"struct_element\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"nested_struct_element\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"struct_of_array_of_map\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"struct_element\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"map\",\"keyType\":\"string\",\"valueType\":\"string\",\"valueContainsNull\":true},\"containsNull\":true},\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpoint.writeStatsAsJson":"false","delta.checkpoint.writeStatsAsStruct":"true"},"createdTime":1666652369483}} +{"add":{"path":"part-00000-7a509247-4f58-4453-9202-51d75dee59af-c000.snappy.parquet","partitionValues":{},"size":5489,"modificationTime":1666652373000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"integer\":0,\"double\":1.234,\"decimal\":-5.67800,\"string\":\"string\",\"date\":\"2022-10-24\",\"timestamp\":\"2022-10-24T22:59:32.846Z\",\"struct\":{\"struct_element\":\"struct_value\"},\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":\"nested_struct_value\"}}},\"maxValues\":{\"integer\":0,\"double\":1.234,\"decimal\":-5.67800,\"string\":\"string\",\"date\":\"2022-10-24\",\"timestamp\":\"2022-10-24T22:59:32.846Z\",\"struct\":{\"struct_element\":\"struct_value\"},\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":\"nested_struct_value\"}}},\"nullCount\":{\"integer\":0,\"null\":1,\"boolean\":0,\"double\":0,\"decimal\":0,\"string\":0,\"binary\":0,\"date\":0,\"timestamp\":0,\"struct\":{\"struct_element\":0},\"map\":0,\"array\":0,\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":0}},\"struct_of_array_of_map\":{\"struct_element\":0}}}","tags":{"INSERTION_TIME":"1666652373000000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/crates/test/tests/data/delta-checkpoint-stats-optional/_delta_log/00000000000000000002.checkpoint.parquet b/crates/test/tests/data/delta-checkpoint-stats-optional/_delta_log/00000000000000000002.checkpoint.parquet new file mode 100644 index 0000000000..e944de8c60 Binary files /dev/null and b/crates/test/tests/data/delta-checkpoint-stats-optional/_delta_log/00000000000000000002.checkpoint.parquet differ diff --git a/crates/test/tests/data/delta-checkpoint-stats-optional/_delta_log/00000000000000000002.json b/crates/test/tests/data/delta-checkpoint-stats-optional/_delta_log/00000000000000000002.json new file mode 100644 index 0000000000..f6f9a119ce --- /dev/null +++ b/crates/test/tests/data/delta-checkpoint-stats-optional/_delta_log/00000000000000000002.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1666652374424,"userId":"6114986638742036","userName":"dummy_username","operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"notebook":{"notebookId":"1829280694121074"},"clusterId":"1007-161845-fa2h8e50","readVersion":1,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"5489"},"engineInfo":"Databricks-Runtime/10.4.x-scala2.12","txnId":"efe25f5f-e03a-458d-8fbe-34ed2111b3c1"}} +{"add":{"path":"part-00000-28925d3a-bdf2-411e-bca9-b067444cbcb0-c000.snappy.parquet","partitionValues":{},"size":5489,"modificationTime":1666652374000,"dataChange":true,"stats_parsed":null,"tags":{"INSERTION_TIME":"1666652374000000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/crates/test/tests/data/delta-checkpoint-stats-optional/_delta_log/_last_checkpoint b/crates/test/tests/data/delta-checkpoint-stats-optional/_delta_log/_last_checkpoint new file mode 100644 index 0000000000..ce032077f2 --- /dev/null +++ b/crates/test/tests/data/delta-checkpoint-stats-optional/_delta_log/_last_checkpoint @@ -0,0 +1 @@ +{"size":4,"size_in_bytes":41898,"version":2} \ No newline at end of file diff --git a/crates/test/tests/data/delta-checkpoint-stats-optional/part-00000-28925d3a-bdf2-411e-bca9-b067444cbcb0-c000.snappy.parquet b/crates/test/tests/data/delta-checkpoint-stats-optional/part-00000-28925d3a-bdf2-411e-bca9-b067444cbcb0-c000.snappy.parquet new file mode 100644 index 0000000000..1b473a23e8 Binary files /dev/null and b/crates/test/tests/data/delta-checkpoint-stats-optional/part-00000-28925d3a-bdf2-411e-bca9-b067444cbcb0-c000.snappy.parquet differ diff --git a/crates/test/tests/data/delta-checkpoint-stats-optional/part-00000-7a509247-4f58-4453-9202-51d75dee59af-c000.snappy.parquet b/crates/test/tests/data/delta-checkpoint-stats-optional/part-00000-7a509247-4f58-4453-9202-51d75dee59af-c000.snappy.parquet new file mode 100644 index 0000000000..612bc6a5b6 Binary files /dev/null and b/crates/test/tests/data/delta-checkpoint-stats-optional/part-00000-7a509247-4f58-4453-9202-51d75dee59af-c000.snappy.parquet differ diff --git a/python/Cargo.toml b/python/Cargo.toml index d4c1597277..8ce542fd2e 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "deltalake-python" -version = "0.19.2" +version = "0.19.3" authors = ["Qingping Hou ", "Will Jones "] homepage = "https://github.com/delta-io/delta-rs" license = "Apache-2.0" diff --git a/python/tests/test_writer.py b/python/tests/test_writer.py index 0186500032..c82d64d96c 100644 --- a/python/tests/test_writer.py +++ b/python/tests/test_writer.py @@ -1850,3 +1850,41 @@ def test_empty_dataset_write(tmp_path: pathlib.Path, sample_data: pa.Table): empty_dataset = dataset(empty_arrow_table) with pytest.raises(DeltaError, match="No data source supplied to write command"): write_deltalake(tmp_path, empty_dataset, mode="append") + + +@pytest.mark.pandas +def test_predicate_out_of_bounds(tmp_path: pathlib.Path): + """See """ + import pandas as pd + + data = [ + (datetime(2024, 7, 31, 9, 30, 0), "AAPL", "20240731", 100, 11.1), + (datetime(2024, 7, 31, 9, 30, 0), "GOOG", "20240731", 200, 11.1), + ] + columns = ["ts", "ins", "date", "f1", "f2"] + df = pd.DataFrame(data, columns=columns) + + predicate = "date == 20240731" + write_deltalake( + table_or_uri=tmp_path, + data=df, + partition_by="date", + mode="overwrite", + schema_mode="merge", + predicate=predicate, + ) + + data = [ + (datetime(2024, 7, 31, 9, 30, 0), "AAPL", "20240731", 666, 666), + (datetime(2024, 7, 31, 9, 30, 0), "GOOG", "20240731", 777, 777), + ] + columns = ["ts", "ins", "date", "fb", "fc"] + df = pd.DataFrame(data, columns=columns) + write_deltalake( + table_or_uri=tmp_path, + data=df, + partition_by="date", + mode="overwrite", + schema_mode="merge", + predicate=predicate, + )