Commit 04e90a2

Merge branch 'main' into actions-cleanup

rtyler authored Sep 12, 2024
2 parents b0ea240 + 45bda3f commit 04e90a2
Showing 22 changed files with 86 additions and 29 deletions.
3 changes: 1 addition & 2 deletions .github/dependabot.yml
@@ -10,6 +10,5 @@ updates:
ignore:
# arrow and datafusion are bumped manually
- dependency-name: "arrow*"
update-types: ["version-update:semver-major"]
- dependency-name: "datafusion*"
update-types: ["version-update:semver-major"]
- dependency-name: "parquet"
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -26,7 +26,7 @@ debug = true
debug = "line-tables-only"

[workspace.dependencies]
-delta_kernel = { version = "0.3.0" }
+delta_kernel = { version = "=0.3.0" }
# delta_kernel = { path = "../delta-kernel-rs/kernel", version = "0.3.0" }

# arrow
4 changes: 2 additions & 2 deletions crates/aws/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "deltalake-aws"
-version = "0.1.4"
+version = "0.2.0"
authors.workspace = true
keywords.workspace = true
readme.workspace = true
@@ -12,7 +12,7 @@ repository.workspace = true
rust-version.workspace = true

[dependencies]
-deltalake-core = { version = ">=0.17.0, <0.20.0", path = "../core" }
+deltalake-core = { version = "0.19.1", path = "../core" }
aws-smithy-runtime-api = { version="1.1.7" }
aws-smithy-runtime = { version="1.1.7", optional = true}
aws-credential-types = { version="1.1.7", features = ["hardcoded-credentials"]}
4 changes: 2 additions & 2 deletions crates/azure/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "deltalake-azure"
-version = "0.1.4"
+version = "0.2.0"
authors.workspace = true
keywords.workspace = true
readme.workspace = true
@@ -12,7 +12,7 @@ repository.workspace = true
rust-version.workspace = true

[dependencies]
-deltalake-core = { version = ">=0.17.0, <0.20.0", path = "../core" }
+deltalake-core = { version = "0.19.1", path = "../core" }
lazy_static = "1"

# workspace depenndecies
6 changes: 2 additions & 4 deletions crates/catalog-glue/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "deltalake-catalog-glue"
-version = "0.2.0"
+version = "0.3.0"
authors.workspace = true
keywords.workspace = true
readme.workspace = true
@@ -15,9 +15,7 @@ rust-version.workspace = true
async-trait = { workspace = true }
aws-config = "1"
aws-sdk-glue = "1"
-deltalake-core = { version = ">=0.17.0, <0.20.0", path = "../core" }
-# This can depend on a lowest common denominator of core once that's released
-# deltalake_core = { version = "0.17.0" }
+deltalake-core = { version = "0.19.1", path = "../core" }
thiserror = { workspace = true }

[dev-dependencies]
5 changes: 3 additions & 2 deletions crates/core/src/kernel/snapshot/parse.rs
@@ -83,7 +83,7 @@ pub(super) fn read_adds(array: &dyn ProvidesColumnByName) -> DeltaResult<Vec<Add
let size = ex::extract_and_cast::<Int64Array>(arr, "size")?;
let modification_time = ex::extract_and_cast::<Int64Array>(arr, "modificationTime")?;
let data_change = ex::extract_and_cast::<BooleanArray>(arr, "dataChange")?;
-let stats = ex::extract_and_cast::<StringArray>(arr, "stats")?;
+let stats = ex::extract_and_cast_opt::<StringArray>(arr, "stats");
let tags = ex::extract_and_cast_opt::<MapArray>(arr, "tags");
let dv = ex::extract_and_cast_opt::<StructArray>(arr, "deletionVector");

@@ -126,7 +126,8 @@ pub(super) fn read_adds(array: &dyn ProvidesColumnByName) -> DeltaResult<Vec<Add
size: ex::read_primitive(size, i)?,
modification_time: ex::read_primitive(modification_time, i)?,
data_change: ex::read_bool(data_change, i)?,
-stats: ex::read_str_opt(stats, i).map(|s| s.to_string()),
+stats: stats
+    .and_then(|stats| ex::read_str_opt(stats, i).map(|s| s.to_string())),
partition_values: pvs
.and_then(|pv| collect_map(&pv.value(i)).map(|m| m.collect()))
.unwrap_or_default(),
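The parse.rs change above makes the "stats" column optional: `extract_and_cast_opt` returns `None` when a checkpoint was written without JSON stats (e.g. with `delta.checkpoint.writeStatsAsJson=false`, as in the new test fixture), so the `Add` action's `stats` field simply stays `None` instead of the whole read failing. A minimal, self-contained sketch of that optional-column pattern using arrow-rs directly; `extract_str_opt` is a hypothetical stand-in for the crate's `ex::extract_and_cast_opt`, not its actual signature:

```rust
use std::sync::Arc;

use arrow::array::{Array, ArrayRef, Int64Array, StringArray, StructArray};
use arrow::datatypes::{DataType, Field};

/// Returns the named column as a StringArray only if it exists and has that type.
fn extract_str_opt<'a>(batch: &'a StructArray, name: &str) -> Option<&'a StringArray> {
    batch
        .column_by_name(name)
        .and_then(|col| col.as_any().downcast_ref::<StringArray>())
}

fn main() {
    // Build an "add"-like struct with a "size" column but no "stats" column,
    // mimicking a checkpoint written with stats omitted.
    let size: ArrayRef = Arc::new(Int64Array::from(vec![5489_i64]));
    let batch = StructArray::from(vec![(
        Arc::new(Field::new("size", DataType::Int64, false)),
        size,
    )]);

    // The Option chain short-circuits to None instead of erroring out.
    let stats: Option<String> = extract_str_opt(&batch, "stats")
        .and_then(|col| (!col.is_null(0)).then(|| col.value(0).to_string()));
    assert!(stats.is_none());
    println!("stats: {stats:?}");
}
```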
2 changes: 1 addition & 1 deletion crates/core/src/operations/write.rs
@@ -578,7 +578,7 @@ async fn execute_non_empty_expr(
let input_dfschema: DFSchema = df_schema.as_ref().clone().try_into()?;

let scan_config = DeltaScanConfigBuilder::new()
-.with_schema(df_schema)
+.with_schema(snapshot.input_schema()?)
.build(snapshot)?;

let scan = DeltaScanBuilder::new(snapshot, log_store.clone(), &state)
9 changes: 9 additions & 0 deletions crates/core/src/protocol/mod.rs
@@ -1221,6 +1221,15 @@ mod tests {
assert_eq!(&expected_null_count, null_count_column);
}

#[tokio::test]
async fn test_table_checkpoint_not_always_with_stats() {
let path = "../test/tests/data/delta-checkpoint-stats-optional";
let mut table = crate::open_table(path).await.unwrap();
table.load().await.unwrap();

assert_eq!(2, table.snapshot().unwrap().file_actions().unwrap().len());
}

#[tokio::test]
async fn test_only_struct_stats() {
// test table with no json stats
12 changes: 6 additions & 6 deletions crates/deltalake/Cargo.toml
@@ -16,12 +16,12 @@ rust-version.workspace = true
features = ["azure", "datafusion", "gcs", "hdfs", "json", "python", "s3", "unity-experimental"]

[dependencies]
-deltalake-core = { version = "0.19.0", path = "../core" }
-deltalake-aws = { version = "0.1.4", path = "../aws", default-features = false, optional = true }
-deltalake-azure = { version = "0.1.4", path = "../azure", optional = true }
-deltalake-gcp = { version = "0.2.2", path = "../gcp", optional = true }
-deltalake-hdfs = { version = "0.2.0", path = "../hdfs", optional = true }
-deltalake-catalog-glue = { version = "0.2.0", path = "../catalog-glue", optional = true }
+deltalake-core = { version = "0.19.1", path = "../core" }
+deltalake-aws = { version = "0.2.0", path = "../aws", default-features = false, optional = true }
+deltalake-azure = { version = "0.2.0", path = "../azure", optional = true }
+deltalake-gcp = { version = "0.3.0", path = "../gcp", optional = true }
+deltalake-hdfs = { version = "0.3.0", path = "../hdfs", optional = true }
+deltalake-catalog-glue = { version = "0.3.0", path = "../catalog-glue", optional = true }

[features]
# All of these features are just reflected into the core crate until that
4 changes: 2 additions & 2 deletions crates/gcp/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "deltalake-gcp"
-version = "0.2.3"
+version = "0.3.0"
authors.workspace = true
keywords.workspace = true
readme.workspace = true
@@ -12,7 +12,7 @@ repository.workspace = true
rust-version.workspace = true

[dependencies]
-deltalake-core = { version = ">=0.17.0, <0.20.0", path = "../core" }
+deltalake-core = { version = "0.19.1", path = "../core" }
lazy_static = "1"

# workspace depenndecies
4 changes: 2 additions & 2 deletions crates/hdfs/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "deltalake-hdfs"
-version = "0.2.0"
+version = "0.3.0"
authors.workspace = true
keywords.workspace = true
readme.workspace = true
@@ -12,7 +12,7 @@ repository.workspace = true
rust-version.workspace = true

[dependencies]
-deltalake-core = { version = ">=0.17.0, <0.20.0", path = "../core" }
+deltalake-core = { version = "0.19.1", path = "../core" }
hdfs-native-object-store = "0.11"

# workspace dependecies
4 changes: 2 additions & 2 deletions crates/mount/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "deltalake-mount"
-version = "0.2.0"
+version = "0.3.0"
authors.workspace = true
keywords.workspace = true
readme.workspace = true
@@ -12,7 +12,7 @@ repository.workspace = true
rust-version.workspace = true

[dependencies]
-deltalake-core = { version = ">=0.17.0, <0.20.0", path = "../core", features = [
+deltalake-core = { version = "0.19.1", path = "../core", features = [
"datafusion",
] }
lazy_static = "1"
7 changes: 5 additions & 2 deletions crates/mount/src/file.rs
@@ -9,7 +9,7 @@ use object_store::{
GetResult, ListResult, ObjectMeta, ObjectStore, PutOptions, PutResult,
Result as ObjectStoreResult,
};
-use object_store::{MultipartUpload, PutMultipartOpts, PutPayload};
+use object_store::{MultipartUpload, PutMode, PutMultipartOpts, PutPayload};
use std::ops::Range;
use std::sync::Arc;
use url::Url;
@@ -168,8 +168,11 @@ impl ObjectStore for MountFileStorageBackend {
&self,
location: &ObjectStorePath,
bytes: PutPayload,
-options: PutOptions,
+mut options: PutOptions,
) -> ObjectStoreResult<PutResult> {
+// In mounted storage we do an unsafe rename/overwrite
+// We don't conditionally check whether the file already exists
+options.mode = PutMode::Overwrite;
self.inner.put_opts(location, bytes, options).await
}

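The mount backend now forces `PutMode::Overwrite`, since writes to mounted file systems are done as an unconditional rename/overwrite rather than a conditional "create only if absent" put. A rough sketch of what the two modes do, using `object_store`'s in-memory store (assumes the `object_store` and `tokio` crates; this is not the actual `MountFileStorageBackend` wiring):

```rust
use object_store::memory::InMemory;
use object_store::path::Path;
use object_store::{ObjectStore, PutMode, PutOptions, PutPayload};

#[tokio::main]
async fn main() -> object_store::Result<()> {
    let store = InMemory::new();
    let location = Path::from("_delta_log/00000000000000000001.json");

    // PutMode::Create is the conditional "fail if the object already exists" write.
    let mut create = PutOptions::default();
    create.mode = PutMode::Create;
    store
        .put_opts(&location, PutPayload::from(b"v1".to_vec()), create)
        .await?;

    // A second conditional put to the same path is rejected (AlreadyExists).
    let mut create_again = PutOptions::default();
    create_again.mode = PutMode::Create;
    assert!(store
        .put_opts(&location, PutPayload::from(b"v2".to_vec()), create_again)
        .await
        .is_err());

    // PutMode::Overwrite (what the mount backend now forces) replaces the object.
    let mut overwrite = PutOptions::default();
    overwrite.mode = PutMode::Overwrite;
    store
        .put_opts(&location, PutPayload::from(b"v2".to_vec()), overwrite)
        .await?;
    Ok(())
}
```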
@@ -0,0 +1,3 @@
{"commitInfo":{"timestamp":1666652369577,"userId":"6114986638742036","userName":"dummy_username","operation":"CREATE OR REPLACE TABLE","operationParameters":{"isManaged":"false","description":null,"partitionBy":"[]","properties":"{\"delta.checkpoint.writeStatsAsJson\":\"false\",\"delta.checkpoint.writeStatsAsStruct\":\"true\"}"},"notebook":{"notebookId":"1829280694121074"},"clusterId":"1007-161845-fa2h8e50","isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{},"engineInfo":"Databricks-Runtime/10.4.x-scala2.12","txnId":"a8510a45-92dc-4e9f-9f7a-42bbcc9b752d"}}
{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}
{"metaData":{"id":"8d3d2b8a-f091-4d7d-8a37-432a9beaf17b","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"integer\",\"type\":\"integer\",\"nullable\":false,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpoint.writeStatsAsJson":"false","delta.checkpoint.writeStatsAsStruct":"true"},"createdTime":1666652369483}}
@@ -0,0 +1,3 @@
{"commitInfo":{"timestamp":1666652373383,"userId":"6114986638742036","userName":"dummy_username","operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"notebook":{"notebookId":"1829280694121074"},"clusterId":"1007-161845-fa2h8e50","readVersion":0,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"5489"},"engineInfo":"Databricks-Runtime/10.4.x-scala2.12","txnId":"35e88c76-9cfb-4e0e-bce8-2317f3c49c75"}}
{"metaData":{"id":"8d3d2b8a-f091-4d7d-8a37-432a9beaf17b","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"integer\",\"type\":\"integer\",\"nullable\":false,\"metadata\":{}},{\"name\":\"null\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"boolean\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"double\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal\",\"type\":\"decimal(8,5)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"string\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"binary\",\"type\":\"binary\",\"nullable\":true,\"metadata\":{}},{\"name\":\"date\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}},{\"name\":\"timestamp\",\"type\":\"timestamp\",\"nullable\":true,\"metadata\":{}},{\"name\":\"struct\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"struct_element\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"map\",\"type\":{\"type\":\"map\",\"keyType\":\"string\",\"valueType\":\"string\",\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"array\",\"type\":{\"type\":\"array\",\"elementType\":\"string\",\"containsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"nested_struct\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"struct_element\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"nested_struct_element\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"struct_of_array_of_map\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"struct_element\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"map\",\"keyType\":\"string\",\"valueType\":\"string\",\"valueContainsNull\":true},\"containsNull\":true},\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpoint.writeStatsAsJson":"false","delta.checkpoint.writeStatsAsStruct":"true"},"createdTime":1666652369483}}
{"add":{"path":"part-00000-7a509247-4f58-4453-9202-51d75dee59af-c000.snappy.parquet","partitionValues":{},"size":5489,"modificationTime":1666652373000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"integer\":0,\"double\":1.234,\"decimal\":-5.67800,\"string\":\"string\",\"date\":\"2022-10-24\",\"timestamp\":\"2022-10-24T22:59:32.846Z\",\"struct\":{\"struct_element\":\"struct_value\"},\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":\"nested_struct_value\"}}},\"maxValues\":{\"integer\":0,\"double\":1.234,\"decimal\":-5.67800,\"string\":\"string\",\"date\":\"2022-10-24\",\"timestamp\":\"2022-10-24T22:59:32.846Z\",\"struct\":{\"struct_element\":\"struct_value\"},\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":\"nested_struct_value\"}}},\"nullCount\":{\"integer\":0,\"null\":1,\"boolean\":0,\"double\":0,\"decimal\":0,\"string\":0,\"binary\":0,\"date\":0,\"timestamp\":0,\"struct\":{\"struct_element\":0},\"map\":0,\"array\":0,\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":0}},\"struct_of_array_of_map\":{\"struct_element\":0}}}","tags":{"INSERTION_TIME":"1666652373000000","OPTIMIZE_TARGET_SIZE":"268435456"}}}
Binary file not shown.
@@ -0,0 +1,2 @@
{"commitInfo":{"timestamp":1666652374424,"userId":"6114986638742036","userName":"dummy_username","operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"notebook":{"notebookId":"1829280694121074"},"clusterId":"1007-161845-fa2h8e50","readVersion":1,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"5489"},"engineInfo":"Databricks-Runtime/10.4.x-scala2.12","txnId":"efe25f5f-e03a-458d-8fbe-34ed2111b3c1"}}
{"add":{"path":"part-00000-28925d3a-bdf2-411e-bca9-b067444cbcb0-c000.snappy.parquet","partitionValues":{},"size":5489,"modificationTime":1666652374000,"dataChange":true,"stats_parsed":null,"tags":{"INSERTION_TIME":"1666652374000000","OPTIMIZE_TARGET_SIZE":"268435456"}}}
@@ -0,0 +1 @@
{"size":4,"size_in_bytes":41898,"version":2}
Binary file not shown.
Binary file not shown.
2 changes: 1 addition & 1 deletion python/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "deltalake-python"
-version = "0.19.2"
+version = "0.19.3"
authors = ["Qingping Hou <[email protected]>", "Will Jones <[email protected]>"]
homepage = "https://github.com/delta-io/delta-rs"
license = "Apache-2.0"
38 changes: 38 additions & 0 deletions python/tests/test_writer.py
@@ -1850,3 +1850,41 @@ def test_empty_dataset_write(tmp_path: pathlib.Path, sample_data: pa.Table):
empty_dataset = dataset(empty_arrow_table)
with pytest.raises(DeltaError, match="No data source supplied to write command"):
write_deltalake(tmp_path, empty_dataset, mode="append")


@pytest.mark.pandas
def test_predicate_out_of_bounds(tmp_path: pathlib.Path):
"""See <https://github.com/delta-io/delta-rs/issues/2867>"""
import pandas as pd

data = [
(datetime(2024, 7, 31, 9, 30, 0), "AAPL", "20240731", 100, 11.1),
(datetime(2024, 7, 31, 9, 30, 0), "GOOG", "20240731", 200, 11.1),
]
columns = ["ts", "ins", "date", "f1", "f2"]
df = pd.DataFrame(data, columns=columns)

predicate = "date == 20240731"
write_deltalake(
table_or_uri=tmp_path,
data=df,
partition_by="date",
mode="overwrite",
schema_mode="merge",
predicate=predicate,
)

data = [
(datetime(2024, 7, 31, 9, 30, 0), "AAPL", "20240731", 666, 666),
(datetime(2024, 7, 31, 9, 30, 0), "GOOG", "20240731", 777, 777),
]
columns = ["ts", "ins", "date", "fb", "fc"]
df = pd.DataFrame(data, columns=columns)
write_deltalake(
table_or_uri=tmp_path,
data=df,
partition_by="date",
mode="overwrite",
schema_mode="merge",
predicate=predicate,
)
