diff --git a/Cargo.lock b/Cargo.lock index 4e761628..31c1bb7c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2179,16 +2179,16 @@ dependencies = [ [[package]] name = "deltalake" -version = "0.17.1" -source = "git+https://github.com/delta-io/delta-rs?rev=28ad3950d90573fa8ff413c336b657b8561e1d41#28ad3950d90573fa8ff413c336b657b8561e1d41" +version = "0.17.3" +source = "git+https://github.com/splitgraph/delta-rs?branch=convert-to-delta-stats#5972aab07723fe11243c017f1938b96b70d45810" dependencies = [ "deltalake-core", ] [[package]] name = "deltalake-core" -version = "0.17.1" -source = "git+https://github.com/delta-io/delta-rs?rev=28ad3950d90573fa8ff413c336b657b8561e1d41#28ad3950d90573fa8ff413c336b657b8561e1d41" +version = "0.17.3" +source = "git+https://github.com/splitgraph/delta-rs?branch=convert-to-delta-stats#5972aab07723fe11243c017f1938b96b70d45810" dependencies = [ "arrow", "arrow-arith", @@ -2238,7 +2238,7 @@ dependencies = [ "roaring", "serde", "serde_json", - "sqlparser 0.44.0", + "sqlparser 0.46.0", "thiserror", "tokio", "tracing", @@ -5941,6 +5941,15 @@ dependencies = [ "sqlparser_derive", ] +[[package]] +name = "sqlparser" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11a81a8cad9befe4cf1b9d2d4b9c6841c76f0882a3fec00d95133953c13b3d3d" +dependencies = [ + "log", +] + [[package]] name = "sqlparser_derive" version = "0.2.2" diff --git a/Cargo.toml b/Cargo.toml index 097a68d8..70ba2b29 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -93,7 +93,7 @@ datafusion-expr = { workspace = true } datafusion-remote-tables = { path = "./datafusion_remote_tables", optional = true } -deltalake = { git = "https://github.com/delta-io/delta-rs", rev = "28ad3950d90573fa8ff413c336b657b8561e1d41", features = ["datafusion"] } +deltalake = { git = "https://github.com/splitgraph/delta-rs", branch = "convert-to-delta-stats", features = ["datafusion"] } futures = "0.3" hex = ">=0.4.0" diff --git a/tests/statements/convert.rs b/tests/statements/convert.rs index a2ede504..84651d97 100644 --- a/tests/statements/convert.rs +++ b/tests/statements/convert.rs @@ -71,5 +71,82 @@ async fn test_convert_from_flat_parquet_table() -> Result<()> { ) .await; + // Ensure partition/column stats are collected in add logs: + // https://github.com/delta-io/delta-rs/pull/2491 + let mut table = context.try_get_delta_table("table_converted").await?; + table.load().await?; + + // Convoluted way of sort-stable stats asserting + let state = table.snapshot()?; + let mut min_values = state + .min_values(&Column::from_name("column1")) + .expect("min values exist") + .as_any() + .downcast_ref::() + .expect("Failed to downcast to Int64Array") + .values() + .to_vec(); + min_values.sort(); + assert_eq!(min_values, vec![1, 3, 5]); + + let mut max_values = state + .max_values(&Column::from_name("column1")) + .expect("max values exist") + .as_any() + .downcast_ref::() + .expect("Failed to downcast to Int64Array") + .values() + .to_vec(); + max_values.sort(); + assert_eq!(max_values, vec![2, 4, 6]); + + let min_values = state + .min_values(&Column::from_name("column2")) + .expect("min values exist"); + let min_values = min_values + .as_any() + .downcast_ref::() + .expect("Failed to downcast to StringArray") + .iter() + .flatten() + .sorted() + .collect::>(); + assert_eq!(min_values, vec!["five", "four", "one"]); + + let max_values = state + .max_values(&Column::from_name("column2")) + .expect("max values exist"); + let max_values = max_values + .as_any() + .downcast_ref::() + .expect("Failed to downcast to StringArray") + .iter() + .flatten() + .sorted() + .collect::>(); + assert_eq!(max_values, vec!["six", "three", "two"]); + + assert_eq!( + table.statistics(), + Some(Statistics { + num_rows: Exact(6), + total_byte_size: Inexact(1985), + column_statistics: vec![ + ColumnStatistics { + null_count: Exact(0), + max_value: Exact(ScalarValue::Int64(Some(6))), + min_value: Exact(ScalarValue::Int64(Some(1))), + distinct_count: Absent + }, + ColumnStatistics { + null_count: Exact(0), + max_value: Absent, + min_value: Absent, + distinct_count: Absent + } + ] + }), + ); + Ok(()) } diff --git a/tests/statements/mod.rs b/tests/statements/mod.rs index e424fde9..2cbadff4 100644 --- a/tests/statements/mod.rs +++ b/tests/statements/mod.rs @@ -2,12 +2,17 @@ use std::collections::HashMap; use std::env; use std::time::Duration; +use arrow::array::{Int64Array, StringArray}; use arrow::record_batch::RecordBatch; use chrono::{TimeZone, Utc}; use datafusion::assert_batches_eq; use datafusion::datasource::TableProvider; +use datafusion::physical_optimizer::pruning::PruningStatistics; +use datafusion_common::stats::Precision::{Absent, Exact, Inexact}; +use datafusion_common::Column; use datafusion_common::{assert_contains, Result}; -use itertools::sorted; +use datafusion_common::{ColumnStatistics, ScalarValue, Statistics}; +use itertools::{sorted, Itertools}; use seafowl::catalog::{DEFAULT_DB, DEFAULT_SCHEMA}; #[cfg(feature = "remote-tables")]