From d833f080367276003af0e2c73c658ff96e671913 Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Sat, 11 Nov 2023 19:51:05 +0100 Subject: [PATCH 01/23] chore: update read me (#1810) # Description Marking FSCK ~and Table features~ as done and MERGE as semi-done. --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fc572cd467..a22ba8a295 100644 --- a/README.md +++ b/README.md @@ -152,8 +152,8 @@ of features outlined in the Delta [protocol][protocol] is also [tracked](#protoc | Delete - predicates | ![done] | ![done] | Delete data based on a predicate | | Optimize - compaction | ![done] | ![done] | Harmonize the size of data file | | Optimize - Z-order | ![done] | ![done] | Place similar data into the same file | -| Merge | [![semi-done]][merge-rs] | [![open]][merge-py] | Merge two tables (limited to full re-write) | -| FS check | ![done] | | Remove corrupted files from table | +| Merge | [![semi-done]][merge-rs] | [![semi-done]][merge-py] | Merge two tables (limited to full re-write) | +| FS check | ![done] | ![done] | Remove corrupted files from table | ### Protocol Support Level From 1368a789e9dfda7346f5fd0406670e6f39a2ffa7 Mon Sep 17 00:00:00 2001 From: Junjun Dong Date: Sun, 12 Nov 2023 01:09:34 -0800 Subject: [PATCH 02/23] feat: add convert_to_delta (#1686) # Description Add a convert_to_delta operation for converting a Parquet table to a Delta Table in place. # Related Issue(s) - closes #1041 - closes #1682 # Documentation --- crates/deltalake-core/src/kernel/schema.rs | 16 + .../src/operations/convert_to_delta.rs | 842 ++++++++++++++++++ crates/deltalake-core/src/operations/mod.rs | 2 + .../deltalake-core/src/schema/partitions.rs | 3 + crates/deltalake-core/src/writer/utils.rs | 2 +- 5 files changed, 864 insertions(+), 1 deletion(-) create mode 100644 crates/deltalake-core/src/operations/convert_to_delta.rs diff --git a/crates/deltalake-core/src/kernel/schema.rs b/crates/deltalake-core/src/kernel/schema.rs index 12391ca6e8..7694501dca 100644 --- a/crates/deltalake-core/src/kernel/schema.rs +++ b/crates/deltalake-core/src/kernel/schema.rs @@ -1,6 +1,8 @@ //! Delta table schema +use std::borrow::Borrow; use std::fmt::Formatter; +use std::hash::{Hash, Hasher}; use std::sync::Arc; use std::{collections::HashMap, fmt::Display}; @@ -110,6 +112,20 @@ pub struct StructField { pub metadata: HashMap, } +impl Hash for StructField { + fn hash(&self, state: &mut H) { + self.name.hash(state); + } +} + +impl Borrow for StructField { + fn borrow(&self) -> &str { + self.name.as_ref() + } +} + +impl Eq for StructField {} + impl StructField { /// Creates a new field pub fn new(name: impl Into, data_type: DataType, nullable: bool) -> Self { diff --git a/crates/deltalake-core/src/operations/convert_to_delta.rs b/crates/deltalake-core/src/operations/convert_to_delta.rs new file mode 100644 index 0000000000..84fffa1578 --- /dev/null +++ b/crates/deltalake-core/src/operations/convert_to_delta.rs @@ -0,0 +1,842 @@ +//! 
Command for converting a Parquet table to a Delta table in place +// https://github.com/delta-io/delta/blob/1d5dd774111395b0c4dc1a69c94abc169b1c83b6/spark/src/main/scala/org/apache/spark/sql/delta/commands/ConvertToDeltaCommand.scala + +use crate::{ + kernel::{Action, Add, Schema, StructField}, + logstore::{LogStore, LogStoreRef}, + operations::create::CreateBuilder, + protocol::SaveMode, + storage::config::configure_log_store, + table::builder::ensure_table_uri, + table::config::DeltaConfigKey, + DeltaResult, DeltaTable, DeltaTableError, DeltaTablePartition, ObjectStoreError, + NULL_PARTITION_VALUE_DATA_PATH, +}; +use arrow::{datatypes::Schema as ArrowSchema, error::ArrowError}; +use futures::{ + future::{self, BoxFuture}, + TryStreamExt, +}; +use log::{debug, info}; +use parquet::{ + arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStreamBuilder}, + errors::ParquetError, +}; +use percent_encoding::percent_decode_str; +use serde_json::{Map, Value}; +use std::{ + collections::{HashMap, HashSet}, + num::TryFromIntError, + str::Utf8Error, + sync::Arc, +}; + +/// Error converting a Parquet table to a Delta table +#[derive(Debug, thiserror::Error)] +enum Error { + #[error("Object store error: {0}")] + ObjectStore(#[from] ObjectStoreError), + #[error("Arrow error: {0}")] + Arrow(#[from] ArrowError), + #[error("Parquet error: {0}")] + Parquet(#[from] ParquetError), + #[error("DeltaTable error: {0}")] + DeltaTable(#[from] DeltaTableError), + #[error("Error percent-decoding as UTF-8: {0}")] + PercentDecode(#[from] Utf8Error), + #[error("Error converting usize to i64: {0}")] + TryFromUsize(#[from] TryFromIntError), + #[error("No parquet file is found in the given location")] + ParquetFileNotFound, + #[error("The schema of partition columns must be provided to convert a Parquet table to a Delta table")] + MissingPartitionSchema, + #[error("Partition column provided by the user does not exist in the parquet files")] + PartitionColumnNotExist(HashSet), + #[error("The given location is already a delta table location")] + DeltaTableAlready, + #[error("Location must be provided to convert a Parquet table to a Delta table")] + MissingLocation, +} + +impl From for DeltaTableError { + fn from(err: Error) -> Self { + match err { + Error::ObjectStore(e) => DeltaTableError::ObjectStore { source: e }, + Error::Arrow(e) => DeltaTableError::Arrow { source: e }, + Error::Parquet(e) => DeltaTableError::Parquet { source: e }, + Error::DeltaTable(e) => e, + _ => DeltaTableError::GenericError { + source: Box::new(err), + }, + } + } +} + +/// The partition strategy used by the Parquet table +/// Currently only hive-partitioning is supproted for Parquet paths +#[non_exhaustive] +#[derive(Default)] +pub enum PartitionStrategy { + /// Hive-partitioning + #[default] + Hive, +} + +/// Build an operation to convert a Parquet table to a [`DeltaTable`] in place +pub struct ConvertToDeltaBuilder { + log_store: Option, + location: Option, + storage_options: Option>, + partition_schema: HashSet, + partition_strategy: PartitionStrategy, + mode: SaveMode, + name: Option, + comment: Option, + configuration: HashMap>, + metadata: Option>, +} + +impl Default for ConvertToDeltaBuilder { + fn default() -> Self { + Self::new() + } +} + +impl ConvertToDeltaBuilder { + /// Create a new [`ConvertToDeltaBuilder`] + pub fn new() -> Self { + Self { + log_store: None, + location: None, + storage_options: None, + partition_schema: Default::default(), + partition_strategy: Default::default(), + mode: SaveMode::ErrorIfExists, + name: 
None, + comment: None, + configuration: Default::default(), + metadata: Default::default(), + } + } + + /// Provide a [`LogStore`] instance, that points at table location + pub fn with_log_store(mut self, log_store: Arc) -> Self { + self.log_store = Some(log_store); + self + } + + /// Specify the path to the location where table data is stored, + /// which could be a path on distributed storage. + /// + /// If an object store is also passed using `with_log_store()`, this path will be ignored. + pub fn with_location(mut self, location: impl Into) -> Self { + self.location = Some(location.into()); + self + } + + /// Set options used to initialize storage backend + /// + /// Options may be passed in the HashMap or set as environment variables. + /// + /// [crate::table::builder::s3_storage_options] describes the available options for the AWS or S3-compliant backend. + /// [dynamodb_lock::DynamoDbLockClient] describes additional options for the AWS atomic rename client. + /// + /// If an object store is also passed using `with_log_store()`, these options will be ignored. + pub fn with_storage_options(mut self, storage_options: HashMap) -> Self { + self.storage_options = Some(storage_options); + self + } + + /// Specify the partition schema of the Parquet table + pub fn with_partition_schema( + mut self, + partition_schema: impl IntoIterator, + ) -> Self { + self.partition_schema = HashSet::from_iter(partition_schema); + self + } + + /// Specify the partition strategy of the Parquet table + /// Currently only hive-partitioning is supproted for Parquet paths + pub fn with_partition_strategy(mut self, strategy: PartitionStrategy) -> Self { + self.partition_strategy = strategy; + self + } + /// Specify the behavior when a table exists at location + pub fn with_save_mode(mut self, save_mode: SaveMode) -> Self { + self.mode = save_mode; + self + } + + /// Specify the table name. Optionally qualified with + /// a database name [database_name.] table_name. + pub fn with_table_name(mut self, name: impl Into) -> Self { + self.name = Some(name.into()); + self + } + + /// Comment to describe the table. + pub fn with_comment(mut self, comment: impl Into) -> Self { + self.comment = Some(comment.into()); + self + } + + /// Set configuration on created table + pub fn with_configuration( + mut self, + configuration: impl IntoIterator, Option>)>, + ) -> Self { + self.configuration = configuration + .into_iter() + .map(|(k, v)| (k.into(), v.map(|s| s.into()))) + .collect(); + self + } + + /// Specify a table property in the table configuration + pub fn with_configuration_property( + mut self, + key: DeltaConfigKey, + value: Option>, + ) -> Self { + self.configuration + .insert(key.as_ref().into(), value.map(|v| v.into())); + self + } + + /// Append custom (application-specific) metadata to the commit. + /// + /// This might include provenance information such as an id of the + /// user that made the commit or the program that created it. + pub fn with_metadata(mut self, metadata: Map) -> Self { + self.metadata = Some(metadata); + self + } + + /// Consume self into CreateBuilder with corresponding add actions, schemas and operation meta + async fn into_create_builder(mut self) -> Result { + // Use the specified log store. If a log store is not provided, create a new store from the specified path. 
+ // Return an error if neither log store nor path is provided + let log_store = if let Some(log_store) = self.log_store { + log_store + } else if let Some(location) = self.location { + configure_log_store( + ensure_table_uri(location)?, + self.storage_options.unwrap_or_default(), + )? + } else { + return Err(Error::MissingLocation); + }; + + // Return an error if the location is already a Delta table location + if log_store.is_delta_table_location().await? { + return Err(Error::DeltaTableAlready); + } + info!( + "Converting Parquet table in log store location: {:?}", + log_store.root_uri() + ); + + // Get all the parquet files in the location + let object_store = log_store.object_store(); + let mut files = Vec::new(); + object_store + .list(None) + .await? + .try_for_each_concurrent(10, |meta| { + if Some("parquet") == meta.location.extension() { + debug!("Found parquet file {:#?}", meta.location); + files.push(meta); + } + future::ready(Ok(())) + }) + .await?; + if files.is_empty() { + return Err(Error::ParquetFileNotFound); + } + + // Iterate over the parquet files. Parse partition columns, generate add actions and collect parquet file schemas + let mut arrow_schemas = Vec::new(); + let mut actions = Vec::new(); + // A HashSet of all unique partition columns in a Parquet table + let mut partition_columns = HashSet::new(); + // A vector of StructField of all unique partition columns in a Parquet table + let mut partition_schema_fields = Vec::new(); + for file in files { + // A HashMap from partition column to value for this parquet file only + let mut partition_values = HashMap::new(); + let mut iter = file.location.as_ref().split('/').peekable(); + let mut subpath = iter.next(); + // Get partitions from subpaths. Skip the last subpath + while iter.peek().is_some() { + if let Some(subpath) = subpath { + // Return an error if the partition is not hive-partitioning + let partition = DeltaTablePartition::try_from( + percent_decode_str(subpath).decode_utf8()?.as_ref(), + )?; + debug!( + "Found partition {partition:#?} in parquet file {:#?}", + file.location + ); + let (key, val) = (partition.key, partition.value); + partition_values.insert( + key.clone(), + if val == NULL_PARTITION_VALUE_DATA_PATH { + None + } else { + Some(val) + }, + ); + if partition_columns.insert(key.clone()) { + if let Some(schema) = self.partition_schema.take(key.as_str()) { + partition_schema_fields.push(schema); + } else { + // Return an error if the schema of a partition column is not provided by user + return Err(Error::MissingPartitionSchema); + } + } + } else { + // This error shouldn't happen. The while condition ensures that subpath is not none + panic!("Subpath iterator index overflows"); + } + subpath = iter.next(); + } + + actions.push(Action::Add(Add { + path: percent_decode_str(file.location.as_ref()) + .decode_utf8()? + .to_string(), + size: i64::try_from(file.size)?, + partition_values, + modification_time: file.last_modified.timestamp_millis(), + data_change: true, + ..Default::default() + })); + + let mut arrow_schema = ParquetRecordBatchStreamBuilder::new(ParquetObjectReader::new( + object_store.clone(), + file, + )) + .await? 
+ .schema() + .as_ref() + .clone(); + // Arrow schema of Parquet files may have conflicting metatdata + // Since Arrow schema metadata is not used to generate Delta table schema, we set the metadata field to an empty HashMap + arrow_schema.metadata = HashMap::new(); + arrow_schemas.push(arrow_schema); + } + + if !self.partition_schema.is_empty() { + // Partition column provided by the user does not exist in the parquet files + return Err(Error::PartitionColumnNotExist(self.partition_schema)); + } + + // Merge parquet file schemas + // This step is needed because timestamp will not be preserved when copying files in S3. We can't use the schema of the latest parqeut file as Delta table's schema + let mut schema_fields = Schema::try_from(&ArrowSchema::try_merge(arrow_schemas)?)? + .fields() + .clone(); + schema_fields.append(&mut partition_schema_fields); + debug!("Schema fields for the parquet table: {schema_fields:#?}"); + + // Generate CreateBuilder with corresponding add actions, schemas and operation meta + let mut builder = CreateBuilder::new() + .with_log_store(log_store) + .with_columns(schema_fields) + .with_partition_columns(partition_columns.into_iter()) + .with_actions(actions) + .with_save_mode(self.mode) + .with_configuration(self.configuration); + if let Some(name) = self.name { + builder = builder.with_table_name(name); + } + if let Some(comment) = self.comment { + builder = builder.with_comment(comment); + } + if let Some(metadata) = self.metadata { + builder = builder.with_metadata(metadata); + } + Ok(builder) + } +} + +impl std::future::IntoFuture for ConvertToDeltaBuilder { + type Output = DeltaResult; + type IntoFuture = BoxFuture<'static, Self::Output>; + + fn into_future(self) -> Self::IntoFuture { + let this = self; + + Box::pin(async move { + let builder = this + .into_create_builder() + .await + .map_err(DeltaTableError::from)?; + let table = builder.await?; + Ok(table) + }) + } +} + +#[cfg(test)] +mod tests { + use super::{ + configure_log_store, ensure_table_uri, ConvertToDeltaBuilder, DeltaTable, LogStoreRef, + StructField, + }; + use crate::{ + kernel::schema::{DataType, PrimitiveType}, + open_table, + storage::config::StorageOptions, + Path, + }; + use pretty_assertions::assert_eq; + use std::fs; + use tempfile::tempdir; + + fn schema_field(key: &str, primitive: PrimitiveType, nullable: bool) -> StructField { + StructField::new(key.to_string(), DataType::Primitive(primitive), nullable) + } + + // Copy all Parquet files in the source location to a temp dir (with Delta log removed) + fn copy_files(src: impl AsRef, dst: impl AsRef) { + fs::create_dir_all(&dst).expect("Failed to create all directories"); + let files = fs::read_dir(src).expect("Failed to read source directory"); + for file in files { + let file = file.expect("Failed to read file"); + let name = file.file_name(); + // Skip Delta log + if name.to_str() != Some("_delta_log") { + if file.file_type().expect("Failed to get file type").is_dir() { + copy_files(file.path(), dst.as_ref().join(name)); + } else { + fs::copy(file.path(), dst.as_ref().join(name)).expect("Failed to copy file"); + } + } + } + } + + fn log_store(path: impl Into) -> LogStoreRef { + configure_log_store( + ensure_table_uri(path.into()).expect("Failed to convert to table URI"), + StorageOptions::default(), + ) + .expect("Failed to create an object store") + } + + async fn create_delta_table( + path: &str, + partition_schema: Vec, + // Whether testing on object store or path + from_path: bool, + ) -> DeltaTable { + let temp_dir = 
tempdir().expect("Failed to create a temp directory"); + let temp_dir = temp_dir + .path() + .to_str() + .expect("Failed to convert Path to string slice"); + // Copy all files to a temp directory to perform testing. Skip Delta log + copy_files(format!("{}/{}", env!("CARGO_MANIFEST_DIR"), path), temp_dir); + let builder = if from_path { + ConvertToDeltaBuilder::new().with_location(temp_dir) + } else { + ConvertToDeltaBuilder::new().with_log_store(log_store(temp_dir)) + }; + builder + .with_partition_schema(partition_schema) + .await + .unwrap_or_else(|e| { + panic!("Failed to convert to Delta table. Location: {path}. Error: {e}") + }) + } + + async fn open_created_delta_table( + path: &str, + partition_schema: Vec, + ) -> DeltaTable { + let temp_dir = tempdir().expect("Failed to create a temp directory"); + let temp_dir = temp_dir + .path() + .to_str() + .expect("Failed to convert to string slice"); + // Copy all files to a temp directory to perform testing. Skip Delta log + copy_files(format!("{}/{}", env!("CARGO_MANIFEST_DIR"), path), temp_dir); + ConvertToDeltaBuilder::new() + .with_log_store(log_store(temp_dir)) + .with_partition_schema(partition_schema) + .await + .unwrap_or_else(|e| { + panic!("Failed to convert to Delta table. Location: {path}. Error: {e}") + }); + open_table(temp_dir).await.expect("Failed to open table") + } + + fn assert_delta_table( + table: DeltaTable, + // Test data location in the repo + test_data_from: &str, + expected_version: i64, + expected_paths: Vec, + expected_schema: Vec, + expected_partition_values: &[(String, Option)], + ) { + assert_eq!( + table.version(), + expected_version, + "Testing location: {test_data_from:?}" + ); + + let mut files = table.get_files(); + files.sort(); + assert_eq!( + files, expected_paths, + "Testing location: {test_data_from:?}" + ); + + let mut schema_fields = table + .get_schema() + .expect("Failed to get schema") + .fields() + .clone(); + schema_fields.sort_by(|a, b| a.name().cmp(b.name())); + assert_eq!( + schema_fields, expected_schema, + "Testing location: {test_data_from:?}" + ); + + let mut partition_values = table + .get_partition_values() + .flat_map(|map| map.clone()) + .collect::>(); + partition_values.sort(); + assert_eq!(partition_values, expected_partition_values); + } + + // Test Parquet files in object store location + #[tokio::test] + async fn test_convert_to_delta() { + let path = "tests/data/delta-0.8.0-date"; + let table = create_delta_table(path, Vec::new(), false).await; + let action = table + .get_active_add_actions_by_partitions(&[]) + .expect("Failed to get Add actions") + .next() + .expect("Iterator index overflows"); + assert_eq!( + action.path, + "part-00000-d22c627d-9655-4153-9527-f8995620fa42-c000.snappy.parquet" + ); + assert!(action.data_change); + assert_delta_table( + table, + path, + 0, + vec![Path::from( + "part-00000-d22c627d-9655-4153-9527-f8995620fa42-c000.snappy.parquet", + )], + vec![ + schema_field("date", PrimitiveType::Date, true), + schema_field("dayOfYear", PrimitiveType::Integer, true), + ], + &[], + ); + + let path = "tests/data/delta-0.8.0-null-partition"; + let table = create_delta_table( + path, + vec![schema_field("k", PrimitiveType::String, true)], + false, + ) + .await; + assert_delta_table( + table, + path, + 0, + vec![ + Path::from("k=A/part-00000-b1f1dbbb-70bc-4970-893f-9bb772bf246e.c000.snappy.parquet"), + Path::from("k=__HIVE_DEFAULT_PARTITION__/part-00001-8474ac85-360b-4f58-b3ea-23990c71b932.c000.snappy.parquet") + ], + vec![ + schema_field("k", 
PrimitiveType::String, true), + schema_field("v", PrimitiveType::Long, true), + ], + &[ + ("k".to_string(), None), + ("k".to_string(), Some("A".to_string())), + ], + ); + + let path = "tests/data/delta-0.8.0-special-partition"; + let table = create_delta_table( + path, + vec![schema_field("x", PrimitiveType::String, true)], + false, + ) + .await; + assert_delta_table( + table, + path, + 0, + vec![ + Path::from_url_path( + "x=A%2FA/part-00007-b350e235-2832-45df-9918-6cab4f7578f7.c000.snappy.parquet", + ) + .expect("Invalid URL path"), + Path::from_url_path( + "x=B%20B/part-00015-e9abbc6f-85e9-457b-be8e-e9f5b8a22890.c000.snappy.parquet", + ) + .expect("Invalid URL path"), + ], + vec![ + schema_field("x", PrimitiveType::String, true), + schema_field("y", PrimitiveType::Long, true), + ], + &[ + ("x".to_string(), Some("A/A".to_string())), + ("x".to_string(), Some("B B".to_string())), + ], + ); + + let path = "tests/data/delta-0.8.0-partitioned"; + let table = create_delta_table( + path, + vec![ + schema_field("day", PrimitiveType::String, true), + schema_field("month", PrimitiveType::String, true), + schema_field("year", PrimitiveType::String, true), + ], + false, + ) + .await; + assert_delta_table( + table, + path, + 0, + vec![ + Path::from( + "year=2020/month=1/day=1/part-00000-8eafa330-3be9-4a39-ad78-fd13c2027c7e.c000.snappy.parquet", + ), + Path::from( + "year=2020/month=2/day=3/part-00000-94d16827-f2fd-42cd-a060-f67ccc63ced9.c000.snappy.parquet", + ), + Path::from( + "year=2020/month=2/day=5/part-00000-89cdd4c8-2af7-4add-8ea3-3990b2f027b5.c000.snappy.parquet", + ), + Path::from( + "year=2021/month=12/day=20/part-00000-9275fdf4-3961-4184-baa0-1c8a2bb98104.c000.snappy.parquet", + ), + Path::from( + "year=2021/month=12/day=4/part-00000-6dc763c0-3e8b-4d52-b19e-1f92af3fbb25.c000.snappy.parquet", + ), + Path::from( + "year=2021/month=4/day=5/part-00000-c5856301-3439-4032-a6fc-22b7bc92bebb.c000.snappy.parquet", + ), + ], + vec![ + schema_field("day", PrimitiveType::String, true), + schema_field("month", PrimitiveType::String, true), + schema_field("value", PrimitiveType::String, true), + schema_field("year", PrimitiveType::String, true), + ], + &[ + ("day".to_string(), Some("1".to_string())), + ("day".to_string(), Some("20".to_string())), + ("day".to_string(), Some("3".to_string())), + ("day".to_string(), Some("4".to_string())), + ("day".to_string(), Some("5".to_string())), + ("day".to_string(), Some("5".to_string())), + ("month".to_string(), Some("1".to_string())), + ("month".to_string(), Some("12".to_string())), + ("month".to_string(), Some("12".to_string())), + ("month".to_string(), Some("2".to_string())), + ("month".to_string(), Some("2".to_string())), + ("month".to_string(), Some("4".to_string())), + ("year".to_string(), Some("2020".to_string())), + ("year".to_string(), Some("2020".to_string())), + ("year".to_string(), Some("2020".to_string())), + ("year".to_string(), Some("2021".to_string())), + ("year".to_string(), Some("2021".to_string())), + ("year".to_string(), Some("2021".to_string())), + ], + ); + } + + // Test opening the newly created Delta table + #[tokio::test] + async fn test_open_created_delta_table() { + let path = "tests/data/delta-0.2.0"; + let table = open_created_delta_table(path, Vec::new()).await; + assert_delta_table( + table, + path, + 0, + vec![ + Path::from("part-00000-512e1537-8aaa-4193-b8b4-bef3de0de409-c000.snappy.parquet"), + Path::from("part-00000-7c2deba3-1994-4fb8-bc07-d46c948aa415-c000.snappy.parquet"), + 
Path::from("part-00000-b44fcdb0-8b06-4f3a-8606-f8311a96f6dc-c000.snappy.parquet"), + Path::from("part-00000-cb6b150b-30b8-4662-ad28-ff32ddab96d2-c000.snappy.parquet"), + Path::from("part-00001-185eca06-e017-4dea-ae49-fc48b973e37e-c000.snappy.parquet"), + Path::from("part-00001-4327c977-2734-4477-9507-7ccf67924649-c000.snappy.parquet"), + Path::from("part-00001-c373a5bd-85f0-4758-815e-7eb62007a15c-c000.snappy.parquet"), + ], + vec![schema_field("value", PrimitiveType::Integer, false)], + &[], + ); + + let path = "tests/data/delta-0.8-empty"; + let table = open_created_delta_table(path, Vec::new()).await; + assert_delta_table( + table, + path, + 0, + vec![ + Path::from("part-00000-b0cc5102-6177-4d60-80d3-b5d170011621-c000.snappy.parquet"), + Path::from("part-00007-02b8c308-e5a7-41a8-a653-cb5594582017-c000.snappy.parquet"), + ], + vec![schema_field("column", PrimitiveType::Long, true)], + &[], + ); + + let path = "tests/data/delta-0.8.0"; + let table = open_created_delta_table(path, Vec::new()).await; + assert_delta_table( + table, + path, + 0, + vec![ + Path::from("part-00000-04ec9591-0b73-459e-8d18-ba5711d6cbe1-c000.snappy.parquet"), + Path::from("part-00000-c9b90f86-73e6-46c8-93ba-ff6bfaf892a1-c000.snappy.parquet"), + Path::from("part-00001-911a94a2-43f6-4acb-8620-5e68c2654989-c000.snappy.parquet"), + ], + vec![schema_field("value", PrimitiveType::Integer, true)], + &[], + ); + } + + // Test Parquet files in path + #[tokio::test] + async fn test_convert_to_delta_from_path() { + let path = "tests/data/delta-2.2.0-partitioned-types"; + let table = create_delta_table( + path, + vec![ + schema_field("c1", PrimitiveType::Integer, true), + schema_field("c2", PrimitiveType::String, true), + ], + true, + ) + .await; + assert_delta_table( + table, + path, + 0, + vec![ + Path::from( + "c1=4/c2=c/part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet", + ), + Path::from( + "c1=5/c2=b/part-00007-4e73fa3b-2c88-424a-8051-f8b54328ffdb.c000.snappy.parquet", + ), + Path::from( + "c1=6/c2=a/part-00011-10619b10-b691-4fd0-acc4-2a9608499d7c.c000.snappy.parquet", + ), + ], + vec![ + schema_field("c1", PrimitiveType::Integer, true), + schema_field("c2", PrimitiveType::String, true), + schema_field("c3", PrimitiveType::Integer, true), + ], + &[ + ("c1".to_string(), Some("4".to_string())), + ("c1".to_string(), Some("5".to_string())), + ("c1".to_string(), Some("6".to_string())), + ("c2".to_string(), Some("a".to_string())), + ("c2".to_string(), Some("b".to_string())), + ("c2".to_string(), Some("c".to_string())), + ], + ); + + let path = "tests/data/delta-0.8.0-numeric-partition"; + let table = create_delta_table( + path, + vec![ + schema_field("x", PrimitiveType::Long, true), + schema_field("y", PrimitiveType::Double, true), + ], + true, + ) + .await; + assert_delta_table( + table, + path, + 0, + vec![ + Path::from( + "x=10/y=10.0/part-00015-24eb4845-2d25-4448-b3bb-5ed7f12635ab.c000.snappy.parquet", + ), + Path::from( + "x=9/y=9.9/part-00007-3c50fba1-4264-446c-9c67-d8e24a1ccf83.c000.snappy.parquet", + ), + ], + vec![ + schema_field("x", PrimitiveType::Long, true), + schema_field("y", PrimitiveType::Double, true), + schema_field("z", PrimitiveType::String, true), + ], + &[ + ("x".to_string(), Some("10".to_string())), + ("x".to_string(), Some("9".to_string())), + ("y".to_string(), Some("10.0".to_string())), + ("y".to_string(), Some("9.9".to_string())), + ], + ); + } + + #[tokio::test] + async fn test_missing_location() { + let _table = ConvertToDeltaBuilder::new() + .await + .expect_err("Location is 
missing. Should error"); + } + + #[tokio::test] + async fn test_empty_dir() { + let temp_dir = tempdir().expect("Failed to create a temp directory"); + let temp_dir = temp_dir + .path() + .to_str() + .expect("Failed to convert to string slice"); + let _table = ConvertToDeltaBuilder::new() + .with_location(temp_dir) + .await + .expect_err("Parquet file does not exist. Should error"); + } + + #[tokio::test] + async fn test_partition_column_not_exist() { + let _table = ConvertToDeltaBuilder::new() + .with_location("tests/data/delta-0.8.0-null-partition") + .with_partition_schema(vec![schema_field("foo", PrimitiveType::String, true)]) + .await + .expect_err( + "Partition column provided by user does not exist in the parquet files. Should error", + ); + } + + #[tokio::test] + async fn test_missing_partition_schema() { + let _table = ConvertToDeltaBuilder::new() + .with_location("tests/data/delta-0.8.0-numeric-partition") + .await + .expect_err("The schema of a partition column is not provided by user. Should error"); + } + + #[tokio::test] + async fn test_delta_table_already() { + let _table = ConvertToDeltaBuilder::new() + .with_location("tests/data/delta-0.2.0") + .await + .expect_err("The given location is already a delta table location. Should error"); + } +} diff --git a/crates/deltalake-core/src/operations/mod.rs b/crates/deltalake-core/src/operations/mod.rs index 0406272a5b..abf9753648 100644 --- a/crates/deltalake-core/src/operations/mod.rs +++ b/crates/deltalake-core/src/operations/mod.rs @@ -14,6 +14,8 @@ use crate::errors::{DeltaResult, DeltaTableError}; use crate::table::builder::DeltaTableBuilder; use crate::DeltaTable; +#[cfg(all(feature = "arrow", feature = "parquet"))] +pub mod convert_to_delta; pub mod create; pub mod filesystem_check; #[cfg(all(feature = "arrow", feature = "parquet"))] diff --git a/crates/deltalake-core/src/schema/partitions.rs b/crates/deltalake-core/src/schema/partitions.rs index c2db1903fa..bc69d019fd 100644 --- a/crates/deltalake-core/src/schema/partitions.rs +++ b/crates/deltalake-core/src/schema/partitions.rs @@ -7,6 +7,9 @@ use crate::kernel::{DataType, PrimitiveType}; use std::cmp::Ordering; use std::collections::HashMap; +/// A special value used in Hive to represent the null partition in partitioned tables +pub const NULL_PARTITION_VALUE_DATA_PATH: &str = "__HIVE_DEFAULT_PARTITION__"; + /// A Enum used for selecting the partition value operation when filtering a DeltaTable partition. 
#[derive(Clone, Debug, PartialEq, Eq)] pub enum PartitionValue { diff --git a/crates/deltalake-core/src/writer/utils.rs b/crates/deltalake-core/src/writer/utils.rs index 40a7d39bbf..49c3c6bfee 100644 --- a/crates/deltalake-core/src/writer/utils.rs +++ b/crates/deltalake-core/src/writer/utils.rs @@ -27,8 +27,8 @@ use uuid::Uuid; use crate::errors::DeltaResult; use crate::writer::DeltaWriterError; +use crate::NULL_PARTITION_VALUE_DATA_PATH; -const NULL_PARTITION_VALUE_DATA_PATH: &str = "__HIVE_DEFAULT_PARTITION__"; const PARTITION_DATE_FORMAT: &str = "%Y-%m-%d"; const PARTITION_DATETIME_FORMAT: &str = "%Y-%m-%d %H:%M:%S"; From 2b913b37e71ed96212dcec8c3fc8e865754ced82 Mon Sep 17 00:00:00 2001 From: Robert Pack Date: Sun, 12 Nov 2023 10:37:13 +0100 Subject: [PATCH 03/23] ci: adopt ruff format for formatting --- python/Makefile | 8 ++++---- python/pyproject.toml | 3 +-- python/tests/test_table_read.py | 4 +--- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/python/Makefile b/python/Makefile index 21cabd91a3..5461f70d9e 100644 --- a/python/Makefile +++ b/python/Makefile @@ -53,8 +53,8 @@ format: ## Format the code $(info --- Rust format ---) cargo fmt $(info --- Python format ---) - black . ruff . --fix + ruff format . .PHONY: check-rust check-rust: ## Run check on Rust @@ -65,9 +65,9 @@ check-rust: ## Run check on Rust .PHONY: check-python check-python: ## Run check on Python - $(info Check Python black) - black --check --diff . - $(info Check Python ruff) + $(info Check Python format) + ruff format --check --diff . + $(info Check Python linting) ruff check . $(info Check Python mypy) mypy diff --git a/python/pyproject.toml b/python/pyproject.toml index 438a49cc56..cc525fef50 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -27,8 +27,7 @@ pandas = [ ] devel = [ "mypy", - "black", - "ruff", + "ruff>=0.1.5", "packaging>=20", "pytest", "pytest-mock", diff --git a/python/tests/test_table_read.py b/python/tests/test_table_read.py index 5f9faae2bd..a49374e710 100644 --- a/python/tests/test_table_read.py +++ b/python/tests/test_table_read.py @@ -118,9 +118,7 @@ def test_read_simple_table_file_sizes_failure(): x: [-1 for item in x] if x == "size_bytes" else y for x, y in add_actions.items() } - dt.get_add_actions = lambda: SimpleNamespace( - to_pydict=lambda: add_actions_modified - ) # type:ignore + dt.get_add_actions = lambda: SimpleNamespace(to_pydict=lambda: add_actions_modified) # type:ignore with pytest.raises(OSError, match="Cannot seek past end of file."): dt.to_pyarrow_dataset().to_table().to_pydict() From f87ca3aca9917481d9a46bb5b39c435644b58310 Mon Sep 17 00:00:00 2001 From: Thomas Peiselt Date: Mon, 13 Nov 2023 21:16:53 +0100 Subject: [PATCH 04/23] fix: compile error with lifetime issues on optimize (#1843) --- .../deltalake-core/src/operations/optimize.rs | 49 ++++++++++--------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/crates/deltalake-core/src/operations/optimize.rs b/crates/deltalake-core/src/operations/optimize.rs index 0467d43a8b..d86ef6f3e5 100644 --- a/crates/deltalake-core/src/operations/optimize.rs +++ b/crates/deltalake-core/src/operations/optimize.rs @@ -432,7 +432,7 @@ impl MergePlan { Some(task_parameters.input_parameters.target_size as usize), None, )?; - let mut writer = PartitionWriter::try_with_config(object_store.clone(), writer_config)?; + let mut writer = PartitionWriter::try_with_config(object_store, writer_config)?; let mut read_stream = read_stream.await?; @@ -478,19 +478,7 @@ impl MergePlan { let object_store_ref = 
context.object_store.clone(); // Read all batches into a vec - let batches: Vec = futures::stream::iter(files.clone()) - .then(|file| { - let object_store_ref = object_store_ref.clone(); - async move { - let file_reader = ParquetObjectReader::new(object_store_ref.clone(), file); - ParquetRecordBatchStreamBuilder::new(file_reader) - .await? - .build() - } - }) - .try_flatten() - .try_collect::>() - .await?; + let batches = zorder::collect_batches(object_store_ref, files).await?; // For each batch, compute the zorder key let zorder_keys: Vec = @@ -608,7 +596,7 @@ impl MergePlan { for file in files.iter() { debug!(" file {}", file.location); } - let object_store_ref = log_store.object_store().clone(); + let object_store_ref = log_store.object_store(); let batch_stream = futures::stream::iter(files.clone()) .then(move |file| { let object_store_ref = object_store_ref.clone(); @@ -636,33 +624,30 @@ impl MergePlan { #[cfg(not(feature = "datafusion"))] let exec_context = Arc::new(zorder::ZOrderExecContext::new( zorder_columns, - log_store.object_store().clone(), + log_store.object_store(), // If there aren't enough bins to use all threads, then instead // use threads within the bins. This is important for the case where // the table is un-partitioned, in which case the entire table is just // one big bin. bins.len() <= num_cpus::get(), )); - let object_store = log_store.object_store().clone(); #[cfg(feature = "datafusion")] let exec_context = Arc::new(zorder::ZOrderExecContext::new( zorder_columns, - object_store.clone(), + log_store.object_store(), max_spill_size, )?); let task_parameters = self.task_parameters.clone(); + let log_store = log_store.clone(); futures::stream::iter(bins) .map(move |(partition, files)| { let batch_stream = Self::read_zorder(files.clone(), exec_context.clone()); - - let object_store = object_store.clone(); - let rewrite_result = tokio::task::spawn(Self::rewrite_files( task_parameters.clone(), partition, files, - object_store, + log_store.object_store(), batch_stream, )); util::flatten_join_error(rewrite_result) @@ -1107,6 +1092,26 @@ pub(super) mod zorder { } } + /// Read all batches into a vec - is an async function in disguise + #[cfg(not(feature = "datafusion"))] + pub(super) fn collect_batches( + object_store: ObjectStoreRef, + files: MergeBin, + ) -> impl Future, ParquetError>> { + futures::stream::iter(files.clone()) + .then(move |file| { + let object_store = object_store.clone(); + async move { + let file_reader = ParquetObjectReader::new(object_store.clone(), file); + ParquetRecordBatchStreamBuilder::new(file_reader) + .await? 
+ .build() + } + }) + .try_flatten() + .try_collect::>() + } + #[cfg(feature = "datafusion")] pub use self::datafusion::ZOrderExecContext; From 56a97289201dc05ca2a1a94953b82efdc6bb3b5f Mon Sep 17 00:00:00 2001 From: Nikolay Ulmasov Date: Mon, 13 Nov 2023 18:05:13 +0000 Subject: [PATCH 05/23] allow issue autoassign on take Signed-off-by: Nikolay Ulmasov --- .github/workflows/issue_comments.yml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 .github/workflows/issue_comments.yml diff --git a/.github/workflows/issue_comments.yml b/.github/workflows/issue_comments.yml new file mode 100644 index 0000000000..c9651cab38 --- /dev/null +++ b/.github/workflows/issue_comments.yml @@ -0,0 +1,25 @@ +name: Comment Commands +on: + issue_comment: + types: created + +permissions: + issues: write + +jobs: + issue_assign: + runs-on: ubuntu-latest + if: (!github.event.issue.pull_request) && github.event.comment.body == 'take' + concurrency: + # Only run one a time per user + group: ${{ github.actor }}-issue-assign + steps: + - run: | + CODE=$(curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -LI https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees/${{ github.event.comment.user.login }} -o /dev/null -w '%{http_code}\n' -s) + if [ "$CODE" -eq "204" ] + then + echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}" + curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees + else + echo "Issue ${{ github.event.issue.number }} cannot be assigned to ${{ github.event.comment.user.login }}" + fi \ No newline at end of file From 1a2c95e1878652857c3ac9fcaf04ddd98ab57973 Mon Sep 17 00:00:00 2001 From: "R. Tyler Croy" Date: Mon, 13 Nov 2023 16:35:13 -0800 Subject: [PATCH 06/23] chore: add @ion-elgreco to python/ --- .github/CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index beb7bbb591..b99809d1f6 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,7 +1,7 @@ crates/ @wjones127 @roeap @rtyler delta-inspect/ @wjones127 @rtyler proofs/ @houqp -python/ @wjones127 @fvaleye @roeap +python/ @wjones127 @fvaleye @roeap @ion-elgreco tlaplus/ @houqp .github/ @wjones127 @rtyler docs/ @MrPowers From cdf52df80f674b075e2608618535e47b4aef8270 Mon Sep 17 00:00:00 2001 From: "R. Tyler Croy" Date: Tue, 14 Nov 2023 12:20:50 -0800 Subject: [PATCH 07/23] Prevent writing checkpoints with a version that does not exist in table state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I have seen this in a production environment where the same writer is issuing append transactions using the operations API, which returns the newly created version, such as 10. If the caller then attempts to create a checkpoint for version 10, the operation will produce an inconsistency in the `_last_checkpoint` file, if the callers in-memory table state has *not* been reloaded since the append operation completed. In this scenario the _delta_log/ directory may contain: . 
├── 00000000000000000000.json ├── 00000000000000000001.json ├── 00000000000000000002.json ├── 00000000000000000003.json ├── 00000000000000000004.json ├── 00000000000000000005.json ├── 00000000000000000006.json ├── 00000000000000000007.json ├── 00000000000000000008.json ├── 00000000000000000009.json ├── 00000000000000000010.checkpoint.parquet ├── 00000000000000000010.json └── _last_checkpoint While `_last_checkpoint` contains the following: {"num_of_add_files":null,"parts":null,"size":342,"size_in_bytes":95104,"version":9} This will result in an error on any attempts to read the Delta table: >>> from deltalake import DeltaTable >>> dt = DeltaTable('.') [2023-11-14T18:05:59Z DEBUG deltalake_core::protocol] loading checkpoint from _delta_log/_last_checkpoint [2023-11-14T18:05:59Z DEBUG deltalake_core::table] update with latest checkpoint CheckPoint { version: 9, size: 342, parts: None, size_in_bytes: Some(95104), num_of_add_files: None } Traceback (most recent call last): File "", line 1, in File "/home/tyler/venv/lib64/python3.11/site-packages/deltalake/table.py", line 253, in __init__ self._table = RawDeltaTable( ^^^^^^^^^^^^^^ FileNotFoundError: Object at location /home/tyler/corrupted-table/_delta_log/00000000000000000009.checkpoint.parquet not found: No such file or directory (os error 2) >>> To prevent this error condition, the create_checkpoint_for() function should ensure that the provided checkpoint version (used to write the `.checkpoint.parquet` file) matches the table state's version (used to write the `_last_checkpoint` file). This has the added benefit of helping prevent the caller from passing in a nonsensical version number that could also lead to a broken table. Sponsored-by: Scribd Inc --- .../src/protocol/checkpoints.rs | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/crates/deltalake-core/src/protocol/checkpoints.rs b/crates/deltalake-core/src/protocol/checkpoints.rs index f48fbfbd76..a4cc1b66c7 100644 --- a/crates/deltalake-core/src/protocol/checkpoints.rs +++ b/crates/deltalake-core/src/protocol/checkpoints.rs @@ -38,6 +38,10 @@ enum CheckpointError { #[error("Partition value {0} cannot be parsed from string.")] PartitionValueNotParseable(String), + /// Caller attempt to create a checkpoint for a version which does not exist on the table state + #[error("Attempted to create a checkpoint for a version {0} that does not match the table state {1}")] + StaleTableVersion(i64, i64), + /// Error returned when the parquet writer fails while writing the checkpoint. #[error("Failed to write parquet: {}", .source)] Parquet { @@ -60,6 +64,7 @@ impl From for ProtocolError { match value { CheckpointError::PartitionValueNotParseable(_) => Self::InvalidField(value.to_string()), CheckpointError::Arrow { source } => Self::Arrow { source }, + CheckpointError::StaleTableVersion(..) => Self::Generic(value.to_string()), CheckpointError::Parquet { source } => Self::ParquetParseError { source }, } } @@ -117,6 +122,14 @@ pub async fn create_checkpoint_for( state: &DeltaTableState, log_store: &dyn LogStore, ) -> Result<(), ProtocolError> { + if version != state.version() { + error!( + "create_checkpoint_for called with version {version} but table state contains: {}. The table state may need to be reloaded", + state.version() + ); + return Err(CheckpointError::StaleTableVersion(version, state.version()).into()); + } + // TODO: checkpoints _can_ be multi-part... 
haven't actually found a good reference for // an appropriate split point yet though so only writing a single part currently. // See https://github.com/delta-io/delta-rs/issues/288 @@ -486,6 +499,72 @@ mod tests { use lazy_static::lazy_static; use serde_json::json; + use crate::operations::DeltaOps; + use crate::writer::test_utils::get_delta_schema; + use object_store::path::Path; + + #[tokio::test] + async fn test_create_checkpoint_for() { + let table_schema = get_delta_schema(); + + let table = DeltaOps::new_in_memory() + .create() + .with_columns(table_schema.fields().clone()) + .with_save_mode(crate::protocol::SaveMode::Ignore) + .await + .unwrap(); + assert_eq!(table.version(), 0); + assert_eq!(table.get_metadata().unwrap().schema, table_schema); + let res = create_checkpoint_for(0, table.get_state(), table.log_store.as_ref()).await; + assert!(res.is_ok()); + + // Look at the "files" and verify that the _last_checkpoint has the right version + let path = Path::from("_delta_log/_last_checkpoint"); + let last_checkpoint = table + .object_store() + .get(&path) + .await + .expect("Failed to get the _last_checkpoint") + .bytes() + .await + .expect("Failed to get bytes for _last_checkpoint"); + let last_checkpoint: CheckPoint = serde_json::from_slice(&last_checkpoint).expect("Fail"); + assert_eq!(last_checkpoint.version, 0); + } + + #[tokio::test] + async fn test_create_checkpoint_for_invalid_version() { + let table_schema = get_delta_schema(); + + let table = DeltaOps::new_in_memory() + .create() + .with_columns(table_schema.fields().clone()) + .with_save_mode(crate::protocol::SaveMode::Ignore) + .await + .unwrap(); + assert_eq!(table.version(), 0); + assert_eq!(table.get_metadata().unwrap().schema, table_schema); + match create_checkpoint_for(1, table.get_state(), table.log_store.as_ref()).await { + Ok(_) => { + /* + * If a checkpoint is allowed to be created here, it will use the passed in + * version, but _last_checkpoint is generated from the table state will point to a + * version 0 checkpoint. + * E.g. + * + * Path { raw: "_delta_log/00000000000000000000.json" } + * Path { raw: "_delta_log/00000000000000000001.checkpoint.parquet" } + * Path { raw: "_delta_log/_last_checkpoint" } + * + */ + panic!( + "We should not allow creating a checkpoint for a version which doesn't exist!" 
+ ); + } + Err(_) => { /* We should expect an error in the "right" case */ } + } + } + #[test] fn typed_partition_value_from_string_test() { let string_value: Value = "Hello World!".into(); From d47b4904139e69c63097a4b5ef830e281ed8cf35 Mon Sep 17 00:00:00 2001 From: Robert Pack Date: Tue, 14 Nov 2023 08:51:15 +0100 Subject: [PATCH 08/23] feat: drop python 3.7 and adopt 3.12 --- .github/workflows/python_build.yml | 25 ++++++--------- README.md | 50 +++++++++++++++--------------- python/Cargo.toml | 5 +-- python/pyproject.toml | 13 +++++--- 4 files changed, 43 insertions(+), 50 deletions(-) diff --git a/.github/workflows/python_build.yml b/.github/workflows/python_build.yml index 2e6d3bf782..223c13f531 100644 --- a/.github/workflows/python_build.yml +++ b/.github/workflows/python_build.yml @@ -18,7 +18,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v2 with: - python-version: 3.7 + python-version: 3.8 - name: Check Python run: | @@ -36,20 +36,19 @@ jobs: run: make check-rust test-minimal: - name: Python Build (Python 3.7 PyArrow 8.0.0) + name: Python Build (Python 3.8 PyArrow 8.0.0) runs-on: ubuntu-latest env: RUSTFLAGS: "-C debuginfo=line-tables-only" CARGO_INCREMENTAL: 0 - # use the same environment we have for python release - container: quay.io/pypa/manylinux2014_x86_64:2022-09-24-4f086d0 steps: - # actions/checkout@v3 is a node action, which runs on a fairly new - # version of node. however, manylinux environment's glibc is too old for - # that version of the node. so we will have to use v1 instead, which is a - # docker based action. - - uses: actions/checkout@v1 + - uses: actions/checkout@v3 + + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: 3.8 - name: Install latest nightly uses: actions-rs/toolchain@v1 @@ -60,14 +59,8 @@ jobs: - uses: Swatinem/rust-cache@v2 - - name: Enable manylinux Python targets - run: | - echo "/opt/python/cp37-cp37m/bin" >> $GITHUB_PATH - - name: Build and install deltalake run: | - # Needed for openssl build - yum install -y perl-IPC-Cmd pip install virtualenv virtualenv venv source venv/bin/activate @@ -238,7 +231,7 @@ jobs: strategy: matrix: - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v3 diff --git a/README.md b/README.md index a22ba8a295..9aa98823c6 100644 --- a/README.md +++ b/README.md @@ -41,10 +41,10 @@ The Delta Lake project aims to unlock the power of the Deltalake for as many use by providing native low-level APIs aimed at developers and integrators, as well as a high-level operations API that lets you query, inspect, and operate your Delta Lake with ease. 
-| Source | Downloads | Installation Command | Docs | -| --------------------- | --------------------------------- | ----------------------- | --------------- | -| **[PyPi][pypi]** | [![Downloads][pypi-dl]][pypi] | `pip install deltalake` | [Docs][py-docs] | -| **[Crates.io][pypi]** | [![Downloads][crates-dl]][crates] | `cargo add deltalake` | [Docs][rs-docs] | +| Source | Downloads | Installation Command | Docs | +| ----------------------- | --------------------------------- | ----------------------- | --------------- | +| **[PyPi][pypi]** | [![Downloads][pypi-dl]][pypi] | `pip install deltalake` | [Docs][py-docs] | +| **[Crates.io][crates]** | [![Downloads][crates-dl]][crates] | `cargo add deltalake` | [Docs][rs-docs] | [pypi]: https://pypi.org/project/deltalake/ [pypi-dl]: https://img.shields.io/pypi/dm/deltalake?style=flat-square&color=00ADD4 @@ -130,36 +130,36 @@ of features outlined in the Delta [protocol][protocol] is also [tracked](#protoc ### Cloud Integrations -| Storage | Rust | Python | Comment | -| -------------------- | :-------------------: | :-------------------: | ----------------------------------- | -| Local | ![done] | ![done] | | -| S3 - AWS | ![done] | ![done] | requires lock for concurrent writes | -| S3 - MinIO | ![done] | ![done] | requires lock for concurrent writes | -| S3 - R2 | ![done] | ![done] | requires lock for concurrent writes | -| Azure Blob | ![done] | ![done] | | -| Azure ADLS Gen2 | ![done] | ![done] | | -| Microsoft OneLake | ![done] | ![done] | | -| Google Cloud Storage | ![done] | ![done] | | +| Storage | Rust | Python | Comment | +| -------------------- | :-----: | :-----: | ----------------------------------- | +| Local | ![done] | ![done] | | +| S3 - AWS | ![done] | ![done] | requires lock for concurrent writes | +| S3 - MinIO | ![done] | ![done] | requires lock for concurrent writes | +| S3 - R2 | ![done] | ![done] | requires lock for concurrent writes | +| Azure Blob | ![done] | ![done] | | +| Azure ADLS Gen2 | ![done] | ![done] | | +| Microsoft OneLake | ![done] | ![done] | | +| Google Cloud Storage | ![done] | ![done] | | ### Supported Operations -| Operation | Rust | Python | Description | -| --------------------- | :----------------------: | :-----------------: | ------------------------------------------- | -| Create | ![done] | ![done] | Create a new table | -| Read | ![done] | ![done] | Read data from a table | -| Vacuum | ![done] | ![done] | Remove unused files and log entries | -| Delete - partitions | | ![done] | Delete a table partition | -| Delete - predicates | ![done] | ![done] | Delete data based on a predicate | -| Optimize - compaction | ![done] | ![done] | Harmonize the size of data file | -| Optimize - Z-order | ![done] | ![done] | Place similar data into the same file | +| Operation | Rust | Python | Description | +| --------------------- | :----------------------: | :----------------------: | ------------------------------------------- | +| Create | ![done] | ![done] | Create a new table | +| Read | ![done] | ![done] | Read data from a table | +| Vacuum | ![done] | ![done] | Remove unused files and log entries | +| Delete - partitions | | ![done] | Delete a table partition | +| Delete - predicates | ![done] | ![done] | Delete data based on a predicate | +| Optimize - compaction | ![done] | ![done] | Harmonize the size of data file | +| Optimize - Z-order | ![done] | ![done] | Place similar data into the same file | | Merge | [![semi-done]][merge-rs] | [![semi-done]][merge-py] | Merge two tables (limited to full re-write) | 
-| FS check | ![done] | ![done] | Remove corrupted files from table | +| FS check | ![done] | ![done] | Remove corrupted files from table | ### Protocol Support Level | Writer Version | Requirement | Status | | -------------- | --------------------------------------------- | :------------------: | -| Version 2 | Append Only Tables | ![done] +| Version 2 | Append Only Tables | ![done] | | Version 2 | Column Invariants | ![done] | | Version 3 | Enforce `delta.checkpoint.writeStatsAsJson` | [![open]][writer-rs] | | Version 3 | Enforce `delta.checkpoint.writeStatsAsStruct` | [![open]][writer-rs] | diff --git a/python/Cargo.toml b/python/Cargo.toml index 12e64d8fef..2c85fcb262 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -14,9 +14,6 @@ name = "deltalake" crate-type = ["cdylib"] doc = false -[package.metadata.maturin] -name = "deltalake._internal" - [dependencies] # arrow arrow-schema = { workspace = true, features = ["serde"] } @@ -43,7 +40,7 @@ reqwest = { version = "*", features = ["native-tls-vendored"] } [dependencies.pyo3] version = "0.19" -features = ["extension-module", "abi3", "abi3-py37"] +features = ["extension-module", "abi3", "abi3-py38"] [dependencies.deltalake] path = "../crates/deltalake" diff --git a/python/pyproject.toml b/python/pyproject.toml index cc525fef50..09a7e4b37c 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["maturin>=0.14,<0.15"] +requires = ["maturin>=1,<2"] build-backend = "maturin" [project] @@ -7,18 +7,18 @@ name = "deltalake" description = "Native Delta Lake Python binding based on delta-rs with Pandas integration" readme = "README.md" license = {file = "LICENSE.txt"} -requires-python = ">=3.7" +requires-python = ">=3.8" keywords = ["deltalake", "delta", "datalake", "pandas", "arrow"] classifiers = [ "License :: OSI Approved :: Apache Software License", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11" + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12" ] dependencies = [ - "pyarrow>=8", - 'typing-extensions;python_version<"3.8"', + "pyarrow>=8" ] [project.optional-dependencies] @@ -49,6 +49,9 @@ pyspark = [ documentation = "https://delta-io.github.io/delta-rs/" repository = "https://github.com/delta-io/delta-rs/tree/main/python/" +[tool.maturin] +module-name = "deltalake._internal" + [tool.mypy] files = "deltalake/*.py" exclude = "^tests" From 18ad754dcfe638b723b09d10dc706cde7149c893 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 15 Nov 2023 09:13:25 -0800 Subject: [PATCH 09/23] docs: tell how to claim an issue --- CONTRIBUTING.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 68e517785d..ee3d6c29d7 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -7,3 +7,7 @@ Please take note of our [code of conduct](CODE_OF_CONDUCT.md). If you want to start contributing, first look at our good first issues: https://github.com/delta-io/delta-rs/contribute If you want to contribute something more substantial, see our "Projects seeking contributors" section on our roadmap: https://github.com/delta-io/delta-rs/issues/1128 + +## Claiming an issue + +If you want to claim an issue to work on, you can write the word `take` as a comment in it and you will be automatically assigned. 
From 2340585924043cfe8ac2a705d5578515145cbcc9 Mon Sep 17 00:00:00 2001 From: Nikolay Ulmasov Date: Wed, 8 Nov 2023 19:09:02 +0000 Subject: [PATCH 10/23] implement issue 1768 Signed-off-by: Nikolay Ulmasov --- python/deltalake/_internal.pyi | 1 + python/deltalake/table.py | 7 +++++++ python/src/lib.rs | 10 ++++++++- python/tests/test_checkpoint.py | 36 +++++++++++++++++++++++++++++++++ 4 files changed, 53 insertions(+), 1 deletion(-) diff --git a/python/deltalake/_internal.pyi b/python/deltalake/_internal.pyi index 85887aeff5..4662f52f2f 100644 --- a/python/deltalake/_internal.pyi +++ b/python/deltalake/_internal.pyi @@ -126,6 +126,7 @@ class RawDeltaTable: schema: pyarrow.Schema, partitions_filters: Optional[FilterType], ) -> None: ... + def cleanup_metadata(self) -> None: ... def rust_core_version() -> str: ... def write_new_deltalake( diff --git a/python/deltalake/table.py b/python/deltalake/table.py index ad82a010fd..98739a8b55 100644 --- a/python/deltalake/table.py +++ b/python/deltalake/table.py @@ -799,6 +799,13 @@ def update_incremental(self) -> None: def create_checkpoint(self) -> None: self._table.create_checkpoint() + def cleanup_metadata(self) -> None: + """ + Delete expired log files before current version from table. The table log retention is based on + the `configuration.logRetentionDuration` value. + """ + self._table.cleanup_metadata() + def __stringify_partition_values( self, partition_filters: Optional[List[Tuple[str, str, Any]]] ) -> Optional[List[Tuple[str, str, Union[str, List[str]]]]]: diff --git a/python/src/lib.rs b/python/src/lib.rs index a58d1f9aa6..5ee72f72d0 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -20,7 +20,7 @@ use deltalake::arrow::ffi_stream::ArrowArrayStreamReader; use deltalake::arrow::record_batch::RecordBatch; use deltalake::arrow::record_batch::RecordBatchReader; use deltalake::arrow::{self, datatypes::Schema as ArrowSchema}; -use deltalake::checkpoints::create_checkpoint; +use deltalake::checkpoints::{cleanup_metadata, create_checkpoint}; use deltalake::datafusion::datasource::memory::MemTable; use deltalake::datafusion::datasource::provider::TableProvider; use deltalake::datafusion::prelude::SessionContext; @@ -854,6 +854,14 @@ impl RawDeltaTable { Ok(()) } + pub fn cleanup_metadata(&self) -> PyResult<()> { + rt()? 
+ .block_on(cleanup_metadata(&self._table)) + .map_err(PythonError::from)?; + + Ok(()) + } + pub fn get_add_actions(&self, flatten: bool) -> PyResult> { Ok(PyArrowType( self._table diff --git a/python/tests/test_checkpoint.py b/python/tests/test_checkpoint.py index 0e9f090f8a..01c958034e 100644 --- a/python/tests/test_checkpoint.py +++ b/python/tests/test_checkpoint.py @@ -1,3 +1,5 @@ +import datetime as dt +import os import pathlib import pyarrow as pa @@ -23,3 +25,37 @@ def test_checkpoint(tmp_path: pathlib.Path, sample_data: pa.Table): assert last_checkpoint_path.exists() assert checkpoint_path.exists() + + +def test_cleanup_metadata(tmp_path: pathlib.Path, sample_data: pa.Table): + tmp_table_path = tmp_path / "path" / "to" / "table" + first_log_path = tmp_table_path / "_delta_log" / "00000000000000000000.json" + second_log_path = tmp_table_path / "_delta_log" / "00000000000000000001.json" + third_log_path = tmp_table_path / "_delta_log" / "00000000000000000002.json" + + # TODO: Include binary after fixing issue "Json error: binary type is not supported" + sample_data = sample_data.drop(["binary"]) + + # Create few log files + write_deltalake(str(tmp_table_path), sample_data) + write_deltalake(str(tmp_table_path), sample_data, mode="overwrite") + delta_table = DeltaTable(str(tmp_table_path)) + delta_table.delete() + + # Move first log entry timestamp back in time for more than 30 days + old_ts = (dt.datetime.now() - dt.timedelta(days=31)).timestamp() + os.utime(first_log_path, (old_ts, old_ts)) + + # Move second log entry timestamp back in time for a minute + near_ts = (dt.datetime.now() - dt.timedelta(minutes=1)).timestamp() + os.utime(second_log_path, (near_ts, near_ts)) + + assert first_log_path.exists() + assert second_log_path.exists() + assert third_log_path.exists() + + delta_table.cleanup_metadata() + + assert not first_log_path.exists() + assert second_log_path.exists() + assert third_log_path.exists() From 5c324ccd0a8738251a9aea9452d7323400e5e8c6 Mon Sep 17 00:00:00 2001 From: Nikolay Ulmasov Date: Fri, 10 Nov 2023 08:36:34 +0000 Subject: [PATCH 11/23] add default retention value to docs Signed-off-by: Nikolay Ulmasov --- python/deltalake/table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/deltalake/table.py b/python/deltalake/table.py index 98739a8b55..4ef07e5676 100644 --- a/python/deltalake/table.py +++ b/python/deltalake/table.py @@ -802,7 +802,7 @@ def create_checkpoint(self) -> None: def cleanup_metadata(self) -> None: """ Delete expired log files before current version from table. The table log retention is based on - the `configuration.logRetentionDuration` value. + the `configuration.logRetentionDuration` value, 30 days by default. 
""" self._table.cleanup_metadata() From 691ed8b3b1500e4e23fd076c9d2c1d3195e6a653 Mon Sep 17 00:00:00 2001 From: PierreDubrulle Date: Sun, 12 Nov 2023 18:37:28 +0100 Subject: [PATCH 12/23] Skip serializing optional fields if they're null Fixes #1847 --- .../deltalake-core/src/operations/update.rs | 1 - crates/deltalake-core/src/table/mod.rs | 35 ++++++++++++++++++- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/crates/deltalake-core/src/operations/update.rs b/crates/deltalake-core/src/operations/update.rs index 559f28868d..9f51912579 100644 --- a/crates/deltalake-core/src/operations/update.rs +++ b/crates/deltalake-core/src/operations/update.rs @@ -737,7 +737,6 @@ mod tests { .with_update("value", col("value") + lit(1)) .await .unwrap(); - assert_eq!(table.version(), 1); assert_eq!(table.get_file_uris().count(), 1); assert_eq!(metrics.num_added_files, 1); diff --git a/crates/deltalake-core/src/table/mod.rs b/crates/deltalake-core/src/table/mod.rs index cd0f1808f5..33fdea2ad0 100644 --- a/crates/deltalake-core/src/table/mod.rs +++ b/crates/deltalake-core/src/table/mod.rs @@ -47,14 +47,18 @@ pub struct CheckPoint { pub(crate) version: i64, // 20 digits decimals /// The number of actions that are stored in the checkpoint. pub(crate) size: i64, + #[serde(skip_serializing_if = "Option::is_none")] /// The number of fragments if the last checkpoint was written in multiple parts. This field is optional. pub(crate) parts: Option, // 10 digits decimals + #[serde(skip_serializing_if = "Option::is_none")] /// The number of bytes of the checkpoint. This field is optional. pub(crate) size_in_bytes: Option, + #[serde(skip_serializing_if = "Option::is_none")] /// The number of AddFile actions in the checkpoint. This field is optional. pub(crate) num_of_add_files: Option, } +#[derive(Default)] /// Builder for CheckPoint pub struct CheckPointBuilder { /// Delta table version @@ -118,7 +122,7 @@ impl CheckPoint { Self { version, size, - parts, + parts: parts.or(None), size_in_bytes: None, num_of_add_files: None, } @@ -909,6 +913,35 @@ mod tests { drop(tmp_dir); } + #[tokio::test] + async fn checkpoint_without_added_files_and_no_parts() { + let (dt, tmp_dir) = create_test_table().await; + let check_point = CheckPointBuilder::new(0, 0).build(); + let checkpoint_data_paths = dt.get_checkpoint_data_paths(&check_point); + assert_eq!(checkpoint_data_paths.len(), 1); + assert_eq!( + serde_json::to_string(&check_point).unwrap(), + "{\"version\":0,\"size\":0}" + ); + drop(tmp_dir); + } + + #[tokio::test] + async fn checkpoint_with_added_files() { + let num_of_file_added: i64 = 4; + let (dt, tmp_dir) = create_test_table().await; + let check_point = CheckPointBuilder::new(0, 0) + .with_num_of_add_files(num_of_file_added) + .build(); + let checkpoint_data_paths = dt.get_checkpoint_data_paths(&check_point); + assert_eq!(checkpoint_data_paths.len(), 1); + assert_eq!( + serde_json::to_string(&check_point).unwrap(), + "{\"version\":0,\"size\":0,\"num_of_add_files\":4}" + ); + drop(tmp_dir); + } + #[cfg(any(feature = "s3", feature = "s3-native-tls"))] #[test] fn normalize_table_uri_s3() { From b5bb4b332c3a16d04cfb59c9fc7e63609e9b2d98 Mon Sep 17 00:00:00 2001 From: Robert Pack <42610831+roeap@users.noreply.github.com> Date: Thu, 16 Nov 2023 20:56:27 +0100 Subject: [PATCH 13/23] chore: clippy (#1871) # Description Just running `cargo clippy --fix` once to get main green again ... 
--- .../src/operations/transaction/conflict_checker.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/crates/deltalake-core/src/operations/transaction/conflict_checker.rs b/crates/deltalake-core/src/operations/transaction/conflict_checker.rs index 3a0bf0526d..6cefe848b8 100644 --- a/crates/deltalake-core/src/operations/transaction/conflict_checker.rs +++ b/crates/deltalake-core/src/operations/transaction/conflict_checker.rs @@ -522,9 +522,8 @@ impl<'a> ConflictChecker<'a> { .winning_commit_summary .removed_files() .iter() - // TODO remove cloned - .cloned() - .find(|f| read_file_path.contains(&f.path)); + .find(|&f| read_file_path.contains(&f.path)) + .cloned(); if deleted_read_overlap.is_some() || (!self.winning_commit_summary.removed_files().is_empty() && self.txn_info.read_whole_table()) From 44a3760aac9d68bfa6975d20ceba046d853514e9 Mon Sep 17 00:00:00 2001 From: Niko Date: Fri, 17 Nov 2023 02:19:47 +0000 Subject: [PATCH 14/23] fix: docs deployment action (#1869) # Description Moved the requirement for contents write permission from release action to docs deployment. The same permission was used in the previous release action (sphinx) so I'm not introducing any extra permission requirements. One thing I don't really like here is that such permission is only needed for the `Deploy` step but the lowest level it can be defined is job. This should be fine because other steps in the job don't do any git operations. Alternative would be creating a separate Deploy job but then I'd have to duplicate all the steps from the Build part which would effectively be the same thing as now but with a longer/redundant config. Deployment logic remains the same - docs are deployed only when the Build Documentation is run manually or via the Release Python action. # Related Issue(s) #1867 I am unable to fully test this in my environment because the failing version was actually working in my repo - I don't see how things are configured in the main repo. Signed-off-by: Nikolay Ulmasov Co-authored-by: Will Jones --- .github/workflows/docs.yml | 4 +++- .github/workflows/python_release.yml | 2 -- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 09740c4c9d..e96be656f6 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -35,13 +35,15 @@ jobs: with: src: docs/src/python - build: + build-deploy: needs: [ lint, markdown-link-check, ] runs-on: ubuntu-latest + permissions: + contents: write steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/python_release.yml b/.github/workflows/python_release.yml index 5dffdd651c..6bd0bacd84 100644 --- a/.github/workflows/python_release.yml +++ b/.github/workflows/python_release.yml @@ -128,8 +128,6 @@ jobs: release-pypi-windows, ] runs-on: ubuntu-latest - permissions: - contents: write steps: - name: Trigger the docs release event uses: peter-evans/repository-dispatch@v2 From 4d103a7ceea89d45a480bb4d58d9255d8b12ece3 Mon Sep 17 00:00:00 2001 From: David Blajda Date: Thu, 16 Nov 2023 22:14:44 -0500 Subject: [PATCH 15/23] chore: upgrade datafusion 33 (#1775) # Description I've had to live on the bleeding edge of datafusion since I've discovered multiple bugs while implementing merge enhancements. Creating this PR to contain changes necessary to use it. Datafusion has made significant changes with how table statistics are represented. I refactored and was able to trim a significant amount of code. 
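
As a rough illustration of the new representation (the schema and values below are made up for the example, not taken from a real table), DataFusion 33 wraps every statistic in a `Precision` enum, so per-file statistics can be merged with `add`/`min`/`max` and unknown values stay `Absent` instead of defaulting to zero:

```rust
use arrow_schema::{DataType, Field, Schema};
use datafusion_common::stats::Precision;
use datafusion_common::{ColumnStatistics, ScalarValue, Statistics};

fn main() {
    let schema = Schema::new(vec![Field::new("value", DataType::Int32, true)]);

    // Start from "everything unknown" and fill in what the add actions provide.
    let mut stats = Statistics::new_unknown(&schema);
    stats.num_rows = Precision::Exact(4);
    stats.total_byte_size = Precision::Exact(880);
    stats.column_statistics = vec![ColumnStatistics {
        null_count: Precision::Exact(0),
        max_value: Precision::Exact(ScalarValue::from(4_i32)),
        min_value: Precision::Exact(ScalarValue::from(0_i32)),
        distinct_count: Precision::Absent,
    }];

    // Merging with a file whose stats are missing downgrades the total to Absent,
    // rather than silently reporting an exact-but-wrong count.
    let unknown_rows: Precision<usize> = Precision::Absent;
    assert_eq!(stats.num_rows.add(&unknown_rows), Precision::Absent);
}
```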
There were some bugs with how we presented statistics for tables that do not contain column metadata (deleta-0.2.0) where we stated the number of records for a file is 0. ## Fixes The null-ability status for partition columns are now accurately captured by Datafusion. Before if a partition column contained a null value an error would be returned. This should be resolved now. --- Cargo.toml | 31 +- crates/deltalake-core/Cargo.toml | 2 +- .../src/delta_datafusion/expr.rs | 4 + .../src/delta_datafusion/mod.rs | 568 ++++++------------ crates/deltalake-core/src/operations/mod.rs | 4 +- .../deltalake-core/src/operations/optimize.rs | 4 +- .../src/operations/transaction/state.rs | 15 +- .../tests/integration_datafusion.rs | 156 +++-- crates/deltalake-sql/src/planner.rs | 5 + python/Cargo.toml | 2 +- 10 files changed, 348 insertions(+), 443 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 0b3862bd1f..a884ff5413 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,23 +19,24 @@ debug = "line-tables-only" [workspace.dependencies] # arrow -arrow = { version = "47" } -arrow-array = { version = "47" } -arrow-buffer = { version = "47" } -arrow-cast = { version = "47" } -arrow-ord = { version = "47" } -arrow-row = { version = "47" } -arrow-schema = { version = "47" } -arrow-select = { version = "47" } -parquet = { version = "47" } +arrow = { version = "48.0.1" } +arrow-array = { version = "48.0.1" } +arrow-buffer = { version = "48.0.1" } +arrow-cast = { version = "48.0.1" } +arrow-ord = { version = "48.0.1" } +arrow-row = { version = "48.0.1" } +arrow-schema = { version = "48.0.1" } +arrow-select = { version = "48.0.1" } +parquet = { version = "48.0.1" } # datafusion -datafusion = { version = "32" } -datafusion-expr = { version = "32" } -datafusion-common = { version = "32" } -datafusion-proto = { version = "32" } -datafusion-sql = { version = "32" } -datafusion-physical-expr = { version = "32" } +datafusion = { version = "33.0.0" } +datafusion-expr = { version = "33.0.0" } +datafusion-common = { version = "33.0.0" } +datafusion-proto = { version = "33.0.0" } +datafusion-sql = { version = "33.0.0" } +datafusion-physical-expr = { version = "33.0.0" } + # serde serde = { version = "1", features = ["derive"] } diff --git a/crates/deltalake-core/Cargo.toml b/crates/deltalake-core/Cargo.toml index 9fa259fa39..fc3e21c38c 100644 --- a/crates/deltalake-core/Cargo.toml +++ b/crates/deltalake-core/Cargo.toml @@ -111,7 +111,7 @@ reqwest = { version = "0.11.18", default-features = false, features = [ # Datafusion dashmap = { version = "5", optional = true } -sqlparser = { version = "0.38", optional = true } +sqlparser = { version = "0.39", optional = true } # NOTE dependencies only for integration tests fs_extra = { version = "1.3.0", optional = true } diff --git a/crates/deltalake-core/src/delta_datafusion/expr.rs b/crates/deltalake-core/src/delta_datafusion/expr.rs index e451484183..e0c284df92 100644 --- a/crates/deltalake-core/src/delta_datafusion/expr.rs +++ b/crates/deltalake-core/src/delta_datafusion/expr.rs @@ -70,6 +70,10 @@ impl<'a> ContextProvider for DeltaContextProvider<'a> { fn get_window_meta(&self, name: &str) -> Option> { self.state.window_functions().get(name).cloned() } + + fn get_table_source(&self, _name: TableReference) -> DFResult> { + unimplemented!() + } } /// Parse a string predicate into an `Expr` diff --git a/crates/deltalake-core/src/delta_datafusion/mod.rs b/crates/deltalake-core/src/delta_datafusion/mod.rs index 38bf135739..1410efbfbc 100644 --- 
a/crates/deltalake-core/src/delta_datafusion/mod.rs +++ b/crates/deltalake-core/src/delta_datafusion/mod.rs @@ -22,7 +22,6 @@ use std::any::Any; use std::collections::{HashMap, HashSet}; -use std::convert::TryFrom; use std::fmt::{self, Debug}; use std::sync::Arc; @@ -56,6 +55,7 @@ use datafusion::physical_plan::{ SendableRecordBatchStream, Statistics, }; use datafusion_common::scalar::ScalarValue; +use datafusion_common::stats::Precision; use datafusion_common::tree_node::{TreeNode, TreeNodeVisitor, VisitRecursion}; use datafusion_common::{Column, DataFusionError, Result as DataFusionResult, ToDFSchema}; use datafusion_expr::expr::{ScalarFunction, ScalarUDF}; @@ -65,6 +65,7 @@ use datafusion_physical_expr::execution_props::ExecutionProps; use datafusion_physical_expr::{create_physical_expr, PhysicalExpr}; use datafusion_proto::logical_plan::LogicalExtensionCodec; use datafusion_proto::physical_plan::PhysicalExtensionCodec; +use log::error; use object_store::ObjectMeta; use serde::{Deserialize, Serialize}; use url::Url; @@ -72,7 +73,7 @@ use url::Url; use crate::errors::{DeltaResult, DeltaTableError}; use crate::kernel::{Add, DataType as DeltaDataType, Invariant, PrimitiveType}; use crate::logstore::LogStoreRef; -use crate::protocol::{self}; +use crate::protocol::{ColumnCountStat, ColumnValueStat}; use crate::table::builder::ensure_table_uri; use crate::table::state::DeltaTableState; use crate::{open_table, open_table_with_storage_options, DeltaTable}; @@ -105,152 +106,128 @@ impl From for DeltaTableError { } } +fn get_scalar_value(value: Option<&ColumnValueStat>, field: &Arc) -> Precision { + match value { + Some(ColumnValueStat::Value(value)) => to_correct_scalar_value(value, field.data_type()) + .map(|maybe_scalar| maybe_scalar.map(Precision::Exact).unwrap_or_default()) + .unwrap_or_else(|_| { + error!( + "Unable to parse scalar value of {:?} with type {} for column {}", + value, + field.data_type(), + field.name() + ); + Precision::Absent + }), + _ => Precision::Absent, + } +} + impl DeltaTableState { - /// Return statistics for Datafusion Table - pub fn datafusion_table_statistics(&self) -> Statistics { - let stats = self - .files() - .iter() - .try_fold( - Statistics { - num_rows: Some(0), - total_byte_size: Some(0), - column_statistics: Some(vec![ - ColumnStatistics { - null_count: Some(0), - max_value: None, - min_value: None, - distinct_count: None + /// Provide table level statistics to Datafusion + pub fn datafusion_table_statistics(&self) -> DataFusionResult { + // Statistics only support primitive types. Any non primitive column will not have their statistics captured + // If column statistics are missing for any add actions then we simply downgrade to Absent. + + let schema = self.arrow_schema()?; + // Downgrade statistics to absent if file metadata is not present. 
+ let mut downgrade = false; + let unknown_stats = Statistics::new_unknown(&schema); + + let files = self.files(); + + // Initalize statistics + let mut table_stats = match files.get(0) { + Some(file) => match file.get_stats() { + Ok(Some(stats)) => { + let mut column_statistics = Vec::with_capacity(schema.fields().size()); + let total_byte_size = Precision::Exact(file.size as usize); + let num_rows = Precision::Exact(stats.num_records as usize); + + for field in schema.fields() { + let null_count = match stats.null_count.get(field.name()) { + Some(ColumnCountStat::Value(x)) => Precision::Exact(*x as usize), + _ => Precision::Absent, }; - self.schema().unwrap().fields().len() - ]), - is_exact: true, - }, - |acc, action| { - let new_stats = action - .get_stats() - .unwrap_or_else(|_| Some(protocol::Stats::default()))?; - Some(Statistics { - num_rows: acc - .num_rows - .map(|rows| rows + new_stats.num_records as usize), - total_byte_size: acc - .total_byte_size - .map(|total_size| total_size + action.size as usize), - column_statistics: acc.column_statistics.map(|col_stats| { - self.schema() - .unwrap() - .fields() - .iter() - .zip(col_stats) - .map(|(field, stats)| { - let null_count = new_stats - .null_count - .get(field.name()) - .and_then(|x| { - let null_count_acc = stats.null_count?; - let null_count = x.as_value()? as usize; - Some(null_count_acc + null_count) - }) - .or(stats.null_count); - - let max_value = new_stats - .max_values - .get(field.name()) - .and_then(|x| { - let old_stats = stats.clone(); - let max_value = to_scalar_value(x.as_value()?); - - match (max_value, old_stats.max_value) { - (Some(max_value), Some(old_max_value)) => { - if left_larger_than_right( - old_max_value.clone(), - max_value.clone(), - )? { - Some(old_max_value) - } else { - Some(max_value) - } - } - (Some(max_value), None) => Some(max_value), - (None, old) => old, - } - }) - .or_else(|| stats.max_value.clone()); - - let min_value = new_stats - .min_values - .get(field.name()) - .and_then(|x| { - let old_stats = stats.clone(); - let min_value = to_scalar_value(x.as_value()?); - - match (min_value, old_stats.min_value) { - (Some(min_value), Some(old_min_value)) => { - if left_larger_than_right( - min_value.clone(), - old_min_value.clone(), - )? { - Some(old_min_value) - } else { - Some(min_value) - } - } - (Some(min_value), None) => Some(min_value), - (None, old) => old, - } - }) - .or_else(|| stats.min_value.clone()); - - ColumnStatistics { - null_count, - max_value, - min_value, - distinct_count: None, // TODO: distinct - } - }) - .collect() - }), - is_exact: true, - }) - }, - ) - .unwrap_or_default(); - - // Convert column max/min scalar values to correct types based on arrow types. 
- Statistics { - is_exact: true, - num_rows: stats.num_rows, - total_byte_size: stats.total_byte_size, - column_statistics: stats.column_statistics.map(|col_stats| { - let fields = self.schema().unwrap().fields(); - col_stats - .iter() - .zip(fields) - .map(|(col_states, field)| { - let dt = self - .arrow_schema() - .unwrap() - .field_with_name(field.name()) - .unwrap() - .data_type() - .clone(); - ColumnStatistics { - null_count: col_states.null_count, - max_value: col_states - .max_value - .as_ref() - .and_then(|scalar| correct_scalar_value_type(scalar.clone(), &dt)), - min_value: col_states - .min_value - .as_ref() - .and_then(|scalar| correct_scalar_value_type(scalar.clone(), &dt)), - distinct_count: col_states.distinct_count, + + let max_value = get_scalar_value(stats.max_values.get(field.name()), field); + let min_value = get_scalar_value(stats.min_values.get(field.name()), field); + + column_statistics.push(ColumnStatistics { + null_count, + max_value, + min_value, + distinct_count: Precision::Absent, + }); + } + + Statistics { + total_byte_size, + num_rows, + column_statistics, + } + } + Ok(None) => { + downgrade = true; + let mut stats = unknown_stats.clone(); + stats.total_byte_size = Precision::Exact(file.size as usize); + stats + } + _ => return Ok(unknown_stats), + }, + None => { + // The Table is empty + let mut stats = unknown_stats; + stats.num_rows = Precision::Exact(0); + stats.total_byte_size = Precision::Exact(0); + return Ok(stats); + } + }; + + // Populate the remaining statistics. If file statistics are not present then relevant statistics are downgraded to absent. + for file in &files.as_slice()[1..] { + let byte_size = Precision::Exact(file.size as usize); + table_stats.total_byte_size = table_stats.total_byte_size.add(&byte_size); + + if !downgrade { + match file.get_stats() { + Ok(Some(stats)) => { + let num_records = Precision::Exact(stats.num_records as usize); + + table_stats.num_rows = table_stats.num_rows.add(&num_records); + + for (idx, field) in schema.fields().iter().enumerate() { + let column_stats = table_stats.column_statistics.get_mut(idx).unwrap(); + + let null_count = match stats.null_count.get(field.name()) { + Some(ColumnCountStat::Value(x)) => Precision::Exact(*x as usize), + _ => Precision::Absent, + }; + + let max_value = + get_scalar_value(stats.max_values.get(field.name()), field); + let min_value = + get_scalar_value(stats.min_values.get(field.name()), field); + + column_stats.null_count = column_stats.null_count.add(&null_count); + column_stats.max_value = column_stats.max_value.max(&max_value); + column_stats.min_value = column_stats.min_value.min(&min_value); } - }) - .collect() - }), + } + Ok(None) => { + downgrade = true; + } + Err(_) => return Ok(unknown_stats), + } + } + } + + if downgrade { + table_stats.column_statistics = unknown_stats.column_statistics; + table_stats.num_rows = Precision::Absent; } + + Ok(table_stats) } } @@ -276,9 +253,12 @@ fn get_prune_stats(table: &DeltaTable, column: &Column, get_max: bool) -> Option Some(v) => serde_json::Value::String(v.to_string()), None => serde_json::Value::Null, }; - to_correct_scalar_value(&value, &data_type).unwrap_or( - get_null_of_arrow_type(&data_type).expect("Could not determine null type"), - ) + to_correct_scalar_value(&value, &data_type) + .ok() + .flatten() + .unwrap_or( + get_null_of_arrow_type(&data_type).expect("Could not determine null type"), + ) } else if let Ok(Some(statistics)) = add.get_stats() { let values = if get_max { statistics.max_values @@ -288,7 +268,11 @@ fn 
get_prune_stats(table: &DeltaTable, column: &Column, get_max: bool) -> Option values .get(&column.name) - .and_then(|f| to_correct_scalar_value(f.as_value()?, &data_type)) + .and_then(|f| { + to_correct_scalar_value(f.as_value()?, &data_type) + .ok() + .flatten() + }) .unwrap_or( get_null_of_arrow_type(&data_type).expect("Could not determine null type"), ) @@ -618,16 +602,28 @@ impl<'a> DeltaScanBuilder<'a> { let mut table_partition_cols = table_partition_cols .iter() - .map(|c| Ok((c.to_owned(), schema.field_with_name(c)?.data_type().clone()))) + .map(|name| schema.field_with_name(name).map(|f| f.to_owned())) .collect::, ArrowError>>()?; if let Some(file_column_name) = &config.file_column_name { - table_partition_cols.push(( + table_partition_cols.push(Field::new( file_column_name.clone(), wrap_partition_type_in_dict(DataType::Utf8), + false, )); } + let stats = self + .snapshot + .datafusion_table_statistics() + .unwrap_or_else(|e| { + error!( + "Error while computing table statistics. Using unknown statistics. {}", + e + ); + Statistics::new_unknown(&schema) + }); + let scan = ParquetFormat::new() .create_physical_plan( self.state, @@ -635,7 +631,7 @@ impl<'a> DeltaScanBuilder<'a> { object_store_url: self.log_store.object_store_url(), file_schema, file_groups: file_groups.into_values().collect(), - statistics: self.snapshot.datafusion_table_statistics(), + statistics: stats, projection: self.projection.cloned(), limit: self.limit, table_partition_cols, @@ -705,7 +701,7 @@ impl TableProvider for DeltaTable { } fn statistics(&self) -> Option { - Some(self.state.datafusion_table_statistics()) + self.state.datafusion_table_statistics().ok() } } @@ -784,7 +780,7 @@ impl TableProvider for DeltaTableProvider { } fn statistics(&self) -> Option { - Some(self.snapshot.datafusion_table_statistics()) + self.snapshot.datafusion_table_statistics().ok() } } @@ -851,7 +847,7 @@ impl ExecutionPlan for DeltaScan { self.parquet_scan.execute(partition, context) } - fn statistics(&self) -> Statistics { + fn statistics(&self) -> DataFusionResult { self.parquet_scan.statistics() } } @@ -936,6 +932,7 @@ pub(crate) fn partitioned_file_from_action( &serde_json::Value::String(value.to_string()), field.data_type(), ) + .unwrap_or(Some(ScalarValue::Null)) .unwrap_or(ScalarValue::Null), None => get_null_of_arrow_type(field.data_type()) .unwrap_or(ScalarValue::Null), @@ -961,174 +958,55 @@ pub(crate) fn partitioned_file_from_action( } } -fn to_scalar_value(stat_val: &serde_json::Value) -> Option { - match stat_val { - serde_json::Value::Bool(val) => Some(ScalarValue::from(*val)), - serde_json::Value::Number(num) => { - if let Some(val) = num.as_i64() { - Some(ScalarValue::from(val)) - } else if let Some(val) = num.as_u64() { - Some(ScalarValue::from(val)) - } else { - num.as_f64().map(ScalarValue::from) - } - } - serde_json::Value::String(s) => Some(ScalarValue::from(s.as_str())), - serde_json::Value::Array(_) => None, - serde_json::Value::Object(_) => None, - serde_json::Value::Null => None, - } +fn parse_timestamp( + stat_val: &serde_json::Value, + field_dt: &ArrowDataType, +) -> DataFusionResult { + let string = match stat_val { + serde_json::Value::String(s) => s.to_owned(), + _ => stat_val.to_string(), + }; + + let time_micro = ScalarValue::try_from_string( + string, + &ArrowDataType::Timestamp(TimeUnit::Microsecond, None), + )?; + let cast_arr = cast_with_options( + &time_micro.to_array()?, + field_dt, + &CastOptions { + safe: false, + ..Default::default() + }, + )?; + ScalarValue::try_from_array(&cast_arr, 0) 
} pub(crate) fn to_correct_scalar_value( stat_val: &serde_json::Value, field_dt: &ArrowDataType, -) -> Option { +) -> DataFusionResult> { match stat_val { - serde_json::Value::Array(_) => None, - serde_json::Value::Object(_) => None, - serde_json::Value::Null => get_null_of_arrow_type(field_dt).ok(), + serde_json::Value::Array(_) => Ok(None), + serde_json::Value::Object(_) => Ok(None), + serde_json::Value::Null => Ok(Some(get_null_of_arrow_type(field_dt)?)), serde_json::Value::String(string_val) => match field_dt { - ArrowDataType::Timestamp(_, _) => { - let time_nanos = ScalarValue::try_from_string( - string_val.to_owned(), - &ArrowDataType::Timestamp(TimeUnit::Nanosecond, None), - ) - .ok()?; - let cast_arr = cast_with_options( - &time_nanos.to_array(), - field_dt, - &CastOptions { - safe: false, - ..Default::default() - }, - ) - .ok()?; - Some(ScalarValue::try_from_array(&cast_arr, 0).ok()?) - } - _ => Some(ScalarValue::try_from_string(string_val.to_owned(), field_dt).ok()?), + ArrowDataType::Timestamp(_, _) => Ok(Some(parse_timestamp(stat_val, field_dt)?)), + _ => Ok(Some(ScalarValue::try_from_string( + string_val.to_owned(), + field_dt, + )?)), }, other => match field_dt { - ArrowDataType::Timestamp(_, _) => { - let time_nanos = ScalarValue::try_from_string( - other.to_string(), - &ArrowDataType::Timestamp(TimeUnit::Nanosecond, None), - ) - .ok()?; - let cast_arr = cast_with_options( - &time_nanos.to_array(), - field_dt, - &CastOptions { - safe: false, - ..Default::default() - }, - ) - .ok()?; - Some(ScalarValue::try_from_array(&cast_arr, 0).ok()?) - } - _ => Some(ScalarValue::try_from_string(other.to_string(), field_dt).ok()?), + ArrowDataType::Timestamp(_, _) => Ok(Some(parse_timestamp(stat_val, field_dt)?)), + _ => Ok(Some(ScalarValue::try_from_string( + other.to_string(), + field_dt, + )?)), }, } } -fn correct_scalar_value_type(value: ScalarValue, field_dt: &ArrowDataType) -> Option { - match field_dt { - ArrowDataType::Int64 => { - let raw_value = i64::try_from(value).ok()?; - Some(ScalarValue::from(raw_value)) - } - ArrowDataType::Int32 => { - let raw_value = i64::try_from(value).ok()? as i32; - Some(ScalarValue::from(raw_value)) - } - ArrowDataType::Int16 => { - let raw_value = i64::try_from(value).ok()? as i16; - Some(ScalarValue::from(raw_value)) - } - ArrowDataType::Int8 => { - let raw_value = i64::try_from(value).ok()? as i8; - Some(ScalarValue::from(raw_value)) - } - ArrowDataType::Float32 => { - let raw_value = f64::try_from(value).ok()? as f32; - Some(ScalarValue::from(raw_value)) - } - ArrowDataType::Float64 => { - let raw_value = f64::try_from(value).ok()?; - Some(ScalarValue::from(raw_value)) - } - ArrowDataType::Utf8 => match value { - ScalarValue::Utf8(val) => Some(ScalarValue::Utf8(val)), - _ => None, - }, - ArrowDataType::LargeUtf8 => match value { - ScalarValue::Utf8(val) => Some(ScalarValue::LargeUtf8(val)), - _ => None, - }, - ArrowDataType::Boolean => { - let raw_value = bool::try_from(value).ok()?; - Some(ScalarValue::from(raw_value)) - } - ArrowDataType::Decimal128(_, _) => { - let raw_value = f64::try_from(value).ok()?; - Some(ScalarValue::from(raw_value)) - } - ArrowDataType::Decimal256(_, _) => { - let raw_value = f64::try_from(value).ok()?; - Some(ScalarValue::from(raw_value)) - } - ArrowDataType::Date32 => { - let raw_value = i64::try_from(value).ok()? 
as i32; - Some(ScalarValue::Date32(Some(raw_value))) - } - ArrowDataType::Date64 => { - let raw_value = i64::try_from(value).ok()?; - Some(ScalarValue::Date64(Some(raw_value))) - } - ArrowDataType::Timestamp(TimeUnit::Nanosecond, None) => { - let raw_value = i64::try_from(value).ok()?; - Some(ScalarValue::TimestampNanosecond(Some(raw_value), None)) - } - ArrowDataType::Timestamp(TimeUnit::Microsecond, None) => { - let raw_value = i64::try_from(value).ok()?; - Some(ScalarValue::TimestampMicrosecond(Some(raw_value), None)) - } - ArrowDataType::Timestamp(TimeUnit::Millisecond, None) => { - let raw_value = i64::try_from(value).ok()?; - Some(ScalarValue::TimestampMillisecond(Some(raw_value), None)) - } - _ => { - log::error!( - "Scalar value of arrow type unimplemented for {:?} and {:?}", - value, - field_dt - ); - None - } - } -} - -fn left_larger_than_right(left: ScalarValue, right: ScalarValue) -> Option { - match (&left, &right) { - (ScalarValue::Float64(Some(l)), ScalarValue::Float64(Some(r))) => Some(l > r), - (ScalarValue::Float32(Some(l)), ScalarValue::Float32(Some(r))) => Some(l > r), - (ScalarValue::Int8(Some(l)), ScalarValue::Int8(Some(r))) => Some(l > r), - (ScalarValue::Int16(Some(l)), ScalarValue::Int16(Some(r))) => Some(l > r), - (ScalarValue::Int32(Some(l)), ScalarValue::Int32(Some(r))) => Some(l > r), - (ScalarValue::Int64(Some(l)), ScalarValue::Int64(Some(r))) => Some(l > r), - (ScalarValue::Utf8(Some(l)), ScalarValue::Utf8(Some(r))) => Some(l > r), - (ScalarValue::Boolean(Some(l)), ScalarValue::Boolean(Some(r))) => Some(l & !r), - _ => { - log::error!( - "Scalar value comparison unimplemented for {:?} and {:?}", - left, - right - ); - None - } - } -} - pub(crate) fn logical_expr_to_physical_expr( expr: &Expr, schema: &ArrowSchema, @@ -1706,81 +1584,11 @@ mod tests { ]; for (raw, data_type, ref_scalar) in reference_pairs { - let scalar = to_correct_scalar_value(raw, data_type).unwrap(); + let scalar = to_correct_scalar_value(raw, data_type).unwrap().unwrap(); assert_eq!(*ref_scalar, scalar) } } - #[test] - fn test_to_scalar_value() { - let reference_pairs = &[ - ( - json!("val"), - Some(ScalarValue::Utf8(Some(String::from("val")))), - ), - (json!("2"), Some(ScalarValue::Utf8(Some(String::from("2"))))), - (json!(true), Some(ScalarValue::Boolean(Some(true)))), - (json!(false), Some(ScalarValue::Boolean(Some(false)))), - (json!(2), Some(ScalarValue::Int64(Some(2)))), - (json!(-2), Some(ScalarValue::Int64(Some(-2)))), - (json!(2.0), Some(ScalarValue::Float64(Some(2.0)))), - (json!(["1", "2"]), None), - (json!({"key": "val"}), None), - (json!(null), None), - ]; - for (stat_val, scalar_val) in reference_pairs { - assert_eq!(to_scalar_value(stat_val), *scalar_val) - } - } - - #[test] - fn test_left_larger_than_right() { - let correct_reference_pairs = vec![ - ( - ScalarValue::Float64(Some(1.0)), - ScalarValue::Float64(Some(2.0)), - ), - ( - ScalarValue::Float32(Some(1.0)), - ScalarValue::Float32(Some(2.0)), - ), - (ScalarValue::Int8(Some(1)), ScalarValue::Int8(Some(2))), - (ScalarValue::Int16(Some(1)), ScalarValue::Int16(Some(2))), - (ScalarValue::Int32(Some(1)), ScalarValue::Int32(Some(2))), - (ScalarValue::Int64(Some(1)), ScalarValue::Int64(Some(2))), - ( - ScalarValue::Boolean(Some(false)), - ScalarValue::Boolean(Some(true)), - ), - ( - ScalarValue::Utf8(Some(String::from("1"))), - ScalarValue::Utf8(Some(String::from("2"))), - ), - ]; - for (smaller_val, larger_val) in correct_reference_pairs { - assert_eq!( - left_larger_than_right(smaller_val.clone(), larger_val.clone()), - 
Some(false) - ); - assert_eq!(left_larger_than_right(larger_val, smaller_val), Some(true)); - } - - let incorrect_reference_pairs = vec![ - ( - ScalarValue::Float64(Some(1.0)), - ScalarValue::Float32(Some(2.0)), - ), - (ScalarValue::Int32(Some(1)), ScalarValue::Float32(Some(2.0))), - ( - ScalarValue::Boolean(Some(true)), - ScalarValue::Float32(Some(2.0)), - ), - ]; - for (left, right) in incorrect_reference_pairs { - assert_eq!(left_larger_than_right(left, right), None); - } - } - #[test] fn test_partitioned_file_from_action() { let mut partition_values = std::collections::HashMap::new(); diff --git a/crates/deltalake-core/src/operations/mod.rs b/crates/deltalake-core/src/operations/mod.rs index abf9753648..863a5a8b63 100644 --- a/crates/deltalake-core/src/operations/mod.rs +++ b/crates/deltalake-core/src/operations/mod.rs @@ -208,7 +208,7 @@ mod datafusion_utils { metrics::{ExecutionPlanMetricsSet, MetricsSet}, ExecutionPlan, RecordBatchStream, SendableRecordBatchStream, }; - use datafusion_common::DFSchema; + use datafusion_common::{DFSchema, Statistics}; use datafusion_expr::Expr; use futures::{Stream, StreamExt}; @@ -334,7 +334,7 @@ mod datafusion_utils { })) } - fn statistics(&self) -> datafusion_common::Statistics { + fn statistics(&self) -> DataFusionResult { self.parent.statistics() } diff --git a/crates/deltalake-core/src/operations/optimize.rs b/crates/deltalake-core/src/operations/optimize.rs index d86ef6f3e5..e7d6a1b11d 100644 --- a/crates/deltalake-core/src/operations/optimize.rs +++ b/crates/deltalake-core/src/operations/optimize.rs @@ -1185,10 +1185,10 @@ pub(super) mod zorder { .ok_or(DataFusionError::NotImplemented( "z-order on zero columns.".to_string(), ))?; - let columns = columns + let columns: Vec = columns .iter() .map(|col| col.clone().into_array(length)) - .collect_vec(); + .try_collect()?; let array = zorder_key(&columns)?; Ok(ColumnarValue::Array(array)) } diff --git a/crates/deltalake-core/src/operations/transaction/state.rs b/crates/deltalake-core/src/operations/transaction/state.rs index a209b7369d..0fbbd554dc 100644 --- a/crates/deltalake-core/src/operations/transaction/state.rs +++ b/crates/deltalake-core/src/operations/transaction/state.rs @@ -188,9 +188,12 @@ impl<'a> AddContainer<'a> { Some(v) => serde_json::Value::String(v.to_string()), None => serde_json::Value::Null, }; - to_correct_scalar_value(&value, data_type).unwrap_or( - get_null_of_arrow_type(data_type).expect("Could not determine null type"), - ) + to_correct_scalar_value(&value, data_type) + .ok() + .flatten() + .unwrap_or( + get_null_of_arrow_type(data_type).expect("Could not determine null type"), + ) } else if let Ok(Some(statistics)) = add.get_stats() { let values = if get_max { statistics.max_values @@ -200,7 +203,11 @@ impl<'a> AddContainer<'a> { values .get(&column.name) - .and_then(|f| to_correct_scalar_value(f.as_value()?, data_type)) + .and_then(|f| { + to_correct_scalar_value(f.as_value()?, data_type) + .ok() + .flatten() + }) .unwrap_or( get_null_of_arrow_type(data_type).expect("Could not determine null type"), ) diff --git a/crates/deltalake-core/tests/integration_datafusion.rs b/crates/deltalake-core/tests/integration_datafusion.rs index 7a9c38463f..a412ce6417 100644 --- a/crates/deltalake-core/tests/integration_datafusion.rs +++ b/crates/deltalake-core/tests/integration_datafusion.rs @@ -46,6 +46,7 @@ use std::error::Error; mod common; mod local { + use datafusion::common::stats::Precision; use deltalake_core::writer::JsonWriter; use super::*; @@ -281,67 +282,146 @@ mod local { 
#[tokio::test] async fn test_datafusion_stats() -> Result<()> { + // Validate a table that contains statisitics for all files let table = open_table("./tests/data/delta-0.8.0").await.unwrap(); - let statistics = table.state.datafusion_table_statistics(); + let statistics = table.state.datafusion_table_statistics()?; - assert_eq!(statistics.num_rows, Some(4),); + assert_eq!(statistics.num_rows, Precision::Exact(4_usize),); - assert_eq!(statistics.total_byte_size, Some(440 + 440)); + assert_eq!( + statistics.total_byte_size, + Precision::Exact((440 + 440) as usize) + ); + let column_stats = statistics.column_statistics.get(0).unwrap(); + assert_eq!(column_stats.null_count, Precision::Exact(0)); assert_eq!( - statistics - .column_statistics - .clone() - .unwrap() - .iter() - .map(|x| x.null_count) - .collect::>>(), - vec![Some(0)], + column_stats.max_value, + Precision::Exact(ScalarValue::from(4_i32)) + ); + assert_eq!( + column_stats.min_value, + Precision::Exact(ScalarValue::from(0_i32)) ); let ctx = SessionContext::new(); ctx.register_table("test_table", Arc::new(table))?; - - let batches = ctx + let actual = ctx .sql("SELECT max(value), min(value) FROM test_table") .await? .collect() .await?; - assert_eq!(batches.len(), 1); - let batch = &batches[0]; + let expected = vec![ + "+-----------------------+-----------------------+", + "| MAX(test_table.value) | MIN(test_table.value) |", + "+-----------------------+-----------------------+", + "| 4 | 0 |", + "+-----------------------+-----------------------+", + ]; + assert_batches_sorted_eq!(&expected, &actual); + + // Validate a table that does not contain column statisitics + let table = open_table("./tests/data/delta-0.2.0").await.unwrap(); + let statistics = table.state.datafusion_table_statistics()?; + + assert_eq!(statistics.num_rows, Precision::Absent); + assert_eq!( - batch.column(0).as_ref(), - Arc::new(Int32Array::from(vec![4])).as_ref(), + statistics.total_byte_size, + Precision::Exact((400 + 404 + 396) as usize) ); + let column_stats = statistics.column_statistics.get(0).unwrap(); + assert_eq!(column_stats.null_count, Precision::Absent); + assert_eq!(column_stats.max_value, Precision::Absent); + assert_eq!(column_stats.min_value, Precision::Absent); + + ctx.register_table("test_table2", Arc::new(table))?; + let actual = ctx + .sql("SELECT max(value), min(value) FROM test_table2") + .await? + .collect() + .await?; + let expected = vec![ + "+------------------------+------------------------+", + "| MAX(test_table2.value) | MIN(test_table2.value) |", + "+------------------------+------------------------+", + "| 3 | 1 |", + "+------------------------+------------------------+", + ]; + assert_batches_sorted_eq!(&expected, &actual); + + // Validate a table that contains nested structures. + + // This table is interesting since it goes through schema evolution. + // In particular 'new_column' contains statistics for when it + // is introduced (10) but the commit following (11) does not contain + // statistics for this column. 
+ let table = open_table("./tests/data/delta-1.2.1-only-struct-stats") + .await + .unwrap(); + let schema = table.get_schema().unwrap(); + let statistics = table.state.datafusion_table_statistics()?; + assert_eq!(statistics.num_rows, Precision::Exact(12)); + + // `new_column` statistics + let stats = statistics + .column_statistics + .get(schema.index_of("new_column").unwrap()) + .unwrap(); + assert_eq!(stats.null_count, Precision::Absent); + assert_eq!(stats.min_value, Precision::Absent); + assert_eq!(stats.max_value, Precision::Absent); + + // `date` statistics + let stats = statistics + .column_statistics + .get(schema.index_of("date").unwrap()) + .unwrap(); + assert_eq!(stats.null_count, Precision::Exact(0)); + // 2022-10-24 assert_eq!( - batch.column(1).as_ref(), - Arc::new(Int32Array::from(vec![0])).as_ref(), + stats.min_value, + Precision::Exact(ScalarValue::Date32(Some(19289))) ); - assert_eq!( - statistics - .column_statistics - .clone() - .unwrap() - .iter() - .map(|x| x.max_value.as_ref()) - .collect::>>(), - vec![Some(&ScalarValue::from(4_i32))], + stats.max_value, + Precision::Exact(ScalarValue::Date32(Some(19289))) ); + // `timestamp` statistics + let stats = statistics + .column_statistics + .get(schema.index_of("timestamp").unwrap()) + .unwrap(); + assert_eq!(stats.null_count, Precision::Exact(0)); + // 2022-10-24T22:59:32.846Z assert_eq!( - statistics - .column_statistics - .clone() - .unwrap() - .iter() - .map(|x| x.min_value.as_ref()) - .collect::>>(), - vec![Some(&ScalarValue::from(0_i32))], + stats.min_value, + Precision::Exact(ScalarValue::TimestampMicrosecond( + Some(1666652372846000), + None + )) + ); + // 2022-10-24T22:59:46.083Z + assert_eq!( + stats.max_value, + Precision::Exact(ScalarValue::TimestampMicrosecond( + Some(1666652386083000), + None + )) ); + // `struct_element` statistics + let stats = statistics + .column_statistics + .get(schema.index_of("nested_struct").unwrap()) + .unwrap(); + assert_eq!(stats.null_count, Precision::Absent); + assert_eq!(stats.min_value, Precision::Absent); + assert_eq!(stats.max_value, Precision::Absent); + Ok(()) } @@ -782,14 +862,14 @@ mod local { let expected_schema = ArrowSchema::new(vec![ ArrowField::new("c3", ArrowDataType::Int32, true), - ArrowField::new("c1", ArrowDataType::Int32, false), + ArrowField::new("c1", ArrowDataType::Int32, true), ArrowField::new( "c2", ArrowDataType::Dictionary( Box::new(ArrowDataType::UInt16), Box::new(ArrowDataType::Utf8), ), - false, + true, ), ]); diff --git a/crates/deltalake-sql/src/planner.rs b/crates/deltalake-sql/src/planner.rs index 3243ed9c7e..bf07825d4b 100644 --- a/crates/deltalake-sql/src/planner.rs +++ b/crates/deltalake-sql/src/planner.rs @@ -92,6 +92,7 @@ mod tests { use arrow_schema::{DataType, Field, Schema}; use datafusion_common::config::ConfigOptions; use datafusion_common::DataFusionError; + use datafusion_common::Result as DataFusionResult; use datafusion_expr::logical_plan::builder::LogicalTableSource; use datafusion_expr::{AggregateUDF, ScalarUDF, TableSource}; use datafusion_sql::TableReference; @@ -124,6 +125,10 @@ mod tests { impl ContextProvider for TestSchemaProvider { fn get_table_provider(&self, name: TableReference) -> DFResult> { + self.get_table_source(name) + } + + fn get_table_source(&self, name: TableReference) -> DFResult> { match self.tables.get(name.table()) { Some(table) => Ok(table.clone()), _ => Err(DataFusionError::Plan(format!( diff --git a/python/Cargo.toml b/python/Cargo.toml index 2c85fcb262..5194a2fc22 100644 --- a/python/Cargo.toml +++ 
b/python/Cargo.toml @@ -39,7 +39,7 @@ tokio = { workspace = true, features = ["rt-multi-thread"] } reqwest = { version = "*", features = ["native-tls-vendored"] } [dependencies.pyo3] -version = "0.19" +version = "0.20" features = ["extension-module", "abi3", "abi3-py38"] [dependencies.deltalake] From c15d7683b4512f6d808c204d0e3dd4f77b7c4216 Mon Sep 17 00:00:00 2001 From: Robert Pack <42610831+roeap@users.noreply.github.com> Date: Fri, 17 Nov 2023 07:32:54 +0100 Subject: [PATCH 16/23] feat: handle protocol compatibility (#1807) # Description In preparation for further improvements to protocol support, this PR introduces a `ProtocolChecker` which validates that we can read form / write to / commit to a specific table. So far everything is expressed in table features in the hopes, that this keeps on giving correct behaviours as we add more table features. The existing support for append only is integrated and extended to handle enablement logic according to the protocol. --------- Co-authored-by: Will Jones --- crates/deltalake-core/Cargo.toml | 1 + .../deltalake-core/benches/read_checkpoint.rs | 4 +- .../src/kernel/actions/types.rs | 46 ++ .../deltalake-core/src/operations/create.rs | 17 +- .../deltalake-core/src/operations/delete.rs | 6 +- crates/deltalake-core/src/operations/load.rs | 3 + crates/deltalake-core/src/operations/merge.rs | 4 +- crates/deltalake-core/src/operations/mod.rs | 5 - .../deltalake-core/src/operations/optimize.rs | 4 +- .../deltalake-core/src/operations/restore.rs | 1 - .../src/operations/transaction/mod.rs | 80 ++-- .../src/operations/transaction/protocol.rs | 404 ++++++++++++++++++ .../src/operations/transaction/test_utils.rs | 6 +- .../deltalake-core/src/operations/update.rs | 4 +- crates/deltalake-core/src/operations/write.rs | 23 +- .../tests/integration_concurrent_writes.rs | 2 +- 16 files changed, 521 insertions(+), 89 deletions(-) create mode 100644 crates/deltalake-core/src/operations/transaction/protocol.rs diff --git a/crates/deltalake-core/Cargo.toml b/crates/deltalake-core/Cargo.toml index fc3e21c38c..95331eb93e 100644 --- a/crates/deltalake-core/Cargo.toml +++ b/crates/deltalake-core/Cargo.toml @@ -130,6 +130,7 @@ tempfile = "3" tokio = { version = "1", features = ["macros", "rt-multi-thread"] } utime = "0.3" hyper = { version = "0.14", features = ["server"] } +criterion = "0.5" [features] azure = ["object_store/azure"] diff --git a/crates/deltalake-core/benches/read_checkpoint.rs b/crates/deltalake-core/benches/read_checkpoint.rs index 2ecbee661b..0db72c3e17 100644 --- a/crates/deltalake-core/benches/read_checkpoint.rs +++ b/crates/deltalake-core/benches/read_checkpoint.rs @@ -1,6 +1,6 @@ use criterion::{criterion_group, criterion_main, Criterion}; -use deltalake::table::state::DeltaTableState; -use deltalake::DeltaTableConfig; +use deltalake_core::table::state::DeltaTableState; +use deltalake_core::DeltaTableConfig; use std::fs::File; use std::io::Read; diff --git a/crates/deltalake-core/src/kernel/actions/types.rs b/crates/deltalake-core/src/kernel/actions/types.rs index e8060f145e..a788315b82 100644 --- a/crates/deltalake-core/src/kernel/actions/types.rs +++ b/crates/deltalake-core/src/kernel/actions/types.rs @@ -1,4 +1,5 @@ use std::collections::{HashMap, HashSet}; +use std::fmt; use std::str::FromStr; // use std::io::{Cursor, Read}; // use std::sync::Arc; @@ -225,6 +226,24 @@ impl From for ReaderFeatures { } } +impl AsRef for ReaderFeatures { + fn as_ref(&self) -> &str { + match self { + ReaderFeatures::ColumnMapping => "columnMapping", + 
ReaderFeatures::DeleteionVecotrs => "deletionVectors", + ReaderFeatures::TimestampWithoutTimezone => "timestampNtz", + ReaderFeatures::V2Checkpoint => "v2Checkpoint", + ReaderFeatures::Other(f) => f, + } + } +} + +impl fmt::Display for ReaderFeatures { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.as_ref()) + } +} + /// Features table writers can support as well as let users know /// what is supported #[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Hash)] @@ -303,6 +322,33 @@ impl From for WriterFeatures { } } +impl AsRef for WriterFeatures { + fn as_ref(&self) -> &str { + match self { + WriterFeatures::AppendOnly => "appendOnly", + WriterFeatures::Invariants => "invariants", + WriterFeatures::CheckConstraints => "checkConstraints", + WriterFeatures::ChangeDataFeed => "changeDataFeed", + WriterFeatures::GeneratedColumns => "generatedColumns", + WriterFeatures::ColumnMapping => "columnMapping", + WriterFeatures::IdentityColumns => "identityColumns", + WriterFeatures::DeleteionVecotrs => "deletionVectors", + WriterFeatures::RowTracking => "rowTracking", + WriterFeatures::TimestampWithoutTimezone => "timestampNtz", + WriterFeatures::DomainMetadata => "domainMetadata", + WriterFeatures::V2Checkpoint => "v2Checkpoint", + WriterFeatures::IcebergCompatV1 => "icebergCompatV1", + WriterFeatures::Other(f) => f, + } + } +} + +impl fmt::Display for WriterFeatures { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.as_ref()) + } +} + #[cfg(all(not(feature = "parquet2"), feature = "parquet"))] impl From<&parquet::record::Field> for WriterFeatures { fn from(value: &parquet::record::Field) -> Self { diff --git a/crates/deltalake-core/src/operations/create.rs b/crates/deltalake-core/src/operations/create.rs index 71398faf97..84c2e03627 100644 --- a/crates/deltalake-core/src/operations/create.rs +++ b/crates/deltalake-core/src/operations/create.rs @@ -7,8 +7,7 @@ use std::sync::Arc; use futures::future::BoxFuture; use serde_json::{Map, Value}; -use super::transaction::commit; -use super::{MAX_SUPPORTED_READER_VERSION, MAX_SUPPORTED_WRITER_VERSION}; +use super::transaction::{commit, PROTOCOL}; use crate::errors::{DeltaResult, DeltaTableError}; use crate::kernel::{Action, DataType, Metadata, Protocol, StructField, StructType}; use crate::logstore::{LogStore, LogStoreRef}; @@ -245,8 +244,8 @@ impl CreateBuilder { _ => unreachable!(), }) .unwrap_or_else(|| Protocol { - min_reader_version: MAX_SUPPORTED_READER_VERSION, - min_writer_version: MAX_SUPPORTED_WRITER_VERSION, + min_reader_version: PROTOCOL.default_reader_version(), + min_writer_version: PROTOCOL.default_writer_version(), writer_features: None, reader_features: None, }); @@ -391,8 +390,14 @@ mod tests { .await .unwrap(); assert_eq!(table.version(), 0); - assert_eq!(table.get_min_reader_version(), MAX_SUPPORTED_READER_VERSION); - assert_eq!(table.get_min_writer_version(), MAX_SUPPORTED_WRITER_VERSION); + assert_eq!( + table.get_min_reader_version(), + PROTOCOL.default_reader_version() + ); + assert_eq!( + table.get_min_writer_version(), + PROTOCOL.default_writer_version() + ); assert_eq!(table.schema().unwrap(), &schema); // check we can overwrite default settings via adding actions diff --git a/crates/deltalake-core/src/operations/delete.rs b/crates/deltalake-core/src/operations/delete.rs index bd361c9707..b6c94f423b 100644 --- a/crates/deltalake-core/src/operations/delete.rs +++ b/crates/deltalake-core/src/operations/delete.rs @@ -34,6 +34,8 @@ use 
parquet::file::properties::WriterProperties; use serde::Serialize; use serde_json::Value; +use super::datafusion_utils::Expression; +use super::transaction::PROTOCOL; use crate::delta_datafusion::expr::fmt_expr_to_sql; use crate::delta_datafusion::{find_files, register_store, DeltaScanBuilder}; use crate::errors::{DeltaResult, DeltaTableError}; @@ -44,8 +46,6 @@ use crate::protocol::DeltaOperation; use crate::table::state::DeltaTableState; use crate::DeltaTable; -use super::datafusion_utils::Expression; - /// Delete Records from the Delta Table. /// See this module's documentation for more information pub struct DeleteBuilder { @@ -274,6 +274,8 @@ impl std::future::IntoFuture for DeleteBuilder { let mut this = self; Box::pin(async move { + PROTOCOL.can_write_to(&this.snapshot)?; + let state = this.state.unwrap_or_else(|| { let session = SessionContext::new(); diff --git a/crates/deltalake-core/src/operations/load.rs b/crates/deltalake-core/src/operations/load.rs index 1a4c5c4cc6..610f86dee6 100644 --- a/crates/deltalake-core/src/operations/load.rs +++ b/crates/deltalake-core/src/operations/load.rs @@ -6,6 +6,7 @@ use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; use datafusion::physical_plan::{ExecutionPlan, SendableRecordBatchStream}; use futures::future::BoxFuture; +use super::transaction::PROTOCOL; use crate::errors::{DeltaResult, DeltaTableError}; use crate::logstore::LogStoreRef; use crate::table::state::DeltaTableState; @@ -46,6 +47,8 @@ impl std::future::IntoFuture for LoadBuilder { let this = self; Box::pin(async move { + PROTOCOL.can_read_from(&this.snapshot)?; + let table = DeltaTable::new_with_state(this.log_store, this.snapshot); let schema = table.state.arrow_schema()?; let projection = this diff --git a/crates/deltalake-core/src/operations/merge.rs b/crates/deltalake-core/src/operations/merge.rs index d38ddf0efb..a9ad6a8655 100644 --- a/crates/deltalake-core/src/operations/merge.rs +++ b/crates/deltalake-core/src/operations/merge.rs @@ -64,7 +64,7 @@ use serde::Serialize; use serde_json::Value; use super::datafusion_utils::{into_expr, maybe_into_expr, Expression}; -use super::transaction::commit; +use super::transaction::{commit, PROTOCOL}; use crate::delta_datafusion::expr::{fmt_expr_to_sql, parse_predicate_expression}; use crate::delta_datafusion::{register_store, DeltaScanBuilder}; use crate::kernel::{Action, Remove}; @@ -1208,6 +1208,8 @@ impl std::future::IntoFuture for MergeBuilder { let mut this = self; Box::pin(async move { + PROTOCOL.can_write_to(&this.snapshot)?; + let state = this.state.unwrap_or_else(|| { let session = SessionContext::new(); diff --git a/crates/deltalake-core/src/operations/mod.rs b/crates/deltalake-core/src/operations/mod.rs index 863a5a8b63..a0dbfd0239 100644 --- a/crates/deltalake-core/src/operations/mod.rs +++ b/crates/deltalake-core/src/operations/mod.rs @@ -50,11 +50,6 @@ pub mod write; #[cfg(all(feature = "arrow", feature = "parquet"))] pub mod writer; -/// Maximum supported writer version -pub const MAX_SUPPORTED_WRITER_VERSION: i32 = 1; -/// Maximum supported reader version -pub const MAX_SUPPORTED_READER_VERSION: i32 = 1; - /// High level interface for executing commands against a DeltaTable pub struct DeltaOps(pub DeltaTable); diff --git a/crates/deltalake-core/src/operations/optimize.rs b/crates/deltalake-core/src/operations/optimize.rs index e7d6a1b11d..ef8905e0c9 100644 --- a/crates/deltalake-core/src/operations/optimize.rs +++ b/crates/deltalake-core/src/operations/optimize.rs @@ -38,7 +38,7 @@ use 
parquet::errors::ParquetError; use parquet::file::properties::WriterProperties; use serde::{Deserialize, Serialize}; -use super::transaction::commit; +use super::transaction::{commit, PROTOCOL}; use super::writer::{PartitionWriter, PartitionWriterConfig}; use crate::errors::{DeltaResult, DeltaTableError}; use crate::kernel::{Action, Remove}; @@ -260,6 +260,8 @@ impl<'a> std::future::IntoFuture for OptimizeBuilder<'a> { let this = self; Box::pin(async move { + PROTOCOL.can_write_to(&this.snapshot)?; + let writer_properties = this.writer_properties.unwrap_or_else(|| { WriterProperties::builder() .set_compression(Compression::ZSTD(ZstdLevel::try_new(4).unwrap())) diff --git a/crates/deltalake-core/src/operations/restore.rs b/crates/deltalake-core/src/operations/restore.rs index c391de6f04..be43bacf5f 100644 --- a/crates/deltalake-core/src/operations/restore.rs +++ b/crates/deltalake-core/src/operations/restore.rs @@ -245,7 +245,6 @@ async fn execute( datetime: datetime_to_restore.map(|time| -> i64 { time.timestamp_millis() }), }, &actions, - &snapshot, None, ) .await?; diff --git a/crates/deltalake-core/src/operations/transaction/mod.rs b/crates/deltalake-core/src/operations/transaction/mod.rs index e5e808d2d5..2f62f663cf 100644 --- a/crates/deltalake-core/src/operations/transaction/mod.rs +++ b/crates/deltalake-core/src/operations/transaction/mod.rs @@ -7,21 +7,23 @@ use object_store::path::Path; use object_store::{Error as ObjectStoreError, ObjectStore}; use serde_json::Value; +use self::conflict_checker::{CommitConflictError, TransactionInfo, WinningCommitSummary}; use crate::crate_version; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::kernel::{Action, CommitInfo}; +use crate::kernel::{Action, CommitInfo, ReaderFeatures, WriterFeatures}; use crate::logstore::LogStore; use crate::protocol::DeltaOperation; use crate::table::state::DeltaTableState; +pub use self::protocol::INSTANCE as PROTOCOL; + mod conflict_checker; +mod protocol; #[cfg(feature = "datafusion")] mod state; #[cfg(test)] pub(crate) mod test_utils; -use self::conflict_checker::{CommitConflictError, TransactionInfo, WinningCommitSummary}; - const DELTA_LOG_FOLDER: &str = "_delta_log"; /// Error raised while commititng transaction @@ -45,17 +47,36 @@ pub enum TransactionError { #[from] source: ObjectStoreError, }, + /// Error returned when a commit conflict ocurred #[error("Failed to commit transaction: {0}")] CommitConflict(#[from] CommitConflictError), + /// Error returned when maximum number of commit trioals is exceeded #[error("Failed to commit transaction: {0}")] MaxCommitAttempts(i32), + /// The transaction includes Remove action with data change but Delta table is append-only #[error( "The transaction includes Remove action with data change but Delta table is append-only" )] DeltaTableAppendOnly, + + /// Error returned when unsupported reader features are required + #[error("Unsupported reader features required: {0:?}")] + UnsupportedReaderFeatures(Vec), + + /// Error returned when unsupported writer features are required + #[error("Unsupported writer features required: {0:?}")] + UnsupportedWriterFeatures(Vec), + + /// Error returned when writer features are required but not specified + #[error("Writer features must be specified for writerversion >= 7")] + WriterFeaturesRequired, + + /// Error returned when reader features are required but not specified + #[error("Reader features must be specified for reader version >= 3")] + ReaderFeaturesRequired, } impl From for DeltaTableError { @@ -76,18 +97,9 @@ 
impl From for DeltaTableError { // Convert actions to their json representation fn log_entry_from_actions<'a>( actions: impl IntoIterator, - read_snapshot: &DeltaTableState, ) -> Result { - let append_only = read_snapshot.table_config().append_only(); let mut jsons = Vec::::new(); for action in actions { - if append_only { - if let Action::Remove(remove) = action { - if remove.data_change { - return Err(TransactionError::DeltaTableAppendOnly); - } - } - } let json = serde_json::to_string(action) .map_err(|e| TransactionError::SerializeLogJson { json_err: e })?; jsons.push(json); @@ -98,7 +110,6 @@ fn log_entry_from_actions<'a>( pub(crate) fn get_commit_bytes( operation: &DeltaOperation, actions: &Vec, - read_snapshot: &DeltaTableState, app_metadata: Option>, ) -> Result { if !actions.iter().any(|a| matches!(a, Action::CommitInfo(..))) { @@ -117,13 +128,9 @@ pub(crate) fn get_commit_bytes( actions .iter() .chain(std::iter::once(&Action::CommitInfo(commit_info))), - read_snapshot, )?)) } else { - Ok(bytes::Bytes::from(log_entry_from_actions( - actions, - read_snapshot, - )?)) + Ok(bytes::Bytes::from(log_entry_from_actions(actions)?)) } } @@ -135,11 +142,10 @@ pub(crate) async fn prepare_commit<'a>( storage: &dyn ObjectStore, operation: &DeltaOperation, actions: &Vec, - read_snapshot: &DeltaTableState, app_metadata: Option>, ) -> Result { // Serialize all actions that are part of this log entry. - let log_entry = get_commit_bytes(operation, actions, read_snapshot, app_metadata)?; + let log_entry = get_commit_bytes(operation, actions, app_metadata)?; // Write delta log entry as temporary file to storage. For the actual commit, // the temporary file is moved (atomic rename) to the delta log folder within `commit` function. @@ -185,11 +191,11 @@ pub async fn commit_with_retries( app_metadata: Option>, max_retries: usize, ) -> DeltaResult { + PROTOCOL.can_commit(read_snapshot, actions)?; let tmp_commit = prepare_commit( log_store.object_store().as_ref(), &operation, actions, - read_snapshot, app_metadata, ) .await?; @@ -240,12 +246,9 @@ pub async fn commit_with_retries( mod tests { use std::{collections::HashMap, sync::Arc}; - use self::test_utils::{create_remove_action, init_table_actions}; + use self::test_utils::init_table_actions; use super::*; - use crate::{ - logstore::default_logstore::DefaultLogStore, storage::commit_uri_from_version, - DeltaConfigKey, - }; + use crate::{logstore::default_logstore::DefaultLogStore, storage::commit_uri_from_version}; use object_store::memory::InMemory; use url::Url; @@ -260,35 +263,12 @@ mod tests { #[test] fn test_log_entry_from_actions() { let actions = init_table_actions(None); - let state = DeltaTableState::from_actions(actions.clone(), 0).unwrap(); - let entry = log_entry_from_actions(&actions, &state).unwrap(); + let entry = log_entry_from_actions(&actions).unwrap(); let lines: Vec<_> = entry.lines().collect(); // writes every action to a line assert_eq!(actions.len(), lines.len()) } - fn remove_action_exists_when_delta_table_is_append_only( - data_change: bool, - ) -> Result { - let remove = create_remove_action("test_append_only", data_change); - let mut actions = init_table_actions(Some(HashMap::from([( - DeltaConfigKey::AppendOnly.as_ref().to_string(), - Some("true".to_string()), - )]))); - actions.push(remove); - let state = - DeltaTableState::from_actions(actions.clone(), 0).expect("Failed to get table state"); - log_entry_from_actions(&actions, &state) - } - - #[test] - fn test_remove_action_exists_when_delta_table_is_append_only() { - let 
_err = remove_action_exists_when_delta_table_is_append_only(true) - .expect_err("Remove action is included when Delta table is append-only. Should error"); - let _actions = remove_action_exists_when_delta_table_is_append_only(false) - .expect("Data is not changed by the Remove action. Should succeed"); - } - #[tokio::test] async fn test_try_commit_transaction() { let store = Arc::new(InMemory::new()); diff --git a/crates/deltalake-core/src/operations/transaction/protocol.rs b/crates/deltalake-core/src/operations/transaction/protocol.rs new file mode 100644 index 0000000000..47e4d0a41a --- /dev/null +++ b/crates/deltalake-core/src/operations/transaction/protocol.rs @@ -0,0 +1,404 @@ +use std::collections::HashSet; + +use lazy_static::lazy_static; +use once_cell::sync::Lazy; + +use super::TransactionError; +use crate::kernel::{Action, ReaderFeatures, WriterFeatures}; +use crate::table::state::DeltaTableState; + +lazy_static! { + static ref READER_V2: HashSet = + HashSet::from_iter([ReaderFeatures::ColumnMapping]); + static ref WRITER_V2: HashSet = + HashSet::from_iter([WriterFeatures::AppendOnly, WriterFeatures::Invariants]); + static ref WRITER_V3: HashSet = HashSet::from_iter([ + WriterFeatures::AppendOnly, + WriterFeatures::Invariants, + WriterFeatures::CheckConstraints + ]); + static ref WRITER_V4: HashSet = HashSet::from_iter([ + WriterFeatures::AppendOnly, + WriterFeatures::Invariants, + WriterFeatures::CheckConstraints, + WriterFeatures::ChangeDataFeed, + WriterFeatures::GeneratedColumns + ]); + static ref WRITER_V5: HashSet = HashSet::from_iter([ + WriterFeatures::AppendOnly, + WriterFeatures::Invariants, + WriterFeatures::CheckConstraints, + WriterFeatures::ChangeDataFeed, + WriterFeatures::GeneratedColumns, + WriterFeatures::ColumnMapping, + ]); + static ref WRITER_V6: HashSet = HashSet::from_iter([ + WriterFeatures::AppendOnly, + WriterFeatures::Invariants, + WriterFeatures::CheckConstraints, + WriterFeatures::ChangeDataFeed, + WriterFeatures::GeneratedColumns, + WriterFeatures::ColumnMapping, + WriterFeatures::IdentityColumns, + ]); +} + +pub struct ProtocolChecker { + reader_features: HashSet, + writer_features: HashSet, +} + +impl ProtocolChecker { + /// Create a new protocol checker. + pub fn new( + reader_features: HashSet, + writer_features: HashSet, + ) -> Self { + Self { + reader_features, + writer_features, + } + } + + pub fn default_reader_version(&self) -> i32 { + 1 + } + + pub fn default_writer_version(&self) -> i32 { + 2 + } + + /// Check if delta-rs can read form the given delta table. + pub fn can_read_from(&self, snapshot: &DeltaTableState) -> Result<(), TransactionError> { + let required_features: Option<&HashSet> = + match snapshot.min_reader_version() { + 0 | 1 => None, + 2 => Some(&READER_V2), + _ => snapshot.reader_features(), + }; + if let Some(features) = required_features { + let mut diff = features.difference(&self.reader_features).peekable(); + if diff.peek().is_some() { + return Err(TransactionError::UnsupportedReaderFeatures( + diff.cloned().collect(), + )); + } + }; + Ok(()) + } + + /// Check if delta-rs can write to the given delta table. 
+ pub fn can_write_to(&self, snapshot: &DeltaTableState) -> Result<(), TransactionError> { + // NOTE: writers must always support all required reader features + self.can_read_from(snapshot)?; + + let required_features: Option<&HashSet> = + match snapshot.min_writer_version() { + 0 | 1 => None, + 2 => Some(&WRITER_V2), + 3 => Some(&WRITER_V3), + 4 => Some(&WRITER_V4), + 5 => Some(&WRITER_V5), + 6 => Some(&WRITER_V6), + _ => snapshot.writer_features(), + }; + + if let Some(features) = required_features { + let mut diff = features.difference(&self.writer_features).peekable(); + if diff.peek().is_some() { + return Err(TransactionError::UnsupportedWriterFeatures( + diff.cloned().collect(), + )); + } + }; + Ok(()) + } + + pub fn can_commit( + &self, + snapshot: &DeltaTableState, + actions: &[Action], + ) -> Result<(), TransactionError> { + self.can_write_to(snapshot)?; + + // https://github.com/delta-io/delta/blob/master/PROTOCOL.md#append-only-tables + let append_only_enabled = if snapshot.min_writer_version() < 2 { + false + } else if snapshot.min_writer_version() < 7 { + snapshot.table_config().append_only() + } else { + snapshot + .writer_features() + .ok_or(TransactionError::WriterFeaturesRequired)? + .contains(&WriterFeatures::AppendOnly) + && snapshot.table_config().append_only() + }; + if append_only_enabled { + actions.iter().try_for_each(|action| match action { + Action::Remove(remove) if remove.data_change => { + Err(TransactionError::DeltaTableAppendOnly) + } + _ => Ok(()), + })?; + } + + Ok(()) + } +} + +/// The global protocol checker instance to validate table versions and features. +/// +/// This instance is used by default in all transaction operations, since feature +/// support is not configurable but rather decided at compile time. +/// +/// As we implement new features, we need to update this instance accordingly. +/// resulting version support is determined by the supported table feature set. 
+pub static INSTANCE: Lazy = Lazy::new(|| { + let reader_features = HashSet::new(); + // reader_features.insert(ReaderFeatures::ColumnMapping); + + let mut writer_features = HashSet::new(); + writer_features.insert(WriterFeatures::AppendOnly); + writer_features.insert(WriterFeatures::Invariants); + // writer_features.insert(WriterFeatures::CheckConstraints); + // writer_features.insert(WriterFeatures::ChangeDataFeed); + // writer_features.insert(WriterFeatures::GeneratedColumns); + // writer_features.insert(WriterFeatures::ColumnMapping); + // writer_features.insert(WriterFeatures::IdentityColumns); + + ProtocolChecker::new(reader_features, writer_features) +}); + +#[cfg(test)] +mod tests { + use super::super::test_utils::create_metadata_action; + use super::*; + use crate::kernel::{Action, Add, Protocol, Remove}; + use crate::DeltaConfigKey; + use std::collections::HashMap; + + #[test] + fn test_can_commit_append_only() { + let append_actions = vec![Action::Add(Add { + path: "test".to_string(), + data_change: true, + ..Default::default() + })]; + let change_actions = vec![ + Action::Add(Add { + path: "test".to_string(), + data_change: true, + ..Default::default() + }), + Action::Remove(Remove { + path: "test".to_string(), + data_change: true, + ..Default::default() + }), + ]; + let neutral_actions = vec![ + Action::Add(Add { + path: "test".to_string(), + data_change: false, + ..Default::default() + }), + Action::Remove(Remove { + path: "test".to_string(), + data_change: false, + ..Default::default() + }), + ]; + + let create_actions = |writer: i32, append: &str, feat: Vec| { + vec![ + Action::Protocol(Protocol { + min_reader_version: 1, + min_writer_version: writer, + writer_features: Some(feat.into_iter().collect()), + ..Default::default() + }), + create_metadata_action( + None, + Some(HashMap::from([( + DeltaConfigKey::AppendOnly.as_ref().to_string(), + Some(append.to_string()), + )])), + ), + ] + }; + + let checker = ProtocolChecker::new(HashSet::new(), WRITER_V2.clone()); + + let actions = create_actions(1, "true", vec![]); + let snapshot = DeltaTableState::from_actions(actions, 1).unwrap(); + assert!(checker.can_commit(&snapshot, &append_actions).is_ok()); + assert!(checker.can_commit(&snapshot, &change_actions).is_ok()); + assert!(checker.can_commit(&snapshot, &neutral_actions).is_ok()); + + let actions = create_actions(2, "true", vec![]); + let snapshot = DeltaTableState::from_actions(actions, 1).unwrap(); + assert!(checker.can_commit(&snapshot, &append_actions).is_ok()); + assert!(checker.can_commit(&snapshot, &change_actions).is_err()); + assert!(checker.can_commit(&snapshot, &neutral_actions).is_ok()); + + let actions = create_actions(2, "false", vec![]); + let snapshot = DeltaTableState::from_actions(actions, 1).unwrap(); + assert!(checker.can_commit(&snapshot, &append_actions).is_ok()); + assert!(checker.can_commit(&snapshot, &change_actions).is_ok()); + assert!(checker.can_commit(&snapshot, &neutral_actions).is_ok()); + + let actions = create_actions(7, "true", vec![WriterFeatures::AppendOnly]); + let snapshot = DeltaTableState::from_actions(actions, 1).unwrap(); + assert!(checker.can_commit(&snapshot, &append_actions).is_ok()); + assert!(checker.can_commit(&snapshot, &change_actions).is_err()); + assert!(checker.can_commit(&snapshot, &neutral_actions).is_ok()); + + let actions = create_actions(7, "false", vec![WriterFeatures::AppendOnly]); + let snapshot = DeltaTableState::from_actions(actions, 1).unwrap(); + assert!(checker.can_commit(&snapshot, &append_actions).is_ok()); + 
assert!(checker.can_commit(&snapshot, &change_actions).is_ok()); + assert!(checker.can_commit(&snapshot, &neutral_actions).is_ok()); + + let actions = create_actions(7, "true", vec![]); + let snapshot = DeltaTableState::from_actions(actions, 1).unwrap(); + assert!(checker.can_commit(&snapshot, &append_actions).is_ok()); + assert!(checker.can_commit(&snapshot, &change_actions).is_ok()); + assert!(checker.can_commit(&snapshot, &neutral_actions).is_ok()); + } + + #[test] + fn test_versions() { + let checker_1 = ProtocolChecker::new(HashSet::new(), HashSet::new()); + let actions = vec![Action::Protocol(Protocol { + min_reader_version: 1, + min_writer_version: 1, + ..Default::default() + })]; + let snapshot_1 = DeltaTableState::from_actions(actions, 1).unwrap(); + assert!(checker_1.can_read_from(&snapshot_1).is_ok()); + assert!(checker_1.can_write_to(&snapshot_1).is_ok()); + + let checker_2 = ProtocolChecker::new(READER_V2.clone(), HashSet::new()); + let actions = vec![Action::Protocol(Protocol { + min_reader_version: 2, + min_writer_version: 1, + ..Default::default() + })]; + let snapshot_2 = DeltaTableState::from_actions(actions, 1).unwrap(); + assert!(checker_1.can_read_from(&snapshot_2).is_err()); + assert!(checker_1.can_write_to(&snapshot_2).is_err()); + assert!(checker_2.can_read_from(&snapshot_1).is_ok()); + assert!(checker_2.can_read_from(&snapshot_2).is_ok()); + assert!(checker_2.can_write_to(&snapshot_2).is_ok()); + + let checker_3 = ProtocolChecker::new(READER_V2.clone(), WRITER_V2.clone()); + let actions = vec![Action::Protocol(Protocol { + min_reader_version: 2, + min_writer_version: 2, + ..Default::default() + })]; + let snapshot_3 = DeltaTableState::from_actions(actions, 1).unwrap(); + assert!(checker_1.can_read_from(&snapshot_3).is_err()); + assert!(checker_1.can_write_to(&snapshot_3).is_err()); + assert!(checker_2.can_read_from(&snapshot_3).is_ok()); + assert!(checker_2.can_write_to(&snapshot_3).is_err()); + assert!(checker_3.can_read_from(&snapshot_1).is_ok()); + assert!(checker_3.can_read_from(&snapshot_2).is_ok()); + assert!(checker_3.can_read_from(&snapshot_3).is_ok()); + assert!(checker_3.can_write_to(&snapshot_3).is_ok()); + + let checker_4 = ProtocolChecker::new(READER_V2.clone(), WRITER_V3.clone()); + let actions = vec![Action::Protocol(Protocol { + min_reader_version: 2, + min_writer_version: 3, + ..Default::default() + })]; + let snapshot_4 = DeltaTableState::from_actions(actions, 1).unwrap(); + assert!(checker_1.can_read_from(&snapshot_4).is_err()); + assert!(checker_1.can_write_to(&snapshot_4).is_err()); + assert!(checker_2.can_read_from(&snapshot_4).is_ok()); + assert!(checker_2.can_write_to(&snapshot_4).is_err()); + assert!(checker_3.can_read_from(&snapshot_4).is_ok()); + assert!(checker_3.can_write_to(&snapshot_4).is_err()); + assert!(checker_4.can_read_from(&snapshot_1).is_ok()); + assert!(checker_4.can_read_from(&snapshot_2).is_ok()); + assert!(checker_4.can_read_from(&snapshot_3).is_ok()); + assert!(checker_4.can_read_from(&snapshot_4).is_ok()); + assert!(checker_4.can_write_to(&snapshot_4).is_ok()); + + let checker_5 = ProtocolChecker::new(READER_V2.clone(), WRITER_V4.clone()); + let actions = vec![Action::Protocol(Protocol { + min_reader_version: 2, + min_writer_version: 4, + ..Default::default() + })]; + let snapshot_5 = DeltaTableState::from_actions(actions, 1).unwrap(); + assert!(checker_1.can_read_from(&snapshot_5).is_err()); + assert!(checker_1.can_write_to(&snapshot_5).is_err()); + assert!(checker_2.can_read_from(&snapshot_5).is_ok()); + 
assert!(checker_2.can_write_to(&snapshot_5).is_err()); + assert!(checker_3.can_read_from(&snapshot_5).is_ok()); + assert!(checker_3.can_write_to(&snapshot_5).is_err()); + assert!(checker_4.can_read_from(&snapshot_5).is_ok()); + assert!(checker_4.can_write_to(&snapshot_5).is_err()); + assert!(checker_5.can_read_from(&snapshot_1).is_ok()); + assert!(checker_5.can_read_from(&snapshot_2).is_ok()); + assert!(checker_5.can_read_from(&snapshot_3).is_ok()); + assert!(checker_5.can_read_from(&snapshot_4).is_ok()); + assert!(checker_5.can_read_from(&snapshot_5).is_ok()); + assert!(checker_5.can_write_to(&snapshot_5).is_ok()); + + let checker_6 = ProtocolChecker::new(READER_V2.clone(), WRITER_V5.clone()); + let actions = vec![Action::Protocol(Protocol { + min_reader_version: 2, + min_writer_version: 5, + ..Default::default() + })]; + let snapshot_6 = DeltaTableState::from_actions(actions, 1).unwrap(); + assert!(checker_1.can_read_from(&snapshot_6).is_err()); + assert!(checker_1.can_write_to(&snapshot_6).is_err()); + assert!(checker_2.can_read_from(&snapshot_6).is_ok()); + assert!(checker_2.can_write_to(&snapshot_6).is_err()); + assert!(checker_3.can_read_from(&snapshot_6).is_ok()); + assert!(checker_3.can_write_to(&snapshot_6).is_err()); + assert!(checker_4.can_read_from(&snapshot_6).is_ok()); + assert!(checker_4.can_write_to(&snapshot_6).is_err()); + assert!(checker_5.can_read_from(&snapshot_6).is_ok()); + assert!(checker_5.can_write_to(&snapshot_6).is_err()); + assert!(checker_6.can_read_from(&snapshot_1).is_ok()); + assert!(checker_6.can_read_from(&snapshot_2).is_ok()); + assert!(checker_6.can_read_from(&snapshot_3).is_ok()); + assert!(checker_6.can_read_from(&snapshot_4).is_ok()); + assert!(checker_6.can_read_from(&snapshot_5).is_ok()); + assert!(checker_6.can_read_from(&snapshot_6).is_ok()); + assert!(checker_6.can_write_to(&snapshot_6).is_ok()); + + let checker_7 = ProtocolChecker::new(READER_V2.clone(), WRITER_V6.clone()); + let actions = vec![Action::Protocol(Protocol { + min_reader_version: 2, + min_writer_version: 6, + ..Default::default() + })]; + let snapshot_7 = DeltaTableState::from_actions(actions, 1).unwrap(); + assert!(checker_1.can_read_from(&snapshot_7).is_err()); + assert!(checker_1.can_write_to(&snapshot_7).is_err()); + assert!(checker_2.can_read_from(&snapshot_7).is_ok()); + assert!(checker_2.can_write_to(&snapshot_7).is_err()); + assert!(checker_3.can_read_from(&snapshot_7).is_ok()); + assert!(checker_3.can_write_to(&snapshot_7).is_err()); + assert!(checker_4.can_read_from(&snapshot_7).is_ok()); + assert!(checker_4.can_write_to(&snapshot_7).is_err()); + assert!(checker_5.can_read_from(&snapshot_7).is_ok()); + assert!(checker_5.can_write_to(&snapshot_7).is_err()); + assert!(checker_6.can_read_from(&snapshot_7).is_ok()); + assert!(checker_6.can_write_to(&snapshot_7).is_err()); + assert!(checker_7.can_read_from(&snapshot_1).is_ok()); + assert!(checker_7.can_read_from(&snapshot_2).is_ok()); + assert!(checker_7.can_read_from(&snapshot_3).is_ok()); + assert!(checker_7.can_read_from(&snapshot_4).is_ok()); + assert!(checker_7.can_read_from(&snapshot_5).is_ok()); + assert!(checker_7.can_read_from(&snapshot_6).is_ok()); + assert!(checker_7.can_read_from(&snapshot_7).is_ok()); + assert!(checker_7.can_write_to(&snapshot_7).is_ok()); + } +} diff --git a/crates/deltalake-core/src/operations/transaction/test_utils.rs b/crates/deltalake-core/src/operations/transaction/test_utils.rs index 56b0894019..2efdcde2ea 100644 --- a/crates/deltalake-core/src/operations/transaction/test_utils.rs +++ 
b/crates/deltalake-core/src/operations/transaction/test_utils.rs @@ -6,6 +6,7 @@ use crate::kernel::{ Action, Add, CommitInfo, DataType, Metadata, PrimitiveType, Protocol, Remove, StructField, StructType, }; +use crate::operations::transaction::PROTOCOL; use crate::protocol::{DeltaOperation, SaveMode}; use crate::table::state::DeltaTableState; use crate::table::DeltaTableMetaData; @@ -49,8 +50,8 @@ pub fn create_remove_action(path: impl Into, data_change: bool) -> Actio pub fn create_protocol_action(max_reader: Option, max_writer: Option) -> Action { let protocol = Protocol { - min_reader_version: max_reader.unwrap_or(crate::operations::MAX_SUPPORTED_READER_VERSION), - min_writer_version: max_writer.unwrap_or(crate::operations::MAX_SUPPORTED_WRITER_VERSION), + min_reader_version: max_reader.unwrap_or(PROTOCOL.default_reader_version()), + min_writer_version: max_writer.unwrap_or(PROTOCOL.default_writer_version()), writer_features: None, reader_features: None, }; @@ -165,7 +166,6 @@ pub async fn create_initialized_table( log_store.object_store().as_ref(), &operation, &actions, - &state, None, ) .await diff --git a/crates/deltalake-core/src/operations/update.rs b/crates/deltalake-core/src/operations/update.rs index 9f51912579..7583ed6b39 100644 --- a/crates/deltalake-core/src/operations/update.rs +++ b/crates/deltalake-core/src/operations/update.rs @@ -44,7 +44,7 @@ use serde::Serialize; use serde_json::Value; use super::datafusion_utils::{Expression, MetricObserverExec}; -use super::transaction::commit; +use super::transaction::{commit, PROTOCOL}; use super::write::write_execution_plan; use crate::delta_datafusion::expr::fmt_expr_to_sql; use crate::delta_datafusion::{find_files, register_store, DeltaScanBuilder}; @@ -426,6 +426,8 @@ impl std::future::IntoFuture for UpdateBuilder { let mut this = self; Box::pin(async move { + PROTOCOL.can_write_to(&this.snapshot)?; + let state = this.state.unwrap_or_else(|| { let session = SessionContext::new(); diff --git a/crates/deltalake-core/src/operations/write.rs b/crates/deltalake-core/src/operations/write.rs index dec4b7ced7..cb68b72bb2 100644 --- a/crates/deltalake-core/src/operations/write.rs +++ b/crates/deltalake-core/src/operations/write.rs @@ -38,8 +38,8 @@ use futures::future::BoxFuture; use futures::StreamExt; use parquet::file::properties::WriterProperties; +use super::transaction::PROTOCOL; use super::writer::{DeltaWriter, WriterConfig}; -use super::MAX_SUPPORTED_WRITER_VERSION; use super::{transaction::commit, CreateBuilder}; use crate::delta_datafusion::DeltaDataChecker; use crate::errors::{DeltaResult, DeltaTableError}; @@ -60,16 +60,11 @@ enum WriteError { #[error("Failed to execute write task: {source}")] WriteTask { source: tokio::task::JoinError }, - #[error("Delta-rs does not support writer version requirement: {0}")] - UnsupportedWriterVersion(i32), - #[error("A table already exists at: {0}")] AlreadyExists(String), #[error( - "Specified table partitioning does not match table partitioning: expected: {:?}, got: {:?}", - expected, - got + "Specified table partitioning does not match table partitioning: expected: {expected:?}, got: {got:?}", )] PartitionColumnMismatch { expected: Vec, @@ -213,16 +208,12 @@ impl WriteBuilder { async fn check_preconditions(&self) -> DeltaResult> { match self.log_store.is_delta_table_location().await? 
{ true => { - let min_writer = self.snapshot.min_writer_version(); - if min_writer > MAX_SUPPORTED_WRITER_VERSION { - Err(WriteError::UnsupportedWriterVersion(min_writer).into()) - } else { - match self.mode { - SaveMode::ErrorIfExists => { - Err(WriteError::AlreadyExists(self.log_store.root_uri()).into()) - } - _ => Ok(vec![]), + PROTOCOL.can_write_to(&self.snapshot)?; + match self.mode { + SaveMode::ErrorIfExists => { + Err(WriteError::AlreadyExists(self.log_store.root_uri()).into()) } + _ => Ok(vec![]), } } false => { diff --git a/crates/deltalake-core/tests/integration_concurrent_writes.rs b/crates/deltalake-core/tests/integration_concurrent_writes.rs index 90dba7659a..79c16e85dc 100644 --- a/crates/deltalake-core/tests/integration_concurrent_writes.rs +++ b/crates/deltalake-core/tests/integration_concurrent_writes.rs @@ -69,7 +69,7 @@ async fn prepare_table( assert_eq!(0, table.version()); assert_eq!(1, table.get_min_reader_version()); - assert_eq!(1, table.get_min_writer_version()); + assert_eq!(2, table.get_min_writer_version()); assert_eq!(0, table.get_files().len()); Ok((table, table_uri)) From 0ef811063aa12ca84908e62d714ad6f5a69fa1c5 Mon Sep 17 00:00:00 2001 From: Matthew Powers Date: Mon, 13 Nov 2023 09:39:29 -0500 Subject: [PATCH 17/23] Add docs on small file compaction with optimize --- .../small-file-compaction-with-optimize.md | 310 ++++++++++++++++++ mkdocs.yml | 1 + 2 files changed, 311 insertions(+) create mode 100644 docs/usage/small-file-compaction-with-optimize.md diff --git a/docs/usage/small-file-compaction-with-optimize.md b/docs/usage/small-file-compaction-with-optimize.md new file mode 100644 index 0000000000..ece15deea4 --- /dev/null +++ b/docs/usage/small-file-compaction-with-optimize.md @@ -0,0 +1,310 @@
+# Delta Lake small file compaction with optimize
+
+This post shows you how to perform small file compaction using the `optimize` method. This method was added to the `DeltaTable` class in version 0.9.0. It rearranges the small files into larger files, which reduces the number of files and speeds up queries.
+
+This is very helpful for workloads that append frequently. For example, if you have a table that is appended to every 10 minutes, after a year you will have 52,560 files in the table. If the table is partitioned by another dimension, you will have 52,560 files per partition; with just 100 unique values that's millions of files. By running `optimize` periodically, you can reduce the number of files in the table to a more manageable number.
+
+Typically, you will run optimize less frequently than you append data. If possible, you might run optimize once you know you have finished writing to a particular partition. For example, on a table partitioned by date, you might append data every 10 minutes, but only run optimize once a day at the end of the day. This will ensure you don't need to compact the same data twice.
+
+This section will also teach you how to use `vacuum` to physically remove files from storage that are no longer needed. You’ll often want to run vacuum after optimize to remove the small files from storage once they’ve been compacted into larger files.
+
+Let’s start with an example to explain these key concepts. All the code covered in this post is stored in [this notebook](https://github.com/delta-io/delta-examples/blob/master/notebooks/python-deltalake/deltalake_0_9_0.ipynb) in case you’d like to follow along.
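+
+The examples on this page rely on a handful of imports that are not repeated in each snippet. The exact import list is not shown here, so treat the block below as a sketch of what the code assumes rather than a copy of the notebook:
+
+```
+import itertools
+from datetime import datetime, timedelta
+
+import pyarrow as pa
+import pyarrow.compute as pc
+from deltalake import DeltaTable, write_deltalake
+```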
+
+## Create a Delta table with small files
+
+Let’s start by creating a Delta table with a lot of small files so we can demonstrate the usefulness of the `optimize` command.
+
+Start by writing a function that generates one thousand rows of random data given a timestamp.
+
+```
+def record_observations(date: datetime) -> pa.Table:
+    """Pulls data for a certain datetime"""
+    nrows = 1000
+    return pa.table(
+        {
+            "date": pa.array([date.date()] * nrows),
+            "timestamp": pa.array([date] * nrows),
+            "value": pc.random(nrows),
+        }
+    )
+```
+
+Let’s run this function and observe the output:
+
+```
+record_observations(datetime(2021, 1, 1, 12)).to_pandas()
+
+     date       timestamp            value
+0    2021-01-01 2021-01-01 12:00:00  0.3186397383362023
+1    2021-01-01 2021-01-01 12:00:00  0.04253766974259088
+2    2021-01-01 2021-01-01 12:00:00  0.9355682965171573
+…
+999  2021-01-01 2021-01-01 12:00:00  0.23207037062879843
+```
+
+Let’s write 100 hours’ worth of data to the Delta table.
+
+```
+# Every hour starting at midnight on 2021-01-01
+hours_iter = (datetime(2021, 1, 1) + timedelta(hours=i) for i in itertools.count())
+
+# Write 100 hours worth of data
+for timestamp in itertools.islice(hours_iter, 100):
+    write_deltalake(
+        "observation_data",
+        record_observations(timestamp),
+        partition_by=["date"],
+        mode="append",
+    )
+```
+
+This data was appended to the Delta table in 100 separate transactions, so the table will contain 100 transaction log entries and 100 data files. You can see the number of files with the `files()` method.
+
+```
+dt = DeltaTable("observation_data")
+len(dt.files()) # 100
+```
+
+Here’s how the files are persisted in storage.
+
+```
+observation_data
+├── _delta_log
+│   ├── 00000000000000000000.json
+│   ├── …
+│   └── 00000000000000000099.json
+├── date=2021-01-01
+│   ├── 0-cfe227c6-edd9-4369-a1b0-db4559a2e693-0.parquet
+│   ├── …
+│   └── 23-a4ace29e-e73e-40a1-81d3-0f5dc13093de-0.parquet
+├── date=2021-01-02
+│   ├── 24-9698b456-66eb-4075-8732-fe56d81edb60-0.parquet
+│   ├── …
+│   └── 47-d3fce527-e018-4c02-8acd-a649f6f523d2-0.parquet
+├── date=2021-01-03
+│   ├── 48-fd90a7fa-5a14-42ed-9f59-9fe48d87899d-0.parquet
+│   ├── …
+│   └── 71-5f143ade-8ae2-4854-bdc5-61154175665f-0.parquet
+├── date=2021-01-04
+│   ├── 72-477c10fe-dc09-4087-80f0-56006e4a7911-0.parquet
+│   ├── …
+│   └── 95-1c92cbce-8af4-4fe4-9c11-832245cf4d40-0.parquet
+└── date=2021-01-05
+    ├── 96-1b878ee5-25fd-431a-bc3e-6dcacc96b470-0.parquet
+    ├── …
+    └── 99-9650ed63-c195-433d-a86b-9469088c14ba-0.parquet
+```
+
+Each of these Parquet files is tiny - they’re only 10 KB. Let’s see how to compact these tiny files into larger files, which is more efficient for data queries.
+
+## Compact small files in the Delta table with optimize
+
+Let’s run the optimize command to compact the existing small files into larger files:
+
+```
+dt = DeltaTable("observation_data")
+
+dt.optimize()
+```
+
+Here’s the output of the command:
+
+```
+{'numFilesAdded': 5,
+ 'numFilesRemoved': 100,
+ 'filesAdded': {'min': 39000,
+  'max': 238282,
+  'avg': 198425.6,
+  'totalFiles': 5,
+  'totalSize': 992128},
+ 'filesRemoved': {'min': 10244,
+  'max': 10244,
+  'avg': 10244.0,
+  'totalFiles': 100,
+  'totalSize': 1024400},
+ 'partitionsOptimized': 5,
+ 'numBatches': 1,
+ 'totalConsideredFiles': 100,
+ 'totalFilesSkipped': 0,
+ 'preserveInsertionOrder': True}
+```
+
+The optimize operation has added 5 new files and marked 100 existing files for removal (this is also known as “tombstoning” files). It has compacted the 100 tiny files into 5 larger files.
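+
+If you want to sanity check the result, you can reload the table and count the files again. This check is not part of the original notebook, but it only uses the `DeltaTable` and `files()` APIs shown above:
+
+```
+dt = DeltaTable("observation_data")  # reload to pick up the optimize commit
+len(dt.files())  # 5 - one compacted file per date partition
+```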
+
+Let’s append some more data to the Delta table and see how we can selectively run optimize on the new data that’s added.
+
+## Handling incremental updates with optimize
+
+Let’s append another 24 hours of data to the Delta table:
+
+```
+for timestamp in itertools.islice(hours_iter, 24):
+    write_deltalake(
+        dt,
+        record_observations(timestamp),
+        partition_by=["date"],
+        mode="append",
+    )
+```
+
+We can use `get_add_actions()` to introspect the table state. We can see that `2021-01-06` has only a few hours of data so far, so we don't want to optimize that yet. But `2021-01-05` has all 24 hours of data, so it's ready to be optimized.
+
+```
+dt.get_add_actions(flatten=True).to_pandas()[
+    "partition.date"
+].value_counts().sort_index()
+
+2021-01-01     1
+2021-01-02     1
+2021-01-03     1
+2021-01-04     1
+2021-01-05    21
+2021-01-06     4
+```
+
+To optimize a single partition, you can pass in a `partition_filters` argument specifying which partitions to optimize.
+
+```
+dt.optimize(partition_filters=[("date", "=", "2021-01-05")])
+
+{'numFilesAdded': 1,
+ 'numFilesRemoved': 21,
+ 'filesAdded': {'min': 238282,
+  'max': 238282,
+  'avg': 238282.0,
+  'totalFiles': 1,
+  'totalSize': 238282},
+ 'filesRemoved': {'min': 10244,
+  'max': 39000,
+  'avg': 11613.333333333334,
+  'totalFiles': 21,
+  'totalSize': 243880},
+ 'partitionsOptimized': 1,
+ 'numBatches': 1,
+ 'totalConsideredFiles': 21,
+ 'totalFilesSkipped': 0,
+ 'preserveInsertionOrder': True}
+```
+
+This optimize operation tombstones 21 small data files and adds one file with all the existing data properly condensed. Let’s take a look at a portion of the `_delta_log/00000000000000000125.json` file, which is the transaction log entry that corresponds with this incremental optimize command.
+
+```
+{
+  "remove": {
+    "path": "date=2021-01-05/part-00000-41178aab-2491-488f-943d-8f03867295ee-c000.snappy.parquet",
+    "deletionTimestamp": 1683465499480,
+    "dataChange": false,
+    "extendedFileMetadata": null,
+    "partitionValues": {
+      "date": "2021-01-05"
+    },
+    "size": 39000,
+    "tags": null
+  }
+}
+
+{
+  "remove": {
+    "path": "date=2021-01-05/101-79ae6fc9-c0cc-49ec-bb94-9aba879ac949-0.parquet",
+    "deletionTimestamp": 1683465499481,
+    "dataChange": false,
+    "extendedFileMetadata": null,
+    "partitionValues": {
+      "date": "2021-01-05"
+    },
+    "size": 10244,
+    "tags": null
+  }
+}
+
+…
+
+{
+  "add": {
+    "path": "date=2021-01-05/part-00000-4b020a40-c836-4a11-851f-4691370c9f3a-c000.snappy.parquet",
+    "size": 238282,
+    "partitionValues": {
+      "date": "2021-01-05"
+    },
+    "modificationTime": 1683465499493,
+    "dataChange": false,
+    "stats": "{\"numRecords\":24000,\"minValues\":{\"value\":0.00005581532256615507,\"timestamp\":\"2021-01-05T00:00:00.000Z\"},\"maxValues\":{\"timestamp\":\"2021-01-05T23:00:00.000Z\",\"value\":0.9999911402868216},\"nullCount\":{\"timestamp\":0,\"value\":0}}",
+    "tags": null
+  }
+}
+```
+
+The transaction log indicates that many files have been tombstoned and one file is added, as expected.
+
+The Delta Lake optimize command “removes” data by marking the data files as removed in the transaction log. The optimize command doesn’t physically delete the Parquet file from storage. Optimize performs a “logical remove”, not a “physical remove”.
+
+Delta Lake uses logical operations so you can time travel back to earlier versions of your data. You can vacuum your Delta table to physically remove Parquet files from storage if you don’t need to time travel and don’t want to pay to store the tombstoned files.
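+
+Because the tombstoned files are still in storage, earlier versions of the table remain readable until you vacuum them. As a rough sketch (assuming version 99 is the last of the 100 initial append commits), you can load an older version like this:
+
+```
+# Time travel to the table as it was after the 100 initial appends
+dt_v99 = DeltaTable("observation_data", version=99)
+len(dt_v99.files())  # 100 - the small files are still readable at this version
+```
+
+Once vacuum physically deletes those files, time traveling to versions that reference them will no longer work.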
+
+## Vacuuming after optimizing
+
+The vacuum command deletes all files from storage that are marked for removal in the transaction log and are older than the retention period, which is 7 days by default.
+
+It’s normally a good idea to have a retention period of at least 7 days. For the purposes of this example, we will set the retention period to zero, just so you can see how the files get removed from storage. Adjusting the retention period in this manner isn’t recommended for production use cases.
+
+Let’s run the vacuum command:
+
+```
+dt.vacuum(retention_hours=0, enforce_retention_duration=False, dry_run=False)
+```
+
+The command returns a list of all the files that are removed from storage:
+
+```
+['date=2021-01-02/39-a98680f2-0e0e-4f26-a491-18b183f9eb05-0.parquet',
+ 'date=2021-01-02/41-e96bc8bb-c571-484c-b534-e897424fb7da-0.parquet',
+ …
+ 'date=2021-01-01/0-cfe227c6-edd9-4369-a1b0-db4559a2e693-0.parquet',
+ 'date=2021-01-01/18-ded53418-172b-4e40-bf2e-7c8142e71bd1-0.parquet']
+```
+
+Let’s look at the contents of the Delta table now that all the really small files have been removed from storage:
+
+```
+observation_data
+├── _delta_log
+│   ├── 00000000000000000000.json
+│   ├── 00000000000000000001.json
+│   ├── …
+│   ├── 00000000000000000124.json
+│   └── 00000000000000000125.json
+├── date=2021-01-01
+│   └── part-00000-31e3df5a-8bbe-425c-b85d-77794f922837-c000.snappy.parquet
+├── date=2021-01-02
+│   └── part-00000-8af07878-b179-49ce-a900-d58595ffb60a-c000.snappy.parquet
+├── date=2021-01-03
+│   └── part-00000-5e980864-b32f-4686-a58d-a75fae455c1e-c000.snappy.parquet
+├── date=2021-01-04
+│   └── part-00000-1e82d23b-084d-47e3-9790-d68289c39837-c000.snappy.parquet
+├── date=2021-01-05
+│   └── part-00000-4b020a40-c836-4a11-851f-4691370c9f3a-c000.snappy.parquet
+└── date=2021-01-06
+    ├── 121-0ecb5d70-4a28-4cd4-b2d2-89ee2285eaaa-0.parquet
+    ├── 122-6b2d2758-9154-4392-b287-fe371ee507ec-0.parquet
+    ├── 123-551d318f-4968-441f-83fc-89f98cd15daf-0.parquet
+    └── 124-287309d3-662e-449d-b4da-2e67b7cc0557-0.parquet
+```
+
+All the partitions only contain a single file now, except for the `date=2021-01-06` partition, which has not been compacted yet.
+
+An entire partition won’t necessarily get compacted to a single data file when optimize is run. Each partition has data files that are condensed to the target file size.
+
+## What causes the small file problem?
+
+Delta tables can accumulate small files for a variety of reasons:
+
+* User error: users can accidentally write files that are too small. Users should sometimes repartition in memory before writing to disk to avoid appending files that are too small.
+* Frequent appends: systems that append more often tend to produce more small files. A pipeline that appends every minute will generally generate ten times as many small files as a system that appends every ten minutes.
+* Appending to partitioned data lakes with high-cardinality columns can also cause small files. If you append every hour to a table that’s partitioned on a column with 1,000 distinct values, then every append could create 1,000 new files. Partitioning by date avoids this problem because the data isn’t split up across partitions in this manner.
+
+## Conclusion
+
+This page showed you how to create a Delta table with many small files, compact the small files into larger files with optimize, and remove the tombstoned files from storage with vacuum.
+
+You also learned how to incrementally optimize partitioned Delta tables, so you only compact newly added data.
+ +An excessive number of small files slows down Delta table queries, so periodic compaction is important. Make sure to properly maintain your Delta tables, so performance does not degrade over time. diff --git a/mkdocs.yml b/mkdocs.yml index 9ad8d6a4a2..7fe08d77a1 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -24,6 +24,7 @@ nav: - Querying a Delta Table: usage/querying-delta-tables.md - Managing a Delta Table: usage/managing-tables.md - Writing Delta Tables: usage/writing-delta-tables.md + - Small file compaction: usage/small-file-compaction-with-optimize.md - API Reference: - api/delta_table.md - api/schema.md From daa700eadaa2a6cc968d0b63cf4c5e7cfd65fc55 Mon Sep 17 00:00:00 2001 From: Matthew Powers Date: Fri, 17 Nov 2023 19:19:58 -0500 Subject: [PATCH 18/23] Add Rust installation instructions --- docs/{ => usage}/installation.md | 8 ++++++++ mkdocs.yml | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) rename docs/{ => usage}/installation.md (65%) diff --git a/docs/installation.md b/docs/usage/installation.md similarity index 65% rename from docs/installation.md rename to docs/usage/installation.md index 4bd3ffa3ce..70f75c3f78 100644 --- a/docs/installation.md +++ b/docs/usage/installation.md @@ -1,5 +1,7 @@ # Installation +The `deltalake` project can be installed via Pip for Python or Cargo for Rust. + ## Using Pip ``` bash @@ -7,3 +9,9 @@ pip install deltalake ``` NOTE: official binary wheels are linked against openssl statically for remote objection store communication. Please file Github issue to request for critical openssl upgrade. + +## Using Cargo + +``` bash +cargo add deltalake +``` \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 7fe08d77a1..41f0ee309c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -16,8 +16,8 @@ theme: - content.tabs.link nav: - Home: index.md - - Installation: installation.md - Usage: + - Installation: usage/installation.md - Overview: usage/index.md - Loading a Delta Table: usage/loading-table.md - Examining a Delta Table: usage/examining-table.md From dd6b45362a14c0f127b32c4b81afc15d17f710d5 Mon Sep 17 00:00:00 2001 From: Robert Pack Date: Sat, 18 Nov 2023 12:25:42 +0100 Subject: [PATCH 19/23] refactor: express log schema in delta types --- .../deltalake-core/src/kernel/actions/mod.rs | 10 +- .../src/kernel/actions/schemas.rs | 481 ++++---- .../src/kernel/actions/types.rs | 125 +- .../kernel/{actions/arrow.rs => arrow/mod.rs} | 17 +- .../src/kernel/arrow/schemas.rs | 63 + crates/deltalake-core/src/kernel/error.rs | 3 +- crates/deltalake-core/src/kernel/mod.rs | 2 + .../src/protocol/checkpoints.rs | 2 +- .../src/schema/arrow_convert.rs | 1049 ----------------- crates/deltalake-core/src/table/config.rs | 56 + 10 files changed, 466 insertions(+), 1342 deletions(-) rename crates/deltalake-core/src/kernel/{actions/arrow.rs => arrow/mod.rs} (98%) create mode 100644 crates/deltalake-core/src/kernel/arrow/schemas.rs delete mode 100644 crates/deltalake-core/src/schema/arrow_convert.rs diff --git a/crates/deltalake-core/src/kernel/actions/mod.rs b/crates/deltalake-core/src/kernel/actions/mod.rs index 865c9d3cd9..637d520c41 100644 --- a/crates/deltalake-core/src/kernel/actions/mod.rs +++ b/crates/deltalake-core/src/kernel/actions/mod.rs @@ -7,9 +7,7 @@ use std::collections::HashMap; use serde::{Deserialize, Serialize}; -#[cfg(all(feature = "arrow", feature = "parquet"))] -pub(crate) mod arrow; -// pub(crate) mod schemas; +pub(crate) mod schemas; mod serde_path; pub(crate) mod types; @@ -32,10 +30,12 @@ pub enum ActionType { Protocol, /// modify 
the data in a table by removing individual logical files Remove, - /// The Row ID high-water mark tracks the largest ID that has been assigned to a row in the table. - RowIdHighWaterMark, /// Transactional information Txn, + /// Checkpoint metadata + CheckpointMetadata, + /// Sidecar + Sidecar, } #[derive(Debug, PartialEq, Eq, Clone, Deserialize, Serialize)] diff --git a/crates/deltalake-core/src/kernel/actions/schemas.rs b/crates/deltalake-core/src/kernel/actions/schemas.rs index 0cc870318f..ad3e3ccbad 100644 --- a/crates/deltalake-core/src/kernel/actions/schemas.rs +++ b/crates/deltalake-core/src/kernel/actions/schemas.rs @@ -1,255 +1,262 @@ -use std::sync::Arc; +//! Schema definitions for action types -use arrow_schema::{DataType, Field, Fields, Schema}; +use lazy_static::lazy_static; use super::ActionType; +use crate::kernel::schema::{ArrayType, DataType, MapType, StructField, StructType}; -impl ActionType { - /// Returns the root field for the action type - pub fn field(&self) -> Field { - match self { - Self::Add => get_root("add", self.fields()), - Self::Cdc => get_root("cdc", self.fields()), - Self::CommitInfo => get_root("commitInfo", self.fields()), - Self::DomainMetadata => get_root("domainMetadata", self.fields()), - Self::Metadata => get_root("metaData", self.fields()), - Self::Protocol => get_root("protocol", self.fields()), - Self::Remove => get_root("remove", self.fields()), - Self::RowIdHighWaterMark => get_root("rowIdHighWaterMark", self.fields()), - Self::Txn => get_root("txn", self.fields()), - } - } - - /// Returns the child fields for the action type - pub fn fields(&self) -> Vec { - match self { - Self::Add => add_fields(), - Self::Cdc => cdc_fields(), - Self::CommitInfo => commit_info_fields(), - Self::DomainMetadata => domain_metadata_fields(), - Self::Metadata => metadata_fields(), - Self::Protocol => protocol_fields(), - Self::Remove => remove_fields(), - Self::RowIdHighWaterMark => watermark_fields(), - Self::Txn => txn_fields(), - } - } -} - -/// Returns the schema for the delta log -pub fn get_log_schema() -> Schema { - Schema { - fields: Fields::from_iter([ - ActionType::Add.field(), - ActionType::Cdc.field(), - ActionType::CommitInfo.field(), - ActionType::DomainMetadata.field(), - ActionType::Metadata.field(), - ActionType::Protocol.field(), - ActionType::Remove.field(), - ActionType::RowIdHighWaterMark.field(), - ActionType::Txn.field(), - ]), - metadata: Default::default(), - } -} - -fn get_root(name: &str, fields: Vec) -> Field { - Field::new(name, DataType::Struct(Fields::from_iter(fields)), true) -} - -fn add_fields() -> Vec { - Vec::from_iter([ - Field::new("path", DataType::Utf8, false), - Field::new("size", DataType::Int64, false), - Field::new("modificationTime", DataType::Int64, false), - Field::new("dataChange", DataType::Boolean, false), - Field::new("stats", DataType::Utf8, true), - Field::new( - "partitionValues", - DataType::Map(Arc::new(get_map_field()), false), - true, - ), - Field::new( - "tags", - DataType::Map(Arc::new(get_map_field()), false), - true, - ), - Field::new( - "deletionVector", - DataType::Struct(Fields::from(vec![ - Field::new("storageType", DataType::Utf8, false), - Field::new("pathOrInlineDv", DataType::Utf8, false), - Field::new("offset", DataType::Int32, true), - Field::new("sizeInBytes", DataType::Int32, false), - Field::new("cardinality", DataType::Int64, false), - ])), - true, - ), - Field::new("baseRowId", DataType::Int64, true), - Field::new("defaultRowCommitVersion", DataType::Int64, true), - ]) -} - -fn 
cdc_fields() -> Vec { - Vec::from_iter([ - Field::new("path", DataType::Utf8, true), - Field::new( - "partitionValues", - DataType::Map(Arc::new(get_map_field()), false), - true, - ), - Field::new("size", DataType::Int64, true), - Field::new("dataChange", DataType::Boolean, true), - Field::new( - "tags", - DataType::Map(Arc::new(get_map_field()), false), - true, - ), - ]) -} - -fn remove_fields() -> Vec { - Vec::from_iter([ - Field::new("path", DataType::Utf8, true), - Field::new("deletionTimestamp", DataType::Int64, true), - Field::new("dataChange", DataType::Boolean, true), - Field::new("extendedFileMetadata", DataType::Boolean, true), - Field::new("size", DataType::Int64, true), - Field::new( - "partitionValues", - DataType::Map(Arc::new(get_map_field()), false), - true, - ), - Field::new( - "tags", - DataType::Map(Arc::new(get_map_field()), false), - true, - ), - ]) -} - -fn metadata_fields() -> Vec { - Vec::from_iter([ - Field::new("id", DataType::Utf8, false), - Field::new("name", DataType::Utf8, true), - Field::new("description", DataType::Utf8, true), - Field::new( - "format", - DataType::Struct(Fields::from_iter([ - Field::new("provider", DataType::Utf8, true), - Field::new( - "options", - DataType::Map( - Arc::new(Field::new( - "key_value", - DataType::Struct(Fields::from_iter([ - Field::new("key", DataType::Utf8, false), - Field::new("value", DataType::Utf8, true), - ])), - false, - )), - false, +lazy_static! { + // https://github.com/delta-io/delta/blob/master/PROTOCOL.md#change-metadata + static ref METADATA_FIELD: StructField = StructField::new( + "metaData", + DataType::Struct(Box::new(StructType::new(vec![ + StructField::new("id", DataType::string(), false), + StructField::new("name", DataType::string(), true), + StructField::new("description", DataType::string(), true), + StructField::new( + "format", + DataType::Struct(Box::new(StructType::new(vec![ + StructField::new("provider", DataType::string(), false), + StructField::new( + "configuration", + DataType::Map(Box::new(MapType::new( + DataType::string(), + DataType::string(), + true, + ))), + true, ), - false, - ), - ])), - false, - ), - Field::new("schemaString", DataType::Utf8, false), - Field::new("createdTime", DataType::Int64, true), - Field::new( - "partitionColumns", - DataType::List(Arc::new(Field::new("element", DataType::Utf8, false))), - false, - ), - Field::new( - "configuration", - DataType::Map( - Arc::new(Field::new( - "key_value", - DataType::Struct(Fields::from_iter([ - Field::new("key", DataType::Utf8, false), - Field::new("value", DataType::Utf8, true), - ])), - false, - )), + ]))), false, ), - true, - ), - ]) + StructField::new("schemaString", DataType::string(), false), + StructField::new( + "partitionColumns", + DataType::Array(Box::new(ArrayType::new(DataType::string(), false))), + false, + ), + StructField::new("createdTime", DataType::long(), true), + StructField::new( + "configuration", + DataType::Map(Box::new(MapType::new( + DataType::string(), + DataType::string(), + true, + ))), + false, + ), + ]))), + true, + ); + // https://github.com/delta-io/delta/blob/master/PROTOCOL.md#protocol-evolution + static ref PROTOCOL_FIELD: StructField = StructField::new( + "protocol", + DataType::Struct(Box::new(StructType::new(vec![ + StructField::new("minReaderVersion", DataType::integer(), false), + StructField::new("minWriterVersion", DataType::integer(), false), + StructField::new( + "readerFeatures", + DataType::Array(Box::new(ArrayType::new(DataType::string(), false))), + true, + ), + StructField::new( 
+ "writerFeatures", + DataType::Array(Box::new(ArrayType::new(DataType::string(), false))), + true, + ), + ]))), + true, + ); + // https://github.com/delta-io/delta/blob/master/PROTOCOL.md#commit-provenance-information + static ref COMMIT_INFO_FIELD: StructField = StructField::new( + "commitInfo", + DataType::Struct(Box::new(StructType::new(vec![ + StructField::new("timestamp", DataType::timestamp(), false), + StructField::new("operation", DataType::string(), false), + StructField::new("isolationLevel", DataType::string(), true), + StructField::new("isBlindAppend", DataType::boolean(), true), + StructField::new("txnId", DataType::string(), true), + StructField::new("readVersion", DataType::long(), true), + StructField::new( + "operationParameters", + DataType::Map(Box::new(MapType::new( + DataType::string(), + DataType::string(), + true, + ))), + true, + ), + StructField::new( + "operationMetrics", + DataType::Map(Box::new(MapType::new( + DataType::string(), + DataType::string(), + true, + ))), + true, + ), + ]))), + true, + ); + // https://github.com/delta-io/delta/blob/master/PROTOCOL.md#add-file-and-remove-file + static ref ADD_FIELD: StructField = StructField::new( + "add", + DataType::Struct(Box::new(StructType::new(vec![ + StructField::new("path", DataType::string(), false), + partition_values_field(), + StructField::new("size", DataType::long(), false), + StructField::new("modificationTime", DataType::timestamp(), false), + StructField::new("dataChange", DataType::boolean(), false), + StructField::new("stats", DataType::string(), true), + tags_field(), + deletion_vector_field(), + StructField::new("baseRowId", DataType::long(), true), + StructField::new("defaultRowCommitVersion", DataType::long(), true), + ]))), + true, + ); + // https://github.com/delta-io/delta/blob/master/PROTOCOL.md#add-file-and-remove-file + static ref REMOVE_FIELD: StructField = StructField::new( + "remove", + DataType::Struct(Box::new(StructType::new(vec![ + StructField::new("path", DataType::string(), false), + StructField::new("deletionTimestamp", DataType::timestamp(), true), + StructField::new("dataChange", DataType::boolean(), false), + StructField::new("extendedFileMetadata", DataType::boolean(), true), + partition_values_field(), + StructField::new("size", DataType::long(), true), + StructField::new("stats", DataType::string(), true), + tags_field(), + deletion_vector_field(), + StructField::new("baseRowId", DataType::long(), true), + StructField::new("defaultRowCommitVersion", DataType::long(), true), + ]))), + true, + ); + static ref REMOVE_FIELD_CHECKPOINT: StructField = StructField::new( + "remove", + DataType::Struct(Box::new(StructType::new(vec![ + StructField::new("path", DataType::string(), false), + StructField::new("deletionTimestamp", DataType::timestamp(), true), + StructField::new("dataChange", DataType::boolean(), false), + ]))), + true, + ); + // https://github.com/delta-io/delta/blob/master/PROTOCOL.md#add-cdc-file + static ref CDC_FIELD: StructField = StructField::new( + "cdc", + DataType::Struct(Box::new(StructType::new(vec![ + StructField::new("path", DataType::string(), false), + partition_values_field(), + StructField::new("size", DataType::long(), false), + StructField::new("dataChange", DataType::boolean(), false), + tags_field(), + ]))), + true, + ); + // https://github.com/delta-io/delta/blob/master/PROTOCOL.md#transaction-identifiers + static ref TXN_FIELD: StructField = StructField::new( + "txn", + DataType::Struct(Box::new(StructType::new(vec![ + StructField::new("appId", 
DataType::string(), false), + StructField::new("version", DataType::long(), false), + StructField::new("lastUpdated", DataType::timestamp(), true), + ]))), + true, + ); + // https://github.com/delta-io/delta/blob/master/PROTOCOL.md#domain-metadata + static ref DOMAIN_METADATA_FIELD: StructField = StructField::new( + "domainMetadata", + DataType::Struct(Box::new(StructType::new(vec![ + StructField::new("domain", DataType::string(), false), + StructField::new( + "configuration", + DataType::Map(Box::new(MapType::new( + DataType::string(), + DataType::string(), + true, + ))), + false, + ), + StructField::new("removed", DataType::boolean(), false), + ]))), + true, + ); + // https://github.com/delta-io/delta/blob/master/PROTOCOL.md#checkpoint-metadata + static ref CHECKPOINT_METADATA_FIELD: StructField = StructField::new( + "checkpointMetadata", + DataType::Struct(Box::new(StructType::new(vec![ + StructField::new("flavor", DataType::string(), false), + tags_field(), + ]))), + true, + ); + // https://github.com/delta-io/delta/blob/master/PROTOCOL.md#sidecar-file-information + static ref SIDECAR_FIELD: StructField = StructField::new( + "sidecar", + DataType::Struct(Box::new(StructType::new(vec![ + StructField::new("path", DataType::string(), false), + StructField::new("sizeInBytes", DataType::long(), false), + StructField::new("modificationTime", DataType::timestamp(), false), + StructField::new("type", DataType::string(), false), + tags_field(), + ]))), + true, + ); } -fn protocol_fields() -> Vec { - Vec::from_iter([ - Field::new("minReaderVersion", DataType::Int32, false), - Field::new("minWriterVersion", DataType::Int32, false), - Field::new( - "readerFeatures", - DataType::List(Arc::new(Field::new("element", DataType::Utf8, false))), +fn tags_field() -> StructField { + StructField::new( + "tags", + DataType::Map(Box::new(MapType::new( + DataType::string(), + DataType::string(), true, - ), - Field::new( - "writerFeatures", - DataType::List(Arc::new(Field::new("element", DataType::Utf8, false))), - true, - ), - ]) -} - -fn txn_fields() -> Vec { - Vec::from_iter([ - Field::new("appId", DataType::Utf8, true), - Field::new("version", DataType::Int64, true), - Field::new("lastUpdated", DataType::Int64, true), - ]) -} - -fn watermark_fields() -> Vec { - Vec::from_iter([Field::new("highWaterMark", DataType::Int64, true)]) + ))), + true, + ) } -fn commit_info_fields() -> Vec { - Vec::from_iter([ - Field::new("timestamp", DataType::Int64, true), - Field::new("operation", DataType::Utf8, true), - Field::new("isolationLevel", DataType::Utf8, true), - Field::new("isBlindAppend", DataType::Boolean, true), - Field::new("txnId", DataType::Utf8, true), - Field::new("readVersion", DataType::Int32, true), - Field::new( - "operationParameters", - DataType::Map(Arc::new(get_map_field()), false), - true, - ), - Field::new( - "operationMetrics", - DataType::Map(Arc::new(get_map_field()), false), +fn partition_values_field() -> StructField { + StructField::new( + "partitionValues", + DataType::Map(Box::new(MapType::new( + DataType::string(), + DataType::string(), true, - ), - ]) + ))), + false, + ) } -fn domain_metadata_fields() -> Vec { - Vec::from_iter([ - Field::new("domain", DataType::Utf8, true), - Field::new( - "configuration", - DataType::Map(Arc::new(get_map_field()), false), - true, - ), - Field::new("removed", DataType::Boolean, true), - ]) +fn deletion_vector_field() -> StructField { + StructField::new( + "deletionVector", + DataType::Struct(Box::new(StructType::new(vec![ + StructField::new("storageType", 
DataType::string(), false), + StructField::new("pathOrInlineDv", DataType::string(), false), + StructField::new("offset", DataType::integer(), true), + StructField::new("sizeInBytes", DataType::integer(), false), + StructField::new("cardinality", DataType::long(), false), + ]))), + true, + ) } -fn get_map_field() -> Field { - Field::new( - "key_value", - DataType::Struct(Fields::from_iter([ - Field::new("key", DataType::Utf8, false), - Field::new("value", DataType::Utf8, true), - ])), - false, - ) +impl ActionType { + /// Returns the type of the corresponding field in the delta log schema + pub fn schema_field(&self) -> &StructField { + match self { + Self::Metadata => &METADATA_FIELD, + Self::Protocol => &PROTOCOL_FIELD, + Self::CommitInfo => &COMMIT_INFO_FIELD, + Self::Add => &ADD_FIELD, + Self::Remove => &REMOVE_FIELD, + Self::Cdc => &CDC_FIELD, + Self::Txn => &TXN_FIELD, + Self::DomainMetadata => &DOMAIN_METADATA_FIELD, + Self::CheckpointMetadata => &CHECKPOINT_METADATA_FIELD, + Self::Sidecar => &SIDECAR_FIELD, + } + } } diff --git a/crates/deltalake-core/src/kernel/actions/types.rs b/crates/deltalake-core/src/kernel/actions/types.rs index a788315b82..aa60823e4a 100644 --- a/crates/deltalake-core/src/kernel/actions/types.rs +++ b/crates/deltalake-core/src/kernel/actions/types.rs @@ -174,7 +174,7 @@ pub enum ReaderFeatures { /// Mapping of one column to another ColumnMapping, /// Deletion vectors for merge, update, delete - DeleteionVecotrs, + DeletionVectors, /// timestamps without timezone support #[serde(alias = "timestampNtz")] TimestampWithoutTimezone, @@ -185,26 +185,13 @@ pub enum ReaderFeatures { Other(String), } -#[allow(clippy::from_over_into)] -impl Into for ReaderFeatures { - fn into(self) -> usize { - match self { - ReaderFeatures::Other(_) => 0, - ReaderFeatures::ColumnMapping => 2, - ReaderFeatures::DeleteionVecotrs - | ReaderFeatures::TimestampWithoutTimezone - | ReaderFeatures::V2Checkpoint => 3, - } - } -} - #[cfg(all(not(feature = "parquet2"), feature = "parquet"))] impl From<&parquet::record::Field> for ReaderFeatures { fn from(value: &parquet::record::Field) -> Self { match value { parquet::record::Field::Str(feature) => match feature.as_str() { "columnMapping" => ReaderFeatures::ColumnMapping, - "deletionVectors" => ReaderFeatures::DeleteionVecotrs, + "deletionVectors" => ReaderFeatures::DeletionVectors, "timestampNtz" => ReaderFeatures::TimestampWithoutTimezone, "v2Checkpoint" => ReaderFeatures::V2Checkpoint, f => ReaderFeatures::Other(f.to_string()), @@ -216,9 +203,15 @@ impl From<&parquet::record::Field> for ReaderFeatures { impl From for ReaderFeatures { fn from(value: String) -> Self { - match value.as_str() { + value.as_str().into() + } +} + +impl From<&str> for ReaderFeatures { + fn from(value: &str) -> Self { + match value { "columnMapping" => ReaderFeatures::ColumnMapping, - "deletionVectors" => ReaderFeatures::DeleteionVecotrs, + "deletionVectors" => ReaderFeatures::DeletionVectors, "timestampNtz" => ReaderFeatures::TimestampWithoutTimezone, "v2Checkpoint" => ReaderFeatures::V2Checkpoint, f => ReaderFeatures::Other(f.to_string()), @@ -230,7 +223,7 @@ impl AsRef for ReaderFeatures { fn as_ref(&self) -> &str { match self { ReaderFeatures::ColumnMapping => "columnMapping", - ReaderFeatures::DeleteionVecotrs => "deletionVectors", + ReaderFeatures::DeletionVectors => "deletionVectors", ReaderFeatures::TimestampWithoutTimezone => "timestampNtz", ReaderFeatures::V2Checkpoint => "v2Checkpoint", ReaderFeatures::Other(f) => f, @@ -264,7 +257,7 @@ pub enum 
WriterFeatures { /// ID Columns IdentityColumns, /// Deletion vectors for merge, update, delete - DeleteionVecotrs, + DeletionVectors, /// Row tracking on tables RowTracking, /// timestamps without timezone support @@ -281,29 +274,15 @@ pub enum WriterFeatures { Other(String), } -#[allow(clippy::from_over_into)] -impl Into for WriterFeatures { - fn into(self) -> usize { - match self { - WriterFeatures::Other(_) => 0, - WriterFeatures::AppendOnly | WriterFeatures::Invariants => 2, - WriterFeatures::CheckConstraints => 3, - WriterFeatures::ChangeDataFeed | WriterFeatures::GeneratedColumns => 4, - WriterFeatures::ColumnMapping => 5, - WriterFeatures::IdentityColumns - | WriterFeatures::DeleteionVecotrs - | WriterFeatures::RowTracking - | WriterFeatures::TimestampWithoutTimezone - | WriterFeatures::DomainMetadata - | WriterFeatures::V2Checkpoint - | WriterFeatures::IcebergCompatV1 => 7, - } +impl From for WriterFeatures { + fn from(value: String) -> Self { + value.as_str().into() } } -impl From for WriterFeatures { - fn from(value: String) -> Self { - match value.as_str() { +impl From<&str> for WriterFeatures { + fn from(value: &str) -> Self { + match value { "appendOnly" => WriterFeatures::AppendOnly, "invariants" => WriterFeatures::Invariants, "checkConstraints" => WriterFeatures::CheckConstraints, @@ -311,7 +290,7 @@ impl From for WriterFeatures { "generatedColumns" => WriterFeatures::GeneratedColumns, "columnMapping" => WriterFeatures::ColumnMapping, "identityColumns" => WriterFeatures::IdentityColumns, - "deletionVectors" => WriterFeatures::DeleteionVecotrs, + "deletionVectors" => WriterFeatures::DeletionVectors, "rowTracking" => WriterFeatures::RowTracking, "timestampNtz" => WriterFeatures::TimestampWithoutTimezone, "domainMetadata" => WriterFeatures::DomainMetadata, @@ -332,7 +311,7 @@ impl AsRef for WriterFeatures { WriterFeatures::GeneratedColumns => "generatedColumns", WriterFeatures::ColumnMapping => "columnMapping", WriterFeatures::IdentityColumns => "identityColumns", - WriterFeatures::DeleteionVecotrs => "deletionVectors", + WriterFeatures::DeletionVectors => "deletionVectors", WriterFeatures::RowTracking => "rowTracking", WriterFeatures::TimestampWithoutTimezone => "timestampNtz", WriterFeatures::DomainMetadata => "domainMetadata", @@ -361,7 +340,7 @@ impl From<&parquet::record::Field> for WriterFeatures { "generatedColumns" => WriterFeatures::GeneratedColumns, "columnMapping" => WriterFeatures::ColumnMapping, "identityColumns" => WriterFeatures::IdentityColumns, - "deletionVectors" => WriterFeatures::DeleteionVecotrs, + "deletionVectors" => WriterFeatures::DeletionVectors, "rowTracking" => WriterFeatures::RowTracking, "timestampNtz" => WriterFeatures::TimestampWithoutTimezone, "domainMetadata" => WriterFeatures::DomainMetadata, @@ -421,7 +400,7 @@ impl AsRef for StorageType { impl ToString for StorageType { fn to_string(&self) -> String { - self.as_ref().to_string() + self.as_ref().into() } } @@ -450,6 +429,7 @@ pub struct DeletionVectorDescriptor { /// Start of the data for this DV in number of bytes from the beginning of the file it is stored in. /// Always None (absent in JSON) when `storageType = 'i'`. + #[serde(skip_serializing_if = "Option::is_none")] pub offset: Option, /// Size of the serialized DV in bytes (raw data size, i.e. before base85 encoding, if inline). @@ -662,9 +642,11 @@ pub struct Remove { pub data_change: bool, /// The time this logical file was created, as milliseconds since the epoch. 
+ #[serde(skip_serializing_if = "Option::is_none")] pub deletion_timestamp: Option, /// When true the fields `partition_values`, `size`, and `tags` are present + #[serde(skip_serializing_if = "Option::is_none")] pub extended_file_metadata: Option, /// A map from partition column to value for this logical file. @@ -686,9 +668,11 @@ pub struct Remove { /// Default generated Row ID of the first row in the file. The default generated Row IDs /// of the other rows in the file can be reconstructed by adding the physical index of the /// row within the file to the base Row ID + #[serde(skip_serializing_if = "Option::is_none")] pub base_row_id: Option, /// First commit version in which an add action with the same path was committed to the table. + #[serde(skip_serializing_if = "Option::is_none")] pub default_row_commit_version: Option, } @@ -707,13 +691,18 @@ pub struct AddCDCFile { /// absolute path to a CDC file #[serde(with = "serde_path")] pub path: String, + /// The size of this file in bytes pub size: i64, + /// A map from partition column to value for this file pub partition_values: HashMap>, + /// Should always be set to false because they do not change the underlying data of the table pub data_change: bool, + /// Map containing metadata about this file + #[serde(skip_serializing_if = "Option::is_none")] pub tags: Option>>, } @@ -724,9 +713,12 @@ pub struct AddCDCFile { pub struct Txn { /// A unique identifier for the application performing the transaction. pub app_id: String, + /// An application-specific numeric identifier for this transaction. pub version: i64, + /// The time when this transaction action was created in milliseconds since the Unix epoch. + #[serde(skip_serializing_if = "Option::is_none")] pub last_updated: Option, } @@ -739,30 +731,39 @@ pub struct CommitInfo { /// Timestamp in millis when the commit was created #[serde(skip_serializing_if = "Option::is_none")] pub timestamp: Option, + /// Id of the user invoking the commit #[serde(skip_serializing_if = "Option::is_none")] pub user_id: Option, + /// Name of the user invoking the commit #[serde(skip_serializing_if = "Option::is_none")] pub user_name: Option, + /// The operation performed during the #[serde(skip_serializing_if = "Option::is_none")] pub operation: Option, + /// Parameters used for table operation #[serde(skip_serializing_if = "Option::is_none")] pub operation_parameters: Option>, + /// Version of the table when the operation was started #[serde(skip_serializing_if = "Option::is_none")] pub read_version: Option, + /// The isolation level of the commit #[serde(skip_serializing_if = "Option::is_none")] pub isolation_level: Option, + /// TODO #[serde(skip_serializing_if = "Option::is_none")] pub is_blind_append: Option, + /// Delta engine which created the commit. #[serde(skip_serializing_if = "Option::is_none")] pub engine_info: Option, + /// Additional provenance information for the commit #[serde(flatten, default)] pub info: HashMap, @@ -774,12 +775,50 @@ pub struct CommitInfo { pub struct DomainMetadata { /// Identifier for this domain (system or user-provided) pub domain: String, + /// String containing configuration for the metadata domain pub configuration: String, + /// When `true` the action serves as a tombstone pub removed: bool, } +#[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq, Eq)] +/// This action is only allowed in checkpoints following V2 spec. It describes the details about the checkpoint. +pub struct CheckpointMetadata { + /// The flavor of the V2 checkpoint. 
Allowed values: "flat". + pub flavor: String, + + /// Map containing any additional metadata about the v2 spec checkpoint. + #[serde(skip_serializing_if = "Option::is_none")] + pub tags: Option>>, +} + +/// The sidecar action references a sidecar file which provides some of the checkpoint's file actions. +/// This action is only allowed in checkpoints following V2 spec. +#[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq, Eq)] +#[serde(rename_all = "camelCase")] +pub struct Sidecar { + /// The name of the sidecar file (not a path). + /// The file must reside in the _delta_log/_sidecars directory. + pub file_name: String, + + /// The size of the sidecar file in bytes + pub size_in_bytes: i64, + + /// The time this sidecar file was created, as milliseconds since the epoch. + pub modification_time: i64, + + /// Type of sidecar. Valid values are: "fileaction". + /// This could be extended in future to allow different kinds of sidecars. + #[serde(rename = "type")] + pub sidecar_type: String, + + /// Map containing any additional metadata about the checkpoint sidecar file. + #[serde(skip_serializing_if = "Option::is_none")] + pub tags: Option>>, +} + #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] /// The isolation level applied during transaction pub enum IsolationLevel { diff --git a/crates/deltalake-core/src/kernel/actions/arrow.rs b/crates/deltalake-core/src/kernel/arrow/mod.rs similarity index 98% rename from crates/deltalake-core/src/kernel/actions/arrow.rs rename to crates/deltalake-core/src/kernel/arrow/mod.rs index d292362604..0c89f6ab48 100644 --- a/crates/deltalake-core/src/kernel/actions/arrow.rs +++ b/crates/deltalake-core/src/kernel/arrow/mod.rs @@ -1,3 +1,5 @@ +//! Conversions between Delta and Arrow data types + use std::sync::Arc; use arrow_schema::{ @@ -6,7 +8,12 @@ use arrow_schema::{ }; use lazy_static::lazy_static; -use super::super::schema::{ArrayType, DataType, MapType, PrimitiveType, StructField, StructType}; +use super::schema::{ArrayType, DataType, MapType, PrimitiveType, StructField, StructType}; + +pub mod schemas; + +const MAP_KEYS_NAME: &str = "keys"; +const MAP_VALUES_NAME: &str = "values"; impl TryFrom<&StructType> for ArrowSchema { type Error = ArrowError; @@ -64,9 +71,9 @@ impl TryFrom<&MapType> for ArrowField { "entries", ArrowDataType::Struct( vec![ - ArrowField::new("key", ArrowDataType::try_from(a.key_type())?, false), + ArrowField::new(MAP_KEYS_NAME, ArrowDataType::try_from(a.key_type())?, false), ArrowField::new( - "value", + MAP_VALUES_NAME, ArrowDataType::try_from(a.value_type())?, a.value_contains_null(), ), @@ -143,12 +150,12 @@ impl TryFrom<&DataType> for ArrowDataType { ArrowDataType::Struct( vec![ ArrowField::new( - "keys", + MAP_KEYS_NAME, >::try_from(m.key_type())?, false, ), ArrowField::new( - "values", + MAP_VALUES_NAME, >::try_from(m.value_type())?, m.value_contains_null(), ), diff --git a/crates/deltalake-core/src/kernel/arrow/schemas.rs b/crates/deltalake-core/src/kernel/arrow/schemas.rs new file mode 100644 index 0000000000..80a29e065e --- /dev/null +++ b/crates/deltalake-core/src/kernel/arrow/schemas.rs @@ -0,0 +1,63 @@ +//! Arrow schemas for the delta log + +use arrow_schema::{Field, Fields, Schema}; +use lazy_static::lazy_static; + +use super::super::ActionType; + +lazy_static! 
{ + static ref ARROW_METADATA_FIELD: Field = + ActionType::Metadata.schema_field().try_into().unwrap(); + static ref ARROW_PROTOCOL_FIELD: Field = + ActionType::Protocol.schema_field().try_into().unwrap(); + static ref ARROW_COMMIT_INFO_FIELD: Field = + ActionType::CommitInfo.schema_field().try_into().unwrap(); + static ref ARROW_ADD_FIELD: Field = ActionType::Add.schema_field().try_into().unwrap(); + static ref ARROW_REMOVE_FIELD: Field = ActionType::Remove.schema_field().try_into().unwrap(); + static ref ARROW_CDC_FIELD: Field = ActionType::Cdc.schema_field().try_into().unwrap(); + static ref ARROW_TXN_FIELD: Field = ActionType::Txn.schema_field().try_into().unwrap(); + static ref ARROW_DOMAIN_METADATA_FIELD: Field = ActionType::DomainMetadata + .schema_field() + .try_into() + .unwrap(); + static ref ARROW_CHECKPOINT_METADATA_FIELD: Field = ActionType::CheckpointMetadata + .schema_field() + .try_into() + .unwrap(); + static ref ARROW_SIDECAR_FIELD: Field = ActionType::Sidecar.schema_field().try_into().unwrap(); +} + +impl ActionType { + /// Returns the root field for the action type + pub fn arrow_field(&self) -> &Field { + match self { + Self::Metadata => &ARROW_METADATA_FIELD, + Self::Protocol => &ARROW_PROTOCOL_FIELD, + Self::CommitInfo => &ARROW_COMMIT_INFO_FIELD, + Self::Add => &ARROW_ADD_FIELD, + Self::Remove => &ARROW_REMOVE_FIELD, + Self::Cdc => &ARROW_CDC_FIELD, + Self::Txn => &ARROW_TXN_FIELD, + Self::DomainMetadata => &ARROW_DOMAIN_METADATA_FIELD, + Self::CheckpointMetadata => &ARROW_CHECKPOINT_METADATA_FIELD, + Self::Sidecar => &ARROW_SIDECAR_FIELD, + } + } +} + +/// Returns the schema for the delta log +pub fn get_log_schema() -> Schema { + Schema { + fields: Fields::from_iter([ + ActionType::Add.arrow_field().clone(), + ActionType::Cdc.arrow_field().clone(), + ActionType::CommitInfo.arrow_field().clone(), + ActionType::DomainMetadata.arrow_field().clone(), + ActionType::Metadata.arrow_field().clone(), + ActionType::Protocol.arrow_field().clone(), + ActionType::Remove.arrow_field().clone(), + ActionType::Txn.arrow_field().clone(), + ]), + metadata: Default::default(), + } +} diff --git a/crates/deltalake-core/src/kernel/error.rs b/crates/deltalake-core/src/kernel/error.rs index 8ec799ca96..a37dbdae67 100644 --- a/crates/deltalake-core/src/kernel/error.rs +++ b/crates/deltalake-core/src/kernel/error.rs @@ -23,9 +23,8 @@ pub enum Error { #[error("Arrow error: {0}")] Parquet(#[from] parquet::errors::ParquetError), - #[cfg(feature = "object_store")] #[error("Error interacting with object store: {0}")] - ObjectStore(object_store::Error), + ObjectStore(#[from] object_store::Error), #[error("File not found: {0}")] FileNotFound(String), diff --git a/crates/deltalake-core/src/kernel/mod.rs b/crates/deltalake-core/src/kernel/mod.rs index 7785c273f9..54f742c3fb 100644 --- a/crates/deltalake-core/src/kernel/mod.rs +++ b/crates/deltalake-core/src/kernel/mod.rs @@ -1,6 +1,8 @@ //! 
Kernel module pub mod actions; +#[cfg(all(feature = "arrow", feature = "parquet"))] +pub mod arrow; pub mod error; pub mod schema; diff --git a/crates/deltalake-core/src/protocol/checkpoints.rs b/crates/deltalake-core/src/protocol/checkpoints.rs index a4cc1b66c7..837483c35c 100644 --- a/crates/deltalake-core/src/protocol/checkpoints.rs +++ b/crates/deltalake-core/src/protocol/checkpoints.rs @@ -18,7 +18,7 @@ use regex::Regex; use serde_json::Value; use super::{time_utils, ProtocolError}; -use crate::kernel::actions::arrow::delta_log_schema_for_table; +use crate::kernel::arrow::delta_log_schema_for_table; use crate::kernel::{ Action, Add as AddAction, DataType, Metadata, PrimitiveType, Protocol, StructField, StructType, Txn, diff --git a/crates/deltalake-core/src/schema/arrow_convert.rs b/crates/deltalake-core/src/schema/arrow_convert.rs deleted file mode 100644 index d292362604..0000000000 --- a/crates/deltalake-core/src/schema/arrow_convert.rs +++ /dev/null @@ -1,1049 +0,0 @@ -use std::sync::Arc; - -use arrow_schema::{ - ArrowError, DataType as ArrowDataType, Field as ArrowField, FieldRef as ArrowFieldRef, - Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, TimeUnit, -}; -use lazy_static::lazy_static; - -use super::super::schema::{ArrayType, DataType, MapType, PrimitiveType, StructField, StructType}; - -impl TryFrom<&StructType> for ArrowSchema { - type Error = ArrowError; - - fn try_from(s: &StructType) -> Result { - let fields = s - .fields() - .iter() - .map(>::try_from) - .collect::, ArrowError>>()?; - - Ok(ArrowSchema::new(fields)) - } -} - -impl TryFrom<&StructField> for ArrowField { - type Error = ArrowError; - - fn try_from(f: &StructField) -> Result { - let metadata = f - .metadata() - .iter() - .map(|(key, val)| Ok((key.clone(), serde_json::to_string(val)?))) - .collect::>() - .map_err(|err| ArrowError::JsonError(err.to_string()))?; - - let field = ArrowField::new( - f.name(), - ArrowDataType::try_from(f.data_type())?, - f.is_nullable(), - ) - .with_metadata(metadata); - - Ok(field) - } -} - -impl TryFrom<&ArrayType> for ArrowField { - type Error = ArrowError; - - fn try_from(a: &ArrayType) -> Result { - Ok(ArrowField::new( - "item", - ArrowDataType::try_from(a.element_type())?, - a.contains_null(), - )) - } -} - -impl TryFrom<&MapType> for ArrowField { - type Error = ArrowError; - - fn try_from(a: &MapType) -> Result { - Ok(ArrowField::new( - "entries", - ArrowDataType::Struct( - vec![ - ArrowField::new("key", ArrowDataType::try_from(a.key_type())?, false), - ArrowField::new( - "value", - ArrowDataType::try_from(a.value_type())?, - a.value_contains_null(), - ), - ] - .into(), - ), - false, // always non-null - )) - } -} - -impl TryFrom<&DataType> for ArrowDataType { - type Error = ArrowError; - - fn try_from(t: &DataType) -> Result { - match t { - DataType::Primitive(p) => { - match p { - PrimitiveType::String => Ok(ArrowDataType::Utf8), - PrimitiveType::Long => Ok(ArrowDataType::Int64), // undocumented type - PrimitiveType::Integer => Ok(ArrowDataType::Int32), - PrimitiveType::Short => Ok(ArrowDataType::Int16), - PrimitiveType::Byte => Ok(ArrowDataType::Int8), - PrimitiveType::Float => Ok(ArrowDataType::Float32), - PrimitiveType::Double => Ok(ArrowDataType::Float64), - PrimitiveType::Boolean => Ok(ArrowDataType::Boolean), - PrimitiveType::Binary => Ok(ArrowDataType::Binary), - PrimitiveType::Decimal(precision, scale) => { - let precision = u8::try_from(*precision).map_err(|_| { - ArrowError::SchemaError(format!( - "Invalid precision for decimal: {}", - precision - )) - })?; 
- let scale = i8::try_from(*scale).map_err(|_| { - ArrowError::SchemaError(format!("Invalid scale for decimal: {}", scale)) - })?; - - if precision <= 38 { - Ok(ArrowDataType::Decimal128(precision, scale)) - } else if precision <= 76 { - Ok(ArrowDataType::Decimal256(precision, scale)) - } else { - Err(ArrowError::SchemaError(format!( - "Precision too large to be represented in Arrow: {}", - precision - ))) - } - } - PrimitiveType::Date => { - // A calendar date, represented as a year-month-day triple without a - // timezone. Stored as 4 bytes integer representing days since 1970-01-01 - Ok(ArrowDataType::Date32) - } - PrimitiveType::Timestamp => { - // Issue: https://github.com/delta-io/delta/issues/643 - Ok(ArrowDataType::Timestamp(TimeUnit::Microsecond, None)) - } - } - } - DataType::Struct(s) => Ok(ArrowDataType::Struct( - s.fields() - .iter() - .map(>::try_from) - .collect::, ArrowError>>()? - .into(), - )), - DataType::Array(a) => Ok(ArrowDataType::List(Arc::new(>::try_from(a)?))), - DataType::Map(m) => Ok(ArrowDataType::Map( - Arc::new(ArrowField::new( - "entries", - ArrowDataType::Struct( - vec![ - ArrowField::new( - "keys", - >::try_from(m.key_type())?, - false, - ), - ArrowField::new( - "values", - >::try_from(m.value_type())?, - m.value_contains_null(), - ), - ] - .into(), - ), - false, - )), - false, - )), - } - } -} - -impl TryFrom<&ArrowSchema> for StructType { - type Error = ArrowError; - - fn try_from(arrow_schema: &ArrowSchema) -> Result { - let new_fields: Result, _> = arrow_schema - .fields() - .iter() - .map(|field| field.as_ref().try_into()) - .collect(); - Ok(StructType::new(new_fields?)) - } -} - -impl TryFrom for StructType { - type Error = ArrowError; - - fn try_from(arrow_schema: ArrowSchemaRef) -> Result { - arrow_schema.as_ref().try_into() - } -} - -impl TryFrom<&ArrowField> for StructField { - type Error = ArrowError; - - fn try_from(arrow_field: &ArrowField) -> Result { - Ok(StructField::new( - arrow_field.name().clone(), - arrow_field.data_type().try_into()?, - arrow_field.is_nullable(), - ) - .with_metadata(arrow_field.metadata().iter().map(|(k, v)| (k.clone(), v)))) - } -} - -impl TryFrom<&ArrowDataType> for DataType { - type Error = ArrowError; - - fn try_from(arrow_datatype: &ArrowDataType) -> Result { - match arrow_datatype { - ArrowDataType::Utf8 => Ok(DataType::Primitive(PrimitiveType::String)), - ArrowDataType::LargeUtf8 => Ok(DataType::Primitive(PrimitiveType::String)), - ArrowDataType::Int64 => Ok(DataType::Primitive(PrimitiveType::Long)), // undocumented type - ArrowDataType::Int32 => Ok(DataType::Primitive(PrimitiveType::Integer)), - ArrowDataType::Int16 => Ok(DataType::Primitive(PrimitiveType::Short)), - ArrowDataType::Int8 => Ok(DataType::Primitive(PrimitiveType::Byte)), - ArrowDataType::UInt64 => Ok(DataType::Primitive(PrimitiveType::Long)), // undocumented type - ArrowDataType::UInt32 => Ok(DataType::Primitive(PrimitiveType::Integer)), - ArrowDataType::UInt16 => Ok(DataType::Primitive(PrimitiveType::Short)), - ArrowDataType::UInt8 => Ok(DataType::Primitive(PrimitiveType::Boolean)), - ArrowDataType::Float32 => Ok(DataType::Primitive(PrimitiveType::Float)), - ArrowDataType::Float64 => Ok(DataType::Primitive(PrimitiveType::Double)), - ArrowDataType::Boolean => Ok(DataType::Primitive(PrimitiveType::Boolean)), - ArrowDataType::Binary => Ok(DataType::Primitive(PrimitiveType::Binary)), - ArrowDataType::FixedSizeBinary(_) => Ok(DataType::Primitive(PrimitiveType::Binary)), - ArrowDataType::LargeBinary => Ok(DataType::Primitive(PrimitiveType::Binary)), - 
ArrowDataType::Decimal128(p, s) => Ok(DataType::Primitive(PrimitiveType::Decimal( - *p as i32, *s as i32, - ))), - ArrowDataType::Decimal256(p, s) => Ok(DataType::Primitive(PrimitiveType::Decimal( - *p as i32, *s as i32, - ))), - ArrowDataType::Date32 => Ok(DataType::Primitive(PrimitiveType::Date)), - ArrowDataType::Date64 => Ok(DataType::Primitive(PrimitiveType::Date)), - ArrowDataType::Timestamp(TimeUnit::Microsecond, None) => { - Ok(DataType::Primitive(PrimitiveType::Timestamp)) - } - ArrowDataType::Timestamp(TimeUnit::Microsecond, Some(tz)) - if tz.eq_ignore_ascii_case("utc") => - { - Ok(DataType::Primitive(PrimitiveType::Timestamp)) - } - ArrowDataType::Struct(fields) => { - let converted_fields: Result, _> = fields - .iter() - .map(|field| field.as_ref().try_into()) - .collect(); - Ok(DataType::Struct(Box::new(StructType::new( - converted_fields?, - )))) - } - ArrowDataType::List(field) => Ok(DataType::Array(Box::new(ArrayType::new( - (*field).data_type().try_into()?, - (*field).is_nullable(), - )))), - ArrowDataType::LargeList(field) => Ok(DataType::Array(Box::new(ArrayType::new( - (*field).data_type().try_into()?, - (*field).is_nullable(), - )))), - ArrowDataType::FixedSizeList(field, _) => Ok(DataType::Array(Box::new( - ArrayType::new((*field).data_type().try_into()?, (*field).is_nullable()), - ))), - ArrowDataType::Map(field, _) => { - if let ArrowDataType::Struct(struct_fields) = field.data_type() { - let key_type = struct_fields[0].data_type().try_into()?; - let value_type = struct_fields[1].data_type().try_into()?; - let value_type_nullable = struct_fields[1].is_nullable(); - Ok(DataType::Map(Box::new(MapType::new( - key_type, - value_type, - value_type_nullable, - )))) - } else { - panic!("DataType::Map should contain a struct field child"); - } - } - s => Err(ArrowError::SchemaError(format!( - "Invalid data type for Delta Lake: {s}" - ))), - } - } -} - -macro_rules! arrow_map { - ($fieldname: ident, null) => { - ArrowField::new( - stringify!($fieldname), - ArrowDataType::Map( - Arc::new(ArrowField::new( - "entries", - ArrowDataType::Struct( - vec![ - ArrowField::new("key", ArrowDataType::Utf8, false), - ArrowField::new("value", ArrowDataType::Utf8, true), - ] - .into(), - ), - false, - )), - false, - ), - true, - ) - }; - ($fieldname: ident, not_null) => { - ArrowField::new( - stringify!($fieldname), - ArrowDataType::Map( - Arc::new(ArrowField::new( - "entries", - ArrowDataType::Struct( - vec![ - ArrowField::new("key", ArrowDataType::Utf8, false), - ArrowField::new("value", ArrowDataType::Utf8, false), - ] - .into(), - ), - false, - )), - false, - ), - false, - ) - }; -} - -macro_rules! arrow_field { - ($fieldname:ident, $type_qual:ident, null) => { - ArrowField::new(stringify!($fieldname), ArrowDataType::$type_qual, true) - }; - ($fieldname:ident, $type_qual:ident, not_null) => { - ArrowField::new(stringify!($fieldname), ArrowDataType::$type_qual, false) - }; -} - -macro_rules! arrow_list { - ($fieldname:ident, $element_name:ident, $type_qual:ident, null) => { - ArrowField::new( - stringify!($fieldname), - ArrowDataType::List(Arc::new(ArrowField::new( - stringify!($element_name), - ArrowDataType::$type_qual, - true, - ))), - true, - ) - }; - ($fieldname:ident, $element_name:ident, $type_qual:ident, not_null) => { - ArrowField::new( - stringify!($fieldname), - ArrowDataType::List(Arc::new(ArrowField::new( - stringify!($element_name), - ArrowDataType::$type_qual, - true, - ))), - false, - ) - }; -} - -macro_rules! 
arrow_struct { - ($fieldname:ident, [$($inner:tt)+], null) => { - ArrowField::new( - stringify!($fieldname), - ArrowDataType::Struct( - arrow_defs! [$($inner)+].into() - ), - true - ) - }; - ($fieldname:ident, [$($inner:tt)+], not_null) => { - ArrowField::new( - stringify!($fieldname), - ArrowDataType::Struct( - arrow_defs! [$($inner)+].into() - ), - false - ) - } -} - -macro_rules! arrow_def { - ($fieldname:ident $(null)?) => { - arrow_map!($fieldname, null) - }; - ($fieldname:ident not_null) => { - arrow_map!($fieldname, not_null) - }; - ($fieldname:ident[$inner_name:ident]{$type_qual:ident} $(null)?) => { - arrow_list!($fieldname, $inner_name, $type_qual, null) - }; - ($fieldname:ident[$inner_name:ident]{$type_qual:ident} not_null) => { - arrow_list!($fieldname, $inner_name, $type_qual, not_null) - }; - ($fieldname:ident:$type_qual:ident $(null)?) => { - arrow_field!($fieldname, $type_qual, null) - }; - ($fieldname:ident:$type_qual:ident not_null) => { - arrow_field!($fieldname, $type_qual, not_null) - }; - ($fieldname:ident[$($inner:tt)+] $(null)?) => { - arrow_struct!($fieldname, [$($inner)+], null) - }; - ($fieldname:ident[$($inner:tt)+] not_null) => { - arrow_struct!($fieldname, [$($inner)+], not_null) - } -} - -/// A helper macro to create more readable Arrow field definitions, delimited by commas -/// -/// The argument patterns are as follows: -/// -/// fieldname (null|not_null)? -- An arrow field of type map with name "fieldname" consisting of Utf8 key-value pairs, and an -/// optional nullability qualifier (null if not specified). -/// -/// fieldname:type (null|not_null)? -- An Arrow field consisting of an atomic type. For example, -/// id:Utf8 gets mapped to ArrowField::new("id", ArrowDataType::Utf8, true). -/// where customerCount:Int64 not_null gets mapped to gets mapped to -/// ArrowField::new("customerCount", ArrowDataType::Utf8, true) -/// -/// fieldname[list_element]{list_element_type} (null|not_null)? -- An Arrow list, with the name of the elements wrapped in square brackets -/// and the type of the list elements wrapped in curly brackets. For example, -/// customers[name]{Utf8} is an nullable arrow field of type arrow list consisting -/// of elements called "name" with type Utf8. -/// -/// fieldname[element1, element2, element3, ....] (null|not_null)? -- An arrow struct with name "fieldname" consisting of elements adhering to any of the patterns -/// documented, including additional structs arbitrarily nested up to the recursion -/// limit for Rust macros. -macro_rules! arrow_defs { - () => { - vec![] as Vec - }; - ($($fieldname:ident$(:$type_qual:ident)?$([$($inner:tt)+])?$({$list_type_qual:ident})? $($nullable:ident)?),+) => { - vec![ - $(arrow_def!($fieldname$(:$type_qual)?$([$($inner)+])?$({$list_type_qual})? $($nullable)?)),+ - ] - } -} - -/// Returns an arrow schema representing the delta log for use in checkpoints -/// -/// # Arguments -/// -/// * `table_schema` - The arrow schema representing the table backed by the delta log -/// * `partition_columns` - The list of partition columns of the table. -/// * `use_extended_remove_schema` - Whether to include extended file metadata in remove action schema. -/// Required for compatibility with different versions of Databricks runtime. -pub(crate) fn delta_log_schema_for_table( - table_schema: ArrowSchema, - partition_columns: &[String], - use_extended_remove_schema: bool, -) -> ArrowSchemaRef { - lazy_static! 
{ - static ref SCHEMA_FIELDS: Vec = arrow_defs![ - metaData[ - id:Utf8, - name:Utf8, - description:Utf8, - schemaString:Utf8, - createdTime:Int64, - partitionColumns[element]{Utf8}, - configuration, - format[provider:Utf8, options] - ], - protocol[ - minReaderVersion:Int32, - minWriterVersion:Int32 - ], - txn[ - appId:Utf8, - version:Int64 - ] - ]; - static ref ADD_FIELDS: Vec = arrow_defs![ - path:Utf8, - size:Int64, - modificationTime:Int64, - dataChange:Boolean, - stats:Utf8, - partitionValues, - tags, - deletionVector[ - storageType:Utf8 not_null, - pathOrInlineDv:Utf8 not_null, - offset:Int32 null, - sizeInBytes:Int32 not_null, - cardinality:Int64 not_null - ] - ]; - static ref REMOVE_FIELDS: Vec = arrow_defs![ - path: Utf8, - deletionTimestamp: Int64, - dataChange: Boolean, - extendedFileMetadata: Boolean - ]; - static ref REMOVE_EXTENDED_FILE_METADATA_FIELDS: Vec = - arrow_defs![size: Int64, partitionValues, tags]; - }; - - // create add fields according to the specific data table schema - let (partition_fields, non_partition_fields): (Vec, Vec) = - table_schema - .fields() - .iter() - .map(|field| field.to_owned()) - .partition(|field| partition_columns.contains(field.name())); - - let mut stats_parsed_fields: Vec = - vec![ArrowField::new("numRecords", ArrowDataType::Int64, true)]; - if !non_partition_fields.is_empty() { - let mut max_min_vec = Vec::new(); - non_partition_fields - .iter() - .for_each(|f| max_min_schema_for_fields(&mut max_min_vec, f)); - - stats_parsed_fields.extend(["minValues", "maxValues"].into_iter().map(|name| { - ArrowField::new( - name, - ArrowDataType::Struct(max_min_vec.clone().into()), - true, - ) - })); - - let mut null_count_vec = Vec::new(); - non_partition_fields - .iter() - .for_each(|f| null_count_schema_for_fields(&mut null_count_vec, f)); - let null_count_struct = ArrowField::new( - "nullCount", - ArrowDataType::Struct(null_count_vec.into()), - true, - ); - - stats_parsed_fields.push(null_count_struct); - } - let mut add_fields = ADD_FIELDS.clone(); - add_fields.push(ArrowField::new( - "stats_parsed", - ArrowDataType::Struct(stats_parsed_fields.into()), - true, - )); - if !partition_fields.is_empty() { - add_fields.push(ArrowField::new( - "partitionValues_parsed", - ArrowDataType::Struct(partition_fields.into()), - true, - )); - } - - // create remove fields with or without extendedFileMetadata - let mut remove_fields = REMOVE_FIELDS.clone(); - if use_extended_remove_schema { - remove_fields.extend(REMOVE_EXTENDED_FILE_METADATA_FIELDS.clone()); - } - - // include add and remove fields in checkpoint schema - let mut schema_fields = SCHEMA_FIELDS.clone(); - schema_fields.push(ArrowField::new( - "add", - ArrowDataType::Struct(add_fields.into()), - true, - )); - schema_fields.push(ArrowField::new( - "remove", - ArrowDataType::Struct(remove_fields.into()), - true, - )); - - let arrow_schema = ArrowSchema::new(schema_fields); - - std::sync::Arc::new(arrow_schema) -} - -fn max_min_schema_for_fields(dest: &mut Vec, f: &ArrowField) { - match f.data_type() { - ArrowDataType::Struct(struct_fields) => { - let mut child_dest = Vec::new(); - - for f in struct_fields { - max_min_schema_for_fields(&mut child_dest, f); - } - - dest.push(ArrowField::new( - f.name(), - ArrowDataType::Struct(child_dest.into()), - true, - )); - } - // don't compute min or max for list, map or binary types - ArrowDataType::List(_) | ArrowDataType::Map(_, _) | ArrowDataType::Binary => { /* noop */ } - _ => { - let f = f.clone(); - dest.push(f); - } - } -} - -fn 
null_count_schema_for_fields(dest: &mut Vec, f: &ArrowField) { - match f.data_type() { - ArrowDataType::Struct(struct_fields) => { - let mut child_dest = Vec::new(); - - for f in struct_fields { - null_count_schema_for_fields(&mut child_dest, f); - } - - dest.push(ArrowField::new( - f.name(), - ArrowDataType::Struct(child_dest.into()), - true, - )); - } - _ => { - let f = ArrowField::new(f.name(), ArrowDataType::Int64, true); - dest.push(f); - } - } -} - -#[cfg(test)] -mod tests { - use arrow::array::ArrayData; - use arrow_array::Array; - use arrow_array::{make_array, ArrayRef, MapArray, StringArray, StructArray}; - use arrow_buffer::{Buffer, ToByteSlice}; - use arrow_schema::Field; - - use super::*; - use std::collections::HashMap; - use std::sync::Arc; - - #[test] - fn delta_log_schema_for_table_test() { - // NOTE: We should future proof the checkpoint schema in case action schema changes. - // See https://github.com/delta-io/delta-rs/issues/287 - - let table_schema = ArrowSchema::new(vec![ - ArrowField::new("pcol", ArrowDataType::Int32, true), - ArrowField::new("col1", ArrowDataType::Int32, true), - ]); - let partition_columns = vec!["pcol".to_string()]; - let log_schema = - delta_log_schema_for_table(table_schema.clone(), partition_columns.as_slice(), false); - - // verify top-level schema contains all expected fields and they are named correctly. - let expected_fields = ["metaData", "protocol", "txn", "remove", "add"]; - for f in log_schema.fields().iter() { - assert!(expected_fields.contains(&f.name().as_str())); - } - assert_eq!(5, log_schema.fields().len()); - - // verify add fields match as expected. a lot of transformation goes into these. - let add_fields: Vec<_> = log_schema - .fields() - .iter() - .filter(|f| f.name() == "add") - .flat_map(|f| { - if let ArrowDataType::Struct(fields) = f.data_type() { - fields.iter().cloned() - } else { - unreachable!(); - } - }) - .collect(); - let field_names: Vec<&String> = add_fields.iter().map(|v| v.name()).collect(); - assert_eq!( - vec![ - "path", - "size", - "modificationTime", - "dataChange", - "stats", - "partitionValues", - "tags", - "deletionVector", - "stats_parsed", - "partitionValues_parsed" - ], - field_names - ); - let add_field_map: HashMap<_, _> = add_fields - .iter() - .map(|f| (f.name().to_owned(), f.clone())) - .collect(); - let partition_values_parsed = add_field_map.get("partitionValues_parsed").unwrap(); - if let ArrowDataType::Struct(fields) = partition_values_parsed.data_type() { - assert_eq!(1, fields.len()); - let field = fields.get(0).unwrap().to_owned(); - assert_eq!( - Arc::new(ArrowField::new("pcol", ArrowDataType::Int32, true)), - field - ); - } else { - unreachable!(); - } - let stats_parsed = add_field_map.get("stats_parsed").unwrap(); - if let ArrowDataType::Struct(fields) = stats_parsed.data_type() { - assert_eq!(4, fields.len()); - - let field_map: HashMap<_, _> = fields - .iter() - .map(|f| (f.name().to_owned(), f.clone())) - .collect(); - - for (k, v) in field_map.iter() { - match k.as_ref() { - "minValues" | "maxValues" | "nullCount" => match v.data_type() { - ArrowDataType::Struct(fields) => { - assert_eq!(1, fields.len()); - let field = fields.get(0).unwrap().to_owned(); - let data_type = if k == "nullCount" { - ArrowDataType::Int64 - } else { - ArrowDataType::Int32 - }; - assert_eq!(Arc::new(ArrowField::new("col1", data_type, true)), field); - } - _ => unreachable!(), - }, - "numRecords" => {} - _ => panic!(), - } - } - } else { - unreachable!(); - } - - // verify extended remove schema fields **ARE 
NOT** included when `use_extended_remove_schema` is false. - let num_remove_fields = log_schema - .fields() - .iter() - .filter(|f| f.name() == "remove") - .flat_map(|f| { - if let ArrowDataType::Struct(fields) = f.data_type() { - fields.iter().cloned() - } else { - unreachable!(); - } - }) - .count(); - assert_eq!(4, num_remove_fields); - - // verify extended remove schema fields **ARE** included when `use_extended_remove_schema` is true. - let log_schema = - delta_log_schema_for_table(table_schema, partition_columns.as_slice(), true); - let remove_fields: Vec<_> = log_schema - .fields() - .iter() - .filter(|f| f.name() == "remove") - .flat_map(|f| { - if let ArrowDataType::Struct(fields) = f.data_type() { - fields.iter().cloned() - } else { - unreachable!(); - } - }) - .collect(); - assert_eq!(7, remove_fields.len()); - let expected_fields = [ - "path", - "deletionTimestamp", - "dataChange", - "extendedFileMetadata", - "partitionValues", - "size", - "tags", - ]; - for f in remove_fields.iter() { - assert!(expected_fields.contains(&f.name().as_str())); - } - } - - #[test] - fn test_arrow_from_delta_decimal_type() { - let precision = 20; - let scale = 2; - let decimal_field = DataType::Primitive(PrimitiveType::Decimal(precision, scale)); - assert_eq!( - >::try_from(&decimal_field).unwrap(), - ArrowDataType::Decimal128(precision as u8, scale as i8) - ); - } - - #[test] - fn test_arrow_from_delta_timestamp_type() { - let timestamp_field = DataType::Primitive(PrimitiveType::Timestamp); - assert_eq!( - >::try_from(×tamp_field).unwrap(), - ArrowDataType::Timestamp(TimeUnit::Microsecond, None) - ); - } - - #[test] - fn test_delta_from_arrow_timestamp_type() { - let timestamp_field = ArrowDataType::Timestamp(TimeUnit::Microsecond, None); - assert_eq!( - >::try_from(×tamp_field).unwrap(), - DataType::Primitive(PrimitiveType::Timestamp) - ); - } - - #[test] - fn test_delta_from_arrow_timestamp_type_with_tz() { - let timestamp_field = - ArrowDataType::Timestamp(TimeUnit::Microsecond, Some("UTC".to_string().into())); - assert_eq!( - >::try_from(×tamp_field).unwrap(), - DataType::Primitive(PrimitiveType::Timestamp) - ); - } - - #[test] - fn test_delta_from_arrow_map_type() { - let arrow_map = ArrowDataType::Map( - Arc::new(ArrowField::new( - "entries", - ArrowDataType::Struct( - vec![ - ArrowField::new("key", ArrowDataType::Int8, false), - ArrowField::new("value", ArrowDataType::Binary, true), - ] - .into(), - ), - false, - )), - false, - ); - let converted_map: DataType = (&arrow_map).try_into().unwrap(); - - assert_eq!( - converted_map, - DataType::Map(Box::new(MapType::new( - DataType::Primitive(PrimitiveType::Byte), - DataType::Primitive(PrimitiveType::Binary), - true, - ))) - ); - } - - #[test] - fn test_record_batch_from_map_type() { - let keys = vec!["0", "1", "5", "6", "7"]; - let values: Vec<&[u8]> = vec![ - b"test_val_1", - b"test_val_2", - b"long_test_val_3", - b"4", - b"test_val_5", - ]; - let entry_offsets = vec![0u32, 1, 1, 4, 5, 5]; - let num_rows = keys.len(); - - // Copied the function `new_from_string` with the patched code from https://github.com/apache/arrow-rs/pull/4808 - // This should be reverted back [`MapArray::new_from_strings`] once arrow is upgraded in this project. 
- fn new_from_strings<'a>( - keys: impl Iterator, - values: &dyn Array, - entry_offsets: &[u32], - ) -> Result { - let entry_offsets_buffer = Buffer::from(entry_offsets.to_byte_slice()); - let keys_data = StringArray::from_iter_values(keys); - - let keys_field = Arc::new(Field::new("keys", ArrowDataType::Utf8, false)); - let values_field = Arc::new(Field::new( - "values", - values.data_type().clone(), - values.null_count() > 0, - )); - - let entry_struct = StructArray::from(vec![ - (keys_field, Arc::new(keys_data) as ArrayRef), - (values_field, make_array(values.to_data())), - ]); - - let map_data_type = ArrowDataType::Map( - Arc::new(Field::new( - "entries", - entry_struct.data_type().clone(), - false, - )), - false, - ); - - let map_data = ArrayData::builder(map_data_type) - .len(entry_offsets.len() - 1) - .add_buffer(entry_offsets_buffer) - .add_child_data(entry_struct.into_data()) - .build()?; - - Ok(MapArray::from(map_data)) - } - - let map_array = new_from_strings( - keys.into_iter(), - &arrow::array::BinaryArray::from(values), - entry_offsets.as_slice(), - ) - .expect("Could not create a map array"); - - let schema = - >::try_from(&StructType::new(vec![ - StructField::new( - "example".to_string(), - DataType::Map(Box::new(MapType::new( - DataType::Primitive(PrimitiveType::String), - DataType::Primitive(PrimitiveType::Binary), - false, - ))), - false, - ), - ])) - .expect("Could not get schema"); - - let record_batch = - arrow::record_batch::RecordBatch::try_new(Arc::new(schema), vec![Arc::new(map_array)]) - .expect("Failed to create RecordBatch"); - - assert_eq!(record_batch.num_columns(), 1); - assert_eq!(record_batch.num_rows(), num_rows); - } - - #[test] - fn test_max_min_schema_for_fields() { - let mut max_min_vec: Vec = Vec::new(); - let fields = [ - ArrowField::new("simple", ArrowDataType::Int32, true), - ArrowField::new( - "struct", - ArrowDataType::Struct( - vec![ArrowField::new("simple", ArrowDataType::Int32, true)].into(), - ), - true, - ), - ArrowField::new( - "list", - ArrowDataType::List(Arc::new(ArrowField::new( - "simple", - ArrowDataType::Int32, - true, - ))), - true, - ), - ArrowField::new( - "map", - ArrowDataType::Map( - Arc::new(ArrowField::new( - "struct", - ArrowDataType::Struct( - vec![ - ArrowField::new("key", ArrowDataType::Int32, true), - ArrowField::new("value", ArrowDataType::Int32, true), - ] - .into(), - ), - true, - )), - true, - ), - true, - ), - ArrowField::new("binary", ArrowDataType::Binary, true), - ]; - - let expected = vec![fields[0].clone(), fields[1].clone()]; - - fields - .iter() - .for_each(|f| max_min_schema_for_fields(&mut max_min_vec, f)); - - assert_eq!(max_min_vec, expected); - } - - #[test] - fn test_null_count_schema_for_fields() { - let mut null_count_vec: Vec = Vec::new(); - let fields = [ - ArrowField::new("int32", ArrowDataType::Int32, true), - ArrowField::new("int64", ArrowDataType::Int64, true), - ArrowField::new("Utf8", ArrowDataType::Utf8, true), - ArrowField::new( - "list", - ArrowDataType::List(Arc::new(ArrowField::new( - "simple", - ArrowDataType::Int32, - true, - ))), - true, - ), - ArrowField::new( - "map", - ArrowDataType::Map( - Arc::new(ArrowField::new( - "struct", - ArrowDataType::Struct( - vec![ - ArrowField::new("key", ArrowDataType::Int32, true), - ArrowField::new("value", ArrowDataType::Int32, true), - ] - .into(), - ), - true, - )), - true, - ), - true, - ), - ArrowField::new( - "struct", - ArrowDataType::Struct( - vec![ArrowField::new("int32", ArrowDataType::Int32, true)].into(), - ), - true, - ), - ]; - let 
expected = vec![ - ArrowField::new(fields[0].name(), ArrowDataType::Int64, true), - ArrowField::new(fields[1].name(), ArrowDataType::Int64, true), - ArrowField::new(fields[2].name(), ArrowDataType::Int64, true), - ArrowField::new(fields[3].name(), ArrowDataType::Int64, true), - ArrowField::new(fields[4].name(), ArrowDataType::Int64, true), - ArrowField::new( - fields[5].name(), - ArrowDataType::Struct( - vec![ArrowField::new("int32", ArrowDataType::Int64, true)].into(), - ), - true, - ), - ]; - fields - .iter() - .for_each(|f| null_count_schema_for_fields(&mut null_count_vec, f)); - assert_eq!(null_count_vec, expected); - } - - /* - * This test validates the trait implementation of - * TryFrom<&Arc> for schema::SchemaField which is required with Arrow 37 since - * iterators on Fields will give an &Arc - */ - #[test] - fn tryfrom_arrowfieldref_with_structs() { - let field = Arc::new(ArrowField::new( - "test_struct", - ArrowDataType::Struct( - vec![ - ArrowField::new("key", ArrowDataType::Int32, true), - ArrowField::new("value", ArrowDataType::Int32, true), - ] - .into(), - ), - true, - )); - let _converted: StructField = field.as_ref().try_into().unwrap(); - } -} diff --git a/crates/deltalake-core/src/table/config.rs b/crates/deltalake-core/src/table/config.rs index 3fa021ce6e..5b82b401b6 100644 --- a/crates/deltalake-core/src/table/config.rs +++ b/crates/deltalake-core/src/table/config.rs @@ -11,6 +11,7 @@ use crate::errors::DeltaTableError; /// /// #[derive(PartialEq, Eq, Hash)] +#[non_exhaustive] pub enum DeltaConfigKey { /// true for this Delta table to be append-only. If append-only, /// existing records cannot be deleted, and existing values cannot be updated. @@ -100,6 +101,9 @@ pub enum DeltaConfigKey { /// The target file size in bytes or higher units for file tuning. For example, 104857600 (bytes) or 100mb. TuneFileSizesForRewrites, + + /// 'classic' for classic Delta Lake checkpoints. 'v2' for v2 checkpoints. 
+ CheckpointPolicy, } impl AsRef for DeltaConfigKey { @@ -111,6 +115,7 @@ impl AsRef for DeltaConfigKey { Self::AutoOptimizeOptimizeWrite => "delta.autoOptimize.optimizeWrite", Self::CheckpointWriteStatsAsJson => "delta.checkpoint.writeStatsAsJson", Self::CheckpointWriteStatsAsStruct => "delta.checkpoint.writeStatsAsStruct", + Self::CheckpointPolicy => "delta.checkpointPolicy", Self::ColumnMappingMode => "delta.columnMapping.mode", Self::DataSkippingNumIndexedCols => "delta.dataSkippingNumIndexedCols", Self::DeletedFileRetentionDuration => "delta.deletedFileRetentionDuration", @@ -140,6 +145,7 @@ impl FromStr for DeltaConfigKey { "delta.autoOptimize.optimizeWrite" => Ok(Self::AutoOptimizeOptimizeWrite), "delta.checkpoint.writeStatsAsJson" => Ok(Self::CheckpointWriteStatsAsJson), "delta.checkpoint.writeStatsAsStruct" => Ok(Self::CheckpointWriteStatsAsStruct), + "delta.checkpointPolicy" => Ok(Self::CheckpointPolicy), "delta.columnMapping.mode" => Ok(Self::ColumnMappingMode), "delta.dataSkippingNumIndexedCols" => Ok(Self::DataSkippingNumIndexedCols), "delta.deletedFileRetentionDuration" | "deletedFileRetentionDuration" => { @@ -280,6 +286,14 @@ impl<'a> TableConfig<'a> { .and_then(|o| o.as_ref().and_then(|v| v.parse().ok())) .unwrap_or_default() } + + /// Policy applied during chepoint creation + pub fn checkpoint_policy(&self) -> CheckpointPolicy { + self.0 + .get(DeltaConfigKey::CheckpointPolicy.as_ref()) + .and_then(|o| o.as_ref().and_then(|v| v.parse().ok())) + .unwrap_or_default() + } } #[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] @@ -338,6 +352,48 @@ impl FromStr for IsolationLevel { } } +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] +/// The checkpoint policy applied when writing checkpoints +#[serde(rename_all = "camelCase")] +pub enum CheckpointPolicy { + /// classic Delta Lake checkpoints + Classic, + /// v2 checkpoints + V2, + /// unknown checkpoint policy + Other(String), +} + +impl Default for CheckpointPolicy { + fn default() -> Self { + Self::Classic + } +} + +impl AsRef for CheckpointPolicy { + fn as_ref(&self) -> &str { + match self { + Self::Classic => "classic", + Self::V2 => "v2", + Self::Other(s) => s, + } + } +} + +impl FromStr for CheckpointPolicy { + type Err = DeltaTableError; + + fn from_str(s: &str) -> Result { + match s.to_ascii_lowercase().as_str() { + "classic" => Ok(Self::Classic), + "v2" => Ok(Self::V2), + _ => Err(DeltaTableError::Generic( + "Invalid string for CheckpointPolicy".into(), + )), + } + } +} + const SECONDS_PER_MINUTE: u64 = 60; const SECONDS_PER_HOUR: u64 = 60 * SECONDS_PER_MINUTE; const SECONDS_PER_DAY: u64 = 24 * SECONDS_PER_HOUR; From e48b8a77d26645f6f20a81a736f6866934b7dc47 Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Sun, 19 Nov 2023 16:11:33 +0100 Subject: [PATCH 20/23] feat(python): expose `convert_to_deltalake` (#1842) # Description Exposes added `convert to delta` functionality by @junjunjd to Python API. 
# Related Issue(s) - closes #1767 --------- Co-authored-by: Robert Pack <42610831+roeap@users.noreply.github.com> --- .../src/operations/convert_to_delta.rs | 16 ++- crates/deltalake-core/src/protocol/mod.rs | 20 +++- python/deltalake/__init__.py | 1 + python/deltalake/_internal.pyi | 10 ++ python/deltalake/writer.py | 55 +++++++++++ python/src/lib.rs | 68 ++++++++++--- python/tests/test_convert_to_delta.py | 97 +++++++++++++++++++ 7 files changed, 254 insertions(+), 13 deletions(-) create mode 100644 python/tests/test_convert_to_delta.py diff --git a/crates/deltalake-core/src/operations/convert_to_delta.rs b/crates/deltalake-core/src/operations/convert_to_delta.rs index 84fffa1578..644591727c 100644 --- a/crates/deltalake-core/src/operations/convert_to_delta.rs +++ b/crates/deltalake-core/src/operations/convert_to_delta.rs @@ -27,7 +27,7 @@ use serde_json::{Map, Value}; use std::{ collections::{HashMap, HashSet}, num::TryFromIntError, - str::Utf8Error, + str::{FromStr, Utf8Error}, sync::Arc, }; @@ -82,6 +82,20 @@ pub enum PartitionStrategy { Hive, } +impl FromStr for PartitionStrategy { + type Err = DeltaTableError; + + fn from_str(s: &str) -> DeltaResult { + match s.to_ascii_lowercase().as_str() { + "hive" => Ok(PartitionStrategy::Hive), + _ => Err(DeltaTableError::Generic(format!( + "Invalid partition strategy provided {}", + s + ))), + } + } +} + /// Build an operation to convert a Parquet table to a [`DeltaTable`] in place pub struct ConvertToDeltaBuilder { log_store: Option, diff --git a/crates/deltalake-core/src/protocol/mod.rs b/crates/deltalake-core/src/protocol/mod.rs index 8a5cd9f858..e2add9b529 100644 --- a/crates/deltalake-core/src/protocol/mod.rs +++ b/crates/deltalake-core/src/protocol/mod.rs @@ -23,8 +23,9 @@ use std::borrow::Borrow; use std::collections::HashMap; use std::hash::{Hash, Hasher}; use std::mem::take; +use std::str::FromStr; -use crate::errors::DeltaResult; +use crate::errors::{DeltaResult, DeltaTableError}; use crate::kernel::{Add, CommitInfo, Metadata, Protocol, Remove}; use crate::logstore::LogStore; use crate::table::CheckPoint; @@ -589,6 +590,23 @@ pub enum SaveMode { Ignore, } +impl FromStr for SaveMode { + type Err = DeltaTableError; + + fn from_str(s: &str) -> DeltaResult { + match s.to_ascii_lowercase().as_str() { + "append" => Ok(SaveMode::Append), + "overwrite" => Ok(SaveMode::Overwrite), + "error" => Ok(SaveMode::ErrorIfExists), + "ignore" => Ok(SaveMode::Ignore), + _ => Err(DeltaTableError::Generic(format!( + "Invalid save mode provided: {}, only these are supported: ['append', 'overwrite', 'error', 'ignore']", + s + ))), + } + } +} + /// The OutputMode used in streaming operations. #[derive(Serialize, Deserialize, Debug, Clone)] pub enum OutputMode { diff --git a/python/deltalake/__init__.py b/python/deltalake/__init__.py index 129eaff1cf..b10a708309 100644 --- a/python/deltalake/__init__.py +++ b/python/deltalake/__init__.py @@ -6,4 +6,5 @@ from .schema import Schema as Schema from .table import DeltaTable as DeltaTable from .table import Metadata as Metadata +from .writer import convert_to_deltalake as convert_to_deltalake from .writer import write_deltalake as write_deltalake diff --git a/python/deltalake/_internal.pyi b/python/deltalake/_internal.pyi index 4662f52f2f..f751afa36f 100644 --- a/python/deltalake/_internal.pyi +++ b/python/deltalake/_internal.pyi @@ -140,6 +140,16 @@ def write_new_deltalake( configuration: Optional[Mapping[str, Optional[str]]], storage_options: Optional[Dict[str, str]], ) -> None: ... 
+def convert_to_deltalake( + uri: str, + partition_by: Optional[pyarrow.Schema], + partition_strategy: Optional[Literal["hive"]], + name: Optional[str], + description: Optional[str], + configuration: Optional[Mapping[str, Optional[str]]], + storage_options: Optional[Dict[str, str]], + custom_metadata: Optional[Dict[str, str]], +) -> None: ... def batch_distinct(batch: pyarrow.RecordBatch) -> pyarrow.RecordBatch: ... # Can't implement inheritance (see note in src/schema.rs), so this is next diff --git a/python/deltalake/writer.py b/python/deltalake/writer.py index ef4ae3a57b..dd0d350eb4 100644 --- a/python/deltalake/writer.py +++ b/python/deltalake/writer.py @@ -38,6 +38,7 @@ from ._internal import DeltaDataChecker as _DeltaDataChecker from ._internal import batch_distinct +from ._internal import convert_to_deltalake as _convert_to_deltalake from ._internal import write_new_deltalake as _write_new_deltalake from .exceptions import DeltaProtocolError, TableNotFoundError from .table import MAX_SUPPORTED_WRITER_VERSION, DeltaTable @@ -391,6 +392,60 @@ def validate_batch(batch: pa.RecordBatch) -> pa.RecordBatch: table.update_incremental() +def convert_to_deltalake( + uri: Union[str, Path], + mode: Literal["error", "ignore"] = "error", + partition_by: Optional[pa.Schema] = None, + partition_strategy: Optional[Literal["hive"]] = None, + name: Optional[str] = None, + description: Optional[str] = None, + configuration: Optional[Mapping[str, Optional[str]]] = None, + storage_options: Optional[Dict[str, str]] = None, + custom_metadata: Optional[Dict[str, str]] = None, +) -> None: + """ + `Convert` parquet tables `to delta` tables. + + Currently only HIVE partitioned tables are supported. `Convert to delta` creates + a transaction log commit with add actions, and additional properties provided such + as configuration, name, and description. + + Args: + uri: URI of a table. + partition_by: Optional partitioning schema if table is partitioned. + partition_strategy: Optional partition strategy to read and convert + mode: How to handle existing data. Default is to error if table already exists. + If 'ignore', will not convert anything if table already exists. + name: User-provided identifier for this table. + description: User-provided description for this table. + configuration: A map containing configuration options for the metadata action. + storage_options: options passed to the native delta filesystem. Unused if 'filesystem' is defined. + custom_metadata: custom metadata that will be added to the transaction commit + """ + if partition_by is not None and partition_strategy is None: + raise ValueError("Partition strategy has to be provided with partition_by.") + + if partition_strategy is not None and partition_strategy != "hive": + raise ValueError( + "Currently only `hive` partition strategy is supported to be converted." 
+ ) + + if mode == "ignore" and try_get_deltatable(uri, storage_options) is not None: + return + + _convert_to_deltalake( + str(uri), + partition_by, + partition_strategy, + name, + description, + configuration, + storage_options, + custom_metadata, + ) + return + + def __enforce_append_only( table: Optional[DeltaTable], configuration: Optional[Mapping[str, Optional[str]]], diff --git a/python/src/lib.rs b/python/src/lib.rs index 5ee72f72d0..b9067dfec9 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -27,6 +27,7 @@ use deltalake::datafusion::prelude::SessionContext; use deltalake::delta_datafusion::DeltaDataChecker; use deltalake::errors::DeltaTableError; use deltalake::kernel::{Action, Add, Invariant, Metadata, Remove, StructType}; +use deltalake::operations::convert_to_delta::{ConvertToDeltaBuilder, PartitionStrategy}; use deltalake::operations::delete::DeleteBuilder; use deltalake::operations::filesystem_check::FileSystemCheckBuilder; use deltalake::operations::merge::MergeBuilder; @@ -43,6 +44,7 @@ use deltalake::DeltaTableBuilder; use pyo3::exceptions::{PyIOError, PyRuntimeError, PyValueError}; use pyo3::prelude::*; use pyo3::types::{PyFrozenSet, PyType}; +use serde_json::{Map, Value}; use crate::error::DeltaProtocolError; use crate::error::PythonError; @@ -758,7 +760,8 @@ impl RawDeltaTable { schema: PyArrowType, partitions_filters: Option>, ) -> PyResult<()> { - let mode = save_mode_from_str(mode)?; + let mode = mode.parse().map_err(PythonError::from)?; + let schema: StructType = (&schema.0).try_into().map_err(PythonError::from)?; let existing_schema = self._table.get_schema().map_err(PythonError::from)?; @@ -1088,16 +1091,6 @@ fn batch_distinct(batch: PyArrowType) -> PyResult PyResult { - match value { - "append" => Ok(SaveMode::Append), - "overwrite" => Ok(SaveMode::Overwrite), - "error" => Ok(SaveMode::ErrorIfExists), - "ignore" => Ok(SaveMode::Ignore), - _ => Err(PyValueError::new_err("Invalid save mode")), - } -} - fn current_timestamp() -> i64 { let start = SystemTime::now(); let since_the_epoch = start @@ -1180,6 +1173,58 @@ fn write_new_deltalake( Ok(()) } +#[pyfunction] +#[allow(clippy::too_many_arguments)] +fn convert_to_deltalake( + uri: String, + partition_schema: Option>, + partition_strategy: Option, + name: Option, + description: Option, + configuration: Option>>, + storage_options: Option>, + custom_metadata: Option>, +) -> PyResult<()> { + let mut builder = ConvertToDeltaBuilder::new().with_location(uri); + + if let Some(part_schema) = partition_schema { + let schema: StructType = (&part_schema.0).try_into().map_err(PythonError::from)?; + builder = builder.with_partition_schema(schema.fields().clone()); + } + + if let Some(partition_strategy) = &partition_strategy { + let strategy: PartitionStrategy = partition_strategy.parse().map_err(PythonError::from)?; + builder = builder.with_partition_strategy(strategy); + } + + if let Some(name) = &name { + builder = builder.with_table_name(name); + } + + if let Some(description) = &description { + builder = builder.with_comment(description); + } + + if let Some(config) = configuration { + builder = builder.with_configuration(config); + }; + + if let Some(strg_options) = storage_options { + builder = builder.with_storage_options(strg_options); + }; + + if let Some(metadata) = custom_metadata { + let json_metadata: Map = + metadata.into_iter().map(|(k, v)| (k, v.into())).collect(); + builder = builder.with_metadata(json_metadata); + }; + + rt()? 
+ .block_on(builder.into_future()) + .map_err(PythonError::from)?; + Ok(()) +} + #[pyclass(name = "DeltaDataChecker", module = "deltalake._internal")] struct PyDeltaDataChecker { inner: DeltaDataChecker, @@ -1225,6 +1270,7 @@ fn _internal(py: Python, m: &PyModule) -> PyResult<()> { m.add("__version__", env!("CARGO_PKG_VERSION"))?; m.add_function(pyo3::wrap_pyfunction!(rust_core_version, m)?)?; m.add_function(pyo3::wrap_pyfunction!(write_new_deltalake, m)?)?; + m.add_function(pyo3::wrap_pyfunction!(convert_to_deltalake, m)?)?; m.add_function(pyo3::wrap_pyfunction!(batch_distinct, m)?)?; m.add_class::()?; m.add_class::()?; diff --git a/python/tests/test_convert_to_delta.py b/python/tests/test_convert_to_delta.py new file mode 100644 index 0000000000..29badf3358 --- /dev/null +++ b/python/tests/test_convert_to_delta.py @@ -0,0 +1,97 @@ +import pathlib + +import pyarrow as pa +import pyarrow.dataset as ds +import pytest + +from deltalake import convert_to_deltalake +from deltalake.exceptions import DeltaError +from deltalake.table import DeltaTable + + +def test_local_convert_to_delta(tmp_path: pathlib.Path, sample_data: pa.Table): + ds.write_dataset( + sample_data, + tmp_path, + format="parquet", + existing_data_behavior="overwrite_or_ignore", + ) + + name = "converted_table" + description = "parquet table converted to delta table with delta-rs" + convert_to_deltalake( + tmp_path, + name=name, + description=description, + configuration={"delta.AppendOnly": "True"}, + ) + + dt = DeltaTable(tmp_path) + + assert dt.version() == 0 + assert dt.files() == ["part-0.parquet"] + assert dt.metadata().name == name + assert dt.metadata().description == description + assert dt.metadata().configuration == {"delta.AppendOnly": "True"} + + +def test_convert_delta_write_modes(tmp_path: pathlib.Path, sample_data: pa.Table): + ds.write_dataset( + sample_data, + tmp_path, + format="parquet", + existing_data_behavior="overwrite_or_ignore", + ) + + convert_to_deltalake( + tmp_path, + ) + + with pytest.raises(DeltaError): + convert_to_deltalake( + tmp_path, + ) + + convert_to_deltalake(tmp_path, mode="ignore") + + +def test_convert_delta_with_partitioning(tmp_path: pathlib.Path, sample_data: pa.Table): + ds.write_dataset( + sample_data, + tmp_path, + format="parquet", + existing_data_behavior="overwrite_or_ignore", + partitioning=["utf8"], + partitioning_flavor="hive", + ) + + with pytest.raises( + DeltaError, + match="Generic error: The schema of partition columns must be provided to convert a Parquet table to a Delta table", + ): + convert_to_deltalake( + tmp_path, + ) + with pytest.raises( + ValueError, match="Partition strategy has to be provided with partition_by" + ): + convert_to_deltalake( + tmp_path, + partition_by=pa.schema([pa.field("utf8", pa.string())]), + ) + + with pytest.raises( + ValueError, + match="Currently only `hive` partition strategy is supported to be converted.", + ): + convert_to_deltalake( + tmp_path, + partition_by=pa.schema([pa.field("utf8", pa.string())]), + partition_strategy="directory", + ) + + convert_to_deltalake( + tmp_path, + partition_by=pa.schema([pa.field("utf8", pa.string())]), + partition_strategy="hive", + ) From 8a66343aa361d396ce5a637f9d3ada5844758481 Mon Sep 17 00:00:00 2001 From: David Blajda Date: Sun, 19 Nov 2023 18:52:55 -0500 Subject: [PATCH 21/23] refactor: merge to use logical plans (#1720) # Description This refactors the merge operation to use DataFusion's DataFrame and LogicalPlan APIs The NLJ is eliminated and the query planner can pick the optimal join 
operator. This also enables the operation to use multiple threads and should result in significant speed up. Merge is still limited to using a single thread in some area. When collecting benchmarks, I encountered multiple OoM issues with Datafusion's hash join implementation. There are multiple tickets upstream open regarding this. For now, I've limited the number of partitions to just 1 to prevent this. Predicates passed as SQL are also easier to use now. Manual casting was required to ensure data types were aligned. Now the logical plan will perform type coercion when optimizing the plan. # Related Issues - enhances #850 - closes #1790 - closes #1753 --- .../src/delta_datafusion/logical.rs | 48 ++ .../src/delta_datafusion/mod.rs | 9 +- .../src/delta_datafusion/physical.rs | 180 +++++ crates/deltalake-core/src/operations/merge.rs | 631 +++++++++--------- crates/deltalake-core/src/operations/mod.rs | 136 +--- .../deltalake-core/src/operations/update.rs | 5 +- 6 files changed, 536 insertions(+), 473 deletions(-) create mode 100644 crates/deltalake-core/src/delta_datafusion/logical.rs create mode 100644 crates/deltalake-core/src/delta_datafusion/physical.rs diff --git a/crates/deltalake-core/src/delta_datafusion/logical.rs b/crates/deltalake-core/src/delta_datafusion/logical.rs new file mode 100644 index 0000000000..7b05dd57d9 --- /dev/null +++ b/crates/deltalake-core/src/delta_datafusion/logical.rs @@ -0,0 +1,48 @@ +//! Logical Operations for DataFusion + +use datafusion_expr::{LogicalPlan, UserDefinedLogicalNodeCore}; + +// Metric Observer is used to update DataFusion metrics from a record batch. +// See MetricObserverExec for the physical implementation + +#[derive(Debug, Hash, Eq, PartialEq)] +pub(crate) struct MetricObserver { + // id is preserved during conversion to physical node + pub id: String, + pub input: LogicalPlan, +} + +impl UserDefinedLogicalNodeCore for MetricObserver { + // Predicate push down is not supported for this node. Try to limit usage + // near the end of plan. 
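+    //
+    // The implementation is intentionally a pass-through: it reports its single
+    // input, reuses that input's schema, and contributes no expressions of its
+    // own. Only `id` carries meaning; the extension planner matches on it later
+    // to swap in a physical MetricObserverExec with the corresponding callback.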
+ fn name(&self) -> &str { + "MetricObserver" + } + + fn inputs(&self) -> Vec<&datafusion_expr::LogicalPlan> { + vec![&self.input] + } + + fn schema(&self) -> &datafusion_common::DFSchemaRef { + self.input.schema() + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "MetricObserver id={}", &self.id) + } + + fn from_template( + &self, + _exprs: &[datafusion_expr::Expr], + inputs: &[datafusion_expr::LogicalPlan], + ) -> Self { + MetricObserver { + id: self.id.clone(), + input: inputs[0].clone(), + } + } +} diff --git a/crates/deltalake-core/src/delta_datafusion/mod.rs b/crates/deltalake-core/src/delta_datafusion/mod.rs index 1410efbfbc..8dea811383 100644 --- a/crates/deltalake-core/src/delta_datafusion/mod.rs +++ b/crates/deltalake-core/src/delta_datafusion/mod.rs @@ -81,6 +81,8 @@ use crate::{open_table, open_table_with_storage_options, DeltaTable}; const PATH_COLUMN: &str = "__delta_rs_path"; pub mod expr; +pub mod logical; +pub mod physical; impl From for DataFusionError { fn from(err: DeltaTableError) -> Self { @@ -351,7 +353,7 @@ pub(crate) fn logical_schema( snapshot: &DeltaTableState, scan_config: &DeltaScanConfig, ) -> DeltaResult { - let input_schema = snapshot.input_schema()?; + let input_schema = snapshot.arrow_schema()?; let mut fields = Vec::new(); for field in input_schema.fields.iter() { fields.push(field.to_owned()); @@ -505,11 +507,6 @@ impl<'a> DeltaScanBuilder<'a> { self } - pub fn with_schema(mut self, schema: SchemaRef) -> Self { - self.schema = Some(schema); - self - } - pub async fn build(self) -> DeltaResult { let config = self.config; let schema = match self.schema { diff --git a/crates/deltalake-core/src/delta_datafusion/physical.rs b/crates/deltalake-core/src/delta_datafusion/physical.rs new file mode 100644 index 0000000000..954df0b046 --- /dev/null +++ b/crates/deltalake-core/src/delta_datafusion/physical.rs @@ -0,0 +1,180 @@ +//! Physical Operations for DataFusion +use std::sync::Arc; + +use arrow_schema::SchemaRef; +use datafusion::arrow::record_batch::RecordBatch; +use datafusion::error::Result as DataFusionResult; +use datafusion::physical_plan::DisplayAs; +use datafusion::physical_plan::{ + metrics::{ExecutionPlanMetricsSet, MetricsSet}, + ExecutionPlan, RecordBatchStream, SendableRecordBatchStream, +}; +use futures::{Stream, StreamExt}; + +use crate::DeltaTableError; + +// Metric Observer is used to update DataFusion metrics from a record batch. 
+// Typically the null count for a particular column is pulled after performing a +// projection since this count is easy to obtain + +pub(crate) type MetricObserverFunction = fn(&RecordBatch, &ExecutionPlanMetricsSet) -> (); + +pub(crate) struct MetricObserverExec { + parent: Arc, + id: String, + metrics: ExecutionPlanMetricsSet, + update: MetricObserverFunction, +} + +impl MetricObserverExec { + pub fn new(id: String, parent: Arc, f: MetricObserverFunction) -> Self { + MetricObserverExec { + parent, + id, + metrics: ExecutionPlanMetricsSet::new(), + update: f, + } + } + + pub fn try_new( + id: String, + inputs: &[Arc], + f: MetricObserverFunction, + ) -> DataFusionResult> { + match inputs { + [input] => Ok(Arc::new(MetricObserverExec::new(id, input.clone(), f))), + _ => Err(datafusion_common::DataFusionError::External(Box::new( + DeltaTableError::Generic("MetricObserverExec expects only one child".into()), + ))), + } + } + + pub fn id(&self) -> &str { + &self.id + } +} + +impl std::fmt::Debug for MetricObserverExec { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("MetricObserverExec") + .field("id", &self.id) + .field("metrics", &self.metrics) + .finish() + } +} + +impl DisplayAs for MetricObserverExec { + fn fmt_as( + &self, + _: datafusion::physical_plan::DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + write!(f, "MetricObserverExec id={}", self.id) + } +} + +impl ExecutionPlan for MetricObserverExec { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn schema(&self) -> arrow_schema::SchemaRef { + self.parent.schema() + } + + fn output_partitioning(&self) -> datafusion::physical_plan::Partitioning { + self.parent.output_partitioning() + } + + fn output_ordering(&self) -> Option<&[datafusion_physical_expr::PhysicalSortExpr]> { + self.parent.output_ordering() + } + + fn children(&self) -> Vec> { + vec![self.parent.clone()] + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> datafusion_common::Result { + let res = self.parent.execute(partition, context)?; + Ok(Box::pin(MetricObserverStream { + schema: self.schema(), + input: res, + metrics: self.metrics.clone(), + update: self.update, + })) + } + + fn statistics(&self) -> DataFusionResult { + self.parent.statistics() + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> datafusion_common::Result> { + MetricObserverExec::try_new(self.id.clone(), &children, self.update) + } + + fn metrics(&self) -> Option { + Some(self.metrics.clone_inner()) + } +} + +struct MetricObserverStream { + schema: SchemaRef, + input: SendableRecordBatchStream, + metrics: ExecutionPlanMetricsSet, + update: MetricObserverFunction, +} + +impl Stream for MetricObserverStream { + type Item = DataFusionResult; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + self.input.poll_next_unpin(cx).map(|x| match x { + Some(Ok(batch)) => { + (self.update)(&batch, &self.metrics); + Some(Ok(batch)) + } + other => other, + }) + } + + fn size_hint(&self) -> (usize, Option) { + self.input.size_hint() + } +} + +impl RecordBatchStream for MetricObserverStream { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} + +pub(crate) fn find_metric_node( + id: &str, + parent: &Arc, +) -> Option> { + //! 
Used to locate the physical MetricCountExec Node after the planner converts the logical node + if let Some(metric) = parent.as_any().downcast_ref::() { + if metric.id().eq(id) { + return Some(parent.to_owned()); + } + } + + for child in &parent.children() { + let res = find_metric_node(id, child); + if res.is_some() { + return res; + } + } + + None +} diff --git a/crates/deltalake-core/src/operations/merge.rs b/crates/deltalake-core/src/operations/merge.rs index a9ad6a8655..8b0dd56708 100644 --- a/crates/deltalake-core/src/operations/merge.rs +++ b/crates/deltalake-core/src/operations/merge.rs @@ -8,8 +8,7 @@ //! specified matter. See [`MergeBuilder`] for more information //! //! *WARNING* The current implementation rewrites the entire delta table so only -//! use on small to medium sized tables. The solution also cannot take advantage -//! of multiple threads and is limited to a single single thread. +//! use on small to medium sized tables. //! Enhancements tracked at #850 //! //! # Example @@ -37,27 +36,25 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::{Instant, SystemTime, UNIX_EPOCH}; -use arrow_schema::SchemaRef; +use async_trait::async_trait; +use datafusion::datasource::provider_as_source; use datafusion::error::Result as DataFusionResult; +use datafusion::execution::context::{QueryPlanner, SessionConfig}; use datafusion::logical_expr::build_join_schema; -use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; +use datafusion::physical_planner::{DefaultPhysicalPlanner, ExtensionPlanner, PhysicalPlanner}; use datafusion::{ execution::context::SessionState, physical_plan::{ - filter::FilterExec, - joins::{ - utils::{build_join_schema as physical_build_join_schema, JoinFilter}, - NestedLoopJoinExec, - }, metrics::{MetricBuilder, MetricsSet}, - projection::ProjectionExec, ExecutionPlan, }, prelude::{DataFrame, SessionContext}, }; -use datafusion_common::{Column, DFField, DFSchema, ScalarValue, TableReference}; +use datafusion_common::{Column, DFSchema, ScalarValue, TableReference}; use datafusion_expr::{col, conditional_expressions::CaseBuilder, lit, when, Expr, JoinType}; -use datafusion_physical_expr::{create_physical_expr, expressions, PhysicalExpr}; +use datafusion_expr::{ + Extension, LogicalPlan, LogicalPlanBuilder, UserDefinedLogicalNode, UNNAMED_TABLE, +}; use futures::future::BoxFuture; use parquet::file::properties::WriterProperties; use serde::Serialize; @@ -66,15 +63,19 @@ use serde_json::Value; use super::datafusion_utils::{into_expr, maybe_into_expr, Expression}; use super::transaction::{commit, PROTOCOL}; use crate::delta_datafusion::expr::{fmt_expr_to_sql, parse_predicate_expression}; -use crate::delta_datafusion::{register_store, DeltaScanBuilder}; +use crate::delta_datafusion::logical::MetricObserver; +use crate::delta_datafusion::physical::{find_metric_node, MetricObserverExec}; +use crate::delta_datafusion::{register_store, DeltaScanConfig, DeltaTableProvider}; use crate::kernel::{Action, Remove}; use crate::logstore::LogStoreRef; -use crate::operations::datafusion_utils::MetricObserverExec; use crate::operations::write::write_execution_plan; use crate::protocol::{DeltaOperation, MergePredicate}; use crate::table::state::DeltaTableState; use crate::{DeltaResult, DeltaTable, DeltaTableError}; +const SOURCE_COLUMN: &str = "__delta_rs_source"; +const TARGET_COLUMN: &str = "__delta_rs_target"; + const OPERATION_COLUMN: &str = "__delta_rs_operation"; const DELETE_COLUMN: &str = "__delta_rs_delete"; const TARGET_INSERT_COLUMN: 
&str = "__delta_rs_target_insert"; @@ -83,11 +84,16 @@ const TARGET_DELETE_COLUMN: &str = "__delta_rs_target_delete"; const TARGET_COPY_COLUMN: &str = "__delta_rs_target_copy"; const SOURCE_COUNT_METRIC: &str = "num_source_rows"; +const TARGET_COUNT_METRIC: &str = "num_target_rows"; const TARGET_COPY_METRIC: &str = "num_copied_rows"; const TARGET_INSERTED_METRIC: &str = "num_target_inserted_rows"; const TARGET_UPDATED_METRIC: &str = "num_target_updated_rows"; const TARGET_DELETED_METRIC: &str = "num_target_deleted_rows"; +const SOURCE_COUNT_ID: &str = "merge_source_count"; +const TARGET_COUNT_ID: &str = "merge_target_count"; +const OUTPUT_COUNT_ID: &str = "merge_output_count"; + /// Merge records into a Delta Table. pub struct MergeBuilder { /// The join predicate @@ -557,6 +563,89 @@ pub struct MergeMetrics { pub rewrite_time_ms: u64, } +struct MergeMetricExtensionPlanner {} + +#[async_trait] +impl ExtensionPlanner for MergeMetricExtensionPlanner { + async fn plan_extension( + &self, + _planner: &dyn PhysicalPlanner, + node: &dyn UserDefinedLogicalNode, + _logical_inputs: &[&LogicalPlan], + physical_inputs: &[Arc], + _session_state: &SessionState, + ) -> DataFusionResult>> { + if let Some(metric_observer) = node.as_any().downcast_ref::() { + if metric_observer.id.eq(SOURCE_COUNT_ID) { + return Ok(Some(MetricObserverExec::try_new( + SOURCE_COUNT_ID.into(), + physical_inputs, + |batch, metrics| { + MetricBuilder::new(metrics) + .global_counter(SOURCE_COUNT_METRIC) + .add(batch.num_rows()); + }, + )?)); + } + + if metric_observer.id.eq(TARGET_COUNT_ID) { + return Ok(Some(MetricObserverExec::try_new( + TARGET_COUNT_ID.into(), + physical_inputs, + |batch, metrics| { + MetricBuilder::new(metrics) + .global_counter(TARGET_COUNT_METRIC) + .add(batch.num_rows()); + }, + )?)); + } + + if metric_observer.id.eq(OUTPUT_COUNT_ID) { + return Ok(Some(MetricObserverExec::try_new( + OUTPUT_COUNT_ID.into(), + physical_inputs, + |batch, metrics| { + MetricBuilder::new(metrics) + .global_counter(TARGET_INSERTED_METRIC) + .add( + batch + .column_by_name(TARGET_INSERT_COLUMN) + .unwrap() + .null_count(), + ); + MetricBuilder::new(metrics) + .global_counter(TARGET_UPDATED_METRIC) + .add( + batch + .column_by_name(TARGET_UPDATE_COLUMN) + .unwrap() + .null_count(), + ); + MetricBuilder::new(metrics) + .global_counter(TARGET_DELETED_METRIC) + .add( + batch + .column_by_name(TARGET_DELETE_COLUMN) + .unwrap() + .null_count(), + ); + MetricBuilder::new(metrics) + .global_counter(TARGET_COPY_METRIC) + .add( + batch + .column_by_name(TARGET_COPY_COLUMN) + .unwrap() + .null_count(), + ); + }, + )?)); + } + } + + Ok(None) + } +} + #[allow(clippy::too_many_arguments)] async fn execute( predicate: Expression, @@ -589,83 +678,61 @@ async fn execute( // If the user specified any not_source_match operations then those // predicates also need to be considered when pruning - let target = Arc::new( - DeltaScanBuilder::new(snapshot, log_store.clone(), &state) - .with_schema(snapshot.input_schema()?) 
- .build() - .await?, - ); - - let source = source.create_physical_plan().await?; - - let source_count = Arc::new(MetricObserverExec::new(source, |batch, metrics| { - MetricBuilder::new(metrics) - .global_counter(SOURCE_COUNT_METRIC) - .add(batch.num_rows()); - })); - - let mut expressions: Vec<(Arc, String)> = Vec::new(); - let source_schema = source_count.schema(); - - for (i, field) in source_schema.fields().into_iter().enumerate() { - expressions.push(( - Arc::new(expressions::Column::new(field.name(), i)), - field.name().clone(), - )); - } - expressions.push(( - Arc::new(expressions::Literal::new(true.into())), - "__delta_rs_source".to_owned(), - )); - let source = Arc::new(ProjectionExec::try_new(expressions, source_count.clone())?); - - let mut expressions: Vec<(Arc, String)> = Vec::new(); - let target_schema = target.schema(); - for (i, field) in target_schema.fields().into_iter().enumerate() { - expressions.push(( - Arc::new(expressions::Column::new(field.name(), i)), - field.name().to_owned(), - )); - } - expressions.push(( - Arc::new(expressions::Literal::new(true.into())), - "__delta_rs_target".to_owned(), - )); - let target = Arc::new(ProjectionExec::try_new(expressions, target.clone())?); - - // TODO: Currently a NestedLoopJoin is used but we should target to support SortMergeJoin - // This would require rewriting the join predicate to only contain equality between left and right columns and pushing some filters down - // Ideally it would be nice if the optimizer / planner can pick the best join so maybe explore rewriting the entire operation using logical plans. - - // NLJ requires both sides to have one partition for outer joins - let target = Arc::new(CoalescePartitionsExec::new(target)); - let source = Arc::new(CoalescePartitionsExec::new(source)); - - let source_schema = match &source_alias { - Some(alias) => { - DFSchema::try_from_qualified_schema(TableReference::bare(alias), &source.schema())? - } - None => DFSchema::try_from(source.schema().as_ref().to_owned())?, + let source_name = match &source_alias { + Some(alias) => TableReference::bare(alias.to_string()), + None => TableReference::bare(UNNAMED_TABLE), }; - let target_schema = match &target_alias { - Some(alias) => { - DFSchema::try_from_qualified_schema(TableReference::bare(alias), &target.schema())? - } - None => DFSchema::try_from(target.schema().as_ref().to_owned())?, + let target_name = match &target_alias { + Some(alias) => TableReference::bare(alias.to_string()), + None => TableReference::bare(UNNAMED_TABLE), }; - let join_schema_df = build_join_schema(&source_schema, &target_schema, &JoinType::Full)?; + // This is only done to provide the source columns with a correct table reference. Just renaming the columns does not work + let source = + LogicalPlanBuilder::scan(source_name, provider_as_source(source.into_view()), None)? 
+ .build()?; + + let source = LogicalPlan::Extension(Extension { + node: Arc::new(MetricObserver { + id: SOURCE_COUNT_ID.into(), + input: source, + }), + }); + + let source = DataFrame::new(state.clone(), source); + let source = source.with_column(SOURCE_COLUMN, lit(true))?; + + let target_provider = Arc::new(DeltaTableProvider::try_new( + snapshot.clone(), + log_store.clone(), + DeltaScanConfig::default(), + )?); + let target_provider = provider_as_source(target_provider); + + let target = LogicalPlanBuilder::scan(target_name, target_provider, None)?.build()?; - let join_schema = - physical_build_join_schema(&source.schema(), &target.schema(), &JoinType::Full); - let (join_schema, join_order) = (join_schema.0, join_schema.1); + // TODO: This is here to prevent predicate pushdowns. In the future we can replace this node to allow pushdowns depending on which operations are being used. + let target = LogicalPlan::Extension(Extension { + node: Arc::new(MetricObserver { + id: TARGET_COUNT_ID.into(), + input: target, + }), + }); + let target = DataFrame::new(state.clone(), target); + let target = target.with_column(TARGET_COLUMN, lit(true))?; + let source_schema = source.schema(); + let target_schema = target.schema(); + let join_schema_df = build_join_schema(source_schema, target_schema, &JoinType::Full)?; let predicate = match predicate { Expression::DataFusion(expr) => expr, Expression::String(s) => parse_predicate_expression(&join_schema_df, s, &state)?, }; + let join = source.join(target, JoinType::Full, &[], &[], Some(predicate.clone()))?; + let join_schema_df = join.schema().to_owned(); + let match_operations: Vec = match_operations .into_iter() .map(|op| MergeOperation::try_from(op, &join_schema_df, &state, &target_alias)) @@ -681,40 +748,15 @@ async fn execute( .map(|op| MergeOperation::try_from(op, &join_schema_df, &state, &target_alias)) .collect::, DeltaTableError>>()?; - let predicate_expr = create_physical_expr( - &predicate, - &join_schema_df, - &join_schema, - state.execution_props(), - )?; - - let join_filter = JoinFilter::new(predicate_expr, join_order, join_schema); - let join: Arc = Arc::new(NestedLoopJoinExec::try_new( - source.clone(), - target.clone(), - Some(join_filter), - &datafusion_expr::JoinType::Full, - )?); - - // Project to include __delta_rs_operation which indicates which particular operation to perform on the column. 
- let mut expressions: Vec<(Arc, String)> = Vec::new(); - let schema = join.schema(); - for (i, field) in schema.fields().into_iter().enumerate() { - expressions.push(( - Arc::new(expressions::Column::new(field.name(), i)), - field.name().to_owned(), - )); - } - - let matched = col("__delta_rs_source") + let matched = col(SOURCE_COLUMN) .is_true() - .and(col("__delta_rs_target").is_true()); - let not_matched_target = col("__delta_rs_source") + .and(col(TARGET_COLUMN).is_true()); + let not_matched_target = col(SOURCE_COLUMN) .is_true() - .and(col("__delta_rs_target").is_null()); - let not_matched_source = col("__delta_rs_source") + .and(col(TARGET_COLUMN).is_null()); + let not_matched_source = col(SOURCE_COLUMN) .is_null() - .and(col("__delta_rs_target")) + .and(col(TARGET_COLUMN)) .is_true(); // Plus 3 for the default operations for each match category @@ -811,35 +853,10 @@ async fn execute( let case = CaseBuilder::new(None, when_expr, then_expr, None).end()?; - let case = create_physical_expr( - &case, - &join_schema_df, - &join.schema(), - state.execution_props(), - )?; - expressions.push((case, OPERATION_COLUMN.to_owned())); - let projection = Arc::new(ProjectionExec::try_new(expressions, join.clone())?); - - let mut f = join_schema_df.fields().to_owned(); - f.push(DFField::new_unqualified( - OPERATION_COLUMN, - arrow_schema::DataType::Int64, - false, - )); - let project_schema_df = DFSchema::new_with_metadata(f, HashMap::new())?; - - // Project again and include the original table schema plus a column to mark if row needs to be filtered before write - let mut expressions: Vec<(Arc, String)> = Vec::new(); - let schema = projection.schema(); - for (i, field) in schema.fields().into_iter().enumerate() { - expressions.push(( - Arc::new(expressions::Column::new(field.name(), i)), - field.name().to_owned(), - )); - } + let projection = join.with_column(OPERATION_COLUMN, case)?; - let mut projection_map = HashMap::new(); - let mut f = project_schema_df.fields().clone(); + let mut new_columns = projection; + let mut write_projection = Vec::new(); for delta_field in snapshot.schema().unwrap().fields() { let mut when_expr = Vec::with_capacity(operations_size); @@ -853,7 +870,6 @@ async fn execute( }; let name = delta_field.name(); let column = Column::new(qualifier.clone(), name); - let field = project_schema_df.field_with_name(qualifier.as_ref(), name)?; for (idx, (operations, _)) in ops.iter().enumerate() { let op = operations @@ -873,22 +889,9 @@ async fn execute( ) .end()?; - let case = create_physical_expr( - &case, - &project_schema_df, - &projection.schema(), - state.execution_props(), - )?; - - projection_map.insert(delta_field.name(), expressions.len()); let name = "__delta_rs_c_".to_owned() + delta_field.name(); - - f.push(DFField::new_unqualified( - &name, - field.data_type().clone(), - true, - )); - expressions.push((case, name)); + write_projection.push(col(name.clone()).alias(delta_field.name())); + new_columns = new_columns.with_column(&name, case)?; } let mut insert_when = Vec::with_capacity(ops.len()); @@ -954,168 +957,47 @@ async fn execute( ); } - fn build_case( - when: Vec, - then: Vec, - schema: SchemaRef, - input_dfschema: &DFSchema, - state: &SessionState, - ) -> DataFusionResult> { - let case = CaseBuilder::new( + fn build_case(when: Vec, then: Vec) -> DataFusionResult { + CaseBuilder::new( Some(Box::new(col(OPERATION_COLUMN))), when, then, Some(Box::new(lit(false))), ) - .end()?; - - create_physical_expr(&case, input_dfschema, &schema, state.execution_props()) + .end() } 
- let schema = projection.schema(); - let input_dfschema = project_schema_df; - expressions.push(( - build_case( - delete_when, - delete_then, - schema.clone(), - &input_dfschema, - &state, - )?, - DELETE_COLUMN.to_owned(), - )); - f.push(DFField::new_unqualified( - DELETE_COLUMN, - arrow_schema::DataType::Boolean, - true, - )); - - expressions.push(( - build_case( - insert_when, - insert_then, - schema.clone(), - &input_dfschema, - &state, - )?, - TARGET_INSERT_COLUMN.to_owned(), - )); - f.push(DFField::new_unqualified( - TARGET_INSERT_COLUMN, - arrow_schema::DataType::Boolean, - true, - )); - - expressions.push(( - build_case( - update_when, - update_then, - schema.clone(), - &input_dfschema, - &state, - )?, - TARGET_UPDATE_COLUMN.to_owned(), - )); - f.push(DFField::new_unqualified( - TARGET_UPDATE_COLUMN, - arrow_schema::DataType::Boolean, - true, - )); - - expressions.push(( - build_case( - target_delete_when, - target_delete_then, - schema.clone(), - &input_dfschema, - &state, - )?, - TARGET_DELETE_COLUMN.to_owned(), - )); - f.push(DFField::new_unqualified( + new_columns = new_columns.with_column(DELETE_COLUMN, build_case(delete_when, delete_then)?)?; + new_columns = + new_columns.with_column(TARGET_INSERT_COLUMN, build_case(insert_when, insert_then)?)?; + new_columns = + new_columns.with_column(TARGET_UPDATE_COLUMN, build_case(update_when, update_then)?)?; + new_columns = new_columns.with_column( TARGET_DELETE_COLUMN, - arrow_schema::DataType::Boolean, - true, - )); - - expressions.push(( - build_case( - copy_when, - copy_then, - schema.clone(), - &input_dfschema, - &state, - )?, - TARGET_COPY_COLUMN.to_owned(), - )); - f.push(DFField::new_unqualified( - TARGET_COPY_COLUMN, - arrow_schema::DataType::Boolean, - true, - )); - - let projection = Arc::new(ProjectionExec::try_new(expressions, projection.clone())?); - - let target_count_plan = Arc::new(MetricObserverExec::new(projection, |batch, metrics| { - MetricBuilder::new(metrics) - .global_counter(TARGET_INSERTED_METRIC) - .add( - batch - .column_by_name(TARGET_INSERT_COLUMN) - .unwrap() - .null_count(), - ); - MetricBuilder::new(metrics) - .global_counter(TARGET_UPDATED_METRIC) - .add( - batch - .column_by_name(TARGET_UPDATE_COLUMN) - .unwrap() - .null_count(), - ); - MetricBuilder::new(metrics) - .global_counter(TARGET_DELETED_METRIC) - .add( - batch - .column_by_name(TARGET_DELETE_COLUMN) - .unwrap() - .null_count(), - ); - MetricBuilder::new(metrics) - .global_counter(TARGET_COPY_METRIC) - .add( - batch - .column_by_name(TARGET_COPY_COLUMN) - .unwrap() - .null_count(), - ); - })); - - let write_schema_df = DFSchema::new_with_metadata(f, HashMap::new())?; - - let write_predicate = create_physical_expr( - &(col(DELETE_COLUMN).is_false()), - &write_schema_df, - &target_count_plan.schema(), - state.execution_props(), + build_case(target_delete_when, target_delete_then)?, )?; - let filter: Arc = Arc::new(FilterExec::try_new( - write_predicate, - target_count_plan.clone(), - )?); + new_columns = new_columns.with_column(TARGET_COPY_COLUMN, build_case(copy_when, copy_then)?)?; - let mut expressions: Vec<(Arc, String)> = Vec::new(); - for (key, value) in projection_map { - expressions.push(( - Arc::new(expressions::Column::new( - &("__delta_rs_c_".to_owned() + key), - value, - )), - key.to_owned(), - )); - } - // project filtered records to delta schema - let projection = Arc::new(ProjectionExec::try_new(expressions, filter.clone())?); + let new_columns = new_columns.into_optimized_plan()?; + let operation_count = 
LogicalPlan::Extension(Extension { + node: Arc::new(MetricObserver { + id: OUTPUT_COUNT_ID.into(), + input: new_columns, + }), + }); + + let operation_count = DataFrame::new(state.clone(), operation_count); + let filtered = operation_count.filter(col(DELETE_COLUMN).is_false())?; + + let project = filtered.select(write_projection)?; + let optimized = &project.into_optimized_plan()?; + + let state = state.with_query_planner(Arc::new(MergePlanner {})); + let write = state.create_physical_plan(optimized).await?; + + let err = || DeltaTableError::Generic("Unable to locate expected metric node".into()); + let source_count = find_metric_node(SOURCE_COUNT_ID, &write).ok_or_else(err)?; + let op_count = find_metric_node(OUTPUT_COUNT_ID, &write).ok_or_else(err)?; // write projected records let table_partition_cols = current_metadata.partition_columns.clone(); @@ -1124,9 +1006,9 @@ async fn execute( let add_actions = write_execution_plan( snapshot, state.clone(), - projection.clone(), + write, table_partition_cols.clone(), - log_store.object_store().clone(), + log_store.object_store(), Some(snapshot.table_config().target_file_size() as usize), None, writer_properties, @@ -1163,7 +1045,7 @@ async fn execute( let mut version = snapshot.version(); let source_count_metrics = source_count.metrics().unwrap(); - let target_count_metrics = target_count_plan.metrics().unwrap(); + let target_count_metrics = op_count.metrics().unwrap(); fn get_metric(metrics: &MetricsSet, name: &str) -> usize { metrics.sum_by_name(name).map(|m| m.as_usize()).unwrap_or(0) } @@ -1200,6 +1082,25 @@ async fn execute( Ok(((actions, version), metrics)) } +// TODO: Abstract MergePlanner into DeltaPlanner to support other delta operations in the future. +struct MergePlanner {} + +#[async_trait] +impl QueryPlanner for MergePlanner { + async fn create_physical_plan( + &self, + logical_plan: &LogicalPlan, + session_state: &SessionState, + ) -> DataFusionResult> { + let planner = Arc::new(Box::new(DefaultPhysicalPlanner::with_extension_planners( + vec![Arc::new(MergeMetricExtensionPlanner {})], + ))); + planner + .create_physical_plan(logical_plan, session_state) + .await + } +} + impl std::future::IntoFuture for MergeBuilder { type Output = DeltaResult<(DeltaTable, MergeMetrics)>; type IntoFuture = BoxFuture<'static, Self::Output>; @@ -1211,7 +1112,9 @@ impl std::future::IntoFuture for MergeBuilder { PROTOCOL.can_write_to(&this.snapshot)?; let state = this.state.unwrap_or_else(|| { - let session = SessionContext::new(); + //TODO: Datafusion's Hashjoin has some memory issues. Running with all cores results in a OoM. Can be removed when upstream improvemetns are made. 
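+            // with_target_partitions(1) makes every operator in this plan, the
+            // join included, execute on a single partition, giving up
+            // parallelism here to keep memory usage in check until the
+            // upstream issues are fixed.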
+ let config = SessionConfig::new().with_target_partitions(1); + let session = SessionContext::new_with_config(config); // If a user provides their own their DF state then they must register the store themselves register_store(this.log_store.clone(), session.runtime_env()); @@ -1349,8 +1252,8 @@ mod tests { async fn assert_merge(table: DeltaTable, metrics: MergeMetrics) { assert_eq!(table.version(), 2); - assert_eq!(table.get_file_uris().count(), 1); - assert_eq!(metrics.num_target_files_added, 1); + assert!(table.get_file_uris().count() >= 1); + assert!(metrics.num_target_files_added >= 1); assert_eq!(metrics.num_target_files_removed, 1); assert_eq!(metrics.num_target_rows_copied, 1); assert_eq!(metrics.num_target_rows_updated, 3); @@ -1442,7 +1345,7 @@ mod tests { .unwrap() .when_not_matched_by_source_update(|update| { update - .predicate("target.value = arrow_cast(1, 'Int32')") + .predicate("target.value = 1") .update("value", "target.value + cast(1 as int)") }) .unwrap() @@ -1470,9 +1373,7 @@ mod tests { ); assert_eq!( parameters["notMatchedBySourcePredicates"], - json!( - r#"[{"actionType":"update","predicate":"target.value = arrow_cast(1, 'Int32')"}]"# - ) + json!(r#"[{"actionType":"update","predicate":"target.value = 1"}]"#) ); assert_merge(table, metrics).await; @@ -1500,9 +1401,7 @@ mod tests { }) .unwrap() .when_not_matched_by_source_update(|update| { - update - .predicate("value = arrow_cast(1, 'Int32')") - .update("value", "value + cast(1 as int)") + update.predicate("value = 1").update("value", "value + 1") }) .unwrap() .when_not_matched_insert(|insert| { @@ -1543,8 +1442,8 @@ mod tests { .unwrap() .when_not_matched_by_source_update(|update| { update - .predicate("value = arrow_cast(1, 'Int32')") - .update("value", "target.value + cast(1 as int)") + .predicate("value = 1") + .update("value", "target.value + 1") }) .unwrap() .when_not_matched_insert(|insert| { @@ -1657,8 +1556,8 @@ mod tests { .unwrap(); assert_eq!(table.version(), 2); - assert_eq!(table.get_file_uris().count(), 3); - assert_eq!(metrics.num_target_files_added, 3); + assert!(table.get_file_uris().count() >= 3); + assert!(metrics.num_target_files_added >= 3); assert_eq!(metrics.num_target_files_removed, 2); assert_eq!(metrics.num_target_rows_copied, 1); assert_eq!(metrics.num_target_rows_updated, 3); @@ -1720,8 +1619,8 @@ mod tests { .unwrap(); assert_eq!(table.version(), 2); - assert_eq!(table.get_file_uris().count(), 2); - assert_eq!(metrics.num_target_files_added, 2); + assert!(table.get_file_uris().count() >= 2); + assert!(metrics.num_target_files_added >= 2); assert_eq!(metrics.num_target_files_removed, 2); assert_eq!(metrics.num_target_rows_copied, 2); assert_eq!(metrics.num_target_rows_updated, 0); @@ -1784,8 +1683,8 @@ mod tests { .unwrap(); assert_eq!(table.version(), 2); - assert_eq!(table.get_file_uris().count(), 2); - assert_eq!(metrics.num_target_files_added, 2); + assert!(table.get_file_uris().count() >= 2); + assert!(metrics.num_target_files_added >= 2); assert_eq!(metrics.num_target_files_removed, 2); assert_eq!(metrics.num_target_rows_copied, 3); assert_eq!(metrics.num_target_rows_updated, 0); @@ -1918,8 +1817,7 @@ mod tests { .unwrap(); assert_eq!(table.version(), 2); - assert_eq!(table.get_file_uris().count(), 2); - assert_eq!(metrics.num_target_files_added, 2); + assert!(metrics.num_target_files_added >= 2); assert_eq!(metrics.num_target_files_removed, 2); assert_eq!(metrics.num_target_rows_copied, 3); assert_eq!(metrics.num_target_rows_updated, 0); @@ -1949,4 +1847,77 @@ mod tests { let 
actual = get_data(&table).await; assert_batches_sorted_eq!(&expected, &actual); } + + #[tokio::test] + async fn test_merge_empty_table() { + let schema = get_arrow_schema(&None); + let table = setup_table(Some(vec!["modified"])).await; + + assert_eq!(table.version(), 0); + assert_eq!(table.get_file_uris().count(), 0); + + let ctx = SessionContext::new(); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(arrow::array::StringArray::from(vec!["B", "C", "X"])), + Arc::new(arrow::array::Int32Array::from(vec![10, 20, 30])), + Arc::new(arrow::array::StringArray::from(vec![ + "2021-02-02", + "2023-07-04", + "2023-07-04", + ])), + ], + ) + .unwrap(); + let source = ctx.read_batch(batch).unwrap(); + + let (table, metrics) = DeltaOps(table) + .merge( + source, + col("target.id") + .eq(col("source.id")) + .and(col("target.modified").eq(lit("2021-02-02"))), + ) + .with_source_alias("source") + .with_target_alias("target") + .when_matched_update(|update| { + update + .update("value", col("source.value")) + .update("modified", col("source.modified")) + }) + .unwrap() + .when_not_matched_insert(|insert| { + insert + .set("id", col("source.id")) + .set("value", col("source.value")) + .set("modified", col("source.modified")) + }) + .unwrap() + .await + .unwrap(); + + assert_eq!(table.version(), 1); + assert!(table.get_file_uris().count() >= 2); + assert!(metrics.num_target_files_added >= 2); + assert_eq!(metrics.num_target_files_removed, 0); + assert_eq!(metrics.num_target_rows_copied, 0); + assert_eq!(metrics.num_target_rows_updated, 0); + assert_eq!(metrics.num_target_rows_inserted, 3); + assert_eq!(metrics.num_target_rows_deleted, 0); + assert_eq!(metrics.num_output_rows, 3); + assert_eq!(metrics.num_source_rows, 3); + + let expected = vec![ + "+----+-------+------------+", + "| id | value | modified |", + "+----+-------+------------+", + "| B | 10 | 2021-02-02 |", + "| C | 20 | 2023-07-04 |", + "| X | 30 | 2023-07-04 |", + "+----+-------+------------+", + ]; + let actual = get_data(&table).await; + assert_batches_sorted_eq!(&expected, &actual); + } } diff --git a/crates/deltalake-core/src/operations/mod.rs b/crates/deltalake-core/src/operations/mod.rs index a0dbfd0239..a81e16578f 100644 --- a/crates/deltalake-core/src/operations/mod.rs +++ b/crates/deltalake-core/src/operations/mod.rs @@ -192,20 +192,9 @@ impl AsRef for DeltaOps { #[cfg(feature = "datafusion")] mod datafusion_utils { - use std::sync::Arc; - - use arrow_schema::SchemaRef; - use datafusion::arrow::record_batch::RecordBatch; - use datafusion::error::Result as DataFusionResult; use datafusion::execution::context::SessionState; - use datafusion::physical_plan::DisplayAs; - use datafusion::physical_plan::{ - metrics::{ExecutionPlanMetricsSet, MetricsSet}, - ExecutionPlan, RecordBatchStream, SendableRecordBatchStream, - }; - use datafusion_common::{DFSchema, Statistics}; + use datafusion_common::DFSchema; use datafusion_expr::Expr; - use futures::{Stream, StreamExt}; use crate::{delta_datafusion::expr::parse_predicate_expression, DeltaResult}; @@ -255,127 +244,4 @@ mod datafusion_utils { None => None, }) } - - pub(crate) type MetricObserverFunction = fn(&RecordBatch, &ExecutionPlanMetricsSet) -> (); - - pub(crate) struct MetricObserverExec { - parent: Arc, - metrics: ExecutionPlanMetricsSet, - update: MetricObserverFunction, - } - - impl MetricObserverExec { - pub fn new(parent: Arc, f: MetricObserverFunction) -> Self { - MetricObserverExec { - parent, - metrics: ExecutionPlanMetricsSet::new(), - update: f, - } - } 
- } - - impl std::fmt::Debug for MetricObserverExec { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("MergeStatsExec") - .field("parent", &self.parent) - .field("metrics", &self.metrics) - .finish() - } - } - - impl DisplayAs for MetricObserverExec { - fn fmt_as( - &self, - _: datafusion::physical_plan::DisplayFormatType, - f: &mut std::fmt::Formatter, - ) -> std::fmt::Result { - write!(f, "MetricObserverExec") - } - } - - impl ExecutionPlan for MetricObserverExec { - fn as_any(&self) -> &dyn std::any::Any { - self - } - - fn schema(&self) -> arrow_schema::SchemaRef { - self.parent.schema() - } - - fn output_partitioning(&self) -> datafusion::physical_plan::Partitioning { - self.parent.output_partitioning() - } - - fn output_ordering(&self) -> Option<&[datafusion_physical_expr::PhysicalSortExpr]> { - self.parent.output_ordering() - } - - fn children(&self) -> Vec> { - vec![self.parent.clone()] - } - - fn execute( - &self, - partition: usize, - context: Arc, - ) -> datafusion_common::Result - { - let res = self.parent.execute(partition, context)?; - Ok(Box::pin(MetricObserverStream { - schema: self.schema(), - input: res, - metrics: self.metrics.clone(), - update: self.update, - })) - } - - fn statistics(&self) -> DataFusionResult { - self.parent.statistics() - } - - fn with_new_children( - self: Arc, - children: Vec>, - ) -> datafusion_common::Result> { - ExecutionPlan::with_new_children(self.parent.clone(), children) - } - - fn metrics(&self) -> Option { - Some(self.metrics.clone_inner()) - } - } - - struct MetricObserverStream { - schema: SchemaRef, - input: SendableRecordBatchStream, - metrics: ExecutionPlanMetricsSet, - update: MetricObserverFunction, - } - - impl Stream for MetricObserverStream { - type Item = DataFusionResult; - - fn poll_next( - mut self: std::pin::Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - ) -> std::task::Poll> { - self.input.poll_next_unpin(cx).map(|x| match x { - Some(Ok(batch)) => { - (self.update)(&batch, &self.metrics); - Some(Ok(batch)) - } - other => other, - }) - } - - fn size_hint(&self) -> (usize, Option) { - self.input.size_hint() - } - } - - impl RecordBatchStream for MetricObserverStream { - fn schema(&self) -> SchemaRef { - self.schema.clone() - } - } } diff --git a/crates/deltalake-core/src/operations/update.rs b/crates/deltalake-core/src/operations/update.rs index 7583ed6b39..907dec5998 100644 --- a/crates/deltalake-core/src/operations/update.rs +++ b/crates/deltalake-core/src/operations/update.rs @@ -43,10 +43,10 @@ use parquet::file::properties::WriterProperties; use serde::Serialize; use serde_json::Value; -use super::datafusion_utils::{Expression, MetricObserverExec}; +use super::datafusion_utils::Expression; use super::transaction::{commit, PROTOCOL}; use super::write::write_execution_plan; -use crate::delta_datafusion::expr::fmt_expr_to_sql; +use crate::delta_datafusion::{expr::fmt_expr_to_sql, physical::MetricObserverExec}; use crate::delta_datafusion::{find_files, register_store, DeltaScanBuilder}; use crate::kernel::{Action, Remove}; use crate::logstore::LogStoreRef; @@ -275,6 +275,7 @@ async fn execute( Arc::new(ProjectionExec::try_new(expressions, scan)?); let count_plan = Arc::new(MetricObserverExec::new( + "update_count".into(), projection_predicate.clone(), |batch, metrics| { let array = batch.column_by_name("__delta_rs_update_predicate").unwrap(); From 2c8c0ecd3914e69ec8b8be9daa71999f9f660525 Mon Sep 17 00:00:00 2001 From: David Blajda Date: Mon, 20 Nov 2023 02:40:14 -0500 Subject: 
[PATCH 22/23] feat: create benchmarks for merge (#1857) # Description Implements benchmarks that are similar to Spark's Delta benchmarks. Enable us to have a standard benchmark to measure improvements to merge and some pieces can be factored out to build a framework for bench marking delta workflows. --- crates/benchmarks/Cargo.toml | 46 ++ crates/benchmarks/README.md | 55 +++ crates/benchmarks/src/bin/merge.rs | 647 +++++++++++++++++++++++++++++ 3 files changed, 748 insertions(+) create mode 100644 crates/benchmarks/Cargo.toml create mode 100644 crates/benchmarks/README.md create mode 100644 crates/benchmarks/src/bin/merge.rs diff --git a/crates/benchmarks/Cargo.toml b/crates/benchmarks/Cargo.toml new file mode 100644 index 0000000000..76bcc8a312 --- /dev/null +++ b/crates/benchmarks/Cargo.toml @@ -0,0 +1,46 @@ +[package] +name = "delta-benchmarks" +version = "0.0.1" +authors = ["David Blajda "] +homepage = "https://github.com/delta-io/delta.rs" +license = "Apache-2.0" +keywords = ["deltalake", "delta", "datalake"] +description = "Delta-rs Benchmarks" +edition = "2021" + +[dependencies] +clap = { version = "4", features = [ "derive" ] } +chrono = { version = "0.4.31", default-features = false, features = ["clock"] } +tokio = { version = "1", features = ["fs", "macros", "rt", "io-util"] } +env_logger = "0" + +# arrow +arrow = { workspace = true } +arrow-array = { workspace = true } +arrow-buffer = { workspace = true } +arrow-cast = { workspace = true } +arrow-ord = { workspace = true } +arrow-row = { workspace = true } +arrow-schema = { workspace = true, features = ["serde"] } +arrow-select = { workspace = true } +parquet = { workspace = true, features = [ + "async", + "object_store", +] } + +# serde +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true } + +# datafusion +datafusion = { workspace = true } +datafusion-expr = { workspace = true } +datafusion-common = { workspace = true } +datafusion-proto = { workspace = true } +datafusion-sql = { workspace = true } +datafusion-physical-expr = { workspace = true } + +[dependencies.deltalake-core] +path = "../deltalake-core" +version = "0" +features = ["datafusion"] diff --git a/crates/benchmarks/README.md b/crates/benchmarks/README.md new file mode 100644 index 0000000000..c5d6b0b920 --- /dev/null +++ b/crates/benchmarks/README.md @@ -0,0 +1,55 @@ +# Merge +The merge benchmarks are similar to the ones used by [Delta Spark](https://github.com/delta-io/delta/pull/1835). + + +## Dataset + +Databricks maintains a public S3 bucket of the TPC-DS dataset with various factor where requesters must pay to download this dataset. Below is an example of how to list the 1gb scale factor + +``` +aws s3api list-objects --bucket devrel-delta-datasets --request-payer requester --prefix tpcds-2.13/tpcds_sf1_parquet/web_returns/ +``` + +You can generate the TPC-DS dataset yourself by downloading and compiling [the generator](https://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) +You may need to update the CFLAGS to include `-fcommon` to compile on newer versions of GCC. + +## Commands +These commands can be executed from the root of the benchmark crate. Some commands depend on the existance of the TPC-DS Dataset existing. 
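+
+A single benchmark case can also be run on its own through the `bench` subcommand. The argument order below (table path, case name, then the file, matched-row and not-matched-row sample fractions) is inferred from the clap definitions in `src/bin/merge.rs`, so treat it as a sketch rather than documented output:
+
+```
+ cargo run --release --bin merge -- bench data/web_returns upsert 0.05 0.01 0.1
+```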
+ +### Convert +Converts a TPC-DS web_returns csv into a Delta table +Assumes the dataset is pipe delimited and records do not have a trailing delimiter + +``` + cargo run --release --bin merge -- convert data/tpcds/web_returns.dat data/web_returns +``` + +### Standard +Execute the standard merge bench suite. +Results can be saved to a delta table for further analysis. +This table has the following schema: + +group_id: Used to group all tests that executed as a part of this call. Default value is the timestamp of execution +name: The benchmark name that was executed +sample: The iteration number for a given benchmark name +duration_ms: How long the benchmark took in ms +data: Free field to pack any additonal data + +``` + cargo run --release --bin merge -- standard data/web_returns 1 data/merge_results +``` + +### Compare +Compare the results of two different runs. +The a Delta table paths and the `group_id` of each run and obtain the speedup for each test case + +``` + cargo run --release --bin merge -- compare data/benchmarks/ 1698636172801 data/benchmarks/ 1699759539902 +``` + +### Show +Show all benchmarks results from a delta table + +``` + cargo run --release --bin merge -- show data/benchmark +``` diff --git a/crates/benchmarks/src/bin/merge.rs b/crates/benchmarks/src/bin/merge.rs new file mode 100644 index 0000000000..5afa3e6f35 --- /dev/null +++ b/crates/benchmarks/src/bin/merge.rs @@ -0,0 +1,647 @@ +use std::{ + sync::Arc, + time::{SystemTime, UNIX_EPOCH}, +}; + +use arrow::datatypes::Schema as ArrowSchema; +use arrow_array::{RecordBatch, StringArray, UInt32Array}; +use chrono::Duration; +use clap::{command, Args, Parser, Subcommand}; +use datafusion::{datasource::MemTable, prelude::DataFrame}; +use datafusion_common::DataFusionError; +use datafusion_expr::{cast, col, lit, random}; +use deltalake_core::protocol::SaveMode; +use deltalake_core::{ + arrow::{ + self, + datatypes::{DataType, Field}, + }, + datafusion::prelude::{CsvReadOptions, SessionContext}, + delta_datafusion::{DeltaScanConfig, DeltaTableProvider}, + operations::merge::{MergeBuilder, MergeMetrics}, + DeltaOps, DeltaTable, DeltaTableBuilder, DeltaTableError, ObjectStore, Path, +}; +use serde_json::json; +use tokio::time::Instant; + +/* Convert web_returns dataset from TPC DS's datagen utility into a Delta table + This table will be partitioned on `wr_returned_date_sk` +*/ +pub async fn convert_tpcds_web_returns(input_path: String, table_path: String) -> Result<(), ()> { + let ctx = SessionContext::new(); + + let schema = ArrowSchema::new(vec![ + Field::new("wr_returned_date_sk", DataType::Int64, true), + Field::new("wr_returned_time_sk", DataType::Int64, true), + Field::new("wr_item_sk", DataType::Int64, false), + Field::new("wr_refunded_customer_sk", DataType::Int64, true), + Field::new("wr_refunded_cdemo_sk", DataType::Int64, true), + Field::new("wr_refunded_hdemo_sk", DataType::Int64, true), + Field::new("wr_refunded_addr_sk", DataType::Int64, true), + Field::new("wr_returning_customer_sk", DataType::Int64, true), + Field::new("wr_returning_cdemo_sk", DataType::Int64, true), + Field::new("wr_returning_hdemo_sk", DataType::Int64, true), + Field::new("wr_returning_addr_sk", DataType::Int64, true), + Field::new("wr_web_page_sk", DataType::Int64, true), + Field::new("wr_reason_sk", DataType::Int64, true), + Field::new("wr_order_number", DataType::Int64, false), + Field::new("wr_return_quantity", DataType::Int32, true), + Field::new("wr_return_amt", DataType::Decimal128(7, 2), true), + Field::new("wr_return_tax", 
DataType::Decimal128(7, 2), true), + Field::new("wr_return_amt_inc_tax", DataType::Decimal128(7, 2), true), + Field::new("wr_fee", DataType::Decimal128(7, 2), true), + Field::new("wr_return_ship_cost", DataType::Decimal128(7, 2), true), + Field::new("wr_refunded_cash", DataType::Decimal128(7, 2), true), + Field::new("wr_reversed_charge", DataType::Decimal128(7, 2), true), + Field::new("wr_account_credit", DataType::Decimal128(7, 2), true), + Field::new("wr_net_loss", DataType::Decimal128(7, 2), true), + ]); + + let table = ctx + .read_csv( + input_path, + CsvReadOptions { + has_header: false, + delimiter: b'|', + file_extension: ".dat", + schema: Some(&schema), + ..Default::default() + }, + ) + .await + .unwrap(); + + DeltaOps::try_from_uri(table_path) + .await + .unwrap() + .write(table.collect().await.unwrap()) + .with_partition_columns(vec!["wr_returned_date_sk"]) + .await + .unwrap(); + + Ok(()) +} + +fn merge_upsert(source: DataFrame, table: DeltaTable) -> Result { + DeltaOps(table) + .merge(source, "source.wr_item_sk = target.wr_item_sk and source.wr_order_number = target.wr_order_number") + .with_source_alias("source") + .with_target_alias("target") + .when_matched_update(|update| { + update + .update("wr_returned_date_sk", "source.wr_returned_date_sk") + .update("wr_returned_time_sk", "source.wr_returned_time_sk") + .update("wr_item_sk", "source.wr_item_sk") + .update("wr_refunded_customer_sk", "source.wr_refunded_customer_sk") + .update("wr_refunded_cdemo_sk", "source.wr_refunded_cdemo_sk") + .update("wr_refunded_hdemo_sk", "source.wr_refunded_hdemo_sk") + .update("wr_refunded_addr_sk", "source.wr_refunded_addr_sk") + .update("wr_returning_customer_sk", "source.wr_returning_customer_sk") + .update("wr_returning_cdemo_sk", "source.wr_returning_cdemo_sk") + .update("wr_returning_hdemo_sk", "source.wr_returning_hdemo_sk") + .update("wr_returning_addr_sk", "source.wr_returning_addr_sk") + .update("wr_web_page_sk", "source.wr_web_page_sk") + .update("wr_reason_sk", "source.wr_reason_sk") + .update("wr_order_number", "source.wr_order_number") + .update("wr_return_quantity", "source.wr_return_quantity") + .update("wr_return_amt", "source.wr_return_amt") + .update("wr_return_tax", "source.wr_return_tax") + .update("wr_return_amt_inc_tax", "source.wr_return_amt_inc_tax") + .update("wr_fee", "source.wr_fee") + .update("wr_return_ship_cost", "source.wr_return_ship_cost") + .update("wr_refunded_cash", "source.wr_refunded_cash") + .update("wr_reversed_charge", "source.wr_reversed_charge") + .update("wr_account_credit", "source.wr_account_credit") + .update("wr_net_loss", "source.wr_net_loss") + })? 
+ .when_not_matched_insert(|insert| { + insert + .set("wr_returned_date_sk", "source.wr_returned_date_sk") + .set("wr_returned_time_sk", "source.wr_returned_time_sk") + .set("wr_item_sk", "source.wr_item_sk") + .set("wr_refunded_customer_sk", "source.wr_refunded_customer_sk") + .set("wr_refunded_cdemo_sk", "source.wr_refunded_cdemo_sk") + .set("wr_refunded_hdemo_sk", "source.wr_refunded_hdemo_sk") + .set("wr_refunded_addr_sk", "source.wr_refunded_addr_sk") + .set("wr_returning_customer_sk", "source.wr_returning_customer_sk") + .set("wr_returning_cdemo_sk", "source.wr_returning_cdemo_sk") + .set("wr_returning_hdemo_sk", "source.wr_returning_hdemo_sk") + .set("wr_returning_addr_sk", "source.wr_returning_addr_sk") + .set("wr_web_page_sk", "source.wr_web_page_sk") + .set("wr_reason_sk", "source.wr_reason_sk") + .set("wr_order_number", "source.wr_order_number") + .set("wr_return_quantity", "source.wr_return_quantity") + .set("wr_return_amt", "source.wr_return_amt") + .set("wr_return_tax", "source.wr_return_tax") + .set("wr_return_amt_inc_tax", "source.wr_return_amt_inc_tax") + .set("wr_fee", "source.wr_fee") + .set("wr_return_ship_cost", "source.wr_return_ship_cost") + .set("wr_refunded_cash", "source.wr_refunded_cash") + .set("wr_reversed_charge", "source.wr_reversed_charge") + .set("wr_account_credit", "source.wr_account_credit") + .set("wr_net_loss", "source.wr_net_loss") + }) +} + +fn merge_insert(source: DataFrame, table: DeltaTable) -> Result { + DeltaOps(table) + .merge(source, "source.wr_item_sk = target.wr_item_sk and source.wr_order_number = target.wr_order_number") + .with_source_alias("source") + .with_target_alias("target") + .when_not_matched_insert(|insert| { + insert + .set("wr_returned_date_sk", "source.wr_returned_date_sk") + .set("wr_returned_time_sk", "source.wr_returned_time_sk") + .set("wr_item_sk", "source.wr_item_sk") + .set("wr_refunded_customer_sk", "source.wr_refunded_customer_sk") + .set("wr_refunded_cdemo_sk", "source.wr_refunded_cdemo_sk") + .set("wr_refunded_hdemo_sk", "source.wr_refunded_hdemo_sk") + .set("wr_refunded_addr_sk", "source.wr_refunded_addr_sk") + .set("wr_returning_customer_sk", "source.wr_returning_customer_sk") + .set("wr_returning_cdemo_sk", "source.wr_returning_cdemo_sk") + .set("wr_returning_hdemo_sk", "source.wr_returning_hdemo_sk") + .set("wr_returning_addr_sk", "source.wr_returning_addr_sk") + .set("wr_web_page_sk", "source.wr_web_page_sk") + .set("wr_reason_sk", "source.wr_reason_sk") + .set("wr_order_number", "source.wr_order_number") + .set("wr_return_quantity", "source.wr_return_quantity") + .set("wr_return_amt", "source.wr_return_amt") + .set("wr_return_tax", "source.wr_return_tax") + .set("wr_return_amt_inc_tax", "source.wr_return_amt_inc_tax") + .set("wr_fee", "source.wr_fee") + .set("wr_return_ship_cost", "source.wr_return_ship_cost") + .set("wr_refunded_cash", "source.wr_refunded_cash") + .set("wr_reversed_charge", "source.wr_reversed_charge") + .set("wr_account_credit", "source.wr_account_credit") + .set("wr_net_loss", "source.wr_net_loss") + }) +} + +fn merge_delete(source: DataFrame, table: DeltaTable) -> Result { + DeltaOps(table) + .merge(source, "source.wr_item_sk = target.wr_item_sk and source.wr_order_number = target.wr_order_number") + .with_source_alias("source") + .with_target_alias("target") + .when_matched_delete(|delete| { + delete + }) +} + +async fn benchmark_merge_tpcds( + path: String, + parameters: MergePerfParams, + merge: fn(DataFrame, DeltaTable) -> Result, +) -> Result<(core::time::Duration, MergeMetrics), 
DataFusionError> { + let table = DeltaTableBuilder::from_uri(path).load().await?; + let file_count = table.state.files().len(); + + let provider = DeltaTableProvider::try_new( + table.state.clone(), + table.log_store(), + DeltaScanConfig { + file_column_name: Some("file_path".to_string()), + }, + ) + .unwrap(); + + let ctx = SessionContext::new(); + ctx.register_table("t1", Arc::new(provider))?; + + let files = ctx + .sql("select file_path as file from t1 group by file") + .await? + .with_column("r", random())? + .filter(col("r").lt_eq(lit(parameters.sample_files)))?; + + let file_sample = files.collect_partitioned().await?; + let schema = file_sample.get(0).unwrap().get(0).unwrap().schema(); + let mem_table = Arc::new(MemTable::try_new(schema, file_sample)?); + ctx.register_table("file_sample", mem_table)?; + let file_sample_count = ctx.table("file_sample").await?.count().await?; + + let row_sample = ctx.table("t1").await?.join( + ctx.table("file_sample").await?, + datafusion_common::JoinType::Inner, + &["file_path"], + &["file"], + None, + )?; + + let matched = row_sample + .clone() + .filter(random().lt_eq(lit(parameters.sample_matched_rows)))?; + + let rand = cast(random() * lit(u32::MAX), DataType::Int64); + let not_matched = row_sample + .filter(random().lt_eq(lit(parameters.sample_not_matched_rows)))? + .with_column("wr_item_sk", rand.clone())? + .with_column("wr_order_number", rand)?; + + let source = matched.union(not_matched)?; + + let start = Instant::now(); + let (table, metrics) = merge(source, table)?.await?; + let end = Instant::now(); + + let duration = end.duration_since(start); + + println!("Total File count: {}", file_count); + println!("File sample count: {}", file_sample_count); + println!("{:?}", metrics); + println!("Seconds: {}", duration.as_secs_f32()); + + // Clean up and restore to original state. + let (table, _) = DeltaOps(table).restore().with_version_to_restore(0).await?; + let (table, _) = DeltaOps(table) + .vacuum() + .with_retention_period(Duration::seconds(0)) + .with_enforce_retention_duration(false) + .await?; + table + .object_store() + .delete(&Path::parse("_delta_log/00000000000000000001.json")?) + .await?; + table + .object_store() + .delete(&Path::parse("_delta_log/00000000000000000002.json")?) 
+ .await?; + + Ok((duration, metrics)) +} + +#[derive(Subcommand, Debug)] +enum Command { + Convert(Convert), + Bench(BenchArg), + Standard(Standard), + Compare(Compare), + Show(Show), +} + +#[derive(Debug, Args)] +struct Convert { + tpcds_path: String, + delta_path: String, +} + +#[derive(Debug, Args)] +struct Standard { + delta_path: String, + samples: Option, + output_path: Option, + group_id: Option, +} + +#[derive(Debug, Args)] +struct Compare { + before_path: String, + before_group_id: String, + after_path: String, + after_group_id: String, +} + +#[derive(Debug, Args)] +struct Show { + path: String, +} + +#[derive(Debug, Args)] +struct BenchArg { + table_path: String, + #[command(subcommand)] + name: MergeBench, +} + +struct Bench { + name: String, + op: fn(DataFrame, DeltaTable) -> Result, + params: MergePerfParams, +} + +impl Bench { + fn new( + name: S, + op: fn(DataFrame, DeltaTable) -> Result, + params: MergePerfParams, + ) -> Self { + Bench { + name: name.to_string(), + op, + params, + } + } +} + +#[derive(Debug, Args, Clone)] +struct MergePerfParams { + pub sample_files: f32, + pub sample_matched_rows: f32, + pub sample_not_matched_rows: f32, +} + +#[derive(Debug, Clone, Subcommand)] +enum MergeBench { + Upsert(MergePerfParams), + Delete(MergePerfParams), + Insert(MergePerfParams), +} + +#[derive(Parser, Debug)] +#[command(about)] +struct MergePrefArgs { + #[command(subcommand)] + command: Command, +} + +#[tokio::main] +async fn main() { + match MergePrefArgs::parse().command { + Command::Convert(Convert { + tpcds_path, + delta_path, + }) => { + convert_tpcds_web_returns(tpcds_path, delta_path) + .await + .unwrap(); + } + Command::Bench(BenchArg { table_path, name }) => { + let (merge_op, params): ( + fn(DataFrame, DeltaTable) -> Result, + MergePerfParams, + ) = match name { + MergeBench::Upsert(params) => (merge_upsert, params), + MergeBench::Delete(params) => (merge_delete, params), + MergeBench::Insert(params) => (merge_insert, params), + }; + + benchmark_merge_tpcds(table_path, params, merge_op) + .await + .unwrap(); + } + Command::Standard(Standard { + delta_path, + samples, + output_path, + group_id, + }) => { + let benches = vec![Bench::new( + "delete_only_fileMatchedFraction_0.05_rowMatchedFraction_0.05", + merge_delete, + MergePerfParams { + sample_files: 0.05, + sample_matched_rows: 0.05, + sample_not_matched_rows: 0.0, + }, + ), + Bench::new( + "multiple_insert_only_fileMatchedFraction_0.05_rowNotMatchedFraction_0.05", + merge_insert, + MergePerfParams { + sample_files: 0.05, + sample_matched_rows: 0.00, + sample_not_matched_rows: 0.05, + }, + ), + Bench::new( + "multiple_insert_only_fileMatchedFraction_0.05_rowNotMatchedFraction_0.50", + merge_insert, + MergePerfParams { + sample_files: 0.05, + sample_matched_rows: 0.00, + sample_not_matched_rows: 0.50, + }, + ), + Bench::new( + "multiple_insert_only_fileMatchedFraction_0.05_rowNotMatchedFraction_1.0", + merge_insert, + MergePerfParams { + sample_files: 0.05, + sample_matched_rows: 0.00, + sample_not_matched_rows: 1.0, + }, + ), + Bench::new( + "upsert_fileMatchedFraction_0.05_rowMatchedFraction_0.01_rowNotMatchedFraction_0.1", + merge_upsert, + MergePerfParams { + sample_files: 0.05, + sample_matched_rows: 0.01, + sample_not_matched_rows: 0.1, + }, + ), + Bench::new( + "upsert_fileMatchedFraction_0.05_rowMatchedFraction_0.0_rowNotMatchedFraction_0.1", + merge_upsert, + MergePerfParams { + sample_files: 0.05, + sample_matched_rows: 0.00, + sample_not_matched_rows: 0.1, + }, + ), + Bench::new( + 
"upsert_fileMatchedFraction_0.05_rowMatchedFraction_0.1_rowNotMatchedFraction_0.0", + merge_upsert, + MergePerfParams { + sample_files: 0.05, + sample_matched_rows: 0.1, + sample_not_matched_rows: 0.0, + }, + ), + Bench::new( + "upsert_fileMatchedFraction_0.05_rowMatchedFraction_0.1_rowNotMatchedFraction_0.01", + merge_upsert, + MergePerfParams { + sample_files: 0.05, + sample_matched_rows: 0.1, + sample_not_matched_rows: 0.01, + }, + ), + Bench::new( + "upsert_fileMatchedFraction_0.05_rowMatchedFraction_0.5_rowNotMatchedFraction_0.001", + merge_upsert, + MergePerfParams { + sample_files: 0.05, + sample_matched_rows: 0.5, + sample_not_matched_rows: 0.001, + }, + ), + Bench::new( + "upsert_fileMatchedFraction_0.05_rowMatchedFraction_0.99_rowNotMatchedFraction_0.001", + merge_upsert, + MergePerfParams { + sample_files: 0.05, + sample_matched_rows: 0.99, + sample_not_matched_rows: 0.001, + }, + ), + Bench::new( + "upsert_fileMatchedFraction_0.05_rowMatchedFraction_1.0_rowNotMatchedFraction_0.001", + merge_upsert, + MergePerfParams { + sample_files: 0.05, + sample_matched_rows: 1.0, + sample_not_matched_rows: 0.001, + }, + ), + Bench::new( + "upsert_fileMatchedFraction_0.5_rowMatchedFraction_0.001_rowNotMatchedFraction_0.001", + merge_upsert, + MergePerfParams { + sample_files: 0.5, + sample_matched_rows: 0.001, + sample_not_matched_rows: 0.001, + }, + ), + Bench::new( + "upsert_fileMatchedFraction_1.0_rowMatchedFraction_0.001_rowNotMatchedFraction_0.001", + merge_upsert, + MergePerfParams { + sample_files: 1.0, + sample_matched_rows: 0.001, + sample_not_matched_rows: 0.001, + }, + ) + ]; + + let num_samples = samples.unwrap_or(1); + let group_id = group_id.unwrap_or( + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_millis() + .to_string(), + ); + let output = output_path.unwrap_or("data/benchmarks".into()); + + let mut group_ids = vec![]; + let mut name = vec![]; + let mut samples = vec![]; + let mut duration_ms = vec![]; + let mut data = vec![]; + + for bench in benches { + for sample in 0..num_samples { + println!("Test: {} Sample: {}", bench.name, sample); + let res = + benchmark_merge_tpcds(delta_path.clone(), bench.params.clone(), bench.op) + .await + .unwrap(); + + group_ids.push(group_id.clone()); + name.push(bench.name.clone()); + samples.push(sample); + duration_ms.push(res.0.as_millis() as u32); + data.push(json!(res.1).to_string()); + } + } + + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("group_id", DataType::Utf8, false), + Field::new("name", DataType::Utf8, false), + Field::new("sample", DataType::UInt32, false), + Field::new("duration_ms", DataType::UInt32, false), + Field::new("data", DataType::Utf8, true), + ])); + + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(StringArray::from(group_ids)), + Arc::new(StringArray::from(name)), + Arc::new(UInt32Array::from(samples)), + Arc::new(UInt32Array::from(duration_ms)), + Arc::new(StringArray::from(data)), + ], + ) + .unwrap(); + + DeltaOps::try_from_uri(output) + .await + .unwrap() + .write(vec![batch]) + .with_save_mode(SaveMode::Append) + .await + .unwrap(); + } + Command::Compare(Compare { + before_path, + before_group_id, + after_path, + after_group_id, + }) => { + let before_table = DeltaTableBuilder::from_uri(before_path) + .load() + .await + .unwrap(); + let after_table = DeltaTableBuilder::from_uri(after_path) + .load() + .await + .unwrap(); + + let ctx = SessionContext::new(); + ctx.register_table("before", Arc::new(before_table)) + .unwrap(); + ctx.register_table("after", 
Arc::new(after_table)).unwrap(); + + let before_stats = ctx + .sql(&format!( + " + select name as before_name, + avg(cast(duration_ms as float)) as before_duration_avg + from before where group_id = {} + group by name + ", + before_group_id + )) + .await + .unwrap(); + + let after_stats = ctx + .sql(&format!( + " + select name as after_name, + avg(cast(duration_ms as float)) as after_duration_avg + from after where group_id = {} + group by name + ", + after_group_id + )) + .await + .unwrap(); + + before_stats + .join( + after_stats, + datafusion_common::JoinType::Inner, + &["before_name"], + &["after_name"], + None, + ) + .unwrap() + .select(vec![ + col("before_name").alias("name"), + col("before_duration_avg"), + col("after_duration_avg"), + (col("before_duration_avg") / (col("after_duration_avg"))), + ]) + .unwrap() + .sort(vec![col("name").sort(true, true)]) + .unwrap() + .show() + .await + .unwrap(); + } + Command::Show(Show { path }) => { + let stats = DeltaTableBuilder::from_uri(path).load().await.unwrap(); + let ctx = SessionContext::new(); + ctx.register_table("stats", Arc::new(stats)).unwrap(); + + ctx.sql("select * from stats") + .await + .unwrap() + .show() + .await + .unwrap(); + } + } +} From 1e0d94e29c9c694d89f072458ab6d9c60ab0b738 Mon Sep 17 00:00:00 2001 From: Nikolay Ulmasov Date: Sun, 19 Nov 2023 16:20:00 +0000 Subject: [PATCH 23/23] resolve #1860 Signed-off-by: Nikolay Ulmasov --- crates/deltalake-core/src/data_catalog/mod.rs | 12 ++++++------ crates/deltalake-core/src/lib.rs | 5 +++++ python/src/lib.rs | 4 +--- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/crates/deltalake-core/src/data_catalog/mod.rs b/crates/deltalake-core/src/data_catalog/mod.rs index 911d4292f8..db05e429bb 100644 --- a/crates/deltalake-core/src/data_catalog/mod.rs +++ b/crates/deltalake-core/src/data_catalog/mod.rs @@ -7,7 +7,7 @@ pub use unity::*; #[cfg(feature = "unity-experimental")] pub mod client; -#[cfg(feature = "glue")] +#[cfg(any(feature = "glue", feature = "glue-native-tls"))] pub mod glue; #[cfg(feature = "datafusion")] pub mod storage; @@ -49,7 +49,7 @@ pub enum DataCatalogError { }, /// Missing metadata in the catalog - #[cfg(feature = "glue")] + #[cfg(any(feature = "glue", feature = "glue-native-tls"))] #[error("Missing Metadata {metadata} in the Data Catalog ")] MissingMetadata { /// The missing metadata property @@ -57,7 +57,7 @@ pub enum DataCatalogError { }, /// Glue Glue Data Catalog Error - #[cfg(feature = "glue")] + #[cfg(any(feature = "glue", feature = "glue-native-tls"))] #[error("Catalog glue error: {source}")] GlueError { /// The underlying Glue Data Catalog Error @@ -66,7 +66,7 @@ pub enum DataCatalogError { }, /// Error caused by the http request dispatcher not being able to be created. - #[cfg(feature = "glue")] + #[cfg(any(feature = "glue", feature = "glue-native-tls"))] #[error("Failed to create request dispatcher: {source}")] AWSHttpClient { /// The underlying Rusoto TlsError @@ -75,7 +75,7 @@ pub enum DataCatalogError { }, /// Error representing a failure to retrieve AWS credentials. 
- #[cfg(feature = "glue")] + #[cfg(any(feature = "glue", feature = "glue-native-tls"))] #[error("Failed to retrieve AWS credentials: {source}")] AWSCredentials { /// The underlying Rusoto CredentialsError @@ -138,7 +138,7 @@ pub fn get_data_catalog( "azure" => unimplemented!("Azure Data Catalog is not implemented"), #[cfg(feature = "hdfs")] "hdfs" => unimplemented!("HDFS Data Catalog is not implemented"), - #[cfg(feature = "glue")] + #[cfg(any(feature = "glue", feature = "glue-native-tls"))] "glue" => Ok(Box::new(glue::GlueDataCatalog::new()?)), #[cfg(feature = "unity-experimental")] "unity" => { diff --git a/crates/deltalake-core/src/lib.rs b/crates/deltalake-core/src/lib.rs index 644da2dcac..dfdbac97d6 100644 --- a/crates/deltalake-core/src/lib.rs +++ b/crates/deltalake-core/src/lib.rs @@ -82,6 +82,11 @@ compile_error!( "Features s3 and s3-native-tls are mutually exclusive and cannot be enabled together" ); +#[cfg(all(feature = "glue", feature = "glue-native-tls"))] +compile_error!( + "Features glue and glue-native-tls are mutually exclusive and cannot be enabled together" +); + pub mod data_catalog; pub mod errors; pub mod kernel; diff --git a/python/src/lib.rs b/python/src/lib.rs index b9067dfec9..69195e866d 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -134,9 +134,7 @@ impl RawDeltaTable { catalog_options: Option>, ) -> PyResult { let data_catalog = deltalake::data_catalog::get_data_catalog(data_catalog, catalog_options) - .map_err(|_| { - PyValueError::new_err(format!("Catalog '{}' not available.", data_catalog)) - })?; + .map_err(|e| PyValueError::new_err(format!("{}", e)))?; let table_uri = rt()? .block_on(data_catalog.get_table_storage_location( data_catalog_id,