From cb20ad782ac905b916ab683164f952eb37cf1167 Mon Sep 17 00:00:00 2001 From: Matthew Powers Date: Tue, 26 Dec 2023 15:43:51 -0500 Subject: [PATCH 01/29] docs: datafusion integration --- docs/integrations/delta-lake-arrow.md | 2 +- docs/integrations/delta-lake-datafusion.md | 80 ++++++++++++++++++++++ 2 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 docs/integrations/delta-lake-datafusion.md diff --git a/docs/integrations/delta-lake-arrow.md b/docs/integrations/delta-lake-arrow.md index 6da4d5fcc2..70965e3b74 100644 --- a/docs/integrations/delta-lake-arrow.md +++ b/docs/integrations/delta-lake-arrow.md @@ -2,7 +2,7 @@ Delta Lake tables can be exposed as Arrow tables and Arrow datasets, which allows for interoperability with a variety of query engines. -This page shows you how to convert Delta tables to Arrow data structures and teaches you the difference between Arrow tables and Arrow datasets. +This page shows you how to convert Delta tables to Arrow data structures and teaches you the difference between Arrow tables and Arrow datasets. Tables are "eager" and datasets are "lazy", which has important performance implications, keep reading to learn more! ## Delta Lake to Arrow Dataset diff --git a/docs/integrations/delta-lake-datafusion.md b/docs/integrations/delta-lake-datafusion.md new file mode 100644 index 0000000000..c8fab3e35d --- /dev/null +++ b/docs/integrations/delta-lake-datafusion.md @@ -0,0 +1,80 @@ +# Using Delta Lake with DataFusion + +This page explains how to use Delta Lake with DataFusion. + +Delta Lake offers DataFusion users better performance and more features compared to other formats like CSV or Parquet. + +Delta Lake works well with the DataFusion Rust API and the DataFusion Python API. It's a great option for all DataFusion users. + +Delta Lake also depends on DataFusion to implement some functionality under the hood. We will also discuss this dependency at the end of this guide in case you're interested in learning more about the symbiotic relationship between the two libraries. + +## Delta Lake performance benefits for DataFusion users + +Let's run some DataFusion queries on a Parquet file and a Delta table with the same data to learn more about the performance benefits of Delta Lake. + +Suppose you have the following dataset with 1 billion rows and 9 columns. Here are the first three rows of data: + +``` ++-------+-------+--------------+-------+-------+--------+------+------+---------+ +| id1 | id2 | id3 | id4 | id5 | id6 | v1 | v2 | v3 | +|-------+-------+--------------+-------+-------+--------+------+------+---------| +| id016 | id046 | id0000109363 | 88 | 13 | 146094 | 4 | 6 | 18.8377 | +| id039 | id087 | id0000466766 | 14 | 30 | 111330 | 4 | 14 | 46.7973 | +| id047 | id098 | id0000307804 | 85 | 23 | 187639 | 3 | 5 | 47.5773 | ++-------+-------+--------------+-------+-------+--------+------+------+---------+ +``` + +Here's how to register a Delta Lake table as a PyArrow dataset: + +```python +from datafusion import SessionContext +from deltalake import DeltaTable + +ctx = SessionContext() +table = DeltaTable("G1_1e9_1e2_0_0") +ctx.register_dataset("my_delta_table", table.to_pyarrow_dataset()) +``` + +Now query the table: + +```python +ctx.sql("select id1, sum(v1) as v1 from my_delta_table where id1='id096' group by id1") +``` + +That query takes 2.8 seconds to execute. + +Let's register the same dataset as a Parquet table, run the same query, and compare the runtime difference. 
+ +Register the Parquet table and run the query: + +```python +path = "G1_1e9_1e2_0_0.parquet" +ctx.register_parquet("my_parquet_table", path) +ctx.sql("select id1, sum(v1) as v1 from my_parquet_table where id1='id096' group by id1") +``` + +This query takes 5.3 seconds to run. + +Parquet stores data in row groups and DataFusion can intelligently skip row groups that don't contain relevant data, so the query is faster than a file format like CSV which doesn't support row group skipping. + +Delta Lake stores file-level metadata information in the transaction log, so it can skip entire files when queries are executed. Delta Lake can skip entire files and then skip row groups within the individual files. This makes Delta Lake even faster than Parquet files, especially for larger datasets spread across many files. + +## Delta Lake features for DataFusion users + +Delta Lake also provides other features that are useful for DataFusion users like ACID transactions, concurrency protection, time travel, versioned data, and more. + +## Why Delta Lake depends on DataFusion + +Delta Lake depends on DataFusion to provide some end-user features. + +TODO: Explain how Delta Lake uses DataFusion to provide features. + +## Conclusion + +Delta Lake is a great file format for DataFusion users. + +Delta Lake also uses DataFusion to provide some end-user features. + +DataFusion and Delta Lake have a wonderful symbiotic relationship and play very nicely with each other. + +See [this guide for more information on Delta Lake and PyArrow](https://delta-io.github.io/delta-rs/integrations/delta-lake-arrow/) and why PyArrow Datasets are often a better option than PyArrow tables. From 6da3b3bf810bb372d6d9d6e3528da0453da9391e Mon Sep 17 00:00:00 2001 From: Matthew Powers Date: Tue, 26 Dec 2023 17:01:54 -0500 Subject: [PATCH 02/29] docs: explain why delta-rs depends on datafusion --- docs/integrations/delta-lake-datafusion.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/docs/integrations/delta-lake-datafusion.md b/docs/integrations/delta-lake-datafusion.md index c8fab3e35d..a9c2a3cae0 100644 --- a/docs/integrations/delta-lake-datafusion.md +++ b/docs/integrations/delta-lake-datafusion.md @@ -6,7 +6,7 @@ Delta Lake offers DataFusion users better performance and more features compared Delta Lake works well with the DataFusion Rust API and the DataFusion Python API. It's a great option for all DataFusion users. -Delta Lake also depends on DataFusion to implement some functionality under the hood. We will also discuss this dependency at the end of this guide in case you're interested in learning more about the symbiotic relationship between the two libraries. +Delta Lake also depends on DataFusion to implement SQL-related functionality under the hood. We will also discuss this dependency at the end of this guide in case you're interested in learning more about the symbiotic relationship between the two libraries. ## Delta Lake performance benefits for DataFusion users @@ -67,7 +67,12 @@ Delta Lake also provides other features that are useful for DataFusion users lik Delta Lake depends on DataFusion to provide some end-user features. -TODO: Explain how Delta Lake uses DataFusion to provide features. +DataFusion is useful in providing SQL-related Delta Lake features. Some examples: + +* Update and merge are written in terms of SQL expressions. +* Invariants and constraints are written in terms of SQL expressions. + +Anytime we have to evaluate SQL, we need some sort of SQL engine. 
We use DataFusion for that. ## Conclusion From 74f9d334ce9268e553f1efd7020d4bfc07643719 Mon Sep 17 00:00:00 2001 From: David Blajda Date: Fri, 29 Dec 2023 20:02:28 -0500 Subject: [PATCH 03/29] feat: omit unmodified files during merge write (#1969) # Description Implements a new Datafusion node called `MergeBarrier` that determines which files have modifications. For files that do not have modifications a remove action is no longer created. # Related Issue(s) - enhances #850 --- crates/benchmarks/src/bin/merge.rs | 8 + .../src/delta_datafusion/logical.rs | 16 + .../src/delta_datafusion/mod.rs | 56 +- .../src/operations/merge/barrier.rs | 675 ++++++++++++++++++ .../src/operations/{merge.rs => merge/mod.rs} | 147 ++-- 5 files changed, 821 insertions(+), 81 deletions(-) create mode 100644 crates/deltalake-core/src/operations/merge/barrier.rs rename crates/deltalake-core/src/operations/{merge.rs => merge/mod.rs} (96%) diff --git a/crates/benchmarks/src/bin/merge.rs b/crates/benchmarks/src/bin/merge.rs index d3acb80c0a..affae8b7dd 100644 --- a/crates/benchmarks/src/bin/merge.rs +++ b/crates/benchmarks/src/bin/merge.rs @@ -265,6 +265,14 @@ async fn benchmark_merge_tpcds( .object_store() .delete(&Path::parse("_delta_log/00000000000000000002.json")?) .await?; + table + .object_store() + .delete(&Path::parse("_delta_log/00000000000000000003.json")?) + .await?; + let _ = table + .object_store() + .delete(&Path::parse("_delta_log/00000000000000000004.json")?) + .await; Ok((duration, metrics)) } diff --git a/crates/deltalake-core/src/delta_datafusion/logical.rs b/crates/deltalake-core/src/delta_datafusion/logical.rs index 7b05dd57d9..75ed53d1b1 100644 --- a/crates/deltalake-core/src/delta_datafusion/logical.rs +++ b/crates/deltalake-core/src/delta_datafusion/logical.rs @@ -1,5 +1,7 @@ //! Logical Operations for DataFusion +use std::collections::HashSet; + use datafusion_expr::{LogicalPlan, UserDefinedLogicalNodeCore}; // Metric Observer is used to update DataFusion metrics from a record batch. 
@@ -10,6 +12,7 @@ pub(crate) struct MetricObserver { // id is preserved during conversion to physical node pub id: String, pub input: LogicalPlan, + pub enable_pushdown: bool, } impl UserDefinedLogicalNodeCore for MetricObserver { @@ -35,6 +38,18 @@ impl UserDefinedLogicalNodeCore for MetricObserver { write!(f, "MetricObserver id={}", &self.id) } + fn prevent_predicate_push_down_columns(&self) -> HashSet { + if self.enable_pushdown { + HashSet::new() + } else { + self.schema() + .fields() + .iter() + .map(|f| f.name().clone()) + .collect() + } + } + fn from_template( &self, _exprs: &[datafusion_expr::Expr], @@ -43,6 +58,7 @@ impl UserDefinedLogicalNodeCore for MetricObserver { MetricObserver { id: self.id.clone(), input: inputs[0].clone(), + enable_pushdown: self.enable_pushdown, } } } diff --git a/crates/deltalake-core/src/delta_datafusion/mod.rs b/crates/deltalake-core/src/delta_datafusion/mod.rs index 5890401e67..17d04c692a 100644 --- a/crates/deltalake-core/src/delta_datafusion/mod.rs +++ b/crates/deltalake-core/src/delta_datafusion/mod.rs @@ -32,7 +32,7 @@ use arrow::datatypes::{DataType as ArrowDataType, Schema as ArrowSchema, SchemaR use arrow::error::ArrowError; use arrow::record_batch::RecordBatch; use arrow_array::types::UInt16Type; -use arrow_array::{Array, DictionaryArray, StringArray}; +use arrow_array::{Array, DictionaryArray, StringArray, TypedDictionaryArray}; use arrow_cast::display::array_value_to_string; use arrow_schema::Field; @@ -132,6 +132,21 @@ fn get_scalar_value(value: Option<&ColumnValueStat>, field: &Arc) -> Prec } } +pub(crate) fn get_path_column<'a>( + batch: &'a RecordBatch, + path_column: &str, +) -> DeltaResult> { + let err = || DeltaTableError::Generic("Unable to obtain Delta-rs path column".to_string()); + batch + .column_by_name(path_column) + .unwrap() + .as_any() + .downcast_ref::>() + .ok_or_else(err)? + .downcast_dict::() + .ok_or_else(err) +} + impl DeltaTableState { /// Provide table level statistics to Datafusion pub fn datafusion_table_statistics(&self) -> DataFusionResult { @@ -1362,31 +1377,20 @@ fn join_batches_with_add_actions( let mut files = Vec::with_capacity(batches.iter().map(|batch| batch.num_rows()).sum()); for batch in batches { - let array = batch.column_by_name(path_column).ok_or_else(|| { - DeltaTableError::Generic(format!("Unable to find column {}", path_column)) - })?; - - let iter: Box>> = - if dict_array { - let array = array - .as_any() - .downcast_ref::>() - .ok_or(DeltaTableError::Generic(format!( - "Unable to downcast column {}", - path_column - )))? - .downcast_dict::() - .ok_or(DeltaTableError::Generic(format!( - "Unable to downcast column {}", - path_column - )))?; - Box::new(array.into_iter()) - } else { - let array = array.as_any().downcast_ref::().ok_or( - DeltaTableError::Generic(format!("Unable to downcast column {}", path_column)), - )?; - Box::new(array.into_iter()) - }; + let err = || DeltaTableError::Generic("Unable to obtain Delta-rs path column".to_string()); + + let iter: Box>> = if dict_array { + let array = get_path_column(&batch, path_column)?; + Box::new(array.into_iter()) + } else { + let array = batch + .column_by_name(path_column) + .ok_or_else(err)? 
+ .as_any() + .downcast_ref::() + .ok_or_else(err)?; + Box::new(array.into_iter()) + }; for path in iter { let path = path.ok_or(DeltaTableError::Generic(format!( diff --git a/crates/deltalake-core/src/operations/merge/barrier.rs b/crates/deltalake-core/src/operations/merge/barrier.rs new file mode 100644 index 0000000000..6883f61253 --- /dev/null +++ b/crates/deltalake-core/src/operations/merge/barrier.rs @@ -0,0 +1,675 @@ +//! Merge Barrier determines which files have modifications during the merge operation +//! +//! For every unique path in the input stream, a barrier is established. If any +//! single record for a file contains any delete, update, or insert operations +//! then the barrier for the file is opened and can be sent downstream. +//! To determine if a file contains zero changes, the input stream is +//! exhausted. Afterwards, records are then dropped. +//! +//! Bookkeeping is maintained to determine which files have modifications so +//! they can be removed from the delta log. + +use std::{ + collections::{HashMap, HashSet}, + pin::Pin, + sync::{Arc, Mutex}, + task::{Context, Poll}, +}; + +use arrow_array::{builder::UInt64Builder, ArrayRef, RecordBatch}; +use arrow_schema::SchemaRef; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionPlan, RecordBatchStream, SendableRecordBatchStream, +}; +use datafusion_common::{DataFusionError, Result as DataFusionResult}; +use datafusion_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion_physical_expr::{Distribution, PhysicalExpr}; +use futures::{Stream, StreamExt}; + +use crate::{ + delta_datafusion::get_path_column, + operations::merge::{TARGET_DELETE_COLUMN, TARGET_INSERT_COLUMN, TARGET_UPDATE_COLUMN}, + DeltaTableError, +}; + +pub(crate) type BarrierSurvivorSet = Arc>>; + +#[derive(Debug)] +/// Physical Node for the MergeBarrier +/// Batches to this node must be repartitioned on col('deleta_rs_path'). 
+/// Each record batch then undergoes further partitioning based on the file column to it's corresponding barrier +pub struct MergeBarrierExec { + input: Arc, + file_column: Arc, + survivors: BarrierSurvivorSet, + expr: Arc, +} + +impl MergeBarrierExec { + /// Create a new MergeBarrierExec Node + pub fn new( + input: Arc, + file_column: Arc, + expr: Arc, + ) -> Self { + MergeBarrierExec { + input, + file_column, + survivors: Arc::new(Mutex::new(HashSet::new())), + expr, + } + } + + /// Files that have modifications to them and need to removed from the delta log + pub fn survivors(&self) -> BarrierSurvivorSet { + self.survivors.clone() + } +} + +impl ExecutionPlan for MergeBarrierExec { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn schema(&self) -> arrow_schema::SchemaRef { + self.input.schema() + } + + fn output_partitioning(&self) -> datafusion_physical_expr::Partitioning { + self.input.output_partitioning() + } + + fn required_input_distribution(&self) -> Vec { + vec![Distribution::HashPartitioned(vec![self.expr.clone()]); 1] + } + + fn output_ordering(&self) -> Option<&[datafusion_physical_expr::PhysicalSortExpr]> { + None + } + + fn children(&self) -> Vec> { + vec![self.input.clone()] + } + + fn with_new_children( + self: std::sync::Arc, + children: Vec>, + ) -> datafusion_common::Result> { + Ok(Arc::new(MergeBarrierExec::new( + children[0].clone(), + self.file_column.clone(), + self.expr.clone(), + ))) + } + + fn execute( + &self, + partition: usize, + context: std::sync::Arc, + ) -> datafusion_common::Result { + let input = self.input.execute(partition, context)?; + Ok(Box::pin(MergeBarrierStream::new( + input, + self.schema(), + self.survivors.clone(), + self.file_column.clone(), + ))) + } +} + +impl DisplayAs for MergeBarrierExec { + fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + write!(f, "MergeBarrier",)?; + Ok(()) + } + } + } +} + +#[derive(Debug)] +enum State { + Feed, + Drain, + Finalize, + Abort, + Done, +} + +#[derive(Debug)] +enum PartitionBarrierState { + Closed, + Open, +} + +#[derive(Debug)] +struct MergeBarrierPartition { + state: PartitionBarrierState, + buffer: Vec, + file_name: Option, +} + +impl MergeBarrierPartition { + pub fn new(file_name: Option) -> Self { + MergeBarrierPartition { + state: PartitionBarrierState::Closed, + buffer: Vec::new(), + file_name, + } + } + + pub fn feed(&mut self, batch: RecordBatch) -> DataFusionResult<()> { + match self.state { + PartitionBarrierState::Closed => { + let delete_count = get_count(&batch, TARGET_DELETE_COLUMN)?; + let update_count = get_count(&batch, TARGET_UPDATE_COLUMN)?; + let insert_count = get_count(&batch, TARGET_INSERT_COLUMN)?; + self.buffer.push(batch); + + if insert_count > 0 || update_count > 0 || delete_count > 0 { + self.state = PartitionBarrierState::Open; + } + } + PartitionBarrierState::Open => { + self.buffer.push(batch); + } + } + Ok(()) + } + + pub fn drain(&mut self) -> Option { + match self.state { + PartitionBarrierState::Closed => None, + PartitionBarrierState::Open => self.buffer.pop(), + } + } +} + +struct MergeBarrierStream { + schema: SchemaRef, + state: State, + input: SendableRecordBatchStream, + file_column: Arc, + survivors: BarrierSurvivorSet, + map: HashMap, + file_partitions: Vec, +} + +impl MergeBarrierStream { + pub fn new( + input: SendableRecordBatchStream, + schema: SchemaRef, + survivors: BarrierSurvivorSet, + file_column: Arc, + ) -> Self { + // 
Always allocate for a null bucket at index 0; + let file_partitions = vec![MergeBarrierPartition::new(None)]; + + MergeBarrierStream { + schema, + state: State::Feed, + input, + file_column, + survivors, + file_partitions, + map: HashMap::new(), + } + } +} + +fn get_count(batch: &RecordBatch, column: &str) -> DataFusionResult { + batch + .column_by_name(column) + .map(|array| array.null_count()) + .ok_or_else(|| { + DataFusionError::External(Box::new(DeltaTableError::Generic( + "Required operation column is missing".to_string(), + ))) + }) +} + +impl Stream for MergeBarrierStream { + type Item = DataFusionResult; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + loop { + match self.state { + State::Feed => { + match self.input.poll_next_unpin(cx) { + Poll::Ready(Some(Ok(batch))) => { + let file_dictionary = get_path_column(&batch, &self.file_column)?; + + // For each record batch, the key for a file path is not stable. + // We can iterate through the dictionary and lookup the correspond string for each record and then lookup the correct `file_partition` for that value. + // However this approach exposes the cost of hashing so we want to minimize that as much as possible. + // A map from an arrow dictionary key to the correct index of `file_partition` is created for each batch that's processed. + // This ensures we only need to hash each file path at most once per batch. + let mut key_map = Vec::new(); + + for file_name in file_dictionary.values().into_iter() { + let key = match file_name { + Some(name) => { + if !self.map.contains_key(name) { + let key = self.file_partitions.len(); + let part_stream = + MergeBarrierPartition::new(Some(name.to_string())); + self.file_partitions.push(part_stream); + self.map.insert(name.to_string(), key); + } + // Safe unwrap due to the above + *self.map.get(name).unwrap() + } + None => 0, + }; + key_map.push(key) + } + + let mut indices: Vec<_> = (0..(self.file_partitions.len())) + .map(|_| UInt64Builder::with_capacity(batch.num_rows())) + .collect(); + + for (idx, key) in file_dictionary.keys().iter().enumerate() { + match key { + Some(value) => { + indices[key_map[value as usize]].append_value(idx as u64) + } + None => indices[0].append_value(idx as u64), + } + } + + let batches: Vec> = + indices + .into_iter() + .enumerate() + .filter_map(|(partition, mut indices)| { + let indices = indices.finish(); + (!indices.is_empty()).then_some((partition, indices)) + }) + .map(move |(partition, indices)| { + // Produce batches based on indices + let columns = batch + .columns() + .iter() + .map(|c| { + arrow::compute::take(c.as_ref(), &indices, None) + .map_err(DataFusionError::ArrowError) + }) + .collect::>>()?; + + // This unwrap is safe since the processed batched has the same schema + let batch = + RecordBatch::try_new(batch.schema(), columns).unwrap(); + + Ok((partition, batch)) + }) + .collect(); + + for batch in batches { + match batch { + Ok((partition, batch)) => { + self.file_partitions[partition].feed(batch)?; + } + Err(err) => { + self.state = State::Abort; + return Poll::Ready(Some(Err(err))); + } + } + } + + self.state = State::Drain; + continue; + } + Poll::Ready(Some(Err(err))) => { + self.state = State::Abort; + return Poll::Ready(Some(Err(err))); + } + Poll::Ready(None) => { + self.state = State::Finalize; + continue; + } + Poll::Pending => return Poll::Pending, + } + } + State::Drain => { + for part in &mut self.file_partitions { + if let Some(batch) = part.drain() { + return Poll::Ready(Some(Ok(batch))); + } + } + + 
self.state = State::Feed; + continue; + } + State::Finalize => { + for part in &mut self.file_partitions { + if let Some(batch) = part.drain() { + return Poll::Ready(Some(Ok(batch))); + } + } + + { + let mut lock = self.survivors.lock().map_err(|_| { + DataFusionError::External(Box::new(DeltaTableError::Generic( + "MergeBarrier mutex is poisoned".to_string(), + ))) + })?; + for part in &self.file_partitions { + match part.state { + PartitionBarrierState::Closed => {} + PartitionBarrierState::Open => { + if let Some(file_name) = &part.file_name { + lock.insert(file_name.to_owned()); + } + } + } + } + } + + self.state = State::Done; + continue; + } + State::Abort => return Poll::Ready(None), + State::Done => return Poll::Ready(None), + } + } + } + + fn size_hint(&self) -> (usize, Option) { + (0, self.input.size_hint().1) + } +} + +impl RecordBatchStream for MergeBarrierStream { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} + +#[derive(Debug, Hash, Eq, PartialEq)] +pub(crate) struct MergeBarrier { + pub input: LogicalPlan, + pub expr: Expr, + pub file_column: Arc, +} + +impl UserDefinedLogicalNodeCore for MergeBarrier { + fn name(&self) -> &str { + "MergeBarrier" + } + + fn inputs(&self) -> Vec<&datafusion_expr::LogicalPlan> { + vec![&self.input] + } + + fn schema(&self) -> &datafusion_common::DFSchemaRef { + self.input.schema() + } + + fn expressions(&self) -> Vec { + vec![self.expr.clone()] + } + + fn fmt_for_explain(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "MergeBarrier") + } + + fn from_template( + &self, + exprs: &[datafusion_expr::Expr], + inputs: &[datafusion_expr::LogicalPlan], + ) -> Self { + MergeBarrier { + input: inputs[0].clone(), + file_column: self.file_column.clone(), + expr: exprs[0].clone(), + } + } +} + +pub(crate) fn find_barrier_node(parent: &Arc) -> Option> { + //! 
Used to locate the physical Barrier Node after the planner converts the logical node + if parent.as_any().downcast_ref::().is_some() { + return Some(parent.to_owned()); + } + + for child in &parent.children() { + let res = find_barrier_node(child); + if res.is_some() { + return res; + } + } + + None +} + +#[cfg(test)] +mod tests { + use crate::operations::merge::MergeBarrierExec; + use crate::operations::merge::{ + TARGET_DELETE_COLUMN, TARGET_INSERT_COLUMN, TARGET_UPDATE_COLUMN, + }; + use arrow::datatypes::Schema as ArrowSchema; + use arrow_array::RecordBatch; + use arrow_array::StringArray; + use arrow_array::{DictionaryArray, UInt16Array}; + use arrow_schema::DataType as ArrowDataType; + use arrow_schema::Field; + use datafusion::assert_batches_sorted_eq; + use datafusion::execution::TaskContext; + use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec; + use datafusion::physical_plan::memory::MemoryExec; + use datafusion::physical_plan::ExecutionPlan; + use datafusion_physical_expr::expressions::Column; + use futures::StreamExt; + use std::sync::Arc; + + use super::BarrierSurvivorSet; + + #[tokio::test] + async fn test_barrier() { + // Validate that files without modifications are dropped and that files with changes passthrough + // File 0: No Changes + // File 1: Contains an update + // File 2: Contains a delete + // null (id: 3): is a insert + + let schema = get_schema(); + let keys = UInt16Array::from(vec![Some(0), Some(1), Some(2), None]); + let values = StringArray::from(vec![Some("file0"), Some("file1"), Some("file2")]); + let dict = DictionaryArray::new(keys, Arc::new(values)); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(arrow::array::StringArray::from(vec!["0", "1", "2", "3"])), + Arc::new(dict), + //insert column + Arc::new(arrow::array::BooleanArray::from(vec![ + Some(false), + Some(false), + Some(false), + None, + ])), + //update column + Arc::new(arrow::array::BooleanArray::from(vec![ + Some(false), + None, + Some(false), + Some(false), + ])), + //delete column + Arc::new(arrow::array::BooleanArray::from(vec![ + Some(false), + Some(false), + None, + Some(false), + ])), + ], + ) + .unwrap(); + + let (actual, survivors) = execute(vec![batch]).await; + let expected = vec![ + "+----+-----------------+--------------------------+--------------------------+--------------------------+", + "| id | __delta_rs_path | __delta_rs_target_insert | __delta_rs_target_update | __delta_rs_target_delete |", + "+----+-----------------+--------------------------+--------------------------+--------------------------+", + "| 1 | file1 | false | | false |", + "| 2 | file2 | false | false | |", + "| 3 | | | false | false |", + "+----+-----------------+--------------------------+--------------------------+--------------------------+", + ]; + assert_batches_sorted_eq!(&expected, &actual); + + let s = survivors.lock().unwrap(); + assert!(!s.contains(&"file0".to_string())); + assert!(s.contains(&"file1".to_string())); + assert!(s.contains(&"file2".to_string())); + assert_eq!(s.len(), 2); + } + + #[tokio::test] + async fn test_barrier_changing_indicies() { + // Validate implementation can handle different dictionary indicies between batches + + let schema = get_schema(); + let mut batches = vec![]; + + // Batch 1 + let keys = UInt16Array::from(vec![Some(0), Some(1)]); + let values = StringArray::from(vec![Some("file0"), Some("file1")]); + let dict = DictionaryArray::new(keys, Arc::new(values)); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + 
Arc::new(arrow::array::StringArray::from(vec!["0", "1"])), + Arc::new(dict), + //insert column + Arc::new(arrow::array::BooleanArray::from(vec![ + Some(false), + Some(false), + ])), + //update column + Arc::new(arrow::array::BooleanArray::from(vec![ + Some(false), + Some(false), + ])), + //delete column + Arc::new(arrow::array::BooleanArray::from(vec![ + Some(false), + Some(false), + ])), + ], + ) + .unwrap(); + batches.push(batch); + // Batch 2 + + let keys = UInt16Array::from(vec![Some(0), Some(1)]); + let values = StringArray::from(vec![Some("file1"), Some("file0")]); + let dict = DictionaryArray::new(keys, Arc::new(values)); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(arrow::array::StringArray::from(vec!["2", "3"])), + Arc::new(dict), + //insert column + Arc::new(arrow::array::BooleanArray::from(vec![ + Some(false), + Some(false), + ])), + //update column + Arc::new(arrow::array::BooleanArray::from(vec![None, Some(false)])), + //delete column + Arc::new(arrow::array::BooleanArray::from(vec![Some(false), None])), + ], + ) + .unwrap(); + batches.push(batch); + + let (actual, _survivors) = execute(batches).await; + let expected = vec! + [ + "+----+-----------------+--------------------------+--------------------------+--------------------------+", + "| id | __delta_rs_path | __delta_rs_target_insert | __delta_rs_target_update | __delta_rs_target_delete |", + "+----+-----------------+--------------------------+--------------------------+--------------------------+", + "| 0 | file0 | false | false | false |", + "| 1 | file1 | false | false | false |", + "| 2 | file1 | false | | false |", + "| 3 | file0 | false | false | |", + "+----+-----------------+--------------------------+--------------------------+--------------------------+", + ]; + assert_batches_sorted_eq!(&expected, &actual); + } + + #[tokio::test] + async fn test_barrier_null_paths() { + // Arrow dictionaries are interesting since a null value can be either in the keys of the dict or in the values. 
+ // Validate they can be processed without issue + + let schema = get_schema(); + let keys = UInt16Array::from(vec![Some(0), None, Some(1)]); + let values = StringArray::from(vec![Some("file1"), None]); + let dict = DictionaryArray::new(keys, Arc::new(values)); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(arrow::array::StringArray::from(vec!["1", "2", "3"])), + Arc::new(dict), + Arc::new(arrow::array::BooleanArray::from(vec![ + Some(false), + None, + None, + ])), + Arc::new(arrow::array::BooleanArray::from(vec![false, false, false])), + Arc::new(arrow::array::BooleanArray::from(vec![false, false, false])), + ], + ) + .unwrap(); + + let (actual, _) = execute(vec![batch]).await; + let expected = vec![ + "+----+-----------------+--------------------------+--------------------------+--------------------------+", + "| id | __delta_rs_path | __delta_rs_target_insert | __delta_rs_target_update | __delta_rs_target_delete |", + "+----+-----------------+--------------------------+--------------------------+--------------------------+", + "| 2 | | | false | false |", + "| 3 | | | false | false |", + "+----+-----------------+--------------------------+--------------------------+--------------------------+", + ]; + assert_batches_sorted_eq!(&expected, &actual); + } + + async fn execute(input: Vec) -> (Vec, BarrierSurvivorSet) { + let schema = get_schema(); + let repartition = Arc::new(Column::new("__delta_rs_path", 2)); + let exec = Arc::new(MemoryExec::try_new(&[input], schema.clone(), None).unwrap()); + + let task_ctx = Arc::new(TaskContext::default()); + let merge = + MergeBarrierExec::new(exec, Arc::new("__delta_rs_path".to_string()), repartition); + + let survivors = merge.survivors(); + let coalsece = CoalesceBatchesExec::new(Arc::new(merge), 100); + let mut stream = coalsece.execute(0, task_ctx).unwrap(); + (vec![stream.next().await.unwrap().unwrap()], survivors) + } + + fn get_schema() -> Arc { + Arc::new(ArrowSchema::new(vec![ + Field::new("id", ArrowDataType::Utf8, true), + Field::new( + "__delta_rs_path", + ArrowDataType::Dictionary( + Box::new(ArrowDataType::UInt16), + Box::new(ArrowDataType::Utf8), + ), + true, + ), + Field::new(TARGET_INSERT_COLUMN, ArrowDataType::Boolean, true), + Field::new(TARGET_UPDATE_COLUMN, ArrowDataType::Boolean, true), + Field::new(TARGET_DELETE_COLUMN, ArrowDataType::Boolean, true), + ])) + } +} diff --git a/crates/deltalake-core/src/operations/merge.rs b/crates/deltalake-core/src/operations/merge/mod.rs similarity index 96% rename from crates/deltalake-core/src/operations/merge.rs rename to crates/deltalake-core/src/operations/merge/mod.rs index 0f0da1c21f..7cb752dc21 100644 --- a/crates/deltalake-core/src/operations/merge.rs +++ b/crates/deltalake-core/src/operations/merge/mod.rs @@ -36,6 +36,7 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::{Instant, SystemTime, UNIX_EPOCH}; +use arrow_schema::Schema as ArrowSchema; use async_trait::async_trait; use datafusion::datasource::provider_as_source; use datafusion::error::Result as DataFusionResult; @@ -64,31 +65,36 @@ use parquet::file::properties::WriterProperties; use serde::Serialize; use serde_json::Value; +use self::barrier::{MergeBarrier, MergeBarrierExec}; + use super::datafusion_utils::{into_expr, maybe_into_expr, Expression}; use super::transaction::{commit, PROTOCOL}; use crate::delta_datafusion::expr::{fmt_expr_to_sql, parse_predicate_expression}; use crate::delta_datafusion::logical::MetricObserver; use 
crate::delta_datafusion::physical::{find_metric_node, MetricObserverExec}; use crate::delta_datafusion::{ - execute_plan_to_batch, register_store, DeltaColumn, DeltaScanConfig, DeltaSessionConfig, + execute_plan_to_batch, register_store, DeltaColumn, DeltaScanConfigBuilder, DeltaSessionConfig, DeltaTableProvider, }; use crate::kernel::{Action, Remove}; use crate::logstore::LogStoreRef; +use crate::operations::merge::barrier::find_barrier_node; use crate::operations::write::write_execution_plan; use crate::protocol::{DeltaOperation, MergePredicate}; use crate::table::state::DeltaTableState; use crate::{DeltaResult, DeltaTable, DeltaTableError}; +mod barrier; + const SOURCE_COLUMN: &str = "__delta_rs_source"; const TARGET_COLUMN: &str = "__delta_rs_target"; const OPERATION_COLUMN: &str = "__delta_rs_operation"; const DELETE_COLUMN: &str = "__delta_rs_delete"; -const TARGET_INSERT_COLUMN: &str = "__delta_rs_target_insert"; -const TARGET_UPDATE_COLUMN: &str = "__delta_rs_target_update"; -const TARGET_DELETE_COLUMN: &str = "__delta_rs_target_delete"; -const TARGET_COPY_COLUMN: &str = "__delta_rs_target_copy"; +pub(crate) const TARGET_INSERT_COLUMN: &str = "__delta_rs_target_insert"; +pub(crate) const TARGET_UPDATE_COLUMN: &str = "__delta_rs_target_update"; +pub(crate) const TARGET_DELETE_COLUMN: &str = "__delta_rs_target_delete"; +pub(crate) const TARGET_COPY_COLUMN: &str = "__delta_rs_target_copy"; const SOURCE_COUNT_METRIC: &str = "num_source_rows"; const TARGET_COUNT_METRIC: &str = "num_target_rows"; @@ -580,11 +586,11 @@ struct MergeMetricExtensionPlanner {} impl ExtensionPlanner for MergeMetricExtensionPlanner { async fn plan_extension( &self, - _planner: &dyn PhysicalPlanner, + planner: &dyn PhysicalPlanner, node: &dyn UserDefinedLogicalNode, _logical_inputs: &[&LogicalPlan], physical_inputs: &[Arc], - _session_state: &SessionState, + session_state: &SessionState, ) -> DataFusionResult>> { if let Some(metric_observer) = node.as_any().downcast_ref::() { if metric_observer.id.eq(SOURCE_COUNT_ID) { @@ -653,6 +659,16 @@ impl ExtensionPlanner for MergeMetricExtensionPlanner { } } + if let Some(barrier) = node.as_any().downcast_ref::() { + let schema = barrier.input.schema(); + let exec_schema: ArrowSchema = schema.as_ref().to_owned().into(); + return Ok(Some(Arc::new(MergeBarrierExec::new( + physical_inputs.get(0).unwrap().clone(), + barrier.file_column.clone(), + planner.create_physical_expr(&barrier.expr, schema, &exec_schema, session_state)?, + )))); + } + Ok(None) } } @@ -945,13 +961,20 @@ async fn execute( node: Arc::new(MetricObserver { id: SOURCE_COUNT_ID.into(), input: source, + enable_pushdown: false, }), }); + let scan_config = DeltaScanConfigBuilder::default() + .with_file_column(true) + .build(snapshot)?; + + let file_column = Arc::new(scan_config.file_column_name.clone().unwrap()); + let target_provider = Arc::new(DeltaTableProvider::try_new( snapshot.clone(), log_store.clone(), - DeltaScanConfig::default(), + scan_config, )?); let target_provider = provider_as_source(target_provider); @@ -968,7 +991,7 @@ async fn execute( let state = state.with_query_planner(Arc::new(MergePlanner {})); - let (target, files) = { + let target = { // Attempt to construct an early filter that we can apply to the Add action list and the delta scan. // In the case where there are partition columns in the join predicate, we can scan the source table // to get the distinct list of partitions affected and constrain the search to those. 
@@ -976,7 +999,7 @@ async fn execute( if !not_match_source_operations.is_empty() { // It's only worth trying to create an early filter where there are no `when_not_matched_source` operators, since // that implies a full scan - (target, snapshot.files().iter().collect_vec()) + target } else if let Some(filter) = try_construct_early_filter( predicate.clone(), snapshot, @@ -987,35 +1010,23 @@ async fn execute( ) .await? { - let file_filter = filter - .clone() - .transform(&|expr| match expr { - Expr::Column(c) => Ok(Transformed::Yes(Expr::Column(Column { - relation: None, // the file filter won't be looking at columns like `target.partition`, it'll just be `partition` - name: c.name, - }))), - expr => Ok(Transformed::No(expr)), - }) - .unwrap(); - let files = snapshot - .files_matching_predicate(&[file_filter])? - .collect_vec(); - - let new_target = LogicalPlan::Filter(Filter::try_new(filter, target.into())?); - (new_target, files) + LogicalPlan::Filter(Filter::try_new(filter, target.into())?) } else { - (target, snapshot.files().iter().collect_vec()) + target } }; let source = DataFrame::new(state.clone(), source); let source = source.with_column(SOURCE_COLUMN, lit(true))?; - // TODO: This is here to prevent predicate pushdowns. In the future we can replace this node to allow pushdowns depending on which operations are being used. + // Not match operations imply a full scan of the target table is required + let enable_pushdown = + not_match_source_operations.is_empty() && not_match_target_operations.is_empty(); let target = LogicalPlan::Extension(Extension { node: Arc::new(MetricObserver { id: TARGET_COUNT_ID.into(), input: target, + enable_pushdown, }), }); let target = DataFrame::new(state.clone(), target); @@ -1272,11 +1283,23 @@ async fn execute( )?; new_columns = new_columns.with_column(TARGET_COPY_COLUMN, build_case(copy_when, copy_then)?)?; - let new_columns = new_columns.into_optimized_plan()?; + let new_columns = new_columns.into_unoptimized_plan(); + + let distrbute_expr = col(file_column.as_str()); + + let merge_barrier = LogicalPlan::Extension(Extension { + node: Arc::new(MergeBarrier { + input: new_columns, + expr: distrbute_expr, + file_column, + }), + }); + let operation_count = LogicalPlan::Extension(Extension { node: Arc::new(MetricObserver { id: OUTPUT_COUNT_ID.into(), - input: new_columns, + input: merge_barrier, + enable_pushdown: false, }), }); @@ -1284,13 +1307,14 @@ async fn execute( let filtered = operation_count.filter(col(DELETE_COLUMN).is_false())?; let project = filtered.select(write_projection)?; - let optimized = &project.into_optimized_plan()?; + let merge_final = &project.into_unoptimized_plan(); - let write = state.create_physical_plan(optimized).await?; + let write = state.create_physical_plan(merge_final).await?; let err = || DeltaTableError::Generic("Unable to locate expected metric node".into()); let source_count = find_metric_node(SOURCE_COUNT_ID, &write).ok_or_else(err)?; let op_count = find_metric_node(OUTPUT_COUNT_ID, &write).ok_or_else(err)?; + let barrier = find_barrier_node(&write).ok_or_else(err)?; // write projected records let table_partition_cols = current_metadata.partition_columns.clone(); @@ -1320,20 +1344,31 @@ async fn execute( let mut actions: Vec = add_actions.into_iter().map(Action::Add).collect(); metrics.num_target_files_added = actions.len(); - for action in files { - metrics.num_target_files_removed += 1; - actions.push(Action::Remove(Remove { - path: action.path.clone(), - deletion_timestamp: Some(deletion_timestamp), - 
data_change: true, - extended_file_metadata: Some(true), - partition_values: Some(action.partition_values.clone()), - deletion_vector: action.deletion_vector.clone(), - size: Some(action.size), - tags: None, - base_row_id: action.base_row_id, - default_row_commit_version: action.default_row_commit_version, - })) + let survivors = barrier + .as_any() + .downcast_ref::() + .unwrap() + .survivors(); + + { + let lock = survivors.lock().unwrap(); + for action in snapshot.files() { + if lock.contains(&action.path) { + metrics.num_target_files_removed += 1; + actions.push(Action::Remove(Remove { + path: action.path.clone(), + deletion_timestamp: Some(deletion_timestamp), + data_change: true, + extended_file_metadata: Some(true), + partition_values: Some(action.partition_values.clone()), + deletion_vector: action.deletion_vector.clone(), + size: Some(action.size), + tags: None, + base_row_id: action.base_row_id, + default_row_commit_version: action.default_row_commit_version, + })) + } + } } let mut version = snapshot.version(); @@ -1506,6 +1541,8 @@ mod tests { .merge(merge_source(schema), col("target.id").eq(col("source.id"))) .with_source_alias("source") .with_target_alias("target") + .when_not_matched_by_source_delete(|delete| delete) + .unwrap() .await .expect_err("Remove action is included when Delta table is append-only. Should error"); } @@ -2004,7 +2041,7 @@ mod tests { assert_eq!(table.version(), 2); assert!(table.get_file_uris().count() >= 2); - assert!(metrics.num_target_files_added >= 2); + assert_eq!(metrics.num_target_files_added, 2); assert_eq!(metrics.num_target_files_removed, 2); assert_eq!(metrics.num_target_rows_copied, 2); assert_eq!(metrics.num_target_rows_updated, 0); @@ -2068,13 +2105,13 @@ mod tests { assert_eq!(table.version(), 2); assert!(table.get_file_uris().count() >= 2); - assert!(metrics.num_target_files_added >= 2); - assert_eq!(metrics.num_target_files_removed, 2); - assert_eq!(metrics.num_target_rows_copied, 3); + assert_eq!(metrics.num_target_files_added, 1); + assert_eq!(metrics.num_target_files_removed, 1); + assert_eq!(metrics.num_target_rows_copied, 1); assert_eq!(metrics.num_target_rows_updated, 0); assert_eq!(metrics.num_target_rows_inserted, 0); assert_eq!(metrics.num_target_rows_deleted, 1); - assert_eq!(metrics.num_output_rows, 3); + assert_eq!(metrics.num_output_rows, 1); assert_eq!(metrics.num_source_rows, 3); let commit_info = table.history(None).await.unwrap(); @@ -2201,13 +2238,13 @@ mod tests { .unwrap(); assert_eq!(table.version(), 2); - assert!(metrics.num_target_files_added >= 2); - assert_eq!(metrics.num_target_files_removed, 2); - assert_eq!(metrics.num_target_rows_copied, 3); + assert!(metrics.num_target_files_added == 1); + assert_eq!(metrics.num_target_files_removed, 1); + assert_eq!(metrics.num_target_rows_copied, 1); assert_eq!(metrics.num_target_rows_updated, 0); assert_eq!(metrics.num_target_rows_inserted, 0); assert_eq!(metrics.num_target_rows_deleted, 1); - assert_eq!(metrics.num_output_rows, 3); + assert_eq!(metrics.num_output_rows, 1); assert_eq!(metrics.num_source_rows, 3); let commit_info = table.history(None).await.unwrap(); From 40e3b0d52d4fd84502d8afa8130d6dd8f510d421 Mon Sep 17 00:00:00 2001 From: Nikolay Ulmasov Date: Mon, 1 Jan 2024 18:04:30 +0000 Subject: [PATCH 04/29] fix the test_restore_by_datetime test Signed-off-by: Nikolay Ulmasov --- crates/deltalake-core/tests/command_restore.rs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/crates/deltalake-core/tests/command_restore.rs 
b/crates/deltalake-core/tests/command_restore.rs index 2c1c06cbb6..fd8d77c8b9 100644 --- a/crates/deltalake-core/tests/command_restore.rs +++ b/crates/deltalake-core/tests/command_restore.rs @@ -6,6 +6,7 @@ use arrow_schema::{DataType as ArrowDataType, Field}; use chrono::{DateTime, NaiveDateTime, TimeZone, Utc}; use deltalake_core::kernel::{DataType, PrimitiveType, StructField}; use deltalake_core::protocol::SaveMode; +use deltalake_core::storage::commit_uri_from_version; use deltalake_core::{DeltaOps, DeltaTable}; use rand::Rng; use std::error::Error; @@ -117,9 +118,15 @@ async fn test_restore_by_version() -> Result<(), Box> { #[tokio::test] async fn test_restore_by_datetime() -> Result<(), Box> { let context = setup_test().await?; - let mut table = context.table; - let history = table.history(Some(10)).await?; - let timestamp = history.get(1).unwrap().timestamp.unwrap(); + let table = context.table; + let version = 1; + + // The way we obtain a timestamp for a version will have to change when/if we start using CommitInfo for timestamps + let meta = table + .object_store() + .head(&commit_uri_from_version(version)) + .await?; + let timestamp = meta.last_modified.timestamp_millis(); let naive = NaiveDateTime::from_timestamp_millis(timestamp).unwrap(); let datetime: DateTime = Utc.from_utc_datetime(&naive); From 7add49171cbe95261e22cf5e5c32aa7512c00d34 Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Tue, 2 Jan 2024 15:25:15 +0100 Subject: [PATCH 05/29] chore: version bump python release (#2011) --- python/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/Cargo.toml b/python/Cargo.toml index a9936a483c..dd3bcca1e9 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "deltalake-python" -version = "0.14.0" +version = "0.15.0" authors = ["Qingping Hou ", "Will Jones "] homepage = "https://github.com/delta-io/delta-rs" license = "Apache-2.0" From f54bb282e08c1468a18089b6fbb4935b9f781576 Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Tue, 2 Jan 2024 15:39:41 +0100 Subject: [PATCH 06/29] feat(python): expose large_dtype param in `merge` (#2003) # Description This helps to avoid this [error](https://github.com/delta-io/delta-rs/issues/1998 )since you can now set to large_dtypes=False. Also once upstream in arrow-rs there is better type coercion, this param should be able to be removed completely in the writer and merge operation. --- python/deltalake/table.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/python/deltalake/table.py b/python/deltalake/table.py index 5adeaaa9dc..76b1dd4f49 100644 --- a/python/deltalake/table.py +++ b/python/deltalake/table.py @@ -808,6 +808,7 @@ def merge( target_alias: Optional[str] = None, error_on_type_mismatch: bool = True, writer_properties: Optional[WriterProperties] = None, + large_dtypes: bool = True, ) -> "TableMerger": """Pass the source data which you want to merge on the target delta table, providing a predicate in SQL query like format. You can also specify on what to do when the underlying data types do not @@ -820,6 +821,7 @@ def merge( target_alias: Alias for the target table error_on_type_mismatch: specify if merge will return error if data types are mismatching :default = True writer_properties: Pass writer properties to the Rust parquet writer + large_dtypes: If True, the data schema is kept in large_dtypes. 
Returns: TableMerger: TableMerger Object @@ -835,16 +837,16 @@ def merge( ) if isinstance(source, pyarrow.RecordBatchReader): - source = convert_pyarrow_recordbatchreader(source, large_dtypes=True) + source = convert_pyarrow_recordbatchreader(source, large_dtypes) elif isinstance(source, pyarrow.RecordBatch): - source = convert_pyarrow_recordbatch(source, large_dtypes=True) + source = convert_pyarrow_recordbatch(source, large_dtypes) elif isinstance(source, pyarrow.Table): - source = convert_pyarrow_table(source, large_dtypes=True) + source = convert_pyarrow_table(source, large_dtypes) elif isinstance(source, ds.Dataset): - source = convert_pyarrow_dataset(source, large_dtypes=True) + source = convert_pyarrow_dataset(source, large_dtypes) elif isinstance(source, pandas.DataFrame): source = convert_pyarrow_table( - pyarrow.Table.from_pandas(source), large_dtypes=True + pyarrow.Table.from_pandas(source), large_dtypes ) else: raise TypeError( From 093a7569549fa839eb095805d3e22d460c10c35c Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Tue, 2 Jan 2024 16:02:26 +0100 Subject: [PATCH 07/29] feat(python): expose custom metadata to writers (#1994) # Description - exposes the custom_metadata to pyarrow and rust writer - addresses a bug in the create operation, we were not passing the app_metadata to the actual commit # Related Issue(s) - closes https://github.com/delta-io/delta-rs/issues/1990 --- .../deltalake-core/src/operations/create.rs | 14 ++++++++----- python/deltalake/_internal.pyi | 3 +++ python/deltalake/writer.py | 7 +++++++ python/src/lib.rs | 21 ++++++++++++++++++- 4 files changed, 39 insertions(+), 6 deletions(-) diff --git a/crates/deltalake-core/src/operations/create.rs b/crates/deltalake-core/src/operations/create.rs index 0dca038f4a..0e44fe215f 100644 --- a/crates/deltalake-core/src/operations/create.rs +++ b/crates/deltalake-core/src/operations/create.rs @@ -5,7 +5,7 @@ use std::collections::HashMap; use std::sync::Arc; use futures::future::BoxFuture; -use serde_json::{Map, Value}; +use serde_json::Value; use super::transaction::{commit, PROTOCOL}; use crate::errors::{DeltaResult, DeltaTableError}; @@ -56,7 +56,7 @@ pub struct CreateBuilder { actions: Vec, log_store: Option, configuration: HashMap>, - metadata: Option>, + metadata: Option>, } impl Default for CreateBuilder { @@ -181,8 +181,11 @@ impl CreateBuilder { /// /// This might include provenance information such as an id of the /// user that made the commit or the program that created it. - pub fn with_metadata(mut self, metadata: Map) -> Self { - self.metadata = Some(metadata); + pub fn with_metadata( + mut self, + metadata: impl IntoIterator, + ) -> Self { + self.metadata = Some(HashMap::from_iter(metadata)); self } @@ -286,6 +289,7 @@ impl std::future::IntoFuture for CreateBuilder { let this = self; Box::pin(async move { let mode = this.mode.clone(); + let app_metadata = this.metadata.clone(); let (mut table, actions, operation) = this.into_table_and_actions()?; let log_store = table.log_store(); let table_state = if log_store.is_delta_table_location().await? 
{ @@ -310,7 +314,7 @@ impl std::future::IntoFuture for CreateBuilder { &actions, operation, table_state, - None, + app_metadata, ) .await?; table.load_version(version).await?; diff --git a/python/deltalake/_internal.pyi b/python/deltalake/_internal.pyi index b4d0ca8c3d..b893fc065b 100644 --- a/python/deltalake/_internal.pyi +++ b/python/deltalake/_internal.pyi @@ -135,6 +135,7 @@ class RawDeltaTable: partition_by: List[str], schema: pyarrow.Schema, partitions_filters: Optional[FilterType], + custom_metadata: Optional[Dict[str, str]], ) -> None: ... def cleanup_metadata(self) -> None: ... @@ -149,6 +150,7 @@ def write_new_deltalake( description: Optional[str], configuration: Optional[Mapping[str, Optional[str]]], storage_options: Optional[Dict[str, str]], + custom_metadata: Optional[Dict[str, str]], ) -> None: ... def write_to_deltalake( table_uri: str, @@ -163,6 +165,7 @@ def write_to_deltalake( configuration: Optional[Mapping[str, Optional[str]]], storage_options: Optional[Dict[str, str]], writer_properties: Optional[Dict[str, Optional[str]]], + custom_metadata: Optional[Dict[str, str]], ) -> None: ... def convert_to_deltalake( uri: str, diff --git a/python/deltalake/writer.py b/python/deltalake/writer.py index 609a6487c6..7306a5705c 100644 --- a/python/deltalake/writer.py +++ b/python/deltalake/writer.py @@ -100,6 +100,7 @@ def write_deltalake( partition_filters: Optional[List[Tuple[str, str, Any]]] = ..., large_dtypes: bool = ..., engine: Literal["pyarrow"] = ..., + custom_metadata: Optional[Dict[str, str]] = ..., ) -> None: ... @@ -128,6 +129,7 @@ def write_deltalake( large_dtypes: bool = ..., engine: Literal["rust"], writer_properties: WriterProperties = ..., + custom_metadata: Optional[Dict[str, str]] = ..., ) -> None: ... @@ -163,6 +165,7 @@ def write_deltalake( large_dtypes: bool = False, engine: Literal["pyarrow", "rust"] = "pyarrow", writer_properties: Optional[WriterProperties] = None, + custom_metadata: Optional[Dict[str, str]] = None, ) -> None: """Write to a Delta Lake table @@ -236,6 +239,7 @@ def write_deltalake( engine: writer engine to write the delta table. `Rust` engine is still experimental but you may see up to 4x performance improvements over pyarrow. writer_properties: Pass writer properties to the Rust parquet writer. + custom_metadata: Custom metadata to add to the commitInfo. 
""" table, table_uri = try_get_table_and_table_uri(table_or_uri, storage_options) if table is not None: @@ -300,6 +304,7 @@ def write_deltalake( writer_properties=writer_properties._to_dict() if writer_properties else None, + custom_metadata=custom_metadata, ) if table: table.update_incremental() @@ -492,6 +497,7 @@ def validate_batch(batch: pa.RecordBatch) -> pa.RecordBatch: description, configuration, storage_options, + custom_metadata, ) else: table._table.create_write_transaction( @@ -500,6 +506,7 @@ def validate_batch(batch: pa.RecordBatch) -> pa.RecordBatch: partition_by or [], schema, partition_filters, + custom_metadata, ) table.update_incremental() else: diff --git a/python/src/lib.rs b/python/src/lib.rs index 55a7442281..4f921f21cf 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -741,6 +741,7 @@ impl RawDeltaTable { partition_by: Vec, schema: PyArrowType, partitions_filters: Option>, + custom_metadata: Option>, ) -> PyResult<()> { let mode = mode.parse().map_err(PythonError::from)?; @@ -803,6 +804,10 @@ impl RawDeltaTable { partition_by: Some(partition_by), predicate: None, }; + + let app_metadata = + custom_metadata.map(|md| md.into_iter().map(|(k, v)| (k, v.into())).collect()); + let store = self._table.log_store(); rt()? @@ -811,7 +816,7 @@ impl RawDeltaTable { &actions, operation, self._table.get_state(), - None, + app_metadata, )) .map_err(PythonError::from)?; @@ -1173,6 +1178,7 @@ fn write_to_deltalake( configuration: Option>>, storage_options: Option>, writer_properties: Option>>, + custom_metadata: Option>, ) -> PyResult<()> { let batches = data.0.map(|batch| batch.unwrap()).collect::>(); let save_mode = mode.parse().map_err(PythonError::from)?; @@ -1216,6 +1222,12 @@ fn write_to_deltalake( builder = builder.with_configuration(config); }; + if let Some(metadata) = custom_metadata { + let json_metadata: Map = + metadata.into_iter().map(|(k, v)| (k, v.into())).collect(); + builder = builder.with_metadata(json_metadata); + }; + rt()? .block_on(builder.into_future()) .map_err(PythonError::from)?; @@ -1280,6 +1292,7 @@ fn write_new_deltalake( description: Option, configuration: Option>>, storage_options: Option>, + custom_metadata: Option>, ) -> PyResult<()> { let table = DeltaTableBuilder::from_uri(table_uri) .with_storage_options(storage_options.unwrap_or_default()) @@ -1306,6 +1319,12 @@ fn write_new_deltalake( builder = builder.with_configuration(config); }; + if let Some(metadata) = custom_metadata { + let json_metadata: Map = + metadata.into_iter().map(|(k, v)| (k, v.into())).collect(); + builder = builder.with_metadata(json_metadata); + }; + rt()? .block_on(builder.into_future()) .map_err(PythonError::from)?; From 1f9898acc5280087fa3d802c206d2b36dced4591 Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Tue, 2 Jan 2024 16:43:39 +0100 Subject: [PATCH 08/29] docs: add writer properties to docs (#2002) # Description Forgot to add WriterProperties to the docs page and mark a deprecation in the docs. 
--- docs/api/delta_writer.md | 2 ++ python/deltalake/table.py | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/api/delta_writer.md b/docs/api/delta_writer.md index 432a32b768..9b395d3604 100644 --- a/docs/api/delta_writer.md +++ b/docs/api/delta_writer.md @@ -8,6 +8,8 @@ search: ::: deltalake.write_deltalake +::: deltalake.WriterProperties + ## Convert to Delta Tables ::: deltalake.convert_to_deltalake diff --git a/python/deltalake/table.py b/python/deltalake/table.py index 76b1dd4f49..b41e62bd07 100644 --- a/python/deltalake/table.py +++ b/python/deltalake/table.py @@ -1178,7 +1178,10 @@ def with_writer_properties( write_batch_size: Optional[int] = None, max_row_group_size: Optional[int] = None, ) -> "TableMerger": - """Pass writer properties to the Rust parquet writer, see options https://arrow.apache.org/rust/parquet/file/properties/struct.WriterProperties.html: + """ + !!! warning "Deprecated" + Use `.merge(writer_properties = WriterProperties())` instead + Pass writer properties to the Rust parquet writer, see options https://arrow.apache.org/rust/parquet/file/properties/struct.WriterProperties.html: Args: data_page_size_limit: Limit DataPage size to this in bytes. From 4cb754b61ad5066f87f76f0c13007c00043ab58a Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Tue, 2 Jan 2024 17:36:19 +0100 Subject: [PATCH 09/29] docs: add alterer (#2014) Again, forgot some docs, and added missing descriptions --- docs/api/delta_table/delta_table_alterer.md | 11 +++++++++++ mkdocs.yml | 1 + python/deltalake/table.py | 11 ++++++++++- 3 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 docs/api/delta_table/delta_table_alterer.md diff --git a/docs/api/delta_table/delta_table_alterer.md b/docs/api/delta_table/delta_table_alterer.md new file mode 100644 index 0000000000..d859f605e1 --- /dev/null +++ b/docs/api/delta_table/delta_table_alterer.md @@ -0,0 +1,11 @@ +--- +search: + boost: 10 +--- + + +# TableAlterer + +::: deltalake.table.TableAlterer + options: + show_root_heading: true \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 4e713d73ec..a554378f4f 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -67,6 +67,7 @@ nav: - api/delta_table/metadata.md - api/delta_table/delta_table_merger.md - api/delta_table/delta_table_optimizer.md + - api/delta_table/delta_table_alterer.md - api/schema.md - api/storage.md - api/catalog.md diff --git a/python/deltalake/table.py b/python/deltalake/table.py index b41e62bd07..4f93a6aa61 100644 --- a/python/deltalake/table.py +++ b/python/deltalake/table.py @@ -785,13 +785,22 @@ def update( def optimize( self, ) -> "TableOptimizer": + """Namespace for all table optimize related methods. + + Returns: + TableOptimizer: TableOptimizer Object + """ return TableOptimizer(self) @property def alter( self, ) -> "TableAlterer": - """Namespace for all table alter related methods""" + """Namespace for all table alter related methods. 
+ + Returns: + TableAlterer: TableAlterer Object + """ return TableAlterer(self) def merge( From 6d41b37de413f17b28f2b8ece8eb72b0b1d27215 Mon Sep 17 00:00:00 2001 From: Nikolay Ulmasov Date: Tue, 2 Jan 2024 20:03:21 +0000 Subject: [PATCH 10/29] use temporary table names during the constraint checks Signed-off-by: Nikolay Ulmasov --- crates/deltalake-core/src/delta_datafusion/mod.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/crates/deltalake-core/src/delta_datafusion/mod.rs b/crates/deltalake-core/src/delta_datafusion/mod.rs index 17d04c692a..59cc2ba0ac 100644 --- a/crates/deltalake-core/src/delta_datafusion/mod.rs +++ b/crates/deltalake-core/src/delta_datafusion/mod.rs @@ -1128,7 +1128,10 @@ impl DeltaDataChecker { return Ok(()); } let table = MemTable::try_new(record_batch.schema(), vec![vec![record_batch.clone()]])?; - self.ctx.register_table("data", Arc::new(table))?; + + // Use a random table name to avoid clashes when running multiple parallel tasks, e.g. when using a partitioned table + let table_name: String = uuid::Uuid::new_v4().to_string(); + self.ctx.register_table(&table_name, Arc::new(table))?; let mut violations: Vec = Vec::new(); @@ -1140,8 +1143,9 @@ impl DeltaDataChecker { } let sql = format!( - "SELECT {} FROM data WHERE NOT ({}) LIMIT 1", + "SELECT {} FROM `{}` WHERE NOT ({}) LIMIT 1", check.get_name(), + table_name, check.get_expression() ); @@ -1162,7 +1166,7 @@ impl DeltaDataChecker { } } - self.ctx.deregister_table("data")?; + self.ctx.deregister_table(&table_name)?; if !violations.is_empty() { Err(DeltaTableError::InvalidData { violations }) } else { From 8762c09da9a1f6949293ee32f4f4d5bdc4781711 Mon Sep 17 00:00:00 2001 From: "R. Tyler Croy" Date: Mon, 11 Dec 2023 10:00:47 -0800 Subject: [PATCH 11/29] Refactor AWS specific code into the deltalake-aws crate There are a number of changes here to untangle the coupling inside of deltalake-core to allow deltalake-aws to be separated properly --- .github/workflows/build.yml | 22 - Cargo.toml | 1 + crates/deltalake-aws/Cargo.toml | 26 +- .../common/s3.rs => deltalake-aws/helpers.rs} | 0 crates/deltalake-aws/src/lib.rs | 72 +- .../mod.rs => deltalake-aws/src/logstore.rs} | 92 +- crates/deltalake-aws/src/storage.rs | 597 ++++++++++++ crates/deltalake-aws/tests/common.rs | 173 ++++ .../deltalake-aws/tests/integration_read.rs | 189 ++++ .../tests/integration_s3_dynamodb.rs | 35 +- .../tests/repair_s3_rename_test.rs | 21 +- crates/deltalake-core/.gitignore | 10 +- crates/deltalake-core/Cargo.toml | 39 +- crates/deltalake-core/README.md | 4 - .../src/data_catalog/storage/mod.rs | 12 +- .../src/delta_datafusion/mod.rs | 2 +- crates/deltalake-core/src/errors.rs | 7 +- .../src/kernel/actions/types.rs | 20 +- crates/deltalake-core/src/lib.rs | 94 +- .../src/logstore/default_logstore.rs | 13 +- crates/deltalake-core/src/logstore/mod.rs | 250 +++-- .../src/operations/convert_to_delta.rs | 45 +- crates/deltalake-core/src/operations/load.rs | 2 +- .../deltalake-core/src/operations/vacuum.rs | 8 +- crates/deltalake-core/src/protocol/mod.rs | 30 +- .../src/protocol/parquet2_read/boolean.rs | 76 -- .../parquet2_read/dictionary/binary.rs | 48 - .../protocol/parquet2_read/dictionary/mod.rs | 2 - .../parquet2_read/dictionary/primitive.rs | 19 - .../src/protocol/parquet2_read/map.rs | 111 --- .../src/protocol/parquet2_read/mod.rs | 898 ------------------ .../src/protocol/parquet2_read/primitive.rs | 185 ---- .../src/protocol/parquet2_read/stats.rs | 9 - .../src/protocol/parquet2_read/string.rs | 312 ------ 
.../src/protocol/parquet2_read/validity.rs | 137 --- .../src/protocol/parquet_read/mod.rs | 2 +- .../deltalake-core/src/protocol/time_utils.rs | 102 +- crates/deltalake-core/src/storage/config.rs | 394 -------- crates/deltalake-core/src/storage/mod.rs | 154 ++- crates/deltalake-core/src/storage/s3.rs | 510 ---------- crates/deltalake-core/src/storage/utils.rs | 8 - crates/deltalake-core/src/table/builder.rs | 272 ++---- crates/deltalake-core/src/table/mod.rs | 24 +- crates/deltalake-core/src/table/state.rs | 58 +- .../deltalake-core/tests/checkpoint_writer.rs | 14 +- .../tests/command_filesystem_check.rs | 79 +- crates/deltalake-core/tests/command_vacuum.rs | 20 +- crates/deltalake-core/tests/common/adls.rs | 93 -- crates/deltalake-core/tests/common/hdfs.rs | 20 - crates/deltalake-core/tests/fs_common/mod.rs | 9 +- .../tests/integration_checkpoint.rs | 41 +- .../tests/integration_concurrent_writes.rs | 37 +- .../tests/integration_datafusion.rs | 117 +-- .../tests/integration_object_store.rs | 499 ---------- .../deltalake-core/tests/integration_read.rs | 211 +--- .../tests/read_delta_log_test.rs | 17 +- .../tests/read_delta_partitions_test.rs | 10 +- crates/deltalake-core/tests/time_travel.rs | 18 +- crates/deltalake-test/.gitignore | 12 + crates/deltalake-test/Cargo.toml | 23 + .../common => deltalake-test/src}/clock.rs | 0 .../src}/datafusion.rs | 6 +- .../mod.rs => deltalake-test/src/lib.rs} | 11 +- .../src/utils.rs} | 245 ++--- .../_delta_log/.s3-optimization-0 | 0 .../_delta_log/.s3-optimization-1 | 0 .../_delta_log/.s3-optimization-2 | 0 .../_delta_log/00000000000000000000.crc | 0 .../_delta_log/00000000000000000000.json | 0 ...413a-85f9-b1b69d4b3b4e-c000.snappy.parquet | Bin ...4f0b-bb96-771a515fbccc-c000.snappy.parquet | Bin ...49a6-a4b9-e39ffed9c15a-c000.snappy.parquet | Bin ...4b0d-9726-c18630c6ad90-c000.snappy.parquet | Bin ...4d60-8420-23261f58a5eb-c000.snappy.parquet | Bin ...4d32-806c-781a1cf123d2-c000.snappy.parquet | Bin ...4e1c-92cd-b4fe8d3bb954-c000.snappy.parquet | Bin ...41b0-ba97-a74b3afc8239-c000.snappy.parquet | Bin .../_delta_log/00000000000000000001.json | 0 .../00000000000000000002.checkpoint.parquet | Bin .../_delta_log/00000000000000000002.json | 0 .../_delta_log/_last_checkpoint | 0 .../data/checkpoints/_delta_log/.gitignore | 0 .../_delta_log/00000000000000000000.json | 0 .../_delta_log/00000000000000000001.json | 0 .../_delta_log/00000000000000000002.json | 0 .../_delta_log/00000000000000000003.json | 0 .../_delta_log/00000000000000000004.json | 0 .../_delta_log/00000000000000000005.json | 0 .../_delta_log/00000000000000000006.json | 0 .../_delta_log/00000000000000000007.json | 0 .../_delta_log/00000000000000000008.json | 0 .../_delta_log/00000000000000000009.json | 0 .../_delta_log/00000000000000000010.json | 0 .../_delta_log/00000000000000000011.json | 0 .../_delta_log/00000000000000000012.json | 0 .../data/checkpoints_tombstones/.gitignore | 0 .../00000000000000000005.checkpoint.parquet | Bin .../_delta_log/00000000000000000005.json | 0 .../_delta_log/00000000000000000006.json | 0 .../_delta_log/00000000000000000007.json | 0 .../_delta_log/00000000000000000008.json | 0 .../_delta_log/00000000000000000009.json | 0 .../00000000000000000010.checkpoint.parquet | Bin .../_delta_log/00000000000000000010.json | 0 .../_delta_log/00000000000000000011.json | 0 .../_delta_log/00000000000000000012.json | 0 .../concurrent_workers/_delta_log/.gitignore | 0 .../_delta_log/00000000000000000000.json | 0 ...-b8b4-bef3de0de409-c000.snappy.parquet.crc | Bin 
...-bc07-d46c948aa415-c000.snappy.parquet.crc | Bin ...-8606-f8311a96f6dc-c000.snappy.parquet.crc | Bin ...-ad28-ff32ddab96d2-c000.snappy.parquet.crc | Bin ...-ae49-fc48b973e37e-c000.snappy.parquet.crc | Bin ...-9507-7ccf67924649-c000.snappy.parquet.crc | Bin ...-815e-7eb62007a15c-c000.snappy.parquet.crc | Bin ...b312ca-665d-46ab-93a9-9f87ad2baa92.tmp.crc | Bin ...1a776e-6e56-4423-a9b0-7efc9e58826a.tmp.crc | Bin ...4807e6-437c-44c9-abd2-50e6514d236e.tmp.crc | Bin ...74eda7-fa09-48ce-b06c-56025163f6ae.tmp.crc | Bin ...7ba875-7a14-4e57-9973-1349c21a152c.tmp.crc | Bin ...0000000000000000003.checkpoint.parquet.crc | Bin .../_delta_log/00000000000000000000.json | 0 .../_delta_log/00000000000000000001.json | 0 .../_delta_log/00000000000000000002.json | 0 .../00000000000000000003.checkpoint.parquet | Bin .../_delta_log/00000000000000000003.json | 0 .../delta-0.2.0/_delta_log/_last_checkpoint | 0 ...4193-b8b4-bef3de0de409-c000.snappy.parquet | Bin ...4fb8-bc07-d46c948aa415-c000.snappy.parquet | Bin ...4f3a-8606-f8311a96f6dc-c000.snappy.parquet | Bin ...4662-ad28-ff32ddab96d2-c000.snappy.parquet | Bin ...4dea-ae49-fc48b973e37e-c000.snappy.parquet | Bin ...4477-9507-7ccf67924649-c000.snappy.parquet | Bin ...4758-815e-7eb62007a15c-c000.snappy.parquet | Bin ...-80d3-b5d170011621-c000.snappy.parquet.crc | Bin ...-a653-cb5594582017-c000.snappy.parquet.crc | Bin .../_delta_log/00000000000000000000.json | 0 .../_delta_log/00000000000000000001.json | 0 ...4d60-80d3-b5d170011621-c000.snappy.parquet | Bin ...41a8-a653-cb5594582017-c000.snappy.parquet | Bin ...-9527-f8995620fa42-c000.snappy.parquet.crc | Bin .../_delta_log/00000000000000000000.json | 0 ...4153-9527-f8995620fa42-c000.snappy.parquet | Bin .../_delta_log/00000000000000000000.json | 0 ...4970-893f-9bb772bf246e.c000.snappy.parquet | Bin ...4f58-b3ea-23990c71b932.c000.snappy.parquet | Bin .../_delta_log/00000000000000000000.json | 0 ...-b3bb-5ed7f12635ab.c000.snappy.parquet.crc | Bin ...4448-b3bb-5ed7f12635ab.c000.snappy.parquet | Bin ...-9c67-d8e24a1ccf83.c000.snappy.parquet.crc | Bin ...446c-9c67-d8e24a1ccf83.c000.snappy.parquet | Bin .../_delta_log/00000000000000000000.json | 0 ...-ad78-fd13c2027c7e.c000.snappy.parquet.crc | Bin ...4a39-ad78-fd13c2027c7e.c000.snappy.parquet | Bin ...-a060-f67ccc63ced9.c000.snappy.parquet.crc | Bin ...42cd-a060-f67ccc63ced9.c000.snappy.parquet | Bin ...-8ea3-3990b2f027b5.c000.snappy.parquet.crc | Bin ...4add-8ea3-3990b2f027b5.c000.snappy.parquet | Bin ...-baa0-1c8a2bb98104.c000.snappy.parquet.crc | Bin ...4184-baa0-1c8a2bb98104.c000.snappy.parquet | Bin ...-b19e-1f92af3fbb25.c000.snappy.parquet.crc | Bin ...4d52-b19e-1f92af3fbb25.c000.snappy.parquet | Bin ...-a6fc-22b7bc92bebb.c000.snappy.parquet.crc | Bin ...4032-a6fc-22b7bc92bebb.c000.snappy.parquet | Bin .../_delta_log/00000000000000000000.json | 0 ...-9918-6cab4f7578f7.c000.snappy.parquet.crc | Bin ...45df-9918-6cab4f7578f7.c000.snappy.parquet | Bin ...-be8e-e9f5b8a22890.c000.snappy.parquet.crc | Bin ...457b-be8e-e9f5b8a22890.c000.snappy.parquet | Bin ...-8d18-ba5711d6cbe1-c000.snappy.parquet.crc | Bin ...-93ba-ff6bfaf892a1-c000.snappy.parquet.crc | Bin ...-8620-5e68c2654989-c000.snappy.parquet.crc | Bin .../data/delta-0.8.0/_change_data/.gitkeep | 0 .../data/delta-0.8.0/_delta_index/.gitkeep | 0 .../_delta_log/00000000000000000000.json | 0 .../_delta_log/00000000000000000001.json | 0 ...459e-8d18-ba5711d6cbe1-c000.snappy.parquet | Bin ...46c8-93ba-ff6bfaf892a1-c000.snappy.parquet | Bin ...4acb-8620-5e68c2654989-c000.snappy.parquet | Bin 
.../_delta_log/00000000000000000000.crc | 0 .../_delta_log/00000000000000000000.json | 0 .../_delta_log/00000000000000000001.crc | 0 .../_delta_log/00000000000000000001.json | 0 .../_delta_log/00000000000000000002.crc | 0 .../_delta_log/00000000000000000002.json | 0 .../_delta_log/00000000000000000003.crc | 0 .../_delta_log/00000000000000000003.json | 0 .../_delta_log/00000000000000000004.crc | 0 .../_delta_log/00000000000000000004.json | 0 .../_delta_log/00000000000000000005.crc | 0 .../_delta_log/00000000000000000005.json | 0 .../_delta_log/00000000000000000006.crc | 0 .../_delta_log/00000000000000000006.json | 0 .../_delta_log/00000000000000000007.crc | 0 .../_delta_log/00000000000000000007.json | 0 .../_delta_log/00000000000000000008.crc | 0 .../_delta_log/00000000000000000008.json | 0 .../_delta_log/00000000000000000009.crc | 0 .../_delta_log/00000000000000000009.json | 0 .../00000000000000000010.checkpoint.parquet | Bin .../_delta_log/00000000000000000010.crc | 0 .../_delta_log/00000000000000000010.json | 0 .../_delta_log/00000000000000000011.crc | 0 .../_delta_log/00000000000000000011.json | 0 .../_delta_log/00000000000000000012.crc | 0 .../_delta_log/00000000000000000012.json | 0 .../_delta_log/_last_checkpoint | 0 ...484f-87ff-4328ea56045d-c000.snappy.parquet | Bin ...411e-bca9-b067444cbcb0-c000.snappy.parquet | Bin ...405b-be86-68a812f2e4c8-c000.snappy.parquet | Bin ...4bd6-9293-b5daab2ce667-c000.snappy.parquet | Bin ...4453-9202-51d75dee59af-c000.snappy.parquet | Bin ...4601-ac29-68cba64023b5-c000.snappy.parquet | Bin ...45b0-a7ff-2f0395a53966-c000.snappy.parquet | Bin ...42a4-b50c-5a4bf724c037-c000.snappy.parquet | Bin ...4b8f-8ba9-49422fdf9f2e-c000.snappy.parquet | Bin ...4910-aea9-4eaf92f0c68c-c000.snappy.parquet | Bin ...4c9f-98f9-8f3d346727ba-c000.snappy.parquet | Bin ...4d67-ac43-4fbf948bfb9b-c000.snappy.parquet | Bin .../_delta_log/.00000000000000000000.json.crc | Bin .../_delta_log/00000000000000000000.json | 0 ...-82d6-d42121d883fd.c000.snappy.parquet.crc | Bin ...46f5-82d6-d42121d883fd.c000.snappy.parquet | Bin ...-8051-f8b54328ffdb.c000.snappy.parquet.crc | Bin ...424a-8051-f8b54328ffdb.c000.snappy.parquet | Bin ...-acc4-2a9608499d7c.c000.snappy.parquet.crc | Bin ...4fd0-acc4-2a9608499d7c.c000.snappy.parquet | Bin .../_delta_log/00000000000000000000.json | 0 .../_delta_log/00000000000000000001.json | 0 ...-a5cb-8d9d483ed390-c000.snappy.parquet.crc | Bin ...-8c72-423ee747abc0-c000.snappy.parquet.crc | Bin .../_delta_log/00000000000000000000.json | 0 ...470d-a5cb-8d9d483ed390-c000.snappy.parquet | Bin ...4b73-8c72-423ee747abc0-c000.snappy.parquet | Bin .../_delta_log/00000000000000000000.json | 0 .../_delta_log/00000000000000000001.json | 0 ...450c-8af1-4145b73a96c7-c000.snappy.parquet | Bin ...4bc3-92e5-96347fe3fd84-c000.snappy.parquet | Bin .../_delta_log/00000000000000000000.json | 0 .../00000000000000000001.checkpoint.parquet | Bin .../_delta_log/00000000000000000001.json | 0 .../issue_1374/_delta_log/_last_checkpoint | 0 ...4008-82df-e98efdcdd47d-c000.snappy.parquet | Bin ...4008-82df-e98efdcdd49c-c000.snappy.parquet | Bin ...-b8b4-bef3de0de409-c000.snappy.parquet.crc | Bin ...-8606-f8311a96f6dc-c000.snappy.parquet.crc | Bin ...-ae49-fc48b973e37e-c000.snappy.parquet.crc | Bin ...-9507-7ccf67924649-c000.snappy.parquet.crc | Bin .../data/simple_commit/_delta_log/.gitignore | 0 .../_delta_log/00000000000000000000.json | 0 ...4193-b8b4-bef3de0de409-c000.snappy.parquet | Bin ...4f3a-8606-f8311a96f6dc-c000.snappy.parquet | Bin 
...4dea-ae49-fc48b973e37e-c000.snappy.parquet | Bin ...4477-9507-7ccf67924649-c000.snappy.parquet | Bin ...-a43c-3eda0d2a499d-c000.snappy.parquet.crc | Bin ...-a8f0-e65b746382dd-c000.snappy.parquet.crc | Bin ...-9074-a278c24c4449-c000.snappy.parquet.crc | Bin ...-b38a-6ee7e24456b1-c000.snappy.parquet.crc | Bin ...-adae-ce66d1fcaef6-c000.snappy.parquet.crc | Bin ...-88a6-abcfb049d3b4-c000.snappy.parquet.crc | Bin ...-9c85-f34969ad3aa9-c000.snappy.parquet.crc | Bin ...-a42b-9731b2e490ae-c000.snappy.parquet.crc | Bin ...-a923-f6f89930a5c1-c000.snappy.parquet.crc | Bin ...-8d07-599a21197296-c000.snappy.parquet.crc | Bin ...-98f6-5e6cfa3ae45d-c000.snappy.parquet.crc | Bin ...-861f-5a649e3d9674-c000.snappy.parquet.crc | Bin ...-b3cc-84502b0c314f-c000.snappy.parquet.crc | Bin ...-a080-73e02491c643-c000.snappy.parquet.crc | Bin ...-8498-7bfb2940713b-c000.snappy.parquet.crc | Bin ...-b9a1-7e717b67f294-c000.snappy.parquet.crc | Bin ...-81ef-5223cf40f025-c000.snappy.parquet.crc | Bin ...-9e83-e31021a93cf9-c000.snappy.parquet.crc | Bin ...-98f5-2fccfa1b123f-c000.snappy.parquet.crc | Bin ...-8d34-a0018cf73b70-c000.snappy.parquet.crc | Bin ...-a888-81565a40161d-c000.snappy.parquet.crc | Bin ...-8475-e21d2a2935f8-c000.snappy.parquet.crc | Bin ...-9403-53e33b3778ac-c000.snappy.parquet.crc | Bin ...-816f-cbd30a3f8c1b-c000.snappy.parquet.crc | Bin ...-93f6-0acf11199a0d-c000.snappy.parquet.crc | Bin ...-be15-135e15b398f4-c000.snappy.parquet.crc | Bin ...-9acd-623e740be992-c000.snappy.parquet.crc | Bin ...-8cd4-6688aad8585d-c000.snappy.parquet.crc | Bin ...-9909-78da7294ffbd-c000.snappy.parquet.crc | Bin ...-a8b4-578c9e9a218d-c000.snappy.parquet.crc | Bin ...-b07f-975d2226b800-c000.snappy.parquet.crc | Bin ...-aa43-993cdf937fd3-c000.snappy.parquet.crc | Bin ...-9613-f5ad1940b689-c000.snappy.parquet.crc | Bin ...-a03d-e356fcd1564a-c000.snappy.parquet.crc | Bin ...-befa-90f056c2d77a-c000.snappy.parquet.crc | Bin ...-a3d3-8dc112766ff5-c000.snappy.parquet.crc | Bin .../_delta_log/00000000000000000000.json | 0 .../_delta_log/00000000000000000001.json | 0 .../_delta_log/00000000000000000002.json | 0 .../_delta_log/00000000000000000003.json | 0 .../_delta_log/00000000000000000004.json | 0 ...4768-a43c-3eda0d2a499d-c000.snappy.parquet | Bin ...41fe-a8f0-e65b746382dd-c000.snappy.parquet | Bin ...4ca1-9074-a278c24c4449-c000.snappy.parquet | Bin ...4790-b38a-6ee7e24456b1-c000.snappy.parquet | Bin ...40ba-adae-ce66d1fcaef6-c000.snappy.parquet | Bin ...47c3-88a6-abcfb049d3b4-c000.snappy.parquet | Bin ...4df2-9c85-f34969ad3aa9-c000.snappy.parquet | Bin ...4e2b-a42b-9731b2e490ae-c000.snappy.parquet | Bin ...4c2c-a923-f6f89930a5c1-c000.snappy.parquet | Bin ...4f13-8d07-599a21197296-c000.snappy.parquet | Bin ...4562-98f6-5e6cfa3ae45d-c000.snappy.parquet | Bin ...420c-861f-5a649e3d9674-c000.snappy.parquet | Bin ...4581-b3cc-84502b0c314f-c000.snappy.parquet | Bin ...4bd9-a080-73e02491c643-c000.snappy.parquet | Bin ...4dda-8498-7bfb2940713b-c000.snappy.parquet | Bin ...482d-b9a1-7e717b67f294-c000.snappy.parquet | Bin ...41b6-81ef-5223cf40f025-c000.snappy.parquet | Bin ...4b00-9e83-e31021a93cf9-c000.snappy.parquet | Bin ...40af-98f5-2fccfa1b123f-c000.snappy.parquet | Bin ...45b1-8d34-a0018cf73b70-c000.snappy.parquet | Bin ...49c1-a888-81565a40161d-c000.snappy.parquet | Bin ...4148-8475-e21d2a2935f8-c000.snappy.parquet | Bin ...492b-9403-53e33b3778ac-c000.snappy.parquet | Bin ...4643-816f-cbd30a3f8c1b-c000.snappy.parquet | Bin ...48ee-93f6-0acf11199a0d-c000.snappy.parquet | Bin ...4f4c-be15-135e15b398f4-c000.snappy.parquet | Bin 
...4fde-9acd-623e740be992-c000.snappy.parquet | Bin ...4f34-8cd4-6688aad8585d-c000.snappy.parquet | Bin ...48ce-9909-78da7294ffbd-c000.snappy.parquet | Bin ...4a90-a8b4-578c9e9a218d-c000.snappy.parquet | Bin ...43fb-b07f-975d2226b800-c000.snappy.parquet | Bin ...4193-aa43-993cdf937fd3-c000.snappy.parquet | Bin ...4871-9613-f5ad1940b689-c000.snappy.parquet | Bin ...48fb-a03d-e356fcd1564a-c000.snappy.parquet | Bin ...4c02-befa-90f056c2d77a-c000.snappy.parquet | Bin ...461d-a3d3-8dc112766ff5-c000.snappy.parquet | Bin ...-a43c-3eda0d2a499d-c000.snappy.parquet.crc | Bin ...-a8f0-e65b746382dd-c000.snappy.parquet.crc | Bin ...-9074-a278c24c4449-c000.snappy.parquet.crc | Bin ...-b38a-6ee7e24456b1-c000.snappy.parquet.crc | Bin ...-adae-ce66d1fcaef6-c000.snappy.parquet.crc | Bin ...-88a6-abcfb049d3b4-c000.snappy.parquet.crc | Bin ...-9c85-f34969ad3aa9-c000.snappy.parquet.crc | Bin ...-a42b-9731b2e490ae-c000.snappy.parquet.crc | Bin ...-a923-f6f89930a5c1-c000.snappy.parquet.crc | Bin ...-8d07-599a21197296-c000.snappy.parquet.crc | Bin ...-98f6-5e6cfa3ae45d-c000.snappy.parquet.crc | Bin ...-861f-5a649e3d9674-c000.snappy.parquet.crc | Bin ...-b3cc-84502b0c314f-c000.snappy.parquet.crc | Bin ...-a080-73e02491c643-c000.snappy.parquet.crc | Bin ...-8498-7bfb2940713b-c000.snappy.parquet.crc | Bin ...-b9a1-7e717b67f294-c000.snappy.parquet.crc | Bin ...-81ef-5223cf40f025-c000.snappy.parquet.crc | Bin ...-9e83-e31021a93cf9-c000.snappy.parquet.crc | Bin ...-98f5-2fccfa1b123f-c000.snappy.parquet.crc | Bin ...-8d34-a0018cf73b70-c000.snappy.parquet.crc | Bin ...-a888-81565a40161d-c000.snappy.parquet.crc | Bin ...-8475-e21d2a2935f8-c000.snappy.parquet.crc | Bin ...-9403-53e33b3778ac-c000.snappy.parquet.crc | Bin ...-816f-cbd30a3f8c1b-c000.snappy.parquet.crc | Bin ...-93f6-0acf11199a0d-c000.snappy.parquet.crc | Bin ...-be15-135e15b398f4-c000.snappy.parquet.crc | Bin ...-9acd-623e740be992-c000.snappy.parquet.crc | Bin ...-8cd4-6688aad8585d-c000.snappy.parquet.crc | Bin ...-9909-78da7294ffbd-c000.snappy.parquet.crc | Bin ...-a8b4-578c9e9a218d-c000.snappy.parquet.crc | Bin ...-b07f-975d2226b800-c000.snappy.parquet.crc | Bin ...-aa43-993cdf937fd3-c000.snappy.parquet.crc | Bin ...-9613-f5ad1940b689-c000.snappy.parquet.crc | Bin ...-a03d-e356fcd1564a-c000.snappy.parquet.crc | Bin ...-befa-90f056c2d77a-c000.snappy.parquet.crc | Bin ...-a3d3-8dc112766ff5-c000.snappy.parquet.crc | Bin .../_delta_log/00000000000000000000.json | 0 .../_delta_log/00000000000000000001.json | 0 .../_delta_log/00000000000000000002.json | 0 .../_delta_log/00000000000000000003.json | 0 .../_delta_log/00000000000000000004.json | 0 ...4768-a43c-3eda0d2a499d-c000.snappy.parquet | Bin ...41fe-a8f0-e65b746382dd-c000.snappy.parquet | Bin ...4ca1-9074-a278c24c4449-c000.snappy.parquet | Bin ...4790-b38a-6ee7e24456b1-c000.snappy.parquet | Bin ...40ba-adae-ce66d1fcaef6-c000.snappy.parquet | Bin ...47c3-88a6-abcfb049d3b4-c000.snappy.parquet | Bin ...4df2-9c85-f34969ad3aa9-c000.snappy.parquet | Bin ...4e2b-a42b-9731b2e490ae-c000.snappy.parquet | Bin ...4c2c-a923-f6f89930a5c1-c000.snappy.parquet | Bin ...4f13-8d07-599a21197296-c000.snappy.parquet | Bin ...4562-98f6-5e6cfa3ae45d-c000.snappy.parquet | Bin ...420c-861f-5a649e3d9674-c000.snappy.parquet | Bin ...4581-b3cc-84502b0c314f-c000.snappy.parquet | Bin ...4bd9-a080-73e02491c643-c000.snappy.parquet | Bin ...4dda-8498-7bfb2940713b-c000.snappy.parquet | Bin ...482d-b9a1-7e717b67f294-c000.snappy.parquet | Bin ...41b6-81ef-5223cf40f025-c000.snappy.parquet | Bin ...4b00-9e83-e31021a93cf9-c000.snappy.parquet | Bin 
...40af-98f5-2fccfa1b123f-c000.snappy.parquet | Bin ...45b1-8d34-a0018cf73b70-c000.snappy.parquet | Bin ...49c1-a888-81565a40161d-c000.snappy.parquet | Bin ...4148-8475-e21d2a2935f8-c000.snappy.parquet | Bin ...492b-9403-53e33b3778ac-c000.snappy.parquet | Bin ...4643-816f-cbd30a3f8c1b-c000.snappy.parquet | Bin ...48ee-93f6-0acf11199a0d-c000.snappy.parquet | Bin ...4f4c-be15-135e15b398f4-c000.snappy.parquet | Bin ...4fde-9acd-623e740be992-c000.snappy.parquet | Bin ...4f34-8cd4-6688aad8585d-c000.snappy.parquet | Bin ...48ce-9909-78da7294ffbd-c000.snappy.parquet | Bin ...4a90-a8b4-578c9e9a218d-c000.snappy.parquet | Bin ...43fb-b07f-975d2226b800-c000.snappy.parquet | Bin ...4193-aa43-993cdf937fd3-c000.snappy.parquet | Bin ...4871-9613-f5ad1940b689-c000.snappy.parquet | Bin ...48fb-a03d-e356fcd1564a-c000.snappy.parquet | Bin ...4c02-befa-90f056c2d77a-c000.snappy.parquet | Bin ...461d-a3d3-8dc112766ff5-c000.snappy.parquet | Bin ...484d-bef7-0e63557786ca.c000.snappy.parquet | Bin .../_delta_log/00000000000000000000.crc | 0 .../_delta_log/00000000000000000000.json | 0 .../_delta_log/00000000000000000001.crc | 0 .../_delta_log/00000000000000000001.json | 0 .../_delta_log/00000000000000000002.crc | 0 .../_delta_log/00000000000000000002.json | 0 ...4a4c-8abe-3323499043e9.c000.snappy.parquet | Bin ...4a5f-9921-6e56269ec2c9-c000.snappy.parquet | Bin ...-bb0f-15cde3fb14eb-c000.snappy.parquet.crc | Bin ...-98c1-7a69872fd797-c000.snappy.parquet.crc | Bin ...-bcfd-7de5788dfe8d-c000.snappy.parquet.crc | Bin ...-a195-5f1ae583e553-c000.snappy.parquet.crc | Bin ...-99e6-23f1ac2b7b7c-c000.snappy.parquet.crc | Bin ...-b3f6-9f256992c633-c000.snappy.parquet.crc | Bin ...-a411-46d4295da531-c000.snappy.parquet.crc | Bin ...-a05e-8032113a6568-c000.snappy.parquet.crc | Bin ...-a9da-7c6f53f6406b-c000.snappy.parquet.crc | Bin ...-bc1f-de9bd8ae025b-c000.snappy.parquet.crc | Bin ...-834e-dcc098fc9005-c000.snappy.parquet.crc | Bin ...0000000000000000010.checkpoint.parquet.crc | Bin .../_delta_log/00000000000000000000.json | 0 .../_delta_log/00000000000000000001.json | 0 .../_delta_log/00000000000000000002.json | 0 .../_delta_log/00000000000000000003.json | 0 .../_delta_log/00000000000000000004.json | 0 .../_delta_log/00000000000000000005.json | 0 .../_delta_log/00000000000000000006.json | 0 .../_delta_log/00000000000000000007.json | 0 .../_delta_log/00000000000000000008.json | 0 .../_delta_log/00000000000000000009.json | 0 .../00000000000000000010.checkpoint.parquet | Bin .../_delta_log/00000000000000000010.json | 0 .../_delta_log/_last_checkpoint | 0 ...4e95-bb0f-15cde3fb14eb-c000.snappy.parquet | Bin ...46c5-98c1-7a69872fd797-c000.snappy.parquet | Bin ...431d-bcfd-7de5788dfe8d-c000.snappy.parquet | Bin ...4b18-a195-5f1ae583e553-c000.snappy.parquet | Bin ...4df4-99e6-23f1ac2b7b7c-c000.snappy.parquet | Bin ...4b02-b3f6-9f256992c633-c000.snappy.parquet | Bin ...40b8-a411-46d4295da531-c000.snappy.parquet | Bin ...4420-a05e-8032113a6568-c000.snappy.parquet | Bin ...4170-a9da-7c6f53f6406b-c000.snappy.parquet | Bin ...4c70-bc1f-de9bd8ae025b-c000.snappy.parquet | Bin ...4eec-834e-dcc098fc9005-c000.snappy.parquet | Bin .../_delta_log/00000000000000000000.json | 0 .../_delta_log/00000000000000000001.json | 0 ...r_61d16c75-6994-46b7-a15b-8b538852e50e.bin | Bin ...4e51-827b-c3d5516560ca-c000.snappy.parquet | Bin .../_delta_log/00000000000000000000.json | 0 ...48e8-82b4-0229cc194867-c000.snappy.parquet | Bin ...c-4ffa-bf5c-a0c2833d05eb.c000.zstd.parquet | Bin ...4-48d9-aa60-438228358f1a.c000.zstd.parquet | Bin 
.../_delta_log/00000000000000000000.crc | 0 .../_delta_log/00000000000000000000.json | 0 .../_delta_log/00000000000000000000.crc | 0 .../_delta_log/00000000000000000000.json | 0 .../_delta_log/00000000000000000001.crc | 0 .../_delta_log/00000000000000000001.json | 0 .../_delta_log/00000000000000000002.crc | 0 .../_delta_log/00000000000000000002.json | 0 .../_delta_log/00000000000000000003.crc | 0 .../_delta_log/00000000000000000003.json | 0 .../_delta_log/00000000000000000004.crc | 0 .../_delta_log/00000000000000000004.json | 0 .../_delta_log/00000000000000000005.crc | 0 .../_delta_log/00000000000000000005.json | 0 .../_delta_log/00000000000000000006.crc | 0 .../_delta_log/00000000000000000006.json | 0 .../_delta_log/00000000000000000007.crc | 0 .../_delta_log/00000000000000000007.json | 0 .../_delta_log/00000000000000000008.crc | 0 .../_delta_log/00000000000000000008.json | 0 .../_delta_log/00000000000000000009.crc | 0 .../_delta_log/00000000000000000009.json | 0 .../00000000000000000010.checkpoint.parquet | Bin .../_delta_log/00000000000000000010.crc | 0 .../_delta_log/00000000000000000010.json | 0 .../_delta_log/00000000000000000011.crc | 0 .../_delta_log/00000000000000000011.json | 0 .../_delta_log/00000000000000000012.crc | 0 .../_delta_log/00000000000000000012.json | 0 .../_delta_log/00000000000000000013.crc | 0 .../_delta_log/00000000000000000013.json | 0 .../_delta_log/00000000000000000014.crc | 0 .../_delta_log/00000000000000000014.json | 0 .../_delta_log/00000000000000000015.crc | 0 .../_delta_log/00000000000000000015.json | 0 .../_delta_log/00000000000000000016.crc | 0 .../_delta_log/00000000000000000016.json | 0 .../_delta_log/00000000000000000017.crc | 0 .../_delta_log/00000000000000000017.json | 0 .../_delta_log/00000000000000000018.crc | 0 .../_delta_log/00000000000000000018.json | 0 .../_delta_log/00000000000000000019.crc | 0 .../_delta_log/00000000000000000019.json | 0 .../00000000000000000020.checkpoint.parquet | Bin .../_delta_log/00000000000000000020.crc | 0 .../_delta_log/00000000000000000020.json | 0 .../_delta_log/_last_checkpoint | 0 ...r_8e4ca8be-7615-43cf-bc06-5d131148683f.bin | Bin ...r_a2084964-69d4-4e1e-95f5-9bbd6571d5c3.bin | Bin ...437a-a9a7-fbfc5137c77d.c000.snappy.parquet | Bin .../_delta_log/00000000000000000000.crc | 0 .../_delta_log/00000000000000000000.json | 0 ...491d-b3c9-3eea548de6cb-c000.snappy.parquet | Bin ...4e13-a624-ddd50ce7f5c4-c000.snappy.parquet | Bin .../_delta_log/.s3-optimization-0 | 0 .../_delta_log/.s3-optimization-1 | 0 .../_delta_log/.s3-optimization-2 | 0 .../_delta_log/00000000000000000000.crc | 0 .../_delta_log/00000000000000000000.json | 0 ...4d88-b78c-cebe430cdd47.c000.snappy.parquet | Bin ...41d6-ab41-f02007d1658c.c000.snappy.parquet | Bin ...4bd4-bdc0-cd25fcc951c6.c000.snappy.parquet | Bin ...409b-8a2d-18462928840e.c000.snappy.parquet | Bin ...4027-87d6-940fcf593a60.c000.snappy.parquet | Bin ...49e6-a25f-b0211cf95d20.c000.snappy.parquet | Bin ...425d-b49a-5afe731aaac8.c000.snappy.parquet | Bin ...44dd-a793-922e30c1b9df.c000.snappy.parquet | Bin ...4809-b02a-ddebda3966e8.c000.snappy.parquet | Bin ...4287-8333-92cb01a5124b.c000.snappy.parquet | Bin .../_delta_log/00000000000000000000.json | 0 .../_delta_log/00000000000000000001.json | 0 .../00000000000000000002.checkpoint.parquet | Bin .../_delta_log/00000000000000000002.json | 0 .../_delta_log/00000000000000000003.json | 0 crates/deltalake/Cargo.toml | 9 +- crates/deltalake/src/lib.rs | 3 + python/src/lib.rs | 2 + python/tests/conftest.py | 2 +- python/tests/test_schema.py | 6 
+- python/tests/test_table_read.py | 58 +- python/tests/test_vacuum.py | 2 +- 542 files changed, 2008 insertions(+), 4641 deletions(-) rename crates/{deltalake-core/tests/common/s3.rs => deltalake-aws/helpers.rs} (100%) rename crates/{deltalake-core/src/logstore/s3/mod.rs => deltalake-aws/src/logstore.rs} (76%) create mode 100644 crates/deltalake-aws/src/storage.rs create mode 100644 crates/deltalake-aws/tests/common.rs create mode 100644 crates/deltalake-aws/tests/integration_read.rs rename crates/{deltalake-core => deltalake-aws}/tests/integration_s3_dynamodb.rs (91%) rename crates/{deltalake-core => deltalake-aws}/tests/repair_s3_rename_test.rs (95%) delete mode 100644 crates/deltalake-core/src/protocol/parquet2_read/boolean.rs delete mode 100644 crates/deltalake-core/src/protocol/parquet2_read/dictionary/binary.rs delete mode 100644 crates/deltalake-core/src/protocol/parquet2_read/dictionary/mod.rs delete mode 100644 crates/deltalake-core/src/protocol/parquet2_read/dictionary/primitive.rs delete mode 100644 crates/deltalake-core/src/protocol/parquet2_read/map.rs delete mode 100644 crates/deltalake-core/src/protocol/parquet2_read/mod.rs delete mode 100644 crates/deltalake-core/src/protocol/parquet2_read/primitive.rs delete mode 100644 crates/deltalake-core/src/protocol/parquet2_read/stats.rs delete mode 100644 crates/deltalake-core/src/protocol/parquet2_read/string.rs delete mode 100644 crates/deltalake-core/src/protocol/parquet2_read/validity.rs delete mode 100644 crates/deltalake-core/src/storage/config.rs delete mode 100644 crates/deltalake-core/src/storage/s3.rs delete mode 100644 crates/deltalake-core/tests/common/adls.rs delete mode 100644 crates/deltalake-core/tests/common/hdfs.rs delete mode 100644 crates/deltalake-core/tests/integration_object_store.rs create mode 100644 crates/deltalake-test/.gitignore create mode 100644 crates/deltalake-test/Cargo.toml rename crates/{deltalake-core/tests/common => deltalake-test/src}/clock.rs (100%) rename crates/{deltalake-core/tests/common => deltalake-test/src}/datafusion.rs (68%) rename crates/{deltalake-core/tests/common/mod.rs => deltalake-test/src/lib.rs} (94%) rename crates/{deltalake-core/src/test_utils.rs => deltalake-test/src/utils.rs} (67%) rename crates/{deltalake-core => deltalake-test}/tests/data/COVID-19_NYT/_delta_log/.s3-optimization-0 (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/COVID-19_NYT/_delta_log/.s3-optimization-1 (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/COVID-19_NYT/_delta_log/.s3-optimization-2 (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/COVID-19_NYT/_delta_log/00000000000000000000.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/COVID-19_NYT/_delta_log/00000000000000000000.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/COVID-19_NYT/part-00000-a496f40c-e091-413a-85f9-b1b69d4b3b4e-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/COVID-19_NYT/part-00001-9d9d980b-c500-4f0b-bb96-771a515fbccc-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/COVID-19_NYT/part-00002-8826af84-73bd-49a6-a4b9-e39ffed9c15a-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/COVID-19_NYT/part-00003-539aff30-2349-4b0d-9726-c18630c6ad90-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/COVID-19_NYT/part-00004-1bb9c3e3-c5b0-4d60-8420-23261f58a5eb-c000.snappy.parquet (100%) rename 
crates/{deltalake-core => deltalake-test}/tests/data/COVID-19_NYT/part-00005-4d47f8ff-94db-4d32-806c-781a1cf123d2-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/COVID-19_NYT/part-00006-d0ec7722-b30c-4e1c-92cd-b4fe8d3bb954-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/COVID-19_NYT/part-00007-4582392f-9fc2-41b0-ba97-a74b3afc8239-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/checkpoint_with_partitions/_delta_log/00000000000000000001.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/checkpoint_with_partitions/_delta_log/00000000000000000002.checkpoint.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/checkpoint_with_partitions/_delta_log/00000000000000000002.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/checkpoint_with_partitions/_delta_log/_last_checkpoint (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/checkpoints/_delta_log/.gitignore (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/checkpoints/_delta_log/00000000000000000000.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/checkpoints/_delta_log/00000000000000000001.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/checkpoints/_delta_log/00000000000000000002.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/checkpoints/_delta_log/00000000000000000003.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/checkpoints/_delta_log/00000000000000000004.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/checkpoints/_delta_log/00000000000000000005.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/checkpoints/_delta_log/00000000000000000006.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/checkpoints/_delta_log/00000000000000000007.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/checkpoints/_delta_log/00000000000000000008.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/checkpoints/_delta_log/00000000000000000009.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/checkpoints/_delta_log/00000000000000000010.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/checkpoints/_delta_log/00000000000000000011.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/checkpoints/_delta_log/00000000000000000012.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/checkpoints_tombstones/.gitignore (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/checkpoints_vacuumed/_delta_log/00000000000000000005.checkpoint.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/checkpoints_vacuumed/_delta_log/00000000000000000005.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/checkpoints_vacuumed/_delta_log/00000000000000000006.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/checkpoints_vacuumed/_delta_log/00000000000000000007.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/checkpoints_vacuumed/_delta_log/00000000000000000008.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/checkpoints_vacuumed/_delta_log/00000000000000000009.json (100%) rename crates/{deltalake-core => 
deltalake-test}/tests/data/checkpoints_vacuumed/_delta_log/00000000000000000010.checkpoint.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/checkpoints_vacuumed/_delta_log/00000000000000000010.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/checkpoints_vacuumed/_delta_log/00000000000000000011.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/checkpoints_vacuumed/_delta_log/00000000000000000012.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/concurrent_workers/_delta_log/.gitignore (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/concurrent_workers/_delta_log/00000000000000000000.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.2.0/.part-00000-512e1537-8aaa-4193-b8b4-bef3de0de409-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.2.0/.part-00000-7c2deba3-1994-4fb8-bc07-d46c948aa415-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.2.0/.part-00000-b44fcdb0-8b06-4f3a-8606-f8311a96f6dc-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.2.0/.part-00000-cb6b150b-30b8-4662-ad28-ff32ddab96d2-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.2.0/.part-00001-185eca06-e017-4dea-ae49-fc48b973e37e-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.2.0/.part-00001-4327c977-2734-4477-9507-7ccf67924649-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.2.0/.part-00001-c373a5bd-85f0-4758-815e-7eb62007a15c-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.2.0/_delta_log/..00000000000000000000.json.c6b312ca-665d-46ab-93a9-9f87ad2baa92.tmp.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.2.0/_delta_log/..00000000000000000001.json.641a776e-6e56-4423-a9b0-7efc9e58826a.tmp.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.2.0/_delta_log/..00000000000000000002.json.e64807e6-437c-44c9-abd2-50e6514d236e.tmp.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.2.0/_delta_log/..00000000000000000003.json.b374eda7-fa09-48ce-b06c-56025163f6ae.tmp.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.2.0/_delta_log/.._last_checkpoint.477ba875-7a14-4e57-9973-1349c21a152c.tmp.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.2.0/_delta_log/.00000000000000000003.checkpoint.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.2.0/_delta_log/00000000000000000000.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.2.0/_delta_log/00000000000000000001.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.2.0/_delta_log/00000000000000000002.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.2.0/_delta_log/00000000000000000003.checkpoint.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.2.0/_delta_log/00000000000000000003.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.2.0/_delta_log/_last_checkpoint (100%) rename crates/{deltalake-core => 
deltalake-test}/tests/data/delta-0.2.0/part-00000-512e1537-8aaa-4193-b8b4-bef3de0de409-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.2.0/part-00000-7c2deba3-1994-4fb8-bc07-d46c948aa415-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.2.0/part-00000-b44fcdb0-8b06-4f3a-8606-f8311a96f6dc-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.2.0/part-00000-cb6b150b-30b8-4662-ad28-ff32ddab96d2-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.2.0/part-00001-185eca06-e017-4dea-ae49-fc48b973e37e-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.2.0/part-00001-4327c977-2734-4477-9507-7ccf67924649-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.2.0/part-00001-c373a5bd-85f0-4758-815e-7eb62007a15c-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8-empty/.part-00000-b0cc5102-6177-4d60-80d3-b5d170011621-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8-empty/.part-00007-02b8c308-e5a7-41a8-a653-cb5594582017-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8-empty/_delta_log/00000000000000000000.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8-empty/_delta_log/00000000000000000001.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8-empty/part-00000-b0cc5102-6177-4d60-80d3-b5d170011621-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8-empty/part-00007-02b8c308-e5a7-41a8-a653-cb5594582017-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0-date/.part-00000-d22c627d-9655-4153-9527-f8995620fa42-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0-date/_delta_log/00000000000000000000.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0-date/part-00000-d22c627d-9655-4153-9527-f8995620fa42-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0-null-partition/_delta_log/00000000000000000000.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0-null-partition/k=A/part-00000-b1f1dbbb-70bc-4970-893f-9bb772bf246e.c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0-null-partition/k=__HIVE_DEFAULT_PARTITION__/part-00001-8474ac85-360b-4f58-b3ea-23990c71b932.c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0-numeric-partition/_delta_log/00000000000000000000.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0-numeric-partition/x=10/y=10.0/.part-00015-24eb4845-2d25-4448-b3bb-5ed7f12635ab.c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0-numeric-partition/x=10/y=10.0/part-00015-24eb4845-2d25-4448-b3bb-5ed7f12635ab.c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0-numeric-partition/x=9/y=9.9/.part-00007-3c50fba1-4264-446c-9c67-d8e24a1ccf83.c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => 
deltalake-test}/tests/data/delta-0.8.0-numeric-partition/x=9/y=9.9/part-00007-3c50fba1-4264-446c-9c67-d8e24a1ccf83.c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0-partitioned/_delta_log/00000000000000000000.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0-partitioned/year=2020/month=1/day=1/.part-00000-8eafa330-3be9-4a39-ad78-fd13c2027c7e.c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0-partitioned/year=2020/month=1/day=1/part-00000-8eafa330-3be9-4a39-ad78-fd13c2027c7e.c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0-partitioned/year=2020/month=2/day=3/.part-00000-94d16827-f2fd-42cd-a060-f67ccc63ced9.c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0-partitioned/year=2020/month=2/day=3/part-00000-94d16827-f2fd-42cd-a060-f67ccc63ced9.c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0-partitioned/year=2020/month=2/day=5/.part-00000-89cdd4c8-2af7-4add-8ea3-3990b2f027b5.c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0-partitioned/year=2020/month=2/day=5/part-00000-89cdd4c8-2af7-4add-8ea3-3990b2f027b5.c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0-partitioned/year=2021/month=12/day=20/.part-00000-9275fdf4-3961-4184-baa0-1c8a2bb98104.c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0-partitioned/year=2021/month=12/day=20/part-00000-9275fdf4-3961-4184-baa0-1c8a2bb98104.c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0-partitioned/year=2021/month=12/day=4/.part-00000-6dc763c0-3e8b-4d52-b19e-1f92af3fbb25.c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0-partitioned/year=2021/month=12/day=4/part-00000-6dc763c0-3e8b-4d52-b19e-1f92af3fbb25.c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0-partitioned/year=2021/month=4/day=5/.part-00000-c5856301-3439-4032-a6fc-22b7bc92bebb.c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0-partitioned/year=2021/month=4/day=5/part-00000-c5856301-3439-4032-a6fc-22b7bc92bebb.c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0-special-partition/_delta_log/00000000000000000000.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0-special-partition/x=A%2FA/.part-00007-b350e235-2832-45df-9918-6cab4f7578f7.c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0-special-partition/x=A%2FA/part-00007-b350e235-2832-45df-9918-6cab4f7578f7.c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0-special-partition/x=B%20B/.part-00015-e9abbc6f-85e9-457b-be8e-e9f5b8a22890.c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0-special-partition/x=B%20B/part-00015-e9abbc6f-85e9-457b-be8e-e9f5b8a22890.c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0/.part-00000-04ec9591-0b73-459e-8d18-ba5711d6cbe1-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => 
deltalake-test}/tests/data/delta-0.8.0/.part-00000-c9b90f86-73e6-46c8-93ba-ff6bfaf892a1-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0/.part-00001-911a94a2-43f6-4acb-8620-5e68c2654989-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0/_change_data/.gitkeep (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0/_delta_index/.gitkeep (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0/_delta_log/00000000000000000000.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0/_delta_log/00000000000000000001.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0/part-00000-04ec9591-0b73-459e-8d18-ba5711d6cbe1-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0/part-00000-c9b90f86-73e6-46c8-93ba-ff6bfaf892a1-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-0.8.0/part-00001-911a94a2-43f6-4acb-8620-5e68c2654989-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000000.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000000.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000001.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000001.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000002.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000002.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000003.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000003.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000004.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000004.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000005.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000005.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000006.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000006.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000007.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000007.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000008.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000008.json (100%) 
rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000009.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000009.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000010.checkpoint.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000010.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000010.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000011.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000011.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000012.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000012.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/_delta_log/_last_checkpoint (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/part-00000-1c2d1a32-02dc-484f-87ff-4328ea56045d-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/part-00000-28925d3a-bdf2-411e-bca9-b067444cbcb0-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/part-00000-6630b7c4-0aca-405b-be86-68a812f2e4c8-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/part-00000-74151571-7ec6-4bd6-9293-b5daab2ce667-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/part-00000-7a509247-4f58-4453-9202-51d75dee59af-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/part-00000-8e0aefe1-6645-4601-ac29-68cba64023b5-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/part-00000-b26ba634-874c-45b0-a7ff-2f0395a53966-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/part-00000-c4c8caec-299d-42a4-b50c-5a4bf724c037-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/part-00000-ce300400-58ff-4b8f-8ba9-49422fdf9f2e-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/part-00000-e1262b3e-2959-4910-aea9-4eaf92f0c68c-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/part-00000-e8e3753f-e2f6-4c9f-98f9-8f3d346727ba-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-1.2.1-only-struct-stats/part-00000-f73ff835-0571-4d67-ac43-4fbf948bfb9b-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-2.2.0-partitioned-types/_delta_log/.00000000000000000000.json.crc (100%) rename crates/{deltalake-core => 
deltalake-test}/tests/data/delta-2.2.0-partitioned-types/_delta_log/00000000000000000000.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-2.2.0-partitioned-types/c1=4/c2=c/.part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-2.2.0-partitioned-types/c1=4/c2=c/part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-2.2.0-partitioned-types/c1=5/c2=b/.part-00007-4e73fa3b-2c88-424a-8051-f8b54328ffdb.c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-2.2.0-partitioned-types/c1=5/c2=b/part-00007-4e73fa3b-2c88-424a-8051-f8b54328ffdb.c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-2.2.0-partitioned-types/c1=6/c2=a/.part-00011-10619b10-b691-4fd0-acc4-2a9608499d7c.c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-2.2.0-partitioned-types/c1=6/c2=a/part-00011-10619b10-b691-4fd0-acc4-2a9608499d7c.c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-live-table/_delta_log/00000000000000000000.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/delta-live-table/_delta_log/00000000000000000001.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/golden/data-reader-array-primitives/.part-00000-182665f0-30df-470d-a5cb-8d9d483ed390-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/golden/data-reader-array-primitives/.part-00001-2e274fe7-eb75-4b73-8c72-423ee747abc0-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/golden/data-reader-array-primitives/_delta_log/00000000000000000000.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/golden/data-reader-array-primitives/part-00000-182665f0-30df-470d-a5cb-8d9d483ed390-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/golden/data-reader-array-primitives/part-00001-2e274fe7-eb75-4b73-8c72-423ee747abc0-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/http_requests/_delta_log/00000000000000000000.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/http_requests/_delta_log/00000000000000000001.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/http_requests/date=2023-04-13/part-00000-e853fe2e-6f42-450c-8af1-4145b73a96c7-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/http_requests/date=2023-04-14/part-00000-731ab1b3-85a8-4bc3-92e5-96347fe3fd84-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/issue_1374/_delta_log/00000000000000000000.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/issue_1374/_delta_log/00000000000000000001.checkpoint.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/issue_1374/_delta_log/00000000000000000001.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/issue_1374/_delta_log/_last_checkpoint (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/issue_1374/date=2023-05-24/part-00000-e2b01fc6-a906-4008-82df-e98efdcdd47d-c000.snappy.parquet (100%) rename crates/{deltalake-core => 
deltalake-test}/tests/data/issue_1374/date=2023-05-24/part-00000-e2b01fc6-a906-4008-82df-e98efdcdd49c-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_commit/.part-00000-512e1537-8aaa-4193-b8b4-bef3de0de409-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_commit/.part-00000-b44fcdb0-8b06-4f3a-8606-f8311a96f6dc-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_commit/.part-00001-185eca06-e017-4dea-ae49-fc48b973e37e-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_commit/.part-00001-4327c977-2734-4477-9507-7ccf67924649-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_commit/_delta_log/.gitignore (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_commit/_delta_log/00000000000000000000.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_commit/part-00000-512e1537-8aaa-4193-b8b4-bef3de0de409-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_commit/part-00000-b44fcdb0-8b06-4f3a-8606-f8311a96f6dc-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_commit/part-00001-185eca06-e017-4dea-ae49-fc48b973e37e-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_commit/part-00001-4327c977-2734-4477-9507-7ccf67924649-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00000-2befed33-c358-4768-a43c-3eda0d2a499d-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00000-a72b1fb3-f2df-41fe-a8f0-e65b746382dd-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00000-a922ea3b-ffc2-4ca1-9074-a278c24c4449-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00000-c1777d7d-89d9-4790-b38a-6ee7e24456b1-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00000-f17fcbf5-e0dc-40ba-adae-ce66d1fcaef6-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00001-7891c33d-cedc-47c3-88a6-abcfb049d3b4-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00001-bb70d2ba-c196-4df2-9c85-f34969ad3aa9-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00001-c506e79a-0bf8-4e2b-a42b-9731b2e490ae-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00003-508ae4aa-801c-4c2c-a923-f6f89930a5c1-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00003-53f42606-6cda-4f13-8d07-599a21197296-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00004-315835fe-fb44-4562-98f6-5e6cfa3ae45d-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00004-80938522-09c0-420c-861f-5a649e3d9674-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00004-95c9bc2c-ac85-4581-b3cc-84502b0c314f-c000.snappy.parquet.crc (100%) 
rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00005-94a0861b-6455-4bd9-a080-73e02491c643-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00006-46f2ff20-eb5d-4dda-8498-7bfb2940713b-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00006-63ce9deb-bc0f-482d-b9a1-7e717b67f294-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00007-3a0e4727-de0d-41b6-81ef-5223cf40f025-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00007-94f725e2-3963-4b00-9e83-e31021a93cf9-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00011-42f838f9-a911-40af-98f5-2fccfa1b123f-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00045-332fe409-7705-45b1-8d34-a0018cf73b70-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00049-d3095817-de74-49c1-a888-81565a40161d-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00058-b462c4cb-0c48-4148-8475-e21d2a2935f8-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00068-90650739-6a8e-492b-9403-53e33b3778ac-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00069-c78b4dd8-f955-4643-816f-cbd30a3f8c1b-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00077-2fcb1c7c-5390-48ee-93f6-0acf11199a0d-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00107-3f6c2aa0-fc28-4f4c-be15-135e15b398f4-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00112-07fd790a-11dc-4fde-9acd-623e740be992-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00116-bc66759e-6381-4f34-8cd4-6688aad8585d-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00121-d8bc3e53-d2f2-48ce-9909-78da7294ffbd-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00128-b31c3b81-24da-4a90-a8b4-578c9e9a218d-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00140-e9b1971d-d708-43fb-b07f-975d2226b800-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00143-03ceb88e-5283-4193-aa43-993cdf937fd3-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00150-ec6643fc-4963-4871-9613-f5ad1940b689-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00154-4630673a-5227-48fb-a03d-e356fcd1564a-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00164-bf40481c-4afd-4c02-befa-90f056c2d77a-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/.part-00190-8ac0ae67-fb1d-461d-a3d3-8dc112766ff5-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => 
deltalake-test}/tests/data/simple_table/_delta_log/00000000000000000000.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/_delta_log/00000000000000000001.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/_delta_log/00000000000000000002.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/_delta_log/00000000000000000003.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/_delta_log/00000000000000000004.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00000-2befed33-c358-4768-a43c-3eda0d2a499d-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00000-a72b1fb3-f2df-41fe-a8f0-e65b746382dd-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00000-a922ea3b-ffc2-4ca1-9074-a278c24c4449-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00000-c1777d7d-89d9-4790-b38a-6ee7e24456b1-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00000-f17fcbf5-e0dc-40ba-adae-ce66d1fcaef6-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00001-7891c33d-cedc-47c3-88a6-abcfb049d3b4-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00001-bb70d2ba-c196-4df2-9c85-f34969ad3aa9-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00001-c506e79a-0bf8-4e2b-a42b-9731b2e490ae-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00003-508ae4aa-801c-4c2c-a923-f6f89930a5c1-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00003-53f42606-6cda-4f13-8d07-599a21197296-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00004-315835fe-fb44-4562-98f6-5e6cfa3ae45d-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00004-80938522-09c0-420c-861f-5a649e3d9674-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00004-95c9bc2c-ac85-4581-b3cc-84502b0c314f-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00005-94a0861b-6455-4bd9-a080-73e02491c643-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00006-46f2ff20-eb5d-4dda-8498-7bfb2940713b-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00006-63ce9deb-bc0f-482d-b9a1-7e717b67f294-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00007-3a0e4727-de0d-41b6-81ef-5223cf40f025-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00007-94f725e2-3963-4b00-9e83-e31021a93cf9-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00011-42f838f9-a911-40af-98f5-2fccfa1b123f-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00045-332fe409-7705-45b1-8d34-a0018cf73b70-c000.snappy.parquet (100%) rename crates/{deltalake-core => 
deltalake-test}/tests/data/simple_table/part-00049-d3095817-de74-49c1-a888-81565a40161d-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00058-b462c4cb-0c48-4148-8475-e21d2a2935f8-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00068-90650739-6a8e-492b-9403-53e33b3778ac-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00069-c78b4dd8-f955-4643-816f-cbd30a3f8c1b-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00077-2fcb1c7c-5390-48ee-93f6-0acf11199a0d-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00107-3f6c2aa0-fc28-4f4c-be15-135e15b398f4-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00112-07fd790a-11dc-4fde-9acd-623e740be992-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00116-bc66759e-6381-4f34-8cd4-6688aad8585d-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00121-d8bc3e53-d2f2-48ce-9909-78da7294ffbd-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00128-b31c3b81-24da-4a90-a8b4-578c9e9a218d-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00140-e9b1971d-d708-43fb-b07f-975d2226b800-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00143-03ceb88e-5283-4193-aa43-993cdf937fd3-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00150-ec6643fc-4963-4871-9613-f5ad1940b689-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00154-4630673a-5227-48fb-a03d-e356fcd1564a-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00164-bf40481c-4afd-4c02-befa-90f056c2d77a-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table/part-00190-8ac0ae67-fb1d-461d-a3d3-8dc112766ff5-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00000-2befed33-c358-4768-a43c-3eda0d2a499d-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00000-a72b1fb3-f2df-41fe-a8f0-e65b746382dd-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00000-a922ea3b-ffc2-4ca1-9074-a278c24c4449-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00000-c1777d7d-89d9-4790-b38a-6ee7e24456b1-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00000-f17fcbf5-e0dc-40ba-adae-ce66d1fcaef6-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00001-7891c33d-cedc-47c3-88a6-abcfb049d3b4-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00001-bb70d2ba-c196-4df2-9c85-f34969ad3aa9-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => 
deltalake-test}/tests/data/simple_table_features/.part-00001-c506e79a-0bf8-4e2b-a42b-9731b2e490ae-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00003-508ae4aa-801c-4c2c-a923-f6f89930a5c1-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00003-53f42606-6cda-4f13-8d07-599a21197296-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00004-315835fe-fb44-4562-98f6-5e6cfa3ae45d-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00004-80938522-09c0-420c-861f-5a649e3d9674-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00004-95c9bc2c-ac85-4581-b3cc-84502b0c314f-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00005-94a0861b-6455-4bd9-a080-73e02491c643-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00006-46f2ff20-eb5d-4dda-8498-7bfb2940713b-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00006-63ce9deb-bc0f-482d-b9a1-7e717b67f294-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00007-3a0e4727-de0d-41b6-81ef-5223cf40f025-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00007-94f725e2-3963-4b00-9e83-e31021a93cf9-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00011-42f838f9-a911-40af-98f5-2fccfa1b123f-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00045-332fe409-7705-45b1-8d34-a0018cf73b70-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00049-d3095817-de74-49c1-a888-81565a40161d-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00058-b462c4cb-0c48-4148-8475-e21d2a2935f8-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00068-90650739-6a8e-492b-9403-53e33b3778ac-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00069-c78b4dd8-f955-4643-816f-cbd30a3f8c1b-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00077-2fcb1c7c-5390-48ee-93f6-0acf11199a0d-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00107-3f6c2aa0-fc28-4f4c-be15-135e15b398f4-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00112-07fd790a-11dc-4fde-9acd-623e740be992-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00116-bc66759e-6381-4f34-8cd4-6688aad8585d-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00121-d8bc3e53-d2f2-48ce-9909-78da7294ffbd-c000.snappy.parquet.crc (100%) rename 
crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00128-b31c3b81-24da-4a90-a8b4-578c9e9a218d-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00140-e9b1971d-d708-43fb-b07f-975d2226b800-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00143-03ceb88e-5283-4193-aa43-993cdf937fd3-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00150-ec6643fc-4963-4871-9613-f5ad1940b689-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00154-4630673a-5227-48fb-a03d-e356fcd1564a-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00164-bf40481c-4afd-4c02-befa-90f056c2d77a-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/.part-00190-8ac0ae67-fb1d-461d-a3d3-8dc112766ff5-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/_delta_log/00000000000000000000.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/_delta_log/00000000000000000001.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/_delta_log/00000000000000000002.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/_delta_log/00000000000000000003.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/_delta_log/00000000000000000004.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00000-2befed33-c358-4768-a43c-3eda0d2a499d-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00000-a72b1fb3-f2df-41fe-a8f0-e65b746382dd-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00000-a922ea3b-ffc2-4ca1-9074-a278c24c4449-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00000-c1777d7d-89d9-4790-b38a-6ee7e24456b1-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00000-f17fcbf5-e0dc-40ba-adae-ce66d1fcaef6-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00001-7891c33d-cedc-47c3-88a6-abcfb049d3b4-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00001-bb70d2ba-c196-4df2-9c85-f34969ad3aa9-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00001-c506e79a-0bf8-4e2b-a42b-9731b2e490ae-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00003-508ae4aa-801c-4c2c-a923-f6f89930a5c1-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00003-53f42606-6cda-4f13-8d07-599a21197296-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00004-315835fe-fb44-4562-98f6-5e6cfa3ae45d-c000.snappy.parquet (100%) rename crates/{deltalake-core => 
deltalake-test}/tests/data/simple_table_features/part-00004-80938522-09c0-420c-861f-5a649e3d9674-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00004-95c9bc2c-ac85-4581-b3cc-84502b0c314f-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00005-94a0861b-6455-4bd9-a080-73e02491c643-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00006-46f2ff20-eb5d-4dda-8498-7bfb2940713b-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00006-63ce9deb-bc0f-482d-b9a1-7e717b67f294-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00007-3a0e4727-de0d-41b6-81ef-5223cf40f025-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00007-94f725e2-3963-4b00-9e83-e31021a93cf9-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00011-42f838f9-a911-40af-98f5-2fccfa1b123f-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00045-332fe409-7705-45b1-8d34-a0018cf73b70-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00049-d3095817-de74-49c1-a888-81565a40161d-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00058-b462c4cb-0c48-4148-8475-e21d2a2935f8-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00068-90650739-6a8e-492b-9403-53e33b3778ac-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00069-c78b4dd8-f955-4643-816f-cbd30a3f8c1b-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00077-2fcb1c7c-5390-48ee-93f6-0acf11199a0d-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00107-3f6c2aa0-fc28-4f4c-be15-135e15b398f4-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00112-07fd790a-11dc-4fde-9acd-623e740be992-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00116-bc66759e-6381-4f34-8cd4-6688aad8585d-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00121-d8bc3e53-d2f2-48ce-9909-78da7294ffbd-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00128-b31c3b81-24da-4a90-a8b4-578c9e9a218d-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00140-e9b1971d-d708-43fb-b07f-975d2226b800-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00143-03ceb88e-5283-4193-aa43-993cdf937fd3-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00150-ec6643fc-4963-4871-9613-f5ad1940b689-c000.snappy.parquet (100%) rename crates/{deltalake-core => 
deltalake-test}/tests/data/simple_table_features/part-00154-4630673a-5227-48fb-a03d-e356fcd1564a-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00164-bf40481c-4afd-4c02-befa-90f056c2d77a-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_features/part-00190-8ac0ae67-fb1d-461d-a3d3-8dc112766ff5-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_cdc/_change_data/cdc-00000-a846ce80-2eec-484d-bef7-0e63557786ca.c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_cdc/_delta_log/00000000000000000000.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_cdc/_delta_log/00000000000000000000.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_cdc/_delta_log/00000000000000000001.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_cdc/_delta_log/00000000000000000001.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_cdc/_delta_log/00000000000000000002.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_cdc/_delta_log/00000000000000000002.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_cdc/part-00000-7444aec4-710a-4a4c-8abe-3323499043e9.c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_cdc/part-00000-996384f7-3fc5-4a5f-9921-6e56269ec2c9-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/.part-00000-136c36f5-639d-4e95-bb0f-15cde3fb14eb-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/.part-00000-1abe25d3-0da6-46c5-98c1-7a69872fd797-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/.part-00000-3810fbe0-9892-431d-bcfd-7de5788dfe8d-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/.part-00000-3fa65c69-4e55-4b18-a195-5f1ae583e553-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/.part-00000-72ecc4d6-2e44-4df4-99e6-23f1ac2b7b7c-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/.part-00000-7d239c98-d74b-4b02-b3f6-9f256992c633-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/.part-00000-8e7dc8c1-337b-40b8-a411-46d4295da531-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/.part-00000-9afd9224-729f-4420-a05e-8032113a6568-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/.part-00000-e93060ad-9c8c-4170-a9da-7c6f53f6406b-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/.part-00000-e9c6df9a-e585-4c70-bc1f-de9bd8ae025b-c000.snappy.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/.part-00000-f0e955c5-a1e3-4eec-834e-dcc098fc9005-c000.snappy.parquet.crc (100%) rename 
crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/_delta_log/.00000000000000000010.checkpoint.parquet.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/_delta_log/00000000000000000000.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/_delta_log/00000000000000000001.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/_delta_log/00000000000000000002.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/_delta_log/00000000000000000003.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/_delta_log/00000000000000000004.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/_delta_log/00000000000000000005.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/_delta_log/00000000000000000006.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/_delta_log/00000000000000000007.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/_delta_log/00000000000000000008.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/_delta_log/00000000000000000009.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/_delta_log/00000000000000000010.checkpoint.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/_delta_log/00000000000000000010.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/_delta_log/_last_checkpoint (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/part-00000-136c36f5-639d-4e95-bb0f-15cde3fb14eb-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/part-00000-1abe25d3-0da6-46c5-98c1-7a69872fd797-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/part-00000-3810fbe0-9892-431d-bcfd-7de5788dfe8d-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/part-00000-3fa65c69-4e55-4b18-a195-5f1ae583e553-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/part-00000-72ecc4d6-2e44-4df4-99e6-23f1ac2b7b7c-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/part-00000-7d239c98-d74b-4b02-b3f6-9f256992c633-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/part-00000-8e7dc8c1-337b-40b8-a411-46d4295da531-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/part-00000-9afd9224-729f-4420-a05e-8032113a6568-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/part-00000-e93060ad-9c8c-4170-a9da-7c6f53f6406b-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/simple_table_with_checkpoint/part-00000-e9c6df9a-e585-4c70-bc1f-de9bd8ae025b-c000.snappy.parquet (100%) rename crates/{deltalake-core => 
deltalake-test}/tests/data/simple_table_with_checkpoint/part-00000-f0e955c5-a1e3-4eec-834e-dcc098fc9005-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table-with-dv-small/_delta_log/00000000000000000000.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table-with-dv-small/_delta_log/00000000000000000001.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table-with-dv-small/deletion_vector_61d16c75-6994-46b7-a15b-8b538852e50e.bin (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table-with-dv-small/part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table-without-dv-small/_delta_log/00000000000000000000.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table-without-dv-small/part-00000-517f5d32-9c95-48e8-82b4-0229cc194867-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_column_mapping/8v/part-00001-69b4a452-aeac-4ffa-bf5c-a0c2833d05eb.c000.zstd.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_column_mapping/BH/part-00000-4d6e745c-8e04-48d9-aa60-438228358f1a.c000.zstd.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_column_mapping/_delta_log/00000000000000000000.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_column_mapping/_delta_log/00000000000000000000.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000000.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000000.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000001.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000001.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000002.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000002.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000003.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000003.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000004.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000004.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000005.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000005.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000006.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000006.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000007.crc (100%) rename crates/{deltalake-core => 
deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000007.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000008.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000008.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000009.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000009.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000010.checkpoint.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000010.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000010.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000011.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000011.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000012.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000012.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000013.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000013.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000014.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000014.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000015.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000015.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000016.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000016.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000017.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000017.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000018.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000018.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000019.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000019.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000020.checkpoint.parquet (100%) rename crates/{deltalake-core => 
deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000020.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/00000000000000000020.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/_delta_log/_last_checkpoint (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/deletion_vector_8e4ca8be-7615-43cf-bc06-5d131148683f.bin (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/deletion_vector_a2084964-69d4-4e1e-95f5-9bbd6571d5c3.bin (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_deletion_logs/part-00000-cb251d5e-b665-437a-a9a7-fbfc5137c77d.c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_edge_timestamps/_delta_log/00000000000000000000.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_edge_timestamps/_delta_log/00000000000000000000.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_edge_timestamps/part-00000-a9dd181d-61aa-491d-b3c9-3eea548de6cb-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_edge_timestamps/part-00001-f804d355-db40-4e13-a624-ddd50ce7f5c4-c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_liquid_clustering/_delta_log/.s3-optimization-0 (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_liquid_clustering/_delta_log/.s3-optimization-1 (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_liquid_clustering/_delta_log/.s3-optimization-2 (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_liquid_clustering/_delta_log/00000000000000000000.crc (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_liquid_clustering/_delta_log/00000000000000000000.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_liquid_clustering/part-00044-22c23f7f-2411-4d88-b78c-cebe430cdd47.c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_liquid_clustering/part-00089-b466c656-9b4a-41d6-ab41-f02007d1658c.c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_liquid_clustering/part-00134-34f9b771-c60a-4bd4-bdc0-cd25fcc951c6.c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_liquid_clustering/part-00179-76f56874-b389-409b-8a2d-18462928840e.c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_liquid_clustering/part-00223-24d8cffb-245d-4027-87d6-940fcf593a60.c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_liquid_clustering/part-00268-365db28b-f856-49e6-a25f-b0211cf95d20.c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_liquid_clustering/part-00313-c528546e-c8ab-425d-b49a-5afe731aaac8.c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_liquid_clustering/part-00358-5937ec73-64a5-44dd-a793-922e30c1b9df.c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/table_with_liquid_clustering/part-00403-6af19469-0fc5-4809-b02a-ddebda3966e8.c000.snappy.parquet (100%) rename crates/{deltalake-core => 
deltalake-test}/tests/data/table_with_liquid_clustering/part-00447-1755ad02-9b47-4287-8333-92cb01a5124b.c000.snappy.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000000.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000001.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000002.checkpoint.parquet (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000002.json (100%) rename crates/{deltalake-core => deltalake-test}/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000003.json (100%) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f216830327..8ea03b8661 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -144,25 +144,3 @@ jobs: - name: Run tests with native-tls run: | cargo test --no-default-features --features integration_test,s3-native-tls,datafusion - - parquet2_test: - runs-on: ubuntu-latest - env: - RUSTFLAGS: "-C debuginfo=line-tables-only" - CARGO_INCREMENTAL: 0 - - steps: - - uses: actions/checkout@v3 - - - name: Install minimal stable with clippy and rustfmt - uses: actions-rs/toolchain@v1 - with: - profile: default - toolchain: stable - override: true - - - uses: Swatinem/rust-cache@v2 - - - name: Run tests - working-directory: crates/deltalake-core - run: cargo test --no-default-features --features=parquet2 diff --git a/Cargo.toml b/Cargo.toml index 1e9f311693..2e58e375e2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,6 +46,7 @@ serde_json = "1" # "stdlib" bytes = { version = "1" } chrono = { version = "0.4.31", default-features = false, features = ["clock"] } +log = { version = "0.4" } regex = { version = "1" } thiserror = { version = "1" } url = { version = "2" } diff --git a/crates/deltalake-aws/Cargo.toml b/crates/deltalake-aws/Cargo.toml index 8b7e0f4655..b0f102ce7c 100644 --- a/crates/deltalake-aws/Cargo.toml +++ b/crates/deltalake-aws/Cargo.toml @@ -3,33 +3,43 @@ name = "deltalake-aws" version = "0.1.0" edition = "2021" -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - [dependencies] +deltalake-core = { path = "../deltalake-core" } rusoto_core = { version = "0.47", default-features = false, optional = true } -rusoto_credential = { version = "0.47", optional = true } +rusoto_credential = { version = "0.47" } rusoto_sts = { version = "0.47", default-features = false, optional = true } rusoto_dynamodb = { version = "0.47", default-features = false, optional = true } -object_store = "0.7" +object_store = { version = "0.7.1", features = ["aws"]} lazy_static = "1" maplit = "1" +async-trait = { workspace = true } +bytes = { workspace = true } +futures = { workspace = true } +log = { workspace = true } thiserror = { workspace = true } +tokio = { workspace = true } regex = { workspace = true } +uuid = { workspace = true, features = ["serde", "v4"] } +url = { workspace = true } [dev-dependencies] +chrono = { workspace = true } +serial_test = "2" +deltalake-test = { path = "../deltalake-test" } +pretty_env_logger = "*" +rand = "0.8" +serde_json = { workspace = true } [features] +default = ["rustls"] +integration_test = [] native-tls = [ "rusoto_core/native-tls", - "rusoto_credential", "rusoto_sts/native-tls", 
"rusoto_dynamodb/native-tls", - "object_store/aws", ] rustls = [ "rusoto_core/rustls", - "rusoto_credential", "rusoto_sts/rustls", "rusoto_dynamodb/rustls", - "object_store/aws", ] diff --git a/crates/deltalake-core/tests/common/s3.rs b/crates/deltalake-aws/helpers.rs similarity index 100% rename from crates/deltalake-core/tests/common/s3.rs rename to crates/deltalake-aws/helpers.rs diff --git a/crates/deltalake-aws/src/lib.rs b/crates/deltalake-aws/src/lib.rs index 4a8d36657c..f6a2b2da31 100644 --- a/crates/deltalake-aws/src/lib.rs +++ b/crates/deltalake-aws/src/lib.rs @@ -1,16 +1,22 @@ //! Lock client implementation based on DynamoDb. pub mod errors; +pub mod logstore; +pub mod storage; use lazy_static::lazy_static; +use log::*; use regex::Regex; use std::{ collections::HashMap, str::FromStr, + sync::Arc, time::{Duration, SystemTime}, }; -use object_store::path::Path; +use deltalake_core::logstore::{logstores, LogStore, LogStoreFactory}; +use deltalake_core::storage::{factories, url_prefix_handler, ObjectStoreRef, StorageOptions}; +use deltalake_core::{DeltaResult, Path}; use rusoto_core::{HttpClient, Region, RusotoError}; use rusoto_credential::AutoRefreshingProvider; use rusoto_dynamodb::{ @@ -19,8 +25,48 @@ use rusoto_dynamodb::{ UpdateItemError, UpdateItemInput, }; use rusoto_sts::WebIdentityProvider; +use url::Url; use errors::{DynamoDbConfigError, LockClientError}; +use storage::{S3ObjectStoreFactory, S3StorageOptions}; + +#[derive(Clone, Debug, Default)] +struct S3LogStoreFactory {} + +impl LogStoreFactory for S3LogStoreFactory { + fn with_options( + &self, + store: ObjectStoreRef, + location: &Url, + options: &StorageOptions, + ) -> DeltaResult> { + let store = url_prefix_handler(store, Path::parse(location.path())?)?; + let s3_options = S3StorageOptions::from_map(&options.0); + + if s3_options.locking_provider.as_deref() != Some("dynamodb") { + debug!("S3LogStoreFactory has been asked to create a LogStore without the dynamodb locking provider"); + return Ok(deltalake_core::logstore::default_logstore( + store, location, options, + )); + } + + Ok(Arc::new(logstore::S3DynamoDbLogStore::try_new( + location.clone(), + options.clone(), + &s3_options, + store, + )?)) + } +} + +/// Register an [ObjectStoreFactory] for common S3 [Url] schemes +pub fn register_handlers(_additional_prefixes: Option) { + for scheme in ["s3", "s3a"].iter() { + let url = Url::parse(&format!("{}://", scheme)).unwrap(); + factories().insert(url.clone(), Arc::new(S3ObjectStoreFactory::default())); + logstores().insert(url.clone(), Arc::new(S3LogStoreFactory::default())); + } +} /// Representation of a log entry stored in DynamoDb /// dynamo db item consists of: @@ -62,6 +108,12 @@ pub struct DynamoDbLockClient { config: DynamoDbConfig, } +impl std::fmt::Debug for DynamoDbLockClient { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + write!(f, "DynamoDbLockClient(config: {:?})", self.config) + } +} + impl DynamoDbLockClient { /// Creates a new DynamoDbLockClient from the supplied storage options. 
pub fn try_new( @@ -514,8 +566,9 @@ fn extract_version_from_filename(name: &str) -> Option { #[cfg(test)] mod tests { - use super::*; + use object_store::memory::InMemory; + use serial_test::serial; fn commit_entry_roundtrip(c: &CommitEntry) -> Result<(), LockClientError> { let item_data: HashMap = create_value_map(c, "some_table"); @@ -547,4 +600,19 @@ mod tests { })?; Ok(()) } + + /// In cases where there is no dynamodb specified locking provider, this should get a default + /// logstore + #[test] + #[serial] + fn test_logstore_factory_default() { + let factory = S3LogStoreFactory::default(); + let store = InMemory::new(); + let url = Url::parse("s3://test-bucket").unwrap(); + std::env::remove_var(storage::s3_constants::AWS_S3_LOCKING_PROVIDER); + let logstore = factory + .with_options(Arc::new(store), &url, &StorageOptions::from(HashMap::new())) + .unwrap(); + assert_eq!(logstore.name(), "DefaultLogStore"); + } } diff --git a/crates/deltalake-core/src/logstore/s3/mod.rs b/crates/deltalake-aws/src/logstore.rs similarity index 76% rename from crates/deltalake-core/src/logstore/s3/mod.rs rename to crates/deltalake-aws/src/logstore.rs index 9e7883c7b2..295251c6ca 100644 --- a/crates/deltalake-core/src/logstore/s3/mod.rs +++ b/crates/deltalake-aws/src/logstore.rs @@ -3,22 +3,22 @@ //! when the underlying object storage does not support atomic `put_if_absent` //! or `rename_if_absent` operations, as is the case for S3. -use deltalake_aws::errors::LockClientError; -use deltalake_aws::{constants, CommitEntry, DynamoDbLockClient, UpdateLogEntryResult}; +use crate::errors::LockClientError; +use crate::storage::S3StorageOptions; +use crate::{constants, CommitEntry, DynamoDbLockClient, UpdateLogEntryResult}; use bytes::Bytes; -use object_store::path::Path; -use object_store::Error as ObjectStoreError; +use deltalake_core::{ObjectStoreError, Path}; +use log::*; use url::Url; -use crate::{ +use deltalake_core::logstore::*; +use deltalake_core::{ operations::transaction::TransactionError, - storage::{config::StorageOptions, s3::S3StorageOptions, ObjectStoreRef}, + storage::{ObjectStoreRef, StorageOptions}, DeltaResult, DeltaTableError, }; -use super::{LogStore, LogStoreConfig}; - const STORE_NAME: &str = "DeltaS3ObjectStore"; const MAX_REPAIR_RETRIES: i64 = 3; @@ -30,6 +30,12 @@ pub struct S3DynamoDbLogStore { table_path: String, } +impl std::fmt::Debug for S3DynamoDbLogStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + write!(f, "S3DynamoDbLogStore({})", self.table_path) + } +} + impl S3DynamoDbLogStore { /// Create log store pub fn try_new( @@ -45,12 +51,13 @@ impl S3DynamoDbLogStore { s3_options.use_web_identity, ) .map_err(|err| DeltaTableError::ObjectStore { - source: object_store::Error::Generic { + source: ObjectStoreError::Generic { store: STORE_NAME, source: err.into(), }, })?; - let table_path = super::to_uri(&location, &Path::from("")); + debug!("S3DynamoDbLogStore configured with lock client: {lock_client:?}"); + let table_path = to_uri(&location, &Path::from("")); Ok(Self { storage: object_store, lock_client, @@ -73,22 +80,22 @@ impl S3DynamoDbLogStore { return Ok(RepairLogEntryResult::AlreadyCompleted); } for retry in 0..=MAX_REPAIR_RETRIES { - match super::write_commit_entry(self.storage.as_ref(), entry.version, &entry.temp_path) - .await - { + match write_commit_entry(&self.storage, entry.version, &entry.temp_path).await { Ok(()) => { + debug!("Successfully committed entry for version {}", entry.version); return self.try_complete_entry(entry, 
true).await; } // `N.json` has already been moved, complete the entry in DynamoDb just in case Err(TransactionError::ObjectStore { source: ObjectStoreError::NotFound { .. }, }) => { + warn!("It looks like the {}.json has already been moved, we got 404 from ObjectStorage.", entry.version); return self.try_complete_entry(entry, false).await; } Err(err) if retry == MAX_REPAIR_RETRIES => return Err(err), - Err(err) => log::debug!( - "retry #{retry} on log entry {entry:?} failed to move commit: '{err}'" - ), + Err(err) => { + debug!("retry #{retry} on log entry {entry:?} failed to move commit: '{err}'") + } } } unreachable!("for loop yields Ok or Err in body when retry = MAX_REPAIR_RETRIES") @@ -100,6 +107,7 @@ impl S3DynamoDbLogStore { entry: &CommitEntry, copy_performed: bool, ) -> Result { + debug!("try_complete_entry for {:?}, {}", entry, copy_performed); for retry in 0..=MAX_REPAIR_RETRIES { match self .lock_client @@ -114,7 +122,7 @@ impl S3DynamoDbLogStore { }) { Ok(x) => return Ok(Self::map_retry_result(x, copy_performed)), Err(err) if retry == MAX_REPAIR_RETRIES => return Err(err), - Err(err) => log::debug!( + Err(err) => error!( "retry #{retry} on log entry {entry:?} failed to update lock db: '{err}'" ), } @@ -141,6 +149,10 @@ impl S3DynamoDbLogStore { #[async_trait::async_trait] impl LogStore for S3DynamoDbLogStore { + fn name(&self) -> String { + "S3DynamoDbLogStore".into() + } + fn root_uri(&self) -> String { self.table_path.clone() } @@ -153,7 +165,7 @@ impl LogStore for S3DynamoDbLogStore { if let Ok(Some(entry)) = entry { self.repair_entry(&entry).await?; } - super::read_commit_entry(self.storage.as_ref(), version).await + read_commit_entry(&self.storage, version).await } /// Tries to commit a prepared commit file. Returns [DeltaTableError::VersionAlreadyExists] @@ -167,26 +179,34 @@ impl LogStore for S3DynamoDbLogStore { tmp_commit: &Path, ) -> Result<(), TransactionError> { let entry = CommitEntry::new(version, tmp_commit.clone()); + debug!("Writing commit entry for {self:?}: {entry:?}"); // create log entry in dynamo db: complete = false, no expireTime self.lock_client .put_commit_entry(&self.table_path, &entry) .await .map_err(|err| match err { LockClientError::VersionAlreadyExists { version, .. 
} => { + warn!("LockClientError::VersionAlreadyExists({version})"); TransactionError::VersionAlreadyExists(version) } - LockClientError::ProvisionedThroughputExceeded => todo!(), - LockClientError::LockTableNotFound => TransactionError::LogStoreError { - msg: format!( - "lock table '{}' not found", - self.lock_client.get_lock_table_name() - ), - source: Box::new(err), - }, - err => TransactionError::LogStoreError { - msg: "dynamodb client failed to write log entry".to_owned(), - source: Box::new(err), - }, + LockClientError::ProvisionedThroughputExceeded => todo!( + "deltalake-aws does not yet handle DynamoDB provisioned throughput errors" + ), + LockClientError::LockTableNotFound => { + let table_name = self.lock_client.get_lock_table_name(); + error!("Lock table '{table_name}' not found"); + TransactionError::LogStoreError { + msg: format!("lock table '{table_name}' not found"), + source: Box::new(err), + } + } + err => { + error!("dynamodb client failed to write log entry: {err:?}"); + TransactionError::LogStoreError { + msg: "dynamodb client failed to write log entry".to_owned(), + source: Box::new(err), + } + } })?; // `repair_entry` performs the exact steps required to finalize the commit, but contains // retry logic and more robust error handling under the assumption that any other client @@ -198,6 +218,7 @@ impl LogStore for S3DynamoDbLogStore { } async fn get_latest_version(&self, current_version: i64) -> DeltaResult { + debug!("Retrieving latest version of {self:?} at v{current_version}"); let entry = self .lock_client .get_latest_entry(&self.table_path) @@ -210,7 +231,7 @@ impl LogStore for S3DynamoDbLogStore { self.repair_entry(&entry).await?; Ok(entry.version) } else { - super::get_latest_version(self, current_version).await + get_latest_version(self, current_version).await } } @@ -218,15 +239,6 @@ impl LogStore for S3DynamoDbLogStore { self.storage.clone() } - fn to_uri(&self, location: &Path) -> String { - super::to_uri(&self.config.location, location) - } - - #[cfg(feature = "datafusion")] - fn object_store_url(&self) -> datafusion::execution::object_store::ObjectStoreUrl { - super::object_store_url(&self.config.location) - } - fn config(&self) -> &LogStoreConfig { &self.config } diff --git a/crates/deltalake-aws/src/storage.rs b/crates/deltalake-aws/src/storage.rs new file mode 100644 index 0000000000..97786e9736 --- /dev/null +++ b/crates/deltalake-aws/src/storage.rs @@ -0,0 +1,597 @@ +//! AWS S3 storage backend. 
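The `register_handlers` function introduced in `src/lib.rs` above and the `S3ObjectStoreFactory` defined in this new module are what let delta-rs resolve `s3://` URLs end to end. Here is a minimal usage sketch, assuming the existing `deltalake::open_table` entry point, the `tokio` runtime already used by the workspace, and a hypothetical bucket path:

```rust
// Minimal sketch: open a Delta table on S3 once the AWS handlers are registered.
// The bucket and table path below are placeholders.
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Registers object store and log store factories for the `s3://` and `s3a://` schemes.
    deltalake_aws::register_handlers(None);

    // Credentials and other S3 options are picked up from the environment by `with_env_s3`.
    let table = deltalake::open_table("s3://my-bucket/my-delta-table").await?;
    println!("table version: {}", table.version());
    Ok(())
}
```

With `AWS_S3_LOCKING_PROVIDER=dynamodb` set, the registered `S3LogStoreFactory` hands back the DynamoDB-backed log store for safe concurrent commits; otherwise the default log store is used.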
+
+use bytes::Bytes;
+use deltalake_core::storage::object_store::{
+    aws::AmazonS3ConfigKey, parse_url_opts, GetOptions, GetResult, ListResult, MultipartId,
+    ObjectMeta, ObjectStore, Result as ObjectStoreResult,
+};
+use deltalake_core::storage::{str_is_truthy, ObjectStoreFactory, ObjectStoreRef, StorageOptions};
+use deltalake_core::{DeltaResult, ObjectStoreError, Path};
+use futures::stream::BoxStream;
+use rusoto_core::Region;
+use std::collections::HashMap;
+use std::fmt::Debug;
+use std::ops::Range;
+use std::str::FromStr;
+use std::sync::Arc;
+use std::time::Duration;
+use tokio::io::AsyncWrite;
+use url::Url;
+
+const STORE_NAME: &str = "DeltaS3ObjectStore";
+
+#[derive(Clone, Default, Debug)]
+pub struct S3ObjectStoreFactory {}
+
+impl S3ObjectStoreFactory {
+    fn with_env_s3(&self, options: &StorageOptions) -> StorageOptions {
+        let mut options = options.clone();
+        for (os_key, os_value) in std::env::vars_os() {
+            if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) {
+                if let Ok(config_key) = AmazonS3ConfigKey::from_str(&key.to_ascii_lowercase()) {
+                    if !options.0.contains_key(config_key.as_ref()) {
+                        options
+                            .0
+                            .insert(config_key.as_ref().to_string(), value.to_string());
+                    }
+                }
+            }
+        }
+        options
+    }
+}
+
+impl ObjectStoreFactory for S3ObjectStoreFactory {
+    fn parse_url_opts(
+        &self,
+        url: &Url,
+        options: &StorageOptions,
+    ) -> DeltaResult<(ObjectStoreRef, Path)> {
+        let options = self.with_env_s3(options);
+        let (store, prefix) = parse_url_opts(
+            url,
+            options.0.iter().filter_map(|(key, value)| {
+                let s3_key = AmazonS3ConfigKey::from_str(&key.to_ascii_lowercase()).ok()?;
+                Some((s3_key, value.clone()))
+            }),
+        )?;
+
+        let options = S3StorageOptions::from_map(&options.0);
+        let store = S3StorageBackend::try_new(
+            store.into(),
+            Some("dynamodb") == options.locking_provider.as_deref() || options.allow_unsafe_rename,
+        )?;
+
+        Ok((Arc::new(store), prefix))
+    }
+}
+
+/// Options used to configure the [S3StorageBackend].
+///
+/// Available options are described in [s3_constants].
+#[derive(Clone, Debug, PartialEq, Eq)]
+#[allow(missing_docs)]
+pub struct S3StorageOptions {
+    pub endpoint_url: Option<String>,
+    pub region: Region,
+    pub profile: Option<String>,
+    pub aws_access_key_id: Option<String>,
+    pub aws_secret_access_key: Option<String>,
+    pub aws_session_token: Option<String>,
+    pub virtual_hosted_style_request: bool,
+    pub locking_provider: Option<String>,
+    pub assume_role_arn: Option<String>,
+    pub assume_role_session_name: Option<String>,
+    pub use_web_identity: bool,
+    pub s3_pool_idle_timeout: Duration,
+    pub sts_pool_idle_timeout: Duration,
+    pub s3_get_internal_server_error_retries: usize,
+    pub allow_unsafe_rename: bool,
+    pub extra_opts: HashMap<String, String>,
+}
+
+impl S3StorageOptions {
+    /// Creates an instance of S3StorageOptions from the given HashMap.
+    pub fn from_map(options: &HashMap<String, String>) -> S3StorageOptions {
+        let extra_opts = options
+            .iter()
+            .filter(|(k, _)| !s3_constants::S3_OPTS.contains(&k.as_str()))
+            .map(|(k, v)| (k.to_owned(), v.to_owned()))
+            .collect();
+
+        // Copy web identity values provided in options but not the environment into the environment
+        // to get picked up by the `from_k8s_env` call in `get_web_identity_provider`.
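+        //
+        // Illustrative example: an options map such as {"AWS_S3_LOCKING_PROVIDER": "dynamodb"}
+        // produces an `S3StorageOptions` whose `locking_provider` is `Some("dynamodb")`, while
+        // any key that is not listed in `s3_constants::S3_OPTS` is carried through unmodified
+        // in `extra_opts`.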
+ Self::ensure_env_var(options, s3_constants::AWS_REGION); + Self::ensure_env_var(options, s3_constants::AWS_PROFILE); + Self::ensure_env_var(options, s3_constants::AWS_ACCESS_KEY_ID); + Self::ensure_env_var(options, s3_constants::AWS_SECRET_ACCESS_KEY); + Self::ensure_env_var(options, s3_constants::AWS_SESSION_TOKEN); + Self::ensure_env_var(options, s3_constants::AWS_WEB_IDENTITY_TOKEN_FILE); + Self::ensure_env_var(options, s3_constants::AWS_ROLE_ARN); + Self::ensure_env_var(options, s3_constants::AWS_ROLE_SESSION_NAME); + + let endpoint_url = str_option(options, s3_constants::AWS_ENDPOINT_URL); + let region = if let Some(endpoint_url) = endpoint_url.as_ref() { + Region::Custom { + name: Self::str_or_default(options, s3_constants::AWS_REGION, "custom".to_string()), + endpoint: endpoint_url.to_owned(), + } + } else { + Region::default() + }; + let profile = str_option(options, s3_constants::AWS_PROFILE); + + let s3_pool_idle_timeout = + Self::u64_or_default(options, s3_constants::AWS_S3_POOL_IDLE_TIMEOUT_SECONDS, 15); + let sts_pool_idle_timeout = + Self::u64_or_default(options, s3_constants::AWS_STS_POOL_IDLE_TIMEOUT_SECONDS, 10); + + let s3_get_internal_server_error_retries = Self::u64_or_default( + options, + s3_constants::AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES, + 10, + ) as usize; + + let virtual_hosted_style_request: bool = + str_option(options, s3_constants::AWS_S3_ADDRESSING_STYLE) + .map(|addressing_style| addressing_style == "virtual") + .unwrap_or(false); + + let allow_unsafe_rename = str_option(options, s3_constants::AWS_S3_ALLOW_UNSAFE_RENAME) + .map(|val| str_is_truthy(&val)) + .unwrap_or(false); + + Self { + endpoint_url, + region, + profile, + aws_access_key_id: str_option(options, s3_constants::AWS_ACCESS_KEY_ID), + aws_secret_access_key: str_option(options, s3_constants::AWS_SECRET_ACCESS_KEY), + aws_session_token: str_option(options, s3_constants::AWS_SESSION_TOKEN), + virtual_hosted_style_request, + locking_provider: str_option(options, s3_constants::AWS_S3_LOCKING_PROVIDER), + assume_role_arn: str_option(options, s3_constants::AWS_S3_ASSUME_ROLE_ARN), + assume_role_session_name: str_option(options, s3_constants::AWS_S3_ROLE_SESSION_NAME), + use_web_identity: std::env::var(s3_constants::AWS_WEB_IDENTITY_TOKEN_FILE).is_ok(), + s3_pool_idle_timeout: Duration::from_secs(s3_pool_idle_timeout), + sts_pool_idle_timeout: Duration::from_secs(sts_pool_idle_timeout), + s3_get_internal_server_error_retries, + allow_unsafe_rename, + extra_opts, + } + } + + fn str_or_default(map: &HashMap, key: &str, default: String) -> String { + map.get(key) + .map(|v| v.to_owned()) + .unwrap_or_else(|| std::env::var(key).unwrap_or(default)) + } + + fn u64_or_default(map: &HashMap, key: &str, default: u64) -> u64 { + str_option(map, key) + .and_then(|v| v.parse().ok()) + .unwrap_or(default) + } + + fn ensure_env_var(map: &HashMap, key: &str) { + if let Some(val) = str_option(map, key) { + std::env::set_var(key, val); + } + } +} + +impl Default for S3StorageOptions { + /// Creates an instance of S3StorageOptions from environment variables. 
+    fn default() -> S3StorageOptions {
+        Self::from_map(&HashMap::new())
+    }
+}
+
+/// An S3 implementation of the [ObjectStore] trait
+pub struct S3StorageBackend {
+    inner: ObjectStoreRef,
+    /// Whether rename_if_not_exists may be performed as a plain (unsafe) rename
+    allow_unsafe_rename: bool,
+}
+
+impl std::fmt::Display for S3StorageBackend {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "S3StorageBackend")
+    }
+}
+
+impl S3StorageBackend {
+    /// Creates a new S3StorageBackend.
+    ///
+    /// Options are described in [s3_constants].
+    pub fn try_new(storage: ObjectStoreRef, allow_unsafe_rename: bool) -> ObjectStoreResult<Self> {
+        Ok(Self {
+            inner: storage,
+            allow_unsafe_rename,
+        })
+    }
+}
+
+impl std::fmt::Debug for S3StorageBackend {
+    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
+        write!(fmt, "S3StorageBackend")
+    }
+}
+
+#[async_trait::async_trait]
+impl ObjectStore for S3StorageBackend {
+    async fn put(&self, location: &Path, bytes: Bytes) -> ObjectStoreResult<()> {
+        self.inner.put(location, bytes).await
+    }
+
+    async fn get(&self, location: &Path) -> ObjectStoreResult<GetResult> {
+        self.inner.get(location).await
+    }
+
+    async fn get_opts(&self, location: &Path, options: GetOptions) -> ObjectStoreResult<GetResult> {
+        self.inner.get_opts(location, options).await
+    }
+
+    async fn get_range(&self, location: &Path, range: Range<usize>) -> ObjectStoreResult<Bytes> {
+        self.inner.get_range(location, range).await
+    }
+
+    async fn head(&self, location: &Path) -> ObjectStoreResult<ObjectMeta> {
+        self.inner.head(location).await
+    }
+
+    async fn delete(&self, location: &Path) -> ObjectStoreResult<()> {
+        self.inner.delete(location).await
+    }
+
+    async fn list(
+        &self,
+        prefix: Option<&Path>,
+    ) -> ObjectStoreResult<BoxStream<'_, ObjectStoreResult<ObjectMeta>>> {
+        self.inner.list(prefix).await
+    }
+
+    async fn list_with_offset(
+        &self,
+        prefix: Option<&Path>,
+        offset: &Path,
+    ) -> ObjectStoreResult<BoxStream<'_, ObjectStoreResult<ObjectMeta>>> {
+        self.inner.list_with_offset(prefix, offset).await
+    }
+
+    async fn list_with_delimiter(&self, prefix: Option<&Path>) -> ObjectStoreResult<ListResult> {
+        self.inner.list_with_delimiter(prefix).await
+    }
+
+    async fn copy(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> {
+        self.inner.copy(from, to).await
+    }
+
+    async fn copy_if_not_exists(&self, _from: &Path, _to: &Path) -> ObjectStoreResult<()> {
+        todo!()
+    }
+
+    async fn rename_if_not_exists(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> {
+        if self.allow_unsafe_rename {
+            self.inner.rename(from, to).await
+        } else {
+            Err(ObjectStoreError::Generic {
+                store: STORE_NAME,
+                source: Box::new(crate::errors::LockClientError::LockClientRequired),
+            })
+        }
+    }
+
+    async fn put_multipart(
+        &self,
+        location: &Path,
+    ) -> ObjectStoreResult<(MultipartId, Box<dyn AsyncWrite + Unpin + Send>)> {
+        self.inner.put_multipart(location).await
+    }
+
+    async fn abort_multipart(
+        &self,
+        location: &Path,
+        multipart_id: &MultipartId,
+    ) -> ObjectStoreResult<()> {
+        self.inner.abort_multipart(location, multipart_id).await
+    }
+}
+
+/// Storage option keys to use when creating [crate::storage::s3::S3StorageOptions].
+/// The same key should be used whether passing a key in the hashmap or setting it as an environment variable.
+/// Provided keys may include configuration for the S3 backend and also the optional DynamoDb lock used for atomic rename.
+pub mod s3_constants {
+    /// Custom S3 endpoint.
+    pub const AWS_ENDPOINT_URL: &str = "AWS_ENDPOINT_URL";
+    /// The AWS region.
+    pub const AWS_REGION: &str = "AWS_REGION";
+    /// The AWS profile.
+    pub const AWS_PROFILE: &str = "AWS_PROFILE";
+    /// The AWS_ACCESS_KEY_ID to use for S3.
+    pub const AWS_ACCESS_KEY_ID: &str = "AWS_ACCESS_KEY_ID";
+    /// The AWS_SECRET_ACCESS_KEY to use for S3.
+    pub const AWS_SECRET_ACCESS_KEY: &str = "AWS_SECRET_ACCESS_KEY";
+    /// The AWS_SESSION_TOKEN to use for S3.
+    pub const AWS_SESSION_TOKEN: &str = "AWS_SESSION_TOKEN";
+    /// Uses either "path" (the default) or "virtual", which turns on
+    /// [virtual host addressing](http://docs.aws.amazon.com/AmazonS3/latest/dev/VirtualHosting.html).
+    pub const AWS_S3_ADDRESSING_STYLE: &str = "AWS_S3_ADDRESSING_STYLE";
+    /// Locking provider to use for safe atomic rename.
+    /// `dynamodb` is currently the only supported locking provider.
+    /// If not set, safe atomic rename is not available.
+    pub const AWS_S3_LOCKING_PROVIDER: &str = "AWS_S3_LOCKING_PROVIDER";
+    /// The role to assume for S3 writes.
+    pub const AWS_S3_ASSUME_ROLE_ARN: &str = "AWS_S3_ASSUME_ROLE_ARN";
+    /// The role session name to use when a role is assumed. If not provided a random session name is generated.
+    pub const AWS_S3_ROLE_SESSION_NAME: &str = "AWS_S3_ROLE_SESSION_NAME";
+    /// The `pool_idle_timeout` option of the AWS HTTP client. Has to be lower than 20 seconds,
+    /// which is the default S3 server timeout.
+    /// However, since rusoto uses hyper as a client, its default timeout is 90 seconds.
+    /// Hence, a `connection closed before message completed` error could occur.
+    /// To avoid that, the default value of this setting is 15 seconds if it's not set otherwise.
+    pub const AWS_S3_POOL_IDLE_TIMEOUT_SECONDS: &str = "AWS_S3_POOL_IDLE_TIMEOUT_SECONDS";
+    /// The `pool_idle_timeout` for the AWS STS client. See
+    /// the reasoning in `AWS_S3_POOL_IDLE_TIMEOUT_SECONDS`.
+    pub const AWS_STS_POOL_IDLE_TIMEOUT_SECONDS: &str = "AWS_STS_POOL_IDLE_TIMEOUT_SECONDS";
+    /// The number of retries for S3 GET requests that failed with 500 Internal Server Error.
+    pub const AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES: &str =
+        "AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES";
+    /// The web identity token file to use when using a web identity provider.
+    /// NOTE: web identity related options are set in the environment when
+    /// creating an instance of [crate::storage::s3::S3StorageOptions].
+    /// See also .
+    pub const AWS_WEB_IDENTITY_TOKEN_FILE: &str = "AWS_WEB_IDENTITY_TOKEN_FILE";
+    /// The role name to use for web identity.
+    /// NOTE: web identity related options are set in the environment when
+    /// creating an instance of [crate::storage::s3::S3StorageOptions].
+    /// See also .
+    pub const AWS_ROLE_ARN: &str = "AWS_ROLE_ARN";
+    /// The role session name to use for web identity.
+    /// NOTE: web identity related options are set in the environment when
+    /// creating an instance of [crate::storage::s3::S3StorageOptions].
+    /// See also .
+    pub const AWS_ROLE_SESSION_NAME: &str = "AWS_ROLE_SESSION_NAME";
+    /// Allow http connections - mainly useful for integration tests
+    pub const AWS_ALLOW_HTTP: &str = "AWS_ALLOW_HTTP";
+
+    /// If set to "true", allows creating commits without concurrent writer protection.
+    /// Only safe if there is one writer to a given table.
+    pub const AWS_S3_ALLOW_UNSAFE_RENAME: &str = "AWS_S3_ALLOW_UNSAFE_RENAME";
+
+    /// The list of option keys owned by the S3 module.
+    /// Option keys not contained in this list will be added to the `extra_opts`
+    /// field of [crate::storage::s3::S3StorageOptions].
+ pub const S3_OPTS: &[&str] = &[ + AWS_ENDPOINT_URL, + AWS_REGION, + AWS_PROFILE, + AWS_ACCESS_KEY_ID, + AWS_SECRET_ACCESS_KEY, + AWS_SESSION_TOKEN, + AWS_S3_LOCKING_PROVIDER, + AWS_S3_ASSUME_ROLE_ARN, + AWS_S3_ROLE_SESSION_NAME, + AWS_WEB_IDENTITY_TOKEN_FILE, + AWS_ROLE_ARN, + AWS_ROLE_SESSION_NAME, + AWS_S3_POOL_IDLE_TIMEOUT_SECONDS, + AWS_STS_POOL_IDLE_TIMEOUT_SECONDS, + AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES, + ]; +} + +pub(crate) fn str_option(map: &HashMap, key: &str) -> Option { + map.get(key) + .map_or_else(|| std::env::var(key).ok(), |v| Some(v.to_owned())) +} + +#[cfg(test)] +mod tests { + use super::*; + + use maplit::hashmap; + use serial_test::serial; + + #[test] + #[serial] + fn storage_options_default_test() { + std::env::set_var(s3_constants::AWS_ENDPOINT_URL, "http://localhost"); + std::env::set_var(s3_constants::AWS_REGION, "us-west-1"); + std::env::set_var(s3_constants::AWS_PROFILE, "default"); + std::env::set_var(s3_constants::AWS_ACCESS_KEY_ID, "default_key_id"); + std::env::set_var(s3_constants::AWS_SECRET_ACCESS_KEY, "default_secret_key"); + std::env::set_var(s3_constants::AWS_S3_LOCKING_PROVIDER, "dynamodb"); + std::env::set_var( + s3_constants::AWS_S3_ASSUME_ROLE_ARN, + "arn:aws:iam::123456789012:role/some_role", + ); + std::env::set_var(s3_constants::AWS_S3_ROLE_SESSION_NAME, "session_name"); + std::env::set_var(s3_constants::AWS_WEB_IDENTITY_TOKEN_FILE, "token_file"); + std::env::remove_var(s3_constants::AWS_S3_POOL_IDLE_TIMEOUT_SECONDS); + std::env::remove_var(s3_constants::AWS_STS_POOL_IDLE_TIMEOUT_SECONDS); + std::env::remove_var(s3_constants::AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES); + + let options = S3StorageOptions::default(); + + assert_eq!( + S3StorageOptions { + endpoint_url: Some("http://localhost".to_string()), + region: Region::Custom { + name: "us-west-1".to_string(), + endpoint: "http://localhost".to_string() + }, + profile: Some("default".to_string()), + aws_access_key_id: Some("default_key_id".to_string()), + aws_secret_access_key: Some("default_secret_key".to_string()), + aws_session_token: None, + virtual_hosted_style_request: false, + assume_role_arn: Some("arn:aws:iam::123456789012:role/some_role".to_string()), + assume_role_session_name: Some("session_name".to_string()), + use_web_identity: true, + locking_provider: Some("dynamodb".to_string()), + s3_pool_idle_timeout: Duration::from_secs(15), + sts_pool_idle_timeout: Duration::from_secs(10), + s3_get_internal_server_error_retries: 10, + extra_opts: HashMap::new(), + allow_unsafe_rename: false, + }, + options + ); + } + + #[test] + #[serial] + fn storage_options_with_only_region_and_credentials() { + std::env::remove_var(s3_constants::AWS_ENDPOINT_URL); + let options = S3StorageOptions::from_map(&hashmap! { + s3_constants::AWS_REGION.to_string() => "eu-west-1".to_string(), + s3_constants::AWS_ACCESS_KEY_ID.to_string() => "test".to_string(), + s3_constants::AWS_SECRET_ACCESS_KEY.to_string() => "test_secret".to_string(), + }); + + assert_eq!( + S3StorageOptions { + endpoint_url: None, + region: Region::default(), + aws_access_key_id: Some("test".to_string()), + aws_secret_access_key: Some("test_secret".to_string()), + ..Default::default() + }, + options + ); + } + + #[test] + #[serial] + fn storage_options_from_map_test() { + let options = S3StorageOptions::from_map(&hashmap! 
{ + s3_constants::AWS_ENDPOINT_URL.to_string() => "http://localhost:1234".to_string(), + s3_constants::AWS_REGION.to_string() => "us-west-2".to_string(), + s3_constants::AWS_PROFILE.to_string() => "default".to_string(), + s3_constants::AWS_S3_ADDRESSING_STYLE.to_string() => "virtual".to_string(), + s3_constants::AWS_S3_LOCKING_PROVIDER.to_string() => "another_locking_provider".to_string(), + s3_constants::AWS_S3_ASSUME_ROLE_ARN.to_string() => "arn:aws:iam::123456789012:role/another_role".to_string(), + s3_constants::AWS_S3_ROLE_SESSION_NAME.to_string() => "another_session_name".to_string(), + s3_constants::AWS_WEB_IDENTITY_TOKEN_FILE.to_string() => "another_token_file".to_string(), + s3_constants::AWS_S3_POOL_IDLE_TIMEOUT_SECONDS.to_string() => "1".to_string(), + s3_constants::AWS_STS_POOL_IDLE_TIMEOUT_SECONDS.to_string() => "2".to_string(), + s3_constants::AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES.to_string() => "3".to_string(), + s3_constants::AWS_ACCESS_KEY_ID.to_string() => "test_id".to_string(), + s3_constants::AWS_SECRET_ACCESS_KEY.to_string() => "test_secret".to_string(), + }); + + assert_eq!( + S3StorageOptions { + endpoint_url: Some("http://localhost:1234".to_string()), + region: Region::Custom { + name: "us-west-2".to_string(), + endpoint: "http://localhost:1234".to_string() + }, + profile: Some("default".to_string()), + aws_access_key_id: Some("test_id".to_string()), + aws_secret_access_key: Some("test_secret".to_string()), + aws_session_token: None, + virtual_hosted_style_request: true, + assume_role_arn: Some("arn:aws:iam::123456789012:role/another_role".to_string()), + assume_role_session_name: Some("another_session_name".to_string()), + use_web_identity: true, + locking_provider: Some("another_locking_provider".to_string()), + s3_pool_idle_timeout: Duration::from_secs(1), + sts_pool_idle_timeout: Duration::from_secs(2), + s3_get_internal_server_error_retries: 3, + extra_opts: hashmap! { + s3_constants::AWS_S3_ADDRESSING_STYLE.to_string() => "virtual".to_string() + }, + allow_unsafe_rename: false, + }, + options + ); + } + + #[test] + #[serial] + fn storage_options_mixed_test() { + std::env::set_var(s3_constants::AWS_ENDPOINT_URL, "http://localhost"); + std::env::set_var(s3_constants::AWS_REGION, "us-west-1"); + std::env::set_var(s3_constants::AWS_PROFILE, "default"); + std::env::set_var(s3_constants::AWS_ACCESS_KEY_ID, "wrong_key_id"); + std::env::set_var(s3_constants::AWS_SECRET_ACCESS_KEY, "wrong_secret_key"); + std::env::set_var(s3_constants::AWS_S3_LOCKING_PROVIDER, "dynamodb"); + std::env::set_var( + s3_constants::AWS_S3_ASSUME_ROLE_ARN, + "arn:aws:iam::123456789012:role/some_role", + ); + std::env::set_var(s3_constants::AWS_S3_ROLE_SESSION_NAME, "session_name"); + std::env::set_var(s3_constants::AWS_WEB_IDENTITY_TOKEN_FILE, "token_file"); + + std::env::set_var(s3_constants::AWS_S3_POOL_IDLE_TIMEOUT_SECONDS, "1"); + std::env::set_var(s3_constants::AWS_STS_POOL_IDLE_TIMEOUT_SECONDS, "2"); + std::env::set_var(s3_constants::AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES, "3"); + let options = S3StorageOptions::from_map(&hashmap! 
{ + s3_constants::AWS_ACCESS_KEY_ID.to_string() => "test_id_mixed".to_string(), + s3_constants::AWS_SECRET_ACCESS_KEY.to_string() => "test_secret_mixed".to_string(), + s3_constants::AWS_REGION.to_string() => "us-west-2".to_string(), + "DYNAMO_LOCK_PARTITION_KEY_VALUE".to_string() => "my_lock".to_string(), + "AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES".to_string() => "3".to_string(), + }); + + assert_eq!( + S3StorageOptions { + endpoint_url: Some("http://localhost".to_string()), + region: Region::Custom { + name: "us-west-2".to_string(), + endpoint: "http://localhost".to_string() + }, + profile: Some("default".to_string()), + aws_access_key_id: Some("test_id_mixed".to_string()), + aws_secret_access_key: Some("test_secret_mixed".to_string()), + aws_session_token: None, + virtual_hosted_style_request: false, + assume_role_arn: Some("arn:aws:iam::123456789012:role/some_role".to_string()), + assume_role_session_name: Some("session_name".to_string()), + use_web_identity: true, + locking_provider: Some("dynamodb".to_string()), + s3_pool_idle_timeout: Duration::from_secs(1), + sts_pool_idle_timeout: Duration::from_secs(2), + s3_get_internal_server_error_retries: 3, + extra_opts: hashmap! { + "DYNAMO_LOCK_PARTITION_KEY_VALUE".to_string() => "my_lock".to_string(), + }, + allow_unsafe_rename: false, + }, + options + ); + } + #[test] + #[serial] + fn storage_options_web_identity_test() { + let _options = S3StorageOptions::from_map(&hashmap! { + s3_constants::AWS_REGION.to_string() => "eu-west-1".to_string(), + s3_constants::AWS_WEB_IDENTITY_TOKEN_FILE.to_string() => "web_identity_token_file".to_string(), + s3_constants::AWS_ROLE_ARN.to_string() => "arn:aws:iam::123456789012:role/web_identity_role".to_string(), + s3_constants::AWS_ROLE_SESSION_NAME.to_string() => "web_identity_session_name".to_string(), + }); + + assert_eq!( + "eu-west-1", + std::env::var(s3_constants::AWS_REGION).unwrap() + ); + + assert_eq!( + "web_identity_token_file", + std::env::var(s3_constants::AWS_WEB_IDENTITY_TOKEN_FILE).unwrap() + ); + + assert_eq!( + "arn:aws:iam::123456789012:role/web_identity_role", + std::env::var(s3_constants::AWS_ROLE_ARN).unwrap() + ); + + assert_eq!( + "web_identity_session_name", + std::env::var(s3_constants::AWS_ROLE_SESSION_NAME).unwrap() + ); + } +} diff --git a/crates/deltalake-aws/tests/common.rs b/crates/deltalake-aws/tests/common.rs new file mode 100644 index 0000000000..764c861c92 --- /dev/null +++ b/crates/deltalake-aws/tests/common.rs @@ -0,0 +1,173 @@ +use chrono::Utc; +use deltalake_aws::register_handlers; +use deltalake_aws::storage::*; +use deltalake_test::utils::*; +use rand::Rng; +use std::process::{Command, ExitStatus, Stdio}; + +#[derive(Clone, Debug)] +pub struct S3Integration { + bucket_name: String, +} + +impl Default for S3Integration { + fn default() -> Self { + register_handlers(None); + Self { + bucket_name: format!("test-delta-table-{}", Utc::now().timestamp()), + } + } +} + +impl StorageIntegration for S3Integration { + /// Create a new bucket + fn create_bucket(&self) -> std::io::Result { + set_env_if_not_set( + "DYNAMO_LOCK_PARTITION_KEY_VALUE", + format!("s3://{}", self.bucket_name()), + ); + Self::create_lock_table()?; + let mut child = Command::new("aws") + .args(["s3", "mb", &self.root_uri()]) + .spawn() + .expect("aws command is installed"); + child.wait() + } + + fn bucket_name(&self) -> String { + self.bucket_name.clone() + } + + fn root_uri(&self) -> String { + format!("s3://{}", &self.bucket_name()) + } + + /// prepare_env + fn prepare_env(&self) { + 
std::env::set_var(
+            "DELTA_DYNAMO_TABLE_NAME",
+            format!("delta_log_it_{}", rand::thread_rng().gen::<u16>()),
+        );
+        match std::env::var(s3_constants::AWS_ENDPOINT_URL).ok() {
+            Some(endpoint_url) if endpoint_url.to_lowercase() == "none" => {
+                std::env::remove_var(s3_constants::AWS_ENDPOINT_URL)
+            }
+            Some(_) => (),
+            None => std::env::set_var(s3_constants::AWS_ENDPOINT_URL, "http://localhost:4566"),
+        }
+        set_env_if_not_set(s3_constants::AWS_ACCESS_KEY_ID, "deltalake");
+        set_env_if_not_set(s3_constants::AWS_SECRET_ACCESS_KEY, "weloverust");
+        set_env_if_not_set(s3_constants::AWS_REGION, "us-east-1");
+        set_env_if_not_set(s3_constants::AWS_S3_LOCKING_PROVIDER, "dynamodb");
+        set_env_if_not_set("DYNAMO_LOCK_TABLE_NAME", "test_table");
+        set_env_if_not_set("DYNAMO_LOCK_REFRESH_PERIOD_MILLIS", "100");
+        set_env_if_not_set("DYNAMO_LOCK_ADDITIONAL_TIME_TO_WAIT_MILLIS", "100");
+    }
+
+    /// copy directory
+    fn copy_directory(&self, source: &str, destination: &str) -> std::io::Result<ExitStatus> {
+        let destination = format!("{}/{destination}", self.root_uri());
+        let mut child = Command::new("aws")
+            .args(["s3", "cp", source, &destination, "--recursive"])
+            .spawn()
+            .expect("aws command is installed");
+        child.wait()
+    }
+}
+
+impl S3Integration {
+    /// delete bucket
+    fn delete_bucket(bucket_name: impl AsRef<str>) -> std::io::Result<ExitStatus> {
+        let mut child = Command::new("aws")
+            .args(["s3", "rb", bucket_name.as_ref(), "--force"])
+            .spawn()
+            .expect("aws command is installed");
+        child.wait()
+    }
+    fn create_dynamodb_table(
+        table_name: &str,
+        attr_definitions: &[&str],
+        key_schema: &[&str],
+    ) -> std::io::Result<ExitStatus> {
+        let args = [
+            "dynamodb",
+            "create-table",
+            "--table-name",
+            &table_name,
+            "--provisioned-throughput",
+            "ReadCapacityUnits=10,WriteCapacityUnits=10",
+            "--attribute-definitions",
+        ];
+        let mut child = Command::new("aws")
+            .args(args)
+            .args(attr_definitions.iter())
+            .arg("--key-schema")
+            .args(key_schema)
+            .stdout(Stdio::null())
+            .spawn()
+            .expect("aws command is installed");
+        let status = child.wait()?;
+        Self::wait_for_table(table_name)?;
+        Ok(status)
+    }
+
+    fn find_subsequence(haystack: &[u8], needle: &[u8]) -> Option<usize> {
+        haystack
+            .windows(needle.len())
+            .position(|window| window == needle)
+    }
+
+    fn wait_for_table(table_name: &str) -> std::io::Result<()> {
+        let args = ["dynamodb", "describe-table", "--table-name", &table_name];
+        loop {
+            let output = Command::new("aws")
+                .args(args)
+                .output()
+                .expect("aws command is installed");
+            if Self::find_subsequence(&output.stdout, "CREATING".as_bytes()).is_some() {
+                std::thread::sleep(std::time::Duration::from_millis(200));
+                continue;
+            } else {
+                return Ok(());
+            }
+        }
+    }
+
+    pub fn create_lock_table() -> std::io::Result<ExitStatus> {
+        let table_name =
+            std::env::var("DELTA_DYNAMO_TABLE_NAME").unwrap_or_else(|_| "delta_log".into());
+        Self::create_dynamodb_table(
+            &table_name,
+            &[
+                "AttributeName=tablePath,AttributeType=S",
+                "AttributeName=fileName,AttributeType=S",
+            ],
+            &[
+                "AttributeName=tablePath,KeyType=HASH",
+                "AttributeName=fileName,KeyType=RANGE",
+            ],
+        )
+    }
+
+    fn delete_dynamodb_table(table_name: &str) -> std::io::Result<ExitStatus> {
+        let mut child = Command::new("aws")
+            .args(["dynamodb", "delete-table", "--table-name", &table_name])
+            .stdout(Stdio::null())
+            .spawn()
+            .expect("aws command is installed");
+        child.wait()
+    }
+
+    pub fn delete_lock_table() -> std::io::Result<ExitStatus> {
+        let table_name =
+            std::env::var("DELTA_DYNAMO_TABLE_NAME").unwrap_or_else(|_| "delta_log".into());
+        Self::delete_dynamodb_table(&table_name)
+    
} +} + +impl Drop for S3Integration { + fn drop(&mut self) { + Self::delete_bucket(self.root_uri()).expect("Failed to drop bucket"); + Self::delete_lock_table().expect("Failed to delete lock table"); + } +} diff --git a/crates/deltalake-aws/tests/integration_read.rs b/crates/deltalake-aws/tests/integration_read.rs new file mode 100644 index 0000000000..5e9c6f1040 --- /dev/null +++ b/crates/deltalake-aws/tests/integration_read.rs @@ -0,0 +1,189 @@ +#![cfg(feature = "integration_test")] + +use deltalake_core::{DeltaTableBuilder, Path}; +use deltalake_test::utils::*; +use serial_test::serial; + +mod common; +use common::*; + +static TEST_PREFIXES: &[&str] = &["my table", "你好/😊"]; + +/// TEST_PREFIXES as they should appear in object stores. +static TEST_PREFIXES_ENCODED: &[&str] = &["my%20table", "%E4%BD%A0%E5%A5%BD/%F0%9F%98%8A"]; + +#[tokio::test] +#[serial] +async fn test_read_tables_aws() -> TestResult { + let context = IntegrationContext::new(Box::new(S3Integration::default()))?; + + read_tables(&context).await?; + + for (prefix, prefix_encoded) in TEST_PREFIXES.iter().zip(TEST_PREFIXES_ENCODED.iter()) { + read_table_paths(&context, prefix, prefix_encoded).await?; + } + + Ok(()) +} + +async fn read_tables(context: &IntegrationContext) -> TestResult { + context.load_table(TestTables::Simple).await?; + context.load_table(TestTables::Golden).await?; + context + .load_table(TestTables::Delta0_8_0SpecialPartitioned) + .await?; + + read_simple_table(&context).await?; + read_simple_table_with_version(&context).await?; + read_golden(&context).await?; + + Ok(()) +} + +async fn read_table_paths( + context: &IntegrationContext, + table_root: &str, + upload_path: &str, +) -> TestResult { + context + .load_table_with_name(TestTables::Delta0_8_0SpecialPartitioned, upload_path) + .await?; + + println!("table_root: {}", table_root); + verify_store(&context, table_root).await?; + + read_encoded_table(&context, table_root).await?; + + Ok(()) +} + +async fn verify_store(integration: &IntegrationContext, root_path: &str) -> TestResult { + let table_uri = format!("{}/{}", integration.root_uri(), root_path); + println!("working with table_uri: {}", table_uri); + let storage = DeltaTableBuilder::from_uri(table_uri.clone()) + .with_allow_http(true) + .build_storage()? 
+ .object_store(); + + let files = storage.list_with_delimiter(None).await?; + println!("files: {files:?}"); + assert_eq!( + vec![ + Path::parse("_delta_log").unwrap(), + Path::parse("x=A%2FA").unwrap(), + Path::parse("x=B%20B").unwrap(), + ], + files.common_prefixes + ); + + Ok(()) +} + +async fn read_encoded_table(integration: &IntegrationContext, root_path: &str) -> TestResult { + let table_uri = format!("{}/{}", integration.root_uri(), root_path); + + let table = DeltaTableBuilder::from_uri(table_uri) + .with_allow_http(true) + .load() + .await?; + + assert_eq!(table.version(), 0); + assert_eq!(table.get_files().len(), 2); + + Ok(()) +} + +async fn read_simple_table(integration: &IntegrationContext) -> TestResult { + let table_uri = integration.uri_for_table(TestTables::Simple); + let table = DeltaTableBuilder::from_uri(table_uri) + .with_allow_http(true) + .load() + .await?; + + assert_eq!(table.version(), 4); + assert_eq!(table.protocol().min_writer_version, 2); + assert_eq!(table.protocol().min_reader_version, 1); + assert_eq!( + table.get_files(), + vec![ + Path::from("part-00000-c1777d7d-89d9-4790-b38a-6ee7e24456b1-c000.snappy.parquet"), + Path::from("part-00001-7891c33d-cedc-47c3-88a6-abcfb049d3b4-c000.snappy.parquet"), + Path::from("part-00004-315835fe-fb44-4562-98f6-5e6cfa3ae45d-c000.snappy.parquet"), + Path::from("part-00007-3a0e4727-de0d-41b6-81ef-5223cf40f025-c000.snappy.parquet"), + Path::from("part-00000-2befed33-c358-4768-a43c-3eda0d2a499d-c000.snappy.parquet"), + ] + ); + let tombstones = table.get_state().all_tombstones(); + assert_eq!(tombstones.len(), 31); + assert!(tombstones.contains(&deltalake_core::kernel::Remove { + path: "part-00006-63ce9deb-bc0f-482d-b9a1-7e717b67f294-c000.snappy.parquet".to_string(), + deletion_timestamp: Some(1587968596250), + data_change: true, + extended_file_metadata: None, + deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, + size: None, + partition_values: None, + tags: None, + })); + + Ok(()) +} + +async fn read_simple_table_with_version(integration: &IntegrationContext) -> TestResult { + let table_uri = integration.uri_for_table(TestTables::Simple); + + let table = DeltaTableBuilder::from_uri(table_uri) + .with_allow_http(true) + .with_version(3) + .load() + .await?; + + assert_eq!(table.version(), 3); + assert_eq!(table.protocol().min_writer_version, 2); + assert_eq!(table.protocol().min_reader_version, 1); + assert_eq!( + table.get_files(), + vec![ + Path::from("part-00000-c1777d7d-89d9-4790-b38a-6ee7e24456b1-c000.snappy.parquet"), + Path::from("part-00001-7891c33d-cedc-47c3-88a6-abcfb049d3b4-c000.snappy.parquet"), + Path::from("part-00004-315835fe-fb44-4562-98f6-5e6cfa3ae45d-c000.snappy.parquet"), + Path::from("part-00007-3a0e4727-de0d-41b6-81ef-5223cf40f025-c000.snappy.parquet"), + Path::from("part-00000-f17fcbf5-e0dc-40ba-adae-ce66d1fcaef6-c000.snappy.parquet"), + Path::from("part-00001-bb70d2ba-c196-4df2-9c85-f34969ad3aa9-c000.snappy.parquet"), + ] + ); + let tombstones = table.get_state().all_tombstones(); + assert_eq!(tombstones.len(), 29); + assert!(tombstones.contains(&deltalake_core::kernel::Remove { + path: "part-00006-63ce9deb-bc0f-482d-b9a1-7e717b67f294-c000.snappy.parquet".to_string(), + deletion_timestamp: Some(1587968596250), + data_change: true, + tags: None, + partition_values: None, + base_row_id: None, + default_row_commit_version: None, + size: None, + deletion_vector: None, + extended_file_metadata: None, + })); + + Ok(()) +} + +async fn read_golden(integration: &IntegrationContext) 
-> TestResult { + let table_uri = integration.uri_for_table(TestTables::Golden); + + let table = DeltaTableBuilder::from_uri(table_uri) + .with_allow_http(true) + .load() + .await + .unwrap(); + + assert_eq!(table.version(), 0); + assert_eq!(table.protocol().min_writer_version, 2); + assert_eq!(table.protocol().min_reader_version, 1); + + Ok(()) +} diff --git a/crates/deltalake-core/tests/integration_s3_dynamodb.rs b/crates/deltalake-aws/tests/integration_s3_dynamodb.rs similarity index 91% rename from crates/deltalake-core/tests/integration_s3_dynamodb.rs rename to crates/deltalake-aws/tests/integration_s3_dynamodb.rs index 38bd8e3a16..7338ca1509 100644 --- a/crates/deltalake-core/tests/integration_s3_dynamodb.rs +++ b/crates/deltalake-aws/tests/integration_s3_dynamodb.rs @@ -1,32 +1,29 @@ //! Integration test to verify correct behavior of S3 DynamoDb locking. //! It inspects the state of the locking table after each operation. -#![cfg(all( - feature = "integration_test", - any(feature = "s3", feature = "s3-native-tls") -))] +#![cfg(feature = "integration_test")] use std::collections::HashMap; use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use deltalake_aws::logstore::{RepairLogEntryResult, S3DynamoDbLogStore}; +use deltalake_aws::storage::S3StorageOptions; use deltalake_aws::{CommitEntry, DynamoDbLockClient}; use deltalake_core::kernel::{Action, Add, DataType, PrimitiveType, StructField, StructType}; -use deltalake_core::logstore::s3::{RepairLogEntryResult, S3DynamoDbLogStore}; use deltalake_core::logstore::LogStore; use deltalake_core::operations::transaction::{commit, prepare_commit}; use deltalake_core::protocol::{DeltaOperation, SaveMode}; use deltalake_core::storage::commit_uri_from_version; -use deltalake_core::storage::config::StorageOptions; -use deltalake_core::storage::s3::S3StorageOptions; +use deltalake_core::storage::StorageOptions; use deltalake_core::table::builder::ensure_table_uri; -use deltalake_core::test_utils::{IntegrationContext, StorageIntegration, TestTables}; use deltalake_core::{DeltaOps, DeltaTable, DeltaTableBuilder}; +use deltalake_test::utils::*; use lazy_static::lazy_static; use object_store::path::Path; use serde_json::Value; use serial_test::serial; -#[allow(dead_code)] -mod fs_common; +mod common; +use common::*; pub type TestResult = Result>; @@ -49,7 +46,7 @@ fn make_client() -> TestResult { #[test] #[serial] fn client_config_picks_up_lock_table_name() -> TestResult<()> { - let _context = IntegrationContext::new(StorageIntegration::Amazon)?; + let _context = IntegrationContext::new(Box::new(S3Integration::default()))?; assert!(make_client()? 
.get_lock_table_name() .starts_with("delta_log_it_")); @@ -59,7 +56,7 @@ fn client_config_picks_up_lock_table_name() -> TestResult<()> { #[tokio::test] #[serial] async fn get_missing_item() -> TestResult<()> { - let _context = IntegrationContext::new(StorageIntegration::Amazon)?; + let _context = IntegrationContext::new(Box::new(S3Integration::default()))?; let client = make_client()?; let version = i64::MAX; let result = client @@ -75,7 +72,7 @@ async fn get_missing_item() -> TestResult<()> { #[tokio::test] #[serial] async fn test_append() -> TestResult<()> { - let context = IntegrationContext::new(StorageIntegration::Amazon)?; + let context = IntegrationContext::new(Box::new(S3Integration::default()))?; let table = prepare_table(&context, "delta01").await?; validate_lock_table_state(&table, 0).await?; append_to_table("datav01.parquet", &table, None).await?; @@ -86,7 +83,7 @@ async fn test_append() -> TestResult<()> { #[tokio::test] #[serial] async fn test_repair_commit_entry() -> TestResult<()> { - let context = IntegrationContext::new(StorageIntegration::Amazon)?; + let context = IntegrationContext::new(Box::new(S3Integration::default()))?; let client = make_client()?; let table = prepare_table(&context, "repair_needed").await?; let options: StorageOptions = OPTIONS.clone().into(); @@ -135,7 +132,7 @@ async fn test_repair_commit_entry() -> TestResult<()> { #[tokio::test] #[serial] async fn test_repair_on_update() -> TestResult<()> { - let context = IntegrationContext::new(StorageIntegration::Amazon)?; + let context = IntegrationContext::new(Box::new(S3Integration::default()))?; let mut table = prepare_table(&context, "repair_on_update").await?; let _entry = create_incomplete_commit_entry(&table, 1, "unfinished_commit").await?; table.update().await?; @@ -152,9 +149,12 @@ const COMMITS: i64 = 5; #[serial] async fn test_concurrent_writers() -> TestResult<()> { // Goal: a test with multiple writers, very similar to `integration_concurrent_writes` - let context = IntegrationContext::new(StorageIntegration::Amazon)?; + let context = IntegrationContext::new(Box::new(S3Integration::default()))?; + println!(">>> preparing table"); let table = prepare_table(&context, "concurrent_writes").await?; + println!(">>> table prepared"); let table_uri = table.table_uri(); + println!("Starting workers on {table_uri}"); let mut workers = Vec::new(); for w in 0..WORKERS { @@ -187,6 +187,7 @@ impl Worker { .load() .await .unwrap(); + println!("Loaded table in worker: {table:?}"); Self { table, name } } @@ -271,11 +272,13 @@ async fn prepare_table(context: &IntegrationContext, table_name: &str) -> TestRe .with_allow_http(true) .with_storage_options(OPTIONS.clone()) .build()?; + println!("table built: {table:?}"); // create delta table let table = DeltaOps(table) .create() .with_columns(schema.fields().clone()) .await?; + println!("table created: {table:?}"); Ok(table) } diff --git a/crates/deltalake-core/tests/repair_s3_rename_test.rs b/crates/deltalake-aws/tests/repair_s3_rename_test.rs similarity index 95% rename from crates/deltalake-core/tests/repair_s3_rename_test.rs rename to crates/deltalake-aws/tests/repair_s3_rename_test.rs index ecab792f39..a48af20d0f 100644 --- a/crates/deltalake-core/tests/repair_s3_rename_test.rs +++ b/crates/deltalake-aws/tests/repair_s3_rename_test.rs @@ -1,16 +1,14 @@ -#![cfg(all( - any(feature = "s3", feature = "s3-native-tls"), - feature = "integration_test" -))] +#![cfg(feature = "integration_test")] + use bytes::Bytes; -use deltalake_core::test_utils::{IntegrationContext, 
StorageIntegration}; -use deltalake_core::{storage::s3::S3StorageBackend, DeltaTableBuilder, ObjectStore}; -use futures::stream::BoxStream; -use object_store::path::Path; -use object_store::{ +use deltalake_aws::storage::S3StorageBackend; +use deltalake_core::storage::object_store::{ DynObjectStore, Error as ObjectStoreError, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, Result as ObjectStoreResult, }; +use deltalake_core::{DeltaTableBuilder, ObjectStore, Path}; +use deltalake_test::utils::{IntegrationContext, StorageIntegration}; +use futures::stream::BoxStream; use serial_test::serial; use std::ops::Range; use std::sync::{Arc, Mutex}; @@ -18,6 +16,9 @@ use tokio::io::AsyncWrite; use tokio::task::JoinHandle; use tokio::time::Duration; +mod common; +use common::*; + #[tokio::test(flavor = "multi_thread")] #[serial] #[ignore = "currently tests are hanging"] @@ -43,7 +44,7 @@ async fn repair_when_worker_pauses_after_rename_test() { async fn run_repair_test_case(path: &str, pause_copy: bool) -> Result<(), ObjectStoreError> { std::env::set_var("AWS_S3_LOCKING_PROVIDER", "dynamodb"); std::env::set_var("DYNAMO_LOCK_LEASE_DURATION", "2"); - let context = IntegrationContext::new(StorageIntegration::Amazon).unwrap(); + let context = IntegrationContext::new(Box::new(S3Integration::default())).unwrap(); let root_path = Path::from(path); let src1 = root_path.child("src1"); diff --git a/crates/deltalake-core/.gitignore b/crates/deltalake-core/.gitignore index a403c2926d..fd7fc6ad51 100644 --- a/crates/deltalake-core/.gitignore +++ b/crates/deltalake-core/.gitignore @@ -1,12 +1,4 @@ target/ /.idea/ *.bat -tests/data/checkpoints_tombstones/expired/ -tests/data/checkpoints_tombstones/metadata_broken/ -tests/data/checkpoints_tombstones/metadata_false/ -tests/data/checkpoints_tombstones/metadata_true/ -tests/data/checkpoints_with_expired_logs/ -tests/data/read_null_partitions_from_checkpoint/ -tests/data/action_reconciliation/ -tests/data/simple_table_with_no_checkpoint/ -tests/data/simple_table_with_no_checkpoint_2/ +tests/data diff --git a/crates/deltalake-core/Cargo.toml b/crates/deltalake-core/Cargo.toml index d29712baeb..8ace57f36c 100644 --- a/crates/deltalake-core/Cargo.toml +++ b/crates/deltalake-core/Cargo.toml @@ -81,7 +81,6 @@ num-traits = "0.2.15" object_store = "0.7" once_cell = "1.16.0" parking_lot = "0.12" -parquet2 = { version = "0.17", optional = true } percent-encoding = "2" roaring = "0.10.1" tracing = { version = "0.1", optional = true } @@ -89,19 +88,6 @@ rand = "0.8" z85 = "3.0.5" maplit = "1" -# hdfs -datafusion-objectstore-hdfs = { version = "0.1.3", default-features = false, features = [ - "hdfs3", - "try_spawn_blocking", -], optional = true } - -# S3 lock client -rusoto_core = { version = "0.47", default-features = false, optional = true } -rusoto_credential = { version = "0.47", optional = true } -rusoto_sts = { version = "0.47", default-features = false, optional = true } -deltalake-aws = { path = "../deltalake-aws", default-features = false, optional = true } - - # Unity reqwest = { version = "0.11.18", default-features = false, features = [ "rustls-tls", @@ -109,7 +95,7 @@ reqwest = { version = "0.11.18", default-features = false, features = [ ], optional = true } # Datafusion -dashmap = { version = "5", optional = true } +dashmap = "5" sqlparser = { version = "0.39", optional = true } @@ -118,18 +104,20 @@ fs_extra = { version = "1.3.0", optional = true } tempdir = { version = "0", optional = true } [dev-dependencies] +criterion = "0.5" ctor = "0" +deltalake-test 
= { path = "../deltalake-test", features = ["datafusion"] } dotenvy = "0" +hyper = { version = "0.14", features = ["server"] } maplit = "1" pretty_assertions = "1.2.1" +pretty_env_logger = "*" rand = "0.8" serial_test = "2" tempdir = "0" tempfile = "3" tokio = { version = "1", features = ["macros", "rt-multi-thread"] } utime = "0.3" -hyper = { version = "0.14", features = ["server"] } -criterion = "0.5" [features] azure = ["object_store/azure"] @@ -146,7 +134,6 @@ arrow = [ ] default = ["arrow", "parquet"] datafusion = [ - "dep:arrow", "dep:datafusion", "datafusion-expr", "datafusion-common", @@ -156,29 +143,13 @@ datafusion = [ "sqlparser", "arrow", "parquet", - "dashmap", ] datafusion-ext = ["datafusion"] gcs = ["object_store/gcp"] -hdfs = ["datafusion-objectstore-hdfs"] # used only for integration testing integration_test = ["fs_extra", "tempdir"] json = ["parquet/json"] python = ["arrow/pyarrow"] -s3-native-tls = [ - "rusoto_core/native-tls", - "rusoto_credential", - "rusoto_sts/native-tls", - "object_store/aws", - "deltalake-aws/native-tls", -] -s3 = [ - "rusoto_core/rustls", - "rusoto_credential", - "rusoto_sts/rustls", - "object_store/aws", - "deltalake-aws/rustls", -] unity-experimental = ["reqwest", "tracing", "hyper"] [[bench]] diff --git a/crates/deltalake-core/README.md b/crates/deltalake-core/README.md index 64d17dcae9..7cb674ea11 100644 --- a/crates/deltalake-core/README.md +++ b/crates/deltalake-core/README.md @@ -48,11 +48,7 @@ cargo run --example read_delta_table - `datafusion` - enable the `datafusion::datasource::TableProvider` trait implementation for Delta Tables, allowing them to be queried using [DataFusion](https://github.com/apache/arrow-datafusion). - `datafusion-ext` - DEPRECATED: alias for `datafusion` feature - `gcs` - enable the Google storage backend to work with Delta Tables in Google Cloud Storage. -- `hdfs` - enable the HDFS storage backend to work with Delta Tables in HDFS. - `json` - enable the JSON feature of the `parquet` crate for better JSON interoperability. -- `parquet2` - use parquet2 for checkpoint deserialization. Since `arrow` and `parquet` features are enabled by default for backwards compatibility, this feature needs to be used with `--no-default-features`. -- `s3` - enable the S3 storage backend to work with Delta Tables in AWS S3. Uses [rustls](https://github.com/ctz/rustls). -- `s3-native-tls` - enable the S3 storage backend but rely on OpenSSL. ## Development diff --git a/crates/deltalake-core/src/data_catalog/storage/mod.rs b/crates/deltalake-core/src/data_catalog/storage/mod.rs index 729b5de224..a332464952 100644 --- a/crates/deltalake-core/src/data_catalog/storage/mod.rs +++ b/crates/deltalake-core/src/data_catalog/storage/mod.rs @@ -14,7 +14,7 @@ use object_store::ObjectStore; use crate::errors::DeltaResult; use crate::open_table_with_storage_options; -use crate::storage::config::{configure_store, StorageOptions}; +use crate::storage::*; use crate::table::builder::ensure_table_uri; const DELTA_LOG_FOLDER: &str = "_delta_log"; @@ -47,9 +47,9 @@ impl ListingSchemaProvider { storage_options: Option>, ) -> DeltaResult { let uri = ensure_table_uri(root_uri)?; - let mut storage_options = storage_options.unwrap_or_default().into(); + let storage_options = storage_options.unwrap_or_default().into(); // We already parsed the url, so unwrapping is safe. 
- let store = configure_store(&uri, &mut storage_options)?; + let store = store_for(&uri)?; Ok(Self { authority: uri.to_string(), store, @@ -163,7 +163,7 @@ mod tests { #[tokio::test] async fn test_table_names() { - let fs = ListingSchemaProvider::try_new("./tests/data/", None).unwrap(); + let fs = ListingSchemaProvider::try_new("../deltalake-test/tests/data/", None).unwrap(); fs.refresh().await.unwrap(); let table_names = fs.table_names(); assert!(table_names.len() > 20); @@ -172,7 +172,9 @@ mod tests { #[tokio::test] async fn test_query_table() { - let schema = Arc::new(ListingSchemaProvider::try_new("./tests/data/", None).unwrap()); + let schema = Arc::new( + ListingSchemaProvider::try_new("../deltalake-test/tests/data/", None).unwrap(), + ); schema.refresh().await.unwrap(); let ctx = SessionContext::new(); diff --git a/crates/deltalake-core/src/delta_datafusion/mod.rs b/crates/deltalake-core/src/delta_datafusion/mod.rs index 59cc2ba0ac..bfd268fc77 100644 --- a/crates/deltalake-core/src/delta_datafusion/mod.rs +++ b/crates/deltalake-core/src/delta_datafusion/mod.rs @@ -1911,7 +1911,7 @@ mod tests { #[tokio::test] async fn delta_table_provider_with_config() { - let table = crate::open_table("tests/data/delta-2.2.0-partitioned-types") + let table = crate::open_table("../deltalake-test/tests/data/delta-2.2.0-partitioned-types") .await .unwrap(); let config = DeltaScanConfigBuilder::new() diff --git a/crates/deltalake-core/src/errors.rs b/crates/deltalake-core/src/errors.rs index aaa21a4801..67963042f8 100644 --- a/crates/deltalake-core/src/errors.rs +++ b/crates/deltalake-core/src/errors.rs @@ -23,17 +23,13 @@ pub enum DeltaTableError { }, /// Error returned when parsing checkpoint parquet. - #[cfg(any(feature = "parquet", feature = "parquet2"))] + #[cfg(feature = "parquet")] #[error("Failed to parse parquet: {}", .source)] Parquet { /// Parquet error details returned when reading the checkpoint failed. #[cfg(feature = "parquet")] #[from] source: parquet::errors::ParquetError, - /// Parquet error details returned when reading the checkpoint failed. - #[cfg(feature = "parquet2")] - #[from] - source: parquet2::error::Error, }, /// Error returned when converting the schema in Arrow format failed. 
@@ -231,6 +227,7 @@ impl From for DeltaTableError { ProtocolError::Arrow { source } => DeltaTableError::Arrow { source }, ProtocolError::IO { source } => DeltaTableError::Io { source }, ProtocolError::ObjectStore { source } => DeltaTableError::ObjectStore { source }, + #[cfg(feature = "parquet")] ProtocolError::ParquetParseError { source } => DeltaTableError::Parquet { source }, _ => DeltaTableError::Protocol { source: value }, } diff --git a/crates/deltalake-core/src/kernel/actions/types.rs b/crates/deltalake-core/src/kernel/actions/types.rs index f64a5caa08..67a94ec1c4 100644 --- a/crates/deltalake-core/src/kernel/actions/types.rs +++ b/crates/deltalake-core/src/kernel/actions/types.rs @@ -187,7 +187,7 @@ pub enum ReaderFeatures { Other(String), } -#[cfg(all(not(feature = "parquet2"), feature = "parquet"))] +#[cfg(feature = "parquet")] impl From<&parquet::record::Field> for ReaderFeatures { fn from(value: &parquet::record::Field) -> Self { match value { @@ -330,7 +330,7 @@ impl fmt::Display for WriterFeatures { } } -#[cfg(all(not(feature = "parquet2"), feature = "parquet"))] +#[cfg(feature = "parquet")] impl From<&parquet::record::Field> for WriterFeatures { fn from(value: &parquet::record::Field) -> Self { match value { @@ -599,10 +599,6 @@ pub struct Add { #[cfg(feature = "parquet")] #[serde(skip_serializing, skip_deserializing)] pub partition_values_parsed: Option, - /// Partition values parsed for parquet2 - #[cfg(feature = "parquet2")] - #[serde(skip_serializing, skip_deserializing)] - pub partition_values_parsed: Option, /// Contains statistics (e.g., count, min/max values for columns) about the data in this file in /// raw parquet format. This field needs to be written when statistics are available and the @@ -612,10 +608,6 @@ pub struct Add { #[cfg(feature = "parquet")] #[serde(skip_serializing, skip_deserializing)] pub stats_parsed: Option, - /// Stats parsed for parquet2 - #[cfg(feature = "parquet2")] - #[serde(skip_serializing, skip_deserializing)] - pub stats_parsed: Option, } impl Add { @@ -951,8 +943,10 @@ mod tests { let inline = dv_inline(); assert_eq!(None, inline.absolute_path(&parent).unwrap()); - let path = - std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/")).unwrap(); + let path = std::fs::canonicalize(PathBuf::from( + "../deltalake-test/tests/data/table-with-dv-small/", + )) + .unwrap(); let parent = url::Url::from_directory_path(path).unwrap(); let dv_url = parent .join("deletion_vector_61d16c75-6994-46b7-a15b-8b538852e50e.bin") @@ -971,7 +965,7 @@ mod tests { // fn test_deletion_vector_read() { // let store = Arc::new(LocalFileSystem::new()); // let path = - // std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/")).unwrap(); + // std::fs::canonicalize(PathBuf::from("../deltalake-test/tests/data/table-with-dv-small/")).unwrap(); // let parent = url::Url::from_directory_path(path).unwrap(); // let root = object_store::path::Path::from(parent.path()); // let fs_client = Arc::new(ObjectStoreFileSystemClient::new( diff --git a/crates/deltalake-core/src/lib.rs b/crates/deltalake-core/src/lib.rs index d95f8d40e3..ba6f17d032 100644 --- a/crates/deltalake-core/src/lib.rs +++ b/crates/deltalake-core/src/lib.rs @@ -6,7 +6,7 @@ //! //! ```rust //! async { -//! let table = deltalake_core::open_table("./tests/data/simple_table").await.unwrap(); +//! let table = deltalake_core::open_table("../deltalake-test/tests/data/simple_table").await.unwrap(); //! let files = table.get_files(); //! }; //! ``` @@ -15,7 +15,7 @@ //! //! ```rust //! 
async { -//! let table = deltalake_core::open_table_with_version("./tests/data/simple_table", 0).await.unwrap(); +//! let table = deltalake_core::open_table_with_version("../deltalake-test/tests/data/simple_table", 0).await.unwrap(); //! let files = table.get_files_by_partitions(&[deltalake_core::PartitionFilter { //! key: "month".to_string(), //! value: deltalake_core::PartitionValue::Equal("12".to_string()), @@ -28,7 +28,7 @@ //! ```rust //! async { //! let table = deltalake_core::open_table_with_ds( -//! "./tests/data/simple_table", +//! "../deltalake-test/tests/data/simple_table", //! "2020-05-02T23:47:31-07:00", //! ).await.unwrap(); //! let files = table.get_files(); @@ -43,8 +43,6 @@ //! - `datafusion` - enable the `datafusion::datasource::TableProvider` trait implementation //! for Delta Tables, allowing them to be queried using [DataFusion](https://github.com/apache/arrow-datafusion). //! - `datafusion-ext` - DEPRECATED: alias for `datafusion` feature. -//! - `parquet2` - use parquet2 for checkpoint deserialization. Since `arrow` and `parquet` features -//! are enabled by default for backwards compatibility, this feature needs to be used with `--no-default-features`. //! //! # Querying Delta Tables with Datafusion //! @@ -55,7 +53,7 @@ //! //! async { //! let mut ctx = SessionContext::new(); -//! let table = deltalake_core::open_table("./tests/data/simple_table") +//! let table = deltalake_core::open_table("../deltalake-test/tests/data/simple_table") //! .await //! .unwrap(); //! ctx.register_table("demo", Arc::new(table)).unwrap(); @@ -71,16 +69,6 @@ #![allow(rustdoc::invalid_html_tags)] #![allow(clippy::nonminimal_bool)] -#[cfg(all(feature = "parquet", feature = "parquet2"))] -compile_error!( - "Features parquet and parquet2 are mutually exclusive and cannot be enabled together" -); - -#[cfg(all(feature = "s3", feature = "s3-native-tls"))] -compile_error!( - "Features s3 and s3-native-tls are mutually exclusive and cannot be enabled together" -); - #[cfg(all(feature = "glue", feature = "glue-native-tls"))] compile_error!( "Features glue and glue-native-tls are mutually exclusive and cannot be enabled together" @@ -122,16 +110,9 @@ pub use arrow; pub use datafusion; #[cfg(feature = "parquet")] pub use parquet; -#[cfg(feature = "parquet2")] -pub use parquet2; #[cfg(all(feature = "arrow", feature = "parquet"))] pub use protocol::checkpoints; -// needed only for integration tests -// TODO can / should we move this into the test crate? -#[cfg(feature = "integration_test")] -pub mod test_utils; - /// Creates and loads a DeltaTable from the given path with current metadata. /// Infers the storage backend to use from the scheme in the given table path. 
/// @@ -202,7 +183,9 @@ mod tests { #[tokio::test] async fn read_delta_2_0_table_without_version() { - let table = crate::open_table("./tests/data/delta-0.2.0").await.unwrap(); + let table = crate::open_table("../deltalake-test/tests/data/delta-0.2.0") + .await + .unwrap(); assert_eq!(table.version(), 3); assert_eq!(table.protocol().min_writer_version, 2); assert_eq!(table.protocol().min_reader_version, 1); @@ -232,7 +215,7 @@ mod tests { #[tokio::test] async fn read_delta_table_with_update() { - let path = "./tests/data/simple_table_with_checkpoint/"; + let path = "../deltalake-test/tests/data/simple_table_with_checkpoint/"; let table_newest_version = crate::open_table(path).await.unwrap(); let mut table_to_update = crate::open_table_with_version(path, 0).await.unwrap(); // calling update several times should not produce any duplicates @@ -247,9 +230,10 @@ mod tests { } #[tokio::test] async fn read_delta_2_0_table_with_version() { - let mut table = crate::open_table_with_version("./tests/data/delta-0.2.0", 0) - .await - .unwrap(); + let mut table = + crate::open_table_with_version("../deltalake-test/tests/data/delta-0.2.0", 0) + .await + .unwrap(); assert_eq!(table.version(), 0); assert_eq!(table.protocol().min_writer_version, 2); assert_eq!(table.protocol().min_reader_version, 1); @@ -261,7 +245,7 @@ mod tests { ], ); - table = crate::open_table_with_version("./tests/data/delta-0.2.0", 2) + table = crate::open_table_with_version("../deltalake-test/tests/data/delta-0.2.0", 2) .await .unwrap(); assert_eq!(table.version(), 2); @@ -275,7 +259,7 @@ mod tests { ] ); - table = crate::open_table_with_version("./tests/data/delta-0.2.0", 3) + table = crate::open_table_with_version("../deltalake-test/tests/data/delta-0.2.0", 3) .await .unwrap(); assert_eq!(table.version(), 3); @@ -293,7 +277,9 @@ mod tests { #[tokio::test] async fn read_delta_8_0_table_without_version() { - let table = crate::open_table("./tests/data/delta-0.8.0").await.unwrap(); + let table = crate::open_table("../deltalake-test/tests/data/delta-0.8.0") + .await + .unwrap(); assert_eq!(table.version(), 1); assert_eq!(table.protocol().min_writer_version, 2); assert_eq!(table.protocol().min_reader_version, 1); @@ -339,7 +325,9 @@ mod tests { #[tokio::test] async fn read_delta_8_0_table_with_load_version() { - let mut table = crate::open_table("./tests/data/delta-0.8.0").await.unwrap(); + let mut table = crate::open_table("../deltalake-test/tests/data/delta-0.8.0") + .await + .unwrap(); assert_eq!(table.version(), 1); assert_eq!(table.protocol().min_writer_version, 2); assert_eq!(table.protocol().min_reader_version, 1); @@ -365,8 +353,7 @@ mod tests { #[tokio::test] async fn read_delta_8_0_table_with_partitions() { - let current_dir = Path::from_filesystem_path(std::env::current_dir().unwrap()).unwrap(); - let table = crate::open_table("./tests/data/delta-0.8.0-partitioned") + let table = crate::open_table("../deltalake-test/tests/data/delta-0.8.0-partitioned") .await .unwrap(); @@ -388,23 +375,13 @@ mod tests { Path::from("year=2020/month=2/day=5/part-00000-89cdd4c8-2af7-4add-8ea3-3990b2f027b5.c000.snappy.parquet") ] ); - - #[cfg(unix)] assert_eq!( table.get_file_uris_by_partitions(&filters).unwrap(), vec![ - format!("/{}/tests/data/delta-0.8.0-partitioned/year=2020/month=2/day=3/part-00000-94d16827-f2fd-42cd-a060-f67ccc63ced9.c000.snappy.parquet", current_dir.as_ref()), - format!("/{}/tests/data/delta-0.8.0-partitioned/year=2020/month=2/day=5/part-00000-89cdd4c8-2af7-4add-8ea3-3990b2f027b5.c000.snappy.parquet", 
current_dir.as_ref()) + std::fs::canonicalize("../deltalake-test/tests/data/delta-0.8.0-partitioned/year=2020/month=2/day=3/part-00000-94d16827-f2fd-42cd-a060-f67ccc63ced9.c000.snappy.parquet").unwrap().as_path().to_string_lossy(), + std::fs::canonicalize("../deltalake-test/tests/data/delta-0.8.0-partitioned/year=2020/month=2/day=5/part-00000-89cdd4c8-2af7-4add-8ea3-3990b2f027b5.c000.snappy.parquet").unwrap().as_path().to_string_lossy(), ] - ); - #[cfg(windows)] - assert_eq!( - table.get_file_uris_by_partitions(&filters).unwrap(), - vec![ - format!("{}/tests/data/delta-0.8.0-partitioned/year=2020/month=2/day=3/part-00000-94d16827-f2fd-42cd-a060-f67ccc63ced9.c000.snappy.parquet", current_dir.as_ref()), - format!("{}/tests/data/delta-0.8.0-partitioned/year=2020/month=2/day=5/part-00000-89cdd4c8-2af7-4add-8ea3-3990b2f027b5.c000.snappy.parquet", current_dir.as_ref()) - ] - ); + ); let filters = vec![crate::PartitionFilter { key: "month".to_string(), @@ -449,7 +426,7 @@ mod tests { #[tokio::test] async fn read_delta_8_0_table_with_null_partition() { - let table = crate::open_table("./tests/data/delta-0.8.0-null-partition") + let table = crate::open_table("../deltalake-test/tests/data/delta-0.8.0-null-partition") .await .unwrap(); @@ -478,7 +455,7 @@ mod tests { #[tokio::test] async fn read_delta_8_0_table_with_special_partition() { - let table = crate::open_table("./tests/data/delta-0.8.0-special-partition") + let table = crate::open_table("../deltalake-test/tests/data/delta-0.8.0-special-partition") .await .unwrap(); @@ -511,7 +488,7 @@ mod tests { #[tokio::test] async fn read_delta_8_0_table_partition_with_compare_op() { - let table = crate::open_table("./tests/data/delta-0.8.0-numeric-partition") + let table = crate::open_table("../deltalake-test/tests/data/delta-0.8.0-numeric-partition") .await .unwrap(); @@ -538,11 +515,10 @@ mod tests { ); } - // TODO: enable this for parquet2 #[cfg(feature = "parquet")] #[tokio::test] async fn read_delta_1_2_1_struct_stats_table() { - let table_uri = "./tests/data/delta-1.2.1-only-struct-stats"; + let table_uri = "../deltalake-test/tests/data/delta-1.2.1-only-struct-stats"; let table_from_struct_stats = crate::open_table(table_uri).await.unwrap(); let table_from_json_stats = crate::open_table_with_version(table_uri, 1).await.unwrap(); @@ -574,7 +550,7 @@ mod tests { #[tokio::test] async fn test_table_history() { - let path = "./tests/data/simple_table_with_checkpoint"; + let path = "../deltalake-test/tests/data/simple_table_with_checkpoint"; let mut latest_table = crate::open_table(path).await.unwrap(); let mut table = crate::open_table_with_version(path, 1).await.unwrap(); @@ -596,7 +572,7 @@ mod tests { #[tokio::test] async fn test_poll_table_commits() { - let path = "./tests/data/simple_table_with_checkpoint"; + let path = "../deltalake-test/tests/data/simple_table_with_checkpoint"; let mut table = crate::open_table_with_version(path, 9).await.unwrap(); let peek = table.peek_next_commit(table.version()).await.unwrap(); assert!(matches!(peek, PeekCommit::New(..))); @@ -626,14 +602,14 @@ mod tests { #[tokio::test] async fn test_read_vacuumed_log() { - let path = "./tests/data/checkpoints_vacuumed"; + let path = "../deltalake-test/tests/data/checkpoints_vacuumed"; let table = crate::open_table(path).await.unwrap(); assert_eq!(table.version(), 12); } #[tokio::test] async fn test_read_vacuumed_log_history() { - let path = "./tests/data/checkpoints_vacuumed"; + let path = "../deltalake-test/tests/data/checkpoints_vacuumed"; let mut table = 
crate::open_table(path).await.unwrap(); // load history for table version with available log file @@ -678,7 +654,7 @@ mod tests { #[tokio::test] async fn read_delta_table_with_cdc() { - let table = crate::open_table("./tests/data/simple_table_with_cdc") + let table = crate::open_table("../deltalake-test/tests/data/simple_table_with_cdc") .await .unwrap(); assert_eq!(table.version(), 2); @@ -692,7 +668,7 @@ mod tests { #[tokio::test()] async fn test_version_zero_table_load() { - let path = "./tests/data/COVID-19_NYT"; + let path = "../deltalake-test/tests/data/COVID-19_NYT"; let mut latest_table: DeltaTable = crate::open_table(path).await.unwrap(); let mut version_0_table = crate::open_table_with_version(path, 0).await.unwrap(); @@ -713,7 +689,7 @@ mod tests { async fn test_fail_fast_on_not_existing_path() { use std::path::Path as FolderPath; - let non_existing_path_str = "./tests/data/folder_doesnt_exist"; + let non_existing_path_str = "../deltalake-test/tests/data/folder_doesnt_exist"; // Check that there is no such path at the beginning let path_doesnt_exist = !FolderPath::new(non_existing_path_str).exists(); diff --git a/crates/deltalake-core/src/logstore/default_logstore.rs b/crates/deltalake-core/src/logstore/default_logstore.rs index 275732fb1a..ed463e9947 100644 --- a/crates/deltalake-core/src/logstore/default_logstore.rs +++ b/crates/deltalake-core/src/logstore/default_logstore.rs @@ -29,6 +29,10 @@ impl DefaultLogStore { #[async_trait::async_trait] impl LogStore for DefaultLogStore { + fn name(&self) -> String { + "DefaultLogStore".into() + } + async fn read_commit_entry(&self, version: i64) -> DeltaResult> { super::read_commit_entry(self.storage.as_ref(), version).await } @@ -54,15 +58,6 @@ impl LogStore for DefaultLogStore { self.storage.clone() } - fn to_uri(&self, location: &Path) -> String { - super::to_uri(&self.config.location, location) - } - - #[cfg(feature = "datafusion")] - fn object_store_url(&self) -> datafusion::execution::object_store::ObjectStoreUrl { - super::object_store_url(&self.config.location) - } - fn config(&self) -> &LogStoreConfig { &self.config } diff --git a/crates/deltalake-core/src/logstore/mod.rs b/crates/deltalake-core/src/logstore/mod.rs index dd13b6bdc5..8bb6b3cd75 100644 --- a/crates/deltalake-core/src/logstore/mod.rs +++ b/crates/deltalake-core/src/logstore/mod.rs @@ -1,4 +1,5 @@ //! Delta log store. 
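+//!
+//! Log store implementations are resolved through a registry of [LogStoreFactory]
+//! instances keyed by URL scheme; see [logstores] and [logstore_for] below.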
+use dashmap::DashMap; use futures::StreamExt; use lazy_static::lazy_static; use regex::Regex; @@ -8,6 +9,7 @@ use serde::{ Deserialize, Serialize, }; use std::io::{BufRead, BufReader, Cursor}; +use std::sync::OnceLock; use std::{cmp::max, collections::HashMap, sync::Arc}; use url::Url; @@ -16,19 +18,71 @@ use crate::{ kernel::Action, operations::transaction::TransactionError, protocol::{get_last_checkpoint, ProtocolError}, - storage::{commit_uri_from_version, config::StorageOptions}, + storage::{commit_uri_from_version, ObjectStoreRef, StorageOptions}, DeltaTableError, }; use bytes::Bytes; -use log::debug; +use log::*; use object_store::{path::Path, Error as ObjectStoreError, ObjectStore}; #[cfg(feature = "datafusion")] use datafusion::datasource::object_store::ObjectStoreUrl; pub mod default_logstore; -#[cfg(any(feature = "s3", feature = "s3-native-tls"))] -pub mod s3; + +/// Trait for generating [LogStore] implementations +pub trait LogStoreFactory: Send + Sync { + /// Create a new [LogStore] + fn with_options( + &self, + store: ObjectStoreRef, + location: &Url, + options: &StorageOptions, + ) -> DeltaResult> { + Ok(default_logstore(store, location, options)) + } +} + +/// Return the [DefaultLogStore] implementation with the provided configuration options +pub fn default_logstore( + store: ObjectStoreRef, + location: &Url, + options: &StorageOptions, +) -> Arc { + Arc::new(default_logstore::DefaultLogStore::new( + store, + LogStoreConfig { + location: location.clone(), + options: options.clone(), + }, + )) +} + +#[derive(Clone, Debug, Default)] +struct DefaultLogStoreFactory {} +impl LogStoreFactory for DefaultLogStoreFactory {} + +/// Registry of [LogStoreFactory] instances +pub type FactoryRegistry = Arc>>; + +/// TODO +pub fn logstores() -> FactoryRegistry { + static REGISTRY: OnceLock = OnceLock::new(); + REGISTRY + .get_or_init(|| { + let registry = FactoryRegistry::default(); + registry.insert( + Url::parse("memory://").unwrap(), + Arc::new(DefaultLogStoreFactory::default()), + ); + registry.insert( + Url::parse("file://").unwrap(), + Arc::new(DefaultLogStoreFactory::default()), + ); + registry + }) + .clone() +} /// Sharable reference to [`LogStore`] pub type LogStoreRef = Arc; @@ -37,6 +91,56 @@ lazy_static! 
{ static ref DELTA_LOG_PATH: Path = Path::from("_delta_log"); } +/// Return the [LogStoreRef] for the provided [Url] location +/// +/// This will use the built-in process global [crate::storage::ObjectStoreRegistry] by default +/// +/// ```rust +/// # use deltalake_core::logstore::*; +/// # use std::collections::HashMap; +/// # use url::Url; +/// let location = Url::parse("file:///tmp").expect("Failed to make location"); +/// let logstore = logstore_for(location, HashMap::new()).expect("Failed to get a logstore"); +/// ``` +pub fn logstore_for( + location: Url, + options: impl Into + Clone, +) -> DeltaResult { + // turn location into scheme + let scheme = Url::parse(&format!("{}://", location.scheme())) + .map_err(|_| DeltaTableError::InvalidTableLocation(location.clone().into()))?; + + if let Some(entry) = crate::storage::factories().get(&scheme) { + debug!("Found a storage provider for {scheme} ({location})"); + let (store, _prefix) = entry + .value() + .parse_url_opts(&location, &options.clone().into())?; + return logstore_with(store, location, options); + } + Err(DeltaTableError::InvalidTableLocation(location.into())) +} + +/// Return the [LogStoreRef] using the given [ObjectStoreRef] +pub fn logstore_with( + store: ObjectStoreRef, + location: Url, + options: impl Into + Clone, +) -> DeltaResult { + let scheme = Url::parse(&format!("{}://", location.scheme())) + .map_err(|_| DeltaTableError::InvalidTableLocation(location.clone().into()))?; + + if let Some(factory) = logstores().get(&scheme) { + debug!("Found a logstore provider for {scheme}"); + return factory.with_options(store, &location, &options.into()); + } else { + println!("Could not find a logstore for the scheme {scheme}"); + warn!("Could not find a logstore for the scheme {scheme}"); + } + Err(DeltaTableError::InvalidTableLocation( + location.clone().into(), + )) +} + /// Configuration parameters for a log store #[derive(Debug, Clone)] pub struct LogStoreConfig { @@ -58,6 +162,9 @@ pub struct LogStoreConfig { /// become visible immediately. #[async_trait::async_trait] pub trait LogStore: Sync + Send { + /// Return the name of this LogStore implementation + fn name(&self) -> String; + /// Read data for commit entry with the given version. async fn read_commit_entry(&self, version: i64) -> DeltaResult>; @@ -78,7 +185,10 @@ pub trait LogStore: Sync + Send { fn object_store(&self) -> Arc; /// [Path] to Delta log - fn to_uri(&self, location: &Path) -> String; + fn to_uri(&self, location: &Path) -> String { + let root = &self.config().location; + to_uri(root, location) + } /// Get fully qualified uri for table root fn root_uri(&self) -> String { @@ -112,12 +222,56 @@ pub trait LogStore: Sync + Send { /// registering/fetching. In our case the scheme is hard-coded to "delta-rs", so to get a unique /// host we convert the location from this `LogStore` to a valid name, combining the /// original scheme, host and path with invalid characters replaced. - fn object_store_url(&self) -> ObjectStoreUrl; + fn object_store_url(&self) -> ObjectStoreUrl { + crate::logstore::object_store_url(&self.config().location) + } /// Get configuration representing configured log store. 
fn config(&self) -> &LogStoreConfig; } +#[cfg(feature = "datafusion")] +fn object_store_url(location: &Url) -> ObjectStoreUrl { + use object_store::path::DELIMITER; + ObjectStoreUrl::parse(format!( + "delta-rs://{}-{}{}", + location.scheme(), + location.host_str().unwrap_or("-"), + location.path().replace(DELIMITER, "-").replace(':', "-") + )) + .expect("Invalid object store url.") +} + +/// TODO +pub fn to_uri(root: &Url, location: &Path) -> String { + match root.scheme() { + "file" => { + #[cfg(windows)] + let uri = format!( + "{}/{}", + root.as_ref().trim_end_matches('/'), + location.as_ref() + ) + .replace("file:///", ""); + #[cfg(unix)] + let uri = format!( + "{}/{}", + root.as_ref().trim_end_matches('/'), + location.as_ref() + ) + .replace("file://", ""); + uri + } + _ => { + if location.as_ref().is_empty() || location.as_ref() == "/" { + root.as_ref().to_string() + } else { + format!("{}/{}", root.as_ref(), location.as_ref()) + } + } + } +} + /// Reads a commit and gets list of actions pub async fn get_actions( version: i64, @@ -199,50 +353,6 @@ lazy_static! { static ref DELTA_LOG_REGEX: Regex = Regex::new(r"(\d{20})\.(json|checkpoint).*$").unwrap(); } -fn to_uri(root: &Url, location: &Path) -> String { - match root.scheme() { - "file" => { - #[cfg(windows)] - let uri = format!( - "{}/{}", - root.as_ref().trim_end_matches('/'), - location.as_ref() - ) - .replace("file:///", ""); - #[cfg(unix)] - let uri = format!( - "{}/{}", - root.as_ref().trim_end_matches('/'), - location.as_ref() - ) - .replace("file://", ""); - uri - } - _ => { - if location.as_ref().is_empty() || location.as_ref() == "/" { - root.as_ref().to_string() - } else { - format!("{}/{}", root.as_ref(), location.as_ref()) - } - } - } -} - -#[cfg(feature = "datafusion")] -fn object_store_url(location: &Url) -> ObjectStoreUrl { - // we are certain, that the URL can be parsed, since - // we make sure when we are parsing the table uri - - use object_store::path::DELIMITER; - ObjectStoreUrl::parse(format!( - "delta-rs://{}-{}{}", - location.scheme(), - location.host_str().unwrap_or("-"), - location.path().replace(DELIMITER, "-").replace(':', "-") - )) - .expect("Invalid object store url.") -} - /// Extract version from a file name in the delta log pub fn extract_version_from_filename(name: &str) -> Option { DELTA_LOG_REGEX @@ -250,7 +360,11 @@ pub fn extract_version_from_filename(name: &str) -> Option { .map(|captures| captures.get(1).unwrap().as_str().parse().unwrap()) } -async fn get_latest_version(log_store: &dyn LogStore, current_version: i64) -> DeltaResult { +/// Default implementation for retrieving the latest version +pub async fn get_latest_version( + log_store: &dyn LogStore, + current_version: i64, +) -> DeltaResult { let version_start = match get_last_checkpoint(log_store).await { Ok(last_check_point) => last_check_point.version, Err(ProtocolError::CheckpointNotFound) => { @@ -296,7 +410,10 @@ async fn get_latest_version(log_store: &dyn LogStore, current_version: i64) -> D } /// Read delta log for a specific version -async fn read_commit_entry(storage: &dyn ObjectStore, version: i64) -> DeltaResult> { +pub async fn read_commit_entry( + storage: &dyn ObjectStore, + version: i64, +) -> DeltaResult> { let commit_uri = commit_uri_from_version(version); match storage.get(&commit_uri).await { Ok(res) => Ok(Some(res.bytes().await?)), @@ -305,7 +422,8 @@ async fn read_commit_entry(storage: &dyn ObjectStore, version: i64) -> DeltaResu } } -async fn write_commit_entry( +/// Default implementation for writing a commit entry 
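+///
+/// The temporary commit file at `tmp_commit` is moved into the Delta log as the
+/// commit for `version`; the operation is expected to fail when a commit for
+/// that version already exists, which is how conflicting concurrent writers are
+/// detected.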
+pub async fn write_commit_entry( storage: &dyn ObjectStore, version: i64, tmp_commit: &Path, @@ -326,9 +444,29 @@ async fn write_commit_entry( Ok(()) } -#[cfg(feature = "datafusion")] #[cfg(test)] mod tests { + use super::*; + + #[test] + fn logstore_with_invalid_url() { + let location = Url::parse("nonexistent://table").unwrap(); + let store = logstore_for(location, HashMap::default()); + assert!(store.is_err()); + } + + #[test] + fn logstore_with_memory() { + let location = Url::parse("memory://table").unwrap(); + let store = logstore_for(location, HashMap::default()); + assert!(store.is_ok()); + } +} + +#[cfg(feature = "datafusion")] +#[cfg(test)] +mod datafusion_tests { + use super::*; use url::Url; #[tokio::test] @@ -345,8 +483,8 @@ mod tests { let url_2 = Url::parse(location_2).unwrap(); assert_ne!( - super::object_store_url(&url_1).as_str(), - super::object_store_url(&url_2).as_str(), + object_store_url(&url_1).as_str(), + object_store_url(&url_2).as_str(), ); } } diff --git a/crates/deltalake-core/src/operations/convert_to_delta.rs b/crates/deltalake-core/src/operations/convert_to_delta.rs index 97cb08f560..48dc90b2dc 100644 --- a/crates/deltalake-core/src/operations/convert_to_delta.rs +++ b/crates/deltalake-core/src/operations/convert_to_delta.rs @@ -6,7 +6,7 @@ use crate::{ logstore::{LogStore, LogStoreRef}, operations::create::CreateBuilder, protocol::SaveMode, - storage::config::configure_log_store, + table::builder::ensure_table_uri, table::config::DeltaConfigKey, DeltaResult, DeltaTable, DeltaTableError, DeltaTablePartition, ObjectStoreError, NULL_PARTITION_VALUE_DATA_PATH, @@ -55,6 +55,8 @@ enum Error { DeltaTableAlready, #[error("Location must be provided to convert a Parquet table to a Delta table")] MissingLocation, + #[error("The location provided must be a valid URL")] + InvalidLocation(#[from] url::ParseError), } impl From for DeltaTableError { @@ -231,7 +233,10 @@ impl ConvertToDeltaBuilder { let log_store = if let Some(log_store) = self.log_store { log_store } else if let Some(location) = self.location { - configure_log_store(&location, self.storage_options.unwrap_or_default(), None)? + crate::logstore::logstore_for( + ensure_table_uri(location)?, + self.storage_options.unwrap_or_default(), + )? } else { return Err(Error::MissingLocation); }; @@ -389,11 +394,11 @@ impl std::future::IntoFuture for ConvertToDeltaBuilder { #[cfg(test)] mod tests { - use super::{configure_log_store, ConvertToDeltaBuilder, DeltaTable, LogStoreRef, StructField}; + use super::*; use crate::{ kernel::schema::{DataType, PrimitiveType}, open_table, - storage::config::StorageOptions, + storage::StorageOptions, Path, }; use itertools::Itertools; @@ -424,7 +429,9 @@ mod tests { } fn log_store(path: impl Into) -> LogStoreRef { - configure_log_store(&path.into(), StorageOptions::default(), None) + let path: String = path.into(); + let location = ensure_table_uri(path).expect("Failed to get the URI from the path"); + crate::logstore::logstore_for(location, StorageOptions::default()) .expect("Failed to create an object store") } @@ -442,7 +449,9 @@ mod tests { // Copy all files to a temp directory to perform testing. 
Skip Delta log copy_files(format!("{}/{}", env!("CARGO_MANIFEST_DIR"), path), temp_dir); let builder = if from_path { - ConvertToDeltaBuilder::new().with_location(temp_dir) + ConvertToDeltaBuilder::new().with_location( + ensure_table_uri(temp_dir).expect("Failed to turn temp dir into a URL"), + ) } else { ConvertToDeltaBuilder::new().with_log_store(log_store(temp_dir)) }; @@ -519,7 +528,7 @@ mod tests { // Test Parquet files in object store location #[tokio::test] async fn test_convert_to_delta() { - let path = "tests/data/delta-0.8.0-date"; + let path = "../deltalake-test/tests/data/delta-0.8.0-date"; let table = create_delta_table(path, Vec::new(), false).await; let action = table .get_active_add_actions_by_partitions(&[]) @@ -545,7 +554,7 @@ mod tests { &[], ); - let path = "tests/data/delta-0.8.0-null-partition"; + let path = "../deltalake-test/tests/data/delta-0.8.0-null-partition"; let table = create_delta_table( path, vec![schema_field("k", PrimitiveType::String, true)], @@ -570,7 +579,7 @@ mod tests { ], ); - let path = "tests/data/delta-0.8.0-special-partition"; + let path = "../deltalake-test/tests/data/delta-0.8.0-special-partition"; let table = create_delta_table( path, vec![schema_field("x", PrimitiveType::String, true)], @@ -601,7 +610,7 @@ mod tests { ], ); - let path = "tests/data/delta-0.8.0-partitioned"; + let path = "../deltalake-test/tests/data/delta-0.8.0-partitioned"; let table = create_delta_table( path, vec![ @@ -668,7 +677,7 @@ mod tests { // Test opening the newly created Delta table #[tokio::test] async fn test_open_created_delta_table() { - let path = "tests/data/delta-0.2.0"; + let path = "../deltalake-test/tests/data/delta-0.2.0"; let table = open_created_delta_table(path, Vec::new()).await; assert_delta_table( table, @@ -687,7 +696,7 @@ mod tests { &[], ); - let path = "tests/data/delta-0.8-empty"; + let path = "../deltalake-test/tests/data/delta-0.8-empty"; let table = open_created_delta_table(path, Vec::new()).await; assert_delta_table( table, @@ -701,7 +710,7 @@ mod tests { &[], ); - let path = "tests/data/delta-0.8.0"; + let path = "../deltalake-test/tests/data/delta-0.8.0"; let table = open_created_delta_table(path, Vec::new()).await; assert_delta_table( table, @@ -720,7 +729,7 @@ mod tests { // Test Parquet files in path #[tokio::test] async fn test_convert_to_delta_from_path() { - let path = "tests/data/delta-2.2.0-partitioned-types"; + let path = "../deltalake-test/tests/data/delta-2.2.0-partitioned-types"; let table = create_delta_table( path, vec![ @@ -760,7 +769,7 @@ mod tests { ], ); - let path = "tests/data/delta-0.8.0-numeric-partition"; + let path = "../deltalake-test/tests/data/delta-0.8.0-numeric-partition"; let table = create_delta_table( path, vec![ @@ -819,7 +828,7 @@ mod tests { #[tokio::test] async fn test_partition_column_not_exist() { let _table = ConvertToDeltaBuilder::new() - .with_location("tests/data/delta-0.8.0-null-partition") + .with_location("../deltalake-test/tests/data/delta-0.8.0-null-partition") .with_partition_schema(vec![schema_field("foo", PrimitiveType::String, true)]) .await .expect_err( @@ -830,7 +839,7 @@ mod tests { #[tokio::test] async fn test_missing_partition_schema() { let _table = ConvertToDeltaBuilder::new() - .with_location("tests/data/delta-0.8.0-numeric-partition") + .with_location("../deltalake-test/tests/data/delta-0.8.0-numeric-partition") .await .expect_err("The schema of a partition column is not provided by user. 
Should error"); } @@ -838,7 +847,7 @@ mod tests { #[tokio::test] async fn test_delta_table_already() { let _table = ConvertToDeltaBuilder::new() - .with_location("tests/data/delta-0.2.0") + .with_location("../deltalake-test/tests/data/delta-0.2.0") .await .expect_err("The given location is already a delta table location. Should error"); } diff --git a/crates/deltalake-core/src/operations/load.rs b/crates/deltalake-core/src/operations/load.rs index 610f86dee6..0189381922 100644 --- a/crates/deltalake-core/src/operations/load.rs +++ b/crates/deltalake-core/src/operations/load.rs @@ -88,7 +88,7 @@ mod tests { #[tokio::test] async fn test_load_local() -> TestResult { - let table = DeltaTableBuilder::from_uri("./tests/data/delta-0.8.0") + let table = DeltaTableBuilder::from_uri("../deltalake-test/tests/data/delta-0.8.0") .load() .await .unwrap(); diff --git a/crates/deltalake-core/src/operations/vacuum.rs b/crates/deltalake-core/src/operations/vacuum.rs index 7b321400e6..03d9cffed1 100644 --- a/crates/deltalake-core/src/operations/vacuum.rs +++ b/crates/deltalake-core/src/operations/vacuum.rs @@ -404,7 +404,9 @@ mod tests { #[tokio::test] async fn vacuum_delta_8_0_table() { - let table = open_table("./tests/data/delta-0.8.0").await.unwrap(); + let table = open_table("../deltalake-test/tests/data/delta-0.8.0") + .await + .unwrap(); let result = VacuumBuilder::new(table.log_store, table.state.clone()) .with_retention_period(Duration::hours(1)) @@ -413,7 +415,9 @@ mod tests { assert!(result.is_err()); - let table = open_table("./tests/data/delta-0.8.0").await.unwrap(); + let table = open_table("../deltalake-test/tests/data/delta-0.8.0") + .await + .unwrap(); let (table, result) = VacuumBuilder::new(table.log_store, table.state) .with_retention_period(Duration::hours(0)) .with_dry_run(true) diff --git a/crates/deltalake-core/src/protocol/mod.rs b/crates/deltalake-core/src/protocol/mod.rs index 311f6dac7e..53b2f471d1 100644 --- a/crates/deltalake-core/src/protocol/mod.rs +++ b/crates/deltalake-core/src/protocol/mod.rs @@ -4,8 +4,6 @@ #[cfg(all(feature = "arrow", feature = "parquet"))] pub mod checkpoints; -#[cfg(feature = "parquet2")] -pub mod parquet2_read; #[cfg(feature = "parquet")] mod parquet_read; mod time_utils; @@ -60,14 +58,10 @@ pub enum ProtocolError { #[error("Generic action error: {0}")] Generic(String), - #[cfg(any(feature = "parquet", feature = "parquet2"))] + #[cfg(feature = "parquet")] /// Error returned when parsing checkpoint parquet using the parquet crate. #[error("Failed to parse parquet checkpoint: {source}")] ParquetParseError { - /// Parquet error details returned when parsing the checkpoint parquet - #[cfg(feature = "parquet2")] - #[from] - source: parquet2::error::Error, /// Parquet error details returned when parsing the checkpoint parquet #[cfg(feature = "parquet")] #[from] @@ -235,16 +229,10 @@ pub struct StatsParsed { /// Contains a value smaller than all values present in the file for all columns. #[cfg(feature = "parquet")] pub min_values: HashMap, - /// Contains a value smaller than all values present in the file for all columns. - #[cfg(feature = "parquet2")] - pub min_values: HashMap, /// Contains a value larger than all values present in the file for all columns. #[cfg(feature = "parquet")] /// Contains a value larger than all values present in the file for all columns. pub max_values: HashMap, - #[cfg(feature = "parquet2")] - /// Contains a value larger than all values present in the file for all columns. 
- pub max_values: HashMap, /// The number of null values for all columns. pub null_count: HashMap, } @@ -272,7 +260,7 @@ impl Eq for Add {} impl Add { /// Get whatever stats are available. Uses (parquet struct) parsed_stats if present falling back to json stats. - #[cfg(any(feature = "parquet", feature = "parquet2"))] + #[cfg(feature = "parquet")] pub fn get_stats(&self) -> Result, serde_json::error::Error> { match self.get_stats_parsed() { Ok(Some(stats)) => Ok(Some(stats)), @@ -288,7 +276,7 @@ impl Add { } /// Get whatever stats are available. - #[cfg(not(any(feature = "parquet", feature = "parquet2")))] + #[cfg(not(any(feature = "parquet")))] pub fn get_stats(&self) -> Result, serde_json::error::Error> { self.get_json_stats() } @@ -912,7 +900,7 @@ mod tests { #[tokio::test] async fn test_with_partitions() { // test table with partitions - let path = "./tests/data/delta-0.8.0-null-partition"; + let path = "../deltalake-test/tests/data/delta-0.8.0-null-partition"; let table = crate::open_table(path).await.unwrap(); let actions = table.get_state().add_actions_table(true).unwrap(); let actions = sort_batch_by(&actions, "path").unwrap(); @@ -951,7 +939,7 @@ mod tests { #[tokio::test] async fn test_with_deletion_vector() { // test table with partitions - let path = "./tests/data/table_with_deletion_logs"; + let path = "../deltalake-test/tests/data/table_with_deletion_logs"; let table = crate::open_table(path).await.unwrap(); let actions = table.get_state().add_actions_table(true).unwrap(); let actions = sort_batch_by(&actions, "path").unwrap(); @@ -1057,7 +1045,7 @@ mod tests { #[tokio::test] async fn test_without_partitions() { // test table without partitions - let path = "./tests/data/simple_table"; + let path = "../deltalake-test/tests/data/simple_table"; let table = crate::open_table(path).await.unwrap(); let actions = table.get_state().add_actions_table(true).unwrap(); @@ -1115,7 +1103,7 @@ mod tests { #[tokio::test] async fn test_with_column_mapping() { // test table with column mapping and partitions - let path = "./tests/data/table_with_column_mapping"; + let path = "../deltalake-test/tests/data/table_with_column_mapping"; let table = crate::open_table(path).await.unwrap(); let actions = table.get_state().add_actions_table(true).unwrap(); let expected_columns: Vec<(&str, ArrayRef)> = vec![ @@ -1189,7 +1177,7 @@ mod tests { #[tokio::test] async fn test_with_stats() { // test table with stats - let path = "./tests/data/delta-0.8.0"; + let path = "../deltalake-test/tests/data/delta-0.8.0"; let table = crate::open_table(path).await.unwrap(); let actions = table.get_state().add_actions_table(true).unwrap(); let actions = sort_batch_by(&actions, "path").unwrap(); @@ -1233,7 +1221,7 @@ mod tests { #[tokio::test] async fn test_only_struct_stats() { // test table with no json stats - let path = "./tests/data/delta-1.2.1-only-struct-stats"; + let path = "../deltalake-test/tests/data/delta-1.2.1-only-struct-stats"; let mut table = crate::open_table(path).await.unwrap(); table.load_version(1).await.unwrap(); diff --git a/crates/deltalake-core/src/protocol/parquet2_read/boolean.rs b/crates/deltalake-core/src/protocol/parquet2_read/boolean.rs deleted file mode 100644 index e68971be42..0000000000 --- a/crates/deltalake-core/src/protocol/parquet2_read/boolean.rs +++ /dev/null @@ -1,76 +0,0 @@ -use parquet2::encoding::hybrid_rle::BitmapIter; -use parquet2::metadata::ColumnDescriptor; -use parquet2::page::DataPage; - -use super::validity::ValidityRowIndexIter; -use super::{split_page, ActionVariant, 
ParseError}; -use crate::kernel::Action; - -/// Parquet dictionary primitive value reader -pub struct SomeBooleanValueIter<'a> { - valid_row_idx_iter: ValidityRowIndexIter<'a>, - value_iter: BitmapIter<'a>, -} - -impl<'a> SomeBooleanValueIter<'a> { - /// Create parquet primitive value reader - pub fn try_new( - page: &'a DataPage, - descriptor: &'a ColumnDescriptor, - ) -> Result { - let (max_def_level, validity_iter, values_buffer) = split_page(page, descriptor)?; - - let valid_row_idx_iter = ValidityRowIndexIter::new(max_def_level, validity_iter); - let value_len_upper_bound = values_buffer.len() * 8; - let value_iter = BitmapIter::new(values_buffer, 0, value_len_upper_bound); - Ok(Self { - valid_row_idx_iter, - value_iter, - }) - } -} - -impl<'a> Iterator for SomeBooleanValueIter<'a> { - type Item = Result<(usize, bool), ParseError>; - - fn next(&mut self) -> Option { - self.valid_row_idx_iter.next().map(|idx_result| { - idx_result.map(|idx| { - let value = self.value_iter.next().unwrap(); - (idx, value) - }) - }) - } -} - -#[inline] -pub fn for_each_boolean_field_value( - actions: &mut Vec>, - page: &DataPage, - descriptor: &ColumnDescriptor, - set_fn: SetFn, -) -> Result<(), ParseError> -where - ActType: ActionVariant, - SetFn: Fn(&mut ActType, bool), -{ - #[cfg(debug_assertions)] - { - use parquet2::schema::types::PhysicalType; - if page.descriptor.primitive_type.physical_type != PhysicalType::Boolean { - return Err(ParseError::InvalidAction(format!( - "expect physical parquet type boolean, got {:?}", - page.descriptor.primitive_type, - ))); - } - } - - let some_value_iter = SomeBooleanValueIter::try_new(page, descriptor)?; - for entry in some_value_iter { - let (idx, value) = entry?; - let a = actions[idx].get_or_insert_with(ActType::default_action); - set_fn(ActType::try_mut_from_action(a)?, value); - } - - Ok(()) -} diff --git a/crates/deltalake-core/src/protocol/parquet2_read/dictionary/binary.rs b/crates/deltalake-core/src/protocol/parquet2_read/dictionary/binary.rs deleted file mode 100644 index ff7db3c524..0000000000 --- a/crates/deltalake-core/src/protocol/parquet2_read/dictionary/binary.rs +++ /dev/null @@ -1,48 +0,0 @@ -use parquet2::encoding::get_length; -use parquet2::error::Error; - -#[derive(Debug)] -pub struct BinaryPageDict<'a> { - values: Vec<&'a [u8]>, -} - -impl<'a> BinaryPageDict<'a> { - pub fn new(values: Vec<&'a [u8]>) -> Self { - Self { values } - } - - #[inline] - pub fn value(&self, index: usize) -> Result<&[u8], Error> { - self.values - .get(index) - .map(|v| *v) - .ok_or_else(|| Error::OutOfSpec("invalid index".to_string())) - } -} - -fn read_plain<'a>(bytes: &'a [u8], length: usize) -> Result, Error> { - let mut bytes = bytes; - let mut values = Vec::new(); - - for _ in 0..length { - let slot_length = get_length(bytes).unwrap(); - bytes = &bytes[4..]; - - if slot_length > bytes.len() { - return Err(Error::OutOfSpec( - "The string on a dictionary page has a length that is out of bounds".to_string(), - )); - } - let (result, remaining) = bytes.split_at(slot_length); - - values.push(result); - bytes = remaining; - } - - Ok(values) -} - -pub fn read<'a>(buf: &'a [u8], num_values: usize) -> Result, Error> { - let values = read_plain(buf, num_values)?; - Ok(BinaryPageDict::new(values)) -} diff --git a/crates/deltalake-core/src/protocol/parquet2_read/dictionary/mod.rs b/crates/deltalake-core/src/protocol/parquet2_read/dictionary/mod.rs deleted file mode 100644 index 16e1f3f7b1..0000000000 --- a/crates/deltalake-core/src/protocol/parquet2_read/dictionary/mod.rs 
+++ /dev/null @@ -1,2 +0,0 @@ -pub(crate) mod binary; -pub(crate) mod primitive; diff --git a/crates/deltalake-core/src/protocol/parquet2_read/dictionary/primitive.rs b/crates/deltalake-core/src/protocol/parquet2_read/dictionary/primitive.rs deleted file mode 100644 index 89b557c439..0000000000 --- a/crates/deltalake-core/src/protocol/parquet2_read/dictionary/primitive.rs +++ /dev/null @@ -1,19 +0,0 @@ -use parquet2::error::{Error, Result}; -use parquet2::types::{decode, NativeType}; - -pub fn read(buf: &[u8], num_values: usize) -> Result> { - let size_of = std::mem::size_of::(); - - let typed_size = num_values.wrapping_mul(size_of); - - let values = buf.get(..typed_size).ok_or_else(|| { - Error::OutOfSpec( - "The number of values declared in the dict page does not match the length of the page" - .to_string(), - ) - })?; - - let values = values.chunks_exact(size_of).map(decode::).collect(); - - Ok(values) -} diff --git a/crates/deltalake-core/src/protocol/parquet2_read/map.rs b/crates/deltalake-core/src/protocol/parquet2_read/map.rs deleted file mode 100644 index df4dc94ab7..0000000000 --- a/crates/deltalake-core/src/protocol/parquet2_read/map.rs +++ /dev/null @@ -1,111 +0,0 @@ -use parquet2::metadata::ColumnDescriptor; -use parquet2::page::{DataPage, DictPage}; - -use super::string::for_each_repeated_string_field_value_with_idx; -use super::{ActionVariant, ParseError}; -use crate::kernel::Action; - -#[derive(Default)] -pub struct MapState { - keys: Option)>>, - values: Option)>>, -} - -pub fn for_each_map_field_value( - field: &[String], - actions: &mut Vec>, - page: &DataPage, - dict: &Option, - descriptor: &ColumnDescriptor, - state: &mut MapState, - set_map_fn: SetMapFn, -) -> Result<(), ParseError> -where - ActType: ActionVariant, - SetMapFn: Fn(&mut ActType, (Vec, Vec>)), -{ - debug_assert!(field[0] == "key_value"); - #[cfg(debug_assertions)] - { - use parquet2::schema::types::PhysicalType; - if page.descriptor.primitive_type.physical_type != PhysicalType::ByteArray { - return Err(ParseError::InvalidAction(format!( - "expect parquet utf8 type for map key/value, got primitive type: {:?}", - page.descriptor.primitive_type, - ))); - } - } - - match field[1].as_str() { - "key" => { - let mut keys = vec![]; - for_each_repeated_string_field_value_with_idx( - page, - dict, - descriptor, - |result: Result<(usize, Vec), ParseError>| -> Result<(), ParseError> { - let (row_idx, strings) = result?; - keys.push((row_idx, strings)); - Ok(()) - }, - )?; - state.keys = Some(keys); - } - "value" => { - let mut values = vec![]; - for_each_repeated_string_field_value_with_idx( - page, - dict, - descriptor, - |result: Result<(usize, Vec), ParseError>| -> Result<(), ParseError> { - let (row_idx, strings) = result?; - values.push((row_idx, strings)); - Ok(()) - }, - )?; - state.values = Some(values); - } - _ => { - return Err(ParseError::InvalidAction(format!( - "Unexpected map key: {:?}", - field, - ))); - } - } - - if state.keys.is_some() && state.values.is_some() { - let keys = state.keys.take().unwrap(); - let values = state.values.take().unwrap(); - - let mut values_iter = values.into_iter().peekable(); - - keys.into_iter() - .try_for_each(|(key_row_idx, keys)| -> Result<(), ParseError> { - let (row_idx, (keys, vals)) = match values_iter.peek() { - Some((val_row_idx, _)) if *val_row_idx == key_row_idx => { - let (_, vals) = values_iter.next().unwrap(); - ( - key_row_idx, - ( - keys, - vals.into_iter() - .map(|val| if val == "" { None } else { Some(val) }) - .collect(), - ), - ) - } - _ => { - let 
vals = std::iter::repeat(None).take(keys.len()).collect(); - (key_row_idx, (keys, vals)) - } - }; - - let a = actions[row_idx].get_or_insert_with(ActType::default_action); - set_map_fn(ActType::try_mut_from_action(a)?, (keys, vals)); - - Ok(()) - })?; - } - - Ok(()) -} diff --git a/crates/deltalake-core/src/protocol/parquet2_read/mod.rs b/crates/deltalake-core/src/protocol/parquet2_read/mod.rs deleted file mode 100644 index d07ae88ae0..0000000000 --- a/crates/deltalake-core/src/protocol/parquet2_read/mod.rs +++ /dev/null @@ -1,898 +0,0 @@ -//! Parquet deserialization for Action enum - -use std::collections::HashMap; - -use log::warn; -use parquet2::encoding::hybrid_rle; -use parquet2::metadata::ColumnDescriptor; -use parquet2::page::{DataPage, DictPage, Page}; -use parquet2::read::decompress; -use parquet2::read::get_page_iterator; -use parquet2::read::levels::get_bit_width; - -use super::ProtocolError; -use crate::kernel::{ - Action, Add, CommitInfo, Metadata, Protocol, ReaderFeatures, Remove, Txn, WriterFeatures, -}; -use boolean::for_each_boolean_field_value; -use map::for_each_map_field_value; -use primitive::for_each_primitive_field_value; -use string::{for_each_repeated_string_field_value, for_each_string_field_value}; - -mod boolean; -mod dictionary; -mod map; -mod primitive; -mod stats; -mod string; -mod validity; - -/// Parquet deserialization error -#[derive(thiserror::Error, Debug)] -pub enum ParseError { - /// Generic parsing error - #[error("{0}")] - Generic(String), - /// Invalid action found during parsing - #[error("Invalid action: {0}")] - InvalidAction(String), - /// Error returned when parsing checkpoint parquet using parquet2 crate. - #[error("Failed to parse parquet: {}", .source)] - Parquet { - /// Parquet error details returned when parsing the checkpoint parquet - #[from] - source: parquet2::error::Error, - }, -} - -impl From for ProtocolError { - fn from(value: ParseError) -> Self { - match value { - ParseError::Generic(msg) => Self::Generic(msg), - ParseError::InvalidAction(msg) => Self::InvalidRow(msg), - ParseError::Parquet { source } => Self::ParquetParseError { source }, - } - } -} - -#[derive(Default)] -struct DeserState { - add_partition_values: map::MapState, - add_tags: map::MapState, - remove_partition_values: map::MapState, - remove_tags: map::MapState, - metadata_fromat_options: map::MapState, - metadata_configuration: map::MapState, -} - -fn hashmap_from_kvpairs( - keys: impl IntoIterator, - values: impl IntoIterator, -) -> HashMap -where - Key: std::hash::Hash + std::cmp::Eq, -{ - keys.into_iter().zip(values.into_iter()).collect() -} - -fn split_page<'a>( - page: &'a DataPage, - descriptor: &'a ColumnDescriptor, -) -> Result<(i16, hybrid_rle::HybridRleDecoder<'a>, &'a [u8]), ParseError> { - let (_rep_levels, def_levels_buf, values_buf) = parquet2::page::split_buffer(page)?; - - let max_def_level = descriptor.descriptor.max_def_level; - let def_bit_width = get_bit_width(max_def_level); - let validity_iter = - hybrid_rle::HybridRleDecoder::try_new(def_levels_buf, def_bit_width, page.num_values())?; - - Ok((max_def_level, validity_iter, values_buf)) -} - -fn split_page_nested<'a>( - page: &'a DataPage, - descriptor: &'a ColumnDescriptor, -) -> Result< - ( - i16, - hybrid_rle::HybridRleDecoder<'a>, - i16, - hybrid_rle::HybridRleDecoder<'a>, - &'a [u8], - ), - ParseError, -> { - let (rep_levels, def_levels_buf, values_buf) = parquet2::page::split_buffer(page)?; - - let max_rep_level = descriptor.descriptor.max_rep_level; - let rep_bit_width = 
get_bit_width(max_rep_level); - let rep_iter = - hybrid_rle::HybridRleDecoder::try_new(rep_levels, rep_bit_width, page.num_values())?; - - let max_def_level = descriptor.descriptor.max_def_level; - let def_bit_width = get_bit_width(max_def_level); - let validity_iter = - hybrid_rle::HybridRleDecoder::try_new(def_levels_buf, def_bit_width, page.num_values())?; - - Ok(( - max_rep_level, - rep_iter, - max_def_level, - validity_iter, - values_buf, - )) -} - -/// Trait for conversion between concrete action struct and Action enum variant -pub trait ActionVariant { - /// Conrete action struct type - type Variant; - - /// Return action struct wrapped in corresponding Action enum variant - fn default_action() -> Action; - - /// Extract action struct from Action enum - fn try_mut_from_action(a: &mut Action) -> Result<&mut Self, ParseError>; -} - -impl ActionVariant for Add { - type Variant = Add; - - fn default_action() -> Action { - Action::Add(Self::default()) - } - - fn try_mut_from_action(a: &mut Action) -> Result<&mut Self, ParseError> { - match a { - Action::Add(v) => Ok(v), - _ => Err(ParseError::Generic(format!( - "expect Add action, got: {:?}", - a - ))), - } - } -} - -impl ActionVariant for Remove { - type Variant = Remove; - - fn default_action() -> Action { - Action::Remove(Self { - data_change: true, - extended_file_metadata: Some(false), - ..Default::default() - }) - } - - fn try_mut_from_action(a: &mut Action) -> Result<&mut Self, ParseError> { - match a { - Action::Remove(v) => Ok(v), - _ => Err(ParseError::Generic(format!( - "expect remove action, got: {:?}", - a - ))), - } - } -} - -impl ActionVariant for Metadata { - type Variant = Metadata; - - fn default_action() -> Action { - Action::Metadata(Self::default()) - } - - fn try_mut_from_action(a: &mut Action) -> Result<&mut Self, ParseError> { - match a { - Action::Metadata(v) => Ok(v), - _ => Err(ParseError::Generic(format!( - "expect metadata action, got: {:?}", - a - ))), - } - } -} - -impl ActionVariant for Txn { - type Variant = Txn; - - fn default_action() -> Action { - Action::Txn(Self::default()) - } - - fn try_mut_from_action(a: &mut Action) -> Result<&mut Self, ParseError> { - match a { - Action::Txn(v) => Ok(v), - _ => Err(ParseError::Generic(format!( - "expect txn action, got: {:?}", - a - ))), - } - } -} - -impl ActionVariant for Protocol { - type Variant = Protocol; - - fn default_action() -> Action { - Action::Protocol(Self::default()) - } - - fn try_mut_from_action(a: &mut Action) -> Result<&mut Self, ParseError> { - match a { - Action::Protocol(v) => Ok(v), - _ => Err(ParseError::Generic(format!( - "expect protocol action, got: {:?}", - a - ))), - } - } -} - -impl ActionVariant for CommitInfo { - type Variant = CommitInfo; - - fn default_action() -> Action { - Action::CommitInfo(CommitInfo::default()) - } - - fn try_mut_from_action(a: &mut Action) -> Result<&mut Self, ParseError> { - match a { - Action::CommitInfo(v) => Ok(v), - _ => Err(ParseError::Generic(format!( - "expect commitInfo action, got: {:?}", - a - ))), - } - } -} - -fn deserialize_txn_column_page( - field: &[String], - actions: &mut Vec>, - page: &DataPage, - dict: &Option, - descriptor: &ColumnDescriptor, - _state: &mut DeserState, -) -> Result<(), ParseError> { - let f = field[0].as_ref(); - match f { - "version" => { - for_each_primitive_field_value( - actions, - page, - dict, - descriptor, - |action: &mut Txn, v: i64| action.version = v, - )?; - } - "appId" => { - for_each_string_field_value( - actions, - page, - dict, - descriptor, - |action: 
&mut Txn, v: String| action.app_id = v, - )?; - } - "lastUpdated" => { - for_each_primitive_field_value( - actions, - page, - dict, - descriptor, - |action: &mut Txn, v: i64| action.last_updated = Some(v), - )?; - } - _ => { - return Err(ParseError::InvalidAction(format!( - "Unexpected field `{}` in txn", - f - ))) - } - } - Ok(()) -} - -fn deserialize_add_column_page( - field: &[String], - actions: &mut Vec>, - page: &DataPage, - dict: &Option, - descriptor: &ColumnDescriptor, - state: &mut DeserState, -) -> Result<(), ParseError> { - let f = field[0].as_ref(); - match f { - "path" => { - for_each_string_field_value( - actions, - page, - dict, - descriptor, - |action: &mut Add, v: String| action.path = v, - )?; - } - "size" => { - for_each_primitive_field_value( - actions, - page, - dict, - descriptor, - |action: &mut Add, v: i64| action.size = v, - )?; - } - "partitionValues" => { - for_each_map_field_value( - &field[1..], - actions, - page, - dict, - descriptor, - &mut state.add_partition_values, - |action: &mut Add, v: (Vec, Vec>)| { - action.partition_values = hashmap_from_kvpairs(v.0, v.1); - }, - )?; - } - // FIXME support partitionValueParsed - "dataChange" => { - for_each_boolean_field_value( - actions, - page, - descriptor, - |action: &mut Add, v: bool| action.data_change = v, - )?; - } - "tags" => { - for_each_map_field_value( - &field[1..], - actions, - page, - dict, - descriptor, - &mut state.add_tags, - |action: &mut Add, v: (Vec, Vec>)| { - action.tags = Some(hashmap_from_kvpairs(v.0, v.1)); - }, - )?; - } - // FIXME: support statsParsed - "stats" => { - for_each_string_field_value( - actions, - page, - dict, - descriptor, - |action: &mut Add, v: String| action.stats = Some(v), - )?; - } - "modificationTime" => { - for_each_primitive_field_value( - actions, - page, - dict, - descriptor, - |action: &mut Add, v: i64| action.modification_time = v, - )?; - } - _ => { - warn!("Unexpected field `{}` in add", f); - } - } - Ok(()) -} - -fn deserialize_remove_column_page( - field: &[String], - actions: &mut Vec>, - page: &DataPage, - dict: &Option, - descriptor: &ColumnDescriptor, - state: &mut DeserState, -) -> Result<(), ParseError> { - let f = field[0].as_ref(); - match f { - "path" => { - for_each_string_field_value( - actions, - page, - dict, - descriptor, - |action: &mut Remove, v: String| action.path = v, - )?; - } - "deletionTimestamp" => { - for_each_primitive_field_value( - actions, - page, - dict, - descriptor, - |action: &mut Remove, v: i64| action.deletion_timestamp = Some(v), - )?; - } - "size" => { - for_each_primitive_field_value( - actions, - page, - dict, - descriptor, - |action: &mut Remove, v: i64| action.size = Some(v), - )?; - } - // FIXME support partitionValueParsed - "partitionValues" => { - for_each_map_field_value( - &field[1..], - actions, - page, - dict, - descriptor, - &mut state.remove_partition_values, - |action: &mut Remove, v: (Vec, Vec>)| { - action.partition_values = Some(hashmap_from_kvpairs(v.0, v.1)); - }, - )?; - } - "dataChange" => { - for_each_boolean_field_value( - actions, - page, - descriptor, - |action: &mut Remove, v: bool| action.data_change = v, - )?; - } - "extendedFileMetadata" => { - for_each_boolean_field_value( - actions, - page, - descriptor, - |action: &mut Remove, v: bool| action.extended_file_metadata = Some(v), - )?; - } - "tags" => { - for_each_map_field_value( - &field[1..], - actions, - page, - dict, - descriptor, - &mut state.remove_tags, - |action: &mut Remove, v: (Vec, Vec>)| { - action.tags = 
Some(hashmap_from_kvpairs(v.0, v.1)); - }, - )?; - } - _ => { - warn!("Unexpected field `{}` in remove", f); - } - } - Ok(()) -} - -fn deserialize_metadata_column_page( - field: &[String], - actions: &mut Vec>, - page: &DataPage, - dict: &Option, - descriptor: &ColumnDescriptor, - state: &mut DeserState, -) -> Result<(), ParseError> { - let f = field[0].as_ref(); - match f { - "id" => { - for_each_string_field_value( - actions, - page, - dict, - descriptor, - |action: &mut Metadata, v: String| action.id = v, - )?; - } - "name" => { - for_each_string_field_value( - actions, - page, - dict, - descriptor, - |action: &mut Metadata, v: String| action.name = Some(v), - )?; - } - "description" => { - for_each_string_field_value( - actions, - page, - dict, - descriptor, - |action: &mut Metadata, v: String| action.description = Some(v), - )?; - } - "format" => { - let sub_f = field[1].as_ref(); - match sub_f { - "provider" => { - for_each_string_field_value( - actions, - page, - dict, - descriptor, - |action: &mut Metadata, v: String| action.format.provider = v, - )?; - } - "options" => { - for_each_map_field_value( - &field[2..], - actions, - page, - dict, - descriptor, - &mut state.metadata_fromat_options, - |action: &mut Metadata, v: (Vec, Vec>)| { - action.format.options = hashmap_from_kvpairs(v.0, v.1); - }, - )?; - } - _ => { - return Err(ParseError::InvalidAction(format!( - "Unexpected field `{}` in metaData.format", - sub_f, - ))) - } - } - } - "schemaString" => { - for_each_string_field_value( - actions, - page, - dict, - descriptor, - |action: &mut Metadata, v: String| action.schema_string = v, - )?; - } - "partitionColumns" => { - for_each_repeated_string_field_value( - actions, - page, - dict, - descriptor, - |action: &mut Metadata, v: Vec| action.partition_columns = v, - )?; - } - "createdTime" => { - for_each_primitive_field_value( - actions, - page, - dict, - descriptor, - |action: &mut Metadata, v: i64| action.created_time = Some(v), - )?; - } - "configuration" => { - for_each_map_field_value( - &field[1..], - actions, - page, - dict, - descriptor, - &mut state.metadata_configuration, - |action: &mut Metadata, v: (Vec, Vec>)| { - action.configuration = hashmap_from_kvpairs(v.0, v.1); - }, - )?; - } - _ => { - warn!("Unexpected field `{}` in metaData", f); - } - } - Ok(()) -} - -fn deserialize_protocol_column_page( - field: &[String], - actions: &mut Vec>, - page: &DataPage, - dict: &Option, - descriptor: &ColumnDescriptor, - _state: &mut DeserState, -) -> Result<(), ParseError> { - let f = field[0].as_ref(); - match f { - "minReaderVersion" => { - for_each_primitive_field_value( - actions, - page, - dict, - descriptor, - |action: &mut Protocol, v: i32| action.min_reader_version = v, - )?; - } - "minWriterVersion" => { - for_each_primitive_field_value( - actions, - page, - dict, - descriptor, - |action: &mut Protocol, v: i32| action.min_writer_version = v, - )?; - } - "readerFeatures" => { - for_each_repeated_string_field_value( - actions, - page, - dict, - descriptor, - |action: &mut Protocol, v: Vec| { - action.reader_features = - Some(v.into_iter().map(ReaderFeatures::from).collect()); - }, - )?; - } - "writerFeatures" => { - for_each_repeated_string_field_value( - actions, - page, - dict, - descriptor, - |action: &mut Protocol, v: Vec| { - action.writer_features = - Some(v.into_iter().map(WriterFeatures::from).collect()); - }, - )?; - } - _ => { - warn!("Unexpected field `{}` in protocol", f); - } - } - Ok(()) -} - -fn deserialize_commit_info_column_page( - _obj_keys: &[String], 
- _actions: &mut Vec>, - _page: &DataPage, - _dict: &Option, - _descriptor: &ColumnDescriptor, - _state: &mut DeserState, -) -> Result<(), ParseError> { - // parquet snapshots shouldn't contain commit info - Ok(()) -} - -fn deserialize_cdc_column_page( - _field: &[String], - _actions: &mut Vec>, - _page: &DataPage, - _dict: &Option, - _descriptor: &ColumnDescriptor, - _state: &mut DeserState, -) -> Result<(), ParseError> { - // FIXME: support cdc action - Ok(()) -} - -// TODO: find a proper max size to avoid OOM -// see: https://github.com/jorgecarleitao/parquet2/pull/172 -const MAX_PARQUET_HEADER_SIZE: usize = usize::MAX; - -/// Return a vector of action from a given parquet row group -pub fn actions_from_row_group( - row_group: parquet2::metadata::RowGroupMetaData, - reader: &mut R, -) -> Result, ProtocolError> { - let row_count = row_group.num_rows(); - // TODO: reuse actions buffer - let mut actions: Vec> = vec![None; row_count as usize]; - let mut state = DeserState::default(); - - for column_metadata in row_group.columns() { - let column_desc = column_metadata.descriptor(); - let schema_path = &column_desc.path_in_schema; - - let deserialize_column_page = match schema_path[0].as_ref() { - "txn" => deserialize_txn_column_page, - "add" => deserialize_add_column_page, - "remove" => deserialize_remove_column_page, - "metaData" => deserialize_metadata_column_page, - "protocol" => deserialize_protocol_column_page, - "commitInfo" => deserialize_commit_info_column_page, - "cdc" => deserialize_cdc_column_page, - _ => { - return Err(ParseError::InvalidAction(format!( - "unexpected action: {}", - &schema_path[0] - )) - .into()); - } - }; - let field = &schema_path[1..]; - - let buffer = Vec::new(); - let pages = get_page_iterator( - column_metadata, - &mut *reader, - None, - buffer, - MAX_PARQUET_HEADER_SIZE, - )?; - - let mut decompress_buffer = vec![]; - let mut dict = None; - for maybe_page in pages { - // TODO: leverage null count and skip page if possible - let page = maybe_page?; - let page = decompress(page, &mut decompress_buffer)?; - - match page { - Page::Dict(page) => { - // the first page may be a dictionary page, which needs to be deserialized - // depending on your target in-memory format - dict = Some(page); - } - Page::Data(page) => { - deserialize_column_page( - field, - &mut actions, - // TODO: pass by value? 
- &page, - &dict, - column_desc, - &mut state, - )?; - } - } - } - } - - Ok(actions.into_iter().map(|a| a.unwrap()).collect()) -} - -#[cfg(test)] -mod tests { - use super::*; - use std::collections::HashMap; - use std::fs::File; - - #[test] - fn test_add_action_without_partition_values_and_stats() { - use parquet2::read::read_metadata; - - let path = "./tests/data/delta-0.2.0/_delta_log/00000000000000000003.checkpoint.parquet"; - let mut reader = File::open(path).unwrap(); - let meta_data = read_metadata(&mut reader).unwrap(); - - for row_group in meta_data.row_groups { - let actions = actions_from_row_group(row_group, &mut reader).unwrap(); - match &actions[0] { - Action::Protocol(protocol) => { - assert_eq!(protocol.min_reader_version, 1,); - assert_eq!(protocol.min_writer_version, 2,); - } - _ => panic!("expect protocol action"), - } - match &actions[1] { - Action::Metadata(meta_data) => { - assert_eq!(meta_data.id, "22ef18ba-191c-4c36-a606-3dad5cdf3830"); - assert_eq!(meta_data.name, None); - assert_eq!(meta_data.description, None); - assert_eq!( - meta_data.format, - crate::kernel::Format::new("parquet".to_string(), None), - ); - assert_eq!(meta_data.schema_string, "{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}"); - assert_eq!(meta_data.partition_columns.len(), 0); - assert_eq!(meta_data.created_time, Some(1564524294376)); - assert_eq!(meta_data.configuration, HashMap::new()); - } - _ => panic!("expect txn action, got: {:?}", &actions[1]), - } - - match &actions[2] { - Action::Txn(txn) => { - assert_eq!(txn.app_id, "e4a20b59-dd0e-4c50-b074-e8ae4786df30"); - assert_eq!(txn.version, 0); - assert_eq!(txn.last_updated, Some(1564524299648)); - } - _ => panic!("expect txn action, got: {:?}", &actions[1]), - } - match &actions[3] { - Action::Remove(remove) => { - assert_eq!( - remove.path, - "part-00000-512e1537-8aaa-4193-b8b4-bef3de0de409-c000.snappy.parquet" - ); - assert_eq!(remove.deletion_timestamp, Some(1564524298213)); - assert_eq!(remove.data_change, false); - assert_eq!(remove.extended_file_metadata, Some(false)); - assert_eq!(remove.partition_values, None); - assert_eq!(remove.size, None); - assert_eq!(remove.tags, None); - } - _ => panic!("expect remove action, got: {:?}", &actions[2]), - } - match &actions[9] { - Action::Add(add_action) => { - assert_eq!( - add_action.path, - "part-00001-c373a5bd-85f0-4758-815e-7eb62007a15c-c000.snappy.parquet" - ); - assert_eq!(add_action.size, 400); - assert_eq!(add_action.modification_time, 1564524297000); - assert_eq!(add_action.partition_values.len(), 0); - assert_eq!(add_action.data_change, false); - assert_eq!(add_action.stats, None); - assert_eq!(add_action.tags, None); - } - _ => panic!("expect add action, got: {:?}", &actions[9]), - } - } - } - - #[test] - fn test_add_action_with_partition_values() { - use parquet2::read::read_metadata; - - let path = "./tests/data/checkpoint_with_partitions/_delta_log/00000000000000000002.checkpoint.parquet"; - let mut reader = File::open(path).unwrap(); - let metadata = read_metadata(&mut reader).unwrap(); - - for row_group in metadata.row_groups { - let actions = actions_from_row_group(row_group, &mut reader).unwrap(); - match &actions[0] { - Action::Protocol(protocol) => { - assert_eq!(protocol.min_reader_version, 1,); - assert_eq!(protocol.min_writer_version, 2,); - } - _ => panic!("expect protocol action"), - } - match &actions[1] { - Action::Metadata(meta_data) => { - assert_eq!(meta_data.id, "94ba8468-c676-4468-b326-adde3ab9dcd2"); 
- assert_eq!(meta_data.name, None); - assert_eq!(meta_data.description, None); - assert_eq!( - meta_data.format, - crate::kernel::Format::new("parquet".to_string(), None), - ); - assert_eq!( - meta_data.schema_string, - r#"{"type":"struct","fields":[{"name":"id","type":"integer","nullable":true,"metadata":{}},{"name":"color","type":"string","nullable":true,"metadata":{}}]}"# - ); - assert_eq!(meta_data.partition_columns, vec!["color"]); - assert_eq!(meta_data.created_time, Some(1661662807027)); - assert_eq!(meta_data.configuration, HashMap::new()); - } - _ => panic!("expect txn action, got: {:?}", &actions[1]), - } - - match &actions[2] { - Action::Add(add_action) => { - assert_eq!(add_action.path, "f62d8868-d952-4f9d-8bb2-fd4e011ebf36"); - assert_eq!(add_action.size, 100); - assert_eq!(add_action.modification_time, 1661662807080); - assert_eq!(add_action.partition_values.len(), 1); - assert_eq!( - add_action.partition_values.get("color").unwrap(), - &Some("red".to_string()) - ); - assert_eq!(add_action.data_change, false); - assert_eq!(add_action.stats, None); - assert_eq!(add_action.tags, None); - } - _ => panic!("expect add action, got: {:?}", &actions[9]), - } - match &actions[3] { - Action::Add(add_action) => { - assert_eq!(add_action.path, "8ac7d8e1-daab-48ef-9d05-ec22fb4b0d2f"); - assert_eq!(add_action.size, 100); - assert_eq!(add_action.modification_time, 1661662807097); - assert_eq!(add_action.partition_values.len(), 1); - assert_eq!(add_action.partition_values.get("color").unwrap(), &None); - assert_eq!(add_action.data_change, false); - assert_eq!(add_action.stats, None); - assert_eq!(add_action.tags, None); - } - _ => panic!("expect add action, got: {:?}", &actions[9]), - } - } - } -} diff --git a/crates/deltalake-core/src/protocol/parquet2_read/primitive.rs b/crates/deltalake-core/src/protocol/parquet2_read/primitive.rs deleted file mode 100644 index 16cb850f05..0000000000 --- a/crates/deltalake-core/src/protocol/parquet2_read/primitive.rs +++ /dev/null @@ -1,185 +0,0 @@ -//! 
Parquet primitive type deserialization for Action enum - -use std::convert::TryInto; - -use parquet2::encoding::hybrid_rle; -use parquet2::encoding::Encoding; -use parquet2::metadata::ColumnDescriptor; -use parquet2::page::DataPage; -use parquet2::page::DictPage; -use parquet2::types::NativeType; - -use super::dictionary; -use super::validity::ValidityRowIndexIter; -use super::{split_page, ActionVariant, ParseError}; -use crate::kernel::Action; - -struct ExactChunksIter<'a, T: NativeType> { - chunks: std::slice::ChunksExact<'a, u8>, - phantom: std::marker::PhantomData, -} - -impl<'a, T: NativeType> ExactChunksIter<'a, T> { - #[inline] - pub fn new(slice: &'a [u8]) -> Self { - assert_eq!(slice.len() % std::mem::size_of::(), 0); - let chunks = slice.chunks_exact(std::mem::size_of::()); - Self { - chunks, - phantom: std::marker::PhantomData, - } - } -} - -impl<'a, T: NativeType> Iterator for ExactChunksIter<'a, T> { - type Item = T; - - #[inline] - fn next(&mut self) -> Option { - self.chunks.next().map(|chunk| { - let chunk: ::Bytes = match chunk.try_into() { - Ok(v) => v, - Err(_) => unreachable!(), - }; - T::from_le_bytes(chunk) - }) - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - self.chunks.size_hint() - } -} - -/// Parquet primitive value reader -pub struct SomePrimitiveValueIter<'a, T: NativeType> { - valid_row_idx_iter: ValidityRowIndexIter<'a>, - value_iter: ExactChunksIter<'a, T>, -} - -impl<'a, T: NativeType> SomePrimitiveValueIter<'a, T> { - /// Create parquet primitive value reader - pub fn try_new( - page: &'a DataPage, - descriptor: &'a ColumnDescriptor, - ) -> Result { - let (max_def_level, validity_iter, values_buffer) = split_page(page, descriptor)?; - let value_iter = ExactChunksIter::::new(values_buffer); - let valid_row_idx_iter = ValidityRowIndexIter::new(max_def_level, validity_iter); - Ok(Self { - value_iter, - valid_row_idx_iter, - }) - } -} - -impl<'a, T: NativeType> Iterator for SomePrimitiveValueIter<'a, T> { - type Item = Result<(usize, T), ParseError>; - - fn next(&mut self) -> Option { - self.valid_row_idx_iter - .next() - .map(|idx_result| idx_result.map(|idx| (idx, self.value_iter.next().unwrap()))) - } -} - -/// Parquet dictionary primitive value reader -pub struct SomeDictionaryPrimitiveValueIter<'a, T: NativeType> { - valid_row_idx_iter: ValidityRowIndexIter<'a>, - index_iter: hybrid_rle::HybridRleDecoder<'a>, - dict_values: Vec, -} - -impl<'a, T: NativeType> SomeDictionaryPrimitiveValueIter<'a, T> { - /// Create parquet primitive value reader - pub fn try_new( - page: &'a DataPage, - dict: &DictPage, - descriptor: &'a ColumnDescriptor, - ) -> Result { - let (max_def_level, validity_iter, values_buffer) = split_page(page, descriptor)?; - - let valid_row_idx_iter = ValidityRowIndexIter::new(max_def_level, validity_iter); - - let dict_values = dictionary::primitive::read::(&dict.buffer, dict.num_values)?; - - let indices_buffer = values_buffer; - let bit_width = indices_buffer[0]; - let indices_buffer = &indices_buffer[1..]; - - let additional = page.num_values(); - let index_iter = - hybrid_rle::HybridRleDecoder::try_new(indices_buffer, bit_width as u32, additional)?; - Ok(Self { - index_iter, - dict_values, - valid_row_idx_iter, - }) - } -} - -impl<'a, T: NativeType> Iterator for SomeDictionaryPrimitiveValueIter<'a, T> { - type Item = Result<(usize, T), ParseError>; - - fn next(&mut self) -> Option { - self.valid_row_idx_iter.next().map(|idx_result| { - let idx = idx_result?; - let dict_idx = self.index_iter.next().ok_or_else(|| { - 
ParseError::Generic(format!("No dict index matches row index: {}", idx)) - })??; - let value = self.dict_values[dict_idx as usize]; - Ok((idx, value)) - }) - } -} - -#[inline] -pub fn for_each_primitive_field_value( - actions: &mut Vec>, - page: &DataPage, - dict: &Option, - descriptor: &ColumnDescriptor, - set_fn: SetFn, -) -> Result<(), ParseError> -where - T: NativeType, - ActType: ActionVariant, - SetFn: Fn(&mut ActType, T), -{ - #[cfg(debug_assertions)] - if page.descriptor.primitive_type.physical_type != T::TYPE { - return Err(ParseError::InvalidAction(format!( - "expect physical parquet type {:?}, got {:?}", - T::TYPE, - page.descriptor.primitive_type, - ))); - } - - match (&page.encoding(), dict) { - (Encoding::Plain, None) => { - let some_value_iter = SomePrimitiveValueIter::::try_new(page, descriptor)?; - for entry in some_value_iter { - let (idx, value) = entry?; - let a = actions[idx].get_or_insert_with(ActType::default_action); - set_fn(ActType::try_mut_from_action(a)?, value); - } - } - (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict)) => { - let some_value_iter = - SomeDictionaryPrimitiveValueIter::try_new(page, &dict, descriptor)?; - for entry in some_value_iter { - let (idx, value) = entry?; - let a = actions[idx].get_or_insert_with(ActType::default_action); - set_fn(ActType::try_mut_from_action(a)?, value); - } - } - _ => { - return Err(ParseError::InvalidAction(format!( - "unsupported page encoding type for primitive column: {:?}", - page.encoding() - ))); - } - } - - Ok(()) -} diff --git a/crates/deltalake-core/src/protocol/parquet2_read/stats.rs b/crates/deltalake-core/src/protocol/parquet2_read/stats.rs deleted file mode 100644 index 689dfea6c0..0000000000 --- a/crates/deltalake-core/src/protocol/parquet2_read/stats.rs +++ /dev/null @@ -1,9 +0,0 @@ -use crate::protocol::{Add, ProtocolError, Stats}; - -impl Add { - /// Returns the composite HashMap representation of stats contained in the action if present. - /// Since stats are defined as optional in the protocol, this may be None. - pub fn get_stats_parsed(&self) -> Result, ProtocolError> { - Ok(None) - } -} diff --git a/crates/deltalake-core/src/protocol/parquet2_read/string.rs b/crates/deltalake-core/src/protocol/parquet2_read/string.rs deleted file mode 100644 index 391a9b9390..0000000000 --- a/crates/deltalake-core/src/protocol/parquet2_read/string.rs +++ /dev/null @@ -1,312 +0,0 @@ -//! 
Parquet string deserialization for Action enum - -use parquet2::encoding::hybrid_rle::HybridRleDecoder; -use parquet2::encoding::Encoding; -use parquet2::metadata::ColumnDescriptor; -use parquet2::page::{DataPage, DictPage}; - -use super::dictionary; -use super::dictionary::binary::BinaryPageDict; -use super::validity::{ValidityRepeatedRowIndexIter, ValidityRowIndexIter}; -use super::{split_page, split_page_nested, ActionVariant, ParseError}; -use crate::kernel::Action; - -pub trait StringValueIter<'a>: Iterator> { - fn try_from_encoded_values( - buffer: &'a [u8], - num_values: usize, - _dict: &'a Option, - ) -> Result - where - Self: Sized; -} - -pub struct PlainStringValueIter<'a> { - values_buffer: &'a [u8], -} - -impl<'a> StringValueIter<'a> for PlainStringValueIter<'a> { - fn try_from_encoded_values( - values_buffer: &'a [u8], - _num_values: usize, - _dict: &Option, - ) -> Result { - Ok(Self { values_buffer }) - } -} - -impl<'a> Iterator for PlainStringValueIter<'a> { - type Item = Result; - - fn next(&mut self) -> Option { - let bytes_len = parquet2::encoding::get_length(self.values_buffer).unwrap() as usize; - let bytes_end = bytes_len + 4; - // skip first 4 bytes (length) - let bytes = &self.values_buffer[4..bytes_end]; - self.values_buffer = &self.values_buffer[bytes_end..]; - - Some(Ok(std::str::from_utf8(bytes).unwrap().to_string())) - } -} - -pub struct DictionaryStringValueIter<'a> { - dict_idx_iter: HybridRleDecoder<'a>, - dict: BinaryPageDict<'a>, -} - -impl<'a> StringValueIter<'a> for DictionaryStringValueIter<'a> { - fn try_from_encoded_values( - values_buf: &'a [u8], - num_values: usize, - dict: &'a Option, - ) -> Result { - let bit_width = values_buf[0]; - let indices_buf = &values_buf[1..]; - let dict = dict.as_ref().unwrap(); - let binary_dict = dictionary::binary::read(&dict.buffer, dict.num_values)?; - - Ok(Self { - dict_idx_iter: HybridRleDecoder::try_new(indices_buf, bit_width.into(), num_values)?, - dict: binary_dict, - }) - } -} - -impl<'a> Iterator for DictionaryStringValueIter<'a> { - type Item = Result; - - fn next(&mut self) -> Option { - self.dict_idx_iter.next().map(|result| { - result - .map(|dict_idx| { - let dict_idx = dict_idx as usize; - std::str::from_utf8( - &self.dict.value(dict_idx).expect("Invalid dictionary index"), - ) - .unwrap() - .to_string() - }) - .map_err(|e| e.into()) - }) - } -} - -/// Parquet string value reader -pub struct SomeStringValueIter<'a, ValIter> -where - ValIter: StringValueIter<'a>, -{ - valid_row_idx_iter: ValidityRowIndexIter<'a>, - values_iter: ValIter, -} - -impl<'a, ValIter> SomeStringValueIter<'a, ValIter> -where - ValIter: StringValueIter<'a>, -{ - /// Create parquet string value reader - pub fn try_new( - page: &'a DataPage, - dict: &'a Option, - descriptor: &'a ColumnDescriptor, - ) -> Result { - let (max_def_level, validity_iter, values_buffer) = split_page(page, descriptor)?; - let valid_row_idx_iter = ValidityRowIndexIter::new(max_def_level, validity_iter); - Ok(Self { - valid_row_idx_iter, - // TODO: page.num_values is more than what's being packed in rle - values_iter: ValIter::try_from_encoded_values(values_buffer, page.num_values(), dict)?, - }) - } -} - -impl<'a, ValIter> Iterator for SomeStringValueIter<'a, ValIter> -where - ValIter: StringValueIter<'a>, -{ - type Item = Result<(usize, String), ParseError>; - - fn next(&mut self) -> Option { - self.valid_row_idx_iter.next().map(|result| { - let idx = result?; - let value = self.values_iter.next().ok_or_else(|| { - ParseError::Generic(format!("No string 
value matches row index: {}", idx)) - })??; - Ok((idx, value)) - }) - } -} - -/// Parquet repeated string value reader -pub struct SomeRepeatedStringValueIter<'a, ValIter> -where - ValIter: StringValueIter<'a>, -{ - repeated_row_idx_iter: ValidityRepeatedRowIndexIter<'a>, - values_iter: ValIter, -} - -impl<'a, ValIter> SomeRepeatedStringValueIter<'a, ValIter> -where - ValIter: StringValueIter<'a>, -{ - /// Create parquet string value reader - pub fn try_new( - page: &'a DataPage, - dict: &'a Option, - descriptor: &'a ColumnDescriptor, - ) -> Result { - let (max_rep_level, rep_iter, max_def_level, validity_iter, values_buffer) = - split_page_nested(page, descriptor)?; - let repeated_row_idx_iter = ValidityRepeatedRowIndexIter::new( - max_rep_level, - rep_iter, - max_def_level, - validity_iter, - ); - - Ok(Self { - values_iter: ValIter::try_from_encoded_values(values_buffer, page.num_values(), dict)?, - repeated_row_idx_iter, - }) - } -} - -impl<'a, ValIter> Iterator for SomeRepeatedStringValueIter<'a, ValIter> -where - ValIter: StringValueIter<'a>, -{ - type Item = Result<(usize, Vec), ParseError>; - - fn next(&mut self) -> Option { - self.repeated_row_idx_iter.next().map(|result| { - let (idx, item_count) = result?; - - let strings = (0..item_count) - .map(|i| { - self.values_iter.next().ok_or_else(|| { - ParseError::Generic(format!("No string value found list index: {}", i)) - })? - }) - .collect::, _>>()?; - - Ok((idx, strings)) - }) - } -} - -pub fn for_each_repeated_string_field_value_with_idx( - page: &DataPage, - dict: &Option, - descriptor: &ColumnDescriptor, - map_fn: MapFn, -) -> Result<(), ParseError> -where - MapFn: FnMut(Result<(usize, Vec), ParseError>) -> Result<(), ParseError>, -{ - #[cfg(debug_assertions)] - { - use parquet2::schema::types::PhysicalType; - if page.descriptor.primitive_type.physical_type != PhysicalType::ByteArray { - return Err(ParseError::InvalidAction(format!( - "expect parquet utf8 type, got primitive type: {:?}", - page.descriptor.primitive_type, - ))); - } - } - - match page.encoding() { - Encoding::Plain => { - SomeRepeatedStringValueIter::::try_new(page, dict, descriptor)? - .try_for_each(map_fn)?; - } - Encoding::RleDictionary | Encoding::PlainDictionary => { - SomeRepeatedStringValueIter::::try_new( - page, dict, descriptor, - )? 
- .try_for_each(map_fn)?; - } - _ => { - return Err(ParseError::InvalidAction(format!( - "unsupported page encoding type for string list column: {:?}", - page.encoding() - ))); - } - } - - Ok(()) -} - -pub fn for_each_repeated_string_field_value( - actions: &mut Vec>, - page: &DataPage, - dict: &Option, - descriptor: &ColumnDescriptor, - set_fn: SetFn, -) -> Result<(), ParseError> -where - ActType: ActionVariant, - SetFn: Fn(&mut ActType, Vec), -{ - for_each_repeated_string_field_value_with_idx( - page, - dict, - descriptor, - |entry: Result<(usize, Vec), ParseError>| -> Result<(), ParseError> { - let (idx, strings) = entry?; - let a = actions[idx].get_or_insert_with(ActType::default_action); - set_fn(ActType::try_mut_from_action(a)?, strings); - Ok(()) - }, - ) -} - -pub fn for_each_string_field_value( - actions: &mut Vec>, - page: &DataPage, - dict: &Option, - descriptor: &ColumnDescriptor, - set_fn: SetFn, -) -> Result<(), ParseError> -where - ActType: ActionVariant, - SetFn: Fn(&mut ActType, String), -{ - #[cfg(debug_assertions)] - { - use parquet2::schema::types::PhysicalType; - if page.descriptor.primitive_type.physical_type != PhysicalType::ByteArray { - return Err(ParseError::InvalidAction(format!( - "expect parquet utf8 type, got primitive type: {:?}", - page.descriptor.primitive_type, - ))); - } - } - - let map_fn = |entry: Result<(usize, String), ParseError>| -> Result<(), ParseError> { - let (idx, value) = entry?; - let a = actions[idx].get_or_insert_with(ActType::default_action); - set_fn(ActType::try_mut_from_action(a)?, value); - - Ok(()) - }; - - match page.encoding() { - Encoding::Plain => { - SomeStringValueIter::::try_new(page, dict, descriptor)? - .try_for_each(map_fn)?; - } - Encoding::RleDictionary | Encoding::PlainDictionary => { - SomeStringValueIter::::try_new(page, dict, descriptor)? - .try_for_each(map_fn)?; - } - _ => { - return Err(ParseError::InvalidAction(format!( - "unsupported page encoding type for string column: {:?}", - page.encoding() - ))); - } - } - - Ok(()) -} diff --git a/crates/deltalake-core/src/protocol/parquet2_read/validity.rs b/crates/deltalake-core/src/protocol/parquet2_read/validity.rs deleted file mode 100644 index 42f9a4d5b3..0000000000 --- a/crates/deltalake-core/src/protocol/parquet2_read/validity.rs +++ /dev/null @@ -1,137 +0,0 @@ -//! Parquet deserialization for row validity - -use super::ParseError; -use parquet2::encoding::hybrid_rle::HybridRleDecoder; - -/// Iterator that returns row index for rows that are not null -pub struct ValidityRowIndexIter<'a> { - row_idx: usize, - max_def_level: u32, - validity_iter: HybridRleDecoder<'a>, -} - -impl<'a> ValidityRowIndexIter<'a> { - /// Create parquet primitive value reader - pub fn new(max_def_level: i16, validity_iter: HybridRleDecoder<'a>) -> Self { - Self { - max_def_level: max_def_level as u32, - validity_iter, - row_idx: 0, - } - } -} - -impl<'a> Iterator for ValidityRowIndexIter<'a> { - type Item = Result; - - fn next(&mut self) -> Option { - for def_lvl in self.validity_iter.by_ref() { - match def_lvl { - Ok(def_lvl) => { - if def_lvl == self.max_def_level { - let row_idx = self.row_idx; - self.row_idx += 1; - return Some(Ok(row_idx)); - } else { - self.row_idx += 1; - continue; - } - } - Err(e) => return Some(Err(e.into())), - } - } - None - } -} - -/// Iterator that returns row index for leaf repeated rows that are not null -/// -/// For example, used in List type where each index contains multiple values. 
-#[allow(dead_code)] -pub struct ValidityRepeatedRowIndexIter<'a> { - row_idx: usize, - max_def_level: u32, - max_rep_level: u32, - repeat_count: usize, - lvl_iter: std::iter::Zip, HybridRleDecoder<'a>>, -} - -impl<'a> ValidityRepeatedRowIndexIter<'a> { - /// Create parquet primitive value reader - pub fn new( - max_rep_level: i16, - rep_iter: HybridRleDecoder<'a>, - max_def_level: i16, - validity_iter: HybridRleDecoder<'a>, - ) -> Self { - Self { - lvl_iter: rep_iter.zip(validity_iter), - max_rep_level: max_rep_level as u32, - max_def_level: max_def_level as u32, - row_idx: 0, - repeat_count: 0, - } - } -} - -impl<'a> Iterator for ValidityRepeatedRowIndexIter<'a> { - // (index, item_count) - type Item = Result<(usize, usize), ParseError>; - - fn next(&mut self) -> Option { - for (rep_lvl, def_lvl) in self.lvl_iter.by_ref() { - match (rep_lvl, def_lvl) { - (Ok(rep_lvl), Ok(def_lvl)) => { - if def_lvl == self.max_def_level { - if rep_lvl == 0 { - match self.repeat_count { - 0 => self.repeat_count = 1, - item_count => { - // reached start of next batch - // return current batch - let row_idx = self.row_idx; - self.row_idx += 1; - self.repeat_count = 1; - return Some(Ok((row_idx, item_count))); - } - } - } else { - // accumulate count for current batch - self.repeat_count += 1; - } - } else { - if self.repeat_count >= 1 { - // current row is None, emit previous row - let row_idx = self.row_idx; - let item_count = self.repeat_count; - self.row_idx += 1; - // set to 0 becauze def_lvl not at max def level - self.repeat_count = 0; - return Some(Ok((row_idx, item_count))); - } else { - // both previous and current row are None, proceed to the next row - self.row_idx += 1; - continue; - } - } - } - (_, Err(e)) => { - return Some(Err(e.into())); - } - (Err(e), _) => { - return Some(Err(e.into())); - } - } - } - - // end of iteration, emit the last row - if self.repeat_count >= 1 { - let item_count = self.repeat_count; - // set repeat count to 0 so we can end the iteration - self.repeat_count = 0; - Some(Ok((self.row_idx, item_count))) - } else { - None - } - } -} diff --git a/crates/deltalake-core/src/protocol/parquet_read/mod.rs b/crates/deltalake-core/src/protocol/parquet_read/mod.rs index a546e4b0b0..ecad3b7865 100644 --- a/crates/deltalake-core/src/protocol/parquet_read/mod.rs +++ b/crates/deltalake-core/src/protocol/parquet_read/mod.rs @@ -736,7 +736,7 @@ mod tests { use parquet::file::reader::{FileReader, SerializedFileReader}; use std::fs::File; - let path = "./tests/data/delta-0.2.0/_delta_log/00000000000000000003.checkpoint.parquet"; + let path = "../deltalake-test/tests/data/delta-0.2.0/_delta_log/00000000000000000003.checkpoint.parquet"; let preader = SerializedFileReader::new(File::open(path).unwrap()).unwrap(); let mut iter = preader.get_row_iter(None).unwrap(); diff --git a/crates/deltalake-core/src/protocol/time_utils.rs b/crates/deltalake-core/src/protocol/time_utils.rs index 407185b927..cf77edb862 100644 --- a/crates/deltalake-core/src/protocol/time_utils.rs +++ b/crates/deltalake-core/src/protocol/time_utils.rs @@ -5,78 +5,6 @@ use arrow::temporal_conversions; #[cfg(feature = "parquet")] use parquet::basic::TimeUnit; -#[cfg(feature = "parquet2")] -use parquet2::schema::types::TimeUnit; - -// vendored from arrow-rs and arrow2 so we don't need to depend on arrow2 when the parquet2 feature -// is enabled. 
-#[cfg(not(feature = "arrow"))] -mod temporal_conversions { - use chrono::NaiveDateTime; - - /// Number of milliseconds in a second - pub const MILLISECONDS: i64 = 1_000; - /// Number of microseconds in a second - pub const MICROSECONDS: i64 = 1_000_000; - /// Number of nanoseconds in a second - pub const NANOSECONDS: i64 = 1_000_000_000; - - /// converts a `i64` representing a `timestamp(ms)` to [`NaiveDateTime`] - #[inline] - pub fn timestamp_ms_to_datetime(v: i64) -> Option { - let (sec, milli_sec) = split_second(v, MILLISECONDS); - - NaiveDateTime::from_timestamp_opt( - // extract seconds from milliseconds - sec, - // discard extracted seconds and convert milliseconds to nanoseconds - milli_sec * MICROSECONDS as u32, - ) - } - - /// converts a `i64` representing a `timestamp(us)` to [`NaiveDateTime`] - #[inline] - pub fn timestamp_us_to_datetime(v: i64) -> Option { - let (sec, micro_sec) = split_second(v, MICROSECONDS); - - NaiveDateTime::from_timestamp_opt( - // extract seconds from microseconds - sec, - // discard extracted seconds and convert microseconds to nanoseconds - micro_sec * MILLISECONDS as u32, - ) - } - - /// converts a `i64` representing a `timestamp(ns)` to [`NaiveDateTime`] - #[inline] - pub fn timestamp_ns_to_datetime(v: i64) -> Option { - let (sec, nano_sec) = split_second(v, NANOSECONDS); - - NaiveDateTime::from_timestamp_opt( - // extract seconds from nanoseconds - sec, // discard extracted seconds - nano_sec, - ) - } - - /// - #[inline] - pub(crate) fn split_second(v: i64, base: i64) -> (i64, u32) { - if v < 0 { - let v = -v; - let mut seconds = v / base; - let mut part = v % base; - - if part > 0 { - seconds += 1; - part = base - part; - } - (-seconds, part as u32) - } else { - (v / base, (v % base) as u32) - } - } -} /// Convert an ISO-8601/RFC3339 timestamp string to a numeric microsecond epoch representation. /// Stats strings are written with millisecond precision as described by the delta protocol. 
@@ -85,7 +13,7 @@ pub fn timestamp_micros_from_stats_string(s: &str) -> Result Option { let dt = match time_unit { TimeUnit::MILLIS(_) => temporal_conversions::timestamp_ms_to_datetime(n), @@ -96,25 +24,11 @@ pub fn timestamp_to_delta_stats_string(n: i64, time_unit: &TimeUnit) -> Option Option { - let dt = match time_unit { - TimeUnit::Milliseconds => temporal_conversions::timestamp_ms_to_datetime(n), - TimeUnit::Microseconds => temporal_conversions::timestamp_us_to_datetime(n), - TimeUnit::Nanoseconds => temporal_conversions::timestamp_ns_to_datetime(n), - }?; - - Some(format!("{}", dt.format("%Y-%m-%dT%H:%M:%S%.3fZ"))) -} - -#[cfg(test)] +#[cfg(all(test, feature = "parquet"))] mod tests { use super::*; - #[cfg(not(feature = "parquet2"))] use parquet::format::{MicroSeconds, MilliSeconds, NanoSeconds, TimeUnit}; - #[cfg(not(feature = "parquet2"))] #[test] fn test_timestamp_to_delta_stats_string() { let s = @@ -135,18 +49,6 @@ mod tests { assert_eq!("2021-08-11T12:33:19.541Z".to_string(), s); } - #[cfg(feature = "parquet2")] - #[test] - fn test_timestamp_to_delta_stats_string() { - let s = timestamp_to_delta_stats_string(1628685199541, &TimeUnit::Milliseconds).unwrap(); - assert_eq!("2021-08-11T12:33:19.541Z".to_string(), s); - let s = timestamp_to_delta_stats_string(1628685199541000, &TimeUnit::Microseconds).unwrap(); - assert_eq!("2021-08-11T12:33:19.541Z".to_string(), s); - let s = - timestamp_to_delta_stats_string(1628685199541000000, &TimeUnit::Nanoseconds).unwrap(); - assert_eq!("2021-08-11T12:33:19.541Z".to_string(), s); - } - #[test] fn test_timestamp_micros_from_stats_string() { let us = timestamp_micros_from_stats_string("2021-08-11T12:33:19.541Z").unwrap(); diff --git a/crates/deltalake-core/src/storage/config.rs b/crates/deltalake-core/src/storage/config.rs deleted file mode 100644 index a9ef5fc272..0000000000 --- a/crates/deltalake-core/src/storage/config.rs +++ /dev/null @@ -1,394 +0,0 @@ -//! Configurltion handling for defining Storage backends for DeltaTables. 
-use std::collections::HashMap; -use std::sync::Arc; - -use object_store::memory::InMemory; -use object_store::path::Path; -use object_store::prefix::PrefixStore; -use object_store::{parse_url_opts, DynObjectStore, Error as ObjectStoreError, ObjectStore}; -use serde::{Deserialize, Serialize}; -use url::Url; - -use super::file::FileStorageBackend; -use super::utils::str_is_truthy; -use super::ObjectStoreRef; -use crate::errors::{DeltaResult, DeltaTableError}; -use crate::logstore::default_logstore::DefaultLogStore; -#[cfg(any(feature = "s3", feature = "s3-native-tls"))] -use crate::logstore::s3::S3DynamoDbLogStore; -use crate::logstore::{LogStoreConfig, LogStoreRef}; -use crate::table::builder::ensure_table_uri; - -#[cfg(any(feature = "s3", feature = "s3-native-tls"))] -use super::s3::{S3StorageBackend, S3StorageOptions}; -#[cfg(feature = "hdfs")] -use datafusion_objectstore_hdfs::object_store::hdfs::HadoopFileSystem; -#[cfg(any(feature = "s3", feature = "s3-native-tls"))] -use object_store::aws::AmazonS3ConfigKey; -#[cfg(feature = "azure")] -use object_store::azure::AzureConfigKey; -#[cfg(feature = "gcs")] -use object_store::gcp::GoogleConfigKey; -#[cfg(any( - feature = "s3", - feature = "s3-native-tls", - feature = "gcs", - feature = "azure" -))] -use std::str::FromStr; - -#[cfg(feature = "azure")] -mod azure; - -/// Recognises various URL formats, identifying the relevant [`ObjectStore`](crate::ObjectStore) -#[derive(Debug, Eq, PartialEq)] -enum ObjectStoreScheme { - /// Url corresponding to LocalFileSystem - Local, - /// Url corresponding to InMemory - Memory, - /// Url corresponding to S3 - AmazonS3, - /// Url corresponding to GoogleCloudStorage - GoogleCloudStorage, - /// Url corresponding to MicrosoftAzure - MicrosoftAzure, - /// Url corresponding to HttpStore - Http, - /// Url corresponding to Hdfs - Hdfs, -} - -impl ObjectStoreScheme { - /// Create an [`ObjectStoreScheme`] from the provided [`Url`] - /// - /// Returns the [`ObjectStoreScheme`] and the remaining [`Path`] - fn parse( - url: &Url, - #[allow(unused)] options: &mut StorageOptions, - ) -> Result<(Self, Path), ObjectStoreError> { - let strip_bucket = || Some(url.path().strip_prefix('/')?.split_once('/')?.1); - - let (scheme, path) = match (url.scheme(), url.host_str()) { - ("file", None) => (Self::Local, url.path()), - ("memory", None) => (Self::Memory, url.path()), - ("s3" | "s3a", Some(_)) => (Self::AmazonS3, url.path()), - ("gs", Some(_)) => (Self::GoogleCloudStorage, url.path()), - ("az" | "adl" | "azure" | "abfs" | "abfss", Some(_)) => { - (Self::MicrosoftAzure, url.path()) - } - ("http", Some(_)) => (Self::Http, url.path()), - ("hdfs", Some(_)) => (Self::Hdfs, url.path()), - ("https", Some(host)) => { - if host.ends_with("dfs.core.windows.net") || host.ends_with("blob.core.windows.net") - { - (Self::MicrosoftAzure, url.path()) - } else if host.contains("dfs.fabric.microsoft.com") - || host.contains("blob.fabric.microsoft.com") - { - #[cfg(feature = "azure")] - if !options - .as_azure_options() - .contains_key(&AzureConfigKey::UseFabricEndpoint) - { - options.0.insert( - AzureConfigKey::UseFabricEndpoint.as_ref().to_string(), - "true".to_string(), - ); - } - (Self::MicrosoftAzure, url.path()) - } else if host.ends_with("amazonaws.com") { - match host.starts_with("s3") { - true => (Self::AmazonS3, strip_bucket().unwrap_or_default()), - false => (Self::AmazonS3, url.path()), - } - } else if host.ends_with("r2.cloudflarestorage.com") { - (Self::AmazonS3, strip_bucket().unwrap_or_default()) - } else { - (Self::Http, 
url.path()) - } - } - _ => return Err(ObjectStoreError::NotImplemented), - }; - - let path = Path::parse(path)?; - Ok((scheme, path)) - } -} - -/// Options used for configuring backend storage -#[derive(Clone, Debug, Serialize, Deserialize, Default)] -pub struct StorageOptions(pub HashMap); - -impl StorageOptions { - /// Create a new instance of [`StorageOptions`] - pub fn new(options: HashMap) -> Self { - let mut options = options; - if let Ok(value) = std::env::var("AZURE_STORAGE_ALLOW_HTTP") { - options.insert("allow_http".into(), value); - } - if let Ok(value) = std::env::var("AZURE_STORAGE_USE_HTTP") { - options.insert("allow_http".into(), value); - } - if let Ok(value) = std::env::var("AWS_ALLOW_HTTP") { - options.insert("allow_http".into(), value); - } - Self(options) - } - - /// Add values from the environment to storage options - #[cfg(feature = "azure")] - pub fn with_env_azure(&mut self) { - for (os_key, os_value) in std::env::vars_os() { - if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { - if let Ok(config_key) = AzureConfigKey::from_str(&key.to_ascii_lowercase()) { - if !self.0.contains_key(config_key.as_ref()) { - self.0 - .insert(config_key.as_ref().to_string(), value.to_string()); - } - } - } - } - } - - /// Add values from the environment to storage options - #[cfg(feature = "gcs")] - pub fn with_env_gcs(&mut self) { - for (os_key, os_value) in std::env::vars_os() { - if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { - if let Ok(config_key) = GoogleConfigKey::from_str(&key.to_ascii_lowercase()) { - if !self.0.contains_key(config_key.as_ref()) { - self.0 - .insert(config_key.as_ref().to_string(), value.to_string()); - } - } - } - } - } - - /// Add values from the environment to storage options - #[cfg(any(feature = "s3", feature = "s3-native-tls"))] - pub fn with_env_s3(&mut self) { - for (os_key, os_value) in std::env::vars_os() { - if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { - if let Ok(config_key) = AmazonS3ConfigKey::from_str(&key.to_ascii_lowercase()) { - if !self.0.contains_key(config_key.as_ref()) { - self.0 - .insert(config_key.as_ref().to_string(), value.to_string()); - } - } - } - } - } - - /// Denotes if unsecure connections via http are allowed - pub fn allow_http(&self) -> bool { - self.0.iter().any(|(key, value)| { - key.to_ascii_lowercase().contains("allow_http") & str_is_truthy(value) - }) - } - - /// Subset of options relevant for azure storage - #[cfg(feature = "azure")] - pub fn as_azure_options(&self) -> HashMap { - self.0 - .iter() - .filter_map(|(key, value)| { - let az_key = AzureConfigKey::from_str(&key.to_ascii_lowercase()).ok()?; - Some((az_key, value.clone())) - }) - .collect() - } - - /// Subset of options relevant for s3 storage - #[cfg(any(feature = "s3", feature = "s3-native-tls"))] - pub fn as_s3_options(&self) -> HashMap { - self.0 - .iter() - .filter_map(|(key, value)| { - let s3_key = AmazonS3ConfigKey::from_str(&key.to_ascii_lowercase()).ok()?; - Some((s3_key, value.clone())) - }) - .collect() - } - - /// Subset of options relevant for gcs storage - #[cfg(feature = "gcs")] - pub fn as_gcs_options(&self) -> HashMap { - self.0 - .iter() - .filter_map(|(key, value)| { - let gcs_key = GoogleConfigKey::from_str(&key.to_ascii_lowercase()).ok()?; - Some((gcs_key, value.clone())) - }) - .collect() - } -} - -impl From> for StorageOptions { - fn from(value: HashMap) -> Self { - Self::new(value) - } -} - -/// Configure a [`LogStoreRef`] for the given url and configuration 
-pub fn configure_log_store( - location: &str, - options: impl Into + Clone, - storage_backend: Option<(ObjectStoreRef, Url)>, -) -> DeltaResult { - let mut options = options.into(); - let (object_store, location) = match storage_backend { - Some((object_store, url)) => (object_store, url), - None => { - let url = ensure_table_uri(location)?; - let object_store = crate::storage::config::configure_store(&url, &mut options)?; - (object_store, url) - } - }; - - let (scheme, _prefix) = ObjectStoreScheme::parse(&location, &mut options)?; - match scheme { - #[cfg(any(feature = "s3", feature = "s3-native-tls"))] - ObjectStoreScheme::AmazonS3 => { - let s3_options = S3StorageOptions::from_map(&options.0); - if Some("dynamodb".to_owned()) - == s3_options - .locking_provider - .as_ref() - .map(|v| v.to_lowercase()) - { - Ok(Arc::new(S3DynamoDbLogStore::try_new( - location, - options, - &s3_options, - object_store, - )?)) - } else { - Ok(Arc::new(DefaultLogStore::new( - object_store, - LogStoreConfig { location, options }, - ))) - } - } - _ => Ok(Arc::new(DefaultLogStore::new( - object_store, - LogStoreConfig { location, options }, - ))), - } -} - -/// Configure an instance of an [`ObjectStore`] for the given url and configuration -pub fn configure_store( - url: &Url, - options: &mut StorageOptions, -) -> DeltaResult> { - let (scheme, _prefix) = ObjectStoreScheme::parse(url, options)?; - match scheme { - ObjectStoreScheme::Local => { - let path = url - .to_file_path() - .map_err(|_| DeltaTableError::InvalidTableLocation(url.to_string()))?; - Ok(Arc::new(FileStorageBackend::try_new(path)?)) - } - ObjectStoreScheme::Memory => url_prefix_handler(InMemory::new(), Path::parse(url.path())?), - #[cfg(any(feature = "s3", feature = "s3-native-tls"))] - ObjectStoreScheme::AmazonS3 => { - options.with_env_s3(); - let (store, prefix) = parse_url_opts(url, options.as_s3_options())?; - let s3_options = S3StorageOptions::from_map(&options.0); - if options - .as_s3_options() - .contains_key(&AmazonS3ConfigKey::CopyIfNotExists) - { - url_prefix_handler(store, prefix) - } else if Some("dynamodb".to_owned()) - == s3_options - .locking_provider - .as_ref() - .map(|v| v.to_lowercase()) - { - // if a lock client is requested, unsafe rename is always safe - let store = S3StorageBackend::try_new(Arc::new(store), true)?; - url_prefix_handler(store, prefix) - } else { - let store = - S3StorageBackend::try_new(Arc::new(store), s3_options.allow_unsafe_rename)?; - url_prefix_handler(store, prefix) - } - } - #[cfg(feature = "azure")] - ObjectStoreScheme::MicrosoftAzure => { - let config = azure::AzureConfigHelper::try_new(options.as_azure_options())?.build()?; - let (store, prefix) = parse_url_opts(url, config)?; - url_prefix_handler(store, prefix) - } - #[cfg(feature = "gcs")] - ObjectStoreScheme::GoogleCloudStorage => { - options.with_env_gcs(); - let (store, prefix) = parse_url_opts(url, options.as_gcs_options())?; - url_prefix_handler(store, prefix) - } - #[cfg(feature = "hdfs")] - ObjectStoreScheme::Hdfs => { - let store = HadoopFileSystem::new(url.as_ref()).ok_or_else(|| { - DeltaTableError::Generic(format!( - "failed to create HadoopFileSystem for {}", - url.as_ref() - )) - })?; - url_prefix_handler(store, _prefix) - } - #[cfg(not(feature = "hdfs"))] - ObjectStoreScheme::Hdfs => Err(DeltaTableError::MissingFeature { - feature: "hdfs", - url: url.as_ref().into(), - }), - _ => { - let (store, prefix) = parse_url_opts(url, options.0.clone())?; - url_prefix_handler(store, prefix) - } - } -} - -fn url_prefix_handler(store: T, 
prefix: Path) -> DeltaResult> { - if prefix != Path::from("/") { - Ok(Arc::new(PrefixStore::new(store, prefix))) - } else { - Ok(Arc::new(store)) - } -} - -#[cfg(test)] -mod test { - use crate::table::builder::ensure_table_uri; - - use super::*; - - #[tokio::test] - async fn test_configure_store_local() -> Result<(), Box> { - let temp_dir = tempfile::tempdir().unwrap(); - let temp_dir_path = temp_dir.path(); - let path = temp_dir_path.join("test space 😁"); - - let table_uri = ensure_table_uri(path.as_os_str().to_str().unwrap()).unwrap(); - - let store = configure_store(&table_uri, &mut StorageOptions::default()).unwrap(); - - let contents = b"test"; - let key = "test.txt"; - let file_path = path.join(key); - std::fs::write(&file_path, contents).unwrap(); - - let res = store - .get(&object_store::path::Path::from(key)) - .await - .unwrap() - .bytes() - .await - .unwrap(); - assert_eq!(res.as_ref(), contents); - - Ok(()) - } -} diff --git a/crates/deltalake-core/src/storage/mod.rs b/crates/deltalake-core/src/storage/mod.rs index 37bdfcb2e0..2398276011 100644 --- a/crates/deltalake-core/src/storage/mod.rs +++ b/crates/deltalake-core/src/storage/mod.rs @@ -1,17 +1,23 @@ //! Object storage backend abstraction layer for Delta Table transaction logs and data -use std::sync::Arc; +use dashmap::DashMap; +use std::collections::HashMap; +use std::sync::{Arc, OnceLock}; use lazy_static::lazy_static; +use serde::{Deserialize, Serialize}; +use url::Url; -pub mod config; pub mod file; pub mod utils; -#[cfg(any(feature = "s3", feature = "s3-native-tls"))] -pub mod s3; +use crate::{DeltaResult, DeltaTableError}; +pub use object_store; +use object_store::local::LocalFileSystem; +use object_store::memory::InMemory; pub use object_store::path::{Path, DELIMITER}; +use object_store::prefix::PrefixStore; pub use object_store::{ DynObjectStore, Error as ObjectStoreError, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result as ObjectStoreResult, @@ -22,11 +28,147 @@ lazy_static! { static ref DELTA_LOG_PATH: Path = Path::from("_delta_log"); } +/// Sharable reference to [`ObjectStore`] +pub type ObjectStoreRef = Arc; + +/// Factory trait for creating [ObjectStoreRef] instances at runtime +pub trait ObjectStoreFactory: Send + Sync { + #[allow(missing_docs)] + fn parse_url_opts( + &self, + url: &Url, + options: &StorageOptions, + ) -> DeltaResult<(ObjectStoreRef, Path)>; +} + +#[derive(Clone, Debug, Default)] +struct DefaultObjectStoreFactory {} + +impl ObjectStoreFactory for DefaultObjectStoreFactory { + fn parse_url_opts( + &self, + url: &Url, + _options: &StorageOptions, + ) -> DeltaResult<(ObjectStoreRef, Path)> { + match url.scheme() { + "memory" => { + let path = Path::from_url_path(url.path())?; + let store = Arc::new(InMemory::new()) as ObjectStoreRef; + Ok((url_prefix_handler(store, path.clone())?, path)) + } + "file" => { + let store = Arc::new(LocalFileSystem::new_with_prefix( + url.to_file_path().unwrap(), + )?) 
as ObjectStoreRef; + Ok((store, Path::from("/"))) + } + _ => Err(DeltaTableError::InvalidTableLocation(url.clone().into())), + } + } +} + +/// TODO +pub type FactoryRegistry = Arc>>; + +/// TODO +pub fn factories() -> FactoryRegistry { + static REGISTRY: OnceLock = OnceLock::new(); + REGISTRY + .get_or_init(|| { + let registry = FactoryRegistry::default(); + registry.insert( + Url::parse("memory://").unwrap(), + Arc::new(DefaultObjectStoreFactory::default()), + ); + registry.insert( + Url::parse("file://").unwrap(), + Arc::new(DefaultObjectStoreFactory::default()), + ); + registry + }) + .clone() +} + +/// Simpler access pattern for the [FactoryRegistry] to get a single store +pub fn store_for(url: &Url) -> DeltaResult { + let scheme = Url::parse(&format!("{}://", url.scheme())).unwrap(); + if let Some(factory) = factories().get(&scheme) { + let (store, _prefix) = factory.parse_url_opts(url, &StorageOptions::default())?; + Ok(store) + } else { + Err(DeltaTableError::InvalidTableLocation(url.clone().into())) + } +} + +/// Options used for configuring backend storage +#[derive(Clone, Debug, Serialize, Deserialize, Default)] +pub struct StorageOptions(pub HashMap); + +impl From> for StorageOptions { + fn from(value: HashMap) -> Self { + Self(value) + } +} + /// Return the uri of commit version. +/// +/// ```rust +/// # use deltalake_core::storage::*; +/// use object_store::path::Path; +/// let uri = commit_uri_from_version(1); +/// assert_eq!(uri, Path::from("_delta_log/00000000000000000001.json")); +/// ``` pub fn commit_uri_from_version(version: i64) -> Path { let version = format!("{version:020}.json"); DELTA_LOG_PATH.child(version.as_str()) } -/// Sharable reference to [`ObjectStore`] -pub type ObjectStoreRef = Arc; +#[allow(unused)] +/// Return true for all the stringly values typically associated with true +/// +/// aka YAML booleans +pub fn str_is_truthy(val: &str) -> bool { + val.eq_ignore_ascii_case("1") + | val.eq_ignore_ascii_case("true") + | val.eq_ignore_ascii_case("on") + | val.eq_ignore_ascii_case("yes") + | val.eq_ignore_ascii_case("y") +} + +/// Simple function to wrap the given [ObjectStore] in a [PrefixStore] if necessary +/// +/// This simplifies the use of t he storage since it ensures that list/get/etc operations +/// start from the prefix in the object storage rather than from the root configured URI of the +/// [ObjectStore] +pub fn url_prefix_handler(store: T, prefix: Path) -> DeltaResult { + if prefix != Path::from("/") { + Ok(Arc::new(PrefixStore::new(store, prefix))) + } else { + Ok(Arc::new(store)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_url_prefix_handler() { + let store = InMemory::new(); + let path = Path::parse("/databases/foo/bar").expect("Failed to parse path"); + + let prefixed = url_prefix_handler(store, path); + assert!(prefixed.is_ok()); + } + + #[test] + fn test_str_is_truthy() { + for value in ["1", "true", "on", "YES", "Y"].iter() { + assert!(str_is_truthy(value)); + } + + for value in ["0", "FALSE", "off", "NO", "n", "bork"].iter() { + assert!(!str_is_truthy(value)); + } + } +} diff --git a/crates/deltalake-core/src/storage/s3.rs b/crates/deltalake-core/src/storage/s3.rs deleted file mode 100644 index b7bf446317..0000000000 --- a/crates/deltalake-core/src/storage/s3.rs +++ /dev/null @@ -1,510 +0,0 @@ -//! AWS S3 storage backend. 
- -use super::utils::str_is_truthy; -use crate::table::builder::{s3_storage_options, str_option}; -use bytes::Bytes; -use futures::stream::BoxStream; -use object_store::{path::Path, Error as ObjectStoreError}; -use object_store::{ - DynObjectStore, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, - Result as ObjectStoreResult, -}; -use rusoto_core::Region; -use std::collections::HashMap; -use std::fmt::Debug; -use std::ops::Range; -use std::sync::Arc; -use std::time::Duration; -use tokio::io::AsyncWrite; - -const STORE_NAME: &str = "DeltaS3ObjectStore"; - -/// Options used to configure the S3StorageBackend. -/// -/// Available options are described in [s3_storage_options]. -#[derive(Clone, Debug, PartialEq, Eq)] -#[allow(missing_docs)] -pub struct S3StorageOptions { - pub endpoint_url: Option, - pub region: Region, - pub profile: Option, - pub aws_access_key_id: Option, - pub aws_secret_access_key: Option, - pub aws_session_token: Option, - pub virtual_hosted_style_request: bool, - pub locking_provider: Option, - pub assume_role_arn: Option, - pub assume_role_session_name: Option, - pub use_web_identity: bool, - pub s3_pool_idle_timeout: Duration, - pub sts_pool_idle_timeout: Duration, - pub s3_get_internal_server_error_retries: usize, - pub allow_unsafe_rename: bool, - pub extra_opts: HashMap, -} - -impl S3StorageOptions { - /// Creates an instance of S3StorageOptions from the given HashMap. - pub fn from_map(options: &HashMap) -> S3StorageOptions { - let extra_opts = options - .iter() - .filter(|(k, _)| !s3_storage_options::S3_OPTS.contains(&k.as_str())) - .map(|(k, v)| (k.to_owned(), v.to_owned())) - .collect(); - - // Copy web identity values provided in options but not the environment into the environment - // to get picked up by the `from_k8s_env` call in `get_web_identity_provider`. 
- Self::ensure_env_var(options, s3_storage_options::AWS_REGION); - Self::ensure_env_var(options, s3_storage_options::AWS_PROFILE); - Self::ensure_env_var(options, s3_storage_options::AWS_ACCESS_KEY_ID); - Self::ensure_env_var(options, s3_storage_options::AWS_SECRET_ACCESS_KEY); - Self::ensure_env_var(options, s3_storage_options::AWS_SESSION_TOKEN); - Self::ensure_env_var(options, s3_storage_options::AWS_WEB_IDENTITY_TOKEN_FILE); - Self::ensure_env_var(options, s3_storage_options::AWS_ROLE_ARN); - Self::ensure_env_var(options, s3_storage_options::AWS_ROLE_SESSION_NAME); - - let endpoint_url = str_option(options, s3_storage_options::AWS_ENDPOINT_URL); - let region = if let Some(endpoint_url) = endpoint_url.as_ref() { - Region::Custom { - name: Self::str_or_default( - options, - s3_storage_options::AWS_REGION, - "custom".to_string(), - ), - endpoint: endpoint_url.to_owned(), - } - } else { - Region::default() - }; - let profile = str_option(options, s3_storage_options::AWS_PROFILE); - - let s3_pool_idle_timeout = Self::u64_or_default( - options, - s3_storage_options::AWS_S3_POOL_IDLE_TIMEOUT_SECONDS, - 15, - ); - let sts_pool_idle_timeout = Self::u64_or_default( - options, - s3_storage_options::AWS_STS_POOL_IDLE_TIMEOUT_SECONDS, - 10, - ); - - let s3_get_internal_server_error_retries = Self::u64_or_default( - options, - s3_storage_options::AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES, - 10, - ) as usize; - - let virtual_hosted_style_request: bool = - str_option(options, s3_storage_options::AWS_S3_ADDRESSING_STYLE) - .map(|addressing_style| addressing_style == "virtual") - .unwrap_or(false); - - let allow_unsafe_rename = - str_option(options, s3_storage_options::AWS_S3_ALLOW_UNSAFE_RENAME) - .map(|val| str_is_truthy(&val)) - .unwrap_or(false); - - Self { - endpoint_url, - region, - profile, - aws_access_key_id: str_option(options, s3_storage_options::AWS_ACCESS_KEY_ID), - aws_secret_access_key: str_option(options, s3_storage_options::AWS_SECRET_ACCESS_KEY), - aws_session_token: str_option(options, s3_storage_options::AWS_SESSION_TOKEN), - virtual_hosted_style_request, - locking_provider: str_option(options, s3_storage_options::AWS_S3_LOCKING_PROVIDER), - assume_role_arn: str_option(options, s3_storage_options::AWS_S3_ASSUME_ROLE_ARN), - assume_role_session_name: str_option( - options, - s3_storage_options::AWS_S3_ROLE_SESSION_NAME, - ), - use_web_identity: std::env::var(s3_storage_options::AWS_WEB_IDENTITY_TOKEN_FILE) - .is_ok(), - s3_pool_idle_timeout: Duration::from_secs(s3_pool_idle_timeout), - sts_pool_idle_timeout: Duration::from_secs(sts_pool_idle_timeout), - s3_get_internal_server_error_retries, - allow_unsafe_rename, - extra_opts, - } - } - - fn str_or_default(map: &HashMap, key: &str, default: String) -> String { - map.get(key) - .map(|v| v.to_owned()) - .unwrap_or_else(|| std::env::var(key).unwrap_or(default)) - } - - fn u64_or_default(map: &HashMap, key: &str, default: u64) -> u64 { - str_option(map, key) - .and_then(|v| v.parse().ok()) - .unwrap_or(default) - } - - fn ensure_env_var(map: &HashMap, key: &str) { - if let Some(val) = str_option(map, key) { - std::env::set_var(key, val); - } - } -} - -impl Default for S3StorageOptions { - /// Creates an instance of S3StorageOptions from environment variables. 
- fn default() -> S3StorageOptions { - Self::from_map(&HashMap::new()) - } -} - -/// An S3 implementation of the [ObjectStore] trait -pub struct S3StorageBackend { - inner: Arc, - /// Whether allowed to performance rename_if_not_exist as rename - allow_unsafe_rename: bool, -} - -impl std::fmt::Display for S3StorageBackend { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "S3StorageBackend") - } -} - -impl S3StorageBackend { - /// Creates a new S3StorageBackend. - /// - /// Options are described in [s3_storage_options]. - /// - /// ```rust - /// use object_store::aws::AmazonS3Builder; - /// use deltalake_core::storage::s3::{S3StorageBackend, S3StorageOptions}; - /// use std::sync::Arc; - /// - /// let inner = AmazonS3Builder::new() - /// .with_region("us-east-1") - /// .with_bucket_name("my-bucket") - /// .with_access_key_id("") - /// .with_secret_access_key("") - /// .build() - /// .unwrap(); - /// let allow_unsafe_rename = true; - /// let store = S3StorageBackend::try_new(Arc::new(inner), allow_unsafe_rename).unwrap(); - /// ``` - pub fn try_new( - storage: Arc, - allow_unsafe_rename: bool, - ) -> ObjectStoreResult { - Ok(Self { - inner: storage, - allow_unsafe_rename, - }) - } -} - -impl std::fmt::Debug for S3StorageBackend { - fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { - write!(fmt, "S3StorageBackend") - } -} - -#[async_trait::async_trait] -impl ObjectStore for S3StorageBackend { - async fn put(&self, location: &Path, bytes: Bytes) -> ObjectStoreResult<()> { - self.inner.put(location, bytes).await - } - - async fn get(&self, location: &Path) -> ObjectStoreResult { - self.inner.get(location).await - } - - async fn get_opts(&self, location: &Path, options: GetOptions) -> ObjectStoreResult { - self.inner.get_opts(location, options).await - } - - async fn get_range(&self, location: &Path, range: Range) -> ObjectStoreResult { - self.inner.get_range(location, range).await - } - - async fn head(&self, location: &Path) -> ObjectStoreResult { - self.inner.head(location).await - } - - async fn delete(&self, location: &Path) -> ObjectStoreResult<()> { - self.inner.delete(location).await - } - - async fn list( - &self, - prefix: Option<&Path>, - ) -> ObjectStoreResult>> { - self.inner.list(prefix).await - } - - async fn list_with_offset( - &self, - prefix: Option<&Path>, - offset: &Path, - ) -> ObjectStoreResult>> { - self.inner.list_with_offset(prefix, offset).await - } - - async fn list_with_delimiter(&self, prefix: Option<&Path>) -> ObjectStoreResult { - self.inner.list_with_delimiter(prefix).await - } - - async fn copy(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> { - self.inner.copy(from, to).await - } - - async fn copy_if_not_exists(&self, _from: &Path, _to: &Path) -> ObjectStoreResult<()> { - todo!() - } - - async fn rename_if_not_exists(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> { - if self.allow_unsafe_rename { - self.inner.rename(from, to).await - } else { - Err(ObjectStoreError::Generic { - store: STORE_NAME, - source: Box::new(deltalake_aws::errors::LockClientError::LockClientRequired), - }) - } - } - - async fn put_multipart( - &self, - location: &Path, - ) -> ObjectStoreResult<(MultipartId, Box)> { - self.inner.put_multipart(location).await - } - - async fn abort_multipart( - &self, - location: &Path, - multipart_id: &MultipartId, - ) -> ObjectStoreResult<()> { - self.inner.abort_multipart(location, multipart_id).await - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use 
maplit::hashmap; - use serial_test::serial; - - #[test] - #[serial] - fn storage_options_default_test() { - std::env::set_var(s3_storage_options::AWS_ENDPOINT_URL, "http://localhost"); - std::env::set_var(s3_storage_options::AWS_REGION, "us-west-1"); - std::env::set_var(s3_storage_options::AWS_PROFILE, "default"); - std::env::set_var(s3_storage_options::AWS_ACCESS_KEY_ID, "default_key_id"); - std::env::set_var( - s3_storage_options::AWS_SECRET_ACCESS_KEY, - "default_secret_key", - ); - std::env::set_var(s3_storage_options::AWS_S3_LOCKING_PROVIDER, "dynamodb"); - std::env::set_var( - s3_storage_options::AWS_S3_ASSUME_ROLE_ARN, - "arn:aws:iam::123456789012:role/some_role", - ); - std::env::set_var(s3_storage_options::AWS_S3_ROLE_SESSION_NAME, "session_name"); - std::env::set_var( - s3_storage_options::AWS_WEB_IDENTITY_TOKEN_FILE, - "token_file", - ); - std::env::remove_var(s3_storage_options::AWS_S3_POOL_IDLE_TIMEOUT_SECONDS); - std::env::remove_var(s3_storage_options::AWS_STS_POOL_IDLE_TIMEOUT_SECONDS); - std::env::remove_var(s3_storage_options::AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES); - - let options = S3StorageOptions::default(); - - assert_eq!( - S3StorageOptions { - endpoint_url: Some("http://localhost".to_string()), - region: Region::Custom { - name: "us-west-1".to_string(), - endpoint: "http://localhost".to_string() - }, - profile: Some("default".to_string()), - aws_access_key_id: Some("default_key_id".to_string()), - aws_secret_access_key: Some("default_secret_key".to_string()), - aws_session_token: None, - virtual_hosted_style_request: false, - assume_role_arn: Some("arn:aws:iam::123456789012:role/some_role".to_string()), - assume_role_session_name: Some("session_name".to_string()), - use_web_identity: true, - locking_provider: Some("dynamodb".to_string()), - s3_pool_idle_timeout: Duration::from_secs(15), - sts_pool_idle_timeout: Duration::from_secs(10), - s3_get_internal_server_error_retries: 10, - extra_opts: HashMap::new(), - allow_unsafe_rename: false, - }, - options - ); - } - - #[test] - #[serial] - fn storage_options_with_only_region_and_credentials() { - std::env::remove_var(s3_storage_options::AWS_ENDPOINT_URL); - let options = S3StorageOptions::from_map(&hashmap! { - s3_storage_options::AWS_REGION.to_string() => "eu-west-1".to_string(), - s3_storage_options::AWS_ACCESS_KEY_ID.to_string() => "test".to_string(), - s3_storage_options::AWS_SECRET_ACCESS_KEY.to_string() => "test_secret".to_string(), - }); - - assert_eq!( - S3StorageOptions { - endpoint_url: None, - region: Region::default(), - aws_access_key_id: Some("test".to_string()), - aws_secret_access_key: Some("test_secret".to_string()), - ..Default::default() - }, - options - ); - } - - #[test] - #[serial] - fn storage_options_from_map_test() { - let options = S3StorageOptions::from_map(&hashmap! 
{ - s3_storage_options::AWS_ENDPOINT_URL.to_string() => "http://localhost:1234".to_string(), - s3_storage_options::AWS_REGION.to_string() => "us-west-2".to_string(), - s3_storage_options::AWS_PROFILE.to_string() => "default".to_string(), - s3_storage_options::AWS_S3_ADDRESSING_STYLE.to_string() => "virtual".to_string(), - s3_storage_options::AWS_S3_LOCKING_PROVIDER.to_string() => "another_locking_provider".to_string(), - s3_storage_options::AWS_S3_ASSUME_ROLE_ARN.to_string() => "arn:aws:iam::123456789012:role/another_role".to_string(), - s3_storage_options::AWS_S3_ROLE_SESSION_NAME.to_string() => "another_session_name".to_string(), - s3_storage_options::AWS_WEB_IDENTITY_TOKEN_FILE.to_string() => "another_token_file".to_string(), - s3_storage_options::AWS_S3_POOL_IDLE_TIMEOUT_SECONDS.to_string() => "1".to_string(), - s3_storage_options::AWS_STS_POOL_IDLE_TIMEOUT_SECONDS.to_string() => "2".to_string(), - s3_storage_options::AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES.to_string() => "3".to_string(), - s3_storage_options::AWS_ACCESS_KEY_ID.to_string() => "test_id".to_string(), - s3_storage_options::AWS_SECRET_ACCESS_KEY.to_string() => "test_secret".to_string(), - }); - - assert_eq!( - S3StorageOptions { - endpoint_url: Some("http://localhost:1234".to_string()), - region: Region::Custom { - name: "us-west-2".to_string(), - endpoint: "http://localhost:1234".to_string() - }, - profile: Some("default".to_string()), - aws_access_key_id: Some("test_id".to_string()), - aws_secret_access_key: Some("test_secret".to_string()), - aws_session_token: None, - virtual_hosted_style_request: true, - assume_role_arn: Some("arn:aws:iam::123456789012:role/another_role".to_string()), - assume_role_session_name: Some("another_session_name".to_string()), - use_web_identity: true, - locking_provider: Some("another_locking_provider".to_string()), - s3_pool_idle_timeout: Duration::from_secs(1), - sts_pool_idle_timeout: Duration::from_secs(2), - s3_get_internal_server_error_retries: 3, - extra_opts: hashmap! { - s3_storage_options::AWS_S3_ADDRESSING_STYLE.to_string() => "virtual".to_string() - }, - allow_unsafe_rename: false, - }, - options - ); - } - - #[test] - #[serial] - fn storage_options_mixed_test() { - std::env::set_var(s3_storage_options::AWS_ENDPOINT_URL, "http://localhost"); - std::env::set_var(s3_storage_options::AWS_REGION, "us-west-1"); - std::env::set_var(s3_storage_options::AWS_PROFILE, "default"); - std::env::set_var(s3_storage_options::AWS_ACCESS_KEY_ID, "wrong_key_id"); - std::env::set_var( - s3_storage_options::AWS_SECRET_ACCESS_KEY, - "wrong_secret_key", - ); - std::env::set_var(s3_storage_options::AWS_S3_LOCKING_PROVIDER, "dynamodb"); - std::env::set_var( - s3_storage_options::AWS_S3_ASSUME_ROLE_ARN, - "arn:aws:iam::123456789012:role/some_role", - ); - std::env::set_var(s3_storage_options::AWS_S3_ROLE_SESSION_NAME, "session_name"); - std::env::set_var( - s3_storage_options::AWS_WEB_IDENTITY_TOKEN_FILE, - "token_file", - ); - - std::env::set_var(s3_storage_options::AWS_S3_POOL_IDLE_TIMEOUT_SECONDS, "1"); - std::env::set_var(s3_storage_options::AWS_STS_POOL_IDLE_TIMEOUT_SECONDS, "2"); - std::env::set_var( - s3_storage_options::AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES, - "3", - ); - let options = S3StorageOptions::from_map(&hashmap! 
{ - s3_storage_options::AWS_ACCESS_KEY_ID.to_string() => "test_id_mixed".to_string(), - s3_storage_options::AWS_SECRET_ACCESS_KEY.to_string() => "test_secret_mixed".to_string(), - s3_storage_options::AWS_REGION.to_string() => "us-west-2".to_string(), - "DYNAMO_LOCK_PARTITION_KEY_VALUE".to_string() => "my_lock".to_string(), - "AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES".to_string() => "3".to_string(), - }); - - assert_eq!( - S3StorageOptions { - endpoint_url: Some("http://localhost".to_string()), - region: Region::Custom { - name: "us-west-2".to_string(), - endpoint: "http://localhost".to_string() - }, - profile: Some("default".to_string()), - aws_access_key_id: Some("test_id_mixed".to_string()), - aws_secret_access_key: Some("test_secret_mixed".to_string()), - aws_session_token: None, - virtual_hosted_style_request: false, - assume_role_arn: Some("arn:aws:iam::123456789012:role/some_role".to_string()), - assume_role_session_name: Some("session_name".to_string()), - use_web_identity: true, - locking_provider: Some("dynamodb".to_string()), - s3_pool_idle_timeout: Duration::from_secs(1), - sts_pool_idle_timeout: Duration::from_secs(2), - s3_get_internal_server_error_retries: 3, - extra_opts: hashmap! { - "DYNAMO_LOCK_PARTITION_KEY_VALUE".to_string() => "my_lock".to_string(), - }, - allow_unsafe_rename: false, - }, - options - ); - } - #[test] - #[serial] - fn storage_options_web_identity_test() { - let _options = S3StorageOptions::from_map(&hashmap! { - s3_storage_options::AWS_REGION.to_string() => "eu-west-1".to_string(), - s3_storage_options::AWS_WEB_IDENTITY_TOKEN_FILE.to_string() => "web_identity_token_file".to_string(), - s3_storage_options::AWS_ROLE_ARN.to_string() => "arn:aws:iam::123456789012:role/web_identity_role".to_string(), - s3_storage_options::AWS_ROLE_SESSION_NAME.to_string() => "web_identity_session_name".to_string(), - }); - - assert_eq!( - "eu-west-1", - std::env::var(s3_storage_options::AWS_REGION).unwrap() - ); - - assert_eq!( - "web_identity_token_file", - std::env::var(s3_storage_options::AWS_WEB_IDENTITY_TOKEN_FILE).unwrap() - ); - - assert_eq!( - "arn:aws:iam::123456789012:role/web_identity_role", - std::env::var(s3_storage_options::AWS_ROLE_ARN).unwrap() - ); - - assert_eq!( - "web_identity_session_name", - std::env::var(s3_storage_options::AWS_ROLE_SESSION_NAME).unwrap() - ); - } -} diff --git a/crates/deltalake-core/src/storage/utils.rs b/crates/deltalake-core/src/storage/utils.rs index 7b8e76c47d..39f052a89e 100644 --- a/crates/deltalake-core/src/storage/utils.rs +++ b/crates/deltalake-core/src/storage/utils.rs @@ -60,14 +60,6 @@ pub async fn flatten_list_stream( .await } -pub(crate) fn str_is_truthy(val: &str) -> bool { - val.eq_ignore_ascii_case("1") - | val.eq_ignore_ascii_case("true") - | val.eq_ignore_ascii_case("on") - | val.eq_ignore_ascii_case("yes") - | val.eq_ignore_ascii_case("y") -} - impl TryFrom for ObjectMeta { type Error = DeltaTableError; diff --git a/crates/deltalake-core/src/table/builder.rs b/crates/deltalake-core/src/table/builder.rs index b47411383f..19221d630a 100644 --- a/crates/deltalake-core/src/table/builder.rs +++ b/crates/deltalake-core/src/table/builder.rs @@ -5,6 +5,7 @@ use std::path::PathBuf; use std::sync::Arc; use chrono::{DateTime, FixedOffset, Utc}; +use log::*; use object_store::DynObjectStore; use serde::{Deserialize, Serialize}; use url::Url; @@ -12,7 +13,7 @@ use url::Url; use super::DeltaTable; use crate::errors::{DeltaResult, DeltaTableError}; use crate::logstore::LogStoreRef; -use crate::storage::config::{self, 
StorageOptions}; +use crate::storage::StorageOptions; #[allow(dead_code)] #[derive(Debug, thiserror::Error)] @@ -134,11 +135,6 @@ impl DeltaTableLoadOptions { } } -enum UriType { - LocalPath(PathBuf), - Url(Url), -} - /// builder for configuring a delta table load. #[derive(Debug)] pub struct DeltaTableBuilder { @@ -150,31 +146,48 @@ pub struct DeltaTableBuilder { impl DeltaTableBuilder { /// Creates `DeltaTableBuilder` from table uri + /// + /// Can panic on an invalid URI + /// + /// ```rust + /// # use deltalake_core::table::builder::*; + /// let builder = DeltaTableBuilder::from_uri("../deltalake-test/tests/data/delta-0.8.0"); + /// assert!(true); + /// ``` pub fn from_uri(table_uri: impl AsRef) -> Self { - Self { - options: DeltaTableLoadOptions::new(table_uri.as_ref()), - storage_options: None, - allow_http: None, - } + let url = ensure_table_uri(&table_uri).expect("The specified table_uri is not valid"); + DeltaTableBuilder::from_valid_uri(url).expect("Failed to create valid builder") } /// Creates `DeltaTableBuilder` from verified table uri. - /// Will fail fast if specified `table_uri` is a local path but doesn't exist. + /// + /// ```rust + /// # use deltalake_core::table::builder::*; + /// let builder = DeltaTableBuilder::from_valid_uri("/tmp"); + /// assert!(builder.is_ok(), "Builder failed with {builder:?}"); + /// ``` pub fn from_valid_uri(table_uri: impl AsRef) -> DeltaResult { - let table_uri = table_uri.as_ref(); - - if let UriType::LocalPath(path) = resolve_uri_type(table_uri)? { - if !path.exists() { - let msg = format!( - "Local path \"{}\" does not exist or you don't have access!", - table_uri - ); - return Err(DeltaTableError::InvalidTableLocation(msg)); + if let Ok(url) = Url::parse(table_uri.as_ref()) { + if url.scheme() == "file" { + let path = url.to_file_path().map_err(|_| { + DeltaTableError::InvalidTableLocation(table_uri.as_ref().to_string()) + })?; + ensure_file_location_exists(path)?; } + } else { + ensure_file_location_exists(PathBuf::from(table_uri.as_ref()))?; } - Ok(DeltaTableBuilder::from_uri(table_uri)) + let url = ensure_table_uri(&table_uri).expect("The specified table_uri is not valid"); + debug!("creating table builder with {url}"); + + Ok(Self { + options: DeltaTableLoadOptions::new(url), + storage_options: None, + allow_http: None, + }) } + /// Sets `require_tombstones=false` to the builder pub fn without_tombstones(mut self) -> Self { self.options.require_tombstones = false; @@ -266,11 +279,22 @@ impl DeltaTableBuilder { /// Build a delta storage backend for the given config pub fn build_storage(self) -> DeltaResult { - config::configure_log_store( - &self.options.table_uri, - self.storage_options(), - self.options.storage_backend, - ) + debug!("build_storage() with {}", &self.options.table_uri); + let location = Url::parse(&self.options.table_uri).map_err(|_| { + DeltaTableError::NotATable(format!( + "Could not turn {} into a URL", + self.options.table_uri + )) + })?; + + if let Some((store, _url)) = self.options.storage_backend.as_ref() { + debug!("Loading a logstore with a custom store: {store:?}"); + crate::logstore::logstore_with(store.clone(), location, self.storage_options()) + } else { + // If there has been no backend defined just default to the normal logstore look up + debug!("Loading a logstore based off the location: {location:?}"); + crate::logstore::logstore_for(location, self.storage_options()) + } } /// Build the [`DeltaTable`] from specified options. 
@@ -299,126 +323,17 @@ impl DeltaTableBuilder { } } -/// Storage option keys to use when creating [crate::storage::s3::S3StorageOptions]. -/// The same key should be used whether passing a key in the hashmap or setting it as an environment variable. -/// Provided keys may include configuration for the S3 backend and also the optional DynamoDb lock used for atomic rename. -pub mod s3_storage_options { - /// Custom S3 endpoint. - pub const AWS_ENDPOINT_URL: &str = "AWS_ENDPOINT_URL"; - /// The AWS region. - pub const AWS_REGION: &str = "AWS_REGION"; - /// The AWS profile. - pub const AWS_PROFILE: &str = "AWS_PROFILE"; - /// The AWS_ACCESS_KEY_ID to use for S3. - pub const AWS_ACCESS_KEY_ID: &str = "AWS_ACCESS_KEY_ID"; - /// The AWS_SECRET_ACCESS_KEY to use for S3. - pub const AWS_SECRET_ACCESS_KEY: &str = "AWS_SECRET_ACCESS_KEY"; - /// The AWS_SESSION_TOKEN to use for S3. - pub const AWS_SESSION_TOKEN: &str = "AWS_SESSION_TOKEN"; - /// Uses either "path" (the default) or "virtual", which turns on - /// [virtual host addressing](http://docs.aws.amazon.com/AmazonS3/latest/dev/VirtualHosting.html). - pub const AWS_S3_ADDRESSING_STYLE: &str = "AWS_S3_ADDRESSING_STYLE"; - /// Locking provider to use for safe atomic rename. - /// `dynamodb` is currently the only supported locking provider. - /// If not set, safe atomic rename is not available. - pub const AWS_S3_LOCKING_PROVIDER: &str = "AWS_S3_LOCKING_PROVIDER"; - /// The role to assume for S3 writes. - pub const AWS_S3_ASSUME_ROLE_ARN: &str = "AWS_S3_ASSUME_ROLE_ARN"; - /// The role session name to use when a role is assumed. If not provided a random session name is generated. - pub const AWS_S3_ROLE_SESSION_NAME: &str = "AWS_S3_ROLE_SESSION_NAME"; - /// The `pool_idle_timeout` option of aws http client. Has to be lower than 20 seconds, which is - /// default S3 server timeout . - /// However, since rusoto uses hyper as a client, its default timeout is 90 seconds - /// . - /// Hence, the `connection closed before message completed` could occur. - /// To avoid that, the default value of this setting is 15 seconds if it's not set otherwise. - pub const AWS_S3_POOL_IDLE_TIMEOUT_SECONDS: &str = "AWS_S3_POOL_IDLE_TIMEOUT_SECONDS"; - /// The `pool_idle_timeout` for the as3_storage_options sts client. See - /// the reasoning in `AWS_S3_POOL_IDLE_TIMEOUT_SECONDS`. - pub const AWS_STS_POOL_IDLE_TIMEOUT_SECONDS: &str = "AWS_STS_POOL_IDLE_TIMEOUT_SECONDS"; - /// The number of retries for S3 GET requests failed with 500 Internal Server Error. - pub const AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES: &str = - "AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES"; - /// The web identity token file to use when using a web identity provider. - /// NOTE: web identity related options are set in the environment when - /// creating an instance of [crate::storage::s3::S3StorageOptions]. - /// See also . - pub const AWS_WEB_IDENTITY_TOKEN_FILE: &str = "AWS_WEB_IDENTITY_TOKEN_FILE"; - /// The role name to use for web identity. - /// NOTE: web identity related options are set in the environment when - /// creating an instance of [crate::storage::s3::S3StorageOptions]. - /// See also . - pub const AWS_ROLE_ARN: &str = "AWS_ROLE_ARN"; - /// The role session name to use for web identity. - /// NOTE: web identity related options are set in the environment when - /// creating an instance of [crate::storage::s3::S3StorageOptions]. - /// See also . 
- pub const AWS_ROLE_SESSION_NAME: &str = "AWS_ROLE_SESSION_NAME"; - /// Allow http connections - mainly useful for integration tests - pub const AWS_ALLOW_HTTP: &str = "AWS_ALLOW_HTTP"; - - /// If set to "true", allows creating commits without concurrent writer protection. - /// Only safe if there is one writer to a given table. - pub const AWS_S3_ALLOW_UNSAFE_RENAME: &str = "AWS_S3_ALLOW_UNSAFE_RENAME"; - - /// The list of option keys owned by the S3 module. - /// Option keys not contained in this list will be added to the `extra_opts` - /// field of [crate::storage::s3::S3StorageOptions]. - pub const S3_OPTS: &[&str] = &[ - AWS_ENDPOINT_URL, - AWS_REGION, - AWS_PROFILE, - AWS_ACCESS_KEY_ID, - AWS_SECRET_ACCESS_KEY, - AWS_SESSION_TOKEN, - AWS_S3_LOCKING_PROVIDER, - AWS_S3_ASSUME_ROLE_ARN, - AWS_S3_ROLE_SESSION_NAME, - AWS_WEB_IDENTITY_TOKEN_FILE, - AWS_ROLE_ARN, - AWS_ROLE_SESSION_NAME, - AWS_S3_POOL_IDLE_TIMEOUT_SECONDS, - AWS_STS_POOL_IDLE_TIMEOUT_SECONDS, - AWS_S3_GET_INTERNAL_SERVER_ERROR_RETRIES, - ]; -} - -#[allow(dead_code)] -pub(crate) fn str_option(map: &HashMap, key: &str) -> Option { - map.get(key) - .map_or_else(|| std::env::var(key).ok(), |v| Some(v.to_owned())) -} - -lazy_static::lazy_static! { - static ref KNOWN_SCHEMES: Vec<&'static str> = - Vec::from([ - "file", "memory", "az", "abfs", "abfss", "azure", "wasb", "wasbs", "adl", "s3", "s3a", - "gs", "hdfs", "https", "http", - ]); -} - -/// Utility function to figure out whether string representation of the path -/// is either local path or some kind or URL. -/// -/// Will return an error if the path is not valid. -fn resolve_uri_type(table_uri: impl AsRef) -> DeltaResult { - let table_uri = table_uri.as_ref(); - - if let Ok(url) = Url::parse(table_uri) { - if url.scheme() == "file" { - Ok(UriType::LocalPath(url.to_file_path().map_err(|err| { - let msg = format!("Invalid table location: {}\nError: {:?}", table_uri, err); - DeltaTableError::InvalidTableLocation(msg) - })?)) - // NOTE this check is required to support absolute windows paths which may properly parse as url - } else if KNOWN_SCHEMES.contains(&url.scheme()) { - Ok(UriType::Url(url)) - } else { - Ok(UriType::LocalPath(PathBuf::from(table_uri))) - } - } else { - Ok(UriType::LocalPath(PathBuf::from(table_uri))) +fn create_filetree_from_path(path: &PathBuf) -> DeltaResult<()> { + if !path.exists() { + std::fs::create_dir_all(path).map_err(|err| { + let msg = format!( + "Could not create local directory: {:?}\nError: {:?}", + path, err + ); + DeltaTableError::InvalidTableLocation(msg) + })?; } + Ok(()) } /// Attempt to create a Url from given table location. @@ -435,41 +350,54 @@ fn resolve_uri_type(table_uri: impl AsRef) -> DeltaResult { pub fn ensure_table_uri(table_uri: impl AsRef) -> DeltaResult { let table_uri = table_uri.as_ref(); - let uri_type: UriType = resolve_uri_type(table_uri)?; - - // If it is a local path, we need to create it if it does not exist. 
- let mut url = match uri_type { - UriType::LocalPath(path) => { - if !path.exists() { - std::fs::create_dir_all(&path).map_err(|err| { - let msg = format!( - "Could not create local directory: {}\nError: {:?}", - table_uri, err - ); - DeltaTableError::InvalidTableLocation(msg) - })?; + debug!("ensure_table_uri {table_uri}"); + let mut url = match Url::parse(table_uri) { + Ok(url) => { + if url.scheme() == "file" { + create_filetree_from_path( + &url.to_file_path() + .expect("Failed to convert a file:// URL to a file path"), + )?; } - let path = std::fs::canonicalize(path).map_err(|err| { - let msg = format!("Invalid table location: {}\nError: {:?}", table_uri, err); + Ok(url) + } + Err(_) => { + let path = PathBuf::from(table_uri); + create_filetree_from_path(&path)?; + let path = std::fs::canonicalize(path.clone()).map_err(|err| { + let msg = format!("Invalid table location: {:?}\nError: {:?}", path, err); DeltaTableError::InvalidTableLocation(msg) })?; - Url::from_directory_path(path).map_err(|_| { + + Url::from_directory_path(path.clone()).map_err(|_| { let msg = format!( - "Could not construct a URL from canonicalized path: {}.\n\ + "Could not construct a URL from canonicalized path: {:?}.\n\ Something must be very wrong with the table path.", - table_uri + path, ); DeltaTableError::InvalidTableLocation(msg) - })? + }) } - UriType::Url(url) => url, - }; + }?; let trimmed_path = url.path().trim_end_matches('/').to_owned(); url.set_path(&trimmed_path); Ok(url) } +/// Validate that the given [PathBuf] does exist, otherwise return a +/// [DeltaTableError::InvalidTableLocation] +fn ensure_file_location_exists(path: PathBuf) -> DeltaResult<()> { + if !path.exists() { + let msg = format!( + "Local path \"{}\" does not exist or you don't have access!", + path.as_path().display(), + ); + return Err(DeltaTableError::InvalidTableLocation(msg)); + } + Ok(()) +} + #[cfg(test)] mod tests { use super::*; @@ -483,6 +411,8 @@ mod tests { assert!(uri.is_ok()); let _uri = ensure_table_uri("./nonexistent"); assert!(uri.is_ok()); + let uri = ensure_table_uri("file:///tmp/nonexistent/some/path"); + assert!(uri.is_ok()); let uri = ensure_table_uri("s3://container/path"); assert!(uri.is_ok()); @@ -567,7 +497,7 @@ mod tests { #[test] fn test_ensure_table_uri_url() { // Urls should round trips as-is - let expected = Url::parse("s3://tests/data/delta-0.8.0").unwrap(); + let expected = Url::parse("s3://deltalake-test/tests/data/delta-0.8.0").unwrap(); let url = ensure_table_uri(&expected).unwrap(); assert_eq!(expected, url); @@ -581,7 +511,7 @@ mod tests { #[tokio::test] async fn read_delta_table_ignoring_tombstones() { - let table = DeltaTableBuilder::from_uri("./tests/data/delta-0.8.0") + let table = DeltaTableBuilder::from_uri("../deltalake-test/tests/data/delta-0.8.0") .without_tombstones() .load() .await @@ -602,7 +532,7 @@ mod tests { #[tokio::test] async fn read_delta_table_ignoring_files() { - let table = DeltaTableBuilder::from_uri("./tests/data/delta-0.8.0") + let table = DeltaTableBuilder::from_uri("../deltalake-test/tests/data/delta-0.8.0") .without_files() .load() .await @@ -617,7 +547,7 @@ mod tests { #[tokio::test] async fn read_delta_table_with_ignoring_files_on_apply_log() { - let mut table = DeltaTableBuilder::from_uri("./tests/data/delta-0.8.0") + let mut table = DeltaTableBuilder::from_uri("../deltalake-test/tests/data/delta-0.8.0") .with_version(0) .without_files() .load() diff --git a/crates/deltalake-core/src/table/mod.rs b/crates/deltalake-core/src/table/mod.rs index 
94fef6ae1b..018e79ebe9 100644 --- a/crates/deltalake-core/src/table/mod.rs +++ b/crates/deltalake-core/src/table/mod.rs @@ -30,7 +30,6 @@ use crate::partitions::PartitionFilter; use crate::protocol::{ find_latest_check_point_for_version, get_last_checkpoint, ProtocolError, Stats, }; -use crate::storage::config::configure_log_store; use crate::storage::{commit_uri_from_version, ObjectStoreRef}; pub mod builder; @@ -347,12 +346,9 @@ impl<'de> Deserialize<'de> for DeltaTable { let storage_config: LogStoreConfig = seq .next_element()? .ok_or_else(|| A::Error::invalid_length(0, &self))?; - let log_store = configure_log_store( - storage_config.location.as_ref(), - storage_config.options, - None, - ) - .map_err(|_| A::Error::custom("Failed deserializing LogStore"))?; + let log_store = + crate::logstore::logstore_for(storage_config.location, storage_config.options) + .map_err(|_| A::Error::custom("Failed deserializing LogStore"))?; let last_check_point = seq .next_element()? .ok_or_else(|| A::Error::invalid_length(0, &self))?; @@ -474,7 +470,7 @@ impl DeltaTable { Ok(current_delta_log_ver) } - #[cfg(any(feature = "parquet", feature = "parquet2"))] + #[cfg(feature = "parquet")] async fn restore_checkpoint(&mut self, check_point: CheckPoint) -> Result<(), DeltaTableError> { self.state = DeltaTableState::from_checkpoint(self, &check_point).await?; @@ -516,7 +512,7 @@ impl DeltaTable { /// Updates the DeltaTable to the most recent state committed to the transaction log by /// loading the last checkpoint and incrementally applying each version since. - #[cfg(any(feature = "parquet", feature = "parquet2"))] + #[cfg(feature = "parquet")] pub async fn update(&mut self) -> Result<(), DeltaTableError> { match get_last_checkpoint(self.log_store.as_ref()).await { Ok(last_check_point) => { @@ -538,7 +534,7 @@ impl DeltaTable { } /// Updates the DeltaTable to the most recent state committed to the transaction log. - #[cfg(not(any(feature = "parquet", feature = "parquet2")))] + #[cfg(not(feature = "parquet"))] pub async fn update(&mut self) -> Result<(), DeltaTableError> { self.update_incremental(None).await } @@ -615,7 +611,7 @@ impl DeltaTable { } // 1. find latest checkpoint below version - #[cfg(any(feature = "parquet", feature = "parquet2"))] + #[cfg(feature = "parquet")] match find_latest_check_point_for_version(self.log_store.as_ref(), version).await? 
{ Some(check_point) => { self.restore_checkpoint(check_point).await?; @@ -940,8 +936,6 @@ mod tests { use super::*; use crate::kernel::{DataType, PrimitiveType, StructField}; use crate::operations::create::CreateBuilder; - #[cfg(any(feature = "s3", feature = "s3-native-tls"))] - use crate::table::builder::DeltaTableBuilder; #[tokio::test] async fn table_round_trip() { @@ -981,6 +975,7 @@ mod tests { drop(tmp_dir); } + /* TODO move into deltalake-aws crate #[cfg(any(feature = "s3", feature = "s3-native-tls"))] #[test] fn normalize_table_uri_s3() { @@ -992,10 +987,11 @@ mod tests { ] .iter() { - let table = DeltaTableBuilder::from_uri(table_uri).build().unwrap(); + let table = crate::DeltaTableBuilder::from_uri(table_uri).build().unwrap(); assert_eq!(table.table_uri(), "s3://tests/data/delta-0.8.0"); } } + */ #[test] fn get_table_constraints() { diff --git a/crates/deltalake-core/src/table/state.rs b/crates/deltalake-core/src/table/state.rs index fa9078997c..1aaf095d4f 100644 --- a/crates/deltalake-core/src/table/state.rs +++ b/crates/deltalake-core/src/table/state.rs @@ -20,7 +20,7 @@ use crate::storage::commit_uri_from_version; use crate::table::DeltaTableMetaData; use crate::DeltaTable; -#[cfg(any(feature = "parquet", feature = "parquet2"))] +#[cfg(feature = "parquet")] use super::{CheckPoint, DeltaTableConfig}; /// State snapshot currently held by the Delta Table instance. @@ -92,58 +92,34 @@ impl DeltaTableState { } /// Update DeltaTableState with checkpoint data. - #[cfg(any(feature = "parquet", feature = "parquet2"))] + #[cfg(feature = "parquet")] pub fn process_checkpoint_bytes( &mut self, data: bytes::Bytes, table_config: &DeltaTableConfig, ) -> Result<(), DeltaTableError> { - #[cfg(feature = "parquet")] - { - use parquet::file::reader::{FileReader, SerializedFileReader}; - - let preader = SerializedFileReader::new(data)?; - let schema = preader.metadata().file_metadata().schema(); - if !schema.is_group() { - return Err(DeltaTableError::from(ProtocolError::Generic( - "Action record in checkpoint should be a struct".to_string(), - ))); - } - for record in preader.get_row_iter(None)? { - self.process_action( - Action::from_parquet_record(schema, &record.unwrap())?, - table_config.require_tombstones, - table_config.require_files, - )?; - } + use parquet::file::reader::{FileReader, SerializedFileReader}; + + let preader = SerializedFileReader::new(data)?; + let schema = preader.metadata().file_metadata().schema(); + if !schema.is_group() { + return Err(DeltaTableError::from(ProtocolError::Generic( + "Action record in checkpoint should be a struct".to_string(), + ))); } - - #[cfg(feature = "parquet2")] - { - use crate::protocol::parquet2_read::actions_from_row_group; - use parquet2::read::read_metadata; - - let mut reader = std::io::Cursor::new(data); - let metadata = read_metadata(&mut reader)?; - - for row_group in metadata.row_groups { - for action in - actions_from_row_group(row_group, &mut reader).map_err(ProtocolError::from)? - { - self.process_action( - action, - table_config.require_tombstones, - table_config.require_files, - )?; - } - } + for record in preader.get_row_iter(None)? { + self.process_action( + Action::from_parquet_record(schema, &record.unwrap())?, + table_config.require_tombstones, + table_config.require_files, + )?; } Ok(()) } /// Construct a delta table state object from checkpoint. 
- #[cfg(any(feature = "parquet", feature = "parquet2"))] + #[cfg(feature = "parquet")] pub async fn from_checkpoint( table: &DeltaTable, check_point: &CheckPoint, diff --git a/crates/deltalake-core/tests/checkpoint_writer.rs b/crates/deltalake-core/tests/checkpoint_writer.rs index ca6fbccf97..72b39b0878 100644 --- a/crates/deltalake-core/tests/checkpoint_writer.rs +++ b/crates/deltalake-core/tests/checkpoint_writer.rs @@ -14,7 +14,7 @@ mod simple_checkpoint { #[tokio::test] async fn simple_checkpoint_test() { - let table_location = "./tests/data/checkpoints"; + let table_location = "../deltalake-test/tests/data/checkpoints"; let table_path = PathBuf::from(table_location); let log_path = table_path.join("_delta_log"); @@ -97,7 +97,7 @@ mod delete_expired_delta_log_in_checkpoint { #[tokio::test] async fn test_delete_expired_logs() { let mut table = fs_common::create_table( - "./tests/data/checkpoints_with_expired_logs/expired", + "../deltalake-test/tests/data/checkpoints_with_expired_logs/expired", Some(hashmap! { DeltaConfigKey::LogRetentionDuration.as_ref().into() => Some("interval 10 minute".to_string()), DeltaConfigKey::EnableExpiredLogCleanup.as_ref().into() => Some("true".to_string()) @@ -161,7 +161,7 @@ mod delete_expired_delta_log_in_checkpoint { #[tokio::test] async fn test_not_delete_expired_logs() { let mut table = fs_common::create_table( - "./tests/data/checkpoints_with_expired_logs/not_delete_expired", + "../deltalake-test/tests/data/checkpoints_with_expired_logs/not_delete_expired", Some(hashmap! { DeltaConfigKey::LogRetentionDuration.as_ref().into() => Some("interval 1 second".to_string()), DeltaConfigKey::EnableExpiredLogCleanup.as_ref().into() => Some("false".to_string()) @@ -237,7 +237,7 @@ mod checkpoints_with_tombstones { #[tokio::test] async fn test_expired_tombstones() { - let mut table = fs_common::create_table("./tests/data/checkpoints_tombstones/expired", Some(hashmap! { + let mut table = fs_common::create_table("../deltalake-test/tests/data/checkpoints_tombstones/expired", Some(hashmap! 
{ DeltaConfigKey::DeletedFileRetentionDuration.as_ref().into() => Some("interval 1 minute".to_string()) })).await; @@ -274,7 +274,7 @@ mod checkpoints_with_tombstones { #[tokio::test] async fn test_checkpoint_with_extended_file_metadata_true() { - let path = "./tests/data/checkpoints_tombstones/metadata_true"; + let path = "../deltalake-test/tests/data/checkpoints_tombstones/metadata_true"; let mut table = fs_common::create_table(path, None).await; let r1 = remove_metadata_true(); let r2 = remove_metadata_true(); @@ -290,7 +290,7 @@ mod checkpoints_with_tombstones { #[tokio::test] async fn test_checkpoint_with_extended_file_metadata_false() { - let path = "./tests/data/checkpoints_tombstones/metadata_false"; + let path = "../deltalake-test/tests/data/checkpoints_tombstones/metadata_false"; let mut table = fs_common::create_table(path, None).await; let r1 = remove_metadata_true(); let r2 = remove_metadata_false(); @@ -313,7 +313,7 @@ mod checkpoints_with_tombstones { #[tokio::test] async fn test_checkpoint_with_extended_file_metadata_broken() { - let path = "./tests/data/checkpoints_tombstones/metadata_broken"; + let path = "../deltalake-test/tests/data/checkpoints_tombstones/metadata_broken"; let mut table = fs_common::create_table(path, None).await; let r1 = remove_metadata_broken(); let r2 = remove_metadata_false(); diff --git a/crates/deltalake-core/tests/command_filesystem_check.rs b/crates/deltalake-core/tests/command_filesystem_check.rs index ac11c5d376..8d0eee6ac6 100644 --- a/crates/deltalake-core/tests/command_filesystem_check.rs +++ b/crates/deltalake-core/tests/command_filesystem_check.rs @@ -1,52 +1,19 @@ #![cfg(feature = "integration_test")] -use deltalake_core::test_utils::{ - set_env_if_not_set, IntegrationContext, StorageIntegration, TestResult, TestTables, -}; use deltalake_core::Path; use deltalake_core::{errors::DeltaTableError, DeltaOps}; +use deltalake_test::utils::*; use serial_test::serial; -mod common; - #[tokio::test] #[serial] async fn test_filesystem_check_local() -> TestResult { - test_filesystem_check(StorageIntegration::Local).await -} - -#[cfg(any(feature = "s3", feature = "s3-native-tls"))] -#[tokio::test] -#[serial] -async fn test_filesystem_check_aws() -> TestResult { - set_env_if_not_set("AWS_S3_ALLOW_UNSAFE_RENAME", "true"); - set_env_if_not_set("AWS_S3_LOCKING_PROVIDER", "none"); - test_filesystem_check(StorageIntegration::Amazon).await -} - -#[cfg(feature = "azure")] -#[tokio::test] -#[serial] -async fn test_filesystem_check_azure() -> TestResult { - test_filesystem_check(StorageIntegration::Microsoft).await -} - -#[cfg(feature = "gcs")] -#[tokio::test] -#[serial] -async fn test_filesystem_check_gcp() -> TestResult { - test_filesystem_check(StorageIntegration::Google).await -} - -#[cfg(feature = "hdfs")] -#[tokio::test] -#[serial] -async fn test_filesystem_check_hdfs() -> TestResult { - Ok(test_filesystem_check(StorageIntegration::Hdfs).await?) 
+ let storage = Box::new(LocalStorageIntegration::default()); + let context = IntegrationContext::new(storage)?; + test_filesystem_check(&context).await } -async fn test_filesystem_check(storage: StorageIntegration) -> TestResult { - let context = IntegrationContext::new(storage)?; +async fn test_filesystem_check(context: &IntegrationContext) -> TestResult { context.load_table(TestTables::Simple).await?; let file = "part-00000-2befed33-c358-4768-a43c-3eda0d2a499d-c000.snappy.parquet"; let path = Path::from_iter([&TestTables::Simple.as_name(), file]); @@ -88,7 +55,7 @@ async fn test_filesystem_check(storage: StorageIntegration) -> TestResult { #[tokio::test] #[serial] async fn test_filesystem_check_partitioned() -> TestResult { - let storage = StorageIntegration::Local; + let storage = Box::new(LocalStorageIntegration::default()); let context = IntegrationContext::new(storage)?; context .load_table(TestTables::Delta0_8_0Partitioned) @@ -122,7 +89,8 @@ async fn test_filesystem_check_partitioned() -> TestResult { #[serial] async fn test_filesystem_check_fails_for_concurrent_delete() -> TestResult { // Validate failure when a non dry only executes on the latest version - let context = IntegrationContext::new(StorageIntegration::Local)?; + let storage = Box::new(LocalStorageIntegration::default()); + let context = IntegrationContext::new(storage)?; context.load_table(TestTables::Simple).await?; let file = "part-00003-53f42606-6cda-4f13-8d07-599a21197296-c000.snappy.parquet"; let path = Path::from_iter([&TestTables::Simple.as_name(), file]); @@ -144,34 +112,3 @@ async fn test_filesystem_check_fails_for_concurrent_delete() -> TestResult { Ok(()) } - -#[tokio::test] -#[serial] -#[ignore = "should this actually fail? with conflict resolution, we are re-trying again."] -async fn test_filesystem_check_outdated() -> TestResult { - // Validate failure when a non dry only executes on the latest version - let context = IntegrationContext::new(StorageIntegration::Local)?; - context.load_table(TestTables::Simple).await?; - let file = "part-00007-3a0e4727-de0d-41b6-81ef-5223cf40f025-c000.snappy.parquet"; - let path = Path::from_iter([&TestTables::Simple.as_name(), file]); - - // Delete an active file from underlying storage without an update to the log to simulate an external fault - context.object_store().delete(&path).await?; - - let table = context - .table_builder(TestTables::Simple) - .with_version(2) - .load() - .await?; - - let op = DeltaOps::from(table); - let res = op.filesystem_check().with_dry_run(false).await; - println!("{:?}", res); - if let Err(DeltaTableError::VersionAlreadyExists(version)) = res { - assert!(version == 3); - } else { - panic!(); - } - - Ok(()) -} diff --git a/crates/deltalake-core/tests/command_vacuum.rs b/crates/deltalake-core/tests/command_vacuum.rs index 51ff3217b3..54ec03cdb2 100644 --- a/crates/deltalake-core/tests/command_vacuum.rs +++ b/crates/deltalake-core/tests/command_vacuum.rs @@ -1,15 +1,13 @@ use chrono::Duration; -use common::clock::TestClock; -use common::TestContext; use deltalake_core::kernel::StructType; use deltalake_core::operations::vacuum::Clock; use deltalake_core::operations::DeltaOps; +use deltalake_test::clock::TestClock; +use deltalake_test::*; use object_store::{path::Path, Error as ObjectStoreError, ObjectStore}; use serde_json::json; use std::sync::Arc; -mod common; - /// Basic schema pub fn get_xy_date_schema() -> StructType { serde_json::from_value(json!({ @@ -51,7 +49,7 @@ async fn test_non_partitioned_table() { ]; for path in paths { - 
common::add_file( + add_file( &mut table, &path, "random junk".as_bytes().into(), @@ -64,7 +62,7 @@ async fn test_non_partitioned_table() { clock.tick(Duration::seconds(10)); - common::remove_file( + remove_file( &mut table, "delete_me.parquet", &[], @@ -103,7 +101,7 @@ async fn test_partitioned_table() { let partition_values = [("date", Some("2022-07-03")), ("x", Some("2"))]; for path in paths { - common::add_file( + add_file( &mut table, &path, "random junk".as_bytes().into(), @@ -116,7 +114,7 @@ async fn test_partitioned_table() { clock.tick(Duration::seconds(10)); - common::remove_file( + remove_file( &mut table, "date=2022-07-03/x=2/delete_me.parquet", &partition_values, @@ -168,7 +166,7 @@ async fn test_partitions_included() { let partition_values = &[("_date", Some("2022-07-03"))]; for path in paths { - common::add_file( + add_file( &mut table, &path, "random junk".as_bytes().into(), @@ -181,7 +179,7 @@ async fn test_partitions_included() { clock.tick(Duration::seconds(10)); - common::remove_file( + remove_file( &mut table, "_date=2022-07-03/delete_me.parquet", partition_values, @@ -247,7 +245,7 @@ async fn test_non_managed_files() { ]; for path in paths_delete.iter().chain(paths_ignore.iter()) { - common::add_file( + add_file( &mut table, path, "random junk".as_bytes().into(), diff --git a/crates/deltalake-core/tests/common/adls.rs b/crates/deltalake-core/tests/common/adls.rs deleted file mode 100644 index 4c441e0325..0000000000 --- a/crates/deltalake-core/tests/common/adls.rs +++ /dev/null @@ -1,93 +0,0 @@ -use super::TestContext; -use chrono::Utc; -use rand::Rng; -use std::collections::HashMap; - -pub struct AzureGen2 { - #[allow(dead_code)] - account_name: String, - #[allow(dead_code)] - account_key: String, - file_system_name: String, -} - -impl Drop for AzureGen2 { - fn drop(&mut self) { - let file_system_name = self.file_system_name.clone(); - az_cli::delete_container(file_system_name).unwrap(); - } -} - -pub async fn setup_azure_gen2_context() -> TestContext { - let mut config = HashMap::new(); - - let storage_account_name = std::env::var("AZURE_STORAGE_ACCOUNT_NAME").unwrap(); - let storage_account_key = std::env::var("AZURE_STORAGE_ACCOUNT_KEY").unwrap(); - let storage_container_name = - std::env::var("AZURE_STORAGE_CONTAINER_NAME").unwrap_or_else(|_| "deltars".to_string()); - - let rand: u16 = rand::thread_rng().gen(); - let file_system_name = format!("delta-rs-test-{}-{}", Utc::now().timestamp(), rand); - - az_cli::create_container(&file_system_name).unwrap(); - - let table_uri = format!("azure://{file_system_name}/"); - - config.insert("URI".to_string(), table_uri); - config.insert( - "AZURE_STORAGE_ACCOUNT_NAME".to_string(), - storage_account_name.clone(), - ); - config.insert( - "AZURE_STORAGE_ACCOUNT_KEY".to_string(), - storage_account_key.clone(), - ); - config.insert( - "AZURE_STORAGE_CONTAINER_NAME".to_string(), - storage_container_name, - ); - - TestContext { - storage_context: Some(Box::new(AzureGen2 { - account_name: storage_account_name, - account_key: storage_account_key, - file_system_name, - })), - config, - ..TestContext::default() - } -} - -pub mod az_cli { - use std::process::{Command, ExitStatus}; - - /// Create a new bucket - pub fn create_container(container_name: impl AsRef) -> std::io::Result { - let mut child = Command::new("az") - .args([ - "storage", - "container", - "create", - "-n", - container_name.as_ref(), - ]) - .spawn() - .expect("az command is installed"); - child.wait() - } - - /// delete bucket - pub fn delete_container(container_name: 
impl AsRef) -> std::io::Result { - let mut child = Command::new("az") - .args([ - "storage", - "container", - "delete", - "-n", - container_name.as_ref(), - ]) - .spawn() - .expect("az command is installed"); - child.wait() - } -} diff --git a/crates/deltalake-core/tests/common/hdfs.rs b/crates/deltalake-core/tests/common/hdfs.rs deleted file mode 100644 index 8da5ef83b6..0000000000 --- a/crates/deltalake-core/tests/common/hdfs.rs +++ /dev/null @@ -1,20 +0,0 @@ -use super::TestContext; -use std::collections::HashMap; - -pub struct Hdfs { - name_node: String, -} - -pub fn setup_hdfs_context() -> TestContext { - let mut config = HashMap::new(); - - let name_node = "hdfs://localhost:9000".to_owned(); - - config.insert("URI".to_owned(), name_node.clone()); - - TestContext { - storage_context: Some(Box::new(Hdfs { name_node })), - config, - ..TestContext::default() - } -} diff --git a/crates/deltalake-core/tests/fs_common/mod.rs b/crates/deltalake-core/tests/fs_common/mod.rs index c79fc833da..ebd4d50b88 100644 --- a/crates/deltalake-core/tests/fs_common/mod.rs +++ b/crates/deltalake-core/tests/fs_common/mod.rs @@ -5,7 +5,6 @@ use deltalake_core::kernel::{ use deltalake_core::operations::create::CreateBuilder; use deltalake_core::operations::transaction::commit; use deltalake_core::protocol::{DeltaOperation, SaveMode}; -use deltalake_core::storage::config::configure_store; use deltalake_core::storage::{GetResult, ObjectStoreResult}; use deltalake_core::DeltaTable; use object_store::path::Path as StorePath; @@ -37,7 +36,7 @@ pub async fn create_table_from_json( partition_columns: Vec<&str>, config: Value, ) -> DeltaTable { - assert!(path.starts_with("./tests/data")); + assert!(path.starts_with("../deltalake-test/tests/data")); std::fs::create_dir_all(path).unwrap(); std::fs::remove_dir_all(path).unwrap(); std::fs::create_dir_all(path).unwrap(); @@ -144,15 +143,13 @@ impl std::fmt::Display for SlowStore { } } -#[allow(dead_code)] impl SlowStore { pub fn new( location: Url, - options: impl Into + Clone, + _options: impl Into + Clone, ) -> deltalake_core::DeltaResult { - let mut options = options.into(); Ok(Self { - inner: configure_store(&location, &mut options).unwrap(), + inner: deltalake_core::storage::store_for(&location)?, }) } } diff --git a/crates/deltalake-core/tests/integration_checkpoint.rs b/crates/deltalake-core/tests/integration_checkpoint.rs index 768b1172db..ce4525ba83 100644 --- a/crates/deltalake-core/tests/integration_checkpoint.rs +++ b/crates/deltalake-core/tests/integration_checkpoint.rs @@ -3,9 +3,9 @@ use chrono::Utc; use deltalake_core::checkpoints::{cleanup_expired_logs_for, create_checkpoint}; use deltalake_core::kernel::{DataType, PrimitiveType}; -use deltalake_core::test_utils::{IntegrationContext, StorageIntegration, TestResult}; use deltalake_core::writer::{DeltaWriter, JsonWriter}; use deltalake_core::{errors::DeltaResult, DeltaOps, DeltaTableBuilder, ObjectStore}; +use deltalake_test::utils::*; use object_store::path::Path; use serde_json::json; use serial_test::serial; @@ -15,43 +15,8 @@ use tokio::time::sleep; #[tokio::test] #[serial] async fn cleanup_metadata_fs_test() -> TestResult { - let context = IntegrationContext::new(StorageIntegration::Local)?; - cleanup_metadata_test(&context).await?; - Ok(()) -} - -#[cfg(any(feature = "s3", feature = "s3-native-tls"))] -#[tokio::test] -#[serial] -async fn cleanup_metadata_aws_test() -> TestResult { - let context = IntegrationContext::new(StorageIntegration::Amazon)?; - cleanup_metadata_test(&context).await?; - Ok(()) -} 
- -#[cfg(feature = "azure")] -#[tokio::test] -#[serial] -async fn cleanup_metadata_azure_test() -> TestResult { - let context = IntegrationContext::new(StorageIntegration::Microsoft)?; - cleanup_metadata_test(&context).await?; - Ok(()) -} - -#[cfg(feature = "gcs")] -#[tokio::test] -#[serial] -async fn cleanup_metadata_gcp_test() -> TestResult { - let context = IntegrationContext::new(StorageIntegration::Google)?; - cleanup_metadata_test(&context).await?; - Ok(()) -} - -#[cfg(feature = "hdfs")] -#[tokio::test] -#[serial] -async fn cleanup_metadata_hdfs_test() -> TestResult { - let context = IntegrationContext::new(StorageIntegration::Hdfs)?; + let storage = Box::new(LocalStorageIntegration::default()); + let context = IntegrationContext::new(storage)?; cleanup_metadata_test(&context).await?; Ok(()) } diff --git a/crates/deltalake-core/tests/integration_concurrent_writes.rs b/crates/deltalake-core/tests/integration_concurrent_writes.rs index f57167f2c1..4e66a9f93f 100644 --- a/crates/deltalake-core/tests/integration_concurrent_writes.rs +++ b/crates/deltalake-core/tests/integration_concurrent_writes.rs @@ -1,11 +1,13 @@ #![cfg(feature = "integration_test")] +use log::*; + use deltalake_core::kernel::{Action, Add, DataType, PrimitiveType, StructField, StructType}; use deltalake_core::operations::transaction::commit; use deltalake_core::operations::DeltaOps; use deltalake_core::protocol::{DeltaOperation, SaveMode}; -use deltalake_core::test_utils::{IntegrationContext, StorageIntegration, TestResult, TestTables}; use deltalake_core::{DeltaTable, DeltaTableBuilder}; +use deltalake_test::utils::*; use serial_test::serial; use std::collections::HashMap; use std::future::Future; @@ -15,38 +17,13 @@ use std::time::Duration; #[tokio::test] #[serial] async fn test_concurrent_writes_local() -> TestResult { - test_concurrent_writes(StorageIntegration::Local).await?; - Ok(()) -} - -#[cfg(feature = "s3")] -#[tokio::test] -#[serial] -async fn concurrent_writes_s3() -> TestResult { - test_concurrent_writes(StorageIntegration::Amazon).await?; - Ok(()) -} - -#[cfg(feature = "azure")] -#[tokio::test] -#[serial] -async fn test_concurrent_writes_azure() -> TestResult { - test_concurrent_writes(StorageIntegration::Microsoft).await?; - Ok(()) -} - -// tracked via https://github.com/datafusion-contrib/datafusion-objectstore-hdfs/issues/13 -#[ignore] -#[cfg(feature = "hdfs")] -#[tokio::test] -#[serial] -async fn test_concurrent_writes_hdfs() -> TestResult { - test_concurrent_writes(StorageIntegration::Hdfs).await?; + let storage = Box::new(LocalStorageIntegration::default()); + let context = IntegrationContext::new(storage)?; + test_concurrent_writes(&context).await?; Ok(()) } -async fn test_concurrent_writes(integration: StorageIntegration) -> TestResult { - let context = IntegrationContext::new(integration)?; +async fn test_concurrent_writes(context: &IntegrationContext) -> TestResult { let (_table, table_uri) = prepare_table(&context).await?; run_test(|name| Worker::new(&table_uri, name)).await; Ok(()) diff --git a/crates/deltalake-core/tests/integration_datafusion.rs b/crates/deltalake-core/tests/integration_datafusion.rs index 51be591071..25a3fddbce 100644 --- a/crates/deltalake-core/tests/integration_datafusion.rs +++ b/crates/deltalake-core/tests/integration_datafusion.rs @@ -1,8 +1,8 @@ #![cfg(all(feature = "integration_test", feature = "datafusion"))] use arrow::array::Int64Array; -use common::datafusion::context_with_delta_table_factory; -use deltalake_core::test_utils::{IntegrationContext, 
StorageIntegration, TestResult, TestTables}; +use deltalake_test::datafusion::*; +use deltalake_test::utils::*; use serial_test::serial; use std::collections::{HashMap, HashSet}; @@ -32,19 +32,17 @@ use url::Url; use deltalake_core::delta_datafusion::{DeltaPhysicalCodec, DeltaScan}; use deltalake_core::kernel::{DataType, MapType, PrimitiveType, StructField, StructType}; +use deltalake_core::logstore::logstore_for; use deltalake_core::operations::create::CreateBuilder; use deltalake_core::protocol::SaveMode; use deltalake_core::writer::{DeltaWriter, RecordBatchWriter}; use deltalake_core::{ open_table, operations::{write::WriteBuilder, DeltaOps}, - storage::config::configure_log_store, DeltaTable, DeltaTableError, }; use std::error::Error; -mod common; - mod local { use datafusion::common::stats::Precision; use deltalake_core::writer::JsonWriter; @@ -53,7 +51,9 @@ mod local { #[tokio::test] #[serial] async fn test_datafusion_local() -> TestResult { - test_datafusion(StorageIntegration::Local).await + let storage = Box::new(LocalStorageIntegration::default()); + let context = IntegrationContext::new(storage)?; + test_datafusion(&context).await } fn get_scanned_files(node: &dyn ExecutionPlan) -> HashSet