From 95e59d4362ef55fd3657314426d3a899bf0f7511 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Mon, 21 Oct 2024 08:49:23 +0200 Subject: [PATCH 01/32] find manifest --- Cargo.lock | 5 +- iceberg-rust-spec/src/spec/values.rs | 49 ++ iceberg-rust/Cargo.toml | 1 + iceberg-rust/src/lib.rs | 1 + .../src/table/transaction/operation.rs | 487 +++++++++++------- iceberg-rust/src/util/mod.rs | 135 +++++ 6 files changed, 501 insertions(+), 177 deletions(-) create mode 100644 iceberg-rust/src/util/mod.rs diff --git a/Cargo.lock b/Cargo.lock index d30cb145..a9abb613 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2452,6 +2452,7 @@ dependencies = [ "serde", "serde_derive", "serde_json", + "smallvec", "sqlparser", "thiserror", "thrift", @@ -4294,9 +4295,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.13.1" +version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" dependencies = [ "serde", ] diff --git a/iceberg-rust-spec/src/spec/values.rs b/iceberg-rust-spec/src/spec/values.rs index 402d6153..7bef7b90 100644 --- a/iceberg-rust-spec/src/spec/values.rs +++ b/iceberg-rust-spec/src/spec/values.rs @@ -817,6 +817,55 @@ mod datetime { } } +pub trait TryAdd: Sized { + fn try_add(&self, other: &Self) -> Result; +} +pub trait TrySub: Sized { + fn try_sub(&self, other: &Self) -> Result; +} + +impl TryAdd for Value { + fn try_add(&self, other: &Self) -> Result { + match (self, other) { + (Value::Int(own), Value::Int(other)) => Ok(Value::Int(own + other)), + (Value::LongInt(own), Value::LongInt(other)) => Ok(Value::LongInt(own + other)), + (Value::Float(own), Value::Float(other)) => Ok(Value::Float(*own + *other)), + (Value::Double(own), Value::Double(other)) => Ok(Value::Double(*own + *other)), + (Value::Date(own), Value::Date(other)) => Ok(Value::Date(own + other)), + (Value::Time(own), Value::Time(other)) => Ok(Value::Time(own + other)), + (Value::Timestamp(own), Value::Timestamp(other)) => Ok(Value::Timestamp(own + other)), + (Value::TimestampTZ(own), Value::TimestampTZ(other)) => { + Ok(Value::TimestampTZ(own + other)) + } + (x, y) => Err(Error::Type( + x.datatype().to_string(), + y.datatype().to_string(), + )), + } + } +} + +impl TrySub for Value { + fn try_sub(&self, other: &Self) -> Result { + match (self, other) { + (Value::Int(own), Value::Int(other)) => Ok(Value::Int(own - other)), + (Value::LongInt(own), Value::LongInt(other)) => Ok(Value::LongInt(own - other)), + (Value::Float(own), Value::Float(other)) => Ok(Value::Float(*own - *other)), + (Value::Double(own), Value::Double(other)) => Ok(Value::Double(*own - *other)), + (Value::Date(own), Value::Date(other)) => Ok(Value::Date(own - other)), + (Value::Time(own), Value::Time(other)) => Ok(Value::Time(own - other)), + (Value::Timestamp(own), Value::Timestamp(other)) => Ok(Value::Timestamp(own - other)), + (Value::TimestampTZ(own), Value::TimestampTZ(other)) => { + Ok(Value::TimestampTZ(own - other)) + } + (x, y) => Err(Error::Type( + x.datatype().to_string(), + y.datatype().to_string(), + )), + } + } +} + #[cfg(test)] mod tests { diff --git a/iceberg-rust/Cargo.toml b/iceberg-rust/Cargo.toml index 4c62f207..4510c49b 100644 --- a/iceberg-rust/Cargo.toml +++ b/iceberg-rust/Cargo.toml @@ -30,4 +30,5 @@ thrift = { version = "0.17.0", default-features = false } thiserror = { workspace = true } derive-getters = { workspace = true } iceberg-rust-spec = { path = "../iceberg-rust-spec", version = "0.5.8" } +smallvec = { version = "1.13.2", features = ["const_generics"] } diff --git a/iceberg-rust/src/lib.rs b/iceberg-rust/src/lib.rs index 8d6fc1b7..b52fd8d8 100644 --- a/iceberg-rust/src/lib.rs +++ b/iceberg-rust/src/lib.rs @@ -10,4 +10,5 @@ pub mod materialized_view; pub mod spec; pub mod sql; pub mod table; +pub(crate) mod util; pub mod view; diff --git a/iceberg-rust/src/table/transaction/operation.rs b/iceberg-rust/src/table/transaction/operation.rs index 2a2e9f36..62696b15 100644 --- a/iceberg-rust/src/table/transaction/operation.rs +++ b/iceberg-rust/src/table/transaction/operation.rs @@ -3,6 +3,7 @@ */ use std::{ + cmp::Ordering, collections::{HashMap, HashSet}, sync::Arc, }; @@ -22,10 +23,12 @@ use iceberg_rust_spec::spec::{ use iceberg_rust_spec::util::strip_prefix; use iceberg_rust_spec::{error::Error as SpecError, spec::table_metadata::TableMetadata}; use object_store::ObjectStore; +use smallvec::SmallVec; use crate::{ catalog::commit::{TableRequirement, TableUpdate}, error::Error, + util::{struct_to_smallvec, summary_to_rectangle, Rectangle}, }; #[derive(Debug)] @@ -92,87 +95,221 @@ impl Operation { let schema = table_metadata.current_schema(branch.as_deref())?; let old_snapshot = table_metadata.current_snapshot(branch.as_deref())?; - let datafiles = Arc::new(files.into_iter().map(Ok::<_, Error>).try_fold( - HashMap::>::new(), - |mut acc, x| { - let x = x?; - let partition_value = x.partition().clone(); - acc.entry(partition_value).or_default().push(x); - Ok::<_, Error>(acc) - }, - )?); + let partition_column_names = table_metadata + .default_partition_spec()? + .fields() + .iter() + .map(|x| schema.get(*x.source_id() as usize).map(|x| x.name.as_str())) + .collect::>>() + .ok_or(Error::InvalidFormat("Partition columns".to_owned()))?; + + let bounding_partition_values = files + .iter() + .fold(Ok::<_, Error>(None), |acc, x| { + let acc = acc?; + let node = struct_to_smallvec(x.partition(), &partition_column_names)?; + let Some(mut acc) = acc else { + return Ok(Some(Rectangle::new(node.clone(), node))); + }; + acc.expand_with_node(node); + Ok(Some(acc)) + })? + .ok_or(Error::NotFound( + "Bounding".to_owned(), + "rectangle".to_owned(), + ))?; let manifest_list_schema = ManifestListEntry::schema(&table_metadata.format_version)?; - let manifest_list_writer = Arc::new(Mutex::new(apache_avro::Writer::new( - &manifest_list_schema, - Vec::new(), - ))); - - let existing_partitions = Arc::new(Mutex::new(HashSet::new())); + let mut manifest_list_writer = + apache_avro::Writer::new(&manifest_list_schema, Vec::new()); let old_manifest_list_location = old_snapshot.map(|x| x.manifest_list()).cloned(); - let manifest_list_bytes = match old_manifest_list_location { - Some(old_manifest_list_location) => Some( - object_store - .get(&strip_prefix(&old_manifest_list_location).as_str().into()) - .await? - .bytes() - .await?, - ), - None => None, - }; - // Check if file has content "null", if so manifest_list file is empty - let existing_manifest_iter = if let Some(manifest_list_bytes) = &manifest_list_bytes + // Find a manifest to add the new datafiles + let manifest = if let Some(old_manifest_list_location) = &old_manifest_list_location { + let old_manifest_list_bytes = object_store + .get(&strip_prefix(&old_manifest_list_location).as_str().into()) + .await? + .bytes() + .await?; + let manifest_list_reader = - apache_avro::Reader::new(manifest_list_bytes.as_ref())?; + apache_avro::Reader::new(old_manifest_list_bytes.as_ref())?; + + let manifest = if partition_column_names.is_empty() { + // Find the manifest with the lowest row count + manifest_list_reader + .fold(Ok::<_, Error>(None), |acc, x| { + let acc = acc?; + + let manifest = x + .map_err(Into::into) + .and_then(|value| { + ManifestListEntry::try_from_enum( + from_value::(&value)?, + table_metadata, + ) + }) + .unwrap(); + + let row_count = manifest.added_rows_count; + + let Some((old_row_count, old_manifest)) = acc else { + return Ok(Some((row_count, manifest))); + }; - Some(stream::iter(manifest_list_reader).filter_map(|manifest| { - let datafiles = datafiles.clone(); - let existing_partitions = existing_partitions.clone(); - let partition_spec = partition_spec.clone(); - async move { - let manifest = manifest - .map_err(Into::into) - .and_then(|value| { - ManifestListEntry::try_from_enum( - from_value::(&value)?, - table_metadata, - ) - }) - .unwrap(); - - if let Some(summary) = &manifest.partitions { - let partition_values = partition_values_in_bounds( - summary, - datafiles.keys(), - partition_spec.fields(), - ); - if !partition_values.is_empty() { - for file in &partition_values { - existing_partitions.lock().await.insert(file.clone()); - } - Some((ManifestStatus::Existing(manifest), partition_values)) + let Some(row_count) = row_count else { + return Ok(Some((old_row_count, old_manifest))); + }; + + if old_row_count.is_none() + || old_row_count.is_some_and(|x| x > row_count) + { + manifest_list_writer.append_ser(old_manifest)?; + Ok(Some((Some(row_count), manifest))) } else { - Some((ManifestStatus::Existing(manifest), vec![])) + manifest_list_writer.append_ser(manifest)?; + Ok(Some((old_row_count, old_manifest))) } - } else { - Some((ManifestStatus::Existing(manifest), vec![])) - } - } - })) + })? + .ok_or(Error::NotFound("Manifest".to_owned(), "file".to_owned()))? + .1 + } else { + // Find the manifest with the smallest bounding partition values + manifest_list_reader + .fold(Ok::<_, Error>(None), |acc, x| { + let acc = acc?; + + let manifest = x + .map_err(Into::into) + .and_then(|value| { + ManifestListEntry::try_from_enum( + from_value::(&value)?, + table_metadata, + ) + }) + .unwrap(); + + let mut bounds = summary_to_rectangle( + manifest.partitions.as_ref().ok_or(Error::NotFound( + "Partition".to_owned(), + "struct".to_owned(), + ))?, + )?; + + bounds.expand(&bounding_partition_values); + + let Some((old_bounds, old_manifest)) = acc else { + return Ok(Some((bounds, manifest))); + }; + + match old_bounds.cmp_with_priority(&bounds)? { + Ordering::Greater => { + manifest_list_writer.append_ser(old_manifest)?; + Ok(Some((bounds, manifest))) + } + _ => { + manifest_list_writer.append_ser(manifest)?; + Ok(Some((old_bounds, old_manifest))) + } + } + })? + .ok_or(Error::NotFound("Manifest".to_owned(), "file".to_owned()))? + .1 + }; + Some(manifest) } else { None }; - let manifest_count = if let Some(manifest_list_bytes) = &manifest_list_bytes { - apache_avro::Reader::new(manifest_list_bytes.as_ref())?.count() - } else { - 0 - }; + let manifest_schema = ManifestEntry::schema( + &partition_value_schema(partition_spec.fields(), schema)?, + &table_metadata.format_version, + )?; + + let mut manifest_writer = ManifestWriter::new( + Vec::new(), + &manifest_schema, + table_metadata, + branch.as_deref(), + )?; + + if let Some(manifest) = manifest { + let manifest_bytes: Vec = object_store + .get(&strip_prefix(&manifest.manifest_path).as_str().into()) + .await? + .bytes() + .await? + .into(); + + let manifest_reader = apache_avro::Reader::new(&*manifest_bytes)?; + manifest_writer.extend(manifest_reader.filter_map(Result::ok))?; + } + + // let datafiles = Arc::new(files.into_iter().map(Ok::<_, Error>).try_fold( + // HashMap::>::new(), + // |mut acc, x| { + // let x = x?; + // let partition_value = x.partition().clone(); + // acc.entry(partition_value).or_default().push(x); + // Ok::<_, Error>(acc) + // }, + // )?); + + // let existing_partitions = Arc::new(Mutex::new(HashSet::new())); + + // // Check if file has content "null", if so manifest_list file is empty + // let existing_manifest_iter = if let Some(manifest_list_bytes) = &manifest_list_bytes + // { + // let manifest_list_reader = + // apache_avro::Reader::new(manifest_list_bytes.as_ref())?; + + // Some(stream::iter(manifest_list_reader).filter_map(|manifest| { + // let datafiles = datafiles.clone(); + // let existing_partitions = existing_partitions.clone(); + // let partition_spec = partition_spec.clone(); + // async move { + // let manifest = manifest + // .map_err(Into::into) + // .and_then(|value| { + // ManifestListEntry::try_from_enum( + // from_value::(&value)?, + // table_metadata, + // ) + // }) + // .unwrap(); + + // if let Some(summary) = &manifest.partitions { + // let partition_values = partition_values_in_bounds( + // summary, + // datafiles.keys(), + // partition_spec.fields(), + // ); + // if !partition_values.is_empty() { + // for file in &partition_values { + // existing_partitions.lock().await.insert(file.clone()); + // } + // Some((ManifestStatus::Existing(manifest), partition_values)) + // } else { + // Some((ManifestStatus::Existing(manifest), vec![])) + // } + // } else { + // Some((ManifestStatus::Existing(manifest), vec![])) + // } + // } + // })) + // } else { + // None + // }; + + // let manifest_count = if let Some(manifest_list_bytes) = &manifest_list_bytes { + // apache_avro::Reader::new(manifest_list_bytes.as_ref())?.count() + // } else { + // 0 + // }; let snapshot_id = generate_snapshot_id(); let snapshot_uuid = &uuid::Uuid::new_v4().to_string(); @@ -182,116 +319,116 @@ impl Operation { + snapshot_uuid + ".avro"; - let new_manifest_iter = stream::iter(datafiles.iter().enumerate()).filter_map( - |(i, (partition_value, _))| { - let existing_partitions = existing_partitions.clone(); - async move { - if !existing_partitions.lock().await.contains(partition_value) { - let manifest_location = table_metadata.location.to_string() - + "/metadata/" - + snapshot_uuid - + "-m" - + &(manifest_count + i).to_string() - + ".avro"; - let manifest = ManifestListEntry { - format_version: table_metadata.format_version.clone(), - manifest_path: manifest_location, - manifest_length: 0, - partition_spec_id: table_metadata.default_spec_id, - content: Content::Data, - sequence_number: table_metadata.last_sequence_number, - min_sequence_number: 0, - added_snapshot_id: snapshot_id, - added_files_count: Some(0), - existing_files_count: Some(0), - deleted_files_count: Some(0), - added_rows_count: Some(0), - existing_rows_count: Some(0), - deleted_rows_count: Some(0), - partitions: None, - key_metadata: None, - }; - Some((ManifestStatus::New(manifest), vec![partition_value.clone()])) - } else { - None - } - } - }, - ); - - match existing_manifest_iter { - Some(existing_manifest_iter) => { - let manifest_iter = - Box::new(existing_manifest_iter.chain(new_manifest_iter)); - - manifest_iter - .then(|(manifest, files): (ManifestStatus, Vec)| { - let object_store = object_store.clone(); - let datafiles = datafiles.clone(); - let branch = branch.clone(); - async move { - write_manifest( - table_metadata, - manifest, - files, - datafiles, - schema, - object_store, - branch, - ) - .await - } - }) - .try_for_each_concurrent(None, |manifest| { - let manifest_list_writer = manifest_list_writer.clone(); - async move { - manifest_list_writer.lock().await.append_ser(manifest)?; - Ok(()) - } - }) - .await?; - } - None => { - new_manifest_iter - .then(|(manifest, files): (ManifestStatus, Vec)| { - let object_store = object_store.clone(); - let datafiles = datafiles.clone(); - let branch = branch.clone(); - async move { - write_manifest( - table_metadata, - manifest, - files, - datafiles, - schema, - object_store, - branch, - ) - .await - } - }) - .try_for_each_concurrent(None, |manifest| { - let manifest_list_writer = manifest_list_writer.clone(); - async move { - manifest_list_writer.lock().await.append_ser(manifest)?; - Ok(()) - } - }) - .await?; - } - } - - let manifest_list_bytes = Arc::into_inner(manifest_list_writer) - .unwrap() - .into_inner() - .into_inner()?; - - object_store - .put( - &strip_prefix(&new_manifest_list_location).into(), - manifest_list_bytes.into(), - ) - .await?; + // let new_manifest_iter = stream::iter(datafiles.iter().enumerate()).filter_map( + // |(i, (partition_value, _))| { + // let existing_partitions = existing_partitions.clone(); + // async move { + // if !existing_partitions.lock().await.contains(partition_value) { + // let manifest_location = table_metadata.location.to_string() + // + "/metadata/" + // + snapshot_uuid + // + "-m" + // + &(manifest_count + i).to_string() + // + ".avro"; + // let manifest = ManifestListEntry { + // format_version: table_metadata.format_version.clone(), + // manifest_path: manifest_location, + // manifest_length: 0, + // partition_spec_id: table_metadata.default_spec_id, + // content: Content::Data, + // sequence_number: table_metadata.last_sequence_number, + // min_sequence_number: 0, + // added_snapshot_id: snapshot_id, + // added_files_count: Some(0), + // existing_files_count: Some(0), + // deleted_files_count: Some(0), + // added_rows_count: Some(0), + // existing_rows_count: Some(0), + // deleted_rows_count: Some(0), + // partitions: None, + // key_metadata: None, + // }; + // Some((ManifestStatus::New(manifest), vec![partition_value.clone()])) + // } else { + // None + // } + // } + // }, + // ); + + // match existing_manifest_iter { + // Some(existing_manifest_iter) => { + // let manifest_iter = + // Box::new(existing_manifest_iter.chain(new_manifest_iter)); + + // manifest_iter + // .then(|(manifest, files): (ManifestStatus, Vec)| { + // let object_store = object_store.clone(); + // let datafiles = datafiles.clone(); + // let branch = branch.clone(); + // async move { + // write_manifest( + // table_metadata, + // manifest, + // files, + // datafiles, + // schema, + // object_store, + // branch, + // ) + // .await + // } + // }) + // .try_for_each_concurrent(None, |manifest| { + // let manifest_list_writer = manifest_list_writer.clone(); + // async move { + // manifest_list_writer.lock().await.append_ser(manifest)?; + // Ok(()) + // } + // }) + // .await?; + // } + // None => { + // new_manifest_iter + // .then(|(manifest, files): (ManifestStatus, Vec)| { + // let object_store = object_store.clone(); + // let datafiles = datafiles.clone(); + // let branch = branch.clone(); + // async move { + // write_manifest( + // table_metadata, + // manifest, + // files, + // datafiles, + // schema, + // object_store, + // branch, + // ) + // .await + // } + // }) + // .try_for_each_concurrent(None, |manifest| { + // let manifest_list_writer = manifest_list_writer.clone(); + // async move { + // manifest_list_writer.lock().await.append_ser(manifest)?; + // Ok(()) + // } + // }) + // .await?; + // } + // } + + // let manifest_list_bytes = Arc::into_inner(manifest_list_writer) + // .unwrap() + // .into_inner() + // .into_inner()?; + + // object_store + // .put( + // &strip_prefix(&new_manifest_list_location).into(), + // manifest_list_bytes.into(), + // ) + // .await?; let mut snapshot_builder = SnapshotBuilder::default(); snapshot_builder diff --git a/iceberg-rust/src/util/mod.rs b/iceberg-rust/src/util/mod.rs new file mode 100644 index 00000000..a02d9aa2 --- /dev/null +++ b/iceberg-rust/src/util/mod.rs @@ -0,0 +1,135 @@ +use std::{cmp::Ordering, convert::identity}; + +use iceberg_rust_spec::{ + manifest_list::FieldSummary, + values::{Struct, TryAdd, TrySub, Value}, +}; +use smallvec::SmallVec; + +use crate::error::Error; + +#[derive(Clone, PartialEq, Eq)] +pub(crate) struct Rectangle { + pub min: SmallVec<[C; 4]>, + pub max: SmallVec<[C; 4]>, +} + +impl Rectangle +where + C: PartialOrd + Clone + TryAdd + TrySub, +{ + pub(crate) fn new(min: SmallVec<[C; 4]>, max: SmallVec<[C; 4]>) -> Self { + Self { min, max } + } + + pub(crate) fn expand(&mut self, rect: &Rectangle) { + for i in 0..self.min.len() { + if rect.min[i] < self.min[i] { + self.min[i] = rect.min[i].clone(); + } + if rect.max[i] > self.max[i] { + self.max[i] = rect.max[i].clone(); + } + } + } + + pub(crate) fn expand_with_node(&mut self, node: SmallVec<[C; 4]>) { + for i in 0..self.min.len() { + if node[i] < self.min[i] { + self.min[i] = node[i].clone(); + } + if node[i] > self.max[i] { + self.max[i] = node[i].clone(); + } + } + } + + /// Determine of one rectangle is larger than the other + pub(crate) fn cmp_with_priority(&self, other: &Rectangle) -> Result { + if self.contains(other) { + Ok(Ordering::Greater) + } else if other.contains(self) { + Ok(Ordering::Less) + } else { + let mut self_iter = self + .max + .iter() + .zip(self.min.iter()) + .map(|(max, min)| max.try_sub(min)); + let mut other_iter = other + .max + .iter() + .zip(other.min.iter()) + .map(|(max, min)| max.try_sub(min)); + while let (Some(own), Some(other)) = (self_iter.next(), other_iter.next()) { + let ordering = own? + .partial_cmp(&other?) + .ok_or(Error::InvalidFormat("Types for Partial Order".to_owned()))?; + let Ordering::Equal = ordering else { + return Ok(ordering); + }; + } + Ok(Ordering::Equal) + } + } + + pub(crate) fn contains(&self, rect: &Rectangle) -> bool { + if self.min.len() == 0 { + return false; + } + for i in 0..self.min.len() { + if rect.min[i] < self.min[i] || rect.max[i] > self.max[i] { + return false; + } + } + true + } + + pub(crate) fn intersects(&self, rect: &Rectangle) -> bool { + if self.min.len() == 0 { + return false; + } + for i in 0..self.min.len() { + if rect.min[i] > self.max[i] || rect.max[i] < self.min[i] { + return false; + } + } + true + } +} + +pub(crate) fn struct_to_smallvec( + s: &Struct, + names: &SmallVec<[&str; 4]>, +) -> Result, Error> { + names + .iter() + .map(|x| { + s.get(*x) + .and_then(|x| identity(x).as_ref().map(Clone::clone)) + }) + .collect::>>() + .ok_or(Error::InvalidFormat("Partition struct".to_owned())) +} + +pub(crate) fn summary_to_rectangle(summaries: &[FieldSummary]) -> Result, Error> { + let mut max = SmallVec::with_capacity(summaries.len()); + let mut min = SmallVec::with_capacity(summaries.len()); + + for summary in summaries { + max.push( + summary + .upper_bound + .clone() + .ok_or(Error::NotFound("Partition".to_owned(), "struct".to_owned()))?, + ); + min.push( + summary + .lower_bound + .clone() + .ok_or(Error::NotFound("Partition".to_owned(), "struct".to_owned()))?, + ); + } + + Ok(Rectangle::new(min, max)) +} From 79a2b920640a8f581f8126cc07f44e9236c770f8 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Wed, 23 Oct 2024 10:53:35 +0200 Subject: [PATCH 02/32] fix compiler errors --- iceberg-rust-spec/src/spec/manifest.rs | 2 + iceberg-rust-spec/src/spec/table_metadata.rs | 2 +- iceberg-rust/src/table/transaction/append.rs | 83 +++ iceberg-rust/src/table/transaction/mod.rs | 5 +- .../src/table/transaction/operation.rs | 618 +++++++++--------- iceberg-rust/src/util/mod.rs | 38 +- 6 files changed, 424 insertions(+), 324 deletions(-) create mode 100644 iceberg-rust/src/table/transaction/append.rs diff --git a/iceberg-rust-spec/src/spec/manifest.rs b/iceberg-rust-spec/src/spec/manifest.rs index 452d2388..abc57d8e 100644 --- a/iceberg-rust-spec/src/spec/manifest.rs +++ b/iceberg-rust-spec/src/spec/manifest.rs @@ -189,8 +189,10 @@ pub struct ManifestEntry { status: Status, /// Snapshot id where the file was added, or deleted if status is 2. /// Inherited when null. + #[builder(setter(strip_option), default)] snapshot_id: Option, /// Sequence number when the file was added. Inherited when null. + #[builder(setter(strip_option), default)] sequence_number: Option, /// File path, partition tuple, metrics, … data_file: DataFile, diff --git a/iceberg-rust-spec/src/spec/table_metadata.rs b/iceberg-rust-spec/src/spec/table_metadata.rs index 1cb5ba96..9a9b2ce3 100644 --- a/iceberg-rust-spec/src/spec/table_metadata.rs +++ b/iceberg-rust-spec/src/spec/table_metadata.rs @@ -733,7 +733,7 @@ pub struct SnapshotLog { pub timestamp_ms: i64, } -#[derive(Debug, Serialize_repr, Deserialize_repr, PartialEq, Eq, Clone)] +#[derive(Debug, Serialize_repr, Deserialize_repr, PartialEq, Eq, Clone, Copy)] #[repr(u8)] /// Iceberg format version #[derive(Default)] diff --git a/iceberg-rust/src/table/transaction/append.rs b/iceberg-rust/src/table/transaction/append.rs new file mode 100644 index 00000000..68a188ec --- /dev/null +++ b/iceberg-rust/src/table/transaction/append.rs @@ -0,0 +1,83 @@ +use std::cmp::Ordering; + +use iceberg_rust_spec::{manifest::ManifestEntry, values::Value}; +use smallvec::{smallvec, SmallVec}; + +use crate::{ + error::Error, + util::{cmp_dist, struct_to_smallvec, sub, Rectangle}, +}; + +/// Split sets of datafiles depending on their partition_values +pub(crate) fn split_datafiles_once( + files: impl Iterator>, + rect: Rectangle, + names: &SmallVec<[&str; 4]>, +) -> Result<[(Vec, Rectangle); 2], Error> { + let mut smaller = Vec::new(); + let mut larger = Vec::new(); + let mut smaller_rect = None; + let mut larger_rect = None; + + for manifest_entry in files { + let manifest_entry = manifest_entry?; + let position = struct_to_smallvec(manifest_entry.data_file().partition(), names)?; + // Check distance to upper and lower bound + if let Ordering::Greater = + cmp_dist(&sub(&position, &rect.min)?, &sub(&rect.max, &position)?)? + { + // if closer to upper bound + larger.push(manifest_entry); + + if larger_rect.is_none() { + larger_rect = Some(Rectangle::new(position.clone(), position)); + } else if let Some(larger_rect) = larger_rect.as_mut() { + larger_rect.expand_with_node(position); + } + } else { + // if closer to lower bound + smaller.push(manifest_entry); + + if smaller_rect.is_none() { + smaller_rect = Some(Rectangle::new(position.clone(), position)); + } else if let Some(smaller_rect) = smaller_rect.as_mut() { + smaller_rect.expand_with_node(position); + } + } + } + Ok([ + ( + smaller, + smaller_rect.ok_or(Error::NotFound("Lower".to_owned(), "rectangle".to_owned()))?, + ), + ( + larger, + larger_rect.ok_or(Error::NotFound("Upper".to_owned(), "rectangle".to_owned()))?, + ), + ]) +} + +pub(crate) fn split_datafiles( + files: impl Iterator>, + rect: Rectangle, + names: &SmallVec<[&str; 4]>, + n_split: u32, +) -> Result; 2]>, Error> { + let [(smaller, smaller_rect), (larger, larger_rect)] = + split_datafiles_once(files, rect, names)?; + if n_split == 1 { + Ok(smallvec![smaller, larger]) + } else { + let mut smaller = split_datafiles( + smaller.into_iter().map(Ok), + smaller_rect, + names, + n_split - 1, + )?; + let mut larger = + split_datafiles(larger.into_iter().map(Ok), larger_rect, names, n_split - 1)?; + + smaller.append(&mut larger); + Ok(smaller) + } +} diff --git a/iceberg-rust/src/table/transaction/mod.rs b/iceberg-rust/src/table/transaction/mod.rs index 5e933f72..f4ffb9b5 100644 --- a/iceberg-rust/src/table/transaction/mod.rs +++ b/iceberg-rust/src/table/transaction/mod.rs @@ -11,6 +11,7 @@ use self::operation::Operation; use super::delete_files; +pub(crate) mod append; pub(crate) mod operation; pub(crate) static APPEND_KEY: &str = "append"; @@ -55,7 +56,7 @@ impl<'table> TableTransaction<'table> { self.operations .entry(APPEND_KEY.to_owned()) .and_modify(|mut x| { - if let Operation::NewAppend { + if let Operation::Append { branch: _, files: old, additional_summary: None, @@ -64,7 +65,7 @@ impl<'table> TableTransaction<'table> { old.extend_from_slice(&files) } }) - .or_insert(Operation::NewAppend { + .or_insert(Operation::Append { branch: self.branch.clone(), files, additional_summary: None, diff --git a/iceberg-rust/src/table/transaction/operation.rs b/iceberg-rust/src/table/transaction/operation.rs index 62696b15..8f1a3dd5 100644 --- a/iceberg-rust/src/table/transaction/operation.rs +++ b/iceberg-rust/src/table/transaction/operation.rs @@ -2,26 +2,25 @@ * Defines the different [Operation]s on a [Table]. */ -use std::{ - cmp::Ordering, - collections::{HashMap, HashSet}, - sync::Arc, -}; +use std::{cmp::Ordering, collections::HashMap, sync::Arc}; use apache_avro::from_value; use futures::{lock::Mutex, stream, StreamExt, TryStreamExt}; -use iceberg_rust_spec::spec::{ - manifest::{partition_value_schema, DataFile, ManifestEntry, ManifestWriter, Status}, - manifest_list::{Content, FieldSummary, ManifestListEntry, ManifestListEntryEnum}, - partition::PartitionField, - schema::Schema, - snapshot::{ - generate_snapshot_id, SnapshotBuilder, SnapshotReference, SnapshotRetention, Summary, - }, - values::{Struct, Value}, -}; use iceberg_rust_spec::util::strip_prefix; use iceberg_rust_spec::{error::Error as SpecError, spec::table_metadata::TableMetadata}; +use iceberg_rust_spec::{ + manifest::ManifestReader, + spec::{ + manifest::{partition_value_schema, DataFile, ManifestEntry, ManifestWriter, Status}, + manifest_list::{Content, FieldSummary, ManifestListEntry, ManifestListEntryEnum}, + partition::PartitionField, + schema::Schema, + snapshot::{ + generate_snapshot_id, SnapshotBuilder, SnapshotReference, SnapshotRetention, Summary, + }, + values::{Struct, Value}, + }, +}; use object_store::ObjectStore; use smallvec::SmallVec; @@ -31,6 +30,10 @@ use crate::{ util::{struct_to_smallvec, summary_to_rectangle, Rectangle}, }; +use super::append::split_datafiles; + +static MIN_DATAFILES: usize = 4; + #[derive(Debug)] ///Table operations pub enum Operation { @@ -47,7 +50,7 @@ pub enum Operation { // /// Update the table location // UpdateLocation, /// Append new files to the table - NewAppend { + Append { branch: Option, files: Vec, additional_summary: Option>, @@ -86,7 +89,7 @@ impl Operation { object_store: Arc, ) -> Result<(Option, Vec), Error> { match self { - Operation::NewAppend { + Operation::Append { branch, files, additional_summary, @@ -99,9 +102,8 @@ impl Operation { .default_partition_spec()? .fields() .iter() - .map(|x| schema.get(*x.source_id() as usize).map(|x| x.name.as_str())) - .collect::>>() - .ok_or(Error::InvalidFormat("Partition columns".to_owned()))?; + .map(|x| x.name().as_str()) + .collect::>(); let bounding_partition_values = files .iter() @@ -127,6 +129,8 @@ impl Operation { let old_manifest_list_location = old_snapshot.map(|x| x.manifest_list()).cloned(); + let mut file_count = 0; + // Find a manifest to add the new datafiles let manifest = if let Some(old_manifest_list_location) = &old_manifest_list_location { @@ -139,6 +143,7 @@ impl Operation { let manifest_list_reader = apache_avro::Reader::new(old_manifest_list_bytes.as_ref())?; + // Check if table is partitioned let manifest = if partition_column_names.is_empty() { // Find the manifest with the lowest row count manifest_list_reader @@ -157,6 +162,8 @@ impl Operation { let row_count = manifest.added_rows_count; + file_count += manifest.added_files_count.unwrap_or(0) as usize; + let Some((old_row_count, old_manifest)) = acc else { return Ok(Some((row_count, manifest))); }; @@ -202,6 +209,8 @@ impl Operation { bounds.expand(&bounding_partition_values); + file_count += manifest.added_files_count.unwrap_or(0) as usize; + let Some((old_bounds, old_manifest)) = acc else { return Ok(Some((bounds, manifest))); }; @@ -222,96 +231,55 @@ impl Operation { }; Some(manifest) } else { + // If manifest list doesn't exist, there is no manifest None }; - let manifest_schema = ManifestEntry::schema( - &partition_value_schema(partition_spec.fields(), schema)?, - &table_metadata.format_version, - )?; - - let mut manifest_writer = ManifestWriter::new( - Vec::new(), - &manifest_schema, - table_metadata, - branch.as_deref(), - )?; + let limit = MIN_DATAFILES + ((file_count + files.len()) as f64).sqrt() as usize; - if let Some(manifest) = manifest { - let manifest_bytes: Vec = object_store - .get(&strip_prefix(&manifest.manifest_path).as_str().into()) - .await? - .bytes() - .await? - .into(); + let new_file_count = manifest + .as_ref() + .and_then(|x| x.added_files_count) + .unwrap_or(0) as usize + + files.len(); - let manifest_reader = apache_avro::Reader::new(&*manifest_bytes)?; - manifest_writer.extend(manifest_reader.filter_map(Result::ok))?; - } + // How many times do the files need to be split to give at most *limit* files per manifest + let n_splits = match new_file_count / limit { + 0 => 0, + x => x.ilog2() + 1, + }; - // let datafiles = Arc::new(files.into_iter().map(Ok::<_, Error>).try_fold( - // HashMap::>::new(), - // |mut acc, x| { - // let x = x?; - // let partition_value = x.partition().clone(); - // acc.entry(partition_value).or_default().push(x); - // Ok::<_, Error>(acc) - // }, - // )?); - - // let existing_partitions = Arc::new(Mutex::new(HashSet::new())); - - // // Check if file has content "null", if so manifest_list file is empty - // let existing_manifest_iter = if let Some(manifest_list_bytes) = &manifest_list_bytes - // { - // let manifest_list_reader = - // apache_avro::Reader::new(manifest_list_bytes.as_ref())?; - - // Some(stream::iter(manifest_list_reader).filter_map(|manifest| { - // let datafiles = datafiles.clone(); - // let existing_partitions = existing_partitions.clone(); - // let partition_spec = partition_spec.clone(); - // async move { - // let manifest = manifest - // .map_err(Into::into) - // .and_then(|value| { - // ManifestListEntry::try_from_enum( - // from_value::(&value)?, - // table_metadata, - // ) - // }) - // .unwrap(); - - // if let Some(summary) = &manifest.partitions { - // let partition_values = partition_values_in_bounds( - // summary, - // datafiles.keys(), - // partition_spec.fields(), - // ); - // if !partition_values.is_empty() { - // for file in &partition_values { - // existing_partitions.lock().await.insert(file.clone()); - // } - // Some((ManifestStatus::Existing(manifest), partition_values)) - // } else { - // Some((ManifestStatus::Existing(manifest), vec![])) - // } - // } else { - // Some((ManifestStatus::Existing(manifest), vec![])) - // } - // } - // })) - // } else { - // None - // }; - - // let manifest_count = if let Some(manifest_list_bytes) = &manifest_list_bytes { - // apache_avro::Reader::new(manifest_list_bytes.as_ref())?.count() - // } else { - // 0 - // }; + let bounds = manifest + .as_ref() + .and_then(|x| x.partitions.as_deref()) + .map(summary_to_rectangle) + .transpose()? + .map(|mut x| { + x.expand(&bounding_partition_values); + x + }) + .unwrap_or(bounding_partition_values); let snapshot_id = generate_snapshot_id(); + let sequence_number = table_metadata.last_sequence_number + 1; + + let new_datafile_iter = files.into_iter().map(|data_file| { + ManifestEntry::builder() + .with_format_version(table_metadata.format_version) + .with_status(Status::Added) + .with_snapshot_id(snapshot_id) + .with_sequence_number(sequence_number) + .with_data_file(data_file) + .build() + .map_err(crate::spec::error::Error::from) + .map_err(Error::from) + }); + + let manifest_schema = ManifestEntry::schema( + &partition_value_schema(partition_spec.fields(), schema)?, + &table_metadata.format_version, + )?; + let snapshot_uuid = &uuid::Uuid::new_v4().to_string(); let new_manifest_list_location = table_metadata.location.to_string() + "/metadata/snap-" @@ -319,116 +287,241 @@ impl Operation { + snapshot_uuid + ".avro"; - // let new_manifest_iter = stream::iter(datafiles.iter().enumerate()).filter_map( - // |(i, (partition_value, _))| { - // let existing_partitions = existing_partitions.clone(); - // async move { - // if !existing_partitions.lock().await.contains(partition_value) { - // let manifest_location = table_metadata.location.to_string() - // + "/metadata/" - // + snapshot_uuid - // + "-m" - // + &(manifest_count + i).to_string() - // + ".avro"; - // let manifest = ManifestListEntry { - // format_version: table_metadata.format_version.clone(), - // manifest_path: manifest_location, - // manifest_length: 0, - // partition_spec_id: table_metadata.default_spec_id, - // content: Content::Data, - // sequence_number: table_metadata.last_sequence_number, - // min_sequence_number: 0, - // added_snapshot_id: snapshot_id, - // added_files_count: Some(0), - // existing_files_count: Some(0), - // deleted_files_count: Some(0), - // added_rows_count: Some(0), - // existing_rows_count: Some(0), - // deleted_rows_count: Some(0), - // partitions: None, - // key_metadata: None, - // }; - // Some((ManifestStatus::New(manifest), vec![partition_value.clone()])) - // } else { - // None - // } - // } - // }, - // ); - - // match existing_manifest_iter { - // Some(existing_manifest_iter) => { - // let manifest_iter = - // Box::new(existing_manifest_iter.chain(new_manifest_iter)); - - // manifest_iter - // .then(|(manifest, files): (ManifestStatus, Vec)| { - // let object_store = object_store.clone(); - // let datafiles = datafiles.clone(); - // let branch = branch.clone(); - // async move { - // write_manifest( - // table_metadata, - // manifest, - // files, - // datafiles, - // schema, - // object_store, - // branch, - // ) - // .await - // } - // }) - // .try_for_each_concurrent(None, |manifest| { - // let manifest_list_writer = manifest_list_writer.clone(); - // async move { - // manifest_list_writer.lock().await.append_ser(manifest)?; - // Ok(()) - // } - // }) - // .await?; - // } - // None => { - // new_manifest_iter - // .then(|(manifest, files): (ManifestStatus, Vec)| { - // let object_store = object_store.clone(); - // let datafiles = datafiles.clone(); - // let branch = branch.clone(); - // async move { - // write_manifest( - // table_metadata, - // manifest, - // files, - // datafiles, - // schema, - // object_store, - // branch, - // ) - // .await - // } - // }) - // .try_for_each_concurrent(None, |manifest| { - // let manifest_list_writer = manifest_list_writer.clone(); - // async move { - // manifest_list_writer.lock().await.append_ser(manifest)?; - // Ok(()) - // } - // }) - // .await?; - // } - // } - - // let manifest_list_bytes = Arc::into_inner(manifest_list_writer) - // .unwrap() - // .into_inner() - // .into_inner()?; - - // object_store - // .put( - // &strip_prefix(&new_manifest_list_location).into(), - // manifest_list_bytes.into(), - // ) - // .await?; + // Write manifest files + // Split manifest file if limit is exceeded + if n_splits == 0 { + // If manifest doesn't need to be split + let mut manifest_writer = ManifestWriter::new( + Vec::new(), + &manifest_schema, + table_metadata, + branch.as_deref(), + )?; + + // Copy potential existing entries + if let Some(manifest) = &manifest { + let manifest_bytes: Vec = object_store + .get(&strip_prefix(&manifest.manifest_path).as_str().into()) + .await? + .bytes() + .await? + .into(); + + let manifest_reader = apache_avro::Reader::new(&*manifest_bytes)?; + manifest_writer.extend(manifest_reader.filter_map(Result::ok))?; + }; + + // If there is no manifest, create one + let mut manifest = manifest.unwrap_or_else(|| { + let manifest_location = table_metadata.location.to_string() + + "/metadata/" + + snapshot_uuid + + "-m" + + &0.to_string() + + ".avro"; + let manifest = ManifestListEntry { + format_version: table_metadata.format_version.clone(), + manifest_path: manifest_location, + manifest_length: 0, + partition_spec_id: table_metadata.default_spec_id, + content: Content::Data, + sequence_number: table_metadata.last_sequence_number, + min_sequence_number: 0, + added_snapshot_id: snapshot_id, + added_files_count: Some(0), + existing_files_count: Some(0), + deleted_files_count: Some(0), + added_rows_count: Some(0), + existing_rows_count: Some(0), + deleted_rows_count: Some(0), + partitions: None, + key_metadata: None, + }; + manifest + }); + + for manifest_entry in new_datafile_iter { + { + let manifest_entry = manifest_entry?; + + let mut added_rows_count = 0; + + if manifest.partitions.is_none() { + manifest.partitions = Some( + table_metadata + .default_partition_spec()? + .fields() + .iter() + .map(|_| FieldSummary { + contains_null: false, + contains_nan: None, + lower_bound: None, + upper_bound: None, + }) + .collect::>(), + ); + } + + added_rows_count += manifest_entry.data_file().record_count(); + update_partitions( + manifest.partitions.as_mut().unwrap(), + manifest_entry.data_file().partition(), + table_metadata.default_partition_spec()?.fields(), + )?; + + manifest_writer.append_ser(manifest_entry)?; + + manifest.added_files_count = match manifest.added_files_count { + Some(count) => Some(count + new_file_count as i32), + None => Some(new_file_count as i32), + }; + manifest.added_rows_count = match manifest.added_rows_count { + Some(count) => Some(count + added_rows_count), + None => Some(added_rows_count), + }; + } + } + + let manifest_bytes = manifest_writer.into_inner()?; + + let manifest_length: i64 = manifest_bytes.len() as i64; + + manifest.manifest_length += manifest_length; + + object_store + .put( + &strip_prefix(&manifest.manifest_path).as_str().into(), + manifest_bytes.into(), + ) + .await?; + + manifest_list_writer.append_ser(manifest)?; + } else { + // Split datafiles + let splits = if let Some(manifest) = manifest { + let manifest_bytes: Vec = object_store + .get(&strip_prefix(&manifest.manifest_path).as_str().into()) + .await? + .bytes() + .await? + .into(); + + let manifest_reader = + ManifestReader::new(&*manifest_bytes)?.map(|x| x.map_err(Error::from)); + + split_datafiles( + new_datafile_iter.chain(manifest_reader), + bounds, + &partition_column_names, + n_splits, + )? + } else { + split_datafiles( + new_datafile_iter, + bounds, + &partition_column_names, + n_splits, + )? + }; + + for (i, entries) in splits.into_iter().enumerate() { + let mut manifest_writer = ManifestWriter::new( + Vec::new(), + &manifest_schema, + table_metadata, + branch.as_deref(), + )?; + + let manifest_location = table_metadata.location.to_string() + + "/metadata/" + + snapshot_uuid + + "-m" + + &i.to_string() + + ".avro"; + let mut manifest = ManifestListEntry { + format_version: table_metadata.format_version.clone(), + manifest_path: manifest_location, + manifest_length: 0, + partition_spec_id: table_metadata.default_spec_id, + content: Content::Data, + sequence_number: table_metadata.last_sequence_number, + min_sequence_number: 0, + added_snapshot_id: snapshot_id, + added_files_count: Some(0), + existing_files_count: Some(0), + deleted_files_count: Some(0), + added_rows_count: Some(0), + existing_rows_count: Some(0), + deleted_rows_count: Some(0), + partitions: None, + key_metadata: None, + }; + + for manifest_entry in entries { + { + let mut added_rows_count = 0; + + if manifest.partitions.is_none() { + manifest.partitions = Some( + table_metadata + .default_partition_spec()? + .fields() + .iter() + .map(|_| FieldSummary { + contains_null: false, + contains_nan: None, + lower_bound: None, + upper_bound: None, + }) + .collect::>(), + ); + } + + added_rows_count += manifest_entry.data_file().record_count(); + update_partitions( + manifest.partitions.as_mut().unwrap(), + manifest_entry.data_file().partition(), + table_metadata.default_partition_spec()?.fields(), + )?; + + manifest_writer.append_ser(manifest_entry)?; + + manifest.added_files_count = match manifest.added_files_count { + Some(count) => Some(count + new_file_count as i32), + None => Some(new_file_count as i32), + }; + manifest.added_rows_count = match manifest.added_rows_count { + Some(count) => Some(count + added_rows_count), + None => Some(added_rows_count), + }; + } + } + + let manifest_bytes = manifest_writer.into_inner()?; + + let manifest_length: i64 = manifest_bytes.len() as i64; + + manifest.manifest_length += manifest_length; + + object_store + .put( + &strip_prefix(&manifest.manifest_path).as_str().into(), + manifest_bytes.into(), + ) + .await?; + + manifest_list_writer.append_ser(manifest)?; + } + }; + + let manifest_list_bytes = manifest_list_writer.into_inner()?; + + object_store + .put( + &strip_prefix(&new_manifest_list_location).into(), + manifest_list_bytes.into(), + ) + .await?; let mut snapshot_builder = SnapshotBuilder::default(); snapshot_builder @@ -645,7 +738,6 @@ impl Operation { pub enum ManifestStatus { New(ManifestListEntry), - Existing(ManifestListEntry), } pub(crate) async fn write_manifest( @@ -671,18 +763,6 @@ pub(crate) async fn write_manifest( )?; let mut manifest = match manifest { - ManifestStatus::Existing(manifest) => { - let manifest_bytes: Vec = object_store - .get(&strip_prefix(&manifest.manifest_path).as_str().into()) - .await? - .bytes() - .await? - .into(); - - let manifest_reader = apache_avro::Reader::new(&*manifest_bytes)?; - manifest_writer.extend(manifest_reader.filter_map(Result::ok))?; - manifest - } ManifestStatus::New(manifest) => manifest, }; let files_count = manifest.added_files_count.unwrap_or_default() + files.len() as i32; @@ -717,11 +797,16 @@ pub(crate) async fn write_manifest( let manifest_entry = ManifestEntry::builder() .with_format_version(table_metadata.format_version.clone()) .with_status(Status::Added) - .with_snapshot_id(table_metadata.current_snapshot_id) + .with_snapshot_id( + table_metadata + .current_snapshot_id + .ok_or(Error::NotFound("Snapshot".to_owned(), "id".to_owned()))?, + ) .with_sequence_number( table_metadata .current_snapshot(branch.as_deref())? - .map(|x| *x.sequence_number()), + .map(|x| *x.sequence_number()) + .ok_or(Error::NotFound("Sequence".to_owned(), "number".to_owned()))?, ) .with_data_file(datafile.clone()) .build() @@ -858,82 +943,3 @@ fn update_partitions( } Ok(()) } - -/// checks if partition values lie in the bounds of the field summary -fn partition_values_in_bounds<'a>( - partitions: &[FieldSummary], - partition_values: impl Iterator, - partition_spec: &[PartitionField], -) -> Vec { - partition_values - .filter(|value| { - partition_spec - .iter() - .map(|field| { - value - .get(field.name()) - .ok_or_else(|| { - Error::InvalidFormat("partition values in schema".to_string()) - }) - .unwrap() - }) - .zip(partitions.iter()) - .all(|(value, summary)| { - if let Some(value) = value { - if let (Some(lower_bound), Some(upper_bound)) = - (&summary.lower_bound, &summary.upper_bound) - { - match (value, lower_bound, upper_bound) { - ( - Value::Int(val), - Value::Int(lower_bound), - Value::Int(upper_bound), - ) => *lower_bound <= *val && *upper_bound >= *val, - ( - Value::LongInt(val), - Value::LongInt(lower_bound), - Value::LongInt(upper_bound), - ) => *lower_bound <= *val && *upper_bound >= *val, - ( - Value::Float(val), - Value::Float(lower_bound), - Value::Float(upper_bound), - ) => *lower_bound <= *val && *upper_bound >= *val, - ( - Value::Double(val), - Value::Double(lower_bound), - Value::Double(upper_bound), - ) => *lower_bound <= *val && *upper_bound >= *val, - ( - Value::Date(val), - Value::Date(lower_bound), - Value::Date(upper_bound), - ) => *lower_bound <= *val && *upper_bound >= *val, - ( - Value::Time(val), - Value::Time(lower_bound), - Value::Time(upper_bound), - ) => *lower_bound <= *val && *upper_bound >= *val, - ( - Value::Timestamp(val), - Value::Timestamp(lower_bound), - Value::Timestamp(upper_bound), - ) => *lower_bound <= *val && *upper_bound >= *val, - ( - Value::TimestampTZ(val), - Value::TimestampTZ(lower_bound), - Value::TimestampTZ(upper_bound), - ) => *lower_bound <= *val && *upper_bound >= *val, - _ => false, - } - } else { - false - } - } else { - summary.contains_null - } - }) - }) - .map(Clone::clone) - .collect() -} diff --git a/iceberg-rust/src/util/mod.rs b/iceberg-rust/src/util/mod.rs index a02d9aa2..708bd87e 100644 --- a/iceberg-rust/src/util/mod.rs +++ b/iceberg-rust/src/util/mod.rs @@ -16,7 +16,7 @@ pub(crate) struct Rectangle { impl Rectangle where - C: PartialOrd + Clone + TryAdd + TrySub, + C: PartialOrd + Ord + Clone + TryAdd + TrySub, { pub(crate) fn new(min: SmallVec<[C; 4]>, max: SmallVec<[C; 4]>) -> Self { Self { min, max } @@ -84,18 +84,6 @@ where } true } - - pub(crate) fn intersects(&self, rect: &Rectangle) -> bool { - if self.min.len() == 0 { - return false; - } - for i in 0..self.min.len() { - if rect.min[i] > self.max[i] || rect.max[i] < self.min[i] { - return false; - } - } - true - } } pub(crate) fn struct_to_smallvec( @@ -121,15 +109,35 @@ pub(crate) fn summary_to_rectangle(summaries: &[FieldSummary]) -> Result(left: &[C], right: &[C]) -> Result { + while let (Some(own), Some(other)) = (left.iter().next(), right.iter().next()) { + let ordering = own + .partial_cmp(&other) + .ok_or(Error::InvalidFormat("Types for Partial Order".to_owned()))?; + let Ordering::Equal = ordering else { + return Ok(ordering); + }; + } + Ok(Ordering::Equal) +} + +pub(crate) fn sub(left: &[C], right: &[C]) -> Result, Error> { + let mut v = SmallVec::with_capacity(left.len()); + for i in 0..left.len() { + v[i] = left[i].try_sub(&right[i])? + } + Ok(v) +} From 145c5bcbe6ce3b48e9efa26dbf5581b0f126b054 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Wed, 23 Oct 2024 17:32:19 +0200 Subject: [PATCH 03/32] fix merge issues --- .../src/table/transaction/operation.rs | 53 +++++++++---------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/iceberg-rust/src/table/transaction/operation.rs b/iceberg-rust/src/table/transaction/operation.rs index 8f1a3dd5..f82e2aaa 100644 --- a/iceberg-rust/src/table/transaction/operation.rs +++ b/iceberg-rust/src/table/transaction/operation.rs @@ -4,15 +4,18 @@ use std::{cmp::Ordering, collections::HashMap, sync::Arc}; -use apache_avro::from_value; use futures::{lock::Mutex, stream, StreamExt, TryStreamExt}; +use iceberg_rust_spec::manifest_list::{ + manifest_list_schema_v1, manifest_list_schema_v2, ManifestListReader, +}; +use iceberg_rust_spec::table_metadata::FormatVersion; use iceberg_rust_spec::util::strip_prefix; use iceberg_rust_spec::{error::Error as SpecError, spec::table_metadata::TableMetadata}; use iceberg_rust_spec::{ manifest::ManifestReader, spec::{ manifest::{partition_value_schema, DataFile, ManifestEntry, ManifestWriter, Status}, - manifest_list::{Content, FieldSummary, ManifestListEntry, ManifestListEntryEnum}, + manifest_list::{Content, FieldSummary, ManifestListEntry}, partition::PartitionField, schema::Schema, snapshot::{ @@ -121,8 +124,10 @@ impl Operation { "rectangle".to_owned(), ))?; - let manifest_list_schema = - ManifestListEntry::schema(&table_metadata.format_version)?; + let manifest_list_schema = match table_metadata.format_version { + FormatVersion::V1 => manifest_list_schema_v1(), + FormatVersion::V2 => manifest_list_schema_v2(), + }; let mut manifest_list_writer = apache_avro::Writer::new(&manifest_list_schema, Vec::new()); @@ -141,7 +146,7 @@ impl Operation { .await?; let manifest_list_reader = - apache_avro::Reader::new(old_manifest_list_bytes.as_ref())?; + ManifestListReader::new(old_manifest_list_bytes.as_ref(), table_metadata)?; // Check if table is partitioned let manifest = if partition_column_names.is_empty() { @@ -150,15 +155,7 @@ impl Operation { .fold(Ok::<_, Error>(None), |acc, x| { let acc = acc?; - let manifest = x - .map_err(Into::into) - .and_then(|value| { - ManifestListEntry::try_from_enum( - from_value::(&value)?, - table_metadata, - ) - }) - .unwrap(); + let manifest = x?; let row_count = manifest.added_rows_count; @@ -190,15 +187,7 @@ impl Operation { .fold(Ok::<_, Error>(None), |acc, x| { let acc = acc?; - let manifest = x - .map_err(Into::into) - .and_then(|value| { - ManifestListEntry::try_from_enum( - from_value::(&value)?, - table_metadata, - ) - }) - .unwrap(); + let manifest = x?; let mut bounds = summary_to_rectangle( manifest.partitions.as_ref().ok_or(Error::NotFound( @@ -614,8 +603,10 @@ impl Operation { (ManifestStatus::New(manifest), vec![partition_value.clone()]) }); - let manifest_list_schema = - ManifestListEntry::schema(&table_metadata.format_version)?; + let manifest_list_schema = match table_metadata.format_version { + FormatVersion::V1 => manifest_list_schema_v1(), + FormatVersion::V2 => manifest_list_schema_v2(), + }; let manifest_list_writer = Arc::new(Mutex::new(apache_avro::Writer::new( &manifest_list_schema, @@ -846,10 +837,12 @@ fn update_partitions( partition_values: &Struct, partition_columns: &[PartitionField], ) -> Result<(), Error> { - for (field, summary) in partition_columns.iter().zip(partitions.iter_mut()) { - let value = &partition_values.get(field.name()).and_then(|x| x.as_ref()); + for (field, summary) in partition_columns.into_iter().zip(partitions.iter_mut()) { + let value = partition_values.get(field.name()).and_then(|x| x.as_ref()); if let Some(value) = value { - if let Some(lower_bound) = &mut summary.lower_bound { + if summary.lower_bound.is_none() { + summary.lower_bound = Some(value.clone()); + } else if let Some(lower_bound) = &mut summary.lower_bound { match (value, lower_bound) { (Value::Int(val), Value::Int(current)) => { if *current > *val { @@ -894,7 +887,9 @@ fn update_partitions( _ => {} } } - if let Some(upper_bound) = &mut summary.upper_bound { + if summary.upper_bound.is_none() { + summary.upper_bound = Some(value.clone()); + } else if let Some(upper_bound) = &mut summary.upper_bound { match (value, upper_bound) { (Value::Int(val), Value::Int(current)) => { if *current < *val { From 78f588fb1ec47e5f94bafd27f761ea74886c64d2 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Thu, 24 Oct 2024 08:21:44 +0200 Subject: [PATCH 04/32] improve rewrite operation --- .../src/table/transaction/operation.rs | 394 ++++++++++-------- 1 file changed, 229 insertions(+), 165 deletions(-) diff --git a/iceberg-rust/src/table/transaction/operation.rs b/iceberg-rust/src/table/transaction/operation.rs index f82e2aaa..2cd07f79 100644 --- a/iceberg-rust/src/table/transaction/operation.rs +++ b/iceberg-rust/src/table/transaction/operation.rs @@ -4,13 +4,12 @@ use std::{cmp::Ordering, collections::HashMap, sync::Arc}; -use futures::{lock::Mutex, stream, StreamExt, TryStreamExt}; use iceberg_rust_spec::manifest_list::{ manifest_list_schema_v1, manifest_list_schema_v2, ManifestListReader, }; +use iceberg_rust_spec::spec::table_metadata::TableMetadata; use iceberg_rust_spec::table_metadata::FormatVersion; use iceberg_rust_spec::util::strip_prefix; -use iceberg_rust_spec::{error::Error as SpecError, spec::table_metadata::TableMetadata}; use iceberg_rust_spec::{ manifest::ManifestReader, spec::{ @@ -552,37 +551,96 @@ impl Operation { files, additional_summary, } => { + let partition_spec = table_metadata.default_partition_spec()?; let old_snapshot = table_metadata.current_snapshot(branch.as_deref())?; let schema = table_metadata.current_schema(branch.as_deref())?.clone(); - // Split datafils by partition - let datafiles = Arc::new(files.into_iter().map(Ok::<_, Error>).try_fold( - HashMap::>::new(), - |mut acc, x| { - let x = x?; - let partition_value = x.partition().clone(); - acc.entry(partition_value).or_default().push(x); - Ok::<_, Error>(acc) - }, - )?); + let partition_column_names = table_metadata + .default_partition_spec()? + .fields() + .iter() + .map(|x| x.name().as_str()) + .collect::>(); + + let bounding_partition_values = files + .iter() + .fold(Ok::<_, Error>(None), |acc, x| { + let acc = acc?; + let node = struct_to_smallvec(x.partition(), &partition_column_names)?; + let Some(mut acc) = acc else { + return Ok(Some(Rectangle::new(node.clone(), node))); + }; + acc.expand_with_node(node); + Ok(Some(acc)) + })? + .ok_or(Error::NotFound( + "Bounding".to_owned(), + "rectangle".to_owned(), + ))?; + + let manifest_list_schema = match table_metadata.format_version { + FormatVersion::V1 => manifest_list_schema_v1(), + FormatVersion::V2 => manifest_list_schema_v2(), + }; + + let mut manifest_list_writer = + apache_avro::Writer::new(&manifest_list_schema, Vec::new()); + + let new_file_count = files.len(); + + let limit = MIN_DATAFILES + ((new_file_count) as f64).sqrt() as usize; + + // How many times do the files need to be split to give at most *limit* files per manifest + let n_splits = match new_file_count / limit { + 0 => 0, + x => x.ilog2() + 1, + }; let snapshot_id = generate_snapshot_id(); + let sequence_number = table_metadata.last_sequence_number + 1; + + let new_datafile_iter = files.into_iter().map(|data_file| { + ManifestEntry::builder() + .with_format_version(table_metadata.format_version) + .with_status(Status::Added) + .with_snapshot_id(snapshot_id) + .with_sequence_number(sequence_number) + .with_data_file(data_file) + .build() + .map_err(crate::spec::error::Error::from) + .map_err(Error::from) + }); + + let manifest_schema = ManifestEntry::schema( + &partition_value_schema(partition_spec.fields(), &schema)?, + &table_metadata.format_version, + )?; + let snapshot_uuid = &uuid::Uuid::new_v4().to_string(); - let manifest_list_location = table_metadata.location.to_string() + let new_manifest_list_location = table_metadata.location.to_string() + "/metadata/snap-" + &snapshot_id.to_string() - + "-" + snapshot_uuid + ".avro"; - let manifest_iter = datafiles.keys().enumerate().map(|(i, partition_value)| { + // Write manifest files + // Split manifest file if limit is exceeded + if n_splits == 0 { + // If manifest doesn't need to be split + let mut manifest_writer = ManifestWriter::new( + Vec::new(), + &manifest_schema, + table_metadata, + branch.as_deref(), + )?; + let manifest_location = table_metadata.location.to_string() + "/metadata/" + snapshot_uuid + "-m" - + &(i).to_string() + + &0.to_string() + ".avro"; - let manifest = ManifestListEntry { + let mut manifest = ManifestListEntry { format_version: table_metadata.format_version.clone(), manifest_path: manifest_location, manifest_length: 0, @@ -600,56 +658,167 @@ impl Operation { partitions: None, key_metadata: None, }; - (ManifestStatus::New(manifest), vec![partition_value.clone()]) - }); - let manifest_list_schema = match table_metadata.format_version { - FormatVersion::V1 => manifest_list_schema_v1(), - FormatVersion::V2 => manifest_list_schema_v2(), - }; + for manifest_entry in new_datafile_iter { + { + let manifest_entry = manifest_entry?; - let manifest_list_writer = Arc::new(Mutex::new(apache_avro::Writer::new( - &manifest_list_schema, - Vec::new(), - ))); - - stream::iter(manifest_iter) - .then(|(manifest, files): (ManifestStatus, Vec)| { - let object_store = object_store.clone(); - let datafiles = datafiles.clone(); - let branch = branch.clone(); - let schema = &schema; - let old_storage_table_metadata = &table_metadata; - async move { - write_manifest( - old_storage_table_metadata, - manifest, - files, - datafiles, - schema, - object_store, - branch, - ) - .await + let mut added_rows_count = 0; + + if manifest.partitions.is_none() { + manifest.partitions = Some( + table_metadata + .default_partition_spec()? + .fields() + .iter() + .map(|_| FieldSummary { + contains_null: false, + contains_nan: None, + lower_bound: None, + upper_bound: None, + }) + .collect::>(), + ); + } + + added_rows_count += manifest_entry.data_file().record_count(); + update_partitions( + manifest.partitions.as_mut().unwrap(), + manifest_entry.data_file().partition(), + table_metadata.default_partition_spec()?.fields(), + )?; + + manifest_writer.append_ser(manifest_entry)?; + + manifest.added_files_count = match manifest.added_files_count { + Some(count) => Some(count + new_file_count as i32), + None => Some(new_file_count as i32), + }; + manifest.added_rows_count = match manifest.added_rows_count { + Some(count) => Some(count + added_rows_count), + None => Some(added_rows_count), + }; } - }) - .try_for_each_concurrent(None, |manifest| { - let manifest_list_writer = manifest_list_writer.clone(); - async move { - manifest_list_writer.lock().await.append_ser(manifest)?; - Ok(()) + } + + let manifest_bytes = manifest_writer.into_inner()?; + + let manifest_length: i64 = manifest_bytes.len() as i64; + + manifest.manifest_length += manifest_length; + + object_store + .put( + &strip_prefix(&manifest.manifest_path).as_str().into(), + manifest_bytes.into(), + ) + .await?; + + manifest_list_writer.append_ser(manifest)?; + } else { + // Split datafiles + let splits = split_datafiles( + new_datafile_iter, + bounding_partition_values, + &partition_column_names, + n_splits, + )?; + + for (i, entries) in splits.into_iter().enumerate() { + let mut manifest_writer = ManifestWriter::new( + Vec::new(), + &manifest_schema, + table_metadata, + branch.as_deref(), + )?; + + let manifest_location = table_metadata.location.to_string() + + "/metadata/" + + snapshot_uuid + + "-m" + + &i.to_string() + + ".avro"; + let mut manifest = ManifestListEntry { + format_version: table_metadata.format_version.clone(), + manifest_path: manifest_location, + manifest_length: 0, + partition_spec_id: table_metadata.default_spec_id, + content: Content::Data, + sequence_number: table_metadata.last_sequence_number, + min_sequence_number: 0, + added_snapshot_id: snapshot_id, + added_files_count: Some(0), + existing_files_count: Some(0), + deleted_files_count: Some(0), + added_rows_count: Some(0), + existing_rows_count: Some(0), + deleted_rows_count: Some(0), + partitions: None, + key_metadata: None, + }; + + for manifest_entry in entries { + { + let mut added_rows_count = 0; + + if manifest.partitions.is_none() { + manifest.partitions = Some( + table_metadata + .default_partition_spec()? + .fields() + .iter() + .map(|_| FieldSummary { + contains_null: false, + contains_nan: None, + lower_bound: None, + upper_bound: None, + }) + .collect::>(), + ); + } + + added_rows_count += manifest_entry.data_file().record_count(); + update_partitions( + manifest.partitions.as_mut().unwrap(), + manifest_entry.data_file().partition(), + table_metadata.default_partition_spec()?.fields(), + )?; + + manifest_writer.append_ser(manifest_entry)?; + + manifest.added_files_count = match manifest.added_files_count { + Some(count) => Some(count + new_file_count as i32), + None => Some(new_file_count as i32), + }; + manifest.added_rows_count = match manifest.added_rows_count { + Some(count) => Some(count + added_rows_count), + None => Some(added_rows_count), + }; + } } - }) - .await?; - let manifest_list_bytes = Arc::into_inner(manifest_list_writer) - .unwrap() - .into_inner() - .into_inner()?; + let manifest_bytes = manifest_writer.into_inner()?; + + let manifest_length: i64 = manifest_bytes.len() as i64; + + manifest.manifest_length += manifest_length; + + object_store + .put( + &strip_prefix(&manifest.manifest_path).as_str().into(), + manifest_bytes.into(), + ) + .await?; + + manifest_list_writer.append_ser(manifest)?; + } + }; + + let manifest_list_bytes = manifest_list_writer.into_inner()?; object_store .put( - &strip_prefix(&manifest_list_location).into(), + &strip_prefix(&new_manifest_list_location).into(), manifest_list_bytes.into(), ) .await?; @@ -659,7 +828,7 @@ impl Operation { .with_snapshot_id(snapshot_id) .with_sequence_number(0) .with_schema_id(*schema.schema_id()) - .with_manifest_list(manifest_list_location) + .with_manifest_list(new_manifest_list_location) .with_summary(Summary { operation: iceberg_rust_spec::spec::snapshot::Operation::Append, other: additional_summary.unwrap_or_default(), @@ -727,111 +896,6 @@ impl Operation { } } -pub enum ManifestStatus { - New(ManifestListEntry), -} - -pub(crate) async fn write_manifest( - table_metadata: &TableMetadata, - manifest: ManifestStatus, - files: Vec, - datafiles: Arc>>, - schema: &Schema, - object_store: Arc, - branch: Option, -) -> Result { - let partition_spec = table_metadata.default_partition_spec()?; - let manifest_schema = ManifestEntry::schema( - &partition_value_schema(partition_spec.fields(), schema)?, - &table_metadata.format_version, - )?; - - let mut manifest_writer = ManifestWriter::new( - Vec::new(), - &manifest_schema, - table_metadata, - branch.as_deref(), - )?; - - let mut manifest = match manifest { - ManifestStatus::New(manifest) => manifest, - }; - let files_count = manifest.added_files_count.unwrap_or_default() + files.len() as i32; - for path in files { - for datafile in datafiles.get(&path).ok_or(Error::InvalidFormat( - "Datafiles for partition value".to_string(), - ))? { - let mut added_rows_count = 0; - - if manifest.partitions.is_none() { - manifest.partitions = Some( - partition_spec - .fields() - .iter() - .map(|_| FieldSummary { - contains_null: false, - contains_nan: None, - lower_bound: None, - upper_bound: None, - }) - .collect::>(), - ); - } - - added_rows_count += datafile.record_count(); - update_partitions( - manifest.partitions.as_mut().unwrap(), - datafile.partition(), - partition_spec.fields(), - )?; - - let manifest_entry = ManifestEntry::builder() - .with_format_version(table_metadata.format_version.clone()) - .with_status(Status::Added) - .with_snapshot_id( - table_metadata - .current_snapshot_id - .ok_or(Error::NotFound("Snapshot".to_owned(), "id".to_owned()))?, - ) - .with_sequence_number( - table_metadata - .current_snapshot(branch.as_deref())? - .map(|x| *x.sequence_number()) - .ok_or(Error::NotFound("Sequence".to_owned(), "number".to_owned()))?, - ) - .with_data_file(datafile.clone()) - .build() - .map_err(SpecError::from)?; - - manifest_writer.append_ser(manifest_entry)?; - - manifest.added_files_count = match manifest.added_files_count { - Some(count) => Some(count + files_count), - None => Some(files_count), - }; - manifest.added_rows_count = match manifest.added_rows_count { - Some(count) => Some(count + added_rows_count), - None => Some(added_rows_count), - }; - } - } - - let manifest_bytes = manifest_writer.into_inner()?; - - let manifest_length: i64 = manifest_bytes.len() as i64; - - manifest.manifest_length += manifest_length; - - object_store - .put( - &strip_prefix(&manifest.manifest_path).as_str().into(), - manifest_bytes.into(), - ) - .await?; - - Ok::<_, Error>(manifest) -} - fn update_partitions( partitions: &mut [FieldSummary], partition_values: &Struct, From 24ab564f54e7284be4eaeb92679de839f106d9ac Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Thu, 24 Oct 2024 09:06:55 +0200 Subject: [PATCH 05/32] fix clippy warnings --- iceberg-file-catalog/src/lib.rs | 12 ++--- iceberg-rust-spec/src/spec/manifest_list.rs | 6 +-- iceberg-rust-spec/src/spec/values.rs | 13 +++++- iceberg-rust/src/table/transaction/append.rs | 1 + .../src/table/transaction/operation.rs | 46 ++++++++----------- iceberg-rust/src/util/mod.rs | 8 ++-- 6 files changed, 45 insertions(+), 41 deletions(-) diff --git a/iceberg-file-catalog/src/lib.rs b/iceberg-file-catalog/src/lib.rs index a012fd3d..222785e8 100644 --- a/iceberg-file-catalog/src/lib.rs +++ b/iceberg-file-catalog/src/lib.rs @@ -113,7 +113,7 @@ impl Catalog for FileCatalog { .map_err(IcebergError::from) .map_ok(|x| { let path = x.location.as_ref(); - self.identifier(&path) + self.identifier(path) }) .try_collect() .await @@ -501,7 +501,7 @@ impl Catalog for FileCatalog { impl FileCatalog { fn namespace_path(&self, namespace: &str) -> String { - self.path.as_str().trim_end_matches('/').to_owned() + "/" + &self.name + "/" + &namespace + self.path.as_str().trim_end_matches('/').to_owned() + "/" + &self.name + "/" + namespace } fn tabular_path(&self, namespace: &str, name: &str) -> String { @@ -509,7 +509,7 @@ impl FileCatalog { + "/" + &self.name + "/" - + &namespace + + namespace + "/" + name } @@ -523,7 +523,7 @@ impl FileCatalog { .try_filter(|x| { future::ready( x.ends_with("metadata.json") - && x.starts_with(&(path.clone() + "/v").trim_start_matches('/')), + && x.starts_with((path.clone() + "/v").trim_start_matches('/')), ) }) .try_collect() @@ -551,9 +551,7 @@ impl FileCatalog { let parts = path .trim_start_matches(self.path.trim_start_matches('/')) .trim_start_matches('/') - .split('/') - .skip(1) - .next() + .split('/').nth(1) .ok_or(IcebergError::InvalidFormat("Namespace in path".to_owned()))? .to_owned(); Namespace::try_new(&[parts]).map_err(IcebergError::from) diff --git a/iceberg-rust-spec/src/spec/manifest_list.rs b/iceberg-rust-spec/src/spec/manifest_list.rs index cf1a8d64..9247e5c2 100644 --- a/iceberg-rust-spec/src/spec/manifest_list.rs +++ b/iceberg-rust-spec/src/spec/manifest_list.rs @@ -49,7 +49,7 @@ impl<'a, 'metadata, R: Read> ManifestListReader<'a, 'metadata, R> { FormatVersion::V2 => manifest_list_schema_v2(), }; Ok(Self { - reader: AvroReader::with_schema(&schema, reader)? + reader: AvroReader::with_schema(schema, reader)? .zip(repeat(table_metadata)) .map(avro_value_to_manifest_file), }) @@ -781,7 +781,7 @@ mod tests { let schema = manifest_list_schema_v2(); - let mut writer = apache_avro::Writer::new(&schema, Vec::new()); + let mut writer = apache_avro::Writer::new(schema, Vec::new()); writer.append_ser(manifest_file.clone()).unwrap(); @@ -861,7 +861,7 @@ mod tests { let schema = manifest_list_schema_v1(); - let mut writer = apache_avro::Writer::new(&schema, Vec::new()); + let mut writer = apache_avro::Writer::new(schema, Vec::new()); writer.append_ser(manifest_file.clone()).unwrap(); diff --git a/iceberg-rust-spec/src/spec/values.rs b/iceberg-rust-spec/src/spec/values.rs index 7bef7b90..fff07c6d 100644 --- a/iceberg-rust-spec/src/spec/values.rs +++ b/iceberg-rust-spec/src/spec/values.rs @@ -6,11 +6,13 @@ use std::{ any::Any, collections::{btree_map::Keys, BTreeMap, HashMap}, fmt, + hash::Hash, io::Cursor, slice::Iter, }; use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, TimeZone, Utc}; +use itertools::Itertools; use ordered_float::OrderedFloat; use rust_decimal::Decimal; use serde::{ @@ -128,7 +130,7 @@ impl fmt::Display for Value { /// The partition struct stores the tuple of partition values for each file. /// Its type is derived from the partition fields of the partition spec used to write the manifest file. /// In v2, the partition struct’s field ids must match the ids from the partition spec. -#[derive(Debug, Clone, Eq, Hash, PartialOrd, Ord)] +#[derive(Debug, Clone, Eq, PartialOrd, Ord)] pub struct Struct { /// Vector to store the field values pub fields: Vec>, @@ -273,6 +275,15 @@ impl PartialEq for Struct { } } +impl Hash for Struct { + fn hash(&self, state: &mut H) { + for key in self.keys().sorted() { + key.hash(state); + self.get(&key).hash(state); + } + } +} + impl Value { /// Perform a partition transformation for the given value pub fn tranform(&self, transform: &Transform) -> Result { diff --git a/iceberg-rust/src/table/transaction/append.rs b/iceberg-rust/src/table/transaction/append.rs index 68a188ec..7a1b06f7 100644 --- a/iceberg-rust/src/table/transaction/append.rs +++ b/iceberg-rust/src/table/transaction/append.rs @@ -9,6 +9,7 @@ use crate::{ }; /// Split sets of datafiles depending on their partition_values +#[allow(clippy::type_complexity)] pub(crate) fn split_datafiles_once( files: impl Iterator>, rect: Rectangle, diff --git a/iceberg-rust/src/table/transaction/operation.rs b/iceberg-rust/src/table/transaction/operation.rs index 2cd07f79..4fb0fcd1 100644 --- a/iceberg-rust/src/table/transaction/operation.rs +++ b/iceberg-rust/src/table/transaction/operation.rs @@ -109,11 +109,10 @@ impl Operation { let bounding_partition_values = files .iter() - .fold(Ok::<_, Error>(None), |acc, x| { - let acc = acc?; + .try_fold(None, |acc, x| { let node = struct_to_smallvec(x.partition(), &partition_column_names)?; let Some(mut acc) = acc else { - return Ok(Some(Rectangle::new(node.clone(), node))); + return Ok::<_, Error>(Some(Rectangle::new(node.clone(), node))); }; acc.expand_with_node(node); Ok(Some(acc)) @@ -129,7 +128,7 @@ impl Operation { }; let mut manifest_list_writer = - apache_avro::Writer::new(&manifest_list_schema, Vec::new()); + apache_avro::Writer::new(manifest_list_schema, Vec::new()); let old_manifest_list_location = old_snapshot.map(|x| x.manifest_list()).cloned(); @@ -139,21 +138,19 @@ impl Operation { let manifest = if let Some(old_manifest_list_location) = &old_manifest_list_location { let old_manifest_list_bytes = object_store - .get(&strip_prefix(&old_manifest_list_location).as_str().into()) + .get(&strip_prefix(old_manifest_list_location).as_str().into()) .await? .bytes() .await?; - let manifest_list_reader = + let mut manifest_list_reader = ManifestListReader::new(old_manifest_list_bytes.as_ref(), table_metadata)?; // Check if table is partitioned let manifest = if partition_column_names.is_empty() { // Find the manifest with the lowest row count manifest_list_reader - .fold(Ok::<_, Error>(None), |acc, x| { - let acc = acc?; - + .try_fold(None, |acc, x| { let manifest = x?; let row_count = manifest.added_rows_count; @@ -161,7 +158,7 @@ impl Operation { file_count += manifest.added_files_count.unwrap_or(0) as usize; let Some((old_row_count, old_manifest)) = acc else { - return Ok(Some((row_count, manifest))); + return Ok::<_, Error>(Some((row_count, manifest))); }; let Some(row_count) = row_count else { @@ -183,9 +180,7 @@ impl Operation { } else { // Find the manifest with the smallest bounding partition values manifest_list_reader - .fold(Ok::<_, Error>(None), |acc, x| { - let acc = acc?; - + .try_fold(None, |acc, x| { let manifest = x?; let mut bounds = summary_to_rectangle( @@ -200,7 +195,7 @@ impl Operation { file_count += manifest.added_files_count.unwrap_or(0) as usize; let Some((old_bounds, old_manifest)) = acc else { - return Ok(Some((bounds, manifest))); + return Ok::<_, Error>(Some((bounds, manifest))); }; match old_bounds.cmp_with_priority(&bounds)? { @@ -307,8 +302,9 @@ impl Operation { + "-m" + &0.to_string() + ".avro"; - let manifest = ManifestListEntry { - format_version: table_metadata.format_version.clone(), + + ManifestListEntry { + format_version: table_metadata.format_version, manifest_path: manifest_location, manifest_length: 0, partition_spec_id: table_metadata.default_spec_id, @@ -324,8 +320,7 @@ impl Operation { deleted_rows_count: Some(0), partitions: None, key_metadata: None, - }; - manifest + } }); for manifest_entry in new_datafile_iter { @@ -427,7 +422,7 @@ impl Operation { + &i.to_string() + ".avro"; let mut manifest = ManifestListEntry { - format_version: table_metadata.format_version.clone(), + format_version: table_metadata.format_version, manifest_path: manifest_location, manifest_length: 0, partition_spec_id: table_metadata.default_spec_id, @@ -564,11 +559,10 @@ impl Operation { let bounding_partition_values = files .iter() - .fold(Ok::<_, Error>(None), |acc, x| { - let acc = acc?; + .try_fold(None, |acc, x| { let node = struct_to_smallvec(x.partition(), &partition_column_names)?; let Some(mut acc) = acc else { - return Ok(Some(Rectangle::new(node.clone(), node))); + return Ok::<_, Error>(Some(Rectangle::new(node.clone(), node))); }; acc.expand_with_node(node); Ok(Some(acc)) @@ -584,7 +578,7 @@ impl Operation { }; let mut manifest_list_writer = - apache_avro::Writer::new(&manifest_list_schema, Vec::new()); + apache_avro::Writer::new(manifest_list_schema, Vec::new()); let new_file_count = files.len(); @@ -641,7 +635,7 @@ impl Operation { + &0.to_string() + ".avro"; let mut manifest = ManifestListEntry { - format_version: table_metadata.format_version.clone(), + format_version: table_metadata.format_version, manifest_path: manifest_location, manifest_length: 0, partition_spec_id: table_metadata.default_spec_id, @@ -739,7 +733,7 @@ impl Operation { + &i.to_string() + ".avro"; let mut manifest = ManifestListEntry { - format_version: table_metadata.format_version.clone(), + format_version: table_metadata.format_version, manifest_path: manifest_location, manifest_length: 0, partition_spec_id: table_metadata.default_spec_id, @@ -901,7 +895,7 @@ fn update_partitions( partition_values: &Struct, partition_columns: &[PartitionField], ) -> Result<(), Error> { - for (field, summary) in partition_columns.into_iter().zip(partitions.iter_mut()) { + for (field, summary) in partition_columns.iter().zip(partitions.iter_mut()) { let value = partition_values.get(field.name()).and_then(|x| x.as_ref()); if let Some(value) = value { if summary.lower_bound.is_none() { diff --git a/iceberg-rust/src/util/mod.rs b/iceberg-rust/src/util/mod.rs index 708bd87e..04bce4b8 100644 --- a/iceberg-rust/src/util/mod.rs +++ b/iceberg-rust/src/util/mod.rs @@ -74,7 +74,7 @@ where } pub(crate) fn contains(&self, rect: &Rectangle) -> bool { - if self.min.len() == 0 { + if self.min.is_empty() { return false; } for i in 0..self.min.len() { @@ -93,8 +93,8 @@ pub(crate) fn struct_to_smallvec( names .iter() .map(|x| { - s.get(*x) - .and_then(|x| identity(x).as_ref().map(Clone::clone)) + s.get(x) + .and_then(|x| identity(x).clone()) }) .collect::>>() .ok_or(Error::InvalidFormat("Partition struct".to_owned())) @@ -125,7 +125,7 @@ pub(crate) fn summary_to_rectangle(summaries: &[FieldSummary]) -> Result(left: &[C], right: &[C]) -> Result { while let (Some(own), Some(other)) = (left.iter().next(), right.iter().next()) { let ordering = own - .partial_cmp(&other) + .partial_cmp(other) .ok_or(Error::InvalidFormat("Types for Partial Order".to_owned()))?; let Ordering::Equal = ordering else { return Ok(ordering); From 2f8b7d27a21336c6ae707adee9586cee975113fc Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Thu, 24 Oct 2024 10:50:59 +0200 Subject: [PATCH 06/32] fix out of bound bug in sub --- iceberg-rust/src/util/mod.rs | 43 +++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/iceberg-rust/src/util/mod.rs b/iceberg-rust/src/util/mod.rs index 04bce4b8..5da456cd 100644 --- a/iceberg-rust/src/util/mod.rs +++ b/iceberg-rust/src/util/mod.rs @@ -92,10 +92,7 @@ pub(crate) fn struct_to_smallvec( ) -> Result, Error> { names .iter() - .map(|x| { - s.get(x) - .and_then(|x| identity(x).clone()) - }) + .map(|x| s.get(x).and_then(|x| identity(x).clone())) .collect::>>() .ok_or(Error::InvalidFormat("Partition struct".to_owned())) } @@ -137,7 +134,43 @@ pub(crate) fn cmp_dist(left: &[C], right: &[C]) -> Result(left: &[C], right: &[C]) -> Result, Error> { let mut v = SmallVec::with_capacity(left.len()); for i in 0..left.len() { - v[i] = left[i].try_sub(&right[i])? + v.push(left[i].try_sub(&right[i])?); } Ok(v) } + +#[cfg(test)] +mod tests { + use iceberg_rust_spec::values::Value; + + use crate::util::sub; + + #[test] + fn test_sub_valid() { + let left = vec![Value::Int(5), Value::Int(10), Value::Int(15)]; + let right = vec![Value::Int(2), Value::Int(3), Value::Int(5)]; + let result = sub(&left, &right).unwrap(); + assert_eq!(result.len(), 3); + assert_eq!(result[0], Value::Int(3)); + assert_eq!(result[1], Value::Int(7)); + assert_eq!(result[2], Value::Int(10)); + } + + #[test] + fn test_sub_empty() { + let left: Vec = vec![]; + let right: Vec = vec![]; + let result = sub(&left, &right).unwrap(); + assert_eq!(result.len(), 0); + } + + #[test] + fn test_sub_same_numbers() { + let left = vec![Value::Int(5), Value::Int(5)]; + let right = vec![Value::Int(5), Value::Int(5)]; + let result = sub(&left, &right).unwrap(); + assert_eq!(result.len(), 2); + assert_eq!(result[0], Value::Int(0)); + assert_eq!(result[1], Value::Int(0)); + } +} From ec1aefcfd410cf5c3c05433f387914294ca1597c Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Thu, 24 Oct 2024 11:00:14 +0200 Subject: [PATCH 07/32] fix_cmp _dist bug --- iceberg-rust/src/util/mod.rs | 50 ++++++++++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/iceberg-rust/src/util/mod.rs b/iceberg-rust/src/util/mod.rs index 5da456cd..337d2476 100644 --- a/iceberg-rust/src/util/mod.rs +++ b/iceberg-rust/src/util/mod.rs @@ -120,7 +120,7 @@ pub(crate) fn summary_to_rectangle(summaries: &[FieldSummary]) -> Result(left: &[C], right: &[C]) -> Result { - while let (Some(own), Some(other)) = (left.iter().next(), right.iter().next()) { + for (own, other) in left.iter().zip(right.iter()) { let ordering = own .partial_cmp(other) .ok_or(Error::InvalidFormat("Types for Partial Order".to_owned()))?; @@ -143,7 +143,7 @@ pub(crate) fn sub(left: &[C], right: &[C]) -> Result mod tests { use iceberg_rust_spec::values::Value; - use crate::util::sub; + use super::*; #[test] fn test_sub_valid() { @@ -173,4 +173,50 @@ mod tests { assert_eq!(result[0], Value::Int(0)); assert_eq!(result[1], Value::Int(0)); } + #[test] + + fn test_cmp_dist_empty_slices() { + let result = cmp_dist::(&[], &[]); + assert_eq!(result.unwrap(), Ordering::Equal); + } + + #[test] + fn test_cmp_dist_equal_slices() { + let left = vec![1, 2, 3]; + let right = vec![1, 2, 3]; + let result = cmp_dist(&left, &right); + assert_eq!(result.unwrap(), Ordering::Equal); + } + + #[test] + fn test_cmp_dist_less_than() { + let left = vec![1, 2]; + let right = vec![1, 3]; + let result = cmp_dist(&left, &right); + assert_eq!(result.unwrap(), Ordering::Less); + } + + #[test] + fn test_cmp_dist_greater_than() { + let left = vec![1, 4]; + let right = vec![1, 3]; + let result = cmp_dist(&left, &right); + assert_eq!(result.unwrap(), Ordering::Greater); + } + + #[test] + fn test_cmp_dist_with_floats() { + let left = vec![1.0, 2.0]; + let right = vec![1.0, 2.0]; + let result = cmp_dist(&left, &right); + assert_eq!(result.unwrap(), Ordering::Equal); + } + + #[test] + fn test_cmp_dist_with_nan() { + let left = vec![1.0, f64::NAN]; + let right = vec![1.0, 2.0]; + let result = cmp_dist(&left, &right); + assert!(matches!(result, Err(Error::InvalidFormat(_)))); + } } From 9a57a75c752014294f9f29cc59301a1aa6f566a0 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Thu, 24 Oct 2024 11:11:51 +0200 Subject: [PATCH 08/32] add tests for contains --- iceberg-rust-spec/src/spec/values.rs | 13 ++++++ iceberg-rust/src/util/mod.rs | 65 ++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) diff --git a/iceberg-rust-spec/src/spec/values.rs b/iceberg-rust-spec/src/spec/values.rs index fff07c6d..d95ce6be 100644 --- a/iceberg-rust-spec/src/spec/values.rs +++ b/iceberg-rust-spec/src/spec/values.rs @@ -8,6 +8,7 @@ use std::{ fmt, hash::Hash, io::Cursor, + ops::{Add, Sub}, slice::Iter, }; @@ -835,6 +836,18 @@ pub trait TrySub: Sized { fn try_sub(&self, other: &Self) -> Result; } +impl + Copy> TryAdd for T { + fn try_add(&self, other: &Self) -> Result { + Ok(*self + *other) + } +} + +impl + Copy> TrySub for T { + fn try_sub(&self, other: &Self) -> Result { + Ok(*self - *other) + } +} + impl TryAdd for Value { fn try_add(&self, other: &Self) -> Result { match (self, other) { diff --git a/iceberg-rust/src/util/mod.rs b/iceberg-rust/src/util/mod.rs index 337d2476..ac9b9298 100644 --- a/iceberg-rust/src/util/mod.rs +++ b/iceberg-rust/src/util/mod.rs @@ -142,6 +142,7 @@ pub(crate) fn sub(left: &[C], right: &[C]) -> Result #[cfg(test)] mod tests { use iceberg_rust_spec::values::Value; + use smallvec::smallvec; use super::*; @@ -219,4 +220,68 @@ mod tests { let result = cmp_dist(&left, &right); assert!(matches!(result, Err(Error::InvalidFormat(_)))); } + #[test] + fn test_rectangle_contains_empty() { + let container = Rectangle:: { + min: smallvec![], + max: smallvec![], + }; + let rect = Rectangle:: { + min: smallvec![1, 2], + max: smallvec![3, 4], + }; + assert!(!container.contains(&rect)); + } + + #[test] + fn test_rectangle_contains_true() { + let container = Rectangle:: { + min: smallvec![0, 0], + max: smallvec![10, 10], + }; + let rect = Rectangle:: { + min: smallvec![2, 2], + max: smallvec![8, 8], + }; + assert!(container.contains(&rect)); + } + + #[test] + fn test_rectangle_contains_false_min_boundary() { + let container = Rectangle:: { + min: smallvec![1, 1], + max: smallvec![10, 10], + }; + let rect = Rectangle:: { + min: smallvec![0, 5], + max: smallvec![5, 8], + }; + assert!(!container.contains(&rect)); + } + + #[test] + fn test_rectangle_contains_false_max_boundary() { + let container = Rectangle:: { + min: smallvec![0, 0], + max: smallvec![10, 10], + }; + let rect = Rectangle:: { + min: smallvec![5, 5], + max: smallvec![11, 8], + }; + assert!(!container.contains(&rect)); + } + + #[test] + fn test_rectangle_contains_higher_dimensions() { + let container = Rectangle:: { + min: smallvec![0, 0, 0], + max: smallvec![10, 10, 10], + }; + let rect = Rectangle:: { + min: smallvec![1, 1, 1], + max: smallvec![9, 9, 9], + }; + assert!(container.contains(&rect)); + } } From 71c3e93769e79d86f1135dd1785ba85c84393b72 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Thu, 24 Oct 2024 15:26:12 +0200 Subject: [PATCH 09/32] tests for cmp_with_priority --- iceberg-rust/src/util/mod.rs | 128 ++++++++++++----------------------- 1 file changed, 42 insertions(+), 86 deletions(-) diff --git a/iceberg-rust/src/util/mod.rs b/iceberg-rust/src/util/mod.rs index ac9b9298..8674d069 100644 --- a/iceberg-rust/src/util/mod.rs +++ b/iceberg-rust/src/util/mod.rs @@ -46,43 +46,25 @@ where /// Determine of one rectangle is larger than the other pub(crate) fn cmp_with_priority(&self, other: &Rectangle) -> Result { - if self.contains(other) { - Ok(Ordering::Greater) - } else if other.contains(self) { - Ok(Ordering::Less) - } else { - let mut self_iter = self - .max - .iter() - .zip(self.min.iter()) - .map(|(max, min)| max.try_sub(min)); - let mut other_iter = other - .max - .iter() - .zip(other.min.iter()) - .map(|(max, min)| max.try_sub(min)); - while let (Some(own), Some(other)) = (self_iter.next(), other_iter.next()) { - let ordering = own? - .partial_cmp(&other?) - .ok_or(Error::InvalidFormat("Types for Partial Order".to_owned()))?; - let Ordering::Equal = ordering else { - return Ok(ordering); - }; - } - Ok(Ordering::Equal) - } - } - - pub(crate) fn contains(&self, rect: &Rectangle) -> bool { - if self.min.is_empty() { - return false; - } - for i in 0..self.min.len() { - if rect.min[i] < self.min[i] || rect.max[i] > self.max[i] { - return false; - } + let self_iter = self + .max + .iter() + .zip(self.min.iter()) + .map(|(max, min)| max.try_sub(min)); + let other_iter = other + .max + .iter() + .zip(other.min.iter()) + .map(|(max, min)| max.try_sub(min)); + for (own, other) in self_iter.zip(other_iter) { + let ordering = own? + .partial_cmp(&other?) + .ok_or(Error::InvalidFormat("Types for Partial Order".to_owned()))?; + let Ordering::Equal = ordering else { + return Ok(ordering); + }; } - true + Ok(Ordering::Equal) } } @@ -220,68 +202,42 @@ mod tests { let result = cmp_dist(&left, &right); assert!(matches!(result, Err(Error::InvalidFormat(_)))); } + #[test] - fn test_rectangle_contains_empty() { - let container = Rectangle:: { - min: smallvec![], - max: smallvec![], - }; - let rect = Rectangle:: { - min: smallvec![1, 2], - max: smallvec![3, 4], - }; - assert!(!container.contains(&rect)); + fn test_rectangle_cmp_with_priority_greater() { + let larger = Rectangle::new(smallvec![0, 0], smallvec![10, 10]); + let smaller = Rectangle::new(smallvec![1, 1], smallvec![8, 8]); + assert_eq!( + larger.cmp_with_priority(&smaller).unwrap(), + Ordering::Greater + ); } #[test] - fn test_rectangle_contains_true() { - let container = Rectangle:: { - min: smallvec![0, 0], - max: smallvec![10, 10], - }; - let rect = Rectangle:: { - min: smallvec![2, 2], - max: smallvec![8, 8], - }; - assert!(container.contains(&rect)); + fn test_rectangle_cmp_with_priority_less() { + let larger = Rectangle::new(smallvec![0, 0], smallvec![10, 10]); + let smaller = Rectangle::new(smallvec![1, 1], smallvec![8, 8]); + assert_eq!(smaller.cmp_with_priority(&larger).unwrap(), Ordering::Less); } #[test] - fn test_rectangle_contains_false_min_boundary() { - let container = Rectangle:: { - min: smallvec![1, 1], - max: smallvec![10, 10], - }; - let rect = Rectangle:: { - min: smallvec![0, 5], - max: smallvec![5, 8], - }; - assert!(!container.contains(&rect)); + fn test_rectangle_cmp_with_priority_equal() { + let rect1 = Rectangle::new(smallvec![0, 0], smallvec![5, 5]); + let rect2 = Rectangle::new(smallvec![0, 0], smallvec![5, 5]); + assert_eq!(rect1.cmp_with_priority(&rect2).unwrap(), Ordering::Equal); } #[test] - fn test_rectangle_contains_false_max_boundary() { - let container = Rectangle:: { - min: smallvec![0, 0], - max: smallvec![10, 10], - }; - let rect = Rectangle:: { - min: smallvec![5, 5], - max: smallvec![11, 8], - }; - assert!(!container.contains(&rect)); + fn test_rectangle_cmp_with_priority_partial_dimensions() { + let rect1 = Rectangle::new(smallvec![0, 0], smallvec![6, 4]); + let rect2 = Rectangle::new(smallvec![0, 0], smallvec![4, 6]); + assert!(rect1.cmp_with_priority(&rect2).is_ok()); } #[test] - fn test_rectangle_contains_higher_dimensions() { - let container = Rectangle:: { - min: smallvec![0, 0, 0], - max: smallvec![10, 10, 10], - }; - let rect = Rectangle:: { - min: smallvec![1, 1, 1], - max: smallvec![9, 9, 9], - }; - assert!(container.contains(&rect)); + fn test_rectangle_cmp_with_priority_overlapping() { + let rect1 = Rectangle::new(smallvec![2, 0], smallvec![8, 4]); + let rect2 = Rectangle::new(smallvec![0, 2], smallvec![6, 6]); + assert!(rect1.cmp_with_priority(&rect2).is_ok()); } } From 3b14ef93a0f80ada741705b0f70b459ea52c3a80 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Thu, 24 Oct 2024 15:39:42 +0200 Subject: [PATCH 10/32] expand with node test --- iceberg-rust/src/util/mod.rs | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/iceberg-rust/src/util/mod.rs b/iceberg-rust/src/util/mod.rs index 8674d069..efa9cca1 100644 --- a/iceberg-rust/src/util/mod.rs +++ b/iceberg-rust/src/util/mod.rs @@ -240,4 +240,39 @@ mod tests { let rect2 = Rectangle::new(smallvec![0, 2], smallvec![6, 6]); assert!(rect1.cmp_with_priority(&rect2).is_ok()); } + #[test] + fn test_expand_with_node_smaller_values() { + let mut bounds = Rectangle::new(smallvec![5, 5, 5], smallvec![10, 10, 10]); + let node: SmallVec<[i32; 4]> = smallvec![3, 4, 2]; + bounds.expand_with_node(node); + assert_eq!(bounds.min, smallvec![3, 4, 2] as SmallVec<[i32; 4]>); + assert_eq!(bounds.max, smallvec![10, 10, 10] as SmallVec<[i32; 4]>); + } + + #[test] + fn test_expand_with_node_larger_values() { + let mut bounds = Rectangle::new(smallvec![5, 5, 5], smallvec![10, 10, 10]); + let node: SmallVec<[i32; 4]> = smallvec![6, 12, 11]; + bounds.expand_with_node(node); + assert_eq!(bounds.min, smallvec![5, 5, 5] as SmallVec<[i32; 4]>); + assert_eq!(bounds.max, smallvec![10, 12, 11] as SmallVec<[i32; 4]>); + } + + #[test] + fn test_expand_with_node_mixed_values() { + let mut bounds = Rectangle::new(smallvec![5, 5, 5], smallvec![10, 10, 10]); + let node: SmallVec<[i32; 4]> = smallvec![3, 15, 7]; + bounds.expand_with_node(node); + assert_eq!(bounds.min, smallvec![3, 5, 5] as SmallVec<[i32; 4]>); + assert_eq!(bounds.max, smallvec![10, 15, 10] as SmallVec<[i32; 4]>); + } + + #[test] + fn test_expand_with_node_equal_values() { + let mut bounds = Rectangle::new(smallvec![5, 5, 5], smallvec![10, 10, 10]); + let node: SmallVec<[i32; 4]> = smallvec![5, 10, 7]; + bounds.expand_with_node(node); + assert_eq!(bounds.min, smallvec![5, 5, 5] as SmallVec<[i32; 4]>); + assert_eq!(bounds.max, smallvec![10, 10, 10] as SmallVec<[i32; 4]>); + } } From b09e813747941890b177ba7214f2001a8da707ef Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Thu, 24 Oct 2024 15:43:21 +0200 Subject: [PATCH 11/32] test expand rectangle --- iceberg-rust/src/util/mod.rs | 71 ++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/iceberg-rust/src/util/mod.rs b/iceberg-rust/src/util/mod.rs index efa9cca1..4ccfef41 100644 --- a/iceberg-rust/src/util/mod.rs +++ b/iceberg-rust/src/util/mod.rs @@ -275,4 +275,75 @@ mod tests { assert_eq!(bounds.min, smallvec![5, 5, 5] as SmallVec<[i32; 4]>); assert_eq!(bounds.max, smallvec![10, 10, 10] as SmallVec<[i32; 4]>); } + #[test] + fn test_rectangle_expand() { + let mut rect1 = Rectangle { + min: smallvec![0, 0], + max: smallvec![5, 5], + }; + + let rect2 = Rectangle { + min: smallvec![-1, -1], + max: smallvec![3, 6], + }; + + rect1.expand(&rect2); + + assert_eq!(rect1.min, smallvec![-1, -1] as SmallVec<[i32; 4]>); + assert_eq!(rect1.max, smallvec![5, 6] as SmallVec<[i32; 4]>); + } + + #[test] + fn test_rectangle_expand_no_change() { + let mut rect1 = Rectangle { + min: smallvec![0, 0], + max: smallvec![10, 10], + }; + + let rect2 = Rectangle { + min: smallvec![2, 2], + max: smallvec![8, 8], + }; + + rect1.expand(&rect2); + + assert_eq!(rect1.min, smallvec![0, 0] as SmallVec<[i32; 4]>); + assert_eq!(rect1.max, smallvec![10, 10] as SmallVec<[i32; 4]>); + } + + #[test] + fn test_rectangle_expand_single_dimension() { + let mut rect1 = Rectangle { + min: smallvec![5], + max: smallvec![10], + }; + + let rect2 = Rectangle { + min: smallvec![3], + max: smallvec![12], + }; + + rect1.expand(&rect2); + + assert_eq!(rect1.min, smallvec![3] as SmallVec<[i32; 4]>); + assert_eq!(rect1.max, smallvec![12] as SmallVec<[i32; 4]>); + } + + #[test] + fn test_rectangle_expand_empty() { + let mut rect1: Rectangle = Rectangle { + min: smallvec![], + max: smallvec![], + }; + + let rect2 = Rectangle { + min: smallvec![], + max: smallvec![], + }; + + rect1.expand(&rect2); + + assert_eq!(rect1.min, smallvec![] as SmallVec<[i32; 4]>); + assert_eq!(rect1.max, smallvec![] as SmallVec<[i32; 4]>); + } } From db15ad875649a15abc4ca5bd3339cc4e47b16eba Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Thu, 24 Oct 2024 16:47:31 +0200 Subject: [PATCH 12/32] remove generic from rectangle --- iceberg-rust/src/table/transaction/append.rs | 14 +- iceberg-rust/src/util/mod.rs | 246 ++++++++++++------- 2 files changed, 159 insertions(+), 101 deletions(-) diff --git a/iceberg-rust/src/table/transaction/append.rs b/iceberg-rust/src/table/transaction/append.rs index 7a1b06f7..3e275186 100644 --- a/iceberg-rust/src/table/transaction/append.rs +++ b/iceberg-rust/src/table/transaction/append.rs @@ -1,20 +1,20 @@ use std::cmp::Ordering; -use iceberg_rust_spec::{manifest::ManifestEntry, values::Value}; +use iceberg_rust_spec::manifest::ManifestEntry; use smallvec::{smallvec, SmallVec}; use crate::{ error::Error, - util::{cmp_dist, struct_to_smallvec, sub, Rectangle}, + util::{cmp_with_priority, struct_to_smallvec, sub, Rectangle}, }; /// Split sets of datafiles depending on their partition_values #[allow(clippy::type_complexity)] pub(crate) fn split_datafiles_once( files: impl Iterator>, - rect: Rectangle, - names: &SmallVec<[&str; 4]>, -) -> Result<[(Vec, Rectangle); 2], Error> { + rect: Rectangle, + names: &[&str], +) -> Result<[(Vec, Rectangle); 2], Error> { let mut smaller = Vec::new(); let mut larger = Vec::new(); let mut smaller_rect = None; @@ -25,7 +25,7 @@ pub(crate) fn split_datafiles_once( let position = struct_to_smallvec(manifest_entry.data_file().partition(), names)?; // Check distance to upper and lower bound if let Ordering::Greater = - cmp_dist(&sub(&position, &rect.min)?, &sub(&rect.max, &position)?)? + cmp_with_priority(&sub(&position, &rect.min)?, &sub(&rect.max, &position)?)? { // if closer to upper bound larger.push(manifest_entry); @@ -60,7 +60,7 @@ pub(crate) fn split_datafiles_once( pub(crate) fn split_datafiles( files: impl Iterator>, - rect: Rectangle, + rect: Rectangle, names: &SmallVec<[&str; 4]>, n_split: u32, ) -> Result; 2]>, Error> { diff --git a/iceberg-rust/src/util/mod.rs b/iceberg-rust/src/util/mod.rs index 4ccfef41..18154428 100644 --- a/iceberg-rust/src/util/mod.rs +++ b/iceberg-rust/src/util/mod.rs @@ -2,27 +2,26 @@ use std::{cmp::Ordering, convert::identity}; use iceberg_rust_spec::{ manifest_list::FieldSummary, - values::{Struct, TryAdd, TrySub, Value}, + values::{Struct, TrySub, Value}, }; use smallvec::SmallVec; use crate::error::Error; +type Vec4 = SmallVec<[T; 4]>; + #[derive(Clone, PartialEq, Eq)] -pub(crate) struct Rectangle { - pub min: SmallVec<[C; 4]>, - pub max: SmallVec<[C; 4]>, +pub(crate) struct Rectangle { + pub min: Vec4, + pub max: Vec4, } -impl Rectangle -where - C: PartialOrd + Ord + Clone + TryAdd + TrySub, -{ - pub(crate) fn new(min: SmallVec<[C; 4]>, max: SmallVec<[C; 4]>) -> Self { +impl Rectangle { + pub(crate) fn new(min: Vec4, max: Vec4) -> Self { Self { min, max } } - pub(crate) fn expand(&mut self, rect: &Rectangle) { + pub(crate) fn expand(&mut self, rect: &Rectangle) { for i in 0..self.min.len() { if rect.min[i] < self.min[i] { self.min[i] = rect.min[i].clone(); @@ -33,7 +32,7 @@ where } } - pub(crate) fn expand_with_node(&mut self, node: SmallVec<[C; 4]>) { + pub(crate) fn expand_with_node(&mut self, node: Vec4) { for i in 0..self.min.len() { if node[i] < self.min[i] { self.min[i] = node[i].clone(); @@ -45,7 +44,7 @@ where } /// Determine of one rectangle is larger than the other - pub(crate) fn cmp_with_priority(&self, other: &Rectangle) -> Result { + pub(crate) fn cmp_with_priority(&self, other: &Rectangle) -> Result { let self_iter = self .max .iter() @@ -68,10 +67,7 @@ where } } -pub(crate) fn struct_to_smallvec( - s: &Struct, - names: &SmallVec<[&str; 4]>, -) -> Result, Error> { +pub(crate) fn struct_to_smallvec(s: &Struct, names: &[&str]) -> Result, Error> { names .iter() .map(|x| s.get(x).and_then(|x| identity(x).clone())) @@ -79,7 +75,7 @@ pub(crate) fn struct_to_smallvec( .ok_or(Error::InvalidFormat("Partition struct".to_owned())) } -pub(crate) fn summary_to_rectangle(summaries: &[FieldSummary]) -> Result, Error> { +pub(crate) fn summary_to_rectangle(summaries: &[FieldSummary]) -> Result { let mut max = SmallVec::with_capacity(summaries.len()); let mut min = SmallVec::with_capacity(summaries.len()); @@ -101,7 +97,7 @@ pub(crate) fn summary_to_rectangle(summaries: &[FieldSummary]) -> Result(left: &[C], right: &[C]) -> Result { +pub(crate) fn cmp_with_priority(left: &[Value], right: &[Value]) -> Result { for (own, other) in left.iter().zip(right.iter()) { let ordering = own .partial_cmp(other) @@ -113,7 +109,7 @@ pub(crate) fn cmp_dist(left: &[C], right: &[C]) -> Result(left: &[C], right: &[C]) -> Result, Error> { +pub(crate) fn sub(left: &[Value], right: &[Value]) -> Result, Error> { let mut v = SmallVec::with_capacity(left.len()); for i in 0..left.len() { v.push(left[i].try_sub(&right[i])?); @@ -159,54 +155,44 @@ mod tests { #[test] fn test_cmp_dist_empty_slices() { - let result = cmp_dist::(&[], &[]); + let result = cmp_with_priority(&[], &[]); assert_eq!(result.unwrap(), Ordering::Equal); } #[test] fn test_cmp_dist_equal_slices() { - let left = vec![1, 2, 3]; - let right = vec![1, 2, 3]; - let result = cmp_dist(&left, &right); + let left = vec![Value::Int(1), Value::Int(2), Value::Int(3)]; + let right = vec![Value::Int(1), Value::Int(2), Value::Int(3)]; + let result = cmp_with_priority(&left, &right); assert_eq!(result.unwrap(), Ordering::Equal); } #[test] fn test_cmp_dist_less_than() { - let left = vec![1, 2]; - let right = vec![1, 3]; - let result = cmp_dist(&left, &right); + let left = vec![Value::Int(1), Value::Int(2)]; + let right = vec![Value::Int(1), Value::Int(3)]; + let result = cmp_with_priority(&left, &right); assert_eq!(result.unwrap(), Ordering::Less); } #[test] fn test_cmp_dist_greater_than() { - let left = vec![1, 4]; - let right = vec![1, 3]; - let result = cmp_dist(&left, &right); + let left = vec![Value::Int(1), Value::Int(4)]; + let right = vec![Value::Int(1), Value::Int(3)]; + let result = cmp_with_priority(&left, &right); assert_eq!(result.unwrap(), Ordering::Greater); } - #[test] - fn test_cmp_dist_with_floats() { - let left = vec![1.0, 2.0]; - let right = vec![1.0, 2.0]; - let result = cmp_dist(&left, &right); - assert_eq!(result.unwrap(), Ordering::Equal); - } - - #[test] - fn test_cmp_dist_with_nan() { - let left = vec![1.0, f64::NAN]; - let right = vec![1.0, 2.0]; - let result = cmp_dist(&left, &right); - assert!(matches!(result, Err(Error::InvalidFormat(_)))); - } - #[test] fn test_rectangle_cmp_with_priority_greater() { - let larger = Rectangle::new(smallvec![0, 0], smallvec![10, 10]); - let smaller = Rectangle::new(smallvec![1, 1], smallvec![8, 8]); + let larger = Rectangle::new( + smallvec![Value::Int(0), Value::Int(0)], + smallvec![Value::Int(10), Value::Int(10)], + ); + let smaller = Rectangle::new( + smallvec![Value::Int(1), Value::Int(1)], + smallvec![Value::Int(8), Value::Int(8)], + ); assert_eq!( larger.cmp_with_priority(&smaller).unwrap(), Ordering::Greater @@ -215,123 +201,195 @@ mod tests { #[test] fn test_rectangle_cmp_with_priority_less() { - let larger = Rectangle::new(smallvec![0, 0], smallvec![10, 10]); - let smaller = Rectangle::new(smallvec![1, 1], smallvec![8, 8]); + let larger = Rectangle::new( + smallvec![Value::Int(0), Value::Int(0)], + smallvec![Value::Int(10), Value::Int(10)], + ); + let smaller = Rectangle::new( + smallvec![Value::Int(1), Value::Int(1)], + smallvec![Value::Int(8), Value::Int(8)], + ); assert_eq!(smaller.cmp_with_priority(&larger).unwrap(), Ordering::Less); } #[test] fn test_rectangle_cmp_with_priority_equal() { - let rect1 = Rectangle::new(smallvec![0, 0], smallvec![5, 5]); - let rect2 = Rectangle::new(smallvec![0, 0], smallvec![5, 5]); + let rect1 = Rectangle::new( + smallvec![Value::Int(0), Value::Int(0)], + smallvec![Value::Int(5), Value::Int(5)], + ); + let rect2 = Rectangle::new( + smallvec![Value::Int(0), Value::Int(0)], + smallvec![Value::Int(5), Value::Int(5)], + ); assert_eq!(rect1.cmp_with_priority(&rect2).unwrap(), Ordering::Equal); } #[test] fn test_rectangle_cmp_with_priority_partial_dimensions() { - let rect1 = Rectangle::new(smallvec![0, 0], smallvec![6, 4]); - let rect2 = Rectangle::new(smallvec![0, 0], smallvec![4, 6]); - assert!(rect1.cmp_with_priority(&rect2).is_ok()); + let rect1 = Rectangle::new( + smallvec![Value::Int(0), Value::Int(0)], + smallvec![Value::Int(6), Value::Int(4)], + ); + let rect2 = Rectangle::new( + smallvec![Value::Int(0), Value::Int(0)], + smallvec![Value::Int(4), Value::Int(6)], + ); + assert_eq!(rect1.cmp_with_priority(&rect2).unwrap(), Ordering::Greater); } #[test] fn test_rectangle_cmp_with_priority_overlapping() { - let rect1 = Rectangle::new(smallvec![2, 0], smallvec![8, 4]); - let rect2 = Rectangle::new(smallvec![0, 2], smallvec![6, 6]); - assert!(rect1.cmp_with_priority(&rect2).is_ok()); + let rect1 = Rectangle::new( + smallvec![Value::Int(2), Value::Int(0)], + smallvec![Value::Int(8), Value::Int(4)], + ); + let rect2 = Rectangle::new( + smallvec![Value::Int(0), Value::Int(2)], + smallvec![Value::Int(6), Value::Int(6)], + ); + assert_eq!(rect1.cmp_with_priority(&rect2).unwrap(), Ordering::Equal); } #[test] fn test_expand_with_node_smaller_values() { - let mut bounds = Rectangle::new(smallvec![5, 5, 5], smallvec![10, 10, 10]); - let node: SmallVec<[i32; 4]> = smallvec![3, 4, 2]; + let mut bounds = Rectangle::new( + smallvec![Value::Int(5), Value::Int(5), Value::Int(5)], + smallvec![Value::Int(10), Value::Int(10), Value::Int(10)], + ); + let node: SmallVec<[Value; 4]> = smallvec![Value::Int(3), Value::Int(4), Value::Int(2)]; bounds.expand_with_node(node); - assert_eq!(bounds.min, smallvec![3, 4, 2] as SmallVec<[i32; 4]>); - assert_eq!(bounds.max, smallvec![10, 10, 10] as SmallVec<[i32; 4]>); + assert_eq!( + bounds.min, + smallvec![Value::Int(3), Value::Int(4), Value::Int(2)] as SmallVec<[Value; 4]> + ); + assert_eq!( + bounds.max, + smallvec![Value::Int(10), Value::Int(10), Value::Int(10)] as SmallVec<[Value; 4]> + ); } #[test] fn test_expand_with_node_larger_values() { - let mut bounds = Rectangle::new(smallvec![5, 5, 5], smallvec![10, 10, 10]); - let node: SmallVec<[i32; 4]> = smallvec![6, 12, 11]; + let mut bounds = Rectangle::new( + smallvec![Value::Int(5), Value::Int(5), Value::Int(5)], + smallvec![Value::Int(10), Value::Int(10), Value::Int(10)], + ); + let node: SmallVec<[Value; 4]> = smallvec![Value::Int(6), Value::Int(12), Value::Int(11)]; bounds.expand_with_node(node); - assert_eq!(bounds.min, smallvec![5, 5, 5] as SmallVec<[i32; 4]>); - assert_eq!(bounds.max, smallvec![10, 12, 11] as SmallVec<[i32; 4]>); + assert_eq!( + bounds.min, + smallvec![Value::Int(5), Value::Int(5), Value::Int(5)] as SmallVec<[Value; 4]> + ); + assert_eq!( + bounds.max, + smallvec![Value::Int(10), Value::Int(12), Value::Int(11)] as SmallVec<[Value; 4]> + ); } #[test] fn test_expand_with_node_mixed_values() { - let mut bounds = Rectangle::new(smallvec![5, 5, 5], smallvec![10, 10, 10]); - let node: SmallVec<[i32; 4]> = smallvec![3, 15, 7]; + let mut bounds = Rectangle::new( + smallvec![Value::Int(5), Value::Int(5), Value::Int(5)], + smallvec![Value::Int(10), Value::Int(10), Value::Int(10)], + ); + let node: SmallVec<[Value; 4]> = smallvec![Value::Int(3), Value::Int(15), Value::Int(7)]; bounds.expand_with_node(node); - assert_eq!(bounds.min, smallvec![3, 5, 5] as SmallVec<[i32; 4]>); - assert_eq!(bounds.max, smallvec![10, 15, 10] as SmallVec<[i32; 4]>); + assert_eq!( + bounds.min, + smallvec![Value::Int(3), Value::Int(5), Value::Int(5)] as SmallVec<[Value; 4]> + ); + assert_eq!( + bounds.max, + smallvec![Value::Int(10), Value::Int(15), Value::Int(10)] as SmallVec<[Value; 4]> + ); } #[test] fn test_expand_with_node_equal_values() { - let mut bounds = Rectangle::new(smallvec![5, 5, 5], smallvec![10, 10, 10]); - let node: SmallVec<[i32; 4]> = smallvec![5, 10, 7]; + let mut bounds = Rectangle::new( + smallvec![Value::Int(5), Value::Int(5), Value::Int(5)], + smallvec![Value::Int(10), Value::Int(10), Value::Int(10)], + ); + let node: SmallVec<[Value; 4]> = smallvec![Value::Int(5), Value::Int(10), Value::Int(7)]; bounds.expand_with_node(node); - assert_eq!(bounds.min, smallvec![5, 5, 5] as SmallVec<[i32; 4]>); - assert_eq!(bounds.max, smallvec![10, 10, 10] as SmallVec<[i32; 4]>); + assert_eq!( + bounds.min, + smallvec![Value::Int(5), Value::Int(5), Value::Int(5)] as SmallVec<[Value; 4]> + ); + assert_eq!( + bounds.max, + smallvec![Value::Int(10), Value::Int(10), Value::Int(10)] as SmallVec<[Value; 4]> + ); } #[test] fn test_rectangle_expand() { let mut rect1 = Rectangle { - min: smallvec![0, 0], - max: smallvec![5, 5], + min: smallvec![Value::Int(0), Value::Int(0)], + max: smallvec![Value::Int(5), Value::Int(5)], }; let rect2 = Rectangle { - min: smallvec![-1, -1], - max: smallvec![3, 6], + min: smallvec![Value::Int(-1), Value::Int(-1)], + max: smallvec![Value::Int(3), Value::Int(6)], }; rect1.expand(&rect2); - assert_eq!(rect1.min, smallvec![-1, -1] as SmallVec<[i32; 4]>); - assert_eq!(rect1.max, smallvec![5, 6] as SmallVec<[i32; 4]>); + assert_eq!( + rect1.min, + smallvec![Value::Int(-1), Value::Int(-1)] as SmallVec<[Value; 4]> + ); + assert_eq!( + rect1.max, + smallvec![Value::Int(5), Value::Int(6)] as SmallVec<[Value; 4]> + ); } #[test] fn test_rectangle_expand_no_change() { let mut rect1 = Rectangle { - min: smallvec![0, 0], - max: smallvec![10, 10], + min: smallvec![Value::Int(0), Value::Int(0)], + max: smallvec![Value::Int(10), Value::Int(10)], }; let rect2 = Rectangle { - min: smallvec![2, 2], - max: smallvec![8, 8], + min: smallvec![Value::Int(2), Value::Int(2)], + max: smallvec![Value::Int(8), Value::Int(8)], }; rect1.expand(&rect2); - assert_eq!(rect1.min, smallvec![0, 0] as SmallVec<[i32; 4]>); - assert_eq!(rect1.max, smallvec![10, 10] as SmallVec<[i32; 4]>); + assert_eq!( + rect1.min, + smallvec![Value::Int(0), Value::Int(0)] as SmallVec<[Value; 4]> + ); + assert_eq!( + rect1.max, + smallvec![Value::Int(10), Value::Int(10)] as SmallVec<[Value; 4]> + ); } #[test] fn test_rectangle_expand_single_dimension() { let mut rect1 = Rectangle { - min: smallvec![5], - max: smallvec![10], + min: smallvec![Value::Int(5)], + max: smallvec![Value::Int(10)], }; let rect2 = Rectangle { - min: smallvec![3], - max: smallvec![12], + min: smallvec![Value::Int(3)], + max: smallvec![Value::Int(12)], }; rect1.expand(&rect2); - assert_eq!(rect1.min, smallvec![3] as SmallVec<[i32; 4]>); - assert_eq!(rect1.max, smallvec![12] as SmallVec<[i32; 4]>); + assert_eq!(rect1.min, smallvec![Value::Int(3)] as SmallVec<[Value; 4]>); + assert_eq!(rect1.max, smallvec![Value::Int(12)] as SmallVec<[Value; 4]>); } #[test] fn test_rectangle_expand_empty() { - let mut rect1: Rectangle = Rectangle { + let mut rect1: Rectangle = Rectangle { min: smallvec![], max: smallvec![], }; @@ -343,7 +401,7 @@ mod tests { rect1.expand(&rect2); - assert_eq!(rect1.min, smallvec![] as SmallVec<[i32; 4]>); - assert_eq!(rect1.max, smallvec![] as SmallVec<[i32; 4]>); + assert_eq!(rect1.min, smallvec![] as SmallVec<[Value; 4]>); + assert_eq!(rect1.max, smallvec![] as SmallVec<[Value; 4]>); } } From 7725fdea5c0c3be32ac2f8d57f4d62a8e752356a Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Fri, 25 Oct 2024 08:30:27 +0200 Subject: [PATCH 13/32] fix partition column type --- datafusion_iceberg/src/table.rs | 13 ++++++++++--- iceberg-rust-spec/src/spec/manifest_list.rs | 8 ++++---- iceberg-rust-spec/src/spec/partition.rs | 9 ++++++--- iceberg-rust/src/util/mod.rs | 2 +- 4 files changed, 21 insertions(+), 11 deletions(-) diff --git a/datafusion_iceberg/src/table.rs b/datafusion_iceberg/src/table.rs index 17cd6d6e..47a2ca7f 100644 --- a/datafusion_iceberg/src/table.rs +++ b/datafusion_iceberg/src/table.rs @@ -920,7 +920,13 @@ mod tests { "INSERT INTO orders (id, customer_id, product_id, date, amount) VALUES (7, 1, 3, '2020-01-03', 1), (8, 2, 1, '2020-01-03', 2), - (9, 2, 2, '2020-01-03', 1);", + (9, 2, 2, '2020-01-03', 1), + (10, 1, 2, '2020-01-04', 3), + (11, 3, 1, '2020-01-04', 2), + (12, 2, 3, '2020-01-04', 1), + (13, 1, 1, '2020-01-05', 4), + (14, 3, 2, '2020-01-05', 2), + (15, 2, 3, '2020-01-05', 3);", ) .await .expect("Failed to create query plan for insert") @@ -952,8 +958,9 @@ mod tests { ); for (product_id, amount) in product_ids.iter().zip(amounts) { match product_id.unwrap() { - 1 => assert_eq!(amount.unwrap(), 3), - 2 | 3 => assert_eq!(amount.unwrap(), 1), + 1 => assert_eq!(amount.unwrap(), 7), + 2 => assert_eq!(amount.unwrap(), 4), + 3 => assert_eq!(amount.unwrap(), 1), _ => panic!("Unexpected order id"), } } diff --git a/iceberg-rust-spec/src/spec/manifest_list.rs b/iceberg-rust-spec/src/spec/manifest_list.rs index 9247e5c2..ca5beab1 100644 --- a/iceberg-rust-spec/src/spec/manifest_list.rs +++ b/iceberg-rust-spec/src/spec/manifest_list.rs @@ -773,8 +773,8 @@ mod tests { partitions: Some(vec![FieldSummary { contains_null: true, contains_nan: Some(false), - lower_bound: Some(Value::Date(1234)), - upper_bound: Some(Value::Date(76890)), + lower_bound: Some(Value::Int(1234)), + upper_bound: Some(Value::Int(76890)), }]), key_metadata: None, }; @@ -853,8 +853,8 @@ mod tests { partitions: Some(vec![FieldSummary { contains_null: true, contains_nan: Some(false), - lower_bound: Some(Value::Date(1234)), - upper_bound: Some(Value::Date(76890)), + lower_bound: Some(Value::Int(1234)), + upper_bound: Some(Value::Int(76890)), }]), key_metadata: None, }; diff --git a/iceberg-rust-spec/src/spec/partition.rs b/iceberg-rust-spec/src/spec/partition.rs index 0219b9a4..0ea04db0 100644 --- a/iceberg-rust-spec/src/spec/partition.rs +++ b/iceberg-rust-spec/src/spec/partition.rs @@ -183,10 +183,13 @@ impl PartitionSpec { .map(|field| { schema .get(field.source_id as usize) - .map(|x| x.field_type.clone()) + .ok_or(Error::NotFound( + "Partition field".to_owned(), + field.name.clone(), + )) + .and_then(|x| x.field_type.clone().tranform(&field.transform)) }) - .collect::>>() - .ok_or(Error::InvalidFormat("partition spec".to_string())) + .collect::, Error>>() } } diff --git a/iceberg-rust/src/util/mod.rs b/iceberg-rust/src/util/mod.rs index 18154428..24cc50a8 100644 --- a/iceberg-rust/src/util/mod.rs +++ b/iceberg-rust/src/util/mod.rs @@ -10,7 +10,7 @@ use crate::error::Error; type Vec4 = SmallVec<[T; 4]>; -#[derive(Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq)] pub(crate) struct Rectangle { pub min: Vec4, pub max: Vec4, From 037b9824682e54ae61fa705116decaa001d51b17 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Fri, 25 Oct 2024 08:38:03 +0200 Subject: [PATCH 14/32] fix public function --- iceberg-rust/src/table/transaction/append.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iceberg-rust/src/table/transaction/append.rs b/iceberg-rust/src/table/transaction/append.rs index 3e275186..49eb4486 100644 --- a/iceberg-rust/src/table/transaction/append.rs +++ b/iceberg-rust/src/table/transaction/append.rs @@ -10,7 +10,7 @@ use crate::{ /// Split sets of datafiles depending on their partition_values #[allow(clippy::type_complexity)] -pub(crate) fn split_datafiles_once( +pub fn split_datafiles_once( files: impl Iterator>, rect: Rectangle, names: &[&str], From 98d497de92bcdd449e05c7b57e9c8018203b8ade Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Fri, 25 Oct 2024 08:38:11 +0200 Subject: [PATCH 15/32] fix identity --- iceberg-rust/src/util/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/iceberg-rust/src/util/mod.rs b/iceberg-rust/src/util/mod.rs index 24cc50a8..ad9eb982 100644 --- a/iceberg-rust/src/util/mod.rs +++ b/iceberg-rust/src/util/mod.rs @@ -1,4 +1,4 @@ -use std::{cmp::Ordering, convert::identity}; +use std::cmp::Ordering; use iceberg_rust_spec::{ manifest_list::FieldSummary, @@ -70,7 +70,7 @@ impl Rectangle { pub(crate) fn struct_to_smallvec(s: &Struct, names: &[&str]) -> Result, Error> { names .iter() - .map(|x| s.get(x).and_then(|x| identity(x).clone())) + .map(|x| s.get(x).and_then(Clone::clone)) .collect::>>() .ok_or(Error::InvalidFormat("Partition struct".to_owned())) } From 5f07a9a02faa230bc7a3b7c1c6bd161d388f7502 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Fri, 25 Oct 2024 08:46:35 +0200 Subject: [PATCH 16/32] rename struct_to_smallvec --- iceberg-rust/src/table/transaction/append.rs | 4 ++-- iceberg-rust/src/table/transaction/operation.rs | 6 +++--- iceberg-rust/src/util/mod.rs | 3 ++- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/iceberg-rust/src/table/transaction/append.rs b/iceberg-rust/src/table/transaction/append.rs index 49eb4486..86996961 100644 --- a/iceberg-rust/src/table/transaction/append.rs +++ b/iceberg-rust/src/table/transaction/append.rs @@ -5,7 +5,7 @@ use smallvec::{smallvec, SmallVec}; use crate::{ error::Error, - util::{cmp_with_priority, struct_to_smallvec, sub, Rectangle}, + util::{cmp_with_priority, partition_struct_to_vec, sub, Rectangle}, }; /// Split sets of datafiles depending on their partition_values @@ -22,7 +22,7 @@ pub fn split_datafiles_once( for manifest_entry in files { let manifest_entry = manifest_entry?; - let position = struct_to_smallvec(manifest_entry.data_file().partition(), names)?; + let position = partition_struct_to_vec(manifest_entry.data_file().partition(), names)?; // Check distance to upper and lower bound if let Ordering::Greater = cmp_with_priority(&sub(&position, &rect.min)?, &sub(&rect.max, &position)?)? diff --git a/iceberg-rust/src/table/transaction/operation.rs b/iceberg-rust/src/table/transaction/operation.rs index 4fb0fcd1..f7f81aba 100644 --- a/iceberg-rust/src/table/transaction/operation.rs +++ b/iceberg-rust/src/table/transaction/operation.rs @@ -29,7 +29,7 @@ use smallvec::SmallVec; use crate::{ catalog::commit::{TableRequirement, TableUpdate}, error::Error, - util::{struct_to_smallvec, summary_to_rectangle, Rectangle}, + util::{partition_struct_to_vec, summary_to_rectangle, Rectangle}, }; use super::append::split_datafiles; @@ -110,7 +110,7 @@ impl Operation { let bounding_partition_values = files .iter() .try_fold(None, |acc, x| { - let node = struct_to_smallvec(x.partition(), &partition_column_names)?; + let node = partition_struct_to_vec(x.partition(), &partition_column_names)?; let Some(mut acc) = acc else { return Ok::<_, Error>(Some(Rectangle::new(node.clone(), node))); }; @@ -560,7 +560,7 @@ impl Operation { let bounding_partition_values = files .iter() .try_fold(None, |acc, x| { - let node = struct_to_smallvec(x.partition(), &partition_column_names)?; + let node = partition_struct_to_vec(x.partition(), &partition_column_names)?; let Some(mut acc) = acc else { return Ok::<_, Error>(Some(Rectangle::new(node.clone(), node))); }; diff --git a/iceberg-rust/src/util/mod.rs b/iceberg-rust/src/util/mod.rs index ad9eb982..eb450d0d 100644 --- a/iceberg-rust/src/util/mod.rs +++ b/iceberg-rust/src/util/mod.rs @@ -67,7 +67,8 @@ impl Rectangle { } } -pub(crate) fn struct_to_smallvec(s: &Struct, names: &[&str]) -> Result, Error> { +/// Converts the values of a partition struct into a vector in the order that the columns appear in the partition spec +pub(crate) fn partition_struct_to_vec(s: &Struct, names: &[&str]) -> Result, Error> { names .iter() .map(|x| s.get(x).and_then(Clone::clone)) From 023d547664f20742e003735292e79ca0f3c52aeb Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Fri, 25 Oct 2024 09:00:54 +0200 Subject: [PATCH 17/32] add documentation --- iceberg-rust/src/table/transaction/append.rs | 9 ++++--- iceberg-rust/src/util/mod.rs | 27 +++++++++++++++----- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/iceberg-rust/src/table/transaction/append.rs b/iceberg-rust/src/table/transaction/append.rs index 86996961..beb13ae7 100644 --- a/iceberg-rust/src/table/transaction/append.rs +++ b/iceberg-rust/src/table/transaction/append.rs @@ -5,7 +5,7 @@ use smallvec::{smallvec, SmallVec}; use crate::{ error::Error, - util::{cmp_with_priority, partition_struct_to_vec, sub, Rectangle}, + util::{cmp_with_priority, partition_struct_to_vec, try_sub, Rectangle}, }; /// Split sets of datafiles depending on their partition_values @@ -24,9 +24,10 @@ pub fn split_datafiles_once( let manifest_entry = manifest_entry?; let position = partition_struct_to_vec(manifest_entry.data_file().partition(), names)?; // Check distance to upper and lower bound - if let Ordering::Greater = - cmp_with_priority(&sub(&position, &rect.min)?, &sub(&rect.max, &position)?)? - { + if let Ordering::Greater = cmp_with_priority( + &try_sub(&position, &rect.min)?, + &try_sub(&rect.max, &position)?, + )? { // if closer to upper bound larger.push(manifest_entry); diff --git a/iceberg-rust/src/util/mod.rs b/iceberg-rust/src/util/mod.rs index eb450d0d..39af86d0 100644 --- a/iceberg-rust/src/util/mod.rs +++ b/iceberg-rust/src/util/mod.rs @@ -21,6 +21,10 @@ impl Rectangle { Self { min, max } } + /// Expands the rectangle to include the given rectangle. + /// + /// This method updates the minimum and maximum values of the rectangle to include + /// the values in the given `rect` rectangle. pub(crate) fn expand(&mut self, rect: &Rectangle) { for i in 0..self.min.len() { if rect.min[i] < self.min[i] { @@ -32,6 +36,10 @@ impl Rectangle { } } + /// Expands the rectangle to include the given node. + /// + /// This method updates the minimum and maximum values of the rectangle to include + /// the values in the given `node` vector. pub(crate) fn expand_with_node(&mut self, node: Vec4) { for i in 0..self.min.len() { if node[i] < self.min[i] { @@ -43,7 +51,7 @@ impl Rectangle { } } - /// Determine of one rectangle is larger than the other + /// Determine of one rectangle is larger than the other. Values the earlier columns more than the later. pub(crate) fn cmp_with_priority(&self, other: &Rectangle) -> Result { let self_iter = self .max @@ -68,10 +76,13 @@ impl Rectangle { } /// Converts the values of a partition struct into a vector in the order that the columns appear in the partition spec -pub(crate) fn partition_struct_to_vec(s: &Struct, names: &[&str]) -> Result, Error> { +pub(crate) fn partition_struct_to_vec( + partition_struct: &Struct, + names: &[&str], +) -> Result, Error> { names .iter() - .map(|x| s.get(x).and_then(Clone::clone)) + .map(|x| partition_struct.get(x).and_then(Clone::clone)) .collect::>>() .ok_or(Error::InvalidFormat("Partition struct".to_owned())) } @@ -98,6 +109,7 @@ pub(crate) fn summary_to_rectangle(summaries: &[FieldSummary]) -> Result Result { for (own, other) in left.iter().zip(right.iter()) { let ordering = own @@ -110,7 +122,8 @@ pub(crate) fn cmp_with_priority(left: &[Value], right: &[Value]) -> Result Result, Error> { +/// Try to subtract a value vector from another +pub(crate) fn try_sub(left: &[Value], right: &[Value]) -> Result, Error> { let mut v = SmallVec::with_capacity(left.len()); for i in 0..left.len() { v.push(left[i].try_sub(&right[i])?); @@ -129,7 +142,7 @@ mod tests { fn test_sub_valid() { let left = vec![Value::Int(5), Value::Int(10), Value::Int(15)]; let right = vec![Value::Int(2), Value::Int(3), Value::Int(5)]; - let result = sub(&left, &right).unwrap(); + let result = try_sub(&left, &right).unwrap(); assert_eq!(result.len(), 3); assert_eq!(result[0], Value::Int(3)); assert_eq!(result[1], Value::Int(7)); @@ -140,7 +153,7 @@ mod tests { fn test_sub_empty() { let left: Vec = vec![]; let right: Vec = vec![]; - let result = sub(&left, &right).unwrap(); + let result = try_sub(&left, &right).unwrap(); assert_eq!(result.len(), 0); } @@ -148,7 +161,7 @@ mod tests { fn test_sub_same_numbers() { let left = vec![Value::Int(5), Value::Int(5)]; let right = vec![Value::Int(5), Value::Int(5)]; - let result = sub(&left, &right).unwrap(); + let result = try_sub(&left, &right).unwrap(); assert_eq!(result.len(), 2); assert_eq!(result[0], Value::Int(0)); assert_eq!(result[1], Value::Int(0)); From e050686daee3c9fa9838efc6fe79a2d9639d364b Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Fri, 25 Oct 2024 09:59:47 +0200 Subject: [PATCH 18/32] implement sub for string --- iceberg-rust-spec/src/spec/values.rs | 30 +++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/iceberg-rust-spec/src/spec/values.rs b/iceberg-rust-spec/src/spec/values.rs index d95ce6be..c9271564 100644 --- a/iceberg-rust-spec/src/spec/values.rs +++ b/iceberg-rust-spec/src/spec/values.rs @@ -6,7 +6,7 @@ use std::{ any::Any, collections::{btree_map::Keys, BTreeMap, HashMap}, fmt, - hash::Hash, + hash::{DefaultHasher, Hash, Hasher}, io::Cursor, ops::{Add, Sub}, slice::Iter, @@ -882,6 +882,9 @@ impl TrySub for Value { (Value::TimestampTZ(own), Value::TimestampTZ(other)) => { Ok(Value::TimestampTZ(own - other)) } + (Value::String(own), Value::String(other)) => { + Ok(Value::LongInt(sub_string(&own, &other) as i64)) + } (x, y) => Err(Error::Type( x.datatype().to_string(), y.datatype().to_string(), @@ -890,6 +893,31 @@ impl TrySub for Value { } } +fn sub_string(left: &str, right: &str) -> u64 { + if let Some(distance) = left + .chars() + .zip(right.chars()) + .take(256) + .skip_while(|(l, r)| l == r) + .try_fold(0, |acc, (l, r)| { + if let (Some(l), Some(r)) = (l.to_digit(36), r.to_digit(36)) { + Some(acc + (l - r).pow(2)) + } else { + None + } + }) + { + distance as u64 + } else { + let mut hasher = DefaultHasher::new(); + hasher.write(left.as_bytes()); + let left = hasher.finish(); + hasher.write(right.as_bytes()); + let right = hasher.finish(); + left - right + } +} + #[cfg(test)] mod tests { From dc7a29d508142f2181f91d8f0b371b2b1c4aedba Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Fri, 25 Oct 2024 10:02:13 +0200 Subject: [PATCH 19/32] clippy fixes --- iceberg-rust-spec/src/spec/values.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/iceberg-rust-spec/src/spec/values.rs b/iceberg-rust-spec/src/spec/values.rs index c9271564..0fb59c37 100644 --- a/iceberg-rust-spec/src/spec/values.rs +++ b/iceberg-rust-spec/src/spec/values.rs @@ -280,7 +280,7 @@ impl Hash for Struct { fn hash(&self, state: &mut H) { for key in self.keys().sorted() { key.hash(state); - self.get(&key).hash(state); + self.get(key).hash(state); } } } @@ -883,7 +883,7 @@ impl TrySub for Value { Ok(Value::TimestampTZ(own - other)) } (Value::String(own), Value::String(other)) => { - Ok(Value::LongInt(sub_string(&own, &other) as i64)) + Ok(Value::LongInt(sub_string(own, other) as i64)) } (x, y) => Err(Error::Type( x.datatype().to_string(), From e715e2cb4f55d1d2db98085e56c79b4aae7b383f Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Sun, 27 Oct 2024 06:46:34 +0100 Subject: [PATCH 20/32] add documentation --- iceberg-rust/src/table/transaction/append.rs | 3 ++- iceberg-rust/src/table/transaction/operation.rs | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/iceberg-rust/src/table/transaction/append.rs b/iceberg-rust/src/table/transaction/append.rs index beb13ae7..c7b661b7 100644 --- a/iceberg-rust/src/table/transaction/append.rs +++ b/iceberg-rust/src/table/transaction/append.rs @@ -59,10 +59,11 @@ pub fn split_datafiles_once( ]) } +/// Splits the datafiles *n_split* times to decrease the number of datafiles per maniefst. 1 split returns 2 outputs vectors, 2 splits return 4, 3 splits return 8 and so on. pub(crate) fn split_datafiles( files: impl Iterator>, rect: Rectangle, - names: &SmallVec<[&str; 4]>, + names: &[&str], n_split: u32, ) -> Result; 2]>, Error> { let [(smaller, smaller_rect), (larger, larger_rect)] = diff --git a/iceberg-rust/src/table/transaction/operation.rs b/iceberg-rust/src/table/transaction/operation.rs index f7f81aba..acb10776 100644 --- a/iceberg-rust/src/table/transaction/operation.rs +++ b/iceberg-rust/src/table/transaction/operation.rs @@ -226,7 +226,9 @@ impl Operation { .unwrap_or(0) as usize + files.len(); - // How many times do the files need to be split to give at most *limit* files per manifest + // To achieve fast lookups of the datafiles, the maniest tree should be somewhat balanced, meaning that manifest files should contain a similar number of datafiles. + // This means that maniest files might need to be split up when they get too large. Since the number of datafiles being added by a append operation might be really large, + // it might even be required to split the manifest file multiple times. *N_splits* stores how many times a manifest file needs to be split to give at most *limit* datafiles per manifest let n_splits = match new_file_count / limit { 0 => 0, x => x.ilog2() + 1, From 49cb0367f65af145b04ee87852957b17dfade691 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Mon, 28 Oct 2024 11:30:43 +0100 Subject: [PATCH 21/32] expand documentation --- iceberg-rust/src/table/transaction/append.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/iceberg-rust/src/table/transaction/append.rs b/iceberg-rust/src/table/transaction/append.rs index c7b661b7..6e8f09b6 100644 --- a/iceberg-rust/src/table/transaction/append.rs +++ b/iceberg-rust/src/table/transaction/append.rs @@ -23,7 +23,8 @@ pub fn split_datafiles_once( for manifest_entry in files { let manifest_entry = manifest_entry?; let position = partition_struct_to_vec(manifest_entry.data_file().partition(), names)?; - // Check distance to upper and lower bound + // Compare distance to upper and lower bound. Since you can't compute a "norm" for a multidimensional vector where the dimensions have different datatypes, + // the dimensions are compared individually and the norm is computed by weighing the earlier columns more than the later. if let Ordering::Greater = cmp_with_priority( &try_sub(&position, &rect.min)?, &try_sub(&rect.max, &position)?, From 12af6d79a3795a33a9ad4b4e8f20387ef57676bb Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Tue, 29 Oct 2024 15:48:35 +0100 Subject: [PATCH 22/32] remove public from split_datafiles_once --- iceberg-rust/src/table/transaction/append.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iceberg-rust/src/table/transaction/append.rs b/iceberg-rust/src/table/transaction/append.rs index 6e8f09b6..0a538b4b 100644 --- a/iceberg-rust/src/table/transaction/append.rs +++ b/iceberg-rust/src/table/transaction/append.rs @@ -10,7 +10,7 @@ use crate::{ /// Split sets of datafiles depending on their partition_values #[allow(clippy::type_complexity)] -pub fn split_datafiles_once( +fn split_datafiles_once( files: impl Iterator>, rect: Rectangle, names: &[&str], From 76de2f8a1239c9e5ed86d58169c2ab96d05ac502 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Tue, 29 Oct 2024 15:51:05 +0100 Subject: [PATCH 23/32] use normal vec for split_datafiles --- iceberg-rust/src/table/transaction/append.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/iceberg-rust/src/table/transaction/append.rs b/iceberg-rust/src/table/transaction/append.rs index 0a538b4b..3b473654 100644 --- a/iceberg-rust/src/table/transaction/append.rs +++ b/iceberg-rust/src/table/transaction/append.rs @@ -1,7 +1,6 @@ use std::cmp::Ordering; use iceberg_rust_spec::manifest::ManifestEntry; -use smallvec::{smallvec, SmallVec}; use crate::{ error::Error, @@ -60,17 +59,17 @@ fn split_datafiles_once( ]) } -/// Splits the datafiles *n_split* times to decrease the number of datafiles per maniefst. 1 split returns 2 outputs vectors, 2 splits return 4, 3 splits return 8 and so on. +/// Splits the datafiles *n_split* times to decrease the number of datafiles per maniefst. Returns *2^n_splits* lists of manifest entries. pub(crate) fn split_datafiles( files: impl Iterator>, rect: Rectangle, names: &[&str], n_split: u32, -) -> Result; 2]>, Error> { +) -> Result>, Error> { let [(smaller, smaller_rect), (larger, larger_rect)] = split_datafiles_once(files, rect, names)?; if n_split == 1 { - Ok(smallvec![smaller, larger]) + Ok(vec![smaller, larger]) } else { let mut smaller = split_datafiles( smaller.into_iter().map(Ok), From 93b768d19d41aa518e35c559338df00550d44df0 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Tue, 29 Oct 2024 15:52:21 +0100 Subject: [PATCH 24/32] remove tryadd --- iceberg-rust-spec/src/spec/values.rs | 30 ---------------------------- 1 file changed, 30 deletions(-) diff --git a/iceberg-rust-spec/src/spec/values.rs b/iceberg-rust-spec/src/spec/values.rs index 0fb59c37..42594e6a 100644 --- a/iceberg-rust-spec/src/spec/values.rs +++ b/iceberg-rust-spec/src/spec/values.rs @@ -829,46 +829,16 @@ mod datetime { } } -pub trait TryAdd: Sized { - fn try_add(&self, other: &Self) -> Result; -} pub trait TrySub: Sized { fn try_sub(&self, other: &Self) -> Result; } -impl + Copy> TryAdd for T { - fn try_add(&self, other: &Self) -> Result { - Ok(*self + *other) - } -} - impl + Copy> TrySub for T { fn try_sub(&self, other: &Self) -> Result { Ok(*self - *other) } } -impl TryAdd for Value { - fn try_add(&self, other: &Self) -> Result { - match (self, other) { - (Value::Int(own), Value::Int(other)) => Ok(Value::Int(own + other)), - (Value::LongInt(own), Value::LongInt(other)) => Ok(Value::LongInt(own + other)), - (Value::Float(own), Value::Float(other)) => Ok(Value::Float(*own + *other)), - (Value::Double(own), Value::Double(other)) => Ok(Value::Double(*own + *other)), - (Value::Date(own), Value::Date(other)) => Ok(Value::Date(own + other)), - (Value::Time(own), Value::Time(other)) => Ok(Value::Time(own + other)), - (Value::Timestamp(own), Value::Timestamp(other)) => Ok(Value::Timestamp(own + other)), - (Value::TimestampTZ(own), Value::TimestampTZ(other)) => { - Ok(Value::TimestampTZ(own + other)) - } - (x, y) => Err(Error::Type( - x.datatype().to_string(), - y.datatype().to_string(), - )), - } - } -} - impl TrySub for Value { fn try_sub(&self, other: &Self) -> Result { match (self, other) { From e677ec6be1daae3f2fe1d712991ffe13ab25a018 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Tue, 29 Oct 2024 16:10:47 +0100 Subject: [PATCH 25/32] implement trysub for uuid and fixed --- iceberg-rust-spec/src/spec/values.rs | 30 +++++++++++++++++++++++++++- iceberg-rust/src/util/mod.rs | 4 +++- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/iceberg-rust-spec/src/spec/values.rs b/iceberg-rust-spec/src/spec/values.rs index 42594e6a..8464266e 100644 --- a/iceberg-rust-spec/src/spec/values.rs +++ b/iceberg-rust-spec/src/spec/values.rs @@ -2,13 +2,14 @@ * Value in iceberg */ +use core::panic; use std::{ any::Any, collections::{btree_map::Keys, BTreeMap, HashMap}, fmt, hash::{DefaultHasher, Hash, Hasher}, io::Cursor, - ops::{Add, Sub}, + ops::Sub, slice::Iter, }; @@ -855,6 +856,33 @@ impl TrySub for Value { (Value::String(own), Value::String(other)) => { Ok(Value::LongInt(sub_string(own, other) as i64)) } + (Value::UUID(own), Value::UUID(other)) => { + let (own1, own2, own3, own4) = own.to_fields_le(); + let (other1, other2, other3, other4) = other.to_fields_le(); + let mut sub4 = [0; 8]; + for i in 0..own4.len() { + sub4[i] = own4[i] - other4[i]; + } + Ok(Value::UUID(Uuid::from_fields_le( + own1 - other1, + own2 - other2, + own3 - other3, + &sub4, + ))) + } + (Value::Fixed(own_size, own), Value::Fixed(other_size, other)) => Ok(Value::Fixed( + if own_size <= other_size { + *own_size + } else if own_size > other_size { + *other_size + } else { + panic!("Size must be either smaller, equal or larger"); + }, + own.iter() + .zip(other.iter()) + .map(|(own, other)| own - other) + .collect(), + )), (x, y) => Err(Error::Type( x.datatype().to_string(), y.datatype().to_string(), diff --git a/iceberg-rust/src/util/mod.rs b/iceberg-rust/src/util/mod.rs index 39af86d0..034217e9 100644 --- a/iceberg-rust/src/util/mod.rs +++ b/iceberg-rust/src/util/mod.rs @@ -51,7 +51,9 @@ impl Rectangle { } } - /// Determine of one rectangle is larger than the other. Values the earlier columns more than the later. + /// Determine if one rectangle is larger than the other. + /// + ///Values the earlier columns more than the later. pub(crate) fn cmp_with_priority(&self, other: &Rectangle) -> Result { let self_iter = self .max From 97927a56936378cf43ed36e813a5d69996713e31 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Tue, 29 Oct 2024 16:52:32 +0100 Subject: [PATCH 26/32] add assert for splitting manifest --- datafusion_iceberg/src/table.rs | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/datafusion_iceberg/src/table.rs b/datafusion_iceberg/src/table.rs index 47a2ca7f..d98bbd88 100644 --- a/datafusion_iceberg/src/table.rs +++ b/datafusion_iceberg/src/table.rs @@ -597,10 +597,13 @@ impl DataSink for IcebergDataSink { mod tests { use datafusion::{arrow::array::Int64Array, prelude::SessionContext}; - use iceberg_rust::spec::{ - partition::{PartitionField, Transform}, - schema::Schema, - types::{PrimitiveType, StructField, StructType, Type}, + use iceberg_rust::{ + catalog::tabular::Tabular, + spec::{ + partition::{PartitionField, Transform}, + schema::Schema, + types::{PrimitiveType, StructField, StructType, Type}, + }, }; use iceberg_rust::{ catalog::Catalog, @@ -613,7 +616,7 @@ mod tests { }; use iceberg_sql_catalog::SqlCatalog; use object_store::{memory::InMemory, ObjectStore}; - use std::sync::Arc; + use std::{ops::Deref, sync::Arc}; use crate::{catalog::catalog::IcebergCatalog, DataFusionTable}; @@ -863,7 +866,7 @@ mod tests { let ctx = SessionContext::new(); - ctx.register_table("orders", table).unwrap(); + ctx.register_table("orders", table.clone()).unwrap(); ctx.sql( "INSERT INTO orders (id, customer_id, product_id, date, amount) VALUES @@ -966,6 +969,13 @@ mod tests { } } } + + match table.tabular.read().await.deref() { + Tabular::Table(table) => { + assert_eq!(table.manifests(None, None).await.unwrap().len(), 2); + } + _ => (), + }; } #[tokio::test] From b5ca1695a5a5765b7ec17c29e27e85146aea5ce0 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Tue, 29 Oct 2024 17:43:22 +0100 Subject: [PATCH 27/32] select manifest --- iceberg-rust/src/table/transaction/append.rs | 81 ++++++++++++++++++- .../src/table/transaction/operation.rs | 79 +++--------------- 2 files changed, 89 insertions(+), 71 deletions(-) diff --git a/iceberg-rust/src/table/transaction/append.rs b/iceberg-rust/src/table/transaction/append.rs index 3b473654..a23456fc 100644 --- a/iceberg-rust/src/table/transaction/append.rs +++ b/iceberg-rust/src/table/transaction/append.rs @@ -1,10 +1,14 @@ use std::cmp::Ordering; -use iceberg_rust_spec::manifest::ManifestEntry; +use iceberg_rust_spec::{ + manifest::ManifestEntry, + manifest_list::{ManifestListEntry, ManifestListReader}, +}; +use smallvec::SmallVec; use crate::{ error::Error, - util::{cmp_with_priority, partition_struct_to_vec, try_sub, Rectangle}, + util::{cmp_with_priority, partition_struct_to_vec, summary_to_rectangle, try_sub, Rectangle}, }; /// Split sets of datafiles depending on their partition_values @@ -84,3 +88,76 @@ pub(crate) fn split_datafiles( Ok(smaller) } } + +/// Select the manifest that yields the smallest bounding rectangle after the bounding rectangle of the new values has been added. +pub(crate) fn select_manifest( + partition_column_names: &SmallVec<[&str; 4]>, + mut manifest_list_reader: ManifestListReader<&[u8]>, + file_count: &mut usize, + manifest_list_writer: &mut apache_avro::Writer>, + bounding_partition_values: &Rectangle, +) -> Result { + let manifest = + if partition_column_names.is_empty() { + // Find the manifest with the lowest row count + manifest_list_reader + .try_fold(None, |acc, x| { + let manifest = x?; + + let row_count = manifest.added_rows_count; + + *file_count += manifest.added_files_count.unwrap_or(0) as usize; + + let Some((old_row_count, old_manifest)) = acc else { + return Ok::<_, Error>(Some((row_count, manifest))); + }; + + let Some(row_count) = row_count else { + return Ok(Some((old_row_count, old_manifest))); + }; + + if old_row_count.is_none() || old_row_count.is_some_and(|x| x > row_count) { + manifest_list_writer.append_ser(old_manifest)?; + Ok(Some((Some(row_count), manifest))) + } else { + manifest_list_writer.append_ser(manifest)?; + Ok(Some((old_row_count, old_manifest))) + } + })? + .ok_or(Error::NotFound("Manifest".to_owned(), "file".to_owned()))? + .1 + } else { + // Find the manifest with the smallest bounding partition values + manifest_list_reader + .try_fold(None, |acc, x| { + let manifest = x?; + + let mut bounds = + summary_to_rectangle(manifest.partitions.as_ref().ok_or( + Error::NotFound("Partition".to_owned(), "struct".to_owned()), + )?)?; + + bounds.expand(bounding_partition_values); + + *file_count += manifest.added_files_count.unwrap_or(0) as usize; + + let Some((old_bounds, old_manifest)) = acc else { + return Ok::<_, Error>(Some((bounds, manifest))); + }; + + match old_bounds.cmp_with_priority(&bounds)? { + Ordering::Greater => { + manifest_list_writer.append_ser(old_manifest)?; + Ok(Some((bounds, manifest))) + } + _ => { + manifest_list_writer.append_ser(manifest)?; + Ok(Some((old_bounds, old_manifest))) + } + } + })? + .ok_or(Error::NotFound("Manifest".to_owned(), "file".to_owned()))? + .1 + }; + Ok(manifest) +} diff --git a/iceberg-rust/src/table/transaction/operation.rs b/iceberg-rust/src/table/transaction/operation.rs index acb10776..550334b1 100644 --- a/iceberg-rust/src/table/transaction/operation.rs +++ b/iceberg-rust/src/table/transaction/operation.rs @@ -2,7 +2,7 @@ * Defines the different [Operation]s on a [Table]. */ -use std::{cmp::Ordering, collections::HashMap, sync::Arc}; +use std::{collections::HashMap, sync::Arc}; use iceberg_rust_spec::manifest_list::{ manifest_list_schema_v1, manifest_list_schema_v2, ManifestListReader, @@ -32,7 +32,7 @@ use crate::{ util::{partition_struct_to_vec, summary_to_rectangle, Rectangle}, }; -use super::append::split_datafiles; +use super::append::{select_manifest, split_datafiles}; static MIN_DATAFILES: usize = 4; @@ -143,75 +143,16 @@ impl Operation { .bytes() .await?; - let mut manifest_list_reader = + let manifest_list_reader = ManifestListReader::new(old_manifest_list_bytes.as_ref(), table_metadata)?; - // Check if table is partitioned - let manifest = if partition_column_names.is_empty() { - // Find the manifest with the lowest row count - manifest_list_reader - .try_fold(None, |acc, x| { - let manifest = x?; - - let row_count = manifest.added_rows_count; - - file_count += manifest.added_files_count.unwrap_or(0) as usize; - - let Some((old_row_count, old_manifest)) = acc else { - return Ok::<_, Error>(Some((row_count, manifest))); - }; - - let Some(row_count) = row_count else { - return Ok(Some((old_row_count, old_manifest))); - }; - - if old_row_count.is_none() - || old_row_count.is_some_and(|x| x > row_count) - { - manifest_list_writer.append_ser(old_manifest)?; - Ok(Some((Some(row_count), manifest))) - } else { - manifest_list_writer.append_ser(manifest)?; - Ok(Some((old_row_count, old_manifest))) - } - })? - .ok_or(Error::NotFound("Manifest".to_owned(), "file".to_owned()))? - .1 - } else { - // Find the manifest with the smallest bounding partition values - manifest_list_reader - .try_fold(None, |acc, x| { - let manifest = x?; - - let mut bounds = summary_to_rectangle( - manifest.partitions.as_ref().ok_or(Error::NotFound( - "Partition".to_owned(), - "struct".to_owned(), - ))?, - )?; - - bounds.expand(&bounding_partition_values); - - file_count += manifest.added_files_count.unwrap_or(0) as usize; - - let Some((old_bounds, old_manifest)) = acc else { - return Ok::<_, Error>(Some((bounds, manifest))); - }; - - match old_bounds.cmp_with_priority(&bounds)? { - Ordering::Greater => { - manifest_list_writer.append_ser(old_manifest)?; - Ok(Some((bounds, manifest))) - } - _ => { - manifest_list_writer.append_ser(manifest)?; - Ok(Some((old_bounds, old_manifest))) - } - } - })? - .ok_or(Error::NotFound("Manifest".to_owned(), "file".to_owned()))? - .1 - }; + let manifest = select_manifest( + &partition_column_names, + manifest_list_reader, + &mut file_count, + &mut manifest_list_writer, + &bounding_partition_values, + )?; Some(manifest) } else { // If manifest list doesn't exist, there is no manifest From 7b520902f48e6cd39281ff9ef3a76d5b4736da39 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Wed, 30 Oct 2024 09:01:02 +0100 Subject: [PATCH 28/32] create new manifest-writer --- iceberg-rust/src/table/manifest.rs | 331 +++++++++++++++++++++++++++++ iceberg-rust/src/table/mod.rs | 1 + 2 files changed, 332 insertions(+) create mode 100644 iceberg-rust/src/table/manifest.rs diff --git a/iceberg-rust/src/table/manifest.rs b/iceberg-rust/src/table/manifest.rs new file mode 100644 index 00000000..031aba96 --- /dev/null +++ b/iceberg-rust/src/table/manifest.rs @@ -0,0 +1,331 @@ +/*! + * Helpers to deal with manifest files +*/ + +use std::sync::Arc; + +use apache_avro::{Schema as AvroSchema, Writer as AvroWriter}; +use iceberg_rust_spec::{ + manifest::ManifestEntry, + manifest_list::{self, FieldSummary, ManifestListEntry}, + partition::PartitionField, + schema::{SchemaV1, SchemaV2}, + table_metadata::{FormatVersion, TableMetadata}, + util::strip_prefix, + values::{Struct, Value}, +}; +use object_store::ObjectStore; + +use crate::error::Error; + +/// A helper to write entries into a manifest +pub struct ManifestWriter<'schema, 'metadata> { + table_metadata: &'metadata TableMetadata, + manifest: ManifestListEntry, + writer: AvroWriter<'schema, Vec>, +} + +impl<'schema, 'metadata> ManifestWriter<'schema, 'metadata> { + /// Create empty manifest writer + pub fn new( + manifest_location: &str, + snapshot_id: i64, + schema: &'schema AvroSchema, + table_metadata: &'metadata TableMetadata, + branch: Option<&str>, + ) -> Result { + let mut writer = AvroWriter::new(schema, Vec::new()); + + writer.add_user_metadata( + "format-version".to_string(), + match table_metadata.format_version { + FormatVersion::V1 => "1".as_bytes(), + FormatVersion::V2 => "2".as_bytes(), + }, + )?; + + writer.add_user_metadata( + "schema".to_string(), + match table_metadata.format_version { + FormatVersion::V1 => serde_json::to_string(&Into::::into( + table_metadata.current_schema(branch)?.clone(), + ))?, + FormatVersion::V2 => serde_json::to_string(&Into::::into( + table_metadata.current_schema(branch)?.clone(), + ))?, + }, + )?; + + writer.add_user_metadata( + "schema-id".to_string(), + serde_json::to_string(&table_metadata.current_schema(branch)?.schema_id())?, + )?; + + writer.add_user_metadata( + "partition-spec".to_string(), + serde_json::to_string(&table_metadata.default_partition_spec()?.fields())?, + )?; + + writer.add_user_metadata( + "partition-spec-id".to_string(), + serde_json::to_string(&table_metadata.default_partition_spec()?.spec_id())?, + )?; + + writer.add_user_metadata("content".to_string(), "data")?; + + let manifest = ManifestListEntry { + format_version: table_metadata.format_version, + manifest_path: manifest_location.to_owned(), + manifest_length: 0, + partition_spec_id: table_metadata.default_spec_id, + content: manifest_list::Content::Data, + sequence_number: table_metadata.last_sequence_number, + min_sequence_number: 0, + added_snapshot_id: snapshot_id, + added_files_count: Some(0), + existing_files_count: Some(0), + deleted_files_count: Some(0), + added_rows_count: Some(0), + existing_rows_count: Some(0), + deleted_rows_count: Some(0), + partitions: None, + key_metadata: None, + }; + + Ok(ManifestWriter { + manifest, + writer, + table_metadata, + }) + } + + /// Create an manifest writer from an existing manifest + pub fn from_existing( + bytes: &[u8], + manifest: ManifestListEntry, + schema: &'schema AvroSchema, + table_metadata: &'metadata TableMetadata, + branch: Option<&str>, + ) -> Result { + let manifest_reader = apache_avro::Reader::new(bytes)?; + + let mut writer = AvroWriter::new(schema, Vec::new()); + + writer.add_user_metadata( + "format-version".to_string(), + match table_metadata.format_version { + FormatVersion::V1 => "1".as_bytes(), + FormatVersion::V2 => "2".as_bytes(), + }, + )?; + + writer.add_user_metadata( + "schema".to_string(), + match table_metadata.format_version { + FormatVersion::V1 => serde_json::to_string(&Into::::into( + table_metadata.current_schema(branch)?.clone(), + ))?, + FormatVersion::V2 => serde_json::to_string(&Into::::into( + table_metadata.current_schema(branch)?.clone(), + ))?, + }, + )?; + + writer.add_user_metadata( + "schema-id".to_string(), + serde_json::to_string(&table_metadata.current_schema(branch)?.schema_id())?, + )?; + + writer.add_user_metadata( + "partition-spec".to_string(), + serde_json::to_string(&table_metadata.default_partition_spec()?.fields())?, + )?; + + writer.add_user_metadata( + "partition-spec-id".to_string(), + serde_json::to_string(&table_metadata.default_partition_spec()?.spec_id())?, + )?; + + writer.add_user_metadata("content".to_string(), "data")?; + + writer.extend(manifest_reader.filter_map(Result::ok))?; + + Ok(ManifestWriter { + manifest, + writer, + table_metadata, + }) + } + + /// Add an manifest entry to the manifest + pub fn append(&mut self, manifest_entry: ManifestEntry) -> Result<(), Error> { + let mut added_rows_count = 0; + + if self.manifest.partitions.is_none() { + self.manifest.partitions = Some( + self.table_metadata + .default_partition_spec()? + .fields() + .iter() + .map(|_| FieldSummary { + contains_null: false, + contains_nan: None, + lower_bound: None, + upper_bound: None, + }) + .collect::>(), + ); + } + + added_rows_count += manifest_entry.data_file().record_count(); + update_partitions( + self.manifest.partitions.as_mut().unwrap(), + manifest_entry.data_file().partition(), + self.table_metadata.default_partition_spec()?.fields(), + )?; + + self.writer.append_ser(manifest_entry)?; + + self.manifest.added_files_count = match self.manifest.added_files_count { + Some(count) => Some(count + 1), + None => Some(1), + }; + self.manifest.added_rows_count = match self.manifest.added_rows_count { + Some(count) => Some(count + added_rows_count), + None => Some(added_rows_count), + }; + + Ok(()) + } + + /// Write the manifest to object storage and return the manifest-list-entry + pub async fn write( + mut self, + object_store: Arc, + ) -> Result { + let manifest_bytes = self.writer.into_inner()?; + + let manifest_length: i64 = manifest_bytes.len() as i64; + + self.manifest.manifest_length += manifest_length; + + object_store + .put( + &strip_prefix(&self.manifest.manifest_path).as_str().into(), + manifest_bytes.into(), + ) + .await?; + Ok(self.manifest) + } +} + +fn update_partitions( + partitions: &mut [FieldSummary], + partition_values: &Struct, + partition_columns: &[PartitionField], +) -> Result<(), Error> { + for (field, summary) in partition_columns.iter().zip(partitions.iter_mut()) { + let value = partition_values.get(field.name()).and_then(|x| x.as_ref()); + if let Some(value) = value { + if summary.lower_bound.is_none() { + summary.lower_bound = Some(value.clone()); + } else if let Some(lower_bound) = &mut summary.lower_bound { + match (value, lower_bound) { + (Value::Int(val), Value::Int(current)) => { + if *current > *val { + *current = *val + } + } + (Value::LongInt(val), Value::LongInt(current)) => { + if *current > *val { + *current = *val + } + } + (Value::Float(val), Value::Float(current)) => { + if *current > *val { + *current = *val + } + } + (Value::Double(val), Value::Double(current)) => { + if *current > *val { + *current = *val + } + } + (Value::Date(val), Value::Date(current)) => { + if *current > *val { + *current = *val + } + } + (Value::Time(val), Value::Time(current)) => { + if *current > *val { + *current = *val + } + } + (Value::Timestamp(val), Value::Timestamp(current)) => { + if *current > *val { + *current = *val + } + } + (Value::TimestampTZ(val), Value::TimestampTZ(current)) => { + if *current > *val { + *current = *val + } + } + _ => {} + } + } + if summary.upper_bound.is_none() { + summary.upper_bound = Some(value.clone()); + } else if let Some(upper_bound) = &mut summary.upper_bound { + match (value, upper_bound) { + (Value::Int(val), Value::Int(current)) => { + if *current < *val { + *current = *val + } + } + (Value::LongInt(val), Value::LongInt(current)) => { + if *current < *val { + *current = *val + } + } + (Value::Float(val), Value::Float(current)) => { + if *current < *val { + *current = *val + } + } + (Value::Double(val), Value::Double(current)) => { + if *current < *val { + *current = *val + } + } + (Value::Date(val), Value::Date(current)) => { + if *current < *val { + *current = *val + } + } + (Value::Time(val), Value::Time(current)) => { + if *current < *val { + *current = *val + } + } + (Value::Timestamp(val), Value::Timestamp(current)) => { + if *current < *val { + *current = *val + } + } + (Value::TimestampTZ(val), Value::TimestampTZ(current)) => { + if *current < *val { + *current = *val + } + } + _ => {} + } + } + } + } + Ok(()) +} + +/// TODO +#[cfg(test)] +mod tests {} diff --git a/iceberg-rust/src/table/mod.rs b/iceberg-rust/src/table/mod.rs index a6758165..4a5bd105 100644 --- a/iceberg-rust/src/table/mod.rs +++ b/iceberg-rust/src/table/mod.rs @@ -29,6 +29,7 @@ use crate::{ table::transaction::TableTransaction, }; +pub mod manifest; pub mod transaction; #[derive(Debug)] From 78596fa1ea6f40e6cb10422f54aca36155277c9f Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Wed, 30 Oct 2024 11:00:57 +0100 Subject: [PATCH 29/32] refactor manifest-writer --- datafusion_iceberg/src/table.rs | 18 +- iceberg-rust-spec/src/spec/manifest.rs | 196 +------ iceberg-rust/src/table/manifest.rs | 126 ++++- iceberg-rust/src/table/mod.rs | 3 +- .../src/table/transaction/operation.rs | 485 ++---------------- 5 files changed, 197 insertions(+), 631 deletions(-) diff --git a/datafusion_iceberg/src/table.rs b/datafusion_iceberg/src/table.rs index d98bbd88..97559db1 100644 --- a/datafusion_iceberg/src/table.rs +++ b/datafusion_iceberg/src/table.rs @@ -929,7 +929,17 @@ mod tests { (12, 2, 3, '2020-01-04', 1), (13, 1, 1, '2020-01-05', 4), (14, 3, 2, '2020-01-05', 2), - (15, 2, 3, '2020-01-05', 3);", + (15, 2, 3, '2020-01-05', 3), + (16, 2, 3, '2020-01-05', 3), + (17, 1, 3, '2020-01-06', 1), + (18, 2, 1, '2020-01-06', 2), + (19, 2, 2, '2020-01-06', 1), + (20, 1, 2, '2020-01-07', 3), + (21, 3, 1, '2020-01-07', 2), + (22, 2, 3, '2020-01-07', 1), + (23, 1, 1, '2020-01-08', 4), + (24, 3, 2, '2020-01-08', 2), + (25, 2, 3, '2020-01-08', 3);", ) .await .expect("Failed to create query plan for insert") @@ -961,9 +971,9 @@ mod tests { ); for (product_id, amount) in product_ids.iter().zip(amounts) { match product_id.unwrap() { - 1 => assert_eq!(amount.unwrap(), 7), - 2 => assert_eq!(amount.unwrap(), 4), - 3 => assert_eq!(amount.unwrap(), 1), + 1 => assert_eq!(amount.unwrap(), 11), + 2 => assert_eq!(amount.unwrap(), 7), + 3 => assert_eq!(amount.unwrap(), 2), _ => panic!("Unexpected order id"), } } diff --git a/iceberg-rust-spec/src/spec/manifest.rs b/iceberg-rust-spec/src/spec/manifest.rs index abc57d8e..080869d7 100644 --- a/iceberg-rust-spec/src/spec/manifest.rs +++ b/iceberg-rust-spec/src/spec/manifest.rs @@ -1,183 +1,25 @@ /*! Manifest files */ -use std::{ - collections::HashMap, - io::Read, - iter::{repeat, Map, Repeat, Zip}, - ops::{Deref, DerefMut}, - sync::Arc, -}; +use std::collections::HashMap; -use apache_avro::{ - types::Value as AvroValue, Reader as AvroReader, Schema as AvroSchema, Writer as AvroWriter, -}; +use apache_avro::Schema as AvroSchema; use derive_builder::Builder; use derive_getters::Getters; use serde::{de::DeserializeOwned, ser::SerializeSeq, Deserialize, Serialize}; use serde_bytes::ByteBuf; use serde_repr::{Deserialize_repr, Serialize_repr}; -use crate::{ - error::Error, - spec::schema::{SchemaV1, SchemaV2}, -}; +use crate::error::Error; use super::{ partition::{PartitionField, PartitionSpec}, schema::Schema, - table_metadata::{FormatVersion, TableMetadata}, + table_metadata::FormatVersion, types::{PrimitiveType, StructType, Type}, values::{Struct, Value}, }; -type ReaderZip<'a, R> = Zip, Repeat>>; -type ReaderMap<'a, R> = Map< - ReaderZip<'a, R>, - fn( - ( - Result, - Arc<(Schema, PartitionSpec, FormatVersion)>, - ), - ) -> Result, ->; - -/// Iterator of ManifestFileEntries -pub struct ManifestReader<'a, R: Read> { - reader: ReaderMap<'a, R>, -} - -impl<'a, R: Read> Iterator for ManifestReader<'a, R> { - type Item = Result; - fn next(&mut self) -> Option { - self.reader.next() - } -} - -impl<'a, R: Read> ManifestReader<'a, R> { - /// Create a new ManifestFile reader - pub fn new(reader: R) -> Result { - let reader = AvroReader::new(reader)?; - let metadata = reader.user_metadata(); - - let format_version: FormatVersion = match metadata - .get("format-version") - .map(|bytes| String::from_utf8(bytes.clone())) - .transpose()? - .unwrap_or("1".to_string()) - .as_str() - { - "1" => Ok(FormatVersion::V1), - "2" => Ok(FormatVersion::V2), - _ => Err(Error::InvalidFormat("format version".to_string())), - }?; - - let schema: Schema = match format_version { - FormatVersion::V1 => TryFrom::::try_from(serde_json::from_slice( - metadata - .get("schema") - .ok_or(Error::InvalidFormat("manifest metadata".to_string()))?, - )?)?, - FormatVersion::V2 => TryFrom::::try_from(serde_json::from_slice( - metadata - .get("schema") - .ok_or(Error::InvalidFormat("manifest metadata".to_string()))?, - )?)?, - }; - - let partition_fields: Vec = serde_json::from_slice( - metadata - .get("partition-spec") - .ok_or(Error::InvalidFormat("manifest metadata".to_string()))?, - )?; - let spec_id: i32 = metadata - .get("partition-spec-id") - .map(|x| String::from_utf8(x.clone())) - .transpose()? - .unwrap_or("0".to_string()) - .parse()?; - let partition_spec = PartitionSpec::builder() - .with_spec_id(spec_id) - .with_fields(partition_fields) - .build()?; - Ok(Self { - reader: reader - .zip(repeat(Arc::new((schema, partition_spec, format_version)))) - .map(avro_value_to_manifest_entry), - }) - } -} - -/// A writer for manifest entries -pub struct ManifestWriter<'a, W: std::io::Write>(AvroWriter<'a, W>); - -impl<'a, W: std::io::Write> Deref for ManifestWriter<'a, W> { - type Target = AvroWriter<'a, W>; - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl<'a, W: std::io::Write> DerefMut for ManifestWriter<'a, W> { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.0 - } -} - -impl<'a, W: std::io::Write> ManifestWriter<'a, W> { - pub fn new( - writer: W, - schema: &'a AvroSchema, - table_metadata: &TableMetadata, - branch: Option<&str>, - ) -> Result { - let mut avro_writer = AvroWriter::new(schema, writer); - - avro_writer.add_user_metadata( - "format-version".to_string(), - match table_metadata.format_version { - FormatVersion::V1 => "1".as_bytes(), - FormatVersion::V2 => "2".as_bytes(), - }, - )?; - - avro_writer.add_user_metadata( - "schema".to_string(), - match table_metadata.format_version { - FormatVersion::V1 => serde_json::to_string(&Into::::into( - table_metadata.current_schema(branch)?.clone(), - ))?, - FormatVersion::V2 => serde_json::to_string(&Into::::into( - table_metadata.current_schema(branch)?.clone(), - ))?, - }, - )?; - - avro_writer.add_user_metadata( - "schema-id".to_string(), - serde_json::to_string(&table_metadata.current_schema(branch)?.schema_id())?, - )?; - - avro_writer.add_user_metadata( - "partition-spec".to_string(), - serde_json::to_string(&table_metadata.default_partition_spec()?.fields())?, - )?; - - avro_writer.add_user_metadata( - "partition-spec-id".to_string(), - serde_json::to_string(&table_metadata.default_partition_spec()?.spec_id())?, - )?; - - avro_writer.add_user_metadata("content".to_string(), "data")?; - - Ok(ManifestWriter(avro_writer)) - } - - pub fn into_inner(self) -> Result { - Ok(self.0.into_inner()?) - } -} - /// Entry in manifest with the iceberg spec version 2. #[derive(Debug, Serialize, PartialEq, Clone, Getters, Builder)] #[serde(into = "ManifestEntryEnum")] @@ -205,7 +47,7 @@ impl ManifestEntry { } impl ManifestEntry { - pub(crate) fn try_from_v2( + pub fn try_from_v2( value: ManifestEntryV2, schema: &Schema, partition_spec: &PartitionSpec, @@ -219,7 +61,7 @@ impl ManifestEntry { }) } - pub(crate) fn try_from_v1( + pub fn try_from_v1( value: ManifestEntryV1, schema: &Schema, partition_spec: &PartitionSpec, @@ -1483,32 +1325,6 @@ impl DataFileV2 { } } -#[allow(clippy::type_complexity)] -// Convert avro value to ManifestEntry based on the format version of the table. -fn avro_value_to_manifest_entry( - value: ( - Result, - Arc<(Schema, PartitionSpec, FormatVersion)>, - ), -) -> Result { - let entry = value.0?; - let schema = &value.1 .0; - let partition_spec = &value.1 .1; - let format_version = &value.1 .2; - match format_version { - FormatVersion::V2 => ManifestEntry::try_from_v2( - apache_avro::from_value::(&entry)?, - schema, - partition_spec, - ), - FormatVersion::V1 => ManifestEntry::try_from_v1( - apache_avro::from_value::(&entry)?, - schema, - partition_spec, - ), - } -} - #[cfg(test)] mod tests { use crate::spec::{ diff --git a/iceberg-rust/src/table/manifest.rs b/iceberg-rust/src/table/manifest.rs index 031aba96..997a79d2 100644 --- a/iceberg-rust/src/table/manifest.rs +++ b/iceberg-rust/src/table/manifest.rs @@ -2,21 +2,105 @@ * Helpers to deal with manifest files */ -use std::sync::Arc; +use std::{ + io::Read, + iter::{repeat, Map, Repeat, Zip}, + sync::Arc, +}; -use apache_avro::{Schema as AvroSchema, Writer as AvroWriter}; +use apache_avro::{ + types::Value as AvroValue, Reader as AvroReader, Schema as AvroSchema, Writer as AvroWriter, +}; use iceberg_rust_spec::{ - manifest::ManifestEntry, + manifest::{ManifestEntry, ManifestEntryV1, ManifestEntryV2}, manifest_list::{self, FieldSummary, ManifestListEntry}, - partition::PartitionField, - schema::{SchemaV1, SchemaV2}, + partition::{PartitionField, PartitionSpec}, + schema::{Schema, SchemaV1, SchemaV2}, table_metadata::{FormatVersion, TableMetadata}, util::strip_prefix, values::{Struct, Value}, }; use object_store::ObjectStore; -use crate::error::Error; +use crate::{error::Error, spec}; + +type ReaderZip<'a, R> = Zip, Repeat>>; +type ReaderMap<'a, R> = Map< + ReaderZip<'a, R>, + fn( + ( + Result, + Arc<(Schema, PartitionSpec, FormatVersion)>, + ), + ) -> Result, +>; + +/// Iterator of ManifestFileEntries +pub struct ManifestReader<'a, R: Read> { + reader: ReaderMap<'a, R>, +} + +impl<'a, R: Read> Iterator for ManifestReader<'a, R> { + type Item = Result; + fn next(&mut self) -> Option { + self.reader.next() + } +} + +impl<'a, R: Read> ManifestReader<'a, R> { + /// Create a new ManifestFile reader + pub fn new(reader: R) -> Result { + let reader = AvroReader::new(reader)?; + let metadata = reader.user_metadata(); + + let format_version: FormatVersion = match metadata + .get("format-version") + .map(|bytes| String::from_utf8(bytes.clone())) + .transpose()? + .unwrap_or("1".to_string()) + .as_str() + { + "1" => Ok(FormatVersion::V1), + "2" => Ok(FormatVersion::V2), + _ => Err(Error::InvalidFormat("format version".to_string())), + }?; + + let schema: Schema = match format_version { + FormatVersion::V1 => TryFrom::::try_from(serde_json::from_slice( + metadata + .get("schema") + .ok_or(Error::InvalidFormat("manifest metadata".to_string()))?, + )?)?, + FormatVersion::V2 => TryFrom::::try_from(serde_json::from_slice( + metadata + .get("schema") + .ok_or(Error::InvalidFormat("manifest metadata".to_string()))?, + )?)?, + }; + + let partition_fields: Vec = serde_json::from_slice( + metadata + .get("partition-spec") + .ok_or(Error::InvalidFormat("manifest metadata".to_string()))?, + )?; + let spec_id: i32 = metadata + .get("partition-spec-id") + .map(|x| String::from_utf8(x.clone())) + .transpose()? + .unwrap_or("0".to_string()) + .parse()?; + let partition_spec = PartitionSpec::builder() + .with_spec_id(spec_id) + .with_fields(partition_fields) + .build() + .map_err(spec::error::Error::from)?; + Ok(Self { + reader: reader + .zip(repeat(Arc::new((schema, partition_spec, format_version)))) + .map(avro_value_to_manifest_entry), + }) + } +} /// A helper to write entries into a manifest pub struct ManifestWriter<'schema, 'metadata> { @@ -199,7 +283,7 @@ impl<'schema, 'metadata> ManifestWriter<'schema, 'metadata> { } /// Write the manifest to object storage and return the manifest-list-entry - pub async fn write( + pub async fn finish( mut self, object_store: Arc, ) -> Result { @@ -219,6 +303,34 @@ impl<'schema, 'metadata> ManifestWriter<'schema, 'metadata> { } } +#[allow(clippy::type_complexity)] +/// Convert avro value to ManifestEntry based on the format version of the table. +pub fn avro_value_to_manifest_entry( + value: ( + Result, + Arc<(Schema, PartitionSpec, FormatVersion)>, + ), +) -> Result { + let entry = value.0?; + let schema = &value.1 .0; + let partition_spec = &value.1 .1; + let format_version = &value.1 .2; + match format_version { + FormatVersion::V2 => ManifestEntry::try_from_v2( + apache_avro::from_value::(&entry)?, + schema, + partition_spec, + ) + .map_err(Error::from), + FormatVersion::V1 => ManifestEntry::try_from_v1( + apache_avro::from_value::(&entry)?, + schema, + partition_spec, + ) + .map_err(Error::from), + } +} + fn update_partitions( partitions: &mut [FieldSummary], partition_values: &Struct, diff --git a/iceberg-rust/src/table/mod.rs b/iceberg-rust/src/table/mod.rs index 4a5bd105..979d44e9 100644 --- a/iceberg-rust/src/table/mod.rs +++ b/iceberg-rust/src/table/mod.rs @@ -4,6 +4,7 @@ Defining the [Table] struct that represents an iceberg table. use std::{io::Cursor, iter::repeat, sync::Arc}; +use manifest::ManifestReader; use object_store::{path::Path, ObjectStore}; use futures::{ @@ -12,7 +13,7 @@ use futures::{ use iceberg_rust_spec::util::{self}; use iceberg_rust_spec::{ spec::{ - manifest::{Content, ManifestEntry, ManifestReader}, + manifest::{Content, ManifestEntry}, manifest_list::ManifestListEntry, schema::Schema, table_metadata::TableMetadata, diff --git a/iceberg-rust/src/table/transaction/operation.rs b/iceberg-rust/src/table/transaction/operation.rs index 550334b1..945960cb 100644 --- a/iceberg-rust/src/table/transaction/operation.rs +++ b/iceberg-rust/src/table/transaction/operation.rs @@ -8,24 +8,19 @@ use iceberg_rust_spec::manifest_list::{ manifest_list_schema_v1, manifest_list_schema_v2, ManifestListReader, }; use iceberg_rust_spec::spec::table_metadata::TableMetadata; -use iceberg_rust_spec::table_metadata::FormatVersion; -use iceberg_rust_spec::util::strip_prefix; -use iceberg_rust_spec::{ - manifest::ManifestReader, - spec::{ - manifest::{partition_value_schema, DataFile, ManifestEntry, ManifestWriter, Status}, - manifest_list::{Content, FieldSummary, ManifestListEntry}, - partition::PartitionField, - schema::Schema, - snapshot::{ - generate_snapshot_id, SnapshotBuilder, SnapshotReference, SnapshotRetention, Summary, - }, - values::{Struct, Value}, +use iceberg_rust_spec::spec::{ + manifest::{partition_value_schema, DataFile, ManifestEntry, Status}, + schema::Schema, + snapshot::{ + generate_snapshot_id, SnapshotBuilder, SnapshotReference, SnapshotRetention, Summary, }, }; +use iceberg_rust_spec::table_metadata::FormatVersion; +use iceberg_rust_spec::util::strip_prefix; use object_store::ObjectStore; use smallvec::SmallVec; +use crate::table::manifest::{ManifestReader, ManifestWriter}; use crate::{ catalog::commit::{TableRequirement, TableUpdate}, error::Error, @@ -216,16 +211,7 @@ impl Operation { // Write manifest files // Split manifest file if limit is exceeded if n_splits == 0 { - // If manifest doesn't need to be split - let mut manifest_writer = ManifestWriter::new( - Vec::new(), - &manifest_schema, - table_metadata, - branch.as_deref(), - )?; - - // Copy potential existing entries - if let Some(manifest) = &manifest { + let mut manifest_writer = if let Some(manifest) = manifest { let manifest_bytes: Vec = object_store .get(&strip_prefix(&manifest.manifest_path).as_str().into()) .await? @@ -233,12 +219,14 @@ impl Operation { .await? .into(); - let manifest_reader = apache_avro::Reader::new(&*manifest_bytes)?; - manifest_writer.extend(manifest_reader.filter_map(Result::ok))?; - }; - - // If there is no manifest, create one - let mut manifest = manifest.unwrap_or_else(|| { + ManifestWriter::from_existing( + &manifest_bytes, + manifest, + &manifest_schema, + table_metadata, + branch.as_deref(), + )? + } else { let manifest_location = table_metadata.location.to_string() + "/metadata/" + snapshot_uuid @@ -246,80 +234,20 @@ impl Operation { + &0.to_string() + ".avro"; - ManifestListEntry { - format_version: table_metadata.format_version, - manifest_path: manifest_location, - manifest_length: 0, - partition_spec_id: table_metadata.default_spec_id, - content: Content::Data, - sequence_number: table_metadata.last_sequence_number, - min_sequence_number: 0, - added_snapshot_id: snapshot_id, - added_files_count: Some(0), - existing_files_count: Some(0), - deleted_files_count: Some(0), - added_rows_count: Some(0), - existing_rows_count: Some(0), - deleted_rows_count: Some(0), - partitions: None, - key_metadata: None, - } - }); + ManifestWriter::new( + &manifest_location, + snapshot_id, + &manifest_schema, + table_metadata, + branch.as_deref(), + )? + }; for manifest_entry in new_datafile_iter { - { - let manifest_entry = manifest_entry?; - - let mut added_rows_count = 0; - - if manifest.partitions.is_none() { - manifest.partitions = Some( - table_metadata - .default_partition_spec()? - .fields() - .iter() - .map(|_| FieldSummary { - contains_null: false, - contains_nan: None, - lower_bound: None, - upper_bound: None, - }) - .collect::>(), - ); - } - - added_rows_count += manifest_entry.data_file().record_count(); - update_partitions( - manifest.partitions.as_mut().unwrap(), - manifest_entry.data_file().partition(), - table_metadata.default_partition_spec()?.fields(), - )?; - - manifest_writer.append_ser(manifest_entry)?; - - manifest.added_files_count = match manifest.added_files_count { - Some(count) => Some(count + new_file_count as i32), - None => Some(new_file_count as i32), - }; - manifest.added_rows_count = match manifest.added_rows_count { - Some(count) => Some(count + added_rows_count), - None => Some(added_rows_count), - }; - } + manifest_writer.append(manifest_entry?)?; } - let manifest_bytes = manifest_writer.into_inner()?; - - let manifest_length: i64 = manifest_bytes.len() as i64; - - manifest.manifest_length += manifest_length; - - object_store - .put( - &strip_prefix(&manifest.manifest_path).as_str().into(), - manifest_bytes.into(), - ) - .await?; + let manifest = manifest_writer.finish(object_store.clone()).await?; manifest_list_writer.append_ser(manifest)?; } else { @@ -351,90 +279,26 @@ impl Operation { }; for (i, entries) in splits.into_iter().enumerate() { - let mut manifest_writer = ManifestWriter::new( - Vec::new(), - &manifest_schema, - table_metadata, - branch.as_deref(), - )?; - let manifest_location = table_metadata.location.to_string() + "/metadata/" + snapshot_uuid + "-m" + &i.to_string() + ".avro"; - let mut manifest = ManifestListEntry { - format_version: table_metadata.format_version, - manifest_path: manifest_location, - manifest_length: 0, - partition_spec_id: table_metadata.default_spec_id, - content: Content::Data, - sequence_number: table_metadata.last_sequence_number, - min_sequence_number: 0, - added_snapshot_id: snapshot_id, - added_files_count: Some(0), - existing_files_count: Some(0), - deleted_files_count: Some(0), - added_rows_count: Some(0), - existing_rows_count: Some(0), - deleted_rows_count: Some(0), - partitions: None, - key_metadata: None, - }; + + let mut manifest_writer = ManifestWriter::new( + &manifest_location, + snapshot_id, + &manifest_schema, + table_metadata, + branch.as_deref(), + )?; for manifest_entry in entries { - { - let mut added_rows_count = 0; - - if manifest.partitions.is_none() { - manifest.partitions = Some( - table_metadata - .default_partition_spec()? - .fields() - .iter() - .map(|_| FieldSummary { - contains_null: false, - contains_nan: None, - lower_bound: None, - upper_bound: None, - }) - .collect::>(), - ); - } - - added_rows_count += manifest_entry.data_file().record_count(); - update_partitions( - manifest.partitions.as_mut().unwrap(), - manifest_entry.data_file().partition(), - table_metadata.default_partition_spec()?.fields(), - )?; - - manifest_writer.append_ser(manifest_entry)?; - - manifest.added_files_count = match manifest.added_files_count { - Some(count) => Some(count + new_file_count as i32), - None => Some(new_file_count as i32), - }; - manifest.added_rows_count = match manifest.added_rows_count { - Some(count) => Some(count + added_rows_count), - None => Some(added_rows_count), - }; - } + manifest_writer.append(manifest_entry)?; } - let manifest_bytes = manifest_writer.into_inner()?; - - let manifest_length: i64 = manifest_bytes.len() as i64; - - manifest.manifest_length += manifest_length; - - object_store - .put( - &strip_prefix(&manifest.manifest_path).as_str().into(), - manifest_bytes.into(), - ) - .await?; + let manifest = manifest_writer.finish(object_store.clone()).await?; manifest_list_writer.append_ser(manifest)?; } @@ -564,12 +428,6 @@ impl Operation { // Split manifest file if limit is exceeded if n_splits == 0 { // If manifest doesn't need to be split - let mut manifest_writer = ManifestWriter::new( - Vec::new(), - &manifest_schema, - table_metadata, - branch.as_deref(), - )?; let manifest_location = table_metadata.location.to_string() + "/metadata/" @@ -577,79 +435,19 @@ impl Operation { + "-m" + &0.to_string() + ".avro"; - let mut manifest = ManifestListEntry { - format_version: table_metadata.format_version, - manifest_path: manifest_location, - manifest_length: 0, - partition_spec_id: table_metadata.default_spec_id, - content: Content::Data, - sequence_number: table_metadata.last_sequence_number, - min_sequence_number: 0, - added_snapshot_id: snapshot_id, - added_files_count: Some(0), - existing_files_count: Some(0), - deleted_files_count: Some(0), - added_rows_count: Some(0), - existing_rows_count: Some(0), - deleted_rows_count: Some(0), - partitions: None, - key_metadata: None, - }; + let mut manifest_writer = ManifestWriter::new( + &manifest_location, + snapshot_id, + &manifest_schema, + table_metadata, + branch.as_deref(), + )?; for manifest_entry in new_datafile_iter { - { - let manifest_entry = manifest_entry?; - - let mut added_rows_count = 0; - - if manifest.partitions.is_none() { - manifest.partitions = Some( - table_metadata - .default_partition_spec()? - .fields() - .iter() - .map(|_| FieldSummary { - contains_null: false, - contains_nan: None, - lower_bound: None, - upper_bound: None, - }) - .collect::>(), - ); - } - - added_rows_count += manifest_entry.data_file().record_count(); - update_partitions( - manifest.partitions.as_mut().unwrap(), - manifest_entry.data_file().partition(), - table_metadata.default_partition_spec()?.fields(), - )?; - - manifest_writer.append_ser(manifest_entry)?; - - manifest.added_files_count = match manifest.added_files_count { - Some(count) => Some(count + new_file_count as i32), - None => Some(new_file_count as i32), - }; - manifest.added_rows_count = match manifest.added_rows_count { - Some(count) => Some(count + added_rows_count), - None => Some(added_rows_count), - }; - } + manifest_writer.append(manifest_entry?)?; } - let manifest_bytes = manifest_writer.into_inner()?; - - let manifest_length: i64 = manifest_bytes.len() as i64; - - manifest.manifest_length += manifest_length; - - object_store - .put( - &strip_prefix(&manifest.manifest_path).as_str().into(), - manifest_bytes.into(), - ) - .await?; + let manifest = manifest_writer.finish(object_store.clone()).await?; manifest_list_writer.append_ser(manifest)?; } else { @@ -662,90 +460,26 @@ impl Operation { )?; for (i, entries) in splits.into_iter().enumerate() { - let mut manifest_writer = ManifestWriter::new( - Vec::new(), - &manifest_schema, - table_metadata, - branch.as_deref(), - )?; - let manifest_location = table_metadata.location.to_string() + "/metadata/" + snapshot_uuid + "-m" + &i.to_string() + ".avro"; - let mut manifest = ManifestListEntry { - format_version: table_metadata.format_version, - manifest_path: manifest_location, - manifest_length: 0, - partition_spec_id: table_metadata.default_spec_id, - content: Content::Data, - sequence_number: table_metadata.last_sequence_number, - min_sequence_number: 0, - added_snapshot_id: snapshot_id, - added_files_count: Some(0), - existing_files_count: Some(0), - deleted_files_count: Some(0), - added_rows_count: Some(0), - existing_rows_count: Some(0), - deleted_rows_count: Some(0), - partitions: None, - key_metadata: None, - }; + + let mut manifest_writer = ManifestWriter::new( + &manifest_location, + snapshot_id, + &manifest_schema, + table_metadata, + branch.as_deref(), + )?; for manifest_entry in entries { - { - let mut added_rows_count = 0; - - if manifest.partitions.is_none() { - manifest.partitions = Some( - table_metadata - .default_partition_spec()? - .fields() - .iter() - .map(|_| FieldSummary { - contains_null: false, - contains_nan: None, - lower_bound: None, - upper_bound: None, - }) - .collect::>(), - ); - } - - added_rows_count += manifest_entry.data_file().record_count(); - update_partitions( - manifest.partitions.as_mut().unwrap(), - manifest_entry.data_file().partition(), - table_metadata.default_partition_spec()?.fields(), - )?; - - manifest_writer.append_ser(manifest_entry)?; - - manifest.added_files_count = match manifest.added_files_count { - Some(count) => Some(count + new_file_count as i32), - None => Some(new_file_count as i32), - }; - manifest.added_rows_count = match manifest.added_rows_count { - Some(count) => Some(count + added_rows_count), - None => Some(added_rows_count), - }; - } + manifest_writer.append(manifest_entry)?; } - let manifest_bytes = manifest_writer.into_inner()?; - - let manifest_length: i64 = manifest_bytes.len() as i64; - - manifest.manifest_length += manifest_length; - - object_store - .put( - &strip_prefix(&manifest.manifest_path).as_str().into(), - manifest_bytes.into(), - ) - .await?; + let manifest = manifest_writer.finish(object_store.clone()).await?; manifest_list_writer.append_ser(manifest)?; } @@ -832,110 +566,3 @@ impl Operation { } } } - -fn update_partitions( - partitions: &mut [FieldSummary], - partition_values: &Struct, - partition_columns: &[PartitionField], -) -> Result<(), Error> { - for (field, summary) in partition_columns.iter().zip(partitions.iter_mut()) { - let value = partition_values.get(field.name()).and_then(|x| x.as_ref()); - if let Some(value) = value { - if summary.lower_bound.is_none() { - summary.lower_bound = Some(value.clone()); - } else if let Some(lower_bound) = &mut summary.lower_bound { - match (value, lower_bound) { - (Value::Int(val), Value::Int(current)) => { - if *current > *val { - *current = *val - } - } - (Value::LongInt(val), Value::LongInt(current)) => { - if *current > *val { - *current = *val - } - } - (Value::Float(val), Value::Float(current)) => { - if *current > *val { - *current = *val - } - } - (Value::Double(val), Value::Double(current)) => { - if *current > *val { - *current = *val - } - } - (Value::Date(val), Value::Date(current)) => { - if *current > *val { - *current = *val - } - } - (Value::Time(val), Value::Time(current)) => { - if *current > *val { - *current = *val - } - } - (Value::Timestamp(val), Value::Timestamp(current)) => { - if *current > *val { - *current = *val - } - } - (Value::TimestampTZ(val), Value::TimestampTZ(current)) => { - if *current > *val { - *current = *val - } - } - _ => {} - } - } - if summary.upper_bound.is_none() { - summary.upper_bound = Some(value.clone()); - } else if let Some(upper_bound) = &mut summary.upper_bound { - match (value, upper_bound) { - (Value::Int(val), Value::Int(current)) => { - if *current < *val { - *current = *val - } - } - (Value::LongInt(val), Value::LongInt(current)) => { - if *current < *val { - *current = *val - } - } - (Value::Float(val), Value::Float(current)) => { - if *current < *val { - *current = *val - } - } - (Value::Double(val), Value::Double(current)) => { - if *current < *val { - *current = *val - } - } - (Value::Date(val), Value::Date(current)) => { - if *current < *val { - *current = *val - } - } - (Value::Time(val), Value::Time(current)) => { - if *current < *val { - *current = *val - } - } - (Value::Timestamp(val), Value::Timestamp(current)) => { - if *current < *val { - *current = *val - } - } - (Value::TimestampTZ(val), Value::TimestampTZ(current)) => { - if *current < *val { - *current = *val - } - } - _ => {} - } - } - } - } - Ok(()) -} From 4cc2bff2f316d1b08d718afb76483604c23d06e8 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Wed, 30 Oct 2024 11:16:05 +0100 Subject: [PATCH 30/32] fix clippy warnings --- datafusion_iceberg/src/table.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/datafusion_iceberg/src/table.rs b/datafusion_iceberg/src/table.rs index 97559db1..ee53e4d2 100644 --- a/datafusion_iceberg/src/table.rs +++ b/datafusion_iceberg/src/table.rs @@ -980,11 +980,8 @@ mod tests { } } - match table.tabular.read().await.deref() { - Tabular::Table(table) => { - assert_eq!(table.manifests(None, None).await.unwrap().len(), 2); - } - _ => (), + if let Tabular::Table(table) = table.tabular.read().await.deref() { + assert_eq!(table.manifests(None, None).await.unwrap().len(), 2); }; } From f3532f9c8ef74bc4307d01d54ba5d7f37b3c6c12 Mon Sep 17 00:00:00 2001 From: Remi Dettai Date: Thu, 31 Oct 2024 10:44:56 +0100 Subject: [PATCH 31/32] Refactor and fix select_manifest --- iceberg-rust/src/table/transaction/append.rs | 143 ++++++++++-------- .../src/table/transaction/operation.rs | 25 ++- 2 files changed, 93 insertions(+), 75 deletions(-) diff --git a/iceberg-rust/src/table/transaction/append.rs b/iceberg-rust/src/table/transaction/append.rs index a23456fc..6e00e18b 100644 --- a/iceberg-rust/src/table/transaction/append.rs +++ b/iceberg-rust/src/table/transaction/append.rs @@ -4,7 +4,6 @@ use iceberg_rust_spec::{ manifest::ManifestEntry, manifest_list::{ManifestListEntry, ManifestListReader}, }; -use smallvec::SmallVec; use crate::{ error::Error, @@ -89,75 +88,85 @@ pub(crate) fn split_datafiles( } } -/// Select the manifest that yields the smallest bounding rectangle after the bounding rectangle of the new values has been added. -pub(crate) fn select_manifest( - partition_column_names: &SmallVec<[&str; 4]>, - mut manifest_list_reader: ManifestListReader<&[u8]>, +/// Select the manifest that yields the smallest bounding rectangle after the +/// bounding rectangle of the new values has been added. +pub(crate) fn select_manifest_partitioned( + manifest_list_reader: ManifestListReader<&[u8]>, file_count: &mut usize, manifest_list_writer: &mut apache_avro::Writer>, bounding_partition_values: &Rectangle, ) -> Result { - let manifest = - if partition_column_names.is_empty() { - // Find the manifest with the lowest row count - manifest_list_reader - .try_fold(None, |acc, x| { - let manifest = x?; - - let row_count = manifest.added_rows_count; - - *file_count += manifest.added_files_count.unwrap_or(0) as usize; - - let Some((old_row_count, old_manifest)) = acc else { - return Ok::<_, Error>(Some((row_count, manifest))); - }; - - let Some(row_count) = row_count else { - return Ok(Some((old_row_count, old_manifest))); - }; - - if old_row_count.is_none() || old_row_count.is_some_and(|x| x > row_count) { - manifest_list_writer.append_ser(old_manifest)?; - Ok(Some((Some(row_count), manifest))) - } else { - manifest_list_writer.append_ser(manifest)?; - Ok(Some((old_row_count, old_manifest))) - } - })? - .ok_or(Error::NotFound("Manifest".to_owned(), "file".to_owned()))? - .1 - } else { - // Find the manifest with the smallest bounding partition values - manifest_list_reader - .try_fold(None, |acc, x| { - let manifest = x?; - - let mut bounds = - summary_to_rectangle(manifest.partitions.as_ref().ok_or( - Error::NotFound("Partition".to_owned(), "struct".to_owned()), - )?)?; - - bounds.expand(bounding_partition_values); - - *file_count += manifest.added_files_count.unwrap_or(0) as usize; - - let Some((old_bounds, old_manifest)) = acc else { - return Ok::<_, Error>(Some((bounds, manifest))); - }; - - match old_bounds.cmp_with_priority(&bounds)? { - Ordering::Greater => { - manifest_list_writer.append_ser(old_manifest)?; - Ok(Some((bounds, manifest))) - } - _ => { - manifest_list_writer.append_ser(manifest)?; - Ok(Some((old_bounds, old_manifest))) - } - } - })? - .ok_or(Error::NotFound("Manifest".to_owned(), "file".to_owned()))? - .1 + let mut selected_state = None; + for manifest_res in manifest_list_reader { + let manifest = manifest_res?; + + let mut bounds = summary_to_rectangle( + manifest + .partitions + .as_ref() + .ok_or(Error::NotFound("Partition".to_owned(), "struct".to_owned()))?, + )?; + + bounds.expand(bounding_partition_values); + + *file_count += manifest.added_files_count.unwrap_or(0) as usize; + + let Some((selected_bounds, selected_manifest)) = &selected_state else { + selected_state = Some((bounds, manifest)); + continue; + }; + + match selected_bounds.cmp_with_priority(&bounds)? { + Ordering::Greater => { + manifest_list_writer.append_ser(selected_manifest)?; + selected_state = Some((bounds, manifest)); + continue; + } + _ => { + manifest_list_writer.append_ser(manifest)?; + continue; + } + } + } + selected_state + .map(|(_, manifest)| manifest) + .ok_or(Error::NotFound("Manifest".to_owned(), "file".to_owned())) +} + +/// Select the manifest with the smallest number of rows. +pub(crate) fn select_manifest_unpartitioned( + manifest_list_reader: ManifestListReader<&[u8]>, + file_count: &mut usize, + manifest_list_writer: &mut apache_avro::Writer>, +) -> Result { + let mut selected_state = None; + for manifest_res in manifest_list_reader { + let manifest = manifest_res?; + // TODO: should this also account for existing_rows_count / existing_files_count? + let row_count = manifest.added_rows_count; + *file_count += manifest.added_files_count.unwrap_or(0) as usize; + + let Some((selected_row_count, selected_manifest)) = &selected_state else { + selected_state = Some((row_count, manifest)); + continue; }; - Ok(manifest) + + // If the file doesn't have any rows, we select it + let Some(row_count) = row_count else { + selected_state = Some((row_count, manifest)); + continue; + }; + + if selected_row_count.is_some_and(|x| x > row_count) { + manifest_list_writer.append_ser(selected_manifest)?; + selected_state = Some((Some(row_count), manifest)); + continue; + } else { + manifest_list_writer.append_ser(manifest)?; + continue; + } + } + selected_state + .map(|(_, manifest)| manifest) + .ok_or(Error::NotFound("Manifest".to_owned(), "file".to_owned())) } diff --git a/iceberg-rust/src/table/transaction/operation.rs b/iceberg-rust/src/table/transaction/operation.rs index 945960cb..6806f508 100644 --- a/iceberg-rust/src/table/transaction/operation.rs +++ b/iceberg-rust/src/table/transaction/operation.rs @@ -27,7 +27,7 @@ use crate::{ util::{partition_struct_to_vec, summary_to_rectangle, Rectangle}, }; -use super::append::{select_manifest, split_datafiles}; +use super::append::{select_manifest_partitioned, select_manifest_unpartitioned, split_datafiles}; static MIN_DATAFILES: usize = 4; @@ -127,6 +127,7 @@ impl Operation { let old_manifest_list_location = old_snapshot.map(|x| x.manifest_list()).cloned(); + // TO DISCUSS: What is this? let mut file_count = 0; // Find a manifest to add the new datafiles @@ -141,19 +142,27 @@ impl Operation { let manifest_list_reader = ManifestListReader::new(old_manifest_list_bytes.as_ref(), table_metadata)?; - let manifest = select_manifest( - &partition_column_names, - manifest_list_reader, - &mut file_count, - &mut manifest_list_writer, - &bounding_partition_values, - )?; + let manifest = if partition_column_names.is_empty() { + select_manifest_unpartitioned( + manifest_list_reader, + &mut file_count, + &mut manifest_list_writer, + )? + } else { + select_manifest_partitioned( + manifest_list_reader, + &mut file_count, + &mut manifest_list_writer, + &bounding_partition_values, + )? + }; Some(manifest) } else { // If manifest list doesn't exist, there is no manifest None }; + // TO DISCUSS: What is this rule? let limit = MIN_DATAFILES + ((file_count + files.len()) as f64).sqrt() as usize; let new_file_count = manifest From 12de20a71b102477d07c3b6914c4ecf0d5cfb942 Mon Sep 17 00:00:00 2001 From: Remi Dettai Date: Thu, 31 Oct 2024 11:31:08 +0100 Subject: [PATCH 32/32] Refactor n_split computation --- iceberg-rust/src/table/transaction/append.rs | 27 ++++-- .../src/table/transaction/operation.rs | 93 +++++++++++-------- 2 files changed, 72 insertions(+), 48 deletions(-) diff --git a/iceberg-rust/src/table/transaction/append.rs b/iceberg-rust/src/table/transaction/append.rs index 6e00e18b..9d9f7ec3 100644 --- a/iceberg-rust/src/table/transaction/append.rs +++ b/iceberg-rust/src/table/transaction/append.rs @@ -88,15 +88,20 @@ pub(crate) fn split_datafiles( } } +pub(crate) struct SelectedManifest { + pub manifest: ManifestListEntry, + pub file_count_all_entries: usize, +} + /// Select the manifest that yields the smallest bounding rectangle after the /// bounding rectangle of the new values has been added. pub(crate) fn select_manifest_partitioned( manifest_list_reader: ManifestListReader<&[u8]>, - file_count: &mut usize, manifest_list_writer: &mut apache_avro::Writer>, bounding_partition_values: &Rectangle, -) -> Result { +) -> Result { let mut selected_state = None; + let mut file_count_all_entries = 0; for manifest_res in manifest_list_reader { let manifest = manifest_res?; @@ -109,7 +114,7 @@ pub(crate) fn select_manifest_partitioned( bounds.expand(bounding_partition_values); - *file_count += manifest.added_files_count.unwrap_or(0) as usize; + file_count_all_entries += manifest.added_files_count.unwrap_or(0) as usize; let Some((selected_bounds, selected_manifest)) = &selected_state else { selected_state = Some((bounds, manifest)); @@ -129,22 +134,25 @@ pub(crate) fn select_manifest_partitioned( } } selected_state - .map(|(_, manifest)| manifest) + .map(|(_, entry)| SelectedManifest { + manifest: entry, + file_count_all_entries, + }) .ok_or(Error::NotFound("Manifest".to_owned(), "file".to_owned())) } /// Select the manifest with the smallest number of rows. pub(crate) fn select_manifest_unpartitioned( manifest_list_reader: ManifestListReader<&[u8]>, - file_count: &mut usize, manifest_list_writer: &mut apache_avro::Writer>, -) -> Result { +) -> Result { let mut selected_state = None; + let mut file_count_all_entries = 0; for manifest_res in manifest_list_reader { let manifest = manifest_res?; // TODO: should this also account for existing_rows_count / existing_files_count? let row_count = manifest.added_rows_count; - *file_count += manifest.added_files_count.unwrap_or(0) as usize; + file_count_all_entries += manifest.added_files_count.unwrap_or(0) as usize; let Some((selected_row_count, selected_manifest)) = &selected_state else { selected_state = Some((row_count, manifest)); @@ -167,6 +175,9 @@ pub(crate) fn select_manifest_unpartitioned( } } selected_state - .map(|(_, manifest)| manifest) + .map(|(_, entry)| SelectedManifest { + manifest: entry, + file_count_all_entries, + }) .ok_or(Error::NotFound("Manifest".to_owned(), "file".to_owned())) } diff --git a/iceberg-rust/src/table/transaction/operation.rs b/iceberg-rust/src/table/transaction/operation.rs index 6806f508..3266dca9 100644 --- a/iceberg-rust/src/table/transaction/operation.rs +++ b/iceberg-rust/src/table/transaction/operation.rs @@ -27,9 +27,35 @@ use crate::{ util::{partition_struct_to_vec, summary_to_rectangle, Rectangle}, }; -use super::append::{select_manifest_partitioned, select_manifest_unpartitioned, split_datafiles}; +use super::append::{ + select_manifest_partitioned, select_manifest_unpartitioned, split_datafiles, SelectedManifest, +}; -static MIN_DATAFILES: usize = 4; +/// The target number of datafiles per manifest is dynamic, but we don't want to go below this number. +static MIN_DATAFILES_PER_MANIFEST: usize = 4; + +/// To achieve fast lookups of the datafiles, the manifest tree should be somewhat balanced, meaning that manifest files should contain a similar number of datafiles. +/// This means that manifest files might need to be split up when they get too large. Since the number of datafiles being added by a append operation might be really large, +/// it might even be required to split the manifest file multiple times. *n_splits* stores how many times a manifest file needs to be split to give at most *limit* datafiles per manifest. +fn compute_n_splits( + existing_file_count: usize, + new_file_count: usize, + selected_manifest_file_count: usize, +) -> u32 { + // We want: + // nb manifests per manifest list ~= nb data files per manifest + // Since: + // total number of data files = nb manifests per manifest list * nb data files per manifest + // We shall have: + // limit = sqrt(total number of data files) + let limit = MIN_DATAFILES_PER_MANIFEST + + ((existing_file_count + new_file_count) as f64).sqrt() as usize; + let new_manifest_file_count = selected_manifest_file_count + new_file_count; + match new_manifest_file_count / limit { + 0 => 0, + x => x.ilog2() + 1, + } +} #[derive(Debug)] ///Table operations @@ -88,7 +114,7 @@ impl Operation { match self { Operation::Append { branch, - files, + files: new_files, additional_summary, } => { let partition_spec = table_metadata.default_partition_spec()?; @@ -102,7 +128,7 @@ impl Operation { .map(|x| x.name().as_str()) .collect::>(); - let bounding_partition_values = files + let bounding_partition_values = new_files .iter() .try_fold(None, |acc, x| { let node = partition_struct_to_vec(x.partition(), &partition_column_names)?; @@ -127,11 +153,10 @@ impl Operation { let old_manifest_list_location = old_snapshot.map(|x| x.manifest_list()).cloned(); - // TO DISCUSS: What is this? - let mut file_count = 0; - // Find a manifest to add the new datafiles - let manifest = if let Some(old_manifest_list_location) = &old_manifest_list_location + let mut existing_file_count = 0; + let selected_manifest_opt = if let Some(old_manifest_list_location) = + &old_manifest_list_location { let old_manifest_list_bytes = object_store .get(&strip_prefix(old_manifest_list_location).as_str().into()) @@ -142,44 +167,40 @@ impl Operation { let manifest_list_reader = ManifestListReader::new(old_manifest_list_bytes.as_ref(), table_metadata)?; - let manifest = if partition_column_names.is_empty() { + let SelectedManifest { + manifest, + file_count_all_entries, + } = if partition_column_names.is_empty() { select_manifest_unpartitioned( manifest_list_reader, - &mut file_count, &mut manifest_list_writer, )? } else { select_manifest_partitioned( manifest_list_reader, - &mut file_count, &mut manifest_list_writer, &bounding_partition_values, )? }; + existing_file_count = file_count_all_entries; Some(manifest) } else { // If manifest list doesn't exist, there is no manifest None }; - // TO DISCUSS: What is this rule? - let limit = MIN_DATAFILES + ((file_count + files.len()) as f64).sqrt() as usize; - - let new_file_count = manifest + let selected_manifest_file_count = selected_manifest_opt .as_ref() - .and_then(|x| x.added_files_count) - .unwrap_or(0) as usize - + files.len(); - - // To achieve fast lookups of the datafiles, the maniest tree should be somewhat balanced, meaning that manifest files should contain a similar number of datafiles. - // This means that maniest files might need to be split up when they get too large. Since the number of datafiles being added by a append operation might be really large, - // it might even be required to split the manifest file multiple times. *N_splits* stores how many times a manifest file needs to be split to give at most *limit* datafiles per manifest - let n_splits = match new_file_count / limit { - 0 => 0, - x => x.ilog2() + 1, - }; - - let bounds = manifest + // TODO: should this also account for existing_files_count? + .and_then(|selected_manifest| selected_manifest.added_files_count) + .unwrap_or(0) as usize; + let n_splits = compute_n_splits( + existing_file_count, + new_files.len(), + selected_manifest_file_count, + ); + + let bounds = selected_manifest_opt .as_ref() .and_then(|x| x.partitions.as_deref()) .map(summary_to_rectangle) @@ -193,7 +214,7 @@ impl Operation { let snapshot_id = generate_snapshot_id(); let sequence_number = table_metadata.last_sequence_number + 1; - let new_datafile_iter = files.into_iter().map(|data_file| { + let new_datafile_iter = new_files.into_iter().map(|data_file| { ManifestEntry::builder() .with_format_version(table_metadata.format_version) .with_status(Status::Added) @@ -220,7 +241,7 @@ impl Operation { // Write manifest files // Split manifest file if limit is exceeded if n_splits == 0 { - let mut manifest_writer = if let Some(manifest) = manifest { + let mut manifest_writer = if let Some(manifest) = selected_manifest_opt { let manifest_bytes: Vec = object_store .get(&strip_prefix(&manifest.manifest_path).as_str().into()) .await? @@ -261,7 +282,7 @@ impl Operation { manifest_list_writer.append_ser(manifest)?; } else { // Split datafiles - let splits = if let Some(manifest) = manifest { + let splits = if let Some(manifest) = selected_manifest_opt { let manifest_bytes: Vec = object_store .get(&strip_prefix(&manifest.manifest_path).as_str().into()) .await? @@ -396,15 +417,7 @@ impl Operation { let mut manifest_list_writer = apache_avro::Writer::new(manifest_list_schema, Vec::new()); - let new_file_count = files.len(); - - let limit = MIN_DATAFILES + ((new_file_count) as f64).sqrt() as usize; - - // How many times do the files need to be split to give at most *limit* files per manifest - let n_splits = match new_file_count / limit { - 0 => 0, - x => x.ilog2() + 1, - }; + let n_splits = compute_n_splits(0, files.len(), 0); let snapshot_id = generate_snapshot_id(); let sequence_number = table_metadata.last_sequence_number + 1;