diff --git a/crates/paimon/Cargo.toml b/crates/paimon/Cargo.toml index 0af0b17..0f03c5b 100644 --- a/crates/paimon/Cargo.toml +++ b/crates/paimon/Cargo.toml @@ -31,6 +31,7 @@ bitflags = "2.6.0" chrono = { version = "0.4.38", features = ["serde"] } serde = { version = "1", features = ["derive"] } serde_with = "3.8.3" +serde_bytes = "0.11.15" snafu = "0.8.3" typed-builder = "^0.18" opendal = "0.48" diff --git a/crates/paimon/src/spec/manifest_file_meta.rs b/crates/paimon/src/spec/manifest_file_meta.rs new file mode 100644 index 0000000..4f3775d --- /dev/null +++ b/crates/paimon/src/spec/manifest_file_meta.rs @@ -0,0 +1,182 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use serde::{Deserialize, Deserializer, Serialize, Serializer}; +use serde_bytes::Bytes; +use std::fmt::{Display, Formatter}; + +/// Metadata of a manifest file. +/// +/// Impl Reference: +#[derive(PartialEq, Eq, Debug, Clone, Serialize, Deserialize)] +pub struct ManifestFileMeta { + /// manifest file name + #[serde(rename = "_FILE_NAME")] + file_name: String, + + /// manifest file size. + #[serde(rename = "_FILE_SIZE")] + file_size: i64, + + /// number added files in manifest. + #[serde(rename = "_NUM_ADDED_FILES")] + num_added_files: i64, + + /// number deleted files in manifest. + #[serde(rename = "_NUM_DELETED_FILES")] + num_deleted_files: i64, + + /// partition stats, the minimum and maximum values of partition fields in this manifest are beneficial for skipping certain manifest files during queries, it is a SimpleStats. + #[serde(rename = "_PARTITION_STATS")] + partition_stats: BinaryTableStats, + + /// schema id when writing this manifest file. + #[serde(rename = "_SCHEMA_ID")] + schema_id: i64, +} + +impl ManifestFileMeta { + /// Get the manifest file name + #[inline] + pub fn file_name(&self) -> &str { + self.file_name.as_str() + } + + /// Get the manifest file size. + #[inline] + pub fn file_size(&self) -> i64 { + self.file_size + } + + /// Get the number added files in manifest. + #[inline] + pub fn num_added_files(&self) -> i64 { + self.num_added_files + } + + /// Get the number deleted files in manifest. + #[inline] + pub fn num_deleted_files(&self) -> i64 { + self.num_deleted_files + } + + /// Get the partition stats + pub fn partition_stats(&self) -> &BinaryTableStats { + &self.partition_stats + } + + /// Get the schema id when writing this manifest file. + #[inline] + pub fn schema_id(&self) -> i64 { + self.schema_id + } +} + +impl Display for ManifestFileMeta { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{{{}, {}, {}, {}, {:?}, {}}}", + self.file_name, + self.file_size, + self.num_added_files, + self.num_deleted_files, + self.partition_stats, + self.schema_id + ) + } +} + +/// The statistics for columns, supports the following stats. +/// +/// All statistics are stored in the form of a Binary, which can significantly reduce its memory consumption, but the cost is that the column type needs to be known when getting. +/// +/// Impl Reference: +#[derive(PartialEq, Eq, Debug, Clone, Serialize, Deserialize)] +pub struct BinaryTableStats { + /// the minimum values of the columns + #[serde(rename = "_MIN_VALUES", with = "serde_bytes")] + min_values: Vec, + + /// the maximum values of the columns + #[serde(rename = "_MAX_VALUES", with = "serde_bytes")] + max_values: Vec, + + /// the number of nulls of the columns + #[serde( + rename = "_NULL_COUNTS", + serialize_with = "serialize_null_counts", + deserialize_with = "deserialize_null_counts" + )] + null_counts: Vec, +} + +impl BinaryTableStats { + /// Get the minimum values of the columns + #[inline] + pub fn min_values(&self) -> &[u8] { + &self.min_values + } + + /// Get the maximum values of the columns + #[inline] + pub fn max_values(&self) -> &[u8] { + &self.max_values + } + + /// Get the number of nulls of the columns + #[inline] + pub fn null_counts(&self) -> &Vec { + &self.null_counts + } +} + +impl Display for BinaryTableStats { + fn fmt(&self, _: &mut Formatter<'_>) -> std::fmt::Result { + todo!() + } +} + +fn serialize_null_counts(value: &Vec, serializer: S) -> Result +where + S: Serializer, +{ + let mut bytes = Vec::new(); + for &num in value { + bytes.extend_from_slice(&num.to_le_bytes()); + } + + let bytes = Bytes::new(bytes.as_slice()); + serializer.serialize_bytes(bytes) +} + +fn deserialize_null_counts<'de, D>(deserializer: D) -> Result, D::Error> +where + D: Deserializer<'de>, +{ + let bytes = Deserialize::deserialize(deserializer).map(Bytes::new)?; + + let size_of_i64 = std::mem::size_of::(); + let i64_count = bytes.len() / size_of_i64; + let mut i64s = Vec::with_capacity(i64_count); + for chunk in bytes.chunks_exact(size_of_i64) { + i64s.push(i64::from_le_bytes( + chunk.try_into().expect("Chunk must be 8 bytes long"), + )); + } + Ok(i64s) +} diff --git a/crates/paimon/src/spec/manifest_list.rs b/crates/paimon/src/spec/manifest_list.rs new file mode 100644 index 0000000..a37e8c0 --- /dev/null +++ b/crates/paimon/src/spec/manifest_list.rs @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use super::manifest_file_meta::ManifestFileMeta; + +/// This file includes several [`ManifestFileMeta`], representing all data of the whole table at the corresponding snapshot. +pub struct ManifestList {} + +impl ManifestList { + /// Write several [`ManifestFileMeta`]s into a manifest list. + /// + /// NOTE: This method is atomic. + pub fn write(&mut self, _metas: Vec) -> &str { + todo!() + } +} diff --git a/crates/paimon/src/spec/mod.rs b/crates/paimon/src/spec/mod.rs index fc09dcd..a2d1fa3 100644 --- a/crates/paimon/src/spec/mod.rs +++ b/crates/paimon/src/spec/mod.rs @@ -28,5 +28,11 @@ pub use schema::*; mod snapshot; pub use snapshot::*; +mod manifest_file_meta; +pub use manifest_file_meta::*; + +mod manifest_list; +pub use manifest_list::*; + mod types; pub use types::*;