From a7988f85c6b40d5508c90163b9d8f75624015735 Mon Sep 17 00:00:00 2001 From: Ryan Tan Date: Tue, 9 Jul 2024 23:33:56 +0800 Subject: [PATCH 1/2] fix: align datafilemeat to java --- crates/paimon/Cargo.toml | 1 + crates/paimon/src/spec/data_file.rs | 125 ++++++++++++++++++++++++++++ crates/paimon/src/spec/mod.rs | 1 + 3 files changed, 127 insertions(+) create mode 100644 crates/paimon/src/spec/data_file.rs diff --git a/crates/paimon/Cargo.toml b/crates/paimon/Cargo.toml index aae592d..c315923 100644 --- a/crates/paimon/Cargo.toml +++ b/crates/paimon/Cargo.toml @@ -27,6 +27,7 @@ license.workspace = true version.workspace = true [dependencies] +chrono = "0.4.38" serde = { version = "1", features = ["derive"] } serde_with = "3.8.3" snafu = "0.8.3" diff --git a/crates/paimon/src/spec/data_file.rs b/crates/paimon/src/spec/data_file.rs new file mode 100644 index 0000000..b819d7a --- /dev/null +++ b/crates/paimon/src/spec/data_file.rs @@ -0,0 +1,125 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use super::schema::DataField; +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; +use std::fmt::{Display, Formatter}; + +/// Data type of a sequence of fields. 
A field consists of a field name, field type, and an optional +/// description. The most specific type of a row of a table is a row type. In this case, each column +/// of the row corresponds to the field of the row type that has the same ordinal position as the +/// column. Compared to the SQL standard, an optional field description simplifies the handling with +/// complex structures. +/// +/// Impl Reference: +/// +/// TODO: make RowType extends DataType. +/// TODO: move me to a better place. +pub struct RowType { + fields: Vec<DataField>, +} + +impl RowType { + pub const fn new(list: Vec<DataField>) -> Self { + Self { fields: list } + } +} + +pub const EMPTY_BINARY_ROW: BinaryRow = BinaryRow::new(0); + +/// An implementation of InternalRow. +/// +/// Impl Reference: +#[derive(Debug, Eq, PartialEq, Serialize, Deserialize)] +#[serde(rename_all("camelCase"))] +pub struct BinaryRow { + arity: i32, + null_bits_size_in_bytes: i32, +} + +impl BinaryRow { + pub const HEADER_SIZE_IN_BYTES: i32 = 8; + pub const fn cal_bit_set_width_in_bytes(arity: i32) -> i32 { + ((arity + 63 + Self::HEADER_SIZE_IN_BYTES) / 64) * 8 + } + pub const fn cal_fix_part_size_in_bytes(arity: i32) -> i32 { + Self::cal_bit_set_width_in_bytes(arity) + 8 * arity + } + pub const fn new(arity: i32) -> Self { + Self { + arity, + null_bits_size_in_bytes: (arity + 7) / 8, + } + } +} + +/// TODO: implement me. +/// The statistics for columns, supports the following stats. +/// +/// Impl References: +type SimpleStats = (); + +/// The Source of a file. +/// TODO: move me to the manifest module. +/// +/// Impl References: +#[repr(u8)] +#[derive(Debug, Clone, Copy, Eq, PartialEq, Serialize, Deserialize)] +#[serde(rename_all("camelCase"))] +pub enum FileSource { + Append = 0, + Compact = 1, +} + +/// Metadata of a data file. 
+/// +/// Impl References: +#[derive(Debug, Eq, PartialEq, Serialize, Deserialize)] +#[serde(rename_all("camelCase"))] +pub struct DataFileMeta { + pub file_name: String, + pub file_size: i64, + // row_count tells the total number of rows (including add & delete) in this file. + pub row_count: i64, + pub min_key: BinaryRow, + pub max_key: BinaryRow, + pub key_stats: SimpleStats, + pub value_stats: SimpleStats, + pub min_sequence_number: i64, + pub max_sequence_number: i64, + pub schema_id: i64, + pub level: i32, + pub extra_files: Vec<String>, + pub creation_time: DateTime<Utc>, + // rowCount = add_row_count + delete_row_count. + pub delete_row_count: Option<i64>, + // file index filter bytes, if it is small, store in data file meta + pub embedded_index: Option<Vec<u8>>, + pub file_source: Option<FileSource>, +} + +impl Display for DataFileMeta { + fn fmt(&self, _: &mut Formatter<'_>) -> std::fmt::Result { + todo!() + } +} + +impl DataFileMeta { + // TODO: implement me + pub const SCHEMA: RowType = RowType::new(vec![]); +} diff --git a/crates/paimon/src/spec/mod.rs b/crates/paimon/src/spec/mod.rs index b4b8370..2343c29 100644 --- a/crates/paimon/src/spec/mod.rs +++ b/crates/paimon/src/spec/mod.rs @@ -19,6 +19,7 @@ //! //! All paimon specs types are defined here. 
+mod data_file; mod schema; pub use schema::*; From 0537104315a923bb84c460016cabba98ea278750 Mon Sep 17 00:00:00 2001 From: Ryan Tan Date: Fri, 19 Jul 2024 00:13:46 +0800 Subject: [PATCH 2/2] fix: make clippy happy --- crates/paimon/Cargo.toml | 2 +- crates/paimon/src/spec/data_file.rs | 10 +++++----- crates/paimon/src/spec/mod.rs | 2 ++ 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/crates/paimon/Cargo.toml b/crates/paimon/Cargo.toml index c315923..6d057d5 100644 --- a/crates/paimon/Cargo.toml +++ b/crates/paimon/Cargo.toml @@ -27,7 +27,7 @@ license.workspace = true version.workspace = true [dependencies] -chrono = "0.4.38" +chrono = {version = "0.4.38", features = ["serde"]} serde = { version = "1", features = ["derive"] } serde_with = "3.8.3" snafu = "0.8.3" diff --git a/crates/paimon/src/spec/data_file.rs b/crates/paimon/src/spec/data_file.rs index b819d7a..4f6c41f 100644 --- a/crates/paimon/src/spec/data_file.rs +++ b/crates/paimon/src/spec/data_file.rs @@ -31,12 +31,12 @@ use std::fmt::{Display, Formatter}; /// TODO: make RowType extends DataType. /// TODO: move me to a better place. 
pub struct RowType { - fields: Vec<DataField>, + _fields: Vec<DataField>, } impl RowType { pub const fn new(list: Vec<DataField>) -> Self { - Self { fields: list } + Self { _fields: list } } } @@ -46,7 +46,7 @@ pub const EMPTY_BINARY_ROW: BinaryRow = BinaryRow::new(0); /// /// Impl Reference: #[derive(Debug, Eq, PartialEq, Serialize, Deserialize)] -#[serde(rename_all("camelCase"))] +#[serde(rename_all = "camelCase")] pub struct BinaryRow { arity: i32, null_bits_size_in_bytes: i32, @@ -80,7 +80,7 @@ type SimpleStats = (); /// Impl References: #[repr(u8)] #[derive(Debug, Clone, Copy, Eq, PartialEq, Serialize, Deserialize)] -#[serde(rename_all("camelCase"))] +#[serde(rename_all = "camelCase")] pub enum FileSource { Append = 0, Compact = 1, @@ -90,7 +90,7 @@ pub enum FileSource { /// /// Impl References: #[derive(Debug, Eq, PartialEq, Serialize, Deserialize)] -#[serde(rename_all("camelCase"))] +#[serde(rename_all = "camelCase")] pub struct DataFileMeta { pub file_name: String, pub file_size: i64, diff --git a/crates/paimon/src/spec/mod.rs b/crates/paimon/src/spec/mod.rs index 2343c29..eb25755 100644 --- a/crates/paimon/src/spec/mod.rs +++ b/crates/paimon/src/spec/mod.rs @@ -20,6 +20,8 @@ //! All paimon specs types are defined here. mod data_file; +pub use data_file::*; + mod schema; pub use schema::*;