Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Parquet f32/f64 handle signed zeros in statistics
Browse files Browse the repository at this point in the history
Jefffrey committed Nov 7, 2023
1 parent 8c20c98 commit e00f2cf
Showing 3 changed files with 147 additions and 6 deletions.
29 changes: 27 additions & 2 deletions parquet/src/column/writer/encoder.rs
Original file line number Diff line number Diff line change
@@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.

use crate::basic::Encoding;
use crate::basic::{Encoding, Type};
use crate::bloom_filter::Sbbf;
use crate::column::writer::{
compare_greater, fallback_encoding, has_dictionary_support, is_nan, update_max, update_min,
@@ -308,5 +308,30 @@ where
max = val;
}
}
Some((min.clone(), max.clone()))

// Float/Double statistics have special case for zero.
//
// If computed min is zero, whether negative or positive,
// the spec states that the min should be written as -0.0
// (negative zero)
//
// For max, it has similar logic but will be written as 0.0
// (positive zero)
let min = replace_zero(min, -0.0);
let max = replace_zero(max, 0.0);

Some((min, max))
}

#[inline]
fn replace_zero<T: ParquetValueType>(val: &T, replace: f32) -> T {
match T::PHYSICAL_TYPE {
Type::FLOAT if f32::from_le_bytes(val.as_bytes().try_into().unwrap()) == 0.0 => {
T::try_from_le_slice(&f32::to_le_bytes(replace)).unwrap()
}
Type::DOUBLE if f64::from_le_bytes(val.as_bytes().try_into().unwrap()) == 0.0 => {
T::try_from_le_slice(&f64::to_le_bytes(replace as f64)).unwrap()
}
_ => val.clone(),
}
}
120 changes: 118 additions & 2 deletions parquet/src/column/writer/mod.rs
Original file line number Diff line number Diff line change
@@ -2111,6 +2111,64 @@ mod tests {
assert!(matches!(stats, Statistics::Float(_)));
}

#[test]
fn test_float_statistics_zero_only() {
let stats = statistics_roundtrip::<FloatType>(&[0.0]);
assert!(stats.has_min_max_set());
assert!(stats.is_min_max_backwards_compatible());
if let Statistics::Float(stats) = stats {
assert_eq!(stats.min(), &-0.0);
assert!(stats.min().is_sign_negative());
assert_eq!(stats.max(), &0.0);
assert!(stats.max().is_sign_positive());
} else {
panic!("expecting Statistics::Float");
}
}

#[test]
fn test_float_statistics_neg_zero_only() {
let stats = statistics_roundtrip::<FloatType>(&[-0.0]);
assert!(stats.has_min_max_set());
assert!(stats.is_min_max_backwards_compatible());
if let Statistics::Float(stats) = stats {
assert_eq!(stats.min(), &-0.0);
assert!(stats.min().is_sign_negative());
assert_eq!(stats.max(), &0.0);
assert!(stats.max().is_sign_positive());
} else {
panic!("expecting Statistics::Float");
}
}

#[test]
fn test_float_statistics_zero_min() {
let stats = statistics_roundtrip::<FloatType>(&[0.0, 1.0, f32::NAN, 2.0]);
assert!(stats.has_min_max_set());
assert!(stats.is_min_max_backwards_compatible());
if let Statistics::Float(stats) = stats {
assert_eq!(stats.min(), &-0.0);
assert!(stats.min().is_sign_negative());
assert_eq!(stats.max(), &2.0);
} else {
panic!("expecting Statistics::Float");
}
}

#[test]
fn test_float_statistics_neg_zero_max() {
let stats = statistics_roundtrip::<FloatType>(&[-0.0, -1.0, f32::NAN, -2.0]);
assert!(stats.has_min_max_set());
assert!(stats.is_min_max_backwards_compatible());
if let Statistics::Float(stats) = stats {
assert_eq!(stats.min(), &-2.0);
assert_eq!(stats.max(), &0.0);
assert!(stats.max().is_sign_positive());
} else {
panic!("expecting Statistics::Float");
}
}

#[test]
fn test_double_statistics_nan_middle() {
let stats = statistics_roundtrip::<DoubleType>(&[1.0, f64::NAN, 2.0]);
@@ -2120,7 +2178,7 @@ mod tests {
assert_eq!(stats.min(), &1.0);
assert_eq!(stats.max(), &2.0);
} else {
panic!("expecting Statistics::Float");
panic!("expecting Statistics::Double");
}
}

@@ -2133,7 +2191,7 @@ mod tests {
assert_eq!(stats.min(), &1.0);
assert_eq!(stats.max(), &2.0);
} else {
panic!("expecting Statistics::Float");
panic!("expecting Statistics::Double");
}
}

@@ -2145,6 +2203,64 @@ mod tests {
assert!(stats.is_min_max_backwards_compatible());
}

#[test]
fn test_double_statistics_zero_only() {
let stats = statistics_roundtrip::<DoubleType>(&[0.0]);
assert!(stats.has_min_max_set());
assert!(stats.is_min_max_backwards_compatible());
if let Statistics::Double(stats) = stats {
assert_eq!(stats.min(), &-0.0);
assert!(stats.min().is_sign_negative());
assert_eq!(stats.max(), &0.0);
assert!(stats.max().is_sign_positive());
} else {
panic!("expecting Statistics::Double");
}
}

#[test]
fn test_double_statistics_neg_zero_only() {
let stats = statistics_roundtrip::<DoubleType>(&[-0.0]);
assert!(stats.has_min_max_set());
assert!(stats.is_min_max_backwards_compatible());
if let Statistics::Double(stats) = stats {
assert_eq!(stats.min(), &-0.0);
assert!(stats.min().is_sign_negative());
assert_eq!(stats.max(), &0.0);
assert!(stats.max().is_sign_positive());
} else {
panic!("expecting Statistics::Double");
}
}

#[test]
fn test_double_statistics_zero_min() {
let stats = statistics_roundtrip::<DoubleType>(&[0.0, 1.0, f64::NAN, 2.0]);
assert!(stats.has_min_max_set());
assert!(stats.is_min_max_backwards_compatible());
if let Statistics::Double(stats) = stats {
assert_eq!(stats.min(), &-0.0);
assert!(stats.min().is_sign_negative());
assert_eq!(stats.max(), &2.0);
} else {
panic!("expecting Statistics::Double");
}
}

#[test]
fn test_double_statistics_neg_zero_max() {
let stats = statistics_roundtrip::<DoubleType>(&[-0.0, -1.0, f64::NAN, -2.0]);
assert!(stats.has_min_max_set());
assert!(stats.is_min_max_backwards_compatible());
if let Statistics::Double(stats) = stats {
assert_eq!(stats.min(), &-2.0);
assert_eq!(stats.max(), &0.0);
assert!(stats.max().is_sign_positive());
} else {
panic!("expecting Statistics::Double");
}
}

#[test]
fn test_compare_greater_byte_array_decimals() {
assert!(!compare_greater_byte_array_decimals(&[], &[],),);
4 changes: 2 additions & 2 deletions parquet/src/data_type.rs
Original file line number Diff line number Diff line change
@@ -632,15 +632,15 @@ pub(crate) mod private {

/// Return the value as i64 if possible
///
/// This is essentially the same as `std::convert::TryInto<i64>` but can
/// This is essentially the same as `std::convert::TryInto<i64>` but can't be
/// implemented for `f32` and `f64`, types that would fail orphan rules
fn as_i64(&self) -> Result<i64> {
Err(general_err!("Type cannot be converted to i64"))
}

/// Return the value as u64 if possible
///
/// This is essentially the same as `std::convert::TryInto<u64>` but can
/// This is essentially the same as `std::convert::TryInto<u64>` but can't be
/// implemented for `f32` and `f64`, types that would fail orphan rules
fn as_u64(&self) -> Result<u64> {
self.as_i64()

0 comments on commit e00f2cf

Please sign in to comment.