Skip to content

Commit

Permalink
Handle writing signed zero statistics
Browse files Browse the repository at this point in the history
  • Loading branch information
Jefffrey committed Nov 7, 2023
1 parent 0b7111e commit b122f47
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 4 deletions.
16 changes: 12 additions & 4 deletions parquet/src/column/writer/encoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
// specific language governing permissions and limitations
// under the License.

use crate::basic::{Encoding, Type};
use half::f16;

use crate::basic::{Encoding, LogicalType, Type};
use crate::bloom_filter::Sbbf;
use crate::column::writer::{
compare_greater, fallback_encoding, has_dictionary_support, is_nan, update_max, update_min,
Expand Down Expand Up @@ -317,21 +319,27 @@ where
//
// For max, it has similar logic but will be written as 0.0
// (positive zero)
let min = replace_zero(min, -0.0);
let max = replace_zero(max, 0.0);
let min = replace_zero(min, descr, -0.0);
let max = replace_zero(max, descr, 0.0);

Some((min, max))
}

#[inline]
fn replace_zero<T: ParquetValueType>(val: &T, replace: f32) -> T {
/// Returns `replace` encoded as a `T` when `val` holds a floating-point zero,
/// otherwise returns a clone of `val`.
///
/// How `val` is interpreted depends on `T::PHYSICAL_TYPE`:
/// * `FLOAT`: the bytes are decoded as a little-endian `f32`.
/// * `DOUBLE`: the bytes are decoded as a little-endian `f64`, and `replace`
///   is widened to `f64` before being encoded.
/// * `FIXED_LEN_BYTE_ARRAY`: only when `descr` reports the `Float16` logical
///   type; the bytes are decoded as a little-endian `f16` and `replace` is
///   converted with `f16::from_f32`.
///
/// Every other physical type is passed through unchanged.
///
/// # Panics
///
/// Panics if the value's byte length does not match the width of the float
/// type being decoded, or if `T::try_from_le_slice` rejects the encoded
/// replacement.
fn replace_zero<T: ParquetValueType>(val: &T, descr: &ColumnDescriptor, replace: f32) -> T {
    match T::PHYSICAL_TYPE {
        Type::FLOAT => {
            let current = f32::from_le_bytes(val.as_bytes().try_into().unwrap());
            // IEEE equality: both +0.0 and -0.0 compare equal to 0.0.
            if current == 0.0 {
                return T::try_from_le_slice(&replace.to_le_bytes()).unwrap();
            }
        }
        Type::DOUBLE => {
            let current = f64::from_le_bytes(val.as_bytes().try_into().unwrap());
            if current == 0.0 {
                let widened = replace as f64;
                return T::try_from_le_slice(&widened.to_le_bytes()).unwrap();
            }
        }
        Type::FIXED_LEN_BYTE_ARRAY => {
            // The logical type is checked first so that the bytes of a
            // non-Float16 fixed-length array are never decoded.
            // NOTE(review): relies on `f16` equality treating +0 and -0 as
            // equal, like the primitive floats above; confirm against `half`.
            if descr.logical_type() == Some(LogicalType::Float16)
                && f16::from_le_bytes(val.as_bytes().try_into().unwrap()) == f16::NEG_ZERO
            {
                let narrowed = f16::from_f32(replace);
                return T::try_from_le_slice(&narrowed.to_le_bytes()).unwrap();
            }
        }
        _ => {}
    }

    // Not a float zero (or not a float column at all): keep the value as is.
    val.clone()
}
56 changes: 56 additions & 0 deletions parquet/src/column/writer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2170,6 +2170,62 @@ mod tests {
assert!(stats.is_min_max_backwards_compatible());
}

/// A column containing only `+0.0` must report `-0.0` as its minimum and
/// `+0.0` as its maximum.
#[test]
fn test_float16_statistics_zero_only() {
    let values: Vec<_> = [f16::ZERO]
        .iter()
        .map(|&half| ByteArray::from(half).into())
        .collect();

    let stats = float16_statistics_roundtrip(&values);

    assert!(stats.has_min_max_set());
    assert!(stats.is_min_max_backwards_compatible());

    let expected_min = ByteArray::from(f16::NEG_ZERO);
    let expected_max = ByteArray::from(f16::ZERO);
    assert_eq!(stats.min(), &expected_min);
    assert_eq!(stats.max(), &expected_max);
}

/// A column containing only `-0.0` must report `-0.0` as its minimum and
/// `+0.0` as its maximum.
#[test]
fn test_float16_statistics_neg_zero_only() {
    let values: Vec<_> = [f16::NEG_ZERO]
        .iter()
        .map(|&half| ByteArray::from(half).into())
        .collect();

    let stats = float16_statistics_roundtrip(&values);

    assert!(stats.has_min_max_set());
    assert!(stats.is_min_max_backwards_compatible());

    let expected_min = ByteArray::from(f16::NEG_ZERO);
    let expected_max = ByteArray::from(f16::ZERO);
    assert_eq!(stats.min(), &expected_min);
    assert_eq!(stats.max(), &expected_max);
}

/// When `+0.0` is the smallest non-NaN input, the minimum is written as
/// `-0.0`; the maximum is the largest input (`PI`).
#[test]
fn test_float16_statistics_zero_min() {
    let values: Vec<_> = [f16::ZERO, f16::ONE, f16::NAN, f16::PI]
        .iter()
        .map(|&half| ByteArray::from(half).into())
        .collect();

    let stats = float16_statistics_roundtrip(&values);

    assert!(stats.has_min_max_set());
    assert!(stats.is_min_max_backwards_compatible());

    let expected_min = ByteArray::from(f16::NEG_ZERO);
    let expected_max = ByteArray::from(f16::PI);
    assert_eq!(stats.min(), &expected_min);
    assert_eq!(stats.max(), &expected_max);
}

/// When `-0.0` is the largest non-NaN input, the maximum is written as
/// `+0.0`; the minimum is the smallest input (`-PI`).
#[test]
fn test_float16_statistics_neg_zero_max() {
    let values: Vec<_> = [f16::NEG_ZERO, f16::NEG_ONE, f16::NAN, -f16::PI]
        .iter()
        .map(|&half| ByteArray::from(half).into())
        .collect();

    let stats = float16_statistics_roundtrip(&values);

    assert!(stats.has_min_max_set());
    assert!(stats.is_min_max_backwards_compatible());

    let expected_min = ByteArray::from(-f16::PI);
    let expected_max = ByteArray::from(f16::ZERO);
    assert_eq!(stats.min(), &expected_min);
    assert_eq!(stats.max(), &expected_max);
}

#[test]
fn test_float_statistics_nan_middle() {
let stats = statistics_roundtrip::<FloatType>(&[1.0, f32::NAN, 2.0]);
Expand Down

0 comments on commit b122f47

Please sign in to comment.