From 2809fbf6515e33b8f2cd12fb6a4a81a1068441b9 Mon Sep 17 00:00:00 2001 From: benibus Date: Wed, 14 Jun 2023 14:52:55 -0400 Subject: [PATCH] Implement column statistics --- cpp/src/parquet/float_internal.h | 61 +++++ cpp/src/parquet/statistics.cc | 144 ++++++++++-- cpp/src/parquet/statistics_test.cc | 343 +++++++++++++++++++++-------- 3 files changed, 442 insertions(+), 106 deletions(-) create mode 100644 cpp/src/parquet/float_internal.h diff --git a/cpp/src/parquet/float_internal.h b/cpp/src/parquet/float_internal.h new file mode 100644 index 0000000000000..c82c9d575ce3b --- /dev/null +++ b/cpp/src/parquet/float_internal.h @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/util/bit_util.h" +#include "arrow/util/ubsan.h" +#include "parquet/types.h" + +namespace parquet { + +struct float16 { + constexpr static uint16_t min() { return 0b1111101111111111; } + constexpr static uint16_t max() { return 0b0111101111111111; } + constexpr static uint16_t positive_zero() { return 0b0000000000000000; } + constexpr static uint16_t negative_zero() { return 0b1000000000000000; } + + static uint8_t* min_ptr() { return min_; } + static uint8_t* max_ptr() { return max_; } + static uint8_t* positive_zero_ptr() { return positive_zero_; } + static uint8_t* negative_zero_ptr() { return negative_zero_; } + + static bool is_nan(uint16_t n) { return (n & 0x7c00) == 0x7c00 && (n & 0x03ff) != 0; } + static bool is_zero(uint16_t n) { return (n & 0x7fff) == 0; } + static bool signbit(uint16_t n) { return (n & 0x8000) != 0; } + + static uint16_t Pack(const uint8_t* src) { + return ::arrow::bit_util::FromLittleEndian(::arrow::util::SafeLoadAs(src)); + } + static uint16_t Pack(const FLBA& src) { return Pack(src.ptr); } + + static uint8_t* Unpack(uint16_t src, uint8_t* dest) { + src = ::arrow::bit_util::ToLittleEndian(src); + return static_cast(std::memcpy(dest, &src, sizeof(src))); + } + + private: + static inline uint8_t min_[] = {0b11111111, 0b11111011}; + static inline uint8_t max_[] = {0b11111111, 0b01111011}; + static inline uint8_t positive_zero_[] = {0b00000000, 0b00000000}; + static inline uint8_t negative_zero_[] = {0b00000000, 0b10000000}; +}; + +} // namespace parquet diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index aa6df7e32a98c..ad4e12e3e60f0 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -35,6 +35,7 @@ #include "arrow/visit_data_inline.h" #include "parquet/encoding.h" #include "parquet/exception.h" +#include "parquet/float_internal.h" #include "parquet/platform.h" #include "parquet/schema.h" @@ -277,11 +278,54 @@ template struct CompareHelper : public BinaryLikeCompareHelperBase {}; +struct Float16CompareHelper { + using T = FLBA; + + static T DefaultMin() { return T{float16::max_ptr()}; } + static T DefaultMax() { return T{float16::min_ptr()}; } + + static T Coalesce(T val, T fallback) { + return val.ptr != nullptr && float16::is_nan(float16::Pack(val)) ? fallback : val; + } + + static inline bool Compare(int type_length, const T& a, const T& b) { + uint16_t l = float16::Pack(a); + uint16_t r = float16::Pack(b); + + if (l & 0x8000) { + if (r & 0x8000) { + // Both are negative + return (l & 0x7fff) > (r & 0x7fff); + } else { + // Handle +/-0 + return (l & 0x7fff) || r != 0; + } + } else if (r & 0x8000) { + return false; + } else { + // Both are positive + return (l & 0x7fff) < (r & 0x7fff); + } + } + + static T Min(int type_length, const T& a, const T& b) { + if (a.ptr == nullptr) return b; + if (b.ptr == nullptr) return a; + return Compare(type_length, a, b) ? a : b; + } + + static T Max(int type_length, const T& a, const T& b) { + if (a.ptr == nullptr) return b; + if (b.ptr == nullptr) return a; + return Compare(type_length, a, b) ? b : a; + } +}; + using ::std::optional; template ::arrow::enable_if_t::value, optional>> -CleanStatistic(std::pair min_max) { +CleanStatistic(std::pair min_max, LogicalType::Type::type) { return min_max; } @@ -292,7 +336,7 @@ CleanStatistic(std::pair min_max) { // - If max is -0.0f, replace with 0.0f template ::arrow::enable_if_t::value, optional>> -CleanStatistic(std::pair min_max) { +CleanStatistic(std::pair min_max, LogicalType::Type::type) { T min = min_max.first; T max = min_max.second; @@ -318,26 +362,55 @@ CleanStatistic(std::pair min_max) { return {{min, max}}; } -optional> CleanStatistic(std::pair min_max) { +optional> CleanFloat16Statistic(std::pair min_max) { + FLBA min = min_max.first; + FLBA max = min_max.second; + uint16_t min_packed = float16::Pack(min); + uint16_t max_packed = float16::Pack(max); + + if (float16::is_nan(min_packed) || float16::is_nan(max_packed)) { + return ::std::nullopt; + } + + if (min_packed == float16::max() && max_packed == float16::min()) { + return ::std::nullopt; + } + + if (min_packed == float16::positive_zero()) { + min = FLBA{float16::negative_zero_ptr()}; + } + if (max_packed == float16::negative_zero()) { + max = FLBA{float16::positive_zero_ptr()}; + } + + return {{min, max}}; +} + +optional> CleanStatistic(std::pair min_max, + LogicalType::Type::type logical_type) { if (min_max.first.ptr == nullptr || min_max.second.ptr == nullptr) { return ::std::nullopt; } + if (logical_type == LogicalType::Type::FLOAT16) { + return CleanFloat16Statistic(std::move(min_max)); + } return min_max; } optional> CleanStatistic( - std::pair min_max) { + std::pair min_max, LogicalType::Type::type) { if (min_max.first.ptr == nullptr || min_max.second.ptr == nullptr) { return ::std::nullopt; } return min_max; } -template +template > class TypedComparatorImpl : virtual public TypedComparator { public: using T = typename DType::c_type; - using Helper = CompareHelper; + using Helper = HelperType; explicit TypedComparatorImpl(int type_length = -1) : type_length_(type_length) {} @@ -412,9 +485,9 @@ TypedComparatorImpl::GetMinMax(const int32_t* va return {SafeCopy(min), SafeCopy(max)}; } -template +template std::pair -TypedComparatorImpl::GetMinMax(const ::arrow::Array& values) { +TypedComparatorImpl::GetMinMax(const ::arrow::Array& values) { ParquetException::NYI(values.type()->ToString()); } @@ -458,6 +531,16 @@ std::pair TypedComparatorImpl::GetMi return GetMinMaxBinaryHelper(*this, values); } +static LogicalType::Type::type LogicalTypeId(const ColumnDescriptor* descr) { + if (const auto& logical_type = descr->logical_type()) { + return logical_type->type(); + } + return LogicalType::Type::NONE; +} +static LogicalType::Type::type LogicalTypeId(const Statistics& stats) { + return LogicalTypeId(stats.descr()); +} + template class TypedStatisticsImpl : public TypedStatistics { public: @@ -468,8 +551,7 @@ class TypedStatisticsImpl : public TypedStatistics { pool_(pool), min_buffer_(AllocateBuffer(pool_, 0)), max_buffer_(AllocateBuffer(pool_, 0)) { - auto comp = Comparator::Make(descr); - comparator_ = std::static_pointer_cast>(comp); + comparator_ = MakeComparator(descr); TypedStatisticsImpl::Reset(); has_null_count_ = true; has_distinct_count_ = true; @@ -525,6 +607,19 @@ class TypedStatisticsImpl : public TypedStatistics { bool Equals(const Statistics& raw_other) const override { if (physical_type() != raw_other.physical_type()) return false; + const auto logical_id = LogicalTypeId(*this); + switch (logical_id) { + // Only compare against logical types that influence the interpretation of the + // physical type + case LogicalType::Type::FLOAT16: + if (LogicalTypeId(raw_other) != logical_id) { + return false; + } + break; + default: + break; + } + const auto& other = checked_cast(raw_other); if (has_min_max_ != other.has_min_max_) return false; @@ -650,7 +745,7 @@ class TypedStatisticsImpl : public TypedStatistics { void SetMinMaxPair(std::pair min_max) { // CleanStatistic can return a nullopt in case of erroneous values, e.g. NaN - auto maybe_min_max = CleanStatistic(min_max); + auto maybe_min_max = CleanStatistic(min_max, LogicalTypeId(*this)); if (!maybe_min_max) return; auto min = maybe_min_max.value().first; @@ -759,12 +854,8 @@ void TypedStatisticsImpl::PlainDecode(const std::string& src, dst->ptr = reinterpret_cast(src.c_str()); } -} // namespace - -// ---------------------------------------------------------------------- -// Public factory functions - -std::shared_ptr Comparator::Make(Type::type physical_type, +std::shared_ptr DoMakeComparator(Type::type physical_type, + LogicalType::Type::type logical_type, SortOrder::type sort_order, int type_length) { if (SortOrder::SIGNED == sort_order) { @@ -784,6 +875,10 @@ std::shared_ptr Comparator::Make(Type::type physical_type, case Type::BYTE_ARRAY: return std::make_shared>(); case Type::FIXED_LEN_BYTE_ARRAY: + if (logical_type == LogicalType::Type::FLOAT16) { + return std::make_shared< + TypedComparatorImpl>(); + } return std::make_shared>(type_length); default: ParquetException::NYI("Signed Compare not implemented"); @@ -809,8 +904,21 @@ std::shared_ptr Comparator::Make(Type::type physical_type, return nullptr; } +} // namespace + +// ---------------------------------------------------------------------- +// Public factory functions + +std::shared_ptr Comparator::Make(Type::type physical_type, + SortOrder::type sort_order, + int type_length) { + return DoMakeComparator(physical_type, LogicalType::Type::NONE, sort_order, + type_length); +} + std::shared_ptr Comparator::Make(const ColumnDescriptor* descr) { - return Make(descr->physical_type(), descr->sort_order(), descr->type_length()); + return DoMakeComparator(descr->physical_type(), LogicalTypeId(descr), + descr->sort_order(), descr->type_length()); } std::shared_ptr Statistics::Make(const ColumnDescriptor* descr, diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index 8b9a42aa18bba..dcb6fda9209d6 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -40,6 +40,7 @@ #include "parquet/column_writer.h" #include "parquet/file_reader.h" #include "parquet/file_writer.h" +#include "parquet/float_internal.h" #include "parquet/platform.h" #include "parquet/schema.h" #include "parquet/statistics.h" @@ -680,9 +681,22 @@ TEST(CorrectStatistics, Basics) { // Test SortOrder class static const int NUM_VALUES = 10; -template +template +struct RebindLogical { + using ParquetType = T; + using CType = typename T::c_type; +}; + +template <> +struct RebindLogical { + using ParquetType = FLBAType; + using CType = ParquetType::c_type; +}; + +template class TestStatisticsSortOrder : public ::testing::Test { public: + using TestType = typename RebindLogical::ParquetType; using c_type = typename TestType::c_type; void SetUp() override { @@ -760,7 +774,7 @@ class TestStatisticsSortOrder : public ::testing::Test { }; using CompareTestTypes = ::testing::Types; + ByteArrayType, FLBAType, Float16LogicalType>; // TYPE::INT32 template <> @@ -907,6 +921,36 @@ void TestStatisticsSortOrder::SetValues() { .set_max(std::string(reinterpret_cast(&vals[8][0]), FLBA_LENGTH)); } +template <> +void TestStatisticsSortOrder::AddNodes(std::string name) { + auto node = + schema::PrimitiveNode::Make(name, Repetition::REQUIRED, LogicalType::Float16(), + Type::FIXED_LEN_BYTE_ARRAY, sizeof(uint16_t)); + fields_.push_back(std::move(node)); +} + +template <> +void TestStatisticsSortOrder::SetValues() { + constexpr int kValueLen = 2; + constexpr int kNumBytes = NUM_VALUES * kValueLen; + + const uint16_t packed_vals[NUM_VALUES] = { + 0b0000000000000000, 0b0000000000000000, 0b1000000000000000, 0b1000010000000000, + 0b0111110000001000, 0b1000000000000000, 0b0000010000000000, 0b0000000001000000, + 0b1111110000001000, 0b1000000001000000}; + + values_buf_.resize(kNumBytes); + uint8_t* ptr = values_buf_.data(); + for (int i = 0; i < NUM_VALUES; ++i) { + values_[i].ptr = float16::Unpack(packed_vals[i], ptr); + ptr += kValueLen; + } + + stats_[0] + .set_min(std::string(reinterpret_cast(values_[3].ptr), kValueLen)) + .set_max(std::string(reinterpret_cast(values_[6].ptr), kValueLen)); +} + TYPED_TEST_SUITE(TestStatisticsSortOrder, CompareTestTypes); TYPED_TEST(TestStatisticsSortOrder, MinMax) { @@ -972,12 +1016,20 @@ TEST_F(TestStatisticsSortOrderFLBA, UnknownSortOrder) { ASSERT_FALSE(cc_metadata->is_stats_set()); } +template +static std::string EncodeValue(const T& val) { + return std::string(reinterpret_cast(&val), sizeof(val)); +} +static std::string EncodeValue(const FLBA& val, int length = sizeof(uint16_t)) { + return std::string(reinterpret_cast(val.ptr), length); +} + template void AssertMinMaxAre(Stats stats, const Array& values, T expected_min, T expected_max) { stats->Update(values.data(), values.size(), 0); ASSERT_TRUE(stats->HasMinMax()); - EXPECT_EQ(stats->min(), expected_min); - EXPECT_EQ(stats->max(), expected_max); + EXPECT_EQ(stats->EncodeMin(), EncodeValue(expected_min)); + EXPECT_EQ(stats->EncodeMax(), EncodeValue(expected_max)); } template @@ -989,8 +1041,8 @@ void AssertMinMaxAre(Stats stats, const Array& values, const uint8_t* valid_bitm stats->UpdateSpaced(values.data(), valid_bitmap, 0, non_null_count + null_count, non_null_count, null_count); ASSERT_TRUE(stats->HasMinMax()); - EXPECT_EQ(stats->min(), expected_min); - EXPECT_EQ(stats->max(), expected_max); + EXPECT_EQ(stats->EncodeMin(), EncodeValue(expected_min)); + EXPECT_EQ(stats->EncodeMax(), EncodeValue(expected_max)); } template @@ -1073,50 +1125,217 @@ void CheckExtrema() { TEST(TestStatistic, Int32Extrema) { CheckExtrema(); } TEST(TestStatistic, Int64Extrema) { CheckExtrema(); } -// PARQUET-1225: Float NaN values may lead to incorrect min-max -template -void CheckNaNs() { - using T = typename ParquetType::c_type; +template +class TestFloatStatistics : public ::testing::Test { + public: + using ParquetType = typename RebindLogical::ParquetType; + using c_type = typename ParquetType::c_type; + + void Init(); + void SetUp() override { this->Init(); } + + bool signbit(c_type val); + void CheckEq(const c_type& l, const c_type& r); + NodePtr MakeNode(const std::string& name, Repetition::type rep); + + template + void CheckMinMaxZeroesSign(Stats stats, const Values& values) { + stats->Update(values.data(), values.size(), 0); + ASSERT_TRUE(stats->HasMinMax()); + + this->CheckEq(stats->min(), positive_zero_); + ASSERT_TRUE(this->signbit(stats->min())); + + this->CheckEq(stats->max(), positive_zero_); + ASSERT_FALSE(this->signbit(stats->max())); + } + + // ARROW-5562: Ensure that -0.0f and 0.0f values are properly handled like in + // parquet-mr + void TestNegativeZeroes() { + NodePtr node = this->MakeNode("f", Repetition::OPTIONAL); + ColumnDescriptor descr(node, 1, 1); + + { + std::array values{negative_zero_, positive_zero_}; + auto stats = MakeStatistics(&descr); + CheckMinMaxZeroesSign(stats, values); + } + + { + std::array values{positive_zero_, negative_zero_}; + auto stats = MakeStatistics(&descr); + CheckMinMaxZeroesSign(stats, values); + } + + { + std::array values{negative_zero_, negative_zero_}; + auto stats = MakeStatistics(&descr); + CheckMinMaxZeroesSign(stats, values); + } + + { + std::array values{positive_zero_, positive_zero_}; + auto stats = MakeStatistics(&descr); + CheckMinMaxZeroesSign(stats, values); + } + } + + // PARQUET-1225: Float NaN values may lead to incorrect min-max + template + void CheckNaNs(ColumnDescriptor* descr, const Values& all_nans, const Values& some_nans, + const Values& other_nans, c_type min, c_type max, uint8_t valid_bitmap, + uint8_t valid_bitmap_no_nans) { + auto some_nan_stats = MakeStatistics(descr); + // Ingesting only nans should not yield valid min max + AssertUnsetMinMax(some_nan_stats, all_nans); + // Ingesting a mix of NaNs and non-NaNs should not yield valid min max. + AssertMinMaxAre(some_nan_stats, some_nans, min, max); + // Ingesting only nans after a valid min/max, should have not effect + AssertMinMaxAre(some_nan_stats, all_nans, min, max); + + some_nan_stats = MakeStatistics(descr); + AssertUnsetMinMax(some_nan_stats, all_nans, &valid_bitmap); + // NaNs should not pollute min max when excluded via null bitmap. + AssertMinMaxAre(some_nan_stats, some_nans, &valid_bitmap_no_nans, min, max); + // Ingesting NaNs with a null bitmap should not change the result. + AssertMinMaxAre(some_nan_stats, some_nans, &valid_bitmap, min, max); + + // An array that doesn't start with NaN + auto other_stats = MakeStatistics(descr); + AssertMinMaxAre(other_stats, other_nans, min, max); + } + + void TestNaNs(); + + protected: + std::vector data_buf_; + c_type positive_zero_; + c_type negative_zero_; +}; + +template +void TestFloatStatistics::Init() { + positive_zero_ = c_type{}; + negative_zero_ = -positive_zero_; +} +template <> +void TestFloatStatistics::Init() { + positive_zero_ = c_type{float16::positive_zero_ptr()}; + negative_zero_ = c_type{float16::negative_zero_ptr()}; +} + +template +NodePtr TestFloatStatistics::MakeNode(const std::string& name, Repetition::type rep) { + return PrimitiveNode::Make(name, rep, ParquetType::type_num); +} +template <> +NodePtr TestFloatStatistics::MakeNode(const std::string& name, + Repetition::type rep) { + return PrimitiveNode::Make(name, rep, LogicalType::Float16(), + Type::FIXED_LEN_BYTE_ARRAY, 2); +} +template +void TestFloatStatistics::CheckEq(const c_type& l, const c_type& r) { + ASSERT_EQ(l, r); +} +template <> +void TestFloatStatistics::CheckEq(const c_type& a, const c_type& b) { + auto l = float16::Pack(a); + auto r = float16::Pack(b); + if (float16::is_zero(l) && float16::is_zero(r)) return; + ASSERT_EQ(l, r); +} + +template +bool TestFloatStatistics::signbit(c_type val) { + return std::signbit(val); +} +template <> +bool TestFloatStatistics::signbit(c_type val) { + return float16::signbit(float16::Pack(val)); +} + +template +void TestFloatStatistics::TestNaNs() { constexpr int kNumValues = 8; - NodePtr node = PrimitiveNode::Make("f", Repetition::OPTIONAL, ParquetType::type_num); + NodePtr node = this->MakeNode("f", Repetition::OPTIONAL); ColumnDescriptor descr(node, 1, 1); - constexpr T nan = std::numeric_limits::quiet_NaN(); - constexpr T min = -4.0f; - constexpr T max = 3.0f; + constexpr c_type nan = std::numeric_limits::quiet_NaN(); + constexpr c_type min = -4.0f; + constexpr c_type max = 3.0f; + + std::array all_nans{nan, nan, nan, nan, nan, nan, nan, nan}; + std::array some_nans{nan, max, -3.0f, -1.0f, nan, 2.0f, min, nan}; + std::array other_nans{1.5f, max, -3.0f, -1.0f, nan, 2.0f, min, nan}; - std::array all_nans{nan, nan, nan, nan, nan, nan, nan, nan}; - std::array some_nans{nan, max, -3.0f, -1.0f, nan, 2.0f, min, nan}; uint8_t valid_bitmap = 0x7F; // 0b01111111 // NaNs excluded uint8_t valid_bitmap_no_nans = 0x6E; // 0b01101110 - // Test values - auto some_nan_stats = MakeStatistics(&descr); - // Ingesting only nans should not yield valid min max - AssertUnsetMinMax(some_nan_stats, all_nans); - // Ingesting a mix of NaNs and non-NaNs should not yield valid min max. - AssertMinMaxAre(some_nan_stats, some_nans, min, max); - // Ingesting only nans after a valid min/max, should have not effect - AssertMinMaxAre(some_nan_stats, all_nans, min, max); + this->CheckNaNs(&descr, all_nans, some_nans, other_nans, min, max, valid_bitmap, + valid_bitmap_no_nans); +} + +template <> +void TestFloatStatistics::TestNaNs() { + constexpr int kNumValues = 8; + constexpr int kValueLen = sizeof(uint16_t); + + NodePtr node = this->MakeNode("f", Repetition::OPTIONAL); + ColumnDescriptor descr(node, 1, 1); + + const uint16_t nan_int = 0b1111110010101010; + const uint16_t min_int = 0b1010010111000110; + const uint16_t max_int = 0b0011100011010011; + uint8_t min_max_data[2 * kValueLen]; + const auto min = FLBA{float16::Unpack(min_int, &min_max_data[0 * kValueLen])}; + const auto max = FLBA{float16::Unpack(max_int, &min_max_data[1 * kValueLen])}; + + std::array all_nans_packed = {nan_int, nan_int, nan_int, nan_int, + nan_int, nan_int, nan_int, nan_int}; + std::array some_nans_packed = {nan_int, + max_int, + 0b1000111000110000, + 0b1000010001000001, + nan_int, + 0b0000100000011110, + min_int, + nan_int}; + std::array other_nans_packed = some_nans_packed; + other_nans_packed[0] = 0b0000010000110011; + + std::array bytes; + uint8_t* at = bytes.data(); + auto prepare_values = [&](const auto& packed_values) -> std::vector { + std::vector out; + for (uint16_t packed : packed_values) { + out.push_back(FLBA{float16::Unpack(packed, at)}); + at += kValueLen; + } + return out; + }; + + auto all_nans = prepare_values(all_nans_packed); + auto some_nans = prepare_values(some_nans_packed); + auto other_nans = prepare_values(other_nans_packed); - some_nan_stats = MakeStatistics(&descr); - AssertUnsetMinMax(some_nan_stats, all_nans, &valid_bitmap); - // NaNs should not pollute min max when excluded via null bitmap. - AssertMinMaxAre(some_nan_stats, some_nans, &valid_bitmap_no_nans, min, max); - // Ingesting NaNs with a null bitmap should not change the result. - AssertMinMaxAre(some_nan_stats, some_nans, &valid_bitmap, min, max); + uint8_t valid_bitmap = 0x7F; // 0b01111111 + // NaNs excluded + uint8_t valid_bitmap_no_nans = 0x6E; // 0b01101110 - // An array that doesn't start with NaN - std::array other_nans{1.5f, max, -3.0f, -1.0f, nan, 2.0f, min, nan}; - auto other_stats = MakeStatistics(&descr); - AssertMinMaxAre(other_stats, other_nans, min, max); + this->CheckNaNs(&descr, all_nans, some_nans, other_nans, min, max, valid_bitmap, + valid_bitmap_no_nans); } -TEST(TestStatistic, NaNFloatValues) { CheckNaNs(); } +using FloatingPointTypes = ::testing::Types; + +TYPED_TEST_SUITE(TestFloatStatistics, FloatingPointTypes); -TEST(TestStatistic, NaNDoubleValues) { CheckNaNs(); } +TYPED_TEST(TestFloatStatistics, NegativeZeros) { this->TestNegativeZeroes(); } +TYPED_TEST(TestFloatStatistics, NaNs) { this->TestNaNs(); } // ARROW-7376 TEST(TestStatisticsSortOrderFloatNaN, NaNAndNullsInfiniteLoop) { @@ -1132,58 +1351,6 @@ TEST(TestStatisticsSortOrderFloatNaN, NaNAndNullsInfiniteLoop) { AssertUnsetMinMax(stats, nans_but_last, &all_but_last_valid); } -template -void AssertMinMaxZeroesSign(Stats stats, const Array& values) { - stats->Update(values.data(), values.size(), 0); - ASSERT_TRUE(stats->HasMinMax()); - - T zero{}; - ASSERT_EQ(stats->min(), zero); - ASSERT_TRUE(std::signbit(stats->min())); - - ASSERT_EQ(stats->max(), zero); - ASSERT_FALSE(std::signbit(stats->max())); -} - -// ARROW-5562: Ensure that -0.0f and 0.0f values are properly handled like in -// parquet-mr -template -void CheckNegativeZeroStats() { - using T = typename ParquetType::c_type; - - NodePtr node = PrimitiveNode::Make("f", Repetition::OPTIONAL, ParquetType::type_num); - ColumnDescriptor descr(node, 1, 1); - T zero{}; - - { - std::array values{-zero, zero}; - auto stats = MakeStatistics(&descr); - AssertMinMaxZeroesSign(stats, values); - } - - { - std::array values{zero, -zero}; - auto stats = MakeStatistics(&descr); - AssertMinMaxZeroesSign(stats, values); - } - - { - std::array values{-zero, -zero}; - auto stats = MakeStatistics(&descr); - AssertMinMaxZeroesSign(stats, values); - } - - { - std::array values{zero, zero}; - auto stats = MakeStatistics(&descr); - AssertMinMaxZeroesSign(stats, values); - } -} - -TEST(TestStatistics, FloatNegativeZero) { CheckNegativeZeroStats(); } - -TEST(TestStatistics, DoubleNegativeZero) { CheckNegativeZeroStats(); } - // Test statistics for binary column with UNSIGNED sort order TEST(TestStatisticsSortOrderMinMax, Unsigned) { std::string dir_string(test::get_data_dir());